def main():
    """Entry point for trigger-based NER training.

    Pipeline (as visible in this block):
      1. Parse CLI options and build the run configuration.
      2. Read the trigger-annotated dataset and the train/dev/test splits.
      3. Train the soft trigger-matching module.
      4. Self-train the sequence-labeling module, treating a held-back
         fraction of the training split as unlabeled data.

    Relies on module-level names defined elsewhere in this file/project:
    parse_arguments, Config, Reader, ContextEmb, load_bert_vec,
    SoftMatcher, SoftMatcherTrainer, remove_duplicates, SoftSequence,
    SoftSequenceTrainer.
    """
    parser = argparse.ArgumentParser()
    opt = parse_arguments(parser)
    conf = Config(opt)
    reader = Reader(conf.digit2zero)

    # Load the trigger dataset (-1 presumably means "no length cap" — TODO confirm)
    # and merge its label set, then read the standard splits.
    dataset, max_length, label_length = reader.read_trigger_txt(
        conf.trigger_file, -1)
    reader.merge_labels(dataset)
    trains = reader.read_txt(conf.train_all_file, conf.train_num)
    devs = reader.read_txt(conf.dev_file, conf.dev_num)
    tests = reader.read_txt(conf.test_file, conf.test_num)
    print(len(dataset))

    if conf.context_emb == ContextEmb.bert:
        print('Loading the BERT vectors for all datasets.')
        conf.context_emb_size = load_bert_vec(
            conf.trigger_file + "." + conf.context_emb.name + ".vec", dataset)

    # setting for data: convert all splits to IOBES tagging and build
    # the label/word vocabularies plus the embedding table.
    conf.use_iobes(trains)
    conf.use_iobes(dataset)
    conf.use_iobes(devs)
    conf.use_iobes(tests)
    conf.optimizer = opt.trig_optimizer
    conf.build_label_idx(dataset)
    conf.build_word_idx(trains, devs, tests)
    conf.build_emb_table()
    conf.map_insts_ids(dataset)
    conf.map_insts_ids(trains)
    conf.map_insts_ids(devs)
    conf.map_insts_ids(tests)

    # Subsample the trigger dataset to the configured percentage.
    dataset = reader.trigger_percentage(dataset, conf.percentage)
    encoder = SoftMatcher(conf, label_length)
    trainer = SoftMatcherTrainer(encoder, conf, devs, tests)

    # matching module training
    random.shuffle(dataset)
    trainer.train_model(conf.num_epochs_soft, dataset)
    logits, predicted, triggers = trainer.get_triggervec(dataset)
    # all the trigger vectors, trigger type, string name of the trigger
    triggers_remove = remove_duplicates(logits, predicted, triggers, dataset)

    # Split the training data: the tail fraction is stripped of its gold
    # labels (output_ids = None) and used as "unlabeled" self-training input.
    numbers = int(len(trains) * (1 - opt.unlabeled_percentage))
    print("number of train instances : ", numbers)
    # NOTE(review): initial_trains is never used below in this block —
    # kept for parity with the original; confirm against the full file.
    initial_trains = trains[:numbers]
    unlabeled_x = trains[numbers:]
    for data in unlabeled_x:
        data.output_ids = None

    # sequence labeling module self-training
    random.shuffle(dataset)
    inference = SoftSequence(conf, encoder)
    sequence_trainer = SoftSequenceTrainer(inference, conf, devs, tests,
                                           triggers_remove)
    sequence_trainer.self_training(conf.num_epochs, dataset, unlabeled_x)
# NOTE(review): this line is whitespace-mangled. It contains (a) the tail of a
# parse_arguments(...) definition — the end of an add_argument(... default=100 ...)
# call, a loop printing every parsed option, and `return args` — whose `def` line
# and earlier arguments are not visible in this chunk, followed by (b) module-level
# statements that duplicate part of main()'s setup (reading the trigger dataset and
# dev/test splits, BERT vector loading, IOBES conversion) but omit the train-split
# read; the run appears cut off at `conf.optimizer = ...`. Left byte-identical —
# the original formatting and the missing start of the definition must be recovered
# from the full file before this can be safely restructured.
default=100, help="how much percentage of training dataset to use") args = parser.parse_args() for k in args.__dict__: print(k + ": " + str(args.__dict__[k])) return args parser = argparse.ArgumentParser() opt = parse_arguments(parser) conf = Config(opt) reader = Reader(conf.digit2zero) dataset, max_length, label_length = reader.read_trigger_txt( conf.trigger_file, -1) reader.merge_labels(dataset) devs = reader.read_txt(conf.dev_file, conf.dev_num) tests = reader.read_txt(conf.test_file, conf.test_num) print(len(dataset)) if conf.context_emb == ContextEmb.bert: print('Loading the BERT vectors for all datasets.') conf.context_emb_size = load_bert_vec( conf.trigger_file + "." + conf.context_emb.name + ".vec", dataset) # setting for data conf.use_iobes(dataset) conf.use_iobes(devs) conf.use_iobes(tests) conf.optimizer = opt.trig_optimizer