def setup(self, config):
    """Load the parser, the corpus to attack and the char-embedding helpers.

    Stores ``config`` on the instance, builds ``self.task`` from the saved
    vocabulary and the best parser checkpoint, loads ``self.corpus`` and
    wraps it in ``self.loader``. It then registers a backward hook on the
    character embedding that publishes ``grad_out[0]`` under the
    ``'embed_grad'`` key via ``ram_write``, calls ``eval()`` on the parser,
    builds ``self.embed_searcher`` over the character embedding weights and
    seeds both ``random`` and ``torch`` with 1.

    Args:
        config: run configuration. This method reads ``vocab``,
            ``parser_model``, ``hk_training_set`` and one of ``ftrain`` /
            ``fdata``.
    """
    self.config = config

    print("Load the models")
    vocab = torch.load(config.vocab)  # type: Vocab
    parser = load_parser(fetch_best_ckpt_name(config.parser_model))
    self.task = ParserTask(vocab, parser)

    print("Load the dataset")
    # Only read the corpus that is actually attacked: the training file is
    # needed solely when ``hk_training_set`` is 'on', so it is no longer
    # parsed (and thrown away) in the other case.
    if config.hk_training_set == 'on':
        corpus_path = config.ftrain
    else:
        corpus_path = config.fdata
    self.corpus = Corpus.load(corpus_path)
    dataset = TextDataset(vocab.numericalize(self.corpus, True))
    # set the data loader
    self.loader = DataLoader(dataset=dataset, collate_fn=collate_fn)

    # NOTE(review): ``self.parser`` and ``self.vocab`` are not assigned in
    # this method; presumably the class exposes them (e.g. as properties
    # over ``self.task``) -- confirm against the class definition.
    def embed_backward_hook(module, grad_in, grad_out):
        # Publish the gradient of the embedding output so that code outside
        # the backward pass can read it back under the 'embed_grad' key.
        ram_write('embed_grad', grad_out[0])

    self.parser.char_lstm.embed.register_backward_hook(embed_backward_hook)
    # self.parser.embed.register_backward_hook(embed_backward_hook)
    self.parser.eval()

    # The lambdas look ``self.vocab`` up lazily, at call time.
    self.embed_searcher = EmbeddingSearcher(
        embed=self.parser.char_lstm.embed.weight,
        idx2word=lambda x: self.vocab.chars[x],
        word2idx=lambda x: self.vocab.char_dict[x])

    # Fixed seeds for both RNGs used by this method's class.
    random.seed(1)
    torch.manual_seed(1)
def __call__(self, config):
    """Evaluate the best parser checkpoint on ``config.fdata``.

    Loads the vocabulary and the parser, optionally a POS tagger (when
    ``config.pred_tag`` is truthy), batches the corpus and prints the loss
    and metric returned by ``ParserTask.evaluate``.

    Args:
        config: run configuration. Reads ``vocab``, ``parser_model``,
            ``pred_tag``, ``tagger_model``, ``fdata``, ``batch_size``,
            ``buckets`` and ``punct``.
    """
    print("Load the models")
    vocab = torch.load(config.vocab)
    model = load_parser(fetch_best_ckpt_name(config.parser_model))
    task = ParserTask(vocab, model)
    # The tagger is optional; ``None`` is handed to ``evaluate`` when
    # predicted tags are not requested.
    tagger = (PosTagger.load(fetch_best_ckpt_name(config.tagger_model))
              if config.pred_tag else None)

    print("Load the dataset")
    corpus = Corpus.load(config.fdata)
    numericalized = vocab.numericalize(corpus)
    # set the data loader
    loader = batchify(TextDataset(numericalized),
                      config.batch_size,
                      config.buckets)

    print("Evaluate the dataset")
    loss, metric = task.evaluate(loader, config.punct, tagger, True)
    print(f"Loss: {loss:.4f} {metric}")
def pre_attack(self, config):
    """Load the model and the corpus that an attack will run on.

    Sets ``self.vocab``, ``self.parser`` and ``self.task`` as side effects.

    Args:
        config: run configuration. Reads ``vocab``, ``parser_model`` and
            ``fdata``.

    Returns:
        A ``(corpus, loader)`` pair: the corpus loaded from ``config.fdata``
        and a ``DataLoader`` over its numericalized form.
    """
    print("Load the models")
    self.vocab = torch.load(config.vocab)
    checkpoint = fetch_best_ckpt_name(config.parser_model)
    self.parser = load_parser(checkpoint)
    self.task = ParserTask(self.vocab, self.parser)

    print("Load the dataset")
    corpus = Corpus.load(config.fdata)
    numericalized = self.vocab.numericalize(corpus, training=True)
    loader = DataLoader(dataset=TextDataset(numericalized),
                        collate_fn=collate_fn)
    return corpus, loader
class Augmentation(Attack):
    """Build an augmented copy of the training corpus.

    Every original training sentence is kept; a random fraction of them
    (controlled by ``config.augmentation_rate``) additionally gets a
    perturbed variant appended right after it. The result is saved under
    ``config.augmentation_dir``.
    """

    def get_attack_seq_generator(self, config):
        """Return the perturbation generator selected by ``config``.

        ``config.input == 'char'`` selects ``CharTypo``. Otherwise
        ``config.blackbox_method`` picks ``InsertingPunct`` (``'insert'``),
        ``Substituting`` (``'substitute'``) or ``DeletingPunct``
        (``'delete'``).

        Raises:
            ValueError: for non-char input when ``config.blackbox_method``
                is none of the three supported values. Previously this fell
                through and returned ``None``, which only failed later with
                an ``AttributeError`` on the first augmented sentence.
        """
        method = config.blackbox_method
        input_type = config.input
        if input_type == 'char':
            return CharTypo(config, self.vocab)
        if method == 'insert':
            return InsertingPunct(config, self.vocab)
        if method == 'substitute':
            return Substituting(config, self.task, self.vocab)
        if method == 'delete':
            return DeletingPunct(config, self.vocab)
        raise ValueError(
            f"unsupported blackbox_method {method!r}; "
            "expected 'insert', 'substitute' or 'delete'")

    def __call__(self, config):
        """Generate the augmented training corpus and write it to disk.

        Args:
            config: run configuration. Reads ``vocab``, ``parser_model``,
                ``ftrain``, ``input``, ``blackbox_method``,
                ``augmentation_rate``, ``mst``, ``augmentation_dir`` and
                ``revised_rate``.
        """
        self.vocab = torch.load(config.vocab)
        self.parser = load_parser(fetch_best_ckpt_name(config.parser_model))
        self.task = ParserTask(self.vocab, self.parser)

        # load training data
        corpus = Corpus.load(config.ftrain)
        dataset = TextDataset(self.vocab.numericalize(corpus, training=True))
        # No batch size is passed, so this relies on the DataLoader default
        # for ``index`` below to line up with ``corpus.sentences``.
        loader = DataLoader(dataset=dataset, collate_fn=collate_fn)

        augmentation_corpus = Corpus([])
        training_data_number = len(corpus.sentences)
        self.attack_seq_generator = self.get_attack_seq_generator(config)

        # random prob to decide whether to change a specific training data
        # if prob[index] < augmentation_rate, augmented.
        prob = np.random.uniform(0.0, 1.0, size=(training_data_number,))

        for index, (seq_idx, tag_idx, chars, arcs, rels) in enumerate(loader):
            sentence = corpus.sentences[index]
            # The original sentence is always kept.
            augmentation_corpus.sentences.append(sentence)
            if index % 1000 == 0:
                print("{} sentences have processed! ".format(index))

            if prob[index] < config.augmentation_rate:
                seqs = self.get_seqs_name(seq_idx)
                tags = self.get_tags_name(tag_idx)
                mask = self.get_mask(seq_idx,
                                     self.vocab.pad_index,
                                     punct_list=self.vocab.puncts)
                # Result of evaluating the unperturbed sentence; handed to
                # the generator as ``raw_metric``.
                raw_metric = self.task.evaluate(
                    [(seq_idx, tag_idx, chars, arcs, rels)],
                    mst=config.mst)
                # The first element of ``seqs`` / ``augmentation_seq`` is
                # skipped -- presumably a root/BOS placeholder; confirm.
                augmentation_seq, _, _, _, _, _ = \
                    self.attack_seq_generator.generate_attack_seq(
                        ' '.join(seqs[1:]), seq_idx, tags, tag_idx,
                        chars, arcs, rels, mask, raw_metric)
                augmentation_corpus.sentences.append(
                    init_sentence(sentence.FORM,
                                  tuple(augmentation_seq[1:]),
                                  sentence.POS,
                                  sentence.HEAD,
                                  sentence.DEPREL))

        if config.input == 'char':
            saved_file = '{}/ptb_train_typo_only_substitute_{:.0f}%_{}.sd'.format(
                config.augmentation_dir,
                config.augmentation_rate*100,
                config.revised_rate)
        else:
            saved_file = '{}/ptb_train_{:.0f}%_{}.sd'.format(
                config.augmentation_dir,
                config.augmentation_rate*100,
                config.revised_rate)

        print("Complete! {} sentences have processed!".format(training_data_number))
        print("Current training data number is {}.".format(len(augmentation_corpus.sentences)))
        print("The augmentation data are saved to file {}".format(saved_file))
        augmentation_corpus.save(saved_file)
def __call__(self, config):
    """Predict tags, heads and relations for ``config.fdata`` and save them.

    The predictions returned by ``ParserTask.predict`` are written back onto
    the corpus, which is then saved to
    ``<config.result_path>/raw_result.conllx``.

    Args:
        config: run configuration. Reads ``vocab``, ``parser_model``,
            ``pred_tag``, ``tagger_model``, ``fdata``, ``batch_size`` and
            ``result_path``.
    """
    print("Load the models")
    vocab = torch.load(config.vocab)
    model = load_parser(fetch_best_ckpt_name(config.parser_model))
    task = ParserTask(vocab, model)
    # The tagger stays ``None`` unless predicted tags are requested.
    tagger = None
    if config.pred_tag:
        tagger = PosTagger.load(fetch_best_ckpt_name(config.tagger_model))

    print("Load the dataset")
    corpus = Corpus.load(config.fdata)
    numericalized = vocab.numericalize(corpus, training=False)
    # set the data loader
    loader = batchify(TextDataset(numericalized), config.batch_size)

    print("Make predictions on the dataset")
    predicted = task.predict(loader, tagger)
    corpus.tags, corpus.heads, corpus.rels = predicted

    saved_path = f'{config.result_path}/raw_result.conllx'
    print(f"Save the predicted result to {saved_path}")
    corpus.save(saved_path)
def __call__(self, config):
    """Train a parser or a POS tagger and report train/dev/test scores.

    Steps: load the three corpora, load or build-and-cache the vocabulary,
    push the vocabulary sizes into ``config``, build the bucketed loaders,
    create the model selected by ``config.train_task``, then train for up to
    ``config.epochs`` epochs. A checkpoint ``<model path>.<epoch>`` is
    written whenever the dev metric improves after the first
    ``config.patience`` epochs; training stops once ``config.patience``
    epochs pass without a new best. The best checkpoint is finally copied to
    ``<model path>.best``, reloaded and evaluated on the test set(s).

    If no checkpoint was written during this run (for example when
    ``config.epochs <= config.patience``), the copy/reload step is skipped
    and the in-memory model is evaluated instead. Previously this case tried
    to copy ``<model path>.1``, a file this run never wrote.

    Args:
        config: run configuration. Reads the corpus paths (``ftrain``,
            ``fdev``, ``ftest``), ``vocab``, ``fembed``, ``unk``,
            ``batch_size``, ``buckets``, ``train_task``,
            ``augmentation_training``, ``augmentation_test_file``, the
            optimizer settings (``lr``, ``beta_1``, ``beta_2``, ``epsilon``,
            ``decay``, ``steps``), ``epochs``, ``patience``, ``punct`` and
            ``parser_model`` / ``tagger_model``. It is also updated in place
            with the vocabulary-derived sizes.

    Raises:
        AssertionError: if ``config.train_task`` is neither ``'parser'`` nor
            ``'tagger'``.
    """
    print("Preprocess the data")
    train = Corpus.load(config.ftrain)
    dev = Corpus.load(config.fdev)
    test = Corpus.load(config.ftest)
    # Reuse a cached vocabulary when the file exists; otherwise build it
    # from the training corpus, attach the pretrained embeddings and cache it.
    if os.path.exists(config.vocab):
        vocab = torch.load(config.vocab)
    else:
        vocab = Vocab.from_corpus(corpus=train, min_freq=2)
        vocab.read_embeddings(Pretrained.load(config.fembed, config.unk))
        torch.save(vocab, config.vocab)
    config.update({
        'n_words': vocab.n_train_words,
        'n_tags': vocab.n_tags,
        'n_rels': vocab.n_rels,
        'n_chars': vocab.n_chars,
        'pad_index': vocab.pad_index,
        'unk_index': vocab.unk_index
    })
    print(vocab)

    print("Load the dataset")
    trainset = TextDataset(vocab.numericalize(train))
    devset = TextDataset(vocab.numericalize(dev))
    testset = TextDataset(vocab.numericalize(test))
    # set the data loaders
    train_loader = batchify(dataset=trainset,
                            batch_size=config.batch_size,
                            n_buckets=config.buckets,
                            shuffle=True)
    dev_loader = batchify(dataset=devset,
                          batch_size=config.batch_size,
                          n_buckets=config.buckets)
    test_loader = batchify(dataset=testset,
                           batch_size=config.batch_size,
                           n_buckets=config.buckets)
    print(f"{'train:':6} {len(trainset):5} sentences in total, "
          f"{len(train_loader):3} batches provided")
    print(f"{'dev:':6} {len(devset):5} sentences in total, "
          f"{len(dev_loader):3} batches provided")
    print(f"{'test:':6} {len(testset):5} sentences in total, "
          f"{len(test_loader):3} batches provided")

    print("Create the models")
    assert config.train_task in ['parser', 'tagger']
    is_training_parser = config.train_task == 'parser'

    if config.augmentation_training:
        # Extra test set, evaluated alongside the regular one. Its log
        # lines reuse the 'test:' label.
        aug_test = Corpus.load(config.augmentation_test_file)
        aug_testset = TextDataset(vocab.numericalize(aug_test))
        aug_test_loader = batchify(dataset=aug_testset,
                                   batch_size=config.batch_size,
                                   n_buckets=config.buckets)
        print(f"{'test:':6} {len(aug_testset):5} sentences in total, "
              f"{len(aug_test_loader):3} batches provided")

    if is_training_parser:
        model = init_parser(config, vocab.embeddings)
        task = ParserTask(vocab, model)
        best_e, best_metric = 1, ParserMetric()
    else:
        model = PosTagger(config, vocab.embeddings)
        task = TaggerTask(vocab, model)
        best_e, best_metric = 1, TaggerMetric()

    if torch.cuda.is_available():
        model = model.cuda()
    print(f"{model}\n")

    total_time = timedelta()
    task.optimizer = Adam(task.model.parameters(), config.lr,
                          (config.beta_1, config.beta_2), config.epsilon)
    task.scheduler = ExponentialLR(task.optimizer,
                                   config.decay**(1 / config.steps))

    # ``best_e`` starts at 1 even though nothing has been written yet, so
    # track separately whether this run produced a checkpoint.
    saved = False
    # Keeps ``epoch`` bound after the loop when ``config.epochs`` < 1.
    epoch = 0

    for epoch in range(1, config.epochs + 1):
        start = datetime.now()
        # train one epoch and update the parameters
        task.train(train_loader)

        print(f"Epoch {epoch} / {config.epochs}:")
        loss, train_metric = task.evaluate(train_loader, config.punct)
        print(f"{'train:':6} Loss: {loss:.4f} {train_metric}")
        loss, dev_metric = task.evaluate(dev_loader, config.punct)
        print(f"{'dev:':6} Loss: {loss:.4f} {dev_metric}")
        loss, test_metric = task.evaluate(test_loader, config.punct)
        print(f"{'test:':6} Loss: {loss:.4f} {test_metric}")
        if config.augmentation_training:
            loss, aug_test_metric = task.evaluate(aug_test_loader,
                                                  config.punct)
            print(f"{'test:':6} Loss: {loss:.4f} {aug_test_metric}")

        t = datetime.now() - start
        # Checkpoints are only considered once ``patience`` epochs are over.
        if dev_metric > best_metric and epoch > config.patience:
            best_e, best_metric = epoch, dev_metric
            if is_training_parser:
                task.model.save(config.parser_model + f".{best_e}")
            else:
                task.model.save(config.tagger_model + f".{best_e}")
            saved = True
            print(f"{t}s elapsed (saved)\n")
        else:
            print(f"{t}s elapsed\n")
        sys.stdout.flush()
        total_time += t
        # Early stopping: no new best for ``patience`` epochs.
        if epoch - best_e >= config.patience:
            break

    if saved:
        if is_training_parser:
            copyfile(config.parser_model + f'.{best_e}',
                     config.parser_model + '.best')
            task.model = load_parser(config.parser_model + f".{best_e}")
        else:
            copyfile(config.tagger_model + f'.{best_e}',
                     config.tagger_model + '.best')
            task.model = PosTagger.load(config.tagger_model + f".{best_e}")
    else:
        # Nothing was written in this run: do not touch a missing (or
        # stale) ``.1`` file, evaluate the model that is in memory.
        print("no checkpoint was saved during training; "
              "evaluating the current model")

    loss, metric = task.evaluate(test_loader, config.punct)
    print(f"max score of dev is {best_metric.score:.2%} at epoch {best_e}")
    print(f"the score of test at epoch {best_e} is {metric.score:.2%}")
    if config.augmentation_training:
        loss, metric = task.evaluate(aug_test_loader, config.punct)
        print(
            f"the score of aug test at epoch {best_e} is {metric.score:.2%}"
        )
    if epoch > 0:
        print(f"average time of each epoch is {total_time / epoch}s")
    print(f"{total_time}s elapsed")