Python ParserTask 예제들, dpattack.task.ParserTask Python 예제들

예제 #1

0

파일 보기

파일: ihackc.py 프로젝트: zjiehang/DPAttack

    def setup(self, config):
        """Load the parser task and data, then wire up the char-embedding hook.

        Args:
            config: configuration object. This method reads ``vocab``,
                ``parser_model``, ``ftrain``, ``hk_training_set`` and
                ``fdata`` from it.

        Side effects:
            Sets ``self.config``, ``self.task``, ``self.corpus``,
            ``self.loader`` and ``self.embed_searcher``; registers a backward
            hook on the parser's char embedding; switches the parser to eval
            mode; seeds ``random`` and ``torch`` with 1.

        NOTE(review): ``self.parser`` and ``self.vocab`` are read below but
        never assigned in this method -- presumably they are provided
        elsewhere in the class (e.g. derived from ``self.task``); confirm.
        """
        self.config = config

        print("Load the models")
        vocab = torch.load(config.vocab)  # type: Vocab
        checkpoint = fetch_best_ckpt_name(config.parser_model)
        parser = load_parser(checkpoint)
        self.task = ParserTask(vocab, parser)

        print("Load the dataset")
        # The training corpus is always read, even when it ends up unused.
        training_corpus = Corpus.load(config.ftrain)
        attack_training_set = config.hk_training_set == 'on'
        self.corpus = (training_corpus if attack_training_set
                       else Corpus.load(config.fdata))
        numericalized = vocab.numericalize(self.corpus, True)
        self.loader = DataLoader(dataset=TextDataset(numericalized),
                                 collate_fn=collate_fn)

        def store_embed_grad(module, grad_in, grad_out):
            # Stash the first output gradient under the 'embed_grad' key.
            ram_write('embed_grad', grad_out[0])

        char_embedding = self.parser.char_lstm.embed
        char_embedding.register_backward_hook(store_embed_grad)
        self.parser.eval()

        # Searcher over the char-embedding matrix, with index <-> char lookups
        # backed by the vocabulary.
        self.embed_searcher = EmbeddingSearcher(
            embed=self.parser.char_lstm.embed.weight,
            idx2word=lambda x: self.vocab.chars[x],
            word2idx=lambda x: self.vocab.char_dict[x])

        # Fixed seeds for both RNGs used by this code path.
        for seed_fn in (random.seed, torch.manual_seed):
            seed_fn(1)

예제 #2

0

파일 보기

    def __call__(self, config):
        """Evaluate the best saved parser on ``config.fdata`` and print the result.

        Args:
            config: configuration object. This method reads ``vocab``,
                ``parser_model``, ``pred_tag``, ``tagger_model``, ``fdata``,
                ``batch_size``, ``buckets`` and ``punct`` from it.
        """
        print("Load the models")
        vocab = torch.load(config.vocab)
        parser_ckpt = fetch_best_ckpt_name(config.parser_model)
        task = ParserTask(vocab, load_parser(parser_ckpt))

        # A POS tagger is loaded only when predicted tags are requested.
        tagger = None
        if config.pred_tag:
            tagger_ckpt = fetch_best_ckpt_name(config.tagger_model)
            tagger = PosTagger.load(tagger_ckpt)

        print("Load the dataset")
        corpus = Corpus.load(config.fdata)
        numericalized = vocab.numericalize(corpus)
        loader = batchify(TextDataset(numericalized),
                          config.batch_size,
                          config.buckets)

        print("Evaluate the dataset")
        result = task.evaluate(loader, config.punct, tagger, True)
        loss, metric = result
        print(f"Loss: {loss:.4f} {metric}")

예제 #3

0

파일 보기

파일: attack.py 프로젝트: zjiehang/DPAttack

    def pre_attack(self, config):
        """Load vocabulary, parser and data ahead of an attack run.

        Args:
            config: configuration object. This method reads ``vocab``,
                ``parser_model`` and ``fdata`` from it.

        Returns:
            A ``(corpus, loader)`` pair: the corpus loaded from
            ``config.fdata`` and a ``DataLoader`` over its numericalized form.

        Side effects:
            Sets ``self.vocab``, ``self.parser`` and ``self.task``.
        """
        print("Load the models")
        self.vocab = torch.load(config.vocab)
        best_checkpoint = fetch_best_ckpt_name(config.parser_model)
        self.parser = load_parser(best_checkpoint)
        self.task = ParserTask(self.vocab, self.parser)

        print("Load the dataset")
        corpus = Corpus.load(config.fdata)
        numericalized = self.vocab.numericalize(corpus, training=True)
        loader = DataLoader(dataset=TextDataset(numericalized),
                            collate_fn=collate_fn)
        return corpus, loader

예제 #4

0

파일 보기

파일: augmentation.py 프로젝트: zjiehang/DPAttack

class Augmentation(Attack):
    """Build an augmented training corpus from attacked training sentences.

    Every training sentence is copied to the output corpus. A randomly chosen
    subset (controlled by ``config.augmentation_rate``) additionally gets a
    perturbed variant appended right after the original.
    """

    def get_attack_seq_generator(self, config):
        """Return the perturbation generator selected by ``config``.

        Args:
            config: configuration object. This method reads
                ``blackbox_method`` and ``input`` from it.

        Returns:
            ``CharTypo`` when ``config.input == 'char'``; otherwise
            ``InsertingPunct``, ``Substituting`` or ``DeletingPunct`` for
            ``blackbox_method`` values ``'insert'``, ``'substitute'`` and
            ``'delete'`` respectively.

        Raises:
            ValueError: if ``config.blackbox_method`` is not one of the
                supported values (and ``config.input`` is not ``'char'``).
        """
        method = config.blackbox_method
        input_type = config.input
        if input_type == 'char':
            return CharTypo(config, self.vocab)
        if method == 'insert':
            return InsertingPunct(config, self.vocab)
        if method == 'substitute':
            return Substituting(config, self.task, self.vocab)
        if method == 'delete':
            return DeletingPunct(config, self.vocab)
        # Fail loudly here instead of returning None and crashing later with
        # an opaque AttributeError when the generator is first used.
        raise ValueError(
            "Unsupported blackbox_method {!r}; expected 'insert', "
            "'substitute' or 'delete'".format(method))

    def __call__(self, config):
        """Generate the augmented training corpus and save it to disk.

        Args:
            config: configuration object. This method reads ``vocab``,
                ``parser_model``, ``ftrain``, ``augmentation_rate``, ``mst``,
                ``input``, ``augmentation_dir`` and ``revised_rate`` from it
                (plus whatever ``get_attack_seq_generator`` reads).

        Side effects:
            Sets ``self.vocab``, ``self.parser``, ``self.task`` and
            ``self.attack_seq_generator``; prints progress; writes the
            augmented corpus to a file under ``config.augmentation_dir``.
        """
        self.vocab = torch.load(config.vocab)
        self.parser = load_parser(fetch_best_ckpt_name(config.parser_model))
        self.task = ParserTask(self.vocab, self.parser)
        # load training data
        corpus = Corpus.load(config.ftrain)
        dataset = TextDataset(self.vocab.numericalize(corpus, training=True))
        # No batch_size/shuffle is passed, so with the DataLoader defaults each
        # batch holds one sentence in dataset order; the loop below relies on
        # batch `index` matching corpus.sentences[index].
        loader = DataLoader(dataset=dataset, collate_fn=collate_fn)
        augmentation_corpus = Corpus([])
        training_data_number = len(corpus.sentences)
        self.attack_seq_generator = self.get_attack_seq_generator(config)

        # random prob to decide whether to change a specific training data
        # if prob[index] < augmentation_rate, augmented.
        prob = np.random.uniform(0.0, 1.0, size=(training_data_number,))
        for index, (seq_idx, tag_idx, chars, arcs, rels) in enumerate(loader):
            sentence = corpus.sentences[index]
            # The original sentence is always kept.
            augmentation_corpus.sentences.append(sentence)
            if index % 1000 == 0:
                print("{} sentences have processed! ".format(index))

            if prob[index] >= config.augmentation_rate:
                continue

            seqs = self.get_seqs_name(seq_idx)
            tags = self.get_tags_name(tag_idx)
            mask = self.get_mask(seq_idx,
                                 self.vocab.pad_index,
                                 punct_list=self.vocab.puncts)
            raw_metric = self.task.evaluate(
                [(seq_idx, tag_idx, chars, arcs, rels)], mst=config.mst)
            # Only the first element of the generator's result is used here.
            augmentation_seq, _, _, _, _, _ = \
                self.attack_seq_generator.generate_attack_seq(
                    ' '.join(seqs[1:]), seq_idx, tags, tag_idx, chars, arcs,
                    rels, mask, raw_metric)
            # seqs[1:] / augmentation_seq[1:] skip the first position
            # (presumably a root/BOS token -- confirm against the vocabulary).
            augmentation_corpus.sentences.append(
                init_sentence(sentence.FORM,
                              tuple(augmentation_seq[1:]),
                              sentence.POS,
                              sentence.HEAD,
                              sentence.DEPREL))

        # Char-level runs get a distinct file stem; the rest of the name is
        # shared between both input types.
        if config.input == 'char':
            file_stem = 'ptb_train_typo_only_substitute'
        else:
            file_stem = 'ptb_train'
        saved_file = '{}/{}_{:.0f}%_{}.sd'.format(
            config.augmentation_dir, file_stem,
            config.augmentation_rate * 100, config.revised_rate)

        print("Complete! {} sentences have processed!".format(training_data_number))
        print("Current training data number is {}.".format(len(augmentation_corpus.sentences)))
        print("The augmentation data are saved to file {}".format(saved_file))
        augmentation_corpus.save(saved_file)

예제 #5

0

파일 보기

    def __call__(self, config):
        """Predict tags, heads and relations for ``config.fdata`` and save them.

        Args:
            config: configuration object. This method reads ``vocab``,
                ``parser_model``, ``pred_tag``, ``tagger_model``, ``fdata``,
                ``batch_size`` and ``result_path`` from it.

        Side effects:
            Overwrites ``tags``, ``heads`` and ``rels`` on the loaded corpus
            with the predictions and saves it as ``raw_result.conllx`` under
            ``config.result_path``.
        """
        print("Load the models")
        vocab = torch.load(config.vocab)
        parser_ckpt = fetch_best_ckpt_name(config.parser_model)
        task = ParserTask(vocab, load_parser(parser_ckpt))

        # The tagger is optional; without it None is handed to predict().
        tagger = None
        if config.pred_tag:
            tagger_ckpt = fetch_best_ckpt_name(config.tagger_model)
            tagger = PosTagger.load(tagger_ckpt)

        print("Load the dataset")
        corpus = Corpus.load(config.fdata)
        numericalized = vocab.numericalize(corpus, training=False)
        loader = batchify(TextDataset(numericalized), config.batch_size)

        print("Make predictions on the dataset")
        predictions = task.predict(loader, tagger)
        corpus.tags, corpus.heads, corpus.rels = predictions

        saved_path = '{}/raw_result.conllx'.format(config.result_path)
        print(f"Save the predicted result to {saved_path}")
        corpus.save(saved_path)

예제 #6

0

파일 보기

    def __call__(self, config):
        """Train a parser or a tagger, keeping the checkpoint that is best on dev.

        Args:
            config: configuration object. This method reads the corpus paths
                (``ftrain``, ``fdev``, ``ftest``), vocabulary/embedding
                settings (``vocab``, ``fembed``, ``unk``), batching settings
                (``batch_size``, ``buckets``), ``train_task``, the optional
                augmentation settings (``augmentation_training``,
                ``augmentation_test_file``), optimizer settings (``lr``,
                ``beta_1``, ``beta_2``, ``epsilon``, ``decay``, ``steps``),
                ``epochs``, ``patience``, ``punct`` and the checkpoint prefix
                (``parser_model`` or ``tagger_model``). It also calls
                ``config.update`` with sizes and indices taken from the
                vocabulary.

        Side effects:
            May create the vocabulary file, writes per-epoch checkpoints
            ``<prefix>.<epoch>`` plus a ``<prefix>.best`` copy, and prints
            progress to stdout.
        """
        print("Preprocess the data")
        train = Corpus.load(config.ftrain)
        dev = Corpus.load(config.fdev)
        test = Corpus.load(config.ftest)
        # Build and cache the vocabulary only when no saved one exists yet.
        if not os.path.exists(config.vocab):
            vocab = Vocab.from_corpus(corpus=train, min_freq=2)
            vocab.read_embeddings(Pretrained.load(config.fembed, config.unk))
            torch.save(vocab, config.vocab)
        else:
            vocab = torch.load(config.vocab)
        vocab_settings = {
            'n_words': vocab.n_train_words,
            'n_tags': vocab.n_tags,
            'n_rels': vocab.n_rels,
            'n_chars': vocab.n_chars,
            'pad_index': vocab.pad_index,
            'unk_index': vocab.unk_index
        }
        config.update(vocab_settings)
        print(vocab)

        print("Load the dataset")
        trainset = TextDataset(vocab.numericalize(train))
        devset = TextDataset(vocab.numericalize(dev))
        testset = TextDataset(vocab.numericalize(test))
        # All loaders share the same batching options; only training shuffles.
        batch_options = dict(batch_size=config.batch_size,
                             n_buckets=config.buckets)
        train_loader = batchify(dataset=trainset, shuffle=True, **batch_options)
        dev_loader = batchify(dataset=devset, **batch_options)
        test_loader = batchify(dataset=testset, **batch_options)
        for label, split, split_loader in (('train:', trainset, train_loader),
                                           ('dev:', devset, dev_loader),
                                           ('test:', testset, test_loader)):
            print(f"{label:6} {len(split):5} sentences in total, "
                  f"{len(split_loader):3} batches provided")

        print("Create the models")
        assert config.train_task in ['parser', 'tagger']
        is_training_parser = config.train_task == 'parser'

        if config.augmentation_training:
            aug_test = Corpus.load(config.augmentation_test_file)
            aug_testset = TextDataset(vocab.numericalize(aug_test))
            aug_test_loader = batchify(dataset=aug_testset, **batch_options)
            print(f"{'test:':6} {len(aug_testset):5} sentences in total, "
                  f"{len(aug_test_loader):3} batches provided")

        # Everything task-specific is chosen once: model, task wrapper, the
        # initial "best" metric, the checkpoint prefix and the reload function.
        if is_training_parser:
            model = init_parser(config, vocab.embeddings)
            task = ParserTask(vocab, model)
            best_metric = ParserMetric()
            ckpt_prefix = config.parser_model
            reload_model = load_parser
        else:
            model = PosTagger(config, vocab.embeddings)
            task = TaggerTask(vocab, model)
            best_metric = TaggerMetric()
            ckpt_prefix = config.tagger_model
            reload_model = PosTagger.load
        best_e = 1

        if torch.cuda.is_available():
            model = model.cuda()
        print(f"{model}\n")
        total_time = timedelta()
        task.optimizer = Adam(task.model.parameters(), config.lr,
                              (config.beta_1, config.beta_2), config.epsilon)
        task.scheduler = ExponentialLR(task.optimizer,
                                       config.decay**(1 / config.steps))

        def evaluate_and_report(label, split_loader):
            # Evaluate one split, print its loss/metric line, return the metric.
            split_loss, split_metric = task.evaluate(split_loader, config.punct)
            print(f"{label:6} Loss: {split_loss:.4f} {split_metric}")
            return split_metric

        for epoch in range(1, config.epochs + 1):
            start = datetime.now()
            # train one epoch and update the parameters
            task.train(train_loader)

            print(f"Epoch {epoch} / {config.epochs}:")
            evaluate_and_report('train:', train_loader)
            dev_metric = evaluate_and_report('dev:', dev_loader)
            evaluate_and_report('test:', test_loader)
            if config.augmentation_training:
                evaluate_and_report('test:', aug_test_loader)

            t = datetime.now() - start

            # A checkpoint is written only for a dev improvement that happens
            # after the first `patience` epochs.
            if dev_metric > best_metric and epoch > config.patience:
                best_e, best_metric = epoch, dev_metric
                task.model.save(ckpt_prefix + f".{best_e}")
                print(f"{t}s elapsed (saved)\n")
            else:
                print(f"{t}s elapsed\n")
            sys.stdout.flush()
            total_time += t
            # Stop once `patience` epochs have passed since the best epoch.
            if epoch - best_e >= config.patience:
                break

        # Publish the best checkpoint as "<prefix>.best" and reload it for the
        # final evaluation.
        copyfile(ckpt_prefix + f'.{best_e}', ckpt_prefix + '.best')
        task.model = reload_model(ckpt_prefix + f".{best_e}")
        loss, metric = task.evaluate(test_loader, config.punct)

        print(f"max score of dev is {best_metric.score:.2%} at epoch {best_e}")
        print(f"the score of test at epoch {best_e} is {metric.score:.2%}")

        if config.augmentation_training:
            loss, metric = task.evaluate(aug_test_loader, config.punct)
            print(
                f"the score of aug test at epoch {best_e} is {metric.score:.2%}"
            )

        print(f"average time of each epoch is {total_time / epoch}s")
        print(f"{total_time}s elapsed")