Example #1 (score: 0)
File: main.py — Project: wzc118/SKE_BERT_RE
 def _init_model(self, is_xgb=False):
     """Instantiate the model selected by ``self.model_name`` and move it
     onto ``self.hyper.device``.

     For 'pso_2' an auxiliary predicate model (``self.model_p``) is built
     alongside the main model.  An unrecognised name leaves ``self.model``
     untouched.
     """
     device = self.hyper.device
     chosen = self.model_name
     if chosen == 'selection':
         self.model = MultiHeadSelection(self.hyper, is_xgb).to(device)
     elif chosen == 'pso_1':
         self.model = P_Model_new(self.hyper, is_xgb).to(device)
     elif chosen == 'pso_2':
         self.model = SO_Model_New(self.hyper, is_xgb).to(device)
         self.model_p = P_Model_new(self.hyper, is_xgb).to(device)
Example #2 (score: 0)
 def _init_model(self):
     """Build the selection model, placing it on GPU ``self.gpu`` when
     CUDA is available and leaving it on CPU otherwise."""
     model = MultiHeadSelection(self.hyper)
     if torch.cuda.is_available():
         model = model.cuda(self.gpu)
     self.model = model
Example #3 (score: 0)
class Runner(object):
    """Drive one experiment end to end: preprocessing, training and
    evaluation of a MultiHeadSelection model.

    Hyper-parameters are read from ``experiments/<exp_name>.json`` and
    checkpoints are written to ``saved_models/<exp_name>_<epoch>``.
    """

    def __init__(self, exp_name: str):
        self.exp_name = exp_name
        self.model_dir = 'saved_models'

        self.hyper = Hyper(os.path.join('experiments',
                                        self.exp_name + '.json'))

        self.gpu = self.hyper.gpu
        self.preprocessor = None
        self.triplet_metrics = F1_triplet()
        self.ner_metrics = F1_ner()
        self.optimizer = None
        self.model = None

    def _optimizer(self, name, model):
        """Build and return the optimizer named *name* for *model*.

        Optimizers are constructed lazily: only the requested one is
        instantiated (the previous eager dict built all three — each
        binding the model's parameters — on every call).  An unknown
        name raises KeyError, as before.
        """
        factories = {
            'adam': lambda: Adam(model.parameters()),
            'sgd': lambda: SGD(model.parameters(), lr=0.5),
            'adamw': lambda: AdamW(model.parameters()),
        }
        return factories[name]()

    def _init_model(self):
        """Instantiate the model, on GPU ``self.gpu`` when CUDA is available."""
        self.model = MultiHeadSelection(self.hyper).cuda(
            self.gpu) if torch.cuda.is_available() else MultiHeadSelection(
                self.hyper)

    def preprocessing(self):
        """Build the experiment-specific preprocessor, then generate the
        relation vocabulary, preprocessed data, token vocabulary and BIO
        vocabulary.

        NOTE(review): an unrecognised ``exp_name`` leaves
        ``self.preprocessor`` as None and the calls below fail with
        AttributeError; kept as-is to preserve the recognised set.
        """
        if self.exp_name == 'conll_selection_re':
            self.preprocessor = Conll_selection_preprocessing(self.hyper)
        elif self.exp_name == 'chinese_selection_re':
            self.preprocessor = Chinese_selection_preprocessing(self.hyper)
        elif self.exp_name == 'conll_bert_re':
            self.preprocessor = Conll_bert_preprocessing(self.hyper)
        self.preprocessor.gen_relation_vocab()
        self.preprocessor.gen_all_data()
        self.preprocessor.gen_vocab(min_freq=1)
        # for ner only
        self.preprocessor.gen_bio_vocab()

    def run(self, mode: str):
        """Dispatch on *mode*: 'preprocessing', 'train' or 'evaluation'.

        Raises ValueError for any other mode string.
        """
        if mode == 'preprocessing':
            self.preprocessing()
        elif mode == 'train':
            self._init_model()
            self.optimizer = self._optimizer(self.hyper.optimizer, self.model)
            self.train()
        elif mode == 'evaluation':
            self._init_model()
            self.load_model(epoch=self.hyper.evaluation_epoch)
            self.evaluation()
        else:
            raise ValueError('invalid mode')

    def load_model(self, epoch: int):
        """Load the checkpoint saved for *epoch* into ``self.model``."""
        self.model.load_state_dict(
            torch.load(
                os.path.join(self.model_dir,
                             self.exp_name + '_' + str(epoch))))

    def save_model(self, epoch: int):
        """Save model weights under ``<model_dir>/<exp_name>_<epoch>``."""
        # makedirs(exist_ok=True) is race-free, unlike the previous
        # exists()+mkdir pair.
        os.makedirs(self.model_dir, exist_ok=True)
        torch.save(
            self.model.state_dict(),
            os.path.join(self.model_dir, self.exp_name + '_' + str(epoch)))

    def evaluation(self):
        """Evaluate on the dev set and print triplet and NER F1 metrics."""
        dev_set = Selection_Dataset(self.hyper, self.hyper.dev)
        loader = Selection_loader(dev_set,
                                  batch_size=self.hyper.eval_batch,
                                  pin_memory=True)
        self.triplet_metrics.reset()
        # BUG FIX: the NER metric was never reset, so its F1 accumulated
        # state across successive evaluation() calls during training.
        self.ner_metrics.reset()
        self.model.eval()

        pbar = tqdm(enumerate(BackgroundGenerator(loader)), total=len(loader))

        with torch.no_grad():
            for batch_ndx, sample in pbar:
                output = self.model(sample, is_train=False)
                self.triplet_metrics(output['selection_triplets'],
                                     output['spo_gold'])
                self.ner_metrics(output['gold_tags'], output['decoded_tag'])

            triplet_result = self.triplet_metrics.get_metric()
            ner_result = self.ner_metrics.get_metric()
            print('Triplets-> ' + ', '.join([
                "%s: %.4f" % (name[0], value)
                for name, value in triplet_result.items()
                if not name.startswith("_")
            ]) + ' ||' + 'NER->' + ', '.join([
                "%s: %.4f" % (name[0], value)
                for name, value in ner_result.items()
                if not name.startswith("_")
            ]))

    def train(self):
        """Train for ``hyper.epoch_num`` epochs, checkpointing every epoch
        and evaluating every ``hyper.print_epoch`` epochs once past epoch 3."""
        train_set = Selection_Dataset(self.hyper, self.hyper.train)
        loader = Selection_loader(train_set,
                                  batch_size=self.hyper.train_batch,
                                  pin_memory=True)

        for epoch in range(self.hyper.epoch_num):
            self.model.train()
            pbar = tqdm(enumerate(BackgroundGenerator(loader)),
                        total=len(loader))

            for batch_idx, sample in pbar:

                self.optimizer.zero_grad()
                output = self.model(sample, is_train=True)
                loss = output['loss']
                loss.backward()
                self.optimizer.step()

                pbar.set_description(output['description'](
                    epoch, self.hyper.epoch_num))

            self.save_model(epoch)

            if epoch % self.hyper.print_epoch == 0 and epoch > 3:
                self.evaluation()
Example #4 (score: 0)
 def _init_model(self):
     """Create the selection model directly on GPU ``self.gpu``."""
     model = MultiHeadSelection(self.hyper)
     self.model = model.cuda(self.gpu)
Example #5 (score: 0)
    # vocab = Vocabulary.from_instances(train_dataset)
    vocab = Vocabulary.from_instances(train_dataset)

    config = Config()

    EMBEDDING_DIM = 200
    HIDDEN_DIM = 300
    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

    lstm = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

    # model = LstmTagger(word_embeddings, lstm, vocab)
    model = MultiHeadSelection(config, word_embeddings, lstm, vocab)
    # model = crf_tagger.CrfTagger(vocab=vocab, encoder=lstm, text_field_embedder=word_embeddings)
    if torch.cuda.is_available():
        cuda_device = 3
        model = model.cuda(cuda_device)
    else:
        cuda_device = -1
    optimizer = optim.Adam(model.parameters())
    iterator = BucketIterator(batch_size=200,
                              sorting_keys=[("tokens", "num_tokens")])
    #   max_instances_in_memory=8000,
    #   cache_instances=True)
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
 def init_MultiHeadSelection(self):
     """Build the selection model on GPU ``self.gpu`` together with the
     optimizer named in the hyper-parameters."""
     model = MultiHeadSelection(self.hyper).cuda(self.gpu)
     self.model = model
     self.optimizer = self._optimizer(self.hyper.optimizer, model)
class Runner(object):
    """Experiment driver for BERT/ERNIE-based selection models on Chinese
    and NYT data: preprocessing, training and evaluation.

    The preprocessor / dataset / loader family is selected by the
    ``hyper.is_bert`` flag.
    """

    def __init__(self, exp_name: str):
        self.exp_name = exp_name
        self.model_dir = 'saved_models'

        # Hyper-parameters come from experiments/<exp_name>.json.
        self.hyper = Hyper(os.path.join('experiments',
                                        self.exp_name + '.json'))

        self.gpu = self.hyper.gpu

        # Pick the preprocessor matching the configured encoder/dataset.
        # NOTE(review): "bert_bilstem_crf" looks like a typo for
        # "bert_bilstm_crf", but the string must match the config value —
        # confirm before renaming it anywhere.
        if self.hyper.is_bert == 'ERNIE':
            self.preprocessor = Chinese_selection_preprocessing(self.hyper)

        elif self.hyper.is_bert == "bert_bilstem_crf":
            self.preprocessor = NYT_selection_preprocessing(self.hyper)

        elif self.hyper.is_bert == "nyt_bert_tokenizer":
            self.preprocessor = NYT_bert_selection_preprocessing(self.hyper)

        elif self.hyper.is_bert == "nyt11_bert_tokenizer":
            self.preprocessor = NYT11_bert_selection_preprocessing(self.hyper)

        elif self.hyper.is_bert == "nyt10_bert_tokenizer":
            self.preprocessor = NYT10_bert_selection_preprocessing(self.hyper)

        self.metrics = F1_triplet()

    def _optimizer(self, name, model):
        """Return the optimizer selected by *name* for *model*.

        NOTE(review): both optimizers are constructed eagerly on every
        call even though only one is returned.  Unknown names raise
        KeyError.
        """
        m = {
            'adam': Adam(model.parameters()),
            'sgd': SGD(model.parameters(), lr=0.01)
        }
        return m[name]

    def init_MultiHeadSelection(self):
        """Build the selection model on GPU ``self.gpu`` and its optimizer."""
        self.model = MultiHeadSelection(self.hyper).cuda(self.gpu)
        self.optimizer = self._optimizer(self.hyper.optimizer, self.model)

    def preprocessing(self):
        """Generate every vocabulary and the preprocessed data files."""
        self.preprocessor.gen_relation_vocab()
        self.preprocessor.gen_all_data()
        self.preprocessor.gen_vocab(min_freq=1)
        self.preprocessor.gen_ERNIE_vocab()
        # for ner only
        self.preprocessor.gen_bio_vocab()

    def run(self, mode: str):
        """Dispatch on *mode*: 'preprocessing', 'train' or 'evaluation'.

        Raises ValueError for any other mode string.
        """
        if mode == 'preprocessing':
            self.preprocessing()
        elif mode == 'train':
            self.init_MultiHeadSelection()

            #self.load_lastest_models()
            #self.load_model(40)
            self.train()
        elif mode == 'evaluation':
            self.init_MultiHeadSelection()
            #self.load_model(epoch=self.hyper.evaluation_epoch)
            # Evaluation restores the most recently written checkpoint.
            self.load_lastest_models()

            #self.load_model(0)
            self.evaluation()
        else:
            raise ValueError('invalid mode')

    def get_lastest_model_dir(self, model_dir: str):
        """Return the path of the most recently modified file inside
        *model_dir*, or '' when the directory is empty."""
        file_new = ''
        lists = os.listdir(model_dir)
        if len(lists) != 0:
            # Sort by modification time so the newest file ends up last.
            lists.sort(key=lambda fn: os.path.getmtime(model_dir + "/" + fn))
            file_new = os.path.join(model_dir, lists[-1])
        return file_new

    def load_lastest_models(self):
        """Load the newest checkpoint from ``self.model_dir``, if any."""
        model_dir = self.get_lastest_model_dir(self.model_dir)
        if model_dir != '':
            self.model.load_state_dict(torch.load(model_dir))
        return None

    def load_model(self, epoch: int):
        """Load the checkpoint saved for *epoch* into ``self.model``."""
        self.model.load_state_dict(
            torch.load(
                os.path.join(self.model_dir,
                             self.exp_name + '_' + str(epoch))))

    def save_model(self, epoch: int):
        """Save model weights under ``<model_dir>/<exp_name>_<epoch>``."""
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)
        torch.save(
            self.model.state_dict(),
            os.path.join(self.model_dir, self.exp_name + '_' + str(epoch)))

    def evaluation(self):
        """Evaluate on the dev set using the dataset/loader pair matching
        ``hyper.is_bert`` and print the F1 metric."""
        if self.hyper.is_bert == "nyt_bert_tokenizer" or \
                self.hyper.is_bert == "nyt11_bert_tokenizer" or \
                self.hyper.is_bert == "nyt10_bert_tokenizer":
            dev_set = Selection_bert_Nyt_Dataset(self.hyper, self.hyper.dev)
            loader = Selection_bert_loader(dev_set,
                                           batch_size=100,
                                           pin_memory=True)

        elif self.hyper.is_bert == "bert_bilstem_crf":
            dev_set = Selection_Nyt_Dataset(self.hyper, self.hyper.dev)
            loader = Selection_loader(dev_set, batch_size=100, pin_memory=True)

        else:
            dev_set = Selection_Dataset(self.hyper, self.hyper.dev)
            loader = Selection_loader(dev_set, batch_size=100, pin_memory=True)

        self.metrics.reset()
        self.model.eval()

        pbar = tqdm(enumerate(BackgroundGenerator(loader)), total=len(loader))

        with torch.no_grad():
            for batch_ndx, sample in pbar:
                output = self.model(sample, is_train=False)
                #self.metrics(output['selection_triplets'], output['spo_gold'])
                # NOTE(review): the same object is passed as both the
                # prediction and the gold argument (the commented call
                # above looks like the intended one) — this likely makes
                # the reported F1 meaningless; confirm against
                # F1_triplet's expected inputs.
                self.metrics(output, output)

            result = self.metrics.get_metric()
            print(', '.join([
                "%s: %.4f" % (name, value)
                for name, value in result.items() if not name.startswith("_")
            ]) + " ||")

    def train(self):
        """Train for ``hyper.epoch_num`` epochs, checkpointing each epoch
        and evaluating once past epoch 3."""

        # Choose the dataset/loader pair matching the configured tokenizer.
        if self.hyper.is_bert == "bert_bilstem_crf":
            train_set = Selection_Nyt_Dataset(self.hyper, self.hyper.train)
            loader = Selection_loader(train_set,
                                      batch_size=100,
                                      pin_memory=True)

        elif self.hyper.is_bert == "nyt_bert_tokenizer" or \
                self.hyper.is_bert == "nyt11_bert_tokenizer" or \
                self.hyper.is_bert == "nyt10_bert_tokenizer":
            train_set = Selection_bert_Nyt_Dataset(self.hyper,
                                                   self.hyper.train)
            loader = Selection_bert_loader(train_set,
                                           batch_size=100,
                                           pin_memory=True)

        else:
            train_set = Selection_Dataset(self.hyper, self.hyper.train)
            loader = Selection_loader(train_set,
                                      batch_size=100,
                                      pin_memory=True)

        for epoch in range(self.hyper.epoch_num):
            self.model.train()
            pbar = tqdm(enumerate(BackgroundGenerator(loader)),
                        total=len(loader))

            for batch_idx, sample in pbar:
                self.optimizer.zero_grad()
                output = self.model(sample, is_train=True)
                loss = output['loss']
                loss.backward()
                self.optimizer.step()

                pbar.set_description(output['description'](
                    epoch, self.hyper.epoch_num))

            self.save_model(epoch)
            if epoch > 3:
                self.evaluation()

            #if epoch >= 6:
            #    self.evaluation()
            # NOTE(review): the triple quote below opens a string literal
            # that is never closed in this excerpt (the block that
            # followed appears truncated) — remove it or restore its
            # closing quotes.
            '''
Example #8 (score: 0)
 def _init_model(self):
     """Place the selection model on GPU ``self.gpu`` and create the
     binary-cross-entropy-with-logits training criterion."""
     self.criterion = nn.BCEWithLogitsLoss()
     self.model = MultiHeadSelection(self.hyper).cuda(self.gpu)
Example #9 (score: 0)
class Runner(object):
    """Experiment driver for the BCE-trained selection model: the model
    returns raw logits and the loss is computed externally with
    ``nn.BCEWithLogitsLoss`` against ``sample.selection_id``.
    """

    def __init__(self, exp_name: str):
        self.exp_name = exp_name
        self.model_dir = 'saved_models'

        self.hyper = Hyper(os.path.join('experiments',
                                        self.exp_name + '.json'))

        self.gpu = self.hyper.gpu
        self.preprocessor = None
        self.selection_metrics = F1_selection()
        self.optimizer = None
        self.model = None

    def _optimizer(self, name, model):
        """Build and return the optimizer named *name* for *model*.

        Optimizers are constructed lazily: only the requested one is
        instantiated (the previous eager dict built all three on every
        call).  An unknown name raises KeyError, as before.
        """
        factories = {
            'adam': lambda: Adam(model.parameters()),
            'sgd': lambda: SGD(model.parameters(), lr=0.5),
            'adamw': lambda: AdamW(model.parameters()),
        }
        return factories[name]()

    def _init_model(self):
        """Create the model on GPU ``self.gpu`` and the BCE criterion."""
        # if hyper.use_multi_gpu == 1:
        # self.model = nn.DataParallel(MultiHeadSelection(self.hyper).cuda())
        # else:
        self.model = MultiHeadSelection(self.hyper).cuda(self.gpu)
        self.criterion = nn.BCEWithLogitsLoss()

    def preprocessing(self):
        """Build the preprocessor and generate vocabularies and data.

        NOTE(review): only 'DuIE_selection_re' is recognised; any other
        ``exp_name`` leaves ``self.preprocessor`` as None and the calls
        below fail with AttributeError (kept as-is).
        """
        if self.exp_name == 'DuIE_selection_re':
            self.preprocessor = Chinese_selection_preprocessing(self.hyper)
        self.preprocessor.gen_relation_vocab()
        self.preprocessor.gen_all_data()
        self.preprocessor.gen_vocab(min_freq=1)
        # for ner only
        self.preprocessor.gen_bio_vocab()

    def run(self, mode: str):
        """Dispatch on *mode*: 'preprocessing', 'train' or 'evaluation'.

        Raises ValueError for any other mode string.
        """
        if mode == 'preprocessing':
            self.preprocessing()
        elif mode == 'train':
            self._init_model()
            self.optimizer = self._optimizer(self.hyper.optimizer, self.model)
            self.train()
        elif mode == 'evaluation':
            self._init_model()
            self.load_model(epoch=self.hyper.evaluation_epoch)
            self.evaluation()
        else:
            raise ValueError('invalid mode')

    def load_model(self, epoch: int):
        """Load the checkpoint saved for *epoch* into ``self.model``."""
        self.model.load_state_dict(
            torch.load(
                os.path.join(self.model_dir,
                             self.exp_name + '_' + str(epoch))))

    def save_model(self, epoch: int):
        """Save model weights under ``<model_dir>/<exp_name>_<epoch>``."""
        # makedirs(exist_ok=True) is race-free, unlike the previous
        # exists()+mkdir pair.
        os.makedirs(self.model_dir, exist_ok=True)
        torch.save(
            self.model.state_dict(),
            os.path.join(self.model_dir, self.exp_name + '_' + str(epoch)))

    @staticmethod
    def description(epoch, epoch_num, output):
        """Progress-bar text showing the current loss and epoch counter."""
        return "L: {:.7f} epoch: {}/{}:".format(output.item(), epoch,
                                                epoch_num)

    def evaluation(self):
        """Evaluate on the dev set: threshold the sigmoid of the logits at
        0.5 and feed binary predictions/labels to the selection F1 metric."""
        dev_set = Selection_Dataset(self.hyper, self.hyper.dev)
        loader = Selection_loader(dev_set,
                                  batch_size=self.hyper.eval_batch,
                                  pin_memory=True)
        self.selection_metrics.reset()
        self.model.eval()

        pbar = tqdm(enumerate(BackgroundGenerator(loader)), total=len(loader))

        with torch.no_grad():
            for batch_ndx, sample in pbar:
                pred = self.model(sample, is_train=False)
                pred = torch.sigmoid(pred) > 0.5
                labels = sample.selection_id
                self.selection_metrics(
                    np.array(pred.cpu().numpy(), dtype=int).tolist(),
                    np.array(labels.cpu().numpy(), dtype=int).tolist())

            triplet_result = self.selection_metrics.get_metric()

            print('Triplets-> ' + ', '.join([
                "%s: %.4f" % (name[0], value)
                for name, value in triplet_result.items()
                if not name.startswith("_")
            ]))

    def train(self):
        """Train for ``hyper.epoch_num`` epochs, checkpointing every epoch
        and evaluating every ``hyper.print_epoch`` epochs."""
        train_set = Selection_Dataset(self.hyper, self.hyper.train)
        loader = Selection_loader(train_set,
                                  batch_size=self.hyper.train_batch,
                                  pin_memory=True)

        for epoch in range(self.hyper.epoch_num):
            self.model.train()
            pbar = tqdm(enumerate(BackgroundGenerator(loader)),
                        total=len(loader))

            for batch_idx, sample in pbar:

                self.optimizer.zero_grad()
                output = self.model(sample, is_train=True)
                # Logits are moved to CPU so the loss is computed against
                # the (CPU) labels held by the sample.
                output = output.to('cpu')
                loss = self.criterion(output, sample.selection_id)

                loss.backward()
                self.optimizer.step()
                pbar.set_description(
                    self.description(epoch, self.hyper.epoch_num, loss))

            self.save_model(epoch)

            if epoch % self.hyper.print_epoch == 0:
                self.evaluation()
Example #10 (score: 0)
 def _init_model(self):
     """Instantiate the selection model and move it onto ``self.device``."""
     model = MultiHeadSelection(self.hyper)
     self.model = model.to(self.device)
Example #11 (score: 0)
class Runner(object):
    """CPU experiment driver that, besides computing triplet and NER F1,
    writes a competition-style submission CSV during evaluation."""

    def __init__(self, exp_name: str):
        self.exp_name = exp_name
        self.model_dir = 'saved_models'
        # All tensors are kept on CPU in this variant.
        self.device = torch.device('cpu')
        self.hyper = Hyper(os.path.join('experiments',
                                        self.exp_name + '.json'))

        self.gpu = self.hyper.gpu
        self.preprocessor = None
        self.triplet_metrics = F1_triplet()
        self.ner_metrics = F1_ner()
        self.optimizer = None
        self.model = None

    def _optimizer(self, name, model):
        """Return the optimizer selected by *name* for *model*.

        NOTE(review): all three optimizers are constructed eagerly on
        every call even though only one is returned.  Unknown names raise
        KeyError.
        """
        m = {
            'adam': Adam(model.parameters()),
            'sgd': SGD(model.parameters(), lr=0.5),
            'adamw': AdamW(model.parameters())
        }
        return m[name]

    def _init_model(self):
        """Instantiate the selection model on ``self.device`` (CPU)."""
        self.model = MultiHeadSelection(self.hyper).to(self.device)

    def preprocessing(self):
        """Build the experiment-specific preprocessor and generate the
        relation vocabulary, preprocessed data, token vocabulary and BIO
        vocabulary."""
        if self.exp_name == 'conll_selection_re':
            self.preprocessor = Conll_selection_preprocessing(self.hyper)
        elif self.exp_name == 'chinese_selection_re':
            self.preprocessor = Chinese_selection_preprocessing(self.hyper)
        elif self.exp_name == 'conll_bert_re':
            self.preprocessor = Conll_bert_preprocessing(self.hyper)
        elif self.exp_name == 'datafountain_selection_re':
            self.preprocessor = Datafountain_selection_preprocessing(
                self.hyper)

        self.preprocessor.gen_relation_vocab()
        self.preprocessor.gen_all_data()
        self.preprocessor.gen_vocab(min_freq=1)
        # for ner only
        self.preprocessor.gen_bio_vocab()

    def run(self, mode: str):
        """Dispatch on *mode*: 'preprocessing', 'train' or 'evaluation'.

        Raises ValueError for any other mode string.
        """
        if mode == 'preprocessing':
            self.preprocessing()
        elif mode == 'train':
            self._init_model()
            self.optimizer = self._optimizer(self.hyper.optimizer, self.model)
            self.train()
        elif mode == 'evaluation':
            self._init_model()
            self.load_model(epoch=self.hyper.evaluation_epoch)
            self.evaluation()
        else:
            raise ValueError('invalid mode')

    def load_model(self, epoch: int):
        """Load the checkpoint saved for *epoch* into ``self.model``."""
        self.model.load_state_dict(
            torch.load(
                os.path.join(self.model_dir,
                             self.exp_name + '_' + str(epoch))))

    def save_model(self, epoch: int):
        """Save model weights under ``<model_dir>/<exp_name>_<epoch>``."""
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)
        torch.save(
            self.model.state_dict(),
            os.path.join(self.model_dir, self.exp_name + '_' + str(epoch)))

    def evaluation(self):
        """Evaluate on the dev set, print triplet/NER F1, and write one
        CSV row per sentence to ./output/sub00.csv.

        Row format: ``<8001+id>,<predicate>,<subject>,<object>`` taken
        from the first predicted triplet; when no triplet was predicted,
        a fallback row is derived from the decoded BIO tags.
        NOTE(review): presumably 8001 is the first row id of the
        competition split — confirm against the submission spec.  The
        './output' directory must already exist or open() raises.
        """
        dev_set = Selection_Dataset(self.hyper, self.hyper.dev)
        loader = Selection_loader(dev_set,
                                  batch_size=self.hyper.eval_batch,
                                  pin_memory=True)
        self.triplet_metrics.reset()
        self.model.eval()

        pbar = tqdm(enumerate(BackgroundGenerator(loader)), total=len(loader))

        with torch.no_grad():
            with open('./output/sub00.csv', 'w') as file:
                # Running sentence counter across batches ('id' shadows
                # the builtin; kept unchanged here).
                id = 0
                for batch_ndx, sample in pbar:
                    # NOTE(review): tokens/selection_gold/bio_gold/
                    # spo_gold/bio_text are unused below; only text_list
                    # is read when building fallback rows.
                    tokens = sample.tokens_id.to(self.device)
                    selection_gold = sample.selection_id.to(self.device)
                    bio_gold = sample.bio_id.to(self.device)
                    text_list = sample.text
                    spo_gold = sample.spo_gold
                    bio_text = sample.bio

                    output = self.model(sample, is_train=False)

                    self.triplet_metrics(output['selection_triplets'],
                                         output['spo_gold'])
                    self.ner_metrics(output['gold_tags'],
                                     output['decoded_tag'])

                    for i in range(len(output['decoded_tag'])):
                        file.write(str(8001 + id) + ',')
                        if len(output['selection_triplets'][i]) != 0:
                            # Emit the first predicted triplet for this
                            # sentence.
                            file.write(output['selection_triplets'][i][0]
                                       ['predicate'] + ',')
                            file.write(
                                output['selection_triplets'][i][0]['subject'] +
                                ',')
                            file.write(
                                output['selection_triplets'][i][0]['object'] +
                                '\n')
                        else:
                            if output['decoded_tag'][i].count('B') < 2:
                                # Fewer than two tagged entities: nothing
                                # usable, emit placeholder columns.
                                file.write('Other' + ',' + 'Other' + ',' +
                                           'Other')
                            else:
                                # Fallback: use the last and first
                                # 'B'-tagged tokens as subject/object.
                                BIO = output['decoded_tag'][i]
                                tt = ''.join(reversed(BIO))
                                # index1 = first 'B'; index2 = last 'B'
                                # (found by scanning the reversed tags).
                                index1 = BIO.index('B')
                                index2 = len(tt) - tt.index('B') - 1
                                file.write('Other' + ',' +
                                           text_list[i][index2] + ',' +
                                           text_list[i][index1])
                            file.write('\n')
                        id += 1
                        # file.write('sentence {} BIO:\n'.format(i))
                        # for j in range(len(text_list[i])):
                        #     file.write(text_list[i][j]+' ')
                        # file.write('\n')
                        # file.writelines(bio_text[i])
                        # file.write('\n')
                        #
                        # file.writelines(output['decoded_tag'][i])
                        # file.write('\n')
                        # file.writelines(output['gold_tags'][i])
                        # file.write('\n')
                        # file.write('sentence {} relation:\n'.format(i))
                        # file.write('\n')
                        # if len(output['selection_triplets']) == 0:
                        #     file.write('empty')
                        # else:
                        #     file.writelines(str(output['selection_triplets'][i]))
                        # file.write('\n')
                        # file.writelines(str(output['spo_gold'][i]))
                        # file.write('\n')

            triplet_result = self.triplet_metrics.get_metric()
            ner_result = self.ner_metrics.get_metric()
            # print('triplet_result=', triplet_result)
            # print('ner_result=', ner_result)

            print('Triplets-> ' + ', '.join([
                "%s: %.4f" % (name[0], value)
                for name, value in triplet_result.items()
                if not name.startswith("_")
            ]) + ' ||' + 'NER->' + ', '.join([
                "%s: %.4f" % (name[0], value)
                for name, value in ner_result.items()
                if not name.startswith("_")
            ]))

    def train(self):
        """Resume from the ``epoch_num``-named checkpoint, then train for
        ``hyper.epoch_num`` epochs.

        NOTE(review): both the initial load and every save use
        ``self.hyper.epoch_num`` as the checkpoint name (see the #** and
        ## markers), so one file is loaded at start-up and overwritten
        each epoch — training fails if that checkpoint does not already
        exist.  This looks like a deliberate resume hack; confirm before
        changing.
        """
        #**
        self.load_model(epoch=self.hyper.epoch_num)
        # **
        train_set = Selection_Dataset(self.hyper, self.hyper.train)
        loader = Selection_loader(train_set,
                                  batch_size=self.hyper.train_batch,
                                  pin_memory=True)

        for epoch in range(self.hyper.epoch_num):
            self.model.train()
            pbar = tqdm(enumerate(BackgroundGenerator(loader)),
                        total=len(loader))

            for batch_idx, sample in pbar:
                # txt = sample.text
                # for j in range(len(txt)):
                #     print(txt[j])

                self.optimizer.zero_grad()
                output = self.model(sample, is_train=True)
                loss = output['loss']
                loss.backward()
                self.optimizer.step()

                pbar.set_description(output['description'](
                    epoch, self.hyper.epoch_num))
            ## self.hyper.epoch_num
            self.save_model(self.hyper.epoch_num)

            if epoch % self.hyper.print_epoch == 0 and epoch > 3:
                self.evaluation()