Example #1
    def create_model(self):
        # Skip the automatic [CLS]/[SEP] wrapping before building the model
        self.bert_embedding.processor.add_bos_eos = False
        model = BLSTMModel(embedding=self.bert_embedding)
        # Quick sanity check: one epoch of training plus a small prediction
        model.fit(valid_x, valid_y, epochs=1)
        res = model.predict(valid_x[:20])
        print(res)
        return model
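Example #1 assumes a prepared `self.bert_embedding` and flips `add_bos_eos` off before wiring it into the model. A minimal sketch of how such an embedding might be set up, following the import paths these snippets use; `bert_model_path` is a hypothetical path to an unpacked checkpoint such as chinese_L-12_H-768_A-12:

import kashgari
from kashgari.embeddings import BERTEmbedding
from kashgari.tasks.classification import BLSTMModel

# bert_model_path is an assumption: a local, unpacked Google BERT checkpoint
bert_embedding = BERTEmbedding(bert_model_path,
                               task=kashgari.CLASSIFICATION,
                               sequence_length=100)
# Skip the automatic [CLS]/[SEP] wrapping, as in the example above
bert_embedding.processor.add_bos_eos = False
model = BLSTMModel(embedding=bert_embedding)

The task= argument mirrors Example #6 below; older kashgari releases construct BERTEmbedding(path, sequence_length=...) without it.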
Example #2
    def train(self):
        x_items, train_y, valid_x, valid_y = self.read_message('car/train.csv')
        # Get BERT character-level embeddings. The embedding construction is
        # missing from this excerpt; the line below is an assumed
        # reconstruction mirroring Examples #8 and #9.
        bert = BERTEmbedding('bert-base-chinese', sequence_length=200)
        model = BLSTMModel(bert)
        # Pass in training data, labels, and the number of epochs
        model.fit(x_items,
                  train_y,
                  valid_x,
                  valid_y,
                  batch_size=64,
                  epochs=12,
                  callbacks=[tf_board_callback])
        # Run prediction on the test set and write the results (the model itself is saved below)
        file = pd.read_csv("car/test.csv", encoding='utf-8').values.tolist()
        test_data = []
        id_list = []
        for i in file:
            test_data.append(list(str(i[1]) + str(i[2])))
            id_list.append(i[0])
        predict_answers = model.predict(x_data=test_data)
        fout = open("data/test_predict_bert_car.csv", 'w', encoding='utf-8')
        fout.write("id,flag\n")
        for i, j in zip(id_list, predict_answers):
            i = str(i).strip()  # ids read from the CSV may be ints
            fout.write(str(i) + "," + str(j) + "\n")
        fout.close()
        model.save("../model/news-classification-bert-model")
Example #3
    def test_classification_eval_callback(self):
        train_x, train_y = SMP2018ECDTCorpus.load_data()
        test_x, test_y = SMP2018ECDTCorpus.load_data('test')

        train_x = train_x[:1000]
        train_y = train_y[:1000]
        model = BLSTMModel()
        eval_callback = callbacks.EvalCallBack(model, test_x, test_y, step=1)
        model.fit(train_x, train_y, callbacks=[eval_callback], epochs=1)
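callbacks.EvalCallBack here is kashgari's own helper that re-evaluates on held-out data every `step` epochs. For reference, a hypothetical stand-in with the same behavior can be written as an ordinary Keras callback, relying only on the evaluate(x, y) method the later examples also use:

from tensorflow.python import keras

class SimpleEvalCallback(keras.callbacks.Callback):
    # Hypothetical helper: evaluate a kashgari-style model every `step` epochs
    def __init__(self, kash_model, x, y, step=1):
        super(SimpleEvalCallback, self).__init__()
        self.kash_model = kash_model
        self.x, self.y, self.step = x, y, step

    def on_epoch_end(self, epoch, logs=None):
        if (epoch + 1) % self.step == 0:
            self.kash_model.evaluate(self.x, self.y)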
Example #4
    def train(self):
        # filepath = "saved-model-{epoch:02d}-{acc:.2f}.hdf5"
        # checkpoint_callback = ModelCheckpoint(filepath,
        #                                       monitor='acc',
        #                                       verbose=1)
        x_items, train_y = self.read_message('../data/yingyangshi/train.txt')
        x_dev, dev_y = self.read_message('../data/yingyangshi/dev.txt')
        # Get BERT character-level embeddings
        bert = BERTEmbedding('textclassfation/input0/chinese_L-12_H-768_A-12')
        model = BLSTMModel(bert)
        # model.build_multi_gpu_model(gpus=2)
        model.fit(x_items, train_y, x_dev, dev_y, epochs=2, batch_size=64)
        # Save the model
        model.save("../健康管理师单选分字BERT-model")
Example #5
    def test_word2vec_embedding(self):
        embedding = WordEmbeddings('sgns.weibo.bigram',
                                   sequence_length=30,
                                   limit=5000)
        self.prepare_model(embedding)
        self.model = BLSTMModel(embedding=embedding)
        self.model.fit(self.x_data,
                       self.y_data,
                       x_validate=self.x_eval,
                       y_validate=self.y_eval)
        sentence = list('语言学包含了几种分支领域。')
        logging.info(self.model.embedding.tokenize(sentence))
        logging.info(self.model.predict(sentence))
        self.assertTrue(isinstance(self.model.predict(sentence), str))
        self.assertTrue(isinstance(self.model.predict([sentence]), list))
Example #6
    def test_bert_model(self):
        embedding = BERTEmbedding(bert_path,
                                  task=kashgari.CLASSIFICATION,
                                  sequence_length=100)
        model = BLSTMModel(embedding=embedding)
        model.fit(valid_x, valid_y, epochs=1)
        res = model.predict(valid_x[:20])
        assert True

        model_path = os.path.join(tempfile.gettempdir(), str(time.time()))
        model.save(model_path)

        new_model = kashgari.utils.load_model(model_path)
        new_res = new_model.predict(valid_x[:20])
        assert np.array_equal(new_res, res)
Example #7
    def train(self):
        x_items, train_y = self.read_message('../data/Chinese medicine licensed pharmacist/train.txt')
        x_dev, dev_y = self.read_message('../data/Chinese medicine licensed pharmacist/dev.txt')
        # No BERT embedding in this variant; the model uses its default embedding
        model = BLSTMModel()
        # Pass in training data, labels, and the number of epochs
        model.fit(x_items,
                  train_y,
                  x_dev,
                  dev_y,
                  batch_size=32,
                  epochs=20,
                  fit_kwargs={'callbacks': [tf_board_callback]})
        # Save the model
        model.save("../model/中医执业药师char-model")
Example #8
    def train(self):
        x_xiyao, xiyao_y = self.read_message('../data/西药执业药师/train.txt')

        x_dev, dev_y = self.read_message('../data/西药执业药师/dev.txt')
        # Get BERT character-level embeddings
        bert = BERTEmbedding('bert-base-chinese', sequence_length=200)
        model = BLSTMModel(bert)
        # Pass in training data, labels, and the number of epochs
        model.fit(x_xiyao,
                  xiyao_y,
                  x_dev,
                  dev_y,
                  epochs=8,
                  batch_size=256,
                  fit_kwargs={'callbacks': [tf_board_callback]})
        # Save the model
        model.save("../西药执业药师-model")
Example #9
    def train(self):
        x_items, train_y = self.read_message('../data/健康管理师分类数据集/train.txt')
        x_xiyao, xiyao_y = self.read_message('../data/西药执业药师/train.txt')
        x_yingyangshi, yingyangshi_y = self.read_message(
            '../data/yingyangshi/train.txt')
        x_items.extend(x_xiyao)
        train_y.extend(xiyao_y)
        x_items.extend(x_yingyangshi)
        train_y.extend(yingyangshi_y)

        x_dev, dev_y = self.read_message('../data/健康管理师分类数据集/valid.txt')
        # Get BERT character-level embeddings
        bert = BERTEmbedding('bert-base-chinese', sequence_length=200)
        model = BLSTMModel(bert)
        # Pass in training data, labels, and the number of epochs
        model.fit(x_items,
                  train_y,
                  x_dev,
                  dev_y,
                  epochs=8,
                  batch_size=128,
                  fit_kwargs={'callbacks': [tf_board_callback]})
        # Save the model
        model.save("../健康管理师分字BERT-model")
Example #10
log_filepath = r"D:\data\biendata\ccks2019_el\clf_log"

early_stop = EarlyStopping(monitor="val_loss", mode="min", patience=2)
# early_stop = EarlyStopping(monitor="val_acc", mode="max", patience=2)

log = TensorBoard(log_dir=log_filepath,
                  write_images=False,
                  write_graph=True,
                  histogram_freq=0)

# BERT alternative, commented out: the original assigned it and then
# immediately overwrote it with the word2vec embedding below
# emn_path = r'D:\data\bert\chinese_L-12_H-768_A-12'
# embedding = BERTEmbedding(emn_path, sequence_length=1024)
emn_path = r'D:/data/word2vec/zh/sgns.target.word-word.dynwin5.thr10.neg5.dim300.iter5/sgns.target.word-word.dynwin5.thr10.neg5.dim300.iter5.utf8.txt'
embedding = WordEmbeddings(emn_path, sequence_length=1024)

# model = DropoutBGRUModel(embedding)
model = BLSTMModel(embedding)

model.fit(train_x[:100000],
          train_y[:100000],
          x_validate=validate_x[:20000],
          y_validate=validate_y[:20000],
          epochs=20,
          batch_size=256,
          labels_weight=True,
          fit_kwargs={'callbacks': [early_stop, log]})

model.evaluate(test_x, test_y)

model.save(model_path)
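One caveat in Example #10: standard BERT-base checkpoints ship with only 512 position embeddings, so sequence_length=1024 is only viable for the word2vec embedding. A small sketch that keeps the choice in one place (paths reuse the ones above; the helper name is hypothetical):

def build_embedding(kind='word2vec', sequence_length=1024):
    # BERT-base cannot exceed 512 positions, so cap the length there
    if kind == 'bert':
        return BERTEmbedding(r'D:\data\bert\chinese_L-12_H-768_A-12',
                             sequence_length=min(sequence_length, 512))
    return WordEmbeddings(emn_path, sequence_length=sequence_length)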
Example #11
    @classmethod
    def setUpClass(cls):
        cls.epochs = 3
        cls.model = BLSTMModel()
Example #12
    @classmethod
    def setUpClass(cls):
        cls.epochs = 3
        embedding = EmbeddingManager.get_w2v()
        cls.model = BLSTMModel(embedding)
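EmbeddingManager.get_w2v() is not shown in these excerpts; caching one loaded embedding across test classes is the evident intent. A hypothetical sketch (path and parameters mirror Example #5 and are assumptions):

from kashgari.embeddings import WordEmbeddings

class EmbeddingManager:
    _w2v = None  # cache so tests load the vectors only once

    @classmethod
    def get_w2v(cls):
        if cls._w2v is None:
            cls._w2v = WordEmbeddings('sgns.weibo.bigram',
                                      sequence_length=30,
                                      limit=5000)
        return cls._w2v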
Example #13
def predict_each_line(args, model):
    import codecs
    fout = codecs.open(args.output_file, 'w', encoding='utf-8')
    test_x, test_y = fetch_data_set(args.test_set_path)
    for line, y in zip(test_x, test_y):
        result = model.predict(text_processor(''.join(line)), batch_size=1, debug_info=False)
        if result != ''.join(y):
            # Record only the misclassified lines: text, gold label, prediction
            str_message = ''.join(line) + "\t" + ''.join(y) + "\t" + result
            print(str_message)
            fout.write(str_message + '\n')
    fout.close()

if __name__ == '__main__':
    # initialize parameter
    args = params_setup()
    logging.basicConfig(filename=args.log_path, level=logging.DEBUG)

    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_id

    bert_embedding = BERTEmbedding('bert-base-chinese', sequence_length=30)

    model = BLSTMModel(bert_embedding)
    model = model.load_model(args.model_path)

    if args.predict_mode == "from_input":
        predict_from_user_input(model)
    else:
        predict_from_test_set(args, model)
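predict_from_user_input is referenced above but not included in the excerpt. A hypothetical sketch, using the same character-level input the rest of Example #13 assumes:

def predict_from_user_input(model):
    # Read lines from stdin and classify them until an empty line is entered
    while True:
        text = input('Enter text (empty line to quit): ').strip()
        if not text:
            break
        print(model.predict(list(text)))  # char-level tokens, an assumption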