import numpy as np
import paddlehub as hub

def run_predict(data):
	# MyDataset, MyClassifyReader and GRUTextClassifierTask are project-specific classes.
	ds = MyDataset()
	module = hub.Module(name="ernie", version="1.1.0")
	inputs, outputs, program = module.context(max_seq_len=512)
	# Token-level ERNIE output; the GRU classifier task consumes the full sequence.
	sequence_output = outputs["sequence_output"]

	reader = MyClassifyReader(
		dataset=ds,
		vocab_path=module.get_vocab_path(),
		max_seq_len=512)

	strategy = hub.AdamWeightDecayStrategy(
		learning_rate=5e-5,
		lr_scheduler="linear_decay",
		warmup_proportion=0.1,
		weight_decay=0.01,
		optimizer_name="adam")
	config = hub.RunConfig(
		use_cuda=False,
		enable_memory_optim=True,
		num_epoch=3,
		batch_size=16,
		strategy=strategy,
		checkpoint_dir="./models/Product")

	feed_list=[
		inputs["input_ids"].name,inputs["position_ids"].name,
		inputs["segment_ids"].name,inputs["input_mask"].name,
	]


	cls_task = GRUTextClassifierTask(
		data_reader=reader,
		feature=sequence_output,
		feed_list=feed_list,
		num_classes=ds.num_labels,
		config=config)
	# Mapping from class index to funding-round label (kept for reference; unused below).
	label_map = {3: '天使轮', 1: 'B轮', 4: '战略融资', 0: 'A轮', 2: 'C轮'}
	predictions=[]
	index = 0
	run_states = cls_task.predict(data=data)
	results = [run_state.run_results for run_state in run_states]
	for batch_result in results:
		batch_result = np.argmax(batch_result, axis=2)[0]
		for result in batch_result:
			print(result)
			predictions.append(result)
			index += 1

	prob = []
	score = 0
	for batch_result in results:
		for single_result in batch_result[0]:
			print("=====")
			print(single_result)
			prob.append(single_result)
			# Weighted average of the 5 class probabilities (weights 1..5),
			# rescaled to a 0-100 score.
			score = (1 * single_result[0] + 2 * single_result[1] +
					3 * single_result[2] + 4 * single_result[3] +
					5 * single_result[4]) / 15 * 100
	# Note: only the score of the last predicted example is returned.
	return score
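
# A minimal, hedged usage sketch (not part of the original source). The nested
# single-element lists follow the predict data format used by the other examples
# on this page; the texts below are placeholders.
if __name__ == "__main__":
	sample_data = [["placeholder text 1"], ["placeholder text 2"]]
	print(run_predict(sample_data))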
Example #2
    def _initialize(self, use_gpu=False):
        # Load Paddlehub ERNIE Tiny pretrained model
        self.module = hub.Module(name="ernie_tiny")
        inputs, outputs, program = self.module.context(trainable=True,
                                                       max_seq_len=128)

        # Download the dataset and get its label list and label count
        # If you only need the label information, you can omit the tokenizer parameter to avoid preprocessing the training set.
        dataset = hub.dataset.Couplet()
        self.label_list = dataset.get_labels()

        # Setup RunConfig for PaddleHub Fine-tune API
        config = hub.RunConfig(use_data_parallel=False,
                               use_cuda=use_gpu,
                               batch_size=1,
                               checkpoint_dir=os.path.join(
                                   self.directory, "assets", "ckpt"),
                               strategy=hub.AdamWeightDecayStrategy())

        # Construct transfer learning network
        # Use "pooled_output" for classification tasks on an entire sentence.
        # Use "sequence_output" for token-level output.
        pooled_output = outputs["pooled_output"]
        sequence_output = outputs["sequence_output"]

        # Define a text generation fine-tune task by PaddleHub's API
        self.gen_task = hub.TextGenerationTask(feature=pooled_output,
                                               token_feature=sequence_output,
                                               max_seq_len=128,
                                               num_classes=dataset.num_labels,
                                               config=config,
                                               metrics_choices=["bleu"])
Example #3
    def _initialize(self,
                    ckpt_dir="ckpt_chnsenticorp",
                    num_class=2,
                    max_seq_len=128,
                    use_gpu=False,
                    batch_size=1):
        self.ckpt_dir = os.path.join(self.directory, ckpt_dir)
        self.num_class = num_class
        self.MAX_SEQ_LEN = max_seq_len

        # Load Paddlehub ERNIE Tiny pretrained model
        self.module = hub.Module(name="ernie_tiny")
        inputs, outputs, program = self.module.context(trainable=True,
                                                       max_seq_len=max_seq_len)

        self.vocab_path = self.module.get_vocab_path()

        # Download the dataset and use accuracy as the metric
        # Available datasets: GLUE/XNLI/ChineseGLUE/NLPCC-DBQA/LCQMC
        # The metric can be acc, f1 or matthews
        metrics_choices = ["acc"]

        # ernie_tiny uses sub-words to tokenize Chinese sentences
        # For models other than ernie_tiny, sp_model_path and word_dict_path should be set to None
        reader = hub.reader.ClassifyReader(
            vocab_path=self.module.get_vocab_path(),
            max_seq_len=max_seq_len,
            sp_model_path=self.module.get_spm_path(),
            word_dict_path=self.module.get_word_dict_path())

        # Construct transfer learning network
        # Use "pooled_output" for classification tasks on an entire sentence.
        # Use "sequence_output" for token-level output.
        pooled_output = outputs["pooled_output"]

        # Set up the feed list for the data feeder
        # All tensors required by the module must be fed
        feed_list = [
            inputs["input_ids"].name,
            inputs["position_ids"].name,
            inputs["segment_ids"].name,
            inputs["input_mask"].name,
        ]

        # Set up the running config for the PaddleHub Finetune API
        config = hub.RunConfig(use_data_parallel=False,
                               use_cuda=use_gpu,
                               batch_size=batch_size,
                               checkpoint_dir=self.ckpt_dir,
                               strategy=hub.AdamWeightDecayStrategy())

        # Define a classification finetune task by PaddleHub's API
        self.cls_task = hub.TextClassifierTask(data_reader=reader,
                                               feature=pooled_output,
                                               feed_list=feed_list,
                                               num_classes=self.num_class,
                                               config=config,
                                               metrics_choices=metrics_choices)
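
    # A hedged sketch (not in the original source): once _initialize() has run,
    # the task above can be used for prediction in the same way as the predict()
    # calls in the other examples on this page; predict_texts is an assumed name.
    def predict_texts(self, texts):
        data = [[t] for t in texts]
        return self.cls_task.predict(data=data, return_result=True)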
Example #4
def predict_tag(model_name, data):
    checkpoint_dir = "model/" + model_name
    dataset_dir = "data/" + model_name
    # Load Paddlehub ERNIE Tiny pretrained model
    module = hub.Module(name="ernie_tiny")
    inputs, outputs, program = module.context(trainable=True,
                                              max_seq_len=128)
    # Download the dataset and use accuracy as the metric
    # Available datasets: GLUE/XNLI/ChineseGLUE/NLPCC-DBQA/LCQMC
    dataset = ViolateDataset(dataset_dir=dataset_dir)
    # ernie_tiny uses sub-words to tokenize Chinese sentences
    # For models other than ernie_tiny, sp_model_path and word_dict_path should be set to None
    reader = hub.reader.ClassifyReader(
        dataset=dataset,
        vocab_path=module.get_vocab_path(),
        max_seq_len=128,
        sp_model_path=module.get_spm_path(),
        word_dict_path=module.get_word_dict_path())

    # Construct transfer learning network
    # Use "pooled_output" for classification tasks on an entire sentence.
    # Use "sequence_output" for token-level output.
    pooled_output = outputs["pooled_output"]

    # Set up the feed list for the data feeder
    # All tensors required by the module must be fed
    feed_list = [
        inputs["input_ids"].name,
        inputs["position_ids"].name,
        inputs["segment_ids"].name,
        inputs["input_mask"].name,
    ]

    # Set up the running config for the PaddleHub Finetune API
    config = hub.RunConfig(use_data_parallel=True,
                           use_cuda=False,
                           batch_size=24,
                           checkpoint_dir=checkpoint_dir,
                           strategy=hub.AdamWeightDecayStrategy())

    # Define a classification finetune task by PaddleHub's API
    cls_task = hub.TextClassifierTask(data_reader=reader,
                                      feature=pooled_output,
                                      feed_list=feed_list,
                                      num_classes=dataset.num_labels,
                                      config=config)

    # Data to be predicted
    # data = [["有保障"],
    #         ["无风险"],
    #         ["基金过往数据并不代表未来趋势"],
    #         ["为什么"],
    #         ["周杰伦"],
    #         ["吴东瀛"],
    #         ]
    # print(cls_task.predict(data=data, return_result=True))
    return cls_task.predict(data=data, return_result=True)
Example #5
def train(train_i, args):
    dataset = MyDataset()
    module = hub.Module(name=args.model)
    reader = hub.reader.MultiLabelClassifyReader(
        dataset=dataset,
        vocab_path=module.get_vocab_path(),
        max_seq_len=args.max_seq_len)

    strategy = hub.AdamWeightDecayStrategy(
        weight_decay=args.weight_decay,
        warmup_proportion=args.warmup_proportion,
        lr_scheduler=args.lr_scheduler,
        learning_rate=args.learning_rate)
    config = hub.RunConfig(use_cuda=args.use_gpu,
                           num_epoch=args.num_epoch,
                           checkpoint_dir=args.checkpoint_dir + str(train_i),
                           batch_size=args.batch_size,
                           eval_interval=eval_interval,
                           log_interval=log_interval,
                           strategy=strategy)
    inputs, outputs, program = module.context(trainable=True,
                                              max_seq_len=args.max_seq_len)

    # Use "pooled_output" for classification tasks on an entire sentence.
    pooled_output = outputs["pooled_output"]

    # The order of tensors in feed_list must not be changed
    feed_list = [
        inputs["input_ids"].name,
        inputs["position_ids"].name,
        inputs["segment_ids"].name,
        inputs["input_mask"].name,
    ]

    cls_task = hub.MultiLabelClassifierTask(data_reader=reader,
                                            feature=pooled_output,
                                            feed_list=feed_list,
                                            num_classes=dataset.num_labels,
                                            config=config)
    cls_task.main_program.random_seed = args.seed
    change_task(cls_task, train_i)
    return cls_task, reader
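
# A hedged usage sketch (not part of the original source): build the multi-label
# task and fine-tune it. finetune_and_eval() is the same call used by the other
# examples here; `args` is assumed to be this script's parsed command-line args.
cls_task, reader = train(0, args)
cls_task.finetune_and_eval()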
Example #6
def get_task(args, schema_labels, id):
    # Load the PaddleHub pretrained model (ERNIE Tiny / RoBERTa large)
    # More pretrained models: https://www.paddlepaddle.org.cn/hublist?filter=en_category&value=SemanticModel
    # model_name = "ernie_tiny"
    model_name = args.model_name
    module = hub.Module(name=model_name)
    inputs, outputs, program = module.context(trainable=True,
                                              max_seq_len=args.max_seq_len)

    # Load the data and read it with SequenceLabelReader
    dataset = EEDataset(args.data_dir, schema_labels, model=args.do_model)
    reader = SequenceLabelReader(dataset=dataset,
                                 vocab_path=module.get_vocab_path(),
                                 max_seq_len=args.max_seq_len,
                                 sp_model_path=module.get_spm_path(),
                                 word_dict_path=module.get_word_dict_path())

    # Build the transfer network for the sequence labeling task
    # Use ERNIE's character-level output sequence_output as the input of the transfer network
    sequence_output = outputs["sequence_output"]
    # sequence_output  = fluid.layers.dropout(
    #     x=sequence_output ,
    #     dropout_prob=args.dropout,
    #     dropout_implementation="upscale_in_train")

    # Set feed_list, the input variables required by the model program
    # The order below must not be changed
    feed_list = [
        inputs["input_ids"].name, inputs["position_ids"].name,
        inputs["segment_ids"].name, inputs["input_mask"].name
    ]

    # Select the optimization strategy
    strategy = hub.AdamWeightDecayStrategy(
        warmup_proportion=args.warmup_proportion,
        weight_decay=args.weight_decay,
        learning_rate=args.learning_rate)

    # Configure the run settings
    config = hub.RunConfig(
        log_interval=100,
        eval_interval=args.eval_step,
        save_ckpt_interval=args.model_save_step,
        use_data_parallel=args.use_data_parallel,
        use_cuda=args.use_gpu,
        # enable_memory_optim=True,
        num_epoch=args.num_epoch,
        batch_size=args.batch_size,
        checkpoint_dir=args.checkpoint_dir,
        strategy=strategy)

    # Build the sequence labeling transfer task
    seq_label_task = SequenceLabelTask(data_reader=reader,
                                       feature=sequence_output,
                                       feed_list=feed_list,
                                       max_seq_len=args.max_seq_len,
                                       num_classes=dataset.num_labels,
                                       config=config,
                                       add_crf=args.add_crf)
    seq_label_task.main_program.random_seed = args.random_seed
    add_hook(args, seq_label_task, id)
    return seq_label_task, reader
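
# A hedged usage sketch (not part of the original source): build the sequence
# labeling task for one model id and fine-tune it; finetune_and_eval() mirrors
# the other examples, and `args`/`schema_labels` are assumed to be prepared by
# the calling script.
seq_label_task, reader = get_task(args, schema_labels, 0)
seq_label_task.finetune_and_eval()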
Example #7
    # Add elmo embedding
    input_feature = fluid.layers.concat(input=[elmo_embedding, word_embedding],
                                        axis=1)

    # Choose the network you would like: bow, cnn, gru, bilstm, lstm
    # We recommend gru_net
    fc = gru_net(program, input_feature)

    # Set up the feed list for the data feeder
    # All tensors required by the senta module must be fed
    feed_list = [word_ids.name]

    # Step4: Select finetune strategy, setup config and finetune
    strategy = hub.AdamWeightDecayStrategy(
        weight_decay=args.weight_decay,
        learning_rate=args.learning_rate,
        lr_scheduler="linear_decay",
        warmup_proportion=args.warmup_proportion)

    # Step5: Set up the running config for the PaddleHub Finetune API
    config = hub.RunConfig(use_cuda=args.use_gpu,
                           use_data_parallel=True,
                           num_epoch=args.num_epoch,
                           batch_size=args.batch_size,
                           checkpoint_dir=args.checkpoint_dir,
                           strategy=strategy)

    # Step6: Define a classification finetune task by PaddleHub's API
    elmo_task = hub.TextClassifierTask(data_reader=reader,
                                       feature=fc,
                                       feed_list=feed_list,
Example #8
                '12', '13'
            ])


dataset = ThuNews()

module = hub.Module(name="roberta_wwm_ext_chinese_L-24_H-1024_A-16")
reader = hub.reader.ClassifyReader(dataset=dataset,
                                   vocab_path=module.get_vocab_path(),
                                   sp_model_path=module.get_spm_path(),
                                   word_dict_path=module.get_word_dict_path(),
                                   max_seq_len=128)

strategy = hub.AdamWeightDecayStrategy(
    weight_decay=0.01,
    warmup_proportion=0.1,
    # learning_rate=5e-5,
    lr_scheduler="linear_decay",
    learning_rate=5e-5)

config = hub.RunConfig(use_cuda=True,
                       use_data_parallel=True,
                       num_epoch=1,
                       checkpoint_dir="module",
                       batch_size=20,
                       eval_interval=400,
                       strategy=strategy)

inputs, outputs, program = module.context(trainable=True, max_seq_len=128)

# Use "pooled_output" for classification tasks on an entire sentence.
pooled_output = outputs["pooled_output"]
Example #9
    # Set up the feed list for the data feeder
    # All tensors required by the ERNIE module must be fed
    feed_list = [
        inputs["input_ids"].name,
        inputs["position_ids"].name,
        inputs["segment_ids"].name,
        inputs["input_mask"].name,
    ]

    if args.use_taskid:
        feed_list.append(inputs["task_ids"].name)

    # Select finetune strategy, setup config and finetune
    strategy = hub.AdamWeightDecayStrategy(weight_decay=args.weight_decay,
                                           learning_rate=args.learning_rate,
                                           lr_scheduler="linear_decay")

    # Set up the running config for the PaddleHub Finetune API
    config = hub.RunConfig(use_data_parallel=args.use_data_parallel,
                           use_pyreader=args.use_pyreader,
                           use_cuda=args.use_gpu,
                           num_epoch=args.num_epoch,
                           batch_size=args.batch_size,
                           checkpoint_dir=args.checkpoint_dir,
                           strategy=strategy)

    # Define a regression finetune task by PaddleHub's API
    reg_task = hub.RegressionTask(data_reader=reader,
                                  feature=pooled_output,
                                  feed_list=feed_list,
Example #10
            spm_path=module.get_spm_path(),
            word_dict_path=module.get_word_dict_path())
    else:
        tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())

    dataset = hub.dataset.ChnSentiCorp(
        tokenizer=tokenizer, max_seq_len=args.max_seq_len)

    # Construct transfer learning network
    # Use "pooled_output" for classification tasks on an entire sentence.
    # Use "sequence_output" for token-level output.
    pooled_output = outputs["pooled_output"]

    # Select fine-tune strategy, setup config and fine-tune
    strategy = hub.AdamWeightDecayStrategy(
        warmup_proportion=args.warmup_proportion,
        weight_decay=args.weight_decay,
        learning_rate=args.learning_rate)

    # Setup RunConfig for PaddleHub Fine-tune API
    config = hub.RunConfig(
        use_data_parallel=args.use_data_parallel,
        use_cuda=args.use_gpu,
        num_epoch=args.num_epoch,
        batch_size=args.batch_size,
        checkpoint_dir=args.checkpoint_dir,
        strategy=strategy)

    # Define a classification fine-tune task by PaddleHub's API
    cls_task = hub.TextClassifierTask(
        dataset=dataset,
        feature=pooled_output,
def one(id, train_i, args):
    # Load the PaddleHub ERNIE pretrained model
    module = hub.Module(name=args.model)

    # The ERNIE pretrained model's input variables (inputs), output variables (outputs) and program
    inputs, outputs, program = module.context(trainable=True,
                                              max_seq_len=args.max_seq_len)

    # Load the competition dataset and read it with ReadingComprehensionReader
    dataset = DuReader(id)
    reader = hub.reader.ReadingComprehensionReader(
        dataset=dataset,
        vocab_path=module.get_vocab_path(),
        max_seq_len=args.max_seq_len,
        doc_stride=128,
        max_query_length=args.max_que_len)

    # Take ERNIE's character-level pretrained output
    seq_output = outputs["sequence_output"]

    # Set the feed_list required to run the program
    feed_list = [
        inputs["input_ids"].name,
        inputs["position_ids"].name,
        inputs["segment_ids"].name,
        inputs["input_mask"].name,
    ]

    # Select the fine-tune optimization strategy
    strategy = hub.AdamWeightDecayStrategy(
        weight_decay=args.weight_decay,
        learning_rate=args.learning_rate,
        warmup_proportion=args.warmup_proportion)

    # Set the run configuration
    config = hub.RunConfig(eval_interval=200,
                           use_pyreader=False,
                           use_data_parallel=args.use_data_parallel,
                           use_cuda=args.use_gpu,
                           num_epoch=args.num_epoch,
                           batch_size=args.batch_size,
                           checkpoint_dir=args.checkpoint_dir + str(id),
                           strategy=strategy)

    # Define the reading comprehension fine-tune task
    # Since the competition dataset format is close to the cmrc2018 dataset,
    # sub_task should be cmrc2018 here; otherwise the run may fail
    reading_comprehension_task = hub.ReadingComprehensionTask(
        data_reader=reader,
        feature=seq_output,
        feed_list=feed_list,
        config=config,
        sub_task="cmrc2018",
    )
    reading_comprehension_task.main_program.random_seed = args.seed
    change_task(reading_comprehension_task, id)
    # Call the finetune_and_eval API, which automatically trains, evaluates and saves the best model
    reading_comprehension_task.finetune_and_eval()

    # Part of the competition test set is used for prediction
    data = dataset.predict_examples
    # Call the predict API with return_result=True to get the predictions directly
    all_prediction = reading_comprehension_task.predict(data=data,
                                                        return_result=True)
    # Write the prediction results
    json.dump(all_prediction,
              open('./work/result/submit{}_{}.json'.format(train_i, id), 'w'),
              ensure_ascii=False)
    value = [id, reading_comprehension_task.best_score] + list(
        args.__dict__.values())
    value = [str(x) for x in value]
    with open('./work/log/MRC_log.txt', 'a', encoding='utf-8') as f:
        f.write(','.join(value) + ',-\n')
    return reading_comprehension_task.best_score, value[2:]
@time: 2020/6/5 1:01
@desc: sentiment classification
'''
import paddlehub as hub

if __name__ == '__main__':
	module = hub.Module(name="ernie")
	dataset = hub.dataset.ChnSentiCorp()

	reader = hub.reader.ClassifyReader(
		dataset=dataset,
		vocab_path=module.get_vocab_path(),
		max_seq_len=128)

	strategy = hub.AdamWeightDecayStrategy(
		weight_decay=0.01,
		warmup_proportion=0.1,
		learning_rate=5e-5)

	config = hub.RunConfig(
		use_cuda=False,
		num_epoch=1,
		checkpoint_dir="ernie_txt_cls_turtorial_demo",
		batch_size=100,
		eval_interval=50,
		strategy=strategy)

	inputs, outputs, program = module.context(
		trainable=True, max_seq_len=128)

	# Use "pooled_output" for classification tasks on an entire sentence.
	pooled_output = outputs["pooled_output"]
Example #13
inputs, outputs, program = module.context(trainable="True", max_seq_len=128)

pooled_output = outputs["pooled_output"]
sequence_output = outputs["sequence_output"]

ds = hub.dataset.ChnSentiCorp()
reader = hub.reader.ClassifyReader(dataset=ds,
                                   vocab_path=module.get_vocab_path(),
                                   max_seq_len=128)

# Print a few training examples from the dataset loaded above
for e in ds.get_train_examples():
    print(e.text_a, e.label)

strategy = hub.AdamWeightDecayStrategy(learning_rate=1e-4,
                                       lr_scheduler="linear_decay",
                                       warmup_proportion=0.0,
                                       weight_decay=0.01)

config = hub.RunConfig(use_cuda=False,
                       num_epoch=3,
                       batch_size=32,
                       strategy=strategy)
feed_list = [
    inputs["input_ids"].name, inputs["position_ids"].name,
    inputs["segment_ids"].name, inputs["input_mask"].name
]

cls_task = hub.TextClassifierTask(data_reader=reader,
                                  feature=pooled_output,
                                  feed_list=feed_list,
                                  num_classes=ds.num_labels,
Example #14
# Define the dataset
dataset = TextClassification(dataset_dir)

# Define the data reader
reader = hub.reader.ClassifyReader(dataset=dataset,
                                   vocab_path=module.get_vocab_path(),
                                   max_seq_len=args.max_seq_len,
                                   do_lower_case=True,
                                   sp_model_path=module.get_spm_path(),
                                   word_dict_path=module.get_word_dict_path())

# Set the optimization strategy
strategy = hub.AdamWeightDecayStrategy(
    learning_rate=args.learning_rate,
    lr_scheduler="linear_decay",
    warmup_proportion=args.warmup_proportion,
    weight_decay=args.weight_decay,
    optimizer_name="adam")

# Set the training parameters
config = hub.RunConfig(log_interval=20,
                       eval_interval=500,
                       use_pyreader=True,
                       use_data_parallel=True,
                       save_ckpt_interval=1000,
                       use_cuda=True,
                       checkpoint_dir="%s_TextClassification" % dataset_name,
                       num_epoch=args.num_epoch,
                       batch_size=args.batch_size,
                       strategy=strategy)
    reader = hub.reader.MultiLabelClassifyReader(
        dataset=dataset,
        vocab_path=module.get_vocab_path(),
        max_seq_len=max_seq_len,
        use_task_id=False)

    metrics_choices = ['acc', 'f1']
    # Construct transfer learning network
    # Use "pooled_output" for classification tasks on an entire sentence.

    # Optimizer settings
    # Select finetune strategy, set up config and finetune
    strategy = hub.AdamWeightDecayStrategy(
        weight_decay=weight_decay,
        learning_rate=learning_rate,
        warmup_proportion=warmup_proportion,
        lr_scheduler="linear_decay")

    # Set up the running config for the PaddleHub Finetune API
    config = hub.RunConfig(
        use_cuda=use_gpu,
        num_epoch=num_epoch,
        batch_size=batch_size,
        checkpoint_dir=checkpoint_dir,
        strategy=strategy)

    # Run the model
    pooled_output = outputs["pooled_output"]
    feed_list = [
        inputs["input_ids"].name,
Example #16
File: SelfDataset.py  Project: wshzd/Paddle


import paddlehub as hub
module = hub.Module(name="ernie", version="1.0.2")
dataset = DemoDataset()


reader = hub.reader.ClassifyReader(
    dataset=dataset,
    vocab_path=module.get_vocab_path(),
    max_seq_len=128)

strategy = hub.AdamWeightDecayStrategy(
    weight_decay=0.01,
    warmup_proportion=0.1,
    learning_rate=1e-5,
    lr_scheduler="linear_decay",
    optimizer_name="adam")

config = hub.RunConfig(
    # Whether to use GPU
    use_cuda=True,
    num_epoch=50,
    # Directory where the model checkpoints are saved
    checkpoint_dir="ernie_turtorial_demo",
    batch_size=64,
    log_interval=10,
    eval_interval=500,
    strategy=strategy)
inputs, outputs, program = module.context(
    trainable=True, max_seq_len=128)
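
# The snippet ends after module.context(). Below is a hedged sketch (not part
# of the original source) of the remaining steps, mirroring the pooled_output /
# feed_list / TextClassifierTask pattern used throughout this page.
pooled_output = outputs["pooled_output"]
feed_list = [
    inputs["input_ids"].name, inputs["position_ids"].name,
    inputs["segment_ids"].name, inputs["input_mask"].name
]
cls_task = hub.TextClassifierTask(data_reader=reader,
                                  feature=pooled_output,
                                  feed_list=feed_list,
                                  num_classes=dataset.num_labels,
                                  config=config)
cls_task.finetune_and_eval()
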
def main():
    # Load Paddlehub pretrained model
    # More pretrained models: https://www.paddlepaddle.org.cn/hublist?filter=en_category&value=SemanticModel
    #model_name = "ernie_tiny"
    model_name = "chinese-roberta-wwm-ext-large"
    module = hub.Module(name=model_name)
    inputs, outputs, program = module.context(trainable=True,
                                              max_seq_len=args.max_seq_len)

    # Download dataset and use SequenceLabelReader to read dataset
    dataset = EEDataset(args.data_dir, schema_labels, model=args.do_model)
    reader = hub.reader.SequenceLabelReader(
        dataset=dataset,
        vocab_path=module.get_vocab_path(),
        max_seq_len=args.max_seq_len,
        sp_model_path=module.get_spm_path(),
        word_dict_path=module.get_word_dict_path())

    # Construct transfer learning network
    # Use "sequence_output" for token-level output.
    sequence_output = outputs["sequence_output"]

    # Set up the feed list for the data feeder
    # All tensors required by the module must be fed
    feed_list = [
        inputs["input_ids"].name, inputs["position_ids"].name,
        inputs["segment_ids"].name, inputs["input_mask"].name
    ]

    # Select a finetune strategy
    strategy = hub.AdamWeightDecayStrategy(
        warmup_proportion=args.warmup_proportion,
        weight_decay=args.weight_decay,
        learning_rate=args.learning_rate)

    # Set up the running config for the PaddleHub Finetune API
    config = hub.RunConfig(eval_interval=args.eval_step,
                           save_ckpt_interval=args.model_save_step,
                           use_data_parallel=args.use_data_parallel,
                           use_cuda=args.use_gpu,
                           num_epoch=args.num_epoch,
                           batch_size=args.batch_size,
                           checkpoint_dir=args.checkpoint_dir,
                           strategy=strategy)

    # Define a sequence labeling finetune task by PaddleHub's API
    # If add crf, the network use crf as decoder
    seq_label_task = hub.SequenceLabelTask(data_reader=reader,
                                           feature=sequence_output,
                                           feed_list=feed_list,
                                           max_seq_len=args.max_seq_len,
                                           num_classes=dataset.num_labels,
                                           config=config,
                                           add_crf=args.add_crf)

    # Finetune and evaluate model by PaddleHub's API
    # will finish training, evaluation, testing, save model automatically
    if args.do_train:
        print("start finetune and eval process")
        seq_label_task.finetune_and_eval()

    if args.do_predict:
        print("start predict process")
        ret = []
        id2label = {val: key for key, val in reader.label_map.items()}
        input_data = [[d] for d in predict_data]
        run_states = seq_label_task.predict(data=input_data[1:])
        results = []
        for batch_states in run_states:
            batch_results = batch_states.run_results
            batch_infers = batch_results[0].reshape([-1]).astype(
                np.int32).tolist()
            seq_lens = batch_results[1].reshape([-1]).astype(np.int32).tolist()
            current_id = 0
            for length in seq_lens:
                seq_infers = batch_infers[current_id:current_id + length]
                seq_result = list(map(id2label.get, seq_infers[1:-1]))
                current_id += length if args.add_crf else args.max_seq_len
                results.append(seq_result)

        ret = []
        for sent, r_label in zip(predict_sents, results):
            sent["labels"] = r_label
            ret.append(json.dumps(sent, ensure_ascii=False))
        write_by_lines("{}.{}.pred".format(args.predict_data, args.do_model),
                       ret)
Example #18
        # Construct transfer learning network
        # Use "pooled_output" for classification tasks on an entire sentence.
        # Use "sequence_output" for token-level output.
        pooled_output = outputs["pooled_output"]

        # Set up the feed list for the data feeder
        # All tensors required by the module must be fed
        feed_list = [
            inputs["input_ids"].name,
            inputs["position_ids"].name,
            inputs["segment_ids"].name,
            inputs["input_mask"].name,
        ]

        # Select finetune strategy, setup config and finetune
        strategy = hub.AdamWeightDecayStrategy(
            learning_rate=args.learning_rate)

        # Set up the running config for the PaddleHub Finetune API
        if "L_12" in name or name in [
                "ernie", "ernie_tiny", "ernie_v2_eng_base"
        ] or "L_3" in name:
            batch_size = 16
        else:
            batch_size = 8

        config = hub.RunConfig(use_data_parallel=True,
                               use_cuda=True,
                               num_epoch=2,
                               batch_size=batch_size,
                               checkpoint_dir="ckpt_%s" % name,
                               strategy=strategy,
Example #19
    # Otherwise, tokenizer should be hub.CustomTokenizer.
    tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path(),
                                  tokenize_chinese_chars=True)

    # Load dataset
    dataset = hub.dataset.DuEL(tokenizer=tokenizer,
                               max_seq_len=args.max_seq_len)

    # Construct transfer learning network.
    # Use sequence-level output.
    query = outputs["sequence_output"]
    left = outputs['sequence_output_2']
    right = outputs['sequence_output_3']

    # Select fine-tune strategy.
    strategy = hub.AdamWeightDecayStrategy()

    # Setup RunConfig for PaddleHub Fine-tune API.
    config = hub.RunConfig(use_data_parallel=False,
                           use_cuda=args.use_gpu,
                           num_epoch=args.num_epoch,
                           batch_size=args.batch_size,
                           checkpoint_dir=args.checkpoint_dir,
                           strategy=strategy)

    # Define a pairwise text matching task by PaddleHub's API.
    pairwise_matching_task = hub.PairwiseTextMatchingTask(query_feature=query,
                                                          left_feature=left,
                                                          right_feature=right,
                                                          tokenizer=tokenizer,
                                                          dataset=dataset,
Example #20
            text = row["text_a"]
            labels = [int(value) for value in row[2:]]
            example = InputExample(guid=guid, label=labels, text_a=text)
            examples.append(example)

        return examples


dataset = MultiMydatas()
module = hub.Module(name='ernie')
reader = hub.reader.MultiLabelClassifyReader(
    dataset=dataset, vocab_path=module.get_vocab_path(), max_seq_len=128)

strategy = hub.AdamWeightDecayStrategy(weight_decay=0.01,
                                       warmup_proportion=0.1,
                                       learning_rate=5e-5,
                                       lr_scheduler='linear_decay',
                                       optimizer_name='adam')

config = hub.RunConfig(
    use_data_parallel=False,
    use_pyreader=False,
    use_cuda=False,
    batch_size=32,
    enable_memory_optim=False,
    checkpoint_dir='ernie_txt_cls_turtorial_demo',
    num_epoch=100,
    strategy=strategy,
)
inputs, outputs, program = module.context(trainable=True, max_seq_len=128)
pooled_output = outputs['pooled_output']
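
# The snippet stops at pooled_output. Below is a hedged sketch (not part of the
# original source) of the remaining steps, pairing the MultiLabelClassifyReader
# above with hub.MultiLabelClassifierTask as in the other multi-label example.
feed_list = [
    inputs['input_ids'].name, inputs['position_ids'].name,
    inputs['segment_ids'].name, inputs['input_mask'].name
]
multi_label_task = hub.MultiLabelClassifierTask(data_reader=reader,
                                                feature=pooled_output,
                                                feed_list=feed_list,
                                                num_classes=dataset.num_labels,
                                                config=config)
multi_label_task.finetune_and_eval()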
Example #21
def train_model(model_name):
    checkpoint_dir = "model/" + model_name
    dataset_dir = "data/" + model_name
    # Load Paddlehub ERNIE Tiny pretrained model
    module = hub.Module(name="ernie_tiny")
    inputs, outputs, program = module.context(
        trainable=True, max_seq_len=128)

    # Download the dataset and use accuracy as the metric
    # Available datasets: GLUE/XNLI/ChineseGLUE/NLPCC-DBQA/LCQMC
    # The metric can be acc, f1 or matthews
    # dataset = hub.dataset.ChnSentiCorp()
    dataset = ViolateDataset(dataset_dir=dataset_dir)
    metrics_choices = ["acc"]

    # ernie_tiny uses sub-words to tokenize Chinese sentences
    # For models other than ernie_tiny, sp_model_path and word_dict_path should be set to None
    reader = hub.reader.ClassifyReader(
        dataset=dataset,
        vocab_path=module.get_vocab_path(),
        max_seq_len=128,
        sp_model_path=module.get_spm_path(),
        word_dict_path=module.get_word_dict_path())

    # Construct transfer learning network
    # Use "pooled_output" for classification tasks on an entire sentence.
    # Use "sequence_output" for token-level output.
    pooled_output = outputs["pooled_output"]

    # Set up the feed list for the data feeder
    # All tensors required by the module must be fed
    feed_list = [
        inputs["input_ids"].name,
        inputs["position_ids"].name,
        inputs["segment_ids"].name,
        inputs["input_mask"].name,
    ]

    # Select finetune strategy, setup config and finetune
    strategy = hub.AdamWeightDecayStrategy(
        warmup_proportion=0.1,
        weight_decay=0.01,
        learning_rate=5e-5,
        lr_scheduler="linear_decay")

    # Set up the running config for the PaddleHub Finetune API
    config = hub.RunConfig(
        use_data_parallel=True,
        use_cuda=False,
        num_epoch=3,
        batch_size=24,
        checkpoint_dir=checkpoint_dir,
        # model_dir="./models",
        enable_memory_optim=True,
        strategy=strategy)

    # Define a classification finetune task by PaddleHub's API
    cls_task = hub.TextClassifierTask(
        data_reader=reader,
        feature=pooled_output,
        feed_list=feed_list,
        num_classes=dataset.num_labels,
        config=config,
        metrics_choices=metrics_choices)
    # with cls_task.phase_guard(phase="train"):
    #     cls_task.init_if_necessary()
    #     cls_task.load_parameters("./models/model")
    # Finetune and evaluate by PaddleHub's API
    # will finish training, evaluation, testing, save model automatically
    # cls_task.finetune_and_eval()
    cls_task.finetune()
    # Evaluate by PaddleHub's API
    run_states = cls_task.eval()
    # Get acc score on dev
    eval_avg_score, eval_avg_loss, eval_run_speed = cls_task._calculate_metrics(
        run_states)
    # acc on dev will be used by auto finetune
    print("AutoFinetuneEval" + "\t" + str(float(eval_avg_score["acc"])))
Example #22
def main(type, cnf):
    class SouhuCompetition(TextMatchingDataset):
        def __init__(self, tokenizer=None, max_seq_len=None):
            base_path = './data'
            if type in ['ssA', 'slA', 'llA']:
                train_file = 'data78383/{}_train.tsv'.format(type)
                dev_file = 'data78383/{}_valid.tsv'.format(type)
            elif type in ['ssB', 'slB', 'llB']:
                train_file = 'data78384/{}_train.tsv'.format(type)
                dev_file = 'data78384/{}_valid.tsv'.format(type)
            super(SouhuCompetition, self).__init__(
                is_pair_wise=False,  # text matching type: whether the task is pairwise
                base_path=base_path,
                train_file=train_file,  # file path relative to base_path
                dev_file=dev_file,  # file path relative to base_path
                train_file_with_header=True,
                dev_file_with_header=True,
                label_list=["0", "1"],
                tokenizer=tokenizer,
                max_seq_len=max_seq_len)

    module = hub.Module(name="ernie")

    # A pointwise task needs: query, title_left (2 slots)
    inputs, outputs, program = module.context(trainable=True,
                                              max_seq_len=cnf.max_seq_len,
                                              num_slots=2)

    tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path(),
                                  tokenize_chinese_chars=True)
    dataset = SouhuCompetition(tokenizer=tokenizer,
                               max_seq_len=cnf.max_seq_len)

    strategy = hub.AdamWeightDecayStrategy(weight_decay=0.01,
                                           warmup_proportion=0.1,
                                           learning_rate=1e-5)
    config = hub.RunConfig(
        eval_interval=300,
        use_cuda=True,
        num_epoch=10,
        batch_size=cnf.train_and_eval_batch,
        checkpoint_dir='./ckpt_ernie_pointwise_matching_{}'.format(type),
        strategy=strategy)
    # Build the transfer network using ERNIE's token-level output
    query = outputs["sequence_output"]
    title = outputs['sequence_output_2']
    # Create the pointwise text matching task
    pointwise_matching_task = hub.PointwiseTextMatchingTask(
        dataset=dataset,
        query_feature=query,
        title_feature=title,
        tokenizer=tokenizer,
        config=config)
    run_states = pointwise_matching_task.finetune_and_eval()

    # # Sample data for prediction
    # text_pairs = [
    #     [
    #         "小孩吃了百令胶囊能打预防针吗",  # query
    #         "小孩吃了百令胶囊能不能打预防针",  # title
    #     ],
    #     [
    #         "请问呕血与咯血有什么区别?",  # query
    #         "请问呕血与咯血异同?",  # title
    #     ]
    # ]
    save_df = pd.DataFrame(columns=['id', 'label'])

    def predict(text_pairs):
        results = pointwise_matching_task.predict(
            data=text_pairs,
            max_seq_len=cnf.max_seq_len,
            label_list=dataset.get_labels(),
            return_result=True,
            accelerate_mode=False)
        return results

    if type in ['ssA', 'slA', 'llA']:
        test_file = './data/data78383/{}_test.tsv'.format(type)
    elif type in ['ssB', 'slB', 'llB']:
        test_file = './data/data78384/{}_test.tsv'.format(type)
    test_df = pd.read_csv(test_file, sep='\t')
    test_df.columns = ['text_a', 'text_b', 'id']
    text_pairs = []
    ids = []
    for index, row in test_df.iterrows():
        text_pairs.append([row['text_a'], row['text_b']])
        ids.append(row['id'])
        if len(text_pairs) == cnf.test_batch:
            results = predict(text_pairs)
            for i in range(len(ids)):
                new = pd.DataFrame({
                    'id': ids[i],
                    'label': results[i]
                },
                                   index=[0])
                save_df = save_df.append(new, ignore_index=True)
            text_pairs = []
            ids = []
    if len(text_pairs) != 0:
        results = predict(text_pairs)
        for i in range(len(ids)):
            new = pd.DataFrame({'id': ids[i], 'label': results[i]}, index=[0])
            save_df = save_df.append(new, ignore_index=True)

    save_df.to_csv('./results/{}.csv'.format(type),
                   header=True,
                   sep=',',
                   index=False)
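
# A hedged usage sketch (not part of the original source): run one sub-track of
# the competition. `Config` is only an assumed stand-in for the real `cnf`
# object; its attribute names match those read inside main(), and the values
# are illustrative only.
if __name__ == '__main__':
    class Config:
        max_seq_len = 512
        train_and_eval_batch = 32
        test_batch = 32
    main('ssA', Config())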