Example #1
    def _initialize(self, use_gpu=False):
        # Load the PaddleHub ERNIE Tiny pretrained model
        self.module = hub.Module(name="ernie_tiny")
        inputs, outputs, program = self.module.context(trainable=True,
                                                       max_seq_len=128)

        # Download dataset and get its label list and label num
        # If you only need the label information, you can omit the tokenizer parameter to avoid preprocessing the training set.
        dataset = hub.dataset.Couplet()
        self.label_list = dataset.get_labels()

        # Setup RunConfig for PaddleHub Fine-tune API
        config = hub.RunConfig(use_data_parallel=False,
                               use_cuda=use_gpu,
                               batch_size=1,
                               checkpoint_dir=os.path.join(
                                   self.directory, "assets", "ckpt"),
                               strategy=hub.AdamWeightDecayStrategy())

        # Construct transfer learning network
        # Use "pooled_output" for classification tasks on an entire sentence.
        # Use "sequence_output" for token-level output.
        pooled_output = outputs["pooled_output"]
        sequence_output = outputs["sequence_output"]

        # Define a text generation fine-tune task by PaddleHub's API
        self.gen_task = hub.TextGenerationTask(feature=pooled_output,
                                               token_feature=sequence_output,
                                               max_seq_len=128,
                                               num_classes=dataset.num_labels,
                                               config=config,
                                               metrics_choices=["bleu"])
Example #2
def run_predict(data):
	ds = MyDataset()
	module = hub.Module(name="ernie", version="1.1.0")
	inputs, outputs, program = module.context(max_seq_len=512)
	sequence_output = outputs["sequence_output"]

	reader = MyClassifyReader(
		dataset=ds,
		vocab_path=module.get_vocab_path(),
		max_seq_len=512)

	strategy = hub.AdamWeightDecayStrategy(
		learning_rate=5e-5,
		lr_scheduler="linear_decay",
		warmup_proportion=0.1,
		weight_decay=0.01,
		optimizer_name="adam"
	)
	config = hub.RunConfig(
		use_cuda=False,
		enable_memory_optim=True,
		num_epoch=3,
		batch_size=16,
		strategy=strategy,
		checkpoint_dir="./models/Product")

	feed_list=[
		inputs["input_ids"].name,inputs["position_ids"].name,
		inputs["segment_ids"].name,inputs["input_mask"].name,
	]


	cls_task = GRUTextClassifierTask(
		data_reader=reader,
		feature=sequence_output,
		feed_list=feed_list,
		num_classes=ds.num_labels,
		config=config
	)
	label_map = {3: '天使轮', 1: 'B轮', 4: '战略融资', 0: 'A轮', 2: 'C轮'}
	predictions=[]
	index = 0
	run_states = cls_task.predict(data=data)
	results = [run_state.run_results for run_state in run_states]
	for batch_result in results:
		batch_result = np.argmax(batch_result, axis=2)[0]
		for result in batch_result:
			print(result)
			predictions.append(result)
			index += 1

	# Weighted score over the 5 class probabilities, scaled to 0-100
	score = 0
	for batch_result in results:
		for single_result in batch_result[0]:
			print("=====")
			print(single_result)
			score = (1 * single_result[0] + 2 * single_result[1] + 3 * single_result[2]
				+ 4 * single_result[3] + 5 * single_result[4]) / 15 * 100
	return score
Example #3
    def load(self):
        inputs, outputs, program = self.module.context(
            trainable=True, max_seq_len=128)

        reader = hub.reader.ClassifyReader(
            dataset=self.dataset,
            vocab_path=self.module.get_vocab_path(),
            max_seq_len=128,
            use_task_id=False)

        pooled_output = outputs["pooled_output"]
        feed_list = [
            inputs["input_ids"].name,
            inputs["position_ids"].name,
            inputs["segment_ids"].name,
            inputs["input_mask"].name,
        ]

        config = hub.RunConfig(
            use_pyreader=False,
            use_cuda=True,
            batch_size=30,
            enable_memory_optim=False,
            checkpoint_dir=self.module_in,
            strategy=hub.finetune.strategy.DefaultFinetuneStrategy())

        cls_task = hub.TextClassifierTask(
            data_reader=reader,
            feature=pooled_output,
            feed_list=feed_list,
            num_classes=self.dataset.num_labels,
            config=config,
            metrics_choices=self.metrics_choices)

        return cls_task
Example #4
def finetune(args):
    # Load the PaddleHub pretrained model (default: mobilenet)
    module = hub.Module(name=args.module)
    input_dict, output_dict, program = module.context(trainable=True)

    # Download dataset and use ImageClassificationReader to read dataset
    dataset = hub.dataset.Flowers()
    data_reader = hub.reader.ImageClassificationReader(
        image_width=module.get_expected_image_width(),
        image_height=module.get_expected_image_height(),
        images_mean=module.get_pretrained_images_mean(),
        images_std=module.get_pretrained_images_std(),
        dataset=dataset)

    # The feature map from the last layers of the resnet_v2_101_imagenet network
    feature_map = output_dict["feature_map"]

    img = input_dict["image"]
    feed_list = [img.name]

    # Select finetune strategy, setup config and finetune
    strategy = hub.DefaultFinetuneStrategy(learning_rate=args.learning_rate)
    config = hub.RunConfig(
        use_cuda=True,
        num_epoch=args.epochs,
        batch_size=args.batch_size,
        checkpoint_dir=args.checkpoint_dir,
        strategy=strategy)

    # Construct transfer learning network
    task = hub.ImageClassifierTask(
        data_reader=data_reader,
        feed_list=feed_list,
        feature=feature_map,
        num_classes=dataset.num_labels,
        config=config)

    # Load model from the defined model path or not
    if args.model_path != "":
        with task.phase_guard(phase="train"):
            task.init_if_necessary()
            task.load_parameters(args.model_path)
            logger.info("PaddleHub has loaded model from %s" % args.model_path)

    # Finetune by PaddleHub's API
    task.finetune()
    # Evaluate by PaddleHub's API
    run_states = task.eval()
    # Get acc score on dev
    eval_avg_score, eval_avg_loss, eval_run_speed = task._calculate_metrics(
        run_states)

    # Move ckpt/best_model to the defined saved parameters directory
    best_model_dir = os.path.join(config.checkpoint_dir, "best_model")
    if is_path_valid(args.saved_params_dir) and os.path.exists(best_model_dir):
        shutil.copytree(best_model_dir, args.saved_params_dir)
        shutil.rmtree(config.checkpoint_dir)

    # acc on dev will be used by auto finetune
    hub.report_final_result(eval_avg_score["acc"])
Example #5
    def get_embedding(self, texts, use_gpu=False, batch_size=1):
        """
        get pooled_output and sequence_output for input texts.
        Warnings: this method depends on Paddle Inference Library, it may not work properly in PaddlePaddle <= 1.6.2.

        Args:
            texts (list): each element is a text sample, each sample include text_a and text_b where text_b can be omitted.
                          for example: [[sample0_text_a, sample0_text_b], [sample1_text_a, sample1_text_b], ...]
            use_gpu (bool): use gpu or not, default False.
            batch_size (int): the data batch size, default 1.

        Returns:
            pooled_outputs(list): its element is a numpy array, the first feature of each text sample.
            sequence_outputs(list): its element is a numpy array, the whole features of each text sample.
        """
        if (not hasattr(self, "emb_job")
                or self.emb_job["batch_size"] != batch_size
                or self.emb_job["use_gpu"] != use_gpu):
            inputs, outputs, program = self.context(
                trainable=True, max_seq_len=self.MAX_SEQ_LEN)

            reader = hub.reader.ClassifyReader(
                dataset=None,
                vocab_path=self.get_vocab_path(),
                max_seq_len=self.MAX_SEQ_LEN,
                sp_model_path=self.get_spm_path() if hasattr(
                    self, "get_spm_path") else None,
                word_dict_path=self.get_word_dict_path() if hasattr(
                    self, "get_word_dict_path") else None)

            feed_list = [
                inputs["input_ids"].name,
                inputs["position_ids"].name,
                inputs["segment_ids"].name,
                inputs["input_mask"].name,
            ]

            pooled_feature, seq_feature = outputs["pooled_output"], outputs[
                "sequence_output"]

            config = hub.RunConfig(
                use_data_parallel=False,
                use_cuda=use_gpu,
                batch_size=batch_size)

            self.emb_job = {}
            self.emb_job["task"] = _TransformerEmbeddingTask(
                pooled_feature=pooled_feature,
                seq_feature=seq_feature,
                feed_list=feed_list,
                data_reader=reader,
                config=config,
            )
            self.emb_job["batch_size"] = batch_size
            self.emb_job["use_gpu"] = use_gpu

        return self.emb_job["task"].predict(
            data=texts, return_result=True, accelerate_mode=True)
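
# A minimal usage sketch for the get_embedding method above (assumptions: a PaddleHub 1.x
# transformer module such as ernie_tiny, and placeholder texts; not part of the original snippet):
import paddlehub as hub

module = hub.Module(name="ernie_tiny")
# Each sample is [text_a] or [text_a, text_b]; the texts below are placeholders.
results = module.get_embedding(texts=[["sample text a"], ["another text a", "its text b"]],
                               use_gpu=False,
                               batch_size=1)
# Per the docstring, the result holds the pooled (first-token) feature and the
# token-level sequence feature for each input sample.
print(len(results))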
Example #6
def predict(args):
    module = hub.Module(name=args.module)
    input_dict, output_dict, program = module.context(trainable=True)

    if args.dataset.lower() == "flowers":
        dataset = hub.dataset.Flowers()
    elif args.dataset.lower() == "dogcat":
        dataset = hub.dataset.DogCat()
    elif args.dataset.lower() == "indoor67":
        dataset = hub.dataset.Indoor67()
    elif args.dataset.lower() == "food101":
        dataset = hub.dataset.Food101()
    elif args.dataset.lower() == "stanforddogs":
        dataset = hub.dataset.StanfordDogs()
    else:
        raise ValueError("%s dataset is not defined" % args.dataset)

    data_reader = hub.reader.ImageClassificationReader(
        image_width=module.get_expected_image_width(),
        image_height=module.get_expected_image_height(),
        images_mean=module.get_pretrained_images_mean(),
        images_std=module.get_pretrained_images_std(),
        dataset=dataset)

    feature_map = output_dict["feature_map"]

    img = input_dict["image"]
    feed_list = [img.name]

    config = hub.RunConfig(
        use_data_parallel=False,
        use_pyreader=args.use_pyreader,
        use_cuda=args.use_gpu,
        batch_size=args.batch_size,
        enable_memory_optim=False,
        checkpoint_dir=args.checkpoint_dir,
        strategy=hub.finetune.strategy.DefaultFinetuneStrategy())

    task = hub.ClassifierTask(data_reader=data_reader,
                              feed_list=feed_list,
                              feature=feature_map,
                              num_classes=dataset.num_labels,
                              config=config)

    data = ["./test/test_img_daisy.jpg", "./test/test_img_roses.jpg"]
    label_map = dataset.label_dict()
    index = 0
    # get classification result
    run_states = task.predict(data=data)
    results = [run_state.run_results for run_state in run_states]
    for batch_result in results:
        # get the predicted class index
        batch_result = np.argmax(batch_result, axis=2)[0]
        for result in batch_result:
            index += 1
            result = label_map[result]
            print("input %i is %s, and the predict result is %s" %
                  (index, data[index - 1], result))
Example #7
    def _initialize(self,
                    ckpt_dir="ckpt_chnsenticorp",
                    num_class=2,
                    max_seq_len=128,
                    use_gpu=False,
                    batch_size=1):
        self.ckpt_dir = os.path.join(self.directory, ckpt_dir)
        self.num_class = num_class
        self.MAX_SEQ_LEN = max_seq_len

        # Load the PaddleHub ERNIE Tiny pretrained model
        self.module = hub.Module(name="ernie_tiny")
        inputs, outputs, program = self.module.context(trainable=True,
                                                       max_seq_len=max_seq_len)

        self.vocab_path = self.module.get_vocab_path()

        # Download the dataset and use accuracy as the metric
        # Choose dataset: GLUE/XNLI/ChineseGLUE/NLPCC-DBQA/LCQMC
        # The metric should be acc, f1 or matthews
        metrics_choices = ["acc"]

        # ernie_tiny uses sub-words to tokenize Chinese sentences
        # For models other than ernie_tiny, sp_model_path and word_dict_path should be set to None
        reader = hub.reader.ClassifyReader(
            vocab_path=self.module.get_vocab_path(),
            max_seq_len=max_seq_len,
            sp_model_path=self.module.get_spm_path(),
            word_dict_path=self.module.get_word_dict_path())

        # Construct transfer learning network
        # Use "pooled_output" for classification tasks on an entire sentence.
        # Use "sequence_output" for token-level output.
        pooled_output = outputs["pooled_output"]

        # Setup feed list for data feeder
        # All tensors required by the module must be fed
        feed_list = [
            inputs["input_ids"].name,
            inputs["position_ids"].name,
            inputs["segment_ids"].name,
            inputs["input_mask"].name,
        ]

        # Setup running config for PaddleHub Finetune API
        config = hub.RunConfig(use_data_parallel=False,
                               use_cuda=use_gpu,
                               batch_size=batch_size,
                               checkpoint_dir=self.ckpt_dir,
                               strategy=hub.AdamWeightDecayStrategy())

        # Define a classification finetune task by PaddleHub's API
        self.cls_task = hub.TextClassifierTask(data_reader=reader,
                                               feature=pooled_output,
                                               feed_list=feed_list,
                                               num_classes=self.num_class,
                                               config=config,
                                               metrics_choices=metrics_choices)
Example #8
def predict_tag(model_name, data):
    checkpoint_dir = "model/" + model_name
    dataset_dir = "data/" + model_name
    # Load the PaddleHub ERNIE Tiny pretrained model
    module = hub.Module(name="ernie_tiny")
    inputs, outputs, program = module.context(trainable=True,
                                              max_seq_len=128)
    # Download the dataset and use accuracy as the metric
    # Choose dataset: GLUE/XNLI/ChineseGLUE/NLPCC-DBQA/LCQMC
    dataset = ViolateDataset(dataset_dir=dataset_dir)
    # ernie_tiny uses sub-words to tokenize Chinese sentences
    # For models other than ernie_tiny, sp_model_path and word_dict_path should be set to None
    reader = hub.reader.ClassifyReader(
        dataset=dataset,
        vocab_path=module.get_vocab_path(),
        max_seq_len=128,
        sp_model_path=module.get_spm_path(),
        word_dict_path=module.get_word_dict_path())

    # Construct transfer learning network
    # Use "pooled_output" for classification tasks on an entire sentence.
    # Use "sequence_output" for token-level output.
    pooled_output = outputs["pooled_output"]

    # Setup feed list for data feeder
    # All tensors required by the module must be fed
    feed_list = [
        inputs["input_ids"].name,
        inputs["position_ids"].name,
        inputs["segment_ids"].name,
        inputs["input_mask"].name,
    ]

    # Setup running config for PaddleHub Finetune API
    config = hub.RunConfig(use_data_parallel=True,
                           use_cuda=False,
                           batch_size=24,
                           checkpoint_dir=checkpoint_dir,
                           strategy=hub.AdamWeightDecayStrategy())

    # Define a classification finetune task by PaddleHub's API
    cls_task = hub.TextClassifierTask(data_reader=reader,
                                      feature=pooled_output,
                                      feed_list=feed_list,
                                      num_classes=dataset.num_labels,
                                      config=config)

    # Data to be predicted
    # data = [["有保障"],
    #         ["无风险"],
    #         ["基金过往数据并不代表未来趋势"],
    #         ["为什么"],
    #         ["周杰伦"],
    #         ["吴东瀛"],
    #         ]
    # print(cls_task.predict(data=data, return_result=True))
    return cls_task.predict(data=data, return_result=True)
Example #9
def recognize():
    global flag
    module = hub.Module(name="resnet_v2_50_imagenet")
    dataset = DemoDataset()

    data_reader = hub.reader.ImageClassificationReader(
        image_width=module.get_expected_image_width(),
        image_height=module.get_expected_image_height(),
        images_mean=module.get_pretrained_images_mean(),
        images_std=module.get_pretrained_images_std(),
        dataset=dataset)

    config = hub.RunConfig(
        use_cuda=False,  # whether to use GPU for training; defaults to False
        num_epoch=5,  # number of fine-tune epochs
        checkpoint_dir="cv_finetune_turtorial_demo",  # checkpoint save path; auto-generated if not specified
        batch_size=10,  # training batch size; if using a GPU, adjust batch_size to fit memory
        eval_interval=10,  # evaluation interval; by default the validation set is evaluated every 100 steps
        strategy=hub.finetune.strategy.DefaultFinetuneStrategy())  # fine-tune strategy
        # strategy=hub.finetune.strategy.AdamWeightDecayStrategy())

    input_dict, output_dict, program = module.context(trainable=True)
    img = input_dict["image"]
    feature_map = output_dict["feature_map"]
    feed_list = [img.name]

    task = hub.ImageClassifierTask(
        data_reader=data_reader,
        feed_list=feed_list,
        feature=feature_map,
        num_classes=dataset.num_labels,
        config=config)

    label_map = dataset.label_dict()
    #run_states = task.finetune_and_eval()
    while True:
        if flag == 1:
            data = []
            data.append("/home/xmy/PycharmProjects/test/paddle/proj3_recognizeMyself/temp_out/cap.jpg")
            index = 0
            run_states = task.predict(data=data)
            results = [run_state.run_results for run_state in run_states]

            for batch_result in results:
                batch_result = np.argmax(batch_result, axis=2)[0]
                for result in batch_result:
                    index += 1
                    result = label_map[result]
                    #print("input %i is %s, and the predict result is %s" %
                        #(index, data[index - 1], result))

            if "科比" in result:
                os.system("wmctrl -a \"pycharm\"")
            elif "库里" in result:
                os.system("wmctrl -a \"chrome\"")
            flag = 0
Example #10
def predict(args):
    # Load the PaddleHub pretrained model
    module = hub.Module(name=args.module)
    input_dict, output_dict, program = module.context(trainable=True)

    # Download dataset
    if args.dataset.lower() == "flowers":
        dataset = hub.dataset.Flowers()
    elif args.dataset.lower() == "dogcat":
        dataset = hub.dataset.DogCat()
    elif args.dataset.lower() == "indoor67":
        dataset = hub.dataset.Indoor67()
    elif args.dataset.lower() == "food101":
        dataset = hub.dataset.Food101()
    elif args.dataset.lower() == "stanforddogs":
        dataset = hub.dataset.StanfordDogs()
    else:
        raise ValueError("%s dataset is not defined" % args.dataset)

    # Use ImageClassificationReader to read dataset
    data_reader = hub.reader.ImageClassificationReader(
        image_width=module.get_expected_image_width(),
        image_height=module.get_expected_image_height(),
        images_mean=module.get_pretrained_images_mean(),
        images_std=module.get_pretrained_images_std(),
        dataset=dataset)

    feature_map = output_dict["feature_map"]

    # Setup feed list for data feeder
    feed_list = [input_dict["image"].name]

    # Setup running config for PaddleHub Finetune API
    config = hub.RunConfig(
        use_data_parallel=False,
        use_cuda=args.use_gpu,
        batch_size=args.batch_size,
        checkpoint_dir=args.checkpoint_dir,
        strategy=hub.finetune.strategy.DefaultFinetuneStrategy())

    # Define an image classification finetune task by PaddleHub's API
    task = hub.ImageClassifierTask(
        data_reader=data_reader,
        feed_list=feed_list,
        feature=feature_map,
        num_classes=dataset.num_labels,
        config=config)

    data = ["./test/test_img_daisy.jpg", "./test/test_img_roses.jpg"]
    print(task.predict(data=data, return_result=True))
Example #11
def human_classfication(data):
    '''
    Classify a face image using the previously fine-tuned model.
    :param data: paths of the images to classify
    :return: the predicted face label (who it is)
    '''
    module = hub.Module(name="resnet_v2_18_imagenet")
    dataset = DemoDataset()

    # Model construction
    data_reader = hub.reader.ImageClassificationReader(
        image_width=module.get_expected_image_width(),
        image_height=module.get_expected_image_height(),
        images_mean=module.get_pretrained_images_mean(),
        images_std=module.get_pretrained_images_std(),
        dataset=dataset)

    config = hub.RunConfig(
        use_cuda=False,  # whether to use GPU for training; defaults to False
        num_epoch=4,  # number of fine-tune epochs
        checkpoint_dir="cv_finetune",  # checkpoint save path; auto-generated if not specified
        batch_size=10,  # training batch size; if using a GPU, adjust batch_size to fit memory
        eval_interval=10,  # evaluation interval; by default the validation set is evaluated every 100 steps
        strategy=hub.finetune.strategy.DefaultFinetuneStrategy()
    )  # fine-tune strategy
    # Assemble the FinetuneTask
    input_dict, output_dict, program = module.context(trainable=True)
    img = input_dict["image"]
    feature_map = output_dict["feature_map"]
    feed_list = [img.name]

    task = hub.ImageClassifierTask(data_reader=data_reader,
                                   feed_list=feed_list,
                                   feature=feature_map,
                                   num_classes=dataset.num_labels,
                                   config=config)

    task.load_checkpoint()

    # -------------- Start prediction

    label_map = dataset.label_dict()
    index = 0
    run_states = task.predict(data=data)
    results = [run_state.run_results for run_state in run_states]
    for batch_result in results:
        batch_result = np.argmax(batch_result, axis=2)[0]
        for result in batch_result:
            return result
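
# A minimal usage sketch for human_classfication above (assumptions: the image path is a placeholder
# and a checkpoint has already been saved under "cv_finetune" by a previous fine-tune run):
if __name__ == "__main__":
    # Placeholder path; point it at a face image from your own data.
    predicted_label = human_classfication(["./work/face_to_check.jpg"])
    print("predicted label:", predicted_label)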
Example #12
def finetune(args):
    module = hub.Module(name=args.module)
    input_dict, output_dict, program = module.context(trainable=True)

    if args.dataset.lower() == "flowers":
        dataset = hub.dataset.Flowers()
    elif args.dataset.lower() == "dogcat":
        dataset = hub.dataset.DogCat()
    elif args.dataset.lower() == "indoor67":
        dataset = hub.dataset.Indoor67()
    elif args.dataset.lower() == "food101":
        dataset = hub.dataset.Food101()
    elif args.dataset.lower() == "stanforddogs":
        dataset = hub.dataset.StanfordDogs()
    else:
        raise ValueError("%s dataset is not defined" % args.dataset)

    data_reader = hub.reader.ImageClassificationReader(
        image_width=module.get_expected_image_width(),
        image_height=module.get_expected_image_height(),
        images_mean=module.get_pretrained_images_mean(),
        images_std=module.get_pretrained_images_std(),
        dataset=dataset)

    feature_map = output_dict["feature_map"]

    img = input_dict["image"]
    feed_list = [img.name]

    config = hub.RunConfig(
        use_data_parallel=args.use_data_parallel,
        use_pyreader=args.use_pyreader,
        use_cuda=args.use_gpu,
        num_epoch=args.num_epoch,
        batch_size=args.batch_size,
        enable_memory_optim=False,
        checkpoint_dir=args.checkpoint_dir,
        strategy=hub.finetune.strategy.DefaultFinetuneStrategy())

    task = hub.ImageClassifierTask(
        data_reader=data_reader,
        feed_list=feed_list,
        feature=feature_map,
        num_classes=dataset.num_labels,
        config=config)
    task.finetune_and_eval()
Example #13
def train(train_i, args):
    dataset = MyDataset()
    module = hub.Module(name=args.model)
    reader = hub.reader.MultiLabelClassifyReader(
        dataset=dataset,
        vocab_path=module.get_vocab_path(),
        max_seq_len=args.max_seq_len)

    strategy = hub.AdamWeightDecayStrategy(
        weight_decay=args.weight_decay,
        warmup_proportion=args.warmup_proportion,
        lr_scheduler=args.lr_scheduler,
        learning_rate=args.learning_rate)
    config = hub.RunConfig(use_cuda=args.use_gpu,
                           num_epoch=args.num_epoch,
                           checkpoint_dir=args.checkpoint_dir + str(train_i),
                           batch_size=args.batch_size,
                           eval_interval=eval_interval,
                           log_interval=log_interval,
                           strategy=strategy)
    inputs, outputs, program = module.context(trainable=True,
                                              max_seq_len=args.max_seq_len)

    # Use "pooled_output" for classification tasks on an entire sentence.
    pooled_output = outputs["pooled_output"]

    # The order of tensors in feed_list must not be changed
    feed_list = [
        inputs["input_ids"].name,
        inputs["position_ids"].name,
        inputs["segment_ids"].name,
        inputs["input_mask"].name,
    ]

    cls_task = hub.MultiLabelClassifierTask(data_reader=reader,
                                            feature=pooled_output,
                                            feed_list=feed_list,
                                            num_classes=dataset.num_labels,
                                            config=config)
    cls_task.main_program.random_seed = args.seed
    change_task(cls_task, train_i)
    return cls_task, reader
Example #14
    dataset = hub.dataset.DuEL(
        tokenizer=tokenizer, max_seq_len=args.max_seq_len)

    # Construct transfer learning network
    # Use token-level output.
    query = outputs["emb"]
    left = outputs['emb_2']
    right = outputs['emb_3']

    # Select fine-tune strategy
    strategy = hub.DefaultStrategy(optimizer_name="sgd")

    # Setup RunConfig for PaddleHub Fine-tune API
    config = hub.RunConfig(
        use_data_parallel=False,
        use_cuda=False,
        batch_size=args.batch_size,
        checkpoint_dir=args.checkpoint_dir,
        strategy=strategy)

    # Define a text matching task by PaddleHub's API
    # network choice: bow, cnn, gru, lstm (PaddleHub pre-defined network)
    pairwise_matching_task = hub.PairwiseTextMatchingTask(
        dataset=dataset,
        query_feature=query,
        left_feature=left,
        right_feature=right,
        tokenizer=tokenizer,
        network=args.network,
        config=config)

    # Prediction data sample.
Example #15
        max_seq_len=args.max_seq_len,
        use_task_id=args.use_taskid)

    # Construct transfer learning network
    # Use "pooled_output" for classification tasks on an entire sentence.
    pooled_output = outputs["pooled_output"]

    # Select finetune strategy, setup config and finetune
    strategy = hub.AdamWeightDecayStrategy(weight_decay=args.weight_decay,
                                           learning_rate=args.learning_rate,
                                           lr_scheduler="linear_decay")

    # Setup running config for PaddleHub Finetune API
    config = hub.RunConfig(use_cuda=args.use_gpu,
                           num_epoch=args.num_epoch,
                           batch_size=args.batch_size,
                           checkpoint_dir=args.checkpoint_dir,
                           strategy=strategy)

    # Define a multi-label classification finetune task by PaddleHub's API
    multi_label_cls_task = hub.MultiLabelClassifierTask(
        data_reader=reader,
        feature=pooled_output,
        feed_list=feed_list,
        num_classes=dataset.num_labels,
        config=config)

    # Finetune and evaluate by PaddleHub's API
    # Training, evaluation, testing and model saving are handled automatically
    multi_label_cls_task.finetune_and_eval()
Example #16
module = hub.Module(name="chinese-roberta-wwm-ext-large")
inputs, outputs, program = module.context(trainable=True, max_seq_len=128)
program.random_seed = 1


reader = hub.reader.ClassifyReader(
    dataset=dataset,
    vocab_path=module.get_vocab_path(),
    max_seq_len=128,
    random_seed=1)

print("learning rate: ", eval(args.lr))
print("max epoch: ", args.max_epoch)
strategy = hub.DefaultFinetuneStrategy(learning_rate=eval(args.lr), optimizer_name="sgd")

config = hub.RunConfig(use_cuda=True,
                       num_epoch=args.max_epoch,
                       batch_size=32,
                       strategy=strategy,
                       log_interval=100,
                       eval_interval=1400,
                       save_ckpt_interval=1400,
                       checkpoint_dir='./checkpoint_aug')

pooled_output = outputs["pooled_output"]

feed_list = [
    inputs["input_ids"].name,
    inputs["position_ids"].name,
    inputs["segment_ids"].name,
    inputs["input_mask"].name
]

cls_task = hub.TextClassifierTask(
    data_reader=reader,
    feature=pooled_output,
    feed_list=feed_list,
Example #17
def predict(args):
    module_name = args.module  # 'yolov3_darknet53_coco2017'
    model_type = get_model_type(module_name)  # 'yolo'
    # define data
    ds = hub.dataset.Coco10(model_type)
    print("ds.num_labels", ds.num_labels)

    data_reader = ObjectDetectionReader(dataset=ds, model_type=model_type)

    # define model(program)
    module = hub.Module(name=module_name)
    if model_type == 'rcnn':
        input_dict, output_dict, program = module.context(trainable=True,
                                                          phase='train')
        input_dict_pred, output_dict_pred, program_pred = module.context(
            trainable=False)
    else:
        input_dict, output_dict, program = module.context(trainable=True)
        input_dict_pred = output_dict_pred = None
    feed_list, pred_feed_list = get_feed_list(module_name, input_dict,
                                              input_dict_pred)
    feature, pred_feature = get_mid_feature(module_name, output_dict,
                                            output_dict_pred)

    config = hub.RunConfig(
        use_data_parallel=False,
        use_pyreader=True,
        use_cuda=args.use_gpu,
        batch_size=args.batch_size,
        enable_memory_optim=False,
        checkpoint_dir=args.checkpoint_dir,
        strategy=hub.finetune.strategy.DefaultFinetuneStrategy())

    task = hub.DetectionTask(data_reader=data_reader,
                             num_classes=ds.num_labels,
                             feed_list=feed_list,
                             feature=feature,
                             predict_feed_list=pred_feed_list,
                             predict_feature=pred_feature,
                             model_type=model_type,
                             config=config)

    data = [
        "./test/test_img_bird.jpg",
        "./test/test_img_cat.jpg",
    ]
    label_map = ds.label_dict()
    run_states = task.predict(data=data, accelerate_mode=False)
    results = [run_state.run_results for run_state in run_states]
    for outs in results:
        keys = ['im_shape', 'im_id', 'bbox']
        res = {
            k: (np.array(v), v.recursive_sequence_lengths())
            for k, v in zip(keys, outs)
        }
        print("im_id", res['im_id'])
        is_bbox_normalized = dconf.conf[model_type]['is_bbox_normalized']
        clsid2catid = {}
        for k in label_map:
            clsid2catid[k] = k
        bbox_results = bbox2out([res], clsid2catid, is_bbox_normalized)
        print(bbox_results)
Example #18
        inputs["input_mask"].name,
    ]

    # Select finetune strategy, setup config and finetune
    strategy = hub.AdamWeightDecayStrategy(
        weight_decay=args.weight_decay,
        learning_rate=args.learning_rate,
        warmup_proportion=args.warmup_proportion,
        lr_scheduler="linear_decay")

    # Setup running config for PaddleHub Finetune API
    config = hub.RunConfig(eval_interval=300,
                           use_pyreader=args.use_pyreader,
                           use_data_parallel=args.use_data_parallel,
                           use_cuda=args.use_gpu,
                           num_epoch=args.num_epoch,
                           batch_size=args.batch_size,
                           checkpoint_dir=args.checkpoint_dir,
                           enable_memory_optim=True,
                           strategy=strategy)

    # Define a reading comprehension finetune task by PaddleHub's API
    reading_comprehension_task = hub.ReadingComprehensionTask(
        data_reader=reader,
        feature=seq_output,
        feed_list=feed_list,
        config=config,
        sub_task=args.dataset,
    )

    # Finetune by PaddleHub's API
Example #19
        inputs["segment_ids"].name,
        inputs["input_mask"].name,
    ]

    if args.use_taskid:
        feed_list.append(inputs["task_ids"].name)

    # Select finetune strategy, setup config and finetune
    strategy = hub.AdamWeightDecayStrategy(weight_decay=args.weight_decay,
                                           learning_rate=args.learning_rate,
                                           lr_scheduler="linear_decay")

    # Setup running config for PaddleHub Finetune API
    config = hub.RunConfig(use_data_parallel=args.use_data_parallel,
                           use_pyreader=args.use_pyreader,
                           use_cuda=args.use_gpu,
                           num_epoch=args.num_epoch,
                           batch_size=args.batch_size,
                           checkpoint_dir=args.checkpoint_dir,
                           strategy=strategy)

    # Define a regression finetune task by PaddleHub's API
    reg_task = hub.RegressionTask(data_reader=reader,
                                  feature=pooled_output,
                                  feed_list=feed_list,
                                  config=config)

    # Finetune and evaluate by PaddleHub's API
    # Training, evaluation, testing and model saving are handled automatically
    reg_task.finetune_and_eval()
Example #20
def main():
    # Load the PaddleHub pretrained model
    # More pretrained models: https://www.paddlepaddle.org.cn/hublist?filter=en_category&value=SemanticModel
    #model_name = "ernie_tiny"
    model_name = "chinese-roberta-wwm-ext-large"
    module = hub.Module(name=model_name)
    inputs, outputs, program = module.context(trainable=True,
                                              max_seq_len=args.max_seq_len)

    # Download dataset and use SequenceLabelReader to read dataset
    dataset = EEDataset(args.data_dir, schema_labels, model=args.do_model)
    reader = hub.reader.SequenceLabelReader(
        dataset=dataset,
        vocab_path=module.get_vocab_path(),
        max_seq_len=args.max_seq_len,
        sp_model_path=module.get_spm_path(),
        word_dict_path=module.get_word_dict_path())

    # Construct transfer learning network
    # Use "sequence_output" for token-level output.
    sequence_output = outputs["sequence_output"]

    # Setup feed list for data feeder
    # All tensors required by the module must be fed
    feed_list = [
        inputs["input_ids"].name, inputs["position_ids"].name,
        inputs["segment_ids"].name, inputs["input_mask"].name
    ]

    # Select a finetune strategy
    strategy = hub.AdamWeightDecayStrategy(
        warmup_proportion=args.warmup_proportion,
        weight_decay=args.weight_decay,
        learning_rate=args.learning_rate)

    # Setup running config for PaddleHub Finetune API
    config = hub.RunConfig(eval_interval=args.eval_step,
                           save_ckpt_interval=args.model_save_step,
                           use_data_parallel=args.use_data_parallel,
                           use_cuda=args.use_gpu,
                           num_epoch=args.num_epoch,
                           batch_size=args.batch_size,
                           checkpoint_dir=args.checkpoint_dir,
                           strategy=strategy)

    # Define a sequence labeling finetune task by PaddleHub's API
    # If add_crf is set, the network uses CRF as the decoder
    seq_label_task = hub.SequenceLabelTask(data_reader=reader,
                                           feature=sequence_output,
                                           feed_list=feed_list,
                                           max_seq_len=args.max_seq_len,
                                           num_classes=dataset.num_labels,
                                           config=config,
                                           add_crf=args.add_crf)

    # Finetune and evaluate the model by PaddleHub's API
    # Training, evaluation, testing and model saving are handled automatically
    if args.do_train:
        print("start finetune and eval process")
        seq_label_task.finetune_and_eval()

    if args.do_predict:
        print("start predict process")
        ret = []
        id2label = {val: key for key, val in reader.label_map.items()}
        input_data = [[d] for d in predict_data]
        run_states = seq_label_task.predict(data=input_data[1:])
        results = []
        for batch_states in run_states:
            batch_results = batch_states.run_results
            batch_infers = batch_results[0].reshape([-1]).astype(
                np.int32).tolist()
            seq_lens = batch_results[1].reshape([-1]).astype(np.int32).tolist()
            current_id = 0
            for length in seq_lens:
                seq_infers = batch_infers[current_id:current_id + length]
                seq_result = list(map(id2label.get, seq_infers[1:-1]))
                current_id += length if args.add_crf else args.max_seq_len
                results.append(seq_result)

        ret = []
        for sent, r_label in zip(predict_sents, results):
            sent["labels"] = r_label
            ret.append(json.dumps(sent, ensure_ascii=False))
        write_by_lines("{}.{}.pred".format(args.predict_data, args.do_model),
                       ret)
Example #21
def main(type, cnf):
    class SouhuCompetition(TextMatchingDataset):
        def __init__(self, tokenizer=None, max_seq_len=None):
            base_path = './data'
            if type in ['ssA', 'slA', 'llA']:
                train_file = 'data78383/{}_train.tsv'.format(type)
                dev_file = 'data78383/{}_valid.tsv'.format(type)
            elif type in ['ssB', 'slB', 'llB']:
                train_file = 'data78384/{}_train.tsv'.format(type)
                dev_file = 'data78384/{}_valid.tsv'.format(type)
            super(SouhuCompetition, self).__init__(
                is_pair_wise=False,  # text-matching type: whether the task is pairwise
                base_path=base_path,
                train_file=train_file,  # file path relative to base_path
                dev_file=dev_file,  # file path relative to base_path
                train_file_with_header=True,
                dev_file_with_header=True,
                label_list=["0", "1"],
                tokenizer=tokenizer,
                max_seq_len=max_seq_len)

    module = hub.Module(name="ernie")

    # A pointwise task needs: query, title_left (2 slots)
    inputs, outputs, program = module.context(trainable=True,
                                              max_seq_len=cnf.max_seq_len,
                                              num_slots=2)

    tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path(),
                                  tokenize_chinese_chars=True)
    dataset = SouhuCompetition(tokenizer=tokenizer,
                               max_seq_len=cnf.max_seq_len)

    strategy = hub.AdamWeightDecayStrategy(weight_decay=0.01,
                                           warmup_proportion=0.1,
                                           learning_rate=1e-5)
    config = hub.RunConfig(
        eval_interval=300,
        use_cuda=True,
        num_epoch=10,
        batch_size=cnf.train_and_eval_batch,
        checkpoint_dir='./ckpt_ernie_pointwise_matching_{}'.format(type),
        strategy=strategy)
    # Build the transfer network using ERNIE's token-level output
    query = outputs["sequence_output"]
    title = outputs['sequence_output_2']
    # Create a pointwise text matching task
    pointwise_matching_task = hub.PointwiseTextMatchingTask(
        dataset=dataset,
        query_feature=query,
        title_feature=title,
        tokenizer=tokenizer,
        config=config)
    run_states = pointwise_matching_task.finetune_and_eval()

    # # Prediction data sample
    # text_pairs = [
    #     [
    #         "小孩吃了百令胶囊能打预防针吗",  # query
    #         "小孩吃了百令胶囊能不能打预防针",  # title
    #     ],
    #     [
    #         "请问呕血与咯血有什么区别?",  # query
    #         "请问呕血与咯血异同?",  # title
    #     ]
    # ]
    save_df = pd.DataFrame(columns=['id', 'label'])

    def predict(text_pairs):
        results = pointwise_matching_task.predict(
            data=text_pairs,
            max_seq_len=cnf.max_seq_len,
            label_list=dataset.get_labels(),
            return_result=True,
            accelerate_mode=False)
        return results

    if type in ['ssA', 'slA', 'llA']:
        test_file = './data/data78383/{}_test.tsv'.format(type)
    elif type in ['ssB', 'slB', 'llB']:
        test_file = './data/data78384/{}_test.tsv'.format(type)
    test_df = pd.read_csv(test_file, sep='\t')
    test_df.columns = ['text_a', 'text_b', 'id']
    text_pairs = []
    ids = []
    for index, row in test_df.iterrows():
        text_pairs.append([row['text_a'], row['text_b']])
        ids.append(row['id'])
        if len(text_pairs) == cnf.test_batch:
            results = predict(text_pairs)
            for i in range(len(ids)):
                new = pd.DataFrame({'id': ids[i], 'label': results[i]}, index=[0])
                save_df = save_df.append(new, ignore_index=True)
            text_pairs = []
            ids = []
    if len(text_pairs) != 0:
        results = predict(text_pairs)
        for i in range(len(ids)):
            new = pd.DataFrame({'id': ids[i], 'label': results[i]}, index=[0])
            save_df = save_df.append(new, ignore_index=True)

    save_df.to_csv('./results/{}.csv'.format(type),
                   header=True,
                   sep=',',
                   index=False)
Example #22
# * `eval_interval`: evaluate performance on the validation set every 50 steps;
#
# * `checkpoint_dir`: save the trained parameters and data to the cv_finetune_turtorial_demo directory;
#
# * `strategy`: use the DefaultFinetuneStrategy for fine-tuning;
#
# For more run configurations, see [RunConfig](https://github.com/PaddlePaddle/PaddleHub/wiki/PaddleHub-API:-RunConfig)
#
# PaddleHub also provides many optimization strategies, such as `AdamWeightDecayStrategy`, `ULMFiTStrategy`, and `DefaultFinetuneStrategy`; see [Strategy](https://github.com/PaddlePaddle/PaddleHub/wiki/PaddleHub-API:-Strategy) for details


config = hub.RunConfig(
    use_cuda=False,  # whether to use GPU for training; defaults to False
    num_epoch=3,  # number of fine-tune epochs
    checkpoint_dir="cv_finetune_turtorial_demo",  # checkpoint save path; auto-generated if not specified
    batch_size=3,  # training batch size; if using a GPU, adjust batch_size to fit memory
    eval_interval=10,  # evaluation interval; by default the validation set is evaluated every 100 steps
    strategy=hub.finetune.strategy.DefaultFinetuneStrategy())  # fine-tune strategy

# ### Step 6: Assemble the Finetune Task
# With a suitable pretrained model and the dataset to transfer to, we can assemble a Task.
#
# Since this dataset defines a binary classification task, while the downloaded classification module was trained as a 1000-class model on ImageNet, we need to lightly adapt the model into a binary classifier:
#
# 1. Get the module's context, including the input and output variables and the Paddle Program;
# 2. From the output variables, find the feature extraction layer feature_map;
# 3. Append a fully connected layer after feature_map to create the Task (see the sketch below).

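# A sketch of the code cell this step describes, following the same pattern as the other
# image-classification examples in this file (assumption: `module`, `dataset`, `data_reader`
# and `config` were created in the earlier notebook steps):

# 1. Get the module's context: input/output variables and the Paddle Program
input_dict, output_dict, program = module.context(trainable=True)
# 2. Take the feature extraction layer from the outputs
feature_map = output_dict["feature_map"]
feed_list = [input_dict["image"].name]
# 3. ImageClassifierTask appends a fully connected layer after feature_map to form the Task
task = hub.ImageClassifierTask(
    data_reader=data_reader,
    feed_list=feed_list,
    feature=feature_map,
    num_classes=dataset.num_labels,
    config=config)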
Example #23
        inputs["position_ids"].name,
        inputs["segment_ids"].name,
        inputs["input_mask"].name,
    ]

    # Select finetune strategy, setup config and finetune
    strategy = hub.AdamWeightDecayStrategy(
        warmup_proportion=args.warmup_proportion,
        weight_decay=args.weight_decay,
        learning_rate=args.learning_rate)

    # Setup running config for PaddleHub Finetune API
    config = hub.RunConfig(
        eval_interval=300,
        use_data_parallel=args.use_data_parallel,
        use_cuda=args.use_gpu,
        num_epoch=args.num_epoch,
        batch_size=args.batch_size,
        checkpoint_dir=args.checkpoint_dir,
        strategy=strategy)

    # Define a regression finetune task by PaddleHub's API
    reg_task = hub.RegressionTask(
        data_reader=reader,
        feature=pooled_output,
        feed_list=feed_list,
        config=config)

    # Finetune and evaluate by PaddleHub's API
    # Training, evaluation, testing and model saving are handled automatically
    reg_task.finetune_and_eval()
Example #24
def get_task(args, schema_labels, id):
    # Load the PaddleHub pretrained model: ERNIE Tiny / RoBERTa large
    # More pretrained models: https://www.paddlepaddle.org.cn/hublist?filter=en_category&value=SemanticModel
    # model_name = "ernie_tiny"
    model_name = args.model_name
    module = hub.Module(name=model_name)
    inputs, outputs, program = module.context(trainable=True,
                                              max_seq_len=args.max_seq_len)

    # Load the data and read it with SequenceLabelReader
    dataset = EEDataset(args.data_dir, schema_labels, model=args.do_model)
    reader = SequenceLabelReader(dataset=dataset,
                                 vocab_path=module.get_vocab_path(),
                                 max_seq_len=args.max_seq_len,
                                 sp_model_path=module.get_spm_path(),
                                 word_dict_path=module.get_word_dict_path())

    # Build the transfer network for the sequence labeling task
    # Use ERNIE's character-level output sequence_output as the input of the transfer network
    sequence_output = outputs["sequence_output"]
    # sequence_output  = fluid.layers.dropout(
    #     x=sequence_output ,
    #     dropout_prob=args.dropout,
    #     dropout_implementation="upscale_in_train")

    # Set feed_list, the variables the model program needs as input
    # The order below must not be changed
    feed_list = [
        inputs["input_ids"].name, inputs["position_ids"].name,
        inputs["segment_ids"].name, inputs["input_mask"].name
    ]

    # Select the optimization strategy
    strategy = hub.AdamWeightDecayStrategy(
        warmup_proportion=args.warmup_proportion,
        weight_decay=args.weight_decay,
        learning_rate=args.learning_rate)

    # Configure the run settings
    config = hub.RunConfig(
        log_interval=100,
        eval_interval=args.eval_step,
        save_ckpt_interval=args.model_save_step,
        use_data_parallel=args.use_data_parallel,
        use_cuda=args.use_gpu,
        # enable_memory_optim=True,
        num_epoch=args.num_epoch,
        batch_size=args.batch_size,
        checkpoint_dir=args.checkpoint_dir,
        strategy=strategy)

    # Build the sequence labeling transfer task
    seq_label_task = SequenceLabelTask(data_reader=reader,
                                       feature=sequence_output,
                                       feed_list=feed_list,
                                       max_seq_len=args.max_seq_len,
                                       num_classes=dataset.num_labels,
                                       config=config,
                                       add_crf=args.add_crf)
    seq_label_task.main_program.random_seed = args.random_seed
    add_hook(args, seq_label_task, id)
    return seq_label_task, reader
Example #25
    # Use "sequence_output" for token-level output.
    seq_output = outputs["sequence_output"]

    # Setup feed list for data feeder
    feed_list = [
        inputs["input_ids"].name,
        inputs["position_ids"].name,
        inputs["segment_ids"].name,
        inputs["input_mask"].name,
    ]

    # Setup running config for PaddleHub Finetune API
    config = hub.RunConfig(
        use_data_parallel=False,
        use_cuda=args.use_gpu,
        batch_size=args.batch_size,
        checkpoint_dir=args.checkpoint_dir,
        strategy=hub.AdamWeightDecayStrategy())

    # Define a reading comprehension finetune task by PaddleHub's API
    reading_comprehension_task = hub.ReadingComprehensionTask(
        data_reader=reader,
        feature=seq_output,
        feed_list=feed_list,
        config=config)

    # Data to be predicted
    data = dataset.dev_examples[:10]
    reading_comprehension_task.predict(data=data)
Example #26
def finetune(args):
    module_name = args.module  # 'yolov3_darknet53_coco2017'
    model_type = get_model_type(module_name)  # 'yolo'
    # define dataset
    ds = hub.dataset.Coco10(model_type)
    # base_path = '/home/local3/zhaopenghao/data/detect/paddle-job-84942-0'
    # train_dir = 'train_data/images'
    # train_list = 'train_data/coco/instances_coco.json'
    # val_dir = 'eval_data/images'
    # val_list = 'eval_data/coco/instances_coco.json'
    # ds = ObjectDetectionDataset(base_path, train_dir, train_list, val_dir, val_list, val_dir, val_list, model_type=model_type)
    # print(ds.label_dict())
    print("ds.num_labels", ds.num_labels)

    # define batch reader
    data_reader = ObjectDetectionReader(dataset=ds, model_type=model_type)

    # define model(program)
    module = hub.Module(name=module_name)
    if model_type == 'rcnn':
        input_dict, output_dict, program = module.context(trainable=True,
                                                          phase='train')
        input_dict_pred, output_dict_pred, program_pred = module.context(
            trainable=False)
    else:
        input_dict, output_dict, program = module.context(trainable=True)
        input_dict_pred = output_dict_pred = None

    print("input_dict keys", input_dict.keys())
    print("output_dict keys", output_dict.keys())
    feed_list, pred_feed_list = get_feed_list(module_name, input_dict,
                                              input_dict_pred)
    print("output_dict length:", len(output_dict))
    print(output_dict.keys())
    if output_dict_pred is not None:
        print(output_dict_pred.keys())
    feature, pred_feature = get_mid_feature(module_name, output_dict,
                                            output_dict_pred)

    config = hub.RunConfig(
        log_interval=10,
        eval_interval=100,
        use_data_parallel=args.use_data_parallel,
        use_pyreader=True,
        use_cuda=args.use_gpu,
        num_epoch=args.num_epoch,
        batch_size=args.batch_size,
        enable_memory_optim=False,
        checkpoint_dir=args.checkpoint_dir,
        strategy=hub.finetune.strategy.DefaultFinetuneStrategy(
            learning_rate=0.00025, optimizer_name="adam"))

    task = hub.DetectionTask(data_reader=data_reader,
                             num_classes=ds.num_labels,
                             feed_list=feed_list,
                             feature=feature,
                             predict_feed_list=pred_feed_list,
                             predict_feature=pred_feature,
                             model_type=model_type,
                             config=config)
    task.finetune_and_eval()
Example #27
    sequence_output = outputs["sequence_output"]

    # Setup feed list for data feeder
    # All tensors required by the ERNIE module must be fed
    feed_list = [
        inputs["input_ids"].name,
        inputs["position_ids"].name,
        inputs["segment_ids"].name,
        inputs["input_mask"].name,
    ]

    # Setup running config for PaddleHub Finetune API
    config = hub.RunConfig(
        use_data_parallel=False,
        use_pyreader=args.use_pyreader,
        use_cuda=args.use_gpu,
        batch_size=args.batch_size,
        enable_memory_optim=False,
        checkpoint_dir=args.checkpoint_dir,
        strategy=hub.finetune.strategy.DefaultFinetuneStrategy())

    # Define a sequence labeling finetune task by PaddleHub's API
    seq_label_task = hub.SequenceLabelTask(data_reader=reader,
                                           feature=sequence_output,
                                           feed_list=feed_list,
                                           max_seq_len=args.max_seq_len,
                                           num_classes=dataset.num_labels,
                                           config=config)

    # test data
    data = [
        ["我们变而以书会友,以书结缘,把欧美、港台流行的食品类图谱、画册、工具书汇集一堂。"],
Example #28
                                   vocab_path=module.get_vocab_path(),
                                   sp_model_path=module.get_spm_path(),
                                   word_dict_path=module.get_word_dict_path(),
                                   max_seq_len=128)

strategy = hub.AdamWeightDecayStrategy(
    weight_decay=0.01,
    warmup_proportion=0.1,
    # learning_rate=5e-5,
    lr_scheduler="linear_decay",
    learning_rate=5e-5)

config = hub.RunConfig(use_cuda=True,
                       use_data_parallel=True,
                       num_epoch=1,
                       checkpoint_dir="module",
                       batch_size=20,
                       eval_interval=400,
                       strategy=strategy)

inputs, outputs, program = module.context(trainable=True, max_seq_len=128)

# Use "pooled_output" for classification tasks on an entire sentence.
pooled_output = outputs["pooled_output"]

feed_list = [
    inputs["input_ids"].name,
    inputs["position_ids"].name,
    inputs["segment_ids"].name,
    inputs["input_mask"].name,
]
Example #29
def train_model(model_name):
    checkpoint_dir = "model/" + model_name
    dataset_dir = "data/" + model_name
    # Load the PaddleHub ERNIE Tiny pretrained model
    module = hub.Module(name="ernie_tiny")
    inputs, outputs, program = module.context(
        trainable=True, max_seq_len=128)

    # Download the dataset and use accuracy as the metric
    # Choose dataset: GLUE/XNLI/ChineseGLUE/NLPCC-DBQA/LCQMC
    # The metric should be acc, f1 or matthews
    # dataset = hub.dataset.ChnSentiCorp()
    dataset = ViolateDataset(dataset_dir=dataset_dir)
    metrics_choices = ["acc"]

    # ernie_tiny uses sub-words to tokenize Chinese sentences
    # For models other than ernie_tiny, sp_model_path and word_dict_path should be set to None
    reader = hub.reader.ClassifyReader(
        dataset=dataset,
        vocab_path=module.get_vocab_path(),
        max_seq_len=128,
        sp_model_path=module.get_spm_path(),
        word_dict_path=module.get_word_dict_path())

    # Construct transfer learning network
    # Use "pooled_output" for classification tasks on an entire sentence.
    # Use "sequence_output" for token-level output.
    pooled_output = outputs["pooled_output"]

    # Setup feed list for data feeder
    # All tensors required by the module must be fed
    feed_list = [
        inputs["input_ids"].name,
        inputs["position_ids"].name,
        inputs["segment_ids"].name,
        inputs["input_mask"].name,
    ]

    # Select finetune strategy, setup config and finetune
    strategy = hub.AdamWeightDecayStrategy(
        warmup_proportion=0.1,
        weight_decay=0.01,
        learning_rate=5e-5,
        lr_scheduler="linear_decay")

    # Setup running config for PaddleHub Finetune API
    config = hub.RunConfig(
        use_data_parallel=True,
        use_cuda=False,
        num_epoch=3,
        batch_size=24,
        checkpoint_dir=checkpoint_dir,
        # model_dir="./models",
        enable_memory_optim=True,
        strategy=strategy)

    # Define a classification finetune task by PaddleHub's API
    cls_task = hub.TextClassifierTask(
        data_reader=reader,
        feature=pooled_output,
        feed_list=feed_list,
        num_classes=dataset.num_labels,
        config=config,
        metrics_choices=metrics_choices)
    # with cls_task.phase_guard(phase="train"):
    #     cls_task.init_if_necessary()
    #     cls_task.load_parameters("./models/model")
    # Finetune and evaluate by PaddleHub's API
    # Training, evaluation, testing and model saving are handled automatically
    # cls_task.finetune_and_eval()
    cls_task.finetune()
    # Evaluate by PaddleHub's API
    run_states = cls_task.eval()
    # Get acc score on dev
    eval_avg_score, eval_avg_loss, eval_run_speed = cls_task._calculate_metrics(
        run_states)
    # acc on dev will be used by auto finetune
    print("AutoFinetuneEval" + "\t" + str(float(eval_avg_score["acc"])))
Example #30
        inputs["segment_ids"].name,
        inputs["input_mask"].name,
    ]

    # Select fine-tune strategy, setup config and fine-tune
    strategy = hub.AdamWeightDecayStrategy(
        warmup_proportion=args.warmup_prop,
        learning_rate=args.learning_rate,
        weight_decay=args.weight_decay,
        lr_scheduler="linear_decay")

    # Setup RunConfig for PaddleHub Fine-tune API
    config = hub.RunConfig(
        checkpoint_dir=args.checkpoint_dir,
        use_cuda=True,
        num_epoch=args.epochs,
        batch_size=args.batch_size,
        enable_memory_optim=True,
        strategy=strategy)

    # Define a classification fine-tune task by PaddleHub's API
    cls_task = hub.TextClassifierTask(
        data_reader=reader,
        feature=pooled_output,
        feed_list=feed_list,
        num_classes=dataset.num_labels,
        config=config,
        metrics_choices=metrics_choices)

    # Load model from the defined model path or not
    if args.model_path != "":