Example #1
    def train(self, train_examples, dev_examples, dir_path=None):

        train_examples = SentencesDataset(train_examples, self.model)
        dev_examples = SentencesDataset(dev_examples, self.model)

        train_dataloader = DataLoader(train_examples,
                                      shuffle=True,
                                      batch_size=self.args.train_batch_size)
        dev_dataloader = DataLoader(dev_examples,
                                    shuffle=False,
                                    batch_size=self.args.eval_batch_size)

        train_loss = losses.CosineSimilarityLoss(model=self.model)
        evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

        warmup_steps = math.ceil(
            len(train_examples) * self.args.num_train_epochs /
            self.args.train_batch_size * self.args.warmup_proportion)

        self.model.zero_grad()
        self.model.train()
        self.model.fit(train_objectives=[(train_dataloader, train_loss)],
                       evaluator=evaluator,
                       epochs=self.args.num_train_epochs,
                       evaluation_steps=10000,
                       warmup_steps=warmup_steps,
                       output_path=None,
                       optimizer_params={
                           'lr': self.args.learning_rate,
                           'eps': 1e-6,
                           'correct_bias': False
                       })
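A quick worked instance of the warm-up formula above (numbers are illustrative, not from the source): with 10,000 training pairs, 3 epochs, a batch size of 16 and a warmup_proportion of 0.1, it yields

import math
math.ceil(10_000 * 3 / 16 * 0.1)  # -> 188 warm-up steps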
Example #2
    def train(self, train_df, eval_df):
        """

        :param train_df: dataframe with columns 'text_a', 'text_b', 'labels'
        :param eval_df: dataframe with columns 'text_a', 'text_b', 'labels'
        :return:
        """

        # format training data
        if "text_a" in train_df.columns and "text_b" in train_df.columns and "labels" in train_df.columns:
            if self.args.do_lower_case:
                train_df.loc[:, 'text_a'] = train_df['text_a'].str.lower()
                train_df.loc[:, 'text_b'] = train_df['text_b'].str.lower()

            train_examples = [
                InputExample(str(i), [text_a, text_b], label)
                for i, (text_a, text_b, label) in enumerate(
                    zip(
                        train_df["text_a"].astype(str),
                        train_df["text_b"].astype(str),
                        train_df["labels"].astype(float),
                    ))
            ]
        else:
            raise KeyError(
                'Training data processing - Required columns not found!')

        # format evaluation data
        if "text_a" in train_df.columns and "text_b" in train_df.columns and "labels" in eval_df.columns:
            if self.args.do_lower_case:
                eval_df.loc[:, 'text_a'] = eval_df['text_a'].str.lower()
                eval_df.loc[:, 'text_b'] = eval_df['text_b'].str.lower()

            evaluator = evaluation.EmbeddingSimilarityEvaluator(
                list(eval_df["text_a"]),
                list(eval_df["text_b"]),
                list(eval_df["labels"]),
                batch_size=self.args.eval_batch_size)
        else:
            raise KeyError(
                'Evaluation data processing - Required columns not found!')

        # Define train dataset, the dataloader and the train loss
        train_dataloader = DataLoader(train_examples,
                                      shuffle=True,
                                      batch_size=self.args.train_batch_size)
        train_loss = losses.CosineSimilarityLoss(self.model)

        # Tune the model
        self.model.fit(
            train_objectives=[(train_dataloader, train_loss)],
            epochs=self.args.num_train_epochs,
            warmup_steps=self.args.warmup_steps,
            optimizer_params={'lr': self.args.learning_rate},
            weight_decay=self.args.weight_decay,
            evaluator=evaluator,
            evaluation_steps=self.args.evaluate_during_training_steps,
            max_grad_norm=self.args.max_grad_norm,
            output_path=self.args.best_model_dir,
            show_progress_bar=self.args.show_progress_bar)
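A minimal sketch of the dataframe layout the docstring above requires; the rows are made-up toy data, not taken from the source.

import pandas as pd

train_df = pd.DataFrame({
    "text_a": ["A man plays a guitar.", "A dog runs in the park."],
    "text_b": ["Someone plays an instrument.", "A cat sleeps indoors."],
    "labels": [0.9, 0.1],  # similarity scores, cast to float by train()
})
eval_df = train_df.copy()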
Example #3
def get_loss(loss_type, model):
    if loss_type == 'BatchAllTripletLoss':
        return losses.BatchAllTripletLoss(model=model)

    if loss_type == 'BatchHardSoftMarginTripletLoss':
        return losses.BatchHardSoftMarginTripletLoss(model=model)

    if loss_type == 'BatchHardTripletLoss':
        return losses.BatchHardTripletLoss(model=model)

    if loss_type == 'BatchSemiHardTripletLoss':
        return losses.BatchSemiHardTripletLoss(model=model)

    if loss_type == 'ContrastiveLoss':
        return losses.ContrastiveLoss(model=model)

    if loss_type == 'CosineSimilarityLoss':
        return losses.CosineSimilarityLoss(model=model)

    if loss_type == 'MegaBatchMarginLoss':
        return losses.MegaBatchMarginLoss(model=model)

    if loss_type == 'MultipleNegativesRankingLoss':
        return losses.MultipleNegativesRankingLoss(model=model)

    if loss_type == 'OnlineContrastiveLoss':
        return losses.OnlineContrastiveLoss(model=model)

    raise ValueError('Invalid loss type')
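A minimal usage sketch for the loss factory above; the model name is illustrative and assumes the sentence-transformers package is installed.

from sentence_transformers import SentenceTransformer

model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
train_loss = get_loss('CosineSimilarityLoss', model)  # -> losses.CosineSimilarityLoss(model=model)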
Example #4
def create_posts_ranking(fl, data_dir, model, validate=None, is_test=False):
    train_posts_ranking = []
    disbn = []
    with open(os.path.join(data_dir, fl)) as f:
        data = json.load(f)
        for obj in data:
            answers = obj['answers']
            filtered_answers = []
            votes = 1000000
            for answer in answers:
                my_votes = answer['a_votes']
                if my_votes < votes:
                    votes = my_votes
                    filtered_answers.append(answer)

            if len(filtered_answers) > 1:
                rank = len(filtered_answers)
                for answer in filtered_answers:
                    dist = rank / len(filtered_answers)
                    disbn.append(answer['a_rank'])
                    rank = rank - 1
                    train_posts_ranking.append(
                        InputExample(texts=[obj['q_text'], answer['a_text']],
                                     label=dist))

    random.shuffle(train_posts_ranking)

    print("data size " + str(len(train_posts_ranking)))

    if is_test:
        return train_posts_ranking

    if max_size:
        train_posts_ranking = train_posts_ranking[:max_size]

    evaluator = None
    if posts_rank_str == validate:
        train_posts_ranking, dev_posts_ranking = train_test_split(
            train_posts_ranking, test_size=0.1)
        evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
            dev_posts_ranking, name='posts ranking')

    warmup_steps = math.ceil(
        len(train_posts_ranking) * num_epochs / batch_size *
        0.1)  # 10% of train data for warm-up

    train_data_posts_ranking = SentencesDataset(train_posts_ranking,
                                                model=model)
    train_dataloader_posts_ranking = DataLoader(train_data_posts_ranking,
                                                shuffle=True,
                                                batch_size=batch_size)
    train_loss_posts_ranking = losses.CosineSimilarityLoss(model=model)

    print('R: Number of training examples: ', len(train_posts_ranking))

    global evaluation_steps
    evaluation_steps = math.ceil(len(train_posts_ranking) / 0.1)

    return train_dataloader_posts_ranking, train_loss_posts_ranking, evaluator, warmup_steps
Example #5
def dev_config(sts_reader, model, batch_size):
	'''dev dataloader and model'''	
	logger.info(f"Read STSbenchmark dev dataset")
	dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model)
	dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size)
	dev_loss = losses.CosineSimilarityLoss(model=model)
	dev_evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)
	return dev_loss, dev_dataloader, dev_evaluator
Example #6
def train_config(sts_reader, model, batch_size):
	'''train dataloader and model.'''
	logger.info(f"Read STSbenchmark train dataset")
	train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model)
	train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
	train_loss = losses.CosineSimilarityLoss(model=model)
	train_evaluator = EmbeddingSimilarityEvaluator(train_dataloader)
	return train_data, train_loss, train_dataloader, train_evaluator
Example #7
def main():
    parser = set_parser()
    args = parser.parse_args()

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )

    logger.info("arguments are parsed")
    args.world_size = args.gpus * args.nodes

    patent_reader = PatentDataReader(args.data_dir, normalize_scores=True)
    # Use BERT for mapping tokens to embeddings
    logger.warning("Loading Bert Model")
    word_embedding_model = models.BERT('bert-base-uncased', max_seq_length=510)
    logger.warning("Model is loaded")
    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)

    if args.use_tpu:
        logger.warning("TPU training")
        device = xm.xla_device()
        args.n_gpu = 1
    elif args.local_rank == -1:
        logger.warning("Non dist training")
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        logger.warning("Dist training local rank %s", args.local_rank)
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(
            backend="nccl", timeout=datetime.timedelta(hours=10))
        args.n_gpu = 1
    args.device = device

    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    train_loss = losses.CosineSimilarityLoss(model=model)
    model.to(args.device)
    train_loss.to(args.device)

    # Training
    if args.do_train:
        logger.warning("Read Patent Training dataset")
        train_data = load_and_cache_examples(args, patent_reader, model)
        logger.warning("Training dataset is loaded")
        # train_data = SentencesDataset(patent_reader.get_examples('train.tsv', max_examples=17714), model)
        tr_loss = train(args, train_data, model, train_loss)
        logger.info(" average loss = %s", tr_loss)
Example #8
def fit_model(trial, train_fold, val_fold, fold_index):
    print("######################")
    print("start of fold_index:", fold_index)
    print("len(train_fold)", len(train_fold))
    print("len(val_fold)", len(val_fold))

    batch_size = trial.suggest_int("train_batch_size", 4, 50)
    num_epochs = trial.suggest_int("num_epochs", 1, 4)
    lr = trial.suggest_uniform("lr", 2e-6, 2e-4)
    eps = trial.suggest_uniform("eps", 1e-7, 1e-5)
    weight_decay = trial.suggest_uniform("weight_decay", 0.001, 0.1)
    warmup_steps_mul = trial.suggest_uniform("warmup_steps_mul", 0.1, 0.5)

    model = SentenceTransformer(model_name)

    # create train dataloader
    # train_sentece_dataset = SentencesDataset(train_fold, model=model) # this is deprecated
    train_dataloader = DataLoader(train_fold,
                                  shuffle=True,
                                  batch_size=batch_size)

    # define loss
    train_loss = losses.CosineSimilarityLoss(model=model)

    warmup_steps = math.ceil(
        len(train_fold) * num_epochs / batch_size * warmup_steps_mul)

    # Train the model
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        evaluator=None,
        epochs=num_epochs,
        warmup_steps=warmup_steps,
        optimizer_params={
            "lr": lr,
            "eps": eps,
            "correct_bias": False
        },
        weight_decay=weight_decay,
    )

    # evaluate the model
    val_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
        val_fold, name="val_set", main_similarity=SimilarityFunction.COSINE)
    result = val_evaluator(model)

    print("######################################################")
    print("test result:", result)
    print("######################################################")

    if math.isnan(result):
        result = 0.0

    return result
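A hedged sketch of how fit_model above could be driven by an Optuna study; train_folds and val_folds are assumed to be precomputed lists of InputExample folds, which the source does not show.

import optuna

def objective(trial):
    # average the evaluator score over all cross-validation folds
    scores = [
        fit_model(trial, train_fold, val_fold, fold_index)
        for fold_index, (train_fold, val_fold) in enumerate(zip(train_folds, val_folds))
    ]
    return sum(scores) / len(scores)

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)
print(study.best_params)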
Example #9
def create_hirerachy_examples(fl,
                              data_dir,
                              model,
                              validate=None,
                              is_test=False):
    train_hierarchy_samples = []
    disbn = []
    with open(os.path.join(data_dir, fl)) as f:
        data = json.load(f)
        max_distance = 0
        for obj in data:
            if obj['distance'] > max_distance:
                max_distance = obj['distance']
        for obj in data:
            # flip the meaning of similarity, since the more distant the two classes, the closer to 0 it should be
            dist = (max_distance - obj['distance']) / (max_distance - 1)
            train_hierarchy_samples.append(
                InputExample(texts=[obj['class1'], obj['class2']], label=dist))
            disbn.append(obj['distance'])
    random.shuffle(train_hierarchy_samples)
    train_hierarchy_samples = train_hierarchy_samples[:100000]
    disbn = disbn[:100000]

    if max_size:
        train_hierarchy_samples = train_hierarchy_samples[:max_size]
        disbn = disbn[:max_size]

    if is_test:
        return train_hierarchy_samples

    evaluator = None

    if hierarchy_str == validate:
        train_hierarchy_samples, dev_hierarchy_samples = train_test_split(
            train_hierarchy_samples, stratify=disbn, test_size=0.1)
        evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
            dev_hierarchy_samples, name='hierarchy')

    warmup_steps = math.ceil(
        len(train_hierarchy_samples) * num_epochs / batch_size *
        0.1)  # 10% of train data for warm-up

    train_data_hierarchy = SentencesDataset(train_hierarchy_samples,
                                            model=model)
    train_dataloader_hierarchy = DataLoader(train_data_hierarchy,
                                            shuffle=True,
                                            batch_size=batch_size)
    train_loss_hierarchy = losses.CosineSimilarityLoss(model=model)

    print('H: Number of training examples: ', len(train_hierarchy_samples))

    global evaluation_steps
    evaluation_steps = math.ceil(len(train_hierarchy_samples) / 0.1)
    return train_dataloader_hierarchy, train_loss_hierarchy, evaluator, warmup_steps
Example #10
def construct_model(base_model, encoder_style):
    # word_embedding_model = models.Transformer(base_model, max_seq_length=256)
    # pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
    # model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    if encoder_style == BIENCODER:
        model = SentenceTransformer(base_model)
        train_loss = losses.CosineSimilarityLoss(model)
    elif encoder_style == CROSSENCODER:
        model = CrossEncoder(base_model, num_labels=1, max_length=512)
        train_loss = None
    return model, train_loss
Example #11
def run():
    train_file = config.TRAINING_FILE
    train_batch = config.TRAIN_BATCH_SIZE
    vaild_batch = config.VALID_BATCH_SIZE
    model_path = config.BERT_PATH
    max_length = config.MAX_LEN
    dfs = pd.read_csv(train_file,
                      sep="\t",
                      names=['idx', 'sent1', 'sent2', 'label'])
    dfs['label'] = pd.to_numeric(dfs["label"], downcast='float')
    df_train, df_valid = model_selection.train_test_split(
        dfs,
        test_size=0.1,
        random_state=42,
        stratify=dfs.label.values,
    )

    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    dataset_reader = dataset.Dataset()

    train_dataset = dataset_reader.read(df_train, return_pt=True)
    valid_sentence1, valid_sentence2, valid_labels = dataset_reader.read(
        df_valid)

    train_dataloader = DataLoader(train_dataset,
                                  shuffle=True,
                                  batch_size=train_batch)
    # evaluator = evaluation.EmbeddingSimilarityEvaluator(valid_sentence1, valid_sentence2, valid_labels)
    evaluator = evaluation.BinaryClassificationEvaluator(
        valid_sentence1,
        valid_sentence2,
        valid_labels,
        batch_size=vaild_batch,
        show_progress_bar=False)

    word_embedding_model = models.Transformer(model_path,
                                              max_seq_length=max_length)
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension())
    dense_model = models.Dense(
        in_features=pooling_model.get_sentence_embedding_dimension(),
        out_features=max_length,
        activation_function=nn.Tanh())

    model = SentenceTransformer(
        modules=[word_embedding_model, pooling_model, dense_model])

    train_loss = losses.CosineSimilarityLoss(model)

    engine.train(train_dataloader, model, train_loss, evaluator)
Example #12
    def __init__(self):
        self.batch_size = 16
        self.reader = DataReader('')

        self.model_save_path1 = DATAPATH + 'model_dump' + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        self.model1 = SentenceTransformer('bert-base-nli-mean-tokens')

        self.model_save_path2 = DATAPATH + 'model_2_dump' + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        self.model2 = SentenceTransformer('bert-base-nli-mean-tokens')

        self.train_loss1 = losses.CosineSimilarityLoss(model=self.model1)
        self.train_loss2 = losses.CosineSimilarityLoss(model=self.model2)

        self.df1 = None
        self.train_data1 = None
        self.train_dataloader1 = None

        self.df2 = None
        self.train_data2 = None
        self.train_dataloader2 = None
Example #13
def nlptrain(premodel, ver, tr_data, te_data):
	
#### Just some code to print debug information to stdout
	logging.basicConfig(format='%(asctime)s - %(message)s',
                    	datefmt='%Y-%m-%d %H:%M:%S',
                    	level=logging.INFO,
                    	handlers=[LoggingHandler()])
#### /print debug information to stdout

# Read the dataset
	model_name = 'roberta-large-nli-stsb-mean-tokens'
	train_batch_size = 16
	num_epochs = 4
	model_save_path = ver
	sts_reader = STSDataReader('kt_datasets/kt_benchmark', normalize_scores=True)

# Load a pre-trained sentence transformer model
	model = SentenceTransformer(premodel)

# Convert the dataset to a DataLoader ready for training
	logging.info("")
	train_data = SentencesDataset(sts_reader.get_examples(tr_data), model)
	train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size)
	train_loss = losses.CosineSimilarityLoss(model=model)


	logging.info("")
	dev_data = SentencesDataset(examples=sts_reader.get_examples(te_data), model=model)
	dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size)
	evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)


# Configure the training. We skip evaluation in this example
	warmup_steps = math.ceil(len(train_data)*num_epochs/train_batch_size*0.1) #10% of train data for warm-up
	logging.info("Warmup-steps: {}".format(warmup_steps))


# Train the model
	model.fit(train_objectives=[(train_dataloader, train_loss)],
          	evaluator=evaluator,
          	epochs=num_epochs,
          	evaluation_steps=1000,
         	warmup_steps=warmup_steps,
          	output_path=model_save_path)

	list=['model saved in '+ ver+' directory']

	return(list)
Example #14
def fit_model(params, languages, train_data):
    print("######################")
    print("start of languages:", languages)

    batch_size = params["train_batch_size"]
    num_epochs = params["num_epochs"]
    lr = params["lr"]
    eps = params["eps"]
    weight_decay = params["weight_decay"]
    warmup_steps_mul = params["warmup_steps_mul"]

    model = SentenceTransformer(model_name)

    # create train dataloader
    # train_sentece_dataset = SentencesDataset(train_fold, model=model) # this is deprecated
    train_dataloader = DataLoader(train_data,
                                  shuffle=True,
                                  batch_size=batch_size)

    # define loss
    train_loss = losses.CosineSimilarityLoss(model=model)

    warmup_steps = math.ceil(
        len(train_data) * num_epochs / batch_size * warmup_steps_mul)

    output_path = os.path.join(
        base_output_path,
        f"cross-{languages[0]}-{languages[1]}-roberta-sentence-transformer",
    )

    # Train the model
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        evaluator=None,
        epochs=num_epochs,
        warmup_steps=warmup_steps,
        optimizer_params={
            "lr": lr,
            "eps": eps,
            "correct_bias": False
        },
        weight_decay=weight_decay,
    )

    # save model
    model.save(output_path)

    return output_path
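An illustrative call for fit_model above with a hand-picked parameter set; model_name, base_output_path and train_data are assumed to be defined globally in the original script.

params = {
    "train_batch_size": 16,
    "num_epochs": 2,
    "lr": 2e-5,
    "eps": 1e-6,
    "weight_decay": 0.01,
    "warmup_steps_mul": 0.1,
}
output_path = fit_model(params, ("en", "de"), train_data)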
Example #15
def create_train_usage(fl, data_dir, model, validate=None, is_test=False):
    train_usage = []
    with open(os.path.join(data_dir, fl)) as f:
        data = json.load(f)
        min_d = 10000000
        max_d = 0
        for obj in data:
            dist = obj['distance']
            if dist < min_d:
                min_d = dist
            if dist > max_d:
                max_d = dist
        for obj in data:
            dist = (max_d - obj['distance']) / (max_d - min_d)
            train_usage.append(
                InputExample(texts=[obj['class1'], obj['class2']], label=dist))

    random.shuffle(train_usage)

    if is_test:
        return train_usage

    if max_size:
        train_usage = train_usage[:max_size]

    evaluator = None

    if usage_str == validate:
        train_usage, dev_usage = train_test_split(train_usage, test_size=0.1)
        evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
            dev_usage, name='usage')
    warmup_steps = math.ceil(len(train_usage) * num_epochs / batch_size *
                             0.1)  # 10% of train data for warm-up

    train_data_usage = SentencesDataset(train_usage, model=model)
    train_dataloader_usage = DataLoader(train_data_usage,
                                        shuffle=True,
                                        batch_size=batch_size)
    train_loss_usage = losses.CosineSimilarityLoss(model=model)

    print('U: Number of training examples: ', len(train_usage))

    global evaluation_steps
    evaluation_steps = math.ceil(len(train_usage) / 0.1)

    return train_dataloader_usage, train_loss_usage, evaluator, warmup_steps
Example #16
    def test_train_stsb(self):
        word_embedding_model = models.Transformer('distilbert-base-uncased')
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension())
        model = SentenceTransformer(
            modules=[word_embedding_model, pooling_model])
        train_dataset = SentencesDataset(self.stsb_train_samples, model)
        train_dataloader = DataLoader(train_dataset,
                                      shuffle=True,
                                      batch_size=16)
        train_loss = losses.CosineSimilarityLoss(model=model)
        model.fit(train_objectives=[(train_dataloader, train_loss)],
                  evaluator=None,
                  epochs=1,
                  evaluation_steps=1000,
                  warmup_steps=int(len(train_dataloader) * 0.1),
                  use_amp=True)

        self.evaluate_stsb_test(model, 80.0)
Example #17
    def sentenceTransformers(self, tokens, preprocess_obj, batch_size,
                             num_epoch):

        model = SentenceTransformer('distilbert-base-nli-mean-tokens')
        word_embedding_model = model._first_module()
        train_dataloader = DataLoader(self.train_examples,
                                      shuffle=True,
                                      batch_size=batch_size)
        train_loss = losses.CosineSimilarityLoss(model)
        print(tokens)
        word_embedding_model.tokenizer.add_tokens(list(tokens),
                                                  special_tokens=True)
        word_embedding_model.auto_model.resize_token_embeddings(
            len(word_embedding_model.tokenizer))
        model.fit(train_objectives=[(train_dataloader, train_loss)],
                  epochs=num_epoch,
                  warmup_steps=100,
                  output_path=os.path.join(os.getcwd(),
                                           "bureau/models/" + "ST"))
Example #18
#################################################################################################

logging.info(
    "Step 3: Train bi-encoder: {} with STSbenchmark (gold + silver dataset)".
    format(model_name))

# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark gold and silver train dataset")
silver_samples = list(InputExample(texts=[data[0], data[1]], label=score) for \
    data, score in zip(silver_data, silver_scores))

train_dataset = SentencesDataset(gold_samples + silver_samples, bi_encoder)
train_dataloader = DataLoader(train_dataset,
                              shuffle=True,
                              batch_size=batch_size)
train_loss = losses.CosineSimilarityLoss(model=bi_encoder)

logging.info("Read STSbenchmark dev dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples,
                                                             name='sts-dev')

# Configure the training.
warmup_steps = math.ceil(len(train_dataset) * num_epochs / batch_size *
                         0.1)  #10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the bi-encoder model
bi_encoder.fit(train_objectives=[(train_dataloader, train_loss)],
               evaluator=evaluator,
               epochs=num_epochs,
               evaluation_steps=1000,
Example #19
def train_similarity_sentenceBERT(robot_id, version):
    """
    训练 意图识别 模型
    """
    max_seq_length = 24
    batch_size = 128
    labels = ["0", "1"]
    # After repeated tests with Dr. Lan, the bert-tiny variant trained abnormally and never learned,
    # even with several parameter combinations (training epochs, learning rate, batch size, etc.)
    # pretrain_name = "bert-tiny"
    # The HIT (roberta_wwm_ext) version does learn
    # pretrain_name = "roberta_wwm_ext_3"
    # In testing, the pretrained similarity model below (sentence-BERT structure) converges faster;
    # with so little test data, accuracy is always 100%, so that number is meaningless
    pretrain_name = "distiluse-base-multilingual-cased-v2"
    train_dir = "train_files"
    # Location of the initial pretrained weights
    pretrain_path = f"pretrained_models/{pretrain_name}"
    path = f"config_models/robot_{robot_id}_version_{version}.model"
    print("model_path")
    print(path)
    if os.path.exists(pretrain_path):
        _ = f"start train sentence_bert model, robot_id: {robot_id}, version:{version} "
        print(_), logging.info(_)
        c: Config = pickle.load(open(path, "rb"))
        temp_dir = f"{train_dir}/robot_{robot_id}_version_{version}_sentbert"
        if not os.path.exists(temp_dir):
            os.mkdir(temp_dir)
        examples_train, examples_dev = prepare_csv_data(c, temp_dir)
        # pretrain_path='/data4/azun/project_dialout/pretrained_models/distiluse-base-multilingual-cased-v2'
        print(pretrain_path)

        print("训练集")
        print(len(examples_train))
        print("测试集")
        print(len(examples_dev))
        if (len(examples_train) > 50000):
            examples_train = examples_train[:50000]
        if (len(examples_dev) > 5000):
            examples_dev = examples_dev[:4000]
        ####################### ####################### ####################### ####################### ####################### #######################
        ####################### ####################### ####################### ####################### ####################### #######################

        model = SentenceTransformer(pretrain_path)

        train_dataset = SentencesDataset(examples_train, model)
        train_dataloader = DataLoader(train_dataset,
                                      shuffle=True,
                                      batch_size=16)
        train_loss = losses.CosineSimilarityLoss(model)

        model.fit(train_objectives=[(train_dataloader, train_loss)],
                  epochs=1,
                  warmup_steps=100)
        model.save(
            f"config_models/robot_{robot_id}_version_{version}_similarity_sentbert"
        )
        print("模型保存成功,地址是:")
        print(
            f"config_models/robot_{robot_id}_version_{version}_similarity_sentbert"
        )

        ####################### ####################### ####################### ####################### ####################### #######################
        ####################### ####################### ####################### ####################### ####################### #######################
        result = {"train": 0.921, "dev": 0.932}
        # command = f"cp {pretrain_path}/bert_config.json config_models/robot_{robot_id}_version_{version}_similarity"
        # os.system(command)
        # command = f"cp {pretrain_path}/vocab.txt config_models/robot_{robot_id}_version_{version}_similarity"
        # os.system(command)
        # Upload the scores and update the database
        conn = pool.connection()  # from now on, obtain a database connection via connection() whenever one is needed
        cur = conn.cursor()
        try:
            similarity_result = json.dumps(result, ensure_ascii=False)
            sql_ = f"UPDATE {TABLE_NAME} SET SIMILARITY_RESULT='{similarity_result}',UPDATED_AT=NOW() " \
                   f"WHERE robot_id='{robot_id}' and version_id='{version}' and DELETE_FLAG=0 and CLUSTER='{CLUSTER}';"
            print(sql_)
            index = cur.execute(sql_)
            conn.commit()
        except Exception as e:
            print(repr(e))
            pass
        finally:
            cur.close()
            conn.close()
        ##### This updates the intent result; it exists only to stay compatible with the old backend and should be removed later
        conn = pool.connection()  # from now on, obtain a database connection via connection() whenever one is needed
        cur = conn.cursor()
        try:
            similarity_result = json.dumps(result, ensure_ascii=False)
            sql_ = f"UPDATE {TABLE_NAME} SET INTENT_RESULT='{similarity_result}',UPDATED_AT=NOW() " \
                   f"WHERE robot_id='{robot_id}' and version_id='{version}' and DELETE_FLAG=0 and CLUSTER='{CLUSTER}';"
            print(sql_)
            index = cur.execute(sql_)
            conn.commit()
        except Exception as e:
            print(repr(e))
            pass
        finally:
            cur.close()
            conn.close()
        print(result)
    else:
        _ = f"can not found, robot_id: {robot_id}, version:{version} "
        print(_), logging.info(_)
Example #20
def main():
    parser = argparse.ArgumentParser()

    # Input and output configs
    parser.add_argument("--task",
                        default=None,
                        type=str,
                        required=True,
                        help="the task to run bert ranker for")
    parser.add_argument("--data_folder",
                        default=None,
                        type=str,
                        required=True,
                        help="the folder containing data")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="the folder to output predictions")

    # #Training procedure
    parser.add_argument("--num_epochs",
                        default=5,
                        type=int,
                        required=False,
                        help="Number of epochs for training.")
    parser.add_argument("--train_batch_size",
                        default=8,
                        type=int,
                        required=False,
                        help="Training batch size.")
    # #Model hyperparameters
    parser.add_argument("--transformer_model",
                        default="bert-base-cased",
                        type=str,
                        required=False,
                        help="Bert model to use (default = bert-base-cased).")

    args = parser.parse_args()

    word_embedding_model = models.Transformer(args.transformer_model)

    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)

    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    logging.info("Creating train CRR dataset.")
    crr_reader = CRRBenchmarkDataReader('{}/{}'.format(args.data_folder,
                                                       args.task))

    train_data = SentencesDataset(crr_reader.get_examples("train.tsv"), model)
    train_dataloader = DataLoader(train_data,
                                  shuffle=True,
                                  batch_size=args.train_batch_size)
    train_loss = losses.CosineSimilarityLoss(model=model)

    logging.info("Creating dev CRR dataset.")
    dev_data = SentencesDataset(crr_reader.get_examples('valid.tsv'), model)
    dev_dataloader = DataLoader(dev_data,
                                shuffle=False,
                                batch_size=args.train_batch_size)
    evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

    warmup_steps = math.ceil(
        len(train_data) * args.num_epochs / args.train_batch_size *
        0.1)  #10% of train data for warm-up
    logging.info("Warmup-steps: {}".format(warmup_steps))

    logging.info("Fitting sentenceBERT")
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              epochs=args.num_epochs,
              evaluation_steps=1000,
              warmup_steps=warmup_steps,
              output_path=args.output_dir +
              "{}_{}".format(args.transformer_model, args.task))
Example #21
def main():
    parser = set_parser()
    args = parser.parse_args()

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )

    logger.info("arguments are parsed")
    args.world_size = args.gpus * args.nodes

    patent_reader = PatentDataReader(args.data_dir, normalize_scores=True)
    # Use BERT for mapping tokens to embeddings
    word_embedding_model = models.BERT('bert-base-cased', max_seq_length=510)

    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1:
        logger.warning("Non dist training")
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        logger.warning("Dist training local rank %s", args.local_rank)
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(
            backend="nccl", timeout=datetime.timedelta(hours=10))
        args.n_gpu = 1
    args.device = device

    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    train_loss = losses.CosineSimilarityLoss(model=model)
    model.to(args.device)
    train_loss.to(args.device)

    if args.fp16:
        try:
            import apex
            apex.amp.register_half_function(torch, "einsum")
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        logger.warning("Read Patent Training dataset")
        train_data = load_and_cache_examples(args, patent_reader, model)
        if args.eval_during_train:
            logging.info("Read STSbenchmark dev dataset")
            dev_data = load_and_cache_examples(args,
                                               patent_reader,
                                               model,
                                               evaluate=True)

        else:
            dev_data = None

        # train_data = SentencesDataset(patent_reader.get_examples('train.tsv', max_examples=17714), model)
        tr_loss = train(args,
                        train_data,
                        model,
                        train_loss,
                        dev_dataset=dev_data)
        logger.info(" average loss = %s", tr_loss)
Example #22
# print(word_embedding_model.fc1(x).size())

import math

from sentence_transformers import losses

from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# model_save_path = 'output/training_stsbenchmark_continue_training-'+model_name+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
# model_save_path = './models/clinical_bert/finetuned/output2'

train_dataloader = DataLoader(train_samples,
                              shuffle=True,
                              batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=word_embedding_model)

# Development set: Measure correlation between cosine score and gold labels
logging.info("Read SNLIbenchmark dev dataset")
# evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name='sts-dev')

# Configure the training. We skip evaluation in this example
warmup_steps = math.ceil(len(train_dataloader) * num_epochs *
                         0.1)  #10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the model
word_embedding_model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    #           evaluator=evaluator,
    epochs=num_epochs,
Example #23
def main(args):
    #### Just some code to print debug information to stdout
    logging.basicConfig(format='%(asctime)s - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        level=logging.INFO,
                        handlers=[LoggingHandler()])
    #### /print debug information to stdout

    #You can specify any huggingface/transformers pre-trained model here, for example, bert-base-uncased, roberta-base, xlm-roberta-base
    #model_name = sys.argv[1] if len(sys.argv) > 1 else 'bert-base-uncased'

    model_name = args.model_name
    if model_name is None:
        model_name = 'bert-base-chinese'

    # Read the dataset
    batch_size = args.batch_size

    model_output_dir = args.model_output_dir
    #model_save_path = os.path.join(model_output_dir, "bert-base", datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
    model_save_path = model_output_dir

    # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
    # word_embedding_model = models.Transformer(model_name)
    if args.init_model is None:
        word_embedding_model = models.Transformer(model_name)

        # Apply mean pooling to get one fixed sized sentence vector
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_cls_token=False,
            pooling_mode_max_tokens=False)

        model = SentenceTransformer(
            modules=[word_embedding_model, pooling_model])
    else:
        model = SentenceTransformer(args.init_model)

    if args.do_train == 1:
        # Convert the dataset to a DataLoader ready for training
        data_reader = SimTextDataReader()
        logging.info("train_data:%s" % (args.train_data))
        logging.info("cache_data:%s" % (args.cached_data))
        train_data_files = args.train_data.split('#')
        cached_data_file = args.cached_data
        logging.info("Read train dataset")
        if not os.path.isfile(cached_data_file):
            train_examples = []
            for train_file in train_data_files:
                if os.path.isfile(train_file):
                    logging.info("load train file:%s" % (train_file))
                    now_examples = data_reader.get_examples(train_file)
                    train_examples.extend(now_examples)

            train_data = SentencesDataset(train_examples, model=model)
            torch.save(train_data, args.cached_data)
        else:
            train_data = torch.load(cached_data_file)
            logging.info("Load cached dataset %s" % (cached_data_file))
        logging.info("Build train dataset")
        train_dataloader = DataLoader(train_data,
                                      shuffle=True,
                                      batch_size=batch_size)
        # train_loss = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=train_num_labels)
        train_loss = losses.CosineSimilarityLoss(model=model)

        logging.info("Read dev dataset")
        dev_data_files = args.dev_data.split('#')
        dev_examples = []
        for dev_file in dev_data_files:
            if os.path.isfile(dev_file):
                logging.info("load dev file:%s" % (dev_file))
                now_examples = data_reader.get_examples(dev_file)
                dev_examples.extend(now_examples)
        dev_data = SentencesDataset(examples=dev_examples, model=model)
        logging.info("Build dev dataset")
        dev_dataloader = DataLoader(dev_data,
                                    shuffle=False,
                                    batch_size=batch_size)
        evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

        # Configure the training
        num_epochs = args.num_epochs
        warmup_steps = math.ceil(
            len(train_dataloader) * num_epochs / batch_size *
            0.1)  #10% of train data for warm-up
        logging.info("Warmup-steps: {}".format(warmup_steps))

        logging.info("Start training")
        # Train the model
        model.fit(train_objectives=[(train_dataloader, train_loss)],
                  evaluator=evaluator,
                  epochs=num_epochs,
                  evaluation_steps=1000,
                  warmup_steps=warmup_steps,
                  output_path=model_save_path)

    if args.do_predict == 1:
        logging.info("Read predict dataset")
        pred_data_file = args.pred_data
        output_file = os.path.join(args.model_output_dir, "pred_res")
        text_pairs = load_pred_data(pred_data_file)
        with open(output_file, "w", encoding="utf-8") as fp:
            for tpair in text_pairs:
                embedding_pair = model.encode(tpair)
                cos_sim = cosine_similarity(embedding_pair[0],
                                            embedding_pair[1])
                fp.write("%s\t%s\t%f\n" % (tpair[0], tpair[1], cos_sim))
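A hedged alternative for the pairwise scoring step above, reusing the model built in this example and the util.pytorch_cos_sim helper that ships with sentence-transformers.

from sentence_transformers import util

emb = model.encode(["first sentence", "second sentence"], convert_to_tensor=True)
cos_sim = util.pytorch_cos_sim(emb[0], emb[1]).item()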
Example #24
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task.",
    )

    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help=
        "Path to pre-trained model or shortcut name selected in the list: ",
    )

    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written.",
    )

    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument(
        "--max_seq_length",
        default=510,
        type=int,
        help=
        "The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )

    parser.add_argument(
        "--per_gpu_train_batch_size",
        default=8,
        type=int,
        help="Batch size per GPU/CPU for training.",
    )
    parser.add_argument(
        "--per_gpu_eval_batch_size",
        default=8,
        type=int,
        help="Batch size per GPU/CPU for evaluation.",
    )
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument("--do_train",
                        action="store_true",
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action="store_true",
                        help="Whether to run eval on the dev set.")

    args = parser.parse_args()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )

    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
    )

    #TODO: Prepare Dataloader
    patent_reader = PatentDataReader(args.data_dir, normalize_scores=True)
    model = SentenceTransformer(args.model_name_or_path)
    test_data = SentencesDataset(examples=patent_reader.get_examples(
        "dev.tsv", max_examples=40),
                                 model=model)
    test_dataloader = DataLoader(test_data,
                                 shuffle=False,
                                 batch_size=args.per_gpu_train_batch_size)
    evaluator = EmbeddingSimilarityEvaluator(test_dataloader)
    model.evaluate(evaluator)

    # Convert the dataset to a DataLoader ready for training
    print("Read STSbenchmark train dataset")
    train_data = SentencesDataset(
        patent_reader.get_examples('train.tsv', max_examples=17714), model)
    train_dataloader = DataLoader(train_data,
                                  shuffle=True,
                                  batch_size=args.per_gpu_train_batch_size)
    train_loss = losses.CosineSimilarityLoss(model=model)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_data = SentencesDataset(
            patent_reader.get_examples('train.tsv', max_examples=17714), model)
        global_step, tr_loss = train(args, train_data, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)
Example #25
# Add two trainable feed-forward networks (DAN)
sent_embeddings_dimension = pooling_model.get_sentence_embedding_dimension()
dan1 = models.Dense(in_features=sent_embeddings_dimension,
                    out_features=sent_embeddings_dimension)
dan2 = models.Dense(in_features=sent_embeddings_dimension,
                    out_features=sent_embeddings_dimension)

model = SentenceTransformer(
    modules=[word_embedding_model, word_weights, pooling_model, dan1, dan2])

# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark train dataset")
train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'),
                              model=model)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

logging.info("Read STSbenchmark dev dataset")
dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'),
                            model=model)
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size)
evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

# Configure the training
num_epochs = 10
warmup_steps = math.ceil(len(train_data) * num_epochs / batch_size *
                         0.1)  #10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the model
model.fit(train_objectives=[(train_dataloader, train_loss)],
Example #26
def BertEM(path_train, path_valid, path_test, path_error, epochs_num,
           warmup_steps_num, evaluation_steps_num):
    # Instantiate the progress bar
    bar = progressbar
    # Define the model
    #model = SentenceTransformer('bert-large-nli-stsb-mean-tokens',device='cuda:1')
    model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens',
                                device='cuda:6')
    #model = SentenceTransformer('roberta-large-nli-stsb-mean-tokens',device='cuda:2')
    data_type = {"text_a": str, "text_b": str}
    train_data = pd.read_csv(path_train, encoding='utf-8', dtype=data_type)
    valid_data = pd.read_csv(path_valid, encoding='utf-8', dtype=data_type)
    test_data = pd.read_csv(path_test, encoding='utf-8', dtype=data_type)

    # Training set
    train_examples = []
    for i in bar.progressbar(range(len(train_data))):
        time.sleep(0.0001)
        text_a = train_data.iloc[i]['text_a']
        text_b = train_data.iloc[i]['text_b']
        text_a = str(text_a)
        text_b = str(text_b)
        label_data = train_data.iloc[i]['label']
        label_data = float(label_data)
        train_examples.append(
            InputExample(texts=[text_a, text_b], label=label_data))
    print(InputExample)

    # Validation set
    sentence_a = []
    sentence_b = []
    label_valid = []
    for i in bar.progressbar(range(len(valid_data))):
        time.sleep(0.0001)
        sentence1 = valid_data.iloc[i]['text_a']
        sentence2 = valid_data.iloc[i]['text_b']
        label_valid_t = valid_data.iloc[i]['label']
        label_valid_t = float(label_valid_t)
        sentence_a.append(sentence1)
        sentence_b.append(sentence2)
        label_valid.append(label_valid_t)
    # Define the evaluator
    #evaluator = evaluation.EmbeddingSimilarityEvaluator(sentence_a, sentence_b, label_valid)
    evaluator = evaluation.BinaryClassificationEvaluator(
        sentence_a, sentence_b, label_valid)
    # Define the dataset and the loss function
    train_dataset = SentencesDataset(train_examples, model)
    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=16)
    train_loss = losses.CosineSimilarityLoss(model)

    # Track elapsed time (time.clock() was removed in Python 3.8; perf_counter replaces it)
    start_time = time.perf_counter()
    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              epochs=epochs_num,
              warmup_steps=warmup_steps_num,
              evaluator=evaluator,
              evaluation_steps=evaluation_steps_num,
              use_amp=True)
    end_time = time.perf_counter()

    # ========================================= Evaluation =========================================
    # Read the test set and convert every text field to str
    test_data = pd.read_csv(path_test, encoding='utf-8')
    test_data['text_a'] = test_data['text_a'].map(lambda x: str(x))
    test_data['text_b'] = test_data['text_b'].map(lambda x: str(x))

    # Build a dict of prediction lists, one per threshold
    list_num = 40
    prefix = 'pred_list_'
    test_map = {prefix + str(i): [] for i in range(list_num)}
    label_list = []
    score = 0.20
    error_csv = pd.DataFrame(columns=('id', 'text_a', 'text_b', 'cos_scores'))
    # Run predictions over the test set
    for i in bar.progressbar(range(len(test_data))):
        time.sleep(0.0001)
        text_a_embedding = model.encode(test_data.iloc[i]['text_a'],
                                        convert_to_tensor=True)
        text_b_embedding = model.encode(test_data.iloc[i]['text_b'],
                                        convert_to_tensor=True)
        cos_scores = util.pytorch_cos_sim(text_a_embedding,
                                          text_b_embedding)[0]
        cos_scores = cos_scores.cpu()
        # Gold label list
        label = test_data.iloc[i]['label']
        label_list.append(int(label))
        # Record misclassified examples
        if cos_scores >= 0.80:
            pred_test = 1
        else:
            pred_test = 0
        if pred_test != label:
            error_text_a = test_data.iloc[i]['text_a']
            error_text_b = test_data.iloc[i]['text_b']
            error_cos_scores = cos_scores
            # DataFrame.append was removed in pandas 2.0; pd.concat is the drop-in replacement
            error_csv = pd.concat([
                error_csv,
                pd.DataFrame({
                    'id': [i],
                    'text_a': [error_text_a],
                    'text_b': [error_text_b],
                    'cos_scores': [error_cos_scores]
                })
            ], ignore_index=True)
        # Update the prediction lists for every threshold
        compute_pred(score, cos_scores, prefix, test_map)

    error_csv.to_csv(path_error, index=0)
    max_f1 = 0
    target_threshold = 0.01
    target_accuracy = 0.01
    target_recall = 0.01
    threshold = 0.20
    # Print the score for every threshold
    for i in range(len(test_map.keys())):
        # Compute the scores for this threshold
        accuracy, recall, f1 = compute_score(test_map[prefix + str(i)],
                                             label_list)
        if f1 >= max_f1:
            max_f1 = f1
            target_threshold = threshold
            target_accuracy = accuracy
            target_recall = recall
        print('The score > {} result is accuracy: {}, | recall:{}, | f1: {}'.
              format(round(threshold, 2), accuracy, recall, f1))
        threshold += 0.02
    # Print the final results
    print('================dataset_name==================', path_a)
    print(
        '================threshold:{}, target_accuracy:{}, target_recall:{}, max_f1:{}'
        .format(target_threshold, target_accuracy, target_recall, max_f1))
    print('================train_time:{}'.format(str(end_time - start_time)))
Example #27
def main(args):
    #### Just some code to print debug information to stdout
    logging.basicConfig(format='%(asctime)s - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        level=logging.INFO,
                        handlers=[LoggingHandler()])
    #### /print debug information to stdout
    # Read the dataset
    train_batch_size = 64
    num_epochs = 1000

    if args.pretrained:
        model = SentenceTransformer(args.pretrained)
        model_save_path = os.path.join(
            args.save_path,
            args.pretrained.split("/")[-1] + '-' +
            datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
    else:
        #You can specify any huggingface/transformers pre-trained model here, for example, bert-base-uncased, roberta-base, xlm-roberta-base
        model_name = 'cl-tohoku/bert-base-japanese-char-whole-word-masking'
        model_save_path = os.path.join(
            args.save_path,
            model_name.replace("/", "-") + '-' +
            datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
        # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
        word_embedding_model = models.Transformer(model_name)

        # Apply mean pooling to get one fixed sized sentence vector
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_cls_token=False,
            pooling_mode_max_tokens=False)

        model = SentenceTransformer(
            modules=[word_embedding_model, pooling_model])

    # Convert the dataset to a DataLoader ready for training
    logging.info("Read custom train dataset")

    train_samples = []
    val_samples = []
    inp_list = []
    dataset_path = args.data_path
    with gzip.open(dataset_path, 'rt', encoding='utf8') as fIn:
        reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            score = float(
                row['score']) / 10  # Normalize score to range 0 ... 1
            inp_list.append(
                InputExample(texts=[row['sentence1'], row['sentence2']],
                             label=score))

    from sklearn.model_selection import train_test_split
    train_samples, val_samples = train_test_split(inp_list, test_size=0.2)

    train_dataset = SentencesDataset(train_samples, model)
    train_dataloader = DataLoader(train_dataset,
                                  shuffle=True,
                                  batch_size=train_batch_size)
    train_loss = losses.CosineSimilarityLoss(model=model)

    logging.info("Read custom dev dataset")
    # evaluator = EmbeddingSimilarityEvaluator.from_input_examples(val_samples, name='sts-dev')
    evaluator = EmbeddingSimilarityEvaluator.from_input_examples(val_samples)

    # Configure the training
    warmup_steps = math.ceil(
        len(train_dataset) * num_epochs / train_batch_size *
        0.1)  # 10% of the total training steps for warm-up
    logging.info("Warmup-steps: {}".format(warmup_steps))

    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              epochs=num_epochs,
              evaluation_steps=1000,
              warmup_steps=warmup_steps,
              output_path=model_save_path)
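
Once model.fit finishes, the fine-tuned model written to model_save_path can be reloaded for inference. A minimal usage sketch, assuming a sentence-transformers version that provides util.cos_sim (the sentence pair is purely illustrative):

from sentence_transformers import SentenceTransformer, util

# Reload the fine-tuned model from the output_path used in model.fit above
trained_model = SentenceTransformer(model_save_path)

# Encode an illustrative sentence pair and score it with cosine similarity
embeddings = trained_model.encode(['今日は天気が良い', '本日は晴天です'],
                                  convert_to_tensor=True)
print(float(util.cos_sim(embeddings[0], embeddings[1])))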
예제 #28
0
def train(model_name_or_path: str,
          hf_dataset: str,
          aspect: str,
          fold: Union[int, str],
          output_path: str,
          train_epochs: int = 3,
          train_batch_size: int = 25,
          eval_batch_size: int = 32,
          evaluation_steps: int = 5000,
          train_on_test: bool = False,
          loss: str = 'multiple_negatives_ranking',
          override: bool = False):
    """

    # $MODEL_NAME $HF_DATASET $ASPECT $FOLD $OUTPUT_DIR --train_epochs=3 --train_batch_size=$TRAIN_BATCH_SIZE --eval_batch_size=$EVAL_BATCH_SIZE

    Run with:
    $ export CUDA_VISIBLE_DEVICES=1
    $ ./sentence_transformer_cli.py train scibert-scivocab-uncased paperswithcode_task_docs 1 ./output/st_scibert/1 --train_epochs=3 --train_batch_size=25 --eval_batch_size=32


    :param loss: Training loss function (choices: multiple_negatives_ranking, cosine)
    :param train_on_test: If True, joint training on train and test set (validation disabled)
    :param aspect: Dataset aspect used to select the train/test splits
    :param evaluation_steps: Run the evaluator every N training steps
    :param train_epochs: Number of training epochs
    :param model_name_or_path: Hugging Face model name or local path (also resolved against env['bert_dir'])
    :param hf_dataset: Name of the local Hugging Face dataset to load
    :param fold: Cross-validation fold used to select the train/test splits
    :param output_path: Directory where the trained model is written
    :param train_batch_size: Training batch size
    :param eval_batch_size: Evaluation batch size
    :param override: If True, train even if output_path already exists
    :return:
    """

    top_ks = [5, 10, 25, 50]
    # cuda_device = -1

    # hf_dataset = 'paperswithcode_task_docs'
    # model_name_or_path = 'scibert-scivocab-uncased'
    # fold = 1
    max_token_length = 336  # see pwc_token_stats.ipynb
    nlp_cache_dir = './data/nlp_cache'

    # train_batch_size = 25
    # eval_batch_size = 32
    # override = False

    # output_path = './output/pwc_task_st/1/sci-bert'
    # output_path = os.path.join(output_path, str(fold), model_name_or_path)  # output/1/sci-bert

    if os.path.exists(output_path) and not override:
        logger.error(f'Stop. Output path exists already: {output_path}')
        sys.exit(1)

    # if cuda_device >= 0:
    #     os.environ["CUDA_VISIBLE_DEVICES"] = str(cuda_device)

    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Model path from env
    if not os.path.exists(model_name_or_path) and os.path.exists(
            os.path.join(env['bert_dir'], model_name_or_path)):
        model_name_or_path = os.path.join(env['bert_dir'], model_name_or_path)

    word_embedding_model = Transformer(model_name_or_path,
                                       max_seq_length=max_token_length)
    pooling_model = Pooling(
        word_embedding_model.get_word_embedding_dimension())

    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    # tokenizer = BertTokenizer.from_pretrained(model_name_or_path)

    # dataset
    docs_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='docs',
                           cache_dir=nlp_cache_dir,
                           split='docs')
    train_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                            name='relations',
                            cache_dir=nlp_cache_dir,
                            split=get_train_split(aspect, fold))
    test_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='relations',
                           cache_dir=nlp_cache_dir,
                           split=get_test_split(aspect, fold))

    # filter for positive labels only
    train_ds = train_ds.filter(lambda row: row['label'] == 'y')

    logger.info(f'After filtering: {len(train_ds):,}')

    # joint training on train and test?
    if train_on_test:
        #
        # import pyarrow
        # from datasets.arrow_dataset import Dataset
        #
        # full_ds_table = pyarrow.concat_tables([train_ds.data, test_ds.data])
        # full_ds = Dataset(arrow_table=full_ds_table)
        raise NotImplementedError('TODO Evaluator')
    else:
        # standard training on the train split only (evaluation uses the test split)
        train_sds = DocumentPairSentencesDataset(docs_ds,
                                                 train_ds,
                                                 model,
                                                 max_length=max_token_length,
                                                 forced_length=0)
        train_sds.tokenize_all_docs()

        evaluator = NearestNeighborsEvaluator(model,
                                              docs_ds,
                                              test_ds,
                                              top_ks=top_ks,
                                              batch_size=eval_batch_size,
                                              show_progress_bar=True)

    if loss == 'cosine':
        train_loss = losses.CosineSimilarityLoss(model)
    elif loss == 'multiple_negatives_ranking':
        # A nice advantage of MultipleNegativesRankingLoss is that it only requires positive pairs
        # https://github.com/UKPLab/sentence-transformers/tree/master/examples/training/quora_duplicate_questions
        train_loss = losses.MultipleNegativesRankingLoss(model)
    else:
        raise ValueError(f'Unsupported loss function: {loss}')

    train_dl = DataLoader(train_sds, shuffle=True, batch_size=train_batch_size)

    # Training
    model.fit(
        train_objectives=[(train_dl, train_loss)],
        epochs=train_epochs,  # try 1-4
        warmup_steps=100,
        evaluator=evaluator,
        evaluation_steps=evaluation_steps,  # increase to 5000 (full dataset => 20k steps)
        output_path=output_path,
        output_path_ignore_not_empty=True)

    logger.info('Training done')
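
The comment on MultipleNegativesRankingLoss above is worth spelling out: every other example in the batch serves as an in-batch negative, so the training data only needs (anchor, positive) pairs and no labels. A self-contained sketch with made-up pairs (model name, sentences, and batch size are illustrative, not from the original dataset):

from torch.utils.data import DataLoader
from sentence_transformers import InputExample, SentenceTransformer, losses

model = SentenceTransformer('distilbert-base-nli-mean-tokens')

# Only positive pairs are required; other pairs in the batch act as negatives
train_examples = [
    InputExample(texts=['A man is eating food.', 'A man eats something.']),
    InputExample(texts=['A plane is taking off.', 'An airplane departs.']),
    InputExample(texts=['A woman plays the violin.', 'Someone performs music.']),
]

train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=2)
train_loss = losses.MultipleNegativesRankingLoss(model)

model.fit(train_objectives=[(train_dataloader, train_loss)],
          epochs=1,
          warmup_steps=10)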