예제 #1
0
def get_binary_experimental_setup():
    """Build the training examples and the binary-classification evaluator.

    Pulls item-level and domain-level sentence pairs via ``extract_examples``,
    merges the two sources, wraps the training pairs as ``InputExample``
    objects, and constructs a ``BinaryClassificationEvaluator`` from the
    validation pairs.

    :return: tuple of (list of InputExample, BinaryClassificationEvaluator)
    """
    # Fetch (sent1, sent2, label) triples for both example sources.
    train_items, valid_items = extract_examples("items")
    train_domains, valid_domains = extract_examples("domains")

    # Merge both sources into one pool per split.
    train_examples = train_items + train_domains
    valid_examples = valid_items + valid_domains

    print(
        f"{len(train_examples)} training examples to {len(valid_examples)} valid examples"
    )

    # Wrap the raw training triples in the format the trainer expects.
    train_examples = [
        InputExample(texts=[first, second], label=gold)
        for first, second, gold in train_examples
    ]

    # zip(*...) transposes the validation triples into three parallel
    # sequences (sentences1, sentences2, labels) for the evaluator.
    evaluator = evaluation.BinaryClassificationEvaluator(
        *zip(*valid_examples), batch_size=128)

    return train_examples, evaluator
def run():
    """Train a SentenceTransformer bi-encoder on the configured TSV data."""
    # Pull hyper-parameters from the shared config module.
    training_path = config.TRAINING_FILE
    batch_size_train = config.TRAIN_BATCH_SIZE
    batch_size_valid = config.VALID_BATCH_SIZE
    pretrained_path = config.BERT_PATH
    seq_length = config.MAX_LEN

    frame = pd.read_csv(training_path,
                        sep="\t",
                        names=['idx', 'sent1', 'sent2', 'label'])
    frame['label'] = pd.to_numeric(frame["label"], downcast='float')

    # Stratified 90/10 split keeps the label distribution in both parts.
    df_train, df_valid = model_selection.train_test_split(
        frame,
        test_size=0.1,
        random_state=42,
        stratify=frame.label.values,
    )
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    reader = dataset.Dataset()
    train_dataset = reader.read(df_train, return_pt=True)
    valid_sentence1, valid_sentence2, valid_labels = reader.read(df_valid)

    train_dataloader = DataLoader(train_dataset,
                                  shuffle=True,
                                  batch_size=batch_size_train)

    # Validation is scored as a binary duplicate / non-duplicate decision.
    evaluator = evaluation.BinaryClassificationEvaluator(
        valid_sentence1,
        valid_sentence2,
        valid_labels,
        batch_size=batch_size_valid,
        show_progress_bar=False)

    # Model pipeline: transformer -> pooling -> dense projection with tanh.
    word_embedding_model = models.Transformer(pretrained_path,
                                              max_seq_length=seq_length)
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension())
    dense_model = models.Dense(
        in_features=pooling_model.get_sentence_embedding_dimension(),
        out_features=seq_length,
        activation_function=nn.Tanh())
    model = SentenceTransformer(
        modules=[word_embedding_model, pooling_model, dense_model])

    train_loss = losses.CosineSimilarityLoss(model)
    engine.train(train_dataloader, model, train_loss, evaluator)
예제 #3
0
def run():
    """Evaluate a saved SentenceTransformer on the configured test TSV."""
    path_to_test = config.TEST_FILE
    eval_batch = config.TEST_BATCH_SIZE
    saved_model_dir = config.MODEL_SAVE_PATH

    frame = pd.read_csv(path_to_test,
                        sep='\t',
                        names=['idx', 'sent1', 'sent2', 'label'])
    frame['label'] = pd.to_numeric(frame['label'], downcast='float')

    # Split the frame into the three parallel columns the evaluator needs.
    reader = dataset.Dataset()
    test_sent1, test_sent2, test_labels = reader.read(frame)

    evaluator = evaluation.BinaryClassificationEvaluator(
        test_sent1,
        test_sent2,
        test_labels,
        batch_size=eval_batch,
        show_progress_bar=True)

    # Load the fine-tuned model from disk and score it on the test pairs.
    model = SentenceTransformer(saved_model_dir)
    model.evaluate(evaluator)
예제 #4
0
###### Classification ######
# Given (question1, question2): is the pair a duplicate or not?  The
# evaluator embeds both questions, computes a cosine similarity, and
# predicts "duplicate" when the similarity exceeds a threshold.
dev_sentences1 = []
dev_sentences2 = []
dev_labels = []
with open(os.path.join(dataset_path, "classification/dev_pairs.tsv"),
          encoding='utf8') as pairs_file:
    for record in csv.DictReader(pairs_file,
                                 delimiter='\t',
                                 quoting=csv.QUOTE_NONE):
        dev_sentences1.append(record['question1'])
        dev_sentences2.append(record['question2'])
        dev_labels.append(int(record['is_duplicate']))

binary_acc_evaluator = evaluation.BinaryClassificationEvaluator(
    dev_sentences1, dev_sentences2, dev_labels)
evaluators.append(binary_acc_evaluator)

###### Duplicate Questions Mining ######
# Given a large corpus of questions, identify all duplicates in that corpus.

# For faster processing, we limit the development corpus to only 10,000 sentences.
max_dev_samples = 10000
dev_sentences = {}
dev_duplicates = []
with open(os.path.join(dataset_path, "duplicate-mining/dev_corpus.tsv"),
          encoding='utf8') as corpus_file:
    for record in csv.DictReader(corpus_file,
                                 delimiter='\t',
                                 quoting=csv.QUOTE_NONE):
        dev_sentences[record['qid']] = record['question']
예제 #5
0
def main():
    """CLI entry point: fine-tune an SBERT bi-encoder on the chosen dataset."""
    parser = argparse.ArgumentParser(description='Start training with SBERT')
    parser.add_argument('--model_path',
                        type=str,
                        help='Path to trained model folder ./models/[MODEL_NAME]')
    parser.add_argument('--dataset',
                        type=str,
                        default='few_rel',
                        help='Name dataset')
    parser.add_argument('--mask_method',
                        type=str,
                        default='bracket',
                        help='Type of masking')
    parser.add_argument('--num_epochs',
                        type=int,
                        default=15,
                        help='Number epochs')
    parser.add_argument('--num_samples',
                        type=int,
                        default=-1,
                        help='Number of samples for test run, default -1 means all data')
    parser.add_argument('--max_seq_length',
                        type=int,
                        default=256,
                        help='Max token length for BERT')
    args = parser.parse_args()

    model_path = args.model_path
    dataset = args.dataset
    mask_method = args.mask_method
    num_samples = args.num_samples
    max_seq_length = args.max_seq_length
    num_epochs = args.num_epochs

    # Fixed training-loop settings.
    evaluation_steps = 1000  # how often the evaluator runs during training
    warmup_steps = 1000      # linear LR warm-up steps
    sentence_out_embedding_dimension = 256

    # The model name is the last path component (trailing slash stripped).
    if model_path.endswith('/'):
        model_path = model_path[:-1]
    model_name = model_path.split('/')[-1]

    path_train_data = f'./data/train_samples/{dataset}_train_{mask_method}_train.csv'
    path_eval_data = f'./data/train_samples/{dataset}_val_{mask_method}_test.csv'
    if num_samples > 0:
        model_save_path = f'./trained_models/{model_name}_sbert_bi_{dataset}_test/'
    else:
        model_save_path = f'./trained_models/{model_name}_sbert_bi_{dataset}/'

    ### Define the model: transformer -> pooling -> dense projection.
    word_embedding_model = models.Transformer(model_path,
                                              max_seq_length=max_seq_length)
    # Special tokens (e.g. [DOC]/[QRY]/entity markers) were already added to
    # the base model beforehand, so no tokenizer resizing is needed here.
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension())
    dense_model = models.Dense(
        in_features=pooling_model.get_sentence_embedding_dimension(),
        out_features=sentence_out_embedding_dimension,
        activation_function=nn.Tanh())
    model = SentenceTransformer(
        modules=[word_embedding_model, pooling_model, dense_model])

    # Prep DataLoader.
    train_examples = load_train_sbert(path_train_data, num_samples)
    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)

    # Prep evaluator and loss.
    sentences1, sentences2, scores = load_eval_sbert(path_eval_data,
                                                     num_samples)
    evaluator = evaluation.BinaryClassificationEvaluator(
        sentences1, sentences2, scores)
    train_loss = losses.SoftmaxLoss(
        model,
        sentence_embedding_dimension=sentence_out_embedding_dimension,
        num_labels=2)

    # Tune the model.
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              epochs=num_epochs,
              evaluation_steps=evaluation_steps,
              warmup_steps=warmup_steps,
              output_path=model_save_path)
        # NOTE(review): this snippet is truncated by the example scraper — the
        # line below is the tail of a `models.Pooling(...)` call whose opening
        # line (and the snippet's separator header) is missing from view.
        word_embedding_model.get_word_embedding_dimension())

    # Transformer + mean-pooling pipeline (no dense head in this example).
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    print("SentenceTransformer model created")

    train_loss = losses.MultipleNegativesRankingLoss(model)

    # Set up a set of different performance evaluators.

    evaluators = []

    ###### Classification ######
    # Given (question1, question2): is the pair a duplicate or not?  The
    # evaluator embeds both sequences, computes a cosine similarity, and
    # predicts "duplicate" when the similarity exceeds a threshold.
    binary_acc_evaluator = evaluation.BinaryClassificationEvaluator(
        dev_seqs1, dev_seqs2, dev_labels)
    evaluators.append(binary_acc_evaluator)

    # Same evaluator over the training pairs, to compare train vs. dev accuracy.
    binary_acc_evaluator = evaluation.BinaryClassificationEvaluator(
        train_seqs1, train_seqs2, train_labels)
    evaluators.append(binary_acc_evaluator)

    logging.info("binary acc evaluator added")
    dev_seq_dict = {}
    dev_duplicates = []

    # Create a dict of id -> sequence.
    # NOTE(review): only the first 5 pairs are mapped here; the commented-out
    # range over the full dev set suggests this was shortened for a test run.
    #for i in range(len(dev_seqs1)):
    for i in range(0, 5):
        dev_seq_dict[dev_ids1[i]] = dev_seqs1[i]
        dev_seq_dict[dev_ids2[i]] = dev_seqs2[i]
예제 #7
0
def BertEM(path_train, path_valid, path_test, path_error, epochs_num,
           warmup_steps_num, evaluation_steps_num):
    """Fine-tune a SentenceTransformer for entity matching and sweep thresholds.

    Trains on ``path_train`` with cosine-similarity loss, validates with a
    binary-classification evaluator on ``path_valid``, then scores every test
    pair in ``path_test`` by embedding cosine similarity.  Pairs misclassified
    at a fixed 0.80 cutoff are written to ``path_error``, and a sweep over 40
    decision thresholds (0.20 upward in steps of 0.02) is printed, reporting
    the threshold achieving the best F1.

    :param path_train: CSV with columns text_a, text_b, label for training
    :param path_valid: CSV with the same columns for validation
    :param path_test: CSV with the same columns for final evaluation
    :param path_error: output CSV path for misclassified test pairs
    :param epochs_num: number of training epochs
    :param warmup_steps_num: LR warm-up steps passed to model.fit
    :param evaluation_steps_num: run the evaluator every this many steps
    """
    # Progress-bar helper module.
    bar = progressbar
    # Base model; alternatives are kept commented for reference.
    #model = SentenceTransformer('bert-large-nli-stsb-mean-tokens',device='cuda:1')
    model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens',
                                device='cuda:6')
    #model = SentenceTransformer('roberta-large-nli-stsb-mean-tokens',device='cuda:2')
    data_type = {"text_a": str, "text_b": str}
    train_data = pd.read_csv(path_train, encoding='utf-8', dtype=data_type)
    valid_data = pd.read_csv(path_valid, encoding='utf-8', dtype=data_type)
    test_data = pd.read_csv(path_test, encoding='utf-8', dtype=data_type)

    # Build the training examples.
    train_examples = []
    for i in bar.progressbar(range(len(train_data))):
        time.sleep(0.0001)  # lets the progress bar render smoothly
        text_a = str(train_data.iloc[i]['text_a'])
        text_b = str(train_data.iloc[i]['text_b'])
        label_data = float(train_data.iloc[i]['label'])
        train_examples.append(
            InputExample(texts=[text_a, text_b], label=label_data))

    # Build the validation triples.
    sentence_a = []
    sentence_b = []
    label_valid = []
    for i in bar.progressbar(range(len(valid_data))):
        time.sleep(0.0001)
        sentence_a.append(valid_data.iloc[i]['text_a'])
        sentence_b.append(valid_data.iloc[i]['text_b'])
        label_valid.append(float(valid_data.iloc[i]['label']))

    # Validation evaluator (binary duplicate / non-duplicate decision).
    evaluator = evaluation.BinaryClassificationEvaluator(
        sentence_a, sentence_b, label_valid)
    # Dataset, dataloader and loss function.
    train_dataset = SentencesDataset(train_examples, model)
    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=16)
    train_loss = losses.CosineSimilarityLoss(model)

    # time.clock() was removed in Python 3.8; perf_counter() is the
    # recommended replacement for timing the training run.
    start_time = time.perf_counter()
    # Train the model.
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              epochs=epochs_num,
              warmup_steps=warmup_steps_num,
              evaluator=evaluator,
              evaluation_steps=evaluation_steps_num,
              use_amp=True)
    end_time = time.perf_counter()

    #========================= evaluation phase =========================
    # Re-read the test set and force both text columns to str.
    test_data = pd.read_csv(path_test, encoding='utf-8')
    test_data['text_a'] = test_data['text_a'].map(lambda x: str(x))
    test_data['text_b'] = test_data['text_b'].map(lambda x: str(x))

    # One prediction list per candidate threshold (0.20, 0.22, ..., 0.98).
    list_num = 40
    prefix = 'pred_list_'
    test_map = {prefix + str(i): [] for i in range(list_num)}
    label_list = []
    score = 0.20
    error_csv = pd.DataFrame(columns=('id', 'text_a', 'text_b', 'cos_scores'))
    # Score every test pair.
    for i in bar.progressbar(range(len(test_data))):
        time.sleep(0.0001)
        text_a_embedding = model.encode(test_data.iloc[i]['text_a'],
                                        convert_to_tensor=True)
        text_b_embedding = model.encode(test_data.iloc[i]['text_b'],
                                        convert_to_tensor=True)
        cos_scores = util.pytorch_cos_sim(text_a_embedding,
                                          text_b_embedding)[0]
        cos_scores = cos_scores.cpu()
        label = test_data.iloc[i]['label']
        label_list.append(int(label))
        # Record pairs misclassified at the fixed 0.80 cutoff.
        pred_test = 1 if cos_scores >= 0.80 else 0
        if pred_test != label:
            # DataFrame.append was removed in pandas 2.0; use concat instead.
            error_csv = pd.concat([
                error_csv,
                pd.DataFrame({
                    'id': [i],
                    'text_a': [test_data.iloc[i]['text_a']],
                    'text_b': [test_data.iloc[i]['text_b']],
                    'cos_scores': [cos_scores]
                })
            ],
                                  ignore_index=True)
        # Append this pair's prediction to every threshold's list.
        compute_pred(score, cos_scores, prefix, test_map)

    error_csv.to_csv(path_error, index=False)

    # Sweep the thresholds and keep the best-F1 configuration.
    max_f1 = 0
    target_threshold = 0.01
    target_accuracy = 0.01
    target_recall = 0.01
    threshold = 0.20
    for i in range(len(test_map.keys())):
        accuracy, recall, f1 = compute_score(test_map[prefix + str(i)],
                                             label_list)
        if f1 >= max_f1:
            max_f1 = f1
            target_threshold = threshold
            target_accuracy = accuracy
            target_recall = recall
        print('The score > {} result is accuracy: {}, | recall:{}, | f1: {}'.
              format(round(threshold, 2), accuracy, recall, f1))
        threshold += 0.02
    # `path_a` was undefined here (NameError); report the test-set path.
    print('================dataset_name==================', path_test)
    print(
        '================threshold:{}, target_accuracy:{}, target_recall:{}, max_f1:{}'
        .format(target_threshold, target_accuracy, target_recall, max_f1))
    print('================train_time:{}'.format(str(end_time - start_time)))
예제 #8
0
    def train(self, train_df, eval_df):
        """Fine-tune the sentence-transformer model and pick a decision threshold.

        :param train_df: dataframe with columns 'text_a', 'text_b', 'labels'
        :param eval_df: dataframe with columns 'text_a', 'text_b', 'labels'
        :return: the threshold taken from the best-scoring evaluation row
        :raises KeyError: if either dataframe lacks a required column
        """

        # format training data
        if "text_a" in train_df.columns and "text_b" in train_df.columns and "labels" in train_df.columns:
            if self.args.do_lower_case:
                train_df.loc[:, 'text_a'] = train_df['text_a'].str.lower()
                train_df.loc[:, 'text_b'] = train_df['text_b'].str.lower()

            train_examples = [
                InputExample(str(i), texts=[text_a, text_b], label=label)
                for i, (text_a, text_b, label) in enumerate(
                    zip(
                        train_df["text_a"].astype(str),
                        train_df["text_b"].astype(str),
                        train_df["labels"].astype(int),
                    ))
            ]
        else:
            raise KeyError(
                'Training data processing - Required columns not found!')

        # format evaluation data
        # BUG FIX: the original checked train_df's columns for 'text_a' and
        # 'text_b' here, so a malformed eval_df slipped past the guard and
        # failed later with an unrelated error; validate eval_df itself.
        if "text_a" in eval_df.columns and "text_b" in eval_df.columns and "labels" in eval_df.columns:
            if self.args.do_lower_case:
                eval_df.loc[:, 'text_a'] = eval_df['text_a'].str.lower()
                eval_df.loc[:, 'text_b'] = eval_df['text_b'].str.lower()

            evaluator = evaluation.BinaryClassificationEvaluator(
                list(eval_df["text_a"]),
                list(eval_df["text_b"]),
                list(eval_df["labels"].astype(int)),
                batch_size=self.args.eval_batch_size)
        else:
            raise KeyError(
                'Evaluation data processing - Required columns not found!')

        # Define train dataset, the dataloader and the train loss
        train_dataloader = DataLoader(train_examples,
                                      shuffle=True,
                                      batch_size=self.args.train_batch_size)
        if self.args.loss_func is not None and self.args.loss_func == 'MultipleNegativesRankingLoss':
            train_loss = losses.MultipleNegativesRankingLoss(self.model)
        else:
            # Default: online contrastive loss over cosine distance.
            distance_metric = losses.SiameseDistanceMetric.COSINE_DISTANCE
            train_loss = losses.OnlineContrastiveLoss(
                model=self.model,
                distance_metric=distance_metric,
                margin=self.args.margin)

        # Tune the model
        self.model.fit(
            train_objectives=[(train_dataloader, train_loss)],
            epochs=self.args.num_train_epochs,
            warmup_steps=self.args.warmup_steps,
            optimizer_params={'lr': self.args.learning_rate},
            weight_decay=self.args.weight_decay,
            evaluator=evaluator,
            evaluation_steps=self.args.evaluate_during_training_steps,
            max_grad_norm=self.args.max_grad_norm,
            output_path=self.args.best_model_dir,
            show_progress_bar=self.args.show_progress_bar)

        # Read back the evaluator's CSV log, sort by the configured score
        # column, and take the threshold from the best-scoring row.
        evaluation_file = os.path.join(self.args.best_model_dir,
                                       evaluator.csv_file)
        eval_results_df = pd.read_csv(evaluation_file)
        eval_results_df.sort_values(self.score_type,
                                    inplace=True,
                                    ascending=False,
                                    ignore_index=True)
        self.threshold = eval_results_df.loc[0, self.threshold_type]
        print(
            f'Set model threshold to {self.threshold} acquiring a {self.score_type} of {eval_results_df.loc[0, self.score_type]}'
        )

        return self.threshold