Example No. 1
# We wrap train_samples, which is a list of InputExample, in a PyTorch DataLoader
train_dataloader = DataLoader(train_samples,
                              shuffle=True,
                              batch_size=batch_size)

# We add an evaluator, which evaluates the performance during training
evaluator = CECorrelationEvaluator.from_input_examples(dev_samples,
                                                       name='sts-dev')

# Configure the training
warmup_steps = math.ceil(len(train_dataloader) * num_epochs *
                         0.1)  #10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the cross-encoder model
cross_encoder.fit(train_dataloader=train_dataloader,
                  evaluator=evaluator,
                  epochs=num_epochs,
                  warmup_steps=warmup_steps,
                  output_path=cross_encoder_path)

############################################################################
#
# Step 2: Label BM25 sampled STSb (silver dataset) using cross-encoder model
#
############################################################################

#### Top k similar sentences to be retrieved ####
#### The larger the k, the bigger the silver dataset ####
top_k = 3  # number of BM25 hits retrieved per sentence (illustrative value)

index_name = "stsb"  # the index name must be lowercase
logging.info(
    "Step 2.1: Generate STSbenchmark (silver dataset) using top-{} bm25 combinations"
    .format(top_k))
Example No. 2
# We wrap train_samples, which is a list of InputExample, in a PyTorch DataLoader
train_dataloader = DataLoader(train_samples,
                              shuffle=False,
                              batch_size=train_batch_size)

evaluator = CEBinaryClassificationEvaluator.from_input_examples(
    dev_samples, name='UnfoldingStructure-dev')

warmup_steps = math.ceil(len(train_dataloader) * num_epochs *
                         0.1)  #10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the model
model.fit(train_dataloader=train_dataloader,
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=10000,
          warmup_steps=warmup_steps,
          output_path=model_save_path)

##############################################################################
# Testing
##############################################################################

logging.info("Read test examples")

model = CrossEncoder(model_save_path)
test_evaluator = CEBinaryClassificationEvaluator.from_input_examples(
    test_samples, name='test')
test_evaluator(model, output_path=model_save_path)
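# Sketch, not part of the original script: using the trained binary cross-encoder
# for inference. Assuming the model was created with num_labels=1 (the default),
# predict() returns one sigmoid score per pair, so thresholding at 0.5 gives a
# hard label. The sentence pair below is made up.
pairs = [["The protein unfolds at high temperature.",
          "Heating causes the structure to unfold."]]
scores = model.predict(pairs)  # one score in [0, 1] per pair
labels = [int(score > 0.5) for score in scores]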
Example No. 3
# We wrap train_samples, which is a list of InputExample, in a PyTorch DataLoader
train_dataloader = DataLoader(train_samples,
                              shuffle=True,
                              batch_size=train_batch_size)

# During training, we use CESoftmaxAccuracyEvaluator to measure the accuracy on the dev set.
evaluator = CESoftmaxAccuracyEvaluator.from_input_examples(dev_samples,
                                                           name='AllNLI-dev')

warmup_steps = math.ceil(len(train_dataloader) * num_epochs *
                         0.1)  # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the model, passing the dev-set evaluator defined above
model.fit(train_dataloader=train_dataloader,
          evaluator=evaluator,
          epochs=num_epochs,
          warmup_steps=warmup_steps,
          output_path=model_save_path)

test_dataset = 'contradictory-my-dear-watson/test.csv'
df = pandas.read_csv(test_dataset)
sentence_pairs = []
ids = []
for _, row in df.iterrows():
    ids.append(row['id'])
    sentence_pairs.append([row['premise'], row['hypothesis']])

pred_scores = model.predict(sentence_pairs,
                            convert_to_numpy=True,
                            show_progress_bar=False,
                            batch_size=4)
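# Sketch, not part of the original script: turning the raw predictions into a
# submission file. For a 3-label softmax cross-encoder, predict() returns one
# score per class, so the predicted label is the argmax. The 'id'/'prediction'
# column names follow the Kaggle submission format and are an assumption here.
import numpy as np

pred_labels = np.argmax(pred_scores, axis=1)
submission = pandas.DataFrame({'id': ids, 'prediction': pred_labels})
submission.to_csv('submission.csv', index=False)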
Example No. 4
        config, os.path.join(config['train_dir'], config['train_hyp_file']))
    num_labels = 3
else:
    train_samples, dev_samples = get_train_dev_data(
        config, os.path.join(config['train_dir'], config['train_flat_file']))
logging.info("Done Processing Data ...")

model = CrossEncoder(config['crossencoder_base_model'], num_labels=num_labels)

batch_size = config['batch_size']
num_epochs = config['num_epochs']

train_dataloader = DataLoader(train_samples,
                              shuffle=True,
                              batch_size=batch_size)
train_loss = get_loss(config['loss_type'], model)
evaluator = CEBinaryAccuracyEvaluator.from_input_examples(dev_samples)

warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)
logging.info("Warmup-steps: {}".format(warmup_steps))

model_dir = os.path.join(config['saved_model_dir'], config['checkpoint_path'])

logging.info("Starting training ...")
model.fit(train_dataloader=train_dataloader,
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=int(config['eval_steps']),
          warmup_steps=warmup_steps,
          output_path=model_dir)
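# Hypothetical sketch of the get_loss helper called earlier (its real definition
# is not shown in this snippet, and the loss_type strings are assumptions): it
# maps a config string to a torch loss module. The returned module could also be
# passed to CrossEncoder.fit via its loss_fct argument; as written above, fit()
# falls back to its default loss.
import torch


def get_loss(loss_type, model):
    # model is accepted to mirror the call site above but is not used in this sketch
    if loss_type == 'bce':
        return torch.nn.BCEWithLogitsLoss()
    if loss_type == 'mse':
        return torch.nn.MSELoss()
    return torch.nn.CrossEntropyLoss()  # e.g. for the 3-label hypothesis setting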
Example No. 5
        if qid in dev_qids:  # skip queries that are in our dev dataset
            continue

        train_samples.append(InputExample(texts=[queries[qid], corpus[pid1]], label=float(pos_score)))
        train_samples.append(InputExample(texts=[queries[qid], corpus[pid2]], label=float(neg_score)))

# We create a DataLoader to load our train samples
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size, drop_last=True)

# We add an evaluator, which evaluates the performance during training.
# It re-ranks the candidate passages for each dev query and reports MRR@10.
evaluator = CERerankingEvaluator(dev_samples, name='train-eval')

# Configure the training
warmup_steps = 5000
logging.info("Warmup-steps: {}".format(warmup_steps))


# Train the model
model.fit(train_dataloader=train_dataloader,
          loss_fct=torch.nn.MSELoss(),
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=5000,
          warmup_steps=warmup_steps,
          output_path=model_save_path,
          optimizer_params={'lr': 7e-6},
          use_amp=True)

# Save the latest model
model.save(model_save_path + '-latest')
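# Sketch, not part of the original script: using the trained reranker at inference
# time. Given a query and candidate passages (e.g. the top hits from BM25), the
# cross-encoder scores each (query, passage) pair and the candidates are sorted by
# that score. The query and passages below are made up.
query = "what is the capital of france"
candidates = ["Paris is the capital of France.",
              "France is a country in western Europe.",
              "Berlin is the capital of Germany."]
scores = model.predict([[query, passage] for passage in candidates])
reranked = sorted(zip(candidates, scores), key=lambda x: x[1], reverse=True)
for passage, score in reranked:
    print("{:.3f}\t{}".format(score, passage))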