shuffle=True, batch_size=batch_size)

# We add an evaluator, which evaluates the performance during training
evaluator = CECorrelationEvaluator.from_input_examples(dev_samples, name='sts-dev')

# Configure the training
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)  # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the cross-encoder model
cross_encoder.fit(train_dataloader=train_dataloader,
                  evaluator=evaluator,
                  epochs=num_epochs,
                  warmup_steps=warmup_steps,
                  output_path=cross_encoder_path)

############################################################################
#
# Step 2: Label BM25 sampled STSb (silver dataset) using cross-encoder model
#
############################################################################

#### Top k similar sentences to be retrieved         ####
#### The larger the k, the bigger the silver dataset ####

index_name = "stsb"  # index name must be lowercase
logging.info("Step 2.1: Generate STSbenchmark (silver dataset) using top-{} bm25 combinations".format(top_k))
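# A minimal sketch of the labeling step announced above, assuming `silver_pairs`
# is a list of [sentence1, sentence2] pairs sampled via BM25 (the retrieval
# itself is not shown here); the example pair is purely illustrative.
# CrossEncoder.predict scores each pair, yielding soft labels for the silver set.
silver_pairs = [["A man plays a guitar.", "Someone is playing an instrument."]]
silver_scores = cross_encoder.predict(silver_pairs)  # one similarity score per pair
silver_samples = [InputExample(texts=pair, label=float(score))
                  for pair, score in zip(silver_pairs, silver_scores)]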
# We wrap train_samples, which is a list of InputExample, in a PyTorch DataLoader
train_dataloader = DataLoader(train_samples, shuffle=False, batch_size=train_batch_size)

evaluator = CEBinaryClassificationEvaluator.from_input_examples(dev_samples, name='UnfoldingStructure-dev')

warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)  # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the model
model.fit(train_dataloader=train_dataloader,
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=10000,
          warmup_steps=warmup_steps,
          output_path=model_save_path)

##############################################################################
# Testing
##############################################################################

logging.info("Read test examples")
model = CrossEncoder(model_save_path)
test_evaluator = CEBinaryClassificationEvaluator.from_input_examples(test_samples, name='test')
test_evaluator(model, output_path=model_save_path)
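# A hedged inference sketch for the binary cross-encoder tested above: with
# num_labels=1 the model emits one sigmoid score in [0, 1] per pair, which can
# be thresholded at 0.5 for a hard decision. The example pair is illustrative.
scores = model.predict([["first sentence", "second sentence"]])
is_positive = bool(scores[0] > 0.5)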
# We wrap train_samples, which is a list of InputExample, in a PyTorch DataLoader
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)

# During training, we use CESoftmaxAccuracyEvaluator to measure the accuracy on the dev set.
evaluator = CESoftmaxAccuracyEvaluator.from_input_examples(dev_samples, name='AllNLI-dev')

warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)  # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the model
model.fit(train_dataloader=train_dataloader,
          evaluator=evaluator,
          epochs=num_epochs,
          warmup_steps=warmup_steps,
          output_path=model_save_path)

test_dataset = 'contradictory-my-dear-watson/test.csv'
df = pandas.read_csv(test_dataset)

sentence_pairs = []
ids = []
for _, row in df.iterrows():
    ids.append(row['id'])
    sentence_pairs.append([row['premise'], row['hypothesis']])

pred_scores = model.predict(sentence_pairs, convert_to_numpy=True, show_progress_bar=False, batch_size=4)
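# A hedged follow-up sketch: with a 3-label cross-encoder, `pred_scores` holds
# one row of class scores per pair, so the predicted label is the row argmax.
# The submission filename and column names are assumptions for illustration.
import numpy as np

pred_labels = np.argmax(pred_scores, axis=1)
submission = pandas.DataFrame({'id': ids, 'prediction': pred_labels})
submission.to_csv('submission.csv', index=False)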
        config, os.path.join(config['train_dir'], config['train_hyp_file']))
    num_labels = 3
else:
    train_samples, dev_samples = get_train_dev_data(
        config, os.path.join(config['train_dir'], config['train_flat_file']))
logging.info("Done Processing Data ...")

model = CrossEncoder(config['crossencoder_base_model'], num_labels=num_labels)
batch_size = config['batch_size']
num_epochs = config['num_epochs']

train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=batch_size)
train_loss = get_loss(config['loss_type'], model)
evaluator = CEBinaryAccuracyEvaluator.from_input_examples(dev_samples)
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)  # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

model_dir = os.path.join(config['saved_model_dir'], config['checkpoint_path'])
logging.info("Starting training ...")
model.fit(train_dataloader=train_dataloader,
          loss_fct=train_loss,  # loss selected via config['loss_type']
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=int(config['eval_steps']),
          warmup_steps=warmup_steps,
          output_path=model_dir)
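# `get_loss` is project-specific and not shown; a plausible sketch (an
# assumption, not the project's actual helper) maps the config string to a
# torch loss object that CrossEncoder.fit accepts through its loss_fct argument.
import torch.nn as nn

def get_loss(loss_type, model):  # model kept for signature compatibility
    losses = {
        'bce': nn.BCEWithLogitsLoss(),  # binary labels
        'ce': nn.CrossEntropyLoss(),    # multi-class labels
        'mse': nn.MSELoss(),            # regression-style labels
    }
    return losses[loss_type]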
    if qid in dev_qids:  # Skip queries in our dev dataset
        continue
    train_samples.append(InputExample(texts=[queries[qid], corpus[pid1]], label=float(pos_score)))
    train_samples.append(InputExample(texts=[queries[qid], corpus[pid2]], label=float(neg_score)))

# We create a DataLoader to load our train samples
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size, drop_last=True)

# We add an evaluator, which evaluates the performance during training:
# it re-ranks the dev passages for each query and reports MRR@10
# (how highly the relevant passages are ranked)
evaluator = CERerankingEvaluator(dev_samples, name='train-eval')

# Configure the training
warmup_steps = 5000
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the model
model.fit(train_dataloader=train_dataloader,
          loss_fct=torch.nn.MSELoss(),
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=5000,
          warmup_steps=warmup_steps,
          output_path=model_save_path,
          optimizer_params={'lr': 7e-6},
          use_amp=True)

# Save latest model
model.save(model_save_path + '-latest')
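# A minimal usage sketch once training has finished: reload the saved
# cross-encoder and score (query, passage) pairs; higher scores mean a better
# match, so sorting by score re-ranks candidate passages. The query and
# passages below are illustrative only.
reranker = CrossEncoder(model_save_path + '-latest')
query = "how do cross-encoders score text pairs"
candidates = ["They jointly encode both texts and output one score.",
              "Bananas are rich in potassium."]
scores = reranker.predict([(query, passage) for passage in candidates])
ranked = sorted(zip(candidates, scores), key=lambda x: x[1], reverse=True)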