import json
import logging
import math
import os
import random

from sentence_transformers import (InputExample, SentencesDataset,
                                   SentenceTransformer, losses, models)
from sentence_transformers.evaluation import (BinaryClassificationEvaluator,
                                              EmbeddingSimilarityEvaluator)
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader


# `max_size`, `num_epochs`, `batch_size`, and the task-name strings
# (`linked_posts_str`, `search_str`) are module-level settings defined
# elsewhere in the script.
def create_linked_posts(fl, data_dir, model, validate=None, is_test=False):
    train_linked_posts = []
    disbn = []
    with open(os.path.join(data_dir, fl)) as f:
        data = json.load(f)
    for obj in data:
        # 'relevant' pairs are positives (label 1), everything else negatives
        label = 1 if obj['class'] == 'relevant' else 0
        disbn.append(label)
        train_linked_posts.append(
            InputExample(texts=[obj['text_1'], obj['text_2']], label=label))
    random.shuffle(train_linked_posts)
    if is_test:
        return train_linked_posts
    if max_size:
        train_linked_posts = train_linked_posts[:max_size]
    evaluator = None
    if linked_posts_str == validate:
        # Recompute the labels so they stay aligned with the shuffled
        # (and possibly truncated) examples before the stratified split
        disbn = [example.label for example in train_linked_posts]
        train_linked_posts, dev_linked_posts = train_test_split(
            train_linked_posts, stratify=disbn, test_size=0.1)
        evaluator = BinaryClassificationEvaluator.from_input_examples(
            dev_linked_posts, name='linked-posts')
    warmup_steps = math.ceil(
        len(train_linked_posts) * num_epochs / batch_size * 0.1)  # 10% of train data for warm-up
    train_data_linked_posts = SentencesDataset(train_linked_posts, model=model)
    train_dataloader_linked_posts = DataLoader(train_data_linked_posts,
                                               shuffle=True,
                                               batch_size=batch_size)
    train_loss_linked_posts = losses.ContrastiveLoss(model=model)
    print('L: Number of training examples: ', len(train_linked_posts))
    global evaluation_steps
    evaluation_steps = math.ceil(
        len(train_linked_posts) / batch_size * 0.1)  # evaluate every 10% of an epoch
    return train_dataloader_linked_posts, train_loss_linked_posts, evaluator, warmup_steps
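
# A minimal sketch of the JSON format the loader above appears to expect:
# a list of objects with 'text_1', 'text_2', and a 'class' field, where
# 'relevant' maps to label 1 and anything else to 0. The file name and the
# record contents below are illustrative assumptions, not part of the
# original data.
_example_records = [
    {'text_1': 'How do I sort a dict by value?',
     'text_2': 'Sorting a dictionary by its values in Python',
     'class': 'relevant'},
    {'text_1': 'How do I sort a dict by value?',
     'text_2': 'Installing CUDA drivers on Ubuntu',
     'class': 'irrelevant'},
]
with open('linked_posts_example.json', 'w', encoding='utf8') as f:
    json.dump(_example_records, f, indent=2)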
# ... inside the dev-set reading loop (each 4-column TSV row yields one
# sentence pair; see the test-set loader at the end for the full pattern):
        label = float(row[3])
        sent1 = row[0]
        sent2 = row[1] + ' ' + row[2]
        inp_example = InputExample(texts=[sent1, sent2], label=label)
        dev_samples.append(inp_example)

# Dump a few training samples for inspection
with open('./tmp.txt', 'w', encoding='utf-8') as file:
    for sample in train_samples[:10]:
        file.write(str(sample.texts) + ' ' + str(sample.label) + '\n')

train_dataset = SentencesDataset(train_samples, model)
train_dataloader = DataLoader(train_dataset,
                              shuffle=True,
                              batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

logging.info("Read dev dataset")
evaluator = BinaryClassificationEvaluator.from_input_examples(dev_samples,
                                                              name='doc-dev')

# Configure the training; the dev evaluator runs every `evaluation_steps` steps
warmup_steps = math.ceil(
    len(train_dataset) * num_epochs / train_batch_size * 0.1)  # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the model
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=model_save_path)
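
# Hedged sketch of how the fine-tuned model can be queried after fit() has
# saved it to model_save_path. util.pytorch_cos_sim is the library's
# cosine-similarity helper; the sentence pair is invented for illustration.
from sentence_transformers import util

trained_model = SentenceTransformer(model_save_path)
emb = trained_model.encode(['How do I parse JSON in Python?',
                            'json.loads turns a JSON string into a dict'],
                           convert_to_tensor=True)
print(util.pytorch_cos_sim(emb[0], emb[1]))  # higher score => more similar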
    validate=None, is_test=True)  # (tail of the preceding create_* call)
test_search = create_search(
    'stackoverflow_matches_codesearchnet_5k_test_collection.tsv',
    'stackoverflow_matches_codesearchnet_5k_test_queries.tsv',
    'stackoverflow_matches_codesearchnet_5k_test_blanca-qidpidtriples.train.tsv',
    data_dir, model, validate=None, is_test=True)

# Run every task-specific evaluator on the held-out test sets
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    test_hierarchy_samples, name='test-hierarchy-samples')
test_evaluator(model, output_path=args.output_dir)

test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    test_posts_ranking, name='test-post-ranking')
test_evaluator(model, output_path=args.output_dir)

test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    test_usage, name='test-usage')
test_evaluator(model, output_path=args.output_dir)

test_evaluator = BinaryClassificationEvaluator.from_input_examples(
    test_linked_posts, name='test-linked-posts')
test_evaluator(model, output_path=args.output_dir)

test_evaluator = BinaryClassificationEvaluator.from_input_examples(
    test_class_posts, name='test-class-posts')
test_evaluator(model, output_path=args.output_dir)

test_evaluator = BinaryClassificationEvaluator.from_input_examples(
    test_search, name='test-search')
test_evaluator(model, output_path=args.output_dir)
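
# Hedged sketch: each evaluator call above appends its metrics to a CSV in
# args.output_dir (sentence-transformers derives the file name from the
# evaluator type and its `name` argument, ending in "_results.csv"). The
# collector below is an illustrative helper, not part of the original script.
import csv
import glob

def collect_eval_results(output_dir):
    results = {}
    for path in glob.glob(os.path.join(output_dir, '*_results.csv')):
        with open(path, newline='', encoding='utf8') as f:
            rows = list(csv.DictReader(f))
        if rows:
            results[os.path.basename(path)] = rows[-1]  # last recorded step
    return results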
def create_search(collection, query_file, train, data_dir, model,
                  validate=None, is_test=False):
    corpus = {}
    with open(os.path.join(data_dir, collection), 'r', encoding='utf8') as fIn:
        for line in fIn:
            pid, passage = line.strip().split("\t")
            corpus[pid] = passage

    queries = {}
    with open(os.path.join(data_dir, query_file), 'r', encoding='utf8') as fIn:
        for line in fIn:
            qid, query = line.strip().split("\t")
            queries[qid] = query

    train_search = []
    disbn = []
    with open(os.path.join(data_dir, train), 'r', encoding='utf8') as f:
        added_q = set()
        for line in f.readlines():
            qid, pos_id, neg_id = line.strip().split()
            query = queries[qid]
            passage = corpus[pos_id]
            neg_passage = corpus[neg_id]
            if qid not in added_q:
                # Add the positive pair only once per query
                train_search.append(
                    InputExample(texts=[query, passage], label=1))
                disbn.append(1)
                added_q.add(qid)
            train_search.append(
                InputExample(texts=[query, neg_passage], label=0))
            disbn.append(0)
    random.shuffle(train_search)
    if is_test:
        return train_search
    if max_size:
        train_search = train_search[:max_size]
    evaluator = None
    if search_str == validate:
        # Recompute the labels so they stay aligned with the shuffled
        # (and possibly truncated) examples before the stratified split
        disbn = [example.label for example in train_search]
        train_search, dev_search = train_test_split(train_search,
                                                    stratify=disbn,
                                                    test_size=0.1)
        evaluator = BinaryClassificationEvaluator.from_input_examples(
            dev_search, name='search')
    warmup_steps = math.ceil(
        len(train_search) * num_epochs / batch_size * 0.1)  # 10% of train data for warm-up
    # We create a DataLoader to load our train samples
    train_dataloader_search = DataLoader(train_search,
                                         shuffle=True,
                                         batch_size=batch_size)
    train_loss_search = losses.ContrastiveLoss(model=model)
    print('S: Number of training examples: ', len(train_search))
    global evaluation_steps
    evaluation_steps = math.ceil(
        len(train_search) / batch_size * 0.1)  # evaluate every 10% of an epoch
    return train_dataloader_search, train_loss_search, evaluator, warmup_steps
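
# Hedged sketch of the three tab-separated inputs create_search reads. Only
# the column shapes are taken from the parsing code above; the ids, text,
# and file names below are invented for illustration.
#
#   collection file: pid <TAB> passage
#   query file:      qid <TAB> query
#   triples file:    qid  positive_pid  negative_pid  (whitespace-separated)
_collection = 'p1\tdef add(a, b): return a + b\np2\tos.remove(path) deletes a file\n'
_queries = 'q1\thow to add two numbers in python\n'
_triples = 'q1 p1 p2\n'
for _name, _content in [('example_collection.tsv', _collection),
                        ('example_queries.tsv', _queries),
                        ('example_triples.tsv', _triples)]:
    with open(_name, 'w', encoding='utf8') as out:
        out.write(_content)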
# Use a Hugging Face transformers model (like BERT, RoBERTa, XLNet, XLM-R)
# for mapping tokens to embeddings
word_embedding_model = models.Transformer(model_name, max_seq_length=max_length)
print(word_embedding_model.get_config_dict())

# Apply mean pooling to get one fixed-size sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Read the Document Classification test set and convert it to InputExamples
logging.info("Read Document Classification test dataset")
test_samples = []
with open(os.path.join(dataset_path, 'test.txt'), 'rt', encoding='utf8') as fIn:
    for row in fIn.readlines():
        row = row.strip().split('\t')
        if len(row) != 4:
            continue
        label = float(row[3])
        sent1 = row[0]
        sent2 = row[1] + ' ' + row[2]
        inp_example = InputExample(texts=[sent1, sent2], label=label)
        test_samples.append(inp_example)

# Reload the fine-tuned weights and score the test pairs
model = SentenceTransformer(model_save_path)
test_evaluator = BinaryClassificationEvaluator.from_input_examples(test_samples,
                                                                   name='doc-test')
test_evaluator(model, output_path=model_save_path)
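
# Hedged sketch: a SentenceTransformer assembled as above embeds raw strings
# directly via encode(). The sentences are invented, and the embedding width
# depends on the chosen base model (e.g. 768 for BERT-base).
vectors = model.encode(['How do I read a file in Python?',
                        'open() returns a file object'])
print(vectors.shape)  # (2, hidden_size)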