def create_linked_posts(fl, data_dir, model, validate=None, is_test=False):
    train_linked_posts = []

    with open(os.path.join(data_dir, fl)) as f:
        data = json.load(f)
        for obj in data:
            # Binary label: 1 if the two posts are marked as relevant to
            # each other, 0 otherwise.
            label = 1 if obj['class'] == 'relevant' else 0

            train_linked_posts.append(
                InputExample(texts=[obj['text_1'], obj['text_2']],
                             label=label))
    random.shuffle(train_linked_posts)

    if is_test:
        return train_linked_posts

    if max_size:
        train_linked_posts = train_linked_posts[:max_size]

    evaluator = None
    if linked_posts_str == validate:
        # Recompute the label distribution after shuffling/truncation so the
        # stratify argument lines up with the examples actually being split.
        disbn = [example.label for example in train_linked_posts]
        train_linked_posts, dev_linked_posts = train_test_split(
            train_linked_posts, stratify=disbn, test_size=0.1)
        evaluator = BinaryClassificationEvaluator.from_input_examples(
            dev_linked_posts, name='linked-posts')

    warmup_steps = math.ceil(
        len(train_linked_posts) * num_epochs / batch_size *
        0.1)  # 10% of train data for warm-up

    train_data_linked_posts = SentencesDataset(train_linked_posts, model=model)
    train_dataloader_linked_posts = DataLoader(train_data_linked_posts,
                                               shuffle=True,
                                               batch_size=batch_size)
    train_loss_linked_posts = losses.ContrastiveLoss(model=model)

    print('L: Number of training examples: ', len(train_linked_posts))

    global evaluation_steps
    # Evaluate roughly every 10% of an epoch; model.fit counts
    # evaluation_steps in batches, not examples.
    evaluation_steps = math.ceil(len(train_linked_posts) / batch_size * 0.1)

    return train_dataloader_linked_posts, train_loss_linked_posts, evaluator, warmup_steps
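
# Usage sketch (assumed wiring; the file name and the module-level globals
# data_dir, max_size, batch_size, num_epochs and linked_posts_str are
# illustrative, not defined in the snippet above):
#
#     dataloader, loss, evaluator, warmup = create_linked_posts(
#         'linked_posts.json', data_dir, model, validate=linked_posts_str)
#     model.fit(train_objectives=[(dataloader, loss)],
#               evaluator=evaluator,
#               epochs=num_epochs,
#               evaluation_steps=evaluation_steps,
#               warmup_steps=warmup)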
# Dev split, read in the same four-column TSV format as the test split further
# below (the file name is an assumption; train_samples is built the same way
# from the train split).
dev_samples = []
with open(os.path.join(dataset_path, 'dev.txt'), 'rt', encoding='utf8') as fIn:
    for row in fIn:
        row = row.strip().split('\t')
        if len(row) != 4:
            continue
        label = float(row[3])
        sent1 = row[0]
        sent2 = row[1] + ' ' + row[2]
        inp_example = InputExample(texts=[sent1, sent2], label=label)
        dev_samples.append(inp_example)

# Spot-check: write a few training samples to a temp file for inspection.
with open('./tmp.txt', 'w', encoding='utf-8') as file:
    for sample in train_samples[:10]:
        file.write(str(sample.texts) + ' ' + str(sample.label) + '\n')

train_dataset = SentencesDataset(train_samples, model)
train_dataloader = DataLoader(train_dataset,
                              shuffle=True,
                              batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

logging.info("Read dev dataset")
evaluator = BinaryClassificationEvaluator.from_input_examples(dev_samples,
                                                              name='doc-dev')

# Configure the training; the dev evaluator runs every evaluation_steps batches
warmup_steps = math.ceil(
    len(train_dataset) * num_epochs / train_batch_size *
    0.1)  # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the model
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=model_save_path)
Example #3
    # (Trailing arguments of a call, truncated in the source, that builds one
    # of the remaining test sets with the same pattern as create_search below.)
        validate=None,
        is_test=True)
    test_search = create_search(
        'stackoverflow_matches_codesearchnet_5k_test_collection.tsv',
        'stackoverflow_matches_codesearchnet_5k_test_queries.tsv',
        'stackoverflow_matches_codesearchnet_5k_test_blanca-qidpidtriples.train.tsv',
        data_dir,
        model,
        validate=None,
        is_test=True)

    test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
        test_hierarchy_samples, name='test-hierarchy-samples')
    test_evaluator(model, output_path=args.output_dir)
    test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
        test_posts_ranking, name='test-post-ranking')
    test_evaluator(model, output_path=args.output_dir)
    test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
        test_usage, name='test-usage')
    test_evaluator(model, output_path=args.output_dir)

    test_evaluator = BinaryClassificationEvaluator.from_input_examples(
        test_linked_posts, name='test-linked-posts')
    test_evaluator(model, output_path=args.output_dir)
    test_evaluator = BinaryClassificationEvaluator.from_input_examples(
        test_class_posts, name='test-class-posts')
    test_evaluator(model, output_path=args.output_dir)
    test_evaluator = BinaryClassificationEvaluator.from_input_examples(
        test_search, name='test-search')
    test_evaluator(model, output_path=args.output_dir)
def create_search(collection,
                  query_file,
                  train,
                  data_dir,
                  model,
                  validate=None,
                  is_test=False):
    corpus = {}
    with open(os.path.join(data_dir, collection), 'r', encoding='utf8') as fIn:
        for line in fIn:
            pid, passage = line.strip().split("\t")
            corpus[pid] = passage

    queries = {}
    with open(os.path.join(data_dir, query_file), 'r', encoding='utf8') as fIn:
        for line in fIn:
            qid, query = line.strip().split("\t")
            queries[qid] = query

    train_search = []
    with open(os.path.join(data_dir, train), 'r', encoding='utf8') as f:
        added_q = set()
        for line in f:
            qid, pos_id, neg_id = line.strip().split()
            query = queries[qid]
            passage = corpus[pos_id]
            neg_passage = corpus[neg_id]
            # Keep one positive pair per query, plus every negative pair.
            if qid not in added_q:
                train_search.append(
                    InputExample(texts=[query, passage], label=1))
                added_q.add(qid)
            train_search.append(
                InputExample(texts=[query, neg_passage], label=0))
    random.shuffle(train_search)

    if is_test:
        return train_search

    if max_size:
        train_search = train_search[:max_size]
    evaluator = None

    if search_str == validate:
        # Recompute labels after shuffling/truncation so the stratify argument
        # lines up with the examples actually being split.
        disbn = [example.label for example in train_search]
        train_search, dev_search = train_test_split(train_search,
                                                    stratify=disbn,
                                                    test_size=0.1)
        evaluator = BinaryClassificationEvaluator.from_input_examples(
            dev_search, name='search')

    warmup_steps = math.ceil(
        len(train_search) * num_epochs / batch_size *
        0.1)  # 10% of train data for warm-up

    # We create a DataLoader to load our train samples
    train_dataloader_search = DataLoader(train_search,
                                         shuffle=True,
                                         batch_size=batch_size)
    train_loss_search = losses.ContrastiveLoss(model=model)

    print('S: Number of training examples: ', len(train_search))

    global evaluation_steps
    # Evaluate roughly every 10% of an epoch; model.fit counts
    # evaluation_steps in batches, not examples.
    evaluation_steps = math.ceil(len(train_search) / batch_size * 0.1)

    return train_dataloader_search, train_loss_search, evaluator, warmup_steps
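
# Multi-task sketch (assumed wiring; the file names are illustrative):
# SentenceTransformer.fit accepts several (dataloader, loss) pairs and draws
# one batch from each objective per training step, so the linked-posts and
# search tasks could be trained jointly:
#
#     linked_dl, linked_loss, _, warmup_a = create_linked_posts(
#         'linked_posts.json', data_dir, model)
#     search_dl, search_loss, search_eval, warmup_b = create_search(
#         'collection.tsv', 'queries.tsv', 'triples.tsv',
#         data_dir, model, validate=search_str)
#     model.fit(train_objectives=[(linked_dl, linked_loss),
#                                 (search_dl, search_loss)],
#               evaluator=search_eval,
#               epochs=num_epochs,
#               evaluation_steps=evaluation_steps,
#               warmup_steps=max(warmup_a, warmup_b))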
Example #5
# Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
word_embedding_model = models.Transformer(model_name, max_seq_length=max_length)
print(word_embedding_model.get_config_dict())
# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
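# Quick sanity check: the Transformer + mean-pooling stack maps any sentence to
# a single fixed-size vector whose length equals the transformer's hidden size
# (e.g. 768 for a BERT-base model_name).
example_embedding = model.encode('How do I read a file line by line?')
print('Embedding dimension:', len(example_embedding))
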
# Convert the dataset to a DataLoader ready for training
logging.info("Read Document Classification train dataset")

test_samples = []

with open(os.path.join(dataset_path, 'test.txt'), 'rt', encoding='utf8') as fIn:
    for row in fIn:
        row = row.strip().split('\t')
        if len(row) != 4:
            continue
        label = float(row[3])
        sent1 = row[0]
        sent2 = row[1] + ' ' + row[2]
        inp_example = InputExample(texts=[sent1, sent2], label=label)
        test_samples.append(inp_example)

model = SentenceTransformer(model_save_path)
test_evaluator = BinaryClassificationEvaluator.from_input_examples(test_samples, name='doc-test')
test_evaluator(model, output_path=model_save_path)
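
# Note: an evaluator call returns its main metric as a float (and writes a CSV
# of results under output_path), so the score above could also be captured:
#
#     test_score = test_evaluator(model, output_path=model_save_path)
#     logging.info('doc-test score: {:.4f}'.format(test_score))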