Example #1
#Limit torch to 4 threads
torch.set_num_threads(4)

model_name = sys.argv[1] if len(sys.argv) > 1 else 'bert-base-nli-mean-tokens'

# Load a named sentence model (based on BERT). This will download the model from our server.
# Alternatively, you can also pass a filepath to SentenceTransformer()
model = SentenceTransformer(model_name)

nli_dataset_path = 'datasets/AllNLI.tsv.gz'
sentences = set()
max_sentences = 100000

#Download datasets if needed
if not os.path.exists(nli_dataset_path):
    util.http_get('https://sbert.net/datasets/AllNLI.tsv.gz', nli_dataset_path)

with gzip.open(nli_dataset_path, 'rt', encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        sentences.add(row['sentence1'])
        if len(sentences) >= max_sentences:
            break

sentences = list(sentences)
print("Model Name:", model_name)
print("Number of sentences:", len(sentences))

for i in range(3):
    print("Run", i)
    start_time = time.time()
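    # The snippet is truncated inside the timing loop. A minimal sketch of how the
    # benchmark run might continue (batch_size and the printed metric are assumptions,
    # not taken from the original script):
    emb = model.encode(sentences, batch_size=32, show_progress_bar=False)
    elapsed = time.time() - start_time
    print("Encoded {} sentences in {:.2f} sec ({:.2f} sentences / sec)".format(
        len(sentences), elapsed, len(sentences) / elapsed))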
Example #2
#### /print debug information to stdout

#Model for which we apply dimensionality reduction
model = SentenceTransformer('all-MiniLM-L6-v2')

#New size for the embeddings
new_dimension = 128

#We use AllNLI as a source of sentences to compute PCA
nli_dataset_path = 'datasets/AllNLI.tsv.gz'

#We use the STS benchmark dataset to see how much performance we lose by using dimensionality reduction
sts_dataset_path = 'datasets/stsbenchmark.tsv.gz'

if not os.path.exists(nli_dataset_path):
    util.http_get('https://sbert.net/datasets/AllNLI.tsv.gz', nli_dataset_path)

if not os.path.exists(sts_dataset_path):
    util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz',
                  sts_dataset_path)

# We measure the performance of the original model
# and later compare it with the performance at the reduced dimension size
logger.info("Read STSbenchmark test dataset")
eval_examples = []
with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        if row['split'] == 'test':
            score = float(
                row['score']) / 5.0  #Normalize score to range 0 ... 1
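
The snippet breaks off while reading the STS test split. The dimensionality reduction announced in the comments above is typically done by fitting a PCA on embeddings of the AllNLI sentences and wrapping the projection in a models.Dense layer. A minimal sketch follows; the number of PCA training sentences, the use of scikit-learn's PCA, and the variable nli_sentences (assumed to hold sentences read from nli_dataset_path) are assumptions, not the original code.

import torch
from sklearn.decomposition import PCA
from sentence_transformers import models

# Embed a sample of NLI sentences and fit a PCA with the desired output size
pca_train_sentences = nli_sentences[0:20000]  # nli_sentences is assumed to be read from nli_dataset_path
train_embeddings = model.encode(pca_train_sentences, convert_to_numpy=True)
pca = PCA(n_components=new_dimension)
pca.fit(train_embeddings)

# Add a Dense layer that applies the PCA projection to every sentence embedding
dense = models.Dense(in_features=model.get_sentence_embedding_dimension(),
                     out_features=new_dimension,
                     bias=False,
                     activation_function=torch.nn.Identity())
dense.linear.weight = torch.nn.Parameter(torch.tensor(pca.components_, dtype=torch.float32))
model.add_module('dense', dense)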
Example #3
#As distance metric, we use cosine distance (cosine_distance = 1-cosine_similarity)
distance_metric = losses.SiameseDistanceMetric.COSINE_DISTANCE

#Negative pairs should have a distance of at least 0.5
margin = 0.5

dataset_path = 'quora-IR-dataset'
model_save_path = 'output/training_OnlineConstrativeLoss-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

os.makedirs(model_save_path, exist_ok=True)

# Check if the dataset exists. If not, download and extract
if not os.path.exists(dataset_path):
    logging.info("Dataset not found. Download")
    zip_save_path = 'quora-IR-dataset.zip'
    util.http_get(url='https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/quora-IR-dataset.zip', path=zip_save_path)
    with ZipFile(zip_save_path, 'r') as zipIn:
        zipIn.extractall(dataset_path)


######### Read train data  ##########
train_samples = []
with open(os.path.join(dataset_path, "classification/train_pairs.tsv"), encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        sample = InputExample(texts=[row['question1'], row['question2']], label=int(row['is_duplicate']))
        train_samples.append(sample)


train_dataset = SentencesDataset(train_samples, model=model)
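
With the train samples loaded, this contrastive setup is usually completed by wrapping them in a DataLoader and training with OnlineContrastiveLoss. A minimal sketch; batch size, epochs and warmup steps are assumed values, and model is the SentenceTransformer created earlier in the full script.

from torch.utils.data import DataLoader
from sentence_transformers import losses

train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=64)
train_loss = losses.OnlineContrastiveLoss(model=model,
                                          distance_metric=distance_metric,
                                          margin=margin)

model.fit(train_objectives=[(train_dataloader, train_loss)],
          epochs=1,
          warmup_steps=100,
          output_path=model_save_path)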
Example #4
max_seq_length = 75

################# Download AskUbuntu and extract training corpus  #################
askubuntu_folder = 'askubuntu'
output_path = 'output/train_askubuntu_ct-improved-{}-{}-{}'.format(
    model_name, batch_size,
    datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))

## Download the AskUbuntu dataset from https://github.com/taolei87/askubuntu
for filename in [
        'text_tokenized.txt.gz', 'dev.txt', 'test.txt', 'train_random.txt'
]:
    filepath = os.path.join(askubuntu_folder, filename)
    if not os.path.exists(filepath):
        util.http_get(
            'https://github.com/taolei87/askubuntu/raw/master/' + filename,
            filepath)

# Read the corpus
corpus = {}
dev_test_ids = set()
with gzip.open(os.path.join(askubuntu_folder, 'text_tokenized.txt.gz'),
               'rt',
               encoding='utf8') as fIn:
    for line in fIn:
        splits = line.strip().split("\t")
        id = splits[0]
        title = splits[1]
        corpus[id] = title

Example #5
model = SentenceTransformer(model_name)

url = "http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv"
dataset_path = "quora_duplicate_questions.tsv"
max_corpus_size = 100000

embedding_cache_path = 'quora-embeddings-{}-size-{}.pkl'.format(
    model_name.replace('/', '_'), max_corpus_size)

#Check if embedding cache path exists
if not os.path.exists(embedding_cache_path):
    # Check if the dataset exists. If not, download it
    if not os.path.exists(dataset_path):
        print("Download dataset")
        util.http_get(url, dataset_path)

    # Get all unique sentences from the file
    corpus_sentences = set()
    with open(dataset_path, encoding='utf8') as fIn:
        reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_MINIMAL)
        for row in reader:
            corpus_sentences.add(row['question1'])
            if len(corpus_sentences) >= max_corpus_size:
                break

            corpus_sentences.add(row['question2'])
            if len(corpus_sentences) >= max_corpus_size:
                break

    corpus_sentences = list(corpus_sentences)
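
    # Continuation sketch (assumed, not the verbatim original): encode the corpus once
    # and cache the embeddings under embedding_cache_path; pickle is imported here to
    # keep the sketch self-contained.
    import pickle

    print("Encode the corpus. This might take a while")
    corpus_embeddings = model.encode(corpus_sentences, show_progress_bar=True, convert_to_numpy=True)

    print("Store file on disc")
    with open(embedding_cache_path, "wb") as fOut:
        pickle.dump({'sentences': corpus_sentences, 'embeddings': corpus_embeddings}, fOut)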
Example #6
As model, we use SPECTER (https://github.com/allenai/specter), which encodes paper titles and abstracts
into a vector space.

We can then use util.semantic_search() to find the most similar papers.

Colab example: https://colab.research.google.com/drive/12hfBveGHRsxhPIUMmJYrll2lFU4fOX06
"""
import json
import os
from sentence_transformers import SentenceTransformer, util

#First, we load the papers dataset (with title and abstract information)
dataset_file = 'emnlp2016-2018.json'

if not os.path.exists(dataset_file):
    util.http_get("https://sbert.net/datasets/emnlp2016-2018.json",
                  dataset_file)

with open(dataset_file) as fIn:
    papers = json.load(fIn)

print(len(papers), "papers loaded")

#We then load the allenai-specter model with SentenceTransformers
model = SentenceTransformer('allenai-specter')

#To encode the papers, we combine the title and the abstract into a single string
paper_texts = [
    paper['title'] + '[SEP]' + paper['abstract'] for paper in papers
]

#Compute embeddings for all papers
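
Following the comment above, a minimal sketch of encoding the papers and querying them with util.semantic_search(); the query title and abstract below are made up for illustration.

corpus_embeddings = model.encode(paper_texts, convert_to_tensor=True, show_progress_bar=True)

# Hypothetical query paper: title and abstract are combined the same way as the corpus
query = "BERT: Pre-training of Deep Bidirectional Transformers[SEP]We introduce a new language representation model ..."
query_embedding = model.encode(query, convert_to_tensor=True)

search_hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=5)[0]
for hit in search_hits:
    related_paper = papers[hit['corpus_id']]
    print("{:.2f}\t{}".format(hit['score'], related_paper['title']))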
Example #7
translated_qids = set()
if os.path.exists(output_filename):
    with open(output_filename, 'r', encoding='utf8') as fIn:
        for line in fIn:
            splits = line.strip().split("\t")
            translated_qids.add(splits[0])

### Now we read the MS Marco dataset
data_folder = '../msmarco-data'
os.makedirs(data_folder, exist_ok=True)

# Read qrels file for relevant positives per query
train_queries = {}
qrels_train = os.path.join(data_folder, 'qrels.train.tsv')
if not os.path.exists(qrels_train):
    util.http_get('https://msmarco.blob.core.windows.net/msmarcoranking/qrels.train.tsv', qrels_train)

with open(qrels_train) as fIn:
    for line in fIn:
        qid, _, pid, _ = line.strip().split()
        if qid not in translated_qids:
            train_queries[qid] = None

# Read all queries
queries_filepath = os.path.join(data_folder, 'queries.train.tsv')
if not os.path.exists(queries_filepath):
    tar_filepath = os.path.join(data_folder, 'queries.tar.gz')
    if not os.path.exists(tar_filepath):
        logging.info("Download queries.tar.gz")
        util.http_get('https://msmarco.blob.core.windows.net/msmarcoranking/queries.tar.gz', tar_filepath)
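    # Continuation sketch (assumed, not the verbatim original): extract the archive and
    # load the text of every query that still needs a translation. Assumes tarfile is
    # imported at the top of the script.
    with tarfile.open(tar_filepath, "r:gz") as tar:
        tar.extractall(path=data_folder)

with open(queries_filepath, 'r', encoding='utf8') as fIn:
    for line in fIn:
        qid, query = line.strip().split("\t")
        if qid in train_queries:
            train_queries[qid] = query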
Example #8
#### /print debug information to stdout

#You can specify any huggingface/transformers pre-trained model here, for example, bert-base-uncased, roberta-base, xlm-roberta-base
model_name = sys.argv[1] if len(sys.argv) > 1 else 'bert-base-uncased'
batch_size = 16
num_epochs = 1
max_seq_length = 128
use_cuda = torch.cuda.is_available()

###### Read Datasets ######
sts_dataset_path = 'datasets/stsbenchmark.tsv.gz'
qqp_dataset_path = 'quora-IR-dataset'

# Check if the STSb dataset exists. If not, download it
if not os.path.exists(sts_dataset_path):
    util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz',
                  sts_dataset_path)

# Check if the QQP dataset exists. If not, download and extract
if not os.path.exists(qqp_dataset_path):
    logging.info("Dataset not found. Download")
    zip_save_path = 'quora-IR-dataset.zip'
    util.http_get(url='https://sbert.net/datasets/quora-IR-dataset.zip',
                  path=zip_save_path)
    with ZipFile(zip_save_path, 'r') as zipIn:
        zipIn.extractall(qqp_dataset_path)

cross_encoder_path = 'output/cross-encoder/stsb_indomain_' + model_name.replace(
    "/", "-") + '-' + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
bi_encoder_path = 'output/bi-encoder/qqp_cross_domain_' + model_name.replace(
    "/", "-") + '-' + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
Example #9
inference_batch_size = 64
train_batch_size = 64

#We use AllNLI as a source of sentences for the distillation
nli_dataset_path = 'datasets/AllNLI.tsv.gz'

#Further, we use sentences extracted from the English Wikipedia to train the distillation
wikipedia_dataset_path = 'datasets/wikipedia-en-sentences.txt.gz'

#We use the STS benchmark dataset to see how much performance we lose
sts_dataset_path = 'datasets/stsbenchmark.tsv.gz'

#Download datasets if needed
if not os.path.exists(nli_dataset_path):
    util.http_get('https://sbert.net/datasets/AllNLI.tsv.gz', nli_dataset_path)

if not os.path.exists(wikipedia_dataset_path):
    util.http_get('https://sbert.net/datasets/wikipedia-en-sentences.txt.gz',
                  wikipedia_dataset_path)

if not os.path.exists(sts_dataset_path):
    util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz',
                  sts_dataset_path)

#We need sentences to train our distillation. Here, we use sentences from AllNLI and from Wikipedia
train_sentences_nli = set()
dev_sentences_nli = set()

train_sentences_wikipedia = []
dev_sentences_wikipedia = []
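
The sets and lists above are then filled from the two downloaded files. A minimal sketch, assuming gzip and csv are available and that we simply reserve the first few thousand sentences of each source for the dev sets; the 5000-sentence split is an assumption.

import csv
import gzip

with gzip.open(nli_dataset_path, 'rt', encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        if len(dev_sentences_nli) < 5000:
            dev_sentences_nli.add(row['sentence1'])
            dev_sentences_nli.add(row['sentence2'])
        else:
            train_sentences_nli.add(row['sentence1'])
            train_sentences_nli.add(row['sentence2'])

with gzip.open(wikipedia_dataset_path, 'rt', encoding='utf8') as fIn:
    for line in fIn:
        line = line.strip()
        if len(dev_sentences_wikipedia) < 5000:
            dev_sentences_wikipedia.append(line)
        else:
            train_sentences_wikipedia.append(line)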
Example #10
import logging
import os
from datetime import datetime
from zipfile import ZipFile
from sentence_transformers import LoggingHandler, models, util

logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])

#You can specify any huggingface/transformers pre-trained model here, for example, bert-base-uncased, roberta-base, xlm-roberta-base
model_name = 'distilbert-base-uncased'

dataset_path = 'datasets/wikipedia-sections'
if not os.path.exists(dataset_path):
    os.makedirs(dataset_path, exist_ok=True)
    filepath = os.path.join(dataset_path, 'wikipedia-sections-triplets.zip')
    util.http_get('https://sbert.net/datasets/wikipedia-sections-triplets.zip',
                  filepath)
    with ZipFile(filepath, 'r') as zipIn:
        zipIn.extractall(dataset_path)

### Create a torch.DataLoader that passes training batch instances to our model
train_batch_size = 16
output_path = "output/training-wikipedia-sections-" + model_name + "-" + datetime.now(
).strftime("%Y-%m-%d_%H-%M-%S")
num_epochs = 1

### Configure sentence transformers for training and train on the provided dataset
# Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
word_embedding_model = models.Transformer(model_name)

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(
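
The snippet is cut off inside the Pooling call. One plausible way to finish the setup and consume the wikipedia-sections triplets is with InputExample triplets and TripletLoss; a minimal sketch, in which the training file name, its column names, and all hyperparameters are assumptions.

import csv
from torch.utils.data import DataLoader
from sentence_transformers import InputExample, SentenceTransformer, losses

pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

train_examples = []
with open(os.path.join(dataset_path, 'train.csv'), encoding='utf8') as fIn:  # assumed file name
    reader = csv.DictReader(fIn)
    for row in reader:  # assumed column names
        train_examples.append(InputExample(texts=[row['Sentence1'], row['Sentence2'], row['Sentence3']]))

train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=train_batch_size)
train_loss = losses.TripletLoss(model=model)

model.fit(train_objectives=[(train_dataloader, train_loss)],
          epochs=num_epochs,
          warmup_steps=100,
          output_path=output_path)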
Example #11
# Training parameters
model_name = 'distilroberta-base'
train_batch_size = 128
num_epochs = 1

# Save path to store our model
model_save_path = 'output/training_stsb_simcse-{}-{}-{}'.format(
    model_name, train_batch_size,
    datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))

# Check if the dataset exists. If not, download it
sts_dataset_path = 'datasets/stsbenchmark.tsv.gz'

if not os.path.exists(sts_dataset_path):
    util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz',
                  sts_dataset_path)

# Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
word_embedding_model = models.Transformer(model_name, max_seq_length=32)

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension())
dense = models.Dense(pooling_model.get_sentence_embedding_dimension(),
                     pooling_model.get_sentence_embedding_dimension())
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# We use 1 Million sentences from Wikipedia to train our model
wikipedia_dataset_path = 'datasets/wiki1m_for_simcse.txt'
if not os.path.exists(wikipedia_dataset_path):
    util.http_get(
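
The download call above is truncated. For SimCSE-style training, the Wikipedia sentences are normally turned into (sentence, sentence) pairs and trained with MultipleNegativesRankingLoss, where dropout inside the transformer produces two different views of the same sentence; a minimal sketch, with the length filter and all hyperparameters as assumptions and the STSb evaluation omitted.

from torch.utils.data import DataLoader
from sentence_transformers import InputExample, losses

# Pair each sentence with itself
train_samples = []
with open(wikipedia_dataset_path, 'r', encoding='utf8') as fIn:
    for line in fIn:
        line = line.strip()
        if len(line) >= 10:
            train_samples.append(InputExample(texts=[line, line]))

train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
train_loss = losses.MultipleNegativesRankingLoss(model)

model.fit(train_objectives=[(train_dataloader, train_loss)],
          epochs=num_epochs,
          warmup_steps=100,
          output_path=model_save_path)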
Example #12
model = CrossEncoder(model_name, num_labels=1, max_length=512, default_activation_function=torch.nn.Identity())


### Now we read the MS Marco dataset
data_folder = 'msmarco-data'
os.makedirs(data_folder, exist_ok=True)


#### Read the corpus file, which contains all the passages. Store them in the corpus dict
corpus = {}
collection_filepath = os.path.join(data_folder, 'collection.tsv')
if not os.path.exists(collection_filepath):
    tar_filepath = os.path.join(data_folder, 'collection.tar.gz')
    if not os.path.exists(tar_filepath):
        logging.info("Download collection.tar.gz")
        util.http_get('https://msmarco.blob.core.windows.net/msmarcoranking/collection.tar.gz', tar_filepath)

    with tarfile.open(tar_filepath, "r:gz") as tar:
        tar.extractall(path=data_folder)

with open(collection_filepath, 'r', encoding='utf8') as fIn:
    for line in fIn:
        pid, passage = line.strip().split("\t")
        corpus[pid] = passage


### Read the train queries, store in queries dict
queries = {}
queries_filepath = os.path.join(data_folder, 'queries.train.tsv')
if not os.path.exists(queries_filepath):
    tar_filepath = os.path.join(data_folder, 'queries.tar.gz')
Example #13
import gzip
import logging
import os
import sys
from collections import defaultdict

import numpy as np
import pytrec_eval
import tqdm
from sentence_transformers import SentenceTransformer, util, CrossEncoder

data_folder = 'trec2019-data'
os.makedirs(data_folder, exist_ok=True)

#Read test queries
queries = {}
queries_filepath = os.path.join(data_folder, 'msmarco-test2019-queries.tsv.gz')
if not os.path.exists(queries_filepath):
    logging.info("Download "+os.path.basename(queries_filepath))
    util.http_get('https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-test2019-queries.tsv.gz', queries_filepath)

with gzip.open(queries_filepath, 'rt', encoding='utf8') as fIn:
    for line in fIn:
        qid, query = line.strip().split("\t")
        queries[qid] = query

#Read which passages are relevant
relevant_docs = defaultdict(lambda: defaultdict(int))
qrels_filepath = os.path.join(data_folder, '2019qrels-pass.txt')

if not os.path.exists(qrels_filepath):
    logging.info("Download "+os.path.basename(qrels_filepath))
    util.http_get('https://trec.nist.gov/data/deep/2019qrels-pass.txt', qrels_filepath)
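
The qrels file uses the standard TREC format (query id, an unused column, passage id, relevance grade); a minimal sketch of filling relevant_docs from it follows.

with open(qrels_filepath) as fIn:
    for line in fIn:
        qid, _, pid, score = line.strip().split()
        relevant_docs[qid][pid] = int(score)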

Example #14
import gzip
import json
import os
from sentence_transformers import SentenceTransformer, CrossEncoder, util

#We use the Bi-Encoder to encode all passages, so that we can use it with semantic search
bi_encoder = SentenceTransformer('msmarco-distilroberta-base-v2')
top_k = 100  #Number of passages we want to retrieve with the bi-encoder

#The bi-encoder will retrieve 100 documents. We then use a cross-encoder to re-rank the results and improve the quality
cross_encoder = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-6')

# As dataset, we use Simple English Wikipedia. Compared to the full English Wikipedia, it has only
# about 170k articles. We split these articles into paragraphs and encode them with the bi-encoder

wikipedia_filepath = 'data/simplewiki-2020-11-01.jsonl.gz'

if not os.path.exists(wikipedia_filepath):
    util.http_get('http://sbert.net/datasets/simplewiki-2020-11-01.jsonl.gz',
                  wikipedia_filepath)

passages = []
with gzip.open(wikipedia_filepath, 'rt', encoding='utf8') as fIn:
    for line in fIn:
        data = json.loads(line.strip())
        paragraphs = data['text'].split("\n\n")
        for p in paragraphs:
            if len(p.strip()) > 20:
                passages.append(p.strip()[0:5000])

#If you like, you can also limit the number of passages you want to use
#passages = passages[0:50000]
print("Passages:", len(passages))

#Now we encode all passages we have in our Simple Wikipedia corpus
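
Encoding the corpus and answering a query then follows the retrieve & re-rank pattern described above; a minimal sketch, where the query string is made up and the encode arguments are assumptions.

corpus_embeddings = bi_encoder.encode(passages, convert_to_tensor=True, show_progress_bar=True)

query = "What is the capital of France?"  # hypothetical example query
question_embedding = bi_encoder.encode(query, convert_to_tensor=True)

# Retrieve candidate passages with the bi-encoder
hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k)[0]

# Re-rank the candidates with the cross-encoder
cross_inp = [[query, passages[hit['corpus_id']]] for hit in hits]
cross_scores = cross_encoder.predict(cross_inp)
for hit, score in zip(hits, cross_scores):
    hit['cross-score'] = score

hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
for hit in hits[0:5]:
    print("{:.2f}\t{}".format(hit['cross-score'], passages[hit['corpus_id']]))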
Example #15
#We set num_labels=1, which predicts a continuous score between 0 and 1
model = CrossEncoder(model_name, num_labels=1, max_length=512)

### Now we read the MS Marco dataset
data_folder = 'msmarco-data'
os.makedirs(data_folder, exist_ok=True)

#### Read the corpus file, which contains all the passages. Store them in the corpus dict
corpus = {}
collection_filepath = os.path.join(data_folder, 'collection.tsv')
if not os.path.exists(collection_filepath):
    tar_filepath = os.path.join(data_folder, 'collection.tar.gz')
    if not os.path.exists(tar_filepath):
        logging.info("Download collection.tar.gz")
        util.http_get(
            'https://msmarco.blob.core.windows.net/msmarcoranking/collection.tar.gz',
            tar_filepath)

    with tarfile.open(tar_filepath, "r:gz") as tar:
        tar.extractall(path=data_folder)

with open(collection_filepath, 'r', encoding='utf8') as fIn:
    for line in fIn:
        pid, passage = line.strip().split("\t")
        corpus[pid] = passage

### Read the train queries, store in queries dict
queries = {}
queries_filepath = os.path.join(data_folder, 'queries.train.tsv')
if not os.path.exists(queries_filepath):
    tar_filepath = os.path.join(data_folder, 'queries.tar.gz')
Example #16
distance_metric = losses.SiameseDistanceMetric.COSINE_DISTANCE

#Negative pairs should have a distance of at least 0.5
margin = 0.5

dataset_path = 'quora-IR-dataset'
model_save_path = 'output/training_multi-task-learning' + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

os.makedirs(model_save_path, exist_ok=True)

# Check if the dataset exists. If not, download and extract
if not os.path.exists(dataset_path):
    logger.info("Dataset not found. Download")
    zip_save_path = 'quora-IR-dataset.zip'
    util.http_get(url='https://sbert.net/datasets/quora-IR-dataset.zip',
                  path=zip_save_path)
    with ZipFile(zip_save_path, 'r') as zipIn:
        zipIn.extractall(dataset_path)

######### Read train data  ##########
train_samples_MultipleNegativesRankingLoss = []
train_samples_ConstrativeLoss = []

with open(os.path.join(dataset_path, "classification/train_pairs.tsv"),
          encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        train_samples_ConstrativeLoss.append(
            InputExample(texts=[row['question1'], row['question2']],
                         label=int(row['is_duplicate'])))
        if row['is_duplicate'] == '1':
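            # The original snippet is cut off here. A likely continuation (a sketch, not
            # the verbatim original): duplicate pairs are also added, in both directions,
            # as positives for MultipleNegativesRankingLoss.
            train_samples_MultipleNegativesRankingLoss.append(
                InputExample(texts=[row['question1'], row['question2']], label=1))
            train_samples_MultipleNegativesRankingLoss.append(
                InputExample(texts=[row['question2'], row['question1']], label=1))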
Example #17
import csv
import os
import random
from collections import defaultdict

from sentence_transformers import util

random.seed(42)

#Get raw file
source_file = "quora-IR-dataset/quora_duplicate_questions.tsv"
os.makedirs('quora-IR-dataset', exist_ok=True)
os.makedirs('quora-IR-dataset/graph', exist_ok=True)
os.makedirs('quora-IR-dataset/information-retrieval', exist_ok=True)
os.makedirs('quora-IR-dataset/classification', exist_ok=True)
os.makedirs('quora-IR-dataset/duplicate-mining', exist_ok=True)

if not os.path.exists(source_file):
    print("Download file to", source_file)
    util.http_get('http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv',
                  source_file)

#Read pairwise file
sentences = {}
duplicates = defaultdict(lambda: defaultdict(bool))
rows = []
with open(source_file, encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_MINIMAL)
    for row in reader:
        id1 = row['qid1']
        id2 = row['qid2']
        question1 = row['question1'].replace("\r", "").replace("\n", " ").replace("\t", " ")
        question2 = row['question2'].replace("\r", "").replace("\n", " ").replace("\t", " ")