#Limit torch to 4 threads
torch.set_num_threads(4)

model_name = sys.argv[1] if len(sys.argv) > 1 else 'bert-base-nli-mean-tokens'

# Load a named sentence model (based on BERT). This will download the model from our server.
# Alternatively, you can also pass a filepath to SentenceTransformer()
model = SentenceTransformer(model_name)

nli_dataset_path = 'datasets/AllNLI.tsv.gz'
sentences = set()
max_sentences = 100000

#Download datasets if needed
if not os.path.exists(nli_dataset_path):
    util.http_get('https://sbert.net/datasets/AllNLI.tsv.gz', nli_dataset_path)

with gzip.open(nli_dataset_path, 'rt', encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        sentences.add(row['sentence1'])
        if len(sentences) >= max_sentences:
            break

sentences = list(sentences)

print("Model Name:", model_name)
print("Number of sentences:", len(sentences))

for i in range(3):
    print("Run", i)
    start_time = time.time()
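    # A minimal sketch of how the timing run can continue; the batch size of 32 and the
    # throughput print-out are assumptions, not taken from the excerpt above.
    emb = model.encode(sentences, batch_size=32)
    elapsed = time.time() - start_time
    print("Done after {:.2f} sec. Speed: {:.2f} sentences / sec".format(elapsed, len(sentences) / elapsed))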
#### /print debug information to stdout

#Model for which we apply dimensionality reduction
model = SentenceTransformer('all-MiniLM-L6-v2')

#New size for the embeddings
new_dimension = 128

#We use AllNLI as a source of sentences to compute PCA
nli_dataset_path = 'datasets/AllNLI.tsv.gz'

#We use the STS benchmark dataset to see how much performance we lose by using the dimensionality reduction
sts_dataset_path = 'datasets/stsbenchmark.tsv.gz'

if not os.path.exists(nli_dataset_path):
    util.http_get('https://sbert.net/datasets/AllNLI.tsv.gz', nli_dataset_path)

if not os.path.exists(sts_dataset_path):
    util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz', sts_dataset_path)

# We measure the performance of the original model
# and later we will measure the performance with the reduced dimension size
logger.info("Read STSbenchmark test dataset")

eval_examples = []
with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        if row['split'] == 'test':
            score = float(row['score']) / 5.0  #Normalize score to range 0 ... 1
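# A minimal sketch of the PCA reduction step itself, assuming pca_sentences holds a sample of
# sentences read from the AllNLI file downloaded above: fit scikit-learn PCA on their embeddings
# and append a models.Dense projection whose weights are the PCA components.
import torch
from sklearn.decomposition import PCA
from sentence_transformers import models

pca_embeddings = model.encode(pca_sentences, convert_to_numpy=True)  # pca_sentences: assumed list of AllNLI sentences
pca = PCA(n_components=new_dimension)
pca.fit(pca_embeddings)

dense = models.Dense(in_features=model.get_sentence_embedding_dimension(),
                     out_features=new_dimension,
                     bias=False,
                     activation_function=torch.nn.Identity())
dense.linear.weight = torch.nn.Parameter(torch.tensor(pca.components_, dtype=torch.float32))
model.add_module('dense', dense)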
#As distance metric, we use cosine distance (cosine_distance = 1 - cosine_similarity)
distance_metric = losses.SiameseDistanceMetric.COSINE_DISTANCE

#Negative pairs should have a distance of at least 0.5
margin = 0.5

dataset_path = 'quora-IR-dataset'
model_save_path = 'output/training_OnlineConstrativeLoss-' + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

os.makedirs(model_save_path, exist_ok=True)

# Check if the dataset exists. If not, download and extract
if not os.path.exists(dataset_path):
    logging.info("Dataset not found. Download")
    zip_save_path = 'quora-IR-dataset.zip'
    util.http_get(url='https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/quora-IR-dataset.zip', path=zip_save_path)
    with ZipFile(zip_save_path, 'r') as zipIn:
        zipIn.extractall(dataset_path)

######### Read train data ##########
train_samples = []
with open(os.path.join(dataset_path, "classification/train_pairs.tsv"), encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        sample = InputExample(texts=[row['question1'], row['question2']], label=int(row['is_duplicate']))
        train_samples.append(sample)

train_dataset = SentencesDataset(train_samples, model=model)
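# A minimal training sketch for this setup; the batch size, epoch count and warmup steps below
# are assumptions, not values from the excerpt above.
from torch.utils.data import DataLoader

train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=64)
train_loss = losses.OnlineContrastiveLoss(model=model, distance_metric=distance_metric, margin=margin)

model.fit(train_objectives=[(train_dataloader, train_loss)],
          epochs=1,
          warmup_steps=1000,
          output_path=model_save_path)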
max_seq_length = 75

################# Download AskUbuntu and extract training corpus #################
askubuntu_folder = 'askubuntu'
output_path = 'output/train_askubuntu_ct-improved-{}-{}-{}'.format(model_name, batch_size, datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))

## Download the AskUbuntu dataset from https://github.com/taolei87/askubuntu
for filename in ['text_tokenized.txt.gz', 'dev.txt', 'test.txt', 'train_random.txt']:
    filepath = os.path.join(askubuntu_folder, filename)
    if not os.path.exists(filepath):
        util.http_get('https://github.com/taolei87/askubuntu/raw/master/' + filename, filepath)

# Read the corpus
corpus = {}
dev_test_ids = set()
with gzip.open(os.path.join(askubuntu_folder, 'text_tokenized.txt.gz'), 'rt', encoding='utf8') as fIn:
    for line in fIn:
        splits = line.strip().split("\t")
        id = splits[0]
        title = splits[1]
        corpus[id] = title
model = SentenceTransformer(model_name)

url = "http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv"
dataset_path = "quora_duplicate_questions.tsv"
max_corpus_size = 100000

embedding_cache_path = 'quora-embeddings-{}-size-{}.pkl'.format(model_name.replace('/', '_'), max_corpus_size)

#Check if embedding cache path exists
if not os.path.exists(embedding_cache_path):
    # Check if the dataset exists. If not, download it
    if not os.path.exists(dataset_path):
        print("Download dataset")
        util.http_get(url, dataset_path)

    # Get all unique sentences from the file
    corpus_sentences = set()
    with open(dataset_path, encoding='utf8') as fIn:
        reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_MINIMAL)
        for row in reader:
            corpus_sentences.add(row['question1'])
            if len(corpus_sentences) >= max_corpus_size:
                break

            corpus_sentences.add(row['question2'])
            if len(corpus_sentences) >= max_corpus_size:
                break

    corpus_sentences = list(corpus_sentences)
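    # A minimal sketch of how the cache is typically filled and reused; pickle as the
    # serialization format is an assumption based on the .pkl file name above.
    import pickle

    print("Encode the corpus. This might take a while")
    corpus_embeddings = model.encode(corpus_sentences, show_progress_bar=True, convert_to_tensor=True)

    with open(embedding_cache_path, "wb") as fOut:
        pickle.dump({'sentences': corpus_sentences, 'embeddings': corpus_embeddings}, fOut)
else:
    import pickle

    print("Load pre-computed embeddings from disc")
    with open(embedding_cache_path, "rb") as fIn:
        cache_data = pickle.load(fIn)
        corpus_sentences = cache_data['sentences']
        corpus_embeddings = cache_data['embeddings']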
As model, we use SPECTER (https://github.com/allenai/specter), which encodes paper titles and abstracts
into a vector space. We can then use util.semantic_search() to find the most similar papers.

Colab example: https://colab.research.google.com/drive/12hfBveGHRsxhPIUMmJYrll2lFU4fOX06
"""
import json
import os

from sentence_transformers import SentenceTransformer, util

#First, we load the papers dataset (with title and abstract information)
dataset_file = 'emnlp2016-2018.json'

if not os.path.exists(dataset_file):
    util.http_get("https://sbert.net/datasets/emnlp2016-2018.json", dataset_file)

with open(dataset_file) as fIn:
    papers = json.load(fIn)

print(len(papers), "papers loaded")

#We then load the allenai-specter model with SentenceTransformers
model = SentenceTransformer('allenai-specter')

#To encode the papers, we must combine the title and the abstracts to a single string
paper_texts = [paper['title'] + '[SEP]' + paper['abstract'] for paper in papers]

#Compute embeddings for all papers
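# A minimal sketch of the encode-and-search step; the example query string below is an
# assumption, chosen only to illustrate util.semantic_search().
corpus_embeddings = model.encode(paper_texts, convert_to_tensor=True, show_progress_bar=True)

query = "BERT [SEP] We present a new language representation model for transfer learning."  # assumed example query
query_embedding = model.encode(query, convert_to_tensor=True)

search_hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=5)[0]
for hit in search_hits:
    print("{:.2f}\t{}".format(hit['score'], papers[hit['corpus_id']]['title']))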
translated_qids = set()
if os.path.exists(output_filename):
    with open(output_filename, 'r', encoding='utf8') as fIn:
        for line in fIn:
            splits = line.strip().split("\t")
            translated_qids.add(splits[0])

### Now we read the MS Marco dataset
data_folder = '../msmarco-data'
os.makedirs(data_folder, exist_ok=True)

# Read qrels file for relevant positives per query
train_queries = {}
qrels_train = os.path.join(data_folder, 'qrels.train.tsv')
if not os.path.exists(qrels_train):
    util.http_get('https://msmarco.blob.core.windows.net/msmarcoranking/qrels.train.tsv', qrels_train)

with open(qrels_train) as fIn:
    for line in fIn:
        qid, _, pid, _ = line.strip().split()
        if qid not in translated_qids:
            train_queries[qid] = None

# Read all queries
queries_filepath = os.path.join(data_folder, 'queries.train.tsv')
if not os.path.exists(queries_filepath):
    tar_filepath = os.path.join(data_folder, 'queries.tar.gz')
    if not os.path.exists(tar_filepath):
        logging.info("Download queries.tar.gz")
        util.http_get('https://msmarco.blob.core.windows.net/msmarcoranking/queries.tar.gz', tar_filepath)
#### /print debug information to stdout

#You can specify any huggingface/transformers pre-trained model here, for example, bert-base-uncased, roberta-base, xlm-roberta-base
model_name = sys.argv[1] if len(sys.argv) > 1 else 'bert-base-uncased'

batch_size = 16
num_epochs = 1
max_seq_length = 128
use_cuda = torch.cuda.is_available()

###### Read Datasets ######
sts_dataset_path = 'datasets/stsbenchmark.tsv.gz'
qqp_dataset_path = 'quora-IR-dataset'

# Check if the STSb dataset exists. If not, download and extract it
if not os.path.exists(sts_dataset_path):
    util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz', sts_dataset_path)

# Check if the QQP dataset exists. If not, download and extract
if not os.path.exists(qqp_dataset_path):
    logging.info("Dataset not found. Download")
    zip_save_path = 'quora-IR-dataset.zip'
    util.http_get(url='https://sbert.net/datasets/quora-IR-dataset.zip', path=zip_save_path)
    with ZipFile(zip_save_path, 'r') as zipIn:
        zipIn.extractall(qqp_dataset_path)

cross_encoder_path = 'output/cross-encoder/stsb_indomain_' + model_name.replace("/", "-") + '-' + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
bi_encoder_path = 'output/bi-encoder/qqp_cross_domain_' + model_name.replace("/", "-") + '-' + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
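# A hedged sketch of what these two output paths are used for in the Augmented SBERT
# cross-domain recipe: a cross-encoder trained on STSb (and saved to cross_encoder_path)
# softly labels QQP question pairs, and those silver pairs then train the bi-encoder.
# qqp_pairs below is an assumption: a list of (question1, question2) tuples from the QQP files.
from sentence_transformers import CrossEncoder, InputExample

cross_encoder = CrossEncoder(cross_encoder_path)  # assumes a cross-encoder was already trained and saved there
silver_scores = cross_encoder.predict(qqp_pairs)  # soft similarity scores for the unlabeled pairs
silver_samples = [InputExample(texts=[q1, q2], label=float(score))
                  for (q1, q2), score in zip(qqp_pairs, silver_scores)]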
inference_batch_size = 64
train_batch_size = 64

#We use AllNLI as a source of sentences for the distillation
nli_dataset_path = 'datasets/AllNLI.tsv.gz'

#Further, we use sentences extracted from the English Wikipedia to train the distillation
wikipedia_dataset_path = 'datasets/wikipedia-en-sentences.txt.gz'

#We use the STS benchmark dataset to see how much performance we lose
sts_dataset_path = 'datasets/stsbenchmark.tsv.gz'

#Download datasets if needed
if not os.path.exists(nli_dataset_path):
    util.http_get('https://sbert.net/datasets/AllNLI.tsv.gz', nli_dataset_path)

if not os.path.exists(wikipedia_dataset_path):
    util.http_get('https://sbert.net/datasets/wikipedia-en-sentences.txt.gz', wikipedia_dataset_path)

if not os.path.exists(sts_dataset_path):
    util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz', sts_dataset_path)

#We need sentences to train our distillation. Here, we use sentences from AllNLI and from Wikipedia
train_sentences_nli = set()
dev_sentences_nli = set()

train_sentences_wikipedia = []
dev_sentences_wikipedia = []
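# A hedged sketch of the distillation step these sentence lists feed into, assuming teacher_model
# and student_model are SentenceTransformer instances defined elsewhere in the script: the student
# is trained to reproduce the teacher's embeddings with an MSE objective.
from torch.utils.data import DataLoader
from sentence_transformers import losses
from sentence_transformers.datasets import ParallelSentencesDataset

train_data = ParallelSentencesDataset(student_model=student_model, teacher_model=teacher_model,
                                      batch_size=inference_batch_size, use_embedding_cache=False)
train_data.add_dataset([[sent] for sent in train_sentences_nli], max_sentence_length=256)
train_data.add_dataset([[sent] for sent in train_sentences_wikipedia], max_sentence_length=256)

train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size)
train_loss = losses.MSELoss(model=student_model)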
import logging
import os
from datetime import datetime
from zipfile import ZipFile

from sentence_transformers import LoggingHandler, models, util

logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])

#You can specify any huggingface/transformers pre-trained model here, for example, bert-base-uncased, roberta-base, xlm-roberta-base
model_name = 'distilbert-base-uncased'

dataset_path = 'datasets/wikipedia-sections'
if not os.path.exists(dataset_path):
    os.makedirs(dataset_path, exist_ok=True)
    filepath = os.path.join(dataset_path, 'wikipedia-sections-triplets.zip')
    util.http_get('https://sbert.net/datasets/wikipedia-sections-triplets.zip', filepath)
    with ZipFile(filepath, 'r') as zipIn:
        zipIn.extractall(dataset_path)

### Create a torch.DataLoader that passes training batch instances to our model
train_batch_size = 16
output_path = "output/training-wikipedia-sections-" + model_name + "-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
num_epochs = 1

### Configure sentence transformers for training and train on the provided dataset
# Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
word_embedding_model = models.Transformer(model_name)

# Apply mean pooling to get one fixed-sized sentence vector
pooling_model = models.Pooling(
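# A hedged sketch of how training on the extracted triplets typically proceeds once the Pooling
# module above is completed and the modules are combined into a SentenceTransformer. The triplet
# file name and its column names below are assumptions.
import csv
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

train_examples = []
with open(os.path.join(dataset_path, 'train.csv'), encoding='utf8') as fIn:  # assumed file name
    reader = csv.DictReader(fIn)
    for row in reader:
        # assumed column names for anchor / positive / negative
        train_examples.append(InputExample(texts=[row['Sentence1'], row['Sentence2'], row['Sentence3']]))

train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=train_batch_size)
train_loss = losses.TripletLoss(model=model)

model.fit(train_objectives=[(train_dataloader, train_loss)],
          epochs=num_epochs,
          warmup_steps=100,
          output_path=output_path)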
# Training parameters
model_name = 'distilroberta-base'
train_batch_size = 128
num_epochs = 1

# Save path to store our model
model_save_path = 'output/training_stsb_simcse-{}-{}-{}'.format(model_name, train_batch_size, datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))

# Check if dataset exists. If not, download and extract it
sts_dataset_path = 'datasets/stsbenchmark.tsv.gz'
if not os.path.exists(sts_dataset_path):
    util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz', sts_dataset_path)

# Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
word_embedding_model = models.Transformer(model_name, max_seq_length=32)

# Apply mean pooling to get one fixed-sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())

# Note: this Dense module is defined here but not included in the modules list below
dense = models.Dense(pooling_model.get_sentence_embedding_dimension(), pooling_model.get_sentence_embedding_dimension())

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# We use 1 Million sentences from Wikipedia to train our model
wikipedia_dataset_path = 'datasets/wiki1m_for_simcse.txt'
if not os.path.exists(wikipedia_dataset_path):
    util.http_get(
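# A minimal SimCSE training sketch using the Wikipedia sentences downloaded above: each sentence
# is paired with itself, so dropout noise provides the positive pair. The warmup step count is an
# assumption.
from torch.utils.data import DataLoader
from sentence_transformers import InputExample, losses

train_samples = []
with open(wikipedia_dataset_path, 'r', encoding='utf8') as fIn:
    for line in fIn:
        sentence = line.strip()
        if sentence:
            train_samples.append(InputExample(texts=[sentence, sentence]))

train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
train_loss = losses.MultipleNegativesRankingLoss(model)

model.fit(train_objectives=[(train_dataloader, train_loss)],
          epochs=num_epochs,
          warmup_steps=100,
          output_path=model_save_path)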
model = CrossEncoder(model_name, num_labels=1, max_length=512, default_activation_function=torch.nn.Identity())

### Now we read the MS Marco dataset
data_folder = 'msmarco-data'
os.makedirs(data_folder, exist_ok=True)

#### Read the corpus file, which contains all the passages. Store them in the corpus dict
corpus = {}
collection_filepath = os.path.join(data_folder, 'collection.tsv')
if not os.path.exists(collection_filepath):
    tar_filepath = os.path.join(data_folder, 'collection.tar.gz')
    if not os.path.exists(tar_filepath):
        logging.info("Download collection.tar.gz")
        util.http_get('https://msmarco.blob.core.windows.net/msmarcoranking/collection.tar.gz', tar_filepath)

    with tarfile.open(tar_filepath, "r:gz") as tar:
        tar.extractall(path=data_folder)

with open(collection_filepath, 'r', encoding='utf8') as fIn:
    for line in fIn:
        pid, passage = line.strip().split("\t")
        corpus[pid] = passage

### Read the train queries, store in queries dict
queries = {}
queries_filepath = os.path.join(data_folder, 'queries.train.tsv')
if not os.path.exists(queries_filepath):
    tar_filepath = os.path.join(data_folder, 'queries.tar.gz')
import gzip
import logging
import os
import sys
from collections import defaultdict

import numpy as np
import pytrec_eval
import tqdm
from sentence_transformers import SentenceTransformer, util, CrossEncoder

data_folder = 'trec2019-data'
os.makedirs(data_folder, exist_ok=True)

#Read test queries
queries = {}
queries_filepath = os.path.join(data_folder, 'msmarco-test2019-queries.tsv.gz')
if not os.path.exists(queries_filepath):
    logging.info("Download " + os.path.basename(queries_filepath))
    util.http_get('https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-test2019-queries.tsv.gz', queries_filepath)

with gzip.open(queries_filepath, 'rt', encoding='utf8') as fIn:
    for line in fIn:
        qid, query = line.strip().split("\t")
        queries[qid] = query

#Read which passages are relevant
relevant_docs = defaultdict(lambda: defaultdict(int))
qrels_filepath = os.path.join(data_folder, '2019qrels-pass.txt')

if not os.path.exists(qrels_filepath):
    logging.info("Download " + os.path.basename(qrels_filepath))
    util.http_get('https://trec.nist.gov/data/deep/2019qrels-pass.txt', qrels_filepath)
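# A hedged sketch of how the qrels can be scored with pytrec_eval, assuming relevant_docs has
# been filled from the qrels file and run_scores is a dict qid -> {pid: score} produced by the
# system being evaluated (run_scores is an assumption, not defined in the excerpt above).
qrels = {qid: dict(pid2rel) for qid, pid2rel in relevant_docs.items()}
evaluator = pytrec_eval.RelevanceEvaluator(qrels, {'ndcg_cut.10'})
metrics = evaluator.evaluate(run_scores)
mean_ndcg = np.mean([query_metrics['ndcg_cut_10'] for query_metrics in metrics.values()])
print("NDCG@10: {:.4f}".format(mean_ndcg))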
import os

#We use the Bi-Encoder to encode all passages, so that we can use it with semantic search
bi_encoder = SentenceTransformer('msmarco-distilroberta-base-v2')
top_k = 100  #Number of passages we want to retrieve with the bi-encoder

#The bi-encoder will retrieve 100 documents. We use a cross-encoder, to re-rank the results list to improve the quality
cross_encoder = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-6')

# As dataset, we use Simple English Wikipedia. Compared to the full English wikipedia, it has only
# about 170k articles. We split these articles into paragraphs and encode them with the bi-encoder
wikipedia_filepath = 'data/simplewiki-2020-11-01.jsonl.gz'

if not os.path.exists(wikipedia_filepath):
    util.http_get('http://sbert.net/datasets/simplewiki-2020-11-01.jsonl.gz', wikipedia_filepath)

passages = []
with gzip.open(wikipedia_filepath, 'rt', encoding='utf8') as fIn:
    for line in fIn:
        data = json.loads(line.strip())
        paragraphs = data['text'].split("\n\n")
        for p in paragraphs:
            if len(p.strip()) > 20:
                passages.append(p.strip()[0:5000])

#If you like, you can also limit the number of passages you want to use
#passages = passages[0:50000]

print("Passages:", len(passages))

#Now we encode all passages we have in our Simple Wikipedia corpus
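# A minimal retrieve & re-rank sketch for a single query; the example query string is an assumption.
corpus_embeddings = bi_encoder.encode(passages, convert_to_tensor=True, show_progress_bar=True)

query = "What is the capital of France?"  # assumed example query
question_embedding = bi_encoder.encode(query, convert_to_tensor=True)

# Retrieve top_k candidates with the bi-encoder, then re-score them with the cross-encoder
hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k)[0]
cross_inp = [[query, passages[hit['corpus_id']]] for hit in hits]
cross_scores = cross_encoder.predict(cross_inp)

for hit, score in sorted(zip(hits, cross_scores), key=lambda x: x[1], reverse=True)[:5]:
    print("{:.2f}\t{}".format(score, passages[hit['corpus_id']]))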
#We set num_labels=1, which predicts a continuous score between 0 and 1
model = CrossEncoder(model_name, num_labels=1, max_length=512)

### Now we read the MS Marco dataset
data_folder = 'msmarco-data'
os.makedirs(data_folder, exist_ok=True)

#### Read the corpus file, which contains all the passages. Store them in the corpus dict
corpus = {}
collection_filepath = os.path.join(data_folder, 'collection.tsv')
if not os.path.exists(collection_filepath):
    tar_filepath = os.path.join(data_folder, 'collection.tar.gz')
    if not os.path.exists(tar_filepath):
        logging.info("Download collection.tar.gz")
        util.http_get('https://msmarco.blob.core.windows.net/msmarcoranking/collection.tar.gz', tar_filepath)

    with tarfile.open(tar_filepath, "r:gz") as tar:
        tar.extractall(path=data_folder)

with open(collection_filepath, 'r', encoding='utf8') as fIn:
    for line in fIn:
        pid, passage = line.strip().split("\t")
        corpus[pid] = passage

### Read the train queries, store in queries dict
queries = {}
queries_filepath = os.path.join(data_folder, 'queries.train.tsv')
if not os.path.exists(queries_filepath):
    tar_filepath = os.path.join(data_folder, 'queries.tar.gz')
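# A hedged sketch of how this cross-encoder is typically trained on query-passage pairs;
# positive_pairs (a list of relevant (qid, pid) tuples), the batch size and the output path
# below are assumptions, since the corresponding code is not part of the excerpt above.
from torch.utils.data import DataLoader
from sentence_transformers import InputExample

train_samples = [InputExample(texts=[queries[qid], corpus[pid]], label=1)
                 for qid, pid in positive_pairs]  # positive_pairs: assumed list of relevant (qid, pid)

train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=32)
model.fit(train_dataloader=train_dataloader,
          epochs=1,
          warmup_steps=5000,
          output_path='output/cross-encoder-msmarco')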
distance_metric = losses.SiameseDistanceMetric.COSINE_DISTANCE

#Negative pairs should have a distance of at least 0.5
margin = 0.5

dataset_path = 'quora-IR-dataset'
model_save_path = 'output/training_multi-task-learning' + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

os.makedirs(model_save_path, exist_ok=True)

# Check if the dataset exists. If not, download and extract
if not os.path.exists(dataset_path):
    logger.info("Dataset not found. Download")
    zip_save_path = 'quora-IR-dataset.zip'
    util.http_get(url='https://sbert.net/datasets/quora-IR-dataset.zip', path=zip_save_path)
    with ZipFile(zip_save_path, 'r') as zipIn:
        zipIn.extractall(dataset_path)

######### Read train data ##########
train_samples_MultipleNegativesRankingLoss = []
train_samples_ConstrativeLoss = []

with open(os.path.join(dataset_path, "classification/train_pairs.tsv"), encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        train_samples_ConstrativeLoss.append(InputExample(texts=[row['question1'], row['question2']], label=int(row['is_duplicate'])))
        if row['is_duplicate'] == '1':
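# A minimal sketch of the multi-task setup these two sample lists feed into: one DataLoader and
# one loss per task, both passed to a single model.fit() call. The batch sizes and training
# parameters are assumptions, and model is assumed to be a SentenceTransformer defined earlier.
from torch.utils.data import DataLoader

train_dataloader_MultipleNegativesRankingLoss = DataLoader(train_samples_MultipleNegativesRankingLoss, shuffle=True, batch_size=64)
train_loss_MultipleNegativesRankingLoss = losses.MultipleNegativesRankingLoss(model)

train_dataloader_ConstrativeLoss = DataLoader(train_samples_ConstrativeLoss, shuffle=True, batch_size=64)
train_loss_ConstrativeLoss = losses.OnlineContrastiveLoss(model=model, distance_metric=distance_metric, margin=margin)

model.fit(train_objectives=[(train_dataloader_MultipleNegativesRankingLoss, train_loss_MultipleNegativesRankingLoss),
                            (train_dataloader_ConstrativeLoss, train_loss_ConstrativeLoss)],
          epochs=1,
          warmup_steps=1000,
          output_path=model_save_path)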
import csv
import os
import random
from collections import defaultdict

from sentence_transformers import util

random.seed(42)

#Get raw file
source_file = "quora-IR-dataset/quora_duplicate_questions.tsv"

os.makedirs('quora-IR-dataset', exist_ok=True)
os.makedirs('quora-IR-dataset/graph', exist_ok=True)
os.makedirs('quora-IR-dataset/information-retrieval', exist_ok=True)
os.makedirs('quora-IR-dataset/classification', exist_ok=True)
os.makedirs('quora-IR-dataset/duplicate-mining', exist_ok=True)

if not os.path.exists(source_file):
    print("Download file to", source_file)
    util.http_get('http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv', source_file)

#Read pairwise file
sentences = {}
duplicates = defaultdict(lambda: defaultdict(bool))
rows = []
with open(source_file, encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_MINIMAL)
    for row in reader:
        id1 = row['qid1']
        id2 = row['qid2']
        question1 = row['question1'].replace("\r", "").replace("\n", " ").replace("\t", " ")
        question2 = row['question2'].replace("\r", "").replace("\n", " ").replace(