def __init__(self,
                 dataframe: List[Dict[str, str]],
                 teacher_model: LanguageTransformer,
                 combinations: List[Tuple[str, str]],
                 batch_size: int = 8,
                 name=''):

        self.combinations = combinations
        self.name = name
        self.batch_size = batch_size

        if name:
            name = "_" + name

        self.csv_file = "mse_evaluation" + name + "_results.csv"
        self.csv_headers = ["epoch", "steps"]
        self.data = {}  # maps (src_lang, trg_lang) -> (src_sentences, trg_sentences)

        logging.info("Compute teacher embeddings")
        all_source_sentences = set()
        for src_lang, trg_lang in self.combinations:
            src_sentences = []
            trg_sentences = []

            for row in dataframe:
                if row[src_lang].strip() != "" and row[trg_lang].strip() != "":
                    all_source_sentences.add(row[src_lang])
                    src_sentences.append(row[src_lang])
                    trg_sentences.append(row[trg_lang])

            self.data[(src_lang, trg_lang)] = (src_sentences, trg_sentences)
            self.csv_headers.append("{}-{}".format(src_lang, trg_lang))

        all_source_sentences = list(all_source_sentences)
        all_src_embeddings = teacher_model.encode(all_source_sentences,
                                                  batch_size=self.batch_size)
        self.teacher_embeddings = {
            sent: emb
            for sent, emb in zip(all_source_sentences, all_src_embeddings)
        }
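
The __init__ above only caches the teacher embeddings; the evaluation step itself is not part of this snippet. Below is a minimal sketch of what a __call__ method could look like, assuming the student model exposes the same encode() API used throughout these examples and that os, csv, logging and numpy (as np) are imported at module level. The method name, signature and the mean-squared-error metric are assumptions, not the library's confirmed implementation.

def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1):
    # Sketch (assumed): score each language pair by how closely the student's
    # target-language embeddings match the cached teacher source embeddings.
    scores = []
    for (src_lang, trg_lang), (src_sentences, trg_sentences) in self.data.items():
        teacher_emb = np.asarray([self.teacher_embeddings[sent] for sent in src_sentences])
        student_emb = np.asarray(model.encode(trg_sentences, batch_size=self.batch_size))
        mse = float(np.mean((teacher_emb - student_emb) ** 2))
        scores.append(mse)
        logging.info("MSE {}-{}: {:.6f}".format(src_lang, trg_lang, mse))

    if output_path is not None:
        csv_path = os.path.join(output_path, self.csv_file)
        write_header = not os.path.isfile(csv_path)
        with open(csv_path, mode='a', encoding='utf-8', newline='') as fOut:
            writer = csv.writer(fOut)
            if write_header:
                writer.writerow(self.csv_headers)
            writer.writerow([epoch, steps] + scores)

    # Lower MSE is better, so return the negated mean for "higher is better" callers
    return -float(np.mean(scores))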
from language_bert import LanguageTransformer, InputExample, util
from datetime import datetime
import gzip
import csv
import os
import logging

# Download the STSbenchmark dataset if it is not present yet
sts_dataset_path = 'datasets/stsbenchmark.tsv.gz'  # path assumed; the snippet's header was cut off
if not os.path.exists(sts_dataset_path):
    util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz', sts_dataset_path)




# Training configuration
model_name = 'bert-base-nli-mean-tokens'
train_batch_size = 16
num_epochs = 4
model_save_path = 'output/training_stsbenchmark_continue_training-'+model_name+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")



# Load a pre-trained sentence transformer model
model = LanguageTransformer(model_name)

# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark train dataset")

train_samples = []
dev_samples = []
test_samples = []
with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        score = float(row['score']) / 5.0  # Normalize score to range 0 ... 1
        inp_example = InputExample(texts=[row['sentence1'], row['sentence2']], label=score)

        if row['split'] == 'dev':
            dev_samples.append(inp_example)
        elif row['split'] == 'test':
            test_samples.append(inp_example)
        else:
            train_samples.append(inp_example)
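
The continue-training snippet breaks off here. A rough sketch of how it could continue, following the DataLoader / loss / evaluator pattern used elsewhere in this collection; the SentencesDataset and losses import locations, the choice of CosineSimilarityLoss, and the model.fit signature are assumptions, not confirmed by these snippets.

import math
from torch.utils.data import DataLoader
from language_bert import SentencesDataset, losses                 # import location assumed
from language_bert.evaluation import EmbeddingSimilarityEvaluator  # import location assumed

# Wrap the training samples in a DataLoader, as in the CNN example further below
train_dataset = SentencesDataset(train_samples, model=model)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)  # loss class name is an assumption

dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name='sts-dev')

# 10% of the training steps as warm-up (assumed convention)
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)

model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=dev_evaluator,
          epochs=num_epochs,
          warmup_steps=warmup_steps,
          output_path=model_save_path)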
from language_bert import LanguageTransformer, LoggingHandler, models
from language_bert.evaluation import EmbeddingSimilarityEvaluator
from language_bert.readers import STSBenchmarkDataReader
import logging
import torch

#Limit torch to 4 threads, as this example runs on the CPU
torch.set_num_threads(4)

#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

#1) Point the transformer model to the BERT / RoBERTa etc. model you would like to use. Ensure that output_hidden_states is true
word_embedding_model = models.Transformer(
    'bert-base-uncased', model_args={'output_hidden_states': True})

#2) Add WKPooling
pooling_model = models.WKPooling(
    word_embedding_model.get_word_embedding_dimension())

#3) Create a sentence transformer model to glue both models together
model = LanguageTransformer(modules=[word_embedding_model, pooling_model])

sts_reader = STSBenchmarkDataReader('../datasets/stsbenchmark')
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    sts_reader.get_examples("sts-test.csv"))

model.evaluate(evaluator)
"""
This examples clusters different sentences that come from the same wikipedia article.

It uses the 'wikipedia-sections' model, a model that was trained to differentiate if two sentences from the
same article come from the same section or from different sections in that article.
"""
from language_bert import LanguageTransformer
from sklearn.cluster import AgglomerativeClustering

embedder = LanguageTransformer('bert-base-wikipedia-sections-mean-tokens')

#Sentences and sections are from Wikipedia.
#Source: https://en.wikipedia.org/wiki/Bushnell,_Illinois
corpus = [
    ("Bushnell is located at 40°33′6″N 90°30′29″W (40.551667, -90.507921).",
     "Geography"),
    ("According to the 2010 census, Bushnell has a total area of 2.138 square miles (5.54 km2), of which 2.13 square miles (5.52 km2) (or 99.63%) is land and 0.008 square miles (0.02 km2) (or 0.37%) is water.",
     "Geography"),
    ("The town was founded in 1854 when the Northern Cross Railroad built a line through the area.",
     "History"),
    ("Nehemiah Bushnell was the President of the Railroad, and townspeople honored him by naming their community after him. ",
     "History"),
    ("Bushnell was also served by the Toledo, Peoria and Western Railway, now the Keokuk Junction Railway.",
     "History"),
    ("As of the census[6] of 2000, there were 3,221 people, 1,323 households, and 889 families residing in the city. ",
     "Demographics"),
    ("The population density was 1,573.9 people per square mile (606.7/km²).",
     "Demographics"),
    ("There were 1,446 housing units at an average density of 706.6 per square mile (272.3/km²).",
     "Demographics"),
    ("From 1991 to 2012, Bushnell was home to one of the largest Christian Music and Arts festivals in the world, known as the Cornerstone Festival.",
model_name = 'bert-base-uncased'
logging.info("Model %s will be created from based model %s" %
             (model_save_path, model_name))

word_embedding_model = models.Transformer(model_name)

cnn = models.CNN(in_word_embedding_dimension=word_embedding_model.
                 get_word_embedding_dimension())

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(cnn.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

model = LanguageTransformer(modules=[word_embedding_model, cnn, pooling_model],
                            device='cpu')

# -----------PREPARE TRAIN DATASET -------------------
# create the training dataset
logging.info("Read train dataset")
train_dataset = SentencesDataset(train_samples, model=model)
train_dataloader = DataLoader(train_dataset,
                              shuffle=True,
                              batch_size=train_batch_size)
train_loss = losses.MutualInformationLoss(
    model=model,
    sentence_embedding_dimension=model.get_sentence_embedding_dimension())

# -------- PREPARE Dev evaluation DATASET ------------------
#Read dataset and use it as development set
logging.info("Read dev dataset")
Example 6
from language_bert import LanguageTransformer, LoggingHandler
import numpy as np
import logging

#### Just some code to print debug information to stdout
np.set_printoptions(threshold=100)

logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

# Load pre-trained Sentence Transformer Model (based on DistilBERT). It will be downloaded automatically
model = LanguageTransformer('distilbert-base-nli-stsb-mean-tokens')

# Embed a list of sentences
sentences = [
    'This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.',
    'The quick brown fox jumps over the lazy dog.'
]
sentence_embeddings = model.encode(sentences)

# The result is a list of sentence embeddings as numpy arrays
for sentence, embedding in zip(sentences, sentence_embeddings):
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")
Example 7
"""
This is a simple application for sentence embeddings: clustering

Sentences are mapped to sentence embeddings and then k-means clustering is applied.
"""
from language_bert import LanguageTransformer
from sklearn.cluster import KMeans

embedder = LanguageTransformer('bert-base-nli-mean-tokens')

# Corpus with example sentences
corpus = [
    'A man is eating food.', 'A man is eating a piece of bread.',
    'A man is eating pasta.', 'The girl is carrying a baby.',
    'The baby is carried by the woman', 'A man is riding a horse.',
    'A man is riding a white horse on an enclosed ground.',
    'A monkey is playing drums.',
    'Someone in a gorilla costume is playing a set of drums.',
    'A cheetah is running behind its prey.',
    'A cheetah chases prey on across a field.'
]
corpus_embeddings = embedder.encode(corpus)

# Perform kmean clustering
num_clusters = 5
clustering_model = KMeans(n_clusters=num_clusters)
clustering_model.fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_

clustered_sentences = [[] for i in range(num_clusters)]
for sentence_id, cluster_id in enumerate(cluster_assignment):
    clustered_sentences[cluster_id].append(corpus[sentence_id])

for i, cluster in enumerate(clustered_sentences):
    print("Cluster ", i + 1)
    print(cluster)
    print("")
Example 8
from language_bert import LanguageTransformer, LoggingHandler
from language_bert.evaluation import EmbeddingSimilarityEvaluator
from language_bert.readers import STSBenchmarkDataReader
import logging
import os
import torch

# Folder of this script; the original definition was cut off, so this is assumed
script_folder_path = os.path.dirname(os.path.realpath(__file__))

#Limit torch to 4 threads
torch.set_num_threads(4)

#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

model_name = '../training/nli/output/training_nli_bert-base-uncased-2021-01-10_14-44-13'

# Load a named sentence model (based on BERT). This will download the model from our server.
# Alternatively, you can also pass a filepath to LanguageTransformer()
model = LanguageTransformer(model_name)

sts_corpus = "../datasets/stsbenchmark/"
target_eval_files = set(
    ['sts', 'sts12', 'sts13', 'sts14', 'sts15', 'sts16', 'sick-r'])

evaluators = []  # list of different evaluators that we call periodically
sts_reader = STSBenchmarkDataReader(
    os.path.join(script_folder_path, sts_corpus))
for target in target_eval_files:
    output_filename_eval = os.path.join(script_folder_path,
                                        sts_corpus + target + "-test.csv")
    evaluators.append(
        EmbeddingSimilarityEvaluator.from_input_examples(
            sts_reader.get_examples(output_filename_eval), name=target))
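
The snippet ends before the evaluators are actually run. Running them with the model.evaluate() call used earlier in this collection could look like this; the assumptions are that model.evaluate returns the evaluator's main score and that each evaluator carries the name passed above.

for evaluator in evaluators:
    score = model.evaluate(evaluator)
    logging.info("{}: {:.4f}".format(evaluator.name, score))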
Example 9
"""
As embeddings model, we use the SBERT model 'distilbert-multilingual-nli-stsb-quora-ranking',
which is aligned across 100 languages. I.e., you can type in a question in various languages and it will
return the closest questions in the corpus (questions in the corpus are mainly in English).
"""
from language_bert import LanguageTransformer, util
import os
import csv
import pickle
import time
import torch
from annoy import AnnoyIndex

if __name__ == '__main__':
    model_name = 'distilbert-multilingual-nli-stsb-quora-ranking'
    model = LanguageTransformer(model_name)

    url = "http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv"
    dataset_path = "quora_duplicate_questions.tsv"
    max_corpus_size = 100000

    n_trees = 256  #Number of trees used for Annoy. More trees => better recall, worse run-time
    embedding_size = 768  #Size of embeddings
    top_k_hits = 10  #Output k hits

    annoy_index_path = 'quora-embeddings-{}-size-{}-annoy_index-trees-{}.ann'.format(
        model_name.replace('/', '_'), max_corpus_size, n_trees)
    embedding_cache_path = 'quora-embeddings-{}-size-{}.pkl'.format(
        model_name.replace('/', '_'), max_corpus_size)

    #Check if embedding cache path exists
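
The snippet is cut off at the cache check. Below is a hedged sketch of the remaining steps: fill or load the embedding cache, build an Annoy index over the corpus embeddings, and answer a query with approximate nearest-neighbour search. The column names follow the public quora_duplicate_questions.tsv schema; the cache layout and encode() returning a single vector for a single string are assumptions.

    if not os.path.exists(embedding_cache_path):
        # Download the Quora duplicate-questions file and collect unique questions
        if not os.path.exists(dataset_path):
            util.http_get(url, dataset_path)

        corpus_sentences = set()
        with open(dataset_path, encoding='utf8') as fIn:
            reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_MINIMAL)
            for row in reader:
                corpus_sentences.add(row['question1'])
                corpus_sentences.add(row['question2'])
                if len(corpus_sentences) >= max_corpus_size:
                    break

        corpus_sentences = list(corpus_sentences)
        corpus_embeddings = model.encode(corpus_sentences)
        with open(embedding_cache_path, 'wb') as fOut:
            pickle.dump({'sentences': corpus_sentences, 'embeddings': corpus_embeddings}, fOut)
    else:
        with open(embedding_cache_path, 'rb') as fIn:
            cache = pickle.load(fIn)
            corpus_sentences, corpus_embeddings = cache['sentences'], cache['embeddings']

    # Build (or load) the Annoy index over the corpus embeddings
    annoy_index = AnnoyIndex(embedding_size, 'angular')
    if os.path.exists(annoy_index_path):
        annoy_index.load(annoy_index_path)
    else:
        for idx, embedding in enumerate(corpus_embeddings):
            annoy_index.add_item(idx, embedding)
        annoy_index.build(n_trees)
        annoy_index.save(annoy_index_path)

    # Embed the query and ask Annoy for the approximate nearest neighbours
    query = input("Please enter a question: ")
    start_time = time.time()
    query_embedding = model.encode(query)
    ids, distances = annoy_index.get_nns_by_vector(query_embedding, top_k_hits, include_distances=True)
    print("Results (after {:.3f} seconds):".format(time.time() - start_time))
    for idx, distance in zip(ids, distances):
        # Annoy's 'angular' distance d relates to cosine similarity as cos = 1 - d^2 / 2
        print("  {:.3f}\t{}".format(1 - distance ** 2 / 2, corpus_sentences[idx]))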
Example 10
"""
This example starts multiple worker processes that encode
sentences in parallel. This gives a near linear speed-up
when encoding large text collections.
"""

from language_bert import LanguageTransformer, LoggingHandler
import logging

logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])

#Important: you need to shield your code with if __name__ == '__main__'. Otherwise, CUDA runs into issues when spawning new processes.
if __name__ == '__main__':

    #Create a large list of 100k sentences
    sentences = ["This is sentence {}".format(i) for i in range(100000)]

    #Define the model
    model = LanguageTransformer('distilbert-base-nli-stsb-mean-tokens')

    #Start the multi-process pool on all available CUDA devices
    pool = model.start_multi_process_pool()

    #Compute the embeddings using the multi-process pool
    emb = model.encode_multi_process(sentences, pool)
    print("Embeddings computed. Shape:", emb.shape)

    #Optional: Stop the processes in the pool
    model.stop_multi_process_pool(pool)
from language_bert import LanguageTransformer, evaluation, LoggingHandler
import sys
import gzip
import os
import logging

logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])

model_name = sys.argv[1]
filepaths = sys.argv[2:]
inference_batch_size = 32

model = LanguageTransformer(model_name)

for filepath in filepaths:
    src_sentences = []
    trg_sentences = []
    with gzip.open(filepath, 'rt',
                   encoding='utf8') if filepath.endswith('.gz') else open(
                       filepath, 'r', encoding='utf8') as fIn:
        for line in fIn:
            splits = line.strip().split('\t')
            if len(splits) >= 2:
                src_sentences.append(splits[0])
                trg_sentences.append(splits[1])

    logging.info(
        os.path.basename(filepath) + ": " + str(len(src_sentences)) +
        " sentence pairs")
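
The evaluation step after reading each file was cut off. Still inside the per-file loop, a minimal check could encode both sides directly and report their mean squared difference with numpy (assumed imported as np), rather than going through the library's evaluation classes, whose exact interface is not shown in this collection.

    src_embeddings = np.asarray(model.encode(src_sentences, batch_size=inference_batch_size))
    trg_embeddings = np.asarray(model.encode(trg_sentences, batch_size=inference_batch_size))

    # Mean squared difference between the paired source / target sentence embeddings
    mse = float(np.mean((src_embeddings - trg_embeddings) ** 2))
    logging.info("{}: MSE between source and target embeddings: {:.6f}".format(
        os.path.basename(filepath), mse))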
Example 12
"""
As embeddings model, we use the SBERT model 'distilbert-multilingual-nli-stsb-quora-ranking',
which is aligned across 100 languages. I.e., you can type in a question in various languages and it will
return the closest questions in the corpus (questions in the corpus are mainly in English).
"""

from language_bert import LanguageTransformer, util
import os
from elasticsearch import Elasticsearch
import csv
import time
import tqdm.autonotebook
from elasticsearch import helpers

if __name__ == '__main__':
    es = Elasticsearch()

    model = LanguageTransformer('distilbert-multilingual-nli-stsb-quora-ranking')

    url = "http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv"
    dataset_path = "quora_duplicate_questions.tsv"
    max_corpus_size = 100000

    #Download dataset if needed
    if not os.path.exists(dataset_path):
        print("Download dataset")
        util.http_get(url, dataset_path)

    #Get all unique sentences from the file
    all_questions = {}
    with open(dataset_path, encoding='utf8') as fIn:
        reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_MINIMAL)
        for row in reader:
            # qid1/question1/qid2/question2 are the standard columns of quora_duplicate_questions.tsv
            all_questions[row['qid1']] = row['question1']
            if len(all_questions) >= max_corpus_size:
                break
            all_questions[row['qid2']] = row['question2']
            if len(all_questions) >= max_corpus_size:
                break
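
The indexing and search part of this example was cut off. A hedged sketch of how the collected questions could be indexed with a dense_vector field and queried by cosine similarity via a script_score query (an Elasticsearch 7.x feature); the index name, the example query text, and encode() returning a single vector for a single string are assumptions.

    index_name = 'quora_questions'  # index name is an assumption
    if not es.indices.exists(index=index_name):
        es.indices.create(index=index_name, body={
            "mappings": {"properties": {
                "question": {"type": "text"},
                "question_vector": {"type": "dense_vector", "dims": 768}
            }}
        })

        # Encode all collected questions and bulk-index them together with their embeddings
        questions = list(all_questions.values())
        embeddings = model.encode(questions)
        actions = [{
            "_index": index_name,
            "_id": i,
            "question": question,
            "question_vector": embedding.tolist()
        } for i, (question, embedding) in enumerate(zip(questions, embeddings))]
        helpers.bulk(es, actions)

    # Rank indexed questions by cosine similarity to the query embedding
    query = "How can I learn to program?"  # example query, assumed
    query_vector = model.encode(query).tolist()
    response = es.search(index=index_name, body={
        "size": 5,
        "query": {"script_score": {
            "query": {"match_all": {}},
            "script": {
                "source": "cosineSimilarity(params.query_vector, 'question_vector') + 1.0",
                "params": {"query_vector": query_vector}
            }
        }}
    })
    for hit in response['hits']['hits']:
        print("{:.3f}\t{}".format(hit['_score'] - 1.0, hit['_source']['question']))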
Example 13
"""
This is a simple application for sentence embeddings: semantic search

We have a corpus with various sentences. Then, for a given query sentence,
we want to find the most similar sentence in this corpus.

This script outputs for various queries the top 5 most similar sentences in the corpus.
"""
from language_bert import LanguageTransformer, util
import torch

embedder = LanguageTransformer('distilbert-base-nli-stsb-mean-tokens')

# Corpus with example sentences
corpus = [
    'A man is eating food.', 'A man is eating a piece of bread.',
    'The girl is carrying a baby.', 'A man is riding a horse.',
    'A woman is playing violin.', 'Two men pushed carts through the woods.',
    'A man is riding a white horse on an enclosed ground.',
    'A monkey is playing drums.', 'A cheetah is running behind its prey.'
]
corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)

# Query sentences:
queries = [
    'A man is eating pasta.',
    'Someone in a gorilla costume is playing a set of drums.',
    'A cheetah chases prey on across a field.'
]

# Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
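
The snippet ends at this comment, before the search itself is shown. A minimal completion using only the torch import above: compute the cosine similarity between each query embedding and the corpus embeddings, then print the top hits (encode() returning a 1-D tensor for a single string is an assumption).

top_k = min(5, len(corpus))
for query in queries:
    query_embedding = embedder.encode(query, convert_to_tensor=True)

    # Cosine similarity between the query and every corpus sentence
    cos_scores = torch.matmul(corpus_embeddings, query_embedding) / (
        corpus_embeddings.norm(dim=1) * query_embedding.norm())
    top_results = torch.topk(cos_scores, k=top_k)

    print("\nQuery:", query)
    print("Top {} most similar sentences in corpus:".format(top_k))
    for score, idx in zip(top_results.values, top_results.indices):
        print("  {} (score: {:.4f})".format(corpus[int(idx)], float(score)))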
from language_bert import LanguageTransformer, util
import sys
import os
import time
import torch
import gzip
import csv

#Limit torch to 4 threads
torch.set_num_threads(4)

model_name = sys.argv[1] if len(sys.argv) > 1 else 'bert-base-nli-mean-tokens'

# Load a named sentence model (based on BERT). This will download the model from our server.
# Alternatively, you can also pass a filepath to LanguageTransformer()
model = LanguageTransformer(model_name)

nli_dataset_path = 'datasets/AllNLI.tsv.gz'
sentences = set()
max_sentences = 100000

#Download datasets if needed
if not os.path.exists(nli_dataset_path):
    util.http_get('https://sbert.net/datasets/AllNLI.tsv.gz', nli_dataset_path)

with gzip.open(nli_dataset_path, 'rt', encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        sentences.add(row['sentence1'])
        if len(sentences) >= max_sentences:
            break
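
The benchmark loop that presumably follows (the snippet limits torch threads and imports time) was cut off. A minimal sketch of timing the encoder over the collected sentences; the batch size is an assumption.

sentences = list(sentences)
print("Encode {} sentences".format(len(sentences)))

start_time = time.time()
embeddings = model.encode(sentences, batch_size=32)
elapsed = time.time() - start_time

print("Done after {:.2f} seconds".format(elapsed))
print("Speed: {:.1f} sentences / second".format(len(sentences) / elapsed))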