It was trained on the Natural Questions dataset, which contains real questions from Google Search
paired with annotated Wikipedia passages that provide the answer. For the passages, we encode the
Wikipedia article title together with the individual text passages.

Google Colab Example: https://colab.research.google.com/drive/11GunvCqJuebfeTlgbJWkIMT0xJH6PWF1?usp=sharing
"""
import json
from sentence_transformers import SentenceTransformer, util
import time
import gzip
import os
import torch

# We use the bi-encoder to encode all passages, so that we can use them for semantic search
model_name = 'nq-distilbert-base-v1'
bi_encoder = SentenceTransformer(model_name)
top_k = 5  # Number of passages we want to retrieve with the bi-encoder

# As dataset, we use Simple English Wikipedia. Compared to the full English Wikipedia, it has only
# about 170k articles. We split these articles into paragraphs and encode them with the bi-encoder.

wikipedia_filepath = 'data/simplewiki-2020-11-01.jsonl.gz'

if not os.path.exists(wikipedia_filepath):
    util.http_get('http://sbert.net/datasets/simplewiki-2020-11-01.jsonl.gz',
                  wikipedia_filepath)

passages = []
with gzip.open(wikipedia_filepath, 'rt', encoding='utf8') as fIn:
    for line in fIn:
        data = json.loads(line.strip())
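        # The snippet is truncated here; a minimal sketch of how it might continue, assuming each
        # JSON line carries 'title' and 'paragraphs' fields as in the sentence-transformers Wikipedia example.
        for paragraph in data['paragraphs']:
            # The nq bi-encoder was trained on [title, passage] pairs
            passages.append([data['title'], paragraph])

print("Passages:", len(passages))

# Encode all passages once so that queries can be answered with semantic search
corpus_embeddings = bi_encoder.encode(passages, convert_to_tensor=True, show_progress_bar=True)


def search(query):
    start_time = time.time()
    question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
    hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k)[0]
    print("Results for '{}' (after {:.3f} seconds):".format(query, time.time() - start_time))
    for hit in hits:
        print("\t{:.3f}\t{}".format(hit['score'], passages[hit['corpus_id']]))


search("What is the capital of France?")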
Example No. 2
from sentence_transformers import SentenceTransformer
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')
Example No. 3
        if is_dev_file:
            dev_files.append(arg)
        else:
            train_files.append(arg)

if not train_files:
    print("Please pass at least some train files")
    print("python make_multilingual_sys.py file1.tsv.gz file2.tsv.gz --dev dev1.tsv.gz dev2.tsv.gz")
    exit()


logger.info("Train files: {}".format(", ".join(train_files)))
logger.info("Dev files: {}".format(", ".join(dev_files)))

######## Start the extension of the teacher model to multiple languages ########
logger.info("Load teacher model")
teacher_model = SentenceTransformer(teacher_model_name)


logger.info("Create student model from scratch")
word_embedding_model = models.Transformer(student_model_name, max_seq_length=max_seq_length)
# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
student_model = SentenceTransformer(modules=[word_embedding_model, pooling_model])


###### Read Parallel Sentences Dataset ######
train_data = ParallelSentencesDataset(student_model=student_model, teacher_model=teacher_model, batch_size=inference_batch_size, use_embedding_cache=True)
for train_file in train_files:
    train_data.load_data(train_file, max_sentences=max_sentences_per_trainfile, max_sentence_length=train_max_sentence_length)

train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size)
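The snippet stops after building the DataLoader. A minimal sketch of how the distillation training might continue, assuming `losses` has been imported from sentence_transformers and that num_epochs, num_warmup_steps and output_path are defined elsewhere in the script (they do not appear in the excerpt above):

# The student is trained to reproduce the teacher's embeddings (multilingual knowledge distillation)
train_loss = losses.MSELoss(model=student_model)

student_model.fit(train_objectives=[(train_dataloader, train_loss)],
                  epochs=num_epochs,
                  warmup_steps=num_warmup_steps,
                  optimizer_params={'lr': 2e-5, 'eps': 1e-6},
                  output_path=output_path)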
Example No. 4
print(text_df)

!pip install -U sentence-transformers

"""## BERT Sentence Tranformers Semantic Search"""

"""
This is a simple application of sentence embeddings: semantic search.
Given a query sentence, this finds the most similar sentences in the corpus.
The script outputs, for each query, the top 5 most similar publications in the corpus.
*Open-source code was used to aid in development
"""
from sentence_transformers import SentenceTransformer
import scipy.spatial
import pickle as pkl
embedder = SentenceTransformer('bert-base-nli-mean-tokens')

sentences = list(text_df['Text Processed'])

# Example query sentences
queries = ['How to evolve architecture for constellations and simulation', 'Build behavior of complex aerospace and modeling of safety']
query_embeddings = embedder.encode(queries,show_progress_bar=True)
text_embeddings = embedder.encode(sentences, show_progress_bar=True)
#
# Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
closest_n = 5
print("\nTop 5 most similar sentences in corpus:")
for query, query_embedding in zip(queries, query_embeddings):
    distances = scipy.spatial.distance.cdist([query_embedding], text_embeddings, "cosine")[0]

    results = zip(range(len(distances)), distances)
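    # The snippet is cut off here; the usual continuation sorts the (index, distance) pairs
    # by ascending distance and prints the closest_n sentences for this query.
    results = sorted(results, key=lambda x: x[1])

    print("\nQuery:", query)
    for idx, distance in results[0:closest_n]:
        print(sentences[idx].strip(), "(Score: %.4f)" % (1 - distance))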
Example No. 5
import spacy, random
nlp = spacy.load('en_core_web_lg')

from sentence_transformers import SentenceTransformer
embedder = SentenceTransformer('bert-base-nli-mean-tokens')

# Load Movielines & Conversations
movie_lines = {}
for line in open("./cornell movie-dialogs corpus/movie_lines.txt",
                 encoding="latin1"):
    line = line.strip()
    parts = line.split(" +++$+++ ")
    if len(parts) == 5:
        movie_lines[parts[0]] = parts[4]
    else:
        movie_lines[parts[0]] = ""

import json
responses = {}
for line in open("./cornell movie-dialogs corpus/movie_conversations.txt",
                 encoding="latin1"):
    line = line.strip()
    parts = line.split(" +++$+++ ")
    line_ids = json.loads(parts[3].replace("'", '"'))
    for first, second in zip(line_ids[:-1], line_ids[1:]):
        responses[first] = second

import numpy as np


def sentence_mean(nlp, s):
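    # The body is truncated in the original; a plausible (hypothetical) implementation
    # averages the spaCy token vectors of the sentence.
    doc = nlp(s)
    vectors = [token.vector for token in doc if token.has_vector]
    if not vectors:
        return np.zeros(nlp.vocab.vectors_length)
    return np.mean(vectors, axis=0)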
Example No. 6
# fastext_raw = fasttext_embedding(fastText_model, data.text.to_list(), dictionary, tfidf_model,False)
fasttext_tfidf = fasttext_embedding(fastText_model, data.text.to_list(),
                                    dictionary, tfidf_model, True)

# data['fasttext_raw'] = fastext_raw
data['fasttext_tfidf'] = fasttext_tfidf

from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
x = mlb.fit_transform(
    [tuple(int(x) for x in i.split(',')) for i in data.label.to_list()])
data['new_label'] = list(x)
len(data.new_label[0])

###Bert
modelBert = SentenceTransformer('monologg/bert-base-cased-goemotions-original')


def bert_embedding(sentences):
    sentence_embeddings = modelBert.encode(sentences)
    return sentence_embeddings


bert_features = bert_embedding(data.text.to_list())

fabeec = []
for i in range(len(bert_features)):
    fabeec.append(bert_features[i].tolist() + fasttext_tfidf[i].tolist())

#imbalance
Example No. 7
 def __init__(self, nlp):
     spacy_name = nlp.meta['name']
     model_name = util.name_spacy_to_sentencebert(spacy_name)
     self.model = SentenceTransformer(model_name)
Example No. 8
def main():
    a = get_args()

    prev_enc = 0

    def train(i):
        loss = 0

        noise = a.noise * torch.randn(1, 1, *params[0].shape[2:4],
                                      1).cuda() if a.noise > 0 else None
        img_out = image_f(noise)

        micro = None if a.in_txt2 is None else False
        imgs_sliced = slice_imgs([img_out],
                                 a.samples,
                                 a.modsize,
                                 norm_in,
                                 a.overscan,
                                 micro=micro)
        out_enc = model_clip.encode_image(imgs_sliced[-1])
        if a.diverse != 0:
            imgs_sliced = slice_imgs([image_f(noise)],
                                     a.samples,
                                     a.modsize,
                                     norm_in,
                                     a.overscan,
                                     micro=micro)
            out_enc2 = model_clip.encode_image(imgs_sliced[-1])
            loss += a.diverse * torch.cosine_similarity(
                out_enc, out_enc2, dim=-1).mean()
            del out_enc2
            torch.cuda.empty_cache()
        if a.in_img is not None and os.path.isfile(a.in_img):  # input image
            loss += sign * 0.5 * torch.cosine_similarity(
                img_enc, out_enc, dim=-1).mean()
        if a.in_txt is not None:  # input text
            loss += sign * torch.cosine_similarity(txt_enc, out_enc,
                                                   dim=-1).mean()
        if a.in_txt0 is not None:  # subtract text
            loss += -sign * torch.cosine_similarity(txt_enc0, out_enc,
                                                    dim=-1).mean()
        if a.sync > 0 and a.in_img is not None and os.path.isfile(
                a.in_img):  # image composition
            loss -= a.sync * ssim_loss(
                F.interpolate(img_out, ssim_size).float(), img_in)
        if a.in_txt2 is not None:  # input text for micro details
            imgs_sliced = slice_imgs([img_out],
                                     a.samples,
                                     a.modsize,
                                     norm_in,
                                     a.overscan,
                                     micro=True)
            out_enc2 = model_clip.encode_image(imgs_sliced[-1])
            loss += sign * torch.cosine_similarity(txt_enc2, out_enc2,
                                                   dim=-1).mean()
            del out_enc2
            torch.cuda.empty_cache()
        if a.expand > 0:
            global prev_enc
            if i > 0:
                loss += a.expand * torch.cosine_similarity(
                    out_enc, prev_enc, dim=-1).mean()
            prev_enc = out_enc.detach()

        del img_out, imgs_sliced, out_enc
        torch.cuda.empty_cache()
        assert not isinstance(loss, int), ' Loss not defined, check the inputs'

        if a.prog is True:
            lr_cur = lr0 + (i / a.steps) * (lr1 - lr0)
            for g in optimizer.param_groups:
                g['lr'] = lr_cur

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if i % a.fstep == 0:
            with torch.no_grad():
                img = image_f(contrast=a.contrast).cpu().numpy()[0]
            checkout(img,
                     os.path.join(tempdir, '%04d.jpg' % (i // a.fstep)),
                     verbose=a.verbose)
            pbar.upd()

    # Load CLIP models
    model_clip, _ = clip.load(a.model)
    if a.verbose is True: print(' using model', a.model)
    xmem = {'RN50': 0.5, 'RN50x4': 0.16, 'RN101': 0.33}
    if 'RN' in a.model:
        a.samples = int(a.samples * xmem[a.model])

    if a.multilang is True:
        model_lang = SentenceTransformer(
            'clip-ViT-B-32-multilingual-v1').cuda()

    def enc_text(txt):
        if a.multilang is True:
            emb = model_lang.encode([txt],
                                    convert_to_tensor=True,
                                    show_progress_bar=False)
        else:
            emb = model_clip.encode_text(clip.tokenize(txt).cuda())
        return emb.detach().clone()

    if a.diverse != 0:
        a.samples = int(a.samples * 0.5)

    norm_in = torchvision.transforms.Normalize(
        (0.48145466, 0.4578275, 0.40821073),
        (0.26862954, 0.26130258, 0.27577711))

    out_name = []
    if a.in_img is not None and os.path.isfile(a.in_img):
        if a.verbose is True: print(' ref image:', basename(a.in_img))
        img_in = torch.from_numpy(
            img_read(a.in_img) / 255.).unsqueeze(0).permute(0, 3, 1, 2).cuda()
        img_in = img_in[:, :3, :, :]  # fix rgb channels
        in_sliced = slice_imgs([img_in],
                               a.samples,
                               a.modsize,
                               transform=norm_in,
                               overscan=a.overscan)[0]
        img_enc = model_clip.encode_image(in_sliced).detach().clone()
        if a.sync > 0:
            ssim_loss = ssim.SSIM(window_size=11)
            ssim_size = [s // 8 for s in a.size]
            img_in = F.interpolate(img_in, ssim_size).float()
        else:
            del img_in
        del in_sliced
        torch.cuda.empty_cache()
        out_name.append(basename(a.in_img).replace(' ', '_'))

    if a.in_txt is not None:
        if a.verbose is True: print(' ref text: ', basename(a.in_txt))
        if a.translate:
            translator = Translator()
            a.in_txt = translator.translate(a.in_txt, dest='en').text
            if a.verbose is True: print(' translated to:', a.in_txt)
        txt_enc = enc_text(a.in_txt)
        out_name.append(txt_clean(a.in_txt))

    if a.in_txt2 is not None:
        if a.verbose is True: print(' micro text:', basename(a.in_txt2))
        a.samples = int(a.samples * 0.75)
        if a.translate:
            translator = Translator()
            a.in_txt2 = translator.translate(a.in_txt2, dest='en').text
            if a.verbose is True: print(' translated to:', a.in_txt2)
        txt_enc2 = enc_text(a.in_txt2)
        out_name.append(txt_clean(a.in_txt2))

    if a.in_txt0 is not None:
        if a.verbose is True: print(' subtract text:', basename(a.in_txt0))
        a.samples = int(a.samples * 0.75)
        if a.translate:
            translator = Translator()
            a.in_txt0 = translator.translate(a.in_txt0, dest='en').text
            if a.verbose is True: print(' translated to:', a.in_txt0)
        txt_enc0 = enc_text(a.in_txt0)
        out_name.append('off-' + txt_clean(a.in_txt0))

    if a.multilang is True: del model_lang

    params, image_f = fft_image([1, 3, *a.size], resume=a.resume)
    image_f = to_valid_rgb(image_f)

    if a.prog is True:
        lr1 = a.lrate * 2
        lr0 = lr1 * 0.01
    else:
        lr0 = a.lrate
    optimizer = torch.optim.Adam(params, lr0)
    sign = 1. if a.invert is True else -1.

    if a.verbose is True: print(' samples:', a.samples)
    out_name = '-'.join(out_name)
    out_name += '-%s' % a.model if 'RN' in a.model.upper() else ''
    tempdir = os.path.join(a.out_dir, out_name)
    os.makedirs(tempdir, exist_ok=True)

    pbar = ProgressBar(a.steps // a.fstep)
    for i in range(a.steps):
        train(i)

    os.system('ffmpeg -v warning -y -i %s\%%04d.jpg "%s.mp4"' %
              (tempdir, os.path.join(a.out_dir, out_name)))
    shutil.copy(
        img_list(tempdir)[-1],
        os.path.join(a.out_dir, '%s-%d.jpg' % (out_name, a.steps)))
    if a.save_pt is True:
        torch.save(params, '%s.pt' % os.path.join(a.out_dir, out_name))
Example No. 9
            title_long.append(title[i])
            url_long.append(url[i])
    data = {
        'article_index': article_long,
        'title': title_long,
        'snippet': sentence_long,
        'summary': summary_long,
        'url': url_long
    }
    return data


content = pd.read_csv('final_content.csv')
print("this will take some time if u are running it for many articles")
content = summarizer(content)
print("running the sentence splitting")
snippet_content = pd.DataFrame(
    sentence_creator(content),
    columns=['article_index', 'title', 'snippet', 'summary', 'url'])
snippet_content.to_excel('snippet_content.xlsx', index=False)

sentence = []
for i in range(len(snippet_content)):
    sentence.append(snippet_content['snippet'][i])

model = SentenceTransformer('model')
sentence_content_embeddings = model.encode(sentence)

with open('sentence_split_encoder_content', 'wb') as f:
    pickle.dump(sentence_content_embeddings, f)
Example No. 10
# Beginning to calculate features, including BERT and TF-IDF; this process can be a bit of a bottleneck
# TODO: Consider writing these variables to a file to "pre-compute" them if experiments are taking a while
print(" ")
class_report.write(" \n")
print("===============")
class_report.write("===============\n")
print("Fitting Features: ")
class_report.write("Fitting Features: \n")
print(" ")
class_report.write('\n')
bert_dimension = 0
if Features == "All" or Features == "BERT":
    # Create BERT Features and add to data frame
    print('Fitting BERT Features')
    class_report.write('Fitting BERT Features')
    model = SentenceTransformer('bert-base-nli-mean-tokens')
    sentences = df['Sentence'].tolist()
    sentence_embeddings = model.encode(sentences)
    encoded_values = pd.DataFrame(np.row_stack(sentence_embeddings))

    FeatureNames = []
    bert_dimension = encoded_values.shape[1]
    for x in range(0, bert_dimension):
        FeatureNames.append("BERT_" + str(x))

    training_corpus = encoded_values.head(dataset)
    test_corpus = encoded_values.tail((df['Set'] == 1).sum())

tf_dimension = 0
if Features == "All" or Features == "TF":
    # Create TF-IDF Features and add to data frame
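    # The original is truncated here; a hedged sketch of the TF-IDF branch, mirroring the BERT
    # branch above and assuming sklearn's TfidfVectorizer (not imported in the excerpt).
    from sklearn.feature_extraction.text import TfidfVectorizer

    print('Fitting TF-IDF Features')
    class_report.write('Fitting TF-IDF Features')
    vectorizer = TfidfVectorizer()
    tf_values = pd.DataFrame(vectorizer.fit_transform(df['Sentence'].tolist()).toarray())
    tf_dimension = tf_values.shape[1]

    training_tfidf = tf_values.head(dataset)
    test_tfidf = tf_values.tail((df['Set'] == 1).sum())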
Example No. 11
def bert_main():
    # Tweet data using a Vectorizer
    tweets = importTweets(True)
    tweetsk = list(tweets.keys())
    tweetsl = list(tweets.values())

    # Query data using a Vectorizer
    queries = importQuery(True)
    queriesl = list(queries.values())

    # Embedding the Tweet data words using a bert model.
    bert_model = SentenceTransformer('bert-base-nli-mean-tokens')
    print("-" * 70)
    print("Embedding the tweet strings with the Bert token model...")
    tweet_embeddings = bert_model.encode(tweetsl,
                                         batch_size=500,
                                         show_progress_bar=True)

    # Embedding the Query data words using a bert model.
    print("-" * 70)
    print("Embedding the Query strings with the Bert token model...")
    query_embeddings = bert_model.encode(queriesl,
                                         batch_size=500,
                                         show_progress_bar=True)

    # Calculation the Cosine Similarity for the embedded words.
    print("-" * 70)
    print(
        "Calculating the Cosine Similarity for the Bert embedded Tweets and Queries..."
    )
    Rankings = {}
    for q in range(0, len(queriesl)):
        # Dictionary to sort the Cosine Similarity of each document per query.
        docCurrentQuery = {}
        for t in range(0, len(tweetsl)):
            docCurrentQuery[tweetsk[t]] = 1 - spatial.distance.cosine(
                tweet_embeddings[t], query_embeddings[q])
        # Sorting the document in descending order of the Cosine Similarity per query.
        docCurrentQuery = dict(
            sorted(docCurrentQuery.items(),
                   key=lambda item: item[1],
                   reverse=True))

        # Creating a new dictionary of only the Top 1000 documents for each query.
        doc_counter = 1
        docCurrentQuery_1000 = {}
        for key, value in docCurrentQuery.items():
            if (doc_counter <= 1000):
                docCurrentQuery_1000[key] = value
                doc_counter += 1
            else:
                break
        Rankings[q + 1] = docCurrentQuery_1000

    print("-" * 70)
    print("Creating a results file with all the required details...")
    # Creating a txt file with the results
    resultFileCreation(Rankings, True)
    print("-" * 70)
    print("Results file is created (visit the dist folder)")
    print("-" * 70)
Example No. 12
 def __init__(self):
     self.es = Elasticsearch(maxsize=1000)
     self.bc = SentenceTransformer('distiluse-base-multilingual-cased')
Example No. 13
class SimiSearch:
    def __init__(self):
        self.es = Elasticsearch(maxsize=1000)
        self.bc = SentenceTransformer('distiluse-base-multilingual-cased')

    def findSimQuestions(self, q: str, topk: int, minScore=0.5):
        """
        Find similar questions based on cosine similarity to a question q and return top k results
        Params:
        q: question that needs searching for similar questions
        topk: nb of top results returned
        """
        embedding_start = time.time()

        query_vector = self.bc.encode([q])

        query_vector = query_vector[0].tolist()
        embedding_time = time.time() - embedding_start

        script_query = {
            "script_score": {
                "query": {
                    "multi_match": {
                        "query": q,
                        "type": "bool_prefix",
                        "fields": ["text", "text._2gram", "text._3gram"]
                    }
                },
                "script": {
                    "source":
                    "1+cosineSimilarity(params.query_vector, 'vectorisation')",
                    "params": {
                        "query_vector": query_vector
                    }
                },
                "min_score": minScore + 1
            }
        }

        #print('encoding time: {}'.format(embedding_time))

        search_start = time.time()
        response = self.es.search(index='qa',
                                  body={
                                      "size": topk,
                                      "query": script_query,
                                      "_source": ['id', 'text', 'rep']
                                  })

        search_time = time.time() - search_start
        #print('search time: {}'.format(search_time))

        res = []
        reps = []
        for r in response['hits']['hits'][:topk]:
            if r['_source']['rep'] not in reps:
                reps.append(r['_source']['rep'])
                res.append({
                    'id': r['_source']['id'],
                    'text': r['_source']['text'],
                    'score': r['_score'],
                    'rep': r['_source']['rep']
                })
        return res
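A hedged usage sketch for the class above; it assumes an Elasticsearch index named 'qa' whose documents carry 'id', 'text', 'rep' and a dense vector field 'vectorisation', exactly as queried in findSimQuestions.

if __name__ == '__main__':
    searcher = SimiSearch()
    for hit in searcher.findSimQuestions("how do I reset my password", topk=5):
        print(hit['score'], hit['text'])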
Example No. 14
    def fit(self, token_lists):
        """ Generate topic model, dictionary, corpus from token lists 
        
        :param token_lists: list of document tokens
        """
        try:
            
            # create Gensim dictionary & corpus for validation
            dictionary = Dictionary(token_lists)
            corpus = [dictionary.doc2bow(text) for text in token_lists]

            if self.method == "BERT":
                model = SentenceTransformer(self.pre_trained_name)
            
                # convert list of document tokens to list of sentences
                sentences = [Utility.to_sentence(token_list) for token_list in token_lists]
                
                # generate BERT sentence embeddings
                embeddings = model.encode(sentences, show_progress_bar=True)
                
                # reduce dimensionality of all embeddings using umap model
                umap_model = umap.UMAP(
                    n_neighbors=self.n_neighbors, n_components=self.n_components,
                    min_dist=self.min_dist, metric=self.umap_metric,random_state=self.random_state
                ).fit(embeddings)
                umap_embeddings = umap_model.transform(embeddings)

                # cluster documents using HDBSCAN
                cluster_model = hdbscan.HDBSCAN(
                    min_cluster_size=self.min_cluster_size, metric=self.cluster_metric,
                    cluster_selection_method=self.cluster_selection_method, 
                    prediction_data=self.prediction_data
                ).fit(umap_embeddings)
                
                # get cluster labels
                labels = cluster_model.labels_
                
                # generate label_docs dataframe
                label_docs_df = self.get_label_docs_df(sentences, labels)
               
                # calculate word importance per topic
                tf_idf, cv = self.c_tf_idf(label_docs_df.doc.values, m=len(sentences))

                self.k = len(np.unique(labels))
                self.labels = labels
                self.dictionary = dictionary
                self.corpus = corpus
                self.sentences = sentences
                self.token_lists = token_lists
                self.cluster_model = cluster_model
                self.umap_model = umap_model
                self.embeddings = embeddings
                self.umap_embeddings = umap_embeddings
                self.cv = cv
                self.tf_idf = tf_idf
                self.feature_names = cv.get_feature_names()

            else:
                raise Exception('method does not exist')
        except Exception:
            logging.error("exception occurred", exc_info=True)
Example No. 15
                string, convert_to_tensor=True).unsqueeze(0).cpu()
        output.append(features[string])
    return torch.cat(output).to(device), features


if __name__ == '__main__':
    device = 'cuda'
    if os.path.isdir('/media/palm/BiggerData/dictionaries/'):
        root_data = '/media/palm/BiggerData/dictionaries/'
    elif os.path.isdir('/home/palm/PycharmProjects/cp/cp10-work/'):
        root_data = '/home/palm/PycharmProjects/cp'
    elif os.path.isdir('/home/palm/PycharmProjects/nlp/cp10-work'):
        root_data = '/home/palm/PycharmProjects/nlp/'
    else:
        raise ValueError('Well, something\'s wrong here')
    eng_sm = SentenceTransformer(os.path.join(root_data, 'cp10-work'))
    eng_sm.requires_grad_(False)
    eng_sm.train(False)

    embeddings = copy.deepcopy(
        eng_sm._first_module().auto_model.embeddings).to(device)
    embeddings.requires_grad_(True)
    embeddings.train(True)
    dataset = SentenceTokenized(eng_sm.tokenizer,
                                'first',
                                language='eng',
                                true_only=True)

    model = AEPretrainedEmbedding(dataset.vocab_size, embeddings)
    model.to(device)
Example No. 16
import os
from pprint import pprint
from flask import Flask, render_template, jsonify, request
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer
import codecs, json
import numpy as np
#import gluonnlp as nlp
#import mxnet as mx

SEARCH_SIZE = 10
INDEX_NAME = os.environ['INDEX_NAME']
model = "roberta-base-nli-stsb-mean-tokens"
embedder = SentenceTransformer(model)

#model, vocab = nlp.model.get_model('roberta_12_768_12', dataset_name='openwebtext_ccnews_stories_books_cased', use_decoder=False);

#tokenizer = nlp.data.GPT2BPETokenizer();

app = Flask(__name__)


@app.route('/')
def index():
    return render_template('index.html')


@app.route('/search')
def analyzer():
    client = Elasticsearch('elasticsearch:9200')
Example No. 17
class SemanticEngine:
    def __init__(self, text_df: pd.DataFrame) -> None:
        """
        Args:
            text_df (pd.DataFrame): pandas dataframe with fields: ts, text
        """
        self.model = SentenceTransformer("paraphrase-distilroberta-base-v1")
        self.text_df = text_df.to_numpy()
        self.embeddings = None

    def load_embeddings(self, path) -> None:
        """ load embeddings from pickle file """
        with open(path, "rb") as file:
            self.embeddings = pickle.load(file)

    def save_embeddings(self, path) -> None:
        """ save embeddings to pickle file """
        with open(path, "wb") as file:
            pickle.dump(self.embeddings, file)

    def calc_embeddings(self, corpus: List[str]):
        """ calculate new embeddings """
        if len(corpus) == 0:
            raise ValueError("corpus is empty")

        corpus_embeddings = self.model.encode(corpus,
                                              convert_to_tensor=True,
                                              show_progress_bar=False)
        self.embeddings = corpus_embeddings

    def get_top_k(self, query: str, k=5) -> List[Dict]:
        r"""Get k most similar to query sentences
        You need to call load_embeddings or calc_embeddings first to use this method
        Args:
            query (str): text for which you want to find similar sentences
            k (int, optional): number of sentences to find. Defaults to 5.

        Returns:
            List[Dict[float, str, float]]: List with dictionaries of the following structure:
            {
                ts: timestamp of message,
                score: cosin similarity score
                text: message text
            }
        Example 1: calculate embeddings, save them and get top 5 sentences :: 
            >>> df = pd.read_csv("data/prepared/edu_courses.tsv", sep="\t")
            >>> engine = SemanticEngine(text_df=df)
            >>> engine.calc_embeddings(df.text.tolist())
            >>> engine.save_embeddings("data/embeddings/edu_courses.pkl")
            >>> query = "посоветуйте каких-нибудь курсов по pytorch"
            >>> result = engine.get_top_k(query, k=5)
            >>> for res in result:
            ...     print(res["ts"], res["text"], res["score"], sep="\n")

        Example 2: load embeddings from file, and get top 5 sentences
            >>> df = pd.read_csv("data/prepared/edu_courses.tsv", sep="\t")
            >>> engine = SemanticEngine(text_df=df)
            >>> engine.load_embeddings("data/embeddings/edu_courses.pkl")
            >>> query = "посоветуйте каких-нибудь курсов по pytorch"
            >>> result = engine.get_top_k(query, k=5)
            >>> for res in result:
            ...     print(res["ts"], res["text"], res["score"], sep="\n")
        """
        if self.embeddings is None:
            raise ValueError(
                "embeddings are not initialized. Call `load_embeddings` or `calc_embeddings` first"
            )
        if k > len(self.embeddings):
            warnings.warn(f"""`k` with value of {k} is bigger than the number of
                sentences, which is {len(self.embeddings)}.
                Value of k is set to {len(self.embeddings)}
                """)
            k = len(self.embeddings)

        query_embedding = self.model.encode([query],
                                            convert_to_tensor=True,
                                            show_progress_bar=False)
        hits = util.semantic_search(query_embedding, self.embeddings, top_k=k)
        hits = hits[0]
        result = [{
            "ts": str(self.text_df[hit["corpus_id"]][0]),
            "score": str(hit["score"]),
            "text": self.text_df[hit["corpus_id"]][1],
        } for hit in hits]
        return result
Example No. 18
def load_my_model():
    model = SentenceTransformer('distilbert-base-nli-mean-tokens')
    return model
Example No. 19
                    type=str,
                    help="path where pretrained part was stored")
parser.add_argument("-p",
                    "--prefix",
                    required=True,
                    type=str,
                    help="prefix to output files")
parser.add_argument("-n",
                    "--n",
                    default=16,
                    type=int,
                    help="number of predictors used")

args = parser.parse_args()

bert = SentenceTransformer('bert-base-nli-mean-tokens')
bert_size = 768

loaded_model = HiddenLabelPredictorModel(bert, bert_size, args.n)
loaded_model.load_state_dict(torch.load(args.model))

descriptions = []
description_embeddings = {}
UIs = []
UI_embedding = []
screen_names = []

trace_to_index = {}

i = 0
for package_dir in os.listdir(args.dataset):
Example No. 20
from sentence_transformers import SentenceTransformer
import scipy.spatial
import numpy as np
import PrepareData

embedder = SentenceTransformer(
    'output/training_stsbenchmark_bert-base-uncased-2020-06-19_19-51-26')

corpus = PrepareData.load_data()

# Corpus with example sentences
corpus_embeddings = []
for document in corpus:
    sentences_embeddings = embedder.encode(document)
    sentences_embeddings = np.array(sentences_embeddings)
    document_embedding = np.mean(sentences_embeddings, axis=0)
    corpus_embeddings.append(document_embedding)

# Query sentences:
#
#similarity_matrix = []
#for first_doc in corpus_embeddings:
#    similarity_vector = []
#    for second_doc in corpus_embeddings:
#        similarity_vector.append(1 - scipy.spatial.distance.cosine(first_doc, second_doc))
#    similarity_matrix.append(similarity_vector)
#
#similarity_matrix = np.array(similarity_matrix)
#print(similarity_matrix)

# Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
# print("f1 score:",f1_score(y_dev, pred, average='weighted'))
# print("acc:",accuracy_score(y_dev, pred))
print(classification_report(y_dev, pred))

clf = BernoulliNB()
clf.fit(X_train, y_train)
pred = clf.predict(X_dev)
print("TFIDF vectorization + NB:")
# print("f1 score:",f1_score(y_dev, pred, average='weighted'))
# print("acc:",accuracy_score(y_dev, pred))
print(classification_report(y_dev, pred))
'''
multilingual BERT model loading and embedding generation
'''
test = pd.read_csv('../Dravidian-CodeMix/tamil_test.csv')
model = SentenceTransformer('distiluse-base-multilingual-cased',
                            device='cuda:1')
X_train = model.encode(X_train_ori, batch_size=20, show_progress_bar=True)
X_dev = model.encode(X_dev_ori, batch_size=20, show_progress_bar=True)
X_test = model.encode(X_test_ori, batch_size=20, show_progress_bar=True)

clf = MLPClassifier(hidden_layer_sizes=(512, ), max_iter=25)
clf.fit(X_train, y_train)
pred = clf.predict(X_dev)
print("BERT + MLP:")
# print("f1 score:",f1_score(y_dev, pred, average='weighted'))
# print("acc:",accuracy_score(y_dev, pred))
print(classification_report(y_dev, pred))
'''
Loading Tamil specific pretrained fastText model
'''
from pymagnitude import *
Example No. 22
class Recommender():
    def __init__(self,
                 db_path,
                 pretrained_model='stsb-roberta-large',
                 no_cuda=True):

        self.device = "cuda" if torch.cuda.is_available(
        ) and not no_cuda else "cpu"
        # self.device = 'cpu'
        self.db_path = db_path
        self.pretrained_model = pretrained_model

        self.load_model()
        self.load_db()

    def load_model(self):
        """
        Load the SentenceTransformer model
        base ond
        :return:
        """

        print(f"SentenceTransformer for model {self.pretrained_model}")
        self.model = SentenceTransformer(self.pretrained_model,
                                         device=self.device)

    def load_db(self):
        self.conn = sqlite3.connect(self.db_path)

        df_chapter = pd.read_sql('select * from chapter', self.conn)
        df_chapter.sort_values(['chapter_number'], inplace=True)

        df_section = pd.read_sql('select * from section', self.conn)
        df_section.sort_values(['chapter_number', 'section_number'],
                               inplace=True)

        df_text = pd.read_sql('select * from text', self.conn)
        df_text.sort_values(['chapter_number', 'section_number', 'id'],
                            inplace=True)

        df_text = pd.merge(df_text,
                           df_chapter.drop(['id'], axis=1),
                           how='left',
                           on='chapter_number')
        self.df_text = pd.merge(df_text,
                                df_section.drop(['id'], axis=1),
                                how='left',
                                on=['chapter_number', 'section_number'])

        print(f"DB data text table loaded with shape {self.df_text.shape}")

    # predict method from run_pplm_discrim_train.py
    def match(self, input_text, source_tradition, top_labels):

        print(f"Finding closest passage for {input_text}")

        # get the candidate sources
        candidate_text = self.df_text[
            self.df_text['chapter_name'].isin(top_labels)
            & self.df_text['source_tradition'].isin(
                source_tradition)]['source_text'].tolist()

        embedding1 = self.model.encode(input_text, convert_to_tensor=True)
        embedding2 = self.model.encode(candidate_text, convert_to_tensor=True)

        # compute similarity scores of two embeddings
        cosine_scores = util.pytorch_cos_sim(embedding1, embedding2)
        max_match_text = ''
        max_sim = 0.0
        for i in range(cosine_scores.shape[0]):  # embedding1 is a single text, so this is a single row
            for j in range(len(candidate_text)):
                sim = cosine_scores[i][j].item()
                if sim > max_sim:
                    max_match_text = candidate_text[j]
                    max_sim = sim
                    # print(f"New best match: {sim}: {max_match_text}")

        source = self.df_text[self.df_text['source_text'] ==
                              max_match_text]['source_location'].item()

        return f"{source_tradition[0]}, {source}: {max_match_text}"
Example No. 23
from pathlib import Path
from sentence_transformers import SentenceTransformer
import scipy

DFILE = "spanish"
DTYPE = "test"

print("Running for %s / %s" % (DTYPE, DFILE))

print("Loading BERT model")
model = SentenceTransformer(
    '/data/wordembeddings/BERT/bert-large-nli-stsb-mean-tokens')

print("Begin annotation")

with Path("my/sentences/%s_%s_sentences.txt" %
          (DTYPE, DFILE)).open('r') as reader:
    lines = (line.strip().split('\t') for line in reader)
    sentences = dict((int(number), sentence) for number, sentence in lines)

lnum = 0
with Path("my/sentences/%s_%s_pairs_dist.txt" %
          (DTYPE, DFILE)).open('w') as writer:
    with Path("my/sentences/%s_%s_pairs.txt" %
              (DTYPE, DFILE)).open('r') as reader:
        lines = (line.strip().split('\t') for line in reader)
        for n1, n2 in lines:
            lnum += 1
            print("Line %d" % (lnum))
            n1 = int(n1)
            n2 = int(n2)
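            # The loop body is truncated here; a hedged sketch of how each pair might be scored:
            # encode both sentences and write their cosine distance to the output file.
            from scipy.spatial.distance import cosine  # bare `import scipy` does not expose scipy.spatial
            emb1, emb2 = model.encode([sentences[n1], sentences[n2]])
            writer.write("%d\t%d\t%f\n" % (n1, n2, cosine(emb1, emb2)))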
Example No. 24
def generate_embeddings(docs,
                        batch_size,
                        model_name='bert-base-cased',
                        pooling='mean',
                        offset=0):
    """
    Generator function for generating embeddings from strings using a flair or sentence-transformers model.
    Takes a list of strings and yields tuples. The first element signals failure (0) or success (1 or 2), and
    the second element contains a list of embeddings as numpy arrays if successful, or the indices of the
    failed batch if unsuccessful.
    The first element is 1 if batch_size embeddings were created.
    :param docs: a list of strings for which embeddings should be created
    :param batch_size: integer representing how many embeddings should be created at once
    :param model_name: the model for creating the embeddings. Defaults to document embeddings using BERT-Base
    :param pooling: the pooling strategy to generate Document Embeddings
    :param offset: the offset of the integers, for printing out the correct index
    :return: a tuple (success/failure, embeddings/failed_indices)
    """
    rest = len(docs) % batch_size
    model = False
    if pooling == 'mean':
        embedding = TransformerWordEmbeddings(model_name,
                                              layers='-1',
                                              allow_long_sentences=True)
        model = DocumentPoolEmbeddings([embedding], fine_tune_mode='none')
    elif pooling == 'CLS':
        model = TransformerDocumentEmbeddings(model_name)
    if model:
        for i in range(0, len(docs) - rest, batch_size):
            sentences = [
                Sentence(sentence) for sentence in docs[i:i + batch_size]
            ]
            try:
                model.embed(sentences)
                print(
                    f'successfully embedded sentences {offset + i} to {offset + i + batch_size-1}'
                )
                yield 1, [
                    sentence.get_embedding().detach().cpu().numpy()
                    for sentence in sentences
                ]
            except RuntimeError:
                print(
                    f'could not embed sentences with index {offset + i} '
                    f'to {offset + i + batch_size-1}\nstoring in failed index list'
                )
                yield 0, (offset + i, offset + i + batch_size - 1)
        if rest:
            sentences = [Sentence(sentence) for sentence in docs[-rest:]]
            try:
                model.embed(sentences)
                print(
                    f'successfully embedded sentences from {len(docs) + offset - rest} to the end'
                )
                yield 1, [
                    sentence.get_embedding().detach().cpu().numpy()
                    for sentence in sentences
                ]
            except RuntimeError:
                yield 0, (len(docs) - rest, 0)
    elif pooling == 'SentenceBert':
        model = SentenceTransformer(model_name)
        for i in range(0, len(docs) - rest, batch_size):
            try:
                embeddings = model.encode(docs[i:i + batch_size])
                print(
                    f'successfully embedded sentences {offset + i} to {offset + i + batch_size-1}'
                )
                yield 1, embeddings
            except RuntimeError:
                print(
                    f'could not embed sentences with index {offset + i} '
                    f'to {offset + i + batch_size-1}\nstoring in failed index list'
                )
                yield 0, (offset + i, offset + i + batch_size - 1)
        if rest:
            try:
                embeddings = model.encode(docs[-rest:])
                print(
                    f'successfully embedded sentences from {len(docs) + offset - rest} to the end'
                )
                yield 1, embeddings
            except RuntimeError:
                yield 0, (len(docs) - rest, 0)
    else:
        raise Exception("No Valid model")
Example No. 25
train_batch_size = 16
num_epochs = 4
model_save_path = 'output/training_stsbenchmark_' + model_name.replace(
    "/", "-") + '-' + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
word_embedding_model = models.Transformer(model_name)

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark train dataset")

train_samples = []
dev_samples = []
test_samples = []
with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        score = float(row['score']) / 5.0  # Normalize score to range 0 ... 1
        inp_example = InputExample(texts=[row['sentence1'], row['sentence2']],
                                   label=score)

        if row['split'] == 'dev':
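            # (truncated in the original; the usual continuation fills the dev/test/train sample lists)
            dev_samples.append(inp_example)
        elif row['split'] == 'test':
            test_samples.append(inp_example)
        else:
            train_samples.append(inp_example)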
Example No. 26
#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

#You can specify any huggingface/transformers pre-trained model here, for example, bert-base-uncased, roberta-base, xlm-roberta-base
# model_name = sys.argv[1] if len(sys.argv) > 1 else 'bert-base-uncased'
set_seed(args)
# Read the dataset
train_batch_size = args.batch_size
model_save_path = args.model_path

model = SentenceTransformer(model_save_path)

folder = '../datasets/temp-sts/STS-data'
#'STS2012-gold','STS2013-gold','STS2014-gold','STS2015-gold',
names = [
    'STS2012-gold', 'STS2013-gold', 'STS2014-gold', 'STS2015-gold',
    'STS2016-gold', 'SICK-data'
]

for name in names:

    sts_reader = STSDataReader(os.path.join(folder, name))
    test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
        sts_reader.get_examples('all.tsv'),
        batch_size=train_batch_size,
        name=name + '-test')
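    # (truncated in the original; the evaluator would typically be run on the loaded model)
    test_evaluator(model, output_path=model_save_path)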
Example No. 27
 def __init__(self, model_name: str):
     self.encoder_model = SentenceTransformer(model_name)
     self.classifier = None
Example No. 28
 def __init__(self, knowledge_index):
     self.knowledge_vecs = knowledge_index["knowledge_vecs"]
     self.model = SentenceTransformer('bert-base-nli-stsb-mean-tokens')
     self.threshold = 0.35
    sorted_df["Topic"] = sorted_df["Topic"].apply(str)
    sorted_df["That"] = sorted_df["That"].apply(str)
    sorted_df["Template"] = sorted_df["Template"].apply(str)
    
    # Sort by topic
    sorted_df = sorted_df.sort_values(by=['Topic'])
    
    # print(sorted_df.info())
    # print(sorted_df.head())
    # print(sorted_df['Topic'].value_counts())

    train, test = train_test_split(sorted_df, stratify=sorted_df['Topic'])
    test, val = train_test_split(test, stratify=test['Topic'])

    print("Getting the bert-base-nli-mean-tokens model.")
    model = SentenceTransformer("bert-base-nli-mean-tokens")

    print("Read AIML QA dataset")
    train_dataloader = DataLoader(train, shuffle=True, batch_size=train_batch_size)
    print("Calculate loss")
    train_loss = losses.CosineSimilarityLoss(model=model)
    print("Create evaluator")
    evaluator = EmbeddingSimilarityEvaluator.from_input_examples(val)

    # Train the model
    warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) #10% of train data for warm-up
    print("training the model...")
    model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
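          warmup_steps=warmup_steps)  # closing the truncated fit(...) call with the warm-up steps computed above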
Example No. 30
    def __init__(
        self,
        document_store: BaseDocumentStore,
        embedding_model: str,
        use_gpu: bool = True,
        model_format: str = "farm",
        pooling_strategy: str = "reduce_mean",
        emb_extraction_layer: int = -1,
    ):
        """
        :param document_store: An instance of DocumentStore from which to retrieve documents.
        :param embedding_model: Local path or name of model in Hugging Face's model hub such as ``'deepset/sentence_bert'``
        :param use_gpu: Whether to use gpu or not
        :param model_format: Name of framework that was used for saving the model. Options:

                             - ``'farm'``
                             - ``'transformers'``
                             - ``'sentence_transformers'``
        :param pooling_strategy: Strategy for combining the embeddings from the model (for farm / transformers models only).
                                 Options:

                                 - ``'cls_token'`` (sentence vector)
                                 - ``'reduce_mean'`` (sentence vector)
                                 - ``'reduce_max'`` (sentence vector)
                                 - ``'per_token'`` (individual token vectors)
        :param emb_extraction_layer: Number of layer from which the embeddings shall be extracted (for farm / transformers models only).
                                     Default: -1 (very last layer).
        """
        self.document_store = document_store
        self.model_format = model_format
        self.pooling_strategy = pooling_strategy
        self.emb_extraction_layer = emb_extraction_layer

        logger.info(f"Init retriever using embeddings of model {embedding_model}")
        if model_format == "farm" or model_format == "transformers":
            self.embedding_model = Inferencer.load(
                embedding_model, task_type="embeddings", extraction_strategy=self.pooling_strategy,
                extraction_layer=self.emb_extraction_layer, gpu=use_gpu, batch_size=4, max_seq_len=512, num_processes=0
            )
            # Check that document_store has the right similarity function
            similarity = document_store.similarity
            # If we are using a sentence transformer model
            if "sentence" in embedding_model.lower() and similarity != "cosine":
                logger.warning(f"You seem to be using a Sentence Transformer with the {similarity} function. "
                               f"We recommend using cosine instead. "
                               f"This can be set when initializing the DocumentStore")
            elif "dpr" in embedding_model.lower() and similarity != "dot_product":
                logger.warning(f"You seem to be using a DPR model with the {similarity} function. "
                               f"We recommend using dot_product instead. "
                               f"This can be set when initializing the DocumentStore")


        elif model_format == "sentence_transformers":
            try:
                from sentence_transformers import SentenceTransformer
            except ImportError:
                raise ImportError("Can't find package `sentence-transformers` \n"
                                  "You can install it via `pip install sentence-transformers` \n"
                                  "For details see https://github.com/UKPLab/sentence-transformers ")
            # pretrained embedding models coming from: https://github.com/UKPLab/sentence-transformers#pretrained-models
            # e.g. 'roberta-base-nli-stsb-mean-tokens'
            if use_gpu:
                device = "cuda"
            else:
                device = "cpu"
            self.embedding_model = SentenceTransformer(embedding_model, device=device)
            if document_store.similarity != "cosine":
                logger.warning(
                    f"You are using a Sentence Transformer with the {document_store.similarity} function. "
                    f"We recommend using cosine instead. "
                    f"This can be set when initializing the DocumentStore")
        else:
            raise NotImplementedError
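A hedged usage sketch of this retriever; the enclosing class name is not shown in the excerpt, so EmbeddingRetriever and InMemoryDocumentStore below are illustrative stand-ins rather than names taken from the code above.

# Hypothetical usage; class and document-store names are illustrative only.
document_store = InMemoryDocumentStore(similarity="cosine")
retriever = EmbeddingRetriever(document_store=document_store,
                               embedding_model="deepset/sentence_bert",
                               model_format="sentence_transformers",
                               use_gpu=False)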