def evaluate_ce(num_labels, dataset_name, evalset_name, evaluator, threshold):
    """Evaluate a saved cross-encoder on a tab-separated evaluation file.

    The model is loaded from ``'ssce_save/fsce/' + dataset_name`` and the
    evaluator is asked to write its output to that same directory.

    Args:
        num_labels: Forwarded to ``CrossEncoder`` as ``num_labels``.
        dataset_name: Suffix appended to ``'ssce_save/fsce/'`` to locate the
            saved model (and the evaluator output directory).
        evalset_name: Path of the evaluation file. Each usable line is
            ``sentence_a<TAB>sentence_b<TAB>integer_label``; lines with fewer
            than three fields or a non-integer label are skipped.
        evaluator: ``'accuracy'`` (``CEBinaryAccuracyEvaluator``) or
            ``'classification'`` (``CEBinaryClassificationEvaluator``).
        threshold: Forwarded to ``CEBinaryAccuracyEvaluator``; not used by
            the ``'classification'`` evaluator.

    Raises:
        ValueError: If ``evaluator`` is not one of the two supported names.
    """
    # Fail fast: previously an unknown name surfaced only after the model was
    # loaded and the file parsed, as an unbound-variable error.
    if evaluator not in ('accuracy', 'classification'):
        raise ValueError(
            "evaluator must be 'accuracy' or 'classification', got %r"
            % (evaluator,))

    model_dir = 'ssce_save/fsce/' + dataset_name
    model = CrossEncoder(model_dir, num_labels=num_labels)

    OOD_sentence_pairs = []
    OOD_labels = []
    with open(evalset_name, 'r') as r:
        for line in r:
            pair = line.strip('\n').split('\t')
            try:
                sentence_pair = [pair[0], pair[1]]
                label = int(pair[2])
            except (IndexError, ValueError):
                # Malformed line (missing field or non-integer label): skip
                # it, but let unrelated errors propagate.
                continue
            OOD_sentence_pairs.append(sentence_pair)
            OOD_labels.append(label)

    if evaluator == 'accuracy':
        OOD_evaluator = CEBinaryAccuracyEvaluator(
            OOD_sentence_pairs, OOD_labels, threshold=threshold)
    else:
        OOD_evaluator = CEBinaryClassificationEvaluator(
            OOD_sentence_pairs, OOD_labels)

    OOD_evaluator(model=model, output_path=model_dir)
def __init__(self, root_dir='.'):
    """Load models, preprocess text, precompute embeddings.

    Args:
        root_dir: Directory whose entries (in sorted name order) are each
            read as one text document.

    Sets ``self.qa``, ``self.sum``, ``self.text_encoder`` and
    ``self.pair_encoder`` (the loaded models), ``self.entries`` (passages of
    up to three consecutive sentences) and ``self.passage_embeddings``.
    """
    self.root_dir = root_dir

    # Load language models
    self.qa = pipeline('question-answering')
    self.sum = pipeline('summarization')
    self.text_encoder = SentenceTransformer('msmarco-distilbert-base-v2')
    self.pair_encoder = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-6')

    # Load list of entries; the context manager closes every file handle
    # instead of leaving them to the garbage collector.
    documents = []
    for file_name in sorted(os.listdir(root_dir)):
        with open(self.root_dir + '/' + file_name) as handle:
            documents.append(handle.read())

    # Tokenize entries into sentences
    sentences_per_entry = [sent_tokenize(doc.strip()) for doc in documents]

    # Merge each 3 consecutive sentences into one passage. Slicing already
    # clamps at the end of the list, so the last passage may be shorter.
    self.entries = [
        ' '.join(sentences[start_idx:start_idx + 3])
        for sentences in sentences_per_entry
        for start_idx in range(0, len(sentences), 3)
    ]

    # Pre-compute passage embeddings
    self.passage_embeddings = self.text_encoder.encode(
        self.entries, show_progress_bar=True)
def test_train_stsb(self):
    """Fine-tune ``distilroberta-base`` on STSb for one epoch and evaluate it.

    The value 75 is handed to ``self.evaluate_stsb_test`` together with the
    trained model (presumably the score the model has to reach; confirm
    against that helper).
    """
    model = CrossEncoder('distilroberta-base', num_labels=1)
    train_dataloader = DataLoader(
        self.stsb_train_samples, shuffle=True, batch_size=16)

    # Warm up over the first 10% of the batches of the single epoch.
    warmup_steps = int(len(train_dataloader) * 0.1)
    model.fit(
        train_dataloader=train_dataloader,
        epochs=1,
        warmup_steps=warmup_steps,
    )

    self.evaluate_stsb_test(model, 75)
def stratifiedkfoldtest(data):
    """Fine-tune and score the STSb cross-encoder with stratified 10-fold CV.

    ``data`` is shuffled with a fixed seed and split into 10 folds stratified
    on ``data['label']``. For every fold a fresh
    ``cross-encoder/stsb-roberta-base`` model is fine-tuned on the training
    split; the held-out pairs are scored with ``sts_sim`` and the threshold
    giving the best macro F1 (as reported by ``f1_macro``) is used to compute
    accuracy. Per-fold and averaged results are printed; nothing is returned.

    Args:
        data: DataFrame with ``query_p``, ``citation_p`` and ``label``
            columns (``label`` is cast to ``int`` for training).
    """
    import torch

    data = data.sample(frac=1, random_state=1).reset_index(drop=True)
    skf = StratifiedKFold(n_splits=10)
    splits = list(skf.split(data, data['label']))

    # The memory report is informational only; querying device 0 raises on a
    # CPU-only host, so it is skipped when CUDA is unavailable.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        t = torch.cuda.get_device_properties(0).total_memory
        r = torch.cuda.memory_reserved(0)
        a = torch.cuda.memory_allocated(0)
        f = r - a  # free inside reserved
        print(f"Total:{t/1e+9}, Reserved:{r}, Allocated:{a}, Free:{f}")

    # Single-value grid: batch size, learning rate, epochs.
    for b in [24]:
        for l in [2e-5]:
            for e in [4]:
                # Collected per configuration so the averages below describe
                # exactly the folds of this (b, l, e) setting.
                f1list = []
                acclist = []
                for train_index, test_index in splits:
                    # resetting the model for every fold
                    model = CrossEncoder('cross-encoder/stsb-roberta-base',
                                         num_labels=1)
                    train = data.loc[train_index]
                    test = data.loc[test_index]

                    # training data loader
                    train_ = SentencesDataset(
                        [InputExample(texts=[d['query_p'], d['citation_p']],
                                      label=int(d['label']))
                         for _, d in train.iterrows()],
                        model)
                    train_ = DataLoader(train_, batch_size=b)

                    # training
                    model.fit(train_, epochs=e, optimizer_params={'lr': l})

                    # predictions using encoder similarity
                    y = test['label']
                    dlist = list(test.apply(
                        lambda d: (d['query_p'], d['citation_p']), axis=1))
                    yh = np.asarray(sts_sim(dlist, model))

                    # f1: NaN never compares equal to itself, so membership
                    # tests cannot find it; use isnan and the NaN-aware max
                    # to stay consistent with nanargmax below.
                    f1scores, thresholds = f1_macro(y, yh)
                    print(np.isnan(f1scores).any())
                    f1 = np.nanmax(f1scores)
                    f1list.append(f1)
                    print(f1)

                    # accuracy at the best-F1 threshold
                    mthres = thresholds[np.nanargmax(f1scores)]
                    yh1 = np.zeros(len(yh))
                    yh1[yh >= mthres] = 1
                    f12 = metrics.f1_score(y, yh1, average='macro')
                    if not np.isclose(f12, f1):
                        # Sanity check only: report the mismatch instead of
                        # dropping into an interactive debugger.
                        print("WARNING: macro F1 recomputed at the best "
                              "threshold (%s) differs from the sweep value "
                              "(%s)" % (f12, f1))
                    acc = metrics.accuracy_score(y, yh1)
                    print(acc)
                    acclist.append(acc)

                print(b, l, e)
                print("Average Macro F1 across folds:", np.mean(f1list))
                print("Average Acc across folds:", np.mean(acclist))
def kfoldtest(data):
    """Fine-tune and score the STSb cross-encoder with 100-fold CV.

    ``data`` is shuffled with a fixed seed and split into 100 folds. For
    every fold a fresh ``cross-encoder/stsb-roberta-base`` model is
    fine-tuned on the training split and the held-out pairs are scored with
    ``sts_sim``. Labels and scores are pooled across folds; macro F1 and
    accuracy are then computed on the pooled predictions at the threshold
    that maximises macro F1. Results are printed; nothing is returned.

    NOTE(review): the original source was flattened, so whether the metrics
    were computed once on the pooled predictions (as here) or after every
    fold on the running pool could not be read off the code; confirm.

    Args:
        data: DataFrame with ``query_p``, ``citation_p`` and ``label``
            columns (``label`` is cast to ``int`` for training).
    """
    import torch

    data = data.sample(frac=1, random_state=1).reset_index(drop=True)
    skf = KFold(n_splits=100)
    splits = list(skf.split(data))
    print(torch.cuda.is_available())

    # Single-value grid: batch size, learning rate, epochs.
    for b in [20]:
        for l in [2e-5]:
            for e in [4]:
                f1list = []
                acclist = []
                yh = np.array([])
                y = np.array([])
                for i, (train_index, test_index) in enumerate(splits, start=1):
                    print(f"Fold {i}")
                    # resetting the model for every fold
                    model = CrossEncoder('cross-encoder/stsb-roberta-base',
                                         num_labels=1)
                    train = data.loc[train_index]
                    test = data.loc[test_index]

                    # training data loader
                    train_ = SentencesDataset(
                        [InputExample(texts=[d['query_p'], d['citation_p']],
                                      label=int(d['label']))
                         for _, d in train.iterrows()],
                        model)
                    train_ = DataLoader(train_, batch_size=b)

                    # training
                    model.fit(train_, epochs=e, optimizer_params={'lr': l})

                    # pool this fold's labels and similarity scores
                    y = np.append(y, test['label'])
                    dlist = list(test.apply(
                        lambda d: (d['query_p'], d['citation_p']), axis=1))
                    yh = np.append(yh, sts_sim(dlist, model))

                # f1: NaN never compares equal to itself, so membership tests
                # cannot find it; use isnan and the NaN-aware max to stay
                # consistent with nanargmax below.
                f1scores, thresholds = f1_macro(y, yh)
                print(np.isnan(f1scores).any())
                f1 = np.nanmax(f1scores)
                f1list.append(f1)
                print(f1)

                # accuracy at the best-F1 threshold
                mthres = thresholds[np.nanargmax(f1scores)]
                yh1 = np.zeros(len(yh))
                yh1[yh >= mthres] = 1
                f12 = metrics.f1_score(y, yh1, average='macro')
                if not np.isclose(f12, f1):
                    # Sanity check only: report the mismatch instead of
                    # dropping into an interactive debugger.
                    print("WARNING: macro F1 recomputed at the best "
                          "threshold (%s) differs from the sweep value (%s)"
                          % (f12, f1))
                acc = metrics.accuracy_score(y, yh1)
                print(acc)
                acclist.append(acc)

                print(b, l, e)
                print("BERT Fine-Tuned: Average F1 across folds:",
                      np.mean(f1list))
                print("BERT Fine-Tuned: Average Acc across folds:",
                      np.mean(acclist))
def __init__(
        self,
        pretrained_model_name_or_path='cross-encoder/ms-marco-MiniLM-L-2-v2',
        max_length=512,
        device=None,
        use_amp=False):
    """Create the underlying ``CrossEncoder``.

    Args:
        pretrained_model_name_or_path: Model identifier or path handed to
            ``CrossEncoder``.
        max_length: Forwarded to ``CrossEncoder`` as ``max_length``.
        device: Device string; when falsy, ``'cuda'`` is chosen if
            ``torch.cuda.is_available()`` and ``'cpu'`` otherwise.
        use_amp: Stored on ``self.use_amp``.
    """
    if not device:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

    self.use_amp = use_amp
    self.model = CrossEncoder(
        pretrained_model_name_or_path,
        max_length=max_length,
        device=device,
    )
def get_sbert_mostsimilar_crossencoder(vecs, query, query_vec, num_responses):
    """Re-rank the 50 nearest candidates with a cross-encoder.

    Candidates come from ``most_sim_cos(vecs, query_vec, 50)``; each one is
    paired with ``query`` and scored by ``cross-encoder/stsb-roberta-base``.

    Args:
        vecs: Candidate vectors, forwarded to ``most_sim_cos``.
        query: Query text paired with every candidate.
        query_vec: Query vector, forwarded to ``most_sim_cos``.
        num_responses: Maximum number of results to return.

    Returns:
        ``(scores, candidates)``: two lists of equal length, ordered by
        descending cross-encoder score. Both are empty when the first stage
        yields no candidates.
    """
    from sentence_transformers import CrossEncoder

    # The first-stage distances are not needed once candidates are re-scored.
    _, most_similar = most_sim_cos(vecs, query_vec, 50)
    if len(most_similar) == 0:
        # Nothing to score; unpacking zip(*[]) below would raise.
        return [], []

    # NOTE: the model is loaded on every call; callers on a hot path may want
    # to cache it.
    model = CrossEncoder('cross-encoder/stsb-roberta-base')
    cross_inp = [[query, candidate] for candidate in most_similar]
    cross_scores = model.predict(cross_inp)

    # Sort on the score alone so tied scores never fall back to comparing the
    # candidates themselves (which need not be orderable).
    ranked = sorted(zip(cross_scores, most_similar),
                    key=lambda pair: pair[0], reverse=True)
    top = ranked[:num_responses]
    return [score for score, _ in top], [candidate for _, candidate in top]
class SentenceTransformersReranker(Reranker):
    """Reranker that scores (query, text) pairs with a ``CrossEncoder``."""

    def __init__(
            self,
            pretrained_model_name_or_path='cross-encoder/ms-marco-MiniLM-L-2-v2',
            max_length=512,
            device=None,
            use_amp=False):
        """Create the underlying ``CrossEncoder``.

        Args:
            pretrained_model_name_or_path: Model identifier or path handed to
                ``CrossEncoder``.
            max_length: Forwarded to ``CrossEncoder`` as ``max_length``.
            device: Device string; when falsy, ``'cuda'`` is used if
                available and ``'cpu'`` otherwise.
            use_amp: Whether ``rescore`` runs prediction under autocast.
        """
        device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
        self.use_amp = use_amp
        self.model = CrossEncoder(pretrained_model_name_or_path,
                                  max_length=max_length,
                                  device=device)

    def rescore(self, query: Query, texts: List[Text]) -> List[Text]:
        """Return deep copies of ``texts`` with ``score`` set by the model.

        The input list and its elements are left untouched. An empty input
        yields an empty list without invoking the model.
        """
        texts = deepcopy(texts)
        if not texts:
            # Avoid calling predict() with no pairs; some sentence-transformers
            # versions inspect the first pair unconditionally.
            return texts

        pairs = [(query.text, text.text) for text in texts]
        with torch.cuda.amp.autocast(enabled=self.use_amp):
            scores = self.model.predict(pairs, show_progress_bar=False)

        for (text, score) in zip(texts, scores):
            text.score = score.item()
        return texts
def get_answers_from_query(request):
    """
    Uses information retrieval methods to answer a user query about the
    submitted text. Candidate sentences are scored against the query with a
    cross-encoder transformer.

    Input
    ----------
    request variable: Flask request variable containing the text for
    the article (form field 'text') and the question (form field 'query')

    Returns
    ----------
    A list with up to three sentences of the text, best score first, limited
    to those whose score is above 0.0. Empty when the text contains no
    candidate sentences.
    """
    text = request.form['text']
    text = check_for_url(text)

    # Questions inside the text are not answers; endswith() is also safe for
    # an empty string, unlike indexing the last character.
    sentences = [s for s in nltk.sent_tokenize(text) if not s.endswith("?")]
    query = request.form['query']

    if not sentences:
        # Nothing to score: skip loading the model and predicting on no input.
        return []

    # This uses a Cross Encoder variant of a transformer. It is designed to
    # return the most likely response given an input, i.e. it is designed for
    # question answering.
    model = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-2')
    model_inputs = [[query, passage] for passage in sentences]
    print(model_inputs)
    scores = model.predict(model_inputs)

    # Sort the scores in decreasing order
    results = [{
        'input': inp,
        'score': score
    } for inp, score in zip(model_inputs, scores)]
    results = sorted(results, key=lambda x: x['score'], reverse=True)

    answers = []
    print("Query:", query)
    for hit in results[0:3]:
        print("Score: {:.2f}".format(hit['score']), "\t", hit['input'][1],
              '\n')
        if hit['score'] > 0.0:
            answers.append(hit['input'][1])
    return answers
def __init__(self, ce_pretrained_model="stsb-roberta-large", ce_gpu_id=-1,
             **kargs):
    """Initialize ce model.

    Args:
        ce_pretrained_model: Name resolved through
            ``resources.get_transformers`` before being given to
            ``CrossEncoder``.
        ce_gpu_id: GPU index to run on; ``-1`` selects the CPU.
        **kargs: Accepted but not used in this method.
    """
    super(CESemanticSimilarityMetric, self).__init__()

    if ce_gpu_id != -1:
        logger.info("CE metric is running on GPU %d.", ce_gpu_id)
        device = "cuda:%d" % ce_gpu_id
    else:
        logger.warning("CE metric is running on CPU.")
        device = "cpu"

    logger.info("load ce model.")

    # TODO: use resources utils to manage model.
    model_location = resources.get_transformers(ce_pretrained_model)
    self._model = CrossEncoder(model_location, device=device)
def _get_relevant_comments_helper(comments, query, query_embedding,
                                  corpus_embeddings, top_k=10):
    """Retrieve comments for a query and re-rank them with a cross-encoder.

    Args:
        comments: Sequence indexed by the ``corpus_id`` of each hit.
        query: Query text paired with every retrieved comment.
        query_embedding: Query embedding for ``util.semantic_search``.
        corpus_embeddings: Comment embeddings for ``util.semantic_search``.
        top_k: Number of hits to retrieve and return (default 10, the value
            that used to be hard-coded).

    Returns:
        The hit dicts of the first query, each extended with a
        ``'cross-score'`` entry, sorted by that score in descending order.
        Empty when the search returns no hits.
    """
    hits = util.semantic_search(query_embedding, corpus_embeddings,
                                top_k=top_k)[0]
    if not hits:
        # Nothing to re-rank; avoid loading the model and predicting on [].
        return []

    cross_encoder = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-6')
    cross_inp = [[query, comments[hit['corpus_id']]] for hit in hits]
    cross_scores = cross_encoder.predict(cross_inp)

    for hit, cross_score in zip(hits, cross_scores):
        hit['cross-score'] = cross_score

    hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
    return hits[:top_k]
def __init__(self, hparams: HParams, dataset: Dataset):
    """Build the TF-IDF index, load the models and embed the dataset.

    Args:
        hparams: Supplies the model names ``nearest_neighbor_model_qq``,
            ``nearest_neighbor_model_qa``, ``binary_classifier_model_qq``
            and ``binary_classifier_model_qa``.
        dataset: Supplies ``questions`` and ``answers``.
    """
    super().__init__()
    self.hparams = hparams
    self.dataset = dataset

    # Lower-cased word tokens of every question, used for the TF-IDF index.
    tokenized_questions = []
    for question in self.dataset.questions:
        tokenized_questions.append(
            [token.lower() for token in word_tokenize(question)])

    self.dictionary = gensim.corpora.Dictionary(tokenized_questions)
    bow_corpus = [
        self.dictionary.doc2bow(tokens) for tokens in tokenized_questions
    ]

    # TF-IDF weighting and the similarity index built on top of it.
    self.tf_idf = gensim.models.TfidfModel(bow_corpus)
    self.sims = gensim.similarities.MatrixSimilarity(
        self.tf_idf[bow_corpus], num_features=len(self.dictionary))

    # Bi-encoders (question/question, question/answer) and cross-encoders.
    self.model_qq = SentenceTransformer(hparams.nearest_neighbor_model_qq)
    self.model_qa = SentenceTransformer(hparams.nearest_neighbor_model_qa)
    self.cross_encoder_qq = CrossEncoder(hparams.binary_classifier_model_qq)
    self.cross_encoder_qa = CrossEncoder(hparams.binary_classifier_model_qa)

    # Embeddings for all questions and answers of the dataset.
    self.embeddings_q = self.model_qq.encode(self.dataset.questions)
    self.embeddings_a = self.model_qa.encode(self.dataset.answers)
def bert(data):
    """Score pairs with the pretrained STSb cross-encoder and print metrics.

    No fine-tuning takes place: ``cross-encoder/stsb-roberta-base`` scores
    every ``(query_p, citation_p)`` pair via ``sts_sim``; macro F1 is taken
    at the best threshold reported by ``f1_macro`` and accuracy is computed
    at that same threshold. Results are printed; nothing is returned.

    Args:
        data: DataFrame with ``query_p``, ``citation_p`` and ``label``
            columns.
    """
    model = CrossEncoder('cross-encoder/stsb-roberta-base', num_labels=1)
    dlist = list(data.apply(lambda d: (d['query_p'], d['citation_p']),
                            axis=1))
    y = data['label']
    yh = np.asarray(sts_sim(dlist, model))

    # f1: NaN never compares equal to itself, so membership tests cannot find
    # it; use isnan and the NaN-aware max to stay consistent with nanargmax.
    f1scores, thresholds = f1_macro(y, yh)
    print(np.isnan(f1scores).any())
    f1 = np.nanmax(f1scores)

    # accuracy at the best-F1 threshold
    mthres = thresholds[np.nanargmax(f1scores)]
    yh1 = np.zeros(len(yh))
    yh1[yh >= mthres] = 1
    f12 = metrics.f1_score(y, yh1, average='macro')
    if not np.isclose(f12, f1):
        # Sanity check only: report the mismatch instead of dropping into an
        # interactive debugger.
        print("WARNING: macro F1 recomputed at the best threshold (%s) "
              "differs from the sweep value (%s)" % (f12, f1))
    acc = metrics.accuracy_score(y, yh1)
    print("BERT: Macro F1:", f1)
    print("BERT: Accuracy:", acc)
def print_cum_stats(run): run_results = evaluator.evaluate(run) map_scores = [v["map"] for k, v in run_results.items()] p_scores = [v["P_5"] for k, v in run_results.items()] ndcg_scores = [v['ndcg'] for k, v in run_results.items()] print("Aggregate results") print("Average MAP: ", np.mean(map_scores)) print("Average P_5: ", np.mean(p_scores)) print("Average NDCG: ", np.mean(ndcg_scores)) from sentence_transformers import CrossEncoder ranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2') # Base loss from sentence_transformers import SentencesDataset, losses from sentence_transformers.readers import InputExample examples = [] for topic in topics: gold = qrel[topic["number"]].items() query = topic["title"].strip() for item in gold: try: doc = db.lookup_docno(item[0]) examples.append(InputExample(texts=[query, doc], label=item[1]))
def test_pretrained_stsb(self):
    """Check the published STSb cross-encoder on the STSb test set.

    The value 87.92 is handed to ``self.evaluate_stsb_test`` with the model
    (presumably the expected score; confirm against that helper).
    """
    pretrained = CrossEncoder("cross-encoder/stsb-distilroberta-base")
    self.evaluate_stsb_test(pretrained, 87.92)
def topic_modelling(df, model):
    """Score every message against each aspect and pick the best aspect.

    One column per key of the module-level ``aspects`` mapping is added to
    ``df`` (in place) with the score of ``df.full_message`` against that
    aspect's description. ``best_aspect`` is the aspect holding the row
    maximum when that maximum exceeds 0.1, otherwise the string ``"None"``.
    On ties the aspect that comes last in ``aspects`` wins.

    Args:
        df: DataFrame with a ``full_message`` column.
        model: Object with a ``predict`` method used to score each
            (sentence, aspect description) pair.

    Returns:
        The DataFrame with the aspect columns and ``best_aspect``.
    """
    aspect_names = list(aspects)
    for aspect in aspect_names:
        # Score with the model passed in, not whatever global happens to be
        # named ``model``.
        df[aspect] = df.full_message.apply(
            lambda x, aspect=aspect: score_topic_sentence(
                x, aspect=aspect, scorer=model))

    df["best_aspect"] = "None"
    # A plain list of column names is an unambiguous column selector.
    df["max_score"] = df[aspect_names].max(axis=1)
    for aspect in aspect_names:
        df.loc[(df[aspect] == df["max_score"]) & (df["max_score"] > 0.1),
               "best_aspect"] = aspect
    df = df.drop(columns="max_score")
    return df


def score_topic_sentence(sentence, aspect="food", scorer=None):
    """Score one sentence against the description of ``aspect``.

    Args:
        sentence: Text to score.
        aspect: Key into the module-level ``aspects`` mapping.
        scorer: Object with a ``predict`` method. When omitted, the
            module-level ``model`` is used, as before.

    Returns:
        Whatever ``scorer.predict`` returns for the single
        ``(sentence, aspect_description)`` pair.
    """
    if scorer is None:
        scorer = model  # module-level model created in the __main__ block
    aspect_description = aspects[aspect]
    return scorer.predict((sentence, aspect_description))


if __name__ == "__main__":
    path = "data/evaluation/"
    file_name = "text_data.txt"
    df = read_dirty_test_file(path + file_name)

    # Scoring Reviews against each topic
    model = CrossEncoder('cross-encoder/stsb-roberta-base')
    df = topic_modelling(df, model)
    df.to_csv(path + "TEST_data_with_Topics.csv")
def semantic_answer_similarity(
    predictions: List[List[str]],
    gold_labels: List[List[str]],
    sas_model_name_or_path:
    str = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
) -> Tuple[List[float], List[float]]:
    """
    Scores predicted answers against gold labels with a Transformer model,
    giving a metric that is more meaningful than EM or F1.
    For every QA pair two values are produced:
    a) the best similarity between the top-1 prediction and any gold label
    b) the best similarity between any prediction and any gold label

    :param predictions: Predicted answers as list of multiple preds per question
    :param gold_labels: Labels as list of multiple possible answers per question
    :param sas_model_name_or_path: SentenceTransformers semantic textual similarity model, should be path or
                                   string pointing to downloadable models.
    :return top_1_sas, top_k_sas
    """
    assert len(predictions) == len(gold_labels)

    # A model whose config lists a *ForSequenceClassification architecture is
    # treated as a cross-encoder; anything else is loaded as a bi-encoder.
    config = AutoConfig.from_pretrained(sas_model_name_or_path)
    architectures = config.architectures
    cross_encoder_used = architectures is not None and any(
        arch.endswith('ForSequenceClassification') for arch in architectures)

    top_1_sas = []
    top_k_sas = []

    if not cross_encoder_used:
        # Bi-encoder: embed all predictions and labels in one flat batch and
        # remember how many of each belong to every question.
        model = SentenceTransformer(sas_model_name_or_path)
        lengths: List[Tuple[int, int]] = []
        all_texts: List[str] = []
        for p, l in zip(predictions, gold_labels):  # type: ignore
            # TODO potentially exclude (near) exact matches from computations
            all_texts += p
            all_texts += l
            lengths.append((len(p), len(l)))

        embeddings = model.encode(all_texts)

        # Walk through the flat embedding matrix question by question.
        offset = 0
        for len_p, len_l in lengths:
            pred_end = offset + len_p
            label_end = pred_end + len_l
            pred_embeddings = embeddings[offset:pred_end, :]
            label_embeddings = embeddings[pred_end:label_end, :]
            offset = label_end

            sims = cosine_similarity(pred_embeddings, label_embeddings)
            top_1_sas.append(np.max(sims[0, :]))
            top_k_sas.append(np.max(sims))
        return top_1_sas, top_k_sas

    # Cross-encoder: score every (prediction, label) combination directly.
    model = CrossEncoder(sas_model_name_or_path)
    for preds, labels in zip(predictions, gold_labels):
        # TODO add efficient batch mode: put all texts and labels into grid and extract scores afterwards
        grid = [(p, l) for p in preds for l in labels]
        scores = model.predict(grid)
        # Predictions form the outer loop, so the first len(labels) scores
        # all belong to the top-1 prediction.
        top_1_sas.append(np.max(scores[:len(labels)]))
        top_k_sas.append(np.max(scores))

    return top_1_sas, top_k_sas
#Lager values: More context from the paragraph remains, but results are longer window_size = 3 passages = [] for paragraph in paragraphs: for start_idx in range(0, len(paragraph), window_size): end_idx = min(start_idx+window_size, len(paragraph)) passages.append(" ".join(paragraph[start_idx:end_idx])) print("Paragraphs: ", len(paragraphs)) print("Sentences: ", sum([len(p) for p in paragraphs])) print("Passages: ", len(passages)) ## Load our cross-encoder. Use fast tokenizer to speed up the tokenization model = CrossEncoder('sentence-transformers/ce-ms-marco-TinyBERT-L-2') ## Some queries we want to search for in the document queries = ["How large is Europe?", "Is Europe a continent?", "What is the currency in EU?", "Fall Roman Empire when", #We can also search for key word queries "Is Europa in the south part of the globe?"] #Europe is miss-spelled & the matching sentences does not mention any of the content words #Search in a loop for the individual queries for query in queries: start_time = time.time() #Concatenate the query and all passages and predict the scores for the pairs [query, passage] model_inputs = [[query, passage] for passage in passages] scores = model.predict(model_inputs)
from sentence_transformers import CrossEncoder import os import csv import pickle import time import sys # We use a BiEncoder (SentenceTransformer) that produces embeddings for questions. # We then search for similar questions using cosine similarity and identify the top 100 most similar questions model_name = 'paraphrase-MiniLM-L6-v2' model = SentenceTransformer(model_name) num_candidates = 500 # To refine the results, we use a CrossEncoder. A CrossEncoder gets both inputs (input_question, retrieved_question) # and outputs a score 0...1 indicating the similarity. cross_encoder_model = CrossEncoder('cross-encoder/roberta-base-stsb') # Dataset we want to use url = "http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv" dataset_path = "quora_duplicate_questions.tsv" max_corpus_size = 20000 # Some local file to cache computed embeddings embedding_cache_path = 'quora-embeddings-{}-size-{}.pkl'.format( model_name.replace('/', '_'), max_corpus_size) #Check if embedding cache path exists if not os.path.exists(embedding_cache_path): # Check if the dataset exists. If not, download and extract # Download dataset if needed if not os.path.exists(dataset_path):
from sentence_transformers.readers import InputExample from sentence_transformers.cross_encoder.evaluation import CEBinaryClassificationEvaluator from torch.utils.data import DataLoader from torch import nn class sigmoid_cond_loss(): def __init__(): loss = -tf.reduce_mean(pred*tf.log(pred+NEAR_0)+(1-pred)*tf.log(1-pred+NEAR_0)) return loss class virtual_adversarial_loss(): def __init__(pred): roberta = CrossEncoder('cross-encoder/stsb-roberta-base', num_labels = 1) tokens = ["<e1>", "<e2>"] roberta.tokenizer.add_tokens(tokens, special_tokens=True) roberta.model.resize_token_embeddings(len(roberta.tokenizer)) with open('fewrel_tag/pairwise_labeled_train.tsv','r') as r: labeled_data = r.readlines() with open('fewrel_tag/pairwise_test.tsv', 'r') as r: test_data = r.readlines() train_examples = [] for line in labeled_data: pair = line.strip('\n').split('\t') try:
Google Colab Example: https://colab.research.google.com/drive/1l6stpYdRMmeDBK_vw0L5NitdiAuhdsAr?usp=sharing """ import json from sentence_transformers import SentenceTransformer, CrossEncoder, util import time import gzip import os import torch #We use the Bi-Encoder to encode all passages, so that we can use it with sematic search model_name = 'msmarco-distilbert-base-v2' bi_encoder = SentenceTransformer(model_name) top_k = 100 #Number of passages we want to retrieve with the bi-encoder #The bi-encoder will retrieve 100 documents. We use a cross-encoder, to re-rank the results list to improve the quality cross_encoder = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-6') # As dataset, we use Simple English Wikipedia. Compared to the full English wikipedia, it has only # about 170k articles. We split these articles into paragraphs and encode them with the bi-encoder wikipedia_filepath = 'data/simplewiki-2020-11-01.jsonl.gz' if not os.path.exists(wikipedia_filepath): util.http_get('http://sbert.net/datasets/simplewiki-2020-11-01.jsonl.gz', wikipedia_filepath) passages = [] with gzip.open(wikipedia_filepath, 'rt', encoding='utf8') as fIn: for line in fIn: data = json.loads(line.strip()) passages.extend(data['paragraphs'])
formatter = logging.Formatter(config.get_string('logging.pattern', default='%(asctime)s [%(levelname)s] %(message)s')) if config.get_bool('logging.appenders.console.enabled', True): ch = logging.StreamHandler() ch.setFormatter(formatter) rootLogger.addHandler(ch) if config.get_bool('logging.appenders.file.enabled', True): fh = logging.FileHandler(config.get_string("logging.appenders.file.file-name")) fh.setFormatter(formatter) rootLogger.addHandler(fh) app = Quart(__name__) app = cors(app, allow_origin="*") t0 = perf_counter() model = SentenceTransformer(config.get_string('ss_search.bi-encoder-model')) cross_encoder = CrossEncoder(config.get_string('ss_search.cross-encoder-model')) def suggest_question(): return random.choice(all_questions) @app.route('/api/suggest') async def suggest(): return {'question': suggest_question()} @app.route('/api/compare') async def compare(): if 'q1' not in request.args or 'q2' not in request.args: return abort(400, description='Missing required parameters')
def __init__(self, model=None):
    """Store ``model``, creating the default cross-encoder when it is None.

    The default is ``sentence-transformers/ce-ms-marco-electra-base`` with
    ``max_length=512``.
    """
    if model is None:
        model = CrossEncoder(
            'sentence-transformers/ce-ms-marco-electra-base',
            max_length=512)
    self.model = model
def test_pretrained_stsb(self):
    """Check the published STSb cross-encoder on the STSb test set.

    The value 87.92 is handed to ``self.evaluate_stsb_test`` with the model
    (presumably the expected score; confirm against that helper).
    """
    pretrained = CrossEncoder(
        "sentence-transformers/ce-distilroberta-base-stsb")
    self.evaluate_stsb_test(pretrained, 87.92)
#We combine up to 3 sentences into a passage. You can choose smaller or larger values for window_size #Smaller value: Context from other sentences might get lost #Lager values: More context from the paragraph remains, but results are longer window_size = 3 passages = [] for paragraph in paragraphs: for start_idx in range(0, len(paragraph), window_size): end_idx = min(start_idx + window_size, len(paragraph)) passages.append(" ".join(paragraph[start_idx:end_idx])) print("Paragraphs: ", len(paragraphs)) print("Sentences: ", sum([len(p) for p in paragraphs])) print("Passages: ", len(passages)) ## Load our cross-encoder. Use fast tokenizer to speed up the tokenization model = CrossEncoder('sentence-transformers/ce-ms-marco-TinyBERT-L-2', use_fast_tokenizer=True) ## Some queries we want to search for in the document queries = [ "How large is Europe?", "Is Europe a continent?", "What is the currency in EU?", "Fall Roman Empire when", #We can also search for key word queries "Is Europa in the south part of the globe?" ] #Europe is miss-spelled & the matching sentences does not mention any of the content words #Search in a loop for the individual queries for query in queries: start_time = time.time() #Concatenate the query and all passages and predict the scores for the pairs [query, passage]
passage_filepath) passage_cand = {} with gzip.open(passage_filepath, 'rt', encoding='utf8') as fIn: for line in fIn: qid, pid, query, passage = line.strip().split("\t") if qid not in passage_cand: passage_cand[qid] = [] passage_cand[qid].append([pid, passage]) logging.info("Queries: {}".format(len(queries))) queries_result_list = [] run = {} model = CrossEncoder(sys.argv[1], max_length=512) for qid in tqdm.tqdm(relevant_qid): query = queries[qid] cand = passage_cand[qid] pids = [c[0] for c in cand] corpus_sentences = [c[1] for c in cand] cross_inp = [[query, sent] for sent in corpus_sentences] if model.config.num_labels > 1: #Cross-Encoder that predict more than 1 score, we use the last and apply softmax cross_scores = model.predict(cross_inp, apply_softmax=True)[:, 1].tolist() else: cross_scores = model.predict(cross_inp).tolist()
class MemNav:
    """Searchable knowledge base built from the text files of a directory.

    Passages are retrieved with a bi-encoder, re-ranked with a cross-encoder
    and optionally fed to question-answering or summarization pipelines.
    """

    def __init__(self, root_dir='.'):
        """Load models, preprocess text, precompute embeddings.

        Args:
            root_dir: Directory whose entries (in sorted name order) are each
                read as one text document.
        """
        self.root_dir = root_dir

        # Load language models
        self.qa = pipeline('question-answering')
        self.sum = pipeline('summarization')
        self.text_encoder = SentenceTransformer('msmarco-distilbert-base-v2')
        self.pair_encoder = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-6')

        # Load list of entries; the context manager closes every file handle
        # instead of leaving them to the garbage collector.
        documents = []
        for file_name in sorted(os.listdir(root_dir)):
            with open(self.root_dir + '/' + file_name) as handle:
                documents.append(handle.read())

        # Tokenize entries into sentences
        sentences_per_entry = [sent_tokenize(doc.strip()) for doc in documents]

        # Merge each 3 consecutive sentences into one passage. Slicing
        # already clamps at the end of the list.
        self.entries = [
            ' '.join(sentences[start_idx:start_idx + 3])
            for sentences in sentences_per_entry
            for start_idx in range(0, len(sentences), 3)
        ]

        # Pre-compute passage embeddings
        self.passage_embeddings = self.text_encoder.encode(
            self.entries, show_progress_bar=True)

    def retrieval(self, query):
        """Utility for retrieving passages most relevant to a given query.

        Returns up to five passages, best cross-encoder score first, keeping
        only those scoring above 1e-3. Returns an empty list when the
        knowledge base has no passages or the search yields no hits.
        """
        if not self.entries:
            # Empty knowledge base: nothing to search or re-rank.
            return []

        # First pass, find passages most similar to query
        question_embedding = self.text_encoder.encode(query,
                                                      convert_to_tensor=True)
        hits = util.semantic_search(question_embedding,
                                    self.passage_embeddings,
                                    top_k=100)[0]
        if not hits:
            return []

        # Second pass, re-rank passages more thoroughly
        cross_scores = self.pair_encoder.predict(
            [[query, self.entries[hit['corpus_id']]] for hit in hits])
        for hit, cross_score in zip(hits, cross_scores):
            hit['cross-score'] = cross_score

        # Select best few results
        hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
        return [
            self.entries[hit['corpus_id']] for hit in hits[:5]
            if hit['cross-score'] > 1e-3
        ]

    def search(self, query):
        """Search knowledge base for passages most relevant to a given query.

        The passages are printed, separated by blank lines.
        """
        print(*self.retrieval(query), sep='\n\n')

    def ask(self, question):
        """Obtain an answer to a question posed to the knowledge base.

        Provides retrieved passages as context for a question-answering pipeline."""
        return self.qa(question, ' '.join(self.retrieval(question)))['answer']

    def summarize(self, query):
        """Obtain a summary related to the query using the knowledge base.

        Provides retrieved passages as input for a summarization pipeline."""
        return self.sum(' '.join(self.retrieval(query)), 130, 30,
                        False)[0]['summary_text']
#We combine up to 3 sentences into a passage. You can choose smaller or larger values for window_size #Smaller value: Context from other sentences might get lost #Lager values: More context from the paragraph remains, but results are longer window_size = 3 passages = [] for paragraph in paragraphs: for start_idx in range(0, len(paragraph), window_size): end_idx = min(start_idx + window_size, len(paragraph)) passages.append(" ".join(paragraph[start_idx:end_idx])) print("Paragraphs: ", len(paragraphs)) print("Sentences: ", sum([len(p) for p in paragraphs])) print("Passages: ", len(passages)) ## Load our cross-encoder. Use fast tokenizer to speed up the tokenization model = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-2') ## Some queries we want to search for in the document queries = [ "How large is Europe?", "Is Europe a continent?", "What is the currency in EU?", "Fall Roman Empire when", #We can also search for key word queries "Is Europa in the south part of the globe?" ] #Europe is miss-spelled & the matching sentences does not mention any of the content words #Search in a loop for the individual queries for query in queries: start_time = time.time() #Concatenate the query and all passages and predict the scores for the pairs [query, passage]
def evaluate_answering(
        ground_truth,
        run,
        eval_missing_truth,
        sas_model_name_or_path="cross-encoder/stsb-roberta-large"):
    """Score the answers of a run against the ground-truth answers.

    Args:
        ground_truth: Iterable of turns; each turn provides
            ``turn["Truth_answer"]`` and an id via ``get_turn_id``.
        run: Run data from which ``get_answering_run`` extracts a mapping of
            turn id to predicted answer text.
        eval_missing_truth: When true, turns whose truth answer is the empty
            string are evaluated too; otherwise they are skipped.
        sas_model_name_or_path: Cross-encoder used for the semantic answer
            similarity (SAS) score.

    Returns:
        A dict with ``EM``, ``F1``, ``ROUGE1-R``, ``POSSCORE`` and ``SAS``
        (each averaged over the evaluated turns), or an empty dict when the
        run answered none of the evaluated turns. Turns without a prediction
        are scored with an empty prediction text.
    """
    print("Evaluate: Question Answering")
    answering_run = get_answering_run(run)
    metric = load_metric("squad_v2")
    metric2 = load_metric("rouge")
    s = scorer.POSSCORE()  # init POSSCORE
    sas_model = CrossEncoder(sas_model_name_or_path)
    result = {}
    answers = 0
    posscores, sasscores = [], []

    for turn in tqdm(ground_truth, desc=" "):
        turn_id = get_turn_id(turn)
        gt = turn["Truth_answer"]
        if not (eval_missing_truth or gt != ""):
            continue

        reference = {
            "id": turn_id,
            "answers": {
                'answer_start': [0],
                'text': [gt]
            }
        }
        prediction_text = ""
        if turn_id in answering_run:
            prediction_text = answering_run[turn_id]
            answers = answers + 1
        prediction = {
            "id": turn_id,
            "prediction_text": prediction_text,
            'no_answer_probability': 0.
        }
        metric.add(prediction=prediction, reference=reference)
        metric2.add(prediction=prediction_text, reference=gt)

        # A falsy POSSCORE (e.g. None) counts as 0.
        ps = s.get_posscore(gt, prediction_text)
        posscores.append(ps or 0)

        # predict() returns one score per input pair; keep the single score
        # as a plain float so the SAS average is a scalar, not an array.
        sas = sas_model.predict([(prediction_text, gt)])
        sasscores.append(float(sas[0]))

    if answers > 0:
        print(" used %d answers" % answers)
        score = metric.compute()
        score2 = metric2.compute()
        result["EM"] = score['exact'] / 100
        result["F1"] = score['f1'] / 100
        result["ROUGE1-R"] = score2['rouge1'].mid.recall
        result["POSSCORE"] = sum(posscores) / len(
            posscores)  # average POSSCORE
        result["SAS"] = sum(sasscores) / len(sasscores)  # average SAS
    else:
        print(" skipped for no answers")
    return result
from sentence_transformers import CrossEncoder
import os
import csv
import pickle
import time
import sys

# We use a BiEncoder (SentenceTransformer) that produces embeddings for questions.
# We then search for similar questions using cosine similarity and identify the most similar questions.
# NOTE(review): this comment originally said "top 100", but num_candidates below is 500 - confirm which is intended.
model_name = 'distilbert-multilingual-nli-stsb-quora-ranking'
model = SentenceTransformer(model_name)
num_candidates = 500

# To refine the results, we use a CrossEncoder. A CrossEncoder gets both inputs (input_question, retrieved_question)
# and outputs a score 0...1 indicating the similarity.
cross_encoder_model = CrossEncoder('sentence-transformers/ce-roberta-base-stsb')

# Dataset we want to use
url = "http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv"
dataset_path = "quora_duplicate_questions.tsv"
max_corpus_size = 20000

# Some local file to cache computed embeddings; the name encodes the model and the corpus size
embedding_cache_path = 'quora-embeddings-{}-size-{}.pkl'.format(model_name.replace('/', '_'), max_corpus_size)

# Check if embedding cache path exists
if not os.path.exists(embedding_cache_path):
    # Check if the dataset exists. If not, download and extract
    # Download dataset if needed
    if not os.path.exists(dataset_path):
        print("Download dataset")