# NOTE(review): this chunk starts inside loops whose headers are outside the
# visible source; `idx`, `hit` and `sent` are presumably bound by them, and the
# `if` below presumably belongs to the innermost loop body - confirm nesting.
# `duplicates` supports `.add`, so it is tested directly instead of being
# copied into a fresh set for every candidate.
if idx != int(hit["_id"]) and (idx, int(hit["_id"])) not in duplicates:
    silver_data.append((sent, hit['_source']["sent"]))
    duplicates.add((idx, int(hit["_id"])))

progress.reset()
progress.close()
logging.info("Number of silver pairs generated for STSbenchmark: {}".format(
    len(silver_data)))
logging.info(
    "Step 2.2: Label STSbenchmark (silver dataset) with cross-encoder: {}".
    format(model_name))

# Score every silver pair with the cross-encoder
cross_encoder = CrossEncoder(cross_encoder_path)
silver_scores = cross_encoder.predict(silver_data)

# All model predictions should be between [0,1]
assert all(0.0 <= score <= 1.0 for score in silver_scores)

#################################################################################################
#
# Step 3: Train bi-encoder model with both (gold + silver) STSbenchmark dataset - Augmented SBERT
#
#################################################################################################

logging.info(
    "Step 3: Train bi-encoder: {} with STSbenchmark (gold + silver dataset)".
    format(model_name))

# Convert the dataset to a DataLoader ready for training
# Load the pre-trained cross-encoder
model = CrossEncoder('cross-encoder/distilroberta-base-stsb')

# Sentence whose similarity to every corpus sentence is computed
query = 'A man is eating pasta.'

# Candidate sentences
corpus = [
    'A man is eating food.',
    'A man is eating a piece of bread.',
    'The girl is carrying a baby.',
    'A man is riding a horse.',
    'A woman is playing violin.',
    'Two men pushed carts through the woods.',
    'A man is riding a white horse on an enclosed ground.',
    'A monkey is playing drums.',
    'A cheetah is running behind its prey.',
]

# One [query, candidate] pair per corpus sentence
sentence_combinations = [[query, candidate] for candidate in corpus]

# Score all pairs in a single predict call
similarity_scores = model.predict(sentence_combinations)

# argsort is ascending, so walk it backwards to get the highest score first
sim_scores_argsort = reversed(np.argsort(similarity_scores))

# Report the candidates from most to least similar
print("Query:", query)
for idx in sim_scores_argsort:
    print("{:.2f}\t{}".format(similarity_scores[idx], corpus[idx]))
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)  # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the model
model.fit(train_dataloader=train_dataloader,
          epochs=num_epochs,
          warmup_steps=warmup_steps,
          output_path=model_save_path)

# Load the test split and collect ids and (premise, hypothesis) pairs
# column-wise; this avoids a row loop whose variable shadowed the builtin `id`.
test_dataset = 'contradictory-my-dear-watson/test.csv'
df = pandas.read_csv(test_dataset)
ids = df['id'].tolist()
sentence_pairs = df[['premise', 'hypothesis']].values.tolist()

# The predicted label is the index of the highest score in each row
pred_scores = model.predict(sentence_pairs,
                            convert_to_numpy=True,
                            show_progress_bar=False,
                            batch_size=4)
pred_labels = np.argmax(pred_scores, axis=1)

out_df = pandas.DataFrame({'id': ids, 'prediction': pred_labels})
# Swap label ids 0 and 2 before writing
# NOTE(review): presumably maps the model's label order onto the order the
# submission expects - confirm against the training label mapping.
out_df['prediction'] = out_df['prediction'].replace([2, 0], [0, 2])
out_df.to_csv('submission.csv', index=False)
# NOTE(review): this first statement presumably belongs to a loop over query
# rows whose header is outside the visible source - confirm nesting.
query_texts.append(row["text"])

dataset_texts, dataset_ids = read_dataset_collection(args.search_collection)

# Build every (query, candidate text) pair, remembering each query's candidate
# indices so the flat score array can be split back per query afterwards.
all_query_dataset_pairs = []
candidates_per_query = []
for i, query in enumerate(query_texts):
    first_stage_doc_idxs = [int(docidx) for docidx in psg_indices[i]]
    candidates_per_query.append(first_stage_doc_idxs)
    all_query_dataset_pairs.extend(
        [query, dataset_texts[docidx]] for docidx in first_stage_doc_idxs)

scores = model.predict(all_query_dataset_pairs,
                       batch_size=args.batch_size,
                       show_progress_bar=True)

all_ranks = []
all_scores = []
# Slice by running offsets taken from the actual number of candidates per
# query. Slicing at `args.first_stage_depth * i` misaligns every following
# query as soon as one query has a different number of candidates.
query_predictions_start = 0
for first_stage_doc_idxs in candidates_per_query:
    query_predictions_end = query_predictions_start + len(first_stage_doc_idxs)
    # Negate so that an ascending argsort puts the highest score first
    query_scores = -1 * scores[query_predictions_start:query_predictions_end]
    query_predictions_start = query_predictions_end
    query_reranks = np.argsort(query_scores)[:args.results_limit]
    all_ranks.append(
        [first_stage_doc_idxs[rerank] for rerank in query_reranks])
    # The stored values are the negated scores, in ranked order
    all_scores.append(list(query_scores[query_reranks]))
class PathwayGenerator():
    """Pipeline object that converts, splits, annotates and exports one document.

    Each `do_*` step stores its result on the instance (`converted_file`,
    `ner_dict`, `pathway`); the `export_*` methods write JSON-lines files next
    to the source file.
    """

    def __init__(self,
                 file_path,
                 pilot,
                 service,
                 use_cuda=False,
                 cuda_device=-1,
                 annotation_model=None,
                 section_split_model=None):
        '''
        PathwayGenerator object constructor

        Args:
            file_path (str): path of the file from which the pathway is generated.
            pilot (str): name of the pilot; must be one of 'Larissa',
                'Birmingham', 'Malaga' or 'Palermo'.
            service (str): name of the service considered.
            use_cuda (bool): flag to use gpu model.
            cuda_device (int, optional): Id of the gpu device to use. Defaults to -1.
            annotation_model (str): value forwarded to Transner as the
                pretrained model; a two-character value is prefixed with
                'bert_uncased_'. Its length is taken, so None is not accepted.
            section_split_model: value passed to CrossEncoder to build the
                section split model.

        Raises:
            AssertionError: if file_path is None.
            KeyError: if pilot is not a known pilot name.
        '''
        assert file_path is not None, "A file path is required"

        languages = {
            'Larissa': 'el',
            'Birmingham': 'en',
            'Malaga': 'es',
            'Palermo': 'it'
        }

        self.path = file_path
        base_path, extension = os.path.splitext(self.path)
        if extension == '.txt':
            # Plain-text input is loaded immediately; other formats need
            # do_convert() before to_list()/do_split() can be used.
            with open(self.path, 'r') as source_file:
                raw_text = source_file.read()
            self.converted_file = doc2txt.purge_urls(raw_text, base_path)

        self.use_cuda = use_cuda
        self.cuda_device = cuda_device
        self.language = languages[pilot]

        # TODO: language detection param?
        # A two-character value (presumably a language code - confirm) selects
        # the matching 'bert_uncased_<value>' model; anything else is used as is.
        if len(annotation_model) != 2:
            pretrained_model = annotation_model
        else:
            pretrained_model = 'bert_uncased_' + annotation_model
        self.annotation_model = Transner(
            pretrained_model=pretrained_model,
            use_cuda=use_cuda,
            cuda_device=cuda_device,
            language_detection=True,
            threshold=0.85,
            args={"use_multiprocessing": False})

        self.section_split_model = CrossEncoder(section_split_model,
                                                num_labels=1)

        file_name = os.path.basename(self.path)
        self.annotation_metadata = pilot + ' - ' + service + ' - ' + file_name
        self.generation_metadata = pilot + ' - ' + service + ' - ' + file_name + ' - '

    def to_list(self):
        """Return the non-empty lines of the converted text, stripped of
        surrounding whitespace."""
        stripped_lines = (line.strip()
                          for line in self.converted_file.split('\n'))
        return [line for line in stripped_lines if line != '']

    def do_convert(self):
        """Convert the source file to text, store it on the instance and return it."""
        self.converted_file = doc2txt.convert_to_txt(self.path)
        return self.converted_file

    def do_split(self, threshold=0.5):
        """Group consecutive lines of the converted text into sections.

        Each pair of adjacent lines is scored by the section split model; a
        score below `threshold` starts a new section at the second line.

        Args:
            threshold (float): minimum score for two adjacent lines to stay
                in the same section.

        Returns:
            list[list[str]]: sections, each a list of lines; an empty list
            when the converted text has no non-empty line.
        """
        sentence_list = self.to_list()
        if not sentence_list:
            # Nothing to split (there would be no first sentence to seed with)
            return []

        following_sentences = sentence_list[1:]
        scores = [
            self.section_split_model.predict([current_sentence, next_sentence])
            for current_sentence, next_sentence in zip(sentence_list,
                                                       following_sentences)
        ]

        sections = []  # [['section1'], ['section2'], ... , ['sectionN']]
        section_text = [sentence_list[0]]
        for score, next_sentence in zip(scores, following_sentences):
            if score >= threshold:
                section_text.append(next_sentence)
            else:
                sections.append(section_text)
                section_text = [next_sentence]
        sections.append(section_text)

        return sections

    def do_annotate(self, sentence_list):
        """Run entity annotation on `sentence_list` and store the result.

        Time expressions are added with SUTime for 'es' and 'en' and with the
        annotation model's `find_dates` for the other languages. The result
        is aggregated, its entities sorted by `start_offset`, and URI
        entities resolved.

        Returns:
            The resulting annotation dictionary (also kept in `self.ner_dict`).
        """
        self.ner_dict = self.annotation_model.ner(sentence_list,
                                                  apply_regex=True)
        if self.language in ['es', 'en']:
            self.ner_dict = self.annotate_sutime(self.ner_dict)
        else:
            self.ner_dict = self.annotation_model.find_dates(self.ner_dict)

        self.ner_dict = annotator.aggregate_dict(self.ner_dict)
        self.ner_dict['entities'] = sorted(
            self.ner_dict['entities'], key=lambda ner: ner['start_offset'])
        self.ner_dict = annotator.resolve_uri_entities(self.ner_dict,
                                                       self.path)

        return self.ner_dict

    def do_generate(self):
        """Build the pathway from the current annotations.

        When the source file is a '.json' file, the annotations are loaded
        from it first. Mapped entities are grouped as
        `{key: {sub_type: [entity, ...]}}` following "tools/dict_pathway.json".

        Returns:
            dict: the pathway (also kept in `self.pathway`).
        """
        if os.path.splitext(self.path)[-1] == '.json':
            with open(self.path, 'r') as annotation_file:
                self.ner_dict = json.load(annotation_file)

        aggregated_ner_dict = aggregator.aggregate_entities(self.ner_dict)
        print(aggregated_ner_dict)

        json_pathway = generator.generate(aggregated_ner_dict)
        mapped_entities = json.loads(json_pathway)

        with open("tools/dict_pathway.json", 'r') as pathway_file:
            dict_pathway = json.load(pathway_file)

        # Start from an empty list for every known (key, sub_type) slot
        self.pathway = {
            key: {sub_type: [] for sub_type in sub_types}
            for key, sub_types in dict_pathway.items()
        }

        for entity in mapped_entities:
            step = entity['step']
            self.pathway[self.keys_of_value(dict_pathway,
                                            step)][step].append(entity)

        #todo: remove return because we can read the value in the pgr object
        return self.pathway

    def export_annotation_to_doccano(self, add_confidence=False):
        """Write the current annotations as one JSON line to '<file>_ner.jsonl'.

        Args:
            add_confidence (bool): when True, each label also carries the
                entity confidence as a fourth element.

        Returns:
            tuple: the exported dictionary and the path of the written file.
        """
        filename = os.path.splitext(self.path)[0]
        output_path = filename + '_ner.jsonl'

        doccano_dict = {}
        doccano_dict['text'] = self.ner_dict['text']
        doccano_dict['labels'] = []
        doccano_dict['meta'] = self.annotation_metadata

        for item in self.ner_dict['entities']:
            label = [item['start_offset'], item['end_offset'], item['type']]
            if add_confidence:
                label.append(item['confidence'])
            doccano_dict['labels'].append(label)

        # The context manager closes the file so the data is flushed on return
        with open(output_path, 'w', encoding='utf-8') as file_out:
            file_out.write(json.dumps(doccano_dict))
            file_out.write('\n')

        return doccano_dict, output_path

    def export_generation_to_doccano(self, pathway=None):
        """Write one JSON line per pathway key to '<file>_pathway.jsonl'.

        Args:
            pathway (dict): mapping `{key: {step: {sub_type: [entity, ...]}}}`
                where each entity has an 'entity' string. It is iterated
                directly, so the default None is not usable.

        Returns:
            tuple: the concatenated JSON lines and the path of the written file.
        """
        with open("tools/dict_translations.json", 'r') as translations_file:
            dict_translations = json.load(translations_file)
        translations = dict_translations[self.language]

        filename = os.path.splitext(self.path)[0]
        output_path = filename + '_pathway.jsonl'

        pathway_jsonl = []
        for key in pathway:
            tmp_dict = {"text": key, "labels": [], "meta": ''}

            for step, step_dict in pathway[key].items():
                # Set inside the loop: a key with no steps keeps an empty meta
                tmp_dict["meta"] = self.generation_metadata + key
                for sub_type, entities in step_dict.items():
                    label = translations[step] + ' - ' + translations[
                        sub_type] + ': '

                    if len(entities) == 0:
                        # '-' marks a slot for which no entity was found
                        tmp_dict['labels'].append(label + '-')
                    else:
                        entity_names = ' , '.join(
                            entity['entity'].strip() for entity in entities)
                        tmp_dict['labels'].append(
                            (label + entity_names).strip())

            pathway_jsonl.append(tmp_dict)

        lines = [
            str(json.dumps(element, ensure_ascii=False)) + '\n'
            for element in pathway_jsonl
        ]
        return_string = ''.join(lines)

        with open(output_path, 'w', encoding='utf-8') as file_out:
            file_out.write(return_string)

        return return_string, output_path

    def keys_of_value(self, dct, value):
        """Return the first key of `dct` whose value equals `value` or, for
        list values, contains it; None when there is no such key."""
        for k, candidate in dct.items():
            if isinstance(candidate, list):
                if value in candidate:
                    return k
            elif value == candidate:
                return k
        return None

    def annotate_sutime(self, ner_dict):
        """Add SUTime time expressions to every item of `ner_dict` in place.

        A parsed expression is appended to the item's entities only when the
        annotation model reports no overlap with the existing ones. Appended
        entities carry 'type', 'value', 'confidence' (0.85) and 'offset'.

        Returns:
            The same `ner_dict`, updated.
        """
        if not ner_dict:
            # Avoid creating the SUTime parser when there is nothing to parse
            return ner_dict

        # One parser is enough for all items
        jar_files = os.path.join('python-sutime/', 'jars')
        sutime = sutime_mod.SUTime(jars=jar_files, mark_time_ranges=True)

        for item in ner_dict:
            parsed_times = sutime.parse(item['sentence'])

            time_type = self.annotation_model.check_opening_time(
                item['entities'])

            for item_sutime in parsed_times:
                if not self.annotation_model.find_overlap(
                        item['entities'], item_sutime['start'],
                        item_sutime['end']):
                    item['entities'].append({
                        'type': time_type,
                        'value': item_sutime['text'],
                        'confidence': 0.85,
                        'offset': item_sutime['start']
                    })

        return ner_dict

    def sections_to_doccano(self, sections):
        """Join sections into one text and label each section 'Step<N>'.

        Every sentence is written followed by '.\\n'. Each label is
        `[start, end, 'Step<N>']`, with `end` the offset of the section's
        last character.

        Returns:
            dict: `{'text': str, 'labels': list}`.
        """
        text_parts = []
        labels = []
        section_start = 0

        for step, section in enumerate(sections, start=1):
            section_length = 0
            for sentence in section:
                text_parts.append(sentence + '.\n')
                # +2 accounts for the '.' and '\n' appended to each sentence
                section_length += len(sentence) + 2

            labels.append([
                section_start, section_start + section_length - 1,
                'Step' + str(step)
            ])
            section_start += section_length

        return {'text': ''.join(text_parts), 'labels': labels}
class rerankPassages:
    """Retrieve candidate passages from a fitted document and re-rank them.

    Candidates are scored with a weighted sum of the SBERT and TF-IDF rankers
    and the best ones are re-ordered with a cross-encoder.
    """

    def __init__(self, nlp):
        """Create the rankers, the cross-encoder and the knowledge graph client.

        Args:
            nlp: object handed to the BM25 and TF-IDF rankers.
        """
        self.bm25_ranking = bm25(nlp)
        self.tfidf_ranking = tfidf(nlp)
        self.sbert_ranking = sbert()
        self.cross_encoder = CrossEncoder("cross-encoder/ms-marco-TinyBERT-L-6")
        self.kg = KnowledgeGraph('chatbot', 'password')
        self.document = None

    def fit(self, document):
        """Store `document` and fit or preprocess every ranker on it."""
        self.document = document
        self.bm25_ranking.fit(document)
        self.tfidf_ranking.preprocessDocument(document)
        self.sbert_ranking.fit(document)

    def matchParaSent(self, s, p):
        """Return True when more than 90% of the whitespace-separated tokens
        of `s` occur in `p`; False for a sentence without tokens.

        When `p` is a string the check is a substring test.
        """
        sList = s.split()
        if not sList:
            return False
        matched = sum(1 for token in sList if token in p)
        return matched / len(sList) > 0.9

    def getSentences(self, query, n):
        """Return the knowledge graph's `retrieveSentences(query, n)` result."""
        return self.kg.retrieveSentences(query, n)

    def withKg(self, query, paras, t):
        """Re-order `paras` using sentences retrieved from the knowledge graph.

        Args:
            query: query forwarded to the knowledge graph (10 sentences are requested).
            paras: list of mutable `[passage, score]` pairs. It is modified in
                place: each score is replaced and the list is sorted.
            t: constant added to both denominators of the combined score.

        Returns:
            list: the passages, ordered by ascending combined score.
        """
        sentences = self.kg.retrieveSentences(query, 10)
        for para in paras:
            sentencesMatched = 0
            for s in sentences:
                sentence = s['sentence']
                if self.matchParaSent(sentence, para[0]):
                    # Diagnostic output for matches that are not exact substrings
                    if sentence not in para[0]:
                        print(sentence, para[0])
                    sentencesMatched += 1
            # NOTE(review): divides by zero when t + sentencesMatched (or
            # t + para[1]) is 0 - confirm callers always pass t > 0.
            para[1] = 1/(t + para[1]) + 1/(t + sentencesMatched)
        paras.sort(key=lambda x: x[1])
        return [para[0] for para in paras]

    def withCrossEncoder(self, query, paras):
        """Return `paras` ordered by descending cross-encoder score for `query`.

        An empty `paras` yields an empty list without calling the model.
        """
        if len(paras) == 0:
            return []
        para_combination = [[query, p] for p in paras]
        score = self.cross_encoder.predict(para_combination)
        # argsort is ascending; reversed() yields the best passage first
        return [paras[idx] for idx in reversed(np.argsort(score))]

    def rankDocuments(self, query, mu, k):
        """Return the five best-scoring passages for `query`, re-ranked by the cross-encoder.

        Args:
            query: the search query.
            mu: weight of the SBERT scores; TF-IDF scores get `1 - mu`.
            k: currently unused (only referenced by disabled code paths).
        """
        # NOTE(review): the BM25 scores are computed but not used in the
        # combined score - confirm whether this call can be dropped.
        bm25_scores = self.bm25_ranking.rankDocuments(query)
        tfidf_scores = self.tfidf_ranking.rankDocuments(query)
        sbert_scores = self.sbert_ranking.rankDocuments(query)

        # Combined scoring
        rrf = mu*sbert_scores + (1-mu)*tfidf_scores

        # Retrieve the top passages: sort (score, index) pairs, best first
        scores = rrf.tolist()
        score_passage = [(s, i) for i, s in enumerate(scores[0])]
        score_passage.sort(reverse=True)

        return self.withCrossEncoder(
            query, [self.document[i[1]] for i in score_passage[:5]])