예제 #1
0
        if idx != int(hit["_id"]) and (idx, int(
                hit["_id"])) not in set(duplicates):
            silver_data.append((sent, hit['_source']["sent"]))
            duplicates.add((idx, int(hit["_id"])))

# Done with the silver-pair generation loop: reset and close its progress
# indicator (presumably a tqdm-style bar created earlier -- TODO confirm).
progress.reset()
progress.close()

logging.info("Number of silver pairs generated for STSbenchmark: {}".format(
    len(silver_data)))
# NOTE(review): the message reports `model_name`, while the cross-encoder
# below is loaded from `cross_encoder_path` -- confirm they refer to the
# same model.
logging.info(
    "Step 2.2: Label STSbenchmark (silver dataset) with cross-encoder: {}".
    format(model_name))

# Load the cross-encoder and score every silver sentence pair with it.
cross_encoder = CrossEncoder(cross_encoder_path)
silver_scores = cross_encoder.predict(silver_data)

# All model predictions should be between [0,1]
# NOTE(review): `assert` is stripped under `python -O`, so this sanity check
# disappears in optimized runs.
assert all(0.0 <= score <= 1.0 for score in silver_scores)

#################################################################################################
#
# Step 3: Train bi-encoder model with both (gold + silver) STSbenchmark dataset - Augmented SBERT
#
#################################################################################################

logging.info(
    "Step 3: Train bi-encoder: {} with STSbenchmark (gold + silver dataset)".
    format(model_name))

# Convert the dataset to a DataLoader ready for training
예제 #2
0
# Load the pre-trained cross-encoder checkpoint
model = CrossEncoder('cross-encoder/distilroberta-base-stsb')

# The sentence that every corpus entry is compared against
query = 'A man is eating pasta.'

# Candidate sentences to be scored against the query
corpus = [
    'A man is eating food.',
    'A man is eating a piece of bread.',
    'The girl is carrying a baby.',
    'A man is riding a horse.',
    'A woman is playing violin.',
    'Two men pushed carts through the woods.',
    'A man is riding a white horse on an enclosed ground.',
    'A monkey is playing drums.',
    'A cheetah is running behind its prey.',
]

# The model scores sentence pairs, so pair the query with each candidate
sentence_combinations = [[query, candidate] for candidate in corpus]

# One similarity score per (query, candidate) pair
similarity_scores = model.predict(sentence_combinations)

# Corpus indices ordered from the highest score down to the lowest
sim_scores_argsort = np.argsort(similarity_scores)[::-1]

# Report the ranking, best match first
print("Query:", query)
for idx in sim_scores_argsort:
    print("{:.2f}\t{}".format(similarity_scores[idx], corpus[idx]))
예제 #3
0
# Warm-up length: 10% of len(train_dataloader) * num_epochs, rounded up.
warmup_steps = math.ceil(len(train_dataloader) * num_epochs *
                         0.1)  # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the model
# The trained model is written to `model_save_path` (passed as output_path).
model.fit(train_dataloader=train_dataloader,
          epochs=num_epochs,
          warmup_steps=warmup_steps,
          output_path=model_save_path)

# Load the test split and collect the row ids and the (premise, hypothesis)
# pairs column-wise. This avoids a row-by-row `iterrows()` loop, a loop
# variable that shadowed the builtin `id`, and a dead `label_id` assignment.
test_dataset = 'contradictory-my-dear-watson/test.csv'
df = pandas.read_csv(test_dataset)
ids = df['id'].tolist()
sentence_pairs = df[['premise', 'hypothesis']].values.tolist()

# Score every pair; the predicted class is the highest-scoring column.
pred_scores = model.predict(sentence_pairs,
                            convert_to_numpy=True,
                            show_progress_bar=False,
                            batch_size=4)
pred_labels = np.argmax(pred_scores, axis=1)

# Build the two-column submission frame directly from the columns.
out_df = pandas.DataFrame({'id': ids, 'prediction': pred_labels})
# Swap class ids 0 and 2 (presumably the model's label order differs from the
# one expected in the submission -- TODO confirm against the training labels).
out_df['prediction'] = out_df['prediction'].replace([2, 0], [0, 2])
out_df.to_csv('submission.csv', index=False)
        query_texts.append(row["text"])

    dataset_texts, dataset_ids = read_dataset_collection(
        args.search_collection)

    all_query_dataset_pairs = []
    for i, query in enumerate(query_texts):
        first_stage_doc_idxs = [int(docidx) for docidx in psg_indices[i]]
        first_stage_dataset_texts = [
            dataset_texts[docidx] for docidx in first_stage_doc_idxs
        ]
        for dataset_text in first_stage_dataset_texts:
            all_query_dataset_pairs.append([query, dataset_text])

    scores = model.predict(all_query_dataset_pairs,
                           batch_size=args.batch_size,
                           show_progress_bar=True)

    all_ranks = []
    all_scores = []
    for i in range(len(query_texts)):
        first_stage_doc_idxs = [int(docidx) for docidx in psg_indices[i]]

        query_predictions_start = args.first_stage_depth * i
        query_predictions_end = args.first_stage_depth * (i + 1)
        query_scores = -1 * scores[
            query_predictions_start:query_predictions_end]
        query_reranks = np.argsort(query_scores)[:args.results_limit]
        all_ranks.append(
            [first_stage_doc_idxs[rerank] for rerank in query_reranks])
        all_scores.append(list(query_scores[query_reranks]))
예제 #5
0
class PathwayGenerator():
    ''' Pipeline object that turns one service document into a "pathway".

    The steps are exposed as separate methods: text conversion
    (``do_convert``), section splitting (``do_split``), entity annotation
    (``do_annotate``), pathway generation (``do_generate``) and export of
    the results as doccano-style JSONL (``export_*_to_doccano``,
    ``sections_to_doccano``).
    '''

    def __init__(self,
                 file_path,
                 pilot,
                 service,
                 use_cuda=False,
                 cuda_device=-1,
                 annotation_model=None,
                 section_split_model=None):
        ''' PathwayGenerator object constructor

        Args:
            file_path (str): path of the file from which the pathway is generated.
            pilot (str): name of the pilot; must be one of 'Larissa',
                'Birmingham', 'Malaga' or 'Palermo' (it selects the language).
            service (str): name of the service considered.
            use_cuda (bool): flag to use gpu model.
            cuda_device (int, optional): Id of the gpu device to use. Defaults to -1.
            annotation_model (str, optional): model handed to ``Transner``. A
                two-character value (presumably a language code) is expanded
                to ``'bert_uncased_<value>'``; any other value is passed
                through unchanged.
            section_split_model (str, optional): model handed to
                ``CrossEncoder`` for section splitting.

        Raises:
            AssertionError: if ``file_path`` is None.
            KeyError: if ``pilot`` is not a known pilot name.
        '''

        assert file_path is not None, "A file path is required"

        languages = {
            'Larissa': 'el',
            'Birmingham': 'en',
            'Malaga': 'es',
            'Palermo': 'it'
        }

        self.path = file_path
        # Plain-text inputs need no conversion step: read them right away.
        # `converted_file` stays unset for other extensions until
        # `do_convert` is called.
        if os.path.splitext(self.path)[-1] == '.txt':
            with open(self.path, 'r') as txt_file:
                raw_text = txt_file.read()
            self.converted_file = doc2txt.purge_urls(
                raw_text, os.path.splitext(self.path)[0])
        self.use_cuda = use_cuda
        self.cuda_device = cuda_device
        self.language = languages[pilot]
        # TODO: language detection param?
        # Guard against the default `None`, on which `len()` would raise.
        # NOTE(review): None is forwarded to Transner as-is -- confirm that
        # Transner accepts it.
        if annotation_model is not None and len(annotation_model) == 2:
            pretrained_model = 'bert_uncased_' + annotation_model
        else:
            pretrained_model = annotation_model
        self.annotation_model = Transner(
            pretrained_model=pretrained_model,
            use_cuda=use_cuda,
            cuda_device=cuda_device,
            language_detection=True,
            threshold=0.85,
            args={"use_multiprocessing": False})

        self.section_split_model = CrossEncoder(section_split_model,
                                                num_labels=1)

        file_name = os.path.basename(self.path)
        self.annotation_metadata = pilot + ' - ' + service + ' - ' + file_name
        self.generation_metadata = (pilot + ' - ' + service + ' - ' +
                                    file_name + ' - ')

    def to_list(self):
        ''' Return the non-blank lines of the converted text, stripped.

        Returns:
            list[str]: one entry per non-empty line of ``self.converted_file``.
        '''
        stripped_lines = (line.strip()
                          for line in self.converted_file.split('\n'))
        return [line for line in stripped_lines if line != '']

    def do_convert(self):
        ''' Convert the input file to text, store it and return it. '''
        self.converted_file = doc2txt.convert_to_txt(self.path)
        return self.converted_file

    def do_split(self, threshold=0.5):
        ''' Group consecutive lines of the converted text into sections.

        Every pair of adjacent lines is scored with the section split model;
        a score below ``threshold`` starts a new section at the second line.

        Args:
            threshold (float): minimum score for two adjacent lines to stay
                in the same section.

        Returns:
            list[list[str]]: the sections, each a list of lines. Empty when
                the converted text has no non-blank line.
        '''
        sentence_list = self.to_list()
        if not sentence_list:
            # Nothing to split; the original indexing would raise IndexError.
            return []

        following_sentences = sentence_list[1:]
        # One prediction per adjacent pair, exactly as many calls as before.
        scores = [
            self.section_split_model.predict([current, following])
            for current, following in zip(sentence_list, following_sentences)
        ]

        sections = []  # [['line', ...], ['line', ...], ...]
        section_text = [sentence_list[0]]
        for score, sentence in zip(scores, following_sentences):
            if score >= threshold:
                section_text.append(sentence)
            else:
                sections.append(section_text)
                section_text = [sentence]
        sections.append(section_text)

        return sections

    def do_annotate(self, sentence_list):
        ''' Run entity annotation on the given sentences.

        The result is stored in ``self.ner_dict`` and returned. For 'es' and
        'en' documents time expressions come from SUTime, otherwise from the
        annotation model's ``find_dates``.

        Args:
            sentence_list: sentences passed to the annotation model.

        Returns:
            The aggregated annotation dictionary, with its 'entities' sorted
            by 'start_offset' before URI entities are resolved.
        '''
        self.ner_dict = self.annotation_model.ner(sentence_list,
                                                  apply_regex=True)
        if self.language in ['es', 'en']:
            self.ner_dict = self.annotate_sutime(self.ner_dict)
        else:
            self.ner_dict = self.annotation_model.find_dates(self.ner_dict)

        self.ner_dict = annotator.aggregate_dict(self.ner_dict)

        self.ner_dict['entities'] = sorted(self.ner_dict['entities'],
                                           key=lambda ner: ner['start_offset'])

        self.ner_dict = annotator.resolve_uri_entities(self.ner_dict,
                                                       self.path)

        return self.ner_dict

    def do_generate(self):
        ''' Build the pathway from the annotated entities.

        When the input file is a '.json' file it is loaded as the annotation
        dictionary first. The pathway is stored in ``self.pathway`` and
        returned.

        Returns:
            dict: ``{key: {sub_type: [entity, ...]}}`` following the layout
                of 'tools/dict_pathway.json'.
        '''
        if os.path.splitext(self.path)[-1] == '.json':
            with open(self.path, 'r') as ner_file:
                self.ner_dict = json.load(ner_file)
        aggregated_ner_dict = aggregator.aggregate_entities(self.ner_dict)
        print(aggregated_ner_dict)
        json_pathway = generator.generate(aggregated_ner_dict)
        mapped_entities = json.loads(json_pathway)

        with open("tools/dict_pathway.json", 'r') as pathway_file:
            dict_pathway = json.load(pathway_file)

        # Start from an empty bucket for every known (key, sub_type) pair.
        self.pathway = {
            key: {sub_type: [] for sub_type in sub_types}
            for key, sub_types in dict_pathway.items()
        }

        # File every entity under the key whose sub-types contain its step.
        for entity in mapped_entities:
            step = entity['step']
            self.pathway[self.keys_of_value(dict_pathway,
                                            step)][step].append(entity)

        #todo: remove return because we can read the value in the pgr object
        return self.pathway

    def export_annotation_to_doccano(self, add_confidence=False):
        ''' Write the current annotations as one doccano JSONL record.

        The record goes to '<input path without extension>_ner.jsonl'.

        Args:
            add_confidence (bool): append each entity's confidence to its
                label triple.

        Returns:
            tuple: (the exported dictionary, the path of the written file).
        '''
        filename = os.path.splitext(self.path)[0]
        out_path = filename + '_ner.jsonl'

        if add_confidence:
            labels = [[
                item['start_offset'], item['end_offset'], item['type'],
                item['confidence']
            ] for item in self.ner_dict['entities']]
        else:
            labels = [[item['start_offset'], item['end_offset'], item['type']]
                      for item in self.ner_dict['entities']]

        doccano_dict = {
            'text': self.ner_dict['text'],
            'labels': labels,
            'meta': self.annotation_metadata
        }

        # The context manager guarantees the record is flushed and closed.
        with open(out_path, 'w', encoding='utf-8') as file_out:
            file_out.write(json.dumps(doccano_dict))
            file_out.write('\n')

        return doccano_dict, out_path

    def export_generation_to_doccano(self, pathway=None):
        ''' Write a generated pathway as doccano JSONL, one record per key.

        The records go to '<input path without extension>_pathway.jsonl'.

        Args:
            pathway (dict): ``{key: {step: {sub_type: [entity, ...]}}}``.
                Required despite the ``None`` default.

        Returns:
            tuple: (the JSONL content as a string, the path of the written
                file).

        Raises:
            TypeError: if ``pathway`` is None.
        '''
        if pathway is None:
            # Same exception type the iteration below used to raise, but
            # with a message that points at the actual problem.
            raise TypeError("export_generation_to_doccano requires a pathway")

        with open("tools/dict_translations.json", 'r') as translations_file:
            dict_translations = json.load(translations_file)
        translations = dict_translations[self.language]

        filename = os.path.splitext(self.path)[0]
        out_path = filename + '_pathway.jsonl'
        pathway_jsonl = []

        for key in pathway:
            tmp_dict = {"text": key, "labels": [], "meta": ''}

            for step, step_dict in pathway[key].items():
                # Set inside the loop on purpose: a key without steps keeps
                # an empty 'meta'.
                tmp_dict["meta"] = self.generation_metadata + key
                for sub_type, entities in step_dict.items():
                    label = (translations[step] + ' - ' +
                             translations[sub_type] + ': ')
                    if len(entities) == 0:
                        tmp_dict['labels'].append(label + '-')
                    else:
                        joined = ' , '.join(entity['entity'].strip()
                                            for entity in entities)
                        tmp_dict['labels'].append((label + joined).strip())

            pathway_jsonl.append(tmp_dict)

        lines = [
            str(json.dumps(element, ensure_ascii=False)) + '\n'
            for element in pathway_jsonl
        ]
        return_string = ''.join(lines)

        with open(out_path, 'w', encoding='utf-8') as file_out:
            file_out.write(return_string)

        return return_string, out_path

    def keys_of_value(self, dct, value):
        ''' Return the first key of ``dct`` whose value is or contains ``value``.

        List values match by membership, all other values by equality.

        Returns:
            The matching key, or None when nothing matches.
        '''
        for k in dct:
            if isinstance(dct[k], list):
                if value in dct[k]:
                    return k
            elif value == dct[k]:
                return k
        return None

    def annotate_sutime(self, ner_dict):
        ''' Add SUTime time expressions to every annotated sentence.

        A SUTime hit is appended to the sentence's entities only when the
        annotation model reports no overlap with an existing entity.

        Args:
            ner_dict: iterable of dicts with 'sentence' and 'entities'
                (modified in place).

        Returns:
            The same ``ner_dict`` object.
        '''
        sutime = None
        for item in ner_dict:
            if sutime is None:
                # Build the tagger once, on first use, instead of once per
                # sentence.
                jar_files = os.path.join('python-sutime/', 'jars')
                sutime = sutime_mod.SUTime(jars=jar_files,
                                           mark_time_ranges=True)

            # Named so that the `json` module is not shadowed.
            time_expressions = sutime.parse(item['sentence'])

            time_type = self.annotation_model.check_opening_time(
                item['entities'])

            for item_sutime in time_expressions:
                if not self.annotation_model.find_overlap(
                        item['entities'], item_sutime['start'],
                        item_sutime['end']):
                    item['entities'].append({
                        'type': time_type,
                        'value': item_sutime['text'],
                        'confidence': 0.85,
                        'offset': item_sutime['start']
                    })

        return ner_dict

    def sections_to_doccano(self, sections):
        ''' Convert sections into a doccano record with one label each.

        Every sentence is emitted followed by '.\\n'. Section number N gets
        the label 'StepN' with its start offset and its end offset, which
        points at the last character of the section.

        Args:
            sections (list[list[str]]): sections as lists of sentences.

        Returns:
            dict: ``{'text': str, 'labels': [[start, end, 'StepN'], ...]}``.
        '''
        text_parts = []
        labels = []
        offset = 0

        for step, section in enumerate(sections, start=1):
            section_length = 0
            for sentence in section:
                text_parts.append(sentence + '.\n')
                section_length += len(sentence) + 2  # '.' and '\n'

            labels.append(
                [offset, offset + section_length - 1, 'Step' + str(step)])
            offset += section_length

        return {'text': ''.join(text_parts), 'labels': labels}
예제 #6
0
class rerankPassages:
	"""Retrieve passages for a query and re-rank the best candidates.

	First-stage scores come from the TF-IDF and SBERT rankers; the top
	candidates are then re-ordered either with a cross-encoder
	(`withCrossEncoder`, used by `rankDocuments`) or with sentences
	retrieved from the knowledge graph (`withKg`).
	"""

	def __init__(self, nlp):
		"""Create the rankers, the cross-encoder and the knowledge-graph client.

		Args:
			nlp: object handed to the `bm25` and `tfidf` rankers.
		"""
		self.bm25_ranking = bm25(nlp)
		self.tfidf_ranking = tfidf(nlp)
		self.sbert_ranking = sbert()
		self.cross_encoder = CrossEncoder("cross-encoder/ms-marco-TinyBERT-L-6")
		# NOTE(review): hard-coded credentials -- consider loading them from
		# configuration or the environment.
		self.kg = KnowledgeGraph('chatbot', 'password')
		self.document = None

	def fit(self, document):
		"""Store `document` and fit/preprocess every ranker on it.

		Args:
			document: the passage collection; `rankDocuments` indexes into it.
		"""
		self.document = document
		self.bm25_ranking.fit(document)
		self.tfidf_ranking.preprocessDocument(document)
		self.sbert_ranking.fit(document)

	def matchParaSent(self, s, p):
		"""Tell whether sentence `s` is (almost) entirely contained in `p`.

		Args:
			s (str): sentence, split on whitespace into tokens.
			p (str): passage; each token is looked up with `in`.

		Returns:
			bool: True when more than 90% of the tokens of `s` occur in `p`;
				False otherwise, including when `s` has no tokens.
		"""
		tokens = s.split()
		if not tokens:
			return False
		matched = sum(1 for token in tokens if token in p)
		return matched / len(tokens) > 0.9

	def getSentences(self, query, n):
		"""Return `n` sentences retrieved from the knowledge graph for `query`."""
		return self.kg.retrieveSentences(query, n)

	def withKg(self, query, paras, t):
		"""Re-rank `paras` using sentences retrieved from the knowledge graph.

		Args:
			query: the query sent to the knowledge graph (10 sentences are
				requested).
			paras (list): `[passage, score]` lists; the score slot is
				overwritten and the list is sorted in place.
			t: constant added to both denominators of the combined score.

		Returns:
			list: the passages, ordered by ascending combined score.
		"""
		sentences = self.kg.retrieveSentences(query, 10)

		for para in paras:
			sentencesMatched = 0
			for s in sentences:
				sentence = s['sentence']
				if self.matchParaSent(sentence, para[0]):
					# Debug output: token-wise match without a verbatim match.
					if sentence not in para[0]: print(sentence, para[0])
					sentencesMatched += 1
			para[1] = 1/(t + para[1]) + 1/(t + sentencesMatched)

		paras.sort(key = lambda x : x[1])
		return [para[0] for para in paras]

	def withCrossEncoder(self, query, paras):
		"""Re-rank `paras` by their cross-encoder score against `query`.

		Args:
			query (str): the query.
			paras (list[str]): candidate passages.

		Returns:
			list: the passages ordered from the highest score to the lowest;
				an empty list when there are no candidates.
		"""
		para_combination = [[query, p] for p in paras]
		if not para_combination:
			# Nothing to score; skip the model call for an empty batch.
			return []

		score = self.cross_encoder.predict(para_combination)
		return [paras[idx] for idx in reversed(np.argsort(score))]

	def rankDocuments(self, query, mu, k):
		"""Return the top passages for `query`, re-ranked by the cross-encoder.

		The first-stage score is `mu * sbert + (1 - mu) * tfidf`; the five
		best passages are then handed to `withCrossEncoder`.

		Args:
			query: the query.
			mu: weight of the SBERT score in the interpolation.
			k: currently unused; only the disabled knowledge-graph path
				consumed it.

		Returns:
			list: up to five passages from `self.document`, best first.
		"""
		# NOTE(review): the BM25 result is not used by the active formula;
		# the call is kept so the ranker is still exercised as before.
		bm25_scores = self.bm25_ranking.rankDocuments(query)
		tfidf_scores = self.tfidf_ranking.rankDocuments(query)
		sbert_scores = self.sbert_ranking.rankDocuments(query)
		# Combined scoring
		rrf = mu*sbert_scores + (1-mu)*tfidf_scores
		# Retrieve the top passages; `scores[0]` is the row of per-passage scores.
		scores = rrf.tolist()
		score_passage = [(s,i) for i, s in enumerate(scores[0])]
		score_passage.sort(reverse = True)
		return self.withCrossEncoder(query, [self.document[i[1]] for i in score_passage[:5]])