def evaluate_sentence_similarity(self, dataset_name="MSRvid", metric="wpath_graph",
                                 relatedness=True, save_results=False, database="wikidata"):
    """Evaluate concept-based sentence similarity against human judgements.

    Builds (or reuses) a knowledge graph over the dataset's concepts,
    computes a concept-similarity matrix, scores every sentence pair with
    Word Mover's similarity, and returns the Pearson correlation with the
    human-annotated scores.

    Parameters
    ----------
    dataset_name : str
        Name of the sentence-pair dataset (e.g. "MSRvid").
    metric : str
        Similarity metric forwarded to ``ConceptSimilarity.similarityMatrix``.
    relatedness : bool
        Forwarded to the DAC knowledge-graph builder.
    save_results : bool
        If True, persist the per-pair similarities and the correlation.
    database : str
        Backing knowledge base (e.g. "wikidata").

    Returns
    -------
    float
        Pearson correlation between computed and human similarities.
    """
    concepts, cc, texts = get_ideas_in_format(dataset_name, database=database)
    KG = DAC(concepts=concepts, dataset=dataset_name,
             relatedness=relatedness, database=database)
    if len(KG.graph) == 0:  # idiomatic len() instead of __len__()
        print("start building knowledge graph")
        KG.build_nx_graph()
    ConSim = ConceptSimilarity(KG)
    sim_M = ConSim.similarityMatrix(lcs_pref_value="freq1", metric=metric)
    WMD = WordMoversSimilarity(sim_M, KG._concepts)
    sen_pairs, human_sim = self._dataset.load_sentence_pairs_and_similarities(dataset_name)

    # Map each sentence to its bag of concept ids.
    map_sen2bow = dict(zip(texts, [[c["id"] for c in bow] for bow in cc]))
    known_concepts = set(KG._concepts)  # hoisted: built once, not per pair
    sim_values = []
    remove_index = []  # pair positions that produced no similarity score
    total_len = len(sen_pairs)
    for pg, (sen1, sen2) in enumerate(sen_pairs):
        show_progression(pg, total_len)
        # Restrict each bag of words to concepts present in the graph.
        bow1 = list(set(map_sen2bow[sen1]) & known_concepts)
        bow2 = list(set(map_sen2bow[sen2]) & known_concepts)
        sim_value = WMD.word_mover_distance(bow1, bow2)
        if sim_value is None:
            print(sen1, sen2)
            remove_index.append(pg)
        else:
            sim_values.append(sim_value)
    # Drop human scores for unevaluated pairs so both vectors align.
    human_sim = np.delete(human_sim, remove_index)
    cor = pearsonr(sim_values, human_sim)[0]
    if save_results:
        results = list(zip([round(x, 3) for x in sim_values], sen_pairs))
        self._dataset.save_dataset(
            dict(zip(("correlation", "similarities"), (cor, results))),
            dataset_name + "_" + metric)
    return cor
def global_secondorder_freq(self, write_value="freq1"):
    """Annotate every graph node with its BabelNet occurrence count.

    Queries BabelNet for each node's count and stores it as the
    ``write_value`` node attribute, then persists the graph to disk.
    Nodes whose query fails (e.g. a timeout) receive a large fallback
    count so downstream frequency-based metrics still work.

    Parameters
    ----------
    write_value : str
        Node-attribute key the count is written under.
    """
    num_nodes = len(self.graph)
    for i, n in enumerate(self.graph.nodes, start=1):
        show_progression(i, num_nodes)
        try:
            num1 = sparql_request(sql.query_babelnet_number_of(n))[0]["count"]["value"]
            self.graph.nodes[n][write_value] = int(num1)
        except Exception:
            # Was a bare `except:`, which also swallowed KeyboardInterrupt
            # and SystemExit; keep the best-effort fallback but narrow it.
            print("timeout babelnet", n)
            self.graph.nodes[n][write_value] = 6000000
    self.write_to_file()
def build_nx_graph(self, query=sql.query_ancestors):
    """Build the directed knowledge graph by expanding each seed concept.

    Starting from the initial concepts, repeatedly follows the given
    SPARQL ``query`` upward, adding edges until no new keys remain.
    The finished graph is stored on the instance and written to disk.
    """
    start_t = time.time()
    seeds = self._init_concepts
    graph = nx.DiGraph()
    total = len(seeds)
    for idx, concept in enumerate(seeds, start=1):
        show_progression(idx, total)
        pending = [concept]
        if self._relatedness:
            pending += self.add_edges_for_concept(
                query(concept, sql.relation_prop), graph)
        # Depth-first expansion: pop a key, add its edges, queue new keys.
        while pending:
            current = pending.pop()
            pending += self.add_edges_for_concept(query(current), graph)
    self.graph = graph
    self.init_key2pos()
    self.write_to_file()
    print("building the Graph from %s took: %s seconds."
          % (self._database, (time.time() - start_t)))
    return None
def build_nx_graph(self):
    """BabelNet-only graph builder using paths-to-top queries.

    Refuses to run for any other backing database. Expands every seed
    concept along its BabelNet paths to the top, stores the resulting
    DiGraph on the instance, and writes it to disk.
    """
    if self._database != "babelnet":
        print("Function only for babelnet")
        return None
    start_t = time.time()
    seeds = self._init_concepts
    graph = nx.DiGraph()
    total = len(seeds)
    for idx, concept in enumerate(seeds, start=1):
        show_progression(idx, total)
        pending = [concept]
        # Depth-first expansion over BabelNet path results.
        while pending:
            current = pending.pop()
            pending += self.add_edges_for_concept(
                sql.babelnet_paths2top(current), graph,
                split_delimiter="/s", prefix="bn:")
    self.graph = graph
    self.init_key2pos()
    self.write_to_file()
    print("building the Graph from %s took: %s seconds."
          % (self._database, (time.time() - start_t)))
    return None
def information_content(concepts):
    """Fetch Wikidata frequency counts for the given concepts.

    NOTE(review): the loop unpacks each element as ``(concept, index)``,
    so callers apparently pass pre-enumerated pairs; confirm this is not
    meant to be ``enumerate(concepts)`` (which yields ``(index, concept)``).

    Returns
    -------
    list
        One count value per concept, or None where the SPARQL request
        returned no result.
    """
    IC = []
    for c, i in concepts:
        show_progression(i, len(concepts))
        IC.append(sparql_request(sql.query_freq_wikidata(c, 2), "wikidata"))
    # `is not None` replaces the non-idiomatic `!= None` identity test.
    return [c[0]["count"]["value"] if c is not None else None for c in IC]