def separate_dataset(self, in_file, out_file, check_function):
    """
    Filter a word similarity dataset down to the pairs accepted by a predicate.

    The dataset is read with ``self.load_dataset``; each word pair for which
    ``check_function`` returns a truthy value is kept, together with its human
    rating, as one ``"word1 word2 rating"`` line. The kept lines are written
    to ``dataset/wordsim/<out_file>.txt`` through ``FileIO.save_list_file``.

    Output names used for the derived datasets:
    noun pairs: noun_rg.txt, noun_mc.txt, noun_ws353.txt,
    noun_ws353-sim.txt, noun_simlex.txt
    the lcs is in knowledge graph: graph_rg.txt, graph_mc.txt,
    graph_ws353.txt, graph_ws353-sim.txt, graph_simlex.txt
    both words are in knowledge graph: type_rg.txt, type_mc.txt,
    type_ws353.txt, type_ws353-sim.txt, type_simlex.txt

    :param in_file: source dataset, passed unchanged to ``self.load_dataset``
    :param out_file: base name (no directory, no extension) of the target file
    :param check_function: callable taking the two words of a pair; a truthy
        result keeps the pair
    :return: None
    """
    pairs, ratings = self.load_dataset(in_file)
    kept_lines = []
    for index, (first, second) in enumerate(pairs):
        # The rating is looked up before the predicate runs.
        rating = ratings[index]
        if not check_function(first, second):
            continue
        kept_lines.append(' '.join([first, second, str(rating)]))
    target_path = 'dataset/wordsim/%s.txt' % out_file
    FileIO.save_list_file(target_path, kept_lines)
def separate_dataset(self, in_file, out_file, check_function):
    """
    Filter a word similarity dataset down to the pairs accepted by a predicate.

    The dataset is read with ``self.load_dataset``; each word pair for which
    ``check_function`` returns a truthy value is kept, together with its human
    rating, as one ``"word1 word2 rating"`` line. The kept lines are written
    to ``eval/word_similarity/<out_file>.txt`` through
    ``FileIO.save_list_file``.

    Output names used for the derived datasets:
    noun pairs: noun_rg.txt, noun_mc.txt, noun_ws353.txt,
    noun_ws353-sim.txt, noun_simlex.txt
    the lcs is in knowledge graph: graph_rg.txt, graph_mc.txt,
    graph_ws353.txt, graph_ws353-sim.txt, graph_simlex.txt
    both words are in knowledge graph: type_rg.txt, type_mc.txt,
    type_ws353.txt, type_ws353-sim.txt, type_simlex.txt

    :param in_file: source dataset, passed unchanged to ``self.load_dataset``
    :param out_file: base name (no directory, no extension) of the target file
    :param check_function: callable taking the two words of a pair; a truthy
        result keeps the pair
    :return: None
    """
    pairs, ratings = self.load_dataset(in_file)
    kept_lines = []
    for index, (first, second) in enumerate(pairs):
        # The rating is looked up before the predicate runs.
        rating = ratings[index]
        if not check_function(first, second):
            continue
        kept_lines.append(' '.join([first, second, str(rating)]))
    target_path = 'eval/word_similarity/%s.txt' % out_file
    FileIO.save_list_file(target_path, kept_lines)
def save_result(self, cor, sim_values, sim_name, dataset_name):
    """
    Write the scores produced by one similarity metric on one dataset.

    The output holds the correlation on its first line followed by one
    similarity score per line, every number formatted with three decimals.
    It is saved to ``dataset/wordsim/results/<dataset_name>-<sim_name>.txt``
    through ``FileIO.save_list_file``.

    :param cor: correlation with the human ratings
    :param sim_values: iterable of similarity scores for the word pairs
    :param sim_name: name of the similarity metric
    :param dataset_name: name of the word similarity dataset
    :return: None
    """
    rows = ["%.3f" % cor]
    rows.extend("%.3f" % score for score in sim_values)
    target_path = 'dataset/wordsim/results/%s-%s.txt' % (dataset_name, sim_name)
    FileIO.save_list_file(target_path, rows)
def save_result(self, cor, sim_values, sim_name, dataset_name):
    """
    Write the scores produced by one similarity metric on one dataset.

    The output holds the correlation on its first line followed by one
    similarity score per line, every number formatted with three decimals.
    It is saved to
    ``eval/word_similarity/results/<dataset_name>-<sim_name>.txt`` through
    ``FileIO.save_list_file``.

    :param cor: correlation with the human ratings
    :param sim_values: iterable of similarity scores for the word pairs
    :param sim_name: name of the similarity metric
    :param dataset_name: name of the word similarity dataset
    :return: None
    """
    rows = ["%.3f" % cor]
    rows.extend("%.3f" % score for score in sim_values)
    target_path = 'eval/word_similarity/results/%s-%s.txt' % (dataset_name, sim_name)
    FileIO.save_list_file(target_path, rows)
def evaluate(self, input_file, output_file):
    """
    Evaluate the sentence similarity of every text pair in a corpus.

    The corpus is read with ``self.load_dataset`` and each pair is scored
    with ``self._sim_metric``. Scores are rounded to three decimals,
    formatted as strings, written to ``output_file`` through
    ``FileIO.save_list_file`` and returned.

    :param input_file: corpus file, passed unchanged to ``self.load_dataset``
    :param output_file: result file
    :return: list of similarity scores of the text pairs, as strings with
        three decimals
    """
    corpus = self.load_dataset(input_file)
    # A single-argument print call is valid on Python 2 and Python 3 alike.
    # The two spaces reproduce what the former print statement emitted
    # (literal trailing space plus the item separator).
    print('dataset size:  %d' % len(corpus))
    scores = [self._sim_metric(t1, t2) for t1, t2 in corpus]
    # Build a real list rather than using map(): on Python 3 map() yields a
    # one-shot iterator that saving would exhaust before it is returned.
    result = ["%.3f" % round(score, 3) for score in scores]
    FileIO.save_list_file(output_file, result)
    return result
def evaluate(self, input_file, output_file):
    """
    Evaluate the sentence similarity of every text pair in a corpus.

    The corpus is read with ``self.load_dataset`` and each pair is scored
    with ``self._sim_metric``. Scores are rounded to three decimals,
    formatted as strings, written to ``output_file`` through
    ``FileIO.save_list_file`` and returned.

    :param input_file: corpus file, passed unchanged to ``self.load_dataset``
    :param output_file: result file
    :return: list of similarity scores of the text pairs, as strings with
        three decimals
    """
    corpus = self.load_dataset(input_file)
    # A single-argument print call is valid on Python 2 and Python 3 alike.
    # The two spaces reproduce what the former print statement emitted
    # (literal trailing space plus the item separator).
    print('dataset size:  %d' % len(corpus))
    scores = [self._sim_metric(t1, t2) for t1, t2 in corpus]
    # Build a real list rather than using map(): on Python 3 map() yields a
    # one-shot iterator that saving would exhaust before it is returned.
    result = ["%.3f" % round(score, 3) for score in scores]
    FileIO.save_list_file(output_file, result)
    return result