Example #1
    def separate_dataset(self, in_file, out_file, check_function):
        """
        Separate the original word similarity dataset into subsets.

        noun word pairs: noun_rg.txt, noun_mc.txt, noun_ws353.txt,
        noun_ws353-sim.txt, noun_simlex.txt

        the LCS (least common subsumer) is in the knowledge graph: graph_rg.txt,
        graph_mc.txt, graph_ws353.txt, graph_ws353-sim.txt, graph_simlex.txt

        both words are in the knowledge graph: type_rg.txt, type_mc.txt,
        type_ws353.txt, type_ws353-sim.txt, type_simlex.txt

        :param in_file: source dataset file
        :param out_file: target dataset file
        :param check_function: predicate that decides whether a word pair is kept
        :return:
        """
        out_data = []
        word_pairs, human = self.load_dataset(in_file)
        for i, pairs in enumerate(word_pairs):
            w1, w2 = pairs
            h = human[i]
            if check_function(w1, w2):
                out_data.append(' '.join([w1, w2, str(h)]))
        FileIO.save_list_file('dataset/wordsim/%s.txt' % out_file, out_data)
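For context, a minimal usage sketch follows: it passes a WordNet-based predicate as check_function to produce the noun subset. The both_nouns helper and the file names are illustrative assumptions, not part of the original class.

    # Hypothetical usage sketch: keep only pairs where both words have a noun
    # sense in WordNet. `both_nouns` and the file names are assumptions made
    # for illustration; they are not defined by the class above.
    from nltk.corpus import wordnet as wn

    def both_nouns(w1, w2):
        # a word qualifies if it has at least one noun synset
        return bool(wn.synsets(w1, pos=wn.NOUN) and wn.synsets(w2, pos=wn.NOUN))

    # evaluator.separate_dataset('rg.txt', 'noun_rg', both_nouns)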
Example #2
    def separate_dataset(self, in_file, out_file, check_function):
        """
        Separate the original word similarity dataset into subsets.

        noun word pairs: noun_rg.txt, noun_mc.txt, noun_ws353.txt,
        noun_ws353-sim.txt, noun_simlex.txt

        the LCS (least common subsumer) is in the knowledge graph: graph_rg.txt,
        graph_mc.txt, graph_ws353.txt, graph_ws353-sim.txt, graph_simlex.txt

        both words are in the knowledge graph: type_rg.txt, type_mc.txt,
        type_ws353.txt, type_ws353-sim.txt, type_simlex.txt

        :param in_file: source dataset file
        :param out_file: target dataset file
        :param check_function: predicate that decides whether a word pair is kept
        :return:
        """
        out_data = []
        word_pairs, human = self.load_dataset(in_file)
        for i, pairs in enumerate(word_pairs):
            w1, w2 = pairs
            h = human[i]
            if check_function(w1, w2):
                out_data.append(' '.join([w1, w2, str(h)]))
        FileIO.save_list_file('eval/word_similarity/%s.txt' % out_file, out_data)
Example #3
 def save_result(self, cor, sim_values, sim_name, dataset_name):
     """
     This function saves the result computed by a similarity metric.
     :param cor: correlation with human ratings
     :param sim_values: similarity scores for word pairs
     :param sim_name: the name of the similarity metric
     :param dataset_name: the name of the word similarity dataset
     :return:
     """
     data = ["%.3f" % cor]
     data += ["%.3f" % x for x in sim_values]
     FileIO.save_list_file('dataset/wordsim/results/%s-%s.txt' % (dataset_name, sim_name), data)
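A minimal usage sketch, assuming the correlation comes from scipy's spearmanr; the scores, ratings, and names below are placeholders, not values produced by the class above.

    # Hypothetical usage sketch for save_result with placeholder data.
    from scipy.stats import spearmanr

    sim_values = [0.91, 0.42, 0.77]      # scores from some similarity metric
    human_ratings = [3.8, 1.2, 3.1]      # gold-standard human judgments
    cor, _ = spearmanr(sim_values, human_ratings)
    # evaluator.save_result(cor, sim_values, 'wpath', 'noun_rg')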
Example #4
 def save_result(self, cor, sim_values, sim_name, dataset_name):
     """
     This function saves the result computed by a similarity metric.
     :param cor: correlation with human ratings
     :param sim_values: similarity scores for word pairs
     :param sim_name: the name of the similarity metric
     :param dataset_name: the name of the word similarity dataset
     :return:
     """
     data = ["%.3f" % cor]
     data += ["%.3f" % x for x in sim_values]
     FileIO.save_list_file('eval/word_similarity/results/%s-%s.txt' % (dataset_name, sim_name), data)
Example #5
 def evaluate(self, input_file, output_file):
     """
     Evaluate sentence similarity over a corpus of text pairs.
     :param input_file: corpus file
     :param output_file: result file
     :return: similarity scores of text pairs
     """
     corpus = self.load_dataset(input_file)
     print('dataset size:', len(corpus))
     result = [self._sim_metric(t1, t2) for t1, t2 in corpus]
     # "%.3f" already rounds to three decimals, and a plain list (not a lazy
     # map) keeps the returned value reusable after it is written to disk
     result = ['%.3f' % score for score in result]
     FileIO.save_list_file(output_file, result)
     return result
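The same formatting logic can be exercised standalone; the sketch below substitutes a toy Jaccard word-overlap function for self._sim_metric, which is an assumption for illustration only.

    # Standalone sketch of the evaluate() formatting step, using a toy Jaccard
    # word-overlap metric as a stand-in for self._sim_metric.
    def jaccard(t1, t2):
        s1, s2 = set(t1.lower().split()), set(t2.lower().split())
        return len(s1 & s2) / float(len(s1 | s2)) if s1 | s2 else 0.0

    corpus = [('a cat sat', 'a cat slept'), ('dogs bark', 'birds sing')]
    result = ['%.3f' % jaccard(t1, t2) for t1, t2 in corpus]
    print(result)  # ['0.500', '0.000']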