Example #1
0
    def noContextSim(self, tokens, trained_model):
        """Score every word pair with the context-free similarity metrics.

        For each token pair, fetch the WordNet synsets of both words, keep
        only the synsets that exist in the trained model, then compute the
        max / average / global similarity scores.

        :param tokens: iterable of TokenData pairs (word1, word2)
        :param trained_model: embedding model used to validate synsets
        :return: list of new TokenData objects with sim.max/avg/glob filled in
        """
        print('Running: Word Similarity - Context False')
        parser = SynsetParserVector()
        results = []

        for pair in tokens:
            scored = TokenData()

            # Raw WordNet synsets for each word of the pair.
            syns_w1 = wordnet.synsets(pair.word1)
            syns_w2 = wordnet.synsets(pair.word2)

            # Keep only synsets the trained model actually contains.
            vecs_w1 = parser.validate_synsets_model(
                pair.word1, syns_w1, trained_model)
            vecs_w2 = parser.validate_synsets_model(
                pair.word2, syns_w2, trained_model)

            # Context-free similarity metrics.
            scored.sim.max = self.maxSim(vecs_w1, vecs_w2)
            scored.sim.avg = self.avgSim(vecs_w1, vecs_w2)
            scored.sim.glob = self.globalSim(vecs_w1, vecs_w2)

            scored.word1 = pair.word1
            scored.word2 = pair.word2
            results.append(scored)

        return results
Example #2
0
    def yesContextSim(self, tokens, trained_model):
        """Score every word pair with both contextual and context-free metrics.

        Each pair carries two sentences (sent1/sent2) giving the context of
        each word. The contexts are cleaned and turned into averaged model
        vectors, the words' WordNet synsets are filtered against the trained
        model, and then the contextual (maxC/avgC/globC) and context-free
        (max/avg/glob) similarity metrics are computed.

        :param tokens: iterable of TokenData with word1/word2 and sent1/sent2
        :param trained_model: embedding model used for synsets and contexts
        :return: list of new TokenData objects with all sim fields filled in
        """
        print('Running: Word Similarity - Context True')
        plain_sim = NoContextSimilarity()
        parser = SynsetParserVector()
        cleaner = TextParser()
        results = []

        for pair in tokens:
            scored = TokenData()

            syns_w1 = wordnet.synsets(pair.word1)
            syns_w2 = wordnet.synsets(pair.word2)

            # Clean and tokenize each word's sentence context.
            raw_ctx1 = cleaner.cleanText(pair.sent1)
            raw_ctx2 = cleaner.cleanText(pair.sent2)

            # Average model vector for each context.
            ctx1 = self.__contextParser(raw_ctx1, trained_model)
            ctx2 = self.__contextParser(raw_ctx2, trained_model)

            # Keep only synsets the trained model actually contains.
            vecs_w1 = parser.validate_synsets_model(
                pair.word1, syns_w1, trained_model)
            vecs_w2 = parser.validate_synsets_model(
                pair.word2, syns_w2, trained_model)

            # Contextual similarity metrics.
            scored.sim.maxC = self.__maxSimC(vecs_w1, ctx1, vecs_w2, ctx2)
            scored.sim.avgC = self.__avgSimC(vecs_w1, ctx1, vecs_w2, ctx2)
            scored.sim.globC = self.__globalSimC(ctx1, ctx2)

            # Context-free similarity metrics, delegated to the
            # no-context implementation.
            scored.sim.max = plain_sim.maxSim(vecs_w1, vecs_w2)
            scored.sim.avg = plain_sim.avgSim(vecs_w1, vecs_w2)
            scored.sim.glob = plain_sim.globalSim(vecs_w1, vecs_w2)

            scored.word1 = pair.word1
            scored.word2 = pair.word2
            results.append(scored)

        return results
Example #3
0
    def convert_SCWS(self, input_file, output_file, delimiter='\t'):
        """Convert an SCWS dataset file into the project's context format.

        Each input line is a delimited record where columns 1 and 3 hold the
        two words, columns 5 and 6 hold their sentences (the target word is
        wrapped in <b> </b>), and column 7 holds the human similarity score.
        Scores are re-mapped from the SCWS range onto the cosine range before
        writing the tokens out with the SCWS-specific writer.

        :param input_file: path to the raw SCWS file
        :param output_file: path for the converted output
        :param delimiter: column separator (default: tab)
        """
        writer = FileManipulation()
        ranges = RangeData()
        # Score rescaling bounds are the same for every line — hoist them.
        cos_range = ranges.range_category['cos']
        scws_range = ranges.range_category['scws']

        converted = []
        with open(input_file, 'r', encoding='utf-8') as fin:
            for line in fin:
                cols = line.split(delimiter)
                token = TokenData()
                token.word1 = cols[1]
                token.word2 = cols[3]
                token.sent1 = cols[5]  # target word1 between <b> </b>
                token.sent2 = cols[6]  # target word2 between <b> </b>
                raw_score = float(cols[7].strip('\n'))
                # Map the SCWS score into the cosine-similarity range.
                token.simvalue = numpy.interp(raw_score, scws_range, cos_range)
                converted.append(token)

        writer.writeSimilarityContext(output_file, converted)  # specific writer for SCWS
Example #4
0
    def readFileLine(self, fname, delimiter='\t'):
        """Read a delimited word-similarity file into TokenData objects.

        Supports two record layouts per line:
        - 3 columns: word1, word2, simvalue
        - 5 columns: word1, word2, sent1, sent2, simvalue

        :param fname: path to the input file
        :param delimiter: column separator (default: tab)
        :return: list of TokenData objects
        """
        WORD_ONLY_COLUMNS = 3  # w1 w2 sim
        parsed = []

        with open(fname, 'r', encoding='utf-8') as fin:
            for line in fin:
                fields = line.split(delimiter)

                token = TokenData()
                token.word1 = fields[0]
                token.word2 = fields[1]

                if len(fields) == WORD_ONLY_COLUMNS:
                    # w1 w2 sim (3 columns)
                    token.simvalue = float(fields[2].strip('\n'))
                else:
                    # w1 w2 s1 s2 sim (5 columns)
                    token.sent1 = fields[2]
                    token.sent2 = fields[3]
                    token.simvalue = float(fields[4].strip('\n'))

                parsed.append(token)

        return parsed
Example #5
0
    def convert_MC28(self, input_file, output_file, delimiter=';'):
        """Convert the MC28 dataset into the project's word-pair format.

        Each input line holds: word1, word2, similarity score. Scores are
        re-mapped from the MC28 range onto the cosine range before the
        tokens are written with the word-pair writer.

        :param input_file: path to the raw MC28 file
        :param output_file: path for the converted output
        :param delimiter: column separator (default: ';')
        """
        writer = FileManipulation()
        ranges = RangeData()
        # Score rescaling bounds are the same for every line — hoist them.
        cos_range = ranges.range_category['cos']
        mc28_range = ranges.range_category['mc28']

        converted = []
        with open(input_file, 'r', encoding='utf-8') as fin:
            for line in fin:
                cols = line.split(delimiter)
                token = TokenData()
                token.word1 = cols[0]
                token.word2 = cols[1]
                raw_score = float(cols[2].strip('\n'))
                # Map the MC28 score into the cosine-similarity range.
                token.simvalue = numpy.interp(raw_score, mc28_range, cos_range)
                converted.append(token)

        writer.writeSimilarityWord(output_file, converted)