def noContextSim(self, tokens, trained_model):
    """Score each word pair with the context-free similarity metrics.

    For every token, collects the WordNet senses of both words, keeps
    only the sense vectors present in ``trained_model``, and records the
    max / avg / global similarity scores on a fresh TokenData.
    Returns the list of scored tokens.
    """
    print('Running: Word Similarity - Context False')
    parser = SynsetParserVector()
    scored_tokens = []
    for pair in tokens:
        # All WordNet senses for each word of the pair.
        senses_w1 = wordnet.synsets(pair.word1)
        senses_w2 = wordnet.synsets(pair.word2)
        # Drop senses whose vectors are absent from the trained model.
        vectors_w1 = parser.validate_synsets_model(
            pair.word1, senses_w1, trained_model)
        vectors_w2 = parser.validate_synsets_model(
            pair.word2, senses_w2, trained_model)
        result = TokenData()
        result.word1 = pair.word1
        result.word2 = pair.word2
        # Run every context-free word-similarity metric.
        result.sim.max = self.maxSim(vectors_w1, vectors_w2)
        result.sim.avg = self.avgSim(vectors_w1, vectors_w2)
        result.sim.glob = self.globalSim(vectors_w1, vectors_w2)
        scored_tokens.append(result)
    return scored_tokens
def yesContextSim(self, tokens, trained_model):
    """Score each word pair with both context and context-free metrics.

    For every token: cleans and averages the two sentence contexts,
    filters each word's WordNet sense vectors against ``trained_model``,
    then fills in the context-aware (maxC / avgC / globC) and the plain
    (max / avg / glob) similarity scores on a new TokenData.
    Returns the list of scored tokens.
    """
    print('Running: Word Similarity - Context True')
    plain_sim = NoContextSimilarity()
    parser = SynsetParserVector()
    cleaner = TextParser()
    scored_tokens = []
    for pair in tokens:
        senses_w1 = wordnet.synsets(pair.word1)
        senses_w2 = wordnet.synsets(pair.word2)
        # Normalize and tokenize each sentence context.
        raw_ctx_1 = cleaner.cleanText(pair.sent1)
        raw_ctx_2 = cleaner.cleanText(pair.sent2)
        # Average context vector for each word's sentence.
        ctx_vec_1 = self.__contextParser(raw_ctx_1, trained_model)
        ctx_vec_2 = self.__contextParser(raw_ctx_2, trained_model)
        # Keep only sense vectors present in the trained model.
        vectors_w1 = parser.validate_synsets_model(
            pair.word1, senses_w1, trained_model)
        vectors_w2 = parser.validate_synsets_model(
            pair.word2, senses_w2, trained_model)
        result = TokenData()
        result.word1 = pair.word1
        result.word2 = pair.word2
        # Context-aware metrics.
        result.sim.maxC = self.__maxSimC(vectors_w1, ctx_vec_1,
                                         vectors_w2, ctx_vec_2)
        result.sim.avgC = self.__avgSimC(vectors_w1, ctx_vec_1,
                                         vectors_w2, ctx_vec_2)
        result.sim.globC = self.__globalSimC(ctx_vec_1, ctx_vec_2)
        # Context-free metrics, delegated to NoContextSimilarity.
        result.sim.max = plain_sim.maxSim(vectors_w1, vectors_w2)
        result.sim.avg = plain_sim.avgSim(vectors_w1, vectors_w2)
        result.sim.glob = plain_sim.globalSim(vectors_w1, vectors_w2)
        scored_tokens.append(result)
    return scored_tokens
def convert_SCWS(self, input_file, output_file, delimiter='\t'):
    """Convert an SCWS dataset file into the project's context format.

    Reads ``input_file`` line by line, extracts both target words, their
    sentences (target words are wrapped in <b> </b> tags), and the human
    similarity rating; rescales the rating from the SCWS range to the
    cosine range via linear interpolation, then writes all tokens with
    the SCWS-specific context writer.
    """
    writer = FileManipulation()
    ranges = RangeData()
    cos_range = ranges.range_category['cos']
    scws_range = ranges.range_category['scws']
    converted = []
    with open(input_file, 'r', encoding='utf-8') as fin:
        for raw_line in fin:
            fields = raw_line.split(delimiter)
            entry = TokenData()
            entry.word1 = fields[1]
            entry.word2 = fields[3]
            entry.sent1 = fields[5]  # target word1 between <b> </b>
            entry.sent2 = fields[6]  # target word2 between <b> </b>
            # Rescale the human rating onto the cosine range.
            rating = float(fields[7].strip('\n'))
            entry.simvalue = numpy.interp(rating, scws_range, cos_range)
            converted.append(entry)
    # Specific writer for SCWS.
    writer.writeSimilarityContext(output_file, converted)
def readFileLine(self, fname, delimiter='\t'):
    """Parse a similarity dataset file into a list of TokenData.

    Each line is either a 3-field record ``word1 word2 sim`` or a
    5-field record ``word1 word2 sent1 sent2 sim``; the field count
    decides which layout applies. Returns the parsed tokens.
    """
    WORD_ONLY_FIELDS = 3
    parsed = []
    with open(fname, 'r', encoding='utf-8') as fin:
        for raw_line in fin:
            fields = raw_line.split(delimiter)
            entry = TokenData()
            entry.word1 = fields[0]
            entry.word2 = fields[1]
            if len(fields) == WORD_ONLY_FIELDS:
                # Layout: w1 w2 sim (3 fields)
                entry.simvalue = float(fields[2].strip('\n'))
            else:
                # Layout: w1 w2 s1 s2 sim (5 fields)
                entry.sent1 = fields[2]
                entry.sent2 = fields[3]
                entry.simvalue = float(fields[4].strip('\n'))
            parsed.append(entry)
    return parsed
def convert_MC28(self, input_file, output_file, delimiter=';'):
    """Convert the MC28 dataset into the project's word-pair format.

    Reads ``input_file`` line by line (``word1;word2;rating``), rescales
    each human rating from the MC28 range to the cosine range via linear
    interpolation, then writes the tokens with the word-pair writer.
    """
    writer = FileManipulation()
    ranges = RangeData()
    cos_range = ranges.range_category['cos']
    mc28_range = ranges.range_category['mc28']
    converted = []
    with open(input_file, 'r', encoding='utf-8') as fin:
        for raw_line in fin:
            fields = raw_line.split(delimiter)
            entry = TokenData()
            entry.word1 = fields[0]
            entry.word2 = fields[1]
            # Rescale the human rating onto the cosine range.
            rating = float(fields[2].strip('\n'))
            entry.simvalue = numpy.interp(rating, mc28_range, cos_range)
            converted.append(entry)
    writer.writeSimilarityWord(output_file, converted)