def test_sentence_1(self):
    test_sentence = "Hello, Jack. How is it going? Not bad; pretty good, actually... Very very good, in fact."
    results = a6.get_sentence_lists(test_sentence)
    answer = [['hello', 'jack'],
              ['how', 'is', 'it', 'going'],
              ['not', 'bad', 'pretty', 'good', 'actually'],
              ['very', 'very', 'good', 'in', 'fact']]
    self.assertEqual(answer, results)
def test_sentence_from_file(self):
    results = a6.get_sentence_lists_from_files(["sample_file.txt"])
    answer = [['this', 'is', 'a', 'test'],
              ['i', 'm', 'going', 'to', 'say', 'it', 'is'],
              ['i', 'hope', 'it', 'works'],
              ['if', 'not', 'then', 'you', 'still', 'have', 'work', 'to', 'do'],
              ['e', 'm', 'c', '2']]
    self.assertEqual(answer, results)
def getSynonymMeaning(self, word_surface):
    """Build a reply listing the Japanese translations of word_surface and a few synonyms."""
    first_match = True
    return_message = "\"" + word_surface + "\""
    if word_surface not in self.getSurfaceDict().keys():
        # "... no corresponding Japanese translation was found."
        return return_message + "に相当する日本語訳が見つかりませんでした."
    for word in self.words_list:
        if word_surface != word.getSurface():
            continue
        if first_match:
            # "As a Japanese translation of ..., ... exists."
            return_message += "の日本語訳として「"
            first_match = False
        else:
            # "Also, as another Japanese translation, ... exists."
            return_message += "また,別の日本語訳として「"
        for meaning in word.getMeaning():
            return_message += meaning
        return_message += "」が存在します."
        # Suggest up to five randomly chosen synonyms of the first meaning.
        return_message += "さらに,その類義語として"
        synonym_list = Synonym.getSynonymList(word.getMeaning()[0])
        for _ in range(5):
            s = random.choice(synonym_list)
            if s == word.getMeaning()[0]:
                continue
            return_message += "「" + s + "」"
        return_message += "が考えられます."
    return return_message
import json
import os

import jieba.posseg


def main(dirname):
    wordList = bulitDict(dirname + '/dict')
    synonymDict = Synonym.Synonym(dirname + "/dict/words.vector")
    value = []

    # Load the stop-word list, one word per line.
    stopWord = []
    with open(dirname + "/dict/stopword.txt", "r") as f:
        for line in f:
            stopWord.append(line.strip())

    files = os.listdir(dirname + '/data')
    for filename in files:
        with open(dirname + "/data/" + filename) as data_file:
            data = json.load(data_file)
        for cmt in data:
            cmt_seg = jieba.posseg.cut(cmt['text'])
            print(cmt_seg)
            cmt_value = 1
            for seg in cmt_seg:
                if seg.word in stopWord:
                    continue
                try:
                    score = wordList[seg.word]
                except KeyError:
                    print("Word " + seg.word + " is not in the normal dict. Searching in the synonyms...")
                    try:
                        # Assumes get_synonym returns (synonym, weight).
                        syno = synonymDict.get_synonym(seg.word)
                        score = wordList[syno[0]] * syno[1]
                    except Exception:
                        print("Word " + seg.word + " POS " + seg.flag)
                        continue  # no usable score for this word
                cmt_value *= score
            value.append(cmt_value)
def getSynonymSurface(self, word_meaning):
    """Look up セラフェノ (target-language) translations via the synonyms of word_meaning."""
    found = False
    # "No セラフェノ translation corresponding to ... was found."
    return_message = "「" + word_meaning + "」に相当するセラフェノ訳が見つかりませんでした."
    synonym_list = Synonym.getSynonymList(word_meaning)
    for synonym in synonym_list:
        if synonym in self.getMeaningDict().keys():
            # "As the セラフェノ translation of its synonym ..., \"...\" exists."
            return_message += "「" + word_meaning + "」の類義語「"
            return_message += synonym + "」のセラフェノ訳として"
            for word in self.words_list:
                if synonym in word.getMeaning():
                    return_message += "\""
                    return_message += word.getSurface()
                    return_message += "\"が存在します."
                    found = True
    if not found:
        # "No セラフェノ translation was found for the synonyms either."
        return_message += "類義語に対するセラフェノ訳が見つかりませんでした."
    return return_message
def similarity_tags(tag1, tag2):
    # Nlev_ij = lev_ij / max(len(t_i), len(t_j)); the smaller the better.
    # The normalized Levenshtein distance scales the graph distance.
    v1 = graph_distance(tag1, tag2)
    v2 = levenshtein(tag1, tag2) / (1.0 * max(len(tag1), len(tag2)))
    # v2 = n + (1 - n) * lev / max(len1, len2)
    v2 = 1 - RATIO_LEVENSHTEIN + RATIO_LEVENSHTEIN * v2
    # 2013-10-10: tried a plain multiplication; performance was bad because v1 and v2 are not on the same scale.
    print >> outputf, "v2 %3.8f" % (v2), "| tag:", tag1, "| tagto: ", tag2
    # Consider synonymy.
    if Synonym.check_synonym(tag1, tag2) == 1:
        is_synonym = SYNONYM_YES_RATIO
    else:
        is_synonym = SYNONYM_NO_RATIO
    if v2 != 0.0:
        # v2 = v2 ** (1.0 / 2)  # was too large last time; square-root damping
        return v1 * v2 * is_synonym
    else:
        return v1 * is_synonym
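# For reference, a minimal, self-contained sketch of the normalized-Levenshtein blend used above
# (v2 = 1 - RATIO_LEVENSHTEIN + RATIO_LEVENSHTEIN * lev / max(len1, len2)). The _levenshtein_demo
# helper and the RATIO_LEVENSHTEIN value in the example comment are illustrative assumptions, not
# this module's actual definitions.
def _levenshtein_demo(s1, s2):
    # Classic dynamic-programming edit distance.
    prev = list(range(len(s2) + 1))
    for i, c1 in enumerate(s1, 1):
        cur = [i]
        for j, c2 in enumerate(s2, 1):
            cur.append(min(prev[j] + 1,                  # deletion
                           cur[j - 1] + 1,               # insertion
                           prev[j - 1] + (c1 != c2)))    # substitution
        prev = cur
    return prev[-1]

# Example: "python" vs "pithon" -> edit distance 1, normalized 1/6 ~= 0.167;
# with an assumed RATIO_LEVENSHTEIN of 0.5, the blended factor is 1 - 0.5 + 0.5 * 0.167 ~= 0.583.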
# Build the tag graph: each tag maps to its neighbours with an edge distance.
graph = {}
for tag_item in myTagSet:
    graph[tag_item] = {}
    for pair_item in myTagPairs:
        # Get related pairs that contain this tag.
        tag_pair = pair_item.split(',')
        tag1 = tag_pair[0]
        tag2 = tag_pair[1]
        # Normalized Levenshtein distance between the two tags.
        lev2 = levenshtein(tag1, tag2) / (1.0 * max(len(tag1), len(tag2)))
        # Synonym factor.
        if Synonym.check_synonym(tag1, tag2) == 1:
            is_synonym = SYNONYM_YES_RATIO
        else:
            is_synonym = SYNONYM_NO_RATIO
        if lev2 == 0.0:
            likelihoodOfPair = 0.0
        else:
            # distance = 1 / (frequency * levenshtein * synonym)
            likelihoodOfPair = 1.0 / (myTagPairs[pair_item] * lev2 * is_synonym)
        if tag_item == tag_pair[0]:
            # Save the other tag in the graph as an edge.
            graph[tag_item][tag_pair[1]] = likelihoodOfPair
        if tag_item == tag_pair[1]:
            graph[tag_item][tag_pair[0]] = likelihoodOfPair
def test_sentence_from_assignment(self):
    test_sentence_list = [['i', 'am', 'a', 'sick', 'man'],
                          ['i', 'am', 'a', 'spiteful', 'man'],
                          ['i', 'am', 'an', 'unattractive', 'man'],
                          ['i', 'believe', 'my', 'liver', 'is', 'diseased'],
                          ['however', 'i', 'know', 'nothing', 'at', 'all', 'about', 'my', 'disease',
                           'and', 'do', 'not', 'know', 'for', 'certain', 'what', 'ails', 'me']]
    results = a6.build_semantic_descriptors(test_sentence_list)
    answer = {
        'sick': {'man': 1, 'a': 1, 'am': 1, 'i': 1},
        'am': {'sick': 1, 'a': 2, 'an': 1, 'i': 3, 'man': 3, 'unattractive': 1, 'spiteful': 1},
        'know': {'what': 2, 'about': 2, 'disease': 2, 'my': 2, 'me': 2, 'certain': 2, 'and': 2, 'for': 2, 'do': 2, 'i': 2, 'at': 2, 'however': 2, 'all': 2, 'ails': 2, 'not': 2, 'nothing': 2},
        'disease': {'know': 2, 'ails': 1, 'about': 1, 'my': 1, 'me': 1, 'certain': 1, 'and': 1, 'for': 1, 'do': 1, 'i': 1, 'at': 1, 'however': 1, 'all': 1, 'what': 1, 'not': 1, 'nothing': 1},
        'liver': {'believe': 1, 'is': 1, 'diseased': 1, 'i': 1, 'my': 1},
        'spiteful': {'man': 1, 'a': 1, 'am': 1, 'i': 1},
        'my': {'certain': 1, 'liver': 1, 'know': 2, 'ails': 1, 'and': 1, 'disease': 1, 'i': 2, 'not': 1, 'me': 1, 'believe': 1, 'nothing': 1, 'for': 1, 'do': 1, 'diseased': 1, 'at': 1, 'however': 1, 'all': 1, 'is': 1, 'about': 1, 'what': 1},
        'man': {'sick': 1, 'a': 2, 'am': 3, 'i': 3, 'unattractive': 1, 'an': 1, 'spiteful': 1},
        'and': {'know': 2, 'ails': 1, 'about': 1, 'my': 1, 'disease': 1, 'me': 1, 'certain': 1, 'nothing': 1, 'for': 1, 'do': 1, 'i': 1, 'at': 1, 'however': 1, 'all': 1, 'what': 1, 'not': 1},
        'an': {'man': 1, 'unattractive': 1, 'am': 1, 'i': 1},
        'do': {'know': 2, 'ails': 1, 'about': 1, 'my': 1, 'disease': 1, 'me': 1, 'certain': 1, 'and': 1, 'for': 1, 'i': 1, 'at': 1, 'however': 1, 'all': 1, 'what': 1, 'not': 1, 'nothing': 1},
        'ails': {'know': 2, 'disease': 1, 'not': 1, 'my': 1, 'me': 1, 'certain': 1, 'and': 1, 'for': 1, 'do': 1, 'i': 1, 'at': 1, 'however': 1, 'all': 1, 'what': 1, 'about': 1, 'nothing': 1},
        'however': {'know': 2, 'ails': 1, 'about': 1, 'disease': 1, 'my': 1, 'me': 1, 'certain': 1, 'and': 1, 'for': 1, 'do': 1, 'i': 1, 'at': 1, 'all': 1, 'what': 1, 'not': 1, 'nothing': 1},
        'all': {'know': 2, 'ails': 1, 'about': 1, 'my': 1, 'disease': 1, 'me': 1, 'certain': 1, 'and': 1, 'for': 1, 'do': 1, 'i': 1, 'at': 1, 'however': 1, 'nothing': 1, 'what': 1, 'not': 1},
        'is': {'believe': 1, 'diseased': 1, 'liver': 1, 'i': 1, 'my': 1},
        'not': {'know': 2, 'ails': 1, 'disease': 1, 'my': 1, 'me': 1, 'certain': 1, 'and': 1, 'for': 1, 'do': 1, 'i': 1, 'at': 1, 'however': 1, 'all': 1, 'what': 1, 'about': 1, 'nothing': 1},
        'nothing': {'know': 2, 'ails': 1, 'about': 1, 'my': 1, 'disease': 1, 'me': 1, 'certain': 1, 'and': 1, 'for': 1, 'do': 1, 'i': 1, 'at': 1, 'however': 1, 'all': 1, 'what': 1, 'not': 1},
        'believe': {'diseased': 1, 'is': 1, 'liver': 1, 'i': 1, 'my': 1},
        'what': {'know': 2, 'disease': 1, 'not': 1, 'my': 1, 'me': 1, 'certain': 1, 'and': 1, 'for': 1, 'do': 1, 'i': 1, 'at': 1, 'however': 1, 'all': 1, 'ails': 1, 'about': 1, 'nothing': 1},
        'a': {'sick': 1, 'man': 2, 'am': 2, 'i': 2, 'spiteful': 1},
        'about': {'know': 2, 'ails': 1, 'disease': 1, 'my': 1, 'me': 1, 'certain': 1, 'and': 1, 'for': 1, 'do': 1, 'i': 1, 'at': 1, 'however': 1, 'all': 1, 'what': 1, 'not': 1, 'nothing': 1},
        'me': {'know': 2, 'ails': 1, 'disease': 1, 'not': 1, 'my': 1, 'certain': 1, 'and': 1, 'for': 1, 'do': 1, 'i': 1, 'at': 1, 'however': 1, 'all': 1, 'what': 1, 'about': 1, 'nothing': 1},
        'for': {'know': 2, 'ails': 1, 'disease': 1, 'not': 1, 'my': 1, 'me': 1, 'certain': 1, 'and': 1, 'do': 1, 'i': 1, 'at': 1, 'however': 1, 'all': 1, 'what': 1, 'about': 1, 'nothing': 1},
        'diseased': {'believe': 1, 'is': 1, 'liver': 1, 'i': 1, 'my': 1},
        'at': {'know': 2, 'ails': 1, 'about': 1, 'my': 1, 'disease': 1, 'me': 1, 'certain': 1, 'and': 1, 'for': 1, 'do': 1, 'i': 1, 'however': 1, 'all': 1, 'what': 1, 'not': 1, 'nothing': 1},
        'certain': {'know': 2, 'ails': 1, 'disease': 1, 'not': 1, 'my': 1, 'me': 1, 'and': 1, 'for': 1, 'do': 1, 'i': 1, 'at': 1, 'however': 1, 'all': 1, 'what': 1, 'about': 1, 'nothing': 1},
        'i': {'sick': 1, 'am': 3, 'know': 2, 'disease': 1, 'liver': 1, 'spiteful': 1, 'my': 2, 'believe': 1, 'and': 1, 'an': 1, 'do': 1, 'ails': 1, 'however': 1, 'all': 1, 'is': 1, 'not': 1, 'nothing': 1, 'man': 3, 'what': 1, 'a': 2, 'me': 1, 'for': 1, 'diseased': 1, 'at': 1, 'certain': 1, 'unattractive': 1, 'about': 1},
        'unattractive': {'man': 1, 'am': 1, 'i': 1, 'an': 1}}
    self.assertEqual(answer, results)
def test_sentence_simple(self):
    test_sentence_list = [['two', 'words']]
    results = a6.build_semantic_descriptors(test_sentence_list)
    answer = {'two': {'words': 1}, 'words': {'two': 1}}
    self.assertEqual(answer, results)
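# The expected dictionaries above count, for each word, how often it co-occurs with every
# other word across sentences (duplicate occurrences within a sentence are counted, which is
# why 'know' contributes 2 in the larger test). Below is a minimal sketch consistent with
# those expected values; the real a6.build_semantic_descriptors may be implemented differently.
def build_semantic_descriptors_sketch(sentences):
    descriptors = {}
    for sentence in sentences:
        for w1 in sentence:
            for w2 in sentence:
                if w1 == w2:
                    continue
                descriptors.setdefault(w1, {})
                descriptors[w1][w2] = descriptors[w1].get(w2, 0) + 1
    return descriptors

# build_semantic_descriptors_sketch([['two', 'words']]) -> {'two': {'words': 1}, 'words': {'two': 1}}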
def test_sentence_removes_all_punctuations(self):
    test_sentence = "Hello( there, this{ }is_ a; test. Hope; it: works% for* you."
    results = a6.get_sentence_lists(test_sentence)
    answer = [['hello', 'there', 'this', 'is', 'a', 'test'],
              ['hope', 'it', 'works', 'for', 'you']]
    self.assertEqual(answer, results)
def test_NormReturnsANormalizedVectorSOS(self):
    """Tests that norm returns the square root of the sum of squares (the Euclidean norm)."""
    result = a6.norm({"a": 5, "b": 8, "c": 9})
    self.assertAlmostEqual(13.03840481, result, 3)
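# The expected value 13.038... is sqrt(5**2 + 8**2 + 9**2) = sqrt(170), i.e. the Euclidean
# norm of the dictionary's values. A minimal sketch of such a function follows; the actual
# a6.norm may differ in its details.
import math

def norm_sketch(vec):
    # Square root of the sum of squared values of the sparse vector.
    return math.sqrt(sum(value ** 2 for value in vec.values()))

# norm_sketch({"a": 5, "b": 8, "c": 9}) -> 13.038404810405298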
def test_sentence_has_line_feeds(self):
    test_sentence = "Hello there\nthis is a test."
    results = a6.get_sentence_lists(test_sentence)
    answer = [['hello', 'there', 'this', 'is', 'a', 'test']]
    self.assertEqual(answer, results)
def test_sentence_double_spaces(self):
    test_sentence = "Hello there? this is a test."
    results = a6.get_sentence_lists(test_sentence)
    answer = [['hello', 'there'], ['this', 'is', 'a', 'test']]
    self.assertEqual(answer, results)
def test_sentence(self):
    test_sentence = "a quick test"
    results = a6.get_sentence_lists(test_sentence)
    self.assertEqual([["a", "quick", "test"]], results)
def test_cosine_similarity(self):
    result = a6.cosine_similarity({"a": 2, "d": 8}, {"a": 5, "b": 8, "c": 9})
    self.assertAlmostEqual(0.0930081664755, result, 3)
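# The expected value 0.0930... is the cosine similarity of the two sparse vectors: the dot
# product over their shared keys (only "a" here: 2 * 5 = 10) divided by the product of their
# Euclidean norms (sqrt(68) * sqrt(170)). A minimal sketch consistent with that value; the
# actual a6.cosine_similarity may differ.
import math

def cosine_similarity_sketch(vec1, vec2):
    # Dot product over keys present in both sparse vectors.
    dot = sum(vec1[key] * vec2[key] for key in vec1 if key in vec2)
    norm1 = math.sqrt(sum(v ** 2 for v in vec1.values()))
    norm2 = math.sqrt(sum(v ** 2 for v in vec2.values()))
    return dot / (norm1 * norm2)

# cosine_similarity_sketch({"a": 2, "d": 8}, {"a": 5, "b": 8, "c": 9}) -> 0.09300816...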