示例#1
0
    def test_sentence_1(self):
        """Check get_sentence_lists on a four-sentence string with mixed punctuation.

        The expected value is one lower-case word list per sentence, with the
        commas, semicolon and ellipsis removed.
        """
        text = "Hello, Jack. How is it going? Not bad; pretty good, actually... Very very good, in fact."
        expected = [
            ['hello', 'jack'],
            ['how', 'is', 'it', 'going'],
            ['not', 'bad', 'pretty', 'good', 'actually'],
            ['very', 'very', 'good', 'in', 'fact'],
        ]

        actual = a6.get_sentence_lists(text)

        self.assertEqual(expected, actual)
示例#2
0
 def test_sentence_from_file(self):
     """Check get_sentence_lists_from_files against the word lists expected for sample_file.txt."""
     expected = [
         ['this', 'is', 'a', 'test'],
         ['i', 'm', 'going', 'to', 'say', 'it', 'is'],
         ['i', 'hope', 'it', 'works'],
         ['if', 'not', 'then', 'you', 'still', 'have', 'work', 'to', 'do'],
         ['e', 'm', 'c', '2'],
     ]

     actual = a6.get_sentence_lists_from_files(["sample_file.txt"])

     self.assertEqual(expected, actual)
示例#3
0
    def getSynonymMeaning(self, word_surface):
        """Build a Japanese message listing the meanings and synonyms of a word.

        Every entry in ``self.words_list`` whose surface equals ``word_surface``
        contributes one sentence with its concatenated meanings, followed by a
        sentence with up to five randomly drawn synonyms of its first meaning
        (draws equal to that meaning are dropped, so fewer may appear).

        Args:
            word_surface: surface form to look up.

        Returns:
            The message string. It always starts with the quoted surface; when
            the surface is not a key of ``self.getSurfaceDict()`` a "no
            translation found" sentence follows instead.
        """
        return_message = "\"" + word_surface + "\""

        # Guard clause: unknown surfaces get the "not found" message right away.
        if word_surface not in self.getSurfaceDict():
            return return_message + "に相当する日本語訳が見つかりませんでした."

        is_first = True
        for word in self.words_list:
            if word_surface != word.getSurface():
                continue

            meanings = word.getMeaning()

            # Only the lead-in phrase differs between the first matching entry
            # and the later ones; everything after it is shared.
            if is_first:
                return_message += "の日本語訳として「"
                is_first = False
            else:
                return_message += "また,別の日本語訳として「"
            return_message += "".join(meanings)
            return_message += "」が存在します."

            synonym_list = Synonym.getSynonymList(meanings[0])
            if not synonym_list:
                # random.choice raises IndexError on an empty sequence, so the
                # synonym sentence is skipped when there is nothing to draw.
                continue

            return_message += "さらに,その類義語として"
            for _ in range(5):
                s = random.choice(synonym_list)
                if s == meanings[0]:
                    # Do not list the meaning as a synonym of itself.
                    continue
                return_message += "「" + s + "」"
            return_message += "が考えられます."

        return return_message
示例#4
0
def main(dirname):
    """Score every comment found in the JSON files under ``dirname/data``.

    Each comment's text is segmented with ``jieba.posseg.cut``; the comment
    score is the product of the per-word scores, starting from 1. A word's
    score comes from the dictionary built from ``dirname/dict``; if the word
    is missing there, a synonym lookup is tried. Stop words (one per line in
    ``dirname/dict/stopword.txt``) and words with no score at all are skipped.

    Args:
        dirname: base directory containing the ``dict`` and ``data`` folders.

    Returns:
        A list with one score per comment, in processing order.
    """
    wordList = bulitDict(dirname + '/dict')

    synonymDict = Synonym.Synonym(dirname + "/dict/words.vector")

    # Strip only the line terminator so each entry can equal a segmented word;
    # a set keeps the per-word membership test cheap.
    with open(dirname + "/dict/stopword.txt", "r") as f:
        stopWord = set(line.rstrip("\r\n") for line in f)

    value = []

    data_dir = dirname + '/data'
    for filename in os.listdir(data_dir):
        # json.load expects an open file object (not a path), and the file
        # name has to be joined to the directory with a separator.
        with open(os.path.join(data_dir, filename), "r") as data_file:
            data = json.load(data_file)

        for cmt in data:
            cmt_seg = jieba.posseg.cut(cmt['text'])

            # Debug trace kept from the original implementation.
            print(cmt_seg)

            cmt_value = 1

            for seg in cmt_seg:
                # Compare the word text, not the segment object, with the
                # stop-word entries.
                if seg.word in stopWord:
                    continue
                try:
                    score = wordList[seg.word]
                except LookupError:
                    print(
                        "Word " + seg.word +
                        " is not in the nornaml dict. Searching in the synonym..."
                    )
                    try:
                        syno = synonymDict.get_synonym(seg.word)
                        # NOTE(review): syno[syno[1]] indexes the result by its
                        # own second element; this may have been meant to be
                        # syno[1] - confirm against Synonym.get_synonym.
                        score = wordList[syno[0]] * syno[syno[1]]
                    except Exception:
                        # Best effort: report the word and leave it out of the
                        # product instead of reusing a stale or unbound score.
                        print("Word " + seg.word + " POS " + seg.flag)
                        continue
                cmt_value *= score

            value.append(cmt_value)

    return value
示例#5
0
    def getSynonymSurface(self, word_meaning):
        """Build a Japanese message listing surfaces found via synonyms of a meaning.

        The message always opens with a sentence saying that no direct
        translation of ``word_meaning`` was found. For every synonym returned
        by ``Synonym.getSynonymList`` that is a key of ``self.getMeaningDict()``,
        the surfaces of all entries in ``self.words_list`` whose meanings
        contain that synonym are appended. If no synonym qualifies, a closing
        "nothing found for the synonyms either" sentence is appended.

        Args:
            word_meaning: meaning to look up synonyms for.

        Returns:
            The assembled message string.
        """
        message = "「" + word_meaning + "」に相当するセラフェノ訳が見つかりませんでした."
        matched_any = False

        for candidate in Synonym.getSynonymList(word_meaning):
            if candidate not in self.getMeaningDict().keys():
                continue

            message += "「" + word_meaning + "」の類義語「" + candidate + "」のセラフェノ訳として"
            for entry in self.words_list:
                if candidate in entry.getMeaning():
                    message += "\"" + entry.getSurface() + "\"が存在します."
            matched_any = True

        if not matched_any:
            message += "類義語に対するセラフェノ訳が見つかりませんでした."

        return message
def similarity_tags(tag1, tag2):
	"""Return the combined distance between two tags.

	The graph distance is scaled by two factors:

	* a Levenshtein factor ``1 - RATIO_LEVENSHTEIN + RATIO_LEVENSHTEIN * nlev``,
	  where ``nlev`` is the edit distance divided by the longer tag's length
	  (smaller is better);
	* a synonym factor, SYNONYM_YES_RATIO when ``Synonym.check_synonym``
	  returns 1 and SYNONYM_NO_RATIO otherwise.

	A Levenshtein factor of exactly 0.0 is left out of the product. The factor
	is also written to ``outputf`` as a trace line.
	"""
	path_cost = graph_distance(tag1, tag2)

	normalized_lev = levenshtein(tag1, tag2)/(1.0*max(len(tag1), len(tag2)))
	lev_factor = 1- RATIO_LEVENSHTEIN + RATIO_LEVENSHTEIN*normalized_lev
	# History note (10,10,2013): plain multiplication performed badly because
	# the graph distance and the Levenshtein term are on different scales.
	print >> outputf, "v2 %3.8f" %(lev_factor), "| tag:", tag1, "| tagto: ", tag2

	# Pick the synonym factor.
	synonym_factor = SYNONYM_YES_RATIO if (Synonym.check_synonym (tag1, tag2) == 1) else SYNONYM_NO_RATIO

	if (lev_factor == 0.0):
		return path_cost*synonym_factor
	return path_cost*lev_factor*synonym_factor
# build graph
# graph[tag] maps each neighbouring tag to the edge weight
#   distance = 1/(frequency * normalised levenshtein * synonym ratio)
# where frequency is myTagPairs[pair]. A pair whose normalised Levenshtein
# distance is 0.0 gets weight 0.0 instead.
graph = {}
for tag_item in myTagSet:
	graph[tag_item] = {}

# The weight of a pair does not depend on which endpoint is being filled in,
# so it is computed once per pair rather than once per (tag, pair) combination.
for pair_item in myTagPairs:
	tag_pair = pair_item.split(',')
	tag1 = tag_pair[0]
	tag2 = tag_pair[1]

	# consider levenshtein, normalised by the longer tag
	lev2 = levenshtein(tag1, tag2)/(1.0*max(len(tag1), len(tag2)))

	# consider synonym
	if (Synonym.check_synonym (tag1, tag2) == 1):
		is_synonym = SYNONYM_YES_RATIO
	else:
		is_synonym = SYNONYM_NO_RATIO

	if (lev2 == 0.0):
		likelihoodOfPair = 0.0
	else:
		likelihoodOfPair = 1.0/((1.0)*myTagPairs[pair_item]*lev2*is_synonym)

	# Save the other tag as an edge, but only for endpoints that belong to
	# myTagSet (i.e. that already have an adjacency map).
	if tag1 in graph:
		graph[tag1][tag2] = likelihoodOfPair
	if tag2 in graph:
		graph[tag2][tag1] = likelihoodOfPair
示例#8
0
 def test_sentence_from_assignment(self):
     """Check build_semantic_descriptors against the expected table for a five-sentence sample.

     The expected value maps every word to a dict of the other words it
     shares a sentence with and a count for each of them.
     """
     # Input: five already-tokenised, lower-case sentences.
     test_sentence_list = [['i', 'am', 'a', 'sick', 'man'], ['i', 'am', 'a', 'spiteful', 'man'], ['i', 'am', 'an', 'unattractive', 'man'], ['i', 'believe', 'my', 'liver', 'is', 'diseased'], ['however', 'i', 'know', 'nothing', 'at', 'all', 'about', 'my', 'disease', 'and', 'do', 'not', 'know', 'for', 'certain', 'what', 'ails', 'me']]
     results = a6.build_semantic_descriptors(test_sentence_list)
     # Expected descriptors. Note that 'know' appears twice in the last
     # sentence and every count involving 'know' below is 2.
     answer = {'sick': {'man': 1, 'a': 1, 'am': 1, 'i': 1}, 'am': {'sick': 1, 'a': 2, 'an': 1, 'i': 3, 'man': 3, 'unattractive': 1, 'spiteful': 1}, 'know': {'what': 2, 'about': 2, 'disease': 2, 'my': 2, 'me': 2, 'certain': 2, 'and': 2, 'for': 2, 'do': 2, 'i': 2, 'at': 2, 'however': 2, 'all': 2, 'ails': 2, 'not': 2, 'nothing': 2}, 'disease': {'know': 2, 'ails': 1, 'about': 1, 'my': 1, 'me': 1, 'certain': 1, 'and': 1, 'for': 1, 'do': 1, 'i': 1, 'at': 1, 'however': 1, 'all': 1, 'what': 1, 'not': 1, 'nothing': 1}, 'liver': {'believe': 1, 'is': 1, 'diseased': 1, 'i': 1, 'my': 1}, 'spiteful': {'man': 1, 'a': 1, 'am': 1, 'i': 1}, 'my': {'certain': 1, 'liver': 1, 'know': 2, 'ails': 1, 'and': 1, 'disease': 1, 'i': 2, 'not': 1, 'me': 1, 'believe': 1, 'nothing': 1, 'for': 1, 'do': 1, 'diseased': 1, 'at': 1, 'however': 1, 'all': 1, 'is': 1, 'about': 1, 'what': 1}, 'man': {'sick': 1, 'a': 2, 'am': 3, 'i': 3, 'unattractive': 1, 'an': 1, 'spiteful': 1}, 'and': {'know': 2, 'ails': 1, 'about': 1, 'my': 1, 'disease': 1, 'me': 1, 'certain': 1, 'nothing': 1, 'for': 1, 'do': 1, 'i': 1, 'at': 1, 'however': 1, 'all': 1, 'what': 1, 'not': 1}, 'an': {'man': 1, 'unattractive': 1, 'am': 1, 'i': 1}, 'do': {'know': 2, 'ails': 1, 'about': 1, 'my': 1, 'disease': 1, 'me': 1, 'certain': 1, 'and': 1, 'for': 1, 'i': 1, 'at': 1, 'however': 1, 'all': 1, 'what': 1, 'not': 1, 'nothing': 1}, 'ails': {'know': 2, 'disease': 1, 'not': 1, 'my': 1, 'me': 1, 'certain': 1, 'and': 1, 'for': 1, 'do': 1, 'i': 1, 'at': 1, 'however': 1, 'all': 1, 'what': 1, 'about': 1, 'nothing': 1}, 'however': {'know': 2, 'ails': 1, 'about': 1, 'disease': 1, 'my': 1, 'me': 1, 'certain': 1, 'and': 1, 'for': 1, 'do': 1, 'i': 1, 'at': 1, 'all': 1, 'what': 1, 'not': 1, 'nothing': 1}, 'all': {'know': 2, 'ails': 1, 'about': 1, 'my': 1, 'disease': 1, 'me': 1, 'certain': 1, 'and': 1, 'for': 1, 'do': 1, 'i': 1, 'at': 1, 'however': 1, 'nothing': 1, 'what': 1, 'not': 1}, 'is': {'believe': 1, 'diseased': 1, 'liver': 1, 'i': 1, 'my': 1}, 'not': 
{'know': 2, 'ails': 1, 'disease': 1, 'my': 1, 'me': 1, 'certain': 1, 'and': 1, 'for': 1, 'do': 1, 'i': 1, 'at': 1, 'however': 1, 'all': 1, 'what': 1, 'about': 1, 'nothing': 1}, 'nothing': {'know': 2, 'ails': 1, 'about': 1, 'my': 1, 'disease': 1, 'me': 1, 'certain': 1, 'and': 1, 'for': 1, 'do': 1, 'i': 1, 'at': 1, 'however': 1, 'all': 1, 'what': 1, 'not': 1}, 'believe': {'diseased': 1, 'is': 1, 'liver': 1, 'i': 1, 'my': 1}, 'what': {'know': 2, 'disease': 1, 'not': 1, 'my': 1, 'me': 1, 'certain': 1, 'and': 1, 'for': 1, 'do': 1, 'i': 1, 'at': 1, 'however': 1, 'all': 1, 'ails': 1, 'about': 1, 'nothing': 1}, 'a': {'sick': 1, 'man': 2, 'am': 2, 'i': 2, 'spiteful': 1}, 'about': {'know': 2, 'ails': 1, 'disease': 1, 'my': 1, 'me': 1, 'certain': 1, 'and': 1, 'for': 1, 'do': 1, 'i': 1, 'at': 1, 'however': 1, 'all': 1, 'what': 1, 'not': 1, 'nothing': 1}, 'me': {'know': 2, 'ails': 1, 'disease': 1, 'not': 1, 'my': 1, 'certain': 1, 'and': 1, 'for': 1, 'do': 1, 'i': 1, 'at': 1, 'however': 1, 'all': 1, 'what': 1, 'about': 1, 'nothing': 1}, 'for': {'know': 2, 'ails': 1, 'disease': 1, 'not': 1, 'my': 1, 'me': 1, 'certain': 1, 'and': 1, 'do': 1, 'i': 1, 'at': 1, 'however': 1, 'all': 1, 'what': 1, 'about': 1, 'nothing': 1}, 'diseased': {'believe': 1, 'is': 1, 'liver': 1, 'i': 1, 'my': 1}, 'at': {'know': 2, 'ails': 1, 'about': 1, 'my': 1, 'disease': 1, 'me': 1, 'certain': 1, 'and': 1, 'for': 1, 'do': 1, 'i': 1, 'however': 1, 'all': 1, 'what': 1, 'not': 1, 'nothing': 1}, 'certain': {'know': 2, 'ails': 1, 'disease': 1, 'not': 1, 'my': 1, 'me': 1, 'and': 1, 'for': 1, 'do': 1, 'i': 1, 'at': 1, 'however': 1, 'all': 1, 'what': 1, 'about': 1, 'nothing': 1}, 'i': {'sick': 1, 'am': 3, 'know': 2, 'disease': 1, 'liver': 1, 'spiteful': 1, 'my': 2, 'believe': 1, 'and': 1, 'an': 1, 'do': 1, 'ails': 1, 'however': 1, 'all': 1, 'is': 1, 'not': 1, 'nothing': 1, 'man': 3, 'what': 1, 'a': 2, 'me': 1, 'for': 1, 'diseased': 1, 'at': 1, 'certain': 1, 'unattractive': 1, 'about': 1}, 'unattractive': {'man': 1, 
'am': 1, 'i': 1, 'an': 1}}
     self.assertEqual(answer, results)
示例#9
0
 def test_sentence_simple(self):
     """Check build_semantic_descriptors on one two-word sentence.

     Each word is expected to list the other one with a count of 1.
     """
     sentences = [['two', 'words']]
     expected = {
         'two': {'words': 1},
         'words': {'two': 1},
     }

     actual = a6.build_semantic_descriptors(sentences)

     self.assertEqual(expected, actual)
示例#10
0
 def test_sentence_removes_all_punctuations(self):
     """Check that get_sentence_lists drops assorted punctuation inside sentences.

     The input mixes brackets, braces, underscore, semicolons, colon, percent
     and asterisk; only the two full stops are expected to split sentences.
     """
     text = "Hello( there, this{ }is_ a; test.  Hope; it: works% for* you."
     expected = [
         ['hello', 'there', 'this', 'is', 'a', 'test'],
         ["hope", "it", "works", "for", "you"],
     ]

     actual = a6.get_sentence_lists(text)

     self.assertEqual(expected, actual)
示例#11
0
 def test_NormReturnsANormalizedVectorSOS(self):
     """Check that a6.norm of {a: 5, b: 8, c: 9} is about 13.0384 (3 decimal places)."""
     vector = {"a":5, "b":8, "c":9}

     computed = a6.norm(vector)

     self.assertAlmostEqual(13.03840481, computed, 3)
示例#12
0
 def test_sentence_has_line_feeds(self):
     """Check that a line feed inside a sentence separates words without ending the sentence."""
     text = "Hello there\nthis is a test."
     expected = [['hello', 'there', 'this', 'is', 'a', 'test']]

     actual = a6.get_sentence_lists(text)

     self.assertEqual(expected, actual)
示例#13
0
    def test_sentence_double_spaces(self):
        """Check that runs of several spaces do not produce empty words."""
        text = "Hello there?   this    is    a     test."
        expected = [
            ['hello', 'there'],
            ['this', 'is', 'a', 'test'],
        ]

        actual = a6.get_sentence_lists(text)

        self.assertEqual(expected, actual)
示例#14
0
    def test_sentence(self):
        """Check that text without a terminator comes back as a single sentence."""
        text = "a quick test"
        expected = [["a", "quick", "test"]]

        actual = a6.get_sentence_lists(text)

        self.assertEqual(expected, actual)
示例#15
0
 def test_cosine_similarity(self):
     """Check a6.cosine_similarity on two sparse vectors that share only the key 'a'.

     The expected value is about 0.0930, compared to 3 decimal places.
     """
     first = {"a":2, "d":8}
     second = {"a":5, "b":8, "c":9}

     similarity = a6.cosine_similarity(first, second)

     self.assertAlmostEqual(0.0930081664755, similarity, 3)