示例#1
0
def similar(sisaan, maxl, countc):
    """Build a ranked list of result rows for the items similar to ``sisaan``.

    Inside the ``update_after()`` context the model is queried with
    ``cosine=True``, ``max_l=maxl`` and ``count_c=countc``; each returned
    item becomes a dict with the keys ``frequency``, ``rank``,
    ``similarity`` and ``text``.  The finished list is printed and returned.

    Args:
        sisaan: Query text handed to ``model.similar_by_text`` and
            ``model.try_search``.
        maxl: Forwarded as ``max_l`` to ``model.similar_by_text``.
        countc: Forwarded as ``count_c`` to ``model.similar_by_text``.

    Returns:
        A list of dicts, one per similar item, in the order the model
        returned them.
    """
    with update_after():
        matches = model.similar_by_text(
            sisaan, cosine=True, max_l=maxl, count_c=countc)
        rows = []
        for rank, match in enumerate(matches, start=1):
            # Values are evaluated in the same order as the row's keys.
            frequency = match.this_many
            rank_text = str(rank)
            query_item = model.try_search(sisaan, use_wiki=True)
            score_text = str(text_tools.similarity(query_item, match))
            shown_text = text_tools.process_for_display(match.lower_tokens)

            rows.append({
                'frequency': frequency,
                'rank': rank_text,
                'similarity': score_text,
                'text': shown_text,
            })
            print('company enumerated!')
            print(match.raw_text)
            print(match.display_text)
        print(rows)

        return rows

    # NOTE(review): the ``with`` block above always returns, so everything
    # below is reached only if ``update_after()`` suppresses an exception
    # raised inside the block -- confirm whether this is an intended
    # fallback or leftover dead code.
    matches = model.similar_by_text(sisaan, cosine=True, top_n=3)
    print(sisaan)
    rows = []
    for rank, match in enumerate(matches, start=1):
        score = text_tools.similarity(model.try_search(sisaan), match)
        # Rows without a positive score keep their index/rank/distance but
        # get empty 'text' and 'frequency' fields.
        has_match = bool(score > 0)
        rows.append({
            'index': str(model.get_index(match)),
            'rank': str(rank),
            'distance': str(score),
            'text': (text_tools.process_for_display(match.lower_tokens)
                     if has_match else ''),
            'frequency': match.this_many if has_match else '',
        })
        print('company enumerated!')
    print(rows)

    return rows
示例#2
0
文件: category.py 项目: tituka/ml_s
def category_and_score(s_string: str):
    """Return the best-scoring category for ``s_string`` and its score.

    The text is resolved with ``model.try_search(..., use_wiki=True)`` and
    compared against every entry of ``categories`` using
    ``text_tools.similarity(..., cosine=True)``.

    Args:
        s_string: Text to categorise.

    Returns:
        A ``(lower_tokens, score)`` tuple: the ``lower_tokens`` attribute of
        the category with the highest score, and that score.

    Raises:
        ValueError: If ``categories`` is empty (raised by ``max``).
    """
    s = model.try_search(s_string, use_wiki=True)
    comp_dict = {
        cat: text_tools.similarity(cat, s, cosine=True) for cat in categories
    }
    # One scan yields both the winning category and its score; the score in
    # the (key, value) pair is the same object as comp_dict[best_cat].
    best_cat, best_score = max(comp_dict.items(), key=operator.itemgetter(1))
    return best_cat.lower_tokens, best_score
示例#3
0
def category_and_score(s_string: str):
    """Record the best-scoring category for ``s_string`` in ``full``.

    Every entry of ``categories`` is scored against the resolved search
    item with ``text_tools.similarity(..., cosine=True)``.  The category
    with the highest score is stored in ``full`` under the display form of
    the search item's ``lower_tokens``.  Nothing is returned.

    Args:
        s_string: Text to categorise.
    """
    # NOTE(review): the sibling definition of this function calls
    # ``model.try_search``; confirm ``model_try_search`` is a real helper
    # and not a typo.
    s = model_try_search(s_string, use_wiki=True)

    scores = {}
    for category in categories:
        scores[category] = text_tools.similarity(category, s, cosine=True)

    display_key = text_tools.process_for_display(s.lower_tokens)
    best_category, _ = max(scores.items(), key=operator.itemgetter(1))
    full.update({display_key: best_category})
示例#4
0
 def test_similar_by_search_item(self):
     """Check that ``similar_by_SearchItem`` returns results in ranked order.

     With ``cosine=True`` the first result must score higher than the
     fourth; for the default call the first result must score lower than
     the fourth when compared with ``cosine=False``.
     """
     # random.randint includes both endpoints, so the upper bound has to be
     # len - 1 (assuming random_search_item indexes model.item_list -- TODO
     # confirm).  The name ``index`` also avoids shadowing the builtin int.
     index = random.randint(0, len(model.item_list) - 1)
     test_search_item1 = random_search_item(index)
     res1 = model.similar_by_SearchItem(test_search_item1, cosine=True, top_n=10)
     res2 = model.similar_by_SearchItem(test_search_item1, top_n=10)
     print(res1)
     print(res2)

     # Compute each score once and reuse it for both the printout and the
     # assertion.
     cosine_first = similarity(test_search_item1, res1[0])
     cosine_fourth = similarity(test_search_item1, res1[3])
     plain_first = similarity(test_search_item1, res2[0], cosine=False)
     plain_fourth = similarity(test_search_item1, res2[3], cosine=False)
     print(cosine_first)
     print(cosine_fourth)
     print(plain_first)
     print(plain_fourth)

     self.assertGreater(cosine_first, cosine_fourth)
     self.assertLess(plain_first, plain_fourth)
示例#5
0
 def test_similar_by_text(self):
     """Check ``similar_by_text`` on two distinct items with non-zero vectors.

     Two search items are picked so that neither has an all-zero
     ``sum_vec`` and the two vectors differ.  The test then asserts that
     their result lists differ and that each list is ordered by rank.
     """
     # random.randint includes both endpoints, so the upper bound has to be
     # len - 1 (assuming random_search_item indexes model.item_list -- TODO
     # confirm).  The name ``index`` also avoids shadowing the builtin int.
     index = random.randint(0, len(model.item_list) - 1)
     zero_vec = np.zeros(model.word_vectors.vector_size)

     item_1 = random_search_item(index)
     counter = 1
     # Compare against the zero vector element-wise.  Comparing the two
     # ``.all()`` results only compares booleans and is true whenever any
     # single component happens to be zero.
     while np.array_equal(item_1.sum_vec, zero_vec):
         item_1 = random_search_item(counter)
         counter += 1

     item_2 = random_search_item(counter)
     while (np.array_equal(item_1.sum_vec, item_2.sum_vec)
            or np.array_equal(item_2.sum_vec, zero_vec)):
         counter += 1
         item_2 = random_search_item(counter)

     res1 = model.similar_by_text(item_1.display_text, cosine=True, top_n=5)
     res2 = model.similar_by_text(item_2.display_text, cosine=False, top_n=5)
     res3 = model.similar_by_text(item_2.display_text, cosine=True, top_n=5, print_list=True)

     # Two different search items must not return the same similar searches.
     self.assertNotEqual(res3, res1)

     # Results ranked higher must be more similar: a larger score with the
     # default similarity, a smaller value with cosine=False.
     self.assertGreater(similarity(item_1, res1[0]), similarity(item_1, res1[3]))
     self.assertLess(similarity(item_2, res2[0], cosine=False),
                     similarity(item_2, res2[3], cosine=False))
示例#6
0
文件: category.py 项目: tituka/ml_s
def category_from_summary(s_string: str):
    """Return the best-scoring category for the Wikipedia summary of a text.

    The Wikipedia page for ``s_string`` is looked up and its summary is
    used as the text to categorise.  If the lookup fails for any reason,
    ``s_string`` itself is used instead.  The chosen text is resolved with
    ``model.try_search(..., use_wiki=False)`` and scored against every
    entry of ``categories`` with ``text_tools.similarity(..., cosine=True)``.

    Args:
        s_string: Page title to look up, and the fallback text.

    Returns:
        A ``(lower_tokens, score)`` tuple: the ``lower_tokens`` attribute of
        the category with the highest score, and that score.

    Raises:
        ValueError: If ``categories`` is empty (raised by ``max``).
    """
    try:
        summary = wikipedia.page(s_string).summary
    except Exception:
        # Best-effort lookup: any failure falls back to the raw text.
        # ``Exception`` (rather than a bare ``except``) lets
        # KeyboardInterrupt and SystemExit propagate.
        summary = s_string

    s = model.try_search(summary, display=summary, use_wiki=False)
    comp_dict = {
        cat: text_tools.similarity(cat, s, cosine=True) for cat in categories
    }
    # One scan yields both the winning category and its score.
    best_cat, best_score = max(comp_dict.items(), key=operator.itemgetter(1))
    return best_cat.lower_tokens, best_score
示例#7
0
    def test_move_out(self):
        """Check the effects of ``model.move_out`` on the third similar item.

        After the move, the score between the anchor item and the moved
        item must be lower than the score computed against the vector
        captured beforehand.  The moved item must no longer be at position
        2 of a fresh similarity query, must be flagged ``modified``, and
        ``reset()`` must clear that flag.
        """
        index = random.randint(0, len(model.item_list) - 1)
        anchor = random_search_item(index)

        neighbours = model.similar_by_SearchItem(anchor)
        moved = neighbours[2]
        # NOTE(review): this keeps a reference, not a copy -- if move_out
        # mutates sum_vec in place the "before" vector changes too; confirm.
        vec_before = moved.sum_vec

        model.move_out(anchor, moved, neighbours, cosine=False)

        score_after = similarity(anchor, moved)
        score_before = similarity_vec(anchor.sum_vec, vec_before)
        self.assertTrue(score_after < score_before)

        neighbours = model.similar_by_SearchItem(anchor)
        self.assertNotEqual(moved, neighbours[2])

        self.assertEqual(moved.modified, True)
        moved.reset()
        self.assertEqual(moved.modified, False)
示例#8
0
 def test_similar_by_vector(self):
     """Check that ``similar_by_vector`` results are ordered by rank.

     Queries the model with a random item's ``sum_vec`` using
     ``cosine=False`` and asserts that the first result compares lower
     than the fourth under ``similarity(..., cosine=False)``.
     """
     index = random.randint(0, len(model.item_list) - 1)
     probe = random_search_item(index)
     neighbours = model.similar_by_vector(probe.sum_vec, cosine=False)

     first_value = similarity(probe, neighbours[0], cosine=False)
     fourth_value = similarity(probe, neighbours[3], cosine=False)
     self.assertTrue(first_value < fourth_value)
示例#9
0
    def similar_two_text(self, text_1, text_2, cosine=True):
        """Return the similarity score or distance between two texts.

        Each text is wrapped in a ``Company.Company`` built on this object,
        and the pair is passed to ``text_tools.similarity``.

        Args:
            text_1: First text to compare.
            text_2: Second text to compare.
            cosine: Forwarded as the third positional argument of
                ``text_tools.similarity``.

        Returns:
            Whatever ``text_tools.similarity`` returns for the two items.
        """
        # NOTE(review): both items carry the same label; confirm whether the
        # second was meant to be distinct.
        label = 'Example search_item 1'
        first_item = Company.Company(self, label, text_1)
        second_item = Company.Company(self, label, text_2)
        return text_tools.similarity(first_item, second_item, cosine)