예제 #1
0
def fuzzyMatching(control, full_names):
    """
    :Summary: compare the full names in the full_names dictionary to the
              control full name, gather percentage similarity scores and
              return their average.
    :control: A control full name string against which other full names in
              full_names dict are compared.
    :full_names: A dictionary containing a user's full names as specified on
              the necessary platforms.
    :return: average percentage similarity, or zero if any of the usernames
             doesn't exist or no comparable names were found.
    """
    scores = []
    for name in full_names.values():
        if name == username_does_not_exist:
            # Any missing account invalidates the whole comparison.
            return 0
        if name != no_username_provided:
            scores.append(fuzz.WRatio(control, name, score_cutoff=60))

    # BUG FIX: the original divided by len(_list) unconditionally and raised
    # ZeroDivisionError for an empty dict or when every entry was
    # no_username_provided.
    if not scores:
        return 0
    return sum(scores) / len(scores)
예제 #2
0
def keyword_in_search(search_item, keywords=()):
    """Return True if any keyword appears in the item's link, title or snippet.

    Each field is checked twice: first with a plain case-insensitive
    substring test, then with a fuzzy match (WRatio, score cutoff 90).
    """
    fields = (
        search_item['link'],
        search_item['title'],
        search_item['snippet'],
    )

    for text in fields:
        lowered = text.lower()

        # normal search: case-insensitive substring containment
        if any(kw.lower() in lowered for kw in keywords):
            return True

        # fuzzy search: WRatio above the cutoff yields a truthy score
        if any(fuzz.WRatio(kw, text, score_cutoff=90) for kw in keywords):
            return True

    return False
def keyword_in_search(search_item, keywords=(), must_contain_all=False):
    """Return True if `keywords` are present in `search_item`.

    :param search_item: mapping with 'link', 'title' and 'snippet' keys.
    :param keywords: keywords to look for (plain substring first, then a
        fuzzy WRatio match with score cutoff 90).
    :param must_contain_all: if True, every keyword must match within a
        single field; otherwise one matching keyword is enough.
    :return: True on the first field that satisfies the check, else False.
    """
    # BUG FIX: without this guard, all() over an empty generator is vacuously
    # True, so must_contain_all=True with no keywords matched every item.
    if not keywords:
        return False

    link = search_item['link']
    title = search_item['title']
    snippet = search_item['snippet']
    combine = all if must_contain_all else any

    for information in (link, title, snippet):

        # normal search
        if combine(kw.lower() in information.lower() for kw in keywords):
            return True

        # fuzzy search
        if combine(fuzz.WRatio(kw, information, score_cutoff=90)
                   for kw in keywords):
            return True

    return False
예제 #4
0
 def testWRatioUnicodeString(self):
     """A lone accented character vs an unrelated ASCII string scores 0."""
     score = fuzz.WRatio("Á", "ABCD")
     self.assertEqual(0, score)
예제 #5
0
 def testQRatioUnicode(self):
     """Unicode-equivalent fixture strings score a perfect 100."""
     # NOTE(review): the name says QRatio but the body calls WRatio —
     # confirm which scorer this test is meant to cover.
     score = fuzz.WRatio(self.s1, self.s1a)
     self.assertEqual(score, 100)
예제 #6
0
 def testWRatioMisorderedMatch(self):
     """Full matches in a different word order are scaled by .95 -> 95."""
     score = fuzz.WRatio(self.s4, self.s5)
     self.assertEqual(score, 95)
예제 #7
0
 def testWRatioPartialMatch(self):
     """A partial match is scaled by .9 -> 90."""
     score = fuzz.WRatio(self.s1, self.s3)
     self.assertEqual(score, 90)
예제 #8
0
 def testWRatioCaseInsensitive(self):
     """Strings differing only in case still score a perfect 100."""
     score = fuzz.WRatio(self.s1, self.s2)
     self.assertEqual(score, 100)
예제 #9
0
 def testWRatioEqual(self):
     """Equivalent fixture strings score a perfect 100."""
     score = fuzz.WRatio(self.s1, self.s1a)
     self.assertEqual(score, 100)
예제 #10
0
    def search(query_str,
               n=3,
               fuzzy_weight=default_fuzzy_weight,
               text_weight=default_text_weight,
               image_weight=default_image_weight):
        """Return the indices of the top-`n` corpus entries for `query_str`.

        Combines embedding similarity (text only in the BERT path; text and
        image embeddings otherwise) with an optional fuzzy string match on
        `names`, each contribution scaled by its *_weight argument.

        :param query_str: free-text query string.
        :param n: number of result indices to return.
        :param fuzzy_weight: weight of the fuzzy name match; 0 disables it.
        :param text_weight: weight of the text-embedding similarity.
        :param image_weight: weight of the image-embedding similarity
                             (non-BERT path only).
        :return: list of up to `n` corpus indices, best score first.
        """
        if use_bert:
            query_embedding = model.encode(query_str)
        else:
            query_embedding = embed_text(query_str)

        if use_bert:
            scores, indices = query(query_embedding,
                                    normalized_sentence_embeddings, n)
            results1 = Counter({i: text_weight * scores[i] for i in indices})

            if fuzzy_weight <= 0:
                return [key for key, value in results1.most_common(n)]

            # process.extract returns (match, score, key) tuples.
            matches = process.extract(query_str, names,
                                      scorer=fuzz.WRatio, limit=n)
            results2 = Counter(
                {m[2]: fuzzy_weight * m[1] / 100 for m in matches})
            # Cross-fill so both counters cover the union of candidate
            # indices before they are summed.
            for key in list(results1):
                results2[key] = fuzzy_weight * fuzz.WRatio(
                    query_str, names[key]) / 100
            # NOTE(review): indexing `scores` with arbitrary fuzzy-match keys
            # assumes query() returns scores addressable by corpus index —
            # confirm a key outside `indices` cannot misindex here.
            for key in list(results2):
                results1[key] = text_weight * scores[key]

            results = results1 + results2
            return [key for key, value in results.most_common(n)]

        scores_text, indices_text = query(query_embedding,
                                          normalized_sentence_embeddings,
                                          n)
        scores_images, indices_images = query(query_embedding,
                                              normalized_image_embeddings,
                                              n)
        results_text = Counter(
            {i: text_weight * scores_text[i] for i in indices_text})
        results_images = Counter(
            {i: image_weight * scores_images[i] for i in indices_images})

        if fuzzy_weight <= 0:
            # BUG FIX: this branch previously returned `results1`, which is
            # only defined in the BERT path, and raised NameError here.
            results = results_text + results_images
            return [key for key, value in results.most_common(n)]

        matches = process.extract(query_str, names,
                                  scorer=fuzz.WRatio, limit=n)
        results2 = Counter(
            {m[2]: fuzzy_weight * m[1] / 100 for m in matches})
        # Give every embedding hit a fuzzy score as well.
        for key in list(results_text) + list(results_images):
            results2[key] = fuzzy_weight * fuzz.WRatio(
                query_str, names[key]) / 100
        # ...and every fuzzy hit an embedding score (same indexing caveat
        # as in the BERT branch above).
        for key in list(results2):
            results_text[key] = text_weight * scores_text[key]
            results_images[key] = image_weight * scores_images[key]

        results = results_text + results_images + results2
        return [key for key, value in results.most_common(n)]