def best_matches(bigkeys, menu, vector_dict, threshold=90):
    """Match 2-gram and 3-gram keyword vectors to menu items.

    Args:
        bigkeys (list): a list of 2-grams and 3-grams
        menu (dict): restaurant menus; each value maps meal -> iterable of foods
        vector_dict (dict): keyword -> vector mapping
        threshold (int): fuzzy-match score required for keyword matching

    Returns:
        dict: menu item -> {(keyword, score): vector} for items of 2+ words
    """
    best_matches = {}
    # BUG FIX: dict.iteritems() is Python 2 only; the rest of this file uses
    # Python 3 features (f-strings), so use .items().
    for name, meals in menu.items():
        for meal, course in meals.items():
            for food in course:
                item = food.replace('-', ' ')
                # The original had two branches (2-3 words, and 4+ words)
                # whose bodies were byte-identical and whose `descriptor`
                # variable was never used; together they cover "> 1 word".
                if len(item.split()) > 1:
                    matches = process.extractBests(
                        item, bigkeys, scorer=fuzz.QRatio,
                        score_cutoff=threshold, limit=3)
                    matches += process.extractBests(
                        item, bigkeys, scorer=fuzz.UWRatio,
                        score_cutoff=threshold, limit=3)
                    vectors = [vector_dict[match[0]] for match in matches]
                    best_matches[item] = dict(zip(matches, vectors))
    return best_matches
예제 #2
0
파일: views.py 프로젝트: nep-alx/library
    def search(self, data):
        """Fuzzy-search books by title, isbn and/or author last name.

        Each query field present in `data` narrows the result set; a book
        must fuzzily match every supplied field to be returned.
        """
        books = Book.objects.all()
        matching_ids = {book.id for book in books}

        # Narrow the id set by each simple attribute query present in `data`.
        for field in ['title', 'isbn']:
            if field not in data:
                continue
            values = [book.__dict__[field] for book in books]
            close = process.extractBests(data[field], values, score_cutoff=60)
            close_values = {value for value, _ in close}
            matching_ids &= {
                book.id for book in books
                if book.__dict__[field] in close_values
            }

        # Author is matched on the related object's last name.
        if 'author' in data:
            values = [book.author.lname for book in books]
            close = process.extractBests(data['author'], values, score_cutoff=60)
            close_values = {value for value, _ in close}
            matching_ids &= {
                book.id for book in books
                if book.author.lname in close_values
            }

        return [{
            'id': book.id,
            'title': book.title,
            'author': f'{book.author.fname} {book.author.lname}',
            'isbn': book.isbn
        } for book in books if book.id in matching_ids]
    def common_process(self, proddesc_list, Item_Description,
                       shouldConsiderOnlyTopScore, matchingThreshold, cursor,
                       listType):
        """Fuzzy-match Item_Description against proddesc_list.

        Args:
            proddesc_list: candidate product descriptions.
            Item_Description: the description to look up.
            shouldConsiderOnlyTopScore: if True return only the best match,
                otherwise up to 5 matches joined by '#'.
            matchingThreshold: minimum token_sort_ratio score to accept.
            cursor, listType: passed through to getmatchedrecord.

        Returns:
            str: the matched record(s), or the empty-record sentinel when
            there is nothing to match or no match clears the threshold.
        """
        empty_record = '|||||||||||||||||||||'
        if len(proddesc_list) == 0:
            return empty_record

        if shouldConsiderOnlyTopScore:
            bestMatch = process.extractBests(Item_Description,
                                             proddesc_list,
                                             scorer=fuzz.token_sort_ratio,
                                             limit=1,
                                             score_cutoff=matchingThreshold)
            if len(bestMatch) == 0:
                return empty_record
            return self.getmatchedrecord(bestMatch[0][0], bestMatch[0][1],
                                         cursor, listType)

        bestMatches = process.extractBests(Item_Description,
                                           proddesc_list,
                                           scorer=fuzz.token_sort_ratio,
                                           limit=5,
                                           score_cutoff=matchingThreshold)
        if len(bestMatches) == 0:
            return empty_record
        # De-duplicate (NOTE: set() discards score ordering, as before).
        bestMatches = set(bestMatches)
        rtrn_str = ''
        for match in bestMatches:
            matched_record = self.getmatchedrecord(
                match[0], match[1], cursor, listType)
            rtrn_str = rtrn_str + '#' + matched_record
        # BUG FIX: str.strip returns a new string; the original discarded
        # the result, leaving a stray leading '#'.
        return rtrn_str.strip('#')
예제 #4
0
 def match_ingredients(self, recipe):
     """
     Fuzzy-match each ingredient of a recipe against the keys of the
     sanitized ingredient:compound dict.
     :param recipe: iterable of ingredient strings to match
     :return: de-duplicated list of matched ingredient keys
     """
     matched = []
     known = self.sanitized_comp_ing_dict.keys()
     for ingredient in recipe:
         components = self.decomposer.get_component_ingredients(ingredient)
         if components:
             # Compound ingredient: recurse on its components instead.
             return self.match_ingredients(components)
         strict_hits = process.extractBests(ingredient, known,
                                            scorer=fuzz.ratio, score_cutoff=98)
         matched += [hit[0] for hit in strict_hits]
         loose_hits = process.extractBests(ingredient, known,
                                           scorer=fuzz.token_sort_ratio,
                                           score_cutoff=85)
         matched += [hit[0] for hit in loose_hits]
         if not matched:
             # Nothing found so far: fall back to matching each word.
             for word in ingredient.split():
                 word_hits = process.extractBests(word, known,
                                                  scorer=fuzz.ratio,
                                                  score_cutoff=88)
                 matched += [hit[0] for hit in word_hits]
     return list(set(matched))
예제 #5
0
 def _find_person(self, player_name, choices):
     """Fuzzy-look up a FootballPerson matching player_name among choices.

     Returns a (FootballPerson, best_score) tuple, or (None, 0.0) when no
     candidate clears the cutoff. `choices` must yield (name, score, plid)
     triples from extractBests — presumably a dict of id -> name; verify
     against callers.
     """
     print('Searching %s' % player_name)
     # Accent-fold the query before matching (unidecode).
     matching_results = process.extractBests(unidecode(player_name), choices,
                                             scorer=fuzz.partial_token_set_ratio,
                                             score_cutoff=75)
     if len(matching_results) > 0:
         # If the best matches are tied on score, search again with a
         # different scoring method.
         best_score = 0
         creme = dict()
         for name, score, plid in matching_results:
             if score >= best_score:
                 best_score = score
                 creme.update({plid: name})
             else:
                 # The list returned by extractBests is sorted, so we can
                 # stop as soon as the score drops.
                 break
         # How many candidates share the top score?
         if len(creme) == 1:
             plid, plname = creme.popitem()
             print('Found %s at first round with ratio %s' % (plname, best_score))
             matching_player = FootballPerson.objects.get(pk=plid)
             return matching_player, best_score
         else:
             print('Multiple matches found with ratio %s, refining...' % best_score)
             # Tie-break with a custom averaged scorer.
             refine_results = process.extractBests(player_name, creme,
                                                   scorer=myfuzz.partial_token_set_ratio_with_avg)
             plname, ratio, plid = refine_results[0]
             print('Found %s at second round with ratio %s then %s' % (plname, best_score, ratio))
             matching_player = FootballPerson.objects.get(pk=plid)
             return matching_player, best_score
     else:
         print("Alert : no match for %s" % player_name)
         return None, 0.0
예제 #6
0
파일: command.py 프로젝트: landmaj/qbot
def fuzzy_match(
    mistyped_command: str, score_cutoff: int = 75
) -> Optional[Tuple[str, int]]:
    """Return the closest known command (then alias) with its score, or None.

    Commands are preferred; aliases are only consulted when no command
    clears the cutoff.
    """
    for candidates in (FUZZY_COMMANDS, FUZZY_ALIASES):
        hits = process.extractBests(
            mistyped_command, candidates.keys(), score_cutoff=score_cutoff, limit=1
        )
        if hits:
            return hits[0]
    return None
예제 #7
0
    def buildEachLocatorTagPattern(self, eachLocatorArray, i, sourceDataProcessed, patternBuild):
        """Append a regex fragment for locator `i` to `patternBuild`.

        Numeric locators have each digit rewritten as the regex token \\d;
        textual locators are fuzzy-matched against the processed source
        data. Returns the extended pattern string, False when matching
        should stop (first/last locator failed, or an error occurred), or
        None when a middle locator simply found no match.
        """
        try:
            import re

            eachLocator = eachLocatorArray[i].upper()
            # Strings containing digits are excluded from fuzzy search.
            if re.search(r'[0-9]', eachLocator):
                # Convert each digit to the corresponding regex token.
                # BUG FIX: the replacement '\d' is an invalid escape for
                # re.sub on Python 3.7+ (re.error); emit a literal \d via
                # an escaped backslash.
                bestMatch = re.sub(r'\d', r'\\d', eachLocator)

                if i == 0:
                    patternBuild = patternBuild + bestMatch
                else:
                    patternBuild = patternBuild + '(.*)' + bestMatch
                return patternBuild

            # Get the fuzzy-matching candidate words.
            matchingFuzzyWord = self.getFuzzySearchData(eachLocator, sourceDataProcessed)
            # Hoisted: the original called process.extractBests twice with
            # the same arguments (once to test, once to take the result).
            candidates = process.extractBests(eachLocator, matchingFuzzyWord, limit=1)
            if candidates:
                # Confidence limit depends on string length.
                # :todo: find other parameters for improving
                confidenceLimit = 90 if len(eachLocator) < 5 else 80
                bestMatch, confidence = candidates[0]

                if len(matchingFuzzyWord) > 0 and confidenceLimit < confidence:
                    bestMatch = self.regularExpressionHandling(bestMatch, 0)
                    if i == 0:
                        patternBuild = patternBuild + bestMatch
                    else:
                        patternBuild = patternBuild + '(.*)' + bestMatch
                    return patternBuild
                if i == 0 or len(eachLocatorArray) == i + 1:
                    # First/last locator without a good match: abort the
                    # search (improvement in searching).
                    return False
            elif len(matchingFuzzyWord) == 0 and (i == 0 or len(eachLocatorArray) == i + 1):
                return False
        except Exception as e:
            # Broad by design: any failure here just aborts this pattern.
            print('error in buildEachLocatorTagPattern in StringHandling', e)
            return False
예제 #8
0
    def search(bank_query: str, branch_query: str):
        """Find the branch row (with IFSC & MICR) best matching the queries.

        :param bank_query: Bank string. Example: "icici", "hdfc bank"
        :param branch_query: Branch address string.
        :return: DF row/object of the matching branch with IFSC & MICR code

        Step 1: slice the DF to rows whose BANK contains bank_query.
        Step 2: fuzzy-search branch_query against both the BRANCH and the
            ADDRESS columns.
        Step 3: keep whichever scored higher; ties favour BRANCH.

        Order of CSV Headers: ['IFSC', 'BRANCH', 'CENTRE', 'DISTRICT', 'STATE', 'ADDRESS', 'CONTACT',
               'IMPS', 'RTGS', 'CITY', 'NEFT', 'MICR']
        """
        # PS: This will fail if the bank name provided exceeds the bank name we have
        bank_branches = df1[df1['BANK'].str.lower().str.contains(bank_query.lower())]

        print('===== ALL BRANCHES OF BANK ======'.center(10))
        print(bank_branches)

        # extractBests on a dict yields (value, score, index) triples.
        branch_matches = process.extractBests(branch_query, bank_branches.BRANCH.to_dict())
        address_matches = process.extractBests(branch_query, bank_branches.ADDRESS.to_dict())

        best_branch = branch_matches[0]
        best_address = address_matches[0]

        print(best_address, best_branch)
        # The original's '==' and '>' branches were identical; a tie is
        # resolved in favour of the branch-name match.
        if best_branch[1] >= best_address[1]:
            optimal_result = best_branch
        else:
            optimal_result = best_address

        result_index = optimal_result[2]

        result_row = bank_branches.loc[result_index]

        print('==== Queried BRANCH ===='.center(10))
        print(result_row)
        logger.error("{}".format(result_row))
        return result_row
예제 #9
0
def get_categories_related(categories_raw, min_score=90, min_bad_score=80, is_name=False, names=None):
    """Map raw category text (and optionally product names) to the known
    category tree in `categories_json`.

    A top-level category matches when its 'tokens' fuzzy-match the input
    and none of its 'banned' tokens do; its subcategories are then checked
    the same way, but only those whose tokens overlap the top-level hits.

    Returns:
        list: matched category (and subcategory) names; empty when the
        input is empty or the special exclusion rule below fires.
    """
    if categories_raw:
        categories_raw = clean(categories_raw, is_name)
    if names:
        names = clean(names, True)
    # print(categories_raw)
    if categories_raw:
        match_categories = []
        for name, attrs in categories_json.items():
            # A hit on any 'banned' token vetoes the whole category.
            not_choices = attrs.get('banned')
            bad_results = process.extractBests(categories_raw, not_choices, scorer=fuzz.partial_token_set_ratio,
                                               score_cutoff=min_bad_score)
            if names:
                bad_results += process.extractBests(names, not_choices, scorer=fuzz.partial_token_set_ratio,
                                                   score_cutoff=min_bad_score)
            if not bad_results:
                choices = attrs.get('tokens')
                results = process.extractBests(categories_raw, choices, scorer=fuzz.partial_token_set_ratio, score_cutoff=min_score)
                if results:
                    # print("++++++ \t", name, ': ', results)
                    match_categories.append(name)
                # Tokens that matched at the top level gate the subcats.
                result_keys = {result[0] for result in results}

                for cat in attrs.get('subcats'):
                    choices = set(list(cat.values())[0].get('tokens'))

                    # Only consider subcats sharing a matched token.
                    if (choices & result_keys):
                        not_choices = list(cat.values())[0].get('banned')
                        bad_results_sub = process.extractBests(categories_raw, not_choices,
                                                           scorer=fuzz.partial_token_set_ratio,
                                                           score_cutoff=min_bad_score)
                        if names:
                            bad_results_sub += process.extractBests(names, not_choices,
                                                                   scorer=fuzz.partial_token_set_ratio,
                                                                   score_cutoff=min_bad_score)
                        if not bad_results_sub:
                            cat_name = list(cat.keys())[0]
                            match_categories.append(cat_name)
            #                 print("++++++ \t", cat_name, ': ', results)
            #             else:
            #                 cat_name = list(cat.keys())[0]
            #                 print("------ \t", cat_name, ': ', bad_results_sub)
            # else:
            #     print("------ \t", name, ': ', bad_results)
        # Special rule: pets/automotive matches outside "Super" are
        # considered ambiguous when combined with anything else.
        if {"Mascotas", "Autos, Motos y llantas"} & set(match_categories):
            aux = set(match_categories) - {"Super"}
            if len(aux) > 1:
                return []


        return match_categories
    else:
        return []
예제 #10
0
def get_best_match_location(manga_name, choices):
    """
    Find the best matching manga among `choices`.
    Tries an exact (score 100) match first; failing that, falls back to a
    looser cutoff and lets the user pick from the candidates.
    """
    candidates = process.extractBests(manga_name, choices, limit=10, score_cutoff=100)
    if not candidates:
        candidates = process.extractBests(manga_name, choices, limit=10, score_cutoff=80)

    chosen = ask_best_match(candidates)
    return website_specific.absoulute_location(chosen)
예제 #11
0
def get_normalized(starting_name: str, names_coll: list):
    """Transitively gather names similar to `starting_name` and pick the
    one with the most close matches.

    Args:
        starting_name: name to normalize.
        names_coll: collection of candidate names to search in.

    Returns:
        tuple: (best name found, list of all candidate names visited)
    """
    frontier = process.extractBests(starting_name, names_coll, score_cutoff=75)
    candidates = [starting_name]
    best_name = starting_name
    best_count = len(frontier)

    while frontier:
        name, score = frontier.pop()
        # Skip exact self-matches and names already visited.
        if score < 100 and name not in candidates:
            candidates.append(name)
            # BUG FIX: the original searched an undefined name `li`
            # (NameError); search the provided collection instead.
            similar = process.extractBests(name, names_coll, score_cutoff=75)
            frontier = frontier + similar
            if len(similar) > best_count:
                # BUG FIX: the running maximum was never updated, so the
                # "most matches" comparison drifted; keep it current.
                best_count = len(similar)
                best_name = name
    return (best_name, candidates)
예제 #12
0
def get_match(word, word_list, precision=90):
    """Return the single best fuzzy match for `word` in `word_list`.

    Returns:
        A one-element list [(match, score)] when the best score exceeds
        `precision`; otherwise '' (the empty string is kept for backward
        compatibility with existing callers).
    """
    if len(word) == 0:
        return ''
    best_match = process.extractBests(word, word_list, limit=1)
    # BUG FIX: guard against an empty candidate list, which previously
    # raised IndexError on best_match[0].
    if not best_match or best_match[0][1] <= precision:
        best_match = ''
    return best_match
예제 #13
0
    def extract(self, expectation, limit=4):
        """Return the reverse-geocoded addresses best matching `expectation`.

        Fuzzy-matches (via fuzzywuzzy) the string form of every stored
        address against the expected address string.

        Args:
            expectation (str): the expected reverse-geocoding result; it
            should probably look like an address. Results come back in
            best-match-first order.

        Kwargs:
            limit (int): maximum number of candidates requested from
            fuzzywuzzy. The returned list may be longer when several
            stored addresses share an identical string form (e.g. two
            geocoders resolving to the same address).

        Returns:
            list: (geopy Location, score) tuples, where the score is a
            Levenshtein-based similarity computed by SeatGeek's
            fuzzywuzzy (higher is better). Background:
            http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
        """
        address_strings = [str(address) for address in self.addresses]
        best = fuzzyprocess.extractBests(expectation, address_strings, limit=limit)
        matches = []
        for matched_text, score in best:
            # Fan a text match back out to every address with that string.
            matches.extend(
                (address, score)
                for address in self.addresses
                if str(address) == matched_text
            )
        return matches
def test_identical_strings_extracted(scorer, processor, data):
    """
    An identical string must always come back as a perfect (score 100)
    match from process.extractBests.

    :param scorer:
    :param processor:
    :param data:
    :return:
    """
    # Random pool of strings, then a random index into that pool.
    pool = data.draw(
        st.lists(st.text(min_size=10, max_size=100),
                 min_size=1, max_size=50))
    idx = data.draw(st.integers(min_value=0, max_value=(len(pool) - 1)))

    target = pool[idx]

    # The processor must not collapse our target to the empty string.
    assume(processor(target) != '')

    # Ask for every perfect match.
    perfect = process.extractBests(target,
                                   pool,
                                   scorer=scorer,
                                   processor=processor,
                                   score_cutoff=100,
                                   limit=None)

    # There is at least one result, and the target itself is among them.
    assert perfect != []
    assert (target, 100) in perfect
예제 #15
0
 def item(self, name, fuzzy_threshold=100):
     """
     Extract a password from an unlocked Keychain using fuzzy
     matching. ``fuzzy_threshold`` can be an integer between 0 and
     100, where 100 is an exact match. With several candidates the
     user is asked (via /dev/tty) to pick one by index.
     """
     matches = process.extractBests(
         name,
         self._items.keys(),
         score_cutoff=(fuzzy_threshold-1),
     )
     if not matches:
         return None
     if len(matches) > 1:
         # Several candidates: list them and let the user choose.
         for i, m in enumerate(matches):
             sys.stderr.write('[%s] %s\n' % (i, m[0]))
         sys.stdin = open('/dev/tty')
         try:
             # BUG FIX: raw_input() is Python 2 only; use input().
             exact_name = matches[int(input())][0]
         except (ValueError, IndexError, EOFError):
             # Narrowed from a bare except: non-numeric or out-of-range
             # input means "no selection".
             return None
     else:
         exact_name = matches[0][0]
     item = self._items[exact_name]
     item.decrypt_with(self)
     return item
예제 #16
0
 def bot_message(self, cpu_lookup):
     """Build the bot's reply text for a CPU lookup request.

     Pre-filters self.cpu_list with a loose (>= 45) token-set ratio, then
     asks for a single confident (>= 85) match. When no confident match
     exists, falls back to listing up to 6 close candidates, or an apology
     when there are none.
     """
     self.cpu_lookup = cpu_lookup
     logging.info('Looking for CPU...')
     try:
         choices = []
         # First pass: keep any CPU that loosely resembles the query.
         for cpu in self.cpu_list:
             match_criteria = fuzz.token_set_ratio(
                 self.clean_input(cpu), self.clean_input(cpu_lookup))
             if match_criteria >= 45:
                 choices.append(cpu)
         closest_match = process.extractOne(cpu_lookup,
                                            choices,
                                            scorer=fuzz.token_set_ratio,
                                            score_cutoff=85)
         logging.info(f"Searching: {cpu_lookup}, Closest: {closest_match}")
         # extractOne returns None below the cutoff; subscripting it then
         # raises TypeError, which drives the fallback branch below.
         closest_match_name = closest_match[0]
         bot_reply = self.display_cpu_info(closest_match_name)
     except TypeError:
         # No confident match: offer the closest loose candidates instead.
         limit_choices = process.extractBests(cpu_lookup, choices)
         if limit_choices:
             bot_reply = f"No direct CPU  match found for **{cpu_lookup}**, displaying {len(limit_choices)} potential matches:\n\n"
             search_results = ''
             for result in limit_choices[:6]:
                 cpu_name = result[0]
                 search_results += f"[{cpu_name}]({self.cpu_list[cpu_name]})\n\n"
             bot_reply += search_results
             bot_reply += "\n\nFeel free to ask me again (`CPUBot! cpu model`) with these models or visit PassMark directly!\n"
         # Handles no results being found in search
         if not limit_choices:
             bot_reply = f"\n\nI'm sorry, I couldn't find any information on **{cpu_lookup}**.\n\nPlease feel free to try again; perhaps you had a spelling mistake, or your CPU is not listed in the [Passmark CPU list]({self.passmark_page})."
     return bot_reply
예제 #17
0
    def search_without_properties_list(self):
        """
        Initial search that does not use the list of related words.

        Looks up candidate tokens for the search word (bucketed by first
        letter and word length), refines them with fuzzy matching, and
        appends (phrase, percent, gm_names) results to self.res_list.
        """
        # Candidate bucket for this word's first letter and length.
        search_list = Suggest.root.suggest_db[self.first_let][str(
            self.len_word)]
        tokens = self.search_token(self.search_word, list(search_list.keys()),
                                   50, 3)

        for token, percent in tokens:
            tokens_list = search_list[token[:self.len_word]]
            # NOTE: `token` is deliberately rebound here from the loop
            # variable to the refined fuzzy-match list.
            token = process.extractBests(self.search_word,
                                         tokens_list,
                                         limit=3,
                                         score_cutoff=50)
            self.logger.debug(msg=[token, tokens_list])

            for word, percent in token:
                if word not in self.root.stop_words:
                    # Resolve the word to its canonical property key, then
                    # fetch the associated gm_name entries.
                    word_prop = process.extractOne(
                        word, Suggest.root.search_words_db[word[0]].keys())[0]
                    gm_names = Suggest.root.search_words_db[
                        word[0]][word_prop]['gm_name']

                    self.res_list.append(
                        (self.start_phrase + word, percent, gm_names))
예제 #18
0
def upload_file():
    """Accept an uploaded image, predict the breed, and return matching
    shelter-outcome rows as JSON records."""
    data = {"success": False}
    print(request.files)
    uploaded = request.files.get('file')
    if uploaded:
        # Persist the upload so the predictor can read it from disk.
        filename = uploaded.filename
        filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
        uploaded.save(filepath)

        prediction = predict(filepath)
        breed = prediction[0][1]

        animal_outcome = pd.read_csv('../Animals.csv')

        # Fuzzy-match the predicted breed against the outcome table; the
        # last tuple element of each hit is the DataFrame index.
        hits = process.extractBests(breed.replace('_', ' '),
                                    animal_outcome.Breed.to_dict(),
                                    scorer=partial_ratio,
                                    score_cutoff=70,
                                    limit=1000000000)
        animal_analysis = animal_outcome.loc[map(lambda hit: hit[-1], hits)]
        return animal_analysis.to_json(orient='records')
        # return render_template('table.html', dogs=animal_analysis.to_dict(orient='records'))

    return jsonify({"error": "there is an error!"})
예제 #19
0
    def fuzzyExtract(self, qs, ls, threshold):
        '''
        Fuzzy-match 'qs' inside the large string 'ls'.

        todo fuzzy search seperation in words
        :param qs: query string
        :param ls: large string
        :param threshold: fuzzy score cutoff used for long queries
        :return: list of (word, index) tuples, where `word` is the matched
            substring and `index` its first position in `ls`
        '''
        # Short queries get a laxer fuzzy score but a stricter (exact)
        # edit-distance requirement.
        if len(qs) < self.fuzzySearchOptimumLength:
            processThreshold = 60
            max_l_dist = 0
        else:
            processThreshold = threshold
            max_l_dist = 1

        results = []
        for word, confidence in process.extractBests(qs, (ls,), score_cutoff=processThreshold):
            print('word {}'.format(word), confidence)
            for near in find_near_matches(qs, word, max_l_dist=max_l_dist):
                match = word[near.start:near.end]
                print('match {}'.format(match))
                # BUG FIX: the documented (word, index) list was computed
                # but never collected or returned.
                results.append((match, ls.find(match)))
        return results
예제 #20
0
def get_best_events_matching(event_list: list, search: str):
    """Filter `event_list` down to events matching `search`.

    `search` may be a numeric event id (exact match) or a fuzzy event
    name. An empty/None search returns the full list.
    """
    if len(event_list) == 0:
        return []
    if search is None or len(search) == 0:
        return event_list
    try:
        wanted_id = int(search)
        for event in event_list:
            # BUG FIX: the original compared against the undefined name
            # `event_id` (NameError); use the parsed integer.
            if event["id"] == wanted_id:
                return [event]
    except ValueError:
        pass
    except TypeError:
        pass
    # try doing a search through the names of events:
    event_names = [(e["event_name"], e) for e in event_list]

    def process_by_first(v):
        # The processor is applied to both the query tuple and the
        # (name, event) choices, so match on the first element.
        return fuzzy_fullproc(v[0])

    results = process.extractBests((search, None),
                                   event_names,
                                   processor=process_by_first,
                                   scorer=fuzz.partial_ratio,
                                   score_cutoff=90)
    return [pair[1] for pair, score in results]
    def fuzzy_combine_def(self, definitions, scorer=None):
        """Cluster similar definition names and rank the clusters.

        Greedily seeds a group from an arbitrary remaining name, pulls in
        all names scoring >= 65 against it, orders each group by individual
        score, and finally sorts groups by their total score (descending).
        Returns a list of (ordered_names, total_score) tuples.
        """
        scores = dict(definitions)
        if scorer is None:
            scorer = fuzz.token_set_ratio

        groups = []
        remaining = set(scores.keys())
        while remaining:
            seed = remaining.pop()

            close = process.extractBests(seed, remaining, scorer=scorer,
                                         limit=1000, score_cutoff=65)
            close_names = [name for name, _ in close]
            for name in close_names:
                remaining.remove(name)

            ordered = sorted([seed] + close_names,
                             key=lambda name: scores[name], reverse=True)
            groups.append((ordered, sum(scores[name] for name in ordered)))

        return sorted(groups, key=operator.itemgetter(1), reverse=True)
예제 #22
0
def DupliFuzzyMatch(query):
    """Fuzzy-match `query` against the known full titles and return the
    duplicate counts for the closest titles.

    Returns a DataFrame of ['FullTitle', 'System', 'Count'] sorted by
    Count (descending), or an empty frame when no title scores above the
    cutoff.
    """
    choices = result['FullTitle'].tolist()
    prc = process.extractBests(query, choices, score_cutoff=60, limit=10)
    # De-duplicate while preserving order.
    prc = list(dict.fromkeys(prc))

    if len(prc) > 0:
        # BUG FIX: DataFrame.append was removed in pandas 2.x; collect the
        # per-title slices and concatenate once (also avoids the quadratic
        # repeated-append pattern).
        frames = [Dup_Count[Dup_Count['FullTitle'].str.match(title)]
                  for title, _score in prc]
        df = pd.concat(frames)
        df = df.drop_duplicates(['Titles'], keep='last')
        df = df[['FullTitle', 'System', 'Count']]
        df = df.sort_values(['Count'], ascending=False)
        df.reset_index(drop=True, inplace=True)
    else:
        df = pd.DataFrame(columns=['FullTitle', 'System', 'ISBN13'])

    return df
예제 #23
0
def fuzzy_search(query_string,
                 options_dict,
                 scorer=fuzz.QRatio,
                 score_cutoff=81):
    """
    Uses fuzzy search to find best matches for the `query_string` in
    `options_dict`

    Args:
        query_string (str):
            String used to search for matches
        options_dict (list):
            List of options to find matches in.
        scorer (fuzz.Scorer, optional):
            Strategy to use when searching for matches.
            Defaults to fuzz.QRatio.
        score_cutoff (int, optional):
            Similarity score cutoff threshold.
            Defaults to 81.

    Returns:
        array:
            Array of matching words
    """

    # BUG FIX: the `scorer` parameter was accepted but ignored — the call
    # hard-coded fuzz.QRatio. Honour the caller's choice.
    fuzzy_results = process.extractBests(query_string,
                                         options_dict,
                                         scorer=scorer,
                                         score_cutoff=score_cutoff)

    return fuzzy_results
예제 #24
0
파일: tools.py 프로젝트: scamera/cobaya
def fuzzy_match(input_string, choices, n=3, score_cutoff=50):
    """Return up to `n` of the best fuzzy matches for `input_string`.

    Returns a tuple of matched names (best first), or an empty list when
    nothing clears `score_cutoff`.
    """
    hits = fuzzy_process.extractBests(input_string, choices,
                                      score_cutoff=score_cutoff)
    if not hits:
        # Preserves the original's empty-list fallback (IndexError path).
        return []
    names = tuple(name for name, _score in hits)
    return names[:n]
예제 #25
0
def fuzzy_extract_matches(word, possibilities, junk_seq=None, n=3, cutoff=0.5):
    """Return up to `n` fuzzy matches for `word` as (match, score) pairs,
    with scores rescaled to the 0..1 range and filtered by `cutoff`
    (a 0..1 fraction). `junk_seq` is accepted for interface compatibility
    but unused."""
    raw = process.extractBests(word, possibilities,
                               score_cutoff=int(cutoff * 100), limit=n)
    return [(match, score / 100) for match, score in raw]
예제 #26
0
def get_match(word_list: list,
              word: str,
              score_cutoff: int = 60,
              isPartial: bool = False,
              limit: int = 1):
    """Uses fuzzywuzzy to see if word is close to entries in word_list

    Returns a (MATCH, SCORE) tuple when limit == 1; with limit > 1 the
    return value is a list of (MATCH, SCORE) tuples. Returns (None, None)
    when `word` is empty or no candidate clears `score_cutoff`.
    """
    if not word:
        return (None, None)
    result = None
    scorer = fuzz.ratio
    if isPartial:
        scorer = fuzz.partial_ratio
    if limit == 1:
        # extractOne returns a single (match, score) tuple or None.
        result = process.extractOne(word,
                                    word_list,
                                    scorer=scorer,
                                    score_cutoff=score_cutoff)
    else:
        # extractBests returns a (possibly empty) list of tuples.
        result = process.extractBests(word,
                                      word_list,
                                      scorer=scorer,
                                      score_cutoff=score_cutoff,
                                      limit=limit)
    if not result:
        return (None, None)
    return result
예제 #27
0
def get_fuzzy_search_movies(fuzzy, repo):
    """Fuzzy-search movie titles in the repository and return the matching
    movies as dicts, best matches first.
    """
    movies = list(repo.dataset_of_movies)
    titles_by_index = {index: movie.title for index, movie in enumerate(movies)}
    # Matching against a dict yields (title, score, index) triples.
    hits = process.extractBests(fuzzy, titles_by_index, score_cutoff=50)
    return [utils.movie_to_dict(movies[index]) for _title, _score, index in hits]
예제 #28
0
파일: mainfct.py 프로젝트: erhla/ptap
def address_candidates(input_data, cutoff_info):
    """
    Returns address candidates for a street number/name query.

    Args:
        input_data: dict with 'st_num', 'st_name' and 'appeal_type' keys.
        cutoff_info: dict of assessed-value cutoffs per region.
    Raises:
        ValueError: if 'appeal_type' is not a recognized region.
        Exception: if no candidate addresses match.
    """
    output = {}
    st_num = input_data['st_num']
    st_name = input_data['st_name']

    if input_data['appeal_type'] == "detroit_single_family":
        cutoff = cutoff_info['detroit']
        region = 'detroit'
    elif input_data['appeal_type'] == "cook_county_single_family":
        cutoff = cutoff_info['cook']
        region = 'cook'
    else:
        # Previously an unknown appeal_type fell through and crashed
        # later with NameError on `region`/`cutoff`; fail fast instead.
        raise ValueError(
            'Unknown appeal_type: {}'.format(input_data['appeal_type']))

    mini = address_candidates_query(region, st_num)
    candidate_matches = process.extractBests(st_name, mini.st_name, score_cutoff=50)
    selected = mini[mini['st_name'].isin([i for i, _, _ in candidate_matches])].copy()
    selected['Distance'] = 0

    if input_data['appeal_type'] == "detroit_single_family":
        selected = prettify_detroit(selected, False)
    elif input_data['appeal_type'] == "cook_county_single_family":
        selected = prettify_cook(selected, False)

    # Eligibility is a simple assessed-value threshold per region.
    selected['eligible'] = selected.assessed_value <= cutoff
    selected.dropna(axis=0, inplace=True)
    output['candidates'] = selected.to_dict(orient='records')

    if len(output['candidates']) == 0: #if none found raise
        raise Exception('No Matches Found')
    return output
def view_script(title_to_href):
    """Interactively look up and print the script of a single movie.

    Prompts the user (in French) for a movie title, fuzzy-matches it
    against the keys of *title_to_href*, confirms the match, then fetches
    and prints the script from springfieldspringfield.co.uk.
    """
    #This is what will get the actual script of a single movie
    movie_names_row = input(
        '\033[31m Entrez le nom du film et l\'année entre parenthèses, ex : 99 Homes (2014) : \033[0m'
    )
    #The first attribute of extract will be user's input, second is the list of all movie scripts, third is number of results determined by user
    movie_names = process.extractBests(movie_names_row,
                                       title_to_href.keys(),
                                       limit=1,
                                       score_cutoff=70)
    # extractBests returns (title, score) pairs; keep the titles only.
    titles = [movie_name[0] for movie_name in movie_names]
    title = titles[0]

    print(title)
    if input('\033[31m Entrez "yes" pour continuer : \033[0m') == 'yes':

        if title in title_to_href:
            print(title_to_href[title])
        else:
            print('Aucun résultat')

        # Fetch the script page and extract the script text from the page body.
        page_url = "https://www.springfieldspringfield.co.uk/movie_script.php?movie=" + title_to_href[
            title]
        page = urllib.request.urlopen(page_url)
        soup = BeautifulSoup(page, 'html.parser')
        script = soup.find("div", {"class": "movie_script"})
        print(script.text)
    else:
        pass
예제 #30
0
def normalize_country(country):
    """Normalize a free-form country string to a canonical country name.

    Returns None for n/a-style placeholders, 'UK'/'Russia' for common
    variants, a pycountry-resolved name when possible, and otherwise the
    name of the closest fuzzy match among current countries (or the
    input itself when nothing scores >= 90).
    """
    if process.extractBests(country, choices=['N/A', 'n/a'], score_cutoff=HIGH_CUTOFF):
        return None
    if country == 'null':
        return None
    if country in ['USA', 'UK']:
        return country
    if process.extractOne(country, choices=['United Kingdom', 'UK'], score_cutoff=HIGH_CUTOFF):
        return 'UK'
    if process.extractOne(country, choices=['England', 'Wales', 'Scotland'], score_cutoff=HIGH_CUTOFF):
        return 'UK'
    if process.extractOne(country, choices=['Russia', 'Russian Federation'], score_cutoff=LOW_CUTOFF):
        return 'Russia'
    try:
        # NOTE(review): assumes an older pycountry API that raises KeyError
        # on a miss -- newer releases return None instead; confirm version.
        country = pycountry.historic_countries.get(name=country.capitalize())
        return country.name
    except KeyError: pass

    try:
        country = pycountry.historic_countries.get(alpha2=country.upper())
        return country.name
    except KeyError: pass

    # Use a distinct loop variable: the original shadowed `country` here.
    countries = [c.name for c in pycountry.countries]
    best_match = process.extractOne(country, choices=countries, score_cutoff=90)
    # Bug fix: extractOne returns a (name, score) tuple -- previously the
    # whole tuple was returned instead of just the country name.
    return best_match[0] if best_match else country
def test_identical_strings_extracted(scorer, processor, data):
    """
    Test that identical strings will always return a perfect match.

    :param scorer: fuzzywuzzy scorer function under test
    :param processor: string pre-processing function under test
    :param data: hypothesis data object used to draw examples
    :return:
    """
    # Draw a list of random strings
    strings = data.draw(
        st.lists(st.text(min_size=10, max_size=100), min_size=1, max_size=50))
    # Draw a random integer for the index in that list
    choiceidx = data.draw(
        st.integers(min_value=0, max_value=(len(strings) - 1)))

    # Extract our choice from the list
    choice = strings[choiceidx]

    # Check process doesn't make our choice the empty string
    assume(processor(choice) != '')

    # Extract all perfect matches (limit=None keeps every score-100 hit)
    result = process.extractBests(choice,
                                  strings,
                                  scorer=scorer,
                                  processor=processor,
                                  score_cutoff=100,
                                  limit=None)

    # Check we get a result
    assert result != []

    # Check the original is in the list
    assert (choice, 100) in result
def get_grid(name):
    """Resolve an institution *name* to a GRID id.

    Tries the manual lookup table, then exact (and 'the '-stripped) name
    matches; otherwise fuzzy-matches and asks the user to pick a match or
    type the correct GRID directly (which is then returned verbatim).
    """
    name = name.lower()
    if name in grid_lookup.keys():
        return grid_lookup[name]

    if name in grid_names:
        index = grid_names.index(name)
        return grid_uk[index].get('id')

    elif name.replace('the ', '', 1) in grid_names:
        index = grid_names.index(name.replace('the ', '', 1))
        return grid_uk[index].get('id')

    # Bug fix: the original comprehension reused `name` as its loop
    # variable, which clobbers the argument under Python 2 scoping.
    fuzznames = grid_names + list(grid_lookup.keys())
    fuzzmatch = process.extractBests(
        name.replace('university', '').replace('the', ''), fuzznames)
    print('\nName to match:', name)
    for i, match in enumerate(fuzzmatch):
        print('  ' + str(i) + ': ' + match[0])
    i = input('Select a match (0-indexed) or give the correct GRID:')
    try:
        i = int(i)
        return grid_uk[grid_names.index(fuzzmatch[i][0])].get('id')
    except ValueError:
        # Not an integer: treat the input as a GRID id typed by the user.
        return i
예제 #33
0
def fuzMatch(query):
    """Fuzzy-match *query* against the (FullTitle, System, ISBN13) rows
    of the module-level `result` DataFrame.

    Returns a DataFrame with columns FullTitle, System, ISBN13 and Score
    (empty, with a fixed column set, when nothing scores >= 60).
    """
    # Deduplicate candidate tuples while preserving their order.
    choices = list(result[['FullTitle', 'System', 'ISBN13']].itertuples(index=False, name=None))
    choices = list(dict.fromkeys(choices))

    hits = process.extractBests(query, choices, score_cutoff=60, limit=10)
    hits = list(dict.fromkeys(hits))

    if hits:
        # Each hit is ((FullTitle, System, ISBN13), score); expand the
        # tuple into its own columns and keep the score alongside.
        frame = pd.DataFrame.from_records(hits, columns=['FullTitles', 'Score'])
        expanded = pd.DataFrame(frame['FullTitles'].values.tolist(), index=frame.index)
        frame = pd.concat([expanded, frame['Score']], axis=1, sort=False)
        frame.rename(columns={0: 'FullTitle', 1: 'System', 2: 'ISBN13'}, inplace=True)
    else:
        frame = pd.DataFrame(columns=['FullTitle', 'System', 'Score', 'ISBN13'])

    return frame
예제 #34
0
def main() -> None:
    """CLI entry point: fuzzily dispatch on the first argument.

    Only the "spells" command is handled: with no second argument the
    full spell list is printed; otherwise the second argument is fuzzily
    matched against spell keywords and the best match(es) reported.
    """
    args = sys.argv
    first = args[1]
    second: Optional[str]
    try:
        second = args[2]
    except IndexError:
        second = None

    first_res: Tuple[str, int] = process.extractOne(first, first_args)
    if first_res[1] >= MIN_SCORE:
        if first_res[0] == "spells":
            if second is None:
                print(spells)
            else:
                spell_keywords = get_spell_keywords()
                second_res: List[Tuple[str, int]] = process.extractBests(
                    second, spell_keywords, score_cutoff=MIN_SCORE)
                # Bug fix: an empty result list previously crashed with
                # IndexError on second_res[0]; report and bail out instead
                # (this is what the old, unreachable else-branch intended).
                if not second_res:
                    print(second_res)
                    return

                # Best match plus any runner-up within 5 points, unless
                # the best match is already perfect (score 100).
                potential_matches = [second_res[0][0]]
                potential_matches.extend([
                    x[0] for x in second_res[1:]
                    if second_res[0][1] - x[1] <= 5 and second_res[0][1] != 100
                ])

                if len(potential_matches) > 1:
                    string = "Multiple potential matches found:"
                    for match in potential_matches:
                        string += f"\n{match}"
                    print(string)

                else:
                    print_keyword_info(potential_matches[0])
예제 #35
0
    def thumbnail(self):
        """
        To find a list of thumbnail candidates, collections can be filtered inexactly against the title of the page (to catch spelling errors).
        """
        colls = []
        q_title = Collection.objects.filter(name__iexact=self.title)
        q_slug = Collection.objects.filter(name__iexact=self.slug)
        # We cast a very broad net to find all collections with a name matching the slug or title of the page. Collections created by Mediabox will have a name matching the name of the subdirectory they came from, so names are not unique.

        # Bug fix: the original appended whole lists, producing a list of
        # lists; each `coll` must be a Collection, and `candidates` must be
        # a flat list of titles for the fuzzy matcher and random.choice.
        colls.extend(q_title)
        colls.extend(q_slug)
        candidates = []

        for coll in colls:
            imgs_for_coll = Image.objects.filter(collection=coll)
            candidates.extend(entry.title for entry in imgs_for_coll)

        for query in ['thumbnail', 'directory', self.title]:
            hit = process.extractBests(query, candidates, score_cutoff=70)

            if hit:
                # Bug fix: hit[0] is a (title, score) tuple; look up by title.
                return Image.objects.get(title__iexact=hit[0][0])

        consolation = random.choice(candidates)  # Just pick one.
        return Image.objects.get(title__iexact=consolation)
예제 #36
0
    def _handle_not_found(self, query):
        """
        Handles when a query does not match a valid command or cog.

        Will pass on possible close matches along with the
        ``HelpQueryNotFound`` exception.

        Parameters
        ----------
        query: str
            The full query that was requested.

        Raises
        ------
        HelpQueryNotFound
        """
        # Commands and cogs share one candidate pool for suggestions.
        candidate_names = list(self._bot.all_commands) + list(self._bot.cogs)

        close_matches = process.extractBests(query,
                                             candidate_names,
                                             scorer=fuzz.ratio,
                                             score_cutoff=90)

        # Attach the near-misses so the caller can surface suggestions.
        raise HelpQueryNotFound(f'Query "{query}" not found.', dict(close_matches))
예제 #37
0
def enrich_supports(rows):
    """Yield each row enriched with a 'criteria_docs' list.

    For rows of request_type 'א3', the first payment's
    (support_title, supporting_ministry) pair is fuzzy-matched against
    the module-level `criteria`, memoizing results in the module-level
    `cache`.  Match statistics are logged once the input is exhausted.
    """
    relevant_rows = 0
    matched_rows = 0
    for row in rows:
        bests = []
        if row['request_type'] == 'א3':
            relevant_rows += 1
            payments = row['payments']
            if payments and len(payments) > 0:
                payment = payments[0]
                key = (payment['support_title'], payment['supporting_ministry'])
                if key in cache:
                    bests = cache[key]
                else:
                    # NOTE(review): `processor=id` is presumably an identity
                    # helper defined elsewhere in this module -- the builtin
                    # id() would feed object addresses to the scorer. Confirm.
                    bests = process.extractBests(
                        {
                            'purpose': key[0],
                            'office': key[1]
                        },
                        criteria,
                        processor=id,
                        scorer=criteria_scorer
                    )
                    cache[key] = bests
        if len(bests)>0:
            matched_rows += 1
        # Keep only the matched documents, dropping their scores.
        row['criteria_docs'] = [x[0] for x in bests]
        yield row

    logging.info('MATCH STATS: rel: %d, matched: %d', relevant_rows, matched_rows)
def word_match(words, menu, vector_dict, threshold=93, best_matches=None):

    """Matches single-word keyword vectors to menu items 
    Args:
        words (list): a list of single keywords
        menu (dict): a dictionary of a menu for a given restaurant
        vector_dict (dict): a dictionary whose keys are keywords and values are vectors
        threshold (int): fuzzy-match score required for keyword matching 
        best_matches (dict): dict created in best_matches to add to
    Returns:
        best_matches (dict): a nested dictionary whose outer keys are menu items, inner keys
        are tuples of keyword matches and their corresponding fuzzy match score, and values
        are the review vectors of each keyword 
    """

    if not best_matches:
        best_matches = {}
    # Python 2 code: dict.iteritems() (use .items() if porting to Python 3).
    for name, meals in menu.iteritems():
        for meal, value in meals.iteritems():
            for food in value:
                item = remove_accents(food)
                # Only single-word menu items are handled here; multi-word
                # items are covered by best_matches() elsewhere in the file.
                if len(item.split()) == 1:
                    matches = process.extractBests(item, words, scorer=fuzz.QRatio, score_cutoff=threshold, limit=3)
                    vectors = []
                    for match in matches:
                        vectors.append(vector_dict[match[0]])
                    best_matches[item] = dict(zip(matches, vectors))
    return best_matches     
예제 #39
0
    def get_breeze_user(self, name):
        """Return the Breeze person best matching *name*.

        Fuzzy-matches against all users' full names, penalizing
        candidates whose first name starts with a different letter.

        Raises:
            MatchNotFoundError: when no candidate exists or the best
                penalized score is below 90.
        """
        self.get_all_users()

        # Put name in format so get_name applies
        temp_name = {'first_name': name, 'last_name': ''}
        get_name = lambda u: ' '.join([u['first_name'], u['last_name']])

        bests = process.extractBests(temp_name,
                                     self._people,
                                     get_name,
                                     limit=3)

        # Bug fix: with no candidates at all, bests[0] below raised
        # IndexError; report "not found" instead.
        if not bests:
            raise MatchNotFoundError

        # Penalize 10 points if first names start with different letters
        best = (bests[0], 0)  # Hold best candidate and the score
        for item in bests:
            penalty = 0
            if name[0].casefold() != item[0]['first_name'][0].casefold():
                penalty = 10

            if item[1] - penalty > best[1]:
                best = (item, item[1] - penalty)

        if best[1] < 90:
            raise MatchNotFoundError

        return best[0]
    def fuzzy_combine_def(self, definitions, scorer=None):
        """Group similar definition names and rank the groups.

        *definitions* is an iterable of (name, score) pairs.  Names that
        fuzzy-match (score >= 65) are grouped; each group is sorted by
        per-name score, and groups are returned sorted by their total
        score, highest first, as (names, total_score) tuples.
        """
        scores = dict(definitions)

        if scorer is None:
            scorer = fuzz.token_set_ratio

        groups = []
        remaining = set(scores.keys())

        while remaining:
            seed = remaining.pop()

            close = process.extractBests(seed,
                                         remaining,
                                         scorer=scorer,
                                         limit=1000,
                                         score_cutoff=65)
            members = [candidate for candidate, _score in close]

            # Each name belongs to exactly one group.
            remaining.difference_update(members)

            group = sorted([seed] + members,
                           key=lambda candidate: scores[candidate],
                           reverse=True)
            groups.append((group, sum(scores[candidate] for candidate in group)))

        return sorted(groups, key=operator.itemgetter(1), reverse=True)
예제 #41
0
def fuzzy_matches(x=None, query='', where='both', score_cutoff=50, limit=10):
    """
    Get the internal matches used in fuzzy search. Exposes the scores of each match.
    If item is a 2 tuple match is on a key, if the item is a 3 tuple match is on a value.

    Raises:
        ValueError: if *where* is not 'keys', 'values' or 'both'.
    """
    # Bug fixes: (1) the original compared with `is`, which only works on
    # string literals by CPython interning accident; (2) the 'keys' and
    # 'values' branches assigned `matches` but returned `results`, raising
    # NameError whenever they were taken.
    if where == 'keys':
        results = process.extractBests(query, x.keys(), score_cutoff=score_cutoff, limit=limit)
    elif where == 'values':
        results = process.extractBests(query, x, score_cutoff=score_cutoff, limit=limit)
    elif where == 'both':
        key_matches = process.extractBests(query, x.keys(), score_cutoff=score_cutoff, limit=limit)
        val_matches = process.extractBests(query, x, score_cutoff=score_cutoff, limit=limit)
        # Combine results
        results = key_matches + val_matches
    else:
        raise ValueError("where must be 'keys', 'values' or 'both'")

    return results
def getclosestelement(name, array):
    """Return *name* if present in *array*, else the closest match.

    Tries fuzzywuzzy first; on any failure (including fuzzywuzzy not
    being importable) falls back to stdlib difflib.
    """
    if name in array:
        return name
    try:
        return process.extractBests(name, array)[0][0]
    # Deliberately broad best-effort fallback, but no longer a bare
    # `except:` (which also swallowed KeyboardInterrupt/SystemExit).
    except Exception:
        return difflib.get_close_matches(name, array)[0]
예제 #43
0
def search_collection(items, query, key='title'):
    """Levenstein fuzzy search"""
    # Index items by their search key so matched strings map back to items.
    by_key = {entry[key]: entry for entry in items}
    ranked = process.extractBests(query, list(by_key.keys()))
    if not ranked:
        return []
    return [by_key[matched] for matched, _score in ranked]
예제 #44
0
 def busca(self, query):
     """Search the sample for *query* and return polished results.

     Normalizes the query, picks a scorer for it, extracts up to 10
     matches above 65, and -- when the token-set scorer was chosen --
     re-ranks the polished matches by partial ratio.  The helper
     functions are defined elsewhere in this module.
     """
     query = normaliza_string(query)
     amostra = self.cria_amostra()
     scorer = seleciona_scorer(query)
     resultado = process.extractBests(query, amostra, limit=10, scorer=scorer, score_cutoff=65)
     if scorer == fuzz.token_set_ratio:
         resultado = process.extract(query, lapida_extracao(resultado), limit=20, scorer=fuzz.partial_ratio)
     return lapida_extracao(resultado)
예제 #45
0
파일: xlibris.py 프로젝트: konsbn/xlibris
 def lookup(self, keyword):
     """Fuzzy-search titles and authors; run self.search on strong hits.

     Pools the 'Title' and 'Authors' entries from the database and
     searches every match scoring at least 90.
     """
     data = _concat(self.db.all())
     title = data.pop('Title')
     auth = data.pop('Authors')
     choices = title + auth
     # (matched_string, score) pairs, best first.
     searchKey = process.extractBests(keyword, choices)
     for i in searchKey:
         if i[1] >= 90:
             self.search(i[0])
예제 #46
0
파일: uflix.py 프로젝트: sg-s/uflix
	def search(self, name):
		'''Print the best fuzzy matches for *name* among movie folders.

		Each sub-directory of self.movies_path is treated as one movie
		title; matches are printed best first.
		'''

		movies_path = self.movies_path;

		# Candidate titles are the sub-directory names of the movies folder.
		onlyfolders = [f for f in os.listdir(movies_path) if os.path.isdir(os.path.join(movies_path, f))]

		results = process.extractBests(name,onlyfolders)
		for result in results:
			print(result[0])
예제 #47
0
파일: manager.py 프로젝트: leiserfg/lic
    def list(self, query=None, by_nick=False):
        """Return licenses fuzzily matching *query* (all when query is None).

        Matching is done against each license's 'nick' attribute when
        *by_nick* is true, otherwise against 'title'.  (Python 2 code:
        uses dict.iterkeys.)
        """
        if query is None:
            return self.licenses

        field = 'nick' if by_nick else 'title'
        choices = {getattr(lic, field): lic for lic in self.licenses}
        # Bug fix: removed a leftover debug `print choices` statement
        # that leaked the whole mapping to stdout on every call.
        field_values = extractBests(query, choices.iterkeys())

        lics = [choices[v] for v, _ in field_values]
        return lics
예제 #48
0
def fuzzy_search(x=None, query='', where='both', score_cutoff=50, limit=10):
    """
    Search thru keys, values, or both via fuzzy string comparison.

    Raises:
        ValueError: if *where* is not 'keys', 'values' or 'both'.
    """
    # Bug fix: the original compared with `is`, which only works for string
    # literals by CPython interning accident; an unrecognized `where` also
    # left `results` unbound (NameError at the return).
    if where == 'keys':
        matches = process.extractBests(query, x.keys(), score_cutoff=score_cutoff, limit=limit)
        results = {k : x[k] for k, score in matches}
    elif where == 'values':
        # Dict choices yield (value, score, key) triples.
        matches = process.extractBests(query, x, score_cutoff=score_cutoff, limit=limit)
        results = {k : v for v, score, k in matches}
    elif where == 'both':
        key_matches = process.extractBests(query, x.keys(), score_cutoff=score_cutoff, limit=limit)
        key_results = {k : x[k] for k, score in key_matches}
        val_matches = process.extractBests(query, x, score_cutoff=score_cutoff, limit=limit)
        val_results = {k : v for v, score, k in val_matches}
        # Combine results
        results = key_results.copy()
        results.update(val_results)
    else:
        raise ValueError("where must be 'keys', 'values' or 'both'")

    return results
예제 #49
0
def selectBestMatch(tokens, hits, qualityThreshold=50, creatorWeight=2, titleWeight=1):
   """Return the best-matching (normalizedScore, hit) pair, or None.

   Python 2 code (dict.has_key).  Each hit is scored by fuzzy-matching
   its title (weighted by titleWeight) and the average of its
   creator-like fields (creator/person/contributor/publisher, weighted
   by creatorWeight) against *tokens*; only hits whose weight-normalized
   score reaches qualityThreshold are kept.
   """
   results = []
   for hit in hits:
      try:
         creators = set()
         normalizedScore = 0
         creatorScore = 0
         score = 0
         if hit.has_key('title'):
            foo = process.extractBests(hit['title'], tokens)
            if len(foo) > 0:
               score += foo[0][1] * titleWeight
         if hit.has_key('creator'):
            creators = creators.union(set(hit['creator']))
         if hit.has_key('person'):
            creators = creators.union(set(hit['person']))
         if hit.has_key('contributor'):
            creators = creators.union(set(hit['contributor']))
         if hit.has_key('publisher'):
            creators = creators.union(set(hit['publisher']))            
            
         if len(creators) > 0:
            # Average creator score across all creator-like fields.
            for creator in creators:
               foo = process.extractBests(creator, tokens)
               if foo != None and foo != []:
                  creatorScore += foo[0][1]
            score += (creatorScore/len(creators)) * creatorWeight

         normalizedScore = score/(creatorWeight+titleWeight)   
         if normalizedScore >= qualityThreshold:
            # Keep results sorted by score so the best is always last.
            bisect.insort_left(results, (normalizedScore, hit))
      except UnicodeDecodeError as e:
         continue
   if len(results) > 0:
      return results[-1]
   return None
예제 #50
0
def match_lists(list1, list2, precision=90):
    '''
    Takes two lists
    Returns dict of matches (if best match scores above precision)
        and non-matches (otherwise); both dicts have key = list1
        element and value = best match (None when list2 is empty)
    '''
    in_both = {}
    not_in_both = {}
    for manager1 in list1:
        best_match = process.extractBests(manager1, list2, limit=1)
        # Bug fix: an empty list2 made extractBests return [], so
        # best_match[0] raised IndexError; record a non-match instead.
        if not best_match:
            not_in_both[manager1] = None
            continue
        if best_match[0][1] > precision:
            in_both[manager1] = best_match[0][0]
        else:
            not_in_both[manager1] = best_match[0][0]
    return [in_both, not_in_both]
예제 #51
0
def get_closest_text_bump(search_text):
    """Return a link for the bump whose text best matches *search_text*.

    Ties at the top score are broken randomly; returns 'no dice m8'
    when no bump has any text.
    """
    search_text = {'text': search_text}
    # Bug fix: a Python 3 `filter` object is always truthy, which made the
    # emptiness check below useless -- materialize the list instead.
    text_bumps = [bump for bump in get_bumps() if bump['text']]
    if text_bumps:
        best_bumps = process.extractBests(
            search_text,
            text_bumps,
            scorer=fuzz.token_set_ratio,
            processor=lambda bump: bump['text'].lower()
        )
        # Keep every bump tied with the top score, then pick one at random.
        best_bumps = [
            bump[0]
            for bump in best_bumps
            if bump[1] == best_bumps[0][1]
        ]
        return get_bump_link(random.choice(best_bumps))
    else:
        return 'no dice m8'
예제 #52
0
    def get_best(self, message):
        """Return the top-scoring Trump quotes for *message*, or None.

        Fuzzy-matches the tokenized message against self.trump_quotes,
        boosts scores for messages mentioning Trump, then adjusts by an
        n-gram overlap score and keeps all results tied with the first.
        """
        message = message.lower()
        tokens = self.create_tokens(message)
        # NOTE(review): the 3-way unpacking (x[2]) below implies
        # trump_quotes is a dict, since fuzzywuzzy returns
        # (value, score, key) triples for dict choices -- confirm.
        matches = process.extractBests(' '.join(tokens), self.trump_quotes,
                                       score_cutoff=50, limit=3)
        if not matches:
            return None

        # Explicit Trump mentions get a flat 10-point bonus.
        if re.search('trump|the donald', message):
            matches = [(x[0], x[1] + 10, x[2]) for x in matches ]

        adjusted = []
        for x in matches:
            b_tokens = self.create_tokens(x[0])
            score = self.score_ngrams(tokens, b_tokens)
            adjusted.append((x[0], x[1] + score, x[2]))
        # NOTE(review): `adjusted` is not re-sorted, so the tie filter keys
        # off the first entry's (pre-adjustment best) score -- confirm intent.
        top = [x for x in adjusted if x[1] == adjusted[0][1] ]
        return top
def test_only_identical_strings_extracted(scorer, processor, data):
    """
    Test that only identical (post processing) strings score 100 on the test.

    If two strings are not identical then using full comparison methods they should
    not be a perfect (100) match.

    :param scorer: fuzzywuzzy scorer function under test
    :param processor: string pre-processing function under test
    :param data: hypothesis data object used to draw examples
    :return:
    """
    # Draw a list of random strings
    strings = data.draw(
        st.lists(
            st.text(min_size=10, max_size=100, alphabet=HYPOTHESIS_ALPHABET),
            min_size=1,
            max_size=10)
    )
    # Draw a random integer for the index in that list
    choiceidx = data.draw(st.integers(min_value=0, max_value=(len(strings) - 1)))

    # Extract our choice from the list
    choice = strings[choiceidx]

    # Check process doesn't make our choice the empty string
    assume(processor(choice) != '')

    # Extract all perfect matches (limit=None keeps every score-100 hit)
    result = process.extractBests(choice,
                                  strings,
                                  scorer=scorer,
                                  processor=processor,
                                  score_cutoff=100,
                                  limit=None)

    # Check we get a result
    assert result != []

    # Check THE ONLY result(s) we get are a perfect match for the (processed) original data
    pchoice = processor(choice)
    for r in result:
        assert pchoice == processor(r[0])
예제 #54
0
def extrairCitadosTextoBestExtract(link,f):
    listaFamosos = model.buscarListaFamoso()
    noticiaDB = model.buscarNoticia(link)

    if noticiaDB != False and listaFamosos != False:
        f.writelines(str(noticiaDB[1]) + "\n")
        noticia = {'id': noticiaDB[0], 'titulo': noticiaDB[1], 'subtitulo': noticiaDB[2], 'link': noticiaDB[3], 'tipo': noticiaDB[4], 'texto': noticiaDB[5]}
        print(str(noticia['titulo']))
        famosos = []
        for j in listaFamosos:
            famosos.append(j[1])

        textoTotal = noticia['titulo'] + " " + noticia['subtitulo'] + " " + noticia['texto']

        result = process.extractBests(textoTotal, famosos,None,None,57,limit=10000)

        for r in result:
            f.writelines(str(r[0])+","+str(round(r[1],1))+"\n")

        return result
예제 #55
0
파일: main.py 프로젝트: Juvawa/bib2web
def pdf(pdf_files, shared_pdf, bibtex_folder, bibtex_files, gscholar):
	"""Attach each PDF to its bibtex entry (Python 2 code).

	For every PDF, extracts its first 35 words of text.  With
	gscholar=True the entry is looked up on Google Scholar; otherwise
	the words are fuzzy-matched against *bibtex_files* and, when the
	score exceeds 45, a 'pdf' (or 'pdfN') field is added to the matched
	bibtex file in place.
	"""
	for pdf in pdf_files:
		txt = re.sub("\W", " ", gs.convert_pdf_to_txt(pdf)).lower()
		#Research determined that the cutting of 35 words gives the 
		#highest accuracy
		words = txt.strip().split()[:35]
		words = " ".join(words)		
		print words
		if gscholar == True:
			bib = load(gs.pdflookup(pdf, all, gs.FORMAT_BIBTEX)[0])
			keys = bib.entries[0].keys()
			# Choose 'pdf', 'pdf2', 'pdf3', ... for additional files.
			matching = [s for s in keys if "pdf" in s]
			if len(matching) - 1 <= 0:
				key = 'pdf'
			else:
				key = 'pdf' + str(len(matching))
			#link = os.symlink(pdf, str(shared_pdf) + str(pdf.split('/')[-1]))
			bib.entries = [add_field(bib.entries[0], key, bib)]
			bibtex(bib.entries, bibtex_folder, bib)
			# Random delay to avoid hammering Google Scholar.
			sleep(random.uniform(1, 2))
		else:
			best_match = process.extractBests(words, bibtex_files, limit=1)
			print best_match
			if best_match:
				bib = best_match[0][0]
				score = best_match[0][1]
				#Research determined that matching score of 45
				#gives the highest accuracy
				if score > 45:
					with open(bib, 'r') as f:
						db = load(f)
					entries = db.entries[0]
					keys = entries.keys()
					matching = [s for s in keys if "pdf" in s]
					if len(matching) - 1 <= 0:
						key = 'pdf'
					else:
						key = 'pdf' + str(len(matching))
					entries = add_field(entries, key, bib)
					with open(bib, 'w') as f:
						f.write(writer._entry_to_bibtex(entries))
예제 #56
0
def extrairCitadosTexto(threadName, queue, listaFamosos):
    """Worker-thread body: link cited celebrities to queued news articles.

    Pops article rows from the shared workQueue (guarded by queueLock),
    fuzzy-extracts candidate celebrity names from the full text, then
    confirms each candidate with averaged per-name token-set ratios;
    confirmed names (average > 85) are linked via the model.
    """
    queueLock.acquire()
    while not workQueue.empty():
        n = queue.get()
        print("Queue: " + str(workQueue.qsize()))
        # Release the lock during the slow matching so peers can pull work.
        queueLock.release()

        noticia = {'id': n[0], 'titulo': n[1], 'subtitulo': n[2], 'link': n[3], 'tipo': n[4], 'texto': n[5]}
        tempoBest = time.clock()
        textoTotal = noticia['titulo'] + " " + noticia['subtitulo'] + " " + noticia['texto']
        # Positional args: processor=None, scorer=None, score_cutoff=57.
        bestExtract = process.extractBests(textoTotal, listaFamosos,None,None,57,limit=10000)
        print("tempoBest:" + str(time.clock() - tempoBest))

        tempoLoop = time.clock()
        for j in bestExtract:
            ratioTotal = 0
            mediaRatio = 0
            nomeCompleto = j[0]
            todosNomes = nomeCompleto.split(" ")
            textoTotal = noticia['titulo'] + " " + noticia['subtitulo'] + " " + noticia['texto']

            # Score the full name, plus each name part for multi-word names.
            ratioTotal += fuzz.token_set_ratio(nomeCompleto, textoTotal)
            if len(todosNomes) > 1:
                for nome in todosNomes:
                    ratioTotal += fuzz.token_set_ratio(nome, textoTotal)

            if len(todosNomes) == 1:
                mediaRatio = ratioTotal
            elif len(todosNomes) > 1:
                mediaRatio = ratioTotal / (len(todosNomes) + 1)

            if mediaRatio > 85:
                model.relacionarFamosoNoticia(nomeCompleto, noticia['id'])
        print("tempo loop:" + str(time.clock() - tempoLoop))

        print(str(threadName) + " - " + str(noticia['titulo']))
        # Re-take the lock before re-checking the queue's emptiness.
        queueLock.acquire()
    queueLock.release()
예제 #57
0
 outlist[21] = row[11] # Room (raw location from input)
 outlist[27] = row[6] # Manufacturer website
 outlist[29] = row[18] # training_required?
 outlist[37] = row[7] # Asset ID
 outlist[38] = row[28] # Finance ID
 outlist[39] = row[5] # Serial No
 outlist[42] = row[24] # Date of Purchase
 outlist[43] = row[25] # Purchase_cost
 outlist[45] = row[23] # end_of_life
 outlist[46] = row[22] # maintenance
 outlist[49] = row[8] # comments
 # Fuzzy match building:
 if  not row[11]:
     row[11] = 'Unknown'
 # a -- results from fuzzy matching
 a = process.extractBests(row[11], buildings, limit=2)
 if a[0][0] == "SMC":
     outlist[20] = 'Scottish Microelectronics Centre'
 else:
     outlist[20] = a[0][0];
 # Fuzzy match names and emails:
 # Contact 1:
 flat_names = flatten_dict(names)
 b = process.extractBests(row[12],flat_names, limit=2, scorer=fuzz.token_set_ratio)
 custodian_score = b[0][1]
 outlist[22] = b[0][0] # contact 1 name
 outlist[23] = names[b[0][0]] # contact 1 email
 # Contact 2:
 if row[13]:
     c = process.extractBests(row[13],flat_names, limit=2, scorer=fuzz.token_set_ratio)
     technical_score = c[0][1]
예제 #58
0
파일: kya.py 프로젝트: jck/kya
 def handle_query(self):
     """Refresh the results widget with the best matches for the query.

     Takes up to 8 fuzzy matches of the query text against all known
     results and selects the first row.
     """
     self.results.clear()
     res = extractBests(self.query.text(), self.all_results, limit=8)
     for r, s in res:
         self.results.add(r)
     self.results.setCurrentRow(0)
예제 #59
0
def search_episode_by_filename(filename):
    """Match *filename* to a Tatort episode and rename the file.

    Fuzzy-matches the cleaned base name against the module-level
    tatort_titles (a mapping; matches carry the episode id as their
    third element).  Ambiguous matches are resolved interactively, then
    the file is renamed to "Tatort NNNN - SSxEE - Title.ext".
    """
    # Split out file extension
    basename, extension = os.path.splitext(filename)

    # Remove common phrases not part of the title
    searchname = re.sub(r"Tatort", '', basename)

    # Find match
    match_results = process.extractBests(searchname, tatort_titles,
                                         score_cutoff = 60, limit = 5)

    # no match was found
    if not match_results:
        print("No match was found for file {}".format(filename))
        return

    # only one match was found with the minimum required score
    matching_episode = None
    if len(match_results) == 1:
        chosen_result = match_results[0]

    # multiple matches were found above the score threshold: ask the user
    # which one is right
    if len(match_results) > 1:
        if match_results[0][1] - match_results[1][1] > 20:
            # if choice 0 is 20 points more likely than choice 1, we directly
            # use the first choice
            chosen_result = match_results[0]
        else:
            # print choices
            print("Multiple matches were found for file {}".format(filename))
            print("Please choose the correct one from the list below.")
            for index, match_result in enumerate(match_results):
                (matching_title, matching_score, matching_id) = match_result
                episode = tatort_episodes[matching_id]
                print("{index}: {name} (score: {score:02d}/100)".format(
                    index = index, name = episode['episodename'],
                    score = matching_score))

            # let user choose
            chosen_id = int(input('Your choice: '))
            # FIXME: repeat on wrong inputs

            chosen_result = match_results[chosen_id]

    (matching_title, matching_score, matching_id) = chosen_result
    matching_episode = tatort_episodes[matching_id]

    # build new file name
    # NOTE(review): bare except -- any bad 'absolute_number' silently
    # becomes 0; consider narrowing to (KeyError, ValueError, TypeError).
    try:
        absolute_number = int(matching_episode['absolute_number'])
    except:
        absolute_number = 0

    new_filename = "Tatort {:04d} - {:02d}x{:02d} - {}{}".format(
        absolute_number, int(matching_episode['seasonnumber']),
        int(matching_episode['episodenumber']),
        matching_episode['episodename'], extension)

    # '/' would be a path separator in the new name; strip it out.
    new_filename = new_filename.replace('/', ' ')

    print("{} -> {}".format(filename, new_filename))
    os.rename(filename, new_filename)