def best_matches(bigkeys, menu, vector_dict, threshold=90):
    """Matches 2-gram and 3-gram keyword vectors to menu items

    Args:
        bigkeys (list): a list of 2-grams and 3-grams
        menu (dict): a dictionary of a menu for a given restaurant
        vector_dict (dict): a dictionary whose keys are keywords and values are vectors
        threshold (int): fuzzy-match score required for keyword matching

    Returns:
        best_matches (dict): a dictionary whose keys are menu items and values are
            tuples of keyword matches and their corresponding fuzzy match scores
    """
    best_matches = {}
    for name, meals in menu.items():  # .iteritems() is Python 2 only
        for meal, course in meals.items():
            for food in course:
                item = food.replace('-', ' ')
                if 1 < len(item.split()) < 4:
                    matches = process.extractBests(item, bigkeys, scorer=fuzz.QRatio,
                                                   score_cutoff=threshold, limit=3)
                    matches += process.extractBests(item, bigkeys, scorer=fuzz.UWRatio,
                                                    score_cutoff=threshold, limit=3)
                    vectors = [vector_dict[match[0]] for match in matches]
                    best_matches[item] = dict(zip(matches, vectors))
                if len(item.split()) > 3:
                    # For long names, match on the trailing two-word descriptor
                    # (the original computed `descriptor` but never used it)
                    descriptor = ' '.join(item.split()[-2:])
                    matches = process.extractBests(descriptor, bigkeys, scorer=fuzz.QRatio,
                                                   score_cutoff=threshold, limit=3)
                    matches += process.extractBests(descriptor, bigkeys, scorer=fuzz.UWRatio,
                                                    score_cutoff=threshold, limit=3)
                    vectors = [vector_dict[match[0]] for match in matches]
                    best_matches[item] = dict(zip(matches, vectors))
    return best_matches
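# Usage sketch (not from the original source): how best_matches() might be
# called. The menu, keywords and vectors below are invented for illustration.
from fuzzywuzzy import fuzz, process

bigkeys = ['pork belly bun', 'short rib', 'spicy tuna roll']
menu = {'Momofuku': {'dinner': ['pork-belly bun', 'braised short rib']}}
vector_dict = {k: [0.0, 1.0] for k in bigkeys}  # stand-in review vectors

# Result keys are menu items; inner keys are (keyword, score) tuples
print(best_matches(bigkeys, menu, vector_dict, threshold=85))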
def search(self, data):
    books = Book.objects.all()
    resID = set(b.id for b in books)
    for key in ['title', 'isbn']:
        if key in data:
            values = [b.__dict__[key] for b in books]
            fuzzy = process.extractBests(data[key], values, score_cutoff=60)
            fvalues = [val for val, _ in fuzzy]
            ids = [b.id for b in books if b.__dict__[key] in fvalues]
            resID &= set(ids)
    if 'author' in data:
        values = [b.author.lname for b in books]
        fuzzy = process.extractBests(data['author'], values, score_cutoff=60)
        fvalues = [val for val, _ in fuzzy]
        ids = [b.id for b in books if b.author.lname in fvalues]
        resID &= set(ids)
    resBooks = [{
        'id': b.id,
        'title': b.title,
        'author': f'{b.author.fname} {b.author.lname}',
        'isbn': b.isbn
    } for b in books if b.id in resID]
    return resBooks
def common_process(self, proddesc_list, Item_Description, shouldConsiderOnlyTopScore,
                   matchingThreshold, cursor, listType):
    if len(proddesc_list) == 0:
        return '|||||||||||||||||||||'
    if shouldConsiderOnlyTopScore:
        bestMatch = process.extractBests(Item_Description, proddesc_list,
                                         scorer=fuzz.token_sort_ratio,
                                         limit=1, score_cutoff=matchingThreshold)
        if len(bestMatch) == 0:
            return '|||||||||||||||||||||'
        else:
            return self.getmatchedrecord(bestMatch[0][0], bestMatch[0][1], cursor, listType)
    else:
        bestMatches = process.extractBests(Item_Description, proddesc_list,
                                           scorer=fuzz.token_sort_ratio,
                                           limit=5, score_cutoff=matchingThreshold)
        if len(bestMatches) == 0:
            return '|||||||||||||||||||||'
        else:
            bestMatches = set(bestMatches)
            rtrn_str = ''
            for match in bestMatches:
                matched_record = self.getmatchedrecord(
                    match[0], match[1], cursor, listType)
                rtrn_str = rtrn_str + '#' + matched_record
            # str.strip() returns a new string; the original discarded the result
            return rtrn_str.strip('#')
def match_ingredients(self, recipe):
    """
    Makes a fuzzy match between ingredients in a recipe and in the
    ingredient:compound dict

    :param recipe: ingredients to match
    :return: list of matched ingredient names
    """
    matches = []
    for ing in recipe:
        component_matches = self.decomposer.get_component_ingredients(ing)
        if component_matches:
            return self.match_ingredients(component_matches)
        match_list = process.extractBests(ing, self.sanitized_comp_ing_dict.keys(),
                                          scorer=fuzz.ratio, score_cutoff=98)
        matches += [match[0] for match in match_list]
        match_list = process.extractBests(ing, self.sanitized_comp_ing_dict.keys(),
                                          scorer=fuzz.token_sort_ratio, score_cutoff=85)
        matches += [match[0] for match in match_list]
        if not matches:
            # Fall back to matching each word of the ingredient separately
            sub_ing = ing.split()
            for i in sub_ing:
                match_list = process.extractBests(i, self.sanitized_comp_ing_dict.keys(),
                                                  scorer=fuzz.ratio, score_cutoff=88)
                matches += [match[0] for match in match_list]
    return list(set(matches))
def _find_person(self, player_name, choices):
    print('Searching %s' % player_name)
    matching_results = process.extractBests(unidecode(player_name), choices,
                                            scorer=fuzz.partial_token_set_ratio,
                                            score_cutoff=75)
    if len(matching_results) > 0:
        # If the best matches are tied on score, search again with a different method
        best_score = 0
        creme = dict()
        for name, score, plid in matching_results:
            if score >= best_score:
                best_score = score
                creme.update({plid: name})
            else:
                # The list returned by extractBests is sorted, so we can stop
                # as soon as the score drops.
                break
        # How many top scores?
        if len(creme) == 1:
            plid, plname = creme.popitem()
            print('Found %s at first round with ratio %s' % (plname, best_score))
            matching_player = FootballPerson.objects.get(pk=plid)
            return matching_player, best_score
        else:
            print('Multiple matches found with ratio %s, refining...' % best_score)
            refine_results = process.extractBests(player_name, creme,
                                                  scorer=myfuzz.partial_token_set_ratio_with_avg)
            plname, ratio, plid = refine_results[0]
            print('Found %s at second round with ratio %s then %s' % (plname, best_score, ratio))
            matching_player = FootballPerson.objects.get(pk=plid)
            return matching_player, best_score
    else:
        print("Alert: no match for %s" % player_name)
        return None, 0.0
def fuzzy_match(
        mistyped_command: str, score_cutoff: int = 75
) -> Optional[Tuple[str, int]]:
    match = process.extractBests(
        mistyped_command, FUZZY_COMMANDS.keys(), score_cutoff=score_cutoff, limit=1
    )
    if len(match) == 0:
        match = process.extractBests(
            mistyped_command, FUZZY_ALIASES.keys(), score_cutoff=score_cutoff, limit=1
        )
    return match[0] if len(match) != 0 else None
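# Usage sketch (the command tables below are assumed, for illustration only):
# extractBests returns a list even with limit=1, hence the emptiness checks above.
from typing import Optional, Tuple
from fuzzywuzzy import process

FUZZY_COMMANDS = {'status': None, 'deploy': None}
FUZZY_ALIASES = {'st': None}

print(fuzzy_match('statsu'))  # -> ('status', <score>)
print(fuzzy_match('qqqqqq'))  # -> None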
def buildEachLocatorTagPattern(self, eachLocatorArray, i, sourceDataProcessed, patternBuild):
    try:
        import re
        eachLocator = eachLocatorArray[i].upper()
        # If a string contains a number, keep it out of the fuzzy search
        # searchOnlyNumAndCharObj = re.search(r'^[0-9-`!@#$%^&*()_+=\\|}\]\[{\';:\/\?>\.,<~ ]+$', eachLocator)
        searchOnlyNumAndCharObj = re.search(r'[0-9]', eachLocator)
        if searchOnlyNumAndCharObj:
            # Convert each digit to the corresponding regular expression.
            # Raw strings are needed: the original '\d' replacement is an
            # invalid escape in modern Python.
            bestMatch = re.sub(r'\d', r'\\d', eachLocator)
            if i == 0:
                patternBuild = patternBuild + bestMatch
            else:
                patternBuild = patternBuild + '(.*)' + bestMatch
            return patternBuild
        else:
            # Get the fuzzy-matching string array
            matchingFuzzyWord = self.getFuzzySearchData(eachLocator, sourceDataProcessed)
            if len(process.extractBests(eachLocator, matchingFuzzyWord)) > 0:
                # Find the best among them. Selecting an appropriate confidence
                # limit is very important; one heuristic is string length.
                # TODO: find other parameters for improving this
                if len(eachLocator) < 5:
                    confidenceLimit = 90
                else:
                    confidenceLimit = 80
                bestMatch, confidence = process.extractBests(eachLocator, matchingFuzzyWord, limit=1)[0]
                if len(matchingFuzzyWord) > 0 and confidenceLimit < confidence:
                    bestMatch = self.regularExpressionHandling(bestMatch, 0)
                    if i == 0:
                        patternBuild = patternBuild + bestMatch
                    else:
                        patternBuild = patternBuild + '(.*)' + bestMatch
                    return patternBuild
                else:
                    if i == 0 or len(eachLocatorArray) == i + 1:
                        # If the first or last locator does not match, there is no
                        # need for further processing (search optimization)
                        return False
            elif len(matchingFuzzyWord) == 0 and (i == 0 or len(eachLocatorArray) == i + 1):
                # Same optimization: bail out when the first or last locator has no candidates
                return False
    except Exception as e:
        print('error in buildEachLocatorTagPattern in StringHandling', e)
        return False
def search(bank_query: str, branch_query: str):
    """
    :param bank_query: Bank string. Example: "icici", "hdfc bank"
    :param branch_query: Branch address string.
    :return: DF row/object of the matching branch with IFSC & MICR code

    Step 1: Load CSV into a DF
    Step 2: Slice the results to the substring, i.e. bank_query
    Step 3:
        3.1) Fuzzy search the BRANCH col. with branch_query.  BRANCH_NAME
        3.2) Fuzzy search the ADDRESS col. with branch_query. ADDRESS
        3.3) Compare the fuzzy search scores of 3.1 and 3.2. If both scores are
             equal or BRANCH_NAME > ADDRESS, return the BRANCH_NAME row,
             else the ADDRESS row.

    Order of CSV headers:
    ['IFSC', 'BRANCH', 'CENTRE', 'DISTRICT', 'STATE', 'ADDRESS', 'CONTACT',
     'IMPS', 'RTGS', 'CITY', 'NEFT', 'MICR']
    """
    # Step 1: Find the branch from the `BANK` column
    # bank_dict = process.extractBests('ICIC', df1.BANK.to_dict(), score_cutoff=90, limit=None)
    # bank_branches = df1.loc[[l[2] for l in bank_dict]]
    # OR =>
    # PS: The following will fail if the bank name provided exceeds the bank name we have
    bank_branches = df1[df1['BANK'].str.lower().str.contains(bank_query.lower())]
    print('===== ALL BRANCHES OF BANK ======'.center(10))
    print(bank_branches)
    # FYI, to convert a whole column to lower case: bank_branches.ADDRESS.str.lower()
    branch_matches = process.extractBests(branch_query, bank_branches.BRANCH.to_dict())
    address_matches = process.extractBests(branch_query, bank_branches.ADDRESS.to_dict())
    best_branch = branch_matches[0]
    best_address = address_matches[0]
    print(best_address, best_branch)
    # Equal scores or BRANCH > ADDRESS both favor the branch-name row
    if best_branch[1] >= best_address[1]:
        optimal_result = best_branch
    else:
        optimal_result = best_address
    result_index = optimal_result[2]
    result_row = bank_branches.loc[result_index]
    print('==== Queried BRANCH ===='.center(10))
    print(result_row)
    logger.error("{}".format(result_row))
    return result_row
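# Note on the mechanism above (toy data, not the real IFSC CSV): when the
# choices are a dict -- here a pandas Series via .to_dict() -- each result is a
# (value, score, key) 3-tuple, which is why optimal_result[2] recovers the row index.
import pandas as pd
from fuzzywuzzy import process

df = pd.DataFrame({'BRANCH': ['fort mumbai', 'connaught place', 'koramangala']})
matches = process.extractBests('conought place', df.BRANCH.to_dict())
print(matches[0])             # ('connaught place', <score>, 1)
print(df.loc[matches[0][2]])  # the full matching row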
def get_categories_related(categories_raw, min_score=90, min_bad_score=80,
                           is_name=False, names=None):
    if categories_raw:
        categories_raw = clean(categories_raw, is_name)
        if names:
            names = clean(names, True)
    if categories_raw:
        match_categories = []
        for name, attrs in categories_json.items():
            not_choices = attrs.get('banned')
            bad_results = process.extractBests(categories_raw, not_choices,
                                               scorer=fuzz.partial_token_set_ratio,
                                               score_cutoff=min_bad_score)
            if names:
                bad_results += process.extractBests(names, not_choices,
                                                    scorer=fuzz.partial_token_set_ratio,
                                                    score_cutoff=min_bad_score)
            if not bad_results:
                choices = attrs.get('tokens')
                results = process.extractBests(categories_raw, choices,
                                               scorer=fuzz.partial_token_set_ratio,
                                               score_cutoff=min_score)
                if results:
                    match_categories.append(name)
                    result_keys = {result[0] for result in results}
                    for cat in attrs.get('subcats'):
                        choices = set(list(cat.values())[0].get('tokens'))
                        if choices & result_keys:
                            not_choices = list(cat.values())[0].get('banned')
                            bad_results_sub = process.extractBests(
                                categories_raw, not_choices,
                                scorer=fuzz.partial_token_set_ratio,
                                score_cutoff=min_bad_score)
                            if names:
                                bad_results_sub += process.extractBests(
                                    names, not_choices,
                                    scorer=fuzz.partial_token_set_ratio,
                                    score_cutoff=min_bad_score)
                            if not bad_results_sub:
                                cat_name = list(cat.keys())[0]
                                match_categories.append(cat_name)
        if {"Mascotas", "Autos, Motos y llantas"} & set(match_categories):
            aux = set(match_categories) - {"Super"}
            if len(aux) > 1:
                return []
        return match_categories
    else:
        return []
def get_best_match_location(manga_name, choices):
    """
    Extracts the best possible matching manga. If it cannot find the exact
    match, asks the user for the best matches.
    """
    matches = process.extractBests(manga_name, choices, limit=10, score_cutoff=100)
    if len(matches) == 0:
        matches = process.extractBests(manga_name, choices, limit=10, score_cutoff=80)
    match = ask_best_match(matches)
    return website_specific.absoulute_location(match)
def get_normalized(starting_name: str, names_coll: list):
    reading = process.extractBests(starting_name, names_coll, score_cutoff=75)
    candidates = [starting_name]
    best_n = starting_name
    max_n = len(reading)
    while reading != []:
        (new, points) = reading.pop()
        if points < 100 and new not in candidates:
            candidates.append(new)
            # The original referenced `li`, an undefined name; names_coll is meant
            tmp = process.extractBests(new, names_coll, score_cutoff=75)
            reading = reading + tmp
            if len(tmp) > max_n:
                max_n = len(tmp)  # track the new maximum so best_n stays meaningful
                best_n = new
    return (best_n, candidates)
def get_match(word, word_list, precision=90):
    if len(word) == 0:
        return ''
    # extractBests returns a list of (match, score) tuples
    best_match = process.extractBests(word, word_list, limit=1)
    if not best_match or best_match[0][1] <= precision:
        return ''
    return best_match[0][0]
def extract(self, expectation, limit=4):
    """extract(extraction, limit=4)

    Returns the address or addresses within the set of the reverse geocoded
    addresses that best match an expected result. Uses fuzzywuzzy under the
    hood for matching.

    Args:
        expectation (str): The string indicating your expected result for a
            reverse geocoding operation. It should probably look like an
            address. Results are returned in the order that best meets this
            expected address.

    Kwargs:
        limit (int): The maximum number of match candidates to retrieve
            from fuzzywuzzy. The length of the returned array may be longer,
            if the set of addresses has identical addresses that are good
            matches for the expected address (i.e. if two geocoders resolve
            to the same string address).

    Returns:
        list. Return value is a list of tuples, where each tuple contains a
        geopy Location, and a matching score based on an extension of the
        Levenshtein distance between the expectation and the Location's
        address (a higher score is a better match). The algorithm is
        implemented by SeatGeek's fuzzywuzzy, and you can read more here:
        http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
    """
    extractions = fuzzyprocess.extractBests(
        expectation,
        [str(a) for a in self.addresses],
        limit=limit)
    result = []
    for extraction in extractions:
        result.extend([(x, extraction[1]) for x in self.addresses
                       if str(x) == extraction[0]])
    return result
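# Minimal sketch of the docstring's contract (stand-in addresses; real callers
# pass geopy Locations, but only str() of each entry matters for the matching).
from fuzzywuzzy import process as fuzzyprocess

addresses = ['1600 Pennsylvania Ave NW, Washington, DC',
             '1600 Amphitheatre Pkwy, Mountain View, CA']
best = fuzzyprocess.extractBests('1600 Pennsylvania Avenue, Washington DC',
                                 addresses, limit=4)
print(best[0])  # (closest address string, score)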
def test_identical_strings_extracted(scorer, processor, data):
    """
    Test that identical strings will always return a perfect match.

    :param scorer:
    :param processor:
    :param data:
    :return:
    """
    # Draw a list of random strings
    strings = data.draw(
        st.lists(st.text(min_size=10, max_size=100), min_size=1, max_size=50))
    # Draw a random integer for the index in that list
    choiceidx = data.draw(st.integers(min_value=0, max_value=(len(strings) - 1)))
    # Extract our choice from the list
    choice = strings[choiceidx]
    # Check process doesn't make our choice the empty string
    assume(processor(choice) != '')
    # Extract all perfect matches
    result = process.extractBests(choice, strings,
                                  scorer=scorer,
                                  processor=processor,
                                  score_cutoff=100,
                                  limit=None)
    # Check we get a result
    assert result != []
    # Check the original is in the list
    assert (choice, 100) in result
def item(self, name, fuzzy_threshold=100):
    """
    Extract a password from an unlocked Keychain using fuzzy matching.
    ``fuzzy_threshold`` can be an integer between 0 and 100, where 100
    is an exact match.
    """
    matches = process.extractBests(
        name,
        self._items.keys(),
        score_cutoff=(fuzzy_threshold - 1),
    )
    if matches:
        if len(matches) > 1:
            for i, m in enumerate(matches):
                sys.stderr.write('[%s] %s\n' % (i, m[0]))
            sys.stdin = open('/dev/tty')
            try:
                exact_name = matches[int(input())][0]  # raw_input() in Python 2
            except (ValueError, IndexError):  # the original used a bare except
                return None
        else:
            exact_name = matches[0][0]
        item = self._items[exact_name]
        item.decrypt_with(self)
        return item
    else:
        return None
def bot_message(self, cpu_lookup):
    self.cpu_lookup = cpu_lookup
    logging.info('Looking for CPU...')
    try:
        choices = []
        for cpu in self.cpu_list:
            match_criteria = fuzz.token_set_ratio(
                self.clean_input(cpu), self.clean_input(cpu_lookup))
            if match_criteria >= 45:
                choices.append(cpu)
        closest_match = process.extractOne(cpu_lookup, choices,
                                           scorer=fuzz.token_set_ratio,
                                           score_cutoff=85)
        logging.info(f"Searching: {cpu_lookup}, Closest: {closest_match}")
        closest_match_name = closest_match[0]
        bot_reply = self.display_cpu_info(closest_match_name)
    except TypeError:
        # extractOne returns None below the cutoff; indexing it raises TypeError
        limit_choices = process.extractBests(cpu_lookup, choices)
        if limit_choices:
            bot_reply = (f"No direct CPU match found for **{cpu_lookup}**, "
                         f"displaying {len(limit_choices)} potential matches:\n\n")
            search_results = ''
            for result in limit_choices[:6]:
                cpu_name = result[0]
                search_results += f"[{cpu_name}]({self.cpu_list[cpu_name]})\n\n"
            bot_reply += search_results
            bot_reply += ("\n\nFeel free to ask me again (`CPUBot! cpu model`) "
                          "with these models or visit PassMark directly!\n")
        # Handles no results being found in search
        if not limit_choices:
            bot_reply = (f"\n\nI'm sorry, I couldn't find any information on "
                         f"**{cpu_lookup}**.\n\nPlease feel free to try again; "
                         f"perhaps you had a spelling mistake, or your CPU is not "
                         f"listed in the [Passmark CPU list]({self.passmark_page}).")
    return bot_reply
def search_without_properties_list(self):
    """ Initial search that does not use the list of related words """
    search_list = Suggest.root.suggest_db[self.first_let][str(self.len_word)]
    tokens = self.search_token(self.search_word, list(search_list.keys()), 50, 3)
    for token, percent in tokens:
        tokens_list = search_list[token[:self.len_word]]
        token = process.extractBests(self.search_word, tokens_list,
                                     limit=3, score_cutoff=50)
        self.logger.debug(msg=[token, tokens_list])
        for word, percent in token:
            if word not in self.root.stop_words:
                word_prop = process.extractOne(
                    word, Suggest.root.search_words_db[word[0]].keys())[0]
                gm_names = Suggest.root.search_words_db[word[0]][word_prop]['gm_name']
                self.res_list.append(
                    (self.start_phrase + word, percent, gm_names))
def upload_file():
    data = {"success": False}
    print(request.files)
    if request.files.get('file'):
        # read the file
        file = request.files['file']
        # read the filename
        filename = file.filename
        # create a path to the uploads folder
        filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
        file.save(filepath)
        prediction = predict(filepath)
        breed = prediction[0][1]
        animal_outcome = pd.read_csv('../Animals.csv')
        r = process.extractBests(breed.replace('_', ' '),
                                 animal_outcome.Breed.to_dict(),
                                 scorer=partial_ratio,
                                 score_cutoff=70,
                                 limit=1000000000)
        # .loc needs a concrete list, not a lazy map object, in Python 3
        animal_analysis = animal_outcome.loc[[x[-1] for x in r]]
        return animal_analysis.to_json(orient='records')
        # return render_template('table.html', dogs=animal_analysis.to_dict(orient='records'))
    return jsonify({"error": "there is an error!"})
def fuzzyExtract(self, qs, ls, threshold):
    '''
    TODO: fuzzy search separation into words

    :param qs: query string
    :param ls: large string
    :param threshold: threshold
    :return:

    Fuzzy matches 'qs' in 'ls' and returns a list of (word, index) tuples.
    '''
    if len(qs) < self.fuzzySearchOptimumLength:
        processThreshold = 60
        max_l_dist = 0
    else:
        processThreshold = threshold
        max_l_dist = 1
    for word, confidence in process.extractBests(qs, (ls,), score_cutoff=processThreshold):
        print('word {}'.format(word), confidence)
        for match in find_near_matches(qs, word, max_l_dist=max_l_dist):
            match = word[match.start:match.end]
            print('match {}'.format(match))
            index = ls.find(match)
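# Self-contained sketch of the same two-stage idea, assuming the fuzzysearch
# package provides find_near_matches as used above.
from fuzzywuzzy import process
from fuzzysearch import find_near_matches

text = "the quick brown fox jumpd over the lazy dog"
query = "jumped"

# Stage 1: is the whole string a plausible container for the query?
for candidate, score in process.extractBests(query, (text,), score_cutoff=60):
    # Stage 2: locate the approximate occurrence inside it
    for m in find_near_matches(query, candidate, max_l_dist=1):
        print(candidate[m.start:m.end], m.start)  # e.g. 'jumpd' 20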
def get_best_events_matching(event_list: list, search: str):
    if len(event_list) == 0:
        return []
    if search is None or len(search) == 0:
        return event_list
    try:
        i = int(search)
        for event in event_list:
            if event["id"] == i:  # `event_id` was undefined in the original
                return [event]
    except (ValueError, TypeError):
        pass
    # try doing a search through the names of events:
    event_names = [(e["event_name"], e) for e in event_list]

    def process_by_first(v):
        return fuzzy_fullproc(v[0])

    results = process.extractBests((search, None), event_names,
                                   processor=process_by_first,
                                   scorer=fuzz.partial_ratio,
                                   score_cutoff=90)
    rl = []
    for e, score in results:
        rl.append(e[1])
    return rl
def fuzzy_combine_def(self, definitions, scorer=None):
    d = dict(definitions)
    order_key = lambda name: d[name]
    result = []
    names = set(d.keys())
    if scorer is None:
        scorer = fuzz.token_set_ratio
    while names:
        first = names.pop()
        similar = process.extractBests(first, names, scorer=scorer,
                                       limit=1000, score_cutoff=65)
        similar_names = [name for name, _ in similar]
        for name in similar_names:
            names.remove(name)
        res_names = [first] + similar_names
        ordered_by_score = sorted(res_names, key=order_key, reverse=True)
        total_score = sum(d[name] for name in ordered_by_score)
        result.append((ordered_by_score, total_score))
    resorted = sorted(result, key=operator.itemgetter(1), reverse=True)
    return resorted
def DupliFuzzyMatch(query):
    # choices = list(result[['FullTitle']].itertuples(index=False, name=None))
    # choices = list(dict.fromkeys(choices))
    choices = result['FullTitle'].tolist()
    prc = process.extractBests(query, choices, score_cutoff=60, limit=10)
    prc = list(dict.fromkeys(prc))
    lim = len(prc)
    if lim > 0:
        df = pd.DataFrame([])
        for i in range(0, lim):
            prcc = Dup_Count[Dup_Count['FullTitle'].str.match(prc[i][0])]
            df = df.append(prcc)  # pandas < 2.0; newer versions need pd.concat
        df = df.drop_duplicates(['Titles'], keep='last')
        df = df[['FullTitle', 'System', 'Count']]
        df = df.sort_values(['Count'], ascending=False)
        df.reset_index(drop=True, inplace=True)
    else:
        df = pd.DataFrame(columns=['FullTitle', 'System', 'ISBN13'])
    return df
def fuzzy_search(query_string, options_dict, scorer=fuzz.QRatio, score_cutoff=81):
    """
    Uses fuzzy search to find the best matches for `query_string` in `options_dict`.

    Args:
        query_string (str): String used to search for matches
        options_dict (list): List of options to find matches in.
        scorer (optional): fuzzywuzzy scorer to use when searching for matches.
            Defaults to fuzz.QRatio.
        score_cutoff (int, optional): Similarity score cutoff threshold.
            Defaults to 81.

    Returns:
        list: List of matching (word, score) tuples
    """
    # Pass the scorer argument through instead of hard-coding fuzz.QRatio
    fuzzy_results = process.extractBests(query_string, options_dict,
                                         scorer=scorer,
                                         score_cutoff=score_cutoff)
    return fuzzy_results
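# Quick illustration of why passing `scorer` through matters: different scorers
# rank the same choices differently (the options below are made up).
from fuzzywuzzy import fuzz, process

options = ['new york city', 'york', 'newark']
print(fuzzy_search('new york', options))  # strict QRatio default, likely []
print(fuzzy_search('new york', options, scorer=fuzz.partial_ratio, score_cutoff=60))
# looser: substring-style matches such as 'new york city' and 'york'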
def fuzzy_match(input_string, choices, n=3, score_cutoff=50):
    try:
        # zip(*pairs) transposes [(match, score), ...] into (matches, scores);
        # the IndexError branch covers the no-match case
        return list(
            zip(*(fuzzy_process.extractBests(
                input_string, choices, score_cutoff=score_cutoff))))[0][:n]
    except IndexError:
        return []
def fuzzy_extract_matches(word, possibilities, junk_seq=None, n=3, cutoff=0.5):
    # difflib-style signature: cutoff is 0..1, fuzzywuzzy scores are 0..100
    score_cutoff = int(cutoff * 100)
    matches = process.extractBests(word, possibilities,
                                   score_cutoff=score_cutoff, limit=n)
    final_matches = []
    for m, s in matches:
        final_matches.append((m, s / 100))
    return final_matches
def get_match(word_list: list, word: str, score_cutoff: int = 60,
              isPartial: bool = False, limit: int = 1):
    """Uses fuzzywuzzy to see if word is close to entries in word_list

    Returns a tuple of (MATCH, SCORE)
    """
    if not word:
        return (None, None)
    result = None
    scorer = fuzz.ratio
    if isPartial:
        scorer = fuzz.partial_ratio
    if limit == 1:
        result = process.extractOne(word, word_list,
                                    scorer=scorer,
                                    score_cutoff=score_cutoff)
    else:
        result = process.extractBests(word, word_list,
                                      scorer=scorer,
                                      score_cutoff=score_cutoff,
                                      limit=limit)
    if not result:
        return (None, None)
    return result
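# Usage sketch (invented word list). Note the return shape differs by limit:
# a single (match, score) tuple for limit=1, a list of tuples otherwise.
from fuzzywuzzy import fuzz, process

words = ['apple', 'apricot', 'banana']
print(get_match(words, 'appel'))                        # ('apple', <score>)
print(get_match(words, 'ap', isPartial=True, limit=2))  # [('apple', ...), ('apricot', ...)]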
def get_fuzzy_search_movies(fuzzy, repo):
    """Fuzzy-match a query against all movie titles in the repository."""
    movies = list(repo.dataset_of_movies)
    movie_title_dict = dict(enumerate([movie.title for movie in movies]))
    # With a dict of choices, extractBests yields (title, score, index) tuples
    best_matches = process.extractBests(fuzzy, movie_title_dict, score_cutoff=50)
    return [utils.movie_to_dict(movie)
            for movie in [movies[z] for (x, y, z) in best_matches]]
def address_candidates(input_data, cutoff_info):
    """
    Returns address candidates
    """
    output = {}
    st_num = input_data['st_num']
    st_name = input_data['st_name']

    if input_data['appeal_type'] == "detroit_single_family":
        cutoff = cutoff_info['detroit']
        region = 'detroit'
    elif input_data['appeal_type'] == "cook_county_single_family":
        cutoff = cutoff_info['cook']
        region = 'cook'

    mini = address_candidates_query(region, st_num)

    candidate_matches = process.extractBests(st_name, mini.st_name, score_cutoff=50)
    selected = mini[mini['st_name'].isin([i for i, _, _ in candidate_matches])].copy()
    selected['Distance'] = 0

    if input_data['appeal_type'] == "detroit_single_family":
        selected = prettify_detroit(selected, False)
    elif input_data['appeal_type'] == "cook_county_single_family":
        selected = prettify_cook(selected, False)

    selected['eligible'] = selected.assessed_value <= cutoff
    selected.dropna(axis=0, inplace=True)

    output['candidates'] = selected.to_dict(orient='records')
    if len(output['candidates']) == 0:  # if none found, raise
        raise Exception('No Matches Found')
    return output
def view_script(title_to_href):
    # This is what will get the actual script of a single movie
    movie_names_row = input(
        '\033[31m Enter the movie name with the year in parentheses, e.g. 99 Homes (2014): \033[0m'
    )
    # The first argument of extractBests is the user's input, the second is the
    # list of all movie scripts, and `limit` caps the number of results
    movie_names = process.extractBests(movie_names_row, title_to_href.keys(),
                                       limit=1, score_cutoff=70)
    titles = [movie_name[0] for movie_name in movie_names]
    title = titles[0]
    print(title)
    if input('\033[31m Type "yes" to continue: \033[0m') == 'yes':
        if title in title_to_href:
            print(title_to_href[title])
        else:
            print('No results')
        page_url = ("https://www.springfieldspringfield.co.uk/movie_script.php?movie="
                    + title_to_href[title])
        page = urllib.request.urlopen(page_url)
        soup = BeautifulSoup(page, 'html.parser')
        script = soup.find("div", {"class": "movie_script"})
        print(script.text)
    else:
        pass
def normalize_country(country):
    if process.extractBests(country, choices=['N/A', 'n/a'], score_cutoff=HIGH_CUTOFF):
        return None
    if country == 'null':
        return None
    if country in ['USA', 'UK']:
        return country
    if process.extractOne(country, choices=['United Kingdom', 'UK'], score_cutoff=HIGH_CUTOFF):
        return 'UK'
    if process.extractOne(country, choices=['England', 'Wales', 'Scotland'], score_cutoff=HIGH_CUTOFF):
        return 'UK'
    if process.extractOne(country, choices=['Russia', 'Russian Federation'], score_cutoff=LOW_CUTOFF):
        return 'Russia'
    try:
        country = pycountry.historic_countries.get(name=country.capitalize())
        return country.name
    except KeyError:
        pass
    try:
        country = pycountry.historic_countries.get(alpha2=country.upper())
        return country.name
    except KeyError:
        pass
    countries = [country.name for country in pycountry.countries]
    best_match = process.extractOne(country, choices=countries, score_cutoff=90)
    return best_match if best_match else country
def get_grid(name):
    name = name.lower()
    if name in grid_lookup.keys():
        return grid_lookup[name]
    if name in grid_names:
        index = grid_names.index(name)
        return grid_uk[index].get('id')
    elif name.replace('the ', '', 1) in grid_names:
        index = grid_names.index(name.replace('the ', '', 1))
        return grid_uk[index].get('id')
    fuzznames = grid_names + [name for name in grid_lookup.keys()]
    fuzzmatch = process.extractBests(
        name.replace('university', '').replace('the', ''), fuzznames)
    print('\nName to match:', name)
    for i, match in enumerate(fuzzmatch):
        print(' ' + str(i) + ': ' + match[0])
    i = input('Select a match (0-indexed) or give the correct GRID:')
    try:
        i = int(i)
        return grid_uk[grid_names.index(fuzzmatch[i][0])].get('id')
    except ValueError:
        return i
def fuzMatch(query):
    # choices = result['FullTitle'].tolist()
    choices = list(result[['FullTitle', 'System', 'ISBN13']].itertuples(index=False, name=None))
    choices = list(dict.fromkeys(choices))
    prc = process.extractBests(query, choices, score_cutoff=60, limit=10)
    prc = list(dict.fromkeys(prc))
    lim = len(prc)
    if lim > 0:
        labels = ['FullTitles', 'Score']
        df = pd.DataFrame.from_records(prc, columns=labels)
        df1 = pd.DataFrame(df['FullTitles'].values.tolist(), index=df.index)
        df = pd.concat([df1, df['Score']], axis=1, sort=False)
        df.rename(columns={0: 'FullTitle', 1: 'System', 2: 'ISBN13'}, inplace=True)
    else:
        df = pd.DataFrame(columns=['FullTitle', 'System', 'Score', 'ISBN13'])
    return df
def main() -> None:
    args = sys.argv
    first = args[1]
    second: Optional[str]
    try:
        second = args[2]
    except IndexError:
        second = None
    first_res: Tuple[str, int] = process.extractOne(first, first_args)
    if first_res[1] >= MIN_SCORE:
        if first_res[0] == "spells":
            if second is None:
                print(spells)
            else:
                spell_keywords = get_spell_keywords()
                second_res: List[Tuple[str, int]] = process.extractBests(
                    second, spell_keywords, score_cutoff=MIN_SCORE)
                potential_matches = [second_res[0][0]]
                potential_matches.extend([
                    x[0] for x in second_res[1:]
                    if second_res[0][1] - x[1] <= 5 and second_res[0][1] != 100
                ])
                if len(potential_matches) > 1:
                    string = "Multiple potential matches found:"
                    for match in potential_matches:
                        string += f"\n{match}"
                    print(string)
                elif len(potential_matches) == 1:
                    print_keyword_info(potential_matches[0])
                else:
                    print(second_res)
def thumbnail(self):
    """
    To find a list of thumbnail candidates, collections can be filtered
    inexactly against the title of the page (to catch spelling errors).
    """
    colls = []
    q_title = Collection.objects.filter(name__iexact=self.title)
    q_slug = Collection.objects.filter(name__iexact=self.slug)
    # We cast a very broad net to find all collections with a name matching the
    # slug or title of the page. Collections created by Mediabox will have a
    # name matching the name of the subdirectory they came from, so names are
    # not unique.
    colls.extend(q_title)  # extend, not append: keep a flat list of collections
    colls.extend(q_slug)
    candidates = []
    for coll in colls:
        imgs_for_coll = Image.objects.filter(collection=coll)
        candidates.extend(entry.title for entry in imgs_for_coll)
    for query in ['thumbnail', 'directory', self.title]:
        hit = process.extractBests(query, candidates, score_cutoff=70)
        if hit:
            # extractBests returns (title, score) tuples
            return Image.objects.get(title__iexact=hit[0][0])
    consolation = random.choice(candidates)  # Just pick one.
    return Image.objects.get(title__iexact=consolation)
def _handle_not_found(self, query):
    """
    Handles when a query does not match a valid command or cog.

    Will pass on possible close matches along with the
    ``HelpQueryNotFound`` exception.

    Parameters
    ----------
    query: str
        The full query that was requested.

    Raises
    ------
    HelpQueryNotFound
    """
    # combine command and cog names
    choices = list(self._bot.all_commands) + list(self._bot.cogs)
    result = process.extractBests(query, choices, scorer=fuzz.ratio, score_cutoff=90)
    raise HelpQueryNotFound(f'Query "{query}" not found.', dict(result))
def enrich_supports(rows):
    relevant_rows = 0
    matched_rows = 0
    for row in rows:
        bests = []
        if row['request_type'] == 'א3':
            relevant_rows += 1
            payments = row['payments']
            if payments and len(payments) > 0:
                payment = payments[0]
                key = (payment['support_title'], payment['supporting_ministry'])
                if key in cache:
                    bests = cache[key]
                else:
                    bests = process.extractBests(
                        {
                            'purpose': key[0],
                            'office': key[1]
                        },
                        criteria,
                        processor=id,
                        scorer=criteria_scorer
                    )
                    cache[key] = bests
        if len(bests) > 0:
            matched_rows += 1
            row['criteria_docs'] = [x[0] for x in bests]
        yield row
    logging.info('MATCH STATS: rel: %d, matched: %d', relevant_rows, matched_rows)
def word_match(words, menu, vector_dict, threshold=93, best_matches=None):
    """Matches single-word keyword vectors to menu items

    Args:
        words (list): a list of single keywords
        menu (dict): a dictionary of a menu for a given restaurant
        vector_dict (dict): a dictionary whose keys are keywords and values are vectors
        threshold (int): fuzzy-match score required for keyword matching
        best_matches (dict): dict created in best_matches to add to

    Returns:
        best_matches (dict): a nested dictionary whose outer keys are menu items,
            inner keys are tuples of keyword matches and their corresponding fuzzy
            match score, and values are the review vectors of each keyword
    """
    if not best_matches:
        best_matches = {}
    for name, meals in menu.items():  # .iteritems() is Python 2 only
        for meal, value in meals.items():
            for food in value:
                item = remove_accents(food)
                if len(item.split()) == 1:
                    matches = process.extractBests(item, words, scorer=fuzz.QRatio,
                                                   score_cutoff=threshold, limit=3)
                    vectors = [vector_dict[match[0]] for match in matches]
                    best_matches[item] = dict(zip(matches, vectors))
    return best_matches
def get_breeze_user(self, name):
    self.get_all_users()
    # Put name in a dict of the same shape so get_name applies to it too
    temp_name = {'first_name': name, 'last_name': ''}
    get_name = lambda u: ' '.join([u['first_name'], u['last_name']])
    bests = process.extractBests(temp_name, self._people, get_name, limit=3)
    # Penalize 10 points if first names start with different letters
    best = (bests[0], 0)  # Hold best candidate and the score
    for item in bests:
        penalty = 0
        if name[0].casefold() != item[0]['first_name'][0].casefold():
            penalty = 10
        if item[1] - penalty > best[1]:
            best = (item, item[1] - penalty)
    if best[1] < 90:
        raise MatchNotFoundError
    return best[0]
def fuzzy_matches(x=None, query='', where='both', score_cutoff=50, limit=10):
    """
    Get the internal matches used in fuzzy search. Exposes the scores of
    each match. If an item is a 2-tuple the match is on a key; if the item
    is a 3-tuple the match is on a value.
    """
    # The original used `where is 'keys'`, an identity check; use == for strings
    if where == 'keys':
        results = process.extractBests(query, x.keys(),
                                       score_cutoff=score_cutoff, limit=limit)
    elif where == 'values':
        results = process.extractBests(query, x,
                                       score_cutoff=score_cutoff, limit=limit)
    elif where == 'both':
        key_matches = process.extractBests(query, x.keys(),
                                           score_cutoff=score_cutoff, limit=limit)
        val_matches = process.extractBests(query, x,
                                           score_cutoff=score_cutoff, limit=limit)
        # Combine results (the original assigned `matches` but returned `results`)
        results = key_matches + val_matches
    return results
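# Illustration of the 2-tuple/3-tuple distinction the docstring mentions:
# list-like choices yield (match, score), dict choices yield (value, score, key).
from fuzzywuzzy import process

d = {'color': 'red', 'colour_scheme': 'dark'}
print(process.extractBests('color', d.keys(), limit=2))  # [('color', 100), ('colour_scheme', ...)]
print(process.extractBests('red', d, limit=1))           # [('red', 100, 'color')]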
def getclosestelement(name, array):
    if name in array:
        return name
    try:
        close = process.extractBests(name, array)[0][0]
    except IndexError:  # the original used a bare `except:`
        close = difflib.get_close_matches(name, array)[0]
    return close
def search_collection(items, query, key='title'):
    """Levenshtein fuzzy search"""
    d = {i[key]: i for i in items}
    bests = process.extractBests(query, list(d.keys()))
    if bests:
        return [d[match[0]] for match in bests]
    else:
        return []
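# Usage sketch on an invented collection; matched records come back ordered by score.
items = [{'title': 'Dune', 'year': 1965},
         {'title': 'Dune Messiah', 'year': 1969}]
print(search_collection(items, 'dune mesiah'))
# -> the 'Dune Messiah' record ranked first, then 'Dune'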
def busca(self, query):
    query = normaliza_string(query)
    amostra = self.cria_amostra()
    scorer = seleciona_scorer(query)
    resultado = process.extractBests(query, amostra, limit=10,
                                     scorer=scorer, score_cutoff=65)
    if scorer == fuzz.token_set_ratio:
        resultado = process.extract(query, lapida_extracao(resultado),
                                    limit=20, scorer=fuzz.partial_ratio)
    return lapida_extracao(resultado)
def lookup(self, keyword):
    data = _concat(self.db.all())
    title = data.pop('Title')
    auth = data.pop('Authors')
    choices = title + auth
    searchKey = process.extractBests(keyword, choices)
    for i in searchKey:
        if i[1] >= 90:
            self.search(i[0])
def search(self, name):
    '''searches the list of movies for a movie'''
    movies_path = self.movies_path
    onlyfolders = [f for f in os.listdir(movies_path)
                   if os.path.isdir(os.path.join(movies_path, f))]
    results = process.extractBests(name, onlyfolders)
    for result in results:
        print(result[0])
def list(self, query=None, by_nick=False):
    if query is None:
        return self.licenses
    field = 'nick' if by_nick else 'title'
    choices = {getattr(lic, field): lic for lic in self.licenses}
    print(choices)  # `print choices` / dict.iterkeys() were Python 2 idioms
    field_values = extractBests(query, choices.keys())
    lics = [choices[v] for v, _ in field_values]
    return lics
def fuzzy_search(x=None, query='', where='both', score_cutoff=50, limit=10):
    """
    Search through keys, values, or both via fuzzy string comparison.
    """
    # The original's `where is 'keys'` relied on string interning; use ==
    if where == 'keys':
        matches = process.extractBests(query, x.keys(),
                                       score_cutoff=score_cutoff, limit=limit)
        results = {k: x[k] for k, score in matches}
    elif where == 'values':
        matches = process.extractBests(query, x,
                                       score_cutoff=score_cutoff, limit=limit)
        results = {k: v for v, score, k in matches}
    elif where == 'both':
        key_matches = process.extractBests(query, x.keys(),
                                           score_cutoff=score_cutoff, limit=limit)
        key_results = {k: x[k] for k, score in key_matches}
        val_matches = process.extractBests(query, x,
                                           score_cutoff=score_cutoff, limit=limit)
        val_results = {k: v for v, score, k in val_matches}
        # Combine results
        results = key_results.copy()
        results.update(val_results)
    return results
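# Usage sketch with a toy dict (string values, since fuzzywuzzy's default
# processor expects strings). where='values' exercises the 3-tuple unpacking.
inventory = {'ts': 'timeout seconds', 'mr': 'max retries'}
print(fuzzy_search(inventory, query='timeout', where='values'))  # {'ts': 'timeout seconds'}
print(fuzzy_search(inventory, query='ts', where='keys'))         # {'ts': 'timeout seconds'}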
def selectBestMatch(tokens, hits, qualityThreshold=50, creatorWeight=2, titleWeight=1):
    results = []
    for hit in hits:
        try:
            creators = set()
            normalizedScore = 0
            creatorScore = 0
            score = 0
            if 'title' in hit:  # dict.has_key() was removed in Python 3
                foo = process.extractBests(hit['title'], tokens)
                if len(foo) > 0:
                    score += foo[0][1] * titleWeight
            if 'creator' in hit:
                creators = creators.union(set(hit['creator']))
            if 'person' in hit:
                creators = creators.union(set(hit['person']))
            if 'contributor' in hit:
                creators = creators.union(set(hit['contributor']))
            if 'publisher' in hit:
                creators = creators.union(set(hit['publisher']))
            if len(creators) > 0:
                for creator in creators:
                    foo = process.extractBests(creator, tokens)
                    if foo is not None and foo != []:
                        creatorScore += foo[0][1]
                score += (creatorScore / len(creators)) * creatorWeight
            normalizedScore = score / (creatorWeight + titleWeight)
            if normalizedScore >= qualityThreshold:
                bisect.insort_left(results, (normalizedScore, hit))
        except UnicodeDecodeError:
            continue
    if len(results) > 0:
        return results[-1]
    return None
def match_lists(list1, list2, precision=90):
    '''
    Takes two lists.
    Returns a dict of matches (if the best match scores above precision) and a
    dict of non-matches (otherwise); both dicts have key = list1 element and
    value = best match.
    '''
    in_both = {}
    not_in_both = {}
    for manager1 in list1:
        best_match = process.extractBests(manager1, list2, limit=1)
        if best_match[0][1] > precision:
            in_both[manager1] = best_match[0][0]
        else:
            not_in_both[manager1] = best_match[0][0]
    return [in_both, not_in_both]
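# Sketch of splitting two rosters with this helper (names invented).
from fuzzywuzzy import process

team_a = ['Alex Ferguson', 'Arsene Wenger']
team_b = ['Alex Fergusson', 'Pep Guardiola']
matched, unmatched = match_lists(team_a, team_b)
print(matched)    # {'Alex Ferguson': 'Alex Fergusson'}
print(unmatched)  # {'Arsene Wenger': <best remaining candidate>}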
def get_closest_text_bump(search_text):
    search_text = {'text': search_text}
    # filter() is lazy in Python 3; materialize it so the truthiness check works
    text_bumps = list(filter(lambda bump: bump['text'], get_bumps()))
    if text_bumps:
        best_bumps = process.extractBests(
            search_text,
            text_bumps,
            scorer=fuzz.token_set_ratio,
            processor=lambda bump: bump['text'].lower()
        )
        # Keep only the bumps tied for the top score
        best_bumps = [
            bump[0] for bump in best_bumps
            if bump[1] == best_bumps[0][1]
        ]
        return get_bump_link(random.choice(best_bumps))
    else:
        return 'no dice m8'
def get_best(self, message):
    message = message.lower()
    tokens = self.create_tokens(message)
    # trump_quotes is dict-like here, so matches are (quote, score, key) 3-tuples
    matches = process.extractBests(' '.join(tokens), self.trump_quotes,
                                   score_cutoff=50, limit=3)
    if not matches:
        return None
    if re.search('trump|the donald', message):
        matches = [(x[0], x[1] + 10, x[2]) for x in matches]
    adjusted = []
    for x in matches:
        b_tokens = self.create_tokens(x[0])
        score = self.score_ngrams(tokens, b_tokens)
        adjusted.append((x[0], x[1] + score, x[2]))
    # Keep everything tied with the first entry's adjusted score
    top = [x for x in adjusted if x[1] == adjusted[0][1]]
    return top
def test_only_identical_strings_extracted(scorer, processor, data):
    """
    Test that only identical (post processing) strings score 100 on the test.

    If two strings are not identical then using full comparison methods they
    should not be a perfect (100) match.

    :param scorer:
    :param processor:
    :param data:
    :return:
    """
    # Draw a list of random strings
    strings = data.draw(
        st.lists(
            st.text(min_size=10, max_size=100, alphabet=HYPOTHESIS_ALPHABET),
            min_size=1,
            max_size=10)
    )
    # Draw a random integer for the index in that list
    choiceidx = data.draw(st.integers(min_value=0, max_value=(len(strings) - 1)))
    # Extract our choice from the list
    choice = strings[choiceidx]
    # Check process doesn't make our choice the empty string
    assume(processor(choice) != '')
    # Extract all perfect matches
    result = process.extractBests(choice, strings,
                                  scorer=scorer,
                                  processor=processor,
                                  score_cutoff=100,
                                  limit=None)
    # Check we get a result
    assert result != []
    # Check THE ONLY result(s) we get are a perfect match for the (processed) original
    pchoice = processor(choice)
    for r in result:
        assert pchoice == processor(r[0])
def extrairCitadosTextoBestExtract(link, f):
    listaFamosos = model.buscarListaFamoso()
    noticiaDB = model.buscarNoticia(link)
    if noticiaDB != False and listaFamosos != False:
        f.writelines(str(noticiaDB[1]) + "\n")
        noticia = {'id': noticiaDB[0], 'titulo': noticiaDB[1],
                   'subtitulo': noticiaDB[2], 'link': noticiaDB[3],
                   'tipo': noticiaDB[4], 'texto': noticiaDB[5]}
        print(str(noticia['titulo']))
        famosos = []
        for j in listaFamosos:
            famosos.append(j[1])
        textoTotal = noticia['titulo'] + " " + noticia['subtitulo'] + " " + noticia['texto']
        # The original passed None, None, 57 positionally (valid in old
        # fuzzywuzzy versions); keyword args keep the defaults on newer ones
        result = process.extractBests(textoTotal, famosos,
                                      score_cutoff=57, limit=10000)
        for r in result:
            f.writelines(str(r[0]) + "," + str(round(r[1], 1)) + "\n")
        return result
def pdf(pdf_files, shared_pdf, bibtex_folder, bibtex_files, gscholar):
    for pdf in pdf_files:
        txt = re.sub(r"\W", " ", gs.convert_pdf_to_txt(pdf)).lower()
        # Research determined that cutting to 35 words gives the highest accuracy
        words = txt.strip().split()[:35]
        words = " ".join(words)
        print(words)  # `print words` was Python 2 syntax
        if gscholar == True:
            bib = load(gs.pdflookup(pdf, all, gs.FORMAT_BIBTEX)[0])
            keys = bib.entries[0].keys()
            matching = [s for s in keys if "pdf" in s]
            if len(matching) - 1 <= 0:
                key = 'pdf'
            else:
                key = 'pdf' + str(len(matching))
            # link = os.symlink(pdf, str(shared_pdf) + str(pdf.split('/')[-1]))
            bib.entries = [add_field(bib.entries[0], key, bib)]
            bibtex(bib.entries, bibtex_folder, bib)
            sleep(random.uniform(1, 2))
        else:
            best_match = process.extractBests(words, bibtex_files, limit=1)
            print(best_match)
            if best_match:
                bib = best_match[0][0]
                score = best_match[0][1]
                # Research determined that a matching score of 45 gives the
                # highest accuracy
                if score > 45:
                    with open(bib, 'r') as f:
                        db = load(f)
                    entries = db.entries[0]
                    keys = entries.keys()
                    matching = [s for s in keys if "pdf" in s]
                    if len(matching) - 1 <= 0:
                        key = 'pdf'
                    else:
                        key = 'pdf' + str(len(matching))
                    entries = add_field(entries, key, bib)
                    with open(bib, 'w') as f:
                        f.write(writer._entry_to_bibtex(entries))
def extrairCitadosTexto(threadName, queue, listaFamosos):
    queueLock.acquire()
    while not workQueue.empty():
        n = queue.get()
        print("Queue: " + str(workQueue.qsize()))
        queueLock.release()
        noticia = {'id': n[0], 'titulo': n[1], 'subtitulo': n[2],
                   'link': n[3], 'tipo': n[4], 'texto': n[5]}
        tempoBest = time.perf_counter()  # time.clock() was removed in Python 3.8
        textoTotal = noticia['titulo'] + " " + noticia['subtitulo'] + " " + noticia['texto']
        bestExtract = process.extractBests(textoTotal, listaFamosos,
                                           score_cutoff=57, limit=10000)
        print("tempoBest:" + str(time.perf_counter() - tempoBest))
        tempoLoop = time.perf_counter()
        for j in bestExtract:
            ratioTotal = 0
            mediaRatio = 0
            nomeCompleto = j[0]
            todosNomes = nomeCompleto.split(" ")
            ratioTotal += fuzz.token_set_ratio(nomeCompleto, textoTotal)
            if len(todosNomes) > 1:
                for nome in todosNomes:
                    ratioTotal += fuzz.token_set_ratio(nome, textoTotal)
            if len(todosNomes) == 1:
                mediaRatio = ratioTotal
            elif len(todosNomes) > 1:
                mediaRatio = ratioTotal / (len(todosNomes) + 1)
            if mediaRatio > 85:
                model.relacionarFamosoNoticia(nomeCompleto, noticia['id'])
        print("tempo loop:" + str(time.perf_counter() - tempoLoop))
        print(str(threadName) + " - " + str(noticia['titulo']))
        queueLock.acquire()  # re-acquire before re-checking the queue
    queueLock.release()
outlist[21] = row[11]   # Room (raw location from input)
outlist[27] = row[6]    # Manufacturer website
outlist[29] = row[18]   # training_required?
outlist[37] = row[7]    # Asset ID
outlist[38] = row[28]   # Finance ID
outlist[39] = row[5]    # Serial No
outlist[42] = row[24]   # Date of Purchase
outlist[43] = row[25]   # Purchase_cost
outlist[45] = row[23]   # end_of_life
outlist[46] = row[22]   # maintenance
outlist[49] = row[8]    # comments

# Fuzzy match building:
if not row[11]:
    row[11] = 'Unknown'
# a -- results from fuzzy matching
a = process.extractBests(row[11], buildings, limit=2)
if a[0][0] == "SMC":
    outlist[20] = 'Scottish Microelectronics Centre'
else:
    outlist[20] = a[0][0]

# Fuzzy match names and emails:
# Contact 1:
flat_names = flatten_dict(names)
b = process.extractBests(row[12], flat_names, limit=2, scorer=fuzz.token_set_ratio)
custodian_score = b[0][1]
outlist[22] = b[0][0]         # contact 1 name
outlist[23] = names[b[0][0]]  # contact 1 email
# Contact 2:
if row[13]:
    c = process.extractBests(row[13], flat_names, limit=2, scorer=fuzz.token_set_ratio)
    technical_score = c[0][1]
def handle_query(self):
    self.results.clear()
    res = extractBests(self.query.text(), self.all_results, limit=8)
    for r, s in res:
        self.results.add(r)
    self.results.setCurrentRow(0)
def search_episode_by_filename(filename):
    # Split out file extension
    basename, extension = os.path.splitext(filename)

    # Remove common phrases not part of the title
    searchname = re.sub(r"Tatort", '', basename)

    # Find match
    match_results = process.extractBests(searchname, tatort_titles,
                                         score_cutoff=60, limit=5)

    # No match was found
    if not match_results:
        print("No match was found for file {}".format(filename))
        return

    # Only one match was found with the minimum required score
    if len(match_results) == 1:
        chosen_result = match_results[0]

    # Multiple matches were found above the score threshold: ask the user
    # which one is right
    if len(match_results) > 1:
        if match_results[0][1] - match_results[1][1] > 20:
            # If choice 0 scores 20 points more than choice 1, we directly
            # use the first choice
            chosen_result = match_results[0]
        else:
            # Print choices
            print("Multiple matches were found for file {}".format(filename))
            print("Please choose the correct one from the list below.")
            for index, match_result in enumerate(match_results):
                (matching_title, matching_score, matching_id) = match_result
                episode = tatort_episodes[matching_id]
                print("{index}: {name} (score: {score:02d}/100)".format(
                    index=index,
                    name=episode['episodename'],
                    score=matching_score))
            # Let the user choose
            chosen_id = int(input('Your choice: '))  # FIXME: repeat on wrong inputs
            chosen_result = match_results[chosen_id]

    (matching_title, matching_score, matching_id) = chosen_result
    matching_episode = tatort_episodes[matching_id]

    # Build the new file name
    try:
        absolute_number = int(matching_episode['absolute_number'])
    except (KeyError, ValueError, TypeError):  # the original used a bare `except:`
        absolute_number = 0
    new_filename = "Tatort {:04d} - {:02d}x{:02d} - {}{}".format(
        absolute_number,
        int(matching_episode['seasonnumber']),
        int(matching_episode['episodenumber']),
        matching_episode['episodename'],
        extension)
    new_filename = new_filename.replace('/', ' ')
    print("{} -> {}".format(filename, new_filename))
    os.rename(filename, new_filename)