class FuzzyBaseIndex(object):
    """Index that maps fuzzy-searchable strings to the ids registered under them.

    Strings are stored in a ``FuzzySet`` for approximate lookup, while
    ``content`` keeps, for every exact string, the set of ids added with it.
    """

    def __init__(self, field=None, similarity=None, base=None, **kw):
        """Create an empty index.

        Args:
            field: Stored as-is on the instance; not used by this class.
            similarity: Callable ``(query, candidate) -> score`` used to rank
                search hits.
            base: Stored as-is on the instance; not used by this class.
            **kw: Forwarded to the next ``__init__`` in the MRO.
        """
        super(FuzzyBaseIndex, self).__init__(**kw)
        self.fuzz = FuzzySet(rel_sim_cutoff=1., use_levenshtein=False)
        self.content = {}
        self.field = field
        self.similarity = similarity
        self.base = base

    def add(self, x, i):
        """Register id ``i`` under the string ``x``."""
        self.fuzz.add(x)
        self.content.setdefault(x, set()).add(i)

    def finalize(self):
        """No-op hook; this index needs no post-processing."""
        pass

    def search(self, x, top=25, debug=True):
        """Return up to ``top`` hits for ``x`` as ``(id, fuzzy_score, similarity)``.

        Hits are ordered by descending ``similarity``. ``debug`` is accepted
        for interface compatibility and is not used. An empty list is
        returned when the fuzzy set has no match for ``x``.
        """
        # The lookup yields None (not an empty list) when nothing matches.
        matches = self.fuzz.get(x) or ()
        ret = []
        for match in matches:
            score, matched = match[0], match[1]
            # The similarity only depends on the matched string, so compute
            # it once per match rather than once per id.
            sim = self.similarity(x, matched)
            ret.extend((i, score, sim) for i in self.content[matched])
        ret.sort(key=lambda hit: hit[2], reverse=True)
        return ret[:top]
def fuzzyset_alg(key, key_list):
    """Return up to three entries of ``key_list`` that best match ``key``.

    Candidates are first scored with a ``FuzzySet`` that contains only
    ``key``; the ten best are then re-scored with ``fuzz.ratio`` and the top
    three of those are returned.

    Args:
        key: The reference string.
        key_list: Iterable of candidate strings.

    Returns:
        A list of ``[candidate, ratio]`` pairs sorted by descending ratio;
        empty when no candidate matches.
    """
    finder = FuzzySet()
    finder.add(key)

    candidates = []
    for candidate in key_list:
        try:
            # Per the original author's note, the lookup raises KeyError when
            # the match score is too low (stated as "below 50%").
            matched = finder[candidate]
        except (KeyError, AttributeError, TypeError):
            # No match, or a candidate the fuzzy set cannot process:
            # skip it, but let unrelated errors and interrupts propagate.
            continue
        if not matched:
            continue
        # [0] the candidate key, [1] match percentage
        candidates.append([candidate, matched[0][0] * 100])

    # Sort by fuzzy-set score and keep the ten best.
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    top_candi = candidates[:10]

    # Re-score the short list with fuzz.ratio and sort again.
    finalist = [[name, fuzz.ratio(key, name)] for name, _ in top_candi]
    finalist.sort(key=lambda pair: pair[1], reverse=True)
    return finalist[:3]
def run_profile(impl):
    """Fill a fuzzy set from the cities fixture and profile ``profiler`` on it.

    ``impl == "cFuzzySet"`` selects ``cFuzzySet``; any other value selects
    ``FuzzySet``. Profile data is written to ``Profile.prof`` and the
    statistics, sorted by time, are printed.
    """
    # The local must stay named ``f``: the profiled statement string below
    # looks it up by that name through locals().
    f = cFuzzySet() if impl == "cFuzzySet" else FuzzySet()

    cities_path = os.path.join(here, '..', 'cities.gz')
    with gzip.GzipFile(cities_path) as input_file:
        for raw_line in input_file:
            f.add(raw_line.rstrip().decode())
    print(f)

    cProfile.runctx("profiler(f)", globals(), locals(), "Profile.prof")
    stats = pstats.Stats("Profile.prof")
    stats.strip_dirs()
    stats.sort_stats("time")
    stats.print_stats()
class ListBasedPlaceExtractionService(NERService):
    """Find the first known district name mentioned in a text.

    District names are read from the second column of a CSV file; tokens of
    the input text are then checked against that list and against a small
    table of alternative city names.
    """

    def __init__(self, dist_file=CommonConstants.INDIA_DIST_NAMES):
        """Load the district list from ``dist_file`` and build the tokenizer."""
        super().__init__()
        self.old_names = {
            'bangalore': 'bengaluru',
            'gurgaon': 'gurugram',
            'calcutta': 'kolkata',
            'prayagraj': 'allahabad',
            'delhi': 'delhi',
        }
        self.fd = FuzzySet()
        self.set = set()

        qualifiers = ('rural', 'urban', 'dehat')
        with open(dist_file) as df:
            rows = csv.reader(df)
            next(rows)  # the first row is a header
            for row in rows:
                district = row[1]
                name = district.lower()
                if any(word in name for word in qualifiers):
                    # Drop the last space-separated word of qualified names.
                    name = ' '.join(district.split(' ')[:-1]).lower()
                self.fd.add(name)
                self.set.add(name)

        self.nlp = stanza.Pipeline(lang='en', processors='tokenize', use_gpu=False)

    def extract_entities_from_text(self, text):
        """Return the first token that is a known district or alternative name.

        A token found in the district list is returned lower-cased; a token
        found in ``old_names`` is returned as its mapped value. Returns
        ``None`` when no token matches.
        """
        # A fuzzy-matching variant based on self.fd was disabled in the
        # original; only exact token lookups are performed.
        doc = self.nlp(text)
        for sent in doc.sentences:
            for token in sent.tokens:
                tok_text = token.text.lower()
                if tok_text in self.set:
                    return tok_text
                if tok_text in self.old_names:
                    return self.old_names[tok_text]
        return None
def get_nutrition_data(image_class):
    """Look up nutrition facts for ``image_class`` through the USDA NDB API.

    The search endpoint is queried with ``image_class``; the returned item
    whose name fuzzy-matches ``image_class`` best is selected and its V2
    report is fetched.

    Args:
        image_class: Free-text food name.

    Returns:
        A dict with the keys ``serve_size``, ``kcal``, ``fat``, ``carbs``,
        ``protein``, ``sugar`` and ``sodium``; every value is a string made
        of the reported number followed by its unit label.

    Raises:
        LookupError: If no search result fuzzy-matches ``image_class``
            (the original failed here with an unrelated ``TypeError``).
    """
    # NOTE(review): the API key is hard-coded in source; it should be moved
    # to configuration / a secret store.
    api_key = "FLKBoKOh7C1apAA4bPL0jH4GAW6f2wS9Lw0a2iFu"

    # Passing the query through ``params`` URL-encodes it, so names with
    # spaces or '&' no longer corrupt the request.
    search = requests.get(
        "https://api.nal.usda.gov/ndb/search/",
        params={
            "format": "json",
            "q": image_class,
            "sort": "n",
            "max": 25,
            "offset": 0,
            "api_key": api_key,
        },
    ).json()

    # The fuzzy set only ever contains image_class, so build it once.
    fs = FuzzySet()
    fs.add(image_class)

    max_dist_ratio = 0
    ndbno = None
    for item in search["list"]["item"]:
        matches = fs.get(item["name"])
        if not matches:
            # No fuzzy match for this item name.
            continue
        ratio = matches[0][0]
        if ratio > max_dist_ratio:
            max_dist_ratio = ratio
            ndbno = item["ndbno"]

    if ndbno is None:
        raise LookupError("no USDA search result matches %r" % (image_class,))
    print(ndbno)

    nutrition_data = requests.get(
        "https://api.nal.usda.gov/ndb/V2/reports",
        params={"ndbno": ndbno, "type": "f", "format": "json", "api_key": api_key},
    ).json()
    nutrients = nutrition_data["foods"][0]["food"]["nutrients"]

    def first_measure(index, field="value"):
        """Return ``field`` of the first measure of nutrient ``index`` as text."""
        return str(nutrients[index]["measures"][0][field])

    # The nutrient positions are taken as-is from the original code;
    # presumably they follow the report's fixed ordering — TODO confirm.
    nutrition_facts = {}
    nutrition_facts["serve_size"] = first_measure(0, "qty") + " ounces"
    nutrition_facts["kcal"] = first_measure(0) + " calories"
    nutrition_facts["fat"] = first_measure(2) + " grams"
    nutrition_facts["carbs"] = first_measure(3) + " grams"
    nutrition_facts["protein"] = first_measure(1) + " grams"
    nutrition_facts["sugar"] = first_measure(4) + " grams"
    nutrition_facts["sodium"] = first_measure(5) + " milligrams"
    return nutrition_facts
]

# list of countries from C19 data (the column labels of the c19 frame)
c19ctrs = c19.columns.tolist()

# list of countries from population data (the 'Country' column of pops)
popsctrs = pops['Country'].tolist()

# geo data is king, we need to match everything else to it
# the country name becomes the key matching the tables / dictionaries

# some countries in c19 do not match any country in geo data
# let's print fuzzy matches: the fuzzy set is filled with every geo name
fzs = FuzzySet()
for c in geoctrs:
    fzs.add(c)

# Disabled exploration step: print the fuzzy matches of every C19 country
# that is missing from the geo list.
#for c in c19ctrs:
#    if c not in geoctrs:
#        print(c, fzs.get(c))


# In[8]:


# Hand-maintained list of C19 names; judging by the variable name and the
# disabled loop above, these are the ones without a geo counterpart
# (NOTE(review): confirm against the data).
c19notfound = [
    'Andorra',
    'Antigua and Barbuda',
    'Bahrain',
    'Barbados',
    'Cabo Verde',
    'Comoros',
    'Diamond Princess',
    'Dominica',
    'Grenada',
    'Holy See',
    'Liechtenstein',
    'MS Zaandam',
    'Maldives',
    'Mauritius',
    'Monaco',
    'Saint Kitts and Nevis',
    'Saint Lucia',
    'Saint Vincent and the Grenadines',
    'San Marino',
    'Sao Tome and Principe',
    'Seychelles',
    'Singapore'
]
class MiniBaseIndex(object):
    """Token-level inverted index with fuzzy token lookup.

    ``content`` maps a token to the set of entry ids containing it,
    ``counts`` tracks how often each token was added, and ``fuzzwords`` is a
    ``FuzzySet`` of all tokens used to expand query tokens approximately.
    """

    def __init__(self, field=None, tokenizer=None, similarity=None, base=None,
                 idf_limit=0.05, **kw):
        """Create an empty index.

        Args:
            field: Key of the entry field compared against the query when
                ranking results.
            tokenizer: Callable turning a query string into tokens.
            similarity: Callable ``(a, b) -> score`` used for ranking.
            base: Object exposing ``entries``, a mapping from entry id to a
                dict that holds ``field`` and ``"pop"``.
            idf_limit: Stored on the instance; the blacklisting logic that
                used it is disabled.
            **kw: Forwarded to the next ``__init__`` in the MRO.
        """
        super(MiniBaseIndex, self).__init__(**kw)
        self.content = {}
        self.field = field
        self.tokenizer = tokenizer
        self.similarity = similarity
        self.base = base
        self.counts = {}
        self.fuzzwords = FuzzySet(rel_sim_cutoff=0.7, use_levenshtein=False)
        self.blacklist = set()
        self.idf_limit = idf_limit

    def add(self, tok, i):
        """Register entry id ``i`` under token ``tok``.

        Tokens that are blacklisted and not yet indexed are ignored (the
        original raised ``KeyError`` for them).
        """
        if tok not in self.content:
            if tok in self.blacklist:
                return
            self.content[tok] = set()
            self.counts[tok] = 0
        self.content[tok].add(i)
        self.counts[tok] += 1
        # Frequency-based blacklisting (disabled):
        # if self.counts[tok]/len(self.base.entries) > self.idf_limit:
        #     self.blacklist.add(tok)
        #     del self.counts[tok]
        #     del self.content[tok]
        self.fuzzwords.add(tok)

    def finalize(self):
        """No-op hook; tokens are already added to the fuzzy set in ``add``."""
        pass

    def search(self, x, expl=5000, top=25, maxtok=250, debug=False):
        """Return the best ``top`` entries for query ``x``.

        Args:
            x: Query string.
            expl: Stop collecting candidate entries once more than this many
                have been gathered.
            top: Maximum number of results returned.
            maxtok: Maximum number of fuzzy-expanded tokens considered.
            debug: Print intermediate sizes and tokens when true.

        Returns:
            A list of ``(entry_id, accumulated_fuzzy_score, similarity)``
            tuples sorted by ``similarity * 1e2 + pop * 1e-3`` descending.
        """
        # Collect every indexed token that fuzzily matches a query token.
        alltoks = []
        seen = set()
        for xtok in self.tokenizer(x):
            # The lookup yields None (not an empty list) when nothing matches.
            for fuzz_score, fuzz_tok in self.fuzzwords.get(xtok) or ():
                if fuzz_tok in seen:
                    continue
                seen.add(fuzz_tok)
                # Only score tokens that are actually kept.
                alltoks.append(
                    (fuzz_score, fuzz_tok, self.similarity(xtok, fuzz_tok)))

        # Most similar tokens first; rarer tokens win ties.
        alltoks.sort(key=lambda t: t[2] * 100 + 1 / self.counts[t[1]],
                     reverse=True)
        if debug:
            print(len(alltoks), maxtok)
            for tok in alltoks:
                print(tok, self.counts[tok[1]])
        alltoks = alltoks[:maxtok]

        # Accumulate fuzzy scores per entry until enough candidates exist.
        results = {}
        for fuzz_score, fuzz_tok, _ in alltoks:
            for _id in self.content[fuzz_tok]:
                results[_id] = results.get(_id, 0) + fuzz_score
                if len(results) > expl:
                    break
            if len(results) > expl:
                break
        if debug:
            print(len(results))

        entries = self.base.entries
        scored = [
            (ent_id, score, self.similarity(x, entries[ent_id][self.field]))
            for ent_id, score in results.items()
        ]

        def sortkey(hit):
            # Similarity dominates; the entry's "pop" value breaks ties.
            return hit[2] * 1e2 + entries[hit[0]]["pop"] * 1e-3

        scored.sort(key=sortkey, reverse=True)
        return scored[:top]
def _learn(self, command):
    """Ask the user for commands after ``command`` was not recognised.

    NOTE(review): only the beginning of this method may be visible here;
    the values computed below are not used within the visible part.
    """
    # Fuzzy set seeded with the known function names plus the new command.
    fs = FuzzySet(self.function_names)
    fs.add(command)
    print("No command found! Please input commands")
    # raw_input is the Python 2 builtin that reads one line from stdin.
    commands = raw_input()
    # The entered text is split on '.' into individual commands.
    commandArray = commands.split('.')
class MovedBlocksDetector(object):
    """Detect blocks of lines that were removed in one place and added in another.

    Added lines are indexed by their trimmed text (exactly and in a fuzzy
    set); removed lines are then walked in order and matched against them to
    grow ``MatchingBlock`` objects.
    """

    def __init__(self, removed_lines_dicts, added_lines_dicts):
        """Build the lookup structures from dict descriptions of diff lines.

        Each dict is converted with ``Line.from_dict``.
        """
        self.removed_lines = []
        # trimmed text -> every added line carrying that text
        self.trim_text_to_array_of_added_lines = defaultdict(list)
        # file name -> line number -> Line, for added and removed lines
        self.added_file_name_to_line_no_to_line = defaultdict(dict)
        self.removed_file_name_to_line_no_to_line = defaultdict(dict)
        self.added_lines_fuzzy_set = FuzzySet()

        for added_line_dict in added_lines_dicts:
            line = Line.from_dict(added_line_dict)
            self.trim_text_to_array_of_added_lines[line.trim_text].append(line)
            self.added_lines_fuzzy_set.add(line.trim_text)
            self.added_file_name_to_line_no_to_line[line.file][line.line_no] = line

        for removed_line_dict in removed_lines_dicts:
            line = Line.from_dict(removed_line_dict)
            self.removed_lines.append(line)
            self.removed_file_name_to_line_no_to_line[line.file][line.line_no] = line

    @staticmethod
    def from_diff(diff_text):
        """Create a detector from raw diff text."""
        parsed = diff_to_added_and_removed_lines(diff_text)
        return MovedBlocksDetector(parsed['removed_lines'], parsed['added_lines'])

    @measure_fun_time()
    def filter_out_block_inside_other_blocks(self, filtered_blocks: List[MatchingBlock]):
        """Return the blocks that are not contained in a larger block.

        The input list is sorted in place twice: first by the "remove" sort
        tuple to flag blocks whose removed part lies inside the previous
        reference block, then by the "add" sort tuple to build the result.
        """
        filtered_blocks.sort(key=lambda fb: fb.get_filter_sort_tuple_for_remove())
        last_matching_block = None
        for matching_block in filtered_blocks:
            if last_matching_block is None:
                last_matching_block = matching_block
                continue
            # Removed range of this block lies within the reference block's range.
            if matching_block.last_removed_line.file == last_matching_block.last_removed_line.file \
                    and matching_block.first_removed_line.line_no >= last_matching_block.first_removed_line.line_no \
                    and matching_block.last_removed_line.line_no <= last_matching_block.last_removed_line.line_no:
                if matching_block.weighted_lines_count < last_matching_block.weighted_lines_count\
                        and matching_block.removed_lines_numbers.issubset(last_matching_block.removed_lines_numbers):
                    # Flag is set dynamically and read back with getattr below.
                    matching_block.remove_part_is_inside_larger_block = True
            else:
                last_matching_block = matching_block

        filtered_blocks.sort(key=lambda fb: fb.get_filter_sort_tuple_for_add())
        ok_blocks = []
        last_matching_block = None
        for matching_block in filtered_blocks:
            if getattr(matching_block, "remove_part_is_inside_larger_block", False):
                # TODO getattr was used to act like in javascript - rewrite it without getattr
                continue
            if last_matching_block is None:
                last_matching_block = matching_block
                ok_blocks.append(matching_block)
                continue
            # NOTE(review): unlike the removed-side check above, this drops the
            # block when its added lines are NOT a subset of the reference
            # block's — confirm the negation is intended.
            if matching_block.last_added_line.file == last_matching_block.last_added_line.file \
                    and matching_block.first_added_line.line_no >= last_matching_block.first_added_line.line_no \
                    and matching_block.last_added_line.line_no <= last_matching_block.last_added_line.line_no\
                    and matching_block.weighted_lines_count < last_matching_block.weighted_lines_count\
                    and not matching_block.added_lines_numbers.issubset(last_matching_block.added_lines_numbers):
                pass
            else:
                last_matching_block = matching_block
                ok_blocks.append(matching_block)
        return ok_blocks

    def _filter_out_small_blocks(self, matching_blocks, min_lines_count):
        """Keep blocks with at least ``min_lines_count`` weighted lines and 20 characters."""
        return [block for block in matching_blocks
                if block.weighted_lines_count >= min_lines_count and block.char_count >= 20]

    def _clear_not_matching_lines_at_end_and_filter_out_empty_blocks(self, matching_blocks):
        """Call ``clear_empty_lines_at_end`` on each block and drop those returning ``None``."""
        filtered_blocks = []
        for matching_block in matching_blocks:
            block_without_empty_end = matching_block.clear_empty_lines_at_end()
            if block_without_empty_end is not None:
                # The original block object is kept, not the returned value;
                # presumably clear_empty_lines_at_end mutates in place — TODO confirm.
                filtered_blocks.append(matching_block)
        return filtered_blocks

    def merge_blocks(self, block1, block2):
        """Return a new ``MatchingBlock`` combining ``block1`` followed by ``block2``.

        Counters are summed, line-number sets are united, and the match
        density is recomputed from the summed character counts.
        """
        new_block = MatchingBlock()
        new_block.lines.extend(block1.lines)
        new_block.lines.extend(block2.lines)
        # TODO what about lines between those 2 blocks?
        new_block.first_added_line = block1.first_added_line or block2.first_added_line
        new_block.first_removed_line = block1.first_removed_line or block2.first_removed_line
        new_block.last_added_line = block2.last_added_line or block1.last_added_line
        new_block.last_removed_line = block2.last_removed_line or block1.last_removed_line
        new_block.weighted_lines_count = block1.weighted_lines_count + block2.weighted_lines_count
        new_block.not_empty_lines = block1.not_empty_lines + block2.not_empty_lines
        new_block.char_count = block1.char_count + block2.char_count
        new_block.weighted_chars_count = block1.weighted_chars_count + block2.weighted_chars_count
        # Raises ZeroDivisionError if both blocks have char_count == 0.
        new_block.match_density = new_block.weighted_chars_count / new_block.char_count
        new_block.added_lines_numbers = block1.added_lines_numbers | block2.added_lines_numbers
        new_block.removed_lines_numbers = block1.removed_lines_numbers | block2.removed_lines_numbers
        return new_block

    @measure_fun_time()
    def join_nearby_blocks(self, matching_blocks: List[MatchingBlock], max_space_between=2):
        """Merge blocks of the same file pair that lie close to each other.

        Two blocks are merged when the second starts after the first ends,
        on both the removed and the added side, with at most
        ``max_space_between`` lines in between.

        Returns:
            For every file pair, the blocks that took part in no merge
            followed by the merged blocks.
        """
        max_space_between += 1  # if we want to allow 2 lines between blocks difference between line numbers is 3
        blocks_grouped_by_files: Dict[tuple, List[MatchingBlock]] = defaultdict(list)
        for block in matching_blocks:
            blocks_grouped_by_files[(block.file_removed, block.file_added)].append(block)

        blocks_after_merge: List[MatchingBlock] = []
        merged_blocks = 0  # counter only; not used afterwards
        for block_list in blocks_grouped_by_files.values():
            loops_made = 0  # counter only; not used afterwards
            block_list.sort(key=lambda block: (block.first_removed_line.line_no, -block.match_density))
            indexes_of_merged_blocks = set()
            merged_blocks_list = []
            for i in range(len(block_list)):
                block = block_list[i]
                for j in range(i+1, len(block_list)):
                    loops_made += 1
                    next_block = block_list[j]
                    # List is sorted by first removed line, so later blocks are even further away.
                    if next_block.first_removed_line.line_no - block.last_removed_line.line_no > max_space_between:
                        break
                    elif (next_block.first_removed_line.line_no > block.last_removed_line.line_no
                          and next_block.first_added_line.line_no - block.last_added_line.line_no <= max_space_between
                          and next_block.first_added_line.line_no > block.last_added_line.line_no):
                        block = self.merge_blocks(block, next_block)
                        merged_blocks += 1
                        indexes_of_merged_blocks.add(i)
                        indexes_of_merged_blocks.add(j)
                if i in indexes_of_merged_blocks:
                    merged_blocks_list.append(block)
            for i in range(len(block_list)):
                if i not in indexes_of_merged_blocks:
                    blocks_after_merge.append(block_list[i])
            blocks_after_merge.extend(merged_blocks_list)
        return blocks_after_merge

    @measure_fun_time()
    def filter_blocks(self, matching_blocks, min_lines_count=None):
        """Apply the size, empty-end and containment filters in that order.

        ``min_lines_count`` defaults to 2 when ``None``.
        """
        if min_lines_count is None:
            min_lines_count = 2
        filtered_blocks = self._filter_out_small_blocks(matching_blocks, min_lines_count)
        filtered_blocks = self._clear_not_matching_lines_at_end_and_filter_out_empty_blocks(filtered_blocks)
        return self.filter_out_block_inside_other_blocks(filtered_blocks)

    def extend_matching_blocks_with_empty_added_lines_if_possible(self, currently_matching_blocks):
        """Extend each block, in place, over the empty added lines that directly follow it."""
        for matching_block in currently_matching_blocks:
            while True:
                last_line = matching_block.last_added_line
                next_added_line = self.added_file_name_to_line_no_to_line[last_line.file].get(last_line.line_no + 1)
                if next_added_line and next_added_line.trim_text == '':
                    matching_block.extend_with_empty_added_line(next_added_line)
                else:
                    break

    def extend_matching_blocks_with_empty_removed_lines_if_possible(self, currently_matching_blocks: List[MatchingBlock]):
        """Extend each block by one empty removed line where the next removed line is empty.

        Returns:
            ``(extended_blocks, not_extended_blocks)``.
        """
        extended_blocks = []
        not_extended_blocks = []
        for matching_block in currently_matching_blocks:
            last_line = matching_block.last_removed_line
            next_removed_line = self.removed_file_name_to_line_no_to_line[last_line.file].get(last_line.line_no + 1)
            if next_removed_line and next_removed_line.trim_text == '':
                matching_block.extend_with_empty_removed_line(next_removed_line)
                extended_blocks.append(matching_block)
            else:
                not_extended_blocks.append(matching_block)
        return extended_blocks, not_extended_blocks

    @measure_fun_time()
    def detect_moved_blocks(self, min_lines_count=None) -> List[MatchingBlock]:
        """Walk the removed lines and return the filtered list of moved blocks.

        For each removed line, fuzzy matches among the added lines either
        extend the currently open blocks or start new ones; blocks that were
        not extended by the current line are closed. The closed blocks are
        finally joined and filtered.
        """
        detected_blocks: List[MatchingBlock] = []
        currently_matching_blocks = []
        new_matching_blocks = []

        for removed_line in self.removed_lines:
            if removed_line.trim_text:
                # Very short lines (1-2 characters) get a lower score threshold.
                min_match_score = 0.5 if len(removed_line.trim_text) > 2 else 0.35
                fuzzy_matching_pairs = self.added_lines_fuzzy_set.get(
                    removed_line.trim_text, default=None, exact_match_only=False, min_match_score=min_match_score
                )
                # iterate over currently_matching_blocks and try to extend them with empty lines
                self.extend_matching_blocks_with_empty_added_lines_if_possible(currently_matching_blocks)
            else:
                # An empty removed line matches empty added lines with probability 1.
                fuzzy_matching_pairs = [[1, '']]

            if not fuzzy_matching_pairs:
                continue

            for fuzz_pair in fuzzy_matching_pairs:
                match_probability, text = fuzz_pair
                added_lines = self.trim_text_to_array_of_added_lines[text]
                for added_line in added_lines:
                    line_extended_any_block = False
                    already_added = set()
                    for i, matching_block in enumerate(currently_matching_blocks):
                        if i in already_added:
                            continue
                        extended = matching_block.try_extend_with_line(removed_line, added_line, match_probability)
                        if extended:
                            new_matching_blocks.append(matching_block)
                            line_extended_any_block = True
                            already_added.add(i)
                    # Empty lines never start a block on their own.
                    if not line_extended_any_block and removed_line.trim_text != '':
                        new_matching_blocks.append(MatchingBlock.from_line(removed_line, added_line, match_probability))
                    # Extended blocks moved to new_matching_blocks; keep only the rest open.
                    currently_matching_blocks = [matching_block
                                                 for i, matching_block in enumerate(currently_matching_blocks)
                                                 if i not in already_added]

            if removed_line.trim_text == '':
                extended_blocks, not_extended_blocks = \
                    self.extend_matching_blocks_with_empty_removed_lines_if_possible(currently_matching_blocks)
                new_matching_blocks.extend(extended_blocks)
                currently_matching_blocks = not_extended_blocks

            # Whatever was not extended by this removed line is finished.
            for matching_block in currently_matching_blocks:
                detected_blocks.append(matching_block)
            currently_matching_blocks = new_matching_blocks
            new_matching_blocks = []

        # Blocks still open after the last removed line.
        for matching_block in currently_matching_blocks:
            detected_blocks.append(matching_block)

        detected_blocks = self.join_nearby_blocks(detected_blocks)
        filtered_blocks = self.filter_blocks(detected_blocks, min_lines_count)
        logger.info(f'Detected {len(filtered_blocks)} blocks ({len(detected_blocks) - len(filtered_blocks)} filtered)')
        return filtered_blocks
class TFIDFmatcher:
    """Rank candidate texts against a query by TF-IDF cosine similarity.

    Query tokens that are not in the fitted vocabulary are replaced by their
    closest vocabulary entry (via a ``FuzzySet``) when the fuzzy score is at
    least 0.80, and dropped otherwise.
    """

    def __init__(self, choices_corpus, ngram_range=(1, 2), use_cleaner=True, preprocess_func=None):
        """Fit the vectorizer on ``choices_corpus``.

        :param choices_corpus: a list of texts
        :param ngram_range: n-gram range passed to the vectorizers
        :param use_cleaner: apply ``cleaner`` to corpus, choices and queries
        :param preprocess_func: optional str->str function applied after cleaning
        """
        self.ngram_range = ngram_range
        self.use_cleaner = use_cleaner
        self.preprocess_func = preprocess_func
        self.initial_choices_corpus = choices_corpus

        if self.use_cleaner:
            choices_corpus = self.cleaner(choices_corpus)
        if self.preprocess_func:
            choices_corpus = [self.preprocess_func(k) for k in choices_corpus]

        # min_df=1 keeps every term exactly like the former integer 0 did
        # (a term always occurs in at least one document); integer 0 is
        # rejected by recent scikit-learn releases.
        self.tfidf = TfidfVectorizer(analyzer='word', sublinear_tf=True,
                                     # strip_accents='ascii',
                                     lowercase=True, ngram_range=self.ngram_range,
                                     min_df=1).fit(choices_corpus)
        self.initial_corpus_tf_idf = self.tfidf.transform(choices_corpus)
        self.initial_corpus_tf_idf_dict = {
            text: self.initial_corpus_tf_idf[k] for k, text in enumerate(choices_corpus)
        }

        self.vocabulary = self.tfidf.vocabulary_.keys()
        self.fset_vocabulary = FuzzySet()
        for brnd in self.vocabulary:
            self.fset_vocabulary.add(brnd)

    def cleaner(self, x, verbose=False):
        """Apply ``clean_string(...).lower()`` to a string or to each item of a list.

        Values of any other type are returned unchanged.
        """
        if verbose:
            print("Before cleaning", type(x), x)

        def cleaning_function(x):
            return clean_string(x).lower()

        if isinstance(x, list):
            x = [cleaning_function(el) for el in x]
        elif isinstance(x, str):
            x = cleaning_function(x)
        if verbose:
            print("After cleaning", type(x), x)
        return x

    def extract(self, query, choices=None, limit=5, verbose=False):
        """Return ``(choice, score)`` pairs ranked by similarity to ``query``.

        :param query: the text to match
        :param choices: optional list of texts to rank; duplicates are
            removed. When omitted, the corpus given at construction is used.
            An empty list returns ``[]`` immediately.
        :param limit: maximum number of results; a falsy value returns all
        :param verbose: print the query rewriting steps
        :return: list of ``(original choice text, cosine similarity)`` sorted
            by descending similarity
        """
        if choices == []:
            return []

        if choices:
            choices = list(set(choices))
            initial_choices = choices
            # Clean the choices corpus
            if self.use_cleaner:
                choices = self.cleaner(choices)
            if self.preprocess_func:
                choices = [self.preprocess_func(elk) for elk in choices]
            choices_corpus = choices
            corpus_tf_idf = self.tfidf.transform(choices_corpus)
        else:
            initial_choices = self.initial_choices_corpus
            choices_corpus = self.initial_choices_corpus
            corpus_tf_idf = self.initial_corpus_tf_idf

        if self.use_cleaner:
            query = self.cleaner(query)
        if self.preprocess_func:
            query = self.preprocess_func(query)

        # building fuzzy query
        new_query = []
        for q in query.split():
            if q in self.vocabulary:
                new_query.append(q)
                continue
            fset_get = self.fset_vocabulary.get(q)
            if fset_get:
                tmp_score, new_q = fset_get[0]
                if verbose:
                    print("Modified", q, new_q, tmp_score)
                if tmp_score >= 0.80:
                    new_query.append(new_q)
        query = " ".join(new_query)
        if verbose:
            print("NEW QUERY", query)

        x = self.tfidf.transform([query])
        cosine_similarities = linear_kernel(x, corpus_tf_idf).flatten()
        related_docs_indices = cosine_similarities.argsort().flatten()

        if choices:
            # Choices that became empty after cleaning are skipped.
            ranked = [k for k in related_docs_indices if choices_corpus[k]]
        else:
            ranked = list(related_docs_indices)

        # Map back to the caller's texts by position. Looking the cleaned
        # text up with list.index was quadratic and returned the wrong
        # original when two choices cleaned to the same text.
        result = [(initial_choices[k], cosine_similarities[k]) for k in ranked]
        result.sort(key=lambda tup: tup[1], reverse=True)  # sorts in place

        if limit:
            return result[0:limit]
        return result

    def export_vocabulary(self, vocabulary_csv_destination, choices_corpus=None):
        """Write the n-gram frequencies of a corpus to a ``;``-separated CSV.

        :param vocabulary_csv_destination: output path
        :param choices_corpus: texts to count; defaults to the corpus given
            at construction
        """
        if not choices_corpus:
            choices_corpus = self.initial_choices_corpus
        if self.use_cleaner:
            choices_corpus = [clean_string(x).lower() for x in choices_corpus]

        cnt_vec = CountVectorizer(ngram_range=self.ngram_range)
        transformed_data = cnt_vec.fit_transform(choices_corpus)
        # get_feature_names() was removed from recent scikit-learn releases;
        # prefer its replacement and fall back on old installations.
        get_names = getattr(cnt_vec, "get_feature_names_out", None) or cnt_vec.get_feature_names
        l = [{'word': k, 'freq': v}
             for k, v in zip(get_names(), np.ravel(transformed_data.sum(axis=0)))]
        df = pd.DataFrame(l)
        df = df[['word', 'freq']]
        df.sort_values('freq', ascending=False, inplace=True)
        df.to_csv(vocabulary_csv_destination, encoding='utf-8', index=False, sep=";", doublequote=True,
                  quoting=csv.QUOTE_ALL)
        print('The vocabulary was exported at : ', vocabulary_csv_destination)
class BrandMatcher:
    """Guess the brand mentioned in a product name.

    Brand names and their equivalents come from the module-level ``brands``
    frame. Matching combines TF-IDF similarity, fuzzy token correction,
    hard-coded rules and a dedicated path for Japanese product names.
    """

    def __init__(self, ngram_range=(1, 3)):
        """
        Build the TF-IDF model, the fuzzy sets and the Japanese brand table.

        :param ngram_range: n-gram range passed to the TF-IDF vectorizer
        """
        self.ngram_range = ngram_range
        choices_corpus = [str(x) for x in list(brands['brnd'].dropna().unique())]
        l = brands[['brnd', 'equivalents']].dropna().to_dict('records')
        # equivalent spelling -> canonical brand (equivalents are ';'-separated)
        self.equivalents = {}
        for el in l:
            for eq in el['equivalents'].split(';'):
                self.equivalents[eq.strip()] = el['brnd']
        choices_corpus.extend(self.equivalents.keys())
        self.initial_choices_corpus = choices_corpus
        self.cleaned_choices_corpus = self.cleaner(choices_corpus)
        self.tfidf = TfidfVectorizer(analyzer='word', sublinear_tf=True,
                                     # strip_accents='ascii',
                                     lowercase=True, ngram_range=self.ngram_range, min_df=0).fit(self.cleaned_choices_corpus)
        # NOTE(review): the model is fitted on the cleaned corpus but the
        # uncleaned corpus is transformed here — confirm this is intended.
        self.initial_corpus_tf_idf = self.tfidf.transform(choices_corpus)
        self.initial_corpus_tf_idf_dict = {}
        for k in range(len(choices_corpus)):
            self.initial_corpus_tf_idf_dict[choices_corpus[k]] = self.initial_corpus_tf_idf[k]

        # Creating fuzzy set
        self.fset_brands = FuzzySet()
        for token in [str(x) for x in list(brands['brnd'].dropna().unique())]:
            self.fset_brands.add(token)
        self.fset_tokens = FuzzySet()
        for token in list(self.tfidf.vocabulary_):
            self.fset_tokens.add(token)

        # Prepare the japanese matching: normalized Japanese names, longest first
        jp_brands = brands[['brnd', 'brnd_jp_clean']]
        jp_brands = jp_brands[jp_brands.brnd_jp_clean.notnull()]
        jp_brands['brnd_jp_clean'] = jp_brands['brnd_jp_clean'].apply(lambda x: unicodedata.normalize('NFKC', x.replace('・', '').replace(' ', '')))
        jp_brands['brnd_jp_size'] = jp_brands['brnd_jp_clean'].apply(lambda x: len(x))
        jp_brands.sort_values(['brnd_jp_size', 'brnd'], ascending=[False, False], inplace=True)
        self.jp_brands = jp_brands
        # jp_brands.to_excel('/tmp/jp_brands.xlsx')

    def cleaner(self, x, verbose=False):
        """Apply ``clean_string(...).lower()`` to a string or to each item of a list.

        List items are converted with ``str`` first; values of any other type
        are returned unchanged.
        """
        if verbose:
            print("Before cleaning", type(x), x)

        def cleaning_function(x):
            return clean_string(x).lower()

        if type(x) == list:
            x = [cleaning_function(str(el)) for el in x]
        if type(x) in [str]:
            x = cleaning_function(x)
        if verbose:
            print("After cleaning", type(x), x)
        return x

    def extract(self, query, verbose=False):
        """
        Return the corpus entries with the highest TF-IDF similarity to ``query``.

        :param query: the text to match; it is cleaned, and each token is
            replaced by its closest vocabulary token when the fuzzy score is
            above 0.80 (tokens without such a match are dropped)
        :param verbose: print intermediate values
        :return: list of ``(corpus entry, score)`` tuples that share the
            maximum score
        """
        initial_choices = self.initial_choices_corpus
        choices_corpus = self.initial_choices_corpus
        corpus_tf_idf = self.initial_corpus_tf_idf
        query = self.cleaner(query)

        # building fuzzy query
        new_query = []
        for q in query.split():
            if verbose:
                print(q)
            fset_get = self.fset_tokens.get(q)
            if fset_get:
                tmp_score, new_q = fset_get[0]
                if verbose:
                    print("Modified", q, new_q, tmp_score)
                if tmp_score > 0.80:
                    new_query.append(new_q)
        query = " ".join(new_query)
        if verbose:
            print("NEW QUERY", query)

        x = self.tfidf.transform([query])
        cosine_similarities = linear_kernel(x, corpus_tf_idf).flatten()
        related_docs_indices = cosine_similarities.argsort().flatten()
        result = [(choices_corpus[k], cosine_similarities[k].flatten()[0]) for k in related_docs_indices]
        # Both lists are the same object here, so this lookup maps each text to itself.
        result = [(initial_choices[choices_corpus.index(k[0])], k[1]) for k in result]
        # correcting with fuzzyratio score between result and query
        # result = [(k[0], k[1] * 0.01 * 0.5 * (fuzz.token_set_ratio(k[0], query) + fuzz.ratio(k[0], query))) for k in result]
        # result = [(k[0], k[1]) for k in result]
        result.sort(key=lambda tup: tup[1], reverse=True)  # sorts in place
        if verbose:
            print("Query", query, "\nResult", result)
        # Keep every entry tied for the best score.
        max_score = max(result, key=itemgetter(1))[1]
        result = [k for k in result if k[1] == max_score]
        return result

    def find_brand(self, pdct_name_on_eretailer, special_country=None, verbose=False):
        """Return ``{'brand': ..., 'score': ...}`` for a product name.

        :param pdct_name_on_eretailer: product name as shown by the retailer
        :param special_country: ``'JP'`` forces the Japanese path; otherwise ``None``
        :param verbose: print intermediate values
        :return: dict with the brand name (or ``None``) and a score on a
            0-100 scale; ``{'brand': None, 'score': 0}`` when nothing is found
        """
        if not pdct_name_on_eretailer:
            return {'brand': None, 'score': 0}
        assert special_country in ['JP', None]

        # Japanese path: substring match on normalized Japanese brand names.
        if bool(pattern_japanese_chinese_caracters.search(pdct_name_on_eretailer)) or special_country == 'JP':
            clean_jp_str = lambda x: unicodedata.normalize('NFKC', x.replace('・', '').replace(' ', '').replace('・', ''))
            clean_jp_name = clean_jp_str(pdct_name_on_eretailer)
            # Forbidden words:
            japanese_forbidden_words = [" shoulder ", ' bag ', '【CD】', "【SHM-CD】", 'dvd', 'helmet', 'rucksack',
                                        'daypack', 'daiken', 'ダイケン', "スリープスパ", 'リンゴビール', 'パターソン',
                                        'ヘネシー澄子', ]
            clean_japanese_forbidden_words = [clean_jp_str(x).lower() for x in japanese_forbidden_words]
            # print(clean_jp_name, clean_japanese_forbidden_words)
            if any(x in clean_jp_name.lower() for x in clean_japanese_forbidden_words):
                return {'brand': None, 'score': 0}
            # jp_brands is sorted longest name first, so the longest match wins.
            for br in self.jp_brands.to_dict(orient='records'):
                if br['brnd_jp_clean'] in clean_jp_name:
                    # print("clean_jp_name :", clean_jp_name, "candidate", br['brnd_jp_clean'])
                    return {'brand': br['brnd'], "score": 98.765}
            if "モエ " in pdct_name_on_eretailer and any(x in clean_jp_name for x in ["750", 'ml', 'cl']):
                return {'brand': "Moët & Chandon", "score": 98.765}

        # Ad-hoc rules
        if any([x in pdct_name_on_eretailer.lower() for x in ["moet ", "moët"]]) and 'dom p' in pdct_name_on_eretailer.lower():
            return {'brand': 'Dom Pérignon', 'score': 99}
        if any([x in pdct_name_on_eretailer.lower() for x in ["moet ", "moët"]]):
            return {'brand': 'Moët & Chandon', 'score': 99}
        if any([x in pdct_name_on_eretailer.lower() for x in ["clicquot"]]):
            return {'brand': 'Veuve Clicquot', 'score': 99}
        if any([x in pdct_name_on_eretailer.lower() for x in ["ruinart"]]):
            return {'brand': 'Ruinart', 'score': 99}

        # # Forbidden words:
        # forbidden_words = ['leinwand', "hamper ", ' hamper', ' poster', 'poster ', 'chocolates ', ' chocolates',
        #                    'truffle ', ' truffle', 'birthday cake', ' cake', 'candle', 'poplin', ' sheet ', ' bed ',
        #                    ' cover ', ' kimono', 'towel', 'dvd']
        # if any(x in pdct_name_on_eretailer.lower() for x in forbidden_words):
        #     return {'brand': None, 'score': 0}

        # Cleaning: separators become spaces, whitespace is collapsed, some accents are removed
        pdct_name_on_eretailer = pdct_name_on_eretailer.replace('–', ' ')
        pdct_name_on_eretailer = pdct_name_on_eretailer.replace('-', ' ')
        pdct_name_on_eretailer = pdct_name_on_eretailer.replace('_', ' ')
        pdct_name_on_eretailer = ' '.join(w for w in pdct_name_on_eretailer.split() if w)
        pdct_name_on_eretailer = pdct_name_on_eretailer.replace("'", "").replace('é', 'e').replace('Â', '').replace(
            'ë', 'e')
        # print(pdct_name_on_eretailer)

        candidates = self.extract(pdct_name_on_eretailer, verbose=verbose)
        if not candidates:
            return {'brand': None, 'score': 0}
        # print(candidates)
        # print("FIRST SCORE :", brand, score)

        # Post treatment: re-score every candidate against n-grams of the product name
        clean_tokens = clean_string(pdct_name_on_eretailer).split()
        # s = FuzzySet()
        # s.add(candidate)
        # l = [deepcopy(s.get(ngram, candidate)) for ngram in ngrams]
        # l = [x[0][0] for x in l if type(x) == list]
        # NOTE: ``brand`` starts as the whole (name, score) tuple and only
        # becomes a name once a candidate scores above 0.
        brand, score = candidates[0], 0
        for candidate in candidates:
            candidate_str = self.cleaner(candidate[0])
            candidate_str = " ".join(candidate_str.split()[:9])
            nb_token_candidate = len(candidate_str.split())
            ngrams = [" ".join(clean_tokens[start:start + length]) for start in range(len(clean_tokens)) for length in
                      range(max(nb_token_candidate, min(4, len(clean_tokens) - start + 1)))]
            # print([("'" + ngram + "'", "'" + candidate + "'", fuzz.ratio(ngram, candidate)) for ngram in ngrams])
            l = [fuzz.ratio(ngram, candidate_str) for ngram in list(set(ngrams))]
            # Best ratio scaled to 0..1 and squared.
            max_score = (max(l + [0])*0.01) ** 2
            if max_score > score:
                score = max_score
                brand = candidate[0]

        # Equivalent spellings are reported under the canonical brand.
        if brand in self.equivalents:
            brand = self.equivalents[brand]
        score = round(100 * score, 2)
        # print("SECOND SCORE :", brand, score)

        # Forbidden words
        if any([x in pdct_name_on_eretailer.lower() for x in ["poster", 'dvd']]):
            return {'brand': None, 'score': 0}

        if score >= 80:
            # NOTE(review): a 'Mercier' product without 'hampagne' in its name
            # still reaches the final ``else`` below and is returned — confirm
            # whether the next ``if`` was meant to be ``elif``.
            if brand in ['Mercier']:  # Add Krug ???
                if 'hampagne' in pdct_name_on_eretailer.lower():
                    return {'brand': brand, 'score': score}
            if brand in ["Krug"] and any([x.lower() in pdct_name_on_eretailer.lower() for x in ['butler']]):
                return {'brand': None, 'score': 0}
            elif brand == "Belvedere":
                # Belvedere is rejected when the name mentions one of these grape varieties.
                if not any([x in pdct_name_on_eretailer.lower() for x in
                            ['zinfandel', 'chardonnay', 'sauvignon', 'pinot', 'merlot', 'syrah']]):
                    return {'brand': brand, 'score': score}
            else:
                return {'brand': brand, 'score': score}
        elif verbose:
            print("Score is too low for: ", pdct_name_on_eretailer, {'brand': brand, 'score': score})
        return {'brand': None, 'score': 0}