Example #1
class FuzzyBaseIndex(object):
    def __init__(self, field=None, similarity=None, base=None, **kw):
        super(FuzzyBaseIndex, self).__init__(**kw)
        self.fuzz = FuzzySet(rel_sim_cutoff=1., use_levenshtein=False)
        self.content = {}
        self.field = field
        self.similarity = similarity
        self.base = base

    def add(self, x, i):
        self.fuzz.add(x)
        if x not in self.content:
            self.content[x] = set()
        self.content[x].add(i)

    def finalize(self):
        pass

    def search(self, x, top=25, debug=True):
        results = self.fuzz.get(x) or []  # get() returns None when nothing matches
        ret = []
        for r in results:
            for i in self.content[r[1]]:
                sim = self.similarity(x, r[1])
                ret.append((i, r[0], sim))
        ret = sorted(ret, key=lambda x: x[2], reverse=True)
        ret = ret[:top]
        return ret
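A minimal usage sketch of the index above; the `ratio` similarity callable and the sample strings are illustrative assumptions, not part of the original code.

# illustrative usage; `ratio` stands in for the `similarity` callable
from difflib import SequenceMatcher

def ratio(a, b):
    return SequenceMatcher(None, a, b).ratio()

idx = FuzzyBaseIndex(field="name", similarity=ratio)
idx.add("new york", 0)
idx.add("newark", 1)
print(idx.search("new yrok", top=5))  # e.g. [(0, fuzzy_score, similarity)]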
Example #2
from fuzzyset import FuzzySet
from fuzzywuzzy import fuzz


def fuzzyset_alg(key, key_list):
    finder = FuzzySet()
    finder.add(key)
    candidates = list()
    for i in key_list:
        try:
            added = [i]
            # indexing raises KeyError when the match score is below the cutoff
            matched = finder[i]
            added.extend(*matched)
            del added[-1]  # drop the matched key, keep only the score
            added[1] *= 100  # convert the score to a percentage
            # added[0]: the candidate key, added[1]: match percentage
            candidates.append(added)
        except KeyError:
            pass
    # sort by score
    candidates.sort(key=lambda x: x[1], reverse=True)

    # keep the top 10 candidates
    top_candi = candidates[:10]
    # re-score with fuzz.ratio and sort again
    finalist = [[i[0], fuzz.ratio(key, i[0])] for i in top_candi]
    finalist.sort(key=lambda x: x[1], reverse=True)
    del finder, candidates, top_candi
    if len(finalist) > 0:
        return finalist[:3]
    else:
        return []
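An illustrative call of the helper above; the key strings are made up, and `fuzz` is the fuzzywuzzy scorer imported above.

# returns up to three [candidate, fuzz.ratio score] pairs, best first
print(fuzzyset_alg("michael jordan",
                   ["michael jordon", "micheal jordan", "larry bird"]))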
Example #3
def run_profile(impl):
    if impl == "cFuzzySet":
        f = cFuzzySet()
    else:
        f = FuzzySet()
    with gzip.GzipFile(os.path.join(here, '..', 'cities.gz')) as input_file:
        for line in input_file:
            f.add(line.rstrip().decode())
    print(f)
    cProfile.runctx("profiler(f)", globals(), locals(), "Profile.prof")

    s = pstats.Stats("Profile.prof")
    s.strip_dirs().sort_stats("time").print_stats()
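Assuming the surrounding script defines `here` and the `profiler` helper, the profiling run would be kicked off with something like:

run_profile("cFuzzySet")  # or run_profile("FuzzySet") for the pure-Python implementation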
Example #4
class ListBasedPlaceExtractionService(NERService):

    def __init__(self, dist_file=CommonConstants.INDIA_DIST_NAMES):
        super().__init__()
        self.old_names = {'bangalore': 'bengaluru', 'gurgaon': 'gurugram', 'calcutta': 'kolkata', 'prayagraj': 'allahabad', 'delhi': 'delhi'}
        self.fd = FuzzySet()
        self.set = set()
        with open(dist_file) as df:
            reader = csv.reader(df)
            header = next(reader)
            for row in reader:
                if 'rural' in row[1].lower() or 'urban' in row[1].lower() or 'dehat' in row[1].lower():
                    alternate = ' '.join(row[1].split(' ')[:-1]).lower()
                    self.fd.add(alternate)
                    self.set.add(alternate)
                    continue
                self.fd.add(row[1].lower())
                self.set.add(row[1].lower())
        self.nlp = stanza.Pipeline(lang='en', processors='tokenize', use_gpu=False)

    def extract_entities_from_text(self, text):
        doc = self.nlp(text)
        closest_match = (0,None)
        '''
        for token in doc.ents:
            tok_text = token.text.lower()
            closest_dist = self.fd.get(tok_text)
            if closest_dist and len(closest_dist):
                closest = closest_dist[0]
                if closest[0] > closest_match[0]:
                    closest_match = closest
        if closest_match[0] > 0.5:
            return closest_match[1]
        '''
        for sent in doc.sentences:
            for token in sent.tokens:
                tok_text = token.text.lower()
                if tok_text in self.set:
                    return tok_text
                try:
                    return self.old_names[tok_text]
                except KeyError:
                    continue
Example #5
def get_nutrition_data(image_class):
    url = "https://api.nal.usda.gov/ndb/search/?format=json&q=" + image_class + "&sort=n&max=25&offset=0&api_key=FLKBoKOh7C1apAA4bPL0jH4GAW6f2wS9Lw0a2iFu"
    r = requests.get(url).json()

    max_dist_ratio = 0
    ndbno = 0
    for item in r["list"]["item"]:
        fs = FuzzySet()
        fs.add(image_class)
        matches = fs.get(item["name"])
        if not matches:  # get() returns None when nothing matches
            continue
        ratio = matches[0][0]

        if ratio > max_dist_ratio:
            max_dist_ratio = ratio
            ndbno = item["ndbno"]

    print(ndbno)

    nutrition_url = "https://api.nal.usda.gov/ndb/V2/reports?ndbno=" + str(ndbno) + "&type=f&format=json&api_key=FLKBoKOh7C1apAA4bPL0jH4GAW6f2wS9Lw0a2iFu"
    nutrition_data = requests.get(nutrition_url).json()

    nutrition_facts = {}
    nutrients = nutrition_data["foods"][0]["food"]["nutrients"]

    nutrition_facts["serve_size"] = str(
        nutrients[0]["measures"][0]["qty"]) + " ounces"
    nutrition_facts["kcal"] = str(
        nutrients[0]["measures"][0]["value"]) + " calories"
    nutrition_facts["fat"] = str(
        nutrients[2]["measures"][0]["value"]) + " grams"
    nutrition_facts["carbs"] = str(
        nutrients[3]["measures"][0]["value"]) + " grams"
    nutrition_facts["protein"] = str(
        nutrients[1]["measures"][0]["value"]) + " grams"
    nutrition_facts["sugar"] = str(
        nutrients[4]["measures"][0]["value"]) + " grams"
    nutrition_facts["sodium"] = str(
        nutrients[5]["measures"][0]["value"]) + " milligrams"

    return nutrition_facts
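An illustrative call; whether it succeeds depends on the USDA NDB API still serving this endpoint and on the embedded API key being valid.

print(get_nutrition_data("cheddar cheese"))
# e.g. {'serve_size': '... ounces', 'kcal': '... calories', 'fat': '... grams', ...}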
Example #6

# list of countries from C19 data
c19ctrs = c19.columns.tolist()

# list of countries from population data
popsctrs = pops['Country'].tolist()

# geo data is king, we need to match everything else to it
# the country name becomes the key matching the tables / dictionaries

# some countries in c19 do not match any country in geo data
# let's print fuzzy matches
fzs = FuzzySet()
for c in geoctrs:
    fzs.add(c)

#for c in c19ctrs:
#    if c not in geoctrs:
#        print(c, fzs.get(c))


c19notfound = [
    'Andorra', 'Antigua and Barbuda', 'Bahrain', 'Barbados', 'Cabo Verde',
    'Comoros', 'Diamond Princess', 'Dominica', 'Grenada', 'Holy See',
    'Liechtenstein', 'MS Zaandam', 'Maldives', 'Mauritius', 'Monaco',
    'Saint Kitts and Nevis', 'Saint Lucia', 'Saint Vincent and the Grenadines',
    'San Marino', 'Sao Tome and Principe', 'Seychelles', 'Singapore'
]
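A hedged sketch of how the fuzzy set built above could propose geo-data spellings for the unmatched COVID-19 country names; the 0.7 threshold is an arbitrary choice for illustration.

for c in c19notfound:
    matches = fzs.get(c)
    if matches and matches[0][0] >= 0.7:
        print(c, '->', matches[0][1], round(matches[0][0], 2))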
Example #7
class MiniBaseIndex(object):
    def __init__(self,
                 field=None,
                 tokenizer=None,
                 similarity=None,
                 base=None,
                 idf_limit=0.05,
                 **kw):
        super(MiniBaseIndex, self).__init__(**kw)
        self.content = {}
        self.field = field
        self.tokenizer = tokenizer
        self.similarity = similarity
        self.base = base
        self.counts = {}
        self.fuzzwords = FuzzySet(rel_sim_cutoff=0.7, use_levenshtein=False)
        self.blacklist = set()
        self.idf_limit = idf_limit

    def add(self, tok, i):
        if tok in self.blacklist:
            return
        if tok not in self.content:
            self.content[tok] = set()
            self.counts[tok] = 0
        self.content[tok].add(i)
        self.counts[tok] += 1
        # if self.counts[tok]/len(self.base.entries) > self.idf_limit:
        #     self.blacklist.add(tok)
        #     del self.counts[tok]
        #     del self.content[tok]
        self.fuzzwords.add(tok)

    def finalize(self):
        # nothing to finalize: tokens are added to self.fuzzwords incrementally in add()
        pass

    def search(self, x, expl=5000, top=25, maxtok=250, debug=False):
        tokenizer = self.tokenizer
        xtoks = tokenizer(x)
        # maxtok = maxtok * len(xtoks)
        results = {}
        # collect all toks
        alltoks = []
        alltoks_set = set()
        for xtok in xtoks:
            # get() returns None when no token matches, so fall back to an empty list
            for xtok_fuzz_score, xtok_fuzz_tok in (self.fuzzwords.get(xtok) or []):
                xtok_fuzz_sim = self.similarity(xtok, xtok_fuzz_tok)
                if xtok_fuzz_tok not in alltoks_set:
                    alltoks.append(
                        (xtok_fuzz_score, xtok_fuzz_tok, xtok_fuzz_sim))
                    alltoks_set.add(xtok_fuzz_tok)
        # alltoks = list(alltoks)
        # sort together by fuzziness
        alltoks = sorted(alltoks,
                         key=lambda x: x[2] * 100 + 1 / self.counts[x[1]],
                         reverse=True)
        # take maxtok only
        if debug:
            print(len(alltoks), maxtok)
            for tok in alltoks:
                print(tok, self.counts[tok[1]])
        alltoks = alltoks[:maxtok]
        # sort by inverse frequency
        # alltoks = sorted(alltoks, key=lambda x: self.counts[x[1]])
        # alltoksset = set(alltoks)
        for xtok_fuzz_score, xtok_fuzz_tok, xtok_fuzz_sim in alltoks:
            for _id in self.content[xtok_fuzz_tok]:
                if _id not in results:
                    results[_id] = 0
                results[_id] += xtok_fuzz_score
                if len(results) > expl:
                    break
            if len(results) > expl:
                break
        if debug:
            print(len(results))
        results = [(res[0], res[1],
                    self.similarity(x, self.base.entries[res[0]][self.field]))
                   for res in results.items()]

        def sortkey(x):
            entid = x[0]
            pop = self.base.entries[entid]["pop"]
            sim = x[2]
            return sim * 1e2 + pop * 1e-3

        results = sorted(results, key=sortkey, reverse=True)
        results = results[:top]
        return results
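A minimal sketch of wiring the index up; the tokenizer, similarity function, and the tiny `base` object below are illustrative assumptions, not the project's real base class.

from difflib import SequenceMatcher
from types import SimpleNamespace

base = SimpleNamespace(entries=[
    {"name": "new york city", "pop": 8400000},
    {"name": "newark", "pop": 310000},
])
idx = MiniBaseIndex(field="name",
                    tokenizer=str.split,
                    similarity=lambda a, b: SequenceMatcher(None, a, b).ratio(),
                    base=base)
for i, entry in enumerate(base.entries):
    for tok in entry["name"].split():
        idx.add(tok, i)
idx.finalize()
print(idx.search("new yrok"))  # e.g. [(entry_id, summed_fuzz_score, similarity), ...]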
Example #8
    def _learn(self, command):
        fs = FuzzySet(self.function_names)
        fs.add(command)
        print("No command found! Please input commands")
        commands = raw_input()
        commandArray = commands.split('.')
Example #9
class MovedBlocksDetector(object):
    def __init__(self, removed_lines_dicts, added_lines_dicts):
        self.removed_lines = []
        self.trim_text_to_array_of_added_lines = defaultdict(list)
        self.added_file_name_to_line_no_to_line = defaultdict(dict)
        self.removed_file_name_to_line_no_to_line = defaultdict(dict)
        self.added_lines_fuzzy_set = FuzzySet()

        for added_line_dict in added_lines_dicts:
            line = Line.from_dict(added_line_dict)
            self.trim_text_to_array_of_added_lines[line.trim_text].append(line)
            self.added_lines_fuzzy_set.add(line.trim_text)
            self.added_file_name_to_line_no_to_line[line.file][line.line_no] = line

        for removed_line_dict in removed_lines_dicts:
            line = Line.from_dict(removed_line_dict)
            self.removed_lines.append(line)
            self.removed_file_name_to_line_no_to_line[line.file][line.line_no] = line

    @staticmethod
    def from_diff(diff_text):
        parsed = diff_to_added_and_removed_lines(diff_text)
        return MovedBlocksDetector(parsed['removed_lines'], parsed['added_lines'])

    @measure_fun_time()
    def filter_out_block_inside_other_blocks(self, filtered_blocks: List[MatchingBlock]):
        filtered_blocks.sort(key=lambda fb: fb.get_filter_sort_tuple_for_remove())

        last_matching_block = None
        for matching_block in filtered_blocks:
            if last_matching_block is None:
                last_matching_block = matching_block
                continue
            if matching_block.last_removed_line.file == last_matching_block.last_removed_line.file \
                    and matching_block.first_removed_line.line_no >= last_matching_block.first_removed_line.line_no \
                    and matching_block.last_removed_line.line_no <= last_matching_block.last_removed_line.line_no:
                if matching_block.weighted_lines_count < last_matching_block.weighted_lines_count\
                   and matching_block.removed_lines_numbers.issubset(last_matching_block.removed_lines_numbers):
                    matching_block.remove_part_is_inside_larger_block = True
            else:
                last_matching_block = matching_block

        filtered_blocks.sort(key=lambda fb: fb.get_filter_sort_tuple_for_add())
        ok_blocks = []
        last_matching_block = None
        for matching_block in filtered_blocks:
            if getattr(matching_block, "remove_part_is_inside_larger_block", False):  # TODO: getattr mimics the original JavaScript behaviour - rewrite without getattr
                continue
            if last_matching_block is None:
                last_matching_block = matching_block
                ok_blocks.append(matching_block)
                continue
            if matching_block.last_added_line.file == last_matching_block.last_added_line.file \
                    and matching_block.first_added_line.line_no >= last_matching_block.first_added_line.line_no \
                    and matching_block.last_added_line.line_no <= last_matching_block.last_added_line.line_no\
                    and matching_block.weighted_lines_count < last_matching_block.weighted_lines_count\
                    and not matching_block.added_lines_numbers.issubset(last_matching_block.added_lines_numbers):
                pass
            else:
                last_matching_block = matching_block
                ok_blocks.append(matching_block)

        return ok_blocks

    def _filter_out_small_blocks(self, matching_blocks, min_lines_count):
        return [block for block in matching_blocks if block.weighted_lines_count >= min_lines_count and block.char_count >= 20]

    def _clear_not_matching_lines_at_end_and_filter_out_empty_blocks(self, matching_blocks):
        filtered_blocks = []
        for matching_block in matching_blocks:
            block_without_empty_end = matching_block.clear_empty_lines_at_end()
            if block_without_empty_end is not None:
                filtered_blocks.append(matching_block)
        return filtered_blocks

    def merge_blocks(self, block1, block2):
        new_block = MatchingBlock()
        new_block.lines.extend(block1.lines)
        new_block.lines.extend(block2.lines)
        # TODO what about lines between those 2 blocks?
        new_block.first_added_line = block1.first_added_line or block2.first_added_line
        new_block.first_removed_line = block1.first_removed_line or block2.first_removed_line
        new_block.last_added_line = block2.last_added_line or block1.last_added_line
        new_block.last_removed_line = block2.last_removed_line or block1.last_removed_line
        new_block.weighted_lines_count = block1.weighted_lines_count + block2.weighted_lines_count
        new_block.not_empty_lines = block1.not_empty_lines + block2.not_empty_lines
        new_block.char_count = block1.char_count + block2.char_count
        new_block.weighted_chars_count = block1.weighted_chars_count + block2.weighted_chars_count
        new_block.match_density = new_block.weighted_chars_count / new_block.char_count
        new_block.added_lines_numbers = block1.added_lines_numbers | block2.added_lines_numbers
        new_block.removed_lines_numbers = block1.removed_lines_numbers | block2.removed_lines_numbers
        return new_block

    @measure_fun_time()
    def join_nearby_blocks(self, matching_blocks: List[MatchingBlock], max_space_between=2):
        max_space_between += 1  # to allow 2 lines between blocks, the difference between line numbers must be 3
        blocks_grouped_by_files: Dict[tuple, List[MatchingBlock]] = defaultdict(list)
        for block in matching_blocks:
            blocks_grouped_by_files[(block.file_removed, block.file_added)].append(block)
        blocks_after_merge: List[MatchingBlock] = []

        merged_blocks = 0
        for block_list in blocks_grouped_by_files.values():
            loops_made = 0
            block_list.sort(key=lambda block: (block.first_removed_line.line_no, -block.match_density))
            indexes_of_merged_blocks = set()
            merged_blocks_list = []
            for i in range(len(block_list)):
                block = block_list[i]
                for j in range(i+1, len(block_list)):
                    loops_made += 1
                    next_block = block_list[j]
                    if next_block.first_removed_line.line_no - block.last_removed_line.line_no > max_space_between:
                        break
                    elif (next_block.first_removed_line.line_no > block.last_removed_line.line_no
                            and next_block.first_added_line.line_no - block.last_added_line.line_no <= max_space_between
                            and next_block.first_added_line.line_no > block.last_added_line.line_no):
                        block = self.merge_blocks(block, next_block)
                        merged_blocks += 1
                        indexes_of_merged_blocks.add(i)
                        indexes_of_merged_blocks.add(j)
                if i in indexes_of_merged_blocks:
                    merged_blocks_list.append(block)
            for i in range(len(block_list)):
                if i not in indexes_of_merged_blocks:
                    blocks_after_merge.append(block_list[i])
            blocks_after_merge.extend(merged_blocks_list)
        return blocks_after_merge

    @measure_fun_time()
    def filter_blocks(self, matching_blocks, min_lines_count=None):
        if min_lines_count is None:
            min_lines_count = 2
        filtered_blocks = self._filter_out_small_blocks(matching_blocks, min_lines_count)
        filtered_blocks = self._clear_not_matching_lines_at_end_and_filter_out_empty_blocks(filtered_blocks)
        return self.filter_out_block_inside_other_blocks(filtered_blocks)

    def extend_matching_blocks_with_empty_added_lines_if_possible(self, currently_matching_blocks):
        for matching_block in currently_matching_blocks:
            while True:
                last_line = matching_block.last_added_line
                next_added_line = self.added_file_name_to_line_no_to_line[last_line.file].get(last_line.line_no + 1)
                if next_added_line and next_added_line.trim_text == '':
                    matching_block.extend_with_empty_added_line(next_added_line)
                else:
                    break

    def extend_matching_blocks_with_empty_removed_lines_if_possible(self, currently_matching_blocks: List[MatchingBlock]):
        extended_blocks = []
        not_extended_blocks = []
        for matching_block in currently_matching_blocks:
            last_line = matching_block.last_removed_line
            next_removed_line = self.removed_file_name_to_line_no_to_line[last_line.file].get(last_line.line_no + 1)
            if next_removed_line and next_removed_line.trim_text == '':
                matching_block.extend_with_empty_removed_line(next_removed_line)
                extended_blocks.append(matching_block)
            else:
                not_extended_blocks.append(matching_block)

        return extended_blocks, not_extended_blocks

    @measure_fun_time()
    def detect_moved_blocks(self, min_lines_count=None) -> List[MatchingBlock]:
        detected_blocks: List[MatchingBlock] = []
        currently_matching_blocks = []
        new_matching_blocks = []

        for removed_line in self.removed_lines:
            if removed_line.trim_text:
                min_match_score = 0.5 if len(removed_line.trim_text) > 2 else 0.35
                fuzzy_matching_pairs = self.added_lines_fuzzy_set.get(
                    removed_line.trim_text, default=None, exact_match_only=False, min_match_score=min_match_score
                )
                # iterate over currently_matching_blocks and try to extend them with empty lines
                self.extend_matching_blocks_with_empty_added_lines_if_possible(currently_matching_blocks)
            else:
                fuzzy_matching_pairs = [[1, '']]

            if not fuzzy_matching_pairs:
                continue

            for fuzz_pair in fuzzy_matching_pairs:
                match_probability, text = fuzz_pair
                added_lines = self.trim_text_to_array_of_added_lines[text]
                for added_line in added_lines:
                    line_extended_any_block = False
                    already_added = set()
                    for i, matching_block in enumerate(currently_matching_blocks):
                        if i in already_added:
                            continue
                        extended = matching_block.try_extend_with_line(removed_line, added_line, match_probability)
                        if extended:
                            new_matching_blocks.append(matching_block)
                            line_extended_any_block = True
                            already_added.add(i)

                    if not line_extended_any_block and removed_line.trim_text != '':
                        new_matching_blocks.append(MatchingBlock.from_line(removed_line, added_line, match_probability))
                    currently_matching_blocks = [matching_block for i, matching_block in
                                                 enumerate(currently_matching_blocks) if i not in already_added]

            if removed_line.trim_text == '':
                extended_blocks, not_extended_blocks = \
                    self.extend_matching_blocks_with_empty_removed_lines_if_possible(currently_matching_blocks)
                new_matching_blocks.extend(extended_blocks)
                currently_matching_blocks = not_extended_blocks

            for matching_block in currently_matching_blocks:
                detected_blocks.append(matching_block)

            currently_matching_blocks = new_matching_blocks
            new_matching_blocks = []

        for matching_block in currently_matching_blocks:
            detected_blocks.append(matching_block)

        detected_blocks = self.join_nearby_blocks(detected_blocks)
        filtered_blocks = self.filter_blocks(detected_blocks, min_lines_count)
        logger.info(f'Detected {len(filtered_blocks)} blocks ({len(detected_blocks) - len(filtered_blocks)} filtered)')
        return filtered_blocks
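A hedged usage sketch, assuming the surrounding module (Line, MatchingBlock, diff_to_added_and_removed_lines, logger) is importable and `diff_text` holds a unified-diff string.

detector = MovedBlocksDetector.from_diff(diff_text)
for block in detector.detect_moved_blocks(min_lines_count=2):
    print(block.file_removed, '->', block.file_added, block.weighted_lines_count)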
Example #10
class TFIDFmatcher:
    def __init__(self, choices_corpus, ngram_range=(1, 2), use_cleaner=True, preprocess_func=None):
        """
        :param choices_corpus: should be a list of texts
        :param preprocess_func: is a str->str function
        """
        self.ngram_range = ngram_range
        self.use_cleaner = use_cleaner
        self.preprocess_func = preprocess_func

        self.initial_choices_corpus = choices_corpus
        if self.use_cleaner:
            choices_corpus = self.cleaner(choices_corpus)
        if self.preprocess_func:
            choices_corpus = [self.preprocess_func(k) for k in choices_corpus]

        self.tfidf = TfidfVectorizer(analyzer='word', sublinear_tf=True,  # strip_accents='ascii',
                                     lowercase=True, ngram_range=self.ngram_range, min_df=0).fit(choices_corpus)

        self.initial_corpus_tf_idf = self.tfidf.transform(choices_corpus)
        self.initial_corpus_tf_idf_dict = {}
        for k in range(len(choices_corpus)):
            self.initial_corpus_tf_idf_dict[choices_corpus[k]] = self.initial_corpus_tf_idf[k]
        self.vocabulary = self.tfidf.vocabulary_.keys()
        self.fset_vocabulary = FuzzySet()
        for brnd in self.vocabulary:
            self.fset_vocabulary.add(brnd)


    def cleaner(self, x, verbose=False):
        if verbose:
            print("Before cleaning", type(x), x)

        def cleaning_function(x):
            return clean_string(x).lower()

        if type(x) == list:
            x = [cleaning_function(el) for el in x]
        if type(x) in [str]:
            x = cleaning_function(x)
        if verbose:
            print("After cleaning", type(x), x)
        return x

    def extract(self, query, choices=None, limit=5, verbose=False):
        """
        :param choices should be a list of texts
        :param query: TODO add an input type checker
        :param processor: TODO : add a cleaning process
        :param scorer: TODO : Add other distances
        :return:
        """
        # print("---------------------------\n"
        # Get rid of this case
        if choices == []:
            return []

        if choices:
            choices = list(set(choices))

            # Clean the choices corpus
            initial_choices = choices
            if self.use_cleaner:
                choices = self.cleaner(choices)
            if self.preprocess_func:
                choices = [self.preprocess_func(elk) for elk in choices]
            choices_corpus = choices

            corpus_tf_idf = self.tfidf.transform(choices_corpus)
        else:
            initial_choices = self.initial_choices_corpus
            choices_corpus = self.initial_choices_corpus
            corpus_tf_idf = self.initial_corpus_tf_idf
            # print("Defaulting"

        if self.use_cleaner:
            query = self.cleaner(query)
        if self.preprocess_func:
            query = self.preprocess_func(query)

        # building fuzzy query
        new_query = []
        # print("Vocabulary", vocabulary)
        for q in query.split():
            if q in self.vocabulary:
                new_query.append(q)
            else:
                fset_get = self.fset_vocabulary.get(q)
                if fset_get:
                    tmp_score, new_q = fset_get[0]
                    if verbose:
                        print("Modified", q, new_q, tmp_score)
                    if tmp_score >= 0.80:
                        new_query.append(new_q)
        query = " ".join(new_query)
        if verbose:
            print("NEW QUERY", query)
        x = self.tfidf.transform([query])

        cosine_similarities = linear_kernel(x, corpus_tf_idf).flatten()
        related_docs_indices = cosine_similarities.argsort().flatten()
        if choices:
            result = [(choices_corpus[k], cosine_similarities[k].flatten()[0]) for k in related_docs_indices if choices_corpus[k]]
        else:
            result = [(choices_corpus[k], cosine_similarities[k].flatten()[0]) for k in related_docs_indices]
        result.sort(key=lambda tup: tup[1], reverse=True)  # sorts in place
        # print("Query", query, "\nChoices", choices, "\nResult", result
        result = [(initial_choices[choices_corpus.index(k[0])], k[1]) for k in result]
        # print("Query", query, "\nChoices", choices, "\nResult", result
        if limit:
            return result[0:limit]
        return result

    def export_vocabulary(self, vocabulary_csv_destination, choices_corpus=None):
        if not choices_corpus:
            choices_corpus = self.initial_choices_corpus

        if self.use_cleaner:
            choices_corpus = [clean_string(x).lower() for x in choices_corpus]

        cnt_vec = CountVectorizer(ngram_range=self.ngram_range)
        transformed_data = cnt_vec.fit_transform(choices_corpus)
        l = [{'word':k, 'freq':v} for k, v in zip(cnt_vec.get_feature_names(), np.ravel(transformed_data.sum(axis=0)))]
        df = pd.DataFrame(l)
        df = df[['word', 'freq']]
        df.sort_values('freq', ascending=False, inplace=True)
        df.to_csv(vocabulary_csv_destination, encoding='utf-8', index=False, sep=";", doublequote=True, quoting=csv.QUOTE_ALL)
        print('The vocabulary was exported at : ', vocabulary_csv_destination)
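An illustrative instantiation; `use_cleaner=False` sidesteps the external `clean_string` helper, the corpus is made up, and whether `min_df=0` is accepted depends on the scikit-learn version.

matcher = TFIDFmatcher(["coca cola zero", "pepsi max", "dr pepper"], use_cleaner=False)
print(matcher.extract("coca colla", limit=2))  # best TF-IDF matches with cosine scores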
Example #11
class BrandMatcher:
    def __init__(self, ngram_range=(1, 3)):
        """
        :param choices_corpus: should be a list of texts
        :param preprocess_func: is a str->str function
        """
        self.ngram_range = ngram_range
        choices_corpus = [str(x) for x in list(brands['brnd'].dropna().unique())]

        l = brands[['brnd', 'equivalents']].dropna().to_dict('records')
        self.equivalents = {}
        for el in l:
            for eq in el['equivalents'].split(';'):
                self.equivalents[eq.strip()] = el['brnd']

        choices_corpus.extend(self.equivalents.keys())

        self.initial_choices_corpus = choices_corpus
        self.cleaned_choices_corpus = self.cleaner(choices_corpus)

        self.tfidf = TfidfVectorizer(analyzer='word', sublinear_tf=True,  # strip_accents='ascii',
                                     lowercase=True, ngram_range=self.ngram_range, min_df=0).fit(self.cleaned_choices_corpus)

        self.initial_corpus_tf_idf = self.tfidf.transform(choices_corpus)
        self.initial_corpus_tf_idf_dict = {}
        for k in range(len(choices_corpus)):
            self.initial_corpus_tf_idf_dict[choices_corpus[k]] = self.initial_corpus_tf_idf[k]

        # Creating fuzzy set
        self.fset_brands = FuzzySet()
        for token in [str(x) for x in list(brands['brnd'].dropna().unique())]:
            self.fset_brands.add(token)

        self.fset_tokens = FuzzySet()
        for token in list(self.tfidf.vocabulary_):
            self.fset_tokens.add(token)

        # Prepare the japanese matching
        jp_brands = brands[['brnd', 'brnd_jp_clean']]
        jp_brands = jp_brands[jp_brands.brnd_jp_clean.notnull()]
        jp_brands['brnd_jp_clean'] = jp_brands['brnd_jp_clean'].apply(lambda x: unicodedata.normalize('NFKC', x.replace('・', '').replace(' ', '')))
        jp_brands['brnd_jp_size'] = jp_brands['brnd_jp_clean'].apply(lambda x: len(x))
        jp_brands.sort_values(['brnd_jp_size', 'brnd'], ascending=[False, False], inplace=True)
        self.jp_brands = jp_brands
        # jp_brands.to_excel('/tmp/jp_brands.xlsx')

    def cleaner(self, x, verbose=False):
        if verbose:
            print("Before cleaning", type(x), x)

        def cleaning_function(x):
            return clean_string(x).lower()

        if type(x) == list:
            x = [cleaning_function(str(el)) for el in x]
        if type(x) in [str]:
            x = cleaning_function(x)
        if verbose:
            print("After cleaning", type(x), x)
        return x

    def extract(self, query, verbose=False):
        """
        :param choices should be a list of texts
        :param query: TODO add an input type checker
        :param processor: TODO : add a cleaning process
        :param scorer: TODO : Add other distances
        :return:
        """
        initial_choices = self.initial_choices_corpus
        choices_corpus = self.initial_choices_corpus
        corpus_tf_idf = self.initial_corpus_tf_idf
        query = self.cleaner(query)

        # building fuzzy query
        new_query = []

        for q in query.split():
            if verbose:
                print(q)
            fset_get = self.fset_tokens.get(q)
            if fset_get:
                tmp_score, new_q = fset_get[0]
                if verbose:
                    print("Modified", q, new_q, tmp_score)
                if tmp_score > 0.80:
                    new_query.append(new_q)
        query = " ".join(new_query)
        if verbose:
            print("NEW QUERY", query)

        x = self.tfidf.transform([query])

        cosine_similarities = linear_kernel(x, corpus_tf_idf).flatten()
        related_docs_indices = cosine_similarities.argsort().flatten()
        result = [(choices_corpus[k], cosine_similarities[k].flatten()[0]) for k in related_docs_indices]
        result = [(initial_choices[choices_corpus.index(k[0])], k[1]) for k in result]
        # correcting with fuzzyratio score between result and query
        # result = [(k[0], k[1] * 0.01 * 0.5 * (fuzz.token_set_ratio(k[0], query) + fuzz.ratio(k[0], query))) for k in result]
        # result = [(k[0], k[1]) for k in result]
        result.sort(key=lambda tup: tup[1], reverse=True)  # sorts in place

        if verbose:
            print("Query", query, "\nResult", result)
        max_score = max(result, key=itemgetter(1))[1]
        result = [k for k in result if k[1] == max_score]
        return result

    def find_brand(self, pdct_name_on_eretailer, special_country=None, verbose=False):

        if not pdct_name_on_eretailer:
            return {'brand': None, 'score': 0}
        assert special_country in ['JP', None]

        if bool(pattern_japanese_chinese_caracters.search(pdct_name_on_eretailer)) or special_country == 'JP':
            clean_jp_str = lambda x: unicodedata.normalize('NFKC', x.replace('・', '').replace(' ', '').replace('・', ''))
            clean_jp_name = clean_jp_str(pdct_name_on_eretailer)

            # Forbidden words:
            japanese_forbidden_words = [" shoulder ", ' bag ', '【CD】', "【SHM-CD】", 'dvd', 'helmet', 'rucksack',
                                        'daypack', 'daiken', 'ダイケン', "スリープスパ", 'リンゴビール', 'パターソン', 'ヘネシー澄子',
                                        ]
            clean_japanese_forbidden_words = [clean_jp_str(x).lower() for x in japanese_forbidden_words]
            # print(clean_jp_name, clean_japanese_forbidden_words)
            if any(x in clean_jp_name.lower() for x in clean_japanese_forbidden_words):
                return {'brand': None, 'score': 0}

            for br in self.jp_brands.to_dict(orient='records'):
                if br['brnd_jp_clean'] in clean_jp_name:
                    # print("clean_jp_name :", clean_jp_name, "candidate", br['brnd_jp_clean'])
                    return {'brand': br['brnd'], "score": 98.765}
            if "モエ " in pdct_name_on_eretailer and any(x in clean_jp_name for x in ["750", 'ml', 'cl']):
                return {'brand': "Moët & Chandon", "score": 98.765}
        # Ad-hoc rules
        if any([x in pdct_name_on_eretailer.lower() for x in ["moet ", "moët"]]) and 'dom p' in pdct_name_on_eretailer.lower():
            return {'brand': 'Dom Pérignon', 'score': 99}
        if any([x in pdct_name_on_eretailer.lower() for x in ["moet ", "moët"]]):
            return {'brand': 'Moët & Chandon', 'score': 99}
        if any([x in pdct_name_on_eretailer.lower() for x in ["clicquot"]]):
            return {'brand': 'Veuve Clicquot', 'score': 99}
        if any([x in pdct_name_on_eretailer.lower() for x in ["ruinart"]]):
            return {'brand': 'Ruinart', 'score': 99}

        # # Forbidden words:
        # forbidden_words = ['leinwand', "hamper ", ' hamper', ' poster', 'poster ', 'chocolates ', ' chocolates',
        #                    'truffle ', ' truffle', 'birthday cake', ' cake', 'candle', 'poplin', ' sheet ', ' bed ',
        #                    ' cover ', ' kimono', 'towel', 'dvd']
        # if any(x in pdct_name_on_eretailer.lower() for x in forbidden_words):
        #     return {'brand': None, 'score': 0}

        # Cleaning
        pdct_name_on_eretailer = pdct_name_on_eretailer.replace('–', ' ')
        pdct_name_on_eretailer = pdct_name_on_eretailer.replace('-', ' ')
        pdct_name_on_eretailer = pdct_name_on_eretailer.replace('_', ' ')
        pdct_name_on_eretailer = ' '.join(w for w in pdct_name_on_eretailer.split() if w)
        pdct_name_on_eretailer = pdct_name_on_eretailer.replace("'", "").replace('é', 'e').replace('Â', '').replace(
            'ë', 'e')
        # print(pdct_name_on_eretailer)
        candidates = self.extract(pdct_name_on_eretailer, verbose=verbose)
        if not candidates:
            return {'brand': None, 'score': 0}
        # print(candidates)
        # print("FIRST SCORE :", brand, score)
        # Post treatment
        clean_tokens = clean_string(pdct_name_on_eretailer).split()
        # s = FuzzySet()
        # s.add(candidate)
        # l = [deepcopy(s.get(ngram, candidate)) for ngram in ngrams]
        # l = [x[0][0] for x in l if type(x) == list]
        brand, score = candidates[0][0], 0
        for candidate in candidates:
            candidate_str = self.cleaner(candidate[0])
            candidate_str = " ".join(candidate_str.split()[:9])
            nb_token_candidate = len(candidate_str.split())
            ngrams = [" ".join(clean_tokens[start:start + length]) for start in range(len(clean_tokens))
                      for length in range(max(nb_token_candidate, min(4, len(clean_tokens) - start + 1)))]
            # print([("'" + ngram + "'", "'" + candidate + "'", fuzz.ratio(ngram, candidate)) for ngram in ngrams])
            l = [fuzz.ratio(ngram, candidate_str) for ngram in list(set(ngrams))]
            max_score = (max(l + [0])*0.01) ** 2
            if max_score > score:
                score = max_score
                brand = candidate[0]

        if brand in self.equivalents:
            brand = self.equivalents[brand]
        score = round(100 * score, 2)
        # print("SECOND SCORE :", brand, score)

        # Forbidden words
        if any([x in pdct_name_on_eretailer.lower() for x in ["poster", 'dvd']]):
            return {'brand': None, 'score': 0}

        if score >= 80:
            if brand in ['Mercier']:  # Add Krug ???
                if 'hampagne' in pdct_name_on_eretailer.lower():
                    return {'brand': brand, 'score': score}
            if brand in ["Krug"] and any([x.lower() in pdct_name_on_eretailer.lower() for x in ['butler']]):
                return {'brand': None, 'score': 0}
            elif brand == "Belvedere":
                if not any([x in pdct_name_on_eretailer.lower() for x in
                            ['zinfandel', 'chardonnay', 'sauvignon', 'pinot', 'merlot', 'syrah']]):
                    return {'brand': brand, 'score': score}
            else:
                return {'brand': brand, 'score': score}
        elif verbose:
            print("Score is too low for: ", pdct_name_on_eretailer, {'brand': brand, 'score': score})
        return {'brand': None, 'score': 0}
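An illustrative call; the class depends on a module-level `brands` DataFrame and helpers such as `clean_string`, so it only works inside that module.

matcher = BrandMatcher()
print(matcher.find_brand("Moet & Chandon Imperial Brut 75cl"))
# -> {'brand': 'Moët & Chandon', 'score': 99} via the ad-hoc "moet" rule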