Пример #1
0
def calcRelativity(csv_file, keyWord, data):
    """Score every entry of ``data`` against ``keyWord`` with fuzzy matching.

    :param csv_file: Not used by the computation; kept in the signature so
        existing callers keep working.
    :param keyWord: Query handed to ``process.extractWithoutOrder``.
    :param data: Choices to be scored against the query.
    :return: DataFrame with a single ``Relativity`` column holding the score
        (element 1) of each result, in the order the matcher yields them.
    """
    # The previous version built a fully stringified copy of ``csv_file`` that
    # was never read; that dead (and potentially expensive) work is dropped.
    scores = [match[1] for match in process.extractWithoutOrder(keyWord, data)]
    return pd.DataFrame({'Relativity': scores})
Пример #2
0
 def getConfidence(self, string, stringSet):
     """Print the results of extractOne, extract, extractBests and
     extractWithoutOrder for ``string`` against ``stringSet``.

     Purely a visual debugging aid: nothing is returned, and any error is
     reported on stdout instead of being raised.
     """
     try:
         print('extractone confidence: ', process.extractOne(string, stringSet))
         print('extract confidence: ', process.extract(string, stringSet))
         print('extractbests: ', process.extractBests(string, stringSet))
         # extractWithoutOrder is lazy; materialise it so the matches are
         # printed rather than the generator object's repr.
         print('extractwithoutorder: ',
               list(process.extractWithoutOrder(string, stringSet)))
     except Exception as e:
         # Deliberately best-effort: this helper must never break the caller.
         print('Error in getConfidence in StringHandling', e)
Пример #3
0
def match_tester2(test_user, list_db, name_param, doc_param, site_param,
                  name_function):
    """Score the first row of ``test_user`` against every entry of ``list_db``.

    Three fuzzy scores are computed per ``list_db`` entry (name, document
    number with ``fuzz.ratio``, country) and combined into a weighted sum.

    :param test_user: frame whose first row supplies ``name``,
        ``user_doc_number`` and ``country``.
    :param list_db: object exposing ``name``, ``doc``, ``country`` and
        ``counter`` collections to match against.
    :param name_param: weight of the name score.
    :param doc_param: weight of the document-number score.
    :param site_param: weight of the country score.
    :param name_function: callable applied to the ``counter`` column; its
        result multiplies the weighted name score.
    :return: DataFrame with columns ``name_score``, ``doc_score``,
        ``country_score``, ``counter`` and ``final_score``.
    """
    user = test_user.iloc[0]
    # ``user.name`` is Series.name, i.e. the row's index label, not the value
    # of a "name" column. Read the column when it exists and only fall back
    # to the label (the previous behaviour) when it does not.
    user_name = user['name'] if 'name' in user.index else user.name

    res_df = DataFrame()

    # Element 1 of every extractWithoutOrder result is the score.
    name_matches = DataFrame(
        process.extractWithoutOrder(str(user_name), list_db.name))
    res_df['name_score'] = name_matches[1]

    doc_matches = DataFrame(
        process.extractWithoutOrder(str(user.user_doc_number),
                                    list_db.doc,
                                    scorer=fuzz.ratio))
    res_df['doc_score'] = doc_matches[1]

    country_matches = DataFrame(
        process.extractWithoutOrder(str(user.country), list_db.country))
    res_df['country_score'] = country_matches[1]

    # NOTE(review): if list_db.counter is a Series this assignment aligns on
    # index labels while the scores above are positional; presumably list_db
    # carries a default 0..n-1 index - confirm against the caller.
    res_df['counter'] = list_db.counter

    weighted_name = res_df.name_score * name_param * name_function(
        res_df.counter)
    weighted_doc = res_df.doc_score * doc_param
    weighted_country = res_df.country_score * site_param
    res_df['final_score'] = weighted_name + weighted_doc + weighted_country
    return res_df
Пример #4
0
    def set_city_matches(self, cutoff):
        """Store in ``self.city_matches`` the street-table cities that fuzzily match ``self.city``.

        An empty city, the literal 'N/A', or any city containing 'No Match'
        yields an empty list. Otherwise the unique values of the 'City'
        column of ``self.streets_table`` are scored with
        ``fuzz.partial_ratio`` and ``cutoff`` is passed as the score cutoff;
        only the city names (not the scores) are kept.
        """
        city = self.city
        if not city or city == 'N/A' or 'No Match' in city:
            self.city_matches = []
            return

        candidates = self.streets_table['City'].unique()
        scored = process.extractWithoutOrder(city,
                                             candidates,
                                             scorer=fuzz.partial_ratio,
                                             score_cutoff=cutoff)
        self.city_matches = [name for name, _ in scored]
Пример #5
0
    async def qsearch(self, ctx, search_term: str):
        """
        Fuzzy-search the reciter list for ``search_term`` and reply with a numbered list.

        Reciter names come from ``get_surah_reciters()``; matches are requested
        with a score cutoff of 65. Sends '**No results.**' when nothing
        matches, otherwise an embed titled 'Search Results'.
        """
        reciter_list = await get_surah_reciters()
        names = [entry.name for entry in reciter_list]
        hits = process.extractWithoutOrder(search_term, names, score_cutoff=65)

        lines = []
        for position, hit in enumerate(hits, start=1):
            # Title-case each hyphen-separated part while keeping hyphens tight.
            pretty = hit[0].replace('-', ' - ').title().replace(' - ', '-')
            lines.append(f'\n{position}. {pretty}')
        listing = ''.join(lines)

        if not listing:
            await ctx.send('**No results.**')
            return

        embed = discord.Embed(title='Search Results', colour=0x006400, description=listing)
        await ctx.send(embed=embed)
Пример #6
0
def get_fuzzy_matches(search_term: str, term_counts: Dict[str, int],
                      cutoff_score: int) -> List[Tuple[str, int]]:
    """
    Collect the dictionary keys that resemble the search term, best first.

    Only the keys of ``term_counts`` take part in the matching; the counts
    are ignored. Any dictionary works, e.g. the one returned from
    get_vocab_counts().

    :param search_term: Term the dictionary keys are compared against
    :param term_counts: Dictionary whose keys are the candidate terms
    :param cutoff_score: Score cutoff passed to the matcher
    :return: Matches ordered by descending score (element 1 of each match)
    """
    def score_of(match):
        return match[1]

    ranked = list(
        extractWithoutOrder(search_term,
                            term_counts.keys(),
                            score_cutoff=cutoff_score))
    ranked.sort(key=score_of, reverse=True)
    return ranked
def match_string(string, documents):
    """
    Fuzzy-match ``string`` against every token of every document.

    :param string: query to look for
    :param documents: iterable of objects exposing a ``tokens`` iterable
    :return: the lazy result of ``fuzzprocess.extractWithoutOrder`` run with a
        score cutoff of 95
    """

    def custom_full_process(token, **kwargs):
        # Use the token's ``text`` attribute when it can be read, otherwise
        # fall back to its string form, then apply the regular full_process.
        try:
            text = token.text
        except Exception:
            text = str(token)
        return full_process(text, **kwargs)

    corpus_tokens = []
    for doc in documents:
        corpus_tokens.extend(doc.tokens)

    return fuzzprocess.extractWithoutOrder(
        string, corpus_tokens, processor=custom_full_process, score_cutoff=95
    )
Пример #8
0
    def mash_list(self, buyers, short_threshold, long_threshold, short_size,
                  stemming):
        """Print, for each not-yet-seen buyer, the other buyers that fuzzily match it.

        Every buyer that has not already been recorded (as a buyer or as an
        earlier match) is compared with ``fuzz.token_sort_ratio`` against all
        other entries. When ``stemming`` is positive, only entries starting
        with the buyer's first ``stemming`` characters are compared. The score
        cutoff is ``long_threshold`` for names longer than ``short_size`` and
        ``short_threshold`` otherwise. Matches are printed best first as
        tab-separated ``buyer``, ``match``, ``score`` lines; nothing is returned.
        """
        seen = []
        for position, buyer in enumerate(buyers):
            if buyer in seen:
                continue
            seen.append(buyer)

            # Everyone except the buyer currently being checked.
            others = buyers[:position] + buyers[position + 1:]
            if stemming > 0:
                stem = buyer[:stemming]
                # Kept lazy; it is consumed by sorted() within this iteration.
                others = (name for name in others if name.startswith(stem))

            if len(buyer) > short_size:
                threshold = long_threshold
            else:
                threshold = short_threshold

            candidates = process.extractWithoutOrder(
                buyer,
                others,
                scorer=fuzz.token_sort_ratio,
                score_cutoff=threshold)
            ranked = sorted(candidates, key=lambda hit: hit[1], reverse=True)
            for hit in ranked:
                seen.append(hit[0])
                print('\t'.join([buyer, hit[0], str(hit[1])]))
Пример #9
0
def calcRelativity(keyWord, data):
    """Return a one-column DataFrame ('Relativity') with the fuzzy score of each entry of ``data`` against ``keyWord``.

    Scores are element 1 of each ``process.extractWithoutOrder`` result and
    keep the order in which the matcher yields them.
    """
    scores = []
    for match in process.extractWithoutOrder(keyWord, data):
        scores.append(match[1])
    return pd.DataFrame({'Relativity': scores})
Пример #10
0
 def find_matches(self, word, lst):
     """Return the entries of ``lst`` that fuzzily match ``word``.

     ``self.confidence_threshold`` is passed to the matcher as the score
     cutoff; only the matched entries (element 0 of each result) are kept.
     """
     hits = []
     for hit in process.extractWithoutOrder(
             word, lst, score_cutoff=self.confidence_threshold):
         hits.append(hit[0])
     return hits
Пример #11
0
def process_name_matching(df, desc):
    """Pick the candidate checkout whose cleaned name best matches ``desc``.

    ``df`` holds the candidate checkouts; the columns ``name_clean``,
    ``checkout_id`` and ``date_of_transfer`` are read. A name is an exact
    match when it is non-empty, is a substring of ``desc`` and its length
    relative to ``desc`` exceeds ``exact_name_match__min_ratio``.

    Situations handled:
      1    no candidates              -> chkoutid 'Amount / Bank wrong'
      2.1  no exact, no fuzzy winner  -> chkoutid None
      2.2  no exact, one fuzzy winner -> scalar result fields
      2.3  no exact, tied winners     -> list-valued result fields
      3    one exact match            -> that checkout
      4    several exact matches      -> chkoutid 'Many names found'

    Side effect: in situation 2 the columns ``simple_ratio``,
    ``partial_ratio``, ``sort_ratio``, ``set_ratio`` and ``max_ratio`` are
    added to ``df``.

    :param df: candidate checkouts (may be empty).
    :param desc: statement description the names are compared against.
    :return: 14-tuple ``(chkoutid, pmatch_name, pmatch_time,
        pmatch_max_score, pmatch_simple_score, pmatch_partial_score,
        pmatch_sort_score, pmatch_set_score, candidate_chkout,
        candidate_names, p0_scores, p1_scores, p2_scores, p3_scores)``;
        fields that do not apply to the situation are ``None``.
    """
    # Every output defaults to None; each situation fills in what it knows.
    chkoutid = None
    pmatch_name = pmatch_time = pmatch_max_score = None
    pmatch_simple_score = pmatch_partial_score = None
    pmatch_sort_score = pmatch_set_score = None
    candidate_chkout = candidate_names = None
    p0_scores = p1_scores = p2_scores = p3_scores = None

    if len(df.index) == 0:
        # Situation 1 -- no candidate checkouts
        chkoutid = u'Amount / Bank wrong'
    else:
        def _is_exact_match(name):
            # An empty name or description can never match; checking this
            # first also avoids dividing by zero on an empty description.
            if name == u'' or len(desc) == 0:
                return False
            long_enough = (len(name) / len(desc)) > exact_name_match__min_ratio
            return long_enough and name in desc

        df_exact = df[df[u'name_clean'].map(_is_exact_match)]

        if len(df_exact.index) == 0:
            # Situation 2 -- no exact match: fall back to four fuzzy ratios.
            names = df[u'name_clean'].tolist()
            scorers = (
                (u'simple_ratio', fuzz.ratio),
                (u'partial_ratio', fuzz.partial_ratio),
                (u'sort_ratio', fuzz.token_sort_ratio),
                (u'set_ratio', fuzz.token_set_ratio),
            )
            for column, scorer in scorers:
                # Assign a plain list so scores land positionally. Wrapping
                # them in pd.Series aligned them on a fresh 0..n-1 index,
                # which misplaces (or NaNs) the values whenever ``df`` is a
                # filtered frame with a different index.
                df[column] = [
                    match[1] for match in process.extractWithoutOrder(
                        desc, names, scorer=scorer)
                ]

            # Best of the four ratios per candidate
            ratio_columns = [column for column, _ in scorers]
            df[u'max_ratio'] = df[ratio_columns].max(axis=1)

            # Candidates above both cutoffs, then the ones tied for the top
            df_approx = df[(df[u'max_ratio'] > max_ratio_cutoff)
                           & (df[u'simple_ratio'] > simple_ratio_cutoff)]
            best_ratio = df_approx[u'max_ratio'].max()
            df_best = df_approx[df_approx[u'max_ratio'] == best_ratio]

            # Situation 2.1 (no suitable candidate) leaves the defaults as is.
            if len(df_best.index) != 0:
                # Rebind instead of inplace=True: df_best is a filtered slice.
                df_best = df_best.drop_duplicates(subset=u'checkout_id')
                single = len(df_best.index) == 1

                def _best(column):
                    # Situation 2.2 (one winner) -> scalar;
                    # Situation 2.3 (tied winners) -> list.
                    values = df_best[column]
                    return values.item() if single else values.tolist()

                chkoutid = _best(u'checkout_id')
                pmatch_name = _best(u'name_clean')
                pmatch_time = _best(u'date_of_transfer')  # proof upload time
                pmatch_max_score = _best(u'max_ratio')
                pmatch_simple_score = _best(u'simple_ratio')
                pmatch_partial_score = _best(u'partial_ratio')
                pmatch_sort_score = _best(u'sort_ratio')
                pmatch_set_score = _best(u'set_ratio')

            # All candidates (subset of amount & bank) with their scores
            candidate_chkout = df[u'checkout_id'].tolist()
            candidate_names = names
            p0_scores = df[u'simple_ratio'].tolist()
            p1_scores = df[u'partial_ratio'].tolist()
            p2_scores = df[u'sort_ratio'].tolist()
            p3_scores = df[u'set_ratio'].tolist()

        elif len(df_exact.index) == 1:
            # Situation 3 -- exactly one exact match (amount, bank & name)
            chkoutid = df_exact[u'checkout_id'].item()
            pmatch_name = df_exact[u'name_clean'].item()
            pmatch_time = df_exact[u'date_of_transfer'].item()
            pmatch_max_score = u'[EXACT MATCH: ARE YOU SURE]'
        else:
            # Situation 4 -- several exact matches
            chkoutid = u'Many names found'
            candidate_chkout = df_exact[u'checkout_id'].tolist()
            candidate_names = df_exact[u'name_clean'].tolist()

    return (chkoutid, pmatch_name, pmatch_time, pmatch_max_score,
            pmatch_simple_score, pmatch_partial_score, pmatch_sort_score,
            pmatch_set_score, candidate_chkout, candidate_names, p0_scores,
            p1_scores, p2_scores, p3_scores)
Пример #12
0
    def get_annotations(self, doc: Doc) -> (pd.DataFrame, pd.DataFrame):
        """
        Find the named entities of the dictionary in the text of ``doc``.

        Words matched by ``self._regex`` are grouped per sentence and the
        joined words are fuzzy-matched (``fuzz.token_set_ratio``, cutoff
        ``self._min_matching_score``) against the dictionary in ``self._data``.
        Every word of an accepted dictionary entry yields one annotation;
        consecutive words of the same entry are linked by a relation.

        :param doc: document providing the text and its existing annotations
        :returns a tuple of dataframes, the first contains the annotations the
            second contains relations between annotations
        :raises Exception: if the document has no sentence annotations
        """

        def _to_frame(rows, base_columns):
            # Declared columns come first (also for an empty result), followed
            # by any row key that is not declared there.
            if not rows:
                return pd.DataFrame(columns=base_columns)
            frame = pd.DataFrame(rows)
            columns = list(base_columns)
            columns += [col for col in frame.columns if col not in columns]
            return frame.reindex(columns=columns)

        old_annotations = doc.get_annotations()
        doc_text = doc.get_text()

        # Pre-label all words that occur in the dictionary. Rows are gathered
        # in a list and converted once: per-row DataFrame.append copies the
        # whole frame each time and was removed in pandas 2.0.
        match_rows = [{
            Annotation.BEGIN: match.start(),
            Annotation.END: match.end(),
            self.QUERY: match.group()
        } for match in self._regex.finditer(doc_text)]
        matches = pd.DataFrame(
            match_rows,
            columns=[Annotation.BEGIN, Annotation.END, self.QUERY])

        # Rows of the new annotations and relations tables
        annotation_rows = []
        relation_rows = []

        # Get all sentences from the document
        sentences = old_annotations[old_annotations[Annotation.LAYER] ==
                                    Layer.SENTENCE]

        # Labelling happens on sentence level, so without sentences we stop.
        if sentences.empty:
            raise Exception('No sentences available')

        for _, sentence in sentences.iterrows():
            sentence_begin = sentence[Annotation.BEGIN]
            sentence_end = sentence[Annotation.END]

            # Pre-labelled words that lie inside the sentence boundaries
            sentence_matches = matches[
                (matches[Annotation.BEGIN] >= sentence_begin)
                & (matches[Annotation.END] <= sentence_end)]

            matched_words_list = list(sentence_matches[self.QUERY])
            matched_word_string = self.WHITESPACE.join(matched_words_list)

            # Dictionary entries with at most as many words as were
            # pre-labelled in this sentence
            filtered_data = self._data[
                self._data[self.LENGTH] <= len(matched_words_list)]

            # Dict choices make the matcher return (query, score, index)
            queries = dict(
                zip(list(filtered_data.index),
                    list(filtered_data[self.QUERY])))
            fuzzy_matches = process.extractWithoutOrder(
                matched_word_string,
                queries,
                scorer=fuzz.token_set_ratio,
                score_cutoff=self._min_matching_score)

            # Map "begin:end" keys to the pre-labelled words of the sentence.
            # NOTE(review): the key is built with ':' but parsed with
            # self.SEP below - assumes self.SEP == ':'; TODO confirm.
            sentence_index = sentence_matches[Annotation.BEGIN].map(
                str) + ':' + sentence_matches[Annotation.END].map(str)
            sentence_queries = dict(zip(sentence_index, matched_words_list))

            for span, score, idx in fuzzy_matches:
                words = span.split(self.WHITESPACE)

                # Same for every word of this dictionary entry
                entry = filtered_data.loc[idx]
                feature = entry[Annotation.FEATURE]
                feature_val = entry[Annotation.FEATURE_VAL]

                # Index of the previous word's annotation, used as governor
                old_index = 0

                for word_index, word in enumerate(words):
                    # Find the sentence word corresponding to this entry word
                    word_match = process.extractOne(word, sentence_queries)
                    if word_match is None:
                        # Nothing to anchor this word to in the sentence
                        continue

                    # The key is "begin:end"; previously both values were
                    # read from position 0, so END always equalled BEGIN.
                    key_parts = word_match[2].split(self.SEP)
                    word_begin = int(key_parts[0])
                    word_end = int(key_parts[1])

                    annotation_rows.append({
                        Annotation.BEGIN: word_begin,
                        Annotation.END: word_end,
                        Annotation.LAYER: self._layer_name,
                        Annotation.FEATURE: feature,
                        Annotation.FEATURE_VAL: feature_val
                    })

                    # Row position of the annotation just added
                    current_idx = len(annotation_rows) - 1

                    # Multi-word entries are connected via relations
                    if len(words) > 1 and word_index > 0:
                        relation_rows.append({
                            Relation.GOV_ID: old_index,
                            Relation.DEP_ID: current_idx,
                            Relation.LAYER: self._layer_name,
                            Relation.BEGIN: word_begin,
                            Relation.END: word_end,
                            Relation.FEATURE: feature,
                            Relation.FEATURE_VAL: feature_val
                        })

                    old_index = current_idx

        new_annotations = _to_frame(annotation_rows, Annotation.COLS)
        new_relations = _to_frame(relation_rows, Relation.COLS)
        return new_annotations, new_relations
Пример #13
0
def find_potential_checkouts_v2(df_chkout, stmt_amt, stmt_bank, stmt_desc):
    """Find the checkout that a bank-statement line most likely belongs to.

    Candidates are the rows of ``df_chkout`` whose ``proof_amount`` equals
    ``stmt_amt`` and whose ``[A] script_bank_cat`` equals ``stmt_bank``
    (the "_ab" subset). They are narrowed to rows whose cleaned customer
    name is a substring of ``stmt_desc`` (the "_abn" subset):

      * no candidates            -> chkoutid 'Amount / Bank wrong'
      * exactly one exact name   -> that row's ``checkoutid``
      * no exact name            -> best fuzzy match (token_set_ratio,
        cutoff 50) plus the names and scores of three scorers
      * several exact names      -> chkoutid 'Many names found'

    :param df_chkout: checkouts with columns ``proof_amount``,
        ``[A] script_bank_cat``, ``[B] proof_cust_name_clean``, ``checkoutid``.
    :param stmt_amt: statement amount to match.
    :param stmt_bank: statement bank category to match.
    :param stmt_desc: statement description searched for the customer name.
    :return: 10-tuple ``(chkoutid, pmax_name, pmax_score, chkout_candidates,
        p0_names, p0_scores, p1_names, p1_scores, p2_names, p2_scores)``;
        fields that do not apply are ``None``.
    """
    name_col = '[B] proof_cust_name_clean'

    # Every output defaults to None; each situation fills in what it knows.
    chkoutid = pmax_name = pmax_score = chkout_candidates = None
    p0_names = p0_scores = p1_names = p1_scores = None
    p2_names = p2_scores = None

    # Step 1: filter potential checkouts by proof amount & bank.
    # .copy() so the name clean-up below writes to our own frame rather than
    # to a filtered slice (which triggers SettingWithCopyWarning).
    potential_chkouts_ab = df_chkout[
        (df_chkout['proof_amount'] == stmt_amt)
        & (df_chkout['[A] script_bank_cat'] == stmt_bank)].copy()

    if len(potential_chkouts_ab.index) == 0:
        # Situation 1: no amount/bank match
        chkoutid = 'Amount / Bank wrong'
    else:
        # Situation 2: amount & bank match; confirm using the name.
        # Missing names become '', names are lower-cased and quotes removed.
        potential_chkouts_ab[name_col] = potential_chkouts_ab[
            name_col].fillna('').str.lower().str.replace('\"', '')

        # Step 2: keep rows whose name appears in the description. '' is a
        # substring of every string, so empty (missing) names must be
        # excluded explicitly or they would count as exact matches.
        potential_chkouts_abn = potential_chkouts_ab[
            potential_chkouts_ab[name_col].map(
                lambda name: name != '' and name in stmt_desc)]

        exact_count = len(potential_chkouts_abn.index)
        if exact_count == 1:
            # Situation 2a: single match using amount, bank & exact name
            chkoutid = potential_chkouts_abn['checkoutid'].item()
        elif exact_count == 0:
            # Situation 2b (work in progress): candidates exist but none
            # matches exactly -> approximate match or no match at all.
            chkout_candidates = potential_chkouts_ab['checkoutid'].tolist()
            names = potential_chkouts_ab[name_col].tolist()

            pmax = process.extractOne(stmt_desc,
                                      names,
                                      scorer=fuzz.token_set_ratio,
                                      score_cutoff=50)
            if pmax is not None:
                pmax_name = pmax[0]
                pmax_score = pmax[1]
                try:
                    chkoutid = potential_chkouts_ab[
                        potential_chkouts_ab[name_col] ==
                        str(pmax_name)]['checkoutid'].item()
                except ValueError:
                    # Several candidates share the best name: ambiguous
                    chkoutid = None

            # Default scorer
            p0 = list(process.extractWithoutOrder(stmt_desc, names))
            p0_names = [match[0] for match in p0]
            p0_scores = [match[1] for match in p0]

            # token_sort_ratio
            p1 = list(
                process.extractWithoutOrder(stmt_desc,
                                            names,
                                            scorer=fuzz.token_sort_ratio))
            p1_names = [match[0] for match in p1]
            p1_scores = [match[1] for match in p1]

            # token_set_ratio
            p2 = list(
                process.extractWithoutOrder(stmt_desc,
                                            names,
                                            scorer=fuzz.token_set_ratio))
            p2_names = [match[0] for match in p2]
            p2_scores = [match[1] for match in p2]
        else:
            # Situation 2c: several exact name matches
            chkoutid = 'Many names found'

    return (chkoutid, pmax_name, pmax_score, chkout_candidates, p0_names,
            p0_scores, p1_names, p1_scores, p2_names, p2_scores)