Пример #1
0
def _match_percentage(str1: str, str2: str, score_cutoff: float = 0) -> float:
    """
    A wrapper around `rapidfuzz.fuzz.partial_ratio` that retries the
    comparison on sanitised copies of the inputs when the first attempt raises.

    `str` `str1` : a random sentence
    `str` `str2` : another random sentence
    `float` `score_cutoff` : minimum score required to consider it a match
        returns 0 when similarity < score_cutoff

    RETURNS `float`
    """

    # ! this will throw an error if either string contains a UTF-8 encoded emoji
    try:
        return fuzz.partial_ratio(str1, str2, score_cutoff=score_cutoff)

    # ! `Exception` (not a bare `except`) so that KeyboardInterrupt and
    # ! SystemExit are not swallowed by the fallback
    # ! we build new strings that contain only alphanumerical characters and spaces
    # ! and return the partial_ratio of that
    except Exception:
        new_str1 = "".join(each_letter for each_letter in str1
                           if each_letter.isalnum() or each_letter.isspace())

        new_str2 = "".join(each_letter for each_letter in str2
                           if each_letter.isalnum() or each_letter.isspace())

        return fuzz.partial_ratio(new_str1,
                                  new_str2,
                                  score_cutoff=score_cutoff)
    def check_dict(self, question):
        """
        Pick the anime and staff names that `question` refers to.

        Keywords are extracted from the question with
        `jieba.analyse.extract_tags`; each keyword is matched against
        `self.anime_list` and `self.staff_list` with `process.extractOne`.
        A best match scoring at least 70 becomes a "maybe" candidate, and it is
        selected outright when its `fuzz.partial_ratio` against the whole
        question exceeds 50. If nothing is selected outright, the user is asked
        on stdin to confirm every "maybe" candidate.

        Returns a tuple `(region_dict, selected)`: `selected` is the list of
        chosen names and `region_dict` maps each of them to
        `self.wdtype_dict.get(name)`.
        """
        txt_cut = "/".join(jieba.cut(cc.convert(question)))

        # The number of keywords requested scales with the segmented text length.
        top = int(len(txt_cut) / 3)
        keywords = jieba.analyse.extract_tags(question, topK=top)
        selected_anime = []
        selected_staff = []
        maybe = []
        for keyword in keywords:
            # Anime is checked before staff so `maybe` keeps the original order.
            for choices, bucket in ((self.anime_list, selected_anime),
                                    (self.staff_list, selected_staff)):
                best = process.extractOne(keyword, choices)
                # NOTE(review): extractOne presumably yields None when there is
                # nothing to choose from; guard instead of crashing on best[1].
                if best is None or best[1] < 70:
                    continue
                maybe.append(best[0])
                if fuzz.partial_ratio(best[0], question) > 50:
                    bucket.append(best[0])

        if selected_anime or selected_staff:
            selected = selected_anime + selected_staff
        else:
            # Build a new list rather than removing from `maybe` while looping
            # over it, which used to skip the candidate after each rejection.
            selected = [
                name for name in maybe
                if input("请问您说的是{0}吗?(是/否)\n".format(name)) != '否'
            ]
        region_dict = {i: self.wdtype_dict.get(i) for i in selected}
        return region_dict, selected
Пример #3
0
def test_partial_ratio(s1, s2):
    """
    Smoke-test `fuzz.partial_ratio` on one pair of inputs.

    The only thing verified is that the call finishes without raising; the
    score itself is discarded. Validating the value would need a trusted
    reference implementation, which is not available yet.
    """
    _ = fuzz.partial_ratio(s1, s2)
Пример #4
0
def match_percentage(str1: str, str2: str, score_cutoff: float = 0) -> float:
    '''
    `str` `str1` : a random sentence

    `str` `str2` : another random sentence

    `float` `score_cutoff` : minimum score required to consider it a match
                             returns 0 when similarity < score_cutoff

    RETURNS `float`

    A wrapper around `rapidfuzz.fuzz.partial_ratio` that retries the
    comparison on sanitised copies of the inputs when the first attempt raises.
    '''

    # ! this will throw an error if either string contains a UTF-8 encoded emoji
    try:
        return partial_ratio(str1, str2, score_cutoff=score_cutoff)

    # ! `Exception` (not a bare `except`) so that KeyboardInterrupt and
    # ! SystemExit are not swallowed by the fallback
    # ! we build new strings that contain only alphanumerical characters and spaces
    # ! and return the partial_ratio of that
    except Exception:
        # `join` over a generator avoids repeated string concatenation.
        new_str1 = ''.join(each_letter for each_letter in str1
                           if each_letter.isalnum() or each_letter.isspace())

        new_str2 = ''.join(each_letter for each_letter in str2
                           if each_letter.isalnum() or each_letter.isspace())

        return partial_ratio(new_str1, new_str2, score_cutoff=score_cutoff)
Пример #5
0
 def testIssue76(self):
     """Check `fuzz.partial_ratio` on two fixed string pairs (regression test, numbered 76 by its name)."""
     needle = "physics 2 vid"

     # The shorter haystack lacks the "vid" suffix, so the score is below 100.
     partial_overlap = fuzz.partial_ratio(needle, "study physics physics 2")
     self.assertAlmostEqual(partial_overlap, 81.81818, places=4)

     # The longer haystack contains the needle verbatim.
     full_overlap = fuzz.partial_ratio(needle,
                                       "study physics physics 2 video")
     self.assertEqual(full_overlap, 100)
Пример #6
0
    def get_cpe_matches(
            self,
            cert_name: str,
            cert_candidate_cpe_vendors: List[str],
            cert_candidate_versions: List[str],
            relax_version: bool = False,
            n_max_matches=10,
            threshold: int = 60) -> Optional[List[Tuple[float, CPE]]]:
        """
        Rank candidate CPE items by fuzzy similarity to a certificate name.

        Candidates come from `self.get_candidate_cpe_items` for the given
        vendors and versions. The certificate name, each candidate title and
        each candidate item name are lower-cased, stripped of '®' / '™' and
        have every non-word character replaced by a space. A candidate's score
        is the best of `token_set_ratio` and `partial_ratio` against its title
        and its item name.

        :param cert_name: certificate name to match.
        :param cert_candidate_cpe_vendors: vendors used to fetch candidates.
        :param cert_candidate_versions: versions used to fetch candidates.
        :param relax_version: when False and nothing matched, the search is
            retried once with the version list ['-'].
        :param n_max_matches: maximum number of matches returned.
        :param threshold: a candidate is kept only if its score is strictly
            greater than this value.
        :return: up to `n_max_matches` `(score, candidate)` tuples, best first,
            or None when nothing scored above the threshold.
        """
        non_word = re.compile(r"(?ui)\W")

        def sanitize_matched_string(string: str):
            lowered = string.replace('®', '').replace('™', '').lower()
            return non_word.sub(' ', lowered)

        candidates = self.get_candidate_cpe_items(cert_candidate_cpe_vendors,
                                                  cert_candidate_versions)

        wanted = sanitize_matched_string(cert_name)
        scored = []
        for cpe in candidates:
            title = sanitize_matched_string(cpe.title)
            item_name = sanitize_matched_string(cpe.item_name)
            potential = max(
                fuzz.token_set_ratio(wanted, title),
                fuzz.partial_ratio(wanted, title),
                fuzz.token_set_ratio(wanted, item_name),
                fuzz.partial_ratio(wanted, item_name),
            )
            if potential > threshold:
                scored.append((potential, cpe))

        if scored:
            # Stable sort on the score only; ties keep candidate order.
            scored.sort(key=operator.itemgetter(0), reverse=True)
            # Idea kept from the original: short item names could be filtered
            # out here to reduce false positives.
            return scored[:n_max_matches]

        if relax_version:
            # The relaxed retry has already happened; give up.
            return None

        return self.get_cpe_matches(cert_name,
                                    cert_candidate_cpe_vendors, ['-'],
                                    relax_version=True,
                                    n_max_matches=n_max_matches,
                                    threshold=threshold)
Пример #7
0
def partial_ratio_filter_2(claim_docs):
    """
    Rank retrieved doc ids by fuzzy similarity to the claim and keep the best five.

    The cleaned claim is compared with each cleaned doc id (the doc title)
    using fuzz partial ratio. Taking the top-ranked docs rather than applying
    a fixed cutoff is a heuristic for claims whose keywords are spread out and
    therefore score low against their gold evidence docs.

    :param claim_docs: Tuple that holds (claim, docs)
    claim: the claim to compare against doc titles
    docs: List of doc ids that were retrieved after the first filter
    :return filtered_docs: List of at most 5 doc ids with the highest partial
    ratios, best first
    """
    claim, retrieved_docs = claim_docs
    # NOTE(review): the handle is never queried in this function; it is still
    # opened and closed to keep the existing side effects. Confirm it is needed.
    db = DocDB("../project_data/wiki_docs_skimmed.db")
    try:
        compare_claim = clean_text(claim)
        partial_ratios = [(doc_id,
                           fuzz.partial_ratio(compare_claim,
                                              clean_text(doc_id)))
                          for doc_id in retrieved_docs]
        ordered_partial_ratios = sorted(partial_ratios,
                                        key=lambda pair: pair[1],
                                        reverse=True)
        # Slicing already copes with fewer than five entries.
        return [pair[0] for pair in ordered_partial_ratios[:5]]
    finally:
        # Close the handle even when cleaning or scoring raises.
        db.close()
Пример #8
0
def partial_ratio_filter_1(claim_gold_pair):
    """
    Shortlist documents whose title fuzzily matches the claim.

    Every doc id loaded from the pickled id list is cleaned and compared with
    the cleaned claim using fuzz partial ratio; ids scoring above 75 are kept
    and then passed, together with the raw claim, to
    `sentence_similarity_filter_2`.

    :param claim_gold_pair: Tuple that holds (claim, gold_evidence)
    claim: the claim to compare against doc titles
    gold_evidence: The evidence set from training, passed through untouched so
    results stay aligned when used with multiprocessing
    :return (docs, gold_evidence)
    docs: whatever `sentence_similarity_filter_2` returns for the shortlist
    gold_evidence: the same object that was passed in
    """
    claim, gold_evidence = claim_gold_pair
    with open("../data/wiki_doc_skimmed_ids.obj", "rb") as id_file:
        all_ids = pickle.load(id_file)
    target = clean_text(claim)
    # 75 is a tunable cutoff: it depends on how high recall should be here.
    shortlisted = [
        doc_id for doc_id in all_ids
        if fuzz.partial_ratio(target, clean_text(doc_id)) > 75
    ]
    return sentence_similarity_filter_2((claim, shortlisted)), gold_evidence
Пример #9
0
def package_to_merge(decrypted_package, decrypted_packages, known_packages):
    """
    Collect the packages that should be merged with `decrypted_package`.

    A package from `decrypted_packages` is considered only if its uuid is not
    in `known_packages`. It qualifies when its name has a partial ratio above
    55 with the reference package's name, or when its name contains
    "Verschiedene Dateien" or "Various files".

    Returns a one-element list `[[titles, uuids, linkids]]` holding the names,
    uuids and flattened link ids of all qualifying packages, in input order.
    """
    reference_name = decrypted_package['name']
    titles, uuids, linkids = [], [], []

    for candidate in decrypted_packages:
        if candidate['uuid'] in known_packages:
            continue
        name = candidate['name']
        if (fuzz.partial_ratio(reference_name, name) > 55
                or "Verschiedene Dateien" in name
                or "Various files" in name):
            titles.append(name)
            uuids.append(candidate['uuid'])
            linkids.extend(candidate['linkids'])

    # The result always has exactly one entry, so no ordering step is needed.
    return [[titles, uuids, linkids]]
Пример #10
0
def binary_fuzzy_match(pat, txt, threshold, local=1):
    """
    Decide whether a pattern fuzzily occurs in a longer text.

    The pattern does not have to match character for character; how many
    mismatches are tolerated is controlled by `threshold`.

    Args:
        pat (str): The shorter text to search for.

        txt (str): The larger text to search within.

        threshold (float): Value between 0 and 1; it is scaled by 100 and
            compared with the similarity score.

        local (int, optional): Alignment method. 1 scores with
            `fuzz.partial_ratio` (local); any other value scores with
            `fuzz.ratio` (global).

    Returns:
        bool: True if the score reaches the threshold, False otherwise.
        Always False when the pattern is longer than the text.
    """
    # A pattern longer than the text can never be found inside it.
    if len(pat) > len(txt):
        return False
    scorer = fuzz.partial_ratio if local == 1 else fuzz.ratio
    return scorer(pat, txt) >= threshold * 100
Пример #11
0
    def get_combined_fuzz_score(self, a, b, mode='geom_mean'):
        """
        Combine a whole-string and a partial fuzzy score for two names.

        Both names are passed through `clean_name`. `fuzz.ratio` is weighted by
        `self.weight['simple']` and `fuzz.partial_ratio` by
        `self.weight['partial']`; the two weighted scores are handed to
        `self.combine_scores` together with `mode`, and its result is returned.
        """
        cleaned_a = clean_name(a)
        cleaned_b = clean_name(b)

        whole_score = fuzz.ratio(cleaned_a, cleaned_b) * self.weight['simple']
        partial_score = (fuzz.partial_ratio(cleaned_a, cleaned_b) *
                         self.weight['partial'])

        return self.combine_scores(float(whole_score),
                                   float(partial_score),
                                   mode=mode)
Пример #12
0
    def __score_match(gpm_track: GpmTrack, spotify_track: SpotifyTrack) -> int:
        """
        Using the gpm_track as the reference object, get each of the gpm track's attributes and compare it with the
        spotify track using fuzzy matching, then get the average score over the attributes both tracks have.

        An attribute counts only when its value is not None on both tracks; a key missing from the spotify track
        is treated as None.

        Args:
            gpm_track (GpmTrack): gpm track we want to compare
            spotify_track (SpotifyTrack): Spotify Track we want to compare

        Returns:
            int: Average partial-ratio score over the common attributes, or 0 when there are none

        """

        gpm_dict: dict = vars(gpm_track)
        spotify_dict: dict = vars(spotify_track)

        scores = []
        for key, gpm_raw in gpm_dict.items():
            spotify_raw = spotify_dict.get(key)

            # Test the raw values: once passed through str(), a missing value
            # becomes the text "None" and would always count as present.
            if gpm_raw is None or spotify_raw is None:
                continue

            scores.append(
                fuzz.partial_ratio(str(gpm_raw).lower(),
                                   str(spotify_raw).lower()))

        # Nothing comparable: report no match instead of dividing by zero.
        if not scores:
            return 0

        return int(sum(scores) / len(scores))
Пример #13
0
    def guess_header(self, header, threshold=80):
        """ Try to match a header (from the file) to a list of known headers

        Matching is attempted in order: exact, case-insensitive, then fuzzy
        (`fuzz.partial_ratio` strictly above `threshold`, best score wins and
        the earliest header wins a tie).

        Args:
            header - Header name to look for (None is treated as an empty string)
            threshold - Match threshold for fuzzy search

        Returns:
            The matching entry of self.HEADERS, or None if nothing matched
        """

        # A missing header would otherwise crash on `.lower()` below
        if header is None:
            header = ''

        # Try for an exact match
        for h in self.HEADERS:
            if h == header:
                return h

        # Try for a case-insensitive match
        lowered = header.lower()
        for h in self.HEADERS:
            if h.lower() == lowered:
                return h

        # Finally, look for a close match using fuzzy matching
        best_header = None
        best_ratio = threshold

        for h in self.HEADERS:
            ratio = fuzz.partial_ratio(header, h)
            # Strict comparison keeps the first header among equal scores
            if ratio > best_ratio:
                best_header, best_ratio = h, ratio

        return best_header
Пример #14
0
 def _score_word(self, candidate, normalized_target):
     """
     Score one candidate word against a target and its homonyms.

     The candidate's text is normalized with `self._normalize`. Each homonym
     of `normalized_target` from `self._homonyms` (or just the target itself
     when it has no entry) is tried in order. A homonym is skipped when the
     candidate is too short relative to it, i.e. the length ratio is below
     `self.confidence_threshold`. Otherwise `fuzz.partial_ratio` is computed
     with a cutoff of `self.confidence_threshold * 100`.

     Returns `(score, location)` for the first homonym with a non-zero ratio,
     where `score` is the ratio divided by 100 and `location` is a
     `WordLocation` built from the candidate's geometry. Returns None when no
     homonym matches.
     """
     candidate_text = self._normalize(candidate.text)
     for homonym in self._homonyms.get(normalized_target,
                                       (normalized_target, )):
         length_ratio = float(len(candidate_text)) / len(homonym)
         if length_ratio < self.confidence_threshold:
             continue

         ratio = fuzz.partial_ratio(
             homonym,
             candidate_text,
             score_cutoff=self.confidence_threshold * 100)
         if not ratio:
             continue

         found_at = candidate_text.find(homonym)
         if found_at == -1:
             # No verbatim occurrence: report the whole word, no offsets.
             left_offset = 0
             right_offset = 0
             matched = candidate.text
         else:
             # Verbatim occurrence: include character offsets around it.
             match_end = found_at + len(homonym)
             left_offset = found_at
             right_offset = len(candidate.text) - match_end
             matched = candidate.text[found_at:match_end]

         location = WordLocation(left=int(candidate.left),
                                 top=int(candidate.top),
                                 width=int(candidate.width),
                                 height=int(candidate.height),
                                 left_char_offset=left_offset,
                                 right_char_offset=right_offset,
                                 text=matched)
         return (ratio / 100.0, location)
     return None
Пример #15
0
        def correct_postcode_from_uprn(record, addressbase_data):
            """
            Replace a record's postcode with the one stored for its UPRN when
            the addresses agree.

            The record's address and the address stored under
            `record["uprn"]` in `addressbase_data` are lower-cased, stripped
            of commas and compared with `fuzz.partial_ratio`. Only a score of
            100 triggers the replacement, which mutates `record` in place and
            is logged at INFO. Any lower score leaves the record untouched and
            logs a WARNING with a suggested manual fix.

            Returns True if the postcode was replaced, False otherwise.
            """
            addressbase_record = addressbase_data[record["uprn"]]
            match_quality = fuzz.partial_ratio(
                record["address"].lower().replace(",", ""),
                addressbase_record["address"].lower().replace(",", ""),
            )
            if match_quality >= 100:
                # Remember the old value first, otherwise the log line below
                # would show the new postcode on both sides.
                old_postcode = record["postcode"]
                record["postcode"] = addressbase_record["postcode"]
                self.logger.log_message(
                    logging.INFO,
                    "Replacing %s with %s for record:\n%s\n",
                    variable=(
                        old_postcode,
                        addressbase_record["postcode"],
                        record,
                    ),
                )
                return True

            # The suggestion is meant to be pasted as code, so the UPRN
            # literal needs its closing quote.
            self.logger.log_message(
                logging.WARNING,
                'Invalid postcode: %s\nAddressbase correction: %s\nSUGGESTION:\nif uprn == "%s":\n\trec["postcode"] = "%s"\n',
                variable=(
                    record["postcode"],
                    addressbase_record["postcode"],
                    record["uprn"],
                    addressbase_record["postcode"],
                ),
            )
            return False
Пример #16
0
def longest_subseq_min_identity(seq1, seq2, min_identity=1.0):
    """
    Finds the length of the longest contiguous (gapless) stretch of seq2 that
    matches seq1 with at least `min_identity` homology.

    With `min_identity == 1` a stretch matches when it occurs verbatim in
    seq1. Otherwise it matches when `fuzz.partial_ratio(stretch, seq1)` is at
    least `100 * min_identity`.

    The search slides a window of length `longest + 1` over seq2: a matching
    window raises `longest` by one (and the same start is retried with a
    window one longer), a non-matching window moves one position to the right.
    The window never leaves seq2 and `longest` never decreases.

    :param seq1: sequence searched in.
    :param seq2: sequence whose stretches are looked up in seq1.
    :param min_identity: required identity between 0 and 1.
    :return: length of the longest matching stretch (0 if there is none).
    """
    seq2_len = len(seq2)
    fuzz_cutoff = 100 * min_identity
    longest = 0
    start = 0

    # Invariant: the window seq2[start:start + longest + 1] lies fully inside
    # seq2, so a slice is never truncated or empty.
    while start + longest < seq2_len:
        query = seq2[start:start + longest + 1]

        if min_identity == 1:
            match = query in seq1
        else:
            match = fuzz.partial_ratio(query, seq1) >= fuzz_cutoff

        if match:
            longest += 1
        else:
            start += 1

    return longest
Пример #17
0
    def find_and_return_max(self, name_of_given_text: str, text_to_match: str,
                            name_of_return_text: str) -> str:
        """
        Look up the entry of `self.data` whose `name_of_given_text` value is
        most like `text_to_match` and return its `name_of_return_text` value.

        Comparison is case-insensitive and uses `fuzz.partial_ratio`. The first
        entry with the highest score wins.

        Attributes
        ----------
        name_of_given_text : str
            name of the dictionary item to compare to (alpha-3, country, etc)
        text_to_match: str
            string of text to match values against.
        name_of_return_text: str
            name of the dictionary item to return (alpha-3, country, etc)

        Returns
        -------
        str
            Value from the best-matching entry, or None when no entry scores
            above 0.
        """
        best_value = None
        best_score = 0
        for entry in self.data:
            score = fuzz.partial_ratio(entry[name_of_given_text].lower(),
                                       text_to_match.lower())
            if score <= best_score:
                continue
            best_value = entry[name_of_return_text]
            best_score = score
        return best_value
Пример #18
0
def partial_ratio_experiment(df):
    """
    An experiment to check the train data to see if it is sufficient to look for key words (from the claim) in the doc id
    instead of the text of the doc, and also to see the range of partial ratios to determine the partial ratio
    threshold we should use for partial ratio filtering when retrieving document (see partial_ratio_filter_1 and 2)

    For every verifiable row, prints the claim and, per evidence set, the partial ratio between the raw claim and
    each cleaned evidence doc title together with that doc's cleaned first line.

    :param df: table with 'verifiable', 'claim' and 'evidence' columns (presumably a pandas DataFrame - it is
    indexed with a boolean mask and iterated with `iterrows`)
    :return: None
    """
    db = DocDB("../project_data/wiki_docs_skimmed.db")
    try:
        # `!= False` is kept on purpose: it is used as an element-wise mask.
        verifiable = df[df['verifiable'] != False]  # noqa: E712
        for _, row in verifiable.iterrows():
            claim = row["claim"]
            evidence_set = row["evidence"]
            print(f"**** Claim: {claim} ****")
            for st in evidence_set:
                # Each evidence entry is read as (doc id, line number).
                st_titles = [(doc[0], doc[1]) for doc in st]
                print("Partial ratios for one complete evidence set:")
                for doc_id, _line_num in st_titles:
                    title = clean_text(doc_id)
                    first_line = get_cleaned_first_line(db, doc_id)
                    ratio = fuzz.partial_ratio(claim, title)
                    print(ratio, "The first line: ", first_line)
    finally:
        # Close the handle even when a lookup or comparison raises.
        db.close()
Пример #19
0
def checkzonefile(filename):
    """
    Scan one zone file for domains that fuzzily match a search phrase.

    Files are skipped unless their name ends in .txt, .diff or a
    .txt.split*/.diff.split* suffix, and also when the name is one of the
    listed large TLD files (com.txt, net.diff, ...).

    Each line is split on whitespace. A line is examined when it has at least
    four fields, its first field is not a bare TLD (a single label with a
    trailing dot), fields 3 and 4 are "in" and "ns", and its domain differs
    from the previous line's domain. The domain (trailing dot removed) is
    compared with every entry of the module-level `searchphrases` using
    `fuzz.partial_ratio`; scores above the module-level `accuracy` are printed
    and the domain is appended to `matchdomains`.

    The number of lines read is appended to the module-level `totalrows`.

    :param filename: file name relative to the module-level `zonefiledirectory`.
    :return: None
    """
    lastdomain = ""
    internalrows = 0
    global totalrows
    global matchdomains
    global searchphrases
    if not re.search(
            r'\.(diff\.split[a-z]{2,7}|txt\.split[a-z]{2,7}|txt|diff)$',
            filename):
        return
    if re.search(
            r'^(app|biz|club|com|dev|icu|info|link|live|net|online|org|page|shop|site|store|top|vip|wang|work|xyz)\.(txt|diff)$',
            filename):
        return
    filewithpath = os.path.join(zonefiledirectory, filename)
    logging.info(f'Searching {filewithpath}')
    print(f'Searching {filewithpath}')
    with open(filewithpath, 'rt') as f:
        for line in f:
            internalrows += 1
            linearray = line.split()
            # Blank or short lines have no record type to inspect.
            if len(linearray) < 4:
                continue
            rawname = linearray[0]
            domain = rawname.rstrip('.')
            # The TLD test needs the trailing dot, so it runs on the raw field
            # before the dot is stripped.
            is_tld = re.search(r'^[a-z]*\.$', rawname)
            # The last condition avoids re-checking the same domain as the
            # previous line.
            if (not is_tld and linearray[2] == "in" and linearray[3] == "ns"
                    and domain != lastdomain):
                for searchword in searchphrases:
                    score = fuzz.partial_ratio(searchword, domain)
                    if score > accuracy:
                        print(domain, score, searchword)
                        matchdomains.append(domain)
            lastdomain = domain
    totalrows.append(int(internalrows))
    return
Пример #20
0
    def handler(self, row_data):
        '''
        Classify brand matches for one row using partial string matching.

        The product name is taken from index 1 of the row and lower-cased.
        `self.fast_string_comparison` supplies an initial list of exact brand
        matches plus a list of further brands to score with
        `fuzz.partial_ratio`: a score of 90 or more is added to the exact
        matches, a score from 76 to 89 is recorded as a possible match.

        Accepts an array object (list).
        Returns the same list with two items appended (exact matches,
        possible matches) after passing it to `self.status_brand`.
        '''

        # 1 - Description / Nomenclature Position
        product_name = row_data[1].lower()
        exact_match_brand, exclusive_matches_brand = self.fast_string_comparison(
            self.brand, product_name)

        probable_match_brand = []
        for brand_name in exclusive_matches_brand:
            similarity = int(fuzz.partial_ratio(product_name, brand_name))
            if similarity >= 90:
                # Close enough to be treated as an exact match
                exact_match_brand.append(brand_name)
            elif similarity >= 76:
                # 76..89: only a possible match
                probable_match_brand.append(brand_name)

        row_data.extend((exact_match_brand, probable_match_brand))
        self.status_brand(row_data)
        return row_data
Пример #21
0
async def _(session: CommandSession):
    """
    Help command handler: list all features, or show one plugin's usage.

    Without an argument, sends the description of every named plugin, a link
    to the guide, and finishes the session with usage instructions. With an
    argument, finishes the session with the usage text of the first named
    plugin whose lower-cased name has a partial ratio above 60 with the
    argument, or with a "no such feature" message when none qualifies.
    """
    # Only plugins that have a name set are listed and searchable
    plugins = [p for p in nonebot.get_loaded_plugins() if p.name]

    arg = session.current_arg_text.strip().lower()
    if not arg:
        # No argument was sent: reply with the feature list
        await session.send(
            "我现在支持的功能有:\n" + "\n".join(get_description(p) for p in plugins)
        )
        await session.send("具体各功能帮助请查看:https://bot.artin.li/guide/")
        session.finish(
            '输入 "帮助+空格+功能名" 查看各功能使用指南以及命令。\n' + '如:"帮助 绑定教务处",不需要加上括号及括号内内容。'
        )

    found = False

    # An argument was sent: reply with the usage help of the matching plugin
    for p in plugins:
        # partial_ratio scores range from 0 to 100, so the cutoff is 60; the
        # previous 0.6 let practically any plugin match
        if fuzz.partial_ratio(p.name.lower(), arg) > 60:
            found = True
            session.finish(p.usage)

    if not found:
        session.finish(f"暂时没有 {arg} 这个功能呢")
Пример #22
0
    def search_teacher(self, teacher_name: str) -> list:
        """
        Return the teachers whose name matches `teacher_name`.

        Names are compared after removing '.', ',' and '-', lower-casing and
        stripping trailing whitespace. A teacher whose normalised name
        contains the normalised query is inserted at the front of the result.
        Any other teacher is appended when every space-separated word of the
        query has a partial ratio above 70 with the teacher's normalised name.

        :param teacher_name: name (or part of a name) to search for.
        :return: list of teacher records from `self.get_all_teachers()`.
        """
        print_function_call((teacher_name), header=self.logheader)

        def _normalise(raw):
            stripped = raw.replace(".", "").replace(",", "").replace("-", "")
            return stripped.lower().rstrip()

        teacher_name = _normalise(teacher_name)
        matched_teachers = []
        all_teachers = self.get_all_teachers()
        query_words = [word for word in teacher_name.split(" ") if word]

        for teacher in all_teachers:
            teacher_lower = _normalise(teacher['name'])

            # Substring hits go to the front of the list
            if teacher_name in teacher_lower:
                matched_teachers.insert(0, teacher)

            if teacher in matched_teachers:
                continue

            # Otherwise every query word has to be fuzzily present
            if all(fuzz.partial_ratio(word, teacher_lower) > 70.0
                   for word in query_words):
                matched_teachers.append(teacher)

        return matched_teachers
Пример #23
0
 def fuzzy_function(candidate, query):
     """
     Score the entries of `candidate` against `query`.

     An empty query returns `candidate` unchanged. Otherwise each entry is
     paired with its case-insensitive partial ratio against the query, pairs
     scoring 50 or less are dropped, and the remaining ratios are scaled by
     two length penalties before being passed on to `candidate`'s own
     `map` / `sort` pipeline together with a one-decimal rounding function and
     a descending sort key.

     NOTE(review): `candidate` must provide chainable `map`, `filter` and
     `sort` methods; their exact semantics are defined elsewhere.
     """
     if len(query) == 0:
         return candidate

     def with_ratio(text):
         return (fuzz.partial_ratio(text.lower(), query.lower()), text)

     def above_cutoff(pair):
         return pair[0] > 50

     def length_weighted(pair):
         # Penalise entries shorter than the query fully, longer ones mildly.
         too_short = min(1, len(pair[1]) / len(query))
         too_long = min(1, len(query) / len(pair[1]))**0.3
         return pair[0] * too_short * too_long

     def one_decimal(value):
         return round(value * 10) / 10

     def descending(value):
         return -value

     scored = candidate.map(with_ratio).filter(above_cutoff)
     return scored.map(length_weighted, one_decimal).sort(descending)
Пример #24
0
def get_maximum_matchval( check_string, input_string ):
    """
    Returns the ``partial_ratio`` similarity score of two strings. Both strings are stripped of surrounding whitespace and lower-cased before being compared. According to the original documentation, a perfect match is a score of ``100.0``.

    :param str check_string: first string.
    :param str input_string: second string.
    :returns: the ``partial_ratio`` score of the two normalized strings (a similarity, not a distance: higher means more alike).
    :rtype: float
    """
    cstring = check_string.strip( ).lower( )
    istring = input_string.strip( ).lower( )
    return partial_ratio( cstring, istring )
Пример #25
0
        def get_potential(iterable: Iterable,
                          *,
                          threshold: int = 80) -> list[str]:
            nonlocal name
            potential = []

            for item in iterable:
                original, item = item, item.lower()

                if name == item:
                    return [original]

                a, b = fuzz.ratio(name, item), fuzz.partial_ratio(name, item)
                if a >= threshold or b >= threshold:
                    potential.append(original)

            return potential
Пример #26
0
def text_in_note(note, query_string, client, confidence=97):
    """Performs fuzzy searching against note text.

    The note is resolved with `resolve_to_json`, its text pieces are fetched
    with `get_note_text`, lower-cased and joined with single spaces, and the
    result is compared with the lower-cased query using `fuzz.partial_ratio`.

    :param dict note: an ArchivesSpace note.
    :param str query_string: a string to match against.
    :param client: passed through to `resolve_to_json` and `get_note_text`.
    :param int confidence: minimum confidence ratio to match against; passed
            as the `score_cutoff` of the comparison.

    :returns: True if the comparison yields a non-zero score for
            `query_string`, False otherwise.
    :rtype: bool
    """
    resolved = resolve_to_json(note, client)
    pieces = get_note_text(resolved, client)
    haystack = " ".join(piece.lower() for piece in pieces)
    score = fuzz.partial_ratio(haystack,
                               query_string.lower(),
                               score_cutoff=confidence)
    return bool(score)
Пример #27
0
    def find_matches(text, threshold=65):
        """
        Attempt to match a 'name' to an existing Company.

        Every key of `companies` is compared case-insensitively with `text`
        using `fuzz.partial_ratio`; names scoring strictly above `threshold`
        are returned, best score first. An empty list means no match.
        """

        scored = []

        for name in companies.keys():
            ratio = fuzz.partial_ratio(name.lower(), text.lower())

            if ratio > threshold:  # pragma: no cover
                scored.append((ratio, name))

        if not scored:
            return []

        # Stable sort on the score only, so ties keep their original order.
        scored.sort(key=lambda pair: pair[0], reverse=True)  # pragma: no cover
        return [name for _, name in scored]  # pragma: no cover
Пример #28
0
    def guess_header(self, header, threshold=80):
        """Map a header read from a file onto one of the known headers.

        Increasingly lenient comparisons are tried in turn, each over all of
        `self.HEADERS`: exact equality, case-insensitive equality,
        case-insensitive equality with spaces in `header` replaced by
        underscores, and finally fuzzy matching with `fuzz.partial_ratio`.

        Args:
            header (Any): Header name to look for; None is treated as ''.
            threshold (int, optional): A fuzzy match must score strictly above this. Defaults to 80.

        Returns:
            Any: The matched known header, or None if there is no match. Among
            fuzzy matches the best score wins, the earliest header on a tie.
        """
        # Replace null values with empty string
        if header is None:
            header = ''

        strict_tests = (
            # Exact match
            lambda known: known == header,
            # Case-insensitive match
            lambda known: known.lower() == header.lower(),
            # Case-insensitive match with space replacement
            lambda known: known.lower() == header.lower().replace(' ', '_'),
        )
        for passes in strict_tests:
            for h in self.HEADERS:
                if passes(h):
                    return h

        # Finally, look for a close match using fuzzy matching
        scored = [(fuzz.partial_ratio(header, h), h) for h in self.HEADERS]
        close = [pair for pair in scored if pair[0] > threshold]

        if not close:
            return None

        # max() returns the first of several equally good candidates.
        return max(close, key=lambda pair: pair[0])[1]
Пример #29
0
    def find_mentor(self) -> None:
        """
        Store the first mentor whose surname fuzzily matches `self.surname`.

        Entries of `self.mentor_data` are checked in order; the first one whose
        lower-cased surname has a partial ratio of at least
        `FUZZ_RATIO_CUTOFF` with the lower-cased `self.surname` is assigned to
        `self.mentor_info`.

        Raises:
            MentorNotFoundException: no entry reaches the cutoff.
        """
        for mentor in self.mentor_data:
            mentor_surname = mentor['surname']
            score = fuzz.partial_ratio(self.surname.lower(),
                                       mentor_surname.lower())

            if score < FUZZ_RATIO_CUTOFF:
                continue

            # TODO log every ratio and input values for debug purposes
            self.mentor_info = mentor
            return

        message = 'Mentor with surname "{}" not found!'.format(self.surname)
        raise MentorNotFoundException(message)
Пример #30
0
def get_docs(claim: str):
    """
    Retrieve candidate documents for a claim by fuzzy title matching.

    Every doc id loaded from the pickled id list at `WIKI_IDS_PATH` is cleaned
    and compared with the cleaned claim using fuzz partial ratio; ids scoring
    above 75 are shortlisted and handed, with the raw claim, to `_rank`.

    :param claim: string
                  the raw claim read in from the data
    :return docs: list
                  the result of `_rank` for the shortlist (per the original
                  documentation, the doc ids of the top 5 ranked documents)
    """
    with open(WIKI_IDS_PATH, "rb") as id_file:
        all_ids = pickle.load(id_file)
    cleaned_claim = _clean_text(claim)
    # The cutoff keeps the shortlist small enough for the sentence embedding
    # filter to stay tractable.
    shortlisted = [
        doc_id for doc_id in all_ids
        if fuzz.partial_ratio(cleaned_claim, _clean_text(doc_id)) > 75
    ]
    return _rank((claim, shortlisted))