예제 #1
0
파일: cpe.py 프로젝트: crocs-muni/sec-certs
    def get_cpe_matches(
            self,
            cert_name: str,
            cert_candidate_cpe_vendors: List[str],
            cert_candidate_versions: List[str],
            relax_version: bool = False,
            n_max_matches=10,
            threshold: int = 60) -> Optional[List[Tuple[float, CPE]]]:
        """Return up to n_max_matches CPE candidates fuzzily matching cert_name.

        Candidates are scored with token-set and partial ratios against both
        the CPE title and item name; only scores above `threshold` are kept,
        sorted best-first. When no candidate qualifies and `relax_version` is
        False, the search is retried once with the wildcard version '-'.
        Returns None when even the relaxed search finds nothing.
        """
        strip_symbols = re.compile(r"(?ui)\W")

        def _clean(text: str):
            # Drop trademark glyphs, lowercase, collapse non-word chars to spaces.
            return strip_symbols.sub(' ', text.replace('®', '').replace('™', '').lower())

        cleaned_name = _clean(cert_name)
        scored = []
        for candidate in self.get_candidate_cpe_items(
                cert_candidate_cpe_vendors, cert_candidate_versions):
            cleaned_title = _clean(candidate.title)
            cleaned_item = _clean(candidate.item_name)
            # Best of four similarity measures decides whether the candidate
            # is a reasonable match.
            best = max(
                fuzz.token_set_ratio(cleaned_name, cleaned_title),
                fuzz.partial_ratio(cleaned_name, cleaned_title),
                fuzz.token_set_ratio(cleaned_name, cleaned_item),
                fuzz.partial_ratio(cleaned_name, cleaned_item),
            )
            if best > threshold:
                scored.append((best, candidate))

        if scored:
            scored.sort(key=operator.itemgetter(0), reverse=True)

            # possibly filter short titles to avoid false positives
            # scored = list(filter(lambda x: len(x[1].item_name) > 4, scored))

            return scored[:n_max_matches]

        if not relax_version:
            # Nothing matched: retry once with the wildcard version '-'.
            return self.get_cpe_matches(cert_name,
                                        cert_candidate_cpe_vendors, ['-'],
                                        relax_version=True,
                                        n_max_matches=n_max_matches,
                                        threshold=threshold)

        return None
예제 #2
0
파일: extract.py 프로젝트: wangvei/papers
def _crossref_score(txt, r):
    """Similarity between free text `txt` and CrossRef record `r`.

    Sums token-set ratios over whichever of the author family names,
    first title entry, and abstract are present; higher means more similar.
    """
    from rapidfuzz.fuzz import token_set_ratio

    total = 0
    if 'author' in r:
        family_names = ' '.join(
            p['family'] for p in r.get('author', []) if 'family' in p)
        total += token_set_ratio(family_names, txt)
    if 'title' in r:
        total += token_set_ratio(r['title'][0], txt)
    if 'abstract' in r:
        total += token_set_ratio(r['abstract'], txt)
    return total
예제 #3
0
    def search_score(self, keywords, keys=None):
        """
        search_score calculates the matching score of the herb for the given keywords

        :param keywords: keyword(s) for the search; a single value is wrapped
            in a list automatically
        :type keywords: list, tuple, set, or a single value
        :param keys: list of keys in the dictionary to look into;
            defaults to ["name", "id", "repository", "tags", "description"]
        :type keys: list, optional
        :return: highest token_set_ratio over all (keyword, field) pairs,
            or 0 when there is nothing to score
        """

        if keys is None:
            keys = ["name", "id", "repository", "tags", "description"]

        if not isinstance(keywords, (list, tuple, set)):
            keywords = [keywords]

        # Restrict the metadata to the requested keys, then flatten nested values.
        herb_for_search = _flatten_dict({
            key: val for key, val in self.herb_meta_json.items() if key in keys
        })

        # Only the field values are matched, so iterate .values() directly.
        keywords_scores = [
            fuzz.token_set_ratio(val, keyword)
            for keyword in keywords
            for val in herb_for_search.values()
        ]

        # default=0 covers the case of no keywords or no searchable fields.
        return max(keywords_scores, default=0)
예제 #4
0
    def display_label_search(self, query, n=10, **kwargs):
        '''Search display labels (place names).'''
        data_products = self.get_data_products()
        profiles = data_products['demographicprofiles']
        # Rank all demographic-profile instances by fuzzy similarity of their
        # name to the query, best first, and keep the top n.
        ranked = sorted(
            profiles,
            key=lambda profile: fuzz.token_set_ratio(query, profile.name),
            reverse=True,
        )
        return ranked[:n]
예제 #5
0
파일: extract.py 프로젝트: wangvei/papers
def _scholar_score(txt, bib):
    """Similarity between free text `txt` and a Scholar bib record.

    Sums token-set ratios over whichever of title/author/abstract are
    present; higher means more similar.
    """
    from rapidfuzz.fuzz import token_set_ratio

    score = 0
    for field in ['title', 'author', 'abstract']:
        if field in bib:
            score += token_set_ratio(bib[field], txt)
    return score
예제 #6
0
def compare_cells(cell1, cell2, comparison_type, ignore_case):
    """Compare two cell values according to comparison_type.

    comparison_type is one of 'fuzzy_string', 'int', 'float', or anything
    else for a plain (optionally case-insensitive) string comparison.
    A failed typed comparison falls back to the string comparison.
    Returns a bool: True when the cells are considered equal.
    """
    # Two empty/None cells match; exactly one empty/None cell never does.
    if not cell1 and not cell2:
        return True
    if not cell1 or not cell2:
        return False

    try:
        if comparison_type == 'fuzzy_string':
            # token_set_ratio returns 0 (falsy) when below score_cutoff.
            # Return an explicit bool instead of falling through to an
            # implicit None as the original did.
            return bool(fuzz.token_set_ratio(cell1, cell2,
                                             score_cutoff=CELL_THRESHOLD))
        if comparison_type == 'int':
            return int(cell1) == int(cell2)
        if comparison_type == 'float':
            return float(cell1) == float(cell2)
    except Exception:
        # Narrowed from a bare except: a failed conversion (or similar)
        # intentionally degrades to the plain string comparison below.
        pass

    if ignore_case:
        cell1 = cell1.lower()
        cell2 = cell2.lower()
    return str(cell1).strip() == str(cell2).strip()
예제 #7
0
def fpartial(x):
    """Whole-word fuzzy match between a row's alias and data strings.

    token_set_ratio is the best practice for ALIAS mapping (short string vs
    longer string): uber <-> "pubermarkt test" scores 57.
    """
    alias = str(x["alias"])
    data = str(x["data"])
    return fuzz.token_set_ratio(alias, data)


#    return fuzz.partial_token_set_ratio(str(x["alias"]),str(x["data"])) # Partial word match (best practise to create distance matrix)  -> uber <-> "pubermarkt test" = 100

#%%
예제 #8
0
def management(l_args, s_ticker):
    """Print the management team for s_ticker: name, title, a Google search
    link, and (when found) an Insider Activity link.
    [Source: Business Insider]

    Parameters
    ----------
    l_args : list
        argparse arguments to parse
    s_ticker : str
        Stock ticker
    """
    parser = argparse.ArgumentParser(prog='mgmt',
                                     description="""Print management team. Namely: Name, Title, Information from google and
                                                    (potentially) Insider Activity page. [Source: Business Insider]""")

    try:
        (ns_parser, l_unknown_args) = parser.parse_known_args(l_args)

        if l_unknown_args:
            print(f"The following args couldn't be interpreted: {l_unknown_args}\n")
            return

        url_market_business_insider = f"https://markets.businessinsider.com/stocks/{s_ticker.lower()}-stock"
        text_soup_market_business_insider = BeautifulSoup(requests.get(url_market_business_insider).text, "lxml")

        # Titles: right-aligned cells containing letters but no USD amounts.
        l_titles = list()
        for s_title in text_soup_market_business_insider.findAll('td', {'class': 'table__td text-right'}):
            if any(c.isalpha() for c in s_title.text.strip()) and ('USD' not in s_title.text.strip()):
                l_titles.append(s_title.text.strip())

        l_names = list()
        for s_name in text_soup_market_business_insider.findAll('td', {'class': 'table__td table--allow-wrap'}):
            l_names.append(s_name.text.strip())

        # The management names are the trailing entries of the name column.
        df_management = pd.DataFrame({'Name': l_names[-len(l_titles):], 'Title':l_titles}, columns=['Name','Title'])

        df_management['Info'] = '-'
        df_management['Insider Activity'] = '-'
        df_management = df_management.set_index('Name')

        for s_name in df_management.index:
            # Use .loc[row, col]: the chained .loc[row][col] form assigns to
            # a temporary and the value can be silently lost.
            df_management.loc[s_name, 'Info'] = f"http://www.google.com/search?q={s_name} {s_ticker.upper()}".replace(' ', '%20')

        s_url_base = "https://markets.businessinsider.com"
        for insider in text_soup_market_business_insider.findAll('a', {'onclick':"silentTrackPI()"}):
            for s_name in df_management.index:
                if fuzz.token_set_ratio(s_name, insider.text.strip()) > 70:
                    df_management.loc[s_name, 'Insider Activity'] = s_url_base + insider.attrs['href']

        for ind in df_management.index:
            # Pad the name and title columns so the printed rows line up.
            s_name = f"{ind}{(max([len(x) for x in df_management.index])-len(ind))*' '}"
            s_title = f"{df_management['Title'][ind]}{(max([len(x) for x in df_management['Title']])-len(df_management['Title'][ind]))*' '}"
            s_management = f"""{s_name} {s_title} {df_management['Info'][ind]}"""
            print(s_management)
            if df_management['Insider Activity'][ind] not in '-':
                print(f"{df_management['Insider Activity'][ind]}")
            print("")

    except Exception as e:
        # Report the failure instead of swallowing everything with a bare
        # except (which also trapped SystemExit/KeyboardInterrupt), matching
        # the error handling style used elsewhere in this codebase.
        print(e, "\n")
        return
예제 #9
0
    def search_by_true_name(self, name, threshold=80):
        """Finds all items whose true name closely matches the given name.

        NOTE(review): the original docstring claimed the name is "Ignored if
        None", but the code does not ignore it — a None name would reach
        fuzz.ratio directly. Confirm the intended contract.

        Args:
            name: Name to search by.
            threshold: Threshold (0-100) for matching with RapidFuzz; an item
                matches when either score exceeds it.

        Returns:
            List of matching triplets with NameItem, RapidFuzz ratio and RapidFuzz token_set_ratio,
            sorted by the plain ratio in descending order.
        """
        matches = []
        for item in self.items:
            # Compare against the item's *true* name: the plain ratio keeps
            # case, while token_set_ratio compares case-insensitively.
            ratio = fuzz.ratio(item.true_name, name)
            token_set_ratio = fuzz.token_set_ratio(item.true_name.lower(),
                                                   name.lower())
            if ratio > threshold or token_set_ratio > threshold:
                matches.append((item, ratio, token_set_ratio))
        return sorted(matches, key=lambda x: x[1], reverse=True)
예제 #10
0
    def iterate(self, uid, start_date=None, end_date=None, class_name=None):
        """Collect calendar events for `uid` whose start falls in the window.

        The window starts at `start_date` (or now) and ends at `end_date`
        (or the end of the start day). When `class_name` is given, only
        events whose summary fuzzily matches one of the names are kept.
        """
        if start_date:
            window_start = self.tmzn.localize(
                dtparse(start_date).replace(tzinfo=None))
        else:
            window_start = datetime.now(tz=self.tmzn)

        if end_date:
            window_end = self.tmzn.localize(
                dtparse(end_date).replace(tzinfo=None))
        else:
            window_end = window_start.replace(hour=23, minute=59, second=59)

        matched = []
        for event in self.calendars[uid].walk("vevent"):
            event_start = event["dtstart"].dt.astimezone(self.tmzn)
            if not (window_start <= event_start <= window_end):
                continue
            if not class_name:
                matched.append(event)
            else:
                # NOTE: an event is added once per matching class name,
                # mirroring the original extend-per-match behaviour.
                matched.extend(
                    event for c_name in class_name
                    if fuzz.token_set_ratio(
                        c_name.lower(),
                        event["summary"].lower()) > self.fuzz_threshold)

        return matched
예제 #11
0
def featurize(df):
    """Add string-similarity and phonetic feature columns for two name columns.

    The first two columns of *df* are treated as the names to compare
    (renamed 'a' and 'b'); a third column, if exactly three are present,
    is renamed 'target'. Adds normalised text (TM_A/TM_B), fuzzy-ratio,
    scaled Levenshtein, phonetic-equality, and per-algorithm similarity
    columns, and returns the mutated DataFrame.
    """
    # Normalise column names: 3 columns -> a, b, target; 2 -> a, b;
    # otherwise only the first two columns are renamed.
    if len(df.columns)==3:
        df.columns=['a', 'b', 'target']
    elif len(df.columns)==2:
        df.columns=['a', 'b']
    else:
        df = df.rename(columns={df.columns[0]: 'a', df.columns[1]: 'b' })

    # Normalised text: ASCII-transliterated, lowercased, letters only.
    df['TM_A'] = df.apply(lambda row: re.sub(
        '[^a-zA-Z]+', '', unidecode.unidecode(row['a']).lower()), axis=1)
    df['TM_B'] = df.apply(lambda row: re.sub(
        '[^a-zA-Z]+', '', unidecode.unidecode(row['b']).lower()), axis=1)

    # Fuzzy similarity scores on the normalised text.
    df['partial'] = df.apply(lambda row: fuzz.partial_ratio(row.TM_A,row.TM_B), axis=1)
    df['tkn_sort'] = df.apply(lambda row: fuzz.token_sort_ratio(row.TM_A,row.TM_B), axis=1)
    df['tkn_set'] = df.apply(lambda row: fuzz.token_set_ratio(row.TM_A,row.TM_B), axis=1)

    # IPA-based similarity (project helper).
    df['sum_ipa'] = df.apply(lambda row: sum_ipa(row.TM_A,row.TM_B), axis=1)

    # Jellyfish levenshtein
    df['levenshtein']= df.apply(lambda row: jellyfish.levenshtein_distance(row.TM_A,row.TM_B), axis=1)
    # Scale Levenshtein column to [0, 1].
    # NOTE(review): fit_transform scales per-call, so values are only
    # comparable within this DataFrame — confirm that is intended.
    scaler = MinMaxScaler()
    df['levenshtein'] = scaler.fit_transform(df['levenshtein'].values.reshape(-1,1))

    # Jellyfish phonetic encodings: 1 when both names encode identically.
    df['metaphone'] = df.apply(
        lambda row: 1 if jellyfish.metaphone(row.TM_A)==jellyfish.metaphone(row.TM_B) else 0, axis=1)
    df['nysiis'] = df.apply(
        lambda row: 1 if jellyfish.nysiis(row.TM_A)==jellyfish.nysiis(row.TM_B) else 0, axis=1)
    df['mtch_rtng_cdx'] = df.apply(
        lambda row: 1 if jellyfish.match_rating_codex(row.TM_A)==jellyfish.match_rating_codex(row.TM_B) else 0, axis=1)

    df['pshp_soundex_first'] = df.apply(
        lambda row: 1 if pshp_soundex_first.encode(row.TM_A)==pshp_soundex_first.encode(row.TM_B) else 0, axis=1)

    # One similarity column per configured algorithm (module-level algos /
    # algo_names lists; .apply evaluates immediately, so the loop variable
    # binding is safe here).
    for i, algo in enumerate(algos):
            df[algo_names[i]] = df.apply(lambda row: algo.sim(row.TM_A, row.TM_B), axis=1)

    return df
예제 #12
0
    def get_one(self, uid, class_name=None):
        """Return the next upcoming event(s) from the calendar for `uid`.

        Without `class_name`, returns a list with the first future event.
        With `class_name`, returns the first future event matching each
        given name (fuzzy match on the event summary).
        """
        now = datetime.now(tz=self.tmzn)
        found = []

        if not class_name:
            for event in self.calendars[uid].walk("vevent"):
                if event["dtstart"].dt.astimezone(self.tmzn) >= now:
                    found.append(event)
                    break
            return found

        for c_name in class_name:
            wanted = c_name.lower()
            for event in self.calendars[uid].walk("vevent"):
                is_future = event["dtstart"].dt.astimezone(self.tmzn) >= now
                if is_future and fuzz.token_set_ratio(
                        wanted,
                        event["summary"].lower()) > self.fuzz_threshold:
                    found.append(event)
                    break

        return found
예제 #13
0
def get_management(ticker: str) -> pd.DataFrame:
    """Get company managers from Business Insider

    Parameters
    ----------
    ticker : str
        Stock ticker

    Returns
    -------
    pd.DataFrame
        Dataframe of managers indexed by name, with Title, Info (Google
        search link) and Insider Activity (link or '-') columns; empty
        when no management table is found.
    """
    url_market_business_insider = (
        f"https://markets.businessinsider.com/stocks/{ticker.lower()}-stock"
    )
    text_soup_market_business_insider = BeautifulSoup(
        requests.get(
            url_market_business_insider, headers={"User-Agent": get_user_agent()}
        ).text,
        "lxml",
    )

    # Map each underlined section header to its following table.
    found_h2s = {}

    for next_h2 in text_soup_market_business_insider.findAll(
        "h2", {"class": "header-underline"}
    ):
        next_table = next_h2.find_next_sibling("table", {"class": "table"})

        if next_table:
            found_h2s[next_h2.text] = next_table

    if found_h2s.get("Management") is None:
        print(f"No management information in Business Insider for {ticker}")
        print("")
        return pd.DataFrame()

    # Titles: right-aligned cells containing letters but no USD amounts.
    l_titles = []
    for s_title in found_h2s["Management"].findAll(
        "td", {"class": "table__td text-right"}
    ):
        if any(c.isalpha() for c in s_title.text.strip()) and (
            "USD" not in s_title.text.strip()
        ):
            l_titles.append(s_title.text.strip())

    l_names = []
    for s_name in found_h2s["Management"].findAll(
        "td", {"class": "table__td table--allow-wrap"}
    ):
        l_names.append(s_name.text.strip())

    # The management names are the trailing entries of the name column.
    df_management = pd.DataFrame(
        {"Name": l_names[-len(l_titles) :], "Title": l_titles},
        columns=["Name", "Title"],
    )

    df_management["Info"] = "-"
    df_management["Insider Activity"] = "-"
    df_management = df_management.set_index("Name")

    for s_name in df_management.index:
        # Use .loc[row, col]: the chained .loc[row][col] form assigns to a
        # temporary copy and the value can be silently lost
        # (pandas chained-assignment / SettingWithCopy pitfall).
        df_management.loc[
            s_name, "Info"
        ] = f"http://www.google.com/search?q={s_name} {ticker.upper()}".replace(
            " ", "%20"
        )

    s_url_base = "https://markets.businessinsider.com"
    for insider in text_soup_market_business_insider.findAll(
        "a", {"onclick": "silentTrackPI()"}
    ):
        for s_name in df_management.index:
            if fuzz.token_set_ratio(s_name, insider.text.strip()) > 70:  # type: ignore
                df_management.loc[s_name, "Insider Activity"] = (
                    s_url_base + insider.attrs["href"]
                )
    return df_management
예제 #14
0
def management(other_args: List[str], ticker: str):
    """Display company's managers

    Prints each manager's name, title, a Google search link, and (when
    found) an Insider Activity link. [Source: Business Insider]

    Parameters
    ----------
    other_args : List[str]
        argparse other args
    ticker : str
        Stock ticker
    """
    parser = argparse.ArgumentParser(
        add_help=False,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        prog="mgmt",
        description="""
            Print management team. Namely: Name, Title, Information from google and
            (potentially) Insider Activity page. [Source: Business Insider]
        """,
    )

    try:
        ns_parser = parse_known_args_and_warn(parser, other_args)
        if not ns_parser:
            return

        url_market_business_insider = (
            f"https://markets.businessinsider.com/stocks/{ticker.lower()}-stock"
        )
        text_soup_market_business_insider = BeautifulSoup(
            requests.get(
                url_market_business_insider, headers={"User-Agent": get_user_agent()}
            ).text,
            "lxml",
        )

        # Map each underlined section header to its following table.
        found_h2s = dict()

        for next_h2 in text_soup_market_business_insider.findAll(
            "h2", {"class": "header-underline"}
        ):
            next_table = next_h2.find_next_sibling("table", {"class": "table"})

            if next_table:
                found_h2s[next_h2.text] = next_table

        if found_h2s.get("Management") is None:
            print(f"No management information in Business Insider for {ticker}")
            print("")
            return

        # Titles: right-aligned cells containing letters but no USD amounts.
        l_titles = []
        for s_title in found_h2s["Management"].findAll(
            "td", {"class": "table__td text-right"}
        ):
            if any(c.isalpha() for c in s_title.text.strip()) and (
                "USD" not in s_title.text.strip()
            ):
                l_titles.append(s_title.text.strip())

        l_names = []
        for s_name in found_h2s["Management"].findAll(
            "td", {"class": "table__td table--allow-wrap"}
        ):
            l_names.append(s_name.text.strip())

        # The management names are the trailing entries of the name column.
        df_management = pd.DataFrame(
            {"Name": l_names[-len(l_titles) :], "Title": l_titles},
            columns=["Name", "Title"],
        )

        df_management["Info"] = "-"
        df_management["Insider Activity"] = "-"
        df_management = df_management.set_index("Name")

        for s_name in df_management.index:
            # Use .loc[row, col]: the chained .loc[row][col] form assigns to
            # a temporary copy and the value can be silently lost
            # (pandas chained-assignment / SettingWithCopy pitfall).
            df_management.loc[
                s_name, "Info"
            ] = f"http://www.google.com/search?q={s_name} {ticker.upper()}".replace(
                " ", "%20"
            )

        s_url_base = "https://markets.businessinsider.com"
        for insider in text_soup_market_business_insider.findAll(
            "a", {"onclick": "silentTrackPI()"}
        ):
            for s_name in df_management.index:
                if fuzz.token_set_ratio(s_name, insider.text.strip()) > 70:
                    df_management.loc[s_name, "Insider Activity"] = (
                        s_url_base + insider.attrs["href"]
                    )

        for ind in df_management.index:
            # Pad the name and title columns so the printed rows line up.
            s_name = f"{ind}{(max([len(x) for x in df_management.index])-len(ind))*' '}"
            df_mgmt_title = df_management["Title"]
            spaces = max(len(x) for x in df_mgmt_title) - len(df_mgmt_title[ind])
            s_title = f"{df_mgmt_title[ind]}{spaces * ' '}"
            s_management = f"""{s_name} {s_title} {df_management['Info'][ind]}"""
            print(s_management)
            if df_management["Insider Activity"][ind] not in "-":
                print(f"{df_management['Insider Activity'][ind]}")
            print("")

    except Exception as e:
        print(e, "\n")
예제 #15
0
def test_token_ratio(s1, s2):
    """
    token_ratio should be max(token_sort_ratio, token_set_ratio)
    """
    expected = max(fuzz.token_sort_ratio(s1, s2),
                   fuzz.token_set_ratio(s1, s2))
    assert fuzz.token_ratio(s1, s2) == expected
예제 #16
0
 def testTokenSetRatio(self):
     # NOTE(review): relies on fixtures self.s4 / self.s5 defined in the
     # enclosing TestCase's setUp (not visible here); they are expected to
     # be token-set equivalent, giving a perfect score of 100 — confirm.
     self.assertEqual(fuzz.token_set_ratio(self.s4, self.s5), 100)
예제 #17
0
def findMatchingDownloadedFile(torrentDataRootName,
                               torrentDataFilesize,
                               torrentDataFilePath,
                               isDisc=False,
                               isTV=False):
    """Find a local file under args.ROOT_PATH matching the torrent's file.

    Top-level files are matched by fuzzy filename similarity; top-level
    directories by fuzzy root-name similarity, then walked for a matching
    file (with extra SxxEyy agreement for TV episodes). A candidate is
    accepted when its size is within MAX_FILESIZE_DIFFERENCE of the listed
    size. Returns the matching path, or None when nothing matches.
    """
    torrentDataFilename = os.path.basename(torrentDataFilePath)
    # maximum difference, in bytes, the downloaded filesize and listed file
    # size can be (2 MB); exact match required for TV or files under 100 MB
    MAX_FILESIZE_DIFFERENCE = 2 * 1000000
    if isTV or torrentDataFilesize < 100 * 1000000:
        MAX_FILESIZE_DIFFERENCE = 0

    listings = os.listdir(args.ROOT_PATH)
    for listing in listings:
        listingPath = os.path.join(args.ROOT_PATH, listing)
        if os.path.isfile(listingPath) and fuzz.token_set_ratio(
                listing, torrentDataFilename, score_cutoff=80):
            localFilesize = get_file_size(listingPath)
            # PEP 8: compare to None with `is`, not `==`.
            # NOTE(review): returning None here aborts the whole search on
            # the first unreadable candidate — confirm this is intended
            # rather than `continue`.
            if localFilesize is None:
                return None
            if abs(localFilesize -
                   torrentDataFilesize) <= MAX_FILESIZE_DIFFERENCE:
                return listingPath
        elif fuzz.token_set_ratio(listing,
                                  torrentDataRootName,
                                  score_cutoff=85):
            for root, dirs, filenames in os.walk(listingPath):
                for filename in filenames:
                    localFilePath = os.path.join(root, filename)
                    localFilesize = get_size(localFilePath)
                    if localFilesize is None:
                        continue

                    if isDisc and areRootPathsSimilar(
                            localFilePath, listingPath, torrentDataFilePath
                    ) and filename == torrentDataFilename:
                        if abs(localFilesize -
                               torrentDataFilesize) <= MAX_FILESIZE_DIFFERENCE:
                            return localFilePath
                    elif re.search(SEASON_EP_RE, torrentDataFilePath,
                                   re.IGNORECASE) and fuzz.token_set_ratio(
                                       filename,
                                       torrentDataFilename,
                                       score_cutoff=95):
                        # TV episodes must also agree on the SxxEyy string.
                        season_ep_str_torrent = getSeasonEpisodeStr(
                            torrentDataFilePath)
                        season_ep_str_filename = getSeasonEpisodeStr(filename)
                        if season_ep_str_torrent == season_ep_str_filename and abs(
                                localFilesize - torrentDataFilesize
                        ) <= MAX_FILESIZE_DIFFERENCE:
                            return localFilePath
                    elif fuzz.token_set_ratio(filename,
                                              torrentDataFilename,
                                              score_cutoff=95):
                        if abs(localFilesize -
                               torrentDataFilesize) <= MAX_FILESIZE_DIFFERENCE:
                            return localFilePath
    return None