Example #1
def test_multi_type_cleanups():
    expected = "Hello World"
    errmsg = "cleanup of %s failed"
    for testname, variation in multi_cleanup_tests.items():
        result = cleanco(variation).clean_name(prefix=True, suffix=True, middle=True, multi=True)
        assert result == expected, errmsg % testname
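The `multi_cleanup_tests` fixture lives elsewhere in cleanco's test module; presumably it is a plain dict mapping a test label to a name variation that should reduce to "Hello World". A minimal sketch with hypothetical entries:

# hypothetical fixture for the test above; the real entries live in cleanco's test data
multi_cleanup_tests = {
    "name with prefix and suffix": "Oy Hello World Ab",
    "name with multiple suffixes": "Hello World Oy Ab",
}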
Example #2
def clean_co_names(df, col):
    df['clean_co'] = df[col]
    df['clean_co'] = df['clean_co'].str.upper()  # uppercase
    print('>Set Upper')
    df['clean_co'] = df['clean_co'].str.replace(',', '', regex=False)  # remove commas
    print('>Remove commas')
    df['clean_co'] = df['clean_co'].str.replace(' - ', ' ', regex=False)  # collapse spaced hyphens
    print('>Remove hyphens')
    df['clean_co'] = df['clean_co'].str.replace(r"\(.*\)", "", regex=True)  # remove text between parentheses
    print('>Remove text between parens')
    df['clean_co'] = df['clean_co'].str.replace(' AND ', ' & ', regex=False)  # replace AND with &
    print('>replace AND with &')
    df['clean_co'] = df['clean_co'].str.strip()  # remove spaces at the beginning/end
    print('>Remove leading/trailing spaces')
    df['clean_co'] = df['clean_co'].apply(
        lambda x: cleanco(x).clean_name()
        if isinstance(x, str) else x)  # remove business-entity suffixes (pass 1)
    print('>Cleanco Pass1')
    df['clean_co'] = df['clean_co'].str.replace('.', '', regex=False)  # remove dots (literal match)
    print('>Remove dots')
    df['clean_co'] = df['clean_co'].str.encode('utf-8')  # encode to bytes
    print('>Encode utf-8')
    # note: after .str.encode the values are bytes, so the isinstance(x, str)
    # guard below makes pass 2 a no-op unless the column is decoded first
    df['clean_co'] = df['clean_co'].apply(
        lambda x: cleanco(x).clean_name() if isinstance(x, str) else x
    )  # remove business-entity suffixes (pass 2) - after removing the dots
    print('>Cleanco Pass2')
    return df
Example #3
def convert_name(name_list):

    converted_names = []
    for name in name_list:
        #print(name)
        name = name.translate(
            str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
        name_single_space = " ".join(name.split())
        upper_name = name_single_space.upper()
        cleaned_name = cleanco(upper_name).clean_name()
        cleaned_name = cleanco(cleaned_name).clean_name()
        print(cleaned_name)
        converted_names.append(cleaned_name)

    return converted_names
Example #4
 def __init__(self, file_name: str, name: str):
     self.__file_name = file_name
     self.__name = Utils.replace_redundant_ws(name).lower()
     self.__cleaned_name = cleanco(self.__name).clean_name()
     self.__tokens = set(self.__cleaned_name.split(" "))
     if len(self.__tokens) == 0:
         raise AssertionError("length of list is zero")
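`Utils.replace_redundant_ws` comes from that project's own helpers and is not shown; a minimal sketch of what the name suggests (collapsing whitespace runs), offered as an assumption:

import re

class Utils:
    @staticmethod
    def replace_redundant_ws(s: str) -> str:
        # collapse every run of whitespace to a single space and trim the ends
        return re.sub(r"\s+", " ", s).strip()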
Example #5
def entity_search(pageContent, response, title=""):
    """
    get entities in pagecontent that are recognized as legal entities
    """
    try:
        body = h.handle(pageContent)
    except Exception:
        try:
            charset = response.headers.get_content_charset()
            if charset is None: raise ValueError
        except Exception:
            charset = 'utf-8'
        try:
            content = pageContent.decode(charset)
            body = h.handle(content)
        except Exception as e:
            print(e, pageContent)
            return []
    body = body.replace("\n", " ").replace("\r", " ").replace("*", " ").replace("#", " ")
    entities = []
    doc = nlp(body)
    for ent in doc.ents:
        name = cleanco(str(ent))
        if name.type() is not None or name.country() is not None:
            name_to_add = name_cleaner(str(ent))
            if name_to_add is not None:
                entities.append(name_to_add)
        elif str(ent).lower() in title:
            name_to_add = name_cleaner(str(ent))
            if name_to_add is not None:
                entities.append(name_cleaner(str(name_to_add)))
    return list(dict.fromkeys(entities))
Example #6
def standardize_name(raw_name):
    std_name = unidecode(raw_name)
    std_name = std_name.lower()
    std_name = std_name.strip()
    std_name = cleanco(std_name).clean_name()
    std_name = std_name.translate({ord(c): None for c in string.punctuation})
    return std_name
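A usage sketch, assuming `from unidecode import unidecode`, `from cleanco import cleanco`, and `import string`; the exact result depends on cleanco's suffix database:

# standardize_name("  Café Systems Ltd ")  ->  roughly "cafe systems"
# unidecode transliterates the accent, cleanco drops the "Ltd" suffix,
# and the final translate() strips any remaining punctuation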
Example #7
def name_cleaner(text, legal_clean=False):
    if text is None: return None
    if text == "": return None
    text = text.lower()
    for repl in copyright_replace:
        text = text.replace(repl, "")
    to_del = re.findall(r'(\d{4})', text)
    if len(to_del) > 1:
        text = text.replace(to_del[0] + "-" + to_del[1], " ")
        text = text.replace(to_del[0] + " - " + to_del[1], " ")
    for year in to_del:
        text = text.replace(year, " ")
    if len(str(text).strip()) < 2: return None
    text = text.rstrip(". ")  # trim trailing dots and spaces
    text = re.sub(' +', ' ', text).lower().strip()
    text = re.sub(r'^[\.?&|*]', '', text)
    text = re.sub(r'[\.?&|*]$', '', text)
    if "(" in text:
        if text.find("(") < text.find(")"):
            text = text.replace(text[text.find("("):text.find(")")] + ")", " ")
    if text[0] == ".": text = text[1:]
    if legal_clean: text = cleanco(text).clean_name()
    return string.capwords(text.strip())
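`copyright_replace` is a module-level list defined elsewhere in that project; presumably it holds copyright markers to strip from the lower-cased text, something like:

# hypothetical values; the snippet only requires an iterable of lower-case substrings
copyright_replace = ["©", "(c)", "copyright", "all rights reserved"]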
Example #8
def imprint_analyzer(domain, link, imprint_queue):
    """
    Loads the imprint URL and returns possible legal names that occur in it
    """
    # Maybe check for addresses (through city/country recognition and highlight the
    # elements preceding the address)
    # Expand list of legal entities in cleanco (through excel list downloaded)
    elements, imprint_names = [], []
    if urlparse(link)[1] == "": end_link = domain + "/" + link
    elif domain in link: end_link = link
    else:
        imprint_queue.put([])
        imprint_queue.task_done()
        return None
    if "http://" not in end_link and "https://" not in end_link:
        end_link = "http://" + end_link
    try:
        response = urllib.request.urlopen(Request(
            end_link, headers={'User-Agent': User_ag}),
                                          context=context,
                                          timeout=10)
        pageContent = response.read()
        # pageContent = clean_html(pageContent)
    except Exception:
        imprint_queue.put([])
        return []
    tree = html.fromstring(pageContent)
    titles = [el.text for el in tree.xpath("//*") if el.tag == "title"]

    tree = tree.xpath("//text()")
    # get title element - if text is equal to title --> half the similarity ratio
    for el in tree:
        if el is None: continue
        el = el.replace("\n", "").replace("\r", "").replace("\t", "").replace("\\t", "")
        el = re.sub(' +', ' ', el).lower()
        if el == "None" or el == "" or el == " ": continue
        elements.append(el)
    for el in elements:
        name = cleanco(el)
        if name.type() is not None:
            if len(el) > 50: continue
            imprint_names.append(el)
    try:
        imprint_names.extend(entity_search(pageContent, response))
    except Exception:
        print(response.headers.get_content_charset())
    imprint_queue.put((imprint_names, titles))
    try:
        imprint_queue.task_done()
    except Exception:
        imprint_queue.put([])
        imprint_queue.task_done()
    return []
Example #9
def test_multi_type_cleanups():
    expected = "Hello World"
    errmsg = "cleanup of %s failed"
    for testname, variation in multi_cleanup_tests.items():
        result = cleanco(variation).clean_name(prefix=True,
                                               suffix=True,
                                               middle=True,
                                               multi=True)
        assert result == expected, errmsg % testname
Example #10
def load_stocks():
    stocks = requests.get('https://api.iextrading.com/1.0/ref-data/symbols')
    stocks = stocks.json()

    stock_map = {cleanco(stock['name']).clean_name(): stock['symbol'] for stock in stocks}

    for stock in stock_map:
        if 'WAYFAIR' in stock:
            print(stock, stock_map[stock])
Example #11
    def cross_validation(title, company_name):
        """
        :param title: The string of the title of the first result that google returns
        :param company_name: The company name that we use as a searching keyword
        :return: the number of words shared by the title and the keyword (truthy when they overlap).
        """
        company_name = company_name.translate(
            str.maketrans(
                string.punctuation, ' ' *
                len(string.punctuation)))  # Replace all punctuation with space
        company_name_set = set(
            cleanco(company_name).clean_name().lower().split(' '))

        title = title.translate(
            str.maketrans(
                string.punctuation, ' ' *
                len(string.punctuation)))  # Replace all punctuation with space
        title_set = set(cleanco(title).clean_name().lower().split(' '))

        return len(company_name_set & title_set)
Example #12
def convert_name(name):
    if isinstance(name, str):
        #print(name)
        cleaned_name = cleanco(name).clean_name()
        cleaned_name = cleanco(cleaned_name).clean_name()
        name = cleaned_name.translate(
            str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
        name_split = name.split()
        #            name_copy=name_split.copy()
        #            for s in name_copy:
        #                if len(s)==1:
        #                    name_split.remove(s)
        name_single_space = " ".join(name_split)
        final_name = name_single_space.upper()

        print(final_name)

    else:
        final_name = name
    return final_name
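A quick sketch of the round trip (the exact output depends on cleanco's term list):

# convert_name("Acme, Inc.")  ->  roughly "ACME"
# convert_name(None)          ->  None (non-strings pass through unchanged)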
Example #13
 def _clean_text(self, name, lower=True):
     try:
         if name:
             name = name.strip().lower()
             name = name.translate(str.maketrans(' ', ' ', PUNCT))
             name = re.sub(r'\s\s+', ' ', name)
             name = cleanco(name).clean_name()
             #name = name + ' website'
             name = name.replace(' ', '+')
             return name
         return name
     except Exception as ex:
         return ''
Example #14
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    anal = cleanco(word)  # cleanco analysis object (not used by the features below)
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
        'stop_word': word in stop,
        'hyphen': '-' in word,
        'size_small': len(word) <= 2,
        #'wordnet_lemmatizer': wordnet_lemmatizer.lemmatize(word),
        'stemmer_lanc': lancaster_stemmer.stem(word),
        #'has_number': hasNumbers(word),
        #'postag_similar_max': get_similar_words_pos(word)
        #'gaz_per': True if word in NAMES else False
    }
    if i > 0:
        word1 = sent[i - 1][0]
        postag1 = sent[i - 1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent) - 1:
        word1 = sent[i + 1][0]
        postag1 = sent[i + 1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features
Example #15
def footer_crawler(tree):
    tree = tree.xpath('//footer//text()')
    elements, imprint_names = [], []
    for element in tree:
        el = element
        if el is None: continue
        el = el.replace("\n", "").replace("\r", "").replace("\t", "")
        el = re.sub(' +', ' ', el).lower()
        if el == "None" or el == "" or el == " ": continue
        elements.append(el)
    for el in elements:
        name = cleanco(el)
        # somehow doesn't recognize "nielen schuman b.v."
        if name.type() is not None or name.country() is not None:
            imprint_names.append(name_cleaner(el))
    return imprint_names
Example #16
 def cleanMatches(self, matchFrame):
     stopwords = {'the'}
     for index, row in matchFrame.iterrows():
         tempString = cleanco(
             matchFrame.iloc[index]['company'].lower()).clean_name()
         resultwords = [
             word for word in re.split(r"\W+", tempString)
             if word.lower() not in stopwords
         ]
         result = ' '.join(resultwords)
         matchFrame.at[index, 'Clean Company'] = result.translate(
             str.maketrans('', '', string.punctuation))
         try:
             matchFrame.at[index, 'Email Domain'] = email_split(
                 row['email']).domain
         except Exception as e:
             print('error ' + str(e))
             print('no email')
Example #17
def name_cleaner(df):
    from cleanco import cleanco

    df_names = df['name'].fillna(' ')

    # removal of text between parentheses
    df_names = df_names.str.replace(r"\(.*\)", "", regex=True)

    # 'AND' and '&' are equivalent
    df_names = df_names.str.replace(' AND ', ' & ', regex=False)

    # cleaning utilities from the cleanco package (strips legal suffixes using its term database)
    df_names = df_names.str.replace('.', '', regex=False)
    df_names = df_names.apply(lambda x: cleanco(x).clean_name()
                              if isinstance(x, str) else x)

    # make all names lower-case
    df_names = df_names.str.lower()

    return df_names
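A minimal call sketch; the function hard-codes a 'name' column, and the sample frame here is made up:

# import pandas as pd
# df = pd.DataFrame({"name": ["Acme, Inc.", None, "Smith AND Sons Ltd"]})
# name_cleaner(df)  # -> Series of lower-cased names with legal suffixes stripped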
Example #18
    def process(self, in_, out):
        #f = ["year","month","carrier","carrier_name","airport","airport_name","arr_flights","arr_del15","carrier_ct","weather_ct","nas_ct","security_ct","late_aircraft_ct","arr_cancelled","arr_diverted","arr_delay"," carrier_delay","weather_delay","nas_delay","security_delay","late_aircraft_delay"]
        reader = csv.DictReader(open(in_, "r"), delimiter=",")
        for row in reader:

            row["date"] = row["year"] + "-" + row["month"]
            row["month"] = row["year"] + "-" + row["month"]
            row["airline"] = cleanco(row["carrier_name"]).clean_name()
            row["cancelled"] = row["arr_cancelled"]
            row["delay"] = mk_float(row['late_aircraft_delay']) + mk_float(
                row['carrier_delay']) + mk_float(row["arr_delay"])
            out_r = {}
            for k in self.fields:
                try:
                    out_r[k] = row[k]
                except KeyError:
                    pass
            out_r["date"] = out_r["date"] + "-01"
            out.writerow(out_r)
Example #19
    def process(self, in_, out):
        """

        :param in_:
        :param out: csv.DictWriter
        :return:
        """
        book = xlrd.open_workbook(in_)
        sheet = book.sheet_by_index(0)
        for row_index in range(1, sheet.nrows):
            # A B C D E F G H I J K  L  M  N  O  P
            # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
            # print(sheet.cell(row_index,1).value)
            # print(sheet.row(row_index))
            # print(type(sheet.cell(row_index,1).value))
            try:
                val = sheet.cell(row_index, 1).value if sheet.cell(
                    row_index, 1).value != "" else sheet.cell(
                        row_index, 2).value
                date = datetime.datetime(
                    *xlrd.xldate_as_tuple(val, book.datemode))
                airline_name = sheet.cell(row_index, 5).value
                val = sheet.cell(row_index, 9).value
                r = {
                    "date": date.date().strftime("%Y-%m-%d"),
                    "month": date.date().strftime("%Y-%m"),
                    "airline": cleanco(airline_name).clean_name(),
                    "item": sheet.cell(row_index, 8).value,
                    "claim_amount": val if val is not "-" else 0
                }

                l = [
                    sheet.cell(row_index, 2),
                    sheet.cell(row_index, 5),
                    sheet.cell(row_index, 8),
                    sheet.cell(row_index, 9)
                ]
                if not "" in r.values() and all(r.values()):
                    out.writerow(r)
            except Exception:
                pass  # skip malformed rows silently
Example #20
def generate_wordcloud(term_field):
    field = term_field.split('_')[-1]
    term = '_'.join(term_field.split('_')[:-1])
    query = """
        SELECT {}
        FROM web_certificates
        WHERE cert_id in (
            SELECT cert_id 
            FROM cert_search 
            WHERE text MATCH %s
        )
    """
    with create_connection() as conn:
        df = pd.read_sql(query.format(field), conn, params=[term])
    df['contractor_clean'] = df[field].apply(lambda x: cleanco(x).clean_name())
    relevant_words = [
        word.lower().lstrip().rstrip().replace('.', '')
        for word in df['contractor_clean']
    ]
    relevant_text = " ".join(relevant_words)
    stopwords = set(STOPWORDS)
    stopwords.update(general_terms + dvision_terms + term.split(' '))
    if field != 'owner':
        stopwords.update(geographic_locations)
    try:
        wordcloud = WordCloud(
            stopwords=stopwords,
            background_color=None,
            mode='RGBA',
            width=1000,
            height=400,
            color_func=lambda *args, **kwargs: "black").generate(
                relevant_text.upper())
        if len(wordcloud.words_):
            wordcloud.recolor(color_func=grey_color_func, random_state=3)
            wordcloud.to_file(
                f"static/wordcloud_{term.replace(' ', '_')}_{field}.png")
        return len(df), len(wordcloud.words_) / len(df)
    except ValueError:
        # search term did not generate enough words
        return len(df), 0
Example #21
    def cleanTargetAccounts(self, targetAccountsFrame):
        stopwords = {'the'}
        for index, row in targetAccountsFrame.iterrows():
            tempString = cleanco(targetAccountsFrame.iloc[index]
                                 ['Account Name'].lower()).clean_name()
            resultwords = [
                word for word in re.split(r"\W+", tempString)
                if word.lower() not in stopwords
            ]
            result = ' '.join(resultwords)
            targetAccountsFrame.at[index, 'Clean Target'] = result.translate(
                str.maketrans('', '', string.punctuation))
            try:
                if 'www.' in row['Website']:
                    targetAccountsFrame.at[index, 'Website Host'] = urlparse(
                        row['Website']).path.split('.')[1].lower()
                else:
                    targetAccountsFrame.at[index, 'Website Host'] = urlparse(
                        row['Website']).path.split('.')[0].lower()

            except Exception:
                targetAccountsFrame.at[index, 'Website Host'] = 'None'
Example #22
def clean_company_name(raw):
    if raw in (" ", "None", None):
        return ""
    name = unidecode.unidecode(raw)
    try:
        name = re.findall("o/a (.*)", name, flags=re.I)[0]
    except IndexError:
        pass
    try:
        name = re.findall("c/o (.*)", name, flags=re.I)[0]
    except IndexError:
        pass
    try:
        name = re.findall("(.*) for ", name, flags=re.I)[0]
    except IndexError:
        pass
    name = cleanco(name).clean_name()
    name = name.lower()
    for stopword in ["of", "d'", "l'"]:
        name = name.replace(stopword, "")
    name = name.replace("and", "&")
    for punct in ["-", ".", ",", "(", ")"]:
        name = name.replace(punct, " ")
    for punct in ["'"]:
        name = name.replace(punct, "")
    if (not name.startswith("s ")) and (" s " not in name):
        name = " ".join([word.rstrip("s") for word in name.split(" ")])
    name = "".join([word for word in name.split(" ")])
    for word in [
            "constructor",
            "construction",
            "contracting",
            "contractor",
            "mechanical",
            "plumbing",
            "heating",
            "mech",
            "electrical",
            "electric",
            "development",
            "interior"
            "builders",
            "building",
            "enterprise",
            "infrastructure",
            "management",
            "excavating",
            "trucking",
            "company",
            "restoration",
            "service",
            "servicing",
            "hvac",
            "system",
            "paving",
            "industrie",
            "industry",
            "engineering",
            "consulting",
            "consultant",
            "solution",
            "commercial",
            "group",
            "insulation",
            "insulators",
            "ontario",
            "canada",
    ]:
        name = name.replace(word, "")
    return name
Example #23
def test_with_unicode_umlauted_name():
    errmsg = "preserving cleanup of %s failed"
    for testname, (variation, expected) in unicode_umlaut_tests.items():
        assert cleanco(variation).clean_name() == expected, errmsg % testname
Example #24
def test_preserving_cleanups():
    errmsg = "preserving cleanup of %s failed"
    for testname, (variation, expected) in preserving_cleanup_tests.items():
        assert cleanco(variation).clean_name() == expected, errmsg % testname
Example #25
def clean_company_name(s):
    s = process_string(s)
    cleaned = cleanco(s)
    return cleaned.clean_name()
Example #26
def test_basic_cleanups():
    expected = "Hello World"
    errmsg = "cleanup of %s failed"
    for testname, variation in basic_cleanup_tests.items():
        assert cleanco(variation).clean_name() == expected, errmsg % testname
Example #27
def get_keyword_candidates(NounPhrase, doc_title):
    '''
    Get keyword candidates from a list of noun phrases, ranked by term frequency
    and by whether the phrase appears in the title
    - filter out city, US state and territory, country, and region names
    - filter out domain-specific stop words
    - filter out annual events (Super Bowl, Oscars, Grammys, etc.), politics words (Islam, terrorism, white house), natural disasters (flood, hurricane)

    Parameters
    -----------
    NounPhrase: list
        a list of noun phrases extracted from the article

    doc_title: str
        the title of the article where the noun phrases come from

    Returns
    -------
    reranked: list
        a list of keyword candidates
    '''

    import nltk
    import re
    import collections
    from fuzzywuzzy import fuzz
    from cleanco import cleanco

    # remove company suffix
    kw = []
    for np in NounPhrase:
        x = cleanco(np)
        kw.append(x.clean_name())
    # print(kw)
    kw0 = [
        re.sub('(Companys|Company|Companies|Firm|Organization|Corporation)',
               '', k) for k in kw
    ]
    kw0 = [k.strip() for k in kw0]  # remove whitespace

    # remove leading, tailing, between-character punctuation
    punctuation = '’!"#$%&\'()*+,./:;<=>?@[\\]^_`{|}~'
    kw1 = []
    for k in kw0:
        k = ''.join(ch for ch in k if ch not in punctuation)
        if k and k != ' ':  # remove empty string
            kw1.append(k)
    # print(kw1)

    # remove irrelevant keyword candidates
    ###### substring ######
    type1 = [
        'Percent', 'Major', 'Vote', 'Unit', 'Method', 'Option', 'Euro',
        'Angel', 'Offer', 'Market', 'Review', 'Nation', 'Present', 'Direct',
        'Terror', 'Islam', 'FY', 'Holder'
    ]

    kw2 = [
        word for word in kw1
        if not any(w.lower() in word.lower() for w in type1)
    ]
    #     print(kw2)

    ###### subword ######
    import calendar
    import us
    from geotext import GeoText

    # time related words
    time_ls = [weekday for weekday in calendar.day_name]  # weekdays: Monday, Tuesday, etc.
    time_ls.extend([m for m in calendar.day_abbr])  # weekday abbreviations: Mon, Tue, etc.
    time_ls.extend([month for month in calendar.month_name[1:]])  # months: January, February, etc.
    time_ls.extend([m for m in calendar.month_abbr[1:]])  # month abbreviations: Jan, Feb, etc.

    type2 = [
        'ISIS', 'Info', 'Cent', 'Part', 'RMB', 'EPS', 'SAR', 'IFRS', 'Plan',
        'Deal', 'Time', 'Age', 'Rate', 'NYSE', 'GNP', 'REIT', 'Mr', 'Mrs',
        'Ms', 'Co', 'Bn', 'Get', 'Bad', 'Dow', 'River', 'Lead', 'Employer',
        'Difference', 'ROI', 'AMEX', 'IRA', 'DJIA', 'NAV', 'PSP', 'FOREX',
        'EFT', 'ETF', 'FDIC', 'FRB', 'LOI', 'NAV', 'SEC', 'YTM', 'NDA', 'SP',
        'DC', 'etc', 'Zone', 'Such', 'SEK', 'Army', 'CFA', 'Net', 'Lake',
        'Hotel', 'HKD', 'IST', 'Side', 'EBITDA', 'FASB', 'FBMS', 'FDIC', 'GDP',
        'BFY', 'OWCP', 'Gov', 'BLS', 'DOL', 'FDA', 'Site', 'EIS', 'Page',
        'New', 'News', 'Old', 'Ltd', 'Corp', 'Task', 'Park', 'Esq', 'Tower',
        'State', 'Return', 'War', 'Snow', 'Sign', 'Step', 'Sale', 'NASDAQ',
        'Job', 'No', 'CAGR', 'Discount', 'FBI', 'IRS', 'Cash', 'IRR', 'Tax',
        'Taxation', 'Sir', 'Goal', 'Poor', 'Poors', 'ID', 'CPA', 'Hall',
        'Stake', 'Association', 'Provision', 'Way', 'Fact', 'Idea', 'Second',
        'First', 'Half', 'Role', 'Big', 'Act', 'Share', 'DOJ', 'Sum', 'ASX',
        'PhD', 'Line', 'Risk', 'Right', 'Rule', 'Read', 'See', 'TSX', 'Fed',
        'IDF', 'NZX', 'Lot', 'Name', 'Soldier', 'Storm', 'Loss', 'Gain',
        'Person', 'Late', 'Team', 'Debt', 'Cost', 'Same', 'Last', 'Only',
        'Area', 'Earnings', 'Earnings', 'Related', 'Performance', 'Palace',
        'Temple', 'Stages', 'Inc', 'FTSE', 'Further', 'Rain', 'Investigation',
        'Year', 'Month', 'Quarter', 'Day', 'Morning', 'Evening', 'Afternoon',
        'Date', 'Week', 'Hour', 'Minute', 'Period', 'Certain', 'Member',
        'Republic', 'Prospect', 'Senate', 'Growth', 'Oscars', 'Source',
        'Grammys', 'Clinton', 'Trump', 'Obama', 'Election', 'Federal',
        'Congress', 'Brand', 'Exchange', 'Authority', 'Requirement',
        'Additional', 'Purchase', 'Esquire', 'Institute', 'Place', 'Crime',
        'NBA', 'Available', 'EU', 'Party', 'Government', 'Department',
        'Ministry', 'Minister', 'President', 'Cabinet', 'Court', 'Bureau',
        'Country', 'Society', 'Capitol', 'Assumption', 'Little', 'Gross',
        'Corporate', 'Said', 'Shares', 'UN', 'Office', 'Officer', 'Board',
        'Police', 'Law', 'Attorney', 'Analyst', 'Council', 'Street', 'Union',
        'Branch', 'Request', 'Saving', 'Study', 'Expense', 'Strong', 'Per',
        'Appendix', 'Billion', 'Competitive', 'Now', 'Headquarters',
        'University', 'College', 'Instituition', 'School', 'Academy',
        'Airport', 'Station', 'Property', 'Avenue', 'Place', 'Quantity',
        'Attachment', 'Next', 'Title', 'Yield', 'Ill', 'Sept', 'CAD', 'Top',
        'Statement', 'Statements', 'Report', 'Reports', 'Sheet', 'Sheets',
        'Session', 'Term', 'Charter', 'Assessment', 'Application',
        'Instruction', 'Publication', 'Period', 'Chapter', 'Weather', 'Times',
        'EUR', 'Chair', 'Document', 'Information', 'Transaction', 'Content',
        'Press', 'Release', 'Journal', 'Form', 'Description', 'Section',
        'Subsidiary', 'Attached', 'Editions', 'Relevant', 'Comment',
        'Liquidity', 'Fortune', 'Free', 'Agreement', 'Settlement', 'Filing',
        'File', 'Award', 'Awards', 'Patent', 'Copyright', 'Strategy', 'Price',
        'Asset', 'Factor', 'Documentation', 'Impact', 'Initiative', 'Several',
        'Further', 'Choice', 'Sq', 'Ft', 'Dividend', 'Profit', 'Income',
        'Revenue', 'Margin', 'Interest', 'Influence', 'Problem', 'Securities',
        'Currency', 'Great', 'Wrong', 'Claim', 'Proceeding', 'Strategic',
        'Decision', 'Merge', 'Decline', 'Europe', 'Security', 'Bond',
        'Profile', 'Portfolio', 'Ratio', 'Rating', 'Value', 'Credit', 'Audit',
        'Future', 'Instrument', 'Instruments', 'Policy', 'Other', 'Different',
        'Expectation', 'Olympic', 'Decrease', 'Australia', 'Finance',
        'Financial', 'Financing', 'Component', 'Trade', 'Forecast',
        'Prediction', 'Buy', 'Sell', 'Index', 'Staff', 'Concern',
        'Expenditure', 'Justice', 'Edition', 'Inflation', 'Increase',
        'Continue', 'Africa', 'Business', 'Valuation', 'Series', 'Condition',
        'Disclosure', 'Regulation', 'Committee', 'Rating', 'Stock', 'Exchange',
        'Quality', 'Spokesman', 'Competition', 'Serious', 'Average', 'Balance',
        'Table', 'America', 'Acquisition', 'Outlook', 'Prospectus', 'Stage',
        'Executive', 'Budget', 'Investor', 'Owner', 'Leader',
        'Acknowledgement', 'Overall', 'Competitor', 'Daily', 'Current',
        'Medal', 'Buyouts', 'Allowance', 'Tsunami', 'Announcement',
        'Development', 'Account', 'Demand', 'Dollar', 'Crore', 'Pound',
        'Number', 'Round', 'Many', 'Range', 'Relationship', 'Important',
        'Chairman', 'Improvement', 'Allocation', 'Buyout', 'Flight', 'UK',
        'Assembly', 'Meeting', 'Conference', 'Access', 'Archive', 'Exhibit',
        'Opportunity', 'Chance', 'Responsibility', 'Parameter', 'Later', 'Key',
        'Hundred', 'Estimate', 'Phase', 'Judge', 'Governor', 'Asia', 'Drought',
        'Item', 'Product', 'Issue', 'Type', 'Class', 'Category', 'Amount',
        'Result', 'Notes', 'Event', 'Order', 'Basis', 'Previous', 'Employee',
        'Thousand', 'Summary', 'Chief', 'Position', 'Festival', 'Note',
        'Earthquake', 'Enquiry', 'Question', 'Answer', 'Reference', 'Action',
        'Story', 'Headline', 'World', 'Article', 'Figure', 'Promotion',
        'Certification', 'Level', 'Million', 'Notification', 'Principal',
        'Did', 'Road', 'Flood', 'US', 'High', 'Low', 'Total', 'Enough', 'Good',
        'Recent', 'Annual', 'Above', 'Detail', 'Aggregate', 'Former',
        'Manager', 'Effect', 'Thing', 'Standard', 'Deposit', 'Notice',
        'Mortgage', 'Certificate', 'Agent', 'Hurricane'
    ]

    type2.extend(time_ls)

    kw3 = [
        word for word in kw2
        if not any(w.lower() in word.lower().split() for w in type2)
    ]
    #     print(kw3)

    ###### the whole term ######
    type3 = [
        'Hongkong', 'Calif', 'Israel', 'Korea', 'England', 'Britain', 'Agency',
        'USA', 'U.S.A', 'U.S.', 'U.S', 'U.K.', 'UKs', 'UAE', 'Antarctica',
        'Store', 'Silicon Valley', 'West', 'East', 'North', 'South',
        'Northwest', 'Service Provider', 'Trust', 'Management', 'Partner',
        'Program', 'Group', 'Super Bowl', 'Limited Partner', 'General Partner',
        'San', 'Southwest', 'Justice', 'Commerce', 'Head', 'Due Diligence',
        'District', 'World', 'Square Foot', 'PartnerSite', 'Parent Company',
        'Don', 'Northeast', 'Partnership', 'Platform', 'Organization',
        'Corporation', 'Startup', 'Bank', 'Industry', 'Sector', 'Segment',
        'White House', 'Project', 'Research', 'Technology', 'Science',
        'Operation', 'Capital', 'Capitalization', 'Investment', 'IPO',
        'Investment', 'Tech', 'Engineering', 'Fund', 'Seed', 'Venture Capital',
        'Private Equity', 'Corporate Venture', 'Incubator', 'Accelerator',
        'Customer', 'Commission', 'Secretary', 'Client', 'Customer Service',
        'Shop', 'Restaurant', 'City', 'Facility', 'Joint Venture', 'Website',
        'Internet', 'Region', 'Function', 'General', 'House', 'Appointment',
        'Change', 'Founder', 'Author', 'Analysis', 'Full Text', 'Amendment',
        'Venture', 'Free Online', 'Life', 'Treasury', 'Center'
    ]

    # any US state or territories name
    state_names = [state.name for state in us.states.STATES_AND_TERRITORIES]
    type3.extend(state_names)

    kw4 = [
        word for word in kw3 if word.lower() not in [w.lower() for w in type3]
    ]
    # remove any country/city name
    kw4 = [
        word for word in kw4 if not GeoText(word.title()).countries
        if not GeoText(word.title()).cities
    ]
    #     print(kw4)

    ###### Remove word length equals 1 and not noun/np term ######
    kw5 = []
    good_tag = ['NNP', 'NN']
    for word in kw4:
        if len(word.split()) == 1:
            for word, tag in nltk.pos_tag(word.split()):
                if tag in good_tag:
                    kw5.append(word)
        else:
            kw5.append(word)
    # print(kw5)

    # 'Global shares', 'global Shares','global shares','GLOBAL SHARES' -> 'Global Shares' if it exists in the list
    kw6 = [w.title() if w.title() in kw5 else w for w in kw5]
    #     print(kw6)

    # count frequency
    c = collections.Counter(kw6)

    # 'Virtual Reality Technology','Virtual Reality','Technology' -> 'Virtual Reality Technology', 'Technology'
    kw6 = list(
        sorted(set(kw6), key=len,
               reverse=True))  # arrange in descending order of term length
    for i, word in enumerate(kw6):
        for j in range(i + 1, len(kw6)):
            if len(kw6[i].split()) == len(
                    kw6[j].split()) and fuzz.token_set_ratio(kw6[i],
                                                             kw6[j]) == 100:
                c[kw6[i]] += c[kw6[j]]
                del c[kw6[j]]
            elif len(kw6[i].split()) > len(kw6[j].split()) and kw6[i].split(
            )[0].lower() == kw6[j].split()[0].lower():
                c[kw6[j]] += c[kw6[i]]
                del c[kw6[i]]

    # 'Technology','Virtual Reality Technology' -> 'Virtual Reality Technology'
    # 'Agency', 'Estate Agency','Real Estate Agency' -> 'Real Estate Agency'
    kw7 = sorted([key for key, value in c.items()],
                 key=len)  # arrange in ascending order of  term length
    for i, word in enumerate(kw7):
        for j in range(i + 1, len(kw7)):
            if len(kw7[i].split()) < len(kw7[j].split()) and kw7[i].split(
            )[-1].lower() == kw7[j].split()[-1].lower():
                c[kw7[j]] += c[kw7[i]]
                del c[kw7[i]]


    # print(kw7)

    # Remove noun phrases longer than 3 words or shorter than 3 characters
    # sort by occurring frequency
    sorted_kw = [
        key
        for key, value in sorted(c.items(), key=lambda x: x[1], reverse=True)
        if len(key) > 2 and len(key.split()) < 4
    ]

    # Rerank by giving more weight to terms occurred in the title
    kw_scores = collections.OrderedDict()

    for kw in sorted_kw:
        pattern = re.compile(r'\b' + re.escape(kw) + r'(\b|[,;.!?]|\s)',
                             re.IGNORECASE)
        if pattern.search(doc_title):
            in_title = 1
        else:
            in_title = 0

        kw_scores[kw] = in_title

    in_title_list = []
    notin_title_list = []
    for term in kw_scores.items():
        if term[1] == 1:
            in_title_list.append(term[0])
        else:
            notin_title_list.append(term[0])
    reranked = in_title_list + notin_title_list

    return reranked
Example #28
    train_df['oscore'] = train_df.clean_beneficiary.apply(
        lambda x: get_score(x, o_word_dict))
    return train_df


if __name__ == '__main__':
    train_path = '/Users/aditya1/Downloads/DIAFTE-master 2/data/CSSol-2/NewData/whole_train_new.csv'
    test_path = '/Users/aditya1/Downloads/DIAFTE-master 2/data/CSSol-2/NewData/whole_test_new.csv'

    test_df = pd.read_csv(test_path)
    train_df = pd.read_csv(train_path)

    # Drop the first (saved index) column rather than resetting the index
    train_df = train_df[train_df.columns[1:]]
    test_df = test_df[test_df.columns[1:]]

    train_df['clean_beneficiary'] = train_df.beneficiary.apply(
        lambda x: clean_beneficiary(str(x)))
    test_df['clean_beneficiary'] = test_df.beneficiary.apply(
        lambda x: clean_beneficiary(str(x)))

    train_df['clean_beneficiary'] = train_df.clean_beneficiary.apply(
        lambda x: cleanco(str(x)).clean_name())
    test_df['clean_beneficiary'] = test_df.clean_beneficiary.apply(
        lambda x: cleanco(str(x)).clean_name())
    train_df = main(train_df)
    print(train_df)
    # test_df = main(test_df)
    # train_df.to_csv('updated_train_new.csv', index=False)
    # test_df.to_csv('updated_test_new.csv', index=False)
Example #29
def get_country_from_company_name(s):
    cleaned = cleanco(s)
    return cleaned.country()
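In the classic (pre-2.0) cleanco API, `country()` guesses jurisdictions from the legal suffix; a hedged usage sketch:

# get_country_from_company_name("Hello World Oy")
# -> cleanco's country guess for the "Oy" suffix (e.g. a list containing 'Finland')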
Example #30
from cleanco import cleanco

business_name = "Hello World, llc."
print("Inputted Business Name: %s" % business_name)
x = cleanco(business_name)
print("Clean Name: %s" % x.clean_name())
print("Business Type: %s" % x.type())
print("Country: %s" % x.country())
Example #31
 def ngrams(string, n=2):
     x = cleanco(str(string).lower()).clean_name()
     ngrams = zip(*[x[i:] for i in range(n)])
     return [''.join(ngram) for ngram in ngrams]
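A usage sketch: after cleanco strips the suffix, the zip trick slides an n-character window over the remaining string. Assuming "hello world llc" cleans to "hello world":

# ngrams("hello world llc")
# -> ['he', 'el', 'll', 'lo', 'o ', ' w', 'wo', 'or', 'rl', 'ld']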
Example #32
def test_commas():
    name = "bp, kek, llc"
    cleaned_name = cleanco(name).clean_name()
    assert cleaned_name == "bp, kek"
Example #33
def test_preserving_cleanups():
    errmsg = "preserving cleanup of %s failed"
    for testname, (variation, expected) in preserving_cleanup_tests.items():
        assert cleanco(variation).clean_name() == expected, errmsg % testname
Example #34
def test_with_unicode_umlauted_name():
    errmsg = "preserving cleanup of %s failed"
    for testname, (variation, expected) in unicode_umlaut_tests.items():
        assert cleanco(variation).clean_name() == expected, errmsg % testname
Example #35
def find_contacts_at_companies(input_companies_fname, input_contacts_fname):

    # Initialize lists
    target_company_list = []
    contacts_matrix = []
    contacts_matrix_noblanks = []
    contacts_at_companies_matrix = []
    any_contacts_at_company = False
    contact_match_count = 0
    companies_with_matches = []

    # Create constants that correspond to field positions.

    COMPANYNAME_OF_COMPANIES = 0
    COMPANYNAME_OF_COMPANIES_CLEANSED = 1
    FULLNAME_OF_CONTACT = 0
    EMAIL1_OF_CONTACT = 30
    EMAIL2_OF_CONTACT = 31
    COMPANY_OF_CONTACT = 64
    COMPANY_OF_CONTACT_CLEANSED = 65
    TITLE_OF_CONTACT = 66 + 1
    DEPARTMENT_OF_CONTACT = 67 + 1

    # Read data into lists from files
    with open(input_companies_fname, encoding='utf-8') as csvDataFile:
        csvReader = csv.reader(csvDataFile)
        for row in csvReader:
            target_company_list.append(row)

    with open(input_contacts_fname, encoding='utf-8') as csvDataFile:
        csvReader = csv.reader(csvDataFile)
        for row in csvReader:
            contacts_matrix.append(row)

    # Pop off the top row of each input containing the title row.

    target_company_list.pop(0)
    contacts_matrix.pop(0)

    print("Number of target accounts: ", len(target_company_list))
    print("Number of contacts in database: ", len(contacts_matrix))

    # Insert a cleansed version of the company name in each row in the company
    # data and in the contact data.

    # Insert a cleansed company field in the target company list data.

    for row_companies in target_company_list:
        row_companies.insert(COMPANYNAME_OF_COMPANIES_CLEANSED,
                             row_companies[COMPANYNAME_OF_COMPANIES])

    # Clean company control names like "Inc.," "Incorporated," etc.

    for row_companies in target_company_list:
        row_companies[COMPANYNAME_OF_COMPANIES_CLEANSED] = cleanco(
            row_companies[COMPANYNAME_OF_COMPANIES_CLEANSED]).clean_name()

        # Remove punctuation

        row_companies[COMPANYNAME_OF_COMPANIES_CLEANSED] = regex.sub(
            r"[[:punct:]]+", "",
            row_companies[COMPANYNAME_OF_COMPANIES_CLEANSED])

        # Make lower case and handle certain foreign language capitalization
        # conventions with casefold().

        row_companies[COMPANYNAME_OF_COMPANIES_CLEANSED] = row_companies[
            COMPANYNAME_OF_COMPANIES_CLEANSED].casefold()

        row_companies[COMPANYNAME_OF_COMPANIES_CLEANSED] = nltk.word_tokenize(
            row_companies[COMPANYNAME_OF_COMPANIES_CLEANSED])

        row_companies[COMPANYNAME_OF_COMPANIES_CLEANSED] = [
            t for t in row_companies[COMPANYNAME_OF_COMPANIES_CLEANSED]
            if t not in stopwords.words('english')
        ]

    # Get rid of all contacts without a company using a list comprehension.

    contacts_matrix_noblanks = [
        row for row in contacts_matrix if (row[COMPANY_OF_CONTACT] != "")
    ]

    # Now cleanse the contact company names.

    for row_contacts in contacts_matrix_noblanks:
        row_contacts.insert(COMPANY_OF_CONTACT_CLEANSED,
                            row_contacts[COMPANY_OF_CONTACT])

    for row_contacts in contacts_matrix_noblanks:
        row_contacts[COMPANY_OF_CONTACT_CLEANSED] = cleanco(
            row_contacts[COMPANY_OF_CONTACT_CLEANSED]).clean_name()

        row_contacts[COMPANY_OF_CONTACT_CLEANSED] = regex.sub(
            r"[[:punct:]]+", "", row_contacts[COMPANY_OF_CONTACT_CLEANSED])

        row_contacts[COMPANY_OF_CONTACT_CLEANSED] = row_contacts[
            COMPANY_OF_CONTACT_CLEANSED].casefold()

        row_contacts[COMPANY_OF_CONTACT_CLEANSED] = nltk.word_tokenize(
            row_contacts[COMPANY_OF_CONTACT_CLEANSED])

        row_contacts[COMPANY_OF_CONTACT_CLEANSED] = [
            t for t in row_contacts[COMPANY_OF_CONTACT_CLEANSED]
            if t not in stopwords.words('english')
        ]

    # Walk through the companies list.  For each company, go through the
    # companies of contacts (in the noblanks matrix) and find matches where the
    # cleansed token sets are equal or one is a proper subset of the other.

    for row_companies in target_company_list:
        any_contacts_at_company = False
        c = row_companies[COMPANYNAME_OF_COMPANIES_CLEANSED]
        for row_contacts in contacts_matrix_noblanks:
            coc = row_contacts[COMPANY_OF_CONTACT_CLEANSED]
            if c == coc or (set(coc) < set(c)) or (set(c) < set(coc)):
                contacts_at_companies_matrix.append([
                    row_companies[COMPANYNAME_OF_COMPANIES],
                    row_contacts[COMPANY_OF_CONTACT],
                    row_contacts[FULLNAME_OF_CONTACT],
                    row_contacts[TITLE_OF_CONTACT],
                    row_contacts[DEPARTMENT_OF_CONTACT],
                    row_contacts[EMAIL1_OF_CONTACT],
                    row_contacts[EMAIL2_OF_CONTACT]
                ])
                any_contacts_at_company = True
                contact_match_count += 1
                companies_with_matches.append(
                    row_companies[COMPANYNAME_OF_COMPANIES])
        if not any_contacts_at_company:
            contacts_at_companies_matrix.append(
                [row_companies[COMPANYNAME_OF_COMPANIES]])

    contacts_at_companies_matrix.insert(0, [
        "Company Name", "Contact Company Name", "Contact Full Name",
        "Contact Title", "Contact Department", "Contact Email1",
        "Contact Email2"
    ])

    companies_with_matches = list(sorted(set(companies_with_matches)))

    print("Number of target accounts with contacts: ",
          len(companies_with_matches))
    print("Number of contacts found at target accounts: ", contact_match_count)

    return contacts_at_companies_matrix, companies_with_matches
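A hypothetical invocation (both file names are placeholders; the CSV layouts must match the hard-coded column indices above):

# matrix, matched = find_contacts_at_companies("target_companies.csv", "contacts_export.csv")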
Example #36
def test_basic_cleanups():
    expected = "Hello World"
    errmsg = "cleanup of %s failed"
    for testname, variation in basic_cleanup_tests.items():
        assert cleanco(variation).clean_name() == expected, errmsg % testname