Exemplo n.º 1
0
 def test_strip_issuer(self):
     """Check strip_issuer_name against a table of known issuer names."""
     # Maps each raw issuer name to the value strip_issuer_name must return.
     self.issuers_to_clean = {
         'WHIRLPOOL CORP DE': 'WHIRLPOOL',
         'WESTERN UNION CO': 'WESTERN UNION',
     }
     for raw_name, expected in self.issuers_to_clean.items():
         self.assertEqual(expected, strip_issuer_name(raw_name))
def _name_match_points(left, right, exact_points, cleaned_points=50):
    """Score how closely two names agree, ignoring case.

    Returns ``exact_points`` when the names are equal, ``cleaned_points``
    when they only become equal after ``strip_issuer_name``, and 0 otherwise.
    """
    if left.lower() == right.lower():
        return exact_points

    # Kept as a function-scope import (as the original did); it is only
    # needed once the exact comparison has failed.
    from cleaning import strip_issuer_name

    if strip_issuer_name(left).lower() == strip_issuer_name(right).lower():
        return cleaned_points
    return 0


def score_item_relevance(item):
    """Return a relevance score for *item* based on how its names agree.

    *item* is a mapping that must contain 'filing_person' and 'search_term'
    and may contain 'issuer_name'. All comparisons are case-insensitive.

    When 'issuer_name' is present and non-empty, the score is the sum of:
      * issuer vs. filing person: 100 if equal, 50 if equal after cleaning;
      * issuer vs. search term:   100 if equal, 50 if equal after cleaning.

    Otherwise the search term is compared with the filing person:
    70 if equal, 50 if equal after cleaning.

    Raises KeyError if a required key is missing.
    """
    issuer_name = item.get('issuer_name')
    if issuer_name:
        score = _name_match_points(issuer_name, item['filing_person'], 100)
        # The original compared the bound method ``.lower`` (no call) with a
        # string here, so an exact issuer/search-term match never scored 100.
        score += _name_match_points(issuer_name, item['search_term'], 100)
        return score

    return _name_match_points(item['search_term'], item['filing_person'], 70)
def _name_match_points(left, right, exact_points, cleaned_points=50):
    """Score how closely two names agree, ignoring case.

    Returns ``exact_points`` when the names are equal, ``cleaned_points``
    when they only become equal after ``strip_issuer_name``, and 0 otherwise.
    """
    if left.lower() == right.lower():
        return exact_points

    # Kept as a function-scope import (as the original did); it is only
    # needed once the exact comparison has failed.
    from cleaning import strip_issuer_name

    if strip_issuer_name(left).lower() == strip_issuer_name(right).lower():
        return cleaned_points
    return 0


def score_item_relevance(item):
    """Return a relevance score for *item* based on how its names agree.

    *item* is a mapping that must contain "filing_person" and "search_term"
    and may contain "issuer_name". All comparisons are case-insensitive.

    When "issuer_name" is present and non-empty, the score is the sum of:
      * issuer vs. filing person: 100 if equal, 50 if equal after cleaning;
      * issuer vs. search term:   100 if equal, 50 if equal after cleaning.

    Otherwise the search term is compared with the filing person:
    70 if equal, 50 if equal after cleaning.

    Raises KeyError if a required key is missing.
    """
    issuer_name = item.get("issuer_name")
    if issuer_name:
        score = _name_match_points(issuer_name, item["filing_person"], 100)
        # The original compared the bound method ``.lower`` (no call) with a
        # string here, so an exact issuer/search-term match never scored 100.
        score += _name_match_points(issuer_name, item["search_term"], 100)
        return score

    return _name_match_points(item["search_term"], item["filing_person"], 70)
Exemplo n.º 4
0
        writer = csv.DictWriter(
            issuers_out_csv,
            fieldnames=['CUSIP6', 'SEARCH_TERM', 'ISSUER_NAME', 'ADDRESS'])
        writer.writeheader()

        with open("cleaned-issuer-cusip6s-rejected.csv",
                  "w") as rejected_out_csv:
            rejected_writer = csv.DictWriter(rejected_out_csv,
                                             fieldnames=[
                                                 'CUSIP6', 'SEARCH_TERM',
                                                 'ISSUER_NAME',
                                                 'CLEANED_SEARCH_TERM',
                                                 'CLEANED_ISSUER_NAME'
                                             ])
            for row in reader:
                cleaned_search_term = strip_issuer_name(row['SEARCH_TERM'])
                cleaned_issuer_name = strip_issuer_name(row['ISSUER_NAME'])

                if cleaned_issuer_name == cleaned_search_term:
                    row = {
                        'SEARCH_TERM': row['SEARCH_TERM'],
                        'ISSUER_NAME': row['ISSUER_NAME'],
                        'CUSIP6': row['CUSIP6'],
                        'ADDRESS': row['ADDRESS']
                    }
                    writer.writerow(row)
                else:
                    print "Rejecting %s, %s" % (cleaned_issuer_name,
                                                cleaned_search_term)
                    rejected_writer.writerow({
                        'CUSIP6': row['CUSIP6'],
import csv

# Take the issuer-cusip6's and standardize issuer names
from cleaning import strip_issuer_name

# Split issuer-cusip6s.csv into two files: rows whose cleaned search term
# equals the cleaned issuer name are kept, all others go to the rejects file.
ACCEPTED_FIELDS = ['CUSIP6', 'SEARCH_TERM', 'ISSUER_NAME', 'ADDRESS']
REJECTED_FIELDS = ['CUSIP6', 'SEARCH_TERM', 'ISSUER_NAME',
                   'CLEANED_SEARCH_TERM', 'CLEANED_ISSUER_NAME']

with open("issuer-cusip6s.csv", "r") as issuers_csv:
    reader = csv.DictReader(issuers_csv)
    with open("cleaned-issuer-cusip6s.csv", "w") as issuers_out_csv:
        writer = csv.DictWriter(issuers_out_csv, fieldnames=ACCEPTED_FIELDS)
        writer.writeheader()

        # Note: the rejects file is written without a header row.
        with open("cleaned-issuer-cusip6s-rejected.csv", "w") as rejected_out_csv:
            rejected_writer = csv.DictWriter(rejected_out_csv,
                                             fieldnames=REJECTED_FIELDS)
            for row in reader:
                cleaned_search_term = strip_issuer_name(row['SEARCH_TERM'])
                cleaned_issuer_name = strip_issuer_name(row['ISSUER_NAME'])

                if cleaned_issuer_name != cleaned_search_term:
                    print("Rejecting %s, %s" % (cleaned_issuer_name,
                                                cleaned_search_term))
                    rejected_writer.writerow({
                        'CUSIP6': row['CUSIP6'],
                        'CLEANED_SEARCH_TERM': cleaned_search_term,
                        'CLEANED_ISSUER_NAME': cleaned_issuer_name,
                        'SEARCH_TERM': row['SEARCH_TERM'],
                        'ISSUER_NAME': row['ISSUER_NAME'],
                    })
                else:
                    # Keep only the output columns before writing the row.
                    row = {
                        'SEARCH_TERM': row['SEARCH_TERM'],
                        'ISSUER_NAME': row['ISSUER_NAME'],
                        'CUSIP6': row['CUSIP6'],
                        'ADDRESS': row['ADDRESS'],
                    }
                    writer.writerow(row)
Exemplo n.º 6
0
# Shell steps, run in order; exit statuses are not checked:
#   1. run the CUSIP6 extraction script,
#   2. drop the first line of issuer_cusip6s.csv (tail -n+2), in place,
#   3. feed the import SQL script to the ingest.db SQLite database.
for command in (
        "python extract_cusip6_from_valid_items.py",
        "tail -n+2 issuer_cusip6s.csv > issuer_cusip6s.csv.tmp; mv issuer_cusip6s.csv.tmp issuer_cusip6s.csv",
        "sqlite3 ingest.db < sql/import-issuer-cusip6s.sql"):
    os.system(command)

# Need to implement a fuzzy match to find the companies that do not have a similar issuer in distinct_issuers
import sqlite3
con = sqlite3.connect("ingest.db")
cur = con.cursor()

# Materialise the company list first so the cursor can be reused below.
companies = [row[0] for row in cur.execute("select company from sp500_companies")]

# Counts issuer rows with a non-empty address whose issuer name or filing
# person starts with a given prefix. The prefix is passed as a bound
# parameter, so names containing quotes (e.g. an apostrophe) cannot break
# or alter the statement.
query = ("select count(*) from issuer_cusip6s"
         " where (issuer_name like ? or filing_person like ?)"
         " and address not like ''")

import csv
with open("missing-sp500-issuers.csv", "w") as out_csv:
    writer = csv.writer(out_csv)

    for company in companies:
        # Prefix match on the cleaned name, e.g. 'TWENTY FIRST CENTURY FOX%'.
        prefix = strip_issuer_name(company) + "%"
        cur.execute(query, (prefix, prefix))
        row = cur.fetchone()

        # No matching issuer: report the company and record it in the CSV.
        if row[0] == 0:
            print(company)
            writer.writerow([company])



 def test_strip_issuer(self):
     """Check strip_issuer_name against a table of known issuer names."""
     # Maps each raw issuer name to the value strip_issuer_name must return.
     self.issuers_to_clean = {
         'WHIRLPOOL CORP DE': 'WHIRLPOOL',
         'WESTERN UNION CO': 'WESTERN UNION',
     }
     for raw_name, expected in self.issuers_to_clean.items():
         self.assertEqual(expected, strip_issuer_name(raw_name))
Exemplo n.º 8
0
def read_fieldnames(path):
    """Return the first CSV record of the file at *path* as a list of strings.

    The first record is taken to be the header row. An empty file yields an
    empty list instead of raising StopIteration.
    """
    with open(path, "r") as path_csv:
        # The builtin next() works on Python 2.6+ and Python 3, whereas the
        # reader's .next() method exists only on Python 2.
        return next(csv.reader(path_csv), [])

# Command line: one positional argument naming the CSV file to enrich.
parser = argparse.ArgumentParser(description='Enrich file with issuer name stem.')
parser.add_argument("path", help="path to input file")
args = parser.parse_args()
fieldnames = read_fieldnames(args.path)

# Only the bare file name (no directory, no extension) feeds the output
# name, so the output path is relative to the current working directory.
basename = os.path.basename(args.path)
root, extension = os.path.splitext(basename)

with open(args.path, "r") as path_csv:
    reader = csv.DictReader(path_csv)

    import re
    # Use '_' in the output name if the input name contains one, else '-'.
    separator = '_' if re.search(r'_', root) else '-'

    with open("%s%swith%sstems.csv" % (root, separator, separator), "w") as out_csv:
        writer = csv.DictWriter(out_csv, fieldnames + ['STEM'])
        writer.writeheader()
        # Copy every input row, adding the cleaned search term as 'STEM'.
        for row in reader:
            row.update({'STEM': strip_issuer_name(row['SEARCH_TERM'])})
            writer.writerow(row)