def test_strip_issuer(self):
    self.issuers_to_clean = {
        'WHIRLPOOL CORP DE': 'WHIRLPOOL',
        'WESTERN UNION CO': 'WESTERN UNION',
    }
    for company in self.issuers_to_clean:
        self.assertEqual(self.issuers_to_clean[company],
                         strip_issuer_name(company))
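# A minimal sketch of what strip_issuer_name in cleaning.py might look like,
# inferred only from the test above; the real implementation may handle more
# suffixes and edge cases.
import re

_SUFFIXES = r'\b(CORP|CO|INC|LTD|PLC|DE)\b'

def strip_issuer_name(name):
    # Drop trailing corporate suffixes and state-of-incorporation tokens,
    # then collapse whitespace: 'WHIRLPOOL CORP DE' -> 'WHIRLPOOL'.
    stem = re.sub(_SUFFIXES, ' ', name.upper())
    return re.sub(r'\s+', ' ', stem).strip()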
def score_item_relevance(item):
    """Score how well a filing item matches the search term and filer."""
    from cleaning import strip_issuer_name
    score = 0
    if item.get('issuer_name'):
        # An exact issuer/filer match is worth more than a match on the stems.
        if item['issuer_name'].lower() == item['filing_person'].lower():
            score += 100
        elif (strip_issuer_name(item['issuer_name']).lower() ==
              strip_issuer_name(item['filing_person']).lower()):
            score += 50
        if item['issuer_name'].lower() == item['search_term'].lower():
            score += 100
        elif (strip_issuer_name(item['issuer_name']).lower() ==
              strip_issuer_name(item['search_term']).lower()):
            score += 50
        return score
    # No issuer name on the item: fall back to comparing the search term
    # against the filing person.
    if item['search_term'].lower() == item['filing_person'].lower():
        score += 70
    elif (strip_issuer_name(item['search_term']).lower() ==
          strip_issuer_name(item['filing_person']).lower()):
        score += 50
    return score
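# Hypothetical usage sketch of score_item_relevance (not from the original
# code): the item dict and the expected score assume strip_issuer_name()
# reduces 'WHIRLPOOL CORP DE' to 'WHIRLPOOL', as in the test above.
item = {
    'issuer_name': 'WHIRLPOOL CORP DE',
    'filing_person': 'WHIRLPOOL CORP DE',
    'search_term': 'WHIRLPOOL',
}
# Exact issuer/filer match (+100) plus a stem match between the issuer and
# the search term (+50) would give 150 under those assumptions.
print(score_item_relevance(item))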
import csv

# Take the issuer-cusip6s and standardize issuer names.
from cleaning import strip_issuer_name

with open("issuer-cusip6s.csv", "r") as issuers_csv:
    reader = csv.DictReader(issuers_csv)
    with open("cleaned-issuer-cusip6s.csv", "w") as issuers_out_csv:
        writer = csv.DictWriter(
            issuers_out_csv,
            fieldnames=['CUSIP6', 'SEARCH_TERM', 'ISSUER_NAME', 'ADDRESS'])
        writer.writeheader()
        with open("cleaned-issuer-cusip6s-rejected.csv", "w") as rejected_out_csv:
            rejected_writer = csv.DictWriter(
                rejected_out_csv,
                fieldnames=['CUSIP6', 'SEARCH_TERM', 'ISSUER_NAME',
                            'CLEANED_SEARCH_TERM', 'CLEANED_ISSUER_NAME'])
            for row in reader:
                cleaned_search_term = strip_issuer_name(row['SEARCH_TERM'])
                cleaned_issuer_name = strip_issuer_name(row['ISSUER_NAME'])
                if cleaned_issuer_name == cleaned_search_term:
                    # Stems agree: keep the row with the original names.
                    writer.writerow({
                        'SEARCH_TERM': row['SEARCH_TERM'],
                        'ISSUER_NAME': row['ISSUER_NAME'],
                        'CUSIP6': row['CUSIP6'],
                        'ADDRESS': row['ADDRESS'],
                    })
                else:
                    # Stems disagree: log the mismatch for manual review.
                    print "Rejecting %s, %s" % (cleaned_issuer_name,
                                                cleaned_search_term)
                    rejected_writer.writerow({
                        'CUSIP6': row['CUSIP6'],
                        'CLEANED_SEARCH_TERM': cleaned_search_term,
                        'CLEANED_ISSUER_NAME': cleaned_issuer_name,
                        'SEARCH_TERM': row['SEARCH_TERM'],
                        'ISSUER_NAME': row['ISSUER_NAME'],
                    })
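# Illustration (hypothetical rows, not taken from the real data): a row whose
# SEARCH_TERM stems to the same string as its ISSUER_NAME, e.g.
# ('WESTERN UNION', 'WESTERN UNION CO'), is written to
# cleaned-issuer-cusip6s.csv; a row where the stems differ, e.g.
# ('WESTERN UNION', 'FIRST DATA CORP'), goes to
# cleaned-issuer-cusip6s-rejected.csv along with both cleaned names so the
# mismatch can be reviewed by hand.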
os.system("python extract_cusip6_from_valid_items.py") os.system("tail -n+2 issuer_cusip6s.csv > issuer_cusip6s.csv.tmp; mv issuer_cusip6s.csv.tmp issuer_cusip6s.csv") os.system("sqlite3 ingest.db < sql/import-issuer-cusip6s.sql") # Need to implement a fuzzy match to find the companies that do not have a similar issuer in distinct_issuers import sqlite3 con = sqlite3.connect("ingest.db") cur = con.cursor() companies = [row[0] for row in cur.execute("select company from sp500_companies")] import csv with open("missing-sp500-issuers.csv", "w") as out_csv: writer = csv.writer(out_csv) for company in companies: #print strip_issuer_name(company) #"select count(*) from issuer_cusip6s where issuer_name like 'TWENTY FIRST CENTURY FOX%' or filing_person like 'TWENTY FIRST CENTURY FOX%'" query = "select count(*) from issuer_cusip6s where (issuer_name like '" + strip_issuer_name(company) + "%'"\ " or filing_person like '" + strip_issuer_name(company) + "%') and address not like ''" cur.execute(query) row = cur.fetchone() if row[0] == 0: print company writer.writerow([company])
import argparse
import csv
import os
import re

from cleaning import strip_issuer_name


def read_fieldnames(path):
    # Grab the header row so the output file can reuse the same columns.
    with open(path, "r") as path_csv:
        reader = csv.reader(path_csv)
        return reader.next()


parser = argparse.ArgumentParser(description='Enrich file with issuer name stem.')
parser.add_argument("path", help="path to input file")
args = parser.parse_args()

fieldnames = read_fieldnames(args.path)
basename = os.path.basename(args.path)
(root, extension) = os.path.splitext(basename)

with open(args.path, "r") as path_csv:
    reader = csv.DictReader(path_csv)
    # Match the separator style already used in the input file name.
    if re.search(r'_', root):
        separator = '_'
    else:
        separator = '-'
    with open(root + "%swith%sstems.csv" % (separator, separator), "w") as out_csv:
        writer = csv.DictWriter(out_csv, fieldnames + ['STEM'])
        writer.writeheader()
        for row in reader:
            row['STEM'] = strip_issuer_name(row['SEARCH_TERM'])
            writer.writerow(row)
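# Hypothetical invocation (script and file names assumed, not from the
# original):
#
#   python enrich_with_stems.py cleaned-issuer-cusip6s.csv
#
# would write cleaned-issuer-cusip6s-with-stems.csv containing the original
# columns plus a STEM column holding strip_issuer_name(SEARCH_TERM) for each
# row.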