def read(self, request, **kwargs):
    """Fetch rows via the parent handler, then decorate each row with
    standardized names, matching slugs, and a comma-formatted total."""
    kwargs['name'] = request.GET.get('name', '')
    rows = super(DetailExplorerHandler, self).read(request, **kwargs)
    from name_cleaver import OrganizationNameCleaver, IndividualNameCleaver, PoliticianNameCleaver
    from django.contrib.humanize.templatetags.humanize import intcomma
    from django.template.defaultfilters import slugify
    for record in rows:
        # Falsy names pass through untouched; the cleavers only see real strings.
        lobbyist = record['lobbyist_name']
        if lobbyist:
            lobbyist = IndividualNameCleaver(lobbyist).parse()
        record['lobbyist_name_standardized'] = lobbyist
        record['lobbyist_name_slug'] = slugify(lobbyist)
        firm = record['firm_name']
        if firm:
            firm = OrganizationNameCleaver(firm).parse()
        record['firm_name_standardized'] = firm
        record['firm_name_slug'] = slugify(firm)
        # A recipient id selects the politician cleaver; otherwise treat the
        # recipient as an organization.
        cleaver = PoliticianNameCleaver if record['recipient_id'] else OrganizationNameCleaver
        record['recipient_name_standardized'] = cleaver(record['recipient_name']).parse()
        record['recipient_name_slug'] = slugify(record['recipient_name_standardized'])
        record['total_amount_standardized'] = intcomma(record['total_amount'])
    return rows
def test_capitalizes_letter_after_slash(self):
    """The letter immediately following a slash should be upper-cased."""
    expectations = (
        ('HEALTH SERVICES/HMOS', 'Health Services/Hmos'),
        ('LAWYERS/LAW FIRMS', 'Lawyers/Law Firms'),
    )
    for raw, cleaned in expectations:
        self.assertEqual(cleaned, str(OrganizationNameCleaver(raw).parse()))
def test_capitalize_scottish_names(self):
    """Mc/Mac surname prefixes keep their interior capital letter."""
    for raw, cleaned in (('MCDONNELL DOUGLAS', 'McDonnell Douglas'),
                         ('MACDONNELL DOUGLAS', 'MacDonnell Douglas')):
        self.assertEqual(cleaned, str(OrganizationNameCleaver(raw).parse()))
def test_expand(self):
    """expand() spells out common abbreviations such as Corp. and Inst."""
    raytheon = OrganizationNameCleaver('Raytheon Corp.').parse()
    self.assertEqual('Raytheon Corporation', raytheon.expand())
    mit = OrganizationNameCleaver('Massachusetts Inst. of Technology').parse()
    self.assertEqual('Massachusetts Institute of Technology', mit.expand())
def test_dont_strip_after_hyphens_too_soon_in_a_name(self):
    """A hyphen near the start of a name must not truncate the kernel."""
    council = OrganizationNameCleaver('US-Russia Business Council').parse()
    self.assertEqual('US-Russia Business Council', council.kernel())
    walmart = OrganizationNameCleaver('Wal-Mart Stores, Inc.').parse()
    self.assertEqual('Wal-Mart Stores', walmart.kernel())
def normalize_organization(alias):
    """Return the standardized (and, when different, expanded) forms of an
    organization name.

    Returns a single-element list when parse(safe=True) falls back to a
    plain string, or when the expanded form equals the standardized one;
    otherwise [standardized, expanded].
    """
    parts = OrganizationNameCleaver(alias).parse(safe=True)
    # parse(safe=True) hands back a bare string when it can't build a name
    # object; there is nothing further to standardize in that case.
    if isinstance(parts, (str, unicode)):
        return [parts]
    standardized = str(parts)  # was parts.__str__(); prefer the builtin call
    expanded = parts.expand()
    if standardized == expanded:
        return [standardized]
    return [standardized, expanded]
def process_file(filingnum, csvwriter, name): f1 = filing(filingnum) f1.download() form = f1.get_form_type() version = f1.get_version() # only parse forms that we're set up to read if not fp.is_allowed_form(form): print "Not a parseable form: %s - %s" % (form, filingnum) return print "Found form: %s - %s" % (form, filingnum) #rows = f1.get_all_rows() rows = f1.get_rows('^SB') #print "rows: %s" % rows for row in rows: # the last line is empty, so don't try to parse it if len(row) > 1: #print "in filing: %s" % filingnum parsed_line = fp.parse_form_line(row, version) orgname = parsed_line['payee_organization_name'].replace('"', '') parsed_line['orgname_parsed'] = str( OrganizationNameCleaver(orgname).parse()) parsed_line['committee_name'] = name #map_parsed_line(parsed_line) csvwriter.writerow(parsed_line)
def test_strip_hyphens_more_than_three_characters_into_a_name(self):
    """A hyphen deep inside a name truncates the kernel at that point.

    This is not ideal for this particular name, but we can't get the
    best result for every case.
    """
    parsed = OrganizationNameCleaver(
        'F. HOFFMANN-LA ROCHE LTD and its Affiliates').parse()
    self.assertEqual('F Hoffmann', parsed.kernel())
def test_kernel(self):
    """kernel() keeps only the unique/meaningful words of a name."""
    cases = (
        ('Massachusetts Inst. of Technology', 'Massachusetts Technology'),
        ('Massachusetts Institute of Technology', 'Massachusetts Technology'),
        ('The Walsh Group', 'Walsh'),
        ('Health Net Inc', 'Health Net'),
        ('Health Net, Inc.', 'Health Net'),
        ('Distilled Spirits Council of the U.S., Inc.',
         'Distilled Spirits Council'),
    )
    for raw, kernel in cases:
        self.assertEqual(kernel,
                         OrganizationNameCleaver(raw).parse().kernel())
from django.http import Http404
from django.template.defaultfilters import slugify
from settings import api, LATEST_CYCLE, DEFAULT_CYCLE
import datetime
import googleanalytics
import re
from django.utils.datastructures import SortedDict
from name_cleaver import PoliticianNameCleaver, OrganizationNameCleaver, \
    IndividualNameCleaver
from name_cleaver.names import PoliticianName

# Registry mapping an entity type string to a callable that parses a raw
# name into the appropriate name_cleaver name object for that type.
_standardizers = {
    'politician': lambda n: PoliticianNameCleaver(n).parse(),
    'individual': lambda n: IndividualNameCleaver(n).parse(),
    'industry': lambda n: OrganizationNameCleaver(n).parse(),
    'organization': lambda n: OrganizationNameCleaver(n).parse(),
}


def standardize_name(name, type):
    # Parse `name` with the cleaver registered for `type`; when the result
    # carries an honorific, wrap it in parentheses for display. If the parsed
    # object has no `honorific` attribute at all (presumably non-politician
    # results — confirm against name_cleaver), the AttributeError fallback
    # simply re-parses and returns the plain result.
    try:
        standardized_name = _standardizers[type](name)
        if standardized_name.honorific:
            standardized_name.honorific = "("+standardized_name.honorific+")"
        return standardized_name
    except AttributeError:
        return _standardizers[type](name)


def bar_validate(data):
    ''' take a dict formatted for submission to the barchart generation function, and make sure there's data worth displaying.
def build_section_data(self):
    """Assemble contribution-chart data for an individual's profile section.

    Builds barchart JSON for candidate and organization recipients, a
    party-breakdown pie, external links, and bundling rows; suppresses the
    graphs when the aggregate total is negative or every chart is empty.
    """
    entity_id, cycle, standardized_name, external_ids = self.entity.entity_id, self.entity.cycle, self.entity.standardized_name, self.entity.external_ids
    self.contributions_data = True
    candidates_barchart_data = []
    for record in self.data['recipient_candidates']:
        candidates_barchart_data.append({
            # Label: cleaned politician name annotated with party/state.
            'key': generate_label(
                str(
                    PoliticianNameCleaver(
                        record['recipient_name']).parse().plus_metadata(
                            record['party'], record['state']))),
            'value': record['amount'],
            'href': barchart_href(record, cycle, entity_type="politician"),
        })
    self.candidates_barchart_data = json.dumps(
        bar_validate(candidates_barchart_data))
    orgs_barchart_data = []
    for record in self.data['recipient_orgs']:
        orgs_barchart_data.append({
            'key': generate_label(
                str(
                    OrganizationNameCleaver(
                        record['recipient_name']).parse())),
            'value': record['amount'],
            'href': barchart_href(record, cycle, entity_type="organization"),
        })
    self.orgs_barchart_data = json.dumps(bar_validate(orgs_barchart_data))
    # values appears to be a [count, amount] pair (the sibling section
    # documents it as such); keep only the dollar amount for the pie.
    for key, values in self.data['party_breakdown'].iteritems():
        self.data['party_breakdown'][key] = float(values[1])
    self.party_breakdown = json.dumps(
        pie_validate(self.data['party_breakdown']))
    # if none of the charts have data, or if the aggregate total
    # received was negative, then suppress that whole content
    # section except the overview bar
    amount = int(
        float(self.entity.metadata['entity_info']['totals']
              ['contributor_amount']))
    if amount < 0:
        self.suppress_contrib_graphs = True
        self.reason = "negative"
    elif (not self.candidates_barchart_data
          and not self.orgs_barchart_data and not self.party_breakdown):
        self.suppress_contrib_graphs = True
        self.reason = 'empty'
    self.external_links = external_sites.get_contribution_links(
        'individual', standardized_name, external_ids, cycle)
    # One row per bundling record, columns in the listed field order.
    self.bundling_data = [[
        x[key] for key in
        'recipient_entity recipient_name recipient_type firm_entity firm_name amount'
        .split()
    ] for x in self.data['bundling']]
def test_expand_with_two_tokens_to_expand(self):
    """Both Co. and Inc. should be expanded within one name."""
    expanded = OrganizationNameCleaver('Merck & Co., Inc.').parse().expand()
    self.assertEqual('Merck & Company Incorporated', expanded)
def test_organization(self):
    """Non-ASCII characters survive a round trip through the cleaver."""
    name = u'\u00C6tna, Inc.'
    self.assertEqual(name.encode('utf-8'),
                     str(OrganizationNameCleaver(name).parse()))
def test_dont_capitalize_just_anything_starting_with_mac(self):
    """Words that merely begin with 'Mac' are not Scottish surnames."""
    cleaned = OrganizationNameCleaver(
        'MACHINISTS/AEROSPACE WORKERS UNION').parse()
    self.assertEqual('Machinists/Aerospace Workers Union', str(cleaned))
def test_doesnt_bother_names_containing_string_pac(self):
    """'PAC' as a mere substring (PACIFIC) gets ordinary title-casing."""
    parsed = OrganizationNameCleaver('PACIFIC TRUST').parse()
    self.assertEqual('Pacific Trust', str(parsed))
def build_section_data(self):
    """Assemble contribution-chart data for an organization/industry page.

    Builds top-orgs JSON (industry entities only), politician-recipient and
    PAC-recipient barcharts, party and level pie breakdowns, external links,
    bundling rows, and — for a specific cycle — FEC summary data. Graphs are
    suppressed when the aggregate total is non-positive or all charts are
    empty.
    """
    entity_id, cycle, type, standardized_name, external_ids = self.entity.entity_id, self.entity.cycle, self.entity.type, self.entity.standardized_name, self.entity.external_ids
    amount = int(
        float(self.entity.metadata['entity_info']['totals']
              ['contributor_amount']))
    # Industries additionally get a chart of their top member organizations.
    if type == 'industry':
        self.top_orgs = json.dumps([{
            'key': generate_label(
                str(OrganizationNameCleaver(org['name']).parse())),
            'value': org['total_amount'],
            'value_employee': org['employee_amount'],
            'value_pac': org['direct_amount'],
            'href': barchart_href(org, cycle, 'organization')
        } for org in self.data['industry_orgs']])
    self.contributions_data = True
    pol_recipients_barchart_data = []
    for record in self.data['recipients']:
        pol_recipients_barchart_data.append({
            # Label: cleaned politician name annotated with party/state.
            'key': generate_label(
                str(
                    PoliticianNameCleaver(
                        record['name']).parse().plus_metadata(
                            record['party'], record['state']))),
            'value': record['total_amount'],
            'value_employee': record['employee_amount'],
            'value_pac': record['direct_amount'],
            'href': barchart_href(record, cycle, entity_type='politician')
        })
    self.pol_recipients_barchart_data = json.dumps(
        bar_validate(pol_recipients_barchart_data))
    pacs_barchart_data = []
    for record in self.data['recipient_pacs']:
        pacs_barchart_data.append({
            'key': generate_label(
                str(OrganizationNameCleaver(record['name']).parse())),
            'value': record['total_amount'],
            'value_employee': record['employee_amount'],
            'value_pac': record['direct_amount'],
            'href': barchart_href(record, cycle, entity_type="organization"),
        })
    self.pacs_barchart_data = json.dumps(bar_validate(pacs_barchart_data))
    # values appears to be a [count, amount] pair (a sibling section
    # documents it as such); keep only the dollar amount for each pie.
    for key, values in self.data['party_breakdown'].iteritems():
        self.data['party_breakdown'][key] = float(values[1])
    self.party_breakdown = json.dumps(
        pie_validate(self.data['party_breakdown']))
    for key, values in self.data['level_breakdown'].iteritems():
        self.data['level_breakdown'][key] = float(values[1])
    self.level_breakdown = json.dumps(
        pie_validate(self.data['level_breakdown']))
    # if none of the charts have data, or if the aggregate total
    # received was negative, then suppress that whole content
    # section except the overview bar
    if amount <= 0:
        self.suppress_contrib_graphs = True
        if amount < 0:
            self.reason = "negative"
    elif (not self.pol_recipients_barchart_data
          and not self.party_breakdown and not self.level_breakdown
          and not self.pacs_barchart_data):
        self.suppress_contrib_graphs = True
        self.reason = 'empty'
    self.external_links = external_sites.get_contribution_links(
        type, standardized_name, external_ids, cycle)
    # One row per bundling record, columns in the listed field order.
    self.bundling_data = [[
        x[key] for key in
        'recipient_entity recipient_name recipient_type lobbyist_entity lobbyist_name firm_name amount'
        .split()
    ] for x in self.data['bundling']]
    # NOTE(review): cycle == -1 is presumably the "all cycles" sentinel, so
    # FEC data is only attached for a specific cycle — confirm upstream.
    if int(cycle) != -1:
        self.fec_indexp = self.data['fec_indexp']
        if self.data['fec_summary'] and self.data['fec_summary'][
                'num_committee_filings'] > 0 and self.data[
                    'fec_summary'].get('first_filing_date'):
            self.fec_summary = self.data['fec_summary']
            self.fec_summary['clean_date'] = datetime.datetime.strptime(
                self.fec_summary['first_filing_date'], "%Y-%m-%d")
        # Only contributors of $100k or more make the top-contributors chart.
        top_contribs_data = [
            dict(key=generate_label(
                row['contributor_name']
                if row['contributor_name'] else '<Name Missing>', 27),
                 value=row['amount'],
                 href='') for row in self.data['fec_top_contribs']
            if float(row['amount']) >= 100000
        ]
        if top_contribs_data:
            self.fec_top_contribs_data = json.dumps(top_contribs_data)
    if getattr(self, 'fec_indexp', False) or getattr(
            self, 'fec_summary', False):
        self.include_fec = True
def test_parse_safe__organization(self):
    """parse(safe=True) turns a None input into an empty string."""
    parsed = OrganizationNameCleaver(None).parse(safe=True)
    self.assertEqual('', parsed)
def test_handles_empty_names(self):
    """An empty input name parses to an empty string."""
    result = OrganizationNameCleaver('').parse()
    self.assertEqual('', str(result))
def standardize_industry_name_filter(name):
    """Template filter: return the cleaned-up form of an industry name."""
    parsed = OrganizationNameCleaver(name).parse()
    return str(parsed)
def test_capitalize_pac(self):
    """A trailing standalone 'PAC' token stays fully capitalized."""
    parsed = OrganizationNameCleaver('NANCY PELOSI LEADERSHIP PAC').parse()
    self.assertEqual('Nancy Pelosi Leadership PAC', str(parsed))
def test_make_single_word_names_ending_in_pac_all_uppercase(self):
    """A single-word name ending in PAC stays fully upper-case."""
    parsed = OrganizationNameCleaver('ECEPAC').parse()
    self.assertEqual('ECEPAC', str(parsed))
def test_overrides_dumb_python_titlecasing_for_apostrophes(self):
    """The letter after an apostrophe (WOMEN'S) must stay lower-case."""
    parsed = OrganizationNameCleaver('PHOENIX WOMEN\'S HEALTH CENTER').parse()
    self.assertEqual('Phoenix Women\'s Health Center', str(parsed))
def test_names_starting_with_PAC(self):
    """A leading standalone 'PAC' token keeps its capitalization."""
    for raw, cleaned in (('PAC FOR ENGINEERS', 'PAC For Engineers'),
                         ('PAC 102', 'PAC 102')):
        self.assertEqual(cleaned, str(OrganizationNameCleaver(raw).parse()))
def test_capitalizes_letter_after_hyphen(self):
    """The letter immediately following a hyphen should be upper-cased."""
    for raw, cleaned in (('NON-PROFIT INSTITUTIONS', 'Non-Profit Institutions'),
                         ('PRO-ISRAEL', 'Pro-Israel')):
        self.assertEqual(cleaned, str(OrganizationNameCleaver(raw).parse()))
def build_section_data(self):
    """Assemble contribution-chart data for a politician's profile section.

    Builds top-contributor and top-industry barcharts, local and entity pie
    breakdowns, the percent-known figure, external links, bundling rows,
    and FEC timeline data. Graphs are suppressed when the aggregate total
    received is negative or every chart is empty.
    """
    entity_id, standardized_name, cycle, external_ids = self.entity.entity_id, self.entity.standardized_name, self.entity.cycle, self.entity.external_ids
    self.contributions_data = True
    contributors_barchart_data = []
    for record in self.data['top_contributors']:
        contributors_barchart_data.append({
            'key': generate_label(
                str(OrganizationNameCleaver(record['name']).parse())),
            'value': record['total_amount'],
            'value_employee': record['employee_amount'],
            'value_pac': record['direct_amount'],
            'href': barchart_href(record, cycle, 'organization')
        })
    contributors_barchart_data = bar_validate(contributors_barchart_data)
    self.contributors_barchart_data = json.dumps(
        contributors_barchart_data)
    industries_barchart_data = []
    for record in self.data['top_industries']:
        industries_barchart_data.append({
            'key': generate_label(
                str(OrganizationNameCleaver(record['name']).parse())),
            'href': barchart_href(record, cycle, 'industry'),
            'value': record['amount'],
        })
    industries_barchart_data = bar_validate(industries_barchart_data)
    self.industries_barchart_data = json.dumps(industries_barchart_data)
    for key, values in self.data['local_breakdown'].iteritems():
        # values is a list of [count, amount]
        self.data['local_breakdown'][key] = float(values[1])
    self.data['local_breakdown'] = pie_validate(
        self.data['local_breakdown'])
    self.local_breakdown = json.dumps(self.data['local_breakdown'])
    for key, values in self.data['entity_breakdown'].iteritems():
        # values is a list of [count, amount]
        self.data['entity_breakdown'][key] = float(values[1])
    self.data['entity_breakdown'] = pie_validate(
        self.data['entity_breakdown'])
    self.entity_breakdown = json.dumps(self.data['entity_breakdown'])
    # if none of the charts have data, or if the aggregate total
    # received was negative, then suppress that whole content
    # section except the overview bar
    amount = int(
        float(self.entity.metadata['entity_info']['totals']
              ['recipient_amount']))
    if amount < 0:
        self.suppress_contrib_graphs = True
        self.reason = "negative"
    elif not any(
        (industries_barchart_data, contributors_barchart_data,
         self.data['local_breakdown'], self.data['entity_breakdown'])):
        self.suppress_contrib_graphs = True
        self.reason = 'empty'
    # Share of the total attributable to a known industry, as a percentage.
    pct_unknown = 0
    if amount:
        pct_unknown = float(self.data['industries_unknown_amount'].get(
            'amount', 0)) * 100 / amount
    self.pct_known = int(round(100 - pct_unknown))
    self.external_links = external_sites.get_contribution_links(
        'politician', standardized_name.name_str(), external_ids, cycle)
    if self.partytime_link:
        self.external_links.append({
            'url': self.partytime_link,
            'text': 'Party Time'
        })
    # One row per bundling record, columns in the listed field order.
    self.bundling_data = [[
        x[key] for key in
        'lobbyist_entity lobbyist_name firm_entity firm_name amount'.split(
        )
    ] for x in self.data['bundling']]
    if self.fec_summary:
        self.include_fec = True
    if self.fec_summary and 'date' in self.fec_summary:
        self.fec_summary['clean_date'] = datetime.datetime.strptime(
            self.fec_summary['date'], "%Y-%m-%d")
    timelines = []
    for pol in self.data['fec_timeline']:
        tl = {
            'name': pol['candidate_name'],
            'party': pol['party'],
            'is_this': pol['entity_id'] == entity_id,
            # Clamp negative timeline values to zero for display.
            'timeline': map(lambda item: item if item >= 0 else 0,
                            pol['timeline']),
            'href': '/politician/%s/%s?cycle=%s' % (slugify(
                PoliticianNameCleaver(
                    pol['candidate_name']).parse().name_str()),
                                                    pol['entity_id'], cycle)
        }
        tl['sum'] = sum(tl['timeline'])
        timelines.append(tl)
    # This politician first, then descending by total raised.
    timelines.sort(key=lambda t: (int(t['is_this']), t['sum']),
                   reverse=True)
    # restrict to top 5, and only those receiving at least 10% of this pol's total
    if timelines:
        this_sum = timelines[0]['sum']
        timelines = [
            timeline for timeline in timelines
            if timeline['sum'] > 0.1 * this_sum
        ]
        timelines = timelines[:5]
    self.fec_timelines = json.dumps(timelines)