def get_organizations(self):
    """Yield the Arizona Secretary of State office.

    The organization is also stored on ``self._secretary_of_state`` before
    it is yielded.
    """
    office = Organization(
        name="Office of the Secretary of State, State of Arizona",
        classification="office",
    )

    # Phone first, then mailing address -- same order as the individual calls.
    contact_details = (
        ("voice", "602-542-4285"),
        ("address", "1700 W Washington St Fl 7, Phoenix AZ 85007-2808"),
    )
    for detail_type, detail_value in contact_details:
        office.add_contact_detail(type=detail_type, value=detail_value)

    office.add_link(url="http://www.azsos.gov/", note="Home page")

    self._secretary_of_state = office
    yield office
def test_full_organization():
    """Import a fully populated scraped organization and verify every field."""
    scraped = ScrapeOrganization('United Nations', classification='international')
    scraped.add_identifier('un')
    scraped.add_name('UN', start_date='1945')
    scraped.add_contact_detail(type='phone', value='555-555-1234', note='this is fake')
    scraped.add_link('http://example.com/link')
    scraped.add_source('http://example.com/source')

    # import the serialized organization
    serialized = scraped.as_dict()
    OrganizationImporter('jurisdiction-id').import_data([serialized])

    # get the organization from the db and assert it imported correctly
    stored = Organization.objects.get()
    assert 'ocd-organization' in stored.id
    assert stored.name == scraped.name

    identifier = stored.identifiers.all()[0]
    assert identifier.identifier == 'un'
    assert identifier.scheme == ''

    other_name = stored.other_names.all()[0]
    assert other_name.name == 'UN'
    assert other_name.start_date == '1945'

    contact = stored.contact_details.all()[0]
    assert contact.type == 'phone'
    assert contact.value == '555-555-1234'
    assert contact.note == 'this is fake'

    assert stored.links.all()[0].url == 'http://example.com/link'
    assert stored.sources.all()[0].url == 'http://example.com/source'
def get_organizations(self):
    """Yield the Ferguson City Council with a Mayor post and one post per ward."""
    council = Organization(name="Ferguson City Council", classification="legislature")
    council.add_contact_detail(type='email', value='*****@*****.**')

    ward_count = 3
    post_names = ["Mayor"]
    post_names.extend(
        "Council Member Ward {}".format(number)
        for number in range(1, ward_count + 1)
    )

    for post_name in post_names:
        # Label and role are always the same string.  ``num_seats`` is not
        # passed (a ``num_seats=2`` argument is commented out in the source).
        council.add_post(
            label=post_name,
            role=post_name,
            division_id=self.division_id,
        )

    yield council
def test_full_organization():
    """Import a fully populated scraped organization and verify every field."""
    create_jurisdictions()

    scraped = ScrapeOrganization('United Nations', classification='international')
    scraped.add_identifier('un')
    scraped.add_name('UN', start_date='1945')
    scraped.add_contact_detail(type='phone', value='555-555-1234', note='this is fake')
    scraped.add_link('http://example.com/link')
    scraped.add_source('http://example.com/source')

    # import the serialized organization
    importer = OrganizationImporter('jid1')
    importer.import_data([scraped.as_dict()])

    # get the organization from the db and assert it imported correctly
    stored = Organization.objects.get()
    assert 'ocd-organization' in stored.id
    assert stored.name == scraped.name

    first_identifier = stored.identifiers.all()[0]
    assert first_identifier.identifier == 'un'
    assert first_identifier.scheme == ''

    first_name = stored.other_names.all()[0]
    assert first_name.name == 'UN'
    assert first_name.start_date == '1945'

    first_contact = stored.contact_details.all()[0]
    assert first_contact.type == 'phone'
    assert first_contact.value == '555-555-1234'
    assert first_contact.note == 'this is fake'

    assert stored.links.all()[0].url == 'http://example.com/link'
    assert stored.sources.all()[0].url == 'http://example.com/source'
def categorize_data(self, csv_data):
    """Turn contribution CSV text into (donor, recipient, event) triples.

    Each non-empty row of ``csv_data`` is mapped onto a namedtuple whose
    fields come from ``self.csv_header_row`` (spaces replaced by
    underscores).  The donor is an ``Organization`` or a ``Person``
    depending on ``Contributor_Type``; named 'Unknown/Anonymous' donors are
    treated as organizations.

    Args:
        csv_data: contribution rows as comma-separated text.

    Yields:
        ``(donor, recipient, transaction)`` for every row with an
        identifiable donor, and an empty list for every other row.

    Raises:
        TypeError: if a row's column count does not match the header.
    """
    import csv
    import io

    Contribution = namedtuple('Contribution',
                              self.csv_header_row.replace(' ', '_'))

    # Parse with the csv module (explicit ',' delimiter) so that quoted
    # fields containing commas -- typically addresses -- and CRLF line
    # endings are handled; a plain str.split(',') miscounts such rows.
    rows = csv.reader(io.StringIO(csv_data, newline=''), delimiter=',')
    for row in rows:
        if not row:
            continue

        contribution = Contribution(*row)

        # cur_obj will be the person or organization that made the
        # contribution
        cur_obj = None
        if contribution.Contributor_Type in self.business_contribution_types:
            cur_obj = Organization(contribution.Contributor_Name)
        elif contribution.Contributor_Type in self.individual_contribution_types:
            cur_obj = Person(contribution.Contributor_Name)
        elif contribution.Contributor_Type == 'Unknown/Anonymous':
            # ignoring un-named contributors; the named ones look like
            # catch-all business contributions
            if contribution.Contributor_Name:
                cur_obj = Organization(contribution.Contributor_Name)

        if not cur_obj:
            # No donor could be built (unrecognized type, or an anonymous
            # contribution without a name).
            yield []
            continue

        cur_obj.add_source(url=self.search_url)
        cur_obj.source_identified = True
        if contribution.Contributor_Address:
            cur_obj.add_contact_detail(type='address',
                                       value=contribution.Contributor_Address)
        if contribution.Employer_Name:
            cur_obj.extras['Employer'] = contribution.Employer_Name
        if contribution.Employer_Occupation:
            cur_obj.extras['Occupation'] = contribution.Employer_Occupation

        # recipiant_obj is the organization that received the contribution
        recipiant_obj = Organization(contribution.Receiving_Committee)
        recipiant_obj.extras['Office'] = contribution.Office
        recipiant_obj.extras['Filing Period'] = contribution.Filing_Period
        recipiant_obj.extras['Fundtype'] = contribution.Fundtype

        # transaction is the event linking the donor and recipient
        # (EST and Maryland b/c MD)
        transaction = Event('Contribution', contribution.Contribution_Date,
                            'EST', 'Maryland')
        transaction.extras['Contribution Amount'] = contribution.Contribution_Amount
        transaction.extras['Contribution Type'] = contribution.Contribution_Type
        transaction.add_source(url=self.search_url)
        # transaction.source_identified = True
        transaction.participants.append(cur_obj.as_dict())
        transaction.participants.append(recipiant_obj.as_dict())

        yield (cur_obj, recipiant_obj, transaction)
def get_organizations(self):
    """Yield the California Secretary of State office.

    The organization is also stored on ``self._secretary_of_state`` before
    it is yielded.
    """
    sos_office = Organization(
        name="Office of the Secretary of State, State of California",
        classification="office",
    )

    # Phone first, then mailing address.
    for kind, value in (
        ("voice", "916-653-6814"),
        ("address", "1500 11th Street, Sacramento, CA 95814"),
    ):
        sos_office.add_contact_detail(type=kind, value=value)

    sos_office.add_link(url="http://www.sos.ca.gov", note="Home page")

    self._secretary_of_state = sos_office
    yield sos_office
def scrape_committees(self, chamber):
    """Scrape the committee listing page for ``chamber``.

    Every table row after the first becomes one committee ``Organization``
    carrying an email and a voice contact detail, the page URL as its
    source, and the members found in the row's second cell.

    Args:
        chamber: key into ``_CHAMBERS``; also passed to ``Organization``.

    Yields:
        One ``Organization`` per committee row.
    """
    url = _COMMITTEE_URL % _CHAMBERS[chamber]
    page = self.get(url).text
    html = lxml.html.fromstring(page)
    table = html.xpath(
        'body/section[2]/div/div/div/section[2]/div[2]/div/div/div/div'
    )[1:]

    for row in table:
        # committee name, description, hours of operation,
        # secretary and office_phone
        attributes = []
        for value in row[0].xpath('div')[0].itertext():
            if 'Email:' in value or 'Phone:' in value or value == '\n':
                continue
            cleaned = value.replace(u'\xa0', ' ').replace('Secretary:', '')
            # Drop non-ASCII characters but stay in text: keeping the
            # encoded bytes made str() produce a "b'...'" repr and broke
            # the later .decode() on entries rewritten by the Room rule.
            cleaned = cleaned.encode('ascii', 'ignore').decode('ascii')
            if 'Room' in cleaned:
                # keep only what precedes the room number
                cleaned = cleaned.split('Room')[0].replace(', ', ' ')
            attributes.append(cleaned)

        org = Organization(chamber=chamber, classification="committee",
                           name=str(attributes[0]))

        # Entries are located by position; rows with more than five
        # entries have the email and phone shifted by one.
        email_index = 4 if len(attributes) > 5 else 3
        org.add_contact_detail(type='email',
                               value=str(attributes[email_index]),
                               note='District Office')
        org.add_contact_detail(type='voice',
                               value=str(attributes[email_index + 1]),
                               note='District Office')
        org.add_source(url)

        # membership
        for td in row[1].xpath('div'):
            members = [value for value in td.itertext()
                       if value not in (' ', '\n', ',')]
            role = "member"
            for member in members:
                if member in ('Chair', 'Vice Chair'):
                    # a title applies to the next name only
                    role = member.lower()
                    continue
                org.add_member(member.strip(), role=role)
                role = "member"

        yield org
def categorize_data(self, csv_data):
    """Turn contribution CSV text into (donor, recipient, event) triples.

    Each non-empty row of ``csv_data`` is mapped onto a namedtuple whose
    fields come from ``self.csv_header_row`` (spaces replaced by
    underscores).  The donor is an ``Organization`` or a ``Person``
    depending on ``Contributor_Type``; named 'Unknown/Anonymous' donors are
    treated as organizations.

    Args:
        csv_data: contribution rows as comma-separated text.

    Yields:
        ``(donor, recipient, transaction)`` for every row with an
        identifiable donor, and an empty list for every other row.

    Raises:
        TypeError: if a row's column count does not match the header.
    """
    import csv
    import io

    # Is there a better place to define this?
    Contribution = namedtuple('Contribution',
                              self.csv_header_row.replace(' ', '_'))

    # Parse with the csv module (explicit ',' delimiter) so that quoted
    # fields containing commas and CRLF line endings are handled.
    rows = csv.reader(io.StringIO(csv_data, newline=''), delimiter=',')
    for row in rows:
        if not row:
            continue

        # A row that does not fit the header raises TypeError here.  The
        # previous blanket ``except`` dropped into an interactive debugger,
        # which hangs unattended runs and leaves ``contribution`` unset.
        contribution = Contribution(*row)

        cur_obj = None
        if contribution.Contributor_Type in self.business_contribution_types:
            cur_obj = Organization(contribution.Contributor_Name)
        elif contribution.Contributor_Type in self.individual_contribution_types:
            cur_obj = Person(contribution.Contributor_Name)
        elif contribution.Contributor_Type == 'Unknown/Anonymous':
            # ignoring un-named contributors; the named ones look like
            # catch-all business contributions
            if contribution.Contributor_Name:
                cur_obj = Organization(contribution.Contributor_Name)

        if not cur_obj:
            # no donor could be built for this row
            yield []
            continue

        cur_obj.add_source(url=self.search_url)
        cur_obj.source_identified = True
        if contribution.Contributor_Address:
            cur_obj.add_contact_detail(type='address',
                                       value=contribution.Contributor_Address)
        if contribution.Employer_Name:
            cur_obj.extras['Employer'] = contribution.Employer_Name
        if contribution.Employer_Occupation:
            cur_obj.extras['Occupation'] = contribution.Employer_Occupation

        # the organization that received the contribution
        recipiant_obj = Organization(contribution.Receiving_Committee)
        recipiant_obj.extras['Office'] = contribution.Office
        recipiant_obj.extras['Filing Period'] = contribution.Filing_Period
        recipiant_obj.extras['Fundtype'] = contribution.Fundtype

        # EST and Maryland b/c MD
        transaction = Event('Contribution', contribution.Contribution_Date,
                            'EST', 'Maryland')
        transaction.extras['Contribution Amount'] = contribution.Contribution_Amount
        transaction.extras['Contribution Type'] = contribution.Contribution_Type
        transaction.add_source(url=self.search_url)
        # transaction.source_identified = True
        transaction.participants.append(cur_obj.as_dict())
        transaction.participants.append(recipiant_obj.as_dict())

        yield (cur_obj, recipiant_obj, transaction)
def get_organizations(self):
    """Yield the Ferguson City Council with a Mayor post and one post per ward."""
    city_council = Organization(name="Ferguson City Council",
                                classification="legislature")
    city_council.add_contact_detail(type='email', value='*****@*****.**')

    total_wards = 3
    ward_titles = [
        "Council Member Ward {}".format(ward_number)
        for ward_number in range(1, total_wards + 1)
    ]

    # The mayor's post is registered first, followed by the wards in order.
    # Each post uses the same text for label and role; ``num_seats`` is not
    # passed (a ``num_seats=2`` argument is commented out in the source).
    for title in ["Mayor"] + ward_titles:
        city_council.add_post(label=title, role=title,
                              division_id=self.division_id)

    yield city_council
def scrape_committees(self, repos):
    """Build committee organizations from congress-legislators YAML files.

    Args:
        repos: iterable of file names appended to the
            ``unitedstates/congress-legislators`` raw GitHub URL.

    Yields:
        For each committee: every subcommittee ``Organization`` (with
        ``parent_id`` set to the parent's ``_id``), then the committee
        ``Organization`` itself.
    """
    for repo in repos:
        source = "https://raw.githubusercontent.com/unitedstates/congress-legislators/master/{0}".format(repo)
        committees = self.fetch_yaml(source)
        for committee in committees:
            org = Organization(committee["name"], classification="committee")
            org.add_source(source)

            # Fixed tuples instead of set intersections keep the order of
            # links, contact details and identifiers stable between runs.
            for key in ("url", "rss_url"):
                if key in committee:
                    org.add_link(committee[key])

            for key in ("phone", "address"):
                if key in committee:
                    # phone numbers are stored under the "voice" type
                    detail_type = "voice" if key == "phone" else key
                    org.add_contact_detail(type=detail_type, value=committee[key])

            for key in ("senate_committee_id", "house_committee_id", "thomas_id"):
                if key in committee:
                    org.add_identifier(committee[key], scheme=key)

            if "subcommittees" in committee:
                for subcommittee in committee["subcommittees"]:
                    sub_org = Organization(subcommittee["name"],
                                           classification="committee",
                                           parent_id=org._id)
                    sub_org.add_identifier(subcommittee["thomas_id"], scheme="thomas")
                    sub_org.add_source(source)

                    for key in ("phone", "address"):
                        if key in subcommittee:
                            detail_type = "voice" if key == "phone" else key
                            # Read the subcommittee's own value; looking it
                            # up on the parent gave the wrong data or a
                            # KeyError when the parent lacked the key.
                            sub_org.add_contact_detail(type=detail_type,
                                                       value=subcommittee[key])

                    yield sub_org

            yield org
def get_organizations(self):
    """Yield the Virginia Secretary of the Commonwealth office.

    The organization is also stored on
    ``self._secretary_of_the_commonwealth`` before it is yielded.
    """
    office_name = "Office of the Secretary of the Commonwealth, Commonwealth of Virginia"
    phone = "804-786-2441"
    street_address = "1111 East Broad Street, 4th Floor, Richmond, Virginia 23219"

    office = Organization(name=office_name, classification="office")
    office.add_contact_detail(type="voice", value=phone)
    office.add_contact_detail(type="address", value=street_address)
    office.add_link(url="https://commonwealth.virginia.gov/", note="Home page")

    self._secretary_of_the_commonwealth = office
    yield office
def get_organizations(self):
    """Yield the California Secretary of State office.

    The organization is also stored on ``self._secretary_of_state`` before
    it is yielded.
    """
    office_name = "Office of the Secretary of State, State of California"
    phone = "916-653-6814"
    street_address = "1500 11th Street, Sacramento, CA 95814"

    office = Organization(name=office_name, classification="office")
    office.add_contact_detail(type="voice", value=phone)
    office.add_contact_detail(type="address", value=street_address)
    office.add_link(url="http://www.sos.ca.gov", note="Home page")

    self._secretary_of_state = office
    yield office
def scrape_committees(self, chamber):
    """Scrape the committee listing page for ``chamber``.

    Every table row after the first becomes one committee ``Organization``
    carrying an email and a voice contact detail, the page URL as its
    source, and the members found in the row's second cell.

    Args:
        chamber: key into ``_CHAMBERS``; also passed to ``Organization``.

    Yields:
        One ``Organization`` per committee row.
    """
    url = _COMMITTEE_URL % _CHAMBERS[chamber]
    # NOTE(review): verify=False disables TLS certificate verification for
    # this request -- confirm it is still required for the target site.
    page = self.get(url, verify=False).text
    html = lxml.html.fromstring(page)
    table = html.xpath('body/section[2]/div/div/div/section[2]/div[2]/div/div/div/div')[1:]

    for row in table:
        # committee name, description, hours of operation,
        # secretary and office_phone
        attributes = []
        for value in row[0].xpath('div')[0].itertext():
            if 'Email:' in value or 'Phone:' in value or value == '\n':
                continue
            cleaned = value.replace(u'\xa0', ' ').replace('Secretary:', '')
            # Drop non-ASCII characters but stay in text: keeping the
            # encoded bytes made str() produce a "b'...'" repr and broke
            # the later .decode() on entries rewritten by the Room rule.
            cleaned = cleaned.encode('ascii', 'ignore').decode('ascii')
            if 'Room' in cleaned:
                # keep only what precedes the room number
                cleaned = cleaned.split('Room')[0].replace(', ', ' ')
            attributes.append(cleaned)

        org = Organization(chamber=chamber, classification="committee",
                           name=str(attributes[0]))

        # Entries are located by position; rows with more than five
        # entries have the email and phone shifted by one.
        if len(attributes) > 5:
            email, phone = attributes[4], attributes[5]
        else:
            email, phone = attributes[3], attributes[4]
        org.add_contact_detail(type='email', value=str(email),
                               note='District Office')
        org.add_contact_detail(type='voice', value=str(phone),
                               note='District Office')
        org.add_source(url)

        # membership
        for td in row[1].xpath('div'):
            members = [value for value in td.itertext()
                       if value not in (' ', '\n', ',')]
            role = "member"
            for member in members:
                if member in ('Chair', 'Vice Chair'):
                    # a title applies to the next name only
                    role = member.lower()
                    continue
                org.add_member(member.strip(), role=role)
                role = "member"

        yield org
def scrape_committees(self, repos):
    """Build committee organizations from congress-legislators YAML files.

    Args:
        repos: iterable of file names appended to the
            ``unitedstates/congress-legislators`` raw GitHub URL.

    Yields:
        For each committee: every subcommittee ``Organization`` (with
        ``parent_id`` set to the parent's ``_id``), then the committee
        ``Organization`` itself.
    """
    base_url = "https://raw.githubusercontent.com/unitedstates/congress-legislators/master/{0}"
    link_keys = ('url', 'rss_url')
    contact_keys = ('phone', 'address')
    identifier_keys = ('senate_committee_id', 'house_committee_id', 'thomas_id')

    for repo in repos:
        source = base_url.format(repo)
        committees = self.fetch_yaml(source)
        for committee in committees:
            org = Organization(committee['name'], classification='committee')
            org.add_source(source)

            # Fixed key tuples (rather than set intersections) keep the
            # order of the emitted details stable between runs.
            for key in link_keys:
                if key in committee:
                    org.add_link(committee[key])

            for key in contact_keys:
                if key in committee:
                    # phone numbers are stored under the 'voice' type
                    org.add_contact_detail(
                        type='voice' if key == 'phone' else key,
                        value=committee[key])

            for key in identifier_keys:
                if key in committee:
                    org.add_identifier(committee[key], scheme=key)

            for subcommittee in committee.get('subcommittees', []) if 'subcommittees' in committee else []:
                sub_org = Organization(subcommittee['name'],
                                       classification="committee",
                                       parent_id=org._id)
                sub_org.add_identifier(subcommittee['thomas_id'], scheme="thomas")
                sub_org.add_source(source)

                for key in contact_keys:
                    if key in subcommittee:
                        # Use the subcommittee's own value; reading it from
                        # the parent committee gave the wrong data or a
                        # KeyError when the parent lacked the key.
                        sub_org.add_contact_detail(
                            type='voice' if key == 'phone' else key,
                            value=subcommittee[key])

                yield sub_org

            yield org
def get_organizations(self):
    """Yield Congress, both chambers, and the two lobbying-disclosure offices.

    Each organization is stored on ``self`` (``_legislature``, ``_senate``,
    ``_house``, ``_sopr``, ``_house_clerk``) before it is yielded.  The
    chambers are children of the legislature and each office is a child of
    its chamber.
    """
    congress = Organization("United States Congress",
                            classification='legislature')
    self._legislature = congress
    yield congress

    upper_chamber = Organization(
        name="United States Senate",
        classification='upper',
        parent_id=congress._id,
    )
    self._senate = upper_chamber
    yield upper_chamber

    lower_chamber = Organization(
        name="United States House",
        classification='lower',
        parent_id=congress._id,
    )
    self._house = lower_chamber
    yield lower_chamber

    # Senate Office of Public Record
    public_record = Organization(
        name="Office of Public Record, US Senate",
        classification="office",
        parent_id=upper_chamber._id,
    )
    public_record.add_contact_detail(type="voice", value="202-224-0322")
    for source_url, source_note in (
        ("http://www.senate.gov/pagelayout/legislative/"
         "one_item_and_teasers/opr.htm", "Profile page"),
        ("http://www.senate.gov/pagelayout/legislative/"
         "g_three_sections_with_teasers/lobbyingdisc.htm"
         "#lobbyingdisc=lda", "Disclosure Home"),
    ):
        public_record.add_source(url=source_url, note=source_note)
    for link_url, link_note in (
        ("http://soprweb.senate.gov/index.cfm"
         "?event=selectfields", "Disclosure Search Portal"),
        ("http://soprweb.senate.gov/", "Disclosure Electronic Filing System"),
    ):
        public_record.add_link(url=link_url, note=link_note)
    self._sopr = public_record
    yield public_record

    # Office of the Clerk of the House
    clerk = Organization(
        name="Office of the Clerk, US House",
        classification="office",
        parent_id=lower_chamber._id,
    )
    clerk.add_contact_detail(type="voice", value="202-225-7000")
    clerk.add_source(url="http://clerk.house.gov/", note="Home page")
    self._house_clerk = clerk
    yield clerk

    # NOTE(review): the legislature was already yielded first; this second
    # yield of the same object looks redundant -- confirm before removing.
    yield congress
def scrape_committees(self, chamber):
    """Scrape the committee listing page for ``chamber``.

    Every table row after the first becomes one committee ``Organization``
    carrying an email and a voice contact detail, the page URL as its
    source, and the members found in the row's second and third cells.

    Args:
        chamber: key into ``_CHAMBERS``; also passed to ``Organization``.

    Yields:
        One ``Organization`` per committee row.
    """
    url = _COMMITTEE_URL % _CHAMBERS[chamber]
    page = self.get(url).text
    html = lxml.html.fromstring(page)
    table = html.xpath(
        "body/section[2]/div/div/section[2]/div[2]/div/div/div/div")

    for row in table[1:]:
        # committee name, description, hours of operation,
        # secretary and office_phone
        attributes = []
        for value in row[0].xpath("div")[0].itertext():
            if "Email:" in value or "Phone:" in value or value == "\n":
                continue
            cleaned = value.replace(u"\xa0", " ").replace("Secretary:", "")
            # Drop non-ASCII characters but stay in text: keeping the
            # encoded bytes made str() produce a "b'...'" repr and broke
            # the later .decode() on entries rewritten by the Room rule.
            cleaned = cleaned.encode("ascii", "ignore").decode("ascii")
            if "Room" in cleaned:
                # keep only what precedes the room number
                cleaned = cleaned.split("Room")[0].replace(", ", " ")
            attributes.append(cleaned)

        org = Organization(
            chamber=chamber,
            classification="committee",
            name=str(attributes[0]),
        )

        # Entries are located by position; rows with more than five
        # entries have the email and phone shifted by one.
        email_index = 4 if len(attributes) > 5 else 3
        org.add_contact_detail(
            type="email",
            value=str(attributes[email_index]),
            note="District Office",
        )
        org.add_contact_detail(
            type="voice",
            value=str(attributes[email_index + 1]),
            note="District Office",
        )
        org.add_source(url)

        # membership: text of the second and third cells, taken together
        td_text = []
        for td in row[1].xpath("div") + row[2].xpath("div"):
            td_text.extend(td.itertext())
        members = [value for value in td_text
                   if value not in (" ", "\n", ",")]

        role = "member"
        for member in members:
            if member in ("Chair", "Vice Chair"):
                # a title applies to the next non-blank name
                role = member.lower()
                continue
            name = member.strip()
            if name:
                org.add_member(name, role=role)
                role = "member"

        yield org
def transform_parse(self, parsed_form, response):
    """Convert one parsed LD-1 registration form into scrape objects.

    Args:
        parsed_form: nested dict for one filing.  The sections read here
            are ``datetimes``, ``_meta``, ``registrant``, ``client``,
            ``registration_type``, ``signature``, ``foreign_entities``,
            ``lobbyists``, ``affiliated_organizations``,
            ``lobbying_issues`` and ``lobbying_issues_detail``.
        response: the fetched filing; only ``response.url`` is used, as
            the source and document URL.

    Yields:
        In order: the registrant, the registrant's self-employment
        organization (self-employed registrants only), the client, the
        main contact, each affiliated organization, each foreign entity,
        each lobbyist, the registration event and finally the disclosure.
    """

    def _form_datetime(key):
        # Form timestamps are 'YYYY-MM-DD HH:MM:SS' strings, tagged as UTC.
        return datetime.strptime(
            parsed_form['datetimes'][key],
            '%Y-%m-%d %H:%M:%S').replace(tzinfo=UTC)

    def _joined(values):
        # Join the non-empty values with '; '.
        return '; '.join([p for p in values if len(p) > 0]).strip()

    def _structured(record, prefix, address_notes, ppb_notes):
        # Build the "contact_details_structured" extras list: one entry for
        # the contact address and one for the principal place of business,
        # each part reading ``record[prefix + note]``.
        def _parts(part_prefix, notes):
            return [{"note": note, "value": record[part_prefix + note]}
                    for note in notes]
        return [
            {
                "type": "address",
                "note": "contact address",
                "parts": _parts(prefix, address_notes),
            },
            {
                "type": "address",
                "note": "principal place of business",
                "parts": _parts(prefix + 'ppb_', ppb_notes),
            },
        ]

    _source = {
        "url": response.url,
        "note": "LDA Form LD-1"
    }
    registrant_form = parsed_form['registrant']
    client_form = parsed_form['client']

    # basic disclosure fields
    _disclosure = Disclosure(
        effective_date=_form_datetime('effective_date'),
        timezone='America/New_York',
        submitted_date=_form_datetime('signature_date'),
        classification="lobbying"
    )
    start_date = _disclosure.effective_date.strftime('%Y-%m-%d')

    _disclosure.add_authority(name=self.authority.name,
                              type=self.authority._type,
                              id=self.authority._id)

    _disclosure.add_identifier(
        identifier=parsed_form['_meta']['document_id'],
        scheme="urn:sopr:filing"
    )

    # disclosure extras
    _disclosure.extras = {}
    _disclosure.extras['registrant'] = {
        'self_employed_individual': registrant_form['self_employed_individual'],
        'general_description': registrant_form['registrant_general_description'],
        'signature': {
            "signature_date": parsed_form['datetimes']['signature_date'],
            "signature": parsed_form['signature']
        }
    }
    _disclosure.extras['client'] = {
        'same_as_registrant': client_form['client_self'],
        'general_description': client_form['client_general_description']
    }
    _disclosure.extras['registration_type'] = {
        'is_amendment': parsed_form['registration_type']['is_amendment'],
        'new_registrant': parsed_form['registration_type']['new_registrant'],
        'new_client_for_existing_registrant':
            parsed_form['registration_type'][
                'new_client_for_existing_registrant'],
    }

    # # Registrant
    # build registrant
    _registrant_self_employment = None

    if registrant_form['self_employed_individual']:
        n = ' '.join([p for p in [
            registrant_form['registrant_individual_prefix'],
            registrant_form['registrant_individual_firstname'],
            registrant_form['registrant_individual_lastname']
        ] if len(p) > 0]).strip()

        _registrant = Person(
            name=n,
            source_identified=True
        )

        # A self-employed individual gets a companion organization that
        # memberships (main contact, lobbyists) are attached to below.
        _registrant_self_employment = Organization(
            name='SELF-EMPLOYMENT of {n}'.format(n=n),
            classification='company',
            source_identified=True
        )

        _registrant.add_membership(
            organization=_registrant_self_employment,
            role='self_employed',
            label='self-employment of {n}'.format(n=n),
            start_date=start_date
        )
    else:
        _registrant = Organization(
            name=registrant_form['registrant_org_name'],
            classification='company',
            source_identified=True
        )

    if len(registrant_form['registrant_house_id']) > 0:
        _registrant.add_identifier(
            identifier=registrant_form['registrant_house_id'],
            scheme='urn:house_clerk:registrant'
        )

    if len(registrant_form['registrant_senate_id']) > 0:
        _registrant.add_identifier(
            identifier=registrant_form['registrant_senate_id'],
            scheme='urn:sopr:registrant'
        )

    registrant_contact_details = [
        {
            "type": "address",
            "note": "contact address",
            "value": _joined([
                registrant_form['registrant_address_one'],
                registrant_form['registrant_address_two'],
                registrant_form['registrant_city'],
                registrant_form['registrant_state'],
                registrant_form['registrant_zip'],
                registrant_form['registrant_country']]),
        },
        {
            "type": "voice",
            "note": "contact phone",
            "value": registrant_form['registrant_contact_phone'],
        },
        {
            "type": "email",
            "note": "contact email",
            "value": registrant_form['registrant_contact_email'],
        },
    ]

    registrant_contact_ppb = {
        "type": "address",
        "note": "principal place of business",
        "value": _joined([
            registrant_form['registrant_ppb_city'],
            registrant_form['registrant_ppb_state'],
            registrant_form['registrant_ppb_zip'],
            registrant_form['registrant_ppb_country']]),
    }

    # the principal place of business is only recorded when present
    if registrant_contact_ppb["value"]:
        registrant_contact_details.append(registrant_contact_ppb)

    for cd in registrant_contact_details:
        _registrant.add_contact_detail(**cd)

    _registrant.extras = {
        "contact_details_structured": _structured(
            registrant_form, 'registrant_',
            ('address_one', 'address_two', 'city', 'state', 'zip', 'country'),
            ('city', 'state', 'zip', 'country')),
    }

    # # People
    # build contact
    _main_contact = Person(
        name=registrant_form['registrant_contact_name'],
        source_identified=True
    )

    main_contact_contact_details = [
        {
            "type": "voice",
            "note": "contact phone",
            "value": registrant_form['registrant_contact_phone'],
        },
        {
            "type": "email",
            "note": "contact email",
            "value": registrant_form['registrant_contact_email'],
        }
    ]

    for cd in main_contact_contact_details:
        _main_contact.add_contact_detail(**cd)

    # Memberships hang off the registrant organization, or off the
    # self-employment organization when the registrant is a person.
    if _registrant._type == 'organization':
        member_holder = _registrant
    else:
        member_holder = _registrant_self_employment

    member_holder.add_member(
        name_or_person=_main_contact,
        role='main_contact',
        label='main contact for {n}'.format(n=_registrant.name),
        start_date=start_date
    )

    # # Client
    # build client
    _client = Organization(
        name=client_form['client_name'],
        classification='company',
        source_identified=True
    )

    client_contact_details = [
        {
            "type": "address",
            "note": "contact address",
            "value": _joined([
                client_form['client_address'],
                client_form['client_city'],
                client_form['client_state'],
                client_form['client_zip'],
                client_form['client_country']]),
        },
    ]

    client_contact_ppb = {
        "type": "address",
        "note": "principal place of business",
        "value": _joined([
            client_form['client_ppb_city'],
            client_form['client_ppb_state'],
            client_form['client_ppb_zip'],
            client_form['client_ppb_country']]),
    }

    if client_contact_ppb["value"]:
        client_contact_details.append(client_contact_ppb)

    for cd in client_contact_details:
        _client.add_contact_detail(**cd)

    _client.extras = {
        "contact_details_structured": _structured(
            client_form, 'client_',
            ('address', 'city', 'state', 'zip', 'country'),
            ('city', 'state', 'zip', 'country')),
    }

    # Collect Foreign Entities
    _foreign_entities = []
    _foreign_entities_by_name = {}
    for fe in parsed_form['foreign_entities']:
        fe_extras = {}
        fe_name = fe['foreign_entity_name']

        # check for name-based duplicates
        if fe_name in _foreign_entities_by_name:
            _foreign_entity = _foreign_entities_by_name[fe_name]
        else:
            _foreign_entity = Organization(
                name=fe_name,
                classification='company',
                source_identified=True
            )

        # collect contact details
        # NOTE(review): this list already holds a state/country-only
        # "principal place of business" entry and a second one (with city)
        # is appended below -- looks like a leftover duplicate; confirm.
        foreign_entity_contact_details = [
            {
                "type": "address",
                "note": "contact address",
                "value": _joined([
                    fe['foreign_entity_address'],
                    fe['foreign_entity_city'],
                    fe['foreign_entity_state'],
                    fe['foreign_entity_country']]),
            },
            {
                "type": "address",
                "note": "principal place of business",
                "value": _joined([
                    fe['foreign_entity_ppb_state'],
                    fe['foreign_entity_ppb_country']]),
            },
        ]

        foreign_entity_contact_ppb = {
            "type": "address",
            "note": "principal place of business",
            # unlike the other joins, this one is not stripped
            "value": '; '.join([
                p for p in [
                    fe['foreign_entity_ppb_city'],
                    fe['foreign_entity_ppb_state'],
                    fe['foreign_entity_ppb_country']]
                if len(p) > 0]),
        }

        if foreign_entity_contact_ppb["value"]:
            foreign_entity_contact_details.append(
                foreign_entity_contact_ppb)

        # add contact details (empty values are skipped)
        for cd in foreign_entity_contact_details:
            if cd['value'] != '':
                _foreign_entity.add_contact_detail(**cd)

        # add extras
        fe_extras["contact_details_structured"] = _structured(
            fe, 'foreign_entity_',
            ('address', 'city', 'state', 'country'),
            ('state', 'country'))

        _foreign_entity.extras = combine_dicts(_foreign_entity.extras,
                                               fe_extras)

        _foreign_entities_by_name[fe_name] = _foreign_entity

    for unique_foreign_entity in _foreign_entities_by_name.values():
        _foreign_entities.append(unique_foreign_entity)

        # TODO: add a variant on memberships to represent inter-org
        # relationships (associations, ownership, etc)
        #
        # _client['memberships'].append({
        #     "id": _foreign_entity['id'],
        #     "classification": "organization",
        #     "name": _foreign_entity['name'],
        #     "extras": {
        #         "ownership_percentage":
        #             fe['foreign_entity_amount']
        #     }
        # })

    # Collect Lobbyists
    # TODO: deal with wierd non-name line continuation cases (blanks, "continued")
    _lobbyists_by_name = {}
    for l in parsed_form['lobbyists']:
        l_extras = {}
        l_name = ' '.join([l['lobbyist_first_name'],
                           l['lobbyist_last_name'],
                           l['lobbyist_suffix']
                           ]).strip()

        # check for name-based duplicates
        if l_name in _lobbyists_by_name:
            _lobbyist = _lobbyists_by_name[l_name]
        else:
            _lobbyist = Person(
                name=l_name,
                source_identified=True
            )

        if l['lobbyist_covered_official_position']:
            l_extras['lda_covered_official_positions'] = [
                {
                    'date_reported':
                        parsed_form['datetimes']['effective_date'],
                    'covered_official_position':
                        l['lobbyist_covered_official_position']
                },
            ]

        _lobbyist.extras = combine_dicts(_lobbyist.extras, l_extras)

        _lobbyists_by_name[l_name] = _lobbyist

    _lobbyists = []
    for unique_lobbyist in _lobbyists_by_name.values():
        _lobbyists.append(unique_lobbyist)

    for l in _lobbyists:
        member_holder.add_member(
            l,
            role='lobbyist',
            label='lobbyist for {n}'.format(n=_registrant.name),
            start_date=start_date
        )

    # # Document
    # build document
    _disclosure.add_document(
        note='submitted filing',
        date=parsed_form['datetimes']['effective_date'][:10],
        url=response.url
    )

    # Collect Affiliated orgs
    _affiliated_organizations = []
    _affiliated_organizations_by_name = {}
    for ao in parsed_form['affiliated_organizations']:
        ao_extras = {}
        ao_name = ao['affiliated_organization_name']
        if ao_name in _affiliated_organizations_by_name:
            # There's already one by this name
            _affiliated_organization = _affiliated_organizations_by_name[ao_name]
        else:
            # New affiliated org
            _affiliated_organization = Organization(
                name=ao_name,
                classification='company',
                source_identified=True
            )

        # collect contact details
        affiliated_organization_contact_details = [
            {
                "type": "address",
                "note": "contact address",
                "value": _joined([
                    ao['affiliated_organization_address'],
                    ao['affiliated_organization_city'],
                    ao['affiliated_organization_state'],
                    ao['affiliated_organization_zip'],
                    ao['affiliated_organization_country']]),
            },
        ]

        affiliated_organization_contact_ppb = {
            "type": "address",
            "note": "principal place of business",
            "value": _joined([
                ao['affiliated_organization_ppb_city'],
                ao['affiliated_organization_ppb_state'],
                ao['affiliated_organization_ppb_country']]),
        }

        if affiliated_organization_contact_ppb["value"]:
            affiliated_organization_contact_details.append(
                affiliated_organization_contact_ppb)

        # add contact details
        for cd in affiliated_organization_contact_details:
            _affiliated_organization.add_contact_detail(**cd)

        # A plain list, like the other entities' structured extras (a stray
        # trailing comma used to wrap this list in a 1-tuple).
        ao_extras["contact_details_structured"] = _structured(
            ao, 'affiliated_organization_',
            ('address', 'city', 'state', 'zip', 'country'),
            ('city', 'state', 'country'))

        _affiliated_organization.extras = combine_dicts(
            _affiliated_organization.extras, ao_extras)

        # Register the organization under its name.  Without this the
        # lookup dict stayed empty, so no affiliated organization was ever
        # yielded or added to the event.
        _affiliated_organizations_by_name[ao_name] = _affiliated_organization

    for unique_affiliated_organization in _affiliated_organizations_by_name.values():
        _affiliated_organizations.append(unique_affiliated_organization)

    # # Events & Agendas
    # name
    if parsed_form['registration_type']['new_registrant']:
        registration_type = 'New Client, New Registrant'
    elif parsed_form['registration_type']['is_amendment']:
        registration_type = 'Amended Registration'
    else:
        registration_type = 'New Client for Existing Registrant'

    # Create registration event
    _event = Event(
        name="{rn} - {rt}, {cn}".format(rn=_registrant.name,
                                        rt=registration_type,
                                        cn=_client.name),
        timezone='America/New_York',
        location='United States',
        start_time=_form_datetime('effective_date'),
        classification='registration'
    )

    # add participants
    _event.add_participant(type=_registrant._type,
                           id=_registrant._id,
                           name=_registrant.name,
                           note="registrant")

    if _registrant._type == 'person':
        # NOTE(review): this repeats the registrant participant added just
        # above; possibly the self-employment organization was intended --
        # confirm before changing.
        _event.add_participant(type=_registrant._type,
                               id=_registrant._id,
                               name=_registrant.name,
                               note="registrant")

    _event.add_participant(type=_client._type,
                           id=_client._id,
                           name=_client.name,
                           note="client")

    for l in _lobbyists:
        _event.add_participant(type=l._type,
                               id=l._id,
                               name=l.name,
                               note='lobbyist')

    for fe in _foreign_entities:
        _event.add_participant(type=fe._type,
                               id=fe._id,
                               name=fe.name,
                               note='foreign_entity')

    for ao in _affiliated_organizations:
        _event.add_participant(type=ao._type,
                               id=ao._id,
                               name=ao.name,
                               note='affiliated_organization')

    # add agenda item
    _agenda = _event.add_agenda_item(
        description='issues lobbied on',
    )

    _agenda['notes'].append(
        parsed_form['lobbying_issues_detail']
    )

    for li in parsed_form['lobbying_issues']:
        if li['general_issue_area'] != '':
            _agenda.add_subject(li['general_issue_area'])

    _disclosure.add_disclosed_event(
        name=_event.name,
        type=_event._type,
        classification=_event.classification,
        id=_event._id
    )

    # add registrant to disclosure's _related and related_entities fields
    _disclosure.add_registrant(name=_registrant.name,
                               type=_registrant._type,
                               id=_registrant._id)

    # Every object gets the filing URL as its source, with a note naming
    # its role, right before it is yielded.
    _registrant.add_source(
        url=_source['url'],
        note='registrant'
    )
    yield _registrant

    if _registrant_self_employment is not None:
        _registrant_self_employment.add_source(
            url=_source['url'],
            note='registrant_self_employment'
        )
        yield _registrant_self_employment

    _client.add_source(
        url=_source['url'],
        note='client'
    )
    yield _client

    _main_contact.add_source(
        url=_source['url'],
        note='main_contact'
    )
    yield _main_contact

    for ao in _affiliated_organizations:
        ao.add_source(
            url=_source['url'],
            note='affiliated_organization'
        )
        yield ao

    for fe in _foreign_entities:
        fe.add_source(
            url=_source['url'],
            note='foreign_entity'
        )
        yield fe

    for l in _lobbyists:
        l.add_source(
            url=_source['url'],
            note='lobbyist'
        )
        yield l

    _event.add_source(**_source)
    yield _event

    _disclosure.add_source(**_source)
    yield _disclosure