def prepare_for_db(self, data):
    """
    Normalises a scraped event dict in place before it is stored.

    Sets the jurisdiction id, replaces the location with the result of
    ``self.get_location``, parses the start/end timestamps and attaches
    a resolved ``person_id`` / ``organization_id`` to each participant of
    the matching entity type.

    @param data: scraped event data (mutated in place)
    @type data: dict
    @return: the same ``data`` object
    @rtype: dict
    """
    def parse_time(raw):
        # Missing timestamps stay None rather than reaching the parser.
        if raw is None:
            return None
        return read_event_iso_8601(raw)

    data['jurisdiction_id'] = self.jurisdiction_id
    data['location'] = self.get_location(data['location'])
    data['start_time'] = parse_time(data['start_time'])
    data['end_time'] = parse_time(data.get('end_time'))

    participants = []
    for participant in data['participants']:
        # The scraped id is removed from the record; it only serves as the
        # fallback lookup key when the pseudo id cannot be used.
        scraped_id = participant.pop('id', None)
        kind = participant['entity_type']

        if kind == 'person':
            try:
                pseudo_id = make_pseudo_id(
                    sources__url=data['sources'][0]['url'],
                    name=participant['name'],
                )
                person_id = self.person_importer.resolve_json_id(pseudo_id)
            except (UnresolvedIdError, KeyError, IndexError):
                person_id = self.person_importer.resolve_json_id(scraped_id)
            participant['person_id'] = person_id
        elif kind == 'organization':
            try:
                pseudo_id = make_pseudo_id(
                    sources__url=data['sources'][0]['url'],
                    name=participant['name'],
                )
                organization_id = self.org_importer.resolve_json_id(
                    pseudo_id)
            except (UnresolvedIdError, KeyError, IndexError):
                organization_id = self.org_importer.resolve_json_id(
                    scraped_id)
            participant['organization_id'] = organization_id

        participants.append(participant)

    data['participants'] = participants
    return data
def _parse_house_floor_xml_legislative_activity(self, xml):
    """
    Parses XML string of House floor updates and yields one Event per
    floor action found in it.

    Each event carries the action description, a fixed House Floor
    location, a source URL built from the legislative day, the act-id and
    unique-id attributes as extras, agenda items for referenced bills,
    public laws and votes, linked reports as documents, and any current
    House committee whose name appears in the description.

    @param xml: XML of field update
    @type xml: string
    @return: generator yielding a complete Event object for each
        floor_action element
    @rtype: generator of Event
    """
    tree = self._xml_parser(xml)
    congress = tree.xpath('.//legislative_congress')[0].get('congress')
    house_committees = self._get_current_house_committee_names()
    # The timezone is the same for every floor action, so look it up once
    # instead of on each iteration.
    eastern = pytz.timezone('US/Eastern')

    for fa in tree.xpath('.//floor_action'):
        fa_text = fa.xpath('.//action_description')[0].xpath('string()')
        # The for-search attribute is parsed as a naive timestamp; it is
        # localized as US/Eastern below and converted to UTC.
        dt = datetime.datetime.strptime(
            fa.xpath('action_time')[0].get('for-search'),
            '%Y%m%dT%H:%M:%S')

        event = Event(
            'House Floor Update on {0} at {1}.'.format(
                dt.strftime('%Y-%m-%d'), dt.strftime('%H:%M:%S')),
            eastern.localize(dt).astimezone(pytz.utc),
            'US/Eastern',
            '',
            description=fa_text,
            classification='floor_update')
        event.set_location(
            "East Capitol Street Northeast & First St SE, Washington, DC 20004",
            note='House Floor',
            url='http://www.house.gov',
            coordinates={'latitude': '38.889931',
                         'longitude': '-77.009003'})
        event.add_source(
            self._house_floor_src_url(
                date_str=tree.xpath('.//legislative_day')[0].get('date')),
            note="Scraped from the Office of the Clerk, U.S. House of Representatives website.")

        event.extras['act-id'] = fa.get('act-id')
        event.extras['unique-id'] = fa.get('unique-id')

        # bills
        bills_item = event.add_agenda_item(
            description='Bills referenced by this update.')
        for bill in fa.xpath(".//a[@rel='bill']"):
            bill_name = bill.xpath('string()')
            bills_item.add_bill(
                bill_name,
                id=make_pseudo_id(identifier=bill_code_to_id(bill_name),
                                  congress=congress),
                note="Bill was referenced on the House floor.")

        # publaws
        laws_item = event.add_agenda_item(
            description='Public laws referenced by this update.')
        for law in fa.xpath(".//a[@rel='publaw']"):
            # Drop the last two path segments of the link and point at the
            # content-detail page instead.
            detail_url = ('/'.join(law.get('href').split('/')[:-2])
                          + '/content-detail.html')
            laws_item.add_bill(
                law.xpath('string()'),
                id=make_pseudo_id(
                    **self._public_law_detail_scraper(url=detail_url)),
                note='Law was referenced on the House floor.')

        # votes
        votes_item = event.add_agenda_item(
            description='Votes referenced by this update.')
        for vote in fa.xpath(".//a[@rel='vote']"):
            vote_name = vote.xpath('string()')
            votes_item.add_vote(
                vote_name,
                id=make_pseudo_id(identifier=vote_code_to_id(vote_name),
                                  congress=congress),
                note='Vote was referenced on the House floor.')

        # reports
        for report in fa.xpath(".//a[@rel='report']"):
            event.add_document('Document referenced by this update.',
                               report.get('href'),
                               media_type='text/html')

        # Committees are matched by name with the 'House ' prefix removed.
        for name in house_committees:
            if name.replace('House ', '') in fa_text:
                event.add_committee(name, id=make_pseudo_id(name=name))

        # TODO identify legislators and add them as participants?

        yield event
def scrape_current_legislators(self, repos):
    """
    Yields Post, Membership and Person objects for current legislators.

    Every URL in ``repos`` is fetched with ``self.get_url`` and parsed
    with ``self.yamlize``. For each term of each person this yields the
    chamber Post (the first time that seat is seen in the document), a
    Membership tying the person to the post and, when the term names a
    party, a party Membership. The Person itself is yielded last and only
    if the record had at least one term.

    @param repos: iterable of locations of legislator YAML documents
    @return: generator of Post, Membership and Person objects
    """
    for repo in repos:
        source_url = self.get_url(repo)
        people = self.yamlize(source_url)
        # Posts already emitted for this document. The key includes the
        # term type because an at-large House seat (district 0) and the
        # same state's Senate seat share one division id and must not
        # share a Post.
        posts = {}
        person_cache = defaultdict(lambda: defaultdict(lambda: None))

        for person in people:
            name = person['name'].get('official_full')
            if name is None:
                name = "{name[first]} {name[last]}".format(**person)

            # Default to None so a record without a birthday neither
            # raises nor inherits the previous person's birth date.
            birth_date = person['bio'].get('birthday')

            # NOTE(review): nothing in this method stores into
            # person_cache, so this lookup always yields None and a new
            # Person is built every time - confirm the intended caching.
            who = person_cache[name][birth_date]
            has_term = False
            if who is None:
                person_kwargs = {'name': name}
                if birth_date is not None:
                    person_kwargs['birth_date'] = birth_date
                # With no birthday the keyword is omitted; assumes Person
                # has its own default for birth_date - TODO confirm.
                who = Person(**person_kwargs)
                who.add_source(url=source_url,
                               note="unitedstates project on GitHub")

            for term in person.get('terms', []):
                has_term = True
                start_date = term['start']
                end_date = term['end']
                state = term['state']
                type_ = term['type']
                district = term.get('district', None)
                party = term.get('party', None)

                chamber = {'rep': self.house, 'sen': self.senate}[type_]
                role = {'rep': 'Representative', 'sen': 'Senator'}[type_]

                # Recomputed for every term so values from a previous
                # term can never leak into this one.
                label = None
                division_id = None
                state_division = (
                    "ocd-division/country:us/state:{state}".format(
                        state=state.lower()))
                if type_ == "sen":
                    label = "Senator for %s" % state
                    division_id = state_division
                elif type_ == "rep" and district is not None:
                    label = "%s for District %s in %s" % (
                        role, district, state)
                    division_id = state_division
                    # District 0 is kept at the state-level division.
                    if district != 0:
                        division_id += "/cd:{district}".format(
                            district=district)

                # A representative term without a district cannot be
                # mapped to a seat; only its party membership is emitted.
                if division_id is not None:
                    post_key = (type_, division_id)
                    post = posts.get(post_key)
                    if post is None:
                        post = Post(organization_id=chamber._id,
                                    division_id=division_id,
                                    label=label,
                                    role=role)
                        posts[post_key] = post
                        yield post

                    yield Membership(post_id=post._id,
                                     role=role,
                                     label=label,
                                     start_date=start_date,
                                     end_date=end_date,
                                     person_id=who._id,
                                     organization_id=chamber._id)

                if party == "Democrat":
                    party = "Democratic"

                if party:
                    yield Membership(role='member',
                                     start_date=start_date,
                                     end_date=end_date,
                                     person_id=who._id,
                                     organization_id=make_pseudo_id(
                                         classification="party",
                                         name=party))

            for key, value in person.get('id', {}).items():
                if isinstance(value, list):
                    for v in value:
                        who.add_identifier(str(v), scheme=key)
                else:
                    who.add_identifier(str(value), scheme=key)
                    if key == 'bioguide':
                        who.image = self.get_image_url(str(value))

            if has_term:
                yield who
def scrape_staff(self, url, role):
    """
    Scrapes an office page and yields its people and memberships.

    The page is fetched with ``self.lxmlize``. The office holder found in
    the right-hand bar is yielded as a Person (after their Membership)
    with e-mail, address and other contact details attached. Every entry
    inside the ``staff`` container is then yielded as a Membership
    followed by a Person.

    @param url: address of the office page; also recorded as the source
    @param role: role of the office holder; used for their post and for
        the "<role>-staff" label on staff memberships
    @return: generator of Membership and Person objects
    """
    def detail_value(marker):
        # The text normally trails the icon span; when the span has no
        # tail, fall back to the text of the next sibling element.
        text = marker.tail
        if text is None:
            sibling = marker.getnext()
            if sibling is not None:
                text = sibling.text_content()
        return text

    head_contact_types = {
        "icon-phone": "voice",
        "icon-fax": "fax",
        "icon-email": "email",
    }
    staff_contact_types = {
        "icon-phone marker": "voice",
        "icon-email marker": "email",
    }

    page = self.lxmlize(url)
    bar, = page.xpath("//div[@class='right-bar']")
    # Exactly four modules are expected; the last one is not used.
    head_box, office_box, contact_box, _ = bar.xpath(
        ".//div[@class='module']")
    name_el, = head_box.xpath(".//h4")
    title_el, social_el = head_box.xpath(".//p")

    head = Person(name=name_el.text_content())
    head.add_source(url)

    yield Membership(
        post_id=make_pseudo_id(role=role),
        role=role,
        label=title_el.text_content(),
        person_id=head._id,
        organization_id=make_pseudo_id(classification="legislature"))

    for link in social_el.xpath(".//a[contains(@href, 'mailto:')]"):
        href = link.attrib['href']
        # Store the address itself, not the mailto: URI.
        if href.startswith('mailto:'):
            href = href[len('mailto:'):]
        head.add_contact_detail(type='email',
                                value=href,
                                note='Office Email')

    for address in office_box.xpath(".//p"):
        head.add_contact_detail(type='address',
                                value=address.text_content(),
                                note='Office Address')

    for marker in contact_box.xpath(".//span"):
        type_ = head_contact_types[marker.attrib['class']]
        value = detail_value(marker)
        if value is None:
            continue
        head.add_contact_detail(type=type_,
                                value=value,
                                note="Office Contact Detail")

    yield head

    staff, = page.xpath("//div[@id='staff']")
    # The leading "." keeps the search inside the staff container; a bare
    # "//" would be evaluated against the whole document.
    for member in staff.xpath(
            ".//div[@class='table-item clearfix remove-clickable']"):
        title_span, = member.xpath(".//span[@class='title1']")
        # The span text is split on its last "-" into name and role.
        staff_name, staff_role = title_span.text.rsplit("-", 1)

        staffer = Person(name=staff_name.strip())
        staffer.add_source(url)

        yield Membership(
            role=staff_role.strip(),
            label="%s-staff" % (role),
            person_id=staffer._id,
            organization_id=make_pseudo_id(classification="legislature"))

        for marker in member.xpath(".//p/span"):
            type_ = staff_contact_types[marker.attrib['class']]
            value = detail_value(marker)
            if value is None:
                continue
            staffer.add_contact_detail(type=type_,
                                       value=value,
                                       note="Office")

        yield staffer
def scrape_current_legislators(self, repos):
    """
    Yields Post, Membership and Person objects for current legislators.

    Every URL in ``repos`` is fetched with ``self.get_url`` and parsed
    with ``self.yamlize``. For each term of each person this yields the
    chamber Post (the first time that seat is seen in the document), a
    Membership tying the person to the post and, when the term names a
    party, a party Membership. The Person itself is yielded last and only
    if the record had at least one term.

    @param repos: iterable of locations of legislator YAML documents
    @return: generator of Post, Membership and Person objects
    """
    for repo in repos:
        source_url = self.get_url(repo)
        people = self.yamlize(source_url)
        # Posts already emitted for this document. The key includes the
        # term type because an at-large House seat (district 0) and the
        # same state's Senate seat share one division id and must not
        # share a Post.
        posts = {}
        person_cache = defaultdict(lambda: defaultdict(lambda: None))

        for person in people:
            name = person['name'].get('official_full')
            if name is None:
                name = "{name[first]} {name[last]}".format(**person)

            # Default to None so a record without a birthday neither
            # raises nor inherits the previous person's birth date.
            birth_date = person['bio'].get('birthday')

            # NOTE(review): nothing in this method stores into
            # person_cache, so this lookup always yields None and a new
            # Person is built every time - confirm the intended caching.
            who = person_cache[name][birth_date]
            has_term = False
            if who is None:
                person_kwargs = {'name': name}
                if birth_date is not None:
                    person_kwargs['birth_date'] = birth_date
                # With no birthday the keyword is omitted; assumes Person
                # has its own default for birth_date - TODO confirm.
                who = Person(**person_kwargs)
                who.add_source(url=source_url,
                               note="unitedstates project on GitHub")

            for term in person.get('terms', []):
                has_term = True
                start_date = term['start']
                end_date = term['end']
                state = term['state']
                type_ = term['type']
                district = term.get('district', None)
                party = term.get('party', None)

                # An unknown term type raises KeyError here.
                role = {
                    'rep': 'Representative',
                    'sen': 'Senator',
                }[type_]

                state_division = (
                    "ocd-division/country:us/state:{state}".format(
                        state=state.lower()))
                label = None
                division_id = None
                if type_ == "rep" and district is not None:
                    label = "%s for District %s in %s" % (
                        role, district, state)
                    if district == 0:
                        # District 0 is kept at the state-level division.
                        division_id = state_division
                    else:
                        division_id = (
                            "ocd-division/country:us/"
                            "state:{state}/cd:{district}".format(
                                state=state.lower(),
                                district=district))
                elif type_ == "sen":
                    division_id = state_division
                    label = "Senator for %s" % (state)

                # A representative term without a district gets no chamber
                # post or membership, only the party membership below.
                if division_id is not None:
                    organization = {
                        "rep": self.house,
                        "sen": self.senate,
                    }[type_]
                    post_key = (type_, division_id)
                    post = posts.get(post_key)
                    if post is None:
                        post = Post(organization_id=organization._id,
                                    division_id=division_id,
                                    label=label,
                                    role=role)
                        posts[post_key] = post
                        yield post

                    yield Membership(post_id=post._id,
                                     role=role,
                                     label=label,
                                     start_date=start_date,
                                     end_date=end_date,
                                     person_id=who._id,
                                     organization_id=organization._id)

                if party == "Democrat":
                    party = "Democratic"

                if party:
                    yield Membership(role='member',
                                     start_date=start_date,
                                     end_date=end_date,
                                     person_id=who._id,
                                     organization_id=make_pseudo_id(
                                         classification="party",
                                         name=party))

            for key, value in person.get('id', {}).items():
                if isinstance(value, list):
                    for v in value:
                        who.add_identifier(str(v), scheme=key)
                else:
                    who.add_identifier(str(value), scheme=key)

            if has_term:
                yield who