示例#1
0
    def prepare_for_db(self, data):
        """Resolve the references in a raw event dict, in place, and return it.

        The dict gets this importer's ``jurisdiction_id``, its ``location`` is
        replaced by the result of ``self.get_location``, ``start_time`` and the
        optional ``end_time`` are parsed with ``read_event_iso_8601`` (``None``
        stays ``None``), and each participant of type ``person`` or
        ``organization`` receives a ``person_id`` / ``organization_id``.

        A participant id is first resolved from a pseudo id built from the
        event's first source URL and the participant's name.  If that cannot be
        built or resolved, the participant's own ``id`` (removed from the dict
        either way) is resolved instead.
        """
        def parse_time(raw):
            return None if raw is None else read_event_iso_8601(raw)

        data['jurisdiction_id'] = self.jurisdiction_id
        data['location'] = self.get_location(data['location'])
        data['start_time'] = parse_time(data['start_time'])
        data['end_time'] = parse_time(data.get('end_time', None))

        participants = []
        for participant in data['participants']:
            fallback_id = participant.pop('id', None)

            # (importer attribute on self, key that receives the resolved id)
            kind = participant['entity_type']
            if kind == 'person':
                target = ('person_importer', 'person_id')
            elif kind == 'organization':
                target = ('org_importer', 'organization_id')
            else:
                target = None

            if target is not None:
                importer_attr, id_key = target
                try:
                    pseudo_id = make_pseudo_id(
                        sources__url=data['sources'][0]['url'],
                        name=participant['name'],
                    )
                    participant[id_key] = getattr(
                        self, importer_attr).resolve_json_id(pseudo_id)
                except (UnresolvedIdError, KeyError, IndexError):
                    # No usable source/name, or the pseudo id is unknown:
                    # fall back to the id the scraper supplied.
                    participant[id_key] = getattr(
                        self, importer_attr).resolve_json_id(fallback_id)

            participants.append(participant)

        data['participants'] = participants
        return data
    def _parse_house_floor_xml_legislative_activity(self, xml):
        """
        Parses an XML string of House floor updates and yields one Event per
        ``floor_action`` element.

        Every event carries the House floor location, a source URL built from
        the ``legislative_day`` date, the action's ``act-id`` and ``unique-id``
        in ``extras``, three agenda items (bills, public laws, votes), one
        document per referenced report, and each current House committee whose
        name, with "House " removed, occurs in the action description.

        @param xml: XML of the floor updates
        @type xml: string
        @return: generator yielding one Event per floor action
        @rtype: generator
        """
        tree = self._xml_parser(xml)
        congress = tree.xpath('.//legislative_congress')[0].get('congress')
        house_committees = self._get_current_house_committee_names()

        for action in tree.xpath('.//floor_action'):
            description = action.xpath('.//action_description')[0].xpath('string()')

            # The timestamp is parsed as naive and then localized to US/Eastern
            # before being converted to UTC.
            eastern = pytz.timezone('US/Eastern')
            raw_time = action.xpath('action_time')[0].get('for-search')
            when = datetime.datetime.strptime(raw_time, '%Y%m%dT%H:%M:%S')
            title = 'House Floor Update on {0} at {1}.'.format(
                when.strftime('%Y-%m-%d'), when.strftime('%H:%M:%S'))
            start_utc = eastern.localize(when).astimezone(pytz.utc)

            event = Event(title,
                          start_utc,
                          'US/Eastern',
                          '',
                          description=description,
                          classification='floor_update')

            event.set_location(
                "East Capitol Street Northeast & First St SE, Washington, DC 20004",
                note='House Floor',
                url='http://www.house.gov',
                coordinates={'latitude': '38.889931', 'longitude': '-77.009003'})

            day = tree.xpath('.//legislative_day')[0].get('date')
            event.add_source(
                self._house_floor_src_url(date_str=day),
                note="Scraped from the Office of the Clerk, U.S. House of Representatives website.")

            for attr in ('act-id', 'unique-id'):
                event.extras[attr] = action.get(attr)

            # Bills referenced in the update text.
            bills_item = event.add_agenda_item(description='Bills referenced by this update.')
            for link in action.xpath(".//a[@rel='bill']"):
                bill_name = link.xpath('string()')
                bill_id = make_pseudo_id(identifier=bill_code_to_id(bill_name),
                                         congress=congress)
                bills_item.add_bill(bill_name, id=bill_id,
                                    note="Bill was referenced on the House floor.")

            # Public laws: the detail page sits two path segments above the
            # linked document.
            laws_item = event.add_agenda_item(description='Public laws referenced by this update.')
            for link in action.xpath(".//a[@rel='publaw']"):
                href_parts = link.get('href').split('/')
                detail_url = '/'.join(href_parts[0:-2]) + '/content-detail.html'
                law_name = link.xpath('string()')
                law_id = make_pseudo_id(**self._public_law_detail_scraper(url=detail_url))
                laws_item.add_bill(law_name, id=law_id,
                                   note='Law was referenced on the House floor.')

            # Roll-call votes.
            votes_item = event.add_agenda_item(description='Votes referenced by this update.')
            for link in action.xpath(".//a[@rel='vote']"):
                vote_name = link.xpath('string()')
                vote_id = make_pseudo_id(identifier=vote_code_to_id(vote_name),
                                         congress=congress)
                votes_item.add_vote(vote_name, id=vote_id,
                                    note='Vote was referenced on the House floor.')

            # Committee reports are attached as documents.
            for link in action.xpath(".//a[@rel='report']"):
                event.add_document('Document referenced by this update.',
                                   link.get('href'), media_type='text/html')

            # A committee participates when its name, minus "House ", appears
            # in the description text.
            for committee in house_committees:
                if committee.replace('House ', '') in description:
                    event.add_committee(committee, id=make_pseudo_id(name=committee))

            # TODO identify legislators and add them as participants?

            yield event
    def scrape_current_legislators(self, repos):
        """Yield Post, Membership and Person objects for the legislators in ``repos``.

        Each entry of ``repos`` is passed to ``self.get_url`` and the result is
        parsed with ``self.yamlize`` into a list of legislator records.  For
        every term of every legislator this yields, in order: the Post for the
        seat (only the first time that seat is seen within the repo), a
        Membership of the chamber and, when the term names a party, a
        Membership of that party.  The Person is yielded after its terms, and
        only if it had at least one term.

        Terms of type ``rep`` without a ``district`` cannot be mapped to a
        seat, so no Post or chamber Membership is produced for them; the party
        Membership still is.

        :param repos: iterable of locations understood by ``self.get_url``.
        :returns: generator of Post, Membership and Person objects.
        :raises KeyError: if a term's ``type`` is neither ``rep`` nor ``sen``,
            or a required term/person key is missing.
        """
        for repo in repos:
            CURRENT_LEGISLATORS = self.get_url(repo)

            people = self.yamlize(CURRENT_LEGISLATORS)
            # Posts already yielded for this repo, keyed by (term type,
            # division id).  The term type is part of the key because an
            # at-large House seat (district 0) and the Senate seats of the same
            # state share one division id and must not share a Post.
            posts = {}
            # NOTE(review): nothing ever stores into this cache, so the lookup
            # below always misses -- confirm whether de-duplication was meant.
            person_cache = defaultdict(lambda: defaultdict(lambda: None))

            for person in people:
                name = person['name'].get('official_full')
                if name is None:
                    name = "{name[first]} {name[last]}".format(**person)

                # Read the birthday for *this* record; None when it is absent,
                # so a previous person's birthday can never leak in.
                birth_date = person['bio'].get('birthday')

                who = person_cache[name][birth_date]
                has_term = False

                if who is None:
                    person_kwargs = {'name': name}
                    if birth_date is not None:
                        # Unknown birthdays are left to Person's own default --
                        # assumes birth_date is optional there; TODO confirm.
                        person_kwargs['birth_date'] = birth_date
                    who = Person(**person_kwargs)
                    who.add_source(url=CURRENT_LEGISLATORS, note="unitedstates project on GitHub")

                for term in person.get('terms', []):
                    has_term = True
                    start_date = term['start']
                    end_date = term['end']
                    state = term['state']
                    type_ = term['type']
                    district = term.get('district', None)
                    party = term.get('party', None)

                    chamber = {'rep': self.house,
                               'sen': self.senate}[type_]

                    role = {'rep': 'Representative',
                            'sen': 'Senator'}[type_]

                    state_division = "ocd-division/country:us/state:{state}".format(
                        state=state.lower())

                    if type_ == "sen":
                        label = "Senator for %s" % state
                        division_id = state_division
                    elif district is not None:
                        label = "%s for District %s in %s" % (role, district, state)
                        division_id = state_division
                        # District 0 is the at-large seat: the state itself.
                        if district != 0:
                            division_id += "/cd:{district}".format(district=district)
                    else:
                        # Representative term without a district: no seat.
                        label = division_id = None

                    if division_id is not None:
                        post_key = (type_, division_id)
                        post = posts.get(post_key)
                        if post is None:
                            post = Post(organization_id=chamber._id,
                                        division_id=division_id,
                                        label=label, role=role)
                            posts[post_key] = post
                            yield post

                        yield Membership(
                            post_id=post._id,
                            role=role,
                            label=label,
                            start_date=start_date,
                            end_date=end_date,
                            person_id=who._id,
                            organization_id=chamber._id)

                    if party == "Democrat":
                        party = "Democratic"

                    if party:
                        yield Membership(
                            role='member',
                            start_date=start_date,
                            end_date=end_date,
                            person_id=who._id,
                            organization_id=make_pseudo_id(
                                classification="party",
                                name=party))

                for key, value in person.get('id', {}).items():
                    if isinstance(value, list):
                        for v in value:
                            who.add_identifier(str(v), scheme=key)
                    else:
                        who.add_identifier(str(value), scheme=key)
                        if key == 'bioguide':
                            who.image = self.get_image_url(str(value))

                if has_term:
                    yield who
    def scrape_staff(self, url, role):
        """Scrape an office page, yielding its head and staff with their memberships.

        The page is fetched with ``self.lxmlize``.  The right-hand bar must
        contain exactly four ``module`` divs (header, office, contact and one
        that is ignored); the header supplies the office holder's name and
        title.  Yields, in order: the holder's Membership, the holder Person
        (with e-mail, address and contact details attached), then for every
        staff entry a Membership followed by the staffer Person.

        Unpacking raises ``ValueError`` when the page does not have the
        expected number of elements, and an unrecognised icon class raises
        ``KeyError``.
        """
        def detail_value(icon):
            # The value is the text right after the icon span, or else the
            # text of the next sibling element; None when there is neither.
            text = icon.tail
            if text is not None:
                return text
            sibling = icon.getnext()
            return None if sibling is None else sibling.text_content()

        page = self.lxmlize(url)
        [sidebar] = page.xpath("//div[@class='right-bar']")
        header_box, office_box, contact_box, _ = sidebar.xpath(".//div[@class='module']")
        [name_node] = header_box.xpath(".//h4")
        title_node, social_node = header_box.xpath(".//p")

        official = Person(name=name_node.text_content())
        official.add_source(url)

        yield Membership(
            post_id=make_pseudo_id(role=role),
            role=role,
            label=title_node.text_content(),
            person_id=official._id,
            organization_id=make_pseudo_id(classification="legislature"),
        )

        for link in social_node.xpath(".//a[contains(@href, 'mailto:')]"):
            official.add_contact_detail(type='email',
                                        value=link.attrib['href'],
                                        note='Office Email')

        for paragraph in office_box.xpath(".//p"):
            official.add_contact_detail(type='address',
                                        value=paragraph.text_content(),
                                        note='Office Address')

        contact_kinds = {
            "icon-phone": "voice",
            "icon-fax": "fax",
            "icon-email": "email",
        }
        for icon in contact_box.xpath(".//span"):
            kind = contact_kinds[icon.attrib['class']]
            value = detail_value(icon)
            if value is not None:
                official.add_contact_detail(type=kind,
                                            value=value,
                                            note="Office Contact Detail")
        yield official

        staff_kinds = {
            "icon-phone marker": "voice",
            "icon-email marker": "email",
        }
        [staff_box] = page.xpath("//div[@id='staff']")
        # NOTE(review): this XPath starts with "//", so it does not look
        # restricted to descendants of the staff div -- confirm that is intended.
        entries = staff_box.xpath(
            "//div[@class='table-item clearfix remove-clickable']"
        )
        for entry in entries:
            [title_span] = entry.xpath(".//span[@class='title1']")
            # The span text reads "<name> - <role>"; split on the last dash.
            raw_name, raw_role = title_span.text.rsplit("-", 1)
            staff_name = raw_name.strip()
            staff_role = raw_role.strip()

            staffer = Person(name=staff_name)
            staffer.add_source(url)
            detail_icons = entry.xpath(".//p/span")

            yield Membership(
                role=staff_role,
                label="%s-staff" % (role),
                person_id=staffer._id,
                organization_id=make_pseudo_id(classification="legislature"),
            )

            for icon in detail_icons:
                kind = staff_kinds[icon.attrib['class']]
                value = detail_value(icon)
                if value is not None:
                    staffer.add_contact_detail(type=kind,
                                               value=value,
                                               note="Office")

            yield staffer
    def scrape_current_legislators(self, repos):
        """Yield Post, Membership and Person objects for the legislators in ``repos``.

        Each entry of ``repos`` is passed to ``self.get_url`` and the result is
        parsed with ``self.yamlize`` into a list of legislator records.  For
        every term of every legislator this yields, in order: the Post for the
        seat (only the first time that seat is seen within the repo), a
        Membership of the chamber and, when the term names a party, a
        Membership of that party.  The Person is yielded after its terms, and
        only if it had at least one term.

        Terms of type ``rep`` without a ``district`` produce no Post or chamber
        Membership; the party Membership is still yielded.

        :param repos: iterable of locations understood by ``self.get_url``.
        :returns: generator of Post, Membership and Person objects.
        :raises KeyError: if a term's ``type`` is neither ``rep`` nor ``sen``,
            or a required term/person key is missing.
        """
        for repo in repos:
            CURRENT_LEGISLATORS = self.get_url(repo)

            people = self.yamlize(CURRENT_LEGISLATORS)
            # Posts already yielded for this repo, keyed by (term type,
            # division id).  The term type is part of the key because an
            # at-large House seat (district 0) and the Senate seats of the same
            # state share one division id and must not share a Post.
            posts = {}
            # NOTE(review): nothing ever stores into this cache, so the lookup
            # below always misses -- confirm whether de-duplication was meant.
            person_cache = defaultdict(lambda: defaultdict(lambda: None))

            for person in people:
                name = person['name'].get('official_full')
                if name is None:
                    name = "{name[first]} {name[last]}".format(**person)

                # Read the birthday for *this* record; None when it is absent,
                # so a previous person's birthday can never leak in.
                birth_date = person['bio'].get('birthday')

                who = person_cache[name][birth_date]
                has_term = False

                if who is None:
                    person_kwargs = {'name': name}
                    if birth_date is not None:
                        # Unknown birthdays are left to Person's own default --
                        # assumes birth_date is optional there; TODO confirm.
                        person_kwargs['birth_date'] = birth_date
                    who = Person(**person_kwargs)
                    who.add_source(url=CURRENT_LEGISLATORS,
                                   note="unitedstates project on GitHub")

                for term in person.get('terms', []):
                    has_term = True
                    start_date = term['start']
                    end_date = term['end']
                    state = term['state']
                    type_ = term['type']
                    district = term.get('district', None)
                    party = term.get('party', None)

                    # Also rejects unknown term types with a KeyError.
                    role = {
                        'rep': 'Representative',
                        'sen': 'Senator',
                    }[type_]

                    state_division = (
                        "ocd-division/country:us/state:{state}".format(
                            state=state.lower()))

                    if type_ == "sen":
                        label = "Senator for %s" % (state)
                        division_id = state_division
                    elif district is not None:
                        label = "%s for District %s in %s" % (role, district,
                                                              state)
                        # District 0 is the at-large seat: the state itself.
                        if district == 0:
                            division_id = state_division
                        else:
                            division_id = "{base}/cd:{district}".format(
                                base=state_division, district=district)
                    else:
                        # Representative term without a district: no seat.
                        label = division_id = None

                    if division_id is not None:
                        chamber_id = {
                            "rep": self.house,
                            "sen": self.senate,
                        }[type_]._id

                        post_key = (type_, division_id)
                        post = posts.get(post_key)
                        if post is None:
                            post = Post(organization_id=chamber_id,
                                        division_id=division_id,
                                        label=label,
                                        role=role)
                            posts[post_key] = post
                            yield post

                        yield Membership(post_id=post._id,
                                         role=role,
                                         label=label,
                                         start_date=start_date,
                                         end_date=end_date,
                                         person_id=who._id,
                                         organization_id=chamber_id)

                    if party == "Democrat":
                        party = "Democratic"

                    if party:
                        yield Membership(role='member',
                                         start_date=start_date,
                                         end_date=end_date,
                                         person_id=who._id,
                                         organization_id=make_pseudo_id(
                                             classification="party",
                                             name=party))

                for key, value in person.get('id', {}).items():
                    if isinstance(value, list):
                        for v in value:
                            who.add_identifier(str(v), scheme=key)
                    else:
                        who.add_identifier(str(value), scheme=key)

                if has_term:
                    yield who