def scrape_chamber(self, chamber, session):
    """Scrape all bills for one chamber of a New Hampshire session.

    Downloads the legislature's pipe-delimited dump files (LSRs,
    legislators, sponsors, docket), builds Bill objects keyed by LSR
    number, attaches versions/amendments/sponsors/actions, then yields
    the bills (and vote events via scrape_votes).  Sessions before 2017
    are delegated wholesale to NHLegacyBillScraper.
    """
    if int(session) < 2017:
        legacy = NHLegacyBillScraper(self.metadata, self.datadir)
        yield from legacy.scrape(chamber, session)
        # This throws an error because object_count isn't being properly incremented,
        # even though it saves fine. So fake the output_names
        self.output_names = ["1"]
        return

    # bill basics
    self.bills = {}  # LSR->Bill
    self.bills_by_id = {}  # need a second table to attach votes
    self.versions_by_lsr = {}  # mapping of bill ID to lsr
    self.amendments_by_lsr = {}

    # pre load the mapping table of LSR -> version id
    self.scrape_version_ids()
    self.scrape_amendments()

    # LSRs.txt rows are pipe-delimited; a well-formed row has 36 fields.
    # Rows occasionally wrap across two physical lines, so we buffer the
    # previous short row in last_line and try to rejoin.
    last_line = []
    for line in (
        self.get("http://gencourt.state.nh.us/dynamicdatafiles/LSRs.txt")
        .content.decode("utf-8")
        .split("\n")
    ):
        line = line.split("|")
        if len(line) < 1:
            continue
        if len(line) < 36:
            if len(last_line + line[1:]) == 36:
                # combine two lines for processing
                # (skip an empty entry at beginning of second line)
                # NOTE(review): the length check uses line[1:] but the
                # combine uses the full line, which leaves an extra (37th)
                # field at the join point — fields 0-10 read below appear
                # to come from last_line, so this may be benign; confirm
                # against the actual data file.
                line = last_line + line
                self.warning("used bad line")
            else:
                # skip this line, maybe we'll use it later
                self.warning("bad line: %s" % "|".join(line))
                last_line = line
                continue

        session_yr = line[0]
        lsr = line[1]
        title = line[2]
        body = line[3]
        # type_num = line[4]
        expanded_bill_id = line[9]
        bill_id = line[10]

        # Only keep rows for the requested chamber + session year.
        if body == body_code[chamber] and session_yr == session:
            # Classify from the expanded id prefix; fall back to the
            # bill_type_map keyed on the id's letters past the first one.
            if expanded_bill_id.startswith("CACR"):
                bill_type = "constitutional amendment"
            elif expanded_bill_id.startswith("PET"):
                bill_type = "petition"
            elif expanded_bill_id.startswith("AR") and bill_id.startswith("CACR"):
                bill_type = "constitutional amendment"
            elif expanded_bill_id.startswith("SSSB") or expanded_bill_id.startswith(
                "SSHB"
            ):
                # special session house/senate bills
                bill_type = "bill"
            else:
                bill_type = bill_type_map[expanded_bill_id.split(" ")[0][1:]]

            # Strip a leading parenthesized prefix, e.g. "(New Title) ..."
            if title.startswith("("):
                title = title.split(")", 1)[1].strip()

            self.bills[lsr] = Bill(
                legislative_session=session,
                chamber=chamber,
                identifier=bill_id,
                title=title,
                classification=bill_type,
            )
            # http://www.gencourt.state.nh.us/bill_status/billText.aspx?sy=2017&id=95&txtFormat=html
            if lsr in self.versions_by_lsr:
                version_id = self.versions_by_lsr[lsr]
                version_url = (
                    "http://www.gencourt.state.nh.us/bill_status/"
                    "billText.aspx?sy={}&id={}&txtFormat=html".format(
                        session, version_id
                    )
                )
                self.bills[lsr].add_version_link(
                    note="latest version", url=version_url, media_type="text/html"
                )
            # http://gencourt.state.nh.us/bill_status/billtext.aspx?sy=2017&txtFormat=amend&id=2017-0464S
            if lsr in self.amendments_by_lsr:
                amendment_id = self.amendments_by_lsr[lsr]
                amendment_url = (
                    "http://www.gencourt.state.nh.us/bill_status/"
                    "billText.aspx?sy={}&id={}&txtFormat=amend".format(
                        session, amendment_id
                    )
                )
                amendment_name = "Amendment #{}".format(amendment_id)
                self.bills[lsr].add_version_link(
                    note=amendment_name,
                    url=amendment_url,
                    media_type="application/pdf",
                )
            self.bills_by_id[bill_id] = self.bills[lsr]

    # load legislators (employee number -> name/seat), used to resolve
    # sponsor rows below
    self.legislators = {}
    for line in (
        self.get("http://gencourt.state.nh.us/dynamicdatafiles/legislators.txt")
        .content.decode("utf-8")
        .split("\n")
    ):
        if len(line) < 1:
            continue
        line = line.split("|")
        employee_num = line[0]
        # first, last, middle
        if line[3]:
            name = "%s %s %s" % (line[2], line[3], line[1])
        else:
            name = "%s %s" % (line[2], line[1])
        self.legislators[employee_num] = {"name": name, "seat": line[5]}
        # body = line[4]

    # sponsors: exactly 5 pipe-delimited fields expected per row
    for line in (
        self.get("http://gencourt.state.nh.us/dynamicdatafiles/LsrSponsors.txt")
        .content.decode("utf-8")
        .split("\n")
    ):
        if len(line) < 1:
            continue
        session_yr, lsr, _seq, employee, primary = line.strip().split("|")

        if session_yr == session and lsr in self.bills:
            sp_type = "primary" if primary == "1" else "cosponsor"
            try:
                self.bills[lsr].add_sponsorship(
                    classification=sp_type,
                    name=self.legislators[employee]["name"],
                    entity_type="person",
                    primary=True if sp_type == "primary" else False,
                )
                # NOTE(review): extras is reassigned for every sponsor row,
                # so only the last sponsor's seat code survives on the bill
                # — confirm this is intended (an update() may be meant).
                self.bills[lsr].extras = {
                    "_code": self.legislators[employee]["seat"]
                }
            except KeyError:
                # sponsor row references an employee number we never
                # loaded; log and keep going (best-effort)
                self.warning("Error, can't find person %s" % employee)

    # actions
    for line in (
        self.get("http://gencourt.state.nh.us/dynamicdatafiles/Docket.txt")
        .content.decode("utf-8")
        .split("\n")
    ):
        if len(line) < 1:
            continue
        # a few blank/irregular lines, irritating
        if "|" not in line:
            continue
        (session_yr, lsr, timestamp, bill_id, body, action, _) = line.split("|")
        if session_yr == session and lsr in self.bills:
            actor = "lower" if body == "H" else "upper"
            # NOTE(review): %p only influences parsing with %I (12-hour);
            # paired with %H the AM/PM token is effectively ignored.  Only
            # the date portion is used below, so this looks harmless here,
            # but confirm against the timestamp format in Docket.txt.
            time = dt.datetime.strptime(timestamp, "%m/%d/%Y %H:%M:%S %p")
            action = action.strip()
            atype = classify_action(action)
            self.bills[lsr].add_action(
                chamber=actor,
                description=action,
                date=time.strftime("%Y-%m-%d"),
                classification=atype,
            )
            # docket text can reference an amendment; attach it as a document
            amendment_id = extract_amendment_id(action)
            if amendment_id:
                self.bills[lsr].add_document_link(
                    note="amendment %s" % amendment_id,
                    url=AMENDMENT_URL % amendment_id,
                )

    yield from self.scrape_votes(session)

    # save all bills
    for bill in self.bills:
        # bill.add_source(zip_url)
        self.add_source(self.bills[bill], bill, session)
        yield self.bills[bill]
def scrape(self, chamber, session):
    """Scrape all bills for one chamber of a New Hampshire session
    (older scraper interface: save_bill / add_sponsor / add_version).

    Downloads the legislature's pipe-delimited dump files (LSRs,
    legislators, sponsors, docket), builds Bill objects keyed by LSR
    number, and saves each one.  Sessions before 2017 are delegated
    wholesale to NHLegacyBillScraper.
    """
    if int(session) < 2017:
        legacy = NHLegacyBillScraper(self.metadata, self.output_dir, self.strict_validation)
        legacy.scrape(chamber, session)
        # This throws an error because object_count isn't being properly incremented,
        # even though it saves fine. So fake the output_names
        self.output_names = ['1']
        return

    # bill basics
    self.bills = {}  # LSR->Bill
    self.bills_by_id = {}  # need a second table to attach votes
    self.versions_by_lsr = {}  # mapping of bill ID to lsr
    self.amendments_by_lsr = {}

    # pre load the mapping table of LSR -> version id
    self.scrape_version_ids()
    self.scrape_amendments()

    # LSRs.txt rows are pipe-delimited; a well-formed row has 36 fields.
    # Rows occasionally wrap across two physical lines, so we buffer the
    # previous short row in last_line and try to rejoin.
    last_line = []
    for line in self.get(
            'http://gencourt.state.nh.us/dynamicdatafiles/LSRs.txt'
    ).content.split("\n"):
        line = line.split('|')
        if len(line) < 1:
            continue
        if len(line) < 36:
            if len(last_line + line[1:]) == 36:
                # combine two lines for processing
                # (skip an empty entry at beginning of second line)
                # NOTE(review): the length check uses line[1:] but the
                # combine uses the full line, leaving an extra field at
                # the join — fields 0-10 read below appear to come from
                # last_line, so this may be benign; confirm.
                line = last_line + line
                self.warning('used bad line')
            else:
                # skip this line, maybe we'll use it later
                self.warning('bad line: %s' % '|'.join(line))
                last_line = line
                continue

        session_yr = line[0]
        lsr = line[1]
        title = line[2]
        body = line[3]
        type_num = line[4]  # read but unused
        expanded_bill_id = line[9]
        bill_id = line[10]

        # Only keep rows for the requested chamber + session year.
        if body == body_code[chamber] and session_yr == session:
            # Classify from the expanded id prefix; fall back to the
            # bill_type_map keyed on the id's letters past the first one.
            if expanded_bill_id.startswith('CACR'):
                bill_type = 'constitutional amendment'
            elif expanded_bill_id.startswith('PET'):
                bill_type = 'petition'
            elif expanded_bill_id.startswith('AR') and bill_id.startswith(
                    'CACR'):
                bill_type = 'constitutional amendment'
            else:
                bill_type = bill_type_map[expanded_bill_id.split(' ')[0][1:]]

            # Strip a leading parenthesized prefix, e.g. "(New Title) ..."
            if title.startswith('('):
                title = title.split(')', 1)[1].strip()

            self.bills[lsr] = Bill(session, chamber, bill_id, title,
                                   type=bill_type)
            # http://www.gencourt.state.nh.us/bill_status/billText.aspx?sy=2017&id=95&txtFormat=html
            if lsr in self.versions_by_lsr:
                version_id = self.versions_by_lsr[lsr]
                version_url = 'http://www.gencourt.state.nh.us/bill_status/' \
                              'billText.aspx?sy={}&id={}&txtFormat=html' \
                              .format(session, version_id)
                self.bills[lsr].add_version('latest version', version_url,
                                            mimetype='text/html',
                                            on_duplicate='use_new')
            # http://gencourt.state.nh.us/bill_status/billtext.aspx?sy=2017&txtFormat=amend&id=2017-0464S
            if lsr in self.amendments_by_lsr:
                amendment_id = self.amendments_by_lsr[lsr]
                amendment_url = 'http://www.gencourt.state.nh.us/bill_status/' \
                                'billText.aspx?sy={}&id={}&txtFormat=amend' \
                                .format(session, amendment_id)
                amendment_name = 'Amendment #{}'.format(amendment_id)
                self.bills[lsr].add_version(amendment_name, amendment_url,
                                            mimetype='application/pdf',
                                            on_duplicate='use_new')
            self.bills_by_id[bill_id] = self.bills[lsr]

    # load legislators (employee number -> name/seat), used to resolve
    # sponsor rows below
    self.legislators = {}
    for line in self.get(
            'http://gencourt.state.nh.us/dynamicdatafiles/legislators.txt'
    ).content.split("\n"):
        if len(line) < 1:
            continue
        line = line.split('|')
        employee_num = line[0]
        # first, last, middle
        if line[3]:
            name = '%s %s %s' % (line[2], line[3], line[1])
        else:
            name = '%s %s' % (line[2], line[1])
        self.legislators[employee_num] = {'name': name, 'seat': line[5]}
        #body = line[4]

    # sponsors: exactly 5 pipe-delimited fields expected per row
    for line in self.get(
            'http://gencourt.state.nh.us/dynamicdatafiles/LsrSponsors.txt'
    ).content.split("\n"):
        if len(line) < 1:
            continue
        session_yr, lsr, seq, employee, primary = line.strip().split('|')

        if session_yr == session and lsr in self.bills:
            sp_type = 'primary' if primary == '1' else 'cosponsor'
            try:
                self.bills[lsr].add_sponsor(
                    sp_type,
                    self.legislators[employee]['name'],
                    _code=self.legislators[employee]['seat'])
            except KeyError:
                # sponsor row references an employee number we never
                # loaded; log and keep going (best-effort)
                self.warning("Error, can't find person %s" % employee)

    # actions
    for line in self.get(
            'http://gencourt.state.nh.us/dynamicdatafiles/Docket.txt'
    ).content.split("\n"):
        if len(line) < 1:
            continue
        # a few blank/irregular lines, irritating
        if '|' not in line:
            continue
        (session_yr, lsr, timestamp, bill_id, body, action, _) = line.split('|')
        if session_yr == session and lsr in self.bills:
            actor = 'lower' if body == 'H' else 'upper'
            # NOTE(review): %p only influences parsing with %I (12-hour);
            # paired with %H the AM/PM token is effectively ignored, so PM
            # timestamps may parse with a morning hour — confirm against
            # the timestamp format in Docket.txt since the full datetime
            # is passed to add_action here.
            time = dt.datetime.strptime(timestamp, '%m/%d/%Y %H:%M:%S %p')
            action = action.strip()
            atype = classify_action(action)
            self.bills[lsr].add_action(actor, action, time, type=atype)
            # docket text can reference an amendment; attach it as a document
            amendment_id = extract_amendment_id(action)
            if amendment_id:
                self.bills[lsr].add_document('amendment %s' % amendment_id,
                                             AMENDMENT_URL % amendment_id)

    self.scrape_votes(session)

    # save all bills
    for bill in self.bills:
        #bill.add_source(zip_url)
        self.add_source(self.bills[bill], bill, session)
        self.save_bill(self.bills[bill])
def scrape_chamber(self, chamber, session):
    """Scrape all bills for one chamber of a New Hampshire session.

    Downloads the legislature's pipe-delimited dump files (LSRs,
    legislators, sponsors, docket), builds Bill objects keyed by LSR
    number, attaches versions/amendments/sponsors/actions, then yields
    the bills (and vote events via scrape_votes).  Sessions before 2017
    are delegated wholesale to NHLegacyBillScraper.
    """
    if int(session) < 2017:
        legacy = NHLegacyBillScraper(self.metadata, self.datadir)
        yield from legacy.scrape(chamber, session)
        # This throws an error because object_count isn't being properly incremented,
        # even though it saves fine. So fake the output_names
        self.output_names = ['1']
        return

    # bill basics
    self.bills = {}  # LSR->Bill
    self.bills_by_id = {}  # need a second table to attach votes
    self.versions_by_lsr = {}  # mapping of bill ID to lsr
    self.amendments_by_lsr = {}

    # pre load the mapping table of LSR -> version id
    self.scrape_version_ids()
    self.scrape_amendments()

    # LSRs.txt rows are pipe-delimited; a well-formed row has 36 fields.
    # Rows occasionally wrap across two physical lines, so we buffer the
    # previous short row in last_line and try to rejoin.
    last_line = []
    for line in self.get('http://gencourt.state.nh.us/dynamicdatafiles/LSRs.txt') \
            .content.decode('utf-8').split("\n"):
        line = line.split('|')
        if len(line) < 1:
            continue
        if len(line) < 36:
            if len(last_line + line[1:]) == 36:
                # combine two lines for processing
                # (skip an empty entry at beginning of second line)
                # NOTE(review): the length check uses line[1:] but the
                # combine uses the full line, leaving an extra field at
                # the join — fields 0-10 read below appear to come from
                # last_line, so this may be benign; confirm.
                line = last_line + line
                self.warning('used bad line')
            else:
                # skip this line, maybe we'll use it later
                self.warning('bad line: %s' % '|'.join(line))
                last_line = line
                continue

        session_yr = line[0]
        lsr = line[1]
        title = line[2]
        body = line[3]
        # type_num = line[4]
        expanded_bill_id = line[9]
        bill_id = line[10]

        # Only keep rows for the requested chamber + session year.
        if body == body_code[chamber] and session_yr == session:
            # Classify from the expanded id prefix; fall back to the
            # bill_type_map keyed on the id's letters past the first one.
            if expanded_bill_id.startswith('CACR'):
                bill_type = 'constitutional amendment'
            elif expanded_bill_id.startswith('PET'):
                bill_type = 'petition'
            elif expanded_bill_id.startswith('AR') and bill_id.startswith('CACR'):
                bill_type = 'constitutional amendment'
            else:
                bill_type = bill_type_map[expanded_bill_id.split(' ')[0][1:]]

            # Strip a leading parenthesized prefix, e.g. "(New Title) ..."
            if title.startswith('('):
                title = title.split(')', 1)[1].strip()

            self.bills[lsr] = Bill(legislative_session=session,
                                   chamber=chamber,
                                   identifier=bill_id,
                                   title=title,
                                   classification=bill_type)
            # http://www.gencourt.state.nh.us/bill_status/billText.aspx?sy=2017&id=95&txtFormat=html
            if lsr in self.versions_by_lsr:
                version_id = self.versions_by_lsr[lsr]
                version_url = 'http://www.gencourt.state.nh.us/bill_status/' \
                              'billText.aspx?sy={}&id={}&txtFormat=html' \
                              .format(session, version_id)
                self.bills[lsr].add_version_link(note='latest version',
                                                 url=version_url,
                                                 media_type='text/html')
            # http://gencourt.state.nh.us/bill_status/billtext.aspx?sy=2017&txtFormat=amend&id=2017-0464S
            if lsr in self.amendments_by_lsr:
                amendment_id = self.amendments_by_lsr[lsr]
                amendment_url = 'http://www.gencourt.state.nh.us/bill_status/' \
                                'billText.aspx?sy={}&id={}&txtFormat=amend' \
                                .format(session, amendment_id)
                amendment_name = 'Amendment #{}'.format(amendment_id)
                self.bills[lsr].add_version_link(note=amendment_name,
                                                 url=amendment_url,
                                                 media_type='application/pdf')
            self.bills_by_id[bill_id] = self.bills[lsr]

    # load legislators (employee number -> name/seat), used to resolve
    # sponsor rows below
    self.legislators = {}
    for line in self.get('http://gencourt.state.nh.us/dynamicdatafiles/legislators.txt') \
            .content.decode('utf-8').split("\n"):
        if len(line) < 1:
            continue
        line = line.split('|')
        employee_num = line[0]
        # first, last, middle
        if line[3]:
            name = '%s %s %s' % (line[2], line[3], line[1])
        else:
            name = '%s %s' % (line[2], line[1])
        self.legislators[employee_num] = {'name': name, 'seat': line[5]}
        # body = line[4]

    # sponsors: exactly 5 pipe-delimited fields expected per row
    for line in self.get('http://gencourt.state.nh.us/dynamicdatafiles/LsrSponsors.txt') \
            .content.decode('utf-8').split("\n"):
        if len(line) < 1:
            continue
        session_yr, lsr, seq, employee, primary = line.strip().split('|')

        if session_yr == session and lsr in self.bills:
            sp_type = 'primary' if primary == '1' else 'cosponsor'
            try:
                self.bills[lsr].add_sponsorship(classification=sp_type,
                                                name=self.legislators[employee]['name'],
                                                entity_type='person',
                                                primary=True if sp_type == 'primary' else False)
                # NOTE(review): extras is reassigned for every sponsor row,
                # so only the last sponsor's seat code survives on the bill
                # — confirm this is intended (an update() may be meant).
                self.bills[lsr].extras = {'_code': self.legislators[employee]['seat']}
            except KeyError:
                # sponsor row references an employee number we never
                # loaded; log and keep going (best-effort)
                self.warning("Error, can't find person %s" % employee)

    # actions
    for line in self.get('http://gencourt.state.nh.us/dynamicdatafiles/Docket.txt') \
            .content.decode('utf-8').split("\n"):
        if len(line) < 1:
            continue
        # a few blank/irregular lines, irritating
        if '|' not in line:
            continue
        (session_yr, lsr, timestamp, bill_id, body, action, _) = line.split('|')
        if session_yr == session and lsr in self.bills:
            actor = 'lower' if body == 'H' else 'upper'
            # NOTE(review): %p only influences parsing with %I (12-hour);
            # paired with %H the AM/PM token is effectively ignored.  Only
            # the date portion is used below, so this looks harmless here,
            # but confirm against the timestamp format in Docket.txt.
            time = dt.datetime.strptime(timestamp, '%m/%d/%Y %H:%M:%S %p')
            action = action.strip()
            atype = classify_action(action)
            self.bills[lsr].add_action(chamber=actor,
                                       description=action,
                                       date=time.strftime("%Y-%m-%d"),
                                       classification=atype)
            # docket text can reference an amendment; attach it as a document
            amendment_id = extract_amendment_id(action)
            if amendment_id:
                self.bills[lsr].add_document_link(note='amendment %s' % amendment_id,
                                                  url=AMENDMENT_URL % amendment_id)

    yield from self.scrape_votes(session)

    # save all bills
    for bill in self.bills:
        # bill.add_source(zip_url)
        self.add_source(self.bills[bill], bill, session)
        yield self.bills[bill]