Example #1
    def scrape_legislator_data(self, url, chamber):
        party_fulls = {'R': 'Republican', 'D': 'Democrat'}
        with self.urlopen(url) as page:
            page = BeautifulSoup(page)
            # Each legislator is one cell of the roster DataList table.
            for data in page.find('table', id='ctl00_mainCopy_DataList1')('td'):
                spans = data('span')
                if len(spans) == 0:
                    self.debug('Found an empty cell in %s. Continuing' % url)
                    continue
                full_name = ' '.join([span.string.strip() for span in spans])
                name_parts = spans[0].string.strip().split()
                if len(name_parts) == 2:
                    first_name, middle_name = name_parts
                else:
                    first_name, middle_name = spans[0].string.strip(), ''
                last_name = spans[1].string.strip()

                details_url = get_abs_url(url, data.find('a')['href'])
                with self.urlopen(details_url) as details:
                    details = BeautifulSoup(details)
                    district = details.find('a', id='ctl00_mainCopy_LegisInfo_DISTRICTLabel').string.strip()
                    party = party_fulls[details.find('span', id='ctl00_mainCopy_LegisInfo_PARTYLabel').string]

                    leg = Legislator('2010', chamber, district, full_name,
                                     first_name, last_name, middle_name, party)
                    leg.add_source(details_url)

                    comms_table = details.find('table', id='ctl00_mainCopy_MembershipGrid')
                    # Skip the header row of the membership grid.
                    for comms_raw_data in comms_table('tr')[1:]:
                        comm_data = comms_raw_data('td')
                        comm_role_type = comm_data[0].string.strip()
                        comm_name = comm_data[1]('a')[0].string.strip()
                        leg.add_role(comm_role_type, '2010', chamber=chamber, committee=comm_name)

                    self.save_legislator(leg)
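
The helper get_abs_url is used throughout these examples but never defined in the excerpts. A minimal sketch of what it presumably does, assuming it simply resolves a (possibly relative) href against the URL of the page it was found on:

    from urlparse import urljoin  # Python 2, matching the era of this code

    def get_abs_url(base_url, rel_url):
        # Resolve a relative href against the page it was scraped from.
        return urljoin(base_url, rel_url)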
Example #2
    def get_doc_data(self, base_url, soup):
        ret_dict = {}

        ret_dict['name'] = soup.find('span').string.strip()

        # 'Final Version' and 'Fiscal Impact Report' pages have no links
        # besides the PDF link, so the second 'a' tag may not exist.
        try:
            ret_dict['url'] = get_abs_url(base_url, soup('a')[1]['href'])
        except IndexError:
            ret_dict['url'] = get_abs_url(base_url, soup.find('a')['href'])

        date = soup.find('font')
        # Check both that the tag exists and that it has text - sometimes the
        # first check passes but the second does not. See
        # http://legis.state.nm.us/lcs/_session.aspx?chamber=H&legtype=B&legno=1&year=03s
        if date and date.string:
            ret_dict['date'] = DATE_RE.match(date.string.strip()).group('date')

        return ret_dict
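
DATE_RE is referenced above but not defined in these excerpts. One plausible definition, assuming the date appears in MM/DD/YY form inside the 'font' tag; the only hard requirement the code imposes is a named group called 'date':

    import re

    # Hypothetical pattern - the real one may differ.
    DATE_RE = re.compile(r'.*?(?P<date>\d{2}/\d{2}/\d{2,4})')

Note that the dict returned by get_doc_data is splatted into bill.add_version(...) and bill.add_document(...) in Examples #3 and #4 below, so its keys (name, url, and optionally date) must match those methods' keyword parameters.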
Example #3
    def scrape(self, chamber, year):
        if year not in metadata['sessions']:
            raise NoDataForPeriod(year)

        start_char = 'S' if chamber == 'upper' else 'H'

        nm_locator_url = 'http://legis.state.nm.us/lcs/locator.aspx'
        with self.urlopen(nm_locator_url) as page:
            page = BeautifulSoup(page)
            # The first `tr` is simply `Bill Locator`. Ignore it.
            data_table = page.find('table', id='ctl00_mainCopy_Locators')('tr')[1:]
            for session in data_table:
                session_tag = session.find('a')
                session_name = ' '.join([tag.string.strip() for tag in session_tag('span')]).strip()

                if year not in session_name:
                    continue

                session_url = get_abs_url(nm_locator_url, session_tag['href'])
                with self.urlopen(session_url) as session_page:
                    session_page = BeautifulSoup(session_page)
                    bills_data_table = session_page.find('table', id='ctl00_mainCopy_LocatorGrid')('tr')[1:]
                    for bill in bills_data_table:
                        data = bill('td')

                        bill_num_link = data[0].find('a')
                        bill_num = ''.join([tag.string.strip() if tag.string else '' for tag in bill_num_link('span')]).strip()
                        # For now, strip the '*' in front of the bill number (* means emergency).
                        bill_num = bill_num[1:] if bill_num.startswith('*') else bill_num
                        if not bill_num.startswith(start_char):
                            self.log('Skipping %s. This bill is not for the relevant chamber %s.' % (bill_num, chamber))
                            continue

                        bill_title = data[1].string.strip()

                        bill_url = get_abs_url(session_url, bill_num_link['href'].replace(' ', ''))

                        bill = Bill(session=session_name, chamber=chamber, bill_id=bill_num, title=bill_title)
                        bill.add_source(bill_url)

                        with self.urlopen(bill_url) as bill_page:
                            bill_page = BeautifulSoup(bill_page)
                            sponsor_data = bill_page.find('table', id='ctl00_mainCopy__SessionFormView')
                            # The last link in this block is the 'Key to Abbreviations' link. Ignore it.
                            for sponsor_link in sponsor_data('a')[:-1]:
                                # There is always one more 'a' tag than needed, and its
                                # 'span' strings are empty - check for that condition.
                                sponsor_name = ' '.join([tag.string.strip() if tag.string else '' for tag in sponsor_link('span')]).strip()
                                if sponsor_name != '':
                                    bill.add_sponsor(type='primary', name=sponsor_name)

                            bill.add_version(**self.get_doc_data(bill_url, bill_page.find('table', id='ctl00_mainCopy_Introduced')))

                            committee_data = bill_page.find('table', id='ctl00_mainCopy_CommReportsList')
                            if committee_data:
                                for comms_data in committee_data('tr'):
                                    bill.add_document(**self.get_doc_data(bill_url, comms_data))

                            fir_data = bill_page.find('table', id='ctl00_mainCopy_FIRs')
                            if fir_data:
                                bill.add_document(**self.get_doc_data(bill_url, fir_data))

                            fin_ver_data = bill_page.find('table', id='ctl00_mainCopy_FinalVersion')
                            if fin_ver_data:
                                bill.add_version(**self.get_doc_data(bill_url, fin_ver_data))

                        self.save_bill(bill)
Example #4
    def scrape(self, chamber, year):
        if year not in metadata['sessions']:
            raise NoDataForPeriod(year)

        start_char = 'S' if chamber == 'upper' else 'H'

        nm_locator_url = 'http://legis.state.nm.us/lcs/locator.aspx'
        with self.urlopen(nm_locator_url) as page:
            page = BeautifulSoup(page)
            # The first `tr` is simply `Bill Locator`. Ignore it.
            data_table = page.find('table',
                                   id='ctl00_mainCopy_Locators')('tr')[1:]
            for session in data_table:
                session_tag = session.find('a')
                session_name = ' '.join([
                    tag.string.strip() for tag in session_tag('span')
                ]).strip()

                if year not in session_name:
                    continue

                session_url = get_abs_url(nm_locator_url, session_tag['href'])
                with self.urlopen(session_url) as session_page:
                    session_page = BeautifulSoup(session_page)
                    bills_data_table = session_page.find(
                        'table', id='ctl00_mainCopy_LocatorGrid')('tr')[1:]
                    for bill in bills_data_table:
                        data = bill('td')

                        bill_num_link = data[0].find('a')
                        bill_num = ''.join([
                            tag.string.strip() if tag.string else ''
                            for tag in bill_num_link('span')
                        ]).strip()
                        # For now, strip the '*' in front of the bill number
                        # (* means emergency).
                        bill_num = bill_num[1:] if bill_num.startswith(
                            '*') else bill_num
                        if not bill_num.startswith(start_char):
                            self.log(
                                'Skipping %s. This bill is not for the relevant chamber %s.'
                                % (bill_num, chamber))
                            continue

                        bill_title = data[1].string.strip()

                        bill_url = get_abs_url(
                            session_url,
                            bill_num_link['href'].replace(' ', ''))

                        bill = Bill(session=session_name,
                                    chamber=chamber,
                                    bill_id=bill_num,
                                    title=bill_title)
                        bill.add_source(bill_url)

                        with self.urlopen(bill_url) as bill_page:
                            bill_page = BeautifulSoup(bill_page)
                            sponsor_data = bill_page.find(
                                'table', id='ctl00_mainCopy__SessionFormView')
                            # The last link in this block is the 'Key to
                            # Abbreviations' link. Ignore it.
                            for sponsor_link in sponsor_data('a')[:-1]:
                                # There is always one more 'a' tag than needed,
                                # and its 'span' strings are empty - check for
                                # that condition.
                                sponsor_name = ' '.join([
                                    tag.string.strip() if tag.string else ''
                                    for tag in sponsor_link('span')
                                ]).strip()
                                if sponsor_name != '':
                                    bill.add_sponsor(type='primary',
                                                     name=sponsor_name)

                            bill.add_version(**self.get_doc_data(
                                bill_url,
                                bill_page.find(
                                    'table', id='ctl00_mainCopy_Introduced')))

                            committee_data = bill_page.find(
                                'table', id='ctl00_mainCopy_CommReportsList')
                            if committee_data:
                                for comms_data in committee_data('tr'):
                                    bill.add_document(**self.get_doc_data(
                                        bill_url, comms_data))

                            fir_data = bill_page.find('table',
                                                      id='ctl00_mainCopy_FIRs')
                            if fir_data:
                                bill.add_document(
                                    **self.get_doc_data(bill_url, fir_data))

                            fin_ver_data = bill_page.find(
                                'table', id='ctl00_mainCopy_FinalVersion')
                            if fin_ver_data:
                                bill.add_version(**self.get_doc_data(
                                    bill_url, fin_ver_data))

                        self.save_bill(bill)
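
The chamber filter in scrape() reduces to a prefix test on the bill number: New Mexico Senate bills start with 'S' and House bills with 'H', with an optional leading '*' marking emergency bills. A self-contained restatement of just that logic, with a function name of my own invention:

    def keep_for_chamber(bill_num, chamber):
        # Mirrors the filter in scrape(): drop a leading emergency '*',
        # then keep only bills whose prefix matches the requested chamber.
        start_char = 'S' if chamber == 'upper' else 'H'
        num = bill_num[1:] if bill_num.startswith('*') else bill_num
        return num.startswith(start_char)

    assert keep_for_chamber('*SB1', 'upper')
    assert not keep_for_chamber('HB2', 'upper')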