Пример #1
0
    def scrape_senate_comittee_data(self, committee_elements, link):
        """Create an upper-chamber ``Committee`` for each listed committee.

        ``committee_elements`` is walked three elements at a time (via
        ``grouper``): the committee element, a type element that is ignored
        here, and the president element.

        :param committee_elements: elements to be consumed in groups of three.
        :param link: URL added as a source on every committee built.

        NOTE(review): the committee objects are neither saved nor returned in
        the code visible here -- confirm whether a save call is missing.
        """
        senate_root = 'http://senadopr.us'
        element_triples = grouper(3, committee_elements)

        for name_cell, _type_cell, president_cell in element_triples:
            # iterlinks() yields tuples; the item at index 2 is the link
            # target, which is appended to the site root.
            first_link = name_cell.iterlinks().next()
            committee_url = senate_root + first_link[2]

            committee_name = name_cell.text_content()
            president_name = president_cell.text_content()

            committee = Committee(
                'upper', committee_name, president=president_name)
            committee.add_source(committee_url)
            committee.add_source(link)
Пример #2
0
    def scrape_senate_comittee_data(self, committee_elements, link):
        """Build one upper-chamber ``Committee`` per group of three elements.

        The elements are grouped with ``grouper(3, ...)``; within each group
        the first element supplies the committee name and its first link, the
        second (a type element) is unused, and the third supplies the
        president's name.

        :param committee_elements: elements to be consumed in groups of three.
        :param link: URL recorded as a second source on each committee.

        NOTE(review): nothing visible here saves or returns the committees --
        confirm whether that step lives outside this snippet.
        """
        site_prefix = 'http://senadopr.us'

        for group in grouper(3, committee_elements):
            committee_el, _type_el, president_el = group

            # Index 2 of the first iterlinks() tuple is the link target.
            relative_link = committee_el.iterlinks().next()[2]
            committee_link = site_prefix + relative_link

            title = committee_el.text_content()
            head = president_el.text_content()

            committee = Committee('upper', title, president=head)
            for source_url in (committee_link, link):
                committee.add_source(source_url)
Пример #3
0
    def scrape(self, chamber, session):
        """Submit the bill search form and build a ``Bill`` for each result.

        For every (body code, bill-type code) pair the search form found at
        ``bill_search_url`` is submitted.  The tables of the result page are
        read in groups of three (actions, complete data, bill data); the bill
        data cells give the title, description and authors, and the actions
        table gives dated actions plus optional document links.

        :param chamber: chamber label passed to ``Bill`` and ``add_action``.
        :param session: session identifier; ``year_from_session(session)``
            selects which date bound is set on the search form.

        NOTE(review): only the 'upper' body code is searched regardless of
        ``chamber`` -- confirm this is intended.
        NOTE(review): the bills are neither saved nor returned in the code
        visible here -- confirm whether a save call is missing.
        """
        import logging

        logger = logging.getLogger(__name__)

        bill_search_url = 'http://www.camaraderepresentantes.org/cr_buscar.asp'
        bill_types = {
            'Project': 'P',
            'Resolution': 'R',
            'Joint Resolution': 'RC',
            'Concurrent Resolution': 'RK',
            'Appointment': 'N',
        }
        # Only the upper body is searched; the lower body's code would be 'C'.
        bodies = {'upper': 'S'}

        bill_search_page = lxml.html.parse(bill_search_url).getroot()
        search_form = bill_search_page.forms[0]

        # Loop-invariant: the session year decides which date bound is set.
        session_year = year_from_session(session)

        for body in bodies.values():
            for bill_type in bill_types.values():
                search_form.fields['cuerpo'] = body
                search_form.fields['tipo'] = bill_type
                search_form.fields['autor'] = 'NA'

                if session_year == '2009':
                    search_form.fields['f2'] = '12/31/2009'
                elif session_year == '2010':
                    search_form.fields['f1'] = '01/01/2010'

                result = lxml.html.parse(
                    lxml.html.submit_form(search_form)).getroot()
                table_elements = result.cssselect('table')
                # The trailing table is dropped before grouping (presumably
                # not bill data -- TODO confirm against the result page).
                table_elements.pop()
                bill_elements = grouper(3, table_elements)

                for actions, complete_data, bill_data in bill_elements:
                    # Cell 3 holds a date that is not used here.
                    td_elements = bill_data.cssselect('td')
                    title = td_elements[1].text_content()
                    description = td_elements[5].text_content()
                    authors = td_elements[7].text_content().split('/')

                    bill = Bill(session, chamber, title, description)

                    # A lone author is the primary sponsor; when several are
                    # listed, all of them are recorded as cosponsors.
                    if len(authors) == 1:
                        sponsor_type = 'primary'
                    else:
                        sponsor_type = 'cosponsor'
                    for author in authors:
                        bill.add_sponsor(sponsor_type, author)

                    action_cells = actions.cssselect('td')[4:-1]

                    for date_cell, action, _ in grouper(3, action_cells):
                        # Strip non-breaking spaces before parsing the date.
                        date_text = date_cell.text_content().replace(
                            u'\xa0', u'')
                        action_date = dt.datetime.strptime(
                            date_text, '%m/%d/%Y')
                        action_text = action.text_content()

                        # Attaching a document version is best effort: an
                        # action without a link, or whose link contains
                        # 'voto', simply gets no version.
                        try:
                            first_link = next(action.iterlinks(), None)
                            if first_link is not None:
                                doc_link_part = first_link[2]
                                if 'voto' not in doc_link_part:
                                    doc_link = doc_link_url(doc_link_part)
                                    # NOTE(review): a single tuple is passed;
                                    # confirm this matches add_version's
                                    # signature.
                                    bill.add_version((action_text, doc_link))
                        except Exception:
                            # Unexpected failures are logged instead of being
                            # silently discarded; scraping continues.
                            logger.warning(
                                'Could not attach a version for action %r',
                                action_text, exc_info=True)

                        bill.add_action(chamber, action_text, action_date)
Пример #4
0
    def scrape(self, chamber, session):
        """Run the bill search for each body/type pair and build ``Bill``s.

        The search form at ``bill_search_url`` is filled in and submitted once
        per (body code, bill-type code) combination.  Result tables are taken
        three at a time (actions, complete data, bill data): the bill data
        cells provide title, description and authors, while the actions table
        provides dated actions and optional document links.

        :param chamber: chamber label passed to ``Bill`` and ``add_action``.
        :param session: session identifier; ``year_from_session(session)``
            selects which date bound is set on the search form.

        NOTE(review): only the 'upper' body code is searched regardless of
        ``chamber`` -- confirm this is intended.
        NOTE(review): the bills are neither saved nor returned in the code
        visible here -- confirm whether a save call is missing.
        """
        import logging

        logger = logging.getLogger(__name__)

        bill_search_url = 'http://www.camaraderepresentantes.org/cr_buscar.asp'
        bill_types = {
            'Project': 'P',
            'Resolution': 'R',
            'Joint Resolution': 'RC',
            'Concurrent Resolution': 'RK',
            'Appointment': 'N',
        }
        # Only the upper body is searched; the lower body's code would be 'C'.
        bodies = {'upper': 'S'}

        bill_search_page = lxml.html.parse(bill_search_url).getroot()
        search_form = bill_search_page.forms[0]

        # Loop-invariant: computed once instead of on every iteration.
        session_year = year_from_session(session)

        for body in bodies.values():
            for bill_type in bill_types.values():
                search_form.fields['cuerpo'] = body
                search_form.fields['tipo'] = bill_type
                search_form.fields['autor'] = 'NA'

                if session_year == '2009':
                    search_form.fields['f2'] = '12/31/2009'
                elif session_year == '2010':
                    search_form.fields['f1'] = '01/01/2010'

                response = lxml.html.submit_form(search_form)
                result = lxml.html.parse(response).getroot()
                table_elements = result.cssselect('table')
                # The trailing table is dropped before grouping (presumably
                # not bill data -- TODO confirm against the result page).
                table_elements.pop()
                bill_elements = grouper(3, table_elements)

                for actions, complete_data, bill_data in bill_elements:
                    # Cell 3 holds a date that is not used here.
                    td_elements = bill_data.cssselect('td')
                    title = td_elements[1].text_content()
                    description = td_elements[5].text_content()
                    authors = td_elements[7].text_content().split('/')

                    bill = Bill(session, chamber, title, description)

                    # A lone author is the primary sponsor; when several are
                    # listed, all of them are recorded as cosponsors.
                    if len(authors) == 1:
                        sponsor_type = 'primary'
                    else:
                        sponsor_type = 'cosponsor'
                    for author in authors:
                        bill.add_sponsor(sponsor_type, author)

                    action_cells = actions.cssselect('td')[4:-1]

                    for date_cell, action, _ in grouper(3, action_cells):
                        # Strip non-breaking spaces before parsing the date.
                        date_text = date_cell.text_content().replace(
                            u'\xa0', u'')
                        action_date = dt.datetime.strptime(
                            date_text, '%m/%d/%Y')
                        action_text = action.text_content()

                        # Attaching a document version is best effort: an
                        # action without a link, or whose link contains
                        # 'voto', simply gets no version.
                        try:
                            first_link = next(action.iterlinks(), None)
                            if first_link is not None:
                                doc_link_part = first_link[2]
                                if 'voto' not in doc_link_part:
                                    doc_link = doc_link_url(doc_link_part)
                                    # NOTE(review): a single tuple is passed;
                                    # confirm this matches add_version's
                                    # signature.
                                    bill.add_version((action_text, doc_link))
                        except Exception:
                            # Unexpected failures are logged instead of being
                            # silently discarded; scraping continues.
                            logger.warning(
                                'Could not attach a version for action %r',
                                action_text, exc_info=True)

                        bill.add_action(chamber, action_text, action_date)