Пример #1
0
 def _load_data(self, session):
     """Return the raw bill and action data for the year of ``session``.

     Both downloads are performed at most once per session year; the
     result is cached in ``self.rawdataByYear`` and reused afterwards.

     :param session: session identifier, converted to a year with
         ``year_from_session``.
     :returns: a two-item tuple holding what ``self.urlopen`` returned
         for the bill URL and for the action URL, in that order.
     """
     sessionYear = year_from_session(session)
     # ``dict.has_key`` does not exist on Python 3; ``in`` works on both
     # Python 2 and Python 3.
     if sessionYear not in self.rawdataByYear:
         # Read the clock once so both URLs are resolved against the same
         # "current" year, even if the call straddles New Year.
         currentYear = dt.date.today().year
         url = self._resolve_ftp_url(sessionYear, currentYear)
         actionUrl = self._resolve_action_ftp_url(sessionYear, currentYear)
         self.rawdataByYear[sessionYear] = (self.urlopen(url),
                                            self.urlopen(actionUrl))
     return self.rawdataByYear[sessionYear]
Пример #2
0
 def _load_data(self, session):
     """Fetch (or reuse) the raw bill/action downloads for a session.

     The pair of downloads is cached in ``self.rawdataByYear`` under the
     session's year, so the URLs are only opened the first time a given
     year is requested.

     :param session: session identifier; ``year_from_session`` turns it
         into the cache key.
     :returns: tuple ``(bill_data, action_data)`` as returned by
         ``self.urlopen`` for the two resolved URLs.
     """
     sessionYear = year_from_session(session)
     # Use the ``in`` operator instead of the Python-2-only
     # ``dict.has_key`` so the check also works on Python 3.
     if sessionYear not in self.rawdataByYear:
         # A single date lookup keeps both URLs consistent with each other.
         currentYear = dt.date.today().year
         url = self._resolve_ftp_url(sessionYear, currentYear)
         actionUrl = self._resolve_action_ftp_url(sessionYear,
                                                  currentYear)
         self.rawdataByYear[sessionYear] = (self.urlopen(url),
                                            self.urlopen(actionUrl))
     return self.rawdataByYear[sessionYear]
Пример #3
0
    def scrape(self, chamber, session):
        """Feed every bill line of the session's data to ``_parse_bill``.

        Loads the bill and action data through ``_load_data``, stores the
        grouped actions on ``self.actionsByBill``, then hands each line of
        the bill data except the first one to ``self._parse_bill``.

        :param chamber: passed through unchanged to ``_parse_bill``.
        :param session: session identifier used to resolve the source URL
            and to load the data.
        """
        session_year = year_from_session(session)
        this_year = dt.date.today().year
        source_url = self._resolve_ftp_url(session_year, this_year)

        bill_data, action_data = self._load_data(session)
        self.actionsByBill = self.parse_actions_and_group(action_data)

        for position, raw_line in enumerate(bill_data.split("\n")):
            # The first line is never parsed (presumably a header row).
            if position == 0:
                continue
            self._parse_bill(session, chamber, source_url, raw_line.strip())
Пример #4
0
    def scrape(self, chamber, session):
        """Parse the session's bill data line by line.

        The action data returned by ``_load_data`` is grouped with
        ``parse_actions_and_group`` and kept on ``self.actionsByBill``;
        afterwards every bill-data line after the first is stripped and
        passed to ``self._parse_bill`` together with the source URL.

        :param chamber: forwarded as-is to ``_parse_bill``.
        :param session: session identifier for URL resolution and loading.
        """
        year_of_session = year_from_session(session)
        year_now = dt.date.today().year
        source_url = self._resolve_ftp_url(year_of_session, year_now)

        loaded = self._load_data(session)
        (bills_text, actions_text) = loaded
        self.actionsByBill = self.parse_actions_and_group(actions_text)

        remaining = iter(bills_text.split("\n"))
        # Discard the first line (presumably a header row) before parsing.
        next(remaining, None)
        for entry in remaining:
            self._parse_bill(session, chamber, source_url, entry.strip())
Пример #5
0
 def resolve_search_params(self, session, bill_id):
     """Build the search-form parameters for one bill.

     :param session: session identifier, converted to a year with
         ``year_from_session``.
     :param bill_id: bill identifier of the form ``"<chamber> <number>"``,
         separated by exactly one space; the number may carry leading
         zeros.
     :returns: a dict with the keys ``'lookin'``, ``'lookfor'``,
         ``'number'`` and ``'submit'``, or ``None`` when the session's
         year has no entry in ``self.years_to_lookin``.
     :raises ValueError: if ``bill_id`` does not split into exactly two
         parts or the number part is not an integer.
     """
     year = year_from_session(session)
     # ``dict.has_key`` was removed in Python 3; ``in`` behaves the same
     # and works on both major versions.
     if year not in self.years_to_lookin:
         return None

     (chamber, number) = bill_id.split(" ")
     number = str(int(number))  # remove leading zeros
     return {
         'lookin': self.years_to_lookin[year],
         'lookfor': chamber.lower(),
         'number': number,
         'submit': 'Search'
     }
Пример #6
0
 def resolve_search_params(self, session, bill_id):
     """Return the query parameters used to search for ``bill_id``.

     :param session: session identifier; ``year_from_session`` maps it to
         the key looked up in ``self.years_to_lookin``.
     :param bill_id: ``"<chamber> <number>"`` with a single space between
         the two parts (leading zeros in the number are dropped).
     :returns: dict containing ``'lookin'``, ``'lookfor'``, ``'number'``
         and ``'submit'``; ``None`` if the year is not present in
         ``self.years_to_lookin``.
     :raises ValueError: when ``bill_id`` is not two space-separated
         parts, or its number part cannot be parsed as an integer.
     """
     year = year_from_session(session)
     # Membership test with ``in`` replaces the Python-2-only
     # ``dict.has_key`` so this also runs on Python 3.
     if year not in self.years_to_lookin:
         return None

     (chamber, number) = bill_id.split(" ")
     number = str(int(number))  # remove leading zeros
     return {
         'lookin'  : self.years_to_lookin[year],
         'lookfor' : chamber.lower(),
         'number'  : number,
         'submit'  : 'Search'
     }
Пример #7
0
    def scrape(self, chamber, session):
        bills_link = bills_url()
        bills_sessions_pages = []

        with self.urlopen(bills_link) as bills_page_html:
            bills_page = lxml.html.fromstring(bills_page_html)
            for element, attribute, link, pos in bills_page.iterlinks():
                match = re.search("..(/measures[0-9]{2}s?.html)", link)
                if match != None:
                    bills_sessions_pages.append(base_url() + match.group(1))

        year = year_from_session(session)

        shortened_year = int(year) % 100

        if shortened_year == 00:
            return

        pages_for_year = []

        for bsp in bills_sessions_pages:
            if str(shortened_year) in bsp:
                pages_for_year.append(bsp)

        measure_pages = []
        bill_pages_directory = []

        for pfy in pages_for_year:
            with self.urlopen(pfy) as year_bills_page_html:
                year_bills_page = lxml.html.fromstring(year_bills_page_html)
                for element, attribute, link, pos in year_bills_page.iterlinks():
                    if chamber == 'upper':
                        link_part = 'senmh'
                    else:
                        link_part = 'hsemh'

                    regex = "([0-9]{2}(reg|ss[0-9]))/pubs/" + link_part + ".(html|txt)"
                    match = re.search(regex, link)

                    if match != None:
                        measure_pages.append(base_url() + match.group(0))
                        bill_pages_directory.append(base_url() + match.group(1) + "/measures/main.html")

        bill_pages = []

        for bp in bill_pages_directory:
            with self.urlopen(bp) as bills_page_html:
                bills_page = lxml.html.fromstring(bills_page_html)
                for element, attribute, link, pos in bills_page.iterlinks():
                    if re.search(' +.html +', link)!= None:
                        continue

                    base_link = bp.rstrip('main.html')

                    if chamber == 'upper':
                        if link[0] == 's':
                            bill_pages.append(base_link + link.translate(None, '\n'))
                    else:
                        if link[0] == 'h':
                            bill_pages.append(base_link + link.translate(None, '\n'))

        # Remove unnecesary link
        bill_pages.pop(0)

        bills_dict = {}

        for bp in bill_pages:
            with self.urlopen(bp) as bills_page_html:
                bills_page = lxml.html.fromstring(bills_page_html)
                bills = bills_page.cssselect('a')
                for b in bills:
                    bill_description = b.text_content()
                    title, sep, version = bill_description.partition('-')
                    splitted_title = title.split()
                    bill_number = splitted_title[-1]
                    splitted_title.pop(-1)
                    initials = ''
                    for t in splitted_title:
                        initials += t[0]

                    key = initials + ' ' + bill_number.lstrip('0')
                    link = b.iterlinks().next()[2]

                    try:
                        bills_dict[key]
                    except KeyError:
                        bills_dict[key] = []

                    bills_dict[key].append((version, base_link + link))

        if chamber == 'upper':
            markers = ('SB', 'SR', 'SJR', 'SJM', 'SCR', 'SM')
        else:
            markers = ('HB', 'HR', 'HJR', 'HJM', 'HCR', 'JM')

        bill_info = {}

        for mp in measure_pages:
            with self.urlopen(mp) as measure_page_html:
                measure_page = lxml.html.fromstring(measure_page_html)
                measures = measure_page.text_content()
                lines = measures.split('\n')

                raw_date = ''
                action_party = ''
                key = ''
                text = ''
                actions = []
                first_bill = True

                for line in lines:
                    date_match = re.search('([0-9]{1,2}-[0-9]{1,2})(\((S|H)\))? ', line)

                    marker_in_line = False
                    for marker in markers:
                        if marker in line[0:2]:
                            marker_in_line = True
                            break

                    if marker_in_line:
                        if not first_bill:
                            value = bill_info[key]
                            date = dt.datetime.strptime(raw_date + '-' + year, '%m-%d-%Y')
                            actions.append((date, action_party, self.clean_space(text)))
                            value.append(actions)
                            actions = []

                        else:
                            first_bill = False

                        new_bill = True
                        regex = marker + ' +[0-9]{1,4}'
                        key_match = re.search(regex, line)
                        if key_match == None:
                            print line
                            print regex
                        key  = self.clean_space(key_match.group(0))
                        text = line.split(key)[1]

                    elif date_match != None:
                        if new_bill:
                            bill_info[key] = [self.clean_space(text)]
                            print self.clean_space(text)
                            print
                            new_bill = False

                        else:
                            date = dt.datetime.strptime(raw_date + '-' + year, '%m-%d-%Y')
                            actions.append((date, action_party, self.clean_space(text)))

                        raw_date = date_match.group(1)
                        action_party = date_match.group(2)
                        text = line.split(date_match.group(0))[1]

                    elif line.isspace():
                        continue

                    elif '---' in line:
                        continue

                    else:
                        text = text + ' ' + line