Example #1
    def scrape_recurse(self, data, parent=None):
        # Walk the nested outage JSON, attaching each area to its parent Location
        me = Location()
        if 'area_name' in data:
            me.name = data['area_name'].strip()

        if 'total_custs' in data:
            me.total_customers = data['total_custs']

        if 'custs_out' in data:
            out = data['custs_out']
            # 'etrmillis' may be missing; treat that the same as "no estimate"
            etr = data.get('etrmillis', -1)
            if out > 0:
                outage = Outage()
                outage.affected_customers = out
                if etr >= 0:
                    # etr is a millisecond epoch timestamp; negative means no estimate
                    outage.proposed_end_time = datetime.fromtimestamp(etr / 1000.0)

                me.outage = outage

        if 'areas' in data:
            for area in data['areas']:
                self.scrape_recurse(area, me)

        if parent:
            parent.locations.append(me)
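
Both examples assume Location and Outage container classes that are not shown. A minimal sketch of what they might look like, limited to the attributes the code above actually touches (the class bodies themselves are an assumption):

    class Outage(object):
        # Assumed shape: outage details for a single location
        def __init__(self):
            self.affected_customers = 0
            self.proposed_end_time = None   # a datetime, when the feed supplies one

    class Location(object):
        # Assumed shape: one node in the location tree
        def __init__(self):
            self.name = None
            self.total_customers = 0
            self.update_time = None         # set by Example #2
            self.location_level = None      # set by Example #2
            self.outage = None              # set only when customers are out
            self.locations = []             # child Location objects

With classes like these, scrape_recurse can be driven from decoded JSON: create a fresh root = Location(), call scraper.scrape_recurse(json.loads(body), root), and the tree accumulates under root.locations.
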
Example #2
    def scrape(self, url, parent=None):
        # Fetch an outage page, parse its table, and recurse into child pages
        print("Getting:", url)
        soup = self.get_soup(url)
        table = self.extract_table(soup)

        update_time, location_level = self.get_metadata(table)

        if location_level is None:
            return
    
        first_data_row = None
        # Find the first attribute-less <tr> that actually contains data cells
        for row in table.findAll(lambda tag: tag.name == 'tr' and not tag.attrs):
            if not row.findAll('td'):
                continue

            first_data_row = row
            break

        if first_data_row is None:
            # No data rows on this page
            return

        # Get the remaining data rows, then prepend the first
        rows = first_data_row.findNextSiblings('tr')
        rows.insert(0, first_data_row)
        # The last row is junk we don't need
        rows.pop()

        locations = []

        for row in rows:
            loc = Location()
            loc.update_time = update_time
            loc.location_level = location_level
            if parent:
                parent.locations.append(loc)

            cells = row.findAll('td')

            loc.total_customers = int(cells[1].string.replace(',', ''))
            out_customers = int(cells[2].string.replace(',', ''))

            if cells[0].findAll('a'):
                # There's more data here, recurse
                child_url = urljoin(url, cells[0].contents[0]['href'])
                loc.name = cells[0].contents[0].contents[0].string

                self.scrape(child_url, loc)
            else:
                # This is drilled-down as far as we can go, make an outage object
                loc.name = cells[0].string
                outage = Outage()
                outage.affected_customers = out_customers
                try:
                    outage.proposed_end_time = datetime.strptime(cells[3].string, "%b %d, %Y %I:%M %p")
                except (TypeError, ValueError):
                    # Cell is empty or not a parseable date: no proposed end time
                    pass

                loc.outage = outage
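
The helpers get_soup, extract_table, and get_metadata belong to the surrounding class and are not shown. As a hedged sketch of get_soup only, assuming the requests library and BeautifulSoup 4 (the original may just as well use urllib2 and the older BeautifulSoup 3 API):

    import requests
    from bs4 import BeautifulSoup

    def get_soup(self, url):
        # Fetch the page and return it parsed; a sketch under the assumptions
        # above, not the original implementation
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        return BeautifulSoup(response.text, 'html.parser')

Calling scraper.scrape(start_url) then walks the whole hierarchy, building the same kind of Location tree as Example #1.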