Exemplo n.º 1
    def parse_availability_html(self, campusparking_avail_html):
        # Parse the campus parking availability page into per-lot records.
        #
        # campusparking_avail_html: markup handed directly to BeautifulSoup.
        # Returns the `results` list.
        # Raises ValueError, AttributeError, TypeError or IndexError: each
        # handler below re-raises a fresh, message-less instance of the type
        # it caught.
        #
        # NOTE(review): this listing looks truncated. The `try:` that the
        # `except` clauses belong to, the end of the `find(...)` call, the
        # closing brace of `lot_details`, whatever adds `lot_details` to
        # `results`, and the call that receives each log message are not
        # visible. The comments below describe only what is shown.
        results = []
        # NOTE(review): assigned once, outside the row loop, and only
        # overwritten when a row's cell is all digits. A non-numeric row
        # therefore reuses the previous row's value, and int(None) raises
        # TypeError if that happens on the first row.
        lot_spots = None

            campus_lot_soup = BeautifulSoup(campusparking_avail_html)
            # look up the table with the id below; the remainder of this
            # expression is cut off in this listing (presumably it collects
            # the table's rows -- TODO confirm)
            lot_rows = campus_lot_soup.find(
                'table', {
                    'id': 'ctl00_ctl00_central_block_right_navi_cnt_gvName'

            # loop table rows, starting with 2nd row (excludes header row)
            for row_index in range(1, len(lot_rows)):

                # grab the array of cells in the current row
                table_cells = lot_rows[row_index].findAll('td')

                # first space-separated piece of the second cell's text
                short_name = table_cells[1].string.split(' ')[0].strip()

                # NOTE(review): .strip() runs before the None check, so a
                # cell whose .string is None raises AttributeError here and
                # the `is not None` test below can never be False.
                spots_cell = table_cells[2].string.strip()
                if spots_cell is not None and spots_cell.isdigit():
                    lot_spots = spots_cell

                lot_details = {
                    'shortName': short_name,
                    'openSpots': int(lot_spots)


        except ValueError:
            # Cannot parse html perhaps due to html change.
                'ValueError parsing scraped content from campus parking page.')
            raise ValueError

        except AttributeError:
            # HTML doesn't include expected elements
                'AttributeError parsing scraped content from campus parking page.'
            raise AttributeError

        except TypeError:
            # Html is probably None
                'TypeError parsing scraped content from campus parking page.')
            raise TypeError

        except IndexError:
            # presumably a row with fewer cells than the indices read above
            # (1 and 2) -- TODO confirm
                'IndexError parsing scraped content from campus parking page.')
            raise IndexError

        return results
Exemplo n.º 2
    def parse_special_events_html(self, special_events_html):
        # Parse the city special-events page into a dict with one key,
        # 'specialEvents', holding a list.
        #
        # special_events_html: markup handed to BeautifulSoup; a falsy value
        # short-circuits to the empty result.
        # Returns the `special_events` dict. Parsing errors (ValueError,
        # AttributeError, TypeError, IndexError) are caught and the list is
        # reset to empty rather than propagated.
        #
        # NOTE(review): this listing looks truncated. The `try:` for the
        # `except` below, the end of the `find(...)` call, the arguments to
        # `parse_special_event_datetimes`, the statement that adds an event
        # to the list, and the call that receives the log message are not
        # visible. The comments below describe only what is shown.
        special_events = dict()
        special_events['specialEvents'] = []

        # nothing to parse: return the empty structure
        if not special_events_html:
            return special_events

            soup = BeautifulSoup(special_events_html)

            # special_event_rows is array of <tr>'s.
            # (looked up from the table with id 'calendar'; the rest of the
            # expression is cut off in this listing)
            special_event_rows = soup.find('table', {
                'id': 'calendar'
            # loop table rows, starting with 3rd row (excludes 2 header rows)
            for row_index in range(2, len(special_event_rows)):
                # table_cells is array in the current row
                table_cells = special_event_rows[row_index].findAll('td')

                # cell 1 -> parking location, cell 4 -> venue, cell 3 -> event
                parking_location = table_cells[1].string
                event_venue = table_cells[4].string
                event = table_cells[3].string

                # three values are unpacked from the helper's result; its
                # arguments are cut off in this listing
                event_time, parking_end_time, parking_start_time = self.parse_special_event_datetimes(

                # add this special event info to the specialEvents collection

        except (ValueError, AttributeError, TypeError, IndexError) as e:
            # unlike availability, we eat this error. availability is still useful w/out events
                'Error parsing scraped content from city special events page.'
                + str(e))
            # discard anything collected so far
            special_events['specialEvents'] = []

        return special_events
Exemplo n.º 3
Exemplo n.º 4
    def parse_availability_html(self, availability_html):
        # Parse the city parking availability page into per-lot records.
        #
        # availability_html: markup handed directly to BeautifulSoup.
        # Returns the `results` list.
        # Raises ValueError, AttributeError or TypeError: each handler below
        # re-raises a fresh, message-less instance of the type it caught.
        #
        # NOTE(review): this listing looks truncated. The `try:` that the
        # `except` clauses belong to, the closing brace of `lot_details`,
        # whatever adds `lot_details` to `results`, and the call that
        # receives each log message are not visible. The comments below
        # describe only what is shown.
        results = []
        # NOTE(review): assigned once, outside the row loop, and only
        # overwritten when a child's text is all digits. A row without a
        # numeric child therefore reuses the previous row's value, and
        # int(None) raises TypeError if that happens on the first row.
        lot_spots = None

            city_lot_soup = BeautifulSoup(availability_html)
            # get all children of the availability div whose class name starts with dataRow
            lot_rows = city_lot_soup.find('div', {'id': 'availability'})\
                .findAll('div', {'class': re.compile('^dataRow')})

            if not lot_rows:  # if we find no rows, we're dead
                raise ValueError

            for row in lot_rows:
                # scan the row's children; the last one whose text is all
                # digits becomes the spot count
                for detail in row:
                    if detail.string is not None and detail.string.isdigit():
                        lot_spots = detail.string

                # the lot name is the text of the link inside the row's
                # first nested div
                lot_details = {
                    'name': row.div.a.string,
                    'openSpots': int(lot_spots)


        except ValueError:
            # Cannot parse html perhaps due to html change.
                'ValueError parsing scraped content from city parking page.')
            raise ValueError

        except AttributeError:
            # HTML doesn't include expected elements
                'AttributeError parsing scraped content from city parking page.'
            raise AttributeError

        except TypeError:
            # Html is probably None
                'TypeError parsing scraped content from city parking page.')
            raise TypeError

        return results
Exemplo n.º 5
Exemplo n.º 6
Exemplo n.º 7
def getParkingSpecialEvents():
    """Scrape the City of Madison parking special-events calendar.

    Fetches the calendar page, retrying while the fetch raises
    ``urlfetch.DownloadError`` (at most three attempts), reads every row of
    the ``calendar`` table after the two header rows, and serialises the
    collected events to JSON.

    Returns:
        The ``urlfetch`` result object with its ``content`` replaced by a
        JSON document holding ``CacheUntil``, ``ParkingSpecialEvents`` and
        ``LastScraped``; ``None`` when every fetch attempt failed or when a
        ``ValueError`` was raised while parsing the page.
    """
    loop = 0
    done = False
    result = None
    specialeventsurl = 'http://www.cityofmadison.com/parkingUtility/calendar/index.cfm'
    cachehours = 24
    iso_format = '%Y-%m-%dT%H:%M:%S'    # timestamp format emitted in the JSON
    page_format = '%m/%d/%Y%I:%M%p'     # date + time as concatenated from the page

    # initialize the dict to hold result of scrape.
    now = api_utils.getLocalDatetime()
    specialevents = dict()
    specialevents['CacheUntil'] = (now + datetime.timedelta(hours=cachehours)).strftime(iso_format)
    specialevents['ParkingSpecialEvents'] = []
    specialevents['LastScraped'] = now.strftime(iso_format)

    # Looping in case fetch flaky.
    while not done and loop < 3:
        try:
            # grab the city parking html page
            result = urlfetch.fetch(specialeventsurl)

            # invoke soup to parse html
            soup = BeautifulSoup(result.content)

            # find the calendar table containing special event info.
            # returns array of <tr>'s.
            special_event_rows = soup.find("table", {"id": "calendar"}).findAll('tr')

            # loop table rows, starting with 3rd row (excludes 2 header rows)
            for row in special_event_rows[2:]:
                # grab the array of cells in the current row
                table_cells = row.findAll('td')

                event_date = table_cells[0].string
                parkinglocation = table_cells[1].string
                eventvenue = table_cells[4].string
                event = table_cells[3].string

                # take the event time string (already central time), build a
                # datetime from it, then re-emit it in the output format
                event_clock = api_utils.get_time_from_text(table_cells[5].string).replace(' ', '')
                eventtime = datetime.datetime.strptime(
                    event_date + event_clock, page_format).strftime(iso_format)

                # split '00:00 pm - 00:00 pm' into start and end strings and
                # strip whitespace to avoid errors due to inconsistent format
                timeparts = table_cells[2].string.split(' - ')
                start_clock = timeparts[0].replace(' ', '')
                end_clock = timeparts[1].replace(' ', '')

                parkingstarttime = datetime.datetime.strptime(
                    event_date + start_clock, page_format).strftime(iso_format)
                # The end time must be built from the END clock string; it
                # was previously formatted from the start datetime, so every
                # event reported ParkingEndTime == ParkingStartTime.
                parkingendtime = datetime.datetime.strptime(
                    event_date + end_clock, page_format).strftime(iso_format)

                # add this special event info to the ParkingSpecialEvents collection
                specialevents['ParkingSpecialEvents'].append({
                    "ParkingLocation": parkinglocation,
                    "EventVenue": eventvenue,
                    "EventTime": eventtime,
                    "Event": event,
                    "ParkingStartTime": parkingstarttime,
                    "ParkingEndTime": parkingendtime,
                })

            # setting content var to keep contract with caller exactly intact (for now).
            result.content = json.dumps(specialevents)

            done = True

        # problem hitting url, try a few times
        except urlfetch.DownloadError:
            logging.error("Error loading page (%s)... sleeping", loop)
            if result:
                logging.debug("Error status: %s", result.status_code)
                logging.debug("Error header: %s", result.headers)
                logging.debug("Error content: %s", result.content)
            loop = loop + 1

        # This is bad. Some data may be in a different format due to
        # either unexpected data entry or *gulp* site redesign.
        # Likely requires a code change to fix.
        except ValueError:
            logging.error("Error parsing scraped content from (%s)... exiting getParkingSpecialEvents()", specialeventsurl)
            done = True
            result = None
    return result