def parse_availability_html(self, campusparking_avail_html):
    results = []
    lot_spots = None
    try:
        campus_lot_soup = BeautifulSoup(campusparking_avail_html)
        # grab every row of the campus availability table
        lot_rows = campus_lot_soup.find(
            'table',
            {'id': 'ctl00_ctl00_central_block_right_navi_cnt_gvName'}).findAll('tr')
        # loop over the table rows, starting with the 2nd row (skips the header row)
        for row_index in range(1, len(lot_rows)):
            # grab the array of cells in the current row
            table_cells = lot_rows[row_index].findAll('td')
            short_name = table_cells[1].string.split(' ')[0].strip()
            spots_cell = table_cells[2].string
            if spots_cell is not None and spots_cell.strip().isdigit():
                lot_spots = spots_cell.strip()
            lot_details = {
                'shortName': short_name,
                'openSpots': int(lot_spots)
            }
            results.append(lot_details)
        logging.debug(json.dumps(results))
    except ValueError:
        # cannot parse the HTML, perhaps due to a markup change
        logging.error('ValueError parsing scraped content from campus parking page.')
        raise
    except AttributeError:
        # HTML doesn't include the expected elements
        logging.error('AttributeError parsing scraped content from campus parking page.')
        raise
    except TypeError:
        # HTML is probably None
        logging.error('TypeError parsing scraped content from campus parking page.')
        raise
    except IndexError:
        # a row is missing the expected cells
        logging.error('IndexError parsing scraped content from campus parking page.')
        raise
    return results
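# A minimal, standalone sketch (not part of the scraper) of the table walk the
# campus parser performs. The HTML fragment, lot name, and count are invented;
# only the table id matches the real page. Assumes BeautifulSoup 3, as the bare
# BeautifulSoup(html) calls above suggest.
def _demo_campus_table_walk():
    from BeautifulSoup import BeautifulSoup

    sample_html = ('<table id="ctl00_ctl00_central_block_right_navi_cnt_gvName">'
                   '<tr><th>#</th><th>Lot</th><th>Open</th></tr>'
                   '<tr><td>1</td><td>LOT20 Grainger</td><td>42</td></tr>'
                   '</table>')
    rows = BeautifulSoup(sample_html).find(
        'table',
        {'id': 'ctl00_ctl00_central_block_right_navi_cnt_gvName'}).findAll('tr')
    for row in rows[1:]:  # skip the header row, as in parse_availability_html
        cells = row.findAll('td')
        print cells[1].string.split(' ')[0], cells[2].string  # LOT20 42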
def parse_special_events_html(self, special_events_html):
    special_events = dict()
    special_events['specialEvents'] = []
    if not special_events_html:
        return special_events
    try:
        soup = BeautifulSoup(special_events_html)
        # special_event_rows is the array of <tr>s in the calendar table
        special_event_rows = soup.find('table', {'id': 'calendar'}).findAll('tr')
        # loop over the table rows, starting with the 3rd row (skips the 2 header rows)
        for row_index in range(2, len(special_event_rows)):
            # table_cells is the array of cells in the current row
            table_cells = special_event_rows[row_index].findAll('td')
            parking_location = table_cells[1].string
            event_venue = table_cells[4].string
            event = table_cells[3].string
            event_time, parking_end_time, parking_start_time = \
                self.parse_special_event_datetimes(table_cells)
            # add this special event's info to the specialEvents collection
            special_events['specialEvents'].append({
                'parkingLocation': parking_location,
                'eventVenue': event_venue,
                'eventDatetime': event_time,
                'eventName': event,
                'parkingStartDatetime': parking_start_time,
                'parkingEndDatetime': parking_end_time,
                'webUrl': self.parking_data['special_events_url']
            })
    except (ValueError, AttributeError, TypeError, IndexError) as e:
        # unlike availability, we swallow this error: availability data is
        # still useful without special events
        logging.error('Error parsing scraped content from city special events page. ' + str(e))
        special_events['specialEvents'] = []
    return special_events
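# A standalone sketch (not part of the parser) of the calendar-table walk above:
# skip the two header rows, then read cells by position. The HTML fragment and
# values are invented; only the table id and cell positions mirror the parser.
def _demo_city_calendar_walk():
    from BeautifulSoup import BeautifulSoup  # BeautifulSoup 3, as used above

    sample_html = ('<table id="calendar">'
                   '<tr><th>City Special Events</th></tr>'
                   '<tr><th>Date</th><th>Lot</th><th>Time</th><th>Event</th><th>Venue</th></tr>'
                   '<tr><td>7/4</td><td>State St Lot</td><td>7 pm</td>'
                   '<td>Fireworks</td><td>Capitol</td></tr>'
                   '</table>')
    rows = BeautifulSoup(sample_html).find('table', {'id': 'calendar'}).findAll('tr')
    for row in rows[2:]:  # skip the 2 header rows, as in parse_special_events_html
        cells = row.findAll('td')
        print cells[1].string, cells[3].string, cells[4].string  # State St Lot Fireworks Capitol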
def parse_availability_html(self, availability_html):
    results = []
    lot_spots = None
    try:
        city_lot_soup = BeautifulSoup(availability_html)
        # get all children of the availability div whose class name starts with dataRow
        lot_rows = city_lot_soup.find('div', {'id': 'availability'})\
            .findAll('div', {'class': re.compile('^dataRow')})
        if not lot_rows:
            # no rows means the page layout changed; treat it as a parse failure
            raise ValueError
        for row in lot_rows:
            for detail in row:
                if detail.string is not None and detail.string.isdigit():
                    lot_spots = detail.string
            lot_details = {
                'name': row.div.a.string,
                'openSpots': int(lot_spots)
            }
            results.append(lot_details)
        logging.debug(json.dumps(results))
    except ValueError:
        # cannot parse the HTML, perhaps due to a markup change
        logging.error('ValueError parsing scraped content from city parking page.')
        raise
    except AttributeError:
        # HTML doesn't include the expected elements
        logging.error('AttributeError parsing scraped content from city parking page.')
        raise
    except TypeError:
        # HTML is probably None
        logging.error('TypeError parsing scraped content from city parking page.')
        raise
    return results
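# A standalone sketch (invented HTML, hypothetical lot names) of the dataRow
# matching used above: find the divs whose class attribute begins with
# "dataRow", then pull out the lot name anchor and the numeric count.
def _demo_city_datarow_match():
    import re
    from BeautifulSoup import BeautifulSoup

    sample_html = ('<div id="availability">'
                   '<div class="dataRowEven"><div><a href="#">Overture</a></div>'
                   '<div>17</div></div>'
                   '<div class="dataRowOdd"><div><a href="#">State St</a></div>'
                   '<div>8</div></div>'
                   '</div>')
    lot_rows = BeautifulSoup(sample_html).find('div', {'id': 'availability'})\
        .findAll('div', {'class': re.compile('^dataRow')})
    for row in lot_rows:
        print row.div.a.string, row.findAll('div')[-1].string
    # prints "Overture 17" then "State St 8"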
def post(self):
    try:
        scrapeURL = self.request.get('crawl')
        direction = self.request.get('direction')
        routeID = self.request.get('routeID')
        logging.debug("task scraping for %s, direction %s, route %s" %
                      (scrapeURL, direction, routeID))

        # fetch the page, retrying up to three times on download errors
        loop = 0
        done = False
        result = None
        while not done and loop < 3:
            try:
                result = urlfetch.fetch(scrapeURL)
                done = True
            except urlfetch.DownloadError:
                logging.info("Error loading page (%s)... sleeping" % loop)
                if result:
                    logging.debug("Error status: %s" % result.status_code)
                    logging.debug("Error header: %s" % result.headers)
                    logging.debug("Error content: %s" % result.content)
                time.sleep(4)
                loop += 1
        if result is None:
            # every fetch attempt failed; bail out rather than parse nothing
            logging.error("unable to fetch %s after %s attempts" % (scrapeURL, loop))
            return

        # start to interrogate the results
        soup = BeautifulSoup(result.content)
        for slot in soup.html.body.findAll("a", "ada"):
            logging.info("pulling out data from page... %s" % slot)
            if slot.has_key('href'):
                href = slot['href']
                title = slot['title']
                logging.info("FOUND A TITLE ----> %s" % title)
                # the route crawler looks for titles that contain an ID# string
                if title.find("#") > 0:
                    # we finally got down to the page we're looking for;
                    # pull the stopID and the intersection from the title
                    stopID = title.split("#")[1].split("]")[0]
                    intersection = title.split("[")[0].strip()
                    logging.info("found stop %s, %s" % (stopID, intersection))
                    # check for conflicts...
                    stop = db.GqlQuery(
                        "SELECT * FROM StopLocation WHERE stopID = :1",
                        stopID).get()
                    if stop is None:
                        logging.error("Missing stop %s which should be impossible" % stopID)
                    # pull the route and direction data from the URL
                    routeData = scrapeURL.split('?')[1]
                    logging.info("FOUND THE PAGE ---> arguments: %s stopID: %s" %
                                 (routeData, stopID))
                    routeArgs = routeData.split('&')
                    routeID = routeArgs[0].split('=')[1]
                    directionID = routeArgs[1].split('=')[1]
                    timeEstimatesURL = CRAWL_URLBASE + href
                    # check for conflicts...
                    r = db.GqlQuery(
                        "SELECT * FROM RouteListing WHERE route = :1 AND direction = :2 AND stopID = :3",
                        routeID, directionID, stopID).get()
                    if r is None:
                        # add the new route to the DB
                        route = RouteListing()
                        route.route = routeID
                        route.direction = directionID
                        route.stopID = stopID
                        route.scheduleURL = timeEstimatesURL
                        route.stopLocation = stop
                        route.put()
                        logging.info("added new route listing entry to the database!")
                    else:
                        logging.error("we found a duplicate entry!?! %s", r.scheduleURL)
                elif href.find("?r=") > -1:
                    # an intermediate page: queue a new crawl task for this link
                    crawlURL = CRAWL_URLBASE + href
                    if routeID == '00':
                        routeID = href.split('r=')[1]
                    elif href.find("&") > -1:
                        routeID = href.split('&')[0].split('r=')[1]
                    task = Task(url='/crawl/routelist/crawlingtask',
                                params={'crawl': crawlURL,
                                        'direction': title,
                                        'routeID': routeID})
                    task.add('crawler')
                    logging.info("Added new task for %s, direction %s, route %s" %
                                 (title.split(",")[0], title, routeID))
                # the label crawler looks for purely alphabetic titles for
                # extraction and persistence
                if title.replace('-', '').replace(' ', '').isalpha():
                    logging.info("found the route LABEL page! href: %s" % href)
                    routeData = href.split('?')[1]
                    routeArgs = routeData.split('&')
                    directionID = routeArgs[1].split('=')[1]
                    l = DestinationListing.get_or_insert(title,
                                                         id=directionID,
                                                         label=title)
    except apiproxy_errors.DeadlineExceededError:
        logging.error("DeadlineExceededError exception!?")
        return
    return
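# A standalone sketch (hypothetical title and URL) of the string surgery the
# crawler performs on each link: the stop ID and intersection come from the
# anchor title, the route and direction from the scraped page's query string.
def _demo_link_parsing():
    title = 'Johnson & Bassett [ID#0071]'
    scrapeURL = 'http://example.com/routelist?r=04&d=12'  # made-up URL shape

    stopID = title.split("#")[1].split("]")[0]  # '0071'
    intersection = title.split("[")[0].strip()  # 'Johnson & Bassett'
    routeData = scrapeURL.split('?')[1]         # 'r=04&d=12'
    routeArgs = routeData.split('&')
    routeID = routeArgs[0].split('=')[1]        # '04'
    directionID = routeArgs[1].split('=')[1]    # '12'
    print stopID, intersection, routeID, directionID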
def parse_special_events_html(self, special_events_html):
    special_events = dict()
    special_events['specialEvents'] = []
    if not special_events_html:
        return special_events
    try:
        soup = BeautifulSoup(special_events_html)
        event_tables = soup.findAll('table', {'class': 'event-item'})
        for event_table in event_tables:
            rows = event_table.findAll('tr')
            event_time = ''
            event_date = ''
            lot_num_array = []
            event_name = ''
            for row_index in range(0, 3):
                if row_index == 0:
                    # header row: "<date>: <event name>"
                    header_content = rows[row_index].find('th').string
                    header_array = header_content.split(':')
                    event_date = header_array[0]
                    event_name = header_array[1].replace(' ', '')
                elif row_index == 1:
                    # time row
                    cells = rows[row_index].findAll('td')
                    cell_content = cells[1].string
                    event_time_arr = cell_content.split(' ', 2)
                    event_time = event_time_arr[0] + \
                        event_time_arr[1].replace('.', '').upper()
                elif row_index == 2:
                    # lots row
                    cells = rows[row_index].findAll('td')
                    cell_content = cells[1].string
                    lot_num_array = cell_content.replace(' ', '').split(',')
                    # strip leading 0's out of the lot number array
                    for index, item in enumerate(lot_num_array):
                        lot_num_array[index] = \
                            self.strip_leading_zeros_from_short_name(item)
            try:
                # the most brittle part of the parse
                if len(event_time) == 1:
                    event_time = '0' + event_time
                event_datetime_tmp = event_date + ' ' + event_time
                event_datetime_str = datetime.datetime.strptime(
                    event_datetime_tmp,
                    '%m/%d/%Y %I:%M%p').strftime('%Y-%m-%dT%H:%M:%S')
            except ValueError:
                logging.error('Error parsing campus special event date')
                event_datetime_str = None
            # rather than exclude properties the UW lots don't currently provide,
            # emit them as None; they appear in the JSON as "property": null,
            # which is easy for client JS to detect
            special_event = {
                'eventName': event_name,
                'parkingLocations': lot_num_array,
                'eventDatetime': event_datetime_str,
                'parkingStartDatetime': None,
                'parkingEndDatetime': None,
                'eventVenue': None,
                'webUrl': self.parking_data['special_events_url']
            }
            special_events['specialEvents'].append(special_event)
    except (ValueError, AttributeError, TypeError, IndexError) as e:
        # unlike availability, we swallow this error: availability data is
        # still useful without special events
        logging.error('Error parsing scraped content from campus special events page. ' + str(e))
        special_events['specialEvents'] = []
    return special_events
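# A standalone sketch (hypothetical date and time strings) of the datetime
# normalization at the end of the campus parse: stitch the scraped date and
# time together, then re-emit the result as ISO 8601.
def _demo_event_datetime_normalization():
    import datetime

    event_date = '10/15/2011'                   # as scraped from the header row
    event_time_arr = '7:00 p.m.'.split(' ', 2)  # as scraped from the time row
    event_time = event_time_arr[0] + event_time_arr[1].replace('.', '').upper()
    print datetime.datetime.strptime(
        event_date + ' ' + event_time,
        '%m/%d/%Y %I:%M%p').strftime('%Y-%m-%dT%H:%M:%S')  # 2011-10-15T19:00:00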