def parse_availability_html(self, campusparking_avail_html):
    results = []
    lot_spots = None
    try:
        campus_lot_soup = BeautifulSoup(campusparking_avail_html)
        # grab every row of the campus availability table
        lot_rows = campus_lot_soup.find(
            'table',
            {'id': 'ctl00_ctl00_central_block_right_navi_cnt_gvName'}).findAll('tr')
        # loop over the table rows, starting with the 2nd row (skips the header row)
        for row_index in range(1, len(lot_rows)):
            # grab the array of cells in the current row
            table_cells = lot_rows[row_index].findAll('td')
            short_name = table_cells[1].string.split(' ')[0].strip()
            spots_cell = table_cells[2].string
            if spots_cell is not None and spots_cell.strip().isdigit():
                lot_spots = spots_cell.strip()
            lot_details = {
                'shortName': short_name,
                'openSpots': int(lot_spots)
            }
            results.append(lot_details)
        logging.debug(json.dumps(results))
    except ValueError:
        # cannot parse the HTML, perhaps due to a markup change
        logging.error('ValueError parsing scraped content from campus parking page.')
        raise
    except AttributeError:
        # HTML doesn't include the expected elements
        logging.error('AttributeError parsing scraped content from campus parking page.')
        raise
    except TypeError:
        # HTML is probably None
        logging.error('TypeError parsing scraped content from campus parking page.')
        raise
    except IndexError:
        # a row is missing the expected cells
        logging.error('IndexError parsing scraped content from campus parking page.')
        raise
    return results
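# A minimal, standalone sketch (not part of the scraper) of the table walk the
# campus parser performs. The HTML fragment, lot name, and count are invented;
# only the table id matches the real page. Assumes BeautifulSoup 3, as the bare
# BeautifulSoup(html) calls above suggest.
def _demo_campus_table_walk():
    from BeautifulSoup import BeautifulSoup

    sample_html = ('<table id="ctl00_ctl00_central_block_right_navi_cnt_gvName">'
                   '<tr><th>#</th><th>Lot</th><th>Open</th></tr>'
                   '<tr><td>1</td><td>LOT20 Grainger</td><td>42</td></tr>'
                   '</table>')
    rows = BeautifulSoup(sample_html).find(
        'table',
        {'id': 'ctl00_ctl00_central_block_right_navi_cnt_gvName'}).findAll('tr')
    for row in rows[1:]:  # skip the header row, as in parse_availability_html
        cells = row.findAll('td')
        print cells[1].string.split(' ')[0], cells[2].string  # LOT20 42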
def parse_special_events_html(self, special_events_html):
    special_events = dict()
    special_events['specialEvents'] = []
    if not special_events_html:
        return special_events
    try:
        soup = BeautifulSoup(special_events_html)
        # special_event_rows is the array of <tr>s in the calendar table
        special_event_rows = soup.find('table', {'id': 'calendar'}).findAll('tr')
        # loop over the table rows, starting with the 3rd row (skips the 2 header rows)
        for row_index in range(2, len(special_event_rows)):
            # table_cells is the array of cells in the current row
            table_cells = special_event_rows[row_index].findAll('td')
            parking_location = table_cells[1].string
            event_venue = table_cells[4].string
            event = table_cells[3].string
            event_time, parking_end_time, parking_start_time = \
                self.parse_special_event_datetimes(table_cells)
            # add this special event's info to the specialEvents collection
            special_events['specialEvents'].append({
                'parkingLocation': parking_location,
                'eventVenue': event_venue,
                'eventDatetime': event_time,
                'eventName': event,
                'parkingStartDatetime': parking_start_time,
                'parkingEndDatetime': parking_end_time,
                'webUrl': self.parking_data['special_events_url']
            })
    except (ValueError, AttributeError, TypeError, IndexError) as e:
        # unlike availability, we swallow this error: availability data is
        # still useful without special events
        logging.error('Error parsing scraped content from city special events page. ' + str(e))
        special_events['specialEvents'] = []
    return special_events
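# A standalone sketch (not part of the parser) of the calendar-table walk above:
# skip the two header rows, then read cells by position. The HTML fragment and
# values are invented; only the table id and cell positions mirror the parser.
def _demo_city_calendar_walk():
    from BeautifulSoup import BeautifulSoup  # BeautifulSoup 3, as used above

    sample_html = ('<table id="calendar">'
                   '<tr><th>City Special Events</th></tr>'
                   '<tr><th>Date</th><th>Lot</th><th>Time</th><th>Event</th><th>Venue</th></tr>'
                   '<tr><td>7/4</td><td>State St Lot</td><td>7 pm</td>'
                   '<td>Fireworks</td><td>Capitol</td></tr>'
                   '</table>')
    rows = BeautifulSoup(sample_html).find('table', {'id': 'calendar'}).findAll('tr')
    for row in rows[2:]:  # skip the 2 header rows, as in parse_special_events_html
        cells = row.findAll('td')
        print cells[1].string, cells[3].string, cells[4].string  # State St Lot Fireworks Capitol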
def parse_availability_html(self, availability_html):
    results = []
    lot_spots = None
    try:
        city_lot_soup = BeautifulSoup(availability_html)
        # get all children of the availability div whose class name starts with dataRow
        lot_rows = city_lot_soup.find('div', {'id': 'availability'})\
            .findAll('div', {'class': re.compile('^dataRow')})
        if not lot_rows:
            # no rows means the page layout changed; treat it as a parse failure
            raise ValueError
        for row in lot_rows:
            for detail in row:
                if detail.string is not None and detail.string.isdigit():
                    lot_spots = detail.string
            lot_details = {
                'name': row.div.a.string,
                'openSpots': int(lot_spots)
            }
            results.append(lot_details)
        logging.debug(json.dumps(results))
    except ValueError:
        # cannot parse the HTML, perhaps due to a markup change
        logging.error('ValueError parsing scraped content from city parking page.')
        raise
    except AttributeError:
        # HTML doesn't include the expected elements
        logging.error('AttributeError parsing scraped content from city parking page.')
        raise
    except TypeError:
        # HTML is probably None
        logging.error('TypeError parsing scraped content from city parking page.')
        raise
    return results
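# A standalone sketch (invented HTML, hypothetical lot names) of the dataRow
# matching used above: find the divs whose class attribute begins with
# "dataRow", then pull out the lot name anchor and the numeric count.
def _demo_city_datarow_match():
    import re
    from BeautifulSoup import BeautifulSoup

    sample_html = ('<div id="availability">'
                   '<div class="dataRowEven"><div><a href="#">Overture</a></div>'
                   '<div>17</div></div>'
                   '<div class="dataRowOdd"><div><a href="#">State St</a></div>'
                   '<div>8</div></div>'
                   '</div>')
    lot_rows = BeautifulSoup(sample_html).find('div', {'id': 'availability'})\
        .findAll('div', {'class': re.compile('^dataRow')})
    for row in lot_rows:
        print row.div.a.string, row.findAll('div')[-1].string
    # prints "Overture 17" then "State St 8"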
def post(self):
    try:
        scrapeURL = self.request.get('crawl')
        direction = self.request.get('direction')
        routeID = self.request.get('routeID')
        logging.debug("task scraping for %s, direction %s, route %s" %
                      (scrapeURL, direction, routeID))

        # fetch the page, retrying up to three times on download errors
        loop = 0
        done = False
        result = None
        while not done and loop < 3:
            try:
                result = urlfetch.fetch(scrapeURL)
                done = True
            except urlfetch.DownloadError:
                logging.info("Error loading page (%s)... sleeping" % loop)
                if result:
                    logging.debug("Error status: %s" % result.status_code)
                    logging.debug("Error header: %s" % result.headers)
                    logging.debug("Error content: %s" % result.content)
                time.sleep(4)
                loop += 1
        if result is None:
            # every fetch attempt failed; bail out rather than parse nothing
            logging.error("unable to fetch %s after %s attempts" % (scrapeURL, loop))
            return

        # start to interrogate the results
        soup = BeautifulSoup(result.content)
        for slot in soup.html.body.findAll("a", "ada"):
            logging.info("pulling out data from page... %s" % slot)
            if slot.has_key('href'):
                href = slot['href']
                title = slot['title']
                logging.info("FOUND A TITLE ----> %s" % title)
                # the route crawler looks for titles that contain an ID# string
                if title.find("#") > 0:
                    # we finally got down to the page we're looking for;
                    # pull the stopID and the intersection from the title
                    stopID = title.split("#")[1].split("]")[0]
                    intersection = title.split("[")[0].strip()
                    logging.info("found stop %s, %s" % (stopID, intersection))
                    # check for conflicts...
                    stop = db.GqlQuery(
                        "SELECT * FROM StopLocation WHERE stopID = :1",
                        stopID).get()
                    if stop is None:
                        logging.error("Missing stop %s which should be impossible" % stopID)
                    # pull the route and direction data from the URL
                    routeData = scrapeURL.split('?')[1]
                    logging.info("FOUND THE PAGE ---> arguments: %s stopID: %s" %
                                 (routeData, stopID))
                    routeArgs = routeData.split('&')
                    routeID = routeArgs[0].split('=')[1]
                    directionID = routeArgs[1].split('=')[1]
                    timeEstimatesURL = CRAWL_URLBASE + href
                    # check for conflicts...
                    r = db.GqlQuery(
                        "SELECT * FROM RouteListing WHERE route = :1 AND direction = :2 AND stopID = :3",
                        routeID, directionID, stopID).get()
                    if r is None:
                        # add the new route to the DB
                        route = RouteListing()
                        route.route = routeID
                        route.direction = directionID
                        route.stopID = stopID
                        route.scheduleURL = timeEstimatesURL
                        route.stopLocation = stop
                        route.put()
                        logging.info("added new route listing entry to the database!")
                    else:
                        logging.error("we found a duplicate entry!?! %s", r.scheduleURL)
                elif href.find("?r=") > -1:
                    # an intermediate page: queue a new crawl task for this link
                    crawlURL = CRAWL_URLBASE + href
                    if routeID == '00':
                        routeID = href.split('r=')[1]
                    elif href.find("&") > -1:
                        routeID = href.split('&')[0].split('r=')[1]
                    task = Task(url='/crawl/routelist/crawlingtask',
                                params={'crawl': crawlURL,
                                        'direction': title,
                                        'routeID': routeID})
                    task.add('crawler')
                    logging.info("Added new task for %s, direction %s, route %s" %
                                 (title.split(",")[0], title, routeID))
                # the label crawler looks for purely alphabetic titles for
                # extraction and persistence
                if title.replace('-', '').replace(' ', '').isalpha():
                    logging.info("found the route LABEL page! href: %s" % href)
                    routeData = href.split('?')[1]
                    routeArgs = routeData.split('&')
                    directionID = routeArgs[1].split('=')[1]
                    l = DestinationListing.get_or_insert(title,
                                                         id=directionID,
                                                         label=title)
    except apiproxy_errors.DeadlineExceededError:
        logging.error("DeadlineExceededError exception!?")
        return
    return
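# A standalone sketch (hypothetical title and URL) of the string surgery the
# crawler performs on each link: the stop ID and intersection come from the
# anchor title, the route and direction from the scraped page's query string.
def _demo_link_parsing():
    title = 'Johnson & Bassett [ID#0071]'
    scrapeURL = 'http://example.com/routelist?r=04&d=12'  # made-up URL shape

    stopID = title.split("#")[1].split("]")[0]  # '0071'
    intersection = title.split("[")[0].strip()  # 'Johnson & Bassett'
    routeData = scrapeURL.split('?')[1]         # 'r=04&d=12'
    routeArgs = routeData.split('&')
    routeID = routeArgs[0].split('=')[1]        # '04'
    directionID = routeArgs[1].split('=')[1]    # '12'
    print stopID, intersection, routeID, directionID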
def parse_special_events_html(self, special_events_html):
    special_events = dict()
    special_events['specialEvents'] = []
    if not special_events_html:
        return special_events
    try:
        soup = BeautifulSoup(special_events_html)
        event_tables = soup.findAll('table', {'class': 'event-item'})
        for event_table in event_tables:
            rows = event_table.findAll('tr')
            event_time = ''
            event_date = ''
            lot_num_array = []
            event_name = ''
            for row_index in range(0, 3):
                if row_index == 0:
                    # header row: "<date>: <event name>"
                    header_content = rows[row_index].find('th').string
                    header_array = header_content.split(':')
                    event_date = header_array[0]
                    event_name = header_array[1].replace(' ', '')
                elif row_index == 1:
                    # time row
                    cells = rows[row_index].findAll('td')
                    cell_content = cells[1].string
                    event_time_arr = cell_content.split(' ', 2)
                    event_time = event_time_arr[0] + \
                        event_time_arr[1].replace('.', '').upper()
                elif row_index == 2:
                    # lots row
                    cells = rows[row_index].findAll('td')
                    cell_content = cells[1].string
                    lot_num_array = cell_content.replace(' ', '').split(',')
                    # strip leading 0's out of the lot number array
                    for index, item in enumerate(lot_num_array):
                        lot_num_array[index] = \
                            self.strip_leading_zeros_from_short_name(item)
            try:
                # the most brittle part of the parse
                if len(event_time) == 1:
                    event_time = '0' + event_time
                event_datetime_tmp = event_date + ' ' + event_time
                event_datetime_str = datetime.datetime.strptime(
                    event_datetime_tmp,
                    '%m/%d/%Y %I:%M%p').strftime('%Y-%m-%dT%H:%M:%S')
            except ValueError:
                logging.error('Error parsing campus special event date')
                event_datetime_str = None
            # rather than exclude properties the UW lots don't currently provide,
            # emit them as None; they appear in the JSON as "property": null,
            # which is easy for client JS to detect
            special_event = {
                'eventName': event_name,
                'parkingLocations': lot_num_array,
                'eventDatetime': event_datetime_str,
                'parkingStartDatetime': None,
                'parkingEndDatetime': None,
                'eventVenue': None,
                'webUrl': self.parking_data['special_events_url']
            }
            special_events['specialEvents'].append(special_event)
    except (ValueError, AttributeError, TypeError, IndexError) as e:
        # unlike availability, we swallow this error: availability data is
        # still useful without special events
        logging.error('Error parsing scraped content from campus special events page. ' + str(e))
        special_events['specialEvents'] = []
    return special_events
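# A standalone sketch (hypothetical date and time strings) of the datetime
# normalization at the end of the campus parse: stitch the scraped date and
# time together, then re-emit the result as ISO 8601.
def _demo_event_datetime_normalization():
    import datetime

    event_date = '10/15/2011'                   # as scraped from the header row
    event_time_arr = '7:00 p.m.'.split(' ', 2)  # as scraped from the time row
    event_time = event_time_arr[0] + event_time_arr[1].replace('.', '').upper()
    print datetime.datetime.strptime(
        event_date + ' ' + event_time,
        '%m/%d/%Y %I:%M%p').strftime('%Y-%m-%dT%H:%M:%S')  # 2011-10-15T19:00:00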