Example #1
 def validate(self, data):
     "Check that the page has a track list or an empty-library message."
     soup = BeautifulSoup(data)
     tracks_table = soup.find("table", "candyStriped tracklist")
     empty_message = soup.find("span", "messageWrapper")
     
     if not tracks_table and not empty_message:
         raise InvalidPage()
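This validate() hook (like the similar ones in the examples below) follows one pattern: parse the fetched HTML and raise InvalidPage when an expected marker element is missing. A minimal sketch of how a crawler might drive such a hook; the get_valid_page name, the retry loop and max_retries are assumptions, not part of the original code.

 def get_valid_page(self, url, max_retries=3):
     "Fetch url, retrying while validate() rejects the page (hypothetical helper)."
     for attempt in range(max_retries):
         data = self.get_url(url)
         try:
             self.validate(data)
             return data
         except InvalidPage:
             # Truncated or error page; fetch it again.
             continue
     raise InvalidPage()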
Example #2
 def validate(self, data):
     "Check that the page has one of the known profile vcard variants."
     soup = BeautifulSoup(data)
     if not soup.find('div', 'clearit user vcard') and \
        not soup.find('div', 'clearit subscriber vcard') and \
        not soup.find('div', 'clearit staff vcard') and \
        not soup.find('div', 'clearit moderator vcard'):
         raise InvalidPage()
Example #3
 def get_user_groups(self, user):
     "Return the names of every group the given user belongs to."
     cur_page = 1
     lastpage = 1
     groups = []
     log = logging.getLogger("GroupRetrievers")
     # the usernames were extracted from the anchor tags from the
     # user search html pages -- those usernames are already
     # percent-encoded.
     while cur_page <= lastpage:
         log.info("Retrieving page %i of %i", cur_page, lastpage)
         url = self.GROUP_URL_TEMPLATE % (user, cur_page)
         data = self.get_url(url)
         soup = BeautifulSoup(data)
         groups_html = soup.findAll("div", "groupContainer")
         if groups_html:
             for group in groups_html:
                 group_link = group.find("a")
                 if group_link:
                     group_name = group_link['href'].split('/')[-1]
                     groups.append(fixURLContent(group_name.encode("utf-8")))
             # Group information can be split across several pages.
             # Get the number of pages.
             if cur_page == 1:
                 last_page_html = soup.find("a", "lastpage")
                 if last_page_html:
                     lastpage = int(last_page_html.contents[0])
                 else:
                     lastpage = 1
         cur_page += 1
     return groups
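The group names appended above are passed through fixURLContent, which is not shown in these examples. Assuming it only normalizes the percent-encoded bytes mentioned in the comment, a minimal sketch could be:

 import urllib

 def fixURLContent(name):
     "Percent-decode a URL path segment (a guess at the helper not shown here)."
     # e.g. 'rock%2Broll' -> 'rock+roll'
     return urllib.unquote(name)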
Example #4
 def validate(self, page):
     "Check if this search result has any user at all."
     soup = BeautifulSoup(page)
     try:
         users_vcards = soup.find("ul", "usersMedium").findAll("div", "vcard")
         users = [vcard.a['href'].split('/')[-1] for vcard in users_vcards]
     except (AttributeError, KeyError, TypeError):
         # find() returned None, or a vcard is missing its anchor/href.
         raise InvalidPage()
     if not users:
         raise InvalidPage()
Example #5
 def get_users_from_search_page(self, sex, page):
     "Returns a list whith the users found in a search page."
     page_url = FINDUSERS_URL_TEMPLATE % (sex, page)
     data = self.get_url(page_url)
     soup = BeautifulSoup(data)
     # Each user has a vcard whose first and only Anchor tag is a
     # link for the user profile in LastFM, in the form
     # http://www.lastfm.com.br/user/USERNAME. All we want is the USERNAME
     # part of such links.
     users_vcards = soup.find("ul", "usersMedium").findAll("div", "vcard")
     users = [vcard.a['href'].split('/')[-1] for vcard in users_vcards]
     return users
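The split('/')[-1] idiom relies on the profile-link format described in the comment above:

 >>> href = "http://www.lastfm.com.br/user/USERNAME"
 >>> href.split('/')[-1]
 'USERNAME'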
Example #6
 def get_user(self, username):
     "Get the parsed user profile page."
     url = self.USER_URL_TEMPLATE % (username)
     try:
         html = self.get_url(url)
     except urllib2.HTTPError, e:
         if e.code == 404:
             page = BeautifulSoup(e.read())
             text = ""
             try:
                 text = str(page.find("div", {"id": "fourOhFour"}).h1)
             except AttributeError:
                 # The 404 page lacks the expected headline div.
                 pass
             raise PageNotFound(text)
         else:
             # Propagate the original error.
             raise
     # Success path (assumed; the snippet otherwise falls off the end
     # returning None, despite the docstring).
     return html
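The PageNotFound exception raised above carries the headline of Last.fm's 404 page, but its definition is not among these examples. A minimal definition consistent with this usage (assumed, not from the original code) would be:

 class PageNotFound(Exception):
     "Raised when a profile page answers with HTTP 404 (assumed definition)."
     def __init__(self, text=""):
         Exception.__init__(self, text)
         self.text = text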
Example #7
    def get_library(self, username, time_threshold, today=None):
        """Get the parsed user library.
        
        Args:
            username: LastFM's username
            time_threshold: We will ignore musics listened
                before this date. Must be a in representing seconds
                since epoch.
            today: Overwrite our notion of time just to ease testing.
        """

        cur_page = 1
        lastpage = 1
        listened_tracks = []
        reached_previous_snapshot = False
        log = logging.getLogger("LibrarySnapshotsRetriever")
        if not today:
            today = datetime.date.today()
        else:
            today = datetime.date.fromtimestamp(today)

        listened_date_threshold = datetime.date.fromtimestamp(time_threshold)

        while cur_page <= lastpage and not reached_previous_snapshot:
            log.info("Retrieving page %i of %i", cur_page, lastpage)
            url = self.LIBRARY_URL_TEMPLATE % (username, cur_page)
            data = self.get_url(url)
            soup = BeautifulSoup(data)
            tracks_table = soup.find("table", "candyStriped tracklist")
            
            # The tracks table may not exist on the first page.
            if not tracks_table and cur_page == 1:
                return (listened_tracks, time.mktime(today.timetuple()))

            tracks = tracks_table.findAll("tr")

            for track in tracks:
                # Parse artist and track name from the subject cell.
                track_artist_name = track.find("td", "subjectCell").findAll("a")
                artist = track_artist_name[0].contents[0]
                track_name = track_artist_name[1].contents[0]
                # Parse date and time from the abbr tag's title.
                raw_listened_date = track.find("abbr")["title"]
                parsed_date_time = self.parse_date_time(raw_listened_date)
                (year, month, day, hour, minute, second) = parsed_date_time
                # Discard tracks already crawled and tracks listened to today.
                listened_date = datetime.date(year, month, day)
                if listened_date_threshold > listened_date:
                    # Found a track we already retrieved. Past this point
                    # every track is already known, so there is no point
                    # in crawling any further.
                    reached_previous_snapshot = True # break out of the while...
                    break # get out of the for loop
                if listened_date_threshold <= listened_date < today:
                    # a time_tuple contains (year, month, day, hour, minute, 
                    #                        second, weekday, yearday,
                    #                        daylightSavingAdjustment)
                    time_tuple = (year, month, day, hour, minute, second,
                                  -1, -1, -1)
                    # convert time_tuple to seconds since epoch
                    listened_date = time.mktime(time_tuple)
                    
                    listened_tracks.append((artist, track_name, listened_date))

            # Track information can be split across several pages.
            # Get the number of pages. There is no "lastpage" link when
            # the library fits in a single page.
            if cur_page == 1:
                last_page_html = soup.find("a", "lastpage")
                if last_page_html:
                    lastpage = int(last_page_html.contents[0])

            cur_page += 1
        
        return (listened_tracks, time.mktime(today.timetuple()))
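The time_tuple built above has the 9-field struct_time layout; the trailing -1 values mark the weekday, year-day and DST fields as unknown, letting mktime work them out. A small worked example:

 import time

 # (year, month, day, hour, minute, second, weekday, yearday, dst)
 time_tuple = (2011, 5, 3, 14, 22, 0, -1, -1, -1)
 # Convert the tuple to seconds since epoch, in local time.
 seconds = time.mktime(time_tuple)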
Example #8
    def parse_user_data(self, username, data):
        "Parse the user profile page."
        soup = BeautifulSoup(data)

        log = logging.getLogger("UserInfoRetriever")
        log.info("BEGIN")

        details = soup.find('div', 'clearit user vcard')

        if not details:
            details = soup.find('div', 'clearit subscriber vcard')
        if not details:
            details = soup.find('div', 'clearit staff vcard')
        if not details:
            details = soup.find('div', 'clearit moderator vcard')
        if not details:
            raise InvalidPage() # we should NOT get here, because validate() runs first

        # User's name
        name = ""
        try:
            name = details.find('strong', 'fn').contents[0]
        except AttributeError:
            pass

        # Get the html with the average play count, "user since"
        # and total executions information.
        details_html = details.find('span', 'userPlays') 

        # average play count per day
        average = details_html.attrs[1][1].split()[2] 

        # User since... (user_since)
        user_since = None
        reseted_date = None
        dates = details_html.findAll('small')

        if dates:
            if len(dates) == 1:
                # The user has already listened to something...
                # the page shows "desde dd mon YYYY"
                day_month_year_text = dates[0].contents[0].strip()
                # Is this a reset profile? If it is, the page carries
                # both the reset date and the user_since date.
                if day_month_year_text[-1] == ")":
                    # The reset date is where we expected the user_since
                    # date to be...
                    reseted_date_text = day_month_year_text[:-1]
                    reseted_date = self.parse_registered_since(reseted_date_text)
                    # Get the "real" user since date.
                    day_month_year_text = details_html.contents[0].strip()
            elif len(dates) == 2:
                day_month_year_text = dates[0].contents[0].strip()
                # Remove the parentheses from the text.
                reseted_date_text = dates[1].contents[0].strip()[1:-1]
                # Convert the date text to a python date object.
                reseted_date = self.parse_registered_since(reseted_date_text)
        else:
            # The user only registered and never listened to any track.
            # The page shows "Registrado em: dd mon YYYY".
            day_month_year_text = details_html.contents[0]
        # Get only the "dd mon YYYY"
        user_since = self.parse_registered_since(day_month_year_text)

        # Executions
        flips = details_html.findAll('span', 'flip')
        digits = []
        for d in flips:
            digits.append(d.contents[0])
        executions = "".join(digits)

        # Homepage
        homepage = ""
        homepage_html = details.find('a', 'url homepage')
        if homepage_html and len(homepage_html.contents) > 0:
            homepage = homepage_html.contents[0]

        # Get user info -- Country, Age, Gender
        country, age, gender = "", "", ""
        extra_details = details.find("p", "userInfo adr")
        if extra_details:
            # Country
            country = ''
            country_html = extra_details.find("span", "country-name")
            if country_html:
                country = country_html.contents[0]
            # Gender and age -- Remove all children tags - get only gender
            # and age text, if it exists
            # NOTICE: this modifies the page structure,
            #         should be the last step...
            for child in extra_details.findAll():
                child.extract()
            if extra_details.contents:
                # Get gender and age using regular expressions...
                gender_age_text = extra_details.contents[0]
                gender_age_text = gender_age_text.replace(",", "").strip()
                res = self.AGE_GENDER_RE.match(gender_age_text)
                age, gender = res.groups()
                if not gender:
                    gender = ""
                if not age:
                    age = ""

        # Convert everything to plain strings -- any "odd" data is
        # percent-encoded...
        log.info("END")
        res = (username, name, age, gender, country, executions, average,
                homepage, user_since )
        if reseted_date:
            res = (username, name, age, gender, country, executions, average,
                    homepage, user_since, reseted_date)
        return tuple([str(i) for i in res])
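parse_user_data assumes self.AGE_GENDER_RE.match(...).groups() yields an (age, gender) pair from text such as "25, Male" once the comma is stripped. The actual pattern is not shown in these examples; one pattern compatible with that usage -- assuming English page text, with both fields optional -- could be:

 import re

 # A guess at the missing pattern: optional age digits, optional gender word.
 AGE_GENDER_RE = re.compile(r"(\d+)?\s*(Male|Female)?")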
Example #9
 def validate(self, data):
     "Check that the page has the 'skyWrap' layout div."
     soup = BeautifulSoup(data)
     if not soup.find('div', 'skyWrap'):
         raise InvalidPage()