def validate(self, data):
    """Check that `data` looks like a library page.

    The page is accepted when it contains either the track list table
    or the "messageWrapper" span (presumably the notice shown for an
    empty library -- confirm against a real page).

    Raises:
        InvalidPage: neither element was found.
    """
    parsed = BeautifulSoup(data)
    track_list = parsed.find("table", "candyStriped tracklist")
    notice = parsed.find("span", "messageWrapper")
    if track_list or notice:
        return
    raise InvalidPage()
def validate(self, data):
    """Check that `data` looks like a user profile page.

    A profile page carries a vcard div whose class depends on the kind
    of account (user, subscriber, staff or moderator).

    Raises:
        InvalidPage: none of the known vcard divs was found.
    """
    parsed = BeautifulSoup(data)
    vcard_classes = (
        'clearit user vcard',
        'clearit subscriber vcard',
        'clearit staff vcard',
        'clearit moderator vcard',
    )
    # Stop at the first vcard flavour that is present.
    for css_class in vcard_classes:
        if parsed.find('div', css_class):
            return
    raise InvalidPage()
def get_user_groups(self, user):
    """Return the names of every group listed for `user`.

    Walks all pages of the user's group listing and collects the last
    path component of each group link, passed through fixURLContent().

    Args:
        user: username as taken from the anchor tags of the user search
            pages, i.e. already percent-encoded.
    """
    log = logging.getLogger("GroupRetrievers")
    collected = []
    total_pages = 1
    page_no = 1

    while page_no <= total_pages:
        log.info("Retrieving page %i of %i", page_no, total_pages)
        html = self.get_url(self.GROUP_URL_TEMPLATE % (user, page_no))
        page = BeautifulSoup(html)

        for container in page.findAll("div", "groupContainer"):
            anchor = container.find("a")
            if not anchor:
                continue
            slug = anchor['href'].split('/')[-1]
            collected.append(fixURLContent(slug.encode("utf-8")))

        # The listing can be split across several pages; the total is
        # only read once, from the first page.
        if page_no == 1:
            last_anchor = page.find("a", "lastpage")
            total_pages = 1
            if last_anchor:
                total_pages = int(last_anchor.contents[0])

        page_no += 1

    return collected
def validate(self, page):
    """Check if this search result has any user at all.

    Raises:
        InvalidPage: the user list is missing, malformed or empty.
    """
    soup = BeautifulSoup(page)
    try:
        users_vcards = soup.find("ul", "usersMedium").findAll("div", "vcard")
        users = [vcard.a['href'].split('/')[-1] for vcard in users_vcards]
    except (AttributeError, KeyError, TypeError):
        # AttributeError: no "usersMedium" list on the page.
        # TypeError: a vcard without an anchor (vcard.a is None).
        # KeyError: an anchor without an href attribute.
        # Anything else (KeyboardInterrupt, real bugs) must propagate
        # instead of being reported as an invalid page.
        raise InvalidPage()
    if not users:
        raise InvalidPage()
def get_users_from_search_page(self, sex, page):
    """Return the list of usernames found in one search result page.

    Args:
        sex: first value substituted into FINDUSERS_URL_TEMPLATE.
        page: second value substituted into FINDUSERS_URL_TEMPLATE.
    """
    html = self.get_url(FINDUSERS_URL_TEMPLATE % (sex, page))
    listing = BeautifulSoup(html).find("ul", "usersMedium")

    # Every user is shown as a vcard whose first anchor links to the
    # profile, e.g. http://www.lastfm.com.br/user/USERNAME; only the
    # trailing USERNAME component is kept.
    usernames = []
    for card in listing.findAll("div", "vcard"):
        profile_href = card.a['href']
        usernames.append(profile_href.split('/')[-1])
    return usernames
def get_user(self, username):
    """Fetch the profile page of `username`.

    NOTE(review): the fetched page is not returned or stored here, so
    this method returns None on success. Confirm whether the caller
    relies on a side effect of get_url() or whether a `return` is
    missing.

    Raises:
        PageNotFound: the server answered 404. The exception carries the
            text of the <h1> inside the "fourOhFour" div when that div
            exists, otherwise an empty string.
        urllib2.HTTPError: any other HTTP error, re-raised unchanged.
    """
    # (username,) rather than (username): a real one-element tuple.
    url = self.USER_URL_TEMPLATE % (username,)
    try:
        self.get_url(url)
    except urllib2.HTTPError as e:
        if e.code != 404:
            # Propagate the original error.
            raise
        page = BeautifulSoup(e.read())
        text = ""
        # The message is best-effort: a 404 body without the expected
        # div still results in PageNotFound, just without a message.
        not_found_box = page.find("div", {"id": "fourOhFour"})
        if not_found_box is not None:
            text = str(not_found_box.h1)
        raise PageNotFound(text)
def get_library(self, username, time_threshold, today=None):
    """Get the parsed user library.

    Walks the library pages and collects the tracks listened on or
    after the threshold date and strictly before `today`. Crawling
    stops at the first track older than the threshold.

    Args:
        username: LastFM's username.
        time_threshold: Tracks listened before this date are ignored.
            Must be a number representing seconds since epoch.
        today: Overrides our notion of the current time (seconds since
            epoch), just to ease testing. A falsy value means "use the
            current date".

    Returns:
        A tuple (listened_tracks, snapshot_time) where listened_tracks
        is a list of (artist, track_name, listened_timestamp) tuples and
        snapshot_time is midnight of `today` in seconds since epoch.
    """
    cur_page = 1
    lastpage = 1
    listened_tracks = []
    reached_previous_snapshot = False
    log = logging.getLogger("LibrarySnapshotsRetriever")

    if not today:
        today = datetime.date.today()
    else:
        today = datetime.date.fromtimestamp(today)
    listened_date_threshold = datetime.date.fromtimestamp(time_threshold)

    while cur_page <= lastpage and not reached_previous_snapshot:
        log.info("Retrieving page %i of %i", cur_page, lastpage)
        url = self.LIBRARY_URL_TEMPLATE % (username, cur_page)
        data = self.get_url(url)
        soup = BeautifulSoup(data)

        tracks_table = soup.find("table", "candyStriped tracklist")
        if tracks_table is None:
            # No track table: an empty library on the first page, or a
            # later page without tracks. Either way there is nothing
            # more to read, so return what was collected so far.
            break

        for track in tracks_table.findAll("tr"):
            # Artist and track name are the first two links of the
            # subject cell.
            track_artist_name = track.find("td", "subjectCell").findAll("a")
            artist = track_artist_name[0].contents[0]
            track_name = track_artist_name[1].contents[0]

            # The listening date and time come from the <abbr> title.
            raw_listened_date = track.find("abbr")["title"]
            parsed_date_time = self.parse_date_time(raw_listened_date)
            (year, month, day, hour, minute, second) = parsed_date_time

            listened_date = datetime.date(year, month, day)
            if listened_date < listened_date_threshold:
                # Found a track from a previous snapshot. Everything
                # past this point is already known, so stop both the
                # row loop and the page loop.
                reached_previous_snapshot = True
                break

            # Tracks listened today are discarded.
            if listened_date < today:
                # A time tuple is (year, month, day, hour, minute,
                # second, weekday, yearday, dst); -1 marks "unknown".
                time_tuple = (year, month, day, hour, minute, second,
                              -1, -1, -1)
                listened_timestamp = time.mktime(time_tuple)
                listened_tracks.append((artist, track_name,
                                        listened_timestamp))

        # Track information can be split across several pages; the
        # page count is read once, from the first page. A library that
        # fits in one page has no "lastpage" link.
        if cur_page == 1:
            last_page_link = soup.find("a", "lastpage")
            if last_page_link is not None:
                lastpage = int(last_page_link.contents[0])

        cur_page += 1

    return (listened_tracks, time.mktime(today.timetuple()))
def parse_user_data(self, username, data):
    """Parse the user profile page.

    Args:
        username: the profile owner's username; copied into the result.
        data: HTML of the profile page.

    Returns:
        A tuple of plain strings: (username, name, age, gender, country,
        executions, average, homepage, user_since), with a tenth element
        reseted_date appended when the profile shows a reset date.
        Missing name/age/gender/country/homepage are empty strings.

    Raises:
        InvalidPage: no known vcard div was found.
    """
    soup = BeautifulSoup(data)
    log = logging.getLogger("UserInfoRetriever")
    log.info("BEGIN")

    # The vcard class depends on the kind of account.
    details = None
    for vcard_class in ('clearit user vcard',
                        'clearit subscriber vcard',
                        'clearit staff vcard',
                        'clearit moderator vcard'):
        details = soup.find('div', vcard_class)
        if details:
            break
    if not details:
        raise InvalidPage()  # we should NOT get here because of validate

    # User's name
    name = ""
    try:
        name = details.find('strong', 'fn').contents[0]
    except AttributeError:
        pass

    # Span holding the average play count, the "user since" date and
    # the total executions.
    details_html = details.find('span', 'userPlays')

    # Average play count per day: third word of the span's second
    # attribute value (presumably its title text -- confirm).
    average = details_html.attrs[1][1].split()[2]

    # User since... (user_since)
    user_since = None
    reseted_date = None
    dates = details_html.findAll('small')
    if dates:
        if len(dates) == 1:
            # The user has already listened to something:
            # "since dd mon YYYY"
            day_month_year_text = dates[0].contents[0].strip()
            # Is this a "reseted" profile? If so, the <small> holds the
            # reset date where the user_since date was expected.
            if day_month_year_text.endswith(")"):
                reseted_date_text = day_month_year_text[:-1]
                reseted_date = self.parse_registered_since(
                    reseted_date_text)
                # The "real" user since date is the span's own text.
                day_month_year_text = details_html.contents[0].strip()
        elif len(dates) == 2:
            day_month_year_text = dates[0].contents[0].strip()
            # Remove the parentheses from the text.
            reseted_date_text = dates[1].contents[0].strip()[1:-1]
            reseted_date = self.parse_registered_since(reseted_date_text)
    else:
        # The user only registered and has not posted any music:
        # "Registered on: dd mon YYYY"
        day_month_year_text = details_html.contents[0]
    # Get only the "dd mon YYYY"
    user_since = self.parse_registered_since(day_month_year_text)

    # Executions: one digit per "flip" span.
    flips = details_html.findAll('span', 'flip')
    executions = "".join([flip.contents[0] for flip in flips])

    # Homepage
    homepage = ""
    homepage_html = details.find('a', 'url homepage')
    if homepage_html and len(homepage_html.contents) > 0:
        homepage = homepage_html.contents[0]

    # Get user info -- Country, Age, Gender
    country, age, gender = "", "", ""
    extra_details = details.find("p", "userInfo adr")
    if extra_details:
        country_html = extra_details.find("span", "country-name")
        if country_html:
            country = country_html.contents[0]

        # Gender and age: remove all child tags so that only the plain
        # gender/age text remains, if there is any.
        # NOTICE: this modifies the page structure, so it must stay the
        # last parsing step.
        for child in extra_details.findAll():
            child.extract()
        if extra_details.contents:
            gender_age_text = extra_details.contents[0]
            gender_age_text = gender_age_text.replace(",", "").strip()
            match = self.AGE_GENDER_RE.match(gender_age_text)
            # Text that does not match the pattern leaves both fields
            # empty instead of failing on match.groups().
            if match:
                age, gender = match.groups()
            gender = gender or ""
            age = age or ""

    log.info("END")

    # Convert everything to plain strings -- every "odd" piece of data
    # is percent-encoded...
    fields = (username, name, age, gender, country, executions, average,
              homepage, user_since)
    if reseted_date:
        fields = fields + (reseted_date,)
    return tuple([str(field) for field in fields])
def validate(self, data):
    """Check that `data` contains a 'skyWrap' div.

    Raises:
        InvalidPage: the div was not found.
    """
    sky_wrap = BeautifulSoup(data).find('div', 'skyWrap')
    if sky_wrap:
        return
    raise InvalidPage()