def parse_entry_media_attributes(self, soup):
    """Parse the media-side attributes of one media-list row.

    Args:
        soup: a bs4 element containing a row from the current media list

    Return a dict of attributes of the media the row is about.
    """
    row_info = {}

    # Pre-bind start: if soup.find(...) raises a non-ValueError (e.g.
    # AttributeError on a missing tag) and the session suppresses parse
    # exceptions, the original code left `start` unbound and the
    # `if start is not None` check below raised NameError.
    start = None
    try:
        start = utilities.parse_profile_date(
            soup.find('series_start').text)
    except ValueError:
        start = None
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    if start is not None:
        try:
            row_info['aired'] = (start, utilities.parse_profile_date(
                soup.find('series_end').text))
        except ValueError:
            # known start but unparseable end date: open-ended run.
            row_info['aired'] = (start, None)
        except:
            if not self.session.suppress_parse_exceptions:
                raise

    # look up the given media type's status terms.
    status_terms = getattr(self.session, self.type)(1)._status_terms

    try:
        row_info['id'] = int(
            soup.find('series_' + self.type + 'db_id').text)
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        row_info['title'] = soup.find('series_title').text
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        row_info['status'] = status_terms[int(
            soup.find('series_status').text)]
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        row_info['picture'] = soup.find('series_image').text
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    return row_info
def parse_entry(self, soup): """ Given: soup: a bs4 element containing a row from the current media list Return a tuple: (media object, dict of this row's parseable attributes) """ # parse the media object first. media_attrs = self.parse_entry_media_attributes(soup) media_id = media_attrs[u'id'] del media_attrs[u'id'] media = getattr(self.session, self.type)(media_id).set(media_attrs) entry_info = {} try: entry_info[u'started'] = utilities.parse_profile_date( soup.find(u'my_start_date').text) except ValueError: entry_info[u'started'] = None except: if not self.session.suppress_parse_exceptions: raise try: entry_info[u'finished'] = utilities.parse_profile_date( soup.find(u'my_finish_date').text) except ValueError: entry_info[u'finished'] = None except: if not self.session.suppress_parse_exceptions: raise try: entry_info[u'status'] = self.user_status_terms[int( soup.find(u'my_status').text)] except: if not self.session.suppress_parse_exceptions: raise try: entry_info[u'score'] = int(soup.find(u'my_score').text) # if user hasn't set a score, set it to None to indicate as such. if entry_info[u'score'] == 0: entry_info[u'score'] = None except: if not self.session.suppress_parse_exceptions: raise try: entry_info[u'last_updated'] = datetime.datetime.fromtimestamp( int(soup.find(u'my_last_updated').text)) except: if not self.session.suppress_parse_exceptions: raise return media, entry_info
def parse_entry_media_attributes(self, soup):
    """Parse the media-side attributes of one media-list row.

    Args:
        soup: a bs4 element containing a row from the current media list

    Return a dict of attributes of the media the row is about.
    """
    row_info = {}

    # Initialize before the try: a suppressed non-ValueError (such as an
    # AttributeError when 'series_start' is missing) previously left
    # `start` undefined, turning the `is not None` test below into a
    # NameError.
    start = None
    try:
        start = utilities.parse_profile_date(soup.find('series_start').text)
    except ValueError:
        start = None
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    if start is not None:
        try:
            row_info['aired'] = (start, utilities.parse_profile_date(soup.find('series_end').text))
        except ValueError:
            # end date missing/unparseable: record only the start.
            row_info['aired'] = (start, None)
        except:
            if not self.session.suppress_parse_exceptions:
                raise

    # look up the given media type's status terms.
    status_terms = getattr(self.session, self.type)(1)._status_terms

    try:
        row_info['id'] = int(soup.find('series_' + self.type + 'db_id').text)
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        row_info['title'] = soup.find('series_title').text
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        row_info['status'] = status_terms[int(soup.find('series_status').text)]
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        row_info['picture'] = soup.find('series_image').text
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    return row_info
def parse_entry(self, soup):
    """Parse a single media-list row.

    Given:
        soup: a bs4 element containing a row from the current media list

    Return a tuple:
        (media object, dict of this row's parseable attributes)
    """
    # the media object comes first; its id is consumed, not kept.
    attrs = self.parse_entry_media_attributes(soup)
    media_key = attrs.pop(u'id')
    media = getattr(self.session, self.type)(media_key).set(attrs)

    entry_info = {}

    # both date fields share the same parse-or-None convention.
    for field, tag_name in ((u'started', u'my_start_date'),
                            (u'finished', u'my_finish_date')):
        try:
            entry_info[field] = utilities.parse_profile_date(
                soup.find(tag_name).text)
        except ValueError:
            entry_info[field] = None
        except:
            if not self.session.suppress_parse_exceptions:
                raise

    try:
        entry_info[u'status'] = self.user_status_terms[
            int(soup.find(u'my_status').text)]
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        score = int(soup.find(u'my_score').text)
        # a zero score means the user never rated this entry.
        entry_info[u'score'] = score if score != 0 else None
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        stamp = int(soup.find(u'my_last_updated').text)
        entry_info[u'last_updated'] = datetime.datetime.fromtimestamp(stamp)
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    return media, entry_info
def _parse_last_list_updates(self, user_page):
    """parse user last media update (manga and anime).

    :param user_page: bs4 DOM of a MAL user profile page.
    :return: dict mapping media objects to update-status dicts, each
        augmented with a 'time' key.  NOTE(review): when any exception is
        raised and the session suppresses parse exceptions, the method
        implicitly returns None — callers should tolerate that.
    """
    try:
        # collect both panels' rows first so each batch of divs keeps
        # its mode ('anime' / 'manga') alongside it.
        divs_zip = []
        for mode in ['anime', 'manga']:
            divs = user_page.select('div.updates.{}'.format(mode))[0].select('div')
            divs_zip.append((mode, divs))
        media_list = {}
        for mode, divs in divs_zip:
            for div in divs:
                # parse the media
                try:
                    media_link_tag = div.select('a')[0]
                except IndexError:
                    # a div without a link is not a media row; skip it.
                    continue
                media_link = media_link_tag.get('href')
                # the id is the path segment right after /anime/ or /manga/.
                media_id = int(media_link.split('/{}/'.format(mode))[1].split('/')[0])
                media_title = media_link_tag.text
                media = getattr(self.session, mode)(media_id).set({'title': media_title})
                # stats
                status_tag = div.select('div')[-1]
                status = status_tag.text
                # update_date
                date_tag = div.select('div span')[0]
                update_date = utilities.parse_profile_date(date_tag.text)
                # first media list dict
                media_list[media] = self._parse_update_media_status(status)
                # add more key and item
                media_list[media]['time'] = update_date
        return media_list
    except:
        # project convention: swallow parse errors when the session
        # requests suppression; otherwise re-raise untouched.
        if not self.session.suppress_parse_exceptions:
            raise
def _get_user_stats(self, user_page, stats_type, type_txt):
    """Scrape a single field from the profile's `.user-status` list.

    Finds the `li > span` whose text contains ``type_txt`` and returns
    the remainder of that line; date-like fields are parsed into dates.
    """
    assert stats_type in ['birthday', 'last_online', 'gender',
                          'join_date', 'location']

    # gather every span in the status list whose label matches.
    status_root = user_page.select_one('.user-status')
    matches = [span for span in status_root.select('li > span')
               if type_txt in span.text]

    # nothing found: gender has an explicit "unset" value, others are None.
    if not matches:
        return 'Not specified' if stats_type == 'gender' else None

    # the value is whatever follows the label inside the parent <li>.
    raw_value = matches[0].parent.text.split(type_txt)[1].strip()

    # date-like fields get parsed; the rest are returned verbatim.
    if stats_type in ['birthday', 'last_online', 'join_date']:
        return utilities.parse_profile_date(raw_value)
    return raw_value
def parse_media(user_page, mode='anime'):
    """Collect the "last updates" entries for one media type.

    NOTE(review): this function takes no ``self`` parameter yet reads
    ``self`` below — presumably it is a closure nested inside a method
    where ``self`` is in scope; confirm before moving or calling it at
    module level.

    :param user_page: bs4 DOM of a MAL user page.
    :param mode: 'anime' or 'manga'; selects which updates panel to read.
    :return: dict mapping media objects to their update-status dicts.
    """
    # get div tags
    divs = user_page.select(
        'div.updates.{}'.format(mode))[0].select('div')
    media_list = {}
    for div in divs:
        # parse the media
        try:
            media_link_tag = div.select('a')[0]
        except IndexError:
            # divs without a link are layout rows, not media rows.
            continue
        media_link = media_link_tag.get('href')
        # the id is the path segment right after /anime/ or /manga/.
        media_id = int(
            media_link.split('/{}/'.format(mode))[1].split('/')[0])
        media_title = media_link_tag.text
        media = getattr(self.session, mode)(media_id).set({'title': media_title})
        # stats
        status_tag = div.select('div')[-1]
        status = status_tag.text
        # update_date
        date_tag = div.select('div span')[0]
        update_date = utilities.parse_profile_date(date_tag.text)
        media_list[media] = self._parse_update_media_status(
            status)  # first media list dict
        media_list[media][
            'time'] = update_date  # add more key and item
    return media_list
def _parse_sidebar_user_status(self, user_page):
    """Parse the DOM and return user status on sidebar.

    :param user_page: bs4 DOM of a MAL user page.
    :return: dict of user-info keys (e.g. 'last_online',
        'num_forum_posts') to values scraped from the sidebar; date-like
        fields are parsed into dates, counters into ints.
    """
    user_info = {}
    top_section = self._parse_sidebar_user_status_top_section(user_page)
    # variable for easier key on key comparator
    parse_date = 'parse_date'
    user_info_key = 'user_info_key'
    # top part of side bar: sidebar label -> target key + date flag.
    key_comparator = {
        'last online': {
            user_info_key: u'last_online',
            parse_date: True
        },
        'gender': {
            user_info_key: u'gender',
            parse_date: False
        },
        'birthday': {
            user_info_key: u'birthday',
            parse_date: True
        },
        'location': {
            user_info_key: u'location',
            parse_date: False
        },
        'joined': {
            user_info_key: u'join_date',
            parse_date: True
        },
    }
    # bottom part of sidebar: these are integer counters.
    bottom_part_key = [
        ['forum posts', u'num_forum_posts'],
        ['reviews', u'num_reviews'],
        ['recommendations', u'num_recommendations'],
        ['blog posts', u'num_blog_posts'],
        ['clubs', u'num_clubs'],
    ]
    for key in bottom_part_key:
        # add bottom part key to key_comparator
        key_comparator[key[0]] = {user_info_key: key[1], parse_date: False}
    # convert top section dictionary into user info dict
    for keyc in key_comparator:
        if keyc in top_section:
            if key_comparator[keyc][parse_date]:
                user_info[key_comparator[keyc]
                          [user_info_key]] = utilities.parse_profile_date(
                              top_section[keyc])
            else:
                user_info[key_comparator[keyc]
                          [user_info_key]] = top_section[keyc]
    # fix bottom part keys on user info.
    # remove the comma and convert into integer
    # NOTE(review): `type(x) != int` would be cleaner as
    # `not isinstance(x, int)`.
    for key in bottom_part_key:
        try:
            if user_info[key[1]] is not None and type(
                    user_info[key[1]]) != int:
                user_info[key[1]] = int(user_info[key[1]].replace(',', ''))
        except KeyError:
            pass  # pass for unsuspected keyerror
    return user_info
def parse_friends(self, friends_page):
    """Parse the DOM and returns user friends attributes.

    :type friends_page: :class:`bs4.BeautifulSoup`
    :param friends_page: MAL user friends page's DOM
    :rtype: dict
    :return: User friends attributes.
    """
    user_info = self.parse_sidebar(friends_page)
    # the friends list lives in the second <td> of the content table.
    second_col = (
        friends_page
        .find(u'div', {u'id': u'content'})
        .find(u'table')
        .find(u'tr')
        .find_all(u'td', recursive=False)[1]
    )
    try:
        user_info['friends'] = {}
        friends = second_col.find_all('div', {'class': 'friendHolder'})
        if friends:
            for row in friends:
                block = row.find('div', {'class': 'friendBlock'})
                cols = block.find_all('div')
                # second div holds the link with the friend's username.
                friend_link = cols[1].find('a')
                friend = self.session.user(friend_link.text)
                friend_info = {}
                # third div (when present and non-empty): last-active time.
                if len(cols) > 2 and cols[2].text != u'':
                    col_txt = cols[2].text.strip()
                    friend_info[u'last_active'] = utilities.parse_profile_date(col_txt)
                # fourth div (when present and non-empty): friendship date.
                if len(cols) > 3 and cols[3].text != '':
                    friend_info['since'] = utilities.parse_profile_date(
                        cols[3].text.replace('Friends since', '').strip())
                user_info['friends'][friend] = friend_info
    except:
        # project convention: suppress parse errors on request.
        if not self.session.suppress_parse_exceptions:
            raise
    return user_info
def parse_recommendations(self, recommendations_page):
    """Parse the DOM and returns user recommendations attributes.

    :type recommendations_page: :class:`bs4.BeautifulSoup`
    :param recommendations_page: MAL user recommendations page's DOM
    :rtype: dict
    :return: User recommendations attributes.
    """
    user_info = self.parse_sidebar(recommendations_page)
    # recommendations live in the second <td> of the content table.
    second_col = (
        recommendations_page
        .find(u'div', {u'id': u'content'})
        .find(u'table')
        .find(u'tr')
        .find_all(u'td', recursive=False)[1]
    )
    try:
        recommendations = second_col.find_all("div", {"class": "spaceit borderClass"})
        if recommendations:
            user_info['recommendations'] = {}
            # the first div is a header row; real entries start at index 1.
            for row in recommendations[1:]:
                anime_table = row.find(u'table')
                animes = anime_table.find_all(u'td')
                # find liked media
                liked_media_link = animes[0].find(u'a', recursive=False)
                link_parts = liked_media_link.get(u'href').split(u'/')
                # of the form /anime|manga/64/Rozen_Maiden
                liked_media = getattr(self.session, link_parts[1])(int(link_parts[2])).set(
                    {u'title': liked_media_link.text}
                )
                # find recommended media
                recommended_media_link = animes[1].find(u'a', recursive=False)
                link_parts = recommended_media_link.get(u'href').split(u'/')
                # of the form /anime|manga/64/Rozen_Maiden
                media_id = int(link_parts[2])
                recommended_media = getattr(self.session, link_parts[1])(media_id).set(
                    {u'title': recommended_media_link.text}
                )
                # other stats from recommended media
                recommendation_text = row.find(u'p').text
                recommendation_menu = row.find(u'div', recursive=False)
                utilities.extract_tags(recommendation_menu)
                # menu text looks like "... - <date>"; take the date part.
                rec_menu_text = recommendation_menu.text.split(u' - ')[1]
                recommendation_date = utilities.parse_profile_date(rec_menu_text)
                user_info['recommendations'][liked_media] = {link_parts[1]: recommended_media,
                                                             'text': recommendation_text,
                                                             'date': recommendation_date}
    except:
        # project convention: suppress parse errors on request.
        if not self.session.suppress_parse_exceptions:
            raise
    return user_info
def parse_recommendations(self, recommendations_page):
    """Parses the DOM and returns user recommendations attributes.

    :type recommendations_page: :class:`bs4.BeautifulSoup`
    :param recommendations_page: MAL user recommendations page's DOM
    :rtype: dict
    :return: User recommendations attributes.
    """
    user_info = self.parse_sidebar(recommendations_page)
    # recommendations live in the second <td> of the content table.
    second_col = (recommendations_page
                  .find(u'div', {u'id': u'content'})
                  .find(u'table')
                  .find(u'tr')
                  .find_all(u'td', recursive=False)[1])

    try:
        recommendations = second_col.find_all(
            u"div", {u"class": u"spaceit borderClass"})
        if recommendations:
            user_info[u'recommendations'] = {}
            # index 0 is a header row; real entries start at index 1.
            for rec_row in recommendations[1:]:
                cells = rec_row.find(u'table').find_all(u'td')

                # left cell: the media the user liked.
                liked_link = cells[0].find(u'a', recursive=False)
                liked_parts = liked_link.get(u'href').split(u'/')
                # of the form /anime|manga/64/Rozen_Maiden
                liked_media = getattr(self.session, liked_parts[1])(
                    int(liked_parts[2])).set({u'title': liked_link.text})

                # right cell: the media being recommended alongside it.
                rec_link = cells[1].find(u'a', recursive=False)
                rec_parts = rec_link.get(u'href').split(u'/')
                # of the form /anime|manga/64/Rozen_Maiden
                rec_media = getattr(self.session, rec_parts[1])(
                    int(rec_parts[2])).set({u'title': rec_link.text})

                # free-text blurb plus the date from the menu line.
                rec_text = rec_row.find(u'p').text
                rec_menu = rec_row.find(u'div', recursive=False)
                utilities.extract_tags(rec_menu)
                rec_date = utilities.parse_profile_date(
                    rec_menu.text.split(u' - ')[1])

                user_info[u'recommendations'][liked_media] = {
                    rec_parts[1]: rec_media,
                    'text': rec_text,
                    'date': rec_date,
                }
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    return user_info
def parse_friends(self, friends_page):
    """Parses the DOM and returns user friends attributes.

    :type friends_page: :class:`bs4.BeautifulSoup`
    :param friends_page: MAL user friends page's DOM
    :rtype: dict
    :return: User friends attributes.
    """
    user_info = self.parse_sidebar(friends_page)
    # the friends list sits in the second <td> of the content table.
    second_col = (friends_page
                  .find(u'div', {u'id': u'content'})
                  .find(u'table')
                  .find(u'tr')
                  .find_all(u'td', recursive=False)[1])

    try:
        user_info[u'friends'] = {}
        holders = second_col.find_all(u'div', {u'class': u'friendHolder'})
        for holder in holders or []:
            block = holder.find(u'div', {u'class': u'friendBlock'})
            cols = block.find_all(u'div')

            # the second div carries the link with the friend's username.
            friend = self.session.user(cols[1].find(u'a').text)
            friend_info = {}

            # third div, when present and non-empty: last-active time.
            if len(cols) > 2 and cols[2].text != u'':
                friend_info[u'last_active'] = utilities.parse_profile_date(
                    cols[2].text.strip())
            # fourth div, when present and non-empty: friendship date.
            if len(cols) > 3 and cols[3].text != u'':
                friend_info[u'since'] = utilities.parse_profile_date(
                    cols[3].text.replace(u'Friends since', '').strip())

            user_info[u'friends'][friend] = friend_info
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    return user_info
def parse_reviews(self, reviews_page):
    """Parses the DOM and returns user reviews attributes.

    :type reviews_page: :class:`bs4.BeautifulSoup`
    :param reviews_page: MAL user reviews page's DOM
    :rtype: dict
    :return: User reviews attributes.
    """
    user_info = self.parse_sidebar(reviews_page)
    # reviews live in the second <td> of the content table.
    second_col = reviews_page.find(u'div', {u'id': u'content'}).find(u'table').find(u'tr').find_all(u'td', recursive=False)[1]
    try:
        user_info[u'reviews'] = {}
        reviews = second_col.find_all(u'div', {u'class': u'borderDark'}, recursive=False)
        if reviews:
            for row in reviews:
                review_info = {}
                # first inner div is review metadata, second is the body.
                # NOTE(review): `except ValueError: raise` is a no-op
                # re-raise; presumably a placeholder for real handling.
                try:
                    (meta_elt, review_elt) = row.find_all(u'div', recursive=False)[0:2]
                except ValueError:
                    raise
                meta_rows = meta_elt.find_all(u'div', recursive=False)
                review_info[u'date'] = utilities.parse_profile_date(meta_rows[0].find(u'div').text)
                media_link = meta_rows[0].find(u'a')
                link_parts = media_link.get(u'href').split(u'/')
                # of the form /(anime|manga)/9760/Hoshi_wo_Ou_Kodomo
                media = getattr(self.session, link_parts[1])(int(link_parts[2])).set({u'title': media_link.text})
                # "X of Y" helpfulness counter.
                helpfuls = meta_rows[1].find(u'span', recursive=False)
                helpful_match = re.match(r'(?P<people_helped>[0-9]+) of (?P<people_total>[0-9]+)', helpfuls.text).groupdict()
                review_info[u'people_helped'] = int(helpful_match[u'people_helped'])
                review_info[u'people_total'] = int(helpful_match[u'people_total'])
                # "X of Y" consumption counter; Y may be '?' for ongoing media.
                consumption_match = re.match(r'(?P<media_consumed>[0-9]+) of (?P<media_total>[0-9?]+)', meta_rows[2].text).groupdict()
                review_info[u'media_consumed'] = int(consumption_match[u'media_consumed'])
                if consumption_match[u'media_total'] == u'?':
                    review_info[u'media_total'] = None
                else:
                    review_info[u'media_total'] = int(consumption_match[u'media_total'])
                review_info[u'rating'] = int(meta_rows[3].find(u'div').text.replace(u'Overall Rating: ', ''))
                # strip markup so only the review's plain text remains.
                for x in review_elt.find_all([u'div', 'a']):
                    x.extract()
                review_info[u'text'] = review_elt.text.strip()
                user_info[u'reviews'][media] = review_info
    except:
        # project convention: suppress parse errors on request.
        if not self.session.suppress_parse_exceptions:
            raise
    return user_info
def _parse_sidebar_user_status(self, user_page):
    """Parse the DOM and return user status from the profile sidebar.

    :param user_page: bs4 DOM of a MAL user page.
    :return: dict of user-info keys (e.g. 'last_online',
        'num_forum_posts') to values; date-like fields are parsed into
        dates, counters are normalized to ints.
    """
    user_info = {}
    top_section = self._parse_sidebar_user_status_top_section(user_page)

    # top part of the sidebar: label -> (user_info key, is-a-date flag).
    key_comparator = {
        'last online': ('last_online', True),
        'gender': ('gender', False),
        'birthday': ('birthday', True),
        'location': ('location', False),
        'joined': ('join_date', True),
    }

    # bottom part of the sidebar holds integer counters.
    bottom_part_key = [
        ('forum posts', 'num_forum_posts'),
        ('reviews', 'num_reviews'),
        ('recommendations', 'num_recommendations'),
        ('blog posts', 'num_blog_posts'),
        ('clubs', 'num_clubs'),
    ]
    for label, info_key in bottom_part_key:
        key_comparator[label] = (info_key, False)

    # copy every recognized label from the scraped section into
    # user_info, parsing dates where flagged.
    for label, (info_key, is_date) in key_comparator.items():
        if label in top_section:
            value = top_section[label]
            user_info[info_key] = (
                utilities.parse_profile_date(value) if is_date else value)

    # normalize the counters: strip thousands separators, convert to int.
    # (isinstance replaces the original `type(...) != int` comparison;
    # dict.get replaces the try/KeyError dance.)
    for _, info_key in bottom_part_key:
        value = user_info.get(info_key)
        if value is not None and not isinstance(value, int):
            user_info[info_key] = int(value.replace(',', ''))

    return user_info
def parse_media(user_page, mode='anime'):
    """Scrape the "last updates" panel for one media type.

    Relies on ``self`` from the enclosing scope; ``mode`` selects the
    'anime' or 'manga' panel.  Returns a dict mapping media objects to
    their update-status dicts (each with a 'time' key added).
    """
    # every row of interest lives under the updates panel for this mode.
    panel = user_page.select('div.updates.{}'.format(mode))[0]
    media_list = {}
    for row in panel.select('div'):
        # rows without a link are layout divs, not media entries.
        links = row.select('a')
        if not links:
            continue
        link_tag = links[0]
        href = link_tag.get('href')
        # the id is the path segment right after /anime/ or /manga/.
        media_id = int(href.split('/{}/'.format(mode))[1].split('/')[0])
        media = getattr(self.session, mode)(media_id).set(
            {'title': link_tag.text})
        # last inner div carries the textual status; first span the date.
        status_text = row.select('div')[-1].text
        updated_at = utilities.parse_profile_date(
            row.select('div span')[0].text)
        # status dict first, then attach the update timestamp.
        media_list[media] = self._parse_update_media_status(status_text)
        media_list[media]['time'] = updated_at
    return media_list
def parse_sidebar(self, anime_page):
    """Parses the DOM and returns anime attributes in the sidebar.

    :type anime_page: :class:`bs4.BeautifulSoup`
    :param anime_page: MAL anime page's DOM
    :rtype: dict
    :return: anime attributes
    :raises: :class:`.InvalidAnimeError`, :class:`.MalformedAnimePageError`
    """
    # shared media attributes first; translate the generic error into
    # the anime-specific one.
    try:
        anime_info = super(Anime, self).parse_sidebar(anime_page)
    except media.InvalidMediaError as e:
        raise InvalidAnimeError(e.id)

    # the sidebar is the first <td> of the content table.
    info_panel_first = anime_page.find(u'div', {
        'id': 'content'
    }).find(u'table').find(u'td')

    # each section below follows the project convention: a bare except
    # that re-raises unless the session suppresses parse exceptions.
    try:
        episode_tag = info_panel_first.find(
            text=u'Episodes:').parent.parent
        utilities.extract_tags(
            episode_tag.find_all(u'span', {'class': 'dark_text'}))
        # 'Unknown' episode counts are recorded as 0.
        anime_info[u'episodes'] = int(episode_tag.text.strip(
        )) if episode_tag.text.strip() != 'Unknown' else 0
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        aired_tag = info_panel_first.find(text=u'Aired:').parent.parent
        utilities.extract_tags(
            aired_tag.find_all(u'span', {'class': 'dark_text'}))
        aired_parts = aired_tag.text.strip().split(u' to ')
        if len(aired_parts) == 1:
            # this aired once.
            try:
                aired_date = utilities.parse_profile_date(
                    aired_parts[0],
                    suppress=self.session.suppress_parse_exceptions)
            except ValueError:
                raise MalformedAnimePageError(
                    self.id,
                    aired_parts[0],
                    message="Could not parse single air date")
            # NOTE(review): single-air media yield a 1-tuple here while
            # ranged media yield a 2-tuple below — callers must handle
            # both shapes.
            anime_info[u'aired'] = (aired_date, )
        else:
            # two airing dates.
            try:
                air_start = utilities.parse_profile_date(
                    aired_parts[0],
                    suppress=self.session.suppress_parse_exceptions)
            except ValueError:
                raise MalformedAnimePageError(
                    self.id,
                    aired_parts[0],
                    message="Could not parse first of two air dates")
            try:
                air_end = utilities.parse_profile_date(
                    aired_parts[1],
                    suppress=self.session.suppress_parse_exceptions)
            except ValueError:
                raise MalformedAnimePageError(
                    self.id,
                    aired_parts[1],
                    message="Could not parse second of two air dates")
            anime_info[u'aired'] = (air_start, air_end)
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        producers_tag = info_panel_first.find(
            text=u'Producers:').parent.parent
        utilities.extract_tags(
            producers_tag.find_all(u'span', {'class': 'dark_text'}))
        anime_info[u'producers'] = []
        for producer_link in producers_tag.find_all('a'):
            if producer_link.text == u'add some':
                # MAL is saying "None found, add some".
                break
            link_parts = producer_link.get('href').split('/')
            # of the form: /anime/producer/23/Bandai_Visual
            anime_info[u'producers'].append(
                self.session.producer(int(link_parts[3])).set(
                    {'name': producer_link.text}))
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        duration_tag = info_panel_first.find(
            text=u'Duration:').parent.parent
        utilities.extract_tags(
            duration_tag.find_all(u'span', {'class': 'dark_text'}))
        anime_info[u'duration'] = duration_tag.text.strip()
        # duration text like "1 hr. 52 min." — sum up each numbered part.
        duration_parts = [
            part.strip() for part in anime_info[u'duration'].split(u'.')
        ]
        duration_mins = 0
        for part in duration_parts:
            part_match = re.match(u'(?P<num>[0-9]+)', part)
            if not part_match:
                continue
            part_volume = int(part_match.group(u'num'))
            if part.endswith(u'hr'):
                duration_mins += part_volume * 60
            elif part.endswith(u'min'):
                duration_mins += part_volume
        # replace the raw string with a timedelta.
        anime_info[u'duration'] = datetime.timedelta(minutes=duration_mins)
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        rating_tag = info_panel_first.find(text=u'Rating:').parent.parent
        utilities.extract_tags(
            rating_tag.find_all(u'span',
                                {'class': 'dark_text'}))
        anime_info[u'rating'] = rating_tag.text.strip()
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    return anime_info
def parse_reviews(self, reviews_page):
    """Parse the DOM and returns user reviews attributes.

    :type reviews_page: :class:`bs4.BeautifulSoup`
    :param reviews_page: MAL user reviews page's DOM
    :rtype: dict
    :return: User reviews attributes.
    """
    user_info = self.parse_sidebar(reviews_page)
    # reviews live in the second <td> of the content table.
    second_col = \
        reviews_page.find(u'div', {u'id': u'content'}).find(u'table').find(u'tr') \
        .find_all(u'td', recursive=False)[1]

    try:
        user_info[u'reviews'] = {}
        reviews = second_col.find_all(u'div', {u'class': u'borderDark'},
                                      recursive=False)
        if reviews:
            for row in reviews:
                review_info = {}
                # first inner div holds metadata, second the review body.
                # NOTE(review): `except ValueError: raise` is a no-op.
                try:
                    (meta_elt,
                     review_elt) = row.find_all(u'div', recursive=False)[0:2]
                except ValueError:
                    raise
                meta_rows = meta_elt.find_all(u'div', recursive=False)
                review_info[u'date'] = utilities.parse_profile_date(
                    meta_rows[0].find(u'div').text)
                media_link = meta_rows[0].find(u'a')
                link_parts = media_link.get(u'href').split(u'/')
                # of the form /(anime|manga)/9760/Hoshi_wo_Ou_Kodomo
                media_id = link_parts[2]
                media = getattr(self.session, link_parts[1])(
                    int(media_id)).set({u'title': media_link.text})
                helpfuls = meta_rows[1].find(u'span', recursive=False)
                # helpfulness counter: try the legacy "X of Y" layout
                # first, then the newer single-number layout.
                try:
                    regex_str = r'(?P<people_helped>[0-9]+) of (?P<people_total>[0-9]+)'
                    helpful_match = re.match(regex_str,
                                             helpfuls.text).groupdict()
                    review_info[u'people_helped'] = int(
                        helpful_match[u'people_helped'])
                    review_info[u'people_total'] = int(
                        helpful_match[u'people_total'])
                except AttributeError:
                    # total of people is no longer shown
                    # try another method, not using regex method.
                    # ie: 805 people found this review helpful
                    helpful_match = helpfuls.text.split(
                        'people found this review helpful')[0]
                    review_info[u'people_helped'] = int(helpful_match)
                    # review_info[u'people_total'] = int(helpful_match[u'people_total'])
                    review_info[u'people_total'] = None

                # consumption counter: again legacy "X of Y" first, with
                # a fallback that handles several newer layouts.
                try:
                    regex_str = r'(?P<media_consumed>[0-9]+) of (?P<media_total>[0-9?]+)'
                    consumption_match = re.match(
                        regex_str, meta_rows[2].text).groupdict()
                    review_info[u'media_consumed'] = int(
                        consumption_match[u'media_consumed'])
                    if consumption_match[u'media_total'] == u'?':
                        review_info[u'media_total'] = None
                    else:
                        review_info[u'media_total'] = int(
                            consumption_match[u'media_total'])
                except AttributeError:
                    # available format
                    # ie anime: 25 of 25 episodes seen
                    # ie      : 25 of ? episodes seen
                    # ie      : ? episodes
                    # ie manga: 40 chapters
                    # ie      : 60 of ? chapters read
                    # ie      : ? chapters
                    # <div class="lightLink" style="float: right;">24 of 24 episodes seen</div>
                    media_tag = meta_rows[1].find_all('div')[0]
                    if ' episodes' in media_tag.text:
                        user_media_consumption = media_tag.text.split(
                            ' episodes')[0].strip()
                    elif ' chapters' in media_tag.text:
                        user_media_consumption = media_tag.text.split(
                            ' chapters')[0].strip()
                    else:
                        # no format recognized
                        raise AttributeError
                    # user_media_consumption : 'xx of xx', 'xx of ?', '? of xx', or '?'
                    if 'of' not in user_media_consumption:
                        review_info[u'media_consumed'] = None
                        review_info[u'media_total'] = None
                    else:
                        # temp var for variable media_consumed
                        temp_consumed = user_media_consumption.split(
                            'of')[0].strip()
                        # temp var for variable media_total
                        temp_total = user_media_consumption.split(
                            'of')[1].strip()
                        if temp_consumed == '?':
                            review_info[u'media_consumed'] = None
                        else:
                            review_info[u'media_consumed'] = int(
                                temp_consumed)
                        if temp_total == '?':
                            review_info[u'media_total'] = None
                        else:
                            review_info[u'media_total'] = int(temp_total)

                review_info[u'rating'] = int(meta_rows[2].text.replace(
                    u'Overall Rating: ', '').split('Other review')[0])

                # strip markup so only the review's plain text remains.
                for x in review_elt.find_all([u'div', 'a']):
                    x.extract()
                try:
                    review_info[u'text'] = review_elt.text.strip()
                except AttributeError:
                    # sometime reviw_elt cant produce attribute error
                    # one of the solution is to reparse the tag
                    review_info[u'text'] = BeautifulSoup(
                        str(review_elt), "lxml").text.strip()
                user_info[u'reviews'][media] = review_info
    except:
        # project convention: suppress parse errors on request.
        if not self.session.suppress_parse_exceptions:
            raise

    return user_info
def parse(self, user_page):
    """Parses the DOM and returns user attributes in the main-content area.

    :type user_page: :class:`bs4.BeautifulSoup`
    :param user_page: MAL user page's DOM
    :rtype: dict
    :return: User attributes.
    """
    user_info = self.parse_sidebar(user_page)
    section_headings = user_page.find_all(u'div', {u'class': u'normal_header'})

    # parse general details.
    # we have to work from the bottom up, since there's broken HTML after
    # every header.
    # Pre-bind general_table: if the lookup below raises and is
    # suppressed, the original code hit a NameError on the `if` check.
    general_table = None
    last_online_elt = user_page.find(u'td', text=u'Last Online')
    if last_online_elt:
        try:
            general_table = last_online_elt.parent.parent
        except:
            if not self.session.suppress_parse_exceptions:
                raise
        if general_table and general_table.name == u'table':
            try:
                last_online_elt = general_table.find(u'td', text=u'Last Online')
                if last_online_elt:
                    user_info[u'last_online'] = utilities.parse_profile_date(
                        last_online_elt.findNext(u'td').text)
            except:
                if not self.session.suppress_parse_exceptions:
                    raise
            try:
                gender = general_table.find(u'td', text=u'Gender')
                if gender:
                    user_info[u'gender'] = gender.findNext(u'td').text
            except:
                if not self.session.suppress_parse_exceptions:
                    raise
            try:
                birthday = general_table.find(u'td', text=u'Birthday')
                if birthday:
                    user_info[u'birthday'] = utilities.parse_profile_date(
                        birthday.findNext(u'td').text)
            except:
                if not self.session.suppress_parse_exceptions:
                    raise
            try:
                location = general_table.find(u'td', text=u'Location')
                if location:
                    user_info[u'location'] = location.findNext(u'td').text
            except:
                if not self.session.suppress_parse_exceptions:
                    raise
            try:
                website = general_table.find(u'td', text=u'Website')
                if website:
                    user_info[u'website'] = website.findNext(u'td').text
            except:
                if not self.session.suppress_parse_exceptions:
                    raise
            try:
                join_date = general_table.find(u'td', text=u'Join Date')
                if join_date:
                    user_info[u'join_date'] = utilities.parse_profile_date(
                        join_date.findNext(u'td').text)
            except:
                if not self.session.suppress_parse_exceptions:
                    raise
            try:
                access_rank = general_table.find(u'td', text=u'Access Rank')
                if access_rank:
                    user_info[u'access_rank'] = access_rank.findNext(u'td').text
            except:
                if not self.session.suppress_parse_exceptions:
                    raise
            # the remaining rows are comma-separated integer counters.
            try:
                anime_list_views = general_table.find(u'td', text=u'Anime List Views')
                if anime_list_views:
                    user_info[u'anime_list_views'] = int(
                        anime_list_views.findNext(u'td').text.replace(',', ''))
            except:
                if not self.session.suppress_parse_exceptions:
                    raise
            try:
                manga_list_views = general_table.find(u'td', text=u'Manga List Views')
                if manga_list_views:
                    user_info[u'manga_list_views'] = int(
                        manga_list_views.findNext(u'td').text.replace(',', ''))
            except:
                if not self.session.suppress_parse_exceptions:
                    raise
            try:
                num_comments = general_table.find(u'td', text=u'Comments')
                if num_comments:
                    user_info[u'num_comments'] = int(
                        num_comments.findNext(u'td').text.replace(',', ''))
            except:
                if not self.session.suppress_parse_exceptions:
                    raise
            try:
                num_forum_posts = general_table.find(u'td', text=u'Forum Posts')
                if num_forum_posts:
                    user_info[u'num_forum_posts'] = int(
                        num_forum_posts.findNext(u'td').text
                        .replace(" (Find All)", "").replace(',', ''))
            except:
                if not self.session.suppress_parse_exceptions:
                    raise

    try:
        # last list updates.
        # BUGFIX: filter() returns a lazy iterator on Python 3 — it is
        # always truthy and not subscriptable, so the original
        # `filter(...)[0]` raised TypeError (silently swallowed when
        # suppression is on).  A list comprehension restores the
        # intended behavior on both Python 2 and 3.
        list_updates_header = [
            x for x in section_headings if u'Last List Updates' in x.text]
        if list_updates_header:
            list_updates_header = list_updates_header[0]
            list_updates_table = list_updates_header.findNext(u'table')
            if list_updates_table:
                user_info[u'last_list_updates'] = {}
                for row in list_updates_table.find_all(u'tr'):
                    cols = row.find_all(u'td')
                    info_col = cols[1]
                    media_link = info_col.find(u'a')
                    link_parts = media_link.get(u'href').split(u'/')
                    # of the form /(anime|manga)/10087/Fate/Zero
                    if link_parts[1] == u'anime':
                        media = self.session.anime(
                            int(link_parts[2])).set({u'title': media_link.text})
                    else:
                        media = self.session.manga(
                            int(link_parts[2])).set({u'title': media_link.text})
                    list_update = {}
                    # "Watching at 3 of 12"-style progress line.
                    progress_div = info_col.find(u'div', {u'class': u'spaceit_pad'})
                    if progress_div:
                        progress_match = re.match(
                            r'(?P<status>[A-Za-z]+)( at (?P<episodes>[0-9]+) of (?P<total_episodes>[0-9]+))?',
                            progress_div.text).groupdict()
                        list_update[u'status'] = progress_match[u'status']
                        if progress_match[u'episodes'] is None:
                            list_update[u'episodes'] = None
                        else:
                            list_update[u'episodes'] = int(progress_match[u'episodes'])
                        if progress_match[u'total_episodes'] is None:
                            list_update[u'total_episodes'] = None
                        else:
                            list_update[u'total_episodes'] = int(
                                progress_match[u'total_episodes'])
                    time_div = info_col.find(u'div', {u'class': u'lightLink'})
                    if time_div:
                        list_update[u'time'] = utilities.parse_profile_date(time_div.text)
                    user_info[u'last_list_updates'][media] = list_update
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    lower_section_headings = user_page.find_all(u'h2')
    # anime stats.
    try:
        # same filter() -> list-comprehension fix as above.
        anime_stats_header = [
            x for x in lower_section_headings if u'Anime Stats' in x.text]
        if anime_stats_header:
            anime_stats_header = anime_stats_header[0]
            anime_stats_table = anime_stats_header.findNext(u'table')
            if anime_stats_table:
                user_info[u'anime_stats'] = {}
                for row in anime_stats_table.find_all(u'tr'):
                    cols = row.find_all(u'td')
                    value = cols[1].text
                    # "Days" cells are fractional; everything else is int.
                    if cols[1].find(u'span', {u'title': u'Days'}):
                        value = round(float(value), 1)
                    else:
                        value = int(value)
                    user_info[u'anime_stats'][cols[0].text] = value
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        # manga stats.
        manga_stats_header = [
            x for x in lower_section_headings if u'Manga Stats' in x.text]
        if manga_stats_header:
            manga_stats_header = manga_stats_header[0]
            manga_stats_table = manga_stats_header.findNext(u'table')
            if manga_stats_table:
                user_info[u'manga_stats'] = {}
                for row in manga_stats_table.find_all(u'tr'):
                    cols = row.find_all(u'td')
                    value = cols[1].text
                    if cols[1].find(u'span', {u'title': u'Days'}):
                        value = round(float(value), 1)
                    else:
                        value = int(value)
                    user_info[u'manga_stats'][cols[0].text] = value
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        about_header = [x for x in section_headings if u'About' in x.text]
        if about_header:
            about_header = about_header[0]
            user_info[u'about'] = about_header.findNext(u'div').text.strip()
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    return user_info
def parse_sidebar(self, manga_page, manga_page_original=None):
    """Parses the DOM and returns manga attributes in the sidebar.

    :type manga_page: :class:`bs4.BeautifulSoup`
    :param manga_page: MAL manga page's DOM (cleaned)

    :type manga_page_original: :class:`bs4.BeautifulSoup`
    :param manga_page_original: MAL manga page's DOM, uncleaned

    :rtype: dict
    :return: manga attributes

    :raises: :class:`.InvalidMangaError`, :class:`.MalformedMangaPageError`
    """
    # if MAL says the series doesn't exist, raise an InvalidMangaError.
    error_tag = manga_page.find('div', {'class': 'badresult'})
    if error_tag:
        raise InvalidMangaError(self.id)

    try:
        title_tag = manga_page.find('span', {'itemprop': 'name'})
        if not title_tag:
            # otherwise, raise a MalformedMangaPageError.
            raise MalformedMangaPageError(self.id, manga_page, message="Could not find title")
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    # otherwise, begin parsing.
    manga_info = super(Manga, self).parse_sidebar(manga_page, manga_page_original)

    # first <td> of the content table holds the sidebar info panel.
    info_panel_first = manga_page.find('div', {'id': 'content'}).find('table').find('td')

    try:
        # e.g. "Volumes: 18" -> 18; "Unknown" -> None. Commas stripped for counts >= 1,000.
        volumes_tag = [x for x in manga_page.select('span.dark_text') if 'Volumes:' in x.text][0]
        manga_volume = volumes_tag.parent.text.split(':')[1].strip().replace(',', '')
        manga_info[u'volumes'] = (
            int(manga_volume) if manga_volume != 'Unknown' else None
        )
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        # e.g. "Chapters: 162" -> 162; "Unknown" -> None.
        chapters_tag = info_panel_first.find(text=u'Chapters:').parent.parent
        utilities.extract_tags(chapters_tag.find_all(u'span', {'class': 'dark_text'}))
        chapters_tag_text = chapters_tag.text
        if ':' in chapters_tag_text:
            chapters_tag_text = chapters_tag_text.split(':')[1]
        manga_chapters = chapters_tag_text.strip()
        manga_info[u'chapters'] = (
            int(manga_chapters) if chapters_tag.text.strip() != 'Unknown' else None
        )
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        published_tag = info_panel_first.find(text=u'Published:').parent.parent
        utilities.extract_tags(published_tag.find_all(u'span', {'class': 'dark_text'}))
        published_parts = published_tag.text.strip().split(u' to ')
        # check if published part only contain start date or also end date.
        if len(published_parts) == 1:
            # this published once.
            try:
                published_date = utilities.parse_profile_date(published_parts[0])
            except ValueError:
                raise MalformedMangaPageError(self.id, published_parts[0], message="Could not parse single publish date")
            publish_start = published_date
            publish_end = None
        else:
            # two publishing dates.
            try:
                # publish_start may contain redundant word such as
                # 'Published: Feb 24, 2003',
                if 'Published:' in published_parts[0]:
                    published_parts[0] = published_parts[0].split('Published:')[1].strip()
                publish_start = utilities.parse_profile_date(published_parts[0])
            except ValueError:
                raise MalformedMangaPageError(
                    self.id, published_parts[0],
                    message="Could not parse first of two publish dates"
                )
            # BUG FIX: previously this compared the *list* `published_parts` to u'?',
            # which is always False, so a still-publishing series ("... to ?") fell
            # through to the date parser and raised. Compare the end-date part instead.
            if published_parts[1] == u'?':
                # this is still publishing.
                publish_end = None
            else:
                try:
                    publish_end = utilities.parse_profile_date(published_parts[1])
                except ValueError:
                    raise MalformedMangaPageError(
                        self.id, published_parts[1],
                        message="Could not parse second of two publish dates"
                    )
        manga_info[u'published'] = (publish_start, publish_end)
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        authors_tag = info_panel_first.find(text='Authors:').parent.parent
        utilities.extract_tags(authors_tag.find_all('span', {'class': 'dark_text'}))
        manga_info['authors'] = {}
        for author_link in authors_tag.find_all('a'):
            link_parts = author_link.get('href').split('/')
            # of the form /people/1867/Naoki_Urasawa
            person = self.session.person(int(link_parts[2])).set({'name': author_link.text})
            # role text follows the link, e.g. " (Story & Art)".
            role = author_link.nextSibling.replace(' (', '').replace(')', '')
            manga_info['authors'][person] = role
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        serialization_tag = info_panel_first.find(text=u'Serialization:').parent.parent
        publication_link = serialization_tag.find('a')
        manga_info[u'serialization'] = None
        if publication_link:
            # of the form /manga.php?mid=1
            link_parts = publication_link.get('href').split('mid=')
            # example for link_parts
            # ['/manga/magazine/450/Bessatsu_Shounen_Magazine']
            publication_id = link_parts[0].split('/manga/magazine/')[1].split('/')[0]
            manga_info[u'serialization'] = self.session.publication(int(publication_id)).set(
                {'name': publication_link.text}
            )
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    return manga_info
def parse_sidebar(self, manga_page):
    """Parses the DOM and returns manga attributes in the sidebar.

    :type manga_page: :class:`bs4.BeautifulSoup`
    :param manga_page: MAL manga page's DOM

    :rtype: dict
    :return: manga attributes

    :raises: :class:`.InvalidMangaError`, :class:`.MalformedMangaPageError`
    """
    # if MAL says the series doesn't exist, raise an InvalidMangaError.
    error_tag = manga_page.find(u"div", {"class": "badresult"})
    if error_tag:
        raise InvalidMangaError(self.id)

    try:
        title_tag = manga_page.find(u"div", {"id": "contentWrapper"}).find(u"h1")
        if not title_tag.find(u"div"):
            # otherwise, raise a MalformedMangaPageError.
            raise MalformedMangaPageError(self.id, manga_page, message="Could not find title div")
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    # otherwise, begin parsing.
    manga_info = super(Manga, self).parse_sidebar(manga_page)

    # first <td> of the content table holds the sidebar info panel.
    info_panel_first = manga_page.find(u"div", {"id": "content"}).find(u"table").find(u"td")

    try:
        # "Volumes: 18" -> 18; "Unknown" -> None.
        volumes_tag = info_panel_first.find(text=u"Volumes:").parent.parent
        utilities.extract_tags(volumes_tag.find_all(u"span", {"class": "dark_text"}))
        manga_info[u"volumes"] = int(volumes_tag.text.strip()) if volumes_tag.text.strip() != "Unknown" else None
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        # "Chapters: 162" -> 162; "Unknown" -> None.
        chapters_tag = info_panel_first.find(text=u"Chapters:").parent.parent
        utilities.extract_tags(chapters_tag.find_all(u"span", {"class": "dark_text"}))
        manga_info[u"chapters"] = int(chapters_tag.text.strip()) if chapters_tag.text.strip() != "Unknown" else None
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        published_tag = info_panel_first.find(text=u"Published:").parent.parent
        utilities.extract_tags(published_tag.find_all(u"span", {"class": "dark_text"}))
        published_parts = published_tag.text.strip().split(u" to ")
        if len(published_parts) == 1:
            # this published once.
            try:
                published_date = utilities.parse_profile_date(published_parts[0])
            except ValueError:
                raise MalformedMangaPageError(
                    self.id, published_parts[0],
                    message="Could not parse single publish date"
                )
            manga_info[u"published"] = (published_date,)
        else:
            # two publishing dates.
            try:
                publish_start = utilities.parse_profile_date(published_parts[0])
            except ValueError:
                raise MalformedMangaPageError(
                    self.id, published_parts[0],
                    message="Could not parse first of two publish dates"
                )
            # BUG FIX: previously this compared the *list* `published_parts` to u"?",
            # which is always False, so a still-publishing series ("... to ?") fell
            # through to the date parser and raised. Compare the end-date part instead.
            if published_parts[1] == u"?":
                # this is still publishing.
                publish_end = None
            else:
                try:
                    publish_end = utilities.parse_profile_date(published_parts[1])
                except ValueError:
                    raise MalformedMangaPageError(
                        self.id, published_parts[1],
                        message="Could not parse second of two publish dates"
                    )
            manga_info[u"published"] = (publish_start, publish_end)
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        authors_tag = info_panel_first.find(text=u"Authors:").parent.parent
        utilities.extract_tags(authors_tag.find_all(u"span", {"class": "dark_text"}))
        manga_info[u"authors"] = {}
        for author_link in authors_tag.find_all("a"):
            link_parts = author_link.get("href").split("/")
            # of the form /people/1867/Naoki_Urasawa
            person = self.session.person(int(link_parts[2])).set({"name": author_link.text})
            # role text follows the link, e.g. " (Story & Art)".
            role = author_link.nextSibling.replace(" (", "").replace(")", "")
            manga_info[u"authors"][person] = role
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        serialization_tag = info_panel_first.find(text=u"Serialization:").parent.parent
        publication_link = serialization_tag.find("a")
        manga_info[u"serialization"] = None
        if publication_link:
            link_parts = publication_link.get("href").split("mid=")
            # of the form /manga.php?mid=1
            manga_info[u"serialization"] = self.session.publication(int(link_parts[1])).set(
                {"name": publication_link.text}
            )
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    return manga_info
def parse_sidebar(self, anime_page):
    """Parses the DOM and returns anime attributes in the sidebar.

    :type anime_page: :class:`bs4.BeautifulSoup`
    :param anime_page: MAL anime page's DOM

    :rtype: dict
    :return: anime attributes

    :raises: :class:`.InvalidAnimeError`, :class:`.MalformedAnimePageError`
    """
    # if MAL says the series doesn't exist, raise an InvalidAnimeError.
    error_tag = anime_page.find(u'div', {'class': 'badresult'})
    if error_tag:
        raise InvalidAnimeError(self.id)

    # unlike the other parse_* methods, the title check here is NOT wrapped in a
    # try/except, so a malformed page always raises regardless of
    # suppress_parse_exceptions.
    title_tag = anime_page.find(u'div', {'id': 'contentWrapper'}).find(u'h1')
    if not title_tag.find(u'span'):
        # otherwise, raise a MalformedAnimePageError.
        raise MalformedAnimePageError(self.id, anime_page, message="Could not find title div")

    anime_info = super(Anime, self).parse_sidebar(anime_page)

    # first <td> of the content table holds the sidebar info panel.
    info_panel_first = anime_page.find(u'div', {'id': 'content'}).find(u'table').find(u'td')

    try:
        # "Episodes: 26" -> 26. NOTE: an unknown count is stored as 0 here,
        # whereas the manga parser stores None for "Unknown".
        episode_tag = info_panel_first.find(text=u'Episodes:').parent.parent
        utilities.extract_tags(episode_tag.find_all(u'span', {'class': 'dark_text'}))
        anime_info[u'episodes'] = int(episode_tag.text.strip()) if episode_tag.text.strip() != 'Unknown' else 0
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        # "Aired: <start> to <end>" or a single date for one-off airings.
        aired_tag = info_panel_first.find(text=u'Aired:').parent.parent
        utilities.extract_tags(aired_tag.find_all(u'span', {'class': 'dark_text'}))
        aired_parts = aired_tag.text.strip().split(u' to ')
        if len(aired_parts) == 1:
            # this aired once.
            try:
                aired_date = utilities.parse_profile_date(aired_parts[0], suppress=self.session.suppress_parse_exceptions)
            except ValueError:
                raise MalformedAnimePageError(self.id, aired_parts[0], message="Could not parse single air date")
            # single-date result is a 1-tuple; two-date result is a 2-tuple.
            anime_info[u'aired'] = (aired_date,)
        else:
            # two airing dates.
            try:
                air_start = utilities.parse_profile_date(aired_parts[0], suppress=self.session.suppress_parse_exceptions)
            except ValueError:
                raise MalformedAnimePageError(self.id, aired_parts[0], message="Could not parse first of two air dates")
            try:
                air_end = utilities.parse_profile_date(aired_parts[1], suppress=self.session.suppress_parse_exceptions)
            except ValueError:
                raise MalformedAnimePageError(self.id, aired_parts[1], message="Could not parse second of two air dates")
            anime_info[u'aired'] = (air_start, air_end)
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        producers_tag = info_panel_first.find(text=u'Producers:').parent.parent
        utilities.extract_tags(producers_tag.find_all(u'span', {'class': 'dark_text'}))
        utilities.extract_tags(producers_tag.find_all(u'sup'))
        anime_info[u'producers'] = []
        for producer_link in producers_tag.find_all('a'):
            if producer_link.text == u'add some':
                # MAL is saying "None found, add some".
                break
            link_parts = producer_link.get('href').split('p=')
            # of the form: /anime.php?p=14
            if len(link_parts) > 1:
                anime_info[u'producers'].append(self.session.producer(int(link_parts[1])).set({'name': producer_link.text}))
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        duration_tag = info_panel_first.find(text=u'Duration:').parent.parent
        utilities.extract_tags(duration_tag.find_all(u'span', {'class': 'dark_text'}))
        anime_info[u'duration'] = duration_tag.text.strip()
        # e.g. "1 hr. 55 min." / "24 min. per ep." -> split on '.' and sum the
        # numeric prefixes of the "hr"/"min" segments into total minutes.
        duration_parts = [part.strip() for part in anime_info[u'duration'].split(u'.')]
        duration_mins = 0
        for part in duration_parts:
            part_match = re.match(u'(?P<num>[0-9]+)', part)
            if not part_match:
                continue
            part_volume = int(part_match.group(u'num'))
            if part.endswith(u'hr'):
                duration_mins += part_volume * 60
            elif part.endswith(u'min'):
                duration_mins += part_volume
        # replace the raw text with the computed timedelta.
        anime_info[u'duration'] = datetime.timedelta(minutes=duration_mins)
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        # e.g. "PG-13 - Teens 13 or older" (kept as raw text).
        rating_tag = info_panel_first.find(text=u'Rating:').parent.parent
        utilities.extract_tags(rating_tag.find_all(u'span', {'class': 'dark_text'}))
        anime_info[u'rating'] = rating_tag.text.strip()
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    return anime_info
def parse(self, user_page):
    """Parses the DOM and returns user attributes in the main-content area.

    :type user_page: :class:`bs4.BeautifulSoup`
    :param user_page: MAL user page's DOM

    :rtype: dict
    :return: User attributes.
    """
    user_info = self.parse_sidebar(user_page)

    section_headings = user_page.find_all(u'div', {u'class': u'normal_header'})

    # parse general details.
    # we have to work from the bottom up, since there's broken HTML after every header.
    last_online_elt = user_page.find(u'td', text=u'Last Online')
    if last_online_elt:
        try:
            general_table = last_online_elt.parent.parent
        except:
            if not self.session.suppress_parse_exceptions:
                raise
        if general_table and general_table.name == u'table':
            # each label cell ("Gender", "Birthday", ...) is followed by a value cell;
            # every field is parsed independently so one bad field doesn't lose the rest.
            try:
                last_online_elt = general_table.find(u'td', text=u'Last Online')
                if last_online_elt:
                    user_info[
                        u'last_online'] = utilities.parse_profile_date(
                            last_online_elt.findNext(u'td').text)
            except:
                if not self.session.suppress_parse_exceptions:
                    raise
            try:
                gender = general_table.find(u'td', text=u'Gender')
                if gender:
                    user_info[u'gender'] = gender.findNext(u'td').text
            except:
                if not self.session.suppress_parse_exceptions:
                    raise
            try:
                birthday = general_table.find(u'td', text=u'Birthday')
                if birthday:
                    user_info[u'birthday'] = utilities.parse_profile_date(
                        birthday.findNext(u'td').text)
            except:
                if not self.session.suppress_parse_exceptions:
                    raise
            try:
                location = general_table.find(u'td', text=u'Location')
                if location:
                    user_info[u'location'] = location.findNext(u'td').text
            except:
                if not self.session.suppress_parse_exceptions:
                    raise
            try:
                website = general_table.find(u'td', text=u'Website')
                if website:
                    user_info[u'website'] = website.findNext(u'td').text
            except:
                if not self.session.suppress_parse_exceptions:
                    raise
            try:
                join_date = general_table.find(u'td', text=u'Join Date')
                if join_date:
                    user_info[u'join_date'] = utilities.parse_profile_date(
                        join_date.findNext(u'td').text)
            except:
                if not self.session.suppress_parse_exceptions:
                    raise
            try:
                access_rank = general_table.find(u'td', text=u'Access Rank')
                if access_rank:
                    user_info[u'access_rank'] = access_rank.findNext(
                        u'td').text
            except:
                if not self.session.suppress_parse_exceptions:
                    raise
            try:
                anime_list_views = general_table.find(
                    u'td', text=u'Anime List Views')
                if anime_list_views:
                    # strip thousands separators before converting.
                    user_info[u'anime_list_views'] = int(
                        anime_list_views.findNext(u'td').text.replace(
                            ',', ''))
            except:
                if not self.session.suppress_parse_exceptions:
                    raise
            try:
                manga_list_views = general_table.find(
                    u'td', text=u'Manga List Views')
                if manga_list_views:
                    user_info[u'manga_list_views'] = int(
                        manga_list_views.findNext(u'td').text.replace(
                            ',', ''))
            except:
                if not self.session.suppress_parse_exceptions:
                    raise
            try:
                num_comments = general_table.find(u'td', text=u'Comments')
                if num_comments:
                    user_info[u'num_comments'] = int(
                        num_comments.findNext(u'td').text.replace(',', ''))
            except:
                if not self.session.suppress_parse_exceptions:
                    raise
            try:
                num_forum_posts = general_table.find(u'td', text=u'Forum Posts')
                if num_forum_posts:
                    # value looks like "1,234 (Find All)"; drop the link text first.
                    user_info[u'num_forum_posts'] = int(
                        num_forum_posts.findNext(u'td').text.replace(
                            " (Find All)", "").replace(',', ''))
            except:
                if not self.session.suppress_parse_exceptions:
                    raise

    try:
        # last list updates.
        # NOTE(review): under Python 3, filter() returns an iterator, so the
        # [0] subscript below raises TypeError (silently swallowed by the bare
        # except unless suppress_parse_exceptions is False) — confirm the
        # target interpreter version.
        list_updates_header = filter(
            lambda x: u'Last List Updates' in x.text, section_headings)
        if list_updates_header:
            list_updates_header = list_updates_header[0]
            list_updates_table = list_updates_header.findNext(u'table')
            if list_updates_table:
                user_info[u'last_list_updates'] = {}
                for row in list_updates_table.find_all(u'tr'):
                    cols = row.find_all(u'td')
                    info_col = cols[1]
                    media_link = info_col.find(u'a')
                    link_parts = media_link.get(u'href').split(u'/')
                    # of the form /(anime|manga)/10087/Fate/Zero
                    if link_parts[1] == u'anime':
                        media = self.session.anime(int(link_parts[2])).set(
                            {u'title': media_link.text})
                    else:
                        media = self.session.manga(int(link_parts[2])).set(
                            {u'title': media_link.text})
                    list_update = {}
                    progress_div = info_col.find(
                        u'div', {u'class': u'spaceit_pad'})
                    if progress_div:
                        # e.g. "Watching at 5 of 12"; the "at X of Y" part is optional.
                        progress_match = re.match(
                            r'(?P<status>[A-Za-z]+)( at (?P<episodes>[0-9]+) of (?P<total_episodes>[0-9]+))?',
                            progress_div.text).groupdict()
                        list_update[u'status'] = progress_match[u'status']
                        if progress_match[u'episodes'] is None:
                            list_update[u'episodes'] = None
                        else:
                            list_update[u'episodes'] = int(
                                progress_match[u'episodes'])
                        if progress_match[u'total_episodes'] is None:
                            list_update[u'total_episodes'] = None
                        else:
                            list_update[u'total_episodes'] = int(
                                progress_match[u'total_episodes'])
                    time_div = info_col.find(u'div', {u'class': u'lightLink'})
                    if time_div:
                        list_update[
                            u'time'] = utilities.parse_profile_date(
                                time_div.text)
                    user_info[u'last_list_updates'][media] = list_update
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    lower_section_headings = user_page.find_all(u'h2')
    # anime stats.
    try:
        anime_stats_header = filter(lambda x: u'Anime Stats' in x.text,
                                    lower_section_headings)
        if anime_stats_header:
            anime_stats_header = anime_stats_header[0]
            anime_stats_table = anime_stats_header.findNext(u'table')
            if anime_stats_table:
                user_info[u'anime_stats'] = {}
                for row in anime_stats_table.find_all(u'tr'):
                    cols = row.find_all(u'td')
                    value = cols[1].text
                    # "Days" rows are fractional; everything else is an int count.
                    if cols[1].find(u'span', {u'title': u'Days'}):
                        value = round(float(value), 1)
                    else:
                        value = int(value)
                    user_info[u'anime_stats'][cols[0].text] = value
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        # manga stats.
        manga_stats_header = filter(lambda x: u'Manga Stats' in x.text,
                                    lower_section_headings)
        if manga_stats_header:
            manga_stats_header = manga_stats_header[0]
            manga_stats_table = manga_stats_header.findNext(u'table')
            if manga_stats_table:
                user_info[u'manga_stats'] = {}
                for row in manga_stats_table.find_all(u'tr'):
                    cols = row.find_all(u'td')
                    value = cols[1].text
                    if cols[1].find(u'span', {u'title': u'Days'}):
                        value = round(float(value), 1)
                    else:
                        value = int(value)
                    user_info[u'manga_stats'][cols[0].text] = value
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        about_header = filter(lambda x: u'About' in x.text, section_headings)
        if about_header:
            about_header = about_header[0]
            user_info[u'about'] = about_header.findNext(
                u'div').text.strip()
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    return user_info
def parse_sidebar(self, anime_page, anime_page_original=None):
    """Parses the DOM and returns anime attributes in the sidebar.

    :type anime_page: :class:`bs4.BeautifulSoup`
    :param anime_page: MAL anime page's DOM (cleaned)

    :type anime_page_original: :class:`bs4.BeautifulSoup`
    :param anime_page_original: MAL anime page's DOM uncleaned

    :rtype: dict
    :return: anime attributes

    :raises: :class:`.InvalidAnimeError`, :class:`.MalformedAnimePageError`
    """
    # if MAL says the series doesn't exist, raise an InvalidAnimeError.
    error_tag = anime_page.find(u'div', {'class': 'badresult'})
    if error_tag:
        raise InvalidAnimeError(self.id)

    title_tag = anime_page.find(u'div', {
        'id': 'contentWrapper'
    }).find(u'h1')
    if not title_tag.find(u'div'):
        # otherwise, raise a MalformedAnimePageError.
        # fall back to the newer markup first; only give up if that's absent too.
        try:
            title_tag = anime_page.select('h1.h1 span')[0].text
        except IndexError:
            raise MalformedAnimePageError(
                self.id, None, message="Could not find title div")

    anime_info = super(Anime, self).parse_sidebar(anime_page,
                                                  anime_page_original)
    # first <td> of the content table holds the sidebar info panel.
    info_panel_first = anime_page.find(u'div', {
        'id': 'content'
    }).find(u'table').find(u'td')

    # NOTE(review): the sections below read from anime_page_original, which
    # defaults to None — callers presumably always pass it; an omitted argument
    # would only surface as AttributeErrors swallowed by the bare excepts.
    try:
        # "Episodes: 26" -> 26; unknown count is stored as 0 (not None).
        episode_tag = [
            x for x in anime_page_original.find_all('span')
            if 'Episodes:' in x.text
        ][0].parent
        anime_info[u'episodes'] = int(
            episode_tag.text.split(':')
            [-1].strip()) if episode_tag.text.strip() != 'Unknown' else 0
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        # "Aired: <start> to <end>" or a single date for one-off airings.
        aired_tag = [
            x for x in anime_page_original.find_all('span')
            if 'Aired:' in x.text
        ][0].parent
        aired_tag_text = aired_tag.text.split(':')[1]
        aired_parts = aired_tag_text.strip().split(u' to ')
        if len(aired_parts) == 1:
            # this aired once.
            try:
                aired_date = utilities.parse_profile_date(
                    aired_parts[0],
                    suppress=self.session.suppress_parse_exceptions)
            except ValueError:
                raise MalformedAnimePageError(
                    self.id,
                    aired_parts[0],
                    message="Could not parse single air date")
            # single-date result is a 1-tuple; two-date result is a 2-tuple.
            anime_info[u'aired'] = (aired_date, )
        else:
            # two airing dates.
            try:
                air_start = utilities.parse_profile_date(
                    aired_parts[0],
                    suppress=self.session.suppress_parse_exceptions)
            except ValueError:
                raise MalformedAnimePageError(
                    self.id,
                    aired_parts[0],
                    message="Could not parse first of two air dates")
            try:
                air_end = utilities.parse_profile_date(
                    aired_parts[1],
                    suppress=self.session.suppress_parse_exceptions)
            except ValueError:
                raise MalformedAnimePageError(
                    self.id,
                    aired_parts[1],
                    message="Could not parse second of two air dates")
            anime_info[u'aired'] = (air_start, air_end)
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        # producer parsing is delegated to a helper on this class.
        anime_info[u'producers'] = self.parse_producers(anime_page)
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        duration_tag = [
            x for x in anime_page_original.find_all('span')
            if 'Duration:' in x.text
        ][0].parent
        anime_info[u'duration'] = duration_tag.text.split(':')[1].strip()
        # e.g. "1 hr. 55 min." / "24 min. per ep." -> total minutes.
        duration_parts = [
            part.strip() for part in anime_info[u'duration'].split(u'.')
        ]
        duration_mins = 0
        for part in duration_parts:
            part_match = re.match(u'(?P<num>[0-9]+)', part)
            if not part_match:
                continue
            part_volume = int(part_match.group(u'num'))
            if part.endswith(u'hr'):
                duration_mins += part_volume * 60
            elif part.endswith(u'min'):
                duration_mins += part_volume
        # replace the raw text with the computed timedelta.
        anime_info[u'duration'] = datetime.timedelta(minutes=duration_mins)
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        # e.g. "PG-13 - Teens 13 or older" (kept as raw text).
        rating_tag = [
            x for x in anime_page_original.find_all('span')
            if 'Rating:' in x.text
        ][0].parent
        utilities.extract_tags(
            rating_tag.find_all(u'span', {'class': 'dark_text'}))
        anime_info[u'rating'] = rating_tag.text.strip()
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    return anime_info
def parse(self, user_page):
    """Parses the DOM and returns user attributes in the main-content area.

    :type user_page: :class:`bs4.BeautifulSoup`
    :param user_page: MAL user page's DOM

    :rtype: dict
    :return: User attributes.
    """
    user_info = self.parse_sidebar(user_page)

    section_headings = user_page.find_all(u'div',
                                          {u'class': u'normal_header'})

    # parse general details.
    try:
        # comment count lives in the "All Comments (N)" link text.
        # NOTE(review): the pattern is a non-raw string; '\(' happens to pass
        # through unchanged today but is a deprecated escape — confirm.
        num_comments_tag = user_page.find(u'a',
                                          text=re.compile(u'All Comments'))
        num_comments = re.search(u'\((\d+)\)',
                                 num_comments_tag.text).group(1)
        user_info[u'num_comments'] = int(num_comments)
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    # parse favorites
    favorites_tag = user_page.find(u'div', {u'class': u'user-favorites'})
    if favorites_tag:
        # the favorites panel has four sub-sections in fixed order:
        # [0] anime, [1] manga, [2] characters, [3] people.
        favorites_section = favorites_tag.find_all(u'div', recursive=False)
        try:
            favorite_anime_header = favorites_section[0]
            user_info[u'favorite_anime'] = []
            for elt in favorite_anime_header.find_all(u'li'):
                # second <a> in each item is the titled link.
                link_tag = elt.find_all(u'a')[1]
                link_parts = link_tag.get(u'href').split(u'.net')[1].split(
                    u'/')
                # of the form /anime/467/Ghost_in_the_Shell:_Stand_Alone_Complex
                user_info[u'favorite_anime'].append(
                    self.session.anime(int(link_parts[2])).set(
                        {u'title': link_tag.text}))
        except:
            if not self.session.suppress_parse_exceptions:
                raise
        try:
            favorite_manga_header = favorites_section[1]
            user_info[u'favorite_manga'] = []
            for elt in favorite_manga_header.find_all(u'li'):
                link_tag = elt.find_all(u'a')[1]
                link_parts = link_tag.get(u'href').split(u'.net')[1].split(
                    u'/')
                # of the form /manga/467/Ghost_in_the_Shell:_Stand_Alone_Complex
                user_info[u'favorite_manga'].append(
                    self.session.manga(int(link_parts[2])).set(
                        {u'title': link_tag.text}))
        except:
            if not self.session.suppress_parse_exceptions:
                raise
        try:
            favorite_character_header = favorites_section[2]
            # maps character -> the media (anime or manga) it appears in.
            user_info[u'favorite_characters'] = {}
            for elt in favorite_character_header.find_all(u'li'):
                link_tag = elt.find_all(u'a')[1]
                link_parts = link_tag.get(u'href').split(u'.net')[1].split(
                    u'/')
                # of the form /character/467/Ghost_in_the_Shell:_Stand_Alone_Complex
                char = self.session.character(int(link_parts[2])).set(
                    {u'title': link_tag.text})
                media_link_tag = link_tag.nextSibling.find(u'a')
                media_link_parts = media_link_tag.get(u'href').split(u'/')
                # of the form /anime|manga/467/Ghost_in_the_Shell:_Stand_Alone_Complex
                anime = getattr(self.session, media_link_parts[1])(int(
                    media_link_parts[2])).set(
                        {u'title': media_link_tag.text})
                user_info[u'favorite_characters'][char] = anime
        except:
            if not self.session.suppress_parse_exceptions:
                raise
        try:
            favorite_people_header = favorites_section[3]
            user_info[u'favorite_people'] = []
            for elt in favorite_people_header.find_all(u'li'):
                link_tag = elt.find_all(u'a')[1]
                link_parts = link_tag.get(u'href').split(u'.net')[1].split(
                    u'/')
                # of the form /people/467/Ghost_in_the_Shell:_Stand_Alone_Complex
                user_info[u'favorite_people'].append(
                    self.session.person(int(link_parts[2])).set(
                        {u'title': link_tag.text}))
        except:
            if not self.session.suppress_parse_exceptions:
                raise

    stats_tag = user_page.find(id='statistics')

    try:
        # last list updates.
        # NOTE(review): under Python 3, filter() returns an iterator, so the
        # [0] subscript below raises TypeError (swallowed by the bare except
        # unless suppress_parse_exceptions is False) — confirm interpreter.
        list_updates_header = filter(
            lambda x: u'Last List Updates' in x.text, section_headings)
        if list_updates_header:
            list_updates_header = list_updates_header[0]
            list_updates_table = list_updates_header.findNext(u'table')
            if list_updates_table:
                user_info[u'last_list_updates'] = {}
                for row in list_updates_table.find_all(u'tr'):
                    cols = row.find_all(u'td')
                    info_col = cols[1]
                    media_link = info_col.find(u'a')
                    link_parts = media_link.get(u'href').split(u'/')
                    # of the form /(anime|manga)/10087/Fate/Zero
                    if link_parts[1] == u'anime':
                        media = self.session.anime(int(link_parts[2])).set(
                            {u'title': media_link.text})
                    else:
                        media = self.session.manga(int(link_parts[2])).set(
                            {u'title': media_link.text})
                    list_update = {}
                    progress_div = info_col.find(
                        u'div', {u'class': u'spaceit_pad'})
                    if progress_div:
                        # e.g. "Watching at 5 of 12"; the "at X of Y" part is optional.
                        progress_match = re.match(
                            r'(?P<status>[A-Za-z]+)( at (?P<episodes>[0-9]+) of (?P<total_episodes>[0-9]+))?',
                            progress_div.text).groupdict()
                        list_update[u'status'] = progress_match[u'status']
                        if progress_match[u'episodes'] is None:
                            list_update[u'episodes'] = None
                        else:
                            list_update[u'episodes'] = int(
                                progress_match[u'episodes'])
                        if progress_match[u'total_episodes'] is None:
                            list_update[u'total_episodes'] = None
                        else:
                            list_update[u'total_episodes'] = int(
                                progress_match[u'total_episodes'])
                    time_div = info_col.find(u'div', {u'class': u'lightLink'})
                    if time_div:
                        list_update[
                            u'time'] = utilities.parse_profile_date(
                                time_div.text)
                    user_info[u'last_list_updates'][media] = list_update
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    # anime stats.
    try:
        anime_stats_header = stats_tag.find(u'div',
                                            {u'class': u'stats anime'})
        stats = user_info['anime_stats'] = {}
        # "Days" and "Mean Score" are text siblings of their labels.
        stats['Days'] = float(
            anime_stats_header.find(
                text=re.compile('Days')).parent.nextSibling)
        stats['Mean Score'] = float(
            anime_stats_header.find(
                text=re.compile('Mean Score')).parent.nextSibling)
        stats_tables = anime_stats_header.find_all(u'ul')
        # watching, completed, etc
        for metric in stats_tables[0].find_all(u'li'):
            stats[metric.find(u'a').text] = int(
                metric.find(u'span').text.replace(',', ''))
        # total entries, rewatched, etc
        for metric in stats_tables[1].find_all(u'li'):
            parts = metric.find_all(u'span')
            stats[parts[0].text] = int(parts[1].text.replace(',', ''))
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    # manga stats.
    try:
        manga_stats_header = stats_tag.find(u'div',
                                            {u'class': u'stats manga'})
        stats = user_info['manga_stats'] = {}
        stats['Days'] = float(
            manga_stats_header.find(
                text=re.compile('Days')).parent.nextSibling)
        stats['Mean Score'] = float(
            manga_stats_header.find(
                text=re.compile('Mean Score')).parent.nextSibling)
        stats_tables = manga_stats_header.find_all(u'ul')
        # reading, completed, etc
        for metric in stats_tables[0].find_all(u'li'):
            stats[metric.find(u'a').text] = int(
                metric.find(u'span').text.replace(',', ''))
        # total entries, reread, etc
        for metric in stats_tables[1].find_all(u'li'):
            parts = metric.find_all(u'span')
            stats[parts[0].text] = int(parts[1].text.replace(',', ''))
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        # missing "about" section yields an empty string, not None.
        about_header = user_page.find(u'div',
                                      {u'class': u'profile-about-user'})
        if not about_header:
            user_info[u'about'] = u''
        else:
            user_info[u'about'] = about_header.find(u'div').text.strip()
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    return user_info
def parse_reviews(self, reviews_page):
    """Parse the DOM and returns user reviews attributes.

    :type reviews_page: :class:`bs4.BeautifulSoup`
    :param reviews_page: MAL user reviews page's DOM

    :rtype: dict
    :return: User reviews attributes.
    """
    user_info = self.parse_sidebar(reviews_page)

    # reviews live in the second top-level <td> of the content table.
    second_col = (
        reviews_page
        .find(u'div', {u'id': u'content'})
        .find(u'table')
        .find(u'tr')
        .find_all(u'td', recursive=False)[
            1]
    )

    try:
        user_info['reviews'] = {}
        reviews = second_col.find_all('div', {'class': 'borderDark'}, recursive=False)
        if reviews:
            for row in reviews:
                review_info = {}
                # each review row: first child div = metadata, second = body text.
                try:
                    (meta_elt, review_elt) = row.find_all('div', recursive=False)[0:2]
                except ValueError:
                    raise
                meta_rows = meta_elt.find_all(u'div', recursive=False)

                review_info[u'date'] = utilities.parse_profile_date(meta_rows[0].find(u'div').text)
                media_link = meta_rows[0].find(u'a')
                link_parts = media_link.get(u'href').split(u'/')
                # of the form /(anime|manga)/9760/Hoshi_wo_Ou_Kodomo
                media_id = int(link_parts[2])
                media_type = link_parts[1]
                media = getattr(
                    self.session, media_type
                )(media_id).set({u'title': media_link.text})

                helpfuls = meta_rows[1].find('span', recursive=False)
                try:
                    # old format: "N of M people found this review helpful".
                    hm_reg = r'(?P<people_helped>[0-9]+) of (?P<people_total>[0-9]+)'
                    helpful_match = re.match(hm_reg, helpfuls.text).groupdict()
                    review_info[u'people_helped'] = int(helpful_match[u'people_helped'])
                    review_info[u'people_total'] = int(helpful_match[u'people_total'])
                except AttributeError:
                    # total of people is no longer shown
                    # try another method, not using regex method.
                    # ie: 805 people found this review helpful
                    helpful_match = helpfuls.text.split('people found this review helpful')[0]
                    review_info['people_helped'] = int(helpful_match)
                    # review_info[u'people_total'] = int(helpful_match[u'people_total'])
                    review_info['people_total'] = None

                try:
                    # old format: "N of M episodes seen" (M may be '?').
                    cm_reg = r'(?P<media_consumed>[0-9]+) of (?P<media_total>[0-9?]+)'
                    consumption_match = re.match(cm_reg, meta_rows[2].text).groupdict()
                    review_info[u'media_consumed'] = int(consumption_match[u'media_consumed'])
                    if consumption_match[u'media_total'] == u'?':
                        review_info[u'media_total'] = None
                    else:
                        review_info['media_total'] = int(consumption_match['media_total'])
                except AttributeError:
                    # available format
                    # ie anime: 25 of 25 episodes seen
                    # ie      : 25 of ? episodes seen
                    # ie      : ? episodes
                    # ie manga: 40 chapters
                    # ie      : 60 of ? chapters read
                    # ie      : ? chapters
                    # <div class="lightLink" style="float: right;">24 of 24 episodes seen</div>
                    media_tag = meta_rows[1].find_all('div')[0]
                    if ' episodes' in media_tag.text:
                        user_media_consumption = media_tag.text.split(' episodes')[0].strip()
                    elif ' chapters' in media_tag.text:
                        user_media_consumption = media_tag.text.split(' chapters')[0].strip()
                    else:
                        # no format recognized
                        raise AttributeError
                    # user_media_consumption : 'xx of xx', 'xx of ?', '? of xx', or '?'
                    if 'of' not in user_media_consumption:
                        review_info['media_consumed'] = None
                        review_info['media_total'] = None
                    else:
                        # temp var for variable media_consumed
                        temp_consumed = user_media_consumption.split('of')[0].strip()
                        # temp var for variable media_total
                        temp_total = user_media_consumption.split('of')[1].strip()
                        if temp_consumed == '?':
                            review_info['media_consumed'] = None
                        else:
                            review_info['media_consumed'] = int(temp_consumed)
                        if temp_total == '?':
                            review_info['media_total'] = None
                        else:
                            review_info['media_total'] = int(temp_total)

                # e.g. "Overall Rating: 9Other review..." -> 9.
                rating_txt = meta_rows[2].text.replace(u'Overall Rating: ', '')
                rating_txt = rating_txt.split('Other review')[0]
                review_info[u'rating'] = int(rating_txt)

                # strip nested divs/links so only the review text remains.
                for x in review_elt.find_all(['div', 'a']):
                    x.extract()
                try:
                    review_info[u'text'] = review_elt.text.strip()
                except AttributeError:
                    # sometime reviw_elt cant produce attribute error
                    # one of the solution is to reparse the tag
                    review_info[u'text'] = BeautifulSoup(str(review_elt), "lxml").text.strip()
                user_info['reviews'][media] = review_info
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    return user_info
def parse_sidebar(self, anime_page, anime_page_original=None):
    """Parse the DOM and returns anime attributes in the sidebar.

    :type anime_page: :class:`bs4.BeautifulSoup`
    :param anime_page: MAL anime page's DOM

    :type anime_page_original: :class:`bs4.BeautifulSoup`
    :param anime_page_original: MAL anime page's DOM uncleaned

    :rtype: dict
    :return: anime attributes

    :raises: :class:`.InvalidAnimeError`, :class:`.MalformedAnimePageError`
    """
    # if MAL says the series doesn't exist, raise an InvalidAnimeError.
    error_tag = anime_page.find('div', {'class': 'badresult'})
    if error_tag:
        raise InvalidAnimeError(self.id)

    title_tag = anime_page.find('div', {'id': 'contentWrapper'}).find('h1')
    if not title_tag.find('div'):
        # fall back to the 'h1.h1 span' layout; if that selector matches
        # nothing either, the page is malformed.
        # otherwise, raise a MalformedAnimePageError.
        try:
            title_tag = anime_page.select('h1.h1 span')[0].text
        except IndexError:
            raise MalformedAnimePageError(self.id, None,
                                          message="Could not find title div")

    # collect the attributes shared by all media types (title, picture, etc.)
    # before adding the anime-specific ones below.
    anime_info = super(Anime, self).parse_sidebar(anime_page,
                                                  anime_page_original)

    # 'info_panel_first' not used
    # info_panel_first = anime_page.find(u'div', {'id': 'content'}).find(u'table').find(u'td')

    try:
        anime_info['episodes'] = self._parse_episodes(anime_page)
    except:
        # NOTE: bare except is this file's convention — re-raise unless the
        # session asks to suppress parse errors.
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        # locate the "Aired:" row in the uncleaned DOM; its text looks like
        # "Aired: <start>" or "Aired: <start> to <end>".
        aired_tag = [x for x in anime_page_original.find_all('span')
                     if 'Aired:' in x.text][0].parent
        aired_tag_text = aired_tag.text.split(':')[1]
        aired_parts = aired_tag_text.strip().split(' to ')
        suppress_parse_exceptions = self.session.suppress_parse_exceptions
        if len(aired_parts) == 1:
            # this aired once.
            try:
                aired_date = parse_profile_date(
                    aired_parts[0], suppress=suppress_parse_exceptions)
            except ValueError:
                err_msg = "Could not parse single air date"
                raise MalformedAnimePageError(self.id, aired_parts[0],
                                              message=err_msg)
            # single-date airing is stored as a 1-tuple.
            anime_info['aired'] = (aired_date,)
        else:
            # two airing dates.
            try:
                air_start = parse_profile_date(
                    aired_parts[0], suppress=suppress_parse_exceptions)
            except ValueError:
                raise MalformedAnimePageError(
                    self.id, aired_parts[0],
                    message="Could not parse first of two air dates")
            try:
                air_end = parse_profile_date(
                    aired_parts[1], suppress=suppress_parse_exceptions)
            except ValueError:
                error_msg = "Could not parse second of two air dates"
                raise MalformedAnimePageError(self.id, aired_parts[1],
                                              message=error_msg)
            anime_info['aired'] = (air_start, air_end)
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        anime_info['producers'] = self.parse_producers(anime_page)
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        # "Duration:" text, e.g. "24 min. per ep." or "1 hr. 30 min.";
        # split on '.' and sum the 'hr'/'min' parts into total minutes.
        duration_tag = [x for x in anime_page_original.find_all('span')
                        if 'Duration:' in x.text][0].parent
        anime_info['duration'] = duration_tag.text.split(':')[1].strip()
        duration_parts = [part.strip()
                          for part in anime_info['duration'].split('.')]
        duration_mins = 0
        for part in duration_parts:
            part_match = re.match('(?P<num>[0-9]+)', part)
            if not part_match:
                # non-numeric fragments like "per ep" are skipped.
                continue
            part_volume = int(part_match.group('num'))
            if part.endswith('hr'):
                duration_mins += part_volume * 60
            elif part.endswith('min'):
                duration_mins += part_volume
        # replace the raw text with the computed timedelta.
        anime_info['duration'] = datetime.timedelta(minutes=duration_mins)
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        rating_tag = [x for x in anime_page_original.find_all('span')
                      if 'Rating:' in x.text][0].parent
        # extract_tags strips the label spans in place so .text below is
        # only the rating value.
        utilities.extract_tags(
            rating_tag.find_all('span', {'class': 'dark_text'}))
        anime_info['rating'] = rating_tag.text.strip()
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    return anime_info
def parse_sidebar(self, manga_page, manga_page_original=None):
    """Parses the DOM and returns manga attributes in the sidebar.

    :type manga_page: :class:`bs4.BeautifulSoup`
    :param manga_page: MAL manga page's DOM

    :type manga_page_original: :class:`bs4.BeautifulSoup`
    :param manga_page_original: MAL manga page's DOM uncleaned

    :rtype: dict
    :return: manga attributes

    :raises: :class:`.InvalidMangaError`, :class:`.MalformedMangaPageError`
    """
    # if MAL says the series doesn't exist, raise an InvalidMangaError.
    error_tag = manga_page.find(u'div', {'class': 'badresult'})
    if error_tag:
        raise InvalidMangaError(self.id)

    try:
        title_tag = manga_page.find(u'span', {'itemprop': 'name'})
        if not title_tag:
            # otherwise, raise a MalformedMangaPageError.
            raise MalformedMangaPageError(
                self.id, manga_page, message="Could not find title")
    except:
        # NOTE: bare except is this file's convention — re-raise unless the
        # session asks to suppress parse errors.
        if not self.session.suppress_parse_exceptions:
            raise

    # otherwise, begin parsing: shared media attributes first.
    manga_info = super(Manga, self).parse_sidebar(manga_page,
                                                  manga_page_original)

    info_panel_first = manga_page.find(u'div', {
        'id': 'content'
    }).find(u'table').find(u'td')

    try:
        # "Volumes:" row; extract_tags strips the label span in place so
        # .text below is only the value. 'Unknown' maps to None.
        volumes_tag = info_panel_first.find(text=u'Volumes:').parent.parent
        utilities.extract_tags(
            volumes_tag.find_all(u'span', {'class': 'dark_text'}))
        manga_info[u'volumes'] = int(volumes_tag.text.strip(
        )) if volumes_tag.text.strip() != 'Unknown' else None
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        # "Chapters:" row, same shape as volumes.
        chapters_tag = info_panel_first.find(
            text=u'Chapters:').parent.parent
        utilities.extract_tags(
            chapters_tag.find_all(u'span', {'class': 'dark_text'}))
        manga_info[u'chapters'] = int(chapters_tag.text.strip(
        )) if chapters_tag.text.strip() != 'Unknown' else None
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        # "Published:" row: either "<date>" or "<start> to <end>", where
        # the end date is '?' while the manga is still publishing.
        published_tag = info_panel_first.find(
            text=u'Published:').parent.parent
        utilities.extract_tags(
            published_tag.find_all(u'span', {'class': 'dark_text'}))
        published_parts = published_tag.text.strip().split(u' to ')
        if len(published_parts) == 1:
            # this published once; stored as a 1-tuple.
            try:
                published_date = utilities.parse_profile_date(
                    published_parts[0])
            except ValueError:
                raise MalformedMangaPageError(
                    self.id,
                    published_parts[0],
                    message="Could not parse single publish date")
            manga_info[u'published'] = (published_date, )
        else:
            # two publishing dates.
            try:
                publish_start = utilities.parse_profile_date(
                    published_parts[0])
            except ValueError:
                raise MalformedMangaPageError(
                    self.id,
                    published_parts[0],
                    message="Could not parse first of two publish dates")
            # BUG FIX: previously compared the whole list
            # (published_parts == u'?'), which is always False, so
            # still-publishing manga hit parse_profile_date('?') and were
            # reported as malformed. Compare the end-date part instead.
            if published_parts[1] == u'?':
                # this is still publishing.
                publish_end = None
            else:
                try:
                    publish_end = utilities.parse_profile_date(
                        published_parts[1])
                except ValueError:
                    raise MalformedMangaPageError(
                        self.id,
                        published_parts[1],
                        message=
                        "Could not parse second of two publish dates")
            manga_info[u'published'] = (publish_start, publish_end)
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        # "Authors:" row: each <a> links a person; the trailing sibling
        # text holds their role, e.g. " (Story & Art)".
        authors_tag = info_panel_first.find(text=u'Authors:').parent.parent
        utilities.extract_tags(
            authors_tag.find_all(u'span', {'class': 'dark_text'}))
        manga_info[u'authors'] = {}
        for author_link in authors_tag.find_all('a'):
            link_parts = author_link.get('href').split('/')
            # of the form /people/1867/Naoki_Urasawa
            person = self.session.person(int(link_parts[2])).set(
                {'name': author_link.text})
            role = author_link.nextSibling.replace(' (', '').replace(')',
                                                                     '')
            manga_info[u'authors'][person] = role
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        manga_info[u'serialization'] = self.parse_serialization(manga_page)
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    return manga_info