def parse_synopsis(self, media_page):
    """Parse the DOM and returns media synopsis.

    :type media_page: :class:`bs4.BeautifulSoup`
    :param media_page: MAL media page's DOM

    :rtype: string
    :return: media synopsis.
    """
    synopsis_elt = [x for x in media_page.find_all('h2')
                    if "Synopsis" in x.text][0].parent
    # collect the text between two h2-tags
    temp_synopsis_elt = []
    for x in synopsis_elt.contents[1:]:
        # isinstance instead of `type(x) ==` (idiomatic and subclass-safe)
        if isinstance(x, bs4.element.Tag):
            if x.name == 'h2':
                # reached the next section header; stop collecting.
                break
            temp_synopsis_elt.append(x.text)
        else:
            # bare NavigableString between tags
            temp_synopsis_elt.append(x)
    # NOTE: result is a plain string here. The original wrapped it in a
    # try/except that called .find_all on it, which always raised
    # AttributeError — that dead branch has been removed; the effective
    # behavior (return the joined, unstripped text) is unchanged.
    result = ''.join(temp_synopsis_elt)
    if result == '':
        # fall back to the itemprop="description" span
        rs_tag = [xx for xx in media_page.select('span')
                  if xx.get('itemprop') == 'description'][0]
        result = rs_tag.text
    return result
def parse(self, media_page):
    """Parses the DOM and returns media attributes in the main-content area.

    :type media_page: :class:`bs4.BeautifulSoup`
    :param media_page: MAL media page's DOM

    :rtype: dict
    :return: media attributes.
    """
    media_info = self.parse_sidebar(media_page)

    try:
        synopsis_tag = media_page.find(u'span', {'itemprop': 'description'})
        utilities.extract_tags([synopsis_tag])
        media_info[u'synopsis'] = synopsis_tag.text.strip()
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        related_title = media_page.find(
            text=re.compile(u'Related ' + self.__class__.__name__))
        if not related_title:
            media_info[u'related'] = None
        else:
            related_table = related_title.parent.next_sibling
            utilities.extract_tags([related_table])
            related = {}
            # one table row per relation category (e.g. "Sequel", "Adaptation")
            for category_row in related_table.find_all('tr'):
                relation = category_row.find('td').text.strip(':')
                related[relation] = []
                for title_link in category_row.find_all('a'):
                    href = title_link.get(u'href').replace(
                        u'http://myanimelist.net', '')
                    if not re.match(r'/(anime|manga)', href):
                        break
                    title = title_link.text
                    # of the form: /(anime|manga)/1/Cowboy_Bebop
                    href_parts = href.split(u'/')
                    # sometimes links on MAL are broken, of the form /anime//
                    if href_parts[2] == '':
                        continue
                    media_obj = getattr(self.session, href_parts[1])(
                        int(href_parts[2])).set({'title': title})
                    related[relation].append(media_obj)
            media_info[u'related'] = related
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    return media_info
def parse_recommendations(self, recommendations_page):
    """Parse the DOM and returns user recommendations attributes.

    :type recommendations_page: :class:`bs4.BeautifulSoup`
    :param recommendations_page: MAL user recommendations page's DOM

    :rtype: dict
    :return: User recommendations attributes.
    """
    user_info = self.parse_sidebar(recommendations_page)

    # recommendations live in the second top-level cell of the content table.
    second_col = (
        recommendations_page
        .find(u'div', {u'id': u'content'})
        .find(u'table')
        .find(u'tr')
        .find_all(u'td', recursive=False)[1]
    )

    try:
        rec_rows = second_col.find_all("div", {"class": "spaceit borderClass"})
        if rec_rows:
            user_info['recommendations'] = {}
            # skip the first div: it is a header, not a recommendation.
            for rec_row in rec_rows[1:]:
                media_cells = rec_row.find(u'table').find_all(u'td')

                # liked media: link of the form /anime|manga/64/Rozen_Maiden
                liked_link = media_cells[0].find(u'a', recursive=False)
                link_parts = liked_link.get(u'href').split(u'/')
                liked_media = getattr(self.session, link_parts[1])(
                    int(link_parts[2])).set({u'title': liked_link.text})

                # recommended media: same URL shape as above.
                rec_link = media_cells[1].find(u'a', recursive=False)
                link_parts = rec_link.get(u'href').split(u'/')
                recommended_media = getattr(self.session, link_parts[1])(
                    int(link_parts[2])).set({u'title': rec_link.text})

                # free-text blurb plus the date from the row's menu div.
                recommendation_text = rec_row.find(u'p').text
                recommendation_menu = rec_row.find(u'div', recursive=False)
                utilities.extract_tags(recommendation_menu)
                recommendation_date = utilities.parse_profile_date(
                    recommendation_menu.text.split(u' - ')[1])

                user_info['recommendations'][liked_media] = {
                    link_parts[1]: recommended_media,
                    'text': recommendation_text,
                    'date': recommendation_date,
                }
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    return user_info
def parse_recommendations(self, recommendations_page):
    """Parses the DOM and returns user recommendations attributes.

    :type recommendations_page: :class:`bs4.BeautifulSoup`
    :param recommendations_page: MAL user recommendations page's DOM

    :rtype: dict
    :return: User recommendations attributes.
    """
    user_info = self.parse_sidebar(recommendations_page)

    content_div = recommendations_page.find(u'div', {u'id': u'content'})
    second_col = content_div.find(u'table').find(u'tr').find_all(
        u'td', recursive=False)[1]

    try:
        rows = second_col.find_all(u"div", {u"class": u"spaceit borderClass"})
        if rows:
            user_info[u'recommendations'] = {}
            for row in rows[1:]:  # first div is a header, not a row
                cells = row.find(u'table').find_all(u'td')

                # liked media: href of the form /anime|manga/64/Rozen_Maiden
                liked_link = cells[0].find(u'a', recursive=False)
                parts = liked_link.get(u'href').split(u'/')
                liked_media = getattr(self.session, parts[1])(
                    int(parts[2])).set({u'title': liked_link.text})

                # recommended media: same href shape
                rec_link = cells[1].find(u'a', recursive=False)
                parts = rec_link.get(u'href').split(u'/')
                recommended_media = getattr(self.session, parts[1])(
                    int(parts[2])).set({u'title': rec_link.text})

                recommendation_text = row.find(u'p').text
                menu = row.find(u'div', recursive=False)
                utilities.extract_tags(menu)
                recommendation_date = utilities.parse_profile_date(
                    menu.text.split(u' - ')[1])

                user_info[u'recommendations'][liked_media] = {
                    parts[1]: recommended_media,
                    'text': recommendation_text,
                    'date': recommendation_date,
                }
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    return user_info
def parse_synopsis(self, media_page):
    """Parse the DOM and returns media synopsis.

    :type media_page: :class:`bs4.BeautifulSoup`
    :param media_page: MAL media page's DOM

    :rtype: string
    :return: media synopsis.
    """
    synopsis_elt = [
        x for x in media_page.find_all(u'h2') if "Synopsis" in x.text
    ][0].parent
    # collect the text between two h2-tags
    temp_synopsis_elt = []
    for x in synopsis_elt.contents[1:]:
        # isinstance instead of `type(x) ==` (idiomatic and subclass-safe)
        if isinstance(x, bs4.element.Tag):
            if x.name == 'h2':
                # reached the next section header; stop collecting.
                break
            temp_synopsis_elt.append(x.text)
        else:
            # bare NavigableString between tags
            temp_synopsis_elt.append(x)
    # NOTE: result is a plain string here. The original wrapped it in a
    # try/except that called .find_all on it, which always raised
    # AttributeError — that dead branch has been removed; the effective
    # behavior (return the joined, unstripped text) is unchanged.
    result = ''.join(temp_synopsis_elt)
    if result == '':
        # fall back to the itemprop="description" span
        rs_tag = [
            xx for xx in media_page.select('span')
            if xx.get('itemprop') == 'description'
        ][0]
        result = rs_tag.text
    return result
def parse(self, media_page):
    """Parses the DOM and returns media attributes in the main-content area.

    :type media_page: :class:`bs4.BeautifulSoup`
    :param media_page: MAL media page's DOM

    :rtype: dict
    :return: media attributes.
    """
    media_info = self.parse_sidebar(media_page)

    try:
        synopsis_elt = media_page.find(u'h2', text=u'Synopsis').parent
        utilities.extract_tags(synopsis_elt.find_all(u'h2'))
        media_info[u'synopsis'] = synopsis_elt.text.strip()
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        related_title = media_page.find(
            u'h2', text=u'Related ' + self.__class__.__name__)
        if related_title:
            related_elt = related_title.parent
            utilities.extract_tags(related_elt.find_all(u'h2'))
            related = {}
            for link in related_elt.find_all(u'a'):
                href = link.get(u'href').replace(u'http://myanimelist.net',
                                                 '')
                if not re.match(r'/(anime|manga)', href):
                    break
                curr_elt = link.previous_sibling
                if curr_elt is None:
                    # we've reached the end of the list.
                    break
                # walk backwards to find the "<Type>:" label preceding
                # this link; that string names the relation category.
                related_type = None
                while True:
                    if not curr_elt:
                        raise MalformedAnimePageError(
                            self.id,
                            related_elt,
                            message=
                            "Prematurely reached end of related anime listing"
                        )
                    if isinstance(curr_elt, bs4.NavigableString):
                        # FIX: raw string literal — the original non-raw
                        # pattern contained invalid escape sequences
                        # ("\ ", "\-"); the regex bytes are unchanged.
                        type_match = re.match(r'(?P<type>[a-zA-Z\ \-]+):',
                                              curr_elt)
                        if type_match:
                            related_type = type_match.group(u'type')
                            break
                    curr_elt = curr_elt.previous_sibling
                title = link.text
                # parse link: may be manga or anime.
                href_parts = href.split(u'/')
                # sometimes links on MAL are broken, of the form /anime//
                if href_parts[2] == '':
                    continue
                # of the form: /(anime|manga)/1/Cowboy_Bebop
                obj_id = int(href_parts[2])
                new_obj = getattr(self.session, href_parts[1])(obj_id).set(
                    {'title': title})
                if related_type not in related:
                    related[related_type] = [new_obj]
                else:
                    related[related_type].append(new_obj)
            media_info[u'related'] = related
        else:
            media_info[u'related'] = None
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    return media_info
def parse_sidebar(self, manga_page, manga_page_original=None):
    """Parses the DOM and returns manga attributes in the sidebar.

    :type manga_page: :class:`bs4.BeautifulSoup`
    :param manga_page: MAL manga page's DOM

    :type manga_page_original: :class:`bs4.BeautifulSoup`
    :param manga_page_original: MAL manga page's DOM, uncleaned

    :rtype: dict
    :return: manga attributes

    :raises: :class:`.InvalidMangaError`, :class:`.MalformedMangaPageError`
    """
    # if MAL says the series doesn't exist, raise an InvalidMangaError.
    error_tag = manga_page.find(u'div', {'class': 'badresult'})
    if error_tag:
        raise InvalidMangaError(self.id)

    try:
        title_tag = manga_page.find(u'span', {'itemprop': 'name'})
        if not title_tag:
            # otherwise, raise a MalformedMangaPageError.
            raise MalformedMangaPageError(self.id,
                                          manga_page,
                                          message="Could not find title")
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    # otherwise, begin parsing.
    manga_info = super(Manga, self).parse_sidebar(manga_page,
                                                  manga_page_original)

    info_panel_first = manga_page.find(u'div', {
        'id': 'content'
    }).find(u'table').find(u'td')

    try:
        volumes_tag = info_panel_first.find(text=u'Volumes:').parent.parent
        utilities.extract_tags(
            volumes_tag.find_all(u'span', {'class': 'dark_text'}))
        manga_info[u'volumes'] = int(volumes_tag.text.strip(
        )) if volumes_tag.text.strip() != 'Unknown' else None
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        chapters_tag = info_panel_first.find(
            text=u'Chapters:').parent.parent
        utilities.extract_tags(
            chapters_tag.find_all(u'span', {'class': 'dark_text'}))
        manga_info[u'chapters'] = int(chapters_tag.text.strip(
        )) if chapters_tag.text.strip() != 'Unknown' else None
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        published_tag = info_panel_first.find(
            text=u'Published:').parent.parent
        utilities.extract_tags(
            published_tag.find_all(u'span', {'class': 'dark_text'}))
        published_parts = published_tag.text.strip().split(u' to ')
        if len(published_parts) == 1:
            # this published once.
            try:
                published_date = utilities.parse_profile_date(
                    published_parts[0])
            except ValueError:
                raise MalformedMangaPageError(
                    self.id,
                    published_parts[0],
                    message="Could not parse single publish date")
            manga_info[u'published'] = (published_date, )
        else:
            # two publishing dates.
            try:
                publish_start = utilities.parse_profile_date(
                    published_parts[0])
            except ValueError:
                raise MalformedMangaPageError(
                    self.id,
                    published_parts[0],
                    message="Could not parse first of two publish dates")
            # FIX: compare the second date component — the original
            # compared the whole list (`published_parts == u'?'`), which
            # is always False, so an open-ended range ("... to ?") raised
            # instead of yielding publish_end = None.
            if published_parts[1] == u'?':
                # this is still publishing.
                publish_end = None
            else:
                try:
                    publish_end = utilities.parse_profile_date(
                        published_parts[1])
                except ValueError:
                    raise MalformedMangaPageError(
                        self.id,
                        published_parts[1],
                        message=
                        "Could not parse second of two publish dates")
            manga_info[u'published'] = (publish_start, publish_end)
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        authors_tag = info_panel_first.find(text=u'Authors:').parent.parent
        utilities.extract_tags(
            authors_tag.find_all(u'span', {'class': 'dark_text'}))
        manga_info[u'authors'] = {}
        for author_link in authors_tag.find_all('a'):
            link_parts = author_link.get('href').split('/')
            # of the form /people/1867/Naoki_Urasawa
            person = self.session.person(int(link_parts[2])).set(
                {'name': author_link.text})
            role = author_link.nextSibling.replace(' (', '').replace(')', '')
            manga_info[u'authors'][person] = role
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        manga_info[u'serialization'] = self.parse_serialization(manga_page)
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    return manga_info
def parse_sidebar(self, media_page):
    """Parses the DOM and returns media attributes in the sidebar.

    :type media_page: :class:`bs4.BeautifulSoup`
    :param media_page: MAL media page's DOM

    :rtype: dict
    :return: media attributes.

    :raises: InvalidMediaError, MalformedMediaPageError
    """
    media_info = {}

    # if MAL says the series doesn't exist, raise an InvalidMediaError.
    if media_page.find(u'div', {'class': 'badresult'}):
        raise InvalidMediaError(self.id)

    try:
        title_tag = media_page.find(u'div', {'id': 'contentWrapper'}).find(u'h1')
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        media_info[u'title'] = title_tag.text.strip()
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    # first cell of the content table holds the sidebar info panel.
    info_panel_first = media_page.find(u'div', {'id': 'content'}) \
                                 .find(u'table').find(u'td')

    try:
        picture_tag = info_panel_first.find(u'img')
        media_info[u'picture'] = picture_tag.get(u'src').decode('utf-8')
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        # assemble alternative titles for this series.
        media_info[u'alternative_titles'] = {}
        alt_titles_header = info_panel_first.find(u'h2',
                                                  text=u'Alternative Titles')
        if alt_titles_header:
            next_tag = alt_titles_header.find_next_sibling(
                u'div', {'class': 'spaceit_pad'})
            # walk sibling divs until one is missing the language span.
            while next_tag is not None and next_tag.find(
                    u'span', {'class': 'dark_text'}):
                # get language and remove the node.
                language = next_tag.find(u'span').text[:-1]
                utilities.extract_tags(
                    next_tag.find_all(u'span', {'class': 'dark_text'}))
                media_info[u'alternative_titles'][language] = \
                    next_tag.text.strip().split(u', ')
                next_tag = next_tag.find_next_sibling(
                    u'div', {'class': 'spaceit_pad'})
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        type_tag = info_panel_first.find(text=u'Type:').parent.parent
        utilities.extract_tags(
            type_tag.find_all(u'span', {'class': 'dark_text'}))
        media_info[u'type'] = type_tag.text.strip()
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        status_tag = info_panel_first.find(text=u'Status:').parent.parent
        utilities.extract_tags(
            status_tag.find_all(u'span', {'class': 'dark_text'}))
        media_info[u'status'] = status_tag.text.strip()
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        genres_tag = info_panel_first.find(text=u'Genres:').parent.parent
        utilities.extract_tags(
            genres_tag.find_all(u'span', {'class': 'dark_text'}))
        media_info[u'genres'] = [g.text for g in genres_tag.find_all('a')]
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        # grab statistics for this media.
        score_tag = info_panel_first.find(text=u'Score:').parent.parent
        spans = score_tag.find_all('span')
        # third span holds the vote count, second the average rating.
        num_users = int(spans[2].text.replace(',', ''))
        media_info[u'score'] = (float(spans[1].text), num_users)
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        rank_tag = info_panel_first.find(text=u'Ranked:').parent.parent
        utilities.extract_tags(rank_tag.find_all())
        # text is of the form "#1,234" — drop the '#' and thousands commas.
        media_info[u'rank'] = int(
            rank_tag.text.strip()[1:].replace(u',', ''))
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        popularity_tag = info_panel_first.find(
            text=u'Popularity:').parent.parent
        utilities.extract_tags(popularity_tag.find_all())
        media_info[u'popularity'] = int(
            popularity_tag.text.strip()[1:].replace(u',', ''))
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        members_tag = info_panel_first.find(text=u'Members:').parent.parent
        utilities.extract_tags(members_tag.find_all())
        media_info[u'members'] = int(
            members_tag.text.strip().replace(u',', ''))
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        favorites_tag = info_panel_first.find(
            text=u'Favorites:').parent.parent
        utilities.extract_tags(favorites_tag.find_all())
        media_info[u'favorites'] = int(
            favorites_tag.text.strip().replace(u',', ''))
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        # get popular tags.
        tags_header = media_page.find(u'h2', text=u'Popular Tags')
        tags_tag = tags_header.find_next_sibling(u'span')
        media_info[u'popular_tags'] = {}
        for tag_link in tags_tag.find_all('a'):
            tag = self.session.tag(tag_link.text)
            # the link title is of the form "N people"
            num_people = int(
                re.match(r'(?P<people>[0-9]+) people',
                         tag_link.get('title')).group('people'))
            media_info[u'popular_tags'][tag] = num_people
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    return media_info
def parse_sidebar(self, anime_page, anime_page_original=None):
    """Parses the DOM and returns anime attributes in the sidebar.

    :type anime_page: :class:`bs4.BeautifulSoup`
    :param anime_page: MAL anime page's DOM

    :type anime_page_original: :class:`bs4.BeautifulSoup`
    :param anime_page_original: MAL anime page's DOM, uncleaned

    :rtype: dict
    :return: anime attributes

    :raises: :class:`.InvalidAnimeError`, :class:`.MalformedAnimePageError`
    """
    # if MAL says the series doesn't exist, raise an InvalidAnimeError.
    error_tag = anime_page.find(u'div', {'class': 'badresult'})
    if error_tag:
        raise InvalidAnimeError(self.id)

    title_tag = anime_page.find(u'div', {
        'id': 'contentWrapper'
    }).find(u'h1')
    if not title_tag.find(u'div'):
        # otherwise, raise a MalformedAnimePageError.
        try:
            title_tag = anime_page.select('h1.h1 span')[0].text
        except IndexError:
            raise MalformedAnimePageError(
                self.id, None, message="Could not find title div")

    anime_info = super(Anime, self).parse_sidebar(anime_page,
                                                  anime_page_original)

    info_panel_first = anime_page.find(u'div', {
        'id': 'content'
    }).find(u'table').find(u'td')

    try:
        episode_tag = [
            x for x in anime_page_original.find_all('span')
            if 'Episodes:' in x.text
        ][0].parent
        # FIX: compare the value after "Episodes:", not the whole tag text.
        # The original tested `episode_tag.text.strip() != 'Unknown'`, which
        # can never match ("Episodes: Unknown"), so int('Unknown') raised
        # and 'episodes' was silently dropped instead of defaulting to 0.
        episodes_text = episode_tag.text.split(':')[-1].strip()
        anime_info[u'episodes'] = int(
            episodes_text) if episodes_text != 'Unknown' else 0
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        aired_tag = [
            x for x in anime_page_original.find_all('span')
            if 'Aired:' in x.text
        ][0].parent
        aired_tag_text = aired_tag.text.split(':')[1]
        aired_parts = aired_tag_text.strip().split(u' to ')
        if len(aired_parts) == 1:
            # this aired once.
            try:
                aired_date = utilities.parse_profile_date(
                    aired_parts[0],
                    suppress=self.session.suppress_parse_exceptions)
            except ValueError:
                raise MalformedAnimePageError(
                    self.id,
                    aired_parts[0],
                    message="Could not parse single air date")
            anime_info[u'aired'] = (aired_date, )
        else:
            # two airing dates.
            try:
                air_start = utilities.parse_profile_date(
                    aired_parts[0],
                    suppress=self.session.suppress_parse_exceptions)
            except ValueError:
                raise MalformedAnimePageError(
                    self.id,
                    aired_parts[0],
                    message="Could not parse first of two air dates")
            try:
                air_end = utilities.parse_profile_date(
                    aired_parts[1],
                    suppress=self.session.suppress_parse_exceptions)
            except ValueError:
                raise MalformedAnimePageError(
                    self.id,
                    aired_parts[1],
                    message="Could not parse second of two air dates")
            anime_info[u'aired'] = (air_start, air_end)
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        anime_info[u'producers'] = self.parse_producers(anime_page)
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        duration_tag = [
            x for x in anime_page_original.find_all('span')
            if 'Duration:' in x.text
        ][0].parent
        anime_info[u'duration'] = duration_tag.text.split(':')[1].strip()
        # duration reads e.g. "24 min. per ep." — sum hr/min components.
        duration_parts = [
            part.strip() for part in anime_info[u'duration'].split(u'.')
        ]
        duration_mins = 0
        for part in duration_parts:
            part_match = re.match(u'(?P<num>[0-9]+)', part)
            if not part_match:
                continue
            part_volume = int(part_match.group(u'num'))
            if part.endswith(u'hr'):
                duration_mins += part_volume * 60
            elif part.endswith(u'min'):
                duration_mins += part_volume
        anime_info[u'duration'] = datetime.timedelta(minutes=duration_mins)
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        rating_tag = [
            x for x in anime_page_original.find_all('span')
            if 'Rating:' in x.text
        ][0].parent
        utilities.extract_tags(
            rating_tag.find_all(u'span', {'class': 'dark_text'}))
        anime_info[u'rating'] = rating_tag.text.strip()
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    return anime_info
def parse_sidebar(self, anime_page, anime_page_original=None):
    """Parse the DOM and returns anime attributes in the sidebar.

    :type anime_page: :class:`bs4.BeautifulSoup`
    :param anime_page: MAL anime page's DOM

    :type anime_page_original: :class:`bs4.BeautifulSoup`
    :param anime_page_original: MAL anime page's DOM uncleaned

    :rtype: dict
    :return: anime attributes

    :raises: :class:`.InvalidAnimeError`, :class:`.MalformedAnimePageError`
    """
    # if MAL says the series doesn't exist, raise an InvalidAnimeError.
    if anime_page.find('div', {'class': 'badresult'}):
        raise InvalidAnimeError(self.id)

    title_tag = anime_page.find('div', {'id': 'contentWrapper'}).find('h1')
    if not title_tag.find('div'):
        # otherwise, raise a MalformedAnimePageError.
        try:
            title_tag = anime_page.select('h1.h1 span')[0].text
        except IndexError:
            raise MalformedAnimePageError(self.id, None,
                                          message="Could not find title div")

    anime_info = super(Anime, self).parse_sidebar(anime_page,
                                                  anime_page_original)

    try:
        anime_info['episodes'] = self._parse_episodes(anime_page)
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        aired_tag = [x for x in anime_page_original.find_all('span')
                     if 'Aired:' in x.text][0].parent
        aired_parts = aired_tag.text.split(':')[1].strip().split(' to ')
        suppress = self.session.suppress_parse_exceptions
        if len(aired_parts) == 1:
            # this aired once.
            try:
                aired_date = parse_profile_date(aired_parts[0],
                                                suppress=suppress)
            except ValueError:
                raise MalformedAnimePageError(
                    self.id, aired_parts[0],
                    message="Could not parse single air date")
            anime_info['aired'] = (aired_date,)
        else:
            # two airing dates.
            try:
                air_start = parse_profile_date(aired_parts[0],
                                               suppress=suppress)
            except ValueError:
                raise MalformedAnimePageError(
                    self.id, aired_parts[0],
                    message="Could not parse first of two air dates")
            try:
                air_end = parse_profile_date(aired_parts[1],
                                             suppress=suppress)
            except ValueError:
                raise MalformedAnimePageError(
                    self.id, aired_parts[1],
                    message="Could not parse second of two air dates")
            anime_info['aired'] = (air_start, air_end)
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        anime_info['producers'] = self.parse_producers(anime_page)
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        duration_tag = [x for x in anime_page_original.find_all('span')
                        if 'Duration:' in x.text][0].parent
        anime_info['duration'] = duration_tag.text.split(':')[1].strip()
        # duration reads e.g. "24 min. per ep." — sum hr/min components.
        total_minutes = 0
        for piece in anime_info['duration'].split('.'):
            piece = piece.strip()
            number_match = re.match('(?P<num>[0-9]+)', piece)
            if not number_match:
                continue
            amount = int(number_match.group('num'))
            if piece.endswith('hr'):
                total_minutes += amount * 60
            elif piece.endswith('min'):
                total_minutes += amount
        anime_info['duration'] = datetime.timedelta(minutes=total_minutes)
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        rating_tag = [x for x in anime_page_original.find_all('span')
                      if 'Rating:' in x.text][0].parent
        utilities.extract_tags(
            rating_tag.find_all('span', {'class': 'dark_text'}))
        anime_info['rating'] = rating_tag.text.strip()
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    return anime_info
def parse_sidebar(self, media_page):
    """Parses the DOM and returns media attributes in the sidebar.

    :type media_page: :class:`bs4.BeautifulSoup`
    :param media_page: MAL media page's DOM

    :rtype: dict
    :return: media attributes.

    :raises: InvalidMediaError, MalformedMediaPageError
    """
    media_info = {}

    # if MAL says the series doesn't exist, raise an InvalidMediaError.
    if media_page.find(u'div', {'class': 'error404'}):
        raise InvalidMediaError(self.id)

    try:
        title_tag = media_page.find(u'span', {'itemprop': 'name'})
        if not title_tag:
            # otherwise, raise a MalformedMediaPageError.
            raise MalformedMediaPageError(
                self.id, media_page, message="Could not find title span")
        utilities.extract_tags(title_tag.find_all())
        media_info[u'title'] = title_tag.text.strip()
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    info_panel_first = media_page.find(u'div', {'id': 'content'}) \
                                 .find(u'table').find(u'td')

    # remove user-controls, if exists
    controls = info_panel_first.find(id='addtolist')
    if controls:
        controls.extract()

    try:
        picture_tag = info_panel_first.find(u'img')
        media_info[u'picture'] = picture_tag.get(u'src').decode('utf-8')
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        # assemble alternative titles for this series.
        media_info[u'alternative_titles'] = {}
        alt_titles_header = info_panel_first.find(u'h2',
                                                  text=u'Alternative Titles')
        if alt_titles_header:
            next_tag = alt_titles_header.find_next_sibling(
                u'div', {'class': 'spaceit_pad'})
            # walk sibling divs until one is missing the language span.
            while next_tag is not None and next_tag.find(
                    u'span', {'class': 'dark_text'}):
                # get language and remove the node.
                language = next_tag.find(u'span').text[:-1]
                utilities.extract_tags(
                    next_tag.find_all(u'span', {'class': 'dark_text'}))
                media_info[u'alternative_titles'][language] = \
                    next_tag.text.strip().split(u', ')
                next_tag = next_tag.find_next_sibling(
                    u'div', {'class': 'spaceit_pad'})
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        type_tag = info_panel_first.find(text=u'Type:').parent.parent
        utilities.extract_tags(
            type_tag.find_all(u'span', {'class': 'dark_text'}))
        media_info[u'type'] = type_tag.text.strip()
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        status_tag = info_panel_first.find(text=u'Status:').parent.parent
        utilities.extract_tags(
            status_tag.find_all(u'span', {'class': 'dark_text'}))
        media_info[u'status'] = status_tag.text.strip()
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        genres_tag = info_panel_first.find(text=u'Genres:').parent.parent
        media_info[u'genres'] = []
        for genre_link in genres_tag.find_all('a'):
            link_parts = genre_link.get('href').split('/')
            # 2017-02-19: of the form /anime/genre/4/Comedy
            genre = self.session.genre(int(link_parts[3])).set(
                {'name': genre_link.text})
            media_info[u'genres'].append(genre)
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        # grab statistics for this media.
        score_tag = info_panel_first.find(text=u'Score:').parent.parent
        # get score and number of users.
        score = score_tag.find(attrs={'itemprop': 'ratingValue'}).text
        if score == u'N/A':
            score = u'0'
        num_users = int(
            score_tag.find(attrs={
                'itemprop': 'ratingCount'
            }).text.replace(',', ''))
        media_info[u'score'] = (decimal.Decimal(score), num_users)
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        rank_tag = info_panel_first.find(text=u'Ranked:').parent.parent
        utilities.extract_tags(rank_tag.find_all())
        rank = rank_tag.text.strip().replace(u',', '').replace(u'#', '')
        if rank == u'N/A':
            rank = u'0'
        media_info[u'rank'] = int(rank)
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        popularity_tag = info_panel_first.find(
            text=u'Popularity:').parent.parent
        utilities.extract_tags(popularity_tag.find_all())
        media_info[u'popularity'] = int(
            popularity_tag.text.strip()[1:].replace(u',', ''))
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        members_tag = info_panel_first.find(text=u'Members:').parent.parent
        utilities.extract_tags(members_tag.find_all())
        media_info[u'members'] = int(
            members_tag.text.strip().replace(u',', ''))
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        favorites_tag = info_panel_first.find(
            text=u'Favorites:').parent.parent
        utilities.extract_tags(favorites_tag.find_all())
        media_info[u'favorites'] = int(
            favorites_tag.text.strip().replace(u',', ''))
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    # TODO: popular tags no longer exist in MAL, the API should be updated
    # to reflect that
    media_info[u'popular_tags'] = {}

    return media_info
def parse_sidebar(self, anime_page):
    """Parses the DOM and returns anime attributes in the sidebar.

    :type anime_page: :class:`bs4.BeautifulSoup`
    :param anime_page: MAL anime page's DOM

    :rtype: dict
    :return: anime attributes

    :raises: :class:`.InvalidAnimeError`, :class:`.MalformedAnimePageError`
    """
    # if MAL says the series doesn't exist, raise an InvalidAnimeError.
    if anime_page.find(u'div', {'class': 'badresult'}):
        raise InvalidAnimeError(self.id)

    title_tag = anime_page.find(u'div', {'id': 'contentWrapper'}).find(u'h1')
    if not title_tag.find(u'span'):
        # otherwise, raise a MalformedAnimePageError.
        raise MalformedAnimePageError(self.id, anime_page,
                                      message="Could not find title div")

    anime_info = super(Anime, self).parse_sidebar(anime_page)

    info_panel_first = anime_page.find(u'div', {'id': 'content'}) \
                                 .find(u'table').find(u'td')

    try:
        episode_tag = info_panel_first.find(text=u'Episodes:').parent.parent
        utilities.extract_tags(
            episode_tag.find_all(u'span', {'class': 'dark_text'}))
        episode_text = episode_tag.text.strip()
        anime_info[u'episodes'] = (int(episode_text)
                                   if episode_text != 'Unknown' else 0)
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        aired_tag = info_panel_first.find(text=u'Aired:').parent.parent
        utilities.extract_tags(
            aired_tag.find_all(u'span', {'class': 'dark_text'}))
        aired_parts = aired_tag.text.strip().split(u' to ')
        if len(aired_parts) == 1:
            # this aired once.
            try:
                aired_date = utilities.parse_profile_date(
                    aired_parts[0],
                    suppress=self.session.suppress_parse_exceptions)
            except ValueError:
                raise MalformedAnimePageError(
                    self.id, aired_parts[0],
                    message="Could not parse single air date")
            anime_info[u'aired'] = (aired_date,)
        else:
            # two airing dates.
            try:
                air_start = utilities.parse_profile_date(
                    aired_parts[0],
                    suppress=self.session.suppress_parse_exceptions)
            except ValueError:
                raise MalformedAnimePageError(
                    self.id, aired_parts[0],
                    message="Could not parse first of two air dates")
            try:
                air_end = utilities.parse_profile_date(
                    aired_parts[1],
                    suppress=self.session.suppress_parse_exceptions)
            except ValueError:
                raise MalformedAnimePageError(
                    self.id, aired_parts[1],
                    message="Could not parse second of two air dates")
            anime_info[u'aired'] = (air_start, air_end)
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        producers_tag = info_panel_first.find(
            text=u'Producers:').parent.parent
        utilities.extract_tags(
            producers_tag.find_all(u'span', {'class': 'dark_text'}))
        utilities.extract_tags(producers_tag.find_all(u'sup'))
        anime_info[u'producers'] = []
        for producer_link in producers_tag.find_all('a'):
            if producer_link.text == u'add some':
                # MAL is saying "None found, add some".
                break
            # of the form: /anime.php?p=14
            link_parts = producer_link.get('href').split('p=')
            if len(link_parts) > 1:
                producer = self.session.producer(int(link_parts[1])).set(
                    {'name': producer_link.text})
                anime_info[u'producers'].append(producer)
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        duration_tag = info_panel_first.find(text=u'Duration:').parent.parent
        utilities.extract_tags(
            duration_tag.find_all(u'span', {'class': 'dark_text'}))
        anime_info[u'duration'] = duration_tag.text.strip()
        # duration reads e.g. "24 min. per ep." — sum hr/min components.
        duration_mins = 0
        for part in anime_info[u'duration'].split(u'.'):
            part = part.strip()
            part_match = re.match(u'(?P<num>[0-9]+)', part)
            if not part_match:
                continue
            part_volume = int(part_match.group(u'num'))
            if part.endswith(u'hr'):
                duration_mins += part_volume * 60
            elif part.endswith(u'min'):
                duration_mins += part_volume
        anime_info[u'duration'] = datetime.timedelta(minutes=duration_mins)
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        rating_tag = info_panel_first.find(text=u'Rating:').parent.parent
        utilities.extract_tags(
            rating_tag.find_all(u'span', {'class': 'dark_text'}))
        anime_info[u'rating'] = rating_tag.text.strip()
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    return anime_info
def parse_sidebar(self, manga_page, manga_page_original=None):
    """Parses the DOM and returns manga attributes in the sidebar.

    :type manga_page: :class:`bs4.BeautifulSoup`
    :param manga_page: MAL manga page's DOM (pre-processed)

    :type manga_page_original: :class:`bs4.BeautifulSoup`
    :param manga_page_original: MAL manga page's unmodified DOM, passed
        through to the superclass parser as a fallback source.

    :rtype: dict
    :return: manga attributes

    :raises: :class:`.InvalidMangaError`, :class:`.MalformedMangaPageError`
    """
    # if MAL says the series doesn't exist, raise an InvalidMangaError.
    error_tag = manga_page.find('div', {'class': 'badresult'})
    if error_tag:
        raise InvalidMangaError(self.id)

    try:
        title_tag = manga_page.find('span', {'itemprop': 'name'})
        if not title_tag:
            # otherwise, raise a MalformedMangaPageError.
            raise MalformedMangaPageError(self.id, manga_page, message="Could not find title")
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    # otherwise, begin parsing.
    manga_info = super(Manga, self).parse_sidebar(manga_page, manga_page_original)

    info_panel_first = manga_page.find('div', {'id': 'content'}).find('table').find('td')

    try:
        volumes_tag = [
            x for x in manga_page.select('span.dark_text')
            if 'Volumes:' in x.text][0]
        manga_volume = volumes_tag.parent.text.split(':')[1].strip().replace(',', '')
        manga_info[u'volumes'] = (
            int(manga_volume) if manga_volume != 'Unknown' else None
        )
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        chapters_tag = info_panel_first.find(text=u'Chapters:').parent.parent
        utilities.extract_tags(chapters_tag.find_all(u'span', {'class': 'dark_text'}))
        chapters_tag_text = chapters_tag.text
        if ':' in chapters_tag_text:
            chapters_tag_text = chapters_tag_text.split(':')[1]
        manga_chapters = chapters_tag_text.strip()
        # compare the cleaned-up value: the raw tag text may still carry a
        # "Chapters:" prefix, in which case the old check against
        # chapters_tag.text missed 'Unknown' and int() raised.
        manga_info[u'chapters'] = (
            int(manga_chapters) if manga_chapters != 'Unknown' else None
        )
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        published_tag = info_panel_first.find(text=u'Published:').parent.parent
        utilities.extract_tags(published_tag.find_all(u'span', {'class': 'dark_text'}))
        published_parts = published_tag.text.strip().split(u' to ')
        # check if published part only contain start date or also end date.
        if len(published_parts) == 1:
            # this published once.
            try:
                published_date = utilities.parse_profile_date(published_parts[0])
            except ValueError:
                raise MalformedMangaPageError(self.id, published_parts[0], message="Could not parse single publish date")
            publish_start = published_date
            publish_end = None
        else:
            # two publishing dates.
            try:
                # publish_start may contain redundant word such as
                # 'Published: Feb 24, 2003',
                if 'Published:' in published_parts[0]:
                    published_parts[0] = published_parts[0].split('Published:')[1].strip()
                publish_start = utilities.parse_profile_date(published_parts[0])
            except ValueError:
                raise MalformedMangaPageError(
                    self.id,
                    published_parts[0],
                    message="Could not parse first of two publish dates"
                )
            # compare the second date part; the previous code compared the
            # whole list against u'?', which is never true, so a
            # still-publishing series fell into the date parser and raised.
            if published_parts[1] == u'?':
                # this is still publishing.
                publish_end = None
            else:
                try:
                    publish_end = utilities.parse_profile_date(published_parts[1])
                except ValueError:
                    raise MalformedMangaPageError(
                        self.id,
                        published_parts[1],
                        message="Could not parse second of two publish dates"
                    )
        manga_info[u'published'] = (publish_start, publish_end)
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        authors_tag = info_panel_first.find(text='Authors:').parent.parent
        utilities.extract_tags(authors_tag.find_all('span', {'class': 'dark_text'}))
        manga_info['authors'] = {}
        for author_link in authors_tag.find_all('a'):
            link_parts = author_link.get('href').split('/')
            # of the form /people/1867/Naoki_Urasawa
            person = self.session.person(int(link_parts[2])).set({'name': author_link.text})
            role = author_link.nextSibling.replace(' (', '').replace(')', '')
            manga_info['authors'][person] = role
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        serialization_tag = info_panel_first.find(text=u'Serialization:').parent.parent
        publication_link = serialization_tag.find('a')
        manga_info[u'serialization'] = None
        if publication_link:
            # of the form /manga.php?mid=1
            link_parts = publication_link.get('href').split('mid=')
            # example for link_parts
            # ['/manga/magazine/450/Bessatsu_Shounen_Magazine']
            publication_id = link_parts[0].split('/manga/magazine/')[1].split('/')[0]
            manga_info[u'serialization'] = self.session.publication(int(publication_id)).set(
                {'name': publication_link.text}
            )
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    return manga_info
def parse_sidebar(self, manga_page):
    """Parses the DOM and returns manga attributes in the sidebar.

    :type manga_page: :class:`bs4.BeautifulSoup`
    :param manga_page: MAL manga page's DOM

    :rtype: dict
    :return: manga attributes

    :raises: :class:`.InvalidMangaError`, :class:`.MalformedMangaPageError`
    """
    # if MAL says the series doesn't exist, raise an InvalidMangaError.
    error_tag = manga_page.find(u"div", {"class": "badresult"})
    if error_tag:
        raise InvalidMangaError(self.id)

    try:
        title_tag = manga_page.find(u"div", {"id": "contentWrapper"}).find(u"h1")
        if not title_tag.find(u"div"):
            # otherwise, raise a MalformedMangaPageError.
            raise MalformedMangaPageError(self.id, manga_page, message="Could not find title div")
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    # otherwise, begin parsing.
    manga_info = super(Manga, self).parse_sidebar(manga_page)

    info_panel_first = manga_page.find(u"div", {"id": "content"}).find(u"table").find(u"td")

    try:
        volumes_tag = info_panel_first.find(text=u"Volumes:").parent.parent
        utilities.extract_tags(volumes_tag.find_all(u"span", {"class": "dark_text"}))
        manga_info[u"volumes"] = int(volumes_tag.text.strip()) if volumes_tag.text.strip() != "Unknown" else None
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        chapters_tag = info_panel_first.find(text=u"Chapters:").parent.parent
        utilities.extract_tags(chapters_tag.find_all(u"span", {"class": "dark_text"}))
        manga_info[u"chapters"] = int(chapters_tag.text.strip()) if chapters_tag.text.strip() != "Unknown" else None
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        published_tag = info_panel_first.find(text=u"Published:").parent.parent
        utilities.extract_tags(published_tag.find_all(u"span", {"class": "dark_text"}))
        published_parts = published_tag.text.strip().split(u" to ")
        if len(published_parts) == 1:
            # this published once.
            try:
                published_date = utilities.parse_profile_date(published_parts[0])
            except ValueError:
                raise MalformedMangaPageError(
                    self.id,
                    published_parts[0],
                    message="Could not parse single publish date"
                )
            manga_info[u"published"] = (published_date,)
        else:
            # two publishing dates.
            try:
                publish_start = utilities.parse_profile_date(published_parts[0])
            except ValueError:
                raise MalformedMangaPageError(
                    self.id,
                    published_parts[0],
                    message="Could not parse first of two publish dates"
                )
            # compare the second date part; the previous code compared the
            # whole list against u"?", which is never true, so a
            # still-publishing series fell into the date parser and raised.
            if published_parts[1] == u"?":
                # this is still publishing.
                publish_end = None
            else:
                try:
                    publish_end = utilities.parse_profile_date(published_parts[1])
                except ValueError:
                    raise MalformedMangaPageError(
                        self.id,
                        published_parts[1],
                        message="Could not parse second of two publish dates"
                    )
            manga_info[u"published"] = (publish_start, publish_end)
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        authors_tag = info_panel_first.find(text=u"Authors:").parent.parent
        utilities.extract_tags(authors_tag.find_all(u"span", {"class": "dark_text"}))
        manga_info[u"authors"] = {}
        for author_link in authors_tag.find_all("a"):
            link_parts = author_link.get("href").split("/")
            # of the form /people/1867/Naoki_Urasawa
            person = self.session.person(int(link_parts[2])).set({"name": author_link.text})
            role = author_link.nextSibling.replace(" (", "").replace(")", "")
            manga_info[u"authors"][person] = role
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        serialization_tag = info_panel_first.find(text=u"Serialization:").parent.parent
        publication_link = serialization_tag.find("a")
        manga_info[u"serialization"] = None
        if publication_link:
            link_parts = publication_link.get("href").split("mid=")
            # of the form /manga.php?mid=1
            manga_info[u"serialization"] = self.session.publication(int(link_parts[1])).set(
                {"name": publication_link.text}
            )
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    return manga_info
def parse_sidebar(self, media_page, media_page_original=None):
    """Parses the DOM and returns media attributes in the sidebar.

    :type media_page: :class:`bs4.BeautifulSoup`
    :param media_page: MAL media page's DOM (pre-processed)

    :type media_page_original: :class:`bs4.BeautifulSoup`
    :param media_page_original: MAL media page's unmodified DOM, used as a
        fallback when an element is missing from the pre-processed DOM.

    :rtype: dict
    :return: media attributes.

    :raises: InvalidMediaError, MalformedMediaPageError
    """
    media_info = {}

    # if MAL says the series doesn't exist, raise an InvalidMediaError.
    error_tag = media_page.find(u'div', {'class': 'badresult'})
    if error_tag:
        raise InvalidMediaError(self.id)

    try:
        title_tag = media_page.find(u'div', {
            'id': 'contentWrapper'
        }).find(u'h1')
        if not title_tag.find(u'div'):
            try:
                title_tag = media_page_original.select(
                    'div#contentWrapper h1.h1 span')[0]
            except IndexError:
                # otherwise, raise a MalformedMediaPageError.
                raise MalformedMediaPageError(
                    self.id, None, message="Could not find title div")
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        utilities.extract_tags(title_tag.find_all())
        media_info[u'title'] = title_tag.text.strip()
        if media_info[u'title'] == '':
            # fall back to the itemprop-tagged title on the original page.
            media_info[u'title'] = media_page_original.find(
                'span', {
                    'itemprop': 'name'
                }).text
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    # first <td> of the content table holds the sidebar info panel.
    info_panel_first = media_page_original.select(
        'div#content table td')[0]

    try:
        picture_tag = info_panel_first.find(u'img')
        media_info[u'picture'] = picture_tag.get(u'src').decode('utf-8')
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        # assemble alternative titles for this series.
        media_info[u'alternative_titles'] = {}
        alt_titles_header = info_panel_first.find(
            u'h2', text=u'Alternative Titles')
        if alt_titles_header:
            # walk consecutive spaceit_pad divs; each holds one language.
            next_tag = alt_titles_header.find_next_sibling(
                u'div', {'class': 'spaceit_pad'})
            while True:
                if next_tag is None or not next_tag.find(
                        u'span', {'class': 'dark_text'}):
                    # not a language node, break.
                    break
                # get language and remove the node.
                language = next_tag.find(u'span').text[:-1]
                utilities.extract_tags(
                    next_tag.find_all(u'span', {'class': 'dark_text'}))
                names = next_tag.text.strip().split(u', ')
                media_info[u'alternative_titles'][language] = names
                next_tag = next_tag.find_next_sibling(
                    u'div', {'class': 'spaceit_pad'})
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        try:
            type_tag = info_panel_first.find(text=u'Type:').parent.parent
            utilities.extract_tags(
                type_tag.find_all(u'span', {'class': 'dark_text'}))
            media_info[u'type'] = type_tag.text.strip()
        except AttributeError:
            # newer layout: the label lives inside a div, not a bare string.
            type_tag = [
                x for x in info_panel_first.find_all('div')
                if 'Type:' in x.text
            ][0]
            media_info[u'type'] = type_tag.text.split(':')[-1].strip()
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        status_tag = [
            x for x in media_page.find_all('span') if 'Status:' in x.text
        ][0].parent
        media_info[u'status'] = status_tag.text.split(':')[1].strip()
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        media_info[u'genres'] = self.parse_genres(media_page_original)
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        # grab statistics for this media.
        score_tag = media_page.find('span', {'itemprop': 'aggregateRating'})
        # there is difference between anime and manga page
        # in manga page score_tag is in span-tag and anime in div-page
        # test score tag by try to find span-tag
        try:
            score_tag.find('span')
        except AttributeError:
            score_tag = media_page.find(
                'div', {'itemprop': 'aggregateRating'})
        # get score and number of users.
        num_users = int(
            score_tag.find('span', {
                'itemprop': 'ratingCount'
            }).text.replace(',', ''))
        score_point = score_tag.find('span', {
            'itemprop': 'ratingValue'
        }).text
        try:
            media_info[u'score'] = (decimal.Decimal(score_point), num_users)
        except (InvalidOperation, AttributeError):
            score_tag = media_page_original.find(
                'span', {'itemprop': 'ratingValue'})
            media_info[u'score'] = (decimal.Decimal(score_tag.text),
                                    num_users)
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        try:
            rank_tag = info_panel_first.find(text=u'Ranked:').parent.parent
            utilities.extract_tags(rank_tag.find_all())
            media_info[u'rank'] = int(rank_tag.text.strip()[1:].replace(
                u',', ''))
        except AttributeError:
            # list comprehension instead of filter(): filter() returns a
            # non-subscriptable iterator on Python 3.
            rank_tag = [
                x for x in media_page_original.find_all(
                    'div', {'class': 'spaceit'}) if 'Ranked:' in x.text
            ]
            media_info[u'rank'] = int(
                rank_tag[0].text.split('#')[-1].strip())
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        try:
            popularity_tag = info_panel_first.find(
                text=u'Popularity:').parent.parent
            utilities.extract_tags(popularity_tag.find_all())
            media_info[u'popularity'] = int(
                popularity_tag.text.strip()[1:].replace(u',', ''))
        except AttributeError:
            # list comprehension instead of filter() (Py3 compatibility).
            rank_tag = [
                x for x in media_page_original.find_all(
                    'span', {'class': 'dark_text'})
                if 'Popularity' in x.text
            ][0].parent
            media_info[u'popularity'] = int(
                rank_tag.text.split('#')[-1].strip())
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        try:
            members_tag = info_panel_first.find(
                text=u'Members:').parent.parent
            utilities.extract_tags(members_tag.find_all())
            media_info[u'members'] = int(members_tag.text.strip().replace(
                u',', ''))
        except AttributeError:
            # list comprehension instead of filter() (Py3 compatibility).
            members_tag = [
                x for x in media_page_original.find_all(
                    'span', {'class': 'dark_text'})
                if 'Members' in x.text
            ][0].parent
            media_info[u'members'] = int(
                members_tag.text.split(':')[-1].strip().replace(u',', ''))
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        try:
            favorites_tag = info_panel_first.find(
                text=u'Favorites:').parent.parent
            utilities.extract_tags(favorites_tag.find_all())
            media_info[u'favorites'] = int(
                favorites_tag.text.strip().replace(u',', ''))
        except AttributeError:
            # list comprehension instead of filter() (Py3 compatibility).
            favorites_tag = [
                x for x in media_page_original.find_all(
                    'span', {'class': 'dark_text'})
                if 'Favorites' in x.text
            ][0].parent
            media_info[u'favorites'] = int(
                favorites_tag.text.split(':')[-1].strip().replace(
                    u',', ''))
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        # get popular tags.
        tags_header = media_page.find(u'h2', text=u'Popular Tags')
        try:
            tags_tag = tags_header.find_next_sibling(u'span')
            media_info[u'popular_tags'] = {}
            for tag_link in tags_tag.find_all('a'):
                tag = self.session.tag(tag_link.text)
                num_people = int(
                    re.match(r'(?P<people>[0-9]+) people',
                             tag_link.get('title')).group('people'))
                media_info[u'popular_tags'][tag] = num_people
        except AttributeError:
            # no Popular Tags header: fall back to the Genres block.
            tags_tag = media_page_original.find('span',
                                                text='Genres:').parent
            media_info[u'popular_tags'] = {}
            for tag_link in tags_tag.find_all('a'):
                tag = self.session.tag(tag_link.text.lower())
                try:
                    num_people = int(
                        re.match(r'(?P<people>[0-9]+) people',
                                 tag_link.get('title')).group('people'))
                    media_info[u'popular_tags'][tag] = num_people
                except (TypeError, AttributeError):
                    tag_num = tag_link.get('href').split('=')[-1]
                    media_info[u'popular_tags'][tag] = tag_num
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    return media_info
def parse_sidebar(self, media_page):
    """Parses the DOM and returns media attributes in the sidebar.

    :type media_page: :class:`bs4.BeautifulSoup`
    :param media_page: MAL media page's DOM

    :rtype: dict
    :return: media attributes.

    :raises: InvalidMediaError, MalformedMediaPageError
    """
    media_info = {}

    # if MAL says the series doesn't exist, raise an InvalidMediaError.
    error_tag = media_page.find(u'div', {'class': 'badresult'})
    if error_tag:
        raise InvalidMediaError(self.id)

    try:
        # the title lives in the h1 under the content wrapper; a missing
        # inner div signals an unexpected page layout.
        title_tag = media_page.find(u'div', {
            'id': 'contentWrapper'
        }).find(u'h1')
        if not title_tag.find(u'div'):
            # otherwise, raise a MalformedMediaPageError.
            raise MalformedMediaPageError(
                self.id, media_page, message="Could not find title div")
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        utilities.extract_tags(title_tag.find_all())
        media_info[u'title'] = title_tag.text.strip()
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    # first <td> of the content table holds the sidebar info panel.
    info_panel_first = media_page.find(u'div', {
        'id': 'content'
    }).find(u'table').find(u'td')

    try:
        picture_tag = info_panel_first.find(u'img')
        # NOTE(review): .decode('utf-8') presumes a Python 2 byte string;
        # under Python 3 str has no decode() and this raises
        # AttributeError, which the blanket except swallows when
        # suppression is enabled — confirm target interpreter.
        media_info[u'picture'] = picture_tag.get(u'src').decode('utf-8')
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        # assemble alternative titles for this series.
        media_info[u'alternative_titles'] = {}
        alt_titles_header = info_panel_first.find(
            u'h2', text=u'Alternative Titles')
        if alt_titles_header:
            # walk consecutive spaceit_pad divs; each holds one language.
            next_tag = alt_titles_header.find_next_sibling(
                u'div', {'class': 'spaceit_pad'})
            while True:
                if next_tag is None or not next_tag.find(
                        u'span', {'class': 'dark_text'}):
                    # not a language node, break.
                    break
                # get language and remove the node.
                language = next_tag.find(u'span').text[:-1]
                utilities.extract_tags(
                    next_tag.find_all(u'span', {'class': 'dark_text'}))
                names = next_tag.text.strip().split(u', ')
                media_info[u'alternative_titles'][language] = names
                next_tag = next_tag.find_next_sibling(
                    u'div', {'class': 'spaceit_pad'})
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        type_tag = info_panel_first.find(text=u'Type:').parent.parent
        utilities.extract_tags(
            type_tag.find_all(u'span', {'class': 'dark_text'}))
        media_info[u'type'] = type_tag.text.strip()
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        status_tag = info_panel_first.find(text=u'Status:').parent.parent
        utilities.extract_tags(
            status_tag.find_all(u'span', {'class': 'dark_text'}))
        media_info[u'status'] = status_tag.text.strip()
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        genres_tag = info_panel_first.find(text=u'Genres:').parent.parent
        utilities.extract_tags(
            genres_tag.find_all(u'span', {'class': 'dark_text'}))
        media_info[u'genres'] = []
        for genre_link in genres_tag.find_all('a'):
            link_parts = genre_link.get('href').split('[]=')
            # of the form /anime|manga.php?genre[]=1
            genre = self.session.genre(int(link_parts[1])).set(
                {'name': genre_link.text})
            media_info[u'genres'].append(genre)
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        # grab statistics for this media.
        score_tag = info_panel_first.find(text=u'Score:').parent.parent
        # get score and number of users.
        users_node = [
            x for x in score_tag.find_all(u'small') if u'scored by' in x.text
        ][0]
        num_users = int(
            users_node.text.split(u'scored by ')[-1].split(u' users')[0])
        utilities.extract_tags(score_tag.find_all())
        media_info[u'score'] = (decimal.Decimal(score_tag.text.strip()),
                                num_users)
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        # "Ranked: #123" -> drop the leading '#' and thousands commas.
        rank_tag = info_panel_first.find(text=u'Ranked:').parent.parent
        utilities.extract_tags(rank_tag.find_all())
        media_info[u'rank'] = int(rank_tag.text.strip()[1:].replace(
            u',', ''))
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        popularity_tag = info_panel_first.find(
            text=u'Popularity:').parent.parent
        utilities.extract_tags(popularity_tag.find_all())
        media_info[u'popularity'] = int(
            popularity_tag.text.strip()[1:].replace(u',', ''))
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        members_tag = info_panel_first.find(text=u'Members:').parent.parent
        utilities.extract_tags(members_tag.find_all())
        media_info[u'members'] = int(members_tag.text.strip().replace(
            u',', ''))
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        favorites_tag = info_panel_first.find(
            text=u'Favorites:').parent.parent
        utilities.extract_tags(favorites_tag.find_all())
        media_info[u'favorites'] = int(favorites_tag.text.strip().replace(
            u',', ''))
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        # get popular tags.
        tags_header = media_page.find(u'h2', text=u'Popular Tags')
        tags_tag = tags_header.find_next_sibling(u'span')
        media_info[u'popular_tags'] = {}
        for tag_link in tags_tag.find_all('a'):
            tag = self.session.tag(tag_link.text)
            # tag link titles look like "1234 people tagged ..."
            num_people = int(
                re.match(r'(?P<people>[0-9]+) people',
                         tag_link.get('title')).group('people'))
            media_info[u'popular_tags'][tag] = num_people
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    return media_info
def parse_sidebar(self, media_page, media_page_original=None):
    """Parse the DOM and returns media attributes in the sidebar.

    :type media_page: :class:`bs4.BeautifulSoup`
    :param media_page: MAL media page's DOM (pre-processed)

    :type media_page_original: :class:`bs4.BeautifulSoup`
    :param media_page_original: MAL media page's unmodified DOM, used as a
        fallback when an element is missing from the pre-processed DOM.

    :rtype: dict
    :return: media attributes.

    :raises: InvalidMediaError, MalformedMediaPageError
    """
    media_info = {}

    # if MAL says the series doesn't exist, raise an InvalidMediaError.
    error_tag = media_page.find('div', {'class': 'badresult'})
    if error_tag:
        raise InvalidMediaError(self.id)

    try:
        title_tag = media_page.find('div', {'id': 'contentWrapper'}).find('h1')
        if not title_tag.find('div'):
            try:
                title_tag = media_page_original.select('div#contentWrapper h1.h1 span')[0]
            except IndexError:
                # otherwise, raise a MalformedMediaPageError.
                raise MalformedMediaPageError(self.id, None, message="Could not find title div")
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        utilities.extract_tags(title_tag.find_all())
        media_info['title'] = title_tag.text.strip()
        if media_info['title'] == '':
            # fall back to the itemprop-tagged title on the original page.
            media_info['title'] = media_page_original.find('span', {'itemprop': 'name'}).text
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    # first <td> of the content table holds the sidebar info panel.
    info_panel_first = media_page_original.select('div#content table td')[0]

    try:
        picture_tag = info_panel_first.find(u'img')
        media_info[u'picture'] = picture_tag.get('src')
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        # assemble alternative titles for this series.
        media_info[u'alternative_titles'] = self._get_alternative_titles(info_panel_first)
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        type_tag = info_panel_first.find(text=u'Type:').parent.parent
        media_info[u'type'] = type_tag.text.split(':')[1].strip()
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        status_tag = [x for x in media_page.find_all('span') if 'Status:' in x.text][0].parent
        media_info['status'] = status_tag.text.split(':')[1].strip()
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        genres_tag = info_panel_first.find(text=u'Genres:').parent.parent
        media_info[u'genres'] = []
        for genre_link in genres_tag.find_all('a'):
            link_parts = genre_link.get('href').split('=')
            # of the form /anime|manga.php?genre[]=1
            try:
                genre_id = int(link_parts[-1])
            except ValueError:
                # for current form
                # '/anime/genre/2/Adventure'
                genre_id = int(link_parts[-1].split('/genre/')[1].split('/')[0])
            genre = self.session.genre(genre_id).set({'name': genre_link.text})
            media_info[u'genres'].append(genre)
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        media_info['score'] = self._parse_score(media_page)
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        media_info['rank'] = self._parse_rank(media_page)
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        try:
            # find popularity html-tag
            popularity_tag = info_panel_first.find(text=u'Popularity:').parent.parent
            utilities.extract_tags(popularity_tag.find_all())
            # format popularity
            popularity = popularity_tag.text.strip()[1:].replace(u',', '')
            if '#' in popularity:
                popularity = popularity.split('#')[1].split()[0]
            # set into media info
            media_info[u'popularity'] = int(popularity)
        except AttributeError:
            # list comprehension instead of filter(): filter() returns a
            # non-subscriptable iterator on Python 3, so the previous
            # filter(...)[0] raised TypeError here.
            rank_tag = [
                x for x in media_page_original.find_all('span', {'class': 'dark_text'})
                if 'Popularity' in x.text
            ][0].parent
            media_info[u'popularity'] = int(rank_tag.text.split('#')[-1].strip())
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        members_tag = [
            x for x in media_page.select('span.dark_text')
            if 'Members:' in x.text][0]
        members = members_tag.parent.text.split(':')[1].strip().replace(',', '')
        media_info[u'members'] = int(members)
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        fav_tag = [x for x in media_page.select('span.dark_text') if 'Favorites:' in x.text][0]
        favorites = fav_tag.parent.text.split(':')[1].strip().replace(',', '')
        media_info[u'favorites'] = int(favorites)
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    return media_info
def parse_sidebar(self, media_page):
    """Parses the DOM and returns media attributes in the sidebar.

    :type media_page: :class:`bs4.BeautifulSoup`
    :param media_page: MAL media page's DOM

    :rtype: dict
    :return: media attributes.

    :raises: InvalidMediaError, MalformedMediaPageError
    """
    media_info = {}

    # if MAL says the series doesn't exist, raise an InvalidMediaError.
    error_tag = media_page.find(u'div', {'class': 'badresult'})
    if error_tag:
        raise InvalidMediaError(self.id)

    try:
        # the title lives in the h1 under the content wrapper; a missing
        # inner div signals an unexpected page layout.
        title_tag = media_page.find(u'div', {'id': 'contentWrapper'}).find(u'h1')
        if not title_tag.find(u'div'):
            # otherwise, raise a MalformedMediaPageError.
            raise MalformedMediaPageError(self.id, media_page, message="Could not find title div")
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        utilities.extract_tags(title_tag.find_all())
        media_info[u'title'] = title_tag.text.strip()
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    # first <td> of the content table holds the sidebar info panel.
    info_panel_first = media_page.find(u'div', {'id': 'content'}).find(u'table').find(u'td')

    try:
        picture_tag = info_panel_first.find(u'img')
        # NOTE(review): .decode('utf-8') presumes a Python 2 byte string;
        # under Python 3 str has no decode() and this raises
        # AttributeError, swallowed by the blanket except when suppression
        # is enabled — confirm target interpreter.
        media_info[u'picture'] = picture_tag.get(u'src').decode('utf-8')
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        # assemble alternative titles for this series.
        media_info[u'alternative_titles'] = {}
        alt_titles_header = info_panel_first.find(u'h2', text=u'Alternative Titles')
        if alt_titles_header:
            # walk consecutive spaceit_pad divs; each holds one language.
            next_tag = alt_titles_header.find_next_sibling(u'div', {'class': 'spaceit_pad'})
            while True:
                if next_tag is None or not next_tag.find(u'span', {'class': 'dark_text'}):
                    # not a language node, break.
                    break
                # get language and remove the node.
                language = next_tag.find(u'span').text[:-1]
                utilities.extract_tags(next_tag.find_all(u'span', {'class': 'dark_text'}))
                names = next_tag.text.strip().split(u', ')
                media_info[u'alternative_titles'][language] = names
                next_tag = next_tag.find_next_sibling(u'div', {'class': 'spaceit_pad'})
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        type_tag = info_panel_first.find(text=u'Type:').parent.parent
        utilities.extract_tags(type_tag.find_all(u'span', {'class': 'dark_text'}))
        media_info[u'type'] = type_tag.text.strip()
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        status_tag = info_panel_first.find(text=u'Status:').parent.parent
        utilities.extract_tags(status_tag.find_all(u'span', {'class': 'dark_text'}))
        media_info[u'status'] = status_tag.text.strip()
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        genres_tag = info_panel_first.find(text=u'Genres:').parent.parent
        utilities.extract_tags(genres_tag.find_all(u'span', {'class': 'dark_text'}))
        media_info[u'genres'] = []
        for genre_link in genres_tag.find_all('a'):
            link_parts = genre_link.get('href').split('[]=')
            # of the form /anime|manga.php?genre[]=1
            genre = self.session.genre(int(link_parts[1])).set({'name': genre_link.text})
            media_info[u'genres'].append(genre)
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        # grab statistics for this media.
        score_tag = info_panel_first.find(text=u'Score:').parent.parent
        # get score and number of users.
        users_node = [x for x in score_tag.find_all(u'small') if u'scored by' in x.text][0]
        num_users = int(users_node.text.split(u'scored by ')[-1].split(u' users')[0])
        utilities.extract_tags(score_tag.find_all())
        media_info[u'score'] = (decimal.Decimal(score_tag.text.strip()), num_users)
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        # "Ranked: #123" -> drop the leading '#' and thousands commas.
        rank_tag = info_panel_first.find(text=u'Ranked:').parent.parent
        utilities.extract_tags(rank_tag.find_all())
        media_info[u'rank'] = int(rank_tag.text.strip()[1:].replace(u',', ''))
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        popularity_tag = info_panel_first.find(text=u'Popularity:').parent.parent
        utilities.extract_tags(popularity_tag.find_all())
        media_info[u'popularity'] = int(popularity_tag.text.strip()[1:].replace(u',', ''))
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        members_tag = info_panel_first.find(text=u'Members:').parent.parent
        utilities.extract_tags(members_tag.find_all())
        media_info[u'members'] = int(members_tag.text.strip().replace(u',', ''))
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        favorites_tag = info_panel_first.find(text=u'Favorites:').parent.parent
        utilities.extract_tags(favorites_tag.find_all())
        media_info[u'favorites'] = int(favorites_tag.text.strip().replace(u',', ''))
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        # get popular tags.
        tags_header = media_page.find(u'h2', text=u'Popular Tags')
        tags_tag = tags_header.find_next_sibling(u'span')
        media_info[u'popular_tags'] = {}
        for tag_link in tags_tag.find_all('a'):
            tag = self.session.tag(tag_link.text)
            # tag link titles look like "1234 people tagged ..."
            num_people = int(re.match(r'(?P<people>[0-9]+) people', tag_link.get('title')).group('people'))
            media_info[u'popular_tags'][tag] = num_people
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    return media_info
def parse(self, media_page):
    """Parses the DOM and returns media attributes in the main-content area.

    :type media_page: :class:`bs4.BeautifulSoup`
    :param media_page: MAL media page's DOM

    :rtype: dict
    :return: media attributes (sidebar attributes plus ``synopsis`` and
        ``related``).
    """
    media_info = self.parse_sidebar(media_page)

    try:
        synopsis_elt = media_page.find(u'h2', text=u'Synopsis').parent
        utilities.extract_tags(synopsis_elt.find_all(u'h2'))
        media_info[u'synopsis'] = synopsis_elt.text.strip()
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        related_title = media_page.find(u'h2', text=u'Related ' + self.__class__.__name__)
        if related_title:
            related_elt = related_title.parent
            utilities.extract_tags(related_elt.find_all(u'h2'))
            related = {}
            for link in related_elt.find_all(u'a'):
                href = link.get(u'href').replace(u'http://myanimelist.net', '')
                if not re.match(r'/(anime|manga)', href):
                    break
                curr_elt = link.previous_sibling
                if curr_elt is None:
                    # we've reached the end of the list.
                    break
                # walk backwards through siblings to find the category
                # label (e.g. "Side story:") that precedes this link.
                related_type = None
                while True:
                    if not curr_elt:
                        raise MalformedAnimePageError(self.id, related_elt, message="Prematurely reached end of related anime listing")
                    if isinstance(curr_elt, bs4.NavigableString):
                        # raw string replaces the deprecated '\ ' escape in
                        # the original pattern; semantics are unchanged
                        # (letters, spaces and hyphens before a colon).
                        type_match = re.match(r'(?P<type>[a-zA-Z \-]+):', curr_elt)
                        if type_match:
                            related_type = type_match.group(u'type')
                            break
                    curr_elt = curr_elt.previous_sibling
                title = link.text
                # parse link: may be manga or anime.
                href_parts = href.split(u'/')
                # sometimes links on MAL are broken, of the form /anime//
                if href_parts[2] == '':
                    continue
                # of the form: /(anime|manga)/1/Cowboy_Bebop
                obj_id = int(href_parts[2])
                new_obj = getattr(self.session, href_parts[1])(obj_id).set({'title': title})
                if related_type not in related:
                    related[related_type] = [new_obj]
                else:
                    related[related_type].append(new_obj)
            media_info[u'related'] = related
        else:
            media_info[u'related'] = None
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    return media_info
def parse_sidebar(self, anime_page):
    """Parses the DOM and returns anime attributes in the sidebar.

    :type anime_page: :class:`bs4.BeautifulSoup`
    :param anime_page: MAL anime page's DOM

    :rtype: dict
    :return: anime attributes

    :raises: :class:`.InvalidAnimeError`, :class:`.MalformedAnimePageError`
    """
    # delegate the media-generic sidebar fields to the parent class;
    # re-raise its generic error as the anime-specific one.
    try:
        anime_info = super(Anime, self).parse_sidebar(anime_page)
    except media.InvalidMediaError as e:
        raise InvalidAnimeError(e.id)

    # first cell of the content table holds the info panel we scrape below.
    info_panel_first = anime_page.find(u'div', {
        'id': 'content'
    }).find(u'table').find(u'td')

    try:
        # "Episodes: N" row; strip the label span, then parse the count.
        # MAL shows 'Unknown' for unaired series — mapped to 0 here.
        episode_tag = info_panel_first.find(
            text=u'Episodes:').parent.parent
        utilities.extract_tags(
            episode_tag.find_all(u'span', {'class': 'dark_text'}))
        anime_info[u'episodes'] = int(episode_tag.text.strip(
        )) if episode_tag.text.strip() != 'Unknown' else 0
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        # "Aired:" row — either a single date or "<start> to <end>".
        aired_tag = info_panel_first.find(text=u'Aired:').parent.parent
        utilities.extract_tags(
            aired_tag.find_all(u'span', {'class': 'dark_text'}))
        aired_parts = aired_tag.text.strip().split(u' to ')
        if len(aired_parts) == 1:
            # this aired once.
            try:
                aired_date = utilities.parse_profile_date(
                    aired_parts[0],
                    suppress=self.session.suppress_parse_exceptions)
            except ValueError:
                raise MalformedAnimePageError(
                    self.id, aired_parts[0],
                    message="Could not parse single air date")
            # note: one-element tuple, so 'aired' is always a tuple.
            anime_info[u'aired'] = (aired_date, )
        else:
            # two airing dates.
            try:
                air_start = utilities.parse_profile_date(
                    aired_parts[0],
                    suppress=self.session.suppress_parse_exceptions)
            except ValueError:
                raise MalformedAnimePageError(
                    self.id, aired_parts[0],
                    message="Could not parse first of two air dates")
            try:
                air_end = utilities.parse_profile_date(
                    aired_parts[1],
                    suppress=self.session.suppress_parse_exceptions)
            except ValueError:
                raise MalformedAnimePageError(
                    self.id, aired_parts[1],
                    message="Could not parse second of two air dates")
            anime_info[u'aired'] = (air_start, air_end)
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        # "Producers:" row — one link per producer, of the form
        # /anime/producer/23/Bandai_Visual; id is the fourth path segment.
        producers_tag = info_panel_first.find(
            text=u'Producers:').parent.parent
        utilities.extract_tags(
            producers_tag.find_all(u'span', {'class': 'dark_text'}))
        anime_info[u'producers'] = []
        for producer_link in producers_tag.find_all('a'):
            if producer_link.text == u'add some':
                # MAL is saying "None found, add some".
                break
            link_parts = producer_link.get('href').split('/')
            # of the form: /anime/producer/23/Bandai_Visual
            anime_info[u'producers'].append(
                self.session.producer(int(link_parts[3])).set(
                    {'name': producer_link.text}))
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        # "Duration:" row, e.g. "24 min. per ep." or "1 hr. 55 min.".
        # Each dot-separated segment contributes hours or minutes;
        # result is normalized into a timedelta.
        duration_tag = info_panel_first.find(
            text=u'Duration:').parent.parent
        utilities.extract_tags(
            duration_tag.find_all(u'span', {'class': 'dark_text'}))
        anime_info[u'duration'] = duration_tag.text.strip()
        duration_parts = [
            part.strip() for part in anime_info[u'duration'].split(u'.')
        ]
        duration_mins = 0
        for part in duration_parts:
            part_match = re.match(u'(?P<num>[0-9]+)', part)
            if not part_match:
                continue
            part_volume = int(part_match.group(u'num'))
            if part.endswith(u'hr'):
                duration_mins += part_volume * 60
            elif part.endswith(u'min'):
                duration_mins += part_volume
        anime_info[u'duration'] = datetime.timedelta(minutes=duration_mins)
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        # "Rating:" row — kept as the raw display string, e.g.
        # "PG-13 - Teens 13 or older".
        rating_tag = info_panel_first.find(text=u'Rating:').parent.parent
        utilities.extract_tags(
            rating_tag.find_all(u'span',
                                {'class': 'dark_text'}))
        anime_info[u'rating'] = rating_tag.text.strip()
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    return anime_info
def parse_sidebar(self, media_page, media_page_original=None):
    """Parses the DOM and returns media attributes in the sidebar.

    :type media_page: :class:`bs4.BeautifulSoup`
    :param media_page: MAL media page's DOM

    :type media_page_original: :class:`bs4.BeautifulSoup`
    :param media_page_original: fallback DOM used when the primary page's
        markup variant lacks an element (newer MAL layout).

    :rtype: dict
    :return: media attributes.

    :raises: InvalidMediaError, MalformedMediaPageError
    """
    media_info = {}

    # if MAL says the series doesn't exist, raise an InvalidMediaError.
    error_tag = media_page.find(u'div', {'class': 'badresult'})
    if error_tag:
        raise InvalidMediaError(self.id)

    try:
        title_tag = media_page.find(u'div', {'id': 'contentWrapper'}).find(u'h1')
        if not title_tag.find(u'div'):
            # primary layout variant missing — try the alternate markup.
            try:
                title_tag = media_page_original.select(
                    'div#contentWrapper h1.h1 span')[0]
            except IndexError:
                # otherwise, raise a MalformedMediaPageError.
                raise MalformedMediaPageError(
                    self.id, None, message="Could not find title div")
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        utilities.extract_tags(title_tag.find_all())
        media_info[u'title'] = title_tag.text.strip()
        if media_info[u'title'] == '':
            media_info[u'title'] = media_page_original.find(
                'span', {'itemprop': 'name'}).text
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    # first cell of the content table holds the info panel scraped below.
    info_panel_first = media_page_original.select('div#content table td')[0]

    try:
        picture_tag = info_panel_first.find(u'img')
        src = picture_tag.get(u'src')
        # BUGFIX: only decode when src is a byte string. The old
        # unconditional .decode('utf-8') raised AttributeError on
        # Python 3, where bs4 attribute values are already text.
        if isinstance(src, bytes):
            src = src.decode('utf-8')
        media_info[u'picture'] = src
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        # assemble alternative titles for this series, keyed by language.
        media_info[u'alternative_titles'] = {}
        alt_titles_header = info_panel_first.find(
            u'h2', text=u'Alternative Titles')
        if alt_titles_header:
            next_tag = alt_titles_header.find_next_sibling(
                u'div', {'class': 'spaceit_pad'})
            while True:
                if next_tag is None or not next_tag.find(
                        u'span', {'class': 'dark_text'}):
                    # not a language node, break.
                    break
                # get language and remove the node.
                language = next_tag.find(u'span').text[:-1]
                utilities.extract_tags(
                    next_tag.find_all(u'span', {'class': 'dark_text'}))
                names = next_tag.text.strip().split(u', ')
                media_info[u'alternative_titles'][language] = names
                next_tag = next_tag.find_next_sibling(
                    u'div', {'class': 'spaceit_pad'})
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        try:
            type_tag = info_panel_first.find(text=u'Type:').parent.parent
            utilities.extract_tags(
                type_tag.find_all(u'span', {'class': 'dark_text'}))
            media_info[u'type'] = type_tag.text.strip()
        except AttributeError:
            # alternate layout: label and value share one div.
            type_tag = [x for x in info_panel_first.find_all('div')
                        if 'Type:' in x.text][0]
            media_info[u'type'] = type_tag.text.split(':')[-1].strip()
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        status_tag = [x for x in media_page.find_all('span')
                      if 'Status:' in x.text][0].parent
        media_info[u'status'] = status_tag.text.split(':')[1].strip()
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        media_info[u'genres'] = self.parse_genres(media_page_original)
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        # grab statistics for this media.
        score_tag = media_page.find('span', {'itemprop': 'aggregateRating'})
        # there is difference between anime and manga page:
        # on manga pages the rating lives in a span-tag, on anime pages
        # in a div-tag. Probe the span variant first.
        try:
            score_tag.find('span')
        except AttributeError:
            # BUGFIX: was a duplicated 'score_tag = score_tag = ...'.
            score_tag = media_page.find('div', {'itemprop': 'aggregateRating'})
        # get score and number of users.
        num_users = int(score_tag.find(
            'span', {'itemprop': 'ratingCount'}).text.replace(',', ''))
        score_point = score_tag.find(
            'span', {'itemprop': 'ratingValue'}).text
        try:
            media_info[u'score'] = (decimal.Decimal(score_point), num_users)
        except (InvalidOperation, AttributeError):
            # value not parseable from this variant; fall back to the
            # alternate DOM.
            score_tag = media_page_original.find(
                'span', {'itemprop': 'ratingValue'})
            media_info[u'score'] = (decimal.Decimal(score_tag.text), num_users)
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        try:
            rank_tag = info_panel_first.find(text=u'Ranked:').parent.parent
            utilities.extract_tags(rank_tag.find_all())
            media_info[u'rank'] = int(
                rank_tag.text.strip()[1:].replace(u',', ''))
        except AttributeError:
            # BUGFIX: was filter(...)[0] — filter() is not subscriptable
            # on Python 3; use a list comprehension instead.
            rank_tags = [x for x in media_page_original.find_all(
                'div', {'class': 'spaceit'}) if 'Ranked:' in x.text]
            media_info[u'rank'] = int(
                rank_tags[0].text.split('#')[-1].strip())
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        try:
            popularity_tag = info_panel_first.find(
                text=u'Popularity:').parent.parent
            utilities.extract_tags(popularity_tag.find_all())
            media_info[u'popularity'] = int(
                popularity_tag.text.strip()[1:].replace(u',', ''))
        except AttributeError:
            # BUGFIX: filter(...)[0] replaced as above; local also renamed
            # from the misleading 'rank_tag'.
            popularity_tag = [x for x in media_page_original.find_all(
                'span', {'class': 'dark_text'})
                if 'Popularity' in x.text][0].parent
            media_info[u'popularity'] = int(
                popularity_tag.text.split('#')[-1].strip())
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        try:
            members_tag = info_panel_first.find(
                text=u'Members:').parent.parent
            utilities.extract_tags(members_tag.find_all())
            media_info[u'members'] = int(
                members_tag.text.strip().replace(u',', ''))
        except AttributeError:
            # BUGFIX: filter(...)[0] replaced as above.
            members_tag = [x for x in media_page_original.find_all(
                'span', {'class': 'dark_text'})
                if 'Members' in x.text][0].parent
            media_info[u'members'] = int(
                members_tag.text.split(':')[-1].strip().replace(u',', ''))
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        try:
            favorites_tag = info_panel_first.find(
                text=u'Favorites:').parent.parent
            utilities.extract_tags(favorites_tag.find_all())
            media_info[u'favorites'] = int(
                favorites_tag.text.strip().replace(u',', ''))
        except AttributeError:
            # BUGFIX: filter(...)[0] replaced as above.
            favorites_tag = [x for x in media_page_original.find_all(
                'span', {'class': 'dark_text'})
                if 'Favorites' in x.text][0].parent
            media_info[u'favorites'] = int(
                favorites_tag.text.split(':')[-1].strip().replace(u',', ''))
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    try:
        # get popular tags.
        tags_header = media_page.find(u'h2', text=u'Popular Tags')
        try:
            tags_tag = tags_header.find_next_sibling(u'span')
            media_info[u'popular_tags'] = {}
            for tag_link in tags_tag.find_all('a'):
                tag = self.session.tag(tag_link.text)
                num_people = int(re.match(
                    r'(?P<people>[0-9]+) people',
                    tag_link.get('title')).group('people'))
                media_info[u'popular_tags'][tag] = num_people
        except AttributeError:
            # alternate layout: tags listed under "Genres:".
            tags_tag = media_page_original.find('span', text='Genres:').parent
            media_info[u'popular_tags'] = {}
            for tag_link in tags_tag.find_all('a'):
                tag = self.session.tag(tag_link.text.lower())
                try:
                    num_people = int(re.match(
                        r'(?P<people>[0-9]+) people',
                        tag_link.get('title')).group('people'))
                    media_info[u'popular_tags'][tag] = num_people
                except (TypeError, AttributeError):
                    # no people count available — fall back to the tag id
                    # from the link's query string.
                    tag_num = tag_link.get('href').split('=')[-1]
                    media_info[u'popular_tags'][tag] = tag_num
    except:
        if not self.session.suppress_parse_exceptions:
            raise

    return media_info