예제 #1
0
    def parse_synopsis(self, media_page):
        """Parse the DOM and return the media synopsis.

        :type media_page: :class:`bs4.BeautifulSoup`
        :param media_page: MAL media page's DOM

        :rtype: string
        :return: media synopsis.

        """
        synopsis_elt = [x for x in media_page.find_all('h2')
                        if "Synopsis" in x.text][0].parent
        # Collect the text between the "Synopsis" h2 and the next h2 tag.
        temp_synopsis_elt = []
        for x in synopsis_elt.contents[1:]:
            # isinstance is the idiomatic (and subclass-safe) type check;
            # NavigableString nodes are not Tags, so they fall to the else.
            if isinstance(x, bs4.element.Tag):
                if x.name == 'h2':
                    break
                temp_synopsis_elt.append(x.text)
            else:
                temp_synopsis_elt.append(x)
        synopsis_elt = ''.join(temp_synopsis_elt)
        try:
            # NOTE(review): synopsis_elt is a plain string at this point, so
            # find_all always raises AttributeError and this branch is
            # effectively dead; kept to preserve behavior exactly.
            utilities.extract_tags(synopsis_elt.find_all('h2'))
            result = synopsis_elt.text.strip()
        except AttributeError:
            # the current synopsis_elt may not contain any h2-tag
            result = synopsis_elt
        if result == '':
            # Fall back to the span tagged itemprop="description".
            rs_tag = [xx for xx in media_page.select('span')
                      if xx.get('itemprop') == 'description'][0]
            result = rs_tag.text
        return result
예제 #2
0
    def parse(self, media_page):
        """Parses the DOM and returns media attributes in the main-content area.

    :type media_page: :class:`bs4.BeautifulSoup`
    :param media_page: MAL media page's DOM

    :rtype: dict
    :return: media attributes.

    """
        media_info = self.parse_sidebar(media_page)

        # Synopsis lives in a span tagged itemprop="description".
        try:
            synopsis_tag = media_page.find(u'span',
                                           {'itemprop': 'description'})
            utilities.extract_tags([synopsis_tag])
            media_info[u'synopsis'] = synopsis_tag.text.strip()
        except:
            # Parse failures are swallowed unless the session opts into raising.
            if not self.session.suppress_parse_exceptions:
                raise

        # Related-media section: header reads e.g. "Related Anime" /
        # "Related Manga", built from the concrete subclass name.
        try:
            related_title = media_page.find(
                text=re.compile(u'Related ' + self.__class__.__name__))
            if related_title:
                related_table = related_title.parent.next_sibling
                utilities.extract_tags([related_table])
                related = {}

                # extract each related category
                for row in related_table.find_all('tr'):
                    # first cell carries the category label, e.g. "Sequel:"
                    related_type = row.find('td').text.strip(':')
                    related[related_type] = []
                    # extract each title in the category
                    for link in row.find_all('a'):
                        href = link.get(u'href').replace(
                            u'http://myanimelist.net', '')
                        if not re.match(r'/(anime|manga)', href):
                            # non-media link: stop scanning this row
                            break
                        title = link.text
                        # parse link: may be manga or anime.
                        href_parts = href.split(u'/')
                        # sometimes links on MAL are broken, of the form /anime//
                        if href_parts[2] == '':
                            continue
                        # of the form: /(anime|manga)/1/Cowboy_Bebop
                        obj_id = int(href_parts[2])
                        # dispatches to session.anime(id) or session.manga(id)
                        # based on the URL's first path segment
                        new_obj = getattr(self.session,
                                          href_parts[1])(obj_id).set(
                                              {'title': title})
                        related[related_type].append(new_obj)

                media_info[u'related'] = related
            else:
                media_info[u'related'] = None
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        return media_info
예제 #3
0
    def parse_recommendations(self, recommendations_page):
        """Parse the DOM and return user recommendations attributes.

        :type recommendations_page: :class:`bs4.BeautifulSoup`
        :param recommendations_page: MAL user recommendations page's DOM

        :rtype: dict
        :return: User recommendations attributes.

        """
        user_info = self.parse_sidebar(recommendations_page)
        content_div = recommendations_page.find(u'div', {u'id': u'content'})
        second_col = content_div.find(u'table').find(u'tr').find_all(
            u'td', recursive=False)[1]

        try:
            rec_rows = second_col.find_all("div", {"class": "spaceit borderClass"})
            if rec_rows:
                user_info['recommendations'] = {}
                # skip the first div (header) and parse each recommendation row
                for rec_row in rec_rows[1:]:
                    media_cells = rec_row.find(u'table').find_all(u'td')

                    # media the user liked; href of the form
                    # /anime|manga/64/Rozen_Maiden
                    liked_link = media_cells[0].find(u'a', recursive=False)
                    parts = liked_link.get(u'href').split(u'/')
                    liked_media = getattr(self.session, parts[1])(
                        int(parts[2])).set({u'title': liked_link.text})

                    # media recommended alongside it (same href shape)
                    rec_link = media_cells[1].find(u'a', recursive=False)
                    parts = rec_link.get(u'href').split(u'/')
                    recommended_media = getattr(self.session, parts[1])(
                        int(parts[2])).set({u'title': rec_link.text})

                    # free-text blurb plus submission date from the menu div
                    rec_text = rec_row.find(u'p').text
                    rec_menu = rec_row.find(u'div', recursive=False)
                    utilities.extract_tags(rec_menu)
                    rec_date = utilities.parse_profile_date(
                        rec_menu.text.split(u' - ')[1])

                    user_info['recommendations'][liked_media] = {
                        parts[1]: recommended_media,
                        'text': rec_text,
                        'date': rec_date,
                    }
        except:
            # swallow parse failures unless the session opts into raising
            if not self.session.suppress_parse_exceptions:
                raise

        return user_info
예제 #4
0
    def parse_recommendations(self, recommendations_page):
        """Parses the DOM and returns user recommendations attributes.

    :type recommendations_page: :class:`bs4.BeautifulSoup`
    :param recommendations_page: MAL user recommendations page's DOM

    :rtype: dict
    :return: User recommendations attributes.

    """
        user_info = self.parse_sidebar(recommendations_page)
        # second content column holds the recommendation rows
        second_col = recommendations_page.find(u'div', {
            u'id': u'content'
        }).find(u'table').find(u'tr').find_all(u'td', recursive=False)[1]

        try:
            recommendations = second_col.find_all(
                u"div", {u"class": u"spaceit borderClass"})
            if recommendations:
                user_info[u'recommendations'] = {}
                # first div is a header; skip it
                for row in recommendations[1:]:
                    anime_table = row.find(u'table')
                    animes = anime_table.find_all(u'td')
                    # first cell links the media the user liked
                    liked_media_link = animes[0].find(u'a', recursive=False)
                    link_parts = liked_media_link.get(u'href').split(u'/')
                    # of the form /anime|manga/64/Rozen_Maiden
                    liked_media = getattr(self.session, link_parts[1])(int(
                        link_parts[2])).set({u'title': liked_media_link.text})

                    # second cell links the recommended media
                    recommended_media_link = animes[1].find(u'a',
                                                            recursive=False)
                    link_parts = recommended_media_link.get(u'href').split(
                        u'/')
                    # of the form /anime|manga/64/Rozen_Maiden
                    recommended_media = getattr(self.session, link_parts[1])(
                        int(link_parts[2])).set(
                            {u'title': recommended_media_link.text})

                    recommendation_text = row.find(u'p').text

                    recommendation_menu = row.find(u'div', recursive=False)
                    # NOTE(review): other call sites pass extract_tags a list;
                    # here a single tag is passed — confirm both are accepted.
                    utilities.extract_tags(recommendation_menu)
                    # menu text looks like "... - <date>"; take the date part
                    recommendation_date = utilities.parse_profile_date(
                        recommendation_menu.text.split(u' - ')[1])

                    user_info[u'recommendations'][liked_media] = {
                        link_parts[1]: recommended_media,
                        'text': recommendation_text,
                        'date': recommendation_date
                    }
        except:
            # parse failures are swallowed unless the session opts into raising
            if not self.session.suppress_parse_exceptions:
                raise

        return user_info
예제 #5
0
    def parse_synopsis(self, media_page):
        """Parse the DOM and return the media synopsis.

        :type media_page: :class:`bs4.BeautifulSoup`
        :param media_page: MAL media page's DOM

        :rtype: string
        :return: media synopsis.

        """
        synopsis_elt = [
            x for x in media_page.find_all(u'h2') if "Synopsis" in x.text
        ][0].parent
        # Collect the text between the "Synopsis" h2 and the next h2 tag.
        temp_synopsis_elt = []
        for x in synopsis_elt.contents[1:]:
            # isinstance is the idiomatic (and subclass-safe) type check;
            # NavigableString nodes are not Tags, so they fall to the else.
            if isinstance(x, bs4.element.Tag):
                if x.name == 'h2':
                    break
                temp_synopsis_elt.append(x.text)
            else:
                temp_synopsis_elt.append(x)
        synopsis_elt = ''.join(temp_synopsis_elt)
        try:
            # NOTE(review): synopsis_elt is a plain string at this point, so
            # find_all always raises AttributeError and this branch is
            # effectively dead; kept to preserve behavior exactly.
            utilities.extract_tags(synopsis_elt.find_all(u'h2'))
            result = synopsis_elt.text.strip()
        except AttributeError:
            # the current synopsis_elt may not contain any h2-tag
            result = synopsis_elt
        if result == '':
            # Fall back to the span tagged itemprop="description".
            rs_tag = [
                xx for xx in media_page.select('span')
                if xx.get('itemprop') == 'description'
            ][0]
            result = rs_tag.text
        return result
예제 #6
0
    def parse(self, media_page):
        """Parses the DOM and returns media attributes in the main-content area.

    :type media_page: :class:`bs4.BeautifulSoup`
    :param media_page: MAL media page's DOM

    :rtype: dict
    :return: media attributes.

    """
        media_info = self.parse_sidebar(media_page)

        # Synopsis is the text of the parent of the "Synopsis" h2 header,
        # with any nested h2 tags stripped out.
        try:
            synopsis_elt = media_page.find(u'h2', text=u'Synopsis').parent
            utilities.extract_tags(synopsis_elt.find_all(u'h2'))
            media_info[u'synopsis'] = synopsis_elt.text.strip()
        except:
            # Parse failures are swallowed unless the session opts into raising.
            if not self.session.suppress_parse_exceptions:
                raise

        # Related-media section: header reads e.g. "Related Anime" /
        # "Related Manga", built from the concrete subclass name.
        try:
            related_title = media_page.find(u'h2',
                                            text=u'Related ' +
                                            self.__class__.__name__)
            if related_title:
                related_elt = related_title.parent
                utilities.extract_tags(related_elt.find_all(u'h2'))
                related = {}
                for link in related_elt.find_all(u'a'):
                    href = link.get(u'href').replace(u'http://myanimelist.net',
                                                     '')
                    if not re.match(r'/(anime|manga)', href):
                        # non-media link: stop scanning
                        break
                    curr_elt = link.previous_sibling
                    if curr_elt is None:
                        # we've reached the end of the list.
                        break
                    related_type = None
                    # walk backwards through siblings until we hit the text
                    # node carrying this link's category label, e.g. "Sequel:"
                    while True:
                        if not curr_elt:
                            raise MalformedAnimePageError(
                                self.id,
                                related_elt,
                                message=
                                "Prematurely reached end of related anime listing"
                            )
                        if isinstance(curr_elt, bs4.NavigableString):
                            type_match = re.match(u'(?P<type>[a-zA-Z\ \-]+):',
                                                  curr_elt)
                            if type_match:
                                related_type = type_match.group(u'type')
                                break
                        curr_elt = curr_elt.previous_sibling
                    title = link.text
                    # parse link: may be manga or anime.
                    href_parts = href.split(u'/')
                    # sometimes links on MAL are broken, of the form /anime//
                    if href_parts[2] == '':
                        continue
                    # of the form: /(anime|manga)/1/Cowboy_Bebop
                    obj_id = int(href_parts[2])
                    # dispatches to session.anime(id) or session.manga(id)
                    new_obj = getattr(self.session, href_parts[1])(obj_id).set(
                        {'title': title})
                    if related_type not in related:
                        related[related_type] = [new_obj]
                    else:
                        related[related_type].append(new_obj)
                media_info[u'related'] = related
            else:
                media_info[u'related'] = None
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        return media_info
예제 #7
0
    def parse_sidebar(self, manga_page, manga_page_original=None):
        """Parses the DOM and returns manga attributes in the sidebar.

        :type manga_page: :class:`bs4.BeautifulSoup`
        :param manga_page: MAL manga page's DOM

        :type manga_page_original: :class:`bs4.BeautifulSoup`
        :param manga_page_original: MAL manga page's DOM, uncleaned

        :rtype: dict
        :return: manga attributes

        :raises: :class:`.InvalidMangaError`, :class:`.MalformedMangaPageError`
        """
        # if MAL says the series doesn't exist, raise an InvalidMangaError.
        error_tag = manga_page.find(u'div', {'class': 'badresult'})
        if error_tag:
            raise InvalidMangaError(self.id)

        try:
            title_tag = manga_page.find(u'span', {'itemprop': 'name'})
            if not title_tag:
                # otherwise, raise a MalformedMangaPageError.
                raise MalformedMangaPageError(self.id,
                                              manga_page,
                                              message="Could not find title")
        except:
            # parse failures are swallowed unless the session opts into raising
            if not self.session.suppress_parse_exceptions:
                raise

        # otherwise, begin parsing.
        manga_info = super(Manga, self).parse_sidebar(manga_page,
                                                      manga_page_original)

        info_panel_first = manga_page.find(u'div', {
            'id': 'content'
        }).find(u'table').find(u'td')

        try:
            # "Volumes: N" — N may be the literal string "Unknown"
            volumes_tag = info_panel_first.find(text=u'Volumes:').parent.parent
            utilities.extract_tags(
                volumes_tag.find_all(u'span', {'class': 'dark_text'}))
            manga_info[u'volumes'] = int(volumes_tag.text.strip(
            )) if volumes_tag.text.strip() != 'Unknown' else None
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            # "Chapters: N" — N may be the literal string "Unknown"
            chapters_tag = info_panel_first.find(
                text=u'Chapters:').parent.parent
            utilities.extract_tags(
                chapters_tag.find_all(u'span', {'class': 'dark_text'}))
            manga_info[u'chapters'] = int(chapters_tag.text.strip(
            )) if chapters_tag.text.strip() != 'Unknown' else None
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            # "Published: <start> to <end>" or "Published: <date>"
            published_tag = info_panel_first.find(
                text=u'Published:').parent.parent
            utilities.extract_tags(
                published_tag.find_all(u'span', {'class': 'dark_text'}))
            published_parts = published_tag.text.strip().split(u' to ')
            if len(published_parts) == 1:
                # this published once.
                try:
                    published_date = utilities.parse_profile_date(
                        published_parts[0])
                except ValueError:
                    raise MalformedMangaPageError(
                        self.id,
                        published_parts[0],
                        message="Could not parse single publish date")
                manga_info[u'published'] = (published_date, )
            else:
                # two publishing dates.
                try:
                    publish_start = utilities.parse_profile_date(
                        published_parts[0])
                except ValueError:
                    raise MalformedMangaPageError(
                        self.id,
                        published_parts[0],
                        message="Could not parse first of two publish dates")
                # BUG FIX: previously compared the whole list to u'?'
                # (always False); the end-date element is what MAL sets to
                # "?" while a series is still publishing.
                if published_parts[1] == u'?':
                    # this is still publishing.
                    publish_end = None
                else:
                    try:
                        publish_end = utilities.parse_profile_date(
                            published_parts[1])
                    except ValueError:
                        raise MalformedMangaPageError(
                            self.id,
                            published_parts[1],
                            message=
                            "Could not parse second of two publish dates")
                manga_info[u'published'] = (publish_start, publish_end)
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            # "Authors:" — each link of the form /people/1867/Naoki_Urasawa,
            # followed by a text node like " (Story & Art)" giving the role
            authors_tag = info_panel_first.find(text=u'Authors:').parent.parent
            utilities.extract_tags(
                authors_tag.find_all(u'span', {'class': 'dark_text'}))
            manga_info[u'authors'] = {}
            for author_link in authors_tag.find_all('a'):
                link_parts = author_link.get('href').split('/')
                # of the form /people/1867/Naoki_Urasawa
                person = self.session.person(int(link_parts[2])).set(
                    {'name': author_link.text})
                role = author_link.nextSibling.replace(' (',
                                                       '').replace(')', '')
                manga_info[u'authors'][person] = role
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            manga_info[u'serialization'] = self.parse_serialization(manga_page)
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        return manga_info
예제 #8
0
    def parse_sidebar(self, media_page):
        """Parses the DOM and returns media attributes in the sidebar.

    :type media_page: :class:`bs4.BeautifulSoup`
    :param media_page: MAL media page's DOM

    :rtype: dict
    :return: media attributes.

    :raises: InvalidMediaError, MalformedMediaPageError

    """
        media_info = {}

        # if MAL says the series doesn't exist, raise an InvalidMediaError.
        error_tag = media_page.find(u'div', {'class': 'badresult'})
        if error_tag:
            raise InvalidMediaError(self.id)

        try:
            title_tag = media_page.find(u'div', {
                'id': 'contentWrapper'
            }).find(u'h1')
        except:
            # parse failures are swallowed unless the session opts into raising
            if not self.session.suppress_parse_exceptions:
                raise

        # NOTE(review): if the lookup above failed under suppression,
        # title_tag is unbound here and this block raises NameError,
        # which is likewise suppressed.
        try:
            media_info[u'title'] = title_tag.text.strip()
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        info_panel_first = media_page.find(u'div', {
            'id': 'content'
        }).find(u'table').find(u'td')

        try:
            picture_tag = info_panel_first.find(u'img')
            # NOTE(review): str.decode implies Python 2-era strings; this
            # raises AttributeError on Python 3 — confirm target runtime.
            media_info[u'picture'] = picture_tag.get(u'src').decode('utf-8')
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            # assemble alternative titles for this series.
            media_info[u'alternative_titles'] = {}
            alt_titles_header = info_panel_first.find(
                u'h2', text=u'Alternative Titles')
            if alt_titles_header:
                next_tag = alt_titles_header.find_next_sibling(
                    u'div', {'class': 'spaceit_pad'})
                while True:
                    if next_tag is None or not next_tag.find(
                            u'span', {'class': 'dark_text'}):
                        # not a language node, break.
                        break
                    # get language and remove the node.
                    # span text is e.g. "Japanese:"; drop the trailing colon
                    language = next_tag.find(u'span').text[:-1]
                    utilities.extract_tags(
                        next_tag.find_all(u'span', {'class': 'dark_text'}))
                    names = next_tag.text.strip().split(u', ')
                    media_info[u'alternative_titles'][language] = names
                    next_tag = next_tag.find_next_sibling(
                        u'div', {'class': 'spaceit_pad'})
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            # "Type:" e.g. TV, Movie, Manga
            type_tag = info_panel_first.find(text=u'Type:').parent.parent
            utilities.extract_tags(
                type_tag.find_all(u'span', {'class': 'dark_text'}))
            media_info[u'type'] = type_tag.text.strip()
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            # "Status:" e.g. Finished Airing
            status_tag = info_panel_first.find(text=u'Status:').parent.parent
            utilities.extract_tags(
                status_tag.find_all(u'span', {'class': 'dark_text'}))
            media_info[u'status'] = status_tag.text.strip()
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            # "Genres:" — one link per genre; stored as plain names
            genres_tag = info_panel_first.find(text=u'Genres:').parent.parent
            utilities.extract_tags(
                genres_tag.find_all(u'span', {'class': 'dark_text'}))
            media_info[u'genres'] = []
            for genre_link in genres_tag.find_all('a'):
                media_info[u'genres'].append(genre_link.text)
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            # grab statistics for this media.
            score_tag = info_panel_first.find(text=u'Score:').parent.parent

            # Third span tag contains information on how many users rated
            num_users = int(
                score_tag.find_all('span')[2].text.replace(',', ''))
            # Second span tag contains average rating
            score = float(score_tag.find_all('span')[1].text)

            # stored as an (average, vote-count) pair
            media_info[u'score'] = (score, num_users)
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            # "Ranked: #N" — strip the leading '#' and thousands separators
            rank_tag = info_panel_first.find(text=u'Ranked:').parent.parent
            utilities.extract_tags(rank_tag.find_all())
            media_info[u'rank'] = int(rank_tag.text.strip()[1:].replace(
                u',', ''))
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            # "Popularity: #N" — same '#N' format as rank
            popularity_tag = info_panel_first.find(
                text=u'Popularity:').parent.parent
            utilities.extract_tags(popularity_tag.find_all())
            media_info[u'popularity'] = int(
                popularity_tag.text.strip()[1:].replace(u',', ''))
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            # "Members: N,NNN"
            members_tag = info_panel_first.find(text=u'Members:').parent.parent
            utilities.extract_tags(members_tag.find_all())
            media_info[u'members'] = int(members_tag.text.strip().replace(
                u',', ''))
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            # "Favorites: N,NNN"
            favorites_tag = info_panel_first.find(
                text=u'Favorites:').parent.parent
            utilities.extract_tags(favorites_tag.find_all())
            media_info[u'favorites'] = int(favorites_tag.text.strip().replace(
                u',', ''))
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            # get popular tags.
            tags_header = media_page.find(u'h2', text=u'Popular Tags')
            tags_tag = tags_header.find_next_sibling(u'span')
            media_info[u'popular_tags'] = {}
            for tag_link in tags_tag.find_all('a'):
                tag = self.session.tag(tag_link.text)
                # link title attribute reads like "123 people tagged ..."
                num_people = int(
                    re.match(r'(?P<people>[0-9]+) people',
                             tag_link.get('title')).group('people'))
                media_info[u'popular_tags'][tag] = num_people
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        return media_info
예제 #9
0
    def parse_sidebar(self, anime_page, anime_page_original=None):
        """Parses the DOM and returns anime attributes in the sidebar.

        :type anime_page: :class:`bs4.BeautifulSoup`
        :param anime_page: MAL anime page's DOM

        :type anime_page_original: :class:`bs4.BeautifulSoup`
        :param anime_page_original: MAL anime page's DOM uncleaned

        :rtype: dict
        :return: anime attributes

        :raises: :class:`.InvalidAnimeError`, :class:`.MalformedAnimePageError`
        """
        # if MAL says the series doesn't exist, raise an InvalidAnimeError.
        error_tag = anime_page.find(u'div', {'class': 'badresult'})
        if error_tag:
            raise InvalidAnimeError(self.id)

        title_tag = anime_page.find(u'div', {
            'id': 'contentWrapper'
        }).find(u'h1')
        if not title_tag.find(u'div'):
            # otherwise, raise a MalformedAnimePageError.
            # newer page layout: title is in an h1.h1 > span element
            try:
                title_tag = anime_page.select('h1.h1 span')[0].text
            except IndexError:
                raise MalformedAnimePageError(
                    self.id, None, message="Could not find title div")

        anime_info = super(Anime, self).parse_sidebar(anime_page,
                                                      anime_page_original)
        info_panel_first = anime_page.find(u'div', {
            'id': 'content'
        }).find(u'table').find(u'td')

        # NOTE(review): the sections below read anime_page_original; when it
        # is left as the default None, the lookups raise AttributeError,
        # which the bare excepts swallow under suppress_parse_exceptions.
        try:
            # "Episodes: N" — N may be the literal string "Unknown"
            episode_tag = [
                x for x in anime_page_original.find_all('span')
                if 'Episodes:' in x.text
            ][0].parent
            anime_info[u'episodes'] = int(
                episode_tag.text.split(':')
                [-1].strip()) if episode_tag.text.strip() != 'Unknown' else 0
        except:
            # parse failures swallowed unless the session opts into raising
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            # "Aired: <start> to <end>" or "Aired: <date>"
            aired_tag = [
                x for x in anime_page_original.find_all('span')
                if 'Aired:' in x.text
            ][0].parent
            aired_tag_text = aired_tag.text.split(':')[1]
            aired_parts = aired_tag_text.strip().split(u' to ')
            if len(aired_parts) == 1:
                # this aired once.
                try:
                    aired_date = utilities.parse_profile_date(
                        aired_parts[0],
                        suppress=self.session.suppress_parse_exceptions)
                except ValueError:
                    raise MalformedAnimePageError(
                        self.id,
                        aired_parts[0],
                        message="Could not parse single air date")
                anime_info[u'aired'] = (aired_date, )
            else:
                # two airing dates.
                try:
                    air_start = utilities.parse_profile_date(
                        aired_parts[0],
                        suppress=self.session.suppress_parse_exceptions)
                except ValueError:
                    raise MalformedAnimePageError(
                        self.id,
                        aired_parts[0],
                        message="Could not parse first of two air dates")
                try:
                    air_end = utilities.parse_profile_date(
                        aired_parts[1],
                        suppress=self.session.suppress_parse_exceptions)
                except ValueError:
                    raise MalformedAnimePageError(
                        self.id,
                        aired_parts[1],
                        message="Could not parse second of two air dates")
                anime_info[u'aired'] = (air_start, air_end)
        except:
            if not self.session.suppress_parse_exceptions:
                raise
        try:
            anime_info[u'producers'] = self.parse_producers(anime_page)
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            # "Duration:" e.g. "24 min. per ep." — split on '.' and sum the
            # hr/min components into a timedelta
            duration_tag = [
                x for x in anime_page_original.find_all('span')
                if 'Duration:' in x.text
            ][0].parent
            anime_info[u'duration'] = duration_tag.text.split(':')[1].strip()
            duration_parts = [
                part.strip() for part in anime_info[u'duration'].split(u'.')
            ]
            duration_mins = 0
            for part in duration_parts:
                part_match = re.match(u'(?P<num>[0-9]+)', part)
                if not part_match:
                    continue
                part_volume = int(part_match.group(u'num'))
                if part.endswith(u'hr'):
                    duration_mins += part_volume * 60
                elif part.endswith(u'min'):
                    duration_mins += part_volume
            anime_info[u'duration'] = datetime.timedelta(minutes=duration_mins)
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            # "Rating:" e.g. "PG-13 - Teens 13 or older"
            rating_tag = [
                x for x in anime_page_original.find_all('span')
                if 'Rating:' in x.text
            ][0].parent
            utilities.extract_tags(
                rating_tag.find_all(u'span', {'class': 'dark_text'}))
            anime_info[u'rating'] = rating_tag.text.strip()
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        return anime_info
예제 #10
0
    def parse_sidebar(self, anime_page, anime_page_original=None):
        """Parse the DOM and return anime attributes found in the sidebar.

        :type anime_page: :class:`bs4.BeautifulSoup`
        :param anime_page: MAL anime page's DOM (pre-cleaned)

        :type anime_page_original: :class:`bs4.BeautifulSoup`
        :param anime_page_original: MAL anime page's DOM, uncleaned

        :rtype: dict
        :return: anime attributes

        :raises: :class:`.InvalidAnimeError`, :class:`.MalformedAnimePageError`
        """
        # if MAL says the series doesn't exist, raise an InvalidAnimeError.
        error_tag = anime_page.find('div', {'class': 'badresult'})
        if error_tag:
            raise InvalidAnimeError(self.id)

        title_tag = anime_page.find('div', {'id': 'contentWrapper'}).find('h1')
        if not title_tag.find('div'):
            # fall back to the newer markup; otherwise, raise a MalformedAnimePageError.
            try:
                title_tag = anime_page.select('h1.h1 span')[0].text
            except IndexError:
                raise MalformedAnimePageError(self.id, None, message="Could not find title div")

        anime_info = super(Anime, self).parse_sidebar(anime_page, anime_page_original)

        try:
            anime_info['episodes'] = self._parse_episodes(anime_page)
        except Exception:
            # "except Exception" (not a bare "except:") so suppression can never
            # swallow KeyboardInterrupt/SystemExit.
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            # sidebar text is "Aired: <start> to <end>"; a single date means
            # the anime aired once.
            aired_tag = [x for x in anime_page_original.find_all('span')
                         if 'Aired:' in x.text][0].parent
            aired_tag_text = aired_tag.text.split(':')[1]
            aired_parts = aired_tag_text.strip().split(' to ')
            suppress_parse_exceptions = self.session.suppress_parse_exceptions
            if len(aired_parts) == 1:
                # this aired once.
                try:
                    aired_date = parse_profile_date(aired_parts[0],
                                                    suppress=suppress_parse_exceptions)
                except ValueError:
                    err_msg = "Could not parse single air date"
                    raise MalformedAnimePageError(self.id, aired_parts[0], message=err_msg)
                anime_info['aired'] = (aired_date,)
            else:
                # two airing dates.
                try:
                    air_start = parse_profile_date(aired_parts[0],
                                                   suppress=suppress_parse_exceptions)
                except ValueError:
                    raise MalformedAnimePageError(self.id, aired_parts[0],
                                                  message="Could not parse first of two air dates")
                try:
                    air_end = parse_profile_date(aired_parts[1],
                                                 suppress=suppress_parse_exceptions)
                except ValueError:
                    error_msg = "Could not parse second of two air dates"
                    raise MalformedAnimePageError(self.id, aired_parts[1],
                                                  message=error_msg)
                anime_info['aired'] = (air_start, air_end)
        except Exception:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            anime_info['producers'] = self.parse_producers(anime_page)
        except Exception:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            # sidebar text is e.g. "Duration: 2 hr. 3 min."; sum the dot-separated
            # parts into total minutes and store a timedelta.
            duration_tag = [x for x in anime_page_original.find_all('span')
                            if 'Duration:' in x.text][0].parent
            anime_info['duration'] = duration_tag.text.split(':')[1].strip()
            duration_parts = [part.strip() for part in anime_info['duration'].split('.')]
            duration_mins = 0
            for part in duration_parts:
                part_match = re.match('(?P<num>[0-9]+)', part)
                if not part_match:
                    # skip parts with no leading number (e.g. "per ep")
                    continue
                part_volume = int(part_match.group('num'))
                if part.endswith('hr'):
                    duration_mins += part_volume * 60
                elif part.endswith('min'):
                    duration_mins += part_volume
            anime_info['duration'] = datetime.timedelta(minutes=duration_mins)
        except Exception:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            # strip the "Rating:" label span, leaving only the rating text.
            rating_tag = [x for x in anime_page_original.find_all('span')
                          if 'Rating:' in x.text][0].parent
            utilities.extract_tags(rating_tag.find_all('span', {'class': 'dark_text'}))
            anime_info['rating'] = rating_tag.text.strip()
        except Exception:
            if not self.session.suppress_parse_exceptions:
                raise

        return anime_info
예제 #11
0
    def parse_sidebar(self, media_page):
        """Parse the DOM and return media attributes found in the sidebar.

        :type media_page: :class:`bs4.BeautifulSoup`
        :param media_page: MAL media page's DOM

        :rtype: dict
        :return: media attributes.

        :raises: InvalidMediaError, MalformedMediaPageError
        """
        media_info = {}

        # if MAL says the series doesn't exist, raise an InvalidMediaError.
        error_tag = media_page.find(u'div', {'class': 'error404'})
        if error_tag:
            raise InvalidMediaError(self.id)

        try:
            title_tag = media_page.find(u'span', {'itemprop': 'name'})
            if not title_tag:
                # otherwise, raise a MalformedMediaPageError.
                raise MalformedMediaPageError(
                    self.id, media_page, message="Could not find title span")
            utilities.extract_tags(title_tag.find_all())
            media_info[u'title'] = title_tag.text.strip()
        except Exception:
            # "except Exception" (not a bare "except:") so suppression can never
            # swallow KeyboardInterrupt/SystemExit.
            if not self.session.suppress_parse_exceptions:
                raise

        # first <td> of the content table holds the sidebar info panel.
        info_panel_first = media_page.find(u'div', {
            'id': 'content'
        }).find(u'table').find(u'td')

        # remove user-controls, if exists
        controls = info_panel_first.find(id='addtolist')
        if controls:
            controls.extract()

        try:
            picture_tag = info_panel_first.find(u'img')
            # NOTE(review): .decode() assumes a Python 2 byte-string attribute;
            # under Python 3 this raises and is handled by the except below.
            media_info[u'picture'] = picture_tag.get(u'src').decode('utf-8')
        except Exception:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            # assemble alternative titles for this series, keyed by language.
            media_info[u'alternative_titles'] = {}
            alt_titles_header = info_panel_first.find(
                u'h2', text=u'Alternative Titles')
            if alt_titles_header:
                next_tag = alt_titles_header.find_next_sibling(
                    u'div', {'class': 'spaceit_pad'})
                while True:
                    if next_tag is None or not next_tag.find(
                            u'span', {'class': 'dark_text'}):
                        # not a language node, break.
                        break
                    # get language (label minus trailing colon) and remove the node.
                    language = next_tag.find(u'span').text[:-1]
                    utilities.extract_tags(
                        next_tag.find_all(u'span', {'class': 'dark_text'}))
                    names = next_tag.text.strip().split(u', ')
                    media_info[u'alternative_titles'][language] = names
                    next_tag = next_tag.find_next_sibling(
                        u'div', {'class': 'spaceit_pad'})
        except Exception:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            type_tag = info_panel_first.find(text=u'Type:').parent.parent
            utilities.extract_tags(
                type_tag.find_all(u'span', {'class': 'dark_text'}))
            media_info[u'type'] = type_tag.text.strip()
        except Exception:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            status_tag = info_panel_first.find(text=u'Status:').parent.parent
            utilities.extract_tags(
                status_tag.find_all(u'span', {'class': 'dark_text'}))
            media_info[u'status'] = status_tag.text.strip()
        except Exception:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            genres_tag = info_panel_first.find(text=u'Genres:').parent.parent
            media_info[u'genres'] = []
            for genre_link in genres_tag.find_all('a'):
                link_parts = genre_link.get('href').split('/')
                # 2017-02-19: of the form /anime/genre/4/Comedy
                genre = self.session.genre(int(link_parts[3])).set(
                    {'name': genre_link.text})
                media_info[u'genres'].append(genre)
        except Exception:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            # grab statistics for this media.
            score_tag = info_panel_first.find(text=u'Score:').parent.parent
            # get score and number of users; "N/A" normalizes to 0.
            score = score_tag.find(attrs={'itemprop': 'ratingValue'}).text
            if score == u'N/A':
                score = u'0'
            num_users = int(
                score_tag.find(attrs={
                    'itemprop': 'ratingCount'
                }).text.replace(',', ''))
            media_info[u'score'] = (decimal.Decimal(score), num_users)
        except Exception:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            rank_tag = info_panel_first.find(text=u'Ranked:').parent.parent
            utilities.extract_tags(rank_tag.find_all())
            # strip thousands separators and the leading '#'.
            rank = rank_tag.text.strip().replace(u',', '').replace(u'#', '')
            if rank == u'N/A':
                rank = u'0'
            media_info[u'rank'] = int(rank)
        except Exception:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            popularity_tag = info_panel_first.find(
                text=u'Popularity:').parent.parent
            utilities.extract_tags(popularity_tag.find_all())
            # [1:] drops the leading '#'.
            media_info[u'popularity'] = int(
                popularity_tag.text.strip()[1:].replace(u',', ''))
        except Exception:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            members_tag = info_panel_first.find(text=u'Members:').parent.parent
            utilities.extract_tags(members_tag.find_all())
            media_info[u'members'] = int(members_tag.text.strip().replace(
                u',', ''))
        except Exception:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            favorites_tag = info_panel_first.find(
                text=u'Favorites:').parent.parent
            utilities.extract_tags(favorites_tag.find_all())
            media_info[u'favorites'] = int(favorites_tag.text.strip().replace(
                u',', ''))
        except Exception:
            if not self.session.suppress_parse_exceptions:
                raise

        # TODO: popular tags no longer exist in MAL, the API should be updated to reflect that
        media_info[u'popular_tags'] = {}

        return media_info
예제 #12
0
파일: anime.py 프로젝트: kratsg/python-mal
  def parse_sidebar(self, anime_page):
    """Parse the DOM and return anime attributes found in the sidebar.

    :type anime_page: :class:`bs4.BeautifulSoup`
    :param anime_page: MAL anime page's DOM

    :rtype: dict
    :return: anime attributes

    :raises: :class:`.InvalidAnimeError`, :class:`.MalformedAnimePageError`
    """
    # if MAL says the series doesn't exist, raise an InvalidAnimeError.
    error_tag = anime_page.find(u'div', {'class': 'badresult'})
    if error_tag:
        raise InvalidAnimeError(self.id)

    title_tag = anime_page.find(u'div', {'id': 'contentWrapper'}).find(u'h1')
    if not title_tag.find(u'span'):
      # otherwise, raise a MalformedAnimePageError.
      raise MalformedAnimePageError(self.id, anime_page, message="Could not find title div")

    anime_info = super(Anime, self).parse_sidebar(anime_page)
    # first <td> of the content table holds the sidebar info panel.
    info_panel_first = anime_page.find(u'div', {'id': 'content'}).find(u'table').find(u'td')

    try:
      episode_tag = info_panel_first.find(text=u'Episodes:').parent.parent
      utilities.extract_tags(episode_tag.find_all(u'span', {'class': 'dark_text'}))
      # "Unknown" episode counts normalize to 0.
      anime_info[u'episodes'] = int(episode_tag.text.strip()) if episode_tag.text.strip() != 'Unknown' else 0
    except Exception:
      # "except Exception" (not a bare "except:") so suppression can never
      # swallow KeyboardInterrupt/SystemExit.
      if not self.session.suppress_parse_exceptions:
        raise

    try:
      # sidebar text is "<start> to <end>"; a single date means it aired once.
      aired_tag = info_panel_first.find(text=u'Aired:').parent.parent
      utilities.extract_tags(aired_tag.find_all(u'span', {'class': 'dark_text'}))
      aired_parts = aired_tag.text.strip().split(u' to ')
      if len(aired_parts) == 1:
        # this aired once.
        try:
          aired_date = utilities.parse_profile_date(aired_parts[0], suppress=self.session.suppress_parse_exceptions)
        except ValueError:
          raise MalformedAnimePageError(self.id, aired_parts[0], message="Could not parse single air date")
        anime_info[u'aired'] = (aired_date,)
      else:
        # two airing dates.
        try:
          air_start = utilities.parse_profile_date(aired_parts[0], suppress=self.session.suppress_parse_exceptions)
        except ValueError:
          raise MalformedAnimePageError(self.id, aired_parts[0], message="Could not parse first of two air dates")
        try:
          air_end = utilities.parse_profile_date(aired_parts[1], suppress=self.session.suppress_parse_exceptions)
        except ValueError:
          raise MalformedAnimePageError(self.id, aired_parts[1], message="Could not parse second of two air dates")
        anime_info[u'aired'] = (air_start, air_end)
    except Exception:
      if not self.session.suppress_parse_exceptions:
        raise

    try:
      producers_tag = info_panel_first.find(text=u'Producers:').parent.parent
      utilities.extract_tags(producers_tag.find_all(u'span', {'class': 'dark_text'}))
      utilities.extract_tags(producers_tag.find_all(u'sup'))
      anime_info[u'producers'] = []
      for producer_link in producers_tag.find_all('a'):
        if producer_link.text == u'add some':
          # MAL is saying "None found, add some".
          break
        link_parts = producer_link.get('href').split('p=')
        # of the form: /anime.php?p=14
        if len(link_parts) > 1:
          anime_info[u'producers'].append(self.session.producer(int(link_parts[1])).set({'name': producer_link.text}))
    except Exception:
      if not self.session.suppress_parse_exceptions:
        raise

    try:
      # sidebar text is e.g. "2 hr. 3 min."; sum the dot-separated parts
      # into total minutes and store a timedelta.
      duration_tag = info_panel_first.find(text=u'Duration:').parent.parent
      utilities.extract_tags(duration_tag.find_all(u'span', {'class': 'dark_text'}))
      anime_info[u'duration'] = duration_tag.text.strip()
      duration_parts = [part.strip() for part in anime_info[u'duration'].split(u'.')]
      duration_mins = 0
      for part in duration_parts:
        part_match = re.match(u'(?P<num>[0-9]+)', part)
        if not part_match:
          # skip parts with no leading number (e.g. "per ep")
          continue
        part_volume = int(part_match.group(u'num'))
        if part.endswith(u'hr'):
          duration_mins += part_volume * 60
        elif part.endswith(u'min'):
          duration_mins += part_volume
      anime_info[u'duration'] = datetime.timedelta(minutes=duration_mins)
    except Exception:
      if not self.session.suppress_parse_exceptions:
        raise

    try:
      rating_tag = info_panel_first.find(text=u'Rating:').parent.parent
      utilities.extract_tags(rating_tag.find_all(u'span', {'class': 'dark_text'}))
      anime_info[u'rating'] = rating_tag.text.strip()
    except Exception:
      if not self.session.suppress_parse_exceptions:
        raise

    return anime_info
예제 #13
0
    def parse_sidebar(self, manga_page, manga_page_original=None):
        """Parse the DOM and return manga attributes found in the sidebar.

        :type manga_page: :class:`bs4.BeautifulSoup`
        :param manga_page: MAL manga page's DOM (pre-cleaned)

        :type manga_page_original: :class:`bs4.BeautifulSoup`
        :param manga_page_original: MAL manga page's DOM, uncleaned

        :rtype: dict
        :return: manga attributes

        :raises: :class:`.InvalidMangaError`, :class:`.MalformedMangaPageError`
        """
        # if MAL says the series doesn't exist, raise an InvalidMangaError.
        error_tag = manga_page.find('div', {'class': 'badresult'})
        if error_tag:
            raise InvalidMangaError(self.id)

        try:
            title_tag = manga_page.find('span', {'itemprop': 'name'})
            if not title_tag:
                # otherwise, raise a MalformedMangaPageError.
                raise MalformedMangaPageError(self.id, manga_page, message="Could not find title")
        except Exception:
            # "except Exception" (not a bare "except:") so suppression can never
            # swallow KeyboardInterrupt/SystemExit.
            if not self.session.suppress_parse_exceptions:
                raise

        # otherwise, begin parsing.
        manga_info = super(Manga, self).parse_sidebar(manga_page, manga_page_original)

        # first <td> of the content table holds the sidebar info panel.
        info_panel_first = manga_page.find('div', {'id': 'content'}).find('table').find('td')

        try:
            # "Unknown" volume counts normalize to None.
            volumes_tag = [
                x for x in manga_page.select('span.dark_text') if 'Volumes:' in x.text][0]
            manga_volume = volumes_tag.parent.text.split(':')[1].strip().replace(',', '')
            manga_info[u'volumes'] = (
                int(manga_volume)
                if manga_volume != 'Unknown'
                else None
            )
        except Exception:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            # "Unknown" chapter counts normalize to None.
            chapters_tag = info_panel_first.find(text=u'Chapters:').parent.parent
            utilities.extract_tags(chapters_tag.find_all(u'span', {'class': 'dark_text'}))
            chapters_tag_text = chapters_tag.text
            if ':' in chapters_tag_text:
                chapters_tag_text = chapters_tag_text.split(':')[1]
            manga_chapters = chapters_tag_text.strip()
            manga_info[u'chapters'] = (
                int(manga_chapters)
                if chapters_tag.text.strip() != 'Unknown'
                else None
            )
        except Exception:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            published_tag = info_panel_first.find(text=u'Published:').parent.parent
            utilities.extract_tags(published_tag.find_all(u'span', {'class': 'dark_text'}))
            published_parts = published_tag.text.strip().split(u' to ')
            # check if published part only contain start date or also end date.
            if len(published_parts) == 1:
                # this published once.
                try:
                    published_date = utilities.parse_profile_date(published_parts[0])
                except ValueError:
                    raise MalformedMangaPageError(self.id, published_parts[0],
                                                  message="Could not parse single publish date")
                publish_start = published_date
                publish_end = None
            else:
                # two publishing dates.
                try:
                    # publish_start may contain redundant word such as
                    # 'Published: Feb  24, 2003',
                    if 'Published:' in published_parts[0]:
                        published_parts[0] = published_parts[0].split('Published:')[1].strip()
                    publish_start = utilities.parse_profile_date(published_parts[0])
                except ValueError:
                    raise MalformedMangaPageError(
                        self.id, published_parts[0],
                        message="Could not parse first of two publish dates"
                    )
                # BUGFIX: compare the *end-date string*, not the whole list, to '?'
                # (a list never equals a string, so the branch was unreachable).
                if published_parts[1] == u'?':
                    # this is still publishing.
                    publish_end = None
                else:
                    try:
                        publish_end = utilities.parse_profile_date(published_parts[1])
                    except ValueError:
                        raise MalformedMangaPageError(
                            self.id,
                            published_parts[1],
                            message="Could not parse second of two publish dates"
                        )

            manga_info[u'published'] = (publish_start, publish_end)
        except Exception:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            authors_tag = info_panel_first.find(text='Authors:').parent.parent
            utilities.extract_tags(authors_tag.find_all('span', {'class': 'dark_text'}))
            manga_info['authors'] = {}
            for author_link in authors_tag.find_all('a'):
                link_parts = author_link.get('href').split('/')
                # of the form /people/1867/Naoki_Urasawa
                person = self.session.person(int(link_parts[2])).set({'name': author_link.text})
                # role follows the link as " (Story)" etc.; strip the parens.
                role = author_link.nextSibling.replace(' (', '').replace(')', '')
                manga_info['authors'][person] = role
        except Exception:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            serialization_tag = info_panel_first.find(text=u'Serialization:').parent.parent
            publication_link = serialization_tag.find('a')
            manga_info[u'serialization'] = None
            if publication_link:
                # of the form /manga.php?mid=1
                link_parts = publication_link.get('href').split('mid=')
                # example for link_parts
                #  ['/manga/magazine/450/Bessatsu_Shounen_Magazine']
                publication_id = link_parts[0].split('/manga/magazine/')[1].split('/')[0]
                manga_info[u'serialization'] = self.session.publication(int(publication_id)).set(
                    {'name': publication_link.text}
                )
        except Exception:
            if not self.session.suppress_parse_exceptions:
                raise

        return manga_info
예제 #14
0
파일: manga.py 프로젝트: kratsg/python-mal
    def parse_sidebar(self, manga_page):
        """Parse the DOM and return manga attributes found in the sidebar.

        :type manga_page: :class:`bs4.BeautifulSoup`
        :param manga_page: MAL manga page's DOM

        :rtype: dict
        :return: manga attributes

        :raises: :class:`.InvalidMangaError`, :class:`.MalformedMangaPageError`
        """
        # if MAL says the series doesn't exist, raise an InvalidMangaError.
        error_tag = manga_page.find(u"div", {"class": "badresult"})
        if error_tag:
            raise InvalidMangaError(self.id)

        try:
            title_tag = manga_page.find(u"div", {"id": "contentWrapper"}).find(u"h1")
            if not title_tag.find(u"div"):
                # otherwise, raise a MalformedMangaPageError.
                raise MalformedMangaPageError(self.id, manga_page, message="Could not find title div")
        except Exception:
            # "except Exception" (not a bare "except:") so suppression can never
            # swallow KeyboardInterrupt/SystemExit.
            if not self.session.suppress_parse_exceptions:
                raise

        # otherwise, begin parsing.
        manga_info = super(Manga, self).parse_sidebar(manga_page)

        # first <td> of the content table holds the sidebar info panel.
        info_panel_first = manga_page.find(u"div", {"id": "content"}).find(u"table").find(u"td")

        try:
            # "Unknown" volume counts normalize to None.
            volumes_tag = info_panel_first.find(text=u"Volumes:").parent.parent
            utilities.extract_tags(volumes_tag.find_all(u"span", {"class": "dark_text"}))
            manga_info[u"volumes"] = int(volumes_tag.text.strip()) if volumes_tag.text.strip() != "Unknown" else None
        except Exception:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            # "Unknown" chapter counts normalize to None.
            chapters_tag = info_panel_first.find(text=u"Chapters:").parent.parent
            utilities.extract_tags(chapters_tag.find_all(u"span", {"class": "dark_text"}))
            manga_info[u"chapters"] = int(chapters_tag.text.strip()) if chapters_tag.text.strip() != "Unknown" else None
        except Exception:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            published_tag = info_panel_first.find(text=u"Published:").parent.parent
            utilities.extract_tags(published_tag.find_all(u"span", {"class": "dark_text"}))
            published_parts = published_tag.text.strip().split(u" to ")
            if len(published_parts) == 1:
                # this published once.
                try:
                    published_date = utilities.parse_profile_date(published_parts[0])
                except ValueError:
                    raise MalformedMangaPageError(
                        self.id, published_parts[0], message="Could not parse single publish date"
                    )
                manga_info[u"published"] = (published_date,)
            else:
                # two publishing dates.
                try:
                    publish_start = utilities.parse_profile_date(published_parts[0])
                except ValueError:
                    raise MalformedMangaPageError(
                        self.id, published_parts[0], message="Could not parse first of two publish dates"
                    )
                # BUGFIX: compare the *end-date string*, not the whole list, to '?'
                # (a list never equals a string, so the branch was unreachable).
                if published_parts[1] == u"?":
                    # this is still publishing.
                    publish_end = None
                else:
                    try:
                        publish_end = utilities.parse_profile_date(published_parts[1])
                    except ValueError:
                        raise MalformedMangaPageError(
                            self.id, published_parts[1], message="Could not parse second of two publish dates"
                        )
                manga_info[u"published"] = (publish_start, publish_end)
        except Exception:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            authors_tag = info_panel_first.find(text=u"Authors:").parent.parent
            utilities.extract_tags(authors_tag.find_all(u"span", {"class": "dark_text"}))
            manga_info[u"authors"] = {}
            for author_link in authors_tag.find_all("a"):
                link_parts = author_link.get("href").split("/")
                # of the form /people/1867/Naoki_Urasawa
                person = self.session.person(int(link_parts[2])).set({"name": author_link.text})
                # role follows the link as " (Story)" etc.; strip the parens.
                role = author_link.nextSibling.replace(" (", "").replace(")", "")
                manga_info[u"authors"][person] = role
        except Exception:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            serialization_tag = info_panel_first.find(text=u"Serialization:").parent.parent
            publication_link = serialization_tag.find("a")
            manga_info[u"serialization"] = None
            if publication_link:
                link_parts = publication_link.get("href").split("mid=")
                # of the form /manga.php?mid=1
                manga_info[u"serialization"] = self.session.publication(int(link_parts[1])).set(
                    {"name": publication_link.text}
                )
        except Exception:
            if not self.session.suppress_parse_exceptions:
                raise

        return manga_info
예제 #15
0
    def parse_sidebar(self, media_page, media_page_original=None):
        """Parses the DOM and returns media attributes in the sidebar.

        :type media_page: :class:`bs4.BeautifulSoup`
        :param media_page: MAL media page's DOM

        :rtype: dict
        :return: media attributes.

        :raises: InvalidMediaError, MalformedMediaPageError

        """
        media_info = {}

        # if MAL says the series doesn't exist, raise an InvalidMediaError.
        error_tag = media_page.find(u'div', {'class': 'badresult'})
        if error_tag:
            raise InvalidMediaError(self.id)

        try:
            title_tag = media_page.find(u'div', {
                'id': 'contentWrapper'
            }).find(u'h1')
            if not title_tag.find(u'div'):
                try:
                    title_tag = media_page_original.select(
                        'div#contentWrapper h1.h1 span')[0]
                except IndexError:
                    # otherwise, raise a MalformedMediaPageError.
                    raise MalformedMediaPageError(
                        self.id, None, message="Could not find title div")
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            utilities.extract_tags(title_tag.find_all())
            media_info[u'title'] = title_tag.text.strip()
            if media_info[u'title'] == '':
                media_info[u'title'] = media_page_original.find(
                    'span', {
                        'itemprop': 'name'
                    }).text
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        info_panel_first = media_page_original.select(
            'div#content table td')[0]
        try:
            picture_tag = info_panel_first.find(u'img')
            media_info[u'picture'] = picture_tag.get(u'src').decode('utf-8')
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            # assemble alternative titles for this series.
            media_info[u'alternative_titles'] = {}
            alt_titles_header = info_panel_first.find(
                u'h2', text=u'Alternative Titles')
            if alt_titles_header:
                next_tag = alt_titles_header.find_next_sibling(
                    u'div', {'class': 'spaceit_pad'})
                while True:
                    if next_tag is None or not next_tag.find(
                            u'span', {'class': 'dark_text'}):
                        # not a language node, break.
                        break
                    # get language and remove the node.
                    language = next_tag.find(u'span').text[:-1]
                    utilities.extract_tags(
                        next_tag.find_all(u'span', {'class': 'dark_text'}))
                    names = next_tag.text.strip().split(u', ')
                    media_info[u'alternative_titles'][language] = names
                    next_tag = next_tag.find_next_sibling(
                        u'div', {'class': 'spaceit_pad'})
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            try:
                type_tag = info_panel_first.find(text=u'Type:').parent.parent
                utilities.extract_tags(
                    type_tag.find_all(u'span', {'class': 'dark_text'}))
                media_info[u'type'] = type_tag.text.strip()
            except AttributeError:
                type_tag = [
                    x for x in info_panel_first.find_all('div')
                    if 'Type:' in x.text
                ][0]
                media_info[u'type'] = type_tag.text.split(':')[-1].strip()
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            status_tag = [
                x for x in media_page.find_all('span') if 'Status:' in x.text
            ][0].parent
            media_info[u'status'] = status_tag.text.split(':')[1].strip()
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            media_info[u'genres'] = self.parse_genres(media_page_original)
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            # grab statistics for this media.
            score_tag = media_page.find('span',
                                        {'itemprop': 'aggregateRating'})
            # there is difference between anime and manga page
            # in manga page score_tag is in span-tag and anime in div-page
            # test score tag by try to find span-tag
            try:
                score_tag.find('span')
            except AttributeError:
                score_tag = score_tag = media_page.find(
                    'div', {'itemprop': 'aggregateRating'})

            # get score and number of users.
            num_users = int(
                score_tag.find('span', {
                    'itemprop': 'ratingCount'
                }).text.replace(',', ''))
            # utilities.extract_tags(score_tag.find_all())
            score_point = score_tag.find('span', {
                'itemprop': 'ratingValue'
            }).text
            try:
                media_info[u'score'] = (decimal.Decimal(score_point),
                                        num_users)
            except (InvalidOperation, AttributeError):
                score_tag = media_page_original.find(
                    'span', {'itemprop': 'ratingValue'})
                media_info[u'score'] = (decimal.Decimal(score_tag.text),
                                        num_users)
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            try:
                rank_tag = info_panel_first.find(text=u'Ranked:').parent.parent
                utilities.extract_tags(rank_tag.find_all())
                media_info[u'rank'] = int(rank_tag.text.strip()[1:].replace(
                    u',', ''))
            except AttributeError:
                rank_tag = filter(
                    lambda x: 'Ranked:' in x.text,
                    media_page_original.find_all('div', {'class': 'spaceit'}))
                media_info[u'rank'] = int(
                    rank_tag[0].text.split('#')[-1].strip())

        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            try:
                popularity_tag = info_panel_first.find(
                    text=u'Popularity:').parent.parent
                utilities.extract_tags(popularity_tag.find_all())
                media_info[u'popularity'] = int(
                    popularity_tag.text.strip()[1:].replace(u',', ''))
            except AttributeError:
                rank_tag = filter(
                    lambda x: 'Popularity' in x.text,
                    media_page_original.find_all(
                        'span', {'class': 'dark_text'}))[0].parent
                media_info[u'popularity'] = int(
                    rank_tag.text.split('#')[-1].strip())
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            try:
                members_tag = info_panel_first.find(
                    text=u'Members:').parent.parent
                utilities.extract_tags(members_tag.find_all())
                media_info[u'members'] = int(members_tag.text.strip().replace(
                    u',', ''))
            except AttributeError:
                members_tag = filter(
                    lambda x: 'Members' in x.text,
                    media_page_original.find_all(
                        'span', {'class': 'dark_text'}))[0].parent
                media_info[u'members'] = int(
                    members_tag.text.split(':')[-1].strip().replace(u',', ''))

        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            try:
                favorites_tag = info_panel_first.find(
                    text=u'Favorites:').parent.parent
                utilities.extract_tags(favorites_tag.find_all())
                media_info[u'favorites'] = int(
                    favorites_tag.text.strip().replace(u',', ''))
            except AttributeError:
                favorites_tag = filter(
                    lambda x: 'Favorites' in x.text,
                    media_page_original.find_all(
                        'span', {'class': 'dark_text'}))[0].parent
                media_info[u'favorites'] = int(
                    favorites_tag.text.split(':')[-1].strip().replace(
                        u',', ''))

        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            # get popular tags.
            tags_header = media_page.find(u'h2', text=u'Popular Tags')
            try:
                tags_tag = tags_header.find_next_sibling(u'span')
                media_info[u'popular_tags'] = {}
                for tag_link in tags_tag.find_all('a'):
                    tag = self.session.tag(tag_link.text)
                    num_people = int(
                        re.match(r'(?P<people>[0-9]+) people',
                                 tag_link.get('title')).group('people'))
                    media_info[u'popular_tags'][tag] = num_people
            except AttributeError:
                tags_tag = media_page_original.find('span',
                                                    text='Genres:').parent
                media_info[u'popular_tags'] = {}
                for tag_link in tags_tag.find_all('a'):
                    tag = self.session.tag(tag_link.text.lower())
                    try:
                        num_people = int(
                            re.match(r'(?P<people>[0-9]+) people',
                                     tag_link.get('title')).group('people'))
                        media_info[u'popular_tags'][tag] = num_people
                    except (TypeError, AttributeError):
                        tag_num = tag_link.get('href').split('=')[-1]
                        media_info[u'popular_tags'][tag] = tag_num

        except:
            if not self.session.suppress_parse_exceptions:
                raise

        return media_info
예제 #16
0
    def parse_sidebar(self, media_page):
        """Parse the DOM and return media attributes found in the sidebar.

        Each attribute is parsed inside its own try/except so that one
        malformed section does not abort the whole parse when the session
        has ``suppress_parse_exceptions`` set.

        :type media_page: :class:`bs4.BeautifulSoup`
        :param media_page: MAL media page's DOM

        :rtype: dict
        :return: media attributes.

        :raises: InvalidMediaError, MalformedMediaPageError

        """
        media_info = {}

        # if MAL says the series doesn't exist, raise an InvalidMediaError.
        error_tag = media_page.find(u'div', {'class': 'badresult'})
        if error_tag:
            raise InvalidMediaError(self.id)

        try:
            # the page title lives in an h1 under the content wrapper.
            title_tag = media_page.find(u'div', {
                'id': 'contentWrapper'
            }).find(u'h1')
            if not title_tag.find(u'div'):
                # otherwise, raise a MalformedMediaPageError.
                raise MalformedMediaPageError(
                    self.id, media_page, message="Could not find title div")
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            # remove child tags so only the bare title text remains.
            utilities.extract_tags(title_tag.find_all())
            media_info[u'title'] = title_tag.text.strip()
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        # left-hand info panel: first cell of the main content table.
        info_panel_first = media_page.find(u'div', {
            'id': 'content'
        }).find(u'table').find(u'td')

        try:
            picture_tag = info_panel_first.find(u'img')
            # NOTE(review): .decode('utf-8') implies .get() returns bytes
            # (Python 2); on Python 3 it returns str and this would raise
            # AttributeError — confirm intended runtime.
            media_info[u'picture'] = picture_tag.get(u'src').decode('utf-8')
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            # assemble alternative titles for this series.
            media_info[u'alternative_titles'] = {}
            alt_titles_header = info_panel_first.find(
                u'h2', text=u'Alternative Titles')
            if alt_titles_header:
                # walk the sibling language nodes until a non-language
                # div (or the end of the siblings) is hit.
                next_tag = alt_titles_header.find_next_sibling(
                    u'div', {'class': 'spaceit_pad'})
                while True:
                    if next_tag is None or not next_tag.find(
                            u'span', {'class': 'dark_text'}):
                        # not a language node, break.
                        break
                    # get language and remove the node.
                    language = next_tag.find(u'span').text[:-1]
                    utilities.extract_tags(
                        next_tag.find_all(u'span', {'class': 'dark_text'}))
                    names = next_tag.text.strip().split(u', ')
                    media_info[u'alternative_titles'][language] = names
                    next_tag = next_tag.find_next_sibling(
                        u'div', {'class': 'spaceit_pad'})
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            # strip the "Type:" label span, leaving only the value.
            type_tag = info_panel_first.find(text=u'Type:').parent.parent
            utilities.extract_tags(
                type_tag.find_all(u'span', {'class': 'dark_text'}))
            media_info[u'type'] = type_tag.text.strip()
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            # same label-stripping pattern for "Status:".
            status_tag = info_panel_first.find(text=u'Status:').parent.parent
            utilities.extract_tags(
                status_tag.find_all(u'span', {'class': 'dark_text'}))
            media_info[u'status'] = status_tag.text.strip()
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            genres_tag = info_panel_first.find(text=u'Genres:').parent.parent
            utilities.extract_tags(
                genres_tag.find_all(u'span', {'class': 'dark_text'}))
            media_info[u'genres'] = []
            for genre_link in genres_tag.find_all('a'):
                link_parts = genre_link.get('href').split('[]=')
                # of the form /anime|manga.php?genre[]=1
                genre = self.session.genre(int(link_parts[1])).set(
                    {'name': genre_link.text})
                media_info[u'genres'].append(genre)
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            # grab statistics for this media.
            score_tag = info_panel_first.find(text=u'Score:').parent.parent
            # get score and number of users.
            users_node = [
                x for x in score_tag.find_all(u'small')
                if u'scored by' in x.text
            ][0]
            # the small tag reads "scored by N users".
            num_users = int(
                users_node.text.split(u'scored by ')[-1].split(u' users')[0])
            utilities.extract_tags(score_tag.find_all())
            media_info[u'score'] = (decimal.Decimal(score_tag.text.strip()),
                                    num_users)
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            # rank text is of the form "#123"; drop the '#' and commas.
            rank_tag = info_panel_first.find(text=u'Ranked:').parent.parent
            utilities.extract_tags(rank_tag.find_all())
            media_info[u'rank'] = int(rank_tag.text.strip()[1:].replace(
                u',', ''))
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            # popularity follows the same "#N" format as rank.
            popularity_tag = info_panel_first.find(
                text=u'Popularity:').parent.parent
            utilities.extract_tags(popularity_tag.find_all())
            media_info[u'popularity'] = int(
                popularity_tag.text.strip()[1:].replace(u',', ''))
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            # member count is a plain comma-separated integer.
            members_tag = info_panel_first.find(text=u'Members:').parent.parent
            utilities.extract_tags(members_tag.find_all())
            media_info[u'members'] = int(members_tag.text.strip().replace(
                u',', ''))
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            favorites_tag = info_panel_first.find(
                text=u'Favorites:').parent.parent
            utilities.extract_tags(favorites_tag.find_all())
            media_info[u'favorites'] = int(favorites_tag.text.strip().replace(
                u',', ''))
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            # get popular tags.
            tags_header = media_page.find(u'h2', text=u'Popular Tags')
            tags_tag = tags_header.find_next_sibling(u'span')
            media_info[u'popular_tags'] = {}
            for tag_link in tags_tag.find_all('a'):
                tag = self.session.tag(tag_link.text)
                # tag link title reads "N people"; extract the count.
                num_people = int(
                    re.match(r'(?P<people>[0-9]+) people',
                             tag_link.get('title')).group('people'))
                media_info[u'popular_tags'][tag] = num_people
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        return media_info
예제 #17
0
    def parse_sidebar(self, media_page, media_page_original=None):
        """Parse the DOM and return media attributes found in the sidebar.

        Each attribute is parsed inside its own try/except so a single
        malformed section does not abort the whole parse when the session
        has ``suppress_parse_exceptions`` set.

        :type media_page: :class:`bs4.BeautifulSoup`
        :param media_page: MAL media page's DOM

        :type media_page_original: :class:`bs4.BeautifulSoup`
        :param media_page_original: the unmodified page DOM, used as a
            fallback for elements missing from `media_page`.

        :rtype: dict
        :return: media attributes.

        :raises: InvalidMediaError, MalformedMediaPageError

        """
        media_info = {}

        # if MAL says the series doesn't exist, raise an InvalidMediaError.
        error_tag = media_page.find('div', {'class': 'badresult'})
        if error_tag:
            raise InvalidMediaError(self.id)

        try:
            title_tag = media_page.find('div', {'id': 'contentWrapper'}).find('h1')
            if not title_tag.find('div'):
                try:
                    # newer layout: title sits in a span under h1.h1.
                    title_tag = media_page_original.select('div#contentWrapper h1.h1 span')[0]
                except IndexError:
                    # otherwise, raise a MalformedMediaPageError.
                    raise MalformedMediaPageError(self.id, None, message="Could not find title div")
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            utilities.extract_tags(title_tag.find_all())
            media_info['title'] = title_tag.text.strip()
            if media_info['title'] == '':
                # fall back to the itemprop="name" span on the original page.
                media_info['title'] = media_page_original.find('span', {'itemprop': 'name'}).text
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        # left-hand info panel: first cell of the main content table.
        info_panel_first = media_page_original.select('div#content table td')[0]
        try:
            picture_tag = info_panel_first.find('img')
            media_info['picture'] = picture_tag.get('src')
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            # assemble alternative titles for this series.
            media_info['alternative_titles'] = self._get_alternative_titles(info_panel_first)
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            # FIX: body was over-indented one extra level inside the try.
            type_tag = info_panel_first.find(text='Type:').parent.parent
            media_info['type'] = type_tag.text.split(':')[1].strip()
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            status_tag = [x for x in media_page.find_all('span')
                          if 'Status:' in x.text][0].parent
            media_info['status'] = status_tag.text.split(':')[1].strip()
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            genres_tag = info_panel_first.find(text='Genres:').parent.parent
            media_info['genres'] = []
            for genre_link in genres_tag.find_all('a'):
                link_parts = genre_link.get('href').split('=')
                try:
                    # old form: /anime|manga.php?genre[]=1
                    genre_id = int(link_parts[-1])
                except ValueError:
                    # current form: /anime/genre/2/Adventure
                    genre_id = int(link_parts[-1].split('/genre/')[1].split('/')[0])

                genre = self.session.genre(genre_id).set({'name': genre_link.text})
                media_info['genres'].append(genre)
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            media_info['score'] = self._parse_score(media_page)
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            media_info['rank'] = self._parse_rank(media_page)
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            try:
                # find popularity html-tag
                popularity_tag = info_panel_first.find(text='Popularity:').parent.parent
                utilities.extract_tags(popularity_tag.find_all())
                # strip the leading '#' and thousands separators.
                popularity = popularity_tag.text.strip()[1:].replace(',', '')
                if '#' in popularity:
                    popularity = popularity.split('#')[1].split()[0]
                media_info['popularity'] = int(popularity)
            except AttributeError:
                # FIX: filter() is not subscriptable on Python 3 — use a
                # list comprehension like the rest of this method.
                rank_tag = [x for x in media_page_original.find_all('span', {'class': 'dark_text'})
                            if 'Popularity' in x.text][0].parent
                media_info['popularity'] = int(rank_tag.text.split('#')[-1].strip())
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            members_tag = [x for x in media_page.select('span.dark_text')
                           if 'Members:' in x.text][0]
            members = members_tag.parent.text.split(':')[1].strip().replace(',', '')
            media_info['members'] = int(members)
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            fav_tag = [x for x in media_page.select('span.dark_text')
                       if 'Favorites:' in x.text][0]
            favorites = fav_tag.parent.text.split(':')[1].strip().replace(',', '')
            media_info['favorites'] = int(favorites)
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        return media_info
예제 #18
0
  def parse_sidebar(self, media_page):
    """Parse the DOM and return media attributes found in the sidebar.

    Each attribute is parsed inside its own try/except so that one
    malformed section does not abort the whole parse when the session
    has ``suppress_parse_exceptions`` set.

    :type media_page: :class:`bs4.BeautifulSoup`
    :param media_page: MAL media page's DOM

    :rtype: dict
    :return: media attributes.

    :raises: InvalidMediaError, MalformedMediaPageError

    """
    media_info = {}

    # if MAL says the series doesn't exist, raise an InvalidMediaError.
    error_tag = media_page.find(u'div', {'class': 'badresult'})
    if error_tag:
        raise InvalidMediaError(self.id)

    try:
      # the page title lives in an h1 under the content wrapper.
      title_tag = media_page.find(u'div', {'id': 'contentWrapper'}).find(u'h1')
      if not title_tag.find(u'div'):
        # otherwise, raise a MalformedMediaPageError.
        raise MalformedMediaPageError(self.id, media_page, message="Could not find title div")
    except:
      if not self.session.suppress_parse_exceptions:
        raise

    try:
      # remove child tags so only the bare title text remains.
      utilities.extract_tags(title_tag.find_all())
      media_info[u'title'] = title_tag.text.strip()
    except:
      if not self.session.suppress_parse_exceptions:
        raise

    # left-hand info panel: first cell of the main content table.
    info_panel_first = media_page.find(u'div', {'id': 'content'}).find(u'table').find(u'td')

    try:
      picture_tag = info_panel_first.find(u'img')
      # NOTE(review): .decode('utf-8') implies .get() returns bytes
      # (Python 2); on Python 3 this raises AttributeError — confirm.
      media_info[u'picture'] = picture_tag.get(u'src').decode('utf-8')
    except:
      if not self.session.suppress_parse_exceptions:
        raise

    try:
      # assemble alternative titles for this series.
      media_info[u'alternative_titles'] = {}
      alt_titles_header = info_panel_first.find(u'h2', text=u'Alternative Titles')
      if alt_titles_header:
        # walk sibling language nodes until a non-language div is hit.
        next_tag = alt_titles_header.find_next_sibling(u'div', {'class': 'spaceit_pad'})
        while True:
          if next_tag is None or not next_tag.find(u'span', {'class': 'dark_text'}):
            # not a language node, break.
            break
          # get language and remove the node.
          language = next_tag.find(u'span').text[:-1]
          utilities.extract_tags(next_tag.find_all(u'span', {'class': 'dark_text'}))
          names = next_tag.text.strip().split(u', ')
          media_info[u'alternative_titles'][language] = names
          next_tag = next_tag.find_next_sibling(u'div', {'class': 'spaceit_pad'})
    except:
      if not self.session.suppress_parse_exceptions:
        raise

    try:
      # strip the "Type:" label span, leaving only the value.
      type_tag = info_panel_first.find(text=u'Type:').parent.parent
      utilities.extract_tags(type_tag.find_all(u'span', {'class': 'dark_text'}))
      media_info[u'type'] = type_tag.text.strip()
    except:
      if not self.session.suppress_parse_exceptions:
        raise

    try:
      # same label-stripping pattern for "Status:".
      status_tag = info_panel_first.find(text=u'Status:').parent.parent
      utilities.extract_tags(status_tag.find_all(u'span', {'class': 'dark_text'}))
      media_info[u'status'] = status_tag.text.strip()
    except:
      if not self.session.suppress_parse_exceptions:
        raise

    try:
      genres_tag = info_panel_first.find(text=u'Genres:').parent.parent
      utilities.extract_tags(genres_tag.find_all(u'span', {'class': 'dark_text'}))
      media_info[u'genres'] = []
      for genre_link in genres_tag.find_all('a'):
        link_parts = genre_link.get('href').split('[]=')
        # of the form /anime|manga.php?genre[]=1
        genre = self.session.genre(int(link_parts[1])).set({'name': genre_link.text})
        media_info[u'genres'].append(genre)
    except:
      if not self.session.suppress_parse_exceptions:
        raise

    try:
      # grab statistics for this media.
      score_tag = info_panel_first.find(text=u'Score:').parent.parent
      # get score and number of users.
      users_node = [x for x in score_tag.find_all(u'small') if u'scored by' in x.text][0]
      # the small tag reads "scored by N users".
      num_users = int(users_node.text.split(u'scored by ')[-1].split(u' users')[0])
      utilities.extract_tags(score_tag.find_all())
      media_info[u'score'] = (decimal.Decimal(score_tag.text.strip()), num_users)
    except:
      if not self.session.suppress_parse_exceptions:
        raise

    try:
      # rank text is of the form "#123"; drop the '#' and commas.
      rank_tag = info_panel_first.find(text=u'Ranked:').parent.parent
      utilities.extract_tags(rank_tag.find_all())
      media_info[u'rank'] = int(rank_tag.text.strip()[1:].replace(u',', ''))
    except:
      if not self.session.suppress_parse_exceptions:
        raise

    try:
      # popularity follows the same "#N" format as rank.
      popularity_tag = info_panel_first.find(text=u'Popularity:').parent.parent
      utilities.extract_tags(popularity_tag.find_all())
      media_info[u'popularity'] = int(popularity_tag.text.strip()[1:].replace(u',', ''))
    except:
      if not self.session.suppress_parse_exceptions:
        raise

    try:
      # member count is a plain comma-separated integer.
      members_tag = info_panel_first.find(text=u'Members:').parent.parent
      utilities.extract_tags(members_tag.find_all())
      media_info[u'members'] = int(members_tag.text.strip().replace(u',', ''))
    except:
      if not self.session.suppress_parse_exceptions:
        raise

    try:
      favorites_tag = info_panel_first.find(text=u'Favorites:').parent.parent
      utilities.extract_tags(favorites_tag.find_all())
      media_info[u'favorites'] = int(favorites_tag.text.strip().replace(u',', ''))
    except:
      if not self.session.suppress_parse_exceptions:
        raise

    try:
      # get popular tags.
      tags_header = media_page.find(u'h2', text=u'Popular Tags')
      tags_tag = tags_header.find_next_sibling(u'span')
      media_info[u'popular_tags'] = {}
      for tag_link in tags_tag.find_all('a'):
        tag = self.session.tag(tag_link.text)
        # tag link title reads "N people"; extract the count.
        num_people = int(re.match(r'(?P<people>[0-9]+) people', tag_link.get('title')).group('people'))
        media_info[u'popular_tags'][tag] = num_people
    except:
      if not self.session.suppress_parse_exceptions:
        raise

    return media_info
예제 #19
0
  def parse(self, media_page):
    """Parse the DOM and return media attributes in the main-content area.

    Builds on :meth:`parse_sidebar` and adds the synopsis plus the
    "Related <Anime|Manga>" listing.

    :type media_page: :class:`bs4.BeautifulSoup`
    :param media_page: MAL media page's DOM

    :rtype: dict
    :return: media attributes.

    """
    media_info = self.parse_sidebar(media_page)

    try:
      # synopsis is the text of the h2 header's parent, minus any h2 tags.
      synopsis_elt = media_page.find(u'h2', text=u'Synopsis').parent
      utilities.extract_tags(synopsis_elt.find_all(u'h2'))
      media_info[u'synopsis'] = synopsis_elt.text.strip()
    except:
      if not self.session.suppress_parse_exceptions:
        raise

    try:
      related_title = media_page.find(u'h2', text=u'Related ' + self.__class__.__name__)
      if related_title:
        related_elt = related_title.parent
        utilities.extract_tags(related_elt.find_all(u'h2'))
        related = {}
        for link in related_elt.find_all(u'a'):
          href = link.get(u'href').replace(u'http://myanimelist.net', '')
          if not re.match(r'/(anime|manga)', href):
            break
          curr_elt = link.previous_sibling
          if curr_elt is None:
            # we've reached the end of the list.
            break
          related_type = None
          # scan backwards for the "<Type>:" label preceding this link.
          while True:
            if not curr_elt:
              raise MalformedAnimePageError(self.id, related_elt, message="Prematurely reached end of related anime listing")
            if isinstance(curr_elt, bs4.NavigableString):
              # FIX: raw string — '\ ' and '\-' are invalid escape
              # sequences in a non-raw literal on modern Python.
              type_match = re.match(r'(?P<type>[a-zA-Z \-]+):', curr_elt)
              if type_match:
                related_type = type_match.group(u'type')
                break
            curr_elt = curr_elt.previous_sibling
          title = link.text
          # parse link: may be manga or anime.
          href_parts = href.split(u'/')
          # sometimes links on MAL are broken, of the form /anime//
          if href_parts[2] == '':
            continue
          # of the form: /(anime|manga)/1/Cowboy_Bebop
          obj_id = int(href_parts[2])
          new_obj = getattr(self.session, href_parts[1])(obj_id).set({'title': title})
          if related_type not in related:
            related[related_type] = [new_obj]
          else:
            related[related_type].append(new_obj)
        media_info[u'related'] = related
      else:
        media_info[u'related'] = None
    except:
      if not self.session.suppress_parse_exceptions:
        raise

    return media_info
예제 #20
0
파일: anime.py 프로젝트: EaterOA/python-mal
    def parse_sidebar(self, anime_page):
        """Parse the DOM and return anime attributes in the sidebar.

        Extends the generic media sidebar parse with anime-specific
        fields: episodes, aired dates, producers, duration and rating.

        :type anime_page: :class:`bs4.BeautifulSoup`
        :param anime_page: MAL anime page's DOM

        :rtype: dict
        :return: anime attributes

        :raises: :class:`.InvalidAnimeError`, :class:`.MalformedAnimePageError`
        """

        try:
            anime_info = super(Anime, self).parse_sidebar(anime_page)
        except media.InvalidMediaError as e:
            # re-raise as the anime-specific error type.
            raise InvalidAnimeError(e.id)
        # left-hand info panel: first cell of the main content table.
        info_panel_first = anime_page.find(u'div', {
            'id': 'content'
        }).find(u'table').find(u'td')

        try:
            episode_tag = info_panel_first.find(
                text=u'Episodes:').parent.parent
            utilities.extract_tags(
                episode_tag.find_all(u'span', {'class': 'dark_text'}))
            # MAL shows 'Unknown' for unaired series; normalize to 0.
            anime_info[u'episodes'] = int(episode_tag.text.strip(
            )) if episode_tag.text.strip() != 'Unknown' else 0
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            aired_tag = info_panel_first.find(text=u'Aired:').parent.parent
            utilities.extract_tags(
                aired_tag.find_all(u'span', {'class': 'dark_text'}))
            # "X to Y" for a range; a single date otherwise.
            aired_parts = aired_tag.text.strip().split(u' to ')
            if len(aired_parts) == 1:
                # this aired once.
                try:
                    aired_date = utilities.parse_profile_date(
                        aired_parts[0],
                        suppress=self.session.suppress_parse_exceptions)
                except ValueError:
                    raise MalformedAnimePageError(
                        self.id,
                        aired_parts[0],
                        message="Could not parse single air date")
                # single-date airings are stored as a 1-tuple.
                anime_info[u'aired'] = (aired_date, )
            else:
                # two airing dates.
                try:
                    air_start = utilities.parse_profile_date(
                        aired_parts[0],
                        suppress=self.session.suppress_parse_exceptions)
                except ValueError:
                    raise MalformedAnimePageError(
                        self.id,
                        aired_parts[0],
                        message="Could not parse first of two air dates")
                try:
                    air_end = utilities.parse_profile_date(
                        aired_parts[1],
                        suppress=self.session.suppress_parse_exceptions)
                except ValueError:
                    raise MalformedAnimePageError(
                        self.id,
                        aired_parts[1],
                        message="Could not parse second of two air dates")
                anime_info[u'aired'] = (air_start, air_end)
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            producers_tag = info_panel_first.find(
                text=u'Producers:').parent.parent
            utilities.extract_tags(
                producers_tag.find_all(u'span', {'class': 'dark_text'}))
            anime_info[u'producers'] = []
            for producer_link in producers_tag.find_all('a'):
                if producer_link.text == u'add some':
                    # MAL is saying "None found, add some".
                    break
                link_parts = producer_link.get('href').split('/')
                # of the form: /anime/producer/23/Bandai_Visual
                anime_info[u'producers'].append(
                    self.session.producer(int(link_parts[3])).set(
                        {'name': producer_link.text}))
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            duration_tag = info_panel_first.find(
                text=u'Duration:').parent.parent
            utilities.extract_tags(
                duration_tag.find_all(u'span', {'class': 'dark_text'}))
            anime_info[u'duration'] = duration_tag.text.strip()
            # duration text is dot-separated, e.g. "1 hr. 30 min.";
            # sum the parts into minutes and store a timedelta.
            duration_parts = [
                part.strip() for part in anime_info[u'duration'].split(u'.')
            ]
            duration_mins = 0
            for part in duration_parts:
                part_match = re.match(u'(?P<num>[0-9]+)', part)
                if not part_match:
                    continue
                part_volume = int(part_match.group(u'num'))
                if part.endswith(u'hr'):
                    duration_mins += part_volume * 60
                elif part.endswith(u'min'):
                    duration_mins += part_volume
            anime_info[u'duration'] = datetime.timedelta(minutes=duration_mins)
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            # content rating, e.g. "PG-13 - Teens 13 or older".
            rating_tag = info_panel_first.find(text=u'Rating:').parent.parent
            utilities.extract_tags(
                rating_tag.find_all(u'span', {'class': 'dark_text'}))
            anime_info[u'rating'] = rating_tag.text.strip()
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        return anime_info
예제 #21
0
    def parse_sidebar(self, media_page, media_page_original=None):
        """Parses the DOM and returns media attributes in the sidebar.

        :type media_page: :class:`bs4.BeautifulSoup`
        :param media_page: MAL media page's DOM

        :type media_page_original: :class:`bs4.BeautifulSoup`
        :param media_page_original: an unmodified copy of the media page's
            DOM, used as a fallback source when parsing ``media_page`` comes
            up empty (``media_page`` may have been mutated by earlier
            parsing via ``utilities.extract_tags``).

        :rtype: dict
        :return: media attributes (title, picture, alternative_titles, type,
            status, genres, score, rank, popularity, members, favorites,
            popular_tags — whichever sections parsed successfully).

        :raises: InvalidMediaError, MalformedMediaPageError

        """
        media_info = {}

        # if MAL says the series doesn't exist, raise an InvalidMediaError.
        error_tag = media_page.find(u'div', {'class': 'badresult'})
        if error_tag:
            raise InvalidMediaError(self.id)

        # NOTE: every section below is wrapped in a bare ``except:`` so that
        # one malformed section does not abort the whole parse when
        # ``session.suppress_parse_exceptions`` is enabled; with suppression
        # off, the original exception is re-raised unchanged.
        try:
            # Title lives in the page-header h1.  If the h1 carries no inner
            # div (newer page layout), fall back to the span in the untouched
            # DOM; when that is absent too, the page is malformed.
            title_tag = media_page.find(u'div', {'id': 'contentWrapper'}).find(u'h1')
            if not title_tag.find(u'div'):
                try:
                    title_tag = media_page_original.select('div#contentWrapper h1.h1 span')[0]
                except IndexError:
                    # otherwise, raise a MalformedMediaPageError.
                    raise MalformedMediaPageError(self.id, None, message="Could not find title div")
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            # Strip all child tags so only the bare title text remains
            # (this mutates the DOM in place).
            utilities.extract_tags(title_tag.find_all())
            media_info[u'title'] = title_tag.text.strip()
            if media_info[u'title'] == '':
                # Stripping left nothing — read the itemprop="name" span from
                # the pristine copy instead.
                media_info[u'title'] = media_page_original.find('span',{'itemprop':'name'}).text 
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        # Left-hand sidebar: first table cell of the content area in the
        # pristine DOM.  NOTE(review): not guarded — an empty select() here
        # raises IndexError regardless of suppress_parse_exceptions.
        info_panel_first =  media_page_original.select('div#content table td')[0]
        try:
            picture_tag = info_panel_first.find(u'img')
            # NOTE(review): ``.decode('utf-8')`` only works on Python 2 byte
            # strings; on Python 3 ``get()`` returns ``str`` and this raises
            # AttributeError (swallowed above unless suppression is off).
            media_info[u'picture'] = picture_tag.get(u'src').decode('utf-8')
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            # assemble alternative titles for this series.
            media_info[u'alternative_titles'] = {}
            alt_titles_header = info_panel_first.find(u'h2', text=u'Alternative Titles')
            if alt_titles_header:
                next_tag = alt_titles_header.find_next_sibling(u'div', {'class': 'spaceit_pad'})
                # Walk sibling div.spaceit_pad nodes; each language row holds
                # a span.dark_text label like "Japanese:" followed by the
                # comma-separated names.
                while True:
                    if next_tag is None or not next_tag.find(u'span', {'class': 'dark_text'}):
                        # not a language node, break.
                        break
                    # get language and remove the node.
                    # ``[:-1]`` drops the trailing colon from the label text.
                    language = next_tag.find(u'span').text[:-1]
                    utilities.extract_tags(next_tag.find_all(u'span', {'class': 'dark_text'}))
                    names = next_tag.text.strip().split(u', ')
                    media_info[u'alternative_titles'][language] = names
                    next_tag = next_tag.find_next_sibling(u'div', {'class': 'spaceit_pad'})
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            try:
                # Primary layout: "Type:" text node whose grandparent wraps
                # the whole row; strip the label span, keep the value.
                type_tag = info_panel_first.find(text=u'Type:').parent.parent
                utilities.extract_tags(type_tag.find_all(u'span', {'class': 'dark_text'}))
                media_info[u'type'] = type_tag.text.strip()
            except AttributeError:
                # find() returned None (label absent) — scan sidebar divs for
                # one containing "Type:" and split off the value instead.
                type_tag = [x for x in info_panel_first.find_all('div') if 'Type:' in x.text][0]
                media_info[u'type'] = type_tag.text.split(':')[-1].strip()
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            # e.g. "Status: Finished Airing" -> "Finished Airing".
            status_tag = [x for x in media_page.find_all('span')if 'Status:' in x.text][0].parent
            media_info[u'status'] = status_tag.text.split(':')[1].strip()
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            # Genre parsing is delegated to the subclass/shared helper.
            media_info[u'genres'] = self.parse_genres(media_page_original)
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            # grab statistics for this media.
            score_tag = media_page.find('span', {'itemprop': 'aggregateRating'})
            # there is difference between anime and manga page
            # in manga page score_tag is in span-tag and anime in div-page
            # test score tag by try to find span-tag
            try:
                score_tag.find('span')
            except AttributeError:
                # NOTE(review): redundant double assignment left as-is.
                score_tag = score_tag = media_page.find('div', {'itemprop': 'aggregateRating'})

            # get score and number of users.
            num_users = int(score_tag.find('span', {'itemprop':'ratingCount'}).text.replace(',',''))
            # utilities.extract_tags(score_tag.find_all())
            score_point = score_tag.find('span',{'itemprop':'ratingValue'}).text
            try:
                # Score is stored as a (Decimal score, int rater-count) pair.
                media_info[u'score'] = (decimal.Decimal(score_point), num_users)
            except (InvalidOperation, AttributeError) :
                # Non-numeric text (e.g. "N/A") or missing node — retry
                # against the pristine DOM.
                score_tag = media_page_original.find('span',{'itemprop':'ratingValue'})
                media_info[u'score'] = (decimal.Decimal(score_tag.text), num_users)
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            try:
                # "Ranked: #123" -> 123 (leading '#' sliced off, commas removed).
                rank_tag = info_panel_first.find(text=u'Ranked:').parent.parent
                utilities.extract_tags(rank_tag.find_all())
                media_info[u'rank'] = int(rank_tag.text.strip()[1:].replace(u',', ''))
            except AttributeError:
                # NOTE(review): subscripting filter() only works on Python 2,
                # where it returns a list; Python 3 returns an iterator and
                # this raises TypeError (swallowed by the outer except when
                # suppression is on).  Same pattern recurs below.
                rank_tag = filter(lambda x: 'Ranked:' in x.text, media_page_original.find_all('div', {'class':'spaceit'}))
                media_info[u'rank'] = int(rank_tag[0].text.split('#')[-1].strip())

        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            try :
                # "Popularity: #456" -> 456, same slicing as rank above.
                popularity_tag = info_panel_first.find(text=u'Popularity:').parent.parent
                utilities.extract_tags(popularity_tag.find_all())
                media_info[u'popularity'] = int(popularity_tag.text.strip()[1:].replace(u',', ''))
            except AttributeError :
                # Fallback: locate the dark_text label span and read its row.
                rank_tag = filter(lambda x: 'Popularity' in x.text,
                                  media_page_original.find_all('span', {'class':'dark_text'}))[0].parent
                media_info[u'popularity'] = int(rank_tag.text.split('#')[-1].strip())
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            try :
                # "Members: 1,234,567" -> 1234567.
                members_tag = info_panel_first.find(text=u'Members:').parent.parent
                utilities.extract_tags(members_tag.find_all())
                media_info[u'members'] = int(members_tag.text.strip().replace(u',', ''))
            except AttributeError :
                members_tag = filter(lambda x: 'Members' in x.text,
                                  media_page_original.find_all('span', {'class':'dark_text'}))[0].parent
                media_info[u'members'] = int(members_tag.text.split(':')[-1].strip().replace(u',', ''))

        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            try :
                # "Favorites: 9,876" -> 9876.
                favorites_tag = info_panel_first.find(text=u'Favorites:').parent.parent
                utilities.extract_tags(favorites_tag.find_all())
                media_info[u'favorites'] = int(favorites_tag.text.strip().replace(u',', ''))
            except AttributeError :
                favorites_tag = filter(lambda x: 'Favorites' in x.text,
                                  media_page_original.find_all('span', {'class':'dark_text'}))[0].parent
                media_info[u'favorites'] = int(favorites_tag.text.split(':')[-1].strip().replace(u',', ''))                
            
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        try:
            # get popular tags.
            tags_header = media_page.find(u'h2', text=u'Popular Tags')
            try:
                # Classic layout: tag anchors follow the header; rater count
                # comes from the anchor's title attribute, e.g. "42 people".
                tags_tag = tags_header.find_next_sibling(u'span')
                media_info[u'popular_tags'] = {}
                for tag_link in tags_tag.find_all('a'):
                    tag = self.session.tag(tag_link.text)
                    num_people = int(re.match(r'(?P<people>[0-9]+) people', tag_link.get('title')).group('people'))
                    media_info[u'popular_tags'][tag] = num_people
            except AttributeError:
                # Header missing (tags_header is None) — fall back to the
                # "Genres:" row of the pristine DOM.
                tags_tag = media_page_original.find('span',text='Genres:').parent
                media_info[u'popular_tags'] = {}
                for tag_link in tags_tag.find_all('a'):
                    tag = self.session.tag(tag_link.text.lower())
                    try: 
                        num_people = int(re.match(r'(?P<people>[0-9]+) people', tag_link.get('title')).group('people'))
                        media_info[u'popular_tags'][tag] = num_people
                    except (TypeError, AttributeError): 
                        # No title attribute — store the genre id from the
                        # link's href query string instead of a people count.
                        tag_num = tag_link.get('href').split('=')[-1]
                        media_info[u'popular_tags'][tag] = tag_num
                
        except:
            if not self.session.suppress_parse_exceptions:
                raise

        return media_info