Example #1
    def get_manga_data(self, initial_data):
        """
        Returns manga data from API

        Initial data should contain at least manga's slug (provided by search)
        """
        assert 'slug' in initial_data, 'Manga slug is missing in initial data'
        r = self.session_get(self.api_chapters_url.format(initial_data['slug']))
        if r is None:
            return None

        json_data = r.json()
        resp_data = json_data['series']
        chapters = json_data['chapters']

        data = initial_data.copy()
        data.update(dict(
            authors=[],
            scanlators=[],
            genres=[],
            status='ongoing',
            chapters=[],
            synopsis=resp_data['locale'][self.locale]['description'],
            server_id=self.id,
            cover=resp_data['locale'][self.locale]['thumb_url'],
            url=self.manga_url.format(resp_data['url'][1:]),
        ))

        if resp_data.get('authors'):
            data['authors'] += [t.strip() for t in resp_data['authors'].split(',')]
        if resp_data.get('artist'):
            data['authors'] += [t.strip() for t in resp_data['artist'].split(',') if t.strip() not in data['authors']]

        if resp_data.get('translator'):
            data['scanlators'] += [t.strip() for t in resp_data['translator'].split('|')]

        if resp_data.get('genres'):
            data['genres'] = resp_data['genres']

        if resp_data['locale'][self.locale].get('copyright'):
            data['synopsis'] += '\n\n' + resp_data['locale'][self.locale]['copyright']

        # Chapters
        for chapter in chapters:
            date = None
            if chapter.get('availability_start'):
                date_string = chapter['availability_start'].split(' ')[0]
                if len(date_string) == 10 and '-00' not in date_string:
                    date = convert_date_string(date_string, '%Y-%m-%d')
            if date is None and chapter.get('updated'):
                date_string = chapter['updated'].split(' ')[0]
                if len(date_string) == 10 and '-00' not in date_string:
                    date = convert_date_string(date_string, '%Y-%m-%d')

            data['chapters'].append(dict(
                slug=chapter['chapter_id'],
                title=chapter['locale'][self.locale]['name'],
                date=date,
            ))

        return data
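
Note: every example in this list relies on a convert_date_string helper that is not shown. A minimal sketch, assuming it returns a datetime.date (or None on failure); the real helper presumably guesses the format, e.g. with the dateparser library, when call sites pass none:

    # Minimal sketch of the assumed convert_date_string helper (hypothetical)
    from datetime import datetime

    def convert_date_string(date_string, format=None):
        if format is None:
            format = '%Y-%m-%d'  # assumption: default to ISO-style dates
        try:
            return datetime.strptime(date_string, format).date()
        except ValueError:
            return None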
Example #2
    def get_manga_chapters_data(self, url):
        """
        Returns manga chapters data by scraping the manga's mobile HTML page
        """
        # Use a Mobile user agent
        r = self.session_get(self.chapters_url.format(url),
                             headers={'user-agent': USER_AGENT_MOBILE})
        if r is None:
            return []

        mime_type = magic.from_buffer(r.content[:128], mime=True)

        if r.status_code != 200 or mime_type != 'text/html':
            return []

        soup = BeautifulSoup(r.text, 'html.parser')

        li_elements = soup.find('ul',
                                id='_episodeList').find_all('li',
                                                            recursive=False)

        data = []
        for li_element in reversed(li_elements):
            if li_element.get('data-episode-no') is None:
                continue

            date_element = li_element.find('p', class_='date')
            if date_element.span:
                date_element.span.decompose()

            # Small difference compared to other servers: the slug alone
            # can't be used to forge the chapter URL, so the full URL is stored
            url_split = urlsplit(li_element.a.get('href'))

            data.append(
                dict(
                    slug=url_split.query,
                    title=li_element.find('p', class_='sub_title').find(
                        'span', class_='ellipsis').text.strip(),
                    date=convert_date_string(date_element.text.strip(),
                                             format='%b %d, %Y'),
                    url='{0}?{1}'.format(url_split.path, url_split.query),
                ))

        return data
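
The `r is None` checks above assume a session_get wrapper that swallows network errors instead of raising; the session_post used in Example #8 presumably mirrors it. A minimal sketch under that assumption:

    # Hypothetical sketch of the assumed session_get wrapper: returning
    # None on network errors is why callers check `r is None`
    import requests

    def session_get(self, url, **kwargs):
        try:
            return self.session.get(url, **kwargs)
        except requests.RequestException:
            return None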
Example #3
    def get_manga_data(self, initial_data):
        """
        Returns manga data by scraping manga HTML page content
        """
        r = self.session_get(self.manga_url)
        if r is None:
            return None

        mime_type = get_buffer_mime_type(r.content)

        if r.status_code != 200 or mime_type != 'text/html':
            return None

        soup = BeautifulSoup(r.text, 'html.parser')

        data = initial_data.copy()
        data.update(dict(
            authors=['Randall Munroe', ],
            scanlators=[],
            genres=[],
            status='ongoing',
            synopsis='A webcomic of romance, sarcasm, math, and language.',
            chapters=[],
            server_id=self.id,
            cover=self.cover_url,
        ))

        # Chapters
        for a_element in reversed(soup.find('div', id='middleContainer').find_all('a')):
            slug = a_element.get('href')[1:-1]

            data['chapters'].append(dict(
                slug=slug,
                date=convert_date_string(a_element.get('title'), '%Y-%m-%d'),
                title='{0} - {1}'.format(slug, a_element.text.strip()),
            ))

        return data
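
Some examples call magic.from_buffer(r.content[:128], mime=True) directly while others use a get_buffer_mime_type helper; a plausible sketch of the latter as a thin wrapper over python-magic:

    # Plausible sketch of the assumed get_buffer_mime_type helper
    import magic  # python-magic

    def get_buffer_mime_type(buffer):
        try:
            # The first bytes are enough to tell text/html from images
            return magic.from_buffer(buffer[:128], mime=True)
        except Exception:
            return ''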
Example #4
    def get_manga_data(self, initial_data):
        """
        Returns manga data by scraping manga HTML page content

        Initial data should contain at least manga's slug (provided by search)
        """
        assert 'slug' in initial_data, 'Slug is missing in initial data'

        r = self.session_get(self.manga_url.format(initial_data['slug']))
        if r is None:
            return None

        mime_type = get_buffer_mime_type(r.content)

        if r.status_code != 200 or mime_type != 'text/html':
            return None

        soup = BeautifulSoup(r.text, 'html.parser')

        data = initial_data.copy()
        data.update(
            dict(
                authors=[],
                scanlators=[],
                genres=[],
                status=None,
                synopsis=None,
                chapters=[],
                server_id=self.id,
                cover=None,
            ))

        data['name'] = soup.find('span', class_='manga-title').text.strip()

        cover_element = soup.find('div', class_='mangaImage2')
        if cover_element:
            data['cover'] = 'https:{0}'.format(cover_element.img.get('src'))

        # Details
        label = ''  # guard: rows may appear before the first <h4> label
        for element in soup.find_all('div', class_='rightBox')[1].find_all():
            if element.name == 'h4':
                label = element.text.strip()

                if label.startswith(('Status', 'Stato')):
                    status = element.find_all_next(string=True,
                                                   limit=2)[1].strip().lower()

                    if status in ('ongoing', 'in corso'):
                        data['status'] = 'ongoing'
                    elif status in ('completed', 'completato'):
                        data['status'] = 'complete'
                    elif status in ('suspended', 'sospeso'):
                        data['status'] = 'suspended'

                continue

            if element.name == 'a':
                if label.startswith(('Author', 'Autore', 'Artist', 'Artista')):
                    data['authors'].append(element.text.strip())
                elif label.startswith(('Genres', 'Genere')):
                    data['genres'].append(element.text.strip())

        # Synopsis
        synopsis_element = soup.find('h2', id='mangaDescription')
        if synopsis_element:
            data['synopsis'] = synopsis_element.text.strip()

        # Chapters
        elements = soup.find('table').tbody.find_all('tr')
        for element in reversed(elements):
            tds_elements = element.find_all('td')

            data['chapters'].append(
                dict(
                    slug=tds_elements[0].a.get('href').split('/')[-3],
                    title=tds_elements[0].b.text.strip(),
                    date=convert_date_string(tds_elements[3].text.strip(),
                                             format='%b %d, %Y'),
                ))

        return data
Example #5
    def get_manga_data(self, initial_data):
        """
        Returns manga data by scraping manga HTML page content

        Initial data should contain at least manga's slug (provided by search)
        """
        assert 'slug' in initial_data, 'Slug is missing in initial data'

        r = self.session_get(self.manga_url.format(initial_data['slug']))
        if r is None:
            return None

        mime_type = magic.from_buffer(r.content[:128], mime=True)

        if r.url == self.base_url:
            # Manga page doesn't exist, we have been redirected to homepage
            return None
        if r.status_code != 200 or mime_type != 'text/html':
            return None

        soup = BeautifulSoup(r.text, 'html.parser')

        data = initial_data.copy()
        data.update(
            dict(
                authors=[],
                scanlators=[],
                genres=[],
                status=None,
                synopsis=None,
                chapters=[],
                server_id=self.id,
                cover=None,
            ))

        name = soup.find('div', class_='ttline').h1.text.strip()
        name = name.replace(' Manga', '').replace(' Манга', '')  # cleaning
        data['name'] = name
        data['cover'] = soup.find('a', class_='bookface').img.get('src')

        # Details
        elements = soup.find('ul', class_='message').find_all('li')
        for element in elements:
            label = element.b.text

            if label.startswith(('Author', 'Auteur', 'Autor')):
                data['authors'] = [
                    element.a.text.strip(),
                ]
            elif label.startswith(
                    ('Genre', 'Género', 'Genere', 'Gênero')):
                for a_element in element.find_all('a'):
                    data['genres'].append(a_element.text)
            elif label.startswith(('Status', 'Statut', 'Estado', 'Stato')):
                value = element.find_all('a')[0].text.strip().lower()

                if value in ('ongoing', 'en cours', 'laufende', 'en curso',
                             'in corso', 'em tradução'):
                    data['status'] = 'ongoing'
                elif value in ('complete', 'complété', 'abgeschlossen',
                               'completado', 'completato', 'completo'):
                    data['status'] = 'complete'

        # Synopsis
        synopsis_element = soup.find('p', itemprop='description')
        if synopsis_element:
            synopsis_element.b.extract()
            data['synopsis'] = synopsis_element.text.strip()

        # Chapters
        div_element = soup.find('div', class_='chapterbox')
        if div_element:
            li_elements = div_element.find_all('li')
            for li_element in reversed(li_elements):
                slug = li_element.a.get('href').split('/')[-1].replace(
                    '.html', '')
                data['chapters'].append(
                    dict(
                        slug=slug,
                        title=li_element.a.text.strip(),
                        date=convert_date_string(li_element.span.text.strip(),
                                                 format='%b %d, %Y'),
                    ))

        return data
Example #6
    def get_manga_data(self, initial_data):
        """
        Returns manga data by scraping manga HTML page content

        Initial data should contain at least manga's slug (provided by search)
        """
        assert 'slug' in initial_data, 'Manga slug is missing in initial data'

        r = self.session_get(self.manga_url.format(initial_data['slug']))
        if r is None:
            return None

        mime_type = magic.from_buffer(r.content[:128], mime=True)

        if r.status_code != 200 or mime_type != 'text/html':
            return None

        soup = BeautifulSoup(r.text, 'html.parser')

        data = initial_data.copy()
        data.update(
            dict(
                authors=[],
                scanlators=[],
                genres=[],
                status=None,
                synopsis=None,
                chapters=[],
                server_id=self.id,
                cover=None,
            ))

        # Name & cover
        data['name'] = soup.find('h1', class_='SeriesName').text.strip()
        data['cover'] = soup.find('div', class_='leftImage').img.get('src')

        # Details & Synopsis
        elements = soup.find('span', class_='details').find_all('div',
                                                                class_='row')
        for element in elements:
            div_element = element.div
            if div_element.b:
                label = div_element.b.text.strip()
            elif div_element.strong:
                label = div_element.strong.text.strip()
            else:
                # No <b>/<strong> label in this row: nothing to match
                continue

            if label.startswith('Author'):
                links_elements = div_element.find_all('a')
                for link_element in links_elements:
                    data['authors'].append(link_element.text.strip())
            elif label.startswith('Genre'):
                links_elements = div_element.find_all('a')
                for link_element in links_elements:
                    data['genres'].append(link_element.text.strip())
            elif label.startswith('Status'):
                value = div_element.find_all('a')[0].text.strip()
                if value.startswith('Complete'):
                    data['status'] = 'complete'
                elif value.startswith('Ongoing'):
                    data['status'] = 'ongoing'
            elif label.startswith('Description'):
                data['synopsis'] = div_element.div.text.strip()

        # Chapters
        elements = soup.find('div',
                             class_='chapter-list').find_all('a',
                                                             recursive=False)
        for link_element in reversed(elements):
            data['chapters'].append(
                dict(
                    slug=link_element.get('chapter'),
                    title=link_element.span.text.strip(),
                    date=convert_date_string(
                        link_element.time.get('datestring').strip(),
                        format='%Y%m%d'),
                ))

        return data
Example #7
    def get_manga_data(self, initial_data):
        """
        Returns manga data by scraping manga HTML page content

        Initial data should contain at least manga's slug (provided by search)
        """
        assert 'slug' in initial_data, 'Manga slug is missing in initial data'

        r = self.session_get(self.manga_url.format(initial_data['slug']))
        if r is None:
            return None

        mime_type = get_buffer_mime_type(r.content)

        if r.status_code != 200 or mime_type != 'text/html':
            return None

        soup = BeautifulSoup(r.text, 'html.parser')

        data = initial_data.copy()
        data.update(
            dict(
                authors=[],
                scanlators=[
                    self.name,
                ],
                genres=[],
                status=None,
                synopsis=None,
                chapters=[],
                server_id=self.id,
                cover=None,
            ))

        data['name'] = soup.find_all('h5')[0].text.strip()
        data['cover'] = self.image_url.format(
            soup.find(
                'div',
                class_='media-comic-card').a.get('style').split('(')[-1][:-1])

        # Details
        data['synopsis'] = soup.find('div',
                                     class_='col-lg-9').contents[2].strip()

        # Chapters
        elements = soup.find('div', class_='list list-row row').find_all(
            'div', class_='list-item')
        for element in reversed(elements):
            a_elements = element.find_all('a')

            slug = '/'.join(a_elements[0].get('href').split('/')[-2:])
            title = '#{0} - {1}'.format(element.span.text.strip(),
                                        a_elements[0].text.strip())
            date = a_elements[1].text.strip()

            data['chapters'].append(
                dict(
                    slug=slug,
                    date=convert_date_string(date),
                    title=title,
                ))

        return data
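
The cover extraction above pulls the image path out of an inline CSS background-image style. On a hypothetical attribute value:

    # Hypothetical style attribute, for illustration only
    style = 'background-image: url(/uploads/covers/123.jpg)'
    path = style.split('(')[-1][:-1]
    # path == '/uploads/covers/123.jpg'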
Example #8
    def get_manga_data(self, initial_data):
        """
        Returns manga data by scraping manga HTML page content

        Initial data should contain at least manga's slug (provided by search)
        """
        assert 'slug' in initial_data, 'Manga slug is missing in initial data'

        r = self.session_get(self.manga_url.format(initial_data['slug']))
        if r is None:
            return None

        mime_type = magic.from_buffer(r.content[:128], mime=True)

        if r.status_code != 200 or mime_type != 'text/html':
            return None

        soup = BeautifulSoup(r.text, 'html.parser')

        adult_alert = False
        if soup.find('div', class_='alert'):
            adult_alert = True

            r = self.session_post(self.manga_url.format(initial_data['slug']),
                                  data=dict(adult='true'))
            if r is None:
                return None

            soup = BeautifulSoup(r.text, 'html.parser')

        data = initial_data.copy()
        data.update(
            dict(
                authors=[],
                scanlators=[
                    self.name,
                ],
                genres=[],
                status=None,
                synopsis=None,
                chapters=[],
                server_id=self.id,
                cover=None,
            ))

        data['name'] = soup.find('h1', class_='title').text.strip()
        data['cover'] = soup.find('div', class_='thumbnail').img.get('src')

        # Details
        for element in soup.find('div', class_='info').find_all('b'):
            label = element.text
            value = list(element.next_siblings)[0][2:]  # drop the ': ' separator
            if label in ('Author', 'Artist'):
                data['authors'].append(value)
            elif label in (
                    'Description',
                    'Synopsis',
            ):
                if adult_alert:
                    data['synopsis'] = '{0}\n\n{1}'.format(
                        'ALERT: This series contains mature content and is meant to be viewed by an adult audience.',
                        value)
                else:
                    data['synopsis'] = value

        # Chapters
        for element in reversed(
                soup.find('div', class_='list').find_all('div',
                                                         class_='element')):
            a_element = element.find('div', class_='title').a

            title = a_element.text.strip()
            slug = a_element.get('href').replace(
                f'{self.base_url}/read/{initial_data["slug"]}/{self.lang}/',
                '')[:-1]
            date = convert_date_string(
                list(
                    element.find('div', class_='meta_r').find_all('a')
                    [-1].next_siblings)[0][2:], '%Y.%m.%d')

            data['chapters'].append(dict(
                slug=slug,
                date=date,
                title=title,
            ))

        return data
Example #9
    def get_manga_data(self, initial_data):
        """
        Returns manga data by scraping manga HTML page content

        Initial data should contain at least manga's slug (provided by search)
        """
        assert 'slug' in initial_data, 'Manga slug is missing in initial data'

        r = self.session_get(self.manga_url.format(initial_data['slug']))
        if r is None:
            return None

        mime_type = get_buffer_mime_type(r.content)

        if r.status_code != 200 or mime_type != 'text/html':
            return None

        soup = BeautifulSoup(r.text, 'lxml')

        data = initial_data.copy()
        data.update(
            dict(
                authors=[],
                scanlators=[],
                genres=[],
                status=None,
                synopsis=None,
                chapters=[],
                server_id=self.id,
                cover=None,
            ))

        name_element = soup.find_all('h3')[0]
        name_element.i.decompose()
        name_element.small.decompose()
        data['name'] = name_element.text.strip()
        data['cover'] = self.cover_url.format(data['slug'])

        # Details
        elements = soup.find('div', class_='list-group').find_all(
            'span', class_='list-group-item')
        for element in elements:
            label = element.b.text.strip()

            if label.startswith(('Autor', 'Artist')):
                for a_element in element.find_all('a'):
                    value = a_element.text.strip()
                    if value not in data['authors']:
                        data['authors'].append(value)
            elif label.startswith('Categorías'):
                for a_element in element.find_all('a'):
                    value = a_element.text.strip()
                    if value not in data['genres']:
                        data['genres'].append(value)
            elif label.startswith('Estado'):
                value = element.span.text.strip().lower()
                if value in ('complete', 'ongoing'):
                    data['status'] = value
            elif label.startswith('Resumen'):
                element.b.extract()
                data['synopsis'] = element.text.strip()

        # Chapters
        elements = soup.find('div', class_='capitulos-list').find_all('tr')
        for element in reversed(elements):
            td_elements = element.find_all('td')
            a_element = td_elements[0].find('a')
            date_element = td_elements[1]
            date_element.i.extract()
            date_element.span.extract()

            data['chapters'].append(
                dict(
                    slug=a_element.get('href').split('/')[-1],
                    title=a_element.text.strip(),
                    date=convert_date_string(date_element.text.strip(),
                                             '%d %b. %Y'),
                ))

        return data
Example #10
    def get_manga_data(self, initial_data):
        """
        Returns manga data by scraping manga HTML page content

        Initial data should contain at least manga's slug (provided by search)
        """
        assert 'slug' in initial_data, 'Manga slug is missing in initial data'

        r = self.session_get(self.manga_url.format(initial_data['slug']))
        if r is None:
            return None

        mime_type = magic.from_buffer(r.content[:128], mime=True)

        if r.status_code != 200 or mime_type != 'text/html':
            return None

        soup = BeautifulSoup(r.text, 'lxml')

        data = initial_data.copy()
        data.update(
            dict(
                authors=[],
                scanlators=[],
                genres=[],
                status=None,
                synopsis=None,
                chapters=[],
                server_id=self.id,
                cover=None,
            ))

        data['name'] = soup.find_all('h2',
                                     class_='widget-title')[0].text.strip()
        data['cover'] = self.cover_url.format(data['slug'])

        # Details
        elements = soup.find(
            'dl', class_='dl-horizontal').findChildren(recursive=False)
        for element in elements:
            if element.name not in ('dt', 'dd'):
                continue

            if element.name == 'dt':
                label = element.text
                continue

            if label.startswith(('Auteur', 'Artist')):
                value = element.text.strip()
                for t in value.split(','):
                    t = t.strip()
                    if t not in data['authors']:
                        data['authors'].append(t)
            elif label.startswith('Catégories'):
                data['genres'] = [
                    a_element.text.strip()
                    for a_element in element.find_all('a')
                ]
            elif label.startswith('Statut'):
                value = element.text.strip().lower()
                if value == 'en cours':
                    data['status'] = 'ongoing'
                elif value == 'terminé':
                    data['status'] = 'complete'

        data['synopsis'] = soup.find('div', class_='well').p.text.strip()
        alert_element = soup.find('div', class_='alert-danger')
        if alert_element:
            data['synopsis'] += '\n\n' + alert_element.text.strip()

        # Chapters
        elements = soup.find('ul', class_='chapters').find_all('li',
                                                               recursive=False)
        for element in reversed(elements):
            h5 = element.h5
            if not h5:
                continue

            slug = h5.a.get('href').split('/')[-1]
            title = '{0}: {1}'.format(h5.a.text.strip(), h5.em.text.strip())
            date = element.div.div

            data['chapters'].append(
                dict(slug=slug,
                     date=convert_date_string(date.text.strip(),
                                              format='%d %b. %Y'),
                     title=title))

        return data
Example #11
    def get_manga_data(self, initial_data):
        """
        Returns manga data by scraping manga HTML page content

        Initial data should contain at least manga's slug (provided by search)
        """
        assert 'slug' in initial_data, 'Manga slug is missing in initial data'

        r = self.session_get(self.manga_url.format(initial_data['slug']))
        if r is None:
            return None

        mime_type = magic.from_buffer(r.content[:128], mime=True)

        if r.status_code != 200 or mime_type != 'text/html':
            return None

        soup = BeautifulSoup(r.text, 'html.parser')

        data = initial_data.copy()
        data.update(
            dict(
                authors=[],
                scanlators=[],
                genres=[],
                status=None,
                synopsis=None,
                chapters=[],
                server_id=self.id,
            ))

        title_element = soup.find('h1', class_='manga-bg__title')
        if title_element is None:
            title_element = soup.find('h1', class_='manga__title')
        data['name'] = title_element.text.strip()
        if data.get('cover') is None:
            data['cover'] = self.cover_url.format(data['slug'])

        # Details
        elements = soup.find(
            'div', class_='manga-info').find_all(class_='info-list__row')
        for element in elements:
            label = element.strong.text.strip()

            if label.startswith(('Auteur', 'Artiste')):
                value = element.a.text.strip()
                for t in value.split(','):
                    t = t.strip()
                    if t not in data['authors']:
                        data['authors'].append(t)
            elif label.startswith('Scantrad'):
                a_element = element.find_all('a')[0]
                data['scanlators'] = [
                    a_element.text.replace('[', '').replace(']', '').strip(),
                ]
            elif label.startswith('Genres'):
                a_elements = element.find_all('a')
                data['genres'] = [
                    a_element.text.strip() for a_element in a_elements
                ]
            elif label.startswith('Statut'):
                status = element.span.text.strip().lower()
                if status == 'en cours':
                    data['status'] = 'ongoing'
                elif status == 'terminé':
                    data['status'] = 'complete'

        # Synopsis
        data['synopsis'] = soup.find('div',
                                     class_='info-desc__content').text.strip()

        # Chapters
        elements = soup.find('div', class_='chapters-list').find_all(
            'div', class_='chapter-item')
        for element in reversed(elements):
            a_element = element.find('div', class_='chapter-item__name').a
            slug = a_element.get('href').split('/')[-1]
            title = a_element.text.strip()
            date = element.find('div',
                                class_='chapter-item__date').text.strip()

            data['chapters'].append(
                dict(
                    slug=slug,
                    title=title,
                    date=convert_date_string(date, format='%d.%m.%Y'),
                ))

        return data
Example #12
    def get_manga_data(self, initial_data):
        """
        Returns manga data by scraping manga HTML page content

        Initial data should contain at least manga's slug (provided by search)
        """
        assert 'slug' in initial_data, 'Manga slug is missing in initial data'

        r = self.session_get(self.manga_url.format(initial_data['slug']))
        if r.status_code != 200:
            return None

        mime_type = get_buffer_mime_type(r.content)
        if mime_type != 'text/html':
            return None

        soup = BeautifulSoup(r.text, 'html.parser')

        data = initial_data.copy()
        data.update(
            dict(
                authors=[],
                scanlators=[],
                genres=[],
                status=None,
                synopsis=None,
                chapters=[],
                server_id=self.id,
            ))

        # Name & cover
        data['name'] = soup.find(
            'div', class_='story-info-right').find('h1').text.strip()
        if data.get('cover') is None:
            data['cover'] = soup.find('span',
                                      class_='info-image').img.get('src')

        # Details
        tr_elements = soup.find('table',
                                class_='variations-tableInfo').find_all('tr')
        for tr_element in tr_elements:
            td_elements = tr_element.find_all('td')
            label = td_elements[0].text.strip()
            value = td_elements[1].text.strip()

            if label.startswith('Author'):
                data['authors'] = [t.strip() for t in value.split('-') if t]
            elif label.startswith('Genres'):
                data['genres'] = [t.strip() for t in value.split('-')]
            elif label.startswith('Status'):
                status = value.lower()
                if status == 'completed':
                    data['status'] = 'complete'
                elif status == 'ongoing':
                    data['status'] = 'ongoing'

        # Synopsis
        div_synopsis = soup.find('div', id='panel-story-info-description')
        div_synopsis.h3.extract()
        data['synopsis'] = div_synopsis.text.strip()

        # Chapters
        li_elements = soup.find('ul',
                                class_='row-content-chapter').find_all('li')
        for li_element in reversed(li_elements):
            span_elements = li_element.find_all('span')

            slug = li_element.a.get('href').split('/')[-1]
            title = li_element.a.text.strip()
            date = span_elements[1].get('title')[:-6]

            data['chapters'].append(
                dict(
                    slug=slug,
                    title=title,
                    date=convert_date_string(date, format='%b %d,%y'),
                ))

        return data
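
The [:-6] slice above strips a trailing time component from the span's title attribute before date parsing. On a hypothetical value consistent with the '%b %d,%y' format used here:

    # Hypothetical title attribute, for illustration only
    raw = 'Jan 21,20 05:36'
    date_string = raw[:-6]  # 'Jan 21,20'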
Example #13
    def get_manga_data(self, initial_data):
        """
        Returns manga data by scraping manga HTML page content

        Initial data should contain at least manga's slug (provided by search)
        """
        assert 'slug' in initial_data, 'Manga slug is missing in initial data'

        r = self.session_get(self.manga_url.format(initial_data['slug']))
        if r is None:
            return None

        mime_type = magic.from_buffer(r.content[:128], mime=True)

        if r.status_code != 200 or mime_type != 'text/html':
            return None

        soup = BeautifulSoup(r.text, 'lxml')

        data = initial_data.copy()
        data.update(
            dict(
                authors=[],
                scanlators=[],
                genres=[],
                status=None,
                synopsis=None,
                chapters=[],
                server_id=self.id,
            ))

        info_element = soup.find('div', class_='leftContent')

        title_element = info_element.find('span', class_='name')
        data['name'] = title_element.text.strip()

        cover_element = info_element.find('img', attrs={'data-full': True})
        data['cover'] = cover_element.get('data-full')

        # Details
        elements = info_element.find('div', class_='subject-meta').find_all(
            'p', recursive=False)

        status = elements[1].find(string=True, recursive=False).strip()
        if status == 'продолжается':
            data['status'] = 'ongoing'
        elif status == 'завершен':
            data['status'] = 'complete'

        for element in elements[2:]:
            label = element.span.text.strip()

            if label.startswith(('Автор', 'Сценарист', 'Художник')):
                value = [
                    author.text.strip()
                    for author in element.find_all('a', class_='person-link')
                ]
                data['authors'].extend(value)
            elif label.startswith('Переводчик'):
                value = [
                    scanlator.text.strip()
                    for scanlator in element.find_all('a',
                                                      class_='person-link')
                ]
                data['scanlators'].extend(value)
            elif label.startswith('Жанр'):
                value = [
                    genre.text.strip()
                    for genre in element.find_all('a', class_='element-link')
                ]
                data['genres'].extend(value)

        # Synopsis
        data['synopsis'] = info_element.find(
            'div', class_='manga-description').text.strip()

        # Chapters
        chapters_element = info_element.find('div',
                                             class_='chapters-link',
                                             recursive=False)
        if not chapters_element:
            return data

        for element in reversed(
                chapters_element.table.find_all('tr', recursive=False)):
            a_element = element.find('a')
            slug = a_element.get('href').split('/', 2)[2]
            title = a_element.find(string=True, recursive=False).strip()
            date = element.find('td', align='right').text.strip()

            data['chapters'].append(
                dict(
                    slug=slug,
                    title=title,
                    date=convert_date_string(date, format='%d.%m.%Y'),
                ))

        return data
Example #14
    def get_manga_data(self, initial_data):
        """
        Returns manga data by scraping manga HTML page content

        Initial data should contain at least manga's slug (provided by search)
        """
        assert 'slug' in initial_data, 'Manga slug is missing in initial data'

        r = self.session_get(self.manga_url.format(initial_data['slug']))
        if r is None:
            return None

        mime_type = magic.from_buffer(r.content[:128], mime=True)

        if r.status_code != 200 or mime_type != 'text/html':
            return None

        soup = BeautifulSoup(r.text, 'html.parser')

        data = initial_data.copy()
        data.update(
            dict(
                authors=[],
                scanlators=[],
                genres=[],
                status=None,
                synopsis=None,
                chapters=[],
                server_id=self.id,
                cover=None,
            ))

        container_element = soup.find('div', class_='tamanho-bloco-perfil')

        data['name'] = container_element.find('h2').text.strip()
        data['cover'] = container_element.find(
            'img', class_='img-thumbnail').get('src')

        for div_element in container_element.find_all(
                'div', class_='col-md-8 col-xs-12'):
            if not div_element.h4:
                continue

            label = div_element.find('label').text.strip()
            div_element.h4.label.extract()
            value = div_element.text.strip()

            if label.startswith('Gênero'):
                data['genres'] = [genre.strip() for genre in value.split(',')]
            elif label.startswith(('Autor', 'Artista')):
                for author in value.split(','):
                    author = author.strip()
                    if author not in data['authors']:
                        data['authors'].append(author)
            elif label.startswith('Status'):
                if value == 'Completo':
                    data['status'] = 'complete'
                elif value == 'Ativo':
                    data['status'] = 'ongoing'

        data['synopsis'] = container_element.find(
            'div', class_='panel-body').text.strip()

        # Chapters
        for div_element in reversed(
                container_element.find_all('div',
                                           class_='row lancamento-linha')):
            a_element = div_element.div.a
            span_element = div_element.div.find_all('span', recursive=False)[1]

            data['chapters'].append(
                dict(
                    title=a_element.text.strip(),
                    slug=a_element.get('href').split('/')[-1],
                    date=convert_date_string(span_element.text.strip()[1:-1],
                                             format='%d/%m/%Y'),
                ))

        return data
Example #15
    def get_manga_data(self, initial_data):
        """
        Returns manga data by scraping manga HTML page content

        Initial data should contain at least manga's slug (provided by search)
        """
        assert 'slug' in initial_data, 'Manga slug is missing in initial data'
        r = self.session_get(self.manga_url.format(initial_data['slug']))
        if r is None:
            return None

        mime_type = get_buffer_mime_type(r.content)

        if r.status_code != 200 or mime_type != 'text/html':
            return None

        soup = BeautifulSoup(r.text, 'lxml')

        data = initial_data.copy()
        data.update(
            dict(
                authors=[],
                scanlators=[],
                genres=[],
                status=None,
                synopsis=None,
                chapters=[],
                server_id=self.id,
            ))

        title_element = soup.find('h1', class_='manga-bg__title')
        if not title_element:
            title_element = soup.find('div', class_='manga-title').h1
        data['name'] = title_element.text.strip()

        cover_element = soup.find('img', class_='manga__cover')
        data['cover'] = cover_element.get('src')

        # Details
        for info in soup.find_all('div', class_='info-list__row'):
            label = info.strong.text.strip()

            if label.startswith('Автор'):
                value = [author.text.strip() for author in info.find_all('a')]
                data['authors'].extend(value)
            elif label.startswith('Художник'):
                value = [
                    author.text.strip() for author in info.find_all('a')
                    if author.text.strip() not in data['authors']
                ]
                data['authors'].extend(value)
            elif label.startswith('Переводчик'):
                value = [
                    scanlator.text.strip() for scanlator in info.find_all('a')
                ]
                data['scanlators'].extend(value)
            elif label.startswith('Перевод'):
                status = info.span.text.strip()
                if status == 'продолжается':
                    data['status'] = 'ongoing'
                elif status == 'завершен':
                    data['status'] = 'complete'
            elif label.startswith('Жанр'):
                value = [genre.text.strip() for genre in info.find_all('a')]
                data['genres'].extend(value)

        # Synopsis
        synopsis_element = soup.find('div', class_='info-desc__content')
        if synopsis_element:
            data['synopsis'] = synopsis_element.text.strip()

        # Chapters
        for element in reversed(soup.find_all('div', class_='chapter-item')):
            a_element = element.find('a')
            if a_element:
                # [8:] strips the 'https://' scheme prefix before splitting
                slug = a_element.get('href')[8:].split('/', 2)[2]
            else:
                teams = json.loads(element.get('data-teams'))
                slug = 'v{}/c{}/{}'.format(element.get('data-volume'),
                                           element.get('data-number'),
                                           teams[0]['slug'])

            title = ' '.join(
                element.find('div', class_='chapter-item__name').text.split())
            date = element.find('div',
                                class_='chapter-item__date').text.strip()

            data['chapters'].append(
                dict(
                    slug=slug,
                    title=title,
                    date=convert_date_string(date, format='%d.%m.%Y'),
                ))

        return data
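
The fallback branch above forges a chapter slug from data attributes when the chapter has no link yet. With hypothetical attribute values data-volume="1", data-number="5" and data-teams='[{"slug": "team-a"}]':

    # Hypothetical attribute values, for illustration only
    import json

    teams = json.loads('[{"slug": "team-a"}]')
    slug = 'v{}/c{}/{}'.format('1', '5', teams[0]['slug'])
    # slug == 'v1/c5/team-a'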
Example #16
    def get_manga_data(self, initial_data):
        """
        Returns manga data by scraping manga HTML page content

        Initial data should contain at least manga's slug (provided by search)
        """
        assert 'slug' in initial_data, 'Manga slug is missing in initial data'

        r = self.session_get(self.manga_url.format(initial_data['slug']))
        if r is None:
            return None

        mime_type = get_buffer_mime_type(r.content)

        if r.status_code != 200 or mime_type != 'text/html':
            return None

        soup = BeautifulSoup(r.text, 'html.parser')

        data = initial_data.copy()
        data.update(
            dict(
                authors=[],
                scanlators=[
                    SERVER_NAME,
                ],
                genres=[],
                status=None,
                synopsis=None,
                chapters=[],
                server_id=self.id,
                cover=None,
            ))

        div_info = soup.find('div', class_='mf-info')
        data['name'] = div_info.find('div', class_='titre').text.strip()
        data['cover'] = '{0}/{1}'.format(
            self.base_url,
            div_info.find('div', class_='poster').img.get('src'))

        status = div_info.find_all(
            'div', class_='sub-i')[-1].span.text.strip().lower()
        if status == 'en cours':
            data['status'] = 'ongoing'
        elif status == 'terminé':
            data['status'] = 'complete'

        data['synopsis'] = div_info.find('div', class_='synopsis').text.strip()

        # Chapters
        for div_element in reversed(
                soup.find('div', id='chap-top').find_all('div',
                                                         class_='chapitre')):
            btns_elements = div_element.find('div',
                                             class_='ch-right').find_all('a')
            if len(btns_elements) < 2:
                continue

            data['chapters'].append(
                dict(
                    slug=btns_elements[0].get('href').split('/')[-1],
                    date=convert_date_string(
                        div_element.find('div', class_='chl-date').text),
                    title='{0} {1}'.format(
                        div_element.find('span',
                                         class_='chl-num').text.strip(),
                        div_element.find('span',
                                         class_='chl-titre').text.strip()),
                ))

        return data
Example #17
    def get_manga_data(self, initial_data):
        """
        Returns manga data by scraping manga HTML page content

        Initial data should contain at least manga's slug (provided by search)
        """
        assert 'slug' in initial_data, 'Slug is missing in initial data'

        r = self.session_get(self.manga_url.format(initial_data['slug']))
        if r.status_code != 200:
            return None

        mime_type = get_buffer_mime_type(r.content)
        if mime_type != 'text/html':
            return None

        data = initial_data.copy()
        data.update(dict(
            authors=[],
            scanlators=[],  # not available
            genres=[],
            status=None,
            synopsis=None,
            chapters=[],
            server_id=self.id,
            cover=self.cover_url.format(data['slug']),
        ))

        soup = BeautifulSoup(r.content, 'lxml')

        data['name'] = soup.find('h1').text.strip()

        for li_element in soup.find('ul', class_='list-group list-group-flush').find_all('li'):
            if li_element.span is None:
                continue

            label = li_element.span.text.strip()
            li_element.span.extract()

            if label.startswith('Author'):
                data['authors'] = [artist.strip() for artist in li_element.text.split(',')]

            elif label.startswith('Genre'):
                data['genres'] = [genre.strip() for genre in li_element.text.split(',')]

            elif label.startswith('Status'):
                for status in li_element.text.split(','):
                    if 'Scan' not in status:
                        continue

                    status = status.replace('(Scan)', '').strip().lower()
                    if status in ('complete', 'hiatus', 'ongoing', ):
                        data['status'] = status
                    elif status in ('cancelled', 'discontinued', ):
                        data['status'] = 'suspended'
                    break

            elif label.startswith('Description'):
                data['synopsis'] = li_element.text.strip()

        # Chapters
        chapters = None
        try:
            script = soup.find_all('script')[-1].string
            if script:
                for line in script.split('\n'):
                    line = line.strip()
                    if not line.startswith('vm.Chapters'):
                        continue

                    chapters = json.loads(line.split('=', 1)[1].strip()[:-1])
                    break
        except Exception as e:
            log_error_traceback(e)
            return None

        if chapters is not None:
            for chapter in reversed(chapters):
                slug = chapter['Chapter']

                title = f'{chapter["Type"]} {int(chapter["Chapter"][1:-1])}'
                if chapter['Chapter'][-1] != '0':
                    title = f'{title}.{chapter["Chapter"][-1]}'
                if chapter.get('ChapterName'):
                    title = f'{title} - {chapter["ChapterName"]}'

                data['chapters'].append(dict(
                    slug=slug,
                    title=title,
                    date=convert_date_string(chapter['Date'], '%Y-%m-%d %H:%M:%S') if chapter.get('Date') else None,
                ))

        return data
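
The chapter list above is recovered by isolating the vm.Chapters assignment inside an inline script. On a hypothetical script line:

    # Hypothetical script line, for illustration only
    import json

    line = 'vm.Chapters = [{"Chapter": "100015", "Type": "Chapter"}];'
    chapters = json.loads(line.split('=', 1)[1].strip()[:-1])
    # chapters[0]['Chapter'] == '100015'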
Example #18
    def get_manga_data(self, initial_data):
        """
        Returns manga data by scraping manga HTML page content

        Initial data should contain at least manga's slug (provided by search)
        """
        assert 'slug' in initial_data, 'Manga slug is missing in initial data'

        r = self.session_get(self.manga_url.format(initial_data['slug']))
        if r is None:
            return None

        mime_type = magic.from_buffer(r.content[:128], mime=True)

        if r.status_code != 200 or mime_type != 'text/html':
            return None

        soup = BeautifulSoup(r.text, 'html.parser')

        data = initial_data.copy()
        data.update(
            dict(
                authors=[],
                scanlators=[],
                genres=[],
                status=None,
                chapters=[],
                server_id=self.id,
                synopsis=None,
            ))

        card_element = soup.find_all('div', class_='card')[0]

        # Main name: japscan handles several names for mangas (main + alternatives)
        # Name provided by search can be one of the alternatives
        # First word (Manga, Manhwa, ...) must be removed from name
        data['name'] = ' '.join(
            card_element.find('h1').text.strip().split()[1:])
        if data.get('cover') is None:
            data['cover'] = self.cover_url.format(
                card_element.find('img').get('src'))

        # Details
        if not card_element.find_all('div', class_='d-flex'):
            # mobile version
            elements = card_element.find_all('div',
                                             class_='row')[0].find_all('p')
        else:
            # desktop version
            elements = card_element.find_all(
                'div', class_='d-flex')[0].find_all('p', class_='mb-2')

        for element in elements:
            label = element.span.text
            element.span.extract()
            value = element.text.strip()

            if label.startswith(('Auteur', 'Artiste')):
                for t in value.split(','):
                    t = t.strip()
                    if t not in data['authors']:
                        data['authors'].append(t)
            elif label.startswith('Genre'):
                data['genres'] = [genre.strip() for genre in value.split(',')]
            elif label.startswith('Statut'):
                # Possible values: ongoing, complete
                data['status'] = 'ongoing' if value == 'En Cours' else 'complete'

        # Synopsis
        synopsis_element = card_element.find('p',
                                             class_='list-group-item-primary')
        if synopsis_element:
            data['synopsis'] = synopsis_element.text.strip()

        # Chapters
        elements = soup.find('div', id='chapters_list').find_all(
            'div', class_='chapters_list')
        for element in reversed(elements):
            if element.a.span:
                span = element.a.span.extract()
                # JapScan sometimes uploads some "spoiler preview" chapters, containing 2 or 3 untranslated pictures taken from a raw.
                # Sometimes they also upload full RAWs/US versions and replace them with a translation as soon as available.
                # Those have a span.badge "SPOILER", "RAW" or "VUS". We exclude these from the chapters list.
                if span.text.strip() in (
                        'RAW',
                        'SPOILER',
                        'VUS',
                ):
                    continue

            slug = element.a.get('href').split('/')[3]

            data['chapters'].append(
                dict(
                    slug=slug,
                    title=element.a.text.strip(),
                    date=convert_date_string(element.span.text.strip(),
                                             format='%d %b %Y'),
                ))

        return data
Example #19
    def get_manga_data(self, initial_data):
        """
        Returns manga data by scraping manga HTML page content

        Initial data should contain at least manga's slug (provided by search)
        """
        assert 'slug' in initial_data, 'Slug is missing in initial data'

        r = self.session_get(self.manga_url.format(initial_data['slug']))
        if r is None:
            return None

        mime_type = magic.from_buffer(r.content[:128], mime=True)

        if r.status_code != 200 or mime_type != 'text/html':
            return None

        soup = BeautifulSoup(r.text, 'html.parser')

        data = initial_data.copy()
        data.update(
            dict(
                authors=[],
                scanlators=[],
                genres=[],
                status=None,
                synopsis=None,
                chapters=[],
                server_id=self.id,
                cover=None,
            ))

        data['name'] = soup.find('h1').text.strip()

        # Details
        elements = soup.find('div', class_='relaxed').find_all('div',
                                                               class_='item')
        for element in elements:
            label_element = element.find('div', class_='header')
            if not label_element:
                continue

            label = label_element.text.strip()
            value_element = element.find('div', class_='description')

            if label == 'Sinópse':
                cover_img = value_element.img.extract()
                data['cover'] = cover_img.get('src')
                data['synopsis'] = value_element.text.strip()
            elif label in ('Arte', 'Autor'):
                data['authors'].append(value_element.text.strip())
            elif label == 'Gênero':
                for a_element in value_element.find_all('a'):
                    data['genres'].append(a_element.text.strip())
            elif label == 'Scantrad':
                for a_element in value_element.find_all('a'):
                    data['scanlators'].append(a_element.text.strip())
            elif label == 'Status':
                value = value_element.a.text.strip()

                if value == 'Em publicação':
                    data['status'] = 'ongoing'
                elif value == 'Completo':
                    data['status'] = 'complete'
                elif value == 'Cancelado':
                    data['status'] = 'suspended'
                elif value == 'Pausado':
                    data['status'] = 'hiatus'
            elif label == 'Capítulos':
                rows = value_element.find_all(
                    'div', class_='content')[0].table.tbody.find_all('tr')
                for tr_element in reversed(rows[1:]):
                    tds_elements = tr_element.find_all('td')

                    data['chapters'].append(
                        dict(
                            slug=tds_elements[0].a.get('href').split('/')[-1],
                            title=tds_elements[0].a.text.strip(),
                            date=convert_date_string(
                                tds_elements[1].text.strip(),
                                format='%d/%m/%Y'),
                        ))

        return data