예제 #1
0
    def parse_search_html(self, res, stype):
        """Yield a metadata dict for every accepted entry of a search page.

        If the page contains a "次へ" (next) link, its URL is queued through
        ``self.add_urls`` before the entries are processed.

        Args:
            res: Response-like object exposing ``.text``. A falsy value ends
                the generator without yielding anything.
            stype: Passed to ``utils.gen_metadata_struck`` and stored under
                ``result['tag']['type']``.

        Yields:
            One dict per ``#list > li`` node whose link is accepted by
            ``self.get_url_type`` (return value >= 0). Iteration stops early
            once ``self.stoped`` is truthy.
        """
        if not res:
            return
        next_page = re.search(
            r'<li><a href="(http://www.dmm.co.jp/.+?page=\d*)/">次へ</a>',
            res.text, re.IGNORECASE)
        if next_page:
            self.add_urls(next_page.group(1))

        doc = pq(res.text)

        # The hit count belongs to the page, not to an entry: parse it once.
        # A caption that does not start with digits falls back to 0 instead
        # of raising and aborting the whole generator.
        total = 0
        total_node = doc.find('div.list-boxcaptside.list-boxpagenation > p')
        if total_node:
            total_match = re.match(r'(\d+).*', total_node.text())
            if total_match:
                total = int(total_match.group(1))

        for li in doc.find('#list > li').items():
            if self.stoped:
                break
            sell = li.find('p.sublink a')
            url = li.find('p.tmb a').attr('href')
            if self.get_url_type(url) < 0:
                continue

            thumb = li.find('p.tmb a img')
            src_url = 'http:' + thumb.attr('src')
            title = thumb.attr('alt')

            result = utils.gen_metadata_struck(stype)
            # Only fill the title fields that this metadata layout defines.
            for key in ('标题', '电视节目标题', '集标题'):
                if key in result:
                    result[key] = title
            result['级别'] = 'R18+'
            try:
                result['评级'] = self.format_rate_str(
                    li.find('div.value p.rate').text())
            except Exception:
                # Rating is optional; keep the entry without it.
                pass

            tag = result['tag']
            tag['type'] = stype
            tag['dital_url'] = url
            tag['video_id'] = re.search(r'cid=(.+)/', url).group(1)

            tag['backdrop'] = utils.tim_img_bytes(
                self.download_page_request(
                    self.get_full_src(src_url)).content)
            tag['poster'] = utils.create_poster(tag['backdrop'])
            tag['total'] = total
            tag['tip'] = sell.text() if sell else ''

            yield result
    def parse_dital(self, html, meta):
        """Fill *meta* from a detail page and yield it once.

        Args:
            html: Markup of the detail page; falsy input ends the generator.
            meta: Metadata dict to update in place; falsy input ends the
                generator.

        Yields:
            The same *meta* object after release date, summary, tagline,
            genres, actors and cover images have been filled in where the
            page provides them.
        """
        if not (html and meta):
            return
        page = pq(html)

        # Release date: written to whichever date fields the layout defines.
        try:
            released = page(
                '#detail_box > table > tr:nth-child(7) > td.movie_table_td2'
            ).text()
            for key in ('发布日期', '发布日期(电视节目)', '发布日期(集)'):
                if key in meta:
                    meta[key] = released
        except AttributeError:
            pass

        # Summary, plus a tagline made of its first 30 characters.
        try:
            summary = page('#comment').text()
            meta['摘要'] = summary.strip()
            meta['标语'] = meta['摘要'][:30]
        except AttributeError:
            pass

        # Genres, comma-separated.
        try:
            genre_nodes = page(
                '#detail_box > table > tr:nth-child(5) > td.movie_table_td2 > div'
            ).items()
            meta['类型'] = ','.join(node.text().strip() for node in genre_nodes)
        except AttributeError:
            pass

        # Actors, comma-separated.
        try:
            actor_nodes = page(
                '#detail_box > table >tr:nth-child(1) a').items()
            meta['演员'] = ','.join(node.text().strip() for node in actor_nodes)
        except AttributeError:
            pass

        # Cover: the URL is read from an inline "var imgurl = '...'" script
        # line; a missing match surfaces as AttributeError and is skipped.
        try:
            img_match = re.search(r"var imgurl = '(http://.*?jpg)';", html,
                                  re.S)
            raw = self.download_page_request(img_match.group(1)).content
            tag = meta['tag']
            tag['backdrop'] = utils.tim_img_bytes(raw)
            tag['poster'] = utils.create_poster(tag['backdrop'])
        except AttributeError:
            pass

        yield meta
예제 #3
0
    def parse_url_search(self, res, stype='movie'):
        """Build a metadata dict from the JSON-LD block of a detail page.

        Args:
            res: Response-like object exposing ``.text`` and ``.url``.
            stype: Passed to ``utils.gen_metadata_struck`` and stored under
                ``result['tag']['type']``.

        Returns:
            ``None`` when *res* is falsy; otherwise the metadata dict.
            Extraction is best effort: any failure leaves the remaining
            fields at the values ``utils.gen_metadata_struck`` produced.
            ``result['dital_url']`` is always set to ``res.url``.
        """
        if not res:
            return
        result = utils.gen_metadata_struck(stype)
        try:
            json_ld = json.loads(
                re.search(
                    r'<script type="application/ld\+json">(.*?)</script>',
                    res.text, re.S).group(1))

            title = json_ld.get('name')
            for key in ('标题', '电视节目标题', '集标题'):
                if key in result:
                    result[key] = title

            result['级别'] = 'R18+'
            # The rating is optional. Isolate it so that a page without
            # "aggregateRating" still gets its id, URL and images filled in
            # instead of aborting the rest of this block.
            try:
                result['评级'] = self.format_rate_str(
                    json_ld.get('aggregateRating').get('ratingValue'))
            except Exception:
                pass

            tag = result['tag']
            tag['type'] = stype
            tag['dital_url'] = res.url
            tag['video_id'] = re.search(r'cid=(.+)/', res.url).group(1)

            tag['backdrop'] = utils.tim_img_bytes(
                self.download_page_request(
                    self.get_full_src(json_ld.get('image'))).content)
            tag['poster'] = utils.create_poster(tag['backdrop'])
            tag['total'] = 0
            tag['tip'] = ''

        except Exception:
            # Best effort: return whatever was collected so far.
            pass

        result['dital_url'] = res.url

        return result
    def parse_url_search(self, res, stype='movie'):
        """Build a metadata dict from a detail page parsed with PyQuery.

        Args:
            res: Response-like object exposing ``.text`` and ``.url``; its
                ``encoding`` attribute is set to ``'utf-8'`` before reading.
            stype: Passed to ``utils.gen_metadata_struck`` and stored under
                ``result['tag']['type']``.

        Returns:
            ``None`` when *res* is falsy; otherwise the metadata dict. Fields
            are only filled when the page has a non-empty title, and any
            failure leaves the remaining fields untouched.
        """
        if not res:
            return
        res.encoding = 'utf-8'
        result = utils.gen_metadata_struck(stype)
        try:
            doc = PyQuery(res.text)
            title = doc('#contents > form > div.detailed_title').text()
            if title:
                for key in ('标题', '电视节目标题', '集标题'):
                    if key in result:
                        result[key] = title

                result['级别'] = 'R18+'

                tag = result['tag']
                tag['type'] = stype
                tag['dital_url'] = res.url
                tag['video_id'] = re.search(
                    r'id=(\d+)', tag['dital_url']).group(1)
                tag['tip'] = doc(
                    '#contents > form > div.item_detail > ul > li:nth-child(2) > a'
                ).text()

                # Test the raw src before joining: urljoin() hands back the
                # base URL when the second argument is empty, which would
                # make the page itself look like a poster URL.
                poster_src = doc(
                    '#contents > form > div.item_detail > div.item600 > img'
                ).attr('src')
                if poster_src:
                    poster_url = urljoin(res.url, poster_src)
                    tag['backdrop'] = utils.tim_img_bytes(
                        self.download_page_request(poster_url).content)
                    tag['poster'] = utils.create_poster(tag['backdrop'])
        except Exception:
            # Best effort: return whatever was collected so far.
            pass

        return result
    def parse_dital(self, html, meta):
        """Fill *meta* from a detail page, yield it, then yield sample images.

        Args:
            html: Markup of the detail page; falsy input ends the generator.
            meta: Metadata dict to update in place; falsy input ends the
                generator. ``meta['tag']['dital_url']`` is used as the base
                for relative image URLs.

        Yields:
            First the updated *meta*; afterwards the raw bytes of every
            ``#contents > form > div.item_cap > img`` image that has a src.
        """
        if not html or not meta:
            return
        doc = PyQuery(html)

        try:
            title = doc('#contents > form > div.detailed_title').text()
            for key in ('标题', '电视节目标题', '集标题'):
                if key in meta:
                    meta[key] = title
        except AttributeError:
            pass

        try:
            meta['摘要'] = doc(
                '#contents > form > div.item_detail > div:nth-child(5)').text(
                ).strip()
            # Tagline is the first 30 characters of the summary.
            meta['标语'] = meta['摘要'][:30]
        except AttributeError:
            pass

        try:
            meta['类型'] = ','.join([
                genre.text() for genre in
                doc('#contents > form > div.item_detail > ul > li:nth-child(4) > a'
                    ).items()
            ])
        except AttributeError:
            pass

        try:
            actors = doc(
                '#contents > form > div.item_detail > ul > li:nth-child(2) > a'
            ).items()
            meta['演员'] = ','.join([actor.text().strip() for actor in actors])
        except AttributeError:
            pass

        try:
            writers = doc(
                '#contents > form > div.item_detail > ul > li:nth-child(3) > a'
            ).items()
            meta['作者'] = ','.join(
                [writer.text().strip() for writer in writers])
        except AttributeError:
            pass

        try:
            # Test the raw src before joining, exactly as the thumbnail loop
            # below does: urljoin() returns the base URL when src is empty,
            # which would otherwise cause the HTML page to be fetched and
            # decoded as an image.
            poster_src = doc(
                '#contents > form > div.item_detail > div.item600 > img'
            ).attr('src')
            if poster_src:
                poster_url = urljoin(
                    meta.get('tag').get('dital_url'), poster_src)
                meta['tag']['backdrop'] = utils.tim_img_bytes(
                    self.download_page_request(poster_url).content)
                meta['tag']['poster'] = utils.create_poster(
                    meta['tag']['backdrop'])
        except AttributeError:
            pass
        yield meta

        try:
            # Thumbnails
            sample_urls = doc('#contents > form > div.item_cap > img').items()
            for sample_url in sample_urls:
                nail_url = sample_url.attr('src')
                if nail_url:
                    nail_url = urljoin(
                        meta.get('tag').get('dital_url'), nail_url)

                    yield self.download_page_request(nail_url).content

        except Exception:
            # Best effort: stop streaming thumbnails on the first failure.
            pass
    def parse_dital(self, html, meta, types='movie', search=False, url=''):
        """Fill *meta* from a detail page, yield it, then yield sample images.

        If the page is an interstitial carrying a "Click here to continue..."
        link, that link is fetched first and its body replaces *html*; a
        non-200 answer ends the generator.

        Args:
            html: Markup of the detail page; falsy input ends the generator.
            meta: Metadata dict to update in place; falsy input ends the
                generator.
            types: Page layout selector. ``'movies'`` and ``'content'`` are
                the only values handled.
                NOTE(review): the default ``'movie'`` matches neither branch,
                so a call relying on the default yields nothing - confirm
                whether ``'movies'`` was intended.
            search: When truthy, only *meta* is yielded and the sample images
                are skipped.
            url: Not used by this method.

        Yields:
            First the updated *meta*; afterwards, unless *search* is truthy,
            whatever ``self.parse_thumbel_page`` yields for the gallery URL.
        """
        if not html or not meta:
            return

        title_keys = ('标题', '电视节目标题', '集标题')
        date_keys = ('发布日期', '发布日期(电视节目)', '发布日期(集)')

        relock = re.search(r'<a href="(.*?)">Click here to continue\.\.\.</a>',
                           html)
        if relock:
            res = self.RequestSession.get(relock.group(1),
                                          timeout=utils.DOWN_TIME_OUT)
            self.add_log('parse_search_html 重置:', relock.group(1))
            # Compare the status code: the response object itself never
            # equals 200, which made this path always bail out.
            if res.status_code == 200:
                html = res.text
            else:
                return

        if types == 'movies':
            soup = BeautifulSoup(html, "lxml")

            try:
                title = soup.select_one(
                    '#centered > div.p8 > div:nth-of-type(1) > h1').text
                for key in title_keys:
                    if key in meta:
                        meta[key] = title
            except Exception:
                pass

            div_main = soup.select_one(
                '#centered > div.p8 > div:nth-of-type(7) > div')

            # A missing main container must not crash the generator; the
            # per-field blocks below then simply find nothing.
            main_text = div_main.text if div_main is not None else ''
            is_backdrop = re.search('(Click to Enlarge Front & Back Cover)',
                                    main_text, re.S) is not None
            try:
                post_url = div_main.select_one('div:nth-of-type(1) > a').get(
                    'href')
                if is_backdrop:
                    # The linked image is used as the backdrop and the
                    # poster is derived from it.
                    meta['tag']['backdrop'] = utils.tim_img_bytes(
                        self.download_page_request(post_url).content)
                    meta['tag']['poster'] = utils.create_poster(
                        meta['tag']['backdrop'])
                else:
                    # Two separate images: the first becomes the poster, the
                    # second is merged with it to form the backdrop.
                    meta['tag']['poster'] = utils.tim_img_bytes(
                        self.download_page_request(post_url).content)
                    backdrop_url = div_main.select_one(
                        'div:nth-of-type(1) > p > a').get('href')
                    meta['tag']['backdrop'] = utils.tim_img_bytes(
                        self.download_page_request(backdrop_url).content)
                    meta['tag']['backdrop'] = utils.merge_image(
                        meta['tag']['poster'], meta['tag']['backdrop'])
            except Exception:
                pass

            meta['级别'] = 'R18+'

            try:
                # The release date sits in the first or second paragraph.
                year = div_main.select_one('div.gen12 > p:nth-of-type(1)').text
                year = re.search(r'Release date: (.*?\d{4})', year)
                if not year:
                    year = div_main.select_one(
                        'div.gen12 > p:nth-of-type(2)').text
                    year = re.search(r'Release date: (.*?\d{4})', year)

                year = year.group(1)
                for key in date_keys:
                    if key in meta:
                        meta[key] = utils.format_date_str(year)
            except Exception:
                pass

            try:
                summary = div_main.select_one(
                    'div.gen12 > p.gen12').text.strip()
                # Drop the label as a prefix. str.strip('Description:')
                # removes any of those characters from both ends and eats
                # real text.
                if summary.startswith('Description:'):
                    summary = summary[len('Description:'):].strip()
                meta['摘要'] = summary
                meta['标语'] = meta['摘要'][:30]
            except Exception:
                pass

            try:
                g_str = re.search('<b>Categories:</b>(.*?)<b>Description:',
                                  str(div_main), re.S).group(1)
                geres = []
                geres.extend(
                    re.findall(r'<a href=".*?">(.*?)</a>', g_str, re.S))
                geres.extend(
                    re.findall(r'<span class="gensmall">(.*?)</span>',
                               g_str, re.S))
                # Entries containing ':' are dropped.
                geres = [g for g in geres if ':' not in g]
                meta['类型'] = ','.join(geres)
            except Exception:
                pass

            try:
                actor_div = soup.select(
                    '#centered > div.p8 > div:nth-of-type(10) > div > div > div'
                )
                actors = [actor.text.strip() for actor in actor_div]
                actor_more = soup.select(
                    '#centered > div.p8 > div:nth-of-type(10) > div > p:nth-of-type(2) > a'
                )
                if actor_more:
                    actors.extend([actor.text.strip() for actor in actor_more])
                meta['演员'] = ','.join(actors)
            except Exception:
                pass

            try:
                # The studio/director line is the second or third paragraph.
                w_d = div_main.select_one('div.gen12 > p:nth-of-type(2)')
                writers = re.search(
                    r'<b>(?:Site|Studio):</b> (.*?) \| <b>Director:</b>(.*?)</p>',
                    str(w_d), re.S)
                if not writers:
                    w_d = div_main.select_one('div.gen12 > p:nth-of-type(3)')
                    writers = re.search(
                        r'<b>(?:Site|Studio):</b> (.*?) \| <b>Director:</b>(.*?)</p>',
                        str(w_d), re.S)

                if writers:
                    if writers.group(1):
                        writer = writers.group(1)
                        # Unwrap the anchor when the name is a link.
                        writer_a = re.search(r'<a.*?>(.*)</a>', writer)
                        if writer_a:
                            writer = writer_a.group(1)
                        meta['作者'] = ','.join([writer.strip()])

                    if writers.group(2):
                        director = writers.group(2)
                        director_a = re.search(r'<a.*?>(.*)</a>', director)
                        if director_a:
                            director = director_a.group(1)
                        meta['导演'] = ','.join([director.strip()])

            except Exception:
                pass

            yield meta

            # Thumbnails
            if not search:
                try:
                    # Reuse the tree parsed above; the last link is handed
                    # to parse_thumbel_page.
                    samples = soup.select(
                        '#centered > div.p8 > div:nth-of-type(15) > a')
                    sample = samples[-1].get('href')
                    for im in self.parse_thumbel_page(sample):
                        yield im
                except Exception:
                    pass
        elif types == 'content':
            doc = pq(html)
            try:
                title = doc(
                    '#centered > div.p8 > div:nth-child(1) > h1').text()
                for key in title_keys:
                    if key in meta:
                        meta[key] = title
            except Exception:
                pass

            try:
                post_url = doc('#moviewrap > img').attr('src')

                meta['tag']['backdrop'] = utils.tim_img_bytes(
                    self.download_page_request(post_url).content)
                meta['tag']['poster'] = utils.create_poster(
                    meta['tag']['backdrop'], middle=True)

            except Exception:
                pass

            meta['级别'] = 'R18+'

            try:
                year = doc(
                    '#centered > div.p8 > div:nth-child(7) > div:nth-child(3) > p:nth-child(2) > span > a'
                ).text()

                for key in date_keys:
                    if key in meta:
                        meta[key] = utils.format_date_str(year)

            except Exception:
                pass

            try:
                story = doc(
                    '#centered > div.p8 > div:nth-child(7) > div:nth-child(3) > div.gen12 > p'
                ).text().strip()
                # Drop the label as a prefix; see the note on 'Description:'.
                if story.startswith('Story:'):
                    story = story[len('Story:'):].strip()
                meta['摘要'] = story
                meta['标语'] = meta['摘要'][:30]
            except Exception:
                pass

            try:
                div_gener = doc(
                    '#centered > div.p8 > div:nth-child(7) > div:nth-child(3) > div:nth-child(5) > div'
                ).find('a').items()
                meta['类型'] = ','.join([x.text().strip() for x in div_gener])
            except Exception:
                pass

            # Fall back to the next sibling block when the first one gave no
            # genres; .get() keeps a missing key from raising.
            if not meta.get('类型'):
                try:
                    div_gener = doc(
                        '#centered > div.p8 > div:nth-child(7) > div:nth-child(3) > div:nth-child(6) > div'
                    ).find('a').items()
                    meta['类型'] = ','.join(
                        [x.text().strip() for x in div_gener])
                except Exception:
                    pass

            try:
                actors_p = doc(
                    '#centered > div.p8 > div:nth-child(7) > div:nth-child(3) > p:nth-child(4)'
                ).find('a.bold').items()

                meta['演员'] = ','.join([x.text().strip() for x in actors_p])
            except Exception:
                pass
            try:
                # Same fallback idea for the actor list.
                if not meta['演员']:
                    actors_p = doc(
                        '#centered > div.p8 > div:nth-child(7) > div:nth-child(3) > p:nth-child(5)'
                    ).find('a.bold').items()

                    meta['演员'] = ','.join([x.text().strip() for x in actors_p])
            except Exception:
                pass

            try:
                # The same link text is stored as both author and director.
                d_w_p = doc(
                    '#centered > div.p8 > div:nth-child(7) > div:nth-child(3) > p:nth-child(3) > a'
                ).text()
                meta['作者'] = ','.join([d_w_p])
                meta['导演'] = ','.join([d_w_p])

            except Exception:
                pass

            yield meta

            # Thumbnails
            if not search:
                try:
                    # strip('images') removes those characters from both
                    # ends; digits are not in that set, so the number stays.
                    count = doc(
                        '#centered > div.p8 > div:nth-child(13) > div > p > b'
                    ).text().strip('images').strip()
                    father = doc(
                        '#centered > div.p8 > div:nth-child(13) > div > div'
                    ).find('a').attr('href')
                    pa = re.search(r'(http://.*/)(\d+)', father)

                    http = pa.group(1)
                    # Gallery URL: base path plus the image count, zero
                    # padded to at least two digits.
                    father_url = '{}{:0>2d}'.format(http, int(count))

                    for im in self.parse_thumbel_page(father_url):
                        yield im
                except Exception:
                    pass
    def parse_dital(self, html, meta):
        """Fill *meta* from a detail page, yield it, then yield gallery images.

        Args:
            html: Markup of the detail page; falsy input ends the generator.
            meta: Metadata dict to update in place; falsy input ends the
                generator. ``meta['tag']['video_id']`` is used to build the
                cover URL and ``meta['tag']['dital_url']`` is the base for
                gallery links.

        Yields:
            First the updated *meta*; afterwards the raw bytes of every
            ``a.fancy-gallery`` target whose resolved URL does not contain
            ``'member'``.
        """
        if not (html and meta):
            return
        page = PyQuery(html)

        # Most fields live below this container.
        info = '#main-content > div.main-content-movieinfo > div.movie-info'

        try:
            released = page(info + ' > dl:nth-child(3) > dd').text()
            for key in ('发布日期', '发布日期(电视节目)', '发布日期(集)'):
                if key in meta:
                    meta[key] = released
        except AttributeError:
            pass

        try:
            summary = page(
                '#main-content > div.main-content-movieinfo > div.movie-comment > p'
            ).text()
            meta['摘要'] = summary.strip()
            # Tagline is the first 30 characters of the summary.
            meta['标语'] = meta['摘要'][:30]
        except AttributeError:
            pass

        try:
            # Genre list = category entries followed by the sixth <dl> block.
            labels = [
                node.text().strip()
                for node in page(info + ' > dl.movie-info-cat > dd').items()
            ]
            labels.extend(
                node.text().strip()
                for node in page(info + ' > dl:nth-child(6) > dd').items())
            meta['类型'] = ','.join(labels)
        except AttributeError:
            pass

        try:
            cast = page(info + ' > dl:nth-child(1) > dd').items()
            meta['演员'] = ','.join(node.text().strip() for node in cast)
        except AttributeError:
            pass

        try:
            # The author field is a fixed string for this parser.
            meta['作者'] = 'カリビアンコム'
        except AttributeError:
            pass

        try:
            stars = page(info + ' > dl:nth-child(5) > dd').text()
            if stars:
                # 20 points per character of the rating text.
                meta['评级'] = str(20 * len(stars))
        except AttributeError:
            pass

        try:
            cover_url = (
                'https://www.caribbeancom.com/moviepages/{}/images/l_l.jpg'
            ).format(meta['tag']['video_id'])
            raw = self.download_page_request(cover_url).content
            meta['tag']['backdrop'] = utils.tim_img_bytes(raw)
            meta['tag']['poster'] = utils.create_poster(
                meta['tag']['backdrop'])
        except AttributeError:
            pass
        yield meta

        try:
            # Thumbnails
            for link in page('a.fancy-gallery').items():
                href = link.attr('href')
                if not href:
                    continue
                href = urljoin(meta.get('tag').get('dital_url'), href)
                # URLs containing 'member' are skipped.
                if 'member' not in href:
                    yield self.download_page_request(href).content
        except Exception:
            pass