Example #1
    def parse_detail(self, response):

        item_loader = Hao6vItemLoader(item=Hao6vItem(), response=response)

        item_loader.add_xpath(
            "title_detail", '//div[@class="col6"]/div[@class="box"]/h1/text()')
        item_loader.add_xpath(
            "title", '//div[@class="col6"]/div[@class="box"]/h1/text()')

        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))

        download_url_titles = response.xpath(
            '//div[@id="endText"]/table/tbody/tr/td/a/text()').extract()
        download_url_urls = response.xpath(
            '//div[@id="endText"]/table/tbody/tr/td/a/@href').extract()

        download_url = []
        # zip() pairs each link text with its href and stops at the shorter list,
        # avoiding an IndexError if the two lists ever differ in length.
        for link_title, link_url in zip(download_url_titles, download_url_urls):
            if link_url:
                url = link_title.strip() + " &nbsp;&nbsp;:<br><br>" + link_url
                download_url.append(url)

        item_loader.add_value('download_url', download_url)
        item_loader.add_value("crawl_time", datetime.now())

        movie_item = item_loader.load_item()

        if len(download_url) > 1:
            return movie_item
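
Every example on this page assumes two pieces of project code that are not shown here: a get_md5() helper used to build url_object_id, and a site-specific ItemLoader subclass. A minimal sketch of what they might look like (the TakeFirst processor and the exact class body are assumptions, not the original project code):

import hashlib

from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst  # itemloaders.processors in newer Scrapy


def get_md5(url):
    # Reduce the URL to a fixed-length hex digest, used as url_object_id.
    if isinstance(url, str):
        url = url.encode("utf-8")
    return hashlib.md5(url).hexdigest()


class Hao6vItemLoader(ItemLoader):
    # add_value()/add_xpath() collect lists; TakeFirst() keeps one value per field.
    # List-valued fields such as download_url are presumably joined by a
    # field-level output processor on the Item itself.
    default_output_processor = TakeFirst()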
Example #2
    def parse_detail(self, response):

        item_loader = Mkv99ItemLoader(item=Mkv99Item(), response=response)

        title_detail = response.css(".movieTitle h1").extract_first("")
        title_detail = remove_tags(title_detail).strip()
        item_loader.add_value('title_detail', title_detail)

        title = response.css(".nvc dl dd a:last-child::text").extract_first(
            "").strip()
        item_loader.add_value('title', title)

        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))

        a_titles = response.css('.adds div span a::text').extract()
        a_hrefs = response.css('.adds div span a::attr(href)').extract()
        download_url = []
        # zip() keeps the link texts and hrefs aligned even if one list is shorter.
        for a_title, a_href in zip(a_titles, a_hrefs):
            a = a_title.strip() + "&nbsp;&nbsp;:<br><br>" + a_href
            download_url.append(a)

        item_loader.add_value('download_url', download_url)
        item_loader.add_value("crawl_time", datetime.now())

        movie_item = item_loader.load_item()

        if len(download_url) > 1:
            return movie_item
Example #3
    def parse_detail(self, response):

        item_loader = XunleicangItemLoader(
            item=XunleicangItem(), response=response)

        item_loader.add_css('title_detail', "div.pleft h3 a:last-child")
        item_loader.add_css('title', ".moviecont .bt h1::text")

        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))

        download_url = []
        download_xls = response.xpath('//ul[@class="downurl"]/li/a')
        download_cls = response.xpath('//div[@id="cili"]/ul/li/a')

        for xl in download_xls:
            xl_title = remove_tags(xl.xpath('text()').extract_first(""))
            xl_url = xl.xpath('@href').extract_first("")
            download_url.append(xl_title.strip() +
                                "&nbsp;&nbsp;:<br><br>" + xl_url)

        for cl in download_cls:
            cl_title = remove_tags(cl.xpath('text()').extract_first(""))
            cl_url = cl.xpath('@href').extract_first("")
            download_url.append(cl_title.strip() +
                                "&nbsp;&nbsp;:<br><br>" + cl_url)

        item_loader.add_value('download_url', download_url)
        item_loader.add_value("crawl_time", datetime.now())

        movie_item = item_loader.load_item()

        if len(download_url) > 1:
            return movie_item
Example #4
    def parse_detail(self, response):

        item_loader = Dy2018ItemLoader(item=Dy2018Item(), response=response)

        item_loader.add_xpath("title_detail",
                              '//div[@class="title_all"]/h1/text()')
        item_loader.add_xpath("title", '//div[@class="title_all"]/h1/text()')

        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))

        # download_url =  response.css("#Zoom table tbody tr td a::attr(href)").extract()
        # item_loader.add_value('download_url', download_url)

        download_url_list = response.xpath('//a/text()').re(
            r'^(ftp.*)|(magnet.*)|(thunder.*)|(ed2k.*)|(.*?torrent)')
        download_url = []
        for url in download_url_list:
            if url:
                download_url.append(url.strip())

        item_loader.add_value('download_url', download_url)
        item_loader.add_value("crawl_time", datetime.now())

        movie_item = item_loader.load_item()

        if len(download_url) > 1:
            return movie_item
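
The `if url:` filter above is needed because of how Selector.re() handles patterns with several capture groups: parsel returns one entry per group, so the groups of the alternation that did not match come back as empty strings. A small standalone illustration with made-up HTML:

from scrapy import Selector

html = '<a>ftp://example.com/movie.mkv</a><a>magnet:?xt=urn:btih:abc</a>'
links = Selector(text=html).xpath('//a/text()').re(
    r'^(ftp.*)|(magnet.*)|(thunder.*)|(ed2k.*)|(.*?torrent)')
# links contains empty strings for the groups that did not match,
# so only the non-empty entries are kept and stripped.
download_url = [link.strip() for link in links if link]
print(download_url)  # ['ftp://example.com/movie.mkv', 'magnet:?xt=urn:btih:abc']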
Example #5
    def parse_detail(self, response):

        item_loader = LbldyItemLoader(item=LbldyItem(), response=response)

        item_loader.add_xpath("title_detail", '//div[@class="post"]/h2/text()')
        item_loader.add_xpath("title", '//div[@class="post"]/h2/text()')

        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))

        download_links = response.xpath('//div[@id="download_link"]/p/a')
        download_url = []

        for download_link in download_links:
            download_link_title = download_link.xpath('text()').extract_first(
                "").strip()
            download_link_url = download_link.xpath('@href').extract_first("")
            url = download_link_title + " &nbsp;&nbsp;:<br><br>" + download_link_url
            download_url.append(url)

        item_loader.add_value('download_url', download_url)
        item_loader.add_value("crawl_time", datetime.now())

        movie_item = item_loader.load_item()

        if len(download_url) > 1:
            return movie_item
Example #6
    def parse_detail(self, response):

        item_loader = MovieItemLoader(item=Ygdy8Item(), response=response)
        # Extract the fields with CSS selectors
        item_loader.add_value('url', response.url)
        item_loader.add_value('url_object_id', get_md5(response.url))

        item_loader.add_css('title_detail',
                            ".bd3r .co_area2 .title_all h1 font::text")
        item_loader.add_css('title',
                            ".bd3r .co_area2 .title_all h1 font::text")

        download_url = response.css(
            "#Zoom table tbody tr td a::attr(href)").extract()

        item_loader.add_value('download_url', download_url)
        item_loader.add_value('thunder_url', download_url)

        # item_loader.add_value("crawl_time", datetime.now().strftime(SQL_DATETIME_FORMAT))

        # title_detail = response.css(".bd3r .co_area2 .title_all h1 font::text").extract_first("")
        # match_re = re.match(r'.*?《(.*?)》.*',title_detail)
        # if match_re:
        #      title = match_re.group(1)
        # ftp_url = response.css("#Zoom table tbody tr td a::attr(href)").extract_first("")

        item_loader.add_value("crawl_time", datetime.now())

        movie_item = item_loader.load_item()

        if len(download_url) > 1:
            return movie_item
Example #7
    def parse_download(self, response):

        main_url = response.meta.get("main_url", "")
        title_detail = response.meta.get("title_detail", "")
        title = response.meta.get("title", "")
        item_loader = Yyj268ItemLoader(item=Yyj268Item(), response=response)
        item_loader.add_value('title_url', response.url)
        item_loader.add_value('title_detail', title_detail)
        item_loader.add_value('title', title)

        item_loader.add_value("url", main_url)
        item_loader.add_value("url_object_id", get_md5(main_url))

        # pw = response.css("center p strong font span::text").extract_first()
        # match_pw = re.match('^(\d+).*', pw)

        download_url_1 = response.css(
            ".desc center font strong a::attr(href)").extract_first("")
        download_url_2 = response.css(
            "center p strong span::text").extract_first("").strip()

        download_url = ""
        if download_url_1 and download_url_2:
            download_url = download_url_1 + "&nbsp;&nbsp;:<br><br>" + download_url_2
        elif download_url_1:
            download_url = download_url_1
        elif download_url_2:
            download_url = download_url_2

        item_loader.add_value("download_url", download_url)
        item_loader.add_value("crawl_time", datetime.now())

        movie_item = item_loader.load_item()
        if download_url_1 or download_url_2:
            return movie_item
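
parse_download() reads main_url, title_detail and title from response.meta, so the list-page callback has to attach them when it schedules this request. A sketch of what that upstream callback might look like (the method name, selectors and title handling are placeholders, not the original spider code; assumes `import scrapy` at module level):

    def parse(self, response):
        # Assumed list-page callback: forward the context parse_download needs via meta.
        for movie in response.css("h2 a"):
            title = movie.css("::text").extract_first("").strip()
            yield scrapy.Request(
                url=response.urljoin(movie.css("::attr(href)").extract_first("")),
                callback=self.parse_download,
                meta={
                    "main_url": response.url,
                    "title_detail": title,
                    "title": title,
                },
            )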
Example #8
    def parse_detail(self, response):

        item_loader = DysfzItemLoader(item=DysfzItem(), response=response)

        movie_title = response.xpath('//div[@class="main shadow"]/h1/text()')
        title_detail = movie_title.extract_first("")
        titles = movie_title.re(r'^(.*?)【.*')
        item_loader.add_value("title_detail", title_detail.strip())
        if titles:
            title = titles[0].strip()
        else:
            title = title_detail.strip()
        item_loader.add_value("title", title)

        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))

        download_links = response.xpath(
            '//div[@class="detail"]/p[contains(text(), "资源列表")]/following-sibling::p')
        download_url = ["资源列表&nbsp;&nbsp;:"]

        for download_link in download_links:
            download_link_title = download_link.xpath(
                'string(.)').extract_first("").strip()
            download_link_url = download_link.xpath(
                'a/@href').extract_first("")
            if download_link_url != download_link_title:
                url = download_link_title.strip() + "&nbsp;&nbsp;:<br><br>" + download_link_url
                download_url.append(url)
            else:
                download_url.append(download_link_url)

        item_loader.add_value('download_url', download_url)
        item_loader.add_value("crawl_time", datetime.now())

        movie_item = item_loader.load_item()

        if len(download_url) > 2:
            return movie_item
Example #9
    def parse_detail(self, response):

        item_loader = LoldyttwItemLoader(item=LoldyttwItem(),
                                         response=response)

        title_detail = response.xpath(
            '//div[@class="lm"]/h1/a/text()').extract_first("").strip()
        item_loader.add_value("title_detail", title_detail)
        item_loader.add_value("title", title_detail)

        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))

        download_url = []

        download_xl_cls = response.css('ul.downurl li a')
        for xl_cl in download_xl_cls:
            xl_cl_title = remove_tags(xl_cl.css('::text').extract_first(""))
            xl_cl_url = xl_cl.css('::attr(href)').extract_first("")
            download_url.append(xl_cl_title.strip() + "&nbsp;&nbsp;:<br><br>" +
                                xl_cl_url)

        download_bts = response.xpath('//div[@id="bt"]/ul/li/a')
        for bt in download_bts:
            bt_title = remove_tags(bt.xpath('text()').extract_first(""))
            bt_url = bt.xpath('@href').extract_first("")
            download_url.append(bt_title.strip() + "&nbsp;&nbsp;:<br><br>" +
                                bt_url)

        item_loader.add_value('download_url', download_url)
        item_loader.add_value("crawl_time", datetime.now())

        movie_item = item_loader.load_item()

        if len(download_url) > 1:
            return movie_item
Example #10
    def parse_detail(self, response):

        item_loader = BtbttItemLoader(item=BtbttItem(), response=response)

        movie_title = response.xpath(
            '//div[@class="bg1 border post"]/h2').xpath('string(.)')
        title_detail = movie_title.extract_first("").strip()
        titles = movie_title.re(r'.*?\[.*?BT.*?\]\[(.*?)\]\[.*?')
        if len(titles) == 0:
            title = title_detail.replace("[", "").replace("]", "")
        else:
            title = titles[0].strip()

        item_loader.add_value("title_detail", title_detail)
        item_loader.add_value("title", title)

        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))

        domain_url = response.meta.get("domain_url", "")
        download_url = []
        download_link_1 = response.xpath('//a[contains(text(),".torrent")]')
        if download_link_1:
            download_url.append("BT下载链接 &nbsp;&nbsp;:")
            for link in download_link_1:
                link_1_title = link.xpath("text()").extract_first("")
                link_1_url = domain_url + link.xpath(
                    "@href").extract_first("").replace("dialog", "download")
                link1 = link_1_title.strip() + "&nbsp;&nbsp;:<br><br>" + link_1_url
                download_url.append(link1)

        download_link_2_1 = response.xpath(
            '//a[contains(@href,"pan.baidu.com")]/../..')
        download_link_2_2 = " ".join(
            response.xpath('//div[@class="bg1 border post"]//p').xpath(
                "string(.)").extract())
        if download_link_2_1 or "pan.baidu.com" in download_link_2_2:
            download_url.append("网盘下载链接 &nbsp;&nbsp;:" + response.url)

        # download_link_2 = response.xpath('//a[contains(@href,"pan.baidu.com")]/../..')
        # download_link2_1 = response.xpath('//a[contains(@href,"pan.baidu.com")]/@href').extract()
        # link_2 = download_link_2.xpath('string(.)')
        # if "pan.baidu.com" in link_2:
        #     download_url.append("网盘下载链接 &nbsp;&nbsp;:<br><br>")
        #     match_re_1 = re.findall(r'(pan.baidu.com/s/\w+).*?(密码:\w{4})', link_2)
        #     for link in match_re_1:
        #         link2 = "  ".join(link)
        #         download_url.append(link2 + "&nbsp;&nbsp;<br><br>")
        #
        #     match_re_2 = re.findall(r'(pan.baidu.com/s/\w+).*?', link_2)
        #     for link in match_re_2:
        #         link2 = "  ".join(link)
        #         download_url.append(link2 + "&nbsp;&nbsp;<br><br>")
        # elif download_link2_1:
        #     for link in download_link2_1:
        #         download_url.append(link + "&nbsp;&nbsp;<br><br>")
        #     download_url.append("若需要密码,请访问更多下载链接页面")

        item_loader.add_value('download_url', download_url)
        item_loader.add_value("crawl_time", datetime.now())

        movie_item = item_loader.load_item()

        if len(download_url) > 1:
            return movie_item
Example #11
    def parse_detail(self, response):

        item_loader = MeijuttItemLoader(item=MeijuttItem(), response=response)

        movie_title = response.xpath('//div[@class="info-title"]')
        title = movie_title.xpath("h1/text()").extract_first("").strip()
        title_detail = title + \
            movie_title.xpath("text()").extract_first("").strip()

        item_loader.add_value("title", title)
        item_loader.add_value("title_detail", title_detail)

        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))

        download_url = []
        download_links = response.xpath('//strong[@class="down_part_name"]')
        download_link_1 = download_links.xpath('a[contains(text(),"字幕")]')
        # download_link_2 = download_links.xpath('a[contains(text(),"第")')
        # download_link_2 = download_links.xpath('a[starts-with(text(),"第")')
        download_link_2 = download_links.xpath(r'a[re:test(text(), "第\d+?集")]')

        download_link_3 = response.xpath('//div[@class="wp-list"]/ul/li')

        if len(download_link_1) > 0:
            download_url.append("电驴下载链接 &nbsp;&nbsp;:")
            for download_link in download_link_1:
                download_link_title = download_link.xpath(
                    'text()').extract_first("")
                download_link_url = download_link.xpath(
                    '@href').extract_first("")
                url = download_link_title.strip() + "&nbsp;&nbsp;:<br><br>" + download_link_url
                download_url.append(url)

        if len(download_link_2) > 0:
            download_url.append("磁力下载链接 &nbsp;&nbsp;:")
            for download_link in download_link_2:
                download_link_title = download_link.xpath(
                    'text()').extract_first("")
                download_link_url = download_link.xpath(
                    '@href').extract_first("")
                url = download_link_title.strip() + "&nbsp;&nbsp;:<br><br>" + download_link_url
                download_url.append(url)

        if len(download_link_3) > 0:
            download_url.append("百度网盘下载 &nbsp;&nbsp;:")
            for download_link in download_link_3:
                download_link_title = download_link.xpath(
                    'strong/text()').extract_first("")
                download_link_url = download_link.xpath(
                    'a/@href').extract_first("")
                download_link_passwd = download_link.xpath(
                    'span/text()').extract_first("")
                url = download_link_title.strip() + "&nbsp;&nbsp;:<br><br>" + \
                    download_link_url + "&nbsp;&nbsp;" + download_link_passwd.strip()
                download_url.append(url)

        item_loader.add_value('download_url', download_url)
        item_loader.add_value("crawl_time", datetime.now())

        movie_item = item_loader.load_item()

        if len(download_url) > 1:
            return movie_item
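
The re:test() predicate used for download_link_2 comes from the EXSLT regular-expressions extension, which parsel/Scrapy selectors register by default under the re namespace. A small standalone check with made-up markup:

from scrapy import Selector

html = ('<strong class="down_part_name">'
        '<a href="#">第01集</a><a href="#">中文字幕</a></strong>')
sel = Selector(text=html)
# re:test(text(), pattern) keeps only the <a> elements whose text matches the regex.
episodes = sel.xpath(
    r'//strong[@class="down_part_name"]/a[re:test(text(), "第\d+?集")]/text()'
).extract()
print(episodes)  # ['第01集']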