def parse_detail(self, response):
    """Extract a Hao6v movie detail page into a Hao6vItem.

    Scrapes the title, the page URL (plus its md5 id) and the list of
    download links from the #endText table.  Returns the loaded item
    only when more than one download link was found; otherwise the
    page is dropped.
    """
    item_loader = Hao6vItemLoader(item=Hao6vItem(), response=response)
    item_loader.add_xpath(
        "title_detail", '//div[@class="col6"]/div[@class="box"]/h1/text()')
    item_loader.add_xpath(
        "title", '//div[@class="col6"]/div[@class="box"]/h1/text()')
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    download_url_titles = response.xpath(
        '//div[@id="endText"]/table/tbody/tr/td/a/text()').extract()
    download_url_urls = response.xpath(
        '//div[@id="endText"]/table/tbody/tr/td/a/@href').extract()
    # zip() pairs each anchor text with its href and stops at the
    # shorter list -- this avoids the IndexError the old
    # range(len(...)) loop could raise when the two parallel lists
    # differ in length.
    download_url = [
        title.strip() + " :<br><br>" + href
        for title, href in zip(download_url_titles, download_url_urls)
        if href
    ]
    item_loader.add_value('download_url', download_url)
    item_loader.add_value("crawl_time", datetime.now())
    movie_item = item_loader.load_item()
    if len(download_url) > 1:
        return movie_item
def parse_detail(self, response):
    """Extract a Mkv99 movie detail page into a Mkv99Item.

    Scrapes the (tag-stripped) detail title, the short title from the
    breadcrumb, the page URL plus its md5 id, and the download links
    from the .adds block.  Returns the loaded item only when more than
    one download link was found.
    """
    item_loader = Mkv99ItemLoader(item=Mkv99Item(), response=response)
    title_detail = response.css(".movieTitle h1").extract_first("")
    title_detail = remove_tags(title_detail).strip()
    item_loader.add_value('title_detail', title_detail)
    title = response.css(
        ".nvc dl dd a:last-child::text").extract_first("").strip()
    item_loader.add_value('title', title)
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    a_titles = response.css('.adds div span a::text').extract()
    a_hrefs = response.css('.adds div span a::attr(href)').extract()
    # zip() pairs each anchor text with its href and stops at the
    # shorter list, avoiding the IndexError the old range(len(...))
    # loop could raise when the two parallel lists differ in length.
    download_url = [
        text.strip() + " :<br><br>" + href
        for text, href in zip(a_titles, a_hrefs)
    ]
    item_loader.add_value('download_url', download_url)
    item_loader.add_value("crawl_time", datetime.now())
    movie_item = item_loader.load_item()
    if len(download_url) > 1:
        return movie_item
def parse_detail(self, response):
    """Extract a Xunleicang movie detail page into a XunleicangItem.

    Collects download links from both the thunder list (.downurl) and
    the magnet list (#cili).  Returns the loaded item only when more
    than one download link was found.
    """
    item_loader = XunleicangItemLoader(
        item=XunleicangItem(), response=response)
    item_loader.add_css('title_detail', "div.pleft h3 a:last-child")
    item_loader.add_css('title', ".moviecont .bt h1::text")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    download_url = []
    download_xls = response.xpath('//ul[@class="downurl"]/li/a')
    download_cls = response.xpath('//div[@id="cili"]/ul/li/a')
    # Both link lists share the exact same extraction logic, so walk
    # them in one loop instead of two duplicated copies.
    for links in (download_xls, download_cls):
        for link in links:
            link_title = remove_tags(link.xpath('text()').extract_first(""))
            link_url = link.xpath('@href').extract_first("")
            download_url.append(link_title.strip() + " :<br><br>" + link_url)
    item_loader.add_value('download_url', download_url)
    item_loader.add_value("crawl_time", datetime.now())
    movie_item = item_loader.load_item()
    if len(download_url) > 1:
        return movie_item
def parse_detail(self, response):
    """Extract a Dy2018 movie detail page into a Dy2018Item.

    Download links are recognised by protocol prefix (ftp / magnet /
    thunder / ed2k) or a .torrent suffix in anchor text.  Returns the
    loaded item only when more than one download link was found.
    """
    item_loader = Dy2018ItemLoader(item=Dy2018Item(), response=response)
    item_loader.add_xpath("title_detail",
                          '//div[@class="title_all"]/h1/text()')
    item_loader.add_xpath("title", '//div[@class="title_all"]/h1/text()')
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    # Selector.re() with alternation groups yields one entry per group,
    # with empty strings for groups that did not match -- filter those
    # out while stripping the real matches.
    download_url_list = response.xpath('//a/text()').re(
        r'^(ftp.*)|(magnet.*)|(thunder.*)|(ed2k.*)|(.*?torrent)')
    download_url = [url.strip() for url in download_url_list if url]
    item_loader.add_value('download_url', download_url)
    item_loader.add_value("crawl_time", datetime.now())
    movie_item = item_loader.load_item()
    if len(download_url) > 1:
        return movie_item
def parse_detail(self, response):
    """Scrape one Lbldy movie page and build a LbldyItem from it.

    Download links come from the #download_link block; each entry is
    "anchor text :<br><br>href".  The item is returned only when more
    than one link exists, otherwise the page is dropped.
    """
    loader = LbldyItemLoader(item=LbldyItem(), response=response)
    loader.add_xpath("title_detail", '//div[@class="post"]/h2/text()')
    loader.add_xpath("title", '//div[@class="post"]/h2/text()')
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))
    download_url = []
    for anchor in response.xpath('//div[@id="download_link"]/p/a'):
        text = anchor.xpath('text()').extract_first("").strip()
        href = anchor.xpath('@href').extract_first("")
        download_url.append(text + " :<br><br>" + href)
    loader.add_value('download_url', download_url)
    loader.add_value("crawl_time", datetime.now())
    movie_item = loader.load_item()
    if len(download_url) > 1:
        return movie_item
def parse_detail(self, response):
    """Extract a Ygdy8 movie detail page into a Ygdy8Item.

    Both download_url and thunder_url receive the same list of links
    scraped from the #Zoom download table.  Returns the loaded item
    only when more than one download link was found.
    """
    item_loader = MovieItemLoader(item=Ygdy8Item(), response=response)
    item_loader.add_value('url', response.url)
    item_loader.add_value('url_object_id', get_md5(response.url))
    item_loader.add_css('title_detail',
                        ".bd3r .co_area2 .title_all h1 font::text")
    item_loader.add_css('title', ".bd3r .co_area2 .title_all h1 font::text")
    download_url = response.css(
        "#Zoom table tbody tr td a::attr(href)").extract()
    item_loader.add_value('download_url', download_url)
    item_loader.add_value('thunder_url', download_url)
    item_loader.add_value("crawl_time", datetime.now())
    movie_item = item_loader.load_item()
    if len(download_url) > 1:
        return movie_item
def parse_download(self, response):
    """Build a Yyj268Item from a download page reached via meta data.

    title/title_detail/main page URL arrive through response.meta; the
    download URL is assembled from up to two link sources on the page.
    The item is returned only when at least one source yielded a value.
    """
    main_url = response.meta.get("main_url", "")
    title_detail = response.meta.get("title_detail", "")
    title = response.meta.get("title", "")
    loader = Yyj268ItemLoader(item=Yyj268Item(), response=response)
    loader.add_value('title_url', response.url)
    loader.add_value('title_detail', title_detail)
    loader.add_value('title', title)
    loader.add_value("url", main_url)
    loader.add_value("url_object_id", get_md5(main_url))
    download_url_1 = response.css(
        ".desc center font strong a::attr(href)").extract_first("")
    download_url_2 = response.css(
        "center p strong span::text").extract_first("").strip()
    # Join whichever parts are non-empty -- equivalent to the four-way
    # both / only-1 / only-2 / neither branch.
    download_url = " :<br><br>".join(
        part for part in (download_url_1, download_url_2) if part)
    loader.add_value("download_url", download_url)
    loader.add_value("crawl_time", datetime.now())
    movie_item = loader.load_item()
    if download_url_1 or download_url_2:
        return movie_item
def parse_detail(self, response):
    """Scrape one Dysfz movie page into a DysfzItem.

    The short title is the page heading with any trailing 【...】 part
    stripped.  The download list always begins with a "资源列表 :"
    header entry, so the item is returned only when more than two
    entries were collected (i.e. at least two real links).
    """
    loader = DysfzItemLoader(item=DysfzItem(), response=response)
    heading = response.xpath('//div[@class="main shadow"]/h1/text()')
    title_detail = heading.extract_first("")
    short_titles = heading.re(r'^(.*?)【.*')
    loader.add_value("title_detail", title_detail.strip())
    title = short_titles[0].strip() if short_titles else title_detail.strip()
    loader.add_value("title", title)
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))
    link_nodes = response.xpath(
        '//div[@class="detail"]/p[contains(text(), "资源列表")]/following-sibling::p')
    download_url = ["资源列表 :"]
    for node in link_nodes:
        text = node.xpath('string(.)').extract_first("").strip()
        href = node.xpath('a/@href').extract_first("")
        if href == text:
            # The paragraph text is exactly the link URL -- no label.
            download_url.append(href)
        else:
            download_url.append(text + " :<br><br>" + href)
    loader.add_value('download_url', download_url)
    loader.add_value("crawl_time", datetime.now())
    movie_item = loader.load_item()
    if len(download_url) > 2:
        return movie_item
def parse_detail(self, response):
    """Scrape one Loldyttw movie page into a LoldyttwItem.

    Collects links from the .downurl list and from the #bt section,
    formatting each as "anchor text :<br><br>href".  The item is
    returned only when more than one link was found.
    """
    loader = LoldyttwItemLoader(item=LoldyttwItem(), response=response)
    title_detail = response.xpath(
        '//div[@class="lm"]/h1/a/text()').extract_first("").strip()
    loader.add_value("title_detail", title_detail)
    loader.add_value("title", title_detail)
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))
    download_url = []
    for anchor in response.css('ul.downurl li a'):
        text = remove_tags(anchor.css('::text').extract_first(""))
        href = anchor.css('::attr(href)').extract_first("")
        download_url.append(text.strip() + " :<br><br>" + href)
    for anchor in response.xpath('//div[@id="bt"]/ul/li/a'):
        text = remove_tags(anchor.xpath('text()').extract_first(""))
        href = anchor.xpath('@href').extract_first("")
        download_url.append(text.strip() + " :<br><br>" + href)
    loader.add_value('download_url', download_url)
    loader.add_value("crawl_time", datetime.now())
    movie_item = loader.load_item()
    if len(download_url) > 1:
        return movie_item
def parse_detail(self, response):
    """Scrape one Btbtt forum post into a BtbttItem.

    Torrent attachment links are rewritten from the "dialog"
    confirmation page to the direct "download" endpoint.  When the
    post mentions pan.baidu.com, only the post URL itself is recorded
    (the pan links/passwords live in free-form text).  Returns the
    loaded item only when more than one entry was collected.
    """
    item_loader = BtbttItemLoader(item=BtbttItem(), response=response)
    movie_title = response.xpath(
        '//div[@class="bg1 border post"]/h2').xpath('string(.)')
    title_detail = movie_title.extract_first("").strip()
    titles = movie_title.re(r'.*?\[.*?BT.*?\]\[(.*?)\]\[.*?')
    if len(titles) == 0:
        # No bracketed name matched -- fall back to the full heading
        # with the square brackets stripped out.
        title = title_detail.replace("[", "").replace("]", "")
    else:
        title = titles[0].strip()
    item_loader.add_value("title_detail", title_detail)
    item_loader.add_value("title", title)
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    domain_url = response.meta.get("domain_url", "")
    download_url = []
    torrent_links = response.xpath('//a[contains(text(),".torrent")]')
    if torrent_links:
        download_url.append("BT下载链接 :")
        for link in torrent_links:
            link_title = link.xpath("text()").extract_first("")
            # Attachment hrefs point at a confirmation dialog; swap in
            # the direct download endpoint and prefix the site domain.
            link_url = domain_url + link.xpath("@href").extract_first(
                "").replace("dialog", "download")
            download_url.append(link_title.strip() + " :<br><br>" + link_url)
    pan_links = response.xpath('//a[contains(@href,"pan.baidu.com")]/../..')
    post_text = " ".join(
        response.xpath('//div[@class="bg1 border post"]//p').xpath(
            "string(.)").extract())
    if pan_links or "pan.baidu.com" in post_text:
        download_url.append("网盘下载链接 :" + response.url)
    item_loader.add_value('download_url', download_url)
    item_loader.add_value("crawl_time", datetime.now())
    movie_item = item_loader.load_item()
    if len(download_url) > 1:
        return movie_item
def parse_detail(self, response):
    """Scrape one Meijutt TV-series page into a MeijuttItem.

    Collects three kinds of links, each preceded by a labelled header
    entry: ed2k links (anchor text contains "字幕"), magnet links
    (anchor text matches 第N集) and Baidu-pan links (URL plus
    password).  Returns the loaded item only when more than one entry
    was collected.
    """
    item_loader = MeijuttItemLoader(item=MeijuttItem(), response=response)
    movie_title = response.xpath('//div[@class="info-title"]')
    title = movie_title.xpath("h1/text()").extract_first("").strip()
    title_detail = title + \
        movie_title.xpath("text()").extract_first("").strip()
    item_loader.add_value("title", title)
    item_loader.add_value("title_detail", title_detail)
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    download_url = []
    download_links = response.xpath('//strong[@class="down_part_name"]')
    ed2k_links = download_links.xpath('a[contains(text(),"字幕")]')
    # Raw string so the \d in the EXSLT regex is not a Python escape.
    magnet_links = download_links.xpath(r'a[re:test(text(), "第\d+?集")]')
    pan_links = response.xpath('//div[@class="wp-list"]/ul/li')
    # The ed2k and magnet sections share the same
    # "title :<br><br>href" formatting -- handle them in one loop
    # instead of two duplicated copies.
    for label, links in (("电驴下载链接 :", ed2k_links),
                         ("磁力下载链接 :", magnet_links)):
        if len(links) > 0:
            download_url.append(label)
            for link in links:
                link_title = link.xpath('text()').extract_first("")
                link_url = link.xpath('@href').extract_first("")
                download_url.append(
                    link_title.strip() + " :<br><br>" + link_url)
    if len(pan_links) > 0:
        download_url.append("百度网盘下载 :")
        for link in pan_links:
            link_title = link.xpath('strong/text()').extract_first("")
            link_url = link.xpath('a/@href').extract_first("")
            link_passwd = link.xpath('span/text()').extract_first("")
            download_url.append(link_title.strip() + " :<br><br>" +
                                link_url + " " + link_passwd.strip())
    item_loader.add_value('download_url', download_url)
    item_loader.add_value("crawl_time", datetime.now())
    movie_item = item_loader.load_item()
    if len(download_url) > 1:
        return movie_item