def item_completed(self, results, item, info):
    """Attach the downloaded book file to the item, or retry another URL.

    Called by the media pipeline once all file downloads for ``item``
    finish.  On success, the first successful ``(path, url)`` pair is
    stored on the item.  On failure, the next queued download URL for
    this book (tracked in ``self.item_download``) is requested instead.

    :param results: list of ``(success, value)`` pairs from the pipeline.
    :param item: the scraped item being processed.
    :param info: pipeline info object carrying the spider reference.
    :returns: the item (possibly updated) or a retry ``Request``.
    """
    if self.LOG_FAILED_RESULTS:
        # BUG FIX: log message typo "proessing" -> "processing".
        msg = '%s found errors processing %s' % (self.__class__.__name__, item)
        for ok, value in results:
            if not ok:
                log.err(value, msg, spider=info.spider)

    bookfile_paths_urls = [(x['path'], x['url']) for ok, x in results if ok]
    bookfile_path_url = list_first_item(bookfile_paths_urls)
    if bookfile_path_url:
        item['book_file'] = os.path.join(
            os.path.abspath(self.bookfile_store), bookfile_path_url[0])
        item['book_file_url'] = bookfile_path_url[1]
        return item

    # Download failed: pop the next queued mirror URL for this book, if any.
    # (Renamed local from `next` to avoid shadowing the builtin.)
    if self.item_download[item['original_url']]:
        next_url = list_first_item(self.item_download[item['original_url']])
        self.item_download[item['original_url']] = \
            self.item_download[item['original_url']][1:]
        return Request(next_url)
    return item
def parse_detail(self, response):
    """Parse one book detail page into a ``WoaiduCrawlerItem``.

    Extracts the book name, author, description, cover-image URL and the
    per-mirror download table, then yields the populated item.
    """
    woaidu_item = WoaiduCrawlerItem()
    response_selector = HtmlXPathSelector(response)
    woaidu_item['book_name'] = list_first_item(
        response_selector.select(
            '//div[@class="zizida"][1]/text()').extract())
    # The author text carries a 5-character label prefix that is sliced off.
    # NOTE(review): assumed from the [5:] slice — confirm against the page.
    woaidu_item['author'] = [
        list_first_item(
            response_selector.select(
                '//div[@class="xiaoxiao"][1]/text()').extract())[5:].strip(),
    ]
    woaidu_item['book_description'] = list_first_item(
        response_selector.select(
            '//div[@class="lili"][1]/text()').extract()).strip()
    woaidu_item['book_covor_image_url'] = list_first_item(
        response_selector.select(
            '//div[@class="hong"][1]/img/@src').extract())

    download = []
    # [1:] skips the first row — presumably a header row; verify on the page.
    for row in response_selector.select(
            '//div[contains(@class,"xiazai_xiao")]')[1:]:
        # Hoist the repeated ./div selection: one lookup per row.
        cells = row.select('./div')
        download_item = {
            # Two candidate mirror links per row; drop nulls and duplicates.
            'url': strip_null(deduplication([
                list_first_item(cells[0].select('./a/@href').extract()),
                list_first_item(cells[1].select('./a/@href').extract()),
            ])),
            'progress': list_first_item(
                cells[2].select('./text()').extract()),
            'update_time': list_first_item(
                cells[3].select('./text()').extract()),
            'source_site': [
                list_first_item(cells[4].select('./a/text()').extract()),
                list_first_item(cells[4].select('./a/@href').extract()),
            ],
        }
        # BUG FIX: the original had a stray trailing backslash after the
        # source_site list, gluing this statement onto the previous line
        # (a syntax error).
        download.append(download_item)

    woaidu_item['book_download'] = download
    woaidu_item['original_url'] = response.url
    yield woaidu_item
def parse(self, response):
    """Follow the pagination link ("下一页" / next page) when present.

    Yields a single ``Request`` back into this callback, or nothing if
    the current page is the last one.
    """
    selector = HtmlXPathSelector(response)
    next_href = list_first_item(
        selector.select(
            u'//div[@class="k2"]/div/a[text()="下一页"]/@href').extract())
    if not next_href:
        return
    absolute_url = clean_url(response.url, next_href, response.encoding)
    yield Request(url=absolute_url, callback=self.parse)
def parse(self, response):
    """List-page callback.

    Queues the next results page (via the "下一页" anchor) and one
    ``parse_detail`` request per book link found on this page.
    """
    selector = HtmlXPathSelector(response)

    # Pagination: follow the "next page" anchor when it exists.
    pagination_href = list_first_item(
        selector.select(
            u'//div[@class="k2"]/div/a[text()="下一页"]/@href').extract())
    if pagination_href:
        yield Request(
            url=clean_url(response.url, pagination_href, response.encoding),
            callback=self.parse)

    # Detail pages: one request per non-empty book link.
    for detail_href in selector.select(
            u'//div[contains(@class,"sousuolist")]/a/@href').extract():
        if detail_href:
            yield Request(
                url=clean_url(response.url, detail_href, response.encoding),
                callback=self.parse_detail)
def item_completed(self, results, item, info):
    """Attach the downloaded book file to the item, or retry another URL.

    On success the first successful ``(path, url)`` pair is recorded on
    the item; otherwise the next queued mirror URL for this book (kept
    in ``self.item_download``) is returned as a new ``Request``.

    :param results: list of ``(success, value)`` pairs from the pipeline.
    :param item: the scraped item being processed.
    :param info: pipeline info object carrying the spider reference.
    :returns: the item (possibly updated) or a retry ``Request``.
    """
    if self.LOG_FAILED_RESULTS:
        # BUG FIX: log message typo "proessing" -> "processing".
        msg = '%s found errors processing %s' % (self.__class__.__name__, item)
        for ok, value in results:
            if not ok:
                log.err(value, msg, spider=info.spider)

    bookfile_paths_urls = [(x['path'], x['url']) for ok, x in results if ok]
    bookfile_path_url = list_first_item(bookfile_paths_urls)
    if bookfile_path_url:
        item['book_file'] = os.path.join(
            os.path.abspath(self.bookfile_store), bookfile_path_url[0])
        item['book_file_url'] = bookfile_path_url[1]
        return item

    # Download failed: pop the next queued mirror URL for this book, if any.
    # (Renamed local from `next` to avoid shadowing the builtin.)
    if self.item_download[item['original_url']]:
        next_url = list_first_item(self.item_download[item['original_url']])
        self.item_download[item['original_url']] = \
            self.item_download[item['original_url']][1:]
        return Request(next_url)
    return item
def item_completed(self, results, item, info):
    """Record the stored cover-image path on the item.

    Logs any failed downloads, then stores the absolute filesystem path
    of the first successfully downloaded image, or ``""`` if none
    succeeded.

    :param results: list of ``(success, value)`` pairs from the pipeline.
    :param item: the scraped item being processed.
    :param info: pipeline info object carrying the spider reference.
    :returns: the item, always.
    """
    if self.LOG_FAILED_RESULTS:
        # BUG FIX: log message typo "proessing" -> "processing".
        msg = '%s found errors processing %s' % (self.__class__.__name__, item)
        for ok, value in results:
            if not ok:
                log.err(value, msg, spider=info.spider)
    image_paths = [x['path'] for ok, x in results if ok]
    image_path = list_first_item(image_paths)
    item['book_covor_image_path'] = os.path.join(
        os.path.abspath(self.images_store), image_path) if image_path else ""
    return item
def get_media_requests(self, item, info):
    """
    Only download once per book,so it pick out one from all of the download urls.
    """
    # XXX: to test a specific url you can short-circuit here, e.g.:
    # return Request("http://down.wmtxt.com/wmtxt/wmtxt/UploadFile/2010-6/%A1%B6%D3%F6%BC%FB%C4%E3%A1%B7.rar")
    book_download = item.get('book_download')
    if not book_download:
        return
    url_lists = [entry['url'] for entry in book_download if entry['url']]
    candidates = list(set(itertools.chain(*url_lists)))
    first_url = list_first_item(candidates)
    # Remember the remaining mirrors so a failed download can fall back.
    self.item_download[item['original_url']] = candidates[1:]
    if first_url:
        return Request(first_url)
def parse_detail(self, response):
    """Parse one book detail page into a ``WoaiduCrawlerItem``.

    Extracts the book name, author, description, cover-image URL and the
    per-mirror download table, then yields the populated item.
    """
    woaidu_item = WoaiduCrawlerItem()
    response_selector = HtmlXPathSelector(response)
    woaidu_item['book_name'] = list_first_item(
        response_selector.select('//div[@class="zizida"][1]/text()').extract())
    # The author text carries a 5-character label prefix that is sliced off.
    # NOTE(review): assumed from the [5:] slice — confirm against the page.
    woaidu_item['author'] = [
        list_first_item(
            response_selector.select(
                '//div[@class="xiaoxiao"][1]/text()').extract())[5:].strip(),
    ]
    woaidu_item['book_description'] = list_first_item(
        response_selector.select('//div[@class="lili"][1]/text()').extract()).strip()
    woaidu_item['book_covor_image_url'] = list_first_item(
        response_selector.select('//div[@class="hong"][1]/img/@src').extract())

    download = []
    # [1:] skips the first row — presumably a header row; verify on the page.
    for i in response_selector.select(
            '//div[contains(@class,"xiazai_xiao")]')[1:]:
        # Hoist the repeated ./div selection: one lookup per row.
        cells = i.select('./div')
        download_item = {}
        # Two candidate mirror links per row; drop nulls and duplicates.
        download_item['url'] = strip_null(deduplication([
            list_first_item(cells[0].select('./a/@href').extract()),
            list_first_item(cells[1].select('./a/@href').extract()),
        ]))
        download_item['progress'] = list_first_item(
            cells[2].select('./text()').extract())
        download_item['update_time'] = list_first_item(
            cells[3].select('./text()').extract())
        download_item['source_site'] = [
            list_first_item(cells[4].select('./a/text()').extract()),
            list_first_item(cells[4].select('./a/@href').extract()),
        ]
        # BUG FIX: the original had a stray trailing backslash after the
        # source_site list, gluing this statement onto the previous line
        # (a syntax error).
        download.append(download_item)

    woaidu_item['book_download'] = download
    woaidu_item['original_url'] = response.url
    yield woaidu_item
def get_media_requests(self, item, info):
    """
    Only download once per book,so it pick out one from all of the download urls.
    """
    # XXX: to test a specific url you can short-circuit here, e.g.:
    # return Request("http://down.wmtxt.com/wmtxt/wmtxt/UploadFile/2010-6/%A1%B6%D3%F6%BC%FB%C4%E3%A1%B7.rar")
    book_download = item.get('book_download')
    if not book_download:
        return
    url_lists = [entry['url'] for entry in book_download if entry['url']]
    candidates = list(set(itertools.chain(*url_lists)))
    first_url = list_first_item(candidates)
    # Remember the remaining mirrors so a failed download can fall back.
    self.item_download[item['original_url']] = candidates[1:]
    if first_url:
        return Request(first_url)
def stat_file(self, key, info):
    """Return stat info (mtime + md5 checksum) for the file stored under ``key``.

    The key maps to a directory that must contain exactly one stored
    file; the returned ``last_modified`` is that directory's file mtime.

    :param key: slash-separated storage key identifying the file's dir.
    :param info: pipeline info object (unused here).
    :returns: ``{'last_modified': ..., 'checksum': ...}`` on success,
        or ``{}`` when the key dir is corrupt or the file vanished.
    """
    keydir = os.path.join(self.basedir, *key.split('/'))
    filenames = os.listdir(keydir)
    if len(filenames) != 1:
        # A key dir must hold exactly one stored file; anything else is
        # corrupt state — wipe it (ignore_errors=True) and report no stat.
        shutil.rmtree(keydir, True)
        return {}
    filename = list_first_item(filenames)
    absolute_path = self._get_filesystem_path(key)
    try:
        last_modified = os.path.getmtime(absolute_path)
    except OSError:
        # Narrowed from `except Exception` (flagged FIXME): getmtime only
        # raises OSError, e.g. the path vanished between listdir and here.
        return {}
    with open(os.path.join(absolute_path, filename), 'rb') as file_content:
        checksum = md5sum(file_content)
    return {'last_modified': last_modified, 'checksum': checksum}
def stat_file(self, key, info):
    """Return stat info (mtime + md5 checksum) for the file stored under ``key``.

    The key maps to a directory that must contain exactly one stored
    file; the returned ``last_modified`` is that directory's file mtime.

    :param key: slash-separated storage key identifying the file's dir.
    :param info: pipeline info object (unused here).
    :returns: ``{'last_modified': ..., 'checksum': ...}`` on success,
        or ``{}`` when the key dir is corrupt or the file vanished.
    """
    keydir = os.path.join(self.basedir, *key.split('/'))
    filenames = os.listdir(keydir)
    if len(filenames) != 1:
        # A key dir must hold exactly one stored file; anything else is
        # corrupt state — wipe it (ignore_errors=True) and report no stat.
        shutil.rmtree(keydir, True)
        return {}
    filename = list_first_item(filenames)
    absolute_path = self._get_filesystem_path(key)
    try:
        last_modified = os.path.getmtime(absolute_path)
    except OSError:
        # BUG FIX: narrowed from a bare `except:` (which would even swallow
        # KeyboardInterrupt). getmtime only raises OSError, e.g. when the
        # path vanished between listdir and here.
        return {}
    with open(os.path.join(absolute_path, filename), 'rb') as file_content:
        checksum = md5sum(file_content)
    return {'last_modified': last_modified, 'checksum': checksum}