Example #1
    def item_completed(self, results, item, info):
        if self.LOG_FAILED_RESULTS:
            msg = '%s found errors processing %s' % (self.__class__.__name__,
                                                     item)
            for ok, value in results:
                if not ok:
                    log.err(value, msg, spider=info.spider)

        bookfile_paths_urls = [(x['path'], x['url']) for ok, x in results
                               if ok]
        bookfile_path_url = list_first_item(bookfile_paths_urls)
        if bookfile_path_url:
            item['book_file'] = os.path.join(
                os.path.abspath(self.bookfile_store), bookfile_path_url[0])
            item['book_file_url'] = bookfile_path_url[1]
            return item
        else:
            if self.item_download[item['original_url']]:
                next_url = list_first_item(
                    self.item_download[item['original_url']])
                self.item_download[item['original_url']] = self.item_download[
                    item['original_url']][1:]
                return Request(next_url)
            else:
                return item
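
Every snippet on this page leans on a small project helper, list_first_item, whose body is not shown. Judging by how its result is used (truthiness checks, slicing off the remainder), it returns the first element of a sequence or None when the sequence is empty. A minimal sketch of that assumed behavior:

    def list_first_item(items):
        """Return the first element of a sequence, or None if it is empty.

        Assumed behavior, inferred from how the examples use the result.
        """
        return items[0] if items else None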
Example #2
    def parse_detail(self, response):
        woaidu_item = WoaiduCrawlerItem()

        response_selector = HtmlXPathSelector(response)
        woaidu_item['book_name'] = list_first_item(
            response_selector.select(
                '//div[@class="zizida"][1]/text()').extract())
        woaidu_item['author'] = [
            list_first_item(
                response_selector.select('//div[@class="xiaoxiao"][1]/text()').
                extract())[5:].strip(),
        ]
        woaidu_item['book_description'] = list_first_item(
            response_selector.select(
                '//div[@class="lili"][1]/text()').extract()).strip()
        woaidu_item['book_covor_image_url'] = list_first_item(
            response_selector.select(
                '//div[@class="hong"][1]/img/@src').extract())

        download = []
        for i in response_selector.select(
                '//div[contains(@class,"xiazai_xiao")]')[1:]:
            download_item = {}
            download_item['url'] = strip_null(
                deduplication([
                    list_first_item(i.select('./div')[0].select('./a/@href').extract()),
                    list_first_item(i.select('./div')[1].select('./a/@href').extract()),
                ]))

            download_item['progress'] = list_first_item(
                i.select('./div')[2].select('./text()').extract())
            download_item['update_time'] = list_first_item(
                i.select('./div')[3].select('./text()').extract())
            download_item['source_site'] = [
                list_first_item(i.select('./div')[4].select('./a/text()').extract()),
                list_first_item(i.select('./div')[4].select('./a/@href').extract()),
            ]

            download.append(download_item)

        woaidu_item['book_download'] = download
        woaidu_item['original_url'] = response.url

        yield woaidu_item
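
parse_detail also calls two more helpers that are not shown, deduplication and strip_null. A plausible sketch, assuming deduplication drops repeated URLs while preserving order and strip_null filters out None/empty entries:

    def deduplication(items):
        """Drop duplicate entries while preserving order (assumed behavior)."""
        seen = set()
        unique = []
        for item in items:
            if item not in seen:
                seen.add(item)
                unique.append(item)
        return unique

    def strip_null(items):
        """Drop None and empty entries (assumed behavior)."""
        return [item for item in items if item]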
Example #3
    def parse(self,response):
        response_selector = HtmlXPathSelector(response)
        next_link = list_first_item(response_selector.select(u'//div[@class="k2"]/div/a[text()="下一页"]/@href').extract())
        if next_link:
            next_link = clean_url(response.url,next_link,response.encoding)

            # yield hands the Request back to Scrapy, which schedules the next
            # results page and calls this parse() method again on its response
            yield Request(url=next_link, callback=self.parse)
Example #4
    def parse(self,response):
        response_selector = HtmlXPathSelector(response)
        next_link = list_first_item(response_selector.select(u'//div[@class="k2"]/div/a[text()="下一页"]/@href').extract())
        if next_link:
            next_link = clean_url(response.url,next_link,response.encoding)
            yield Request(url=next_link, callback=self.parse)

        for detail_link in response_selector.select(u'//div[contains(@class,"sousuolist")]/a/@href').extract():
            if detail_link:
                detail_link = clean_url(response.url,detail_link,response.encoding)
                yield Request(url=detail_link, callback=self.parse_detail)
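
clean_url is another helper whose body is not shown here. From its call signature it evidently resolves a possibly relative link against the current page URL, with the response encoding available for re-quoting. A hypothetical standard-library sketch:

    from urllib.parse import urljoin

    def clean_url(base_url, link, encoding):
        """Resolve link against base_url. Sketch only: the real helper may
        also re-quote the link using the given response encoding."""
        return urljoin(base_url, link.strip())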
Example #5
    def item_completed(self, results, item, info):
        if self.LOG_FAILED_RESULTS:
            msg = '%s found errors processing %s' % (self.__class__.__name__, item)
            for ok, value in results:
                if not ok:
                    log.err(value, msg, spider=info.spider)

        bookfile_paths_urls = [(x['path'],x['url']) for ok, x in results if ok]
        bookfile_path_url = list_first_item(bookfile_paths_urls)
        if bookfile_path_url:
            item['book_file'] = os.path.join(os.path.abspath(self.bookfile_store),bookfile_path_url[0])
            item['book_file_url'] = bookfile_path_url[1]
            return item
        else:
            if self.item_download[item['original_url']]:
                next_url = list_first_item(self.item_download[item['original_url']])
                self.item_download[item['original_url']] = self.item_download[item['original_url']][1:]
                return Request(next_url)
            else:
                return item
Example #6
    def item_completed(self, results, item, info):
        if self.LOG_FAILED_RESULTS:
            msg = '%s found errors processing %s' % (self.__class__.__name__, item)
            for ok, value in results:
                if not ok:
                    log.err(value, msg, spider=info.spider)

        image_paths = [x['path'] for ok, x in results if ok]
        image_path = list_first_item(image_paths)
        item['book_covor_image_path'] = os.path.join(os.path.abspath(self.images_store),image_path) if image_path else ""

        return item
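
For context: in Scrapy's media pipelines, item_completed receives results as a list of (success, value) tuples, which is why every override above unpacks "for ok, x in results". An illustrative sketch of the shape being consumed (values made up; on failure Scrapy passes a Twisted Failure, which is what log.err receives):

    results = [
        # success: a dict with the stored path, source url and checksum
        (True, {'url': 'http://example.com/cover.jpg',
                'path': 'full/0a1b2c3d4e.jpg',
                'checksum': 'd41d8cd98f00b204e9800998ecf8427e'}),
        # failure: (False, <Twisted Failure>) -- logged by the code above
    ]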
Example #7
    def get_media_requests(self, item, info):
        """
            Only download once per book, so pick out one of all the download urls.
        """

        #XXX: To test a specific url, you can use the following method:
        #return Request("http://down.wmtxt.com/wmtxt/wmtxt/UploadFile/2010-6/%A1%B6%D3%F6%BC%FB%C4%E3%A1%B7.rar")
        if item.get('book_download'):
            downloadfile_urls = [i['url'] for i in item.get('book_download') if i['url']]
            downloadfile_urls = list(set(itertools.chain(*downloadfile_urls)))
            first_download_file = list_first_item(downloadfile_urls)
            self.item_download[item['original_url']] = downloadfile_urls[1:]
            if first_download_file:
                return Request(first_download_file)
Example #8
    def parse_detail(self, response):
        woaidu_item = WoaiduCrawlerItem()

        response_selector = HtmlXPathSelector(response)
        woaidu_item['book_name'] = list_first_item(response_selector.select('//div[@class="zizida"][1]/text()').extract())
        woaidu_item['author'] = [list_first_item(response_selector.select('//div[@class="xiaoxiao"][1]/text()').extract())[5:].strip(),]
        woaidu_item['book_description'] = list_first_item(response_selector.select('//div[@class="lili"][1]/text()').extract()).strip()
        woaidu_item['book_covor_image_url'] = list_first_item(response_selector.select('//div[@class="hong"][1]/img/@src').extract())

        download = []
        for i in response_selector.select('//div[contains(@class,"xiazai_xiao")]')[1:]:
            download_item = {}
            download_item['url'] = strip_null(
                deduplication([
                    list_first_item(i.select('./div')[0].select('./a/@href').extract()),
                    list_first_item(i.select('./div')[1].select('./a/@href').extract()),
                ]))

            download_item['progress'] = list_first_item(i.select('./div')[2].select('./text()').extract())
            download_item['update_time'] = list_first_item(i.select('./div')[3].select('./text()').extract())
            download_item['source_site'] = [
                list_first_item(i.select('./div')[4].select('./a/text()').extract()),
                list_first_item(i.select('./div')[4].select('./a/@href').extract()),
            ]

            download.append(download_item)

        woaidu_item['book_download'] = download
        woaidu_item['original_url'] = response.url
        
        yield woaidu_item
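
Note that HtmlXPathSelector and .select() belong to the legacy Scrapy 0.x API. On Scrapy 1.0 and later the same extraction is written against the response directly; a sketch of the first field only:

    def parse_detail(self, response):
        book_name = response.xpath('//div[@class="zizida"][1]/text()').get()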
Example #9
    def get_media_requests(self, item, info):
        """
            Only download once per book,so it pick out one from all of the download urls.
        """

        #XXX: To test a specific url, you can use the following method:
        #return Request("http://down.wmtxt.com/wmtxt/wmtxt/UploadFile/2010-6/%A1%B6%D3%F6%BC%FB%C4%E3%A1%B7.rar")
        if item.get('book_download'):
            downloadfile_urls = [
                i['url'] for i in item.get('book_download') if i['url']
            ]
            downloadfile_urls = list(set(itertools.chain(*downloadfile_urls)))
            first_download_file = list_first_item(downloadfile_urls)
            self.item_download[item['original_url']] = downloadfile_urls[1:]
            if first_download_file:
                return Request(first_download_file)
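
Examples #7 and #9 cooperate with the item_completed override from Examples #1 and #5: every URL beyond the first is stashed in self.item_download, keyed by the item's original_url, so a failed download can fall back to the next mirror. A minimal sketch of that bookkeeping (the class name is a hypothetical stand-in):

    from collections import defaultdict

    class BookFilePipeline:  # hypothetical name for the real pipeline class
        def __init__(self):
            # original_url -> fallback download urls not yet attempted,
            # popped one by one when a download fails (Examples #1/#5)
            self.item_download = defaultdict(list)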
Example #10
    def stat_file(self, key, info):
        """
            the stat is the file key dir,
            the last_modified is the file that saved to the file key dir.
        """
        keydir = os.path.join(self.basedir, *key.split('/'))
        filenames = os.listdir(keydir)
        if len(filenames) != 1:
            shutil.rmtree(keydir, True)
            return {}
        else:
            filename = list_first_item(filenames)
        absolute_path = self._get_filesystem_path(key)
        try:
            last_modified = os.path.getmtime(absolute_path)
        except OSError:  # getmtime fails if the key dir is missing
            return {}

        with open(os.path.join(absolute_path, filename), 'rb') as file_content:
            checksum = md5sum(file_content)

        return {'last_modified': last_modified, 'checksum': checksum}
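
md5sum here is Scrapy's small checksum helper; an equivalent sketch, assuming the usual read-in-chunks implementation:

    import hashlib

    def md5sum(file_obj):
        """MD5 hex digest of a file object, read in small chunks."""
        digest = hashlib.md5()
        while True:
            chunk = file_obj.read(8192)
            if not chunk:
                break
            digest.update(chunk)
        return digest.hexdigest()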
Example #11
    def stat_file(self, key, info):
        """
            the stat is the file key dir,
            the last_modified is the file that saved to the file key dir.
        """
        
        keydir = os.path.join(self.basedir, *key.split('/'))
        filenames = os.listdir(keydir)
        if len(filenames) != 1:
            shutil.rmtree(keydir,True)
            return {}
        else:
            filename = list_first_item(filenames)
        
        absolute_path = self._get_filesystem_path(key)
        try:
            last_modified = os.path.getmtime(absolute_path)
        except OSError:  # getmtime fails if the key dir is missing
            return {}

        with open(os.path.join(absolute_path,filename), 'rb') as file_content:
            checksum = md5sum(file_content)

        return {'last_modified': last_modified, 'checksum': checksum}