Example #1
    def parse_parts2(self, response):
        log.msg("\tparse_parts time: %s" % int(time.time()), level=log.DEBUG)
        ua = response.request.headers['User-Agent']
        log.msg("\tua: %s" % ua, level=log.DEBUG)

        for part in response.css('table.parts > tbody > tr'):
            il = ItemLoader(item=CarPart(), selector=part)
            il.add_xpath('shop_city', "td[@class='shop']/a/text()")
            il.add_xpath('shop_name', "td[@class='shop']/a/strong/text()")

            shop_url = il.get_xpath("td[@class='shop']/a/@href", TakeFirst())
            photo_url = il.get_xpath("td[@class='photo']/a/@href", TakeFirst())
            il.add_value('shop_url', urljoin(self.main_url, shop_url))
            il.add_value('ext_link', urljoin(self.main_url, photo_url))

            il.add_xpath('info', "td[@class='info']//text()")
            il.add_xpath('price', "td[@class='price']//text()")

            il.add_value('brand', response.meta.get('brand'))
            il.add_value('model', response.meta.get('model'))
            il.add_value('car_part', response.meta.get('car_part'))
            il.add_value('category', response.meta.get('category'))

            item = il.load_item()
            if item.is_valid():
                yield item
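
Scrapy's Item class has no built-in is_valid() method, so the CarPart item above presumably defines its own. A minimal sketch of what it might look like, with the field names taken from the loader calls above and the validation rule itself an assumption:

import scrapy

class CarPart(scrapy.Item):
    shop_city = scrapy.Field()
    shop_name = scrapy.Field()
    shop_url = scrapy.Field()
    ext_link = scrapy.Field()
    info = scrapy.Field()
    price = scrapy.Field()
    brand = scrapy.Field()
    model = scrapy.Field()
    car_part = scrapy.Field()
    category = scrapy.Field()

    def is_valid(self):
        # Assumed rule: skip rows scraped without a price or a shop name.
        return bool(self.get('price')) and bool(self.get('shop_name'))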
Example #2
def Loader_content(response):
    l = ItemLoader(item={}, response=response)
    sub_title = ''
    if len(l.get_xpath('//*[@class="pagenow"]/text()')) >= 1:
        sub_title += '-' + l.get_xpath('//*[@class="pagenow"]/text()')[0]
    l.add_value(
        'title',
        l.get_xpath('//*[@class="b_list-1a-1c"]/text()')[0] + sub_title)
    l.add_value('src_url', response.url)
    content_img = l.get_xpath('//*[@class="content-img"]/p/img/@src')
    l.add_value('content', content_img)
    l.add_value('image_urls', content_img)
    print('Downloading images:', content_img)
    time.sleep(len(content_img))  # crude throttle: one second per image
    return l.load_item()
Example #3
    def parse(self, response):

        item = PdfItem()
        loader = ItemLoader(response=response)

        pdf_path = '//*[contains(text(), "[PDF]")]'
        pdf_url_path = '%s//following-sibling::*' % pdf_path

        # take the link target itself; the raw element list would break os.path.basename() below
        item['url'] = loader.get_xpath('%s/@href' % pdf_url_path, TakeFirst())
        item['title'] = loader.get_xpath('%s/text()' % pdf_url_path,
                                         TakeFirst())

        summary_path = '%s//parent::*//parent::*/*[@class="s"]/*' % pdf_url_path
        description_path = '%s/*[@class="st"]/*' % summary_path

        item['description'] = loader.get_xpath(
            '%s/text()|%s/*/text()' % (description_path, description_path))

        similar_path = '%s/*[contains(@class, "f")]//a[contains(@href, "q=related:")]' % summary_path

        # similar_url = loader.get_xpath('%s/@href' % similar_path, TakeFirst())
        # yield Request(
        #     url=urlparse.urljoin(response.url, similar_url),
        #     callback=self.parse,
        #     meta=response.meta,
        #     dont_filter=True
        # )
        #
        # next_path = '//*[@class="pn"]'
        # next_url = loader.get_xpath('%s/@href' % next_path, TakeFirst())
        # yield Request(
        #     url=urlparse.urljoin(response.url, next_url),
        #     callback=self.parse,
        #     meta=response.meta,
        #     dont_filter=True
        # )

        pdf_url = item['url']
        print item
        if pdf_url:
            pdf_filename = os.path.basename(pdf_url)
            pdf_filepath = '%s/%s/%s' % (DOWNLOAD_DIR, SEARCH_TERM,
                                         pdf_filename)

            if self.download_files:
                self.download_file(pdf_url, pdf_filepath, response.url)

            yield item
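
download_file, DOWNLOAD_DIR and SEARCH_TERM are defined elsewhere in this spider. Judging by the commented-out urllib2 block in Example #11 below, the helper is probably something close to this Python 2 sketch (the Referer header mirrors that block; the rest is an assumption):

import os
import urllib2

def download_file(self, url, filepath, referer):
    # Assumed helper; lives on the spider class (hence the self argument).
    req = urllib2.Request(url)
    req.add_header('Referer', referer)
    data = urllib2.urlopen(req).read()
    dirname = os.path.dirname(filepath)
    if dirname and not os.path.isdir(dirname):
        os.makedirs(dirname)
    with open(filepath, 'wb') as out:
        out.write(data)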
Example #4
def Loader_content(response):
    l = ItemLoader(item={}, response=response)
    l.add_css('title', '.k_jianjie-3a-1-name::text')
    l.add_value('date', l.get_xpath('//*[@class="k_jianjie-3a-2b"]/text()')[2])
    #l.add_value('url',_response.url[len(self._scheme+"//"+self.allowed_domains[0]):])
    l.add_css('down', '.k_jianjie-3a-5down::text', TrimAll())

    conver_img = l.get_xpath('//*[@id="k_jianjie-2b"]/a/img/@src')
    content_img = l.get_xpath('//*[@class="content"]/p/img/@src')
    l.add_value('src_url', response.url)
    l.add_value('preview', conver_img)
    l.add_value('content', content_img)
    l.add_value('image_urls', conver_img + content_img)
    print('Downloading images:', conver_img + content_img)
    #time.sleep(len(conver_img+content_img))
    return l.load_item()
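
TrimAll, passed to add_css above, is not one of Scrapy's built-in processors, so it must be defined in the project. A plausible definition, assuming it simply strips whitespace from every extracted value:

class TrimAll(object):
    # Assumed custom processor: trim each extracted value, drop blanks.
    def __call__(self, values):
        return [v.strip() for v in values if v and v.strip()]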
Example #5
    def parse(self, response):
        l = ItemLoader(item=CoserItem(), response=response)
        l.add_xpath('name', "//h1[@class='js-post-title']/text()")
        l.add_xpath('info', "//div[@class='post__info']/div[@class='post__type post__info-group']/span/text()")
        urls = l.get_xpath('//img[@class="detail_std detail_clickable"]/@src')
        urls = [url.replace('/w650', '') for url in urls]
        l.add_value('image_urls', urls)
        l.add_value('url', response.url)
        return l.load_item()
Example #6
    def parase_item(self, response):
        l = ItemLoader(item=CoserspiderItem(), response=response)
        l.add_xpath('name', "//h1[@class='js-post-title']/text()")
        l.add_xpath('info', "//div[@class='post__info']/div[@class='post__type post__info-group']/span/text()")
        urls = l.get_xpath('//img[@class="detail_std detail_clickable"]/@src')
        urls = [url.replace('/w650', '') for url in urls]
        l.add_value('image_urls', urls)  # add_value, not add_xpath: urls holds extracted data, not an XPath
        l.add_value('url', response.url)
        return l.load_item()
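
This example and the previous one follow the extract-transform-collect pattern: pull raw values out with get_xpath, rewrite them in plain Python, then feed them back in with add_value. The same transform could also be expressed inline as a MapCompose input processor, e.g.:

from scrapy.loader.processors import MapCompose

l.add_xpath('image_urls',
            '//img[@class="detail_std detail_clickable"]/@src',
            MapCompose(lambda url: url.replace('/w650', '')))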
Example #7
def Loader_index(self, item_selector):
    l = ItemLoader(item={}, selector=item_selector)

    conver_img = l.get_xpath('.//*[@class="lz_img"]/img/@src')

    l.add_xpath('title', './/*[@class="k_list-lb-2"]/div[1]/a[1]/text()')
    l.add_xpath('url', './/*[@class="k_list-lb-2"]/div[1]/a/@href')
    l.add_value('preview', conver_img)
    l.add_css('date', '#k_list-lb-2-f::text', re=r'(\d{4}-\d{2}-\d{2})')
    l.add_value('image_urls', conver_img)
    return l.load_item()
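
Loader_index builds each item from a row selector rather than from the whole response, so its relative XPaths (the leading .//) resolve within that row. A hypothetical caller, where the row XPath is an assumption:

def parse(self, response):
    for row in response.xpath('//*[@class="k_list-lb"]'):  # assumed row selector
        yield self.Loader_index(row)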
Example #8
    def parse(self, response):
        try:
            page = response.url.split("/")[-1].split(".")[0]
            self.log('ID: %s' % page)

            book_name = response.css('h2::text').extract_first()
            #self.log('book_name: %s' % book_name)

            book_author = response.css('h4::text').extract_first().replace(
                'Tác giả: ', '')
            #self.log('book_author: %s' % book_author)

            book_category = response.css('h4 a::text').extract_first().replace(
                'Thể loại: ', '')
            #self.log('book_category: %s' % book_category)

            book_cover = response.xpath(
                '//img[@class="img-thumbnail"]//@src').extract_first()
            # BookDownload = namedtuple('BookDownload', ['source', 'epub', 'mobi', 'pdf', 'azw3', 'prc'])
            # book_downloads = []
            # for book_download in response.css('div.book-download'):
            #     # print(book_download.css('a::text').extract_first())
            #     # bd = BookDownload._fields_defaults
            #     source = book_download.css('a::text').extract_first()
            #     epub = book_download.css('a')[1].extract()
            #     # book_downloads.append(bd)
            #     self.log('source: %s' % source)
            #     self.log('epub: %s' % epub)
            # self.log('book_downloads: %s' % book_downloads)

            loader = ItemLoader(response=response)
            book_description = loader.get_xpath(
                '//div[@class="book-description"]/node()', Join())
            #self.log('book_description: %s' % book_description)
        except Exception:
            self.log('ERROR in: %s' % response.url)
            return  # the fields above are unbound after an error; don't fall through to the yield

        yield {
            'id': page,
            'name': book_name,
            'author': book_author,
            'category': book_category,
            'description': book_description,
            'cover': book_cover
        }
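
The loader here exists mainly for the Join() call: get_xpath extracts the description nodes and Join concatenates the resulting strings, using a single space unless another separator is given:

from scrapy.loader.processors import Join

Join()(['foo', 'bar'])      # 'foo bar'
Join('\n')(['foo', 'bar'])  # 'foo\nbar'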
Example #9
    def parse(self, response):
        if '%s/404' % DOMAIN_URL not in response.url:
            item = BookItem()
            loader = ItemLoader(response=response)

            name_path = '//h1/text()|//h3/text()'  # h1
            item['name'] = " ".join(loader.get_xpath(name_path))

            item['url'] = response.url

            description_path = '//*[@itemprop="description"]'  # span
            item['description'] = "".join(loader.get_xpath('%s/text()|%s/*/text()' % (description_path, description_path)))

            publisher_path = '//*[@itemprop="publisher"]/text()'  # a
            item['publisher'] = loader.get_xpath(publisher_path, TakeFirst())

            by_path = '//*[@itemprop="author"]/text()'  # b
            item['author'] = loader.get_xpath(by_path, TakeFirst())

            isbn_path = '//*[@itemprop="isbn"]/text()'  # b
            item['isbn13'] = loader.get_xpath(isbn_path, TakeFirst())

            year_path = '//*[@itemprop="datePublished"]/text()'  # b
            item['year'] = loader.get_xpath(year_path, TakeFirst())

            pages_path = '//*[@itemprop="numberOfPages"]/text()'  # b
            item['pages'] = loader.get_xpath(pages_path, TakeFirst())

            language_path = '//*[@itemprop="inLanguage"]/text()'  # b
            item['language'] = loader.get_xpath(language_path, TakeFirst())

            format_path = '//*[@itemprop="bookFormat"]/text()'  # b
            item['format'] = loader.get_xpath(format_path, TakeFirst())

            url_path = '//a[contains(@href, "http://file")]/@href'
            item['download_url'] = loader.get_xpath(url_path, TakeFirst())

            buy_path = '//a[contains(@href, "http://isbn")]/@href'
            item['buy'] = loader.get_xpath(buy_path, TakeFirst())

            size_path = '//*[contains(text(), "size")]//following-sibling::*'
            item['size'] = loader.get_xpath('%s/*/text()' % size_path, TakeFirst())

            image_path = '//img[contains(@itemprop, "image")]'
            item['image_url'] = loader.get_xpath('%s/@src' % image_path, TakeFirst())

            related_path = '//td[contains(@width, "166")]/a'
            item['related'] = {
                'name': loader.get_xpath('%s/@title' % related_path, TakeFirst()),
                'url': urlparse.urljoin(response.url, loader.get_xpath('%s/@href' % related_path, TakeFirst())),
            }
            pdf_url = item['download_url']
            pdf_filename = ('./books/' +
                            item['name'].replace(',', '_').replace('/', '_').replace(' ', '_') +
                            '-' + item['author'].replace(',', '').replace(' ', '_') +
                            '-' + item['isbn13'].replace('-', '') +
                            '.pdf').replace('__', '_')  # collapse doubled underscores across the whole name

            item['filename'] = pdf_filename

            if self.download_files:
                self.download_file(pdf_url, pdf_filename, response.url)
            print response.url, item['image_url']
            image_url = urlparse.urljoin(response.url, item['image_url'])
            self.download_file(image_url, './%s' % item['image_url'].strip('/'), response.url)

            yield item
Example #10
    def parse_post(self, response):
        post = ItemLoader(item=ArabiaPostItem(), response=response)
        post.default_output_processor = TakeFirst()
        #post.add_xpath('id', '//*[@class="post_content replace_urls"]/@id', MapCompose(int), re=r'(\d+)')
        post.add_xpath('id',
                       '//*[@class="short_url inputtext"]/@value',
                       MapCompose(int),
                       re=r'(\d+)')
        post.add_xpath('title', '//*[@id="nav_title"]/a/text()')
        post.add_xpath('up_votes',
                       '//*[@class="s_upvotes"]/text()',
                       MapCompose(int),
                       re=r'(\d+)')
        post.add_xpath('down_votes',
                       '//*[@class="s_downvotes"]/text()',
                       MapCompose(int),
                       re=r'(\d+)')
        post.add_xpath('points', '//*[@class="post_points ltr"]/text()',
                       MapCompose(int))
        post.add_xpath('author_username',
                       '//*[@class="block username"]/text()')
        post.add_xpath('author_fullname',
                       '//*[@class="block full_name"]/text()',
                       MapCompose(lambda value: value.replace(u'\xa0', u'')))
        post.add_xpath('date', '//*[@class="icon-time"]/../text()')
        post.add_xpath('community',
                       '//*[@class="icon-reorder"]/../a[1]/text()')
        post.add_xpath('topics', '//*[@class="topic"]/text()',
                       MapCompose(string.strip))
        post.add_xpath('url', '//*[@class="short_url inputtext"]/@value')
        post.add_value(
            'type',
            'link' if post.get_xpath('//*[@id="nav_title"]/a/@rel',
                                     TakeFirst()) == 'nofollow' else 'text')
        if post.get_output_value('type') == 'link':
            post.add_xpath('link', '//*[@id="nav_title"]/a/@href')
            post.add_xpath('domain',
                           '//*[@class="post_domain"]/text()',
                           re=r'\((.+?)\)')
        post.add_xpath('content', '//*[@class="post_content replace_urls"]/*',
                       Join('\n'))
        post.add_value('item', 'post')
        yield post.load_item()

        comments = []
        for row in response.selector.xpath(
                '//*[contains(@class, "post_comment")]'):
            comment = ItemLoader(item=ArabiaCommentItem(),
                                 selector=row,
                                 response=response)
            comment.default_output_processor = TakeFirst()
            comment.add_xpath('id', './@id', re=r'(\d+)')
            comment.add_xpath('index',
                              './@class',
                              MapCompose(int),
                              re=r'index(\d+)')
            comment.add_value('post_id', post.get_output_value('id'))
            #comment.add_value('parent_id', '')
            comment.add_xpath('author_username',
                              './/*[@class="comment_user"]/a/text()')
            comment.add_xpath('date', './/*[@class="comment_date"]/text()')
            comment.add_xpath('points',
                              './/*[@class="comment_points ltr"]/text()')
            comment.add_xpath(
                'content',
                './/*[@class="post_content comment_content replace_urls"]/*',
                Join('\n'))
            #comment.add_xpath('url', './/*[@class="comment_short_url"]/a/@href')
            comment.add_value(
                'url', 'https://arabia.io/go/{0}/{1}'.format(
                    post.get_output_value('id'),
                    comment.get_output_value('id')))
            comment.add_value('item', 'comment')
            comments.append(comment)

        for (index, comment) in enumerate(comments):
            if comment.get_output_value('index') == 0:
                comment.add_value('parent_id', 0)
                continue
            for comment_cursor in comments[:index][::-1]:
                if comment_cursor.get_output_value(
                        'index') == comment.get_output_value('index') - 1:
                    comment.add_value('parent_id',
                                      comment_cursor.get_output_value('id'))
                    break

        for comment in comments:
            yield comment.load_item()
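
The threading logic above relies on ItemLoader.get_output_value, which runs a field's output processor over the values collected so far without loading the item; with TakeFirst() as the default output processor, each call returns a single scalar. A minimal illustration, using a plain dict item as the examples above do:

from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst

l = ItemLoader(item={})
l.default_output_processor = TakeFirst()
l.add_value('id', ['42', '43'])
l.get_output_value('id')  # '42' -- first collected value; load_item() has not run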
Example #11
    def parse(self, response):
        loader = ItemLoader(response=response)

        xpaths = [
            '//a[contains(text(), "Next")]',
            '//a[contains(@href, "-ebook.htm")]/@href'
        ]
        for xpath in xpaths:
            for href in loader.get_xpath(xpath):
                yield Request(url=urlparse.urljoin(response.url, href),
                              callback=self.parse,
                              meta=response.meta,
                              dont_filter=True)

        if '-ebook.htm' in response.url:
            item = BookItem()

            name_path = '//*[@itemprop="name"]/text()'  # h1
            item['name'] = loader.get_xpath(name_path, TakeFirst())

            item['url'] = response.url

            description_path = '//div[contains(text(), ".")]/text()'  # span
            item['description'] = loader.get_xpath(description_path,
                                                   TakeFirst())

            publisher_path = '//*[@itemprop="publisher"]/text()'  # a
            item['publisher'] = loader.get_xpath(publisher_path, TakeFirst())

            by_path = '//*[@itemprop="author"]/text()'  # b
            item['author'] = loader.get_xpath(by_path, TakeFirst())

            isbn_path = '//*[@itemprop="isbn"]/text()'  # b
            item['isbn10'] = loader.get_xpath(isbn_path, TakeFirst())
            item['isbn13'] = loader.get_xpath(isbn_path).pop()

            year_path = '//*[@itemprop="datePublished"]/text()'  # b
            item['year'] = loader.get_xpath(year_path, TakeFirst())

            pages_path = '//*[@itemprop="numberOfPages"]/text()'  # b
            item['pages'] = loader.get_xpath(pages_path, TakeFirst())

            language_path = '//*[@itemprop="inLanguage"]/text()'  # b
            item['language'] = loader.get_xpath(language_path, TakeFirst())

            format_path = '//*[@itemprop="bookFormat"]/text()'  # b
            item['format'] = loader.get_xpath(format_path, TakeFirst())

            edition_path = '//*[@itemprop="bookEdition"]/text()'  # b
            item['edition'] = loader.get_xpath(edition_path, TakeFirst())

            series_path = '//*[contains(text(), "Series")]//following-sibling::*'
            item['series'] = ", ".join(
                loader.get_xpath('%s/*/text()' % series_path)).strip()

            size_path = '//*[contains(text(), "Book size")]//following-sibling::*'
            item['size'] = loader.get_xpath('%s/text()' % size_path,
                                            TakeFirst())

            related_url_path = '//h4/a/@href'
            related_name_path = '//h4/a/text()'
            item['related'] = {
                'name':
                loader.get_xpath(related_name_path, TakeFirst()),
                'url':
                urlparse.urljoin(
                    response.url,
                    loader.get_xpath(related_url_path, TakeFirst())),
            }

            url_path = '//div[contains(@id, "dl")]/script'
            script = loader.get_xpath(url_path, TakeFirst())

            download_url = script.split('href="')[1].split('" onclick')[0]
            item['download_url'] = urlparse.urljoin(response.url, download_url)

            pdf_url = item['download_url']
            pdf_filename = './books_/' + item['name'] + ' - ' + item[
                'author'] + ' - ' + item['isbn13'] + '.pdf'

            item['filename'] = pdf_filename

            if self.download_files:
                # req = urllib2.Request(pdf_url)
                # req.add_header('Referer', '%s' % response.url)
                # r = urllib2.urlopen(req)
                # r = r.read()
                # with open(pdf_filename, 'wb') as out:
                #     print r
                #     out.write(r)
                from socket import socket

                host = DOMAIN
                port = 80
                path = response.url.replace(DOMAIN_URL, '')

                s = socket()
                s.connect((host, port))
                s.send("GET %s HTTP/1.1\r\n" % path)
                s.send("Host: %s\r\n" % host)
                s.send("Connection: close\r\n")  # ask the server to close, so the read loop below terminates
                s.send("\r\n")  # a blank line ends the request headers

                for line in s.makefile():
                    print line,
                s.close()

            yield item

            for href in loader.get_xpath('//td/a/@href'):
                yield Request(url=urlparse.urljoin(response.url, href),
                              callback=self.parse,
                              meta=response.meta,
                              dont_filter=True)

        else:
            for href in loader.get_xpath(
                    '//a[contains(@href, "-ebooks-")]/@href'):
                yield Request(url=urlparse.urljoin(response.url, href),
                              callback=self.parse,
                              meta=response.meta,
                              dont_filter=True)