def parse_parts2(self, response):
    log.msg("\tparse_parts time: %s" % int(time.time()), level=log.DEBUG)
    ua = response.request.headers['User-Agent']
    log.msg("\tua: %s" % ua, level=log.DEBUG)
    for part in response.css('table.parts > tbody > tr'):
        il = ItemLoader(item=CarPart(), selector=part)
        il.add_xpath('shop_city', "td[@class='shop']/a/text()")
        il.add_xpath('shop_name', "td[@class='shop']/a/strong/text()")
        shop_url = il.get_xpath("td[@class='shop']/a/@href", TakeFirst())
        photo_url = il.get_xpath("td[@class='photo']/a/@href", TakeFirst())
        il.add_value('shop_url', urljoin(self.main_url, shop_url))
        il.add_value('ext_link', urljoin(self.main_url, photo_url))
        il.add_xpath('info', "td[@class='info']//text()")
        il.add_xpath('price', "td[@class='price']//text()")
        il.add_value('brand', response.meta.get('brand'))
        il.add_value('model', response.meta.get('model'))
        il.add_value('car_part', response.meta.get('car_part'))
        il.add_value('category', response.meta.get('category'))
        item = il.load_item()
        if item.is_valid():
            yield item
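# is_valid() is not part of scrapy.Item, so CarPart must define it. A
# minimal sketch of what such an item might look like; the required-field
# check is a hypothetical illustration, not taken from the original source:
import scrapy

class CarPart(scrapy.Item):
    shop_city = scrapy.Field()
    shop_name = scrapy.Field()
    shop_url = scrapy.Field()
    ext_link = scrapy.Field()
    info = scrapy.Field()
    price = scrapy.Field()
    brand = scrapy.Field()
    model = scrapy.Field()
    car_part = scrapy.Field()
    category = scrapy.Field()

    def is_valid(self):
        # Hypothetical rule: an item is usable once the fields the pipeline
        # presumably needs are all non-empty.
        return all(self.get(field) for field in ('shop_name', 'price', 'info'))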
def Loader_content(response):
    l = ItemLoader(item={}, response=response)
    sub_title = ''
    if len(l.get_xpath('//*[@class="pagenow"]/text()')) >= 1:
        sub_title += '-' + l.get_xpath('//*[@class="pagenow"]/text()')[0]
    l.add_value(
        'title',
        l.get_xpath('//*[@class="b_list-1a-1c"]/text()')[0] + sub_title)
    l.add_value('src_url', response.url)
    content_img = l.get_xpath('//*[@class="content-img"]/p/img/@src')
    l.add_value('content', content_img)
    l.add_value('image_urls', content_img)
    print('Downloading images:', content_img)
    time.sleep(len(content_img))  # crude throttle: one second per image
    return l.load_item()
def parse(self, response):
    item = PdfItem()
    loader = ItemLoader(response=response)
    pdf_path = '//*[contains(text(), "[PDF]")]'
    pdf_url_path = '%s//following-sibling::*' % pdf_path
    # Take the link's href as a single string so os.path.basename() below
    # gets a string rather than a list of serialized elements.
    item['url'] = loader.get_xpath('%s/@href' % pdf_url_path, TakeFirst())
    item['title'] = loader.get_xpath('%s/text()' % pdf_url_path, TakeFirst())
    summary_path = '%s//parent::*//parent::*/*[@class="s"]/*' % pdf_url_path
    description_path = '%s/*[@class="st"]/*' % summary_path
    item['description'] = loader.get_xpath(
        '%s/text()|%s/*/text()' % (description_path, description_path))
    similar_path = '%s/*[contains(@class, "f")]//a[contains(@href, "q=related:")]' % summary_path
    # similar_url = loader.get_xpath('%s/@href' % similar_path, TakeFirst())
    # yield Request(
    #     url=urlparse.urljoin(response.url, similar_url),
    #     callback=self.parse,
    #     meta=response.meta,
    #     dont_filter=True
    # )
    #
    # next_path = '//*[@class="pn"]'
    # next_url = loader.get_xpath('%s/@href' % next_path, TakeFirst())
    # yield Request(
    #     url=urlparse.urljoin(response.url, next_url),
    #     callback=self.parse,
    #     meta=response.meta,
    #     dont_filter=True
    # )
    pdf_url = item['url']
    print item
    if pdf_url:
        pdf_filename = os.path.basename(pdf_url)
        pdf_filepath = '%s/%s/%s' % (DOWNLOAD_DIR, SEARCH_TERM, pdf_filename)
        if self.download_files:
            self.download_file(pdf_url, pdf_filepath, response.url)
    yield item
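# self.download_file(...) is called in several of these snippets but never
# defined here. A minimal sketch of what it might look like, assuming a
# plain GET with a Referer header written straight to disk (the signature
# mirrors the call sites; the body is an assumption, not the original):
import os
import urllib2

def download_file(self, url, filepath, referer):
    directory = os.path.dirname(filepath)
    if directory and not os.path.exists(directory):
        os.makedirs(directory)
    request = urllib2.Request(url, headers={'Referer': referer})
    with open(filepath, 'wb') as out:
        out.write(urllib2.urlopen(request).read())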
def Loader_content(response):
    l = ItemLoader(item={}, response=response)
    l.add_css('title', '.k_jianjie-3a-1-name::text')
    l.add_value('date', l.get_xpath('//*[@class="k_jianjie-3a-2b"]/text()')[2])
    #l.add_value('url', _response.url[len(self._scheme + "//" + self.allowed_domains[0]):])
    l.add_css('down', '.k_jianjie-3a-5down::text', TrimAll())
    conver_img = l.get_xpath('//*[@id="k_jianjie-2b"]/a/img/@src')
    content_img = l.get_xpath('//*[@class="content"]/p/img/@src')
    l.add_value('src_url', response.url)
    l.add_value('preview', conver_img)
    l.add_value('content', content_img)
    l.add_value('image_urls', conver_img + content_img)
    print('Downloading images:', conver_img + content_img)
    #time.sleep(len(conver_img + content_img))
    return l.load_item()
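# TrimAll is not a built-in Scrapy processor, so these snippets assume a
# custom one. A plausible sketch, assuming it simply strips surrounding
# whitespace from every extracted value (the exact semantics are a guess):
from scrapy.loader.processors import MapCompose

def TrimAll():
    return MapCompose(lambda value: value.strip())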
def parse(self, response):
    l = ItemLoader(item=CoserItem(), response=response)
    l.add_xpath('name', "//h1[@class='js-post-title']/text()")
    l.add_xpath('info', "//div[@class='post__info']/div[@class='post__type post__info-group']/span/text()")
    urls = l.get_xpath('//img[@class="detail_std detail_clickable"]/@src')
    urls = [url.replace('/w650', '') for url in urls]
    l.add_value('image_urls', urls)
    l.add_value('url', response.url)
    return l.load_item()
def parse_item(self, response):
    l = ItemLoader(item=CoserspiderItem(), response=response)
    l.add_xpath('name', "//h1[@class='js-post-title']/text()")
    l.add_xpath('info', "//div[@class='post__info']/div[@class='post__type post__info-group']/span/text()")
    urls = l.get_xpath('//img[@class="detail_std detail_clickable"]/@src')
    urls = [url.replace('/w650', '') for url in urls]
    # urls and response.url are already-extracted values, not XPath
    # expressions, so they go through add_value rather than add_xpath.
    l.add_value('image_urls', urls)
    l.add_value('url', response.url)
    return l.load_item()
def Loader_index(self, item_selector):
    l = ItemLoader(item={}, selector=item_selector)
    conver_img = l.get_xpath('.//*[@class="lz_img"]/img/@src')
    l.add_xpath('title', './/*[@class="k_list-lb-2"]/div[1]/a[1]/text()')
    l.add_xpath('url', './/*[@class="k_list-lb-2"]/div[1]/a/@href')
    l.add_value('preview', conver_img)
    l.add_css('date', '#k_list-lb-2-f::text', re=r'(\d{4}-\d{2}-\d{2})')
    l.add_value('image_urls', conver_img)
    return l.load_item()
def parse(self, response):
    try:
        page = response.url.split("/")[-1].split(".")[0]
        self.log('ID: %s' % page)
        book_name = response.css('h2::text').extract_first()
        #self.log('book_name: %s' % book_name)
        book_author = response.css('h4::text').extract_first().replace('Tác giả: ', '')
        #self.log('book_author: %s' % book_author)
        book_category = response.css('h4 a::text').extract_first().replace('Thể loại: ', '')
        #self.log('book_category: %s' % book_category)
        book_cover = response.xpath('//img[@class="img-thumbnail"]//@src').extract_first()
        # BookDownload = namedtuple('BookDownload', ['source', 'epub', 'mobi', 'pdf', 'azw3', 'prc'])
        # book_downloads = []
        # for book_download in response.css('div.book-download'):
        #     # print(book_download.css('a::text').extract_first())
        #     # bd = BookDownload._fields_defaults
        #     source = book_download.css('a::text').extract_first()
        #     epub = book_download.css('a')[1].extract()
        #     # book_downloads.append(bd)
        #     self.log('source: %s' % source)
        #     self.log('epub: %s' % epub)
        # self.log('book_downloads: %s' % book_downloads)
        loader = ItemLoader(response=response)
        book_description = loader.get_xpath('//div[@class="book-description"]/node()', Join())
        #self.log('book_description: %s' % book_description)
    except:
        self.log('ERROR in: %s' % response.url)
        # Bail out: the locals yielded below may be unbound if we got here.
        return
    yield {
        'id': page,
        'name': book_name,
        'author': book_author,
        'category': book_category,
        'description': book_description,
        'cover': book_cover,
    }
def parse(self, response):
    if '%s/404' % DOMAIN_URL not in response.url:
        item = BookItem()
        loader = ItemLoader(response=response)
        name_path = '//h1/text()|//h3/text()'  # h1
        item['name'] = " ".join(loader.get_xpath(name_path))
        item['url'] = response.url
        description_path = '//*[@itemprop="description"]'  # span
        item['description'] = "".join(loader.get_xpath(
            '%s/text()|%s/*/text()' % (description_path, description_path)))
        publisher_path = '//*[@itemprop="publisher"]/text()'  # a
        item['publisher'] = loader.get_xpath(publisher_path, TakeFirst())
        by_path = '//*[@itemprop="author"]/text()'  # b
        item['author'] = loader.get_xpath(by_path, TakeFirst())
        isbn_path = '//*[@itemprop="isbn"]/text()'  # b
        item['isbn13'] = loader.get_xpath(isbn_path, TakeFirst())
        year_path = '//*[@itemprop="datePublished"]/text()'  # b
        item['year'] = loader.get_xpath(year_path, TakeFirst())
        pages_path = '//*[@itemprop="numberOfPages"]/text()'  # b
        item['pages'] = loader.get_xpath(pages_path, TakeFirst())
        language_path = '//*[@itemprop="inLanguage"]/text()'  # b
        item['language'] = loader.get_xpath(language_path, TakeFirst())
        format_path = '//*[@itemprop="bookFormat"]/text()'  # b
        item['format'] = loader.get_xpath(format_path, TakeFirst())
        url_path = '//a[contains(@href, "http://file")]/@href'
        item['download_url'] = loader.get_xpath(url_path, TakeFirst())
        buy_path = '//a[contains(@href, "http://isbn")]/@href'
        item['buy'] = loader.get_xpath(buy_path, TakeFirst())
        size_path = '//*[contains(text(), "size")]//following-sibling::*'
        item['size'] = loader.get_xpath('%s/*/text()' % size_path, TakeFirst())
        image_path = '//img[contains(@itemprop, "image")]'
        item['image_url'] = loader.get_xpath('%s/@src' % image_path, TakeFirst())
        related_path = '//td[contains(@width, "166")]/a'
        item['related'] = {
            'name': loader.get_xpath('%s/@title' % related_path, TakeFirst()),
            'url': urlparse.urljoin(response.url, loader.get_xpath('%s/@href' % related_path, TakeFirst())),
        }
        pdf_url = item['download_url']
        # Build a filesystem-safe filename; the trailing replace collapses
        # the double underscores the earlier substitutions can produce.
        pdf_filename = ('./books/' +
                        item['name'].replace(',', '_').replace('/', '_').replace(' ', '_') +
                        '-' + item['author'].replace(',', '').replace(' ', '_') +
                        '-' + item['isbn13'].replace('-', '') +
                        '.pdf').replace('__', '_')
        item['filename'] = pdf_filename
        if self.download_files:
            self.download_file(pdf_url, pdf_filename, response.url)
        print response.url, item['image_url']
        image_url = urlparse.urljoin(response.url, item['image_url'])
        self.download_file(image_url, './%s' % item['image_url'].strip('/'), response.url)
        yield item
def parse_post(self, response):
    post = ItemLoader(item=ArabiaPostItem(), response=response)
    post.default_output_processor = TakeFirst()
    #post.add_xpath('id', '//*[@class="post_content replace_urls"]/@id', MapCompose(int), re=r'(\d+)')
    post.add_xpath('id', '//*[@class="short_url inputtext"]/@value', MapCompose(int), re=r'(\d+)')
    post.add_xpath('title', '//*[@id="nav_title"]/a/text()')
    post.add_xpath('up_votes', '//*[@class="s_upvotes"]/text()', MapCompose(int), re=r'(\d+)')
    post.add_xpath('down_votes', '//*[@class="s_downvotes"]/text()', MapCompose(int), re=r'(\d+)')
    post.add_xpath('points', '//*[@class="post_points ltr"]/text()', MapCompose(int))
    post.add_xpath('author_username', '//*[@class="block username"]/text()')
    post.add_xpath('author_fullname', '//*[@class="block full_name"]/text()',
                   MapCompose(lambda value: value.replace(u'\xa0', u'')))
    post.add_xpath('date', '//*[@class="icon-time"]/../text()')
    post.add_xpath('community', '//*[@class="icon-reorder"]/../a[1]/text()')
    post.add_xpath('topics', '//*[@class="topic"]/text()', MapCompose(string.strip))
    post.add_xpath('url', '//*[@class="short_url inputtext"]/@value')
    post.add_value(
        'type',
        'link' if post.get_xpath('//*[@id="nav_title"]/a/@rel', TakeFirst()) == 'nofollow' else 'text')
    if post.get_output_value('type') == 'link':
        post.add_xpath('link', '//*[@id="nav_title"]/a/@href')
        post.add_xpath('domain', '//*[@class="post_domain"]/text()', re=r'\((.+?)\)')
    post.add_xpath('content', '//*[@class="post_content replace_urls"]/*', Join('\n'))
    post.add_value('item', 'post')
    yield post.load_item()

    comments = []
    for row in response.selector.xpath('//*[contains(@class, "post_comment")]'):
        comment = ItemLoader(item=ArabiaCommentItem(), selector=row, response=response)
        comment.default_output_processor = TakeFirst()
        comment.add_xpath('id', './@id', re=r'(\d+)')
        comment.add_xpath('index', './@class', MapCompose(int), re=r'index(\d+)')
        comment.add_value('post_id', post.get_output_value('id'))
        #comment.add_value('parent_id', '')
        comment.add_xpath('author_username', './/*[@class="comment_user"]/a/text()')
        comment.add_xpath('date', './/*[@class="comment_date"]/text()')
        comment.add_xpath('points', './/*[@class="comment_points ltr"]/text()')
        comment.add_xpath('content', './/*[@class="post_content comment_content replace_urls"]/*', Join('\n'))
        #comment.add_xpath('url', './/*[@class="comment_short_url"]/a/@href')
        comment.add_value(
            'url',
            'https://arabia.io/go/{0}/{1}'.format(post.get_output_value('id'),
                                                  comment.get_output_value('id')))
        comment.add_value('item', 'comment')
        comments.append(comment)

    # Resolve each comment's parent: the nearest preceding comment whose
    # nesting index is exactly one level shallower.
    for (index, comment) in enumerate(comments):
        if comment.get_output_value('index') == 0:
            comment.add_value('parent_id', 0)
            continue
        for comment_cursor in comments[:index][::-1]:
            if comment_cursor.get_output_value('index') == comment.get_output_value('index') - 1:
                comment.add_value('parent_id', comment_cursor.get_output_value('id'))
                break

    for comment in comments:
        yield comment.load_item()
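# The parent-resolution pass above walks backwards from each comment to the
# nearest preceding comment one nesting level shallower. A tiny standalone
# demo of that rule, using hypothetical nesting depths rather than scraped
# data:
def resolve_parents(depths):
    # For each comment, return the list position of its parent comment,
    # or None for top-level comments (depth 0).
    parents = []
    for i, depth in enumerate(depths):
        parent = None
        if depth > 0:
            for j in range(i - 1, -1, -1):
                if depths[j] == depth - 1:
                    parent = j
                    break
        parents.append(parent)
    return parents

print(resolve_parents([0, 1, 2, 1]))  # -> [None, 0, 1, 0]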
def parse(self, response):
    loader = ItemLoader(response=response)
    # Follow pagination links and book detail pages; both XPaths must yield
    # href strings for urljoin below.
    xpaths = [
        '//a[contains(text(), "Next")]/@href',
        '//a[contains(@href, "-ebook.htm")]/@href'
    ]
    for xpath in xpaths:
        for href in loader.get_xpath(xpath):
            yield Request(url=urlparse.urljoin(response.url, href),
                          callback=self.parse,
                          meta=response.meta,
                          dont_filter=True)

    if '-ebook.htm' in response.url:
        item = BookItem()
        name_path = '//*[@itemprop="name"]/text()'  # h1
        item['name'] = loader.get_xpath(name_path, TakeFirst())
        item['url'] = response.url
        description_path = '//div[contains(text(), ".")]/text()'  # span
        item['description'] = loader.get_xpath(description_path, TakeFirst())
        publisher_path = '//*[@itemprop="publisher"]/text()'  # a
        item['publisher'] = loader.get_xpath(publisher_path, TakeFirst())
        by_path = '//*[@itemprop="author"]/text()'  # b
        item['author'] = loader.get_xpath(by_path, TakeFirst())
        isbn_path = '//*[@itemprop="isbn"]/text()'  # b
        item['isbn10'] = loader.get_xpath(isbn_path, TakeFirst())
        item['isbn13'] = loader.get_xpath(isbn_path).pop()
        year_path = '//*[@itemprop="datePublished"]/text()'  # b
        item['year'] = loader.get_xpath(year_path, TakeFirst())
        pages_path = '//*[@itemprop="numberOfPages"]/text()'  # b
        item['pages'] = loader.get_xpath(pages_path, TakeFirst())
        language_path = '//*[@itemprop="inLanguage"]/text()'  # b
        item['language'] = loader.get_xpath(language_path, TakeFirst())
        format_path = '//*[@itemprop="bookFormat"]/text()'  # b
        item['format'] = loader.get_xpath(format_path, TakeFirst())
        edition_path = '//*[@itemprop="bookEdition"]/text()'  # b
        item['edition'] = loader.get_xpath(edition_path, TakeFirst())
        series_path = '//*[contains(text(), "Series")]//following-sibling::*'
        item['series'] = ", ".join(loader.get_xpath('%s/*/text()' % series_path)).strip()
        size_path = '//*[contains(text(), "Book size")]//following-sibling::*'
        item['size'] = loader.get_xpath('%s/text()' % size_path, TakeFirst())
        related_url_path = '//h4/a/@href'
        related_name_path = '//h4/a/text()'
        item['related'] = {
            'name': loader.get_xpath(related_name_path, TakeFirst()),
            'url': urlparse.urljoin(response.url, loader.get_xpath(related_url_path, TakeFirst())),
        }
        # The download link is injected by an inline script; pull the href
        # out of the raw <script> text.
        url_path = '//div[contains(@id, "dl")]/script'
        script = loader.get_xpath(url_path, TakeFirst())
        download_url = script.split('href="')[1].split('" onclick')[0]
        item['download_url'] = urlparse.urljoin(response.url, download_url)
        pdf_url = item['download_url']
        pdf_filename = './books_/' + item['name'] + ' - ' + item['author'] + ' - ' + item['isbn13'] + '.pdf'
        item['filename'] = pdf_filename
        if self.download_files:
            # req = urllib2.Request(pdf_url)
            # req.add_header('Referer', '%s' % response.url)
            # r = urllib2.urlopen(req)
            # r = r.read()
            # with open(pdf_filename, 'wb') as out:
            #     print r
            #     out.write(r)
            from socket import socket
            host = DOMAIN
            port = 80
            path = response.url.replace(DOMAIN_URL, '')
            s = socket()
            s.connect((host, port))
            s.send("GET %s HTTP/1.1\r\n" % path)
            s.send("Host: %s\r\n" % host)
            s.send("Content-Type: text/html\r\n")
            s.send("Connection: close\r\n")  # so makefile() sees EOF
            s.send("\r\n")  # blank line terminates the request headers
            for line in s.makefile():
                print line,
            s.close()
        yield item
        for href in loader.get_xpath('//td/a/@href'):
            yield Request(url=urlparse.urljoin(response.url, href),
                          callback=self.parse,
                          meta=response.meta,
                          dont_filter=True)
    else:
        for href in loader.get_xpath('//a[contains(@href, "-ebooks-")]/@href'):
            yield Request(url=urlparse.urljoin(response.url, href),
                          callback=self.parse,
                          meta=response.meta,
                          dont_filter=True)
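# The split-on-'href="' extraction above assumes a fixed attribute order in
# the inline <script> and raises IndexError when the marker is absent. A
# regex-based sketch of the same idea that fails more gracefully (the
# script's exact format is inferred from the split logic, not verified):
import re

def extract_download_url(script_text):
    # Grab the first href="..." value embedded in the script, if any.
    match = re.search(r'href="([^"]+)"', script_text)
    return match.group(1) if match else None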