def parse_item(self, response):
    hxs = HtmlXPathSelector(response)
    items = []
    # extract urls from the src attribute of img tags
    # Example: <img src="http://url"> will extract http://url
    images = hxs.select('//img')
    for image in images:
        item = CrawlerItem()
        item['page'] = response.url
        item['picture'] = image.select('@src').extract()[0]
        item = self.CALLBACK_processImage(item['picture'], item)
        items.append(item)
    urls = hxs.select('//a')
    for url in urls:
        if url.select('img'):
            item = CrawlerItem()
            item['page'] = response.url
            item['picture'] = url.select('img/@src').extract()[0]
            item['picture_destination'] = url.select('@href').extract()[0]
            item = self.CALLBACK_processImage(item['picture_destination'], item)
            items.append(item)
    #if len(items):
    log.msg('%s images were found.' % len(items), level=log.INFO)
    return items
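# CALLBACK_processImage is invoked above but not defined in this section.
# A minimal sketch of its apparent contract, assuming it only normalizes the
# picture URL against the page URL before the item is collected (the real
# hook may do more, such as scheduling a download):
try:
    from urllib.parse import urljoin    # Python 3
except ImportError:
    from urlparse import urljoin        # Python 2

def CALLBACK_processImage(self, picture_url, item):
    # Resolve relative src/href values; otherwise hand the item back
    # to the caller unchanged.
    item['picture'] = urljoin(item['page'], picture_url)
    return item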
def parse_comment(self, response):
    picture_href = response.xpath(
        '//div[re:match(@class,"media-preview-content.*")]/a/@href'
    ).extract_first()
    # valid_link was declared global in the original; it is only used within
    # this callback, so a local variable is safer (globals are shared across
    # concurrent callbacks).
    if picture_href is not None:
        response.meta['info']["url"] = picture_href
        response.meta['info']["type"] = picture_href.split(".")[-1]
        valid_link = True
        yield CrawlerItem(response.meta['info'])
    elif self.picture_type.match(response.meta['info']["url"]):
        response.meta['info']["type"] = response.meta['info']["url"].split(".")[-1]
        valid_link = True
        yield CrawlerItem(response.meta['info'])
    else:
        valid_link = False
        print('%s is not a picture; data-url is %s'
              % (response.url, response.meta['info']["url"]))
    if valid_link:
        comments = response.xpath(
            '//div[@id="siteTable_' + response.meta["info"]["_id"]
            + '"]/div[re:match(@class," ?thing id-.*")]')
        if comments:
            get_comments = self.analysis_comment(
                response.meta["info"]['_id'], response.meta["info"]['href'],
                response.meta["info"]["_id"], comments)
            i = 0
            while i < len(comments):
                # next() works on both Python 2 and 3; the original used the
                # Python-2-only generator.next() method.
                comment_item, num = next(get_comments)
                if num == 0:
                    i += 1
                if comment_item:
                    yield comment_item
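# analysis_comment is not shown in this section. From the consumer loop above,
# it is a generator yielding (comment_item, num) pairs, where num == 0 tells
# the caller to advance to the next top-level comment. A minimal sketch of
# that contract, assuming one item per top-level comment and ignoring nested
# replies (selectors below are guesses):
def analysis_comment(self, post_id, href, parent_id, comments):
    for comment in comments:
        item = CrawlerItem()
        item['_id'] = post_id
        item['href'] = href
        item['body'] = ' '.join(
            comment.xpath('.//div[@class="md"]//text()').extract())
        yield item, 0  # 0: the caller's index moves to the next comment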
def parse_article(self, response):
    main = response.xpath('//div[@id="main-content"]')
    title = main.xpath('//h1[@id="page-title"]/span/text()').extract()
    content = main.xpath('//div[@class="field-item even"]/p').extract()
    item = CrawlerItem()
    if len(content) == 0:
        return
    item["url"] = response.url
    item["content"] = []
    cleanr = re.compile('<.*?>')  # strips HTML tags
    for c in content:
        if len(c) > 1:
            c = re.sub(cleanr, '', c).replace('\n', '')
            item["content"].append(c)
    # Skip articles shorter than 500 words.
    count = 0
    for line in item["content"]:
        count += len(line.split())
    if count < 500:
        return
    # Remove characters that are invalid in file names; this single regex
    # replaces the original chain of nine .replace() calls.
    item["title"] = re.sub(r'[<>:"/|*?]', '', title[0])
    return item
def process_TaoCan(self, pageindex):
    res_items = []
    values = {'parameter': 'FF32=&FF33=',
              'areaCode': '025',
              'sortFlag': 'xl',
              'cssFlag': 'down',
              'pageindex': str(pageindex),
              'tableNumber': '03'}
    pj = json.loads(
        post('http://js.189.cn/nmall/product/queryPackageList.do', values))
    pageTotal = pj['pageCount']
    for offer in pj['offerList']:
        item = CrawlerItem()
        item['url'] = ('http://js.189.cn/nmall/product/queryPackageXq/'
                       + offer['FNUMBER'] + '.html')
        item['title'] = offer['FNUMBER']
        item['table'] = json.dumps(offer, ensure_ascii=False).encode('utf-8')
        item['table2'] = ''
        item['need_know'] = ''
        item['faq'] = ''
        res_items.append(item)
    # Only the first page fans out to the remaining pages; the original
    # recursed from every page, which re-fetches pages and never terminates.
    # extend() (rather than append()) flattens the recursive results into
    # one list of items.
    if pageindex == 1:
        for i in range(2, pageTotal + 1):
            res_items.extend(self.process_TaoCan(i))
    return res_items
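# post() is used above but defined elsewhere. A minimal sketch, assuming it
# form-encodes the values and returns the raw response body for json.loads
# (the real helper may add headers, cookies, or retries):
try:
    from urllib.parse import urlencode          # Python 3
    from urllib.request import urlopen
except ImportError:
    from urllib import urlencode, urlopen       # Python 2

def post(url, values):
    data = urlencode(values).encode('utf-8')
    return urlopen(url, data).read()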
def parse_broadbandInfo(self, response):
    self.log('%s' % response.url)
    bodys = response.body.split('<html>')
    items = []
    filename = ('G:/Github/crawler/out/broadbandInfo/'
                + response.url.split('/')[-1])
    fout = open(filename, 'wb')
    for body in bodys:
        soup = bs(body, 'html.parser')  # explicit parser avoids bs4's warning
        kd_xqinfo_res = soup.find('div', class_='kd_xqinfo')
        if kd_xqinfo_res is None:
            continue
        item = CrawlerItem()
        item['url'] = response.url
        item['title'] = kd_xqinfo_res.find('h2').string
        fout.write(item['title'] + '\n')
        tr_s = kd_xqinfo_res.find_all(has_tr_no_displayNone)
        tableContent = ''
        for tr in tr_s:
            for ss in tr.stripped_strings:
                tableContent += ss + '\n'
        item['table'] = tableContent
        item['need_know'] = ''
        items.append(item)
        fout.write(item['table'])
        break
    fout.close()
    # The original dropped the collected items here; return them so Scrapy
    # can pass them through the item pipelines.
    return items
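# has_tr_no_displayNone is passed to find_all above but is not defined in
# this section. A minimal sketch, assuming it selects <tr> tags that are not
# hidden via an inline "display:none" style:
def has_tr_no_displayNone(tag):
    return (tag.name == 'tr'
            and 'display:none' not in tag.get('style', '').replace(' ', ''))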
def parse_details(self, response):
    items = CrawlerItem()
    job_page = response.selector.xpath('//div[@class= "section single"]')
    jobs_meta = job_page.xpath('//div[starts-with(@class, "job-meta")]')
    items['url'] = response.url
    items['title'] = job_page.xpath(
        'div/h1[@class= "title"]/text()').extract_first().strip()
    items['jobtype'] = jobs_meta.xpath(
        'span[@class = "job-type"]/span/text()').extract_first()
    items['location'] = jobs_meta.xpath(
        'span[@class = "location"]/span/text()').extract_first()
    items['organisation'] = jobs_meta.xpath(
        'span[@class = "company"]/text()').extract_first()
    items['date_posted'] = jobs_meta.xpath(
        'span[@class = "date"]/text()').extract_first()
    footer_div = job_page.xpath('//div[@class= "content-bar iconfix foot"]')
    items['category'] = footer_div.xpath(
        'p[@class= "meta"]//a/text()').extract_first()
    items['days_to_expiry'] = footer_div.xpath(
        'p[@class= "meta"]//span[@class= "expiry"]/text()').extract_first()
    contents_table = bs(response.body, 'html.parser').find(
        'div', {'class': 'section_content'}).children
    items['job_details'] = self.process_details_soup(contents_table)
    #items['job_details'] = html.fromstring(job_page.xpath(
    #    '//div[@class= "section_content"]').extract()).text_content().strip()
    yield items
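# process_details_soup is called above but defined elsewhere. Judging by the
# call site, it takes the child nodes of the section_content <div> and
# flattens them to plain text. A minimal sketch under that assumption:
def process_details_soup(self, children):
    parts = []
    for child in children:
        # Tag nodes expose get_text(); bare NavigableStrings are used as-is.
        text = child.get_text() if hasattr(child, 'get_text') else str(child)
        text = text.strip()
        if text:
            parts.append(text)
    return '\n'.join(parts)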
def parse(self, response):
    telephones = Selector(response).xpath('//body/section/ul/li')
    for telephone in telephones:
        item = CrawlerItem()
        item['name'] = telephone.xpath('a/h3/text()').extract_first()
        item['price'] = telephone.xpath(
            'a/div[@class="price"]/strong/text()').extract_first()
        # Lazy-loaded images keep the real URL in data-original; fall back
        # to src when that attribute is missing or empty.
        item['img'] = telephone.xpath('a/img/@data-original').extract_first()
        if not item['img']:
            item['img'] = telephone.xpath('a/img/@src').extract_first()
        for row in telephone.xpath('a/figure/span/text()'):
            key, value = row.extract().split(':', 1)
            key = 'screen' if key.lower() == "màn hình" else key.lower()
            if key == 'ram':
                # The RAM row bundles several "key: value" pairs separated
                # by commas, e.g. "RAM: 4 GB, ROM: 64 GB".
                for data in row.extract().split(','):
                    k, v = data.split(':', 1)
                    item[k.strip().lower()] = v
            else:
                item[key] = value
            print(key, value)
        # The original called .replace('\"', '"'), which is a no-op;
        # unescaping literal \" sequences was presumably the intent.
        if 'screen' in item:
            item['screen'] = item['screen'].replace('\\"', '"')
        yield item
def parse(self, response):
    logger.debug('Getting URL: %s', response.url)
    items = []
    if is_document(response.url):
        item = CrawlerItem()
        item['body'] = response
        item['link'] = response.url
        items.append(item)
        return items
    try:
        sel = Selector(response)
        for link in sel.xpath('//a'):
            href = link.xpath('@href').extract()
            if not href:
                continue
            lnk = href[0].strip()
            if lnk.startswith('#') or not lnk or lnk.startswith('mailto:'):
                continue
            url = urllib.parse.urljoin(response.url, lnk)
            items.append(Request(url))
    except Exception as e:
        logger.error(e)
    return items
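# is_document is called above but defined elsewhere. A plausible minimal
# sketch, assuming it matches the URL path against known document extensions:
import urllib.parse

def is_document(url):
    path = urllib.parse.urlparse(url).path.lower()
    return path.endswith(('.pdf', '.doc', '.docx', '.ppt',
                          '.pptx', '.xls', '.xlsx'))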
def parse_page(self, response):
    """Scrapes information from pages into items"""
    item = CrawlerItem()
    item['url'] = response.url.encode('utf-8')
    item['visited'] = datetime.datetime.now().isoformat().encode('utf-8')
    item['published'] = get_first(
        response.selector.xpath('//time/@datetime').extract())
    item['title'] = get_first(
        response.selector.xpath(
            '//meta[@property="og:title"]/@content').extract())
    item['description'] = get_first(
        response.selector.xpath(
            '//meta[@name="description"]/@content').extract())
    item['text'] = "".join([
        s.strip().encode('utf-8') for s in response.selector.css(
            '.article-content>.rtf-content-wrapper>P').xpath(
                './/text()').extract()
    ])
    item['author'] = [
        s.encode('utf-8') for s in response.selector.xpath(
            '//div[@class="name"]/a[@rel="author"]/text()').extract()
    ]
    item['keywords'] = [
        s.encode('utf-8') for s in response.selector.xpath(
            '//meta[@name="news_keywords"]/@content').extract()
    ]
    return item
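# get_first is used by the parse_page callbacks here and below but is not
# defined in this section. A minimal sketch, assuming it only guards against
# empty extract() results:
def get_first(values, default=''):
    """Return the first extracted value, or a default when the list is empty."""
    return values[0] if values else default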
def parse_page(self, response):
    """Scrapes information from pages into items"""
    item = CrawlerItem()
    item['url'] = response.url.encode('utf-8')
    item['visited'] = datetime.datetime.now().isoformat().encode('utf-8')
    item['published'] = get_first(
        response.selector.xpath(
            '//span[@class="Datum"]/@content').extract())
    item['title'] = get_first(
        response.selector.xpath(
            '//meta[@property="og:title"]/@content').extract())
    item['description'] = get_first(
        response.selector.xpath(
            '//meta[@property="og:description"]/@content').extract()).strip()
    item['text'] = "".join([
        s.strip().encode('utf-8') for s in response.selector.xpath(
            '//div[@class="FAZArtikelText"]/div/p/text()').extract()
    ])
    item['author'] = [
        s.encode('utf-8') for s in response.selector.xpath(
            '//span[@class="Autor"]/span[@class="caps last"]/a/span/text()'
        ).extract()
    ]
    item['keywords'] = [
        s.encode('utf-8') for s in response.selector.xpath(
            '//meta[@name="keywords"]/@content').extract()
    ]
    return item
def parse_page(self, response):
    """Scrapes information from pages into items"""
    item = CrawlerItem()
    item['url'] = response.url.encode('utf-8')
    item['visited'] = datetime.datetime.now().isoformat().encode('utf-8')
    item['published'] = get_first(
        response.selector.xpath('//meta[@name="date"]/@content').extract())
    item['title'] = get_first(
        response.selector.xpath(
            '//meta[@property="og:title"]/@content').extract())
    item['description'] = get_first(
        response.selector.xpath(
            '//meta[@name="description"]/@content').extract())
    item['text'] = "".join([
        s.strip().encode('utf-8') for s in response.selector.css(
            '.article__item').css('.paragraph').xpath('.//text()').extract()
    ])
    item['author'] = [
        s.encode('utf-8') for s in response.selector.css('.byline').css(
            'span[itemprop="name"]').xpath('./text()').extract()
    ]
    item['keywords'] = [
        s.encode('utf-8') for s in response.selector.xpath(
            '//meta[@name="keywords"]/@content').extract()
    ]
    # Handle next pages
    next_page = get_first(
        response.selector.xpath('//link[@rel="next"]/@href').extract())
    if next_page:
        self.logger.debug("Next page found: " + next_page)
        yield Request(next_page, callback=self.parse_page)
    yield item
def parse(self, response):
    print('-' * 76)
    print('-' * 76)
    # XPath matching rules; extract_first(default='空') replaces the
    # original's repeated try/except-around-extract()[0] blocks.
    for each in response.xpath("//li"):
        item = CrawlerItem()
        item["title"] = each.xpath("./div/a/text()").extract_first(default='空')
        item["name"] = each.xpath(
            "./div/div[1]/div/a/text()").extract_first(default='空')
        href = each.xpath("./div/div[2]/a[2]/@href").extract_first()
        item["href"] = 'https://www.jianshu.com' + href if href else '空'
        item["type"] = each.xpath(
            "./div/div[2]/a[1]/text()").extract_first(default='空')
        item["time"] = each.xpath(
            "./div/div[1]/div/span/@data-shared-at").extract_first(default='空')
        link = each.xpath("./div/div[2]/a[1]/@href").extract_first()
        if link:
            extract = 'https://www.jianshu.com' + link
            if extract not in DemoSpider.start_urls:
                DemoSpider.start_urls.append(extract)
        # Hand the item to the pipeline
        yield item
    DemoSpider.index += 1
    if DemoSpider.index < len(DemoSpider.start_urls) - 1:
        # Hand the next request back to the engine. Note: the original
        # indexed self.url here; start_urls is the list actually grown above.
        yield scrapy.Request(DemoSpider.start_urls[DemoSpider.index],
                             callback=self.parse)
def parse_page_contents(self, response):
    id_ = response.xpath(
        '//*[@id="main"]/div[1]/div/div[2]/h2/text()')[0].extract()
    reviews = response.xpath('//*[@id="main"]/div[3]/div/div[2]/ol/li')
    for review in reviews:
        # Create a fresh item per review so previously yielded items are
        # not mutated (the original reused a single item instance).
        item = CrawlerItem()
        item["title"] = review.xpath(
            './div/div/div/div/div/div[1]/div[1]/div[1]/div[1]/a/text()'
        )[0].extract()
        item["id"] = id_
        item["score"] = review.xpath(
            './div/div/div/div/div/div[1]/div[1]/div[2]/div/text()'
        )[0].extract()
        # Long reviews are wrapped in an extra <span>; fall back to the
        # direct span text when that wrapper is absent. The original forced
        # this fallback by raising a custom MakeError inside a bare except.
        tmp = review.xpath(
            './div/div/div/div/div/div[1]/div[3]/span/span[2]/text()'
        ).extract()
        if not tmp:
            tmp = review.xpath(
                './div/div/div/div/div/div[1]/div[3]/span/text()').extract()
        item["review"] = ''.join(tmp)
        yield item
def parse_page(self, response):
    """Scrapes information from pages into items"""
    item = CrawlerItem()
    item['url'] = response.url.encode('utf-8')
    item['visited'] = datetime.datetime.now().isoformat().encode('utf-8')
    item['published'] = parser.parse(get_first(
        response.selector.xpath('//time/@datetime').extract()
    )).isoformat().encode('utf-8')
    item['title'] = get_first(
        response.selector.xpath(
            '//meta[@property="og:title"]/@content').extract())
    item['description'] = get_first(
        response.selector.xpath(
            '//meta[@name="description"]/@content').extract())
    item['text'] = "".join([
        s.strip().encode('utf-8') for s in response.selector.css(
            '.article>.body>p').xpath('.//text()').extract()
    ])
    item['author'] = [
        s.encode('utf-8') for s in response.selector.css(
            '.authorContainer').xpath('.//span/strong/span/text()').extract()
    ]
    item['keywords'] = [
        s.encode('utf-8') for s in response.selector.xpath(
            '//meta[@name="news_keywords"]/@content').extract()
    ]
    item['resource'] = self.name
    item['publication_id'] = hashlib.sha1(
        str(item['url']) + str(item['published'])).hexdigest()
    return item
def parse(self, response):
    item = CrawlerItem()
    item['url'] = response.url
    item['raw'] = None
    item['is_visited'] = 'Y'
    item['rvrsd_domain'] = self.get_rvrsd_domain(
        response.request.meta.get('download_slot'))
    try:
        item['status'] = response.status
        raw = response.text
        if response.status == 200:
            item['parsed'] = self.parse_text(raw)
        else:
            item['parsed'] = None
        self.counter += 1
        # Throttle lightly: pause one second every 100 parsed pages.
        if self.counter % 100 == 0:
            print('[%d] Sleep...' % self.counter)
            sleep(1)
        print('[%d] Parsed: %s' % (self.counter, response.url))
    except AttributeError as e:
        item['status'] = -3
        item['parsed'] = None
        self.logger.error('Fail to parse: %s, because %s'
                          % (response.url, e))
        print('[%d] Fail to parse: %s, because %s'
              % (self.counter, response.url, e))
    return item
def parse_page(self, response):
    """Scrapes information from pages into items"""
    item = CrawlerItem()
    item['url'] = response.url.encode('utf-8')
    item['visited'] = datetime.datetime.now().isoformat().encode('utf-8')
    item['published'] = get_first(
        response.selector.xpath('//meta[@name="date"]/@content').extract())
    item['title'] = get_first(
        response.selector.css('.headline').xpath('./text()').extract())
    item['description'] = get_first(
        response.selector.xpath(
            '//meta[@name="description"]/@content').extract())
    item['text'] = "".join([
        s.strip().encode('utf-8') for s in response.selector.xpath(
            '//div[@class="article-section clearfix"]/p/text()').extract()
    ])
    item['author'] = [
        s.encode('utf-8') for s in response.selector.xpath(
            '//p[@class="author"]/a/text()').extract()
    ]
    item['keywords'] = [
        s.encode('utf-8') for s in response.selector.xpath(
            '//meta[@name="news_keywords"]/@content').extract()
    ]
    return item
def parse_page(self, response):
    """Scrapes information from pages into items"""
    # settings = get_project_settings()
    published = parser.parse(get_first(
        response.selector.xpath('//meta[@name="date"]/@content').extract()))
    published = published.replace(tzinfo=timezone('UTC'))
    # earliest = parser.parse(settings.get('EARLIEST_PUBLISHED'))
    # if published < earliest:
    #     raise DropItem('Dropping this article published on %s at %s which '
    #                    'is before earliest published global setting %s'
    #                    % (self.name, published.isoformat(),
    #                       earliest.isoformat()))
    item = CrawlerItem()
    item['url'] = response.url.encode('utf-8')
    item['visited'] = datetime.datetime.now().isoformat().encode('utf-8')
    item['published'] = published.isoformat().encode('utf-8')
    item['title'] = get_first(
        response.selector.xpath(
            '//meta[@property="og:title"]/@content').extract())
    item['description'] = get_first(
        response.selector.xpath(
            '//meta[@name="description"]/@content').extract())
    # item['text'] = "".join([s.strip().encode('utf-8')
    #     for s in response.selector.css('.article__item')
    #     .css('.paragraph').xpath('.//text()').extract()])
    item['author'] = [
        s.encode('utf-8') for s in response.selector.css('.byline').css(
            'span[itemprop="name"]').xpath('./text()').extract()
    ]
    item['keywords'] = [
        s.encode('utf-8') for s in response.selector.xpath(
            '//meta[@name="keywords"]/@content').extract()
    ]
    item['resource'] = self.name
    item['publication_id'] = hashlib.sha1(
        str(item['url']) + str(item['published'])).hexdigest()
    # Handle next pages
    next_page = get_first(
        response.selector.xpath('//link[@rel="next"]/@href').extract())
    if next_page:
        self.logger.debug("Next page found: " + next_page)
        yield Request(next_page, callback=self.parse_page)
    yield item
def parse2(self, response):
    item = CrawlerItem()
    try:
        page = Selector(response=response).xpath('//ul[@class="pagination"]')
        author = Selector(response=response).xpath(
            '//h1[@class="title J_title"]/text()').get().strip()
        author = ' '.join(author.split())
        print(author)
        if len(page) == 0:
            print('Only one page of comments')
            comments = self.comms(response)
            for comment in comments:
                # Skip whitespace-only comments (the original compared
                # against two literal space characters).
                if not comment.strip():
                    continue
                item['author'] = author
                item['comment'] = comment
                yield item
        else:
            print('This item has multiple pages of comments')
            page_num = page[0].xpath('./li/a/text()').getall()
            print(page_num)
            num = int(page_num[-2])
            print(num)
            for n in range(1, num + 1):
                print(f'Fetching comment page {n}')
                if n == 1:
                    url = response.request.url + '/#comments'
                else:
                    url = response.request.url + f'/p{n}/#comments'
                yield scrapy.Request(url=url, callback=self.parse3,
                                     dont_filter=False)
    except Exception as e:
        print(e)
        print('Failed to scrape the phone detail page link')
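# comms() is called above but defined elsewhere; parse2 treats it as
# returning the list of comment strings on the current page. A minimal
# sketch under that assumption (the container class below is a guess):
def comms(self, response):
    return Selector(response=response).xpath(
        '//div[@class="comment-content"]//text()').getall()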
def parse_article(self, response):
    def extract_with_css(query):
        return response.css(query).extract_first(default='Not-Found').strip()

    if response.xpath(
            "//span[@itemprop='name']/text()").extract_first() is None:
        author = response.xpath(
            "//p[@class='byline']/text()").extract_first().strip()
    else:
        author = response.xpath(
            "//span[@itemprop='name']/text()").extract_first().strip()
    item = CrawlerItem()
    item['Date'] = dt.datetime.today().strftime('%Y-%m-%d %H:%M:%S')
    item['Headline'] = extract_with_css('h1.content__headline::text')
    item['Author'] = author
    item['Topic'] = extract_with_css('div.content__section-label a::text')
    item['Snippet'] = extract_with_css('p::text')
    item['Tags'] = response.css('li.submeta__link-item a::text').extract()
    item['DateUpdated'] = extract_with_css('p.content__dateline time::text')
    yield item
def start_requests(self):
    strategy = 0
    try:
        strategy = int(self.strategy)
        print(f'Using the specified strategy: {strategy}')
    except Exception:
        print('Using the default tracking strategy')
    col = f'kw-{self.keyword}'
    cols = self.db.list_collection_names(
        filter={"name": {"$regex": r"^kw-"}})
    if col not in cols:
        yield scrapy.Request(
            f'https://listado.mercadolibre.com.mx/{self.keyword}')
    elif strategy:
        yield scrapy.Request(
            f'https://listado.mercadolibre.com.mx/{self.keyword}')
    else:
        # For each product id keep its latest src/sales values, then keep
        # only products with recorded sales.
        docs_cursor = self.db[col].aggregate([
            {'$group': {'_id': {'pid': '$pid'},
                        'pid': {'$last': '$pid'},
                        'src': {'$last': '$src'},
                        'sales': {'$last': '$sales'}}},
            {'$match': {'sales': {'$gt': 0}}},
        ])
        docs = list(docs_cursor)
        if not docs:
            yield scrapy.Request(
                f'https://listado.mercadolibre.com.mx/{self.keyword}')
        else:
            for doc in docs:
                item = CrawlerItem()
                item['src'] = doc['src']
                item['pid'] = doc['pid']
                yield scrapy.Request(item['src'], self.parse_item,
                                     cb_kwargs={'item': item})
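# The parse_item callback receives the pre-filled item through cb_kwargs.
# A minimal sketch of the matching signature (the field and selector below
# are illustrative, not taken from the original spider):
def parse_item(self, response, item):
    item['sales'] = response.xpath(
        '//span[contains(@class, "ui-pdp-subtitle")]/text()').get()
    yield item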
def download_errback(self, failure, url):
    item = CrawlerItem()
    item['url'] = url
    item['is_visited'] = 'Y'
    item['rvrsd_domain'] = None
    item['raw'] = None
    item['parsed'] = None
    if failure.check(IgnoreRequest):
        self.logger.debug('Forbidden by robots.txt rule.')
        item['status'] = -1
    elif failure.check(DNSLookupError):
        self.logger.info('Failed DNS lookup.')
        item['status'] = -2
    elif failure.check(DNSMismatch):
        self.logger.info('Failed DNS match.')
        item['status'] = -2
    elif failure.check(NoRouteError):
        self.logger.info('No route error.')
        item['status'] = -4
    elif failure.check(HttpError):
        # The original stored the whole response object; the numeric HTTP
        # status code is what the item field expects.
        status = failure.value.response.status
        self.logger.info('HTTP error [%s].' % status)
        item['status'] = status
    else:
        self.logger.info('Unknown error.')
        item['status'] = -255
    yield item
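# A sketch of how parse() and download_errback() would typically be attached
# to a request, assuming the URL is passed to the errback via a default
# argument (Scrapy errbacks only receive the Failure):
def start_requests(self):
    for url in self.start_urls:
        yield scrapy.Request(
            url,
            callback=self.parse,
            errback=lambda failure, u=url: self.download_errback(failure, u))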
def parse_page(self, response):
    """Scrapes information from pages into items"""
    item = CrawlerItem()
    item['url'] = response.url.encode('utf-8')
    item['visited'] = datetime.datetime.now().isoformat().encode('utf-8')
    item['published'] = parser.parse(get_first(
        response.selector.xpath(
            '//meta[@property="vr:published_time"]/@content').extract()
    )).isoformat().encode('utf-8')
    item['title'] = get_first(
        response.selector.xpath(
            '//meta[@property="og:title"]/@content').extract())
    item['description'] = get_first(
        response.selector.xpath(
            '//meta[@property="og:description"]/@content').extract()).strip()
    item['text'] = "".join([
        s.strip().encode('utf-8') for s in response.selector.xpath(
            '//div[@class="main-text "]/p/text()').extract()
    ])
    item['author'] = [
        s.encode('utf-8') for s in response.selector.xpath(
            '//meta[@name="author"]/@content').extract()
    ]
    item['keywords'] = [
        s.encode('utf-8') for s in response.selector.xpath(
            '//meta[@name="keywords"]/@content').extract()
    ]
    item['resource'] = self.name
    item['publication_id'] = hashlib.sha1(
        str(item['url']) + str(item['published'])).hexdigest()
    return item
def crawldata(self, response):
    questions = response.xpath(
        '//*[@id="list-comment"]/div[@class="f-cmt-ask"]')
    for quest in questions[1:]:
        items = CrawlerItem()
        items['Comment'] = quest.xpath(
            'div[@class="f-cmmain"]/text()').extract_first()
        yield items
def parse(self, response):
    items = CrawlerItem()
    imdbId = str(response.url)[-8:-1]
    items['imdbId'] = str(imdbId)
    # Extract the rating value with XPath.
    score = response.xpath(
        '//span[@itemprop="ratingValue"]/text()').extract()[0]
    items['score'] = score
    return items
def parse(self, response):
    item = CrawlerItem()
    item['jpg_urls'] = []
    linkextractors = LxmlLinkExtractor(
        allow=[r'\.jpg', r'\.tif'],
        deny_extensions=['md5', 'xmp', 'html'])
    for link in linkextractors.extract_links(response):
        item['jpg_urls'].append(link.url)
    return item
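# If these URLs feed Scrapy's ImagesPipeline, the project settings would
# point the pipeline at the jpg_urls field, along these (illustrative) lines:
#
#   ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
#   IMAGES_URLS_FIELD = 'jpg_urls'
#   IMAGES_STORE = '/path/to/images'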
def parse(self, response):
    sel = Selector(response)
    sites = sel.css('a[href$=".gz"]')
    for site in sites:
        item = CrawlerItem()
        item['url'] = site.xpath('@href').extract()
        # Append each .gz link to a plain-text file; no items are yielded.
        with open('enlaces.txt', 'a') as f:
            f.write('{0}\n'.format(item['url'][0]))
def parse_item(self, response):
    item = CrawlerItem()
    item['url'] = response.url
    item['title'] = response.css('#firstHeading::text').get()
    item['overview'] = re.sub(
        self.content_regex, '',
        response.css('#mw-content-text div p:not(.mw-empty-elt)').get())
    item['content'] = re.sub(
        self.content_regex, '',
        response.css('#mw-content-text div').get())
    return item
def parse(self, response):
    print('================> Start crawling the URL ' + response.url)
    # Keep the <a> nodes as selectors; the original extracted the href
    # strings first and then called .css() on them, which raises
    # AttributeError (str has no .css method).
    links = Selector(response).xpath("//div[@class='title-news']/a")
    for link in links:
        url = CrawlerItem()
        url['title_link'] = link.css('::text').get()
        # Note: author and publish time normally live on the article page
        # itself; these selectors only match if that markup is inlined here.
        url['author_link'] = link.css('.fck_detail strong::text').get()
        url['publish_time'] = link.css('.date::text').get()
        yield url
def parse_item(self, response):
    questions = response.xpath('//div[@class="summary"]/h3')
    for question in questions:
        item = CrawlerItem()
        item['url'] = question.xpath(
            'a[@class="question-hyperlink"]/@href').extract()[0]
        item['title'] = question.xpath(
            'a[@class="question-hyperlink"]/text()').extract()[0]
        yield item
def parse_apk(self, response):
    for position in response.xpath('//ul[@id="iconList"]/li'):
        l = APKItemLoader(item=CrawlerItem(), selector=position)
        l.add_value('category', response.meta.get('cate', ''))
        l.add_value('apk_from', '360')
        l.add_xpath('apk_name', 'h3/a/text()')
        # Pull the real download URL out of the zhushou360: deep link.
        l.add_xpath('apk_url',
                    'a[starts-with(@href, "zhushou360:")]/@href',
                    re=r'.*&url=(.*)')
        yield l.load_item()
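# APKItemLoader is not defined in this section. A minimal sketch, assuming a
# standard ItemLoader that strips whitespace and keeps the first value per
# field (the processor choices here are assumptions):
from scrapy.loader import ItemLoader
from itemloaders.processors import MapCompose, TakeFirst

class APKItemLoader(ItemLoader):
    default_input_processor = MapCompose(str.strip)
    default_output_processor = TakeFirst()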