def parse_finalized_permits(self, response):
    """Extracts the download URLs for all finalized permits."""
    self.logger.info("Parsing finalized permits at " + response.url)
    urls = []
    if self.pop_shell:
        from scrapy.shell import inspect_response
        inspect_response(response, self)
    permitViewHrefs = response.xpath('//a[text()="view"]/@href').extract()
    for href in permitViewHrefs:
        url = response.urljoin(href)
        self.logger.debug("Found URL: " + url)
        urls.append(url)
    yield FinalizedPermitsPage(file_urls=urls)
    # extract_first() returns None when there is no "next >" link
    next_page_href = response.xpath('//a[text()="next >"]/@href').extract_first()
    if not next_page_href:
        self.logger.info("No pages left!")
        return
    self.logger.info("Moving on to the next page at " + next_page_href)
    yield scrapy.Request(url=response.urljoin(next_page_href),
                         callback=self.parse_finalized_permits)
def parse_tastypage(self, response):
    hxs = HtmlXPathSelector(response)
    print('--------------------------------------')
    print(hxs.select('//title').extract())
    print('--------------------------------------')
    from scrapy.shell import inspect_response
    inspect_response(response, self)
def parse_details(self, response):
    item = response.meta.get('item', None)
    if item:
        # populate more `item` fields
        return item
    else:
        inspect_response(response, self)
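# A minimal sketch of the requesting side that parse_details above assumes:
# an earlier callback builds a partial item and passes it along in
# Request.meta so parse_details can finish populating it. The names used
# here (parse_listing, MyItem, the XPath) are hypothetical, not taken from
# the original spider.
def parse_listing(self, response):
    for href in response.xpath('//a[@class="detail"]/@href').extract():
        item = MyItem()  # hypothetical item class
        item['listing_url'] = response.urljoin(href)
        yield scrapy.Request(item['listing_url'],
                             callback=self.parse_details,
                             meta={'item': item})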
def parse_items(self, response):
    """Second step: yield items from the JSON search results."""
    page_number = response.meta['page']
    response_ = json.loads(response.body)
    if self.is_debug:
        inspect_response(response, self)
    for i in response_['searchItems']:
        item = FoodyItem()
        item['city'] = i['City']
        item['district_id'] = i['DistrictId']
        item['main_url'] = self.allowed_domains[0] + i['DetailUrl']
        item['addr'] = i['Address']
        item['mobile_pic'] = re.sub('^//', '', i['MobilePicturePath'])
        item['large_pic'] = re.sub('^//', '', i['PicturePathLarge'])
        item['main_category_id'] = i['MainCategoryId']
        item['lat'] = i['Latitude']
        item['long'] = i['Longitude']
        item['views'] = i['TotalView']
        item['cuisines'] = dict()
        item['cuisines']['id'] = list()
        item['cuisines']['name'] = list()
        for j in i['Cuisines']:
            item['cuisines']['id'].append(j['Id'])
            item['cuisines']['name'].append(j['Name'])
        item['rating'] = i['AvgRating']
        item['id'] = i['Id']
        item['category'] = i['MainCategoryId']
        yield item
def dostuff(self, response):
    from scrapy.shell import inspect_response
    inspect_response(response, self)
    sel = Selector(response)
    rowSelector = '//table[@id="GridView1"]//tr'
    rows = sel.xpath(rowSelector)
    logging.info('---------CALLED PARSE*********')
    for r in range(1, len(rows) - 2):  # the last rows are the footer
        item = default(FILEITEM)
        group = default(FILEGROUP)
        cols = rows[r].xpath('td')
        name = cols[0].xpath('text()').extract()[0].encode('ascii', 'ignore').upper()
        disp = cols[1].xpath('text()').extract()[0].encode('ascii', 'ignore')
        date = cols[2].xpath('text()').extract()[0].encode('ascii', 'ignore')
        y = re.compile(r'\d\d\d\d')
        year = y.findall(date)[0].encode('ascii', 'ignore')
        type = cols[3].xpath('text()').extract()[0].encode('ascii', 'ignore')
        link = cols[4].xpath('a')
        url = link.xpath('@href').extract()[0]
        title = 'Disposition'
        item['source'] = url
        item['name'] = name
        item['state'] = state
        item['year'] = year
        group['items'].append(item)
        logging.info(item['name'])
        item = default(FILEITEM)
def shareimage_post(self, response):
    # register_openers()
    params = {}
    basedir = settings['IMAGES_STORE']
    key = response.meta['key']
    path_comps = key.split('/')
    filename = os.path.join(basedir, *path_comps)
    print(filename)
    inspect_response(response, self)
    params["userfile[]"] = open(filename, "rb")
    params["private_upload"] = 0
    datagen, headers = multipart_encode(params)
    upload_url = "http://shareimage.org/upload.php"
    req = urllib2.Request(upload_url, datagen, headers)
    print(response.meta['cookie'])
    req.add_header('Cookie', response.meta['cookie'])
    result = urllib2.urlopen(req)
    content = result.read()
    print(content)
    response.meta['tttt'] = content
    inspect_response(response, self)
    p = re.compile(r'value="(http://\S+)"')
    match = p.search(content)
    if match:
        print(match.group(1))
        self.tt['publish']['imgs'].append(match.group(1))
        saveItem(self.tt)
    else:
        self.log("ERROR: failed to upload image")
        # inspect_response(response, self)
    return
def parse_reviews(self, response): widx = response.meta['work_index'] item = response.meta['pass_item'] if response.meta.get('main_page'): # it may happen that uid taken from search result is different (request was redirected) if response.meta.get('redirect_urls'): item['work_uid'] = response.url[response.url.index('/livres/')+8:] # fetch tags from main page tags_sel = response.xpath('//p[@class="tags"]/a[@rel="tag"]') tags_t = [] tags_n = [] # only way to get approx. frequency ia through font-size for tag_s in tags_sel: tags_t.append(tag_s.xpath('./text()').extract_first().strip()) # "tag_t17 tc0 ..." tag_n_c = tag_s.xpath('./@class').extract_first() tags_n.append(tag_n_c[5:tag_n_c.index(u' ')]) item['tags_t'] = u"__&__".join(tags_t) item['tags_n'] = u";".join(tags_n) item['tags_lang'] = u'fre' # request first review page yield scrapy.Request((self.url_main + self.param_review) % (item['work_uid'], 1), meta={'work_index': widx, 'pass_item': item}, callback=self.parse_reviews) # collect from reviews page else: last_page = response.meta.get('last_page') if last_page is None: page_row = response.xpath('//div[@class="pagination row"]') if len(page_row) == 0: last_page = 1 else: last_page = int(page_row.xpath('./a[last()-1]/text()').extract_first()) # used for debugging... could be removed if response.url.find('?pageN=') == -1 or response.url.find('&tri=') == -1: from scrapy.shell import inspect_response inspect_response(response, self) cur_page = int(response.url[response.url.index('?pageN=') + 7:response.url.index('&tri=')]) found_older = False reviews_sel = response.xpath('//div[@class="post_con"]') for rev in reviews_sel: new_item = self.extract_onereview(item, rev) comp = self.within_harvestperiod(new_item, self.works_to_harvest[widx]['last_harvest_date']) if comp == -1: found_older = True break elif comp == 0: yield new_item if cur_page < last_page and not found_older: yield scrapy.Request((self.url_main + self.param_review) % (item['work_uid'], cur_page + 1), meta={'work_index': widx, 'last_page': last_page, 'pass_item': item}, callback=self.parse_reviews)
def pre_login(self, response):
    if self.login_debug:
        inspect_response(response, self)
    formdata = dict()
    if self.login_data:
        formdata.update(self.login_data)
    formdata[self.username_field] = self.username or ""
    formdata[self.passwd_field] = self.passwd or ""
    self.verbose("final: %s" % formdata)
    yield scrapy.FormRequest.from_response(response,
                                           formdata=formdata,
                                           callback=self.submit_login,
                                           dont_filter=True)
def parse_item(self, response):
    self.log('Hi, this is an item page! %s' % response.url)
    from scrapy.shell import inspect_response
    inspect_response(response, self)
    hxs = HtmlXPathSelector(response)
    item = Item()
    item['id'] = hxs.select('//td[@id="item_id"]/text()').re(r'ID: (\d+)')
    item['name'] = hxs.select('//td[@id="item_name"]/text()').extract()
    item['description'] = hxs.select('//td[@id="item_description"]/text()').extract()
    return item
def parse_board_list(self, response):
    self.log("parse board list from %s" % response.url)
    inspect_response(response, self)
    for dt in response.xpath("//div/dl[contains(@class, 'perList')]"):
        link = dt.xpath("./dt/span/a/@href").extract_first()
        title = dt.xpath("./dt/span/a/img/@title").extract_first()
        intro = dt.xpath("./dd/div/p/text()").extract_first()
        url = urlparse.urljoin(self.root_domain, link)
        yield Request(url=url,
                      callback=self.parse_board,
                      meta={'board': {'url': url, 'title': title, 'intro': intro}})
def after_login(self, response):
    # from scrapy.shell import inspect_response
    # inspect_response(response, self)
    # check that login succeeded before going on
    if "Marketplace" not in response.body:
        scrapy.log.msg("Login failed", level=log.ERROR)
        return
    for idx, sel in enumerate(response.xpath('//table[contains(@class, "mpitems")]/tbody/tr')):
        try:
            record = DiscogsRecord()
            descXpath = sel.xpath('td[@class="item_description"]')
            record['title'] = descXpath.xpath('span[@class="br_item_title"]/a/text()').extract()[0].strip()
            info = sel.xpath('td[@class="item_description"]/text()').extract()
            record['catNum'] = info[6].strip()
            record['mediaCondition'] = info[8].strip()
            record['sleeveCondition'] = info[10].strip()
            record['sellerNotes'] = info[11].strip()
            record['label'] = sel.xpath('td[@class="item_description"]/a/text()').extract()[0].strip()
            sellerXpath = sel.xpath('.//td[@class=" seller_info"]')
            discogsSeller = DiscogsSeller()
            sellerInfo = sellerXpath.xpath('ul/li/b/a/text()').extract()
            discogsSeller['name'] = record['seller'] = sellerInfo[0].strip()
            if len(sellerInfo) > 1:
                p = re.compile(r'.*(\d+).*')
                m = p.match(sellerInfo[1].strip())
                discogsSeller['numItems'] = int(m.group(1))
            else:
                discogsSeller['numItems'] = 0
            discogsSeller['country'] = filter(lambda x: len(x) != 0,
                                              [x.strip() for x in sellerXpath.xpath('ul/li/text()').extract()])[0]
            priceXpath = sel.xpath('.//td[@align="center"]')
            record['price'] = priceXpath.xpath('span[@class="price"]/text()').extract()[0]
            record['shipping'] = priceXpath.xpath('span[@style="color:#555"]/text()').extract()[0].strip()
            yield discogsSeller
            yield record
        except:
            print("index %d" % idx)
            print(sel)
            from scrapy.shell import inspect_response
            inspect_response(response, self)
            raise
def parse(self, response): inspect_response(response) all_provinces_value = response.xpath( '//select[@name="propinsi"]/option[contains(., "All")]/@value' ).extract()[0] return FormRequest.from_response( response, formdata={ 'propinsi': all_provinces_value, 'keyword': '', 'submit': 'search!', }, callback=self.parse_list )
def parse_item(self, response):
    # inspect one response
    if response.url:
        from scrapy.shell import inspect_response
        inspect_response(response, self)
    tasks = response.xpath('//div[@class="success-task-list clearfix" ]/ul')
    for task in tasks:
        self.logger.info('parse ' + response.url)
        item = ZBJItem()
        item['title'] = task.xpath('li[@class="task-item-title-li"]/a/text()').extract()[0]
        item['url'] = task.xpath('li[@class="task-item-title-li"]/a/@href').extract()[0]
        print(item)
        yield item
def parse_thread(self, response):
    self.log("thread url:%s" % response.url)
    hxs = HtmlXPathSelector(response)
    first_floor = hxs.select('//div[starts-with(normalize-space(@id),"post_")]')[0]
    content = first_floor.select('.//td[starts-with(normalize-space(@id),"postmessage_")]')[0]
    imgs = content.select('.//img/@src').extract()
    if len(imgs) == 0:
        self.threads_db.remove({"url": response.url})
        return
    # items = []
    # for img in imgs:
    item = AutobtItem()
    item['name'] = response.url
    item['image_urls'] = imgs
    inspect_response(response, self)
    all = content.extract()
    con = {"name": "", "size": "", "format": ""}
    colon = u'\uff1a'
    tags = {
        u'\u5f71\u7247\u540d\u7a31': "name",
        u'\u5f71\u7247\u540d\u79f0': "name",
        u'\u5f71\u7247\u683c\u5f0f': "format",
        u'\u5f71\u7247\u5927\u5c0f': "size",
    }
    for key in tags:
        index = all.find(key)
        if index > 0:
            name = all[index:]
            index = name.find("<br>")
            if index < 0:
                index = name.find("\t")
            if index > 0:
                name = name[0:index]
            index = name.find(colon)
            if index > 0:
                con[tags[key]] = name[(index + 1):]
            else:
                index = name.find(u":")
                if index > 0:
                    con[tags[key]] = name[(index + 1):]
    tt = self.threads_db.find_one({"url": response.url})
    tt["content"] = con
    tt['raw_content'] = all
    self.threads_db.save(tt)
    return item
def parse(self, response):
    audios = response.css('.album_soundlist ul li')
    visitedIdSet = response.meta['audioId']
    allAudioNotVisited = True
    for audio in audios:
        sound_id = audio.xpath('@sound_id').extract()[0]
        if sound_id in visitedIdSet:
            allAudioNotVisited = False
        else:
            # visit the addresses that have not been visited yet
            pass
    print(response.meta)
    from scrapy.shell import inspect_response
    inspect_response(response, self)
def parse_item(self, response):
    # Debugger:
    from scrapy.shell import inspect_response
    inspect_response(response, self)
    # if response =[]:
    #     self.start_urls = the_
    item = RedditItem()
    item['dates'] = response.xpath('//div[@class="search-result-meta"]/span[@class="search-time"]/time/@title').extract()
    item['authors'] = response.xpath('//div[@class="search-result-meta"]/span[@class="search-author"]//a/text()').extract()
    item['votes'] = response.xpath('//div[@class="search-result-meta"]/span[@class="search-score"]/text()').extract()
    # self.last_date = item['dates'][-1]
    yield item
def parse_fans_page(self, response): html = response.body.replace(r'\"','').replace(r'\/','/').replace(r'\t','\t').replace(r'\r\n','\n') start = re.search('html":([^}]+)follow_box', html).start(1) end = html.find('"})', start) posts_cleaned = html[start:end] sel = Selector(text=posts_cleaned) response.sel = sel inspect_response(response, self) user_node_list = sel.css('.follow_item') for user_node in user_node_list: user_item = UserItem() name_node = user_node.css('.info_name') user_item['name'] = name_node.xpath('a[1]/text()').extract() user_item['name'] = self._extract_or_empty_string(user_item['name']) user_item['uid'] = name_node.xpath('a[1]/@usercard').extract() user_item['uid'] = self._extract_or_empty_string(user_item['uid']) if user_item['uid'] != '': user_item['uid'] = user_item['uid'].split('=')[1] is_male = name_node.css('.icon_male') is_female = name_node.css('.icon_female') if is_male: user_item['gender'] = 'm' elif is_female: user_item['gender'] = 'f' verified = name_node.css('[href="http://verified.weibo.com/verify"]') user_item['verified'] = True if verified else False club = name_node.css('[href="http://club.weibo.com/intro"]') user_item['club'] = True if club else False intro_node = user_node.css('.info_intro') user_item['intro'] = intro_node.xpath('span') tag_node = user_node.css('.person_label') user_item['tags'] = list(tag_node.xpath('./a/text()').extract()) yield user_item # for page in range(NUM_POSTS_PAGE): page=0 for rqst in self._make_request_posts(user_item['uid'], page): yield rqst for rqst in self._make_request_fans(user_item['uid']): yield rqst
def parse(self, response): currency_code = re.compile("[A-Z]{3}") unit = re.compile("\A[10]+\Z") price = re.compile("\A\d+[.,]{1}\d+\Z") bank_name = bank_from_url[response.url] try: table = bs(response.xpath(banks[bank_name][ "selector"]).extract()[0], "lxml") except: inspect_response(response, self) rows = [row for row in table.findAll("tr")] total_n_cols = 0 for row in rows: total_n_cols += len(row.findAll("td")) # If number of columns is less than average, ignore row try: avg_cols = int(ceil(total_n_cols / len(rows))) except: inspect_response(response, self) for row in rows: columns = row.findAll("td") if len(columns) < avg_cols: continue item = CurrencyItem() item["bank"] = bank_name item["date"] = datetime.today().date() for column in columns: col_content = column.text.strip() if price.match(col_content) is not None: match = float(price.match( col_content).group().replace(",", ".")) item["buy_price"] = match try: if match > item["sell_price"]: item["buy_price"] = item["sell_price"] item["sell_price"] = match except KeyError as e: item["sell_price"] = match if currency_code.match(col_content) is not None: item["currency_code"] = currency_code.match( col_content).group() if unit.match(col_content) is not None: item["unit"] = unit.match(col_content).group() yield item
def parse(self, response):
    from scrapy.shell import inspect_response
    inspect_response(response, self)
    # for sel in response.xpath('//a[@class="active"]'):
    #     item = PlayerLinkItem()
    #     item['url'] = sel.xpath('@href').extract()
    #     item['name'] = sel.xpath('text()').extract()
    #     item['status'] = ['active']
    #     yield item
    for sel in [response.xpath('//a[@class="historic"]')[0]]:
        item = PlayerLinkItem()
        item["url"] = [urljoin("http://stats.nba.com/", sel.xpath("@href").extract()[0] + "career/")]
        item["name"] = sel.xpath("text()").extract()
        item["status"] = ["historic"]
        url = item["url"][0]
        item["uid"] = int(url.split("/")[-3])
        yield item
def parse_search_resp(self, response): widx = response.meta['work_index'] isbns = self.works_to_harvest[widx]['isbns'] nb_try = response.meta['nb_try'] titre_xp = '//td[@class="titre_livre"]' res_sel = response.xpath(titre_xp + '/a[@class="titre_v2"]') uid_txt = res_sel.xpath('./@href').extract_first() # u'/livres/Levy-Rien-de-grave/9229' # found it if uid_txt: uid = uid_txt[uid_txt.index(u'/livres/') + 8:] title = res_sel.xpath('./text()').extract_first().strip() author = response.xpath('//td[@class="auteur"]/a/text()').extract_first().strip() pass_item = self.build_review_item(work_refid=self.works_to_harvest[widx]['work_refid'], work_uid=uid, title=title, authors=author) nb_t = response.xpath(titre_xp + '/a[contains(@href,"#critiques")]/span/text()').extract_first() if not nb_t or int(nb_t) == 0: logger.info("No reviews found for work-refid=%s in site %s (uid=%s)" % (pass_item['work_refid'], self.name, pass_item['work_uid'])) yield pass_item # DATA-ISSUES: may indicate 1-review, when none exist (ex. Ballard-La-Course-au-Paradis/230360) # the impact: try many times to harvest same work as no reviews will be yielded else: yield scrapy.Request(self.url_main % uid, meta={'work_index': widx, 'pass_item': pass_item, 'main_page': True}, callback=self.parse_reviews) else: n_found = response.xpath('//div[@class="content row"]//div[@class="titre"]/text()').extract_first() # found no book if n_found and n_found.find(u'(0)') != -1: if nb_try < len(isbns): yield scrapy.FormRequest(self.form_search, formdata={'Recherche': str(isbns[nb_try]), 'item_recherche': 'isbn'}, meta={'work_index': widx, 'nb_try': nb_try + 1}, callback=self.parse_search_resp) else: logger.info("Nothing found for isbn=%s in site %s" % (isbns[nb_try - 1], self.name)) yield self.build_review_item(work_refid=self.works_to_harvest[widx]['work_refid'], work_uid='-1') else: logger.error("Unexpected result page after %d try (search isbn=%s)" % (nb_try, isbns[nb_try - 1])) # interactively debug page from scrapy.shell import inspect_response inspect_response(response, self)
def parse_list(self, response):
    # build the selector and read the request meta
    hxs = HtmlXPathSelector(response)
    meta = response.meta
    # log a message
    self.log('msg', loglevel=logging.ERROR)
    # for debugging and writing the first xpath
    open_in_browser(response)
    inspect_response(response, self)
    # choose a selector, then join or strip the result
    ''.join(hxs.select('').extract()).strip()
    # url join
    base_url = get_base_url(response)
    n_url = urljoin_rfc(base_url, 'url')
    # build `item` above before returning it
    return item
def parse(self, response):
    # parse the data returned for the URL
    # download the images collected earlier
    # with open(pre + '/pic_links.txt', 'r') as f:
    #     for line in f:
    #         item = DoubanItem()
    #         item['image_urls'] = [line.strip()]
    #         item['images'] = line[line.rfind('/') + 1:].strip()
    #         yield item
    inspect_response(response, self)
    soup = BeautifulSoup(response.body)  # parse with BeautifulSoup
    # print(soup)
    for y in soup.find_all('div', attrs={'class': 'doulist-item'}):
        item = DoubanItem()
        item['title'] = y.find('div', attrs={'class': 'title'}).a.text
        item['link'] = y.find('div', attrs={'class': 'title'}).a['href']
        item['rating'] = y.find('span', attrs={'class': 'rating_nums'}).text
        item['major'] = y.find('div', attrs={'class': 'abstract'}).text
        yield item  # yield each matched item
def parse_search_resp(self, response): widx = response.meta['work_index'] isbns = self.works_to_harvest[widx]['isbns'] nb_try = response.meta['nb_try'] # found, map work_uid and request reviews page if response.url.find('/book/show/') != -1: gr_work_id = response.url[response.url.index('/book/show/') + 11:] # add title/authors (only done for initial QA checks) title = response.xpath('//h1[@class="bookTitle"]/text()').extract_first().strip() a_raw = response.xpath('//a[@class="authorName"]/child::*/text()').extract() authors = ",".join(a_raw) pass_item = self.build_review_item(work_refid=self.works_to_harvest[widx]['work_refid'], work_uid=gr_work_id, authors=authors, title=title) nb_rev = response.xpath('//a[@class="actionLinkLite"]/span[@class="count"]/span[@class="value-title"]/text()').extract_first() if int(nb_rev.replace(',', '')) == 0: logger.info("No reviews found for work-refid=%s (uid=%s of site %s)" % (pass_item['work_refid'], pass_item['work_uid'], self.name)) yield pass_item # map gr's id and trigger new Request to have reviews ordered correctly else: self.works_to_harvest[widx]['last_harvest_date'] = self.min_harvest_date # For popular review, GR has partial list, so must order by oldest yield scrapy.Request(self.url_review % (gr_work_id, 1, 'oldest'), meta={'work_index': widx, 'item': pass_item}, callback=self.parse_reviews) # not found page elif 'Looking for a book?' in response.body: if nb_try < len(isbns): yield scrapy.Request(self.url_search + str(isbns[nb_try]), meta={'work_index': widx, 'nb_try': nb_try + 1}, callback=self.parse_search_resp) else: logger.info("Nothing found for wid: %s, isbns: %s" % (self.works_to_harvest[widx]['work_refid'], str(isbns))) yield self.build_review_item(work_refid=self.works_to_harvest[widx]['work_refid'], work_uid='-1') else: logger.error("Unexpected result page after %d try (search isbn=%s)" % (nb_try, isbns[nb_try - 1])) # interactively debug page from scrapy.shell import inspect_response inspect_response(response, self)
def submit_login(self, response):
    if self.logged_in:
        self.verbose("already logged in")
    elif self.login_check in response.body_as_unicode():
        self.verbose("logged in successfully")
        self.logged_in = True
        for req in self.start_requests():
            yield req
        self.verbose("bye")
    else:
        if self.allow_second_login and self.in_first_attempt:
            self.verbose("need another attempt..")
        else:
            self.verbose("login failed")
        if self.login_debug:
            inspect_response(response, self)
        if self.allow_second_login and self.in_first_attempt:
            self.in_first_attempt = False
            yield self.get_login_request()
        else:
            raise LoginFailed(response)
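# The retry branch above calls self.get_login_request(); the original helper
# is not shown in this collection. A plausible sketch, assuming the login
# page URL is kept in self.login_url (both the helper body and that
# attribute are assumptions, not the original implementation):
def get_login_request(self):
    # re-fetch the login form so pre_login can fill it in again
    return scrapy.Request(self.login_url,
                          callback=self.pre_login,
                          dont_filter=True)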
def parse(self, response):
    inspect_response(response, self)
    round = ItemLoader(item=ResultsItem(), response=response)
    round.add_xpath("home_team", '//td[@class="cThree"]', re=">.*<")
    round.add_xpath("away_team", '//td[@class="cFive"]', re=">.*<")
    round.add_xpath("home_score", '//td[@class="cFour"]/text()')
    round.add_xpath("away_score", '//td[@class="cFour"]/text()')
    round.add_xpath("date", '//td[@class="cOne first"]/text()')
    round.add_xpath("replay", '//td[@class="cTwo"]')
    round.add_xpath("match_id", '//td[@class="last"]/a/@rel')
    round = round.load_item()
    print(round)
    match_count = len(list(round.values())[0])
    for i in range(match_count):
        if round["home_score"][i] == "NaN":  # changed this to pick up missing scorelines
            match = {}  # create a new item for each match
            for key, value in round.items():
                match[key] = value[i]
            match["round_id"] = response.meta["round_id"]
            yield match
def directory_parser(self, response):
    """
    Given a http://wesconnect.wesleyan.edu/directory search form, search and parse.
    """
    print("On a directory page, sending search data")
    if "<b>Search Operator / Search Value</b>" in response.body:
        print("This page has a search form")
        x = HtmlXPathSelector(response)
        viewstate = x.select("//input[@name='__VIEWSTATE']").select('@value').extract()[0]
        eventvalidation = x.select("//input[@name='__EVENTVALIDATION']").select('@value').extract()[0]
        for year in xrange(1950, 2013):
            year = str(year)
            yield FormRequest.from_response(response, formdata={
                # LastName comparison method
                #'cid_41$SearchGUI$sc285$ddComparison_285': "Contains",
                #'cid_41$SearchGUI$sc285$mf_285': last_name,
                # FirstName comparison method
                #'cid_41$SearchGUI$sc284$ddComparison_284': "Contains",
                #'cid_41$SearchGUI$sc284$mf_284': "",
                # Year should be filled in or non-existent, NEVER blank
                'cid_41$SearchGUI$sc36$mf_36': year,
            }, callback=self.listing_parser)
    else:
        print("This is a result from a search")
        from scrapy.shell import inspect_response
        inspect_response(response, self)
        print("Got to results listing")
def parse_items(self, response):
    for i, sel in enumerate(response.xpath("//*[@id='the-list']/tr[./td]")):
        mil = MasjidItemLoader(selector=sel)
        mil.add_xpath('id_masjid', "./td[5]/text()")
        mil.add_xpath('nama_masjid', "./td[4]/a/text()")
        mil.add_xpath('link_detail', "./td[4]/a/@href")
        mil.add_xpath('kabupaten_kota', "./td[2]/text()")
        mil.add_xpath('kecamatan', "./td[3]/text()")
        mil.add_xpath('tipologi', "./td[6]/text()")
        mil.add_xpath('alamat', "./td[7]/text()")
        mil.add_xpath('luas_tanah', "./td[8]/text()")
        mil.add_xpath('status_tanah', "./td[9]/text()")
        mil.add_xpath('luas_bangunan', "./td[10]/text()")
        mil.add_xpath('tahun_berdiri', "./td[11]/text()")
        mil.add_xpath('jamaah', "./td[12]/text()")
        mil.add_xpath('imam', "./td[13]/text()")
        mil.add_xpath('khatib', "./td[14]/text()")
        mil.add_xpath('muazin', "./td[15]/text()")
        mil.add_xpath('remaja', "./td[16]/text()")
        mil.add_xpath('no_telepon', "./td[17]/text()")
        mil.add_xpath('keterangan', "./td[18]/text()")
        long_lat = sel.xpath("./comment()[2]").re(r'align="center">(-?[0-9.]+)</')
        try:
            mil.add_value('longitude', long_lat[0])
            mil.add_value('latitude', long_lat[1])
        except IndexError:
            self.logger.error(
                "Can't get long-lat on %(url)s , element index = %(index)s",
                {"url": response.url, "index": i},
            )
            from scrapy.shell import inspect_response
            inspect_response(response, self)
        yield mil.load_item()
def parse(self, response):
    # We want to inspect one specific response.
    if ".org" in response.url:
        from scrapy.shell import inspect_response
        inspect_response(response, self)
def parse_statement(self, response): from scrapy.shell import inspect_response results = [] app_id_code_reestri_db = urlparse.parse_qs( urlparse.urlparse(response.request.url)[4])['app_id'][0] soup = BeautifulSoup(response.body, "html5lib", from_encoding="utf-8") # First table: "Prepared documents" -- scrape details into CorpDoc item # and then grab the doc too; they are usually PDFs. prepared_table = soup.find("caption", text=u"მომზადებული დოკუმენტები") if prepared_table is not None: prepared_table = prepared_table.parent for row in prepared_table.find_all("tr"): # First cell contains link # Second contains title, date # Third is blank cells = row.find_all("td") link = "" if cells[0] is not None: link = cells[0].a["href"] spans = cells[1].find_all("span") title = spans[0].string date = spans[1].string results.append( StatementDocument( fk_corp_id_code=response.meta['corp_id_code'], fk_stmnt_id_code_reestri_db=app_id_code_reestri_db, link=link, title=title, date=date)) results.append( Request(url=link, callback=self.parse_stmnt_prepared_doc, meta={ 'cookiejar': response.meta['cookiejar'], 'corp_id_code': response.meta['corp_id_code'] })) # Second table: Status Documents. Scrape details into CorpDocs, and # grab the docs too, they are usually PDFs. status_table = soup.find("caption", text=u"სტატუსი / გადაწყვეტილება") if status_table is not None: status_table = status_table.parent for row in status_table.find_all("tr"): cells = row.find_all("td") link = "" if cells[0] is not None: link = cells[0].a["href"] registration_num = cells[1].find(class_="maintxt").string date = cells[1].find(class_="smalltxt").string title = cells[2].find(style=True).string results.append( StatementDocument( fk_corp_id_code=response.meta['corp_id_code'], fk_stmnt_id_code_reestri_db=app_id_code_reestri_db, link=link, title=title, date=date, registration_num=registration_num)) # Probably don't actually need to parse these. #results.append(Request(url=link, # callback=self.parse_stmnt_status_pdf, # meta={'cookiejar':response.meta['cookiejar'], # 'id_code_reestri_db':response.meta['id_code_reestri_db']})) # Third table: Scanned Documents. Scrape details into CorpDocs, and # grab the docs if they are PDFs. scanned_table = soup.find("caption", text=u"სკანირებული დოკუმენტები") if scanned_table is not None: scanned_table = scanned_table.parent for row in scanned_table.find_all("tr"): cells = row.find_all("td") link = "" if cells[0] is not None: link = cells[0].a["href"] doc_info = cells[1].find_all(class_="maintxt") if (len(doc_info) == 2): title = doc_info[0].string date = doc_info[1].string else: date = doc_info[0].string title = None filename = cells[2].find("a").find("span").string doc = StatementDocument( fk_corp_id_code=response.meta['corp_id_code'], fk_stmnt_id_code_reestri_db=app_id_code_reestri_db, link=link, date=date, filename=filename) if (title): doc['title'] = title results.append(doc) #TODO: Check whether it's a PDF and if so, return # a Request to the document. # Fourth table: Statement details. Scrape details into RegistryStatement. statement = RegistryStatement() # First block of info, starting with statement number. 
regx = re.compile(u"^\s+განცხადება.+$") caption = soup.find("caption", text=regx) if caption is None: inspect_response(response) statement['statement_num'] = caption.string.split('#')[1] table = caption.parent statement['registration_num'] = self._get_header_sib( table, u"\n\s*რეგისტრაციის ნომერი\s*").span.string statement['statement_type'] = self._get_header_sib( table, u"\n\s*მომსახურების სახე\s*").span.string statement['service_cost'] = self._get_header_sib( table, u"\n\s*მომსახურების ღირებულება\s*").span.string pay_debt = self._get_header_sib( table, u"\n\s*გადასახდელი თანხა/ბალანსი\s*").span.string statement['payment'] = pay_debt.split("/")[0] statement['outstanding'] = pay_debt.split("/")[1] statement['id_reestri_db'] = response.meta['stmnt_id_reestri_db'] # Second block of info, starting after payment details. # Find the correct table table = soup.find("div", id="application_tab").table # Grab the relevant parts statement['id_code_legal'] = self._get_header_sib( table, u"საიდენტიფიკაციო ნომერი").strong.string statement['name'] = self._get_header_sib( table, u"სუბიექტის დასახელება ").string statement['classification'] = self._get_header_sib( table, u"სამართლებრივი ფორმა").string statement['reorganization_type'] = self._get_header_sib( table, u"რეორგანიზაციის ტიპი ").string statement['quantity'] = self._get_header_sib(table, u"რაოდენობა").string statement['changed_info'] = self._get_header_sib( table, u"შესაცვლელი რეკვიზიტი: ").string # Attached docs description is a <ul> attached = self._get_header_sib(table, u"\n\s*თანდართული დოკუმენტაცია\s") attached_desc = [] for li in attached.ul.contents: attached_desc.append(li.string) statement['attached_docs_desc'] = attached_desc # Additional docs is a <div>, don't know what the format looks like yet addtl_td = self._get_header_sib(table, u"\n\s*დამატებით წარმოდგენილი\s*") statement['additional_docs'] = addtl_td.find( id="additional_docs_container").string # Issued docs also a ul issued = self._get_header_sib(table, u"\n\s*გასაცემი დოკუმენტები\s*").ul issued_desc = [] for li in issued.contents: issued_desc.append(li.string) statement['issued_docs'] = issued_desc # Don't know the format of notes yet either. notes_td = self._get_header_sib(table, u"\n\s*შენიშვნა\s*") statement['notes'] = notes_td.string results.append(statement) # Cells containing people require a bit more intelligence representative_td = self._get_header_sib(table, u" წარმომადგენელი ") rv_pers = self._person_from_statement_cell(representative_td) if len(rv_pers) > 0: results.append( PersonCorpRelation( person=rv_pers, fk_corp_id_code=response.meta['corp_id_code'], relation_type=[u"წარმომადგენელი"], cite_type="statement", cite_link=response.request.url)) representee_td = self._get_header_sib(table, u" წარმომდგენი ") re_pers = self._person_from_statement_cell(representee_td) if len(re_pers) > 0: results.append( PersonCorpRelation( person=re_pers, fk_corp_id_code=response.meta['corp_id_code'], relation_type=[u"წარმომდგენი"], cite_type="statement", cite_link=response.request.url)) ganmcxadebeli_td = self._get_header_sib(table, u"განმცხადებელი ") g_pers = self._person_from_statement_cell(ganmcxadebeli_td) if len(g_pers) > 0: results.append( PersonCorpRelation( person=g_pers, fk_corp_id_code=response.meta['corp_id_code'], relation_type=[u"განმცხადებელი"], cite_type="statement", cite_link=response.request.url)) return results
def inspect(self, response):
    from scrapy.shell import inspect_response
    inspect_response(response, self)
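# One way such a bare inspection callback is typically wired up: point a
# Request at it from start_requests so the shell opens on exactly that
# response. The URL below is a placeholder, not from the original spider.
def start_requests(self):
    yield scrapy.Request('http://example.com/page-to-debug',
                         callback=self.inspect)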
def parse(self, response):
    from scrapy.shell import inspect_response
    inspect_response(response, self)
    print(response.xpath('//div[@class="content"]').extract())
def parse_news_page(self, response): from scrapy.shell import inspect_response inspect_response(response, self) stop_scrape_flag = False news_list = self.exchange.get_news_list(response) if not news_list: raise Exception('Error: Website Structure Has Been Changed!' + ' Maintainance Needed!') for i, news_row in enumerate(news_list): # has to assign new dict every loop # otherwise mongodb raises dup key (Id) error item = { 'mkt': self.exchange.uptick_name, 'mkt_id': self.mkt_id, 'tzinfo': self.exchange.tzinfo, 'error': True } try: # news row won't have error date_time, url, title, misc_fields_dict = self.exchange.get_news_fields( news_row) # database has previous news and scraped news is older than database if self.latest_date and date_time < self.latest_date: stop_scrape_flag = True break # generate file name by date and number of events on that date # todo: change uptick_name to col_name # if exchange has multi news sources # assign key 'website_url' to misc_fields_dict website_url = '' if self.exchange.is_multi_source_exchange: website_url = misc_fields_dict.get('website_url') filename = du.get_filename(date_time, self.exchange.col_name, website_url) # insert record to mongodb item['date_time'] = date_time item['title'] = title item['url'] = url item['unique_id'] = filename item['error'] = False item.update(misc_fields_dict) yield item utils.save_pdf_url_or_chrome(url, self.pdfs_dir + filename) except Exception as e: # not news row, skip item['error'] = { 'news_row_html': news_row.extract(), 'error_message': '%s: %s' % (e.__class__, str(e)), 'row_no': i, 'traceback': traceback.format_exc(), 'url': response.url } yield item continue # todo: test without keep_follow_page flag if not stop_scrape_flag: for url, meta in self.exchange.get_pagination_urls(response): yield scrapy.Request(url, callback=self.parse_news_page, meta=meta)
def parse(self, response): # print response.text keyword = response.meta['keyword'] results = response.xpath('//div[@class="result c-container "]') #print results time = datetime.datetime.now(self.tz) from scrapy.shell import inspect_response inspect_response(response, self) for res in results: #print res.extract() url = res.xpath( './/h3[contains(@class,"t")]/a/@href').extract_first() # print keyword,url bfdata = str(keyword) + str(url) item = ScrapyBaiduItem() item['url'] = url print url item['title'] = res.xpath('.//h3[contains(@class,"t")]/a').xpath( 'string(.)').extract_first() timestr = res.xpath( './/span[contains(@class," newTimeFactor_before_abs m")]' ).xpath('string(.)').extract_first() # print timestr if timestr == None: item['time'] = time.strftime('%Y_%m_%d_%H_%M_%S') else: if str(timestr).find('天前') != -1: time_num = int(timestr[:str(timestr).find('天前')]) delta = datetime.timedelta(days=time_num) new_time = time - delta item['time'] = new_time.strftime('%Y_%m_%d_%H_%M_%S') elif str(timestr).find('小时前') != -1: time_num = int(timestr[:str(timestr).find('小时前')]) delta = datetime.timedelta(hours=time_num) new_time = time - delta item['time'] = new_time.strftime('%Y_%m_%d_%H_%M_%S') elif str(timestr).find('分钟前') != -1: time_num = int(timestr[:str(timestr).find('分钟前')]) delta = datetime.timedelta(minutes=time_num) new_time = time - delta item['time'] = new_time.strftime('%Y_%m_%d_%H_%M_%S') elif str(timestr).find('秒钟前') != -1: time_num = int(timestr[:str(timestr).find('秒钟前')]) delta = datetime.timedelta(seconds=time_num) new_time = time - delta item['time'] = new_time.strftime('%Y_%m_%d_%H_%M_%S') #print title abstract = res.xpath('.//div[contains(@class,"c-abstract")]' ).xpath('string(.)').extract_first() if abstract == None: #print res.extract() abstract = res.xpath( './/div[@class="c-span18 c-span-last"]/font/p').xpath( 'string(.)').extract() abstract = ' '.join(abstract) #print abstract #print abstract s = abstract.find('-') if s > 0: abstract = abstract[s + 2:] item['abstract'] = abstract item['keyword'] = unicode(keyword) item['create_time'] = time.strftime('%Y_%m_%d_%H_%M_%S') yield item
def parse_page(self, response):
    # scrape dynamically generated HTML via the Selenium browser
    self.browser.get(response.url)
    hxs = Selector(text=self.browser.page_source)
    item = ScraperItem()
    # use the scrapy shell to find xpaths
    from scrapy.shell import inspect_response
    inspect_response(response, self)
    try:
        divs = hxs.xpath(
            '//div[@id="contentArea"]/div[1]/div[1]/div[1]/div[1]/div[2]/div[1]/descendant-or-self::*/text()'
        ).extract()
        text = u" ".join(divs[1:])
        no_text = len(divs) == 0
    except IndexError:
        no_text = True
    if no_text:
        try:
            text = " ".join(
                hxs.xpath('//span[@class="hasCaption"]/child::node()').extract())
        except IndexError:
            text = ""
    item['url'] = response.url
    item['text'] = text
    item['title'] = hxs.xpath('//title/text()').extract()
    item['date'] = hxs.xpath('//span[@class="timestampContent"]/text()').extract()
    comments = float(hxs.xpath('count(//abbr)').extract()[0]) - 1
    try:
        likes = hxs.xpath('//div[@class="UFILikeSentenceText"]/span/span/text()').extract()[0]
        if "likes" in likes:
            like_count = 1.0
        else:
            try:
                like_count = len(likes.split(", "))
                if "others" in likes:
                    like_count += float(
                        likes.split("and ")[1].split(" others")[0].replace(",", ""))
                elif "and" in likes:
                    like_count += 1.0
            except IndexError:
                like_count = 2.0
    except IndexError:
        like_count = 0.0
    # print("like count: " + str(like_count))
    try:
        shares = hxs.xpath('//a[@class="UFIShareLink"]/text()').extract()[0]
        share_count = float(shares.split(" share")[0].replace(",", ""))
    except IndexError:
        share_count = 0.0
    print(like_count, share_count, comments)
    item['comment_count'] = [like_count, share_count, comments]
    yield item
def parse_three(self, response): meta = response.meta unqiue_form = meta['cbSearchResultsUniqueId'] bbb = json.loads(response.text)['responseText'] __response = HtmlResponse(response.url, body=str.encode(bbb)) business_name = __response.xpath( "//td[starts-with(text(),'Business:')]/following::tr/td[1]/span/text()" ).extract_first() if not business_name: inspect_response(__response, self) f_name = __response.xpath( "//td[starts-with(text(),'FName:')]/following::tr/td[1]/span/text()" ).extract_first() l_name = __response.xpath( "//td[starts-with(text(),'LName')]/following::tr/td[1]/span/text()" ).extract_first() phone = __response.xpath( "//td[starts-with(text(),'Phone:')]/following::tr/td[1]/span/text()" ).extract_first() county = __response.xpath( "//td[starts-with(text(),'County:')]/following::tr/td[1]/span/text()" ).extract_first() add = __response.xpath( "//td[starts-with(text(),'Address:')]/following::tr/td[1]/span/text()" ).extract_first() city = __response.xpath( "//td[starts-with(text(),'City:')]/following::tr/td[1]/span/text()" ).extract_first() state = __response.xpath( "//td[starts-with(text(),'State:')]/following::tr/td[1]/span/text()" ).extract_first() zip_code = __response.xpath( "//td[starts-with(text(),'Zip:')]/following::tr/td[1]/span/text()" ).extract_first() print("-------------------0999999999999", business_name, f_name, l_name, phone, county, add, city, state, zip_code) date_certified = __response.xpath( "//td[starts-with(text(),'Date Certified:')]/following::tr/td[1]/span/text()" ).extract_first() expire_date = __response.xpath( "//td[starts-with(text(),'Date of Expiration:')]/following::tr/td[1]/span/text()" ).extract_first() person_name = f_name + ' ' + l_name location_address_string = self.format__address_4( add, city, state, zip_code) next_page = __response.xpath( '//a[@data-cb-name="JumpToNext"]/@href').extract_first() if next_page: app = next_page[next_page.rfind('?appSession=') + 12:next_page.rfind('&RecordID=')] RecordID = next_page[next_page.rfind('&RecordID=') + 10:next_page.rfind('&cpipage=')] cpipage = next_page[next_page.rfind('&cpipage=') + 9:next_page.rfind('&PageID')] PageID = next_page[next_page.rfind('&PageID=') + 8:next_page.rfind('&PrevPageID')] PrevPageID = next_page[next_page.rfind('&PrevPageID=') + 12:next_page.rfind('&CPISortType=')] cbCurrentPageSize = next_page[ next_page.rfind('&cbCurrentPageSize') + 19:next_page.rfind('&cbRandomSortKey=')] cbRandomSortKey = next_page[next_page.rfind('&cbRandomSortKey') + 17:next_page.rfind('&cbRecordPosition' )] cbCurrentRecordPosition = next_page[next_page. 
rfind('&cbRecordPosition=') + 18:] a = int(time() * 1000) url1 = 'https://c0bkr159.caspio.com/dp/31cf1000eccbd58b888d45ff8350?rnd=' + str( a) print("---form------------------------------>", url1) form_data = { 'AjaxAction': 'JumpToNext', 'GridMode': 'False', 'cbUniqueFormId': unqiue_form, 'ClientQueryString': '', 'appSession': app, 'RecordID': RecordID, 'cpipage': cpipage, 'PageID': PageID, 'PrevPageID': PrevPageID, 'CPISortType': '', 'CPIorderBy': '', 'cbCurrentPageSize': cbCurrentPageSize, 'cbRandomSortKey': cbRandomSortKey, 'cbRecordPosition': cbCurrentRecordPosition, 'AjaxActionHostName': 'https://c0bkr159.caspio.com', } print("---form------------------------------>", form_data) headerr = { 'Origin': 'https://c0bkr159.caspio.com', 'Referer': 'https://c0bkr159.caspio.com/dp/31cf1000eccbd58b888d45ff8350', 'Sec-Fetch-Mode': 'cors', 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36', } yield scrapy.FormRequest(url=url1, callback=self.parse_three, method='POST', dont_filter=True, formdata=form_data, meta=meta, headers=headerr) il = ItemLoader(item=SdSeptictankLicensesSpiderItem(), response=response) # il.default_input_processor = MapCompose(lambda v: v.strip(), remove_tags, replace_escape_chars) il.add_value('ingestion_timestamp', Utils.getingestion_timestamp()) il.add_value('sourceName', 'SD_SepticTank_Licenses') il.add_value('url', 'http://denr.sd.gov/des/sw/SepticInstallers.aspx') il.add_value('permit_lic_eff_date', date_certified) il.add_value('City', city) il.add_value('State', state) il.add_value('Zip', zip_code) company_name = self._getDBA(business_name)[0] if len(company_name) < 2: company_name = person_name il.add_value('company_name', company_name) il.add_value('permit_lic_desc', 'Waste Transporter Licenses for ' + company_name) il.add_value('dba_name', self._getDBA(business_name)[1]) il.add_value('county', county) il.add_value('permit_lic_exp_date', expire_date) il.add_value('location_address_string', location_address_string) il.add_value('company_phone', phone) il.add_value('person_name', self._getDBA(person_name)[0]) il.add_value('permit_type', 'waste_transporter_license') yield il.load_item()
def parse(self, response): # This callback determines if the selected menu is # at the top of the list, if it is then it adds the urls # to the list and keeps going # if its not, then it calls the lua to prepare the page # for scraping, and then scrapes it url = response.url menu = response.css(".category-filter__link") #submenu = response.css("") #print ("self.urls - " +str(self.urls)) print("processing response.url - " + response.url) #print ("menu: ") #print (menu.getall()) #print ("len(menu): " + str(len(menu))) #print ("menu[0] : " + menu.get()) #print("name - " + menu[0].css('.category-filter__text ::text').get()) #inspect_response(response,self) if (len(menu) > 0 and menu[0].css('[aria-current="page"]')): print(f"inside menu page for url - {url}") # The top page is active #print ("menu[0] : [aria-current=page] " + menu[0].css('[aria-current="page"]').get()) # therefore we need to scrape the links, and continue searching # we then need to loop through each other page. # call parse, and scrape it is not menu_url = menu[0].css('::attr(href)').get() menu_name = menu[0].css('.category-filter__text ::text').get() for item in menu: heading = item.css('.category-filter__text ::text').get() scraped_url = item.css('::attr(href)').get() scraped_url = self.base_url + scraped_url section = menu_name subsection = heading category = lookup_category("", section, subsection) store_url(self.conn, scraped_url, self.store_id, category, section, subsection) #self.section_dict[url]=(menu_name, heading) #if self.urls.count(url) == 0: # self.urls.append(url) #urls=menu.css('::attr(href)').getall() # Remove the the first(this) page from list to parse #urls.pop() #self.urls.extend(urls) #print("urls to scrape - " + str(self.urls)) #print("local urls - " + str(urls)) """ while len(self.urls) != 0: url = self.urls.pop() self.processedUrls.append(url) #url = self.base_url + url_suffix #print ("urls - " + str(self.urls)) #print ("pulling from url - " + url) #print ("urls lengths - " + str(len(self.urls))) yield SplashRequest(url, self.parse, endpoint='execute', args={'lua_source': self.expand_and_scroll_lua}) """ elif (len(menu) == 0): inspect_response(response, self) else: #we are on a subpage, so now we can start scraping # GROCERY_SELECTOR = '.grid-item' NAME_SELECTOR = '.small-type.detail-card-description ::text' PRICE_SELECTOR = '.price ::text' PRICE_PER_UNIT_SELECTOR = '.sub-headline.detail-card-subtext ::text' metadata = get_url_metadata(self.cursor, url) section = metadata[0] subsection = metadata[1] print("subpage - scraping " + url + ", from section - " + section) for grocery in response.css(GROCERY_SELECTOR): self.name = grocery.css(NAME_SELECTOR).extract_first() self.price = grocery.css(PRICE_SELECTOR).extract_first() if self.price is not None: self.price = self.price.replace('*', '').replace('$', '') self.ppu = grocery.css(PRICE_PER_UNIT_SELECTOR).extract_first() if self.ppu is not None: self.ppu = convert_ppu(self.ppu) #inspect_response(response, self) #parse the ounces off of the name yield { 'name': self.name, 'price': self.price, 'price-per-unit': self.ppu, 'section': section, 'subsection': subsection, 'url': response.url } finish_url(self.conn, self.store_id, url) print("finishing url - " + url) next_url = get_next_url(self.cursor, 1) if next_url is not None: print("got next_url - " + next_url) yield SplashRequest( next_url, self.parse, endpoint='execute', dont_filter=True, args={'lua_source': self.expand_and_scroll_lua}) else: print("Next url is none therefore we must be finished ! ")
def parse(self, response):
    inspect_response(response, self)
def parse(self, response):
    from scrapy.shell import inspect_response
    inspect_response(response, self)
def after_login(self, response):
    inspect_response(response, self)  # opens a shell in the terminal
    return
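# For context, an after_login callback like the one above is usually reached
# from a FormRequest submitted by an earlier callback. A minimal sketch of
# that step; the form field names and values are placeholders, not taken
# from the original spider.
def parse(self, response):
    return scrapy.FormRequest.from_response(
        response,
        formdata={'username': 'user', 'password': 'pass'},
        callback=self.after_login)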
def test(self, response):
    from scrapy.shell import inspect_response
    inspect_response(response, self)
def parse_tongshi(self, response):
    courses = response.xpath(r'//table[@id="gridMain"]/tbody/tr[re:test(@class, "tdcolour\d$")]')
    from scrapy.shell import inspect_response
    inspect_response(response, self)
def parse(self, response):
    '''get a scrapy shell in ipython'''
    inspect_response(response, self)
def parsePage(self, response):
    if response.status != 200:
        yield FormRequest(
            url=response.request.url,
            # headers=self.headers,
            meta={
                'params': response.meta['params'],
                'xsrfValue': response.meta['xsrfValue'],
                'userDataId': response.meta['userDataId'],
                'offset': response.meta['offset']
            },
            formdata={
                'method': 'next',
                'params': response.meta['params'],
                '_xsrf': response.meta['xsrfValue'],
            },
            dont_filter=True,
            callback=self.parsePage)
    else:
        item = UserColumnItem()
        data = json.loads(response.body)
        columnList = data['msg']
        inspect_response(response, self)
        item['spiderName'] = self.name
        # logging.warning('response.meta[params]: %s \n response.body: %s', response.meta['params'], response.body)
        # note: the list may contain anonymous users, which needs special handling
        if columnList:
            res = Selector(text=''.join(columnList))
            item['userDataId'] = response.meta['userDataId']
            item['offset'] = response.meta['offset']
            for sel in res.xpath('//div[contains(@class,"zm-profile-section-item")]'):
                item['columnLinkId'] = sel.xpath(
                    'a[@class="zm-list-avatar-link"]/@href').re(
                        r'http://zhuanlan.zhihu.com/(.*)')[0]
                item['columnImgLink'] = sel.xpath(
                    'a[@class="zm-list-avatar-link"]/img/@src').extract()[0]
                item['columnId'] = sel.xpath(
                    'div[contains(@class,"zm-profile-section-main")]/button/@id'
                ).extract()[0]
                try:
                    item['columnDescription'] = sel.xpath(
                        'div[contains(@class,"zm-profile-section-main")]/div[contains(@class,"description")]/text()'
                    ).extract()[0]
                except:
                    # logging.warning('item[columnLinkId]: %s', item['columnLinkId'])
                    item['columnDescription'] = ''
                item['columnPostCount'] = sel.xpath(
                    'div[contains(@class,"zm-profile-section-main")]/div[contains(@class,"meta")]/span/text()'
                ).re(r'(\d+)')[0]
                # note: userLinkId may contain Chinese characters
                yield item
        else:
            # no users in the list
            item['userDataId'] = ''
            yield item
def parse(self, response):
    if "c5game" in response.url:
        from scrapy.shell import inspect_response
        inspect_response(response, self)
def next(self, response):
    print("Login is complete and the personal profile page has been crawled")
    title = response.xpath("/html/head/title/text()").extract()
    print(title[0])
    inspect_response(response, self)
def parse_item(self, response): ''' this part for debug ''' from scrapy.shell import inspect_response inspect_response(response, self) item = ImdbItem() soup = BeautifulSoup(response.text) try: genre = list( map(lambda x: x.text, soup.find("div", { "class": "subtext" }).findAll("a"))) except: genre = None item["genre"] = genre # try: name = soup.find("div", { "class": "title_wrapper" }).find("h1").contents[0] except: name = None item['name'] = name # try: year = soup.find("div", { "class": "title_wrapper" }).find("h1").span.text.strip("(").strip(")") except: year = None item['year'] = year # try: director = list( map( lambda x: x.text, soup.find("div", { "class": "credit_summary_item" }).findAll("a"))) except: director = None item["director"] = director # try: score = int( soup.find("div", { "class": "ratingValue" }).contents[1].text) except: score = None item["score"] = score # try: stars = list( map( lambda x: x.text, soup.findAll( "div", {"class": "credit_summary_item"})[-1].findAll("a"))) except: stars = None item["stars"] = stars yield item
def parse_detail(self, response):
    inspect_response(response, self)
def parse_test(self, response):
    print("####IN TEST####")
    from scrapy.shell import inspect_response
    inspect_response(response, self)
    yield {'Test': "Passed"}
def scrapy_shell_called(response, self):
    # for testing: run the spider from an external system terminal,
    # otherwise inspect_response raises an error
    from scrapy.shell import inspect_response
    inspect_response(response, self)
def test(self, response):
    '''Test method: the callback drops into an interactive shell.'''
    from scrapy.shell import inspect_response
    inspect_response(response, self)
def debug(self, response):
    from scrapy.shell import inspect_response
    inspect_response(response, self)
    raise CloseSpider('debug stop')
def inspect_spider_response(response, spider):
    if spider.settings.get('DEBUG', True):
        return inspect_response(response, spider)
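# Example use of the helper above from inside a spider callback. DEBUG here
# is a custom project setting, not a built-in Scrapy setting; with the
# default of True the shell opens unless DEBUG is explicitly set to a falsy
# value in the project settings.
def parse(self, response):
    inspect_spider_response(response, self)
    # ... normal parsing continues here ...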
def parse_country(self, response):
    inspect_response(response, self)
def inspect(response, spider):
    # inspect_response needs the spider as its second argument
    inspect_response(response, spider)
    raise CloseSpider('Done')
def parse_nodes(self, response, nodes):
    inspect_response(response, self)
def parse_series(self, response):
    sel = Selector(response)
    from scrapy.shell import inspect_response
    inspect_response(response, self)
def parseInfo(self, response):
    inspect_response(response, self)
    title = response.css('body > h1::text').extract()