def parse(self, response):
    pattern = r'var \$render_data = \[((.|\s)*?})\]'
    raw_data = re.search(pattern, response.text)
    raw_data = raw_data.group(1)
    json_data = json.loads(raw_data)
    status = json_data['status']
    # If this is a retweet, crawl the original micro-blog instead and stop.
    # (The original used "return Request(...)" inside a generator, which
    # silently drops the request; yield it, then return.)
    if 'retweeted_status' in status:
        mblog_id = status['retweeted_status']['id']
        mblog_url = base_mblog_url % mblog_id
        yield Request(mblog_url, callback=self.getBlog)
        return
    items = self.putItem(status)
    usrdetail_url = base_usrdetail_url % items['usr_id']
    yield Request(url=usrdetail_url, meta={"item": items},
                  callback=self.getUsrDetail)
    if status['reposts_count']:
        reposts_url = base_reposts_url % (status['id'], str(1))
        yield Request(reposts_url, callback=self.getReposters)
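# The Weibo snippets in this file call module-level %-style URL templates
# (base_mblog_url, base_usrdetail_url, base_reposts_url) that are defined
# elsewhere. A minimal sketch of their likely shape; the exact endpoints are
# assumptions, only the placeholder counts are implied by the calls above:
base_mblog_url = 'https://m.weibo.cn/status/%s'
base_usrdetail_url = 'https://m.weibo.cn/api/container/getIndex?type=uid&value=%s'
base_reposts_url = 'https://m.weibo.cn/api/statuses/repostTimeline?id=%s&page=%s'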
def parse_comment(self, response):
    comments_json = json.loads(response.body_as_unicode())
    if comments_json['message'] != 'success':
        return
    comments = comments_json['data']['comments']
    for comment in comments:
        item = CommentInfo({
            'comment': comment['text'],
            'likes': comment['digg_count'],
            'time': comment['create_time'],
            'comment_id': comment['id'],
        })
        self.copy_article_info(response.meta, item)
        if comment['reply_count'] > 0:
            reply_to_comment_url = (
                'http://www.toutiao.com/api/comment/get_reply/?comment_id='
                + str(comment['id'])
                + '&dongtai_id=' + str(comment['dongtai_id'])
                + '&offset=0&count=' + str(comment['reply_count']))
            reply_request = Request(reply_to_comment_url,
                                    callback=self.parse_reply, method='GET')
            self.copy_article_info(response.meta, reply_request.meta)
            reply_request.meta['reply_to_id'] = comment['id']
            yield reply_request
        # Yield each item; the original collected them into a list and
        # used "return items" inside a generator, so they were never emitted.
        yield item
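# copy_article_info is used above but not defined in this snippet. A minimal
# sketch, assuming it just forwards article-level context from a response's
# meta dict into an item or another request's meta (the field names are
# hypothetical):
def copy_article_info(self, source, target):
    for key in ('article_id', 'article_url', 'article_title'):
        if key in source:
            target[key] = source[key]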
def start_requests(self):
    for row in self.tyre_sizes:
        if self.check_row_is_processed(row):
            continue
        self.add_row_to_history(row)
        meta = {'row': row}
        xl = ''
        if row['XL'] == 'XL':
            xl = 'Y'
            meta['xl'] = True
        run_flat = ''
        if row['Run Flat'] == 'RF':
            run_flat = 'Y'
            meta['run_flat'] = True
        url = ('http://www.point-s.co.uk/tyres?s=&width=' + row['Width']
               + '&profile=' + row['Aspect Ratio']
               + '&size=' + row['Rim']
               + '&speed=' + row['Speed rating']
               + '&paginate=true&runflat=' + run_flat
               + '&extra_load=' + xl)
        yield Request(url, dont_filter=True, meta=meta)
        if row['Alt Speed']:
            url = ('http://www.point-s.co.uk/tyres?s=&width=' + row['Width']
                   + '&profile=' + row['Aspect Ratio']
                   + '&size=' + row['Rim']
                   + '&speed=' + row['Alt Speed']
                   + '&paginate=true&runflat=' + run_flat
                   + '&extra_load=' + xl)
            yield Request(url, dont_filter=True, meta=meta)
def parse(self, response): """docstring for parse""" yield Request('http://www.ag.senate.gov/hearings', self.parse_data) for links in range(2, 10): yield Request( 'http://www.ag.senate.gov/hearings?PageNum_rs=' + str(links), self.parse_data)
def getReposters(self, response):
    pattern = r'page=(\d+)'
    result = re.search(pattern, response.url)
    page_id = result.group(1)
    try:
        json_data = json.loads(response.text)
        data = json_data['data']
        reposts_data = data['data']
        if int(page_id) == 1:
            self.max = data['max']
        for item in reposts_data:
            items = self.putItem(item)
            time_url = base_mblog_url % items['mblog_id']
            yield Request(url=time_url, meta={"item": items},
                          callback=self.get_accurate_time)
    except (ValueError, KeyError):
        # Malformed or empty JSON page: skip it but still try the next page.
        pass
    if int(page_id) < int(self.max):
        reposts_url = re.sub(pattern, 'page=' + str(int(page_id) + 1),
                             response.url)
        yield Request(reposts_url, callback=self.getReposters)
def parse_courses(self, response):
    """
    Parses the class list for a specific subject and term. Selects the links
    for each class details page, and uses them to fill in the 'url',
    'number', 'subject', and 'title' fields for each class item. Generates
    requests to each link in order to get prerequisites. These requests are
    handled by the parse_details callback.
    """
    hxs = HtmlXPathSelector(response)
    courses = hxs.select('//td[@class="nttitle"]/a')
    for c in courses:
        item = CatalogItem()
        url = c.select('@href').extract()[0]
        data = dict(e.split('=') for e in url.split('?')[1].split('&'))
        title = c.select('text()').extract()[0].split(' - ')[1].strip()
        item['url'] = url
        item['number'] = data['crse_numb_in']
        item['subject'] = data['subj_code_in']
        item['title'] = title
        request = Request("https://www.uvic.ca" + url,
                          callback=self.parse_details)
        request.meta['item'] = item
        yield request
def parse(self, response):
    sel = Selector(response)
    # Pages from 1998 onwards, new format.
    # These normally cover around a 2-6 year period.
    proceedings_menu = sel.xpath(
        '//a[starts-with(text(),"Official Record of Proceedings")]/@href')
    if proceedings_menu:
        for url in proceedings_menu.extract():
            absolute_url = urlparse.urljoin(response.url, url.strip())
            req = Request(absolute_url, callback=self.parse_hansard_index_page)
            yield req
    # Former Legislative Council (before 7/1997)
    table = sel.xpath(
        "//h3[contains(text(),'Former Legislative Council (before 7/1997)')]"
        "/following::table[1]")
    if table:
        links = table[0].xpath(
            ".//td/a[contains(text(),'Session')]/@href").extract()
        if links:
            for url in links:
                absolute_url = urlparse.urljoin(response.url, url.strip())
                req = Request(absolute_url,
                              callback=self.parse_hansard_index_page)
                yield req
def parse(self, response):
    item = BaiduItem()
    soup = BeautifulSoup(response.text, 'html.parser')
    title = soup.select('.lemmaWgt-lemmaTitle-title h1')[0].text
    subhead = soup.select('.lemmaWgt-lemmaTitle-title h2')
    # Note: "len(x) is not 0" relied on int identity; compare with != instead.
    if len(subhead) != 0:
        title = title + subhead[0].text
    item['title'] = title
    info_list = soup.select('.lemma-summary div')
    info = ''
    for temp in info_list:
        # Collect the text of the summary section.
        info += temp.text
        # If there are hyperlinks, keep crawling them.
        for a in temp.select('a'):
            if a.has_attr('href'):
                yield Request(self.base_url + a['href'], headers=self.headers)
    item['info'] = info
    properties_list = soup.select('.basicInfo-block dt')
    properties = ''
    for pro in properties_list:
        properties += '###' + pro.text.strip().replace('\n', '')
        # If there are hyperlinks, keep crawling them.
        for a in pro.select('a'):
            if a.has_attr('href'):
                yield Request(self.base_url + a['href'], headers=self.headers)
    item['properties'] = properties
    values_list = soup.select('.basicInfo-block dd')
    values = ''
    for val in values_list:
        values += '###' + val.text.strip().replace('\n', '')
        # If there are hyperlinks, keep crawling them.
        for a in val.select('a'):
            if a.has_attr('href'):
                yield Request(self.base_url + a['href'], headers=self.headers)
    item['values'] = values
    if len(soup.select('.summary-pic img')) != 0:
        item['img'] = soup.select('.summary-pic img')[0]['src']
    print(item['title'])
    yield item
def parse(self, response):
    for productxs in response.xpath(
            '//div[contains(@class, "products-list")]//div[@data-product]'):
        yield Request(
            productxs.xpath('.//a[@class="product-card-link"]/@href').extract()[0],
            callback=self.parse_product)
    next_page = response.xpath('//link[@rel="next"]/@href').extract()
    if next_page and 'Page.Next.Link' not in next_page[0]:
        yield Request(response.urljoin(next_page[0]))
def parse_item(self, response):
    m2 = hashlib.md5()
    m2.update(response.url)
    youm = Myoum7Item()
    youm['url'] = response.url
    youm['page_name'] = m2.hexdigest()
    youm['do_main'] = 'm.youm7.com'
    referer = response.request.headers.get('Referer', None)
    # Guard against a missing Referer header before matching on it.
    reattr = re.findall(r"sectionID=(\w+)", referer) if referer else []
    if reattr:
        youm['url'] = referer
        yield Request(referer, callback=self.parse_item)
    date_str = response.xpath('//div[@class="news-dev"]/@data-id')
    attr = re.findall(r"sectionID=(\w+)", response.url)
    if date_str and attr:
        sectionID = attr[0]
        date_str = date_str.extract()
        url_date = date_str[len(date_str) - 1]
        # The "&sectionID" below had been mangled into "&sect;ionID" by HTML
        # entity decoding; restored here.
        newUrl = ("https://m.youm7.com/Section/NewsSectionPaging?lastid="
                  + url_date + "&sectionID=" + str(sectionID))
        youm['url'] = newUrl
        yield Request(newUrl, callback=self.parse_item)
    title_str = response.xpath('//title/text()')
    content_str = response.xpath(
        '//div[@class="text-cont"]//div[@id="articleBody"]//p/text()')
    # Category from the breadcrumb menu.
    type_str = response.xpath(
        '//div[@class="container"]//div[@class="breadcumb"]//a/text()')
    if content_str and title_str:
        content = "".join(content_str.extract())
        youm['title'] = title_str.extract()[0]
        youm['content'] = content
        youm['str_size'] = len(content)
        youm['type'] = type_str.extract()[1]  # category from the detail page
        yield youm
def parse_schedule(self, response):
    """
    Parse the schedule for a subject and term.
    Grab all the links for each section to get CRNs and enrollment info.
    """
    hxs = HtmlXPathSelector(response)
    links = hxs.select('//th[@class="ddtitle"]/a/@href').extract()
    for l in links:
        url = 'https://www.uvic.ca' + l
        request = Request(url, callback=self.parse_section)
        request.meta['item'] = response.meta['item']
        yield request
def parse(self, response):
    """
    Parses only the first page of the dynamic class catalog. Extracts the
    available terms from the select box and generates requests for the
    search pages for each term. These requests are handled by the
    parse_term method.
    """
    hxs = HtmlXPathSelector(response)
    # get term dates from the options in a select box
    terms = hxs.select('//select[@id="term_input_id"]/child::option').select('@value').extract()

    # eliminate the entry corresponding to None, and remove terms that are
    # too old
    def is_valid_term(term):
        current_year = datetime.now().year
        if len(term) == 0:
            return False
        elif (current_year - int(term[0:4])) > 4:
            return False
        else:
            return True

    terms = [t for t in terms if is_valid_term(t)]
    self.log('Got terms: ' + str(terms))
    self.terms = terms
    # get the complete class listings
    url = self.classlist_url_template.format(term=self.terms[0])
    request = Request(url, callback=self.parse_classlist_search)
    yield request
    # get the schedule pages for each term and subject
    # for term in self.terms:
    #     for subject in self.subjects:
    #         url = self.schedule_url_template.format(term=term, subject=subject, number='')
    #         item = ScheduleItem()
    #         item['term'] = term
    #         item['subject'] = subject
    #         request = Request(url, callback=self.parse_schedule)
    #         request.meta['item'] = item
    #         yield request
    for term in terms:
        term_url = ('https://www.uvic.ca/BAN2P/bwckgens.p_proc_term_date'
                    '?p_calling_proc=bwckschd.p_disp_dyn_sched&p_term=' + term)
        request = Request(term_url, callback=self.parse_term)
        request.meta['term'] = term
        yield request
def getReposters(self, response):
    json_data = json.loads(response.text)
    data = json_data['data']
    max_page = data['max']  # renamed from "max" to avoid shadowing the builtin
    reposts_data = data['data']
    pattern = r'page=(\d+)'
    result = re.search(pattern, response.url)
    page_id = result.group(1)
    for item in reposts_data:
        items = self.putItem(item)
        usrdetail_url = base_usrdetail_url % items['usr_id']
        yield Request(url=usrdetail_url, meta={"item": items},
                      callback=self.getUsrDetail)
    if int(page_id) < int(max_page):
        reposts_url = re.sub(pattern, 'page=' + str(int(page_id) + 1),
                             response.url)
        yield Request(reposts_url, callback=self.getReposters)
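# putItem is shared by the Weibo snippets above but not shown. A minimal
# sketch, assuming it flattens a status/repost JSON object into a dict; only
# 'usr_id' and 'mblog_id' are required by the callers, the other fields are
# guesses:
def putItem(self, status):
    items = {}
    items['mblog_id'] = status['id']
    items['usr_id'] = status['user']['id']
    items['text'] = status.get('text', '')
    items['reposts_count'] = status.get('reposts_count', 0)
    return items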
def parse(self, response): # 当前页面所用的url post_nodes = response.css("div.list_body_box1 .art_tit a") for post_node in post_nodes: # 提取开始url post_url = post_node.css("::attr(href)").extract_first("") yield Request(url=parse.urljoin(response.url, post_url), callback=self.parse_detail) # 提取下一页并交给scrapy进行下载 # !!!!!!!!!!!!!!!!!提取下一页未完成 next_url = response.css('div.list_body_box1 .pagingNormal a') if next_url: yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)
def parse(self, response):
    self.log("Parsing page {}".format(self.current_page))
    no_data = response.xpath(".//div[@class='ico_list']/div[@class='no_data']")
    if no_data or self.current_page > self.max_page:
        self.log("no data = {}".format(no_data))
        self.log("No data or past the page limit; spider exiting. "
                 "Max page to crawl: {}".format(self.max_page))
        return
    uris = response.xpath(".//div[@class='content']/a/@href").extract()
    for uri in uris:
        yield Request(self.domains + uri, self.parse_detail)
    self.current_page += 1
    yield Request(self.base_url.format(self.current_page), self.parse)
def parse(self, response): # 当前页面所用的url post_nodes = response.css("ul.listbox li a") for post_node in post_nodes: # 提取开始url post_url = post_node.css("::attr(href)").extract_first("") yield Request(url=parse.urljoin(response.url, post_url), callback=self.parse_detail) # 提取下一页并交给scrapy进行下载 next_url = response.css( "span.next_page a::attr(href)").extract()[0] if next_url: yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)
def parse(self, response):
    original_key = response.url.split('/')[-1]
    key = urllib.unquote(original_key.split('?')[0])
    texts = response.xpath('//tbody/tr/td/text()').extract()
    filename = os.getenv('RESULT_PATH')
    texts = [t.encode('utf-8') for t in texts if '\n' not in t]
    # Cells come in (left, right) pairs; merge each pair into one line.
    # Stepping by 2 also avoids the IndexError the old "i % 2" loop hit on
    # odd-length lists.
    merged_texts = []
    for i in xrange(0, len(texts) - 1, 2):
        merged_texts.append(texts[i] + texts[i + 1] + '\n')
    # print 'lines num:', len(merged_texts)
    # not_200_path = os.getenv('NOT_200')
    # if response.status != 200:
    #     with open(key+'\t'+str(len(set(merged_texts)))+'\n')
    legacy_file_path = os.getenv('LEGACY_PATH')
    if len(merged_texts) == 100:
        with open(legacy_file_path, 'a') as legacy_file:
            legacy_file.write(key + '\n')
    with open(filename, 'a') as f:
        f.write(key + '\t' + str(len(set(merged_texts))) + '\n')
    if len(merged_texts) > 0:
        detail_urls = response.xpath('//tbody/tr/td/a/@href').extract()
        for d in detail_urls:
            print "detail url is %s \n" % d
            # dont_filter is a Request argument, not a meta key; it did
            # nothing inside meta.
            yield Request(url='http://bcc.blcu.edu.cn{0}'.format(d),
                          dont_filter=True,
                          meta={'dont_merge_cookies': True},
                          callback=self.parse_detail)
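# For reference, the cell-pairing above turns the flat list of <td> texts
# into merged lines, e.g. (values illustrative):
#   ['word1', 'freq1', 'word2', 'freq2']  ->  ['word1freq1\n', 'word2freq2\n']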
def parse(self, response):
    imgurl = response.xpath(
        "//div[@id='waterfall']/div[@class='pin wfc wft']/a/img/@src").extract()
    imgherf = response.xpath(
        "//div[@id='waterfall']/div[@class='pin wfc wft']/a/@href").extract()
    imgvisit = response.xpath(
        "//div[@id='waterfall']/div[@class='pin wfc wft']/p/span[@class='repin']/text()"
    ).extract()
    for i in range(len(imgurl)):
        # Build a fresh dict per image; the original reused one dict, so
        # every yielded item pointed at the same mutated object.
        item = {}
        item["name"] = self.title
        item["imgurl"] = imgurl[i]
        item["imgherf"] = imgherf[i]
        item["imgvisit"] = imgvisit[i]
        try:
            item["imglike"] = response.xpath(
                "//div[@id='waterfall']/div[@class='pin wfc wft']/p/span[@class='like']/text()"
            ).extract()[i]
        except IndexError:
            item["imglike"] = "0"
        try:
            item["imgdiscrit"] = response.xpath(
                "//div[@id='waterfall']/div[@class='pin wfc wft']/p[@class='description']/text()"
            ).extract()[i]
        except IndexError:
            item["imgdiscrit"] = ""
        yield item
    for i in range(4):
        yield Request(url=response.url, callback=self.next,
                      meta={"page": "2"}, dont_filter=True)
def parse(self, response):
    res = Selector(response)
    totalcount = res.xpath('/html/body/div/div[3]/@totalcount').extract()[0]
    totalpage = int(int(totalcount) / 15 + 1)
    for i in range(1, totalpage + 1):
        url = response.url + '?page.pageNo=' + str(i)
        yield Request(url, callback=self.parse_page)
def parse(self, response):
    data = json.loads(response.body)['result']['products'].values()
    for i in data:
        item = Book_Product()
        item['title'] = i['title']
        item['subtitle'] = i['subtitle']
        item['uid'] = i['permanentProductPageUrl'].split('/')[-1].split('?')[0]
        item['fsp'] = i['fsp']
        item['mrp'] = i['mrp']
        self.items.append(item)
    print len(self.items)
    self.count += 10
    if self.count > 100:
        # Python < 3.3 doesn't allow mixing return and yield statements in
        # the same function, so we yield another method, self.return_data,
        # which then returns the accumulated result.
        yield Request("http://www.flipkart.com/m/store/buk/loadmore?store=buk&start=%d"
                      % self.count, self.return_data)
    else:
        yield Request("http://www.flipkart.com/m/store/buk/loadmore?store=buk&start=%d"
                      % self.count, self.parse)
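# return_data is referenced above but not shown. A minimal sketch, assuming
# it exists only to hand the accumulated list back once pagination stops:
def return_data(self, response):
    # No further requests; just return the items collected in self.items.
    return self.items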
def parse_link(self, response):
    based_url = "http://blog.csdn.net"
    soup = BeautifulSoup(response.body, 'html.parser')
    blog = soup.find_all("div", "list_item article_item")
    for item in blog:
        # print item.find("span", "link_title").find("a").get("href"), item.find("span", "link_title").find("a").get_text()
        href = based_url + item.find("span", "link_title").find("a").get("href")
        yield Request(href, callback=self.parse_get_blog_title)
def parse(self, response):
    res = Selector(response)
    totalcount = res.xpath('/html/body/script').re('pageCount": .*,')[0]
    # Pull the full page count out of the matched fragment; the original
    # pattern '.*(.\d).*' only captured the last two characters.
    pages = int(re.findall(r'(\d+)', totalcount)[0])
    for i in range(1, pages + 1):
        # Page 1 is the current URL; later pages use the template. The
        # original assigned response.url and then unconditionally
        # overwrote it.
        if i == 1:
            url = response.url
        else:
            url = self.page_url.format(str(i))
        yield Request(url, callback=self.parse_page)
def parse_products(self, response):
    brand_name = ''.join(response.xpath(
        '//p[contains(@class, "category-image")]/img/@title').extract())
    products = response.xpath(
        '//ul[contains(@class, "products-grid")]//*[@class="product-name"]/a/@href'
    ).extract()
    for url in products:
        yield Request(response.urljoin(url), callback=self.parse_product,
                      meta={'brand': brand_name})
def parse(self, response): """ parse first response """ if self.spider_config['url_type'] == 'list_page': sel = Selector(response) box = sel.xpath(self.spider_config['list_xpath']) for x in box: item = DynamicItem(self.spider_config['item']) for key, value in self.spider_config['xpath']['keys'].iteritems(): result = x.xpath(value).extract() if len(result) == 1: # single value item[key] = result[0] else: item[key] = result # construct follow request if configured if self.spider_config['xpath']['follow'] is not None: # more to follow follow_config = self.spider_config['xpath']['follow'] if len(follow_config['follow_info']['url'].keys()) >= 2: # needs string formation arguments = dict() for key, value in follow_config['follow_info']['url'].iteritems(): # construct arguments if not key == 'base_url': arguments[key] = item[value] url = follow_config['follow_info']['url']['base_url'].format(**arguments) else: url = follow_config['follow_info']['url']['base_url'] request = Request(url, callback=self.parse_follow) request.meta['item'] = item request.meta['config'] = follow_config yield request else: # no follow request, so save the item get_model('barrow', 'SpiderResult').objects.add_result(spider_task=self.spider_task, item=item, unique=self.spider_config['unique_result'], unique_keys=self.spider_config[ 'unique_keys'])
def parse(self, response):
    # items = []
    hxs = HtmlXPathSelector(response)
    courses = hxs.select('//td[@class="nttitle"]/a')
    for c in courses:
        item = Course()
        url = c.select("@href").extract()[0]
        data = dict(e.split("=") for e in url.split("?")[1].split("&"))
        desc = c.select("text()").extract()[0].split(" - ")[1].strip()
        item["url"] = url
        item["number"] = data["crse_numb_in"]
        item["department"] = data["subj_code_in"]
        item["desc"] = desc
        request = Request("https://www.uvic.ca" + url,
                          callback=self.parse_details)
        request.meta["item"] = item
        yield request
def parse_city_info(self, response):
    ct = Selector(response)
    # Get the total number of pages.
    total_pages = ct.xpath(
        '//*[@id="citylistpagination"]/div/a[7]/@data-page').extract()[0]
    for page in range(1, int(total_pages) + 1):
        yield Request(self.cities_url.format(page=page), callback=self.parse)
def parse(self, response):
    pageinformation = response.xpath('//*[@id="threadlisttableid"]')
    hxs = HtmlXPathSelector(response)
    march_re = r'">\s*(.*)\<'
    # for eachstudent in pageinformation:
    item = AdmissionInformation()
    item['admission_time'] = hxs.xpath(
        '//*[contains(@id, "normalthread")]/tr/th/span/font[1]').re(r'">\s*(.*)\<')
    item['gre'] = hxs.xpath(
        '//*[contains(@id, "normalthread")]/tr/th/span/font[3]').re(r': \s*(.*)\</font>')
    item['gpa'] = hxs.xpath(
        '//*[contains(@id, "normalthread")]/tr/th/span/font[5]').re(r'">\s*(.*)\<')
    item['undergrad_school'] = hxs.xpath(
        '//*[contains(@id, "normalthread")]/tr/th/span/font[6]').re(r'>(.*)\</font>')
    item['major'] = hxs.xpath(
        '//*[contains(@id, "normalthread")]/tr/th/span/font[4]').re(r'color="green">\s*(.*)\<')
    item['english_grade'] = hxs.xpath(
        '//*[contains(@id, "normalthread")]/tr/th/span/font[2]').re(r'>:\s*(.*)\<')
    item['year'] = hxs.xpath(
        '//*[contains(@id, "normalthread")]/tr/th/span/u/font[1]').re(r'\[(.*)\<')
    item['admission_type'] = hxs.xpath(
        '//*[contains(@id, "normalthread")]/tr/th/span/u/font[2]').re(r'">\s*(.*)\<')
    item['admission_school'] = hxs.xpath(
        '//*[contains(@id, "normalthread")]/tr/th/span/u/font[5]').re(r'">\s*(.*)\<')
    item['admission_major'] = hxs.xpath(
        '//*[contains(@id,"normalthread")]/tr/th/span/u/font[4]/b').re(r'<b>\s*(.*)\<')
    item['title'] = hxs.xpath(
        '//*[contains(@id,"normalthread")]/tr/th/a[2]/text()').extract()
    item['status'] = hxs.xpath(
        '//*[contains(@id,"normalthread")]/tr/th/span/u/font[3]/b').re(r'<b>\s*(.*)\<')
    links = hxs.xpath(
        '//*[contains(@id,"normalthread")]/tr/th/a[2]').re(r'href\="([^\"]*)\"')
    urls_real = []
    for each in links:
        urls_real.append(each.replace('&amp;', '&'))
        # print('url is:' + each.replace('&amp;', '&'))
    item['link'] = urls_real
    yield item
    next_url = self.get_next_url(response.url)
    if next_url is not None:
        yield Request(next_url)
def parse(self, response): """通过 xpath 获取热门电子书的链接""" sel = Selector(response) sites = sel.xpath( '//div[@class="section ebook-area"]//ul[@class="list-col list-col5"]/li//div[@class="title"]' ) for site in sites: title = site.xpath('a/@title').extract() link = site.xpath('a/@href').extract() title, link = title[0], link[0] # print title, link yield Request(url=link, callback=self.parse2)
def parse_item(self, response):
    # print(response.url)
    item = MzituScrapyItem()
    item['url'] = response.url
    title = response.xpath('//h2[@class="main-title"]/text()').extract()[0]
    item['name'] = title
    max_num = response.xpath(
        '//div[@class="pagenavi"]/a[last()-1]/span/text()').extract()[0]
    # range end is max_num + 1 so the last page is included; the original
    # range(1, int(max_num)) skipped it.
    for i in range(1, int(max_num) + 1):
        page_url = response.url + "/" + str(i)
        yield Request(page_url, callback=self.get_image_url)
    # Note: the get_image_url callbacks run asynchronously and append to the
    # shared self.img_urls list, which this item references.
    item['image_urls'] = self.img_urls
    yield item
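# get_image_url is not shown in this snippet. A plausible sketch (the
# selector is an assumption); note it appends to self.img_urls, which is
# shared spider state, so concurrently crawled galleries will interleave
# their URLs:
def get_image_url(self, response):
    img_url = response.xpath(
        '//div[@class="main-image"]//img/@src').extract_first()
    if img_url:
        self.img_urls.append(img_url)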
def parse(self, response):
    based_url = "http://blog.csdn.net"
    list_result = ["http://blog.csdn.net/Temanm/article/list/1"]
    soup = BeautifulSoup(response.body, 'html.parser')
    pages = soup.find("div", "list_item_new").find("div", "pagelist").find_all("a")
    for page in pages:
        href = based_url + page.get("href")
        if href not in list_result:
            list_result.append(href)
    for link in list_result:
        yield Request(link, callback=self.parse_link)
def parse(self, response):
    # URLs on the current page.
    post_nodes = (response.css("div.div_list .div_item .div_title a")
                  or response.css("div.st_div .div_item .div_itemtitle a"))
    for post_node in post_nodes:
        # Extract each post URL.
        post_url = post_node.css("::attr(href)").extract_first("")
        yield Request(url=parse.urljoin(response.url, post_url),
                      callback=self.parse_detail)
    # Extract the next page and hand it to Scrapy for download.
    next_url = response.css("div.myp2c_div_paging ::attr(href)").extract_first("")
    # (Not implemented) The next-page link is generated by JavaScript, e.g.
    # javascript:fn_loaditems_id_6a4e96a3_7f4b_46f4_b383_5c6b27673ec3(2);
    # evaluating it is the key to auto-extracting the following page.
    # t_url = response.css("div.myp2c_div_paging ::attr(href)").extract_first("")
    # next_url = document.execCommand(t_url)
    if next_url:
        yield Request(url=parse.urljoin(response.url, next_url),
                      callback=self.parse)
def parse(self, response):
    pattern = r'var \$render_data = \[((.|\s)*?})\]'
    raw_data = re.search(pattern, response.text)
    raw_data = raw_data.group(1)
    json_data = json.loads(raw_data)
    status = json_data['status']
    items = self.putItem(status)
    yield items
    if status['reposts_count']:
        reposts_url = base_reposts_url % (status['id'], str(1))
        yield Request(reposts_url, callback=self.getReposters)
def start_requests(self):
    keys = self._get_search_keys()
    for k in keys:
        # dont_filter is a Request argument, not a meta key; it did nothing
        # inside meta.
        yield Request(
            url='http://bcc.blcu.edu.cn/zh/search/0/{0}'.format(urllib.quote(k)),
            dont_filter=True,
            meta={'dont_merge_cookies': True,
                  # 'dont_redirect': True,
                  # 'handle_httpstatus_list': [302],
                  })
def parse_details(self, response):
    """ Parse class prerequisites. """
    hxs = HtmlXPathSelector(response)
    prereqs = hxs.select(
        "//span[text()='Faculty']/following-sibling::text() | "
        "//span[text()='Faculty']/following-sibling::a")
    self.log("parsing " + response.url, level=DEBUG)
    self.log('prereqs = ' + str(prereqs), level=DEBUG)
    prereqs = self.parse_prereqs(prereqs)
    self.log('parsed prereqs = ' + str(prereqs), level=DEBUG)
    item = response.meta['item']
    item['prereqs'] = prereqs
    yield item
    calendar_url = self.calendar_url_template.format(subject=item['subject'],
                                                     number=item['number'])
    request = Request(calendar_url, callback=self.parse_calendar)
    request.meta['handle_httpstatus_list'] = [404]
    request.meta['subject'] = item['subject']
    request.meta['number'] = item['number']
    yield request
def parse_term(self, response):
    """
    Parses the schedule search page for a particular term. Extracts the
    subject list from the first select box and generates requests for the
    schedule for each subject in the current term. These requests are
    handled by the parse_schedule callback.
    """
    hxs = HtmlXPathSelector(response)
    if not TEST_RUN:
        subjects = hxs.select('//select[@id="subj_id"]/child::option').select('@value').extract()
    else:
        subjects = TEST_SUBJECTS
    term = response.meta['term']
    # self.log('Got subjects: ' + str(subjects))
    for subj in subjects:
        url = self.schedule_url_template.format(term=term, subject=subj, number='')
        item = ScheduleItem()
        item['term'] = term
        item['subject'] = subj
        request = Request(url, callback=self.parse_schedule)
        request.meta['item'] = item
        yield request
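# The UVic snippets rely on format-string URL templates defined on the
# spider. A sketch consistent with the .format(...) calls above; the actual
# Banner paths and query parameters are assumptions:
classlist_url_template = ('https://www.uvic.ca/BAN2P/bwckctlg.p_display_courses'
                          '?term_in={term}')
schedule_url_template = ('https://www.uvic.ca/BAN2P/bwckschd.p_get_crse_unsec'
                         '?term_in={term}&subj_in={subject}&crse_in={number}')
calendar_url_template = 'https://web.uvic.ca/calendar/CDs/{subject}/{number}.html'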
def parse_city(self, response):
    ct = Selector(response)
    # Get the link to the city's guide page.
    gonglve_link = ct.xpath('//*[@class="navbar-btn"]/@href').extract()[0]
    # Get the city name.
    city_name = response.meta.get('name')
    yield Request(self.domains_url + gonglve_link,
                  callback=self.gong_lve,
                  meta={'name': city_name, 'href': gonglve_link})
def parse(self, response):
    data = BeautifulSoup(response.text, 'lxml').find_all('li', class_='clear')
    for tag in data:
        # Create a fresh item per listing; the original reused one instance
        # across the loop, so every yielded item was the same object.
        item = LianjiafItem()
        page_url = response.url
        title = tag.find('div', class_='title').get_text()
        url = tag.div.find('a', attrs={'data-el': 'ershoufang'})['href']
        type = tag.find('div', class_='houseInfo').get_text()
        # Strip the 万 (10,000 CNY) unit from the total price.
        price = tag.find('div', class_='totalPrice').get_text().replace('万', '')
        # Copy each local variable with a matching name into the item.
        for field in item.fields:
            item[field] = eval(field)
        yield item
    page = response.xpath('//div[@comp-module="page"]').re(r'lPage\":(\d+)')[0]
    for u in range(1, int(page) + 1):
        urls = 'https://bj.lianjia.com/ershoufang/pg{}'.format(u)
        yield Request(urls, callback=self.parse)