def getReposters(self, response):
    pattern = r'page=(\d+)'
    result = re.search(pattern, response.url)
    page_id = result.group(1)
    try:
        json_data = json.loads(response.text)
        data = json_data['data']
        reposts_data = data['data']
        if int(page_id) == 1:
            self.max = data['max']
        for item in reposts_data:
            items = self.putItem(item)
            time_url = base_mblog_url % (items['mblog_id'])
            yield Request(url=time_url, meta={"item": items},
                          callback=self.get_accurate_time)
    except (ValueError, KeyError):
        pass
    if int(page_id) < int(self.max):
        reposts_url = re.sub(pattern, 'page=' + str(int(page_id) + 1), response.url)
        yield Request(reposts_url, callback=self.getReposters)
def start_requests(self):
    for row in self.tyre_sizes:
        if self.check_row_is_processed(row):
            continue
        self.add_row_to_history(row)
        meta = {'row': row}
        xl = ''
        if row['XL'] == 'XL':
            xl = 'Y'
            meta['xl'] = True
        run_flat = ''
        if row['Run Flat'] == 'RF':
            run_flat = 'Y'
            meta['run_flat'] = True
        url = ('http://www.point-s.co.uk/tyres?s=&width=' + row['Width']
               + '&profile=' + row['Aspect Ratio'] + '&size=' + row['Rim']
               + '&speed=' + row['Speed rating']
               + '&paginate=true&runflat=' + run_flat + '&extra_load=' + xl)
        yield Request(url, dont_filter=True, meta=meta)
        if row['Alt Speed']:
            url = ('http://www.point-s.co.uk/tyres?s=&width=' + row['Width']
                   + '&profile=' + row['Aspect Ratio'] + '&size=' + row['Rim']
                   + '&speed=' + row['Alt Speed']
                   + '&paginate=true&runflat=' + run_flat + '&extra_load=' + xl)
            yield Request(url, dont_filter=True, meta=meta)
def parse(self, response): """docstring for parse""" yield Request('http://www.ag.senate.gov/hearings', self.parse_data) for links in range(2, 10): yield Request( 'http://www.ag.senate.gov/hearings?PageNum_rs=' + str(links), self.parse_data)
def parse(self, response):
    sel = Selector(response)

    # Pages from 1998 onwards, new format.
    # These normally cover around a 2-6 year period.
    proceedings_menu = sel.xpath(
        '//a[starts-with(text(),"Official Record of Proceedings")]/@href')
    if proceedings_menu:
        for url in proceedings_menu.extract():
            absolute_url = urlparse.urljoin(response.url, url.strip())
            req = Request(absolute_url, callback=self.parse_hansard_index_page)
            yield req

    # Former Legislative Council (before 7/1997)
    table = sel.xpath(
        "//h3[contains(text(),'Former Legislative Council (before 7/1997)')]/following::table[1]")
    if table:
        links = table[0].xpath(
            ".//td/a[contains(text(),'Session')]/@href").extract()
        if links:
            for url in links:
                absolute_url = urlparse.urljoin(response.url, url.strip())
                req = Request(absolute_url, callback=self.parse_hansard_index_page)
                yield req
def parse(self, response):
    pattern = r'var \$render_data = \[((.|\s)*?})\]'
    raw_data = re.search(pattern, response.text)
    raw_data = raw_data.group(1)
    json_data = json.loads(raw_data)
    status = json_data['status']

    try:
        # If this is a repost, follow the original microblog and stop here;
        # `return <value>` inside a generator would never be yielded.
        is_retweeted = status['retweeted_status']
        mblog_id = is_retweeted['id']
        mblog_url = base_mblog_url % (mblog_id)
        yield Request(mblog_url, callback=self.getBlog)
        return
    except KeyError:
        pass

    items = self.putItem(status)
    usrdetail_url = base_usrdetail_url % (items['usr_id'])
    yield Request(url=usrdetail_url, meta={"item": items},
                  callback=self.getUsrDetail)
    if status['reposts_count']:
        reposts_url = base_reposts_url % (status['id'], str(1))
        yield Request(reposts_url, callback=self.getReposters)
def parse(self, response):
    for productxs in response.xpath(
            '//div[contains(@class, "products-list")]//div[@data-product]'):
        yield Request(
            productxs.xpath('.//a[@class="product-card-link"]/@href').extract()[0],
            callback=self.parse_product)

    next_page = response.xpath('//link[@rel="next"]/@href').extract()
    if next_page and 'Page.Next.Link' not in next_page[0]:
        yield Request(response.urljoin(next_page[0]))
def parse(self, response):
    item = BaiduItem()
    soup = BeautifulSoup(response.text, 'html.parser')
    title = soup.select('.lemmaWgt-lemmaTitle-title h1')[0].text
    subhead = soup.select('.lemmaWgt-lemmaTitle-title h2')
    if subhead:
        # print(subhead[0].text)
        title = title + subhead[0].text
    item['title'] = title

    info_list = soup.select('.lemma-summary div')
    info = ''
    for temp in info_list:
        # Collect the summary text.
        info += temp.text
        # If there are hyperlinks, keep crawling them.
        a_list = temp.select('a')
        for a in a_list:
            if a.has_attr('href'):
                yield Request(self.base_url + a['href'], headers=self.headers)
    item['info'] = info

    properties_list = soup.select('.basicInfo-block dt')
    properties = ''
    for pro in properties_list:
        properties += '###' + pro.text.strip().replace('\n', '')
        # If there are hyperlinks, keep crawling them.
        a_list = pro.select('a')
        for a in a_list:
            if a.has_attr('href'):
                yield Request(self.base_url + a['href'], headers=self.headers)
    item['properties'] = properties

    values_list = soup.select('.basicInfo-block dd')
    values = ''
    for val in values_list:
        values += '###' + val.text.strip().replace('\n', '')
        # If there are hyperlinks, keep crawling them.
        a_list = val.select('a')
        for a in a_list:
            if a.has_attr('href'):
                yield Request(self.base_url + a['href'], headers=self.headers)
    item['values'] = values

    if soup.select('.summary-pic img'):
        item['img'] = soup.select('.summary-pic img')[0]['src']
    print(item['title'])
    yield item
def parse_item(self, response):
    m2 = hashlib.md5()
    m2.update(response.url.encode('utf-8'))
    youm = Myoum7Item()
    youm['url'] = response.url
    youm['page_name'] = m2.hexdigest()
    youm['do_main'] = 'm.youm7.com'

    referer = response.request.headers.get('Referer', None)
    reattr = re.findall(r"sectionID=(\w+)", referer) if referer else []
    if reattr:
        youm['url'] = referer
        yield Request(referer, callback=self.parse_item)

    date_str = response.xpath('//div[@class="news-dev"]/@data-id')
    attr = re.findall(r"sectionID=(\w+)", response.url)
    if date_str and attr:
        sectionID = attr[0]
        date_str = date_str.extract()
        url_date = date_str[len(date_str) - 1]
        newUrl = ("https://m.youm7.com/Section/NewsSectionPaging?lastid="
                  + url_date + "&sectionID=" + str(sectionID))
        youm['url'] = newUrl
        yield Request(newUrl, callback=self.parse_item)

    title_str = response.xpath('//title/text()')
    content_str = response.xpath('//div[@class="text-cont"]//div[@id="articleBody"]//p/text()')
    # Category from the breadcrumb menu.
    type_str = response.xpath('//div[@class="container"]//div[@class="breadcumb"]//a/text()')
    if content_str and title_str:
        content = ""
        for s in content_str.extract():
            content += s
        youm['title'] = title_str.extract()[0]
        youm['content'] = content
        youm['str_size'] = len(content)
        # Category taken from the detail page.
        youm['type'] = type_str.extract()[1]
        yield youm
def getReposters(self, response):
    json_data = json.loads(response.text)
    data = json_data['data']
    max_page = data['max']
    reposts_data = data['data']
    pattern = r'page=(\d+)'
    result = re.search(pattern, response.url)
    page_id = result.group(1)
    for item in reposts_data:
        items = self.putItem(item)
        usrdetail_url = base_usrdetail_url % (items['usr_id'])
        yield Request(url=usrdetail_url, meta={"item": items},
                      callback=self.getUsrDetail)
    if int(page_id) < int(max_page):
        reposts_url = re.sub(pattern, 'page=' + str(int(page_id) + 1), response.url)
        yield Request(reposts_url, callback=self.getReposters)
def parse(self, response):
    # Links on the current page.
    post_nodes = response.css("div.list_body_box1 .art_tit a")
    for post_node in post_nodes:
        # Extract the article URL.
        post_url = post_node.css("::attr(href)").extract_first("")
        yield Request(url=parse.urljoin(response.url, post_url),
                      callback=self.parse_detail)

    # Extract the next page and hand it to Scrapy for download.
    # Still incomplete: this takes the first pagination link's href rather
    # than specifically the "next" link.
    next_url = response.css('div.list_body_box1 .pagingNormal a::attr(href)').extract_first()
    if next_url:
        yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)
def parse(self, response):
    self.log("Parsing page {}".format(self.current_page))
    no_data = response.xpath(".//div[@class='ico_list']/div[@class='no_data']")
    if no_data or self.current_page > self.max_page:
        self.log("no data = {}".format(no_data))
        self.log("No data or past the page limit, spider exiting. Max page: {}".format(self.max_page))
        return
    uris = response.xpath(".//div[@class='content']/a/@href").extract()
    for uri in uris:
        yield Request(self.domains + uri, self.parse_detail)
    self.current_page += 1
    yield Request(self.base_url.format(self.current_page), self.parse)
def parse(self, response):
    # Links on the current page.
    post_nodes = response.css("ul.listbox li a")
    for post_node in post_nodes:
        # Extract the article URL.
        post_url = post_node.css("::attr(href)").extract_first("")
        yield Request(url=parse.urljoin(response.url, post_url),
                      callback=self.parse_detail)

    # Extract the next page and hand it to Scrapy for download.
    # extract_first() avoids an IndexError when there is no next-page link.
    next_url = response.css("span.next_page a::attr(href)").extract_first()
    if next_url:
        yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)
def parse(self, response):
    original_key = response.url.split('/')[-1]
    key = urllib.unquote(original_key.split('?')[0])
    texts = response.xpath('//tbody/tr/td/text()').extract()
    filename = os.getenv('RESULT_PATH')
    texts = [t.encode('utf-8') for t in texts if '\n' not in t]
    merged_texts = []
    # Cells come in pairs; merge every two cells into one line (stepping by 2
    # also avoids an IndexError when the cell count is odd).
    for i in xrange(0, len(texts) - 1, 2):
        merged_texts.append(texts[i] + texts[i + 1] + '\n')
    # print 'lines num:', len(merged_texts)
    # not_200_path = os.getenv('NOT_200')
    # if response.status != 200:
    #     with open(key+'\t'+str(len(set(merged_texts)))+'\n')

    legacy_file_path = os.getenv('LEGACY_PATH')
    if len(merged_texts) == 100:
        with open(legacy_file_path, 'a') as legacy_file:
            legacy_file.write(key + '\n')
    with open(filename, 'a') as f:
        f.write(key + '\t' + str(len(set(merged_texts))) + '\n')

    if len(merged_texts) > 0:
        detail_urls = response.xpath('//tbody/tr/td/a/@href').extract()
        for d in detail_urls:
            print "detail url is %s \n" % d
            # dont_filter is a Request argument, not a meta key.
            yield Request(url='http://bcc.blcu.edu.cn{0}'.format(d),
                          meta={'dont_merge_cookies': True},
                          dont_filter=True,
                          callback=self.parse_detail)
def parse_comment(self, response):
    comments_json = json.loads(response.body_as_unicode())
    if comments_json['message'] != 'success':
        return
    comments = comments_json['data']['comments']
    for comment in comments:
        item = CommentInfo({
            'comment': comment['text'],
            'likes': comment['digg_count'],
            'time': comment['create_time'],
            'comment_id': comment['id']
        })
        self.copy_article_info(response.meta, item)
        if comment['reply_count'] > 0:
            reply_to_comment_url = (
                'http://www.toutiao.com/api/comment/get_reply/?comment_id='
                + str(comment['id'])
                + '&dongtai_id=' + str(comment['dongtai_id'])
                + '&offset=0&count=' + str(comment['reply_count']))
            reply_request = Request(reply_to_comment_url,
                                    callback=self.parse_reply, method='GET')
            self.copy_article_info(response.meta, reply_request.meta)
            reply_request.meta['reply_to_id'] = comment['id']
            yield reply_request
        # Yield each comment item directly; `return items` inside a generator
        # never hands the collected items back to Scrapy.
        yield item
def parse(self, response):
    imgurl = response.xpath(
        "//div[@id='waterfall']/div[@class='pin wfc wft']/a/img/@src").extract()
    for i in range(len(imgurl)):
        # Build a fresh dict per image instead of mutating one shared item.
        item = {}
        item["name"] = self.title
        item["imgurl"] = imgurl[i]
        item["imgherf"] = response.xpath(
            "//div[@id='waterfall']/div[@class='pin wfc wft']/a/@href").extract()[i]
        item["imgvisit"] = response.xpath(
            "//div[@id='waterfall']/div[@class='pin wfc wft']/p/span[@class='repin']/text()").extract()[i]
        try:
            item["imglike"] = response.xpath(
                "//div[@id='waterfall']/div[@class='pin wfc wft']/p/span[@class='like']/text()").extract()[i]
        except IndexError:
            item["imglike"] = "0"
        try:
            item["imgdiscrit"] = response.xpath(
                "//div[@id='waterfall']/div[@class='pin wfc wft']/p[@class='description']/text()").extract()[i]
        except IndexError:
            item["imgdiscrit"] = ""
        yield item
    for i in range(4):
        yield Request(url=response.url, callback=self.next,
                      meta={"page": "2"}, dont_filter=True)
def parse(self, response):
    data = json.loads(response.body)['result']['products'].values()
    for i in data:
        item = Book_Product()
        item['title'] = i['title']
        item['subtitle'] = i['subtitle']
        item['uid'] = i['permanentProductPageUrl'].split('/')[-1].split('?')[0]
        item['fsp'] = i['fsp']
        item['mrp'] = i['mrp']
        self.items.append(item)
    print len(self.items)
    self.count += 10
    if self.count > 100:
        # Python < 3.3 doesn't allow mixing return and yield statements in the
        # same function, so we yield another request whose callback
        # (self.return_data) returns the collected result.
        yield Request("http://www.flipkart.com/m/store/buk/loadmore?store=buk&start=%d" % self.count,
                      self.return_data)
    yield Request("http://www.flipkart.com/m/store/buk/loadmore?store=buk&start=%d" % self.count,
                  self.parse)
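# A minimal sketch of the self.return_data callback referenced above, on the
# assumption that it simply hands the accumulated self.items list back to
# Scrapy once pagination stops; the real implementation is not part of this
# snippet, so treat this as illustrative only.
def return_data(self, response):
    # Returning a list of items from a callback lets Scrapy process them all
    # without mixing `return <value>` and `yield` in the paginating parse().
    return self.items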
def parse(self, response):
    res = Selector(response)
    totalcount = res.xpath('/html/body/div/div[3]/@totalcount').extract()[0]
    totalpage = int(int(totalcount) / 15 + 1)
    for i in range(1, totalpage + 1):
        url = response.url + '?page.pageNo=' + str(i)
        yield Request(url, callback=self.parse_page)
def parse_link(self, response):
    based_url = "http://blog.csdn.net"
    soup = BeautifulSoup(response.body, 'html.parser')
    blog = soup.find_all("div", "list_item article_item")
    for item in blog:
        # print item.find("span", "link_title").find("a").get("href"), item.find("span", "link_title").find("a").get_text()
        href = based_url + item.find("span", "link_title").find("a").get("href")
        yield Request(href, callback=self.parse_get_blog_title)
def parse(self, response):
    res = Selector(response)
    totalcount = res.xpath('/html/body/script').re('pageCount": .*,')[0]
    pages = int(re.findall(r'.*(.\d).*', totalcount)[0])
    for i in range(1, pages + 1):
        # Page 1 is the current URL; later pages use the page-URL template.
        if i == 1:
            url = response.url
        else:
            url = self.page_url.format(str(i))
        yield Request(url, callback=self.parse_page)
def parse_products(self, response):
    brand_name = ''.join(response.xpath(
        '//p[contains(@class, "category-image")]/img/@title').extract())
    products = response.xpath(
        '//ul[contains(@class, "products-grid")]//*[@class="product-name"]/a/@href').extract()
    for url in products:
        yield Request(response.urljoin(url), callback=self.parse_product,
                      meta={'brand': brand_name})
def parse_city_info(self, response):
    ct = Selector(response)
    # Total number of pages.
    total_pages = ct.xpath(
        '//*[@id="citylistpagination"]/div/a[7]/@data-page').extract()[0]
    for page in range(1, int(total_pages) + 1):
        yield Request(self.cities_url.format(page=page), callback=self.parse)
def parse(self, response):
    pageinformation = response.xpath('//*[@id="threadlisttableid"]')
    hxs = HtmlXPathSelector(response)
    march_re = r'">\s*(.*)\<'
    # for eachstudent in pageinformation:
    item = AdmissionInformation()
    item['admission_time'] = hxs.xpath(
        '//*[contains(@id, "normalthread")]/tr/th/span/font[1]').re(r'">\s*(.*)\<')
    item['gre'] = hxs.xpath(
        '//*[contains(@id, "normalthread")]/tr/th/span/font[3]').re(r': \s*(.*)\</font>')
    item['gpa'] = hxs.xpath(
        '//*[contains(@id, "normalthread")]/tr/th/span/font[5]').re(r'">\s*(.*)\<')
    item['undergrad_school'] = hxs.xpath(
        '//*[contains(@id, "normalthread")]/tr/th/span/font[6]').re(r'>(.*)\</font>')
    item['major'] = hxs.xpath(
        '//*[contains(@id, "normalthread")]/tr/th/span/font[4]').re(r'color="green">\s*(.*)\<')
    item['english_grade'] = hxs.xpath(
        '//*[contains(@id, "normalthread")]/tr/th/span/font[2]').re(r'>:\s*(.*)\<')
    item['year'] = hxs.xpath(
        '//*[contains(@id, "normalthread")]/tr/th/span/u/font[1]').re(r'\[(.*)\<')
    item['admission_type'] = hxs.xpath(
        '//*[contains(@id, "normalthread")]/tr/th/span/u/font[2]').re(r'">\s*(.*)\<')
    item['admission_school'] = hxs.xpath(
        '//*[contains(@id, "normalthread")]/tr/th/span/u/font[5]').re(r'">\s*(.*)\<')
    item['admission_major'] = hxs.xpath(
        '//*[contains(@id,"normalthread")]/tr/th/span/u/font[4]/b').re(r'<b>\s*(.*)\<')
    item['title'] = hxs.xpath(
        '//*[contains(@id,"normalthread")]/tr/th/a[2]/text()').extract()
    item['status'] = hxs.xpath(
        '//*[contains(@id,"normalthread")]/tr/th/span/u/font[3]/b').re(r'<b>\s*(.*)\<')
    links = hxs.xpath('//*[contains(@id,"normalthread")]/tr/th/a[2]').re(r'href\="([^\"]*)\"')
    urls_real = []
    for each in links:
        # Undo the HTML entity escaping in the extracted hrefs.
        urls_real.append(each.replace('&amp;', '&'))
        # print('url is:' + each.replace('&amp;', '&'))
    item['link'] = urls_real
    yield item
    next_url = self.get_next_url(response.url)
    if next_url is not None:
        yield Request(next_url)
def parse_item(self, response):
    # print(response.url)
    item = MzituScrapyItem()
    item['url'] = response.url
    title = response.xpath('//h2[@class="main-title"]/text()').extract()[0]
    item['name'] = title
    max_num = response.xpath(
        '//div[@class="pagenavi"]/a[last()-1]/span/text()').extract()[0]
    # Include the last page (max_num) in the range.
    for i in range(1, int(max_num) + 1):
        page_url = response.url + "/" + str(i)
        yield Request(page_url, callback=self.get_image_url)
    item['image_urls'] = self.img_urls
    yield item
def parse(self, response): """通过 xpath 获取热门电子书的链接""" sel = Selector(response) sites = sel.xpath( '//div[@class="section ebook-area"]//ul[@class="list-col list-col5"]/li//div[@class="title"]' ) for site in sites: title = site.xpath('a/@title').extract() link = site.xpath('a/@href').extract() title, link = title[0], link[0] # print title, link yield Request(url=link, callback=self.parse2)
def parse(self, response):
    based_url = "http://blog.csdn.net"
    list_result = ["http://blog.csdn.net/Temanm/article/list/1"]
    soup = BeautifulSoup(response.body, 'html.parser')
    pages = soup.find("div", "list_item_new").find("div", "pagelist").find_all("a")
    for i in range(len(pages)):
        href = based_url + pages[i].get("href")
        if href not in list_result:
            list_result.append(href)
    for link in list_result:
        yield Request(link, callback=self.parse_link)
def parse(self, response):
    pattern = r'var \$render_data = \[((.|\s)*?})\]'
    raw_data = re.search(pattern, response.text)
    raw_data = raw_data.group(1)
    json_data = json.loads(raw_data)
    status = json_data['status']
    items = self.putItem(status)
    yield items
    if status['reposts_count']:
        reposts_url = base_reposts_url % (status['id'], str(1))
        yield Request(reposts_url, callback=self.getReposters)
def parse(self, response):
    # Links on the current page.
    post_nodes = (response.css("div.div_list .div_item .div_title a")
                  or response.css("div.st_div .div_item .div_itemtitle a"))
    for post_node in post_nodes:
        # Extract the article URL.
        post_url = post_node.css("::attr(href)").extract_first("")
        yield Request(url=parse.urljoin(response.url, post_url),
                      callback=self.parse_detail)

    # Extract the next page and hand it to Scrapy for download.
    next_url = response.css("div.myp2c_div_paging ::attr(href)").extract_first("")
    # (Not implemented) The next-page link is a JavaScript call, which is the
    # key to automatically reaching the following page, e.g.
    # javascript:fn_loaditems_id_6a4e96a3_7f4b_46f4_b383_5c6b27673ec3(2)
    # t_url = response.css("div.myp2c_div_paging ::attr(href)").extract_first("")
    # next_url = document.execCommand(t_url)
    if next_url:
        yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)
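# A minimal sketch of one way to follow the JavaScript pagination noted above:
# pull the page index out of the javascript: href with a regex and rebuild the
# request URL from a template. Both `page_url_template` and the idea that the
# site accepts a plain page parameter are assumptions, not something the
# original spider confirms; the module-level `re` and `Request` imports the
# other snippets rely on are assumed here as well.
def parse_js_next_page(self, response):
    js_href = response.css("div.myp2c_div_paging ::attr(href)").extract_first("")
    # e.g. javascript:fn_loaditems_id_6a4e96a3_7f4b_46f4_b383_5c6b27673ec3(2)
    match = re.search(r'\((\d+)\)\s*$', js_href)
    if match:
        next_page = int(match.group(1))
        # Hypothetical template such as "http://example.com/list?page={}".
        yield Request(self.page_url_template.format(next_page), callback=self.parse)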
def start_requests(self):
    keys = self._get_search_keys()
    for k in keys:
        # dont_filter is a Request argument, not a meta key.
        yield Request(
            url='http://bcc.blcu.edu.cn/zh/search/0/{0}'.format(urllib.quote(k)),
            meta={
                'dont_merge_cookies': True
                # 'dont_redirect': True,
                # 'handle_httpstatus_list': [302]
            },
            dont_filter=True)
def parse_city(self, response):
    ct = Selector(response)
    # Link to the city's travel-guide page.
    gonglve_link = ct.xpath('//*[@class="navbar-btn"]/@href').extract()[0]
    # City name passed in via meta.
    city_name = response.meta.get('name')
    yield Request(self.domains_url + gonglve_link,
                  callback=self.gong_lve,
                  meta={'name': city_name, 'href': gonglve_link})
def parse(self, response):
    data = BeautifulSoup(response.text, 'lxml').find_all('li', class_='clear')
    for tag in data:
        # Build a fresh item per listing instead of mutating one shared item.
        item = LianjiafItem()
        page_url = response.url
        title = tag.find('div', class_='title').get_text()
        url = tag.div.find('a', attrs={'data-el': 'ershoufang'})['href']
        type = tag.find('div', class_='houseInfo').get_text()
        # Price is quoted in units of 10,000 CNY ('万'); strip the unit.
        price = tag.find('div', class_='totalPrice').get_text().replace('万', '')
        for field in item.fields:
            # Copy each local variable with the same name into the item.
            item[field] = eval(field)
        yield item
    page = response.xpath('//div[@comp-module="page"]').re(r'lPage\"\:(\d+)')[0]
    for u in range(1, int(page) + 1):
        urls = 'https://bj.lianjia.com/ershoufang/pg{}'.format(u)
        yield Request(urls, callback=self.parse)