def parse_page1(self, response):
    # Save the English page
    page_en = response.url.split("/")[-1]
    print(page_en)
    article_id = page_en.split("-")[0]
    filename_en = 'crawledsites/%s-en.html' % article_id
    hxs = Selector(response)
    text = hxs.select('//div[@class="article"]//text()').extract()
    links = hxs.select('//div[@class="article"]/p/em/a/@href').extract()
    id = Page_ID()
    id['number'] = article_id
    # Write the English text once, before following the translation links
    with open(filename_en, 'w') as f:
        f.writelines(text)
    self.log('Saved file %s' % filename_en)
    for link in links:
        if 'kurdi' in link:
            # Go to the Kurdish link
            request = scrapy.Request(url=link, callback=self.parse_page3, dont_filter=True)
        else:
            # Go to the Turkish link
            request = scrapy.Request(url=link, callback=self.parse_page2, dont_filter=True)
        request.meta['id'] = id['number']
        yield request

def parse(self, response): """Parse the wikipedia page of a historical figure""" # Extract the figure's image, birth and death date info_box = Selector(text=response.css('.infobox').extract_first()) image_url = info_box.xpath('//a[@class="image"]//@src').extract_first() image_url = "https:" + image_url death_date = info_box.xpath( '//*[@class="dday deathdate"]//text()').extract_first() death_date_raw = info_box.select( "//th[contains(text(), 'Died')]/following-sibling::td/text()" ).extract_first() birth_date = info_box.xpath( '//*[@class="bday"]//text()').extract_first() birth_date_raw = info_box.select( "//th[contains(text(), 'Born')]/following-sibling::td/text()" ).extract_first() # Create the wiki item wiki_item = WikiItem() wiki_item["image_url"] = image_url wiki_item["death_date"] = death_date wiki_item["death_date_raw"] = death_date_raw wiki_item["birth_date"] = birth_date wiki_item["birth_date_raw"] = birth_date_raw wiki_item["curid"] = response.meta['curid'] yield wiki_item
def parse(self, response):
    selector = Selector(response)
    for sel in selector.select("//a"):
        title = sel.xpath("text()").extract()
        if len(title) == 0:
            continue
        url = sel.xpath("@href").extract()
        if len(url) == 0:
            continue
        if "sample.asp" in url[0] or "browse.asp" in url[0]:
            child_url = url[0]
            if not child_url.startswith(ROOT_PAGE):
                child_url = ROOT_PAGE + child_url
            page_id = self.get_page_id(child_url)
            if page_id in self.already_crawled:
                continue
            self.already_crawled.add(page_id)
            yield Request(child_url, self.parse)
    # now download the file if it is a sample
    if "sample.asp" in response.url:
        item = MTCrawlerItem()
        item["link"] = response.url
        item["body"] = selector.select("//html").extract()[0]
        page_ids = self.get_page_id(response.url).split("::")
        item["sample_name"] = page_ids[0]
        item["type_name"] = page_ids[1]
        yield item
    time.sleep(SLEEP_TIME)

def parse(self, response):
    hxs = Selector(response)
    for sel in hxs.select('//div[@id="ires"]//li[@class="g"]//h3[@class="r"]'):
        name = u''.join(sel.select(".//text()").extract())
        url = _parse_url(sel.select('.//a/@href').extract()[0])
        region = _get_region(url)
        if len(url):
            if self.download_html:
                yield Request(url=url, callback=self.parse_item,
                              meta={'name': name, 'query': response.meta['query']})
            else:
                yield GoogleSearchItem(
                    url=url, name=name, query=response.meta['query'],
                    crawled=datetime.datetime.utcnow().isoformat())
    next_page = hxs.select(
        '//table[@id="nav"]//td[contains(@class, "b") and position() = last()]/a')
    if next_page:
        url = self._build_absolute_url(response, next_page.select('.//@href').extract()[0])
        yield Request(url=url, callback=self.parse, meta={'query': response.meta['query']})

def post_parse(self, response):
    x = Selector(response)  # wrap the downloaded page source
    item = XywyCrawlItem()  # instantiate the item
    items = []
    # initialise every field of the record
    item['questionUrl'] = response.url  # URL of the Q&A page
    item['question'] = ""    # the patient's question (some pages only have a description)
    item['analyse'] = ""     # analysis of the medical problem
    item['suggestion'] = ""  # suggested treatment
    # extract the question
    str_list = x.select('//div[@class="graydeep User_quecol pt10 mt10"]/text()').extract()
    if len(str_list) == 1:
        item['question'] = str_list[0]   # take the first string
    elif len(str_list) == 2:
        item['question'] = str_list[1]   # take the second string
    else:
        # if no question is found, fall back to the description
        str_list = x.select('//h2/p[@class="fl dib fb"]/text()').extract()
        item['question'] = str_list[0]
    # extract the answer
    str_list = x.select('//div[@class="pt15 f14 graydeep pl20 pr20"]/text()').extract()
    if len(str_list) > 1:
        item['analyse'] = str_list[0].encode("utf-8")     # first string is the analysis
        item['suggestion'] = str_list[1].encode("utf-8")  # second string is the suggestion
    elif len(str_list) == 1:
        # some answers only contain an analysis
        item['analyse'] = str_list[0].encode("utf-8")
    log.msg(item['questionUrl'] + ' ' + item['question'])  # write to the log
    print item  # trace the crawl on the command line
    items.append(item)
    return items

def parse_item(self, response):
    with open('first.html', 'wb') as f:
        f.write(response.body)
    hxp = Selector(response)
    urls = hxp.select(
        "//div[@class='excerpts-wrapper']/div[@class='excerpts']/article/a/@href").extract()
    titles = hxp.select(
        "//div[@class='excerpts-wrapper']/div[@class='excerpts']/article/a/text()").extract()
    item = DaomubijiItem()
    print '=========================='
    for title in titles:
        print title.split(' ')[1]
    for i in range(len(urls)):
        print 'i = ' + str(i)
        arr = titles[i].split(' ')
        # pair each title with its own url, not always the first one
        if len(arr) >= 3:
            item['url'] = urls[i]
            item['chapter'] = arr[0]
            item['chapter_num'] = arr[1]
            item['section'] = arr[2]
            item['name'] = '盗墓笔记'
        else:
            item['url'] = urls[i]
            item['chapter'] = arr[0]
            item['chapter_num'] = arr[1]
            item['name'] = '盗墓笔记'
        # time.sleep(1)
        yield item
    print '=========================='

def parse_products(self, response):
    base_url = 'http://soak.com'
    hxs = Selector(text=response.body.replace('\\"', '"'))
    products = hxs.select(
        '//div[contains(@class, "product")]//a[div[@class="name"]]/@href').extract()
    for product in products:
        yield Request(urljoin_rfc(base_url, product), callback=self.parse_product)
    pages = hxs.select('//a[contains(@class, "pageNumber")]/text()').extract()
    for page in pages:
        next_page = (
            'http://fsm.attraqt.com/zones-js.aspx?version=2.23.2&'
            'siteId=4170eb3b-f55c-40d3-aaeb-8cb777e96a28&referrer=&'
            'sitereferrer=&pageurl=' + response.meta['url'] +
            '%23esp_pg%3D' + page + '&zone0=category_recs1&'
            'zone1=category&zone2=banner_advert&zone3=category_recs2&'
            'zone4=category_recs3&facetmode=data&mergehash=true&'
            'config_categorytree=' + response.meta['category_tree'] +
            '&config_category=' + response.meta['category_conf'])
        yield Request(next_page, callback=self.parse_products, meta=response.meta)

def parse_lifecall(self, response):
    sel = Selector(response)
    page = LifecallItem()
    page['url'] = response.url
    page['state'] = sel.select('//h3[@class="state-name"]/text()').extract()
    page['fulltext'] = sel.select('//div[@id="inside-content"]/div//p/text()').extract()
    page['links'] = sel.select('//div[@id="inside-content"]/div//a/text()').extract()
    return page

def test_deprecated_selector_methods(self):
    sel = Selector(TextResponse(url="http://example.com", body=b'<p>some text</p>'))
    with warnings.catch_warnings(record=True) as w:
        sel.select('//p')
        self.assertSubstring('Use .xpath() instead', str(w[-1].message))
    with warnings.catch_warnings(record=True) as w:
        sel.extract_unquoted()
        self.assertSubstring('Use .extract() instead', str(w[-1].message))

def parse_article(self, response):
    weixin_name, weixin_id = response.meta['weixin_name'], response.meta['weixin_id']
    x = Selector(response)
    title = ''.join(x.select('//h2[@id="activity-name"]/text()').extract()).strip()
    publish_date = ''.join(x.select('//em[@id="post-date"]/text()').extract()).strip()
    briefs = x.select('//div[@id="js_content"]/*').extract()
    print briefs

def parse(self, response):
    response_selector = Selector(response)
    # guard against pages without a "next" link before indexing into the result
    next_links = response_selector.select(u'//span[@class="next"]/a/@href').extract()
    if next_links:
        next_link = clean_url(response.url, next_links[0], response.encoding)
        log.msg("jinxp next link is %s" % next_link, level=log.DEBUG)
        yield Request(url=next_link, callback=self.parse)
    for detail_link in response_selector.select(
            u'//table[@class="olt"]/tr[@class=""]/td[@class="title"]/a/@href').extract():
        if detail_link:
            detail_link = clean_url(response.url, detail_link, response.encoding)
            yield Request(url=detail_link, callback=self.parse_detail)

def parse(self, response):
    hxp = Selector(response)
    titles = hxp.select("//*[@id='list']/dl/dd/a/text()").extract()[9:]
    urls = hxp.select("//*[@id='list']/dl/dd/a/@href").extract()[9:]
    self.test = titles
    domain = 'http://www.booktxt.net/'
    for urlaa in urls:
        url = domain + urlaa
        # self.title = titles[i]
        yield Request(url, callback=self.parse_txt)

def parse_product_list(self, response):
    data = json.loads(response.body)
    meta = response.meta
    product_list = json.loads(data['d'])
    record_count = int(product_list['recordCount'])
    skip = meta.get('skip', 0)
    if skip < record_count:
        # request the next page of results (assign rather than +=, so a missing
        # 'skip' key on the first response can't raise a KeyError)
        meta['skip'] = skip + self.per_page
        req = FormRequest(self.product_list_url.format(rand=get_epoch()),
                          headers=self.req_headers,
                          method='POST',
                          body=self.req_body.format(categ=meta['category_code'],
                                                    skip=meta['skip'],
                                                    per_page=self.per_page),
                          meta=meta,
                          callback=self.parse_product_list)
        yield req
    for product in product_list['products']:
        loader = ProductLoader(item=Product(), response=response)
        sel = Selector(text=product['listContent'])
        loader.add_value('name', product['description'])
        loader.add_value('sku', product['productCode'])
        loader.add_value('identifier', product['productCode'])
        # The price returned in the HTML for contact lenses is a discount price,
        # so for those the 'price' field is used. The field can't be used for all
        # products because it's incorrect for most items.
        if 'contact lenses' in ' '.join(meta['categories']).lower():
            loader.add_value('price', product['price'])
        else:
            price = sel.select('.//span[@class="ProductPriceLabel"]/text()').extract()
            loader.add_value('price', price)
        url = sel.select('.//a/@href').extract()
        loader.add_value('url', response.urljoin(url[0]))
        image_url = sel.select('.//img/@src').extract()
        loader.add_value('image_url', response.urljoin(image_url[0]))
        loader.add_value('shipping_cost', '7.95')
        for category in meta['categories']:
            loader.add_value('category', category)
        item = loader.load_item()
        metadata = SpecSaversMeta()
        promotion = map(lambda x: x.strip(),
                        sel.select('.//h4[@class="ContactLens"]//text()').extract())
        promotion = ' '.join(promotion).strip()
        metadata['promotion'] = promotion if promotion else ''
        item['metadata'] = metadata
        yield item

def parse_item(self, response):
    self.log('PRODUCT: A response from %s just arrived!' % response.url)
    hxs = Selector(response)
    item = SolarprojectItem()
    parsed = urlparse.urlparse(response.url)
    params = dict(urlparse.parse_qsl(parsed.query))
    print params
    item['id'] = params['products_id']
    item['name'] = hxs.select("//h1/text()").extract()
    item['sku'] = hxs.select("//table/tr[6]/td[3]/text()").extract()
    item['price'] = hxs.select("//table/tr[2]/td[3]/span/text()").extract()
    # scrapy.shell.inspect_response(response)
    return item

def parse_with_term(self, response, term, newspaper):
    # clean response from scripts
    response_content = remove_tags_with_content(response.text, ('script', 'a',))
    selector = Selector(text=response_content)
    # use the term passed in, matching the rest of the method
    term_query = '//body//*[contains(text(), "%s")]/text()' % term
    term_nodes = selector.select(term_query).extract()
    if not term_nodes:
        return
    item = {
        'url': response.url,
        'newspaper': newspaper,
        'term': term,
        'response_content': response.text,
        'timestamp': time.time()
    }
    related_terms = self.get_related_terms(term_nodes)
    if term in related_terms:
        related_terms.pop(term)
    item['related_terms'] = dict(related_terms)
    #with open(self.term, 'a') as content_file:
    #    content_file.write("%s\n" % item)
    cb_client.insert(str(uuid.uuid4()), item)
    # update scraper process
    self.update_scraper_summary(item)
    return item

def parse_post(response):
    post = dict()
    sel = Selector(response)
    title = sel.xpath('//h1[@class="title"]/'
                      'span[@class="post_title"]/text()').extract_first()
    post["title"] = title
    # hubs = sel.xpath('//div[@class="hubs"]/span[@class="profiled_hub"]/'
    #                  'preceding-sibling::a/text()').extract()
    hubs = sel.xpath('//div[@class="hubs"]/a/text()').extract()
    post["hubs"] = hubs
    tags = sel.xpath('//ul[@class="tags icon_tag"]/li/a/text()').extract()
    post['tags'] = tags
    content_html = sel.xpath('//div[@class="content html_format"]').extract_first()
    soup = BeautifulSoup(content_html)
    # strip code blocks from the post body
    for match in soup.findAll('code'):
        match.replaceWith('')
    post['content'] = soup.get_text()
    post['url'] = response.url
    return post

def parseItemList(self, response):
    links = LinkExtractor(
        allow=('http://www\.zuccalahomes\.com\.au/\?property=[\w-]+$')).extract_links(response)
    hxs = Selector(response)
    BuildType = self._getBuildType(response.url)
    for v in links:
        # build a fresh meta dict per request so later iterations
        # don't overwrite the values of earlier requests
        item = {'BuildType': BuildType}
        info = hxs.select(
            '''//li[@class="clearfix grid-item type-"]/div[@class="property-block"]/
            a[@href="{}"]/following-sibling::div[@class="property-info"]/text()'''
            .format(v.url)).extract()
        # with open('links from parseItem', 'a') as f:
        #     f.write(str(info) + '\n\n\n\n\n\n\n\n\n\n')
        squCom = re.compile(r'((?<=Size:)(\s+)?\d+\.\d+[a-zA-Z]+)')
        hwCom = re.compile(r'((?<=Lot Length:)(\s+)?\d+[a-zA-Z]+)')
        hlCom = re.compile(r'((?<=Lot Width:)(\s+)?\d+[a-zA-Z]+)')
        for i in info:
            try:
                item['Squares'] = squCom.search(i).group()
            except AttributeError:
                pass
            try:
                item['HouseWidth'] = hwCom.search(i).group()
            except AttributeError:
                pass
            try:
                item['HouseLength'] = hlCom.search(i).group()
            except AttributeError:
                pass
        yield Request(v.url, callback=self.parseItem, meta=item)

def parse(self, response):
    sel = Selector(response)
    results = sel.select("//tr/td[@class='alt1']")
    for result in results:
        # filtering name posts
        # names = result.select('.//font[contains(@color, "Purple")]').extract()
        names = result.select('.//div[contains(text(),"{}")]'.format(
            os.environ.get('keyword1'))).extract()
        if len(result.select(".//br")) > 10 and len(names) == 0:
            story = result.extract()
            if os.environ.get('keyword2') in story:
                re_exp_newline = re.compile('<br>\n<br>')
                re_exp_remove_tag = re.compile('<.*?>')
                story = re.sub(re_exp_newline, '\n', story)
                story = re.sub(re_exp_remove_tag, '', story)
                # story = re.sub(r'\n+', '\n', story)
                story = re.sub(r'\n\s*\n', '\n\n', story)
                # escape the parens and quotes so the call is matched literally
                story = re.sub(r'OA_show\("postbit"\);', '', story)
                page = re.sub('.*&+', '', response.request.url)
                if len(page) > 8:
                    page = "page=0"
                print("*******************************************************")
                story = re.sub('Last.*', "", story)
                story = re.sub("The following+.*", "", story)
                with open('output.txt', 'a') as f:
                    f.write(page)
                    f.write(story)
                    f.write("\n **************************\n\n\n")

def parse_items(self, response):
    hxs = Selector(response)
    for url in hxs.select('//a/@href').extract():
        # the original checked 'https://' twice; relative links need both schemes tested
        if not (url.startswith('http://') or url.startswith('https://')):
            url = URL + url
        print(url)
        yield Request(url, callback=self.parse)

def parse_list(self, response):
    hxs = Selector(response)
    titles = hxs.select(
        "//div[contains(@class,'product-unit unit-4 browse-product new-design')]")
    items = []
    count1 = 0
    for title in titles:
        count1 = count1 + 1
        item = TutorialItem()
        item['model'] = str(title.select(
            ".//div[contains(@class,'pu-title')]/a/text()").extract()).encode('utf-8').strip()
        item['offer'] = title.select(
            ".//div[contains(@class,'pu-final')]/span/text()").extract()
        # extract() was missing here; without it the field held a raw SelectorList
        item['image'] = title.select(
            ".//div[contains(@class,'pu-visual-section')]/a/img/@data-src").extract()
        item['standard_url'] = "http://www.flipkart.com" + \
            title.select(".//div[contains(@class,'pu-title')]/a/@href")[0].extract()
        # return items
        request = Request(item['standard_url'], callback=self.new_features)
        request.meta['item'] = item
        items.append(item)
        yield request

def parse(self, response):
    #for sel in response.xpath('//ul/li'):
    #hxs = HtmlXPathSelector(response)
    sel = Selector(response)
    bigs = sel.select("//a[@class='nav_bar_link']/@href").extract()
    items = []
    item = DmozItem()
    yield bigs

def parse(self, response):
    hxs = Selector(response)
    # the XPath ended in a stray "]", which made the expression invalid
    event = hxs.select("//div[@id='Box']/div[@id='InnerBox_1']/div[@class='list fleft']/div[@class='det fleft']/ul/li[@class='listhd']/a/text()").extract()
    place = hxs.select("//div[@class='list fleft']/div[@class='det fleft']/ul/li/span[@class='bold']/text()").extract()
    date = hxs.select("//div[@class='list fleft']/div[@class='evdate fleft']/div[@class='dtblock']/span[@class='bold fnt14']/text()").extract()
    type = ["Sports", ]
    place1 = ["Bangalore", ]
    time = []
    # (the original reset event = [] here, which clobbered the scraped events)
    print date
    print event
    print place
    #items = []
    #l = len(event)
    #for i in range(l):
    #    self.c.writerow([event[i], type[0], place[i], place1[0], date[i], ""])

def contentsParse(self, url):
    data = urllib.urlopen(url).read()
    hxs = Selector(text=data)
    contents = ''.join(hxs.select('//div[@id="userct"]').extract())
    contents = contents.replace("'", '"')
    return contents

def parse_product(self, response):
    pd = Selector(response)
    url = response.url
    category = response.meta['category']
    image_url = pd.select('//a[@id="zoom1"]/@href').extract()
    product_identifier = response.xpath(
        '//span[@id="product-code"]/span/strong/text()').extract()
    if not product_identifier:
        product_identifier = response.xpath('//span[@id="product-code"]/text()').extract()
    if not product_identifier:
        log.msg(url + " no Code/ID")
        # bail out instead of indexing into an empty list below
        return
    product_identifier = product_identifier[0].strip()
    product_name = pd.select('//h1[@itemprop="name"]/text()').extract()[0].strip()
    brands = response.css('ul.breadcrumbsList li').xpath(
        './/a[contains(@href, "/producer/")]/text()').extract()
    if not brands:
        brands = pd.select(
            '//div[@class="modelContainer"]//li[@class="first"]/a/text()').extract()
    brand = ''
    if brands:
        brand = brands[0].strip()
    else:
        log.msg(url + " no BRND")
    product_loader = ProductLoader(item=Product(), selector=pd)
    product_loader.add_value('identifier', product_identifier)
    product_loader.add_value('name', product_name)
    product_loader.add_value('sku', product_identifier)
    if image_url:
        product_loader.add_value('image_url', image_url[0])
    price = response.xpath('//script/text()').re('product_price":(.+?),')
    if not price:
        price = response.xpath('//span[@id="price_per_m"]/text()').extract()
    # fall back to the string '0' so .strip() below can't fail
    price = price[0] if price else '0'
    product_loader.add_value('price', price.strip().replace(" ", ""))
    product_loader.add_value('url', url)
    product_loader.add_value('brand', brand)
    product_loader.add_value('category', category)
    product = product_loader.load_item()
    yield product

def parse(self, response):
    hxs = Selector(response)
    lists = hxs.select('//a/@href').extract()
    for a in lists:
        if u'ebook' in a:
            yield Request(a, callback=self.parseEbook)
        elif u'subject' in a:
            yield Request(a, callback=self.parseSubject)

def parse_item(self, response):
    sel = Selector(response)
    # Spider.log is a method, not a module: self.log(...), not self.log.msg(...)
    self.log("url: %s" % response.url)
    i = ContentItem()
    i['title'] = sel.select("//div[@id='title']/text()").extract()[0]
    i['keywords'] = ''
    i['desc'] = ''
    i['content'] = ''
    return i

def parse_next_site(self, response):
    item = response.request.meta['item']
    item['summary_url'] = response.url
    #print('\n Crawling %s\n' % response.url)
    hxs1 = Selector(response)
    item['detailed_summary'] = hxs1.select("//span[@class='summary']").extract()
    item['crawl_timestamp'] = time.strftime('%Y-%m-%d %H:%M:%S')
    print("--- %s seconds ---" % (time.time() - self.start_time))
    return item

def parse(self, response):
    hxs = Selector(response)
    base_url = get_base_url(response)
    self.log("Base URL: " + base_url)
    #for department_line in hxs.xpath('//div[@class = "CourseViewer"]/table/tr/td/table/tr/td/table/tr[@id][1]'):
    for department_line in hxs.select(
            '//div[@class = "CourseViewer"]/table/tr/td/table/tr/td/table/tr[@id]'):
        department = self.parse_department(department_line)
        department_url = self.extract_department_link(base_url, department_line)
        yield Request(department_url, callback=self.parse_department_page,
                      meta={'department': department})

def parse_txt(self, response):
    self.chapt += 1
    print '***************************************'
    print 'chapt = ' + str(self.chapt)
    # with open('fir.html', 'wb') as f:
    #     f.write(response.body)
    hxp = Selector(response)
    content = hxp.select("//*[@id='content']/text()").extract()
    # //*[@id="wrapper"]/div[4]/div/div[2]/h1
    titles = hxp.select(
        '//*[@id="wrapper"]/div[@class="content_read"]//div[@class="bookname"]/h1/text()'
    ).extract()
    name = self.chapt / 100
    print ' name = ' + str(name)
    filename = '第' + str(name + 1) + '部分' + '.txt'
    with open(filename, 'a') as f:
        # decorative separator: 1 to 10 stars before the title, 10 to 1 after
        for stars in range(1, 11):
            f.write('*' * stars + '\n')
        f.write(self.test[self.chapt - 1])
        f.write('\n')
        for stars in range(10, 0, -1):
            f.write('*' * stars + '\n')
        f.write('\n\n\n\n')
        for ss in content:
            f.write(ss)

def parse(self, response):
    with open('daomu.html', 'wb') as f:
        f.write(response.body)
    hxp = Selector(response)
    # /html/body/section/article/a[1]
    first_urls = hxp.select("//article[@class='article-content']/p/a/@href").extract()
    for url in first_urls:
        print url
        yield Request(url, callback=self.parse_item)

def parse_page3(self, response):
    # Save the Kurdish translation
    page_kr = response.url.split("/")[-1]
    page_id = response.meta['id']
    hkr = Selector(response)
    textkr = hkr.select('//div[@class="article"]//text()').extract()
    filename_kr = 'crawledsites/%s-kr.html' % page_id
    with open(filename_kr, 'w') as f:
        f.writelines(textkr)
    self.log('Saved file %s' % filename_kr)

def parse(self, response):
    hxs = Selector(response)
    titles = hxs.select("//span[@class='pl']")
    items = []
    for title in titles:
        item = CraigslistSampleItem()
        item["title"] = title.select("a/text()").extract()
        item["link"] = title.select("a/@href").extract()
        items.append(item)
    return items

def parse_item(self, response):
    sel = Selector(response)
    item = DoubanmoiveItem()
    item['name'] = sel.xpath('//*[@id="content"]/h1/span[1]/text()').extract()
    item['year'] = sel.xpath('//*[@id="content"]/h1/span[2]/text()').re(r'\((\d+)\)')
    item['score'] = sel.xpath('//*[@id="interest_sectl"]/div/p[1]/strong/text()').extract()
    item['director'] = sel.xpath('//*[@id="info"]/span[1]/span[2]/a/text()').extract()
    item['classification'] = sel.xpath('//span[@property="v:genre"]/text()').extract()
    item['actor'] = sel.xpath('//a[@rel="v:starring"]/text()').extract()
    item['img'] = sel.xpath('//*[@id="mainpic"]/a/img/@src').extract()
    return item

def parse(self, response):
    base_url = response.url
    x = Selector(response)
    raw_urls = x.select("//a/@href").extract()
    urls = []
    for url in raw_urls:
        # make relative links absolute
        if 'http' not in url:
            url = base_url + url
        urls.append(url)
    print urls
    print '2222222222222222222222222222222222222222222222222222'

def parse(self, response):
    hxs = Selector(response)
    # the original called .select() on the extracted href strings, which cannot
    # work; the article body lives on the linked page, so follow each link and
    # collect the paragraph text in a callback
    sites = hxs.select('//ul[@class="list_news2 list_allnews"]/li/a/@href').extract()
    for site in sites:
        yield Request(site, callback=self.parse_article)

def parse_article(self, response):
    item = TutorialItem()
    item['aTxt'] = Selector(response).select(
        '//div[@class="article_view"]//p/text()').extract()
    return item

def parse(self, response):
    hxs = Selector(response)
    courses = hxs.select('//div[@class="col-md-3 col-sm-6 course"]')
    for course in courses:
        item = CourseItem()
        item['name'] = course.xpath('.//div[@class="course-name"]/text()').extract()[0].strip()
        item['learned'] = course.xpath(
            './/span[@class="course-per-num pull-left"]/text()').extract()[1].strip()
        item['image'] = course.xpath('.//div[@class="course-img"]/img/@src').extract()[0].strip()
        yield item

def parse_item(self, response):
    sel = Selector(response)
    movie_name = sel.select("//div[@id='content']/h1/span[1]/text()").extract()
    movie_director = sel.select("//*[@id='info']/span[1]/span[2]/a/text()").extract()
    movie_writer = sel.select("//*[@id='info']/span[2]/span[2]/a/text()").extract()
    movie_score = sel.xpath("//*[@id='interest_sectl']/div/div[2]/strong/text()").extract()
    movie_classification = sel.xpath("//span[@property='v:genre']/text()").extract()
    movie_description_paths = sel.select("//*[@id='link-report']")
    movie_description = []
    for movie_description_path in movie_description_paths:
        movie_description = movie_description_path.select(
            ".//*[@property='v:summary']/text()").extract()
    movie_roles_paths = sel.select("//*[@id='info']/span[3]/span[2]")
    movie_roles = []
    for movie_roles_path in movie_roles_paths:
        movie_roles = movie_roles_path.select(".//*[@rel='v:starring']/text()").extract()
    movie_detail = sel.select("//*[@id='info']").extract()

    # escape quotes and normalise separators so the value is safe to store
    def clean(s):
        return s.strip().replace(',', ';').replace('\'', '\\\'').replace('\"', '\\\"').replace(':', ';')

    item = WorkItem()
    item['movie_name'] = clean(''.join(movie_name))
    # each guard now checks its own list (the original reused movie_director)
    item['movie_director'] = clean(movie_director[0]) if movie_director else ''
    item['movie_score'] = clean(movie_score[0]) if movie_score else ''
    item['movie_classification'] = clean(movie_classification[0]) if movie_classification else ''
    item['movie_description'] = clean(movie_description[0]) if movie_description else ''
    item['movie_writer'] = clean(';'.join(movie_writer))
    item['movie_roles'] = clean(';'.join(movie_roles))
    movie_detail_str = ''.join(movie_detail).strip()
    movie_language_str = ".*语言:</span>(.+?)<span.*".decode("utf8")
    movie_date_str = ".*上映日期:</span> <span property=\"v:initialReleaseDate\" content=\"(\S+?)\">(\S+?)</span>.*".decode("utf8")
    movie_long_str = ".*片长:</span> <span property=\"v:runtime\" content=\"(\d+).*".decode("utf8")
    pattern_language = re.compile(movie_language_str, re.S)
    pattern_date = re.compile(movie_date_str, re.S)
    pattern_long = re.compile(movie_long_str, re.S)
    movie_language = re.search(pattern_language, movie_detail_str)
    movie_date = re.search(pattern_date, movie_detail_str)
    movie_long = re.search(pattern_long, movie_detail_str)
    item['movie_language'] = clean(movie_language.group(1).replace('<br>', '')) if movie_language else ""
    item['movie_date'] = clean(movie_date.group(1)) if movie_date else ""
    item['movie_long'] = movie_long.group(1) if movie_long else ""
    yield item

def parse(self, response):
    selector = Selector(response)
    for deal in selector.select(self.deals_list_xpath):
        loader = XPathItemLoader(LivingSocialDeal(), selector=deal)
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()
        for field, xpath in self.item_fields.iteritems():
            loader.add_xpath(field, xpath)
        yield loader.load_item()

def parse(self, response):
    hxs = Selector(response)
    all_links = hxs.select("//div[@class='ai1ec-day']/a/@href")
    print "scraping %s events for this month" % len(all_links)
    for link_sel in all_links[int(self.item_from):int(self.item_to)]:
        url = "".join(link_sel.extract())
        if url:
            yield Request(url=url, callback=self.parse_event_page)

def parse(self, response):
    sel = Selector(response)
    sites = sel.select('//table[@width = 800 and @align = "center"]')
    items = []
    for site in sites:
        item = czceItem()
        # (an unused 'symbols' query with an absolute XPath was dropped here)
        item['Symbol'] = site.select('a/text()').extract()
        item['link'] = site.select('a/@href').extract()
        item['desc'] = site.select('text()').extract()
        items.append(item)
    return items

def parse(self, response):
    s = Selector(response)
    items = s.xpath('//ul[@id="houseList"]/li[@class="clearfix"]')
    for house in range(len(items)):
        title = s.xpath('.//div[@class="txt"]/h3/a/text()').extract()[house]
        url = s.xpath('.//div[@class="txt"]/h3/a/@href').extract()[house][2:]
        site = s.xpath('.//div[@class="txt"]/h4/a/text()').extract()[house]
        size = s.xpath('.//div[@class="txt"]/div[@class="detail"]/p[1]/span[1]/text()').extract()[house]
        floor = s.xpath('.//div[@class="txt"]/div[@class="detail"]/p[1]/span[2]/text()').extract()[house]
        style = s.xpath('.//div[@class="txt"]/div[@class="detail"]/p[1]/span[3]/text()').extract()[house]
        share = s.xpath('.//div[@class="txt"]/div[@class="detail"]/p[1]/span[4]/text()').extract()[house]
        # drop the whitespace-only entries before indexing by house
        price_list = [p for p in s.xpath(
            './/div[@class="priceDetail"]/p[@class="price"]/text()').extract() if p.strip()]
        price = price_list[house]
        try:
            item = fox_items.FoxItem()
            item['title'] = title
            item['site'] = site
            item['price'] = price.split()[1]
            item['url'] = url
            item['size'] = size
            item['floor'] = floor
            item['style'] = style
            item['share'] = share
            # print(item)
            print('******************', title)
            yield item
        except Exception as e:
            print(e)
    # collect every url, keep crawling, and look for the listing pages among them
    all_urls = s.xpath('//a/@href').extract()
    # print(all_urls)
    for i in range(2, 51):
        url = 'http://www.ziroom.com/z/nl/z3.html?p=' + str(i)
        yield Request(url, callback=self.parse)

def movie_page(self, response):
    hxs = Selector(response)
    item = ThreedmmcomItem()
    item['thread_url'] = response.url
    item['thread_id'] = url_query_parameter(response.url, 't')
    item['name'] = firstOrNone(hxs.select('//div[@class="bigusername"]/text()'))
    files = item['files'] = hxs.select('//a[@data-location]/@data-location').extract()
    if not files:
        return
    meta = item['meta'] = {}
    metadivs = response.xpath(
        '//table[starts-with(@id,"post")]//td[@class="alt1" and @width="125"]/div')
    for i, entry in enumerate(metadivs):
        name = firstOrNone(entry.css('div.smallfont').xpath('text()'))
        if name and i < len(metadivs) - 2:
            meta[name] = firstOrNone(metadivs[i + 1].xpath(
                './descendant-or-self::*[name() != "script" and name() != "style"]/text()[normalize-space()]'))
    version = response.xpath(
        '//table[starts-with(@id,"post")]//td[@class="alt1" and @width="125"]/table/tr[1]//strong/text()').extract()
    if version:
        item['meta']['version'] = version[0].strip()
    return item

def parse(self, response):
    selec = Selector(response)
    uri = selec.select('//a[contains(text(),"Search Openings")]/@href').extract()[0]
    """div_title_all = selec.xpath('//div[@id="titletext1"]').extract()
    #body = selec.xpath('//title')
    div_title = div_title_all[0]
    sid = re.search(r"\"search.*?SID=.*?\"",str(div_title),re.I|re.S).group()
    sid = re.search(r"SID=.*?\"",str(sid),re.I|re.S).group()
    sid = re.sub(r"SID=","",re.sub(r"\"","",str(sid),re.I))"""
    url_search = 'https://sjobs.brassring.com/TGWebHost/' + uri
    #print "URI1", url_search
    yield Request("https://sjobs.brassring.com/TGWebHost/%s" % uri,
                  callback=self.parse_page_1)

def parse(self, response):
    global itemcount
    hxs = Selector(response)
    sites = hxs.select('//ul/li')
    items = []
    for site in sites:
        item = FjsenItem()
        item['title'] = site.select('a/text()').extract()
        item['link'] = site.select('a/@href').extract()
        item['addtime'] = site.select('span/text()').extract()
        if item['addtime']:
            itemcount = itemcount + 1
            print itemcount
            items.append(item)
    return items

def parse_bids(self, response):
    selector = Selector(response)
    for bid in selector.select(self.bid_list_xpath):
        loader = ItemLoader(BidItems(), selector=bid)
        loader.default_input_processor = MapCompose(lambda v: v.split(), replace_escape_chars)
        loader.default_output_processor = Join()
        for field, xpath in auction_bid_fields.iteritems():
            loader.add_xpath(field, xpath)
        yield loader.load_item()

def parse(self, response):
    hxs = Selector(response)  # The XPath selector
    sites = hxs.select('//div[contains(@class, "block_1")]')
    items = []
    for site in sites:
        item = AuctionsItem()
        item['address'] = site.select(
            'div[contains(@class, "con")]/div[contains(@class, "name")]/a/h4/text()').extract()[0].strip()
        item['price'] = site.select(
            'div[contains(@class, "image")]/div[contains(@class, "price")]/text()').extract()[0].strip()
        item['saleDate'] = site.select(
            'div[contains(@class, "con")]/div[contains(@class, "date")]/span/text()').extract()
        item['squareFeet'] = site.select(
            'div[contains(@class, "last_line")]/div[contains(@class, "blk_1")]/span/text()').extract()
        item['bedrooms'] = site.select(
            'div[contains(@class, "last_line")]/div[contains(@class, "blk_2")]/span/text()').extract()
        item['bathrooms'] = site.select(
            'div[contains(@class, "last_line")]/div[contains(@class, "blk_3")]/span/text()').extract()
        item['url'] = site.select('div[contains(@class, "image")]/a/@href').extract()
        item['image'] = site.select('div[contains(@class, "image")]/a/img/@src').extract()
        items.append(item)
    return items

def process_item(self, item, spider):
    if utility.isInsideJobList(item['url']) == True:
        hxs = Selector(text=item['raw_html'])
        # hxs = HtmlXPathSelector(item['raw_html']+'')
        # anchors = hxs.select('//a/@href').extract()
        # urls = hxs.xpath('//li[@class="job"]//a/@href').extract()
        url = item['url']
        titles = hxs.xpath('//li[@class="job"]//h3').extract()
        companys = hxs.xpath('//li[@class="job"]//span[@class="company"]/text()').extract()[0]
        total = hxs.select('//div[@id="navigation"]/h2/span/text()').extract()[0]
        # search the whole string, not just its first character (total[0])
        r = re.search(r'[0-9]+', total)
        total = r.group(0)
    return item

def parse_list(self, response):
    hxs = Selector(response)
    titles = hxs.select(
        "//div[contains(@class,'product-unit unit-4 browse-product new-design')]")
    items = []
    count1 = 0
    for title in titles:
        count1 = count1 + 1
        item = TutorialItem()
        item['model'] = title.select(".//div[contains(@class,'pu-title')]/a/text()").extract()
        if not item['model']:
            item['model'] = "n/a"
        else:
            item['model'] = str(item['model'][0]).encode('utf-8').strip()
        item['offer'] = title.select(".//div[contains(@class,'pu-final')]/span/text()").extract()
        if not item['offer']:
            item['offer'] = 0.00
        else:
            item['offer'] = float(item['offer'][0].replace("Rs.", "").replace(",", "").strip())
        # extract the full list first, then check it; the original indexed with
        # [0] before the emptiness check, so the check could never trigger
        item['image'] = title.select(
            ".//div[contains(@class,'pu-visual-section')]/a/img/@data-src").extract()
        if not item['image']:
            item['image'] = "n/a"
        else:
            item['image'] = item['image'][0]
        item['standard_url'] = title.select(".//div[contains(@class,'pu-title')]/a/@href").extract()
        if not item['standard_url']:
            item['standard_url'] = "n/a"
        else:
            item['standard_url'] = "http://www.flipkart.com" + item['standard_url'][0]
        request = Request(item['standard_url'], callback=self.new_features)
        request.meta['item'] = item
        items.append(item)
        yield request

def parse(self, response): """ Default callback used by Scrapy to process downloaded responses Testing contracts: @url http://odds.500.com/index_jczq_2014-08-29.shtml """ selector = Selector(response) # iterate over matchs for match in selector.select(self.match_list_xpath): loader = ItemLoader(Match(), selector=match) # define processors loader.default_input_processor = MapCompose(unicode.strip) loader.default_output_processor = Join() # iterate over fields and add xpaths to the loader for field, xpath in self.match_fields.iteritems(): loader.add_xpath(field, xpath) match_item = loader.load_item() match_item["game_date"] = self.game_date match_item["season_id"] = match_item["season_id"].split('-')[-1] match_item["teama_id"] = match_item["teama_id"].split('-')[-1] match_item["teamb_id"] = match_item["teamb_id"].split('-')[-1] if "score" in match_item: sa, sb = match_item["score"].split(':') match_item["score_a"] = sa match_item["score_b"] = sb match_item["result"] = "win" if sa > sb else "draw" if sa == sb else "lost" else: match_item["score_a"] = match_item["score_b"] = -1 match_item["result"] = "none" yield match_item #scrap asia odds #id=454359&ctype=1&start=60&r=1&style=0&guojia=0 for i in xrange(3): url = self.asia_odds_url % (match_item["match_id"], i * 30) request = scrapy.Request(url, callback=self.parse_asia_odds) request.meta['match_item'] = match_item yield request
def parse_patent(self, response):
    sel = Selector(response)
    i = PatentscrapperItem()
    i['bookmark'] = sel.select('normalize-space(//div[@id="pagebody"]/h1/text())').extract()
    i['inventors'] = sel.select('normalize-space(//table[@class="tableType3"]/tbody/tr[3]/td/span/text())').extract()
    i['applicants'] = sel.select('normalize-space(//table[@class="tableType3"]/tbody/tr[4]/td/span/text())').extract()
    i['applicationNumber'] = sel.select('normalize-space(//table[@class="tableType3"]/tbody/tr[6]/td/text())').extract()
    i['priorityNumbers'] = sel.select('normalize-space(//table[@class="tableType3"]/tbody/tr[7]/td/span/a/text())').extract()
    i['core'] = sel.select('normalize-space(//div[@class="application article clearfix"]/p[@class="printAbstract"]/text())').extract()
    #i['published'] = 0  TODO
    #i['classification'] = 0  TODO
    #self.items.append(i)
    yield i

def parse_list(self, response):
    hxs = Selector(response)
    titles = hxs.select(
        "//div[contains(@class,'product-unit unit-4 browse-product new-design')]")
    items = []
    count1 = 0
    for title in titles:
        count1 = count1 + 1
        item = TutorialItem()
        item['title'] = title.select(".//div[contains(@class,'pu-title')]/a/text()").extract()
        if not item['title']:
            item['title'] = "n/a"
        else:
            item['title'] = str(item['title'][0]).encode('utf-8').strip()
        if item['title'] == "n/a":
            flag = 1
        item['price_from_fk'] = title.select(
            ".//div[contains(@class,'pu-final')]/span/text()").extract()
        if not item['price_from_fk']:
            flag = 1
        else:
            item['price_from_fk'] = float(
                item['price_from_fk'][0].replace("Rs.", "").replace(",", "").strip())
        # extract the full list first, then check it, so an empty result
        # can't raise an IndexError before the "n/a" fallback applies
        item['link_from_fk'] = title.select(".//div[contains(@class,'pu-title')]/a/@href").extract()
        if not item['link_from_fk']:
            item['link_from_fk'] = "n/a"
        else:
            item['link_from_fk'] = "http://www.flipkart.com" + item['link_from_fk'][0]
        request = Request(item['link_from_fk'], callback=self.new_features)
        request.meta['item'] = item
        items.append(item)
        yield request

def parse(self, response):
    hxs = Selector(response)
    links = hxs.xpath("//a/@href").extract()
    # We stored already crawled links in this list
    crawledLinks = []
    # Pattern to check proper link
    linkPattern = re.compile("http:\/\/www\.rs66\.com\/renshengganwu\/(list_\d+\.html|\d+.html)")
    for link in links:
        # If it is a proper link and is not checked yet, yield it to the Spider
        if linkPattern.match(link) and not link in crawledLinks:
            crawledLinks.append(link)
            yield Request(link, self.parse)
    content = " ".join(hxs.select('//div[@class="content"]/text()').extract())
    content = ''.join(content.split())
    item = SoupItem()
    item["content"] = content
    yield item

def parse_asia_odds(self, response):
    match_item = response.meta['match_item']
    selector = Selector(response)
    # iterate over odds
    for odds in selector.select(self.asia_odds__xpath):
        loader = ItemLoader(AsiaOdds(), selector=odds)
        # define processors
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()
        # iterate over fields and add xpaths to the loader
        for field, xpath in self.asia_odds_fields.iteritems():
            loader.add_xpath(field, xpath)
        odds_item = loader.load_item()
        # http://odds.500.com/yazhi.php?cid=515
        odds_item["match_id"] = match_item["match_id"]
        odds_item["company_id"] = odds_item["company_id"].split('=')[-1]
        odds_item["water_a"] = odds_item["water_a"].replace(self.UP_CHAR, '').replace(self.DOWN_CHAR, '')
        odds_item["water_b"] = odds_item["water_b"].replace(self.UP_CHAR, '').replace(self.DOWN_CHAR, '')
        yield odds_item