def parse_page2(self, response):
    # Data is stored through the class-level CSV writer.
    sel = HtmlXPathSelector(response)
    article = ''.join(sel.xpath('//p/text()').extract())
    subheadline = ''.join(sel.xpath('//h2[@class="subheadline"]/text()').extract())
    # Keep the <abbr> timestamps as a list; joining them first would make
    # timestamps[1] index a single character instead of the second timestamp.
    timestamps = sel.xpath('//abbr/text()').extract()
    millis = int(round(time.time() * 1000))  # current time in milliseconds
    ntime = 0.0
    if len(timestamps) > 1:
        parts = timestamps[1].split(" ")
        if "hour" in timestamps[1]:
            ntime += float(parts[0]) * 60
            if "minute" in timestamps[1]:
                ntime += float(parts[2])
        elif "minute" in timestamps[1]:
            ntime += float(parts[0])
    # article_time = datetime.datetime.fromtimestamp((millis - ntime * 60 * 1000) / 1000).strftime('%m-%d-%Y %H:%M:%S.%f')
    articletime = " "
    # Grab the information passed along from the parse function.
    title = response.meta['Title']
    linktime = response.meta['LinkTime']
    source = response.meta['Source']
    link = response.meta['Link']
    # Store everything in a CSV file.
    Consumer.writer.writerow([title.encode("utf-8"), subheadline.encode("utf-8"),
                              source.encode("utf-8"), linktime.encode("utf-8"),
                              articletime.encode("utf-8"), article.encode("utf-8"),
                              link.encode("utf-8")])
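# The hour/minute arithmetic above is easy to get wrong when the split
# offsets drift. A minimal standalone sketch of the same idea, using a regex
# instead of fixed token positions; the sample string is made up.
import re
import datetime

def minutes_ago(text):
    """Parse a relative timestamp like '3 hours 20 minutes ago' into minutes."""
    total = 0.0
    hours = re.search(r'(\d+)\s*hour', text)
    minutes = re.search(r'(\d+)\s*minute', text)
    if hours:
        total += float(hours.group(1)) * 60
    if minutes:
        total += float(minutes.group(1))
    return total

# Convert the relative offset into an absolute timestamp.
posted = datetime.datetime.now() - datetime.timedelta(minutes=minutes_ago("3 hours 20 minutes ago"))
print(posted.strftime('%m-%d-%Y %H:%M:%S'))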
def parse_url(self, response):
    sel = HtmlXPathSelector(response)
    for i in range(20):
        # The result blocks come in several layouts; try each xpath in turn
        # instead of nesting bare try/except blocks.
        candidate_xpaths = [
            '//*[@id="result"]/div[%d]/div/h2/a/@href' % (i + 4),
            '//*[@id="result"]/div[%d]/h2/a/@href' % (i + 4),
            '//*[@id="result"]/div[%d]/h2/a' % (i + 4),
        ]
        news_url = ''
        for url_xpath in candidate_xpaths:
            try:
                news_url = sel.xpath(url_xpath).extract()[0]
                break
            except IndexError:
                continue
        print i, news_url
        # Crawl the article body and its comments.
        if news_url != '':
            self.cursor.execute("insert into crawl_url values(null,%s,%s)",
                                (news_url, self.topic_id))
            yield scrapy.http.Request(news_url, callback=self.parse_news)
    self.conn.commit()
def parseDetail(self,response): item = response.meta['item'] article = HtmlXPathSelector(response) item['title'] = article.xpath('//h1/text()').extract()[0] item['content'] = article.xpath('//div[@class="entry-content"]/text()').extract()[0] item['createtime'] = article.xpath('//time[@class="entry-date"]/@datetime').extract()[0] return item
def parse_images(self, response):
    """
    Download images.
    :param response:
    :return:
    """
    hxs = HtmlXPathSelector(response=response)
    rows = hxs.xpath("//div[@id='content-list']/div[@class='item']")
    for row in rows:
        img = row.xpath(".//div[@class='part2']/@share-pic").extract_first()
        img_name = img.rsplit('_')[-1]
        file_path = 'images/{0}'.format(img_name)
        # Use the large-file download mode.
        item = ScrapyRedisSpidersItem(url=img, type='file', file_name=file_path)
        print(img)
        yield item
    pages = hxs.xpath("//div[@id='page-area']//a[@class='ct_pagepa']/@href").extract()
    print(pages)
    for page_url in pages:
        # Fetch the URLs of every page number.
        page_url = "http://dig.chouti.com" + page_url
        print(page_url)
        yield Request(url=page_url, callback=self.parse_images)
def get_page_parse(self, response):
    hxs = HtmlXPathSelector(response)
    item = WdspiderItem()
    _ans = hxs.xpath('//div[@class="answer-con"]/text()').extract()
    if not _ans or len(_ans[0]) < 30:
        print u"Answer is too short, skipping....."
        return None
    title = hxs.xpath('//h3[@id="questionTitle"]/text()').extract()
    item['question'] = title[0]
    # The original assignments ended with stray commas, which silently turned
    # these fields into one-element tuples.
    item['question_detail'] = ''
    item['answers'] = [{'agree_count': random.randint(5, 25),
                        'publish_time': time.time(),
                        "comments": {}}]
    item['answers_text'] = _ans[0]
    item['signcc'] = 123123
    item['callback'] = response.url
    return item
def parse_pro(self, response):
    theItem = proItem()
    sel = HtmlXPathSelector(response)
    theItem['name'] = ''
    theItem['website'] = ''
    theItem['email'] = ''
    theItem['title'] = ''
    theItem['phone'] = ''
    theItem['office'] = ''
    theItem['picture'] = ''
    content = sel.xpath('//div[@class="node node-page view-mode-full clearfix"]/span/@content')
    name = content.extract()
    if len(name) > 0:
        theItem['name'] = name[0]
    content = sel.xpath('//div[@class="block block-block first last odd"]/div[@class="content"]')
    picture = content.xpath('./img[@width="160"]/@src').extract()
    if len(picture) > 0:
        url = picture[0]
        if url[0] == '/':
            url = 'https://www.cs.washington.edu' + url
        theItem['picture'] = url
    for p in content.xpath('.//p'):
        msglst = p.xpath('./text()').extract()
        if len(msglst) == 0:
            continue
        msg = msglst[0]
        if msg[0:3] == 'Off':
            theItem['office'] = msg[8:]
        if msg[0:3] == 'Ema':
            theItem['email'] = msg[7:]
        # Compare against the three-character prefix; the original compared
        # msg[0:3] to the five-character string 'Phone', which never matched.
        if msg[0:3] == 'Pho':
            theItem['phone'] = msg[7:]
    yield theItem
def parse_items(self, response): hxs = HtmlXPathSelector(response) item = NewsItem() item["link"] = response.request.url item["lang"] = "en" item["source"] = "mirror" category = hxs.xpath( "//div[@class='col-md-12']/div[@class='breadcrumb-body clr']/span//text()" ).extract() date_time = hxs.xpath("//span[@class='modify-date']/text()").extract() item["author"] = "" title = hxs.xpath( "//h1[@class='news-detail-title selectionShareable']/text()" ).extract() intro = hxs.xpath( "//div[@class='news-detail-spot news-detail-spot-margin']/h2/text()" ).extract() new_content = hxs.xpath("//div[@class='news-box']/p/text()").extract() # # Processing outputs item["intro"] = ' '.join(intro) item["title"] = ' '.join(title) new_content = ' '.join(new_content) new_content = re.sub('\n', ' ', new_content) item["content"] = re.sub('\s{2,}', ' ', new_content) category = category[1:-1] category = [c for c in category if not c == ">"] item["category"] = '|'.join(category) item["date_time"] = " ".join(date_time) return (item)
def displayLocations(self, response): hxs = HtmlXPathSelector(response) region = hxs.xpath( '//select[@id="display-refine-region"]/option/@value').extract( )[1::] regionName = hxs.xpath( '//select[@id="display-refine-region"]/option/text()').extract( )[1::] url = 'https://www.hamlan.com.au/wp-admin/admin-ajax.php' for n, i in enumerate(region): formdata = { 'action': 'getDisplayLocationResults', 'selectedRegion': '{}'.format(i) } headers = { 'Accept': '*/*', 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36', 'X-Requested-With': 'XMLHttpRequest' } # yield Request(url, method='POST', callback=self.parseItem, dont_filter=True,headers=headers) requests = FormRequest(url=url, formdata=formdata, callback=self.getLinks, dont_filter=True, headers=headers, meta={'data': regionName[n]}) yield requests
def parse_review_page(self, response):
    items = response.meta.get('items', '')
    url = response.meta.get('url', '')
    hxs = HtmlXPathSelector(text=self._extract_html(response))
    reviews = hxs.xpath('//div[@class="BVRRReviewDisplayStyle5"]')
    for review in reviews:
        l = ReviewLoader(item=Review(), response=response, date_format='%d/%m/%Y')
        rating = review.xpath(".//span[contains(@class,'BVRRRatingNumber')]/text()").extract()[0]
        date = review.xpath(".//span[contains(@class,'BVRRValue BVRRReviewDate')]/text()").extract()[0]
        title = review.xpath(".//span[contains(@class,'BVRRReviewTitle')]/text()").extract()
        review_text = ' '.join(review.xpath(".//span[contains(@class,'BVRRReviewText')]//text()").extract())
        if title:
            full_text = title[0].strip() + '\n' + review_text.strip()
        else:
            full_text = review_text.strip()
        l.add_value('rating', rating)
        l.add_value('url', url)
        l.add_value('date', datetime.strptime(date, '%d %B %Y').strftime('%d/%m/%Y'))
        l.add_value('full_text', full_text)
        for item in items:
            item['metadata']['reviews'].append(l.load_item())
    # next_page instead of next, which shadows the builtin.
    next_page = hxs.xpath('//span[@class="BVRRPageLink BVRRNextPage"]/a/@data-bvjsref').extract()
    if next_page:
        yield Request(next_page[0], callback=self.parse_review_page, meta={'items': items, 'url': url})
    else:
        for item in items:
            yield item
def parse_dir_contents(self, response): str1 = response.url.split("/")[3] filename = 'output11/' + str1 + '.html' with open(filename, 'wb') as f: f.write(response.body) hxs = HtmlXPathSelector(response) #extract the cost for new format HDcost1 = hxs.xpath('//*[@class="dv-button-inner"]/text()').extract() len1 = len(HDcost1) del HDcost1[0] for i in range(0, len1 - 1): var1 = HDcost1[i] var1 = var1.encode('utf-8') HDcost1[i] = var1 #extract the title for new format title1 = hxs.xpath('//*[@id="aiv-content-title"]/text()').extract() len1 = len(title1) for i in range(0, len1): var1 = title1[i] var1 = var1.encode('utf-8') var1 = var1.strip() title1[i] = var1 title1 = filter(None, title1) #extract the release year for new format relyear = hxs.xpath('//*[@class="release-year"]/text()').extract() relyear1 = relyear[0].encode('utf-8') relyear1 = relyear1.strip() #extrcat the time for new format times = hxs.xpath( '//*[@id="dv-dp-left-content"]/div[2]/div[2]/dl/dd[2]/text()' ).extract() time1 = times[0].strip() time1 = time1.encode('utf-8') #extract the director for new format dir1 = response.xpath( '//*[@id="dv-center-features"]/div[1]/div/table/tr[2]/td/a/text()' ).extract() dir1 = dir1[0].encode('utf-8') dir1 = dir1.strip() #extract the starring actors actors = hxs.select( '//*[@id="dv-dp-left-content"]/div[2]/div[2]/dl/dd[1]/text()' ).extract() actors = actors[0].encode('utf-8') actors = actors.strip() yield DmozItem( title=title1, time=time1, cost=HDcost1, year=relyear1, director=dir1, star=actors, )
def parse_items(self, response): hxs = HtmlXPathSelector(response) item = NewsItem() item["link"] = response.request.url item["lang"] = "tr" item["source"] = "sabah" category = hxs.xpath( "//div[contains(@class,'haber-header')]/header/span[contains(@class,'category')]//text()" ).extract() date_time = hxs.xpath( "//div[contains(@class,'haber-header')]/div[contains(@class,'info')]/time/text()" ).extract() item["author"] = "" title = hxs.xpath( "//div[contains(@class,'haber-header')]/header/h1/text()").extract( ) intro = hxs.xpath( "//div[contains(@class,'haber-header')]/header/h2/text()").extract( ) new_content = hxs.xpath( "//div[contains(@class,'content')]/div/p/text()").extract() # # Processing outputs item["intro"] = ' '.join(intro) item["title"] = ' '.join(title) new_content = ' '.join(new_content) new_content = re.sub('\n', ' ', new_content) item["content"] = re.sub('\s{2,}', ' ', new_content) item["category"] = '|'.join(category) item["date_time"] = " ".join(date_time) return (item)
def parse(self, response):
    sel = HtmlXPathSelector(response)
    item = ProductItem()
    item['title'] = sel.xpath("//div[@class='name']/hgroup/h1/text()").extract()
    item['description'] = sel.xpath("//div[@class='full']//text()").extract()[0]
    item['details'] = sel.xpath("//div[@id='description']//ul/li//text()").extract()
    item['images'] = sel.xpath("//img[@id='vsImage']/@src").extract()
    item['imagesdata'] = sel.xpath("//ul[@class='pdp-info box split primary']//section[@class='swatches module']/div[@class='swap']//span[@data-alt-image]/@data-alt-image").extract()
    item['prices'] = map(unicode.strip, sel.xpath("//ul[@class='pdp-info box split primary']/li//div[@class='price']/p/text()").extract())[0]
    item['colors'] = sel.xpath("//ul[@class='pdp-info box split primary']//section[@class='swatches module']/div[@class='swap']//h4/text()").extract()
    item['sizes'] = sel.xpath("//ul[@class='pdp-info box split primary']//div[@class=' scroll']//a//span/text()").extract()[1:]
    item['id'] = sel.xpath("//section[@class='product']/@data-id").extract()[0]
    # scripts instead of str, which shadows the builtin; these are the
    # inline <script> bodies.
    scripts = sel.xpath("//script//text()").extract()
    for script in scripts:
        res = re.findall('\{\"assetId\".*?\"R\"\}', script)
        if res != []:
            item['data'] = res
    yield item
def parse(self, response):
    item = ProvincecrawlItem()
    hxs = HtmlXPathSelector(response)
    item['country'] = hxs.xpath('//tr[@class="o" or @class="e"]/following::td[1]/text()').re('\w.*')
    # .re() returns a list; concatenating it onto a string raises TypeError,
    # so build the full URL for each match instead.
    item['url'] = ['http://www.statoids.com/' + u for u in
                   hxs.xpath('//tr[@class="o" or @class="e"]/following::td[1]/following::a[1]/@href').re('u...html')]
    yield item
def parse_items(self, response):
    begin = time.time()
    logger.info("start to crawl url:{0}".format(response.url))
    hxs = HtmlXPathSelector(response)
    scripts = hxs.xpath("//script/@src")
    is_existed = False
    for script in scripts:
        script_src = script.extract()
        if "http" not in script_src:
            # Resolve relative script URLs against the page's domain.
            parsed_uri = urlparse(response.url)
            domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
            script_src = domain + script_src
        r = requests.get(script_src)
        if GEEKCA_DOMAIN in r.content:
            is_existed = True
            break
    if not is_existed:
        titles = hxs.xpath("//script[contains(.,'geekca.cubead.com')]/text()")
        items = []
        if not titles:
            item = UrlItem()
            item["url"] = response.url
            items.append(item)
        logger.info("finished crawling url:{0} and cost time:{1}".format(
            response.url, (time.time() - begin)))
        return items
def parse_story(self, response): hxs = HtmlXPathSelector(response) loader = ArticleLoader(MeduzaArticlesItem(), hxs) date = str('-'.join(response.url.split('/')[4:7])) title = hxs.xpath( '//div/h1[contains(@class, "RichTitle-root")]/text()').extract()[0] p = [] for par in hxs.xpath(''' //div[@class="GeneralMaterial-article"]/p//text() |//div[@class="GeneralMaterial-article"]/h3//text() ''').extract(): p.append(par) text = ' '.join(p) loader.add_value('url', str(response.url)) loader.add_xpath( 'title', '//div/h1[contains(@class, "RichTitle-root")]/text()') loader.add_value('date_published', date) loader.add_value('text', text) # TODO crawl likes loader.add_value('fb_likes', '') loader.add_value('vk_likes', '') loader.add_value('ok_likes', '') return loader.load_item()
def parse(self, response):
    # todo: strip html
    # todo: add other fields
    # todo: download articles
    hxs = HtmlXPathSelector(response)
    titles = hxs.xpath('//*[@id="dlpage"]/dl/dd/div/div[contains(@class,"list-title")]/text()').extract()
    papers = hxs.xpath('//*[@id="dlpage"]/dl/dt/span[contains(@class, "list-identifier")]/a[@title="Download PDF"]/@href').extract()
    print len(titles)
    print len(papers)
    title_list = []
    for t in titles:
        if t != '\n':
            t = t.replace("\n", "")
            print t
            title_list.append(t)
    print len(title_list), len(papers)
    list_map = {}
    for i, p in enumerate(papers):
        base = "https://arxiv.org"
        print title_list[i], base + p
        subprocess.call('wget -U "Mozilla" {}.pdf'.format(base + p), shell=True)
        list_map[p.replace("/pdf/", "")] = title_list[i]
    # pprint.pprint prints and returns None, so don't wrap it in print.
    pprint.pprint(list_map)
def other_question(self, response):
    try:
        hxs = HtmlXPathSelector(response)
        item = YahoourlsearcherItem()
        category = hxs.xpath('(//a[contains(@class,"Clr-b")])[2]').extract()
        h = html2text.HTML2Text()
        h.ignore_links = True
        category_text = h.handle(category[0])
        # Check whether the question thread is related to programming
        # ("程式編寫") and design ("設計"), e.g.:
        # if "程式編寫" in category_text and "設計" in category_text:
        if True:
            next_page = hxs.xpath(
                '//a[contains(@class,"Clr-b") and text()=" Next "]/@href').extract()
            composed_string = "https://hk.answers.yahoo.com" + next_page[0]
            item['url'] = str(response.url)
            item['date'] = str("not available")
            print("*** " + str(category_text).strip() + " - " + item['url'] + " ***")
            yield item
            yield scrapy.Request(composed_string, callback=self.other_question)
    except NoSuchElementException:
        pass
def parse(self, response): hxs = HtmlXPathSelector(response) if hxs.select("//center").extract(): return bili = BiliItem() bili['url'] = response.url bili['avNo'] = int(re.search(r'\d+', str(response.url)).group()) bili['title'] = hxs.xpath("//h1/text()").extract()[0] bili['time'] = hxs.xpath("//time/i/text()").extract()[0] bili['category'] = hxs.xpath('//a[@class="on"]/text()').extract()[0] bili['up'] = hxs.xpath('//a[@class="name"]/text()').extract()[0] if bili['title']: bili['comment'] = int( re.findall( re.compile(r"acount.{2}\d+"), urllib2.urlopen( "http://api.bilibili.com/x/reply?jsonp=jsonp&type=1&sort=0&oid=" + str(bili['avNo'])).read())[0][8:]) content = urllib2.urlopen( "http://interface.bilibili.com/count?key=5cb9d3f30568fd06bb388d13&aid=" + re.search(r'\d+', str(response.url)).group()).read() bili['click'] = int( re.findall(re.compile(r"ji.{9}\d+"), content)[0][11:]) bili['coin'] = int( re.findall(re.compile(r"es.{8}\d+"), content)[0][10:]) bili['sc'] = int( re.findall(re.compile(r"stow_count.{9}\d+"), content)[0][19:]) bili['dm'] = int( re.findall(re.compile(r"dm_count.{8}\d+"), content)[0][16:]) del content, hxs yield bili
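# The positional regexes above break whenever the API reorders or renames its
# fields. A sketch of the same lookup with the json module; it uses the reply
# endpoint from the function above, but the response field names ('data',
# 'page', 'acount') are assumptions inferred from the regex, not a documented
# schema.
import json
import re
import urllib2

def fetch_reply_count(av_no):
    raw = urllib2.urlopen(
        "http://api.bilibili.com/x/reply?type=1&sort=0&oid=%d" % av_no).read()
    # Strip a JSONP wrapper such as jsonp({...}) if one is present.
    match = re.search(r'\{.*\}', raw, re.S)
    data = json.loads(match.group(0))
    return data.get('data', {}).get('page', {}).get('acount')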
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    titles = hxs.xpath("string(//span[@id='productTitle'])").extract()
    price = hxs.xpath("string(//span[@id='priceblock_dealprice'])").extract()
    stock = hxs.xpath("//div[@id='availability']/span")
    description = hxs.xpath("//div[@id='feature-bullets']/ul/li//span")
    images = hxs.xpath("//img[contains(@class, 'a-dynamic-image') and contains(@class, 'a-stretch-vertical')][@src][1]")
    items = []
    item = EbayScraperItem()
    stock = stock.select("text()").extract()
    # titles is a list of extracted strings; str() would store the repr of
    # the whole list, so take the first entry instead.
    item["title"] = titles[0].strip() if titles else ""
    item["price"] = price
    item["stock"] = stock
    formated_desc = ""
    for desc in description:
        # Join the bullet texts instead of appending the repr of a list.
        formated_desc += " ".join(desc.select("text()").extract())
    item["description"] = formated_desc
    item["images"] = images.select("@src").extract()
    items.append(item)
    return items
def parse_item(self, response):
    global i, not_data
    i += 1  # count of scraped records
    print(i)
    item = BaikeItem()
    sel = HtmlXPathSelector(response)
    baike_url = str(response.url)
    baike_name = sel.xpath(
        '//div[@id="sec-content0"]/h1/span[@class="lemmaTitleH1"]/text()'
    ).extract()
    baike_desc_list = sel.xpath(
        '//div[@class="card-summary-content"]/div[@class="para"]/text()'
    ).extract()
    if not baike_name:
        not_data += 1  # count of records that failed to scrape
        print(not_data)
    # Check the list before indexing; the original indexed [0] first, which
    # raised IndexError on pages with no summary.
    if baike_desc_list:
        baike_desc = baike_desc_list[0]
    else:
        baike_desc = '未抓取到'  # "not captured"
    item['title'] = [n.encode('utf-8') for n in baike_name]
    item['link'] = baike_url.encode('utf-8')
    item['desc'] = baike_desc
    yield item
def parse(self, response):
    selector = HtmlXPathSelector(response)
    # Departure cities, grouped alphabetically.
    class4 = selector.xpath(
        '//*[@id="gnyallist-al"]/div/div[2]/div[1]/following-sibling::div')
    # Destination provinces.
    provinces = selector.xpath(
        '/html/body/div[2]/div[1]/div/div[2]/div[4]/div[2]/dl')
    # departure_cities = []
    # for one in class4:
    #     in_class = one.xpath("dl")
    #     for element in in_class:
    #         info = element.xpath("dd/a/text()").extract()
    #         departure_cities += info
    #
    # with codecs.open("tmp.txt", 'w', encoding='utf-8') as f:
    #     f.write(str(departure_cities))
    arrive_cities = []
    for province in provinces:
        info = province.xpath('dd/a/text()').extract()
        arrive_cities += info
    # The with-block closes the file; no explicit close() is needed.
    with codecs.open("tmp.txt", 'w+', encoding='utf-8') as f:
        f.write(str(arrive_cities))
def extract_details(self, response):
    hxs = HtmlXPathSelector(response)
    item = response.meta['item']
    items = []
    item["facts"] = hxs.xpath('//ul[@class="zsg-list_square zsg-lg-1-3 zsg-md-1-2 zsg-sm-1-1"]/li/text()').extract()
    zest = hxs.xpath('//div[@class="zest-value"]/text()').extract()
    item["zest_sale"] = zest[0]
    item["zest_rent"] = zest[1]
    school_info = hxs.xpath('//ul[@class="nearby-schools-list"]')
    school_name = school_info.xpath('//a[@class="za-track-event school-name notranslate"]/text()').extract()
    school_rating = school_info.xpath('//*[starts-with(@class, "gs-rating-number")]/text()').extract()
    school_grade = school_info.xpath('//div[@class="nearby-schools-grades"]/text()').extract()
    # Fill school_info1..school_info3 for however many schools were found
    # (up to three), instead of repeating the same block per count.
    school_len = len(school_name)
    if school_len == 0:
        logger.info("No school info")
    for n in range(min(school_len, 3)):
        item["school_info%d" % (n + 1)] = "%s/%s/%s" % (
            school_name[n], school_rating[n], school_grade[n])
    items.append(item)
    return items
def parse_items(self, response): hxs = HtmlXPathSelector(response) item = NewsItem() item["link"] = response.request.url item["lang"] = "en" item["source"] = "wired" category = hxs.xpath( "//li/span[@itemprop='articleSection']//text()").extract() date_time = hxs.xpath( "//ul/meta[@itemprop='datePublished']/@content").extract() author = hxs.xpath( "//ul/li/span[@itemprop='author']//text()").extract() title = hxs.xpath("//header/h1[@data-js='postTitle']/text()").extract() intro = "" new_content = hxs.xpath( "//article[@data-js='content']/p//text()").extract() # # Processing outputs item["intro"] = ' '.join(intro) item["title"] = ' '.join(title) new_content = ' '.join(new_content) new_content = re.sub('\n', ' ', new_content) item["content"] = re.sub('\s{2,}', ' ', new_content) category = list(set([c for c in category if re.search("\S", c)])) item["category"] = '|'.join(category) date_time = " ".join(date_time) item["author"] = " ".join(author).strip() item["date_time"] = date_time.split("+")[0] return (item)
def handle_blog(self, response): hxs = HtmlXPathSelector(response) item = BuzzCrawlerItem() item['url'] = response.url item['date'] = dateutil.parser.parse( hxs.xpath(".//li[@class='entryDate']/time/@datetime").extract()[0]) item['title'] = hxs.xpath( ".//h1[@id='headline']/text()").extract()[0].strip() item['blurb'] = "" unprocessed_content = hxs.xpath( ".//span[@itemprop='articleBody']").extract()[0] sane_html = remove_tags_with_content(unprocessed_content, ("noscript", "div", "h6")) h = html2text.HTML2Text() h.ignore_links = True h.ignore_images = True processed_content = h.handle(sane_html) if "noscript" in unprocessed_content: print sane_html.encode("iso-8859-15", "replace") print "*" * 98 item['content'] = markdown(processed_content) item['source'] = 'wired.com' yield item
def parse(self, response):
    driver = webdriver.Firefox()
    driver.get("http://www.moneycontrol.com/india/stockpricequote/bankspublicsector/statebankindia/SBI")
    time.sleep(10)
    content = driver.page_source
    i = 0
    converter = html2text.HTML2Text()
    converter.ignore_links = True
    doc = HtmlXPathSelector(response)
    desc = ''  # initialize so the final return works even if no ticks match
    for j in range(6):
        driver.refresh()
        for desc in doc.xpath("//div/span[@id='Bse_Prc_tick']").extract():
            i += 1
            print ("\n*******************************************************\n")
            print i
            print converter.handle(desc)
        for desc1 in doc.xpath("//div/span[@id='Nse_Prc_tick']").extract():
            i += 1
            print ("\n*******************************************************\n")
            print i
            print converter.handle(desc1)
    driver.quit()
    return desc
def parse(self, response): hxs = HtmlXPathSelector(response) currurl = hxs.xpath('//link[@rel="alternate"]/@href').extract() print currurl URL = currurl[0].strip().split("/")[2] for url in hxs.xpath('//a/@href').extract(): print "http://" + DOMAIN + URL + url
def village_parse(self, response):
    hxs = HtmlXPathSelector(response)
    item = VillageItem()
    item['name'] = hxs.xpath('//h1[@id="commtitle"]/a/text()').extract()[0]
    infos = hxs.xpath('//ul[@class="chamber-infolist"]/li')
    for index, link in enumerate(infos):
        if index == 0:
            item['address'] = link.xpath('text()').extract()[0].split(":")[1]
        elif index == 3:
            item['build_date'] = link.xpath('text()').extract()[0].split(":")[1]
        elif index == 4:
            item['developer'] = link.xpath('text()').extract()[0].split(":")[1]
        elif index == 6:
            item['property_company'] = link.xpath('text()').extract()[0].split(":")[1]
    item['village_id'] = response.url.split("/")[5]
    location = hxs.xpath('//a[@id="propview_map"]/img/@src').extract(
    )[0].partition('?')[2].split('&')[0].split('=')[1]
    item['longitude'] = location.split(',')[0]
    item['latitude'] = location.split(',')[1]
    return item
def parse(self, response): hxs = HtmlXPathSelector(response) sites = hxs.xpath('//div[contains(@class,"row")]') for site in sites: item = IndeedItem() company = site.xpath( ".//span[@class='company']//a/text()").extract_first() if not company: company = site.xpath( ".//span[@class='company']/text()").extract_first() item['company'] = company.strip() # title title = site.xpath( './/a[@data-tn-element="jobTitle"]/@title[1]').extract_first() item['title'] = title # indeed url link = site.xpath( ".//span[@class='company']//a/@href").extract_first() if link: item['link'] = 'https://www.indeed.com' + link yield item # what to crawl next next_to_crawl = hxs.xpath( '//span[@class="pn"]/parent::a/@href').extract() for i in next_to_crawl: url = response.urljoin(i) yield Request(url)
def parse(self, response): hxs = HtmlXPathSelector(response) Ac = Acitem() Ac['url'] = response.url Ac['title'] = hxs.xpath("//h1/text()").extract()[0] Ac['time'] = hxs.xpath( "/html/body/div/div[7]/div/div[1]/div[1]/div[1]/p/span[2]/text()" ).extract()[0] Ac['acNo'] = int(re.search(r'\d+', str(response.url)).group()) Ac['up'] = hxs.xpath( "/html/body/div/div[7]/div/div[1]/div[1]/div[1]/div[2]/div[1]/a[1]/text()" ).extract()[0] Ac['category'] = hxs.xpath( "/html/body/div/div[7]/div/div[1]/div[1]/p/a[2]/text()").extract( )[0] if Ac['up']: content = urllib2.urlopen( "http://www.acfun.tv/content_view.aspx?contentId=" + re.search(r'\d+', str(response.url)).group()).read() contentnumber = re.findall(re.compile(r"\d*"), content) Ac['click'] = contentnumber[1] Ac['dm'] = contentnumber[9] Ac['coin'] = contentnumber[13] Ac['sc'] = contentnumber[11] Ac['comment'] = contentnumber[3] yield Ac
def parse_item(self, response):
    print '************* URL:', response.url
    hxs = HtmlXPathSelector(response)
    # Reviews are not handled; they seem to be retrieved dynamically with javascript.
    item = VueCrawlerItem()
    item['product_item_num'] = str(response.url)
    tag = hxs.xpath('//meta[@name="keywords"]/@content').extract()
    productIDs = []
    productsuidlist = hxs.xpath('//ul[@id="product-list"]//li//a/@data-item').extract()
    for productid in productsuidlist:
        if productid.strip():
            productIDs.append(productid)
    res = hxs.xpath('//div[@class="breadcrumbs"]//li//span[@itemprop="title"]').extract()
    # breadcrumbs instead of list, which shadows the builtin.
    breadcrumbs = []
    for value in res:
        value = value.strip()
        if value != '/':
            breadcrumbs.append(value)
    item['tag'] = breadcrumbs
    item['tag_product_ids'] = productIDs
    return item
def go_go(self, response):
    hxs = HtmlXPathSelector(response)
    price = []
    for p in hxs.xpath(
            "//td[@class='price']/p[@class='new_price' or @class='price_no_discount']/text()"
    ).extract():
        if p != u'\r\n ':
            price.append(p.strip())
    writer = csv.writer(open('price.csv', 'a'), lineterminator='\n')
    for x, row in enumerate(hxs.xpath("//td[@class='item']")):
        order = str(self.order_id)
        name = row.xpath("h2/text()").extract()[0].strip()
        title = row.xpath("p[@class='description']/text()").extract()[0].strip()
        article = row.xpath("p[@class='article']/text()").extract()[0].strip()
        num = price[x]
        writer.writerow([field.encode('utf-8')
                         for field in [order, name, title, article, num]])
    description = self.parse_me(hxs, "//div[@itemprop='description']/text()")
    features = self.parse_me(hxs, "//div[@id='features']/dl/dd/text()")
    img = self.parse_me(hxs, "//div[@class='atg_store_productImage']/img/@src")
    alls = [str(self.order_id)] + description + features + img
    writer = csv.writer(open('shop.csv', 'a'), lineterminator='\n')
    writer.writerow([field.encode('utf-8') for field in alls])
    self.order_id += 1
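# go_go reopens price.csv and shop.csv in append mode on every call and never
# closes the handles. A small sketch of the same writes using a context
# manager, which closes the file as soon as the block exits:
import csv

def append_row(path, row):
    with open(path, 'a') as f:
        writer = csv.writer(f, lineterminator='\n')
        writer.writerow([field.encode('utf-8') for field in row])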
def parse_items(self, response):
    hxs = HtmlXPathSelector(response)
    item = NewsItem()
    item["link"] = response.request.url
    item["lang"] = "tr"
    item["source"] = "konya"
    category = hxs.xpath(
        "/html/body/div[6]/div[2]/div[1]/div[4]/div/div[2]/div[2]/div/div[1]/div[1]/div/h2"
    ).extract()
    # No date xpath is known for this source yet; hxs.xpath("") would raise
    # ValueError, so default to an empty list.
    date_time = []
    item["author"] = ""
    title = hxs.xpath(
        "/html/body/div[6]/div[2]/div[1]/div[4]/div/div[2]/div[2]/div/div[1]/div[2]/div[1]/div[2]"
    ).extract()
    intro = hxs.xpath("//*[@id='phoneDetails_0']").extract()
    new_content = ""
    # Processing outputs
    item["intro"] = ' '.join(intro)
    item["title"] = ' '.join(title)
    new_content = ' '.join(new_content)
    new_content = re.sub('\n', ' ', new_content)
    item["content"] = re.sub('\s{2,}', ' ', new_content)
    item["category"] = '|'.join(category)
    item["date_time"] = " ".join(date_time)
    return item
def parse(self, response): open("douban",'wb').write(response.body) self.log("Fetch group home page: %s" % response.url) hxs = HtmlXPathSelector(response) item = DoubanItem() #get group name item['groupName'] = hxs.xpath('//h1/text()').re("^\s+(.*)\s+$")[0] #get group id item['groupURL'] = response.url groupid = self.__get_id_from_group_url(response.url) #get group members number members_url = "http://www.douban.com/group/%s/members" % groupid members_text = hxs.xpath('//a[contains(@href, "%s")]/text()' % members_url).re("\((\d+)\)") item['totalNumber'] = members_text[0] #get relative groups item['relativeGroups'] = [] groups = hxs.select('//div[contains(@class, "group-list-item")]') for group in groups: url = group.xpath('div[contains(@class, "title")]/a/@href').extract()[0] item['relativeGroups'].append(url) #item['relativeGroups'] = ','.join(relative_groups) return item
def parse(self, response):
    # list.index raises ValueError instead of returning a negative index,
    # so the original `if index < 0` branch was unreachable.
    try:
        index = self.start_urls.index(response.url)
    except ValueError:
        print 'index < 0'
        return
    print index
    hxs = HtmlXPathSelector(response)
    urls = hxs.xpath('//div[@class="small_photo_wrap"]/ul/li/a/img/@data-big-url')
    url = urls.extract()
    self.VDBobj.Update(index, url)
    if len(url) == 0:
        urls = hxs.xpath('//div[@class="product_feature"]//img/@src')
        url = urls.extract()
        self.VDBobj.Update(index, url)
    if len(url) == 0:
        urls = hxs.xpath('//div[@class="hot_recommend"]//img/@src')
        url = urls.extract()
        self.VDBobj.Update(index, url)
def parse_money(self, response): # data stored global writer sel = HtmlXPathSelector(response) article = "".join(sel.xpath('//section[@class="article-body"]/p/text()').extract()) subheadline = "".join(sel.xpath('//h2[@class="article-excerpt"]/a/text()').extract()) articletime = " " #''.join(sel.xpath('//time[@datetime]/a/text').extract()) # Grabs the information from parse function title = response.meta["Title"] linktime = response.meta["LinkTime"] source = response.meta["Source"] link = response.meta["Link"] # Stores everything in a CSV file Money.writer.writerow( [ title.encode("utf-8"), subheadline.encode("utf-8"), source.encode("utf-8"), linktime.encode("utf-8"), articletime.encode("utf-8"), article.encode("utf-8"), link.encode("utf-8"), ] )
def parse_items(self, response): hxs = HtmlXPathSelector(response) data = imdbItem() data["seriesRating"] = hxs.xpath( '//span[@itemprop="ratingValue"]/text()').extract() seasonLink = hxs.xpath( '//div[@id="titleTVSeries"]/div[1]//span[@class="see-more inline"]/a/@href' ).extract() #Directly go to ratings page ''' if not seasonLink==[]: #print data["link"] url = data["link"][0]+'epdate' request = Request(url,callback=self.parse_episode_ratings) request.meta['item'] = data yield request ''' #follow season links - can get more data as opposed to above method if not seasonLink == []: for season in seasonLink: link = 'http://www.imdb.com/' + season request = Request(link, callback=self.parse_season_links) request.meta['item'] = data yield request
def detail(self, response):
    log.msg(response.url)
    hxs = HtmlXPathSelector(response)
    product_name = hxs.xpath(
        '//*[@id="vip_content_section"]/div[2]/h1/text()').extract()
    if len(product_name) != 0:
        product_name = product_name[0]
    product_price = hxs.xpath('//*[@id="price-val"]/text()').extract()
    if len(product_price) != 0:
        product_price = product_price[0]
    # Only build the item when both fields were actually found; the original
    # or-condition was effectively always true.
    if product_name and product_price:
        l = ItemLoader(item=BillionPricesIndiaItem(), response=response)
        l.add_xpath('product_name',
                    '//*[@id="vip_content_section"]/div[2]/h1/text()')
        l.add_xpath('category', '//*[@id="cat_crum"]/@value')
        l.add_xpath('product', '//*[@id="overview_tab"]/div/div/p/text()')
        item = l.load_item()
        item['product_url'] = response.url
        item['price'] = product_price
        item['vendor'] = 'PepperFry'
        item['city'] = 'Mumbai'
        item['state'] = 'Maharashtra'
        item['country'] = 'India'
        item['date'] = str(time.strftime("%d/%m/%Y"))
        return item
def parse(self, response): f = open('Demo.csv', 'a') hxs = HtmlXPathSelector(response) varDrugname = ( hxs.xpath('//div[@class="contentBox"]/h1/text()').extract()) varReviewID = (hxs.xpath( '//div[@class="contentBox"]/div/div/div[@class="user-comment"]/p[@class="user-name user-type user-type-2_non_member"]/text()' ).extract()) varReview = (hxs.xpath( '//div[@class="contentBox"]/div/div/div[@class="user-comment"]//p[1]//span//text()' ).extract()) # print [x.encode('ascii', 'ignore') for x in varReview] # varDrugname[0].replace("User Reviews for ","") mydb = MySQLdb.connect(host='localhost', user='******', passwd='welcome', db='drugs_review_drugs.com') cursor1 = mydb.cursor() name = list() drugname = list() for j in range(len(varReview)): name.append("drugs.com") drugname.append(varDrugname[0].replace("User Reviews for ", "")) result = zip(name, varReviewID, drugname, [x.encode('ascii', 'ignore') for x in varReview]) myfile = open('finalDrugReview1.csv', 'a') wr = csv.writer(myfile, quoting=csv.QUOTE_ALL) for row in result: wr.writerow(row)
def parse(self, response):
    log.msg(response.url)
    hxs = HtmlXPathSelector(response)
    # Look for the category count total.
    tot_cat_count_list = hxs.xpath(
        '//input[@id="total_category_content_count"]/@value').extract()
    if len(tot_cat_count_list) != 0:
        tot_cat_count = int(tot_cat_count_list[0])
        log.msg("Total Category Count is {}".format(tot_cat_count))
        if tot_cat_count > 60:
            page_link = response.url + "?p=1"
            log.msg("Crawl " + page_link)
            yield scrapy.Request(page_link, callback=self.pagescrape)
            self.crawledPageUrls.append(page_link)
            page_nums = hxs.xpath(
                '//*[@class="paginate pjaxer"]/text()').extract()
            for num in page_nums:
                # Only non-empty page numbers produce a valid link; the
                # original test (`if not num`) was inverted.
                if num:
                    page_link = response.url + "?p=" + num
                    if page_link not in self.crawledPageUrls:
                        print page_link
                        yield scrapy.Request(page_link, callback=self.pagescrape)
                        self.crawledPageUrls.append(page_link)
        else:
            page_link = response.url + "?p=1"
            log.msg("Crawl " + page_link)
            yield scrapy.Request(page_link, callback=self.pagescrape)
            self.crawledPageUrls.append(page_link)
def parse_letter(self, response):
    hxs = HtmlXPathSelector(response)
    # u'下一页' is the "next page" link text on the site.
    next_page_url = hxs.xpath(u"//a[text()='下一页']/@href").extract()
    flag = len(next_page_url) != 0
    if flag:
        request = Request(next_page_url[0], callback=self.parse_letter)
        request.meta['item'] = response.meta['item']
        yield request
    # Append this page's letters to the shared item loader; only the last
    # page (no next link) yields the finished item.
    l = response.meta['item']
    letter = ''
    letter1 = hxs.xpath("//script").re('(?<=doctorjy).*?doctorjy')
    letter = letter + self.parse_letter_detail1(letter1)['letter']
    letter2 = hxs.xpath("//table[@class='doctorjy']")
    letter = letter + self.parse_letter_detail2(letter2)['letter']
    l.add_value('comment', letter)
    if not flag:
        yield l.load_item()
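# parse_letter carries one item loader through request.meta so every page of
# a thread can append its fragment before the last page yields. The same
# pattern in isolation, as a spider-method sketch; the XPaths and the
# 'comment' field are placeholders, not any real site's markup.
def parse_page(self, response):
    loader = response.meta['item']
    loader.add_value('comment', ''.join(
        response.xpath('//div[@class="body"]//text()').extract()))
    next_href = response.xpath(u"//a[text()='下一页']/@href").extract()
    if next_href:
        request = Request(next_href[0], callback=self.parse_page)
        request.meta['item'] = loader
        yield request
    else:
        yield loader.load_item()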
def parse_product(self, response):
    product = WikiartProductsItem()
    hxs = HtmlXPathSelector(response)
    product['s_url'] = response.url
    product['product_name'] = hxs.xpath("//div[@class='tt30 pb8']/h1/text()").extract()[0]
    product_info = hxs.xpath("//div[@class='ArtistInfo']")
    image_info = product_info.xpath("//a[@id='paintingImage']/@href").extract()[0]
    product['resource_url'] = image_info
    for data_info in product_info.xpath("//div[@class='DataProfileBox']/p"):
        key = data_info.xpath("b/text()").extract()[0]
        if key == 'Material:':
            value = data_info.xpath("text()").extract()[1]
            value = value.replace("\r\n", '')
            product['material'] = value
        elif key == 'Dimensions:':
            value = data_info.xpath("text()").extract()[1]
            value = value.replace("\r\n", '')
            product['dimensions'] = value
    product['create_by'] = hxs.xpath("//a[@itemprop='author']/text()").extract()[0]
    years = product_info.xpath("//span[@itemprop='dateCreated']/text()").extract()
    if len(years) > 0:
        product['create_at'] = years[0]
    else:
        product['create_at'] = 'Unknown'
    product['product_style'] = product_info.xpath("//span[@itemprop='style']/text()").extract()
    product['product_genre'] = product_info.xpath("//span[@itemprop='genre']/text()").extract()
    return product
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    items = []
    item = UserdetailsItem()
    # Strip the URL down to the user id.
    idd = response.url
    idd = idd.replace("http://www.openprocessing.org/user/", "").replace("/", "")
    item["ids"] = idd
    # Website info.
    webs = hxs.xpath('//div[@id="userDetails"]/a/strong/text()').extract()
    item["website"] = webs
    # Location info.
    loc = hxs.xpath('//div[@id="userDetails"]/strong/text()').extract()
    item["location"] = loc
    # Date joined, e.g. <div id="userDetails"><blah blah>what we want</div>
    joined = hxs.xpath('//div[@id="userDetails"]/text()').extract()
    item["joined"] = joined
    # The page title holds the name of the person whose page it is.
    gd1 = hxs.xpath('//title/text()').extract()
    item["name"] = gd1
    # Membership status, e.g.
    # <a href="/membership/" class="hangingBox" style="...">Professor+</a>
    gd2 = hxs.xpath('//a[@href="/membership/"]/text()').extract()
    gd2 = str(gd2).replace(",", "").replace("go", "")  # clean things up
    item["membership"] = gd2
    items.append(item)
    return items  # return the list
def parse_item(self, response):
    hxs = HtmlXPathSelector(response)
    movie = DoubanItem()
    # Call .extract() so the item stores strings rather than Selector objects.
    # Movie title
    movie['title'] = hxs.xpath('//h1/span[@property="v:itemreviewed"]/text()').extract()
    # Director
    movie['director'] = hxs.xpath('//div[@id="info"]/span[1]/span[2]/a/text()').extract()
    # Starring actors
    movie['actor'] = hxs.xpath('//a[@rel="v:starring"]/text()').extract()
    # Genre
    movie['type'] = hxs.xpath('//*[@id="info"]//span[@property="v:genre"]/text()').extract()
    # Country / region
    movie['area'] = hxs.xpath('//*[@id="info"]/text()').extract()
    # Release date
    movie['publishtime'] = hxs.xpath('//span[@property="v:initialReleaseDate"]/text()').extract()
    # Runtime
    movie['time'] = hxs.xpath('//*[@id="info"]//span[@property="v:runtime"]/text()').extract()
    # Rating
    movie['rate_num'] = hxs.xpath('//strong[@property="v:average"]/text()').extract()
    # Number of ratings
    movie['rate'] = hxs.xpath('//div[@class="rating_sum"]/a/span/text()').extract()
    # Synopsis
    movie['introduce'] = hxs.xpath('//*[@id="link-report"]/span/text()').extract()
    yield movie
def parse_link(self, response):
    hxs = HtmlXPathSelector(response)
    item = ArticleItem()
    parsed_items = []
    item['pubDate'] = hxs.xpath('.//div[@class="date date--v2"]/text()').extract_first()
    item['title'] = hxs.xpath('.//div[@class="story-body"]/h1/text()').extract_first()
    article_list = hxs.xpath('.//div[@class="story-body__inner"]/p/text()').extract()
    article = ' '.join(article_list).strip(' \n')
    item['article'] = article
    item['description'] = hxs.xpath('.//div[@class="story-body__inner"]/p/text()').extract_first()
    item['url'] = response.url
    item['image_url'] = hxs.xpath('//span[@class="image-and-copyright-container"]/img/@src').extract_first()
    parsed_items.append(item)
    return parsed_items
def parse(self, response):
    """
    Default callback used by Scrapy to process downloaded responses
    """
    obj = JabongPageData()
    selector = HtmlXPathSelector(text=response.body)
    str1 = response.body
    pattern1 = '<h2 class="prod-disc" itemprop="description">'
    pattern2 = '</h2>'
    index1 = str1.index(pattern1)
    index2 = str1.index(pattern2)
    # Skip past the opening tag itself rather than hard-coding its length.
    tempRes = str1[index1 + len(pattern1):index2]
    try:
        tempResponse = "<div>" + tempRes + "</div>"
        parser = etree.HTMLParser()
        tree = etree.parse(StringIO(unicode(tempResponse, "utf-8")), parser)
        data = list()
        for element in tree.xpath(self.item_fields['desc2_1']):
            element = element.strip()
            if element:
                data.append(element)
        for element in tree.xpath(self.item_fields['desc2_2']):
            element = element.strip()
            if element:
                data.append(element)
        tdata = ""
        for element in tree.xpath(self.item_fields['desc2_3']):
            element = element.strip()
            if element:
                tdata = tdata + element + ","
        if tdata:
            tdata = tdata[:-1]
            data.append(tdata)
        obj['desc2'] = json.dumps(data)
    except Exception:
        print 'could not get h2 data'
        obj['desc2'] = ""
    x = selector.xpath(self.item_fields['brand'])
    obj['brand'] = x[0].extract()
    x = selector.xpath(self.item_fields['product-title'])
    obj['productTitle'] = x[0].extract()
    x = selector.xpath(self.item_fields['desc1'])
    data = dict()
    for element in x:
        print "entered"
        print element.xpath('label/text()').extract()
        key = element.xpath('label/text()').extract()[0]
        val = element.xpath('span/text()').extract()[0]
        data[key] = val
    obj['desc1'] = json.dumps(data)
    obj['requestURL'] = unicode(response.request.url, "utf-8")
    yield obj
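# Slicing at index1 + len(pattern1) still depends on the tag being spelled
# byte-for-byte as in pattern1. A sketch that selects the <h2> node directly
# with lxml instead; the class/itemprop values are copied from pattern1 above.
from lxml import etree

def extract_description_html(body):
    tree = etree.HTML(body)
    nodes = tree.xpath('//h2[@class="prod-disc" and @itemprop="description"]')
    if not nodes:
        return None
    node = nodes[0]
    # Serialize the inner markup, i.e. the span the string slicing extracted.
    return (node.text or '') + ''.join(
        etree.tostring(child, encoding='unicode') for child in node)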
def parse(self, response): hxs = HtmlXPathSelector(response) item = PhilomathItem() item['url'] = response.url item['title'] = hxs.xpath('//title/text()').extract() item['body'] = ' '.join(filter(bool, map(unicode.strip, hxs.xpath('//body//text()').extract()))) item['date'] = datetime.datetime.now().strftime("%m-%d-%Y %H:%M:%S") yield item
def parse(self, response):
    hxs = HtmlXPathSelector(text=response.body)
    xpath = '//h1/a/@href'
    print hxs.xpath(xpath).extract()
def parse(self, response): log.msg("Parsing content from url " + response.url) hxs = HtmlXPathSelector(response) subcat_links = hxs.xpath('//*[@id="url"]/@href') subcat_names = hxs.xpath('//*[@id="url"]/text()') for link, name in zip(subcat_links, subcat_names): print link.extract(), name.extract() print "Run Crawler for category " + name.extract() yield scrapy.Request(link.extract(), callback=self.detail_scrape)
def parser(self,response): hxs = HtmlXPathSelector(response) title = hxs.xpath("//div[@class='main-header']/h1[@itemprop='name']/text()").extract() art = hxs.xpath("//div[@class='product__control']/span[not(@class='product__control-name')]/text()").extract() article = art[0].encode('utf-8') code = art[1].encode('utf-8') brand = hxs.xpath("//div[@class='product__control']/a/text()").extract() summary = hxs.xpath("//div[contains(@class,'showhide') and contains(@class ,'item_desc')]/p/text()").extract()[0] print summary
def category_parse(self, response):
    ''' parse category info '''
    if type(response) == s_response.html.HtmlResponse:
        hxs = HtmlXPathSelector(response)
        # get sku list
        sku_list = []
        if hxs.xpath("//ul[@class='list-h']/li"):
            sku_list = [p.xpath('./@sku').extract()[0]
                        for p in hxs.xpath("//ul[@class='list-h']/li")]
        if hxs.xpath("//div[@id='plist']/div"):
            sku_list = [p.xpath('./@sku').extract()[0]
                        for p in hxs.xpath("//div[@id='plist']/div")]
        for sku in sku_list:
            yield Request('/'.join(['http://item.jd.com', sku + '.html']),
                          callback=JD_Product_Spider().parse)
        next_page_link = hxs.xpath("//div[@class='pagin fr']/a[@class='next']/@href")
        if next_page_link:
            if next_page_link.extract()[0].startswith("?"):
                yield Request('http://list.jd.com/list.html' + next_page_link.extract()[0],
                              callback=self.category_parse)
            elif next_page_link.extract()[0].startswith("http"):
                yield Request(next_page_link.extract()[0],
                              callback=self.category_parse)
    else:
        # Fall back to BeautifulSoup when the response is not an HtmlResponse.
        dom = BeautifulSoup(response.body)
        p_list = []
        if dom.find('div', id='plist').find_all('li'):
            p_list = dom.find('div', id='plist').find_all('li')
        if dom.find('div', id='plist').find_all('div'):
            p_list = dom.find('div', id='plist').find_all('div')
        sku_list = [p.get('sku', '') for p in p_list]
        for sku in sku_list:
            yield Request('/'.join(['http://item.jd.com', sku + '.html']),
                          callback=JD_Product_Spider().parse)
        next_page_a = dom.find('div', class_='pagin fr').find('a', class_='next')
        if next_page_a:
            # next_page_link is already a plain string here; the original
            # called .extract() on it, which would raise AttributeError.
            next_page_link = next_page_a.get('href')
            if next_page_link.startswith("?"):
                yield Request('http://list.jd.com/list.html' + next_page_link,
                              callback=self.category_parse)
            elif next_page_link.startswith("http"):
                yield Request(next_page_link, callback=self.category_parse)
def getReviewText(product_id, n=-1, filename=""): isWriting = False if filename != "": f = open(filename, "w") isWriting = True url = "http://shopping.naver.com/detail/section_user_review.nhn?nv_mid=" + str(product_id) + "&page=" urlForMaxPage = "http://shopping.naver.com/detail/detail.nhn?nv_mid=" + str(product_id) # to get maxPage d = requests.get(urlForMaxPage).text hxs = HtmlXPathSelector(text=d) total = int("".join(re.findall(r"\d+", hxs.xpath(".//*[@class='count']/text()").extract()[0]))) print "total review num = %d" % total maxPage = 0 if n == -1 or n > total: n = total if total % 20 != 0: maxPage = total / 20 + 1 else: maxPage = total / 20 print "Crawling ... (to %d)" % n # maxPage # to get review reviews = [] count = 0 for i in range(1, maxPage + 1): url_page = url + str(i) d = rf.remove_Tag(requests.get(url_page).text) hxs = HtmlXPathSelector(text=d) for each in hxs.xpath(".//*[@class='atc']/text()"): text = each.extract().strip() if len(text) > 0: review = rf.remove_whitespace(text.encode("utf-8")) if isWriting == True: f.write(review + "\n") reviews.append(review) count += 1 if count >= n: break if count >= n: break if isWriting == True: f.close() print "Complete product review crawling. (crawlingCount : %d / total : %d)" % (count, total) return reviews
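# A hypothetical call to getReviewText; the product id below is made up. It
# fetches up to 40 reviews and, because filename is set, also mirrors them
# into a file as a side effect.
reviews = getReviewText(10000001, n=40, filename="reviews.txt")
for review in reviews[:3]:
    print review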
def parser(self, response):
    hxs = HtmlXPathSelector(response)
    vacancy = hxs.xpath("//div[@class='b-vacancy-custom g-round']/h1[@class='title b-vacancy-title']/text()").extract()
    company = hxs.xpath("//div[@class='companyname']/a/text()").extract()
    # Extract the block once instead of re-running the same xpath three times.
    paddings = hxs.xpath("//div[@class='l-paddings']/text()").extract()
    price = paddings[3]
    city = paddings[4]
    exp = paddings[5]
    vacancy1 = vacancy[0].encode('utf-8')
    company1 = company[0].encode('utf-8')
    writer = csv.writer(open('price.csv', 'a'), lineterminator='\n')
    writer.writerow([vacancy1, company1, price.encode('utf-8'),
                     city.encode('utf-8'), exp.encode('utf-8')])
def parse_item(self, response): self.log('Hi, this is an item page! %s' % response.url) hxs = HtmlXPathSelector(response) i = BoleItem() i['title'] = hxs.xpath('//div[@class="article"]/h2/text()').extract() i['info_class'] = hxs.xpath('//div[@class="article_info"]/div[@class="textl"]/a[2]/text()').extract() i['info_area'] = hxs.xpath('//div[@class="article_info"]/div[@class="textl"]/a[3]/text()').extract() i['pub_date'] = hxs.xpath('//div[@class="article_info"]/div[@class="textr"]/text()').extract() i['content'] = hxs.xpath('//div[@class="article"]/div[@class="context"]/p').extract() return i
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    names = hxs.xpath('//td[@class="product_name"]/strong/text()').extract()
    imageurls = hxs.xpath('//tr/td[@align="center"]/a/img/@src').extract()
    for name, url in zip(names, imageurls):
        # Create a fresh item per product; reusing one instance would yield
        # the same mutated object repeatedly.
        item = CraigslistSampleItem()
        item["productname"] = name
        item["imgurl"] = url
        yield item