def parse_item(self, response):
    self.log('Hi, this is an item page! %s' % response.url)
    # r = redis.Redis(host='localhost', port=6379, db=0)
    self.r.set(response.url, 1)
    sel = Selector(response)
    item = XidianinfoItem()
    urlParts = response.url.strip().split('/')
    item['newsType'] = urlParts[-2]
    item['newsId'] = urlParts[-1][:-4]
    item['newsTitle'] = sel.xpath('//td[@class="titlestyle1040"]/text()').extract_first()
    item['newsTime'] = sel.xpath('//span[@class="timestyle1040"]/text()').extract_first().strip()
    item['newsFrom'] = sel.xpath('//span[@class="authorstyle1040"]/text()').extract_first().strip()
    item['newsContent'] = sel.xpath('//div[@class="c1040_content"]//p').extract_first()
    image_urls = response.xpath('//div[@class="c1040_content"]/div/p/img/@src').extract()
    item['image_urls'] = []
    for image_url in image_urls:
        item['image_urls'].append(image_url.replace('../..', 'http://info.xidian.edu.cn'))
    # Debug output; plain str concatenation keeps this working on Python 3.
    print('============================')
    print(item['newsType'].strip() + '---------------------')
    print(item['newsId'].strip() + '---------------------')
    print(item['newsTitle'].strip() + '---------------------')
    print(item['newsTime'].strip() + '---------------------')
    print(item['newsFrom'].strip() + '---------------------')
    print(item['newsContent'].strip() + '---------------------')
    return item
def parse_img(self, response):
    sel = Selector(response)
    for divs in sel.xpath("//div[@class='pic-meinv']"):
        # Create a fresh item per div; yielding one shared, mutated item
        # would make every yielded result point at the last value.
        urlItem = MeituItem()
        img_url = divs.xpath("a/img[@class='pic-large']/@src").extract()
        urlItem['image_urls'] = img_url
        yield urlItem
def parse_person(self, response):
    person = response.meta["person"]
    person["source_url"] = response.url
    # Connect person to division
    division_role = DivisionRole()
    division_role["source_url"] = response.meta["division_url"]
    division_role["name"] = "Leiter"  # WARNING: hard-coded value
    division_role["person_url"] = response.url
    division_role["division_url"] = response.meta["division_url"]
    yield division_role
    # TODO Ask students for other fields to parse here
    yield person
    return  # Don't do publications while the parsing is broken

    # Parse publication list
    sel = Selector(response)
    publications_list = sel.css(".gs_publication > .gs_publication_list .gs_publication_list")
    current_publication_type = None
    # Remove the fragment (whether or not it exists) and add the fragment separator
    source_url_base = response.url.split("#")[0] + "#"
    for item in publications_list:
        current_publication_type = join(item.xpath("h3/text()").extract(), "")
        for pub_item in item.xpath("p"):
            publication = self.create_publication(pub_item, current_publication_type, source_url_base)
            # TODO remove person from publication["author_names"] and set publication["author_ids"] instead.
            if publication:
                yield publication
def parse(self, response):
    sel = Selector(response)
    title = sel.xpath('//h1/text()').extract()[0]
    title = polishTitle(title, self.name)
    print(title)
    tmpNovelDirPath = os.path.join(self.tmpDirPath, title)
    if not os.path.isdir(tmpNovelDirPath):
        os.makedirs(tmpNovelDirPath)
    dd = sel.xpath('//dl/dd')
    id = 0
    for d in dd:
        id += 1
        # Reorder ids within each row of three (// keeps this an int on Python 3)
        nid = ((id - 1) // 3 + 1) * 3 - (id - 1) % 3
        a = d.xpath('a')
        if len(a) == 0:
            continue
        url = a.xpath('@href').extract()[0]
        url = response.urljoin(url.strip())
        subtitle = a.xpath('text()').extract()[0]
        subtitle = polishSubtitle(subtitle)
        print(url)
        print(subtitle)
        request = scrapy.Request(url, callback=self.parse_page)
        item = NovelsItem()
        item['title'] = title
        item['subtitle'] = subtitle
        item['id'] = nid
        item['type'] = 'novels'
        request.meta['item'] = item
        yield request
def parse_overview(self, response):
    """ Parse start page, branching out to each research subject """
    sel = Selector(response)
    for link in sel.css("#c12546 li a"):
        url = join(link.xpath("@href").extract(), "")
        url = self.fix_url(url, response.url)
        yield Request(url, callback=self.parse_research)
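# Several snippets above (parse_person, parse_overview) call a free-standing
# join(seq, sep) instead of sep.join(seq), in the style of Python 2's
# string.join. The snippets don't show where it comes from, so the helper
# below is only a minimal sketch of what is presumably meant (an assumption,
# not code from the original projects):
def join(parts, sep=""):
    # Concatenate the strings extracted by an XPath query; an empty
    # extract() result simply yields "".
    return sep.join(parts)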
def parse_disease(self, response):
    """
    target_url = response.meta['url']
    if(not target_url == response.url):
        request = Request(target_url, callback=self.parse_disease)
        request.meta['url'] = target_url
        yield request
    """
    # if not response.status == 200:
    #     1+1
    sel = Selector(response)
    disease_title = sel.xpath('//div[@id="printMe"]/h1/text()').extract()[0]
    # context = sel.xpath('//*[@id="div_nest"]').extract()[0]
    context = ''
    temp = ''.join([stuff + '\n' for stuff in sel.xpath('//div[@id="printMe"]//span[@id="info576"]/*[self::p or self::ol or self::ul]//text()').extract()])
    if temp:
        context = html2text(temp)
    drugs_and_stuff = ''.join([stuff + '\n' for stuff in sel.xpath('//a[@class="rest_data_list"]/text()').extract()])
    # time.sleep(5)
    yield DiseaseDescription(
        url=response.url,
        name=disease_title,
        description=context,
        drugs=drugs_and_stuff,
    )
def parse_drug(self, response):
    sel = Selector(response)
    drug_name = sel.xpath('//div[@id="printMe"]/h1/text()').extract()[0]
    context = sel.xpath('//div[@id="printMe"]').extract()[0]
    classification = ''
    all_subheads = sel.xpath('//div[@id="printMe"]/h2/text()').extract()
    description, usage, contra, side, overdose = '', '', '', '', ''
    for i, subhead in enumerate(all_subheads):
        n = i + 1
        if u"Лекарственная форма, состав, упаковка" in subhead:  # "Dosage form, composition, packaging"
            description = ''.join(self.p_between_id(n, sel))
        elif u"Режим дозирования" in subhead or u"Показания к применению" in subhead:  # "Dosage regimen" / "Indications"
            usage = ''.join(self.p_between_id(n, sel))
        elif u"Противопоказания" in subhead:  # "Contraindications"
            contra = ''.join(self.p_between_id(n, sel))
        elif u"Побочные действия" in subhead:  # "Side effects"
            side = ''.join(self.p_between_id(n, sel))
        elif u"Передозировка при приёме" in subhead:  # "Overdose"
            overdose = ''.join(self.p_between_id(n, sel))
    # time.sleep(5)
    yield DrugDescription(
        url=response.url,
        name=drug_name,
        classification=classification,
        description=description,
        usage=usage,
        contra=contra,
        side=side,
        overdose=overdose,
        info=html2text(context),
    )
def parse(self, response):
    items = []
    filename = 'pic_url.txt'
    pic_url = open(filename, 'w')  # text mode: the writes below pass str, not bytes
    pic_url.write("Start")
    sel = Selector(response)
    pics = sel.xpath('//*[@id="comments"]/ol/li')
    pic_url.write('Version: 0.2')
    for pic in pics:
        ooRate = pic.xpath('div[1]/div/div[2]/*[@class="vote"]/span[2]/text()').extract()
        if len(ooRate) > 0 and int(ooRate[0]) > 100:
            item = PatuItem()
            item['support_rate'] = ooRate
            pic_url.write(str(pic.xpath('div[1]/div/div[2]/p/img/@src').extract()) + '\n')
            if pic.xpath('div[1]/div/div[2]/p/img/@org_src'):
                item['image_urls'] = pic.xpath('div[1]/div/div[2]/p/img/@org_src').extract()
            else:
                item['image_urls'] = pic.xpath('div[1]/div/div[2]/p/img/@src').extract()
            item['images'] = ''
            # print(pic.xpath('div[1]/div/div[2]/p/img/@src').extract())
            items.append(item)
            yield item
    pic_url.write("End")
    pic_url.close()
def parse(self, response):
    sel = Selector(response)
    url_letters = sel.xpath('//div/ul[@class="alphaLinks"]//a/@href').extract()
    for url in url_letters:
        print(url)
        # time.sleep(5)
        yield Request(url, callback=self.parse_letter)
def run(self):
    site = urllib2.urlopen("http://www.nasdaq.com/markets/upcoming-splits.aspx")
    html = site.read()
    response = Selector(text=html, type="html")
    # headers
    headers = response.xpath('//table[@rules="all"]/tr/th/text()').extract()
    headers = self.f.mushList(headers)
    # data
    data = response.xpath('//table[@rules="all"]/tr/td/text()').extract()
    # companyNames
    companyNames = response.xpath('//table[@rules="all"]/tr/td/a/text()').extract()
    models = []
    d = 0
    for com in companyNames:
        m = InsertModel(self.table)
        splits = self.splitName(com)
        for index, h in enumerate(headers):
            if index == 0:
                m.insert("CompanyName", self.f.filterForSQL(splits[1]))
                m.insert("Symbol", splits[0])
                continue
            elif index == 1:
                m.insert(h, self.getRatio(data[d]))
            else:
                m.insert(self.f.headerFilter(h), self.f.convertDate(data[d]))
            d += 1
        models.append(m)
    return models
def parse(self, response):
    sel = Selector(response)
    result = []
    ad = DatesItem()
    ad['name'] = ""
    for p in sel.xpath("//div[@class='poziomd']//text()").extract():
        if re.match(r"^.*,", p):
            if p.startswith(","):
                ad['desc'] = p[2:]
            else:
                ad['desc'] = p[6:]
            ad['name'] = ad['name'].lstrip('1234567890() ').strip()
            if re.match(r'^.\s', ad['name']):
                ad['name'] = ad['name'][2:]
            ad['url'] = response.url
            if re.match(r".*urodzeni.*", response.url):  # "urodzeni" is Polish for "born"
                ad['isBirth'] = True
            else:
                ad['isBirth'] = False
            result.append(ad)
            ad = DatesItem()
            ad['name'] = ""
        elif re.match(r"^\s*[0-9]{1,4}", p) and 'date' not in ad:
            ad['date'] = re.match(r"^\s*[0-9]{1,4}", p).group()
        else:
            ad['name'] = ad['name'] + p
    return result
def parse_pathPage(self, response):
    # print("parse a path page!")
    sel = Selector(response)
    item = {}
    item["name"] = sel.xpath('//h1[@class= "zm-editable-content"]/text()').extract()
    # Collect the topic paths
    paths = []
    for path_selector in sel.xpath('//div[@class= "zm-topic-tree"][1]/ul'):
        # Extract one path and append it to paths
        one_path = path_selector.xpath('.//a/text()').extract()
        paths.append(one_path)
    item["paths"] = paths
    # Append the item to a local file
    with io.open("tag_paths_app.jsonl", "a", encoding="utf8") as outfile:  # "a" means appending mode
        row = json.dumps(item, ensure_ascii=False, sort_keys=True)
        print(row, file=outfile)
    # Update the spider's tag structure dictionary
    outside = self.d  # initially, the outside is the whole dictionary
    for path in paths:
        for i in path:
            try:
                inside = outside[i]
            except KeyError:
                inside = {}
                outside[i] = inside
            outside = inside
        outside = self.d  # reset the outside to the whole dictionary
    # Add this item's paths to the spider's p dictionary
    self.p.append(item)
def parse_item(self, response):
    sel_detail = Selector(response)
    item = response.meta['item']
    desc = sel_detail.xpath('//*[@id="mainArea"]/*').extract()
    item['desc'] = [d.encode('UTF-8') for d in desc]
    print("Done!")
    yield item
def parsePrice(self, response):
    sel = Selector(response)
    item = BitautoAllPriceItem()
    item['city'] = filt(sel.xpath('//div[@class="adress"]/text()').extract()[0], u'地址:', u'市')  # slice between "address:" and "city"
    item['dealer'] = sel.xpath('//div[@class="info"]/h1/text()').extract()[0]
    item['dealerid'] = filt(response.url, '.com/', '/')
    db = SimpleMysql(host='127.0.0.1:5029', db='wholenetwork', user='******', passwd='')
    trs = sel.xpath('//div[@class="car_list"]')
    for tr in trs:
        tmp = tr.xpath('div/div[@class="car_top"]/h3/a')
        item['brand'] = tmp.xpath('text()').extract()[0]
        item['brandid'] = filt(tmp.xpath('@href').extract()[0], 'cars_', '.html')
        prices = tr.xpath('div/div[@class="car_price"]/table/tbody/tr')
        for price in prices:
            if not price.xpath('td'):
                continue  # filter out th (header) rows
            item['model'] = price.xpath('td[1]/a/@title').extract()[0]
            item['modelid'] = filt(price.xpath('td[1]/a/@href').extract()[0], 'price_detail/', '.html')
            item['oprice'] = price.xpath('td[2]/text()').extract()[0].replace(u' ', '').replace('\r\n', '').replace(u'万', '')
            item['price'] = price.xpath('td[4]/a/text()').extract()[0].replace('\r\n', '').replace(u' ', '').replace(u'万', '')
            item['off'] = price.xpath('td[3]/em/text()').extract()[0].replace('\r\n', '').replace(u' ', '').replace(u'万', '').replace(u'↓', '')
            if ISSAVE:
                doSave(db, item)
            if ISPOST:
                doPost(API_ADDRESS, item)
    np = sel.xpath('//div[@id="pager"]/a')
    # `if`, not `while`: np never changes in the body, so a while loop would
    # yield the same "next page" (u'下一页') request forever.
    if np and (np[-1].xpath('text()').extract()[0] == u'下一页'):
        url = np[-1].xpath('@href').extract()[0]
        url = response.urljoin(url)
        yield Request(url, self.parsePrice)
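# The pagination tail in parsePrice above is the usual Scrapy "follow the next
# link" idiom. A minimal, generic sketch of that idiom; the spider name, the
# `a.next` selector, and reusing parse as the callback are illustrative
# assumptions, not taken from the original site:
import scrapy

class PagedSpider(scrapy.Spider):
    name = "paged_example"  # hypothetical spider for illustration

    def parse(self, response):
        # ... extract items from the current page here ...
        next_href = response.css('div#pager a.next::attr(href)').get()  # assumed selector
        if next_href:
            # response.follow resolves relative URLs against response.url
            # and schedules the same callback for the next page.
            yield response.follow(next_href, callback=self.parse)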
def parse_tagPage(self, response):
    sel = Selector(response)
    # The tag's name and link
    name = sel.xpath('//h1[@class= "zm-editable-content"]/text()').extract()
    relative_link = sel.xpath('//div[@class= "zm-topic-topbar"]//a/@href').extract()
    # The tag's parents
    parents = sel.xpath('//div[@id= "zh-topic-organize-parent-editor"]//a[@class= "zm-item-tag"]/text()').extract()
    parents = [s.replace("\n", "") for s in parents]
    # The tag's children
    children = sel.xpath('//div[@id= "zh-topic-organize-child-editor"]//a[@class= "zm-item-tag"]/text()').extract()
    children = [s.replace("\n", "") for s in children]
    # Keep the tag item for the final output
    item = {}
    item["name"] = name
    item["relative_link"] = relative_link
    item["parents"] = parents
    item["children"] = children
    self.l.append(item)
    # Append the item to a local file
    with io.open("tag_items.jsonl", "a", encoding="utf8") as outfile:  # "a" means appending mode
        row = json.dumps(item, ensure_ascii=False)
        print(row, file=outfile)
    # Mimic the return of CrawlSpider's default parse() so that the rules are applied
    # continuously instead of just once on the start_urls
    return self.parse(response)
def parsePage(self, response):
    # inspect_response(response, self)
    try:
        unicode(response.body.decode(response.encoding)).encode('utf-8')
    except exceptions.UnicodeDecodeError:
        print("exception error")
    sel = Selector(response)
    doclist = sel.xpath(
        '/html/body/div[@id="divprincipal"]' +
        '/div[@class="minwidth"]' +
        '/div[@id="idInternetBlocoEmpacotador"]' +
        '/div[@class="incenter_interno"]' +
        '/div[@id="idDivContainer"]' +
        '/div[@id="idAreaBlocoExterno"]' +
        '/div[@id="idArea"]' +
        '/div[@id="corpopaginajurisprudencia"]' +
        '/div[@id="listadocumentos"]' +
        '/div[@style="position: relative;"]')
    for doc in doclist:
        yield self.parseDoc(doc)
        self.fIndex = self.fIndex + 1
    nextPage = sel.xpath('//*[@id="navegacao"][1]/a[@class="iconeProximaPagina"]')
    if nextPage:
        yield Request(
            urlparse.urljoin('http://www.stj.jus.br/', nextPage.xpath('@href').extract()[0]),
            callback=self.parsePage)
    else:
        self.saveSearchInfo()
def parse_profile_frameset(self, response):
    sel = Selector(response)
    url = parse.get_extracted(sel.xpath('//frame[@name="mainFrame"]/@src'))
    url = urljoin(response.url, url)
    meta = response.request.meta
    return Request(url, callback=self.parse_profile, meta=meta)
def parse_brand(self, response):
    sel = Selector(response)
    items = []
    product_sites = sel.xpath('//div[@class="product_result_box"]/ul/li')
    for product_site in product_sites:
        img_src = self.check_list(product_site.xpath('a[@class="pro_item"]/img/@src').extract())
        if img_src == '':
            product_id = ''
        else:
            img_id = img_src.split('/')[-1]
            product_id = img_id.split('_')[0]
        product_name = self.check_list(product_site.xpath('div[@class="searchlist_tit"]/a/text()').extract())
        comment_href = self.check_list(product_site.xpath('a[@class="pro_item"]/@href').extract())
        comment_temp = comment_href.split('_')[-1]
        comment_id = comment_temp.split(".")[0]
        comment_url = "http://koubei.jumei.com/comment_list-" + comment_id + "-1.html"
        r = Request(comment_url,
                    meta={'product_id': product_id, 'product_name': product_name},
                    callback=self.parse_comment)
        items.append(r)
    next_brandpage = response.xpath('//div[@class="pageSplit"]/a[@class="next"]/@href').extract()
    if len(next_brandpage):
        next_brandpage_url = "http://koubei.jumei.com" + next_brandpage[0]
        r = Request(next_brandpage_url, callback=self.parse_brand)
        items.append(r)
    return items
def people_page(self, response):
    yield self.parse_item(response)
    sel = Selector(response)
    # Following and followers
    following = sel.xpath('//div[@class="zm-profile-side-following zg-clear"]')
    # TODO: recursively collect the following data of all valid users
    followings = following.xpath('.//a/@href').extract()
    for follow_link in followings:
        # yield self.cookiejar_addcookies(response, url=follow_link, callback=self.followees_page)
        # Calling it that way gets redirected; not solved yet.
        self.webdriver_addcookies(follow_link)
        browerHeight = self.driver.execute_script('return document.body.scrollHeight;')
        while True:
            # do the scrolling
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(1)  # wait for the data to finish loading
            scrollHeight = self.driver.execute_script('return document.body.scrollHeight;')
            if browerHeight == scrollHeight:
                break
            browerHeight = scrollHeight
        peoplelinks = self.driver.find_elements_by_xpath('//a[@class="zm-item-link-avatar"]')
        for link in peoplelinks:
            href = link.get_attribute('href')
            # Some users' links cannot be found here; to be investigated.
            yield self.cookiejar_addcookies(response, url=href, callback=self.people_page)
    # followees = followings[0]  # links the user follows
    # followers = followings[1]  # links of the user's followers
def search_parse(self, response):
    sel = Selector(response)
    lst = sel.css('ul.list')[0].xpath('./li')
    for cur in lst:
        # print "cur is %s" % (cur.extract())
        cura = cur.xpath('./a/@href')[0].extract()
        # print "cur is %s" % (cura)
        at = cur.xpath('./a/text()')
        if len(at) > 0:
            curn = cur.xpath('./a/text()')[0].extract()
        else:
            curn = cur.xpath('./a/font/text()')[0].extract()
        bt = curn.find(u'《')
        et = curn.find(u'》')
        if bt == -1 or et == -1:
            name = curn
        else:
            name = curn[(bt + 1):et]
        # print 'cur movie name is ' + name
        self.moviedb.insMovie_6vhao(cura, name)
    return []
def run(self):
    html = urllib2.urlopen("http://getsplithistory.com/AA").read()
    response = Selector(text=html, type="html")
    datesAndOther = response.xpath("//table/tbody/tr/td/text()").extract()
    ratios = response.xpath("//table/tbody/tr/td/span/text()").extract()
    objects = []
    rs = []
    for i, data in enumerate(datesAndOther[:-1]):
        if i % 4 == 0:
            objNum = len(objects)
            objects.append(dict())
            objects[objNum]["date"] = self.cleanDate(data)
        if (i - 1) % 4 == 0:
            objNum = len(objects) - 1
            objects[objNum]["denom"] = self.cleanDenom(data)
    for i, data in enumerate(ratios[:-1]):
        if i % 3 == 0:
            objects[i // 3]["num"] = self.cleanNum(data)  # integer division for list indexing
    for o in objects:
        o["factorial"] = float(o["num"]) / float(o["denom"])
    # now we insert the date, symbol name and the factorial into the DB
    for o in objects:
        IM = InsertModel("jdfkasdklfj")  # placeholder table name; was `tableName`
        IM.insert("e", o["date"])
        IM.insert("symbol", symbol)  # NOTE: `symbol` is not defined in this snippet
        IM.insert("Ratio", o["factorial"])
def parse_item(self, response):
    sel = Selector(response)
    i = CountryDataCrawlerItem()
    i['item_type'] = 'country'
    i['crawled_url'] = response.url
    print(response.url)
    print(i['item_type'])
    i['name'] = sel.xpath('//div[@class="b_title clrfix"]/div[@class="tit"]/text()').extract()[0].encode('UTF-8')
    print(i['name'])
    image_url_list = sel.xpath('//img/@src').extract()
    image_url_list = utils.image_url_filter(image_url_list, '720x400')
    i['image_url'] = utils.get_image_url_string(image_url_list)
    print(i['image_url'])
    # i['image_path'] = 'test'
    i['image_path'] = utils.save_image_to_oss(image_url_list)
    print('image_path')
    brief_info_list = sel.xpath('//div[@class="countbox"]//text()').extract()
    i['brief_description'] = utils.scrape_str(brief_info_list)
    print(i['brief_description'])
    # Fetch the detail page
    detail_info_source = urllib2.urlopen(response.url + '/zhinan').read()
    detail_info_page = etree.HTML(detail_info_source.lower())
    detail_info_list = detail_info_page.xpath("//div[@class='b_g_cont']//text()")
    i['detail_description'] = utils.scrape_str(detail_info_list)
    print(i['detail_description'])
    i['last_update_time'] = utils.get_current_time()
    print(i['last_update_time'])
    self.data_count += 1
    print("Crawled %d items so far." % self.data_count)
    return i
def parse_news(self, response):
    item = response.meta['item']
    sel = Selector(response)
    item['news_title'] = sel.xpath('//title/text()').extract()
    news_media = sel.xpath('//meta[@name="mediaid"]/@content').extract()
    if news_media:
        item['news_media'] = news_media[0]
    else:
        item['news_media'] = "NoMedia"
    timelist1 = sel.xpath('//span[@class="time-source"]/text()').re(r'\d+')[0:3]
    timelist2 = sel.xpath('//span[@id="pub_date"]/text()').re(r'\d+')[0:3]
    timelist3 = sel.xpath('//span[@class="time"]/text()').re(r'\d+')[0:3]
    timelist = timelist1 + timelist2 + timelist3
    # print timelist
    item['news_pubtime'] = ['-'.join(map(str, timelist))]
    news_content1 = sel.xpath('//div[@id="artibody"]').extract()
    news_content2 = sel.xpath('//div[@class="mainContent"]').extract()
    item['news_content'] = news_content1 + news_content2
    channel1 = sel.xpath('//script').re(r'channel:.*\'(.*)\'')
    channel2 = sel.xpath('//script').re(r'channel:.*\"(.*)\"')
    channel = channel1 + channel2
    newsid1 = sel.xpath('//script').re(r'newsid:.*\'(.*)\'')
    newsid2 = sel.xpath('//script').re(r'newsid:.*\"(.*)\"')
    newsid = newsid1 + newsid2
    item['news_id'] = newsid[0]
    cmturl = "http://comment5.news.sina.com.cn/page/info?format=json&channel=%s&newsid=%s&page_size=200" % (channel[0], newsid[0])
    item['news_commenturl'] = cmturl
    yield Request(url=cmturl, callback=self.parse_commentnum, meta={'item': item})
def parse_item(self, response): """ Main parse function """ sel = Selector(response) item = ProductItem() item['source'] = 'tmall' item['name'] = self.get_product_name( sel ) item['img'] = sel.xpath("//ul[@id='J_UlThumb']/li")[0].xpath(".//a/img/@src").extract()[0] item['category'] = self.get_category(response) try: # 获取TShop字符串,并对TShop字符串进行JSON标准化处理 TShop_str = sel.re('TShop\.Setup\(((.|\n)+?)\);')[0] # 移除注释,目前只有天猫超市有注释,以逗号开头 regex = re.compile(',\s*\/\/[^\n]*') TShop_str = re.sub(regex, ',', TShop_str) TShop = eval( TShop_str, type('Dummy', (dict,), dict(__getitem__=lambda s,n:n))() ) except SyntaxError: return item['itemId'] = TShop.get('itemDO').get('itemId', '') item['url'] = 'http://detail.tmall.com/item.htm?id=' + item['itemId'] item['date'] = date.today().strftime('%Y-%m-%d') item['attr'], item['brand'] = self.get_attr_and_brand( sel ) skuMap = self.get_sku_chinese_map( sel, TShop ) initApi_url = TShop.get('initApi') yield Request( initApi_url, headers={'Referer': 'http://www.google.com.hk/'}, meta={'item': item, 'skuMap': skuMap}, callback=self.parse_initapi )
def parse(self, response):
    location = response.url.lower().split("?")
    if location[0] in self.seen:
        # pass
        self.log('already seen %s' % response.url)
    else:
        self.log('parsing %s' % response.url)
        self.seen.add(location[0])
        hxs = Selector(response)
        if re.match('http://www.simplyrecipes.com/recipes/+', location[0]) and not re.search('(ingredient|course|season|type|cuisine)', location[0]):
            item = BaseItem()
            self.last = self.last + 1
            item['pk'] = self.last
            item['title'] = hxs.xpath('//title/text()').extract()
            item['id'] = response.url
            item['source'] = "simplyrecipes"
            item['url'] = location[0]
            item['text'] = response.body
            item['content'] = response.body_as_unicode()
            self.log("saving item " + response.url)
            yield item
        for url in hxs.xpath('//a/@href').extract():
            url = url.replace('http://www.simplyrecipes.com', '')
            if not url in self.seen and not re.search(r'.(pdf|zip|jar)$', url) and \
                    url.lower()[0:9] == '/recipes/' and "," not in url:
                # self.log("yielding request " + url)
                yield Request('http://www.simplyrecipes.com' + url, callback=self.parse)
def parse_celebrity(self, response):
    """Crawl a celebrity profile."""
    celebrity = CelebrityItem()
    sel = Selector(response)
    celebrity["id"] = self._parse_id(response.url)
    name = sel.css("div.per_header h2::text").extract()
    celebrity["name"] = name[0] if name else ""
    name_en = sel.css("div.per_header p.enname::text").extract()
    celebrity["name_en"] = name_en[0] if name_en else ""
    yield Request(
        url=urljoin(response.url, "details.html"),
        callback=self.parse_celebrity_detail,
        meta={"celebrity": celebrity.copy()}
    )
    yield Request(
        url=urljoin(response.url, "awards.html"),
        callback=self.parse_celebrity_awards,
        meta={"celebrity": celebrity.copy()}
    )
    yield celebrity
def parse_celebrity_detail(self, response):
    """Crawl the celebrity detail page."""
    celebrity = response.meta["celebrity"]
    sel = Selector(response)
    for dt in sel.css("div.per_info_l dt"):
        title = dt.css("::text").extract()[0]
        if title == "出生日期:":  # date of birth
            text = dt.css("::text").extract()[1].rstrip(")")
            if "(" in text:
                birthday, birthplace = text.split("(", 1)
            else:
                birthday, birthplace = text, ""
            celebrity["birthday"] = birthday
            celebrity["birthplace"] = birthplace
        elif title == "血型:":  # blood type
            celebrity["blood"] = dt.css("::text").extract()[1]
        elif title == "星座:":  # constellation
            celebrity["constellation"] = dt.css("::text").extract()[1]
        elif title == "身高:":  # height
            celebrity["height"] = int(dt.css("::text").extract()[1].rstrip("cm"))
        elif title == "体重:":  # weight; assigning to "height" here would overwrite the value parsed above
            celebrity["weight"] = int(dt.css("::text").extract()[1].rstrip("kg"))
    celebrity["intro"] = "\n".join(sel.css("div#lblAllGraphy p::text").extract())
    return celebrity
def parsePostsList(self, response):
    sel = Selector(response)
    posts = sel.xpath('//dl[@class="discussion clear i0 xg_lightborder"]')
    items = []
    topic = response.xpath('//h1/text()').extract_first()
    url = response.url
    item = PostItemsList()
    item['author'] = response.xpath('//div[@class="xg_module xg_module_with_dialog"]//ul[@class="navigation byline"]/li/a[contains(@href,"profile")]/text()').extract_first()
    item['author_link'] = response.xpath('//div[@class="xg_module xg_module_with_dialog"]//ul[@class="navigation byline"]/li/a[contains(@href,"profile")]/@href').extract_first()
    item['create_date'] = response.xpath('//div[@class="xg_module xg_module_with_dialog"]//ul[@class="navigation byline"]/li/a[@class="nolink"][2]/text()').extract_first().replace('on', '').replace('in', '').strip()
    item['post'] = re.sub(r'\s+', ' ', " ".join(response.xpath('//div[@class="xg_module xg_module_with_dialog"]//div[@class="xg_user_generated"]/p/text()').extract()).replace("\t", "").replace("\n", "").replace("\r", ""))
    item['tag'] = 'epilepsy'
    item['topic'] = topic
    item['url'] = url
    logging.info(item.__str__())  # the original passed the unbound method, logging nothing useful
    items.append(item)
    for post in posts:
        item = PostItemsList()
        item['author'] = post.xpath('./dt[@class="byline"]/a[contains(@href,"user")]/text()').extract_first()
        item['author_link'] = post.xpath('./dt[@class="byline"]/a[contains(@href,"user")]/@href').extract_first()
        item['create_date'] = post.xpath('./dt[@class="byline"]/span[@class="timestamp"]/text()').extract_first()
        item['post'] = re.sub(r'\s+', ' ', " ".join(post.xpath('.//div[@class="description"]/div[@class="xg_user_generated"]/p/text()').extract()).replace("\t", "").replace("\n", "").replace("\r", ""))
        item['tag'] = 'epilepsy'
        item['topic'] = topic
        item['url'] = url
        logging.info(item.__str__())
        items.append(item)
    return items
def parse(self, response):
    sel = Selector(response)
    title = sel.xpath('//h2/text()').extract()[0]
    title = "%s-%s" % (title, self.name)
    title = self.polishString(title)
    print(title)
    tmpNovelDirPath = os.path.join(self.tmpDirPath, title)
    if not os.path.isdir(tmpNovelDirPath):
        os.makedirs(tmpNovelDirPath)
    dd = sel.xpath('//li/a')
    id = 0
    for d in dd:
        id += 1
        url = d.xpath('@href').extract()[0]
        url = response.urljoin(url)
        subtitle = d.xpath('text()').extract()[0]
        subtitle = self.polishString(subtitle)
        subtitle = '\n\n********* [%d] - %s *********\n\n' % (id, subtitle)
        print(url)
        print(subtitle)
        request = scrapy.Request(url, callback=self.parse_page)
        item = NovelsItem()
        item['title'] = title
        item['subtitle'] = subtitle
        item['id'] = id
        item['type'] = 'novels'
        request.meta['item'] = item
        yield request
def parse_category(self, response):
    self.visited_url.add(response.url)
    dom = Selector(response)
    subcategories = dom.xpath("//div[contains(@id,'mw-subcategories')]//a")
    for subcategory in subcategories:
        url = self.domain + subcategory.xpath("./@href").extract()[0]
        if url not in self.visited_url:
            self.visited_url.add(url)
            item = FoodbkItem()
            item["up"] = response.url.decode("utf-8")
            item["down"] = url
            yield item
            yield self.make_requests_from_url(url).replace(callback=self.parse_category)
    entities = dom.xpath("//div[contains(@id,'mw-pages')]//a")
    for e in entities:
        url = e.xpath("./@href").extract()[0]
        if self.template_url_pattern.match(url) is not None:
            continue
        url = self.domain + url
        item = FoodbkItem()
        item["up"] = response.url.decode("utf-8")
        item["down"] = url
        yield item
    indexes = dom.xpath("//a[contains(text(),'200')]")
    for index in indexes:
        if index.xpath("./text()").extract()[0] == u'后200条':  # "next 200 entries"
            url = self.domain + index.xpath("./@href").extract()[0]
            if url not in self.visited_url:
                self.visited_url.add(url)
                yield self.make_requests_from_url(url).replace(callback=self.parse_category)
def run(self):
    url = 'http://www.sto.cn/Home/Index'
    self.driver.get(url)
    css_seletor = 'li.order-search'
    self.driver.find_element_by_css_selector('li.order-search textarea').send_keys('3367154640058')
    self.driver.find_element_by_css_selector('li.order-search div.btn_order_search input').click()
    sleep(5)
    # div.layui-layer-content
    # driver.save_screenshot('申通.jpg')
    body = self.driver.page_source
    # print(body)
    bg_pic = Selector(text=body).css('img.yidun_bg-img::attr("src")').extract_first()
    slide_pic = Selector(text=body).css('img.yidun_jigsaw::attr("src")').extract_first()
    print('slider background:', bg_pic, ' jigsaw piece:', slide_pic)

    # Crack the slider captcha
    # Wait for the captcha to pop up
    # bg_pic = self.wait.until(EC.presence_of_element_located((By.CLASS_NAME, "yidun_bg-img")))
    bg_pic = self.wait.until(
        EC.presence_of_element_located((By.CLASS_NAME, "layui-layer-page")))
    # In HTML the origin is the top-left corner: x grows rightwards, y grows downwards.
    # A positive x is the element's distance from the left edge of the screen,
    # a positive y its distance from the top, so the four crop bounds we need are:
    top, bottom, left, right = (bg_pic.location['y'],
                                bg_pic.location['y'] + bg_pic.size['height'],
                                bg_pic.location['x'],
                                bg_pic.location['x'] + bg_pic.size['width'])
    print('top: {0}, bottom: {1}, left: {2}, right: {3}'.format(top, bottom, left, right))
    sleep(1)
    cp1 = self.crop(left, top, right, bottom, '12.png')
    # Find the slider knob and click it once
    slide = self.wait.until(
        EC.presence_of_element_located((By.CLASS_NAME, "yidun_slider")))
    slide.click()
    sleep(3)  # wait 3s for the error message to disappear; TODO this could be improved
    cp2 = self.crop(left, top, right, bottom, '2.png')
    move = self.calc_move(cp1, cp2)
    result = self.path1(move)
    # result = self.path2(move)
    # Drag the slider
    ActionChains(self.driver).click_and_hold(slide).perform()
    for x in result:
        ActionChains(self.driver).move_by_offset(xoffset=x[0], yoffset=x[1]).perform()
        # ActionChains(driver).move_to_element_with_offset(to_element=slide, xoffset=x[0], yoffset=x[1]).perform()
        sleep(x[-1])  # sleeping is required when using path1
    sleep(0.5)
    ActionChains(self.driver).release(slide).perform()  # release the button
    sleep(0.8)
def parse(self, response):
    driver = response.meta['driver']
    for item in self.params:
        search_box = driver.find_element_by_xpath("//input[@placeholder='Search for your personal TLD']")
        search_box.send_keys(item)
        # time.sleep(2)
        search_box.send_keys(Keys.ENTER)
        time.sleep(3)
        html = driver.page_source
        response = Selector(text=html)
        # Defaults guard against a NameError when no branch below matches.
        status, text, days, blocks = "Unknown", None, None, None
        if response.xpath("//div[@class='desktop-bid-card']"):
            status = "Live"
            text = None
            days = response.xpath("//div[text()='Time left to bid (est.)']/preceding-sibling::div/text()").get()
            blocks = response.xpath("//div[text()='Blocks left to bid']/preceding-sibling::div/text()").get()
        elif response.xpath("//div[text()='Buy now']"):
            status = "Buy Now"
            text = response.xpath("//div[text()='HNS']/text()").get()
            days = None
            blocks = None
        elif response.xpath("//div[text()='Auction over']"):
            status = "Auction Over"
            text = response.xpath("//div[text()='Auction over']/following-sibling::div/text()").get()
            days = None
            blocks = None
        elif response.xpath("//div[text()='Already taken']"):
            status = "Already Taken"
            text = response.xpath("//div[text()='Already taken']/following-sibling::div/text()").get()
            days = None
            blocks = None
        elif response.xpath("//div[text()='Coming soon']"):
            status = "Coming Soon"
            text = response.xpath("//div[text()='Coming soon']/following-sibling::div/text()").get()
            days = response.xpath("//div[text()='Available in (est.)']/preceding-sibling::div/text()").get()
            blocks = response.xpath("//div[text()='Blocks until release']/preceding-sibling::div/text()").get()
        yield {
            'Word': item,
            'Status': status,
            'Text': text,
            'Blocks': blocks,
            'Days': days,
        }
        back_button = driver.find_element_by_xpath("//a[text()='Top-level domain']")
        back_button.click()
        time.sleep(3)
def detail_index(self, index_url):
    response = requests.get(url=index_url, headers=self.headers)
    selector = Selector(text=response.text)
    img_url = selector.css('.type_production img::attr(src)').extract()[1]
    zh_name = selector.css('.pro_tit .a1::text').extract()
    us_name = selector.css('.pro_tit .a2::text').extract()
    jianjie = selector.css('.produc_table .a2::text').extract()  # brief-intro fields
    cankaojiage = selector.css('.produc_table .a3::text').extract_first('')  # reference price
    hengliangbiaozhui = selector.css('.pingfen .sp2 em::attr(style)').extract()  # rating bars
    text = selector.css('.prod_slidebox ul li .text p::text').extract()
    # Rating-bar widths. The original looped over the list but assigned the
    # same fixed indices every iteration; plain assignments are equivalent.
    nianren = hengliangbiaozhui[0].replace('width:', '')       # clinginess
    xijiao = hengliangbiaozhui[1].replace('width:', '')        # barking
    diaomao = hengliangbiaozhui[2].replace('width:', '')       # shedding
    tiwei = hengliangbiaozhui[3].replace('width:', '')         # body odour
    meirong = hengliangbiaozhui[4].replace('width:', '')       # grooming
    youxian = hengliangbiaozhui[5].replace('width:', '')       # friendliness to children
    shengren = hengliangbiaozhui[6].replace('width:', '')      # attitude to strangers
    dongwu = hengliangbiaozhui[7].replace('width:', '')        # attitude to other animals
    yundongliang = hengliangbiaozhui[8].replace('width:', '')  # exercise needs
    kexunxing = hengliangbiaozhui[9].replace('width:', '')     # trainability
    koushui = hengliangbiaozhui[10].replace('width:', '')      # drooling
    naihan = hengliangbiaozhui[11].replace('width:', '')       # cold tolerance
    naire = hengliangbiaozhui[12].replace('width:', '')        # heat tolerance
    shiying = hengliangbiaozhui[13].replace('width:', '')      # adaptability
    # Brief-intro fields
    bieming = jianjie[0].replace('\xa0', '')      # alias
    fenbuquyu = jianjie[1].replace('\xa0', '')    # distribution area
    yuanchandi = jianjie[2].replace('\xa0', '')   # place of origin
    tixing = jianjie[3].replace('\xa0', '')       # body type
    gongneng = jianjie[4].replace('\xa0', '')     # function
    fenzu = jianjie[5].replace('\xa0', '')        # group
    shengao = jianjie[6].replace('\xa0', '')      # height
    tizhong = jianjie[7].replace('\xa0', '')      # weight
    souming = jianjie[8].replace('\xa0', '')      # lifespan
    tidian = jianjie[10].replace('\xa0', '')      # characteristics
    return {
        '封 面:': img_url,
        '中文名字 :': zh_name,
        '英文名字 :': us_name,
        '别 名:': bieming,
        '分布区域 :': fenbuquyu,
        '原 产 地:': yuanchandi,
        '体 型:': tixing,
        '功 能:': gongneng,
        '分 组:': fenzu,
        '身 高:': shengao,
        '体 重:': tizhong,
        '寿 命:': souming,
        '参考价格 :': cankaojiage,
        '特 点:': tidian,
        '粘人程度 :': nianren,
        '喜叫程度:': xijiao,
        '掉毛程度:': diaomao,
        '体味程度:': tiwei,
        '美容程度:': meirong,
        '对小孩友善程度:': youxian,
        '对生人程度:': shengren,
        '对动物程度:': dongwu,
        '运动量:': yundongliang,
        '可训练性:': kexunxing,
        '口水程度:': koushui,
        '耐寒程度:': naihan,
        '耐热程度:': naire,
        '城市适应度:': shiying,
        '简介: ': str(text)
    }
def parse_songdata(self, response):
    responseSelector = Selector(response)
    item = LyricsscraperItem()

    ## Non Translated data
    songLyricswithExtra = remove_tags(
        responseSelector.xpath(
            '//*[@id="genesis-content"]/article/*[@class="entry-content"]//pre')[0].extract())
    songLyrics = "".join([
        char for char in songLyricswithExtra
        if ((char not in string.digits) and
            (char not in string.ascii_letters) and
            (char not in removepunc))
    ]).strip()
    songLyrics = songLyrics.replace("∆", "")
    item["songLyrics"] = songLyrics
    songLyrics = songLyrics.replace("\n", "")
    songLyrics = songLyrics.replace("\t", "")
    songLyrics = "".join([
        char for char in songLyrics if (char not in string.punctuation)
    ]).strip()
    item["songLyricsSearchable"] = songLyrics

    string_viewcount_data = remove_tags(
        responseSelector.xpath('//*[@class="tptn_counter"]')[0].extract())
    string_viewcount = re.sub('[^0-9,]', "", string_viewcount_data).replace(',', '')
    viewcount = int(string_viewcount.replace(",", ""))
    item["views"] = viewcount

    shareobj = responseSelector.xpath('//*[@class="swp_count"]')
    if len(shareobj) == 0:
        shareobj = responseSelector.xpath('//*[@class="swp_count "]')
    if len(shareobj) > 0:
        string_sharecount_data = remove_tags(shareobj[0].extract())
        string_sharecount = re.sub('[^0-9,]', "", string_sharecount_data).replace(',', '')
        sharecount = int(string_sharecount)
        item["shares"] = sharecount

    titlestring = remove_tags(
        responseSelector.xpath(
            '//*[@id="genesis-content"]/article/*[@class="entry-content"]/h2')[0].extract())
    if "-" in titlestring:
        titles = [i.strip() for i in titlestring.split("-")]
        item["title"] = titles[1]
    elif "|" in titlestring:
        titles = [i.strip() for i in titlestring.split("|")]
        item["title"] = titles[1]
    elif "–" in titlestring:
        titles = [i.strip() for i in titlestring.split("–")]
        item["title"] = titles[1]
    else:
        item["title"] = titlestring.strip()

    musicInfoString = remove_tags(
        responseSelector.xpath(
            '//*[@id="genesis-content"]/article/*[@class="entry-content"]/h3')[0].extract())
    if "-" in musicInfoString:
        musicInfo = [i.strip() for i in musicInfoString.split("-")]
        item["key"] = musicInfo[0].replace("Key:", "").strip()
        item["beat"] = musicInfo[1].replace("Beat:", "").strip()
    elif "|" in musicInfoString:
        musicInfo = [i.strip() for i in musicInfoString.split("|")]
        item["key"] = musicInfo[0].replace("Key:", "").strip()
        item["beat"] = musicInfo[1].replace("Beat:", "").strip()

    item['url'] = response.url

    gotNamesfromElement = False
    artistInfoObject = responseSelector.xpath(
        '//*[@id="genesis-content"]/article//*[@class="artist-name"]')
    if len(artistInfoObject) > 0:
        aristInfoString = remove_tags(artistInfoObject[0].extract())
        aristInfoString = aristInfoString.replace("|", "/")
        artistNames = aristInfoString.split("/")
        isascii = lambda s: len(s) == len(s.encode())
        sinhalaArtistNamesArray = []
        for i in artistNames:
            if not isascii(i):
                sinhalaArtistNamesArray.append(i)
        item['artist'] = sinhalaArtistNamesArray
        if len(sinhalaArtistNamesArray) > 0:
            gotNamesfromElement = True

    ## Translated Data
    songInfo = responseSelector.xpath(
        '//*[@id="genesis-content"]/article/*[@class="entry-content"]/*[@class="su-row"]//ul/li')
    for i in range(0, len(songInfo)):
        headstring = remove_tags(songInfo[i].extract())
        if "Artist:" in headstring and not gotNamesfromElement:
            artiststring = headstring.replace("Artist:", "").strip()
            artists = [i.strip() for i in artiststring.split(",")]
            item['artist'] = translate_array(artists)
        elif "Genre:" in headstring:
            genrestring = headstring.replace("Genre:", "").strip()
            genre = [i.strip() for i in genrestring.split(",")]
            item['genre'] = translate_array(genre)
        elif "Lyrics:" in headstring:
            writerstring = headstring.replace("Lyrics:", "").strip()
            writers = [i.strip() for i in writerstring.split(",")]
            item['writer'] = translate_array(writers)
        elif "Music:" in headstring:
            composerstring = headstring.replace("Music:", "").strip()
            composers = [i.strip() for i in composerstring.split(",")]
            item['composer'] = translate_array(composers)
        elif "Movie:" in headstring:
            item['movie'] = translate_word(headstring.replace("Movie:", "").strip())
    return item
data = None
urls = []
new_list = []
ming = []
xie = []
hao = []
lei = []
number = []
print(u'\nCrawling data, please wait...')
# Start URL; note the headers required for the request
html = gethtml(
    r'http://search.jd.com/Search?keyword=%E4%BA%BA%E6%B0%91%E6%96%87%E5%AD%A6%E5%87%BA%E7%89%88%E7%A4%BE&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E4%BA%BA%E6%B0%91%E6%96%87%E5%AD%A6%E5%87%BA%E7%89%88%E7%A4%BE&sid=1000005720&ev=publishers_%E4%BA%BA%E6%B0%91%E6%96%87%E5%AD%A6%E5%87%BA%E7%89%88%E7%A4%BE%5E&psort=3&click=0'
)
ss = Selector(text=html).xpath("//ul[@class='gl-warp clearfix']")  # top-level tag
file = Selector(text=html).xpath("//a[@class='crumb-select-item']/em/text()").extract()
for s in ss:
    # print(s)
    label = s.xpath("//li/div[@class='gl-i-wrap']/div[1]/@class").extract()
    url = s.xpath("./li/div[@class='gl-i-wrap']/div[@class='p-name']/a/@href").extract()
    for i in range(1):
        if label[i] == 'gl-i-tab':
            url[i] = s.xpath(
                ".//div[@class='tab-content-item tab-cnt-i-selected']/div[@class='p-name']/a/@href"
            ).extract()
            urls.append(url[i][1])
def parse_details(self, response):
    metadata = response.meta['userdata']
    metadata['url'] = response.url
    sel = Selector(response)
    name = self.fetch_name(response)
    if name:
        metadata['name'] = name
    detail = self.fetch_details(response)
    if detail:
        metadata['details'] = detail
    model = self.fetch_model(response)
    if model:
        metadata['model'] = model
    else:
        return
    description = self.fetch_description(response)
    if description:
        metadata['description'] = description
    ret = self.fetch_price(response)
    if 'price' in ret:
        metadata['price'] = ret['price']
    if 'price_discount' in ret:
        metadata['price_discount'] = ret['price_discount']
    colors = self.fetch_color(response)
    if colors:
        metadata['color'] = colors
    # image_urls = sel.xpath('//div[@id="itemContent"]//img/@src').extract()
    # Collect the images
    hdr = None
    tail = None
    img0 = sel.xpath('//meta[@property="og:image" and @content]/@content').extract()
    if img0:
        img0 = img0[0]
        mt = re.search(r'(.+)_\d+_\w(\..+)$', img0)
        if mt:
            hdr = mt.group(1)
            tail = mt.group(2)
    idx = response.body.find('jsinit_item')
    img_item = None
    if idx != -1:
        tmp = response.body[idx:]
        idx = tmp.find('ALTERNATE')
        if idx != -1:
            try:
                img_item = json.loads(cm.extract_closure(tmp[idx:], r'\[', r'\]')[0])
            except ValueError:
                pass
    image_urls = []
    if hdr and tail and img_item:
        for item in img_item:
            mt = re.search(r'(\d+)_\w', item)
            if not mt:
                continue
            start_idx = int(mt.group(1))
            for idx in xrange(start_idx, 15):
                tmp = re.sub(r'\d+_(\w)', str.format(r'{0}_\1', idx), item)
                image_urls.append(str.format('{0}_{1}{2}', hdr, tmp, tail))
    item = ProductItem()
    item['url'] = metadata['url']
    item['model'] = metadata['model']
    item['image_urls'] = image_urls
    item['metadata'] = metadata
    yield item
def selector(self):
    from scrapy.selector import Selector
    if self._cached_selector is None:
        self._cached_selector = Selector(self)
    return self._cached_selector
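# The method above mirrors the lazily cached `selector` property on Scrapy's
# TextResponse: the first access builds a Selector from the response and later
# accesses reuse it, which is what makes repeated response.xpath()/response.css()
# calls cheap. A minimal sketch of the same caching pattern on a plain class
# (FakeResponse and its `text` field are illustrative only):
from scrapy.selector import Selector

class FakeResponse(object):
    """Illustrative stand-in for an object carrying HTML text."""

    def __init__(self, text):
        self.text = text
        self._cached_selector = None

    @property
    def selector(self):
        # Build the Selector once, then reuse it on every later access.
        if self._cached_selector is None:
            self._cached_selector = Selector(text=self.text)
        return self._cached_selector

resp = FakeResponse('<html><body><span>good</span></body></html>')
print(resp.selector.xpath('//span/text()').extract_first())  # 'good'
print(resp.selector is resp.selector)  # True: the same cached instance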
def parse(self):
    while True:
        if self.index > 48:
            print('-' * 100 + 'One full crawl pass finished')
            print()
            print('-' * 100 + 'About to start crawling again....')
            ip_object = IpPools(type=self.ip_pool_type)
            self.proxies = ip_object.get_proxy_ip_from_ip_pool()  # fetch a fresh proxy pool
            self.index = 1
        else:
            sleep(5)
        tmp_number = randint(1, 8)  # random number used to pick a random page range to crawl
        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
        tmp_index = 1
        for i in range(0, 49):  # loop over the categories
            bozhu = {}
            if self.index == 49:
                break
            tmp_type = self.species[self.index][1]
            number = self.species[self.index][0]
            domain = '102803_ctg1_{}_-_ctg1_{}'.format(str(number), str(number))
            id = domain
            tmp_pagebar_index = 0
            tmp_pre_page_index = 1
            tmp_page_index = 1
            # Pitfall (most hot pages won't scroll past ~30 pages): after working
            # out the pattern, it turned out that different hot pages stop
            # returning data once you scroll down to a certain page count.
            for count in self.page_range[tmp_number]:
                if tmp_index % 50 == 0:  # reconnect every 50 requests so a long-lived connection doesn't go stale and error out
                    print('Resetting and establishing a new database connection...')
                    my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                    print('New database connection established...')
                if my_pipeline.is_connect_success:
                    print('============| Crawling the content of page %d ...... |' % (count + 1,))
                    # Work out pagebar:
                    #          5            11           17
                    # pagebar: 0 1 2 3 4 -  0 1 2 3 4 -  0 1 2 3 4 - ....
                    if tmp_pagebar_index > 5:  # keep it from exceeding 5
                        tmp_pagebar_index = 0
                    pagebar = str(self.pagebar[tmp_pagebar_index])
                    current_page = str(count + 1)
                    script_uri = r'/102803_ctg1_{}_-_ctg1_{}'.format(str(number), str(number))
                    domain_op = domain
                    # e.g. 1506471533330
                    __rnd = str(15064) + ''.join(str(randint(1, 9)) for _ in range(8))
                    # __rnd = str(1506471533330)
                    if count % 6 == 0:  # observed: pre_page increments when count is a multiple of 6
                        tmp_pre_page_index += 1
                    pre_page = str(tmp_pre_page_index)
                    if (count + 1) % 6 == 0:  # observed: page increments when count+1 is a multiple of 6
                        tmp_page_index += 1
                    page = str(tmp_page_index)
                    url = 'https://d.weibo.com/p/aj/v6/mblog/mbloglist?ajwvr=6&from=faxian_hot&mod=fenlei&tab=home&pl_name=Pl_Core_NewMixFeed__3&feed_type=1&domain={}&pagebar={}&current_page={}&id={}&script_uri={}&domain_op={}&__rnd={}&pre_page={}&page={}' \
                        .format(domain, pagebar, current_page, id, script_uri, domain_op, __rnd, pre_page, page)
                    print(url)
                    sleep(2)  # wait a bit so Weibo doesn't redirect the page
                    # Observed: after crawling a number of pages the site redirects and
                    # stops responding for a long time, hence the intermittent sleeps.
                    # if count == 50 or count == 100 or count == 150 or count == 200 or count == 250:
                    #     print('============| >>>>>> The crawler is sleeping ...... <<<<<<')
                    #     time.sleep(100)
                    tmp_html = self.get_url_body(url=url)
                    if len(tmp_html) <= 100000:
                        print('==========| content["data"] came back empty; the crawler takes a short nap ....... |')
                        print('==========| Please wait, crawling is about to resume ------>>>>>')
                        sleep(2)
                        tmp_html = self.get_url_body(url=url)
                    # print(tmp_html)
                    for item in Selector(text=tmp_html).css('div.face a').extract():
                        tmp_nick_name = Selector(text=item).css('img::attr("title")').extract_first()
                        tmp_head_img_url = 'https:' + Selector(text=item).css('img::attr("src")').extract_first()
                        bozhu['nick_name'] = self.wash_nick_name(nick_name=tmp_nick_name)
                        bozhu['sina_type'] = tmp_type
                        bozhu['head_img_url'] = re.compile(r'\.50/').sub('.180/', tmp_head_img_url)
                        print('---->> ', [tmp_nick_name, tmp_type, tmp_head_img_url])
                        # yield bozhu
                        my_pipeline.insert_into_sina_weibo_table(item=bozhu)
                        gc.collect()
                    print('============| Finished crawling the content of page %d |' % (count + 1,))
                    tmp_pagebar_index += 1
                else:
                    print('Database connection failed!')
                tmp_index += 1
            self.index += 1  # move to the next category index
'''
@author = super_fazai
@File : demo.py
@Time : 2017/8/20 10:33
@connect : [email protected]
'''

"""
A Scrapy selector is an instance of the Selector class, constructed by
passing text or a TextResponse object. It automatically picks the best
parsing rules (XML vs HTML) based on the input type.
"""

from scrapy.selector import Selector
from scrapy.http import HtmlResponse

# Constructing from text
body = '<html><body><span>good</span></body></html>'
print(Selector(text=body).xpath('//span/text()').extract())

# Constructing from a response
response = HtmlResponse(url='https://sebastianraschka.com/blog/index.html',
                        body=body, encoding='utf-8')
print(
    Selector(response=response).xpath(
        '//*/h1[@class="post-title"]/text()').extract())
# The call above is equivalent to this one:
print(response.selector.xpath('//*/h1[@class="post-title"]/text()').extract())

response = r"""
<html>
 <head>
  <base href='http://example.com/' />
"www.lagou.com", "Referer": "https://www.lagou.com/", "Upgrade-Insecure-Requests": "1", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36", } url = 'https://www.lagou.com/zhaopin/Python/?labelWords=label' # url = 'https://www.lagou.com/jobs/positionAjax.json?city=%E5%8C%97%E4%BA%AC&needAddtionalResult=false' r = requests.get(url, headers=headers) # print(r.status_code) # print(r.text) from scrapy.selector import Selector s = Selector(text=r.text) all_links = s.css('.position_link::attr(href)').getall() print(all_links) for link in all_links: r = requests.get(link, headers=headers) print(r.status_code) print(r.url) s = Selector(text=r.text) data = {} title = s.css('.job-name::attr("title")').get() detail = s.css('.job-detail').get() data['title'] = title data['detail'] = detail print(data)
def qa_collect(self, asin, country='us'):
    try:
        product = Product.objects.filter(asin=asin, country=country)[0]
        page = get_url('ask/questions/asin/' + asin + '/ref=ask_ql_psf_ql_hza?sort=SUBMIT_DATE', country)
        tree = fromstring(page.content)
        print(tree.findtext('.//title'))
        if tree.findtext('.//title') == 'Robot Check' or tree.findtext('.//title') == 'Amazon CAPTCHA':
            info = {'to': 0}
            return info
        if Selector(text=page.content).xpath('.//*[@id="noResultsTitle"]'):
            info = {'to': 0}
            return info
        qa_collection = {}
        if Selector(text=page.content).xpath("//ul[@class='a-pagination']/li[@class='a-last']//a/@href"):
            page_num = 0
            while True:
                boxes = Selector(text=page.content).xpath(".//*[@class='a-section askTeaserQuestions']/div[@class='a-fixed-left-grid a-spacing-base']")
                for box in boxes:
                    # answer_quan is initialised alongside the others; otherwise a
                    # value could leak from the previous iteration (or raise NameError).
                    answer_url, answer, answer_user, qa_time, answer_quan = None, None, None, None, None
                    vote = int(box.xpath(".//ul[@class='vote voteAjax']/li[2]/span[1]/text()").extract()[0])
                    question = box.xpath(".//div[@class='a-fixed-left-grid a-spacing-small']//a[@class='a-link-normal']/text()").extract()[0]
                    try:
                        qa_time = box.xpath(".//div[@class='a-fixed-left-grid a-spacing-base']//div[@class='a-fixed-left-grid-col a-col-right']/span[2]/text()").extract()[-1:][0]
                    except:
                        pass
                    try:
                        if box.xpath(".//div[@class='a-fixed-left-grid a-spacing-base']//div[@class='a-fixed-left-grid-col a-col-right']/span[1]/text()").extract() and country != 'jp':
                            answer = box.xpath(".//div[@class='a-fixed-left-grid a-spacing-base']//div[@class='a-fixed-left-grid-col a-col-right']/span[1]/text()").extract()[0]
                        elif box.xpath(".//div[@class='a-fixed-left-grid a-spacing-base']//div[@class='a-fixed-left-grid-col a-col-right']/span[1]/text()").extract() and country == 'jp':
                            answer = box.xpath(".//div[@class='a-fixed-left-grid a-spacing-base']//div[@class='a-fixed-left-grid-col a-col-right']/span[1]/text()").extract()[0]
                            if answer == "":
                                try:
                                    answer = " ".join(box.xpath(".//span[@class='askLongText']/text()").extract()).strip()
                                except:
                                    pass
                        else:
                            answer = " ".join(box.xpath(".//span[@class='askLongText']/text()").extract()).strip()
                    except:
                        pass
                    try:
                        answer_user = box.xpath(".//div[@class='a-fixed-left-grid a-spacing-base']//div[@class='a-fixed-left-grid-col a-col-right']/span[2]/text()").extract()[0]
                    except:
                        pass
                    try:
                        answer_quan = box.xpath(".//div[@class='a-fixed-left-grid a-spacing-base']//div[@class='a-section a-spacing-none a-spacing-top-mini']/a/text()").extract()[0]
                        answer_quan = re.search(r'\d+', answer_quan).group(0)
                    except:
                        pass
                    try:
                        answer_url = box.xpath(".//div[@class='a-fixed-left-grid a-spacing-base']//div[@class='a-section a-spacing-none a-spacing-top-mini']/a/@href").extract()[0]
                        answer_url = country_url(country)[:-1] + answer_url
                        # print("answer_url:", answer_url)
                    except:
                        pass
                    # print(answer_user, qa_time)
                    if answer_user is None:
                        pass
                    elif answer_user == qa_time:
                        if country in ['us', 'uk', 'ca', 'de']:
                            name_date = re.split(' on |By |Von | am ', answer_user)
                        elif country == 'it':
                            name_date = re.split(' in |Da ', answer_user)
                        elif country == 'fr':
                            name_date = re.split(' le |Par ', answer_user)
                        elif country == 'es':
                            name_date = re.split(' el |Por ', answer_user)
                        elif country == 'jp':
                            name_date = re.split('投稿者: |、投稿日: ', answer_user)
                        answer_user = name_date[1]
                        qa_time = name_date[2]
                    else:
                        answer_user = re.split(' on |By |Von | am ', answer_user)[-1:][0]
                        qa_time = re.split(' on |By |Von | am ', qa_time)[-1:][0]
                    if answer_url and answer_quan:
                        qa_collection[question] = {'vote': vote, 'question': question, 'qa_time': qa_time.strip(), 'answer': answer, 'answer_user': answer_user.strip(), 'answer_quan': answer_quan, 'answer_url': answer_url}
                    elif answer:
                        qa_collection[question] = {'vote': vote, 'question': question, 'qa_time': qa_time.strip(), 'answer': answer, 'answer_user': answer_user.strip()}
                print(len(qa_collection))
                if Selector(text=page.content).xpath("//ul[@class='a-pagination']/li[@class='a-last']//a/@href") and page_num < 200:
                    time.sleep(2 + random.random() * 5)
                    page = get_url((Selector(text=page.content).xpath("//ul[@class='a-pagination']/li[@class='a-last']//a/@href")).extract()[0], country=country)
                    page_num += 1
                else:
                    break
        else:
            boxes = Selector(text=page.content).xpath(".//*[@class='a-section askTeaserQuestions']/div[@class='a-fixed-left-grid a-spacing-base']")
            for box in boxes:
                answer_url, answer, answer_user, qa_time, answer_quan = None, None, None, None, None
                vote = int(box.xpath(".//ul[@class='vote voteAjax']/li[2]/span[1]/text()").extract()[0])
                question = box.xpath(".//div[@class='a-fixed-left-grid a-spacing-small']//a[@class='a-link-normal']/text()").extract()[0]
                try:
                    qa_time = box.xpath(".//div[@class='a-fixed-left-grid a-spacing-base']//div[@class='a-fixed-left-grid-col a-col-right']/span[2]/text()").extract()[-1:][0]
                except:
                    pass
                try:
                    if box.xpath(".//div[@class='a-fixed-left-grid a-spacing-base']//div[@class='a-fixed-left-grid-col a-col-right']/span[1]/text()").extract():
                        answer = box.xpath(".//div[@class='a-fixed-left-grid a-spacing-base']//div[@class='a-fixed-left-grid-col a-col-right']/span[1]/text()").extract()[0]
                    else:
                        answer = " ".join(box.xpath(".//span[@class='askLongText']/text()").extract()).strip()
                except:
                    pass
                try:
                    answer_user = box.xpath(".//div[@class='a-fixed-left-grid a-spacing-base']//div[@class='a-fixed-left-grid-col a-col-right']/span[2]/text()").extract()[0]
                except:
                    pass
                try:
                    answer_quan = box.xpath(".//div[@class='a-fixed-left-grid a-spacing-base']//div[@class='a-section a-spacing-none a-spacing-top-mini']/a/text()").extract()[0]
                    answer_quan = re.search(r'\d+', answer_quan).group(0)
                except:
                    pass
                try:
                    answer_url = box.xpath(".//div[@class='a-fixed-left-grid a-spacing-base']//div[@class='a-section a-spacing-none a-spacing-top-mini']/a/@href").extract()[0]
                    answer_url = country_url(country)[:-1] + answer_url
                except:
                    pass
                if answer_user is None:
                    pass
                elif answer_user == qa_time:
                    if country in ['us', 'uk', 'ca', 'de']:
                        name_date = re.split(' on |By |Von | am ', answer_user)
                    elif country == 'it':
                        name_date = re.split(' in |Da ', answer_user)
                    elif country == 'fr':
                        name_date = re.split(' le |Par ', answer_user)
                    elif country == 'es':
                        name_date = re.split(' el |Por ', answer_user)
                    elif country == 'jp':
                        name_date = re.split('投稿者: |、投稿日: ', answer_user)
                    answer_user = name_date[1]
                    qa_time = name_date[2]
                else:
                    answer_user = re.split(' on |By |Von | am ', answer_user)[-1:][0]
                    qa_time = re.split(' on |By |Von | am ', qa_time)[-1:][0]
                if answer_url and answer_quan:
                    qa_collection[question] = {'vote': vote, 'question': question, 'qa_time': qa_time, 'answer': answer, 'answer_user': answer_user, 'answer_quan': answer_quan, 'answer_url': answer_url}
                elif answer:
                    qa_collection[question] = {'vote': vote, 'question': question, 'qa_time': qa_time, 'answer': answer, 'answer_user': answer_user}
        for qa in qa_collection:
            try:
                num = qa_collection[qa]['answer_quan']
            except:
                num = "1"
            try:
                # if qa_collection[qa]['answer_url']:
                QA_detail.objects.get_or_create(product=product, vote=qa_collection[qa]['vote'], question=qa_collection[qa]['question'], qa_time=qa_collection[qa]['qa_time'],
                                                answer=qa_collection[qa]['answer'], answer_person=qa_collection[qa]['answer_user'], num=num, answer_url=qa_collection[qa]['answer_url'])
            except:
                QA_detail.objects.get_or_create(product=product, vote=qa_collection[qa]['vote'], question=qa_collection[qa]['question'], qa_time=qa_collection[qa]['qa_time'],
                                                answer=qa_collection[qa]['answer'], answer_person=qa_collection[qa]['answer_user'], num=num)
        # except:
        #     pass
        # report = GlucoseCsvReport(product)
        # report.email(product.user, 'subject', 'message')
    except Exception as e:
        dt = datetime.now(pytz.utc) + timedelta(seconds=40)
        self.retry(eta=dt, exc=e, max_retries=2)
def parse_detail(self, response):
    if response.meta:
        response = response.meta['res']
    totalpage = int(response.xpath('//label[@class="ui-label"]/text()').extract_first().split('/')[-1])
    for page in range(totalpage):
        # # Capacity
        # capacities = response.xpath('//span[@class="first"]')
        # # Colour
        # colors = response.xpath('//span[contains(string(.),"Color")]')
        # # Logistics
        # Logistics = response.xpath('//span[contains(string(.),"Logistics")]')
        infos = response.xpath('//div[@class="user-order-info"]')
        # Date and time
        datetimes = response.xpath('//dl[@class="buyer-review"]/dd[@class="r-time"]/text()').extract()
        # Country
        countries = response.xpath('//div[@class="user-country"]/b/text()').extract()
        # Images
        image_urls = response.xpath('//ul[@class="util-clearfix"]/li/img/@src').extract()
        for i in range(len(countries)):
            item = Item()
            # # Capacity
            # capacity = capacities[i].xpath('string(.)').extract_first()
            # capacity = capacity.replace('\\t', '').replace('\\n', '')
            # # capacity = re.search("\d+-\d+ml", capacity).group(0)
            # # Colour
            # color = colors[i].xpath('string(.)').extract_first()
            # color = color.split(":")[-1].strip()
            # # Logistics
            # logistics = Logistics[i].xpath('string(.)').extract_first()
            # logistics = logistics.split(":")[-1].strip()
            spans = infos[i].xpath('span')
            item[Item.INFOS] = {}
            for span in spans:
                key = span.xpath('strong/text()').extract_first().replace(':', '')
                value = span.xpath('string(.)').extract_first()
                value = value.split(":")[-1].replace('\t', '').replace('\n', '').replace(' ', '')
                item[Item.INFOS][key] = value
            # Date and time
            datetime = datetimes[i]
            # Country
            country = countries[i]
            # Images
            if i == 0:
                item[Item.IMAGE_URLS] = image_urls
            else:
                item[Item.IMAGE_URLS] = None
            # item[Item.CAPACITY] = capacity
            # item[Item.COLOR] = color
            # item[Item.LOGISTICS] = logistics
            item[Item.DATETIME] = datetime
            item[Item.COUNTRY] = country
            item[Item.PRODUCT_ID] = self.product_id
            yield item
        if response.xpath('//a[contains(text(),"Next")]'):
            time.sleep(3)
            self.driver.find_element_by_xpath(
                '//div[@class="ui-pagination ui-pagination-front ui-pagination-pager util-right"]/a[contains(text(),"Next")]'
            ).click()
            response = self.driver.page_source
            response = Selector(text=response)
        else:
            break
    self.driver.close()
def __init__(self):
    self.driver = webdriver.Firefox()
    self.report_selector = Selector(text="")
def parse(self, response):
    self.log('Hi, this is: %s' % response.url)
    hxs = Selector(response)
    dls = hxs.xpath('//div[@id = "a-bit-more-about"]/dl')
    item = FlickrProfileItem()
    # item['_id'] = self.get_username(response.url)
    _id = self.get_username(response.url)
    print(_id)
    item["_id"] = _id
    for dl in dls:
        if dl.xpath('dt/text()').extract()[0] == "Name:":
            given_name = ""
            family_name = ""
            try:
                given_name = dl.xpath('dd/span[@class="given-name"]/text()').extract()[0]
            except:
                pass
            else:
                print('given_name:', given_name)
                # item['given_name'] = given_name
            try:
                family_name = dl.xpath('dd/span[@class = "family-name"]/text()').extract()[0]
            except:
                pass
            else:
                print('family_name:', family_name)
                # item['family_name'] = family_name
            item["name"] = given_name + " " + family_name
        if dl.xpath('dt/text()').extract()[0] == "Joined:":
            joined = dl.xpath('dd/text()').extract()[0]
            print('joined time:', joined)
            item['joined'] = joined
        if dl.xpath('dt/text()').extract()[0] == "Hometown:":
            home = dl.xpath('dd/text()').extract()[0]
            print('hometown:', home)
            item['hometown'] = home
        if dl.xpath('dt/text()').extract()[0] == "Currently:":
            try:
                locality = dl.xpath('dd/span[@class = "adr"]/span[@class = "locality"]/text()').extract()[0]
            except:
                pass
            else:
                print('locality:', locality)
                item['location'] = locality
            try:
                country_name = dl.xpath('dd/span[@class = "adr"]/span[@class = "country-name"]/text()').extract()[0]
            except:
                pass
            else:
                print('country-name:', country_name)
                item['country'] = country_name
        if dl.xpath('dt/text()').extract()[0] == "I am:":
            gender = dl.xpath('dd/text()').extract()[0].strip()
            print('gender:', gender)
            item['gender'] = gender
        if dl.xpath('dt/text()').extract()[0] == "Occupation:":
            occupation = dl.xpath('dd/text()').extract()[0]
            print('occupation:', occupation)
            item['occupation'] = occupation
        if dl.xpath('dt/text()').extract()[0] == "Website:":
            websitename = dl.xpath('dd/a/text()').extract()[0]
            websiteurl = dl.xpath('dd/a/@href').extract()[0]
            print('website:', websitename, websiteurl)
            item['websitename'] = websitename
            item['websiteurl'] = websiteurl
    yield item
def parse_item(self, response):
    items = []
    sel = Selector(response)
    name_list = sel.xpath('//td[@class="zwmc"]/div/a').xpath(
        'string(.)').extract()
    link_list = sel.xpath('//td[@class="zwmc"]/div/a/@href').extract()
    firm_list = sel.xpath('//td[@class="gsmc"]/a').xpath(
        'string(.)').extract()
    salary_list = sel.xpath('//td[@class="zwyx"]').xpath(
        'string(.)').extract()
    workplace_list = sel.xpath('//td[@class="gzdd"]').xpath(
        'string(.)').extract()
    pubdate_list = sel.xpath('//td[@class="gxsj"]/span/text()').extract()
    firmsize_list = sel.xpath(
        '//li[@class="newlist_deatil_two"]/span[3]/text()').extract()
    workreq_list = sel.xpath(
        '//li[@class="newlist_deatil_two"]/span[4]/text()').extract()
    details_list = sel.xpath('//li[@class="newlist_deatil_last"]').xpath(
        'string(.)').extract()
    while name_list:
        item = ZhilianItem()
        try:
            item['job_name'] = name_list.pop()
            item['link'] = link_list.pop()
            item['firm_name'] = firm_list.pop()
            item['salary'] = salary_list.pop()
            item['working_place'] = workplace_list.pop()
            item['pub_date'] = pubdate_list.pop()
            item['firm_size'] = firmsize_list.pop()
            # the original referenced an undefined edureq_list here;
            # workreq_list is the list actually extracted above
            item['work_requirement'] = workreq_list.pop()
            item['job_describe'] = details_list.pop()
        except IndexError:
            pass
        items.append(item)
    return items
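Popping parallel lists like this misaligns every later field as soon as one XPath matches fewer nodes than the others. A safer sketch of the same extraction using zip(), shown here for four of the fields; zip() truncates at the shortest list, so a missing cell drops a row instead of shifting all later fields by one:

def parse_item_zipped(self, response):
    sel = Selector(response)
    rows = zip(
        sel.xpath('//td[@class="zwmc"]/div/a').xpath('string(.)').extract(),
        sel.xpath('//td[@class="zwmc"]/div/a/@href').extract(),
        sel.xpath('//td[@class="gsmc"]/a').xpath('string(.)').extract(),
        sel.xpath('//td[@class="zwyx"]').xpath('string(.)').extract(),
    )
    for job_name, link, firm_name, salary in rows:
        item = ZhilianItem()
        item['job_name'] = job_name
        item['link'] = link
        item['firm_name'] = firm_name
        item['salary'] = salary
        yield item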
class ReportSpider(scrapy.Spider):
    name = "reports"
    start_urls = ['https://h1.sintheticlabs.com/']

    def __init__(self):
        self.driver = webdriver.Firefox()
        self.report_selector = Selector(text="")

    def parse(self, response):
        report_urls = response.xpath('//tbody/tr/td[3]/a/@href').extract()
        item = HackeroneItem()
        for report_url in report_urls[1:]:
            self.driver.get(report_url)
            # Sleep for a few moments so that the page loads fully,
            # otherwise the content won't come up.
            time.sleep(2)
            self.report_selector = Selector(text=self.driver.page_source)
            item = self.parseReport()
            if item is not None:
                yield item
        # Everything is over and the browser should be quit.
        self.driver.quit()

    def parseReport(self):
        print("Report called")
        # Check whether the report is a duplicate.
        if self.report_selector.xpath(
                '//i[contains(@class, "duplicate")]').extract_first() is not None:
            self.log("Found a duplicate report")
            return None
        hid = self.get_hid()
        reward = self.get_reward()
        submission_date = self.get_submission_date()
        ending_date = self.get_end_date()
        vuln_type = self.get_vuln_type()
        severity = self.get_severity()
        item = HackeroneItem()
        item['hid'] = hid
        item['reward'] = reward
        item['submission_date'] = submission_date
        item['resolved_date'] = ending_date
        item['vuln_type'] = vuln_type
        item['severity'] = severity
        return item

    def get_hid(self):
        return self.report_selector.xpath(
            "//div[@class='report-status']/a/text()[2]").extract_first()

    def get_reward(self):
        reward = self.report_selector.xpath(
            "//tr[contains(@class, 'bounty-amount')]/td/text()").extract()
        if len(reward) == 0:
            reward = 0
        else:
            # Cast to float first so that decimal amounts are handled.
            reward = float(reward[0][1:].replace(',', ''))
            reward = int(reward)
        return reward

    def get_submission_date(self):
        return self.report_selector.xpath(
            "//span[contains(@class,'spec-timestamp')]/span/@title").extract_first()

    def get_end_date(self):
        ending_date = self.report_selector.xpath(
            "//div[contains(@data-activity, 'BugResolved')]/div[4]/div/span/@title"
        ).extract_first()
        if ending_date is None:
            ending_date = self.report_selector.xpath(
                "//div[contains(@data-activity, 'BugInformative')]/div[4]/div/span/@title"
            ).extract_first()
        # TODO Construct a better way to find reasons for a non-existent end date.
        return ending_date

    def get_vuln_type(self):
        vuln_type = self.report_selector.xpath(
            "//tr[contains(@class, 'vuln-types')]/td[2]/text()").extract()
        return ','.join(vuln_type)

    def get_severity(self):
        return self.report_selector.xpath(
            "//span[contains(@class, 'severity')]/text()").extract_first()
<li class="item-55"><a id='i55' href="link.html" class='ding'>first item</a></li>
<li class="item-66"><a id='i66' href="llink.html" class='ding'>first item</a></li>
<li class="item-77"><a href="llink2.html">second item<span>vv</span></a></li>
</ul>
</div>
</body>
<ul>
<li class="item-5"><a id='i5' href="link.html" class='ding'>first item</a></li>
<li class="item-6"><a id='i6' href="llink.html" class='ding'>first item</a></li>
<li class="item-7"><a href="llink2.html">second item<span>vv</span></a></li>
</ul>
</html>
"""

# build the response object
response = HtmlResponse(url='', body=html, encoding='utf-8')
selector = Selector(response=response)

# // searches from the document root
# grab every a tag
temp = selector.xpath('//a')
# match everything that fits div/div
temp = selector.xpath('//div/div')
# child tags
# matches nothing, because a is not a direct child of html
temp = selector.xpath('a')
# relative vs. absolute position
# take the first body tag, then look for ul tags under it
x = selector.xpath('body')[0]
# ./ul — the ul children relative to this tag
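Continuing with the x built just above, the query forms differ on a nested selector in ways that are easy to trip over; this side-by-side is standard Scrapy/parsel behaviour:

temp = x.xpath('ul')     # relative: direct ul children of body (none here)
temp = x.xpath('./ul')   # './' makes the relative scope explicit
temp = x.xpath('.//ul')  # './/' searches every descendant of body
temp = x.xpath('//ul')   # '//' restarts from the document root, so it also
                         # finds the stray ul that sits outside body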
def test_parsing(self):
    parser = EPCIZoneParser('', 2013, '', '')
    data = parser.parse(Selector(self.response))
    for key, val in self.data.items():
        self.assertAlmostEqual(data[key], val)
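A test like this needs a canned response and expected values. A minimal sketch of a setUp that supplies both from a local HTML fixture; the fixture path and the expected dictionary are assumptions for illustration, not values from the original suite:

import unittest
from scrapy.http import HtmlResponse


class EPCIZoneParserTest(unittest.TestCase):
    def setUp(self):
        # hypothetical fixture file saved from the target site
        with open('fixtures/epci_zone_2013.html', 'rb') as f:
            self.response = HtmlResponse(url='http://example.com',
                                         body=f.read())
        # illustrative expected values for assertAlmostEqual
        self.data = {'population': 12345.0}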
def _extract_img_requests(self, response, tag, counter):
    r = []
    siteList = []
    ObjectList = dict()
    externalSites = []
    # uniqueExternalSites = []
    if isinstance(response, HtmlResponse):
        tag = 'I'
        # imgcount = 0
        counterValueImg = counter
        sites = Selector(response).xpath("//img/@src").extract()
        # for site in sites:
        #     imgcount = imgcount + 1
        # logging.info('imgcount', imgcount)
        # logwr = csv.writer(logFile, delimiter=',', quotechar=' ', quoting=csv.QUOTE_MINIMAL)
        for item in sites:
            if isinstance(item, unicode):
                item = item.encode('utf-8')
                siteList.append(item)
            else:
                siteList.append(item)
        # wr.writerow(siteList)
        externalImageCount, InternalImageCount, uniqueExternalSites, externalSites, secondlevelurl = _extract_object_count(
            siteList)
        Imagecount = len(siteList)
        # lock.acquire()
        # ObjectList['url'] = response.url
        # ObjectList['counter'] = counterValueImg
        # ObjectList['Imagecount'] = Imagecount
        # ObjectList['InternalImageCount'] = InternalImageCount
        # ObjectList['ExternalImageCount'] = externalImageCount
        # logwr.writeheader()
        logwr.writerow({
            'url': response.url,
            'counter': counterValueImg,
            'InternalImageCount': InternalImageCount,
            'ExternalImageCount': externalImageCount,
            'UniqueExternalSites': uniqueExternalSites,
            'ExternalSites': externalSites,
            'secondlevelurl': secondlevelurl
        })
        # logwr.writerow([ObjectList])
        # lock.release()
        # wr.writerow([Imagecount])
        # logwr.writerow([imgcount])
        # Imagecount = str(len(siteList))
        # logwr.writerow([siteList])
        r.extend(
            Request(site,
                    callback=self.parse,
                    method='HEAD',
                    meta={'tagType': tag, 'counter': counterValueImg})
            for site in siteList
            if site.startswith("http://") or site.startswith("https://"))
    return r
def parse(self, response):
    sel = Selector(response)
    sites = sel.xpath(
        '//table[@bgcolor="#808080"]/table/tr/td[@width="33%"]/a/@href'
    ).extract()
    for site in sites:
        yield scrapy.Request(
            ''.join(["http://ts300.5156edu.com/sc300/", site]),
            callback=self.parse_dep2)
def b(url, xpath1, xpath2):
    # headers must be passed by keyword (the second positional argument of
    # requests.get is params), and Selector needs the body text, not the
    # Response object itself
    selector = Selector(text=requests.get(url, headers=headers, verify=False).text)
    # returning both extractions is an assumption; the original discarded them
    return selector.xpath(xpath1).extract(), selector.xpath(xpath2).extract()
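A call site for the corrected helper might look like the following; the URL, header set, and XPaths are placeholders, not values from the original script:

headers = {'User-Agent': 'Mozilla/5.0'}  # placeholder headers used by b()
titles, links = b('https://example.com', '//h1/text()', '//a/@href')
print(titles, links)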
def _set_title(self, page, response):
    if isinstance(response, HtmlResponse):
        title = Selector(response).xpath("//title/text()").extract()
        if title:
            page['title'] = title[0]
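The extract-then-guard idiom above can be collapsed with extract_first(), which returns None (or a supplied default) when nothing matches; an equivalent sketch:

def _set_title(self, page, response):
    if isinstance(response, HtmlResponse):
        # extract_first() avoids indexing into a possibly empty list
        title = Selector(response).xpath("//title/text()").extract_first()
        if title is not None:
            page['title'] = title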
# from selenium.webdriver.common.keys import Keys
from selenium.webdriver import Chrome
from scrapy.selector import Selector
# from bs4 import BeautifulSoup
import csv
import time
# import re

browser = Chrome('/Users/Tim/PyCharmProjects/learning/chromedriver')
browser.get('https://maplelegends.com/ranking/monsterbook?page=1&search=')
with open('MonsterbookRanking.csv', 'w', newline='') as file:
    filewriter = csv.writer(file, quotechar='|', quoting=csv.QUOTE_MINIMAL)
    filewriter.writerow(['Rank', 'IGN', 'Fame', 'Level', 'Cards', 'Class'])
    for y in range(500):
        time.sleep(5)
        html = Selector(text=browser.execute_script(
            "return document.documentElement.outerHTML;"))
        # extract the cell and class-name lists once per page instead of
        # re-querying for every column
        cells = html.xpath('//tr/td/b/text()').extract()
        jobs = html.xpath(
            '//tr/comment()[contains(., "job")]/following-sibling::*[1]/text()'
        ).extract()
        for x in range(5):
            filewriter.writerow(cells[5 * x:5 * x + 5] + [jobs[x]])
        browser.find_element_by_xpath('//li/a[contains(.,"Next")]').click()
browser.quit()
def get_total_page_numbers(self, response):
    sel = Selector(response)
    response_page_list = sel.xpath(
        "//div[@class='pager']/table/tr/td/text()").extract()
    response_page = response_page_list[0].strip().split('/')[1][1:-1]
    return int(response_page)
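With the total in hand, a spider would typically fan out one request per page. A minimal sketch, assuming the site paginates via a page query parameter and a parse_page callback, both hypothetical here (and assuming import scrapy at module level):

def parse(self, response):
    total = self.get_total_page_numbers(response)
    for page in range(1, total + 1):
        # the 'page' parameter and parse_page callback are assumptions
        yield scrapy.Request('%s?page=%d' % (response.url, page),
                             callback=self.parse_page)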
def parse_salary(self, response):
    hxs = Selector(response)
    items = hxs.xpath("//table[@id='salaryDescTable']/tr[@data-url]")
    for item in items:
        salary = Salary()
        name = first_item(item.xpath('td/a/text()').extract())
        if name.endswith(')'):
            ix = name.rfind('(')
            if ix == -1:
                salary['job_name'] = name
                salary['job_count'] = 0
            else:
                salary['job_name'] = name[0:ix]
                salary['job_count'] = int(name[ix + 1:-2])
        else:
            salary['job_name'] = name
            salary['job_count'] = 0
        salary['average'] = first_item(
            item.xpath("td[@class='s-d-average']/text()").extract())
        salary['average'] = salary['average'].replace('¥', '').replace(',', '')
        salary['company_logo'] = first_item(
            hxs.xpath("//a[@ka='com-logo']/img/@src").extract())
        salary['src_url'] = self.create_url(
            first_item(item.xpath('td/a/@href').extract()))
        company_url = first_item(
            hxs.xpath("//a[@ka='com-logo']/@href").extract())
        if company_url is not None:
            salary['company_url'] = self.create_url(company_url)
            start = company_url.find('gso')
            end = company_url.find('.html')
            salary['company_code'] = company_url[start:end]
        else:
            salary['company_url'] = ''
            salary['company_code'] = ''
        co_info = hxs.xpath("//div[@class='co_info']")
        salary['company_name'] = first_item(
            co_info.xpath("p[@id='companyName']/@data-companyname").extract())
        salary['praise_rate'] = first_item(
            co_info.xpath("div[@class='msgs']/strong/text()").extract())
        other = co_info.xpath("p[@class='params grey_99 mt5']//text()").extract()
        salary['industry'] = ''
        salary['city_name'] = ''
        salary['company_type'] = ''
        salary['company_scale'] = ''
        if other is not None:
            other_str = ''
            for ix in other:
                other_str += ix
            other_array = other_str.split('|')
            if len(other_array) > 0:
                salary['industry'] = other_array[0]
            if len(other_array) > 1:
                salary['city_name'] = other_array[1]
            if len(other_array) > 2:
                salary['company_scale'] = other_array[2]
            if len(other_array) > 3:
                salary['company_type'] = other_array[3]
        id = first_item(item.xpath('@id').extract())
        if id != '':
            id += '_C'
            ul = hxs.xpath(
                "//table[@id='salaryDescTable']/tr[@id='%s']/td/div/ul" % id)
            if ul:
                salary['high'] = first_item(
                    ul.xpath("li[@class='s-d-low']/text()").extract())
                salary['low'] = first_item(
                    ul.xpath("li[@class='s-d-high']/text()").extract())
                salary['mark'] = first_item(
                    ul.xpath("li[@class='s-d-mark']/a/em/text()").extract())
                salary['high'] = salary['high'].replace('¥', '').replace(
                    ',', '').lstrip(' ')
                salary['low'] = salary['low'].replace('¥', '').replace(
                    ',', '').lstrip(' ')
        yield salary
    # handle the next page
    link = first_item(
        hxs.xpath("//div[@class='page_wrap']/div/a[@class='p_next']/@href"
                  ).extract())
    if link is not None:
        yield Request(url=self.create_url(link),
                      meta={'use_proxy': True},
                      dont_filter=True,
                      callback=self.parse_salary)
def fun(names, codes, urls):
    d = {}
    for name, code, url in zip(names, codes, urls):
        d[name] = {}
        d[name]['code'] = code
        d[name]['url'] = url
    return d


if __name__ == '__main__':
    req = requests.get(url, headers=headers, verify=False)
    s = Selector(text=req.text)
    # # organization type
    # organization_name = s.xpath('//div[@class="folder-body"]/div[1]/div/a/text()').extract()
    # organization_url = s.xpath('//div[@class="folder-body"]/div[1]/div/a/@href').extract()
    # organization_code = [re.search('companyType=(.*)', i).group(1) for i in organization_url]
    # organization = fun(organization_name, organization_code, organization_url)
    # config = {'organization': organization}
    # json.dump(config, open("config.json", "w"), ensure_ascii=False)
    # # province
    # Province = {}
    # Provinces_name = s.xpath('//div[@class="folder-body"]/div[2]/div/a/text()').extract()
    # Province_url = s.xpath('//div[@class="folder-body"]/div[2]/div/a/@href').extract()
    # Province_code = [re.search('base=(.*)', i).group(1) for i in Province_url]
    # Province = fun(Provinces_name, Province_code, Province_url)
    # config = {'organization': organization, 'Province': Province}
    # json.dump(config, open("config.json", "w"), ensure_ascii=False)
# browser.find_element_by_xpath('//*[@id="pl_login_form"]/div/div[3]/div[6]/a').click()

# OSChina blog: executing JavaScript from selenium
# browser.get("https://www.oschina.net/blog")
# import time
# time.sleep(5)
# for i in range(3):
#     browser.execute_script("window.scrollTo(0, document.body.scrollHeight); var lenOfPage=document.body.scrollHeight; return lenOfPage;")
#     time.sleep(3)

# configure chromedriver not to load images
# chrome_opt = webdriver.ChromeOptions()
# prefs = {"profile.managed_default_content_settings.images": 2}
# chrome_opt.add_experimental_option("prefs", prefs)
# browser = webdriver.Chrome(executable_path="./chromedriver.exe", chrome_options=chrome_opt)
# browser.get("https://www.oschina.net/blog")

# phantomjs is a headless browser; its performance degrades badly when run
# with multiple processes
browser = webdriver.PhantomJS(
    executable_path="C:/spiderDriver/phantomjs-2.1.1-windows/bin/phantomjs.exe")
browser.get(
    "https://detail.tmall.com/item.htm?spm=a230r.1.14.3.yYBVG6&id=538286972599&cm_id=140105335569ed55e27b&abbucket=15&sku_properties=10004:709990523;5919063:6536025")
t_selector = Selector(text=browser.page_source)
print(t_selector.css(".tm-price::text").extract())
# print(browser.page_source)
browser.quit()
def parse_player(self, response):
    """Scrape a player's page."""
    player_name = response.xpath('//h1/text()').extract_first()
    position = response.xpath(
        '//div[@id="meta"]/div[2]/p[1]/text()').extract()[1].strip()
    #### BATTING STATS ####
    for row in response.xpath(
            '//table[@id="batting_standard"]/tbody/tr[@class="full"]'):
        year = row.xpath('./th[@data-stat="year_ID"]/text()').extract_first()
        age = row.xpath('./td[@data-stat="age"]/text()').extract_first()
        team = row.xpath('./td[@data-stat="team_ID"]/a/text()').extract_first()
        pa = row.xpath('./td[@data-stat="PA"]/text()').extract_first()
        hr = row.xpath('./td[@data-stat="HR"]/text()').extract_first()
        rbi = row.xpath('./td[@data-stat="RBI"]/text()').extract_first()
        avg = row.xpath('./td[@data-stat="batting_avg"]/text()').extract_first()
        obp = row.xpath('./td[@data-stat="onbase_perc"]/text()').extract_first()
        slg = row.xpath(
            './td[@data-stat="slugging_perc"]/text()').extract_first()
        ops = row.xpath(
            './td[@data-stat="onbase_plus_slugging"]/text()').extract_first()
        # Deal with that godawful commented-out HTML.
        commented_text = response.xpath('//comment()').re(regex)[16]
        new_selector = Selector(text=commented_text, type='html')
        year_row = new_selector.xpath('//tr[@id="batting_value.' + year + '"]')
        war = year_row.xpath('./td[@data-stat="WAR"]/text()').extract_first()
        salary = year_row.xpath(
            './td[@data-stat="Salary"]/text()').extract_first()
        stats = {
            'player_name': player_name,
            'position': position,
            'year': year,
            'age': age,
            'team': team,
            'pa': pa,
            'hr': hr,
            'rbi': rbi,
            'avg': avg,
            'obp': obp,
            'slg': slg,
            'ops': ops,
            'war': war,
            'salary': salary
        }
        yield stats
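The commented-out-HTML workaround above is worth isolating: some sites ship whole tables inside HTML comments, invisible to a normal XPath query, so the comment text is re-parsed with a second Selector. A self-contained sketch of the trick; the regex here is an assumption about what the module-level regex in the spider might capture:

from scrapy.selector import Selector

html = '<div><!-- <table><tr><td data-stat="WAR">2.5</td></tr></table> --></div>'
sel = Selector(text=html)
# pull the raw markup out of the comment node, then parse it as HTML
commented = sel.xpath('//comment()').re_first(r'<!--(.*)-->')
inner = Selector(text=commented, type='html')
print(inner.xpath('//td[@data-stat="WAR"]/text()').extract_first())  # '2.5'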
def category_parse(self, response):
    n_cat = response.meta['n_cat']
    categories = response.meta['categories']
    pag = 1
    parte = url_part(response.url)
    item_xpath = (
        '//*[@class= "vtex-flex-layout-0-x-flexColChild vtex-flex-layout-0-x-flexColChild--search-result-content pb0"]'
        '//*[@class="vtex-search-result-3-x-galleryItem vtex-search-result-3-x-galleryItem--normal vtex-search-result-3-x-galleryItem--grid-3 pa4"]')
    while True:
        print('\n', response.url + parte + str(pag), '\n')
        check_connection()
        driver.get(response.url + parte + str(pag))
        sleep(3)
        driver.execute_script('document.body.style.MozTransform = "scale(0.2)";')
        sleep(.5)
        driver.execute_script('document.body.style.MozTransformOrigin = "0 0";')
        sleep(1)
        button_test_trys = 0
        button_bool = False
        while button_test_trys <= 20:
            cat_page_sel = Selector(text=driver.page_source)
            button_test = cat_page_sel.xpath(
                '//*[@class="vtex-button bw1 ba fw5 v-mid relative pa0 lh-solid br2 min-h-small t-action--small bg-action-primary b--action-primary c-on-action-primary hover-bg-action-primary hover-b--action-primary hover-c-on-action-primary pointer "]')
            if button_test != []:
                button_bool = True
                break
            else:
                button_test_trys += 1
                sleep(2)
        if button_bool:
            ver_mas_prods = 0
            while ver_mas_prods < 10:
                cat_page_sel = Selector(text=driver.page_source)
                n_prods = len(cat_page_sel.xpath(item_xpath))
                if n_prods != 0 and n_prods >= 11:
                    break
                else:
                    ver_mas_prods += 1
                    driver.execute_script("window.scrollTo(0, window.scrollY + 3)")
                    sleep(1)
                    driver.execute_script("window.scrollTo(0, window.scrollY - 3)")
                    sleep(1)
            driver.execute_script('document.body.style.MozTransform = "scale(0.005)";')
            driver.execute_script('document.body.style.MozTransformOrigin = "0 0";')
            sleep(2)
            # import pdb; pdb.set_trace()
            cat_page_sel = Selector(text=driver.page_source)
            n_prods = len(cat_page_sel.xpath(item_xpath))
            prods = cat_page_sel.xpath(item_xpath)
            cat_name = cat_page_sel.xpath(
                './/*[@class= "vtex-search-result-3-x-galleryTitle--layout t-heading-1"]//text()'
            ).extract()[-1]
            for prod in prods:
                prod_name = prod.xpath(
                    './/*[@class= "vtex-product-summary-2-x-productBrand vtex-product-summary-2-x-brandName t-body"]//text()'
                ).extract_first()
                normal_price = prod.xpath(
                    './/*[@class= "vtex-product-price-1-x-listPriceValue vtex-product-price-1-x-listPriceValue--summary strike"]//text()'
                ).extract()
                if normal_price == []:
                    normal_price = prod.xpath(
                        './/*[@class= "vtex-product-price-1-x-sellingPriceValue vtex-product-price-1-x-sellingPriceValue--summary"]//text()'
                    ).extract()
                if normal_price != []:
                    normal_price = ' '.join(normal_price)
                disc_price = prod.xpath(
                    './/*[@class= "vtex-product-price-1-x-currencyContainer vtex-product-price-1-x-currencyContainer--summary"]//text()'
                ).extract()
                if disc_price != []:
                    disc_price = ' '.join(disc_price)
                image_url = prod.xpath(
                    './/*[@class= "vtex-product-summary-2-x-imageNormal vtex-product-summary-2-x-image"]/@src'
                ).extract_first()
                print('\n', '#' * 15, 'Product result', '#' * 15, '\n')
                print('Category: ', cat_name, '\n',
                      '\n\tProduct:\t', prod_name,
                      '\n\tNormal price:\t', normal_price,
                      '\n\tDiscounted price:\t', disc_price, '\n')
                yield {
                    'cat_name': cat_name,
                    'prod_name': prod_name,
                    'normal_price': normal_price,
                    'disc_price': disc_price,
                    'image_url': image_url
                }
            pag += 1
            continue
        else:
            break
    print('\n', 'Found a total of', n_prods)
    # import pdb; pdb.set_trace()
    if n_cat < len(categories) - 1:
        n_cat += 1
        check_connection()
        yield Request(url='http://olimpica.com/',
                      callback=self.parse,
                      meta={'n_cat': n_cat, 'categories': categories},
                      dont_filter=True)
    else:
        driver.quit()
def parse_category(self, response):
    sel = Selector(response)
    item = DicksItem()
    pname = sel.xpath("//div[@class='product_name']/text()").extract()[0]
    pname = pname.encode('utf-8')
    item['Brand_Name'] = "Nelco Sports"
    item["Product_Image_Description_1"] = "Buy " + pname + " Online in India at LiveYourSport.com| Free Shipping and Massive Discounts"
    item["MetaDescription"] = "Get your hands on the " + pname + ". Buy it Online in India at LiveYourSport.com| Free Shipping and Massive Discounts"
    item["TitleTag"] = "Buy the " + pname + " Online in India at LiveYourSport.com| Free Shipping and Massive Discounts"
    pcode = sel.xpath("//input[@type='checkbox']/@value").extract()[0] + "NELPRD"
    item["Product_Description"] = sel.xpath(
        "//td[@class='form_text_normal']/text()").extract()
    item["Product_Description"] = ''.join(
        item["Product_Description"]).encode('utf-8')
    sp = sel.xpath("//td[@class='guest']/text()").extract()
    sp = min(float(x) for x in sp)
    mrp = sel.xpath("//tr/td[@class='form_text'][last()-1]/text()").extract()
    mrp = min(float(x) for x in mrp)
    sortorder = -150
    trackinventory = 'By Option'
    image = ("http://www.nelcosport.com/" +
             sel.xpath('//div[@class="enlarge"]/a/@href').extract()[0])
    category = sel.xpath("//h1[@class='heading1']/a/text()").extract()[0]
    row = ("Product", "", pname, item["Brand_Name"], mrp, mrp,
           sp,  # price
           item["Product_Description"], pcode, "NELCOSPORTS", category, pname,
           "15-23 Working days", "100", "N", sortorder,
           item["MetaDescription"], item["TitleTag"],
           item["Product_Image_Description_1"], "Y", trackinventory, "1", image)
    mywriter.writerow(row)
    variants = {}
    variants['sku'] = sel.xpath("//input[@type='checkbox']/@value").extract()
    x = sel.xpath("//tr/td[2]/text()").extract()
    variants['weight'] = ''
    for w in x:
        if 'Weight' in w:
            variants['weight'] = sel.xpath(
                "//tr/td[@class='form_text'][2]/text()").extract()
            break
    variants['size'] = sel.xpath(
        '//tr/td[@bgcolor="#FFFFFF"][last()]/text()').extract()
    variants['price'] = sel.xpath('//td[@class="guest"]/text()').extract()
    count = 0
    for i in range(len(variants['size'])):
        variants['size'][i] = ''.join(variants['size'][i]).encode('utf-8')
        if variants['size'][0] == variants['size'][i]:
            count = count + 1
    if len(variants['price']) > 1:
        for i in range(len(variants['price'])):
            if variants['weight']:
                if variants['size'][i].strip() == '' or count == len(variants['price']):
                    row = ("Rule", '',
                           "[S]Weight =" + variants['weight'][i] + "KG", '',
                           '[FIXED]' + variants['price'][i],
                           '[FIXED]' + variants['price'][i],
                           '[FIXED]' + variants['price'][i], '',
                           variants['sku'][i], 'NELCOSPORTS', '', '', '', '100')
                else:
                    row = ("Rule", '',
                           "[S]Weight =" + variants['weight'][i] + "KG" +
                           ",[S]Size=" + variants['size'][i].strip('Size'), '',
                           '[FIXED]' + variants['price'][i],
                           '[FIXED]' + variants['price'][i],
                           '[FIXED]' + variants['price'][i], '',
                           variants['sku'][i], 'NELCOSPORTS', '', '', '', '100')
            else:
                row = ("Rule", '',
                       "[S]Size=" + variants['size'][i].strip('Size'), '',
                       '[FIXED]' + variants['price'][i],
                       '[FIXED]' + variants['price'][i],
                       '[FIXED]' + variants['price'][i], '',
                       variants['sku'][i], 'NELCOSPORTS', '', '', '', '100')
            mywriter.writerow(row)