def parse_video(self, response):
    sel = Selector(response)
    meta = response.request.meta
    self.log('The meta: %r' % meta)
    cur_path = meta['cur_path']
    cur_topic_idx = meta['cur_topic_idx']
    cur_level_idx = meta['cur_level_idx']
    cur_video_idx = meta['cur_video_idx']
    cur_video = self.topics[cur_topic_idx]['levels'][cur_level_idx]['videos'][cur_video_idx]
    self.log('Current video: %r' % cur_video)
    # extract_first() returns None when the description is missing, so the
    # failure branch below can actually run
    descr = sel.css('.g9.mb10 p').xpath('text()').extract_first()
    if descr:
        cur_video['descr'] = descr.strip()
        self.log('Got descr > [%s]' % descr)
    else:
        self.log('Descr failed >>>>>>>> [%s]' % response.url)
    youtube_keys = sel.css('.g9.lesson-video.ic iframe').xpath('@src').re(r'http://.*youtube.com/embed/([^?]*)\?')
    youtube_key = youtube_keys[0] if youtube_keys else None
    if youtube_key:
        cur_video['youtube_key'] = youtube_key
        os.system('cd /tmp/videos && proxychains youtube-dl http://www.youtube.com/watch\?v\=%s --write-sub --all-subs --write-auto-sub' % youtube_key)
        os.system('mv /tmp/videos/*%s* %s' % (youtube_key, cur_path))
        self.log('Got youtube key > %s' % youtube_key)
    else:
        self.log('Youtube key failed >>>>>>>> [%s]' % response.url)
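# A minimal sketch (not the original spider code): the same youtube-dl download run
# through subprocess with an argument list, which avoids the manual shell escaping the
# os.system() calls above rely on. proxychains is omitted here; the flags are the ones
# used in parse_video.
import subprocess

def download_with_subs(youtube_key, dest_dir='/tmp/videos'):
    subprocess.call(
        ['youtube-dl', 'http://www.youtube.com/watch?v=%s' % youtube_key,
         '--write-sub', '--all-subs', '--write-auto-sub'],
        cwd=dest_dir)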
def parsePost(self, response):
    logging.info(response)
    sel = Selector(response)
    posts = sel.css("Table.PostBox")
    breadcrumbs = sel.css("#Breadcrumbs")
    # condition = breadcrumbs.xpath("./a[3]/text()")
    condition = breadcrumbs.xpath("./a[3]/text()").extract()[0].lower()
    items = []
    topic = response.xpath('//div[contains(@id,"PageTitle")]/h1/text()').extract()[0]
    url = response.url
    for post in posts:
        item = PostItemsList()
        item["author"] = post.css(".msgUser").xpath("./a[2]").xpath("text()").extract()[0]
        item["author_link"] = post.css(".msgUser").xpath("./a[2]/@href").extract()[0]
        item["condition"] = condition.lower()
        item["create_date"] = self.getDate(
            re.sub(
                " +|\n|\r|\t|\0|\x0b|\xa0",
                " ",
                response.css("td.msgThreadInfo").xpath("text()").extract()[0].replace("Posted ", ""),
            )
            .strip()
            .lower()
        )
        item["domain"] = "".join(self.allowed_domains)
        post_msg = self.cleanText(post.css(".PostMessageBody").extract()[0])
        item["post"] = post_msg
        # item['tag'] = ''
        item["topic"] = topic
        item["url"] = url
        items.append(item)
    return items
def leer(self, response):
    sel = Selector(response)
    con_titulo = sel.css('div.product-info')
    con_ficha = sel.css('ul#product-details')
    con_disp = sel.css('span#disponibilidad_entrega')
    con_precio = sel.css('div#product-buy-small')
    titulo = con_titulo.xpath('.//span/text()')[0].extract()
    autor = con_titulo.xpath('.//a/text()')[0].extract()
    editorial = con_ficha.xpath('.//li')[0].xpath('.//a/text()')[0].extract()
    isbn = con_ficha.xpath('.//li')[3].xpath('.//span/text()')[1].extract()
    paginas = con_ficha.xpath('.//li')[4].xpath('.//span/text()')[1].extract()
    disponibilidad = con_disp.xpath('.//span/text()')[1].extract()
    precio = con_precio.xpath('.//p/text()')[0].extract()
    item = BuscadorItem()
    item['ISBN'] = isbn
    item['titulo'] = titulo
    item['autor'] = autor
    item['num_pag'] = paginas
    item['editorial'] = editorial
    item['precio'] = precio
    item['disponibilidad'] = disponibilidad
    #self.escribe_temp(isbn)
    return item
def parse_item(self, response):
    rand = random.randint(1, 2)
    time.sleep(rand)
    items = []
    sel = Selector(response)
    base_url = get_base_url(response)
    title_root_path = sel.css('.mainTitle')
    phone_root_path = sel.css('.l_phone')
    if title_root_path is None or len(title_root_path) == 0 or phone_root_path is None or len(phone_root_path) == 0:
        return items
    title_path = title_root_path.xpath('./h1/text()')
    phone_path = phone_root_path.xpath('text()')
    if title_path is not None and phone_path is not None:
        info_name = title_path
        item = AladdinItem()
        info_name_extract = info_name.extract()
        if info_name_extract is not None and len(info_name_extract) > 0:
            name = info_name_extract[0]
            item['name'] = name
        phone_extract = phone_path.extract()
        if phone_extract is not None and len(phone_extract) > 0:
            phone = phone_extract[0]
            phone_set = set()
            phone_set.add(phone)
            item['phone'] = phone_set
        if item.get('name') is not None and item.get('phone') is not None and len(set(item.get('phone'))) > 0:
            items.append(item)
    info(str(response))
    return items
def extract_info_from_post(self, post):
    """
    Determines if the specified craigslist link has contact info.

    This is done based on the fact that a hyperlink with the class
    "showcontact" is created in the html page when the page is hiding user
    contact info. We are not interested in the user's number, we only want
    to make sure that the listing is associated with a phone number.
    """
    baseurl = MutableURL(post)
    # Get link content & build response object from url content
    body = requests.get(post)
    response = HtmlResponse(url=post, body=body.content)
    body.connection.close()
    # Build selector from response
    selector = Selector(response=response)
    # Extract the price from the link
    price = selector.css('span[class="price"]').xpath('text()').re('[$]\d+')
    # Create the response
    post = {'link': post}
    # Attach the link that contains the full content of the page
    post['contact_info_link'] = selector.css('a[class="showcontact"]::attr(href)').extract_first()
    # Expand the link
    post['contact_info_link'] = self.base_url.joinurl(post['contact_info_link']) if post['contact_info_link'] else None
    post['price'] = int(price[0][1:]) if price else None
    return post
def parse_celebrity(self, response):
    """Crawl a celebrity page."""
    celebrity = CelebrityItem()
    sel = Selector(response)
    celebrity["id"] = self._parse_id(response.url)
    name = sel.css("div.per_header h2::text").extract()
    celebrity["name"] = name[0] if name else ""
    name_en = sel.css("div.per_header p.enname::text").extract()
    celebrity["name_en"] = name_en[0] if name_en else ""
    yield Request(
        url=urljoin(response.url, "details.html"),
        callback=self.parse_celebrity_detail,
        meta={"celebrity": celebrity.copy()}
    )
    yield Request(
        url=urljoin(response.url, "awards.html"),
        callback=self.parse_celebrity_awards,
        meta={"celebrity": celebrity.copy()}
    )
    yield celebrity
def search_parse(self, response):
    sel = Selector(response)
    print "myparam is %d" % (response.request.doubanid)
    title = sel.css("title")[0].xpath("./text()")[0].extract().strip()
    print "title is " + title
    photo = sel.css("a.nbgnbg")[0]
    imgurl = photo.xpath("./img/@src")[0].extract()
    arr1 = imgurl.split("/")
    print "img is " + arr1[len(arr1) - 1]
    self.moviedb.updMovie_doubanmovie(response.request.doubanid, title, arr1[len(arr1) - 1])
    arrinfo = sel.css("div#info")
    for curinfo in arrinfo:
        print "info is " + curinfo.extract()
        bi = curinfo.extract().find(u">又名:</span>")
        if bi > 0:
            tstr = curinfo.extract()[bi + len(u">又名:</span>"):]
            ei = tstr.find("<br>")
            tsrt1 = tstr[0:ei].strip()
            print "other name is " + tsrt1
            tarr1 = tsrt1.split("/")
            for t1 in tarr1:
                t1 = t1.strip()
                print "t1 is " + t1
                self.moviedb.addMovieName_doubanmovie(response.request.doubanid, t1)
            break
    return []
def parse(self, response):
    sel = Selector(response)
    list_sel = sel.css('div[id*=wrapper] div[id*=left] div[id*=content_box] div.content div.title a::attr(href)').extract()
    for idx in range(len(list_sel)):
        print (list_sel[idx])
        if list_sel[idx][0] == 'h':
            inpage = list_sel[idx]
        else:
            inpage = 'http://www.ruyig.com/' + list_sel[idx]
        yield Request(inpage, callback=self.main_parse)
    print ('------------- next page ---------------')
    next_url = sel.css('div[id*=wrapper] div[id*=left] div[id*=content_box] div[id*=page_num] a::attr(href)').extract()[-1]
    if next_url[0] == 'h':
        next_link = next_url
    else:
        next_link = "http://www.ruyig.com" + next_url
    print(next_link)
    if next_link:
        self.page_count += 1
        print("*" * 30)
        print(self.page_count)
        print("*" * 30)
        yield Request(next_link, callback=self.parse)
def parse(doc):
    s = Selector(text=doc)
    title = s.css(".fullText h4::text").extract()
    title = title[0] if len(title) == 1 else ''
    tmp = s.css(".fullText .annexInfo span::text").extract()
    rd = re.compile('\d+')
    if len(tmp) == 2:
        time, symble = tmp
    elif len(tmp) == 1:
        if rd.match(tmp[0]):
            time = tmp[0]
            symble = '--'
        else:
            time = '--'
            symble = tmp[0]
    else:
        time, symble = '--', '--'
    court = s.css(".fullText .annexInfo a::text").extract()
    court = court[0] if len(court) == 1 else ''
    content = ''.join(s.css(".fullText .fullCon::text").extract())
    return '<%s,%s,%s,%s>\n\n%s' % (
        title.encode('utf8'), time.encode('utf8'), symble.encode('utf8'),
        court.encode('utf8'), content.encode('utf8'))
def situ(imgurl):
    url = 'http://image.baidu.com/n/pc_search'
    params = {
        'rn': '10',
        'appid': '4',
        'tag': '1',
        'isMobile': '0',
        'queryImageUrl': imgurl,
        'querySign': '',
        'fromProduct': '',
        'productBackUrl': '',
        'fm': 'chrome',
        'uptype': 'plug_in'
    }
    Headers['User-Agent'] = UA.chrome
    z = requests.get(url, params=params, headers=Headers)
    response = Selector(text=z.content)
    # keyword description
    kw = response.css('.guess-info-word-highlight::text').extract_first()
    # Baidu Baike (encyclopedia) name
    bk = response.css('.guess-newbaike-name::text').extract_first()
    # titles of the image sources
    img_title = response.css('.source-card-topic-title-link::text').extract()
    # descriptions of the image sources
    img_content = response.css('.source-card-topic-content::text').extract()
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=''.join(img_title), lower=True, source='all_filters')
    for item in tr4s.get_key_sentences(num=3):
        print(item.index, item.weight, item.sentence)
def _parse_home_page(self, response, item=None):
    sel = Selector(response)
    if self._check_blog_created(sel):
        return
    if not item:
        item = BlogerItem()
    if self._check_private(sel):
        item['url'] = sel.css('div.avt > a::attr(href)').extract()[0]
        item['uid'] = re.search(urlpatterns['home'], item['url']).group(1)
        item['forbidden'] = True
        item['liveness'] = \
            sel.css(u'ul.bbda li:contains(活跃度)::text').re('\d+')[0]
        item['reputation'] = \
            sel.css(u'ul.bbda li:contains(威望)::text').re('\d+')[0]
    else:
        item['url'] = sel.css('#nv > ul > li:first-child a::attr(href)').extract()[0]
        item['uid'] = re.search(urlpatterns['home'], item['url']).group(1)
        ul = sel.css('#statistic_content ul li')
        if len(ul) > 0:
            item['reputation'] = ul.css(u'li:contains(威望) a::text').extract()
            if len(item['reputation']):
                item['reputation'] = item['reputation'][0]
            item['liveness'] = ul.css(u'li:contains(活跃度) a::text').extract()
            if len(item['liveness']):
                item['liveness'] = item['liveness'][0]
    return item
def parse_item(self, response):
    items = []
    sel = Selector(response)
    base_url = get_base_url(response)
    sites_even = sel.css('table.tablelist tr.even')
    for site in sites_even:
        item = JobItem()
        item['name'] = site.css('.l.square a').xpath('text()').extract()
        relative_url = site.css('.l.square a').xpath('@href').extract()[0]
        item['detailLink'] = urljoin_rfc(base_url, relative_url)
        item['catalog'] = site.css('tr > td:nth-child(2)::text').extract()
        item['workLocation'] = site.css('tr > td:nth-child(3)::text').extract()
        item['recruitNumber'] = site.css('tr > td:nth-child(4)::text').extract()
        item['publishTime'] = site.css('tr > td:nth-child(5)::text').extract()
        items.append(item)
    sites_odd = sel.css('table.tablelist tr.odd')
    for site in sites_odd:
        item = JobItem()
        item['name'] = site.css('.l.square a').xpath('text()').extract()
        relative_url = site.css('.l.square a').xpath('@href').extract()[0]
        item['detailLink'] = urljoin_rfc(base_url, relative_url)
        item['catalog'] = site.css('tr > td:nth-child(2)::text').extract()
        item['workLocation'] = site.css('tr > td:nth-child(3)::text').extract()
        item['recruitNumber'] = site.css('tr > td:nth-child(4)::text').extract()
        item['publishTime'] = site.css('tr > td:nth-child(5)::text').extract()
        items.append(item)
    info('parsed ' + str(response))
    return items
def parse_item(self, response):
    # Extract the data into Items, mainly using XPath and CSS selectors
    items = []
    sel = Selector(response)
    base_url = get_base_url(response)
    sites_even = sel.css('table.tablelist tr.even')
    for site in sites_even:
        item = TencentItem()
        item['name'] = site.css('.l.square a').xpath('text()').extract()
        relative_url = site.css('.l.square a').xpath('@href').extract()[0]
        item['detailLink'] = urljoin_rfc(base_url, relative_url)
        item['catalog'] = site.css('tr > td:nth-child(2)::text').extract()
        item['workLocation'] = site.css('tr > td:nth-child(4)::text').extract()
        item['recruitNumber'] = site.css('tr > td:nth-child(3)::text').extract()
        item['publishTime'] = site.css('tr > td:nth-child(5)::text').extract()
        items.append(item)
        # print repr(item).decode("unicode-escape") + '\n'
    sites_odd = sel.css('table.tablelist tr.odd')
    for site in sites_odd:
        item = TencentItem()
        item['name'] = site.css('.l.square a').xpath('text()').extract()
        relative_url = site.css('.l.square a').xpath('@href').extract()[0]
        item['detailLink'] = urljoin_rfc(base_url, relative_url)
        item['catalog'] = site.css('tr > td:nth-child(2)::text').extract()
        item['workLocation'] = site.css('tr > td:nth-child(4)::text').extract()
        item['recruitNumber'] = site.css('tr > td:nth-child(3)::text').extract()
        item['publishTime'] = site.css('tr > td:nth-child(5)::text').extract()
        items.append(item)
        # print repr(item).decode("unicode-escape") + '\n'
    info('parsed ' + str(response))
    return items
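# A minimal sketch of the same row extraction with one combined CSS selector, so the
# even/odd loops above are not duplicated. It assumes the same TencentItem fields and
# page layout as parse_item; helper names are unchanged.
def parse_rows(sel, base_url):
    items = []
    for site in sel.css('table.tablelist tr.even, table.tablelist tr.odd'):
        item = TencentItem()
        item['name'] = site.css('.l.square a').xpath('text()').extract()
        item['detailLink'] = urljoin_rfc(base_url, site.css('.l.square a').xpath('@href').extract()[0])
        item['catalog'] = site.css('tr > td:nth-child(2)::text').extract()
        item['workLocation'] = site.css('tr > td:nth-child(4)::text').extract()
        item['recruitNumber'] = site.css('tr > td:nth-child(3)::text').extract()
        item['publishTime'] = site.css('tr > td:nth-child(5)::text').extract()
        items.append(item)
    return items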
def parse_celebrity_detail(self, response):
    """Crawl the celebrity detail page."""
    celebrity = response.meta["celebrity"]
    sel = Selector(response)
    for dt in sel.css("div.per_info_l dt"):
        title = dt.css("::text").extract()[0]
        if title == "出生日期:":
            text = dt.css("::text").extract()[1].rstrip(")")
            if "(" in text:
                birthday, birthplace = text.split("(", 1)
            else:
                birthday, birthplace = text, ""
            celebrity["birthday"] = birthday
            celebrity["birthplace"] = birthplace
        elif title == "血型:":
            celebrity["blood"] = dt.css("::text").extract()[1]
        elif title == "星座:":
            celebrity["constellation"] = dt.css("::text").extract()[1]
        elif title == "身高:":
            celebrity["height"] = int(dt.css("::text").extract()[1].rstrip("cm"))
        elif title == "体重:":
            # weight, not height: the original assigned this value to "height"
            celebrity["weight"] = int(dt.css("::text").extract()[1].rstrip("kg"))
    celebrity["intro"] = "\n".join(sel.css("div#lblAllGraphy p::text").extract())
    return celebrity
def getInfo(self, response):
    sel = Selector(response)
    item = response.request.meta['item']
    item['title'] = sel.css('#content-middle h1::text').extract()[0]
    item['description'] = sel.css("#content-middle .node .content p ::text").extract()
    item['url'] = response.url
    return item
def parse(self, response):
    sel = Selector(response)
    base_url = get_base_url(response)
    urls = sel.css('a')
    for url in urls:
        print url.xpath('@href').extract()[0]
    total_item = sel.css("#LIST_PAGINATION_COUNT")
    if len(total_item) > 0:
        total_count = total_item.xpath("text()").extract()[0]
        list_url_tuple = os.path.split(base_url)
        for i in total_count:
            url = list_url_tuple[0] + '/i/' + str(total_count) + "/" + list_url_tuple[1]
            print url
    if len(sel.css(".display_news_con")) > 0:
        info = {}
        contents = sel.css(".display_news_con")
        title = contents.css(".atitle").xpath("text()").extract()[0]
        posttime = contents.css(".posttime").xpath("text()").extract()[0]
        items = posttime.split("\r\n")
        temp_submit_time = items[0].split(":")
        info['submit_time'] = temp_submit_time[1] + temp_submit_time[2]
        temp_publish_time = items[1].split(":")
        info['publish_time'] = temp_publish_time[1] + temp_publish_time[2]
        info['department'] = items[2].split(":")[1]
        info['content'] = contents.css(".entry").extract()[0]
        info['last_modified'] = response.headers['Last-Modified']
        return info
def parse2(self, response):
    tv = response.meta['tv']
    sel = Selector(response)
    tv.origin_url = response.url
    p_dirsort = sel.css('div#main-rt div.mod-datum p.dirsort')
    for p in p_dirsort:
        p_type = p.css('::text').extract()[0]
        if u'导演' in p_type:
            tv.director = ''.join(p.css('span::text').extract())
        elif u'主演' in p_type:
            tv.starring = ''.join(p.css('span::text').extract())
        elif u'类型' in p_type:
            tv.category = ''.join(p.css('span::text').extract())
    tv.detail = sel.css('div#main-rt div.mod-datum p.dirtext span:nth-child(2)::text').extract()[0]
    print tv.name, '------->', tv.origin_url
    # mark this record as animation
    tv.type = 2
    db_session.add(tv)
    db_session.commit()
    # sub_tv_list = sel.css('div#playCont div div div div.torrent-panel ul li')
    #
    # for st in sub_tv_list:
    #     try:
    #         st.css('a span').extract()[0]
    #     except IndexError:
    #         sub_tv_index = st.css('::attr(data-idx)').extract()[0]
    #     else:
    #         continue
    #
    #     sub_tv = SubFunViedo(fv_id=tv.id, index=sub_tv_index)
    #     sub_tv.id = st.css('::attr(data-vid)').extract()[0]
    #     sub_tv.origin_url = 'http://www.fun.tv{}'.format(st.css('a::attr(href)').extract()[0])
    #
    #     print sub_tv.index, '-------->', sub_tv.origin_url
    #
    #     request1 = Request(sub_tv.origin_url, callback=self.parse3)
    #     request1.meta['sub_tv'] = sub_tv
    #     yield request1

# def parse3(self, response):
#
#     print 'parse 3 ------->'
#
#     sub_tv = response.meta['sub_tv']
#
#     sel = Selector(response)
#     play_count = sel.css('div.playInfo.crumbs div.rightBtn.fix a::text').extract()[0]
#
#     sub_tv.play_count = ''.join(play_count[3:].split(','))
#
#     db_session.add(sub_tv)
#     db_session.commit()
def parse(self, response):
    print "%d" % (response.request.cili006searchid)
    sel = Selector(response)
    item = Cili006Item()
    arrtitle = sel.css('div.desc-title')
    if len(arrtitle) <= 0:
        return []
    title = sel.css('div.desc-title')[0]
    print title.extract()
    item['filename'] = title.xpath('./h2/text()')[0].extract()
    print item['filename']
    #emindex = item['filename'].find('<em>')
    #print emindex
    #item['filename'] = item['filename'][0:emindex].strip()
    #print item['filename']
    item['magnet'] = ''
    item['ed2k'] = ''
    item['topic_id'] = response.request.cili006searchid
    arr = sel.css('div.desc-list-item')
    for cur in arr:
        ah = cur.xpath('./div[@class="t"]/a/@href')[0].extract()
        if ah.find('magnet') == 0:
            item['magnet'] = ah
        elif ah.find('ed2k') == 0:
            item['ed2k'] = ah
    self.moviedb.addMovie_cili006(item)
    return []
def parse(self, response):
    items = []
    sel = Selector(response)
    sites_even = sel.css('a.j_th_tit')
    furl = sel.xpath("/html/head/meta[2][@furl]")
    for site in sites_even:
        item = JpanList_Items()
        item['title'] = site.xpath('text()').extract()
        item["link"] = site.xpath('@href').extract()
        item["furl"] = furl.xpath('@furl').extract()
        item['bid'] = item["link"][0].replace("/p/", '')
        #print item;
        items.append(item)
        print item['bid']
        jRedisco = JRedisco(bid=item['bid'])
        if jRedisco.is_valid():
            jRedisco.save()
            print 'jRedisco.save()'
        else:
            print 'jRedisco.is_valid'
    nextUrl = "http://tieba.baidu.com" + sel.css("a.next::attr(href)").extract()[0]
    print 'Next Page :' + nextUrl
    #self.insert(items)
    request = scrapy.Request(nextUrl, callback=self.parse)
    #return items
    return request
def parse(self, response):
    '''
    cmd = 'phantomjs constructDom.js "%s"' % response.url
    stdout, stderr = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    f = file('code.txt', 'w+')
    f.writelines(stdout)
    #print (stdout)
    sel = Selector(text=stdout)
    '''
    sel = Selector(response)
    csrfToken = sel.css("input#j-csrf::attr(value)").extract()[0].strip()
    name = "".join(sel.css('h1.m-source-title::text').extract()).strip()
    bookId = response.url.split("/")[-1]
    item = CartoonItem()
    item['name'] = "".join(sel.css('h1.m-source-title::text').extract()).strip()
    item['url'] = response.url
    item['hitNum'] = "".join(sel.css('div.g-cols--float>div.g-col:nth-of-type(1)>div.metadata:nth-of-type(2)::text').re(u'人气\:(.*)')).strip()
    searchObj = re.search(u'(.*)万', item['hitNum'])
    if searchObj:
        item['hitNum'] = int(float(searchObj.group(1)) * 10000)
    else:
        item['hitNum'] = int(item['hitNum'])
    item['collectionNum'] = -1
    item['likeNum'] = -1
    item['caiNum'] = -1
    item['webName'] = "网易漫画"
    item['crawlTime'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    commentApiUrl = "http://manhua.163.com/comment/" + bookId + "/comments?csrfToken=" + csrfToken + "&bookId=" + bookId + "&page=1"
    request = scrapy.Request(commentApiUrl, callback=self.moreparse)
    request.meta['item'] = item
    return request
def parse2(self, response):
    movie = response.meta['movie']
    sel = Selector(response)
    origin_url = response.url
    director = ''.join(sel.css('div#main-rt div.mod-datum p:nth-child(2) span::text').extract())
    starring = ''.join(sel.css('div#main-rt div.mod-datum p:nth-child(3) span::text').extract())
    detail = sel.css('div#main-rt div.mod-datum p.dirtext span:nth-child(2)::text').extract()[0]
    category = ''.join(sel.css('div#main-rt div.mod-datum p:nth-child(4) span::text').extract())
    play_count = sel.css('div.playInfo.crumbs div.rightBtn.fix a::text').extract()[0]
    print movie.name, '------->', origin_url, '------->', play_count
    if play_count:
        play_count = ''.join(play_count[3:].split(','))
    movie.origin_url = origin_url
    movie.director = director
    movie.starring = starring
    movie.detail = detail
    movie.category = category
    movie.play_count = play_count
    # if movie.play_count.strip() not in [0, '0']:
    #     for f in ['fluent', 'normal', 'high', 'super']:
    #         mp4_url = get_funtv_mp4(origin_url, f)
    #         if mp4_url:
    #             column_name = 'mp4_url_{}'.format(f)
    #             setattr(movie, column_name, mp4_url)
    #     movie.enable = 1
    db_session.add(movie)
    db_session.commit()
def parse_view(self, response):
    info('parsed_view ' + str(response))
    items = []
    sel_view = Selector(response)
    sel_goto = Selector(response)
    sel_fenlei = Selector(response)
    sites_view = sel_view.css('li div a[href*=view]')
    sites_goto = sel_goto.css('a[href*=gotoList]')
    sites_fenlei = sel_fenlei.css('span a[href*=fenlei]')
    #for site in sites_view:
    #    item = baikeSiteItem()
    #    item['url'] = site.css('::attr(href)')[0].extract()
    #    items.append(item)
    #for site in sites_goto:
    #    item = baikeSiteItem()
    #    item['url'] = site.css('::attr(href)')[0].extract()
    #    items.append(item)
    #for site in sites_fenlei:
    #    item = baikeSiteItem()
    #    item['url'] = site.css('::attr(href)')[0].extract()
    #    items.append(item)
    # extract() on a single selector returns the full href string, so no
    # extra [0] index is needed here
    for site in sites_view:
        url = self.domains[0] + site.css('::attr(href)')[0].extract()
        yield Request(url, callback=self.parse_word)
    for site in sites_goto:
        url = self.domains[0] + site.css('::attr(href)')[0].extract()
        yield Request(url, callback=self.parse_word)
    for site in sites_fenlei:
        url = self.domains[0] + site.css('::attr(href)')[0].extract()
        yield Request(url, callback=self.parse_word)
def parsePost(self, response):
    logging.info(response)
    sel = Selector(response)
    posts = sel.css(".messageList").xpath('./li')
    items = []
    if len(posts) == 0:
        return items
    topic = sel.css('.titleBar').xpath('./h1/text()').extract()[0].strip()
    url = response.url
    for post in posts:
        item = PostItemsList()
        if len(post.css('.userText')) > 0:
            item['author'] = post.css('.userText').xpath('./a/text()').extract()[0]
            item['author_link'] = response.urljoin(post.css('.userText').xpath('./a/@href').extract()[0])
        else:
            continue
        item['create_date'] = self.parseText(str=post.css('.DateTime').extract()[0])
        post_msg = self.parseText(str=post.css('.messageText').extract()[0])
        item['post'] = post_msg
        item['tag'] = 'rheumatoid arthritis'
        item['topic'] = topic
        item['url'] = url
        logging.info(post_msg)
        items.append(item)
    return items
def parse(self, response):
    #print response.url
    sel = Selector(response)
    # default to empty strings so the yield below cannot hit an unbound name
    title = ''
    content = ''
    date = ''
    tit_finders = ['.header h1', '.lede-headline', '.title', '#article-headline']
    for finder in tit_finders:
        try:
            title = sel.css('%s::text' % finder)[0].extract()
            break
        except Exception as e:
            pass
    con_finders = ['.body', '.article-body__content', '#article_body', '#article-body']
    for finder in con_finders:
        try:
            content = sel.css('%s' % finder)[0].extract()
            break
        except Exception as e:
            pass
    dat_finders = ['cite abbr', 'time', '.timestamp span']
    for finder in dat_finders:
        try:
            date = sel.css('%s::text' % finder)[0].extract()
            break
        except Exception as e:
            pass
    yield News(url=response.url, title=title, content=content, date=date)
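# A small helper sketch for the fallback pattern above: try each selector in turn and
# return the first non-empty match, instead of catching exceptions around extract()[0].
# The finder lists mirror the ones used in parse(); pass suffix='' for the raw-HTML case.
def first_css(sel, finders, suffix='::text'):
    for finder in finders:
        value = sel.css(finder + suffix).extract_first()
        if value:
            return value
    return ''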
def parse2(self, response):
    url = response.meta['url']
    driver = webdriver.PhantomJS(executable_path='/Users/hantianyan/phantomjs-1.9.8-macosx/bin/phantomjs',
                                 service_args=self.service_args)
    driver.get(response.url)
    sel = Selector(text=driver.page_source)
    item = MoreinfoItem()
    release_list = sel.css('.extra-info').xpath('./text()').extract()
    if len(release_list) > 0:
        item['release_time'] = release_list[0].encode('utf-8').split(':')[2]
    item['popularity'] = sel.css('#star_greet').xpath('./@style').re('\d+')[0].encode('utf-8')
    item['comfort'] = sel.css('#star_comfort').xpath('./@style').re('\d+')[0].encode('utf-8')
    nearsel = sel.css('.nearbox')
    item['transport'] = 0
    transport_list = nearsel.xpath(".//div[@data-attr='traffic']").css('.p_star_s').xpath('./@style').re('\d+')
    if len(transport_list) > 0:
        item['transport'] = transport_list[0].encode('utf-8')
    item['hospital'] = 0
    hospital_list = nearsel.xpath(".//div[@data-attr='hospital']").css('.p_star_s').xpath('./@style').re('\d+')
    if len(hospital_list) > 0:
        item['hospital'] = hospital_list[0].encode('utf-8')
    item['education'] = 0
    education_list = nearsel.xpath(".//div[@data-attr='school']").css('.p_star_s').xpath('./@style').re('\d+')
    if len(education_list) > 0:
        item['education'] = education_list[0].encode('utf-8')
    item['business'] = 0
    business_list = nearsel.xpath(".//div[@data-attr='commerce']").css('.p_star_s').xpath('./@style').re('\d+')
    if len(business_list) > 0:
        item['business'] = business_list[0].encode('utf-8')
    sql = "update second_house_table set release_time = '%s',popularity = '%s',comfort = '%s',transport = '%s',hospital = '%s',education = '%s',business = '%s' WHERE url = '%s' " % \
        (item['release_time'], item['popularity'], item['comfort'], item['transport'], item['hospital'], item['education'], item['business'], url)
    self.cursor.execute(sql)
    self.db.commit()
def parse_content(self, response):
    sel = Selector(response)
    item = JobpostcrawlingItem()
    item["company_name"] = sel.xpath('//div[@class="ad-content-header"]/h1/text()').extract()[0]
    if match(r".*organisation-profile", response.url):
        # procedures for company pages
        target = sel.css(".hreview-aggregate > p")
        target.extend(sel.css(".hreview-aggregate > ul > li"))
        item["company_description"] = "\n".join("".join(p.xpath(".//text()").extract()) for p in target)
        item["url"] = response.url
        yield item
    else:
        # procedures for job post pages
        try:
            save_content = sel.xpath('//div[@id="save-content"]/a/text()').extract()[0]
            organisation_link = sel.xpath('//p[@class="organisation-link"]/a/text()').extract()[0]
            item["job_name"] = sel.xpath('//div[@class="main-content-core"]/h2/text()').extract()[0]
            target = sel.css(".hreview-aggregate > p")
            target.extend(sel.css(".hreview-aggregate > ul > li"))
            item["job_description"] = "\n".join("".join(p.xpath(".//text()").extract()) for p in target)
            item["url"] = response.url
            yield item
        except:
            pass
def parse(self, response):
    sel = Selector(response)
    links_to_annonces = sel.css('div[class="list-lbc"]').xpath('a/@href').extract()
    links_to_annonces = [a.encode('ascii').rstrip() for a in links_to_annonces]
    print response.url
    for link in links_to_annonces:
        # self.parseAnnonce(link)
        # print link
        item = AnnonceItem()
        yield Request(urlparse.urljoin(response.url, link), meta={'item': item}, callback=self.parse_annonce)
        # if 1: break
    # next page
    link_url = None
    links = sel.css('li[class="page"]')
    for link in links:
        link_text = link.xpath('a/text()').extract()
        print link_text
        if len(link_text) and link_text[0].find('suivante') != -1:
            link_urls = link.xpath('a/@href').extract()
            if len(link_urls):
                link_url = link_urls[0]
    if link_url:
        yield Request(urlparse.urljoin(response.url, link_url), meta={}, callback=self.parse)
def parse(self, response):
    if response.status == 404:
        # On a 404 we can return right away, nothing to process
        logging.info("response.status:" + str(response.status))
        return
    select = Selector(response)
    if "data" in response.meta:
        isNextPage = response.meta["data"]
    else:
        isNextPage = "firstPage"
    question_id = self.digitalPattern.findall(response.url)[0]
    # Only grab the title on the first page
    if isNextPage == "firstPage":
        item = TobosuItem()
        item["question_id"] = question_id
        item["question_title"] = select.css(".aqq-title").xpath(".//h1/text()").extract()[0]
        try:
            item["question_description"] = select.css(".des").extract()[0][15:-4].strip()
        except Exception, e:
            item["question_description"] = ""
            print e
        try:
            big_category = ",".join(select.css(".recom-lab").xpath(".//a/text()")[1:].extract())
        except Exception, e:
            big_category = ""
            print e
def parse_item(self, response):
    # Extract the data into Items, mainly using XPath and CSS selectors
    items = []
    sel = Selector(response)
    base_url = get_base_url(response)
    sites_even = sel.css("table.tablelist tr.even")
    for site in sites_even:
        item = TutorialItem()
        item["name"] = site.css(".l.square a").xpath("text()").extract()
        relative_url = site.css(".l.square a").xpath("@href").extract()[0]
        item["detailLink"] = urljoin_rfc(base_url, relative_url)
        item["catalog"] = site.css("tr > td:nth-child(2)::text").extract()
        item["workLocation"] = site.css("tr > td:nth-child(4)::text").extract()
        item["recruitNumber"] = site.css("tr > td:nth-child(3)::text").extract()
        item["publishTime"] = site.css("tr > td:nth-child(5)::text").extract()
        items.append(item)
        # print repr(item).decode("unicode-escape") + '\n'
    sites_odd = sel.css("table.tablelist tr.odd")
    for site in sites_odd:
        item = TutorialItem()
        item["name"] = site.css(".l.square a").xpath("text()").extract()
        relative_url = site.css(".l.square a").xpath("@href").extract()[0]
        item["detailLink"] = urljoin_rfc(base_url, relative_url)
        item["catalog"] = site.css("tr > td:nth-child(2)::text").extract()
        item["workLocation"] = site.css("tr > td:nth-child(4)::text").extract()
        item["recruitNumber"] = site.css("tr > td:nth-child(3)::text").extract()
        item["publishTime"] = site.css("tr > td:nth-child(5)::text").extract()
        items.append(item)
        # print repr(item).decode("unicode-escape") + '\n'
    info("parsed " + str(response))
    return items
def parse(self, response):
    sel = Selector(response)
    item = FeatureItem()
    item["title"] = sel.css(".title::text").extract()
    item["link"] = response.url
    item["reviews"] = sel.css(".review-text::text").extract()
    return item
def parse_img_link(self, response):
    selector = Selector(response=response)
    homepage_id = self.homepage_id
    page_urls = filter(lambda x: "/s/" in x, selector.css("a::attr(href)").extract())
    for url in set(page_urls):
        yield scrapy.Request(url=url, cookies=self.cookies, callback=self.parse_img_url)
def parse(self, response):
    selector = Selector(response)
    for post in selector.css('article.post'):
        print('---------------------------------')
        loader = ItemLoader(BlogPost(), post)
        loader.add_css('name', '.entry-title > a::text')
        yield loader.load_item()
def parse_more_topics(self, response):
    logging.warn('FOUND ADDED TOPICS')
    json_response = json.loads(response.text)
    html_selector = Selector(text=json_response['value']['html'])
    item = response.meta['item']
    item['topics'] = ','.join(
        [item['topics']] + list(html_selector.css(self.more_topic_name_css).extract()))
    yield item
def parse_proxy(self, response): # url = "https://hidemyna.me/en/proxy-list/?maxtime=1000&type=h&start=0" # self.driver.get(url) # # time.sleep(20) # element = WebDriverWait(self.driver, 30).until( # EC.presence_of_element_located((By.CSS_SELECTOR, "div.allcountries__bl"))) # html = self.driver.page_source # sel = Selector(text=html) # all_countries = sel.css('div.allcountries__bl label span.flag-icon::attr(class)').extract() # self.logger.info(html) # print([country.split('-icon-')[-1] for country in all_countries]) # self.logger.info(all_countries) countries = {} for i in range(7): skip = i * 64 url = 'https://hidemyna.me/en/proxy-list/?country=UADEARALINBGBRBDCACZUSGBHUIDNLRUESFR&type=h&maxtime=1000&start={}#list'.format( skip) print(url) self.driver.get(url) element = WebDriverWait(self.driver, 30).until( EC.presence_of_element_located( (By.CSS_SELECTOR, "div.allcountries__bl"))) html = self.driver.page_source sel = Selector(text=html) for r in sel.css('tbody tr'): p = Proxy() proxy = r.xpath("td[1]/text()").extract_first() port = r.xpath("td[2]/text()").extract_first() country = r.xpath("./td[3]").css('div::text').extract_first() country_alt_raw = r.xpath("./td[3]").css( 'span::attr(class)').extract_first() # print(country_alt_raw) if country_alt_raw: country_alt = country_alt_raw.split('icon-')[-1] if country_alt in countries: countries[country_alt] += 1 else: countries[country_alt] = 0 else: country_alt = None speed = r.css('div.bar p::text').extract_first() p_type = r.xpath("./td[5]/text()").extract_first() p['proxy'] = proxy p['port'] = port p['country'] = country p['country_alt'] = country_alt p['speed'] = speed p['protocol'] = p_type if p_type and country_alt in self.countries_max: if countries[country_alt] <= self.countries_max[ country_alt]: yield p time.sleep(10) self.logger.info(countries)
def search_xls_link_inpage(self, response):
    sel = Selector(response)
    url_page = sel.css('#main-interno ul li a::attr(href)').extract()
    if len(url_page) == 0:
        return Request(self.all_links.pop(0), callback=self.open_operational_data_label)
    new_url = self.url_base_2 + url_page[0]
    return Request(new_url, callback=self.parse_xls)
def parse(self, response):
    sel = Selector(response)
    item = ProblemItem()
    item['origin_oj'] = 'sdut'
    item['problem_id'] = self.problem_id
    item['problem_url'] = response.url
    item['title'] = sel.xpath('//center/h2/text()').extract()[0]
    item['description'] = sel.css('.pro_desc').extract()[0]
    item['input'] = sel.css('.pro_desc').extract()[1]
    item['output'] = sel.css('.pro_desc').extract()[2]
    item['time_limit'] = sel.xpath('//a/h5/text()').re('T[\S*\s]*s')[0][12:]
    item['memory_limit'] = \
        sel.xpath('//a/h5/text()').re('M[\S*\s]*K')[0][14:]
    item['sample_input'] = sel.xpath('//div[@class="data"]/pre').extract()[0]
    item['sample_output'] = sel.xpath('//div[@class="data"]/pre').extract()[1]
    item['update_time'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    return item
def parse(self, response):
    pageSelector = Selector(response)
    objects = pageSelector.css('.location')
    objects.extract()
    for index, object in enumerate(objects):
        objectUrl = Extractor.url(response, object, 'h2.heading-sm > a::attr(href)')
        yield scrapy.Request(objectUrl, self.parse_object)
def parse_img_url(self, response):
    selector = Selector(response=response)
    image_link = selector.css("#img::attr(src)").extract()[0]
    item = ExhentaiItem()
    filename = image_link.strip().split("/")[-1]
    item['image_urls'] = image_link
    item["image_paths"] = os.path.join(self.title, filename)
    item["image_title"] = self.title
    yield item
def get_xls(self, response):
    """The link to generate the XLS is hidden in a js script."""
    sel = Selector(response)
    res = sel.css('script').re('"ExportUrlBase":"(.*?)",')
    if res:
        res_str = self.base_url + res[0].replace('\\u0026', '&') + 'Excel'
        return Request(res_str, self.parse_xls)
    return None
def parse_item(self, response):
    selector = Selector(response)
    # select the official link from the tabs
    url = selector.xpath(
        '//div[@id="infos"]/p[@class="officiel"]/a/@href').extract()
    if len(url) == 1:
        # we found it, follow it
        item = response.meta['item']
        request = Request("http://www.boamp.fr" + url[0], callback=self.parse_official_link)
        request.meta['item'] = item
        return request
    else:
        # it does not exist, so try our luck with the current page, which is often
        # the one in the official format
        if len(selector.css(".officielOnly").extract()) == 1:
            item = response.meta['item']
            selector = Selector(response)
            html = selector.css("#avisOfficiel").extract()
            references = selector.css("#references").extract()
            return self.extract_data(html, references, response.url, item)
def _past_jobs_processor(node_list):
    if not node_list:
        return
    selector = Selector(text=node_list[0])
    title = selector.css('.title::text').extract()
    company_url = selector.xpath('//a/@href').extract()
    start = selector.xpath('//div[@class="cell date"][1]/text()').extract()
    end = selector.xpath('//div[@class="cell date"][2]/text()').extract()
    return zip(title, company_url, start, end)
def parse_recommended_products(self, response):
    # Scrape similar products
    sel = Selector(response)
    url_paths = sel.css('article.top-products .content>a::attr(href)').extract()
    for url_path in url_paths:
        request = WebdriverRequest(url_path, callback=self.parse_product)
        self.prep_product_tagging(request, response.meta.get('item'))
        yield request
def parse(self, response):
    # path_page_activate = 'div.pagcomment span.active'
    # path_next_page_numb = 'div.pagcomment span.active + a::text'
    path_list_QA = 'li.comment_ask'
    path_comment_id = 'li.comment_ask::attr(id)'
    path_object_id = 'div.wrap_comment::attr(detailid)'
    if response.css(path_object_id).extract_first() is not None:
        self.objectid = response.css(path_object_id).extract_first()
    str_numb_page = 0
    try:
        str_numb_page = response.css('ul.listcomment div.pagcomment span')[-2].css('::text').extract_first()
    except Exception:
        # `except ():` would catch nothing, so fall back to one page here
        str_numb_page = 1
    for page_numb in range(1, int(str_numb_page) + 1):
        try:
            formdata = {
                'core[call]': 'cmt.listpaging',
                'objectid': self.objectid,
                'objecttype': '6',
                'pageindex': str(page_numb),
                'order': '1',
            }
            print("formdata: ")
            print(formdata)
            res_script = requests.post(self.url_api_list_comment, data=formdata).text
            struct_text = res_script.replace(self.start_replaced_str, '').replace(self.end_replaced_str, '')
            selector = Selector(text=struct_text)
            for qa in selector.css(path_list_QA):
                if len(qa.css('div.listreply div.reply')) >= 1:
                    yield {
                        'id_cmt': qa.css(path_comment_id).extract_first(),
                        'question': qa.css('div.question::text').extract_first(),
                        # 'answer': ''.join(qa.css('div.listreply div.reply')[0].css('div.cont::text').extract()),
                        'answers': [
                            ''.join(reply.css('div.cont::text').extract())
                            for reply in qa.css('div.listreply div.reply')
                        ],
                        # 'time': qa.css('li.comment_ask a.time::text').extract_first(),
                        # 'user_name': qa.css('li.comment_ask div.rowuser a strong::text').extract_first(),
                        # 'replier_name': qa.css('li.comment_ask div.rowuser a strong::text').extract_first(),
                    }
                else:
                    continue
        except Exception as e:
            print(e)
def crawl_ips():
    # Crawl the free IP proxies from xicidaili
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36"
    }
    for i in range(2354):
        re = requests.get("http://www.xicidaili.com/nn/{0}".format(i), headers=headers)
        # print(re.text)
        selector = Selector(text=re.text)
        # all_trs = selector.css("#ip_list tr[class]:not([class='subtitle'])")
        all_trs = selector.css("#ip_list tr")
        ip_list = []
        for tr in all_trs[1:]:
            speed_str = tr.css(".bar::attr(title)").extract()[0]
            if speed_str:
                speed = float(speed_str.split("秒")[0])
            # ip = tr.css("td:nth-child[2]::text").extract()[0]  # raises an error
            all_text = tr.css("td::text").extract()
            ip = all_text[0]
            port = all_text[1]
            proxy_type = all_text[5]
            # lis = (ip, port, speed, proxy_type)
            # lis = list(map(lambda a: str(a) if type(a) != 'str' else a, (ip, port, speed, proxy_type)))
            # print(':'.join(lis))
            ip_list.append((ip, port, speed, proxy_type))
        # print(all_trs)
        # for tr in all_trs:
        #     # print(tr.extract())
        #     # ip = tr.xpath('/td[2]/text()').extract()
        #     # port = tr.xpath('/td[3]/text()').extract()
        #     # http_type = tr.xpath('/td[6]/text()').extract()
        #     ip = tr.css('td:nth-child(2)::text').extract()[0]
        #     port = tr.css('td:nth-child(3)::text').extract()[0]
        #     speed = tr.css('td:nth-child(6)::text').extract()[0]
        #     proxy_type = tr.css('td:nth-child(6)::text').extract()[0]
        #     # print(ip, port)
        #     # print(':'.join((str(ip), str(port), str(http_type))))
        #     print(':'.join((ip, port, speed, proxy_type)))
        #     ip_list.append((ip, port, speed, proxy_type))
        for ip_info in ip_list:
            print(": ".join(str(field) for field in ip_info))
            cursor.execute(
                "insert into proxy_ip(ip, port, speed, proxy_type) VALUES ('{0}','{1}',{2},'{3}')"
                .format(ip_info[0], ip_info[1], ip_info[2], ip_info[3]))
            # string values must be wrapped in single quotes
            conn.commit()
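# A minimal sketch (assumption, not the original code): the same insert written as a
# parameterized query, so the driver handles quoting instead of str.format(). It assumes
# a DB-API driver that uses %s placeholders (e.g. pymysql) and the same module-level
# cursor/conn objects used in crawl_ips().
def store_ips(ip_list):
    for ip, port, speed, proxy_type in ip_list:
        cursor.execute(
            "insert into proxy_ip(ip, port, speed, proxy_type) VALUES (%s, %s, %s, %s)",
            (ip, port, speed, proxy_type))
    conn.commit()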
def parse_new_page(self, response):
    #for sel in response.xpath('//ul/li'):
    #    title = sel.xpath('a/text()').extract()
    #    link = sel.xpath('a/@href').extract()
    #    desc = sel.xpath('text()').extract()
    #    print title, link, desc
    item = ExxNewsItem()
    sel = Selector(response)
    title = sel.css("#blog > div > h2").extract()
    content = sel.css('''#blog > div > div''').extract()
    print title, content
    item["url"] = response.url
    item['title'] = self.process_item(title)
    item['content'] = self.process_item(content)
    yield item
def parse_item(response):
    sel = Selector(response)
    url = response.request.url
    if re.match(r'.*?/\d{4}-\d{2}-\d{2}/.*?html', url):
        print('---------------------')
        print(url)
        content = response.xpath(
            '/html/body/div[1]/div[2]/div[1]/article/div[1]/p//text()'
        ).extract()
        print(content)
        # Remove the editor byline
        editor = response.xpath(
            '//*[@class="-articleeditor"]/text()').extract_first()
        if editor:
            content.remove(editor)
        publish_time = sel.re(r'\d{4}-\d{2}-\d{2}.*?\d{2}:\d{2}:\d{2}')[0]
        print(publish_time)
        if ' ' in publish_time:
            publish_time = publish_time.replace(' ', '')
        if content:
            item = NewsItem(
                domainname='http://ti.tibet3.com/',
                chinesename='tibet3',
                url=sel.root.base,
                title=sel.css('.entry-header > h1:nth-child(1)::text').extract_first(),
                subtitle=sel.css('.sub::text').extract_first(),
                language='藏文',
                encodingtype='utf-8',
                corpustype='网络',
                timeofpublish=publish_time,
                content=''.join(content),
                author=None)
            print(item.get("title", None))
            print(item.get("timeofpublish", None))
            print(item.get("source", None))
            print(item.get("author", None))
            # yield item
            # item = judge_time_news(item)
            # if item:
            yield item
def parse_download(self, response): '''The download page (usually) offers multiple download links, we want just the update.''' sel = Selector(response) link_notes = None link_bios = None links = sel.css('a').xpath('@href').extract() for link in links: ### Release notes are cool too, though they are in PDF form. if link.find("ReleaseNotes") >= 0: link_notes = link if link.find(".BIO") >= 0: link_bios = link if link_bios is None: return item = IntelBiosUpdatePageItem() link_bios = link_bios[link_bios.find("httpDown=") + len("httpDown="):link_bios.find(".BIO") + len(".BIO")] item['bios_url'] = link_bios item['notes_url'] = link_notes if link_notes is not None else "" ### Supported products is nice too. products = [] products_sel = sel.css('div#prodos') if len(products_sel) > 0: products_sel = products_sel.xpath( ".//table/tr/td/text()").extract() for product in products_sel: products.append("".join( [c for c in product if c not in ['\t', '\n', '\r']])) item['products'] = products item['attrs'] = dict(response.meta['attrs']) item['item_id'] = item['attrs']['item_id'] #yield item yield Request(url=link_bios, callback=self.parse_binary, meta={"item": item}) pass
def parse_one_page(html):
    sel = Selector(html)
    books_lists = sel.css(' div ul li')
    books_lists = books_lists[16:]
    for book in books_lists:
        book_ids = book.xpath("a/@href").extract_first().strip().replace("https://book.douban.com/subject/", '').replace('/', '')
        book_img_url = book.xpath("a[@class='cover']/img/@src").extract_first().strip()
        id_books.append(book_ids)
        url_img_books.append(book_img_url)
    books = sel.css('ul li div')
    book2 = books[:20]
    for book in book2:
        book_title = book.xpath("h2/a/text()").extract_first()
        book_publis = book.xpath("p[@class='color-gray']/text()").extract_first().replace(' ', '').strip()
        book_intro = book.xpath("p[@class='detail']/text()").extract_first().replace(" ", "").strip()
        book_2 = book_publis.split('/')
        bookauthor = book_2[0]
        bookpub = book_2[-2]
        book_date = book_2[-1]
        title_books.append(book_title)
        author_books.append(bookauthor)
        publis_books.append(bookpub)
        date_books.append(book_date)
        intro_books.append(book_intro)
        # print(bookauthor+bookpub+book_date)
    books1 = books[20:]
    for book in books1:
        book_title = book.xpath("h2/a/text()").extract_first()
        book_publis = book.xpath("p[@class='color-gray']/text()").extract_first().replace(' ', '').strip()
        book_2 = book_publis.split('/')
        bookauthor = book_2[0]
        bookpub = book_2[-2]
        book_date = book_2[-1]
        book_intro = book.xpath("p[3]/text()").extract_first().replace(" ", "").strip()
        title_books.append(book_title)
        author_books.append(bookauthor)
        publis_books.append(bookpub)
        date_books.append(book_date)
        intro_books.append(book_intro)
class DataclassLoader(ItemLoader, DataclassHelper):
    """
    Using the `ItemLoader` pollutes a dataclass declaration for a scrapy item.
    See: https://docs.scrapy.org/en/latest/topics/loaders.html#working-with-dataclass-items

    This loader lets you keep dataclasses pure and frozen. See `items.py`.

    In subclasses, you must override the `dataclass()` abstract property and
    define all methods needed to `populate()` objects, see the method below.
    """
    default_output_processor = TakeFirst()

    # The `ItemLoader` uses a mutable dict under the hood.
    default_item_class = dict

    @property
    def response(self):
        return self.context['response']

    def __call__(self, response):
        self.update(response)
        self.populate()
        return self.load_item()

    def update(self, response):
        self.selector = Selector(response=response)
        self.context.update(selector=self.selector)
        self.context.update(response=response)

    def populate(self):
        """
        For each of `self.field_names`, calls the `self.<field_name>()` method to
        get the field value and stores it internally for loading the item later.

        NOTE: using `replace_value()` instead of `add_value()` keeps the first
        item of the internal list up to date, so using `TakeFirst` as the
        `default_output_processor` works correctly.
        """
        for name in self.field_names:
            self.replace_value(name, getattr(self, name)())

    def load_item(self):
        return self.dataclass(**super().load_item())

    def css_response(self, query):
        """
        Builds an `HtmlResponse` from the HTML text selected with the CSS
        `query` from `self.response`. This allows calling a `DataclassLoader`
        with a nested response, similar to `ItemLoader.nested_css()`, but
        without instantiating the current class.
        """
        return html(self.selector.css(query).get())
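# A usage sketch for DataclassLoader (hypothetical item and selectors, and it assumes
# DataclassHelper derives `field_names` from the dataclass fields): one method per
# field; the `dataclass` property tells load_item() which class to build.
from dataclasses import dataclass

@dataclass(frozen=True)
class ArticleItem:
    title: str
    url: str

class ArticleLoader(DataclassLoader):
    @property
    def dataclass(self):
        return ArticleItem

    def title(self):
        # populate() calls this by name and stores the value for loading
        return self.selector.css('h1::text').get()

    def url(self):
        return self.response.url

# item = ArticleLoader()(response)  # returns a frozen ArticleItem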
def parse_dir_name(self, response):
    """TODO: Docstring for parse_dir_name.

    :response: TODO
    :returns: TODO

    """
    sel = Selector(response)
    # format: 40,876
    return sel.css('.forum_dir_info li:last-child a::text').extract()[0].strip()
def parse(self, response):
    paragraphs = json.loads(response.body_as_unicode())["aaData"]
    for paragraph, *_ in paragraphs:
        selector = Selector(text=paragraph)
        url = selector.css("p a ::attr(href)").extract_first()
        text = selector.css("p strong ::text")
        is_extra_edition = text.extract_first().startswith("Suplemento")
        date = text.re_first("\d{1,2} de \w+ de \d{4}")
        date = parse(date, languages=["pt"]).date()
        yield Gazette(
            date=date,
            file_urls=[url],
            is_extra_edition=is_extra_edition,
            territory_id=self.TERRITORY_ID,
            power="executive_legislature",
            scraped_at=dt.datetime.utcnow(),
        )
def img_url_from_page(url):
    html = requests.get(url).text
    sel = Selector(text=html)
    img_names = sel.css('td a img::attr(src)').extract()
    img_names = [img_name for img_name in img_names]
    return img_names
def parse(self, response):
    selector = Selector(response)
    base_url = get_base_url(response)
    form_urls = selector.css(
        'div[class^="floor js"] li[class^="dir-item"] a[class="fwb"]::attr(href)'
    ).extract()
    for url in form_urls[:]:
        form_url = clean_url(base_url, url, response.encoding)
        yield Request(url=form_url, callback=self.parse_classify_form)
def parse(self, response):
    print("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa")
    print(response.request.headers['User-Agent'])
    print("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa")
    resp = Selector(text=self.html)
    for row in resp.css('tbody tr[role="row"]'):
        yield {
            'fund_name': row.css('.text-left a::text').get(),
            'NAV': row.css('td:nth-child(10)::text').get()
        }
def store_page(url, page):
    sel = Selector(text=page)
    title = sel.css('title ::text').extract_first()
    main_html = sel.css('div[role=main]').extract_first()
    page_dict = {
        'objectID': url,
        'url': url,
        'fileType': 'html',
        'title': title,
        'source': specificGoogleSitesUrl,
        'service': 'gsites',
        'content': main_html,
        'organisationID': organisationID,
        'modified': calendar.timegm(time.gmtime()),  # Not ideal!!
        'created': calendar.timegm(time.gmtime()),  # Definitely not right!!!!
    }
    pp.pprint(page_dict)
    algoliaScrapedIndex.save_object(page_dict)
    return main_html
def parse(self, response):
    sel = Selector(response)
    item = ProblemItem()
    item['origin_oj'] = 'hdu'
    item['problem_id'] = self.problem_id
    item['problem_url'] = response.url
    item['title'] = sel.xpath('//h1/text()').extract()[0]
    item['description'] = sel.css('.panel_content').extract()[0]
    item['input'] = sel.css('.panel_content').extract()[1]
    item['output'] = sel.css('.panel_content').extract()[2]
    item['time_limit'] = \
        sel.xpath('//b/span/text()').re('T[\S*\s]*S')[0][12:]
    item['memory_limit'] = \
        sel.xpath('//b/span/text()').re('Me[\S*\s]*K')[0][14:]
    item['sample_input'] = sel.xpath('//pre').extract()[0]
    item['sample_output'] = sel.xpath('//pre').extract()[1]
    item['update_time'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    return item
def parse(self, response):
    detail_page_links = []
    for html_text in response.css('record *::text').getall():
        record = Selector(text=html_text)
        url = record.css('a::attr(href)').get()
        UID = url.split('/')[-1][:-5] + '_' + url.split('/')[-4] + url.split('/')[-3] + url.split('/')[-2]
        detail_page_links.append(url)
        yield {
            'UID': UID,
            'title': record.css('a::attr(title)').get(),
            'date': record.css('b::text').get(),
            'FileNumber': None,
            'text length': 0,
            'url': url,
            'crawl state': 'half'
        }
    for url in detail_page_links:
        yield scrapy.Request(url=url, callback=self.parse_content)
def get_json(js):
    # Post-process the string: the returned job description is JSON data that
    # embeds HTML, so it needs some extra handling
    if js:
        json_content = js.get('zpData').get('html')
        content = Selector(text=json_content)
        content_text = content.css(".detail-bottom-text::text").re(
            "[\u4e00-\u9fa5_a-zA-Z0-9]+")
        return content_text
    else:
        print("未获取数据")
def parse(self, response):
    # Get the article URLs from the list page and hand them to the detail
    # parser that extracts the individual fields
    post_nodes = response.css("#archive .floated-thumb .post-thumb a").extract()
    for post_node in post_nodes:
        post_node = Selector(text=post_node)
        image_url = post_node.css("img::attr(src)").extract_first("")
        post_url = post_node.css("::attr(href)").extract_first("")
        yield Request(url=parse.urljoin(response.url, post_url),
                      meta={"front_image_url": image_url},
                      callback=self.parse_detail)
    # Get the next page URL and hand it back to scrapy for downloading
    next_urls = response.css(".next.page-numbers::attr(href)").extract_first("")
    if next_urls:
        yield Request(url=parse.urljoin(response.url, next_urls), callback=self.parse)
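# A minimal sketch of the same list-page loop iterating the selectors directly, so each
# node is not re-serialized to HTML and wrapped in a new Selector. Same selectors and
# callbacks as parse() above.
def parse_list(self, response):
    for post_node in response.css("#archive .floated-thumb .post-thumb a"):
        image_url = post_node.css("img::attr(src)").extract_first("")
        post_url = post_node.css("::attr(href)").extract_first("")
        yield Request(url=parse.urljoin(response.url, post_url),
                      meta={"front_image_url": image_url},
                      callback=self.parse_detail)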
def parse_news_item(self, response):
    sel = Selector(response)
    news = NewsItem()
    news['url'] = response.url
    news['source'] = SOURCE_ID_NIC
    news['title'] = sel.css('.newstitle > h1::text').extract()[0]
    news['content'] = normalize_content(sel.css('#contentText ::text').extract())
    # The article metadata is helpfully placed in separate elements,
    # so no regex is needed here
    news['author'] = '未知'
    # A u'\xa0' may show up here and needs to be normalized away
    news['publisher'] = normalize_content(sel.css('.newstitle > span > b:nth-child(3)::text').extract()[0])
    ctime_str = sel.css('.newstitle > span > b:nth-child(1)::text').extract()[0]
    news['ctime'] = strptime_helper(ctime_str, '%Y/%m/%d')
    return news