def roadwork_data_parse(self, response):
    jdata = json.loads(response.body)
    events = jdata[u'data']
    for event in events:
        if len(event) < 1:  # skip empty event records before touching their fields
            continue
        item = MyspiderItem()
        enc_event_text = event[u'remark'].strip().replace('\n', ' ').replace('\r', '').encode('utf-8')
        x = event[u'coor_x']
        y = event[u'coor_y']
        item['ID'] = event[u'eventid']
        item['POSTFROM'] = u'福建高速公路'
        item['CONTENT'] = enc_event_text
        item['TITLE'] = u'LOCATION AT {' + str(y) + u' ,' + str(x) + u'}'
        item['DIRECTION'] = (event[u'occplace'] + event[u'startnodename'] + u'-' + event[u'endnodename']).encode('utf-8')
        item['POSTDATE'] = event[u'intime'].encode('utf-8')
        item['EVENTTYPE'] = u'路况施工'
        item['START_TIME'] = event[u'occtime'].encode('utf-8')
        item['END_TIME'] = event[u'planovertime']
        item['COLLECTDATE'] = datetime.datetime.today().strftime('%Y-%m-%d')
        item['REF'] = self.start_urls[0][-1:] + '4'
        yield item
def parse(self, response):
    logging.info("*" * 100)
    logging.info(u"spider starting")
    li_list = response.xpath("//ul[@class='wp-list clearfix']//li")
    for li in li_list:
        small_url = li.xpath(".//div[@class='pic']/a/img/@src").extract_first()
        item = MyspiderItem()
        item['image_urls'] = [small_url]
        detail_href = li.xpath(".//div[@class='pic']/a/@href").extract_first().replace("http", "https")
        yield scrapy.Request(
            url=detail_href,
            callback=self.parse_detail,
            meta={'item': item},
        )
    next_url = response.xpath(u"//a[text()='下一页1']/@href").extract_first()
    if next_url is not None:
        next_url = 'https://meizitu.com/a/' + next_url
        logging.info("*" * 100)
        logging.info("starting a 10-minute sleep")
        logging.info(next_url)
        time.sleep(600)  # note: time.sleep() blocks the whole Twisted reactor while it waits
        logging.info("sleep over, resuming crawl")
        logging.info("*" * 100)
        yield scrapy.Request(next_url, callback=self.parse)
    else:
        logging.info("=" * 100)
        logging.info("--------->spider close<---------")
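# The time.sleep(600) above stalls the entire Twisted reactor, pausing every
# in-flight request. A non-blocking alternative is to let Scrapy pace the
# requests itself. A minimal sketch, assuming the sleep was meant as politeness
# between page fetches (DOWNLOAD_DELAY and AUTOTHROTTLE_ENABLED are standard
# Scrapy settings; the class and its name are illustrative, not the author's spider):
import scrapy

class PacedSpider(scrapy.Spider):
    name = 'paced_example'  # hypothetical spider name
    custom_settings = {
        'DOWNLOAD_DELAY': 600,         # seconds Scrapy waits between requests
        'AUTOTHROTTLE_ENABLED': True,  # adapt the delay to observed latency
    }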
def parse(self, response):
    jrlist = response.xpath("//div[@class='j-r-list']/ul/li")
    for jrlistli in jrlist:
        author = jrlistli.xpath(".//div[@class='j-list-user']//div[@class='u-txt']//a/text()").get()
        content = jrlistli.xpath(".//div[@class='j-r-list-c']//div[@class='j-r-list-c-desc']//a/text()").get()
        item = MyspiderItem(author=author, content=content)
        yield item
    next_url = response.xpath("//div[@class='j-page']//a[@class='pagenxt']/@href").get()
    if next_url is None:  # the last page has no "next" link
        return
    if int(next_url) >= 50:  # stop after page 50
        return
    next_url_full = "http://www.budejie.com/text/" + next_url
    yield scrapy.Request(next_url_full, callback=self.parse, dont_filter=True)
def parse_item(self, response):
    item = MyspiderItem()
    selector = Selector(response)
    title = selector.xpath('//h4/em/text()').extract()[0]
    address = selector.xpath('//p/span[@class="pr5"]/text()').extract()[0].strip()
    price = selector.xpath('//*[@id="pricePart"]/div[1]/span/text()').extract()[0]
    lease_type = selector.xpath('//*[@id="introduce"]/li[1]/h6/text()').extract()[0]
    suggestion = selector.xpath('//*[@id="introduce"]/li[2]/h6/text()').extract()[0]
    bed = selector.xpath('//*[@id="introduce"]/li[3]/h6/text()').extract()[0]
    item['title'] = title
    item['address'] = address
    item['price'] = price
    item['lease_type'] = lease_type
    item['suggestion'] = suggestion
    item['bed'] = bed
    item['crawled_time'] = self.crawled_time
    yield item
def fill_in_items(self, response):
    # parse the JSON payload and fill one item per event row
    data = json.loads(response.body)
    real_data = data[u'roadEvents'][u'roadEvents']
    strnow = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    for row in real_data:
        item = MyspiderItem()  # one fresh item per row, not one shared instance
        item['ID'] = row[u'eventId']
        item['ROADNAME'] = row[u'roadName'].encode('utf-8')
        item['COLLECTDATE'] = strnow
        item['EVENTTYPE'] = row[u'eventType']
        item['DIRECTION'] = row[u'dealCase'].encode('utf-8')
        item['START_TIME'] = row[u'occurTime']
        item['END_TIME'] = row[u'endTime']
        item['CONTENT'] = row[u'description'].strip().replace('\n', ' ').replace('\r', '').encode('utf-8')
        item['TITLE'] = (row[u'roadName'] + u'-location at: ' + row[u'lonlatData']).encode('utf-8')
        item['POSTDATE'] = strnow
        item['POSTFROM'] = u'北京市路政局公路出行信息服务站'
        item['REF'] = 'http://glcx.bjlzj.gov.cn/bjglwww/index.shtml'
        yield item
def parse(self, response):
    rows = response.xpath('//div[@class="mw-parser-output"]//tr')
    for row in rows:
        try:
            cells = row.xpath('td')
            newlist = []
            for j in range(4):
                if j == 0:
                    # the first cell holds plain text (the dex number)
                    newlist.append([cells[j].xpath('text()').extract()[0].strip('\n')])
                else:
                    # the remaining cells hold names inside <a> tags
                    newlist.append(cells[j].xpath('a//text()').extract())
            pmItem = MyspiderItem()  # fresh item per row, not one shared instance
            pmItem["pm_dexId"] = newlist[0][0]
            pmItem["pm_name_cn"] = newlist[1][0]
            pmItem["pm_name_jp"] = newlist[2][0]
            pmItem["pm_name_en"] = newlist[3][0]
            pmItem["pm_url"] = self.domain + row.xpath('td/a/@href').extract_first()
            yield scrapy.Request(pmItem["pm_url"], callback=self.parse_detail, meta=pmItem)
        except IndexError:
            # header rows and rows with missing cells are skipped
            continue
def parse_item(self, response):
    content = Selector(response)
    chapter_list = content.css('article.excerpt.excerpt-c3')
    for each in chapter_list:
        item = MyspiderItem()
        item['bookName'] = content.xpath('//head/title/text()').extract()
        item['chapterUrl'] = each.xpath('a/@href').extract()
        text = each.css('a::text').extract()
        for i in text:
            try:
                item['bookTitle'] = i.split(' ')[0]
                item['chapterNum'] = i.split(' ')[1]
            except Exception:
                continue
            try:
                item['chapterName'] = i.split(' ')[2]
            except Exception:
                item['chapterName'] = i.split(' ')[1][-3:]
        # the original appended to an items list that was never returned;
        # yield each item instead
        yield item
def parse(self, response):
    data = json.loads(response.body.decode('gb18030').encode('utf8'))
    strn = datetime.datetime.today().strftime('%Y-%m-%d')
    for case in data[u'LUWSJSSB']:
        item = MyspiderItem()
        item['COLLECTDATE'] = strn
        item['ROADNAME'] = case[u'LUDMC']
        item['POSTFROM'] = u'江苏省交通运输厅'
        item['EVENTTYPE'] = self.event_type_switcher(case[u'SHIJLX'])
        item['DIRECTION'] = case[u'FANGX'] + u"-" + case[u'LUXBSM']
        item['START_TIME'] = case[u'SHIFSJ']
        item['POSTDATE'] = case[u'CHUANGJSJ']
        item['END_TIME'] = case[u'YUJHFSJ']
        item['CONTENT'] = case[u'SHIJNR'].strip().replace('\n', ' ').replace('\r', '')
        item['TITLE'] = (u'locate at x: ' + case[u'X'] + u' Y: ' + case[u'Y']).strip().replace('\n', ' ').replace('\r', '')
        item['REF'] = self.start_urls[0]
        yield item
def parse_sale(self, response):
    item = MyspiderItem()
    # hack: bulk-assigns the carried-over meta dict as the item's field values,
    # bypassing Scrapy's field validation (an idiomatic alternative is sketched
    # after the B_S_ variant of this callback below)
    item._values = response.meta
    tr_list = response.xpath("/html/body/div[5]/div[3]/div[2]/div/div[2]/div[1]/table/tr")
    item["S_S_date"] = []
    item["S_S_M_sale"] = []
    item["S_S_M_rank"] = []
    item["S_S_rank"] = []
    for tr in tr_list[1:]:  # skip the header row
        item["S_S_date"].append(tr.xpath("./td/text()").extract()[0])     # dates
        item["S_S_M_sale"].append(tr.xpath("./td/text()").extract()[1])   # monthly sales per date
        item["S_S_M_rank"].append(tr.xpath("./td/a/text()").extract()[0]) # monthly sales rank per date
        item["S_S_rank"].append(tr.xpath("./td/text()").extract()[2])     # manufacturer share per date
    yield item
def parse(self, response):
    self.log(response.headers)
    # grab every image on the freebuf front page and save the list in the image_urls field
    piclist = response.xpath("//div[@class='news-img']/a/img/@src").extract()
    if piclist:
        item = MyspiderItem()
        item['image_urls'] = piclist
        yield item
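# image_urls is the field name Scrapy's built-in ImagesPipeline reads by
# default, so the item above triggers downloads with no extra spider code.
# A minimal sketch of what this assumes (ImagesPipeline, ITEM_PIPELINES, and
# IMAGES_STORE are standard Scrapy settings; the storage path is illustrative):
#
#   # settings.py
#   ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
#   IMAGES_STORE = './images'  # hypothetical local download directory
#
# and the item class would declare the two companion fields:
import scrapy

class ImageItem(scrapy.Item):  # illustrative stand-in for MyspiderItem
    image_urls = scrapy.Field()  # read by ImagesPipeline
    images = scrapy.Field()      # filled in by the pipeline after download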
def parse(self, response):
    node_list = response.xpath('//div[@class="li_txt"]')
    for node in node_list:
        item = MyspiderItem()
        item['name'] = node.xpath('./h3/text()').extract_first().strip()
        item['title'] = node.xpath('./h4/text()').extract_first().strip()
        item['desc'] = node.xpath('./p/text()').extract_first().strip()
        yield item
def parse(self, response):
    result = json.loads(response.text)
    data = result.get('result').get('data')
    for entry in data[:10]:  # only the first ten videos
        item = MyspiderItem()  # fresh item per entry, not one shared instance
        item['create_time'] = entry.get('createTime')
        item['title'] = entry.get('title')
        item['videoUrl'] = entry.get('videoUrl')
        item['author'] = entry.get('nick')
        yield item
def parse(self, response):
    html = Selector(response)
    images_urls = html.xpath('//li/a[1]/img/@original').extract()
    images_name = html.xpath('//li/a[1]/p/text()').extract()
    for url, name in zip(images_urls, images_name):
        item = MyspiderItem()
        item['img_url'] = url
        item['img_name'] = name
        yield item
def parse(self, response):
    # handle the response for start_url
    journal_links = response.xpath("//div[@id='browse-journals-output']//div[@class='hide-body']//a")
    for url in journal_links:
        name = url.xpath("./text()").extract_first()
        tempurl = url.xpath(".//@href").extract()[0]
        if re.match('IEEE', name, re.IGNORECASE):
            item = MyspiderItem()  # fresh item per journal, not one shared instance
            item["journal"] = name
            yield scrapy.Request(url=tempurl, callback=self.parse_joural_url, meta={"item": item})
def parse_detail(self, response):
    # pick out the concrete values with CSS selectors
    title = response.css("h1>span:nth-child(1)::text").extract_first()
    rating = response.css('.rating_num::text').extract_first()
    # instantiate the item
    subject_item = MyspiderItem()
    subject_item['title'] = title
    subject_item['douban_link'] = response.url
    subject_item['rating'] = rating
    # hand off to the pipeline
    yield subject_item
def parse(self, response):
    res_list = response.xpath('//div[@class="li_txt"]')
    arr = []
    for i in res_list:
        item = MyspiderItem()
        name = i.xpath('./h3/text()').extract()[0]
        title = i.xpath('./h4/text()').extract()[0]
        info = i.xpath('./p/text()').extract()[0]
        item['name'] = name.encode('gbk')
        item['title'] = title.encode('gbk')
        item['info'] = info.encode('gbk')
        arr.append(item)
    return arr  # a callback may return a list of items instead of yielding them
def parse_1(self, response):
    soup = BeautifulSoup(response.text, "html.parser")
    title = soup.select('.content > h1:nth-child(1)')[0].text
    # drop everything after the first "https" (trailing site links), then
    # normalize whitespace into paragraph breaks
    text = soup.select('#content')[0].text.split("https")[0]
    text = '\r\n\r\n    '.join(text.split())
    item = MyspiderItem()
    item['title'] = title
    item['text'] = text
    yield item
def parse(self, response):
    tittles = response.xpath('//*[@id="js_origina_column"]/div/div/div/div/ul/li/div/ul/li/a/text()').extract()
    urls = response.xpath('//*[@id="js_origina_column"]/div/div/div/div/ul/li/div/ul/li/a/@href').extract()
    # pair each title with its link; the original's separate loops kept
    # overwriting one shared item, so only the last pair survived
    for t, u in zip(tittles, urls):
        item = MyspiderItem()
        item['tittle'] = t
        item['url'] = u
        logger.warning(item)
        yield item
def parse_sale(self, response):
    item = MyspiderItem()
    # hack: bulk-assigns the carried-over meta dict as the item's field values,
    # bypassing Scrapy's field validation (see the idiomatic alternative below)
    item._values = response.meta
    tr_list = response.xpath("/html/body/div[5]/div[3]/div[2]/div/div[2]/div[1]/table/tr")
    item["B_S_date"] = []
    item["B_S_sale"] = []
    item["B_S_share"] = []
    item["B_S_detail"] = []
    for tr in tr_list[1:]:  # skip the header row
        item["B_S_date"].append(tr.xpath("./td/text()").extract()[0])   # dates
        item["B_S_sale"].append(tr.xpath("./td/text()").extract()[1])   # sales per date
        item["B_S_share"].append(tr.xpath("./td/text()").extract()[2])  # market share per date
        item["B_S_detail"].append('https://xl.16888.com' + tr.xpath("./td/a/@href").extract()[0])  # detail link per date
    yield item
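# The item._values assignment above (and in the S_S_ variant earlier) writes
# straight into the Item's private storage, which also sweeps in Scrapy's own
# bookkeeping meta keys such as depth and download_slot. A minimal sketch of
# the conventional pattern, assuming the preceding callback looks roughly like
# this (request_sale_page, the xpath, and B_name are illustrative, not the
# author's actual code):
def request_sale_page(self, response):
    item = MyspiderItem()
    item["B_name"] = response.xpath("//h1/text()").get()  # hypothetical field
    # pass the half-filled item explicitly under a single key...
    yield scrapy.Request(response.urljoin("sale/"),
                         callback=self.parse_sale,
                         meta={"item": item})

# ...and retrieve it the same way at the top of parse_sale:
#     item = response.meta["item"]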
def parse(self, response):
    for item in response.css('.item'):
        movie = MyspiderItem()
        movie['Staring'] = item.css('.bd p::text').extract_first()
        movie['rank'] = item.css('.pic em::text').extract_first()
        movie['title'] = item.css('.hd span.title::text').extract_first()
        movie['start'] = item.css('.star span.rating_num::text').extract_first()
        movie['quote'] = item.css('.quote span.inq::text').extract_first()
        movie['url'] = item.css('.pic a::attr("href")').extract_first()
        movie['image_url'] = item.css('.pic img::attr("src")').extract_first()
        yield movie
    next_url = response.css('span.next a::attr("href")').extract_first()
    if next_url is not None:
        url = self.start_urls[0] + next_url
        yield scrapy.Request(url=url, callback=self.parse)
def parse_item(self, response):
    node_list = response.xpath('//div[@class="li_txt"]')
    for node in node_list:
        item = MyspiderItem()
        item['name'] = node.xpath('./h3/text()').extract_first().strip()
        item['title'] = node.xpath('./h4/text()').extract_first().strip()
        item['desc'] = node.xpath('./p/text()').extract_first().strip()
        yield item
def parse(self, response):
    # grab all the teacher-info nodes
    node_list = response.xpath('//div[@class="li_txt"]')
    # iterate over the teacher nodes
    for node in node_list:
        # create a container to hold the data
        item = MyspiderItem()
        # extract the data into the item
        item['name'] = node.xpath('./h3/text()').extract()[0]
        item['title'] = node.xpath('./h4/text()').extract()[0]
        item['desc'] = node.xpath('./p/text()').extract()[0]
        # return the data with yield
        yield item
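# Several of the teacher spiders here (this one, the parse_item variant, and
# the extract_first variants) fill the same three fields, which implies an
# item declaration along these lines. A minimal sketch; the field set is
# inferred from the callbacks above, not taken from the project's actual items.py:
import scrapy

class MyspiderItem(scrapy.Item):
    name = scrapy.Field()   # teacher's name, from <h3>
    title = scrapy.Field()  # teacher's rank/level, from <h4>
    desc = scrapy.Field()   # teacher's bio, from <p>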
def parse(self, response):
    # extract the data rows
    node_list = response.xpath('//*[@id="content"]/table/tr')
    for node in node_list:
        item = MyspiderItem()
        item["date"] = node.xpath("./td[1]/a/text()").extract_first()
        item["link"] = response.urljoin(node.xpath("./td[1]/a/@href").extract_first())
        item["situation"] = node.xpath("./td[2]/text()").extract_first().strip()
        item["temperature"] = node.xpath("./td[3]/text()").extract_first().strip()
        item["wind"] = node.xpath("./td[4]/text()").extract_first().strip()
        yield item
def parse_news(self, response):
    item = MyspiderItem()
    # prefer the article body; fall back to the headline text when absent
    tmp = response.xpath("//p[@class='ynDetailText']/text()").extract()
    if len(tmp) == 0:
        tmp = response.xpath("//a[@class='newsLink']/text()").extract()
        if len(tmp) > 0:
            if tmp[0] == u"[記事全文]":
                return item  # link-only page: return the empty item
            else:
                tmp = response.xpath("//div[@class='headlineTxt']/h2[@class='newsTitle']/a/text()|//div[@class='headlineTxt']/p[@class='hbody']/text()").extract()
        else:
            return item
    word_str = ""
    s = ""
    for w in tmp:
        s += w.encode("utf-8")
    # follow the "next page" links synchronously with urlopen
    # (note: this blocks the reactor, unlike scheduling scrapy.Requests)
    url = response.xpath("//li[@class='next']/a/@href").extract()
    if len(url) > 0:
        while url[0] is not None:
            html = urlopen(url[0])
            soup = BeautifulSoup(html)
            s += soup.find('p', class_='ynDetailText').text.encode("utf-8")
            temp_soup = soup.find('li', class_='next').find('a')
            if temp_soup is not None:
                url[0] = temp_soup['href'].encode("utf-8")
            else:
                url[0] = None
    # keep only CJK ideographs and Japanese kana
    for w in re.findall(u"[\u4E00-\u9FFF\u3040-\u309F\u30A0-\u30FF]+", s.decode("utf-8"), re.U):
        word_str += w
    item['id'] = self.count
    item['content'] = word_str.encode('utf-8')
    self.count = self.count + 1
    return item
def parse(self, response):
    for each in response.xpath("//div[@class='li_txt']"):
        item = MyspiderItem()
        # extract() always returns a list of unicode strings
        name = each.xpath("h3/text()").extract()
        title = each.xpath("h4/text()").extract()
        info = each.xpath("p/text()").extract()
        # each xpath call returns a one-element list
        item["name"] = name[0]
        item["title"] = title[0]
        item["info"] = info[0]
        # yield each item as it is built; the original appended to an items
        # list but yielded only the final item once
        yield item
def parse(self, response):
    # a scrapy Response object supports xpath directly
    names = response.xpath('//div[@class="tea_con"]//li/div/h3/text()')
    print(names)
    # the concrete text values are fetched as follows
    # group by teacher
    li_list = response.xpath('//div[@class="tea_con"]//li')
    for li in li_list:
        # create a data container
        item = MyspiderItem()
        # locate elements with scrapy's xpath selectors and read the result
        # via extract() or extract_first()
        item['name'] = li.xpath('.//h3/text()').extract_first()   # the teacher's name
        item['level'] = li.xpath('.//h4/text()').extract_first()  # the teacher's level
        item['text'] = li.xpath('.//p/text()').extract_first()    # the teacher's bio
        yield item
def parse(self, response):
    items = MyspiderItem()
    items['topics'] = response.xpath(r'//div[@class="body board-topics"]/div/@data-topic').extract()
    for topicID in items['topics']:
        started_by = response.xpath('//div[@data-topic="' + topicID + '"]/div[@class="topic-first-comment"]/p/text()').extract()
        author = response.xpath('//div[@data-topic="' + topicID + '"]/div[@class="topic-first-comment"]/p/a/text()').extract()
        title = response.xpath('//div[@data-topic="' + topicID + '"]/div/div/div[@class="raw-topic-title"]/text()').extract()
        link = "https://devtalk.nvidia.com/default/topic/" + topicID + "/" + title[0]
        views = response.xpath('//div[@data-topic="' + topicID + '"]/div/p[@class="topic-views"]/text()').extract()
        replies = response.xpath('//div[@data-topic="' + topicID + '"]/div/p[@class="topic-replies"]/text()').extract()
        author_ = author.pop() if len(author) else 'empty name'
        views_ = views[0].replace(',', '').replace(' Views', '')
        replies_ = replies[0].replace(',', '').replace(' Replies', '')
        yield {
            'author': author_,
            'views': int(views_),
            'replies': int(replies_),
            'title': title.pop(),
            'started_by': started_by.pop(),
            'link': link,
            'topicID': int(topicID),
        }
def data_parse(self, response):
    d = self.response_id_map
    # the road id is encoded at the end of the POSTed form body
    int_realRoadID = int(response.request.body[-2:].replace("=", ""))
    realRoadName = d[int_realRoadID][0]
    jdata = json.loads(response.body)
    events = jdata[u'data']
    if len(events) < 1:
        # no events for this road: emit a placeholder item
        # (the original yielded None here, discarding the item it had just built)
        item = MyspiderItem()
        item['ID'] = int_realRoadID
        item['ROADNAME'] = realRoadName
        item['POSTFROM'] = u'浙江智慧高速'
        item['CONTENT'] = u'目前无路况'
        item['TITLE'] = u'目前无路况'
        yield item
    else:
        for e in events:
            item = MyspiderItem()  # fresh item per event
            item['ID'] = int_realRoadID
            item['ROADNAME'] = realRoadName
            item['COLLECTDATE'] = datetime.datetime.today().strftime('%Y-%m-%d')
            str_passby_stations = e[u'startnodename'] + ' - ' + e[u'endnodename']
            item['EVENTTYPE'] = e[u'eventtype']
            item['DIRECTION'] = e[u'directionname'] + str_passby_stations
            item['START_TIME'] = e[u'occtime']
            item['END_TIME'] = datetime.datetime.today().strftime('%Y-%m-%d')
            # normalize whitespace in content and title
            ecode_ctnt = (e[u'reportout'].strip().replace('\n', ' ').replace('\r', '')).encode('utf-8')
            ecode_title = (''.join(e[u'title'].split())).encode('utf-8')
            item['CONTENT'] = ecode_ctnt
            item['TITLE'] = ecode_title
            item['REF'] = 'http://app.zjzhgs.com/MQTTWechatAPIServer/businessserver/showhighdetail/' + str(int_realRoadID)
            item['POSTDATE'] = e[u'occtime'].encode('utf-8')
            item['POSTFROM'] = u'浙江智慧高速'
            yield item
def parse(self, response):
    selector = Selector(response)
    myspiderItem = MyspiderItem()
    myspiderItem['url'] = response.url
    myspiderItem['size'] = self.format_bytes(int(len(response.body)))
    # for cnblogs
    for url in selector.xpath('//*[@id="mainContent"]/div/div/div/a/@href').extract():
        if url.endswith('.html'):
            self.redis_util.insert(url, 1)
    for url in selector.xpath('//*[@id="cnblogs_post_body"]/p/a/@href').extract():
        self.redis_util.insert(url, 1)
    next_url = response.xpath(u'//*[@id="nav_next_page"]/a/@href').extract_first()
    if next_url is not None:
        self.redis_util.insert(next_url, 1)
    for url in response.xpath(u'//*[@id="homepage_bottom_pager"]/div/a/@href').extract():
        if not url.endswith("1"):
            self.redis_util.insert(url, 1)
    # for most web pages
    urls = response.css('a::attr(href)').re(r'^/.+?/$')
    for url in urls:
        self.redis_util.insert(url, 1)
        self.redis_util.insert(url, 2)
    yield myspiderItem
def parse(self, response):
    np = NewsParser()
    result = np.extract_news(response.text)
    if not result:
        return
    item = MyspiderItem()
    item['id'] = response.meta['id']
    item['url'] = response.url
    item['title'] = result['title']
    item['publish_time'] = result['publish_time']
    item['author'] = result['author']
    item['content'] = result['content']
    # detect the title's language and keep only Chinese and English articles
    item['langid'] = langid.classify(item['title'])[0]
    if item['langid'] in ['zh', 'en']:
        yield item