def filter(self, response):
    # Debug output of the currently active filters.
    print ",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"
    print self.filter_district
    print self.filter_date_from
    print self.filter_date_to
    # Add the publication-date range to the query condition string.
    if self.filter_date_from != '':
        conditionItem270 = Selector(response).xpath(
            '//input[@id="TAB_QueryConditionItem270"]/@value').extract()[0]
        self.conditionData += "|" + conditionItem270.encode('ascii', 'ignore') + \
            ":%s~%s" % (self.filter_date_from, self.filter_date_to)
    # Add the district (or state) filter to the query condition string.
    if self.filter_district != '':
        self.city = self.filter_district
        for dist in self.dist_dict_list:
            if dist['name'] == self.filter_district:
                self.dis_code = dist['value']
                break
        if self.dis_code == '':
            raise Exception("Wrong input for district")
        if len(self.dis_code) == 2:
            # A two-digit code means the filter is a state, not a city.
            self.city = ''
            self.state = self.filter_district
        else:
            # Look up the parent state from the first two digits of the code.
            statecode = self.dis_code[0:2]
            for state in self.dist_dict_list:
                if state['value'] == statecode:
                    self.state = state['name']
                    break
        conditionItem256 = Selector(response).xpath(
            '//input[@id="TAB_QueryConditionItem256"]/@value').extract()[0]
        dist_pref = "|" + conditionItem256.encode('ascii', 'ignore') + ":%s%%~" % self.dis_code
        dist_name = unicode(self.filter_district, "utf-8").encode("utf8")
        self.conditionData += dist_pref + dist_name
def parse_helper(self, response):
    has_content = False
    topic = Selector(response)\
        .xpath("//div[@class='topic-item-info content']/h4/text()")\
        .extract_first()
    # First, crawl all applications on this page.
    for app in Selector(response).xpath("//div[@class='nofloat']"):
        has_content = True
        item = HuaweiAppStoreTopicAppItem()
        item["topic"] = topic.encode("utf-8")
        item["title"] = app.xpath(".//img[@class='app']/@title")\
            .extract_first().encode("utf-8")
        item["appid"] = app.xpath(".//a[1]/@href")\
            .re(r'http://appstore.huawei.com:80/app/(C\d+)')[0]
        item["image"] = app.xpath(".//img[@class='app']/@lazyload")\
            .extract_first().encode("utf-8")
        item["desc"] = u''.join(app.xpath(".//p[@class='ft-light']/text()")\
            .extract()).encode("utf-8")
        yield item
    # Then, if this page had apps, try to follow the next page.
    if has_content:
        yield scrapy.Request(
            self.get_next_page(response.url), self.parse_helper)
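# get_next_page() is called above but not defined in this snippet. A minimal
# sketch, assuming the page index is a trailing "_<n>" suffix on the topic
# URL (the real URL scheme of appstore.huawei.com may differ); it needs
# "import re" at module level.
def get_next_page(self, url):
    match = re.search(r'_(\d+)$', url)
    if match:
        # Bump the existing page index by one.
        return re.sub(r'_\d+$', '_%d' % (int(match.group(1)) + 1), url)
    # No page index yet: assume the listing continues at page 2.
    return url + '_2'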
def parse(self, response):
    # Mirror the article under root/<host>/<path>.
    dirname = os.sep.join(['root'] + response.url.split('/')[2:-1])
    filename = os.sep.join([dirname, response.url.split('/')[-1]])
    article_text = Selector(response).xpath(
        '//div[@class="post"]').extract()[0]
    parser = Selector(text=article_text)
    article_title = parser.xpath(
        '//a[@id="cb_post_title_url"]/text()').extract()[0]
    title_link = parser.xpath(
        '//a[@id="cb_post_title_url"]/@href').extract()[0]
    # Drop the leading "http:/" so the title link points at the local mirror.
    article_text = article_text.replace(title_link, title_link[6:])
    item = ArticleItem()
    item['image_urls'] = [x for x in parser.xpath('//img/@src').extract()]
    item['image_names'] = [x.split('/')[-1] for x in item['image_urls']]
    # Process image links the same way.
    for url in item['image_urls']:
        article_text = article_text.replace(url, url[6:])
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    with open(filename, 'wb') as fp:
        fp.write(self.html_start_l + article_title.encode('utf-8') +
                 self.html_start_r + article_text.encode('utf-8', 'ignore') +
                 self.html_end)
    return item
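# html_start_l, html_start_r and html_end are spider attributes that are not
# shown in this snippet. A plausible minimal definition (an assumption, not
# the original values) wraps the saved article in a bare HTML skeleton with
# the title injected between the two start fragments:
html_start_l = '<!DOCTYPE html><html><head><meta charset="utf-8"><title>'
html_start_r = '</title></head><body>'
html_end = '</body></html>'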
def parse_item(self, response):
    # Pull every word out of the page body and look for "aerospace".
    words = Selector(response).xpath("//body//text()").re(r'(\w+)')
    for word in words:
        newtext = word.encode('utf8')
        hxs = HtmlXPathSelector(response)
        item = BSiteItem()
        if newtext in ('aerospace', 'Aerospace', 'AEROSPACE'):
            print 'True'
            test = response.url
            print test
def parse(self, response):
    dirname = os.sep.join(['root'] + response.url.split('/')[2:-1])
    filename = os.sep.join(
        [dirname, response.url.split('/')[-1] + '.html'])
    # Parse the article body.
    article_text = Selector(response).xpath(
        '//div[@id="article_details"]').extract()[0]
    parser = Selector(text=article_text)
    # Parse the article title.
    article_title = parser.xpath(
        '//span[@class="link_title"]/a/text()').extract()[0]
    article_links = parser.xpath(
        r'//a[re:test(@href, "[^/]+/article/details/\d+")]/@href').extract()
    # Rewrite the stylesheet and article links so they resolve locally.
    article_text = article_text.replace(
        'http://static.blog.csdn.net/css/blog_detail.css',
        '/static.blog.csdn.net/css/blog_detail.css')
    for link in article_links:
        article_text = article_text.replace(
            link, '/blog.csdn.net' + link + '.html')
    item = ArticleItem()
    item['image_urls'] = [x for x in parser.xpath('//img/@src').extract()]
    # Handle watermarked image urls such as:
    # http://img.blog.csdn.net/20140917165912117?watermark/2/text/aHR0cDovL2Jsb2cuY3Nkbi5uZXQvaWFpdGk=/font/5a6L5L2T/fontsize/400/fill/I0JBQkFCMA==/dissolve/70/gravity/SouthEast
    item['image_names'] = [
        (lambda k: k if '?' not in k else k.split('?')[0] + '.png')(x).split('/')[-1]
        for x in item['image_urls']
    ]
    # Process image links.
    for url in item['image_urls']:
        article_text = article_text.replace(
            url,
            (lambda k: k if '?' not in k else k.split('?')[0] + '.png')(url)[6:])
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    with open(filename, 'wb') as fp:
        fp.write(self.html_start_l + article_title.encode('utf-8') +
                 self.html_start_r + article_text.encode('utf-8', 'ignore') +
                 self.html_end)
    return item
def parse_downurl(self, response):
    try:
        # Antivirus table from the Static Analysis -> Antivirus result page.
        antivirus = Selector(response).css(
            "#static_antivirus").extract()[0]
        antiresult = re.findall(
            "((Microsoft|Kaspersky|ESET\-NOD32)</td>\n\s*<td>\n\s*<span class=\"text\-error\")",
            antivirus.encode("utf-8"), re.S)
        # An empty list means ESET-NOD32, Kaspersky and Microsoft do not flag
        # this sample; skip it without storing anything.
        if antiresult == []:
            return
        # Extract the download URL behind the download button.
        url = response.xpath("//a[contains(@class,'btn-primary')]/@href"
                             ).extract()[0].encode('utf-8')
        url = urlparse.urljoin("https://malwr.com", url)
        item = MalwrItem()
        item['file_urls'] = [url]
        return item
    except Exception:
        # Silently skip pages that do not match the expected layout.
        pass
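# MalwrItem is not defined in this snippet. A minimal sketch, assuming it
# only carries the two fields Scrapy's FilesPipeline expects:
import scrapy

class MalwrItem(scrapy.Item):
    file_urls = scrapy.Field()  # download URLs consumed by FilesPipeline
    files = scrapy.Field()      # filled in by FilesPipeline after download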
def parse(self, response):
    # Every non-spacer row of the Hacker News front-page table.
    description = response.xpath(
        "//table[@class='itemlist']/tr[not(re:test(@class, "
        "'(spacer)'))]").extract()
    row = self.get_default_row_dict()
    for i, v in enumerate(description):
        index = i
        if not row['rank']:
            value = Selector(text=v).xpath(
                '//td[1]/span[@class="rank"]/text()').extract_first()
            row['rank'] = int(value.replace('.', '')) if value else 0
        if not row['story_text']:
            value = Selector(text=v).xpath(
                '//td[3]/a[@class="storylink"]/text()').extract_first()
            row['story_text'] = value.encode("utf8") if value else ''
        if not row['link_href']:
            value = Selector(text=v).xpath(
                '//td[3]/a[@class="storylink"]/@href').extract_first()
            row['link_href'] = value if value else ''
        if not row['hn_user']:
            value = Selector(text=v).xpath(
                '//a[@class="hnuser"]/text()').extract_first()
            row['hn_user'] = value.encode("utf8") if value else ''
        if not row['age']:
            value = Selector(text=v).xpath(
                '//span[@class="age"]/a/text()').extract_first()
            row['age'] = int(value.split(' ')[0]) if value else 0
        if not row['total_comments']:
            value = Selector(text=v).xpath(
                '//td[@class="subtext"]/a[contains(@href, "item?id=")]/text()'
            ).extract_first()
            if value:
                value = value.encode('ascii', 'ignore').replace(
                    'comments', '') if value else ''
                value = value.encode('ascii', 'ignore').replace(
                    'comment', '') if value else ''
                row['total_comments'] = int(value) if represents_int(
                    value) else 0
        if not row['score']:
            value = Selector(text=v).xpath(
                '//span[@class="score"]/text()').extract_first()
            row['score'] = int(value.split(' ')[0]) if value else 0
        if not row['hn_id_code']:
            value = Selector(
                text=v).xpath('//tr[@class="athing"]/@id').extract_first()
            row['hn_id_code'] = int(value) if represents_int(value) else 0
        # Save only once every field of the row has been filled in.
        if all(value is not None for value in row.values()):
            print 'Go for save >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>'
            data = row.copy()
            row = self.get_default_row_dict()
            self.comment_url.append(
                'https://news.ycombinator.com/item?id=15318440')
            news_id = data['hn_id_code']
            item = NewsBotItem(data)
            yield item
            # Follow the item page to collect comments for this story.
            request = scrapy.Request(
                url='https://news.ycombinator.com/item?id=' + str(news_id),
                callback=self.parse_comment)
            request.meta['item'] = item
            request.meta['news_id'] = int(news_id)
            yield request
        # Each story spans two table rows; reset after every second row.
        if index % 2:
            row = self.get_default_row_dict()
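# represents_int() and get_default_row_dict() are used above but not shown.
# Sketches of their assumed behaviour: represents_int() reports whether a
# string parses as an integer, and get_default_row_dict() returns a fresh
# row with every field still unset, so the "is not None" save check works.
def represents_int(s):
    try:
        int(s)
        return True
    except (TypeError, ValueError):
        return False

def get_default_row_dict(self):
    return {
        'rank': None,
        'story_text': None,
        'link_href': None,
        'hn_user': None,
        'age': None,
        'total_comments': None,
        'score': None,
        'hn_id_code': None,
    }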
title_id = [x for x in title_id if x != "0"]
j = 0
# if (len(title_id) != len(title)):
#     if (title[j].encode('utf-8') == ' '):
for i in range(0, len(title_id)):
    if title[j].encode("utf-8") == " ":
        j = i + 1
    ocr_title.writerow(
        [
            title[j].encode("utf-8"),
            title_id[i].encode("utf-8"),
            section_id.encode("utf-8"),
            grade_id.encode("utf-8"),
            subject_id.encode("utf-8"),
            term_id.encode("utf-8"),
        ]
    )
    j = j + 1
# section
section = Selector(text=content).xpath("//h4/text()").extract()
for i in range(0, len(section)):
    ocr_section.writerow(
        [
            section[i].encode("utf-8"),
            i,
            grade_id.encode("utf-8"),
def extract_from_row(html, xpath):
    # Apply the xpath to an HTML fragment and return the first match,
    # encoded as ASCII with non-ASCII characters dropped.
    content = Selector(text=html).xpath(xpath).extract()[0]
    return content.encode('ascii', 'ignore')
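# An illustrative call (the row_html fragment below is made up for the
# example): pull the first cell's text out of a table row.
row_html = '<table><tr><td>Alice</td><td>42</td></tr></table>'
name = extract_from_row(row_html, '//td[1]/text()')  # -> 'Alice'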
def parse_getxinwen(self, response):
    item = response.meta['item']
    dates = {}
    day = []
    # Publication date of this news article.
    time0 = Selector(response).xpath(
        '//*[@id="zwconttb"]/div[2]/text()').extract()[0][4:14]
    k = 0
    for key in item:
        if key == 'xinwen':
            break
        else:
            k = k + 1
    if k == len(item.keys()):
        # item['xinwen'] does not exist yet; initialise it.
        item['xinwen'] = {}
    k = 0
    for key in item['xinwen']:
        if key == time0:
            # The publication date already exists: append this article's
            # title, author, content and comments under that date.
            content = ''
            i = 0
            while i < len(Selector(response).xpath('//p/text()').extract()) - 3:
                data = Selector(response).xpath('//p/text()').extract()[i]
                content = content + data.encode("UTF-8", 'ignore')
                i = i + 1
            item['xinwen'][time0].append({
                'date': time0,
                'title': Selector(response).xpath('//*[@id="zwconttbt"]/text()').extract()[0],
                # 'author': Selector(response).xpath('//*[@id="zwconttbn"]/strong/a/text()').extract()[0],
                'content': content,
                'comments': {}  # initialise empty; comments are added later
            })
        else:
            k = k + 1
            if k == len(item['xinwen'].keys()):
                # The date does not exist yet.
                if k == 30:
                    # Stop once 30 days have been collected.
                    return item
                # Crawl the article content.
                content = ''
                i = 0
                while i < len(Selector(response).xpath('//p/text()').extract()) - 3:
                    data = Selector(response).xpath('//p/text()').extract()[i]
                    content = content + data.encode("UTF-8", 'ignore')
                    i = i + 1
                # Fewer than 30 days so far: add a new key for this date.
                item['xinwen'][time0] = [{
                    'date': time0,
                    'title': Selector(response).xpath('//*[@id="zwconttbt"]/text()').extract()[0],
                    # 'author': Selector(response).xpath('//*[@id="zwconttbn"]/strong/a/text()').extract()[0],
                    'content': content,
                    'comments': {}  # initialise empty; comments are added later
                }]
    # day = item['xinwen'][time0]
    # for i in range(0, len(day)):  # walk the list of articles for this date
    #     if day[i]['title'] == Selector(response).xpath('//*[@id="zwconttbt"]/text()').extract()[0]:  # find the dict for this article
    #         j = 1
    #         while Selector(response).xpath('//*[@id="zwlist"]/div[' + str(j) + ']').extract():  # crawl comments one by one
    #             time1 = Selector(response).xpath('//*[@id="zwlist"]/div[' + str(j) + ']/div[3]/div/div[2]/text()').extract()[0][4:23]  # comment timestamp
    #             if Selector(response).xpath('//*[@id="zwlist"]/div[' + str(j) + ']/div[3]/div/div/span/a/text()').extract():  # commenter's name
    #                 name = Selector(response).xpath('//*[@id="zwlist"]/div[' + str(j) + ']/div[3]/div/div/span/a/text()').extract()[0]
    #             else:
    #                 name = Selector(response).xpath('//*[@id="zwlist"]/div[' + str(j) + ']/div[3]/div/div/span/span/text()').extract()[0]
    #             comment = ''
    #             for data in Selector(response).xpath('//*[@id="zwlist"]/div[' + str(j) + ']/div[3]/div/div[3]/child::text()').extract():  # comment body
    #                 comment = comment + data
    #             day[i]['comments'][time1] = {
    #                 'name': name,
    #                 'comment': comment
    #             }  # store the crawled comment
    #             j = j + 1
    #         break
    # item['xinwen'][time0] = day
    num = 0
    for key in item['xinwen']:
        num = num + len(item['xinwen'][key])
    if item['numberxin'][1] < item['numberxin'][0] and num == item['numberxin'][1]:
        # Every news link on the current page has been crawled; move on to
        # the next listing page.
        url = ("http://guba.eastmoney.com/list," + item['_id'] + ",1,f_" +
               str(item['numberxin'][2] + 1) + ".html")
        return Request(url, meta={'item': item}, callback=self.parse_xinwen)
    else:
        return item