def parse(self, response):
    items = []
    # Names of all parent categories
    parentTitle = response.xpath(
        '//div[@id="tab01"]//div//h3/a/text()').extract()
    # Links of all parent categories
    parentUrls = response.xpath(
        '//div[@id="tab01"]//div//h3/a/@href').extract()
    # URLs of all sub-categories
    subUrls = response.xpath(
        '//div[@id="tab01"]//div//ul/li/a/@href').extract()
    # Names of all sub-categories
    subTitle = response.xpath(
        '//div[@id="tab01"]//div//ul/li/a/text()').extract()

    # Walk every parent category
    for i in range(0, len(parentTitle)):
        item = SinaItem()
        # Path and directory name for this parent category
        parentFilename = "./Data/" + parentTitle[i]
        # Create the directory if it does not exist yet
        if not os.path.exists(parentFilename):
            os.makedirs(parentFilename)

        # Walk every sub-category
        for j in range(0, len(subUrls)):
            item = SinaItem()
            # Save the parent category's title and URL
            item['parentTitle'] = parentTitle[i]
            item['parentUrls'] = parentUrls[i]
            # Check whether the sub-category URL starts with its parent URL,
            # e.g. sports.sina.com.cn and sports.sina.com.cn/nba
            if subUrls[j].startswith(item['parentUrls']):
                subFilename = parentFilename + '/' + subTitle[j]
                print(subFilename)
                # Create the directory if it does not exist yet
                if not os.path.exists(subFilename):
                    os.makedirs(subFilename)
                # Store the sub-category URL, title and filename fields
                item['subUrls'] = subUrls[j]
                item['subTitle'] = subTitle[j]
                item['subFilename'] = subFilename
                items.append(item)

    # Send a Request for each sub-category URL; the Response plus the meta data
    # is handed to the second_parse callback
    for item in items:
        yield scrapy.Request(url=item['subUrls'],
                             meta={'meta_1': item},
                             callback=self.second_parse)
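The parse() variants in this collection are method bodies lifted out of full spiders. A minimal skeleton they could sit in might look like the sketch below; the class name, spider name, and start URL are assumptions, not taken from the snippets themselves.

# Minimal spider skeleton, assuming the parse() above is pasted into it.
# The class name, spider name and start URL are assumptions.
import os

import scrapy

from ..items import SinaItem  # assumed location of the SinaItem definition


class SinaSpider(scrapy.Spider):
    name = 'sina'
    allowed_domains = ['sina.com.cn']
    start_urls = ['http://news.sina.com.cn/guide/']  # assumed category guide page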
def parse(self, response):
    items = []
    parent_urls = response.xpath("//div[1]/div/h3[@class='tit02']/a/@href").extract()
    parent_title = response.xpath("//div[1]/div/h3[@class='tit02']/a/text()").extract()
    sub_urls = response.xpath("//div[1]/div[not(@data-sudaclick='citynav')]/ul/li/a/@href").extract()
    sub_title = response.xpath("//div[1]/div[not(@data-sudaclick='citynav')]/ul/li/a/text()").extract()
    for i in range(0, len(parent_title)):
        parent_filename = "./Data/" + parent_title[i]
        if not os.path.exists(parent_filename):
            os.makedirs(parent_filename)
        for j in range(0, len(sub_title)):
            item = SinaItem()
            item['parent_title'] = parent_title[i]
            item['parent_urls'] = parent_urls[i]
            if_belong = sub_urls[j].startswith(parent_urls[i])
            if if_belong:
                sub_filename = parent_filename + '/' + sub_title[j]
                if not os.path.exists(sub_filename):
                    os.makedirs(sub_filename)
                item['sub_urls'] = sub_urls[j]
                item['sub_title'] = sub_title[j]
                item['sub_filename'] = sub_filename
                items.append(item)
    for item in items:
        yield scrapy.Request(url=item['sub_urls'],
                             meta={'meta_1': item},
                             callback=self.second_parse)
def second_parse(self, response):
    # Pull the meta data out of this Response
    meta_1 = response.meta['meta_1']
    # All child links found on the sub-category page
    sonUrls = response.xpath('//a/@href').extract()

    items = []
    for i in range(0, len(sonUrls)):
        # Check that the link ends with .shtml and starts with the parent
        # category URL
        if_belong = sonUrls[i].endswith(
            '.shtml') and sonUrls[i].startswith(meta_1['parentUrls'])
        # If it belongs to this parent category, collect the fields into a
        # single item so they travel together
        if if_belong:
            item = SinaItem()
            item['parentTitle'] = meta_1['parentTitle']
            item['parentUrls'] = meta_1['parentUrls']
            item['subUrls'] = meta_1['subUrls']
            item['subTitle'] = meta_1['subTitle']
            item['subFilename'] = meta_1['subFilename']
            item['sonUrls'] = sonUrls[i]
            items.append(item)

    # Send a Request for each child link; the Response plus the meta data is
    # handed to the detail_parse callback
    for item in items:
        yield scrapy.Request(url=item['sonUrls'],
                             meta={'meta_2': item},
                             callback=self.detail_parse)
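The parse()/second_parse() pair above, and most of the variants below, assume a SinaItem with the camel-case field names they assign. A minimal items.py sketch consistent with those assignments (head and content are used by the detail_parse variants further down) could be:

# Minimal items.py sketch reconstructed from the assignments above; this is an
# assumption, not a copied definition.
import scrapy


class SinaItem(scrapy.Item):
    parentTitle = scrapy.Field()   # parent category title
    parentUrls = scrapy.Field()    # parent category URL
    subTitle = scrapy.Field()      # sub-category title
    subUrls = scrapy.Field()       # sub-category URL
    subFilename = scrapy.Field()   # directory the article should be saved under
    sonUrls = scrapy.Field()       # article (child) URL
    head = scrapy.Field()          # article headline (used by detail_parse below)
    content = scrapy.Field()       # article body text (used by detail_parse below)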
def parse(self, response):
    # Parent categories
    bigclasses = response.css("#tab01 div")
    for bigclass in bigclasses:
        item = SinaItem()
        # Parent category URL and title
        parentUrls = bigclass.css("h3 a::attr('href')").extract_first()
        parentTitle = bigclass.css("h3 a::text").extract_first()
        if not parentTitle:
            parentTitle = bigclass.css("h3 span::text").extract_first()
        parentPath = "./data/" + parentTitle
        item['parentUrls'] = parentUrls
        item['parentTitle'] = parentTitle
        smallclasses = bigclass.css("ul.list01 li")
        for smallclass in smallclasses:
            subTitle = smallclass.css("a::text").extract_first()
            subUrl = smallclass.css("a::attr('href')").extract_first()
            childPath = "/" + subTitle
            subFilename = parentPath + childPath
            # os.makedirs(subFilename, exist_ok=True)
            item['subTitle'] = subTitle
            item['subUrls'] = subUrl
            if not parentUrls:
                item['parentUrls'] = subUrl
            item['subFilename'] = subFilename
            yield scrapy.Request(subUrl,
                                 callback=self.parse_sonurls,
                                 meta={"item": deepcopy(item)})
def parse(self, response):
    titlelist = response.xpath('//div[@id="tab01"]/div')
    dirroot = 'd:/study/sina/'
    # Inspect each parent category and collect its sub-category info
    for each in titlelist[:-1]:
        item = SinaItem()
        item['parentUrls'] = each.xpath('.//h3/a/@href').extract()[0]
        item['parentTitle'] = each.xpath('.//h3/a/text()').extract()[0]
        item['subUrls'] = each.xpath('.//ul/li/a/@href').extract()
        item['subTitle'] = each.xpath('.//ul/li/a/text()').extract()
        # Build one folder per sub-category
        parentroot = dirroot + item['parentTitle'] + '/'
        subroot = [parentroot + x + '/' for x in item['subTitle']]
        # Make sure the parent category folder exists first
        if not os.path.exists(parentroot):
            os.mkdir(parentroot)
        # Create the sub-category folders and walk every sub-category feed
        for i in range(len(subroot)):
            if not os.path.exists(subroot[i]):
                os.mkdir(subroot[i])
            item['savepath'] = subroot[i]
            yield scrapy.Request(url=item['subUrls'][i],
                                 meta={'item': item},
                                 callback=self.parsenext)
def parse(self, response):
    items = []
    # Titles and URLs of all parent and sub-categories
    parentUrls = response.xpath(
        '//div[@id="tab01"]/div/h3/a/@href').extract()
    parentTitle = response.xpath(
        '//div[@id="tab01"]/div/h3/a/text()').extract()
    subUrls = response.xpath(
        '//div[@id="tab01"]/div/ul/li/a/@href').extract()
    subTitle = response.xpath(
        '//div[@id="tab01"]/div/ul/li/a/text()').extract()

    # All parent categories
    for i in range(0, len(parentTitle)):
        parentFilename = "./SinaData/" + parentTitle[i]
        if not os.path.exists(parentFilename):
            os.makedirs(parentFilename)

        # All sub-categories
        for j in range(0, len(subTitle)):
            item = SinaItem()
            item['parentTitle'] = parentTitle[i]
            item['parentUrls'] = parentUrls[i]
            if_belong = subUrls[j].startswith(item['parentUrls'])
            if if_belong:
                subFilename = parentFilename + '/' + subTitle[j]
                if not os.path.exists(subFilename):
                    os.makedirs(subFilename)
                # Store the sub-category fields
                item['subUrls'] = subUrls[j]
                item['subTitle'] = subTitle[j]
                item['subFilename'] = subFilename
                items.append(item)

    for item in items:
        yield scrapy.Request(url=item['subUrls'],
                             meta={"meta_1": item},
                             callback=self.second_parse)
def second_parse(self, response):
    # Pull the meta data out of this Response
    meta_1 = response.meta['meta_1']
    # All links found on the sub-category page
    sonUrls = response.xpath('//a/@href').extract()

    items = []
    for i in range(0, len(sonUrls)):
        if sonUrls[i].endswith('.shtml') and sonUrls[i].startswith(
                meta_1['parentUrls']):
            item = SinaItem()
            item['parentUrls'] = meta_1['parentUrls']
            item['parentTitle'] = meta_1['parentTitle']
            item['subUrls'] = meta_1['subUrls']
            item['subTitle'] = meta_1['subTitle']
            item['subFilename'] = meta_1['subFilename']
            item['sonUrls'] = sonUrls[i]
            items.append(item)

    # Send a Request for each child link; the Response plus the meta data is
    # handed to the detail_parse callback
    for item in items:
        yield scrapy.Request(url=item['sonUrls'],
                             meta={'meta_2': item},
                             callback=self.detail_parse)
def second_parse(self, response):
    """
    Parse a sub-category listing page.
    :param response: the response
    """
    items = []
    # Pull the meta data out of this response
    meta_1 = response.meta['meta_1']
    # All links found on the sub-category page
    urls = response.xpath('//a/@href').extract()
    for i in range(0, len(urls)):
        item = SinaItem()
        # Decide whether the link is an article link, i.e. it starts with the
        # parent category URL and ends with ".shtml"
        if_belong = urls[i].startswith(
            meta_1['par_urls']) and urls[i].endswith('.shtml')
        if if_belong:
            item['par_title'] = meta_1['par_title']
            item['par_urls'] = meta_1['par_urls']
            item['sub_title'] = meta_1['sub_title']
            item['sub_urls'] = meta_1['sub_urls']
            item['sub_filename'] = meta_1['sub_filename']
            item['urls'] = urls[i]
            items.append(item)
    for item in items:
        yield scrapy.Request(url=item['urls'],
                             meta={'meta_2': item},
                             callback=self.detail_parse)
def parseItem(self, response):
    body = response.body_as_unicode().strip(';').strip('(').strip(')')
    bodyData = body[body.index('"') + 1:body.rindex('"')]
    if not bodyData or len(bodyData) < 1:
        return
    datas = bodyData.split(',')
    hqItem = SinaItem()
    hqItem['name'] = datas[0]
    hqItem['time'] = datas[1]
    hqItem['openPrice'] = datas[2]
    hqItem['highestPrice'] = datas[3]
    hqItem['lowestPrice'] = datas[4]
    hqItem['yestodayClosePrice'] = datas[5]
    hqItem['buyPrice'] = datas[6]
    hqItem['sellPrice'] = datas[7]
    hqItem['newestPrice'] = datas[8]
    hqItem['clearPrice'] = datas[9]
    hqItem['yestodayClearPrice'] = datas[10]
    hqItem['buyQuantity'] = datas[11]
    hqItem['sellQuantity'] = datas[12]
    hqItem['holdPosQuantity'] = datas[13]
    hqItem['dealQuantity'] = datas[14]
    hqItem['tradeUnit'] = datas[15]
    hqItem['catogory'] = datas[16]
    hqItem['date'] = datas[17]
    # hqItem['dateAndTime'] = datas[0] + datas[17] + "-" + datas[1]
    yield hqItem
def second_parse(self, response):
    # print('response:', response)
    # print('response meta:', response.meta['meta_1'])
    # Pull out the meta data
    meta_1 = response.meta['meta_1']
    # Links to the posts inside this sub-category
    sonUrls = response.xpath('//a/@href').extract()
    items = []
    for i in range(len(sonUrls)):
        # Check that the link starts with the parent category URL
        if_belong = sonUrls[i].startswith(
            meta_1['parentUrls']) and sonUrls[i].endswith('.shtml')
        # Only post links that belong to this parent category are kept
        if if_belong:
            # print(sonUrls[i])
            # Build an item describing this single post link
            item = SinaItem()
            item['parentTitle'] = meta_1['parentTitle']
            item['parentUrls'] = meta_1['parentUrls']
            item['subTitle'] = meta_1['subTitle']
            item['subUrls'] = meta_1['subUrls']
            # item['subFilename'] = meta_1['subFilename']
            item['sonUrls'] = sonUrls[i]  # unique per post
            items.append(item)
    # print(len(items))

    for item in items:
        yield scrapy.Request(url=item['sonUrls'],
                             callback=self.detail_parse,
                             meta={'meta_2': item})
def detail_parse(self, response):
    # print('response:', response)
    # print('response meta:', response.meta['meta_2'])
    # Pull out the meta data
    meta_2 = response.meta['meta_2']
    head = response.xpath(
        '//h1[@id="artibodyTitle"]/text() | //h1[@class="main-title"]/text()'
    ).extract()
    if len(head) > 0:
        head = head[0]
    else:
        head = ''
    content_list = response.xpath(
        '//div[@id="artibody"]/p/text()').extract()
    # print('head:', head)
    # print('content_list:', content_list)
    content = ''
    for content_one in content_list:
        content += content_one
    item = SinaItem()
    item['parentTitle'] = meta_2['parentTitle']
    item['parentUrls'] = meta_2['parentUrls']
    item['subTitle'] = meta_2['subTitle']
    item['subUrls'] = meta_2['subUrls']
    # item['subFilename'] = meta_2['subFilename']
    item['sonUrls'] = meta_2['sonUrls']
    item['head'] = head
    item['content'] = content
    yield item
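Once detail_parse yields items like the one above, a pipeline can write the article text into the per-category directories that the earlier parse() variants create. The sketch below is an assumption, not code from any of the snippets: it presumes the item still carries subFilename (the variant above comments that field out) and derives a flat file name from sonUrls.

# Hypothetical pipelines.py sketch; assumes the item keeps the subFilename
# field produced by the directory-creating parse() variants above.
class SinaPipeline(object):
    def process_item(self, item, spider):
        son_url = item['sonUrls']
        # Drop the "http://" prefix and ".shtml" suffix, then flatten the path
        # into a single file name.
        filename = son_url[7:-6].replace('/', '_') + '.txt'
        path = item['subFilename'] + '/' + filename
        with open(path, 'w', encoding='utf-8') as fp:
            fp.write(item['content'])
        return item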
def parse(self, response):
    # Root nodes
    pre_root = response.xpath(
        "//div[@id='tab01']/div[@class='clearfix']")[:19]
    for seed in pre_root:
        # Outer loop: read the parent category title and link
        preTitle = seed.xpath("./h3[@class='tit02']/a/text()").extract()[0]
        preUrl = seed.xpath("./h3[@class='tit02']/a/@href").extract()[0]
        # Inner loop: iterate over the li tags under the parent category and
        # build one item per sub-category
        li_list = seed.xpath("./ul[@class='list01']/li")
        for li in li_list:
            # Build the item
            item = SinaItem()
            # Parent category fields
            item["preTitle"] = preTitle
            item["preUrl"] = preUrl
            # Sub-category fields
            item["subTitle"] = li.xpath("./a/text()").extract()[0]
            subUrl = li.xpath("./a/@href").extract()[0]
            # Optionally keep only sub-category links that start with the
            # parent category link
            # if subUrl.startswith(item["preUrl"]):
            item["subUrl"] = subUrl
            # else:
            #     continue
            # Save the sub-category folder path, used later to store the
            # article content
            item["subFilepath"] = ('./sinainfo/' + item["preTitle"] + '/'
                                   + item["subTitle"])
            yield scrapy.Request(item["subUrl"],
                                 meta={"meta_item": item},
                                 callback=self.parse_info)
def parse(self, response):
    soup = BeautifulSoup(response.body, "html.parser")
    # If the fetched page does not have the expected layout, return nothing
    if soup.body.find(class_="main_editor") is None:
        return None
    title = soup.body.find(class_="main_editor").find(class_="title").string.strip()
    publish_time = soup.body.find(class_="main_editor").find(class_="time").string.strip()
    publish_time = re.sub(r'[\u4e00-\u9fa5]', '', publish_time).strip()
    # The raw text looks like "阅读数:94577" (read count)
    read_num = soup.body.find(class_="main_editor").find(class_="W_fr").find(class_="num").string.strip()
    read_num = read_num[4:].strip()
    li_list = soup.body.find(class_="WB_feed").find(class_="WB_row_line").findAll('li')
    # The raw text looks like "转发 55" (forward count)
    forward_num = li_list[0].find(class_="pos").span.string.strip()
    forward_num = forward_num[2:].strip()
    # The raw text looks like "评论 6" (comment count)
    comment_num = li_list[1].find(class_="pos").span.string.strip()
    comment_num = comment_num[2:].strip()
    if li_list[2].find(class_="pos").span.span.em.string is None:
        like_num = ''
    else:
        like_num = li_list[2].find(class_="pos").span.span.em.string.strip()
    item = SinaItem()
    item['title'] = title
    item['publish_time'] = publish_time
    item['read_num'] = read_num
    item['forward_num'] = forward_num
    item['comment_num'] = comment_num
    item['like_num'] = like_num
    yield item
def parse_item(self, response):
    meta_item = response.meta['meta_item']
    # All links found on the sub-category page
    sonUrls = response.xpath('//a/@href').extract()
    items = []
    for x in range(0, len(sonUrls)):
        # is_belong = None
        # if((sonUrls[x].find('.shtml') or sonUrls[x].find('.html')) != -1):
        #     is_belong = sonUrls[x].startswith(meta_item['parentUrls'])
        # print is_belong
        # Note: str.find() returns -1 when missing, so compare explicitly
        is_belong = (sonUrls[x].endswith('.shtml')
                     or sonUrls[x].find('.html') != -1) and sonUrls[x].startswith(
                         meta_item['parentUrls'])
        if is_belong:
            item = SinaItem()
            item['parentTitle'] = meta_item['parentTitle']
            item['parentUrls'] = meta_item['parentUrls']
            item['subUrls'] = meta_item['subUrls']
            item['subTitle'] = meta_item['subTitle']
            item['subFilePath'] = meta_item['subFilePath']
            item['sonUrls'] = sonUrls[x]
            items.append(item)

    for item in items:
        yield scrapy.Request(url=item['sonUrls'],
                             meta={'meta_item_detail': item},
                             callback=self.parse_detail)
def parse(self, response):
    """
    Parse the channel-guide listing page.
    :param response: the response
    """
    # Holds the sub-category titles, links and directories
    items = []
    # Titles and URLs of all parent categories
    par_title = response.xpath(
        '//div[@id="tab01"]//div/h3/a/text()').extract()
    par_urls = response.xpath(
        '//div[@id="tab01"]//div/h3/a/@href').extract()
    # Titles and URLs of all sub-categories
    sub_title = response.xpath(
        '//div[@id="tab01"]//div/ul/li/a/text()').extract()
    sub_urls = response.xpath(
        '//div[@id="tab01"]//div/ul/li/a/@href').extract()

    # Create one directory per parent category
    for i in range(0, len(par_title)):
        par_filename = './Data/' + par_title[i]
        # Create the parent directory if it does not exist yet
        if not os.path.exists(par_filename):
            os.makedirs(par_filename)
        # Save the parent category's title and link
        item = SinaItem()
        item['par_title'] = par_title[i]
        item['par_urls'] = par_urls[i]
        for j in range(0, len(sub_title)):
            # Does the sub-category link share its prefix with the parent link?
            if_belong = sub_urls[j].startswith(item['par_urls'])
            if if_belong:
                # Join parent directory / sub-category directory
                sub_filename = par_filename + '/' + sub_title[j]
                # Create the sub-category directory if it does not exist yet
                if not os.path.exists(sub_filename):
                    os.makedirs(sub_filename)
                # Save the sub-category's link and title
                item['sub_urls'] = sub_urls[j]
                item['sub_title'] = sub_title[j]
                item['sub_filename'] = sub_filename
                # Add to the items list
                items.append(item)

    for item in items:
        # Visit each sub-category link; note that meta is an optional argument
        yield scrapy.Request(url=item['sub_urls'],
                             meta={'meta_1': item},
                             callback=self.second_parse)
def parse(self, response):
    items = []
    # Titles and links of the parent categories
    parentTitles = response.xpath(
        '//div[@class="clearfix"]/h3[@class="tit02"]/a/text()').extract()
    parentUrls = response.xpath(
        '//div[@class="clearfix"]/h3[@class="tit02"]/a/@href').extract()
    # Titles and links of the sub-categories
    subTitles = response.xpath(
        '//div[@class="clearfix"]/ul[@class="list01"]/li/a/text()').extract()
    subUrls = response.xpath(
        '//div[@class="clearfix"]/ul[@class="list01"]/li/a/@href').extract()

    # Walk every parent category
    for i in range(len(parentTitles)):
        # print(parentTitle)
        # Path of the parent directory
        parentFilename = './Data/' + parentTitles[i]
        # Create the parent directory if it does not exist
        # if not os.path.exists(parentFilename):
        #     os.makedirs(parentFilename)

        # Walk every sub-category
        for j in range(0, len(subTitles)):
            if_belong = subUrls[j].startswith(parentUrls[i])
            # Only file a sub-category under this parent if it belongs to it
            if if_belong:
                # print(parentUrls[i], '---', subUrls[j])
                # subFilename = parentFilename + '/' + subTitles[j]
                # print(subFilename)
                # Create the sub-directory if it does not exist
                # if not os.path.exists(subFilename):
                #     os.makedirs(subFilename)

                # Build an item describing this sub-category link
                item = SinaItem()
                item['parentTitle'] = parentTitles[i]
                item['parentUrls'] = parentUrls[i]
                item['subTitle'] = subTitles[j]
                item['subUrls'] = subUrls[j]
                # item['subFilename'] = subFilename  # unique
                # Add the item to the items list
                items.append(item)
    # print(len(items))  # 217

    for item in items:
        # print(item)
        # Send a request for each sub-category URL:
        #   url:      the URL of the new request
        #   callback: the Spiders-module method invoked once the response is back
        #   meta:     extra data passed along with the request
        yield scrapy.Request(url=item['subUrls'],
                             callback=self.second_parse,
                             meta={'meta_1': item})
def parse(self, response):
    items = []
    # URLs and titles of all parent categories
    parentUrls = response.xpath(
        '//div[@id="tab01"]/div/h3/a/@href').extract()
    parentTitle = response.xpath(
        '//div[@id="tab01"]/div/h3/a/text()').extract()
    # URLs and titles of all sub-categories
    subUrls = response.xpath(
        '//div[@id="tab01"]/div/ul/li/a/@href').extract()
    subTitle = response.xpath(
        '//div[@id="tab01"]/div/ul/li/a/text()').extract()

    # Walk every parent category
    for i in range(0, len(parentTitle)):
        # Path and directory name for the parent category
        # parentFilename = "./Data/" + parentTitle[i]
        # Create the directory if it does not exist
        # if not os.path.exists(parentFilename):
        #     os.makedirs(parentFilename)

        # Walk every sub-category
        for j in range(0, len(subUrls)):
            item = SinaItem()
            # Save the parent category's title and URL
            item['parentTitle'] = parentTitle[i]
            item['parentUrls'] = parentUrls[i]
            # Check whether the sub-category URL starts with its parent URL,
            # e.g. sports.sina.com.cn and sports.sina.com.cn/nba
            if_belong = subUrls[j].startswith(item['parentUrls'])
            # If it belongs to this parent category, put its directory under
            # the parent directory
            if if_belong:
                # subFilename = parentFilename + '/' + subTitle[j]
                # Create the directory if it does not exist
                # if not os.path.exists(subFilename):
                #     os.makedirs(subFilename)
                # Store the sub-category URL, title and filename fields
                item['subUrls'] = subUrls[j]
                item['subTitle'] = subTitle[j]
                # item['subFilename'] = subFilename
                # yield item
                items.append(item)

    # Send a Request for each sub-category URL; the Response plus the meta data
    # is handed to the second_parse callback
    for item in items:
        yield scrapy.Request(url=item['subUrls'],
                             meta={'meta_1': item},
                             callback=self.second_parse,
                             dont_filter=True)
def parse(self, response):
    items = []
    base_path = r'J:\Desktop\sina'
    sub_links = response.xpath(
        "//div[@id='tab01']/div[@class='clearfix']//ul//a/@href").extract()
    sub_titles = response.xpath(
        "//div[@id='tab01']/div[@class='clearfix']//ul//a/text()").extract()
    parent_titles = response.xpath(
        "//div[@id='tab01']/div[@class='clearfix']//h3//text()").extract()
    parent_links = response.xpath(
        "//div[@id='tab01']/div[@class='clearfix']//h3//a/@href").extract()

    for i in range(len(parent_titles) - 1):
        # Create the parent category directory
        parent_path = os.path.join(base_path, parent_titles[i])
        if not os.path.exists(parent_path):
            os.mkdir(parent_path)
        for j in range(len(sub_titles)):
            item = SinaItem()
            item["parent_title"] = parent_titles[i]
            # The last entry (the local city stations) has no URL, so a
            # placeholder would be needed:
            # if i == len(parent_links):
            #     item['parent_url'] = '*.sina.com.cn'
            # else:
            item['parent_url'] = parent_links[i]
            belong = sub_links[j].startswith(item["parent_url"])
            # Workaround for the local city-station entries
            # if i == len(parent_links) and j >= len(sub_titles) - 27:
            #     sub_path = os.path.join(parent_path, sub_titles[j])
            #     if not os.path.exists(sub_path):
            #         os.mkdir(sub_path)
            #     item['sub_url'] = sub_links[j]
            #     item['sub_title'] = sub_titles[j]
            #     item['sub_path'] = sub_path
            #     items.append(item)
            if belong:
                sub_path = os.path.join(parent_path, sub_titles[j])
                if not os.path.exists(sub_path):
                    os.mkdir(sub_path)
                item['sub_url'] = sub_links[j]
                item['sub_title'] = sub_titles[j]
                item['sub_path'] = sub_path
                items.append(item)

    # with open(r'J:\Desktop\test.txt', 'w') as fp:
    #     fp.write(i['sub_path'] + ' ' + i['sub_title'] +
    #              ' ' + i['sub_url'] + '\n')
    for i in items:
        yield scrapy.Request(url=i['sub_url'],
                             callback=self.sub_parse,
                             meta={'sub_item': i})
def parse(self, response):
    items = []
    # URLs and titles of all parent categories
    parentUrls = response.xpath(
        '//div[@id="tab01"]/div/h3/a/@href').extract()
    parentTitle = response.xpath(
        '//div[@id="tab01"]/div/h3/a/text()').extract()
    # URLs and titles of all sub-categories
    subUrls = response.xpath(
        '//div[@id="tab01"]/div/ul/li/a/@href').extract()
    subTitle = response.xpath(
        '//div[@id="tab01"]/div/ul/li/a/text()').extract()

    # Walk every parent category
    for i in range(0, len(parentTitle)):
        # Path and directory name for the parent category
        parentFileName = './Data/' + parentTitle[i]
        # Create the parent directory if it does not exist yet
        if not os.path.exists(parentFileName):
            os.mkdir(parentFileName)

        # Walk every sub-category under this parent category
        for j in range(0, len(subUrls)):
            item = SinaItem()
            # Save the parent category's title and URL
            item['parentTitle'] = parentTitle[i]
            item['parentUrls'] = parentUrls[i]
            # Check whether the sub-category URL starts with its parent URL,
            # e.g. sports.sina.com.cn and sports.sina.com.cn/nba
            if_belong = subUrls[j].startswith(item['parentUrls'])
            # If it belongs to this parent category, put its directory under
            # the parent directory
            if if_belong:
                # Join the sub-category path and directory name
                subFileName = parentFileName + '/' + subTitle[j]
                # Create the sub-directory if it does not exist yet
                if not os.path.exists(subFileName):
                    os.mkdir(subFileName)
                # Store the sub-category URL, title and filename fields
                item['subUrls'] = subUrls[j]
                item['subTitle'] = subTitle[j]
                item['subFileName'] = subFileName
                items.append(item)

    # Send a Request for each sub-category URL; the Response plus the meta data
    # is handed to the second_parse callback
    for item in items:
        yield scrapy.Request(url=item['subUrls'],
                             meta={'meta_1': item},
                             callback=self.second_parse)
def parse_stock(self, response):
    html = response.body.decode('utf-8')
    # Grab the raw "text" and "screen_name" values out of the JSON-like body
    result = re.search(r'\"text\": \".*\",\n', html)
    result2 = re.search(r'\"screen_name\": \".*?\"', html)
    author = eval(result2.group(0).split(':')[1])
    # Strip the surrounding '"text": "' prefix and '",' suffix
    html = result.group(0)[9:-3]
    # Collapse newlines and spaces, then drop any HTML tags
    new_html = "".join((re.sub("\n", " ", html)).split(" "))
    content = re.sub('<.*?>', '', new_html)
    item = SinaItem(author=author, content=content)
    yield item
def parse(self, response):
    url = str(response.url)
    if 'comment5' in url:
        item = SinaItem()
        # Dynamic contents extraction: (3) open a Selenium browser
        driver = webdriver.Firefox()
        driver.get(url)
        # Extract the news title (adjustment for abnormal urls)
        try:
            title = driver.find_element_by_xpath('//h1[@id = "J_NewsTitle"]/a').text.encode('utf-8')  # when channel is not 'kj'
            title = driver.find_element_by_xpath('//h1[@bbs-node-type = "title"]/a').text.encode('utf-8')  # when channel is 'kj'
        except:
            title = ''
        item['title'] = title
        # Dynamic contents extraction: (4) extract the number of reviews
        contents = driver.find_elements_by_xpath('//span[contains(@class,"f_red")]')  # when channel is not 'kj'
        # contents = driver.find_elements_by_xpath('//span[contains(@class,"count")]//em')  # when channel is 'kj'
        try:
            item['num_comment'] = contents[0].text.encode('utf-8')
        except:
            item['num_comment'] = 0
        # Dynamic contents extraction: (5) extract the number of participants
        try:
            item['num_part'] = contents[1].text.encode('utf-8')
        except:
            item['num_part'] = 0
        # Dynamic contents extraction: (6) extract the reviews
        comments = ''
        # when channel is not 'kj'
        for comment in driver.find_elements_by_xpath('//div[@id="J_Comment_List_Hot"]//div[@class="orig_content"]'):
            comments += comment.text.encode('utf-8') + '\n' + '\n'
        for comment in driver.find_elements_by_xpath('//div[@id="J_Comment_List_Hot"]//div[@class="t_txt"]'):
            comments += comment.text.encode('utf-8') + '\n' + '\n'
        for comment in driver.find_elements_by_xpath('//div[@class="comment_item_page_first"]//div[@class="orig_content"]'):
            comments += comment.text.encode('utf-8') + '\n' + '\n'
        for comment in driver.find_elements_by_xpath('//div[@class="comment_item_page_first"]//div[@class="t_txt"]'):
            comments += comment.text.encode('utf-8') + '\n' + '\n'
        # when channel is 'kj'
        # for comment in driver.find_elements_by_xpath('//div[@class="sina-comment-page sina-comment-page-show"]//div[@comment-type="itemTxt"]'):
        #     comments += comment.text.encode('utf-8') + '\n' + '\n'
        item['comment'] = comments
        driver.close()
        yield item

    # Obtain the url of the next piece of news
    reader = csv.reader(open('/media/sunzeyeah/Personal/SENIOR/Thesis/Data/Chinese/Sina/news_0317.csv', 'r'))
    for line in reader:
        if line[0] != 'comment_url':
            yield Request(line[0], callback=self.parse)
def parse_item(self, response):
    news_list = response.xpath('//ul[@class="list_009"]/li')
    for news in news_list:
        title = news.xpath('./a/text()').extract_first()
        data_time = news.xpath('./span/text()').extract_first()
        new_url = news.xpath('./a/@href').extract_first()
        item = SinaItem()
        item['title'] = title
        item['data_time'] = data_time
        item['new_url'] = new_url
        yield item
def parse_user(self, response):
    """
    Parse the user info; the desired fields are pulled out of the JSON data.
    :param response: Response object
    """
    self.logger.debug(response)
    result = json.loads(response.text)
    if result.get('data').get('userInfo'):
        user_info = result.get('data').get('userInfo')
        user_item = SinaItem()
        field_map = {
            'id': 'id',
            'name': 'screen_name',
            'avatar': 'profile_image_url',
            'cover': 'cover_image_phone',
            'gender': 'gender',
            'description': 'description',
            'fans_count': 'followers_count',
            'follows_count': 'follow_count',
            'weibos_count': 'statuses_count',
            'verified': 'verified',
            'verified_reason': 'verified_reason',
            'verified_type': 'verified_type'
        }
        print("********************************")
        print(field_map)
        for field, attr in field_map.items():
            user_item[field] = user_info.get(attr)
        yield user_item

        uid = user_info.get('id')
        # Follows
        yield Request(self.follow_url.format(uid=uid, page=1),
                      callback=self.parse_follows,
                      meta={'page': 1, 'uid': uid})
        # Fans
        yield Request(self.fan_url.format(uid=uid, page=1),
                      callback=self.parse_fans,
                      meta={'page': 1, 'uid': uid})
        # Weibo posts
        yield Request(self.weibo_url.format(uid=uid, page=1),
                      callback=self.parse_weibos,
                      meta={'page': 1, 'uid': uid})
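The keys on the left-hand side of field_map are the item fields this variant expects. A minimal SinaItem definition consistent with it (an assumption reconstructed from the mapping, not a copied definition) would be:

# Hypothetical user-item definition matching the field_map keys above.
import scrapy


class SinaItem(scrapy.Item):
    id = scrapy.Field()
    name = scrapy.Field()
    avatar = scrapy.Field()
    cover = scrapy.Field()
    gender = scrapy.Field()
    description = scrapy.Field()
    fans_count = scrapy.Field()
    follows_count = scrapy.Field()
    weibos_count = scrapy.Field()
    verified = scrapy.Field()
    verified_reason = scrapy.Field()
    verified_type = scrapy.Field()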
def parse(self, response):
    categories = response.xpath("//div[@id='tab01']/div")
    # The last block is the city listing; skip it
    categories.pop()
    for category in categories:
        main_name = category.xpath(".//a/text()").extract_first()
        sublis = category.xpath(".//ul/li")
        for li in sublis:
            # Sub-category name comes from the link text, its URL from the href
            sub_name = li.xpath(".//a/text()").extract_first()
            sub_url = li.xpath(".//a/@href").extract_first()
            item = SinaItem(main_name=main_name, sub_name=sub_name)
            yield scrapy.Request(url=sub_url,
                                 meta={"item": item},
                                 callback=self.parse_sub_page)
def parse(self, response):
    sel = Selector(response)
    item = SinaItem()
    brand = sel.xpath("//a[@class='fL logo']/img/@alt").extract_first()
    model = response.url.strip('/').split('/')[-1]
    model_name = sel.xpath(
        "//span[@class='fL name']/a[1]/text()").extract_first()
    js_pi_url = ('http://db.auto.sina.com.cn/api/car/getFilterCar.json'
                 '?subid=%s&niankuan=&derailleur_type=&product_status=1,2'
                 '&outgas=&auto_type=' % model)
    js_p_text = requests.get(js_pi_url)
    js_p_load = json.loads(js_p_text.text)
    tds = sel.xpath(
        "//div[@class='cartype_list lump']/table/tbody/tr/td[1]")
    for td in tds:
        url = td.xpath('a[1]/@href').extract_first()
        version_name = td.xpath('a[1]/span/text()').extract_first()
        version_id = url.strip('/').split('/')[-1]
        js_url = 'http://db.auto.sina.com.cn/api/car/getFilterCarInfo.json?carid=%s' % version_id
        js_text = requests.get(js_url)
        js_load = json.loads(js_text.text)
        for i in js_load['baseinfo']['data']:
            if i['name'] == '变速箱':  # gearbox/transmission entry
                paidang = i['data'][-1].get('data', '')
                item['paidang'] = paidang
        item['version_id'] = version_id
        item['version'] = version_name
        item['url'] = response.url
        item['model'] = model
        item['brand'] = brand
        now = datetime.datetime.now()
        item['collect_date'] = now.strftime("%Y-%m-%d")
        item['standard_version'] = '%s %s' % (model_name, version_name)
        for i in js_p_load:
            if i['car_id'] == version_id:
                item['classfy'] = '厂商指导价'  # manufacturer's suggested price
                item['item'] = '厂商指导价'
                item['karw'] = i['merchant_price_indoor']
                yield item
        for k in js_load.keys():
            for data in js_load[k].get('data', ''):
                item['classfy'] = js_load[k]['name']
                item['item'] = data['name']
                item['karw'] = data['data'][-1].get('data', '')
                # print(item)
                yield item
def parse_item(self, response):
    url = str(response.url)
    # Title extraction
    title = [n.encode('utf-8') for n in response.xpath('//h1[contains(@id,"artibodyTitle")]/text()').extract()]
    # title = response.xpath('//head/title/text()').extract()
    # Date extraction
    Date = response.xpath('//span[contains(@id,"pub_date")]/text()').extract()  # html
    Date += response.xpath('//span[contains(@class,"time-source")]/text()').extract()  # shtml
    date = [n.encode('utf-8') for n in Date]
    # Contents extraction
    contents = ''
    for body in response.xpath('//div[contains(@id,"artibody")]//p/text()'):
        for n in body.extract():
            contents += n.encode('utf-8')
    # Dynamic contents extraction: (1) extract the news ID and review channel
    newsID = ''
    channel = ''
    raw = str(response.xpath('//meta[contains(@content,"comment_channel")]/@content').extract())
    real = ''
    i = 3
    while i <= len(raw) - 3:
        real += raw[i]
        i = i + 1
    final = []
    for s in real.split(':'):
        ss = s.split(';')
        for eachone in ss:
            final.append(eachone)
    i = 0
    while i < len(final):
        if 'comment_id' in final[i]:
            i = i + 1
            newsID = final[i]
        elif 'comment_channel' in final[i]:
            i = i + 1
            channel = final[i]
        i = i + 1
    # Dynamic contents extraction: (2) generate the url of the reviews page
    comment_url = 'http://comment5.news.sina.com.cn/comment/skin/default.html?channel=' + channel + '&newsid=' + newsID
    if title and date and contents:  # and newsID and channel:
        item = SinaItem()
        item['url'] = url.encode('utf-8')
        item['comment_url'] = comment_url.encode('utf-8')
        item['title'] = title
        item['date'] = date
        item['body'] = contents
        yield item
def parse(self, response):
    item = SinaItem()
    div_list = response.xpath('.//div[@id="tab01"]/div')[:-1]
    for div in div_list:
        item['b_title'] = div.xpath('./h3/a/text()').extract_first()
        item['b_urls'] = div.xpath('./h3/a/@href').extract_first()
        for li in div.xpath('./ul/li'):
            item['s_title'] = li.xpath('./a/text()').extract_first()
            item['s_urls'] = li.xpath('./a/@href').extract_first()
            # print(item)
            yield scrapy.Request(item['s_urls'],
                                 callback=self.parse_detail,
                                 meta={'item': item})
def parse(self, response):
    """
    This function parses the Sina page
    @url http://news.sina.com.cn/
    @returns items 1
    @scrapes centerNews rightNews hostname author
    """
    loader = ItemLoader(item=SinaItem(), response=response)
    loader.add_xpath('centerNews',
                     '//*/h1[@data-client="headline"]/a/text()',
                     MapCompose(lambda t: t[:4]), Join())
    loader.add_xpath('rightNews', '//*/div[@class="tl"]/a/text()')
    loader.add_value('hostname', response.url)
    loader.add_value('author', 'Mory')
    return loader.load_item()
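The docstring above doubles as a Scrapy contract, so it can be exercised with `scrapy check <spider_name>`. The processors passed to add_xpath can also be tried standalone; the snippet below only illustrates what MapCompose and Join do to a list of extracted headlines (the sample strings are made up):

# Standalone illustration of the processors used above (sample data is made up).
from itemloaders.processors import MapCompose, Join  # scrapy.loader.processors on older Scrapy

headlines = ['First headline', 'Second headline']
truncate = MapCompose(lambda t: t[:4])  # keep the first 4 characters of each value
print(truncate(headlines))              # ['Firs', 'Seco']
print(Join()(truncate(headlines)))      # 'Firs Seco'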
def parse(self, response):
    for i in response.xpath('//div[@class="quote"]'):
        content = i.xpath('span[@class="text"]/text()').extract_first()
        author = i.xpath('span/small/text()').extract_first()
        tag = i.xpath('div[@class="tags"]//a/text()').extract()
        item = SinaItem(content=content, author=author, tag=tag)
        yield item
    # Go to the next page
    try:
        next_page = response.xpath('//li[@class="next"]/a/attribute::href').extract_first()
        if next_page:
            next_page = response.urljoin(str(next_page))
            yield scrapy.Request(next_page, callback=self.parse)
    except Exception as identifier:
        print('error')
def parse_details(self, response):
    soup = BeautifulSoup(response.text, 'html.parser')
    try:
        title = self.extract_title(soup)
        if not title:
            raise Exception('Skip ' + response.url + ' cannot find the title.')
        content = self.extract_content(soup)
        if not content:
            raise Exception('Skip ' + response.url + ' cannot find the content.')
        print(title)
    except Exception as e:
        self.logger.error(str(e))
        self.logger.error(traceback.format_exc())
        # Nothing usable was extracted, so do not yield an item for this page
        return
    item = SinaItem(_id=response.url, title=title, content=content)
    yield item
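extract_title and extract_content are helper methods that are not part of this snippet. Hypothetical implementations, reusing the selectors that appear in the detail_parse variant earlier (h1#artibodyTitle / h1.main-title and div#artibody), might look like:

# Hypothetical helpers -- the selectors are borrowed from the detail_parse
# variant above and are assumptions, not the original implementation.
def extract_title(self, soup):
    tag = soup.find('h1', class_='main-title') or soup.find('h1', id='artibodyTitle')
    return tag.get_text(strip=True) if tag else None

def extract_content(self, soup):
    body = soup.find('div', id='artibody')
    if not body:
        return None
    return '\n'.join(p.get_text(strip=True) for p in body.find_all('p'))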