Example #1
    def parse(self, response):
        items = []

        # Get the names of all parent categories
        parentTitle = response.xpath(
            '//div[@id="tab01"]//div//h3/a/text()').extract()
        # Get the links of all parent categories
        parentUrls = response.xpath(
            '//div[@id="tab01"]//div//h3/a/@href').extract()

        # Get the urls of all subcategories
        subUrls = response.xpath(
            '//div[@id="tab01"]//div//ul/li/a/@href').extract()
        # Get the names of all subcategories
        subTitle = response.xpath(
            '//div[@id="tab01"]//div//ul/li/a/text()').extract()
        # Crawl all parent categories
        for i in range(0, len(parentTitle)):

            item = SinaItem()

            # Path and directory name for this parent category
            parentFilename = "./Data/" + parentTitle[i]

            # Create the directory if it does not exist
            if (not os.path.exists(parentFilename)):
                os.makedirs(parentFilename)

            # Crawl all subcategories
            for j in range(0, len(subUrls)):
                item = SinaItem()

                # Save the parent category title and url
                item['parentTitle'] = parentTitle[i]
                item['parentUrls'] = parentUrls[i]

                # Check whether the subcategory url starts with its parent category url; returns True if so (e.g. sports.sina.com.cn and sports.sina.com.cn/nba)

                if subUrls[j].startswith(item['parentUrls']):
                    subFilename = parentFilename + '/' + subTitle[j]
                    print(subFilename)
                    # Create the directory if it does not exist
                    if (not os.path.exists(subFilename)):
                        os.makedirs(subFilename)

                    # Store the subcategory url, title and filename fields
                    item['subUrls'] = subUrls[j]
                    item['subTitle'] = subTitle[j]
                    item['subFilename'] = subFilename

                    items.append(item)

        # Send a Request for each subcategory url; the Response, together with its meta data, is handed to the second_parse callback

        for item in items:
            yield scrapy.Request(url=item['subUrls'],
                                 meta={'meta_1': item},
                                 callback=self.second_parse)
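
All of these spiders populate a SinaItem, whose definition is not shown on this page. A minimal items.py sketch covering the field names used in Examples #1, #3 and #11 (the real project may declare more or differently named fields) could look like this:

import scrapy


class SinaItem(scrapy.Item):
    # Parent category, e.g. sports.sina.com.cn
    parentTitle = scrapy.Field()
    parentUrls = scrapy.Field()
    # Subcategory, e.g. sports.sina.com.cn/nba
    subTitle = scrapy.Field()
    subUrls = scrapy.Field()
    subFilename = scrapy.Field()
    # Individual article
    sonUrls = scrapy.Field()
    head = scrapy.Field()
    content = scrapy.Field()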
Example #2
    def parse(self, response):
        items = []
        parent_urls = response.xpath("//div[1]/div/h3[@class='tit02']/a/@href").extract()
        parent_title = response.xpath("//div[1]/div/h3[@class='tit02']/a/text()").extract()

        sub_urls = response.xpath("//div[1]/div[not(@data-sudaclick='citynav')]/ul/li/a/@href").extract()
        sub_title = response.xpath("//div[1]/div[not(@data-sudaclick='citynav')]/ul/li/a/text()").extract()

        for i in range(0,len(parent_title)):
            parent_filename = "./Data/" + parent_title[i]
            if(not os.path.exists(parent_filename)):
                os.makedirs(parent_filename)

            for j in range(0,len(sub_title)):
                item = SinaItem()
                item['parent_title'] = parent_title[i]
                item['parent_urls'] = parent_urls[i]

                if_belong = sub_urls[j].startswith(parent_urls[i])
                if if_belong:
                    sub_filename = parent_filename + '/' + sub_title[j]
                    if(not os.path.exists(sub_filename)):
                        os.makedirs(sub_filename)

                    item['sub_urls'] = sub_urls[j]
                    item['sub_title'] = sub_title[j]
                    item['sub_filename'] = sub_filename

                    items.append(item)
        for item in items:
            yield scrapy.Request(url = item['sub_urls'], meta = {'meta_1':item}, callback = self.second_parse)
Example #3
    def second_parse(self, response):
        # Retrieve the meta data carried by this Response
        meta_1 = response.meta['meta_1']

        # Collect all links on the subcategory page
        sonUrls = response.xpath('//a/@href').extract()

        items = []
        for i in range(0, len(sonUrls)):
            # Check whether the link starts with the parent category url and ends with .shtml; returns True if so
            if_belong = sonUrls[i].endswith(
                '.shtml') and sonUrls[i].startswith(meta_1['parentUrls'])

            # If it belongs to this parent category, gather the field values into one item for passing along
            if (if_belong):
                item = SinaItem()
                item['parentTitle'] = meta_1['parentTitle']
                item['parentUrls'] = meta_1['parentUrls']
                item['subUrls'] = meta_1['subUrls']
                item['subTitle'] = meta_1['subTitle']
                item['subFilename'] = meta_1['subFilename']
                item['sonUrls'] = sonUrls[i]
                items.append(item)

        # Send a Request for each article url; the Response, together with its meta data, is handed to the detail_parse callback
        for item in items:
            yield scrapy.Request(url=item['sonUrls'],
                                 meta={'meta_2': item},
                                 callback=self.detail_parse)
Example #4
    def parse(self, response):

        # Parent categories
        bigclasses = response.css("#tab01 div")

        for bigclass in bigclasses:
            item = SinaItem()

            # Parent category url and title
            parentUrls = bigclass.css("h3 a::attr('href')").extract_first()
            parentTitle = bigclass.css("h3 a::text").extract_first()
            if not parentTitle:
                parentTitle = bigclass.css("h3 span::text").extract_first()
            parentPath = "./data/" + parentTitle

            item['parentUrls'] = parentUrls
            item['parentTitle'] = parentTitle
            smallclasses = bigclass.css("ul.list01 li")

            for smallclass in smallclasses:
                subTitle = smallclass.css("a::text").extract_first()
                subUrl = smallclass.css("a::attr('href')").extract_first()
                childPath = "/" + subTitle
                subFilename = parentPath + childPath
                # os.makedirs(subFilename, exist_ok=True)

                item['subTitle'] = subTitle
                item['subUrls'] = subUrl
                if not parentUrls:
                    item['parentUrls'] = subUrl
                item['subFilename'] = subFilename
                yield scrapy.Request(subUrl,
                                     callback=self.parse_sonurls,
                                     meta={"item": deepcopy(item)})
Example #5
    def parse(self, response):
        titlelist = response.xpath('//div[@id="tab01"]/div')
        dirroot = 'd:/study/sina/'

        # Go through each parent category and collect its subcategory info
        for each in titlelist[:-1]:
            item = SinaItem()
            item['parentUrls'] = each.xpath('.//h3/a/@href').extract()[0]
            item['parentTitle'] = each.xpath('.//h3/a/text()').extract()[0]
            item['subUrls'] = each.xpath('.//ul/li/a/@href').extract()
            item['subTitle'] = each.xpath('.//ul/li/a/text()').extract()

            # Build folder paths for the subcategories
            parentroot = dirroot + item['parentTitle'] + '/'
            subroot = [parentroot + x + '/' for x in item['subTitle']]

            # Make sure the parent category folder exists first
            if not os.path.exists(parentroot):
                os.mkdir(parentroot)

            # Create each subcategory folder and iterate over its news pages
            for i in range(len(subroot)):
                if not os.path.exists(subroot[i]):
                    os.mkdir(subroot[i])
                item['savepath'] = subroot[i]

                yield scrapy.Request(url=item['subUrls'][i],
                                     meta={'item': item},
                                     callback=self.parsenext)
Example #6
 def parse(self, response):
     items = []
     # Titles and urls of all parent and sub categories
     parentUrls = response.xpath(
         '//div[@id="tab01"]/div/h3/a/@href').extract()
     parentTitle = response.xpath(
         '//div[@id="tab01"]/div/h3/a/text()').extract()
     subUrls = response.xpath(
         '//div[@id="tab01"]/div/ul/li/a/@href').extract()
     subTitle = response.xpath(
         '//div[@id="tab01"]/div/ul/li/a/text()').extract()
     # All parent categories
     for i in range(0, len(parentTitle)):
         parentFilename = "./SinaData/" + parentTitle[i]
         if (not os.path.exists(parentFilename)):
             os.makedirs(parentFilename)
         # All subcategories
         for j in range(0, len(subTitle)):
             item = SinaItem()
             item['parentTitle'] = parentTitle[i]
             item['parentUrls'] = parentUrls[i]
             if_belong = subUrls[j].startswith(item['parentUrls'])
             if (if_belong):
                 subFilename = parentFilename + '/' + subTitle[j]
                 if (not os.path.exists(subFilename)):
                     os.makedirs(subFilename)
                 # Store the subcategory fields
                 item['subUrls'] = subUrls[j]
                 item['subTitle'] = subTitle[j]
                 item['subFilename'] = subFilename
                 items.append(item)
     for item in items:
         yield scrapy.Request(url=item['subUrls'],
                              meta={"meta_1": item},
                              callback=self.second_parse)
Example #7
    def second_parse(self, response):
        # Retrieve the meta data carried by this Response
        meta_1 = response.meta['meta_1']

        # Collect all links on the subcategory page
        sonUrls = response.xpath('//a/@href').extract()

        items = []
        for i in range(0, len(sonUrls)):
            if sonUrls[i].endswith('.shtml') and sonUrls[i].startswith(
                    meta_1['parentUrls']):
                item = SinaItem()
                item['parentUrls'] = meta_1['parentUrls']
                item['parentTitle'] = meta_1['parentTitle']
                item['subUrls'] = meta_1['subUrls']
                item['subTitle'] = meta_1['subTitle']
                item['subFilename'] = meta_1['subFilename']
                item['sonUrls'] = sonUrls[i]

                items.append(item)

        # Send a Request for each article url; the Response, together with its meta data, is handed to the detail_parse callback
        for item in items:
            yield scrapy.Request(url=item['sonUrls'],
                                 meta={'meta_2': item},
                                 callback=self.detail_parse)
Example #8
    def second_parse(self, response):
        """
        Parse the subcategory list page
        :param response: the response
        """
        items = []

        # Retrieve the meta data carried by this response
        meta_1 = response.meta['meta_1']

        # All links under the subcategory page
        urls = response.xpath('//a/@href').extract()

        for i in range(0, len(urls)):
            item = SinaItem()
            # Check whether the link is an article link, i.e. it starts with the parent category url and ends with ".shtml"
            if_belong = urls[i].startswith(
                meta_1['par_urls']) and urls[i].endswith('.shtml')
            if if_belong:
                item['par_title'] = meta_1['par_title']
                item['par_urls'] = meta_1['par_urls']
                item['sub_title'] = meta_1['sub_title']
                item['sub_urls'] = meta_1['sub_urls']
                item['sub_filename'] = meta_1['sub_filename']
                item['urls'] = urls[i]

                items.append(item)
        for item in items:
            yield scrapy.Request(url=item['urls'],
                                 meta={'meta_2': item},
                                 callback=self.detail_parse)
Example #9
    def parseItem(self, response):
        body = response.body_as_unicode().strip(';').strip('(').strip(')')
        bodyData = body[body.index('"') + 1:body.rindex('"')]
        if not bodyData or len(bodyData) < 1:
            return

        datas = bodyData.split(',')

        hqItem = SinaItem()
        hqItem['name'] = datas[0]
        hqItem['time'] = datas[1]
        hqItem['openPrice'] = datas[2]
        hqItem['highestPrice'] = datas[3]
        hqItem['lowestPrice'] = datas[4]
        hqItem['yestodayClosePrice'] = datas[5]
        hqItem['buyPrice'] = datas[6]
        hqItem['sellPrice'] = datas[7]
        hqItem['newestPrice'] = datas[8]
        hqItem['clearPrice'] = datas[9]
        hqItem['yestodayClearPrice'] = datas[10]
        hqItem['buyQuantity'] = datas[11]
        hqItem['sellQuantity'] = datas[12]
        hqItem['holdPosQuantity'] = datas[13]
        hqItem['dealQuantity'] = datas[14]
        hqItem['tradeUnit'] = datas[15]
        hqItem['catogory'] = datas[16]
        hqItem['date'] = datas[17]
        # hqItem['dateAndTime']=datas[0]+datas[17]+"-"+datas[1]
        yield hqItem
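
The slicing in parseItem assumes the quote interface returns a JavaScript-style assignment whose quoted value is a comma-separated record of at least 18 fields. A standalone sketch of the same extraction, run on a synthetic payload (the values below are placeholders, not real market data):

# Synthetic payload shaped like  var xxx="f0,f1,...,f17";  -- the values are made up.
body = 'var hq_str_demo="NAME,093000,100.0,101.0,99.0,99.5,100.1,100.2,100.3,0,0,10,12,3456,789,unit,cat,2020-01-01";'

trimmed = body.strip(';').strip('(').strip(')')
record = trimmed[trimmed.index('"') + 1:trimmed.rindex('"')]
fields = record.split(',')

print(fields[0])    # -> NAME        (mapped to hqItem['name'] above)
print(fields[17])   # -> 2020-01-01  (mapped to hqItem['date'] above)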
Example #10
    def second_parse(self, response):
        #print('response:', response)
        #print('response meta:', response.meta['meta_1'])

        # Retrieve the meta data
        meta_1 = response.meta['meta_1']

        # Collect the article links on the subcategory page
        sonUrls = response.xpath('//a/@href').extract()

        items = []
        for i in range(len(sonUrls)):
            # Check whether the link starts with the parent category url
            if_belong = sonUrls[i].startswith(
                meta_1['parentUrls']) and sonUrls[i].endswith('.shtml')

            # Only links that belong to this parent category are extracted
            if if_belong:
                #print(sonUrls[i])
                # Create an item describing each article link
                item = SinaItem()
                item['parentTitle'] = meta_1['parentTitle']
                item['parentUrls'] = meta_1['parentUrls']
                item['subTitle'] = meta_1['subTitle']
                item['subUrls'] = meta_1['subUrls']
                #item['subFilename'] = meta_1['subFilename']
                item['sonUrls'] = sonUrls[i]  # unique per article
                items.append(item)

        #print(len(items)) #
        for item in items:
            yield scrapy.Request(url=item['sonUrls'],
                                 callback=self.detail_parse,
                                 meta={'meta_2': item})
Example #11
    def detail_parse(self, response):
        #print('response:', response)
        #print('response meta:', response.meta['meta_2'])

        # Retrieve the meta data
        meta_2 = response.meta['meta_2']

        head = response.xpath(
            '//h1[@id="artibodyTitle"]/text() | //h1[@class="main-title"]/text()'
        ).extract()
        if len(head) > 0:
            head = head[0]
        else:
            head = ''
        content_list = response.xpath(
            '//div[@id="artibody"]/p/text()').extract()

        #print('head:', head)
        #print('content_list:', content_list)
        content = ''
        for content_one in content_list:
            content += content_one

        item = SinaItem()
        item['parentTitle'] = meta_2['parentTitle']
        item['parentUrls'] = meta_2['parentUrls']
        item['subTitle'] = meta_2['subTitle']
        item['subUrls'] = meta_2['subUrls']
        #item['subFilename'] = meta_2['subFilename']
        item['sonUrls'] = meta_2['sonUrls']
        item['head'] = head
        item['content'] = content

        yield item
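
A detail_parse like this yields items that carry head and content but does not write them anywhere; persisting is usually left to a pipeline. A hypothetical pipeline sketch, assuming the item still carries the subFilename directory prepared in Examples #1 and #3 (it is commented out in this particular example):

import os


class SinaSavePipeline(object):
    # Hypothetical pipeline name; it would need to be enabled in ITEM_PIPELINES in settings.py.
    def process_item(self, item, spider):
        # Directory prepared by parse(); fall back to a flat default if absent.
        dirname = item.get('subFilename', './Data/unsorted')
        os.makedirs(dirname, exist_ok=True)

        # Use the last path segment of the article url as the file name.
        filename = item['sonUrls'].rstrip('/').split('/')[-1]
        with open(os.path.join(dirname, filename + '.txt'), 'w', encoding='utf-8') as f:
            f.write(item['head'] + '\n' + item['content'])
        return item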
Example #12
    def parse(self, response):
        # Root nodes of the navigation table
        pre_root = response.xpath(
            "//div[@id='tab01']/div[@class='clearfix']")[:19]
        for seed in pre_root:
            # First level: the parent category title and url
            preTitle = seed.xpath("./h3[@class='tit02']/a/text()").extract()[0]
            preUrl = seed.xpath("./h3[@class='tit02']/a/@href").extract()[0]

            # Second level: iterate over the li tags under this parent and build one item per subcategory
            li_list = seed.xpath("./ul[@class='list01']/li")
            for li in li_list:
                # Create the item
                item = SinaItem()
                # Parent category fields
                item["preTitle"] = preTitle
                item["preUrl"] = preUrl
                # Subcategory fields
                item["subTitle"] = li.xpath("./a/text()").extract()[0]
                subUrl = li.xpath("./a/@href").extract()[0]
                # Check whether the subcategory url starts with the parent url; store it if so (check disabled below)
                #if subUrl.startswith(item["preUrl"]):
                item["subUrl"] = subUrl
                #else:
                #continue
                # Save the subcategory folder path, used later to store article content
                item["subFilepath"] = './sinainfo/' + item[
                    "preTitle"] + '/' + item["subTitle"]
                yield scrapy.Request(item["subUrl"],
                                     meta={"meta_item": item},
                                     callback=self.parse_info)
Example #13
 def parse(self, response):
     soup = BeautifulSoup(response.body, "html.parser")
     # If the crawled page does not have the expected layout, return nothing
     if soup.body.find(class_="main_editor") is None:
         return None
     title = soup.body.find(class_="main_editor").find(class_="title").string.strip()
     publish_time = soup.body.find(class_="main_editor").find(class_="time").string.strip()
     publish_time = re.sub(r'[\u4e00-\u9fa5]', '', publish_time).strip()
     read_num = soup.body.find(class_="main_editor").find(class_="W_fr").find(class_="num").string.strip()
     # The raw text looks like "阅读数:94577" (reads count); strip the 4-character prefix
     read_num = read_num[4:].strip()
     li_list = soup.body.find(class_="WB_feed").find(class_="WB_row_line").findAll('li')
     forward_num = li_list[0].find(class_="pos").span.string.strip()
     # The raw text looks like "转发 55" (forwards); strip the 2-character prefix
     forward_num = forward_num[2:].strip()
     comment_num = li_list[1].find(class_="pos").span.string.strip()
     # The raw text looks like "评论 6" (comments); strip the 2-character prefix
     comment_num = comment_num[2:].strip()
     if li_list[2].find(class_="pos").span.span.em.string is None:
         like_num = ''
     else:
         like_num = li_list[2].find(class_="pos").span.span.em.string.strip()
     item = SinaItem()
     item['title'] = title
     item['publish_time'] = publish_time
     item['read_num'] = read_num
     item['forward_num'] = forward_num
     item['comment_num'] = comment_num
     item['like_num'] = like_num
     yield item
Example #14
    def parse_item(self, response):
        meta_item = response.meta['meta_item']

        # Collect all links on the subcategory page
        sonUrls = response.xpath('//a/@href').extract()

        items = []
        for x in range(0, len(sonUrls)):
            # is_belong = None
            # if((sonUrls[x].find('.shtml') or sonUrls[x].find('.html')) != -1):
            # is_belong = sonUrls[x].startswith(meta_item['parentUrls'])
            # print is_belong

            # Keep links that look like article pages (.shtml or .html) and start with the parent category url
            is_belong = (sonUrls[x].endswith('.shtml') or
                         sonUrls[x].endswith('.html')) and sonUrls[x].startswith(
                             meta_item['parentUrls'])
            if (is_belong):
                item = SinaItem()
                item['parentTitle'] = meta_item['parentTitle']
                item['parentUrls'] = meta_item['parentUrls']
                item['subUrls'] = meta_item['subUrls']
                item['subTitle'] = meta_item['subTitle']
                item['subFilePath'] = meta_item['subFilePath']
                item['sonUrls'] = sonUrls[x]
                items.append(item)
        for item in items:
            yield scrapy.Request(url=item['sonUrls'],
                                 meta={'meta_item_detail': item},
                                 callback=self.parse_detail)
Example #15
    def parse(self, response):
        """
        Parse the site navigation list page
        :param response: the response
        """
        # Holds the subcategory titles, links and directories
        items = []
        # Titles and URLs of all parent categories
        par_title = response.xpath(
            '//div[@id="tab01"]//div/h3/a/text()').extract()
        par_urls = response.xpath(
            '//div[@id="tab01"]//div/h3/a/@href').extract()

        # Titles and URLs of all subcategories
        sub_title = response.xpath(
            '//div[@id="tab01"]//div/ul/li/a/text()').extract()
        sub_urls = response.xpath(
            '//div[@id="tab01"]//div/ul/li/a/@href').extract()

        # Create a directory for each parent category
        for i in range(0, len(par_title)):
            par_filename = './Data/' + par_title[i]

            # Create the parent category directory if it does not exist
            if not os.path.exists(par_filename):
                os.makedirs(par_filename)

            for j in range(0, len(sub_title)):

                # Check whether the subcategory url starts with the parent category url
                if_belong = sub_urls[j].startswith(par_urls[i])
                if if_belong:

                    # Save the parent category title and url on a fresh item,
                    # so every entry appended to items stays independent
                    item = SinaItem()
                    item['par_title'] = par_title[i]
                    item['par_urls'] = par_urls[i]

                    # Join the parent directory and the sub directory
                    sub_filename = par_filename + '/' + sub_title[j]

                    # Create the subcategory directory if it does not exist
                    if not os.path.exists(sub_filename):
                        os.makedirs(sub_filename)

                    # Save the subcategory url and title
                    item['sub_urls'] = sub_urls[j]
                    item['sub_title'] = sub_title[j]
                    item['sub_filename'] = sub_filename

                    # Append the item to the items list
                    items.append(item)

        for item in items:
            # Request each subcategory url; note that meta is an optional argument
            yield scrapy.Request(url=item['sub_urls'],
                                 meta={'meta_1': item},
                                 callback=self.second_parse)
Example #16
    def parse(self, response):
        items = []
        # Titles and links of the parent categories
        parentTitles = response.xpath(
            '//div[@class="clearfix"]/h3[@class="tit02"]/a/text()').extract()
        parentUrls = response.xpath(
            '//div[@class="clearfix"]/h3[@class="tit02"]/a/@href').extract()

        # Titles and links of the subcategories
        subTitles = response.xpath(
            '//div[@class="clearfix"]/ul[@class="list01"]/li/a/text()'
        ).extract()
        subUrls = response.xpath(
            '//div[@class="clearfix"]/ul[@class="list01"]/li/a/@href').extract(
            )

        # Iterate over each parent category
        for i in range(len(parentTitles)):
            #print(parentTitle)
            # Path of the parent directory
            parentFilename = './Data/' + parentTitles[i]

            # Create the parent directory if it does not exist (disabled here)
            #if not os.path.exists(parentFilename):
            #    os.makedirs(parentFilename)

            # Crawl all subcategories
            for j in range(0, len(subTitles)):
                if_belong = subUrls[j].startswith(parentUrls[i])
                # Only a subcategory that belongs to the current parent category is placed under that parent
                if if_belong:
                    #print(parentUrls[i], '---', subUrls[j])
                    #subFilename = parentFilename + '/' + subTitles[j]
                    #print(subFilename)
                    # Create the subdirectory if it does not exist (disabled here)
                    #if not os.path.exists(subFilename):
                    #    os.makedirs(subFilename)

                    # Create an item describing each subcategory link
                    item = SinaItem()
                    item['parentTitle'] = parentTitles[i]
                    item['parentUrls'] = parentUrls[i]
                    item['subTitle'] = subTitles[j]
                    item['subUrls'] = subUrls[j]
                    #item['subFilename'] = subFilename  # unique
                    # Append the item to the items list
                    items.append(item)
        #print(len(items)) # 217
        for item in items:
            #print(item)
            # Send a request to each subcategory url; meta carries extra data along with it
            # url: the url of the new request
            # callback: the spider method triggered once Scrapy receives the response
            # meta: extra data to pass along
            yield scrapy.Request(url=item['subUrls'],
                                 callback=self.second_parse,
                                 meta={'meta_1': item})
Example #17
    def parse(self, response):

        items = []
        # URLs and titles of all parent categories
        parentUrls = response.xpath(
            '//div[@id=\"tab01\"]/div/h3/a/@href').extract()
        parentTitle = response.xpath(
            "//div[@id=\"tab01\"]/div/h3/a/text()").extract()

        # URLs and titles of all subcategories
        subUrls = response.xpath(
            '//div[@id=\"tab01\"]/div/ul/li/a/@href').extract()
        subTitle = response.xpath(
            '//div[@id=\"tab01\"]/div/ul/li/a/text()').extract()

        # Crawl all parent categories
        for i in range(0, len(parentTitle)):

            # Path and directory name for the parent category (directory creation disabled here)
            #parentFilename = "./Data/" + parentTitle[i]

            # Create the directory if it does not exist
            #if(not os.path.exists(parentFilename)):
            #    os.makedirs(parentFilename)

            # Crawl all subcategories
            for j in range(0, len(subUrls)):
                item = SinaItem()

                # Save the parent category title and url
                item['parentTitle'] = parentTitle[i]
                item['parentUrls'] = parentUrls[i]

                # Check whether the subcategory url starts with its parent category url; returns True if so (e.g. sports.sina.com.cn and sports.sina.com.cn/nba)
                if_belong = subUrls[j].startswith(item['parentUrls'])

                # If it belongs to this parent category, its storage directory would go under the parent directory (disabled here)
                if (if_belong):
                    #subFilename =parentFilename + '/'+ subTitle[j]

                    # Create the directory if it does not exist
                    #if(not os.path.exists(subFilename)):
                    #    os.makedirs(subFilename)

                    # Store the subcategory url and title fields
                    item['subUrls'] = subUrls[j]
                    item['subTitle'] = subTitle[j]
                    #item['subFilename'] = subFilename
                    # yield item
                    items.append(item)

        # Send a Request for each subcategory url; the Response, together with its meta data, is handed to the second_parse callback
        for item in items:
            yield scrapy.Request(url=item['subUrls'],
                                 meta={'meta_1': item},
                                 callback=self.second_parse,
                                 dont_filter=True)
Example #18
    def parse(self, response):
        items = []
        base_path = r'J:\Desktop\sina'
        sub_links = response.xpath(
            "//div[@id='tab01']/div[@class='clearfix']//ul//a/@href").extract(
            )
        sub_titles = response.xpath(
            "//div[@id='tab01']/div[@class='clearfix']//ul//a/text()").extract(
            )
        parent_titles = response.xpath(
            "//div[@id='tab01']/div[@class='clearfix']//h3//text()").extract()
        parent_links = response.xpath(
            "//div[@id='tab01']/div[@class='clearfix']//h3//a/@href").extract(
            )

        for i in range(len(parent_titles) - 1):
            # Create the parent category directory
            parent_path = os.path.join(base_path, parent_titles[i])
            if not os.path.exists(parent_path):
                os.mkdir(parent_path)

            for j in range(len(sub_titles)):
                item = SinaItem()
                item["parent_title"] = parent_titles[i]

                # The last block (local stations) has no url, so one would have to be supplied by hand (disabled here)
                # if i == len(parent_links):
                #     item['parent_url'] = '*.sina.com.cn'
                # else:
                item['parent_url'] = parent_links[i]
                belong = sub_links[j].startswith(item["parent_url"])
                # Workaround for the local-stations block (disabled here)
                # if i == len(parent_links) and j >= len(sub_titles) - 27:
                #     sub_path = os.path.join(parent_path, sub_titles[j])
                #     if not os.path.exists(sub_path):
                #         os.mkdir(sub_path)
                #     item['sub_url'] = sub_links[j]
                #     item['sub_title'] = sub_titles[j]
                #     item['sub_path'] = sub_path
                #     items.append(item)
                if belong:
                    sub_path = os.path.join(parent_path, sub_titles[j])
                    if not os.path.exists(sub_path):
                        os.mkdir(sub_path)
                    item['sub_url'] = sub_links[j]
                    item['sub_title'] = sub_titles[j]
                    item['sub_path'] = sub_path
                    items.append(item)

        # with open(r'J:\Desktop\test.txt', 'w') as fp:
        #     fp.write(i['sub_path'] + ' ' + i['sub_title'] +
        #                  ' ' + i['sub_url']+'\n')
        for i in items:
            yield scrapy.Request(url=i['sub_url'],
                                 callback=self.sub_parse,
                                 meta={'sub_item': i})
Example #19
    def parse(self, response):
        items = []

        # URLs and titles of all parent categories
        parentUrls = response.xpath(
            '//div[@id="tab01"]/div/h3/a/@href').extract()
        parentTitle = response.xpath(
            '//div[@id="tab01"]/div/h3/a/text()').extract()

        # URLs and titles of all subcategories
        subUrls = response.xpath(
            '//div[@id="tab01"]/div/ul/li/a/@href').extract()
        subTitle = response.xpath(
            '//div[@id="tab01"]/div/ul/li/a/text()').extract()

        # Crawl all parent categories
        for i in range(0, len(parentTitle)):
            # Path and directory name for the parent category
            parentFileName = './Data/' + parentTitle[i]

            # Create the parent directory if it does not exist
            if (not os.path.exists(parentFileName)):
                os.mkdir(parentFileName)

            # Crawl all subcategories under this parent category
            for j in range(0, len(subUrls)):
                item = SinaItem()

                # Save the parent category title and url
                item['parentTitle'] = parentTitle[i]
                item['parentUrls'] = parentUrls[i]

                # Check whether the subcategory url starts with its parent category url; returns True if so (e.g. sports.sina.com.cn and sports.sina.com.cn/nba)
                if_belong = subUrls[j].startswith(item['parentUrls'])

                # If it belongs to this parent category, place its storage directory under the parent directory
                if if_belong:
                    # Join the subcategory path and directory name
                    subFileName = parentFileName + '/' + subTitle[j]

                    # Create the subcategory directory if it does not exist
                    if not os.path.exists(subFileName):
                        os.mkdir(subFileName)

                    # Store the subcategory url, title and filename fields
                    item['subUrls'] = subUrls[j]
                    item['subTitle'] = subTitle[j]
                    item['subFileName'] = subFileName

                    items.append(item)

        # Send a Request for each subcategory url; the Response, together with its meta data, is handed to the second_parse callback
        for item in items:
            yield scrapy.Request(url=item['subUrls'],
                                 meta={'meta_1': item},
                                 callback=self.second_parse)
Example #20
 def parse_stock(self, response):
     html = response.body.decode('utf-8')
     result = re.search(r'\"text\": \".*\",\n', html)
     result2 = re.search(r'\"screen_name\": \".*?\"', html)
     author = eval(result2.group(0).split(':')[1])
     html = result.group(0)[9:-3]
     new_html = "".join((re.sub("\n", " ", html)).split(" "))
     content = re.sub('<.*?>', '', new_html)
     item = SinaItem(author=author, content=content)
     yield item
Example #21
    def parse(self, response):
        url = str(response.url)
        if 'comment5' in url:
            item = SinaItem()

            # Dynamic Contents Extraction: (3) Open a selenium browser
            driver = webdriver.Firefox()
            driver.get(url)

            # Extracting the news title (with an adjustment for abnormal urls)
            try:
                # when the channel is not 'kj'
                title = driver.find_element_by_xpath('//h1[@id = "J_NewsTitle"]/a').text.encode('utf-8')
            except:
                try:
                    # when the channel is 'kj'
                    title = driver.find_element_by_xpath('//h1[@bbs-node-type = "title"]/a').text.encode('utf-8')
                except:
                    title = ''
            item['title'] = title

            # Dynamic Contents Extraction: (4) Extracting the number of reviews
            contents = driver.find_elements_by_xpath('//span[contains(@class,"f_red")]')  # when the channel is not 'kj'
            # contents = driver.find_elements_by_xpath('//span[contains(@class,"count")]//em')  # when the channel is 'kj'
            try:
                item['num_comment'] = contents[0].text.encode('utf-8')
            except:
                item['num_comment'] = 0

            # Dynamic Contents Extraction: (5) Extracting the number of participants
            try:
                item['num_part'] = contents[1].text.encode('utf-8')
            except:
                item['num_part'] = 0

            # Dynamic Contents Extraction: (6) Extracting the reviews
            comments = ''
            # when the channel is not 'kj'
            for comment in driver.find_elements_by_xpath('//div[@id="J_Comment_List_Hot"]//div[@class="orig_content"]'):
                comments += comment.text.encode('utf-8') + '\n' + '\n'
            for comment in driver.find_elements_by_xpath('//div[@id="J_Comment_List_Hot"]//div[@class="t_txt"]'):
                comments += comment.text.encode('utf-8') + '\n' + '\n'
            for comment in driver.find_elements_by_xpath('//div[@class="comment_item_page_first"]//div[@class="orig_content"]'):
                comments += comment.text.encode('utf-8') + '\n' + '\n'
            for comment in driver.find_elements_by_xpath('//div[@class="comment_item_page_first"]//div[@class="t_txt"]'):
                comments += comment.text.encode('utf-8') + '\n' + '\n'
            # when the channel is 'kj'
            # for comment in driver.find_elements_by_xpath('//div[@class="sina-comment-page sina-comment-page-show"]//div[@comment-type="itemTxt"]'):
            #     comments += comment.text.encode('utf-8') + '\n' + '\n'
            item['comment'] = comments
            driver.close()
            yield item

        # Obtain the url of the next piece of news
        reader = csv.reader(open('/media/sunzeyeah/Personal/SENIOR/Thesis/Data/Chinese/Sina/news_0317.csv', 'r'))
        for line in reader:
            if line[0] != 'comment_url':
                yield Request(line[0], callback=self.parse)
Example #22
    def parse_item(self, response):
        news_list = response.xpath('//ul[@class="list_009"]/li')
        for news in news_list:
            title = news.xpath('./a/text()').extract_first()
            data_time = news.xpath('./span/text()').extract_first()
            new_url = news.xpath('./a/@href').extract_first()

            item = SinaItem()
            item['title'] = title
            item['data_time'] = data_time
            item['new_url'] = new_url
            yield item
Example #23
    def parse_user(self, response):
        """
        Parse the user information
        :param response: the Response object
        The desired fields are obtained by parsing the JSON data
        """
        self.logger.debug(response)
        result = json.loads(response.text)
        if result.get('data').get('userInfo'):
            user_info = result.get('data').get('userInfo')
            user_item = SinaItem()

            field_map = {
                'id': 'id',
                'name': 'screen_name',
                'avatar': 'profile_image_url',
                'cover': 'cover_image_phone',
                'gender': 'gender',
                'description': 'description',
                'fans_count': 'followers_count',
                'follows_count': 'follow_count',
                'weibos_count': 'statuses_count',
                'verified': 'verified',
                'verified_reason': 'verified_reason',
                'verified_type': 'verified_type'
            }
            print("********************************")
            print(field_map)
            for field, attr in field_map.items():
                user_item[field] = user_info.get(attr)
            yield user_item

            uid = user_info.get('id')
            yield Request(self.follow_url.format(uid=uid, page=1),
                          callback=self.parse_follows,
                          meta={
                              'page': 1,
                              'uid': uid
                          })
            # Fans
            yield Request(self.fan_url.format(uid=uid, page=1),
                          callback=self.parse_fans,
                          meta={
                              'page': 1,
                              'uid': uid
                          })
            # Weibo posts
            yield Request(self.weibo_url.format(uid=uid, page=1),
                          callback=self.parse_weibos,
                          meta={
                              'page': 1,
                              'uid': uid
                          })
Example #24
 def parse(self, response):
     categories = response.xpath("//div[@id='tab01']/div")
     # The last block is the city navigation; do not crawl it
     categories.pop()
     for category in categories:
         main_name = category.xpath(".//a/text()").extract_first()
         sublis = category.xpath(".//ul/li")
         for li in sublis:
             sub_name = li.xpath(".//a/text()").extract_first()
             sub_url = li.xpath(".//a/@href").extract_first()
             item = SinaItem(main_name=main_name,sub_name=sub_name)
             yield scrapy.Request(url=sub_url,meta={"item":item},callback=self.parse_sub_page)
Example #25
    def parse(self, response):
        sel = Selector(response)
        item = SinaItem()
        brand = sel.xpath("//a[@class='fL logo']/img/@alt").extract_first()
        model = response.url.strip('/').split('/')[-1]
        model_name = sel.xpath(
            "//span[@class='fL name']/a[1]/text()").extract_first()

        js_pi_url = 'http://db.auto.sina.com.cn/api/car/getFilterCar.json?subid=%s&niankuan=&derailleur_type=&product_status=1,2&outgas=&auto_type=' % model
        js_p_text = requests.get(js_pi_url)
        js_p_load = json.loads(js_p_text.text)
        tds = sel.xpath(
            "//div[@class='cartype_list lump']/table/tbody/tr/td[1]")
        for td in tds:
            url = td.xpath('a[1]/@href').extract_first()
            version_name = td.xpath('a[1]/span/text()').extract_first()
            version_id = url.strip('/').split('/')[-1]

            js_url = 'http://db.auto.sina.com.cn/api/car/getFilterCarInfo.json?carid=%s' % version_id
            js_text = requests.get(js_url)
            js_load = json.loads(js_text.text)

            for i in js_load['baseinfo']['data']:
                if i['name'] == '变速箱':
                    paidang = i['data'][-1].get('data', '')

            item['paidang'] = paidang
            item['version_id'] = version_id
            item['version'] = version_name
            item['url'] = response.url
            item['model'] = model
            item['brand'] = brand
            now = datetime.datetime.now()
            item['collect_date'] = now.strftime("%Y-%m-%d")
            item['standard_version'] = '%s %s' % (model_name, version_name)

            for i in js_p_load:
                if i['car_id'] == version_id:
                    item['classfy'] = '厂商指导价'
                    item['item'] = '厂商指导价'
                    item['karw'] = i['merchant_price_indoor']

                    yield (item)

            for k in js_load.keys():
                for data in js_load[k].get('data', ''):
                    item['classfy'] = js_load[k]['name']
                    item['item'] = data['name']
                    item['karw'] = data['data'][-1].get('data', '')

                    # print(item)
                    yield (item)
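
Example #25 fetches the two JSON endpoints with blocking requests.get() calls inside the callback, which stalls Scrapy's event loop and bypasses its scheduling, retry and throttling machinery. A non-blocking sketch of the same idea (the spider name and the placeholder carid are invented for illustration; the endpoint and JSON handling come from the example above) chains the JSON request through Scrapy itself and carries the partially built item in meta:

import json

import scrapy


class CarInfoSketchSpider(scrapy.Spider):
    name = 'car_info_sketch'  # hypothetical name

    def parse(self, response):
        # response would be the model page parsed in Example #25.
        version_id = '12345'  # placeholder; normally taken from the td links
        js_url = ('http://db.auto.sina.com.cn/api/car/getFilterCarInfo.json?carid=%s'
                  % version_id)
        # Let Scrapy download the JSON instead of calling requests.get() here.
        yield scrapy.Request(js_url, callback=self.parse_version,
                             meta={'item': {'version_id': version_id}})

    def parse_version(self, response):
        item = response.meta['item']
        js_load = json.loads(response.text)
        # ...fill the remaining fields from js_load exactly as in Example #25...
        yield item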
Example #26
 def parse_item(self,response):
  url = str(response.url)
  
  #Title Extraction
  title = [n.encode('utf-8') for n in response.xpath('//h1[contains(@id,"artibodyTitle")]/text()').extract()]
  #title = response.xpath('//head/title/text()').extract()
  
  #Date Extraction
  Date = response.xpath('//span[contains(@id,"pub_date")]/text()').extract()  #html
  Date += response.xpath('//span[contains(@class,"time-source")]/text()').extract()   #shtml
  date = [n.encode('utf-8') for n in Date]
  
  #Contents Extraction
  contents = ''
  for body in response.xpath('//div[contains(@id,"artibody")]//p/text()'):
   for n in body.extract():
    contents += n.encode('utf-8')
    
  #Dynamic Contents Extraction:(1)Extract News ID and review channel
  newsID = ''
  channel = ''
  raw = str(response.xpath('//meta[contains(@content,"comment_channel")]/@content').extract())
  real=''
  i = 3
  while i <= len(raw)-3:
   real += raw[i]
   i = i + 1
   final=[]
   for s in real.split(':'):
    ss = s.split(';')
    for eachone in ss:
     final.append(eachone)
   i = 0
   while i < len(final):
    if 'comment_id' in final[i]:
     i = i + 1
     newsID = final[i]
    elif 'comment_channel' in final[i]:
     i = i + 1
     channel = final[i]
    i = i + 1
    
  #Dynamic Contents Extraction:(2)Generate the url of reviews
  comment_url = 'http://comment5.news.sina.com.cn/comment/skin/default.html?channel='+channel+'&newsid='+newsID
  if title and date and contents: #and newsID and channel:
   item = SinaItem()
   item['url'] = url.encode('utf-8')
   item['comment_url'] = comment_url.encode('utf-8')
   item['title'] = title
   item['date'] = date
   item['body'] = contents
   yield item
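
The two while loops above recover comment_id and comment_channel by splitting the meta content on ':' and ';'. Assuming the content really is a ';'-separated list of key:value pairs (that format is only inferred from the parsing logic, not confirmed here), a regex sketch does the same extraction more directly:

import re


def extract_comment_info(meta_content):
    """Return (newsID, channel) from a 'key:value;key:value' style string."""
    pairs = dict(re.findall(r'(\w+)\s*:\s*([^;]+)', meta_content))
    return pairs.get('comment_id', ''), pairs.get('comment_channel', '')


# Synthetic example value -- a placeholder, not copied from a real page:
print(extract_comment_info('comment_id:comos-abc123;comment_channel:gn'))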
Example #27
    def parse(self, response):
        div_list = response.xpath('.//div[@id="tab01"]/div')[:-1]
        for div in div_list:
            b_title = div.xpath('./h3/a/text()').extract_first()
            b_urls = div.xpath('./h3/a/@href').extract_first()

            for li in div.xpath('./ul/li'):
                # A fresh item per subcategory, so every scheduled request keeps its own data
                item = SinaItem()
                item['b_title'] = b_title
                item['b_urls'] = b_urls
                item['s_title'] = li.xpath('./a/text()').extract_first()
                item['s_urls'] = li.xpath('./a/@href').extract_first()
                # print(item)

                yield scrapy.Request(item['s_urls'],
                                     callback=self.parse_detail,
                                     meta={'item': item})
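
The rewrite above creates a fresh SinaItem per subcategory so that every scheduled request keeps its own snapshot of the data. Example #4 reaches the same goal by deep-copying a single mutable item into meta; a minimal sketch of that variant (SinaItem and parse_detail are assumed to exist as in the surrounding examples):

from copy import deepcopy

import scrapy


def parse(self, response):
    item = SinaItem()  # assumed to come from the project's items.py
    for div in response.xpath('.//div[@id="tab01"]/div')[:-1]:
        item['b_title'] = div.xpath('./h3/a/text()').extract_first()
        item['b_urls'] = div.xpath('./h3/a/@href').extract_first()
        for li in div.xpath('./ul/li'):
            item['s_title'] = li.xpath('./a/text()').extract_first()
            item['s_urls'] = li.xpath('./a/@href').extract_first()
            # deepcopy: later mutations must not leak into requests already queued
            yield scrapy.Request(item['s_urls'],
                                 callback=self.parse_detail,
                                 meta={'item': deepcopy(item)})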
Example #28
    def parse(self, response):
        """ This function parses sina page
        @url http://news.sina.com.cn/
        @returns items 1
        @scrapes centerNews rightNews hostname author
        """
        loader = ItemLoader(item=SinaItem(), response=response)
        loader.add_xpath('centerNews',
                         '//*/h1[@data-client="headline"]/a/text()',
                         MapCompose(lambda t: t[:4]), Join())
        loader.add_xpath('rightNews', '//*/div[@class="tl"]/a/text()')

        loader.add_value('hostname', response.url)
        loader.add_value('author', 'Mory')
        return loader.load_item()
Example #29
 def parse(self, response):
     for i in response.xpath('//div[@class=\'quote\']'):
         
         content = i.xpath('span[@class=\'text\']/text()').extract_first()
         author = i.xpath('span/small/text()').extract_first()
         tag = i.xpath('div[@class=\'tags\']//a/text()').extract()
         item = SinaItem(content = content, author = author, tag = tag)
         yield item
     # go to next page
     try:
         next_page = response.xpath('//li[@class=\'next\']/a/attribute::href').extract_first() 
         if next_page:
             next_page = response.urljoin(str(next_page))
             yield scrapy.Request(next_page, callback=self.parse)
     except Exception as identifier:
         print('error')
Example #30
 def parse_details(self, response):
     soup = BeautifulSoup(response.text, 'html.parser')
     try:
         title = self.extract_title(soup)
         if not title:
             raise Exception('Skip ' + response.url +
                             ' cannot find the title.')
         content = self.extract_content(soup)
         if not content:
             raise Exception('Skip ' + response.url +
                             ' cannot find the content.')
         print(title)
     except Exception as e:
         self.logger.error(str(e))
         self.logger.error(traceback.format_exc())
         # Without a title and content there is nothing to yield for this page
         return
     item = SinaItem(_id=response.url, title=title, content=content)
     yield item