Example #1
    def parse(self, response):
        selector = Selector(response)
        posts = selector.xpath('//div[@class="post_item_body"]')

        for post in posts:
            # Other fields that could be extracted and stored the same way:
            # Title = post.xpath('h3/a[@class="titlelnk"]/text()').extract()
            # path = "http:" + post.xpath('p[@class="post_item_summary"]/a/img/@src').extract()[0]
            # num = post.xpath('span[@class="article_view"]/a[@class="gray"]/text()').extract()

            # Create a fresh item for every post; reusing one instance
            # across iterations would overwrite earlier results.
            item = BlogItem()
            # The class test belongs inside @-syntax: a[@class="lightblue"].
            # The original a["@class=lightblue"] is a truthy string predicate
            # that matches every <a>.
            item['Name'] = post.xpath(
                'div[@class="post_item_foot"]/a[@class="lightblue"]/text()'
            ).extract()[0]
            yield item
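All of the snippets on this page fill in a BlogItem, which each project defines in its own items.py. A minimal sketch of such an item class, covering only a few of the field names seen in these examples (the class body here is an assumption, not taken from any of the projects):

import scrapy

class BlogItem(scrapy.Item):
    # Declare one Field per attribute the spiders assign to.
    Name = scrapy.Field()
    title = scrapy.Field()
    link = scrapy.Field()
    url = scrapy.Field()
    content = scrapy.Field()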
Example #2
	def parse(self, response):
		# print(response.body)
		items = response.xpath('//div[@class="showlist"]/li')
		for item in items:
			blogitem = BlogItem()
			# Use './/' so each expression is evaluated relative to the
			# current <li>; the original leading '//' searched the whole
			# page and returned the same values for every item.
			blogitem['url'] = item.xpath('.//p[@class="showpic"]/a/@href').extract()
			blogitem['img'] = item.xpath('.//p[@class="showpic"]/a/img/@src').extract()
			blogitem['title'] = item.xpath('.//p[@class="showpic"]/a/img/@alt').extract()
			yield blogitem
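Note that extract() always returns a list, so the url/img/title fields above hold lists even for a single match. Scrapy's get()/getall() (or the older extract_first()) are the usual choice when one string is wanted; a quick illustration with an inline selector:

from scrapy import Selector

sel = Selector(text='<p class="showpic"><a href="/post/1">first</a></p>')
sel.xpath('//a/@href').extract()  # ['/post/1'] -- always a list
sel.xpath('//a/@href').get()      # '/post/1'   -- first match as a string
sel.xpath('//b/@href').get()      # None        -- missing match, no IndexError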
Example #3
    def parse_details(self, response):
        item = BlogItem()
        item['url'] = response.url
        # The blog id is carried in the query string (uses urllib.parse,
        # imported as parse).
        query_str = parse.parse_qs(parse.urlsplit(response.url).query)
        item['author'] = query_str['blogId'][0]

        title = ""
        content = ""

        if 'blog.naver.com' in response.url:
            # Try the newest editor markup first. str() turns a missing
            # match (None) into the string 'None', which the checks below
            # use to fall back to older layouts.
            title = str(
                response.xpath(
                    "//div[@class='se-module se-module-text se-title-text']/p/span/text()"
                ).get())
            item['date'] = response.xpath(
                "//span[contains(@class, 'se_publishDate')]/text()").get()
            content = str(
                response.xpath("//div[@class='se-main-container']").get())

            if content == 'None':
                # Fall back to an older editor layout.
                title = str(
                    response.xpath(
                        "//div[contains(@class,'se_title')]//h3").get())
                item['date'] = response.xpath(
                    "//span[contains(@class, 'se_publishDate')]/text()").get()
                content = str(
                    response.xpath(
                        "//div[contains(@class, 'sect_dsc')]").get())

            if content == 'None':
                # Fall back to the legacy post layout.
                title = str(
                    response.xpath("//div[@class='htitle']/span/text()").get())
                item['date'] = response.xpath(
                    "//p[contains(@class,'_postAddDate')]/text()").get()
                content = str(
                    response.xpath("//div[@id='postViewArea']/div").get())

        # Strip HTML tags, quotes and control characters, then collapse
        # repeated spaces.
        title = re.sub(
            ' +', ' ',
            re.sub(re.compile('<.*?>'), ' ',
                   title.replace('"', '')).replace('\r\n', '').replace(
                       '\n', '').replace('\t', '').replace('\u200b',
                                                           '').strip())
        content = re.sub(
            ' +', ' ',
            re.sub(re.compile('<.*?>'), ' ',
                   content.replace('"', '')).replace('\r\n', '').replace(
                       '\n', '').replace('\t', '').replace('\u200b',
                                                           '').strip())
        item['title'] = title
        item['content'] = content

        yield item
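The two cleanup pipelines above are identical; factoring them into a helper makes the intent clearer. A sketch (clean_text is our name, not from the original project):

import re

TAG_RE = re.compile('<.*?>')

def clean_text(raw):
    # Drop quotes, strip HTML tags, remove newlines/tabs/zero-width
    # spaces, then collapse runs of spaces.
    text = TAG_RE.sub(' ', raw.replace('"', ''))
    for ch in ('\r\n', '\n', '\t', '\u200b'):
        text = text.replace(ch, '')
    return re.sub(' +', ' ', text.strip())

# item['title'] = clean_text(title)
# item['content'] = clean_text(content)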
Example #4
    def parse(self, response):
        sel = Selector(response)
        articles = sel.xpath('//div[@id="content"]/article')

        for article in articles:
            item = BlogItem()
            item['title'] = article.xpath(
                'header/h1[@class="entry-title"]/a/text()').extract()
            item['link'] = article.xpath(
                'header/h1[@class="entry-title"]/a/@href').extract()
            # Use the first two paragraphs of the entry body as the description.
            item['description'] = article.xpath(
                'div[@class="entry-content"]/p[position()<3]/text()').extract()

            yield item
Example #5
    def parse(self, response):
        titles = []
        links = []
        for sel in response.xpath('//*[@class="postTitle2"]'):
            title = sel.xpath('text()').extract()
            for ti in title:
                titles.append(ti)
                self.logger.info("<TITLE> : \t" + ti)
            # extract_first() yields a single string; extract() returned a
            # list, which made item['link'] a list below.
            links.append(sel.xpath('@href').extract_first())
        # range(len(titles)) visits every scraped title; the original
        # len(titles) - 1 bound silently dropped the last one.
        for i in range(len(titles)):
            item = BlogItem()
            item['title'] = titles[i]
            item['link'] = links[i]
            yield item
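Parallel lists can also fall out of step (a title element with two text nodes adds two titles but only one link). Building the item inside the first loop keeps each pair together by construction; a sketch using the same selectors:

    def parse(self, response):
        for sel in response.xpath('//*[@class="postTitle2"]'):
            item = BlogItem()
            # title and href come from the same element, so they cannot
            # drift apart the way two parallel lists can.
            item['title'] = sel.xpath('text()').extract_first()
            item['link'] = sel.xpath('@href').extract_first()
            yield item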
Example #6
    def detail_parse(self, response):

        posts = response.xpath('//div[contains(@class,"article-item-box")]')
        for i in posts:
            item = BlogItem()
            # './/' keeps every query scoped to the current post box; the
            # original bare '//' searched the whole page, so each item got
            # the first post's values. s_format is a project helper assumed
            # to be defined elsewhere.
            item["is_origin"] = s_format(i.xpath('.//a/span/text()').get())
            item["title"] = s_format(i.xpath('./h4//text()[2]').get())
            # The span texts look like "label: value"; keep the value part.
            item["commit"] = i.xpath(
                './/div[contains(@class,"info-box")]/p[3]/span/text()'
            ).get().split(':')[1].strip()
            item["look"] = i.xpath(
                './/div[contains(@class,"info-box")]/p[2]/span/text()'
            ).get().split(':')[1].strip()
            item["create_time"] = i.xpath(
                './/div[contains(@class,"info-box")]/p[1]/span/text()'
            ).get()
            yield item
Example #7
    def parse(self, response):

        items = response.xpath(
            '//div[@class="blog_list_wrap"]/dl[@class="blog_list clearfix"]')

        for i in items:
            # A new item per entry; the original reused a single BlogItem
            # across iterations, so every yielded reference pointed at the
            # same, repeatedly overwritten object.
            item = BlogItem()
            item['name'] = i.xpath(
                './/h3[@class="csdn-tracking-statistics"]/a/text()').extract()
            item['url'] = i.xpath(
                './/h3[@class="csdn-tracking-statistics"]/a/@href').extract()
            item['author'] = i.xpath('.//dt/a[2]/text()').extract()
            item['skim'] = i.xpath(
                './/dd/div[2]/div[2]/span/em/text()').extract()
            item['sort'] = i.xpath(
                './/dd/div[2]/div[1]/span/a/text()').extract()

            yield item

        # 'num' (the number of index pages) is assumed to be defined
        # elsewhere on the spider. Without an explicit callback these
        # requests come back to this parse() method.
        for i in range(1, num):
            next_url = 'http://blog.csdn.net/?&page={}'.format(i)
            yield Request(next_url, headers=self.headers)
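A hard-coded page range needs num known in advance; following the site's own next link adapts to however many pages exist. A sketch (the next-link selector is illustrative, not taken from this site):

    def parse(self, response):
        # ... yield the items extracted above ...
        next_page = response.xpath('//a[@class="next"]/@href').extract_first()
        if next_page:
            # response.follow resolves relative URLs and defaults back to
            # this callback, so no page count is needed.
            yield response.follow(next_page, callback=self.parse,
                                  headers=self.headers)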
Example #8
    def parse_item(self, response):
        items = BlogItem()
        sel = Selector(response)
        # On Python 3 the extracted text is already str; the original
        # Python 2 .encode('utf-8') calls have been dropped throughout.
        btitle = sel.xpath('//*[@class="articalTitle"]/h2/text()').extract()[0]
        btime = sel.xpath('//*[@class="time SG_txtc"]/text()').extract()[0]
        bcontents = sel.xpath('//font[contains(@size, "+0")]/text()').extract()
        boldcontents = sel.xpath('//b/text()').extract()
        boldfont = sel.xpath('//b/font/text()').extract()

        like = "".join(response.xpath('//*[@class="IL"]//text()').extract())

        print('views:\r\n' + like)

        # Get the main body of the article.
        acontent = "".join(
            response.xpath(
                '//div[@id="sina_keyword_ad_area2"]//text()').extract())
        bcontent = ''
        for content in bcontents:
            bcontent += content
        for bold in boldcontents:
            bcontent += bold
        if boldfont != []:
            bcontent += boldfont[0]

        # Get the publish time, stripping the surrounding parentheses.
        btime = btime.replace('(', '')
        btime = btime.replace(')', '')

        # First pass at removing line breaks from the content.
        acontent = acontent.replace('\n', '')
        acontent = acontent.replace('\t', '')

        # Get the author's numeric id from the alternate feed link, then
        # resolve it to a display name via Sina's user-info endpoint.
        nameid = response.css('link[rel*=alternate]::attr(href)').extract(
        )[0].split('/')[-1].split('.')[0]
        name = requests.get(
            'http://uic.sso.sina.com.cn/uic/Mutiquery.php?UID=0&Check=null&UIDS=['
            + nameid +
            ']&UserInfoTypes=[1]&ProductType=2&varname =requestId_8481872'
        ).text.split('"')[-2].encode().decode('unicode-escape')

        # Get the view count: the stats API returns a small JSON-like blob
        # that is parsed here by string splitting.
        viewids = response.url.split('_')[-1].split('.')[0].split('01')
        con = requests.get(
            'http://comet.blog.sina.com.cn/api?maintype=num&uid=' +
            viewids[0] + "&aids=" + viewids[1] +
            "&requestId=aritlces_number_3610").text
        arr = con.split('{')[-1].split('}')[0].split(',')
        view = 0
        like = 0
        comment = 0
        for a in arr:
            if a.find("r") != -1:
                print("\r\n\r\n\r\n" + a.split(":")[-1] + "\r\n\r\n\r\n")
                view = a.split(":")[-1]
            if a.find("d") != -1:
                like = a.split(":")[-1]

        # Save the results to the item for the database pipeline.
        items['name'] = name
        items['time'] = btime
        items['title'] = btitle
        items['content'] = acontent
        items['view'] = view
        items['like'] = like
        items['arturl'] = response.url
        yield items
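Splitting the stats blob on braces and commas depends on the exact field order; a small regex parse of the same blob is less brittle. A sketch (the 'r'/'d' keys follow the original code; the exact blob layout is an assumption):

import re

def parse_stats(blob):
    # Expects something like '...{"r":123,"d":45,...}...'.
    stats = dict(re.findall(r'"?(\w+)"?\s*:\s*(\d+)', blob))
    return int(stats.get('r', 0)), int(stats.get('d', 0))

# view, like = parse_stats(con)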
Example #9
class UserRelationshipNetsSpider(scrapy.Spider):
    name = "user_relationship_nets"
    count_dict = {}
    maxpage_dict = {}
    focus_user_dict = {}
    save_uid = []
    single_user_uid_save = []
    single_user_uid_has_requested = []

    headers = {'Host': 'blog.cnfol.com',
               'Accept': "application/json, text/javascript, */*; q=0.01",
               'Accept-Language': "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
               'Accept-Encoding': "gzip, deflate",
               'Connection': "keep-alive",
               'Cache-Control': "max-age=0",
               'Cookie': "SUV=1471944245695628",
               'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0',}

    def __init__(self, first_uid, runner, *args, **kwargs):
        # Let scrapy.Spider run its own initialisation as well.
        super(UserRelationshipNetsSpider, self).__init__(*args, **kwargs)
        self.user_url = 'http://blog.cnfol.com/%s/myfocus/friend'
        self.first_uid = first_uid
        self.runner = runner

    def start_requests(self):
        yield scrapy.Request(self.user_url % (self.first_uid),
                             method='GET',
                             headers=self.headers,
                             callback=self.request_info,
                             meta={'cookiejar': 1,
                                   'request_id': self.first_uid,
                                   'proxy': 'http://%s' % (UsersConfig['proxy']),
                                   },
                             )

    def request_info(self, response):
        request_id = response.request.meta.get('request_id')
        try:
            # The pager reads "current/total"; keep the total page count.
            page_num = response.xpath('//*[@class="CoRed"]/text()').extract()[0].split('/')[1]
        except IndexError:
            # No pager element found: assume a single page of results.
            page_num = 1

        for j in range(1, int(page_num) + 1):
            focus_url = 'http://blog.cnfol.com/%s/myfocus/friend?type=&&p=%s' % (request_id, j)
            yield FormRequest(focus_url,
                              method='GET',
                              headers=self.headers,
                              callback=self.parse_page,
                              meta={
                                  'proxy': 'http://%s' % (UsersConfig['proxy']),
                                  'maxpage': page_num,
                                  'cookiejar': response.meta['cookiejar'],
                              },
                              dont_filter=True)

    def parse_page(self, response):
        maxpage = response.request.meta.get('maxpage')
        page_data_len = len(response.xpath('//div[starts-with(@class,"MyFollowBox FirstMesg")]'))
        # Pull the user id out of the URL before the emptiness check so it
        # is defined even when the page has no follow entries (it is used
        # again at the bottom of this method).
        pattern2 = 'http://blog.cnfol.com/(.*)/myfocus/frien.*'
        key_user_id2 = re.findall(pattern2, response.url)[0]
        if page_data_len > 0:
                if key_user_id2 in self.count_dict:
                    self.count_dict[key_user_id2] = self.count_dict[key_user_id2] + 1
                else:
                    self.count_dict[key_user_id2] = 1
                self.maxpage_dict[key_user_id2] = maxpage

                if int(self.count_dict[key_user_id2]) <= int(self.maxpage_dict[key_user_id2]):
                    for i in range(page_data_len):
                        try:
                            focus_link = \
                                response.xpath(
                                    '//div[starts-with(@class,"MyFollowBox FirstMesg")]/div[2]/p[2]/span[1]/a/@href')[
                                    i].extract()
                            if 'returnbolg' not in focus_link:
                                # print focus_link
                                request_id = re.findall(pattern2, focus_link)[0]
                                # print request_id
                                friends_count = response.xpath(
                                    '//div[starts-with(@class,"MyFollowBox FirstMesg")]/div[2]/p[2]/span[1]/a/em/text()')[
                                    i].extract()
                                follows_count = response.xpath(
                                    '//div[starts-with(@class,"MyFollowBox FirstMesg")]/div[2]/p[2]/span[2]/a/em/text()')[
                                    i].extract()
                                if int(friends_count) < 500:
                                    yield scrapy.Request(self.user_url % (request_id),
                                                         method='GET',
                                                         headers=self.headers,
                                                         callback=self.request_info,
                                                         # errback=self.error_back,
                                                         meta={
                                                             'proxy': 'http://%s' % (UsersConfig['proxy']),
                                                             'request_id': request_id,
                                                             'cookiejar': response.meta['cookiejar'],
                                                         },
                                                         )
                        except (IndexError, ValueError):
                            # Skip entries whose link or counts fail to parse.
                            print('first datalength')

                for i in range(page_data_len):
                    try:
                        focus_link = \
                            response.xpath('//div[starts-with(@class,"MyFollowBox FirstMesg")]/div[2]/p[2]/span[1]/a/@href')[
                                i].extract()
                        if 'returnbolg' not in focus_link:
                            request_id = re.findall(pattern2, focus_link)[0]
                            friends_count = \
                                response.xpath(
                                    '//div[starts-with(@class,"MyFollowBox FirstMesg")]/div[2]/p[2]/span[1]/a/em/text()')[
                                    i].extract()
                            follows_count = \
                                response.xpath(
                                    '//div[starts-with(@class,"MyFollowBox FirstMesg")]/div[2]/p[2]/span[2]/a/em/text()')[
                                    i].extract()
                            if int(friends_count) < 500 and int(follows_count) > 5000:
                                pass_id = request_id
                                if pass_id not in self.single_user_uid_has_requested:

                                    data_dict = {
                                        'uid': pass_id
                                    }
                                    self.runner.crawl(UserInfoSpider, data_dict)
                                    self.runner.crawl(ArticleDataSpider, data_dict)

                                    # UserInfoSpider(data_dict)
                                    self.single_user_uid_has_requested.append(pass_id)
                                else:
                                    pass
                                # UserInfoSpider(data_dict)

                                if key_user_id2 in self.focus_user_dict.keys():
                                    self.focus_user_dict[key_user_id2].append('%s' % pass_id)
                                else:
                                    self.focus_user_dict[key_user_id2] = []
                                    self.focus_user_dict[key_user_id2].append('%s' % pass_id)
                    except Exception as e:
                        # Skip entries that fail to parse, but keep a trace.
                        print('second datalength', e)

        item = BlogItem()
        # count_dict holds ints while maxpage_dict may hold the page count
        # as a string from the pager text, so cast both before comparing.
        if key_user_id2 in self.count_dict and \
                int(self.count_dict[key_user_id2]) == int(self.maxpage_dict[key_user_id2]):
            self.count = 0
            if str(key_user_id2) not in self.save_uid and str(key_user_id2) in self.focus_user_dict:
                save_data = '%s\t%s\n' % (key_user_id2.strip(), ','.join(self.focus_user_dict[key_user_id2]))
                item['user_relationship_nets'] = save_data
                yield item
            self.save_uid.append(key_user_id2)
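Because the spider calls self.runner.crawl(...) to schedule follow-up spiders, it is meant to be driven from a CrawlerRunner rather than the scrapy crawl command. A minimal launch sketch, assuming UserInfoSpider and ArticleDataSpider are importable and using an illustrative starting uid:

from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging

configure_logging()
runner = CrawlerRunner()
# Pass the runner in so the spider can schedule UserInfoSpider and
# ArticleDataSpider crawls for the users it discovers.
runner.crawl(UserRelationshipNetsSpider, first_uid='example_uid', runner=runner)
d = runner.join()
d.addBoth(lambda _: reactor.stop())
reactor.run()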