Example #1
    def parse(self, response):
        forum_type = response.meta['type']  # 1-4: which kind of forum index page this is
        hxs = Selector(response)

        # Individual-stock forums
        if forum_type == 1:
            stocks = hxs.xpath('//div[@class="ngbggulbody list clearfix"]//li/a').extract()
        
            # Extract each stock forum's URL and name
            for stock in stocks:
                m_stocks = re.search(r'href="(.+?)">(.+?)</a', stock)
                if m_stocks:
                    item = GubaItem()
                    item['content'] = {}
                    url_stock = "http://guba.eastmoney.com/" + m_stocks.group(1)
                    item['content']['guba_url'] = url_stock
                    item['content']['guba_name'] = m_stocks.group(2)
                    # url_stock looks like http://guba.eastmoney.com/list,000766.html
                    yield Request(url=url_stock, meta={'item': item}, callback=self.parse_page_num)

        # Topic forums
        elif forum_type == 2:
            stocks = hxs.xpath('//div[@class="allzhutilistb"]/ul/li/a').extract()
            for stock in stocks:
                m_stocks = re.search(r'href="(.+?)">(.+?)</a', stock)
                if m_stocks:
                    item = GubaItem()
                    item['content'] = {}
                    url_stock = "http://guba.eastmoney.com/" + m_stocks.group(1)
                    item['content']['guba_url'] = url_stock
                    item['content']['guba_name'] = m_stocks.group(2)

                    yield Request(url=url_stock, meta={'item': item}, callback=self.parse_page_num)
        
        # Industry forums
        elif forum_type == 3:
            stocks = hxs.xpath('//ul[@class="ngblistitemul"]/li/a').extract()
            for stock in stocks:
                m_stocks = re.search(r'href="(.+?)">(.+?)</a', stock)
                if m_stocks:
                    item = GubaItem()
                    item['content'] = {}
                    url_stock = "http://guba.eastmoney.com/" + m_stocks.group(1)
                    item['content']['guba_url'] = url_stock
                    item['content']['guba_name'] = m_stocks.group(2)

                    yield Request(url=url_stock, meta={'item': item}, callback=self.parse_page_num)

        # Concept forums
        elif forum_type == 4:
            stocks = hxs.xpath('//ul[@class="ngblistitemul"]/li/a').extract()
            for stock in stocks:
                m_stocks = re.search(r'href="(.+?)">(.+?)</a', stock)
                if m_stocks:
                    item = GubaItem()
                    item['content'] = {}
                    url_stock = "http://guba.eastmoney.com/" + m_stocks.group(1)
                    item['content']['guba_url'] = url_stock
                    item['content']['guba_name'] = m_stocks.group(2)
                    yield Request(url=url_stock, meta={'item': item}, callback=self.parse_page_num)
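The four branches above differ only in their XPath, so a lookup table removes the duplication. A minimal refactoring sketch under that assumption; it reuses GubaItem, Request, and parse_page_num from the example, and FORUM_XPATHS is a hypothetical class attribute:

    # Hypothetical class attribute mapping forum type -> anchor XPath.
    FORUM_XPATHS = {
        1: '//div[@class="ngbggulbody list clearfix"]//li/a',  # individual stocks
        2: '//div[@class="allzhutilistb"]/ul/li/a',            # topics
        3: '//ul[@class="ngblistitemul"]/li/a',                # industries
        4: '//ul[@class="ngblistitemul"]/li/a',                # concepts
    }

    def parse(self, response):
        forum_type = response.meta['type']
        for anchor in Selector(response).xpath(self.FORUM_XPATHS[forum_type]).extract():
            m = re.search(r'href="(.+?)">(.+?)</a', anchor)
            if not m:
                continue
            item = GubaItem()
            item['content'] = {'guba_url': "http://guba.eastmoney.com/" + m.group(1),
                               'guba_name': m.group(2)}
            yield Request(url=item['content']['guba_url'],
                          meta={'item': item},
                          callback=self.parse_page_num)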
Example #2
    def parse(self, response):
        hxs = Selector(response)
        posts = hxs.xpath('//div[@class="articleh"]').extract()
        for post in posts:
            item = GubaItem()
            item['content'] = {}
            readnum = Selector(
                text=post).xpath('//span[@class="l1"]/text()').extract()
            if readnum:
                readnum = readnum[0]
            replynum = Selector(
                text=post).xpath('//span[@class="l2"]/text()').extract()
            if replynum:
                replynum = replynum[0]
            url = Selector(
                text=post).xpath('//span[@class="l3"]/a/@href').extract()
            if url:
                url = url[0]
            # Forum id from the list-page URL, e.g. list,000766.html -> 000766
            guba_id = re.search(r',(.+)\.html', response.url).group(1)

            # Keep only posts that belong to this forum
            if str(guba_id) in str(url):
                m_stock = re.search(r'(^/.+)', url)
                if m_stock:
                    post_url = "http://guba.eastmoney.com" + m_stock.group(1)
                    post_id = re.search(r'/(n.+)\.html', url).group(1)
                    item['content']['readnum'] = readnum
                    item['content']['replynum'] = replynum
                    item['content']['post_id'] = post_id
                    yield Request(url=post_url,
                                  meta={
                                      'item': item,
                                      'replynum': replynum
                                  },
                                  callback=self.parse_post)
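Re-parsing every extracted fragment with Selector(text=post) works, but Scrapy selectors can also be iterated directly and queried with relative XPath. A minimal sketch of the same extraction, assuming a Scrapy version that provides .get() (an alias of extract_first):

    def parse(self, response):
        for post in response.xpath('//div[@class="articleh"]'):
            readnum = post.xpath('.//span[@class="l1"]/text()').get()
            replynum = post.xpath('.//span[@class="l2"]/text()').get()
            url = post.xpath('.//span[@class="l3"]/a/@href').get()
            # ...then the same guba_id filtering and Request construction as above...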
Example #3
    def parse(self, response):
        try:
            if response.status == 200:
                hxs = Selector(response)
                reply_author_url = response.meta['reply_author_url']
                item = GubaItem()
                item['content'] = {}
                reply_author_name = hxs.xpath(
                    '//div[@class="taname"]/text()').extract()[0]
                item['content']['reply_author_name'] = reply_author_name.strip()
                # The registration date sits inside the "influence" block,
                # e.g. <span style="color:#999;">(2015-06-18)</span>
                sign_up_time = hxs.xpath('//div[@id="influence"]').extract()[0]
                sign_up_time = re.search(r'999;">\((.+)\)</span',
                                         sign_up_time).group(1).strip()
                sign_up_time = datetime.strptime(sign_up_time, "%Y-%m-%d")
                item['content']['sign_up_time'] = sign_up_time
                item['content']['reply_author_url'] = reply_author_url
                yield item

        except Exception as ex:
            self.logger.warning('Parse Exception: %s %s' %
                                (str(ex), response.url))
Example #4
    def parse(self, response):
        try:
            if response.status == 200:
                # Guba pages are served in several encodings; try each in turn.
                try:
                    filter_body = response.body.decode('utf8')
                except UnicodeDecodeError:
                    try:
                        filter_body = response.body.decode("gbk")
                    except UnicodeDecodeError:
                        try:
                            filter_body = response.body.decode("gb2312")
                        except Exception as ex:
                            print("Decode webpage failed: " + response.url)
                            return
                # Strip stray upper-case (malformed) HTML tags before parsing
                filter_body = re.sub('<[A-Z]+[0-9]*[^>]*>|</[A-Z]+[^>]*>', '',
                                     filter_body)
                response = response.replace(body=filter_body)
                hxs = Selector(response)

                item = GubaItem()
                dt = hxs.xpath('//div[@class="zwfbtime"]/text()').extract()[0]
                dt = re.search(r'\D+(\d{4}-\d{2}-.+:\d{2}).+', dt).group(1)
                create_time = datetime.strptime(dt, "%Y-%m-%d %H:%M:%S")
                item['content'] = {}
                item['content']['create_time'] = create_time

                try:  # Poster is a registered member (profile link present)
                    author_url = hxs.xpath(
                        '//div[@id="zwconttbn"]/strong/a/@href').extract()[0]
                    item['content']['author_url'] = author_url

                except Exception as ex:  # Poster is not a registered member
                    author = hxs.xpath(
                        '//div[@id="zwconttbn"]//span').extract()[0]
                    author = re.search(r'gray">(.+)</span', author).group(1)
                    item['content']['author'] = author

                try:  # Regular posts
                    postcontent = hxs.xpath(
                        '//div[@id="zwconbody"]/div[@class="stockcodec"]/text()'
                    ).extract()[0].strip()
                    if postcontent:
                        item['content']['content'] = postcontent

                    postitle = hxs.xpath(
                        '//div[@class="zwcontentmain"]/div[@id="zwconttbt"]/text()'
                    ).extract()[0].strip()
                    item['content']['title'] = postitle
                except Exception:  # Q&A posts (different page structure)
                    try:
                        postcontent = hxs.xpath(
                            '//div[@class="qa"]//div[contains(@class,"content")]/text()'
                        ).extract()
                        postquestion = postcontent[0]
                        postanswer = postcontent[2].strip() + postcontent[3].strip()
                        item['content']['content'] = postquestion
                        item['content']['answer'] = postanswer
                        try:
                            postanswer_time = hxs.xpath(
                                '//div[@class="sign"]/text()').extract()
                            postanswer_time = re.search(
                                r'\D+(\d{4}-\d{2}-.+:\d{2})',
                                postanswer_time[1].strip()).group(1)
                            answer_time = datetime.strptime(
                                postanswer_time, "%Y-%m-%d %H:%M:%S")
                            item['content']['answer_time'] = answer_time
                        except Exception as ex:
                            item['content']['answer_time'] = None

                        postitle = "Q&A"
                        item['content']['title'] = postitle
                    except Exception as ex:
                        print("Decode webpage content failed: " + response.url)
                        return

                replynum = response.meta['replynum']
                item['content']['replynum'] = replynum
                item['content']['reply'] = []

                if int(replynum) % 30 == 0:
                    rptotal = int(int(replynum) / 30)

                else:
                    rptotal = int(int(replynum) / 30) + 1

                if rptotal > 0:
                    head = re.search(r'(.+)\.html', response.url).group(1)
                    reply_url = head + "_1.html"
                    yield Request(url=reply_url,
                                  meta={
                                      'item': item,
                                      'page': 1,
                                      'rptotal': rptotal,
                                      'head': head
                                  },
                                  callback=self.parse_reply)
                else:
                    yield item
                    print(item)

        except Exception as ex:
            self.logger.warning('Parse Exception: %s %s' %
                                (str(ex), response.url))
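The nested decode cascade at the top of this example can be flattened into a loop over candidate codecs. A minimal sketch of that idea (decode_body is a hypothetical helper name; the codec order matches the example):

def decode_body(response):
    """Return the response body decoded with the first codec that works."""
    for codec in ('utf8', 'gbk', 'gb2312'):
        try:
            return response.body.decode(codec)
        except UnicodeDecodeError:
            continue
    return None  # caller logs the failure and gives up, as in the example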
Example #5
    def parse(self, response):
        forum_type = response.meta['type']  # 1-4: which kind of forum index page this is
        hxs = Selector(response)

        # Individual-stock forums (plus the fund forums listed on the same page)
        if forum_type == 1:
            stocks = hxs.xpath(
                '//div[@class="ngbglistdiv"]/ul[@class="ngblistul2"]/li/a'
            ).extract()
            fund_orgs = hxs.xpath(
                '//div[@class="ngbglistdiv"]/ul[@class="ngblistul2"]/div[@class="ngbglistjjt"]/a'
            ).extract()
            funds = hxs.xpath(
                '//div[@class="ngbglistdiv"]/ul[@class="ngblistul2"]/ul[@class="ngblistul3"]/li/a'
            ).extract()

            # Extract each stock forum's URL and name
            for stock in stocks:
                m_stocks = re.search(r'href="(.+?)">(.+?)</a', stock)
                if m_stocks:
                    item = GubaItem()
                    item['content'] = {}
                    url_stock = "http://guba.eastmoney.com/" + m_stocks.group(1)
                    item['content']['guba_url'] = url_stock
                    item['content']['guba_name'] = m_stocks.group(2)

                    yield Request(url=url_stock,
                                  meta={'item': item},
                                  callback=self.parse_page_num)

            # Extract each parent fund forum's URL and name
            for fund_org in fund_orgs:
                m_fund_orgs = re.search(r'href="(.+?)">(.+?)</a', fund_org)
                if m_fund_orgs:
                    item = GubaItem()
                    item['content'] = {}
                    url_fund_org = m_fund_orgs.group(1)
                    item['content']['guba_url'] = url_fund_org
                    item['content']['guba_name'] = m_fund_orgs.group(2)
                    yield Request(url=url_fund_org,
                                  meta={'item': item},
                                  callback=self.parse_page_num)

            # Extract each child fund forum's URL and name
            for fund in funds:
                m_funds = re.search(r'href="(.+?)">(.+?)</a', fund)
                if m_funds:
                    item = GubaItem()
                    item['content'] = {}
                    url_fund = m_funds.group(1)
                    item['content']['guba_url'] = url_fund
                    item['content']['guba_name'] = m_funds.group(2)

                    yield Request(url=url_fund,
                                  meta={'item': item},
                                  callback=self.parse_page_num)

        # Topic forums
        elif forum_type == 2:
            stocks = hxs.xpath(
                '//div[@class="allzhutilistb"]/ul/li/a').extract()
            for stock in stocks:
                m_stocks = re.search(r'href="(.+?)">(.+?)</a', stock)
                if m_stocks:
                    item = GubaItem()
                    item['content'] = {}
                    url_stock = "http://guba.eastmoney.com/" + m_stocks.group(1)
                    item['content']['guba_url'] = url_stock
                    item['content']['guba_name'] = m_stocks.group(2)

                    yield Request(url=url_stock,
                                  meta={'item': item},
                                  callback=self.parse_page_num)

        # Industry forums
        elif forum_type == 3:
            stocks = hxs.xpath('//ul[@class="ngblistitemul"]/li/a').extract()
            for stock in stocks:
                m_stocks = re.search(r'href="(.+?)">(.+?)</a', stock)
                if m_stocks:
                    item = GubaItem()
                    item['content'] = {}
                    url_stock = "http://guba.eastmoney.com/" + m_stocks.group(1)
                    item['content']['guba_url'] = url_stock
                    item['content']['guba_name'] = m_stocks.group(2)

                    yield Request(url=url_stock,
                                  meta={'item': item},
                                  callback=self.parse_page_num)

        # Concept forums
        elif forum_type == 4:
            stocks = hxs.xpath('//ul[@class="ngblistitemul"]/li/a').extract()
            for stock in stocks:
                m_stocks = re.search(r'href="(.+?)">(.+?)</a', stock)
                if m_stocks:
                    item = GubaItem()
                    item['content'] = {}
                    url_stock = "http://guba.eastmoney.com/" + m_stocks.group(1)
                    item['content']['guba_url'] = url_stock
                    item['content']['guba_name'] = m_stocks.group(2)

                    yield Request(url=url_stock,
                                  meta={'item': item},
                                  callback=self.parse_page_num)