Example No. 1
    def parse(self, response):
        for project in response.css('ul.vT-srch-result-list-bid').css("li"):
            project_name = project.css("a::text").extract()[0].strip()
            province = project.css("a::text").extract()[1].strip()
            money_url = project.css("a::attr(href)").extract_first()
            item = GovItem()
            item['name'] = project_name
            item['pro'] = province
            # Hand the partially filled item to parse_money via request.meta.
            request = response.follow(money_url, self.parse_money)
            request.meta['item'] = item
            yield request
        # Follow the next results page (the page index is hard-coded here).
        prefix = 'http://search.ccgp.gov.cn/bxsearch?searchtype=2&page_index='
        suffix = '&bidSort=&buyerName=&projectId=&pinMu=&bidType=&dbselect=bidx&kw=%E5%8C%BB%E6%83%A0%E7%A7%91%E6%8A%80&start_time=2017%3A09%3A21&end_time=2017%3A12%3A22&timeType=4&displayZone=&zoneId=&pppStatus=0&agentName='
        next_page = prefix + str(3) + suffix
        yield response.follow(next_page, callback=self.parse)
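The example above hands the half-filled item to parse_money through request.meta. Since Scrapy 1.7 the same hand-off can be written with cb_kwargs, which delivers the value as a named callback argument; a minimal sketch (the spider skeleton and the span.money selector are illustrative assumptions, not part of the original):

    import scrapy

    class MoneySketchSpider(scrapy.Spider):
        name = 'money_sketch'  # hypothetical spider, for illustration only
        start_urls = ['http://search.ccgp.gov.cn/']

        def parse(self, response):
            for project in response.css('ul.vT-srch-result-list-bid li'):
                item = {'name': project.css('a::text').extract_first()}
                money_url = project.css('a::attr(href)').extract_first()
                # cb_kwargs passes `item` straight into the callback signature.
                yield response.follow(money_url, callback=self.parse_money,
                                      cb_kwargs={'item': item})

        def parse_money(self, response, item):
            item['money'] = response.css('span.money::text').extract_first()  # assumed selector
            yield item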
Example No. 2
 def hlj_parse(self, response):
     count = 1
     browser = response.browser
     while count < self.page:
         infos = browser.find_elements_by_css_selector('.td1 tr')
         for info in infos:
             try:
                 # Skip rows without <td> cells (e.g. the header row).
                 if info.find_elements_by_css_selector('td'):
                     item = GovItem()
                     item['title'] = info.find_element_by_css_selector(
                         'td:nth-of-type(2)').text
                     detail_url = info.find_element_by_css_selector(
                         'tr div a').get_attribute('href')
                     item['detail_url'] = detail_url
                     item['department'] = info.find_element_by_css_selector(
                         'td:nth-of-type(3)').text
                     item['res_date'] = info.find_element_by_css_selector(
                         'td:nth-of-type(4)').text
                     # Load the detail page in a second browser and enrich the item.
                     self.browser2.get(detail_url)
                     self.parse_detail(self.browser2, item)
                     yield item
             except StaleElementReferenceException:
                 pass

         count += 1
         # Ask the ASP.NET pager to load the next page.
         browser.execute_script(
             "__doPostBack('AspNetPager1','{page}')".format(page=str(count)))
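Note that find_elements_by_css_selector and its relatives were removed in Selenium 4; the current API takes a By locator. A short sketch of the same row scan under that API (the driver setup and URL are placeholders):

    from selenium import webdriver
    from selenium.webdriver.common.by import By

    browser = webdriver.Chrome()  # placeholder driver setup
    browser.get('http://example.com/list')  # placeholder URL
    # Selenium 4 spelling of find_elements_by_css_selector('.td1 tr'):
    for row in browser.find_elements(By.CSS_SELECTOR, '.td1 tr'):
        cells = row.find_elements(By.CSS_SELECTOR, 'td')
        if cells:  # skip header rows, as in the example above
            print(cells[0].text)
    browser.quit()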
Example No. 3
 def request_back(self, response):
     data = json.loads(response.text)
     if 'message' in data:
         message = data['message']
         if message and message.lower() == 'success':
             max_behot_time = data['next']['max_behot_time']
             data_items = data['data']
             for data_item in data_items:
                 title = data_item['title']
                 item_id = data_item['item_id']
                 sourceOrg = data_item['source']
                 timeStamp = data_item['behot_time']
                 timeArray = time.localtime(timeStamp)
                 behotTime = time.strftime("%Y--%m--%d %H:%M:%S", timeArray)
                 detail_url = "https://www.toutiao.com/group/" + item_id
                 if 'comments_count' in data_item:
                     comment_counts = data_item['comments_count']
                     print("title:  " + title + "  item_id: " + item_id +
                           "  detail_url: " + detail_url +
                           "  comment_counts: " + str(comment_counts))
                     comment_url = self.create_comment_url(
                         item_id, 0, comment_counts)
                     # Default arguments freeze the current loop values, so each
                     # callback sees its own title/detail_url/sourceOrg/behotTime.
                     yield scrapy.Request(
                         comment_url,
                         headers=self.header1,
                         callback=lambda response, title=title,
                         detail_url=detail_url, sourceOrg=sourceOrg,
                         behotTime=behotTime: self.comment_request(
                             response, title, detail_url, sourceOrg,
                             behotTime))
                 else:
                     print("title:  " + title + "  item_id: " + item_id +
                           "  detail_url: " + detail_url)
                     item = GovItem()
                     item['title'] = title
                     item['content'] = ''
                     item['sourceOrg'] = sourceOrg
                     item['comments'] = ''
                     item['publishTime'] = behotTime
                     item['sourceUrl'] = detail_url
                     yield item
             # Follow the feed again from the last seen max_behot_time.
             rewriteUrl = 'https://www.toutiao.com/api/pc/feed/?category=news_finance&utm_source=toutiao&widen=1&max_behot_time={0}&max_behot_time_tmp={0}&tadrequire=true'
             rewriteUrl = rewriteUrl.format(max_behot_time)
             print(rewriteUrl)
             yield scrapy.Request(rewriteUrl,
                                  callback=self.request_back,
                                  headers=self.header1,
                                  dont_filter=True)
         else:
             # Retry the same URL after a short pause.
             print(str(message) + "  " + str(response.url).strip())
             time.sleep(2)
             yield scrapy.Request(str(response.url).strip(),
                                  callback=self.request_back,
                                  headers=self.header1,
                                  dont_filter=True)
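The lambdas above rely on default arguments to freeze each loop iteration's values, since a plain closure would see only the last iteration. functools.partial makes that binding explicit; a self-contained sketch (handle and the fake response string are illustrative):

    from functools import partial

    def handle(response, title):
        print(title, response)

    callbacks = []
    for title in ['a', 'b', 'c']:
        # partial freezes the current value of `title`, just like the
        # title=title default argument in the lambda above.
        callbacks.append(partial(handle, title=title))

    for cb in callbacks:
        cb('fake-response')  # prints a, b, c rather than c, c, c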
Example No. 4
 def cpppc_req_content(self, response, title, pub_time):
     content = response.xpath("//div[@class='cont']").extract()[0]
     item = GovItem()
     item['title'] = title
     item['content'] = content
     item['sourceOrg'] = '政府和社会资本合作中心'
     item['comments'] = ''
     item['publishTime'] = pub_time
     item['sourceUrl'] = response.url
     yield item
Example No. 5
 def hb_content_detail(self, response, title, pub_time):
     org_info = "河北省财政厅"
     content = response.xpath("//div[@class='content']").extract()[0]
     item = GovItem()
     item['title'] = title
     item['content'] = content
     item['sourceOrg'] = org_info
     item['comments'] = ''
     item['publishTime'] = pub_time
     item['sourceUrl'] = response.url
     yield item
Example No. 6
 def sh_content_detail(self, response, title, pub_time):
     content = response.xpath(
         "//div[@class='article_content']").extract()[0]
     item = GovItem()
     item['title'] = title
     item['content'] = content
     item['sourceOrg'] = "上海财政局"
     item['comments'] = ''
     item['publishTime'] = pub_time
     item['sourceUrl'] = response.url
     yield item
Example No. 7
 def jcz_cq_content_detail(self, response, title, pub_time):
     content = response.xpath("//div[@id='showcontent']").extract()[0]
     item = GovItem()
     item['title'] = title
     item['content'] = content
     item['sourceOrg'] = "重庆财政局"
     item['comments'] = ''
     item['publishTime'] = pub_time
     item['sourceUrl'] = response.url
     yield item
Example No. 8
 def zbr_req_content(self, response, title):
     pub_time = response.xpath("//div[@class='project_d_t']/span/text()").extract()[0]
     content = response.xpath("//div[@class='project_d_c']").extract()[0]
     item = GovItem()
     item['title'] = title
     item['content'] = content
     item['sourceOrg'] = '智博睿'
     item['comments'] = ''
     item['publishTime'] = pub_time
     item['sourceUrl'] = response.url
     yield item
Example No. 9
 def tj_content_detail(self, response, title, pub_time):
     content = response.xpath("//div[@id='zoom']").extract()[0]
     org_info = response.xpath(
         "//table[@id='c']/tr[3]/td/span[2]/text()").extract()[0]
     source_org = org_info.replace('来源:', '')
     item = GovItem()
     item['title'] = title
     item['content'] = content
     item['sourceOrg'] = source_org
     item['comments'] = ''
     item['publishTime'] = pub_time
     item['sourceUrl'] = response.url
     yield item
Example No. 10
 def szfb_content_detail(self, response, title, pub_time):
     org_info = response.xpath(
         "//div[@class='tit']/h6/span[1]/text()").extract()[0]
     content = response.xpath(
         "//div[@class='news_cont_d_wrap']").extract()[0]
     item = GovItem()
     item['title'] = title
     item['content'] = content
     item['sourceOrg'] = org_info.replace('信息来源:', '')
     item['comments'] = ''
     item['publishTime'] = pub_time
     item['sourceUrl'] = response.url
     yield item
Example No. 11
 def bjcz_content_detail(self, response, title):
     pub_time = response.xpath(
         "//span[@style='display: inline-block;margin:15px 10px;font-size: 14px;'][1]/text()"
     ).extract()[0]
     content = response.xpath("//div[@class='txt']").extract()[0]
     item = GovItem()
     item['title'] = title
     item['content'] = content
     item['sourceOrg'] = "北京财政局"
     item['comments'] = ''
     item['publishTime'] = pub_time
     item['sourceUrl'] = response.url
     yield item
Example No. 12
 def parse(self, response):
     hot_list = [
         'http://top.baidu.com/buzz?b=342&c=513&fr=topbuzz_b42_c513',
         'http://top.baidu.com/buzz?b=341&c=513&fr=topbuzz_b42_c513',
         'http://top.baidu.com/buzz?b=42&c=513&fr=topbuzz_b342_c513'
     ]
     sub_name_list = []
     for hot_url in hot_list:
         sourceReq = requests.get(hot_url)
         sourceReq.encoding = 'gb2312'
         sourceHtml = sourceReq.text
         selector = etree.HTML(sourceHtml)
         items = selector.xpath("//table[@class='list-table']/tr")
         if 'b=342' in hot_url:
             hot_type = '民生热点'
         elif 'b=341' in hot_url:
             hot_type = '今日热点'
         else:
             hot_type = '七日热点'
         # Skip the first row, which is the table header.
         for item in items[1:]:
             subject_name = item.xpath(
                 "./td[@class='keyword']/a[@class='list-title']/text()")[0]
             hot_num = item.xpath("./td[@class='last']/span/text()")[0]
             icon_status = item.xpath("./td[@class='keyword']/span/@class")
             status = ''
             if icon_status and 'icon-new' in icon_status[0]:
                 status = '新'
                 print(status)
             govItem = GovItem()
             govItem['subName'] = subject_name
             govItem['hotNum'] = hot_num
             govItem['hotType'] = hot_type
             govItem['status'] = status
             sub_name_list.append(govItem)
     if sub_name_list:
         dist_list = []
         # Drop duplicate subjects while keeping first-seen order.
         unique = collections.OrderedDict()
         for govItem in sub_name_list:
             unique.setdefault(govItem["subName"], govItem)
         for item in unique.values():
             dist_list.append(item)
             yield item
         print(len(dist_list))
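On Python 3.7+ a plain dict also preserves insertion order, so the OrderedDict deduplication above can be factored into a small helper; a sketch (the helper name is illustrative, the subName field follows the example):

    def unique_by_key(entries, key):
        # First occurrence wins; dict keeps first-seen order on Python 3.7+.
        seen = {}
        for entry in entries:
            seen.setdefault(entry[key], entry)
        return list(seen.values())

    # e.g. for govItem in unique_by_key(sub_name_list, 'subName'): ...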
Example No. 13
 def content_request(self, response, title):
     str_body = response.body.decode(response.encoding)
     print(str_body)
     content = self.parse_page_detail(str_body)
     if content is None:
         content = title
     item = GovItem()
     item['title'] = title
     item['content'] = content
     item['sourceOrg'] = '头条'
     item['publishTime'] = ''
     item['sourceUrl'] = response.url
     item['comments'] = ''
     print(item['content'])
     yield item
Example No. 14
 def gd_content_detail(self, response, title, pub_time):
     org_info = response.xpath(
         "//div[@class='meta']/div/span[2]/text()").extract()[0]
     source_org = org_info.replace('来源:', '')
     if source_org == '':
         source_org = "广东财政局"
     content = response.xpath("//div[@class='content']/p").extract()[0]
     item = GovItem()
     item['title'] = title
     item['content'] = content
     item['sourceOrg'] = source_org
     item['comments'] = ''
     item['publishTime'] = pub_time
     item['sourceUrl'] = response.url
     yield item
Example No. 15
 def comment_request(self, response, title, detail_url, sourceOrg,
                     behotTime):
     comment_datas = json.loads(response.text)['data']['comments']
     item = GovItem()
     item['title'] = title
     item['content'] = ''
     item['sourceOrg'] = sourceOrg
     item['publishTime'] = behotTime
     item['sourceUrl'] = detail_url
     # Default size is 5.
     comments_list = []
     for comment in comment_datas:
         comments_list.append(comment['text'])
     item['comments'] = comments_list
     yield item
Example No. 16
    def parse(self, response):
        items = []
        for project in response.css('ul.vT-srch-result-list-bid').css("li"):
            item = GovItem()
            item['name'] = project.css("a::text").extract()[0]
            item['pro'] = project.css("a::text").extract()[1]
            item['info'] = project.css("p::text").extract()
            items.append(item)
        return items
Example No. 17
 def mof_cn_content_detail(self, response, title, pub_time):
     org_path = '//*[@id="tb_select"]/option/text()'
     source_org = response.xpath(org_path).extract()
     if len(source_org) == 0:
         source_org = '财政部'
     else:
         source_org = source_org[0]
     content_path = '//*[@id="Zoom"]'
     content = response.xpath(content_path).extract()[0]
     item = GovItem()
     item['title'] = title
     item['content'] = content
     item['sourceOrg'] = source_org
     item['comments'] = ''
     item['publishTime'] = pub_time
     item['sourceUrl'] = response.url
     yield item
Example No. 18
    def _real_parse_item(self, response):

        item = GovItem(domain_collection=None,
                       html=None,
                       pdf=[],
                       xls=[],
                       images=[],
                       others=[])
        # 1. Save the HTML.
        filename = make_file_name(response.url, 'html')
        item['html'] = filename

        domain = response.url.split('/')[2]
        item['domain_collection'] = md5_encode(domain)
        abpath = DATA_DIR + item['domain_collection']

        if not os.path.exists(abpath):  # Create the folder on first use.
            os.makedirs(abpath)

        with open(abpath + '/' + filename, 'wb') as f:
            f.write(response.body)

        # 2. Save the other linked resources (images, PDFs, spreadsheets).
        images = response.selector.xpath('//img/@src').extract()
        pdf = response.selector.xpath(
            '//a/@href[contains(.,".pdf")]').extract()
        xls = response.selector.xpath(
            '//a/@href[contains(.,".xls")]').extract()
        urls = images + pdf + xls

        # Each URL could alternatively be fetched through a local Splash
        # renderer (http://localhost:8050/render.html?url=...).
        for url in urls:
            yield response.follow(url,
                                  callback=self.save_files,
                                  cb_kwargs=dict(item=item))
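make_file_name, md5_encode and DATA_DIR come from elsewhere in this project and are not shown. A plausible stand-in for md5_encode, purely illustrative:

    import hashlib

    def md5_encode(text):
        # Hypothetical stand-in: maps a domain to a stable hex folder name.
        return hashlib.md5(text.encode('utf-8')).hexdigest()

    # e.g. md5_encode('search.ccgp.gov.cn') -> a 32-character hex digest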
Example No. 19
 def sc_content_detail(self, response):
     title = response.xpath("//div[@class='infoTxt']/p/text()").extract()[0]
     org_info = response.xpath(
         "//div[@class='infoTxt']/div[1]/div[1]/text()").extract()[0]
     pub_time = response.xpath(
         "//div[@class='infoTxt']/div[1]/div[2]/text()").extract()[0]
     content = response.xpath("//div[@class='txt2-in']").extract()[0]
     source_org = org_info.replace('信息来源:', '')
     if source_org == '':
         source_org = "四川财政局"
     item = GovItem()
     item['title'] = title
     item['content'] = content
     item['sourceOrg'] = source_org
     item['comments'] = ''
     item['publishTime'] = pub_time
     item['sourceUrl'] = response.url
     yield item
Example No. 20
    def req_bjh_content(self, response, title, source_org, pub_time,
                        source_url, keyword):
        contents = response.xpath("//div[@class='article-content']/p").xpath(
            'string(.)').extract()
        print('++++++++++++')
        content = ''
        for item in contents:
            content = content + item + '\n'
        print(content)

        govItem = GovItem()
        govItem['title'] = title
        govItem['content'] = content
        govItem['sourceOrg'] = source_org
        govItem['comments'] = ''
        govItem['publishTime'] = pub_time
        govItem['sourceUrl'] = source_url
        govItem['subName'] = keyword
        yield govItem
Example No. 21
 def zj_parse(self, response):
     browser = response.browser
     infos = browser.find_elements_by_css_selector('tr[valign=top]')
     for info in infos:
         try:
             if info.find_element_by_css_selector('a'):  # only rows that contain an <a>
                 item = GovItem()
                 item['title'] = info.find_element_by_css_selector(
                     '.text').text
                 detail_url = info.find_element_by_css_selector(
                     'a').get_attribute('href')
                 item['detail_url'] = detail_url
                 self.browser2.get(detail_url)
                 self.parse_detail(self.browser2, item)
                 yield item
         except (StaleElementReferenceException, NoSuchElementException):
             pass
     self.count += 1
     yield Request(self.start_url.format(page=self.count), self.zj_parse)
Example No. 22
    def bd_req_page(self, response):
        items = response.xpath("//div[@class='result']")
        for item in items:
            title = item.xpath("./h3/a/text()").extract()[0].replace(
                ' ', '').replace('\n', '')
            # The abstract is regex-matched against the space-stripped repr of
            # the extracted markup, hence '<spanclass' in the pattern and the
            # literal '\\n' (repr-escaped newlines) stripped below.
            content = item.xpath("./div").extract()
            pattern1 = re.compile(r'(?<=(</p>))\S*?(?=(<spanclass))')
            match_content = pattern1.search(str(content).replace(' ', ''))
            whole_content = ''
            if match_content:
                whole_content = match_content.group()
            whole_content = whole_content.replace('<em>', '').replace(
                '</em>', '').replace('\\n', '')
            govItem = GovItem()
            govItem['title'] = title
            govItem['content'] = whole_content
            govItem['sourceOrg'] = 'baidu'
            govItem['comments'] = ''
            govItem['publishTime'] = str(datetime.date.today())
            govItem['sourceUrl'] = 'www.baidu.com/news/search'
            yield govItem
Example No. 23
 def hn_parse(self, response):
     browser = response.browser
     letters = browser.find_elements_by_css_selector(
         'div.myxjgs-content div table tbody tr')
     for letter in letters:
         try:
             item = GovItem()
             item['title'] = letter.find_element_by_css_selector(
                 'td:nth-child(1) a').text
             detail_url = letter.find_element_by_css_selector(
                 'td:nth-child(1) a').get_attribute('href')
             item['detail_url'] = detail_url
             item['department'] = letter.find_element_by_css_selector(
                 'td:nth-child(2) a').text
             item['raise_date'] = letter.find_element_by_css_selector(
                 'td:nth-child(3) span').text
             item['res_date'] = letter.find_element_by_css_selector(
                 'td:nth-child(4) span').text  # may be empty
             self.browser2.get(detail_url)
             self.parse_detail(self.browser2, item)
             yield item
         except (NoSuchElementException, StaleElementReferenceException):
             pass
     # Page forward until the configured limit, 15 letters per page.
     if self.offset < self.page:
         self.offset += 15
         yield Request(self.basic_url.format(offset=self.offset),
                       self.hn_parse)
Example No. 24
 def bj_parse(self, response):
     browser = response.browser
     letters = browser.find_elements_by_css_selector('#newLetterReply li')
     for letter in letters:
         try:
             item = GovItem()
             item['title'] = letter.find_element_by_css_selector(
                 'p.font14.mymail_title a span').text
             detail_url = letter.find_element_by_css_selector(
                 'p.font14.mymail_title a').get_attribute('href')
             item['detail_url'] = detail_url
             item['department'] = letter.find_element_by_css_selector(
                 '.font12.gray .mail_margin[name]').text
             self.browser2.get(detail_url)
             self.parse_detail(self.browser2, item)
             yield item
         except (NoSuchElementException, StaleElementReferenceException):
             pass
     self.count += 1
     yield Request(
         self.beijing_url.format(PCon=self.count, type='nextPage'),
         self.bj_parse)
Example No. 25
 def sh_parse(self, response):
     browser = response.browser
     infos = browser.find_elements_by_css_selector('#FBList tr')
     for info in infos:
         try:
             if info.find_element_by_css_selector('a'):  # only rows that contain an <a> tag
                 item = GovItem()
                 item['title'] = info.find_element_by_css_selector(
                     'td a').text
                 detail_url = info.find_element_by_css_selector(
                     'td a').get_attribute('href')
                 item['detail_url'] = detail_url
                 item['department'] = info.find_element_by_css_selector(
                     'span').text
                 item['res_date'] = info.find_element_by_css_selector(
                     'td:nth-of-type(6)').text
                 self.browser2.get(detail_url)
                 self.parse_detail(self.browser2, item)
                 yield item
         except NoSuchElementException:
             pass
     self.count += 1
     yield Request(self.start_url.format(page=self.count), self.sh_parse)
Example No. 26
 def zj_content_detail(self, response):
     str_body = response.body.decode(response.encoding)
     # Extract the publish date from the raw page source.
     pattern1 = re.compile(r'(?<=(发布日期:))\S*?(?=(</td))')
     match_time = pattern1.search(str_body)
     pub_time = ''
     if match_time:
         pub_time = match_time.group()
     pattern2 = re.compile(r'(?<=(信息来源]>begin-->))\S*?(?=(<!))')
     match_org = pattern2.search(str_body)
     org_info = '浙江财政局'
     if match_org:
         org_info = match_org.group()
     content = response.xpath("//div[@id='zoom']").extract()[0]
     title = response.xpath("//title/text()").extract()[0]
     item = GovItem()
     item['title'] = title
     item['content'] = content
     item['sourceOrg'] = org_info
     item['comments'] = ''
     item['publishTime'] = pub_time
     item['sourceUrl'] = response.url
     yield item
Example No. 27
 def item_parse(self, publish_time, title, content, comment_url, thread_id,
                source_org):
     detail_comment_url = self.create_comment_url(thread_id, 30, 0)
     item = GovItem()
     item['title'] = title
     item['content'] = content
     item['sourceOrg'] = source_org
     item['publishTime'] = publish_time
     item['sourceUrl'] = comment_url
     print("title: " + title + "  publish_time: " + publish_time +
           "  source_org: " + source_org + "  comment_url: " + comment_url +
           "  content: " + content)
     # First request just to learn the total comment count.
     data = self.comment_parse(detail_comment_url)
     list_size = data.get('newListSize')
     if list_size != 0:
         if list_size <= 300:
             limit = 30
         else:
             limit = 40  # 40 is the maximum; larger values raise a paging-parameter error.
         page_num = math.ceil(list_size / limit)
         comments_list = []
         page = 0
         while page < page_num:
             offset = page * limit
             detail_comment_url = self.create_comment_url(
                 thread_id, limit, offset)
             print(detail_comment_url)
             data = self.comment_parse(detail_comment_url)
             for key in data['comments'].keys():
                 user_name = jsonpath.jsonpath(data['comments'][key],
                                               '$..nickname')
                 user_name = user_name[0] if user_name else ''
                 location = jsonpath.jsonpath(data['comments'][key],
                                              '$..location')
                 location = location[0] if location else ''
                 timeArray = time.strptime(
                     data['comments'][key]['createTime'].strip(),
                     "%Y-%m-%d %H:%M:%S")
                 timeStamp = int(time.mktime(timeArray) * 1000)
                 json_str = {
                     "content": data['comments'][key]['content'].replace(
                         '[', '').replace(']', ''),
                     "userName": user_name,
                     "place": location,
                     "hotNum": data['comments'][key]['vote'],
                     "publishDate": timeStamp,
                 }
                 comments_list.append(json_str)
             page += 1
         item['comments'] = comments_list
     else:
         item['comments'] = ''
     yield item
Example No. 28
 def bd_req_page(self, response, keyword):
     items = response.xpath("//div[@class='result']")
     for item in items:
         title = item.xpath("./h3/a").xpath(
             'string(.)').extract()[0].replace('\n', '').replace(' ', '')
         # Split on the non-breaking spaces that separate source and time.
         source_time = item.xpath("./div/p").xpath('string(.)').extract()[0] \
             .replace('\n', '').replace('\t', '').replace(' ', '').split('\xa0')
         source_org = source_time[0]
         whole_time = source_time[2]
         if '小时前' in whole_time:
             # "N hours ago": current time in milliseconds minus N hours.
             t = time.time()
             timeStamp = int(round(t * 1000))
             hoursStamp = int(whole_time.replace('小时前', '')) * 60 * 60 * 1000
             pub_time = timeStamp - hoursStamp
         elif '分钟前' in whole_time:
             # "N minutes ago".
             t = time.time()
             timeStamp = int(round(t * 1000))
             minutesStamp = int(whole_time.replace('分钟前', '')) * 60 * 1000
             pub_time = timeStamp - minutesStamp
         else:
             # Absolute timestamps look like 2018年07月18日10:05.
             format_time = whole_time.replace('年', '-').replace(
                 '月', '-').replace('日', ' ')
             timeArray = time.strptime(format_time, "%Y-%m-%d %H:%M")
             pub_time = int(time.mktime(timeArray) * 1000)
         source_url = item.xpath("./h3/a/@href").extract()[0]
         if 'baijiahao.baidu.com' in source_url:
             print('baijiahao')
             # Default arguments freeze the current loop values for the callback.
             yield scrapy.Request(
                 source_url,
                 callback=lambda response, title=title,
                 source_org=source_org, pub_time=pub_time,
                 source_url=source_url, keyword=keyword:
                 self.req_bjh_content(response, title, source_org,
                                      pub_time, source_url, keyword))
         else:
             print('not baijiahao')
             str_content = item.xpath("./div").extract()[0].replace(
                 '\n', '').replace('\t', '').replace(' ', '')
             pattern1 = re.compile(r'(?<=(</p>))\S*?(?=(<span))')
             match_content = pattern1.search(str_content)
             if match_content:
                 content = match_content.group().replace(
                     '<em>', '').replace('</em>', '')
             else:
                 content = item.xpath("./div/text()").extract()[0]
             govItem = GovItem()
             govItem['title'] = title
             govItem['content'] = content
             govItem['sourceOrg'] = source_org
             govItem['comments'] = ''
             govItem['publishTime'] = pub_time
             govItem['sourceUrl'] = source_url
             govItem['subName'] = keyword
             yield govItem