Example #1
 def parse_page(self, response):
     base_url = 'http://www.innocom.gov.cn'
     regex = '//div[@class="list7"]/ul/li'
     for sel in response.xpath(regex):
         link = sel.xpath(r'a/@href').extract_first(default='').strip()
         title = sel.xpath(r'a/text()').extract_first(default='').strip()
         date = sel.xpath(r'span/text()').extract_first(default='').strip()
         lst = [link, title, date]
         if not all(lst):
             logging.warning(f'{response.url}--{link}: failed to get data')
             continue
         url = base_url + link
         unique_id = get_md5(url)
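         # skip pages already stored for this spider (dedup on unique_id + origin)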
         if self.mongo_col.find_one({'$and': [{'unique_id': unique_id}, {'origin': f'{self.name}'}]}):
             logging.warning(f'{url} already downloaded, unique_id: {unique_id}')
             continue
         date = date.strip('[').strip(']')
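         # a bare 'YYYY-MM-DD' date gets the current clock time appended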
         if len(date) == 10:
             now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
             date += ' ' + now.split(' ')[-1]
         item = GovInfoItem()
         item['url'] = url
         item['unique_id'] = unique_id
         item['title'] = title
         item['source'] = '科技部火炬高技术产业开发中心'
         item['date'] = date
         item['origin'] = self.name
         item['type'] = 'web'
         item['location'] = '高新区'
         item['crawled'] = 1
         yield scrapy.FormRequest(url, method='GET', headers=self.headers,
                                  meta={'item': item}, callback=self.parse_item)
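A note on get_md5: every example in this section calls this helper, but its
definition is not shown here. A minimal sketch of what it presumably does (an
assumption, not the original implementation) is to hex-digest the key:

 import hashlib

 def get_md5(key):
     # hypothetical helper: MD5 is used only as a stable dedup key,
     # not for anything security-sensitive
     return hashlib.md5(key.encode('utf-8')).hexdigest()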
Example #2
 def parse_page(self, response):
     headers = copy.deepcopy(self.headers)
     headers.update({'Host': 'www.cdmbc.gov.cn'})
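     # parse the XML body; the json round-trip converts OrderedDicts to plain dicts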
     json_data = json.loads(json.dumps(xmltodict.parse(response.body)))
     for row in json_data['rows']['row']:
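         # '|$' makes findall() always return at least one (possibly empty) match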
         url = re.findall(r'href="(.*?)"|$', row['cell'][0])[0]
         if url == '' or 'cdmbc' not in url:
             logging.warning(f'{response.url}--{url}: failed to get data')
             continue
         date = row['cell'][1]
         unique_id = get_md5(url)
         if self.mongo_col.find_one({'$and': [{'unique_id': unique_id}, {'origin': f'{self.name}'}]}):
             logging.warning(f'{url} already downloaded, unique_id: {unique_id}')
             continue
         date = date.strip('[').strip(']')
         if len(date) == 10:
             now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
             date += ' ' + now.split(' ')[-1]
         item = GovInfoItem()
         item['url'] = url
         item['unique_id'] = unique_id
         item['source'] = '成都市商务委'
         item['date'] = date
         item['origin'] = self.name
         item['type'] = 'web'
         item['location'] = '成都市'
         item['crawled'] = 1
         yield scrapy.FormRequest(url, method='GET', headers=headers,
                                  meta={'item': item}, callback=self.parse_item)
Example #3
 def parse(self, response):
     base_url = 'http://www.cdsme.com'
     regex = '//div[@class="clearFix MN_A1_box"]'
     for sel in response.xpath(regex):
         link = sel.xpath(r'div[1]/a/@href').extract_first(default='').strip()
         summary = sel.xpath(r'div[2]/div/p/text()').extract_first(default='').strip()
         date = sel.xpath(r'div[1]/p/text()').extract_first(default='').strip()
         lst = [link, date]
         if not all(lst):
             logging.warning(f'{response.url}--{link}: failed to get data')
             continue
         url = base_url + link
         unique_id = get_md5(url)
         if self.mongo_col.find_one({'$and': [{'unique_id': unique_id}, {'origin': f'{self.name}'}]}):
             logging.warning(f'{url} already downloaded, unique_id: {unique_id}')
             continue
         date = date.strip('[').strip(']')
         if len(date) == 10:
             now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
             date += ' ' + now.split(' ')[-1]
         item = GovInfoItem()
         item['url'] = url
         item['unique_id'] = unique_id
         item['summary'] = summary
         item['source'] = '成都市中小企业网'
         item['date'] = date
         item['origin'] = self.name
         item['type'] = 'web'
         item['location'] = '成都市'
         item['crawled'] = 1
         yield scrapy.FormRequest(url, method='GET', headers=self.headers,
                                  meta={'item': item}, callback=self.parse_item)
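The list-page examples repeat the same date-completion idiom: a bare
'YYYY-MM-DD' value has the current clock time appended so every stored date
carries a time part. A sketch of how the idiom could be factored into a shared
helper (hypothetical, not part of the original spiders):

 import time

 def complete_date(date):
     # strip surrounding brackets, then append the current HH:MM:SS
     # to a date string that lacks a time part
     date = date.strip('[').strip(']')
     if len(date) == 10:
         now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
         date += ' ' + now.split(' ')[-1]
     return date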
Example #4
 def parse(self, response):
     json_data = response.meta['json_data']
     item = GovInfoItem(json_data['item'])
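     # re-parse the body with lxml so the article node can be serialized back to HTML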
     selector = etree.HTML(response.body)
     regex = r'//*[@id="js_content"]'
     title = response.xpath(r'//*[@id="activity-name"]/text()'
                            ).extract_first(default='').strip()
     content = response.xpath(regex).xpath('string(.)').extract_first(
         '').strip()
     if (content == '') and (title == ''):
         logging.warning(f'{item["url"]}: title and content is none')
         self.task_col.update_one({'unique_id': item['task_unique_id']},
                                  {'$set': {'crawled': -1}})
         return
     try:
         content = etree.tostring(selector.xpath(regex)[0],
                                  encoding='utf-8')
     except Exception as err:
         logging.error(f'{item["url"]}: get content failed: {err}')
         return
     # etree.tostring() returns bytes; decode before slicing or storing
     content = content.decode('utf-8').replace('\n', '')
     if item['summary'] == '':
         item['summary'] = content[:100] if content != '' else item['title']
     item['content'] = content
     item['title'] = title.strip()
     item['crawled'] = 1
     yield item
Example #5
 def parse_page(self, response):
     regex = '//div[@class="newlist_left_cont"]/ul'
     for sel in response.xpath(regex):
         link = sel.xpath(r'li[1]/a/@href').extract_first(
             default='').strip()
         text = sel.xpath(r'li[2]/text()').extract_first(default='').strip()
         title = sel.xpath(r'li[1]/a/@title').extract_first(
             default='').strip()
         lst = [link, title, text]
         if not all(lst):
             logging.warning(f'{response.url}.{link}: failed to get data')
             continue
         if 'http' not in link:
             url = self.base_url + link
         else:
             url = link
         if 'cdgy.gov.cn' not in url:
             logging.warning(f'{url} is out the domain')
             continue
         unique_id = get_md5(url)
         if self.mongo_col.find_one({'$and': [{'unique_id': unique_id},
                                              {'origin': f'{self.name}'}]}):
             logging.warning(f'{url} already downloaded, unique_id: {unique_id}')
             continue
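         # squeeze out whitespace and colons, then pull the publisher and date from the text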
         text = ''.join(text.split())
         text = re.sub(r'\s|:|:', '', text)
         promulgator, date = re.findall(r'来源(.*?)发布时间(\d{4}-\d{2}-\d{2})',
                                        text)[0]
         if len(date) == 10:
             now = time.strftime('%Y-%m-%d %H:%M:%S',
                                 time.localtime(time.time()))
             date += ' ' + now.split(' ')[-1]
         item = GovInfoItem()
         item['url'] = url
         item['unique_id'] = unique_id
         item['title'] = title.strip()
         item['source'] = promulgator
         item['date'] = date
         item['origin'] = self.name
         item['type'] = 'web'
         item['location'] = '成都市'
         item['crawled'] = 1
         yield scrapy.FormRequest(url,
                                  method='GET',
                                  headers=self.headers,
                                  meta={'item': item},
                                  callback=self.parse_item)
Example #6
    def parse(self, response):
        result = 1
        url = response.url
        task = response.meta['task']
        params = task['params']
        origin = task['origin']
        # '|$' guarantees a match, so [0] is safe; total is b'' when no count is found
        total = re.findall(r'找到约(\d+)条结果|$'.encode('utf-8'), response.body)[0]
        if total != b'' and int(total) > 10:
            result = -1
            logging.error(f'{url}: {params}: too many result pages')
        self.task_col.update_one({'_id': task['_id']}, {"$set": {'crawled': result}})

        redis_values = []
        for sel in response.xpath(r'//li[contains(@id, "sogou_vr_")]'):
            item = GovInfoItem()
            unique_id = sel.xpath(r'./@d').extract_first(default='').strip()
            date = sel.xpath(r'div/div/@t').extract_first(default='').strip()
            source = sel.xpath(r'div/div/a/text()').extract_first(default='').strip()
            link = sel.xpath(r'div/h3/a/@href').extract_first(default='').strip()
            name = sel.xpath(r'div/div/a/text()').extract_first(default='').strip()
            lst = [unique_id, date, source, link]
            if not all(lst):
                result = -1
                logging.warning(f'{url}: {params}.{link}: failed to get data')
                self.task_col.update_one({'_id': task['_id']}, {"$set": {'crawled': result}})
                continue
            unique_id = get_md5(unique_id)
            if self.mongo_col.find_one({'$and': [{'unique_id': unique_id}, {'origin': f'{origin}'}]}):
                logging.warning(f'{url} already downloaded, unique_id: {unique_id}')
                continue
            if task['name'] != name:
                logging.warning(f'{url}: {params}.{link}: not published by {task["name"]}')
                continue
            url = link.replace('http://', 'https://', 1)
            item['url'] = url
            item['task_unique_id'] = task['unique_id']
            item['unique_id'] = unique_id
            try:
                item['summary'] = sel.xpath(r'./div/p[@class="txt-info"]')[0].xpath(
                    'string(.)').extract_first('').strip()
            except Exception as err:
                logging.warning(f'{url}: {params}.{link}: get summary failed')
                item['summary'] = ''
            item['date'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(int(date)))
            item['source'] = source
            item['origin'] = origin
            item['type'] = 'wxgzh'
            item['crawled'] = 0
            item['location'] = task['location']
            redis_values.append(json.dumps({'item': dict(item)}))
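        # batch the serialized items into a Redis set for a later detail crawl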
        if redis_values:
            self.redis_con.sadd(self.redis_key, *redis_values)
Example #7
    def parse_page(self, response):
        base_url = 'http://www.sczwfw.gov.cn:82'
        for sel in response.xpath('//div[@class="news_r"]//li'):
            item = GovInfoItem()
            link = sel.xpath('.//a/@href').extract_first(default='').strip()
            title = sel.xpath('.//a/@title').extract_first(default='').strip()
            date = sel.xpath('.//em/text()').extract_first(default='').strip()
            lst = [link, title, date]
            if not all(lst):
                logging.warning(f'{response.url}: failed to get data')
                continue
            url = base_url + link
            unique_id = get_md5(url)
            if self.mongo_col.find_one({'$and': [{'unique_id': unique_id},
                                                 {'origin': f'{self.name}'}]}):
                logging.warning(f'{url} already downloaded, unique_id: {unique_id}')
                continue

            if len(date) == 10:
                now = time.strftime('%Y-%m-%d %H:%M:%S',
                                    time.localtime(time.time()))
                date += ' ' + now.split(' ')[-1]

            item['url'] = url
            item['unique_id'] = unique_id
            item['title'] = title
            item['source'] = '四川省人民政府办公厅'
            item['origin'] = self.name
            item['date'] = date
            item['type'] = 'web'
            item['location'] = '四川省'
            item['crawled'] = 1
            yield scrapy.FormRequest(url,
                                     method='GET',
                                     headers=self.headers,
                                     meta={'item': item},
                                     callback=self.parse_item)
Example #8
 def parse_page(self, response):
     base_url = 'http://125.70.9.164:1250/csidc/csidc2/site/noticeView.jsp?id='
     regex = r'//tr[@class="btd"]'
     for sel in response.xpath(regex):
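         # the href is a javascript:display('...') call; .re() extracts the embedded id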
         link = sel.xpath(r'td[1]/a/@href').re(
             r'javascript:display\(\'(.*?)\'\)')
         title = sel.xpath(r'td[1]/a/text()').extract_first(
             default='').strip()
         date = sel.xpath(r'td[3]/text()').extract_first(default='').strip()
         lst = [link, date]
         if not all(lst):
             logging.warning(f'{response.url}.{link}: failed to get data')
             continue
         url = base_url + link[0]
         unique_id = get_md5(url)
         if self.mongo_col.find_one({'$and': [{'unique_id': unique_id},
                                              {'origin': f'{self.name}'}]}):
             logging.warning(f'{url} already downloaded, unique_id: {unique_id}')
             continue
         date = time.strftime("%Y-%m-%d %H:%M:%S",
                              time.strptime(date, "%Y-%m-%d %H:%M"))
         item = GovInfoItem()
         item['url'] = url
         item['unique_id'] = unique_id
         item['date'] = date
         item['title'] = title
         item['origin'] = self.name
         item['source'] = '成都市经济和信息化委员会'
         item['type'] = 'web'
         item['location'] = '成都市'
         item['crawled'] = 1
         yield scrapy.FormRequest(url,
                                  method='GET',
                                  headers=self.headers,
                                  meta={'item': item},
                                  callback=self.parse_item)
Example #9
 def parse(self, response):
     base_url = 'http://www.cdst.gov.cn/Readnews.asp?NewsID={}'
     for sel in response.xpath(r'//div[@class="listline"]/li'):
         title = sel.xpath(r'a/@title').extract_first(default='').strip()
         unique_id = sel.xpath('a/@href').extract_first(default='').strip()
         date = sel.xpath(r'span/text()').extract_first(default='').strip()
         lst = [title, unique_id, date]
         if not all(lst):
             logging.warning(f'{response.url}: failed to get data')
             continue
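         # rebuild a canonical detail URL from the NewsID query parameter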
         url = base_url.format(unique_id.split('=')[-1])
         unique_id = get_md5(url)
         if self.mongo_col.find_one({'$and': [{'unique_id': unique_id},
                                              {'origin': f'{self.name}'}]}):
             logging.warning(f'{url} already downloaded, unique_id: {unique_id}')
             continue
         date = date.strip('[').strip(']')
         if len(date) == 10:
             now = time.strftime('%Y-%m-%d %H:%M:%S',
                                 time.localtime(time.time()))
             date += ' ' + now.split(' ')[-1]
         item = GovInfoItem()
         item['url'] = url
         item['unique_id'] = unique_id
         item['title'] = title
         item['source'] = '成都市科学技术局'
         item['date'] = date
         item['origin'] = self.name
         item['type'] = 'web'
         item['location'] = '成都市'
         item['crawled'] = 1
         yield scrapy.FormRequest(url,
                                  method='GET',
                                  headers=self.headers,
                                  meta={'item': item},
                                  callback=self.parse_item)
Example #10
    def parse(self, response):
        base_url = 'http://www.cdhrsip.com/article/newsInfo?id={}'
        json_data = json.loads(response.body)
        records = json_data['records']
        for record in records:
            url = base_url.format(record['id'])
            unique_id = get_md5(str(record['id']))
            if self.mongo_col.find_one({'$and': [{'unique_id': unique_id},
                                                 {'origin': f'{self.name}'}]}):
                logging.warning(f'{url} already downloaded, unique_id: {unique_id}')
                continue
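            # render the embedded HTML to plain text; the first 100 chars become the summary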
            selector = etree.HTML(record['content'])
            summary = selector.xpath('string(.)').strip()[:100]
            summary = summary if summary != '' else record['title']

            date = record['publishTime']
            if len(date) == 10:
                now = time.strftime('%Y-%m-%d %H:%M:%S',
                                    time.localtime(time.time()))
                date += ' ' + now.split(' ')[-1]
            item = GovInfoItem()
            item['url'] = url
            item['unique_id'] = unique_id
            item['title'] = record['title']
            item['summary'] = summary
            item['source'] = record['author']
            item['date'] = date
            item['origin'] = self.name
            item['type'] = 'web'
            item['location'] = '成都市'
            item['crawled'] = 1
            item['content'] = record['content']
            yield item
Example #11
    def parse_page(self, response):
        base_url = 'http://www.zgzzscxd.com/'
        for sel in response.xpath('//li[@class="clearfix"]'):
            item = GovInfoItem()
            link = sel.xpath('a/@href').extract_first(default='').strip()
            title = sel.xpath('a/@title').extract_first(default='').strip()
            lst = [link, title]
            if not all(lst):
                logging.warning(f'{response.url}: failed to get data')
                continue
            url = base_url + link
            unique_id = get_md5(url)
            if self.mongo_col.find_one({'$and': [{'unique_id': unique_id},
                                                 {'origin': f'{self.name}'}]}):
                logging.warning(f'{url} already downloaded, unique_id: {unique_id}')
                continue

            item['url'] = url
            item['unique_id'] = unique_id
            item['title'] = title
            item['source'] = '成都市经济和信息化委员会'
            item['origin'] = self.name
            item['type'] = 'web'
            item['location'] = '成都市'
            item['crawled'] = 1
            yield scrapy.FormRequest(url,
                                     method='GET',
                                     headers=self.headers,
                                     meta={'item': item},
                                     callback=self.parse_item)
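A side note on the dedup lookup used in every example: wrapping two conditions
on different fields in '$and' is redundant in MongoDB, so a flat filter matches
the same documents:

 # equivalent, flatter form of the dedup query
 self.mongo_col.find_one({'unique_id': unique_id, 'origin': self.name})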