def parse_page(self, response):
    base_url = 'http://www.innocom.gov.cn'
    regex = '//div[@class="list7"]/ul/li'
    for sel in response.xpath(regex):
        link = sel.xpath(r'a/@href').extract_first(default='').strip()
        title = sel.xpath(r'a/text()').extract_first(default='').strip()
        date = sel.xpath(r'span/text()').extract_first(default='').strip()
        if not all([link, title, date]):
            logging.warning(f'{response.url}--{link}: get data failed')
            continue
        url = base_url + link
        unique_id = get_md5(url)
        # Skip records this spider has already stored.
        if self.mongo_col.find_one({'$and': [{'unique_id': unique_id},
                                             {'origin': self.name}]}):
            logging.warning(f'{url} is downloaded already, unique_id: {unique_id}')
            continue
        # Dates arrive as '[YYYY-MM-DD]'; strip the brackets and pad a
        # date-only value with the current wall-clock time.
        date = date.strip('[').strip(']')
        if len(date) == 10:
            now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            date += ' ' + now.split(' ')[-1]
        item = GovInfoItem()
        item['url'] = url
        item['unique_id'] = unique_id
        item['title'] = title
        item['source'] = '科技部火炬高技术产业开发中心'
        item['date'] = date
        item['origin'] = self.name
        item['type'] = 'web'
        item['location'] = '高新区'
        item['crawled'] = 1
        yield scrapy.FormRequest(url, method='GET', headers=self.headers,
                                 meta={'item': item}, callback=self.parse_item)
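get_md5 is defined elsewhere in the project; a minimal sketch of what it is assumed to do (hex MD5 digest of the UTF-8 encoded input), so the dedup logic above is self-contained:

import hashlib

def get_md5(value):
    # Assumed helper, not the project's actual definition: hex digest of
    # the UTF-8 encoded input string, used as a stable dedup key.
    return hashlib.md5(value.encode('utf-8')).hexdigest()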
def parse_page(self, response):
    headers = copy.deepcopy(self.headers)
    headers.update({'Host': 'www.cdmbc.gov.cn'})
    # The endpoint returns XML; round-trip through JSON to turn xmltodict's
    # OrderedDicts into plain dicts.
    json_data = json.loads(json.dumps(xmltodict.parse(response.body)))
    for row in json_data['rows']['row']:
        # The trailing '|$' alternative guarantees at least one match, so
        # indexing [0] is safe and yields '' when no href is present.
        url = re.findall(r'href="(.*?)"|$', row['cell'][0])[0]
        if url == '' or 'cdmbc' not in url:
            logging.warning(f'{response.url}--{url}: get data failed')
            continue
        date = row['cell'][1]
        unique_id = get_md5(url)
        if self.mongo_col.find_one({'$and': [{'unique_id': unique_id},
                                             {'origin': self.name}]}):
            logging.warning(f'{url} is downloaded already, unique_id: {unique_id}')
            continue
        date = date.strip('[').strip(']')
        if len(date) == 10:
            now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            date += ' ' + now.split(' ')[-1]
        item = GovInfoItem()
        item['url'] = url
        item['unique_id'] = unique_id
        item['source'] = '成都市商务委'
        item['date'] = date
        item['origin'] = self.name
        item['type'] = 'web'
        item['location'] = '成都市'
        item['crawled'] = 1
        yield scrapy.FormRequest(url, method='GET', headers=headers,
                                 meta={'item': item}, callback=self.parse_item)
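The json round-trip above is a common trick for converting the OrderedDicts that xmltodict.parse produces into plain dicts. A standalone illustration (the XML sample is made up); note that with a single <row> element xmltodict yields a dict rather than a list, so the loop above assumes the feed always carries multiple rows:

import json
import xmltodict

sample = '<rows><row><cell>a</cell><cell>b</cell></row></rows>'
parsed = xmltodict.parse(sample)        # OrderedDict-like mapping
plain = json.loads(json.dumps(parsed))  # plain dicts/lists/strings
print(plain['rows']['row']['cell'])     # ['a', 'b']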
def parse(self, response):
    base_url = 'http://www.cdsme.com'
    regex = '//div[@class="clearFix MN_A1_box"]'
    for sel in response.xpath(regex):
        link = sel.xpath(r'div[1]/a/@href').extract_first(default='').strip()
        summary = sel.xpath(r'div[2]/div/p/text()').extract_first(default='').strip()
        date = sel.xpath(r'div[1]/p/text()').extract_first(default='').strip()
        # The summary may legitimately be empty, so only link and date are required.
        if not all([link, date]):
            logging.warning(f'{response.url}--{link}: get data failed')
            continue
        url = base_url + link
        unique_id = get_md5(url)
        if self.mongo_col.find_one({'$and': [{'unique_id': unique_id},
                                             {'origin': self.name}]}):
            logging.warning(f'{url} is downloaded already, unique_id: {unique_id}')
            continue
        date = date.strip('[').strip(']')
        if len(date) == 10:
            now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            date += ' ' + now.split(' ')[-1]
        item = GovInfoItem()
        item['url'] = url
        item['unique_id'] = unique_id
        item['summary'] = summary
        item['source'] = '成都市中小企业网'
        item['date'] = date
        item['origin'] = self.name
        item['type'] = 'web'
        item['location'] = '成都市'
        item['crawled'] = 1
        yield scrapy.FormRequest(url, method='GET', headers=self.headers,
                                 meta={'item': item}, callback=self.parse_item)
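The bracket-stripping and time-padding steps repeat across these spiders; a hypothetical normalize_date helper (not part of the original code) that could factor them out:

import time

def normalize_date(date):
    # Hypothetical refactor of the repeated inline logic: strip '[...]'
    # brackets, and if only 'YYYY-MM-DD' remains, append the current time.
    date = date.strip('[').strip(']')
    if len(date) == 10:
        now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        date += ' ' + now.split(' ')[-1]
    return date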
def parse(self, response):
    json_data = response.meta['json_data']
    item = GovInfoItem(json_data['item'])
    selector = etree.HTML(response.body)
    regex = r'//*[@id="js_content"]'
    title = response.xpath(r'//*[@id="activity-name"]/text()').extract_first(default='').strip()
    # Plain text of the article body; kept separate from the serialized HTML
    # so the summary below is built from text, not from raw markup bytes.
    text = response.xpath(regex).xpath('string(.)').extract_first('').strip()
    if text == '' and title == '':
        logging.warning(f'{item["url"]}: title and content are both empty')
        self.task_col.update({'unique_id': item['task_unique_id']},
                             {'$set': {'crawled': -1}})
        return
    try:
        # etree.tostring returns bytes when an encoding is given.
        content = etree.tostring(selector.xpath(regex)[0], encoding='utf-8')
    except Exception:
        logging.error(f'{item["url"]}: get content failed')
        return
    if item['summary'] == '':
        item['summary'] = text[:100] if text != '' else item['title']
    item['content'] = content.decode('utf-8').replace(' ', '')
    item['title'] = title
    item['crawled'] = 1
    yield item
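For reference, lxml's etree.tostring returns bytes when an encoding is given, which is why the decode above is needed before string operations:

from lxml import etree

node = etree.HTML('<div id="js_content"><p>正文</p></div>')
html = etree.tostring(node, encoding='utf-8')
print(type(html))            # <class 'bytes'>
print(html.decode('utf-8'))  # serialized HTML as a str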
def parse_page(self, response):
    regex = '//div[@class="newlist_left_cont"]/ul'
    for sel in response.xpath(regex):
        link = sel.xpath(r'li[1]/a/@href').extract_first(default='').strip()
        text = sel.xpath(r'li[2]/text()').extract_first(default='').strip()
        title = sel.xpath(r'li[1]/a/@title').extract_first(default='').strip()
        if not all([link, title, text]):
            logging.warning(f'{response.url}.{link}: get data failed')
            continue
        # Links may be absolute or site-relative.
        if 'http' not in link:
            url = self.base_url + link
        else:
            url = link
        if 'cdgy.gov.cn' not in url:
            logging.warning(f'{url} is outside the domain')
            continue
        unique_id = get_md5(url)
        if self.mongo_col.find_one({'$and': [{'unique_id': unique_id},
                                             {'origin': self.name}]}):
            logging.warning(f'{url} is downloaded already, unique_id: {unique_id}')
            continue
        # Collapse whitespace and both half- and full-width colons so the
        # pattern below only has to match '来源...发布时间YYYY-MM-DD'.
        text = ''.join(text.split())
        text = re.sub(r'\s|:|:', '', text)
        matched = re.findall(r'来源(.*?)发布时间(\d{4}-\d{2}-\d{2})', text)
        if not matched:
            logging.warning(f'{url}: source/date not found in list text')
            continue
        promulgator, date = matched[0]
        if len(date) == 10:
            now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            date += ' ' + now.split(' ')[-1]
        item = GovInfoItem()
        item['url'] = url
        item['unique_id'] = unique_id
        item['title'] = title
        item['source'] = promulgator
        item['date'] = date
        item['origin'] = self.name
        item['type'] = 'web'
        item['location'] = '成都市'
        item['crawled'] = 1
        yield scrapy.FormRequest(url, method='GET', headers=self.headers,
                                 meta={'item': item}, callback=self.parse_item)
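A quick check of the 来源/发布时间 pattern against the kind of collapsed text this page produces (the sample string is made up):

import re

text = '来源成都市经济和信息化委员会发布时间2018-05-02'
matched = re.findall(r'来源(.*?)发布时间(\d{4}-\d{2}-\d{2})', text)
print(matched[0])  # ('成都市经济和信息化委员会', '2018-05-02')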
def parse(self, response):
    result = 1
    url = response.url
    task = response.meta['task']
    params = task['params']
    origin = task['origin']
    # '|$' guarantees findall returns at least one (possibly empty) match,
    # so indexing [0] is safe; the pattern is encoded because response.body
    # is bytes.
    total = re.findall(r'找到约(\d+)条结果|$'.encode('utf-8'), response.body)[0]
    if total != b'' and int(total) > 10:
        result = -1
        logging.error(f'{url}: {params} too many result pages')
    self.task_col.update({'_id': task['_id']}, {'$set': {'crawled': result}})
    redis_values = []
    for sel in response.xpath(r'//li[contains(@id, "sogou_vr_")]'):
        item = GovInfoItem()
        unique_id = sel.xpath(r'./@d').extract_first(default='').strip()
        date = sel.xpath(r'div/div/@t').extract_first(default='').strip()
        # The source and the publishing account name come from the same node.
        source = sel.xpath(r'div/div/a/text()').extract_first(default='').strip()
        link = sel.xpath(r'div/h3/a/@href').extract_first(default='').strip()
        if not all([unique_id, date, source, link]):
            result = -1
            logging.warning(f'{url}: {params}.{link}: get data failed')
            self.task_col.update({'_id': task['_id']}, {'$set': {'crawled': result}})
            continue
        unique_id = get_md5(unique_id)
        if self.mongo_col.find_one({'$and': [{'unique_id': unique_id},
                                             {'origin': origin}]}):
            logging.warning(f'{link} is downloaded already, unique_id: {unique_id}')
            continue
        if task['name'] != source:
            logging.warning(f'{url}: {params}.{link}: is not published by {task["name"]}')
            continue
        # Replace only the scheme prefix (a bare replace('http', 'https')
        # would corrupt links that are already https), and keep the result
        # in its own variable instead of clobbering the page url.
        article_url = link.replace('http://', 'https://', 1)
        item['url'] = article_url
        item['task_unique_id'] = task['unique_id']
        item['unique_id'] = unique_id
        try:
            item['summary'] = sel.xpath(r'./div/p[@class="txt-info"]')[0].xpath(
                'string(.)').extract_first('').strip()
        except Exception:
            logging.warning(f'{url}: {params}.{link}: get summary failed')
            item['summary'] = ''
        # The @t attribute is a unix timestamp.
        item['date'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(int(date)))
        item['source'] = source
        item['origin'] = origin
        item['type'] = 'wxgzh'
        item['crawled'] = 0
        item['location'] = task['location']
        redis_values.append(json.dumps({'item': dict(item)}))
    if redis_values:
        self.redis_con.sadd(self.redis_key, *redis_values)
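The '|$' alternative used in the result-count pattern is the same never-fails-to-match trick as in the XML spider above: the empty match at end of input means [0] never raises, and the unset group comes back as b'' on bytes input. A minimal illustration:

import re

body = '共找到约25条结果'.encode('utf-8')
total = re.findall(r'找到约(\d+)条结果|$'.encode('utf-8'), body)[0]
print(total)  # b'25'

body = b'no results here'
total = re.findall(r'找到约(\d+)条结果|$'.encode('utf-8'), body)[0]
print(total)  # b''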
def parse_page(self, response):
    base_url = 'http://www.sczwfw.gov.cn:82'
    for sel in response.xpath('//div[@class="news_r"]//li'):
        link = sel.xpath('.//a/@href').extract_first(default='').strip()
        title = sel.xpath('.//a/@title').extract_first(default='').strip()
        date = sel.xpath('.//em/text()').extract_first(default='').strip()
        if not all([link, title, date]):
            logging.warning(f'{response.url}: get data failed')
            continue
        url = base_url + link
        unique_id = get_md5(url)
        if self.mongo_col.find_one({'$and': [{'unique_id': unique_id},
                                             {'origin': self.name}]}):
            logging.warning(f'{url} is downloaded already, unique_id: {unique_id}')
            continue
        if len(date) == 10:
            now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            date += ' ' + now.split(' ')[-1]
        item = GovInfoItem()
        item['url'] = url
        item['unique_id'] = unique_id
        item['title'] = title
        item['source'] = '四川省人民政府办公厅'
        item['origin'] = self.name
        item['date'] = date
        item['type'] = 'web'
        item['location'] = '四川省'
        item['crawled'] = 1
        yield scrapy.FormRequest(url, method='GET', headers=self.headers,
                                 meta={'item': item}, callback=self.parse_item)
def parse_page(self, response):
    base_url = 'http://125.70.9.164:1250/csidc/csidc2/site/noticeView.jsp?id='
    regex = r'//tr[@class="btd"]'
    for sel in response.xpath(regex):
        # .re() returns a list of the notice ids captured from the
        # javascript:display('...') links.
        link = sel.xpath(r'td[1]/a/@href').re(r'javascript:display\(\'(.*?)\'\)')
        title = sel.xpath(r'td[1]/a/text()').extract_first(default='').strip()
        date = sel.xpath(r'td[3]/text()').extract_first(default='').strip()
        if not all([link, date]):
            logging.warning(f'{response.url}.{link}: get data failed')
            continue
        url = base_url + link[0]
        unique_id = get_md5(url)
        if self.mongo_col.find_one({'$and': [{'unique_id': unique_id},
                                             {'origin': self.name}]}):
            logging.warning(f'{url} is downloaded already, unique_id: {unique_id}')
            continue
        # Normalize 'YYYY-MM-DD HH:MM' to 'YYYY-MM-DD HH:MM:SS'.
        date = time.strftime('%Y-%m-%d %H:%M:%S',
                             time.strptime(date, '%Y-%m-%d %H:%M'))
        item = GovInfoItem()
        item['url'] = url
        item['unique_id'] = unique_id
        item['date'] = date
        item['title'] = title
        item['origin'] = self.name
        item['source'] = '成都市经济和信息化委员会'
        item['type'] = 'web'
        item['location'] = '成都市'
        item['crawled'] = 1
        yield scrapy.FormRequest(url, method='GET', headers=self.headers,
                                 meta={'item': item}, callback=self.parse_item)
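The strptime/strftime round-trip above is how a minutes-precision timestamp gets normalized to the 'YYYY-MM-DD HH:MM:SS' format the other spiders use; for example:

import time

date = time.strftime('%Y-%m-%d %H:%M:%S',
                     time.strptime('2018-05-02 09:30', '%Y-%m-%d %H:%M'))
print(date)  # 2018-05-02 09:30:00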
def parse(self, response):
    base_url = 'http://www.cdst.gov.cn/Readnews.asp?NewsID={}'
    for sel in response.xpath(r'//div[@class="listline"]/li'):
        title = sel.xpath(r'a/@title').extract_first(default='').strip()
        href = sel.xpath('a/@href').extract_first(default='').strip()
        date = sel.xpath(r'span/text()').extract_first(default='').strip()
        if not all([title, href, date]):
            logging.warning(f'{response.url}: get data failed')
            continue
        # Rebuild a canonical detail url from the NewsID query parameter.
        url = base_url.format(href.split('=')[-1])
        unique_id = get_md5(url)
        if self.mongo_col.find_one({'$and': [{'unique_id': unique_id},
                                             {'origin': self.name}]}):
            logging.warning(f'{url} is downloaded already, unique_id: {unique_id}')
            continue
        date = date.strip('[').strip(']')
        if len(date) == 10:
            now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            date += ' ' + now.split(' ')[-1]
        item = GovInfoItem()
        item['url'] = url
        item['unique_id'] = unique_id
        item['title'] = title
        item['source'] = '成都市科学技术局'
        item['date'] = date
        item['origin'] = self.name
        item['type'] = 'web'
        item['location'] = '成都市'
        item['crawled'] = 1
        yield scrapy.FormRequest(url, method='GET', headers=self.headers,
                                 meta={'item': item}, callback=self.parse_item)
def parse(self, response):
    base_url = 'http://www.cdhrsip.com/article/newsInfo?id={}'
    json_data = json.loads(response.body)
    for record in json_data['records']:
        url = base_url.format(record['id'])
        unique_id = get_md5(str(record['id']))
        if self.mongo_col.find_one({'$and': [{'unique_id': unique_id},
                                             {'origin': self.name}]}):
            logging.warning(f'{url} is downloaded already, unique_id: {unique_id}')
            continue
        # Flatten the HTML content to plain text for the summary; fall back
        # to the title when the body text is empty.
        selector = etree.HTML(record['content'])
        summary = selector.xpath('string(.)').strip()[:100]
        summary = summary if summary else record['title']
        date = record['publishTime']
        if len(date) == 10:
            now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            date += ' ' + now.split(' ')[-1]
        item = GovInfoItem()
        item['url'] = url
        item['unique_id'] = unique_id
        item['title'] = record['title']
        item['summary'] = summary
        item['source'] = record['author']
        item['date'] = date
        item['origin'] = self.name
        item['type'] = 'web'
        item['location'] = '成都市'
        item['crawled'] = 1
        item['content'] = record['content']
        # The JSON feed already carries the full content, so the item is
        # yielded directly with no follow-up request.
        yield item
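string(.) on an lxml tree flattens an HTML fragment to its text content, which is how the summary is derived above:

from lxml import etree

selector = etree.HTML('<div><p>第一段</p><p>第二段</p></div>')
print(selector.xpath('string(.)'))  # 第一段第二段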
def parse_page(self, response):
    base_url = 'http://www.zgzzscxd.com/'
    for sel in response.xpath('//li[@class="clearfix"]'):
        link = sel.xpath('a/@href').extract_first(default='').strip()
        title = sel.xpath('a/@title').extract_first(default='').strip()
        if not all([link, title]):
            logging.warning(f'{response.url}: get data failed')
            continue
        url = base_url + link
        unique_id = get_md5(url)
        if self.mongo_col.find_one({'$and': [{'unique_id': unique_id},
                                             {'origin': self.name}]}):
            logging.warning(f'{url} is downloaded already, unique_id: {unique_id}')
            continue
        item = GovInfoItem()
        item['url'] = url
        item['unique_id'] = unique_id
        item['title'] = title
        item['source'] = '成都市经济和信息化委员会'
        item['origin'] = self.name
        item['type'] = 'web'
        item['location'] = '成都市'
        item['crawled'] = 1
        yield scrapy.FormRequest(url, method='GET', headers=self.headers,
                                 meta={'item': item}, callback=self.parse_item)
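The base_url + link concatenation used throughout these spiders only works when exactly one side carries the slash; urllib.parse.urljoin is the safer general-purpose alternative (shown for reference, not a change to the original code):

from urllib.parse import urljoin

print(urljoin('http://www.zgzzscxd.com/', 'news/detail.html'))
# http://www.zgzzscxd.com/news/detail.html
print(urljoin('http://www.zgzzscxd.com/', '/news/detail.html'))
# http://www.zgzzscxd.com/news/detail.html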