def parseDetail(self, response):
    soup = BeautifulSoup(response.text, 'lxml')
    trs = soup.find('table', id='table').tbody.find_all('tr')
    for each in trs:
        tds = each.find_all('td')
        item = crawler114()
        item['spider_name'] = self.name
        item['entity_name'] = tds[0].get_text(strip=True)
        # The notice id is embedded in the onclick handler, e.g. Detail('xxx').
        item['notice_id'] = re.search(r"Detail\('(\w+)'",
                                      tds[0].a.attrs['onclick']).group(1)
        item['source_url'] = ('http://www.tjcredit.gov.cn/gsxt/excdir/detail?id='
                              + item['notice_id'])
        item['punish_agent'] = tds[1].get_text(strip=True)
        item['punish_reason'] = tds[2].get_text(strip=True)
        item['punish_date'] = tds[3].get_text(strip=True)
        item['report_year'] = tds[4].get_text(strip=True)
        item['create_date'] = time.strftime('%Y-%m-%d', time.localtime())
        # Keep the raw row HTML with all whitespace stripped.
        item['source_page'] = re.sub(r'[ \t\n\r]', '', str(each))
        item['data_id'] = ''
        item['data_source'] = self.name
        item['del_flag'] = ''
        item['op_flag'] = ''
        item['case_no'] = ''
        item['reg_no'] = ''
        yield item
def parse(self, response):
    li_list = response.xpath('//table[@class="noticelist-t"]/tr')
    for li in li_list:
        item = crawler114()
        title = li.xpath('./td[1]/a/text()').extract_first()
        href = li.xpath('./td[1]/a/@href').extract_first()
        href = 'http://gx.gsxt.gov.cn' + href
        item['pun_org'] = li.xpath('./td[2]/text()').extract_first()
        pun_date = li.xpath('./td[3]/text()').extract_first()
        item['pun_date'] = pun_date
        # Strip the notice boilerplate around the company name, e.g.
        # "关于将XX列入经营异常名录的公告" -> "XX".
        item['ent_name'] = title.replace(u'关于将', '').replace(u'列入经营异常名录的公告', '')
        hashcode = hash(title + pun_date)
        item['data_id'] = 'gx' + '-' + str(hashcode)
        yield scrapy.Request(href, callback=self.parse_detail, meta={'item': item})

    # 翻页
    cur_page = response.url.split('pageNos=')[-1]
    total_pages = response.xpath(
        '//div[@class="pages"]/span[2]/text()').extract_first()
    total_pages = total_pages.replace(u'\xa0', u' ')
    total_pages = re.findall(r'(\d+)', total_pages)[0]
    if int(cur_page) < int(total_pages):
        next_page = int(cur_page) + 1
        next_href = ('http://gx.gsxt.gov.cn/xxgg/xxggAction!queryGgxx.dhtml'
                     '?vchr_bmdm=&notitype=11&noticeTitle='
                     '%E8%AF%B7%E8%BE%93%E5%85%A5%E9%9C%80%E8%A6%81%E6%9F%A5'
                     '%E8%AF%A2%E4%BF%A1%E6%81%AF&pageNos=' + str(next_page))
        yield scrapy.Request(next_href, callback=self.parse)
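Note that in Python 3 the built-in hash() of a str is salted per process, so the data_id values built from hash(title + pun_date) here (and in the other spiders below) change between crawl runs and cannot deduplicate across runs. A minimal sketch of a run-stable alternative using hashlib; the helper name stable_data_id is hypothetical, not part of the original code:

import hashlib

def stable_data_id(prefix, *parts):
    # md5 over the concatenated fields is stable across processes,
    # unlike the per-process-salted built-in hash() used above.
    digest = hashlib.md5(''.join(parts).encode('utf-8')).hexdigest()
    return prefix + '-' + digest

# usage: item['data_id'] = stable_data_id('gx', title, pun_date)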
def parse_detail(self, response):
    item = crawler114()
    content = response.body
    soup = BeautifulSoup(content, "lxml")
    tag_p = soup.find_all('p')
    if len(tag_p) >= 2:
        case_no = tag_p[0].get_text(strip=True)
        pun_reason = tag_p[1].get_text(strip=True)
    else:
        case_no = None
        pun_reason = None
    hashcode = hash(response.meta['ent_name'] + response.meta['pun_date'])
    item['case_no'] = case_no
    item['punish_org'] = response.meta['pun_org']
    item['punish_date'] = response.meta['pun_date']
    item['entity_name'] = response.meta['ent_name']
    item['punish_reason'] = pun_reason
    item['data_source'] = 'crawler114_6'
    item['del_flag'] = '0'
    item['op_flag'] = 'a'
    item['create_date'] = time.strftime('%Y-%m-%d', time.localtime())
    item['source_url'] = response.url
    item['source_page'] = content
    # Use the hash in the id, matching the sibling spiders' convention;
    # the original computed hashcode but left data_id as the bare prefix.
    item['data_id'] = 'yunnan' + '-' + str(hashcode)
    item['spider_name'] = self.name
    yield item
def parse_detail(self, response):
    item = crawler114()
    data = json.loads(response.body.decode('utf-8'))
    item['source_url'] = response.url
    item['spider_name'] = self.name
    item['source_page'] = data
    item['create_date'] = time.strftime('%Y-%m-%d', time.localtime())
    item['case_no'] = data['NOTNO']
    ent_name = data['ENTNAME']
    item['ent_name'] = ent_name
    item['pun_org'] = data['DECORGNAME']
    pun_date = data['ABNTIMEString']
    item['pun_date'] = pun_date
    item['reg_no'] = data['REGNO']
    # Assemble the reason text from the fact and the legal-basis fields.
    item['pun_reason'] = (u'经查,你单位因' + data['FACTANDRULE'] + u',违反了'
                          + data['BASISINFO'] + u'的规定,现决定将其列入经营异常名录。')
    item['del_flag'] = '0'
    item['op_flag'] = 'a'
    item['data_source'] = self.name
    hashcode = hash(ent_name + pun_date)
    item['data_id'] = 'jl' + '-' + str(hashcode)
    item['report_year'] = ''
    item['notice_id'] = ''
    yield item
def parse(self, response):
    news_li = response.xpath('//ul[@class="news_li"]/li')
    for li in news_li:
        item = crawler114()
        title = li.xpath('./p//a/text()').extract_first()
        # "关于XX移出经营异常名录公告" -> "XX".
        ent_name = title.replace(u'关于', '').replace(u'移出经营异常名录公告', '')
        item['ent_name'] = ent_name
        href = li.xpath('./p//a/@href').extract_first()
        detail_url = 'http://nx.gsxt.gov.cn' + href
        item['pun_org'] = li.xpath('./p//span/text()').extract_first()
        pun_date = li.xpath('./em/text()').extract_first()
        item['pun_date'] = pun_date
        hashcode = hash(ent_name + pun_date)
        item['data_id'] = 'nx' + '-' + str(hashcode)
        item['data_source'] = self.name
        yield scrapy.Request(detail_url, callback=self.parse_detail, meta={'item': item})

    # 翻页
    cur_page = response.url.split("currPage=")[-1]
    total_pages = response.xpath('.//input[@id="countPage"]/@value').extract_first()
    if int(cur_page) < int(total_pages):
        next_page = int(cur_page) + 1
        next_href = ('http://nx.gsxt.gov.cn/noticeAction_noticeList.action'
                     '?noticeType=11&currPage=' + str(next_page))
        yield scrapy.Request(next_href, callback=self.parse)
def parse_detail(self, response):
    item = crawler114()
    content = response.body
    soup = BeautifulSoup(content, "lxml")
    tag_p = soup.find_all('p')
    case_no = tag_p[0].get_text(strip=True).replace(u'公告号:', '')
    pun_reason = tag_p[1].get_text(strip=True)
    item['case_no'] = case_no
    item['punish_org'] = response.meta['pun_org']
    item['punish_date'] = response.meta['pun_date']
    item['entity_name'] = response.meta['ent_name']
    item['punish_reason'] = pun_reason
    item['data_source'] = 'crawler114_7'
    item['data_id'] = 'shanghai'
    item['del_flag'] = '0'
    item['op_flag'] = 'a'
    item['create_date'] = time.strftime('%Y-%m-%d', time.localtime())
    item['source_url'] = response.url
    item['source_page'] = content
    item['spider_name'] = self.name
    yield item
def parse_detail(self, response):
    item = crawler114()
    content = response.body
    soup = BeautifulSoup(content, "lxml")
    tag_p = soup.find_all('p', attrs={"class": False})
    tag_span = soup.find_all('span', attrs={"lang": "EN-US"})
    case_no = tag_span[2].get_text(strip=True)
    # get_text() already returns str; encoding to bytes before calling
    # replace() with str arguments raised TypeError under Python 3.
    ent_name = tag_p[8].get_text(strip=True).replace(':', '')
    reason = tag_p[9].get_text(strip=True).replace(' ', '')
    item['case_no'] = case_no
    item['entity_name'] = ent_name
    item['punish_reason'] = reason
    item['punish_org'] = response.meta['pun_org']
    item['punish_date'] = response.meta['pun_date']
    item['data_id'] = 'hlj'
    item['data_source'] = 'crawler114_3'
    item['del_flag'] = '0'
    item['op_flag'] = 'a'
    item['create_date'] = time.strftime('%Y-%m-%d', time.localtime())
    item['source_url'] = response.url
    item['source_page'] = content
    item['spider_name'] = self.name
    yield item
def parseJson(self, response):
    r = json.loads(response.text)
    for each in r['data']:
        item = crawler114()
        item['data_id'] = each['NOTICEID']
        item['release_org'] = each['JUDAUTH_CN']
        item['release_date'] = each['NOTICEDATE']
        title = each['ENTNAME']
        item['entity_name'] = title.replace(u'关于', '').replace(u'列入经营异常名录公告', '')
        url = ('http://jx.gsxt.gov.cn/affichebase/queryAffichebaseFinallyDetails.do'
               '?noticeid={}&noticetype=12')
        yield scrapy.Request(url.format(item['data_id']),
                             meta={'item': item},
                             dont_filter=True,
                             callback=self.parseDetail)
def parse_detail(self, response):
    item = crawler114()
    content = response.body
    # extract_first() returns None when the node is missing, which
    # replaces the bare try/except IndexError guards of the original.
    case_no = response.xpath(
        '/html/body/form/div[1]/div[4]/div/div/div/div[2]/div/div/p[2]/text()'
    ).extract_first()
    pun_reason = response.xpath(
        '/html/body/form/div[1]/div[4]/div/div/div/div[2]/div/div/p[4]/text()'
    ).extract_first()
    item['case_no'] = case_no
    item['punish_org'] = response.meta['pun_org']
    item['punish_date'] = response.meta['pun_date']
    item['entity_name'] = response.meta['ent_name']
    item['punish_reason'] = pun_reason
    item['data_source'] = 'crawler114_8'
    item['del_flag'] = '0'
    item['op_flag'] = 'a'
    item['create_date'] = time.strftime('%Y-%m-%d', time.localtime())
    item['source_url'] = response.url
    item['source_page'] = content
    item['data_id'] = 'shanxi'
    item['spider_name'] = self.name
    yield item
def parseDetail(self, response):
    soup = BeautifulSoup(response.text, 'lxml')
    trs = soup.find_all('tr')
    for each in trs:
        tds = each.find_all('td')
        # Skip header or blank rows that lack the expected four cells.
        if len(tds) < 4:
            continue
        item = crawler114()
        if tds[2].get_text(strip=True):
            item['punish_agent'] = tds[2].get_text(strip=True)
            item['punish_date'] = tds[3].get_text(strip=True)
            item['spider_name'] = item['data_source'] = self.name
        if tds[1].a:
            item['notice_id'] = tds[1].a.attrs['href'].split('id=')[-1]
            item['entity_name'] = tds[1].get_text(strip=True).replace(u'关于', '').replace(u'列入经营异常名录公告', '')
            item['source_url'] = 'http://ah.gsxt.gov.cn' + tds[1].a.attrs['href']
            yield scrapy.Request(item['source_url'],
                                 meta={'item': item},
                                 dont_filter=True,
                                 callback=self.parsePageDetail)
def parse(self, response):
    li_list = response.xpath('//table/tr')[:-1]
    for li in li_list:
        item = crawler114()
        title = li.xpath('./td[2]/a/text()').extract_first()
        ent_name = title.replace(u'关于', '').replace(u'企业列入经营异常名录公告', '')
        ent_name = ent_name.replace(u'\xa0', u' ').replace('\r\n', '').replace(' ', '')
        item['ent_name'] = ent_name
        item['pun_org'] = li.xpath('./td[@id="A5"]/text()').extract_first()
        item['pun_org'] = item['pun_org'].replace(' ', '').replace('\r\n', '')
        pun_date = li.xpath('./td[@class="td4"]/span/text()').extract_first()
        item['pun_date'] = pun_date
        # Fall back to the name alone when the date cell is empty.
        if pun_date:
            hashcode = hash(ent_name + pun_date)
        else:
            hashcode = hash(ent_name)
        item['data_source'] = self.name
        item['del_flag'] = '0'
        item['op_flag'] = 'a'
        item['data_id'] = 'sx' + '-' + str(hashcode)
        detail_url = li.xpath('.//a[@id="A3"]/@href').extract_first()
        detail_url = 'http://sx.gsxt.gov.cn/' + detail_url
        yield scrapy.Request(detail_url, callback=self.parse_detail, meta={'item': item})

    # 翻页
    ret = response.xpath('.//div[@class="newfy"]/ul/li[2]/text()').extract_first()
    total_pages = re.findall(r'(\d+)', ret)[0]
    cur_page = response.url.split('pageNo=')[-1].split('&')[0]
    if int(cur_page) < int(total_pages) - 1:
        next_page = int(cur_page) + 1
        next_href = ('http://sx.gsxt.gov.cn/ycmlNoticeInfo.jspx?mark=01&pageNo='
                     + str(next_page) + '&order=2&title=&area=')
        yield scrapy.Request(next_href, callback=self.parse)
def parse_detail(self, response):
    content = response.body
    soup = BeautifulSoup(content, "lxml")
    item = crawler114()
    pun_reason = soup.find_all(class_='Section1')[0].find_all('p')[1].get_text()
    item['punish_reason'] = pun_reason
    item['entity_name'] = response.meta['ent_name']
    item['punish_org'] = response.meta['pun_org']
    item['punish_date'] = response.meta['pun_date']
    item['source_url'] = response.url
    item['source_page'] = content
    item['create_date'] = time.strftime('%Y-%m-%d', time.localtime())
    item['data_id'] = 'henan'
    item['data_source'] = 'crawler114_5'
    item['del_flag'] = '0'
    item['op_flag'] = 'a'
    item['spider_name'] = self.name
    yield item
def parseJson(self, response):
    soup = BeautifulSoup(response.text, 'lxml')
    # An "ip" div in the markup means the anti-crawler page was served.
    if not soup.find('div', class_=re.compile(u'ip')):
        # The JSON payload follows an inline <script> block.
        r = json.loads(response.text.split('</script>')[-1])
        data = r['result']['data']
        for each in data:
            item = crawler114()
            item['punish_date'] = each['date']
            item['entity_name'] = each['etpName']
            item['source_url'] = ('http://hn.gsxt.gov.cn/notice/search/announce_detail'
                                  '?uuid={}&category=01&categorySub=01').format(each['link'])
            item['punish_agent'] = each['orgName']
            item['notice_id'] = each['link']
            item['spider_name'] = item['data_source'] = self.name
            yield scrapy.Request(item['source_url'],
                                 meta={'item': item},
                                 dont_filter=True,
                                 callback=self.parsePageDetail)
    else:
        print('=====================ip blocked========================',
              soup.find('div', class_=re.compile(r'ip')).get_text(strip=True))
def parseJson(self, response):
    soup = BeautifulSoup(response.text, 'lxml')
    if not soup.find('div', class_=re.compile(u'ip')):
        r = json.loads(response.text.split('</script>')[-1])
        data = r['result']['data']
        for each in data:
            item = crawler114()
            item['spider_name'] = item['data_source'] = self.name
            item['release_date'] = each['date']
            item['entity_name'] = each['etpName']
            item['source_url'] = 'http://sc.gsxt.gov.cn/notice/' + each['link']
            item['release_org'] = each['orgName']
            item['data_id'] = each['link'].split('uuid=')[-1].split('&')[0]
            yield scrapy.Request(item['source_url'],
                                 meta={'item': item},
                                 dont_filter=True,
                                 callback=self.parsePageDetail)
    else:
        print('=====================ip blocked========================')
        print(soup.find('div', class_=re.compile(r'ip')).get_text(strip=True))
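Both parseJson methods above only print a message when the block page is served, so that batch of notices is silently lost. A minimal sketch of re-queueing the blocked URL instead, using Scrapy's standard Request API and spider logger; the helper name retry_blocked and the retry cap are hypothetical, and it assumes a later attempt can succeed (e.g. after a proxy rotation configured elsewhere):

def retry_blocked(self, response, max_retries=3):
    # Build a retry Request for a blocked page; returns None once the
    # cap is hit so the caller can drop the URL. dont_filter bypasses
    # Scrapy's duplicate filter for the repeat fetch.
    retries = response.meta.get('block_retries', 0)
    if retries >= max_retries:
        self.logger.warning('giving up on %s after %d retries', response.url, retries)
        return None
    return scrapy.Request(response.url,
                          meta={'block_retries': retries + 1},
                          dont_filter=True,
                          callback=self.parseJson)

# in the else branch above:
#     req = self.retry_blocked(response)
#     if req:
#         yield req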
def parse_detail(self, response):
    item = crawler114()
    content = response.body
    soup = BeautifulSoup(content, "lxml")
    p_tags = soup.find_all('p')
    pun_reason = p_tags[1].get_text(strip=True).replace(' ', '')
    item['entity_name'] = response.meta['ent_name']
    item['punish_org'] = response.meta['pun_org']
    item['punish_date'] = response.meta['pun_date']
    item['punish_reason'] = pun_reason
    item['data_id'] = 'qinghai'
    item['data_source'] = 'crawler114_1'
    item['del_flag'] = '0'
    item['op_flag'] = 'a'
    item['create_date'] = time.strftime('%Y-%m-%d', time.localtime())
    item['source_url'] = response.url
    item['source_page'] = content
    item['spider_name'] = self.name
    yield item
def parse(self, response):
    li_list = response.xpath('//tbody/tr')
    for li in li_list:
        item = crawler114()
        item['source_url'] = response.url
        item['spider_name'] = self.name
        item['source_page'] = response.text
        ent_name = li.xpath('./td[1]/a/text()').extract_first().replace(' ', '')
        item['ent_name'] = ent_name
        item['pun_org'] = li.xpath('./td[2]/text()').extract_first().replace('\t', '').replace('\r', '').replace('\n', '')
        item['pun_reason'] = li.xpath('./td[3]/text()').extract_first().replace('\t', '').replace('\r', '').replace('\n', '').replace(u'\xa0', u' ')
        pun_date = li.xpath('./td[5]/text()').extract_first().replace('\t', '').replace('\r', '').replace('\n', '')
        item['pun_date'] = pun_date
        hashcode = hash(ent_name + pun_date)
        item['data_source'] = self.name
        item['del_flag'] = '0'
        item['op_flag'] = 'a'
        item['data_id'] = 'tj' + '-' + str(hashcode)
        item['create_date'] = time.strftime('%Y-%m-%d', time.localtime())
        item['case_no'] = ''
        item['reg_no'] = ''
        item['report_year'] = ''
        item['notice_id'] = ''
        yield item

    # 翻页: the page counters live in inline JS variables on the page.
    total_pages = re.findall(r'var maxPage = (.*?);', response.text)[0]
    cur_page = re.findall(r'var pageIndex = (.*?);', response.text)[0]
    if int(cur_page) < int(total_pages):
        next_page = int(cur_page) + 1
        yield scrapy.FormRequest(response.url,
                                 formdata={'pageIndex': str(next_page),
                                           'pageSize': '10',
                                           'entname': ''},
                                 callback=self.parse)
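The chained str.replace calls above (and in several of the other parse methods) can be collapsed into one regex pass. A sketch of a shared helper; the name clean_text is hypothetical, and it also removes ordinary spaces, which a few of the spiders intentionally keep:

import re

def clean_text(value):
    # Drop tabs, CR/LF, non-breaking spaces and ordinary spaces in one
    # pass; returns '' for None so extract_first() misses are safe.
    return re.sub(r'[\s\xa0]+', '', value or '')

# usage: item['pun_org'] = clean_text(li.xpath('./td[2]/text()').extract_first())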