def company_information(self, response):
    """Extract the company name and registration number from the page.

    NOTE(review): the extracted values are not returned, stored or
    yielded — presumably work in progress; confirm against the spider flow.
    """
    sel = Selector(response=response)
    company_name = sel.xpath('//td[@style="text-align: left"]/text()').extract_first()
    number = sel.xpath('//td[@width="150"]')[2].xpath('text()').extract_first()
    # Keep only the first whitespace-delimited token of each value.
    company_name = company_name.split()[0]
    number = number.split()
def company_information(self, response):
    """Parse a company detail page, fill self.data and POST it to the API.

    Missing or malformed fields are stored as ''.

    Fixes:
    - ``address.split()[0]`` previously ran *before* the None check, so a
      missing address raised AttributeError and the dead ``if address == None``
      branch never executed.
    - ``!= None`` replaced with the idiomatic ``is not None``.
    """
    sel = Selector(response=response)
    company_name = sel.xpath('//td[@class="name_level3"]/text()').extract_first()
    number = sel.xpath('//td[@id="LicenseNum"]/text()').extract_first()
    address = sel.xpath('//td[@id="Description"]/text()').extract_first()
    # First whitespace-delimited token is the clean company name.
    company_name = company_name.split()[0]
    self.data['companyName'] = company_name
    # A valid unified social credit code is exactly 18 characters.
    if number is not None:
        number = number.split()[0]
        self.data['licenseNum'] = number if len(number) == 18 else ''
    else:
        self.data['licenseNum'] = ''
    if address is not None:
        self.data['contactAddress'] = address.split()[0]
    else:
        self.data['contactAddress'] = ''
    print(self.data)
    yield scrapy.Request(
        url='https://api.maotouin.com/rest/companyInfo/addCompanyRecord.htm',
        # url='http://192.168.199.188:8080/web/rest/companyInfo/addCompanyRecord.htm',
        method="POST",
        headers={'Content-Type': 'application/json'},
        body=json.dumps(self.data),
        callback=self.zz,
        meta={'company_name': company_name, 'data': self.data})
def xt_admin_date(cls, raw_person):
    """Extract the administration start/end dates from a raw person row.

    Returns a ``(start_date, end_date)`` tuple of ``datetime.date``;
    ``end_date`` is None for open-ended ranges, and ``(None, None)`` is
    returned when the string cannot be parsed.

    Fixes: removed the leftover ``ipdb.set_trace()`` debugger call and the
    bare ``except`` — previously a parse failure dropped into a debugger and
    then raised NameError on the return because the dates were never bound.
    """
    admin_datestring = Selector(text=raw_person).xpath(
        '//td[1]/span/@title').extract()[0]
    # Keep only the first administration when several are listed.
    if ';' in admin_datestring:
        admin_datestring = admin_datestring.split(";")[0]
    if ',' in admin_datestring:
        admin_datestring = admin_datestring.split(",")[0]
    try:
        if " - " in admin_datestring:
            # Closed range "dd.mm.yyyy - dd.mm.yyyy".
            start_date = _clean(admin_datestring.split(' - ')[0])
            end_date = _clean(admin_datestring.split(' - ')[1])
            start_date = datetime.datetime.strptime(
                start_date, "%d.%m.%Y").date()
            end_date = datetime.datetime.strptime(
                end_date, "%d.%m.%Y").date()
        else:
            # Open-ended administration: only a start date.
            start_date = datetime.datetime.strptime(
                _clean(admin_datestring), "%d.%m.%Y").date()
            end_date = None
    except ValueError:
        # strptime signals malformed input with ValueError only.
        logger.error(
            "Couldn't extract date from datestring {}".format(
                admin_datestring))
        start_date = end_date = None
    return (start_date, end_date)
def company_information(self, response):
    """Parse a company page, dedupe by name via redis, and POST the record."""
    sel = Selector(response=response)
    company_name = sel.xpath('//td[@colspan="3"]/text()').extract_first()
    number = sel.xpath('//table[@class="detailTable"]')[0] \
        .xpath('./tr[2]/td[4]/text()').extract_first()
    company_name = company_name.split()[0]
    # sadd returns 0 when the name was already present (duplicate company).
    if not self.r.sadd('Company_name', company_name):
        print('此公司信息已经存在', company_name)
        return
    tokens = number.split()
    license_num = tokens[0] if tokens else ''
    # Only an 18-character unified social credit code is kept.
    if len(license_num) != 18:
        license_num = ''
    self.data['licenseNum'] = license_num
    self.data['companyName'] = company_name
    yield scrapy.Request(
        url='https://api.maotouin.com/rest/companyInfo/addCompanyRecord.htm',
        # url='http://192.168.199.188:8080/web/rest/companyInfo/addCompanyRecord.htm',
        method="POST",
        headers={'Content-Type': 'application/json'},
        body=json.dumps(self.data),
        callback=self.zz,
        meta={'company_name': company_name, 'data': self.data})
def company_information(self, response):
    """Parse a company page, dedupe via redis, POST the record to the API."""
    sel = Selector(response=response)
    company_name = sel.xpath(
        '//span[@id="ctl00_ContentPlaceHolder1_FormView1_Label10"]/text()'
    ).extract_first()
    number = sel.xpath('//td[@class="inquiry_intitleb"]')[5].xpath(
        './span/text()').extract_first()
    company_name = company_name.split()[0]
    # sadd returns 0 for an already-seen name (duplicate).
    if self.r.sadd('Company_name', company_name):
        self.data['companyName'] = company_name
        if number is not None:
            first = number.split()[0]
            # Only store a well-formed 18-character credit code.
            if len(first) == 18:
                self.data['licenseNum'] = first
        print(self.data)
        yield scrapy.Request(
            url='https://api.maotouin.com/rest/companyInfo/addCompanyRecord.htm',
            # url='http://192.168.199.188:8080/web/rest/companyInfo/addCompanyRecord.htm',
            method="POST",
            headers={'Content-Type': 'application/json'},
            body=json.dumps(self.data),
            callback=self.zz,
            meta={'company_name': company_name, 'data': self.data})
        person_zz = sel.xpath('//table[@id="GridView2"]')
        print(len(person_zz), 'zzzzzzzzzzzzzzzzzzzzzzz')
    else:
        print('此公司信息已经存在', company_name)
def company_information(self, response):
    """Parse a company detail page into self.data and POST it to the API.

    Missing fields become ''; the phone placeholder '/' is also treated as
    missing; the licence number is kept only when it is 18 characters long.
    The repeated "first token or ''" pattern is factored into a helper.

    NOTE(review): the td ids look swapped (address <- #LicenseNum,
    phone <- #RegPrin, person <- #EconType) — confirm against the page.
    """
    def first_token(raw):
        # First whitespace-delimited token of raw, or '' when raw is None.
        return raw.split()[0] if raw is not None else ''

    sel = Selector(response=response)
    company_name = sel.xpath(
        '//td[@class="name_level3"]')[0].xpath('text()').extract_first()
    number = sel.xpath('//td[@id="Td3"]/text()').extract_first()
    person = sel.xpath('//td[@id="EconType"]/text()').extract_first()
    address = sel.xpath('//td[@id="LicenseNum"]/text()').extract_first()
    phone = sel.xpath('//td[@id="RegPrin"]/text()').extract_first()
    company_name = company_name.split()[0]

    phone = first_token(phone)
    # '/' is the site's "no phone" placeholder.
    self.data['contactPhone'] = '' if phone == '/' else phone
    self.data['contactMan'] = first_token(person)
    self.data['contactAddress'] = first_token(address)
    number = first_token(number)
    # A valid unified social credit code is exactly 18 characters.
    self.data['licenseNum'] = number if len(number) == 18 else ''
    self.data['companyName'] = company_name
    # print(self.data)
    yield scrapy.Request(
        url='https://api.maotouin.com/rest/companyInfo/addCompanyRecord.htm',
        # url='http://192.168.199.188:8080/web/rest/companyInfo/addCompanyRecord.htm',
        method="POST",
        headers={'Content-Type': 'application/json'},
        body=json.dumps(self.data),
        callback=self.zz,
        meta={'company_name': company_name, 'data': self.data})
def company_info(self, response):
    """Scrape the credit-evaluation table rows for one company and print each."""
    sel = Selector(response=response)
    company_name = sel.xpath('//td[@colspan="3"]')[0].xpath('./a/@title').extract_first()
    number = sel.xpath('//td[@colspan="3"]')[3].xpath('text()').extract_first()
    if number.split():
        number = number.split()[0]
        # Anything that is not an 18-char credit code is discarded.
        if len(number) != 18:
            number = ''
    tr = sel.xpath('//table[@id="table_credit"]/tbody/tr')
    just_z = sel.xpath(
        '//table[@id="table_credit"]/tbody/tr[1]/td[1]/text()').extract_first()
    # The site renders a single "no data" row when the table is empty.
    if just_z == '没有相关数据':
        return
    for row in tr:
        cells = row.xpath('./td/text()')
        credit_evaluate = {
            # category
            'type_name': cells[0].extract().split()[0],
            'e_result': '',
            'have_date': '',
            'department': '',
            'validity_time': '',
            'company_name': company_name,
            'number': number,
        }
        # evaluation result, issue date, evaluating body, validity period
        for key, idx in (('e_result', 1), ('have_date', 2),
                         ('department', 3), ('validity_time', 4)):
            value = cells[idx].extract()
            if value is not None:
                credit_evaluate[key] = value
        print(credit_evaluate)
def company_info(self, response):
    """Scrape the good-conduct (table_good) rows for one company and print each.

    Fix: the award-number guard was ``is not None or != '/'`` which is always
    true; the intent is to skip both missing values and the '/' placeholder,
    so it now uses ``and``.
    """
    sel = Selector(response=response)
    company_name = sel.xpath('//td[@colspan="3"]')[0].xpath('./a/@title').extract_first()
    number = sel.xpath('//td[@colspan="3"]')[3].xpath('text()').extract_first()
    if number.split():
        number = number.split()[0]
        # Anything that is not an 18-char credit code is discarded.
        if len(number) != 18:
            number = ''
    tr = sel.xpath('//table[@id="table_good"]/tbody/tr')
    if tr:
        for t in tr:
            good_action = {'project_name': '',
                           'good_grade': '',
                           'have_date': '',
                           'send_department': '',
                           'have_number': '',
                           'company_name': company_name,
                           'number': number
                           }
            # project name
            if len(t.xpath('./td/text()')) >= 2:
                try:
                    project_name = t.xpath('./td/text()')[0].extract().split()[0]
                    good_action['project_name'] = project_name
                except IndexError:
                    continue
            # award grade
            good_grade = t.xpath('./td/text()')[1].extract()
            if good_grade is not None and good_grade.split():
                good_action['good_grade'] = good_grade.split()[0]
            # issuing unit
            send_department = t.xpath('./td/text()')[2].extract()
            if send_department is not None:
                good_action['send_department'] = send_department.split()[0]
            # award document number; '/' is the site's "none" placeholder
            have_number = t.xpath('./td/text()')[3].extract()
            if have_number is not None and have_number != '/':
                good_action['have_number'] = have_number
            # award date
            try:
                have_date = t.xpath('./td/text()')[4].extract()
                if have_date is not None:
                    good_action['have_date'] = have_date
            except IndexError:
                continue
            print(good_action)
def company_information(self, response):
    """Parse a Zhejiang company detail page and POST the record to the API.

    Fixes:
    - ``number.split()`` / ``person_name.split()`` raised AttributeError when
      extract_first() returned None; missing fields now fall back to ''.
    - misspelled local ``adderss`` renamed; leftover 'AAAA...' debug prints
      removed.
    """
    sel = Selector(response=response)
    company_name = sel.xpath('//td[@colspan="5"]')[0].xpath('text()').extract_first()
    address = sel.xpath('//td[@colspan="5"]/text()')[1].extract()
    number = sel.xpath(
        '//div[@class="detail_list"]/table/tr[2]/td[6]/text()').extract_first()
    person_name = sel.xpath(
        '//div[@class="detail_list"]/table/tr[7]/td[2]/text()').extract_first()

    data = {}
    data['companyName'] = company_name
    # Licence number must be exactly 18 characters to be stored.
    tokens = number.split() if number is not None else []
    data['licenseNum'] = tokens[0] if tokens and len(tokens[0]) == 18 else ''
    name_tokens = person_name.split() if person_name is not None else []
    data['contactMan'] = name_tokens[0] if name_tokens else ''
    data['contactAddress'] = address.split()[0] if address is not None else ''
    data['companyArea'] = '浙江省'
    data['area'] = ''
    data['contactPhone'] = ''
    data['token'] = self.token
    print(data)
    yield scrapy.Request(
        url='https://api.maotouin.com/rest/companyInfo/addCompanyRecord.htm',
        # url='http://192.168.199.188:8080/web/rest/companyInfo/addCompanyRecord.htm',
        method="POST",
        headers={'Content-Type': 'application/json'},
        body=json.dumps(data),
        callback=self.zz,
        meta={'company_name': company_name, 'data': data},
        dont_filter=True
    )
def parse(self, response):
    """Parse an NCAA rankings table page into Ranking items.

    Pages whose URL does not contain the rankings path are skipped.
    Fix: the bare ``except`` (which even swallowed KeyboardInterrupt) is
    narrowed to ``Exception``.
    """
    ranking = Ranking()
    formatter = Data_Formatter()
    url = response.url
    link_checker = "www.ncaa.com/rankings"
    if link_checker not in url:
        print(
            "---------------------------------------------------------------------"
        )
        print("Invalid link: {}, skipping".format(url))
        print(
            "---------------------------------------------------------------------"
        )
    else:
        table_body = response.xpath(
            '//*[@id="block-bespin-content"]/div/article/table/tbody[1]')
        rows = table_body.css('tr').getall()
        for row in rows:
            data_cells = Selector(text=row).css('td').getall()
            rank = Selector(text=data_cells[0]).css('td::text').get()
            team = Selector(text=data_cells[1]).css('td::text').get()
            try:
                # Team cells look like "Name (record)"; keep the name only.
                team = team.split('(')[0].strip()
                ranking['rank'] = rank
                team = formatter.lengthen_abbreviated(team)
                ranking['team'] = team
                yield ranking
            except Exception:
                # team may be None (no text node) or the formatter may fail.
                print("Error with splitting team ranking item")
        print("Scraped Rankings @ {}".format(response.url))
def parse_comment(self, response):
    """Yield a CommentItem for every short comment on a Douban comment page."""
    movie_name = Selector(response).xpath(
        '//h1/text()').extract_first().replace('短评', '').strip()
    comments = SelectorList(
        Selector(response).xpath('//div[@class="comment"]').extract())
    for raw in comments:
        block = Selector(text=raw)
        shorts = block.xpath('//p/span/text()').extract_first()
        votes = block.xpath(
            '//h3/span[@class="comment-vote"]/span/text()').extract_first()
        stars = block.xpath(
            '//h3/span[@class="comment-info"]/span[contains(@class,"rating")]/@class'
        ).extract_first()
        # "allstar40 rating" -> "4"; no rating element -> 0.
        if stars:
            stars = stars.split()[0].replace('allstar', '').strip().replace('0', '')
        else:
            stars = 0
        comment_time = block.xpath(
            '//h3/span[@class="comment-info"]/span[@class="comment-time "]/text()'
        ).extract_first()
        item = CommentItem()
        item['movie_name'] = movie_name
        item['shorts'] = shorts
        item['stars'] = stars
        item['votes'] = votes
        item['comment_time'] = comment_time
        yield item
def GetPdfInfo(self, station_id):
    """Fetch the check-draft page for a station and return its pdf link info.

    Returns {'download_url': ..., 'file_name': ...} on success, or None
    when the HTTP status is not 200.
    """
    # Query parameters for the check-draft endpoint.
    params = {'Type': 'pdf', 'ID': station_id}
    url = self.checkdraft_url
    ret = self.session.get(url, params=params)
    if ret.status_code != 200:
        print('failed to get pdf information')
        return None
    page = Selector(text=ret.text)
    download_url = page.xpath('//a/@href').extract()[0]
    file_name = page.xpath('//a/text()').extract()[0]
    pdf_info = {
        'download_url': download_url,
        # Keep the part after the first '/' of the link text.
        'file_name': file_name.split('/')[1],
    }
    print(pdf_info)
    return pdf_info
def change_person_data(self, response):
    """Parse the change-record (变更记录) table into change_data dicts.

    Fix: the company xpaths ended in ``/text`` (an *element* test matching
    a <text> tag) instead of the ``text()`` node test, so extract_first()
    always returned None for original_company / now_z_company.
    """
    change_data = {}
    # Change-record page.
    name = '变更记录'
    grade = Selector(response=response).xpath(
        '//tbody/tr/td[1]/text()').extract_first()
    grade = grade.split(' ')[0]
    if grade == '暂未查询到已登记入库信息':
        # Site reports no registered records for this person.
        print('zzzzzzzzzzzzzzzzzzz')
    else:
        change_data['grade'] = grade
        now_company = Selector(response=response).xpath(
            '//div[@class="curQy"]/span/text()').extract_first()
        change_data['now_company'] = now_company
        change_record = Selector(
            response=response).xpath('//ul[@class="cbp_tmtimeline"]/li')
        for c in change_record:
            year = c.xpath('./div[1]/span[1]/text()').extract_first()
            month_day = c.xpath('./div[1]/span[2]/text()').extract_first()
            # Date of the change (year string + month/day string).
            date = year + month_day
            change_data['date'] = date
            # Previous employer.
            original_company = c.xpath(
                './div[@class="cbp_tmlabel"]/p/span[1]/text()').extract_first()
            change_data['original_company'] = original_company
            # Current employer.
            now_z_company = c.xpath(
                './div[@class="cbp_tmlabel"]/p/span[2]/text()').extract_first()
            change_data['now_z_company'] = now_z_company
def achivment(self, response):
    """Parse the personal credit-record table rows into merit dicts.

    Fix: ``start_date.slipt(':')`` was a typo for ``split`` and raised
    AttributeError at runtime for every 决定内容 cell.
    """
    change_data = {}
    person_name = response.meta['item']['name']
    name = '个人功绩'
    print(name, 'zz')
    change_data['person_name'] = person_name
    grade = Selector(response=response).xpath(
        '//tbody/tr/td[1]/text()').extract_first()
    grade = grade.split(' ')[0]
    if grade == '暂未查询到已登记入库信息':
        # Site reports no registered records for this person.
        print('暂时无数据')
    else:
        content = Selector(response=response).xpath('//tbody/tr')
        for c in content:
            td = c.xpath('./td')
            merit = {}
            # Each cell carries its column header in @data-header.
            for t in td:
                field_name = t.xpath('@data-header').extract_first()
                field_name = field_name.split()[0]
                if field_name == '诚信记录编号':
                    # credit record number
                    merit['serial_number'] = t.xpath('./span/text()').extract_first()
                elif field_name == '诚信记录主体':
                    # record subject (person)
                    value = t.xpath('./a/text()').extract_first()
                    merit['person_name'] = value.split()[0]
                elif field_name == '决定内容':
                    # decision content: result, action, start date, issuer
                    result = t.xpath('text()')[1].extract()
                    merit['result'] = result.split()[0]
                    what_action = t.xpath('./div/span[1]/text()').extract_first()
                    what_action = what_action.replace('【', '').replace('】', '')
                    merit['what_action'] = what_action
                    start_date = t.xpath('./div/span[2]/text()').extract_first()
                    # BUG FIX: was start_date.slipt(':') -> AttributeError.
                    start_date = start_date.split(':')[1]
                    merit['start_date'] = start_date
                    merit['department'] = t.xpath('./div/a/@data-no').extract_first()
                    merit['content'] = t.xpath('./div/a/@data-text').extract_first()
                elif field_name == '实施部门(文号)':
                    # implementing department (document number)
                    value = t.xpath('text()').extract_first()
                    merit['a_department'] = value.split()[0]
                elif field_name == '发布有效期':
                    # publication validity period
                    merit['project_type'] = t.xpath('text()').extract_first()
            merit['token'] = self.token
            merit['corporate_name'] = self.corporate_name
            print(merit)
def company_information(self, response):
    """Build the Corp_Project query URL for this company and request it."""
    sel = Selector(response=response)
    company_name = sel.xpath('//td[@class="name_level3"]/text()').extract_first()
    number = sel.xpath('//td[@id="LicenseNum"]/text()').extract_first()
    company_name = company_name.split()[0]
    # Missing licence number becomes '' in the query string.
    number = number.split()[0] if number is not None else ''
    cc = 'http://218.60.144.163/LNJGPublisher/handle/Corp_Project.ashx?' \
         'CorpCode=%s&CorpName=%s&nPageCount=0&nPageIndex=1&nRecordSetCount=0&nPageSize=%s&_=1558580207472' \
         % (number, company_name, 100)
    yield scrapy.Request(url=cc,
                         callback=self.project,
                         meta={'companyName': company_name})
def xt(cls, response):
    """Extract birth, death and occupation data from a biography block.

    Returns a dict with 'birthdate'/'deathdate' as datetime.date or None,
    and 'birthplace'/'deathplace'/'occupation' as strings ('' if absent).

    Fix: the two bare ``except`` clauses are narrowed to ``ValueError``,
    which is what strptime raises on malformed input.
    """
    bio = {
        'birthdate': None,
        'birthplace': '',
        'deathdate': None,
        'deathplace': '',
        'occupation': ''
    }
    bio_data = response.xpath(cls.XPATH).extract()
    if bio_data:
        bio_data = bio_data[0]
    else:
        return bio
    for data in bio_data.split('<br>'):
        # Birth data: "Geb. <date>, <place>"
        birth = Selector(text=data)\
            .xpath("//em[contains(text(),'Geb.')]/parent::*/text()")\
            .extract()
        if birth:
            birth = birth[0]
            bio['birthdate'] = _clean(birth.split(',')[0])
            try:
                bio['birthdate'] = datetime.datetime.strptime(
                    bio['birthdate'], "%d.%m.%Y").date()
            except ValueError:
                logger.error(
                    "Failed to parse birthdate: {}".format(
                        bio['birthdate']))
                bio['birthdate'] = None
            if len(birth.split(',')) > 1:
                bio['birthplace'] = birth.split(',')[1]
        # Death data: "Verst. <date>, <place>"
        death = Selector(text=data)\
            .xpath("//em[contains(text(),'Verst.')]/parent::*/text()")\
            .extract()
        if death:
            death = death[0]
            bio['deathdate'] = _clean(death.split(',')[0])
            try:
                bio['deathdate'] = datetime.datetime.strptime(
                    bio['deathdate'], "%d.%m.%Y").date()
            except ValueError:
                logger.error(
                    "Failed to parse deathdate: {}".format(
                        bio['deathdate']))
                bio['deathdate'] = None
            if len(death.split(',')) > 1:
                bio['deathplace'] = death.split(',')[1]
        # Occupation: "Beruf ..." — keep text before the first comma.
        occupation = Selector(text=data)\
            .xpath("//em[contains(text(),'Beruf')]/parent::*/text()")\
            .extract()
        if occupation:
            occupation = occupation[0]
            bio['occupation'] = occupation.split(',')[0]
    return bio
def change_person_data(self, response):
    """Parse the change-record (变更记录) table for one person.

    For each timeline entry, builds a synthetic integer key from the
    change date (deduplicated via ``myset``) plus the previous and current
    employer names, and prints the resulting change_data dict.
    """
    change_data = {}
    # Change-record page.
    person_name = response.meta['item']['name']
    name = '变更记录'
    print(name, 'zz')
    change_data['person_name'] = person_name
    grade = Selector(response=response).xpath(
        '//tbody/tr/td[1]/text()').extract_first()
    grade = grade.split(' ')[0]
    if grade == '暂未查询到已登记入库信息':
        # Site reports no registered records for this person.
        print('暂时无数据')
    else:
        change_data['grade'] = grade
        now_company = Selector(response=response).xpath(
            '//div[@class="curQy"]/span/text()').extract_first()
        change_data['now_company'] = now_company
        change_record = Selector(
            response=response).xpath('//ul[@class="cbp_tmtimeline"]/li')
        # print(change_record)
        # Dates already emitted — used to disambiguate same-day changes.
        myset = set()
        for c in change_record:
            year = c.xpath('./div[1]/span[1]/text()').extract_first()
            month_day = c.xpath('./div[1]/span[2]/text()').extract_first()
            # "2019年" -> "2019"; "05/21" -> month "05", day "21".
            year = year.split('年')[0]
            month_day = month_day.split('/')
            month = month_day[0]
            day = month_day[1]
            # Concatenate to YYYYMMDD, then append a sequence digit (…1);
            # a collision with an earlier entry bumps the digit to …2.
            date = year + month + day
            date = int(date)
            date = date * 10 + 1
            if date in myset:
                date += 1
            myset.add(date)
            change_data['not_data'] = date
            # Previous employer.
            original_company = c.xpath(
                './div[@class="cbp_tmlabel"]/p/span[1]/text()'
            ).extract_first()
            change_data['original_company'] = original_company
            # Current employer.
            now_z_company = c.xpath(
                './div[@class="cbp_tmlabel"]/p/span[2]/text()'
            ).extract_first()
            change_data['now_z_company'] = now_z_company
            change_data['name_company'] = self.corporate_name
            print(change_data)
            print('一条信息')
def company_information(self, response):
    """Assemble a Guangdong company record and POST it to the local API."""
    sel = Selector(response=response)
    td = sel.xpath('//div[@id="ent-info "]/div[2]/div/h5/text()').extract_first()
    company_name = sel.xpath('//div[@class="ln-title"]/text()').extract_first()
    company_name = company_name.split()[0]
    data = {
        'companyName': company_name,
        'area': '广东省',
        'companyArea': '',
        'token': self.token,
    }
    # Keep the number only when it is an 18-character credit code.
    tokens = td.split()
    if tokens and len(tokens[0]) == 18:
        data['licenseNum'] = tokens[0]
    else:
        data['licenseNum'] = ''
    div_person = sel.xpath('//div[@id="ent-into"]/div')
    # Two child divs means the page has no personnel section.
    if len(div_person) == 2:
        data['contactMan'] = ''
        data['contactAddress'] = ''
        data['contactPhone'] = ''
        print('无人员注入')
    else:
        address = div_person[3].xpath('./div/h5/text()').extract_first()
        person_name = div_person[4].xpath('./div/h5/text()').extract_first()
        phone_number = div_person[5].xpath('./div/h5/text()').extract_first()
        data['contactMan'] = person_name.split()[0]
        data['contactAddress'] = address.split()[0]
        # NOTE(review): key 'phone_number' differs from the 'contactPhone'
        # used in the empty branch — confirm which key the API expects.
        data['phone_number'] = phone_number.split()[0]
    print(data)
    return Request(url='http://192.168.199.188:8080/web/rest/companyInfo/addCompanyRecord.htm',
                   method="POST",
                   headers={'Content-Type': 'application/json'},
                   body=json.dumps(data),
                   callback=self.zz
                   )
def parse_basic_info(self, response):
    """Parse each download-list item and request its detail page.

    Relative image paths get the gamersky CDN base prefixed. The labelled
    txt fields ("更新:…", "类型:…", …) are split on ':' keeping the value.

    Fixes:
    - the old guard ``img != '' and img != None and img != []`` compared a
      str against None/[] (dead checks) — simplified to truthiness;
    - ``split(':')`` now uses maxsplit=1 so values containing ':' are not
      truncated.
    """
    contents = response.xpath(
        '//ul[@class="down_con downData"]//li').extract()
    for content in contents:
        sel = Selector(text=content)
        img = sel.xpath('//li//div[@class="img"]/a/img/@src').extract()[0]
        if img and not img.startswith('http'):
            img = 'http://img4.gamersky.com/Files/GamerSky/' + img
        name = sel.xpath('//li//div[@class="img"]/a/@title').extract()[0]
        url = sel.xpath('//li//div[@class="img"]/a/@href').extract()[0]
        update_time = sel.xpath('//li//div[@class="txt"][1]/text()').extract()[0]
        if update_time is not None:
            update_time = update_time.split(':', 1)[1]
        type = sel.xpath('//li//div[@class="txt"][2]/text()').extract()[0]
        if type is not None:
            type = type.split(':', 1)[1]
        language = sel.xpath('//li//div[@class="txt"][3]/text()').extract()[0]
        if language is not None:
            language = language.split(':', 1)[1]
        size = sel.xpath('//li//div[@class="txt"][4]/text()').extract()[0]
        if size is not None:
            size = size.split(':', 1)[1]
        yield scrapy.Request(url=url,
                             headers=self.default_headers,
                             body=self.default_data,
                             callback=self.parse_detail_info,
                             meta={
                                 'img': img,
                                 'name': name,
                                 'update_time': update_time,
                                 'type': type,
                                 'language': language,
                                 'size': size
                             },
                             dont_filter=True)
def xt(cls, response):
    """Extract birth, death and occupation data from a biography block.

    Returns a dict with 'birthdate'/'deathdate' as datetime.date or None,
    and 'birthplace'/'deathplace'/'occupation' as strings ('' if absent).

    Fix: the two bare ``except`` clauses are narrowed to ``ValueError``,
    which is what strptime raises on malformed input.
    """
    bio = {
        'birthdate': None,
        'birthplace': '',
        'deathdate': None,
        'deathplace': '',
        'occupation': ''
    }
    bio_data = response.xpath(cls.XPATH).extract()
    if bio_data:
        bio_data = bio_data[0]
    else:
        return bio
    for data in bio_data.split('<br>'):
        # Birth data: "Geb. <date>, <place>"
        birth = Selector(text=data)\
            .xpath("//em[contains(text(),'Geb.')]/parent::*/text()")\
            .extract()
        if birth:
            birth = birth[0]
            bio['birthdate'] = _clean(birth.split(',')[0])
            try:
                bio['birthdate'] = datetime.datetime.strptime(
                    bio['birthdate'], "%d.%m.%Y").date()
            except ValueError:
                logger.error("Failed to parse birthdate: {}".format(
                    bio['birthdate']))
                bio['birthdate'] = None
            if len(birth.split(',')) > 1:
                bio['birthplace'] = birth.split(',')[1]
        # Death data: "Verst. <date>, <place>"
        death = Selector(text=data)\
            .xpath("//em[contains(text(),'Verst.')]/parent::*/text()")\
            .extract()
        if death:
            death = death[0]
            bio['deathdate'] = _clean(death.split(',')[0])
            try:
                bio['deathdate'] = datetime.datetime.strptime(
                    bio['deathdate'], "%d.%m.%Y").date()
            except ValueError:
                logger.error("Failed to parse deathdate: {}".format(
                    bio['deathdate']))
                bio['deathdate'] = None
            if len(death.split(',')) > 1:
                bio['deathplace'] = death.split(',')[1]
        # Occupation: "Beruf ..." — keep text before the first comma.
        occupation = Selector(text=data)\
            .xpath("//em[contains(text(),'Beruf')]/parent::*/text()")\
            .extract()
        if occupation:
            occupation = occupation[0]
            bio['occupation'] = occupation.split(',')[0]
    return bio
def company_information(self, response):
    """Request the project list for the company identified by meta['cc']."""
    raw_name = Selector(response=response).xpath(
        '//td[@class="name_level3"]/text()').extract_first()
    # First whitespace-delimited token is the clean company name.
    company_name = raw_name.split()[0]
    basic_url = ('http://cx.jlsjsxxw.com/handle/Corp_Project.ashx?corpid=%s&_=1556177544518'
                 % response.meta['cc'])
    yield scrapy.Request(url=basic_url,
                         callback=self.project,
                         dont_filter=True,
                         meta={'companyName': company_name})
def bad_recode(self,response):
    """Parse bad-conduct (不良行为) records and POST each row to the API.

    Each table cell is dispatched on its @data-header column label; the
    assembled not_good dict is sent to self.tongnie as JSON.
    """
    content = Selector(response=response).xpath('//tbody/tr')
    # A single "no records" row means nothing is registered.
    if not content.xpath('./td/text()').extract_first() == "暂未查询到已登记入库信息":
        print(content.xpath('./td/text()').extract_first(), '不良行为相关信息')
        for c in content:
            td = c.xpath('./td')
            not_good = {}
            for t in td:
                # Column label carried in the cell's @data-header.
                h = t.xpath('@data-header').extract_first()
                h = h.split()[0]
                if h == "诚信记录编号":
                    # credit record number
                    d = t.xpath('./span/text()').extract_first()
                    print("诚信记录编号", d)
                    d = d.split()[0]
                    not_good['creditNum'] = d
                elif h == "诚信记录主体":
                    # record subject (company)
                    d = t.xpath('./a/text()').extract_first()
                    print("诚信记录主体", d)
                    d = d.split()[0]
                    not_good['companyName'] = d
                elif h == "决定内容":
                    # decision content: begin date, file text, remark
                    d = t.xpath('./div/span[2]/text()').extract_first()
                    d = d.split(':')[1]
                    print("决定内容", d)
                    not_good['beginDate'] = d
                    # NOTE(review): this rebinds the name `content` used by
                    # the outer loop; iteration is unaffected in Python but
                    # the shadowing is confusing — consider renaming.
                    content = t.xpath('./div/a/@data-text').extract_first()
                    content = content.split()[0]
                    print("决定内容", content)
                    not_good['fileContent'] = content
                    result = t.xpath('text()')[1].extract()
                    result = result.split()[0]
                    not_good['mark'] = result
                    print("决定内容", result)
                elif h == "实施部门(文号)":
                    # implementing department (document number)
                    address = t.xpath('text()').extract_first()
                    address = address.split()[0]
                    print('实施部门(文号)', address)
                    not_good['departName'] = address
                    number = t.xpath('./div/text()').extract_first()
                    number = number.split()[0]
                    print('实施部门(文号)', number)
                    not_good['fileNum'] = number
                elif h == "发布有效期":
                    # publication validity period (rebinds loop var `t`)
                    t = t.xpath('text()').extract_first()
                    t = t.split()[0]
                    not_good['endDate'] = t
                    print('发布有效期', t)
            not_good['token'] = self.token
            yield Request(url=self.tongnie,
                          method="POST",
                          body=json.dumps(not_good),
                          headers={'Content-Type': 'application/json'},
                          callback=self.zz)
            print('发送成功----', not_good)
    else:
        print(self.corporate_name, '--没有--', self.action, '这个相关的记录')
def company_information(self, response):
    """Dedupe a Liaoning company by name and POST its record to the API."""
    sel = Selector(response=response)
    company_name = sel.xpath('//td[@class="name_level3"]/text()').extract_first()
    number = sel.xpath('//td[@id="CorpCode"]/text()').extract_first()
    person = sel.xpath('//td[@id="Td4"]/text()').extract_first()
    company_name = company_name.split()[0]
    # Dedupe key is name + province; sadd returns 0 on duplicates.
    if not self.r.sadd('Company_name', company_name + '辽宁省'):
        print('此公司信息已经存在', company_name)
        return
    self.data['companyName'] = company_name
    # NOTE(review): the value from td#Td4 lands in contactPhone — confirm
    # the field mapping against the page.
    self.data['contactPhone'] = person.split()[0] if person is not None else ''
    if number is not None:
        first = number.split()[0]
        # Only an 18-character credit code is kept.
        self.data['licenseNum'] = first if len(first) == 18 else ''
    else:
        self.data['licenseNum'] = ''
    print(self.data)
    yield scrapy.Request(
        url='https://api.maotouin.com/rest/companyInfo/addCompanyRecord.htm',
        # url='http://192.168.199.188:8080/web/rest/companyInfo/addCompanyRecord.htm',
        method="POST",
        headers={'Content-Type': 'application/json'},
        body=json.dumps(self.data),
        callback=self.zz,
        meta={'company_name': company_name, 'data': self.data})
def company_information(self, response):
    """Walk a company's project list and request each project detail page.

    Fix: the licence number previously took ``number[0]`` — the first
    *character* of the raw cell — instead of the first whitespace token,
    so the 18-character check always failed and the number was dropped.
    """
    sel = Selector(response=response)
    company_name = sel.xpath('//td[@colspan="3"]/text()').extract_first()
    number = sel.xpath('//table[@class="detailTable"]')[0] \
        .xpath('./tr[2]/td[4]/text()').extract_first()
    company_name = company_name.split()[0]
    # repeat = self.r.sadd('Company_name', company_name)
    repeat = 1  # dedupe temporarily disabled
    if repeat != 0:
        if number.split():
            number = number.split()[0]
            # Only an 18-character credit code is kept.
            if len(number) != 18:
                number = ''
        project_info = sel.xpath('//table[@class="detailTable"]')[4].xpath('./tr')
        title = project_info[0].xpath('./td/text()').extract()[0]
        if title == '项目信息(0个)':
            print('没有项目的公司--%s' % company_name)
        else:
            print('当前公司%s----项目%s' % (company_name, title))
            # Skip the two header rows.
            project_info = project_info[2:]
            print(len(project_info), 'BBBBBBBBBBBBBBBBBB')
            for p in project_info:
                project_url = p.xpath('./td[2]/p/a/@onclick').extract_first()
                # The detail path is buried in a window.open(...) handler.
                xx = 'window.open\(\'/(.*)\', \'dasfddd.*|window.open\(\'/(.*)\', \'fdsafa.*'
                cc = re.findall(xx, project_url)[0]
                url = cc[0] if cc[0] else cc[1]
                yield scrapy.Request(url='http://jzscyth.shaanxi.gov.cn:7001/' + url,
                                     callback=self.company_project,
                                     meta={'company_name': company_name, 'number': number},
                                     dont_filter=True)
    else:
        print('此公司信息已经存在', company_name)
def company_information(self, response):
    """Build the Henan project-search URL for this company and request it."""
    sel = Selector(response=response)
    company_name = sel.xpath(
        '//span[@id="ctl00_ContentPlaceHolder1_FormView1_Label10"]/text()'
    ).extract_first()
    number = sel.xpath('//td[@class="inquiry_intitleb"]')[5].xpath(
        './span/text()').extract_first()
    # Keep the number only when it is an 18-character credit code.
    if number is not None:
        number = number.split()[0]
        if len(number) != 18:
            number = ''
    company_name = company_name.split()[0]
    xx = 'http://hngcjs.hnjs.gov.cn/SiKuWeb/Gcxm.aspx?CorpName=%s&CorpCode=%s' % (
        company_name, number)
    print(xx)
    yield scrapy.Request(url=xx,
                         callback=self.project,
                         meta={'company_name': company_name,
                               'number': number,
                               'page': 1})
def company_information(self, response):
    """Parse a Xinjiang company page, fill self.data and POST it to the API.

    Fix: ``!= None`` comparisons replaced with the idiomatic ``is not
    None``; the verbose licence branching collapsed to one conditional.
    """
    sel = Selector(response=response)
    company_name = sel.xpath('//span[@class="user-name"]/text()').extract_first()
    number = sel.xpath('//div[@class="bottom"]/dl/dt/text()').extract_first()
    company_name = company_name.split()[0]
    # Keep the number only when it is a well-formed 18-char credit code.
    if number is not None:
        number = number.split()[0]
        self.data['licenseNum'] = number if len(number) == 18 else ''
    else:
        self.data['licenseNum'] = ''
    self.data['companyName'] = company_name
    print(self.data)
    yield scrapy.Request(
        url='https://api.maotouin.com/rest/companyInfo/addCompanyRecord.htm',
        # url='http://192.168.199.188:8080/web/rest/companyInfo/addCompanyRecord.htm',
        method="POST",
        headers={'Content-Type': 'application/json'},
        body=json.dumps(self.data),
        callback=self.zz,
        meta={'company_name': company_name, 'data': self.data}
    )
def all_project(self, response):
    """Request the performance list for the company on this detail page."""
    detail_url = response.url
    # The company id is the path segment after .../compDetail/.
    pattern = 'http://jsy.xjjs.gov.cn/dataservice/query/comp/compDetail/(.*)'
    comp_id = re.findall(pattern, detail_url)[0]
    sel = Selector(response=response)
    company_name = sel.xpath('//span[@class="user-name"]/text()').extract_first()
    number = sel.xpath('//div[@class="bottom"]/dl/dt/text()').extract_first()
    company_name = company_name.split()[0]
    url = 'http://jsy.xjjs.gov.cn/dataservice/query/comp/compPerformanceListSys/' + comp_id
    # Paging parameters: fetch up to 100 rows on page 1.
    send_data = {'$total': '100',
                 '$pgsz': '100',
                 '$pg': '1',
                 '$reload': '0',
                 }
    yield scrapy.FormRequest(url=url,
                             formdata=send_data,
                             callback=self.project_info,
                             meta={"company_name": company_name,
                                   "number": number}
                             )
def get_case_studies_details(response: Response):
    """Return (title, summary, href, slug) tuples for each case-study card."""
    content = response.content.decode("utf-8")
    article_selector = "div.card"
    articles = Selector(text=content).css(article_selector).extract()
    result = []
    for article in articles:
        card = Selector(text=article)
        title = card.css("h3::text").extract()[0]
        summary = card.css("p.description::text").extract()[0]
        href = card.css("a::attr(href)").extract()[0]
        # The slug is the second-to-last path segment of the link.
        slug = href.split("/")[-2]
        assert slug, f"Couldn't extract case study slug from {article}"
        logging.debug("Got case study slug: %s", slug)
        result.append((title, summary, href, slug))
    assert result, f"No Case Study details extracted from {articles}"
    return result
def _get_p_info(self, **kwargs):
    """Parse the product-info <p> block into [{'p_name':…, 'p_value':…}].

    Fixes:
    - ``extract_first()`` returns None (not '') when nothing matches; the
      old ``== ''`` test let None through and re.sub then raised TypeError.
      Any falsy result now yields [].
    - items whose value contains ':' were truncated to the second segment
      by ``split(':')[1]``; ``partition`` keeps the full value, and items
      without any ':' no longer raise IndexError.
    """
    body = kwargs.get('body', '')
    tmp_p_info = Selector(
        text=body).css('div.showblock div p').extract_first()
    if not tmp_p_info:
        return []
    # Strip the <p> wrapper and the hidden-brand marker comment.
    tmp_p_info = re.compile('<p>|</p>').sub('', tmp_p_info)
    tmp_p_info = re.compile(r'<!--思源品牌,隐藏品牌-->').sub('', tmp_p_info)
    p_info = []
    for item in tmp_p_info.split('<br>'):
        if item == '':
            continue
        p_name, _, p_value = item.partition(':')
        p_info.append({'p_name': p_name, 'p_value': p_value})
    return p_info
def achivment(self, response):
    """Parse the personal achievement (个人功绩) table rows and print each."""
    change_data = {}
    person_name = response.meta['item']['name']
    name = '个人功绩'
    print(name, 'zz')
    change_data['person_name'] = person_name
    grade = Selector(response=response).xpath(
        '//tbody/tr/td[1]/text()').extract_first()
    grade = grade.split(' ')[0]
    if grade == '暂未查询到已登记入库信息':
        # Site reports no registered records for this person.
        print('暂时无数据')
        return
    # Column header -> (merit key, xpath for the cell's value).
    field_map = {
        '序号': ('serial_number', 'text()'),
        '项目编码': ('project_recode', 'text()'),
        '项目名称': ('project_name', './a/text()'),
        '项目属地': ('project_address', 'text()'),
        '项目类别': ('project_type', 'text()'),
        '建设单位': ('project_Company', 'text()'),
    }
    for row in Selector(response=response).xpath('//tbody/tr'):
        merit = {}
        for cell in row.xpath('./td'):
            header = cell.xpath('@data-header').extract_first()
            header = header.split()[0]
            if header in field_map:
                key, value_xpath = field_map[header]
                merit[key] = cell.xpath(value_xpath).extract_first()
        merit['token'] = self.token
        merit['corporate_name'] = self.corporate_name
        print(merit)
def parse_basic_info(self, response):
    """Yield one disciplinary-action item per table row (header skipped)."""
    rows = response.xpath('//table[@id="REPORTID_tab1"]//tr').extract()[1:]
    for row in rows:
        cells = Selector(text=row)
        company_name = cells.xpath('//tr/td[1]/text()').extract()[0]
        writ_no = cells.xpath('//tr/td[3]/text()').extract()[0]
        pub_type = cells.xpath('//tr/td[4]//a/text()').extract()[0]
        writ_date = cells.xpath('//tr/td[5]/text()').extract()[0]
        rel_bond = cells.xpath('//tr/td[6]/text()').extract()[0]
        # Keep only the text between the 给予 and 的决定 markers.
        start = '给予'
        end = '的决定'
        pub_type = pub_type.split(start)[1].split(end)[0]
        item = SszeLoaderItem(item=SszeZqxxJlcfResultItem(), response=response)
        item.add_value('batch_date', self.batch_date)
        item.add_value('company_name', company_name)
        item.add_value('writ_no', writ_no)
        item.add_value('pub_type', pub_type)
        item.add_value('writ_date', writ_date)
        item.add_value('rel_bond', rel_bond)
        item.add_value('table_name', 'spider.szse_zqxx_jlcf_result')
        yield item.load_item()