# Shared imports for the spider callbacks below.
import scrapy
from scrapy.selector import Selector


def parse_basic_info(self, response):
    contents = response.xpath(
        '//div[@class="wrap mb "]/div[@class="list_z2 g3"]/div[@class="list_z g4 pslpx"]/ul[@class="ipic"]//li'
    ).extract()
    for content in contents:
        url = 'https://www.jiadiandm.com' + Selector(
            text=content).xpath('//li[1]/a/@href').extract()[0]
        img = Selector(text=content).xpath('//li/a/img/@src').extract()[0]
        # Prefix the site root only for relative image paths.
        if img and not img.startswith('http'):
            img = 'https://www.jiadiandm.com' + img
        name = Selector(text=content).xpath('//li/a/img/@alt').extract()[0]
        # The status badge is optional, so guard against an empty match.
        status = None
        if Selector(text=content).xpath('//li/a/span/text()').extract():
            status = Selector(text=content).xpath('//li/a/span/text()').extract()[0]
        yield scrapy.Request(url=url,
                             headers=self.default_headers,
                             body=self.default_data,
                             callback=self.parse_detail_info,
                             meta={'img': img, 'name': name, 'status': status},
                             dont_filter=True)
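# A minimal alternative sketch, not the original spider's code: Scrapy selectors
# can be iterated directly, which avoids re-parsing every <li> with
# Selector(text=...), and response.urljoin() resolves relative links against the
# page URL instead of a hard-coded host. parse_basic_info_alt is a hypothetical
# name; .get()/.getall() are the modern spellings of extract_first()/extract().
def parse_basic_info_alt(self, response):
    for li in response.xpath('//ul[@class="ipic"]//li'):
        href = li.xpath('./a/@href').get()
        if not href:
            continue
        img = li.xpath('./a/img/@src').get()
        if img and not img.startswith('http'):
            img = response.urljoin(img)
        yield scrapy.Request(url=response.urljoin(href),
                             callback=self.parse_detail_info,
                             meta={'img': img,
                                   'name': li.xpath('./a/img/@alt').get(),
                                   'status': li.xpath('./a/span/text()').get()},
                             dont_filter=True)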
def parse_basic_info(self, response):
    contents = response.xpath(
        '//div[@class="list_body"]//div[@class="list_body_contain"]//div[@class="list_body_con"]'
    ).extract()
    for content in contents:
        # The cover image is lazy-loaded; the real URL lives in @data-original.
        img = Selector(text=content).xpath(
            '//div[@class="list_body_con"]//a[@class="list_body_con_img"]//img/@data-original'
        ).extract()[0]
        name = Selector(text=content).xpath(
            '//div[@class="list_body_con"]//a[@class="list_body_con_img"]//img/@alt'
        ).extract()[0]
        zh_name = Selector(text=content).xpath(
            '//div[@class="list_body_con"]//div[@class="list_body_con_con"]/a/text()'
        ).extract()[0]
        url_tmp = Selector(text=content).xpath(
            '//div[@class="list_body_con"]//a[@class="list_body_con_img"]/@href'
        ).extract()[0]
        # Only site-relative hrefs point at detail pages; skip anything else.
        if not url_tmp.startswith('/'):
            continue
        url = 'http://down.ali213.net' + url_tmp
        yield scrapy.Request(url=url,
                             headers=self.default_headers,
                             body=self.default_data,
                             callback=self.parse_detail_info,
                             meta={'img': img, 'name': name, 'zh_name': zh_name},
                             dont_filter=True)
def parse_basic_info(self, response):
    contents = response.xpath('//div[@class="dmList clearfix"]/ul//li').extract()
    for content in contents:
        img = Selector(text=content).xpath(
            '//li/p[@class="fl cover"]/a[@class="pic"]/img/@src').extract()[0]
        # Prefix the site root only for non-empty, relative image paths.
        if img and not img.startswith('http'):
            img = 'http://www.66mh.cc' + img
        name = Selector(text=content).xpath(
            '//li/p[@class="fl cover"]/a[@class="pic"]/img/@alt').extract()[0]
        url = 'http://www.66mh.cc' + Selector(text=content).xpath(
            '//li/p[@class="fl cover"]/a[@class="pic"]/@href').extract()[0]
        status_part1 = ''
        if Selector(text=content).xpath(
                '//li/p[@class="fl cover"]/span/a/text()').extract():
            status_part1 = Selector(text=content).xpath(
                '//li/p[@class="fl cover"]/span/a/text()').extract()[0]
        # The update time is either a bare <span> or wrapped in <span><font>.
        if Selector(text=content).xpath('//li/dl/dd//p[1]/span/text()').extract():
            update_time = Selector(text=content).xpath(
                '//li/dl/dd//p[1]/span/text()').extract()[0]
        else:
            update_time = Selector(text=content).xpath(
                '//li/dl/dd//p[1]/span/font/text()').extract()[0]
        status_part2 = ''
        if Selector(text=content).xpath('//li/dl/dd//p[2]/span/text()').extract():
            status_part2 = Selector(text=content).xpath(
                '//li/dl/dd//p[2]/span/text()').extract()[0]
        status = status_part2 + status_part1
        type = None
        if Selector(text=content).xpath('//li/dl/dd//p[3]/a/text()').extract():
            type = Selector(text=content).xpath(
                '//li/dl/dd//p[3]/a/text()').extract()[0]
        yield scrapy.Request(url=url,
                             headers=self.default_headers,
                             body=self.default_data,
                             meta={'img': img, 'name': name,
                                   'update_time': update_time,
                                   'status': status, 'type': type},
                             callback=self.parse_detail_info,
                             dont_filter=True)
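# Hedged aside: the span-vs-font fallback for update_time above can usually be
# collapsed with an XPath union, assuming the date only ever appears in one of
# those two places. Inside the same loop:
#
#     update_time = Selector(text=content).xpath(
#         '//li/dl/dd//p[1]/span/text() | //li/dl/dd//p[1]/span/font/text()'
#     ).extract_first()
#
# extract_first() returns None instead of raising IndexError when nothing matches.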
def parse_basic_info(self, response):
    contents = response.xpath(
        '//div[@class="list_con"]//ul[@class="game_list"]//li').extract()
    for content in contents:
        img = Selector(text=content).xpath(
            '//li//div[@class="img"]//a//img/@src').extract()[0]
        if img and not img.startswith('http'):
            img = 'http://dl.3dmgame.com' + img
        # Both names come from the same @alt attribute on this listing page.
        name = Selector(text=content).xpath(
            '//li//div[@class="img"]//a//img/@alt').extract()[0]
        zh_name = Selector(text=content).xpath(
            '//li//div[@class="img"]//a//img/@alt').extract()[0]
        url = Selector(text=content).xpath(
            '//li//div[@class="img"]//a/@href').extract()[0]
        introduction = Selector(text=content).xpath(
            '//li//div[@class="text"]/dl').extract()[0]
        type = Selector(text=content).xpath(
            '//li//div[@class="more_info"]/dl//dd[1]/a/text()').extract()[0]
        update_time = Selector(text=content).xpath(
            '//li//div[@class="more_info"]/dl//dd[2]/text()').extract()[0]
        size = Selector(text=content).xpath(
            '//li//div[@class="more_info"]/dl//dd[3]/text()').extract()[0]
        # The language cell is missing for some entries, so guard the lookup.
        language = ''
        if Selector(text=content).xpath(
                '//li//div[@class="more_info"]/dl//dd[4]/text()').extract():
            language = Selector(text=content).xpath(
                '//li//div[@class="more_info"]/dl//dd[4]/text()').extract()[0]
        yield scrapy.Request(url=url,
                             headers=self.default_headers,
                             body=self.default_data,
                             callback=self.parse_detail_info,
                             meta={'img': img, 'name': name, 'zh_name': zh_name,
                                   'introduction': introduction, 'type': type,
                                   'update_time': update_time, 'size': size,
                                   'language': language},
                             dont_filter=True)
def parse_basic_info(self, response):
    contents = response.xpath('//ul[@class="down_con downData"]//li').extract()
    for content in contents:
        img = Selector(text=content).xpath(
            '//li//div[@class="img"]/a/img/@src').extract()[0]
        if img and not img.startswith('http'):
            img = 'http://img4.gamersky.com/Files/GamerSky/' + img
        name = Selector(text=content).xpath(
            '//li//div[@class="img"]/a/@title').extract()[0]
        url = Selector(text=content).xpath(
            '//li//div[@class="img"]/a/@href').extract()[0]
        # Each "txt" div holds a "label:value" pair; keep only the value.
        update_time = Selector(text=content).xpath(
            '//li//div[@class="txt"][1]/text()').extract()[0]
        if update_time:
            update_time = update_time.split(':')[1]
        type = Selector(text=content).xpath(
            '//li//div[@class="txt"][2]/text()').extract()[0]
        if type:
            type = type.split(':')[1]
        language = Selector(text=content).xpath(
            '//li//div[@class="txt"][3]/text()').extract()[0]
        if language:
            language = language.split(':')[1]
        size = Selector(text=content).xpath(
            '//li//div[@class="txt"][4]/text()').extract()[0]
        if size:
            size = size.split(':')[1]
        yield scrapy.Request(url=url,
                             headers=self.default_headers,
                             body=self.default_data,
                             callback=self.parse_detail_info,
                             meta={'img': img, 'name': name,
                                   'update_time': update_time, 'type': type,
                                   'language': language, 'size': size},
                             dont_filter=True)
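# Hedged aside: split(':')[1] above raises IndexError if a cell lacks an ASCII
# colon, e.g. if the site uses the full-width Chinese colon '：' instead. A
# tolerant sketch; split_label_value is a hypothetical helper, not site code.
import re

def split_label_value(text):
    # Split on the first ASCII or full-width colon; return '' when absent.
    parts = re.split('[:：]', text, maxsplit=1)
    return parts[1].strip() if len(parts) > 1 else ''

# Usage, matching the pattern above:
#     update_time = split_label_value(update_time)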
def parse_basic_info(self, response):
    contents = response.xpath(
        '//div[@style="width:373px;float:left;margin:5px 0px 5px 5px;"]'
    ).extract()
    for content in contents:
        img = Selector(text=content).xpath(
            '//div[@style="width:373px;float:left;margin:5px 0px 5px 5px;"]/div[@style="width:95px;float:left;"]/a/img/@src'
        ).extract()[0]
        if not img.startswith('http'):
            img = 'http://www.wenku8.com' + img
        url = Selector(text=content).xpath(
            '//div[@style="width:373px;float:left;margin:5px 0px 5px 5px;"]/div[@style="width:95px;float:left;"]/a/@href'
        ).extract()[0]
        name = Selector(text=content).xpath(
            '//div[@style="width:373px;float:left;margin:5px 0px 5px 5px;"]/div[@style="width:95px;float:left;"]/a/@title'
        ).extract()[0]
        yield scrapy.Request(url=url,
                             meta={'img': img, 'name': name},
                             callback=self.parse_detail_info,
                             dont_filter=True)
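# Hedged aside: several callbacks above repeat the "check extract(), then index
# [0]" dance. A small hypothetical helper (first_or_default is not part of the
# original project) gives the same behavior in one call; Scrapy's own
# extract_first() does this too when a default value is acceptable.
def first_or_default(selector, xpath, default=None):
    """Return the first XPath match from a Selector, or `default` if none."""
    values = selector.xpath(xpath).extract()
    return values[0] if values else default

# Usage, matching the optional-status lookup pattern above:
#     status = first_or_default(Selector(text=content), '//li/a/span/text()')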
def company_info(self, response):
    # Pre-fill every field so the record always carries the same keys;
    # '/' on the page means "not provided" and is left as ''.
    company_data = {
        'unit_type': response.meta['unit_type'],
        'city': '',
        'start_date': '',
        'number': '',
        'authority': '',
        'type_of_registration': '',
        'business_area': '',
        'security_number': '',
        'capital': '',
        'unit_property': '',
        'social_registration': '',
        'registered_address': '',
        'registered__postal_code': '',
        'business_address': '',
        'business_postal_number': '',
        'legal_person': '',
        'website': '',
    }
    company_name = Selector(response=response).xpath(
        '//td[@colspan="3"]')[0].xpath('./a/@title').extract_first()
    company_data['company_name'] = company_name
    unit_property = Selector(response=response).xpath(
        '//td[@style="width: 350px;padding-top: 9px;"]/text()').extract_first()
    if unit_property and unit_property.split():
        company_data['unit_property'] = unit_property.split()[0]
    capital = Selector(response=response).xpath(
        '//td[@colspan="3"]')[2].xpath('text()').extract_first()
    if capital is not None and capital != '/':
        company_data['capital'] = capital + '万元'  # registered capital, in units of 10,000 CNY
    city = Selector(response=response).xpath(
        '//td[@colspan="3"]')[1].xpath('text()').extract_first()
    if city and city.split():
        company_data['city'] = city.split()[0]
    start_company_data = Selector(response=response).xpath(
        '//td[@style="width: 230px;padding-top: 9px;"]')[3].xpath('text()').extract_first()
    if start_company_data and start_company_data.split():
        company_data['start_date'] = start_company_data.split()[0]
    number = Selector(response=response).xpath(
        '//td[@colspan="3"]')[3].xpath('text()').extract_first()
    if number and number.split():
        number = number.split()[0]
        # Keep only well-formed 18-character unified social credit codes.
        if len(number) == 18:
            company_data['number'] = number
    authority = Selector(response=response).xpath(
        '//td[@style="width: 230px;padding-top: 9px;"]')[5].xpath('text()').extract_first()
    if authority and authority.split():
        authority = authority.split()[0]
        if authority != '/':
            company_data['authority'] = authority
    type_of_registration = Selector(response=response).xpath(
        '//td[@colspan="5"]')[0].xpath('text()').extract_first()
    if type_of_registration and type_of_registration.split():
        company_data['type_of_registration'] = type_of_registration.split()[0]
    business_area = Selector(response=response).xpath(
        '//td[@colspan="5"]')[1].xpath('text()').extract_first()
    if business_area and business_area.split():
        business_area = business_area.split()[0]
        if business_area != '/':
            company_data['business_area'] = business_area
    security_number = Selector(response=response).xpath(
        '//td[@colspan="3"]')[4].xpath('text()').extract_first()
    if security_number and security_number.split():
        security_number = security_number.split()[0]
        if security_number != '/':
            company_data['security_number'] = security_number
    social_registration = Selector(response=response).xpath(
        '//td[@style="width: 230px;padding-top: 9px;"]')[9].xpath('text()').extract_first()
    if social_registration and social_registration.split():
        social_registration = social_registration.split()[0]
        if social_registration != '/':
            company_data['social_registration'] = social_registration
    registered_address = Selector(response=response).xpath(
        '//td[@colspan="3"]')[5].xpath('text()').extract_first()
    if registered_address and registered_address.split():
        registered_address = registered_address.split()[0]
        if registered_address != '/':
            company_data['registered_address'] = registered_address
    registered__postal_code = Selector(response=response).xpath(
        '//td[@style="width: 230px;padding-top: 9px;"]')[11].xpath('text()').extract_first()
    if registered__postal_code and registered__postal_code.split():
        registered__postal_code = registered__postal_code.split()[0]
        if registered__postal_code != '/':
            company_data['registered__postal_code'] = registered__postal_code
    # NOTE: this reads the same cell ([5]) as registered_address; verify the
    # index if the two fields are meant to differ.
    business_address = Selector(response=response).xpath(
        '//td[@colspan="3"]')[5].xpath('text()').extract_first()
    if business_address and business_address.split():
        business_address = business_address.split()[0]
        if business_address != '/':
            company_data['business_address'] = business_address
    business_postal_number = Selector(response=response).xpath(
        '//td[@style="width: 230px;padding-top: 9px;"]')[13].xpath('text()').extract_first()
    if business_postal_number and business_postal_number.split():
        business_postal_number = business_postal_number.split()[0]
        if business_postal_number != '/':
            company_data['business_postal_number'] = business_postal_number
    legal_person = Selector(response=response).xpath(
        '//td[@colspan="2"]/text()').extract_first()
    if legal_person and legal_person.split():
        legal_person = legal_person.split()[0]
        if legal_person != '/':
            company_data['legal_person'] = legal_person
    # The website row only exists when the table has a third colspan="5" cell.
    if len(Selector(response=response).xpath('//td[@colspan="5"]')) == 3:
        website = Selector(response=response).xpath(
            '//td[@colspan="5"]')[2].xpath('text()').extract_first()
        if website and website.split():
            website = website.split()[0]
            if website.startswith('www') or website.startswith('http'):
                company_data['website'] = website
    print('公司信息', company_data)  # "company info"
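# Hedged aside: company_info repeats "take the first whitespace-delimited token,
# treat '/' as missing" for almost every cell. A hypothetical helper (clean_cell
# is not in the original code) would centralize that rule:
def clean_cell(raw, placeholder='/'):
    """Return the first token of `raw`, or '' for None/blank/placeholder cells."""
    if raw is None:
        return ''
    parts = raw.split()
    if not parts or parts[0] == placeholder:
        return ''
    return parts[0]

# Usage, matching the per-field pattern above:
#     company_data['authority'] = clean_cell(authority)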