def parse_person(self, response):
    """Parse a fund-manager list page.

    Extracts each manager's id and name from the ``t-manager`` table
    cells and follows to the manager detail page (``parse_info``).
    If the page comes back without any rows it is retried (same URL,
    at most 2 retries) — the site intermittently serves empty pages.
    """
    self.log(response.url)
    # NOTE(review): matching a str pattern against response.body assumes a
    # str body (Python 2 / decoded response) — confirm; on Python 3 Scrapy
    # response.body is bytes and response.text would be needed.
    records = re.findall('<td class="t-manager">(.*?)咨询</a>', response.body,
                         re.DOTALL)
    if not records and response.meta['retry'] < 2:
        print('%s未返回有效内容,重试第%d次,上限2次' %
              (response.url, response.meta['retry'] + 1))
        # BUG FIX: the retry previously used callback=self.parse, so a
        # retried list page was never re-parsed by this method. Mirror
        # parse_org, which retries into itself.
        yield scrapy.Request(response.url,
                             cookies=self.cookies,
                             callback=self.parse_person,
                             meta={'flag': response.meta['flag'],
                                   'retry': response.meta['retry'] + 1},
                             dont_filter=True)
    # One search per record instead of running the same pattern twice.
    link_pattern = '<a href="/simu/m-(.*?).html" target="_blank">(.*?)</a>'
    for record in records:
        match = re.search(link_pattern, record, re.DOTALL)
        person_name = clean_str_strong(match.group(2))
        person_id = match.group(1)
        url = 'https://www.jfz.com/simu/m-%s.html' % person_id
        yield scrapy.Request(url,
                             cookies=self.cookies,
                             callback=self.parse_info,
                             meta={'person_name': person_name,
                                   'person_id': person_id,
                                   'retry': 0},
                             dont_filter=True)
def parse_org(self, response):
    """Parse an organisation (private-fund company) list page.

    Pulls each company row out of the ``t-company`` table cells and
    requests the company detail page (handled by ``parse_info``),
    carrying the list-page fields (name, id, founding date, area)
    along in the request meta. An empty row set triggers a retry of
    the same URL, capped at 2 attempts.
    """
    self.log(response.url)
    rows = re.findall('<td class="t-company">(.*?)咨询</a>', response.body,
                      re.DOTALL)
    attempts = response.meta['retry']
    if not rows and attempts < 2:
        print('%s未返回有效内容,重试第%d次,上限2次' % (response.url, attempts + 1))
        yield scrapy.Request(response.url,
                             cookies=self.cookies,
                             callback=self.parse_org,
                             meta={'retry': attempts + 1},
                             dont_filter=True)
    for row in rows:
        company_name = clean_str_strong(
            re.search(' target="_blank">(.*?)</a>', row, re.DOTALL).group(1))
        company_id = re.search('<a href="/simu/c-(.*?).html" ', row,
                               re.DOTALL).group(1)
        established = re.search('<td class="t-establish">(.*?)</td>', row,
                                re.DOTALL).group(1)
        # '---' marks an unknown founding date on the listing page; any
        # other value is an ISO-style date string.
        if established != '---':
            established = datetime.strptime(established, '%Y-%m-%d').date()
        region = re.search('<td class="t-area">(.*?)</td>', row,
                           re.DOTALL).group(1)
        detail_url = 'https://www.jfz.com/simu/c-%s.html' % company_id
        yield scrapy.Request(detail_url,
                             cookies=self.cookies,
                             callback=self.parse_info,
                             meta={'org_name': company_name,
                                   'org_id': company_id,
                                   'retry': 0,
                                   'found_date': established,
                                   'area': region},
                             dont_filter=True)
def parse_info(self, response):
    """Parse an organisation detail page (XPath variant).

    Yields three item types:

    * ``dOrgInfoItem`` — basic company facts (full name, core member,
      fund count, registered capital) read from the "states" panel;
    * ``dOrgFundItem`` — one per fund row in the current ("active")
      and historical fund tables;
    * ``dOrgDpItem``   — free-text company profile / investment idea.
    """
    self.log(response.url)
    org_name = response.meta['org_name'].strip()
    org_id = response.meta['org_id']

    item = dOrgInfoItem()
    item['org_name'] = org_name
    item['org_id'] = org_id
    # FIX: found_date and area were each assigned twice in the original;
    # the duplicate writes were dead code and are removed.
    item['found_date'] = response.meta['found_date']
    item['source_id'] = '020002'
    item['area'] = response.meta['area']
    states = '//div[@class="v4_simu_pro_info_states"]'
    item['org_full_name'] = response.xpath(
        states + '//li[@class="states_item states_item_large"]'
                 '//span[@class="content"]/text()').extract()[0]
    item['core_member'] = response.xpath(
        states + '//li[@class="states_item"][3]'
                 '//span[@class="content"]//text()').extract()[0]
    item['funds_num'] = response.xpath(
        states + '//li[@class="states_item"][2]'
                 '//span[@class="content"]//text()').extract()[0]
    item['reg_capital'] = response.xpath(
        states + '//li[@class="states_item"][4]'
                 '//span[@class="content"]//text()').extract()[0]
    item['version'] = self.version
    yield item

    # The current ("fm_tab_con active") and historical ("fm_tab_con")
    # fund tables share one row layout; the two previously duplicated
    # loops are folded into one.
    fund_box = ('//div[@class="v4_simu_pro_box fm_products"]'
                '//div[@class="v4_simu_pro_box_bd"]')
    for tab_xpath in (fund_box + '//div[@class="fm_tab_con active"]//tr',
                      fund_box + '//div[@class="fm_tab_con"]//tr'):
        rows = response.xpath(tab_xpath)
        for row in rows[1:]:  # first row is the table header
            fund = dOrgFundItem()
            fund['org_name'] = org_name
            fund['org_id'] = org_id
            fund['source_id'] = '020002'
            fund['fund_name'] = clean_str_strong(
                row.xpath('.//td[@class="tl"]//a/text()').extract()[0]).strip()
            # FIX: escape the dot so ".html" is matched literally.
            fund['fund_id'] = re.search(
                r'/simu/p-(.+?)\.html',
                row.xpath('.//td[@class="tl"]//a/@href').extract()[0],
                re.DOTALL).group(1)
            fund['version'] = self.version
            yield fund

    profile = response.xpath(
        u'//div[div[span[contains(text(),"公司简介")]]]/following-sibling::div'
    ).xpath('string(.)').extract_first()
    investment_idea = response.xpath(
        u'//div[div[span[contains(text(),"投资理念")]]]/following-sibling::div'
    ).xpath('string(.)').extract_first()
    dp = dOrgDpItem()
    # Consistency fix: the other items use the stripped org name; the
    # original left this one un-stripped.
    dp['org_name'] = org_name
    dp['org_id'] = org_id
    dp['source_id'] = '020002'
    if profile:
        dp['profile'] = clean_str_strong(profile.strip())
    if investment_idea:
        dp['investment_idea'] = clean_str_strong(investment_idea.strip())
    dp['version'] = self.version
    yield dp
def parse_info(self, response):
    """Parse an organisation detail page (BeautifulSoup variant).

    Yields:

    * ``dOrgInfoItem``   — basic company facts from the "states" panel;
    * ``dOrgFundItem``   — one per fund row (id prefixed with '05');
    * ``dOrgDpItem``     — company profile / investment-idea text;
    * ``dOrgPersonItem`` — the linked fund manager, when present.
    """
    self.log(response.url)
    soup = BeautifulSoup(response.body, 'lxml')
    item = dOrgInfoItem()
    item['org_name'] = response.meta['org_name'].strip()
    item['org_id'] = response.meta['org_id']
    # FIX: found_date and area were each assigned twice in the original;
    # the duplicate writes were dead code and are removed.
    item['found_date'] = response.meta['found_date']
    item['source_id'] = '020002'
    item['area'] = response.meta['area']
    item['org_full_name'] = response.xpath(
        '//div[@class="v4_simu_pro_info_states"]//li[@class="states_item states_item_large"]//span[@class="content"]/text()'
    ).extract()[0]
    item['core_member'] = response.xpath(
        '//div[@class="v4_simu_pro_info_states"]//li[@class="states_item"][3]//span[@class="content"]//text()'
    ).extract()[0]
    item['funds_num'] = response.xpath(
        '//div[@class="v4_simu_pro_info_states"]//li[@class="states_item"][2]//span[@class="content"]//text()'
    ).extract()[0]
    item['reg_capital'] = response.xpath(
        '//div[@class="v4_simu_pro_info_states"]//li[@class="states_item"][4]//span[@class="content"]//text()'
    ).extract()[0]
    item['version'] = self.version
    yield item

    # Current ("fm_tab_con active") and historical ("fm_tab_con") fund
    # tables share one row layout; both loops are folded into one.
    fund_box = ('//div[@class="v4_simu_pro_box fm_products"]'
                '//div[@class="v4_simu_pro_box_bd"]')
    for tab_xpath in (fund_box + '//div[@class="fm_tab_con active"]//tr',
                      fund_box + '//div[@class="fm_tab_con"]//tr'):
        rows = response.xpath(tab_xpath)
        for row in rows[1:]:  # first row is the table header
            fund = dOrgFundItem()
            fund['org_name'] = response.meta['org_name'].strip()
            fund['org_id'] = response.meta['org_id']
            fund['source_id'] = '020002'
            fund['fund_name'] = clean_str_strong(
                row.xpath('.//td[@class="tl"]//a/text()').extract()[0]).strip()
            # '05' is a source-specific id prefix (kept from the original).
            fund['fund_id'] = '05' + re.search(
                '/simu/p-(.+?).html',
                row.xpath('.//td[@class="tl"]//a/@href').extract()[0],
                re.DOTALL).group(1)
            fund['version'] = self.version
            yield fund

    # FIX: soup.find() returns None when the section is missing; the
    # original dereferenced the 公司简介 result unconditionally and could
    # crash with AttributeError. Guard it the same way 投资理念 was guarded.
    profile = soup.find('span', text=u'公司简介')
    if profile:
        profile = profile.find_next('div').text.strip()
    investment_idea = soup.find('span', text=u'投资理念')
    if investment_idea:
        investment_idea = investment_idea.find_next('div').text.strip()
    item4 = dOrgDpItem()
    item4['org_name'] = response.meta['org_name']
    item4['org_id'] = response.meta['org_id']
    item4['source_id'] = '020002'
    item4['profile'] = profile
    item4['investment_idea'] = investment_idea
    item4['version'] = self.version
    yield item4

    person_url = response.xpath(
        '//div[@class="v4_simu_pro_info_states"]//li[@class="states_item"][3]//span[@class="content"]//a/@href'
    ).extract()
    if person_url:
        person_id = re.search('/simu/m-(.+?).html', person_url[0],
                              re.DOTALL).group(1)
        item5 = dOrgPersonItem()
        item5['org_name'] = response.meta['org_name'].strip()
        item5['org_id'] = response.meta['org_id']
        item5['source_id'] = '020002'
        item5['person_id'] = person_id
        item5['person_name'] = response.xpath(
            '//div[@class="v4_simu_pro_info_states"]//li[@class="states_item"][3]//span[@class="content"]//a/text()'
        ).extract()[0]
        item5['duty'] = '基金经理'
        item5['is_current'] = 1
        item5['version'] = self.version
        yield item5
def parse_info(self, response):
    """Parse a fund-manager detail page.

    Yields:

    * ``FundPersonItem``  — the manager's profile (company, background,
      years of experience, fund count);
    * ``dPersonOrgItem``  — manager ↔ current organisation link;
    * ``dPersonFundItem`` — one per fund, current (active tab,
      ``is_current=1``) and historical (``is_current=0``).
    """
    self.log(response.url)
    fund_count = response.xpath(
        '//div[@class="detail_box_2"]//tr[2]//td//text()').extract()[0]
    raw_company = response.xpath(
        '//div[@class="detail_box_2"]//li[@class="attr_item attr_item_last"]//td[1]//text()'
    ).extract()[1]
    company = clean_str_strong(raw_company)
    company_links = response.xpath(
        '//div[@class="detail_box_2"]//li[@class="attr_item attr_item_last"]//td[1]//a//@href'
    ).extract()
    # The company link is optional on the page; without it the org id is
    # recorded as None.
    if company_links:
        company_id = re.findall('/simu/c-(.+?).html', company_links[0],
                                re.DOTALL)[0]
    else:
        company_id = None
    education = response.xpath(
        '//div[@class="detail_box_2"]//tr[3]//td//text()').extract()[2]
    experience = response.xpath(
        u'//descendant::th[text()="从业年限:"]/following-sibling::td//text()'
    ).extract()[0]

    person = FundPersonItem()
    person['user_id'] = response.meta['person_id']
    person['user_name'] = response.meta['person_name']
    person['org_name'] = company
    person['duty'] = '基金经理'
    person['background'] = education
    person['investment_years'] = experience
    person['funds_num'] = fund_count
    person['data_source'] = 4
    person['data_source_name'] = '金斧子'
    person['version'] = self.version
    yield person

    link = dPersonOrgItem()
    link['person_id'] = response.meta['person_id']
    link['person_name'] = response.meta['person_name']
    link['org_name'] = company
    link['org_id'] = company_id
    link['source_id'] = '020002'
    link['is_current'] = 1
    link['version'] = self.version
    yield link

    # Current funds (active tab) first, then historical funds — the two
    # tables share one row layout and differ only in the is_current flag.
    for table_xpath, current_flag in (
            ('//div[@class="tab_con active"]//tr', 1),
            ('//div[@class="tab_con"]//tr', 0)):
        for row in response.xpath(table_xpath)[1:]:
            if not row:
                continue
            row_html = row.extract()
            fund = dPersonFundItem()
            fund['person_id'] = response.meta['person_id']
            fund['person_name'] = response.meta['person_name']
            fund['source_id'] = '020002'
            fund['fund_name'] = re.search(
                '<a href="(.+?)" target="_blank" title="(.+?)">(.*?)</a>',
                row_html, re.DOTALL).group(2)
            fund['fund_id'] = re.search(
                '<a href="/simu/p-(.*)\.html" target="_blank" ',
                row_html, re.DOTALL).group(1)
            fund['is_current'] = current_flag
            fund['version'] = self.version
            yield fund