示例#1
0
 def parse_person(self, response):
     """Parse a fund-manager listing page.

     Retries the page (via meta['retry'], capped at 2) when no records are
     found, otherwise yields one detail-page request per manager record.
     """
     self.log(response.url)
     # Each record spans from the manager cell to the trailing consult link.
     records = re.findall('<td class="t-manager">(.*?)咨询</a>',
                          response.body, re.DOTALL)
     if not records and response.meta['retry'] < 2:
         print('%s未返回有效内容,重试第%d次,上限2次' %
               (response.url, response.meta['retry'] + 1))
         # NOTE(review): the retry goes through self.parse while parse_org
         # retries through itself -- confirm this asymmetry is intended
         # (meta['flag'] suggests self.parse consumes it).
         yield scrapy.Request(response.url,
                              cookies=self.cookies,
                              callback=self.parse,
                              meta={
                                  'flag': response.meta['flag'],
                                  'retry': response.meta['retry'] + 1
                              },
                              dont_filter=True)
     # One search yields both the person id (group 1) and the display name
     # (group 2); the original ran the identical regex twice per record.
     link_re = re.compile(
         '<a href="/simu/m-(.*?).html" target="_blank">(.*?)</a>', re.DOTALL)
     for record in records:
         match = link_re.search(record)
         person_name = clean_str_strong(match.group(2))
         person_id = match.group(1)
         url = 'https://www.jfz.com/simu/m-%s.html' % person_id
         yield scrapy.Request(url,
                              cookies=self.cookies,
                              callback=self.parse_info,
                              meta={
                                  'person_name': person_name,
                                  'person_id': person_id,
                                  'retry': 0
                              },
                              dont_filter=True)
示例#2
0
 def parse_org(self, response):
     """Parse an organisation listing page.

     Retries empty pages up to twice (tracked in meta['retry']); for each
     organisation row, extracts name/id/founding date/area and schedules a
     detail-page request handled by parse_info.
     """
     self.log(response.url)
     records = re.findall('<td class="t-company">(.*?)咨询</a>', response.body, re.DOTALL)
     retry_count = response.meta['retry']
     if not records and retry_count < 2:
         print('%s未返回有效内容,重试第%d次,上限2次' % (response.url, retry_count + 1))
         yield scrapy.Request(response.url, cookies=self.cookies, callback=self.parse_org,
                              meta={'retry': retry_count + 1},
                              dont_filter=True)
     for row in records:
         raw_name = re.search(' target="_blank">(.*?)</a>', row, re.DOTALL).group(1)
         org_name = clean_str_strong(raw_name)
         org_id = re.search('<a href="/simu/c-(.*?).html" ', row, re.DOTALL).group(1)
         found_date = re.search('<td class="t-establish">(.*?)</td>', row, re.DOTALL).group(1)
         # '---' marks a missing founding date; keep that sentinel string,
         # otherwise convert to a datetime.date.
         if found_date != '---':
             found_date = datetime.strptime(found_date, '%Y-%m-%d').date()
         area = re.search('<td class="t-area">(.*?)</td>', row, re.DOTALL).group(1)
         detail_url = 'https://www.jfz.com/simu/c-%s.html' % org_id
         yield scrapy.Request(detail_url, cookies=self.cookies, callback=self.parse_info,
                              meta={'org_name': org_name, 'org_id': org_id, 'retry': 0,
                                    'found_date': found_date, 'area': area},
                              dont_filter=True)
示例#3
0
    def parse_info(self, response):
        """Parse an organisation detail page (XPath variant).

        Yields, in order:
          * one dOrgInfoItem with the header statistics,
          * one dOrgFundItem per fund row in both product tabs,
          * one dOrgDpItem with the profile / investment-idea text.
        """
        self.log(response.url)
        item = dOrgInfoItem()
        item['org_name'] = response.meta['org_name'].strip()
        item['org_id'] = response.meta['org_id']
        # found_date / area are carried over from the listing page via meta.
        # (The original assigned both twice; the duplicates are removed.)
        item['found_date'] = response.meta['found_date']
        item['area'] = response.meta['area']
        item['source_id'] = '020002'
        item['org_full_name'] = response.xpath(
            '//div[@class="v4_simu_pro_info_states"]//li[@class="states_item states_item_large"]//span[@class="content"]/text()'
        ).extract()[0]
        item['core_member'] = response.xpath(
            '//div[@class="v4_simu_pro_info_states"]//li[@class="states_item"][3]//span[@class="content"]//text()'
        ).extract()[0]
        item['funds_num'] = response.xpath(
            '//div[@class="v4_simu_pro_info_states"]//li[@class="states_item"][2]//span[@class="content"]//text()'
        ).extract()[0]
        item['reg_capital'] = response.xpath(
            '//div[@class="v4_simu_pro_info_states"]//li[@class="states_item"][4]//span[@class="content"]//text()'
        ).extract()[0]
        item['version'] = self.version
        yield item
        # The "active" and plain product tabs share an identical row layout;
        # the original duplicated this whole loop for each tab.  Note that
        # @class="fm_tab_con" is an exact match, so the two selections are
        # disjoint and the yield order (active first) is preserved.
        fund_tab_xpaths = (
            '//div[@class="v4_simu_pro_box fm_products"]//div[@class="v4_simu_pro_box_bd"]//div[@class="fm_tab_con active"]//tr',
            '//div[@class="v4_simu_pro_box fm_products"]//div[@class="v4_simu_pro_box_bd"]//div[@class="fm_tab_con"]//tr',
        )
        for tab_xpath in fund_tab_xpaths:
            rows = response.xpath(tab_xpath)
            for row in rows[1:]:  # skip the header row
                fund_item = dOrgFundItem()
                fund_item['org_name'] = response.meta['org_name'].strip()
                fund_item['org_id'] = response.meta['org_id']
                fund_item['source_id'] = '020002'
                fund_item['fund_name'] = clean_str_strong(
                    row.xpath('.//td[@class="tl"]//a/text()').extract()[0]).strip()
                fund_item['fund_id'] = re.search(
                    '/simu/p-(.+?).html',
                    row.xpath('.//td[@class="tl"]//a/@href').extract()[0],
                    re.DOTALL).group(1)
                fund_item['version'] = self.version
                yield fund_item
        profile = response.xpath(
            u'//div[div[span[contains(text(),"公司简介")]]]/following-sibling::div'
        ).xpath('string(.)').extract_first()
        investment_idea = response.xpath(
            u'//div[div[span[contains(text(),"投资理念")]]]/following-sibling::div'
        ).xpath('string(.)').extract_first()
        item4 = dOrgDpItem()
        item4['org_name'] = response.meta['org_name']
        item4['org_id'] = response.meta['org_id']
        item4['source_id'] = '020002'
        # Either text block may be absent; only set the fields when found.
        if profile:
            item4['profile'] = clean_str_strong(profile.strip())
        if investment_idea:
            item4['investment_idea'] = clean_str_strong(
                investment_idea.strip())
        item4['version'] = self.version
        yield item4
示例#4
0
    def parse_info(self, response):
        """Parse an organisation detail page (BeautifulSoup variant).

        Yields a dOrgInfoItem, one dOrgFundItem per product row, a dOrgDpItem
        with the profile texts and, when a manager link is present, a
        dOrgPersonItem for the fund manager.
        """
        self.log(response.url)
        soup = BeautifulSoup(response.body, 'lxml')
        item = dOrgInfoItem()
        item['org_name'] = response.meta['org_name'].strip()
        item['org_id'] = response.meta['org_id']
        # Carried over from the listing page via meta (the original assigned
        # both found_date and area twice; the duplicates are removed).
        item['found_date'] = response.meta['found_date']
        item['area'] = response.meta['area']
        item['source_id'] = '020002'
        item['org_full_name'] = response.xpath(
            '//div[@class="v4_simu_pro_info_states"]//li[@class="states_item states_item_large"]//span[@class="content"]/text()').extract()[
            0]
        item['core_member'] = response.xpath(
            '//div[@class="v4_simu_pro_info_states"]//li[@class="states_item"][3]//span[@class="content"]//text()').extract()[
            0]
        item['funds_num'] = response.xpath(
            '//div[@class="v4_simu_pro_info_states"]//li[@class="states_item"][2]//span[@class="content"]//text()').extract()[
            0]
        item['reg_capital'] = response.xpath(
            '//div[@class="v4_simu_pro_info_states"]//li[@class="states_item"][4]//span[@class="content"]//text()').extract()[
            0]
        item['version'] = self.version
        yield item
        # Both product tabs share the same row layout; @class="fm_tab_con"
        # is an exact match, so the selections are disjoint and the original
        # yield order (active tab first) is preserved.
        fund_tab_xpaths = (
            '//div[@class="v4_simu_pro_box fm_products"]//div[@class="v4_simu_pro_box_bd"]//div[@class="fm_tab_con active"]//tr',
            '//div[@class="v4_simu_pro_box fm_products"]//div[@class="v4_simu_pro_box_bd"]//div[@class="fm_tab_con"]//tr',
        )
        for tab_xpath in fund_tab_xpaths:
            for row in response.xpath(tab_xpath)[1:]:  # skip the header row
                fund_item = dOrgFundItem()
                fund_item['org_name'] = response.meta['org_name'].strip()
                fund_item['org_id'] = response.meta['org_id']
                fund_item['source_id'] = '020002'
                fund_item['fund_name'] = clean_str_strong(
                    row.xpath('.//td[@class="tl"]//a/text()').extract()[0]).strip()
                # '05' is a source-specific id prefix used by this variant.
                fund_item['fund_id'] = '05' + re.search(
                    '/simu/p-(.+?).html',
                    row.xpath('.//td[@class="tl"]//a/@href').extract()[0],
                    re.DOTALL).group(1)
                fund_item['version'] = self.version
                yield fund_item
        # Guard both lookups: the original crashed with AttributeError when
        # the 公司简介 span was missing, while 投资理念 was already guarded.
        profile = soup.find('span', text=u'公司简介')
        if profile:
            profile = profile.find_next('div').text.strip()
        investment_idea = soup.find('span', text=u'投资理念')
        if investment_idea:
            investment_idea = investment_idea.find_next('div').text.strip()
        item4 = dOrgDpItem()
        item4['org_name'] = response.meta['org_name']
        item4['org_id'] = response.meta['org_id']
        item4['source_id'] = '020002'
        item4['profile'] = profile
        item4['investment_idea'] = investment_idea
        item4['version'] = self.version
        yield item4
        person_url = response.xpath(
            '//div[@class="v4_simu_pro_info_states"]//li[@class="states_item"][3]//span[@class="content"]//a/@href').extract()
        if person_url:
            person_id = re.search('/simu/m-(.+?).html', person_url[0], re.DOTALL).group(1)
            item5 = dOrgPersonItem()
            item5['org_name'] = response.meta['org_name'].strip()
            item5['org_id'] = response.meta['org_id']
            item5['source_id'] = '020002'
            item5['person_id'] = person_id
            item5['person_name'] = response.xpath(
                '//div[@class="v4_simu_pro_info_states"]//li[@class="states_item"][3]//span[@class="content"]//a/text()').extract()[
                0]
            item5['duty'] = '基金经理'
            item5['is_current'] = 1
            item5['version'] = self.version
            yield item5
示例#5
0
 def parse_info(self, response):
     """Parse a fund-manager detail page.

     Yields a FundPersonItem with the manager's statistics, a dPersonOrgItem
     linking the manager to an organisation, and one dPersonFundItem per fund
     (current funds from the active tab, past funds from the inactive tab).
     """
     self.log(response.url)
     funds_num = response.xpath(
         '//div[@class="detail_box_2"]//tr[2]//td//text()').extract()[0]
     org = response.xpath(
         '//div[@class="detail_box_2"]//li[@class="attr_item attr_item_last"]//td[1]//text()'
     ).extract()[1]
     org_name = clean_str_strong(org)
     org_url = response.xpath(
         '//div[@class="detail_box_2"]//li[@class="attr_item attr_item_last"]//td[1]//a//@href'
     ).extract()
     # The organisation link is optional; fall back to a null org_id.
     if org_url:
         org_id = re.findall('/simu/c-(.+?).html', org_url[0], re.DOTALL)[0]
     else:
         org_id = None
     background = response.xpath(
         '//div[@class="detail_box_2"]//tr[3]//td//text()').extract()[2]
     investment_years = response.xpath(
         u'//descendant::th[text()="从业年限:"]/following-sibling::td//text()'
     ).extract()[0]
     item = FundPersonItem()
     item['user_id'] = response.meta['person_id']
     item['user_name'] = response.meta['person_name']
     item['org_name'] = org_name
     item['duty'] = '基金经理'
     item['background'] = background
     item['investment_years'] = investment_years
     item['funds_num'] = funds_num
     item['data_source'] = 4
     item['data_source_name'] = '金斧子'
     item['version'] = self.version
     yield item
     item1 = dPersonOrgItem()
     item1['person_id'] = response.meta['person_id']
     item1['person_name'] = response.meta['person_name']
     item1['org_name'] = org_name
     item1['org_id'] = org_id
     item1['source_id'] = '020002'
     item1['is_current'] = 1
     item1['version'] = self.version
     yield item1
     # Compile the shared row patterns once instead of per iteration.  The
     # fund-id pattern is made non-greedy ('(.+?)') to match the style used
     # elsewhere; greedy '(.*)' would over-capture if a row contained more
     # than one matching anchor.
     name_re = re.compile(
         '<a href="(.+?)" target="_blank" title="(.+?)">(.*?)</a>', re.DOTALL)
     id_re = re.compile(
         '<a href="/simu/p-(.+?)\\.html" target="_blank" ', re.DOTALL)
     # Active tab rows are currently-managed funds (is_current=1); the plain
     # tab holds past funds (is_current=0).  The original duplicated this
     # loop and carried a pointless truthiness check on each selector.
     for tab_xpath, is_current in (('//div[@class="tab_con active"]//tr', 1),
                                   ('//div[@class="tab_con"]//tr', 0)):
         for row in response.xpath(tab_xpath)[1:]:  # skip the header row
             html = row.extract()
             fund_item = dPersonFundItem()
             fund_item['person_id'] = response.meta['person_id']
             fund_item['person_name'] = response.meta['person_name']
             fund_item['source_id'] = '020002'
             fund_item['fund_name'] = name_re.search(html).group(2)
             fund_item['fund_id'] = id_re.search(html).group(1)
             fund_item['is_current'] = is_current
             fund_item['version'] = self.version
             yield fund_item