    def parse1(self, response):
        bs_obj = bs4.BeautifulSoup(response.text, 'html.parser')
        item = response.meta['item']
        try:
            if item['fund_code'] in blacklist:
                raise Exception('This fund is blacklisted')

            # Fund information
            e_div = bs_obj.find('div', class_='infoOfFund')
            e_table = e_div.table
            # Join all <td> texts with commas, then strip whitespace and &nbsp;
            s = re.sub(r'\s+|\xa0', '', u','.join([e.get_text(strip=True) for e in e_table.find_all('td')]))
            ser = pd.Series(dtype=object)
            # Field labels as they appear on the page
            d = {
                u'fund_type': u'基金类型:',
                u'fund_size': u'基金规模:',
                u'found_date': u'成立日:',
                u'manager': u'管理人:'
            }
            ser['fund_code'] = item['fund_code']
            for key in d:
                # Extract the text between the label and the next comma
                m = re.search(ur'(?<=%s).+?(?=,)' % d[key], s)
                ser[key] = m.group() if m else None

            df = pd.DataFrame(ser).T
            df.index = [item['fund_code'], ]

            mysql_connecter.insert_df_data(df, 'fund_info', method='UPDATE')

        except Exception:
            log_obj.error("Failed to parse in %s\nReason: %s" %
                          (self.name, traceback.format_exc()))
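
# A standalone, minimal sketch of the label-extraction above, runnable without
# Scrapy. The sample string is a hypothetical td-join for an eastmoney fund
# page; the real page may order or word the fields differently.
import re

sample = (u'基金类型:混合型,基金规模:12.34亿元(2020-03-31),'
          u'成立日:2015-05-20,管理人:某某基金,托管人:某某银行')
labels = {
    u'fund_type': u'基金类型:',
    u'fund_size': u'基金规模:',
    u'found_date': u'成立日:',
    u'manager': u'管理人:',
}
for name, label in labels.items():
    # Same lookbehind pattern as parse1: text between the label and the next comma
    m = re.search(u'(?<=%s).+?(?=,)' % label, sample)
    print('%s: %s' % (name, m.group() if m else None))
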
# Example #2
    def parse1(self, response):
        item = response.meta['item']
        try:
            # The endpoint returns a list literal of comma-separated rows
            str_list = eval(response.text)
            rows = []
            for s in str_list:
                l = s.split(',')
                rows.append(pd.Series(
                    [l[1], l[2], item['plate_name'], item['plate_type']],
                    index=['stock_code', 'stock_name', 'plate_name', 'plate_type']))
            df = pd.DataFrame(rows)

            if not df.empty:
                df['crawler_key'] = df['stock_code'] + '/' + df['plate_name']
                mysql_connecter.insert_df_data(df, 'stock_info', method='UPDATE')

        except Exception:
            log_obj.error("Failed to parse in %s\nReason: %s" %
                          (self.name, traceback.format_exc()))
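
# eval() above executes whatever the server returns. If the payload really is a
# plain list literal of comma-separated rows (the body below is hypothetical),
# ast.literal_eval parses it without executing arbitrary code. A minimal sketch:
import ast

payload = u'["1,600000,AAA Bank", "2,600036,BBB Bank"]'  # hypothetical response body
for row in ast.literal_eval(payload):
    fields = row.split(',')
    print('%s -> %s' % (fields[1], fields[2]))  # stock_code -> stock_name
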
# Example #3
    def parse(self, response):
        item = response.meta['item']
        try:
            print u"正在爬取%s板块的历史资金流数据" %item['plate_name']
            bs_obj = bs4.BeautifulSoup(response.text, 'html.parser')
            e_table = bs_obj.find('table', id='tb_lishi')

            plate_name = item['plate_name']

            df = pd.read_html(e_table.prettify(encoding='utf8'), encoding='utf8')[0]
            # Page headers, in order: 日期; then 净流入净额 and 净占比 for
            # 主力 / 超大单 / 大单 / 中单 / 小单
            df.columns = [
                'value_date',
                'main_flow_amount',
                'main_flow_ratio',
                'super_flow_amount',
                'super_flow_ratio',
                'big_flow_amount',
                'big_flow_ratio',
                'median_flow_amount',
                'median_flow_ratio',
                'small_flow_amount',
                'small_flow_ratio'
            ]
            df['plate_name'] = plate_name
            df['crawler_key'] = df['plate_name'] + '/' + df['value_date']

            if not df.empty:
                mysql_connecter.insert_df_data(df, 'capital_flow_data', method='UPDATE')

        except Exception:
            log_obj.error("Failed to parse in %s\nReason: %s" %
                          (self.name, traceback.format_exc()))
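
# mysql_connecter is not shown in these snippets. A minimal sketch of the upsert
# that insert_df_data(df, table, method='UPDATE') presumably performs, assuming
# a MySQL table with a unique key on crawler_key and pymysql for connectivity
# (connection details are hypothetical):
import pymysql

def insert_df_upsert(df, table, conn):
    cols = list(df.columns)
    col_sql = ', '.join('`%s`' % c for c in cols)
    placeholders = ', '.join(['%s'] * len(cols))
    updates = ', '.join('`%s`=VALUES(`%s`)' % (c, c) for c in cols)
    sql = ('INSERT INTO `%s` (%s) VALUES (%s) ON DUPLICATE KEY UPDATE %s'
           % (table, col_sql, placeholders, updates))
    with conn.cursor() as cur:
        # One round trip for all rows; duplicates on the unique key are updated in place
        cur.executemany(sql, [tuple(row) for row in df.itertuples(index=False)])
    conn.commit()

# Usage (hypothetical credentials):
# conn = pymysql.connect(host='localhost', user='crawler', password='...', db='funds')
# insert_df_upsert(df, 'capital_flow_data', conn)
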
# Example #4
    def parse1(self, response):
        bs_obj = bs4.BeautifulSoup(response.text, 'html.parser')
        item = response.meta['item']

        if item['fund_code'] in blacklist:
            raise Exception('This fund is blacklisted')

        # Read the most recent NAV date recorded in the local database
        latest_date = self.newest_date_df['newest_date'][
            self.newest_date_df['fund_code'] == item['fund_code']]
        if latest_date.empty:
            raise Exception('Fund code %s not found in the local database' %
                            item['fund_code'])
        latest_date = latest_date.iat[0]
        latest_date = datetime.datetime(latest_date.year, latest_date.month,
                                        latest_date.day)  # convert date to datetime

        print "Latest local NAV date for %s is %s" % (item['fund_code'], latest_date)

        try:
            # Intraday NAV estimate
            e_dl = bs_obj.find('dl', class_='dataItem01')
            data = [
                e.get_text(strip=True)
                for e in e_dl.find('dd', class_='dataNums').find_all('span')
            ]
            data_type = e_dl.find('span', class_='sp01').get_text(strip=True)
            data_date = e_dl.find('span', id='gz_gztime').get_text(strip=True)

            # data_date looks like u'(YY-MM-DD HH:MM)'; strip the parentheses
            data_date = datetime.datetime.strptime(
                re.sub(r'\(|\)', '', data_date), '%y-%m-%d %H:%M')
            # Treat Saturday and Sunday as the preceding Friday
            data_date = data_date - datetime.timedelta(
                days=1) if data_date.isoweekday() == 6 else data_date
            data_date = data_date - datetime.timedelta(
                days=2) if data_date.isoweekday() == 7 else data_date

            df = pd.DataFrame(data + [data_type, data_date],
                              index=[u'estimate_net_value', u'change_value',
                                     u'estimate_daily_growth_rate',
                                     u'data_type', u'value_date']).T
            df = df.drop([u'change_value', u'data_type'], axis=1)
            df[u'fund_code'] = item['fund_code']
            df[u'value_date'] = df[u'value_date'].apply(
                lambda date0: date0.strftime('%Y-%m-%d'))
            df[u'crawler_key'] = df[u'fund_code'] + '/' + df[u'value_date']
            df.index = df[u'crawler_key']
            print u"网页日期:", df[u'value_date'].iat[0], u'本地日期:', lastest_date
            # if datetime.datetime.strptime(df[u'value_date'].iat[0],'%Y-%m-%d').date() <= lastest_date.date():
            #     mysql_connecter.update_df_data(df, u'eastmoney_daily_data', u'crawler_key')
            # else:
            #     mysql_connecter.insert_df_data(df, u'eastmoney_daily_data', method='UPDATE')
            if not df.empty:
                mysql_connecter.insert_df_data(df,
                                               'eastmoney_daily_data',
                                               method='UPDATE')
            else:
                print u"无最新数据"
        except Exception:
            log_obj.error("Failed to parse in %s ( %s )\n%s" %
                          (self.name, response.url, traceback.format_exc()))
            with open(u'estimate_nav_debug.html', 'w') as f:  # debug dump
                f.write(response.text)

        try:
            # Fund NAV table
            e_div = bs_obj.find_all('div',
                                    class_='poptableWrap singleStyleHeight01')[
                                        0]  # three tabs: NAV, dividends, rating
            e_table = e_div.table
            df = pd.read_html(e_table.prettify(encoding='utf8'),
                              encoding='utf8',
                              header=0)[0]

            # The page omits the year; assume the current year, then roll dates
            # that would land in the future back one year (e.g. December rows
            # scraped in early January).
            year_num = datetime.datetime.now().year
            df[u'日期'] = pd.to_datetime(
                df[u'日期'].apply(lambda s: '%s-%s' % (year_num, s)))
            now = datetime.datetime.now()
            df[u'日期'] = df[u'日期'].apply(
                lambda d: d.replace(year=d.year - 1) if d > now else d)

            df = df.astype(np.str)
            df[u'crawler_key'] = df[u'日期'].apply(lambda date: "%s/%s" %
                                                 (item['fund_code'], date))
            df[u'fund_code'] = item['fund_code']
            df = df.rename(
                {
                    u'日期': u'value_date',
                    u'单位净值': u'net_asset_value',
                    u'累计净值': u'accumulative_net_value',
                    u'日增长率': u'daily_growth_rate'
                },
                axis=1)
            df.index = df[u'crawler_key']

            if not df.empty:
                mysql_connecter.insert_df_data(df,
                                               'eastmoney_daily_data',
                                               method='UPDATE')
            else:
                print u"无最新数据"
        except Exception:
            log_obj.error("Failed to parse in %s ( %s )\n%s" %
                          (self.name, response.url, traceback.format_exc()))
            with open(u'fund_nav_debug.html', 'w') as f:  # debug dump
                f.write(response.text)

        try:
            # Data embedded in the pingzhongdata .js file
            url = 'http://fund.eastmoney.com/pingzhongdata/%s.js?v=%s' % (
                item['fund_code'],
                datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
            js_data = requests_manager.get_html(url)
            js_data = re.sub(r'\s+', '', js_data)

            def re_func(key):
                # Extract the value of a `var <key>=...;` assignment
                m = re.search(r'(?<=%s\=).+?(?=;)' % key, js_data, re.S)
                return m.group() if m else None

            # Stock position (equity weighting) history
            Data_fundSharesPositions = pd.DataFrame(
                eval(re_func('Data_fundSharesPositions')),
                columns=[u'value_date',
                         u'fund_shares_positions']).astype(np.str)

            # value_date arrives as a millisecond epoch timestamp
            Data_fundSharesPositions[u'value_date'] = Data_fundSharesPositions[
                u'value_date'].apply(lambda s: datetime.datetime.fromtimestamp(
                    int(s[:10])).strftime('%Y-%m-%d'))
            Data_fundSharesPositions[
                u'fund_shares_positions'] = Data_fundSharesPositions[
                    u'fund_shares_positions'] + '%'

            Data_fundSharesPositions[u'crawler_key'] = item[
                'fund_code'] + '/' + Data_fundSharesPositions[u'value_date']
            Data_fundSharesPositions = Data_fundSharesPositions.drop(
                [u'value_date'], axis=1)
            Data_fundSharesPositions.index = Data_fundSharesPositions[
                u'crawler_key']

            if not Data_fundSharesPositions.empty:
                mysql_connecter.insert_df_data(Data_fundSharesPositions,
                                               'eastmoney_daily_data',
                                               method='UPDATE')

        except Exception:
            log_obj.error("Failed to parse in %s ( %s )\n%s" %
                          (self.name, response.url, traceback.format_exc()))
            with open(u'pingzhongdata_js_debug.html', 'w') as f:  # debug dump
                f.write(response.text)
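
# A standalone sketch of the re_func pattern above, run against a hypothetical
# fragment of a pingzhongdata .js file (whitespace already stripped, as in
# parse1; the real file defines many more variables):
import re

js_data = ('varData_fundSharesPositions=[[1546300800000,90.5],'
           '[1546387200000,91.2]];varfS_name="demo";')

def re_func(key):
    m = re.search(r'(?<=%s\=).+?(?=;)' % key, js_data, re.S)
    return m.group() if m else None

print(re_func('Data_fundSharesPositions'))  # -> [[1546300800000,90.5],[1546387200000,91.2]]
print(re_func('Data_missingKey'))           # -> None
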
# Example #5
    def parse1(self, response):
        print u"Preparing to parse:", response.url
        item = response.meta['item']

        if item['fund_code'] in blacklist:
            raise Exception('This fund is blacklisted')

        # Fetch the list of years for which holdings data exist
        url = "http://fund.eastmoney.com/f10/FundArchivesDatas.aspx?type=jjcc&code=%s&topline=200" % (
            item['fund_code'])
        html = requests_manager.get_html(url)
        with open('test1.html', 'w') as f:  # debug dump
            f.write(html)
        m = re.search(r'(?<=arryear:)\[.+?\](?=,)', html)
        year_list = eval(m.group()) if m else None
        if year_list is None:
            raise Exception(u'Bad url %s' % url)

        for year0 in year_list:
            url = "http://fund.eastmoney.com/f10/FundArchivesDatas.aspx?type=jjcc&code=%s&topline=200&year=%s" % (
                item['fund_code'], str(year0))
            html = requests_manager.get_html(url)
            with open('test2.html', 'w') as f:  # debug dump
                f.write(html)

            # Keep only the holdings tables block
            m = re.search(r"<div class='box'>.+</div>", html)
            html = m.group() if m else None
            if html is None:
                raise Exception(u'Bad url %s' % url)

            bs_obj = bs4.BeautifulSoup(html, 'html.parser')

            with open('test3.html', 'w') as f:
                f.write(bs_obj.prettify(encoding='utf8'))

            for e_div in bs_obj.find_all('div', class_="box"):
                title = e_div.find('h4', class_="t").get_text(strip=True)
                print response.url
                print title
                # Parse stock codes as strings so leading zeros survive
                converters = {u'股票代码': str}
                df0 = pd.read_html(e_div.table.prettify(encoding='utf8'),
                                   encoding='utf8',
                                   converters=converters)[0]
                df0.columns = [re.sub(r'\s+', '', s) for s in df0.columns]

                # Collapse verbose headers down to 占净值 / 持股数 / 持仓市值
                def normalize_col(s):
                    m = re.search(ur'占净值|持股数|持仓市值', s)
                    return m.group() if m else s

                df0.columns = [normalize_col(s) for s in df0.columns]

                df0[u'title'] = title
                # title ends with u'截止至:YYYY-MM-DD' (the "as of" date)
                df0[u'cut_off_date'] = title.split(u'截止至:')[-1]
                df0[u'fund_code'] = item[u'fund_code']

                df0[u'year'] = year0

                df0 = df0.rename(
                    {
                        u'股票代码': u'stock_code',
                        u'股票名称': u'stock_name',
                        u'占净值': u'net_value_ratio',
                        u'持股数': u'share_holding',
                        u'持仓市值': u'market_value'
                    },
                    axis=1)

                # Drop the row-number, related-news, latest-price and daily-change columns
                df0 = df0.drop([u'序号', u'相关资讯', u'最新价', u'涨跌幅'],
                               axis=1,
                               errors='ignore')
                df0[u'crawler_key'] = df0[u'fund_code'] + u'/' + df0[
                    u'stock_code'] + u'/' + df0[u'cut_off_date']

                mysql_connecter.insert_df_data(df0,
                                               u'fund_holdings',
                                               method='UPDATE')
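
# A sketch of the arryear extraction above against a hypothetical
# FundArchivesDatas.aspx response body (the real response is a JS assignment,
# not JSON, which is why a regex is used instead of json.loads):
import re

html = 'var apidata={ content:"...", arryear:[2023,2022,2021],curyear:2023};'
m = re.search(r'(?<=arryear:)\[.+?\](?=,)', html)
year_list = eval(m.group()) if m else None  # the matched span is a plain list literal
print(year_list)  # -> [2023, 2022, 2021]
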
# Example #6
    def parse1(self, response):
        bs_obj = bs4.BeautifulSoup(response.text, 'html.parser')
        item = response.meta['item']

        if item['fund_code'] in blacklist:
            raise Exception('This fund is blacklisted')

        try:
            # Data embedded in the pingzhongdata .js file
            url = 'http://fund.eastmoney.com/pingzhongdata/%s.js?v=%s' % (
                item['fund_code'],
                datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
            js_data = requests_manager.get_html(url)
            js_data = re.sub(r'\s+', '', js_data)

            def re_func(key):
                # Extract the value of a `var <key>=...;` assignment
                m = re.search(r'(?<=%s\=).+?(?=;)' % key, js_data, re.S)
                return m.group() if m else None

            # Fund size changes (规模变动)
            Data_fluctuationScale = pd.read_json(re_func('Data_fluctuationScale'))

            for i in range(Data_fluctuationScale.shape[0]):
                ser = pd.Series(Data_fluctuationScale.loc[i, 'series'])
                # mom = change vs previous period; y = net assets (100M CNY).
                # The Chinese labels are kept: they are persisted as json keys.
                ser = ser.rename({'mom': u'较上期环比', 'y': u'净资产规模(亿)'})
                ser['value_date'] = Data_fluctuationScale.loc[i, 'categories']
                Data_fluctuationScale.loc[i, 'series'] = ser.to_json()

            Data_fluctuationScale['fund_code'] = item['fund_code']
            Data_fluctuationScale['data_type'] = u'规模变动'  # 'fund size change'
            Data_fluctuationScale['crawler_key'] = Data_fluctuationScale[
                'fund_code'] + '/' + Data_fluctuationScale[
                    'data_type'] + '/' + Data_fluctuationScale['categories']
            Data_fluctuationScale = Data_fluctuationScale.drop(['categories'],
                                                               axis=1)
            Data_fluctuationScale = Data_fluctuationScale.rename(
                {'series': 'json_data'}, axis=1)

            Data_fluctuationScale.index = Data_fluctuationScale['crawler_key']
            if not Data_fluctuationScale.empty:
                mysql_connecter.insert_df_data(Data_fluctuationScale,
                                               'fund_mixed_data')

            # Holder structure (持有人结构)
            Data_holderStructure = json.loads(re_func('Data_holderStructure'))
            categories = Data_holderStructure['categories']
            series = Data_holderStructure['series']

            d = {d0['name']: d0['data'] for d0 in series}

            df = pd.DataFrame(d, index=categories)
            df['value_date'] = df.index
            # One JSON blob per reporting period (row of df)
            ser = df.T.apply(lambda col: col.to_json())
            ser.name = 'json_data'

            Data_holderStructure = pd.DataFrame(ser, index=categories)
            Data_holderStructure['fund_code'] = item['fund_code']
            Data_holderStructure['data_type'] = u'持有人结构'  # 'holder structure'
            Data_holderStructure['crawler_key'] = item[
                'fund_code'] + '/' + Data_holderStructure[
                    'data_type'] + '/' + Data_holderStructure.index

            Data_holderStructure.index = Data_holderStructure['crawler_key']
            if not Data_holderStructure.empty:
                mysql_connecter.insert_df_data(Data_holderStructure,
                                               'fund_mixed_data')

            # Asset allocation (资产配置)
            Data_assetAllocation = json.loads(re_func('Data_assetAllocation'))
            categories = Data_assetAllocation['categories']
            series = Data_assetAllocation['series']

            d = {d0['name']: d0['data'] for d0 in series}

            df = pd.DataFrame(d, index=categories)
            df['value_date'] = df.index
            ser = df.T.apply(lambda col: col.to_json())
            ser.name = 'json_data'

            Data_assetAllocation = pd.DataFrame(ser, index=categories)
            Data_assetAllocation['fund_code'] = item['fund_code']
            Data_assetAllocation['data_type'] = u'资产配置'  # 'asset allocation'
            Data_assetAllocation['crawler_key'] = item[
                'fund_code'] + '/' + Data_assetAllocation[
                    'data_type'] + '/' + Data_assetAllocation.index

            Data_assetAllocation.index = Data_assetAllocation['crawler_key']
            if not Data_assetAllocation.empty:
                mysql_connecter.insert_df_data(Data_assetAllocation,
                                               'fund_mixed_data')

            # Fund manager change history (基金经理变动一览)
            e_table = bs_obj.find('li', class_='fundManagerTab').table
            df0 = pd.read_html(e_table.prettify(encoding='utf8'),
                               encoding='utf8')[0]
            # The first row holds the real header
            df0.columns = df0.loc[0, :]
            df0.columns.name = None
            df0 = df0.drop([0])
            df0.index = range(df0.shape[0])
            df = pd.DataFrame(
                {
                    'crawler_key': item['fund_code'] + '/' + u'基金经理变动',
                    'fund_code': item['fund_code'],
                    'data_type': u'基金经理变动',  # 'fund manager changes'
                    'json_data': df0.to_json()
                },
                index=[0])
            if not df.empty:
                mysql_connecter.insert_df_data(df,
                                               'fund_mixed_data',
                                               method='UPDATE')

        except Exception:
            log_obj.error("Failed to parse in %s ( %s )\n%s" %
                          (self.name, response.url, traceback.format_exc()))
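
# A standalone sketch of the categories/series flattening used above: a
# highcharts-style dict (the sample below is hypothetical) becomes one JSON
# blob per reporting period, ready for the json_data column of fund_mixed_data.
import pandas as pd

raw = {
    'categories': ['2022-12-31', '2023-06-30'],
    'series': [{'name': u'机构持有比例', 'data': [10.5, 12.0]},
               {'name': u'个人持有比例', 'data': [89.5, 88.0]}],
}
d = {s['name']: s['data'] for s in raw['series']}
df = pd.DataFrame(d, index=raw['categories'])
df['value_date'] = df.index
json_rows = df.T.apply(lambda col: col.to_json())  # one JSON string per period
print(json_rows.iloc[0])  # e.g. {"...持有比例":...,"value_date":"2022-12-31"}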