def run1(date):
    # Build the npx_big universe: Japanese names outside jpn400, plus
    # ex-Japan names whose 20-day ADV is above their country median.
    jpntop = lb.load_production_universe('jpn400', date)
    univ = lb.load_production_universe('npxchnpak', date)
    univ['cnt'] = map(lambda x: x.upper()[:3], univ.index)

    # Japan: keep only names that are not already in jpn400
    jpn = univ[univ['cnt'] == 'JPN']
    ix = jpn.index.isin(jpntop.index)
    jpn = jpn.ix[~ix]
    jpn = jpn.reset_index().rename(columns={'BARRID': 'barrid'})

    # Ex-Japan: keep names with above-median 20-day ADV within each country
    univ = univ[univ['cnt'] != 'JPN']
    ax = loadaxioma(date)
    data = pandas.merge(ax, univ, left_on='barrid', right_index=True)
    med = data.groupby('cnt')['adv_20d'].median()
    data = pandas.merge(data,
                        pandas.DataFrame(med, columns=['med_adv']),
                        left_on='cnt',
                        right_index=True)
    print med
    data = data[data['adv_20d'] > data['med_adv']]

    data = data.append(jpn.reset_index(), ignore_index=True)
    data = data[['barrid', 'SEDOL', 'NATURAL', 'VALID_UNTIL']]
    data['VALID_UNTIL'] = data['VALID_UNTIL'].apply(
        lambda x: x.strftime('%Y%m%d'))
    data['NATURAL'] = data['NATURAL'].astype(int)
    data.to_csv('/research/data/universe/npx_big/npx_big%s.univ' %
                date.strftime('%Y%m%d'),
                sep='|',
                header=False,
                index=False)
def saveCSV(date):
    # Pull the TopDTC short-color rows for Asia/Japan as of `date`, keep one
    # row per barrid, restrict to the production universe, and write a CSV
    # into the optimizer directory.
    sql = '''SELECT isnull(tm.barrid, sm.barrid) as barrid, sm.sedol, sm.name,
                    sm.localid, sm.listed_country, s4.datadate, s4.category,
                    s4.rank, s4.region, s4.country, s4.sector
             FROM [wzhu].[dbo].[shortcolor4] s4
             join [nipun_prod].[dbo].[security_master] sm
                 on s4.datadate between sm.datadate and isnull(sm.stopdate, '2050-01-01')
                 and s4.sedol = sm.sedol
                 and s4.section = 'TopDTC'
                 and s4.region in ('AS', 'JP')
                 and s4.datadate = '{:%Y%m%d}'
             left join nipun_prod..thai_barrid_map tm
                 on left(sm.barrid, 6) = tm.root_id
          '''.format(date)
    dbo = db.db(connect='qai')
    data = dbo.query(sql, df=True)

    # keep the best-ranked row per barrid
    data = data.sort(['barrid', 'rank'])
    data = data.drop_duplicates(['barrid'], take_last=False)
    data = data.sort(['category', 'region', 'rank'])

    # restrict to the production universe
    univ = lb.load_production_universe(UNIVERSE, date).index
    data = data[map(lambda x: x in univ, data['barrid'])]

    fout = "{}/shortcolorDTC.{:%Y%m%d}.csv".format(OPT_DIR, date)
    data.to_csv(fout,
                index=False,
                columns=[
                    'barrid', 'sedol', 'name', 'localid', 'listed_country',
                    'datadate', 'category', 'region', 'rank', 'sector'
                ])
def run(date):
    ### load forward returns and recent momentums
    retdata = rets.daily_resrets(date + pandas.datetools.BDay(30),
                                 lookback=60 + RET_WINDOW)
    fret = gen_fwd_returns(retdata.copy(), date, [5, 10, 20])
    exrets = gen_pmom_returns(retdata.copy(), date, pmom=PMOM_BREAK)

    rsk = lb.loadrsk2('ase1jpn', 'S', date, daily=True)
    rsk = rsk[['COUNTRY', 'USD_CAPT']]

    ### load returns, compute residual cumulative
    ret = rets.daily_resrets(date, lookback=RET_WINDOW)

    ### load volume and signal data
    voldata = vol.load_volume(date, window=VOL_WINDOW)
    voldata = gen_vol(voldata, date, STVOL_WINDOW)
    signals = load_signals(date)

    data = pandas.merge(exrets, voldata, left_index=True, right_index=True,
                        how='inner')
    data = pandas.merge(data, signals, left_index=True, right_index=True,
                        how='inner')
    data = pandas.merge(data, fret, left_index=True, right_index=True,
                        how='inner')
    data = pandas.merge(data, rsk, left_index=True, right_index=True,
                        how='inner')
    data['datadate'] = date

    univ = lb.load_production_universe('npxchnpak', date)
    data = data[data.index.isin(univ.index)]
    print len(data)

    gdata.write_gce(data,
                    'users',
                    'dsargent/{version}/{dt}.pd'.format(
                        version=VERSION, dt=date.strftime('%Y%m%d')),
                    enable_compression=True)
    return
def run1(date):
    # Split the Korean slice of npxchnpak into small-cap / big-cap halves
    # by local market cap and write one .univ file per half.
    RSKDATA = lb.loadrsk2("ASE1JPN", "S", date, daily=True)
    npxchnpak = lb.load_production_universe('npxchnpak', date)
    data = pandas.merge(RSKDATA[['COUNTRY', 'LOC_CAPT']], npxchnpak,
                        left_index=True, right_index=True)
    data = data[data['COUNTRY'] == 'KOR']
    data['VALID_UNTIL'] = data['VALID_UNTIL'].map(
        lambda x: x.strftime('%Y%m%d'))

    data['rank'] = data['LOC_CAPT'].rank()
    data = data.sort('rank')
    nbot = len(data) / 2
    ntop = len(data) - nbot
    bot = data.head(nbot)
    top = data.tail(ntop)
    print bot.head()
    print top.head()

    bot = bot[['SEDOL', 'NATURAL', 'VALID_UNTIL']]
    top = top[['SEDOL', 'NATURAL', 'VALID_UNTIL']]
    bot.to_csv("/research/data/universe/kor_small/kor_small" +
               date.strftime('%Y%m%d') + ".univ",
               header=False, sep='|')
    top.to_csv("/research/data/universe/kor_big/kor_big" +
               date.strftime('%Y%m%d') + ".univ",
               header=False, sep='|')
def pair(as_of_date):
    univ = lb.load_production_universe('npxchnpak', as_of_date)
    barrids = univ.index.tolist()
    sql = '''
    select sm.barrid parent_barrid, sm.name parent_name, oh.*
    from dsargent..owner_holdings_all oh
    join qai..ownsecmap os on oh.ownercode=os.ownercode
    join nipun_prod..security_master sm
        on os.securitycode=sm.own_id
        and oh.qtrdate between sm.datadate and isnull(sm.stopdate, '20500101')
    where qtrdate <= '%(max_date)s'
        and qtrdate >= '%(min_date)s'
        and sharesheld>=0 and sharesout>=0
        and sharesheld/sharesout > %(min_threshold)s
    '''
    repl = {
        'min_threshold': min_threshold,
        'max_date': (as_of_date - pandas.datetools.MonthEnd() * 3 +
                     datetime.timedelta(1)).strftime('%Y%m%d'),
        'min_date': (as_of_date - pandas.datetools.MonthEnd() * 9 +
                     datetime.timedelta(1)).strftime('%Y%m%d')
    }
    ## REPORT DATE MAY BE BACKFILLED, SO NOT USED IN BACKTEST
    ## TO BE CONSERVATIVE, TAKE A 3-MONTH LAG ON DATADATE
    data = dbo.query(sql % repl, df=True)
    print as_of_date
    print (as_of_date - pandas.datetools.MonthEnd() * 3 +
           datetime.timedelta(1)).strftime('%Y%m%d')
    print (as_of_date - pandas.datetools.MonthEnd() * 9 +
           datetime.timedelta(1)).strftime('%Y%m%d')

    ## THE PARENT COMPANY NEEDS TO BE IN THE UNIVERSE; HOLDINGS CAN BE OUT OF THE UNIVERSE
    data = data[data['parent_barrid'].isin(barrids)]

    ## When there are multiple reports on the same qtrdate for the same barrid,
    ## keep the most recent one
    data = data.sort(
        ['parent_barrid', 'barrid', 'qtrdate', 'ownercode', 'reportdate'])
    data = data.drop_duplicates(
        ["parent_barrid", "barrid", "qtrdate", "ownercode"], take_last=True)

    ## When there is more than one qtrdate, keep the most recent one
    data = data.sort(['parent_barrid', 'barrid', 'ownercode', 'qtrdate'])
    data = data.drop_duplicates(["parent_barrid", "barrid", "ownercode"],
                                take_last=True)

    cols = ['valueheld', 'valuechg', 'shareschg', 'sharesheld', 'sharesout']
    data[cols] = data[cols].astype(float)

    ## sum up multiple holdings of the same company held through different
    ## ownercodes of the same parent_barrid
    cols = ['parent_barrid', 'barrid', 'sharesheld', 'valueheld']
    held_sum = data[cols].groupby(['parent_barrid', 'barrid']).sum()
    held_sum = held_sum.rename(columns={
        'sharesheld': 'sharesheld_sum',
        'valueheld': 'valueheld_sum'
    })
    data = pandas.merge(data, held_sum, how='inner',
                        left_on=['parent_barrid', 'barrid'],
                        right_index=True)
    #data['shares_pct_of_holding'] = data['sharesheld'] / data['sharesout']

    ## MERGE IN BOOK EQUITY FROM WS
    be = n.getWSCalc(as_of_date, items=['common_equity_ex_mi_lag0'],
                     production=PRODUCTION)
    ttm_current = be.pick_ttm('current')
    be = be.join(ttm_current)
    idx = be['common_equity_ex_mi_lag0'] <= 0
    be['common_equity_ex_mi_lag0'][idx] = np.nan
    print as_of_date
    print "book equity distribution before currency conversion"
    print be['common_equity_ex_mi_lag0'].describe()
    print be.head().to_string()
    be = be[['common_equity_ex_mi_lag0']]
    data = pandas.merge(data, be, left_on=['parent_barrid'],
                        right_index=True, how='left')

    rskdata = n.loadrsk2('ASE1JPN', 'S', as_of_date, daily=True)
    usdcapt = rskdata[['USD_CAPT', 'INDNAME', 'LOC_CAPT']]
    usdcapt = usdcapt.rename(columns={'USD_CAPT': 'USD_CAPT_parent'})
    data = pandas.merge(data, usdcapt, left_on=['parent_barrid'],
                        right_index=True, how='left')
    ## convert book equity to USD using the parent's USD/local cap ratio
    data['USD_BE'] = (data['common_equity_ex_mi_lag0'] *
                      data['USD_CAPT_parent'] / data['LOC_CAPT'] * 1e6)

    usdcapt = rskdata[['USD_CAPT']]
    usdcapt = usdcapt.rename(columns={'USD_CAPT': 'USD_CAPT_holding'})
    data = pandas.merge(data, usdcapt, left_on=['barrid'],
                        right_index=True, how='left')

    data['mcap_pct_of_parent'] = data['valueheld_sum'] / data['USD_CAPT_parent']
    data['mcap_pct_of_holding'] = data['valueheld_sum'] / data['USD_CAPT_holding']
    data['be_pct_of_parent'] = data['valueheld_sum'] / data['USD_BE']
    cols = [
        'USD_CAPT_parent', 'USD_CAPT_holding', 'mcap_pct_of_parent',
        'mcap_pct_of_holding', 'USD_BE', 'be_pct_of_parent'
    ]
    data[cols] = data[cols].astype(float)
    print "distribution of be_pct_of_parent"
    print data['be_pct_of_parent'].describe().to_string()

    ## cap the ownership ratios at 100%
    ratio_cols = ['mcap_pct_of_holding', 'mcap_pct_of_parent', 'be_pct_of_parent']
    for col in ratio_cols:
        data[col][data[col] > 1.0] = 1.0

    data = data.sort(['parent_barrid', 'barrid'])
    data = data.drop_duplicates(["parent_barrid", "barrid"], take_last=True)

    ## get monthly returns
    ## compute pmom12, pmom6, pmom3 for every barrid
    retdata = n.monthly_returns(as_of_date, lookback=400, model='GEM2')
    retdata.columns = map(lambda x: x.strip(), retdata.columns.tolist())
    retdata['date'] = map(
        lambda x: datetime.datetime(int(str(x)[:4]), int(str(x)[4:]), 1) +
        pandas.datetools.MonthEnd(), retdata['DATADATE'])
    retdata['RET'] += 1

    ## GET PMOM3
    # if as_of_date=20111231, min_date=20111031
    min_date = as_of_date - pandas.datetools.MonthEnd(2)
    print "pmom3 dates"
    print as_of_date
    print min_date
    retdata3 = retdata[retdata['date'] >= min_date]  # filter out the old months
    pmom3 = retdata3.groupby('BARRID').apply(
        lambda x: pandas.np.product(x['RET']) - 1)
    pmom3 = pandas.DataFrame(pmom3)
    pmom3.columns = ['pmom3']
    res = pandas.merge(data, pmom3, left_on='barrid', right_index=True)

    ## GET PMOM6
    min_date = as_of_date - pandas.datetools.MonthEnd(5)
    print "pmom6 dates"
    print as_of_date
    print min_date
    retdata6 = retdata[retdata['date'] >= min_date]  # filter out the old months
    pmom6 = retdata6.groupby('BARRID').apply(
        lambda x: pandas.np.product(x['RET']) - 1)
    pmom6 = pandas.DataFrame(pmom6)
    pmom6.columns = ['pmom6']
    res = pandas.merge(res, pmom6, left_on='barrid', right_index=True)

    ## GET PMOM12
    min_date = as_of_date - pandas.datetools.MonthEnd(11)
    print "pmom12 dates"
    print as_of_date
    print min_date
    retdata12 = retdata[retdata['date'] >= min_date]  # filter out the old months
    pmom12 = retdata12.groupby('BARRID').apply(
        lambda x: pandas.np.product(x['RET']) - 1)
    pmom12 = pandas.DataFrame(pmom12)
    pmom12.columns = ['pmom12']
    res = pandas.merge(res, pmom12, left_on='barrid', right_index=True)

    ## merge in past-month return
    retdata = n.monthly_returns(as_of_date, lookback=0, model='GEM2')
    res = pandas.merge(res, retdata, left_on='barrid', right_on='BARRID')

    ## winsorize returns to [-1, 1]
    idx = res['pmom3'] >= 1
    res['pmom3'][idx] = 1
    idx = (res['pmom3'].notnull()) & (res['pmom3'] < -1)
    res['pmom3'][idx] = -1
    idx = res['pmom6'] >= 1
    res['pmom6'][idx] = 1
    idx = (res['pmom6'].notnull()) & (res['pmom6'] < -1)
    res['pmom6'][idx] = -1
    idx = res['pmom12'] >= 1
    res['pmom12'][idx] = 1
    idx = (res['pmom12'].notnull()) & (res['pmom12'] < -1)
    res['pmom12'][idx] = -1
    idx = res['RET'] >= 1
    res['RET'][idx] = 1
    idx = (res['RET'].notnull()) & (res['RET'] < -1)
    res['RET'][idx] = -1

    ## compute sig = equal-weighted mean of all impulse returns
    grouped = res[['parent_barrid', 'pmom3', 'pmom6', 'pmom12',
                   'RET']].groupby('parent_barrid')
    holding_pmom3 = pandas.DataFrame(grouped['pmom3'].mean())
    holding_pmom3 = holding_pmom3.rename(
        columns={'pmom3': 'crossholding_pmom3_v1beallhldtype'})
    write_alpha_files(holding_pmom3, 'crossholding_pmom3_v1beallhldtype',
                      as_of_date, production=PRODUCTION)
    holding_pmom6 = pandas.DataFrame(grouped['pmom6'].mean())
    holding_pmom6 = holding_pmom6.rename(
        columns={'pmom6': 'crossholding_pmom6_v1beallhldtype'})
    write_alpha_files(holding_pmom6, 'crossholding_pmom6_v1beallhldtype',
                      as_of_date, production=PRODUCTION)
    holding_pmom12 = pandas.DataFrame(grouped['pmom12'].mean())
    holding_pmom12 = holding_pmom12.rename(
        columns={'pmom12': 'crossholding_pmom12_v1beallhldtype'})
    write_alpha_files(holding_pmom12, 'crossholding_pmom12_v1beallhldtype',
                      as_of_date, production=PRODUCTION)
    holding_ret = pandas.DataFrame(grouped['RET'].mean())
    holding_ret = holding_ret.rename(
        columns={'RET': 'crossholding_ret_v1beallhldtype'})
    write_alpha_files(holding_ret, 'crossholding_ret_v1beallhldtype',
                      as_of_date, production=PRODUCTION)

    ## COMPUTE HOLDING PORTFOLIO RETURNS, weighted by VALUEHELD
    pf = res[[
        'parent_barrid', 'valueheld_sum', 'pmom3', 'pmom6', 'pmom12', 'RET'
    ]]
    ## compute weighted sum
    ## sum_weighted_scores returns several columns:
    ## barrid, N, value_weighted, relative_across (value_weighted/N),
    ## relative_within (value_weighted / wts)
    pfret_pmom3 = nu.sum_weighted_scores(pf['parent_barrid'],
                                         pf['valueheld_sum'], pf['pmom3'])
    pfret_pmom3['crossholding_pfret_pmom3_v1beallhldtype'] = pfret_pmom3[
        'relative_within']
    write_alpha_files(pfret_pmom3['crossholding_pfret_pmom3_v1beallhldtype'],
                      'crossholding_pfret_pmom3_v1beallhldtype',
                      as_of_date, production=PRODUCTION)
    pfret_pmom6 = nu.sum_weighted_scores(pf['parent_barrid'],
                                         pf['valueheld_sum'], pf['pmom6'])
    pfret_pmom6['crossholding_pfret_pmom6_v1beallhldtype'] = pfret_pmom6[
        'relative_within']
    write_alpha_files(pfret_pmom6['crossholding_pfret_pmom6_v1beallhldtype'],
                      'crossholding_pfret_pmom6_v1beallhldtype',
                      as_of_date, production=PRODUCTION)
    pfret_pmom12 = nu.sum_weighted_scores(pf['parent_barrid'],
                                          pf['valueheld_sum'], pf['pmom12'])
    pfret_pmom12['crossholding_pfret_pmom12_v1beallhldtype'] = pfret_pmom12[
        'relative_within']
    write_alpha_files(pfret_pmom12['crossholding_pfret_pmom12_v1beallhldtype'],
                      'crossholding_pfret_pmom12_v1beallhldtype',
                      as_of_date, production=PRODUCTION)
    pfret_ret = nu.sum_weighted_scores(pf['parent_barrid'],
                                       pf['valueheld_sum'], pf['RET'])
    pfret_ret['crossholding_pfret_ret_v1beallhldtype'] = pfret_ret[
        'relative_within']
    write_alpha_files(pfret_ret['crossholding_pfret_ret_v1beallhldtype'],
                      'crossholding_pfret_ret_v1beallhldtype',
                      as_of_date, production=PRODUCTION)

    ## compute weighted average returns as impulse
    res = res.sort(['parent_barrid', 'barrid'])
    res['wgtpmom3'] = np.nan
    res['wgtpmom6'] = np.nan
    res['wgtpmom12'] = np.nan
    res['wgtret'] = np.nan
    for c in set(res['parent_barrid']):
        idx = res['parent_barrid'] == c
        res['wgtpmom3'][idx] = res['be_pct_of_parent'][idx] * res['pmom3'][idx]
        res['wgtpmom6'][idx] = res['be_pct_of_parent'][idx] * res['pmom6'][idx]
        res['wgtpmom12'][idx] = res['be_pct_of_parent'][idx] * res['pmom12'][idx]
        res['wgtret'][idx] = res['be_pct_of_parent'][idx] * res['RET'][idx]

    ## compute sig = weighted SUM of all impulse returns
    grouped = res[[
        'parent_barrid', 'wgtpmom3', 'wgtpmom6', 'wgtpmom12', 'wgtret'
    ]].groupby('parent_barrid')
    holding_pmom3 = pandas.DataFrame(grouped['wgtpmom3'].sum())
    holding_pmom3 = holding_pmom3.rename(
        columns={'wgtpmom3': 'crossholding_wgtpmom3_v1beallhldtype'})
    write_alpha_files(holding_pmom3, 'crossholding_wgtpmom3_v1beallhldtype',
                      as_of_date, production=PRODUCTION)
    crossholding_wgtpmom3_v1beallhldtype_in, crossholding_wgtpmom3_v1beallhldtype_it = ta.timing_alphas(
        holding_pmom3, 'crossholding_wgtpmom3_v1beallhldtype', as_of_date,
        UNIVERSE, 'INDUSTRY', production=PRODUCTION)
    #write_alpha_files(crossholding_wgtpmom3_v1beallhldtype_in, 'crossholding_wgtpmom3_v1beallhldtype_in', as_of_date, production=PRODUCTION)
    holding_pmom6 = pandas.DataFrame(grouped['wgtpmom6'].sum())
    holding_pmom6 = holding_pmom6.rename(
        columns={'wgtpmom6': 'crossholding_wgtpmom6_v1beallhldtype'})
    write_alpha_files(holding_pmom6, 'crossholding_wgtpmom6_v1beallhldtype',
                      as_of_date, production=PRODUCTION)
    crossholding_wgtpmom6_v1beallhldtype_in, crossholding_wgtpmom6_v1beallhldtype_it = ta.timing_alphas(
        holding_pmom6, 'crossholding_wgtpmom6_v1beallhldtype', as_of_date,
        UNIVERSE, 'INDUSTRY', production=PRODUCTION)
    #write_alpha_files(crossholding_wgtpmom6_v1beallhldtype_in, 'crossholding_wgtpmom6_v1beallhldtype_in', as_of_date, production=PRODUCTION)
    holding_pmom12 = pandas.DataFrame(grouped['wgtpmom12'].sum())
    holding_pmom12 = holding_pmom12.rename(
        columns={'wgtpmom12': 'crossholding_wgtpmom12_v1beallhldtype'})
    write_alpha_files(holding_pmom12, 'crossholding_wgtpmom12_v1beallhldtype',
                      as_of_date, production=PRODUCTION)
    crossholding_wgtpmom12_v1beallhldtype_in, crossholding_wgtpmom12_v1beallhldtype_it = ta.timing_alphas(
        holding_pmom12, 'crossholding_wgtpmom12_v1beallhldtype', as_of_date,
        UNIVERSE, 'INDUSTRY', production=PRODUCTION)
    #write_alpha_files(crossholding_wgtpmom12_v1beallhldtype_in, 'crossholding_wgtpmom12_v1beallhldtype_in', as_of_date, production=PRODUCTION)
    holding_ret = pandas.DataFrame(grouped['wgtret'].sum())
    holding_ret = holding_ret.rename(
        columns={'wgtret': 'crossholding_wgtret_v1beallhldtype'})
    write_alpha_files(holding_ret, 'crossholding_wgtret_v1beallhldtype',
                      as_of_date, production=PRODUCTION)
    crossholding_wgtret_v1beallhldtype_in, crossholding_wgtret_v1beallhldtype_it = ta.timing_alphas(
        holding_ret, 'crossholding_wgtret_v1beallhldtype', as_of_date,
        UNIVERSE, 'INDUSTRY', production=PRODUCTION)
    #write_alpha_files(crossholding_wgtret_v1beallhldtype_in, 'crossholding_wgtret_v1beallhldtype_in', as_of_date, production=PRODUCTION)
    return True
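# A minimal backfill driver sketch (not part of the original module): it assumes
# pair() is meant to be run once per month-end over a history window, and reuses
# pandas.DateRange / pandas.datetools.MonthEnd as used elsewhere in this codebase.
# The function name and default dates below are illustrative placeholders.
def backfill_pair(startdate=datetime.datetime(2005, 1, 31),
                  enddate=datetime.datetime(2012, 12, 31)):
    for dt in pandas.DateRange(startdate, enddate,
                               offset=pandas.datetools.MonthEnd()):
        print "running pair() for", dt
        pair(dt)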
def run(as_of_date):
    """ load Barra risk data """
    RSKDATA = lb.loadrsk2("ASE1JPN", "S", as_of_date, daily=True)
    npxchnpak = lb.load_production_universe('npxchnpak', as_of_date)
    # old research version of the files:
    #topbot = pandas.read_csv("/research/data/prealpha/topbot_npxchnpak/topbot_npxchnpak_"+as_of_date.strftime("%Y%m%d")+".alp",
    #                         header=False, names=['BARRID','TOPBOT'])

    nextmonth = as_of_date + datetime.timedelta(1)
    print nextmonth, as_of_date
    try:
        filename = "/production/%s/%s/%s/prealpha/ic_scaling_npxchnpak_%s.alp" % (
            nextmonth.strftime('%Y'), nextmonth.strftime('%m'),
            nextmonth.strftime('%Y%m%d'), as_of_date.strftime('%Y%m%d'))
        topbot = (pandas.read_csv(filename, index_col=0)).rename(
            columns={'ic1': 'BIG'})
    except:
        print 'rolling back!'
        topbot = pandas.read_csv(
            "/research/data/prealpha/icscale_npxchnpak/icscale_npxchnpak_" +
            as_of_date.strftime("%Y%m%d") + ".alp",
            header=False, names=['BARRID', 'BIG'])
        # topbot = pandas.read_csv(topbot, header=True, names=['BARRID','BIG'])
    topbot = topbot.reset_index()

    univdata = npxchnpak.join(RSKDATA[['COUNTRY', 'USD_CAPT']], how='left')
    univdata = univdata[univdata['COUNTRY'] == 'JPN']
    univdata = topbot.join(univdata, on='BARRID', how='right')
    univdata.index = univdata.pop('BARRID')
    univdata['VALID_UNTIL'] = univdata['VALID_UNTIL'].map(
        lambda x: x.strftime('%Y%m%d'))

    #univdata_top = univdata[univdata['BIG']=='JPN_BIG']
    univdata_top = univdata[univdata['BIG'] < 1]
    univdata_top = univdata_top[['SEDOL', 'NATURAL', 'VALID_UNTIL']]

    univdata_bot = univdata[(univdata['BIG'] == 'JPN') | (univdata['BIG'] == 1)]
    # univdata_bot = univdata[univdata['BIG']==1]
    univdata_bot['rnk'] = univdata_bot['USD_CAPT'].rank()
    univdata_bot = univdata_bot.sort('rnk')
    print univdata_bot.head().to_string()
    univdata_bot = univdata_bot[['SEDOL', 'NATURAL', 'VALID_UNTIL']]
    univdata_bot.to_csv('/research/data/universe/jpnx400/jpnx400' +
                        as_of_date.strftime('%Y%m%d') + '.univ',
                        header=False, sep='|')
    univdata_bot.tail(600).to_csv(
        '/research/data/universe/jpnx400_t600/jpnx400_t600' +
        as_of_date.strftime('%Y%m%d') + '.univ',
        header=False, sep='|')

    nbot = len(univdata_bot) / 2
    ntop = len(univdata_bot) - nbot
    print univdata_bot.head().to_string()
    print univdata_bot.tail().to_string()
    univdata_bot.head(nbot).to_csv(
        "/research/data/universe/jpnx400_small/jpnx400_small" +
        as_of_date.strftime('%Y%m%d') + ".univ",
        header=False, sep='|')
    univdata_bot.tail(ntop).to_csv(
        "/research/data/universe/jpnx400_big/jpnx400_big" +
        as_of_date.strftime('%Y%m%d') + ".univ",
        header=False, sep='|')
    #UNIVDATA.to_csv("/research/data/universe/barraestu/barraestu.univ"+yymm, header=False)
    # univdata_top.to_csv("/research/data/universe/jpn400/jpn400"+as_of_date.strftime('%Y%m%d')+".univ", header=False, sep='|')
    return True
def backfill_1b(date, bkt, bkt_df, univ='npxchnpak', alpha='alpha_v5',
                freq='daily', model='ase1jpn', ctry_df=None, univ_df=None,
                dir=None):
    """ backfill one bucket """
    import nipun.cpa.load_barra as lb
    #import pandas.util.py3compat as py3compat

    print "backfill bucket %s for %s" % (bkt, date)
    date_str = date.strftime('%Y%m%d')

    # keep only the alphas that carry a non-zero weight in some country
    keep_alphas = bkt_df.abs().sum()
    keep_alphas = keep_alphas[keep_alphas > 0].index.tolist()
    bkt_df = bkt_df[keep_alphas]

    ALPHA_DIR = '%(dir)s/%(freq)s/current/' % {'dir': dir, 'freq': freq}

    # assemble all raw alpha files for this date into one frame
    big_df = pandas.DataFrame()
    for alpha in keep_alphas:
        raw_alpha_dir = "%s/%s/%s/" % (ALPHA_DIR, univ, alpha)
        fn = raw_alpha_dir + '%s_%s.alp' % (alpha, date_str)
        if os.path.exists(fn):
            tmp_df = pandas.read_csv(fn, header=None, names=['barrid', alpha],
                                     index_col=0)
            big_df = big_df.join(tmp_df, how='outer')
        else:
            big_df[alpha] = None

    # the v1 def. for bucket alphas: equal-weighted mean across all alphas
    b_out_dir = "%s/%s/%s/" % (ALPHA_DIR, univ, 'b_' + bkt + '_eq_all')
    if not os.path.exists(b_out_dir):
        os.makedirs(b_out_dir)
    output_df = big_df.fillna(0).mean(axis=1)
    output_df.to_csv('%(dir)s/b_%(bkt)s_eq_all_%(date)s.alp' % {
        'dir': b_out_dir, 'bkt': bkt, 'date': date_str
    })

    # the v2 def.: equal-weighted mean of the alphas active in each country
    # add country into big_df
    b_out_dir = "%s/%s/%s/" % (ALPHA_DIR, univ, 'b_' + bkt + '_eq_c')
    if not os.path.exists(b_out_dir):
        os.makedirs(b_out_dir)
    if ctry_df is None:
        ctry_df = pandas.DataFrame(
            lb.loadrsk2(model, 'S', date, daily=True)['COUNTRY'])
    if univ_df is None:
        univ_df = lb.load_production_universe(univ, date)
    big_df = big_df[big_df.index.isin(univ_df.index)]
    big_df = big_df.join(ctry_df, how='left')

    output_df2 = pandas.DataFrame()
    for k, v in big_df.groupby('COUNTRY'):
        if k in bkt_df.index:
            keep_alphas = bkt_df.ix[k]
            keep_alphas = keep_alphas[keep_alphas > 0].index.tolist()
            output_df2 = pandas.concat([
                output_df2,
                pandas.DataFrame(v[keep_alphas].fillna(0).mean(axis=1))
            ])
    output_df2.to_csv('%(dir)s/b_%(bkt)s_eq_c_%(date)s.alp' % {
        'dir': b_out_dir, 'bkt': bkt, 'date': date_str
    }, header=None)
def backfill_b_alphas(weights_date=None,
                      buckets=[
                          'analyst', 'fmom', 'industry', 'iu', 'quality',
                          'sentiment', 'special', 'value'
                      ],
                      univ='npxchnpak',
                      alpha='alpha_v5',
                      startdate=datetime.datetime(2005, 1, 1),
                      enddate=datetime.datetime.today() - pandas.datetools.day,
                      model='ase1jpn',
                      ncpus=8,
                      freq='daily'):
    """ calculate bucket alphas based on the latest backfilled raw alphas """
    print "start to backfill bucket alphas eq_all and eq_c"

    # load the country/alpha weights: current production tables, or the
    # backup tables as of weights_date
    if weights_date is None:
        ctry_wt_df = dbo.query("select * from alphagen..country_wt__%s" % alpha,
                               df=True)
        alpha_wt_df = dbo.query("select * from alphagen..alpha_wt__%s" % alpha,
                                df=True)
    else:
        ctry_wt_df = dbo.query(
            "select * from production_reporting..country_wt_backup \
             where cast(datadate AS date)='%(date)s' and alphagen_vers='%(alpha)s'" % {
                'alpha': alpha, 'date': weights_date
            }, df=True)
        alpha_wt_df = dbo.query(
            "select * from production_reporting..alpha_wt_backup \
             where cast(datadate AS date)='%(date)s' and alphagen_vers='%(alpha)s'" % {
                'alpha': alpha, 'date': weights_date
            }, df=True)
    ctry_wt_df = ctry_wt_df[['alpha_name', 'country', 'weight']]
    alpha_wt_df = alpha_wt_df[['bucket_name', 'alpha_name']]
    bucket_alpha_df = pandas.merge(alpha_wt_df, ctry_wt_df, on=['alpha_name'],
                                   how='left')

    # one parallel job per (date, bucket)
    job_server = pp.Server(ncpus)
    jobs = []
    for date in pandas.DateRange(startdate, enddate,
                                 offset=pandas.datetools.day):
        ctry_df = pandas.DataFrame(
            lb.loadrsk2(model, 'S', date, daily=True)['COUNTRY'])
        univ_df = lb.load_production_universe(univ, date)
        for bkt in buckets:
            bkt_df = bucket_alpha_df[bucket_alpha_df['bucket_name'] == bkt]
            #.drop(labels=['bucket_name'], axis=1)
            bkt_df = bkt_df.pivot(index='country', columns='alpha_name',
                                  values='weight')
            #backfill_1b(date, bkt, bkt_df, ctry_df=ctry_df, univ_df=univ_df, dir=DIR)
            jobs.append(
                job_server.submit(backfill_1b,
                                  (date, bkt, bkt_df, univ, alpha, freq, model,
                                   ctry_df, univ_df, DIR),
                                  (), ('pandas', 'datetime', 'os')))
    for job in jobs:
        job()
    job_server.print_stats()
    job_server.destroy()
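# Hypothetical entry point (an assumption, not in the original source): a sketch of
# how backfill_b_alphas() might be invoked to rebuild the eq_all / eq_c bucket
# alphas over a short window, relying on the module-level defaults for universe,
# alpha version, model and frequency. The dates and ncpus are placeholders.
if __name__ == '__main__':
    backfill_b_alphas(weights_date=None,
                      startdate=datetime.datetime(2013, 1, 1),
                      enddate=datetime.datetime(2013, 3, 31),
                      ncpus=4)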