def run():
    args = BPRMF_Args(parse_args())

    # Build a DataLoader over the training set
    data = DATA(args.data_path, args.dataset_name)
    train_set, train_U2I, test_U2I, n_users, n_items = data.load()
    train_dl = get_loader(train_set, train_U2I, n_items, args.batch_size, args.cores)

    # Define the model
    model = BPRMF(n_users, n_items, args)
    model = model.cuda()
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    # Define the training session
    sess = Session(model)

    for epoch in range(args.num_epochs):
        loss, mf_loss, emb_loss = sess.train(train_dl, optimizer)
        print("epoch: {:d}, loss = [{:.6f} == {:.6f} + {:.6f}]".format(
            epoch, loss, mf_loss, emb_loss))

        perf_info = evaluate(model, n_users, n_items, train_U2I, test_U2I, args)
        print("precision: [{:.6f}] recall: [{:.6f}] ndcg: [{:.6f}]".format(
            perf_info[0], perf_info[1], perf_info[2]))
def run():
    args = NGCF_Args(parse_args())

    # Build a DataLoader over the training set
    data = DATA(args.data_path, args.dataset_name)
    train_set, train_U2I, test_U2I, edge_indices, edge_weight, n_users, n_items = data.load()
    train_dl = get_loader(train_set, train_U2I, n_items, args.batch_size, args.cores)

    # Build the normalized Laplacian (adjacency) matrix
    laplace_graph = Graph(edge_indices, edge_weight)
    laplace_graph.add_self_loop()
    laplace_graph.norm()
    norm_adj = laplace_graph.mat.cuda()

    # Define the model
    model = NGCF(n_users, n_items, norm_adj, args)
    model = model.cuda()
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    # Define the training session
    sess = Session(model)

    for epoch in range(args.num_epochs):
        loss, mf_loss, emb_loss = sess.train(train_dl, optimizer)
        print("epoch: {:d}, loss = [{:.6f} == {:.6f} + {:.6f}]".format(
            epoch, loss, mf_loss, emb_loss))

        perf_info = evaluate(model, n_users, n_items, train_U2I, test_U2I, args)
        print("precision: [{:.6f}] recall: [{:.6f}] ndcg: [{:.6f}]".format(
            perf_info[0], perf_info[1], perf_info[2]))
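# The Session.train call above returns (loss, mf_loss, emb_loss); its implementation is
# not shown here. Below is a minimal sketch of the BPR pairwise loss it is assumed to
# optimize. The embedding arguments and the `decay` weight are illustrative assumptions,
# not the project's actual API.
import torch
import torch.nn.functional as F

def bpr_loss(user_emb, pos_emb, neg_emb, decay=1e-4):
    """BPR loss over (user, positive item, negative item) triples plus L2 regularization."""
    pos_scores = (user_emb * pos_emb).sum(dim=-1)   # scores of observed items
    neg_scores = (user_emb * neg_emb).sum(dim=-1)   # scores of sampled negatives
    mf_loss = -F.logsigmoid(pos_scores - neg_scores).mean()
    emb_loss = decay * (user_emb.norm(2).pow(2)
                        + pos_emb.norm(2).pow(2)
                        + neg_emb.norm(2).pow(2)) / user_emb.shape[0]
    return mf_loss + emb_loss, mf_loss, emb_loss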
def correlation(self, indicators=None):
    if not indicators:
        indicators = self.indicators
    comb = DATA.by_indicators(indicators)

    def _spearman(df):
        df = df.dropna()
        if df.shape[0] > 10:  # TODO: threshold to choose
            return cal_corr(df, 'spearman', winsorize=False)

    def _pearson(df):
        df = df.dropna()
        if df.shape[0] > 10:  # TODO: min_samples
            return cal_corr(df, 'pearson', winsorize=True)

    corrs = comb.groupby('t').apply(_spearman)
    corrp = comb.groupby('t').apply(_pearson)

    corrsAvg = corrs.groupby(level=1).mean().reindex(index=indicators, columns=indicators)
    corrpAvg = corrp.groupby(level=1).mean().reindex(index=indicators, columns=indicators)

    # Pearson correlations go in the lower triangle, Spearman in the upper triangle
    corr1 = np.tril(corrpAvg.values, k=-1)
    corr2 = np.triu(corrsAvg.values, k=1)
    corr = pd.DataFrame(corr1 + corr2, index=corrpAvg.index, columns=corrpAvg.columns)
    np.fill_diagonal(corr.values, np.NaN)
    corr.to_csv(os.path.join(self.path, 'corr.csv'))
def fm(self, wsz=None):
    comb = DATA.by_indicators(self.indicators + ['eretM'])
    data = []
    ps = []
    for indicator in self.indicators:
        subdf = comb[[indicator, 'eretM']]
        subdf = subdf.dropna()
        # the indicator is the regressor (x), the excess return the dependent variable (y)
        subdf.columns = ['x', 'y']
        # The independent variable is winsorized at a given level on a monthly basis (page 141)
        subdf['x'] = subdf.groupby('t')['x'].apply(
            lambda s: winsorize(s, limits=WINSORIZE_LIMITS))
        subdf = subdf.reset_index()
        formula = 'y ~ x'
        r, adj_r2, n, p = famaMacBeth(formula, 't', subdf, lags=5)
        # TODO: why is the intercept t-value so large?
        # TODO: why do some FM regressions not have an adj_r2?
        data.append([r.loc['x', 'coef'], r.loc['x', 'tvalue'],
                     r.loc['Intercept', 'coef'], r.loc['Intercept', 'tvalue'],
                     adj_r2, n])
        ps.append(p['x'])
        print(indicator)

    result = pd.DataFrame(data, index=self.indicators,
                          columns=['slope', 't', 'Intercept', 'Intercept_t', 'adj_r2', 'n']).T
    result.to_csv(os.path.join(self.path, 'fama macbeth regression analysis.csv'))

    parameters = pd.concat(ps, axis=1, keys=self.indicators)
    parameters.to_csv(os.path.join(self.path,
                                   'fama macbeth regression parameters in first stage.csv'))
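# famaMacBeth is a project helper returning (coefficient table, adj_r2, n, first-stage
# parameters); its implementation is not shown above. Below is a minimal sketch of the
# two-stage procedure it is assumed to wrap. The statsmodels-based implementation and
# the Newey-West lag handling are assumptions for illustration only.
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

def fama_macbeth_sketch(formula, time_col, df, lags=5):
    """Stage 1: one cross-sectional OLS per period; stage 2: Newey-West mean of the coefficients."""
    first_stage = df.groupby(time_col).apply(
        lambda g: smf.ols(formula, data=g).fit().params)  # rows: periods, columns: coefficients
    rows = {}
    for name in first_stage.columns:
        ts = first_stage[name].dropna()
        # Regress the coefficient time series on a constant with HAC (Newey-West) standard errors.
        nw = sm.OLS(ts, np.ones(len(ts))).fit(cov_type='HAC', cov_kwds={'maxlags': lags})
        rows[name] = {'coef': nw.params.iloc[0], 'tvalue': nw.tvalues.iloc[0]}
    return pd.DataFrame(rows).T, first_stage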
def _get_port_data(self, indicator):
    groupid = DATA.by_indicators([indicator])
    groupid['g'] = groupid.groupby('t', group_keys=False).apply(
        lambda df: pd.qcut(df[indicator], self.q,
                           labels=[indicator + str(i) for i in range(1, self.q + 1)])
    )
    return groupid
def portfolio_analysis(self):
    '''
    table 8.4

    :return:
    '''
    # TODO: add a parameter to declare which risk models will be used [ff3, capm, ff5]
    all_indicators = list(set(self.indicators + ['capM', 'eretM']))
    comb = DATA.by_indicators(all_indicators)

    result_eavg = []
    result_wavg = []
    for indicator in self.indicators:
        gcol = 'g_%s' % indicator
        comb[gcol] = comb.groupby('t', group_keys=False).apply(
            lambda df: assign_port_id(df[indicator], self.q, self.groupnames))
        # TODO: add an alternative sorting method, that is, updating yearly, as page 9 of
        # Chen et al., "On the Predictability of Chinese Stock Returns."

        panel_stk_eavg, panel_stk_wavg = self._get_panel_stk_avg(comb, indicator, gcol)
        for panel_stk in [panel_stk_eavg, panel_stk_wavg]:
            panel = panel_stk.unstack(level=[gcol])
            panel.columns = panel.columns.astype(str)
            panel['_'.join([self.groupnames[-1], self.groupnames[0]])] = \
                panel[self.groupnames[-1]] - panel[self.groupnames[0]]
            panel['avg'] = panel.mean(axis=1)
            # TODO: use the risk models declared above

            a_data = comb.groupby(['t', gcol])[indicator].mean()
            a_data = a_data.unstack()
            a_data.columns = a_data.columns.astype(str)
            a_data.index = a_data.index.astype(str)
            a_data['_'.join([self.groupnames[-1], self.groupnames[0]])] = \
                a_data[self.groupnames[-1]] - a_data[self.groupnames[0]]
            a_data['avg'] = a_data.mean(axis=1)
            a = a_data.mean()
            a.name = 'avg'
            a = a.to_frame().T

            riskAdjusted = risk_adjust(panel)
            # TODO: something must be wrong with size or portfolio_analysis
            if panel_stk is panel_stk_eavg:
                result_eavg.append(pd.concat([a, riskAdjusted], axis=0))
            else:
                result_wavg.append(pd.concat([a, riskAdjusted], axis=0))

    table_e = pd.concat(result_eavg, axis=0, keys=self.indicators)
    table_w = pd.concat(result_wavg, axis=0, keys=self.indicators)

    # reorder the columns
    initialOrder = table_e.columns.tolist()
    newOrder = self.groupnames + [col for col in initialOrder if col not in self.groupnames]
    table_e = table_e.reindex(columns=newOrder)
    table_w = table_w.reindex(columns=newOrder)

    table_e.to_csv(os.path.join(self.path, 'univariate portfolio analysis-equal weighted.csv'))
    table_w.to_csv(os.path.join(self.path, 'univariate portfolio analysis-value weighted.csv'))
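# assign_port_id (used above and in the bivariate sorts below) is a project helper whose
# implementation is not shown. A plausible stand-in is sketched here purely as an
# assumption: ranking the cross-section first keeps pd.qcut from failing on duplicate
# bin edges when many observations share the same value.
import pandas as pd

def assign_port_id_sketch(series, q, labels):
    """Assign each stock in one cross-section to one of q quantile portfolios."""
    return pd.qcut(series.rank(method='first'), q, labels=labels)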
def portfolio_characteristics(self, sortedIndicator, otherIndicators):
    '''
    as table 12.3 panel A

    :param sortedIndicator:
    :param otherIndicators:
    :return:
    '''
    groupid = self._get_port_data(sortedIndicator)
    comb = DATA.by_indicators(otherIndicators)
    comb = pd.concat([groupid, comb], axis=1)
    characteristics_avg = comb.groupby(['t', 'g']).mean().groupby('g').mean()
    characteristics_avg.to_csv(os.path.join(self.path, 'portfolio characteristics.csv'))
def get_percent_ratio(self):
    '''Fig 9.1 page 152'''
    def _get_ratio(s):
        s = s.dropna()
        total = s.shape[0]
        ratios = [0.01, 0.05, 0.10, 0.25]
        num = [int(r * total) for r in ratios]
        return pd.Series([s.nlargest(n).sum() / s.sum() for n in num], index=ratios)

    df = DATA.by_indicators('mktCap')
    d = df.groupby('t')['mktCap'].apply(_get_ratio)
    fig = d.unstack().plot().get_figure()
    fig.savefig(os.path.join(self.path, 'percent of market value.png'))
def _fm(self, ll_indeVars):
    '''
    :param ll_indeVars: list of lists; each inner list contains the independent
        variables used to construct one regression equation
    :return:
    '''
    indeVars = list(set(var for l_indeVars in ll_indeVars for var in l_indeVars))
    indicators = indeVars + ['eretM']
    comb = DATA.by_indicators(indicators)
    # The independent variables are winsorized at a given level on a monthly basis (page 170)
    comb[indeVars] = comb.groupby('t')[indeVars].apply(
        lambda x: winsorize(x, limits=WINSORIZE_LIMITS, axis=0))
    comb = comb.reset_index()

    stks = []
    for l_indeVars in ll_indeVars:
        # Rename the variables, since patsy does not support names starting with a number
        newname = ['name' + str(i) for i in range(1, len(l_indeVars) + 1)]
        df = comb[l_indeVars + ['t', 'eretM']].dropna()
        df.columns = newname + ['t', 'eretM']
        formula = 'eretM ~ ' + ' + '.join(newname)
        # TODO: lags?
        r, adj_r2, n, p = famaMacBeth(formula, 't', df, lags=5)
        r = r.rename(index=dict(zip(newname, l_indeVars)))

        # save the first-stage regression parameters
        p = p.rename(columns=dict(zip(newname, l_indeVars)))
        p.to_csv(os.path.join(self.path,
                              'first stage parameters ' + '_'.join(l_indeVars) + '.csv'))

        stk = r[['coef', 'tvalue']].stack()
        stk.index = stk.index.map('{0[0]} {0[1]}'.format)
        stk['adj_r2'] = adj_r2
        stk['n'] = n
        stks.append(stk)

    table = pd.concat(stks, axis=1, keys=range(1, len(ll_indeVars) + 1))
    newIndex = [var + ' ' + suffix for var in indicators for suffix in ['coef', 'tvalue']] + \
               ['Intercept coef', 'Intercept tvalue', 'adj_r2', 'n']
    table = table.reindex(index=newIndex)
    table.to_csv(os.path.join(self.path, 'fama macbeth regression analysis.csv'))
def _get_dependent_data(self, indicators):
    '''
    :param indicators: list with two elements; the first is the controlling variable
    :return:
    '''
    # sometimes the indicators and ['capM', 'eretM'] may share some elements
    comb = DATA.by_indicators(indicators + ['capM', 'eretM'])
    comb = comb.dropna()
    comb['g1'] = comb.groupby('t', group_keys=False).apply(
        lambda df: assign_port_id(df[indicators[0]], self.q,
                                  [indicators[0] + str(i) for i in range(1, self.q + 1)]))
    comb['g2'] = comb.groupby(['t', 'g1'], group_keys=False).apply(
        lambda df: assign_port_id(df[indicators[1]], self.q,
                                  [indicators[1] + str(i) for i in range(1, self.q + 1)]))
    return comb
def _one_indicator(self, indicator):
    ns = range(0, 12)
    all_indicators = list(set([indicator] + ['capM', 'eretM']))
    comb = DATA.by_indicators(all_indicators)
    comb = comb.dropna()
    comb['g'] = comb.groupby('t', group_keys=False).apply(
        lambda df: pd.qcut(df[indicator], self.q,
                           labels=[indicator + str(i) for i in range(1, self.q + 1)])
    )

    def _one_indicator_one_weight_type(group_ts, indicator):
        def _big_minus_small(s, ind):
            time = s.index.get_level_values('t')[0]
            return s[(time, ind + str(self.q))] - s[(time, ind + '1')]

        # high-minus-low spread portfolio, then risk-adjust the spread returns
        spread_data = group_ts.groupby('t').apply(
            lambda series: _big_minus_small(series, indicator))
        s = risk_adjust(spread_data)
        return s

    eret = comb['eretM'].unstack()
    s_es = []
    s_ws = []
    eret_names = []
    for n in ns:
        eret_name = 'eret_ahead%s' % (n + 1)
        comb[eret_name] = eret.shift(-n).stack()
        group_eavg_ts = comb.groupby(['t', 'g'])[eret_name].mean()
        group_wavg_ts = comb.groupby(['t', 'g']).apply(
            lambda df: np.average(df[eret_name], weights=df['capM']))
        s_e = _one_indicator_one_weight_type(group_eavg_ts, indicator)
        s_w = _one_indicator_one_weight_type(group_wavg_ts, indicator)
        s_es.append(s_e)
        s_ws.append(s_w)
        eret_names.append(eret_name)

    eq_table = pd.concat(s_es, axis=1, keys=eret_names)
    vw_table = pd.concat(s_ws, axis=1, keys=eret_names)
    return eq_table, vw_table
def _get_independent_data(self):
    # TODO: add the method of ratios such as [0.3, 0.7]
    # sometimes the two indicators and ['capM', 'eretM'] may share some elements
    comb = DATA.by_indicators([self.indicator1, self.indicator2, 'capM', 'eretM'])
    comb = comb.dropna()
    comb['g1'] = comb.groupby('t', group_keys=False).apply(
        lambda df: assign_port_id(df[self.indicator1], self.q,
                                  [self.indicator1 + str(i) for i in range(1, self.q + 1)]))
    comb['g2'] = comb.groupby('t', group_keys=False).apply(
        lambda df: assign_port_id(df[self.indicator2], self.q,
                                  [self.indicator2 + str(i) for i in range(1, self.q + 1)]))
    return comb
    from dataset import MRIDataset as DATA
else:
    from dataset import MRIDataset_threechannel as DATA

if args.network == 'Inception_v3':
    from Inception_v3 import inception_v3_pretrain as MODEL

Transform = transforms.Compose(
    [transforms.Resize((SIZE, SIZE)),
     transforms.ToTensor()])

if __name__ == '__main__':
    # writer = SummaryWriter(path_to_logs_dir)
    dataset = DATA(path_to_data, path_to_label, mode=MODE, transform=Transform, aug=True)

    # oversample the minority class with a weighted sampler
    weight = 1. / torch.tensor([dataset.negative, dataset.positive], dtype=torch.float)
    target = torch.tensor(dataset._label['label'], dtype=torch.long)
    sample_weight = torch.tensor([weight[t] for t in target], dtype=torch.float)
    sampler = WeightedRandomSampler(sample_weight, len(sample_weight))
    dataloader = DataLoader(dataset, Batch_size, sampler=sampler,
                            num_workers=1, drop_last=True)

    dataset_test = DATA(path_to_testdata,