def get_mu(x0,wt_decay=None,std_th=3,bootstrap=True,dn='norm',smooth_beta=None) :
    """Treat outliers in ``x0`` and return a location estimate plus a spread.

    Steps, in order:

    1. Work on a copy of ``x0``.
    2. Build sample weights: ``l1d.getwt(n, wt_decay)`` when ``wt_decay`` is
       given, otherwise uniform.  Weights are normalised to sum to 1.
    3. Compute the weighted mean and weighted standard deviation and hand
       them to ``outlier`` with ``in_th=std_th`` and ``out_th=std_th*4``.
    4. If ``smooth_beta`` is given, pass the treated sample through
       ``tanh_smooth`` using the treated sample's own (unweighted) mean and
       standard deviation.
    5. If ``bootstrap`` is true, draw ``n*3//2`` resamples of ``n*2//3``
       indices each (probabilities = the weights) and replace the sample by
       the resample means.

    Parameters
    ----------
    x0 : array-like with ``.copy()`` (assumed 1-D numeric -- TODO confirm)
    wt_decay : decay argument forwarded to ``l1d.getwt``; ``None`` = uniform
    std_th : threshold forwarded to ``outlier``
    bootstrap : whether to replace the sample by bootstrap means
    dn : forwarded to ``f3`` (meaning defined by ``f3``, not visible here)
    smooth_beta : ``beta`` forwarded to ``tanh_smooth``; ``None`` skips it

    Returns
    -------
    (f3(x, dn=dn), np.std(x)) where ``x`` is the bootstrap means when
    ``bootstrap`` is true, otherwise the treated sample.

    TODO: to be improved by maximum likelihood.
    """
    x = x0.copy()
    n = len(x)
    if wt_decay is not None:
        wt = l1d.getwt(n, wt_decay)
    else:
        wt = np.ones(n).astype(float)
    wt /= np.sum(wt)

    # Weighted first and second moments drive the outlier thresholds.
    xm = np.dot(x, wt)
    xs = np.sqrt(np.dot((x - xm)**2, wt))
    x = outlier(x, xm, xs, in_th=std_th, out_th=std_th*4)

    if smooth_beta is not None:
        x = tanh_smooth(x, x.mean(), x.std(), beta=smooth_beta)

    if bootstrap:
        # Floor division: both counts must be integers (np.random.choice
        # rejects a float ``size``), independent of the division semantics
        # in force.  At least one index is drawn so that a single-element
        # input does not average an empty resample.
        try_cnt = n * 3 // 2
        draw_cnt = max(1, n * 2 // 3)
        x = np.array([np.mean(x[np.random.choice(n, draw_cnt, p=wt)])
                      for _ in range(try_cnt)])
    return f3(x, dn=dn), np.std(x)
def shp_cm(wb5s, yh, wt_decay):
    """Return weighted mean / weighted std of the signed cumulative sums.

    Each row of ``wb5s`` is multiplied by the sign of the matching entry of
    ``yh`` and cumulatively summed along the row.  Rows are then combined
    with weights from ``l1_reader.getwt(n_rows, wt_decay)`` (normalised to
    sum to 1): the weighted mean of every cumulative column is divided by
    its weighted standard deviation.

    Presumably a Sharpe-like ratio per horizon -- confirm with callers.
    """
    direction = np.sign(yh)
    # Transpose so the per-row sign broadcasts, accumulate, transpose back.
    signed_cum = np.cumsum(wb5s.T * direction, axis=0).T

    weights = l1_reader.getwt(signed_cum.shape[0], wt_decay)
    weights /= weights.sum()

    mean_path = signed_cum.T.dot(weights)
    sq_dev = np.square((signed_cum - mean_path).T)
    spread = np.sqrt(sq_dev.dot(weights))
    return mean_path / spread
def getYh(Xtrain, Ytrain, Xtest, clf, wt_decay=None, feat_select=True, fit_sign=False, wt_neg=False):
    """Fit a deep copy of ``clf`` on the training data and predict ``Xtest``.

    Parameters
    ----------
    Xtrain, Ytrain : training matrix (2-D) and target
    Xtest : matrix to predict; indexed with the same columns as ``Xtrain``
    clf : estimator exposing ``fit(X, y, sample_weight=...)`` and
        ``predict`` (``predict_proba`` when ``fit_sign`` is true); it is
        deep-copied, the caller's instance is not fitted
    wt_decay : when not ``None``, sample weights come from
        ``l1_reader.getwt(len(Ytrain), wt_decay)``
    feat_select : when true, columns come from ``feat_sel(Xtrain, Ytrain)``
        with column 0 always prepended; otherwise all columns are used
    fit_sign : when true, rows with a zero target are dropped, the target
        becomes its sign, and existing weights are scaled by
        ``sqrt(abs(target))``
    wt_neg : when true, weights of negative-target rows are multiplied by
        ``1 + sqrt(abs(target))`` (weights default to ones if unset)

    Returns
    -------
    (yh, fitted_copy, columns).  With ``fit_sign`` the prediction is the last
    ``predict_proba`` column minus the first.  If feature selection returns
    nothing, the result is (constant ``Ytrain.mean()`` vector, the unfitted
    copy, ``[0]``).
    """
    model = copy.deepcopy(clf)
    _, n_feat = Xtrain.shape

    if not feat_select:
        cols = np.arange(n_feat)
    else:
        picked = feat_sel(Xtrain, Ytrain)
        if len(picked) == 0:
            print('nothing got selected!')
            fallback = np.ones(Xtest.shape[0]) * Ytrain.mean()
            return fallback, model, [0]
        cols = np.r_[0, picked]

    weights = None
    if wt_decay is not None:
        weights = l1_reader.getwt(len(Ytrain), wt_decay)

    if not fit_sign:
        target = Ytrain
    else:
        # Zero targets carry no sign information: drop those rows.
        keep = np.nonzero(Ytrain != 0)[0]
        Xtrain = Xtrain[keep, :]
        target = np.sign(Ytrain[keep])
        if weights is not None:
            weights = weights[keep] * np.sqrt(np.abs(Ytrain[keep]))

    if wt_neg:
        neg_rows = np.nonzero(target < 0)[0]
        if len(neg_rows) > 0:
            if weights is None:
                weights = np.ones(len(target))
            weights[neg_rows] *= (1 + np.sqrt(np.abs(target[neg_rows])))

    model.fit(Xtrain[:, cols], target, sample_weight=weights)

    test_view = Xtest[:, cols]
    if fit_sign:
        proba = model.predict_proba(test_view)
        yh = proba[:, -1] - proba[:, 0]
    else:
        yh = model.predict(test_view)
    return yh, model, cols.copy()
def getYTrain(wb5m, y0=-42, y1=-30, wt_decay=0.1, fd=None):
    """Build the training target and its weighted mean / std.

    The raw target is a copy of ``fd`` when given, otherwise the sum of
    ``wb5m[:, y0:y1, 1]`` along axis 1.  Weights come from
    ``l1_reader.getwt(len(target), wt_decay)`` and are normalised to sum to
    1.  The raw target is then passed through ``ean.outlier`` with
    ``in_th=1`` and ``out_th=3``.

    Returns
    -------
    (treated_target, weighted_mean, weighted_std); the mean and std are
    those of the raw target, computed before outlier treatment.
    """
    if fd is None:
        raw = np.sum(wb5m[:, y0:y1, 1], axis=1)
    else:
        raw = fd.copy()

    weights = l1_reader.getwt(len(raw), wt_decay)
    weights /= np.sum(weights)

    mu = np.dot(raw, weights)
    sd = np.sqrt(np.dot(weights, (raw - mu)**2))

    treated = ean.outlier(raw, mu, sd, in_th=1, out_th=3)
    return treated, mu, sd
def eval_latest_avg(x, lb=16, k=3, step=2, wt_decay=0.1):
    """Weighted average of the means of ``k`` trailing windows of ``x``.

    Window ``j`` (``j = 0 .. k-1``) ends ``j*step`` samples before the end of
    ``x`` and is ``lb`` samples long.  The window means are combined with
    the weights ``l1_reader.getwt(k, wt_decay)`` taken in reverse order and
    normalised to sum to 1; with a single window the weight is 1.

    Short input: window bounds are clamped at 0, so a window that would
    start before the beginning of ``x`` is truncated to the available
    samples instead of wrapping around through negative indexing.  A window
    lying entirely before the start is empty and its mean is NaN.

    Parameters
    ----------
    x : 1-D sequence supporting slicing
    lb : window length
    k : number of windows
    step : shift between consecutive windows
    wt_decay : decay argument forwarded to ``l1_reader.getwt``

    Returns
    -------
    Scalar weighted average of the window means.
    """
    n = len(x)
    avg = []
    for j in range(k):
        # Clamp both bounds: a negative bound would silently index from the
        # tail of ``x`` and select the wrong samples.
        stop = max(n - j * step, 0)
        start = max(stop - lb, 0)
        avg.append(np.mean(x[start:stop]))

    if len(avg) > 1:
        wt = l1_reader.getwt(k, wt_decay)[::-1]
        # Out-of-place division: do not write through the reversed view
        # into the array returned by getwt.
        wt = wt / np.sum(wt)
    else:
        wt = np.array([1])
    return np.dot(avg, wt)
def wt_corr2d(X2d, y, wt_decay=0.1, whiten=True):
    """Apply ``wt_corr`` to every column of ``X2d`` against ``y``.

    One weight vector, ``l1_reader.getwt(n_rows, wt_decay)`` normalised to
    sum to 1, is shared by all columns.

    Returns
    -------
    (mcorr, mcstd) : two arrays with one entry per column, holding the first
    and second value returned by ``wt_corr`` respectively.

    Raises
    ------
    AssertionError if ``len(y)`` differs from the number of rows.
    """
    n_rows, n_cols = X2d.shape
    assert n_rows == len(y), 'X2d and y shape mismatch'

    weights = l1_reader.getwt(n_rows, wt_decay)
    weights /= np.sum(weights)

    per_column = [
        wt_corr(X2d[:, col], y, wt=weights, whiten=whiten)
        for col in range(n_cols)
    ]
    mcorr = np.array([pair[0] for pair in per_column])
    mcstd = np.array([pair[1] for pair in per_column])
    return mcorr, mcstd
def score_sign_insample(clf, X0, Y0, wt_train=0.5, lb=16, k=3, step=2, wt_decay=0.1):
    """Fit a copy of ``clf`` on all rows and score it on the trailing rows.

    The copy is fitted on ``(X0, Y0)`` with weights
    ``l1_reader.getwt(n_rows, wt_train)`` normalised to sum to 1 (passed as
    the third positional argument of ``fit``).  It then predicts the last
    ``lb + k*step`` rows; the target of those rows multiplied by the sign of
    the prediction is summarised by ``eval_latest_avg``.

    Returns
    -------
    (score, fitted_copy)
    """
    model = copy.deepcopy(clf)
    n_rows, _ = X0.shape

    weights = l1_reader.getwt(n_rows, wt_train)
    weights /= np.sum(weights)
    model.fit(X0, Y0, weights)

    tail = lb + k * step
    pred = model.predict(X0[-tail:, :])
    signed = Y0[-tail:] * np.sign(pred)

    score = eval_latest_avg(signed, lb=lb, k=k, step=step, wt_decay=wt_decay)
    return score, model
def wt_corr(x1d, y, wt=None, wt_decay=0.1, whiten=True):
    """Weighted mean and weighted spread of the element-wise product x*y.

    With ``whiten=True`` both inputs are first centred and scaled to unit
    (unweighted) standard deviation, so the weighted mean of the product is
    a weighted correlation.  A series whose standard deviation is not above
    1e-10 is centred but left unscaled.

    Parameters
    ----------
    x1d, y : 1-D numeric sequences of equal length; never modified
    wt : optional sample weights; when ``None`` they come from
        ``l1_reader.getwt(n, wt_decay)``.  Weights are normalised to sum
        to 1 on a private copy, so only their relative size matters and the
        caller's array is left untouched.
    wt_decay : decay argument for ``l1_reader.getwt`` (used if ``wt`` is None)
    whiten : standardise both inputs before multiplying

    Returns
    -------
    (mc, ms) : ``mc = sum(wt * x * y)`` and
    ``ms = sqrt(sum(wt * (x * y - mc)**2))``.

    Raises
    ------
    AssertionError if the two inputs differ in length.
    """
    n = len(x1d)
    assert n == len(y), 'x1d and y length mismatch'
    if wt is None:
        wt = l1_reader.getwt(n, wt_decay)
    wt = np.asarray(wt, dtype=float)
    wt = wt / np.sum(wt)

    # Float copies: the in-place centring/scaling below would fail (or
    # truncate) on integer input, and must not touch the caller's arrays.
    x0 = np.array(x1d, dtype=float)
    y0 = np.array(y, dtype=float)
    if whiten:
        for series in (x0, y0):
            series -= series.mean()
            sd = series.std()
            if sd > 1e-10:
                series /= sd

    # Element-wise weighting is the same quadratic form as x' diag(wt) y
    # without materialising the n-by-n diagonal matrix.
    prod = x0 * y0
    mc = np.dot(wt, prod)
    resid = prod - mc
    ms = np.sqrt(np.dot(wt, resid * resid))
    return mc, ms
def _fs3_say(*items):
    """Print ``items`` separated by single spaces on one line."""
    print(' '.join(str(item) for item in items))


def _fs3_split_scores(clf, Xsub, y, cuts, weights):
    """Return the running total of ``score_sign`` over the hold-out splits.

    For every ``ct`` in ``cuts`` the last ``ct`` rows are held out and the
    remaining rows are used for training with the matching weight vector.
    """
    running = []
    total = 0
    for ct, wt in zip(cuts, weights):
        total += score_sign(clf, Xsub[:-ct, :], y[:-ct], Xsub[-ct:, :], y[-ct:], wt=wt)
        running.append(total)
    return running


def feat_sel3(X, y, st_smp_cnt=16, lt_smp_cnt=200, wt_decay=0.25, wt_train=0.5, clf=None, cor100=0.05, vbose=False):
    """Greedy forward feature selection with backward pruning.

    Candidate screening
        A column is a candidate when its weighted correlation with ``y``
        over the full sample (``wt_corr2d`` with ``wt_decay``) has the same
        sign as its correlation over each of the last ``lt_smp_cnt``,
        ``mt_smp_cnt`` (the integer midpoint of the other two) and
        ``st_smp_cnt`` rows.  When ``cor100 > 0`` it must also satisfy
        ``abs(corr) * sqrt(n) / 10 > cor100``.

    Search
        Starting from column 0, each iteration scores every remaining
        candidate by the sum of ``score_sign`` over the three hold-out
        splits (last ``lt`` / ``mt`` / ``st`` rows).  The best candidate is
        added if it improves the current score by more than 1e-10; before it
        is added, already selected columns (never column 0) are removed one
        at a time for as long as the best removal does not lower the score.
        The search stops when no candidate improves the score, when
        candidates run out, or after 100 iterations.

    Parameters
    ----------
    X, y : 2-D feature matrix and target with ``len(y) == X.shape[0]``
    st_smp_cnt, lt_smp_cnt : short / long hold-out sizes (integers)
    wt_decay : decay for the full-sample correlation weights
    wt_train : decay for the training weights of each split
        (``l1_reader.getwt``, normalised); ``None`` passes ``wt=None``
    clf : estimator handed to ``score_sign``; defaults to
        ``linear_model.RidgeCV()``
    cor100 : correlation threshold; values <= 0 disable the filter
    vbose : print progress

    Returns
    -------
    Array of selected column indices (column 0 excluded), or the list
    ``[7]`` when nothing is selected.  NOTE(review): ``[7]`` is the
    original sentinel and is kept for compatibility -- confirm how callers
    interpret it.

    Raises
    ------
    AssertionError on inconsistent sizes.
    """
    n, m = X.shape
    assert n > lt_smp_cnt and lt_smp_cnt >= st_smp_cnt, 'len error'
    assert len(y) == n, 'X,y shape error'
    # Floor division: the count is used as a slice bound and must be an int.
    mt_smp_cnt = (lt_smp_cnt + st_smp_cnt) // 2
    cuts = [lt_smp_cnt, mt_smp_cnt, st_smp_cnt]

    mc, _ = wt_corr2d(X, y, wt_decay=wt_decay, whiten=True)
    mclt, _ = wt_corr2d(X[-lt_smp_cnt:, :], y[-lt_smp_cnt:], wt_decay=0, whiten=True)
    mcmt, _ = wt_corr2d(X[-mt_smp_cnt:, :], y[-mt_smp_cnt:], wt_decay=0, whiten=True)
    mcst, _ = wt_corr2d(X[-st_smp_cnt:, :], y[-st_smp_cnt:], wt_decay=0, whiten=True)

    ixst = np.nonzero(mc * mcst > 0)[0]
    ixmt = np.nonzero(mc * mcmt > 0)[0]
    ixlt = np.nonzero(mc * mclt > 0)[0]
    # candidate features
    ixa = np.intersect1d(np.intersect1d(ixst, ixmt), ixlt)
    if cor100 > 0:
        ixcor = np.nonzero(np.abs(mc) * np.sqrt(n) / 10 > cor100)[0]
        ixa = np.intersect1d(ixa, ixcor)
    else:
        # Filter disabled: every column passes.  Defined so that the verbose
        # report below never hits an unbound name.
        ixcor = np.arange(m)
    if len(ixa) == 0:
        # nothing to be chosen
        return [7]
    if vbose:
        _fs3_say('ixst: ', len(ixst), ixst)
        _fs3_say('ixmt: ', len(ixmt), ixmt)
        _fs3_say('ixlt: ', len(ixlt), ixlt)
        _fs3_say('ixcor:', len(ixcor), ixcor)
        _fs3_say('got ixa:', len(ixa), ixa)

    if wt_train is not None:
        weights = []
        for ct in cuts:
            w = l1_reader.getwt(n - ct, wt_train)
            weights.append(w / np.sum(w))
    else:
        weights = [None, None, None]

    if clf is None:
        clf = linear_model.RidgeCV()

    tol = 1e-10
    maxit = 100
    ixi = np.array([0])  # selected columns; column 0 is always kept
    scb = 0              # best score so far
    it = 0
    while len(ixa) > 0 and it < maxit:
        if vbose:
            _fs3_say('START ======= iter ', it, 'fs ', ixi, 'score ', scb, ' possible ', ixa)

        # Forward step: score every candidate added to the current set.
        sc = []
        for i0 in ixa:
            running = _fs3_split_scores(clf, X[:, np.r_[ixi, i0]], y, cuts, weights)
            if vbose:
                trace = [i0]
                for ct, total in zip(cuts, running):
                    trace.extend([ct, total])
                _fs3_say(*trace)
            sc.append(running[-1])

        # pick a best one
        ixch0 = np.argsort(sc)[-1]
        ixch = ixa[ixch0]
        if sc[ixch0] - scb > tol:
            if vbose:
                _fs3_say('adding ', ixch, ' score imp ', sc[ixch0] - scb)
            scb = sc[ixch0]
            # Backward step: would removing an existing feature improve?
            while len(ixi) > 1:
                sc = []
                for i0 in np.arange(len(ixi) - 1) + 1:
                    if vbose:
                        _fs3_say('try removing ')
                    trial = X[:, np.r_[np.delete(ixi, i0), ixch]]
                    sc0 = _fs3_split_scores(clf, trial, y, cuts, weights)[-1]
                    if vbose:
                        _fs3_say(ixi[i0], sc0)
                    sc.append(sc0)
                ixrm0 = np.argsort(sc)[-1]
                if sc[ixrm0] < scb:
                    break
                # sc is indexed from position 1 of ixi, hence the +1.
                ixrm = ixi[ixrm0 + 1]
                if vbose:
                    _fs3_say('removing ', ixrm, ' score imp ', sc[ixrm0] - scb)
                ixi = np.delete(ixi, ixrm0 + 1)
                scb = sc[ixrm0]
            ixi = np.r_[ixi, ixch]
            ixa = np.delete(ixa, ixch0)
        else:
            # No candidate helps: empty the pool so the loop ends.
            ixa = []
        if vbose:
            _fs3_say('DONE ======= iter ', it, 'fs ', ixi, 'score ', scb)
        it += 1

    if len(ixi) == 1:
        return [7]
    return ixi[1:]
def test(wb5m,if_plot=True) :
    """Exploratory back-test: build features from ``wb5m``, fit RidgeCV on
    several train/test splits, print out-of-sample correlations, and plot the
    coefficients and a cumulative PnL curve.

    ``wb5m`` is indexed as ``wb5m[row, column, channel]``; channel 1 is
    summed over column ranges to form features and the target, channel 3 is
    passed to ``get_Xvbs``.  (Presumably rows are weeks and columns are
    intraday bars, going by the comments below -- TODO confirm.)

    NOTE(review): ``if_plot`` is never read; a figure is always created.

    Returns ``(X, y0, yh, yhh, pnl)``: the standardised design matrix with a
    leading column of ones, the outlier-treated target, the prediction of
    the last fit for the final 100 rows, the normalised trade size, and the
    per-row PnL after the adjustments below.
    """
    #fd2=np.sum(wb5m[:,8*12+4:9*12+7,1],axis=1)
    #fd3=np.sum(wb5m[:,11*12+2:11*12+8,1],axis=1)
    fd0=np.sum(wb5m[:,6*12:8*12,1],axis=1)
    fd1=np.sum(wb5m[:,10*12+3:12*12,1],axis=1)
    fd2=np.sum(wb5m[:,18*12:19*12,1],axis=1)
    fd3=np.sum(wb5m[:,23*12-2:23*12+11,1],axis=1)
    # tuesday is tricky, changes a lot at the last year [-50:]
    fd4=np.sum(wb5m[:,(23+5)*12+5:(23+6)*12+2,1],axis=1)
    # NOTE(review): fd5..fd10, fd12, fd13 and fd17 are computed but not used
    # in the active design matrix below.
    fd5=np.sum(wb5m[:,(23+10)*12+9:(23+12)*12+2,1],axis=1)
    fd6=np.sum(wb5m[:,(23+12)*12+6:(23+13)*12+2,1],axis=1)
    fd7=np.sum(wb5m[:,(23+18)*12+8:(23+19)*12+4,1],axis=1)
    fd8=np.sum(wb5m[:,(23+19)*12+4:(23+19)*12+8,1],axis=1)
    fd9=np.sum(wb5m[:,(23+21)*12+5:(23+22)*12+3,1],axis=1)
    # wednesday is also tricky, oscillates for the last 3 years
    fd10=np.sum(wb5m[:,(2*23+22)*12+3:(2*23+22)*12+11,1],axis=1)
    fd12=np.sum(wb5m[:,(2*23+18)*12+3:(2*23+19)*12+3,1],axis=1)
    fd11=np.sum(wb5m[:,(3*23+10)*12+9:(3*23+12)*12+2,1],axis=1)
    fd13=np.sum(wb5m[:,(3*23+17)*12+3:(3*23+18)*12,1],axis=1) # this feature is not important
    fd14=np.sum(wb5m[:,(4*23+0)*12-1:(4*23+0)*12+6,1],axis=1) # has an overnight lr, note for roll
    fd15=np.sum(wb5m[:,-60-42:-60,1],axis=1)
    ### whole days?
    fd16=np.sum(wb5m[:,23*12*3+11:23*12*4,1],axis=1)
    fd17=np.sum(wb5m[:,23*12*1:23*12*3,1],axis=1) # this is too volatile!
    # prev
    #fd18=np.r_[0, np.sum(wb5m[:-1,-60:-30,1],axis=1)]
    #fd19=np.r_[0, fd18[:-1]]
    # Lagged targets (zero-padded at the front).  These four are overwritten
    # further down by lags of the outlier-treated target before being used.
    fd19=np.r_[np.zeros(2), np.sum(wb5m[:-2,-60:-30,1],axis=1)]
    fd20=np.r_[np.zeros(18), np.sum(wb5m[:-18,-60:-30,1],axis=1)] # good negative corr at 18th week
    fd21=np.r_[np.zeros(32), np.sum(wb5m[:-32,-60:-30,1],axis=1)] # good positive corr at 32nd week
    fd22=np.r_[np.zeros(35), np.sum(wb5m[:-35,-60:-30,1],axis=1)] # good positive corr at 35th week
    ## short range
    yy0=np.sum(wb5m[:,-60:-58,1],axis=1)
    yy1=np.sum(wb5m[:,-60:-58,3],axis=1)
    yy0_=np.sign(yy0)
    #X=np.vstack((fd0,fd1,fd2,fd3,fd4,fd5,fd6,fd7,fd8,fd9,fd10,fd11,fd12,fd13,fd14,fd15)).T # consider adding fd13 in, but it has a flip in 1 of the previous year
    #X=np.vstack((fd0,fd1,fd3,fd11,fd14)).T # it seems that Monday,Thursday and Friday has the best
    #X=np.vstack((fd0,fd1,fd2,fd3,fd4,fd11,fd14,fd15,fd16).T # it seems that Monday,Thursday and Friday has the best
    X=np.vstack((fd0,fd1,fd2,fd3,fd4,fd11,fd14,fd15,fd16)).T # it seems that Monday,Thursday and Friday has the best
    # Append the get_Xvbs features as extra column(s); a 1-D result is one column.
    Xvbs=get_Xvbs(wb5m[:,:,3])
    if len(Xvbs.shape)== 1:
        X=np.vstack((X.T,Xvbs)).T
    else :
        X=np.hstack((X,Xvbs))
    """
    # this vol0 doesn't seem to help
    Xv0=get_vol0(wb5m[:,:,4])
    if len(Xv0.shape)== 1:
        X=np.vstack((X.T,Xv0)).T
    else :
        X=np.hstack((X,Xv0))
    """
    #X=np.vstack((X.T,fd19,fd22)).T
    #X-=np.mean(X,axis=0) ; X/=np.std(X,axis=0); X=np.vstack((np.ones(len(fd0)),X.T)).T
    # Target: weighted mean/std feed the outlier treatment (same recipe as getYTrain).
    y=np.sum(wb5m[:,-60:-30,1],axis=1)
    wt=l1_reader.getwt(len(y),0.1)
    wt/=np.sum(wt)
    mu=np.dot(y,wt)
    sd=np.sqrt(np.dot(wt, (y-mu)**2))
    y0=ean.outlier(y,mu,sd,in_th=1,out_th=3)
    # Lags of the treated target replace the raw lags computed above.
    fd19=np.r_[np.zeros(2), y0[:-2]]
    fd20=np.r_[np.zeros(18), y0[:-18]] # good negative corr at 18th week
    fd21=np.r_[np.zeros(32), y0[:-32]] # good positive corr at 32nd week
    fd22=np.r_[np.zeros(35), y0[:-35]] # good positive corr at 35th week
    #X=np.vstack((X.T,fd19,np.sign(fd22)*np.abs(fd21))).T
    Xprv=np.vstack((fd19,np.sign(fd22)*np.abs(fd21))).T
    X=np.vstack((X.T,Xprv.T)).T
    # Standardise every column, then prepend a column of ones.
    xmu=np.mean(X,axis=0)
    xsd=np.std(X,axis=0)
    #X-=np.mean(X,axis=0) ; X/=np.std(X,axis=0); X=np.vstack((np.ones(len(fd0)),X.T)).T
    X-=xmu ; X/=xsd ; print xmu, xsd
    X=np.vstack((np.ones(wb5m.shape[0]),X.T)).T
    #return X,xmu,xsd
    fig=pl.figure()
    ax=fig.add_subplot(2,1,1)
    axp=fig.add_subplot(2,1,2)
    # Overrides the weighted mu/sd above, so "* sd + mu" below is an identity.
    mu=0 ; sd=1 # don't scale it
    #omp=linear_model.SGDRegressor(loss='huber',penalty='l1',alpha=1)
    #y0=np.sign(y0)
    #omp=linear_model.Lars()
    #omp=linear_model.OrthogonalMatchingPursuitCV()
    #omp=linear_model.LassoCV()
    # Fit 1: train on all but the last 50 rows, test on the last 50.
    omp=linear_model.RidgeCV(); omp.fit(X[:-50,:],y0[:-50])
    yh=omp.predict(X[-50:,:]) * sd + mu
    # NOTE(review): ax is assigned from add_subplot just above, so these
    # guards look always true -- confirm.
    if ax is not None:
        ax.plot(omp.coef_, '.-', label='-50')
    print np.corrcoef(yh, y0[-50:])[0,1]
    # Fit 2: train on all but the last 100 rows, test on rows [-100:-50].
    omp=linear_model.RidgeCV(); omp.fit(X[:-100,:],y0[:-100])
    yh=omp.predict(X[-100:-50,:]) * sd + mu
    print np.corrcoef(yh, y0[-100:-50])[0,1]
    if ax is not None:
        ax.plot(omp.coef_, '.-', label='-100:-50')
    # Fit 3: train on all but the last 200 rows, test on rows [-200:-100].
    omp=linear_model.RidgeCV(); omp.fit(X[:-200,:],y0[:-200])
    yh=omp.predict(X[-200:-100,:]) * sd + mu
    print np.corrcoef(yh, y0[-200:-100])[0,1]
    if ax is not None:
        ax.plot(omp.coef_, '.-', label='-200:-100')
    # Fit 4: same training rows as fit 2, tested on the whole last 100 rows.
    # This yh is the one used for the PnL simulation below.
    omp=linear_model.RidgeCV(); omp.fit(X[:-100,:],y0[:-100])
    yh=omp.predict(X[-100:,:]) * sd + mu
    print np.corrcoef(yh, y0[-100:])[0,1]
    if ax is not None:
        ax.plot(omp.coef_, '.-', label='-100')
    # look into the execution of the last 100 weeks
    yhh=yh.copy()
    ysd=yhh.std()
    yhh0=yh*yy0[-100:]
    ix1=np.nonzero(yhh0>0)[0] # correct sign at first 10 minutes
    # Where the prediction agrees in sign with yy0, push it further that way.
    yhh[ix1]+= yy0[-100:][ix1]/yy0.std()*ysd*2
    # add the percentage of the high positive ones
    #th0=np.percentile(np.abs(yhh0[ix1]), 50)
    #ix00=np.nonzero(np.abs(yhh0[ix1])>th0)[0]
    #yhh[ix1[ix00]]+=yy0[-100:][ix1[ix00]]/yy0.std()*ysd*1
    #ix2=np.nonzero(yh*yy1[-100:]>0)[0]
    #yhh[ix2]+= np.sign(yy1[-100:][ix2])*ysd*.1
    # Exponent 0.01 compresses magnitudes towards 1 while keeping the sign.
    yhh=np.sign(yhh)*(np.abs(yhh)**0.01)
    yhh/=np.max(np.abs(yhh)) # yhh is the normalized size to trade
    # try sign
    #yhh=np.sign(yhh)
    # Cumulative channel-1 sum over columns [-58:-30] for the last 100 rows,
    # scaled row-wise by the trade size.
    ylr=np.cumsum(wb5m[-100:,-58:-30,1],axis=1)
    yraw=ylr[:,-1]
    pnl=(ylr.T*yhh).T
    #pnl=yraw*yhh
    # stop loss doesn't work
    th=0.0125
    #th=0.02
    tick=0.0001
    # For each row: once the path exceeds th1, look for the first later point
    # at or below 1.2*th1 and, if found, use that value as the row's final PnL.
    for i,(p0,yhh0) in enumerate(zip(pnl,np.abs(yhh))) :
        th1=th*yhh0
        #th1=th
        #ix=np.nonzero(p0<-th1)[0]
        ix=np.nonzero(p0>th1)[0]
        if len(ix)>0 :
            eix=np.nonzero(p0[ix[0]:] <= th1*1.2)[0]
            if len(eix) > 0 :
                eix=ix[0]+eix[0]
                #th0=p0[eix]-tick*yhh0
                th0=p0[eix]
                #print 'stop loss at ',ix[0],eix,p0[ix[0]],p0[eix], 'final pnl is ',pnl[i,-1],th0,'saved ', th0-pnl[i,-1]
                pnl[i,-1]=th0
    ltx=-1
    pnl0=pnl[:,ltx].copy()
    #pnl0=pnl[:,-10].copy()
    pnl=pnl[:,-1]
    ix=np.nonzero(pnl0<0)[0]
    ixp=np.nonzero(pnl0>0)[0]
    #for a0,a1,a2 in zip(yhh[ix],pnl[ix],pnl0[ix]) :
    #    print a0, a1, a2
    # let negative ones run longer
    ltx2=-1
    #print ltx2, ltx, np.sum(wb5m[-100:,-30+ltx+1:ltx2,1],axis=1)[ix]*yhh[ix]
    # With ltx=-1 and ltx2=-1 the column ranges below are [-20:-1], [-12:-1]
    # and [-30:-24] respectively.
    pnl0[ix]-=(np.sum(wb5m[-100:,-30+ltx+1+10:ltx2,1],axis=1)[ix]*yhh[ix])
    #pnl0[ix]+=(np.sum(wb5m[-100:,-30+ltx+1:-30+ltx+1+10,1],axis=1)[ix]*yhh[ix])
    pnl0[ixp]+=(np.sum(wb5m[-100:,-30+ltx+1+18:ltx2,1],axis=1)[ixp]*yhh[ixp])
    pnl0[ixp]-=(np.sum(wb5m[-100:,-30+ltx+1:-30+ltx+1+6,1],axis=1)[ixp]*yhh[ixp])
    pnl[ixp]=pnl0[ixp].copy()
    pnl[ix]=pnl0[ix].copy()
    # Legend shows total PnL and mean/std of the per-row PnL.
    axp.plot(np.cumsum(pnl), '.-',label='pnl=%.2f,shp=%.2f'%(np.sum(pnl), pnl.mean()/pnl.std()))
    ax.legend()
    axp.legend()
    return X,y0,yh,yhh,pnl