def work(confer):
    ta1 = confer.get_ta_file()
    last_trade_date = confer.last_trade_date
    confer.last_trade_date = base.get_second_trade_date_local(
        confer.syms.get_name())
    print(confer.last_trade_date)
    ta2 = confer.get_ta_file()
    confer.last_trade_date = last_trade_date
    print(ta1, ta2)
    df1 = pd.read_pickle(ta1)
    df1 = df1[
        df1.date <= base.get_second_trade_date_local(confer.syms.get_name())]
    df2 = pd.read_pickle(ta2)
    syms1 = df1.sym.unique()
    syms2 = df2.sym.unique()
    print(syms1, syms2)
    assert len(syms1) == len(syms2)
    df1.reset_index(drop=True, inplace=True)
    df2.reset_index(drop=True, inplace=True)
    assert len(df1) == len(df2)
    assert_frame_equal(df1[base.get_feat_names(df1)],
                       df2[base.get_feat_names(df2)])

def main(argv):
    clsName = argv[0]
    stage = int(argv[1])
    taName = argv[2]
    start = argv[3]
    end = argv[4]
    label = argv[5]
    if end == "<0":
        end = "2099-12-31"
    cls = get_cls(clsName)
    ta = get_ta(taName)
    ta = ta[(ta.date >= start) & (ta.date <= end)]
    dfFeat = ta.loc[:, base.get_feat_names(ta)]
    print dfFeat.tail(1)
    npFeat = dfFeat.values
    #npPred = cls.predict_proba(npFeat)
    # walk the boosting stages and keep the probabilities from the requested
    # stage
    for i, npPred in enumerate(cls.staged_predict_proba(npFeat)):
        if i == stage:
            break
    ta["pred"] = npPred[:, 1]
    ta.sort_values("pred", inplace=True, ascending=False)
    print ta[["date", "sym", "pred"]].head(10)
    ta.to_csv(
        os.path.join(base.dir_preds(),
                     base.fname_pred(clsName, taName, start, end)))
    ta[["date", "sym", "pred", label]].to_csv(
        os.path.join(base.dir_preds(),
                     base.fname_pred_s(clsName, taName, start, end)))

def post_valid(classifier, df_train, df_test, score, is_fit):
    df_train = df_train.sort_values(["sym", "date"])
    # from sklearn.exceptions import NotFittedError
    npTrainFeat, npTrainLabel = extract_feat_label(df_train, score.get_name())
    npTestFeat, npTestLabel = extract_feat_label(df_test, score.get_name(),
                                                 drop=False)
    feat_names = base.get_feat_names(df_test)
    if not is_fit:
        probas_ = classifier.predict_proba(npTestFeat)
    else:
        classifier.fit(npTrainFeat, npTrainLabel)
        probas_ = classifier.predict_proba(npTestFeat)
    d_feat_ipts = classifier.get_feature_importances(feat_names)
    ipts = []
    if len(d_feat_ipts) > 0:
        for each in sorted(d_feat_ipts.items(), key=lambda a: a[1],
                           reverse=True):
            ipts.append({"name": each[0], "score": each[1]})
    fpr, tpr, thresholds = roc_curve(npTestLabel, probas_[:, 1])
    roc_auc = auc(fpr, tpr)
    min_ = str(df_test.head(1)["yyyy"].values[0])
    max_ = str(df_test.tail(1)["yyyy"].values[0])
    df_test.loc[:, "pred"] = probas_[:, 1]
    df_test.loc[:, "pred2"] = probas_[:, 0]
    #pdt.assert_numpy_array_equal(
    #    df_test.round(2).loc[:, "pred"].values[0:10],
    #    1 - df_test.round(2).loc[:, "pred2"].values[0:10])
    post = {
        "classifier": classifier,
        "ipts": ipts,
        "fpr": fpr,
        "tpr": tpr,
        "thresholds": thresholds,
        "roc_auc": roc_auc,
        "name": "%s-%s" % (min_, max_),
        "min": min_,
        "max": max_,
        "df_test": df_test
    }
    return post

def pred(self, start=None):
    df_all = pd.read_pickle(self.confer.get_sel_file())
    if start is not None:
        df_all = df_all[df_all.date >= start]
    score = self.confer.scores[0]
    # balance the classes: down-sample the majority side to the minority size
    df_all_1 = df_all[df_all[score.get_name()] < 0.5]
    df_all_2 = df_all[df_all[score.get_name()] > 0.5]
    assert len(df_all_1) + len(df_all_2) == len(df_all)
    df_all_2 = df_all_2.sample(n=len(df_all_1))
    assert len(df_all_2) == len(df_all_1)
    df_all = pd.concat([df_all_1, df_all_2], axis=0)
    assert len(df_all) == 2 * len(df_all_1)
    df_all = df_all.sample(frac=1.0, random_state=1253)
    feat_names = base.get_feat_names(df_all)
    np_feat = df_all.loc[:, feat_names].values
    print("pred start : %s pred end: %s total:%d" %
          (df_all.sort_values('date').head(1)['date'].values[0],
           df_all.sort_values('date').tail(1)['date'].values[0],
           len(df_all)))
    np_pred = self.confer.classifier.predict_proba(np_feat)
    #df_all = df_all.iloc[2-1:]
    if np_pred.shape[1] == 2:
        df_all["pred"] = np_pred[:, 1]
    else:
        df_all["pred"] = np_pred[:, 0]
    df_all = df_all.sample(frac=1.0)
    return df_all.sort_values("pred", ascending=False)

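# The class-balancing trick in pred() above (down-sample the majority class to
# the minority size, then shuffle) can be checked in isolation. A minimal,
# hedged sketch on a toy frame -- the "score" column is illustrative, not the
# project's real schema:
import numpy as np
import pandas as pd

def _demo_balance():
    rng = np.random.RandomState(1253)
    df = pd.DataFrame({"score": (rng.rand(100) > 0.3).astype(float)})
    df_neg = df[df["score"] < 0.5]
    df_pos = df[df["score"] > 0.5]
    # down-sample the (larger) positive side to the negative side's size
    df_pos = df_pos.sample(n=len(df_neg), random_state=1253)
    df_bal = pd.concat([df_neg, df_pos]).sample(frac=1.0, random_state=1253)
    assert len(df_bal) == 2 * len(df_neg)
    return df_bal
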
def _get_metas(dfTa, depth, min_, label, n_pool):
    feat_names = base.get_feat_names(dfTa)
    results = []
    import concurrent.futures
    if n_pool == 1:
        for cur_feat in feat_names:
            feat_meta = _feat_meta(cur_feat, dfTa,
                                   len(dfTa[dfTa[label] == 1]),
                                   len(dfTa[dfTa[label] == 0]),
                                   len(dfTa), label, depth, min_)
            if feat_meta is not None:
                results.append(feat_meta)
    else:
        Executor = concurrent.futures.ProcessPoolExecutor
        # plen, nlen and alen are computed once so the workers need not
        # re-count the full frame
        plen = len(dfTa[dfTa[label] == 1])
        nlen = len(dfTa[dfTa[label] == 0])
        alen = len(dfTa)
        with Executor(max_workers=n_pool) as executor:
            futures = {
                executor.submit(_feat_meta, cur_feat,
                                dfTa[[cur_feat, label]].copy(), plen, nlen,
                                alen, label, depth, min_): cur_feat
                for cur_feat in feat_names
            }
            for future in concurrent.futures.as_completed(futures):
                try:
                    cur_feat = futures[future]
                    results.append(future.result())
                except Exception:
                    import traceback
                    traceback.print_exc()
                    sys.exit(1)
    return results

def one_work(cls, taName, label, date_range, th):
    df = base.get_merged(base.dir_ta(taName))
    df = get_range(df, date_range[0], date_range[1])
    m = joblib.load(
        os.path.join(root, 'data', 'models', "model_" + cls + ".pkl"))
    s = joblib.load(
        os.path.join(root, 'data', 'models', "scaler_" + cls + ".pkl"))
    feat_names = base.get_feat_names(df)
    npFeat = df.loc[:, feat_names].values
    #npPred = cls.predict_proba(npFeat)[:,1]
    #print npPred
    res = ""
    for i, npPred in enumerate(m.staged_predict_proba(s.transform(npFeat))):
        #if i % 1 != 0:
        #    continue
        re = "%s\t%s\t%s\t%s\t%s\t%f\t" % (cls, taName, label, date_range[0],
                                           date_range[1], th)
        df["pred"] = npPred[:, 1]
        dacc = accu(df, label, th)
        re += "%d\t%d\t%d\t" % (i, dacc["trueInPos"], dacc["pos"])
        if dacc["pos"] > 0:
            re += "%f" % (dacc["trueInPos"] * 1.0 / dacc["pos"])
        else:
            re += "0.0"
        re += "\n"
        print re
        res += re
    # return the accumulated report; the original returned only the last
    # line (re), which left res unused
    return res

def _select(self, df, start, end, score):
    """
    http://nlp.stanford.edu/IR-book/html/htmledition/mutual-information-1.html#mifeatsel
    """
    df = df[(df.date >= start) & (df.date < end)]
    feat_names = base.get_feat_names(df)
    label = df.loc[:, score]
    res = pd.DataFrame(data=None, index=feat_names)
    df = df[feat_names]
    n11 = df[label > 0.5].sum()
    n10 = df[label < 0.5].sum()
    n01 = (1 - df[label > 0.5]).sum()
    n00 = (1 - df[label < 0.5]).sum()
    n = df.count()
    n1_ = n11 + n10
    n0_ = n01 + n00
    n_1 = n01 + n11
    n_0 = n00 + n10
    assert 0 == (n11 + n01 - df[label > 0.5].count()).sum()
    assert 0 == (n11 + n01 + n10 + n00 - df.count()).sum()
    mi = n11/n*np.log2(n*n11/(n1_*n_1)) + n01/n*np.log2(n*n01/(n0_*n_1)) \
        + n10/n*np.log2(n*n10/(n1_*n_0)) + n00/n*np.log2(n*n00/(n0_*n_0))
    res["mi"] = mi
    res["pn_ratio"] = n11 / (n11 + n10)
    res = res.sort_values("mi", ascending=False)
    return res

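# A toy numeric check of the mutual-information formula above (it follows the
# feature-selection MI of the IR-book chapter cited in the docstring). Hedged
# sketch with made-up binary features, not project data:
import numpy as np
import pandas as pd

def _demo_mi():
    df = pd.DataFrame({"f": [1, 1, 0, 0, 1, 0, 1, 0]})
    label = pd.Series([1, 1, 0, 0, 0, 1, 1, 0])
    n11 = df[label > 0.5].sum()        # feature on, label positive
    n10 = df[label < 0.5].sum()        # feature on, label negative
    n01 = (1 - df[label > 0.5]).sum()  # feature off, label positive
    n00 = (1 - df[label < 0.5]).sum()  # feature off, label negative
    n = df.count().astype(float)
    n1_, n0_ = n11 + n10, n01 + n00
    n_1, n_0 = n01 + n11, n00 + n10
    mi = (n11/n*np.log2(n*n11/(n1_*n_1)) + n01/n*np.log2(n*n01/(n0_*n_1))
          + n10/n*np.log2(n*n10/(n1_*n_0)) + n00/n*np.log2(n*n00/(n0_*n_0)))
    return mi  # ~0.189 bits for this toy column
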
def one_work(name, dir_ta, model, label, date_range):
    if os.path.isfile(
            os.path.join(root, 'data', 'models', "model_%s.pkl" % name)):
        print "%s already exists!" % name
        return
    dfTa = ta.get_merged(dir_ta)
    dfTrain = build_trains(dfTa, date_range[0], date_range[1])
    feat_names = base.get_feat_names(dfTrain)
    npTrainFeat = dfTrain.loc[:, feat_names].values
    npTrainLabel = dfTrain.loc[:, label].values.copy()
    # binarize the label (the original also had a stray no-op
    # `npTrainLabel[npTrainLabel != 1.0]` here, dropped)
    npTrainLabel[npTrainLabel > 1.0] = 1
    npTrainLabel[npTrainLabel < 1.0] = 0
    model.fit(npTrainFeat, npTrainLabel)
    joblib.dump(model,
                os.path.join(root, "data", "models", "model_%s.pkl" % name),
                compress=3)
    dFeatImps = dict(zip(feat_names, model.feature_importances_))
    with open(
            os.path.join(root, 'data', 'models',
                         'model_%s_importance' % name), 'w') as fipt:
        for each in sorted(dFeatImps.iteritems(), key=lambda a: a[1],
                           reverse=True):
            print >> fipt, each[0], ",", each[1]

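# The label columns appear to hold forward price ratios centred on 1.0 (see
# label5 elsewhere in this section); training needs a 0/1 target. A hedged
# sketch of the thresholding used above, on a made-up array:
import numpy as np

def _demo_binarize():
    ratios = np.array([0.97, 1.00, 1.03, 1.10, 0.88])
    labels = ratios.copy()
    labels[labels > 1.0] = 1  # rose over the horizon -> positive
    labels[labels < 1.0] = 0  # fell -> negative
    # note: an exact 1.00 is matched by neither mask and stays 1.0
    return labels  # array([0., 1., 1., 1., 0.])
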
def main(args):
    exec "import main.pandas_talib.sig_%s as conf" % args.signame
    build.work2(20, 'sp500Top50', args.signame)
    df = base.get_merged(conf.__name__, yeod.get_sp500Top50())
    df.to_csv("ta.csv")
    tree = DecisionTreeClassifier()
    feat_names = base.get_feat_names(df)
    dfTrain = df[(df.date >= '1970-01-01') & (df.date <= '2009-12-31')]
    npTrainFeat = dfTrain.loc[:, feat_names].values.copy()
    npTrainLabel = dfTrain.loc[:, "label5"].values.copy()
    npTrainLabel[npTrainLabel > 1.0] = 1
    npTrainLabel[npTrainLabel < 1.0] = 0
    tree.fit(npTrainFeat, npTrainLabel)
    joblib.dump(tree, "tree.pkl", compress=3)
    dfTest = df[(df.date >= '2010-01-01') & (df.date <= '2099-12-31')]
    npTestFeat = dfTest.loc[:, feat_names].values.copy()
    npPred = tree.predict_proba(npTestFeat)
    dfTest.loc[:, "pred"] = npPred[:, 1]
    print dfTest['pred'].head()
    dfPos = dfTest[dfTest['pred'] > 0.55]
    print 1.0 * len(dfPos[dfPos['label5'] > 1]) / len(dfPos)
    print 1.0 * len(dfTest[dfTest['label5'] > 1]) / len(dfTest)

def main(args):
    lsym = getattr(yeod, "get_%s" % args.setname)()
    if args.start is None:
        args.start = base.last_trade_date()
    args.end = args.start
    cls = joblib.load(os.path.join(base.dir_model(), args.model))
    ta = base.get_merged_with_na(args.taname, lsym)
    ta = ta[(ta['date'] >= args.start) & (ta['date'] <= args.end)]
    dfFeat = ta.loc[:, base.get_feat_names(ta)]
    dfFeat = dfFeat.replace([np.inf, -np.inf], np.nan).dropna()
    # keep only rows that survived the NaN/inf filter, otherwise the pred
    # column below would not line up with the frame
    ta = ta.loc[dfFeat.index]
    npFeat = dfFeat.values
    npPred = cls.predict_proba(npFeat)
    #for i, npPred in enumerate(cls.staged_predict_proba(npFeat)):
    #    if i == args.stage:
    #        break
    ta["pred"] = npPred[:, 1]
    ta.sort_values("pred", inplace=True, ascending=False)
    freport, fcsv = base.file_pred(args)
    ta.to_csv(fcsv)
    #ta[["date", "sym", "pred", label]].to_csv(os.path.join(out_dir, 'pred.s.csv'))
    with open(freport, 'w') as fout:
        print >> fout, ta[["date", "sym", "pred"]].head(10)

def extract_feat_label(df, scorename):
    df = df.replace([np.inf, -np.inf], np.nan).dropna()
    feat_names = base.get_feat_names(df)
    npFeat = df.loc[:, feat_names].values.copy()
    npLabel = df.loc[:, scorename].values.copy()
    npPred = df.loc[:, "pred"].values.copy()
    return npFeat, npLabel, npPred

def mutual_information(df, confer):
    """
    http://nlp.stanford.edu/IR-book/html/htmledition/mutual-information-1.html#mifeatsel
    """
    feat_names = base.get_feat_names(df)
    label = df.loc[:, confer.score1.get_name()]
    res = pd.DataFrame(data=None, index=feat_names)
    df = df[feat_names]
    n11 = df[label > 0.5].sum()
    n10 = df[label < 0.5].sum()
    n01 = (1 - df[label > 0.5]).sum()
    n00 = (1 - df[label < 0.5]).sum()
    n = df.count()
    n1_ = n11 + n10
    n0_ = n01 + n00
    n_1 = n01 + n11
    n_0 = n00 + n10
    assert 0 == (n11 + n01 - df[label > 0.5].count()).sum()
    assert 0 == (n11 + n01 + n10 + n00 - df.count()).sum()
    mi = n11/n*np.log2(n*n11/(n1_*n_1)) + n01/n*np.log2(n*n01/(n0_*n_1)) \
        + n10/n*np.log2(n*n10/(n1_*n_0)) + n00/n*np.log2(n*n00/(n0_*n_0))
    res["n11"] = n11
    res["n10"] = n10
    res["n01"] = n01
    res["n00"] = n00
    res["mi"] = mi
    return res

def _get_metas(dfTa, depth, min_, label, n_pool):
    feat_names = base.get_feat_names(dfTa)
    results = []
    import concurrent.futures
    Executor = concurrent.futures.ProcessPoolExecutor
    plen = len(dfTa[dfTa[label] > 0.5])
    nlen = len(dfTa[dfTa[label] < 0.5])
    alen = len(dfTa)
    with Executor(max_workers=n_pool) as executor:
        futures = {
            executor.submit(_feat_meta, cur_feat,
                            dfTa[[cur_feat, label]].copy(), plen, nlen, alen,
                            label, depth, min_): cur_feat
            for cur_feat in feat_names
        }
        for future in concurrent.futures.as_completed(futures):
            try:
                cur_feat = futures[future]
                results.append(future.result())
            except Exception:
                import traceback
                traceback.print_exc()
                sys.exit(1)
    return results

def main(args):
    lsym = getattr(yeod, "get_%s" % args.setname)()
    dfTa = base.get_merged(args.taname, lsym)
    if dfTa is None:
        print "can not merge %s" % args.setname
        sys.exit(1)
    dfTa = base.get_range(dfTa, args.start, args.end)
    print dfTa.shape
    #if args.filter:
    #    dfTa = filter_trend(dfTa)
    #print dfTa.shape
    cls = joblib.load(os.path.join(base.dir_model(), args.model))
    feat_names = base.get_feat_names(dfTa)
    npFeat = dfTa.loc[:, feat_names].values
    # isscaler / get_scaler / clsName are module-level helpers not shown in
    # this excerpt
    if isscaler:
        scaler = get_scaler(clsName)
        npFeatScaled = scaler.transform(npFeat)
    else:
        npFeatScaled = npFeat
    #for i, npPred in enumerate(cls.staged_predict_proba(npFeatScaled)):
    #    if i == args.stage:
    #        break
    # predict on the (possibly) scaled features; the original passed the raw
    # npFeat here and never used the scaled copy
    npPred = cls.predict_proba(npFeatScaled)
    dfTa["pred"] = npPred[:, 1]
    dfTa = dfTa.sort_values(['pred'], ascending=False)
    freport, fpred = base.file_paper(args)
    dfTa.to_csv(fpred)
    ana.main([fpred, args.top, args.thresh, freport, args.level])
    print freport

def work(classifier, df_pred):
    feat_names = base.get_feat_names(df_pred)
    np_feat = df_pred.loc[:, feat_names].values
    np_pred = classifier.predict_proba(np_feat)
    df_pred["pred"] = np_pred[:, 1]
    df_pred.sort_values(['pred'], ascending=False, inplace=True)
    return df_pred[["date", "sym", "open", "high", "low", "close",
                    "pred"]].head(20)

def verify_predict(self, df):
    feat_names = base.get_feat_names(df)
    ipts = self.get_feature_importances(feat_names)
    s = 0
    for each in ipts:
        if int(df[each]) == 1:
            s += ipts[each] * 1
    import math
    return 1 / (1 + math.exp(-1 * (s + self.classifier.intercept_)))

def main(args):
    cls = params_set.d_model[args.clsname]
    file_model, file_ipt = base.file_model(args)
    if os.path.isfile(file_model):
        print "%s already exists!" % file_model
        return
    dfTa = base.get_merged(args.taname,
                           getattr(yeod, "get_%s" % args.setname)(),
                           args.start, args.end)
    if dfTa is None:
        return None
    dfTrain = dfTa  # build_trains(dfTa, args.start, args.end)
    if args.sample:
        print "sampling ..."
        # keep len/args.sample rows; the original divided by an unassigned
        # local, args.sample is the most plausible divisor
        sample = len(dfTrain) / args.sample
        rows = random.sample(range(len(dfTrain)), sample)
        print len(rows)
        dfTrain = dfTrain.reset_index(drop=True)
        dfTrain = dfTrain.iloc[rows]
    if args.repeat:
        print "repeat ..."
        toAppends = []
        for i in range(1, 3):
            dfTmp = dfTrain[dfTrain.label5 >= 1 + i / 20.0]
            toAppends.append(dfTmp)
        print dfTrain.shape
        dfTrain = dfTrain.append(toAppends)
        print dfTrain.shape
    if args.sw:
        dfTrain = getattr(sw, "sw_%s" % args.sw)(dfTrain)
    feat_names = base.get_feat_names(dfTrain)
    npTrainFeat = dfTrain.loc[:, feat_names].values
    npTrainLabel = dfTrain.loc[:, args.labelname].values.copy()
    npTrainLabel[npTrainLabel > 1.0] = 1
    npTrainLabel[npTrainLabel < 1.0] = 0
    if args.scaler:
        scaler = getMinMax(npTrainFeat)
        npTrainFeatScaled = scaler.transform(npTrainFeat)
    else:
        npTrainFeatScaled = npTrainFeat
    if args.sw:
        cls.fit(npTrainFeatScaled, npTrainLabel,
                sample_weight=dfTrain["sample_weight"].values)
    else:
        cls.fit(npTrainFeatScaled, npTrainLabel)
    joblib.dump(cls, file_model, compress=3)
    #joblib.dump(scaler, os.path.join(root, 'data', 'models', scalerName),
    #            compress=3)
    dFeatImps = dict(zip(feat_names, cls.feature_importances_))
    with open(file_ipt, 'w') as fipt:
        for each in sorted(dFeatImps.iteritems(), key=lambda a: a[1],
                           reverse=True):
            print >> fipt, each[0], ",", each[1]

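# The "repeat" branch above oversamples strong winners: rows whose label5
# ratio clears successively higher bars get appended again. A hedged toy
# version with an invented frame:
import pandas as pd

def _demo_repeat():
    df = pd.DataFrame({"label5": [0.95, 1.02, 1.07, 1.12]})
    extra = [df[df.label5 >= 1 + i / 20.0] for i in range(1, 3)]
    # rows >= 1.05 end up twice, rows >= 1.10 three times
    return pd.concat([df] + extra)
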
def get_ipts(confer):
    df = pd.read_pickle(confer.get_sel_file())
    d_feat_ipts = confer.classifier.get_feature_importances(
        base.get_feat_names(df))
    ipts = []
    if len(d_feat_ipts) > 0:
        for each in sorted(d_feat_ipts.items(), key=lambda a: a[1],
                           reverse=True):
            ipts.append({"name": each[0], "score": each[1]})
    df_ipts = pd.DataFrame(data=ipts)
    df_ipts = df_ipts.set_index("name")
    return df_ipts

def _get_metas2(dfTa, depth, min_, label):
    feat_names = base.get_feat_names(dfTa)
    idx = 0
    results = []
    for cur_feat in feat_names:
        idx += 1
        ## plen, nlen and alen are passed in to speed up _feat_meta2
        results.append(_feat_meta2(cur_feat, dfTa,
                                   len(dfTa[dfTa[label] == 1]),
                                   len(dfTa[dfTa[label] == 0]),
                                   len(dfTa), label, depth, min_))
        print "%d done!" % idx
    return results

def get_metas(dfTa):
    feat_names = base.get_feat_names(dfTa)
    list_feat_meta = []
    idx = 0
    for cur_feat in feat_names:
        idx += 1
        if istest:
            if idx > 10:
                break
        list_feat_meta.append(feat_meta(cur_feat, dfTa, "label5"))
    return list_feat_meta

def top_10_decision_path(crosses, crossname):
    print('|symset|glo_l1|sel_l1|glo_l2|sel_l2|select_len|min|max|',
          file=out_file)
    print('|------|------|------|------|------|----------|---|---|',
          file=out_file)
    for each in crosses:
        cross = each[crossname]
        df_test = pd.concat([c.df_test for c in cross])
        df_select, df_year, df_month = ana.select2(confer.score1,
                                                   confer.score2, df_test,
                                                   1, 10)
        print('|%s|%.2f|%.2f|%.2f|%.2f|%d|%.2f|%.2f|' %
              (each["symsetname"],
               ana.accurate(df_test, confer.score1),
               ana.accurate(df_select, confer.score1),
               ana.accurate(df_test, confer.score2),
               ana.accurate(df_select, confer.score2),
               len(df_select),
               df_select.tail(1)["pred"] if len(df_select) > 0 else 0,
               df_select.head(1)["pred"] if len(df_select) > 0 else 0),
              file=out_file)
        np_feat = df_select[base.get_feat_names(df_select)].values
        classifier = cross[0].classifier
        for i in range(len(np_feat)):
            x = np_feat[i, :]
            print(x)
            dot_file = os.path.join(
                root, "data", "cross",
                'top_10_decision_path-%s-%d' % (each["symsetname"], i))
            decision_path.export_decision_path2(
                classifier, x, dot_file + ".dot",
                feature_names=base.get_feat_names(df_select))
            import pydot
            (graph, ) = pydot.graph_from_dot_file(dot_file + ".dot")
            graph.write_png(dot_file + ".png")
    for each in crosses:
        for i in range(10):
            dot_file = os.path.join(
                root, "data", "cross",
                'top_10_decision_path-%s-%d' % (each["symsetname"], i))
            print("![](%s.png)" % dot_file, file=out_file)

def main(df):
    df = base1.main(df)
    df.reset_index(inplace=True, drop=True)
    orig_feats = base.get_feat_names(df)
    df = merge(df, 1)
    df = merge(df, 2)
    df = merge(df, 3)
    # of the original features, keep only the ADX family
    for each in orig_feats:
        if not each.startswith("ta_ADX"):
            del df[each]
    print list(df.columns)
    return df

def verify_predict(self, df):
    feat_names = base.get_feat_names(df)
    ipts = self.get_feature_importances(feat_names)
    s = None
    for each in ipts:
        tmp = df[each] * ipts[each]
        if s is None:
            s = tmp
        else:
            s += tmp
    return 1 / (1 + np.exp(-1 * (s + self.classifier.intercept_)))

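# verify_predict re-derives the positive-class probability from the stored
# weights: sigmoid(w.x + b). If self.classifier is an sklearn
# LogisticRegression -- an assumption, the wrapper class is not shown in this
# excerpt -- the hand-rolled value matches predict_proba. A minimal sketch:
import numpy as np
from sklearn.linear_model import LogisticRegression

def _demo_verify_logistic():
    X = np.array([[0., 1.], [1., 0.], [1., 1.], [0., 0.]])
    y = np.array([1, 0, 1, 0])
    clf = LogisticRegression().fit(X, y)
    manual = 1 / (1 + np.exp(-(X.dot(clf.coef_[0]) + clf.intercept_[0])))
    assert np.allclose(manual, clf.predict_proba(X)[:, 1])
    return manual
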
def pn_ratio(df, confer):
    feat_names = base.get_feat_names(df)
    label = df.loc[:, confer.score1.get_name()]
    res = pd.DataFrame(data=None, index=feat_names)
    df = df[feat_names]
    n11 = df[label > 0.5].sum()
    n10 = df[label < 0.5].sum()
    res["n11"] = n11
    res["n10"] = n10
    res["pn_ratio"] = res["n11"] / (res["n10"] + res["n11"])
    return res

def main(argv):
    clsName = argv[1]
    cls = joblib.load(clsName)
    idx = 0
    out_dir = os.path.join(root, 'data', 'graph', "tmp2")
    mkdir_p(out_dir)
    print out_dir
    ta = pd.read_csv(argv[2])
    names = base.get_feat_names(ta)
    dotfile = os.path.join(out_dir, '%d.dot' % idx)
    export_graphviz(cls, feature_names=names, out_file=dotfile)

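# Rendering the .dot file to an image: the pydot route commented out in the
# per-estimator variant further below is the likely intent. A hedged sketch
# (assumes pydot and graphviz are installed):
import pydot

def render_dot_to_png(dotfile):
    (graph,) = pydot.graph_from_dot_file(dotfile)
    graph.write_png(dotfile.replace(".dot", ".png"))
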
def main(df):
    df = base1.main(df)
    df.reset_index(inplace=True, drop=True)
    orig_feats = base.get_feat_names(df)
    print df.shape
    df = merge(df, 1)
    print df.shape
    df = merge(df, 2)
    print df.shape
    for each in orig_feats:
        del df[each]
    print df.shape
    return df

def feat_split(dfo, start, end, split_point, label, depth, min_, n_pool):
    df = dfo[(dfo.date >= start) & (dfo.date < end)]
    df_len = len(df)
    split_point = int(df_len * split_point)
    df1 = df.iloc[:split_point]
    df2 = df.iloc[split_point:]
    assert df_len == len(df1) + len(df2)
    df_bit1s = bitlize(df1, label, depth, min_, n_pool)
    # start from the non-feature columns; bit features are appended below
    candis = [dfo[list(set(dfo.columns) - set(base.get_feat_names(dfo)))]]
    l_feat = []
    for i in range(0, 2**depth):
        df_bit1 = df_bit1s[i]
        if len(df2) > 0:
            df_bit2 = apply(df_bit1, df2, label)
            # keep only bins whose deviation from 1 points the same way on
            # both halves of the split
            df_bit1.loc["direct"] = \
                ((df_bit1.loc["p_chvfa"] - 1) *
                 (df_bit2.loc["p_chvfa"] - 1)) > 0.00008
            print(df_bit1.shape)
            df_bit1 = df_bit1.loc[:, df_bit1.loc["direct"]]
            print(df_bit1.shape)
        feat_names = base.get_feat_names(df_bit1)
        df_candi = (dfo[feat_names] >= df_bit1.loc["start"]) & (
            dfo[feat_names] < df_bit1.loc["end"])
        pd.set_option('display.expand_frame_repr', False)
        df_candi.columns = df_bit1.loc["name"]
        feat_cur = df_bit1.copy()
        feat_cur.columns = feat_cur.loc["name"]
        l_feat.append(feat_cur)
        candis.append(df_candi)
    df_res = pd.concat(candis, axis=1)
    # sort_values returns a new frame; the original discarded the result
    df_res = df_res.sort_values(["sym", "date"])
    df_feat = pd.concat(l_feat, axis=1)
    return df_res, df_feat

def get_metas(dfTa, depth):
    #pool = multiprocessing.Pool(processes=20)
    feat_names = base.get_feat_names(dfTa)
    idx = 0
    results = []
    for cur_feat in feat_names:
        idx += 1
        if istest:
            if idx > 10:
                break
        #results.append(pool.apply_async(feat_meta, (cur_feat, dfTa, "label5")))
        results.append(feat_meta(cur_feat, dfTa, "label5", depth))
        print "%d done!" % idx
    return results

def _select(self, df, start, end, score):
    df = df[(df.date >= start) & (df.date < end)]
    #df = df[(df.date >= "2013-01-01") & (df.date < "2014-01-01")]
    feat_names = base.get_feat_names(df)
    label = df.loc[:, score]
    res = pd.DataFrame(data=None, index=feat_names)
    df = df[feat_names]
    n11 = df[label > 0.5].sum()
    n10 = df[label < 0.5].sum()
    res["n11"] = n11
    res["n10"] = n10
    res["pn_ratio"] = res["n11"] / (res["n10"] + res["n11"])
    #res = res[(res["pn_ratio"] <= self.threshold) &
    #          (res["pn_ratio"] >= 1 - self.threshold)]
    res = res.sort_values("pn_ratio", ascending=False)
    return res.tail(10)

def main(argv):
    clsName = argv[1]
    cls = joblib.load(clsName)
    idx = 0
    out_dir = os.path.join(root, 'data', 'graph', "tmp")
    mkdir_p(out_dir)
    print out_dir
    ta = pd.read_pickle(argv[2])
    names = base.get_feat_names(ta)
    # one .dot file per tree in the ensemble
    for estimator in cls.estimators_:
        dotfile = os.path.join(out_dir, '%d.dot' % idx)
        export_graphviz(estimator, feature_names=names, out_file=dotfile)
        #graph = pydot.graph_from_dot_file(dotfile)
        #graph.write_pdf(os.path.join(out_dir, "%d.pdf" % idx))
        #Image(graph.create_png())
        idx += 1

def one_work(cls, ta, labelName, start, end, top):
    df = get_range(ta, start, end)
    m = cls
    feat_names = base.get_feat_names(df)
    npFeat = df.loc[:, feat_names].values
    # isscaler and the scaler s are module-level objects not shown in this
    # excerpt
    if isscaler:
        npFeat = s.transform(npFeat)
    l = []
    for i, npPred in enumerate(m.staged_predict_proba(npFeat)):
        df.loc[:, "pred"] = npPred[:, 1]
        dacc = accu(df, labelName, top)
        acc = 0.0
        if dacc["pos"] > 0:
            acc = dacc["trueInPos"] * 1.0 / dacc["pos"]
        print i, acc
        l.append([i, acc])
    return pd.DataFrame(np.asarray(l), columns=["idx", "acc"])

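# staged_predict_proba yields one probability matrix per boosting stage, which
# is what lets the loop above trace accuracy as a function of stage index. A
# hedged, self-contained sketch on synthetic data (the model choice is an
# assumption; any sklearn booster exposing staged_predict_proba would do):
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier

def _demo_staged_accuracy():
    rng = np.random.RandomState(0)
    X = rng.randn(200, 5)
    y = (X[:, 0] + 0.1 * rng.randn(200) > 0).astype(int)
    gb = GradientBoostingClassifier(n_estimators=20).fit(X, y)
    accs = []
    for i, proba in enumerate(gb.staged_predict_proba(X)):
        accs.append([i, ((proba[:, 1] > 0.5) == y).mean()])
    return accs  # training accuracy per stage, one row per boosting round
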
def one_work(clsName, taName, labelName, start, end, top):
    df = base.get_merged(base.dir_ta(taName))
    df = get_range(df, start, end)
    m = joblib.load(
        os.path.join(root, 'data', 'models', "model_" + clsName + ".pkl"))
    if isscaler:
        s = joblib.load(
            os.path.join(root, 'data', 'models',
                         "scaler_" + clsName + ".pkl"))
    feat_names = base.get_feat_names(df)
    npFeat = df.loc[:, feat_names].values
    if isscaler:
        npFeat = s.transform(npFeat)
    l = []
    for i, npPred in enumerate(m.staged_predict_proba(npFeat)):
        df.loc[:, "pred"] = npPred[:, 1]
        dacc = accu(df, labelName, top)
        acc = 0.0
        if dacc["pos"] > 0:
            acc = dacc["trueInPos"] * 1.0 / dacc["pos"]
        print i, acc
        l.append([i, acc])
    return pd.DataFrame(np.asarray(l), columns=["idx", "acc"])

def main(df):
    df1 = base1.main(df)
    df1.reset_index(drop=True, inplace=True)
    df2 = stable_3.main(df)
    df2.reset_index(drop=True, inplace=True)
    l = [
        'ta_CMO_14', 'ta_RSI_14', 'ta_CMO_7', 'ta_RSI_7', 'ta_CMO_10',
        'ta_RSI_10', 'ta_TRIX_2', 'ta_RSI_28', 'ta_CMO_28', 'ta_RSI_5',
        'ta_STOCHRSI_slowd_5_20_12', 'ta_ROC_7', 'ta_ROCR100_7', 'ta_ROCP_7',
        'ta_ROCR_7', 'ta_STOCHRSI_slowd_7_20_12', 'ta_RSI_2', 'ta_ROC_5',
        'ta_ROCR100_5', 'ta_ROCR_5', 'ta_ROCP_5', 'ta_WILLR_10',
        'ta_WILLR_14', 'ta_ROC_2', 'ta_ROCR100_2', 'ta_ROCP_2', 'ta_ROCR_2',
        'ta_WILLR_7'
    ]
    df1 = feat_select.append_deep_feats(df1, l)
    # list.extend() returns None, so the original indexing crashed; build the
    # column list with + instead
    df1 = df1[base.get_feat_names(df1) + ["date"]]
    print df2.shape
    df2 = df2.merge(df1, left_on='date', right_on="date", how='inner')
    print df2.shape
    return df2

def one_work(cls, ta_dir, label, date_range, th):
    re = "%s\t%s\t%s\t%s\t%s\t%f\t" % (cls, ta_dir[-4:], label,
                                       date_range[0], date_range[1], th)
    df = ta.get_merged(ta_dir)
    df = get_range(df, date_range[0], date_range[1])
    cls = joblib.load(
        os.path.join(root, 'data', 'models', "model_" + cls + ".pkl"))
    feat_names = base.get_feat_names(df)
    npFeat = df.loc[:, feat_names].values
    #for i, npPred in enumerate(cls.staged_predict_proba(npFeat)):
    #    if i == 322:
    #        break
    npPred = cls.predict_proba(npFeat)
    df["pred"] = npPred[:, 1]
    dacc = accu(df, label, th)
    re += "%f\t%d\t%d\t" % (dacc["rate"], dacc["trueInPos"], dacc["pos"])
    if dacc["pos"] > 0:
        re += "%f" % (dacc["trueInPos"] * 1.0 / dacc["pos"])
    else:
        re += "0.0"
    print re
    return re

def extract_feat_label(df, scorename):
    df = df.replace([np.inf, -np.inf], np.nan).dropna()
    feat_names = base.get_feat_names(df)
    npFeat = df.loc[:, feat_names].values.copy()
    npLabel = df.loc[:, scorename].values.copy()
    return npFeat, npLabel

#!/usr/bin/env python2.7
confer = MyConfStableLTa()
#build.work(confer)
#model.work(confer)
df = pd.read_pickle(os.path.join(root, 'output', "result_20170205.pkl"))
print(ana.roc_auc(df, confer))
clazz_file_name = confer.get_classifier_file()
with open(clazz_file_name, 'rb') as fin:
    clazz = pickle.load(fin)
feat_names = base.get_feat_names(df)
ipts = sorted(clazz.get_feature_importances(feat_names).items(),
              key=lambda a: a[1], reverse=True)
for each in ipts:
    print(each)
dfo = df.sort_values("pred", ascending=False)
df = dfo[feat_names]
# mean feature activation over the whole set ...
df_sum = df.sum(axis=0).to_frame(name='sum')
df_sum = df_sum / len(df)
print(df_sum.sort_values("sum", ascending=False).head())
# ... versus over the 100 highest-scored rows whose score1 is 0
df_top = dfo[dfo[confer.score1.get_name()] == 0].head(100)[feat_names]
df_top_sum = df_top.sum(axis=0).to_frame(name='sum')
df_top_sum = (df_top_sum / len(df_top)).sort_values("sum", ascending=False)
print(df_top_sum.head())

def _select(self, df, start, end, score):
    df = df[(df.date >= start) & (df.date < end)]
    feat_names = base.get_feat_names(df)
    label = df.loc[:, score]