Example #1
def work(confer):
    ta1 = confer.get_ta_file()
    last_trade_date = confer.last_trade_date
    confer.last_trade_date = base.get_second_trade_date_local(
        confer.syms.get_name())
    print(confer.last_trade_date)
    ta2 = confer.get_ta_file()
    confer.last_trade_date = last_trade_date

    print(ta1, ta2)

    df1 = pd.read_pickle(ta1)
    df1 = df1[
        df1.date <= base.get_second_trade_date_local(confer.syms.get_name())]
    df2 = pd.read_pickle(ta2)

    syms1 = df1.sym.unique()
    syms2 = df2.sym.unique()

    print(syms1, syms2)
    assert len(syms1) == len(syms2)
    df1.reset_index(drop=True, inplace=True)
    df2.reset_index(drop=True, inplace=True)
    assert len(df1) == len(df2)
    assert_frame_equal(df1[base.get_feat_names(df1)],
                       df2[base.get_feat_names(df2)])
Example #2
File: pred.py Project: fswzb/pytrade
def main(argv):
    clsName = argv[0]
    stage = int(argv[1])
    taName = argv[2]
    start = argv[3]
    end = argv[4]
    label = argv[5]

    if end == "<0":
        end = "2099-12-31"

    cls = get_cls(clsName)
    ta = get_ta(taName)
    ta = ta[(ta.date >= start) & (ta.date <= end)]

    dfFeat = ta.loc[:, base.get_feat_names(ta)]
    print dfFeat.tail(1)
    npFeat = dfFeat.values
    #npPred = cls.predict_proba(npFeat)
    for i, npPred in enumerate(cls.staged_predict_proba(npFeat)):
        if i == stage:
            break
    ta["pred"] = npPred[:, 1]
    ta.sort("pred", inplace=True, ascending=False)
    print ta[["date", "sym", "pred"]].head(10)
    ta.to_csv(
        os.path.join(base.dir_preds(),
                     base.fname_pred(clsName, taName, start, end)))
    ta[["date", "sym", "pred", label]].to_csv(
        os.path.join(base.dir_preds(),
                     base.fname_pred_s(clsName, taName, start, end)))
Example #3
def post_valid(classifier, df_train, df_test, score, is_fit):
    df_train = df_train.sort_values(["sym", "date"])
    # from sklearn.exceptions import NotFittedError
    npTrainFeat, npTrainLabel = extract_feat_label(df_train, score.get_name())
    npTestFeat, npTestLabel = extract_feat_label(df_test, score.get_name(), drop=False)
    feat_names = base.get_feat_names(df_test)
    if not is_fit:
        probas_ = classifier.predict_proba(npTestFeat)
    else:
        classifier.fit(npTrainFeat, npTrainLabel)
        probas_ = classifier.predict_proba(npTestFeat)
    d_feat_ipts = classifier.get_feature_importances(feat_names)
    ipts = []
    if len(d_feat_ipts) > 0:
        for each in sorted(d_feat_ipts.items(), key=lambda a: a[1], reverse=True):
            ipts.append({"name":each[0], "score": each[1]})

    fpr, tpr, thresholds = roc_curve(npTestLabel, probas_[:, 1])
    roc_auc = auc(fpr, tpr)

    min = str(df_test.head(1)["yyyy"].values[0])
    max = str(df_test.tail(1)["yyyy"].values[0])
    df_test.loc[:, "pred"] = probas_[:, 1]
    df_test.loc[:, "pred2"] = probas_[:, 0]
    #pdt.assert_numpy_array_equal(df_test.round(2).loc[:, "pred"].values[0:10], 1 - df_test.round(2).loc[:, "pred2"].values[0:10])
    post = {"classifier": classifier,
            'ipts':ipts,
            "fpr":fpr, "tpr":tpr, 
            "thresholds": thresholds,
            "roc_auc":roc_auc,
            "name":"%s-%s" % (min, max),
            "min":min,
            "max":max,
            "df_test":df_test}
    return post
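
For reference, the roc_curve/auc pair used above, run on a classic four-point toy example:

from sklearn.metrics import auc, roc_curve

y_true = [0, 0, 1, 1]
scores = [0.1, 0.4, 0.35, 0.8]
fpr, tpr, thresholds = roc_curve(y_true, scores)
print(auc(fpr, tpr))  # 0.75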
Example #4
 def pred(self, start=None):
     df_all = pd.read_pickle(self.confer.get_sel_file())
     if start != None:
         df_all = df_all[df_all.date >= start]
     score = self.confer.scores[0]
     df_all_1 = df_all[df_all[score.get_name()] < 0.5]
     df_all_2 = df_all[df_all[score.get_name()] > 0.5]
     assert len(df_all_1) + len(df_all_2) == len(df_all)
     df_all_2 = df_all_2.sample(n=len(df_all_1))
     assert (len(df_all_2) == len(df_all_1))
     df_all = pd.concat([df_all_1, df_all_2], axis=0)
     assert (len(df_all) == 2 * len(df_all_1))
     df_all = df_all.sample(frac=1.0, random_state=1253)
     feat_names = base.get_feat_names(df_all)
     np_feat = df_all.loc[:, feat_names].values
     print("pred start : %s pred end: %s total:%d" %
           (df_all.sort_values('date').head(1)['date'].values[0],
            df_all.sort_values('date').tail(1)['date'].values[0],
            len(df_all)))
     np_pred = self.confer.classifier.predict_proba(np_feat)
     #df_all = df_all.iloc[2-1:]
     if np_pred.shape[1] == 2:
         df_all["pred"] = np_pred[:, 1]
     else:
         df_all["pred"] = np_pred[:, 0]
     df_all = df_all.sample(frac=1.0)
     return df_all.sort_values("pred", ascending=False)
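
The method balances the two label groups before scoring. A toy sketch of just that downsampling step, on a synthetic frame (the 0.5 cut and the 1253 seed mirror the code above):

import numpy as np
import pandas as pd

df = pd.DataFrame({"score": np.r_[np.zeros(900), np.ones(100)]})
neg = df[df["score"] < 0.5]
pos = df[df["score"] > 0.5]
neg = neg.sample(n=len(pos))  # downsample the majority class
df = pd.concat([neg, pos]).sample(frac=1.0, random_state=1253)  # shuffle
print(df["score"].value_counts())  # 100 rows of each class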
Example #5
def _get_metas(dfTa, depth, min_, label, n_pool):
    feat_names = base.get_feat_names(dfTa)
    idx = 0
    results = []
    import concurrent.futures
    if n_pool == 1:
        for cur_feat in feat_names:
            feat_meta = _feat_meta(cur_feat, dfTa, len(dfTa[dfTa[label] == 1]),
                                      len(dfTa[dfTa[label] == 0]), len(dfTa), label, depth, min_)
            if None != feat_meta:
                results.append(feat_meta)
    else:
        Executor = concurrent.futures.ProcessPoolExecutor
        plen = len(dfTa[dfTa[label] == 1])
        nlen = len(dfTa[dfTa[label] == 0])
        alen = len(dfTa)
        with Executor(max_workers=n_pool) as executor:
            futures = {executor.submit(_feat_meta, cur_feat, dfTa[[cur_feat, label]].copy(), plen, nlen, alen, label, depth, min_): cur_feat for cur_feat in feat_names}
            for future in concurrent.futures.as_completed(futures):
                try:
                    cur_feat = futures[future]
                    results.append(future.result())
                except Exception as exc:
                    import traceback
                    traceback.print_exc()
                    sys.exit(1)

    return results
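
The futures dict above maps each future back to the feature it was submitted with. A generic, runnable sketch of the same submit/as_completed pattern with a trivial worker:

import concurrent.futures

def square(x):
    return x * x

if __name__ == "__main__":
    with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
        # the dict lets the consumer recover which input produced each future
        futures = {executor.submit(square, i): i for i in range(8)}
        for future in concurrent.futures.as_completed(futures):
            print((futures[future], future.result()))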
Example #6
def one_work(cls, taName, label, date_range, th):
    df = base.get_merged(base.dir_ta(taName))
    df = get_range(df, date_range[0], date_range[1])
    m = joblib.load(os.path.join(root, 'data', 'models',"model_" + cls + ".pkl"))
    s = joblib.load(os.path.join(root, 'data', 'models',"scaler_" + cls + ".pkl"))
    feat_names = base.get_feat_names(df)
    npFeat = df.loc[:,feat_names].values
    #npPred = cls.predict_proba(npFeat)[:,1]
    #print npPred
    res = ""
    for i, npPred in enumerate(m.staged_predict_proba(s.transform(npFeat))):
        #if i % 1 != 0:
        #    continue
        re =  "%s\t%s\t%s\t%s\t%s\t%f\t" % (cls, taName, label, date_range[0], date_range[1],th)
        df["pred"] = npPred[:,1]
        dacc =  accu(df, label, th)
        re += "%d\t%d\t%d\t" % (i, dacc["trueInPos"], dacc["pos"])
        if dacc["pos"] > 0:
            re += "%f" % (dacc["trueInPos"]*1.0 / dacc["pos"])
        else :
            re += "0.0"
        re += "\n"
        print re
        res += re
    return res
Example #7
    def _select(self, df, start, end, score):
        """
        http://nlp.stanford.edu/IR-book/html/htmledition/mutual-information-1.html#mifeatsel
        """
        df = df[(df.date >= start) & (df.date < end)]
        feat_names = base.get_feat_names(df)
        label = df.loc[:, score]
        res = pd.DataFrame(data=None, index=feat_names)
        df = df[feat_names]
        n11 = df[label > 0.5].sum()
        n10 = df[label < 0.5].sum()
        n01 = (1 - df[label > 0.5]).sum()
        n00 = (1 - df[label < 0.5]).sum()

        n = df.count()
        n1_ = n11 + n10
        n0_ = n01 + n00
        n_1 = n01 + n11
        n_0 = n00 + n10

        assert 0 == (n11 + n01 - df[label > 0.5].count()).sum()
        assert 0 == (n11 + n01 + n10 + n00 - df.count()).sum()

        mi = n11/n*np.log2(n*n11/(n1_*n_1)) + n01/n*np.log2(n*n01/(n0_*n_1)) \
                + n10/n*np.log2(n*n10/(n1_*n_0)) + n00/n*np.log2(n*n00/(n0_*n_0))

        res["mi"] = mi
        res["pn_ratio"] = n11 / (n11 + n10)
        res = res.sort_values("mi", ascending=False)

        return res
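
The mi expression implements the IR-book estimate on a 2x2 contingency table: I(U;C) = sum over e_t, e_c in {1, 0} of N_ec/N * log2(N*N_ec / (N_e. * N_.c)), where the first subscript marks feature presence and the second the class. A quick numeric check, using the counts from the book's worked example (term "export", class "poultry"):

import numpy as np

n11, n10, n01, n00 = 49.0, 27652.0, 141.0, 774106.0
n = n11 + n10 + n01 + n00
n1_, n0_ = n11 + n10, n01 + n00
n_1, n_0 = n11 + n01, n10 + n00
mi = (n11 / n * np.log2(n * n11 / (n1_ * n_1))
      + n01 / n * np.log2(n * n01 / (n0_ * n_1))
      + n10 / n * np.log2(n * n10 / (n1_ * n_0))
      + n00 / n * np.log2(n * n00 / (n0_ * n_0)))
print(mi)  # ~0.00011, as in the IR book's worked example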
Example #8
def one_work(name, dir_ta, model, label, date_range):
    if os.path.isfile(
            os.path.join(root, 'data', 'models', "model_%s.pkl" % name)):
        print "%s already exists!" % name
        return
    dfTa = ta.get_merged(dir_ta)
    dfTrain = build_trains(dfTa, date_range[0], date_range[1])
    feat_names = base.get_feat_names(dfTrain)
    npTrainFeat = dfTrain.loc[:, feat_names].values
    npTrainLabel = dfTrain.loc[:, label].values.copy()
    # binarize the label: values above 1.0 become class 1, values below become class 0
    npTrainLabel[npTrainLabel > 1.0] = 1
    npTrainLabel[npTrainLabel < 1.0] = 0
    model.fit(npTrainFeat, npTrainLabel)
    joblib.dump(model,
                os.path.join(root, "data", "models", "model_%s.pkl" % name),
                compress=3)
    dFeatImps = dict(zip(feat_names, model.feature_importances_))
    with open(
            os.path.join(root, 'data', 'models', 'model_%s_importance' % name),
            'w') as fipt:
        for each in sorted(dFeatImps.iteritems(),
                           key=lambda a: a[1],
                           reverse=True):
            print >> fipt, each[0], ",", each[1]
Example #9
    def _select(self, df, start, end, score):
        """
        http://nlp.stanford.edu/IR-book/html/htmledition/mutual-information-1.html#mifeatsel
        """
        df = df[(df.date >= start) & (df.date < end)]
        feat_names = base.get_feat_names(df)
        label = df.loc[:,score]
        res = pd.DataFrame(data = None, index=feat_names)
        df = df[feat_names]
        n11 = df[label > 0.5].sum()
        n10 = df[label < 0.5].sum()
        n01 = (1-df[label>0.5]).sum()
        n00 = (1-df[label<0.5]).sum()

        n = df.count()
        n1_ = n11+n10
        n0_ = n01+n00
        n_1 = n01+n11
        n_0 = n00+n10
    
        assert 0 == (n11 + n01 - df[label>0.5].count()).sum() 
        assert 0 == (n11 + n01 + n10 + n00 - df.count()).sum() 

        mi = n11/n*np.log2(n*n11/(n1_*n_1)) + n01/n*np.log2(n*n01/(n0_*n_1)) \
                + n10/n*np.log2(n*n10/(n1_*n_0)) + n00/n*np.log2(n*n00/(n0_*n_0))
    
        res["mi"] = mi
        res["pn_ratio"] = n11/(n11+n10)
        res = res.sort_values("mi", ascending=False)

        return res
Example #10
def main(args):
    exec "import main.pandas_talib.sig_%s as conf" % args.signame
    build.work2(20, 'sp500Top50', args.signame)
    df = base.get_merged(conf.__name__, yeod.get_sp500Top50())
    df.to_csv("ta.csv")

    tree = DecisionTreeClassifier()

    feat_names = base.get_feat_names(df)

    dfTrain = df[(df.date >= '1970-01-01') & (df.date <= '2009-12-31')]
    npTrainFeat = dfTrain.loc[:, feat_names].values.copy()
    npTrainLabel = dfTrain.loc[:, "label5"].values.copy()
    npTrainLabel[npTrainLabel > 1.0] = 1
    npTrainLabel[npTrainLabel < 1.0] = 0

    tree.fit(npTrainFeat, npTrainLabel)
    joblib.dump(tree, "tree.pkl", compress=3)

    dfTest = df[(df.date >= '2010-01-01') & (df.date <= '2099-12-31')]
    npTestFeat = dfTest.loc[:, feat_names].values.copy()

    npPred = tree.predict_proba(npTestFeat)

    dfTest.loc[:, "pred"] = npPred[:, 1]

    print dfTest['pred'].head()

    dfPos = dfTest[dfTest['pred'] > 0.55]
    print 1.0 * len(dfPos[dfPos['label5'] > 1]) / len(dfPos)
    print 1.0 * len(dfTest[dfTest['label5'] > 1]) / len(dfTest)
Example #11
def main(args):
    lsym = getattr(yeod, "get_%s" % args.setname)()
    if args.start is None:
        args.start = base.last_trade_date()
        args.end = args.start
    cls = joblib.load(os.path.join(base.dir_model(), args.model))

    ta = base.get_merged_with_na(args.taname, lsym)

    ta = ta[(ta['date'] >= args.start) & (ta['date'] <= args.end)]
    dfFeat = ta.loc[:, base.get_feat_names(ta)]
    dfFeat = dfFeat.replace([np.inf, -np.inf], np.nan)\
        .dropna()
    ta = ta.loc[dfFeat.index]  # keep ta aligned with the rows that survive dropna
    npFeat = dfFeat.values
    npPred = cls.predict_proba(npFeat)
    #for i, npPred in enumerate(cls.staged_predict_proba(npFeat)):
    #    if i == args.stage:
    #        break
    ta["pred"] = npPred[:, 1]
    ta.sort("pred", inplace=True, ascending=False)
    freport, fcsv = base.file_pred(args)
    ta.to_csv(fcsv)
    #ta[["date", "sym", "pred", label]].to_csv(os.path.join(out_dir, 'pred.s.csv'))
    with open(freport, 'w') as fout:
        print >> fout, ta[["date", "sym", "pred"]].head(10)
Example #12
def extract_feat_label(df, scorename):
    df = df.replace([np.inf, -np.inf], np.nan).dropna()
    feat_names = base.get_feat_names(df)
    npFeat = df.loc[:, feat_names].values.copy()
    npLabel = df.loc[:, scorename].values.copy()
    npPred = df.loc[:, "pred"].values.copy()
    return npFeat, npLabel, npPred
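
Mapping both infinities to NaN first means a single dropna call removes every non-finite row, which most sklearn estimators would otherwise reject. A two-row illustration:

import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1.0, np.inf, 3.0], "b": [4.0, 5.0, -np.inf]})
print(df.replace([np.inf, -np.inf], np.nan).dropna())  # only the first row survives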
Example #13
def mutual_information(df, confer):
    """
    http://nlp.stanford.edu/IR-book/html/htmledition/mutual-information-1.html#mifeatsel
    """
    feat_names = base.get_feat_names(df)
    label = df.loc[:,confer.score1.get_name()]
    res = pd.DataFrame(data = None, index=feat_names)
    df = df[feat_names]
    n11 = df[label > 0.5].sum()
    n10 = df[label < 0.5].sum()
    n01 = (1-df[label>0.5]).sum()
    n00 = (1-df[label<0.5]).sum()
    n = df.count()
    n1_ = n11+n10
    n0_ = n01+n00
    n_1 = n01+n11
    n_0 = n00+n10

    assert 0 == (n11 + n01 - df[label>0.5].count()).sum() 
    assert 0 == (n11 + n01 + n10 + n00 - df.count()).sum() 
    

    mi = n11/n*np.log2(n*n11/(n1_*n_1)) + n01/n*np.log2(n*n01/(n0_*n_1)) \
            + n10/n*np.log2(n*n10/(n1_*n_0)) + n00/n*np.log2(n*n00/(n0_*n_0))

    res["n11"] = n11
    res["n10"] = n10
    res["n01"] = n01
    res["n00"] = n00
    res["mi"] = mi
    return res
Example #14
def main(args):
    exec "import main.pandas_talib.sig_%s as conf" % args.signame
    build.work2(20, 'sp500Top50', args.signame)
    df = base.get_merged(conf.__name__, yeod.get_sp500Top50())
    df.to_csv("ta.csv")

    tree = DecisionTreeClassifier() 
    
    feat_names = base.get_feat_names(df)

    dfTrain = df[(df.date>='1970-01-01') & (df.date <='2009-12-31')]
    npTrainFeat = dfTrain.loc[:,feat_names].values.copy()
    npTrainLabel = dfTrain.loc[:,"label5"].values.copy()
    npTrainLabel[npTrainLabel >  1.0] = 1
    npTrainLabel[npTrainLabel <  1.0] = 0

    tree.fit(npTrainFeat, npTrainLabel)
    joblib.dump(tree, "tree.pkl", compress = 3)
    
    dfTest = df[(df.date>='2010-01-01') & (df.date <='2099-12-31')]
    npTestFeat = dfTest.loc[:, feat_names].values.copy()
    
    npPred = tree.predict_proba(npTestFeat)

    dfTest.loc[:,"pred"] = npPred[:,1]
    
    print dfTest['pred'].head()

    dfPos = dfTest[ dfTest['pred'] > 0.55 ]
    print 1.0 * len(dfPos[dfPos['label5']>1])  / len(dfPos)
    print 1.0 * len(dfTest[dfTest['label5']>1])  / len(dfTest)
Example #15
def _get_metas(dfTa, depth, min_, label, n_pool):
    feat_names = base.get_feat_names(dfTa)
    idx = 0
    results = []
    import concurrent.futures
    Executor = concurrent.futures.ProcessPoolExecutor
    plen = len(dfTa[dfTa[label] > 0.5])
    nlen = len(dfTa[dfTa[label] < 0.5])
    alen = len(dfTa)
    with Executor(max_workers=n_pool) as executor:
        futures = {
            executor.submit(_feat_meta, cur_feat, dfTa[[cur_feat,
                                                        label]].copy(), plen,
                            nlen, alen, label, depth, min_): cur_feat
            for cur_feat in feat_names
        }
        for future in concurrent.futures.as_completed(futures):
            try:
                cur_feat = futures[future]
                results.append(future.result())
            except Exception as exc:
                import traceback
                traceback.print_exc()
                sys.exit(1)

    return results
Example #16
File: paper.py Project: fswzb/pytrade
def main(args):
    lsym = getattr(yeod, "get_%s" % args.setname)()
    dfTa = base.get_merged(args.taname, lsym)
    if dfTa is None:
        print "can not merge " % args.setname
        sys.exit(1)
    dfTa = base.get_range(dfTa, args.start, args.end)
    print dfTa.shape
    #if args.filter:
    #    dfTa = filter_trend(dfTa)
    #print dfTa.shape

    cls = joblib.load(os.path.join(base.dir_model(), args.model))
    feat_names = base.get_feat_names(dfTa)
    npFeat = dfTa.loc[:, feat_names].values
    if isscaler:
        scaler = get_scaler(clsName)
        npFeatScaler = scaler.transform(npFeat)
    else:
        npFeatScaler = npFeat
    #for i, npPred in enumerate(cls.staged_predict_proba(npFeatScaler)):
    #    if i == args.stage:
    #        break
    npPred = cls.predict_proba(npFeatScaler)
    dfTa["pred"] = npPred[:, 1]
    dfTa = dfTa.sort_values(['pred'], ascending=False)
    freport, fpred = base.file_paper(args)
    dfTa.to_csv(fpred)

    ana.main([fpred, args.top, args.thresh, freport, args.level])
    print freport
Example #17
def one_work(cls, taName, label, date_range, th):
    df = base.get_merged(base.dir_ta(taName))
    df = get_range(df, date_range[0], date_range[1])
    m = joblib.load(
        os.path.join(root, 'data', 'models', "model_" + cls + ".pkl"))
    s = joblib.load(
        os.path.join(root, 'data', 'models', "scaler_" + cls + ".pkl"))
    feat_names = base.get_feat_names(df)
    npFeat = df.loc[:, feat_names].values
    #npPred = cls.predict_proba(npFeat)[:,1]
    #print npPred
    res = ""
    for i, npPred in enumerate(m.staged_predict_proba(s.transform(npFeat))):
        #if i % 1 != 0:
        #    continue
        re = "%s\t%s\t%s\t%s\t%s\t%f\t" % (cls, taName, label, date_range[0],
                                           date_range[1], th)
        df["pred"] = npPred[:, 1]
        dacc = accu(df, label, th)
        re += "%d\t%d\t%d\t" % (i, dacc["trueInPos"], dacc["pos"])
        if dacc["pos"] > 0:
            re += "%f" % (dacc["trueInPos"] * 1.0 / dacc["pos"])
        else:
            re += "0.0"
        re += "\n"
        print re
        res += re
    return res
Example #18
 def pred(self, start = None):
     df_all = pd.read_pickle(self.confer.get_sel_file())
     if start != None:
         df_all = df_all[df_all.date >= start]
     score = self.confer.scores[0]
     df_all_1 = df_all[df_all[score.get_name()] < 0.5]
     df_all_2 = df_all[df_all[score.get_name()] > 0.5]
     assert len(df_all_1) + len(df_all_2) == len(df_all)
     df_all_2 = df_all_2.sample(n = len(df_all_1))
     assert(len(df_all_2) == len(df_all_1))
     df_all = pd.concat([df_all_1, df_all_2], axis=0)
     assert(len(df_all) == 2*len(df_all_1))
     df_all = df_all.sample(frac=1.0, random_state = 1253)
     feat_names = base.get_feat_names(df_all)
     np_feat = df_all.loc[:, feat_names].values
     print("pred start : %s pred end: %s total:%d" % (df_all.sort_values('date').head(1)['date'].values[0],
                                                        df_all.sort_values('date').tail(1)['date'].values[0], len(df_all)))
     np_pred = self.confer.classifier.predict_proba(np_feat)
     #df_all = df_all.iloc[2-1:]
     if np_pred.shape[1] == 2:
         df_all["pred"] = np_pred[:, 1]
     else:
         df_all["pred"] = np_pred[:, 0]
     df_all = df_all.sample(frac=1.0)
     return df_all.sort_values("pred", ascending=False)
Example #19
def work(classifier, df_pred):
    feat_names = base.get_feat_names(df_pred)
    np_feat = df_pred.loc[:, feat_names].values
    np_pred = classifier.predict_proba(np_feat)

    df_pred["pred"] = np_pred[:,1]
    df_pred.sort_values(['pred'], ascending=False, inplace=True)
    return df_pred[["date", "sym", "open", "high", "low", "close", "pred"]].head(20)
Example #20
 def verify_predict(self, df):
     feat_names = base.get_feat_names(df)
     ipts = self.get_feature_importances(feat_names)
     s = 0
     for each in ipts:
         if int(df[each]) == 1 :
             s += ipts[each] * 1
     import math
     return 1 / (1 + math.exp(-1 * (s + self.classifier.intercept_)))
Example #21
 def verify_predict(self, df):
     feat_names = base.get_feat_names(df)
     ipts = self.get_feature_importances(feat_names)
     s = 0
     for each in ipts:
         if int(df[each]) == 1:
             s += ipts[each] * 1
     import math
     return 1 / (1 + math.exp(-1 * (s + self.classifier.intercept_)))
Example #22
def main(args):
    cls = params_set.d_model[args.clsname]
    file_model, file_ipt = base.file_model(args)

    if os.path.isfile(file_model):
        print "%s already exists!" % file_model
        return
    dfTa = base.get_merged(args.taname, getattr(yeod, "get_%s" % args.setname)(),args.start,args.end)
    if dfTa is None:
        return None
    dfTrain = dfTa # build_trains(dfTa, args.start, args.end)

    if args.sample:
        print "sampling ..."
        sample = len(dfTrain) / args.sample  # assuming args.sample holds the downsampling divisor
        rows = random.sample(range(len(dfTrain)), sample)
        print len(rows)
        dfTrain = dfTrain.reset_index(drop=True)
        dfTrain = dfTrain.ix[rows]

    if args.repeat:
        print "repeat ..."
        toAppends = []
        for i in range(1,3):
            dfTmp = dfTrain[dfTrain.label5>=1+i/20.0]
            toAppends.append(dfTmp)
        print dfTrain.shape
        dfTrain = dfTrain.append(toAppends)
        print dfTrain.shape

    if args.sw:
        dfTrain = getattr(sw, "sw_%s" % args.sw)(dfTrain)

    feat_names = base.get_feat_names(dfTrain)
    npTrainFeat = dfTrain.loc[:,feat_names].values

    npTrainLabel = dfTrain.loc[:,args.labelname].values.copy()
    # binarize the label: values above 1.0 become class 1, values below become class 0
    npTrainLabel[npTrainLabel > 1.0] = 1
    npTrainLabel[npTrainLabel < 1.0] = 0

    if args.scaler:
        scaler = getMinMax(npTrainFeat)
        npTrainFeatScaled = scaler.transform(npTrainFeat)
    else:
        npTrainFeatScaled = npTrainFeat
    if args.sw:
        cls.fit(npTrainFeatScaled, npTrainLabel, sample_weight=dfTrain["sample_weight"].values)
    else:
        cls.fit(npTrainFeatScaled, npTrainLabel)
    joblib.dump(cls, file_model, compress = 3)
    #joblib.dump(scaler, os.path.join(root, 'data', 'models',scalerName), compress = 3)
    dFeatImps = dict(zip( feat_names, cls.feature_importances_))
    with open(file_ipt, 'w') as fipt:
        for each in sorted(dFeatImps.iteritems(), key = lambda a: a[1], reverse=True):
            print >> fipt, each[0], ",", each[1]
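
The "repeat" branch above oversamples strong positives by appending them again at rising thresholds. A toy version of the same idea (pd.concat stands in for the older DataFrame.append):

import pandas as pd

dfTrain = pd.DataFrame({"label5": [0.90, 1.00, 1.06, 1.20]})
toAppends = [dfTrain[dfTrain.label5 >= 1 + i / 20.0] for i in range(1, 3)]
dfTrain = pd.concat([dfTrain] + toAppends, ignore_index=True)
print(dfTrain.shape)  # rows above 1.05 and 1.10 now appear more than once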
Example #23
def get_ipts(confer):
    df = pd.read_pickle(confer.get_sel_file())
    d_feat_ipts = confer.classifier.get_feature_importances(base.get_feat_names(df))
    ipts = []
    if len(d_feat_ipts) > 0:
        for each in sorted(d_feat_ipts.items(), key = lambda a: a[1], reverse=True):
            ipts.append({"name":each[0], "score":each[1]})
    df_ipts = pd.DataFrame(data=ipts)
    df_ipts = df_ipts.set_index("name")
    return df_ipts
Example #24
def _get_metas2(dfTa, depth, min_, label):
    feat_names = base.get_feat_names(dfTa)
    idx = 0
    results = []
    for cur_feat in feat_names:
        idx += 1
        ## plen, nlen and the total length are precomputed once to speed things up
        results.append(_feat_meta2(cur_feat, dfTa, len(dfTa[dfTa[label] == 1]),
                                  len(dfTa[dfTa[label] == 0]), len(dfTa), label, depth, min_))
        print "%d done!" % idx
    return [result for result in results]
Example #25
def get_metas(dfTa):
    feat_names = base.get_feat_names(dfTa)
    list_feat_meta = []
    idx = 0
    for cur_feat in feat_names:
        idx += 1
        if istest:
            if idx > 10:
                break
        list_feat_meta.append(feat_meta(cur_feat, dfTa, "label5"))
    return list_feat_meta
Example #26
 def top_10_decision_path(crosses, crossname):
     print('|symset|glo_l1|sel_l1|glo_l2|sel_l2|select_len|min|max|',
           file=out_file)
     print('|------|------|------|------|------|----------|---|---|',
           file=out_file)
     for each in crosses:
         cross = each[crossname]
         df_test = pd.concat([c.df_test for c in cross])
         df_select, df_year, df_month = ana.select2(confer.score1,
                                                    confer.score2, df_test,
                                                    1, 10)
         print('|%s|%.2f|%.2f|%.2f|%.2f|%d|%.2f|%.2f|' %
               (each["symsetname"], ana.accurate(df_test, confer.score1),
                ana.accurate(df_select, confer.score1),
                ana.accurate(df_test, confer.score2),
                ana.accurate(df_select, confer.score2), len(df_select),
                df_select.tail(1)["pred"] if len(df_select) > 0 else 0,
                df_select.head(1)["pred"] if len(df_select) > 0 else 0),
               file=out_file)
         np_feat = df_select[base.get_feat_names(df_select)].values
         classifier = cross[0].classifier
         for i in range(len(np_feat)):
             x = np_feat[i, :]
             print(x)
             dot_file = os.path.join(
                 root, "data", "cross",
                 'top_10_decision_path-%s-%d' % (each["symsetname"], i))
             decision_path.export_decision_path2(
                 classifier,
                 x,
                 dot_file + ".dot",
                 feature_names=base.get_feat_names(df_select))
             import pydot
             (graph, ) = pydot.graph_from_dot_file(dot_file + ".dot")
             graph.write_png(dot_file + ".png")
     for each in crosses:
         for i in range(10):
             dot_file = os.path.join(
                 root, "data", "cross",
                 'top_10_decision_path-%s-%d' % (each["symsetname"], i))
             print("![](%s.png)" % (dot_file), file=out_file)
Example #27
def main(df):
    df = base1.main(df)
    df.reset_index(inplace=True,drop=True)
    orig_feats = base.get_feat_names(df)
    df = merge(df, 1)
    df = merge(df, 2)
    df = merge(df, 3)
    for each in orig_feats:
        if not each.startswith("ta_ADX"):
            del df[each]
    print list(df.columns)
    return df
Example #28
    def verify_predict(self, df):
        feat_names = base.get_feat_names(df)
        ipts = self.get_feature_importances(feat_names)

        s = None
        for each in ipts:
            tmp = df[each] * ipts[each]
            if s is None:
                s = tmp
            else:
                s += tmp
        return 1 / (1 + np.exp(-1 * (s + self.classifier.intercept_)))
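
verify_predict rebuilds the model's probability by hand: a weighted sum of the features plus the intercept, squashed through the logistic function. A self-contained check (synthetic data and a plain sklearn LogisticRegression rather than pytrade's wrapper) that the reconstruction matches predict_proba:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=500, random_state=0)
clf = LogisticRegression().fit(X, y)
manual = 1.0 / (1.0 + np.exp(-(X.dot(clf.coef_.ravel()) + clf.intercept_)))
assert np.allclose(manual, clf.predict_proba(X)[:, 1])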
Example #29
def pn_ratio(df, confer):
    feat_names = base.get_feat_names(df)
    label = df.loc[:,confer.score1.get_name()]
    res = pd.DataFrame(data = None, index=feat_names)
    df = df[feat_names]
    n11 = df[label > 0.5].sum()
    n10 = df[label < 0.5].sum()

    res["n11"] = n11
    res["n10"] = n10
    res["pn_ratio"] = res["n11"]/(res["n10"]+res["n11"])
    return res
Example #30
def main(argv):
    clsName = argv[1]
    cls = joblib.load(clsName)
    idx = 0
    out_dir = os.path.join(root, 'data', 'graph',  "tmp2")
    mkdir_p(out_dir)
    print out_dir
    dot_data = StringIO()
    ta = pd.read_csv(argv[2])
    names = base.get_feat_names(ta)
    dotfile = os.path.join(out_dir, '%d.dot' % idx)
    export_graphviz(cls, feature_names = names, out_file=os.path.join(out_dir, '%d.dot' % idx))
Example #31
def main(argv):
    clsName = argv[1]
    cls = joblib.load(clsName)
    idx = 0
    out_dir = os.path.join(root, "data", "graph", "tmp2")
    mkdir_p(out_dir)
    print out_dir
    dot_data = StringIO()
    ta = pd.read_csv(argv[2])
    names = base.get_feat_names(ta)
    dotfile = os.path.join(out_dir, "%d.dot" % idx)
    export_graphviz(cls, feature_names=names, out_file=os.path.join(out_dir, "%d.dot" % idx))
Example #32
 def verify_predict(self, df):
     feat_names = base.get_feat_names(df)
     ipts = self.get_feature_importances(feat_names)
     
     s = None
     for each in ipts:
         tmp = df[each]*ipts[each]
         if s is None:
             s = tmp
         else:
             s += tmp
     return 1 / (1 + np.exp(-1 * (s + self.classifier.intercept_)))
Example #33
def main(df):
    df = base1.main(df)
    df.reset_index(inplace=True, drop=True)
    orig_feats = base.get_feat_names(df)
    print df.shape
    df = merge(df, 1)
    print df.shape
    df = merge(df, 2)
    print df.shape
    for each in orig_feats:
        del df[each]
    print df.shape
    return df
Example #34
def get_ipts(confer):
    df = pd.read_pickle(confer.get_sel_file())
    d_feat_ipts = confer.classifier.get_feature_importances(
        base.get_feat_names(df))
    ipts = []
    if len(d_feat_ipts) > 0:
        for each in sorted(d_feat_ipts.items(),
                           key=lambda a: a[1],
                           reverse=True):
            ipts.append({"name": each[0], "score": each[1]})
    df_ipts = pd.DataFrame(data=ipts)
    df_ipts = df_ipts.set_index("name")
    return df_ipts
Example #35
def feat_split(dfo, start, end, split_point, label, depth, min_, n_pool):
    df = dfo[(dfo.date >= start) & (dfo.date < end)]
    df_len = len(df)
    split_point = int(df_len * split_point)
    df1 = df.iloc[:split_point]
    df2 = df.iloc[split_point:]
    if True:
        assert df_len == len(df1) + len(df2)
    df_bit1s = bitlize(df1, label, depth, min_, n_pool)
    candis = [dfo[list(set(dfo.columns) - set(base.get_feat_names(dfo)))]]

    l_feat = []
    for i in range(0, 2**depth):
        df_bit1 = df_bit1s[i]
        if len(df2) > 0:
            df_bit2 = apply(df_bit1, df2, label)

            df_bit1.loc["direct"] = \
                    ((df_bit1.loc["p_chvfa"] -1 ) * (df_bit2.loc["p_chvfa"] - 1)) > 0.00008
            print(df_bit1.shape)
            df_bit1 = df_bit1.loc[:, df_bit1.loc["direct"]]
            print(df_bit1.shape)

        feat_names = base.get_feat_names(df_bit1)

        df_candi = (dfo[feat_names] >= df_bit1.loc["start"]) & (
            dfo[feat_names] < df_bit1.loc["end"])
        pd.set_option('display.expand_frame_repr', False)
        df_candi.columns = df_bit1.loc["name"]

        feat_cur = df_bit1.copy()
        feat_cur.columns = feat_cur.loc["name"]
        l_feat.append(feat_cur)
        candis.append(df_candi)
    df_res = pd.concat(candis, axis=1)
    df_res = df_res.sort_values(["sym", "date"])  # sort_values returns a new, sorted frame

    df_feat = pd.concat(l_feat, axis=1)
    return df_res, df_feat
Example #36
def main(df):
    df = base1.main(df)
    df.reset_index(inplace=True,drop=True)
    orig_feats = base.get_feat_names(df)
    print df.shape
    df = merge(df, 1)
    print df.shape
    df = merge(df, 2)
    print df.shape
    for each in orig_feats:
        del df[each]
    print df.shape
    return df
Example #37
def get_metas(dfTa, depth):
    #pool = multiprocessing.Pool(processes=20)
    feat_names = base.get_feat_names(dfTa)
    idx = 0
    results = []
    for cur_feat in feat_names:
        idx += 1
        if istest:
            if idx > 10:
                break
        #results.append(pool.apply_async(feat_meta, (cur_feat, dfTa, "label5")))
        results.append(feat_meta(cur_feat, dfTa, "label5", depth))
        print "%d done!" % idx
    return [result for result in results]
Example #38
 def top_10_decision_path(crosses, crossname):
     print('|symset|glo_l1|sel_l1|glo_l2|sel_l2|select_len|min|max|', file=out_file)
     print('|------|------|------|------|------|----------|---|---|', file=out_file)
     for each in crosses:
         cross = each[crossname]
         df_test = pd.concat([c.df_test for c in cross])
         df_select, df_year, df_month = ana.select2(confer.score1, confer.score2, df_test,
                                                    1, 10)
         print('|%s|%.2f|%.2f|%.2f|%.2f|%d|%.2f|%.2f|'
               % (
                   each["symsetname"],
                   ana.accurate(df_test, confer.score1),
                   ana.accurate(df_select, confer.score1),
                   ana.accurate(df_test, confer.score2),
                   ana.accurate(df_select , confer.score2),
                   len(df_select),
                   df_select.tail(1)["pred"] if len(df_select) > 0 else 0,
                   df_select.head(1)["pred"] if len(df_select) > 0 else 0), file=out_file)
         np_feat = df_select[base.get_feat_names(df_select)].values
         classifier = cross[0].classifier
         for i in range(len(np_feat)):
             x = np_feat[i,:]
             print(x)
             dot_file = os.path.join(root, "data", "cross",
                                     'top_10_decision_path-%s-%d'
                                     % (each["symsetname"], i))
             decision_path.export_decision_path2(classifier, x, dot_file + ".dot" ,
                                                 feature_names=base.get_feat_names(df_select))
             import pydot
             (graph,) = pydot.graph_from_dot_file(dot_file + ".dot")
             graph.write_png(dot_file + ".png")
     for each in crosses:
         for i in range(10):
             dot_file = os.path.join(root, "data", "cross",
                                     'top_10_decision_path-%s-%d'
                                     % (each["symsetname"], i))
             print("![](%s.png)" % (dot_file), file=out_file)
Example #39
 def _select(self, df, start, end, score):
     df = df[(df.date >= start) & (df.date < end)]
     #df = df[(df.date >= "2013-01-01") & (df.date < "2014-01-01")]
     feat_names = base.get_feat_names(df)
     label = df.loc[:, score]
     res = pd.DataFrame(data=None, index=feat_names)
     df = df[feat_names]
     n11 = df[label > 0.5].sum()
     n10 = df[label < 0.5].sum()
     res["n11"] = n11
     res["n10"] = n10
     res["pn_ratio"] = res["n11"] / (res["n10"] + res["n11"])
     #res = res[(res["pn_ratio"]<=self.threshold)&(res["pn_ratio"]>=1-self.threshold)]
     res = res.sort_values("pn_ratio", ascending=False)
     return res.tail(10)
Example #40
def work(confer):
    ta1 = confer.get_ta_file()
    last_trade_date = confer.last_trade_date
    confer.last_trade_date = base.get_second_trade_date_local(confer.syms.get_name())
    print(confer.last_trade_date)
    ta2 = confer.get_ta_file()
    confer.last_trade_date = last_trade_date

    print(ta1,ta2)


    df1 = pd.read_pickle(ta1)
    df1 = df1[df1.date <= base.get_second_trade_date_local(confer.syms.get_name())]
    df2 = pd.read_pickle(ta2)

    syms1 = df1.sym.unique()
    syms2 = df2.sym.unique()

    print(syms1, syms2)
    assert len(syms1) == len(syms2)
    df1.reset_index(drop=True, inplace=True)
    df2.reset_index(drop=True, inplace=True)
    assert len(df1) == len(df2)
    assert_frame_equal(df1[base.get_feat_names(df1)], df2[base.get_feat_names(df2)])
Example #41
 def _select(self,df,start,end, score):
     df = df[(df.date >= start) & (df.date < end)]
     #df = df[(df.date >= "2013-01-01") & (df.date < "2014-01-01")]
     feat_names = base.get_feat_names(df)
     label = df.loc[:,score]
     res = pd.DataFrame(data = None, index=feat_names)
     df = df[feat_names]
     n11 = df[label > 0.5].sum()
     n10 = df[label < 0.5].sum()
     res["n11"] = n11
     res["n10"] = n10
     res["pn_ratio"] = res["n11"]/(res["n10"]+res["n11"])
     #res = res[(res["pn_ratio"]<=self.threshold)&(res["pn_ratio"]>=1-self.threshold)]
     res = res.sort_values("pn_ratio", ascending=False)
     return res.tail(10)
Example #42
def main(argv):
    clsName = argv[1]
    cls = joblib.load(clsName)
    idx = 0
    out_dir = os.path.join(root, 'data', 'graph', "tmp")
    mkdir_p(out_dir)
    print out_dir
    dot_data = StringIO()
    ta = pd.read_pickle(argv[2])
    names = base.get_feat_names(ta)
    for estimator in cls.estimators_:
        dotfile = os.path.join(out_dir, '%d.dot' % idx)
        export_graphviz(estimator,
                        feature_names=names,
                        out_file=os.path.join(out_dir, '%d.dot' % idx))
        #graph = pydot.graph_from_dot_file(dotfile)
        #graph.write_pdf(os.path.join(out_dir, "%d.pdf" % idx))
        #Image(graph.create_png())
        idx += 1
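
The commented-out lines hint at rendering the .dot files. With Graphviz installed, pydot can do that, mirroring the pattern used elsewhere in this project (the file names here are illustrative):

import pydot

(graph,) = pydot.graph_from_dot_file("0.dot")
graph.write_pdf("0.pdf")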
Example #43
def post_valid(classifier, df_train, df_test, score, is_fit):
    df_train = df_train.sort_values(["sym", "date"])
    # from sklearn.exceptions import NotFittedError
    npTrainFeat, npTrainLabel = extract_feat_label(df_train, score.get_name())
    npTestFeat, npTestLabel = extract_feat_label(df_test,
                                                 score.get_name(),
                                                 drop=False)
    feat_names = base.get_feat_names(df_test)
    if not is_fit:
        probas_ = classifier.predict_proba(npTestFeat)
    else:
        classifier.fit(npTrainFeat, npTrainLabel)
        probas_ = classifier.predict_proba(npTestFeat)
    d_feat_ipts = classifier.get_feature_importances(feat_names)
    ipts = []
    if len(d_feat_ipts) > 0:
        for each in sorted(d_feat_ipts.items(),
                           key=lambda a: a[1],
                           reverse=True):
            ipts.append({"name": each[0], "score": each[1]})

    fpr, tpr, thresholds = roc_curve(npTestLabel, probas_[:, 1])
    roc_auc = auc(fpr, tpr)

    min = str(df_test.head(1)["yyyy"].values[0])
    max = str(df_test.tail(1)["yyyy"].values[0])
    df_test.loc[:, "pred"] = probas_[:, 1]
    df_test.loc[:, "pred2"] = probas_[:, 0]
    #pdt.assert_numpy_array_equal(df_test.round(2).loc[:, "pred"].values[0:10], 1 - df_test.round(2).loc[:, "pred2"].values[0:10])
    post = {
        "classifier": classifier,
        'ipts': ipts,
        "fpr": fpr,
        "tpr": tpr,
        "thresholds": thresholds,
        "roc_auc": roc_auc,
        "name": "%s-%s" % (min, max),
        "min": min,
        "max": max,
        "df_test": df_test
    }
    return post
Example #44
def one_work(cls, ta, labelName, start, end, top):
    df = ta
    df = get_range(df, start, end)
    m = cls
    feat_names = base.get_feat_names(df)
    npFeat = df.loc[:, feat_names].values
    res = ""
    if isscaler:
        npFeat = s.transform(npFeat)
    topscore = None
    l = []
    for i, npPred in enumerate(m.staged_predict_proba(npFeat)):
        df.loc[:, "pred"] = npPred[:, 1]
        dacc = accu(df, labelName, top)
        acc = 0.0
        if dacc["pos"] > 0:
            acc = (dacc["trueInPos"] * 1.0 / dacc["pos"])
        print i, acc
        l.append([i, acc])
    return pd.DataFrame(np.asarray(l), columns=["idx", "acc"])
Example #45
def one_work(cls, ta, labelName, start, end, top):
    df = ta
    df = get_range(df, start, end)
    m = cls
    feat_names = base.get_feat_names(df)
    npFeat = df.loc[:,feat_names].values
    res = ""
    if isscaler :
        npFeat = s.transform(npFeat)
    topscore = None
    l = []
    for i, npPred in enumerate(m.staged_predict_proba(npFeat)):
        df.loc[:,"pred"] = npPred[:,1]
        dacc =  accu(df, labelName, top)
        acc = 0.0
        if dacc["pos"] > 0:
            acc = (dacc["trueInPos"]*1.0 / dacc["pos"])
        print i, acc
        l.append([i, acc])
    return pd.DataFrame(np.asarray(l), columns=["idx", "acc"])
Example #46
def one_work(clsName, taName, labelName, start, end, top):
    df = base.get_merged(base.dir_ta(taName))
    df = get_range(df, start, end)
    m = joblib.load(os.path.join(root, 'data', 'models',"model_" + clsName + ".pkl"))
    if isscaler:
        s = joblib.load(os.path.join(root, 'data', 'models',"scaler_" + clsName + ".pkl"))
    feat_names = base.get_feat_names(df)
    npFeat = df.loc[:,feat_names].values
    res = ""
    if isscaler :
        npFeat = s.transform(npFeat)
    topscore = None
    l = []
    for i, npPred in enumerate(m.staged_predict_proba(npFeat)):
        df.loc[:,"pred"] = npPred[:,1]
        dacc =  accu(df, labelName, top)
        acc = 0.0
        if dacc["pos"] > 0:
            acc = (dacc["trueInPos"]*1.0 / dacc["pos"])
        print i, acc
        l.append([i, acc])
    return pd.DataFrame(np.asarray(l), columns=["idx", "acc"])
Example #47
def main(df):
    df1 = base1.main(df)
    df1.reset_index(drop=True, inplace=True)

    df2 = stable_3.main(df)
    df2.reset_index(drop=True, inplace=True)

    l = [
        'ta_CMO_14', 'ta_RSI_14', 'ta_CMO_7', 'ta_RSI_7', 'ta_CMO_10',
        'ta_RSI_10', 'ta_TRIX_2', 'ta_RSI_28', 'ta_CMO_28', 'ta_RSI_5',
        'ta_STOCHRSI_slowd_5_20_12', 'ta_ROC_7', 'ta_ROCR100_7', 'ta_ROCP_7',
        'ta_ROCR_7', 'ta_STOCHRSI_slowd_7_20_12', 'ta_RSI_2', 'ta_ROC_5',
        'ta_ROCR100_5', 'ta_ROCR_5', 'ta_ROCP_5', 'ta_WILLR_10', 'ta_WILLR_14',
        'ta_ROC_2', 'ta_ROCR100_2', 'ta_ROCP_2', 'ta_ROCR_2', 'ta_WILLR_7'
    ]
    df1 = feat_select.append_deep_feats(df1, l)

    df1 = df1[base.get_feat_names(df1) + ["date"]]  # feature columns plus the date join key
    print df2.shape
    df2 = df2.merge(df1, left_on='date', right_on="date", how='inner')
    print df2.shape
    return df2
Example #48
def main(df):
    df1 = base1.main(df)
    df1.reset_index(drop=True, inplace=True)

    df2 = stable_3.main(df)
    df2.reset_index(drop=True, inplace=True)

    l = ['ta_CMO_14', 'ta_RSI_14','ta_CMO_7','ta_RSI_7','ta_CMO_10',
         'ta_RSI_10',
         'ta_TRIX_2',
         'ta_RSI_28',
         'ta_CMO_28',
         'ta_RSI_5',
         'ta_STOCHRSI_slowd_5_20_12',
         'ta_ROC_7',
         'ta_ROCR100_7',
         'ta_ROCP_7',
         'ta_ROCR_7',
         'ta_STOCHRSI_slowd_7_20_12',
         'ta_RSI_2',
         'ta_ROC_5',
         'ta_ROCR100_5',
         'ta_ROCR_5',
         'ta_ROCP_5',
         'ta_WILLR_10',
         'ta_WILLR_14',
         'ta_ROC_2',
         'ta_ROCR100_2',
         'ta_ROCP_2',
         'ta_ROCR_2',
         'ta_WILLR_7']
    df1 = feat_select.append_deep_feats(df1,l)

    df1 = df1[base.get_feat_names(df1) + ["date"]]  # feature columns plus the date join key
    print df2.shape
    df2 = df2.merge(df1, left_on='date', right_on="date", how='inner')
    print df2.shape
    return df2
Example #49
def one_work(cls, ta_dir, label, date_range, th):
    re =  "%s\t%s\t%s\t%s\t%s\t%f\t" % (cls, ta_dir[-4:], label, date_range[0], date_range[1],th)
    df = ta.get_merged(ta_dir)
    df = get_range(df, date_range[0], date_range[1])
    cls = joblib.load(os.path.join(root, 'data', 'models',"model_" + cls + ".pkl"))
    feat_names = base.get_feat_names(df)
    npFeat = df.loc[:,feat_names].values
    #for i, npPred in enumerate(cls.staged_predict_proba(npFeat)):
    #    if i == 322:
    #        break

    npPred = cls.predict_proba(npFeat)
    df["pred"] = npPred[:,1]
    dacc = accu(df, label, th)
    re += "%f\t%d\t%d\t" % (dacc["rate"],dacc["trueInPos"], dacc["pos"])
    if dacc["pos"] > 0:
        re += "%f" % (dacc["trueInPos"]*1.0 / dacc["pos"])
    else :
        re += "0.0"
    print re
    return re
Example #50
def extract_feat_label(df, scorename):
    df = df.replace([np.inf,-np.inf],np.nan).dropna()
    feat_names = base.get_feat_names(df)
    npFeat = df.loc[:,feat_names].values.copy()
    npLabel = df.loc[:,scorename].values.copy()
    return npFeat, npLabel
Example #51
#!/usr/bin/env python2.7
Example #52
confer = MyConfStableLTa()

#build.work(confer)
#model.work(confer)



df = pd.read_pickle(os.path.join(root, 'output', "result_20170205.pkl"))
print(ana.roc_auc(df, confer))

clazz_file_name = confer.get_classifier_file()
with open(clazz_file_name, 'rb') as fin:
    clazz = pickle.load(fin)

feat_names = base.get_feat_names(df)
ipts = sorted(clazz.get_feature_importances(feat_names).items(), key=lambda a:a[1], reverse=True)
for each in ipts:
    print(each)


dfo = df.sort_values("pred", ascending=False)
df = dfo[feat_names]
df_sum = df.sum(axis=0).to_frame(name='sum')
df_sum = df_sum/len(df)
print(df_sum.sort_values("sum", ascending=False).head())

df_top = dfo[dfo[confer.score1.get_name()]==0].head(100)[feat_names]
df_top_sum = df_top.sum(axis=0).to_frame(name='sum')
df_top_sum = (df_top_sum/len(df_top)).sort_values("sum", ascending=False)
print(df_top_sum.head())
Example #53
 def _select(self, df, start, end, score):
     df = df[(df.date >= start)&(df.date < end)]
     feat_names = base.get_feat_names(df)
     label = df.loc[:, score]