Example #1
def feature_extraction(data_df, inputs):
    """
    feature extraction --> Transform the existing features
                           into a lower dimensional space
    Transforms the data - can be used to linearly separate
                          data thru dimensionality reduction
    """
    timeme(principal_component_analysis)(data_df, tuple(inputs))
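The helper principal_component_analysis is defined elsewhere in the project; a minimal sketch of such a helper, assuming scikit-learn is available and that `inputs` lists the feature column names (neither detail is shown in the original), could look like this:

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

def principal_component_analysis(data_df, inputs, n_components=2):
    # hypothetical sketch of the project's PCA helper (assumptions noted above)
    # standardize the chosen feature columns so each contributes equally
    feats = StandardScaler().fit_transform(data_df[list(inputs)].values)
    # project the standardized features onto the top principal components
    pca = PCA(n_components=n_components)
    reduced = pca.fit_transform(feats)
    print("Explained variance ratio: {}".format(pca.explained_variance_ratio_))
    return reduced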
Example #2
def feature_selection(train_df, inputs):
    """
    Sequential Backward Selection - feature selection to see
    which are the most telling variable
    Default is K-means Clustering

    Feature selection: Select a subset of the existing
                       features without a transformation

    Use this to limit down the factors we learn on
    """
    ests = []
    ests.append([
        DecisionTreeClassifier(criterion='entropy',
                               max_depth=3,
                               random_state=0), 'DecTree'
    ])
    ests.append([
        RandomForestClassifier(criterion='entropy',
                               n_estimators=3,
                               random_state=1,
                               n_jobs=3), 'RandForest'
    ])
    # ests.append([SVC(kernel='linear', C=100, random_state=0), 'SVC'])
    # note: the l1 penalty needs a compatible solver (liblinear or saga)
    # in recent scikit-learn releases
    ests.append([
        LogisticRegression(C=100, random_state=0, penalty='l1',
                           solver='liblinear'), 'LogRegr'
    ])
    # ests.append([AdalineSGD(n_iter=15, eta=0.001, random_state=1),
    #              'AdalineSGD'])
    # ests.append([AdalineGD(n_iter=20, eta=0.001), 'AdalineGD'])
    # KNeighborsClassifier is k-nearest neighbors (KNN), not K-means clustering
    ests.append([KNeighborsClassifier(n_neighbors=3), 'KNN'])

    ranks = []
    # for ind_est in ests:
    #     print("running for {}".format(ind_est[1]))
    #     ranks.append([ind_est[1], timeme(sbs_run)(train_df, tuple(inputs),
    #                                               est=ind_est[0], name=ind_est[1])])

    # Random Forest Feature Selection - using a random forest to identify
    # which factors decrease impurity the most
    pdb.set_trace()  # debugger breakpoint to inspect results; remove for unattended runs
    ranks.append([
        timeme(random_forest_feature_importance)(train_df, tuple(inputs)),
        'RandForestFeats'
    ])

    # Logistic Regression Feature Selection - logistic regression
    # should expose the important variables through its weights
    pdb.set_trace()
    ranks.append([
        timeme(logistic_regression_feature_importance)(train_df,
                                                       tuple(inputs)),
        "LogRegrWgts"
    ])
    pdb.set_trace()
    for rank in ranks:
        print("Ranks for {}".format(rank[1]))
        print(rank[0])
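Both ranking helpers used above are project functions whose bodies are not shown here; a minimal sketch of each, assuming scikit-learn, that `inputs` names the feature columns, and a hypothetical label column called 'target', might look like this:

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

def random_forest_feature_importance(train_df, inputs):
    # hypothetical sketch: rank features by mean decrease in impurity
    X, y = train_df[list(inputs)].values, train_df['target'].values
    forest = RandomForestClassifier(n_estimators=100, random_state=0)
    forest.fit(X, y)
    order = np.argsort(forest.feature_importances_)[::-1]
    return [(inputs[i], forest.feature_importances_[i]) for i in order]

def logistic_regression_feature_importance(train_df, inputs):
    # hypothetical sketch: rank (standardized) features by absolute weight
    X, y = train_df[list(inputs)].values, train_df['target'].values
    lr = LogisticRegression(C=100, penalty='l1', solver='liblinear',
                            random_state=0)
    lr.fit(X, y)
    order = np.argsort(np.abs(lr.coef_[0]))[::-1]
    return [(inputs[i], lr.coef_[0][i]) for i in order]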
Example #3
def run_analysis():
    """
    Function to run through the whole analysis process
    """
    # Run the filter
    print("Running Screener:")
    ticks = timeme(run_filter)()
    print("{} stocks thru filter\n".format(len(ticks)))

    # check recent momentum returns
    print("Running Momentum Check:")
    ticks = timeme(check_momentum)('20190703', ticks)
    print("{} stocks thru momentum checks\n".format(len(ticks)))

    # check recent big vs small results
    print("Running Big vs. Small Filter:")
    ticks = timeme(check_big_v_small)('20190703', ticks.reset_index())
    print("{} stocks thru big vs small filter\n".format(len(ticks)))

    # remove ticks that should be ignored
    print("Ignoring Certain Symbols:")
    ticks = timeme(ignore_ticks)(ticks.reset_index())
    print("{} stocks thru ignore filter\n".format(len(ticks)))

    # Run equity valuation
    print("Running Equity Valuation on:  {}".format(
        ticks.index.levels[0].values))
    timeme(run_eq_valuation)(ticks)
    print()
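Every step here (and in the other examples) is wrapped in `timeme`, a project-level timing helper; its real implementation is not shown, so the following is only a minimal sketch of such a decorator:

import time
from functools import wraps

def timeme(func):
    # hypothetical sketch of the project's timing wrapper
    @wraps(func)
    def wrapper(*args, **kwargs):
        start = time.time()
        result = func(*args, **kwargs)
        # report how long the wrapped call took
        print("{} finished in {:.2f}s".format(func.__name__, time.time() - start))
        return result
    return wrapper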
Example #4
def model_evaluation(data_df, inputs):
    """
    Evaluate the performance of your model through different techniques
    """
    timeme(kfold_cross_validation)(data_df, tuple(inputs))
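As with the other helpers, kfold_cross_validation is defined elsewhere; a minimal sketch, assuming scikit-learn, a hypothetical 'target' label column, and a logistic-regression estimator chosen purely for illustration:

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

def kfold_cross_validation(data_df, inputs, k=10):
    # hypothetical sketch: stratified k-fold accuracy for one estimator
    X, y = data_df[list(inputs)].values, data_df['target'].values
    est = LogisticRegression(solver='liblinear', random_state=0)
    scores = cross_val_score(est, X, y, cv=k)
    print("CV accuracy: {:.3f} +/- {:.3f}".format(scores.mean(), scores.std()))
    return scores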
Example #5
def model_evaluation(df, inputs):
    timeme(kfold_cross_validation)(df, tuple(inputs))