Example #1
    def fit(self, x, y):
        from genetic_selection import GeneticSelectionCV
        import random as rn
        self.X = x
        self.y = y
        if self.seed is not None:
            np.random.seed(self.seed)
            rn.seed(self.seed)

        # Run the genetic-algorithm feature search with 10-fold cross-validation
        selector = GeneticSelectionCV(self.estimator,
                                      cv=10,
                                      verbose=1,
                                      scoring="accuracy",
                                      n_population=50,
                                      crossover_proba=0.5,
                                      mutation_proba=0.2,
                                      n_generations=50,
                                      crossover_independent_proba=0.5,
                                      mutation_independent_proba=0.05,
                                      tournament_size=3,
                                      caching=True,
                                      n_jobs=-1)

        self.selector = selector.fit(self.X, self.y)

        print('GA - Selection')
        print(f'Number of selected features: {self.selector.n_features_}')
        print('Selected index:')
        print(
            pd.Series(
                self.colname).values[:self.feature][self.selector.support_])
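    # A hedged sketch (not part of the original class): a companion transform method
    # that applies the fitted GA mask to new data via the selector's boolean support_.
    def transform(self, x):
        import numpy as np
        x = np.asarray(x)
        # support_ marks which candidate features the GA kept
        return x[:, self.selector.support_]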
def genetic_select(X, y, columns):
    scaler = MinMaxScaler()
    # Features need to be scaled to reduce the problem's complexity;
    # otherwise, LogisticRegression may fail to converge.
    X = scaler.fit_transform(X)
    estimator = LogisticRegression(solver="liblinear", multi_class="ovr")
    selector = GeneticSelectionCV(estimator,
                                  cv=3,
                                  verbose=1,
                                  scoring="accuracy",
                                  max_features=min(X.shape[1], 30),
                                  n_population=50,
                                  crossover_proba=0.5,
                                  mutation_proba=0.2,
                                  n_generations=80,
                                  crossover_independent_proba=0.5,
                                  mutation_independent_proba=0.05,
                                  tournament_size=5,
                                  n_gen_no_change=10,
                                  caching=True,
                                  n_jobs=-1)
    selector = selector.fit(X, y)
    support_names = [columns[i] for i, s in enumerate(selector.support_) if s]
    return {
        # pick selected features names
        'support': support_names,
        # pick feature coefficients
        #'coef': {support_names[i]: c for i, c in enumerate(selector.estimator_.coef_)},
    }
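# A hedged sketch (the function name is illustrative, not from the original repo):
# because GeneticSelectionCV follows the scikit-learn estimator API, the scaling step
# above can instead live inside a Pipeline, so the scaler is fit only on the training
# folds of each cross-validation split rather than on the full dataset.
def genetic_select_pipeline(X, y, columns):
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.linear_model import LogisticRegression
    from genetic_selection import GeneticSelectionCV
    pipe = Pipeline([
        ('scaler', MinMaxScaler()),
        ('selector', GeneticSelectionCV(
            LogisticRegression(solver="liblinear", multi_class="ovr"),
            cv=3,
            scoring="accuracy",
            max_features=min(X.shape[1], 30),
            n_population=50,
            n_generations=80,
            n_jobs=-1)),
    ])
    pipe.fit(X, y)
    support = pipe.named_steps['selector'].support_
    # Return the names of the columns the GA kept
    return [c for c, keep in zip(columns, support) if keep]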
def main():
    data = pd.read_csv('predict_house.csv')
    # Noisy, uncorrelated features (note: E is generated here but never used below)
    E = np.random.uniform(0, 0.1, size=(len(data), 20))

    X = data.iloc[:, 0:79]
    y = data.iloc[:, -1]

    estimator = linear_model.LogisticRegression(solver="liblinear",
                                                multi_class="ovr")

    selector = GeneticSelectionCV(estimator,
                                  cv=5,
                                  verbose=1,
                                  scoring="accuracy",
                                  max_features=5,
                                  n_population=50,
                                  crossover_proba=0.5,
                                  mutation_proba=0.2,
                                  n_generations=40,
                                  crossover_independent_proba=0.5,
                                  mutation_independent_proba=0.05,
                                  tournament_size=3,
                                  n_gen_no_change=10,
                                  caching=True,
                                  n_jobs=-1)
    selector = selector.fit(X, y)
    kq = selector.predict(X)
    # Print each actual target next to its prediction
    for actual, predicted in zip(y, kq):
        print(actual)
        print(predicted)
        print()
def randomforest_genetic(X, y, X_test, y_test, columns):
    logger.info("Start RandomForest + Genetic")
    estimator = RandomForestClassifier(**CLASSIFIER_PARAMS)
    selector = GeneticSelectionCV(estimator,
                                  cv=5,
                                  verbose=0,
                                  scoring="accuracy",
                                  max_features=min(X.shape[1], 30),
                                  n_population=50,
                                  crossover_proba=0.5,
                                  mutation_proba=0.2,
                                  n_generations=80,
                                  crossover_independent_proba=0.5,
                                  mutation_independent_proba=0.05,
                                  tournament_size=5,
                                  n_gen_no_change=10,
                                  caching=True,
                                  n_jobs=-1)
    selector = selector.fit(X, y)
    logger.info("End RandomForest + Genetic")
    support_names = [columns[i] for i, s in enumerate(selector.support_) if s]
    # estimator_ is refit on the selected subset, so its importances align with support_names
    importances = dict(zip(support_names, selector.estimator_.feature_importances_))
    labeled = {str(k): v for k, v in sorted(importances.items(), key=lambda item: -item[1])}
    return {
        # pick selected features names
        'support': support_names,
        # pick feature coefficients
        #'coef': {support_names[i]: c for i, c in enumerate(selector.estimator_.coef_)},
        'feature_importances': labeled,
        'score': selector.score(X, y),
        'test_score': selector.score(X_test, y_test)
    }
Example #5
def main():
    iris = datasets.load_iris()

    # Some noisy data not correlated
    E = np.random.uniform(0, 0.1, size=(len(iris.data), 20))

    X = np.hstack((iris.data, E))
    y = iris.target

    estimator = linear_model.LogisticRegression(solver="liblinear",
                                                multi_class="ovr")

    selector = GeneticSelectionCV(estimator,
                                  cv=5,
                                  verbose=1,
                                  scoring="accuracy",
                                  max_features=5,
                                  n_population=50,
                                  crossover_proba=0.5,
                                  mutation_proba=0.2,
                                  n_generations=40,
                                  crossover_independent_proba=0.5,
                                  mutation_independent_proba=0.05,
                                  tournament_size=3,
                                  caching=True,
                                  n_jobs=-1)
    selector = selector.fit(X, y)

    print(selector.support_)
Example #6
def optim_feature_genetic(clf, df, dftest, colX, coly, params=None):
    """
    Genetic-algorithm feature selection around an arbitrary classifier.
    https://github.com/manuel-calzolari/sklearn-genetic

    `params` is forwarded to GeneticSelectionCV, e.g.:
        cv=5, verbose=1, scoring="accuracy", max_features=5,
        n_population=50, crossover_proba=0.5, mutation_proba=0.2,
        n_generations=40, crossover_independent_proba=0.5,
        mutation_independent_proba=0.05, tournament_size=3,
        n_gen_no_change=10, caching=True, n_jobs=-1
    """
    from genetic_selection import GeneticSelectionCV
    params = params or {}
    selector = GeneticSelectionCV(clf, **params)
    selector = selector.fit(df[colX].values, df[coly].values)
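# A hedged usage sketch (the data is synthetic and the call is illustrative): the GA
# settings travel as a plain dict and are unpacked onto GeneticSelectionCV inside
# optim_feature_genetic.
if __name__ == "__main__":
    import numpy as np
    import pandas as pd
    from sklearn.linear_model import LogisticRegression

    rng = np.random.RandomState(0)
    demo = pd.DataFrame(rng.rand(60, 5), columns=[f"f{i}" for i in range(5)])
    demo["target"] = (demo["f0"] + 0.5 * rng.rand(60) > 0.75).astype(int)

    ga_params = {"cv": 3, "scoring": "accuracy", "n_population": 20,
                 "n_generations": 10, "n_jobs": -1}
    optim_feature_genetic(LogisticRegression(solver="liblinear"),
                          demo, demo,
                          colX=[f"f{i}" for i in range(5)],
                          coly="target",
                          params=ga_params)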
Example #7
def run():
    candidates = CandidateFeatureVector.objects.all().values()
    candidates_df = pd.DataFrame(candidates)
    candidates_df.set_index('id', inplace=True)
    candidates_df.drop(columns=['candidate_id'], inplace=True)

    candidates_df = EncodingUtil.basic_label_encode_cols(
        candidates_df, ConstantsUtil.BASIC_ENCODE_COLS)
    candidates_df = EncodingUtil.sort_position_cols_and_encode(
        candidates_df, ConstantsUtil.STRING_TUPLE_ENCODE_COLS)

    svm = SVM(C=.75, kernel='poly')
    X_train, X_test, y_train, y_test = svm.split_test_data(
        candidates_df, .3, 'classification', True)
    svm.fit_and_predict(X_train, X_test, y_train)
    print(svm.get_confusion_matrix(y_test))
    print(svm.get_classification_report(y_test))

    estimator = svm.get_model()

    selector = GeneticSelectionCV(estimator,
                                  cv=5,
                                  verbose=1,
                                  scoring="accuracy",
                                  max_features=50,
                                  n_population=50,
                                  crossover_proba=0.5,
                                  mutation_proba=0.2,
                                  n_generations=40,
                                  crossover_independent_proba=0.5,
                                  mutation_independent_proba=0.05,
                                  tournament_size=3,
                                  n_gen_no_change=10,
                                  caching=True,
                                  n_jobs=-1)
    X, y = svm.get_data()
    selector = selector.fit(X, y)

    print(selector.support_)
    def GA_features(x, y):
        rf = RandomForestClassifier(max_depth=8, n_estimators=10)
        selector = GeneticSelectionCV(
            rf,
            cv=TimeSeriesSplit(n_splits=4),
            verbose=1,
            scoring="accuracy",
            max_features=80,
            n_population=200,
            crossover_proba=0.5,
            mutation_proba=0.2,
            n_generations=100,
            crossover_independent_proba=0.5,
            mutation_independent_proba=0.05,
            tournament_size=3,
            n_gen_no_change=5,
            caching=True,
            n_jobs=-1
        )
        selector = selector.fit(x, y)
        features = x.columns[selector.support_]

        return features
Example #9
def main(dataset):
    indexFile = 'data/datasets/{}/index.json'.format(dataset)
    resultFile = 'data/datasets/{}/feature_selection.json'.format(dataset)
    with open(indexFile) as f:
        index = json.load(f)

    result = {}
    for _sym, files in index.items():
        params = {
            'estimator': LogisticRegression(solver='liblinear', multi_class='ovr'),
            'cv': 3,
            'verbose': 1,
            'scoring': "accuracy",
            'max_features': 10,
            'n_population': 50,
            'crossover_proba': 0.5,
            'mutation_proba': 0.2,
            'n_generations': 80,
            'crossover_independent_proba': 0.5,
            'mutation_independent_proba': 0.05,
            'tournament_size': 5,
            'n_gen_no_change': 10,
            'caching': True,
            'n_jobs': -1
        }
        pipe = Pipeline([
            ('scaler', MinMaxScaler()),
            ('SVC', GeneticSelectionCV(**params)),
        ])
Example #10
]
random.shuffle(
    together)  #groups based on first item of x_data, which should be shot!
final_random = [i for j in together for i in j]
X_data = (np.array(final_random))[:, 1:-1]
Y_data = (np.array(final_random, dtype=int))[:, -1]
scaler = StandardScaler()
scaler.fit(X_data)
X_data_v2 = scaler.transform(X_data)

X = X_data_v2
y = Y_data

estimator = linear_model.LogisticRegression()

selector = GeneticSelectionCV(estimator,
                              cv=5,
                              verbose=1,
                              scoring="accuracy",
                              n_population=50,
                              crossover_proba=0.5,
                              mutation_proba=0.2,
                              n_generations=40,
                              crossover_independent_proba=0.5,
                              mutation_independent_proba=0.05,
                              tournament_size=3,
                              caching=True,
                              n_jobs=-1)
selector = selector.fit(X, y)

print(selector.support_)
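# A hedged follow-up (not in the original snippet): the fitted selector also exposes
# the usual scikit-learn selector helpers, so the boolean mask can be turned into
# column indices or used to shrink the design matrix directly.
selected_idx = selector.get_support(indices=True)  # integer indices of the kept features
X_reduced = X[:, selector.support_]                # same selection applied to the data
print(selected_idx, X_reduced.shape)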
Example #11

df = pd.read_csv(str(path)+'PLMF.csv')
n_col = df.shape[1]
X = df.iloc[:, 1:n_col - 1]
y = df.iloc[:, n_col - 1]

# Note: `normalize` was removed from scikit-learn estimators in 1.2; with newer
# versions, scale the data beforehand instead. max_iter must be an integer.
estimator = linear_model.Lasso(alpha=1e-3, normalize=True, max_iter=int(1e9))

selector = GeneticSelectionCV(estimator,
                              cv=5,
                              verbose=1,
                              scoring="explained_variance",
                              max_features=100,
                              n_population=30,
                              crossover_proba=0.5,
                              mutation_proba=0.2,
                              n_generations=20,
                              crossover_independent_proba=0.1,
                              mutation_independent_proba=0.05,
                              tournament_size=5,
                              caching=True,
                              n_jobs=-1)

selector = selector.fit(X, y)
print(selector.score(X, y))
#print(selector.support_)


selection = pd.DataFrame(X.columns, columns=['features'])
selection['support'] = selector.support_
selection['Flag'] = selection['support'].astype(int)
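# A hedged follow-up (not in the original): the same table can be used to pull out
# just the feature names the GA retained.
selected_features = selection.loc[selection['support'], 'features'].tolist()
print(selected_features)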
Example #12
pop_size = [50]
cross_over = [0.2, 0.5, 0.8]
mutation = [0.01, 0.05, 0.1]
variations = [i for i in itertools.product(pop_size, cross_over, mutation)]
run = 0
best_fitness_values = [0] * len(variations)
for var_index, var in enumerate(variations):
    bsf_score_run = 0
    selector = GeneticSelectionCV(
        estimator,
        cv=rkf,
        verbose=0,
        scoring="accuracy",
        max_features=len(allfeats),
        n_population=var[0],
        crossover_proba=var[1],
        mutation_proba=var[2],
        n_generations=30,
        crossover_independent_proba=0.5,
        mutation_independent_proba=0.1,
        #tournament_size = 3,
        n_gen_no_change=10,
        caching=True,
        n_jobs=-1)
    for i in range(30):
        print(
            "-------------------------run {} ----------------------".format(i))

        selector = selector.fit(x_train, y_train)
        run += 1
        genfeats = data[allfeats].columns[selector.support_]
        genfeats = list(genfeats)
Example #13
    def process_feature_selection(self, estimator, features, trainX, trainY,
                                  file, norm):
        model = None
        selection = None
        subsets = []
        ####SequentialFeatureSelector from sklearn

        if self.dat["featureselection"]["name"] == "fs_importance":

            # Train model
            model = estimator.fit(trainX, trainY)
            selection = estimator.feature_importances_.argsort().tolist()

        elif self.dat["featureselection"]["name"] == "fs_extratrees":

            #estimator = ExtraTreesClassifier(n_estimators=150, n_jobs=-1)
            model = SelectFromModel(estimator,
                                    prefit=False).fit(trainX, trainY)
            selection = model.get_support(indices=True).tolist()

        elif self.dat["featureselection"]["name"] == "fs_svc":
            #estimator = LinearSVC(C=0.01, penalty="l2", dual=False)

            model = SelectFromModel(estimator,
                                    prefit=False).fit(trainX, trainY)
            selection = model.get_support(indices=True).tolist()
        ## only for positive values
        #elif self.feature_selection=="fs_chi2":
        #    model = SelectKBest(score_func=chi2, k=trainX.shape[1]-5)
        #   trainX = model.fit_transform(trainX)
        #    testX = model.transform(testX)

        elif self.dat["featureselection"]["name"] == "fs_geuni":
            model = GenericUnivariateSelect(score_func=mutual_info_classif,
                                            mode='percentile',
                                            param=70).fit(trainX, trainY)
            selection = model.get_support(indices=True).tolist()

        elif self.dat["featureselection"]["name"] == "fs_rfecv":
            #clf = DecisionTreeClassifier()
            #clf = LogisticRegression(C=1, penalty='l2')
            #estimator = SVR(kernel="linear")
            #model = RFECV(clf, trainX.shape[1]-15)

            #estimator = LinearSVC(C=0.01, penalty="l2", dual=False)
            model = RFECV(estimator,
                          min_features_to_select=int(len(features) / 3.),
                          n_jobs=-1).fit(trainX, trainY)
            selection = model.get_support(indices=True).tolist()

        elif self.dat["featureselection"]["name"] == "fs_lasso":
            #estimator = LassoCV(cv=5, normalize = True, n_jobs=1)
            model = SelectFromModel(estimator,
                                    threshold=0.25,
                                    norm_order=1,
                                    max_features=None,
                                    prefit=False).fit(trainX, trainY)
            selection = model.get_support(indices=True).tolist()

        elif self.dat["featureselection"]["name"] == "fs_genetic":
            #estimator = linear_model.LogisticRegression(solver="liblinear", multi_class="ovr")

            #estimator = ExtraTreesClassifier(n_estimators=150)

            #estimator = KNeighborsClassifier(n_neighbors=2, n_jobs=-1)
            model = GeneticSelectionCV(estimator,
                                       cv=5,
                                       verbose=1,
                                       scoring="f1",
                                       max_features=int((len(features)) -
                                                        (len(features) / 3)),
                                       n_population=70,
                                       crossover_proba=0.5,
                                       mutation_proba=0.2,
                                       n_generations=40,
                                       crossover_independent_proba=0.5,
                                       mutation_independent_proba=0.05,
                                       tournament_size=3,
                                       n_gen_no_change=10,
                                       caching=True,
                                       n_jobs=-1).fit(trainX, trainY)
            selection = model.get_support(indices=True).tolist()

        elif self.dat["featureselection"]["name"] == "fs_sequential_forward":
            #estimator = KNeighborsClassifier(n_neighbors=2)
            #estimator = LogisticRegression()

            #estimator = RandomForestClassifier(n_estimators=50, random_state=7)
            model = SequentialFeatureSelector(
                estimator,
                direction="forward",
                n_features_to_select=self.dat["featureselection"]
                ["n_features"],
                n_jobs=-1).fit(trainX, trainY)
            selection = model.get_support(indices=True).tolist()

        elif self.dat["featureselection"]["name"] == "fs_sequential_backward":
            #estimator = KNeighborsClassifier(n_neighbors=2)

            #cls = LogisticRegression()
            #cls = RandomForestClassifier(n_estimators=100, random_state=7, n_jobs=1)
            model = SequentialFeatureSelector(estimator,
                                              direction="backward",
                                              n_features_to_select=None,
                                              n_jobs=-1).fit(trainX, trainY)
            selection = model.get_support(indices=True).tolist()

        elif self.dat["featureselection"][
                "name"] == "fs_mlxtend_sequential_forward":
            #estimator = KNeighborsClassifier(n_neighbors=2)

            #cls = LogisticRegression()
            #estimator = RandomForestClassifier(n_estimators=100, random_state=7, n_jobs=1)
            #estimator = svm.SVC(kernel="rbf")
            model = SFS(estimator,
                        k_features=int(len(features) / 2.),
                        forward=True,
                        floating=False,
                        verbose=2,
                        scoring='f1',
                        cv=3,
                        n_jobs=-1).fit(trainX, trainY)
            selection = list(model.k_feature_idx_)
            #subsets = model.subsets_()

        elif self.dat["featureselection"][
                "name"] == "fs_mlxtend_sequential_backward":
            #estimator = KNeighborsClassifier(n_neighbors=2)

            #cls = LogisticRegression()
            #cls = RandomForestClassifier(n_estimators=100, random_state=7, n_jobs=1)
            model = SFS(
                estimator,
                k_features=50,
                #k_features=int(len(features)/2.),
                forward=False,
                floating=False,
                scoring='accuracy',
                cv=4,
                n_jobs=-1).fit(trainX, trainY)
            selection = list(model.k_feature_idx_)
            #subsets = model.subsets_()

        elif self.dat["featureselection"][
                "name"] == "fs_mlxtend_sequential_forward_floating":
            #estimator = KNeighborsClassifier(n_neighbors=2)
            #estimator = LogisticRegression()

            #estimator = RandomForestClassifier(n_estimators=50, random_state=7)

            #estimator = XGBClassifier(
            #                learning_rate=0.2, n_estimators=50, max_depth=4,
            #                min_child_weight=2, gamma=0.0, subsample=0.8, colsample_bytree=0.8,
            #                objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27,
            #                ##tree_method='gpu_hist'  # THE MAGICAL PARAMETER
            #            )

            #estimator = svm.SVC(kernel="rbf")

            model = SFS(
                estimator,
                k_features=self.dat["featureselection"]["n_features"],
                #k_features=int(len(features)/2.),
                forward=True,
                floating=True,
                verbose=2,
                scoring='f1',
                cv=3,
                n_jobs=-1).fit(trainX, trainY)
            selection = list(model.k_feature_idx_)
            #subsets = model.subsets_

        elif self.dat["featureselection"][
                "name"] == "fs_mlxtend_sequential_backward_floating":
            #estimator = KNeighborsClassifier(n_neighbors=2)
            #estimator = LogisticRegression()

            #estimator = RandomForestClassifier(n_estimators=50, random_state=7)
            model = SFS(estimator,
                        k_features=self.dat["featureselection"]["n_features"],
                        forward=False,
                        floating=True,
                        verbose=2,
                        scoring='f1',
                        cv=4,
                        n_jobs=-1).fit(trainX, trainY)
            selection = list(model.k_feature_idx_)
            #subsets = model.subsets_

        #selection = model.get_support(indices=False).tolist()
        datafesel = {
            "features": features,
            "selected": selection,
            "subset": subsets
        }
        normshift = {"None": 0, "std": 1, "minmax": 2}
        Util.makedir(self.dat["outputdir"] + "/featureselection/")
        Util.write(
            self.dat["outputdir"] + "/featureselection/" +
            self.dat["featureselection"]["name"] + "_" +
            self.dat["featureselection"]["estimator"] + "_" +
            str(normshift[norm]) + "_" + file, datafesel)
        return datafesel
df_selected_features_filter = pd.DataFrame(
    df_selected_features_filter, columns=filter_method_selected_features)

# 4. Final feature selection with the wrapper method of genetic algorithm

X = df_selected_features_filter.copy()

estimator = RandomForestClassifier(n_estimators=1000, n_jobs=1)
selector = GeneticSelectionCV(estimator,
                              cv=5,
                              verbose=1,
                              scoring="accuracy",
                              max_features=18,
                              n_population=300,
                              crossover_proba=0.5,
                              mutation_proba=0.2,
                              n_generations=50,
                              crossover_independent_proba=0.1,
                              mutation_independent_proba=0.05,
                              tournament_size=3,
                              n_gen_no_change=10,
                              caching=True,
                              n_jobs=-1)
selector = selector.fit(X, y.values.ravel())

print(selector.support_)
print(X.columns)

# Drop the columns the GA did not select
X.drop(columns=X.columns[~selector.support_], inplace=True)
Example #15
def main():
    result = {}
    for _sym in SYMBOLS:
        dataset = 'data/result/datasets/csv/{}.csv'.format(_sym)
        df = pd.read_csv(dataset,
                         sep=',',
                         encoding='utf-8',
                         index_col='Date',
                         parse_dates=True)
        df = df.replace([np.inf, -np.inf], np.nan).dropna()
        X = df[df.columns.difference(['target', 'target_pct', 'target_label'])]
        y = df['target']
        #print("======"+_sym+"======")
        #print(X.info())

        # Variance Threshold
        sel = VarianceThreshold()
        sel.fit_transform(X)
        sup = sel.get_support()
        X = X[[name for flag, name in zip(sup, X.columns) if flag]]
        ## SelectKBest
        sel = SelectKBest(chi2, k=30)
        sX = scale(X, scaler='minmax')
        sel.fit_transform(sX, y)
        sup = sel.get_support()
        sX = sX[[name for flag, name in zip(sup, sX.columns) if flag]]

        ## Recursive Feature Elimination
        # Create the RFE object and compute a cross-validated score.
        # The "accuracy" scoring is proportional to the number of correct
        # classifications
        # model = SVC(kernel="linear")
        # rfecv = RFECV(estimator=model, step=1, cv=StratifiedKFold(2), scoring='accuracy', n_jobs=-1, verbose=1)
        # rfecv.fit(X, y)
        # X = X[[name for flag, name in zip(rfecv.support_, X.columns) if flag]]
        ### Genetic
        # estimator = MLPClassifier(**{
        #     'hidden_layer_sizes': (10, 4),
        #     'solver': 'lbfgs',
        #     'learning_rate': 'constant',
        #     'learning_rate_init': 0.001,
        #     'activation': 'logistic'
        # })
        estimator = LogisticRegression(solver="liblinear", multi_class="ovr")
        gscv = GeneticSelectionCV(estimator,
                                  cv=2,
                                  verbose=1,
                                  scoring="accuracy",
                                  max_features=30,
                                  n_population=50,
                                  crossover_proba=0.5,
                                  mutation_proba=0.2,
                                  n_generations=80,
                                  crossover_independent_proba=0.5,
                                  mutation_independent_proba=0.05,
                                  tournament_size=3,
                                  n_gen_no_change=10,
                                  caching=True,
                                  n_jobs=-1)
        gscv = gscv.fit(X, y)
        X = X[[name for flag, name in zip(gscv.support_, X.columns) if flag]]

        #print(X.columns)

        # print("[%s] Optimal number of features : %d Set: %s" % (_sym, rfecv.n_features_, ', '.join(X.columns)))
        # plt.figure()
        # plt.title(_sym + ' SVC RFECV K=2')
        # plt.xlabel("Number of features selected")
        # plt.ylabel("Cross validation score (nb of correct classifications)")
        # plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
        # plt.show()

        logger.info("{}: {}".format(_sym, X.columns))
        result[_sym] = {
            'dataset': dataset,
            'columns_genetic_lr_30': [c for c in X.columns],
            'columns_kbest_30': [c for c in sX.columns]
        }
    return result
Example #16
def main():
    djia_in = []
    djia_out = []
    dat_ = []
    dat = []
    st = 0
    with open("DJIA.csv", "r") as djia_r:
        djia_r.readline()
        for l in djia_r:
            x = l.strip().split(",")
            if x[1] == ".":
                continue
            # Lag features: the previous close and the close two days back
            # (fall back to today's value at the start of the series).
            u = [(datetime.datetime.strptime(x[0], "%Y-%m-%d") -
                  datetime.datetime(1970, 1, 1)).total_seconds(),
                 dat_[-1][-1] if len(dat_) > 0 else float(x[1]),
                 dat_[-2][-1] if len(dat_) > 1 else float(x[1])]
            if x[0] == "2018-11-09":  # Check this date
                st = len(dat_)
            v = float(x[1])
            dat_.append(u + [v])

    dat = dat_[st:]
    print(st)
    split = int(.8 * len(dat))

    scaler = MinMaxScaler()
    scaler.fit(dat)
    djia_s = scaler.transform(dat)

    djia_in = [x[:-1] for x in djia_s]
    djia_out = [x[-1] for x in djia_s]

    djia_in_train = np.array(djia_in[:split])
    djia_out_train = np.array(djia_out[:split])
    djia_in_test = np.array(djia_in[split:])
    djia_out_test = np.array(djia_out[split:])

    print(djia_s)

    m = svm.SVR(C=0.01,
                cache_size=1000,
                coef0=djia_out_train[-1],
                degree=5,
                epsilon=0.005,
                gamma='auto',
                kernel='poly',
                max_iter=5000,
                shrinking=True,
                tol=0.0001,
                verbose=True)

    model = m.fit(djia_in_train, djia_out_train)
    res = copy.deepcopy(m.predict(djia_in_test))

    xs = [x[0] for x in djia_in_test]
    plt.plot(xs, res, "b", label="SVR")

    m2 = GeneticSelectionCV(m,
                            cv=5,
                            verbose=1,
                            scoring="neg_mean_squared_error",
                            n_population=1000,
                            crossover_proba=0.5,
                            mutation_proba=0.2,
                            n_generations=2000,
                            crossover_independent_proba=0.5,
                            mutation_independent_proba=0.05,
                            tournament_size=3,
                            n_gen_no_change=10,
                            caching=True,
                            n_jobs=10)
    m2.fit(djia_in_train, djia_out_train)

    res2 = m2.predict(djia_in_test)

    plt.plot(xs, res2, "g", label="GA/SVR")
    plt.plot(xs, djia_out_test, "m", label="Actual")
    plt.xlabel('Time (scaled)')
    plt.ylabel('Points (scaled)')
    plt.legend()

    tp = 0
    tn = 0
    fp = 0
    fn = 0
    for i in range(1, len(res)):
        dirres = res[i] - res[i - 1] >= 0
        diract = djia_out_test[i] - djia_out_test[i - 1] >= 0
        if dirres == diract:
            if dirres:
                tp += 1
            else:
                tn += 1
        else:
            if dirres:
                fp += 1
            else:
                fn += 1

    print(tp, tn, fp, fn)

    plt.suptitle("RMSE = " +
                 str(math.sqrt(mean_squared_error(djia_out_test, res))))
    plt.show()
Example #17
                print('Test error: {}'.format(
                    1 - clf.score(X_test[:, features], y_test)))
                if d == 2:
                    make_plot(X1[:, features], X2[:, features])

    # GA
    print('')
    print('=== Genetic Algorithm ===')
    clf = SVC(C=10, kernel='linear')
    selector = GeneticSelectionCV(clf,
                                  cv=5,
                                  verbose=1,
                                  scoring="accuracy",
                                  n_population=20,
                                  crossover_proba=0.5,
                                  mutation_proba=0.2,
                                  n_generations=10,
                                  crossover_independent_proba=0.5,
                                  mutation_independent_proba=0.05,
                                  tournament_size=3,
                                  caching=True,
                                  n_jobs=8)
    selector = selector.fit(X_train, y_train)
    features = [i for i in range(X_train.shape[1]) if selector.support_[i]]
    print('Features selected: {}'.format(features))
    clf = SVC(C=10, kernel='linear')
    clf.fit(X_train[:, features], y_train)
    print('Train error: {}'.format(1 -
                                   clf.score(X_train[:, features], y_train)))
    print('Test error: {}'.format(1 - clf.score(X_test[:, features], y_test)))