Example No. 1
def predict_TestData(Food_df,People_df):
    cTrainF = rand(len(Food_df)) > .5
    cTestF = ~cTrainF
    cTrainP = rand(len(People_df)) > .5
    cTestP = ~cTrainP

    TrainX_df = pd_concat([People_df[cTrainP], Food_df[cTrainF]],axis=0)
    TestX_df = pd_concat([People_df[cTestP], Food_df[cTestF]],axis=0)

    TrainX= TrainX_df.iloc[:,2:].values   # .iloc replaces the removed DataFrame.ix indexer
    TestX= TestX_df.iloc[:,2:].values
    TrainY = concatenate([ones(len(People_df[cTrainP])), zeros(len(Food_df[cTrainF]))])
    TestY = concatenate([ones(len(People_df[cTestP])), zeros(len(Food_df[cTestF]))])

    ET_classifier = ExtraTreesClassifier(n_estimators=50, max_depth=None, min_samples_split=1, random_state=0)
    ET_classifier.fit(TrainX,TrainY)
    ET_prediction = ET_classifier.predict(TestX) 

    LinSVC_classifier = svm.LinearSVC()
    LinSVC_classifier.fit(TrainX,TrainY)
    LinSVC_predict = LinSVC_classifier.predict(TestX)

    a=DataFrame()
    a["url"]=TestX_df.urls.values
    a["answer"]=TestY
    a["ET_predict"]=ET_prediction
    a["LinSVC_predict"]=LinSVC_predict
    a.to_csv("prediction_for_TestData.csv")
Example No. 2
def tree(train_data, train_labels, all_bigrams, task):
	forest = ExtraTreesClassifier(n_estimators=100, random_state=0)
	forest.fit(train_data, train_labels)
	importances = forest.feature_importances_
	indices = np.argsort(importances)[::-1]

	# Print the feature ranking
	print "-"*45
	print task

	for f in range(20):
	  print("%d. feature, name: %s, importance: %f" % (f + 1, all_bigrams[indices[f]], importances[indices[f]]))

	# Plot the feature importances of the forest
	pl.figure()
	n = train_data.shape[1]
	n = 2000
	pl.title("Sorted feature importance for %s" %(task))
	pl.bar(range(n), importances[indices][:n], color="black", align="center")
	pl.xlim([0, (n)])
	pl.xticks([num for num  in range(0, n+1, 250)])
	pl.savefig(task+'.pdf', bbox_inches='tight')
	print "plot saved"

	return indices
Example No. 3
def main():

    # Define the known data points or "training" data
    explanatory_fields = "d100 dd0 dd5 fday ffp gsdd5 gsp map mat_tenths mmax_tenths mmindd0 mmin_tenths mtcm_tenths mtwm_tenths sday".split()
    explanatory_rasters = [os.path.join(TRAINING_DIR, "current_" + r + ".img") for r in explanatory_fields]
    response_shapes = os.path.join(TRAINING_DIR, "DF.shp")

    # Load the training rasters using the sampled subset
    try:
        cached = json.load(open("_cached_training.json"))
        train_xs = np.array(cached['train_xs'])
        train_y = np.array(cached['train_y'])
    except IOError:
        train_xs, train_y = load_training_vector(response_shapes, 
            explanatory_rasters, response_field='GRIDCODE')
        cache = {'train_xs': train_xs.tolist(), 'train_y': train_y.tolist()}
        with open("_cached_training.json", 'w') as fh:
            fh.write(json.dumps(cache))

    print(train_xs.shape, train_y.shape)

    # Train the classifier
    clf = ExtraTreesClassifier(n_estimators=120, n_jobs=3)
    clf.fit(train_xs, train_y)
    print(clf)

    evaluate_clf(clf, train_xs, train_y, feature_names=explanatory_fields)
def eval_param(params):
    """Evaluation of one set of xgboost's params.
    Then, use 3 folds as training and cv in a row as xgboost's watchlist with an early_stop at 50.
    """
    global df_results, train, target, test
    print ("Training with params : ")
    print (params)

    random_state = 42
    avg_score = 0.
    n_folds = 3
    predict = np.zeros(test.shape[0])
    #dtest = xgb.DMatrix(test)
    skf = StratifiedKFold(target, n_folds=n_folds, random_state=random_state)
    for train_index, cv_index in skf:
        # train
        x_train, x_cv = train[train_index], train[cv_index]
        y_train, y_cv = target[train_index], target[cv_index]
        clf = ExtraTreesClassifier(**params).fit(x_train, y_train)
        #bst = xgb.train(params, dtrain, num_round, watchlist, early_stopping_rounds=early_stopping_rounds, maximize=True)
        # test / score
        predict_cv = clf.predict_proba(x_cv)  # predict_proba only takes X  #bst.predict(dvalid, ntree_limit=bst.best_iteration)
        avg_score += -log_loss(y_cv, predict_cv)
        predict += clf.predict_proba(test)[:, 1]  # keep the positive-class column (binary task assumed)  #bst.predict(dtest, ntree_limit=bst.best_iteration)
    predict /= n_folds
    avg_score /= n_folds 
    # store
    new_row = pd.DataFrame([np.append([avg_score], list(params.values()))],
                                 columns=np.append(['score'], list(params.keys())))
    df_results = df_results.append(new_row, ignore_index=True)
    np.savetxt('hyperopt_preds/pred' + str(df_results.index.max()) + '.txt', predict, fmt='%s')
    df_results.to_csv('hyperopt_results_sgd.csv')
    print ("\tScore {0}\n\n".format(avg_score))
    return {'loss': - avg_score, 'status': STATUS_OK}
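# Hedged usage sketch (not part of the original snippet): eval_param returns the
# {'loss': ..., 'status': STATUS_OK} dict that hyperopt's fmin expects, so it could be
# driven roughly as below. The search-space values and globals (train, target, test,
# df_results) are illustrative assumptions.
from hyperopt import fmin, tpe, hp

space = {
    'n_estimators': 300,
    'max_features': hp.uniform('max_features', 0.1, 1.0),
    'min_samples_leaf': hp.choice('min_samples_leaf', [1, 2, 5, 10]),
    'n_jobs': -1,
}
best = fmin(fn=eval_param, space=space, algo=tpe.suggest, max_evals=50)
print(best)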
def calc_prob(df_features_driver, df_features_other):

    df_train = df_features_driver.append(df_features_other)
    df_train.reset_index(inplace = True)
    df_train.Driver = df_train.Driver.astype(int)

    # So far, the best result was achieved by using a RandomForestClassifier with Bagging
    # model = BaggingClassifier(base_estimator = ExtraTreesClassifier())
    # model = BaggingClassifier(base_estimator = svm.SVC(gamma=2, C=1))
    # model = BaggingClassifier(base_estimator = linear_model.LogisticRegression())
    # model = BaggingClassifier(base_estimator = linear_model.LogisticRegression())
    # model = BaggingClassifier(base_estimator = AdaBoostClassifier())
    #model = RandomForestClassifier(200)
    # model = BaggingClassifier(base_estimator = [RandomForestClassifier(), linear_model.LogisticRegression()])
    # model = EnsembleClassifier([BaggingClassifier(base_estimator = RandomForestClassifier()),
    #                             GradientBoostingClassifier])
    #model = GradientBoostingClassifier(n_estimators = 10000)
    model = ExtraTreesClassifier(n_estimators=100,max_features='auto',random_state=0, n_jobs=2, criterion='entropy', bootstrap=True)
    # model = ExtraTreesClassifier(500, criterion='entropy')

    feature_columns = df_train.iloc[:, 4:]

    # Train the classifier
    model.fit(feature_columns, df_train.Driver)
    df_submission = pd.DataFrame()

    df_submission['driver_trip'] = create_first_column(df_features_driver)

    probs_array = model.predict_proba(feature_columns[:200]) # Return array with the probability for every driver
    probs_df = pd.DataFrame(probs_array)

    df_submission['prob'] = np.array(probs_df.iloc[:, 1])

    return df_submission
Example No. 6
def learn(f):
    global raw_data
    print 'testing classifier'
    data = raw_data[raw_data['label'] != 'unknown']
    data = data[data['file type'] == 'EXECUTE']
    X = data.as_matrix(f)
    y = np.array(data['label'].tolist())
    #clf = RandomForestClassifier(n_estimators=100)
    clf = ExtraTreesClassifier(n_estimators=100)
    #clf = AdaBoostClassifier()
    scores = sklearn.cross_validation.cross_val_score(clf, X, y, cv=10)
    print("predicted accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    seed = 3301
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=seed)
    clf.fit(X_train, y_train)
    scores = clf.score(X_test, y_test)
    print("actual accuracy: %0.2f" % scores)
    importances = zip(f, clf.feature_importances_)
    importances.sort(key=lambda k:k[1], reverse=True)
    for im in importances[0:20]:
        print im[0].ljust(30), im[1]
    #y_pred = clf.predict(X_test)
    #labels = ['good', 'bad']
    #cm = confusion_matrix(y_test, y_pred, labels)
    #plot_cm(cm, labels)
    #joblib.dump(clf, 'model.pkl')
    return clf
Example No. 7
def doTreeFeatureSelection(estimator, X, y):
	clf = ExtraTreesClassifier()
	clf = clf.fit(X, y)
		
	#print str(clf.feature_importances_)	
	model =  SelectFromModel(clf, prefit=True)
	return model
Example No. 8
def ET_classif(features_df=None, labels_df=None):
    '''Scoring function to be used in SelectKBest feature selection class 
        object.
        
    This scoring function assigns variable importances to the features
        passed in to it using the ExtraTreesClassifier. It then returns
        the features as two identical arrays mimicking the scores and 
        p-values arrays required by SelectKBest to pick the top K 
        features.
        
    Args:
        features_df: Pandas dataframe of features to be used to predict 
            using the ExtraTreesClassifier.
        labels_df: Pandas dataframe of the labels being predicted.
    Returns:
        Two identical arrays containing the feature importance scores
            returned for each feature by the ExtraTreesClassifier.
    '''
    reducer = ExtraTreesClassifier(n_estimators=500, bootstrap=False,
                                   oob_score=False, max_features=.10,
                                   min_samples_split=10, min_samples_leaf=2,
                                   criterion='gini', random_state=42)

    reducer.fit(features_df, labels_df)
    return reducer.feature_importances_, reducer.feature_importances_
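# Hedged usage sketch (not in the original): as the docstring describes, ET_classif is
# meant to be passed to SelectKBest as its scoring function. features_df / labels_df and
# k=20 are illustrative assumptions.
from sklearn.feature_selection import SelectKBest

selector = SelectKBest(score_func=ET_classif, k=20)
features_top_k = selector.fit_transform(features_df, labels_df.values.ravel())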
Example No. 9
def feature_engineering_common(Y, X, X1):
    print "### Shape of training set (X)", X.shape
    print "### Shape of labels (Y)", Y.shape
    print "### Shape of Kaggle Test set (X1)", X1.shape

    # Scale features
    scaler = preprocessing.StandardScaler()
    X_SCALED = scaler.fit_transform(X)
    X1_SCALED = scaler.transform(X1)
    print "### (After scaling) Shape of training set", X_SCALED.shape
    print "### (After scaling ) Shape of Kaggle Test set", X1_SCALED.shape

    # Find important features using extremely randomized trees (ExtraTrees)
    xtClf = ExtraTreesClassifier().fit(X_SCALED, Y)
    X_SCALED_SUBSET = xtClf.transform(X_SCALED)
    X1_SCALED_SUBSET = xtClf.transform(X1_SCALED)
    importances = xtClf.feature_importances_
    print xtClf.feature_importances_
    print "### (After scaling & feature selection using Random Forrest) Shape of training set", X_SCALED_SUBSET.shape
    print "### (After scaling & feature selection using Random Forrest) Shape of Kaggle Test set", X1_SCALED_SUBSET.shape

    indices = np.argsort(importances)[::-1]

    # Print the feature ranking
    print("Feature ranking:")
    for f in xrange(10):
        print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
Example No. 10
def train_random_forest(X_train,y_train,**kwargs):
    from sklearn.ensemble import ExtraTreesClassifier

    n_estimators = kwargs.pop('n_estimators',300)
    max_features = kwargs.pop('max_features','auto')
    n_jobs       = kwargs.pop('n_jobs',-1)
    verbose      = kwargs.pop('verbose',0)
    tuned_params = kwargs.pop('tuned_params',None)

    # initialize baseline classifier
    clf = ExtraTreesClassifier(n_estimators=n_estimators,random_state=42,
                               n_jobs=n_jobs,verbose=verbose,criterion='gini',
                               max_features=max_features,oob_score=True,
                               bootstrap=True)
    
    if tuned_params is not None: # optimize if desired
        from sklearn.grid_search import GridSearchCV
        cv = GridSearchCV(clf,tuned_params,cv=5,scoring='roc_auc',
                          n_jobs=n_jobs,verbose=verbose,refit=True)
        cv.fit(X_train, y_train)
        clf = cv.best_estimator_
    else: # otherwise train with the specified parameters (no tuning)
        clf.fit(X_train,y_train)

    return clf
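# Hedged usage sketch (not in the original): the parameter grid and X_train/y_train below
# are illustrative. With tuned_params supplied, the function runs GridSearchCV and returns
# the refit best estimator; without it, it just fits the baseline classifier.
tuned = {'max_depth': [None, 10, 30], 'min_samples_leaf': [1, 5]}
clf = train_random_forest(X_train, y_train, n_estimators=500, tuned_params=tuned)
print(clf.oob_score_)  # available because bootstrap=True and oob_score=True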
Example No. 11
def feature_important(filename):
    from sklearn.datasets import make_classification
    from sklearn.ensemble import ExtraTreesClassifier

    content = read_csv(filename)
    X = [c.decisions for c in content]
    y = [c.objective for c in content]

    # Build a forest and compute the feature importances
    forest = ExtraTreesClassifier(n_estimators=250,
                                  random_state=0)

    forest.fit(X, y)
    importances = forest.feature_importances_
    std = np.std([tree.feature_importances_ for tree in forest.estimators_],
                 axis=0)
    indices = np.argsort(importances)[::-1]

    # Print the feature ranking
    print("Feature ranking:")
    #
    for f in range(len(X[0])):
        print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

    # Plot the feature importances of the forest
    plt.figure()
    plt.title("Feature importances")
    plt.bar(range(len(X[0])), importances[indices],
           color="r", yerr=std[indices], align="center")
    plt.xticks(range(len(X[0])), indices)
    plt.xlim([-1, len(X[0])])
    plt.show()
Example No. 12
    def tree_based_selection(self, data_set, data_target, feature_names):
        """

        :param data_set:
        :return:
        """

        clf = ExtraTreesClassifier()
        clf = clf.fit(data_set, data_target)
        print clf.feature_importances_

        model = SelectFromModel(clf, prefit=True)
        feature_set = model.transform(data_set)

        fea_index = []
        for A_col in np.arange(data_set.shape[1]):
            for B_col in np.arange(feature_set.shape[1]):
                if (data_set[:, A_col] == feature_set[:, B_col]).all():
                    fea_index.append(A_col)

        check = {}
        for i in fea_index:
            check[feature_names[i]] = data_set[0][i]
        print np.array(check)

        return feature_set, fea_index
Example No. 13
def top_importances(features_df=None, labels_df=None, top_N=10):
    ''' Finds the top N importances using the ExtraTreesClassifier.
        
    Finds the top N importances of a dataframe of features and a dataframe
        of labels using the ExtraTreesClassifier.
    
    Args:
        features_df: Pandas dataframe of features used to predict.
        labels_df: Pandas dataframe of labels to be predicted.
        top_N: integer value of the top N most important features to return.
    Returns:
        Pandas dataframe containing the top N importances and their 
        importance scores.
    
    '''
    reducer = ExtraTreesClassifier(n_estimators=2000, bootstrap=False,
                                   oob_score=False, max_features=.10,
                                   min_samples_split=10, min_samples_leaf=2,
                                   criterion='gini')

    reducer.fit(features_df, labels_df)
    scores = pd.DataFrame(reducer.feature_importances_,
                          index=features_df.columns)
    scores.columns = ['Importances']
    scores = scores.sort_values('Importances', ascending=False)  # DataFrame.sort was removed from pandas
    return scores[0:top_N]
def crossVal(positions, X, y, missedYFile):
    outF = open(missedYFile, 'w')
    posArray = np.array(positions)
    # Split into training and test
    sss = StratifiedShuffleSplit(y, 4, test_size=0.1, random_state=442)
    cvRound = 0
    for train_index, test_index in sss:
        clf = ExtraTreesClassifier(n_estimators=300,
                                   random_state=13,
                                   bootstrap=True,
                                   max_features=20,
                                   min_samples_split=1,
                                   max_depth=8,
                                   min_samples_leaf=13,
                                   n_jobs=4
                                   )
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        pos_test = posArray[test_index]

        clf = clf.fit(X_train, y_train)
        preds = clf.predict(X_test)
        metrics.confusion_matrix( y_test, preds )
        print( metrics.classification_report(y_test, clf.predict(X_test)) )
        for loc,t,p in zip(pos_test, y_test, preds):
            if t=='0' and p=='1':
                print >> outF, loc + '\t' + str(cvRound)
        cvRound += 1
    outF.close()
Example No. 15
class FeaturesSelectionRandomForests(object):
    
    
    def __init__(self, n_estimators = 100, feature_importance_th = 0.005):
        
        self.n_estimators = n_estimators
        self.feature_importance_th = feature_importance_th
        
            
    def fit(self, X, y, n_estimators = None, feature_importance_th = None):
        
        if n_estimators is not None:
            assert isinstance(n_estimators,(int,long,float))
            self.n_estimators = n_estimators
        if feature_importance_th is not None:
            assert isinstance(feature_importance_th,(int,long,float))
            self.feature_importance_th = feature_importance_th
        
        #filter features by forest model
        self.trees = ExtraTreesClassifier(n_estimators=self.n_estimators)  # feature_importances_ is always computed; the old compute_importances flag is gone
        self.trees.fit(X, y)
        self.features_mask = np.where(self.trees.feature_importances_ > self.feature_importance_th)[0]

    
    def plot_features_importance(self):
        
        pd.DataFrame(self.trees.feature_importances_).plot(kind='bar')
        plt.show()
        
    
    def transform(self, X):

        assert hasattr(self,"features_mask")

        return X[:, self.features_mask]
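# Hedged usage sketch (not in the original): X is assumed to be a NumPy feature matrix
# and y the matching label vector.
selector = FeaturesSelectionRandomForests(n_estimators=200, feature_importance_th=0.01)
selector.fit(X, y)
X_reduced = selector.transform(X)      # keeps only the columns above the importance threshold
selector.plot_features_importance()    # optional bar plot of all importances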
Example No. 16
    def tree_based_feature_selection(self, x: np.ndarray, y: np.ndarray) -> np.ndarray:
        n = len(self.features)
        forest = ExtraTreesClassifier(n_estimators=250, random_state=0)
        forest.fit(x, y)
        importances = forest.feature_importances_
        print(importances)
        std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)
        indices = np.argsort(importances)[::-1]
        print("Feature ranking:")

        for f in range(n):
            print("%d. feature %d: %s (%f)" % (f + 1, indices[f], self.features[indices[f]],importances[indices[f]]))

        # Plot the feature importances of the forest
        # plt.figure()
        # plt.title("Feature importances")
        # plt.bar(range(n), importances[indices],
        #         color="r", yerr=std[indices], align="center")
        # plt.xticks(range(n), indices)
        # plt.xlim([-1, n])
        # plt.show()
        n = 12
        print(indices[0:n+1])
        print(self.features[indices[0:n+1]])
        new_x = x[:, indices[0:n+1]]
        return new_x
Example No. 17
def kfold_cv(X_train, y_train,idx,k):

    kf = StratifiedKFold(y_train,n_folds=k)
    xx=[]
    count=0
    for train_index, test_index in kf:
        count+=1
        X_train_cv, X_test_cv = X_train[train_index,:],X_train[test_index,:]
        gc.collect()
        y_train_cv, y_test_cv = y_train[train_index],y_train[test_index]
        y_pred=np.zeros(X_test_cv.shape[0])
        m=0
         
        for j in range(m):
            clf=xgb_classifier(eta=0.1,min_child_weight=20,col=0.5,subsample=0.7,depth=5,num_round=200,seed=j*77,gamma=0.1)
            y_pred+=clf.train_predict(X_train_cv,(y_train_cv),X_test_cv,y_test=(y_test_cv))
        #y_pred/=m;
        clf=ExtraTreesClassifier(n_estimators=700,max_features= 50,criterion= 'entropy',min_samples_split= 3,
                            max_depth= 60, min_samples_leaf= 4,verbose=1,n_jobs=-1)
        #clf=RandomForestClassifier(n_jobs=-1,n_estimators=100,max_depth=100)
        clf.fit(X_train_cv,(y_train_cv))
        y_pred=clf.predict_proba(X_test_cv).T[1]
        print y_pred.shape
        xx.append(llfun(y_test_cv,(y_pred)))
        ypred=y_pred
        yreal=y_test_cv
        idx=idx[test_index]
        print xx[-1]#,y_pred.shape
        break

    print xx,'average:',np.mean(xx),'std',np.std(xx)
    return ypred,yreal,idx#np.mean(xx)
Example No. 18
def plotImportance(X,y):
	forest = ExtraTreesClassifier(n_estimators=250,
	                              random_state=0)

	forest.fit(X, y)
	importances = forest.feature_importances_
	std = np.std([tree.feature_importances_ for tree in forest.estimators_],
	             axis=0)
	indices = np.argsort(importances)[::-1]
	n=X.shape[1]

	#Print the feature ranking
	#print("Feature ranking:")

	#for f in range(n):
	#    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

	# Plot the feature importances of the forest
	plt.figure(figsize=(20,15))
	plt.title("Feature importances")
	plt.bar(range(n), importances[indices],
	       color="r", yerr=std[indices], align="center")
	plt.xticks(range(n), X.columns[indices],rotation=90)
	plt.xlim([-1, n])
	plt.savefig('featuresel.pdf')
Example No. 19
def train_classifiers(X_data, y_data):
    ############ Linear SVM: 0.908 #############
    clf_LSVM = svm.SVC(kernel = 'linear')
    clf_LSVM.fit(X_data, y_data)
    
    ############ MultinomialNB: 0.875 #############
    clf_MNB = MultinomialNB()
    clf_MNB.fit(X_data, y_data)
    
    ############ Random Forest: 0.910 #############
    clf_RF = RandomForestClassifier(n_estimators=200, criterion='entropy')
    clf_RF.fit(X_data, y_data)
    
    ############ Extra Tree: 0.915 ##################
    clf_ETC = ExtraTreesClassifier(n_estimators=500, max_depth=None, min_samples_split=1, random_state=0)
    clf_ETC.fit(X_data, y_data)
    
    ############ AdaBoost: 0.88 ##################
    clf_Ada = AdaBoostClassifier()
    clf_Ada.fit(X_data, y_data)
    
    ############ rbf SVM: 0.895 #############
    clf_rbf = svm.SVC(C=200, gamma=0.06, kernel='rbf')
    clf_rbf.fit(X_data, y_data)
    
    ############ GradientBoosting: 0.88 #############
    clf_GBC = GradientBoostingClassifier()
    clf_GBC.fit(X_data, y_data)
    
    return clf_LSVM, clf_MNB, clf_RF, clf_ETC, clf_Ada, clf_rbf, clf_GBC    
def get_important_features(Xtrain, Ytrain, n=250, threshold=0.01, verbose=False):
    """ Use entirety of provided X, Y to train random forest

    Arguments
    Xtrain -- Training data
    Ytrain -- Training prediction

    Optional Arguments
    n -- number of ensemble members
    threshold -- threshold of importance above which a feature is relevant
    verbose -- if true, prints results of ranking

    Returns
    ranking -- a ranked list of indices of important features
    """
    # Train and fit tree classifier ensemble
    classifier = ExtraTreesClassifier(n_estimators=n, random_state=0)
    classifier.fit(Xtrain, Ytrain)

    # Compute important features
    importances = classifier.feature_importances_
    std = np.std([tree.feature_importances_ for tree in classifier.estimators_], axis=0)
    indices = np.argsort(importances)[::-1]

    ranking = [[indices[f], importances[indices[f]]] for f in range(Xtrain.shape[1])]
    ranking = filter(lambda r: r[1] >= threshold, ranking)

    if verbose:
        for r in range(len(ranking)):
            print str(r+1) + ". ", ranking[r][0], ranking[r][1]

    return ranking
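# Hedged follow-up sketch (not in the original): one way the returned ranking might be
# used to keep only the important columns; Xtrain is assumed to be a NumPy array.
ranking = get_important_features(Xtrain, Ytrain, n=250, threshold=0.01, verbose=True)
important_idx = [idx for idx, score in ranking]
Xtrain_reduced = Xtrain[:, important_idx]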
Example No. 21
def get_most_important_features(train):
  train = train.drop('ID', 1)
  train_y = train['TARGET']
  train_X = train.drop('TARGET', 1)

  random_forest = RandomForestClassifier(n_estimators=100)
  random_forest.fit(train_X, train_y)

  feater_importance = pd.Series(random_forest.feature_importances_, index=train_X.columns)
  feater_importance.sort_values(inplace=True)
  feater_importance.tail(20).plot(kind='barh', figsize=(15  ,7), title='Feature importance by random forest')

  # plt.savefig("feature_importance.png")

  grad_boosting = GradientBoostingClassifier()
  grad_boosting.fit(train_X, train_y)

  feater_importance = pd.Series(grad_boosting.feature_importances_, index=train_X.columns)
  feater_importance.sort_values(inplace=True)
  feater_importance.tail(20).plot(kind='barh', figsize=(10,7), title='Feature importance by gradient boosting')

  # plt.savefig("feature_importance2.png")

  extra_trees = ExtraTreesClassifier()
  extra_trees.fit(train_X, train_y)

  feater_importance = pd.Series(extra_trees.feature_importances_, index=train_X.columns)
  feater_importance.sort_values(inplace=True)
  feater_importance.tail(20).plot(kind='barh', figsize=(20,7), title='Feature importance by extra trees classifier')
def remove_feature_tree_based(train_X,train_Y):
    '''
    Removes features based on trees - see sklearn:
    http://scikit-learn.org/dev/auto_examples/ensemble/plot_forest_importances.html#example-ensemble-plot-forest-importances-py

    Actually removes based on "importance"
    '''
    forest = ExtraTreesClassifier(n_estimators=1000,
                                  random_state=0)  # compute_importances was removed from scikit-learn; importances are always available

    forest.fit(train_X, train_Y)
    importances = forest.feature_importances_
    std = np.std([tree.feature_importances_ for tree in forest.estimators_],
                  axis=0)
    indices = np.argsort(importances)[::-1]

    x_labels = ['rc1', 'rc2', 'dca1', 'dca2','dcm1', 'dcm2','ace1','ace2','acsc1', 'acsc2', 'acsv1', 'acsv2', 'acss1','acss2', 'acsk1', 'acsk2', 'taca1', 'taca2', 'tdc1', 'tdc2', 'gmin', 'gmean', 'trd','ep111','ep112','ep211', 'ep212', 'ep311','ep312', 'ep411','ep412','ep511','ep512','ep611','ep612','ep121','ep122','ep221', 'ep222', 'ep321','ep322', 'ep421','ep422','ep521','ep522','ep621','ep622']

    # Print the feature ranking
    print "Feature ranking:"

    for f in xrange(46):
        print "%d. feature %s (%f)" % (f + 1, x_labels[indices[f]], importances[indices[f]])

    # Transform the data to have only the features that are important
    # (forest.transform was removed from scikit-learn; SelectFromModel is its replacement)
    from sklearn.feature_selection import SelectFromModel
    x_new = SelectFromModel(forest, threshold="mean", prefit=True).transform(train_X)

    return (forest, x_new)
def FeaturesImportance(trainData, trainLabels):
    forest = ExtraTreesClassifier(n_estimators=250, random_state=0)
    forest.fit(trainData, trainLabels)
    importances = forest.feature_importances_

    indices = np.argsort(importances)[::-1]

    # Print the feature ranking
    print("Feature ranking:")

    for f in range(16):
        print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

    # Plot the feature importances of the forest
    plt.figure()
    plt.title("Feature importances")
    plt.bar(range(16), importances[range(16)], color="r", align="center")
    plt.xticks(range(16), [r'$x_1$', r'$x_2$', r'$x_3$', r'$x_4$', r'$x_5$',
                          r'$x_6$', r'$x_7$', r'$x_8$', r'$x_9$', r'$x_{10}$', 
                          r'$x_{11}$', r'$x_{12}$', r'$x_{13}$', r'$x_{14}$', r'$x_{15}$', 
                          r'$x_{16}$'])
    plt.yticks([0.0, 0.05, 0.10, 0.15, 0.20, 0.25], [r'$0.00$', r'$0.05$', r'$0.10$', r'$0.15$', r'$0.20$', r'$0.25$'])  
    plt.xlabel('Features')
    plt.ylabel('Importance')
    plt.xlim([-1, 16])
    plt.show()
    
    return importances
Example No. 24
def reduceRF(label):
  global x_data_rf_reduced, importantFeatureLocs
  model = ExtraTreesClassifier()
  model.fit(x_data, y_data[:, label])

  # the relative importance of each attribute
  importance = model.feature_importances_
  weight = float(0)
  del importantFeatureLocs[:] # reset
  #print(importance)  

  for ele in np.sort(importance)[::-1]:
    weight += float(ele)
    featureIndex = np.where(importance==ele)
    for loc in featureIndex[0]:
      importantFeatureLocs.append(loc)
  
    if weight > RFThreshold :
      break
  
  # remove duplications
  importantFeatureLocs = list(set(importantFeatureLocs))

  # extracting relevant columns from input data. Note that importantFeatureLocs
  # may be unsorted (since python 'set' is unsorted), so features are extracted
  # in unordered fashion. This info is stored in the softmax model class
  x_data_rf_reduced = x_data[:, importantFeatureLocs]
    def fit(self, X, Y, sample_weight=None):
        from sklearn.ensemble import ExtraTreesClassifier
        from sklearn.feature_selection import SelectFromModel

        num_features = X.shape[1]
        max_features = int(float(self.max_features) * (np.log(num_features) + 1))
        # Use at most half of the features
        max_features = max(1, min(int(X.shape[1] / 2), max_features))
        estimator = ExtraTreesClassifier(
            n_estimators=self.n_estimators,
            criterion=self.criterion,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            bootstrap=self.bootstrap,
            max_features=max_features,
            max_leaf_nodes=self.max_leaf_nodes,
            oob_score=self.oob_score,
            n_jobs=self.n_jobs,
            verbose=self.verbose,
            random_state=self.random_state,
            class_weight=self.class_weight,
        )
        estimator.fit(X, Y, sample_weight=sample_weight)
        self.preprocessor = SelectFromModel(estimator=estimator, threshold="mean", prefit=True)
        return self
def extratreeclassifier(input_file,Output,test_size):
    lvltrace.lvltrace("LVLEntree dans extratreeclassifier split_test")
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    print X_train.shape, X_test.shape
    clf = ExtraTreesClassifier(n_estimators=10)
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    print "Extremely Randomized Trees"
    print "classification accuracy:", metrics.accuracy_score(y_test, y_pred)
    print "precision:", metrics.precision_score(y_test, y_pred)
    print "recall:", metrics.recall_score(y_test, y_pred)
    print "f1 score:", metrics.f1_score(y_test, y_pred)
    print "\n"
    results = Output+"_Extremely_Random_Forest_metrics_test.txt"
    file = open(results, "w")
    file.write("Extremely Random Forest Classifier estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y_test, y_pred))
    file.write("Precision Score: %f\n"%metrics.precision_score(y_test, y_pred))
    file.write("Recall Score: %f\n"%metrics.recall_score(y_test, y_pred))
    file.write("F1 Score: %f\n"%metrics.f1_score(y_test, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y_test)):
        file.write("%f,%f,%i\n"%(y_test[n],y_pred[n],(n+1)))
    file.close()
    title = "Extremely Randomized Trees %f"%test_size
    save = Output + "Extremely_Randomized_Trees_confusion_matrix"+"_%s.png"%test_size
    plot_confusion_matrix(y_test, y_pred,title,save)
    lvltrace.lvltrace("LVLSortie dans extratreeclassifier split_test")
Example No. 27
    def _cascade_layer(self, X, y=None, layer=0):
        n_tree = getattr(self, 'n_cascadeRFtree')
        n_cascadeRF = getattr(self, 'n_cascadeRF')
        min_samples = getattr(self, 'min_samples_cascade')

        prf = RandomForestClassifier(
            n_estimators=100, max_features=8,
            bootstrap=True, criterion="entropy", min_samples_split=20,
            max_depth=None, class_weight='balanced', oob_score=True)
        crf = ExtraTreesClassifier(
            n_estimators=100, max_depth=None,
            bootstrap=True, oob_score=True)

        prf_pred = []
        if y is not None:
            # print('Adding/Training Layer, n_layer={}'.format(self.n_layer))
            for irf in range(n_cascadeRF):
                prf.fit(X, y)
                crf.fit(X, y)
                setattr(self, '_casprf{}_{}'.format(self.n_layer, irf), prf)
                setattr(self, '_cascrf{}_{}'.format(self.n_layer, irf), crf)
                probas = prf.oob_decision_function_
                probas += crf.oob_decision_function_
                prf_pred.append(probas)
        elif y is None:
            for irf in range(n_cascadeRF):
                prf = getattr(self, '_casprf{}_{}'.format(layer, irf))
                crf = getattr(self, '_cascrf{}_{}'.format(layer, irf))
                probas = prf.predict_proba(X)
                probas += crf.predict_proba(X)
                prf_pred.append(probas)

        return prf_pred
Example No. 28
class MyExtraTree(MyClassifier):
    def __init__(self, params=dict()):
        self._params = params
        self._extree = ExtraTreesClassifier(**(self._params))

    def update_params(self, updates):
        self._params.update(updates)
        self._extree = ExtraTreesClassifier(**(self._params))

    def fit(self, Xtrain, ytrain):
        self._extree.fit(Xtrain, ytrain)

    # def predict(self, Xtest, option = None):
    #   return self._extree.predict(Xtest)

    def predict_proba(self, Xtest, option = None):
        return self._extree.predict_proba(Xtest)[:, 1]

    def predict_proba_multi(self, Xtest, option = None):
        return self._extree.predict_proba(Xtest)

    def plt_feature_importance(self, fname_list, f_range = list()):
        importances = self._extree.feature_importances_

        std = np.std([tree.feature_importances_ for tree in self._extree.estimators_], axis=0)
        indices = np.argsort(importances)[::-1]

        fname_array = np.array(fname_list)

        if not f_range:
            f_range = range(indices.shape[0])

        n_f = len(f_range)

        plt.figure()
        plt.title("Extra Tree Feature importances")
        plt.barh(range(n_f), importances[indices[f_range]],
               color="b", xerr=std[indices[f_range]], ecolor='k',align="center")
        plt.yticks(range(n_f), fname_array[indices[f_range]])
        plt.ylim([-1, n_f])
        plt.show()


    def list_feature_importance(self, fname_list, f_range = list(), return_list = False):
        importances = self._extree.feature_importances_
        indices = np.argsort(importances)[::-1]

        print 'Extra tree feature ranking:'

        if not f_range :
            f_range = range(indices.shape[0])

        n_f = len(f_range)

        for i in range(n_f):
            f = f_range[i]
            print '{0:d}. feature[{1:d}]  {2:s}  ({3:f})'.format(f + 1, indices[f], fname_list[indices[f]], importances[indices[f]])

        if return_list:
            return [indices[f_range[i]] for i in range(n_f)]
Example No. 29
def select_with_forest(X, y, n_trees=10, treshold=0.01):
    from sklearn.preprocessing import LabelEncoder
    from sklearn.ensemble import ExtraTreesClassifier
    import pandas as pd
    import numpy as np
    # encode labels (str -> int):
    le = LabelEncoder()
    X = X.copy()
    for col in X.columns:
        le.fit(X[col].unique())
        X[col] = le.transform(X[col])
    # train the classifier:
    forest = ExtraTreesClassifier(criterion="entropy", n_estimators=n_trees)
    forest.fit(X, y)
    print('number of selected features: ', np.sum(forest.feature_importances_ >= treshold))
    # select important features:
    importances = pd.DataFrame()
    importances['predictor name'] = X.columns.tolist()
    importances['importance'] = forest.feature_importances_
    importances = importances.sort_values(by='importance', ascending=False)
    #X2 = forest.transform(X, treshold)
    #labels2 = X.columns[list(forest.feature_importances_>=treshold)]
    #X2 = pd.DataFrame(X2)
    #X2.columns = labels2
    return importances #X2
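# Hedged usage sketch (not in the original): X is assumed to be a DataFrame of categorical
# columns and y the target labels; the thresholds are illustrative.
importances = select_with_forest(X, y, n_trees=50, treshold=0.02)
print(importances.head(10))            # ten most important predictors
selected = importances.loc[importances['importance'] >= 0.02, 'predictor name'].tolist()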
Example No. 30
def algo_fit_cross_validated(training_matrix, target):
    # Build a forest and compute the feature importances
    forest = ExtraTreesClassifier(n_estimators=250,
                                  random_state=0)

    forest.fit(training_matrix, target)
    importances = forest.feature_importances_
    std = np.std([tree.feature_importances_ for tree in forest.estimators_],
                 axis=0)
    indices = np.argsort(importances)[::-1]

    l = list(training_matrix.columns.values)
    for f in range(training_matrix.shape[1]):
        print("%d. feature %d(%s) (%f)" % (f + 1, indices[f], l[indices[f]], importances[indices[f]]))

    ##### Works well ######
    # SVM
    # svm = SVC(kernel="linear", C=0.06)
    # svm.fit(training_matrix, target)
    #
    # scores_svm = cross_validation.cross_val_score(svm, training_matrix, target, cv=5)
    # print("(svm) Accuracy: %0.5f (+/- %0.2f)" % (scores_svm.mean(), scores_svm.std() * 2))
    #
    # return svm
    ##### Works well ######

    # Random Forest
    rf = RandomForestClassifier(n_estimators=1500, max_depth=2, max_features=4)
    scores_rf = cross_validation.cross_val_score(rf, training_matrix, target, cv=5)
    print("(Random Forest) Accuracy: %0.5f (+/- %0.2f)" % (scores_rf.mean(), scores_rf.std() * 2))
    rf.fit(training_matrix, target)
    return rf
Example No. 31
            test_positions = np.take(selected_positions, test_index, axis=0)

            img_train, img_test = input.train_test_images(
                train_positions, test_positions)
            save_rgb(fold_dir + "train.png", img_train, format='png')
            save_rgb(fold_dir + "test.png", img_test, format='png')

            if rotation_oversampling:
                X_train, y_train = input.rotation_oversampling(
                    X_train, y_train)

            X_train = X_train.reshape(len(X_train), -1)
            X_test = X_test.reshape(len(X_test), -1)

            if feature_selection:
                fs = ExtraTreesClassifier(n_estimators=200)
                fs = fs.fit(X_train, y_train)
                model = SelectFromModel(fs, prefit=True)
                X_train, X_test = model.transform(X_train), model.transform(
                    X_test)
                print(X_train.shape)
            else:
                model = None

            print("Size training set", len(X_train))
            print("Size test set", len(X_test))

            if fold_num == 1:
                file.write("Size training set: %d\n" % len(X_train))
                file.write("Size test set: %d\n" % len(X_test))
                file.write("Class distribution:\n")
def classify(algorithm, fname, input_data, label_name, n_cores, random_state):
    train_y = np.array(input_data[label_name])
    input_data = input_data.drop('ID', axis=1)
    training_x = input_data.drop(label_name, axis=1)

    le = preprocessing.LabelEncoder()
    le.fit(train_y)
    train_y = le.transform(train_y)

    cv_metrics = pd.DataFrame()

    # 10-fold cross validation
    predicted_n_actual_pd = pd.DataFrame(
        columns=['ID', 'predicted', 'actual', 'fold'])

    kf = KFold(n_splits=10, shuffle=True, random_state=random_state)
    fold = 1

    for train, test in kf.split(training_x):
        # number of train and test instances is based on training_x.

        train_cv_features, test_cv_features, train_cv_label, test_cv_label = training_x.iloc[
            train], training_x.iloc[test], train_y[train], train_y[test]

        if algorithm == 'GB':
            temp_classifier = GradientBoostingClassifier(n_estimators=300,
                                                         random_state=1)

        elif (algorithm == 'RF'):
            temp_classifier = RandomForestClassifier(n_estimators=300,
                                                     random_state=1,
                                                     n_jobs=n_cores)

        elif (algorithm == 'M5P'):
            temp_classifier = ExtraTreesClassifier(n_estimators=300,
                                                   random_state=1,
                                                   n_jobs=n_cores)

        elif (algorithm == 'KNN'):
            temp_classifier = KNeighborsClassifier(n_neighbors=3,
                                                   n_jobs=n_cores)

        elif (algorithm == 'NEURAL'):
            temp_classifier = MLPClassifier(random_state=1)

        temp_classifier.fit(train_cv_features, train_cv_label)
        temp_prediction = temp_classifier.predict(test_cv_features)

        predicted_n_actual_pd = predicted_n_actual_pd.append(pd.DataFrame({
            'ID':
            test,
            'actual':
            test_cv_label,
            'predicted':
            temp_prediction,
            'fold':
            fold
        }),
                                                             ignore_index=True,
                                                             sort=True)

        fold += 1

    try:
        roc_auc = round(
            roc_auc_score(predicted_n_actual_pd['actual'].to_list(),
                          predicted_n_actual_pd['predicted'].to_list()), 3)

    except ValueError:
        roc_auc = 0.0

    matthews = round(
        matthews_corrcoef(predicted_n_actual_pd['actual'].to_list(),
                          predicted_n_actual_pd['predicted'].to_list()), 3)
    balanced_accuracy = round(
        balanced_accuracy_score(predicted_n_actual_pd['actual'].to_list(),
                                predicted_n_actual_pd['predicted'].to_list()),
        3)
    f1 = round(
        f1_score(predicted_n_actual_pd['actual'].to_list(),
                 predicted_n_actual_pd['predicted'].to_list()), 3)

    try:
        tn, fp, fn, tp = confusion_matrix(
            predicted_n_actual_pd['actual'].to_list(),
            predicted_n_actual_pd['predicted'].to_list()).ravel()

    except:
        tn, fp, fn, tp = 0, 0, 0, 0

    cv_metrics = cv_metrics.append(pd.DataFrame(np.column_stack(['cv',roc_auc, matthews,\
        balanced_accuracy, f1, tn, fp, fn, tp]),\
        columns=['type','roc_auc','matthew','bacc','f1','TN','FP','FN','TP']), ignore_index=True, sort=True)

    cv_metrics = cv_metrics.round(3)
    cv_metrics = cv_metrics.astype({
        'TP': 'int64',
        'TN': 'int64',
        'FP': 'int64',
        'FN': 'int64'
    })
    cv_metrics = cv_metrics[[
        'type', 'matthew', 'f1', 'bacc', 'roc_auc', 'TP', 'TN', 'FP', 'FN'
    ]]

    predicted_n_actual_pd['predicted'] = le.inverse_transform(
        predicted_n_actual_pd['predicted'].to_list())
    predicted_n_actual_pd['actual'] = le.inverse_transform(
        predicted_n_actual_pd['actual'].to_list())
    fname_predicted_n_actual_pd = os.path.join(
        output_result_dir, 'cv_{}_predited_data.csv'.format(algorithm))
    predicted_n_actual_pd['ID'] = predicted_n_actual_pd['ID'] + 1
    predicted_n_actual_pd = predicted_n_actual_pd.sort_values(by=['ID'])
    predicted_n_actual_pd.to_csv(fname_predicted_n_actual_pd, index=False)

    return cv_metrics
Example No. 33
def test_importances_asymptotic():
    # Check whether variable importances of totally randomized trees
    # converge towards their theoretical values (See Louppe et al,
    # Understanding variable importances in forests of randomized trees, 2013).

    def binomial(k, n):
        return 0 if k < 0 or k > n else comb(int(n), int(k), exact=True)

    def entropy(samples):
        n_samples = len(samples)
        entropy = 0.

        for count in np.bincount(samples):
            p = 1. * count / n_samples
            if p > 0:
                entropy -= p * np.log2(p)

        return entropy

    def mdi_importance(X_m, X, y):
        n_samples, n_features = X.shape

        features = list(range(n_features))
        features.pop(X_m)
        values = [np.unique(X[:, i]) for i in range(n_features)]

        imp = 0.

        for k in range(n_features):
            # Weight of each B of size k
            coef = 1. / (binomial(k, n_features) * (n_features - k))

            # For all B of size k
            for B in combinations(features, k):
                # For all values B=b
                for b in product(*[values[B[j]] for j in range(k)]):
                    mask_b = np.ones(n_samples, dtype=np.bool)

                    for j in range(k):
                        mask_b &= X[:, B[j]] == b[j]

                    X_, y_ = X[mask_b, :], y[mask_b]
                    n_samples_b = len(X_)

                    if n_samples_b > 0:
                        children = []

                        for xi in values[X_m]:
                            mask_xi = X_[:, X_m] == xi
                            children.append(y_[mask_xi])

                        imp += (coef
                                * (1. * n_samples_b / n_samples)  # P(B=b)
                                * (entropy(y_) -
                                   sum([entropy(c) * len(c) / n_samples_b
                                        for c in children])))

        return imp

    data = np.array([[0, 0, 1, 0, 0, 1, 0, 1],
                     [1, 0, 1, 1, 1, 0, 1, 2],
                     [1, 0, 1, 1, 0, 1, 1, 3],
                     [0, 1, 1, 1, 0, 1, 0, 4],
                     [1, 1, 0, 1, 0, 1, 1, 5],
                     [1, 1, 0, 1, 1, 1, 1, 6],
                     [1, 0, 1, 0, 0, 1, 0, 7],
                     [1, 1, 1, 1, 1, 1, 1, 8],
                     [1, 1, 1, 1, 0, 1, 1, 9],
                     [1, 1, 1, 0, 1, 1, 1, 0]])

    X, y = np.array(data[:, :7], dtype=np.bool), data[:, 7]
    n_features = X.shape[1]

    # Compute true importances
    true_importances = np.zeros(n_features)

    for i in range(n_features):
        true_importances[i] = mdi_importance(i, X, y)

    # Estimate importances with totally randomized trees
    clf = ExtraTreesClassifier(n_estimators=500,
                               max_features=1,
                               criterion="entropy",
                               random_state=0).fit(X, y)

    importances = sum(tree.tree_.compute_feature_importances(normalize=False)
                      for tree in clf.estimators_) / clf.n_estimators

    # Check correctness
    assert_almost_equal(entropy(y), sum(importances))
    assert_less(np.abs(true_importances - importances).mean(), 0.01)
Example No. 34
           max_iter=-1,
           probability=False,
           random_state=None,
           shrinking=True,
           tol=0.001,
           verbose=False)
#knn = KNeighborsClassifier(algorithm='brute',n_neighbors=3,metric='mahalanobis')
nn = MLPClassifier(alpha=0.0001,
                   hidden_layer_sizes=(500, ),
                   random_state=None,
                   max_iter=500,
                   activation='logistic',
                   solver='adam')
grad_boost = GradientBoostingClassifier(n_estimators=500, learning_rate=1)
extrat = ExtraTreesClassifier(n_estimators=50,
                              max_depth=None,
                              class_weight='balanced')

clf_array = [rf, dtree, nn, svml, extrat, grad_boost]

eclf = VotingClassifier(estimators=[('Random Forest', rf),
                                    ('Decision Tree', dtree), ('NN', nn),
                                    ('GRADIENT', grad_boost),
                                    ('EXTRAT', extrat)])  #('NN',nn),

for clf_array, label in zip([rf, dtree, svml, nn, grad_boost, extrat, eclf], [
        'Random Forest', 'Decision Tree', 'SVML', 'NN', 'GRADIENT', 'EXTRAT',
        'Ensemble'
]):  #'NN',
    scores = cross_val_score(clf_array,
                             training_samples,
# Generate a list of all combinations of categories, up to a max length
category_subsets = []
max_classes = 5
for L in range(1, max_classes + 1):
  for subset in itertools.combinations(categories, L):
    category_subsets.append(subset)
# Now make a look-up table for the index corresponding to a tuple of categories
subset_index = {}
for i, category_subset in enumerate(category_subsets):
    subset_index[category_subset] = i

if do_train_coarse:
    # Coarse classifier
    coarse_classifier = Pipeline([
            ('features', CountVectorizer(ngram_range=(1,2))),
            ('classifier', ExtraTreesClassifier(max_depth=150, random_state=88,
                               n_estimators=200, n_jobs=cpu_count()-1)),
    ])
    # Fit coarse classifier
    print 'Fitting coarse classifier'
    coarse_classifier.fit(train.question, train.coarse_label)


if do_train_fine:
# Fine classifiers
    fine_classifiers = []
    for _ in range(len(category_subsets)):
        fine_classifier = Pipeline([
                ('features', CountVectorizer(ngram_range=(1,2))),
                ('classifier', ExtraTreesClassifier(max_depth=150, random_state=88*2,
                                   n_estimators=400, n_jobs=cpu_count()-1)),
        ])
Example No. 36
from sklearn.model_selection import train_test_split
X_train_val, X_test, y_train_val, y_test = train_test_split(
    mnist.data, mnist.target, test_size=10000, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=10000, random_state=42)

# Exercise: Then train various classifiers, such as a Random Forest classifier,
# an Extra-Trees classifier, and an SVM.
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier

random_forest_clf = RandomForestClassifier(random_state=42)
extra_trees_clf = ExtraTreesClassifier(random_state=42)
svm_clf = LinearSVC(random_state=42)
mlp_clf = MLPClassifier(random_state=42)

estimators = [random_forest_clf, extra_trees_clf, svm_clf, mlp_clf]
for estimator in estimators:
    print("Training the", estimator)
    estimator.fit(X_train, y_train)

[estimator.score(X_val, y_val) for estimator in estimators]
# Out[3]: [0.9467, 0.9512, 0.8327, 0.9592]

# The linear SVM is far outperformed by the other classifiers.
# However, let's keep it for now since it may improve the voting classifier's performance.

# Exercise: Next, try to combine them into an ensemble that outperforms them all on the validation set,
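# The snippet is cut off above; a hedged sketch (not the original author's code) of one
# common way to finish the exercise with a voting ensemble over the estimators defined earlier:
from sklearn.ensemble import VotingClassifier

named_estimators = [
    ("random_forest_clf", random_forest_clf),
    ("extra_trees_clf", extra_trees_clf),
    ("svm_clf", svm_clf),
    ("mlp_clf", mlp_clf),
]
voting_clf = VotingClassifier(named_estimators)  # hard voting by default
voting_clf.fit(X_train, y_train)
print(voting_clf.score(X_val, y_val))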
Example No. 37
from sklearn import datasets
from sklearn import metrics
from sklearn.ensemble import ExtraTreesClassifier
import pandas as pd
from sklearn.cross_validation import train_test_split

data = pd.read_csv('Xtrain.csv', sep=',', header=None)
dataset = data.values
header = dataset[0,1:dataset.shape[1]]
dataset = dataset[1:dataset.shape[0],:]
''' Split data into training and testing '''
X = dataset[:,1:dataset.shape[1]]
y = dataset[:,0]
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)

# fit an Extra Trees model to the data
model = ExtraTreesClassifier()
model.fit(X_train, y_train)
# display the relative importance of each attribute
print(model.feature_importances_)
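# Hedged follow-up sketch (not in the original): pair each importance with its column name
# from the CSV header parsed above and list the strongest predictors first.
ranked = sorted(zip(header, model.feature_importances_), key=lambda t: t[1], reverse=True)
for name, score in ranked[:10]:
    print(name, score)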
Example No. 38
             LogisticRegression(C=0.1, penalty='l2', solver='lbfgs',
                                n_jobs=-1))),
        ("nb", OneVsRestClassifier(BernoulliNB(alpha=5.0))),
        ("rf",
         OneVsRestClassifier(
             RandomForestClassifier(n_estimators=300,
                                    max_depth=10,
                                    min_samples_split=5,
                                    n_jobs=-1))),
        ("xgb",
         OneVsRestClassifier(
             XGBClassifier(n_estimators=150, max_depth=8, n_jobs=8))),
        ("et",
         OneVsRestClassifier(
             ExtraTreesClassifier(n_estimators=300,
                                  max_depth=10,
                                  min_samples_split=10,
                                  n_jobs=-1))),
        ("ensemble", OneVsRestClassifier(ensemble)),
        #("svm", SVC(C=100, gamma=0.0001, probability=True)),
    ]

    results = {}

    X_train = feature_extractor.fit_transform(Xr_train, y_train['label_pa'])
    X_test = feature_extractor.transform(Xr_test)

    for name, classifier in models:
        print(name)
        results[name] = {}

        cv = StratifiedKFold(n_splits=5, random_state=42)
Example No. 39
    def randomised_search(self):
        print_to_consol('Running randomized search to find best classifier')

        #create the decision forest
        clf1 = ExtraTreesClassifier(random_state=20,
                                    class_weight='balanced',
                                    max_features=self.numf,
                                    max_depth=1)

        logging.info(f'Initialised classifier \n')

        #set up randomized search
        param_dict = {
            'criterion': ['gini', 'entropy'],
            'n_estimators': randint(100,
                                    10000),  #number of base estimators to use
            'min_samples_split': randint(2, 20),
            'min_samples_leaf': randint(1, 20),
            'max_leaf_nodes': randint(10, 20)
        }

        logging.info(
            f'Following parameters will be explored in randomized search \n'
            f'{param_dict} \n')

        #building and running the randomized search
        rand_search = RandomizedSearchCV(clf1,
                                         param_dict,
                                         random_state=5,
                                         cv=self.cv,
                                         n_iter=self.numc,
                                         scoring='accuracy',
                                         n_jobs=-1)

        rand_search_fitted = rand_search.fit(self.X_train_scaled, self.y_train)

        best_parameters = rand_search_fitted.best_params_
        best_scores = rand_search_fitted.best_score_

        logging.info(
            f'Running randomised search for best parameters of classifier \n'
            f'Best parameters found: {best_parameters} \n'
            f'Best accuracy scores found: {best_scores} \n')

        self.model = rand_search_fitted.best_estimator_

        datestring = datetime.strftime(datetime.now(), '%Y%m%d_%H%M')
        joblib.dump(
            self.model,
            os.path.join(self.directory,
                         'best_predictor_' + datestring + '.pkl'))

        logging.info(f'Writing best classifier to disk in {self.directory} \n')

        print_to_consol(
            'Getting 95% confidence interval for uncalibrated classifier')

        alpha, upper, lower = get_confidence_interval(
            self.X_train_scaled, self.y_train, self.X_test_scaled, self.y_test,
            self.model, self.directory, self.bootiter, 'uncalibrated')

        logging.info(f'{alpha}% confidence interval {upper}% and {lower}% \n'
                     f'for uncalibrated classifier. \n')

        print_to_consol('Getting feature importances for best classifier')

        best_clf_feat_import = self.model.feature_importances_
        best_clf_feat_import_sorted = sorted(zip(best_clf_feat_import,
                                                 self.X_train_scaled.columns),
                                             reverse=True)

        logging.info(
            f'Feature importances for best classifier {best_clf_feat_import_sorted} \n'
        )

        all_clf_feat_import_mean = np.mean(
            [tree.feature_importances_ for tree in self.model.estimators_],
            axis=0)
        all_clf_feat_import_mean_sorted = sorted(zip(
            all_clf_feat_import_mean, self.X_train_scaled.columns),
                                                 reverse=True)

        print_to_consol('Plotting feature importances for best classifier')

        feature_importances_best_estimator(best_clf_feat_import_sorted,
                                           self.directory)
        logging.info(
            f'Plotting feature importances for best classifier in decreasing order \n'
        )
        feature_importances_error_bars(self.model, self.X_train_scaled.columns,
                                       self.directory)
        logging.info(
            f'Plotting feature importances for best classifier with errorbars \n'
        )
Example No. 40
     'name': 'Ridge Classifier'
 },
 'GradientBoostingClassifier': {
     'model': GradientBoostingClassifier(max_features=2),
     'name': 'Gradient Boost'
 },
 'SVC': {
     'model': SVC(),
     'name': 'SVC'
 },
 'BaggingClassifier': {
     'model': BaggingClassifier(),  #base_estimator = LinearRegression()),
     'name': 'Bagging Classifier'
 },
 'ExtraTreesClassifier': {
     'model': ExtraTreesClassifier(),
     'name': 'Extra Trees Classifier'
 },
 'KNeighborsClassifier': {
     'model': KNeighborsClassifier(),
     'name': 'K Neighbors Classifier'
 },
 'DecisionTreeClassifier': {
     'model': DecisionTreeClassifier(),
     'name': 'Decision Tree Classifier'
 },
 'AdaBoostClassifier': {
     'model': AdaBoostClassifier(),  #base_estimator = LinearRegression()),
     'name': 'AdaBoost'
 },
 'LogisticRegression': {
train_x = enc.fit_transform(df)

test_y = data_2015['delay'] >= 15
df = data_2015.drop('delay', axis=1)
df['carrier'] = pd.factorize(df['carrier'])[0]
df['dest'] = pd.factorize(df['dest'])[0]
test_x = enc.transform(df)

print train_x.shape

from sklearn.ensemble import ExtraTreesClassifier

# Create an extremely randomized trees (ExtraTrees) classifier with 50 trees
clf_etc = ExtraTreesClassifier(n_estimators=50,
                               max_depth=None,
                               min_samples_split=1,
                               random_state=0,
                               n_jobs=-1)
clf_etc.fit(train_x.toarray(), train_y)

# Evaluate on test set
pr = clf_etc.predict(test_x.toarray())

# print results
cm = confusion_matrix(test_y, pr)
print "<-------  ExtraTreesClassifier -------->"
print "Confusion matrix:"
print pd.DataFrame(cm)
report_svm = precision_recall_fscore_support(list(test_y),
                                             list(pr),
                                             average='binary')
Example No. 42
            columns=["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    store_csv(adjusted, name + ".csv")


build_audit(DecisionTreeClassifier(random_state=13, min_samples_leaf=2),
            "DecisionTreeAudit",
            compact=False)
build_audit(
    BaggingClassifier(DecisionTreeClassifier(random_state=13,
                                             min_samples_leaf=5),
                      random_state=13,
                      n_estimators=3,
                      max_features=0.5), "DecisionTreeEnsembleAudit")
build_audit(DummyClassifier(strategy="most_frequent"), "DummyAudit")
build_audit(ExtraTreesClassifier(random_state=13, min_samples_leaf=5),
            "ExtraTreesAudit")
build_audit(
    GradientBoostingClassifier(random_state=13, loss="exponential", init=None),
    "GradientBoostingAudit")
build_audit(
    OptimalLGBMClassifier(objective="binary",
                          n_estimators=37,
                          num_iteration=17), "LGBMAudit")
build_audit(LinearDiscriminantAnalysis(solver="lsqr"),
            "LinearDiscriminantAnalysisAudit")
build_audit(
    LogisticRegression(multi_class="multinomial",
                       solver="newton-cg",
                       max_iter=500), "MultinomialLogisticRegressionAudit")
build_audit(LogisticRegressionCV(multi_class="ovr"),
Exemplo n.º 43
0
def get_model_from_name(model_name, training_params=None):

    # For Keras
    epochs = 250
    if os.environ.get('is_test_suite',
                      0) == 'True' and model_name[:12] == 'DeepLearning':
        print(
            'Heard that this is the test suite. Limiting number of epochs, which will increase training speed dramatically at the expense of model accuracy'
        )
        epochs = 30

    all_model_params = {
        'LogisticRegression': {
            'n_jobs': -2
        },
        'RandomForestClassifier': {
            'n_jobs': -2
        },
        'ExtraTreesClassifier': {
            'n_jobs': -1
        },
        'AdaBoostClassifier': {
            'n_estimators': 10
        },
        'SGDClassifier': {
            'n_jobs': -1
        },
        'Perceptron': {
            'n_jobs': -1
        },
        'LinearSVC': {
            'dual': False
        },
        'LinearRegression': {
            'n_jobs': -2
        },
        'RandomForestRegressor': {
            'n_jobs': -2
        },
        'LinearSVR': {
            'dual': False,
            'loss': 'squared_epsilon_insensitive'
        },
        'ExtraTreesRegressor': {
            'n_jobs': -1
        },
        'MiniBatchKMeans': {
            'n_clusters': 8
        },
        'GradientBoostingRegressor': {
            'presort': False,
            'learning_rate': 0.05,
            'warm_start': True
        },
        'GradientBoostingClassifier': {
            'presort': False,
            'learning_rate': 0.05,
            'warm_start': True
        },
        'SGDRegressor': {
            'shuffle': False
        },
        'PassiveAggressiveRegressor': {
            'shuffle': False
        },
        'AdaBoostRegressor': {
            'n_estimators': 10
        },
        'XGBRegressor': {
            'nthread': -1,
            'n_estimators': 200
        },
        'XGBClassifier': {
            'nthread': -1,
            'n_estimators': 200
        },
        'LGBMRegressor': {
            'n_estimators': 2000,
            'learning_rate': 0.05,
            'num_leaves': 8,
            'lambda_l2': 0.001
        },
        'LGBMClassifier': {
            'n_estimators': 2000,
            'learning_rate': 0.05,
            'num_leaves': 8,
            'lambda_l2': 0.001
        },
        'DeepLearningRegressor': {
            'epochs': epochs,
            'batch_size': 50,
            'verbose': 2
        },
        'DeepLearningClassifier': {
            'epochs': epochs,
            'batch_size': 50,
            'verbose': 2
        },
        'CatBoostRegressor': {},
        'CatBoostClassifier': {}
    }

    model_params = all_model_params.get(model_name, None)
    if model_params is None:
        model_params = {}

    if training_params is not None:
        print('Now using the model training_params that you passed in:')
        print(training_params)
        # Overwrite our stock params with what the user passes in (i.e., if the user wants 10,000 trees, we will let them do it)
        model_params.update(training_params)
        print(
            'After overwriting our defaults with your values, here are the final params that will be used to initialize the model:'
        )
        print(model_params)

    model_map = {
        # Classifiers
        'LogisticRegression': LogisticRegression(),
        'RandomForestClassifier': RandomForestClassifier(),
        'RidgeClassifier': RidgeClassifier(),
        'GradientBoostingClassifier': GradientBoostingClassifier(),
        'ExtraTreesClassifier': ExtraTreesClassifier(),
        'AdaBoostClassifier': AdaBoostClassifier(),
        'SGDClassifier': SGDClassifier(),
        'Perceptron': Perceptron(),
        'PassiveAggressiveClassifier': PassiveAggressiveClassifier(),
        'LinearSVC': LinearSVC(),

        # Regressors
        'LinearRegression': LinearRegression(),
        'RandomForestRegressor': RandomForestRegressor(),
        'Ridge': Ridge(),
        'LinearSVR': LinearSVR(),
        'ExtraTreesRegressor': ExtraTreesRegressor(),
        'AdaBoostRegressor': AdaBoostRegressor(),
        'RANSACRegressor': RANSACRegressor(),
        'GradientBoostingRegressor': GradientBoostingRegressor(),
        'Lasso': Lasso(),
        'ElasticNet': ElasticNet(),
        'LassoLars': LassoLars(),
        'OrthogonalMatchingPursuit': OrthogonalMatchingPursuit(),
        'BayesianRidge': BayesianRidge(),
        'ARDRegression': ARDRegression(),
        'SGDRegressor': SGDRegressor(),
        'PassiveAggressiveRegressor': PassiveAggressiveRegressor(),

        # Clustering
        'MiniBatchKMeans': MiniBatchKMeans()
    }

    if xgb_installed:
        model_map['XGBClassifier'] = XGBClassifier()
        model_map['XGBRegressor'] = XGBRegressor()

    if lgb_installed:
        model_map['LGBMRegressor'] = LGBMRegressor()
        model_map['LGBMClassifier'] = LGBMClassifier()

    if catboost_installed:
        model_map['CatBoostRegressor'] = CatBoostRegressor(
            calc_feature_importance=True)
        model_map['CatBoostClassifier'] = CatBoostClassifier(
            calc_feature_importance=True)

    if keras_installed:
        model_map['DeepLearningClassifier'] = KerasClassifier(
            build_fn=make_deep_learning_classifier)
        model_map['DeepLearningRegressor'] = KerasRegressor(
            build_fn=make_deep_learning_model)

    try:
        model_without_params = model_map[model_name]
    except KeyError as e:
        print(
            'It appears you are trying to use a library that is not available when we try to import it, or using a value for model_names that we do not recognize'
        )
        raise (e)
    model_with_params = model_without_params.set_params(**model_params)

    return model_with_params
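# Illustrative call of get_model_from_name (the override values below are
# arbitrary): the stock ExtraTreesClassifier defaults are merged with the
# training_params passed in.
clf = get_model_from_name('ExtraTreesClassifier',
                          training_params={'n_estimators': 500, 'n_jobs': -1})
print(clf)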
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier
import pickle
#%%
comment_start = 0
comment_end = 50000
matrix_size = 5000
#%% Compare the other classification methods as well and pick the most accurate classifier.
models=[GaussianNB(),
       RandomForestClassifier(n_estimators=100),
       KNeighborsClassifier(n_neighbors=5),
       DecisionTreeClassifier(),
       SVC(gamma='scale'),
       GradientBoostingClassifier(),
       LogisticRegression(multi_class="auto", solver="liblinear"),
       ExtraTreesClassifier(n_estimators=100),
       BaggingClassifier()]

def best_model(models, show_metrics=False):
        print("INFO: Finding Accuracy Best Classifier...", end="\n\n")
        best_clf=None
        best_acc=0
        for clf in models:
            clf.fit(x_train, y_train)
            y_pred=clf.predict(x_test)
            acc=metrics.accuracy_score(y_test, y_pred)
            print(clf.__class__.__name__, end=" ")
            print("Accuracy: {:.3f}".format(acc))

            if best_acc<acc:
                best_acc=acc
Exemplo n.º 45
0
def compute_score(X_train, y_train, X_test, y_test, cat_indicator, n_jobs, orig_timeout):
    trees = 100
    max_iter = 1000
    if len(X_train) >= 100000:
        trees = 10
        max_iter = 100

    print("Start!")
    timeout = orig_timeout
    start = timer()
 
    (X_train, X_test, cat_indicator) = reduce_dimensionality(X_train, X_test, cat_indicator)

    classifiers = [BernoulliNB(),
                   LinearDiscriminantAnalysis(),
                   LogisticRegression(random_state=1),
                   AdaBoostClassifier(random_state=1),
                   LinearSVC(max_iter=max_iter, random_state=1),
                   ExtraTreesClassifier(random_state=1, n_estimators=trees),
                   RandomForestClassifier(random_state=1, n_estimators=trees),
                   BaggingClassifier(random_state=1, n_estimators=10),
                   MLPClassifier(random_state=1, early_stopping=True),
                   GradientBoostingClassifier(max_features=5, random_state=1, n_estimators=10)]
    
    model_steps = [SimpleImputer(strategy='median'), RobustScaler()]
    ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

    cats = []
    rows = len(X_train)
    # use rule of thumb to exclude categorical atts with high cardinality for one-hot-encoding
    max_num_cols = math.log(rows, 2)
    if rows > 100000:
        max_num_cols = max_num_cols/4
    
    # Iterate over all categorical attributes
    for i in range(len(cat_indicator)):
        if cat_indicator[i] is True:
            arity = len(X_train.iloc[:,i].unique())
            if arity <= max_num_cols:
                cats.append(i)

    if len(cats) > 0:
        start1=timer()
        X_train.reset_index(drop=True,inplace=True)
        X_object = X_train.iloc[:,cats]
        codes = ohe.fit_transform(X_object)
        X_train = pd.concat([X_train.drop(X_train.columns[cats],axis=1),
               pd.DataFrame(codes).astype(int)], axis=1)
        end1=timer()
    
    for m in model_steps:
        X_train = m.fit_transform(X_train)

    y_train = pd.DataFrame(y_train)
    X_train = pd.DataFrame(X_train)

    num_atts = X_train.shape[1]
    if num_atts <= 50 and rows <= 10000:
        classifiers.append(KNeighborsClassifier(n_neighbors=10))

    # For ensembles
    if num_atts >= 500:
        classifiers[5].max_features="log2"
        classifiers[6].max_features="log2"
        classifiers[7].max_features=0.8
    classifiers[9].max_features=min(5, num_atts) 

    # For bagging
    if num_atts < 100:
        if rows <= 10000:
            classifiers[7].n_estimators = 100
        elif rows <= 50000:
            classifiers[7].n_estimators = 50
        else:
            classifiers[7].n_estimators = 10

    async_message_thread = Pool((int)(n_jobs))
    results = [async_message_thread.apply_async(score_solution, (X_train, y_train, c)) for c in classifiers]
    index = 0
    scores = []

    end = timer()
    time_used = end - start
    timeout = timeout - time_used
    print("time remaining = ", timeout)
    for r in results:
        try:
            start_solution = timer()
            score = r.get(timeout=timeout)
            scores.append(score)
            end_solution = timer()
            time_used = end_solution - start_solution
            timeout = timeout - time_used
            if timeout <= 0:
                 timeout = 3
        except TimeoutError:
            timeout = 1
        except:
            print(sys.exc_info()[0])
            print("Solution terminated: ", classifiers[index])
            print(X_train.shape)
            scores.append(-1)
            end_solution = timer()
            time_used = end_solution - start_solution
            timeout = timeout - time_used
            if timeout <= 0:
                 timeout = 1
        index = index + 1

    pca = None
    RFpca = None
    print("time remaining = ", timeout)
    if timeout >= 10 and len(X_train) < 100000:
        from sklearn.decomposition import PCA
        start_solution = timer()
        n_comp = min(10, X_train.shape[1])
        pca = PCA(n_components=n_comp)
        Xpca = pca.fit_transform(X_train)
        end_solution = timer()
        time_used = end_solution - start_solution
        print("PCA = ", time_used)
        RFpca = RandomForestClassifier(random_state=1)
        score = score_solution(pd.DataFrame(Xpca), y_train, RFpca)
        scores.append(score)
        classifiers.append(RFpca)
    else:
        classifiers.append(None)
        scores.append(-1)

    timeout = timeout - time_used
    bagged_trees = classifiers[7].n_estimators
    while timeout > 0.1 * orig_timeout:
        trees = trees + 100
        print("Trying trees = ", trees)
        classifiers.append(ExtraTreesClassifier(random_state=1, n_estimators=trees))
        classifiers.append(RandomForestClassifier(random_state=1, n_estimators=trees))
        bagged_trees = bagged_trees + 10
        classifiers.append(BaggingClassifier(random_state=1, max_features=classifiers[7].max_features, n_estimators=bagged_trees))
        
        results = [async_message_thread.apply_async(score_solution, (X_train, y_train, c)) for c in classifiers[10:13]]
        for r in results:
            try:
                start_solution = timer()
                score = r.get(timeout=timeout)
                scores.append(score)
                end_solution = timer()
                time_used = end_solution - start_solution
                timeout = timeout - time_used
                if timeout <= 0:
                    timeout = 1
            except TimeoutError:
                timeout = 1
            except:
                print(sys.exc_info()[0])
                print("Solution terminated: ")
                scores.append(-1)
                end_solution = timer()
                time_used = end_solution - start_solution
                timeout = timeout - time_used
                if timeout <= 0:
                    timeout = 1
        if trees > 1000:
            break

    print(scores)
    # Sort solutions by their scores and rank them
    sorted_x = np.argsort(scores)
    best_model = None
    bestindex = sorted_x[len(scores)-1]

    if bestindex == 10:
        # Best is PCA-RF model
        best_model = RFpca
        best_model.fit(Xpca, y_train)
        model_steps.append(pca)
        cl = "pca+rf"
    else:
        best_model = classifiers[bestindex]
        print(best_model)
        best_model.fit(X_train, y_train)
        cl = type(best_model).__name__

    if len(cats) > 0:
       # OHE
        X_test.reset_index(drop=True,inplace=True)
        X_object = X_test.iloc[:,cats]
        codes = ohe.transform(X_object)
        X_test = pd.concat([X_test.drop(X_test.columns[cats],axis=1),
               pd.DataFrame(codes).astype(int)], axis=1)

    for m in model_steps:
        X_test = m.transform(X_test)

    y_hat = best_model.predict(X_test)
    best = accuracy_score(y_test, y_hat)
    #for c in classifiers:
    #    c.fit(X_train, y_train)
    #    y_hat = c.predict(X_test)
    #    best1 = accuracy_score(y_test, y_hat)
    #    print(c)
    #    print(best1)
    return (best, len(X_train.columns), cl)
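# score_solution() is referenced above but not defined in this excerpt. A
# plausible sketch (an assumption, not the original implementation) is a small
# cross-validated accuracy score:
from sklearn.model_selection import cross_val_score

def score_solution_sketch(X, y, clf, folds=3):
    # Mean accuracy over k folds; the caller treats -1 as "failed".
    return cross_val_score(clf, X, y.values.ravel(), cv=folds, scoring='accuracy').mean()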
Exemplo n.º 46
0
    learning_rate=0.75)
model2.fit(train_X, train_y)
model3 = RandomForestClassifier(n_jobs=-1,
     n_estimators=500,
     warm_start=True, 
     #'max_features': 0.2,
     max_depth=6,
     min_samples_leaf=2,
     max_features='sqrt',
     verbose=0

)
model3.fit(train_X, train_y)
model4 = ExtraTreesClassifier(n_jobs=-1,
    n_estimators=500,
    #max_features=0.5,
    max_depth=8,
    min_samples_leaf=2,
    verbose=0)
model4.fit(train_X, train_y)
model5 = SVC(kernel='linear',
    C=0.025)
model5.fit(train_X, train_y)

train_X1 = model1.predict(train_X)
train_X2 = model2.predict(train_X)
train_X3 = model3.predict(train_X)
train_X4 = model4.predict(train_X)
train_X5 = model5.predict(train_X)

train_X1 = train_X1[:, np.newaxis]
train_X2 = train_X2[:, np.newaxis]
Exemplo n.º 47
0
def fun(in_road):  # ok
    start = time.time()
    index = []
    # Get the total number of columns in the csv file
    col_num = get_col.getCol(in_road)
    data_dimension = col_num - 1

    # Load the dataset
    dataset = loadtxt(in_road, delimiter=",", skiprows=1)
    print(type(dataset))

    # split data into x and y
    x = dataset[:, 0:data_dimension]  # x[:, m:n] takes columns m to n-1 of every row (left-inclusive, right-exclusive)
    y = dataset[:, data_dimension]

    random_s = [8, 20, 40, 100, 200, 1000]  # run with several seeds, then vote on which features to keep
    for rs in random_s:
        # Split the dataset into training and test sets
        x_train, x_test, y_train, y_test = train_test_split(x,
                                                            y,
                                                            test_size=0.33,
                                                            random_state=rs)

        print("-----------------XGBoost-----------------")

        # Fit the XGBoost model
        model1 = XGBClassifier(
            learning_rate=0.1,
            n_estimators=1000,  # number of trees -- build XGBoost with 1000 trees
            max_depth=5,  # tree depth
            min_child_weight=1,  # minimum summed instance weight in a leaf
            gamma=0.,  # penalty on the number of leaf nodes
            subsample=0.8,  # randomly sample 80% of the rows for each tree
            colsample_bytree=0.8,  # randomly sample 80% of the features for each tree
            objective='reg:logistic',  # loss function
            scale_pos_weight=1,  # compensate for class imbalance
            random_state=27  # random seed
        )
        model1.fit(x_train, y_train)

        # Rank the strongest features
        importance = model1.feature_importances_
        top = pd.Series(importance).sort_values(ascending=False)

        # Print the indices of the top-ranked features
        print(list(top.index)[:top_num])
        index.extend(list(top.index)[:top_num])

        # Predict on the test set
        y_pred = model1.predict(x_test)
        predictions = [round(value) for value in y_pred]
        accuracy = accuracy_score(y_test, predictions)
        print("Accuracy: %.2f%%" % (accuracy * 100.0))
        precision = precision_score(y_test, predictions)
        print("precision: %.2f%%" % (precision * 100.0))

        print("-----------------LightGBM-----------------")

        params = {
            'task': 'train',
            'boosting_type': 'gbdt',  # based on the GBDT algorithm
            'objective': 'binary',
            'metric': 'auc',  # evaluation metric
            'max_bin': 255,  # larger is more accurate but slower
            'learning_rate': 0.1,  # learning rate
            'num_leaves': 64,  # larger is more accurate but may overfit
            # 'max_depth': -1,   limiting depth on small datasets helps avoid overfitting; < 0 means no limit
            'feature_fraction': 0.8,  # guards against overfitting
            'bagging_freq': 5,  # guards against overfitting
            'bagging_fraction': 0.8,  # guards against overfitting
            'min_data_in_leaf': 10,  # guards against overfitting
            'min_sum_hessian_in_leaf': 3.0,  # guards against overfitting
            # 'header': True   whether the dataset file has a header row
            'verbose': -1  # silence the warning: No further splits with positive gain, best gain: -inf
        }

        lgb_train = lgb.Dataset(x_train, label=y_train)
        model2 = lgb.train(params, train_set=lgb_train)

        importance = model2.feature_importance()
        top = pd.Series(importance).sort_values(ascending=False)
        print(list(top.index)[:top_num])
        index.extend(list(top.index)[:top_num])

        y_pred = model2.predict(x_test)
        predictions = [round(value) for value in y_pred]
        accuracy = accuracy_score(y_test, predictions)
        print("Accuracy: %.2f%%" % (accuracy * 100.0))
        precision = precision_score(y_test, predictions)
        print("precision: %.2f%%" % (precision * 100.0))

        print("-----------------ExtraTree是随机森林的一个变种-----------------")

        model4 = ExtraTreesClassifier(n_estimators=10,
                                      max_depth=None,
                                      min_samples_split=2,
                                      random_state=0)
        model4.fit(x_train, y_train)

        importance = model4.feature_importances_
        top = pd.Series(importance).sort_values(ascending=False)
        print(list(top.index)[:top_num])
        index.extend(list(top.index)[:top_num])

        y_pred = model4.predict(x_test)
        predictions = [round(value) for value in y_pred]
        accuracy = accuracy_score(y_test, predictions)
        print("Accuracy: %.2f%%" % (accuracy * 100.0))
        precision = precision_score(y_test, predictions)
        print("precision: %.2f%%" % (precision * 100.0))

    end = time.time()
    running_time = end - start
    print('-----------time--------')
    print(running_time)

    print(index)

    # Rank by vote count
    sort = get_count_by_counter(index)
    top_index = sort.most_common(top_num)

    return top_index
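# get_count_by_counter() comes from elsewhere in this project; a minimal sketch
# of the behaviour assumed by the call above, built on collections.Counter
# (most_common() is what sort.most_common(top_num) relies on):
from collections import Counter

def get_count_by_counter_sketch(index_list):
    # Count how often each feature index was voted into a top list.
    return Counter(index_list)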
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.svm import LinearSVC
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:0.9270935960591131
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=LinearSVC(
        C=0.01, dual=True, loss="hinge", penalty="l2", tol=1e-05)),
    ExtraTreesClassifier(bootstrap=True,
                         criterion="entropy",
                         max_features=0.7500000000000001,
                         min_samples_leaf=5,
                         min_samples_split=2,
                         n_estimators=100))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
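# A quick hold-out check of the exported pipeline (not emitted by TPOT itself):
from sklearn.metrics import accuracy_score
print(accuracy_score(testing_target, results))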
Exemplo n.º 49
0
     },
     'lr': {
         'cv_param': {
             'C': [.01, .05, .1, .5, 1.0, 5.0, 10.0],
             'penalty': ['l1', 'l2']
         },
         'estimator': LogisticRegression(random_state=Repeat * 10 + 2)
     },
     'et': {
         'cv_param': {
             'criterion': ['gini', 'entropy'],
             'max_depth': [3, 5, 7, None],
             'n_estimators': [10, 20, 30, 50, 100]
         },
         'estimator':
         ExtraTreesClassifier(n_jobs=-1, random_state=Repeat * 10 + 2)
     },
     'rf': {
         'cv_param': {
             'criterion': ['gini', 'entropy'],
             'max_depth': [3, 5, 7, None],
             'n_estimators': [10, 20, 30, 50, 100]
         },
         'estimator':
         RandomForestClassifier(n_jobs=-1, random_state=Repeat * 10 + 2)
     }
 }
 n_clf = len(CLF)
 Fscore_trn = np.zeros(n_clf)
 Fscore_tst = np.zeros(n_clf)
 prob_trn = np.zeros([n_clf, n_cases_trn, 10])
Exemplo n.º 50
0
def classifier():
    np.random.seed(0)  # seed to shuffle the train set

    n_folds = 5
    verbose = True
    shuffle = False

    X, y, X_submission, soln = load()

    if shuffle:
        idx = np.random.permutation(y.size)
        X = X[idx]
        y = y[idx]

    skf = list(StratifiedKFold(y, n_folds))

    clfs = [
        RandomForestClassifier(n_estimators=10, n_jobs=-1, criterion='gini'),
        RandomForestClassifier(n_estimators=10, n_jobs=-1,
                               criterion='entropy'),
        ExtraTreesClassifier(n_estimators=10, n_jobs=-1, criterion='gini'),
        ExtraTreesClassifier(n_estimators=10, n_jobs=-1, criterion='entropy')
    ]

    print "Creating train and test sets for blending."

    dataset_blend_train = np.zeros((X.shape[0], len(clfs)))
    dataset_blend_test = np.zeros((X_submission.shape[0], len(clfs)))

    for j, clf in enumerate(clfs):
        print j, clf
        dataset_blend_test_j = np.zeros((X_submission.shape[0], len(skf)))
        for i, (train, test) in enumerate(skf):
            print "Fold", i
            X_train = X[train]
            y_train = y[train]
            X_test = X[test]
            y_test = y[test]

            for item in X_train:
                if len(item) != 1776:
                    print len(item)

            clf.fit(X_train, y_train)

            y_submission = clf.predict_proba(X_test)[:, 1]
            dataset_blend_train[test, j] = y_submission
            dataset_blend_test_j[:, i] = clf.predict_proba(X_submission)[:, 1]
            dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)
    print len(dataset_blend_test[0])
    print "Without Blending"
    y_submission = dataset_blend_test.mean(1)
    y_submission = (y_submission - y_submission.min()) / (y_submission.max() -
                                                          y_submission.min())
    print "Saving Results."
    np.savetxt(fname='test_ans.csv', X=y_submission, fmt='%0.9f')
    print "LogLoss."
    print logloss(y_submission, soln)

    print "Blending."
    clf = LogisticRegression()
    clf.fit(dataset_blend_train, y)
    y_submission = clf.predict_proba(dataset_blend_test)[:, 1]

    print "Linear stretch of predictions to [0,1]"
    y_submission = (y_submission - y_submission.min()) / (y_submission.max() -
                                                          y_submission.min())

    print "LogLoss."
    print logloss(y_submission, soln)
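# logloss() is defined elsewhere in this project; a sketch of the assumed
# behaviour (note the (prediction, truth) argument order used in the calls above):
from sklearn.metrics import log_loss

def logloss_sketch(predicted, actual):
    return log_loss(actual, predicted)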
Exemplo n.º 51
0
# plt.show()

# Compare a plain decision tree, a random forest and extremely randomized trees
DT = DecisionTreeClassifier(max_depth=None,
                            min_samples_split=2,
                            random_state=0)

RF = RandomForestClassifier(n_estimators=10,
                            max_features=math.sqrt(n_features),
                            max_depth=None,
                            min_samples_split=2,
                            bootstrap=True)

EC = ExtraTreesClassifier(n_estimators=10,
                          max_features=math.sqrt(n_features),
                          max_depth=None,
                          min_samples_split=2,
                          bootstrap=False)

# Train the three models
DT.fit(x_train, y_train)
RF.fit(x_train, y_train)
EC.fit(x_train, y_train)

# Predict over a region of the feature space
# range of column 0
x1_min, x1_max = x[:, 0].min(), x[:, 0].max()
# range of column 1
x2_min, x2_max = x[:, 1].min(), x[:, 1].max()
# build a 200 x 200 grid of sampling points
x1, x2 = np.mgrid[x1_min:x1_max:200j, x2_min:x2_max:200j]
        tmp_len = len(train[train_series.isnull()])
        if tmp_len > 0:
            #print "mean", train_series.mean()
            train.loc[train_series.isnull(), train_name] = -999
        #and Test
        tmp_len = len(test[test_series.isnull()])
        if tmp_len > 0:
            test.loc[test_series.isnull(), test_name] = -999

X_train = train
X_test = test
print('Training...')
extc = ExtraTreesClassifier(n_estimators=750,
                            max_features=60,
                            criterion='entropy',
                            min_samples_split=4,
                            max_depth=40,
                            min_samples_leaf=2,
                            n_jobs=-1)

extc.fit(X_train, target)

print('Predict...')
y_pred = extc.predict_proba(X_test)
#print y_pred

pd.DataFrame({
    "ID": id_test,
    "PredictedProb": y_pred[:, 1]
}).to_csv('extra_trees.csv', index=False)
Exemplo n.º 53
0
def TestPerformance2(X, Y, nF=3, testTimes=10, bScaled=1, _test_size=0.1):

    if bScaled == 1:
        X_scaled = preprocessing.scale(X)
        X = X_scaled

    print('--------------------START-----------')

    hitResult0 = []
    hitResult1 = []
    hitResult2 = []
    hitResult3 = []
    hitResult4 = []

    times = np.zeros(5, )

    for iter in range(testTimes):

        X_train, X_test, y_train, y_test = train_test_split(
            X, Y, test_size=_test_size)

        starttime = datetime.datetime.now()

        model = IWKNN()

        w = model.fit(X_train, y_train)

        endtime = datetime.datetime.now()

        times[0] = times[0] + (endtime - starttime).seconds

        print('------------IWKNN------------')

        HitFeatures(hitResult0, w, nF)

        # fit an Extra Trees model to the data

        starttime = datetime.datetime.now()

        model = ExtraTreesClassifier()
        model.fit(X, Y)

        endtime = datetime.datetime.now()

        times[1] = times[1] + (endtime - starttime).seconds

        print('------------ExtraTreesClassifier------------')
        # display the relative importance of each attribute
        # print(model.feature_importances_)

        HitFeatures(hitResult1, model.feature_importances_, nF)

        starttime = datetime.datetime.now()
        model = LogisticRegression()
        # create the RFE model and select 3 attributes
        rfe = RFE(model, nF)
        rfe = rfe.fit(X, Y)

        endtime = datetime.datetime.now()

        times[2] = times[2] + (endtime - starttime).seconds

        # summarize the selection of the attributes
        print('------------rfe logistic regression------------')
        # print(rfe.support_)
        # print(rfe.ranking_)

        # print(support2value(rfe.support_))

        HitFeatures(hitResult2, support2value(rfe.support_), nF)

        #    print(rfe.scores_)

        starttime = datetime.datetime.now()

        model = svm.SVC(kernel='linear')
        # create the RFE model and select 3 attributes
        rfe = RFE(model, nF)
        rfe = rfe.fit(X, Y)

        endtime = datetime.datetime.now()

        times[3] = times[3] + (endtime - starttime).seconds

        # summarize the selection of the attributes
        print('------------rfe svm linear------------')
        # print(rfe.support_)
        # print(rfe.ranking_)

        # print(support2value(rfe.support_))

        HitFeatures(hitResult3, support2value(rfe.support_), nF)

        starttime = datetime.datetime.now()

        ridge = Ridge(alpha=1)
        ridge.fit(X, Y)

        endtime = datetime.datetime.now()

        times[4] = times[4] + (endtime - starttime).seconds

        print('------------ridge------------')
        # print (ridge.coef_)
        # print (ridge.intercept_)

        HitFeatures(hitResult4, ridge.coef_, nF)

        print('time=')
        print(times)
Exemplo n.º 54
0
 "QuadraticDiscriminantAnalysis":
 QuadraticDiscriminantAnalysis(),
 "SupportVectorMachine":
 SVC(kernel="poly", degree=5),
 "LogisticRegression":
 LogisticRegression(solver="saga", n_jobs=-1),
 "ArtificalNeuralNetwork":
 MLPClassifier(hidden_layer_sizes=30, max_iter=2000, solver="lbfgs"),
 "DecisionTree":
 DecisionTreeClassifier(random_state=42),
 "ExtraTree":
 ExtraTreeClassifier(random_state=42),
 "RandomForest":
 RandomForestClassifier(n_jobs=-1, random_state=42),
 "ExtraTrees":
 ExtraTreesClassifier(n_jobs=-1, random_state=42),
 "XGBoost":
 XGBClassifier(use_label_encoder=False,
               eval_metric="error",
               n_jobs=-1,
               random_state=42),
 "LightGBM":
 LGBMClassifier(n_estimators=128, n_jobs=-1, random_state=42),
 "AdaBoost":
 AdaBoostClassifier(n_estimators=128, learning_rate=1.0, random_state=42),
 "Bagging":
 BaggingClassifier(n_estimators=128, n_jobs=-1, random_state=42),
 "GradientBoosting":
 GradientBoostingClassifier(n_estimators=128,
                            learning_rate=1.0,
                            random_state=42),
Exemplo n.º 55
0
                num_round,
                watchlist,
                obj=logregobj,
                feval=evalerror)

# scikit-learn ExtraTreesClassifier..................................
import gc
from time import time
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score

pipe_ext = make_pipeline(ExtraTreesClassifier(
    random_state=SEED,
    n_jobs=CPU,
))
param_grid_ext = {
    'extratreesclassifier__n_estimators': [1000],
    'extratreesclassifier__max_depth': [4, 6, 8],
    'extratreesclassifier__min_samples_split': [10],
    'extratreesclassifier__min_samples_leaf': [10],
    'extratreesclassifier__max_features': ['sqrt'],
    'extratreesclassifier__n_jobs': [CPU]
}
gridcv_ext = GridSearchCV(pipe_ext,
                          param_grid=param_grid_ext,
                          scoring='roc_auc',
                          n_jobs=1,
                          cv=StratifiedKFold(n_splits=5,
                                             shuffle=True,

# ## Selecting the best Features for our Model

# In[42]:


x = df4.drop("quality",axis=True)
y = df4["quality"]


# In[43]:


from sklearn.ensemble import ExtraTreesClassifier
model = ExtraTreesClassifier()
model.fit(x,y)


# In[44]:


print(model.feature_importances_)


# In[45]:


feat_importances = pd.Series(model.feature_importances_,index =x.columns)
feat_importances.nlargest(9).plot(kind="barh")
plt.show()
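# One optional follow-up (not in the original notebook): keep only the
# top-ranked features before fitting a downstream model.
top_features = feat_importances.nlargest(9).index
x_selected = x[top_features]
print(x_selected.shape)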
class SentimentAnalysis:
    def readFile(self, filePath):
        data = []
        y = []
        with open(filePath, 'r') as file:
            csvreader = csv.reader(file, delimiter='\t')
            next(csvreader)
            for row in csvreader:
                data.append(row[2])
                if len(row) > 3:
                    y.append(row[3])
        return data, y

    def preprocess(self, data):
        preprocessedCorpus = []

        for phrase in data:
            # All to lower case
            phrase = phrase.lower()

            # Split to tokens
            tokenizer = RegexpTokenizer(r'\w+')
            tokens = tokenizer.tokenize(phrase)

            # Stopword filtering
            nonstopTokens = [token for token in tokens if not token in self.stopWords]

            # Stemming
            stemmer = SnowballStemmer("english")
            for index, item in enumerate(nonstopTokens):
                stemmedWord = stemmer.stem(item)
                nonstopTokens[index] = stemmedWord

            # Remove numbers
            finalTokens = [token for token in nonstopTokens if not token.isnumeric()]

            # Add to corpus
            preprocessedCorpus.append(" ".join(nonstopTokens))

        return preprocessedCorpus

    def extractFeatures(self, corpus):
        wordIds = []
        for phrase in corpus:
            wordIds.append([self.word2id[word] for word in phrase.split(" ")])
        return wordIds

    def classify(self):
        leafNodeSizeRange = range(1,100)
        scoreCrossVal = list()
        for minLeafNodeSize in leafNodeSizeRange:
            self.classifier = RandomForestClassifier(n_estimators=200, criterion='gini',
                                                     min_samples_leaf=minLeafNodeSize, n_jobs=-1)
            scores = cross_val_score(self.classifier, self.X, self.y, cv=10)
            scoreCrossVal.append(scores.mean())

        print(scores.mean())
        index, val = max(enumerate(scoreCrossVal), key=operator.itemgetter(1))
        print("Max cross validation score: " + str(val))
        optimLeafNodeSize = leafNodeSizeRange[index]
        print("Optimal min leaf node size: " + str(optimLeafNodeSize))

        plt.figure()
        plt.plot(leafNodeSizeRange, scoreCrossVal)
        plt.xlabel('Minimum samples in leaf node')
        plt.ylabel('Cross validation score')
        plt.title('Random Forest')
        plt.show()

        maxDepthRange = range(30, 100, 5)
        scoreCrossVal = list()
        for maxTreeDepth in maxDepthRange:
            self.classifier = RandomForestClassifier(n_estimators=200, criterion='gini',
                                         max_depth=maxTreeDepth,n_jobs=-1)
        
            scores = cross_val_score(self.classifier, self.X, self.y, cv=10)
            scoreCrossVal.append(scores.mean())

        index, val = max(enumerate(scoreCrossVal), key=operator.itemgetter(1))
        print("Max cross validation score: " + str(val))
        optimTreeDepth = maxDepthRange[index]
        print("Optimal max tree depth: " + str(optimTreeDepth))
        
        plt.figure()
        plt.plot(maxDepthRange, scoreCrossVal)
        plt.xlabel('Maximum tree depth')
        plt.ylabel('Cross validation score')
        plt.title('Random Forest')
        plt.show()

        # Try an extremely randomized forest.
        leafNodeSizeRange = range(1, 100)
        scoreCrossVal = list()
        for minLeafNodeSize in leafNodeSizeRange:
            print("Running model " + str(minLeafNodeSize) + "...")
            self.classifier = ExtraTreesClassifier(n_estimators=200, criterion='gini',
                                       min_samples_leaf=minLeafNodeSize)
            scores = cross_val_score(self.classifier, self.X, self.y, cv=10)
            scoreCrossVal.append(scores.mean())

        index, val = max(enumerate(scoreCrossVal), key=operator.itemgetter(1))
        print("Max cross validation score: " + str(val))
        optimLeafNodeSize = leafNodeSizeRange[index]
        print("Optimal min leaf node size: " + str(optimLeafNodeSize))

        plt.figure()
        plt.plot(leafNodeSizeRange, scoreCrossVal)
        plt.xlabel('Minimum samples in leaf node')
        plt.ylabel('Cross validation score')
        plt.title('Extremely Randomized Forest')
        plt.show()
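# The excerpt never shows how self.X is built from the preprocessed corpus;
# one plausible (assumed) approach is a simple bag-of-words matrix:
from sklearn.feature_extraction.text import CountVectorizer

def build_features_sketch(corpus):
    # corpus is the list of preprocessed phrases returned by preprocess()
    vectorizer = CountVectorizer(ngram_range=(1, 1), min_df=2)
    X = vectorizer.fit_transform(corpus)
    return X, vectorizer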
Exemplo n.º 58
0
class BaseSkModel(object):
    """
    モデルに関する情報を定義する。モデル名、フォルダパス、目的変数等
    """
    version_str = 'base'
    """ モデルのバージョン名 """
    model_name = ''
    """ 学習モデルの名前(XGBoostとか)。init時の引数で定義される """
    model_path = ""
    """ モデルデータが格納される親フォルダ。 """
    class_list = ['競走種別コード', '場コード']
    """ 分類軸のリスト。このリスト毎に学習モデルを生成 """
    obj_column_list = ['WIN_FLAG', 'JIKU_FLAG', 'ANA_FLAG']
    """ 説明変数のリスト。このリストの説明変数毎に処理を実施する """
    ens_folder_path = ""
    """ モデルデータが格納される親フォルダ。 """
    dict_folder = ""
    """ 辞書フォルダのパス """
    index_list = ["RACE_KEY", "UMABAN", "NENGAPPI"]
    """ 対象データの主キー。ModeがRaceの場合はRACEにする """
    clfs = [
        RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
        ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
        GradientBoostingClassifier(learning_rate=0.05,
                                   subsample=0.5,
                                   max_depth=6,
                                   n_estimators=50),
        KNeighborsClassifier(n_neighbors=10, n_jobs=-1),
        GaussianNB(),
        XGBClassifier(learning_rate=0.1,
                      n_estimators=1000,
                      max_depth=5,
                      min_child_weight=1,
                      gamma=0,
                      subsample=0.8,
                      colsample_bytree=0.5,
                      objective='binary:logistic',
                      scale_pos_weight=1,
                      seed=0)
    ]
    """ アンサンブル学習時に利用するクラス """
    learning_df = ""

    def __init__(self, model_name, version_str, start_date, end_date,
                 mock_flag, test_flag, mode):
        self.model_name = model_name
        self.version_str = version_str
        self.start_date = start_date
        self.end_date = end_date
        self.dict_path = mc.return_base_path(test_flag)
        self._set_folder_path(mode)
        self.model_folder = self.model_path + model_name + '/'
        self.proc = self._get_skproc_object(version_str, start_date, end_date,
                                            model_name, mock_flag, test_flag)

    def _set_folder_path(self, mode):
        self.model_path = self.dict_path + 'model/' + self.version_str + '/'
        self.dict_folder = self.dict_path + 'dict/' + self.version_str + '/'
        self.ens_folder_path = self.dict_path + 'intermediate/' + self.version_str + '_' + mode + '/'

    def _get_skproc_object(self, version_str, start_date, end_date, model_name,
                           mock_flag, test_flag):
        print("-- check! this is BaseSkModel class: " +
              sys._getframe().f_code.co_name)
        proc = BaseSkProc(version_str, start_date, end_date, model_name,
                          mock_flag, test_flag, self.obj_column_list)
        return proc

    def create_learning_data(self):
        """ 学習用データを作成。処理はprocを呼び出す """
        self.learning_df = self.proc.proc_create_learning_data()

    def get_all_learning_df_for_save(self):
        save_learning_df = self.learning_df.drop(self.class_list, axis=1)
        return save_learning_df

    def get_val_list(self, df, cls_val):
        val_list = df[cls_val].drop_duplicates().astype(str)
        return val_list

    def get_filter_df(self, df, cls_val, val):
        if cls_val == "コース":
            query_str = cls_val + " == '" + str(val) + "'"
        else:
            query_str = cls_val + " == " + val
        print(query_str)
        filter_df = df.query(query_str)
        # Drop the classification-axis columns
        filter_df.drop(self.class_list, axis=1, inplace=True)
        return filter_df

    def create_featrue_select_data(self, learning_df):
        """  説明変数ごとに特徴量作成の処理(TargetEncodingとか)の処理を実施

        :param dataframe learning_df: dataframe
        """
        self.proc.proc_create_featrue_select_data(learning_df)

    def proc_learning_sk_model(self, df, cls_val, val):
        """  説明変数ごとに、指定された場所の学習を行う

        :param dataframe df: dataframe
        :param str basho: str
        """
        if not df.dropna().empty:
            if len(df.index) >= 30:
                print("----- アンサンブル学習用のクラスをセット -----")
                self.proc.set_ensemble_params(self.clfs, self.index_list,
                                              self.ens_folder_path)
                print("proc_learning_sk_model: df", df.shape)
                for target in self.obj_column_list:
                    print(target)
                    self.proc.learning_sk_model(df, cls_val, val, target)
            else:
                print("---- 少数レコードのため学習スキップ -- " + str(len(df.index)))
        else:
            print("---- NaNデータが含まれているため学習をスキップ")

    def create_predict_data(self):
        """ 予測用データを作成。処理はprocを呼び出す """
        predict_df = self.proc.proc_create_predict_data()
        return predict_df

    def proc_predict_sk_model(self, df, cls_val, val):
        """ predictする処理をまとめたもの。指定されたbashoのターゲットフラグ事の予測値を作成して連結したものをdataframeとして返す

        :param dataframe df: dataframe
        :param str val: str
        :return: dataframe
        """
        all_df = pd.DataFrame()
        if not df.empty:
            for target in self.obj_column_list:
                pred_df = self.proc._predict_sk_model(df, cls_val, val, target)
                if not pred_df.empty:
                    grouped_df = pred_df  #self._calc_grouped_data(pred_df)
                    grouped_df["target"] = target
                    grouped_df["target_date"] = pred_df[
                        "NENGAPPI"].dt.strftime('%Y/%m/%d')
                    grouped_df["model_name"] = self.model_name
                    all_df = pd.concat([all_df, grouped_df]).round(3)
        return all_df

    def create_import_data(self, all_df):
        """ データフレームをアンサンブル化(Vote)して格納 """
        all_df.dropna(inplace=True)
        grouped_all_df = all_df.groupby(["RACE_KEY", "UMABAN", "target"],
                                        as_index=False).mean()
        date_df = all_df[["RACE_KEY", "target_date"]].drop_duplicates()
        temp_grouped_df = pd.merge(grouped_all_df, date_df, on="RACE_KEY")
        grouped_df = self._calc_grouped_data(temp_grouped_df)
        import_df = grouped_df[[
            "RACE_KEY", "UMABAN", "pred", "prob", "predict_std",
            "predict_rank", "target", "target_date"
        ]].round(3)
        print(import_df)
        return import_df

    def eval_pred_data(self, df):
        """ 予測されたデータの精度をチェック """
        check_df = self.proc.create_eval_prd_data(df)
        for target in self.obj_column_list:
            print(target)
            target_df = check_df[check_df["target"] == target]
            target_df = target_df.query("predict_rank == 1")
            target_df.loc[:, "的中"] = target_df.apply(lambda x: 1
                                                     if x[target] == 1 else 0,
                                                     axis=1)
            print(target_df)
            avg_rate = target_df["的中"].mean()
            print(round(avg_rate * 100, 1))

    def import_data(self, df):
        print("-- check! this is BaseSkModel class: " +
              sys._getframe().f_code.co_name)

    @classmethod
    def get_recent_day(cls, start_date):
        print("-- check! this is BaseSkModel class: " +
              sys._getframe().f_code.co_name)

    def set_target_date(self, start_date, end_date):
        """ 学習等データ作成の対象期間をセットする

        :param str start_date: 開始日(文字列)
        :param str end_date: 終了日(文字列)
        """
        self.start_date = start_date
        self.end_date = end_date

    def set_test_table(self, table_name):
        """ test用のテーブルをセットする """
        self.table_name = table_name

    def _calc_grouped_data(self, df):
        """ 与えられたdataframe(予測値)に対して偏差化とランク化を行ったdataframeを返す

        :param dataframe df: dataframe
        :return: dataframe
        """
        grouped = df.groupby(["RACE_KEY", "target"])
        grouped_df = grouped.describe()['prob'].reset_index()
        merge_df = pd.merge(df, grouped_df, on=["RACE_KEY", "target"])
        merge_df['predict_std'] = (
            merge_df['prob'] - merge_df['mean']) / merge_df['std'] * 10 + 50
        df['predict_rank'] = grouped['prob'].rank("dense", ascending=False)
        merge_df = pd.merge(
            merge_df,
            df[["RACE_KEY", "UMABAN", "predict_rank", "target"]],
            on=["RACE_KEY", "UMABAN", "target"])
        return_df = merge_df[[
            'RACE_KEY', 'UMABAN', 'pred', 'prob', 'predict_std',
            'predict_rank', "target", "target_date"
        ]]
        return return_df
y_train = np.loadtxt('y_train_clas.csv', delimiter=',', skiprows=1)[:, 1]
X_data_test = np.loadtxt('X_test_clas.csv', delimiter=',', skiprows=1)
''' Optional Hyperparameter tuning:
pipeline = make_pipeline(ExtraTreesClassifier())
# Declare hyperparameters to tune
hyperparameters = {'extratreesclassifier__random_state': range(0,50,1),
                    'extratreesclassifier__n_estimators' : range(60,70,1),
                    'extratreesclassifier__max_features' : [None, 'sqrt', 'log2'],
                   'extratreesclassifier__max_depth' : [None, 4, 5, 6, 7, 8, 10]}

# Tune model using cross-validation
#clextr = RandomizedSearchCV(pipeline, hyperparameters, n_iter=1000)

'''
## fitting the model
clextr = ExtraTreesClassifier(random_state=22)
# Fit the model for the data
clextr.fit(X_train, y_train)
y_predict = clextr.predict(X_data_test)

# store data into the csv file
test_header = "Id,EpiOrStroma"
n_points = X_data_test.shape[0]
y_predict_pp = np.ones((n_points, 2))
y_predict_pp[:, 0] = range(n_points)
y_predict_pp[:, 1] = y_predict
np.savetxt('clas_et_submission.csv',
           y_predict_pp,
           fmt='%d',
           delimiter=",",
           header=test_header,
Exemplo n.º 60
0
        feature_set_test.append(feature_extraction(Xtest[i][j]))

feature_sets_train = np.array(feature_set_train)
feature_sets_test = np.array(feature_set_test)

print("Loading Feature Set Matrix...")
print("FeatureSet Train: ", feature_sets_train.shape)
print("FeatureSet Test: ", feature_sets_test.shape)

# In[8]:

ytrain = ytrain.reshape(-1, )
ytest = ytest.reshape(-1, )
# print ("ytrain Reshaped!")

# In[9]:

Emodel = ExtraTreesClassifier(n_estimators=150, random_state=5047)
Emodel.fit(feature_sets_train, ytrain)

# In[10]:

t1 = time()
pred = Emodel.predict(feature_sets_test[0].reshape(1, -1))
print("Running the Classifier, Sony Dependent mode... ")
print("Predicted Label: ", pred[0])
t2 = time()
print("Time taken per prediction (in sec): ", t2 - t1)

# In[ ]:
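# An optional batch evaluation over the whole test split (not part of the
# original notebook cell above):
from sklearn.metrics import accuracy_score
batch_pred = Emodel.predict(feature_sets_test)
print("Test accuracy: ", accuracy_score(ytest, batch_pred))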