Пример #1
0
Файл: try_004.py Проект: kbai/us
def onescore(X, Y, Xtest):
    """Fit a random forest on ``(X, Y)``, report it and export predictions.

    Prints the out-of-bag score and the classifier parameters, then hands the
    predictions for ``Xtest`` to ``output`` together with the file name
    ``"try_004.csv"``.

    Args:
        X: Training features passed to ``fit``.
        Y: Training labels passed to ``fit``.
        Xtest: Features passed to ``predict``.
    """
    clf = RandomForestClassifier(
        oob_score=True,
        n_jobs=-1,
        n_estimators=1000,
        max_features=300,
        random_state=0,
    )
    clf.fit(X, Y)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    # The double space reproduces the separator the old print statement
    # inserted between its two arguments.
    print("oob_score =  %s" % (clf.oob_score_,))
    print(clf.get_params())
    ytest = clf.predict(Xtest)
    output(ytest, "try_004.csv")
def test_set_params():
    """Verify that set_params can swap and reconfigure sub-estimators."""
    logistic = LogisticRegression(random_state=123, C=1.0)
    forest = RandomForestClassifier(random_state=123, max_depth=None)
    bayes = GaussianNB()

    # Reference ensemble: logistic regression + random forest.
    reference = VotingClassifier(
        [('lr', logistic), ('rf', forest)], voting='soft', weights=[1, 2])
    # Before fit(): sub-estimators must be reachable by name.
    assert_true('lr' in reference.named_estimators)
    assert_true(reference.named_estimators.lr is reference.estimators[0][1])
    assert_true(reference.named_estimators.lr is reference.named_estimators['lr'])
    reference.fit(X, y)
    # After fit(): the trailing-underscore attributes are checked the same way.
    assert_true('lr' in reference.named_estimators_)
    assert_true(reference.named_estimators_.lr is reference.estimators_[0])
    assert_true(reference.named_estimators_.lr is reference.named_estimators_['lr'])

    # Second ensemble is built with naive Bayes under 'nb', which is then
    # replaced by the forest through set_params.
    swapped = VotingClassifier(
        [('lr', logistic), ('nb', bayes)], voting='soft', weights=[1, 2])
    swapped.set_params(nb=forest).fit(X, y)
    assert_false(hasattr(swapped, 'nb'))

    # Both ensembles now wrap the same estimators and must agree.
    assert_array_equal(reference.predict(X), swapped.predict(X))
    assert_array_almost_equal(reference.predict_proba(X),
                              swapped.predict_proba(X))
    assert_equal(swapped.estimators[0][1].get_params(), logistic.get_params())
    assert_equal(swapped.estimators[1][1].get_params(), forest.get_params())

    # Nested "<name>__<param>" keys must reach the wrapped estimators.
    reference.set_params(lr__C=10.0)
    swapped.set_params(nb__max_depth=5)

    assert_true(reference.estimators[0][1].get_params()['C'] == 10.0)
    assert_true(swapped.estimators[1][1].get_params()['max_depth'] == 5)
    assert_equal(reference.get_params()["lr__C"],
                 reference.get_params()["lr"].get_params()['C'])
Пример #3
0
def cross_validation(X, y):
    """Search random-forest depth / min-split values on a single hold-out split.

    Every combination of ``max_depth`` in ``[8, 16, 32, 64]`` and
    ``min_samples_split`` in ``[1, 2, 4, 8, 16, 32, 64]`` is fitted on the
    training part and scored on both parts; progress is printed as it goes.

    Args:
        X: Feature matrix.
        y: Labels; must have the same length as ``X``.

    Returns:
        The ``get_params()`` dict of the model with the highest hold-out
        score, or ``None`` if no model scored above 0.
    """
    assert(len(y) == len(X))
    # test_size=0.8 keeps 20% of the rows for fitting and 80% for scoring
    # (the split is NOT into two equal halves).
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=42)

    depth = [8, 16, 32, 64]
    # NOTE(review): recent scikit-learn releases appear to reject
    # min_samples_split=1 -- confirm against the installed version.
    split = [1, 2, 4, 8, 16, 32, 64]
    best_score = 0
    best_train_score = 0
    best_param = None
    for d in depth:
        for s in split:
            model = RandomForestClassifier(n_estimators=500, criterion="entropy", max_features="sqrt", max_depth=d, min_samples_split=s, n_jobs=-1)
            model = model.fit(X_train, y_train)
            # Single-argument print(...) works on both Python 2 and Python 3.
            print("Depth: %d  split: %d" % (d, s))
            print("Model trainning score:")
            score_train = model.score(X_train, y_train)
            print(score_train)
            print("Model test score:")
            score_test = model.score(X_test, y_test)
            print(score_test)

            # Strict '>' keeps the first configuration on ties.
            if score_test > best_score:
                best_score = score_test
                best_train_score = score_train
                best_param = model.get_params()
    print("==================")
    print(best_train_score)
    print(best_score)
    print(best_param)
    return best_param
Пример #4
0
	def fit(self,train_X,train_Y):
		"""Train one random forest per chunk of class-0 rows.

		The class-0 rows are split into chunks of roughly the size of the
		class-1 set; each chunk is combined with ALL class-1 rows and a
		separate forest is fitted on it and appended to ``self.rfs``.
		When ``self.weights`` is not None, the mean 10-fold cross-validation
		score of each forest is appended to it.

		Args:
			train_X: 2-D feature array, indexed with a boolean row mask.
			train_Y: label array; rows equal to 0 and 1 are selected.
		"""
		# split set into zeros and ones
		zeros = train_X[train_Y == 0,:]
		ones = train_X[train_Y == 1,:]
		num_ones = ones.shape[0]
		# Number of chunks to split the zeros into. Clamp to at least 1:
		# with fewer zeros than ones the quotient is 0 and np.array_split
		# raises on a zero section count.
		num_chunks = max(1, zeros.shape[0] // num_ones)
		chunks = np.array_split(zeros,num_chunks)
		# train one forest per chunk
		for i, chunk in enumerate(chunks):
			print('training random forest %s of %s' %(i,num_chunks))
			chunk_rf = RandomForestClassifier(n_estimators = 1000, n_jobs = -1)
			print(chunk_rf.get_params())
			chunk_train_X = np.concatenate([chunk,ones])
			# labels: 0 for every chunk row, then 1 for every class-1 row
			chunk_train_Y = np.concatenate([np.zeros(chunk.shape[0]),np.ones(num_ones)])
			# optional cross-validation, its mean score is kept as a weight
			if self.weights is not None:
				print('cross_validation')
				scores = cross_validation.cross_val_score(chunk_rf, chunk_train_X, chunk_train_Y, cv = 10, n_jobs = -1)
				print(scores.mean())
				self.weights.append(scores.mean())
			# train
			chunk_rf.fit(chunk_train_X,chunk_train_Y)
			self.rfs.append(chunk_rf)
Пример #5
0
def training_and_test(token, train_data, test_data, num_classes, result):
    """Fit a random forest, evaluate it on the test data and record the result.

    Args:
        token (:obj:`str`): Key under which this run is stored in ``result``.
        train_data (:obj:`tuple` of :obj:`numpy.array`): ``(features, labels)`` used for fitting.
        test_data (:obj:`tuple` of :obj:`numpy.array`): ``(features, labels)`` used for evaluation.
        num_classes (:obj:`int`): Number of classes.
        result (:obj:`pyActLearn.performance.record.LearningResult`): Receives the model
            parameters and the confusion matrix through ``add_record``.

    Returns:
        tuple: ``(predicted_y, predicted_proba)``; ``predicted_proba`` has
        ``num_classes`` columns.
    """
    train_x, train_y = train_data[0], train_data[1].flatten()
    test_x = test_data[0]

    model = RandomForestClassifier(n_estimators=20, criterion="entropy")
    model.fit(train_x, train_y)

    # Predict labels and per-class probabilities for the test features.
    predicted_y = model.predict(test_x)
    predicted_proba = model.predict_proba(test_x)

    # Evaluate and store the outcome of this run.
    confusion_matrix = get_confusion_matrix(num_classes=num_classes,
                                            label=test_data[1].flatten(),
                                            predicted=predicted_y)
    result.add_record(model.get_params(), key=token,
                      confusion_matrix=confusion_matrix)

    if predicted_proba.shape[1] == num_classes:
        return predicted_y, predicted_proba

    # Some labels were missing from the fitted classes: spread the columns
    # the model produced into a zero matrix, using each class value as the
    # column index.
    padded = np.zeros((predicted_proba.shape[0], num_classes), np.float32)
    for column, class_label in enumerate(model.classes_):
        padded[:, class_label] = predicted_proba[:, column]
    return predicted_y, padded
Пример #6
0
def train_model_03(dataset_id):
    """Train the random-forest configuration on ``dataset_id`` and predict.

    The data comes from ``prepare_data_for_training``; fitting and
    prediction are delegated to ``train_and_make_predictions``, which also
    receives a label containing the classifier parameters.
    """
    X, Y, test = prepare_data_for_training(dataset_id)
    forest_settings = dict(
        n_estimators=300,
        min_samples_split=150,
        bootstrap=False,
        criterion="gini",
        max_depth=117,
        min_samples_leaf=3,
        n_jobs=-1,
    )
    clf = RandomForestClassifier(**forest_settings)
    label = "RandomForest %s" % clf.get_params()
    train_and_make_predictions(clf, X, Y, test, label)
Пример #7
0
def tuning_randomforest(X, y):
    """Fit a large random forest on one stratified split and print its scores.

    Prints the classifier parameters, then for the single 75/25 stratified
    split prints the training accuracy and the validation ROC AUC.

    Args:
        X: Feature array, indexed with integer index arrays.
        y: Label array, indexed with integer index arrays.
            NOTE(review): assumed to be binary, since a single AUC value is
            computed -- confirm against the caller.
    """
    clf = RandomForestClassifier(n_estimators=10000, criterion='entropy', max_depth=6,
                                 min_samples_split=2, min_samples_leaf=1,
                                 min_weight_fraction_leaf=0,
                                 max_features=0.2, n_jobs=-1, class_weight='balanced_subsample',
                                 verbose=0)
    # Single-argument print(...) works on both Python 2 and Python 3.
    print('parameters: %s' % (clf.get_params(),))
    skf = StratifiedShuffleSplit(y, n_iter=1, test_size=0.25, random_state=0)
    for train_index, val_index in skf:
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]
        clf.fit(X_train, y_train)
        print('train accuracy %s' % (clf.score(X_train, y_train),))
        # ROC AUC is a ranking metric, so it needs continuous scores: use
        # the probability of the positive class (last column) rather than
        # the hard 0/1 predictions.
        y_val_score = clf.predict_proba(X_val)[:, 1]
        print('val auc: %s' % (roc_auc_score(y_val, y_val_score),))
Пример #8
0
def test_voting_classifier_set_params():
    """Replacing an estimator through set_params must not change the output."""
    logistic = LogisticRegression(random_state=123, C=1.0)
    forest = RandomForestClassifier(random_state=123, max_depth=None)
    bayes = GaussianNB()

    # Ensemble built directly with the forest.
    direct = VotingClassifier(
        [('lr', logistic), ('rf', forest)], voting='soft', weights=[1, 2])
    direct = direct.fit(X, y)

    # Ensemble built with naive Bayes, which is then replaced by the forest.
    replaced = VotingClassifier(
        [('lr', logistic), ('nb', bayes)], voting='soft', weights=[1, 2])
    replaced.set_params(nb=forest).fit(X, y)

    assert_array_equal(direct.predict(X), replaced.predict(X))
    assert_array_almost_equal(direct.predict_proba(X),
                              replaced.predict_proba(X))
    assert replaced.estimators[0][1].get_params() == logistic.get_params()
    assert replaced.estimators[1][1].get_params() == forest.get_params()
Пример #9
0
def do_generate_metrics_rf_optimazed_model(X_train, y_train, X_test, y_test,
                                           grid):
    """Refit a random forest with the grid's best parameters and log results.

    Args:
        X_train, y_train: Data used to fit the model.
        X_test, y_test: Data used for ``calculate_metrics`` and ``score``.
        grid: Search object exposing ``best_params_`` and ``best_score_``.

    Returns:
        tuple: ``(model, metrics)`` -- the fitted classifier and whatever
        ``calculate_metrics`` returned for it.
    """
    file_operations.write_logs(FILENAME, 'Starting metrics calculation')

    model = RandomForestClassifier(random_state=my_constants.RANDOM_VALUE,
                                   oob_score=True)
    model.set_params(**grid.best_params_)
    model.fit(X_train, y_train)
    metrics = calculate_metrics(model, X_test, y_test)

    # First log entry: what the refitted model looks like and how it scores.
    model_summary = ("Generated model params and results\n params:"
                     + str(model.get_params())
                     + "\nscore "
                     + str(model.score(X_test, y_test)))
    file_operations.write_logs(FILENAME, model_summary)

    # Second log entry: what the search itself reported.
    grid_summary = ("Search grid best params and results\n params:"
                    + str(grid.best_params_)
                    + "\nscore "
                    + str(grid.best_score_))
    file_operations.write_logs(FILENAME, grid_summary)

    return model, metrics
Пример #10
0
class RandForestPS():
    """Random forest trained on a space of predicted probabilities.

    The classifier first builds a feature space consisting of the predicted
    probabilities of a list of classifiers (via ``ProbabilitySpace``) and
    then trains a ``RandomForestClassifier`` on that space. It can be used
    like a classifier from the sklearn package.
    """

    def __init__(self,
                 estimators=None,
                 *args,
                 n_folds=8,
                 bootstrap=False,
                 **kwargs):
        """Create the forest and the probability-space transformer.

        ``estimators``, ``n_folds`` and ``bootstrap`` configure the
        ``ProbabilitySpace``; every other positional or keyword argument is
        forwarded to ``RandomForestClassifier``.
        """
        self.RF = RandomForestClassifier(*args, **kwargs)
        self.estimators = estimators
        self.n_folds = n_folds
        self.pbb_space = ProbabilitySpace(
            estimators, n_folds=n_folds, bootstrap=bootstrap)

    def fit(self, X, y):
        """Fit the probability space, then the forest on its output."""
        space_X, space_y = self.pbb_space.fit_transform(X, y)
        self.RF.fit(space_X, space_y)
        return self

    def predict(self, X, y=None):
        """Return the forest's predictions for ``X`` mapped into the space."""
        space_X, _ = self.pbb_space.transform(X, y)
        return self.RF.predict(space_X)

    def predict_proba(self, X):
        """Return the forest's class probabilities for the mapped ``X``."""
        space_X, _ = self.pbb_space.transform(X)
        return self.RF.predict_proba(space_X)

    def set_params(self, **kwargs):
        """Forward all keyword arguments to the forest's ``set_params``."""
        self.RF.set_params(**kwargs)
        return self

    def get_params(self, *args, **kwargs):
        """Return the forest's parameters plus ``estimators`` and ``n_folds``."""
        params = self.RF.get_params(*args, **kwargs)
        params.update({'estimators': self.estimators,
                       'n_folds': self.n_folds})
        return params

    def score(self, X, y):
        """Score the forest on ``(X, y)`` after mapping both into the space."""
        space_X, space_y = self.pbb_space.transform(X, y)
        return self.RF.score(space_X, space_y)
Пример #11
0
def do_ml(ticker):
    """Train several classifiers for ``ticker`` and print a report for each.

    A k-nearest-neighbours model, a decision tree, a random forest and a
    voting ensemble are fitted on a random 75/25 split of the feature sets
    extracted for ``ticker``.

    Returns:
        The accuracy of the voting ensemble on the held-out part.
    """
    X, y, _ = extract_featuresets(ticker)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

    # --- k-nearest neighbours, distance-weighted ---
    knn = neighbors.KNeighborsClassifier(weights='distance')
    knn.fit(X_train, y_train)

    print("\n\n")
    print("Parameters of Kneighbors", knn.get_params())
    knn_accuracy = knn.score(X_test, y_test)
    print("Accuracy of Kneighbors", knn_accuracy)
    knn_predictions = knn.predict(X_test)
    print("Predicted Spread of Kneighbors:", Counter(knn_predictions))
    print("\n\n")

    # --- decision tree, depth limited to 4 ---
    print("Decision Tree")
    tree = DecisionTreeClassifier(max_depth=4)
    tree.fit(X_train, y_train)
    print("Parameters of Decision Tree", tree.get_params())
    print("Accuracy of Decision Tree", tree.score(X_test, y_test))
    print("Predicted Spread of Decision Tree", Counter(tree.predict(X_test)))
    print("\n\n")

    # --- random forest with default settings ---
    print("RandomForest")
    forest = RandomForestClassifier()
    forest.fit(X_train, y_train)
    print("Parameters of RandomForest", forest.get_params())
    print("Accuracy of RandomForest", forest.score(X_test, y_test))
    print("Predicted Spread of RandomForest", Counter(forest.predict(X_test)))

    # --- voting ensemble of three fresh estimators ---
    print("Ensemble")
    members = [
        ('lsvc', svm.LinearSVC()),
        ('knn', neighbors.KNeighborsClassifier()),
        ('rfor', RandomForestClassifier()),
    ]
    ensemble = VotingClassifier(members)

    ensemble.fit(X_train, y_train)
    ensemble_accuracy = ensemble.score(X_test, y_test)
    print("Accuracy of Ensembles", ensemble_accuracy)
    ensemble_predictions = ensemble.predict(X_test)

    print("Predicted Spread of ensembles:", Counter(ensemble_predictions))

    return ensemble_accuracy
def train_by_RandForest():
    """Train a random forest on the 2000 top-ranked features and report it.

    Loads the train/test split from ``'train_and_test_data'``, keeps only
    the first 2000 feature indices stored in ``./data/indices``, fits a
    30-tree forest and prints its parameters, its test accuracy (computed
    two ways) and the elapsed time.
    """
    filename = 'train_and_test_data'
    x_train, x_test, y_train, y_test = Load_Traindata_Testdata_with_Tfidf(
        filename)
    # Use a context manager so the handle is closed even if unpickling fails.
    # NOTE: pickle.load can execute arbitrary code -- only load a trusted file.
    with open('./data/indices', 'rb') as p:
        data = pickle.load(p)
    indices = data['indices']
    most_importance_feature = indices[:2000]
    x_train = x_train[:, most_importance_feature]
    x_test = x_test[:, most_importance_feature]
    print("Selected feature with shape:", x_train.shape)
    model = RandomForestClassifier(n_jobs=8, n_estimators=30)
    now = datetime.datetime.now()
    print("Training begin by RandForest:", now)
    model.fit(x_train, y_train)
    y_pre = model.predict(x_test)
    print(model.get_params())
    print(model.score(x_test, y_test))
    print(accuracy_score(y_test, y_pre))
    # The reported time spans fitting, prediction and both scoring calls.
    training_time = datetime.datetime.now() - now
    print("Training time(s):", training_time)
Пример #13
0
class RandomForestModel(object):
    """Thin wrapper around ``RandomForestClassifier`` with a reporting API."""

    def __init__(self, n_estimators, max_depth, verbose=1):
        """Build the wrapped classifier from the given hyper-parameters."""
        self.name = 'RandomForest'
        self.model = RandomForestClassifier(n_estimators=n_estimators,
                                            max_depth=max_depth,
                                            verbose=verbose)

    def get_params(self):
        """Return the wrapped classifier's parameter dict."""
        return self.model.get_params()

    def train(self, features, labels):
        """Fit the wrapped classifier; returns nothing."""
        self.model.fit(features, labels)

    def predict(self, feature):
        """Return the wrapped classifier's predictions for ``feature``."""
        return self.model.predict(feature)

    def score(self, features, labels):
        """Evaluate the model on ``features`` / ``labels``.

        Prints the accuracy as a percentage.

        Returns:
            tuple: ``(results, figures)`` where ``results`` holds the model
            parameters, the model name, accuracy and macro-averaged
            precision / recall, and ``figures`` is an empty dict.
        """
        predicted = self.model.predict(features)

        accuracy = accuracy_score(labels, predicted)
        precision = precision_score(labels, predicted, average='macro')
        recall = recall_score(labels, predicted, average='macro')

        results = dict(
            params=self.get_params(),
            model=self.name,
            accuracy=accuracy,
            precision=precision,
            recall=recall,
        )

        print('---> Accuracy obtained is: {0:.2f}%'.format(accuracy * 100))

        return results, {}

    def save(self, filename):
        """Pickle the wrapped classifier to ``filename``."""
        with open(filename, 'wb') as handle:
            pickle.dump(self.model, handle)
Пример #14
0
def machine(k, temp1, temp2):
    """Cross-validate a random forest and plot mean accuracy with a 95% CI.

    Prints the ten fold scores and the classifier parameters, then shows a
    figure with vertical lines at the mean accuracy and at both interval
    limits (all in percent).

    Args:
        k: Not used by the current code (it was the ``random_state`` of a
            train/test split that has since been disabled).
        temp1: Features passed to ``cross_val_score``.
        temp2: Labels passed to ``cross_val_score``.
    """
    forest = RandomForestClassifier(n_estimators=100)
    folds = StratifiedKFold(n_splits=10)
    scores = cross_val_score(forest, temp1, temp2, cv=folds)
    print(scores)
    print(forest.get_params())

    mean = statistics.mean(scores)
    std = statistics.stdev(scores)
    # Half-width of the interval: 1.96 standard errors, where the 10 under
    # the root matches n_splits above.
    half_width = 1.96 * (std / 10**(1 / 2.0))
    left = mean - half_width
    right = mean + half_width

    summary = (
        ' Mean Accuracy = %0.2f%% \n Lower limit of CI = %0.2f%% \n Upper Limit of CI = %0.2f%%'
        % (mean * 100, left * 100, right * 100))

    plt.figure()
    plt.axvline(mean * 100, color="blue", ymax=0.75, label='Mean Accuracy')
    plt.axvline(left * 100, color="red", ymax=0.5,
                label='95% Confidence Interval')
    plt.axvline(right * 100, color="red", ymax=0.5)
    plt.text(50, 0.8, summary, fontsize=12)
    plt.xlim([50, 100])
    plt.legend(fontsize=11)
    plt.xlabel('Accuracy (in %)', fontsize=12)
    plt.title(
        'Mean Accuracy and Confidence Interval \n Random Forest Classifier')
    plt.show()
Пример #15
0
class AvgPredictor():
    """Average the class-1 probabilities of a linear SVM and a random forest."""

    def __init__(self):
        """Create the two underlying classifiers (unfitted)."""
        self.clf_svm = svm.SVC(kernel='linear', probability=True)
        self.clf_rf = RandomForestClassifier(n_estimators=1000, max_depth=10)

    def fit(self, X, y):
        """Fit both classifiers on the same data and return ``self``."""
        self.clf_svm.fit(X, y)
        self.clf_rf.fit(X, y)
        return self

    def get_params(self, deep=True):
        """Return the merged parameters of both classifiers.

        With ``deep=False`` an empty dict is returned. On a key present in
        both classifiers the SVM value wins.
        """
        if not deep:
            return {}
        # copy + update instead of the `|` operator, which only exists for
        # dicts on Python 3.9+.
        params = dict(self.clf_rf.get_params(deep))
        params.update(self.clf_svm.get_params(deep))
        return params

    def predict(self, t):
        """Return the averaged second-column probability for each sample.

        Note that the result is a list of probabilities, not class labels.
        """
        # Column 1 of predict_proba is the probability of the second class.
        svm_scores = self.clf_svm.predict_proba(t)[:, 1]
        rf_scores = self.clf_rf.predict_proba(t)[:, 1]
        # zip replaces the Python-2-only xrange index loop.
        return [(svm_p + rf_p) / 2 for svm_p, rf_p in zip(svm_scores, rf_scores)]
Пример #16
0
def main_rfclassifier(n_est, max_depth, datastruct, experiment_id=None):
    """Train a class-balanced random forest and log the run to MLflow.

    Args:
        n_est: Number of trees (``n_estimators``).
        max_depth: Maximum tree depth.
        datastruct: Five-item sequence unpacked as
            ``(df, train_x, train_y, test_x, test_y)``.
        experiment_id: Accepted but not used by this function.
    """
    print("Starting experiment [{}, {}]".format(n_est, max_depth))
    df, train_x, train_y, test_x, test_y = datastruct

    # Select the MLflow experiment by its fixed name.
    print("Setting up experiment")
    mlflow.set_experiment('RandomForest Classifier')

    with mlflow.start_run():
        model = RandomForestClassifier(n_estimators=n_est,
                                       max_depth=max_depth,
                                       class_weight='balanced')

        # Time only the fit call.
        print("Training model")
        fit_started = time.time()
        model.fit(train_x, train_y)
        fit_finished = time.time()
        print("Model trained")

        score = score_model(model, test_x, test_y, True)

        # Parameters describing the run.
        mlflow.log_param('model_type', str(model.__class__))
        mlflow.log_param('features', train_x.columns)
        mlflow.log_param('sample_size', df.shape)
        mlflow.log_params(model.get_params())

        # Metrics: the value returned by score_model and the fit duration.
        metrics = {
            'roc_auc': score,
            'elapsed_time': (fit_finished - fit_started),
        }
        mlflow.log_metrics(metrics)

        mlflow.sklearn.log_model(model, "Random Forest Classifier")
        print("Completed")
Пример #17
0
from sklearn.ensemble import RandomForestClassifier


if '--example' in sys.argv:
    # Minimal demo: fit a forest on four hand-written 2-D points and print
    # what the fitted model exposes.
    trainingdata = [[1, 1], [2, 0.5], [-1, -1], [-2, -2]]
    traininglabel = [1, 1, -1, -1]
    testdata = [[1, 3], [-3, -3]]
    model = RandomForestClassifier()
    model.fit(trainingdata, traininglabel)
    # Single-argument print(...) works on both Python 2 and Python 3.
    output = model.predict(testdata)
    for label in output:
        print(label)
    probas = model.predict_proba(testdata)
    for label in probas:
        print(label)
    # Iterating the get_params() dict yields the parameter names.
    for weights in model.get_params():
        print(weights)
    # The double space reproduces the separator the old print statement
    # inserted after its first argument.
    for i, gini_imp in enumerate(model.feature_importances_):
        print("gini係数 index =  %s %s" % (i, gini_imp))


if '--learn' in sys.argv:
    import json
    anses = []
    traings = []
    # Every non-blank line of learning.json is one JSON value that unpacks
    # into (label, features). The context manager closes the file once it
    # has been read.
    with open('./learning.json') as learning_file:
        for line in learning_file.read().split('\n'):
            if line.strip() == "" : continue
            ans_label, data = json.loads(line.strip())
            anses.append(ans_label)
            traings.append(data)
    model = RandomForestClassifier()
                            'dataset': str(data_set),
                            'kmer_size': kmer_size,
                            'n_splits': cv_gridsearch,
                            'n_repeats': n_iter_grid,
                            'acc': acc,
                            'auc': auc,
                            'model': learn_type,
                            'NMF_factors': n,
                            'params': 'liblinear'
                            })
                
            elif learn_type == "lasso":
                k_fold = RepeatedStratifiedKFold(n_splits=cv_gridsearch, n_repeats=n_iter_grid)
                estimator = LassoCV(alphas = param_grid["alpha"][0], cv = k_fold, n_jobs = -1)
                accuracies = []
                for train_i, test_i in skf.split(x, y):
                    x_train, x_test = x[train_i], x[test_i]
                    y_train, y_test = y[train_i], y[test_i]
                    y_train = list(map(int, y_train))
                    y_test = list(map(int, y_test))

                    estimator.fit(x_train, y_train)
                
                    accuracy = evaluate(estimator, x_test, y_test)
                    accuracies.append(estimator.get_params())
                    accuracies.append(accuracy)

                with open('/pollard/home/abustion/deep_learning_microbiome/lasso.txt', 'w') as f:
                    for item in accuracies:
                        f.write("%s\n" % item)
Пример #19
0
def train(datadict, model_id, n_estimators=25, max_depth=50):
    """Train, evaluate and persist a random forest model.

    Only ``n_estimators`` and ``max_depth`` are exposed to the caller; the
    remaining hyperparameters are fixed in this function. After training,
    the model is dumped with joblib and its statistics are written to an
    Excel workbook, both named after the achieved accuracy and ``model_id``.

    Args:
        datadict (dict): Holds the preprocessed arrays under the keys
            ``X_train``, ``y_train``, ``X_test`` and ``y_test``.
        model_id (str): Identifier embedded in the model and stats file names.
        n_estimators (int or str): Number of trees; converted with ``int()``.
        max_depth (int or str): Maximum tree depth; converted with ``int()``.

    """
    # Values may arrive as strings, while the model needs integers.
    n_estimators, max_depth = int(n_estimators), int(max_depth)

    print("\nTraining model...")
    start_time = time.time()
    X_train, y_train = datadict['X_train'], datadict['y_train']
    X_test, y_test = datadict['X_test'], datadict['y_test']

    forest = RandomForestClassifier(n_estimators=n_estimators,
                                    max_depth=max_depth,
                                    verbose=1,
                                    n_jobs=-1,
                                    bootstrap=False)
    print("\nFitting model...")
    print("Parameters used:", forest.get_params())
    forest.fit(X_train, y_train)

    print("\nPredicting results...")
    predictions = forest.predict(X_test)

    print("\nCalculating accuracy...")
    accuracy_df = get_accuracy_windows(1, y_test, predictions)
    accuracy = accuracy_score(y_test, predictions) * 100
    # Shared file-name prefix, e.g. "acc-87.50".
    accuracy_tag = "acc-" + f"{accuracy:.2f}"

    # Persist the fitted model.
    print("\nSaving model...")
    model_path = os.path.join(
        paths.model_dir, accuracy_tag + "-model_" + model_id + ".pkl.z")
    print(f"Model saved in {model_path}")
    joblib.dump(forest, model_path)

    # Collect the statistics that go into the workbook.
    feature_importance_df = get_feature_importance(forest, model_id)
    classification_report_df = get_classification_report(y_test, predictions)
    params_df = get_params(forest)

    # One sheet per statistic.
    print("\nSaving model stats...")
    stats_path = os.path.join(
        paths.output_delivery_prediction_stats_dir,
        accuracy_tag + "-stats_" + model_id + ".xlsx")
    print(f"Stats saved in {stats_path}")
    with pd.ExcelWriter(stats_path) as writer:
        accuracy_df.to_excel(writer, sheet_name='Accuracy')
        feature_importance_df.to_excel(writer, sheet_name='Feature Importance')
        classification_report_df.to_excel(writer,
                                          sheet_name='Classification Report')
        params_df.to_excel(writer, sheet_name='Model Parameters')

    utilities.print_elapsed_time(start_time)
Пример #20
0
    'max_features': 'auto',
    'min_samples_split': 3,
    'n_estimators': 27
}
# Second random-forest parameter set.
rfp2 = {
    'bootstrap': False,
    'criterion': 'gini',
    'max_depth': 5,
    'max_features': 'auto',
    'min_samples_split': 3,
    'n_estimators': 28
}
# NOTE(review): the forest below is built from `rfp`, not from the `rfp2`
# dict defined just above -- confirm which parameter set is intended.
best_rf = RandomForestClassifier(**rfp,
                                 random_state=5,
                                 class_weight="balanced_subsample")
# `sp` is defined outside this excerpt; presumably keyword arguments for
# print() -- verify.
print(best_rf.get_params().keys(), **sp)
best_rf.fit(X_train, y_train)
# Score of the fitted forest on the test split (printed as accuracy below).
fscore = best_rf.score(X_test, y_test)
print(f'Accuracy score for RandomForest Classifier: {fscore:.02f}', **sp)

# Logistic regression (multinomial, lbfgs solver) on the same split
logReg = LogisticRegression(multi_class='multinomial', solver='lbfgs')

# Fit the classifier to the training data
logReg.fit(X_train, y_train)

# Predict the labels of the test set: y_pred
y_pred = logReg.predict(X_test)

# Score of the logistic regression on the test split
lscore = logReg.score(X_test, y_test)
print(f'Accuracy score for logistic Regression Classifier: {lscore:.02f}',
Пример #21
0
    y_train = df.iloc[:, 0].values - 1
    f_names = df.columns[1:].values
    t_names = df.iloc[:, 0].unique()
    # 不同 Class 统计 (根据 Target 列)
    print('\nTraining dataset shape: ', X_train.shape, ' Number of features: ',
          X_train.shape[1])
    num_categories = np.unique(y_train).size
    sum_y = np.asarray(np.unique(y_train.astype(int), return_counts=True))
    df_sum_y = pd.DataFrame(sum_y.T, columns=['Class', 'Sum'], index=None)
    print('\n', df_sum_y)

    # 初始化 classifier 并完成数据集训练
    clf = RandomForestClassifier(verbose=1,
                                 n_jobs=-1,
                                 random_state=args.randomseed,
                                 n_estimators=100).fit(X_train, y_train)
    print('\nClassifier parameters:\n')
    print(clf.get_params())

    # 输出重要特征评分
    df_import = eli5.explain_weights_df(clf,
                                        target_names=t_names,
                                        feature_names=f_names)
    df_import.to_csv('f_weight_output.csv', index=None)
    print(
        "\nThe importance features have been saved to 'f_weight_output.csv'.")

    end_time = time.time()  # 程序结束时间
    print('\n[Finished in: {0:.6f} mins = {1:.6f} seconds]\n'.format(
        ((end_time - start_time) / 60), (end_time - start_time)))
Пример #22
0
import numpy as np
import time
from sklearn.svm import SVC
from sklearn.metrics import f1_score, classification_report
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

# Load the comma-separated training and test sets.
data = np.loadtxt('./data/TrainSamples.csv', delimiter=",")
label = np.loadtxt('./data/TrainLabels.csv', delimiter=",")
test = np.loadtxt('./data/TestSamples1.csv', delimiter=',')
testLabel = np.loadtxt('./data/TestLabels1.csv', delimiter=',')
start = time.time()
classifier = RandomForestClassifier()
# Single-argument print(...) works on both Python 2 and Python 3.
print(classifier.get_params(deep=True))
classifier.fit(data, label)
predictions = classifier.predict(test)
reportname = 'RandomForestClassifier.txt'
# The context manager closes the report even if building or writing the
# classification report raises.
with open('./result/' + reportname, 'w') as report:
    r = classification_report(testLabel, predictions)
    report.write(r)
    # Elapsed time covers fitting, prediction and report generation.
    end = time.time()
    report.write('time{0}'.format(str(end - start)))
Пример #23
0
X_test=np.loadtxt("X_test.gz",delimiter=",")
####################################################################################
####################################################################################
####################################################################################
#classifier
RFmodel = RandomForestClassifier(
        n_estimators=10,        #number of trees to generate
        max_features='auto',    #consider sqrt of number of features when splitting
        n_jobs=1,               #single job (no parallelism); -1 would use all cores
        criterion="entropy"
        )

#train
RFmodel = RFmodel.fit(X_train, Y_train)
#get parameters
params=RFmodel.get_params()
#score on training set
acc_rate=RFmodel.score(X_train,Y_train)
#single-argument print(...) works on both Python 2 and Python 3
print(acc_rate)
#feature importances
feat_imp=RFmodel.feature_importances_
#predict probabilities
test_probs=RFmodel.predict_proba(X_test)

#output test set probabilities to csv file
columns=['ARSON', 'ASSAULT', 'BAD CHECKS', 'BRIBERY',
            'BURGLARY', 'DISORDERLY CONDUCT',
            'DRIVING UNDER THE INFLUENCE', 'DRUG/NARCOTIC',
            'DRUNKENNESS', 'EMBEZZLEMENT', 'EXTORTION',
            'FAMILY OFFENSES', 'FORGERY/COUNTERFEITING', 'FRAUD',
            'GAMBLING', 'KIDNAPPING', 'LARCENY/THEFT',
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix

from sklearn.metrics import roc_curve, auc
from scipy import interp
from sklearn.cross_validation import KFold
from sklearn.preprocessing import StandardScaler


rfc = RandomForestClassifier()
rfc.get_params()

# Read the train and test features / labels from their CSV files.
X_train_df = pd.read_csv('X_train_header.txt')
y_train_df = pd.read_csv('y_train_header.txt')
X_test_df = pd.read_csv('X_test_header.txt')
y_test_df = pd.read_csv('y_test_header.txt')

# Sample and feature counts come from the feature frames.
n_train_samples, n_features = X_train_df.shape
n_test_samples = X_test_df.shape[0]

# Convert to numpy: features as (samples, features), labels as flat vectors.
X_train = np.array(X_train_df).reshape(n_train_samples, n_features)
y_train = np.array(y_train_df).reshape((n_train_samples,))
X_test = np.array(X_test_df).reshape(n_test_samples, n_features)
y_test = np.array(y_test_df).reshape((n_test_samples,))
Пример #25
0
					nthread = 4, 
					min_child_weight = 1, 
					subsample= 0.8, 
					seed = 1337, 
					objective= 'multi:softprob', 
					max_depth = 7, 
					gamma= .2)

# Use the native xgb interface to cross-validate the number of boosting rounds.
xgb_param = clf.get_xgb_params()
xgb_param['num_class'] = 5
xgb_param['eval_metric'] = 'mlogloss'
Xg_train = xgb.DMatrix(X_train, label=y_train, missing=np.nan)
cvresult = xgb.cv(xgb_param,
                  Xg_train,
                  num_boost_round = clf.get_params()['n_estimators'],
                  nfold = 5,
                  show_progress = True,
                  early_stopping_rounds = 100)
# Keep as many estimators as rows were returned by the cross-validation.
clf.set_params(n_estimators=cvresult.shape[0])
clf.fit(X_train, y_train)
best_outcome_params = clf.get_params()
# NOTE(review): .min() is taken over the whole cv result, not one metric
# column -- confirm this is the intended score.
best_outcome_score = cvresult.min()

try:
    # predict the outcome probabilities with the grid search, if usable
    y_pred = grid.predict_proba(X_test)
except Exception:
    # Fall back to the refitted classifier. `except Exception` (rather than
    # a bare except) lets KeyboardInterrupt / SystemExit propagate.
    y_pred = clf.predict_proba(X_test)
Пример #26
0
def _str2bool(value):
    """Convert a command-line token to a bool for argparse ``type=``.

    ``type=bool`` would call ``bool()`` on the raw string, which is True for
    every non-empty value -- including ``"False"``.

    Raises:
        argparse.ArgumentTypeError: If the token is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    token = str(value).strip().lower()
    if token in ('true', 't', 'yes', 'y', '1'):
        return True
    if token in ('false', 'f', 'no', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError(
        'Boolean value expected, got %r' % (value,))


def main():
    """Train and evaluate a random forest on a loaded data set.

    Parses the command-line flags, then for each of ``--split_count`` runs:
    loads the data, fits a ``RandomForestClassifier`` on the flattened
    training data, scores it on the validation data (overall accuracy,
    balanced accuracy, Cohen's kappa, confusion matrix) and reports the
    result through ``print_output``. Hyper-parameter optimisation and full
    scene classification run only when their flags are set.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--path',
        nargs='?',
        const=True,
        type=str,
        default='C:/Users/AliGökalp/Documents/phd/data/2013_DFTC/2013_DFTC',
        help='Input data path')
    parser.add_argument(
        '--loader_name',
        nargs='?',
        const=True,
        type=str,
        default='GRSS2013DataLoader',
        help='Data set loader name, Values : GRSS2013DataLoader')
    parser.add_argument('--neighborhood',
                        nargs='?',
                        type=int,
                        default=5,
                        help='Neighborhood for data extraction')
    # Boolean flags: a bare "--flag" yields const=True; an explicit value is
    # parsed by _str2bool so that "--flag False" really disables the flag.
    parser.add_argument('--hyperparamopt',
                        nargs='?',
                        const=True,
                        type=_str2bool,
                        default=False,
                        help='If true, performs hyper parameter optimization.')
    parser.add_argument('--fullscene',
                        nargs='?',
                        const=True,
                        type=_str2bool,
                        default=False,
                        help='If true, performs full scene classification.')
    parser.add_argument('--batch_size',
                        nargs='?',
                        type=int,
                        default=20,
                        help='Batch size')
    parser.add_argument('--split_count',
                        nargs='?',
                        type=int,
                        default=1,
                        help='Split count')
    parser.add_argument('--base_log_path',
                        nargs='?',
                        const=True,
                        type=str,
                        default=os.path.dirname(__file__),
                        help='Base path for saving logs')

    # Unknown arguments are tolerated and ignored.
    flags, _ = parser.parse_known_args()

    loader_name = flags.loader_name
    data_path = flags.path
    neighborhood = flags.neighborhood

    for run_index in range(flags.split_count):
        print('Starting episode#%d' % run_index)

        data_importer = InMemoryImporter.InMemoryImporter()
        training_data_with_labels, test_data_with_labels, validation_data_with_labels, shadow_dict, class_range, scene_shape, color_list = \
            data_importer.read_data_set(loader_name=loader_name, path=data_path,
                                        test_data_ratio=0, neighborhood=neighborhood, normalize=False)

        flattened_training_data = flatten_data(training_data_with_labels.data)
        flattened_validation_data = flatten_data(
            validation_data_with_labels.data)

        start_time = time.time()
        estimator = RandomForestClassifier(n_estimators=50,
                                           n_jobs=8,
                                           max_features=int(2 * sqrt(144)),
                                           verbose=False)
        # estimator = ExtraTreesClassifier(n_estimators=10000, n_jobs=8, verbose=1)
        # estimator = SVC(kernel='poly', degree=1, cache_size=200, verbose=True)  # GRSS2013
        # estimator = SVC(kernel='rbf', gamma=1e-09, C=10000, cache_size=200) # GRSS2013
        # estimator = SVC(kernel='rbf', gamma=1e-06, C=1000000, cache_size=1000, verbose=True)  # GULFPORT

        estimator.fit(flattened_training_data,
                      training_data_with_labels.labels)
        print('Completed training(%.3f sec)' % (time.time() - start_time))
        predicted_validation_data = estimator.predict(
            flattened_validation_data)

        # Validation metrics for this run.
        overall_accuracy = accuracy_score(validation_data_with_labels.labels,
                                          predicted_validation_data)
        average_accuracy = balanced_accuracy_score(
            validation_data_with_labels.labels, predicted_validation_data)
        kappa = cohen_kappa_score(validation_data_with_labels.labels,
                                  predicted_validation_data)
        conf_matrix = confusion_matrix(validation_data_with_labels.labels,
                                       predicted_validation_data)
        print_output(estimator.get_params(), average_accuracy, conf_matrix,
                     kappa, overall_accuracy, run_index, loader_name,
                     flags.base_log_path)

        if flags.hyperparamopt:
            perform_hyperparamopt(flattened_training_data,
                                  training_data_with_labels)

        if flags.fullscene:
            perform_full_scene_classification(data_path, loader_name,
                                              neighborhood, estimator,
                                              flags.batch_size)
Пример #27
0
print('Accuracy of Extratrees classifier on test set: %0.04f' % (score_ABC))

# Accuracy of Extratrees classifier on test set: 0.8224
#******************************************************************************
#******************************************************************************

# *** Applying Machine Learning Technique #7 ***

from sklearn.ensemble import RandomForestClassifier

# Baseline random forest with only 5 trees; all other settings are library defaults.
Rando = RandomForestClassifier(n_estimators=5)

from pprint import pprint
# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(Rando.get_params())

# fit() returns the estimator itself, so `classifier` and `Rando` are the same object.
classifier = Rando.fit(X_train, y_train)

score_RFC = Rando.score(X_test, y_test)
# The label previously said "Extratrees" (copied from another section); this
# score belongs to the random forest fitted above.
print('Accuracy of Random Forest classifier on test set: %0.04f' % (score_RFC))

# Accuracy of Random Forest classifier on test set: 0.8137

#******************************************************************************

# HYPERPARAMETER OPTIMIZATION --> GRID SEARCH <--
from sklearn.model_selection import GridSearchCV

# parameters for GridSearchCV
param_grid = {
Пример #28
0
    labels_test = pickle.load(data)
# Report the dimensions of the train/test feature matrices loaded above.
print(features_train.shape)
print(features_test.shape)

# Random Forest
from sklearn.ensemble import RandomForestClassifier
from pprint import pprint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import ShuffleSplit

# Reference forest with a fixed seed; only used here to display its parameters.
rf_0 = RandomForestClassifier(random_state=8)

print('Parameters currently in use:\n')
pprint(rf_0.get_params())

# Candidate values for the hyperparameters to tune
# n_estimators: 5 evenly spaced values from 200 to 1000
n_estimators = [int(x) for x in np.linspace(start=200, stop=1000, num=5)]
# max_features
# NOTE(review): scikit-learn documents 'auto' as equivalent to 'sqrt' for
# classifiers and removed it in 1.3 -- confirm against the installed version.
max_features = ['auto', 'sqrt']
# max_depth: 5 evenly spaced values from 20 to 100, plus None (no limit)
max_depth = [int(x) for x in np.linspace(20, 100, num=5)]
max_depth.append(None)
# min_samples_split
min_samples_split = [2, 5, 10]
# min_samples_leaf
min_samples_leaf = [1, 2, 4]
# bootstrap
bootstrap = [True, False]
Пример #29
0
]

# Defining features and prediction target
X = df[features]
y = df.Survived

# Select random forest for classifier model (fixed seed for repeatable fits).
survival_model_forest = RandomForestClassifier(random_state=1)

# Fit the model on the full feature table; no hold-out split is made here.
survival_model_forest.fit(X, y)

#### RANDOM GRID SEARCH
# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(survival_model_forest.get_params())

# Number of trees in random forest: 10 evenly spaced values from 200 to 2000
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
# Number of features to consider at every split
# NOTE(review): scikit-learn documents 'auto' as equivalent to 'sqrt' for
# classifiers and removed it in 1.3 -- confirm against the installed version.
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree: 10, 20, ..., 110, plus None (no limit)
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
Пример #30
0
# Stack the two galaxy classes into one array and shuffle the rows in place.
trainingSet = np.vstack((trainingSetEllipticals, trainingSetSpirals))  #using only elliptical and spiral for training
np.random.shuffle(trainingSet)
# Column 12 is taken as the label; this must happen before the column slice below.
trainingSetLabels = trainingSet[:,12]  #putting labels in separate array

# Relabel class 0 as -1 (done in place on the label array).
trainingSetLabels[trainingSetLabels == 0] = -1 #replacing all 0 with -1 to match sklearn format

# Keep columns 1..10 as model inputs.
trainingSet = trainingSet[:, 1:11] #removing label cols from actual inputs

# test_size=0.6 keeps 40% of the rows for training and 60% for testing.
trainingSet, testingSet, trainingSetLabels, testingSetLabels = train_test_split(trainingSet, trainingSetLabels, test_size = 0.6, random_state = 0) #fixes random_state so the split is reproducible

startTime = time.time()
print "Time before training = ", startTime

# The forest itself is created without a random_state, so fitted trees can
# differ between runs even though the split is fixed.
clf = RandomForestClassifier() #No max depth initial, tweak as necessary later
clf = clf.fit(trainingSet, trainingSetLabels)

print "Params after training:"
print clf.get_params()

# Mean accuracy on the data the model was fitted on.
trainingAccuracy = clf.score(trainingSet, trainingSetLabels)

print "Training accuracy = ", trainingAccuracy

# Mean accuracy on the held-out 60%.
testingAccuracy = clf.score(testingSet, testingSetLabels)

print "Testing accuracy = ", testingAccuracy

print "Done training and testing! Time = ", time.time() - startTime, "seconds"

from sklearn.ensemble import RandomForestClassifier

# Baseline model: a random forest with library defaults, fitted in one step
# (fit() returns the estimator itself).
wqp_rf = RandomForestClassifier().fit(wqp_train_SX, wqp_train_y)

# Predict on the test features and show the performance report.
wqp_rf_predictions = wqp_rf.predict(wqp_test_SX)
meu.display_model_performance_metrics(
    true_labels=wqp_test_y,
    predicted_labels=wqp_rf_predictions,
    classes=wqp_label_names,
)


# ## Hyperparameter tuning with Grid Search & Cross Validation

# In[23]:

# Show the parameters the baseline forest is currently using.
print(wqp_rf.get_params())


# ### Get the best hyperparameter values

# In[24]:

from sklearn.model_selection import GridSearchCV

# Search space: 4 forest sizes x 3 max_features settings.
param_grid = dict(
    n_estimators=[100, 200, 300, 500],
    max_features=['auto', None, 'log2'],
)

# 5-fold cross-validated grid search scored by accuracy, on a seeded forest.
wqp_clf = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
)
Пример #32
0
# vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
# TF-IDF over word 1- to 3-grams, ignoring a small custom stop-word list.
stop_words = ['in', 'of', 'at', 'a', 'the']
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3), stop_words=stop_words)
# The vocabulary is learned from the training reviews only and then applied
# to both the training and the test reviews.
tfidf_vectorizer.fit(reviews_train_clean)
X = tfidf_vectorizer.transform(reviews_train_clean)
X_test = tfidf_vectorizer.transform(reviews_test_clean)

# classifier find c
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Labels: the first 12500 rows are class 1, the remaining 12500 are class 0.
# NOTE(review): the same list is reused as the test-set ground truth below,
# which assumes the test reviews have the same size and ordering -- confirm.
target = [1 if i < 12500 else 0 for i in range(25000)]
final_model = RandomForestClassifier()
final_model.fit(X, target)
y_pred = final_model.predict(X_test)
# Despite the name, this holds the estimator's hyperparameters, not coefficients.
model_coef = final_model.get_params()
print("Final Accuracy: %s" % accuracy_score(target, y_pred))

from sklearn.metrics import confusion_matrix
# confusion_matrix expects (y_true, y_pred). The arguments were previously
# swapped, which transposes the matrix and exchanges fp and fn in the
# (tn, fp, fn, tp) unpacking.
tn, fp, fn, tp = confusion_matrix(target, y_pred).ravel()
print(str((tn, fp, fn, tp)))
Пример #33
0
    clf_etree = ExtraTreesClassifier(n_estimators=1000, max_depth=None, max_features=int(math.sqrt(n_features)), min_samples_split=100, random_state=144, n_jobs=4);
    clf_etree.fit(X_train, y_train)
    print "Validation set score: ERF " , clf_etree.score(X_val, y_val)

    clf_boost = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),algorithm="SAMME", n_estimators=500, random_state=74494, learning_rate=0.8) 
    clf_boost.fit(X_train, y_train)
    print "Validation set score: ABOOST " , clf_boost.score(X_val, y_val)


    #clf_gboost = GradientBoostingClassifier(n_estimators=int(reg), random_state=74494, learning_rate=0.2) 
    #clf_gboost.fit(X_train, y_train)
    #print "Validation set score:LR " , clf_gboost.score(X_val, y_val)


    print "Classifier:"
    print clf, clf.get_params()
    print clf_etree, clf_etree.get_params()
    print clf_boost, clf_boost.get_params()
    

    if(fe==1): #L1 norm based feature elimination
        clf_fe = LogisticRegression(C=1000,penalty='l1',random_state=0)
        clf_fe.fit(X_train, y_train)
        X_train = X_train[:,clf_fe.coef_.ravel()!=0]
        print "Xtrain.shape: ", X_train.shape
        X_val = X_val[:,clf_fe.coef_.ravel()!=0]

        clf2_l = svm.SVC(kernel='linear', C=reg)
        clf2_l.fit(X_train, y_train)
        print "Lasso Validation set score filtered coeff linear: " , clf2_l.score(X_val, y_val)
        clf2 = svm.SVC(kernel='rbf', C=reg, gamma=g)
Пример #34
0
# Sweep k = 1..25 for k-NN and record the mean 5-fold cross-validation accuracy.
cross_validation_accuracy_knn = []
for i in range(1, 26):
    clf_knn = KNeighborsClassifier(n_neighbors=i)
    scores = cross_val_score(clf_knn, X, y, cv=5)
    cross_validation_accuracy_knn.append(scores.mean())
    print(f"Average Accuracy Score when neighbours are {i} is: \t", cross_validation_accuracy_knn[-1])

# Accuracy (in percent) against k, with each point annotated by its value.
plt.figure(figsize=(20, 10))
plt.plot(
    list(range(1, 26)),
    [score * 100.0 for score in cross_validation_accuracy_knn],
)
for i in range(1, 26):
    plt.text(
        i,
        cross_validation_accuracy_knn[i-1]*100 + 0.2,
        s=f'{cross_validation_accuracy_knn[i-1]*100:.3f}%',
    )

from sklearn.ensemble import RandomForestClassifier

# Small seeded random forest (10 trees).
clf_rfc = RandomForestClassifier(n_estimators=10, random_state=42)
clf_rfc.get_params()

clf_rfc.fit(X_train, y_train)

predictions_rfc = clf_rfc.predict(X_test)
accuracy_score(y_test, predictions_rfc)

# Confusion matrix of the forest's test predictions, drawn as an annotated heatmap.
plt.figure(figsize=(7, 7))
sns.heatmap(
    confusion_matrix(y_test, predictions_rfc),
    annot=True,
    cmap="Blues",
    square=True,
    xticklabels=['No Disease', 'Disease'],
    yticklabels=['No Disease', 'Disease'],
)
plt.xlabel("Predicted", fontsize=15)
plt.ylabel("Actual", fontsize=15)
Пример #35
0
# The k-fold loop is disabled; the body below runs exactly once.
#for train,test in kf:
for _ in range(1):
    #X_learn, X_valid, y_learn, y_valid = X.iloc[train], X.iloc[test], \
    #                                     y.iloc[train], y.iloc[test]
    #y_valid = pd.DataFrame({'country': y_valid})
    #y_test = pd.DataFrame({'country': y_test})

    """ RANDOM FOREST """
    # Template forest: 300 entropy-criterion trees, depth capped at 10, with
    # large minimum split/leaf sizes, using all cores.
    classif_base = RandomForestClassifier(n_estimators=300,
                                     criterion='entropy',
                                     random_state=0,
                                     min_samples_split=1000,
                                     max_depth=10,
                                     min_samples_leaf=100,
                                     n_jobs=-1)
    # Build a fresh estimator with the same hyperparameters as the template.
    classif = RandomForestClassifier(**classif_base.get_params())

    """ GRADIENT BOOSTING """
    #classif_base = GradientBoostingClassifier(loss='deviance',
    #                                          learning_rate=0.25,
    #                                          n_estimators=20,
    #                                          max_depth=5,
    #                                          min_samples_split=50,
    #                                          min_samples_leaf=100,
    #                                          random_state=0,
    #                                          verbose=True)
    #classif = GradientBoostingClassifier(**classif_base.get_params())

    """ XGBOOST """
    # NOTE(review): X_learn / y_learn / X_valid / y_valid are only assigned in
    # the commented-out lines above -- presumably defined elsewhere; verify.
    xg_train = xgb.DMatrix(X_learn, label=y_learn)
    xg_valid = xgb.DMatrix(X_valid, label=y_valid)
# Resample the training set with SMOTETomek and report class counts before/after.
# NOTE(review): the name `os` would shadow the standard-library module if it
# is imported elsewhere in this file -- confirm.
# NOTE(review): newer imbalanced-learn releases name this method
# `fit_resample` -- confirm against the installed version.
os = SMOTETomek(1)
X_train_os, y_train_os = os.fit_sample(X_train, y_train)
print("The number of classes before fit {}".format(Counter(y_train)))
print("The number of classes after fit {}".format(Counter(y_train_os)))

# From here on the training data is the resampled set.
X_train = X_train_os
y_train = y_train_os

from sklearn.ensemble import RandomForestClassifier  # import model_selection

# Forest with library defaults; only used here to display its parameters.
rf = RandomForestClassifier()

from pprint import pprint
print('Parameters currently in use:\n')
pprint(rf.get_params())

from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest: 20 evenly spaced values from 200 to 2000
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=20)]
# Number of features to consider at every split
# NOTE(review): scikit-learn documents 'auto' as equivalent to 'sqrt' for
# classifiers and removed it in 1.3 -- confirm against the installed version.
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree: every even value from 100 to 1998, plus None
max_depth = [int(x) for x in range(100, 2000, 2)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
Пример #37
0
Файл: rf.py Проект: jfraj/khor
class rfClf(BaseModel):

    """Model using random forest classifier."""

    def __init__(self, train_data_fname=None, nrows=None, **kwargs):
        """Initialize the data frame.

        All arguments are forwarded unchanged to the BaseModel initializer.
        """
        super(rfClf, self).__init__(train_data_fname, nrows, **kwargs)

    def set_model(self, **kwargs):
        """Set the classifier.

        Builds ``self.learner`` as a RandomForestClassifier from optional
        keyword arguments and prints the resulting parameters.

        Keyword arguments (default used when absent):
            verbose (0), n_estimators (200), max_depth (None),
            bootstrap (True), min_samples_leaf (1), min_samples_split (2),
            max_features ("auto"), class_weight ("auto"), n_jobs (1),
            criterion ('entropy'), random_state (24).
        """
        verbose = kwargs.get('verbose', 0)
        n_estimators = kwargs.get('n_estimators', 200)
        max_depth = kwargs.get('max_depth', None)
        bootstrap = kwargs.get('bootstrap', True)
        min_samples_leaf = kwargs.get('min_samples_leaf', 1)
        min_samples_split = kwargs.get('min_samples_split', 2)
        max_features = kwargs.get('max_features', "auto")
        # NOTE(review): the "auto" defaults for max_features/class_weight are
        # only accepted by older scikit-learn releases -- confirm the pinned
        # version before upgrading.
        class_weight = kwargs.get('class_weight', "auto")
        n_jobs = kwargs.get('n_jobs', 1)
        criterion = kwargs.get('criterion', 'entropy')
        random_state = kwargs.get('random_state', 24)

        self.learner = RandomForestClassifier(n_estimators=n_estimators,
                                              max_depth=max_depth,
                                              bootstrap=bootstrap,
                                              min_samples_leaf=min_samples_leaf,
                                              min_samples_split=min_samples_split,
                                              max_features=max_features,
                                              n_jobs=n_jobs,
                                              verbose=verbose,
                                              criterion=criterion,
                                              class_weight=class_weight,
                                              random_state=random_state)
        print('\n\nRandom forest set with parameters:')
        par_dict = self.learner.get_params()
        for ipar in par_dict.keys():
            print('{}: {}'.format(ipar, par_dict[ipar]))
        print('\n\n')

    def fitNscore(self, **kwargs):
        """Fit classifier and produce score and related plots.

        Keyword arguments:
            features: column names of ``self.df_train`` used as inputs.
            bids_path: path passed to ``prepare_data`` when the data has not
                been cleaned yet (default 'data/bids.csv').
        All keyword arguments are also forwarded to ``prepare_data`` and
        ``fitModel``.

        A single stratified 80/20 split of ``self.df_train`` is made on the
        'outcome' column; the model is fitted on the 80% and evaluated on the
        20%. Prints the feature ranking and score, shows feature-importance,
        confusion-matrix and ROC figures, then blocks until enter is pressed.
        """
        col2fit = kwargs.get('features')
        # cleaning
        bids_path = kwargs.get('bids_path', 'data/bids.csv')
        if not self.iscleaned:
            # Parenthesised so it matches the print() calls used everywhere
            # else in this class (the output is identical for one argument).
            print('Preparing the data...')
            self.prepare_data(bids_path, **kwargs)
        print('columns for fit=\n{}'.format(self.df_train.columns))

        test_size = 0.2  # fraction kept for testing
        rnd_seed = 24  # for reproducibility

        # n_iter=1 yields exactly one (train, test) index pair, so the loop
        # below simply unpacks that single split.
        sss = StratifiedShuffleSplit(self.df_train['outcome'].values,
                                     n_iter=1,
                                     test_size=test_size,
                                     random_state=rnd_seed)
        for train_index, test_index in sss:
            features_train = self.df_train[col2fit].values[train_index]
            features_test = self.df_train[col2fit].values[test_index]
            target_train = self.df_train['outcome'].values[train_index]
            target_test = self.df_train['outcome'].values[test_index]

        # Fit Classifier
        self.fitModel(features_train, target_train, **kwargs)

        # Predict on the rest of the sample
        print('\nPredicting...')
        predictions = self.learner.predict(features_test)
        probas = self.learner.predict_proba(features_test)

        # Read the importances once and reuse them for the ranking and plot.
        importances = self.learner.feature_importances_

        # Feature index ordered by importance (most important first)
        ord_idx = np.argsort(importances)
        print("Feature ranking:")
        for ifeaturindex in ord_idx[::-1]:
            print('{0} \t: {1}'.format(col2fit[ifeaturindex],
                                       round(importances[ifeaturindex], 2)))

        # Score
        print('Score={}'.format(self.learner.score(features_test, target_test)))

        # Plots

        # Feature importances
        maxfeat2show = 30  # number of features to show in plots
        # Spread of each feature's importance across the individual trees.
        std = np.std([tree.feature_importances_ for tree in self.learner.estimators_],
                     axis=0)
        indices = np.argsort(importances)[::-1]
        indices = indices[:maxfeat2show]  # truncate if > maxfeat2show
        ordered_names = [col2fit[i] for i in indices]

        fig_import = plt.figure(figsize=(10, 10))
        plt.title("Feature importances, RF")
        plt.barh(range(len(indices)), importances[indices],
                 color="b", xerr=std[indices], align="center", ecolor='r')
        plt.yticks(range(len(indices)), ordered_names)
        plt.ylim([-1, len(indices)])
        plt.ylim(plt.ylim()[::-1])  # flip so the most important feature is on top
        plt.subplots_adjust(left=0.22)
        fig_import.show()

        # confusion matrix, normalised per true-class row
        cm = confusion_matrix(target_test.astype(int), predictions.astype(int))
        cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        # Values are capped at 0.5 before plotting.
        cm_normalized = np.clip(cm_normalized, 0.0, 0.5)

        fig_cm = plt.figure()
        ax_cm = fig_cm.add_subplot(1, 1, 1)
        im_cm = ax_cm.imshow(cm_normalized, interpolation='nearest')
        plt.title('Normalized confusion mtx, RF')
        plt.xlabel('Predicted')
        plt.ylabel('True')
        fig_cm.colorbar(im_cm)
        fig_cm.show()

        # ROC curve
        # This ones seems to reflect better the LB score
        # (built from the second predict_proba column rather than hard predictions)
        false_pos, true_pos, thr = roc_curve(target_test, probas[:, 1])
        fig_roc = plt.figure()
        plt.plot(false_pos, true_pos,
                 label='ROC curve (area = %0.2f)' % auc(false_pos, true_pos))
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC')
        plt.legend(loc="lower right")
        fig_roc.show()
        # Block so the non-blocking figures stay open (Python 2 raw_input).
        raw_input('press enter when finished...')
Пример #38
0
    print("RandomForestClassifier, Cross_val_score=",results.mean() )
    

#
#results sounds great but if I add max_depth=5 I have many missclassified points


#How to Visualize a Decision Tree from a Random Forest in Python using Scikit-Learn
# Stratified 60/40 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, stratify=y ,random_state=0)    
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import pydotplus
from IPython.display import Image
# Small forest (10 trees, depth <= 5); no random_state, so trees vary per run.
model = RandomForestClassifier(n_estimators=10,max_depth=5)
pprint(model.get_params())
# Train
model.fit(X_train, y_train)
# Extract single tree (index 3 of the 10 fitted trees)
estimator = model.estimators_[3] #we can change the tree number:)

# Create DOT data (out_file=None returns the DOT source as a string)
# NOTE(review): class_names must follow the fitted class order -- confirm
# that 'Yes' really corresponds to the first class.
dot_data = tree.export_graphviz(estimator, out_file=None, 
                                feature_names=filter_col,  
                                class_names=['Yes','No'],
                                rounded = True, proportion = False,
                                precision = 2, filled = True)
                                

# Draw graph
graph = pydotplus.graph_from_dot_data(dot_data)  
features_list = df.columns.values[1::]

# Fit a random forest with (mostly) default parameters to determine feature importance
forest = RandomForestClassifier(oob_score=True, n_estimators=10000)
forest.fit(X, y)
feature_importance = forest.feature_importances_

# make importances relative to max importance
feature_importance = 100.0 * (feature_importance / feature_importance.max())

# Get the indexes of all features over the importance threshold
important_idx = np.where(feature_importance)[0]

# Get the sorted indexes of important features
sorted_idx = np.argsort(feature_importance[important_idx])[::-1]
print "\nFeatures sorted by importance (DESC):\n", important_idx[sorted_idx]

# Adapted from http://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_regression.html
pos = np.arange(sorted_idx.shape[0]) + .5
plt.subplot(1, 2, 2)
plt.barh(pos, feature_importance[important_idx][sorted_idx[::-1]], align='center')
plt.yticks(pos, important_idx[sorted_idx[:-1]])
plt.xlabel('Relative Importance')
plt.title('Variable Importance')
plt.show()

sorted_idx
feature_importance
forest.get_params()
df.filter(regex='Survived|Age_sc|SibSp|Parch|Fare_[0, 7.896]|Fare_[7.896, 14.454]|Fare_[14.454, 31.275]|Fare_[31.275, 512.329]|Sex|Pclass|Child|FamilySize|Family|Title_id')
Пример #40
0
# Result collector for this experiment; records run metadata alongside outputs.
output = OutputClassification("ECG Classification", "random_forest", output_dir="./output",
                              file_dir="./output/images/", random_state=RANDOM_STATE, test_size=TEST_SIZE)

output.add_info("n_processors", 12)
output.add_info("n_folds", n_folds)

csv = pd.read_csv("ecg.csv", index_col=None)

# Everything except the "abnormal" column is a feature; "abnormal" is the target.
X, y = csv.drop("abnormal", axis=1), csv["abnormal"]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_STATE, test_size=TEST_SIZE)

train_start = time.time()
clf = RandomForestClassifier(n_jobs=-1, random_state=RANDOM_STATE)
output.add_model_parameter("basis", clf.get_params(deep=True))

# No shuffling is requested, so the folds are deterministic. A random_state
# has no effect without shuffle=True and newer scikit-learn releases reject
# that combination with a ValueError, so it is not passed here.
cv = StratifiedKFold(n_splits=n_folds)

# Per-fold results, appended in fold order.
probas = []
y_tests = []
y_preds = []

for train, test in cv.split(X, y):
    y_tests.append(y.iloc[test])
    # fit() returns the same estimator object, refitted on this fold's data.
    model = clf.fit(X.iloc[train], y.iloc[train])
    proba = model.predict_proba(X.iloc[test])
    y_pred = predict_from_proba(model, proba)
    y_preds.append(y_pred)
    probas.append(proba)
                           logging_level="Verbose",
                           metric_period=100)
# Disabled: CatBoost cross-validation used to pick the best iteration count.
# from catboost import cv as catcv
# catpool = Pool(X_train,y_train,cat_features=categorical_features_pos)
# cv_data = catcv(catpool,model.get_params(),fold_count=2)
# best_cat_iterations = cv_data['test-Accuracy-mean'].idxmax()
# print("Best Iteration: ",best_cat_iterations)
# print("Best Score: ", cv_data['test-Accuracy-mean'][best_cat_iterations])
# CatBoost model with a fixed 500 iterations, class re-weighting via
# scale_pos_weight, and progress reported every 100 iterations.
model = CatBoostClassifier(eval_metric='Accuracy',
                           iterations=500,
                           scale_pos_weight=imbalance_weight,
                           random_seed=42,
                           logging_level="Verbose",
                           metric_period=100)
# Fitted on the full X, y rather than on the X_train/y_train split.
model.fit(X, y, cat_features=categorical_features_pos)
model.get_params()
# Disabled: summary of the CV results and submission-file generation.
# cat_cv_std = cv_data.loc[cv_data['test-Accuracy-mean'].idxmax(),["train-Accuracy-mean","train-Accuracy-std"]]
# print("Train CV Accuracy: %0.2f (+/- %0.2f)" % (cat_cv_std[0],cat_cv_std[1]))

# results = results.append({'Model': "Catboost",'Para': model.get_params(),'Test_Score': None,
#                              'CV Mean':cat_cv_std[0], 'CV STDEV': cat_cv_std[1]}, ignore_index=True)

# catprobpred = model.predict_proba(test_df)[:,1]
# catpred = model.predict(test_df).astype(np.int)
# submission = pd.DataFrame({'PassengerId':test_df.index,'Survived':catpred})
# submission.to_csv('catboost.csv',index=False)
# LightGBM datasets for the training and validation splits.
lgtrain = lgb.Dataset(X_train,
                      y_train,
                      categorical_feature=categorical_features)

lgvalid = lgb.Dataset(X_test, y_test, categorical_feature=categorical_features)
Пример #42
0
# Extract the OOB accuracy from bc
oob_accuracy = bc.oob_score_

# Report the test-set accuracy next to the out-of-bag estimate.
print('Test set accuracy of bc: {:.2f}'.format(acc_test))

print('OOB accuracy of bc: {:.2f}'.format(oob_accuracy))

# =========== RANDOM FOREST CLASSIFIER ==========#
# Instantiate rf (gini criterion, fixed seed)
rf = RandomForestClassifier(criterion='gini', random_state=2)

# Fit rf to the training set
rf.fit(X_train, y_train)

rf.get_params()

# =========== RANDOM SEARCH ==========#
# criterion for information gain
criterion = ['gini', 'entropy']

# Number of trees in random forest: 10 evenly spaced values from 200 to 2000
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]

# Number of features to consider at every split
# NOTE(review): scikit-learn documents 'auto' as equivalent to 'sqrt' for
# classifiers and removed it in 1.3 -- confirm against the installed version.
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree: 10, 20, ..., 110, plus None (no limit)
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
# Class probabilities from the tuned logistic regression on the test features.
lor_re = best_lor.predict_proba(x_test_s)
## loss function value
import math
# Hand-rolled binary log loss: accumulate y*log(p1) + (1-y)*log(p0) over all
# test rows, then negate and average.
# NOTE(review): math.log raises ValueError if any predicted probability is
# exactly 0; the random-forest path below patches zeros, this one does not.
# NOTE(review): y_test[i] assumes y_test can be indexed by 0..n-1 -- confirm.
z = 0
for i in range(len(y_test)):
    z = z + y_test[i]*math.log(lor_re[i][1]) + (1-y_test[i])*math.log(lor_re[i][0])
log_loss_lr = -(z/len(y_test))
print("The log loss for logistic regression is ", log_loss_lr)

## Random forest classification
### Three random-forest parameters are tuned here. Since this model is designed
### to avoid overfitting, accuracy vs. complexity parameters are not considered
### here
from sklearn.ensemble import RandomForestClassifier
r_f = RandomForestClassifier(random_state = 1)
r_f.get_params()
# Grid: 3 forest sizes x 3 max_features settings x 4 depths, where the depths
# are the column count divided by 6, 3, 2 and 1 (rounded).
param = { 
    'n_estimators': [500, 1000, 1300],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [round(len(x_train.columns)/6),round(len(x_train.columns)/3),round(len(x_train.columns)/2),round(len(x_train.columns)/1)]
}
# 5-fold cross-validated exhaustive search using all cores.
grid_search_rf = GridSearchCV(r_f, param_grid = param, cv = 5, n_jobs = -1)
grid_search_rf.fit(x_train, y_train)
grid_search_rf.best_params_
best_rf = grid_search_rf.best_estimator_
rf_score = best_rf.score(x_test, y_test)
rf_pre = best_rf.predict_proba(x_test)
rf_pre = pd.DataFrame(rf_pre)
### Replace exact zeros in column 0 with a tiny positive value so the log loss can be calculated
rf_pre.loc[rf_pre[0] == 0, [0]] = 0.000000000001
## loss function value
Пример #44
0
class Trainer():

	def __init__(self):
		
		with open('credentials.json') as credentials_file:
		    credentials = json.load(credentials_file)

		passwd = credentials['mysql']['password']
		self.con = mdb.connect(host='127.0.0.1', port=3306, user='******', passwd=passwd, db='insight', autocommit=True)
		self.cur = self.con.cursor()
		print "Connected to database"
		
		self.load_data()

	def load_data(self):
		f = open('./pickles/mysql_dump.pickle', 'rb')
		self.loanData = pickle.load(f)
		self.loanData = pd.DataFrame(self.loanData)
		f.close()

	def drop_na(self):
		self.loanData = loanData.dropna()
		self.loanData.index = range(len(self.loanData))

	def drop_columns(self):
		#drop the columns with malformed data in mysql db
		self.loanData = self.loanData.drop(['none',
											'educational',
											'IA',
											'IDAHO',
											'ME',
											'NE',
											'other_housing',
											'issue_year'], 1)

	def drop_prepaid_loans(self):
		indices_to_drop = []
		for i in range(len(self.loanData)):
			if self.loanData['loan_status'][i]==1 and self.loanData['days_to_zero_dollars'][i] < 1000:
				indices_to_drop.append(i)
		self.loanData = self.loanData.drop(indices_to_drop, 0)
		print "Number of prepaid loans: ", len(indices_to_drop)
		print "Number of loans after dropping prepaids: ", len(self.loanData)


	def define_features_targets(self, kind="regression"):
		
		#take out 1000 random loans with 36 month terms for testing
		#ids are already populated in test_loans for consistency
		test_ids = []
		sql_query = "select id from test_loans;"
		self.cur.execute(sql_query)
		sql_resp = self.cur.fetchall()
		print "length of sql response: ", len(sql_resp)
		for val in sql_resp:
			test_ids.append(val[0])
		print "length of test_ids: ", len(test_ids)
		#make the test and train data frames
		self.testLoanData = self.loanData[self.loanData['id'].isin(test_ids)]
		self.trainLoanData = self.loanData[~self.loanData['id'].isin(test_ids)]
		self.testLoanData.index = range(len(self.testLoanData))
		self.trainLoanData.index = range(len(self.trainLoanData))
		print "Train Loan Data: ", len(self.trainLoanData)
		print "Test Loan Data: ", len(self.testLoanData)
		
		self.features = self.trainLoanData.drop(['loan_status', 
											'days_to_zero_dollars',
											'id'], 1)
		self.features = self.features.values
		#choose different target variables for regression vs classification
		if kind == "regression":
			self.targets = self.trainLoanData['days_to_zero_dollars'].values
			self.y_test = self.testLoanData['days_to_zero_dollars'].values
		elif kind == "classification":
			self.targets = self.trainLoanData['loan_status'].values
			self.y_test = self.testLoanData['loan_status'].values

	def preprocess(self):
		(self.X_train, 
		 self.X_cv, 
		 self.y_train, 
		 self.y_cv) = dm.split_train_test(features=self.features, 
		 									targets=self.targets, 
		 									test_size=0.1)
		self.X_test = self.testLoanData.drop(['loan_status', 
											  'days_to_zero_dollars',
											  'id'], 1).values
		(self.X_train, self.X_cv) = dm.standardize_samples(self.X_train, 
														  self.X_cv)
		(self.X_train, self.X_cv) = dm.scale_samples_to_range(self.X_train, 
																self.X_cv)
		(self.X_test, _) = dm.standardize_samples(self.X_test, 
														  self.X_test)
		(self.X_test, _) = dm.scale_samples_to_range(self.X_test, 
																self.X_test)

	def define_dummy_classifier(self):
		self.clf = DummyClassifier()

	def define_rfr(self, n_estimators=10):
		self.regr = RandomForestRegressor(n_estimators=n_estimators, oob_score=True)
		print self.regr.get_params()

	def define_linear_regressor(self):
		self.regr = LinearRegression()
		print self.regr.get_params()

	def define_SVR(self, C=1, gamma=0.1):
		self.regr = SVR(C=C, gamma=gamma, verbose=3)
		print self.regr.get_params()

	def define_logistic_regressor(self, penalty="l2", C=1.0, class_weight=None):
		self.clf = LogisticRegression(penalty=penalty, 
									  C=C, 
									  class_weight=class_weight)
		print self.clf.get_params()

	def define_rfc(self, n_estimators=10):
		self.clf = RandomForestClassifier(n_estimators=n_estimators, oob_score=True)
		print self.clf.get_params()

	def train(self, kind="regression"):
		print "Fitting training data"
		if kind == "regression":
			self.regr.fit(self.X_train, self.y_train)
		elif kind == "classification":
			self.clf.fit(self.X_train, self.y_train)

	def predict(self, X, kind="regression"):
		if kind == "regression":
			self.prediction = self.regr.predict(X)
		elif kind == "classification":
			self.prediction = self.clf.predict(X)

	def score(self, X, y, kind="regression"):
		if kind == "regression":
			score_val = self.regr.score(X, y)
			print "R2 Score: ", score_val
		elif kind == "classification":
			score_val = self.clf.score(X, y)
			print "Accuracy: ", score_val
			print classification_report(y, self.prediction)
			self.precision = precision_score(y, self.prediction, labels=[0,1,2], average=None)
			print "\n\nPrecision Score: ", self.precision, "\n\n"
			self.accuracy = accuracy_score(y, self.prediction)

	def test(self, kind="regression"):
		#run clf and regr on the test data to determine to top 100 loans
		#the top loans are the ones least likely to default
		if kind == "regression":
			pred = self.regr.predict(self.X_test)
			print "length of regression pred: ", len(pred)
			for i, loan in enumerate(self.testLoanData['id']):
				sql_query = "UPDATE test_loans SET pred_days_to_zero_dollars=%s where id='%s';" %(
						pred[i], self.testLoanData['id'][i])
				self.cur.execute(sql_query)
			print i
		elif kind == "classification":
			pred_proba = self.clf.predict_proba(self.X_test)
			for i, loan in enumerate(self.testLoanData['id']):
				sql_query = "UPDATE test_loans SET pred_default=%s, pred_paid=%s, pred_prepaid=%s where id='%s';" %(
						pred_proba[i][0], pred_proba[i][1],pred_proba[i][2], self.testLoanData['id'][i])
				self.cur.execute(sql_query)
		self.con.close()

	def run_pca(self, n_components=20):
		self.pca = PCA(n_components=n_components)
		self.X_train = self.pca.fit_transform(self.X_train)
		print "Reduced data down to ", self.pca.n_components_, " dimensions: "
		print "Transforming cv data ..."
		self.X_cv = self.pca.transform(self.X_cv)
		print "Transforming test data ..."
		self.X_test = self.pca.transform(self.X_test)

	def plot_prediction(self):
		plt.scatter(self.prediction, self.y_cv)
		plt.xlabel('prediction')
		plt.ylabel('y_test')
		plt.show()

	def runSVRGridSearch(self):
		C_vals = [0.01, 0.1, 1, 10, 100]
		gamma_vals = [1E-2, 1E-1, 1, 1E1, 1E2, 1E3, 1E4]

		for C in C_vals:
			for gamma in gamma_vals:
				print "\n\n C: ", C, "  gamma: ", gamma
				self.define_SVR(C=C, gamma=gamma)
				self.train()
				print "Training Scores:"
				self.predict(self.X_train)
				self.score(self.X_train, self.y_train)
				print "Testing Scores:"
				self.predict(self.X_cv)
				self.score(self.X_cv, self.y_cv)

	def roc(self):
		'''Compute ROC curve using one-vs-all technique'''
		pred_proba = self.clf.predict_proba(self.X_cv)
		fpr = []
		tpr = []
		thresholds = []
		for i in [0, 1, 2]:
			fpr_i, tpr_i, thresholds_i = roc_curve(self.y_cv, pred_proba[:,i], pos_label=i)
			fpr.append(fpr_i)
			tpr.append(tpr_i)
			thresholds.append(thresholds_i)
			print "AUC: ", auc(fpr_i, tpr_i)
		plt.plot([0,1], [0,1], '--', color=(0.6, 0.6, 0.6))
		plt.plot(fpr[0], tpr[0], label="Default", linewidth=3)
		plt.xlim([-0.05, 1.05])
		plt.ylim([-0.05, 1.05])
		plt.show()


	def pickle_algo(self, X, fileName):
		print "pickling algorithm"
		f = open(fileName, 'wb')
		pickle.dump(X, f)
		f.close()
Пример #45
0
from sklearn.ensemble import RandomForestClassifier
# Train a random forest with default hyperparameters on the training split.
# NOTE(review): wqp_train_SX / wqp_test_SX are defined outside this excerpt;
# presumably the scaled feature matrices -- confirm against the earlier cells.
wqp_rf = RandomForestClassifier()
wqp_rf.fit(wqp_train_SX, wqp_train_y)
# Predict on the test split and report performance through the `meu` helper
# module (defined elsewhere), passing true labels, predictions and class names.
wqp_rf_predictions = wqp_rf.predict(wqp_test_SX)
meu.display_model_performance_metrics(true_labels=wqp_test_y,
                                      predicted_labels=wqp_rf_predictions,
                                      classes=wqp_label_names)

# ## Hyperparameter tuning with Grid Search & Cross Validation

# In[23]:

# Print the hyperparameters of the model fitted above.
print(wqp_rf.get_params())

# ### Get the best hyperparameter values

# In[24]:

from sklearn.model_selection import GridSearchCV

# Candidate hyperparameter values for the random forest grid search.
# 'sqrt' is used instead of 'auto': for RandomForestClassifier 'auto' meant
# sqrt(n_features), and scikit-learn deprecated that spelling in 1.1 and
# removed it in 1.3, so 'sqrt' selects the same setting on every version.
param_grid = {
    'n_estimators': [100, 200, 300, 500],
    'max_features': ['sqrt', None, 'log2'],
}

wqp_clf = GridSearchCV(RandomForestClassifier(random_state=42),
                       param_grid,
                       cv=5,
Пример #46
0
#!/usr/bin/env python
# coding: utf-8
# Set up the data
import pandas as pd
import numpy as np
# NOTE(review): the path is absolute from the filesystem root -- confirm the
# CSV really lives at "/heart-disease.csv".
heart_disease = pd.read_csv("/heart-disease.csv")
# Bare expression: its value is discarded when this runs as a script.
heart_disease
# Features matrix: every column except "target"
x = heart_disease.drop("target", axis=1)
# Labels: the "target" column
y = heart_disease["target"]
# Choose the model and its hyperparameters
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
clf.get_params()
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; no random_state is passed, so the
# split is not pinned to a seed.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
clf.fit(x_train, y_train)
# Predict on the held-out rows
y_preds = clf.predict(x_test)
y_preds
y_test
# Evaluate the model on the training data
clf.score(x_train, y_train)
# Evaluate the model on the test data
clf.score(x_test, y_test)
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# Only the classification report is printed; the results of the two calls
# below are computed and discarded.
print(classification_report(y_test, y_preds))
confusion_matrix(y_test, y_preds)
accuracy_score(y_test, y_preds)
Пример #47
0
   	    print "PLS Training error " , float(error)/yp_t.shape[0]
 	    yp_new = pls.predict(Xp_v, copy=True)
	    yp_pred = (yp_new[:,0] > yp_new[:,1]).astype(int)
	    #print y_new, y_pred, y_v
	    #print ((y_v - y_pred) ** 2).sum(), y_v.shape[0]
	    error = ((yp_v - yp_pred) ** 2).sum()
	    print "PLS Validation error " , float(error)/yp_v.shape[0]

	    X_new = pls.transform(X)
	    rf = RandomForestClassifier(n_estimators=500, max_depth=None, max_features=int(math.sqrt(n_components)), min_samples_split=100, random_state=144, n_jobs=4)
	    #print "shapes ", X_new.shape, y.shape
	    #print X_new,y
            X_t, X_v, y_t, y_v = tts(X_new,yd,train_size=0.85)

	    rf.fit(X_t, y_t)
            print "Random Forest Classifier: ", rf.get_params()
	    print "Covariance Classifier Training score: ", rf.score(X_t, y_t)
	    print "Covariance Classifier Validation score: ", rf.score(X_v, y_v)
	    #print "Class prob: ", zip(rf.predict_proba(X_v), y_v)

            sample_weights = rf.predict_proba(pls.transform(Xp_t))[:,1]
	    print sample_weights.shape
	    sample_weights = abs(sample_weights-0.5)

	    for a in [.01, .1, .3, 1, 3, 10, 20, 30, 40, 50, 100]:
                clf = SGDClassifier(alpha=a,loss=algo,n_iter=20) 
	        clf.fit(Xp_t,yp_t,sample_weight=sample_weights)
                clf2 = SGDClassifier(alpha=a,loss=algo,n_iter=20) 
	        clf2.fit(Xp_t,yp_t)
		print "alpha: ", a
	        print "Target score with weights: ", clf.score(Xt,yt)
Пример #48
0
from sklearn.ensemble import RandomForestClassifier

from ray.tune.sklearn import TuneGridSearchCV
from sklearn.model_selection import train_test_split

# Load the data
# NOTE(review): fetch_covtype, np and time are not imported in the visible
# part of this example -- confirm they are imported above.
data = fetch_covtype()
x = data.data
y = data.target
# Hold out 20% of the samples for testing.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Baseline: a random forest with default hyperparameters.
default_model = RandomForestClassifier()
default_model.fit(x_train, y_train)
default_pred = default_model.predict(x_test)
default_params = default_model.get_params()
# Accuracy = number of predictions equal to the true label / number of predictions.
default_accuracy = np.count_nonzero(
    np.array(default_pred) == np.array(y_test)) / len(default_pred)

# Hyperparameter values to search over.
parameter_grid = {
    "n_estimators": [10, 50],
    "max_depth": [5, 50, 100],
    "ccp_alpha": [0.001, 0.01]
}

# Grid search over parameter_grid, scored by accuracy.
tune_search = TuneGridSearchCV(RandomForestClassifier(),
                               param_grid=parameter_grid,
                               scoring="accuracy")

# Record the wall-clock time just before the search is fitted.
start = time.time()
tune_search.fit(x_train, y_train)
# Repeatedly train and score a random forest, tracking the best and the mean accuracy.
# NOTE(review): test_range, best_leaf_value and the features_*/labels_* arrays
# are defined outside this excerpt; best_leaf_value is indexed and assigned at
# positions 0..2 below, so it must be a mutable sequence of at least 3 items.
# test_iterations is not used by the live code in this excerpt.
test_iterations = range(0,1000)
average_score_sum = 0

for x in test_range:
    # NOTE(review): the hyperparameters are fixed (min_samples_leaf=4); x is not
    # passed to the classifier, so it only identifies the iteration.
    clf = RandomForestClassifier(criterion = "entropy",min_samples_leaf=4) # create the random forest classifier
    clf = clf.fit(features_train, labels_train) # train the classifier

    pred = clf.predict(features_test) # create an array of predictions

    # the import statement is executed on every pass through the loop
    from sklearn.metrics import accuracy_score
    acc = accuracy_score(pred, labels_test)  # accuracy of those predictions
    average_score_sum+=acc
    if acc > best_leaf_value[1]:
        best_leaf_value[0]=x  # loop value of the best run so far
        best_leaf_value[1]=acc   # highest single-run accuracy so far
        best_leaf_value[2]= clf.get_params(deep = True)  # parameters of that classifier

# mean accuracy over all runs
average_score = average_score_sum/len(test_range)

print "High Score: ", best_leaf_value[1]
print "Average Score: ", average_score
print "Deets: ", best_leaf_value


'''
for x in test_range:
    print x
    average_score_sum = 0
    for t in test_iterations:
        clf = RandomForestClassifier(min_samples_leaf=x) #create the random forest classifier
        clf = clf.fit(features_train, labels_train) #train the classifier