def fit_decision_tree(train_X, train_y, test_X, test_y):
    # print classification reports
    # print accuracy
    # The format should be
    """
    Classification Report:
             precision    recall  f1-score   support

        0.0       0.80      0.89      0.85      4932
        1.0       0.75      0.60      0.67      2676

    avg / total       0.78      0.79      0.78      7608

    Accuracy: 0.788512092534"""
    dtc = tree.DecisionTreeClassifier()
    dtc = dtc.fit(train_X,train_y.flat)
    pred_y = dtc.predict(test_X)

    print classification_report(test_y, pred_y)
    print accuracy_score(test_y,pred_y)


    # create the graph: export the fitted tree to a dot file
    f = open('tre.dot', 'w')
    tree.export_graphviz(dtc, out_file=f)
    f.close()
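    # Rendering the dot file is a separate step; one option, assuming the
    # Graphviz toolchain is installed, is the command line:
    #     dot -Tpng tre.dot -o tre.png
    # or, from Python, the third-party `graphviz` package:
    #     import graphviz
    #     with open('tre.dot') as dot_file:
    #         graphviz.Source(dot_file.read()).render('tre', format='png')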
def fit_logistic(train_X, train_y, test_X, test_y):
    logreg = linear_model.LogisticRegression()
    logreg = logreg.fit(train_X, train_y.flat)
    pred_y = logreg.predict(test_X)

    # print classification reports
    # print accuracy
    # The format should be

    print classification_report(test_y, pred_y)
    print accuracy_score(test_y,pred_y)

    """
    Classification Report:
             precision    recall  f1-score   support

        0.0       0.80      0.89      0.85      4932
        1.0       0.75      0.60      0.67      2676

    avg / total       0.78      0.79      0.78      7608

    Accuracy: 0.788512092534"""
    # don't worry about the exact values; random sampling may lead to different results
    show_confusion_matrix(test_y,pred_y)
    return pred_y # predicted y values
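# `show_confusion_matrix` is not defined in this snippet; a minimal sketch of
# what such a helper might look like (mirroring the matshow-based plotting used
# further down in these examples):
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

def show_confusion_matrix(y_true, y_pred):
    # plot the raw confusion matrix as an image with a colorbar
    cm = confusion_matrix(y_true, y_pred)
    plt.matshow(cm)
    plt.title('Confusion matrix')
    plt.colorbar()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()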
Example #4
def check_folding(classifier, check_instance=True, has_staged_pp=True, has_importances=True):
    X, y, sample_weight = generate_classification_data(distance=0.6)

    assert classifier == classifier.fit(X, y, sample_weight=sample_weight)
    assert list(classifier.features) == list(X.columns)

    check_classification_model(classifier, X, y, check_instance=check_instance,
                               has_staged_pp=has_staged_pp, has_importances=has_importances)

    def mean_vote(x):
        return numpy.mean(x, axis=0)

    labels = classifier.predict(X, mean_vote)
    proba = classifier.predict_proba(X, mean_vote)
    assert numpy.all(proba == classifier.predict_proba(X, mean_vote))

    score = accuracy_score(y, labels)
    print(score)
    assert score > 0.7
    assert numpy.allclose(proba.sum(axis=1), 1), 'probabilities do not sum to 1'
    assert numpy.all(proba >= 0.), 'negative probabilities'

    auc_score = roc_auc_score(y, proba[:, 1])
    print(auc_score)
    assert auc_score > 0.8
    if has_staged_pp:
        for p in classifier.staged_predict_proba(X, mean_vote):
            assert p.shape == (len(X), 2)
        # checking that last iteration coincides with previous
        assert numpy.all(p == proba)
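# The `mean_vote` callable passed to predict/predict_proba above reduces the
# per-fold outputs to one array; a standalone sketch of that aggregation with
# plain numpy (the (n_folds, n_samples, n_classes) layout is an assumption):
import numpy

fold_probas = numpy.random.dirichlet([1.0, 1.0], size=(4, 10))  # 4 folds, 10 samples, 2 classes

def demo_mean_vote(x):
    # average over the fold axis, keeping (n_samples, n_classes)
    return numpy.mean(x, axis=0)

averaged = demo_mean_vote(fold_probas)
assert numpy.allclose(averaged.sum(axis=1), 1), 'averaged probabilities should still sum to 1'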
def main():
    pipeline = Pipeline([
        ('vect', TfidfVectorizer()),
        ('clf', LogisticRegression())
    ])
    parameters = {
        'vect__max_df': (0.25, 0.5, 0.75),
        'vect__stop_words': ('english', None),
        'vect__max_features': (5000, 10000, None),
        'vect__ngram_range': ((1, 1), (1, 2)),
        'vect__use_idf': (True, False),
        'vect__norm': ('l1', 'l2'),
        'clf__penalty': ('l1', 'l2'),
        'clf__C': (0.1, 1, 10),
    }
    df = pd.read_csv('movie-reviews/train.tsv', header=0, delimiter='\t')
    X, y = df['Phrase'], df['Sentiment'].as_matrix()
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5)
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    print 'Best score: %0.3f' % grid_search.best_score_
    print 'Best parameters set:'
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print '\t%s: %r' % (param_name, best_parameters[param_name])

    predictions = grid_search.predict(X_test)
    print 'Accuracy:', accuracy_score(y_test, predictions)
    print 'Precision:', precision_score(y_test, predictions)
    print 'Recall:', recall_score(y_test, predictions)
    print 'F1 score:', f1_score(y_test, predictions)
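    # Note: the Sentiment column here is multiclass, so on recent scikit-learn
    # versions the precision/recall/F1 calls above need an explicit averaging
    # strategy, e.g.:
    #     print 'F1 score:', f1_score(y_test, predictions, average='weighted')
    # (older releases defaulted to a weighted average and accepted the calls as written)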
Example #6
def calc_metrics(true_labels, predicted_labels):
    """Provide accuracy, precision, recall, and f1 as error measure.

    Parameters
    ----------
    true_labels : list, ndarray
        true labels
    predicted_labels : list, ndarray
        predicted labels

    Returns
    -------
    (float, float, float, float)
        accuracy, precision, recall, f1

    Example
    -------
    >>> y_true = [0, 1, 1, 0]
    >>> y_pred = [0, 0, 1, 1]
    >>> calc_metrics(y_true, y_pred)
    (0.5, 0.5, 0.5, 0.5)
    """
    acc = accuracy_score(true_labels, predicted_labels)
    p, r, f1, _ = precision_recall_fscore_support(true_labels, predicted_labels,
            average='micro')
    return (acc, p, r, f1)
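# With average='micro', precision, recall, and F1 are computed from pooled
# TP/FP/FN counts, so for single-label problems they all coincide with
# accuracy. In the doctest above, 2 of 4 labels match: pooled TP = 2 and
# pooled FP = FN = 2, hence precision = recall = f1 = 2 / (2 + 2) = 0.5.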
Example #7
def calc_metrics(true_labels, predicted_labels):
    """Provide accuracy, precision, recall, and f1 as error measure.

    Parameters
    ----------
    true_labels : list, ndarray
        true labels
    predicted_labels : list, ndarray
        predicted labels

    Returns
    -------
    (float, float, float, float)
        accuracy, precision, recall, f1

    Example
    -------
    >>> y_true = [0, 1, 1, 0]
    >>> y_pred = [0, 0, 1, 1]
    >>> calc_metrics(y_true, y_pred)
    (0.5, 0.5, 0.5, 0.5)
    """
    acc = accuracy_score(true_labels, predicted_labels)
    p, r, f1, _ = precision_recall_fscore_support(true_labels,
                                                  predicted_labels,
                                                  average='micro')
    return (acc, p, r, f1)
Example #8
def test_quality(n_samples=3000):
    testX, testY = generate_sample(n_samples, 10, 0.6)
    trainX, trainY = generate_sample(n_samples, 10, 0.6)

    params = {
        'n_neighbors': 10,
        'n_estimators': 10,
        'uniform_variables': ['column0'],
        'base_estimator':
            DecisionTreeClassifier(min_samples_leaf=20, max_depth=5)
    }

    for algorithm in ['SAMME', 'SAMME.R']:
        uboost_classifier = uBoostClassifier(
            algorithm=algorithm, efficiency_steps=5, **params)

        bdt_classifier = uBoostBDT(algorithm=algorithm, **params)

        for classifier in [bdt_classifier, uboost_classifier]:
            classifier.fit(trainX, trainY)
            predict_proba = classifier.predict_proba(testX)
            predict = classifier.predict(testX)
            assert roc_auc_score(testY, predict_proba[:, 1]) > 0.7, \
                "quality is awful"
            print("Accuracy = %.3f" % accuracy_score(testY, predict))
def main():
    pipeline = Pipeline([
        ('vect', TfidfVectorizer(stop_words='english')),
        ('clf', LogisticRegression())
    ])
    parameters = {
        'vect__max_df': (0.25, 0.5),
        'vect__ngram_range': ((1, 1), (1, 2)),
        'vect__use_idf': (True, False),
        'clf__C': (0.1, 1, 10),
    }
    os.chdir('C:\\Users\\Dan\\1) Python Notebooks\\Datasets')
    df = pd.read_csv('data/train.tsv', header=0, delimiter='\t')
    X, y = df['Phrase'], df['Sentiment'].as_matrix()
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5)
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=3, verbose=1, scoring='accuracy')
    lb = LabelBinarizer()
    y_train = np.array([number[0] for number in lb.fit_transform(y_train)])    
    grid_search.fit(X_train, y_train)
    print 'Best score: %0.3f' % grid_search.best_score_
    print 'Best parameters set:'
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print '\t%s: %r' % (param_name, best_parameters[param_name])
    predictions = grid_search.predict(X_test)
    # binarize y_test the same way as y_train so the binary metrics below line up
    y_test = np.array([number[0] for number in lb.transform(y_test)])
    print 'Accuracy:', accuracy_score(y_test, predictions)
    print 'Precision:', precision_score(y_test, predictions)
    print 'Recall:', recall_score(y_test, predictions)
Example #10
def main():
    pipeline = Pipeline([
        ('vect', TfidfVectorizer(stop_words='english')),
        ('clf', LogisticRegression())
    ])
    parameters = {
        'vect__max_df': (0.25, 0.5),
        'vect__ngram_range': ((1, 1), (1, 2)),
        'vect__use_idf': (True, False),
        'clf__C': (0.1, 1, 10),
    }
    df = pd.read_csv('data/train.tsv', header=0, delimiter='\t')
    X, y = df['Phrase'], df['Sentiment'].as_matrix()
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5)
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=3, verbose=1, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    print 'Best score: %0.3f' % grid_search.best_score_
    print 'Best parameters set:'
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print '\t%s: %r' % (param_name, best_parameters[param_name])
    predictions = grid_search.predict(X_test)
    print 'Accuracy:', accuracy_score(y_test, predictions)
    print 'Confusion Matrix:', confusion_matrix(y_test, predictions)
    print 'Classification Report:', classification_report(y_test, predictions)
Example #11
def main():
    pipeline = Pipeline([('vect', TfidfVectorizer()),
                         ('clf', LogisticRegression())])
    parameters = {
        # 'vect__max_df': (0.25, 0.5, 0.75),
        'vect__stop_words': ('english', None),
        # 'vect__max_features': (5000, 10000, None),
        # 'vect__ngram_range': ((1, 1), (1, 2)),
        # 'vect__use_idf': (True, False),
        # 'vect__norm': ('l1', 'l2'),
        # 'clf__penalty': ('l1', 'l2'),
        # 'clf__C': (0.1, 1, 10),
    }
    df = pd.read_csv('movie-reviews/train.tsv', header=0, delimiter='\t')
    X, y = df['Phrase'], df['Sentiment'].as_matrix()
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5)
    grid_search = GridSearchCV(pipeline,
                               parameters,
                               n_jobs=-1,
                               verbose=1,
                               scoring='accuracy')
    grid_search.fit(X_train, y_train)
    print 'Best score: %0.3f' % grid_search.best_score_
    print 'Best parameters set:'
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print '\t%s: %r' % (param_name, best_parameters[param_name])

    predictions = grid_search.predict(X_test)
    print 'Accuracy:', accuracy_score(y_test, predictions)
    print 'Classification Report:', classification_report(y_test, predictions)

    from sklearn.metrics import confusion_matrix
    cm = confusion_matrix(y_test, predictions)
    print cm

    plt.matshow(cm)
    plt.title('Confusion matrix')
    plt.colorbar()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()
    predictions = np.ones(len(predictions)) * 2
    print 'Accuracy:', accuracy_score(y_test, predictions)
    print 'Degenerate Classification Report:', classification_report(
        y_test, predictions)
Example #12
 def train(self, instances, labels, centroid):
     self.centroid = centroid
     sample_weights = self.sampler_weigher.get_sample_weights(instances, centroid)
     self.sample_centroid = numpy.average(instances, axis=0, weights=sample_weights)
     self.base_estimator.fit(instances, labels, sample_weight=sample_weights)
     instances_oob, labels_oob = instances[sample_weights == 0], labels[sample_weights == 0]
     if len(instances_oob) > 0:
         self.oob_accuracy = accuracy_score(labels_oob, self.predict(instances_oob))
     return self
Example #13
def evaluate(df):
    X = df.iloc[:, 0:7]
    y = df["seed"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    print len(X_train)

    y_test = np.array(y_test)
    clf = LogisticRegression()
    clf.fit(X_train,y_train)

    print "------------",clf.predict_proba(X_test)
    print clf.get_params()

    pipeline = Pipeline([('clf', LogisticRegression())])

    parameters = {}
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=1, verbose=1)

    grid_search.fit(X_train,y_train)


    print "Best score:",grid_search.best_score_
    print "Best parameters set:"
    best_parameters = grid_search.best_estimator_.get_params()

    for param_name in sorted(parameters.keys()):
        print (param_name,best_parameters[param_name])

    prediction = grid_search.predict(X_test)
    for i,pred in enumerate(prediction):
        print "original:",y_test[i],"predicted",pred
    print grid_search.score(X_test,y_test)
    print accuracy_score(y_test,prediction)
    print "classification_report",classification_report(y_test,prediction)
    clf_pred = clf.predict(X_test)
    for i,pred in enumerate(clf_pred):
        print "original:",y_test[i],"predicted",pred
    print accuracy_score(y_test,clf_pred)
    print  clf.score(X_test,y_test)
Example #14
def test_factory():
    factory = ClassifiersFactory()
    try:
        from rep.estimators.tmva import TMVAClassifier
        factory.add_classifier('tmva', TMVAClassifier())
    except ImportError:
        pass
    factory.add_classifier('rf', RandomForestClassifier(n_estimators=10))
    factory.add_classifier('ada', AdaBoostClassifier(n_estimators=20))

    X, y, sample_weight = generate_classification_data()
    assert factory == factory.fit(X, y, sample_weight=sample_weight, features=list(X.columns), parallel_profile='threads-4')
    for cl in factory.values():
        assert list(cl.features) == list(X.columns)
    proba = factory.predict_proba(X, parallel_profile='threads-4')
    labels = factory.predict(X, parallel_profile='threads-4')
    for key, val in labels.items():
        score = accuracy_score(y, val)
        print(key, score)
        assert score > 0.7, key

    for key, val in proba.items():
        assert numpy.allclose(val.sum(axis=1), 1), 'probabilities do not sum to 1'
        assert numpy.all(val >= 0.), 'negative probabilities'

        auc_score = roc_auc_score(y, val[:, 1])
        print(auc_score)
        assert auc_score > 0.8

    for key, iterator in factory.staged_predict_proba(X).items():
        assert key != 'tmva', 'tmva does not support staged pp'
        for p in iterator:
            assert p.shape == (len(X), 2)

        # checking that last iteration coincides with previous
        assert numpy.all(p == proba[key])

    # testing picklability
    dump_string = cPickle.dumps(factory)
    clf_loaded = cPickle.loads(dump_string)

    assert type(factory) == type(clf_loaded)

    probs1 = factory.predict_proba(X)
    probs2 = clf_loaded.predict_proba(X)
    for key, val in probs1.items():
        assert numpy.all(val == probs2[key]), 'something strange was loaded'

    report = ClassificationReport({'rf': factory['rf']}, LabeledDataStorage(X, y, sample_weight))
    report.feature_importance_shuffling(roc_auc_score_mod).plot(new_plot=True, figsize=(18, 3))
    report = factory.test_on_lds(LabeledDataStorage(X, y, sample_weight))
    report = factory.test_on(X, y, sample_weight=sample_weight)
    val = numpy.mean(X['column0'])
    check_report_with_mask(report, "column0 > %f" % (val / 2.), X)
    check_report_with_mask(report, lambda x: numpy.array(x['column0']) < val * 2., X)
    check_report_with_mask(report, None, X)
Example #15
File: model.py Project: lossrual/recomm
 def run_model(self, train_path, test_path):
     trainx, trainy = self.load_data(train_path)
     self.train_model(trainx, trainy)
     testx, testy = self.load_data(test_path)
     predy = self.predict_res(testx)
     accuracy = accuracy_score(testy, predy) 
     label = [1, 0]
     classifier = ['interested', 'nointerested']
     result = classification_report(testy, predy, labels=label, target_names = classifier) + '\naccuracy\t' + str(accuracy)
     print result
Example #16
def main():
    pipeline = Pipeline([
        ('vect', TfidfVectorizer()),
        ('clf', LogisticRegression())
    ])
    parameters = {
        # 'vect__max_df': (0.25, 0.5, 0.75),
        'vect__stop_words': ('english', None),
        # 'vect__max_features': (5000, 10000, None),
        # 'vect__ngram_range': ((1, 1), (1, 2)),
        # 'vect__use_idf': (True, False),
        # 'vect__norm': ('l1', 'l2'),
        # 'clf__penalty': ('l1', 'l2'),
        # 'clf__C': (0.1, 1, 10),
    }
    df = pd.read_csv('movie-reviews/train.tsv', header=0, delimiter='\t')
    X, y = df['Phrase'], df['Sentiment'].as_matrix()
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5)
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    print 'Best score: %0.3f' % grid_search.best_score_
    print 'Best parameters set:'
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print '\t%s: %r' % (param_name, best_parameters[param_name])

    predictions = grid_search.predict(X_test)
    print 'Accuracy:', accuracy_score(y_test, predictions)
    print 'Classification Report:', classification_report(y_test, predictions)

    from sklearn.metrics import confusion_matrix
    cm = confusion_matrix(y_test, predictions)
    print cm

    plt.matshow(cm)
    plt.title('Confusion matrix')
    plt.colorbar()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()
    predictions = np.ones(len(predictions)) * 2
    print 'Accuracy:', accuracy_score(y_test, predictions)
    print 'Degenerate Classification Report:', classification_report(y_test, predictions)
 def Run(self, trainFileDir, testFileDir):
     XTrain, yTrain = self.loadData(trainFileDir)
     self.trainModel(XTrain, yTrain)
     XTest, yTest = self.loadData(testFileDir)
     yPred = self.predict(XTest)
     accuracy = accuracy_score(yTest, yPred)
     #precision, recall, fScore, _ = precision_recall_fscore_support(y, yPred) 
     labels = [1, 0]
     classNames = ['interested', 'notInterested']
     report = classification_report(yTest, yPred, labels=labels, target_names=classNames) + '\naccuracy\t' + str(accuracy)
     print report
def evaluate(df):
    X = df.iloc[:, 0:7]
    y = df["seed"]
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.1,
                                                        random_state=42)
    print len(X_train)

    y_test = np.array(y_test)
    clf = LogisticRegression()
    clf.fit(X_train, y_train)

    print "------------", clf.predict_proba(X_test)
    print clf.get_params()

    pipeline = Pipeline([('clf', LogisticRegression())])

    parameters = {}
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=1, verbose=1)

    grid_search.fit(X_train, y_train)

    print "Best score:", grid_search.best_score_
    print "Best parameters set:"
    best_parameters = grid_search.best_estimator_.get_params()

    for param_name in sorted(parameters.keys()):
        print(param_name, best_parameters[param_name])

    prediction = grid_search.predict(X_test)
    for i, pred in enumerate(prediction):
        print "original:", y_test[i], "predicted", pred
    print grid_search.score(X_test, y_test)
    print accuracy_score(y_test, prediction)
    print "classification_report", classification_report(y_test, prediction)
    clf_pred = clf.predict(X_test)
    for i, pred in enumerate(clf_pred):
        print "original:", y_test[i], "predicted", pred
    print accuracy_score(y_test, clf_pred)
    print clf.score(X_test, y_test)
Example #19
File: report.py Project: etamponi/eole
    def analyze_run(self, prediction_matrix, labels):
        for j in range(self.accuracy_sample.shape[1]):
            predictions = prediction_matrix[:, j]
            self.accuracy_sample[self.current_run][j] = accuracy_score(
                labels, predictions)

            precision, recall, f1 = precision_recall_f1_score(
                labels, predictions)
            self.precision_sample[self.current_run][j] = precision
            self.recall_sample[self.current_run][j] = recall
            self.f1_sample[self.current_run][j] = f1
        self.current_run += 1
 def Run(self, trainFileDir, testFileDir):
     XTrain, yTrain = self.loadData(trainFileDir)
     self.trainModel(XTrain, yTrain)
     XTest, yTest = self.loadData(testFileDir)
     yPred = self.predict(XTest)
     accuracy = accuracy_score(yTest, yPred)
     #precision, recall, fScore, _ = precision_recall_fscore_support(y, yPred)
     labels = [1, 0]
     classNames = ['interested', 'notInterested']
     report = classification_report(
         yTest, yPred, labels=labels,
         target_names=classNames) + '\naccuracy\t' + str(accuracy)
     print report
def fit_logistic(train_X, train_y, test_X, test_y):
    logreg = linear_model.LogisticRegression()
    logreg = logreg.fit(train_X, train_y.flat)
    pred_y = logreg.predict(test_X)

    # print classification reports
    # print accuracy
    # The format should be

    print classification_report(test_y, pred_y)
    print accuracy_score(test_y, pred_y)
    """
    Classification Report:
             precision    recall  f1-score   support

        0.0       0.80      0.89      0.85      4932
        1.0       0.75      0.60      0.67      2676

    avg / total       0.78      0.79      0.78      7608

    Accuracy: 0.788512092534"""
    # don't worry about the exact values; random sampling may lead to different results
    show_confusion_matrix(test_y, pred_y)
    return pred_y  # predicted y values
Example #22
 def train(self, instances, labels, centroid):
     self.centroid = centroid
     sample_weights = self.sampler_weigher.get_sample_weights(
         instances, centroid)
     self.sample_centroid = numpy.average(instances,
                                          axis=0,
                                          weights=sample_weights)
     self.base_estimator.fit(instances,
                             labels,
                             sample_weight=sample_weights)
     instances_oob, labels_oob = instances[sample_weights == 0], labels[
         sample_weights == 0]
     if len(instances_oob) > 0:
         self.oob_accuracy = accuracy_score(labels_oob,
                                            self.predict(instances_oob))
     return self
Example #23
def Classify(txtList, txtLabels, fileName, labelList):
    x_train = np.array(txtList[0:300])
    y_train = np.array(txtLabels[0:300])
    x_test = np.array(txtList[301:])
    y_test = np.array(txtLabels[301:])
    classifier = Pipeline([
        ('vectorizer', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', OneVsRestClassifier(LinearSVC()))])
    classifier.fit(x_train, y_train)
    predicted = classifier.predict(x_test)
    f=open(fileName,'w')
    f.writelines(metrics.classification_report(y_test, predicted,target_names=labelList))
    f.write('\nNumber of Labels:'+str(len(labelList)))
    f.write('\nhamming loss : '+str(metrics.hamming_loss(y_test,predicted)))
    f.write('\nf-beta(beta=0.5 - biased towards Precision) : '+str(metrics.fbeta_score(y_test,predicted,0.5)))
    f.write('\nzero-loss:'+str(zero_one_loss(y_test,predicted)))
    f.write('\nAccuracy score:'+str(metrics.accuracy_score(y_test,predicted)))
    f.close()
def main(argv):

    try:
        opts, args = getopt.getopt(argv, "d:c:")

    except getopt.GetoptError:

        sys.exit(2)
    for opt, arg in opts:
        if opt == '-d':
            data_file = arg
        elif opt == '-c':
            label_col = int(arg)

    y_true = np.genfromtxt(data_file,
                           usecols=label_col,
                           delimiter="\t",
                           skip_header=1)
    for lab in range(2, 9):
        print "lab", lab
        y_pred = np.genfromtxt(data_file,
                               usecols=lab,
                               delimiter="\t",
                               skip_header=1)
        print "The classification report for Algorithm", lab, "is \n"
        #Make classification report
        print metrics.classification_report(y_true, y_pred)
        print "Accuracy: %.6f" % metrics.accuracy_score(y_true, y_pred)
        #Compute specificity from confusion amtrix
        cm = confusion_matrix(y_true, y_pred)
        print "Confusion matrix as \n", cm
        tn = int(cm[0, 0])
        fp = int(cm[0, 1])
        print "tn", tn
        print "fp", fp
        s = tn / (tn + fp)
        print "Speicificity is", s, "\n"
        print "Metthiew correlation co-efficient: %.6f" % matthews_corrcoef(
            y_true, y_pred)
Example #26
def main(argv=None):

    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    log.info('loading data')
    true = []
    pred = []

    for line in file_line_generator(args.true_labels):
        true.append(line)

    for line in file_line_generator(args.pred_labels):
        pred.append(line)

    acc = accuracy_score(true, pred)
    log.info('accuracy: %f' % acc)

    if args.precision or args.recall or args.f_measure:
        p, r, f, _ = precision_recall_fscore_support(
            true,
            pred,
            args.beta,
            pos_label=args.pos_label,
            average=None if not args.avg else args.avg)

        if args.precision:
            log.info('precision: %f' % p)
        if args.recall:
            log.info('recall: %f' % r)
        if args.f_measure:
            log.info('f-measure: %f' % f)

    log.info('finished')

if __name__ == '__main__':
    if len(sys.argv) != 2:
        print ("Usage: Best_configuration.py <Testing_Samples_Location>")
        exit(1)

    test = sys.argv[1]

    header_test = []
    test_labels = []
    i = 0
    for root, dirs, files in os.walk(test):
        for name in files:
            fo = open(root +"/"+name, "r")
            content = fo.read().replace('\n', ' ')
            body = re.sub(r'^(.*) Lines: (\d)+ ', "", content)
            header_test.append(unicode(body,errors='ignore'))
            test_labels.append(i)
        i=i+1

    text_clf01 = joblib.load('Training_model.pkl')
    predicted01 = text_clf01.predict(header_test)
    print("Removed Stop Words + L2 penalization")
    print ("F1:",metrics.f1_score(test_labels, predicted01, average='macro'))
    print ("accuracy:", metrics.accuracy_score(test_labels, predicted01))
    print ("precision:",metrics.precision_score(test_labels, predicted01, average='macro'))
    print ("recall:",metrics.recall_score(test_labels, predicted01, average='macro'))
Example #28
plt.show()

#There's a pitch-perfect illustration of overfitting. Look at the gulf between the training and cv scores. As we train
#on more and more examples, the training score decreases and the cv score increases, but we'd need exponentially more
#examples to close the gap between the two. Let's confirm our understanding by looking at the test scores.
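#A learning-curve plot makes the gulf described above visible directly; a
#minimal sketch using modern scikit-learn (estimator/X_train/y_train as above):
#
#    from sklearn.model_selection import learning_curve
#    sizes, train_scores, cv_scores = learning_curve(
#        estimator, X_train, y_train, cv=5,
#        train_sizes=np.linspace(0.1, 1.0, 5))
#    plt.plot(sizes, train_scores.mean(axis=1), label='train')
#    plt.plot(sizes, cv_scores.mean(axis=1), label='cv')
#    plt.legend()
#    plt.show()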

# In[23]:

#Let's see how our trained model performs on the test set. We are not going to train on this set merely looking at how well
#our model can generalize.

#Calling Fit on the estimator object so we can predict. We're NOT retraining the classifier here.
estimator.fit(X_train, y_train)
y_pred = estimator.predict(X_test)
print metrics.classification_report(y_test, y_pred)
print "Decision Trees: Final Generalization Accuracy: %.6f" % metrics.accuracy_score(
    y_test, y_pred)

#That's not too bad but we can get a much better result if we addressed the overfitting problem. Let's now try the random
#forests classifier to see how it does.

# In[25]:

#WARNING - THIS MIGHT TAKE A WHILE TO RUN. TRY ADJUSTING parameters such as n_jobs (jobs to run in parallel, before
#increasing this make sure your system can handle it), n_iter for ShuffleSplit (in the function definition) and reducing
#number of values being tried for max_depth/n_estimators.

#SELECT INTERRUPT IN THE MENU AND PRESS INTERRUPT KERNEL IF YOU NEED TO STOP EXECUTION

max_depth = np.linspace(5, 10, 5)
n_estimators = [10, 100, 1000]
Example #29
import pylab as pl

features_train, labels_train, features_test, labels_test = makeTerrainData()



#################################################################################


########################## DECISION TREE #################################



#### your code goes here
from classifyDT import classify
from sklearn.metrics import accuracy_score
clf = classify(features_train, labels_train,50.0)

clf.fit(features_train,labels_train)
pred = clf.predict_proba(features_test)
roundedNumber = []
for i in range(0,len(pred)):
    roundedNumber.append(round(pred[i,1]))
acc = accuracy_score(labels_test, roundedNumber)
print acc  # accuracy on the test set


    
def submitAccuracies():
    return {"acc":round(acc,3)}
Example #30
File: gmm.py Project: ggaemo/deepanomaly
                        
            for multiplier in np.linspace(0.5, 1.5, 10):      

                threshold = np.percentile(y_prob, anomaly_prob * multiplier)
                y_label = list()
                
                for elem in y_prob:
                    if elem > threshold:
                        label = 1
                    else:
                        label = 0
                    y_label.append(label)
                
                result = classification_report(y, y_label,labels = [0,1], target_names = ['anomaly', 'normal'])
                f1 = f1_score(y, y_label, pos_label = 0)
                accuracy = accuracy_score(y, y_label)
                
#                 print '--------------------------------------------------------'
#                 print 'temp : ', data_name, (n_comp, cov_type), f1
#                 print multiplier, anomaly_prob, threshold
#                 print result
#                 print '--------------------------------------------------------'
                if data_name.endswith('.txt'):
                    encode_output = data_name.split('_')[4]
                    dbn_model = data_name[data_name.rindex('_')+1:data_name.index('.')]                    
                else:
                    encode_output = '-'
                    dbn_model = '-'
                
                
                result_table.loc[pos, :] = [data_name, dbn_model, encode_output, n_comp, cov_type, anomaly_prob, multiplier, f1, accuracy]
def whole_dataset_train_test(X, y):
    rfpred = RandomForestClassifier().fit(X,y)
    pred = rfpred.predict(X)
    print "When fitted on the whole dataset with selected features, then the classification report is found to be:\n";
    print "Random Forests: Accuracy: %.6f" %metrics.accuracy_score(y,pred)
    print metrics.classification_report(y, pred)
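# Scoring on the same rows used for fitting (as above) overstates accuracy; a
# held-out split gives a more honest estimate, e.g. (train_test_split lives in
# sklearn.model_selection on newer versions, sklearn.cross_validation on older):
#
#     X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3)
#     pred = RandomForestClassifier().fit(X_tr, y_tr).predict(X_te)
#     print metrics.accuracy_score(y_te, pred)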
Example #32
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.metrics import classification_report

y_true_all = []
predictions_all = []
for label in good_categories[:3]:
    print 'label', label
    y_train = [1 if label in instance else 0 for instance in y_train_all]
    y_test = [1 if label in instance else 0 for instance in y_test_all]
    y_true_all.append(y_test)
    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    predictions_all.append(predictions)
    print classification_report(y_test, predictions)
    print confusion_matrix(y_test, predictions)
    print 'precision', precision_score(y_test, predictions)
    print 'recall', recall_score(y_test, predictions)
    print 'accuracy', accuracy_score(y_test, predictions)
    print '\n'

y_true_all = np.array(y_true_all)
predictions_all = np.array(predictions_all)

print hamming_loss(y_true_all, predictions_all)
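# For this binary-relevance setup, hamming_loss is simply the fraction of
# (label, sample) cells where truth and prediction disagree; equivalently:
print np.mean(y_true_all != predictions_all)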
vect = CountVectorizer() 

train_dtm = vect.fit_transform(X_train)
test_dtm = vect.transform(X_test)
#Task 5
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()
nb.fit(train_dtm, y_train)

y_pred = nb.predict(test_dtm)

from sklearn import metrics

print metrics.accuracy_score(y_test, y_pred)
#92% Accuracy 
#Task 6 
# Map five to 1 and 1 to 0 
y_test[y_test ==1]  = 0
y_test[y_test == 5 ] = 1


y_pred_prob = nb.predict_proba(test_dtm)[:,1]
print metrics.roc_auc_score(y_test, y_pred_prob)
#Task 7
import matplotlib.pyplot as plt
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_prob)
plt.plot(fpr, tpr)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
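# Axis labels and a dashed chance-level diagonal are common optional additions
# to the ROC plot above:
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.plot([0, 1], [0, 1], 'k--')  # random-guess baseline
plt.show()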
    X_train, y_train = X[train_indices], y[train_indices]
    X_test, y_test = X[test_indices], y[test_indices]
    n_repeat = 100
    n_experts = 1
    y_preds = numpy.zeros((len(y_test), n_experts), dtype=int)
    for e in range(n_experts):
        indices = numpy.random.choice(len(X_train), size=int(15.0*len(X_train)), replace=True)
        X_train_exp, y_train_exp = X_train[indices], y_train[indices]
        for k, x in enumerate(X_test):
            X_matrix = numpy.tile([x], (n_repeat, 1))
            X_total = numpy.vstack((X_train_exp, X_matrix))
            y_total = numpy.hstack((y_train_exp, numpy.zeros(n_repeat)))
            value = 0
            for p in classes:
                y_total[-n_repeat:] = p
                curr_value = measure(X_total, y_total)
                if curr_value == 0 or curr_value == 1:
                    print "OPS!"
                if curr_value > value:
                    y_preds[k, e] = p
                    value = curr_value
    y_pred = numpy.asarray([numpy.bincount(row).argmax() for row in y_preds])
    curr_accuracies[fold] = accuracy_score(y_test, y_pred)
    print curr_accuracies[fold],
    numpy.random.seed(fold)
    curr_knn[fold] = DecisionTreeClassifier().fit(X_train, y_train).score(X_test, y_test)
    print curr_knn[fold]

print curr_accuracies.mean()
print curr_knn.mean()
def print_classification_report(y_test_report, y_predicted_report, target_names):
    # target_names = ['class 0', 'class 1']
    print ("overall accuracy score of the classifier is")
    print accuracy_score(y_test_report, y_predicted_report)
    print (classification_report(np.array(y_test_report), np.array(y_predicted_report), target_names=target_names))
    return None
Example #36
                                                    test_size=0.33,
                                                    random_state=42)

from sklearn.ensemble import RandomForestClassifier

#it froze with 1000000
#try with more parameters
classifier = RandomForestClassifier(n_estimators=100)
classifier.fit(X_train, y_train)
prediction = classifier.predict(X_test)

#print X_train.shape

from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score

print '\nAccuracy:', accuracy_score(y_test, prediction)
print '\nscore:', classifier.score(X_train, y_train)
print '\nrecall:', recall_score(y_test, prediction)
print '\nprecision:', precision_score(y_test, prediction)
print '\n classification report:\n', classification_report(y_test, prediction)
print '\n confusion matrix:\n', confusion_matrix(y_test, prediction)

#plots:

import matplotlib.pyplot as plt
confusion_matrix_plot = confusion_matrix(y_test, prediction)
plt.matshow(confusion_matrix_plot)  # draw the matrix so the colorbar has an image to attach to
plt.title('confusion matrix')
plt.colorbar()
plt.xlabel('true category')
plt.ylabel('predicted category')
plt.show()
Example #37
    processed_comment_list = []
    for art in commentList.items():
        for comm in art[1]:
            processed_comment_list.append(comm.body.decode('ascii', 'ignore'))
    features = vectorizer.transform(processed_comment_list)

    y_train = load_numpy_matrix(feature_set_path + r'valueVector' + tag +
                                '_train.npy')
    y_test = load_numpy_matrix(feature_set_path + r'valueVector' + tag +
                               '_test.npy')

    print features.shape
    print y_train.shape
    print y_test.shape

    valueVector = np.concatenate([y_train, y_test])
    print
    print valueVector.shape

    # train_list = [' '.join(sent) for sent in train_list]
    # test_list = [' '.join(sent) for sent in test_list]
    predicted = [float(v) for v in clf.predict(features)]

    print "Accuracy: %0.3f " % (accuracy_score(valueVector, predicted))

    print classification_report(valueVector,
                                predicted,
                                target_names=['0', '1'])
    print draw_confusion_matrix(valueVector, predicted, ['ham', 'spam'])
Example #38
    X_train = vectorizer.fit_transform(corpus_train)

    X_test = vectorizer.transform(corpus_test)

    clf = RandomForestClassifier(n_estimators=10)
    #clf = KNeighborsClassifier(n_neighbors=10)
    #clf = LinearSVC()

    clf.fit(X_train, y_train)

    print len(y_train)
    print len(y_test)

    pred = clf.predict(X_test)

    #pred = ['0']* len(y_test)
    score = metrics.accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % score)
    total.append(score)

    n = 20

#     feature_names = vectorizer.get_feature_names()
#     coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
#     top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
#     for (coef_1, fn_1), (coef_2, fn_2) in top:
#         print "\t%.4f\t%-15s\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2)

print np.mean(total)
    lines = tLine.rstrip().split('|@~')
    tweet = lines[0]
    sentiment = lines[1]
    processedTweet = processTweet(tweet)
    featureVector = getFeatureVector(processedTweet)
    testTweets.append((featureVector, sentiment))
    tLine = tp.readline()
# end loop

# Train the SVM Classifier
result_train = getSVMFeatureVectorAndLabels(tweets, featureList)
result_test = getSVMFeatureVectorAndLabels(testTweets, featureList)
# Split the data into a training set and a test set
data_train = result_train['feature_vector']
target_train = result_train['labels']
data_test = result_test['feature_vector']
target_test = result_test['labels']

# Run SVM Classifier
SVMClassifier = svm.SVC(kernel='linear')

target_pred = SVMClassifier.fit(data_train, target_train).predict(data_test)

targetNames = ['cessation', 'no cessation']
print "Classification by SVM Classifier"
print classification_report(target_test, target_pred, target_names=targetNames)
print confusion_matrix(target_test, target_pred)
print accuracy_score(target_test, target_pred)

features_train, labels_train, features_test, labels_test = makeTerrainData()

def submitAccuracies():
    return {"acc_min_samples_split_2": round(acc_min_samples_split_2, 3),
            "acc_min_samples_split_50": round(acc_min_samples_split_50, 3)}


########################## DECISION TREE #################################


### your code goes here--now create 2 decision tree classifiers,
### one with min_samples_split=2 and one with min_samples_split=50
### compute the accuracies on the testing data and store
### the accuracy numbers to acc_min_samples_split_2 and
### acc_min_samples_split_50, respectively
from classifyDT import classify
from sklearn.metrics import accuracy_score
clf = classify(features_train, labels_train,50.0)
pred = clf.predict_proba(features_test)
roundedNumber = []
for i in range(0,len(pred)):
    roundedNumber.append(round(pred[i,1]))
acc_min_samples_split_50 = accuracy_score(labels_test, roundedNumber)


clf = classify(features_train, labels_train,2.0)

pred = clf.predict_proba(features_test)
acc_min_samples_split_2 = accuracy_score(labels_test, pred[:, 1])
print submitAccuracies()
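# Equivalent, without rounding predict_proba by hand: a decision tree's
# predict() already returns the majority-class label, so (for either clf)
#     accuracy_score(labels_test, clf.predict(features_test))
# gives effectively the same accuracy.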
def print_classification_report(y_test_report, y_predicted_report,target_names):
    #target_names = ['class 0', 'class 1']
    print ("overall accuracy score of the classifier is")
    print accuracy_score(y_test_report, y_predicted_report)
    print(classification_report(np.array(y_test_report), np.array(y_predicted_report), target_names=target_names))
    return None
Example #42
def submitAccuracies():
    return {
        "acc_min_samples_split_2": round(acc_min_samples_split_2, 3),
        "acc_min_samples_split_50": round(acc_min_samples_split_50, 3)
    }


########################## DECISION TREE #################################

### your code goes here--now create 2 decision tree classifiers,
### one with min_samples_split=2 and one with min_samples_split=50
### compute the accuracies on the testing data and store
### the accuracy numbers to acc_min_samples_split_2 and
### acc_min_samples_split_50, respectively
from classifyDT import classify
from sklearn.metrics import accuracy_score
clf = classify(features_train, labels_train, 50.0)
pred = clf.predict_proba(features_test)
roundedNumber = []
for i in range(0, len(pred)):
    roundedNumber.append(round(pred[i, 1]))
acc_min_samples_split_50 = accuracy_score(labels_test, roundedNumber)

clf = classify(features_train, labels_train, 2.0)

pred = clf.predict_proba(features_test)
acc_min_samples_split_2 = accuracy_score(labels_test, pred[:, 1])
print submitAccuracies()
Example #43
        train_vectors.append(item[1])

train_vectors = array(train_vectors)

if TRY_WITH_PREBUILD:
    print('Now building a classifier for our initial test, how does it do on pre-computed vectors.')
    # The paper uses a neural network, whatever that is...

    clf = SVC(C=50.0, kernel='linear')

    # For our first test we use a subset of train data
    clf.fit(train_vectors[:20000], train_targets[:20000])

    print('Without loading in new stuff, lets get an idea of what we can do.')
    predicted = clf.predict(train_vectors[20000:25000])
    acc = metrics.accuracy_score(train_targets[20000:25000], predicted)
    print('Accuracy: ', str(acc * 100.0) + '%')
    del clf
    print('Now we got some new reviews coming in.\n'
          '\tBut before we read them lets rebuild the classifier with all available data.')
else:
    print('Now building a classifier')

clf = SVC(C=50.0, kernel='linear')
clf.fit(train_vectors, train_targets)


print('Extending vocab and building vectors for new labels')

# Freeze the words; should only matter for dm (high inflection)?
model_dm.train_words = False
#There's a pitch-perfect illustration of overfitting. Look at the gulf between the training and cv scores. As we train
#on more and more examples, the training score decreases and the cv score increases, but we'd need exponentially more
#examples to close the gap between the two. Let's confirm our understanding by looking at the test scores.


# In[23]:

#Let's see how our trained model performs on the test set. We are not going to train on this set merely looking at how well
#our model can generalize.

#Calling Fit on the estimator object so we can predict. We're NOT retraining the classifier here.
estimator.fit(X_train, y_train)
y_pred=estimator.predict(X_test)
print metrics.classification_report(y_test,y_pred)
print "Decision Trees: Final Generalization Accuracy: %.6f" %metrics.accuracy_score(y_test,y_pred)

#That's not too bad but we can get a much better result if we addressed the overfitting problem. Let's now try the random
#forests classifier to see how it does.


# In[25]:

#WARNING - THIS MIGHT TAKE A WHILE TO RUN. TRY ADJUSTING parameters such as n_jobs (jobs to run in parallel, before 
#increasing this make sure your system can handle it), n_iter for ShuffleSplit (in the function definition) and reducing 
#number of values being tried for max_depth/n_estimators.

#SELECT INTERRUPT IN THE MENU AND PRESS INTERRUPT KERNEL IF YOU NEED TO STOP EXECUTION

max_depth=np.linspace(5,10,5)
n_estimators=[10, 100, 1000]
Example #45
train_vectors = array(train_vectors)

if TRY_WITH_PREBUILD:
    print(
        'Now building a classifier for our initial test, how does it do on pre-computed vectors.'
    )
    # The paper uses a neural network, whatever that is...

    clf = SVC(C=50.0, kernel='linear')

    # For our first test we use a subset of train data
    clf.fit(train_vectors[:20000], train_targets[:20000])

    print('Without loading in new stuff, lets get an idea of what we can do.')
    predicted = clf.predict(train_vectors[20000:25000])
    acc = metrics.accuracy_score(train_targets[20000:25000], predicted)
    print('Accuracy: ', str(acc * 100.0) + '%')
    del clf
    print(
        'Now we got some new reviews coming in.\n'
        '\tBut before we read them lets rebuild the classifier with all available data.'
    )
else:
    print('Now building a classifier')

clf = SVC(C=50.0, kernel='linear')
clf.fit(train_vectors, train_targets)

print('Extending vocab and building vectors for new labels')

# Freeze the words; should only matter for dm (high inflection)?
Example #46
        if opts.mode in ['age', 'gender']:
            # Setting up the learner
            verbose("   Training fold   (%i)" % (i + 1))
            from sklearn.ensemble import RandomForestClassifier
            classifier = RandomForestClassifier(n_estimators=opts.estimators,
                                                class_weight=weight)

            # Training
            classifier.fit(X_train, y_train)

            # Predicting
            verbose("   Predicting fold (%i)" % (i + 1))
            prediction = classifier.predict(X_test)

            verbose('   Accuracy fold   (%i):' % (i + 1),
                    accuracy_score(y_test, prediction))
            y_.extend(y_test)
            prediction_.extend(prediction)

        else:
            # Setting up the learner
            verbose("   Regressing fold   (%i)" % (i + 1))
            from sklearn.ensemble import RandomForestRegressor
            regressor = RandomForestRegressor(n_estimators=opts.estimators)

            # Training
            regressor.fit(X_train, y_train)

            # Predicting
            verbose("   Predicting fold (%i)" % (i + 1))
            prediction = regressor.predict(X_test)
Example #47
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)


from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.metrics import classification_report

y_true_all = []
predictions_all = []
for label in good_categories[:3]:
    print 'label', label
    y_train = [1 if label in instance else 0 for instance in y_train_all]
    y_test = [1 if label in instance else 0 for instance in y_test_all]
    y_true_all.append(y_test)
    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    predictions_all.append(predictions)
    print classification_report(y_test, predictions)
    print confusion_matrix(y_test, predictions)
    print 'precision', precision_score(y_test, predictions)
    print 'recall', recall_score(y_test, predictions)
    print 'accuracy', accuracy_score(y_test, predictions)
    print '\n'


y_true_all = np.array(y_true_all)
predictions_all = np.array(predictions_all)

print hamming_loss(y_true_all, predictions_all)
Example #48
	for tweet in reader[0:2*(numironicos/3)]:
		tweets_train.append(tweet["text"])
		labels_train.append("noironia")
	for tweet in reader[2*(numironicos/3):]:
		tweets_test.append(tweet["text"])
		labels_test.append("noironia")

stop_words = []
f = open("spanish.txt") 
for line in f:
	stop_words.append(line.strip())

f.close()

y_train = np.array(labels_train, dtype=object) 
y_test = np.array(labels_test, dtype=object) 

vectorizer = TfidfVectorizer(input='content', max_df=0.5, stop_words = stop_words)
X_train = vectorizer.fit_transform(np.array(tweets_train, dtype=object))
X_test = vectorizer.transform(np.array(tweets_test, dtype=object))
classifier = RandomForestClassifier(n_estimators = 10)
classifier.fit(X_train.toarray(), y_train)
prediction = classifier.predict(X_test.toarray())

print '\nAccuracy :', accuracy_score(y_test, prediction)
print '\nPrecision :', precision_score(y_test, prediction)
print '\nRecall :', recall_score(y_test, prediction)
print '\nF-score :', f1_score(y_test, prediction)
print '\nClassification report:\n', classification_report(y_test, prediction)
print '\nConfusion matrix :\n', confusion_matrix(y_test, prediction)
Example #49
def main(argv):

    # get options passed at command line
    try:
        opts, args = getopt.getopt(argv, "d:o:c:C:t:m:")
    except getopt.GetoptError:
        #print helpString
        sys.exit(2)
    #print opts
    for opt, arg in opts:
        if opt == '-d':
            data_file = arg
        elif opt == '-o':
            out_folder = arg
        elif opt == '-c':
            label_col = int(arg)
        elif opt == '-C':
            data_cols = arg
        elif opt == '-t':
            test_file = arg  # whole-genome prediction file
        elif opt == '-m':
            model_file = arg

    model_filename = os.path.abspath(model_file)
    data_file = os.path.abspath(data_file)
    test_file = os.path.abspath(test_file)
    print model_file, "\n"
    data_cols = [int(x) for x in data_cols.split(",")]
    x_data = np.loadtxt(data_file, usecols=data_cols, delimiter="\t", skiprows=1)
    y_data = np.genfromtxt(data_file, usecols=label_col, delimiter="\t", skip_header=1)
    test_x_data = np.loadtxt(test_file, usecols=data_cols, delimiter="\t", skiprows=1)
    test_y_data = np.genfromtxt(test_file, usecols=label_col, delimiter="\t", skip_header=1)
    
    #Load the model file#
    estimator = joblib.load(model_filename)

    #perform same scaling on training and testing data
    x_data, test_x_data = scaling_training_testing_data(x_data, test_x_data)
    np.random.seed(0)
    indices = np.random.permutation(len(test_x_data))
    test_x_data = test_x_data[indices]
    test_y_data = test_y_data[indices]
    cols = 0
    with open (test_file,"r") as temp:
        a =  '\n'.join(line.strip("\n") for line in temp)
        b = np.genfromtxt(StringIO(a), usecols = cols, delimiter="\t", dtype=None, skip_header=1)
        enhancer_names_test = b[indices]
    temp.close()
    y_FAN_pred = estimator.predict(test_x_data)
    y_score_test = estimator.predict_proba(test_x_data)
    print metrics.classification_report(test_y_data,y_FAN_pred)
    combined_test = zip(enhancer_names_test, test_y_data, y_FAN_pred, y_score_test[:,0], y_score_test[:,1])
    #f = open(out_folder + "/subroutine_RF_FANTOM_FeatureSelected_pred.txt", 'w')
    f = open(out_folder + "/GM12878_FANTOM_RF_FeatureSelected_ROC.txt", 'w')
    f.write("Enhancer_name\tY_true_labels\tY_predicted_labels\tProb_Class0\tProb_class1\n")
    for i in combined_test:
        line = '\t'.join(str(x) for x in i)
        f.write(line + '\n')
    f.close()
    print "Random Forests: On FANTOM, Final Generalization Accuracy: %.6f" %metrics.accuracy_score(test_y_data,y_FAN_pred)
    print metrics.classification_report(test_y_data,y_FAN_pred)
    print "Number of mislabeled points : %d" % (test_y_data != y_FAN_pred).sum()
    print metrics.classification_report(test_y_data,y_FAN_pred)
    print "Random Forests: Final Generalization Accuracy: %.6f" %metrics.accuracy_score(test_y_data,y_FAN_pred)
    # Before we move on, let's look at a key parameter that RF returns, namely feature_importances.
    # This tells us which features in our dataset seemed to matter the most
    # (although it won't matter much in the present scenario with only 2 features).
    print estimator.feature_importances_

#Plot ROC#
    roc_plt = plot_roc(estimator, test_x_data, test_y_data, y_FAN_pred)
    #pl.savefig(out_folder + "/subroutine_RF_FeatureSelected_split_test_train_Kfold.svg", transparent=True, bbox_inches='tight', pad_inches=0.2)
    pl.savefig(out_folder + "/GM12878_FANTOM_RF_FeatureSelected_ROC.svg", transparent=True, bbox_inches='tight', pad_inches=0.2)
    roc_plt.show()
Example #50
    X_train = vectorizer.fit_transform(corpus_train)

    X_test = vectorizer.transform(corpus_test)
    
    clf = RandomForestClassifier(n_estimators=10)
    #clf = KNeighborsClassifier(n_neighbors=10)
    #clf = LinearSVC()
    
    clf.fit(X_train, y_train)
    
    print len(y_train)
    print len(y_test)
    
    pred = clf.predict(X_test)
    
    #pred = ['0']* len(y_test)
    score = metrics.accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % score)
    total.append(score)
    
    n = 20
    
#     feature_names = vectorizer.get_feature_names()
#     coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
#     top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
#     for (coef_1, fn_1), (coef_2, fn_2) in top:
#         print "\t%.4f\t%-15s\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2)                   
                    
    
print np.mean(total)
            from sklearn.ensemble import RandomForestClassifier
            classifier=RandomForestClassifier(n_estimators=10000, criterion='entropy')
            
            #classifier = SVC(C=10, kernel='linear', 
            #gamma=10, coef0=0.0, shrinking=True, 
            #probability=False, tol=0.001, cache_size=20000, 
            #class_weight='auto', verbose=False, max_iter=-1, 
            #random_state=None)
            # Training
            classifier.fit(X_train, y_train)

            # Prediciendo
            verbose("   Predicting fold (%i)"%(i+1))
            prediction = classifier.predict(X_test)

            verbose('   Accuracy fold   (%i):'%(i+1), accuracy_score(y_test, prediction))
            y_.extend(y_test)
            prediction_.extend(prediction)

        else:
             # Setting up the learner
            verbose("   Regressing fold   (%i)"%(i+1))
            from sklearn.ensemble import RandomForestRegressor
            from sklearn.svm import SVR
            #regressor=RandomForestRegressor(n_estimators=opts.estimators)
            regressor = SVR(kernel='linear', degree=3, gamma=1.0, coef0=1.0, 
            tol=0.001, C=10, epsilon=0.1, shrinking=True, probability=False
            , cache_size=200, verbose=False, max_iter=-1, 
            random_state=None)
            
            # Training