def main():
    pipeline = Pipeline([
        ('vect', TfidfVectorizer()),
        ('clf', LogisticRegression())
    ])
    parameters = {
        'vect__max_df': (0.25, 0.5, 0.75),
        'vect__stop_words': ('english', None),
        'vect__max_features': (5000, 10000, None),
        'vect__ngram_range': ((1, 1), (1, 2)),
        'vect__use_idf': (True, False),
        'vect__norm': ('l1', 'l2'),
        'clf__penalty': ('l1', 'l2'),
        'clf__C': (0.1, 1, 10),
    }
    df = pd.read_csv('movie-reviews/train.tsv', header=0, delimiter='\t')
    X, y = df['Phrase'], df['Sentiment'].as_matrix()
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5)
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    print 'Best score: %0.3f' % grid_search.best_score_
    print 'Best parameters set:'
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print '\t%s: %r' % (param_name, best_parameters[param_name])

    predictions = grid_search.predict(X_test)
    print 'Accuracy:', accuracy_score(y_test, predictions)
    print 'Precision:', precision_score(y_test, predictions)
    print 'Recall:', recall_score(y_test, predictions)
    print 'F1 score:', f1_score(y_test, predictions)
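
The example above targets an older stack: sklearn.cross_validation, DataFrame.as_matrix(), Python 2 print statements, and precision/recall/F1 calls without an average argument (which newer scikit-learn rejects for multiclass targets such as the five sentiment classes). A modernized sketch of the same grid search, assuming current scikit-learn and pandas and the same Kaggle train.tsv layout:

import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report

df = pd.read_csv('movie-reviews/train.tsv', header=0, delimiter='\t')
X, y = df['Phrase'], df['Sentiment'].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5)

pipeline = Pipeline([('vect', TfidfVectorizer()),
                     ('clf', LogisticRegression(max_iter=1000))])
parameters = {
    'vect__max_df': (0.25, 0.5, 0.75),
    'vect__ngram_range': ((1, 1), (1, 2)),
    'clf__C': (0.1, 1, 10),
}
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1,
                           scoring='accuracy')
grid_search.fit(X_train, y_train)
print('Best score: %0.3f' % grid_search.best_score_)
# classification_report handles the per-class averaging that the bare
# precision/recall/F1 calls above leave unspecified
print(classification_report(y_test, grid_search.predict(X_test)))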
Example #2
def main():
    pipeline = Pipeline([
        ('vect', TfidfVectorizer(stop_words='english')),
        ('clf', LogisticRegression())
    ])
    parameters = {
        'vect__max_df': (0.25, 0.5),
        'vect__ngram_range': ((1, 1), (1, 2)),
        'vect__use_idf': (True, False),
        'clf__C': (0.1, 1, 10),
    }
    os.chdir('C:\\Users\\Dan\\1) Python Notebooks\\Datasets')
    df = pd.read_csv('data/train.tsv', header=0, delimiter='\t')
    X, y = df['Phrase'], df['Sentiment'].as_matrix()
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5)
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=3, verbose=1, scoring='accuracy')
    lb = LabelBinarizer()
    y_train = np.array([number[0] for number in lb.fit_transform(y_train)])    
    grid_search.fit(X_train, y_train)
    print 'Best score: %0.3f' % grid_search.best_score_
    print 'Best parameters set:'
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print '\t%s: %r' % (param_name, best_parameters[param_name])
    predictions = grid_search.predict(X_test)
    y_test = np.array([number[0] for number in lb.transform(y_test)])  # binarize the held-out labels the same way before scoring
    print 'Accuracy:', accuracy_score(y_test, predictions)
    print 'Precision:', precision_score(y_test, predictions)
    print 'Recall:', recall_score(y_test, predictions)
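
The LabelBinarizer step above one-hot encodes the five sentiment classes and keeps only column 0, which silently turns the task into a binary "class 0 vs. the rest" problem. An equivalent, more explicit form (a sketch, reusing the y arrays from the example):

# column 0 of the one-hot matrix is 1 exactly when the label equals the
# binarizer's first class (here sentiment 0), so this is the same thing:
y_train_bin = (y_train == 0).astype(int)
y_test_bin = (y_test == 0).astype(int)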
Example #3
 def evaluate(self, params, rnnDataTest):
     predictLabels = []
     trueLabels = []        
     
     allSNum = rnnDataTest.allSNum
     allSTree = rnnDataTest.allSTree
     allSStr = rnnDataTest.allSStr
     verbIndices = rnnDataTest.verbIndices
     sentenceLabels = rnnDataTest.sentenceLabels
 
     ndoc = rnnDataTest.ndoc()
     print "Total number of trees/sentences to be evaluated: ", ndoc
     for s in range(ndoc):              
         if(s % 100 == 0) :
             print "Processing sentences ", s , ' - ', s+100   
         thissentVerbIndices = verbIndices[s]  
         sStr = allSStr[s]; sNum = allSNum[s]; sTree = allSTree[s]
         labels = sentenceLabels[s]
         if((len(sNum) == 1) or (len(thissentVerbIndices)==0) or (labels.shape[1] != len(sStr))):
             continue  #only one word in a sent, no verbs for this sent, tokens and labels mismatch                 
         for nverb, vid in enumerate(thissentVerbIndices):
             scoresMat = np.zeros((len(sStr), self.Wcat.shape[0]))
             for wid in range(len(sStr)):
                 indices = np.array([vid, wid])   
                 setVerbnWordDistanceFeat(self.Wv, sNum, vid, wid, params) 
                 tree = forwardPropTree(self.W, self.WO, self.Wcat, self.Wv, self.Wo, sNum, sTree, sStr, sNN=None, indicies=None,  params=params) 
                 calPredictions(tree, self.Wcat, self.Wv, indices, sStr, params) #updates score, nodepath etc for this verb, word pair
                 scoresMat[wid,:] = tree.score
             pred_answer = viterbi(scoresMat, self.Tran)
             true_answer = labels[nverb,:]
             for i in range(len(pred_answer)):
                 predictLabels.append(pred_answer[i])
                 trueLabels.append(true_answer[i])
             #TODO : calculate predicted labels     
     
     f1 = f1_score(y_true=trueLabels, y_pred=predictLabels, pos_label=None)#, labels=all_labels)
     p = precision_score(y_true=trueLabels, y_pred=predictLabels, pos_label=None)#, labels=all_labels)
     r = recall_score(y_true=trueLabels, y_pred=predictLabels, pos_label=None)#), labels=all_labels)
     print "XXXXXXX F1 = ", f1
     print "XXXXXXX P = ", p
     print "XXXXXXX R = ", r
     print 
     return predictLabels
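
The viterbi(scoresMat, self.Tran) call is not shown in this example; it decodes the (n_words, n_labels) score matrix into a label sequence using the transition matrix. A minimal NumPy sketch of the standard algorithm, assuming Tran[i, j] holds the (log-domain) score of moving from label i to label j:

import numpy as np

def viterbi_decode(scores, trans):
    # scores: (n_words, n_labels) per-position label scores (log domain assumed)
    # trans:  (n_labels, n_labels) score of moving from label i to label j
    n_words, n_labels = scores.shape
    best = scores[0].copy()                # best score of a path ending in each label
    back = np.zeros((n_words, n_labels), dtype=int)
    for t in range(1, n_words):
        cand = best[:, None] + trans       # cand[i, j]: extend a path ending in i by label j
        back[t] = np.argmax(cand, axis=0)  # best predecessor for each label j
        best = cand[back[t], np.arange(n_labels)] + scores[t]
    path = [int(np.argmax(best))]          # best final label
    for t in range(n_words - 1, 0, -1):    # walk the backpointers to the front
        path.append(int(back[t, path[-1]]))
    return path[::-1]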
Example #4
    def evaluate(self, params, rnnDataTest):
#        predictLabels = np.zeros(len(rnnDataTest.allSNum), dtype='int32')
#        probabilities = np.zeros(len(rnnDataTest.allSNum))
        predictLabels = []
        trueLabels = []
        
        allSNum = rnnDataTest.allSNum
        allSTree = rnnDataTest.allSTree
        allSStr = rnnDataTest.allSStr
        verbIndices = rnnDataTest.verbIndices
#        allSNN = rnnDataTest.allSNN
#        allIndicies = rnnDataTest.allIndicies
        sentenceLabels = rnnDataTest.sentenceLabels
    
        ndoc = rnnDataTest.ndoc()
        print "Total number of trees/sentences to be evaluated: ", ndoc
        for s in range(ndoc):              
            if(s % 100 == 0) :
                print "Processing sentences ", s , ' - ', s+100   
            thissentVerbIndices = verbIndices[s]  
            sStr = allSStr[s]; sNum = allSNum[s]; sTree = allSTree[s]
            labels = sentenceLabels[s]
            if((len(sNum) == 1) or (len(thissentVerbIndices)==0) or (labels.shape[1] != len(sStr))):
                continue  #only one word in a sent, no verbs for this sent, tokens and labels mismatch                 
            for nverb, vid in enumerate(thissentVerbIndices):
                for wid in range(len(sStr)):
                    indices = np.array([vid, wid])   
                    truelabel = labels[nverb, wid]  
                    setVerbnWordDistanceFeat(self.Wv, sNum, vid, wid, params) 
                    tree = forwardPropTree(self.W, self.WO, self.Wcat, self.Wv, self.Wo, sNum, sTree, sStr, sNN=None, indicies=None,  params=params) 
                    trueLabels.append(truelabel)       
                    calPredictions(tree, self.Wcat, self.Wv, indices, sStr, params) #updates score, nodepath etc for this verb, word pair
                    predictedLabel = np.argmax(tree.y)
                    predictLabels.append(predictedLabel)
        
        f1 = f1_score(y_true=trueLabels, y_pred=predictLabels, pos_label=None)#, labels=all_labels)
        p = precision_score(y_true=trueLabels, y_pred=predictLabels, pos_label=None)#, labels=all_labels)
        r = recall_score(y_true=trueLabels, y_pred=predictLabels, pos_label=None)#), labels=all_labels)
        print "XXXXXXX F1 = ", f1
        print "XXXXXXX P = ", p
        print "XXXXXXX R = ", r
        print 
        return predictLabels
Example #5
# have a different feature list when you do the final project.
features_list = ["poi", "salary"]

data = featureFormat(data_dict, features_list)
labels, features = targetFeatureSplit(data)


# it's all yours from here forward!
from time import time
from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=.3, random_state=42)
clf = DecisionTreeClassifier()
t0 = time()
clf.fit(features_train, labels_train)
print "training time is ", round((time() - t0), 3), "s"

t1 = time()
pred = clf.predict(features_test)
print "prediction time is ", round(time() - t1, 3), "s"

from sklearn.metrics import accuracy_score
accuracy = accuracy_score(labels_test, pred)
print accuracy
precision = metrics.precision_score(labels_test, pred)
print precision
recall = metrics.recall_score(labels_test, pred)
print recall
Example #6
            # Predicting
            verbose("   Predicting fold (%i)" % (i + 1))
            prediction = regressor.predict(X_test)

            y_.extend(y_test)
            prediction_.extend(prediction)

    verbose('----------\n')
    verbose("Evaluation")

    if opts.mode in ['age', 'gender']:
        from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score, f1_score
        # Computing performance
        print('Accuracy              :', accuracy_score(y_, prediction_))
        print('Precision             :', precision_score(y_, prediction_))
        print('Recall                :', recall_score(y_, prediction_))
        print('F-score               :', f1_score(y_, prediction_))
        print('\nClassification report:\n',
              classification_report(y_, prediction_))
        print('\nConfusion matrix    :\n', confusion_matrix(y_, prediction_))
    else:
        from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
        print('Mean Abs Error        :', mean_absolute_error(y_, prediction_))
        print('Mean Sqr Error        :', mean_squared_error(y_, prediction_))
        print('R2 Error              :', r2_score(y_, prediction_))

    #plots:
    #import matplotlib.pyplot as plt
    #confusion_matrix_plot = confusion_matrix(y_test, prediction)
    #plt.title('confusion matrix')
    #plt.colorbar()
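
The commented-out plotting block above would not run as written: plt.colorbar() needs a drawn image, and nothing is ever plotted. A minimal working sketch, assuming matplotlib is available and reusing y_ and prediction_ from the classification branch:

from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

cm = confusion_matrix(y_, prediction_)
plt.imshow(cm, interpolation='nearest', cmap='Blues')  # draw the matrix itself
plt.title('confusion matrix')
plt.colorbar()
plt.xlabel('predicted category')
plt.ylabel('true category')
plt.show()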
Example #7
if __name__ == '__main__':
    if len(sys.argv) != 2:
        # the script takes a single argument: the directory of test samples
        print ("Usage: Best_configuration.py <Testing_Samples_Location>")
        exit(1)

    test =  sys.argv[1]
    header_list = []
    labels = []
    i=0

    header_test = []
    test_labels = []
    i = 0
    for root, dirs, files in os.walk(test):
        for name in files:
            fo = open(root +"/"+name, "r")
            content = fo.read().replace('\n', ' ')
            body = re.sub(r'^(.*) Lines: (\d)+ ', "", content)
            header_test.append(unicode(body,errors='ignore'))
            test_labels.append(i)
        i=i+1

    text_clf01 = joblib.load('Training_model.pkl')
    predicted01 = text_clf01.predict(header_test)
    print("Removed Stop Words + L2 penalization")
    print ("F1:",metrics.f1_score(test_labels, predicted01, average='macro'))
    print ("accuracy:", metrics.accuracy_score(test_labels, predicted01))
    print ("precision:",metrics.precision_score(test_labels, predicted01, average='macro'))
    print ("recall:",metrics.recall_score(test_labels, predicted01, average='macro'))
Example #8
def fit_predict(config, X_train, y_train, X_test=None, y_test=None, ref_thd=None):
    """
    Uses the configuration dictionary settings to train a model using the
    specified training algorithm. If set, also evaluates the trained model 
    in a test set. Additionally, performs feature selection and model parameters
    optimization.
    
    @param config: the configuration dictionary obtained parsing the 
    configuration file.
    @param X_train: the np.array object for the matrix containing the feature
    values for each instance in the training set.
    @param y_train: the np.array object for the response values of each instance
    in the training set.
    @param X_test: the np.array object for the matrix containing the feature
    values for each instance in the test set. Default is None.
    @param y_test: the np.array object for the response values of each instance
    in the test set. Default is None.
    """
    # sets the selection method
    transformer = set_selection_method(config)

    # if the system is configured to run feature selection
    # runs it and modifies the datasets to the new dimensions
    if transformer is not None:
        log.info("Running feature selection %s" % str(transformer))

        log.debug("X_train dimensions before fit_transform(): %s,%s" % X_train.shape)
        log.debug("y_train dimensions before fit_transform(): %s" % y_train.shape)

        X_train = transformer.fit_transform(X_train, y_train)

        log.debug("Dimensions after fit_transform(): %s,%s" % X_train.shape)

        if X_test is not None:
            X_test = transformer.transform(X_test)

    # sets learning algorithm and runs it over the training data
    estimator, scorers = set_learning_method(config, X_train, y_train)
    log.info("Running learning algorithm %s" % str(estimator))
    estimator.fit(X_train, y_train)

    if (X_test is not None) and (y_test is not None):
        log.info("Predicting unseen data using the trained model...")
        y_hat = estimator.predict(X_test)
        log.info("Evaluating prediction on the test set...")
        for scorer_name, scorer_func in scorers:
            v = scorer_func(y_test, y_hat)
            log.info("%s = %s" % (scorer_name, v))
        log.info("Customized scores: ")
        try:
            log.info("pearson_corrcoef = %s" % pearson_corrcoef(y_test, y_hat))
        except:
            pass
        try:
            log.info("Precision score: = %s" % precision_score(y_test, y_hat))
        except:
            pass
        try:
            log.info("Recall score: = %s" % recall_score(y_test, y_hat))
        except:
            pass
        try:
            log.info("F1 score: = %s" % f1_score(y_test, y_hat))
        except:
            pass
        try:
            log.info("MAE: = %s" % mean_absolute_error(y_test, y_hat))
        except:
            pass
        try:
            log.info("RMSE: = %s" % root_mean_squared_error(y_test, y_hat))
        except:
            pass
        try:
            res = classify_report_bin(y_test, y_hat)
            if "N/A" <> res:
                log.info("Classify report bin: = %s" % res)
            else:
                res = classify_report_bin_regression(y_test, y_hat)
                if "N/A" <> res:
                    log.info("Classify report bin regression: = %s" % res)
                else:
                    if ref_thd is None:
                        log.error("No ref thd defined")
                    else:
                        refthd = float(ref_thd)
                        res = classify_report_regression(y_test, y_hat, refthd)
                        log.info("Classify report regression: = %s" % res)
        except Exception, e:
            print e
        with open("predicted.csv", "w") as _fout:
            for _x, _y in zip(y_test, y_hat):
                print >> _fout, "%f\t%f" % (_x, _y)
Example #10
	vect__norm: 'l2'
	vect__use_idf: True
"""
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn.metrics import precision_score, recall_score, confusion_matrix

__author__ = 'gavin'
import pandas as pd

df = pd.read_csv('sms/sms.csv')

X_train_r, X_test_r, y_train, y_test = train_test_split(
    df['message'], df['label'])

vectorizer = TfidfVectorizer(max_df=0.5,
                             max_features=None,
                             ngram_range=(1, 1),
                             norm='l2',
                             use_idf=True)
X_train = vectorizer.fit_transform(X_train_r)
X_test = vectorizer.transform(X_test_r)
classifier = LogisticRegression(penalty='l2', C=7)
classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)
print 'score', classifier.score(X_test, y_test)
print 'precision', precision_score(y_test, predictions)
print 'recall', recall_score(y_test, predictions)
print confusion_matrix(y_test, predictions)
Example #11
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)


from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.metrics import classification_report

y_true_all = []
predictions_all = []
for label in good_categories[:3]:
    print 'label', label
    y_train = [1 if label in instance else 0 for instance in y_train_all]
    y_test = [1 if label in instance else 0 for instance in y_test_all]
    y_true_all.append(y_test)
    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    predictions_all.append(predictions)
    print classification_report(y_test, predictions)
    print confusion_matrix(y_test, predictions)
    print 'precision', precision_score(y_test, predictions)
    print 'recall', recall_score(y_test, predictions)
    print 'accuracy', accuracy_score(y_test, predictions)
    print '\n'


y_true_all = np.array(y_true_all)
predictions_all = np.array(predictions_all)

print hamming_loss(y_true_all, predictions_all)
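
The manual one-vs-rest loop above is what scikit-learn's wrapper automates. A sketch, assuming y_train_all and y_test_all hold the per-document label collections implied by the loop, and reusing X_train/X_test and good_categories from the fragment:

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import hamming_loss

mlb = MultiLabelBinarizer(classes=good_categories[:3])
Y_train = mlb.fit_transform(y_train_all)  # binary indicator matrix, one column per label
Y_test = mlb.transform(y_test_all)
ovr = OneVsRestClassifier(LogisticRegression())
ovr.fit(X_train, Y_train)
print(hamming_loss(Y_test, ovr.predict(X_test)))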
Example #12
 from sklearn.pipeline import Pipeline
 start_time = time.time()
 text_clf = Pipeline([
     ('vect', CountVectorizer()),
     ('tfidf', TfidfTransformer()),
     ('clf', MultinomialNB()),
 ])
 text_clf = text_clf.fit(header_list, labels)
 predicted = text_clf.predict(header_test)
 print("Naive bayes")
 print("F1:", metrics.f1_score(test_labels, predicted, average='macro'))
 print("accuracy:", metrics.accuracy_score(test_labels, predicted))
 print("precision:",
       metrics.precision_score(test_labels, predicted, average='macro'))
 print("recall:",
       metrics.recall_score(test_labels, predicted, average='macro'))
 print("Tine in seconds %s" % (time.time() - start_time))
 #SVM###
 from sklearn.linear_model import SGDClassifier
 start_time = time.time()
 text_clf = Pipeline([
     ('vect', CountVectorizer()),
     ('tfidf', TfidfTransformer()),
     ('clf', SGDClassifier(
         loss='hinge',
         penalty='l2',
     )),
 ])
 text_clf = text_clf.fit(header_list, labels)
 predicted = text_clf.predict(header_test)
 print("SVM")
Example #13
from sklearn.ensemble import RandomForestClassifier

#it froze with 1000000
#try with more parameters
classifier = RandomForestClassifier(n_estimators=100)
classifier.fit(X_train, y_train)
prediction = classifier.predict(X_test)

#print X_train.shape

from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score

print '\nAccuracy:', accuracy_score(y_test, prediction)
print '\nscore:', classifier.score(X_train, y_train)
print '\nrecall:', recall_score(y_test, prediction)
print '\nprecision:', precision_score(y_test, prediction)
print '\n classification report:\n', classification_report(y_test, prediction)
print '\n confusion matrix:\n', confusion_matrix(y_test, prediction)

#plots:

import matplotlib.pyplot as plt
confusion_matrix_plot = confusion_matrix(y_test, prediction)
plt.imshow(confusion_matrix_plot)  # draw the matrix; colorbar() needs a rendered image
plt.title('confusion matrix')
plt.colorbar()
plt.xlabel('true category')
plt.ylabel('predicted category')
plt.show()
Example #14
    le = preprocessing.LabelEncoder()
    le.fit(ids_)
    verbose("Total classes",le.classes_.shape[0])
    ids=le.transform(ids_)

    X_train, X_test, y_train, y_test=\
        train_test_split(feats, ids, test_size=0.20, random_state=42) 
    
    verbose("Training")
    classifier=RandomForestClassifier(
            n_estimators=opts.estimators,
            n_jobs=opts.nprocessors,
            max_depth=20,
            verbose=True)

    # Training
    classifier.fit(X_train, y_train)

    # Predicting
    verbose("Prediction")
    prediction = classifier.predict(X_test)

    print( 'Accuracy              :', accuracy_score(y_test, prediction))
    print( 'Precision             :', precision_score(y_test, prediction))
    print( 'Recall                :', recall_score(y_test, prediction))
    print( 'F-score               :', f1_score(y_test, prediction))
    print( '\nClassification report:\n', classification_report(y_test,
            prediction))
    print( '\nConfusion matrix    :\n', confusion_matrix(y_test, prediction))
Example #15
    #     ch2 = SelectFromModel(clf2, prefit=True)
    #
    #     X_train = ch2.transform(X_train)
    #     X_test = ch2.transform(X_test)

    clf.fit(X_train, y_train)

    print len(y_train)
    print len(y_test)

    pred = clf.predict(X_test)

    #pred = [0]* len(y_test)
    score = metrics.accuracy_score(y_test, pred)
    prec = metrics.precision_score(y_test, pred)
    recall = metrics.recall_score(y_test, pred)
    f1 = metrics.f1_score(y_test, pred)
    print("accuracy:   %0.3f   prec: %0.3f   recall: %0.3f   f1: %0.3f" %
          (score, prec, recall, f1))
    total.append(score)
    total2.append(f1)

    file2 = open('results/%s-1' % source, 'w')
    file3 = open('results/%s-0' % source, 'w')
    for s, (y, x) in zip(pred_sents, zip(pred, X_test)):
        if y == 1:
            file2.write(s + '\n')
            file2.write(str(x) + '\n')
        else:
            file3.write(s + '\n')
            file3.write(str(x) + '\n')
Example #17
    ids_ = np.load(opts.IDS)

    le = preprocessing.LabelEncoder()
    le.fit(ids_)
    verbose("Total classes", le.classes_.shape[0])
    ids = le.transform(ids_)

    X_train, X_test, y_train, y_test=\
        train_test_split(feats, ids, test_size=0.20, random_state=42)

    verbose("Training")
    classifier = RandomForestClassifier(n_estimators=opts.estimators,
                                        n_jobs=opts.nprocessors,
                                        max_depth=20,
                                        verbose=True)

    # Training
    classifier.fit(X_train, y_train)

    # Predicting
    verbose("Prediction")
    prediction = classifier.predict(X_test)

    print('Accuracy              :', accuracy_score(y_test, prediction))
    print('Precision             :', precision_score(y_test, prediction))
    print('Recall                :', recall_score(y_test, prediction))
    print('F-score               :', f1_score(y_test, prediction))
    print('\nClassification report:\n',
          classification_report(y_test, prediction))
    print('\nConfusion matrix    :\n', confusion_matrix(y_test, prediction))
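
Because the targets were passed through a LabelEncoder, the predictions come back as encoded integers; mapping them back to the original ids is one call (a sketch reusing the le fitted above):

predicted_ids = le.inverse_transform(prediction)  # back to the original id values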
Example #18
	for tweet in reader[0:2*(numironicos/3)]:
		tweets_train.append(tweet["text"])
		labels_train.append("noironia")
	for tweet in reader[2*(numironicos/3):]:
		tweets_test.append(tweet["text"])
		labels_test.append("noironia")

stop_words = []
f = open("spanish.txt") 
for line in f:
	stop_words.append(line.strip())

f.close()

y_train = np.array(labels_train, dtype=object) 
y_test = np.array(labels_test, dtype=object) 

vectorizer = TfidfVectorizer(input='content', max_df=0.5, stop_words = stop_words)
X_train = vectorizer.fit_transform(np.array(tweets_train, dtype=object))
X_test = vectorizer.transform(np.array(tweets_test, dtype=object))
classifier = RandomForestClassifier(n_estimators = 10)
classifier.fit(X_train.toarray(), y_train)
prediction = classifier.predict(X_test.toarray())

print '\nAccuracy :', accuracy_score(y_test, prediction)
print '\nPrecision :', precision_score(y_test, prediction)
print '\nRecall :', recall_score(y_test, prediction)
print '\nF-score :', f1_score(y_test, prediction)
print '\nClassification report:\n', classification_report(y_test, prediction)
print '\nConfusion matrix :\n', confusion_matrix(y_test, prediction)
Example #19
            prediction = regressor.predict(X_test)

            y_.extend(y_test)
            prediction_.extend(prediction)



    verbose('----------\n')
    verbose("Evaluation")

    if opts.mode in ['age','gender']:
        from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score, f1_score
        # Computing performance
        print( 'Accuracy              :', accuracy_score(y_, prediction_))
        print( 'Precision             :', precision_score(y_, prediction_))
        print( 'Recall                :', recall_score(y_, prediction_))
        print( 'F-score               :', f1_score(y_, prediction_))
        print( '\nClassification report:\n', classification_report(y_,
                prediction_))
        print( '\nConfusion matrix    :\n', confusion_matrix(y_, prediction_))
    else:
        from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
        print( 'Mean Abs Error        :', mean_absolute_error(y_, prediction_))
        print( 'Mean Sqr Error        :', mean_squared_error(y_, prediction_))
        print( 'R2 Error              :', r2_score(y_, prediction_))


    #plots:
    #import matplotlib.pyplot as plt
    #confusion_matrix_plot = confusion_matrix(y_test, prediction)
    #plt.title('confusion matrix')