def testa_modelo(x_tfidf_test, y_test, modelo):
    print(">> Testing classifier " + modelo.getNome())
    start_time = time.time()
    y_pred = modelo.getBestEstimator().predict(x_tfidf_test)
    y_pred_proba = modelo.getBestEstimator().predict_proba(x_tfidf_test)
    y_pred_proba_df = pd.DataFrame(y_pred_proba, columns=modelo.getBestEstimator().classes_)
    total_time = time.time() - start_time
    print("Time to predict " + str(x_tfidf_test.shape[0]) + " samples: ", str(timedelta(seconds=total_time)))

    start_time = time.time()
    accuracy = accuracy_score(y_test, y_pred)
    balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
    macro_precision, macro_recall, macro_fscore = score(y_test, y_pred, average='macro', labels=np.unique(y_pred))[:3]
    micro_precision, micro_recall, micro_fscore = score(y_test, y_pred, average='micro', labels=np.unique(y_pred))[:3]
    confusion_matrix = multilabel_confusion_matrix(y_true=y_test, y_pred=y_pred)
    classes = y_test.unique().astype(str).tolist()
    #print(classification_report(y_test, y_pred, target_names=classes))
    classification_report_dict = classification_report(y_test, y_pred,target_names=classes,output_dict=True)
    total_time = time.time() - start_time
    # print('Confusion matrix:\n', conf_mat)
    print("Tempo para recuperar métricas:  "+    str(timedelta(seconds=total_time)))

    modelo.setAccuracy(accuracy)
    modelo.setBalancedAccuracy(balanced_accuracy)
    modelo.setMacroPrecision(macro_precision)
    modelo.setMacroRecall(macro_recall)
    modelo.setMacroFscore(macro_fscore)
    modelo.setMicroPrecision(micro_precision)
    modelo.setMicroRecall(micro_recall)
    modelo.setMicroFscore(micro_fscore)
    modelo.setConfusionMatrix(confusion_matrix)
    modelo.setClassificationReport(classification_report_dict)

    return modelo, y_pred,y_pred_proba_df
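# Most snippets in this collection alias sklearn.metrics.precision_recall_fscore_support
# as `score` (a few alias accuracy_score, roc_auc_score or r2_score instead). A minimal,
# self-contained sketch with toy labels, showing the two return shapes:
import numpy as np
from sklearn.metrics import precision_recall_fscore_support as score

y_true = np.array([0, 1, 1, 0, 1, 0])
y_pred = np.array([0, 1, 0, 0, 1, 1])

# per-class arrays: one entry per label
precision, recall, fscore, support = score(y_true, y_pred)
print(precision, recall, fscore, support)

# averaged scalars: 'macro', 'micro' or 'weighted' collapse the per-class values;
# support is returned as None in this mode
macro_p, macro_r, macro_f, _ = score(y_true, y_pred, average='macro')
print(macro_p, macro_r, macro_f)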
Example #2
		def xgb_alg(xtrain,ytrain,xtest,ytest):
			dtrain = xgb.DMatrix(xtrain,label=ytrain)
			dtest = xgb.DMatrix(xtest)
			params = {
				'booster': 'gbtree',
				'objective': 'multi:softmax',
				'num_class': 14,
				'max_depth': 6,
				'eta': 0.2,
				'lambda':0.005,
				'alpha':0,
				'gamma':0,
			}
			n_tree = range(10,100,10)
			train_score,test_score = list(),list()
			for num_round in n_tree:
				alg = xgb.train(params,dtrain,num_round)       
				train_pre = alg.predict(dtrain)
				test_pre = alg.predict(dtest)
				# 'score' is assumed to be a scalar metric here (e.g. accuracy_score),
				# so the per-round values can be appended and plotted below
				train_aft = score(ytrain, train_pre)
				test_aft = score(ytest, test_pre)
				train_score.append(train_aft)
				test_score.append(test_aft)
			plt.title('machine learning result with xgboost')
			plt.xlabel('n_estimators(n_trees)')
			plt.ylabel('learning accuracy(%)')
			plt.ylim(0.5,1)
			plt.plot(n_tree,train_score,'r-x',label = 'training set')
			plt.plot(n_tree,test_score,'b-x',label = 'testing set')
			plt.legend()
			plt.savefig('%s\\classify\\xgboost'%my_dic,dpi = 300)
			print('xgboost training finished')
			plt.close()
Example #3
def test(inputs, y_true, model_LSTM, model_Attention_LSTM, model_CNN_LSTM, model_Attention, model_CNN):
    """
    Evaluate the model with precision, recall and F1 score as metrics
    """
    print("model_LSTM: ")
    res = model_LSTM.predict_classes(inputs)  # predict_classes only exists on older tf.keras Sequential models (removed in TF 2.6)
    precision, recall, f1, _ = score(y_true, res, average='weighted')
    m_acc = tf.keras.metrics.Accuracy()
    m_acc.update_state(res, y_true)
    print ("Accuracy: %f"%m_acc.result().numpy())
    print ("Precision: %f"%precision)
    print ("Recall: %f"%recall)
    print ("F1 score: %f"%f1)
    print ("\n")

    print("model_Attention_LSTM: ")
    res = model_Attention_LSTM.predict(inputs)
    res = tf.argmax(res, 1)
    precision, recall, f1, _ = score(y_true, res, average='weighted')
    m_acc = tf.keras.metrics.Accuracy()
    m_acc.update_state(res, y_true)
    print ("Accuracy: %f"%m_acc.result().numpy())
    print ("Precision: %f"%precision)
    print ("Recall: %f"%recall)
    print ("F1 score: %f"%f1)
    print ("\n")

    print("model_CNN_LSTM: ")
    res = model_CNN_LSTM.predict_classes(inputs)
    precision, recall, f1, _ = score(y_true, res, average='weighted')
    m_acc = tf.keras.metrics.Accuracy()
    m_acc.update_state(res, y_true)
    print ("Accuracy: %f"%m_acc.result().numpy())
    print ("Precision: %f"%precision)
    print ("Recall: %f"%recall)
    print ("F1 score: %f"%f1)
    print ("\n")

    print("model_Attention: ")
    res = model_Attention.predict(inputs)
    res = tf.argmax(res, 1)
    precision, recall, f1, _ = score(y_true, res, average='weighted')
    m_acc = tf.keras.metrics.Accuracy()
    m_acc.update_state(res, y_true)
    print ("Accuracy: %f"%m_acc.result().numpy())
    print ("Precision: %f"%precision)
    print ("Recall: %f"%recall)
    print ("F1 score: %f"%f1)
    print ("\n")

    print("model_CNN: ")
    res = model_CNN.predict_classes(inputs)
    precision, recall, f1, _ = score(y_true, res, average='weighted')
    m_acc = tf.keras.metrics.Accuracy()
    m_acc.update_state(res, y_true)
    print ("Accuracy: %f"%m_acc.result().numpy())
    print ("Precision: %f"%precision)
    print ("Recall: %f"%recall)
    print ("F1 score: %f"%f1)
    print ("\n")
Example #4
def prevent_overfitting(tree_function, df, store=False):
    tfidf = TfidfVectorizer()
    x = tfidf.fit_transform(df.lemmatized)
    y = df.language
    if store:
        store_vectorizer(tfidf)

    x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y, test_size=.2, random_state=42)

    train = pd.DataFrame(dict(actual=y_train))
    test = pd.DataFrame(dict(actual=y_test))

    score_diff = 1
    leaf = 0
    
    while score_diff > .06:
        leaf += 1
        model = tree_function(criterion='entropy', min_samples_leaf=leaf, random_state=42)
        model.fit(x_train, y_train)
        train['predicted'] = model.predict(x_train)
        test['predicted'] = model.predict(x_test)
        # score() returns (precision, recall, fscore, support); index 2 is the
        # per-class F-score, whose mean serves as the train/test score here
        train_acc = score(train.actual, train.predicted)[2].mean()
        test_acc = score(test.actual, test.predicted)[2].mean()
        score_diff = train_acc - test_acc

    print(f'leaf = {leaf}')
    print(f'train acc = {train_acc}')
    print(f'test acc = {test_acc}')
    if store:
        store_model(model)
    return model
def check_threshold(myClf, test_bitter, test_nonbitter, test_x, test_y):
    i = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]
    test_Bitter_withProb = myClf.predict_proba(test_bitter)
    test_NonBitter_withProb = myClf.predict_proba(test_nonbitter)
    threshold = [j / 2 for j in i]
    test_all = myClf.predict_proba(test_x)
    red = []
    blue = []
    green = []
    for s in i:
        temp = get_temp_pred(test_Bitter_withProb, s)
        bitter_acc = score(temp, np.ones(len(temp)))
        red.append(bitter_acc)
        temp2 = get_temp_pred(test_NonBitter_withProb, s)
        nonbitter_acc = score(temp2, np.zeros(len(temp2)))
        blue.append(nonbitter_acc)
        temp3 = get_temp_pred(test_all, s)
        all_acc = score(temp3, test_y)
        green.append(all_acc)
        if i.index(s) == 4:
            print("########### BC_RF Results ###########")
            print("test Bitter: " + str(bitter_acc))
            print("test NonBitter: " + str(nonbitter_acc))
            print("test overall: " + str(all_acc))
            print("\n===============================\n")
    return red, blue, green, i
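# get_temp_pred is not shown in this snippet; a minimal sketch of what it presumably
# does (label a sample as positive/"bitter" when its class-1 probability from
# predict_proba exceeds the threshold s):
import numpy as np

def get_temp_pred(proba, threshold):
    # proba: predict_proba output of shape (n_samples, 2); column 1 is the positive class
    return (proba[:, 1] > threshold).astype(int)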
Example #6
def calculate_fscore(sentiment_list):
    sentiment_list = sentiment_list[1:]
    label_list = np.array(['labelNo', 'labelYes', 'TrueLabel'])
    for line in sentiment_list:
        scoreNo, scoreYes, trueScore = float(line[1]), float(line[2]), float(
            line[3])
        if scoreNo > 0:
            scoreNo = 4
        else:
            scoreNo = 0

        if scoreYes > 0:
            scoreYes = 4
        else:
            scoreYes = 0
        label_list = np.vstack((label_list, [scoreNo, scoreYes, trueScore]))

    precisionNo, recallNo, fscoreNo, supportNo = score(label_list[1:, 2],
                                                       label_list[1:, 0])
    print('labels: {}'.format(['Negative', 'Positive']))
    print('precision: {}'.format(precisionNo))
    print('recall: {}'.format(recallNo))
    print('fscore: {}'.format(fscoreNo))
    print('support: {}'.format(supportNo))
    print('')
    precisionYes, recallYes, fscoreYes, supportYes = score(
        label_list[1:, 2], label_list[1:, 1])
    print('labels: {}'.format(['Negative', 'Positive']))
    print('precision: {}'.format(precisionYes))
    print('recall: {}'.format(recallYes))
    print('fscore: {}'.format(fscoreYes))
    print('support: {}'.format(supportYes))
def topicClustering(conversation, seeds, embeddingsSelected):

    totalMsg = len(conversation)
    gold = []
    predictions = []

    for msgDict in conversation:
        msgVector = iSQL.getMsgVector(msgDict["msg"], embeddingsSelected, 300,
                                      "en")
        distances = {}
        for seedName, seedVector in seeds.iteritems():
            distance = iSQL.distance(msgVector, seedVector)
            distances[seedName] = distance

        if distances["ubuntu"] > distances["unity"]:
            predictedLabel = 1
        else:
            predictedLabel = 0

        gold.append(int(msgDict["label"]))
        predictions.append(predictedLabel)

        #print msgDict["msg"], distances, predictedLabel, msgDict["label"]

    print "micro average"
    precision, recall, fscore, support = score(gold,
                                               predictions,
                                               average="micro")
    f2score = fbeta_score(gold, predictions, beta=2, average="micro")

    print 'precision: {}'.format(precision)
    print 'recall: {}'.format(recall)
    print 'fscore: {}'.format(fscore)
    print 'f2score: {}'.format(f2score)

    print "macro average"
    precision, recall, fscore, support = score(gold,
                                               predictions,
                                               average="macro")
    f2score = fbeta_score(gold, predictions, beta=2, average="macro")

    print 'precision: {}'.format(precision)
    print 'recall: {}'.format(recall)
    print 'fscore: {}'.format(fscore)
    print 'f2score: {}'.format(f2score)

    print "weighted"
    precision, recall, fscore, support = score(gold,
                                               predictions,
                                               average="weighted")
    f2score = fbeta_score(gold, predictions, beta=2, average="weighted")

    print 'precision: {}'.format(precision)
    print 'recall: {}'.format(recall)
    print 'fscore: {}'.format(fscore)
    print 'f2score: {}'.format(f2score)
Example #8
def eval(y_test, y_predicted):

    precision, recall, fscore, _ = score(y_test, y_predicted)
    print('\n     {0}   {1}'.format("0", "1"))
    print('P: {}'.format(precision))
    print('R: {}'.format(recall))
    print('F: {}'.format(fscore))
    #"""
    _, _, fscore, _ = score(y_test, y_predicted, average='macro')
    print('Macro-F1: {}'.format(fscore))

    print('\n Confusion matrix:')
    print(confusion_matrix(y_test, y_predicted))
Example #9
def eval(y_test, y_predicted):

    precision, recall, fscore, _ = score(y_test, y_predicted)
    print('\n     {0}   {1}'.format("0", "1"))
    print('P: {}'.format(precision))
    print('R: {}'.format(recall))
    print('F: {}'.format(fscore))

    mprecision, mrecall, mfscore, _ = score(y_test,
                                            y_predicted,
                                            average='macro')
    print('\n MACRO-AVG')
    print('P: {}'.format(mprecision))
    print('R: {}'.format(mrecall))
    print('F: {}'.format(mfscore))
Example #10
def evaluateImage(queryFile, gtFile):
    queryImg = cv2.imread(queryFile, 0)
    gt = cv2.imread(gtFile, 0)

    # cv2.imread returns None when the file cannot be read
    if queryImg is None:
        print("Image not found")
        return 0
    if gt is None:
        print("Groundtruth not found")
        return 0

    predictionVector = []
    gtVector = []

    for pixel in range(0, queryImg.size):
        predictionVector.append(queryImg.flat[pixel])
    for pixel in range(0, gt.size):
        gtVector.append(conf.highwayMapping[gt.flat[pixel]])

    confMat = confusion_matrix(gtVector, predictionVector)
    precision, recall, fscore, support = score(gtVector, predictionVector)

    auc = roc_auc_score(gtVector, predictionVector)

    return confMat, precision, recall, fscore, auc
Example #11
def get_performance(y_test, y_pred):
    '''
    @leosanchezsoler
    This function returns evaluation metrics for the applied machine learning model
    Parameters:
        - y_test: ground-truth labels of the test set
        - y_pred: predicted labels
    Returns:
        - accuracy
        - precision
        - recall
        - f1score
    '''
    # Evaluate Performance
    accuracy = round(accuracy_score(y_test, y_pred) * 100, 2)
    # Get precision, recall, f1 scores
    precision, recall, f1score, support = score(y_test,
                                                y_pred,
                                                average='micro')

    # Performance metrics
    print(f'Test Accuracy Score of Basic Log. Reg.: {accuracy} %')
    print(f'Precision : {precision}')
    print(f'Recall    : {recall}')
    print(f'F1-score   : {f1score}')
    return accuracy, precision, recall, f1score
Example #12
def LSTM(X_train_seq, X_test_seq, y_train, y_test):
    max_words = 10000
    max_len = 200
    lstm_model = Sequential()
    lstm_model.add(Embedding(max_words, 50, input_length=max_len))
    lstm_model.add(LSTM_lib(128, dropout=0.25, recurrent_dropout=0.25))
    lstm_model.add(Dense(1, activation='sigmoid'))
    lstm_model.compile(loss='binary_crossentropy',
                       optimizer='adam',
                       metrics=['accuracy'])

    print('Train model')
    lstm_model.fit(X_train_seq,
                   y_train,
                   batch_size=32,
                   epochs=3,
                   validation_data=(X_test_seq, y_test))
    y_pred = lstm_model.predict_classes(X_test_seq)
    lstm_model.save('51_acc_language_model.h5')
    acc_score = accuracy_score(y_test, y_pred)
    precision, recall, fscore, support = score(y_test,
                                               y_pred,
                                               average='weighted')
    target_names = ['Non-Spam', 'Spam']
    print(classification_report(y_test, y_pred, target_names=target_names))
    return Measure.Measure(acc_score, precision, recall, fscore)
def evaluate_model(model, X_test, y_test):
    '''
    Args:
        model (classifier)
        X_test (pandas DataFrame) : independent variables for testing
        y_test (pandas DataFrame) : dependent variables with 'true' values

    Output:
        printed scores
    '''

    y_pred = model.predict(X_test)

    for i, col in enumerate(y_test):

        try:
            y_true = y_test[col]
            y_pred2 = y_pred[:, i]
            clas_report = classification_report(y_true, y_pred2)
            precision, recall, fscore, support = score(y_true, y_pred2)

            print(i, col)
            print(clas_report)
            print(
                f'Precision: from the {y_pred2.sum()} tweets labeled as {col}, {round(precision[1]*100,1)}% were actually {col}'
            )
            print(
                f'Recall: From the {support[1]} tweets that were actually {col}, {round(recall[1]*100,1)}% were labeled as {col} \n'
            )
            print('-------------------------------------------------------')

        except:
            pass
    def per_class_metrics(self, labels, predictions):

        _, counts = np.unique(labels, return_counts=True)
        precision, recall, _, _ = score(labels, predictions)
        C = confusion_matrix(labels, predictions)
        avg_acc_per_class = np.average(recall)

        t = Texttable()
        t.add_rows([
            ['Metric', 'CAR', 'BUS', 'TRUCK', 'OTHER'],
            ['Count labels'] + counts.tolist(),
            ['Precision'] + precision.tolist(),
            ['Recall'] + recall.tolist()
        ])

        t2 = Texttable()
        t2.add_rows([
            ['-', 'CAR', 'BUS', 'TRUCK', 'OTHER'],
            ['CAR'] + C[0].tolist(),
            ['BUS'] + C[1].tolist(),
            ['TRUCK'] + C[2].tolist(),
            ['OTHER'] + C[3].tolist()
        ])

        return t, t2, avg_acc_per_class
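# The Texttable objects returned above render to plain text with draw(); a hedged
# usage sketch, assuming `evaluator` is an instance of the surrounding (unnamed) class
# and `labels`, `predictions` are integer class arrays:
t, t2, avg_acc = evaluator.per_class_metrics(labels, predictions)
print(t.draw())   # per-class count / precision / recall table
print(t2.draw())  # 4x4 confusion-matrix table
print(avg_acc)    # mean per-class recall (balanced accuracy)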
def print_results(y_true, y_pred):
    """
    Prints the mean of the per-label precision, recall and F-score over all 36 classifier outputs
    Args:
        y_true (numpy array of shape (n_samples, 36)): ground-truth labels
        y_pred (numpy array of shape (n_samples, 36)): predicted labels


    Returns:

    """
    precisions = []
    recalls = []
    fscores = []
    for label in range(y_pred.shape[1]):
        precision, recall, fscore, _ = score(y_true[:, label],
                                             y_pred[:, label],
                                             average='weighted')
        precisions.append(precision)
        recalls.append(recall)
        fscores.append(fscore)

    print('Precision : {}'.format(np.mean(precisions)))
    print('Recall    : {}'.format(np.mean(recalls)))
    print('F-score   : {}'.format(np.mean(fscores)))
    print("Accuracy  : {}".format((y_true == y_pred).mean()))
Example #16
def train(box_feature_variable,
          box_score_variable,
          box_box_variable,
          box_label_variable,
          box_num_variable,
          model,
          model_optimizer,
          box_weight_variable,
          pos_neg_weight=100):
    model_optimizer.zero_grad()

    output_record, output_label, output_weights = model(
        box_feature_variable, box_score_variable, box_box_variable,
        box_label_variable, box_weight_variable, box_num_variable)

    criterion = nn.BCELoss(weight=output_weights)
    loss = criterion(output_record.view(-1, 1), output_label)

    # loss = loss / output_record.size(0)

    loss.backward()
    model_optimizer.step()

    # calculate precision and recall
    output_record_np = output_record.data.cpu().numpy().reshape(-1, 1)

    output_record_np = (output_record_np > 0.5).astype(np.int8)
    # print('dd')
    # print(np.unique(output_record_np))
    box_label_np = output_label.data.cpu().numpy().astype(np.int8)

    precision, recall, _, _ = score(box_label_np, output_record_np, labels=[1])
    return loss.data[0], float(precision), float(recall)
Example #17
def main(params):
	datapath = params["datapath"]
	train_data_split = params["train_data_split"]
	max_len_sentence = params["max_len_sentence"]
	embeddings_path = params["embeddings_path"]
	model_path = params["model_path"]

	Ds = Dataset(datapath)
	train_data, test_data = Ds.create_dataset(train_data_split)
	X1_train, X2_train, Y_train = Ds.process_dataframe(train_data, max_len_sentence)
	# Storage reduction
	train_data = None
	print "Obtained processed training data"
	embedding_matrix = Ds.create_embedding_matrix(embeddings_path)
	print "Obtained embeddings"
	num_vocab = len(Ds.word_to_idx) + 1

	Sm = SiameseModel()
	model = Sm.build_model(num_vocab, embedding_matrix, max_len_sentence)
	print "Built Model"
	print "Training now..."
	filepath=model_path + "weights-improvement-{epoch:02d}-{val_acc:.2f}.hdf5"
	checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
	callbacks_list = [checkpoint]
	model.fit(x=[X1_train, X2_train], y=Y_train, batch_size=128, epochs=30, verbose=1, validation_split=0.2, shuffle=True, callbacks=callbacks_list)

	X1_test, X2_test, Y_test = Ds.process_dataframe(test_data, max_len_sentence)
	pred = model.predict([X1_test, X2_test], batch_size=32, verbose=0)
	precision, recall, fscore, support = score(Y_test, pred.round(), labels=[0, 1])

	print "Metrics on test dataset"
	print('precision: {}'.format(precision))
	print('recall: {}'.format(recall))
	print('fscore: {}'.format(fscore))
	print('support: {}'.format(support))
Example #18
def stat_fscore(truth, predicted):
    precisionMicro, recallMicro, fscoreMicro, _ = score(truth,
                                                        predicted,
                                                        average='micro')
    precisionMacro, recallMacro, fscoreMacro, _ = score(truth,
                                                        predicted,
                                                        average='macro')

    return [
        precisionMicro.item(),
        recallMicro.item(),
        fscoreMicro.item(),
        precisionMacro.item(),
        recallMacro.item(),
        fscoreMacro.item()
    ]
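# Micro averaging pools every prediction before computing precision/recall (for
# single-label multiclass data all three micro values equal accuracy), while macro
# averaging weights each class equally. A toy call of the helper above:
truth = [0, 0, 1, 2, 2, 2]
predicted = [0, 1, 1, 2, 2, 1]
print(stat_fscore(truth, predicted))
# -> [micro P, micro R, micro F1, macro P, macro R, macro F1]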
Example #19
def evaluate_model(model, X_test, Y_test, category_names):
    '''
    Function to test model against test set and print model evaluation metrics
    (precision, recall, fscore) for each output category.
    Inputs:
        model: model fitted to training set
        X_test : input variables (messages) of test set
        Y_test : output categorizations for test set
        category_names: (Pandas series) output category names
    Return:
        None, but function prints out evaluation metrics
    '''
    y_pred = model.predict(X_test)
    
    y_pred_df = pd.DataFrame(y_pred)
    y_test_df = pd.DataFrame(Y_test)
    
    results = []

    for cat in range(len(y_pred[0])):
        precision,recall,fscore,support = score(y_test_df[cat],y_pred_df[cat],average='weighted') 
        results.append((category_names[cat],precision,recall,fscore))
        
    results = pd.DataFrame(results,columns=('Category','Precision','Recall','fscore'))
    averages = pd.DataFrame([['Categories Average', results['Precision'].mean(),
           results['Recall'].mean(), results['fscore'].mean()]], columns = results.columns)
    
    # DataFrame.append was removed in pandas 2.0; pd.concat does the same job
    print(pd.concat([results, averages], ignore_index=True))
Example #20
def svm(X_train, Y_train, X_test, Y_test):
    start = time.time()

    svclassifier = SVC()
    svclassifier.fit(X_train, Y_train)
    Y_pred = svclassifier.predict(X_test)

    end = time.time()

    precision, recall, fscore, train_support = score(Y_test,
                                                     Y_pred,
                                                     pos_label='1',
                                                     average='binary')
    print('Precision: {} / Recall: {} / F1-Score: {} / Accuracy: {}'.format(
        round(precision, 3), round(recall, 3), round(fscore, 3),
        round(acs(Y_test, Y_pred), 3)))
    print("Execution Time: " + str(end - start))

    cm = confusion_matrix(Y_test, Y_pred)
    class_label = ["0", "1"]
    df_cm = pd.DataFrame(cm, index=class_label, columns=class_label)
    sns.heatmap(df_cm, annot=True, fmt='d')
    plt.title("Confusion Matrix")
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.show()

    # plot_roc_curve was removed in scikit-learn 1.2; newer versions use RocCurveDisplay.from_estimator
    sklearn.metrics.plot_roc_curve(svclassifier, X_test, Y_test)
    plt.title("ROC Curve")
    plt.show()
Example #21
def train(box_feature_variable, box_score_variable, box_box_variable,
          box_label_variable, all_class_box_feature_variable,
          all_class_box_score_variable, all_class_box_box_variable,
          all_class_box_label_variable, model, model_optimizer, criterion,
          unique_class, unique_class_len):
    model_optimizer.zero_grad()

    input_length = box_feature_variable.size()[0]
    output_record = model(box_feature_variable, box_score_variable,
                          box_box_variable, all_class_box_feature_variable,
                          all_class_box_score_variable,
                          all_class_box_box_variable, unique_class,
                          unique_class_len)

    loss = criterion(output_record.view(-1, 1), all_class_box_label_variable)

    # loss = loss / output_record.size(0)

    loss.backward()
    model_optimizer.step()

    # calculate precision and recall
    output_record_np = output_record.data.cpu().numpy().reshape(-1, 1)
    output_record_np = (output_record_np > 0.5).astype(np.int8)
    # print('dd')
    # print(np.unique(output_record_np))
    box_label_np = all_class_box_label_variable.data.cpu().numpy().astype(
        np.int8)
    # print(box_label_np.shape)
    # print(output_record_np.shape)
    # input()
    # output_record_np = (output_record_np > 0.5).astype(np.int8)
    precision, recall, _, _ = score(box_label_np, output_record_np, labels=[1])

    return loss.data[0], float(precision), float(recall)
Example #22
def model(X, y):
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import confusion_matrix
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import precision_recall_fscore_support as score
    #split the data into 80% train and 20% test
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=0)
    #scaling the features
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    #training the random forest classification model
    classify = RandomForestClassifier(n_estimators=20, random_state=0)
    classify.fit(X_train, y_train)
    y_pred = classify.predict(X_test)

    #creating confusion matrix to evaluate the metrics of the model
    cm = confusion_matrix(y_test, y_pred)
    print(cm)

    #displaying the accuracy
    print('Accuracy: {}'.format(accuracy_score(y_test, y_pred)))

    #calculating the metrics
    precision, recall, fscore, support = score(y_test, y_pred)

    print('precision: {}'.format(precision))
    print('recall: {}'.format(recall))
    print('fscore: {}'.format(fscore))
Example #23
def plot_confusion_matrix(y_true, y_pred, target_names, title=None):
    confusionMatrix = confusion_matrix(y_true, y_pred)
    precision, recall, _, _ = score(y_true, y_pred, average='macro')

    plt.figure(figsize=(8, 6))
    plt.imshow(confusionMatrix,
               interpolation='nearest',
               cmap=plt.get_cmap('Blues'))
    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=88)
        plt.yticks(tick_marks, target_names)

    thresh = confusionMatrix.max() / 2
    for i, j in itertools.product(range(confusionMatrix.shape[0]),
                                  range(confusionMatrix.shape[1])):
        plt.text(j,
                 i,
                 "{:,}".format(confusionMatrix[i, j]),
                 horizontalalignment="center",
                 color="white" if confusionMatrix[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label\nOverall Precision={:0.4f}; '
               'Overall Recall={:0.4f}'.format(precision, recall))
    plt.title(title)
    plt.show()
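# A toy invocation of the plotting helper above; string labels work because both
# confusion_matrix and score accept them, and the axis order follows sorted(set(labels)):
y_true = ['bird', 'cat', 'dog', 'dog', 'cat', 'bird']
y_pred = ['bird', 'cat', 'cat', 'dog', 'cat', 'dog']
plot_confusion_matrix(y_true, y_pred,
                      target_names=['bird', 'cat', 'dog'],
                      title='Toy confusion matrix')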
def evaluate_model(model, X_test, Y_test, category_names):
    """
    Function to evaluate the model performance
    Inputs: model, X_test and Y_test from the train_test_split, and the Y column (category) names

    Output: None
    """

    Y_pred = model.predict(X_test)

    # Print out the full classification report
    results = pd.DataFrame(
        columns=['category', 'precision', 'recall', 'f_score'])
    count = 0
    for category in category_names:
        precision, recall, f_score, support = score(Y_test[category],
                                                    Y_pred[:, count],
                                                    average='weighted')
        results.at[count + 1, 'category'] = category
        results.at[count + 1, 'precision'] = precision
        results.at[count + 1, 'recall'] = recall
        results.at[count + 1, 'f_score'] = f_score
        count += 1
    avg_precision = results['precision'].mean()
    print(' %  Precision:', avg_precision)
    print(' %  Recall:', results['recall'].mean())
    print(' % f_score:', results['f_score'].mean())
def get_statistics(model, model_fit, X, train_X, test_X, train_y, test_y, seq_length, kfold, nfolds, showPlots):
    
    predictions_train, probabilities_train = predict(model, train_X, seq_length)
    
    print("Percentage correct trainset: ", error(np.array(predictions_train), train_y) * 100, "%")

    predictions, probabilities = predict(model, test_X, seq_length)
    # predictions = np.roll(np.argmax(test_y, axis=1), 1, axis=0)

    accuracy = error(np.array(predictions), test_y) * 100

    if showPlots:
        print("Real: ", np.argmax(test_y, axis=1))
        print("Predicted: ", predictions)
        print("Percentage correct testset: ", accuracy, "%")
        make_prediction_plot(np.argmax(test_y, axis=1), predictions)
        loss = model_fit.history.get('val_loss')
        make_plot(loss, 'Training epochs (in days)', 'Loss', 'Training loss on validation set')
        acc = model_fit.history.get('val_acc')
        make_plot(acc, 'Accuracy (in percentage)', 'Training epochs (in days)', 'Training accuracy on validation set')
        ent = calculate_entropy(probabilities_train)
        make_plot(ent, 'Training epoch (in days)', 'Entropy', 'Entropy over training period')

        cnf_matrix = confusion_matrix(np.argmax(test_y, axis=1), predictions)
        class_names = ['Fall', 'Stay', 'Rise']
        plt.figure()
        name = 'Confusion matrix of test data consisting of %d days' %len(predictions)
        plot_confusion_matrix(cnf_matrix, class_names, title=name)
        plt.show()
        calculate_f1(np.argmax(test_y, axis=1), predictions)
        # get_data_plots(X)
    if kfold:
        return accuracy, score(np.argmax(test_y, axis=1), predictions, average='macro')
Example #26
def print_boolean_matrix(true, pred):
    """Print a LaTex table containing the precision and recall values
       of all classes as well as the weighted average."""
    classes = list(true)
    classes.extend(pred)
    classes = list(set(classes))
    matrix_true = dict()
    matrix_false = dict()
    for c in classes:
        matrix_true[c] = 0
        matrix_false[c] = 0

    precision, recall, _, _ = score(true, pred, labels=classes)

    for i in range(len(true)):
        label_true = true[i]
        label_pred = pred[i]
        if label_true == label_pred:
            matrix_true[label_true] += 1
        else:
            matrix_false[label_true] += 1

    print('\\begin{table}[h]')
    print('\\centering')
    print('\\caption{Boolean Matrix}')
    print('\\label{boolean_matrix}')
    print('\\begin{tabular}{|r|r|r|r|r|}')
    print(' \\hline')
    print "Label & Predicted Correctly & Predicted Incorrectly & Precision & Recall \\\\ \\hline"
    for i in range(len(classes)):
        print "{} & {} & {} & {:0.2f} & {:0.2f} \\\\ \\hline".format(classes[i], matrix_true.get(classes[i], 0), matrix_false.get(classes[i], 0), precision[i], recall[i])
    print "\\multicolumn{{3}}{{|l|}}{{Weighted Average}} & {:0.2f} & {:0.2f} \\\\ \hline".format(precision_score(true, pred, average='weighted'), recall_score(true, pred, average='weighted'))
    print('\\end{tabular}')
    print('\\end{table}')
Example #27
    def evaluate(self, dev_df):
        print('Evaluating ... ')
        if 'id' in dev_df.columns:
            dev_df = dev_df.drop(['id'], axis=1)
        dev_df2 = self.process_data_frame(dev_df)
        result, model_outputs, wrong_predictions = self.model.eval_model(
            dev_df2)

        column_names = []
        for col in dev_df.columns:
            if (col != 'text' and col != 'labels' and col != 'sentence'):
                column_names.append(col + '_predicted')
        print(len(column_names))

        model_outputs = model_outputs.round(0)
        output_df = pd.DataFrame(model_outputs, columns=column_names)

        res_pd = pd.concat([dev_df, output_df], axis=1)
        file_name = '/home/minh/out.tsv'
        res_pd.to_csv(file_name, sep='\t', index=False)

        for col in column_names:
            print(col)
            predicted_column = res_pd.loc[:, col]
            predicted = predicted_column.values
            col2 = col.replace('_predicted', '')
            test_column = res_pd.loc[:, col2]
            y_test = test_column.values
            precision, recall, fscore, support = score(y_test, predicted)
            logging.warning('{}\t{:.2%}\t{:.2%}\t{:.2%}\t{}'.format(
                col[:4], precision[-1], recall[-1], fscore[-1], support[-1]))

        print('Evaluating Finished!')
        return result, model_outputs, wrong_predictions
Example #28
 def eval(self):
     precision, recall, fscore, support = score(self.TEST_LABELS,
                                                self.PRED_LABELS)
     print('precision: {}'.format(precision))
     print('recall: {}'.format(recall))
     print('fscore: {}'.format(fscore))
     print('support: {}'.format(support))
Example #29
    def test(self):
        self.model = tf.keras.models.load_model(self.model_name)
        y_pred = self.model.predict_classes(self.X_test)
        y_true = self.y_test
        loss, acc = self.model.evaluate(self.X_test, self.y_test, verbose=2)
        precision, recall, f1, _ = score(y_true, y_pred, zero_division=1)
        print(acc)
        print(precision)
        print(recall)

        conf_matrix = confusion_matrix(y_true, y_pred)

        labels = ["Normal", "Attack"]
        plt.figure(figsize=(6, 6))
        sns.heatmap(conf_matrix,
                    xticklabels=labels,
                    yticklabels=labels,
                    annot=True,
                    fmt="d")
        plt.title("Confusion matrix")
        plt.ylabel('True class')
        plt.xlabel('Predicted class')
        plt.savefig('figures/Confusion_matrix.png')
        plt.show()

        return (loss, acc, precision, recall, f1)
Example #30
def print_and_get_precision_recall_fscore_support(Y_testing, Y_hat):
    precision, recall, fscore, support = score(Y_testing, Y_hat)
    print('Precision: {}'.format(['{:.2f}'.format(100 * x) for x in precision]))
    print('Recall   : {}'.format(['{:.2f}'.format(100 * x) for x in recall]))
    print('Fscore   : {}'.format(['{:.2f}'.format(100 * x) for x in fscore]))
    print('Support  : {}'.format([int(x) for x in support]))
    return precision, recall, fscore, support
Example #31
def main():
    rf = GradientBoostingClassifier(n_estimators=250,
                                    max_depth=31,
                                    learning_rate=0.05,
                                    max_features='sqrt',
                                    subsample=0.95,
                                    random_state=10)
    start = time.time()
    rf_model = rf.fit(X_train_vect, y_train)
    end = time.time()
    fit_time = end - start

    start = time.time()
    y_pred = rf_model.predict(X_test_vect)
    end = time.time()
    pred_time = end - start

    precision, recall, fscore, train_support = score(y_test,
                                                     y_pred,
                                                     pos_label='neg',
                                                     average='binary')
    print(
        'Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'
        .format(round(fit_time, 3), round(pred_time, 3), round(precision, 3),
                round(recall, 3),
                round((y_pred == y_test).sum() / len(y_pred), 3)))

    pickle.dump(rf_model, open('final_gbc.sav', 'wb'))
    def _compute_metrics(self, labels, predictions):
        _, counts = np.unique(labels, return_counts=True)
        acc = accuracy_score(labels, predictions)
        prec, rec, f1, _ = score(labels, predictions)

        return {
            'COUNT': counts,
            'ACC': acc,
            'AVG_ACC': np.average(rec),
            'PRECISION': prec,
            'RECALL': rec,
            'F1_SCORE': f1
        }
Example #33
    def getAnalysis(self, true_pred, y_pred1):

        precision, recall, fscore, support = score(true_pred, y_pred1)
        return (
            matthews_corrcoef(true_pred, y_pred1),
            roc_auc_score(true_pred, y_pred1),
            precision[0],
            precision[1],
            recall[0],
            recall[1],
            fscore[0],
            fscore[1],
            support[0],
            support[1],
        )
def predOnTrainData(features, labels, maxent):
    
    features = np.asarray(features)
    labels = np.asarray(labels)

    for x in range(1):
        scores = defaultdict(list)
        scores2 = defaultdict(list)

        cnt = 0
        # sklearn.cross_validation was removed in scikit-learn 0.20; newer code uses
        # sklearn.model_selection.KFold(n_splits=10) instead
        for TrainIndices, TestIndices in cross_validation.KFold(n=features.shape[0], n_folds=10, shuffle=False, random_state=None):
    
            TrainX_i = features[TrainIndices]
            Trainy_i = labels[TrainIndices]
    
            TestX_i = features[TestIndices]
            Testy_i =  labels[TestIndices]
    
            maxent.fit(TrainX_i,Trainy_i)
            ypred_i = maxent.predict(TestX_i)
            
            scores["Accuracy"].append(accuracy_score(Testy_i, ypred_i))
            scores["F1"].append(f1_score(Testy_i, ypred_i, average='macro'))
            scores["Precision"].append(precision_score(Testy_i, ypred_i, average='macro'))
            scores["Recall"].append(recall_score(Testy_i, ypred_i, average='macro'))

            cm = confusion_matrix(Testy_i, ypred_i)
            print(cm)
            cnt += 1

            from sklearn.metrics import precision_recall_fscore_support as score

            precision, recall, fscore, support = score(Testy_i, ypred_i)

            print('precision: {}'.format(precision))
            print('recall: {}'.format(recall))
            print('fscore: {}'.format(fscore))
            print('support: {}'.format(support))
    
        #scores = cross_validation.cross_val_score(maxent, features, labels, cv=10)
        print("--")
    
        for key in sorted(scores.keys()):
            currentmetric = np.array(scores[key])
            print("%s : %0.2f (+/- %0.2f)" % (key,currentmetric.mean(), currentmetric.std()))
        print("\n--")
Example #35
def precision_recall_graph(segment):
    plt.clf()
    X, y, X_test, y_test, features_map = \
        pickle.load(open('prepared/' + segment + '_allegedly_best.pickle', 'rb'))
    models = pickle.load(open('prepared/' + segment + '_logreg.pickle', 'rb'))
    m = max(models, key=lambda t: score(y_test, t.decision_function(X_test)))

    y_pred = [l[1] for l in m.predict_proba(X_test)]
    precision, recall, thresholds = precision_recall_curve(y_test, y_pred)
    plt.figure(figsize=(3, 3))
    plt.gcf().subplots_adjust(bottom=0.15)
    plt.gcf().subplots_adjust(left=0.18)
    plt.plot(recall, precision, 'g-')
    plt.xlabel('recall', fontsize=10)
    plt.ylabel('precision', fontsize=10)
    plt.axis([0, 1, 0, 1])
    plt.title(segment + ' precision / recall', fontsize=10)
    plt.savefig('graphs/' + segment + '_precision_recall.png')
Example #36
def client_target(task, callback):
    (experiment_name, experiment_id,
     train_dataset, test_dataset, _, _) = task['key']
    parameters = task['parameters']

    print 'Starting task %s...' % str(experiment_id)
    print 'Training Set: %s' % train_dataset
    print 'Test Set:     %s' % test_dataset
    print 'Parameters:'
    for k, v in parameters.items():
        print '\t%s: %s' % (k, str(v))
    #import pdb;pdb.set_trace()
   
    train = get_dataset(train_dataset)
    test = get_dataset(test_dataset)
    #import pdb;pdb.set_trace()
    """
    data_raw = np.genfromtxt('natural_scene.data',delimiter = ",")
    class data_class(object):
	def __init__(self):
		pass
    train=data_class()
    test=data_class()
    feature_matrix = data_raw[:, 2:-5]
    label_matrix = data_raw[:, -5:]
    num_instances = data_raw.shape[0]
    train.instances = feature_matrix[:int(math.floor(num_instances/2)),: ]
    test.instances = feature_matrix[int(math.floor(num_instances/2)):,: ]
    train.instance_labels = label_matrix[:int(math.floor(num_instances/2)),: ]
    test.instance_labels = label_matrix[int(math.floor(num_instances/2)):,:  ]
    """
    submission = {
        'instance_predictions' : {
            'train' : {},
            'test'  : {},
        },
        'bag_predictions' : {
            'train' : {},
            'test'  : {},
        },
        'statistics' : {}
    }
    timer = Timer()

    if parameters['kernel'] == 'emp':
        dataset = get_base_dataset(train_dataset)
        idxfile = os.path.join(IDX_DIR, IDX_FMT % dataset)
        kernelfile = os.path.join(PRECOMPUTED_DIR,
            PRECOMPUTED_FMT % (dataset, parameters['ktype']))
        parameters['dataset'] = dataset
        parameters['idxfile'] = idxfile
        parameters['kernelfile'] = kernelfile
        empirical_labels = list(map(str, train.bag_ids))
        if parameters.pop('transductive', False):
            empirical_labels += list(map(str, test.bag_ids))
        parameters['empirical_labels'] = empirical_labels
        train.bags = train.bag_ids
        test.bags = test.bag_ids

    classifier_name = parameters.pop('classifier')
    if classifier_name in CLASSIFIERS:
        classifier0 = CLASSIFIERS[classifier_name](**parameters)
        classifier1 = CLASSIFIERS[classifier_name](**parameters)
        classifier2 = CLASSIFIERS[classifier_name](**parameters)
        classifier3 = CLASSIFIERS[classifier_name](**parameters)
        classifier4 = CLASSIFIERS[classifier_name](**parameters)
    else:
        print 'Technique "%s" not supported' % classifier_name
        callback.quit = True
        return

    print 'Training...'
    timer.start('training')
    if train.regression:
        classifier1.fit(train.bags, train.bag_labels)
    else:
        #import pdb;pdb.set_trace()
        classifier0.fit(train.instances, train.instance_labels[:, 0].reshape((-1,)))
        classifier1.fit(train.instances, train.instance_labels[:, 1].reshape((-1,)))
        classifier2.fit(train.instances, train.instance_labels[:, 2].reshape((-1,)))
        classifier3.fit(train.instances, train.instance_labels[:, 3].reshape((-1,)))
        classifier4.fit(train.instances, train.instance_labels[:, 4].reshape((-1,)))
    timer.stop('training')

    print 'Computing test bag predictions...'
    timer.start('test_bag_predict')
    bag_predictions0 = classifier0.predict(test.instances)
    bag_predictions1 = classifier1.predict(test.instances)
    bag_predictions2 = classifier2.predict(test.instances)
    bag_predictions3 = classifier3.predict(test.instances)
    bag_predictions4 = classifier4.predict(test.instances)

    timer.stop('test_bag_predict')

    if INSTANCE_PREDICTIONS:
        print 'Computing test instance predictions...'
        timer.start('test_instance_predict')
        instance_predictions = classifier.predict(test.instances_as_bags)
        timer.stop('test_instance_predict')

    print 'Computing train bag predictions...'
    timer.start('train_bag_predict')
    train_bag_labels = classifier0.predict() # Saves results from training set
    timer.stop('train_bag_predict')

    if INSTANCE_PREDICTIONS:
        print 'Computing train instance predictions...'
        timer.start('train_instance_predict')
        train_instance_labels = classifier.predict(train.instances_as_bags)
        timer.stop('train_instance_predict')

    print 'Constructing submission...'
    # Add statistics
    for attribute in ('linear_obj', 'quadratic_obj'):
        if hasattr(classifier0, attribute):
            submission['statistics'][attribute] = getattr(classifier0,
                                                          attribute)
    submission['statistics'].update(timer.get_all('_time'))
    bag_predictions = np.hstack((bag_predictions0[:,np.newaxis], bag_predictions1[:,np.newaxis],bag_predictions2[:,np.newaxis],bag_predictions3[:,np.newaxis],bag_predictions4[:,np.newaxis]  ))
    for ( _,i), y in zip(test.instance_ids, map(tuple,bag_predictions)):
        submission['bag_predictions']['test'][i] = map(float,y)
    for (_, i), y in zip(train.instance_ids, train_bag_labels.flat):
        submission['bag_predictions']['train'][i] = float(y)
    if INSTANCE_PREDICTIONS:
        for i, y in zip(test.instance_ids, instance_predictions.flat):
            submission['instance_predictions']['test'][i] =float(y)
        for i, y in zip(train.instance_ids, train_instance_labels.flat):
            submission['instance_predictions']['train'][i] = float(y)

    # For backwards compatibility with older versions of scikit-learn
    if train.regression:
        from sklearn.metrics import r2_score as score
        scorename = 'R^2'
    else:
        try:
            from sklearn.metrics import roc_auc_score as score
        except:
            from sklearn.metrics import auc_score as score
        scorename = 'AUC'

    try:
        """
        if train.bag_labels.size > 1:
            print ('Training Bag %s Score: %f'
                   % (scorename, score(train.instance_labels, train_bag_labels)))
        if INSTANCE_PREDICTIONS and train.instance_labels.size > 1:
            print ('Training Inst. %s Score: %f'
                   % (scorename, score(train.instance_labels, train_instance_labels)))
        """
        if test.bag_labels.size > 1:
            AUC_list = []
            for ii in range(5):
                AUC_list.append(score(test.instance_labels[:, ii], bag_predictions[:, ii]))
            AUC_mean = np.mean(AUC_list)
            submission['statistics'][scorename] = AUC_mean
            print ('Test Bag Average %s Score: %f'
                   % (scorename, AUC_mean))
            print ('Test Bag Individual %s Score: ' % scorename + ','.join(map(str, AUC_list)))
        """
        if INSTANCE_PREDICTIONS and test.instance_labels.size > 1:
            print ('Test Inst. %s Score: %f'
                   % (scorename, score(test.instance_labels, instance_predictions)))
        """
    except Exception as e:
        print "Couldn't compute scores."
        print e

    print 'Finished task %s.' % str(experiment_id)
    return submission
Example #37

# a classic way to overfit is to use a small number
# of data points and a large number of features;
# train on only 150 events to put ourselves in this regime
features_train = features_train[:150].toarray()
labels_train = labels_train[:150]

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score as score

tree = DecisionTreeClassifier()
tree.fit(features_train,labels_train)
pred = tree.predict(features_test)

score_test = score(labels_test, pred)
print score_test
word_index = []
for n,i in enumerate(tree.feature_importances_):
    if i != 0:
        print n, i
        word_index.append(n)

# this was used to find the most influential word;
# it is no longer needed

# print vectorizer.get_feature_names()[33614]
# print vectorizer.get_feature_names()[14343]

# for i in word_index:
#     print vectorizer.get_feature_names()[i]
Example #38
def C_score(C, X, y, X_test, y_test):
    
    m = logreg(C=C)
    m.fit(X, y)
    
    return score(y_test, m.decision_function(X_test))
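# A hedged usage sketch for the helper above, assuming `logreg` is
# sklearn.linear_model.LogisticRegression, `score` is a ranking metric such as
# roc_auc_score applied to decision-function margins, and X, y, X_test, y_test
# are an existing binary-classification train/test split:
from sklearn.linear_model import LogisticRegression as logreg
from sklearn.metrics import roc_auc_score as score

Cs = [0.01, 0.1, 1.0, 10.0, 100.0]
results = {C: C_score(C, X, y, X_test, y_test) for C in Cs}
best_C = max(results, key=results.get)
print(best_C, results[best_C])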
Example #39
def client_target(task, callback):
    (experiment_name, experiment_id,
     train_dataset, test_dataset, _, _) = task['key']
    parameters = task['parameters']

    print 'Starting task %s...' % str(experiment_id)
    print 'Training Set: %s' % train_dataset
    print 'Test Set:     %s' % test_dataset
    print 'Parameters:'
    for k, v in parameters.items():
        print '\t%s: %s' % (k, str(v))

    train = get_dataset(train_dataset)
    test = get_dataset(test_dataset)

    submission = {
        'instance_predictions' : {
            'train' : {},
            'test'  : {},
        },
        'bag_predictions' : {
            'train' : {},
            'test'  : {},
        },
        'statistics' : {}
    }
    timer = Timer()

    if parameters['kernel'] == 'emp':
        dataset = get_base_dataset(train_dataset)
        idxfile = os.path.join(IDX_DIR, IDX_FMT % dataset)
        kernelfile = os.path.join(PRECOMPUTED_DIR,
            PRECOMPUTED_FMT % (dataset, parameters['ktype']))
        parameters['dataset'] = dataset
        parameters['idxfile'] = idxfile
        parameters['kernelfile'] = kernelfile
        empirical_labels = list(map(str, train.bag_ids))
        if parameters.pop('transductive', False):
            empirical_labels += list(map(str, test.bag_ids))
        parameters['empirical_labels'] = empirical_labels
        train.bags = train.bag_ids
        test.bags = test.bag_ids

    classifier_name = parameters.pop('classifier')
    if classifier_name in CLASSIFIERS:
        classifier = CLASSIFIERS[classifier_name](**parameters)
    else:
        print 'Technique "%s" not supported' % classifier_name
        callback.quit = True
        return

    print 'Training...'
    timer.start('training')
    if train.regression:
        classifier.fit(train.bags, train.bag_labels)
    else:
        classifier.fit(train.bags, train.pm1_bag_labels)
    timer.stop('training')

    print 'Computing test bag predictions...'
    timer.start('test_bag_predict')
    bag_predictions = classifier.predict(test.bags)
    timer.stop('test_bag_predict')

    if INSTANCE_PREDICTIONS:
        print 'Computing test instance predictions...'
        timer.start('test_instance_predict')
        instance_predictions = classifier.predict(test.instances_as_bags)
        timer.stop('test_instance_predict')

    print 'Computing train bag predictions...'
    timer.start('train_bag_predict')
    train_bag_labels = classifier.predict() # Saves results from training set
    timer.stop('train_bag_predict')

    if INSTANCE_PREDICTIONS:
        print 'Computing train instance predictions...'
        timer.start('train_instance_predict')
        train_instance_labels = classifier.predict(train.instances_as_bags)
        timer.stop('train_instance_predict')

    print 'Constructing submission...'
    # Add statistics
    for attribute in ('linear_obj', 'quadratic_obj'):
        if hasattr(classifier, attribute):
            submission['statistics'][attribute] = getattr(classifier,
                                                          attribute)
    submission['statistics'].update(timer.get_all('_time'))

    for i, y in zip(test.bag_ids, bag_predictions.flat):
        submission['bag_predictions']['test'][i] = float(y)
    for i, y in zip(train.bag_ids, train_bag_labels.flat):
        submission['bag_predictions']['train'][i] = float(y)
    if INSTANCE_PREDICTIONS:
        for i, y in zip(test.instance_ids, instance_predictions.flat):
            submission['instance_predictions']['test'][i] = float(y)
        for i, y in zip(train.instance_ids, train_instance_labels.flat):
            submission['instance_predictions']['train'][i] = float(y)

    # For backwards compatibility with older versions of scikit-learn
    if train.regression:
        from sklearn.metrics import r2_score as score
        scorename = 'R^2'
    else:
        try:
            from sklearn.metrics import roc_auc_score as score
        except:
            from sklearn.metrics import auc_score as score
        scorename = 'AUC'

    try:
        if train.bag_labels.size > 1:
            print ('Training Bag %s Score: %f'
                   % (scorename, score(train.bag_labels, train_bag_labels)))
        if INSTANCE_PREDICTIONS and train.instance_labels.size > 1:
            print ('Training Inst. %s Score: %f'
                   % (scorename, score(train.instance_labels, train_instance_labels)))
        if test.bag_labels.size > 1:
            print ('Test Bag %s Score: %f'
                   % (scorename, score(test.bag_labels, bag_predictions)))
        if INSTANCE_PREDICTIONS and test.instance_labels.size > 1:
            print ('Test Inst. %s Score: %f'
                   % (scorename, score(test.instance_labels, instance_predictions)))
    except Exception as e:
        print "Couldn't compute scores."
        print e

    print 'Finished task %s.' % str(experiment_id)
    return submission
clf = RandomForestClassifier(n_estimators=100, n_jobs=4)

# x should be (n_samples, n_features) in a matrix
# (DataFrame.as_matrix() was removed in pandas 1.0; use x.to_numpy() on newer versions)
x = x.as_matrix()

# fit the classifier using a multi-label indicator
clf.fit(x, y)

# 1. split the data into a training set and a test set
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)
# 2. run classifier
classifier = svm.SVC(kernel="linear")

# print respective feature_importance coefficients showing how important each feature is in predicting the target vector
print (clf.feature_importances_)
print "mean accuracy score for the clf model on test data: ", clf.score(x_test, y_test)
# show prediction results
print "test (prediction): ", clf.predict(x_train)
test_pred = clf.predict(x_test)

# compute mean error using multiclass log loss function (this is another means of showing how accurate prediction is)
def multiclass_log_loss(y_true, y_pred, eps=1e-15):
    """Multi class version of Logarithmic Loss metric.
    https://www.kaggle.com/wiki/MultiClassLogLoss
    Parameters
    ----------
    y_true : array, shape = [n_samples]
            true class, intergers in [0, n_classes - 1)
    y_pred : array, shape = [n_samples, n_classes]
    Returns
    -------
    """
#                 (left, right) = (tmp[0], tmp[2])
#                 # right = float(right)
#                 # print left, right
#                 # links.append((x, z, value))
#                 nodes.append((x, left, right))
#             else:
#                 nodes.append(x, "BBB", -1000)
#     except Exception:
#         pass
# print nodes
# print links
# print len(clf.feature_importances_), len(tmp)
# exit()
for i in range(0, len(tmp)):
    print(tmp[i], ':', clf.feature_importances_[i])
# print clf.feature_importances_
# print len(clf.feature_importances_)
answer = clf.predict(x_train)
precision, recall, fscore, support = score(y_train, clf.predict(x_train))
print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))

del xdata, ydata, processed_data
gc.collect()
if fil1:
    fil1.close()
if fil2:
    fil2.close()
Example #42
X_train, X_test, y_train, y_test = train_test_split(X_features, new_data['label'], test_size=0.2)


X_train.head()
X_test.head()
y_train.head()
new_data.head()
new_data[156:157]
len(new_data)

rf = RandomForestClassifier(n_estimators=50,max_depth=20,n_jobs=-1)
rf_model = rf.fit(X_train,y_train)
y_pred=rf_model.predict(X_test)
len(y_pred)
precision, recall, fscore, support = score(y_test, y_pred, pos_label='spam', average='binary')

score(y_test[100:101], rf_model.predict(X_test[100:101]), pos_label='spam', average='binary')

###############################################################################
###############################################################################

measurements = [
    {'city': 'Dubai', 'temperature': 33.},
    {'city': 'London', 'temperature': 12.},
    {'city': 'San Francisco', 'temperature': 18.},
]

from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer()