Example #1
def plot_confusion_matrix(confusion_matrix_array):

    print('###### Start Confusion Matrix ######')

    print(confusion_matrix_array)

    # Persist the four confusion-matrix cells to the per-model report CSV.
    save_report_to_csv(REPORT_FOLDER + get_model_name_by_file(VALIDATION_FILE) + '_confusion_report.csv', [
        'MultinomialNB',
        get_model_name_by_file(MODEL_FILE),
        confusion_matrix_array[0][0],
        confusion_matrix_array[0][1],
        confusion_matrix_array[1][0],
        confusion_matrix_array[1][1]
    ])

    print('###### End Confusion Matrix ######')


    # Build a labelled 2x2 DataFrame so seaborn can render it as a heatmap.
    df_cm = pd.DataFrame(confusion_matrix_array, range(2), range(2))

    fig, ax = plt.subplots(figsize=(10, 7))

    sn.heatmap(df_cm, annot=True, fmt='g', ax=ax, annot_kws={"size": 16})  # annotation font size

    # Labels, title and ticks.
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Real')

    ax.yaxis.set_ticklabels(['Non Political', 'Political'])
    ax.xaxis.set_ticklabels(['Non Political', 'Political'])

    # Derive a human-readable title from the model file name.
    model_name = MODEL_FILE
    model_name = model_name.replace('.politics_ben.skl', '')
    model_name = model_name.replace(SKL_FOLDER, '')

    ax.set_title(model_name.replace('_', ' ').upper())

    fig.savefig(PLOT_FOLDER + 'confusion_matrix_publica_' + model_name + '.png', dpi=400)
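
# Usage sketch (not part of the original script): plot_confusion_matrix expects
# the 2x2 array returned by sklearn's confusion_matrix, mirroring the
# commented-out call in Example #4. The label arrays below are hypothetical;
# in the surrounding scripts they come from load_validation_file_csv and
# model.predict, and the module-level constants (REPORT_FOLDER, MODEL_FILE,
# etc.) must already be configured.
from sklearn.metrics import confusion_matrix

y_true = [0, 1, 1, 0, 1]
y_pred = [0, 1, 0, 0, 1]
plot_confusion_matrix(confusion_matrix(y_true, y_pred))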
Example #2
def train_CNN(X,
              y,
              inp_dim,
              model,
              weights,
              epochs=EPOCHS,
              batch_size=BATCH_SIZE):
    cv_object = KFold(n_splits=NO_OF_FOLDS, shuffle=True, random_state=42)
    print(cv_object)
    p, r, f1 = [], [], []
    p1, r1, f11 = 0., 0., 0.
    p_class, r_class, f1_class = [], [], []
    sentence_len = X.shape[1]

    macro_f1, macro_r, macro_p = [], [], []

    for train_index, test_index in cv_object.split(X):
        if INITIALIZE_WEIGHTS_WITH == "word2vec":
            model.layers[0].set_weights([weights])
        elif INITIALIZE_WEIGHTS_WITH == "random":
            shuffle_weights(model)
        else:
            print("ERROR!")
            return
        X_train, y_train = X[train_index], y[train_index]
        X_test, y_test = X[test_index], y[test_index]
        # Stack the labels onto the features so batch_gen shuffles them together.
        y_train = y_train.reshape((len(y_train), 1))
        X_temp = np.hstack((X_train, y_train))
        for epoch in range(epochs):
            for X_batch in batch_gen(X_temp, batch_size):
                x = X_batch[:, :sentence_len]
                y_temp = X_batch[:, sentence_len]

                class_weights = None
                if SCALE_LOSS_FUN:
                    # Weight each class by its frequency in the current batch
                    # (tx_class is a module-level list of class labels).
                    class_weights = {}
                    for cw in range(len(set(tx_class))):
                        class_weights[cw] = np.where(
                            y_temp == cw)[0].shape[0] / float(len(y_temp))
                try:
                    y_temp = np_utils.to_categorical(y_temp,
                                                     num_classes=len(
                                                         set(tx_class)))
                except Exception as e:
                    print(e)
                    print(y_temp)
                #print(x.shape, y.shape)
                loss, acc = model.train_on_batch(x,
                                                 y_temp,
                                                 class_weight=class_weights)

        y_pred = model.predict_on_batch(X_test)
        y_pred = np.argmax(y_pred, axis=1)
        #print(classification_report(y_test, y_pred))
        #print(precision_recall_fscore_support(y_test, y_pred))
        #print(y_pred)
        p.append(precision_score(y_test, y_pred, average='weighted'))
        p1 += precision_score(y_test, y_pred, average='micro')
        p_class.append(precision_score(y_test, y_pred, average=None))
        r.append(recall_score(y_test, y_pred, average='weighted'))
        r1 += recall_score(y_test, y_pred, average='micro')
        r_class.append(recall_score(y_test, y_pred, average=None))
        f1.append(f1_score(y_test, y_pred, average='weighted'))
        f11 += f1_score(y_test, y_pred, average='micro')
        f1_class.append(f1_score(y_test, y_pred, average=None))

        macro_p.append(precision_score(y_test, y_pred, average='macro'))
        macro_r.append(recall_score(y_test, y_pred, average='macro'))
        macro_f1.append(f1_score(y_test, y_pred, average='macro'))

    print("macro results are")
    print("average precision is %f" % (np.array(p).mean()))
    print("average recall is %f" % (np.array(r).mean()))
    print("average f1 is %f" % (np.array(f1).mean()))

    save_report_to_csv(
        REPORT_FOLDER + 'CNN_training_report.csv',
        [
            'CNN',
            get_model_name_by_file(POLITICS_FILE),
            #weighted scores
            np.array(p).mean(),
            np.array(p).std() * 2,
            np.array(r).mean(),
            np.array(r).std() * 2,
            np.array(f1).mean(),
            np.array(f1).std() * 2,

            #macro scores
            np.array(macro_p).mean(),
            np.array(macro_p).std() * 2,
            np.array(macro_r).mean(),
            np.array(macro_r).std() * 2,
            np.array(macro_f1).mean(),
            np.array(macro_f1).std() * 2,

            #by class scores
            np.array(np.array(p_class)[:, 0]).mean(),
            np.array(np.array(p_class)[:, 1]).mean(),
            np.array(np.array(r_class)[:, 0]).mean(),
            np.array(np.array(r_class)[:, 1]).mean(),
            np.array(np.array(f1_class)[:, 0]).mean(),
            np.array(np.array(f1_class)[:, 1]).mean()
        ])

    print("micro results are")
    print("average precision is %f" % (p1 / NO_OF_FOLDS))
    print("average recall is %f" % (r1 / NO_OF_FOLDS))
    print("average f1 is %f" % (f11 / NO_OF_FOLDS))
Example #3
    # Read file-system paths from the config (cf is presumably a ConfigParser).
    path = dict(cf.items("file_path"))
    dir_w2v = path['dir_w2v']
    dir_in = path['dir_in']

    texts, y_true = load_validation_file_csv(VALIDATION_FILE)

    print('Loading ' + MODEL_FILE + ' file...')
    model = joblib.load(MODEL_FILE)
    vectorizer = get_vectorizer()
    pol = ''
    n_pol = ''
    y_pred = list()

    mean_auc, std_auc = generate_roc_curve(model, texts, y_true, MODEL_FILE, get_model_name_by_file(VALIDATION_FILE))

    print('Predicting...')

    X = vectorizer.transform(texts)
    y_pred = model.predict(X)

    print('Classification Report')
    print(classification_report(y_true, y_pred))
    p, r, f1, s = precision_recall_fscore_support(y_true, y_pred)

    # Weighted averages account for class imbalance; macro treats classes equally.
    ff1 = f1_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    precision = precision_score(y_true, y_pred, average='weighted')

    f1_macro = f1_score(y_true, y_pred, average='macro')
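
# The snippet computes both weighted and macro averages; on imbalanced data
# they can differ sharply. A small self-contained illustration:
from sklearn.metrics import f1_score

y_true = [0, 0, 0, 0, 1]
y_pred = [0, 0, 0, 0, 0]
print(f1_score(y_true, y_pred, average='weighted'))  # ~0.711, dominated by the majority class
print(f1_score(y_true, y_pred, average='macro'))     # ~0.444, both classes count equally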
Example #4
            n_pol += text + '\n'
            y_pred.append(0)

    print(classification_report(y_true, y_pred))
    p, r, f1, s = precision_recall_fscore_support(y_true, y_pred)

    ff1 = f1_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    precision = precision_score(y_true, y_pred, average='weighted')

    f1_macro = f1_score(y_true, y_pred, average='macro')
    recall_macro = recall_score(y_true, y_pred, average='macro')
    precision_macro = precision_score(y_true, y_pred, average='macro')

    accuracy = accuracy_score(y_true, y_pred)

    generate_normal(X, y_true)

    mean_auc, std_auc = generate_roc_curve(
        X, y_true, get_model_name_by_file(VALIDATION_FILE))

    #plot_confusion_matrix (confusion_matrix(y_true, y_pred))

    save_report_to_csv(REPORT_FOLDER + 'CNN_validation_report.csv', [
        'CNN',
        get_model_name_by_file(H5_FILE),
        get_model_name_by_file(VALIDATION_FILE), accuracy, p[0], p[1], r[0],
        r[1], f1[0], f1[1], s[0], s[1], f1_macro, recall_macro,
        precision_macro, mean_auc, std_auc, ff1, recall, precision
    ])
Example #5
        dir_w2v + W2VEC_MODEL_FILE, binary=False, unicode_errors="ignore")

    texts, y_true = load_validation_file_csv(VALIDATION_FILE)

    print('Loading ' + MODEL_FILE + ' file...')

    model = joblib.load(MODEL_FILE)
    pol = ''
    n_pol = ''
    y_pred = list()
    # Pre-process the raw texts, then convert them to model features.
    tp = TextProcessor()
    texts = tp.text_process(texts, text_only=True)
    X = gen_data(texts)

    mean_auc, std_auc = generate_roc_curve(
        model, X, y_true, MODEL_FILE, get_model_name_by_file(VALIDATION_FILE))

    print('Predicting...')

    y_pred = model.predict(X)

    print('Classification Report')
    print(classification_report(y_true, y_pred))
    p, r, f1, s = precision_recall_fscore_support(y_true, y_pred)

    # Derive a human-readable model name from the file path for the plot.
    model_name = MODEL_FILE.replace(SKL_FOLDER, '')
    model_name = model_name.replace('.politics_ben.skl', '')
    model_name = model_name.replace('_', ' ').upper()

    generate_normal(model, X, y_true, model_name)
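
# The truncated first line of this example looks like the tail of a
# word-embedding load. With gensim, the full call would presumably be along
# these lines (the assignment target is not shown in the snippet):
from gensim.models import KeyedVectors

word_vectors = KeyedVectors.load_word2vec_format(
    dir_w2v + W2VEC_MODEL_FILE, binary=False, unicode_errors="ignore")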
Example #6
def train_classifier(classifier, vectorizer, data):
    train, test = train_test_split(data, test_size=0.1, random_state=SEED, shuffle=True)
    x_train, y_train = zip(*train)
    x_test, y_test = zip(*test)
    x_train = vectorizer.transform(x_train)
    x_test = vectorizer.transform(x_test)
    # Balance the class distribution (helper defined elsewhere in the script).
    x_train, y_train = equalize_classes(x_train, y_train)
    print("final size of training data: %s" % x_train.shape[0])
    classifier.fit(x_train, y_train)
    print(classification_report(y_test, classifier.predict(x_test)))

    scores1 = cross_val_score(classifier, x_train, y_train, cv=NO_OF_FOLDS, scoring='precision_weighted')

    # The classifier is already fitted above, and cross_val_predict clones it
    # per fold, so there is no need to refit it inside the call.
    predictions = cross_val_predict(classifier, x_train, y_train, cv=NO_OF_FOLDS)

    print("Precision(avg): %0.3f (+/- %0.3f)" %
          (scores1.mean(), scores1.std() * 2))
    precision_score_mean = scores1.mean()
    precision_score_std = scores1.std() * 2

    scores2 = cross_val_score(
        classifier, x_train, y_train, cv=NO_OF_FOLDS, scoring='recall_weighted')
    print("Recall(avg): %0.3f (+/- %0.3f)" %
          (scores2.mean(), scores2.std() * 2))

    recall_score_mean = scores2.mean()
    recall_score_std = scores2.std() * 2

    scores3 = cross_val_score(
        classifier, x_train, y_train, cv=NO_OF_FOLDS, scoring='f1_weighted')
    print("F1-score(avg): %0.3f (+/- %0.3f)" %
          (scores3.mean(), scores3.std() * 2))

    f1_score_mean = scores3.mean()
    f1_score_std = scores3.std() * 2

    # Per-class metrics computed from the cross-validated predictions.
    f1_class = f1_score(y_train, predictions, average=None)
    r_class = recall_score(y_train, predictions, average=None)
    p_class = precision_score(y_train, predictions, average=None)

    f1_macro = cross_val_score(classifier, x_train, y_train, cv=NO_OF_FOLDS, scoring='f1_macro')
    r_macro = cross_val_score(classifier, x_train, y_train, cv=NO_OF_FOLDS, scoring='recall_macro')
    p_macro = cross_val_score(classifier, x_train, y_train, cv=NO_OF_FOLDS, scoring='precision_macro')

    save_report_to_csv(REPORT_FOLDER + 'training_report.csv', [
        'MultinomialNB',
        get_model_name_by_file(POLITICS_FILE),

        # weighted scores
        precision_score_mean,
        precision_score_std,
        recall_score_mean,
        recall_score_std,
        f1_score_mean,
        f1_score_std,

        #macro scores
        f1_macro.mean(),
        f1_macro.std() * 2,
        r_macro.mean(),
        r_macro.std() * 2,
        p_macro.mean(),
        p_macro.std() * 2,

        #class scores
        f1_class[0],
        f1_class[1],
        r_class[0],
        r_class[1],
        p_class[0],
        p_class[1],
    ])

    return classifier
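
# Usage sketch (hypothetical toy data): train_classifier expects a pre-fitted
# vectorizer and data as (text, label) pairs. SEED, NO_OF_FOLDS,
# equalize_classes and the report helpers are module-level names in the
# original script and must be in scope.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

data = [('vote for the new bill', 1), ('great match last night', 0)] * 50
vectorizer = CountVectorizer().fit([text for text, _ in data])
classifier = train_classifier(MultinomialNB(), vectorizer, data)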
Example #7
def classification_model(X, Y, model_type=None):
    X, Y = shuffle(X, Y, random_state=SEED)
    print("Model Type:", model_type)
    
    # Reuse cached hyperparameters when available; otherwise grid-search for them.
    params = load_hiperparameters(POLITICS_FILE)

    if not params:
        model = GridSearchCV(estimator=get_model(model_type), param_grid=param_grid[model_type], n_jobs=-1, verbose=3)
    else:
        model = get_model(model_type)
        model.set_params(**params)

    model.fit(X, Y)

    predictions = cross_val_predict(model, X, Y, cv=NO_OF_FOLDS)

    if not params:
        try:
            print('\n Best estimator:')
            print(model.best_estimator_)
            save_hiperparameters(POLITICS_FILE, model.best_estimator_)

            print('\n Best hyperparameters:')
            print(model.best_params_)
        except Exception as error:
            print(error)
            print('Nothing to do!')

    scores1 = cross_val_score(model, X, Y, cv=NO_OF_FOLDS, scoring='precision_weighted')

    print("Precision(avg): %0.3f (+/- %0.3f)" % (scores1.mean(), scores1.std() * 2))

    precision_score_mean = scores1.mean()
    precision_score_std = scores1.std() * 2

    scores2 = cross_val_score(model, X, Y, cv=NO_OF_FOLDS, scoring='recall_weighted')
    print("Recall(avg): %0.3f (+/- %0.3f)" % (scores2.mean(), scores2.std() * 2))

    recall_score_mean = scores2.mean()
    recall_score_std = scores2.std() * 2

    scores3 = cross_val_score(
        model, X, Y, cv=NO_OF_FOLDS, scoring='f1_weighted')
    print("F1-score(avg): %0.3f (+/- %0.3f)" %
          (scores3.mean(), scores3.std() * 2))

    f1_score_mean = scores3.mean()
    f1_score_std = scores3.std() * 2

    # getting metrics by class
    f1_class = f1_score(Y, predictions, average=None)
    r_class = recall_score(Y, predictions, average=None)
    p_class = precision_score(Y, predictions, average=None)

    f1_macro = cross_val_score(model, X, Y, cv=NO_OF_FOLDS, scoring='f1_macro')
    r_macro = cross_val_score(model, X, Y, cv=NO_OF_FOLDS, scoring='recall_macro')
    p_macro = cross_val_score(model, X, Y, cv=NO_OF_FOLDS, scoring='precision_macro')

    print(f1_class, r_class, p_class)

    save_report_to_csv(REPORT_FOLDER + model_type + '_training_report.csv', [
        model_type, 
        get_model_name_by_file(POLITICS_FILE),
        
        # weighted scores
        precision_score_mean,
        precision_score_std,
        recall_score_mean,
        recall_score_std,
        f1_score_mean,
        f1_score_std,

        #macro scores
        f1_macro.mean(),
        f1_macro.std() * 2,
        r_macro.mean(),
        r_macro.std() * 2,
        p_macro.mean(),
        p_macro.std() * 2,

        # by class
        f1_class[0],
        f1_class[1],
        r_class[0],
        r_class[1],
        p_class[0],
        p_class[1],
    ])

    return model
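
# Usage sketch (hypothetical corpus and model key): get_model and param_grid
# are project helpers, so the valid model_type values depend on the original
# script; 'svm' below is illustrative only, and POLITICS_FILE, NO_OF_FOLDS
# and the report helpers must be in scope.
from sklearn.feature_extraction.text import TfidfVectorizer

texts = ['tax reform debate', 'music festival lineup'] * 50
labels = [1, 0] * 50
X = TfidfVectorizer().fit_transform(texts)
model = classification_model(X, labels, model_type='svm')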