示例#1
0
def train_model(dataset_feature, method, title):
    """Train a classifier on stored features and report its test performance.

    The feature dictionary is read with ``loadDict``. Every entry of its
    ``'list_features'`` value is flattened with ``reshape(-1)`` and paired with
    the matching entry of ``'list_labels'``. The samples are split 70/30 with a
    fixed random state, the selected classifier is fitted on the training part
    and evaluated on the held-out part with a normalized confusion-matrix plot
    and a printed classification report.

    Args:
        dataset_feature: Passed unchanged to ``loadDict``.
        method: ``'knn'`` (3 nearest neighbours) or ``'svm'`` (linear SVC).
        title: Title for the confusion-matrix plot; also printed to stdout.

    Raises:
        ValueError: If ``method`` is not a supported classifier name.
    """
    # Validate the method up front instead of failing later with a NameError
    # on an unbound ``clf``.
    if method == 'knn':
        clf = KNeighborsClassifier(n_neighbors=3)
    elif method == 'svm':
        clf = SVC(kernel='linear')
    else:
        raise ValueError(
            "Unsupported method {!r}; expected 'knn' or 'svm'".format(method))

    features = loadDict(dataset_feature)
    X = [item.reshape(-1) for item in features['list_features']]
    y = features['list_labels']

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=40)
    clf.fit(X_train, y_train)
    predict_list = clf.predict(X_test)

    # A set has no defined order; sort the labels seen in either the truth or
    # the predictions so names line up with the sorted label order that the
    # report uses.
    labels = sorted(set(y_test) | set(predict_list))

    plot_confusion_matrix(y_test,
                          predict_list,
                          classes=labels,
                          title=title,
                          normalize=True)
    print('TITLE: ', title)
    print(classification_report(y_test,
                                predict_list,
                                target_names=[str(label) for label in labels]))
def rbf_analysis(X, Y, c, g, title, filename):
    """Leave-one-out evaluation of an RBF-kernel SVC, appended to a report file.

    For every leave-one-out split an ``SVC(C=c, gamma=g, kernel='rbf')`` is
    fitted on the training rows and used to predict the single held-out row.
    Sensitivity, specificity and accuracy are derived from the pooled
    predictions (labels ``1`` = positive, ``0`` = negative; any other label
    combination is ignored in these tallies) and appended to ``filename``
    together with the Matthews correlation coefficient, the classification
    report and the confusion matrix. A row-normalized confusion matrix is then
    plotted.

    Args:
        X: Samples; must support indexing with index arrays (``X[train_index]``).
        Y: Labels aligned with ``X``, indexable the same way.
        c: SVC penalty parameter ``C``.
        g: SVC ``gamma`` value.
        title: Passed to ``plot_confusion_matrix`` as its second argument.
        filename: Path of the text report; opened in append mode.
    """
    print("Performing Cross Validation on Penalty: {}".format(c))
    predictions = []
    expected = []
    for train_index, test_index in LeaveOneOut(len(X)):
        clf = SVC(C=c, gamma=g, kernel='rbf')
        clf.fit(X[train_index], Y[train_index])
        predictions.append(clf.predict(X[test_index])[0])
        expected.append(Y[test_index][0])

    print("Calculating.....")
    TP, FN, TN, FP = 0, 0, 0, 0
    for predicted, actual in zip(predictions, expected):
        if predicted == 1 and actual == 1:
            TP += 1
        elif predicted == 0 and actual == 1:
            FN += 1
        elif predicted == 0 and actual == 0:
            TN += 1
        elif predicted == 1 and actual == 0:
            FP += 1

    # Guard the ratios: a data set without positives (or negatives) would
    # otherwise abort the whole analysis with ZeroDivisionError.
    positives = TP + FN
    negatives = TN + FP
    total = positives + negatives
    nan = float('nan')
    Sensitivity = TP / float(positives) if positives else nan
    Specificity = TN / float(negatives) if negatives else nan
    Accuracy = (TP + TN) / float(total) if total else nan

    # scikit-learn metrics take (y_true, y_pred); passing them the other way
    # round transposes the confusion matrix and swaps precision with recall.
    cm = confusion_matrix(expected, predictions)

    # Saving data to file (text mode: only str is written)
    with open(filename, 'a') as f:
        f.write("Sensitivity of Prediction: {} @ Penalty: {} @ Gamma: {}\n".format(Sensitivity, c, g))
        f.write("Specificity of Prediction: {} @ Penalty: {} @ Gamma: {}\n".format(Specificity, c, g))
        f.write("Accuracy of Prediction: {} @ Penalty: {} @ Gamma: {}\n".format(Accuracy, c, g))
        f.write("Matthews Correlation Coeefficient Value: {}\n".format(matthews_corrcoef(expected, predictions)))
        f.write("Classification Report:\n")
        f.write(classification_report(expected, predictions))
        f.write("Confusion Matrix\n")
        f.write(str(cm))

    # Normalize each row (true class) to fractions before plotting; the report
    # file is already closed at this point.
    cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    label1 = "Negative"
    label2 = "Positive"

    plt.figure()
    plot_confusion_matrix(cm, title, label1, label2)
def plot_matrix(y_pred: np.ndarray, dataset: tf.data.Dataset) -> None:
    """Show the confusion matrix of predictions against a dataset's labels.

    Both the predictions and the dataset labels are reduced to class indices
    with ``argmax`` over axis 1. The tick labels are the strings ``'0'`` to
    ``'9'``.

    Args:
        y_pred: Per-class scores, one row per sample.
        dataset: Iterable yielding ``(inputs, labels)`` pairs; the label
            batches are concatenated along axis 0.
    """
    predicted_classes = np.argmax(y_pred, axis=1)

    # Collect only the label part of every (inputs, labels) batch.
    label_batches = [batch_labels for _, batch_labels in dataset]
    true_classes = np.argmax(np.concatenate(label_batches, axis=0), axis=1)

    matrix = confusion_matrix(true_classes, predicted_classes)
    digit_names = list(map(str, range(10)))

    plt.figure()
    plot_confusion_matrix(matrix, classes=digit_names)
    plt.show()
示例#4
0
def getConfusion(y_test, prediction, name):
    """Plot the confusion matrix of a test run and save it under ``results/``.

    Args:
        y_test: Ground-truth labels, passed to ``confusionMatrix``.
        prediction: Predicted labels; their unique values become tick labels.
        name: Plot title, also appended to ``"results/"`` to form the output path.
    """
    matrix = confusionMatrix(y_test, prediction)
    # Tick labels are the distinct values that occur in the predictions.
    observed_classes = np.unique(prediction)
    np.set_printoptions(precision=2)

    # Non-normalized confusion matrix, written to disk rather than shown.
    plt.figure()
    cf_mat.plot_confusion_matrix(matrix, classes=observed_classes, title=name)
    output_path = "results/" + name
    plt.savefig(output_path)
示例#5
0
def selectModel(data_set, models, visual=False, plist_file=None):
    """Standardize the data, then score every candidate model on the test split.

    ``data_set.X_train`` and ``data_set.X_test`` are replaced in place by their
    standardized versions (the scaler is fitted on the training split only).
    Each non-skipped model is built via ``model(data_set, configs).clf``, its
    test accuracy is logged, and - when ``visual`` is true - a normalized
    confusion matrix of its test predictions is plotted.

    Args:
        data_set: Object exposing ``X_train``, ``X_test``, ``y_train`` and
            ``y_test``; the two ``X_*`` attributes are overwritten.
        models: Iterable of ``(model, configs)`` pairs. A truthy
            ``configs['skip']`` excludes that model.
        visual: Plot a confusion matrix per model when true.
        plist_file: Optional path to a JSON file whose top-level keys are used
            as class names. When ``None`` the names are ``'1'`` up to
            ``str(max(data_set.y_train))``.
    """
    scaler = StandardScaler()
    data_set.X_train = scaler.fit_transform(data_set.X_train)
    data_set.X_test = scaler.transform(data_set.X_test)

    for (model, configs) in models:
        if 'skip' in configs and configs['skip']:
            continue
        t0 = time.time()
        clf = model(data_set, configs).clf
        score = clf.score(data_set.X_test, data_set.y_test)
        logging.info("%s (%.2f) Test Accuracy: %0.4f",
                     str(model), time.time() - t0, score)

        if not visual:
            continue

        # Class names are only needed for the plot, so resolve them here.
        if plist_file is not None:
            with open(plist_file, 'r') as fd:
                label_names = list(json.load(fd).keys())
        else:
            label_names = [str(x) for x in range(1, max(data_set.y_train) + 1)]

        # The predictions must be computed before plotting; previously this
        # name was never assigned and plotting raised NameError.
        y_predict = clf.predict(data_set.X_test)

        # Plot normalized confusion matrix
        confusion_matrix.plot_confusion_matrix(
            data_set.y_test,
            y_predict,
            classes=label_names,
            normalize=True,
            title='Normalized confusion matrix')
示例#6
0
def plot_confusion_matrices(activation_funs_with_names_list, xs, ys):
    """
    Cross-validate one Keras classifier per activation-function pair and show
    the confusion matrix of its out-of-fold predictions.

    :param activation_funs_with_names_list: list of pairs ((act_fun_1, act_fun_2), (name_1, name_2))
    :param xs: input data, NxD np.array
    :param ys: output data, Nx1 np.array
    :return: None
    """
    title_template = ("Confusion matrix tasted on {} samples for "
                      "{} and {} activation functions")
    for act_funs, act_names in activation_funs_with_names_list:

        def build():
            # Zero-argument factory handed to the Keras wrapper.
            return build_model_with_activation_funs(act_funs[0], act_funs[1])

        classifier = KerasClassifier(build_fn=build,
                                     epochs=30,
                                     batch_size=5,
                                     verbose=0)
        predicted = cross_val_predict(classifier, xs, ys, cv=5)
        plot_title = title_template.format(len(predicted), act_names[0],
                                           act_names[1])
        confusion_matrix.plot_confusion_matrix(y_true=ys.astype(int),
                                               y_pred=predicted.astype(int),
                                               classes=[0, 1, 2, 3],
                                               title=plot_title)
        plt.show()
def NN_evaluation(model, testloader, criterion, patience=100, device="cpu"):
    """Evaluate ``model`` on a test set and plot its confusion matrix.

    Every batch of ``testloader`` is run through the model under
    ``torch.no_grad()``. The predictions returned by ``logit_accuracy`` and the
    batch labels are pooled, a classification report is printed, and a
    confusion matrix over the 16 class ids ``0..15`` is computed and plotted.

    Args:
        model: Callable mapping an input batch to logits.
        testloader: Iterable of dicts with ``'bands'`` (inputs) and
            ``'labels'`` (targets) tensors.
        criterion: Loss function called as ``criterion(logits, labels)``.
        patience: Unused; kept for interface compatibility.
        device: Device the inputs and labels are moved to.

    Returns:
        Tuple ``(y_test, y_pred_test, cm)``: flattened true labels, flattened
        predicted labels and the confusion matrix.
    """
    num_classes = 16
    pred_batches = []
    true_batches = []
    batch_loss = []
    batch_accs = []
    with torch.no_grad():
        for data in testloader:
            inputs = data['bands'].float().to(device)
            labels = data['labels'].long().to(device)
            logits = model(inputs)
            loss = criterion(logits, labels)
            batch_loss.append(loss.item())
            batch_acc, batch_pred = logit_accuracy(logits, labels)
            pred_batches.append(batch_pred)
            true_batches.append(data['labels'])
            batch_accs.append(batch_acc)

    # Pool the per-batch tensors into flat numpy arrays.
    y_pred_test = np.concatenate(
        [batch.to("cpu").numpy() for batch in pred_batches]).reshape(-1)
    y_test = np.concatenate(
        [batch.to("cpu").numpy() for batch in true_batches]).reshape(-1)

    # =========================================================================
    # Confusion matrix of the evaluated set
    # =========================================================================
    from confusion_matrix import plot_confusion_matrix

    # The matrix is built over every class id, so the plot must be labelled
    # with the same ordered ids. Labelling it with the set of labels that
    # happen to occur in y_test gives an unordered and possibly shorter list.
    class_ids = list(range(num_classes))
    print(classification_report(y_test, y_pred_test, digits=3))
    cm = confusion_matrix(y_test, y_pred_test, labels=class_ids)
    print("\n")
    plt.rcParams["figure.figsize"] = (10, 6)
    plt.figure()
    plot_confusion_matrix(cm,
                          classes=class_ids,
                          title='Confusion matrix - Validation set',
                          cmap=plt.cm.Greens)
    return y_test, y_pred_test, cm
示例#8
0
def plot_confusion_matrices(neurons_numbers, xs, ys):
    """
    Cross-validate one Keras classifier per hidden-layer size and show the
    confusion matrix of its out-of-fold predictions.

    :param neurons_numbers: list of neurons numbers in hidden layer to test
    :param xs: input data, NxD np.array
    :param ys: output data, Nx1 np.array
    :return: None
    """
    title_template = ("Confusion matrix tasted on {} samples for "
                      "{} neurons in hidden layer")
    for neurons_no in neurons_numbers:

        def build():
            # Zero-argument factory handed to the Keras wrapper.
            return build_model_n(neurons_no)

        classifier = KerasClassifier(build_fn=build,
                                     epochs=60,
                                     batch_size=5,
                                     verbose=0)
        predicted = cross_val_predict(classifier, xs, ys, cv=5)
        plot_title = title_template.format(len(predicted), neurons_no)
        confusion_matrix.plot_confusion_matrix(y_true=ys.astype(int),
                                               y_pred=predicted.astype(int),
                                               classes=[0, 1, 2, 3],
                                               title=plot_title)
        plt.show()
示例#9
0
def create_confusion_matrix(model_info, y_test, y_predict, name):
    """Plot the non-normalized confusion matrix and save it as a PDF.

    The output path is ``PATH_OUTPUT`` followed by
    ``<name>_<model name>-<label>-<param name>-<param value>_cv<round>-confusion_matrix.pdf``.

    Args:
        model_info: Object providing ``name``, ``label``, ``param_name``,
            ``param_value`` and ``cross_validation_round``.
        y_test: Ground-truth labels; their unique values label the plot.
        y_predict: Predicted labels.
        name: Prefix of the output file name.
    """
    print('Creating confusion matrix...')

    # Distinct labels present in the ground truth, used as tick labels.
    observed_labels = np.unique(y_test)

    matrix = confusion_matrix(y_test, y_predict)
    np.set_printoptions(precision=2)

    plt.figure()
    plot_confusion_matrix(matrix,
                          classes=observed_labels,
                          title='Confusion matrix, without normalization')

    name_parts = [
        PATH_OUTPUT, name, "_", model_info.name, "-", model_info.label, "-",
        model_info.param_name, "-", str(model_info.param_value), "_", "cv",
        str(model_info.cross_validation_round), '-confusion_matrix.pdf'
    ]
    plt.savefig("".join(name_parts), format='pdf', dpi=300)
    plt.close()
示例#10
0
def plot_confusion_matrices(optimizers_with_names, xs, ys):
    """
    Cross-validate one Keras classifier per optimizer and show the confusion
    matrix of its out-of-fold predictions.

    :param optimizers_with_names: list of pairs (optimizer, optimizer_name)
    :param xs: input data, NxD np.array
    :param ys: output data, Nx1 np.array
    :return: None
    """
    title_template = ("Confusion matrix tasted on {} samples for "
                      "{} optimizer")
    for optimizer, name in optimizers_with_names:

        def build():
            # Zero-argument factory handed to the Keras wrapper.
            return build_model_with_optimizer(optimizer)

        classifier = KerasClassifier(build_fn=build,
                                     epochs=30,
                                     batch_size=5,
                                     verbose=0)
        predicted = cross_val_predict(classifier, xs, ys, cv=5)
        plot_title = title_template.format(len(predicted), name)
        confusion_matrix.plot_confusion_matrix(y_true=ys.astype(int),
                                               y_pred=predicted.astype(int),
                                               classes=[0, 1, 2, 3],
                                               title=plot_title)
        plt.show()
示例#11
0
def plot_confusion_matrices(hidden_layers_numbers, xs, ys):
    """
    Cross-validate one Keras classifier per hidden-layer count and show the
    confusion matrix of its out-of-fold predictions.

    :param hidden_layers_numbers: hidden layers numbers list - elem. min.1, max. 3
    :param xs: input data, NxD np.array
    :param ys: output data, Nx1 np.array
    :return: None
    """
    title_template = ("Confusion matrix tasted on {} samples for "
                      "{} hidden layers")
    for hidden_layers_no in hidden_layers_numbers:

        def build():
            # Zero-argument factory handed to the Keras wrapper.
            return build_model_with_hidden_layers_no(hidden_layers_no)

        classifier = KerasClassifier(build_fn=build,
                                     epochs=30,
                                     batch_size=5,
                                     verbose=0)
        predicted = cross_val_predict(classifier, xs, ys, cv=5)
        plot_title = title_template.format(len(predicted), hidden_layers_no)
        confusion_matrix.plot_confusion_matrix(y_true=ys.astype(int),
                                               y_pred=predicted.astype(int),
                                               classes=[0, 1, 2, 3],
                                               title=plot_title)
        plt.show()
示例#12
0
def plot_conf_matrix(y_true,
                     y_pred,
                     filename,
                     binary=False,
                     normalize=True,
                     title=None,
                     cmap='Greys'):
    """Plot a confusion matrix with the front-type class names and save it.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        filename: Path the figure is saved to.
        binary: Use the two-class names ("no front" / "front") instead of the
            five-class names (no front, warm, cold, stationary, occlusion).
        normalize: Forwarded to ``plot_confusion_matrix``.
        title: Forwarded to ``plot_confusion_matrix``.
        cmap: Forwarded to ``plot_confusion_matrix``.
    """
    # Pick the class names first so the plotting call is written only once.
    if binary:
        class_names = ["Нет фронта", "Фронт"]
    else:
        class_names = [
            "Нет фронта", "Тёплый", "Холодный", "Стационарный", "Окклюзии"
        ]

    plot_confusion_matrix(y_true,
                          y_pred,
                          class_names,
                          normalize=normalize,
                          title=title,
                          cmap=cmap)
    plt.savefig(filename)
    plt.close()
示例#13
0
def _svc_candidate_params():
    """Yield SVC keyword arguments for every grid point, in search order."""
    kernels = ['linear', 'poly', 'rbf']
    degrees = [2, 3, 4]
    # Only for non-linears. Higher gammas tend to over-fit
    gammas = ['auto', 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10]
    # Penalty for classifying wrongly. Higher Cs tend to over-fit
    Cs = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10]
    functions = ['ovo', 'ovr']

    for k in kernels:
        for f in functions:
            for C in Cs:
                if k == 'linear':
                    # gamma and degree are not part of the linear grid.
                    yield dict(kernel=k, C=C, decision_function_shape=f)
                    continue
                for g in gammas:
                    if k == 'poly':
                        for d in degrees:
                            yield dict(kernel=k, gamma=g, degree=d, C=C,
                                       decision_function_shape=f)
                    else:
                        yield dict(kernel=k, gamma=g, C=C,
                                   decision_function_shape=f)


def tune_params(X_train, y_train, X_val, y_val, verbose):
    """Grid-search SVC hyper-parameters by micro-averaged F1 on a validation set.

    Every candidate from the grid (kernel, decision function shape, C and -
    for non-linear kernels - gamma, plus degree for ``'poly'``) is fitted on
    the training data and scored on the validation data.

    Args:
        X_train: Training samples.
        y_train: Training labels.
        X_val: Validation samples.
        y_val: Validation labels.
        verbose: When true, print each candidate's parameters and F1 score and
            plot its confusion matrix with the class names '0', '1', '2'.

    Returns:
        The ``get_params()`` dict of the best-scoring classifier, or ``None``
        if no candidate was accepted.
    """
    best_f1 = 0
    best_params = None
    class_names = np.array(('0', '1', '2'))

    for params in _svc_candidate_params():
        clf = svm.SVC(**params)
        clf.fit(X_train, y_train)
        y_predicted = clf.predict(X_val)
        f1 = f1_score(y_val, y_predicted, average='micro')

        # Tie handling is kept exactly as before so the selected parameters do
        # not change: linear candidates replace the incumbent on equal score,
        # all other kernels need a strict improvement.
        if params['kernel'] == 'linear':
            improved = f1 >= best_f1
        else:
            improved = f1 > best_f1
        if improved:
            best_f1 = f1
            best_params = clf.get_params()

        if verbose:
            # Call get_params(): printing the bare attribute only shows the
            # bound-method repr, not the parameter values.
            print(clf.get_params())
            print(f1)
            plot_confusion_matrix(y_val, y_predicted, class_names)
    return best_params
示例#14
0
def print_predictions_stats(predicted, actual):
    """Plot the confusion matrix of the predictions and compute their accuracy.

    Despite its name the function prints nothing itself; it delegates to
    ``cm.plot_confusion_matrix`` and ``calculate_accuracy``.

    Args:
        predicted: Predicted labels.
        actual: Ground-truth labels, aligned with ``predicted``.

    Returns:
        Tuple ``(plot_result, accuracy)`` holding the return values of
        ``cm.plot_confusion_matrix(actual, predicted, labels)`` and
        ``calculate_accuracy(predicted, actual)``.
    """
    # Sorted unique labels occurring in either sequence. The per-label counts
    # and the correct-prediction tally computed previously were never used.
    labels = numpy.union1d(actual, predicted)

    return (cm.plot_confusion_matrix(actual, predicted, labels),
            calculate_accuracy(predicted, actual))
示例#15
0
# Script excerpt: scale the test features, fit a linear SVM, plot its confusion
# matrix and set up a 5-fold split for a final accuracy estimate.

# Apply the existing scaler to the continuous test features.
# NOTE(review): presumably `scaler` was fitted on the training data earlier in
# the script - confirm, that part is not visible here.
X_test_cont = scaler.transform(X_test_cont)

# Write the scaled columns back into the data frames. Column `l` of the scaled
# arrays belongs to feature name `f`. pandas' chained-assignment check is
# disabled only inside this block.
with pd.option_context('mode.chained_assignment', None):
    for l, f in enumerate(features_to_extract):
        X_train.loc[:, f] = X_train_cont[:, l]
        X_test.loc[:, f] = X_test_cont[:, l]

# Linear SVM whose C is the reciprocal of the first entry of best_penalties.
model = SVC(C=1 / float(best_penalties[0]), kernel="linear", gamma="scale")
model.fit(X_train, Z_train)

Z_pred = model.predict(X_test)

plot_confusion_matrix(Z_test,
                      Z_pred,
                      normalize=True,
                      ndecimals=3,
                      title="Support Vector Machine Confusion Matrix",
                      savename="CM_SVM")

# Compute the final accuracy estimate with shuffled N-fold cross-validation.
N = 5
kfold = KFold(n_splits=N, shuffle=True)

# One accuracy slot per fold.
accuracy_kfold = np.zeros(N)
model = SVC(C=1 / float(best_penalties[0]), kernel="linear", gamma="scale")
# NOTE(review): only the per-fold train/test selection is visible in this
# excerpt; the fitting/scoring that would fill accuracy_kfold is not shown.
for k, (train_index, test_index) in enumerate(kfold.split(data, death)):
    x_train = data.iloc[train_index]
    y_train = np.ravel(death.iloc[train_index])
    x_test = data.iloc[test_index]
    y_test = np.ravel(death.iloc[test_index])
示例#16
0
def test_RF(fn):
    """
    Train and test a Random Forest model. A confusion matrix is plotted and
    saved as an SVG, and a performance report is written to file.

    The data is read from ``input/<fn>_{train,test}_{X,y}.csv`` (``;`` as
    separator). The figure goes to ``figures/RF_confusion_matrix_<unix time>.svg``
    and the report to ``reports/RF_<fn>_report.txt``.

    Arguments:
        - fn        :       Name of the input file.
    """
    #Load datasets
    X_train_df = pd.read_csv("input/{}_train_X.csv".format(fn), sep=";")
    y_train_df = pd.read_csv("input/{}_train_y.csv".format(fn), sep=";")
    X_test_df = pd.read_csv("input/{}_test_X.csv".format(fn), sep=";")
    y_test_df = pd.read_csv("input/{}_test_y.csv".format(fn), sep=";")

    #Convert to numpy arrays
    X_train = X_train_df.values.astype(float)
    y_train = y_train_df.values.ravel()
    X_test = X_test_df.values.astype(float)
    y_test = y_test_df.values.ravel()

    #Scale X values. The scaler is fitted on the training data only and the
    #same transformation is applied to the test data; refitting on the test
    #set would put the two splits on different scales.
    scaler = RobustScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    #Transform non-numerical labels into numericals. Only the training labels
    #define the mapping, so decoding the predictions below uses the same
    #mapping the forest was trained with.
    encoder = LabelEncoder()
    encoded_y_train = encoder.fit_transform(y_train)

    #Fitting Random Forest Classifier to the Training set
    rf = RandomForestClassifier(n_estimators=10,
                                criterion="entropy",
                                random_state=7)
    start = time.time()
    rf.fit(X_train, encoded_y_train)
    end = time.time()

    #Predicted values, decoded back to the original labels
    y_pred = encoder.inverse_transform(rf.predict(X_test))

    #Compute every metric once and reuse it for console and report
    cm = confusion_matrix(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)

    print(cm)
    print("\n")
    print(report)
    print("Scores for final, best model:\n")
    print("Acc: {}".format(accuracy))

    #Labels in the sorted order of the confusion matrix rows/columns: every
    #label occurring in the truth or in the predictions
    labels = sorted(set(y_test).union(y_pred))

    #Plot confusion matrix
    plot_confusion_matrix(cm, labels, False)

    #Save the plot
    plt.savefig("figures/RF_confusion_matrix_{}.svg".format(int(time.time())))

    #Write a .txt report file (closed automatically by the with-block)
    with open("reports/RF_{}_report.txt".format(fn), "w") as f:
        f.write("REPORT FOR \"{}\"\n\n".format(fn))

        f.write("\n\n\nClassification Report:\n")
        f.write(report)

        f.write("\nConfusion Matrix:\n\n")
        f.write(np.array2string(cm, separator=', '))

        f.write("\n\nTime used to train the model: {} seconds".format(end -
                                                                      start))

        f.write("\n\nScores for final, best model:\n")
        f.write("Accuracy: {}".format(accuracy))
示例#17
0
def main(args):
    """Train and/or evaluate a two-branch (image + diff) classification model.

    Builds the data loaders and two identically configured branches, optionally
    restores both from ``args.resume``, and then either

    * evaluates on the test loader and plots a confusion matrix
      (``args.evaluate`` true), or
    * trains for ``args.epochs`` epochs, checkpointing after every epoch from
      ``args.start_save`` on, and finally tests the best checkpoint.

    Attributes read from ``args``: seed, evaluate, logs_dir, height, width,
    arch, dataset, split, data_dir, batch_size, workers, combine_trainval,
    cut_layer, features, resume, lr, momentum, weight_decay, step_size,
    epochs, start_save. ``args.height`` / ``args.width`` are overwritten with
    defaults when either is ``None``.

    Both branches and the criterion are moved to the GPU with ``.cuda()``.
    """
    # Seed numpy and torch for reproducibility.
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    # cudnn.benchmark = True

    # Redirect print to both console and log file
    if not args.evaluate:
        sys.stdout = Logger(osp.join(args.logs_dir, 'log.txt'))

    # Create data loaders
    # Default input size depends on the architecture when not given explicitly.
    if args.height is None or args.width is None:
        args.height, args.width = (144, 56) if args.arch == 'inception' else \
                                  (240, 240)
    dataset, num_classes, train_loader, val_loader, test_loader = \
        get_data(args.dataset, args.split, args.data_dir, args.height,
                 args.width, args.batch_size, args.workers, args.combine_trainval)

    # Create model
    # Two branches with identical configuration.

    img_branch = models.create(args.arch,
                               cut_layer=args.cut_layer,
                               num_classes=num_classes,
                               num_features=args.features)
    diff_branch = models.create(args.arch,
                                cut_layer=args.cut_layer,
                                num_classes=num_classes,
                                num_features=args.features)

    # Load from checkpoint
    # Restores both branches plus the epoch counter and best top-1 so far.
    start_epoch = best_top1 = 0
    if args.resume:
        checkpoint = load_checkpoint(args.resume)
        img_branch.load_state_dict(checkpoint['state_dict_img'])
        diff_branch.load_state_dict(checkpoint['state_dict_diff'])
        start_epoch = checkpoint['epoch']
        best_top1 = checkpoint['best_top1']
        print("=> Start epoch {}  best top1 {:.1%}".format(
            start_epoch, best_top1))

    # Wrap in DataParallel and move to GPU; from here on the underlying
    # modules are reached through `.module`.
    img_branch = nn.DataParallel(img_branch).cuda()
    diff_branch = nn.DataParallel(diff_branch).cuda()
    # img_branch = nn.DataParallel(img_branch)
    # diff_branch = nn.DataParallel(diff_branch)

    # Criterion
    criterion = nn.CrossEntropyLoss().cuda()
    # criterion = nn.CrossEntropyLoss()

    # Evaluator
    evaluator = Evaluator(img_branch, diff_branch, criterion)
    if args.evaluate:
        # Evaluation-only mode: test, plot the confusion matrix and stop.
        # print("Validation:")
        # top1, _ = evaluator.evaluate(val_loader)
        # print("Validation acc: {:.1%}".format(top1))
        print("Test:")
        top1, (gt, pred) = evaluator.evaluate(test_loader)
        print("Test acc: {:.1%}".format(top1))
        from confusion_matrix import plot_confusion_matrix
        plot_confusion_matrix(gt, pred, dataset.classes, args.logs_dir)
        return

    # Per-group learning-rate multipliers: the low/high level modules use 0.1x
    # the base learning rate, the classifier uses 1x (applied in adjust_lr).
    img_param_groups = [
        {
            'params': img_branch.module.low_level_modules.parameters(),
            'lr_mult': 0.1
        },
        {
            'params': img_branch.module.high_level_modules.parameters(),
            'lr_mult': 0.1
        },
        {
            'params': img_branch.module.classifier.parameters(),
            'lr_mult': 1
        },
    ]

    # Same grouping for the diff branch.
    diff_param_groups = [
        {
            'params': diff_branch.module.low_level_modules.parameters(),
            'lr_mult': 0.1
        },
        {
            'params': diff_branch.module.high_level_modules.parameters(),
            'lr_mult': 0.1
        },
        {
            'params': diff_branch.module.classifier.parameters(),
            'lr_mult': 1
        },
    ]

    # One SGD optimizer (Nesterov momentum) per branch.
    img_optimizer = torch.optim.SGD(img_param_groups,
                                    lr=args.lr,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay,
                                    nesterov=True)
    diff_optimizer = torch.optim.SGD(diff_param_groups,
                                     lr=args.lr,
                                     momentum=args.momentum,
                                     weight_decay=args.weight_decay,
                                     nesterov=True)

    # Trainer
    trainer = Trainer(img_branch, diff_branch, criterion)

    # Schedule learning rate
    def adjust_lr(epoch):
        # Step decay: base lr is multiplied by 0.1 every `step_size` epochs,
        # then scaled by each parameter group's 'lr_mult'.
        step_size = args.step_size
        lr = args.lr * (0.1**(epoch // step_size))
        for g in img_optimizer.param_groups:
            g['lr'] = lr * g.get('lr_mult', 1)
        for g in diff_optimizer.param_groups:
            g['lr'] = lr * g.get('lr_mult', 1)

    # Start training
    for epoch in range(start_epoch, args.epochs):
        adjust_lr(epoch)
        trainer.train(epoch, train_loader, img_optimizer, diff_optimizer)
        # Skip validation and checkpointing until args.start_save is reached.
        if epoch < args.start_save:
            continue
        top1, _ = evaluator.evaluate(val_loader)

        is_best = top1 > best_top1
        best_top1 = max(top1, best_top1)
        # Save the unwrapped (`.module`) weights of both branches.
        save_checkpoint(
            {
                'state_dict_img': img_branch.module.state_dict(),
                'state_dict_diff': diff_branch.module.state_dict(),
                'epoch': epoch + 1,
                'best_top1': best_top1,
            },
            is_best,
            fpath=osp.join(args.logs_dir, 'checkpoint.pth.tar'))

        print('\n * Finished epoch {:3d}  top1: {:5.1%}  best: {:5.1%}{}\n'.
              format(epoch, top1, best_top1, ' *' if is_best else ''))

    # Final test
    # NOTE(review): 'model_best.pth.tar' is presumably written by
    # save_checkpoint when is_best is true - confirm; it will not exist if no
    # epoch reached args.start_save.
    print('Test with best model:')
    checkpoint = load_checkpoint(osp.join(args.logs_dir, 'model_best.pth.tar'))
    img_branch.module.load_state_dict(checkpoint['state_dict_img'])
    diff_branch.module.load_state_dict(checkpoint['state_dict_diff'])
    top1, (gt, pred) = evaluator.evaluate(test_loader)
    from confusion_matrix import plot_confusion_matrix
    plot_confusion_matrix(gt, pred, dataset.classes, args.logs_dir)
    print('\n * Test Accuarcy: {:5.1%}\n'.format(top1))
示例#18
0
def linear(X, Y, title, filename):
    """Select the best linear-SVC penalty by leave-one-out CV and report on it.

    For every penalty in a fixed grid, a linear SVC is evaluated with
    leave-one-out cross-validation and its accuracy, sensitivity and
    specificity are recorded (labels ``1`` = positive, ``0`` = negative; other
    label combinations are ignored in these tallies). The penalty with the
    highest accuracy (the smallest such penalty on ties) is reported: its
    metrics, Matthews correlation coefficient, classification report and
    confusion matrix are appended to ``filename``, and its row-normalized
    confusion matrix is plotted.

    Args:
        X: Samples; must support indexing with index arrays (``X[train_index]``).
        Y: Labels aligned with ``X``, indexable the same way.
        title: Passed to ``plot_confusion_matrix`` as its second argument.
        filename: Path of the text report; opened in append mode.
    """
    C = [1, 2, 5, 10, 15, 20, 25, 30, 50, 100, 200, 500, 1000, 2000, 5000, 10000]
    nan = float('nan')

    loo = LeaveOneOut(len(X))
    avg_Accuracy = dict()
    sensitivity = dict()
    specificity = dict()
    # penalty -> (predictions, expected), so the final report can use the
    # results of the best penalty instead of whichever penalty ran last.
    cv_results = dict()
    for c in C:
        predictions = []
        expected = []
        for train_index, test_index in loo:
            clf = SVC(C=c, kernel='linear')
            clf.fit(X[train_index], Y[train_index])
            predictions.append(clf.predict(X[test_index])[0])
            expected.append(Y[test_index][0])

        TP, FN, TN, FP = 0, 0, 0, 0
        for predicted, actual in zip(predictions, expected):
            if predicted == 1 and actual == 1:
                TP += 1
            elif predicted == 0 and actual == 1:
                FN += 1
            elif predicted == 0 and actual == 0:
                TN += 1
            elif predicted == 1 and actual == 0:
                FP += 1

        # Guard the ratios against data without positives or negatives.
        positives = TP + FN
        negatives = TN + FP
        total = positives + negatives
        sensitivity[c] = TP / float(positives) if positives else nan
        specificity[c] = TN / float(negatives) if negatives else nan
        avg_Accuracy[c] = (TP + TN) / float(total) if total else nan
        cv_results[c] = (predictions, expected)

    # Iterate the ordered grid (not the dict) so ties resolve deterministically.
    bestC = max(C, key=avg_Accuracy.get)
    # Look up sensitivity and specificity for the penalty with best accuracy.
    bestSensitivity = sensitivity[bestC]
    bestSpecificity = specificity[bestC]
    bestAccuracy = avg_Accuracy[bestC]
    predictions, expected = cv_results[bestC]

    # scikit-learn metrics take (y_true, y_pred).
    cm = confusion_matrix(expected, predictions)

    with open(filename, 'a') as f:
        f.write("All Accuracy Values @ Each Penalty: {} \n".format(avg_Accuracy))
        f.write("Most Accurate Penalty Value: {}\n".format(bestC))
        f.write("Accuracy of Prediction: {} @ Penalty: {}\n".format(bestAccuracy, bestC))
        f.write("Sensitivity of Prediction: {} @ Penalty: {}\n".format(bestSensitivity, bestC))
        f.write("Specificity of Prediction: {} @ Penalty: {}\n".format(bestSpecificity, bestC))
        f.write("Matthews Correlation Coeefficient Value: {}\n".format(matthews_corrcoef(expected, predictions)))
        f.write("Classification Report: \n")
        f.write(classification_report(expected, predictions))
        f.write("Confusion Matrix\n")
        f.write(str(cm))

    # Normalize each row (true class) to fractions before plotting; the report
    # file is already closed at this point.
    cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    label1 = "Negative"
    label2 = "Positive"

    plt.figure()
    plot_confusion_matrix(cm, title, label1, label2)
示例#19
0
            frame = np.asarray(frame)
            em_pred = emotion_recog(frame)
            y_pred.append(em_pred)

        # go down one directory
        os.chdir(original)
    
    # checking list lengths 
    print(len(y_pred))
    print(len(y_true))

    labels = ["Angry", "Disgusted", "Fearful", "Happy", "Neutral", "Sad", "Surprised"]
    
    # Return to the main environment to save the plot
    os.chdir(w_env + "/output")
    cm.plot_confusion_matrix(y_true, y_pred, labels, "Confusion Matrix (No Mask)")

elif mode == "maskMatrix":
    y_pred = []
    y_true = []

    # Load the trained model
    model.load_weights('model.h5')

    # Load the cascade classifier
    facecasc = cv2.CascadeClassifier('haarcascade_frontalface_default.xml')
    w_env = os.getcwd()
    
    # changing to the test directory
    os.chdir("./data_masks")
示例#20
0
            true_classes.append(correct_class)
            predicted_classes.append(prediction)
    else:
        L2_distances = np.square(train_histograms[:, None] -
                                 val_histograms).sum(axis=2).T
        index = 0
        mAP = 0
        for val_img in VAL_IMAGE_PATHS:
            distances = L2_distances[index]
            idx = np.argpartition(distances, knn)
            correct_class = val_img.split("/")[-2]
            prediction = predict(TRAIN_IMAGE_PATHS, idx[:knn])
            if prediction == correct_class:
                mAP += 1
            index += 1
            true_classes.append(correct_class)
            predicted_classes.append(prediction)

    print("mAP:", mAP / len(VAL_IMAGE_PATHS), "\n")

    # Confusion Matrix
    confusion_matrix_title = "Accuraccy=" + str(
        mAP / len(VAL_IMAGE_PATHS)) + " (k-Means:" + str(
            k_means) + " StepSize:" + str(step_size) + " k-NN:" + str(
                knn) + ")"
    cm.plot_confusion_matrix(true_classes,
                             predicted_classes,
                             CLASS_NAMES,
                             title=confusion_matrix_title)
    plt.show()
示例#21
0
# Script excerpt: visualize the confusion matrix of a random forest on a single
# random train/test split, then set up a 5-fold split for a final accuracy
# estimate.

# Split the data set (65 % train / 35 % test); the target is flattened to 1-D.
test_size = 0.35
train_size = 1 - test_size
X_train, X_test, Z_train, Z_test = train_test_split(data,
                                                    np.ravel(death),
                                                    train_size=train_size,
                                                    test_size=test_size)

# NOTE(review): md_best / mf_best are presumably the max_depth / max_features
# values selected earlier in the script - confirm, that part is not visible.
model = RFC(n_estimators=50, max_depth=md_best, max_features=mf_best)
model.fit(X_train, Z_train)
Z_pred = model.predict(X_test)

plot_confusion_matrix(Z_test,
                      Z_pred,
                      normalize=True,
                      ndecimals=3,
                      title="Random Forest Confusion Matrix",
                      savename="CM_RF")

# Compute the final accuracy estimate with shuffled N-fold cross-validation.
N = 5
kfold = KFold(n_splits=N, shuffle=True)

# One accuracy slot per fold.
accuracy_kfold = np.zeros(N)
model = RFC(n_estimators=50, max_depth=md_best, max_features=mf_best)
# NOTE(review): only the per-fold train/test selection is visible in this
# excerpt; the fitting/scoring that would fill accuracy_kfold is not shown.
for k, (train_index, test_index) in enumerate(kfold.split(data, death)):
    x_train = data.iloc[train_index]
    y_train = np.ravel(death.iloc[train_index])
    x_test = data.iloc[test_index]
    y_test = np.ravel(death.iloc[test_index])
示例#22
0
def evaluate(modelPath):
    """Evaluate a saved FCN segmentation model on the 'test' split.

    Loads the weights stored at ``modelPath`` into an attention FCN built on
    the project's VGG backbone, predicts a per-pixel class map for every
    sample of the test dataset, accumulates a pixel-level confusion matrix
    over the whole split, plots it normalised, and prints accuracy,
    macro-averaged precision, macro-averaged recall and the F1 score derived
    from those two averages (in that order).

    Relies on module-level names defined outside this function:
    ``data_path``, ``in_channels_Nmuber``, ``n_class``, ``data_loader``,
    ``models``, ``metrics``, ``Variable`` and ``plot_confusion_matrix``.

    Args:
        modelPath: Path of the state-dict file passed to ``torch.load``.

    Returns:
        None.  Results are printed and plotted.
    """
    labels = ["BG", "figure", "table", "text"]
    use_cuda = torch.cuda.is_available()
    val_data = data_loader.ClassSeg(root=data_path, split='test', transform=True, filePath='DSSE',chanelCat=in_channels_Nmuber)
    val_loader = torch.utils.data.DataLoader(val_data,batch_size=1,shuffle=False,num_workers=5)
    print('load model .....')

    print("Using FCNs")
    vgg_model = models.VGGNet(model='vgg_self', pretrained=False, in_channels=in_channels_Nmuber)
    fcn_model = models.FCNs(pretrained_net=vgg_model, n_class=n_class, Attention=True)

    fcn_model.load_state_dict(torch.load(modelPath))

    if use_cuda:
        fcn_model.cuda()
    fcn_model.eval()

    # Fixed label set so every per-image matrix is n_class x n_class and cell
    # (i, j) always means the same class pair, even when an image lacks some
    # classes.  Without it sklearn sizes the matrix by the labels it happens
    # to see, which misaligns (or overruns) the accumulation below.
    class_ids = list(range(n_class))
    matrixs = np.zeros((n_class, n_class))

    # The loader only drives the iteration count; the sample itself is read
    # from the dataset by index, exactly as before.
    for idx, _ in enumerate(val_loader):
        img, label, _image_path = val_data[idx]
        img = img.unsqueeze(0)
        if use_cuda:
            img = img.cuda()
        img = Variable(img)

        out = fcn_model(img)

        # Arg-max over the class dimension gives the predicted class per pixel.
        pred = out.data.max(1)[1].squeeze_(1).squeeze_(0)
        if use_cuda:
            pred = pred.cpu()

        # Optional post-processing of the prediction (row/column mode
        # smoothing, Sobel-edge masking) used to live here but was disabled.
        data = pred.numpy()

        if idx % 2 == 0:
            print('evaluate [%d/%d]' % (idx, len(val_loader)))

        matrix = metrics.confusion_matrix(label.numpy().flatten(),
                                          data.flatten(),
                                          labels=class_ids)

        # Accumulate raw pixel counts.  The previous "(old + new) / 2" update
        # weighted the last image at 50% and made the metrics depend on the
        # sample order; a plain sum gives the true dataset-level counts (the
        # ratios below are scale-invariant, so no averaging is needed).
        matrixs += matrix

    plot_confusion_matrix(matrixs, classes=labels, normalize=True, title='Normalized confusion matrix',cmap=plt.cm.Blues,yMax=3.5)

    numberTotal = matrixs.sum()
    muberTrue = np.trace(matrixs)
    diagonal = np.diag(matrixs)
    predicted_per_class = matrixs.sum(axis=0)   # column sums -> precision
    actual_per_class = matrixs.sum(axis=1)      # row sums -> recall

    # A class that is never predicted / never present scores 0 instead of
    # producing a 0/0 NaN that would poison the macro averages.
    PercisionList = [hit / total if total else 0.0
                     for hit, total in zip(diagonal, predicted_per_class)]
    RecallList = [hit / total if total else 0.0
                  for hit, total in zip(diagonal, actual_per_class)]

    Acurracy = muberTrue / numberTotal if numberTotal else 0.0
    Percision = sum(PercisionList) / len(PercisionList) if PercisionList else 0.0
    Recall = sum(RecallList) / len(RecallList) if RecallList else 0.0
    if Percision + Recall:
        F1 = (2 * Percision * Recall) / (Percision + Recall)
    else:
        F1 = 0.0

    print(Acurracy)
    print(Percision)
    print(Recall)
    print(F1)
示例#23
0
def main(args):
    """Train or apply a k-NN classifier on top of facenet embeddings.

    Loads the image dataset under ``args.data_dir`` (optionally split into
    train/test parts), computes an embedding for every image with the facenet
    model ``args.model``, and then either

    * ``args.mode == 'TRAIN'``: fits a ``KNeighborsClassifier`` on the
      embeddings and pickles ``(model, class_names)`` to
      ``args.classifier_filename``; or
    * ``args.mode == 'CLASSIFY'``: loads that pickle, prints the most likely
      class and its probability for every image, prints the overall accuracy
      and saves a confusion-matrix figure to ``full_figure.png``.

    Args:
        args: Parsed command-line namespace.  Attributes read here: ``seed``,
            ``use_split_dataset``, ``data_dir``,
            ``min_nrof_images_per_class``, ``nrof_train_images_per_class``,
            ``mode``, ``model``, ``batch_size``, ``image_size`` and
            ``classifier_filename``.

    Raises:
        ValueError: If a class has no images, or if ``use_split_dataset`` is
            set with a mode other than 'TRAIN' / 'CLASSIFY'.
    """
    with tf.Graph().as_default():

        with tf.Session() as sess:

            np.random.seed(seed=args.seed)

            if args.use_split_dataset:
                dataset_tmp = facenet.get_dataset(args.data_dir)
                train_set, test_set = split_dataset(dataset_tmp, args.min_nrof_images_per_class, args.nrof_train_images_per_class)
                if args.mode == 'TRAIN':
                    dataset = train_set
                elif args.mode == 'CLASSIFY':
                    dataset = test_set
                else:
                    # Previously fell through and failed later with a
                    # NameError on the unbound `dataset`.
                    raise ValueError(
                        "mode must be 'TRAIN' or 'CLASSIFY' when "
                        "use_split_dataset is set, got %r" % (args.mode,))
            else:
                dataset = facenet.get_dataset(args.data_dir)

            # Check that there is at least one image per class.  The old
            # `assert(cond, msg)` asserted a non-empty tuple, which is always
            # true, so the check never fired.
            for cls in dataset:
                if len(cls.image_paths) == 0:
                    raise ValueError('There must be at least one image for each class in the dataset')

            paths, labels = facenet.get_image_paths_and_labels(dataset)

            print('Number of classes: %d' % len(dataset))
            print('Number of images: %d' % len(paths))

            # Load the model
            print('Loading feature extraction model')
            facenet.load_model(args.model)

            # Get input and output tensors
            images_placeholder = tf.get_default_graph().get_tensor_by_name("input:0")
            embeddings = tf.get_default_graph().get_tensor_by_name("embeddings:0")
            phase_train_placeholder = tf.get_default_graph().get_tensor_by_name("phase_train:0")
            embedding_size = embeddings.get_shape()[1]

            # Run forward pass to calculate embeddings, one batch at a time;
            # the last batch may be shorter than batch_size.
            print('Calculating features for images')
            nrof_images = len(paths)
            nrof_batches_per_epoch = int(math.ceil(1.0*nrof_images / args.batch_size))
            emb_array = np.zeros((nrof_images, embedding_size))
            for i in range(nrof_batches_per_epoch):
                start_index = i*args.batch_size
                end_index = min((i+1)*args.batch_size, nrof_images)
                paths_batch = paths[start_index:end_index]
                images = facenet.load_data(paths_batch, False, False, args.image_size)
                feed_dict = { images_placeholder:images, phase_train_placeholder:False }
                emb_array[start_index:end_index,:] = sess.run(embeddings, feed_dict=feed_dict)

            classifier_filename_exp = os.path.expanduser(args.classifier_filename)

            if args.mode == 'TRAIN':
                # Train classifier
                print('Training classifier')
                model = KNeighborsClassifier()
                model.fit(emb_array, labels)

                # Create a list of class names
                class_names = [cls.name.replace('_', ' ') for cls in dataset]

                # Saving classifier model
                with open(classifier_filename_exp, 'wb') as outfile:
                    pickle.dump((model, class_names), outfile)
                print('Saved classifier model to file "%s"' % classifier_filename_exp)

            elif args.mode == 'CLASSIFY':
                # Classify images
                print('Testing classifier')
                # SECURITY NOTE: pickle.load executes arbitrary code from the
                # file; only point classifier_filename at files you created.
                with open(classifier_filename_exp, 'rb') as infile:
                    (model, class_names) = pickle.load(infile)

                print('Loaded classifier model from file "%s"' % classifier_filename_exp)

                predictions = model.predict_proba(emb_array)
                best_class_indices = np.argmax(predictions, axis=1)
                best_class_probabilities = predictions[np.arange(len(best_class_indices)), best_class_indices]

                for i in range(len(best_class_indices)):
                    print('%4d  %s: %.3f' % (i, class_names[best_class_indices[i]], best_class_probabilities[i]))

                print(class_names)
                accuracy = np.mean(np.equal(best_class_indices, labels))
                print('Accuracy: %.3f' % accuracy)
                # NOTE(review): predictions are passed first and true labels
                # second; confirm this matches the helper's (y_true, y_pred)
                # parameter order, otherwise the matrix is transposed.
                fig, ax = plot_confusion_matrix(best_class_indices, labels, classes=np.array(class_names),
                      title='Confusion matrix, without normalization')
                fig.savefig('full_figure.png')
def _highlight_cells(worksheet, entries, cell_format, measure, results_path,
                     image_num, cmap_name, color):
    """Re-draw winning result cells in a highlight colour.

    Each entry is indexed as ``[row, column, payload]``.  A plain payload is
    re-written with ``cell_format``; a dict payload (``"accuracy"`` /
    ``"matrix"``) with a non-zero accuracy is re-plotted as a coloured
    confusion-matrix image and inserted over the cell.

    Returns the updated image counter.
    """
    for entry in entries:
        cell_row, cell_col, payload = entry[0], entry[1], entry[2]
        if not isinstance(payload, dict):
            worksheet.write(cell_row, cell_col, payload, cell_format)
            continue
        if payload["accuracy"] == 0:
            # Untouched placeholder: nothing was recorded for this slot.
            continue
        image_num += 1
        title = measure + str(image_num)
        plot_confusion_matrix(
            payload["matrix"],
            results_path,
            title=title,
            accuracy=payload["accuracy"],
            cmap=getattr(plt.cm, cmap_name),
            color=color,
        )
        # NOTE(review): kept as a literal "\\" join so it matches wherever the
        # plotting helper writes the image -- confirm and switch both to
        # os.path.join together.
        worksheet.insert_image(cell_row, cell_col,
                               results_path + "\\" + title + ".jpg")
    return image_num


def new_write_file_content(pickle_file_path, measure, results_path):
    """Write the classification results of one measure to an Excel workbook.

    Reads the results stored in ``pickle_file_path`` and creates
    ``"<MEASURE> for <dataset>.xlsx"`` inside ``results_path`` containing a
    legend block (rows 1-37) and a results table starting at row 40, one
    row per entry of the pickle content (iterated in sorted key order).
    The best score of every learning method is highlighted in blue and the
    overall best in red.  For image measures (``"confusion_matrix"``,
    ``"roc_curve"``, ``"precision_recall_curve"``,
    ``"accuracy_&_confusion_matrix"``) a plot is rendered per result and
    embedded in the sheet.  All ``.jpg`` files in ``results_path`` are
    deleted once the workbook is closed.

    Each pickle entry is read through the keys ``"featurs"``,
    ``"stylistic_features"``, ``"num_of_features"``, ``"normalization"``,
    ``"selection"`` (optional), ``"k_folds"``, ``"iterations"``,
    ``"results"`` and ``"baseline_path"``.

    Args:
        pickle_file_path: Path handed to ``open_pickle_content``.
        measure: Name of the measure being reported; also used in the file
            name, the sheet title and the image file names.
        results_path: Directory for the workbook and the temporary images.
    """
    # Setup the path and the name of the file
    dataset_name = corpus_name()
    pickle_file_content = open_pickle_content(pickle_file_path)
    file_path = os.path.join(
        results_path,
        measure.upper().replace("_", " ") + " for " + dataset_name + ".xlsx")

    # Create a new Excel file and add a worksheet.
    workbook = xlsxwriter.Workbook(file_path)
    worksheet = workbook.add_worksheet()

    # Formats shared by the header and legend.
    extra_big = workbook.add_format({
        "bold": True,
        "font_size": 17,
        "underline": True
    })
    big = workbook.add_format({"bold": True, "font_size": 12})
    bold_gray = workbook.add_format({"bold": True, "font_color": "gray"})
    gray = workbook.add_format({"font_color": "gray"})

    # Titles
    worksheet.write("A2", dataset_name, big)
    worksheet.write(
        "A3",
        "The classification results are shown in the table below in percentages"
    )

    # General data
    worksheet.write("A5",
                    "General information about the classification software",
                    bold_gray)
    now = datetime.now()
    worksheet.write("A6", "Issue date: " + now.strftime("%d/%m/%Y %H:%M:%S"),
                    gray)
    version = ".".join(str(part) for part in sys.version_info[:3])
    worksheet.write("A7", "Python version: Python " + version, gray)
    worksheet.write(
        "A8",
        "Python classification libraries: keras, sklearn, tensorflow, VADAR from nltk, WordCloud",
        gray,
    )

    # Legend: (cell, text, format), written in this order.  Only the
    # pre-processing codes currently in use (A, L) are listed.
    legend = [
        # Pre-processing
        ("A10", "Pre Processing", bold_gray),
        ("A11", "A - Acronyms", gray),
        ("A12", "L - Lowercase", gray),
        # Learning methods
        ("H10", "Learning methods", bold_gray),
        ("H11", "svc  - Linear SVC", gray),
        ("H12", "rf      - Random Forest", gray),
        ("H13", "mlp  - Multilayer Perceptron", gray),
        ("H14", "lr      - Logistic Regression", gray),
        ("H15", "mnb - Multinomial Naive Bayes", gray),
        ("H16", "rnn   - Recurrent Neural Network", gray),
        # Stop words
        ("D10", "Stop Words Options", bold_gray),
        ("D11", "E - English stop words", gray),
        ("D12", "H - Hebrew stop words", gray),
        ("D13", "X - Extended Hebrew stop words", gray),
        # Statistical significance
        ("L10", "Statistical Significance Options", bold_gray),
        ("L11", "V - Significantly larger than the baseline", gray),
        ("L12", "* - Significantly smaller than the baseline", gray),
        # Stylistic features
        ("F19", "Stylistic Features Options", bold_gray),
        ("F20", "CC - chars count", gray),
        ("F21", "WC - words count", gray),
        ("F22", "SC - sentence count", gray),
        ("F23", "EMC - exclamation mark (!) count", gray),
        ("F24", "QSMC - question mark (?) count", gray),
        ("F25", "SCC - special characters (@, #, $, &, *, %, ^) count", gray),
        ("F26", "QTMC - quotation mark (\", ') count", gray),
        ("F27", "ALW - average letters in words", gray),
        ("F28", "ALS - average letters in sentence", gray),
        ("F29", "AWS - average words in sentence", gray),
        ("F30", "AWL - average words length", gray),
        ("F31", "IE - increasing expressions", gray),
        ("F32", "DE - doubt expressions", gray),
        ("F33", "NW - negative terms", gray),
        ("F34", "PW - positive terms", gray),
        ("F35", "TE - time expressions", gray),
        ("F36", "EE - emotion expressions", gray),
        ("I20", "FPE - first person expressions", gray),
        ("I21", "SPE - second person expressions", gray),
        ("I22", "TPE - third person expressions", gray),
        ("I23", "INE - inclusion expressions", gray),
        ("I24", "P1 - expressions form power 1", gray),
        ("I25", "P2 - expressions form power 2", gray),
        ("I26", "P3 - expressions form power 3", gray),
        ("I27", "PM1 - expressions form power -1", gray),
        ("I28", "PM2 - expressions form power -2", gray),
        ("I29", "PM3 - expressions form power -3", gray),
        ("I30", "PM4 - expressions form power -4", gray),
        ("I31", "AP - expressions form all the powers", gray),
        # TOPA1 and TOPA2 were both written to I32, so TOPA1 was lost; the
        # six TOP* lines now occupy I32-I37 (row 37 is otherwise unused).
        ("I32",
         "TOPA1 - Enable all features on the 1'st trimester of the text",
         gray),
        ("I33",
         "TOPA2 - Enable all features on the 2'nd trimester of the text",
         gray),
        ("I34",
         "TOPA3 - Enable all features on the 3'rd trimester of the text",
         gray),
        ("I35",
         "TOPB1 - Enable all features on the first ten words of the text",
         gray),
        ("I36",
         "TOPB2 - Enable all features on the text without 10 first and last 10 words",
         gray),
        ("I37",
         "TOPB3 - Enable all features on the last ten words of the text",
         gray),
    ]
    for cell, text, cell_style in legend:
        worksheet.write(cell, text, cell_style)

    # Result table set-up.  `row` is the 0-based index of the next data row
    # (Excel row 41); the table header sits on Excel row 40.
    row = 40
    kind = {"w": "Words", "c": "Chars"}
    ngrams = {"1": "Unigrams", "2": "Bigrams", "3": "Trigrams"}
    tf_names = {"tf": "TF", "tfidf": "TF-IDF"}
    # Learning method -> 0-based worksheet column of its results.
    methods = {"svc": 12, "rf": 13, "mlp": 14, "lr": 15, "mnb": 16, "rnn": 17}

    # Best-so-far trackers, one list of [row, col, payload] per method plus an
    # overall one; updated by find_maxes_best.
    if measure == "accuracy_&_confusion_matrix":
        maxes = {
            name: [[0, 0, {"accuracy": 0, "matrix": None}]]
            for name in methods
        }
        best = [[0, 0, {"accuracy": 0, "matrix": None}]]
    else:
        maxes = {name: [[0, 0, 0]] for name in methods}
        best = [[0, 0, 0]]

    # Cell formats are identical for every row, so create them once.
    wrap_format = workbook.add_format()
    wrap_format.set_text_wrap()
    wrap_format.set_align("vcenter")
    wrap_format.set_align("center")
    center_format = workbook.add_format()
    center_format.set_align("center")
    center_format.set_align("vcenter")

    image_num = 0
    for key in sorted(pickle_file_content):
        value = pickle_file_content[key]

        # Numeric averages of this row, used for the "Max Method" column.
        all_averages = []

        # N-Grams data: one line per feature inside each cell.  Features are
        # "_"-separated; fields 2..5 are kind, tf scheme, n-gram order, skips.
        features = value["featurs"]
        if features:
            parts = [feature.split("_") for feature in features]
            worksheet.write(row, 1, "\n".join(kind[p[2]] for p in parts),
                            wrap_format)
            worksheet.write(row, 2, "\n".join(ngrams[p[4]] for p in parts),
                            wrap_format)
            worksheet.write(row, 3, "\n".join(tf_names[p[3]] for p in parts),
                            wrap_format)
            worksheet.write(row, 4, "\n".join(p[5] for p in parts),
                            wrap_format)

        # Stylistic Features data
        # NOTE(review): the returned dict was never used; the call is kept in
        # case initialize_features_dict has side effects -- confirm and drop.
        initialize_features_dict()
        if value["stylistic_features"]:
            worksheet.write(
                row, 5,
                "  ".join(name.upper()
                          for name in value["stylistic_features"]),
                wrap_format)

        # Column 0 holds the number of features.  The per-feature counts used
        # to be written here first via int("<c1>\n<c2>..."), which raised for
        # more than one feature and was overwritten by this write anyway.
        worksheet.write_number(row, 0, value["num_of_features"], wrap_format)

        # Pre Processing and Stop Words data
        normalization = ""
        stopwords = ""
        for char in value["normalization"]:
            if char.lower() in "sbx":
                stopwords += (char.replace("s", "E").replace("b", "H").replace(
                    "x", "X") + " ")
            else:
                normalization += char.upper() + " "
        if normalization == "":
            normalization = "NONE"
        if stopwords == "":
            stopwords = "NONE"

        # Feature selection is optional: leave the two cells empty when the
        # entry has no (or an incomplete) "selection" value.
        try:
            worksheet.write(row, 6, str(value["selection"][0]), center_format)
            worksheet.write(row, 7, str(value["selection"][1]), center_format)
        except (KeyError, IndexError, TypeError):
            pass
        worksheet.write(row, 8, normalization, center_format)
        worksheet.write(row, 9, stopwords, center_format)
        worksheet.write(row, 10, value["k_folds"], center_format)
        worksheet.write(row, 11, value["iterations"], center_format)

        # ML methods and result data
        for method, result in value["results"].items():
            column = methods[method]

            # Non-list results are rendered as an image embedded in the cell.
            if not isinstance(result, list):
                title = measure + str(image_num)
                if measure == "confusion_matrix":
                    plot_confusion_matrix(result, results_path, title=title)
                    worksheet.set_column(column, column, 40)
                    worksheet.set_row(row, 140)
                elif measure == "roc_curve":
                    plot_roc_curve(result, results_path, method, title=title)
                    worksheet.set_column(column, column, 50)
                    worksheet.set_row(row, 225)
                elif measure == "precision_recall_curve":
                    plot_precision_recall_curve(result,
                                                results_path,
                                                title=title)
                    worksheet.set_column(column, column, 47)
                    worksheet.set_row(row, 215)
                elif measure == "accuracy_&_confusion_matrix":
                    plot_confusion_matrix(
                        result["matrix"],
                        results_path,
                        title=title,
                        accuracy=result["accuracy"],
                        cmap=plt.cm.Greys,
                    )
                    worksheet.set_column(column, column, 40)
                    worksheet.set_row(row, 170)
                    best, maxes = find_maxes_best(best, maxes, method, methods,
                                                  row, result)
                # NOTE(review): literal "\\" kept to match the plotting
                # helpers' output location -- confirm before changing.
                worksheet.insert_image(row, column,
                                       results_path + "\\" + title + ".jpg")
                image_num += 1
                continue

            # List results: per-fold scores, reported as a percentage rounded
            # to 4 significant digits plus a significance marker.
            sign = differences_significance(value["baseline_path"], result,
                                            measure, value["k_folds"])
            average = float("{0:.4g}".format(avg(result) * 100))
            val = str(average) + " " + sign
            all_averages.append(average)

            worksheet.write(row, column, val, center_format)

            # Check if val is bigger than the current maxima.
            best, maxes = find_maxes_best(best, maxes, method, methods, row,
                                          val)

        # Best score of the row ("Max Method" column).  Rows that only hold
        # images have no numeric averages; max() of an empty list would raise.
        if all_averages:
            worksheet.write_number("S" + str(row + 1), max(all_averages),
                                   center_format)
        row += 1

    # Colour legend and highlighting.  Untouched [0, 0, 0] placeholders are
    # written to cell A1 here, which is why the A1 title is written last.
    worksheet.write("A19", "Colors", bold_gray)
    best_of_method = workbook.add_format({"bold": True, "font_color": "blue"})
    best_of_method.set_align("center")
    best_of_method.set_align("vcenter")
    for entries in maxes.values():
        image_num = _highlight_cells(worksheet, entries, best_of_method,
                                     measure, results_path, image_num,
                                     "Blues", "blue")
    worksheet.write("A20", "The best result of the learning method",
                    workbook.add_format({"font_color": "blue"}))

    best_overall = workbook.add_format({"bold": True, "font_color": "red"})
    best_overall.set_align("center")
    best_overall.set_align("vcenter")
    image_num = _highlight_cells(worksheet, best, best_overall, measure,
                                 results_path, image_num, "Reds", "red")
    worksheet.write("A21", "The best result in all classification",
                    workbook.add_format({"font_color": "red"}))

    # Results table
    worksheet.write("A39", "Results", workbook.add_format({"bold": True}))
    headers = [
        "Number", "Type", "N-GRAMS", "TF", "Skips", "Stylistic Features",
        "Selection", "Number Selected", "Pre Processing", "Stop Words",
        "K-Folds CV", "Iterations", "SVC", "RF", "MLP", "LR", "MNB", "RNN",
        "Max Method",
    ]
    worksheet.add_table(
        "A40:S" + str(row),
        {
            "columns": [{"header": header} for header in headers],
            "style": "Table Style Light 8",
        },
    )

    worksheet.write("A1",
                    "Classification results: " + measure.replace("_", " "),
                    extra_big)

    workbook.close()

    # Delete the temporary images now that they are embedded in the workbook.
    for file in os.listdir(results_path):
        if file.endswith(".jpg"):
            os.remove(os.path.join(results_path, file))
示例#25
0
# Script fragment: finish building a Keras CNN whose earlier layers are
# defined above this excerpt, train it, report validation/test scores and
# plot a confusion matrix.  `model`, the data arrays, `batch_size`, `epochs`,
# `num_classes`, `genres` and `top6_idxs` are defined outside this excerpt.
model.add(Conv2D(1024, kernel_size=(3, 3), activation='elu'))
model.add(Dropout(0.5))

# Classification head: flatten the feature maps and emit class probabilities.
model.add(Flatten())
model.add(Dense(num_classes, activation='softmax'))

model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.Adam(),
              metrics=['accuracy'])

# Train while monitoring the validation set after each epoch.
model.fit(train_X,
          train_y,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_data=(validation_X, validation_y))
# evaluate() returns [loss, accuracy] given the single metric compiled above.
score = model.evaluate(validation_X, validation_y, verbose=0)
print('Val loss:', score[0])
print('Val accuracy:', score[1])

score = model.evaluate(test_X, test_y, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

# Confusion matrix on the test set: argmax over axis 1 turns the target and
# predicted score rows into class indices (assumes test_y is one-hot encoded
# -- TODO confirm).  Class names are looked up through `top6_idxs`.
preds = model.predict(test_X)
plt.figure(figsize=[12] * 2)
plot_confusion_matrix(test_y.argmax(1),
                      preds.argmax(1), [genres[i] for i in top6_idxs],
                      normalize=False)
# Script fragment: tail of an ensemble of six logistic-regression models.
# Models 1-3, every train/test set, `test_labels` and `majority_vote` are
# defined outside this excerpt.  Each model is fitted on its own training set
# with its own regularisation strength C and scored on its own test set.
predictions_3 = model_3.decision_function(test_set_3)

model_4 = LogisticRegression(C=0.8)
model_4.fit(train_set_4, train_labels_4)
predictions_4 = model_4.decision_function(test_set_4)

model_5 = LogisticRegression(C=0.5)
model_5.fit(train_set_5, train_labels_5)
predictions_5 = model_5.decision_function(test_set_5)

model_6 = LogisticRegression(C=1.4)
model_6.fit(train_set_6, train_labels_6)
predictions_6 = model_6.decision_function(test_set_6)

# Combine the six decision-function outputs into one prediction per sample.
# NOTE(review): decision_function returns signed scores, not class labels;
# presumably majority_vote thresholds them -- confirm in the helper.
predictions = majority_vote([
    predictions_1, predictions_2, predictions_3, predictions_4, predictions_5,
    predictions_6
])

# Accuracy, precision, recall and F1 of the combined prediction.
print(accuracy_score(test_labels, predictions))
print(precision_score(test_labels, predictions))
print(recall_score(test_labels, predictions))
print(f1_score(test_labels, predictions))

# Confusion matrix of the ensemble, plotted and saved to disk.
cnf_matrix = confusion_matrix(test_labels, predictions)
np.set_printoptions(precision=2)
# NOTE(review): this rebinds the name `plt` to the helper's return value; if
# `plt` was matplotlib.pyplot before, that module reference is shadowed from
# here on -- confirm the helper returns something with a savefig method.
plt = plot_confusion_matrix(cnf_matrix,
                            classes=[0, 1],
                            title='Confusion matrix Logistic Regression')
plt.savefig("graphs/logistic_confusion_matrix.png")
# Script fragment: build, train and evaluate a small Keras CNN, then plot a
# confusion matrix of its test predictions.  `input_shape`, `num_classes`,
# `batch_size`, `epochs` and the x/y arrays are defined outside this excerpt.
model = Sequential()
# Two 3x3 convolutions followed by 2x2 max-pooling and dropout.
model.add(
    Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=input_shape))
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))
# Dense classification head ending in a softmax over the classes.
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))

model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.Adadelta(),
              metrics=['accuracy'])

# The test set doubles as the validation set during training.
model.fit(x_train,
          y_train,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_data=(x_test, y_test))
# evaluate() returns [loss, accuracy] given the single metric compiled above.
score = model.evaluate(x_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

# NOTE(review): Sequential.predict_classes was removed in TensorFlow 2.6; on
# newer versions use np.argmax(model.predict(...), axis=1) -- confirm the
# Keras version this script targets.
predict_classes = model.predict_classes(x_test, batch_size=1)
# True class index per sample (argmax over axis 1 of y_test).
true_classes = np.argmax(y_test, 1)
# Here `confusion_matrix` is used as a module/namespace exposing
# plot_confusion_matrix, not as sklearn's confusion_matrix function.
confusion_matrix.plot_confusion_matrix(true_classes,
                                       predict_classes,
                                       save_flg=True)
示例#28
0
def test_KNN(fn):
    """
    Tune and test a K-Nearest Neighbors model.

    Reads ``input/{fn}_train_X.csv``, ``input/{fn}_train_y.csv``,
    ``input/{fn}_test_X.csv`` and ``input/{fn}_test_y.csv`` (``;``-separated),
    tries K = 5, 10, ..., 95, keeps the model with the highest test accuracy,
    prints its confusion matrix and classification report, saves the
    confusion-matrix plot under ``figures/`` and writes a text report to
    ``reports/KNN_{fn}_report.txt``.

    Arguments:
        - fn        :       Name of the input file.
    """
    #Load datasets
    X_train_df = pd.read_csv("input/{}_train_X.csv".format(fn), sep=";")
    y_train_df = pd.read_csv("input/{}_train_y.csv".format(fn), sep=";")
    X_test_df = pd.read_csv("input/{}_test_X.csv".format(fn), sep=";")
    y_test_df = pd.read_csv("input/{}_test_y.csv".format(fn), sep=";")

    #Convert to numpy arrays
    X_train = X_train_df.values.astype(float)
    y_train = y_train_df.values
    X_test = X_test_df.values.astype(float)
    y_test = y_test_df.values

    #Scale X values. The scaler is fitted on the training data only and the
    #same transformation is applied to the test data; refitting it on the
    #test set (as before) puts train and test in different feature spaces.
    scaler = RobustScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    #Transform non-numerical values into numericals. One encoder is fitted on
    #all labels so train and test share the same label -> code mapping (the
    #old code refitted on the test labels, which changes the mapping whenever
    #the two label sets differ and corrupts inverse_transform below).
    encoder = LabelEncoder()
    encoder.fit(np.concatenate([y_train.ravel(), y_test.ravel()]))
    encoded_y_train = encoder.transform(y_train.ravel())
    encoded_y_test = encoder.transform(y_test.ravel())

    #Variables to store the best values. best_acc starts below any possible
    #accuracy so a fitted model is always selected, even if every K scores 0.
    best_model = None
    best_acc = -1.0
    time_taken = 0

    #Test different values for K
    for K in range(5, 100, 5):
        knn = KNeighborsClassifier(n_neighbors=K)

        #Train the model (timed)
        start = time.time()
        knn.fit(X_train, encoded_y_train)
        end = time.time()

        #Predicted values
        y_pred = knn.predict(X_test)
        acc = accuracy_score(encoded_y_test, y_pred)

        print("\nK: {}".format(K))
        print("Acc: {}".format(acc))

        #Save model if it is the best one so far
        if acc > best_acc:
            time_taken = end - start
            best_model = knn
            best_acc = acc

    #Predict using the best model, decoding back to the original labels
    y_pred = encoder.inverse_transform(best_model.predict(X_test))
    K = best_model.get_params()['n_neighbors']

    #Making the Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    print(cm)
    print("\n")
    print(report)
    print("Scores for final, best model:\n")
    print("\nK: {}".format(K))
    print("Acc: {}".format(accuracy_score(y_test, y_pred)))

    #Find labels
    labels = list(y_test_df.iloc[:, 0].unique())

    #Plot confusion matrix
    plot_confusion_matrix(cm, sorted(labels), False)

    #Save the plot
    plt.savefig("figures/KNN_confusion_matrix_{}.svg".format(int(time.time())))

    #Write a .txt report file (the with-block closes the file)
    with open("reports/KNN_{}_report.txt".format(fn), "w") as f:
        f.write("REPORT FOR \"{}\"\n\n".format(fn))
        f.write("Best value for K: {}".format(K))

        f.write("\n\n\nClassification Report:\n")
        f.write(report)

        f.write("\nConfusion Matrix:\n\n")
        f.write(np.array2string(cm, separator=', '))

        f.write(
            "\n\nTime used to train the model: {} seconds".format(time_taken))

        f.write("\n\nScores for final, best model:\n")
        f.write("Accuracy: {}".format(best_acc))
示例#29
0
def new_write_file_content(pickle_file_path, measure, results_path):
    """Export the classification results of a pickle file to an Excel report.

    The workbook is created at
    ``<results_path>/<MEASURE WITH SPACES> for <dataset name>.xlsx`` and holds
    a title, general information, legends for the abbreviations used in the
    table, and one table row per entry of the pickle content (iterated in
    sorted key order, starting at zero-based row 40).

    Each entry of the pickle content is read as a mapping with the keys:

    * ``'features'`` - strings split on ``'_'``; fields 1 to 5 are shown as
      count, word/char kind, TF kind, n-gram size and skips.
    * ``'stylistic_features'`` - names shown upper-cased.
    * ``'normalization'`` - characters; ``s``/``b``/``x`` are reported as stop
      word options (E/H/X), everything else as pre-processing options.
    * ``'results'`` - mapping of learning method name to result.

    A ``float`` result is written as a percentage rounded to 4 significant
    digits. Any other result is rendered to ``<measure><n>.jpg`` by the
    plotting helper matching ``measure`` and inserted as an image. The values
    tracked by ``find_maxes_best`` are finally re-written in blue (best per
    method) and red (best overall).

    Args:
        pickle_file_path: path handed to ``open_pickle_content``.
        measure: name of the reported measure; it selects the plotting helper
            and is part of the file name and of the sheet title.
        results_path: directory receiving the workbook and the temporary
            images.

    Side effects:
        After the workbook is closed, every ``.jpg`` file found in
        ``results_path`` is deleted.
    """
    # Setup the path and the name of the file
    dataset_name = corpus_name()
    pickle_file_content = open_pickle_content(pickle_file_path)
    file_path = os.path.join(results_path,
                             measure.upper().replace(
                                 '_', ' ')) + " for " + dataset_name + '.xlsx'

    # Create a new Excel file and add a worksheet.
    workbook = xlsxwriter.Workbook(file_path)
    worksheet = workbook.add_worksheet()

    # Write titles
    extra_big = workbook.add_format({
        'bold': True,
        'font_size': 17,
        'underline': True
    })
    big = workbook.add_format({'bold': True, 'font_size': 12})
    worksheet.write('A2', dataset_name, big)
    worksheet.write(
        'A3',
        'The classification results are shown in the table below in percentages'
    )

    # Write general data
    bold_gray = workbook.add_format({'bold': True, 'font_color': 'gray'})
    gray = workbook.add_format({'font_color': 'gray'})
    worksheet.write('A5',
                    'General information about the classification software',
                    bold_gray)
    now = datetime.now()
    worksheet.write('A6', 'Issue date: ' + now.strftime("%d/%m/%Y %H:%M:%S"),
                    gray)
    version = '.'.join(str(part) for part in sys.version_info[:3])
    worksheet.write('A7', 'Python version: Python ' + version, gray)
    worksheet.write(
        'A8',
        'Python classification libraries: keras, sklearn, tensorflow, VADAR from nltk, WordCloud',
        gray)

    # Legends: (header cell, header text, ((cell, text), ...)).
    # Every entry has its own cell; the TOP* entries run from I32 to I37 so
    # that none of them overwrites another.
    legends = (
        ('A10', 'Pre Processing', (
            ('A11', "C - Spelling Correction"),
            ('A12', "L - Lowercase"),
            ('A13', "H - HTML tags"),
            ('A14', "P - Punctuations"),
            ('A15', "R - Repeated chars"),
            ('A16', "T - Stemming"),
            ('A17', "M - Lemmatizer"),
        )),
        ('H10', 'Learning methods', (
            ('H11', "svc  - Linear SVC"),
            ('H12', "rf      - Random Forest"),
            ('H13', "mlp  - Multilayer Perceptron"),
            ('H14', "lr      - Logistic Regression"),
            ('H15', "mnb - Multinomial Naive Bayes"),
            ('H16', "rnn   - Recurrent Neural Network"),
        )),
        ('D10', 'Stop Words Options', (
            ('D11', "E - English stop words"),
            ('D12', "H - Hebrew stop words"),
            ('D13', "X - Extended Hebrew stop words"),
        )),
        ('F19', 'Stylistic Features Options', (
            ('F20', "CC - chars count"),
            ('F21', "WC - words count"),
            ('F22', "SC - sentence count"),
            ('F23', "EMC - exclamation mark (!) count"),
            ('F24', "QSMC - question mark (?) count"),
            ('F25', "SCC - special characters (@, #, $, &, *, %, ^) count"),
            ('F26', "QTMC - quotation mark (\", ') count"),
            ('F27', "ALW - average letters in words"),
            ('F28', "ALS - average letters in sentence"),
            ('F29', "AWS - average words in sentence"),
            ('F30', "AWL - average words length"),
            ('F31', "IE - increasing expressions"),
            ('F32', "DE - doubt expressions"),
            ('F33', "NW - negative terms"),
            ('F34', "PW - positive terms"),
            ('F35', "TE - time expressions"),
            ('F36', "EE - emotion expressions"),
            ('I20', "FPE - first person expressions"),
            ('I21', "SPE - second person expressions"),
            ('I22', "TPE - third person expressions"),
            ('I23', "INE - inclusion expressions"),
            ('I24', "P1 - expressions form power 1"),
            ('I25', "P2 - expressions form power 2"),
            ('I26', "P3 - expressions form power 3"),
            ('I27', "PM1 - expressions form power -1"),
            ('I28', "PM2 - expressions form power -2"),
            ('I29', "PM3 - expressions form power -3"),
            ('I30', "PM4 - expressions form power -4"),
            ('I31', "AP - expressions form all the powers"),
            ('I32',
             "TOPA1 - Enable all features on the 1'st trimester of the text"),
            ('I33',
             "TOPA2 - Enable all features on the 2'nd trimester of the text"),
            ('I34',
             "TOPA3 - Enable all features on the 3'rd trimester of the text"),
            ('I35',
             "TOPB1 - Enable all features on the first ten words of the text"),
            ('I36',
             "TOPB2 - Enable all features on the text without 10 first and last 10 words"),
            ('I37',
             "TOPB3 - Enable all features on the last ten words of the text"),
        )),
    )
    for header_cell, header_text, entries in legends:
        worksheet.write(header_cell, header_text, bold_gray)
        for cell, text in entries:
            worksheet.write(cell, text, gray)

    # Write the result
    row = 40
    kind = {'w': 'Words', 'c': 'Chars'}
    ngrams = {'1': 'Unigrams', '2': 'Bigrams', '3': 'Trigrams', '4': '4-gram'}
    tf = {'tf': 'TF', 'tfidf': 'TF-IDF'}
    # Learning method name -> zero-based worksheet column of its results
    methods = {'svc': 8, 'rf': 9, 'mlp': 10, 'lr': 11, 'mnb': 12, 'rnn': 13}

    # Initial "best so far" trackers handed to find_maxes_best; entries are
    # [row, column, value] and the value is a dict for the combined measure.
    if measure == "accuracy_&_confusion_matrix":
        maxes = {
            name: [[0, 0, {"accuracy": 0, "matrix": None}]]
            for name in methods
        }
        best = [[0, 0, {"accuracy": 0, "matrix": None}]]
    else:
        maxes = {name: [[0, 0, 0]] for name in methods}
        best = [[0, 0, 0]]

    image_num = 0
    for key in sorted(pickle_file_content):
        value = pickle_file_content[key]

        # N-Grams data: one line per feature inside each cell
        cell_format = workbook.add_format()
        cell_format.set_text_wrap()
        cell_format.set_align('vcenter')
        cell_format.set_align('center')
        counts, kinds, weightings, grams, skips = [], [], [], [], []
        for feature in value['features']:
            fields = feature.split('_')
            print(fields)
            counts.append(fields[1])
            kinds.append(kind[fields[2]])
            weightings.append(tf[fields[3]])
            grams.append(ngrams[fields[4]])
            skips.append(fields[5])
        worksheet.write(row, 0, '\n'.join(counts), cell_format)
        worksheet.write(row, 1, '\n'.join(kinds), cell_format)
        worksheet.write(row, 2, '\n'.join(grams), cell_format)
        worksheet.write(row, 3, '\n'.join(weightings), cell_format)
        worksheet.write(row, 4, '\n'.join(skips), cell_format)

        # Stylistic Features data
        stylistic_features = '  '.join(
            name.upper() for name in value['stylistic_features'])
        worksheet.write(row, 5, stylistic_features, cell_format)

        # Pre Processing and Stop Words data
        cell_format = workbook.add_format()
        cell_format.set_align('center')
        cell_format.set_align('vcenter')
        normalization = ""
        stopwords = ""
        for char in value['normalization']:
            if char in "sbx":
                stopwords += char.replace('s',
                                          'E').replace('b',
                                                       'H').replace('x', 'X')
            else:
                normalization += char.upper()
        worksheet.write(row, 6, normalization or "NONE", cell_format)
        worksheet.write(row, 7, stopwords or "NONE", cell_format)

        # ML methods and result data
        for method, result in value['results'].items():
            column = methods[method]

            if not isinstance(result, float):
                # Non-float results are rendered to an image by the helper
                # matching the measure, then inserted into the cell.
                title = measure + str(image_num)
                if measure == "confusion_matrix":
                    plot_confusion_matrix(result, results_path, title=title)
                    worksheet.set_column(column, column, 40)
                    worksheet.set_row(row, 140)
                elif measure == "roc_curve":
                    plot_roc_curve(result, results_path, method, title=title)
                    worksheet.set_column(column, column, 50)
                    worksheet.set_row(row, 225)
                elif measure == "precision_recall_curve":
                    plot_precision_recall_curve(result,
                                                results_path,
                                                title=title)
                    worksheet.set_column(column, column, 47)
                    worksheet.set_row(row, 215)
                elif measure == "accuracy_&_confusion_matrix":
                    plot_confusion_matrix(result["matrix"],
                                          results_path,
                                          title=title,
                                          accuracy=result["accuracy"],
                                          cmap=plt.cm.Greys)
                    worksheet.set_column(column, column, 40)
                    worksheet.set_row(row, 170)
                    best, maxes = find_maxes_best(best, maxes, method, methods,
                                                  row, result)
                worksheet.insert_image(
                    row, column,
                    os.path.join(results_path, title) + ".jpg")
                image_num += 1
                continue

            # Float result: percentage with 4 significant digits
            val = float('{0:.4g}'.format(result * 100))
            worksheet.write(row, column, val, cell_format)

            # Check if val bigger then max
            best, maxes = find_maxes_best(best, maxes, method, methods, row,
                                          val)

        row += 1

    # Highlight the best result of every learning method in blue
    worksheet.write('A19', 'Colors', bold_gray)
    method_best_format = workbook.add_format({
        'bold': True,
        'font_color': 'blue'
    })
    method_best_format.set_align('center')
    method_best_format.set_align('vcenter')
    for method_maxes in maxes.values():
        for val in method_maxes:
            if isinstance(val[2], dict):
                # accuracy == 0 is the untouched initial tracker entry
                if val[2]["accuracy"] != 0:
                    image_num += 1
                    title = measure + str(image_num)
                    plot_confusion_matrix(val[2]["matrix"],
                                          results_path,
                                          title=title,
                                          accuracy=val[2]["accuracy"],
                                          cmap=plt.cm.Blues,
                                          color='blue')
                    worksheet.insert_image(
                        val[0], val[1],
                        os.path.join(results_path, title) + ".jpg")
            else:
                worksheet.write(val[0], val[1], val[2], method_best_format)

    blue = workbook.add_format({'font_color': 'blue'})
    worksheet.write('A20', 'The best result of the learning method', blue)

    # Highlight the best overall result in red
    overall_best_format = workbook.add_format({
        'bold': True,
        'font_color': 'red'
    })
    overall_best_format.set_align('center')
    overall_best_format.set_align('vcenter')
    for val in best:
        if isinstance(val[2], dict):
            if val[2]["accuracy"] != 0:
                image_num += 1
                title = measure + str(image_num)
                plot_confusion_matrix(val[2]["matrix"],
                                      results_path,
                                      title=title,
                                      accuracy=val[2]["accuracy"],
                                      cmap=plt.cm.Reds,
                                      color='red')
                worksheet.insert_image(
                    val[0], val[1], os.path.join(results_path, title + ".jpg"))
        else:
            worksheet.write(val[0], val[1], val[2], overall_best_format)

    red = workbook.add_format({'font_color': 'red'})
    worksheet.write('A21', 'The best result in all classification', red)
    bold = workbook.add_format({'bold': True})
    worksheet.write('A39', 'Results', bold)

    # Header sits on Excel row 40; data rows were written from zero-based
    # row 40 (Excel row 41) up to Excel row `row`.
    headers = ('Number', 'Type', 'N-GRAMS', 'TF', 'Skips',
               'Stylistic Features', 'Pre Processing', 'Stop Words', 'svc',
               'rf', 'mlp', 'lr', 'mnb', 'rnn')
    worksheet.add_table(
        "A40:N" + str(row), {
            'columns': [{'header': header} for header in headers],
            'style': 'Table Style Light 8'
        })

    worksheet.write('A1',
                    'Classification results: ' + measure.replace("_", " "),
                    extra_big)

    workbook.close()

    # Delete the images of the non integer measures
    for file_name in os.listdir(results_path):
        if file_name.endswith('.jpg'):
            os.remove(os.path.join(results_path, file_name))
示例#30
0
                y_pred.append(pred_class_num)
            except:
                print("Unexpected error:", sys.exc_info()[0])
                traceback.print_exc()

        print("len", len(Y))

        import confusion_matrix as cm
        import matplotlib.pyplot as plt

        # Compute confusion matrix
        from sklearn.metrics import accuracy_score, confusion_matrix

        accuracy = accuracy_score(Y, y_pred)

        cnf_matrix = confusion_matrix(Y, y_pred)
        np.set_printoptions(precision=2)

        # Plot non-normalized confusion matrix
        plt.figure()
        cm.plot_confusion_matrix(cnf_matrix,
                                 classes,
                                 accuracy,
                                 normalize=True,
                                 title='Confusion matrix')

        plt.show()

    else:
        test_image(img_name, model_name)
示例#31
0
    def _train(self) -> Optional[float]:
        """Fine-tune the model and evaluate it on the test loader.

        Steps:

        1. Evaluate once before fine-tuning and print the confusion matrix,
           accuracy and mean batch loss.
        2. For every epoch from ``self._state.epoch`` up to
           ``self._train_cfg.epochs``: keep the model in eval mode except for
           the architecture-specific sub-modules, which are switched to train
           mode, and optimise with cross-entropy loss. Every 10th batch the
           running training metrics (cumulative over the current epoch) and a
           class-distribution chart are emitted.
        3. After the last epoch: evaluate again, write charts, the confusion
           matrix and a metrics CSV, store the accuracy on ``self._state`` and
           checkpoint on global rank 0.

        NOTE(review): ``writer`` is not defined in this method; it is assumed
        to be a module-level summary writer exposing ``add_scalar`` - confirm.

        Returns:
            The test accuracy (fraction in [0, 1]) measured after the final
            epoch, or ``None`` when no epoch was run.
        """
        import collections
        import csv

        criterion = nn.CrossEntropyLoss()
        print_freq = 10

        def finetuned_modules():
            """Return the sub-modules whose mode is toggled for this architecture."""
            net = self._state.model.module
            architecture = self._train_cfg.architecture
            if architecture == 'PNASNet':
                return [net.cell_11, net.cell_10, net.cell_9, net.dropout]
            if architecture == 'EfficientNet':
                return [net.classifier, net.conv_head, net.bn2]
            return [net.layer4[2].bn3]

        def set_mode(training):
            """Put the model in eval mode, then the toggled sub-modules in the requested mode."""
            self._state.model.eval()
            for module in finetuned_modules():
                if training:
                    module.train()
                else:
                    module.eval()

        def evaluate():
            """Run the test loader without gradients.

            Returns ``(y_true, y_pred, correct, total, loss_sum, batches)``.
            """
            set_mode(training=False)
            y_true, y_pred = [], []
            correct = 0
            total = 0
            loss_sum = 0.0
            batches = 0
            with torch.no_grad():
                for images, labels in self._test_loader:
                    images = images.cuda(self._train_cfg.local_rank,
                                         non_blocking=True)
                    labels = labels.cuda(self._train_cfg.local_rank,
                                         non_blocking=True)
                    outputs = self._state.model(images)
                    loss_sum += criterion(outputs, labels).item()
                    _, predicted = torch.max(outputs.data, 1)
                    y_pred.extend(predicted.tolist())
                    y_true.extend(labels.tolist())
                    total += labels.size(0)
                    correct += (predicted == labels).sum().item()
                    batches += 1
            return y_true, y_pred, correct, total, loss_sum, batches

        print("Evaluation before fine-tuning")
        y_true, y_pred, correct, total, loss_sum, batches = evaluate()
        acc = correct / total
        ls_nm = loss_sum / batches
        cnf_matrix = confusion_matrix(y_true, y_pred)
        print('Confusion matrix:')
        print(cnf_matrix)
        print('\nAccuracy_confusion_matrix:',
              np.diagonal(cnf_matrix).sum() / cnf_matrix.sum())
        print(f"Accuracy of the network on the {total} test images: {acc:.1%}",
              flush=True)
        print(f"Loss of the network on the {total} test images: {ls_nm:.3f}",
              flush=True)
        print("Accuracy before fine-tuning : " + str(acc))

        # Start from the loaded epoch
        start_epoch = self._state.epoch
        for epoch in range(start_epoch, self._train_cfg.epochs):
            print(f"Start epoch {epoch}", flush=True)
            set_mode(training=True)

            # Global step of the first batch of this epoch
            j = epoch * len(self._train_loader)
            self._state.lr_scheduler.step(epoch)
            self._state.epoch = epoch
            running_loss = 0.0
            y_pred_train = []
            y_true_train = []
            for i, data in enumerate(self._train_loader):
                inputs, labels = data
                inputs = inputs.cuda(self._train_cfg.local_rank,
                                     non_blocking=True)
                labels = labels.cuda(self._train_cfg.local_rank,
                                     non_blocking=True)

                outputs = self._state.model(inputs)
                loss = criterion(outputs, labels)
                _, predicted = torch.max(outputs.data, 1)
                y_pred_train.extend(predicted.tolist())
                y_true_train.extend(labels.tolist())
                self._state.optimizer.zero_grad()
                loss.backward()
                self._state.optimizer.step()
                print("Shape_data_train:", len(y_true_train))
                running_loss += loss.item()

                if i % print_freq != print_freq - 1:
                    continue

                # Chart of the number of training samples seen per class
                class_counts = collections.OrderedDict(
                    sorted(collections.Counter(y_true_train).items()))
                train_eda = pygal.Bar()
                train_eda.title = 'Số lượng mẫu của từng lớp trên tập data train'
                train_eda.x_labels = map(str, class_counts.keys())
                train_eda.add('number of data', class_counts.values())
                train_eda.render_to_png('eda_data_train.png')
                train_eda.render()

                # All running metrics use the training predictions gathered
                # so far in this epoch (not the earlier test-set lists).
                train_acc = accuracy_score(y_true_train, y_pred_train)
                f1 = f1_score(y_true_train, y_pred_train, average='micro')
                precision = precision_score(y_true_train,
                                            y_pred_train,
                                            average='micro')
                recall = recall_score(y_true_train,
                                      y_pred_train,
                                      average='micro')

                # Advance the step with the batch index so that points of
                # the same epoch do not share one step.
                step = j + i
                writer.add_scalar("loss iter", running_loss / print_freq,
                                  step)
                writer.add_scalar("accuracy iter", train_acc, step)
                writer.add_scalar("f1-score iter", f1, step)
                writer.add_scalar("precsion_score iter", precision, step)
                writer.add_scalar("recall_score iter", recall, step)
                writer.add_scalar(
                    "learning rate iter",
                    self._state.lr_scheduler.get_last_lr()[0], step)
                running_loss = 0.0

            if epoch != self._train_cfg.epochs - 1:
                continue

            print("Start evaluation of the model", flush=True)
            y_true, y_pred, correct, total, loss_sum, batches = evaluate()
            print("Shape_test_data:", len(y_true))
            cnf_matrix = confusion_matrix(y_true, y_pred)
            class_names = list(range(len(cnf_matrix)))

            # Correct predictions per class (diagonal) against the number of
            # test samples per true class.
            pred_true = list(np.diag(cnf_matrix))
            counter_true = collections.OrderedDict(
                sorted(collections.Counter(y_true).items()))
            y_true_gb = counter_true.values()
            line_chart = pygal.Bar()
            line_chart.title = 'Đồ thị biểu diễn khả năng dự đoán của model cho từng class (data test)'
            # One label per confusion-matrix class instead of a fixed count
            line_chart.x_labels = map(str, class_names)
            line_chart.add('y_true', y_true_gb)
            line_chart.add('y_pred_true', pred_true)
            line_chart.render_to_png('chart_pred_true.png')
            line_chart.render()

            test_eda = pygal.Bar()
            test_eda.title = 'Số lượng mẫu của từng lớp trên tập data test'
            test_eda.x_labels = map(str, counter_true.keys())
            test_eda.add('number of data test', counter_true.values())
            test_eda.render_to_png('eda_data_test.png')
            test_eda.render()

            path = 'save_folder/confusion_matrix.csv'
            np.savetxt(path, cnf_matrix.astype(np.int32), delimiter=",")
            print('Confusion matrix:')
            print(cnf_matrix)

            # Plot non-normalized confusion matrix
            plt.figure()
            plot_confusion_matrix(
                cnf_matrix,
                classes=class_names,
                title='Confusion matrix, without normalization')

            # Large figure saved to disk.
            # NOTE(review): normalize=False is passed here although this
            # figure was originally labelled "normalized" - confirm intent.
            plt.figure(figsize=(50, 50))
            plot_confusion_matrix(cnf_matrix,
                                  classes=class_names,
                                  normalize=False,
                                  title='Confusion matrix')
            plt.savefig("mygraph.png")

            print('\nAccuracy_confusion_matrix:',
                  np.diagonal(cnf_matrix).sum() / cnf_matrix.sum())
            # Model accuracy: how often is the classifier correct?
            acc_ = 100 * accuracy_score(y_true, y_pred)
            print("Accuracy: ", acc_)
            precision_micro = 100 * precision_score(
                y_true, y_pred, average='micro')
            print("Precision micro: ", precision_micro)
            precision_macro = 100 * precision_score(
                y_true, y_pred, average='macro')
            print("precision (macro): ", precision_macro)
            recall_micro = 100 * recall_score(
                y_true, y_pred, average='micro')
            print("Recall(micro): ", recall_micro)
            recall_macro = 100 * recall_score(
                y_true, y_pred, average='macro')
            print("Recall(macro): ", recall_macro)
            f1_macro = 100 * f1_score(y_true, y_pred, average='macro')
            print("F1-scores(macro): ", f1_macro)
            f1_micro = 100 * f1_score(y_true, y_pred, average='micro')
            print("F1-scores(micro): ", f1_micro)
            acc = correct / total
            ls_nm = loss_sum / batches

            matrix_measure = [
                acc_, precision_micro, precision_macro, recall_micro,
                recall_macro, f1_micro, f1_macro
            ]
            # One value per row: first as strings, then the raw numbers.
            # The file is closed (and flushed) when the block exits.
            with open('accuracy_measure.csv', 'w+') as csv_file:
                writer_ = csv.writer(csv_file)
                for word in map(str, matrix_measure):
                    writer_.writerow([word])
                for item in matrix_measure:
                    writer_.writerow([item])

            print(
                f"Accuracy of the network on the {total} test images: {acc:.1%}",
                flush=True)
            print(
                f"Loss of the network on the {total} test images: {ls_nm:.3f}",
                flush=True)
            self._state.accuracy = acc
            if self._train_cfg.global_rank == 0:
                self.checkpoint(rm_init=False)
            return acc

        # No epoch was run (start epoch >= configured number of epochs)
        return None