Пример #1
0
def evaluate(exe,
             test_program,
             test_pyreader,
             fetch_list,
             eval_phase,
             f1=False):
    """
    Run one pass over the evaluation reader and print a summary line.

    Args:
        exe: executor whose ``run`` method executes ``test_program``.
        test_program: program handed to ``exe.run``.
        test_pyreader: reader that is started before the pass and reset once
            ``fluid.core.EOFException`` signals that it is exhausted.
        fetch_list: fetch targets handed to ``exe.run``. With ``f1=True`` two
            values are unpacked per batch (probabilities, labels); otherwise
            three (loss, accuracy, number of sequences).
        eval_phase: label interpolated into the printed summary.
        f1: when true, print accuracy plus macro-averaged precision, recall
            and f1; otherwise print loss and accuracy averaged with each
            batch weighted by its number of sequences.

    Returns:
        None. The results are only printed.
    """
    test_pyreader.start()
    started_at = time.time()

    if f1:
        predicted, expected = [], []
        exhausted = False
        while not exhausted:
            try:
                probs, labels = exe.run(program=test_program,
                                        fetch_list=fetch_list,
                                        return_numpy=True)
                # Predicted class is the index of the largest score per row.
                predicted.extend([np.argmax(row) for row in probs])
                # Only the first entry of every label row is used.
                expected.extend([row[0] for row in labels])
            except fluid.core.EOFException:
                # The reader signals the end of the data with an exception.
                test_pyreader.reset()
                exhausted = True
        elapsed = time.time() - started_at

        expected = np.array(expected)
        predicted = np.array(predicted)
        accuracy = utils.accuracy(expected, predicted)
        macro = utils.classification_report(expected, predicted)["macro avg"]
        summary = (eval_phase, accuracy, macro['precision'], macro['recall'],
                   macro['f1-score'], elapsed)
        print(
            "[%s evaluation] accuracy: %f, macro precision: %f, recall: %f, f1: %f, elapsed time: %f s"
            % summary)
        return

    weighted_losses, weighted_accs, seq_counts = [], [], []
    exhausted = False
    while not exhausted:
        try:
            raw_loss, raw_acc, raw_count = exe.run(program=test_program,
                                                   fetch_list=fetch_list,
                                                   return_numpy=False)
            batch_loss = np.array(raw_loss)
            batch_acc = np.array(raw_acc)
            batch_count = np.array(raw_count)
            # Weight per-batch values by batch size so the final division
            # yields a per-sequence average.
            weighted_losses.extend(batch_loss * batch_count)
            weighted_accs.extend(batch_acc * batch_count)
            seq_counts.extend(batch_count)
        except fluid.core.EOFException:
            # The reader signals the end of the data with an exception.
            test_pyreader.reset()
            exhausted = True
    elapsed = time.time() - started_at

    avg_loss = np.sum(weighted_losses) / np.sum(seq_counts)
    avg_acc = np.sum(weighted_accs) / np.sum(seq_counts)
    print("[%s evaluation] avg loss: %f, ave acc: %f, elapsed time: %f s" %
          (eval_phase, avg_loss, avg_acc, elapsed))
Пример #2
0
    def using_expert_knowledge(self, real_labels_file):
        """Log one classification report per label column against expert labels.

        The expert file is read as a ';'-separated CSV and inner-joined with
        ``self.data`` on the 'Method' column. For every column named in
        ``self.list_labels`` the joined 'CLevel' column and that column are
        passed, in that order, to ``classification_report`` and the result
        is logged at INFO level.

        Args:
            real_labels_file: source accepted by ``pd.read_csv``; must provide
                the 'Method' and 'CLevel' columns.
        """
        logging.info('Validation results using labelled data from expert')

        expert_frame = pd.read_csv(real_labels_file, sep=';')
        expert_columns = expert_frame[['Method', 'CLevel']]
        label_columns = self.data[['Method'] + self.list_labels]
        # Inner join: only methods present on both sides are evaluated.
        joined = expert_columns.merge(label_columns, on='Method', how='inner')

        for label_name in self.list_labels:
            report = classification_report(joined['CLevel'],
                                           joined[label_name])
            message = '------- {} ------ \n {}'.format(label_name, report)
            logging.info(message)
Пример #3
0
def train(train_dataset, valid_dataset, validation_bool, test_dataset,
          fam_dict_path, num_column, num_trains, num_tests, test_file_path,
          args):
    """Train a DeepRfam model, score it on the test data and save the results.

    Everything is written below ``{args.log_dir}/{log_file_name}`` where
    ``log_file_name`` is a month-day_hour-minute timestamp, optionally
    followed by ``_{args.log_name}``: the model (.h5), an image of the model
    structure, the confusion matrix, the training-history plot and the
    classification report. One summary row is appended to
    ``{args.log_dir}/history.csv``.

    Args:
        train_dataset: generator/sequence passed to ``model.fit_generator``.
        valid_dataset: validation data; only used when ``validation_bool``.
        validation_bool: whether to pass ``valid_dataset`` to training.
        test_dataset: generator/sequence passed to
            ``model.predict_generator``.
        fam_dict_path: dictionary file forwarded to the ``utils`` reporting
            helpers as ``dict_file``.
        num_column: forwarded to the model as ``num_c``.
        num_trains: unused; kept for interface compatibility.
        num_tests: unused; kept for interface compatibility.
        test_file_path: comma-separated text file whose first field on every
            line is the integer true label of a test sample.
        args: namespace of hyper-parameters and logging options
            (``seq_length``, ``num_filters``, ``filter_sizes``, ``keep_prob``,
            ``num_classes``, ``num_hidden``, ``loss_function``, ``optimizer``,
            ``learning_rate``, ``num_epochs``, ``batch_size``, ``dataset``,
            ``log_name``, ``log_dir``).

    Returns:
        None.
    """
    # load model
    # NOTE(review): ``args.keep_prob`` is passed as ``dropout_rate``; confirm
    # that the model really expects a drop rate and not a keep probability.
    model = rna_model.DeepRfam(seq_length=args.seq_length,
                               num_c=num_column,
                               num_filters=args.num_filters,
                               filter_sizes=args.filter_sizes,
                               dropout_rate=args.keep_prob,
                               num_classes=args.num_classes,
                               num_hidden=args.num_hidden)
    print(model.summary())

    # model compile
    # Look the optimizer class up by name instead of eval()-ing a string
    # built from a command-line argument.
    optimizer_cls = getattr(optimizers, args.optimizer)
    model.compile(loss=args.loss_function,
                  optimizer=optimizer_cls(lr=args.learning_rate),
                  metrics=['accuracy'])

    # start and record training history
    fit_kwargs = {
        'epochs': args.num_epochs,
        'verbose': 1,
        'use_multiprocessing': True,
        'workers': 6,
    }
    if validation_bool:
        fit_kwargs['validation_data'] = valid_dataset
    train_history = model.fit_generator(train_dataset, **fit_kwargs)

    # =================================logging=============================================
    local_time = time.strftime("%m-%d_%H-%M", time.localtime())
    # determine log file name and create the run directory
    if args.log_name is None:
        log_file_name = local_time
    else:
        log_file_name = local_time + '_' + args.log_name
    run_dir = f"{args.log_dir}/{log_file_name}"
    # exist_ok: the timestamp only has minute resolution, and failing here
    # would throw away a fully trained model.
    os.makedirs(run_dir, exist_ok=True)

    # save model to .h5 file
    model.save(f"{run_dir}/{log_file_name}.h5")

    # save the image of model structure
    plot_model(model,
               to_file=f"{run_dir}/model_structure.png",
               show_shapes=True)

    # Predict on the test data (deliberately without multiprocessing).
    # The original code referenced an undefined name ``test_generator`` here.
    prediction = model.predict_generator(test_dataset)

    # get the list of true labels: first comma-separated field of each line
    label_list = []
    with open(test_file_path) as label_file:
        for line in label_file:
            seq_index = line.strip().split(',', 1)[0]
            if seq_index != '':
                label_list.append(int(seq_index))

    # Drop any predictions beyond the number of labelled samples.
    prediction = prediction[:len(label_list)]
    prediction_1d = np.array([np.argmax(row) for row in prediction])

    # save confusion matrix into .csv file
    utils.cm2csv(true_labels=label_list,
                 predicted_labels=prediction_1d,
                 dict_file=fam_dict_path,
                 save_dir=run_dir)
    accuracy = accuracy_score(label_list, prediction_1d)
    print('Accuracy:', accuracy)

    # generate the confusion matrix plot (only for 20 classes or fewer)
    if args.num_classes <= 20:
        utils.plot_cm(true_labels=label_list,
                      predicted_labels=prediction_1d,
                      dict_file=fam_dict_path,
                      title='Confusion Matrix',
                      save_dir=run_dir)

    # draw and save history plot
    utils.plot_history(train_history, run_dir)

    # save the classification report
    utils.classification_report(true_labels=label_list,
                                predicted_labels=prediction_1d,
                                dict_file=fam_dict_path,
                                save_dir=run_dir,
                                std_out=True)

    # append a summary row to the shared history .csv file
    # Commas inside the filter-size list are replaced so they do not split
    # the CSV field.
    filter_sizes_field = str(args.filter_sizes).replace(",", " ")
    history_row = (
        f'{local_time},{log_file_name},{args.dataset},{accuracy},'
        f'{filter_sizes_field},{args.num_filters},{args.batch_size},'
        f'{args.num_epochs},{args.keep_prob},{args.num_hidden},'
        f'{args.learning_rate},{args.loss_function},{args.optimizer}, ')
    with open(f"{args.log_dir}/history.csv", 'a') as history_file:
        print(history_row, file=history_file)
Пример #4
0
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# utils.py and separation_mvp.py are in the repo
from utils import classification_report
from separation_mvp import SeparatedClassifier

# Demo script: fit a plain logistic regression on a synthetic dataset, print
# its report, then fit a SeparatedClassifier on the scores and print the
# report again for the adjusted predictions.
url = "https://raw.githubusercontent.com/omarfsosa/datasets/master/fairness_synthetic_data.csv"
df = pd.read_csv(url)

# "y" is the target and "A" the group column; "A" also stays in the features.
# Note that test_size=0.6 leaves only 40% of the rows for training.
X_train, X_test, y_train, y_test, A_train, A_test = train_test_split(
    df.drop(columns="y"), df["y"], df["A"], test_size=0.6, random_state=42)

# Baseline classifier.
clf = LogisticRegression(solver="lbfgs")
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred, A_test))

# Scores: predicted probability of the second class (column 1).
R_train = clf.predict_proba(X_train)[:, 1]
R_test = clf.predict_proba(X_test)[:, 1]
goal_tpr, goal_fpr = 0.83591123066577, 0.2639968121139669

fair_clf = SeparatedClassifier(y_train, R_train, A_train)
# NOTE(review): arguments are passed as (fpr, tpr); confirm this matches the
# parameter order of SeparatedClassifier.fit.
fair_clf.fit(goal_fpr, goal_tpr)

# Print the three values stored per group.
for k, v in fair_clf.randomized_thresholds.items():
    print(f"Group {k}: t0={v[0]:.2f}, t1={v[1]:.2f}, p={v[2]:.2f}")

y_pred_fair = fair_clf.fair_predict(R_test, A_test)
print(classification_report(y_test, y_pred_fair, A_test))
Пример #5
0
def train(train_dataset, valid_dataset, validation__bool, test_dataset,
          label_list, fam_path, num_channels, num_trains, num_valids,
          num_tests, args):
    """Train an L5CFam model, score it on the test data and save the results.

    Everything is written below ``{args.log_dir}/{log_file_name}`` where
    ``log_file_name`` is a month-day_hour-minute timestamp, optionally
    followed by ``_{args.log_name}``: the model (.h5), an image of the model
    structure, the confusion matrix, the training-history plot and the
    classification report. One summary row is appended to
    ``{args.log_dir}/history.csv``.

    Args:
        train_dataset: generator/sequence passed to ``model.fit_generator``.
        valid_dataset: validation data; only used when ``validation__bool``.
        validation__bool: whether to pass ``valid_dataset`` to training.
        test_dataset: generator/sequence passed to
            ``model.predict_generator``.
        label_list: true labels of the test samples, compared with the
            argmax of each prediction row.
        fam_path: dictionary file forwarded to the ``utils`` reporting
            helpers as ``dict_file``.
        num_channels: forwarded to the model as ``num_channels``.
        num_trains: unused; kept for interface compatibility.
        num_valids: unused; kept for interface compatibility.
        num_tests: unused; kept for interface compatibility.
        args: namespace of hyper-parameters and logging options
            (``seq_length``, ``num_filters``, ``filter_sizes``, ``keep_prob``,
            ``num_classes``, ``num_hidden``, ``loss_function``, ``optimizer``,
            ``learning_rate``, ``num_epochs``, ``batch_size``, ``dataset``,
            ``log_name``, ``log_dir``).

    Returns:
        None.
    """
    # load model
    # NOTE(review): ``args.keep_prob`` is passed as ``dropout_rate``; confirm
    # that the model really expects a drop rate and not a keep probability.
    model = rna_model.L5CFam(seq_length=args.seq_length,
                             num_filters=args.num_filters,
                             num_channels=num_channels,
                             filter_sizes=args.filter_sizes,
                             dropout_rate=args.keep_prob,
                             num_classes=args.num_classes,
                             num_hidden=args.num_hidden)
    print(model.summary())

    # model compile
    # NOTE(review): ``args.learning_rate`` is logged below but never applied
    # here, because the optimizer is passed through as given in args.
    model.compile(loss=args.loss_function,
                  optimizer=args.optimizer,
                  metrics=['accuracy'])

    # start and record training history
    fit_kwargs = {
        'epochs': args.num_epochs,
        'verbose': 1,
        'workers': 6,
        'use_multiprocessing': True,
    }
    if validation__bool:
        fit_kwargs['validation_data'] = valid_dataset
    train_history = model.fit_generator(train_dataset, **fit_kwargs)

    # =================================logging=============================================
    local_time = time.strftime("%m-%d_%H-%M", time.localtime())
    # determine log file name and create the run directory
    if args.log_name is None:
        log_file_name = local_time
    else:
        log_file_name = local_time + '_' + args.log_name
    run_dir = f"{args.log_dir}/{log_file_name}"
    # exist_ok: the timestamp only has minute resolution, and failing here
    # would throw away a fully trained model.
    os.makedirs(run_dir, exist_ok=True)

    # save model to .h5 file
    model.save(f"{run_dir}/{log_file_name}.h5")

    # save the image of model structure
    plot_model(model,
               to_file=f"{run_dir}/model_structure.png",
               show_shapes=True)

    # predict on the test data and take the argmax of every prediction row
    prediction = model.predict_generator(test_dataset,
                                         workers=6,
                                         use_multiprocessing=True)
    prediction_1d = np.array([np.argmax(row) for row in prediction])

    # save confusion matrix into .csv file
    utils.cm2csv(true_labels=label_list,
                 predicted_labels=prediction_1d,
                 dict_file=fam_path,
                 save_dir=run_dir)
    accuracy = accuracy_score(label_list, prediction_1d)
    print('Accuracy:', accuracy)

    # draw and save history plot
    utils.plot_history(train_history, run_dir)

    # generate the confusion matrix plot (only for 20 classes or fewer)
    # The original code passed an undefined name ``fam_dict_path`` here and
    # in the classification report; the parameter is ``fam_path``.
    if args.num_classes <= 20:
        utils.plot_cm(true_labels=label_list,
                      predicted_labels=prediction_1d,
                      dict_file=fam_path,
                      title='Confusion Matrix',
                      save_dir=run_dir)

    # save the classification report
    utils.classification_report(true_labels=label_list,
                                predicted_labels=prediction_1d,
                                dict_file=fam_path,
                                save_dir=run_dir,
                                std_out=True)

    # append a summary row to the shared history .csv file
    # Commas inside list-valued settings are replaced or removed so they do
    # not split the CSV field.
    filter_sizes_field = str(args.filter_sizes).replace(",", " ")
    num_filters_field = str(args.num_filters).replace(",", "")
    num_hidden_field = str(args.num_hidden).replace(",", "")
    history_row = (
        f'{local_time},{log_file_name},{args.dataset},{accuracy},'
        f'{filter_sizes_field},{num_filters_field},{args.batch_size},'
        f'{args.num_epochs},{args.keep_prob},{num_hidden_field},'
        f'{args.learning_rate},{args.loss_function},{args.optimizer}, ')
    with open(f"{args.log_dir}/history.csv", 'a') as history_file:
        print(history_row, file=history_file)