Exemplo n.º 1
0
def main(
    prediction_filenames,
    submission_filename,
):
    """Average several prediction files and write a submission CSV.

    Every entry of ``prediction_filenames`` is read with
    ``load_predictions``; the results are combined with ``np.vstack`` and
    averaged along axis 0. The averaged rows are paired, in order, with the
    sorted listing of ``config.TEST_DIR_PATH``, and each file is given the
    label whose score is highest. The labels are merged into the sample
    submission on ``fname`` and saved as ``submission_filename`` inside
    ``config.SUBMISSIONS_PATH``.

    NOTE(review): averaging after ``np.vstack`` only ensembles per file if
    ``load_predictions`` returns an array with a leading model axis (e.g.
    shape ``(1, n_files, n_classes)``) -- TODO confirm against its
    definition.
    """
    stacked = np.vstack(
        [load_predictions(path) for path in prediction_filenames]
    )
    averaged = np.mean(stacked, axis=0)

    # Rows of the averaged predictions are matched to files purely by the
    # position of each file name in the sorted directory listing.
    test_names = sorted(os.listdir(config.TEST_DIR_PATH))
    label_by_file = {
        name: idx_to_label[np.argmax(scores)]
        for name, scores in zip(test_names, averaged)
    }

    # Keep the sample submission's rows but replace its 'label' column
    # with the predicted labels.
    template = pd.read_csv(config.SAMPLE_SUBMISSION_PATH).drop('label', axis=1)
    labels_df = pd.DataFrame(
        list(label_by_file.items()),
        columns=['fname', 'label'],
    )
    output_path = os.path.join(config.SUBMISSIONS_PATH, submission_filename)
    template.merge(labels_df, on='fname').to_csv(output_path, index=False)
Exemplo n.º 2
0
def process(exp, reference, query, tag=None):
    """Render heatmap and Sankey figures for one experiment's predictions.

    Reads ``<exp>-preds.csv`` from ``CUR_DIR`` via ``utils.load_predictions``
    and ``<query>_clusters.csv`` from the ``00-metadata`` directory next to
    ``CUR_DIR``, then writes into ``CUR_DIR``:

    * ``<exp>-heatmap.pdf`` for all predictions;
    * for ``query == 'SC03'`` only, ``<exp>-bcells-heatmap.pdf`` and
      ``<exp>-plasma-heatmap.pdf`` for the rows whose assignment is
      cluster 2 and cluster 7 respectively;
    * ``<exp>-sankey.pdf``;
    * ``<exp>-f1.txt`` when ``sickle.mapping`` returns a truthy mapping.

    Args:
        exp: Experiment name, used as the prefix of the input and of every
            output file.
        reference: Passed to ``utils.load_predictions`` and
            ``sickle.mapping``.
        query: Query dataset name; selects the cluster file and the
            assignments returned by ``sickle.assignments``.
        tag: Forwarded unchanged to ``sankey.sankey``.
    """
    exp_clusters = sickle.assignments(query)

    clusters = pd.read_csv('{}/{}_clusters.csv'.format(
        os.path.join(os.path.dirname(CUR_DIR), '00-metadata'), query),
                           index_col=0,
                           header=None)
    # Index entries carry a 'C' marker; drop it so they can be looked up
    # with the integer cluster ids in exp_clusters.
    clusters.index = clusters.index.str.replace('C', '').astype(int)

    preds = utils.load_predictions(
        os.path.join(CUR_DIR, '{}-preds.csv'.format(exp)), reference)

    # NOTE: the second-max heatmaps (get_second_maxes / plot_second_maxes)
    # are currently not generated by this function.

    hmap = heatmap(preds)
    hmap.savefig(os.path.join(CUR_DIR, '{}-heatmap.pdf'.format(exp)))

    if query == 'SC03':
        hmap = heatmap(preds.loc[exp_clusters.index[exp_clusters == 2], :],
                       label=r'$\it{B\ cells}$ from SC03 dataset',
                       figsize=(16, 8),
                       subfig='A')
        hmap.savefig(os.path.join(CUR_DIR,
                                  '{}-bcells-heatmap.pdf'.format(exp)))

        hmap = heatmap(preds.loc[exp_clusters.index[exp_clusters == 7], :],
                       label=r'$\it{Plasma\ cells}$ from SC03 dataset',
                       figsize=(16, 8),
                       subfig='B')
        hmap.savefig(os.path.join(CUR_DIR,
                                  '{}-plasma-heatmap.pdf'.format(exp)))

    mapping = sickle.mapping(query, reference)
    s = sankey.sankey(clusters.iloc[:, 0].loc[exp_clusters],
                      preds.idxmax(axis=1),
                      alpha=.7,
                      left_order=sickle.sankey_order(),
                      mapping=mapping,
                      tag=tag)
    s.savefig(os.path.join(CUR_DIR, '{}-sankey.pdf'.format(exp)))

    if mapping:
        # Compute the score before touching the file so a failure here does
        # not leave behind an empty/truncated result file.
        f1_score = mapping.f1(clusters.iloc[:, 0].loc[exp_clusters],
                              preds.idxmax(axis=1))
        # The context manager guarantees the handle is flushed and closed.
        with open(os.path.join(CUR_DIR, '{}-f1.txt'.format(exp)), 'w') as f:
            f.write('F1 score: {:.4f}'.format(f1_score))
Exemplo n.º 3
0
def process(exp, reference, query, tag=None):
    """Render second-max, heatmap and Sankey figures for one experiment.

    Reads ``<exp>-preds.csv`` from ``CUR_DIR`` via ``utils.load_predictions``
    and ``<query>_clusters.csv`` from the ``00-metadata`` directory next to
    ``CUR_DIR``, then writes into ``CUR_DIR``:

    * ``<exp>-second-max-heatmap.png``;
    * ``<exp>-heatmap.pdf`` for all predictions;
    * for ``query == 'SC03'`` only, ``<exp>-plasma-second-max-heatmap.png``
      and ``<exp>-plasma-heatmap.pdf`` for the rows whose assignment is
      cluster 7;
    * ``<exp>-sankey.pdf``;
    * ``<exp>-f1.txt`` when ``sickle.mapping`` returns a truthy mapping.

    Args:
        exp: Experiment name, used as the prefix of the input and of every
            output file.
        reference: Passed to ``utils.load_predictions`` and
            ``sickle.mapping``.
        query: Query dataset name; selects the cluster file and the
            assignments returned by ``sickle.assignments``.
        tag: Forwarded unchanged to ``sankey.sankey``.
    """
    exp_clusters = sickle.assignments(query)

    clusters = pd.read_csv('{}/{}_clusters.csv'.format(
        os.path.join(os.path.dirname(CUR_DIR), '00-metadata'),
        query
    ), index_col=0, header=None)
    # Index entries carry a 'C' marker; drop it so they can be looked up
    # with the integer cluster ids in exp_clusters.
    clusters.index = clusters.index.str.replace('C', '').astype(int)

    preds = utils.load_predictions(
        os.path.join(CUR_DIR, '{}-preds.csv'.format(exp)),
        reference
    )

    second_maxes = get_second_maxes(preds)
    maxes_heatmap = plot_second_maxes(second_maxes)
    maxes_heatmap.savefig(
        os.path.join(CUR_DIR, '{}-second-max-heatmap.png'.format(exp)))

    if query == 'SC03':
        plasma_second_maxes = get_second_maxes(
            preds.loc[exp_clusters.index[exp_clusters == 7], :])
        maxes_heatmap = plot_second_maxes(plasma_second_maxes)
        # NOTE(review): plt.suptitle targets pyplot's *current* figure;
        # this assumes that is the figure plot_second_maxes just returned
        # -- TODO confirm.
        plt.suptitle('Second max predictions for Plasma cells')
        maxes_heatmap.savefig(os.path.join(
            CUR_DIR, '{}-plasma-second-max-heatmap.png'.format(exp)))

    hmap = heatmap(preds)
    hmap.suptitle('Predictions for {} dataset'.format(query))
    hmap.subplots_adjust(top=0.88)
    hmap.savefig(os.path.join(CUR_DIR, '{}-heatmap.pdf'.format(exp)))

    if query == 'SC03':
        hmap = heatmap(preds.loc[exp_clusters.index[exp_clusters == 7], :],
                       figsize=(16, 12))
        hmap.suptitle('Predictions for SC03 Plasma cells')
        hmap.subplots_adjust(top=0.88)
        hmap.savefig(
            os.path.join(CUR_DIR, '{}-plasma-heatmap.pdf'.format(exp)))

    seaborn.set(font_scale=1)
    mapping = sickle.mapping(query, reference)
    s = sankey.sankey(
        clusters.iloc[:, 0].loc[exp_clusters],
        preds.idxmax(axis=1),
        alpha=.7,
        left_order=sickle.sankey_order(),
        mapping=mapping,
        tag=tag
    )
    s.savefig(os.path.join(CUR_DIR, '{}-sankey.pdf'.format(exp)))

    if mapping:
        # Compute the score before touching the file so a failure here does
        # not leave behind an empty/truncated result file.
        f1_score = mapping.f1(clusters.iloc[:, 0].loc[exp_clusters],
                              preds.idxmax(axis=1))
        # The context manager guarantees the handle is flushed and closed.
        with open(os.path.join(CUR_DIR, '{}-f1.txt'.format(exp)), 'w') as f:
            f.write('F1 score: {:.4f}'.format(f1_score))
Exemplo n.º 4
0
def draw_(img_dir, out_dir, json_file, threshold=0.0001):
    """Run image detection for the images named in a predictions file.

    Loads ``json_file`` with ``load_predictions``, reads every image it
    names from ``img_dir`` to collect the first two entries of its shape
    (rows and columns for an OpenCV image), passes those sizes to
    ``detect_images`` and dumps the result as JSON to
    ``<out_dir>/<suffix>/detections.json``. ``<suffix>`` is the part of
    ``json_file`` after its last underscore, without a trailing ``.json``.

    Args:
        img_dir: Directory containing the images named in the predictions.
        out_dir: Directory under which the per-run sub-directory is created.
        json_file: Path of the predictions file.
        threshold: Currently unused; kept so existing callers keep working.

    Raises:
        OSError: If an image named in the predictions cannot be read.
    """
    predictions = load_predictions(json_file)

    # str.rstrip('.json') strips a *set of characters*, not a suffix, so a
    # name such as 'run_season.json' would be mangled to 'sea'. Remove the
    # exact extension instead.
    exec_time = json_file.rpartition('_')[-1]
    if exec_time.endswith('.json'):
        exec_time = exec_time[:-len('.json')]

    page_sizes = []
    for img_name in predictions:
        img_path = f'{img_dir}/{img_name}'
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread signals failure by returning None rather than
            # raising; fail with the offending path instead of an opaque
            # AttributeError on `.shape`.
            raise OSError(f'could not read image: {img_path}')
        page_sizes.append(img.shape[:2])

    detection_json = detect_images(Path(img_dir), page_sizes)

    subdir = f'{out_dir}/{exec_time}'
    os.makedirs(subdir, exist_ok=True)
    with open(f'{subdir}/detections.json', 'w') as f:
        json.dump(detection_json, f)
Exemplo n.º 5
0
def evaluate(predictions_file, labels_files, output_dir):
    """Score a predictions file against weighted labels and write a TSV.

    Labels, IDs and weights are gathered from every file in
    ``labels_files`` with ``load_weighted_labels``. A predicted ID is
    evaluated only if it appears in the labels with weight 1 and a non-zero
    label. The retained predictions are ranked by descending probability
    and the following values are written, as one header line and one value
    line separated by tabs, to ``output_dir`` under the base name of
    ``predictions_file``:

    * ``auROC``: ``roc_auc_score`` of labels vs. probabilities;
    * ``auPRC``: log2 of ``average_precision_score`` divided by the prior
      (fraction of retained labels equal to 1);
    * ``log(P@r{rank}/prior)`` and ``R@r{rank}`` for ranks 10, 20 and 50.

    If nothing can be evaluated, or a ``ValueError`` is raised while
    scoring, a message is printed and no file is written.

    Args:
        predictions_file: Path read by ``load_predictions``; its base name
            is reused for the output file.
        labels_files: Iterable of paths read by ``load_weighted_labels``.
        output_dir: Directory that receives the metrics file.
    """
    IDs = []
    labels = []
    weights = []
    for labels_file in labels_files:
        file_IDs, file_labels, file_weights = \
            load_weighted_labels(labels_file)
        IDs += file_IDs
        labels += list(file_labels)
        weights += list(file_weights)
    labels = np.array(labels)
    weights = np.array(weights)

    # Map each ID to the position of its FIRST occurrence (what
    # list.index() would return) so lookups are O(1) instead of a linear
    # scan per predicted ID.
    index_by_ID = {}
    for index, ID in enumerate(IDs):
        index_by_ID.setdefault(ID, index)

    predicted_IDs, probs = load_predictions(predictions_file)

    evaluate_labels = []
    evaluate_probs = []

    for i, ID in enumerate(predicted_IDs):
        index = index_by_ID.get(ID)
        if index is None:
            continue
        # only evaluate if...
        # ID has weight = 1 in the original labels
        # and ID is not ambiguous in the original labels
        if weights[index] == 1 and labels[index] != 0:
            evaluate_labels.append(labels[index])
            evaluate_probs.append(probs[i])

    if not evaluate_labels:
        # Without this guard the prior below divides by zero, which would
        # escape the ValueError handler and crash the caller.
        print("ValueError: no predicted IDs are eligible for evaluation")
        return

    evaluate_labels = np.array(evaluate_labels)
    evaluate_probs = np.array(evaluate_probs)

    try:
        # Rank predictions from most to least confident.
        sort_index = evaluate_probs.argsort()[::-1]
        evaluate_labels = evaluate_labels[sort_index]
        evaluate_probs = evaluate_probs[sort_index]

        is_positive = evaluate_labels == 1
        n_positive = is_positive.sum()

        prior = n_positive / len(evaluate_labels)
        ranks = [10, 20, 50]

        auROC = roc_auc_score(evaluate_labels, evaluate_probs)
        auPRC = np.log2(
            average_precision_score(evaluate_labels, evaluate_probs) / prior)

        # precision = TP / (TP + FP)
        precisions = \
            [np.log2((is_positive[:rank].sum() / rank) / prior)
                for rank in ranks]

        # recall = TP / (TP + FN)
        recalls = \
            [(is_positive[:rank].sum() / n_positive)
                for rank in ranks]

        precision_headers = \
            ["log(P@r{}/prior)".format(rank) for rank in ranks]
        recall_headers = ["R@r{}".format(rank) for rank in ranks]

        values = [auROC] + [auPRC] + precisions + recalls
        headers = \
            ["auROC"] + ["auPRC"] + precision_headers + recall_headers

        output_file = \
            os.path.join(output_dir, os.path.basename(predictions_file))
        with open(output_file, "w") as f:
            blank_string = "\t".join(["{}"] * (2 * len(ranks) + 2)) + "\n"
            f.write(blank_string.format(*headers))
            f.write(blank_string.format(*values))

    except ValueError as e:
        print("ValueError:", e)
###############################################################################
from utils import load_predictions, generate_metrics, save_pandas_summary
from utils import summarize_metrics_folds, pandanize_summary

if __name__ == "__main__":
    # Script entry point. Pipeline, in order: load_predictions ->
    # generate_metrics -> summarize_metrics_folds -> pandanize_summary ->
    # save_pandas_summary.
    #
    # print(...) with a single string argument produces the same output on
    # Python 2 and Python 3, whereas the bare print statement is a
    # SyntaxError on Python 3.

    print("Loading data")
    predictions_dict = load_predictions()

    print("Started calculation")
    metrics_dict = generate_metrics(predictions_dict)
    summary = summarize_metrics_folds(metrics_dict)
    print("Finished calculation")

    pd_summary = pandanize_summary(summary)
    save_pandas_summary(pd_summary)
    print("Stored metrics")
import utils

if __name__ == "__main__":
    # Load the stored predictions and echo them to stdout.
    # print(...) with a single argument behaves identically on Python 2 and
    # Python 3; the bare print statement is a SyntaxError on Python 3.
    predictions = utils.load_predictions("experiments/results/predictions.txt")
    print(predictions)