Пример #1
0
def evaluate_test_set(checkpoint_path,
                      html_output_dir: str,
                      prediction_threshold: float = 0.5,
                      network=None,
                      device='cpu',
                      writer=None,
                      writer_step=None):
    """Run inference for a checkpoint and evaluate it on three test-set views.

    The predictions returned by ``inference_main.run_inference`` are binarized
    with ``prediction_threshold`` and passed to ``evaluation_metrics`` for:

    1. the whole test set (``split_name='all_test_set'``), written to
       ``html_output_dir``;
    2. the ECGs returned by
       ``quality.test_set_quality_keep(quality.EcgQuality.PERFECT)``
       (``split_name='perfect_ecgs_only'``), written to
       ``<html_output_dir>/perfect``;
    3. the ECGs returned by
       ``quality.test_set_quality_filter(quality.EcgQuality.SEVERE_ARTIFACTS)``
       (``split_name='no_artifacts'``), written to
       ``<html_output_dir>/without_artifacts``.

    Args:
        checkpoint_path: Forwarded to ``inference_main.run_inference``.
        html_output_dir: Base output directory handed to
            ``evaluation_metrics``; the two sub-directories are created on
            demand.
        prediction_threshold: A prediction ``>=`` this value becomes 1,
            anything else becomes 0.
        network: Forwarded to ``inference_main.run_inference``.
        device: Forwarded to ``inference_main.run_inference``.
        writer: Forwarded to ``evaluation_metrics``.
        writer_step: Forwarded to ``evaluation_metrics``.

    Returns:
        Whatever ``evaluation_metrics`` returns for the ``'no_artifacts'``
        split (presumably the maximal F1 score, judging by the name the
        original code bound it to -- confirm against ``evaluation_metrics``).
    """
    predictions_df = inference_main.run_inference(
        checkpoint_path=checkpoint_path, network=network, device=device)

    logging.info("\n\nEvaluating all Test-Set...")
    predictions_df['binary_predictions'] = predictions_df['prediction'].apply(
        lambda pred: 1 if pred >= prediction_threshold else 0)
    evaluation_metrics(predictions_df,
                       html_output_dir,
                       split_name='all_test_set',
                       writer=writer,
                       writer_step=writer_step)

    logging.info("\n\nEvaluating Perfect ECGs...")
    only_perfect_ecgs_numbers = quality.test_set_quality_keep(
        quality.EcgQuality.PERFECT)
    only_perfect_ecgs_df = predictions_df[predictions_df['ecg_number'].isin(
        only_perfect_ecgs_numbers)]
    logging.info("Number of Perfect ECGs to evaluate: %d",
                 len(only_perfect_ecgs_df))
    perfect_output_dir = os.path.join(html_output_dir, 'perfect')
    # exist_ok=True replaces the isdir()-then-makedirs() pair, which could
    # fail if the directory appeared between the check and the creation.
    os.makedirs(perfect_output_dir, exist_ok=True)
    evaluation_metrics(only_perfect_ecgs_df,
                       perfect_output_dir,
                       split_name='perfect_ecgs_only',
                       writer=writer,
                       writer_step=writer_step)

    logging.info("\n\nEvaluating ECGs without artifacts...")
    all_ecgs_except_artifacts_numbers = quality.test_set_quality_filter(
        quality.EcgQuality.SEVERE_ARTIFACTS)
    all_ecgs_except_artifacts_df = predictions_df[
        predictions_df['ecg_number'].isin(all_ecgs_except_artifacts_numbers)]
    logging.info("Number of ECGs without artifacts to evaluate: %d",
                 len(all_ecgs_except_artifacts_df))
    without_artifacts_output_dir = os.path.join(html_output_dir,
                                                'without_artifacts')
    os.makedirs(without_artifacts_output_dir, exist_ok=True)
    max_f1 = evaluation_metrics(all_ecgs_except_artifacts_df,
                                without_artifacts_output_dir,
                                split_name='no_artifacts',
                                writer=writer,
                                writer_step=writer_step)
    return max_f1
Пример #2
0
    def __init__(self, transform=None, threshold_35=False):
        """Load the test-set sheet and keep every ECG that is not ignored.

        Lists the files in ``metadata.DICOM_DIR``, reads
        ``metadata.TEST_SET_FILE`` with ``pd.read_excel``, numbers the rows
        (index value + 1) in an ``ecg_number`` column, and stores the rows
        whose number is not in ``quality.ECGS_TO_IGNORE`` as
        ``self.test_set_df``.  Counts for the full sheet, the "perfect"
        subset and the "without artifacts" subset are only logged.

        Args:
            transform: Stored as ``self.transform``.
            threshold_35: Stored as ``self.threshold_35``.
        """
        self.threshold_35 = threshold_35
        self.dicom_files = os.listdir(metadata.DICOM_DIR)

        # Only the test-set sheet is loaded here; no annotation CSV is read.
        full_df = pd.read_excel(metadata.TEST_SET_FILE)
        full_df['ecg_number'] = full_df.index.map(lambda row: row + 1)

        # Drop the ECGs listed in quality.ECGS_TO_IGNORE (blank ECGs,
        # according to the original comment).
        ignored_mask = full_df['ecg_number'].isin(quality.ECGS_TO_IGNORE)
        self.test_set_df = full_df[~ignored_mask]

        # NOTE(review): the two subsets below are taken from the unfiltered
        # sheet (ignored ECGs included) and are used for logging only.
        perfect_numbers = quality.test_set_quality_keep(
            quality.EcgQuality.PERFECT)
        perfect_only_df = full_df[full_df['ecg_number'].isin(perfect_numbers)]

        artifact_free_numbers = quality.test_set_quality_filter(
            quality.EcgQuality.SEVERE_ARTIFACTS)
        artifact_free_df = full_df[
            full_df['ecg_number'].isin(artifact_free_numbers)]

        logging.info("Total Number of ECGs in the test-set: %d", len(full_df))
        logging.info("Number of Perfect ECGs: %d", len(perfect_only_df))
        logging.info("Number of ECGs without Artifacts: %d",
                     len(artifact_free_df))

        self.transform = transform
        logging.info("Final number of annotations: %d", len(self.test_set_df))
Пример #3
0
    def __init__(self, excel_path, dicom_dir, split_name=None, transform=None, threshold_35=False,
                 test_split_type=TestType.ALL_TEST):
        """Load the annotations, match them to DICOM files and select a split.

        Steps, in order:
          1. Read ``excel_path`` with ``pd.read_csv`` and reconcile it with
             the files in ``dicom_dir`` via ``validate_dicom_and_excel``.
          2. Read the test-set sheet, number its rows (index value + 1), drop
             ``quality.ECGS_TO_IGNORE`` and store the ``'file name'`` columns
             of the full / without-artifacts / perfect-only subsets.
          3. Unless ``split_name == 'test'``, call ``self.filter_test_set()``.
          4. If ``split_name`` is given, replace the annotations with
             ``self.split(name=split_name)``.
          5. Call ``self.filter_blank_ecgs()``.

        Progress is reported with ``print``.

        Args:
            excel_path: Path of the annotations file (read as CSV).
            dicom_dir: Directory whose entries are listed as DICOM files.
            split_name: Optional split name; ``'test'`` skips step 3.
            transform: Stored as ``self.transform``.
            threshold_35: Stored as ``self.threshold_35``.
            test_split_type: Stored as ``self.test_split_type``.
        """
        self.threshold_35 = threshold_35
        self.excel_path = excel_path
        self.dicom_dir = dicom_dir
        self.test_split_type = test_split_type

        raw_annotations = pd.read_csv(excel_path)
        self.dicom_files = os.listdir(dicom_dir)
        self.annotations_df = validate_dicom_and_excel(raw_annotations, self.dicom_files)
        print("After filtering: ", len(self.annotations_df))

        # NOTE(review): machine-specific absolute path; kept as-is.
        test_set_path = '/Users/tomer.golany/PycharmProjects/ecg_medical_research/ecg_medical_research/data_reader/excel_files/test_set_v2.xlsx'
        sheet_df = pd.read_excel(test_set_path)
        sheet_df['ecg_number'] = sheet_df.index.map(lambda row: row + 1)

        # Ignored ECGs are removed first, so both subsets below exclude them.
        kept_mask = ~sheet_df['ecg_number'].isin(quality.ECGS_TO_IGNORE)
        sheet_df = sheet_df[kept_mask]

        perfect_numbers = quality.test_set_quality_keep(quality.EcgQuality.PERFECT)
        perfect_df = sheet_df[sheet_df['ecg_number'].isin(perfect_numbers)]

        artifact_free_numbers = quality.test_set_quality_filter(quality.EcgQuality.SEVERE_ARTIFACTS)
        artifact_free_df = sheet_df[sheet_df['ecg_number'].isin(artifact_free_numbers)]

        print(f'All test set: {len(sheet_df)}')
        print(f'Only perfect ECGS: {len(perfect_df)}')
        print(f'Without Artifacts ECGS: {len(artifact_free_df)}')

        file_column = 'file name'
        self.test_set_files = sheet_df[file_column]
        self.test_set_df_without_artifacts = artifact_free_df[file_column]
        self.test_set_df_only_perfect = perfect_df[file_column]

        if split_name != 'test':
            self.filter_test_set()
        print("After filtering test set:", len(self.annotations_df))

        if split_name is not None:
            self.annotations_df = self.split(name=split_name)
        self.transform = transform
        print("Final length: ", len(self.annotations_df))

        print("Filtering blank ECGS:")
        self.filter_blank_ecgs()
        print("Length after filtering: ", len(self.annotations_df))
Пример #4
0
def eval_natalia(ecg_quality_keep=None, ecg_quality_filter=None):
    """Score the answers in the "Natalia" workbook against the test-set labels.

    The test-set sheet supplies the ground truth (``'label'``); the doctor's
    workbook supplies the predictions (``'EF>=50% (mark 1 if yes!)'``).  Rows
    answered with ``'?'`` are dropped, the two tables are joined on the ECG
    number, optionally restricted by quality, and scored with
    ``perf_measure``.

    Args:
        ecg_quality_keep: If not None, only ECG numbers returned by
            ``quality.test_set_quality_keep(ecg_quality_keep)`` are scored.
        ecg_quality_filter: If not None, only ECG numbers returned by
            ``quality.test_set_quality_filter(ecg_quality_filter)`` are
            scored.  Both arguments must expose ``.value`` (used for logging).

    Returns:
        Tuple ``('Natalia', tpr, fpr, tp, fp, tn, fn, f1_score, n)`` where
        ``n`` is the number of scored ECGs.  A rate whose denominator is zero
        is reported as NaN instead of raising ``ZeroDivisionError``.
    """
    answer_column = 'EF>=50% (mark 1 if yes!)'

    #
    # Read Test-Set:
    #
    # NOTE(review): machine-specific absolute paths; kept as-is.
    test_set_path = '/Users/tomer.golany/PycharmProjects/ecg_medical_research/ecg_medical_research/data_reader/excel_files/test_set_v2.xlsx'
    test_df = pd.read_excel(test_set_path)
    # ECG numbers are the 1-based row positions.  An ordered array is used
    # because a set has no defined order and recent pandas versions refuse to
    # build a column from one.
    test_df['ECG number'] = np.arange(1, len(test_df) + 1)
    test_df = test_df[['ECG number', 'label']]

    answers_path = '/Users/tomer.golany/PycharmProjects/ecg_medical_research/ecg_medical_research/data_reader/excel_files/doctors_answers/ECG_echo_table Natalia.xlsx'
    answers_df = pd.read_excel(answers_path)
    # Drop rows answered with '?'.  .copy() makes the result an independent
    # frame, so adding a column below does not write into a slice.
    answers_df = answers_df[answers_df[answer_column] != '?'].copy()
    answers_df['ECG number'] = answers_df['no']
    logging.info("Total number of answers to evaluate: %d", len(answers_df))
    merged_df = pd.merge(test_df, answers_df, on=['ECG number'])

    #
    # Keep only ECGs at a quality:
    #
    if ecg_quality_keep is not None:
        quality_ecg_numbers = quality.test_set_quality_keep(ecg_quality_keep)
        merged_df = merged_df[merged_df['ECG number'].isin(quality_ecg_numbers)]
        logging.info("Number of ECGs to evaluate after keeping on quality %s: %d", ecg_quality_keep.value,
                     len(merged_df))
    if ecg_quality_filter is not None:
        quality_ecg_numbers = quality.test_set_quality_filter(ecg_quality_filter)
        merged_df = merged_df[merged_df['ECG number'].isin(quality_ecg_numbers)]
        # Message fixed: this branch filters a quality out, it does not keep it.
        logging.info("Number of ECGs to evaluate after filtering quality %s: %d", ecg_quality_filter.value,
                     len(merged_df))

    predictions = list(merged_df[answer_column])
    ground_truths = list(merged_df['label'])
    tp, fp, tn, fn = perf_measure(ground_truths, predictions)

    # A quality restriction can leave no positives and/or no negatives; report
    # the undefined rate as NaN rather than crashing.
    nan = float('nan')
    positives = tp + fn
    negatives = fp + tn
    f1_denominator = tp + 0.5 * (fp + fn)
    tpr = tp / positives if positives else nan
    fpr = fp / negatives if negatives else nan
    f1_score = tp / f1_denominator if f1_denominator else nan
    logging.info(f"Doctor Name: Natalia\nScores: tpr: {tpr}. fpr: {fpr}\n\n")
    return 'Natalia', tpr, fpr, tp, fp, tn, fn, f1_score, len(ground_truths)
Пример #5
0
def eval(output_csv_name, ecg_quality_keep=None, ecg_quality_filter=None):
    """Score every doctor's EF answers against the test set and export them.

    For each ``'Personal ID'`` in ``DOCTOR_ANSWERS`` (except ``'nako'``):

    1. answers whose ECG number is outside 1..1000 are discarded;
    2. ECG numbers answered more than once are discarded entirely;
    3. answers from ``DOCTOR_ANSWERS_COMPLETENCE`` are appended
       (doctor ``'reyu'`` is looked up there as ``'revi'``);
    4. ECGs listed in ``quality.ECGS_TO_IGNORE`` are removed;
    5. ``'YES'`` answers become 1, every other value becomes 0;
    6. the answers are joined to the test-set labels, optionally restricted
       by quality, and scored with ``perf_measure``;
    7. each attribute in ``ECG_ATTRIBUTES`` is scored with
       ``analyze_attributes``.

    The row returned by ``eval_natalia`` is appended to the general results.

    Args:
        output_csv_name: Path handed to ``pd.ExcelWriter``.  Despite the
            name, an Excel workbook is written: one ``general_results``
            sheet plus one sheet per attribute.
        ecg_quality_keep: If not None, only ECG numbers returned by
            ``quality.test_set_quality_keep(ecg_quality_keep)`` are scored.
        ecg_quality_filter: If not None, only ECG numbers returned by
            ``quality.test_set_quality_filter(ecg_quality_filter)`` are
            scored.  Both arguments must expose ``.value`` (used for logging).

    Raises:
        AssertionError: If a doctor still has duplicate ECG numbers after the
            completion answers were appended.
    """
    answer_column = 'Is EF  equal or more than 50%'
    exported_csv = []
    exported_csv_attributes = {attribute: [] for attribute in ECG_ATTRIBUTES}

    df = pd.read_excel(DOCTOR_ANSWERS)
    df_missing_answers = pd.read_excel(DOCTOR_ANSWERS_COMPLETENCE)
    # Normalize IDs (lower-case, no spaces) so both workbooks can be matched.
    df['Personal ID'] = df['Personal ID'].apply(lambda x: x.lower().replace(' ', ''))
    df_missing_answers['Personal ID'] = df_missing_answers['Personal ID'].apply(lambda x: x.lower().replace(' ', ''))
    doctor_ids = df['Personal ID'].unique()
    logging.info("#%d doctors:\n %s", len(doctor_ids), "\n".join(doctor_ids))

    #
    # Read Test-Set:
    #
    # NOTE(review): machine-specific absolute path; kept as-is.
    test_set_path = '/Users/tomer.golany/PycharmProjects/ecg_medical_research/ecg_medical_research/data_reader/excel_files/test_set_v2.xlsx'
    test_df = pd.read_excel(test_set_path)
    # Valid ECG numbers; kept as a set for membership tests and set arithmetic.
    ecg_ids = set(np.arange(1, 1001, 1))
    # ECG numbers are the 1-based row positions.  An ordered array is used
    # because a set has no defined order and recent pandas versions refuse to
    # build a column from one.
    test_df['ECG number'] = np.arange(1, len(test_df) + 1)
    test_df = test_df[['ECG number', 'label']]

    # Group by the bare column name: grouping by a one-element list yields
    # 1-tuples as keys on pandas >= 2.0, which would break the string
    # comparisons on doctor_name below.
    for doctor_name, doctor_df in df.groupby('Personal ID'):
        if doctor_name == 'nako':
            continue
        logging.info("Evaluating answers from Doctor %s", doctor_name)
        ecg_numbers = doctor_df['ECG number']

        #
        # Filter ECG numbers not between 1-1000:
        #
        logging.info("Filtering ECGs that are not between 1 - 1000...")
        in_range = doctor_df['ECG number'].isin(ecg_ids)
        logging.info("Undefined ECGs: %s", list(doctor_df.loc[~in_range, 'ECG number']))
        doctor_df = doctor_df[in_range]

        #
        # Find ids not annotated:
        #
        missing_ids = ecg_ids - set(ecg_numbers.unique())
        logging.info("Number of unique ECG answers: %d. Missing ECGs: %s", len(ecg_numbers.unique()), missing_ids)

        #
        # Find duplicate annotations:
        #
        # duplicated(keep=False) marks every row of a repeated ECG number and
        # simply yields an empty frame when there are none; the previous
        # pd.concat over a generator raised ValueError in that case.
        duplicate_ecg_answers = doctor_df[doctor_df.duplicated(subset=['ECG number'], keep=False)]
        duplicate_ids = np.sort(duplicate_ecg_answers['ECG number'].unique())
        logging.info("Number of duplicate ECGs: %d. Duplicates ECGs: %s",
                     len(duplicate_ids),
                     duplicate_ids)

        # keep=False: an ECG answered more than once is dropped entirely.
        doctor_df = doctor_df.drop_duplicates(subset=['ECG number'], keep=False)
        logging.info("Total number of answers to evaluate: %d", len(doctor_df))

        #
        # Search for the missing answers in the new excel file:
        #
        logging.info("Searching for the missing answers in the new excel file...")
        # 'reyu' appears as 'revi' in the completion workbook.
        completion_id = 'revi' if doctor_name == 'reyu' else doctor_name
        df_more_answers = df_missing_answers[df_missing_answers['Personal ID'] == completion_id]
        logging.info("Found extra answers: %s", df_more_answers['ECG number'].unique())
        doctor_df = pd.concat([doctor_df, df_more_answers])
        missing_ids = ecg_ids - set(doctor_df['ECG number'].unique()) - set(quality.ECGS_TO_IGNORE)
        logging.info("Number of unique ECG answers: %d. Missing ECGs: %s", len(doctor_df['ECG number'].unique()),
                     missing_ids)
        assert len(doctor_df['ECG number'].unique()) == len(doctor_df['ECG number'])

        #
        # Remove bad ECGs
        #
        doctor_df = doctor_df[~doctor_df['ECG number'].isin(quality.ECGS_TO_IGNORE)]
        logging.info("After removing bad ECGs: %d", len(doctor_df['ECG number'].unique()))

        answers_df = doctor_df.sort_values(by=['ECG number'])
        # The former check compared the results of two .sort() calls, i.e.
        # None == None, and could never fail.  Unexpected values are reported
        # rather than raised, because they have always been mapped to 0 below.
        unexpected_answers = set(answers_df[answer_column].unique()) - {'YES', 'NO'}
        if unexpected_answers:
            logging.warning("Doctor %s has answers other than YES/NO (mapped to 0): %s",
                            doctor_name, unexpected_answers)
        answers_df[answer_column] = answers_df[answer_column].apply(
            lambda x: 1 if x == 'YES' else 0)
        merged_df = pd.merge(test_df, answers_df, on=['ECG number'])
        logging.info(type(merged_df))
        #
        # Keep only ECGs at a quality:
        #
        if ecg_quality_keep is not None:
            quality_ecg_numbers = quality.test_set_quality_keep(ecg_quality_keep)
            merged_df = merged_df[merged_df['ECG number'].isin(quality_ecg_numbers)]
            logging.info(type(merged_df))
            logging.info("Number of ECGs to evaluate after keeping on quality %s: %d", ecg_quality_keep.value,
                         len(merged_df))
        if ecg_quality_filter is not None:
            quality_ecg_numbers = quality.test_set_quality_filter(ecg_quality_filter)
            merged_df = merged_df[merged_df['ECG number'].isin(quality_ecg_numbers)]
            logging.info(type(merged_df))
            logging.info("Number of ECGs to evaluate after filtering quality %s: %d", ecg_quality_filter.value,
                         len(merged_df))

        predictions = list(merged_df[answer_column])
        ground_truths = list(merged_df['label'])
        tp, fp, tn, fn = perf_measure(ground_truths, predictions)

        # A quality restriction can leave no positives and/or no negatives;
        # report the undefined rate as NaN rather than aborting the export.
        nan = float('nan')
        positives = tp + fn
        negatives = fp + tn
        f1_denominator = tp + 0.5 * (fp + fn)
        tpr = tp / positives if positives else nan
        fpr = fp / negatives if negatives else nan
        f1_score = tp / f1_denominator if f1_denominator else nan
        logging.info(f"Doctor Name: {doctor_name}\nScores: tpr: {tpr}. fpr: {fpr}\n\n")
        exported_csv.append((doctor_name, tpr, fpr, tp, fp, tn, fn, f1_score, len(ground_truths)))

        #
        # Analyze the additional attributes:
        #
        for attribute in ECG_ATTRIBUTES:
            tp, fp, tn, fn, tpr, fpr, total = analyze_attributes(merged_df, attribute, doctor_name)
            exported_csv_attributes[attribute].append((doctor_name, tpr, fpr, tp, fp, tn, fn, total))

    exported_csv.append(eval_natalia(ecg_quality_keep, ecg_quality_filter))

    exported_csv_df = pd.DataFrame(exported_csv, columns=['Name', 'True Positive Rate', 'False Positive Rate',
                                                          '#TP', '#FP', '#TN', '#FN', 'F1 Score', 'Number of ECGs'])
    attribute_columns = ['Name', 'True Positive Rate', 'False Positive Rate',
                         '#TP', '#FP', '#TN', '#FN', 'Number of ECGs']
    exported_csv_attributes_df = {
        attribute: pd.DataFrame(exported_csv_attributes[attribute], columns=attribute_columns)
        for attribute in ECG_ATTRIBUTES
    }

    # The context manager writes the workbook on exit; the explicit
    # writer.save() was redundant and no longer exists in pandas >= 2.0.
    with pd.ExcelWriter(output_csv_name) as writer:
        exported_csv_df.to_excel(writer, sheet_name="general_results")
        for attribute in ECG_ATTRIBUTES:
            exported_csv_attributes_df[attribute].to_excel(writer, sheet_name=f'{attribute}')