def run_dummy_baseline(set_name, attribute):
    """
    Run a trivial baseline that ignores the input features.

    For regression attributes the mean target value is predicted, for all other
    attributes the most frequent value. The baseline is fitted and scored on the
    same set, so the result is a reference point rather than a held-out estimate.

    :param set_name: the name of the dataset to run on (key of the default set spec dict)
    :param attribute: the attribute to predict (must be a key of ``problem_kind``)
    :return: the mean absolute error for regression attributes, otherwise the output
        of ``classification_report``
    """
    set_spec = get_default_set_spec_dict()[set_name]
    # attributes without an entry are not restricted to a set of legal values
    class_values = problem_legal_values.get(attribute)
    patients = get_data_for_spec(set_spec,
                                 loader_type='bag',
                                 attribute_to_filter=attribute,
                                 legal_attribute_values=class_values,
                                 muscles_to_use=None)
    y_true = [patient.attributes[attribute] for patient in patients]

    is_regression = problem_kind[attribute] == 'regression'
    if is_regression:
        model = DummyRegressor(strategy='mean')
        scorer = mean_absolute_error
    else:
        model = DummyClassifier(strategy='most_frequent')
        scorer = classification_report

    # the dummy estimators never look at the features, constant placeholders suffice
    placeholder_x = [0] * len(y_true)
    model.fit(placeholder_x, y_true)
    train_preds = model.predict(placeholder_x)

    if not is_regression:
        mapping = {'NMD': 1, 'no NMD': 0}
        # The AUC is only printed for the binary NMD labels. Other classification
        # attributes carry labels outside of this mapping and used to abort with a
        # KeyError here before the report could be returned.
        if all(label in mapping for label in y_true):
            y_true_rv = [mapping[y] for y in y_true]
            train_preds_rv = [mapping[y] for y in train_preds]
            print(roc_auc_score(y_true_rv, train_preds_rv))

    return scorer(y_true, train_preds)
def export_selected_records(set_name):
    """
    Gather the meta information of the selected record of every patient in a set.

    :param set_name: the name of the dataset (key of the default set spec dict)
    :return: a DataFrame with one row per patient, built from ``record.meta_info``
    """
    spec = get_default_set_spec_dict()[set_name]
    patients = get_data_for_spec(spec,
                                 loader_type='bag',
                                 attribute_to_filter='Class',
                                 legal_attribute_values=problem_legal_values['Class'],
                                 muscles_to_use=None)

    def _selected_meta_info(patient):
        # the selection has to happen before the selected record is requested
        patient.select_closest()
        return patient.get_selected_record().meta_info

    return pd.DataFrame([_selected_meta_info(patient) for patient in patients])
def extract_y_true(set_name):
    """
    Extract the ground truth NMD diagnosis for each patient in this set.

    :param set_name: the name of the dataset (key of the default set spec dict)
    :return: a tuple of (list with 1 for 'NMD' and 0 for 'no NMD' per patient,
        DataFrame built from the attributes of all patients)
    """
    spec = get_default_set_spec_dict()[set_name]
    patients = get_data_for_spec(spec,
                                 loader_type='bag',
                                 attribute_to_filter='Class',
                                 legal_attribute_values=problem_legal_values['Class'],
                                 muscles_to_use=None)

    # single pass over the patients: remember the label next to the full attributes
    label_and_attributes = [(patient.attributes['Class'], patient.attributes)
                            for patient in patients]

    label_codes = {'NMD': 1, 'no NMD': 0}
    encoded_labels = [label_codes[label] for label, _ in label_and_attributes]
    attribute_frame = pd.DataFrame([attributes for _, attributes in label_and_attributes])
    return encoded_labels, attribute_frame
def compute_brightness(set_name, device_name):
    """
    Print the normalization parameters (mean and std) of all images in a set.

    The images are loaded as a single-channel ``SingleImageDataset`` without size
    limit and handed to ``compute_normalization_parameters``. Nothing is returned,
    both values are only printed.

    :param set_name: the name of the image set, e.g. "ESAOTE_6100_train"
    :param device_name: the device the basic transform is built for, e.g. "ESAOTE_6100"
    """
    attribute_specs = make_att_specs()
    spec = get_default_set_spec_dict()[set_name]
    image_frame = get_data_for_spec(spec, loader_type='image', dropna_values=False)

    basic_transform = make_basic_transform(device_name, limit_image_size=False, to_tensor=True)
    # NOTE(review): the 'Sex' spec is presumably only passed because the dataset
    # expects at least one attribute spec -- confirm against SingleImageDataset.
    dataset = SingleImageDataset(image_frame=image_frame,
                                 root_dir=spec.img_root_path,
                                 attribute_specs=[attribute_specs['Sex']],
                                 return_attribute_dict=False,
                                 transform=basic_transform,
                                 use_one_channel=True)

    mean, std = compute_normalization_parameters(dataset, 1)
    for value in (mean, std):
        print(value)
def run_rule_based_baseline(set_name, ei_extraction_method):
    """
    Get the rule-based prediction for each patient in this set.

    :param set_name: The data set to use.
    :param ei_extraction_method: The method used to extract the scores from a record.
        It is called with the selected record and must return a mapping with an
        'EIZ' entry.
    :return: A float array with one score per patient: 1 for 'NMD', 0 for 'no NMD'
        and 0.5 for 'unknown or uncertain'.
    :raises ValueError: if a prediction is none of the three known labels and can
        therefore not be converted to a score.
    """
    set_spec = get_default_set_spec_dict()[set_name]
    patients = get_data_for_spec(set_spec,
                                 loader_type='bag',
                                 attribute_to_filter='Class',
                                 legal_attribute_values=problem_legal_values['Class'],
                                 muscles_to_use=None)

    preds = []
    for patient in tqdm.tqdm(patients):
        patient.try_closest_fallback_to_latest()
        record = patient.get_selected_record()
        eiz = ei_extraction_method(record)['EIZ']
        feature_rep = get_feature_rep_for_rule_based(eiz, record)
        preds.append(predict_rule_based(feature_rep))

    # the rule-based model can also predict uncertain disease state, map this to 0.5 to allow an
    # additional threshold during ROC computation.
    label_to_score = {'NMD': 1, 'no NMD': 0, 'unknown or uncertain': 0.5}
    # Work on an object Series and cast explicitly: relying on the implicit
    # downcasting of ``replace`` is deprecated in pandas and makes the dtype of the
    # result depend on the pandas version and on which labels happen to occur.
    scores = pd.Series(preds, dtype=object).replace(label_to_score)
    return scores.astype(float).values
def obtain_feature_rep_ml_experiment(set_name,
                                     use_eiz=True,
                                     ei_extraction_method=None,
                                     additional_features=None,
                                     smoothing_factors=None):
    """
    A method that maps the entire provided set into the feature representation used for the
    Trad ML experiments.

    :param set_name: The name of the dataset to be mapped.
    :param use_eiz: Use EIZ scores? If false, use raw EI scores.
    :param ei_extraction_method: The method for extracting EI scores from records. Can use
        original scores or recompute. Defaults to ``get_original_scores``.
    :param additional_features: Additional demographic features to be included (extracted from
        the records).
    :param smoothing_factors: Optional iterable of factors. For every factor the features are
        additionally computed on ``smooth_vector(vector, (0, 256), factor)`` and stored with
        the suffix ``_smoothed_<factor>``. By default no smoothed features are added.
    :return: A DataFrame of mapped patient records for classification. Patients whose score
        vector only consists of NaN values are left out.
    :raises ValueError: if ``use_eiz`` is set but the extraction method returns no 'EIZ' entry.
    """
    # use the original scores as default
    if ei_extraction_method is None:
        ei_extraction_method = get_original_scores
    if not additional_features:
        additional_features = []
    if smoothing_factors is None:
        smoothing_factors = ()

    set_spec = get_default_set_spec_dict()[set_name]
    patients = get_data_for_spec(set_spec,
                                 loader_type='bag',
                                 attribute_to_filter='Class',
                                 legal_attribute_values=problem_legal_values['Class'],
                                 muscles_to_use=None)

    # The extractor table is the same for every patient, so it is read only once
    # instead of iterating the frame inside the patient loop. All extractors are
    # used, no additional filtering is applied.
    extractors = [(row['name'], row['func']) for _, row in extractor_frame.iterrows()]

    feature_reps = []
    for patient in patients:
        patient.try_closest_fallback_to_latest()
        record = patient.get_selected_record()

        # allow swapping between z_scores and EI values
        return_dict = ei_extraction_method(record)
        if use_eiz:
            if 'EIZ' not in return_dict:
                raise ValueError(
                    f'Required z-score computation, but method {ei_extraction_method} did not provide it.'
                )
            prefix = 'EIZ'
        else:
            prefix = 'EI'
        vector = return_dict[prefix]

        # drop all NaN entries and skip patients that have no valid score left
        vector = vector[~np.isnan(vector)]
        if len(vector) == 0:
            continue

        # optional smoothed variants first, the original scale is always used and comes last
        suffixed_vectors = [('_smoothed_' + str(factor),
                             smooth_vector(vector, (0, 256), factor))
                            for factor in smoothing_factors]
        suffixed_vectors.append(('', vector))

        # feature extraction starts here
        feature_rep = {}
        for suffix, current_vector in suffixed_vectors:
            for extractor_name, func in extractors:
                feature_rep[prefix + '_' + extractor_name + suffix] = func(current_vector)

        # demographic features overwrite extracted features of the same name
        feature_rep.update(extract_features_from_meta_info(record, additional_features))
        feature_rep['Class'] = record.meta_info['Class']
        feature_reps.append(feature_rep)

    return pd.DataFrame(feature_reps)
import os import itk import numpy from utils.experiment_utils import get_default_set_spec_dict from loading.loaders import get_data_for_spec, make_bag_dataset, make_basic_transform from loading.datasets import make_att_specs, PatientBagDataset from loading.datasets import problem_legal_values if __name__ == '__main__': set_name = 'ESAOTE_6100_val' set_spec_dict = get_default_set_spec_dict() set_spec = set_spec_dict[set_name] patients = get_data_for_spec( set_spec, loader_type='bag', attribute_to_filter='Class', legal_attribute_values=problem_legal_values['Class'], muscles_to_use=None) att_spec_dict = make_att_specs() transform = make_basic_transform(set_spec.device, normalizer_name=None, to_tensor=False, limit_image_size=True) ds = PatientBagDataset(patient_list=patients, root_dir=set_spec.img_root_path, attribute_specs=[att_spec_dict['Sex']], transform=transform, use_pseudopatients=False,