def getfeatures(subjects, classes, featdir, outdir, outname,
                getmethod=None, feature_type='dir_of_dirs'):
    """Populates the pyradigm data structure with features from a given method.

    getmethod: takes in a path and returns a vectorized feature set
        (e.g. set of subcortical volumes), with an optional array of names for each feature.
    classes: dict of class labels keyed in by subject id

    """

    assert callable(getmethod), "Supplied getmethod is not callable! " \
                                "It must take in a path and return a vectorized feature set and labels."

    # generate a unique numeric label for each class
    class_set = set(classes.values())
    class_labels = dict()
    for idx, cls in enumerate(class_set):
        class_labels[cls] = idx

    ids_excluded = list()

    if feature_type == 'data_matrix':
        data_matrix = get_data_matrix(featdir)

    ds = MLDataset()
    for subjid in subjects:
        try:
            if feature_type == 'data_matrix':
                data = data_matrix[subjects.index(subjid), :]
                feat_names = None
            else:
                data, feat_names = getmethod(featdir, subjid)

            ds.add_sample(subjid, data, class_labels[classes[subjid]],
                          classes[subjid], feat_names)
        except Exception:
            ids_excluded.append(subjid)
            traceback.print_exc()
            warnings.warn("Features for {} via {} method could not be read or added. "
                          "Excluding it.".format(subjid, getmethod.__name__))

    # warn the user if feature extraction failed for even a single subject
    alert_failed_feature_extraction(len(ids_excluded), ds.num_samples, len(subjects))

    # save the dataset to disk to enable passing on multiple dataset(s)
    savepath = os.path.join(outdir, outname)
    ds.save(savepath)

    return savepath
def process_arff(feature_path, subjects, classes, out_dir):
    """Processes the given dataset to return a clean name and path."""

    loaded_dataset = MLDataset(arff_path=feature_path)
    if len(loaded_dataset.description) > 1:
        method_name = loaded_dataset.description
    else:
        method_name = basename(feature_path)

    out_name = make_dataset_filename(method_name)
    out_path_cur_dataset = pjoin(out_dir, out_name)
    loaded_dataset.save(out_path_cur_dataset)

    if not saved_dataset_matches(loaded_dataset, subjects, classes):
        raise ValueError('supplied ARFF dataset does not match samples in the meta data.')

    return method_name, out_path_cur_dataset
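# ---- Usage sketch (not part of the original module) ----
# A minimal, illustrative call to process_arff() defined above. Everything
# below is a hypothetical placeholder: the ARFF path, the subject IDs, the
# class labels and the output folder are assumptions, not files or names
# shipped with the repository.
toy_subjects = ['sub01', 'sub02', 'sub03']
toy_classes = {'sub01': 'CN', 'sub02': 'AD', 'sub03': 'CN'}
toy_out_dir = '/tmp/neuropredict_out'   # assumed to exist and be writable
toy_arff_path = '/tmp/features.arff'    # assumed pre-existing ARFF feature file

# returns a clean method name and the path to the re-saved MLDataset
method_name, ds_path = process_arff(toy_arff_path, toy_subjects, toy_classes, toy_out_dir)
print('imported "{}" -> {}'.format(method_name, ds_path))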
def import_datasets(method_list, out_dir, subjects, classes,
                    feature_path, feature_type='dir_of_dirs'):
    """
    Imports all the specified feature sets and organizes them into datasets.

    Parameters
    ----------
    method_list : list of callables
        Set of predefined methods returning a vector of features for a given sample id and location

    out_dir : str
        Path to the output folder

    subjects : list of str
        List of sample ids

    classes : dict
        Dict identifying the class for each sample id in the dataset.

    feature_path : list of str
        List of paths to the root directory containing the features (pre- or user-defined).
        Must be of the same length as method_list

    feature_type : str
        A string identifying the structure of the feature set.
        Choices = ('dir_of_dirs', 'data_matrix')

    Returns
    -------
    method_names : list of str
        List of method names used for annotation.

    dataset_paths_file : str
        Path to the file containing paths to the imported feature sets.

    """

    def clean_str(string):
        return ' '.join(string.strip().split(' _-:\n\r\t'))

    method_names = list()
    outpath_list = list()
    for mm, cur_method in enumerate(method_list):
        if cur_method in [get_dir_of_dirs]:
            method_name = basename(feature_path[mm])

        elif cur_method in [get_data_matrix]:
            method_name = os.path.splitext(basename(feature_path[mm]))[0]

        elif cur_method in [get_pyradigm]:

            if feature_type in ['pyradigm']:
                loaded_dataset = MLDataset(filepath=feature_path[mm])
            else:
                raise ValueError('Invalid state of the program!')

            if len(loaded_dataset.description) > 1:
                method_name = loaded_dataset.description
            else:
                method_name = basename(feature_path[mm])

            method_names.append(clean_str(method_name))
            if saved_dataset_matches(loaded_dataset, subjects, classes):
                outpath_list.append(feature_path[mm])
                continue
            else:
                raise ValueError('supplied pyradigm dataset does not match '
                                 'samples in the meta data.')

        elif cur_method in [get_arff]:
            loaded_dataset = MLDataset(arff_path=feature_path[mm])
            if len(loaded_dataset.description) > 1:
                method_name = loaded_dataset.description
            else:
                method_name = basename(feature_path[mm])

            method_names.append(clean_str(method_name))
            out_name = make_dataset_filename(method_name)
            outpath_dataset = pjoin(out_dir, out_name)
            loaded_dataset.save(outpath_dataset)
            outpath_list.append(outpath_dataset)
            continue
        else:
            # adding an index for an even more unique identification
            # method_name = '{}_{}'.format(cur_method.__name__, mm)
            method_name = cur_method.__name__

        method_names.append(clean_str(method_name))
        out_name = make_dataset_filename(method_name)

        outpath_dataset = pjoin(out_dir, out_name)
        if not saved_dataset_matches(outpath_dataset, subjects, classes):
            # noinspection PyTypeChecker
            outpath_dataset = get_features(subjects, classes,
                                           feature_path[mm],
                                           out_dir, out_name,
                                           cur_method, feature_type)

        outpath_list.append(outpath_dataset)

    combined_name = uniq_combined_name(method_names)

    dataset_paths_file = pjoin(out_dir, 'datasetlist.' + combined_name + '.txt')
    with open(dataset_paths_file, 'w') as dpf:
        dpf.writelines('\n'.join(outpath_list))

    return method_names, dataset_paths_file
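# ---- Usage sketch (not part of the original module) ----
# An illustrative invocation of import_datasets() defined above for a single,
# previously saved pyradigm dataset, reusing the toy_subjects/toy_classes/
# toy_out_dir placeholders from the earlier sketch. The dataset path below is
# an assumption; get_pyradigm is the reader referenced in the function body.
pyradigm_paths = ['/tmp/existing_features.MLDataset.pkl']   # placeholder path
method_names, dataset_paths_file = import_datasets([get_pyradigm], toy_out_dir,
                                                   toy_subjects, toy_classes,
                                                   pyradigm_paths,
                                                   feature_type='pyradigm')
print('dataset list written to {}'.format(dataset_paths_file))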
def get_features(subjects, classes, featdir, outdir, outname,
                 get_method=None, feature_type='dir_of_dirs'):
    """
    Populates the pyradigm data structure with features from a given method.

    Parameters
    ----------
    subjects : list or ndarray
        List of subject IDs
    classes : dict
        Dict of class labels keyed in by subject id
    featdir : str
        Path to the input directory to read the features from
    outdir : str
        Path to the output directory to save the gathered features to.
    outname : str
        Name of the feature set
    get_method : callable
        Callable that takes in a path and returns a vectorized feature set
        (e.g. set of subcortical volumes), with an optional array of names for each feature.
    feature_type : str
        Identifier of data organization for features.

    Returns
    -------
    saved_path : str
        Path where the features have been saved to as an MLDataset

    """

    if not callable(get_method):
        raise ValueError("Supplied get_method is not callable! "
                         "It must take in a path and return a vectorized feature set and labels.")

    # generate a unique numeric label for each class
    class_set = set(classes.values())
    class_labels = dict()
    for idx, cls in enumerate(class_set):
        class_labels[cls] = idx

    ids_excluded = list()

    if feature_type == 'data_matrix':
        data_matrix = get_data_matrix(featdir)

    ds = MLDataset()
    for subjid in subjects:
        try:
            if feature_type == 'data_matrix':
                data = data_matrix[subjects.index(subjid), :]
                feat_names = None
            else:
                data, feat_names = get_method(featdir, subjid)

            ds.add_sample(subjid, data, class_labels[classes[subjid]],
                          classes[subjid], feat_names)
        except Exception:
            ids_excluded.append(subjid)
            traceback.print_exc()
            warnings.warn("Features for {} via {} method could not be read or added. "
                          "Excluding it.".format(subjid, get_method.__name__))

    # warn the user if feature extraction failed for even a single subject
    alert_failed_feature_extraction(len(ids_excluded), ds.num_samples, len(subjects))

    # save the dataset to disk to enable passing on multiple dataset(s)
    saved_path = realpath(pjoin(outdir, outname))
    try:
        ds.save(saved_path)
    except IOError as ioe:
        print('Unable to save {} features to disk in folder:\n{}'.format(outname, outdir))
        raise ioe

    return saved_path
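# ---- Usage sketch (not part of the original module) ----
# A minimal example of plugging a custom reader into get_features() defined
# above, reusing the toy placeholders from the earlier sketches. The reader
# below simply generates random features and is purely illustrative of the
# expected (featdir, subj_id) -> (features, feature_names) contract.
import numpy as np


def toy_reader(featdir, subj_id):
    """Illustrative get_method: returns 10 random 'features' and their names."""
    feat = np.random.random(10)
    names = ['f{}'.format(ix) for ix in range(10)]
    return feat, names


saved_path = get_features(toy_subjects, toy_classes,
                          '/tmp/feature_root',            # placeholder input dir
                          toy_out_dir, 'toy_features.MLDataset.pkl',
                          get_method=toy_reader, feature_type='dir_of_dirs')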
                flag_incomplete = True
                incomplete_processing[base_feature][weight_method][ds_name].append(sample)
                # print('processing incomplete for {} {} {}'.format(ds_name, weight_method, sample))

        if flag_nan_exists or flag_incomplete or flag_unexpected:
            pass
            # print('{:20} {:25} - processing unusable; totally skipping it.'.format(base_feature, weight_method))
        else:
            print('{:20} {:5} {:25} - fully usable.'.format(base_feature, ds_name, weight_method))
            dataset.description = '{}_{}'.format(base_feature, weight_method)
            out_path = pjoin(out_dir, '{}_{}.MLDataset.pkl'.format(base_feature, weight_method))
            dataset.save(out_path)

# saving
with open(pjoin(out_dir, 'incomplete_unusable_processing.pkl'), 'wb') as ipf:
    pickle.dump([incomplete_processing, comb_nan_values], ipf)

# reading
with open(pjoin(out_dir, 'incomplete_unusable_processing.pkl'), 'rb') as ipf:
    incomplete_processing, comb_nan_values = pickle.load(ipf)

# results
for base_feature in base_feature_list:
    for ds_name in dataset_list:
        for weight_method in histogram_dist:
            print('{:20} {:5} {:25} {:5} {:5}'.format(
for base_feature in features_freesurfer:
    id_list, classes = get_metadata(meta_file)
    class_set = list(set(classes.values()))
    class_set.sort()
    labels = {sub: class_set.index(cls) for sub, cls in classes.items()}

    out_path = pjoin(vis_out_dir,
                     'raw_features_{}_{}.MLDataset.pkl'.format(base_feature, '_'.join(class_set)))
    try:
        ds = MLDataset(filepath=out_path)
    except Exception:
        # no saved dataset yet (or it failed to load): import the features and save them
        traceback.print_exc()
        id_data = import_features(freesurfer_dir, id_list, base_feature, atlas=atlas, fwhm=fwhm)
        ds = MLDataset(data=id_data, labels=labels, classes=classes)
        ds.save(out_path)

    data, lbl, ids = ds.data_and_labels()
    print('{} {}\n min : {:.4f}\n max : {:.4f}'.format(dataset_name, base_feature,
                                                       np.min(data), np.max(data)))
    for perc in [1, 5, 95, 99]:
        print('{:3d}% : {:10.4f}'.format(perc, np.percentile(data, perc)))
class_sizes = np.random.randint(10, 1000, num_classes)
num_features = np.random.randint(10, 500)

class_set = np.array(['C{:05d}'.format(x) for x in range(num_classes)])
feat_names = np.array([str(x) for x in range(num_features)])

test_dataset = MLDataset()
for class_index, class_id in enumerate(class_set):
    for sub_ix in range(class_sizes[class_index]):
        subj_id = '{}_S{:05d}'.format(class_set[class_index], sub_ix)
        feat = np.random.random(num_features)
        test_dataset.add_sample(subj_id, feat, class_index, class_id, feat_names)

out_file = os.path.join(out_dir, 'random_example_dataset.pkl')
test_dataset.save(out_file)

test_dataset.description = 'test dataset'
print(test_dataset)
print('default format:\n {}'.format(test_dataset))
print('full repr     :\n {:full}'.format(test_dataset))
print('string/short  :\n {:s}'.format(test_dataset))

class_set, label_set, class_sizes = test_dataset.summarize_classes()

reloaded_dataset = MLDataset(filepath=out_file, description='reloaded test_dataset')
copy_dataset = MLDataset(in_dataset=test_dataset)

rand_index = np.random.randint(0, len(class_set), 1)[0]
num_classes = np.random.randint(2, 50)
class_sizes = np.random.randint(10, 100, num_classes)
num_features = np.random.randint(10, 100)

class_set = np.array(['C{:05d}'.format(x) for x in range(num_classes)])
feat_names = np.array([str(x) for x in range(num_features)])

test_dataset = MLDataset()
for class_index, class_id in enumerate(class_set):
    for sub_ix in range(class_sizes[class_index]):
        subj_id = '{}_S{:05d}'.format(class_set[class_index], sub_ix)
        feat = np.random.random(num_features)
        test_dataset.add_sample(subj_id, feat, class_index, class_id, feat_names)

out_file = os.path.join(out_dir, 'random_example_dataset.pkl')
test_dataset.save(out_file)

# same IDs, new features
same_ids_new_feat = MLDataset()
for sub_id in test_dataset.keys:
    feat = np.random.random(num_features)
    same_ids_new_feat.add_sample(sub_id, feat,
                                 test_dataset.labels[sub_id],
                                 test_dataset.classes[sub_id])

same_ids_new_feat.feature_names = np.array(['new_f{}'.format(x) for x in range(num_features)])

test_dataset.description = 'test dataset'
print(test_dataset)
print('default format:\n {}'.format(test_dataset))