def prepDataSet(csv_filename, feature_set=None, dataset_name='generic dataset', ddg_cutoff=0.0, truncate=False):
    '''
    Prepares a data set object from a CSV file, under the conventions of
    this project:
    - the CSV is indexed by PDBID and residue number (columns 0,1)
    - the last column contains label-related data, mostly ddG values of
      residues.
    - all other columns are feature columns.

    The function reads the columns into a TreeDict structure, such that each
    component (normalized feature data, labels, PDB identifiers, columns
    used) is accessible as an attribute.

    Arguments:
    - ``csv_filename``: path of the CSV file to load. The data set is
      flagged as bound (``is_bound``) unless the path contains the
      substring 'unbound'.
    - ``feature_set``: optional; directs the function which features to
      select from the table. By default (None), every column except the
      last one is selected.
    - ``dataset_name``: optional, giving the TreeDict a name.
    - ``ddg_cutoff``: a row is labelled positive (True in ``y``) when its
      label value is strictly greater than this cutoff.
    - ``truncate``: when true, only the first DEBUG_DATASET_SIZE rows of the
      table are kept.

    Returns the populated TreeDict, with attributes ``csv_filename``,
    ``is_bound``, ``feature_set``, ``feature_data_df``, ``X``,
    ``label_data_df``, ``ddg_cutoff``, ``y`` and ``pdbs``.
    '''
    dataset = TreeDict(dataset_name)
    dataset.csv_filename = os.path.abspath(csv_filename)
    dataset.is_bound = 'unbound' not in csv_filename
    dataset._df = cached_csv_df(
        csv_filename,
        index_col=[0, 1],
        true_values=['True'],
        false_values=['False'],
    )
    if truncate:
        dataset._df = dataset._df[:DEBUG_DATASET_SIZE]

    if feature_set is None:
        # No explicit selection: every column but the trailing label column
        # is treated as a feature.
        cols = dataset._df.columns[:-1]
        dataset.feature_set = FeatureSet(cols, cols)
    else:
        dataset.feature_set = feature_set

    # DataFrame.ix was removed from pandas; .loc is its label-based
    # replacement. Feature columns are selected by name here.
    all_feature_data_df = dataset._df.loc[:, dataset.feature_set.all_features]
    dataset.feature_data_df = all_feature_data_df.loc[:, dataset.feature_set.features]
    # Standardize the feature matrix column-wise.
    dataset.X = sklearn.preprocessing.scale(
        dataset.feature_data_df.values.astype(float))

    # The label column is addressed by position (last column), hence .iloc.
    dataset.label_data_df = dataset._df.iloc[:, -1]
    dataset.ddg_cutoff = ddg_cutoff
    dataset.y = dataset.label_data_df.values > dataset.ddg_cutoff

    # sanity checks
    assert dataset.X.shape[0] == len(dataset.y)

    dataset.pdbs = dataset.feature_data_df.index.get_level_values(0)
    return dataset
def createConfig(feature_set, train=None, test=None, title_meta=None, ddg_cutoff=0.0):
    '''
    Builds the experiment configuration as a TreeDict named 'config'.

    The configuration records the feature set, the default bound / unbound
    CSV file names and the ddG cutoff, and holds two prepared data sets:
    - ``training``: loaded from ``train`` if given, otherwise from the
      default unbound CSV.
    - ``testing``: loaded from ``test`` if given, otherwise from the
      default bound CSV.

    ``title`` is taken from ``feature_set.getTitle()``. The ``title_meta``
    argument is accepted but not used by this function.
    '''
    config = TreeDict('config')
    config.feature_set = feature_set
    config.bound = 'bound.data.old.csv'
    config.unbound = 'unbound.data.old.csv'
    config.ddg_cutoff = ddg_cutoff

    # Fall back to the default files when no (or an empty) path is supplied.
    training_csv = train if train else config.unbound
    config.training = data.prepDataSet(
        training_csv,
        feature_set=config.feature_set,
        ddg_cutoff=ddg_cutoff,
    )

    testing_csv = test if test else config.bound
    config.testing = data.prepDataSet(
        testing_csv,
        feature_set=config.feature_set,
        ddg_cutoff=ddg_cutoff,
    )

    config.title = feature_set.getTitle()
    return config