def load_clean_datasets(self, dataset_name, project_settings):
    assert dataset_name in ['train_val', 'train', 'val', 'test']
    clean_input_files = project_settings['clean_input_files']
    X_name, y_name = 'X_' + dataset_name, 'y_' + dataset_name
    data_dir = find_data_dir(project_settings)
    data = dict()
    take_nth_row = project_settings.get('take_nth_row', 1)
    # keep only every nth row (row indices 0, n, 2n, ...)
    X_abs_filepath = data_dir + '/' + clean_input_files[X_name]
    X = pd.read_csv(X_abs_filepath, sep=r"\s+", engine='python', header=None,
                    skiprows=lambda i: i % take_nth_row != 0)
    y_mat_file_path = data_dir + '/' + clean_input_files[y_name]
    y_mat = pd.read_csv(y_mat_file_path, sep=r"\s+", engine='python', header=None,
                        skiprows=lambda i: i % take_nth_row != 0)
    y = y_mat.iloc[:, 0].tolist()
    data[dataset_name] = (X, y)
    return data
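# Hedged usage sketch (not part of the pipeline): the skiprows lambda above
# keeps only every nth row (indices 0, n, 2n, ...), which makes quick
# experiments on large files cheap. io.StringIO stands in for a file on disk.
def _demo_take_nth_row():
    import io
    demo_file = io.StringIO("\n".join(str(i) for i in range(10)))
    take_nth_row = 3
    kept = pd.read_csv(demo_file, header=None,
                       skiprows=lambda i: i % take_nth_row != 0)
    return kept  # rows 0, 3, 6, 9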
def main():
    data_dir = find_data_dir(project_settings)
    processed_dir = data_dir + '/processed'
    if not os.path.isdir(processed_dir):
        os.makedirs(processed_dir)
    raw_files = project_settings['raw_input_files']

    # Remove duplicate columns or alter their names: the 2nd, 3rd, ...
    # occurrence of a repeated feature name gets a '..k' suffix.
    column_names_filepath = data_dir + '/' + raw_files['feature_names']
    feat_df = pd.read_csv(column_names_filepath, sep=r"\s+", engine='python',
                          names=['file_index', 'feature_name'])
    feat_df.sort_values('feature_name', inplace=True)
    feat_df['col_name_count'] = feat_df.groupby(['feature_name']).cumcount() + 1
    feat_df['new_feature_name'] = feat_df['feature_name'].where(
        feat_df['col_name_count'] == 1,
        other=feat_df['feature_name'] + '..' + feat_df['col_name_count'].astype(str))
    feature_name_filepath = (data_dir + '/' +
                             project_settings['clean_input_files']['feature_names'])
    feat_df.sort_values('file_index', inplace=True)
    feat_df[['new_feature_name']].to_csv(feature_name_filepath, index=False,
                                         header=False, sep="\t")

    # Create validation set
    num_folds = project_settings['assessment']['cv_num_folds']
    perc = 1 / num_folds
    clean_input_files = project_settings['clean_input_files']
    X_train_val_filepath = data_dir + '/' + clean_input_files['X_train_val']
    y_train_val_filepath = data_dir + '/' + clean_input_files['y_train_val']
    X_train_val = pd.read_csv(X_train_val_filepath, sep=r"\s+", engine='python',
                              header=None)
    y_train_val = pd.read_csv(y_train_val_filepath, sep=r"\s+", engine='python',
                              header=None)
    s2_dfs = [pd.DataFrame(x) for x in
              train_test_split(X_train_val, y_train_val, test_size=perc)]
    s2_names = ["X_train", "X_val", "y_train", "y_val"]
    dd2 = dict(zip(s2_names, s2_dfs))
    for name in dd2:
        rel_file_path = clean_input_files[name]
        abs_file_path = data_dir + '/' + rel_file_path
        dd2[name].to_csv(abs_file_path, header=None, sep="\t", index=False)
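# Hedged toy illustration (not part of the pipeline) of the duplicate-name
# suffixing in main() above: the 2nd, 3rd, ... occurrence of a repeated
# feature name gets a '..k' suffix, while unique names pass through unchanged.
def _demo_dedupe_feature_names():
    demo_df = pd.DataFrame({'feature_name': ['width', 'height', 'width']})
    demo_df['col_name_count'] = demo_df.groupby('feature_name').cumcount() + 1
    demo_df['new_feature_name'] = demo_df['feature_name'].where(
        demo_df['col_name_count'] == 1,
        other=(demo_df['feature_name'] + '..' +
               demo_df['col_name_count'].astype(str)))
    return demo_df  # new_feature_name: ['width', 'height', 'width..2']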
def get_images(images, return_motion=False):
    if len(images[0].shape) == 0:
        # single session: `images` is a flat array of scan paths
        data_dir = find_data_dir(work_dir, str(images[0]))
        paths = [[os.path.join(data_dir, os.path.split(str(x))[1])
                  for x in images]]
        fname = glob.glob(os.path.join(data_dir, 'rp_*.txt'))[0]
        motion = [fname]
    else:
        # multiple sessions: one list of scan paths per session
        paths = []
        motion = []
        for session_scans in images:
            scans = []
            data_dir = find_data_dir(work_dir, str(session_scans[0]))
            for x in session_scans:
                scans.append(os.path.join(data_dir, os.path.split(str(x))[1]))
            paths.append(scans)
            fname = glob.glob(os.path.join(data_dir, 'rp_*.txt'))[0]
            motion.append(fname)
    if return_motion:
        return paths, motion
    return paths
def save_normalize(m0):
    # normalize bold
    normalize = m0
    doc['normalize']['bold'] = []
    for session_scans in np.split(
            normalize.resample, np.cumsum(doc['n_scans']))[:-1]:
        scans = []
        data_dir = find_data_dir(work_dir, str(session_scans[0]))
        for x in session_scans:
            scans.append(os.path.join(data_dir, os.path.split(str(x))[1]))
        doc['normalize']['bold'].append(scans)
def save_smooth(m0):
    # smooth
    smooth = m0
    doc['smooth'] = {}
    doc['smooth']['bold'] = []
    doc['smooth']['fwhm'] = float(smooth.fwhm)
    for session_scans in np.split(
            smooth.data, np.cumsum(doc['n_scans']))[:-1]:
        scans = []
        data_dir = find_data_dir(work_dir, str(session_scans[0]))
        for x in session_scans:
            scans.append(os.path.join(data_dir, os.path.split(str(x))[1]))
        doc['smooth']['bold'].append(scans)
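# Hedged toy illustration (not part of the pipeline) of the np.split pattern
# used by save_normalize and save_smooth: np.cumsum over per-session scan
# counts gives the split points, and the empty trailing chunk produced by
# splitting at the total length is dropped with [:-1]. load_intra below gets
# the same partition by slicing the cumsum instead: np.cumsum(n_scans)[:-1].
def _demo_split_sessions():
    n_scans = [3, 2]                    # two sessions of 3 and 2 scans
    flat = np.arange(sum(n_scans))      # stand-in for the flat scan list
    per_session = np.split(flat, np.cumsum(n_scans))[:-1]
    return per_session  # [array([0, 1, 2]), array([3, 4])]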
def get_path(path):
    data_dir = find_data_dir(work_dir, str(path))
    return os.path.join(data_dir, os.path.split(str(path))[1])
def load_intra(location, fix=None, **kwargs):
    doc = {}
    mat = load_matfile(location)['SPM']
    work_dir = os.path.split(os.path.realpath(location))[0]
    doc.update(_check_kwargs(work_dir, **kwargs))
    # doc['mat'] = mat
    doc['design_matrices'] = mat.xX.X.tolist()               # xX: model
    doc['design_conditions'] = [str(i) for i in mat.xX.name]
    doc['design_contrasts'] = {}
    doc['n_scans'] = mat.nscan.tolist() \
        if isinstance(mat.nscan.tolist(), list) else [mat.nscan.tolist()]
    doc['n_sessions'] = mat.nscan.size
    doc['tr'] = float(mat.xY.RT)                             # xY: data
    doc['mask'] = os.path.join(work_dir, str(mat.VM.fname))  # VM: mask
    doc['beta_maps'] = []
    doc['c_maps'] = {}
    doc['t_maps'] = {}
    doc['condition_key'] = []
    doc['task_contrasts'] = {}
    doc['onsets'] = []

    # split the flat list of scan paths into per-session lists
    swabold = np.split(mat.xY.P.tolist(), np.cumsum(doc['n_scans'])[:-1])
    doc['data'] = {}
    for session in swabold:
        session_dir = find_data_dir(work_dir, session[0])
        scans = [os.path.join(session_dir, os.path.split(s)[1].strip())
                 for s in session]
        doc['data'].setdefault('swabold', []).append(scans)

    # derive the less-processed image names by stripping SPM prefixes
    # (s = smoothed, w = normalized, a = slice-time corrected)
    for key, n_prefixes in [('wabold', 1), ('abold', 2), ('bold', 3)]:
        for s in doc['data']['swabold']:
            doc['data'].setdefault(key, []).append(
                [strip_prefix_filename(i, n_prefixes) for i in s])

    doc['motion'] = []
    if doc['n_sessions'] > 1:
        for session in mat.Sess:
            doc['motion'].append(session.C.C.tolist())
    else:
        doc['motion'].append(mat.Sess.C.C.tolist())

    def get_condition_onsets(condition):
        onset_time = condition.ons.tolist()
        onset_duration = condition.dur.tolist()
        if not isinstance(onset_time, list):
            onset_time = [onset_time]
            onset_duration = [onset_duration]
        onset_weight = [1] * len(onset_time)
        return list(zip(onset_time, onset_duration, onset_weight))

    # mat.Sess is a scalar struct for single-session designs
    sessions = mat.Sess if hasattr(mat.Sess, '__iter__') else [mat.Sess]
    for session in sessions:
        onsets = {}
        condition_key = []
        for condition_id, condition in enumerate(session.U):
            k = 'cond%03i' % (condition_id + 1)
            onsets[k] = get_condition_onsets(condition)
            condition_key.append(str(condition.name))
        doc['condition_key'].append(condition_key)
        doc['onsets'].append(onsets)

    for c in mat.xCon:
        name = str(c.name)
        try:
            doc['c_maps'][name] = os.path.join(work_dir, str(c.Vcon.fname))
            doc['t_maps'][name] = os.path.join(work_dir, str(c.Vspm.fname))
            doc['design_contrasts'][name] = c.c.tolist()
        except Exception:
            # sometimes c.Vcon is an empty array
            pass

    for b in mat.Vbeta:
        doc['beta_maps'].append(os.path.join(work_dir, str(b.fname)))

    if 'subject_id' not in doc:
        doc['subject_id'] = hashlib.md5(work_dir.encode('utf-8')).hexdigest()

    def get_condition_index(name):
        for i, full_name in enumerate(doc['design_conditions']):
            if name in full_name:
                return i

    # find the indices of the actual experimental conditions in the
    # design matrix, not the additional regressors...
    ii = []
    for session in doc['condition_key']:
        ii.append([get_condition_index(name) for name in session])

    # redefine the contrasts with the experimental conditions
    for k, contrast in doc['design_contrasts'].items():
        doc['task_contrasts'][k] = []
        for per_session in ii:
            doc['task_contrasts'][k].append(
                np.array(contrast)[per_session].tolist())

    # attempt to guess condition names with the contrast names & values
    # (unicode dtype so the names survive under Python 3)
    condition_key = [np.array(ck, dtype='U32') for ck in doc['condition_key']]
    for contrast_name, session_contrasts in doc['task_contrasts'].items():
        for ck, contrast in zip(condition_key, session_contrasts):
            contrast = np.array(contrast)
            if ((contrast < 0).sum() == 0
                    and len(contrast.shape) == 1
                    and (contrast == np.abs(contrast).max()).sum() == 1):
                ck[contrast > 0] = contrast_name
    doc['condition_key'] = condition_key

    # reformat SPM design per session
    (doc['design_matrices'],
     doc['design_conditions'],
     doc['design_contrasts']) = make_design_from_spm(
        doc['n_scans'], doc['design_matrices'],
        doc['design_conditions'], doc['design_contrasts'])

    if fix is not None:
        doc = fix_experiment(doc, fix)[0]
    return doc
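# Hedged, self-contained illustration (toy values only) of the condition-name
# guess in load_intra: a contrast qualifies when it is 1-D, has no negative
# weights, and exactly one entry attains the maximum absolute weight; that
# entry then inherits the contrast's name. 'faces_vs_baseline' is made up.
def _demo_guess_condition_name():
    ck = np.array(['cond001', 'cond002', 'cond003'], dtype='U32')
    contrast = np.array([0.0, 1.0, 0.0])
    if ((contrast < 0).sum() == 0 and contrast.ndim == 1
            and (contrast == np.abs(contrast).max()).sum() == 1):
        ck[contrast > 0] = 'faces_vs_baseline'
    return ck  # ['cond001', 'faces_vs_baseline', 'cond003']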
def main():
    data_dir = find_data_dir(project_settings)
    processed_dir = data_dir + '/processed'
    if not os.path.isdir(processed_dir):
        os.makedirs(processed_dir)
    raw_files = project_settings['raw_input_files']
    train_filepath = data_dir + '/' + raw_files['train']
    test_filepath = data_dir + '/' + raw_files['test']
    #for filepath in [train_filepath, test_filepath]: TODO: Figure this out
    df = pd.read_csv(train_filepath, sep=",", header=None, skiprows=1,
                     keep_default_na=False)
    #df = pd.read_csv(train_filepath, sep=",", keep_default_na=False)
    #df = df.drop(64, axis=1)  This is GarageQual. Highly correlated with
    #GarageCond. Maybe that is why I originally dropped it, considering the
    #comment below, which I wrote a long time before writing this.
    df.columns = range(df.shape[1])
    #df = df.drop([0, 38, 46], axis=1)  #Drop lincombo and id column
    #df.columns = range(df.shape[1])  #possible duplicate?
    X = df.iloc[:, range(df.shape[1] - 1)]
    y = df.iloc[:, df.shape[1] - 1]
    #y = pd.Series(np.repeat(1, len(y_str))).where(y_str == 'yes', other=0)
    data_dict = OrderedDict([
        ('id', 'numeric'),
        ('MSSubClass', ['20', '30', '40', '45', '50', '60', '70', '75', '80',
                        '85', '90', '120', '160', '180', '190', 'NA']),  #Got rid of 150
        ('MSZoning', ['C (all)', 'FV', 'RH', 'RL', 'RM', 'NA']),  #Got rid of RP,I,C,A
        ('LotFrontage', 'numeric'),
        ('LotArea', 'numeric'),
        ('Street', ['Grvl', 'Pave']),
        ('Alley', ['Grvl', 'Pave', 'NA']),
        ('LotShape', ['Reg', 'IR1', 'IR2', 'IR3']),
        ('LandContour', ['Lvl', 'Bnk', 'HLS', 'Low']),
        ('Utilities', ['AllPub', 'NoSeWa']),  #Dropped NoSewr,ELO
        ('LotConfig', ['Inside', 'Corner', 'CulDSac', 'FR2', 'FR3']),
        ('LandSlope', ['Gtl', 'Mod', 'Sev']),
        ('Neighborhood', ['Blmngtn', 'Blueste', 'BrDale', 'BrkSide', 'ClearCr',
                          'CollgCr', 'Crawfor', 'Edwards', 'Gilbert', 'IDOTRR',
                          'MeadowV', 'Mitchel', 'NAmes', 'NoRidge', 'NPkVill',
                          'NridgHt', 'NWAmes', 'OldTown', 'SWISU', 'Sawyer',
                          'SawyerW', 'Somerst', 'StoneBr', 'Timber', 'Veenker']),
        ('Condition1', ['Artery', 'Feedr', 'Norm', 'RRNn', 'RRAn', 'PosN',
                        'PosA', 'RRNe', 'RRAe']),
        ('Condition2', ['Artery', 'Feedr', 'Norm', 'RRNn', 'RRAn', 'PosA',
                        'PosN', 'RRAe']),  #dropped RRNe
        ('BldgType', ['1Fam', '2fmCon', 'Duplex', 'TwnhsE', 'Twnhs']),  #Does not match data description file
        ('HouseStyle', ['1Story', '1.5Fin', '1.5Unf', '2Story', '2.5Fin',
                        '2.5Unf', 'SFoyer', 'SLvl']),
        ('OverallQual', 'numeric'),
        ('OverallCond', 'numeric'),
        ('YearBuilt', 'numeric'),
        ('YearRemodAdd', 'numeric'),
        ('RoofStyle', ['Flat', 'Gable', 'Gambrel', 'Hip', 'Mansard', 'Shed']),
        ('RoofMatl', ['ClyTile', 'CompShg', 'Membran', 'Metal', 'Roll',
                      'Tar&Grv', 'WdShake', 'WdShngl']),
        ('Exterior1st', ['AsbShng', 'AsphShn', 'BrkComm', 'BrkFace', 'CBlock',
                         'CemntBd', 'HdBoard', 'ImStucc', 'MetalSd', 'Plywood',
                         'Stone', 'Stucco', 'VinylSd', 'Wd Sdng', 'WdShing',
                         'NA']),  #Dropped Other,PreCast
        ('Exterior2nd', ['AsbShng', 'AsphShn', 'Brk Cmn', 'BrkFace', 'CBlock',
                         'CmentBd', 'HdBoard', 'ImStucc', 'MetalSd', 'Other',
                         'Plywood',  #Dropped PreCast
                         'Stone', 'Stucco', 'VinylSd', 'Wd Sdng', 'Wd Shng',
                         'NA']),  #Does not match data desc file
        ('MasVnrType', ['BrkCmn', 'BrkFace', 'None', 'Stone', 'NA']),  #Does not match data desc file (NA) #Dropped CBlock
        ('MasVnrArea', 'numeric'),
        ('ExterQual', ['Ex', 'Gd', 'TA', 'Fa']),  #TODO: Make this numeric? #Dropped Po
        ('ExterCond', ['Ex', 'Gd', 'TA', 'Fa', 'Po']),
        ('Foundation', ['BrkTil', 'CBlock', 'PConc', 'Slab', 'Stone', 'Wood']),
        ('BsmtQual', ['Ex', 'Gd', 'TA', 'Fa', 'NA']),  #Dropped Po
        ('BsmtCond', ['Gd', 'TA', 'Fa', 'Po', 'NA']),  #Dropped Ex
        ('BsmtExposure', ['Gd', 'Av', 'Mn', 'No', 'NA']),
        ('BsmtFinType1', ['GLQ', 'ALQ', 'BLQ', 'Rec', 'LwQ', 'Unf', 'NA']),
        ('BsmtFinSF1', 'numeric'),
        ('BsmtFinType2', ['GLQ', 'ALQ', 'BLQ', 'Rec', 'LwQ', 'Unf', 'NA']),
        ('BsmtFinSF2', 'numeric'),
        ('BsmtUnfSF', 'numeric'),
        ('TotalBsmtSF', 'numeric'),
        ('Heating', ['Floor', 'GasA', 'GasW', 'Grav', 'OthW', 'Wall']),
        ('HeatingQC', ['Ex', 'Gd', 'TA', 'Fa', 'Po']),
        ('CentralAir', ['N', 'Y']),
        ('Electrical', ['SBrkr', 'FuseA', 'FuseF', 'FuseP', 'Mix', 'NA']),  #Does not match data desc file (NA)
        ('1stFlrSF', 'numeric'),
        ('2ndFlrSF', 'numeric'),
        ('LowQualFinSF', 'numeric'),
        ('GrLivArea', 'numeric'),
        ('BsmtFullBath', 'numeric'),  #TODO: should these be cat?
        ('BsmtHalfBath', 'numeric'),
        ('FullBath', 'numeric'),
        ('HalfBath', 'numeric'),
        ('Bedroom', 'numeric'),
        ('Kitchen', 'numeric'),
        ('KitchenQual', ['Ex', 'Gd', 'TA', 'Fa', 'NA']),  #Dropped Po
        ('TotRmsAbvGrd', 'numeric'),
        ('Functional', ['Typ', 'Min1', 'Min2', 'Mod', 'Maj1', 'Maj2', 'Sev',
                        'NA']),  #TODO: make numeric? #dropped Sal
        ('Fireplaces', 'numeric'),
        ('FireplaceQu', ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'NA']),
        ('GarageType', ['2Types', 'Attchd', 'Basment', 'BuiltIn', 'CarPort',
                        'Detchd', 'NA']),  #Does not match data desc file
        ('GarageYrBlt', 'numeric'),
        ('GarageFinish', ['Fin', 'RFn', 'Unf', 'NA']),
        ('GarageCars', 'numeric'),
        ('GarageArea', 'numeric'),
        ('GarageQual', ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'NA']),
        ('GarageCond', ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'NA']),
        ('PavedDrive', ['Y', 'P', 'N']),
        ('WoodDeckSF', 'numeric'),
        ('OpenPorchSF', 'numeric'),
        ('EnclosedPorch', 'numeric'),
        ('3SsnPorch', 'numeric'),
        ('ScreenPorch', 'numeric'),
        ('PoolArea', 'numeric'),
        ('PoolQC', ['Ex', 'Gd', 'Fa', 'NA']),  #Dropped TA
        ('Fence', ['GdPrv', 'MnPrv', 'GdWo', 'MnWw', 'NA']),
        ('MiscFeature', ['Gar2', 'Othr', 'Shed', 'TenC', 'NA']),  #dropped Elev
        ('MiscVal', 'numeric'),
        ('MoSold', ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11',
                    '12']),  #TODO: numeric?
        ('YrSold', 'numeric'),
        ('SaleType', ['WD', 'CWD', 'New', 'COD', 'Con', 'ConLw', 'ConLI',
                      'ConLD', 'Oth', 'NA']),  #dropped VWD
        ('SaleCondition', ['Normal', 'Abnorml', 'AdjLand', 'Alloca', 'Family',
                           'Partial']),
        #('SalePrice', 'numeric')
    ])
    X.columns = list(data_dict.keys())
    #Bad formatting in source file:
    #X.loc[:, 'BldgType'] = X.loc[:, 'BldgType'].copy().where(X.loc[:, 'BldgType'] != '2fmCon', other='2FmCon')
    #X.loc[:, 'BldgType'] = X.loc[:, 'BldgType'].copy().where(X.loc[:, 'BldgType'] != 'Duplex', other='Duplx')

    # map each categorical level to an integer code, keeping the reverse map
    col_idx = 0
    meta_int_val_map = OrderedDict()
    for col in data_dict.keys():
        col_vals = data_dict[col]
        if col_vals != 'numeric':
            val_int_map = dict([(col_vals[i], i) for i in range(len(col_vals))])
            try:
                X.iloc[:, col_idx] = X.iloc[:, col_idx].astype(str).apply(
                    lambda x: val_int_map[x]).copy()
            except KeyError as e:
                raise ValueError("unexpected level %s in column %s" % (e, col))
            meta_int_val_map[col] = {v: k for k, v in val_int_map.items()}
        else:
            meta_int_val_map[col] = 'numeric'
        col_idx += 1

    # one-hot encode the integer-coded categoricals, tracking feature names
    Xe = pd.DataFrame()
    col_idx = 0
    enc = OneHotEncoder(sparse=False)
    feature_names = list()
    for col in meta_int_val_map:
        if meta_int_val_map[col] == 'numeric':
            X.loc[:, col] = X.loc[:, col].copy().apply(
                lambda x: x if x != 'NA' else np.nan)
            Xe.loc[:, col_idx] = X.loc[:, col].astype(float)
            #Xe.loc[:, col_idx] = Xe.loc[:, col_idx].copy().fillna(Xe.loc[:, col_idx].mean())  #TODO: Make this a transformer
            feature_names.append(col)
            col_idx += 1
        else:
            input_array = np.array(list(X.loc[:, col])).reshape(-1, 1)
            Xt = pd.DataFrame(enc.fit_transform(input_array))
            int_val_map = meta_int_val_map[col]
            # the indicator column for an 'NA' level becomes NaN where it fires
            for code in int_val_map:
                if int_val_map[code] == 'NA':
                    try:
                        Xt.iloc[:, code] = Xt.iloc[:, code].apply(
                            lambda x: np.nan if x == 1.0 else x)
                    except IndexError:
                        # 'NA' level never observed; add an all-zero column
                        Xt.loc[:, code] = 0.0
            Xt_w = Xt.shape[1]
            Xt.columns = [i + col_idx for i in range(Xt_w)]
            if Xe.shape[1] == 0:
                Xe = Xt
            else:
                Xe = pd.merge(Xe, Xt, left_index=True, right_index=True)
            assert len(int_val_map) == Xt_w
            for i in range(len(int_val_map)):
                dim_val = int_val_map[i]
                feature_name = col + '_' + dim_val
                feature_names.append(feature_name)
                col_idx += 1
    assert col_idx == Xe.shape[1]
    Xe.columns = range(col_idx)
    feature_names = [x.replace(' ', '').replace('(', '').replace(')', '')
                     for x in feature_names]

    clean_input_files = project_settings['clean_input_files']
    feature_names_rel_filepath = clean_input_files['feature_names']
    feature_names_abs_filepath = data_dir + '/' + feature_names_rel_filepath
    pd.Series(feature_names).to_csv(feature_names_abs_filepath, index=False,
                                    header=None, sep="\t")

    split_perc = project_settings['train_test_split']
    if split_perc != 1:
        s1_dfs = [pd.DataFrame(x) for x in
                  train_test_split(Xe, y, test_size=split_perc, random_state=42)]
        s1_names = ["X_test", "X_train_val", "y_test", "y_train_val"]
    else:
        s1_dfs = [Xe, y]
        s1_names = ["X_train_val", "y_train_val"]
    dd1 = dict(zip(s1_names, s1_dfs))
    for name in dd1:
        rel_file_path = clean_input_files[name]
        abs_file_path = data_dir + '/' + rel_file_path
        dd1[name].to_csv(abs_file_path, header=None, sep="\t", index=False)

    X_train_val, y_train_val = dd1['X_train_val'], dd1['y_train_val']
    num_folds = project_settings['assessment']['cv_num_folds']
    perc = 1 / num_folds
    s2_dfs = [pd.DataFrame(x) for x in
              train_test_split(X_train_val, y_train_val, test_size=perc)]
    s2_names = ["X_train", "X_val", "y_train", "y_val"]
    dd2 = dict(zip(s2_names, s2_dfs))
    for name in dd2:
        rel_file_path = clean_input_files[name]
        abs_file_path = data_dir + '/' + rel_file_path
        dd2[name].to_csv(abs_file_path, header=None, sep="\t", index=False)
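# Hedged, toy-scale sketch (not part of the pipeline) of the encoding loop the
# main() functions share: levels from the data dictionary are mapped to
# integer codes, expanded with OneHotEncoder, and feature_names records one
# '<column>_<level>' name per output dimension. 'Street' and its levels come
# from the data dictionary above; the observations are made up.
def _demo_one_hot_with_names():
    levels = ['Grvl', 'Pave']
    val_int_map = {v: i for i, v in enumerate(levels)}
    codes = pd.Series(['Pave', 'Grvl', 'Pave']).map(val_int_map)
    enc = OneHotEncoder(sparse=False)   # sparse_output=False on sklearn >= 1.2
    Xt = pd.DataFrame(enc.fit_transform(np.array(codes).reshape(-1, 1)))
    feature_names = ['Street_' + v for v in levels]
    return Xt, feature_names  # one 0/1 column per level, name-aligned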
def main():
    data_dir = find_data_dir(project_settings)
    processed_dir = data_dir + '/processed'
    if not os.path.isdir(processed_dir):
        os.makedirs(processed_dir)
    raw_files = project_settings['raw_input_files']
    X_filepath = data_dir + '/' + raw_files['X']
    df = pd.read_csv(X_filepath, sep=";")
    X = df.iloc[:, range(16)]
    y_str = df.iloc[:, 16]
    y = pd.Series(np.repeat(1, len(y_str))).where(y_str == 'yes', other=0)
    data_dict = OrderedDict([
        ('age', 'numeric'),
        ('job', ['blue-collar', 'management', 'technician', 'admin.',
                 'services', 'retired', 'self-employed', 'entrepreneur',
                 'unemployed', 'housemaid', 'student', 'unknown']),
        ('marital', ['married', 'single', 'divorced']),
        ('education', ['primary', 'secondary', 'tertiary', 'unknown']),
        ('default', ['no', 'yes']),
        ('balance', 'numeric'),
        ('housing', ['no', 'yes']),
        ('loan', ['no', 'yes']),
        ('contact', ['cellular', 'unknown', 'telephone']),
        ('day', 'numeric'),
        ('month', ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug',
                   'sep', 'oct', 'nov', 'dec']),
        ('duration', 'numeric'),
        ('campaign', 'numeric'),
        ('pdays', 'numeric'),
        ('previous', 'numeric'),
        ('poutcome', ['success', 'failure', 'other', 'unknown'])
    ])

    # map each categorical level to an integer code, keeping the reverse map
    col_idx = 0
    meta_int_val_map = OrderedDict()
    for col in data_dict.keys():
        col_vals = data_dict[col]
        if col_vals != 'numeric':
            val_int_map = dict([(col_vals[i], i) for i in range(len(col_vals))])
            X.iloc[:, col_idx] = X.iloc[:, col_idx].apply(
                lambda x: val_int_map[x]).copy()  #TODO: fix the annoying pandas warning once and for all
            meta_int_val_map[col] = {v: k for k, v in val_int_map.items()}
        else:
            meta_int_val_map[col] = 'numeric'
        col_idx += 1

    # one-hot encode the integer-coded categoricals, tracking feature names
    Xe = pd.DataFrame()
    col_idx = 0
    enc = OneHotEncoder(sparse=False)
    feature_names = list()
    for col in meta_int_val_map:
        if meta_int_val_map[col] == 'numeric':
            Xe.loc[:, col] = X.loc[:, col]
            feature_names.append(col)
            col_idx += 1
        else:
            input_array = np.array(list(X.loc[:, col])).reshape(-1, 1)
            Xt = pd.DataFrame(enc.fit_transform(input_array))
            Xt_w = Xt.shape[1]
            Xt.columns = [i + col_idx for i in range(Xt_w)]
            Xe = pd.merge(Xe, Xt, left_index=True, right_index=True)
            int_val_map = meta_int_val_map[col]
            for i in range(len(int_val_map)):
                dim_val = int_val_map[i]
                feature_name = col + '_' + dim_val
                feature_names.append(feature_name)
                col_idx += 1
    assert col_idx == Xe.shape[1]
    Xe.columns = range(col_idx)

    clean_input_files = project_settings['clean_input_files']
    feature_names_rel_filepath = clean_input_files['feature_names']
    feature_names_abs_filepath = data_dir + '/' + feature_names_rel_filepath
    pd.Series(feature_names).to_csv(feature_names_abs_filepath, index=False,
                                    header=None, sep="\t")

    # NB: train_test_split returns (train, test, ...), so with these names the
    # test_size fraction is what ends up in the train_val files.
    split_perc = project_settings['train_test_split']
    s1_dfs = [pd.DataFrame(x) for x in
              train_test_split(Xe, y, test_size=split_perc, random_state=42)]
    s1_names = ["X_test", "X_train_val", "y_test", "y_train_val"]
    dd1 = dict(zip(s1_names, s1_dfs))
    for name in dd1:
        rel_file_path = clean_input_files[name]
        abs_file_path = data_dir + '/' + rel_file_path
        dd1[name].to_csv(abs_file_path, header=None, sep="\t", index=False)

    X_train_val, y_train_val = dd1['X_train_val'], dd1['y_train_val']
    num_folds = project_settings['assessment']['cv_num_folds']
    perc = 1 / num_folds
    s2_dfs = [pd.DataFrame(x) for x in
              train_test_split(X_train_val, y_train_val, test_size=perc)]
    s2_names = ["X_train", "X_val", "y_train", "y_val"]
    dd2 = dict(zip(s2_names, s2_dfs))
    for name in dd2:
        rel_file_path = clean_input_files[name]
        abs_file_path = data_dir + '/' + rel_file_path
        dd2[name].to_csv(abs_file_path, header=None, sep="\t", index=False)
def main():
    data_dir = find_data_dir(project_settings)
    processed_dir = data_dir + '/processed'
    if not os.path.isdir(processed_dir):
        os.makedirs(processed_dir)
    raw_files = project_settings['raw_input_files']
    X_filepath = data_dir + '/' + raw_files['X']
    df = pd.read_csv(X_filepath, sep=";", header=None, skiprows=1)
    X = df.iloc[:, range(32)]
    y = df.iloc[:, 32]
    #y = pd.Series(np.repeat(1, len(y_str))).where(y_str == 'yes', other=0)
    data_dict = OrderedDict([
        ('school', ['GP', 'MS']),
        ('sex', ['F', 'M']),
        ('age', 'numeric'),
        ('address', ['U', 'R']),
        ('famsize', ['LE3', 'GT3']),
        ('pstatus', ['T', 'A']),
        ('medu', ['0', '1', '2', '3', '4']),
        ('fedu', ['0', '1', '2', '3', '4']),
        ('mjob', ['teacher', 'health', 'services', 'at_home', 'other']),
        ('fjob', ['teacher', 'health', 'services', 'at_home', 'other']),
        ('reason', ['home', 'reputation', 'course', 'other']),
        ('guardian', ['mother', 'father', 'other']),
        ('traveltime', ['1', '2', '3', '4']),
        ('studytime', ['1', '2', '3', '4']),
        ('failures', ['0', '1', '2', '3']),
        ('schoolsup', ['yes', 'no']),
        ('famsup', ['yes', 'no']),
        ('paid', ['yes', 'no']),
        ('activities', ['yes', 'no']),
        ('nursery', ['yes', 'no']),
        ('higher', ['yes', 'no']),
        ('internet', ['yes', 'no']),
        ('romantic', ['yes', 'no']),
        ('famrel', 'numeric'),  #make cat?
        ('freetime', 'numeric'),
        ('goout', 'numeric'),
        ('dalc', 'numeric'),
        ('walc', 'numeric'),
        ('health', 'numeric'),
        ('absences', 'numeric'),
        ('g1', 'numeric'),
        ('g2', 'numeric'),
    ])
    X.columns = list(data_dict.keys())

    # map each categorical level to an integer code, keeping the reverse map
    col_idx = 0
    meta_int_val_map = OrderedDict()
    for col in data_dict.keys():
        col_vals = data_dict[col]
        if col_vals != 'numeric':
            val_int_map = dict([(col_vals[i], i) for i in range(len(col_vals))])
            X.iloc[:, col_idx] = X.iloc[:, col_idx].astype(str).apply(
                lambda x: val_int_map[x]).copy()
            meta_int_val_map[col] = {v: k for k, v in val_int_map.items()}
        else:
            meta_int_val_map[col] = 'numeric'
        col_idx += 1

    # one-hot encode the integer-coded categoricals, tracking feature names
    Xe = pd.DataFrame()
    col_idx = 0
    enc = OneHotEncoder(sparse=False)
    feature_names = list()
    for col in meta_int_val_map:
        if meta_int_val_map[col] == 'numeric':
            Xe.loc[:, col_idx] = X.loc[:, col]  #changed
            feature_names.append(col)
            col_idx += 1
        else:
            input_array = np.array(list(X.loc[:, col])).reshape(-1, 1)
            Xt = pd.DataFrame(enc.fit_transform(input_array))
            Xt_w = Xt.shape[1]
            Xt.columns = [i + col_idx for i in range(Xt_w)]
            if Xe.shape[1] == 0:
                Xe = Xt
            else:
                Xe = pd.merge(Xe, Xt, left_index=True, right_index=True)
            int_val_map = meta_int_val_map[col]
            assert len(int_val_map) == Xt_w
            for i in range(len(int_val_map)):
                dim_val = int_val_map[i]
                feature_name = col + '_' + dim_val
                feature_names.append(feature_name)
                col_idx += 1
    assert col_idx == Xe.shape[1]
    Xe.columns = range(col_idx)

    clean_input_files = project_settings['clean_input_files']
    feature_names_rel_filepath = clean_input_files['feature_names']
    feature_names_abs_filepath = data_dir + '/' + feature_names_rel_filepath
    pd.Series(feature_names).to_csv(feature_names_abs_filepath, index=False,
                                    header=None, sep="\t")

    split_perc = project_settings['train_test_split']
    if split_perc != 1:
        s1_dfs = [pd.DataFrame(x) for x in
                  train_test_split(Xe, y, test_size=split_perc, random_state=42)]
        s1_names = ["X_test", "X_train_val", "y_test", "y_train_val"]
    else:
        s1_dfs = [Xe, y]
        s1_names = ["X_train_val", "y_train_val"]
    dd1 = dict(zip(s1_names, s1_dfs))
    for name in dd1:
        rel_file_path = clean_input_files[name]
        abs_file_path = data_dir + '/' + rel_file_path
        dd1[name].to_csv(abs_file_path, header=None, sep="\t", index=False)

    X_train_val, y_train_val = dd1['X_train_val'], dd1['y_train_val']
    num_folds = project_settings['assessment']['cv_num_folds']
    perc = 1 / num_folds
    s2_dfs = [pd.DataFrame(x) for x in
              train_test_split(X_train_val, y_train_val, test_size=perc)]
    s2_names = ["X_train", "X_val", "y_train", "y_val"]
    dd2 = dict(zip(s2_names, s2_dfs))
    for name in dd2:
        rel_file_path = clean_input_files[name]
        abs_file_path = data_dir + '/' + rel_file_path
        dd2[name].to_csv(abs_file_path, header=None, sep="\t", index=False)
def main():
    data_dir = find_data_dir(project_settings)
    processed_dir = data_dir + '/processed'
    if not os.path.isdir(processed_dir):
        os.makedirs(processed_dir)
    raw_files = project_settings['raw_input_files']
    X_filepath = data_dir + '/' + raw_files['X']
    df = pd.read_csv(X_filepath, sep=",", skiprows=1)
    X = df.iloc[:, range(24)]
    y = df.iloc[:, 24]
    #y = pd.Series(np.repeat(1, len(y_str))).where(y_str == 'yes', other=0)
    data_dict = OrderedDict([
        ('id', 'numeric'),
        ('amt_given_credit', 'numeric'),
        ('gender', ['1', '2']),
        ('education', ['0', '1', '2', '3', '4', '5', '6']),
        ('marital', ['0', '1', '2', '3']),
        ('age', 'numeric'),
        ('hist_apr', 'numeric'),
        ('hist_may', 'numeric'),
        ('hist_jun', 'numeric'),
        ('hist_jul', 'numeric'),
        ('hist_aug', 'numeric'),
        ('hist_sep', 'numeric'),
        ('bill_sep', 'numeric'),
        ('bill_aug', 'numeric'),
        ('bill_jul', 'numeric'),
        ('bill_jun', 'numeric'),
        ('bill_may', 'numeric'),
        ('bill_apr', 'numeric'),
        ('prepay_sep', 'numeric'),
        ('prepay_aug', 'numeric'),
        ('prepay_jul', 'numeric'),
        ('prepay_jun', 'numeric'),
        ('prepay_may', 'numeric'),
        ('prepay_apr', 'numeric')
    ])
    X.columns = list(data_dict.keys())

    # map each categorical level to an integer code, keeping the reverse map
    col_idx = 0
    meta_int_val_map = OrderedDict()
    for col in data_dict.keys():
        col_vals = data_dict[col]
        if col_vals != 'numeric':
            val_int_map = dict([(col_vals[i], i) for i in range(len(col_vals))])
            X.iloc[:, col_idx] = X.iloc[:, col_idx].astype(str).apply(
                lambda x: val_int_map[x]).copy()
            meta_int_val_map[col] = {v: k for k, v in val_int_map.items()}
        else:
            meta_int_val_map[col] = 'numeric'
        col_idx += 1

    # one-hot encode the integer-coded categoricals, tracking feature names
    Xe = pd.DataFrame()
    col_idx = 0
    enc = OneHotEncoder(sparse=False)
    feature_names = list()
    for col in meta_int_val_map:
        if meta_int_val_map[col] == 'numeric':
            Xe.loc[:, col] = X.loc[:, col]
            feature_names.append(col)
            col_idx += 1
        else:
            input_array = np.array(list(X.loc[:, col])).reshape(-1, 1)
            Xt = pd.DataFrame(enc.fit_transform(input_array))
            Xt_w = Xt.shape[1]
            Xt.columns = [i + col_idx for i in range(Xt_w)]
            Xe = pd.merge(Xe, Xt, left_index=True, right_index=True)
            int_val_map = meta_int_val_map[col]
            for i in range(len(int_val_map)):
                dim_val = int_val_map[i]
                feature_name = col + '_' + dim_val
                feature_names.append(feature_name)
                col_idx += 1
    assert col_idx == Xe.shape[1]
    Xe.columns = range(col_idx)

    clean_input_files = project_settings['clean_input_files']
    feature_names_rel_filepath = clean_input_files['feature_names']
    feature_names_abs_filepath = data_dir + '/' + feature_names_rel_filepath
    pd.Series(feature_names).to_csv(feature_names_abs_filepath, index=False,
                                    header=None, sep="\t")

    split_perc = project_settings['train_test_split']
    s1_dfs = [pd.DataFrame(x) for x in
              train_test_split(Xe, y, test_size=split_perc, random_state=42)]
    s1_names = ["X_test", "X_train_val", "y_test", "y_train_val"]
    dd1 = dict(zip(s1_names, s1_dfs))
    for name in dd1:
        rel_file_path = clean_input_files[name]
        abs_file_path = data_dir + '/' + rel_file_path
        dd1[name].to_csv(abs_file_path, header=None, sep="\t", index=False)

    X_train_val, y_train_val = dd1['X_train_val'], dd1['y_train_val']
    num_folds = project_settings['assessment']['cv_num_folds']
    perc = 1 / num_folds
    s2_dfs = [pd.DataFrame(x) for x in
              train_test_split(X_train_val, y_train_val, test_size=perc)]
    s2_names = ["X_train", "X_val", "y_train", "y_val"]
    dd2 = dict(zip(s2_names, s2_dfs))
    for name in dd2:
        rel_file_path = clean_input_files[name]
        abs_file_path = data_dir + '/' + rel_file_path
        dd2[name].to_csv(abs_file_path, header=None, sep="\t", index=False)
def main():
    data_dir = find_data_dir(project_settings)
    raw_files = project_settings['raw_input_files']
    X_filepath = data_dir + '/' + raw_files['X']
    df = pd.read_csv(X_filepath, header=None)
    X = df.iloc[:, range(6)]
    y = df.iloc[:, 6]
    data_dict = OrderedDict([
        ('buying', ['vhigh', 'high', 'med', 'low']),
        ('maint', ['vhigh', 'high', 'med', 'low']),
        ('doors', ['2', '3', '4', '5more']),
        ('persons', ['2', '4', 'more']),
        ('lug_boot', ['small', 'med', 'big']),
        ('safety', ['low', 'med', 'high']),
        #('class', ['unacc', 'acc', 'good', 'vgood'])
    ])

    # map each categorical level to an integer code, keeping the reverse map
    col_idx = 0
    meta_int_val_map = OrderedDict()
    for col in data_dict.keys():
        col_vals = data_dict[col]
        val_int_map = dict([(col_vals[i], i) for i in range(len(col_vals))])
        X.iloc[:, col_idx] = X.iloc[:, col_idx].apply(
            lambda x: val_int_map[x]).copy()  #TODO: fix the annoying pandas warning once and for all
        meta_int_val_map[col] = {v: k for k, v in val_int_map.items()}
        col_idx += 1

    # every column is categorical, so the whole frame is encoded at once
    enc = OneHotEncoder(sparse=False)
    Xe = pd.DataFrame(enc.fit_transform(X))
    feature_names = list()
    for col in meta_int_val_map:
        int_val_map = meta_int_val_map[col]
        for i in range(len(int_val_map)):
            dim_val = int_val_map[i]
            feature_name = col + '_' + dim_val
            feature_names.append(feature_name)
    assert len(feature_names) == Xe.shape[1]

    clean_input_files = project_settings['clean_input_files']
    feature_names_rel_filepath = clean_input_files['feature_names']
    feature_names_abs_filepath = data_dir + '/' + feature_names_rel_filepath
    pd.Series(feature_names).to_csv(feature_names_abs_filepath, index=False,
                                    header=None, sep="\t")

    # NB: train_test_split returns (train, test, ...), so with this name order
    # the test_size fraction is what ends up in the X_train/y_train files.
    split_perc = project_settings['test_train_split']
    numpy_arrays = train_test_split(Xe, y, test_size=split_perc, random_state=42)
    i = 0
    for dataset_name in ["X_test", "X_train", "y_test", "y_train"]:
        array_i = numpy_arrays[i]
        df_i = pd.DataFrame(array_i)
        rel_file_path = clean_input_files[dataset_name]
        abs_file_path = data_dir + '/' + rel_file_path
        df_i.to_csv(abs_file_path, header=None, sep="\t", index=False)
        i += 1
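# Hedged toy illustration (not part of the pipeline): with every column
# categorical, as in this main(), the integer-coded frame can be one-hot
# encoded in a single fit_transform; the categories seen per column determine
# the number of indicator columns. Values are made up.
def _demo_encode_whole_frame():
    coded = pd.DataFrame({'buying': [0, 3], 'safety': [0, 2]})
    enc = OneHotEncoder(sparse=False)
    Xe_demo = pd.DataFrame(enc.fit_transform(coded))
    return Xe_demo  # 2 buying levels + 2 safety levels seen -> 4 columns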