# NOTE(review): this fragment reached us flattened onto one physical line; the
# line breaks and indentation below are reconstructed from the statement syntax.
# Confirm the nesting against the original script. The fragment is also
# truncated mid-statement (it ends on an open dict literal, `dset_prop = {`).
#
# Purpose (as far as visible): drop already-incomplete feature columns, inject
# synthetic missingness into the remaining data, and write both the corrupted
# and the complete versions to CSV so an imputer can later be scored (rmse).
for el in features:
    # drop columns with missing data as we cannot then calculate the rmse
    if sum(dset['df'][el].isnull()):
        features_drop.append(el)
if len(features_drop):
    logger.info('dropping features {}'.format(features_drop))
    time.sleep(2)  # presumably so the warning is visible before work continues — TODO confirm
features = [el for el in features if el not in features_drop]
# NOTE(review): np.float is a deprecated alias (removed in NumPy 1.24);
# should become float or np.float64.
data = dset['df'][features].values.astype(np.float)
if is_normalize_0_1:
    data = normalize_array(data)
# Inject missing entries at rate p_miss (with the given uniformity/correlation
# options — semantics of uniform_miss/miss_corr live in utilmlab, not visible here).
X_missing = utilmlab.introduce_missing(data, p_miss, uniform_miss, miss_corr)
df_missing = pd.DataFrame(X_missing, columns=features)
df_missing = pd.DataFrame(X_missing, columns=features)  # NOTE(review): exact duplicate of the previous line — likely redundant
if islabel:
    # carry the label column(s) through untouched (no missingness injected there)
    df_missing[labels] = df[labels]
df_missing.to_csv(
    fn_missing_csv, index=False)
# Also persist the complete (pre-corruption) data for reference/scoring.
df_data = pd.DataFrame(data, columns=features)
if islabel:
    df_data[labels] = df[labels]
df_data.to_csv(
    fn_csv, index=False)
if fn_json is not None:
    with open(fn_json, "w") as f:
        # Fragment ends here, mid dict literal — the property-dict contents are
        # outside this chunk's view.
        dset_prop = {
# NOTE(review): flattened fragment; line breaks/indentation reconstructed from
# statement syntax — confirm against the original script. The fragment is
# truncated mid-statement (the final logger.info format call is unterminated).
#
# Purpose (as far as visible): load a labeled CSV, split it into features X_
# and label Y_, optionally inject missing values and subsample, and log the
# resulting shapes / NaN counts.
assert fn_i is not None   # NOTE(review): asserts vanish under python -O; input validation should raise
assert label is not None
logger.info('loading {} lbl:{} sep:{}'.format(
    fn_i, label, sep))
df = pd.read_csv(fn_i, sep=sep)
features = list(df.columns)
assert label in features
features.remove(label)  # the label column is predicted, not used as a feature
assert len(df)
assert len(features) > 1
X_ = df[features]
Y_ = df[label]
if p_miss:
    # Inject synthetic missingness into the features at rate p_miss.
    X_ = pd.DataFrame(
        utilmlab.introduce_missing(X_, p_miss),
        columns=X_.columns)
if nsample:
    # Log the pre-truncation stats, then keep only the first nsample rows.
    nnan_x = utilmlab.df_get_num_na(X_)
    logger.info('+shape: x:{} y:{} nan: x:{} ({})'.format(
        X_.shape, Y_.shape, nnan_x, nnan_x/float(np.prod(X_.shape))
    ))
    X_ = X_.iloc[:nsample, :]
    Y_ = Y_[:nsample]
nnan_x = utilmlab.df_get_num_na(X_)
# Fragment ends here — the format() arguments of this final log line are
# outside this chunk's view.
logger.info('shape: x:{} y:{} #{} nan: x:{} ({:0.3f})'.format(
# NOTE(review): flattened fragment; line breaks/indentation reconstructed from
# statement syntax — confirm against the original script. This is a near-exact
# variant of another fragment in this file, differing only in the
# introduce_missing call (no uniform_miss/miss_corr arguments here). It is
# truncated at the end (bare `with` header, body not visible).
#
# Purpose (as far as visible): drop already-incomplete feature columns, inject
# synthetic missingness, and write corrupted + complete CSVs for later scoring.
for el in features:
    # drop columns with missing data as we cannot then calculate the rmse
    if sum(dset['df'][el].isnull()):
        features_drop.append(el)
if len(features_drop):
    logger.info('dropping features {}'.format(features_drop))
    time.sleep(2)  # presumably so the warning is visible before work continues — TODO confirm
features = [el for el in features if el not in features_drop]
# NOTE(review): np.float is a deprecated alias (removed in NumPy 1.24);
# should become float or np.float64.
data = dset['df'][features].values.astype(np.float)
if is_normalize_0_1:
    data = normalize_array(data)
# Inject missing entries at rate p_miss.
X_missing = utilmlab.introduce_missing(data, p_miss)
df_missing = pd.DataFrame(X_missing, columns=features)
df_missing = pd.DataFrame(X_missing, columns=features)  # NOTE(review): exact duplicate of the previous line — likely redundant
if islabel:
    # carry the label column(s) through untouched (no missingness injected there)
    df_missing[labels] = df[labels]
df_missing.to_csv(
    fn_missing_csv, index=False)
# Also persist the complete (pre-corruption) data.
df_data = pd.DataFrame(data, columns=features)
if islabel:
    df_data[labels] = df[labels]
df_data.to_csv(
    fn_csv, index=False)
if fn_json is not None:
    # Fragment ends here — the body of this `with` is outside this chunk's view.
    with open(fn_json, "w") as f:
# NOTE(review): flattened fragment; line breaks/indentation reconstructed from
# statement syntax. The extent of the `if label[0] not in ...:` body is an
# inference: the drop + "target no longer present" asserts only make sense
# after the rename, so they are nested under the if — CONFIRM against the
# original script before trusting this nesting.
#
# Purpose (as far as visible): project the dataframe onto features + target
# column(s), optionally rename the target(s) to the requested label name,
# optionally subsample / inject missingness, and write the result to CSV.
df = dset['df']
logger.info(dset['targets'])
assert len(dset['targets']) >= 1
df_dst = df[features + dset['targets']]
if label[0] not in dset['targets']:
    # Requested label name differs from the dataset's target column(s):
    # copy the target data under the label name, then drop the originals.
    print(dset['targets'], label)
    print(df_dst.columns)
    df_dst[label] = df[dset['targets']]
    print(df.columns)
    df_dst = df_dst.drop(dset['targets'], axis=1)
    for el in dset['targets']:
        assert el not in df_dst.columns
del df  # release the full frame before further processing
logger.info('{} {} o:{} lbl:{} pmiss:{}'.format(dataset, df_dst.values.shape, fn_o, label, p_miss))
if nsample:
    df_dst = df_dst[:nsample]  # keep only the first nsample rows
if p_miss:
    # Inject synthetic missingness into the feature columns only.
    df_dst[features] = utilmlab.introduce_missing(df_dst[features], p_miss)
assert fn_o is not None  # NOTE(review): asserts vanish under python -O; validation should raise
# Infer gzip compression from the output filename.
compression = 'gzip' if fn_o.endswith('.gz') else None
logger.info('saving {} {}'.format(fn_o, list(df_dst.columns)))
df_dst.to_csv(fn_o, index=False, compression=compression, sep=sep)
# NOTE(review): flattened fragment; line breaks/indentation reconstructed from
# statement syntax — confirm against the original script. A close variant of
# another load fragment in this file, with two additions: an optional
# validation-index file and a final GAIN imputation step.
#
# Purpose (as far as visible): load a labeled CSV, split into features X_ and
# label Y_, read optional validation row ids, optionally inject missingness and
# subsample, then impute remaining NaNs with GAIN when requested.
df = pd.read_csv(fn_i, sep=sep)
features = list(df.columns)
assert label in features
features.remove(label)  # the label column is predicted, not used as a feature
assert len(df)
assert len(features) > 1
X_ = df[features]
Y_ = df[label]
if fn_i_val_index is not None:
    # Optional file listing the sample ids reserved for validation;
    # presumably 'eid' is the id column — TODO confirm against the caller.
    X_val_indexes_ = pd.read_csv(fn_i_val_index, sep=sep)
    X_val_indexes_ = X_val_indexes_['eid']
else:
    X_val_indexes_ = []
if p_miss:
    # Inject synthetic missingness into the features at rate p_miss.
    X_ = pd.DataFrame(utilmlab.introduce_missing(X_, p_miss),
                      columns=X_.columns)
if nsample:
    # Log pre-truncation stats, then keep only the first nsample rows.
    nnan_x = utilmlab.df_get_num_na(X_)
    logger.info('+shape: x:{} y:{} nan: x:{} ({})'.format(
        X_.shape, Y_.shape, nnan_x,
        nnan_x / float(np.prod(X_.shape))))
    X_ = X_.iloc[:nsample, :]
    Y_ = Y_[:nsample]
nnan_x = utilmlab.df_get_num_na(X_)
# len(set(Y_)) logs the number of distinct label values (class count).
logger.info('shape: x:{} y:{} #{} nan: x:{} ({:0.3f})'.format(
    X_.shape, Y_.shape, len(set(Y_)), nnan_x,
    nnan_x / np.prod(X_.shape)))
if nnan_x and use_gain:
    # Impute the remaining missing entries with GAIN before downstream use.
    X_ = impute_gain(X_, odir)