def from_df(df, type_overrides=None):
    """Load a data store from a pandas DataFrame.

    By default every numeric column is loaded as a bucketized float column
    and every other column as a string column. ``type_overrides`` can change
    how individual numeric columns are loaded.

    Args:
        df: The pandas DataFrame to load.
        type_overrides: Optional dict of column name -> type name. For a
            numeric column, 'float' (or 'raw_float') loads it as a raw float
            column and 'string' loads it as a string column; any other value,
            including 'bucketized_float', keeps the bucketized default.
            Non-numeric columns are always loaded as string columns,
            regardless of overrides.

    Returns:
        A DataStore wrapping the loaded columns.

    Raises:
        ImportError: If numpy or pandas cannot be imported.
    """
    try:
        import numpy
        import pandas  # noqa: F401 -- imported only to check it is installed
    except ImportError:
        raise ImportError('Please install numpy and pandas.')

    # Avoid a shared mutable default; None means "no overrides".
    if type_overrides is None:
        type_overrides = {}

    # .items() works on both Python 2 and Python 3 dicts.
    raw_float_cols = {
        name for name, kind in type_overrides.items()
        if kind in ('float', 'raw_float')
    }
    string_cols = {
        name for name, kind in type_overrides.items() if kind == 'string'
    }

    d = _DataStore()
    for col in df.select_dtypes(include=[numpy.number]).keys():
        if col in raw_float_cols:
            d.add_raw_float_col(col, list(df[col]))
        elif col in string_cols:
            # A numeric column overridden as 'string' used to be dropped:
            # it was skipped here and never reached the non-numeric loop.
            # NOTE(review): values are stringified on the assumption that
            # add_string_col expects text values -- confirm.
            d.add_string_col(col, [str(value) for value in df[col]])
        else:
            d.add_bucketized_float_col(col, list(df[col]))
    for col in df.select_dtypes(exclude=[numpy.number]).keys():
        d.add_string_col(col, list(df[col]))
    return DataStore(d)
def from_df(df, type_overrides=None):
    """Load a data store from a pandas DataFrame.

    By default every numeric column is loaded as a bucketized float column
    and every other column as a string column. ``type_overrides`` can change
    how individual numeric columns are loaded.

    Args:
        df: The pandas DataFrame to load.
        type_overrides: Optional dict of column name -> type name. For a
            numeric column, 'float' (or 'raw_float') loads it as a raw float
            column and 'string' loads it as a string column; any other value,
            including 'bucketized_float', keeps the bucketized default.
            Non-numeric columns are always loaded as string columns,
            regardless of overrides.

    Returns:
        A DataStore wrapping the loaded columns.

    Raises:
        ImportError: If numpy or pandas cannot be imported.
    """
    try:
        import numpy
        import pandas  # noqa: F401 -- imported only to check it is installed
    except ImportError:
        raise ImportError('Please install numpy and pandas.')

    # Avoid a shared mutable default; None means "no overrides".
    if type_overrides is None:
        type_overrides = {}

    # .items() works on both Python 2 and Python 3 dicts.
    raw_float_cols = {
        name for name, kind in type_overrides.items()
        if kind in ('float', 'raw_float')
    }
    string_cols = {
        name for name, kind in type_overrides.items() if kind == 'string'
    }

    d = _DataStore()
    for col in df.select_dtypes(include=[numpy.number]).keys():
        if col in raw_float_cols:
            d.add_raw_float_col(col, list(df[col]))
        elif col in string_cols:
            # A numeric column overridden as 'string' used to be dropped:
            # it was skipped here and never reached the non-numeric loop.
            # NOTE(review): values are stringified on the assumption that
            # add_string_col expects text values -- confirm.
            d.add_string_col(col, [str(value) for value in df[col]])
        else:
            d.add_bucketized_float_col(col, list(df[col]))
    for col in df.select_dtypes(exclude=[numpy.number]).keys():
        d.add_string_col(col, list(df[col]))
    return DataStore(d)
def slice(self, index):
    """Build a new DataStore holding only the entries selected by ``index``.

    Each column yielded by ``self.iteritems()`` is re-created in a fresh
    store from the values found at the positions listed in ``index``.
    Columns whose exact type is not StringColumn, BucketizedFloatColumn or
    RawFloatColumn are left out of the result.

    Args:
        index: Iterable of positions to pick from every column.

    Returns:
        A DataStore wrapping the selected values.
    """
    selected = _DataStore()
    for name, column in self.iteritems():
        column_type = type(column)
        # Exact type match (not isinstance) decides which adder to use.
        if column_type is StringColumn:
            add_column = selected.add_string_col
        elif column_type is BucketizedFloatColumn:
            add_column = selected.add_bucketized_float_col
        elif column_type is RawFloatColumn:
            add_column = selected.add_raw_float_col
        else:
            continue
        add_column(name, [column[position] for position in index])
    return DataStore(selected)
def from_tsvs(tsvs, bucketized_float_cols=None, string_cols=None,
              raw_float_cols=None):
    """Load data from TSVs.

    Args:
        tsvs: Blocks of tsvs, among which only the first contains header.
        bucketized_float_cols: Float columns that will be bucketized. All
            features will be bucketized. Defaults to no columns.
        string_cols: String cols. Defaults to no columns.
        raw_float_cols: Float columns that are loaded raw. Target columns
            are usually not bucketized. Defaults to no columns.

    Returns:
        A DataStore wrapping the loaded data.
    """
    # None replaces the former shared mutable [] defaults; a fresh list is
    # built per call so load_tsv never sees a list shared between calls.
    if bucketized_float_cols is None:
        bucketized_float_cols = []
    if string_cols is None:
        string_cols = []
    if raw_float_cols is None:
        raw_float_cols = []

    d = _DataStore()
    d.load_tsv(tsvs,
               bucketized_float_cols=bucketized_float_cols,
               string_cols=string_cols,
               raw_float_cols=raw_float_cols)
    return DataStore(d)
def from_dict(bucketized_float_cols=None, string_cols=None,
              raw_float_cols=None):
    """Load data from dicts of columns.

    Each argument maps a column key to that column's values; every entry is
    added to the store under its key.

    Args:
        bucketized_float_cols: Float columns that will be bucketized. All
            features will be bucketized. Defaults to no columns.
        string_cols: String cols. Defaults to no columns.
        raw_float_cols: Float columns that are loaded raw. Target columns
            are usually not bucketized. Defaults to no columns.

    Returns:
        A DataStore wrapping the added columns.
    """
    d = _DataStore()
    # Columns are added in the same order as before: bucketized floats,
    # then strings, then raw floats.
    column_groups = (
        (bucketized_float_cols, d.add_bucketized_float_col),
        (string_cols, d.add_string_col),
        (raw_float_cols, d.add_raw_float_col),
    )
    for columns, add_col in column_groups:
        # None (the default) means "no columns of this kind"; .items()
        # works on both Python 2 and Python 3 dicts, unlike .iteritems().
        for key, value in (columns or {}).items():
            add_col(key, value)
    return DataStore(d)