示例#1
0
    def from_df(df, type_overrides=None):
        """Loads data store from pandas DataFrame.

        By default every numeric column is loaded as a bucketized float column
        and every non-numeric column is loaded as a string column.

        Args:
          df: The pandas DataFrame to load.
          type_overrides: Optional dict of col->type that overrides how a
              numeric column is loaded. 'float' and 'raw_float' load it as a
              raw float column; 'string' loads it as a string column (each
              value converted with str()). Any other value, including
              'bucketized_float', keeps the default bucketized float loading.
              Non-numeric columns are always loaded as string columns,
              whatever the override says. None means "no overrides".

        Returns:
          A DataStore built from the loaded columns.

        Raises:
          ImportError: If numpy or pandas cannot be imported.
        """
        try:
            import numpy
            import pandas  # Imported only to verify that pandas is available.
        except ImportError:
            raise ImportError('Please install numpy and pandas.')

        # None stands in for "no overrides" so no mutable default is shared.
        overrides = {} if type_overrides is None else type_overrides

        d = _DataStore()
        # dict.items() exists on both Python 2 and 3; iteritems() is 2-only.
        raw_float_cols = set(
            k for k, v in overrides.items() if v in ('float', 'raw_float'))
        string_cols = set(k for k, v in overrides.items() if v == 'string')

        for col in df.select_dtypes(include=[numpy.number]).keys():
            if col in raw_float_cols:
                d.add_raw_float_col(col, list(df[col]))
            elif col in string_cols:
                # A numeric column overridden to 'string' must still be
                # loaded rather than skipped. Values are stringified on the
                # assumption that add_string_col expects str values -- TODO
                # confirm against the _DataStore binding.
                d.add_string_col(col, [str(v) for v in df[col]])
            else:
                d.add_bucketized_float_col(col, list(df[col]))

        for col in df.select_dtypes(exclude=[numpy.number]).keys():
            d.add_string_col(col, list(df[col]))

        return DataStore(d)
示例#2
0
    def from_df(df, type_overrides=None):
        """Loads data store from pandas DataFrame.

        By default every numeric column is loaded as a bucketized float column
        and every non-numeric column is loaded as a string column.

        Args:
          df: The pandas DataFrame to load.
          type_overrides: Optional dict of col->type that overrides how a
              numeric column is loaded. 'float' and 'raw_float' load it as a
              raw float column; 'string' loads it as a string column (each
              value converted with str()). Any other value, including
              'bucketized_float', keeps the default bucketized float loading.
              Non-numeric columns are always loaded as string columns,
              whatever the override says. None means "no overrides".

        Returns:
          A DataStore built from the loaded columns.

        Raises:
          ImportError: If numpy or pandas cannot be imported.
        """
        try:
            import numpy
            import pandas  # Imported only to verify that pandas is available.
        except ImportError:
            raise ImportError('Please install numpy and pandas.')

        # None stands in for "no overrides" so no mutable default is shared.
        overrides = {} if type_overrides is None else type_overrides

        d = _DataStore()
        # dict.items() exists on both Python 2 and 3; iteritems() is 2-only.
        raw_float_cols = set(
            k for k, v in overrides.items() if v in ('float', 'raw_float'))
        string_cols = set(k for k, v in overrides.items() if v == 'string')

        for col in df.select_dtypes(include=[numpy.number]).keys():
            if col in raw_float_cols:
                d.add_raw_float_col(col, list(df[col]))
            elif col in string_cols:
                # A numeric column overridden to 'string' must still be
                # loaded rather than skipped. Values are stringified on the
                # assumption that add_string_col expects str values -- TODO
                # confirm against the _DataStore binding.
                d.add_string_col(col, [str(v) for v in df[col]])
            else:
                d.add_bucketized_float_col(col, list(df[col]))

        for col in df.select_dtypes(exclude=[numpy.number]).keys():
            d.add_string_col(col, list(df[col]))

        return DataStore(d)
示例#3
0
 def slice(self, index):
     """Builds a new DataStore holding only the rows selected by `index`.

     Every column yielded by self.iteritems() whose exact type is
     StringColumn, BucketizedFloatColumn or RawFloatColumn is copied, keeping
     column[i] for each i in `index`, in that order. Columns of any other
     type are left out of the result.
     """
     sliced = _DataStore()
     for name, column in self.iteritems():
         column_type = type(column)
         # Pick the adder that matches the column's exact type.
         if column_type is StringColumn:
             add_col = sliced.add_string_col
         elif column_type is BucketizedFloatColumn:
             add_col = sliced.add_bucketized_float_col
         elif column_type is RawFloatColumn:
             add_col = sliced.add_raw_float_col
         else:
             # Unrecognized column types are skipped.
             continue
         add_col(name, [column[i] for i in index])
     return DataStore(sliced)
示例#4
0
 def slice(self, index):
     """Builds a new DataStore holding only the rows selected by `index`.

     Every column yielded by self.iteritems() whose exact type is
     StringColumn, BucketizedFloatColumn or RawFloatColumn is copied, keeping
     column[i] for each i in `index`, in that order. Columns of any other
     type are left out of the result.
     """
     sliced = _DataStore()
     for name, column in self.iteritems():
         column_type = type(column)
         # Pick the adder that matches the column's exact type.
         if column_type is StringColumn:
             add_col = sliced.add_string_col
         elif column_type is BucketizedFloatColumn:
             add_col = sliced.add_bucketized_float_col
         elif column_type is RawFloatColumn:
             add_col = sliced.add_raw_float_col
         else:
             # Unrecognized column types are skipped.
             continue
         add_col(name, [column[i] for i in index])
     return DataStore(sliced)
示例#5
0
 def from_tsvs(tsvs, bucketized_float_cols=[], string_cols=[], raw_float_cols=[]):
     """Builds a DataStore by loading tsv data through _DataStore.load_tsv.

     Inputs:
       tsvs: Blocks of tsvs; only the first block contains the header.
       bucketized_float_cols: Float columns to load bucketized (features are
         normally loaded this way).
       string_cols: Columns to load as strings.
       raw_float_cols: Float columns to load raw, without bucketizing
         (typically the target columns).

     Returns a DataStore constructed from the loaded _DataStore.
     """
     backend = _DataStore()
     # The three column lists are forwarded unchanged as keyword arguments.
     column_spec = dict(
         bucketized_float_cols=bucketized_float_cols,
         string_cols=string_cols,
         raw_float_cols=raw_float_cols,
     )
     backend.load_tsv(tsvs, **column_spec)
     return DataStore(backend)
示例#6
0
    def from_dict(bucketized_float_cols=None, string_cols=None, raw_float_cols=None):
        """Loads data from dict of columns.

        Each argument is a dict mapping a column name to that column's values;
        None (the default) is treated as an empty dict.

        Args:
          bucketized_float_cols: Float columns that will be bucketized.
              Features are normally loaded this way.
          string_cols: String columns.
          raw_float_cols: Float columns that are loaded raw. Target columns
              are usually not bucketized.

        Returns:
          A DataStore built from the given columns. Columns are added in the
          order bucketized float, string, raw float.
        """
        d = _DataStore()
        # dict.items() exists on both Python 2 and 3; iteritems() is 2-only.
        # `or {}` is not used so that only None is replaced by an empty dict.
        if bucketized_float_cols is not None:
            for key, value in bucketized_float_cols.items():
                d.add_bucketized_float_col(key, value)
        if string_cols is not None:
            for key, value in string_cols.items():
                d.add_string_col(key, value)
        if raw_float_cols is not None:
            for key, value in raw_float_cols.items():
                d.add_raw_float_col(key, value)

        return DataStore(d)
示例#7
0
    def from_dict(bucketized_float_cols=None, string_cols=None, raw_float_cols=None):
        """Loads data from dict of columns.

        Each argument is a dict mapping a column name to that column's values;
        None (the default) is treated as an empty dict.

        Args:
          bucketized_float_cols: Float columns that will be bucketized.
              Features are normally loaded this way.
          string_cols: String columns.
          raw_float_cols: Float columns that are loaded raw. Target columns
              are usually not bucketized.

        Returns:
          A DataStore built from the given columns. Columns are added in the
          order bucketized float, string, raw float.
        """
        d = _DataStore()
        # dict.items() exists on both Python 2 and 3; iteritems() is 2-only.
        # `or {}` is not used so that only None is replaced by an empty dict.
        if bucketized_float_cols is not None:
            for key, value in bucketized_float_cols.items():
                d.add_bucketized_float_col(key, value)
        if string_cols is not None:
            for key, value in string_cols.items():
                d.add_string_col(key, value)
        if raw_float_cols is not None:
            for key, value in raw_float_cols.items():
                d.add_raw_float_col(key, value)

        return DataStore(d)
示例#8
0
 def from_tsvs(tsvs,
               bucketized_float_cols=[],
               string_cols=[],
               raw_float_cols=[]):
     """Builds a DataStore by loading tsv data through _DataStore.load_tsv.

     Inputs:
       tsvs: Blocks of tsvs; only the first block contains the header.
       bucketized_float_cols: Float columns to load bucketized (features are
         normally loaded this way).
       string_cols: Columns to load as strings.
       raw_float_cols: Float columns to load raw, without bucketizing
         (typically the target columns).

     Returns a DataStore constructed from the loaded _DataStore.
     """
     backend = _DataStore()
     # The three column lists are forwarded unchanged as keyword arguments.
     column_spec = dict(
         bucketized_float_cols=bucketized_float_cols,
         string_cols=string_cols,
         raw_float_cols=raw_float_cols,
     )
     backend.load_tsv(tsvs, **column_spec)
     return DataStore(backend)