def load_data(self, name, f_path, f_name, get_tid, get_attr, get_val, na_values=None):
    """Load clean (ground truth) data from a CSV file and store it in the DB.

    :param name: (str) name to register the loaded table under.
    :param f_path: (str) directory containing the CSV file.
    :param f_name: (str) CSV file name.
    :param get_tid: row-wise callable producing the tuple ID (_tid_).
    :param get_attr: row-wise callable producing the attribute name.
    :param get_val: row-wise callable producing the cell value.
    :param na_values: additional value(s) to recognize as NA when parsing.

    :return: tuple (status message, load time in seconds).
    :raises: re-raises any exception hit while loading, after logging it.
    """
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for elapsed-time measurement.
    tic = time.perf_counter()
    try:
        raw_data = pd.read_csv(os.path.join(f_path, f_name), na_values=na_values)
        raw_data.fillna('_nan_', inplace=True)
        raw_data['_tid_'] = raw_data.apply(get_tid, axis=1)
        raw_data['_attribute_'] = raw_data.apply(get_attr, axis=1)
        raw_data['_value_'] = raw_data.apply(get_val, axis=1)
        raw_data = raw_data[['_tid_', '_attribute_', '_value_']]
        # Normalize string to lower-case and strip whitespaces.
        raw_data['_attribute_'] = raw_data['_attribute_'].apply(
            lambda x: x.lower())
        raw_data['_value_'] = raw_data['_value_'].apply(
            lambda x: x.strip())
        # Pass the dataframe by keyword: Table's third positional parameter
        # is exclude_attr_cols (see ErrorsLoaderDetector), so a positional
        # raw_data would bind to the wrong parameter.
        self.clean_data = Table(name, Source.DF, df=raw_data)
        self.clean_data.store_to_db(self.ds.engine.engine)
        self.clean_data.create_db_index(self.ds.engine, ['_tid_'])
        self.clean_data.create_db_index(self.ds.engine, ['_attribute_'])
        status = 'DONE Loading ' + f_name
    except Exception:
        logging.error('load_data for table %s', name)
        raise
    toc = time.perf_counter()
    load_time = toc - tic
    return status, load_time
def __init__(self,
             fpath=None,
             df=None,
             db_engine=None,
             table_name=None,
             schema_name=None,
             id_col="_tid_",
             attr_col="attribute",
             name="ErrorLoaderDetector"):
    """Initialize an errors-loader detector from exactly one source.

    :param fpath: (str) path to a source CSV file containing the errors.
    :param df: (DataFrame) dataframe containing the errors.
    :param db_engine: (DBEngine) database engine object.
    :param table_name: (str) relational table considered for loading errors.
    :param schema_name: (str) schema in which :param table_name: exists.
    :param id_col: (str) ID column name.
    :param attr_col: (str) attribute column name.
    :param name: (str) name of the detector.

    To load from a CSV file, :param fpath: must be specified.
    To load from a relational table, :param db_engine: and
    :param table_name: must be specified, optionally with
    :param schema_name:. Otherwise pass an in-memory :param df:.

    :raises Exception: if no source is provided, or if the loaded table
        does not match the expected schema ``[id_col, attr_col]``.
    """
    super(ErrorsLoaderDetector, self).__init__(name)
    src = None
    dataset_name = None
    if fpath is not None:
        dataset_name = "errors_file"
        src = Source.FILE
    elif df is not None:
        dataset_name = "errors_df"
        src = Source.DF
    elif (db_engine is not None) and (table_name is not None):
        dataset_name = table_name
        src = Source.DB
    else:
        # Typo fixed in user-facing message: "intializing" -> "initializing".
        raise Exception(
            "ERROR while initializing ErrorsLoaderDetector. Please provide (<fpath>), (<db_engine> and <table_name>), OR <df>"
        )
    self.errors_table = Table(dataset_name,
                              src,
                              exclude_attr_cols=[attr_col],
                              fpath=fpath,
                              df=df,
                              schema_name=schema_name,
                              db_engine=db_engine)
    expected_schema = [id_col, attr_col]
    if list(self.errors_table.df.columns) != expected_schema:
        raise Exception(
            "ERROR while initializing ErrorsLoaderDetector: The loaded errors table does not match the expected schema of {}"
            .format(expected_schema))
    # Coerce dtypes so downstream lookups on (id, attribute) are consistent.
    self.errors_table.df = self.errors_table.df.astype({
        id_col: int,
        attr_col: str,
    })
def create_table(self, table_name, primary_id=None, primary_type=None, primary_increment=None):
    """Get or create a table with the given name.

    If the table does not exist yet it is created; otherwise the existing
    table is loaded. A freshly-created table gets an auto-incrementing
    integer primary key named ``id`` by default. ``primary_id`` and
    ``primary_type`` customize the key's name and type (the caller is
    responsible for uniqueness of text-typed keys), and
    ``primary_increment=False`` disables auto-increment for numeric keys.
    Passing ``primary_id=False`` creates a table with no primary key.

    Returns a :py:class:`Table <dataset.Table>` instance. ::

        table = db.create_table('population')
        # custom id and type
        table2 = db.create_table('population2', 'age')
        table3 = db.create_table('population3',
                                 primary_id='city',
                                 primary_type=db.types.text)
        # custom length of String
        table4 = db.create_table('population4',
                                 primary_id='city',
                                 primary_type=db.types.string(25))
        # no primary key
        table5 = db.create_table('population5',
                                 primary_id=False)
    """
    assert not isinstance(
        primary_type, str), "Text-based primary_type support is dropped, use db.types."
    normalized = normalize_table_name(table_name)
    with self.lock:
        if normalized not in self._tables:
            new_table = Table(
                self,
                normalized,
                primary_id=primary_id,
                primary_type=primary_type,
                primary_increment=primary_increment,
                auto_create=True,
            )
            self._tables[normalized] = new_table
        return self._tables.get(normalized)
def load_data(self, name, fpath, tid_col, attr_col, val_col, na_values=None):
    """Load clean (ground truth) data from a CSV file and store it in the DB.

    :param name: (str) name to register the loaded table under.
    :param fpath: (str) path to the CSV file.
    :param tid_col: (str) source column holding the tuple ID.
    :param attr_col: (str) source column holding the attribute name.
    :param val_col: (str) source column holding the cell value.
    :param na_values: additional value(s) to recognize as NA when parsing.

    :return: tuple (status message, load time in seconds).
    :raises: re-raises any exception hit while loading, after logging it.
    """
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for elapsed-time measurement.
    tic = time.perf_counter()
    try:
        raw_data = pd.read_csv(fpath, na_values=na_values, dtype=str, encoding='utf-8')
        # We drop any ground truth values that are NULLs since we follow
        # the closed-world assumption (if it's not there it's wrong).
        # TODO: revisit this once we allow users to specify which
        # attributes may be NULL.
        raw_data.dropna(subset=[val_col], inplace=True)
        raw_data.rename(
            {
                tid_col: '_tid_',
                attr_col: '_attribute_',
                val_col: '_value_'
            },
            axis='columns',
            inplace=True)
        raw_data = raw_data[['_tid_', '_attribute_', '_value_']]
        raw_data['_tid_'] = raw_data['_tid_'].astype(int)
        # Normalize string to whitespaces.
        categorical_attrs = self.ds.categorical_attrs
        if categorical_attrs:
            cat_cells = raw_data['_attribute_'].isin(categorical_attrs)
            raw_data.loc[cat_cells, '_value_'] = \
                raw_data.loc[cat_cells, '_value_'].astype(str).str.strip().str.lower()
        self.clean_data = Table(name, Source.DF, df=raw_data)
        self.clean_data.store_to_db(self.ds.engine.engine)
        self.clean_data.create_db_index(self.ds.engine, ['_tid_'])
        self.clean_data.create_db_index(self.ds.engine, ['_attribute_'])
        status = 'DONE Loading {fname}'.format(
            fname=os.path.basename(fpath))
    except Exception:
        logging.error('load_data for table %s', name)
        raise
    toc = time.perf_counter()
    load_time = toc - tic
    return status, load_time
def load_table(self, table_name):
    """Load an existing table from the database.

    Unlike :py:meth:`create_table <dataset.Database.create_table>`, this
    assumes the table already exists; its columns are reflected and made
    available on the returned :py:class:`Table <dataset.Table>` object,
    and the call fails if the table is missing.

    Returns a :py:class:`Table <dataset.Table>` instance. ::

        table = db.load_table('population')
    """
    normalized = normalize_table_name(table_name)
    with self.lock:
        if normalized not in self._tables:
            self._tables[normalized] = Table(self, normalized)
        return self._tables.get(normalized)