Example #1
    def load_data(self,
                  name,
                  f_path,
                  f_name,
                  get_tid,
                  get_attr,
                  get_val,
                  na_values=None):
        tic = time.perf_counter()
        try:
            raw_data = pd.read_csv(os.path.join(f_path, f_name),
                                   na_values=na_values)
            raw_data.fillna('_nan_', inplace=True)
            raw_data['_tid_'] = raw_data.apply(get_tid, axis=1)
            raw_data['_attribute_'] = raw_data.apply(get_attr, axis=1)
            raw_data['_value_'] = raw_data.apply(get_val, axis=1)
            raw_data = raw_data[['_tid_', '_attribute_', '_value_']]
            # Normalize: lower-case attribute names and strip whitespace from values.
            raw_data['_attribute_'] = raw_data['_attribute_'].apply(
                lambda x: x.lower())
            raw_data['_value_'] = raw_data['_value_'].apply(
                lambda x: x.strip())
            self.clean_data = Table(name, Source.DF, raw_data)
            self.clean_data.store_to_db(self.ds.engine.engine)
            self.clean_data.create_db_index(self.ds.engine, ['_tid_'])
            self.clean_data.create_db_index(self.ds.engine, ['_attribute_'])
            status = 'DONE Loading ' + f_name
        except Exception:
            logging.error('load_data for table %s', name)
            raise
        toc = time.perf_counter()
        load_time = toc - tic
        return status, load_time
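
A minimal call sketch for this variant, assuming a hypothetical object `loader` that exposes the method above and an input CSV with `id`, `attr`, and `val` columns (all of these names are illustrative, not from the source):

    # get_tid/get_attr/get_val are row-wise callables passed to
    # DataFrame.apply(axis=1), so each receives a pandas row (Series).
    status, load_time = loader.load_data(
        name='hospital_clean',
        f_path='/data',
        f_name='clean.csv',
        get_tid=lambda row: int(row['id']),
        get_attr=lambda row: row['attr'],
        get_val=lambda row: str(row['val']),
        na_values='?')
    print(status, load_time)
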
Example #2
    def __init__(self,
                 fpath=None,
                 df=None,
                 db_engine=None,
                 table_name=None,
                 schema_name=None,
                 id_col="_tid_",
                 attr_col="attribute",
                 name="ErrorLoaderDetector"):
        """
        :param fpath: (str) Path to source csv file to load errors
        :param df: (DataFrame) dataframe containing the errors
        :param db_engine: (DBEngine) Database engine object
        :param table_name: (str) Relational table considered for loading errors
        :param schema_name: (str) Schema in which :param table_name: exists
        :param id_col: (str) ID column name
        :param attr_col: (str) Attribute column name
        :param name: (str) name of the detector

        To load from a csv file, :param fpath: must be specified.
        To load from a relational table, :param db_engine: and
        :param table_name: must be specified, optionally along with :param schema_name:.
        """
        super(ErrorsLoaderDetector, self).__init__(name)
        src = None
        dataset_name = None
        if fpath is not None:
            dataset_name = "errors_file"
            src = Source.FILE
        elif df is not None:
            dataset_name = "errors_df"
            src = Source.DF
        elif (db_engine is not None) and (table_name is not None):
            dataset_name = table_name
            src = Source.DB
        else:
            raise Exception(
                "ERROR while initializing ErrorsLoaderDetector. Please provide (<fpath>), (<db_engine> and <table_name>), OR <df>"
            )

        self.errors_table = Table(dataset_name,
                                  src,
                                  exclude_attr_cols=[attr_col],
                                  fpath=fpath,
                                  df=df,
                                  schema_name=schema_name,
                                  db_engine=db_engine)

        expected_schema = [id_col, attr_col]
        if list(self.errors_table.df.columns) != expected_schema:
            raise Exception(
                "ERROR while initializing ErrorsLoaderDetector: The loaded errors table does not match the expected schema of {}"
                .format(expected_schema))

        self.errors_table.df = self.errors_table.df.astype({
            id_col: int,
            attr_col: str
        })
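
A minimal construction sketch, assuming the class above is importable; the two-column layout mirrors the `expected_schema` check in the constructor, and the concrete values are made up for illustration:

    import pandas as pd

    # Errors are (tuple id, attribute) pairs; the columns must be exactly
    # [id_col, attr_col], i.e. ['_tid_', 'attribute'] with the defaults above.
    errors_df = pd.DataFrame({
        '_tid_': [0, 0, 3],
        'attribute': ['ZipCode', 'City', 'ZipCode'],
    })
    detector = ErrorsLoaderDetector(df=errors_df)
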
Example #3
    def create_table(self,
                     table_name,
                     primary_id=None,
                     primary_type=None,
                     primary_increment=None):
        """Create a new table.

        Either loads a table or creates it if it doesn't exist yet. You can
        define the name and type of the primary key field, if a new table is to
        be created. The default is to create an auto-incrementing integer,
        ``id``. You can also set the primary key to be a string or big integer.
        The caller will be responsible for the uniqueness of ``primary_id`` if
        it is defined as a text type. You can disable auto-increment behaviour
        for numeric primary keys by setting `primary_increment` to `False`.

        Returns a :py:class:`Table <dataset.Table>` instance.
        ::

            table = db.create_table('population')

            # custom id and type
            table2 = db.create_table('population2', 'age')
            table3 = db.create_table('population3',
                                     primary_id='city',
                                     primary_type=db.types.text)
            # custom length of String
            table4 = db.create_table('population4',
                                     primary_id='city',
                                     primary_type=db.types.string(25))
            # no primary key
            table5 = db.create_table('population5',
                                     primary_id=False)
        """
        assert not isinstance(
            primary_type,
            str), "Text-based primary_type support is dropped, use db.types."
        table_name = normalize_table_name(table_name)
        with self.lock:
            if table_name not in self._tables:
                self._tables[table_name] = Table(
                    self,
                    table_name,
                    primary_id=primary_id,
                    primary_type=primary_type,
                    primary_increment=primary_increment,
                    auto_create=True,
                )
            return self._tables.get(table_name)
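
The docstring shows custom primary keys but not the auto-increment switch it mentions; a short sketch of that case using the `dataset` library (connection URL and column names are illustrative):

    import dataset

    db = dataset.connect('sqlite:///:memory:')
    # Integer primary key whose values the caller supplies explicitly,
    # with auto-increment disabled via primary_increment=False.
    table6 = db.create_table('population6',
                             primary_id='zip_code',
                             primary_type=db.types.integer,
                             primary_increment=False)
    table6.insert({'zip_code': 94720, 'city': 'Berkeley'})
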
Example #4
    def load_data(self,
                  name,
                  fpath,
                  tid_col,
                  attr_col,
                  val_col,
                  na_values=None):
        tic = time.perf_counter()
        try:
            raw_data = pd.read_csv(fpath,
                                   na_values=na_values,
                                   dtype=str,
                                   encoding='utf-8')
            # We drop any ground truth values that are NULLs since we follow
            # the closed-world assumption (if it's not there it's wrong).
            # TODO: revisit this once we allow users to specify which
            # attributes may be NULL.
            raw_data.dropna(subset=[val_col], inplace=True)
            raw_data.rename(
                {
                    tid_col: '_tid_',
                    attr_col: '_attribute_',
                    val_col: '_value_'
                },
                axis='columns',
                inplace=True)
            raw_data = raw_data[['_tid_', '_attribute_', '_value_']]
            raw_data['_tid_'] = raw_data['_tid_'].astype(int)

            # Normalize categorical values: strip whitespace and lower-case.
            categorical_attrs = self.ds.categorical_attrs
            if categorical_attrs:
                cat_cells = raw_data['_attribute_'].isin(categorical_attrs)
                raw_data.loc[cat_cells, '_value_'] = \
                    raw_data.loc[cat_cells, '_value_'].astype(str).str.strip().str.lower()

            self.clean_data = Table(name, Source.DF, df=raw_data)
            self.clean_data.store_to_db(self.ds.engine.engine)
            self.clean_data.create_db_index(self.ds.engine, ['_tid_'])
            self.clean_data.create_db_index(self.ds.engine, ['_attribute_'])
            status = 'DONE Loading {fname}'.format(
                fname=os.path.basename(fpath))
        except Exception:
            logging.error('load_data for table %s', name)
            raise
        toc = time.perf_counter()
        load_time = toc - tic
        return status, load_time
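
A matching call sketch for this column-name variant, reusing the hypothetical `loader` object and column names from the sketch after Example #1:

    # Unlike Example #1, source columns are selected by name rather than
    # via row-wise callables, and values are normalized only for attributes
    # listed in self.ds.categorical_attrs.
    status, load_time = loader.load_data(
        name='hospital_clean',
        fpath='/data/clean.csv',
        tid_col='id',
        attr_col='attr',
        val_col='val',
        na_values='?')
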
Example #5
    def load_table(self, table_name):
        """Load a table.

        This will fail if the table does not already exist in the database. If
        the table exists, its columns will be reflected and are available on
        the :py:class:`Table <dataset.Table>` object.

        Returns a :py:class:`Table <dataset.Table>` instance.
        ::

            table = db.load_table('population')
        """
        table_name = normalize_table_name(table_name)
        with self.lock:
            if table_name not in self._tables:
                self._tables[table_name] = Table(self, table_name)
            return self._tables.get(table_name)
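
A minimal round trip with the `dataset` library: inserting via item access first ensures the table exists before load_table reflects it (connection URL and row data are illustrative):

    import dataset

    db = dataset.connect('sqlite:///example.db')
    db['population'].insert({'city': 'Seoul', 'population': 9700000})

    # load_table reflects the existing table; its columns become
    # available on the returned Table object.
    table = db.load_table('population')
    print(table.columns)  # e.g. ['id', 'city', 'population']
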