def _update_metadata(function, metadata, output_path=None, **kwargs):
    """Apply ``function`` to the metadata, validate the result, and optionally write it to disk."""
    metadata_dict = _load_metadata_dict(metadata)
    function(metadata_dict, **kwargs)
    validate(metadata_dict)

    if not isinstance(metadata, MetaData):
        metadata = MetaData()

    metadata.data = metadata_dict
    if output_path is not None:
        metadata.to_json(output_path)

    return metadata
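# Usage sketch for _update_metadata. The _set_first_primary_key updater and the
# file paths are hypothetical; the 'tables' / 'primary_key' layout follows the
# metad fixture further down.
def _set_first_primary_key(metadata_dict, field_name):
    # Updaters mutate the plain dict in place; _update_metadata validates afterwards,
    # so the dict must stay schema-valid.
    metadata_dict['tables'][0]['primary_key'] = field_name

updated = _update_metadata(_set_first_primary_key, 'metadata.json',
                           output_path='updated.json', field_name='a_field')
assert isinstance(updated, MetaData)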
def __init__(self, path_to_dataset):
    """Read the dataset from disk.

    Args:
        path_to_dataset (str): The path to the dataset.
    """
    self.path_to_dataset = os.path.expanduser(path_to_dataset)
    path_to_metadata = os.path.join(self.path_to_dataset, "metadata.json")
    self.metadata = MetaData.from_json(path_to_metadata)

    self.tables = {}
    for table_name in self.metadata.get_table_names():
        path_to_csv = os.path.join(self.path_to_dataset, "%s.csv" % table_name)
        table_metadata = self.metadata.get_table(table_name)
        dtypes = _read_csv_dtypes(table_metadata)
        self.tables[table_name] = pd.read_csv(path_to_csv, dtype=dtypes)
        _parse_dtypes(self.tables[table_name], table_metadata)
        logger.info("Loaded table %s (%s rows, %s cols)" % (
            table_name,
            len(self.tables[table_name]),
            len(self.tables[table_name].columns)
        ))
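# Usage sketch for the loader above, assuming the __init__ belongs to a class
# named Dataset (the class name is not shown in this excerpt). The directory
# layout (metadata.json plus one CSV per table) comes from the code itself.
dataset = Dataset('~/datasets/census')
for name, frame in dataset.tables.items():
    print(name, frame.dtypes.to_dict())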
def test_foreign_keys_output_path(metad, foreign_keys, tmpdir):
    output_path = os.path.join(str(tmpdir), 'updated.json')
    metadata.update_metadata_foreign_keys(metad, foreign_keys, output_path=output_path)

    updated = MetaData.from_json(output_path)
    assert updated.data['foreign_keys'] == foreign_keys
def test_primary_output_path(metad, primary_keys, tmpdir):
    output_path = os.path.join(str(tmpdir), 'metadata.json')
    metadata.update_metadata_primary_keys(metad, primary_keys, output_path=output_path)

    updated = MetaData.from_json(output_path)
    assert updated.data['tables'][0]['primary_key'] == 'a_field'
    assert updated.data['tables'][1]['primary_key'] == 'another_field'
def test_column_map_output_path(metad, column_map, constraints, tmpdir):
    output_path = os.path.join(str(tmpdir), 'metadata.json')
    metadata.update_metadata_column_map(
        metad, column_map, target_table='1234', target_field='a_field',
        output_path=output_path)

    updated = MetaData.from_json(output_path)
    assert updated.data['constraints'] == constraints
def test_creation(self):
    metadata = MetaData()
    metadata.add_table({
        "id": "users",
        "name": "users",
        "fields": [
            {"name": "user_id", "data_type": "id"},
            {"name": "name", "data_type": "text"},
            {"name": "gender", "data_type": "categorical"},
            {"name": "age", "data_type": "numerical"},
            {"name": "birthday", "data_type": "datetime"},
            {"name": "picture", "data_type": "other"},
        ]
    })
    assert len(metadata.get_table_names()) == 1
    assert "users" in metadata.get_table_names()
@pytest.fixture
def metad():
    metadata = MetaData()
    metadata.data = {
        'tables': [
            {
                'id': '1234',
                'name': 'a_table',
                'fields': [
                    {'name': 'a_field', 'type': 'number', 'subtype': 'float'},
                    {'name': 'some_field', 'type': 'number', 'subtype': 'float'},
                ],
            },
            {
                'id': '4567',
                'name': 'another_table',
                'fields': [
                    {'name': 'another_field', 'type': 'number', 'subtype': 'float'},
                    {'name': 'some_other_field', 'type': 'number', 'subtype': 'float'},
                ],
            },
        ],
    }
    return metadata
def _load_metadata_dict(metadata):
    """Return the given metadata as a dict."""
    if isinstance(metadata, MetaData):
        return metadata.data

    if isinstance(metadata, str):
        metadata = MetaData.from_json(metadata)
        metadata.validate()
        return metadata.data

    if isinstance(metadata, dict):
        validate(metadata)
        return metadata

    raise TypeError('Metadata can only be MetaData, dict or str')
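# The three input types _load_metadata_dict accepts, as a sketch. The variable
# names and the path are hypothetical; note that the str and dict branches both
# run schema validation, so the inputs must be valid metadata.
as_dict = _load_metadata_dict(existing_metadata)        # MetaData instance
as_dict = _load_metadata_dict('path/to/metadata.json')  # JSON file on disk
as_dict = _load_metadata_dict(as_dict)                  # already-parsed dict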
def load_dataset(dataset_path):
    """Load a dataset as a MetaData and a dict of tables.

    Args:
        dataset_path (str):
            Path to the root of the dataset.

    Returns:
        tuple:
            Tuple containing:

            * A ``MetaData`` instance.
            * A ``dict`` containing the tables loaded as ``pandas.DataFrames``.
    """
    metadata = MetaData.from_json(os.path.join(dataset_path, 'metadata.json'))

    tables = {}
    for table_name in metadata.get_table_names():
        tables[table_name] = pd.read_csv(os.path.join(dataset_path, table_name + '.csv'))

    return metadata, tables
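# Usage sketch for load_dataset; the dataset path is a placeholder.
metadata, tables = load_dataset('datasets/airbnb')
for name, frame in tables.items():
    print(name, frame.shape)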
def validate(metadata):
    """Validate a metadata dict by loading it into a MetaData instance."""
    metad = MetaData()
    metad.data = metadata
    metad.validate()
def test_validation(self, path_to_example):
    metadata = MetaData.from_json(path_to_example)
    metadata.validate()
    assert str(metadata)
def validate(args):
    for path_to_json in args["<jsons>"]:
        print("Validating %s..." % path_to_json)
        MetaData.from_json(path_to_json).validate()
def describe(args):
    metadata = MetaData.from_json(args["<json>"])
    print(str(metadata))
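# The args["<json>"] access pattern suggests these handlers receive
# docopt-parsed arguments. A minimal wiring sketch; the usage string and
# entry point below are assumptions, not part of this excerpt.
from docopt import docopt

USAGE = """MetaData command line.

Usage:
    metadata validate <jsons>...
    metadata describe <json>
"""

def main():
    args = docopt(USAGE)
    if args['validate']:
        validate(args)
    elif args['describe']:
        describe(args)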
import os
from collections import defaultdict

import pandas as pd
import pymysql


class MySQLConnector(BaseConnector):
    """Import data from MySQL instances.

    The MySQLConnector allows you to import data from a MySQL instance by
    translating the data types and exporting the primary / foreign key
    constraints from the schema table.
    """

    def __init__(self, host, port, user, password, database):
        """Create a new MySQLConnector.

        Args:
            host (str): The host name / ip address of the MySQL instance.
            port (int): The port number (default: 3306).
            user (str): The username for accessing the instance.
            password (str): The password for accessing the instance.
            database (str): The name of the database to export.
        """
        super().__init__()
        self.database = database
        self.db = pymysql.connect(host=host, port=port, user=user,
                                  passwd=password, db=database, charset='utf8')
        self._load_metadata()

    def __del__(self):
        self.db.close()

    def _load_metadata(self):
        cursor = self.db.cursor(pymysql.cursors.DictCursor)
        self.metadata = MetaData()
        self.metadata.set_tables(self._tables(cursor))
        self.metadata.set_foreign_keys(self._foreign_keys(cursor))
        cursor.close()

    def export_metadata(self, path_to_json):
        self.metadata.to_json(path_to_json)

    def export_tables(self, path_to_output):
        for table in self.metadata.data["tables"]:
            cursor = self.db.cursor(pymysql.cursors.Cursor)
            cursor.execute("select * from `" + table["name"] + "`;")
            column_names = [column[0] for column in cursor.description]
            rows = [list(row) for row in cursor.fetchall()]
            df = pd.DataFrame(rows, columns=column_names)
            df.to_csv(os.path.join(path_to_output, "%s.csv" % table["name"]), index=False)
            cursor.close()

    def _tables(self, cursor):
        tables = []
        for table_name in self._table_names(cursor):
            table_metadata = self._table_metadata(cursor, table_name)
            table_metadata["id"] = table_name  # table names are unique
            tables.append(table_metadata)
        return tables

    def _foreign_keys(self, cursor):
        # Select Foreign Key Constraints
        cursor.execute("""SELECT
            CONSTRAINT_NAME, TABLE_NAME, COLUMN_NAME,
            REFERENCED_TABLE_NAME, REFERENCED_COLUMN_NAME
            FROM INFORMATION_SCHEMA.KEY_COLUMN_USAGE
            WHERE REFERENCED_TABLE_SCHEMA = '%s'""" % self.database)
        foreign_keys = cursor.fetchall()

        # Aggregate Composite Keys
        foreign_key_constraints = defaultdict(list)
        for row in foreign_keys:
            foreign_key_constraints[row["CONSTRAINT_NAME"]].append(row)

        for constraint_name, constraint_rows in list(foreign_key_constraints.items()):
            constraint = {
                "table": None,
                "field": [],
                "ref_table": None,
                "ref_field": []
            }
            for row in constraint_rows:
                constraint["table"] = row["TABLE_NAME"]
                constraint["field"].append(row["COLUMN_NAME"])
                constraint["ref_table"] = row["REFERENCED_TABLE_NAME"]
                constraint["ref_field"].append(row["REFERENCED_COLUMN_NAME"])

            # Simplify Non-Composite Keys
            if len(constraint["field"]) == 1:
                constraint["field"] = constraint["field"][0]
                constraint["ref_field"] = constraint["ref_field"][0]

            foreign_key_constraints[constraint_name] = constraint

        return list(foreign_key_constraints.values())

    def _table_names(self, cursor):
        cursor.execute("show tables;")
        table_names = []
        for row in cursor.fetchall():
            table_names.append(list(row.values())[0])
        return table_names

    def _table_metadata(self, cursor, table_name):
        cursor.execute("""
            SELECT * FROM INFORMATION_SCHEMA.COLUMNS
            WHERE table_name = '%s' AND table_schema = '%s'
            ORDER BY ORDINAL_POSITION""" % (table_name, self.database))

        fields = []
        primary_key = []
        for row in cursor.fetchall():
            data_type = self._get_dtype(row["DATA_TYPE"]).copy()
            if row["COLUMN_KEY"] == "PRI":
                primary_key.append(row["COLUMN_NAME"])

            data_type["name"] = row["COLUMN_NAME"]
            fields.append(data_type)

        table = {"name": table_name, "fields": fields}
        if primary_key:
            # Simplify single-column primary keys to a bare string
            table["primary_key"] = primary_key[0] if len(primary_key) == 1 else primary_key

        return table

    def _get_dtype(self, mysql_dtype):
        if mysql_dtype in {"int", "tinyint", "smallint"}:
            return {
                'data_type': 'numerical',
                'data_subtype': 'integer',
            }

        if mysql_dtype in {"bit", "binary"}:
            return {
                'data_type': 'categorical',
                'data_subtype': 'boolean',
            }

        if mysql_dtype in {"text", "tinytext", "longtext", "mediumtext",
                           "varchar", "char", "enum"}:
            return {
                'data_type': 'categorical',
            }

        if mysql_dtype in {"datetime", "year", "timestamp", "time", "date"}:
            return {
                'data_type': 'datetime',
            }

        if mysql_dtype in {"float", "double", "bigint", "mediumint", "decimal"}:
            return {
                'data_type': 'numerical',
                'data_subtype': 'float',
            }

        return {'data_type': 'other'}
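# End-to-end usage sketch for MySQLConnector. Connection details and output
# paths are placeholders; the methods are the ones defined above.
connector = MySQLConnector(host='localhost', port=3306, user='reader',
                           password='secret', database='employees')
connector.export_metadata('employees/metadata.json')
connector.export_tables('employees')  # writes one CSV per table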
def __init__(self):
    self.metadata = MetaData()