def test_build_demo_metadata_without_tables():
    """Build the demo metadata field by field and compare it to ``DEMO_METADATA``.

    The tables, fields, primary keys and relationships are declared in a
    data-driven spec and replayed against a fresh ``Metadata`` instance,
    issuing exactly the same calls (and in the same order) as building it
    by hand.
    """
    # (table, primary_key, parent, fields) where each field is
    # (field_name, positional_args, keyword_args).
    demo_tables = (
        ('users', 'user_id', None, (
            ('user_id', ('id', 'integer'), {}),
            ('country', ('categorical',), {}),
            ('gender', ('categorical',), {}),
            ('age', ('numerical', 'integer'), {}),
        )),
        ('sessions', 'session_id', 'users', (
            ('session_id', ('id', 'integer'), {}),
            ('user_id', ('id', 'integer'), {}),
            ('device', ('categorical',), {}),
            ('os', ('categorical',), {}),
            ('minutes', ('numerical', 'integer'), {}),
        )),
        ('transactions', 'transaction_id', 'sessions', (
            ('transaction_id', ('id', 'integer'), {}),
            ('session_id', ('id', 'integer'), {}),
            ('timestamp', ('datetime',), {'properties': {'format': '%Y-%m-%dT%H:%M'}}),
            ('amount', ('numerical', 'float'), {}),
            ('cancelled', ('boolean',), {}),
        )),
    )

    metadata = Metadata()
    for table, primary_key, parent, fields in demo_tables:
        metadata.add_table(table)
        for field_name, args, kwargs in fields:
            metadata.add_field(table, field_name, *args, **kwargs)

        metadata.set_primary_key(table, primary_key)
        # A relationship can only be added once the child table exists,
        # which matches the original call order.
        if parent:
            metadata.add_relationship(parent, table)

    assert DEMO_METADATA == metadata.to_dict()
def test_build_demo_metadata_from_tables():
    """Build metadata from the demo tables.

    Then compare the built metadata with the demo one to make sure that
    they are the same.
    """
    demo_tables = load_demo(metadata=False)

    built = Metadata()
    built.add_table('users', data=demo_tables['users'], primary_key='user_id')
    built.add_table(
        'sessions',
        data=demo_tables['sessions'],
        primary_key='session_id',
        parent='users',
        foreign_key='user_id',
    )
    # The timestamp column needs explicit field metadata because its
    # datetime format cannot be inferred from the raw data alone.
    built.add_table(
        'transactions',
        demo_tables['transactions'],
        fields_metadata={
            'timestamp': {
                'type': 'datetime',
                'format': '%Y-%m-%dT%H:%M'
            }
        },
        primary_key='transaction_id',
        parent='sessions',
    )

    assert DEMO_METADATA == built.to_dict()
def make_dataset(name, data, table_name=None, entity_columns=None,
                 sequence_index=None, datasets_path='.'):
    """Make a Dataset from a DataFrame.

    Creates (or recreates from scratch) the folder ``datasets_path/name``
    containing a ``<table_name>.csv`` file with the data and a
    ``metadata.json`` file describing it.

    Args:
        name (str):
            Name of this dataset.
        data (pandas.DataFrame or str):
            Data passed as a DataFrame or as a path to a CSV file.
        table_name (str or None):
            Optionally give the table a different name.
        entity_columns (list or None):
            (Optional) List of names of the columns that form the
            entity_id of this dataset. If ``None`` (default), no entity
            columns are set.
        sequence_index (str or None):
            (Optional) Name of the column that is the sequence index of
            this dataset.
        datasets_path (str):
            (Optional) Path to the folder in which a new folder will be
            created for this dataset. Defaults to the current working
            directory.
    """
    if isinstance(data, str):
        data = pd.read_csv(data)

    base_path = os.path.join(datasets_path, name)
    # Start from a clean folder so stale files from a previous run never
    # leak into the generated dataset.
    if os.path.exists(base_path):
        shutil.rmtree(base_path)

    os.makedirs(base_path, exist_ok=True)

    table_name = table_name or name
    cwd = os.getcwd()
    try:
        # Work inside the dataset folder so the CSV path stored in the
        # metadata stays relative to it.
        os.chdir(base_path)
        csv_name = table_name + '.csv'
        data.to_csv(csv_name, index=False)

        metadata = Metadata()
        # BUGFIX: register the table under ``table_name`` (not ``name``).
        # The lookup below uses ``table_name`` and previously raised a
        # KeyError whenever a custom table_name was passed.
        metadata.add_table(table_name, csv_name)
        meta_dict = metadata.to_dict()
        table_meta = meta_dict['tables'][table_name]
        table_meta['entity_columns'] = entity_columns or []
        table_meta['sequence_index'] = sequence_index
        table_meta['deepecho_version'] = Dataset.VERSION

        with open('metadata.json', 'w') as metadata_file:
            json.dump(meta_dict, metadata_file, indent=4)

        LOGGER.info('Dataset %s generated in folder %s', name, base_path)
    finally:
        # Always restore the caller's working directory, even on failure.
        os.chdir(cwd)