def _score_dataset(dataset, datasets_path, output):
    start = datetime.now()
    try:
        if datasets_path is None:
            metadata, tables = load_demo(dataset, metadata=True)
        else:
            metadata = Metadata(os.path.join(datasets_path, dataset, 'metadata.json'))
            tables = metadata.load_tables()

        sdv = SDV()
        LOGGER.info('Modeling dataset %s', dataset)
        sdv.fit(metadata, tables)

        LOGGER.info('Sampling dataset %s', dataset)
        sampled = sdv.sample_all(10)

        LOGGER.info('Evaluating dataset %s', dataset)
        score = evaluate(sampled, metadata=metadata)

        LOGGER.info('%s: %s - ELAPSED: %s', dataset, score, datetime.now() - start)
        output.update({
            'dataset': dataset,
            'score': score,
        })

    except Exception as ex:
        error = '{}: {}'.format(type(ex).__name__, str(ex))
        LOGGER.error('%s: %s - ELAPSED: %s', dataset, error, datetime.now() - start)
        output.update({'dataset': dataset, 'error': error})
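# Hedged usage sketch (not part of the function above): a minimal driver that scores a
# few datasets and collects the per-dataset results. The dataset names are placeholders,
# and pandas is assumed to be imported as pd alongside the helpers used by _score_dataset.
results = []
for dataset_name in ['demo_dataset_1', 'demo_dataset_2']:  # hypothetical names
    output = {}
    _score_dataset(dataset_name, None, output)
    results.append(output)

scores = pd.DataFrame(results)  # one row per dataset, with either a score or an error
print(scores)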
def make_dataset(name, data, table_name=None, entity_columns=None,
                 sequence_index=None, datasets_path='.'):
    """Make a Dataset from a DataFrame.

    Args:
        name (str):
            Name of this dataset.
        data (pandas.DataFrame or str):
            Data passed as a DataFrame or as a path to a CSV file.
        table_name (str or None):
            Optionally give the table a different name.
        entity_columns (list or None):
            (Optional) List of names of the columns that form the entity_id of
            this dataset. If ``None`` (default), no entity columns are set.
        sequence_index (str or None):
            (Optional) Name of the column that is the sequence index of this dataset.
        datasets_path (str):
            (Optional) Path to the folder in which a new folder will be created
            for this dataset. Defaults to the current working directory.
    """
    if isinstance(data, str):
        data = pd.read_csv(data)

    base_path = os.path.join(datasets_path, name)
    if os.path.exists(base_path):
        shutil.rmtree(base_path)

    os.makedirs(base_path, exist_ok=True)

    table_name = table_name or name
    cwd = os.getcwd()
    try:
        os.chdir(base_path)
        csv_name = table_name + '.csv'
        data.to_csv(csv_name, index=False)

        metadata = Metadata()
        metadata.add_table(table_name, csv_name)

        meta_dict = metadata.to_dict()
        table_meta = meta_dict['tables'][table_name]
        table_meta['entity_columns'] = entity_columns or []
        table_meta['sequence_index'] = sequence_index
        table_meta['deepecho_version'] = Dataset.VERSION

        with open('metadata.json', 'w') as metadata_file:
            json.dump(meta_dict, metadata_file, indent=4)

        LOGGER.info('Dataset %s generated in folder %s', name, base_path)

    finally:
        os.chdir(cwd)
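# Hedged usage sketch for make_dataset above. The DataFrame, column names and dataset
# name are illustrative only; pandas is assumed to be imported as pd.
example_data = pd.DataFrame({
    'store_id': [1, 1, 2, 2],                                         # entity column
    'day': ['2020-01-01', '2020-01-02', '2020-01-01', '2020-01-02'],  # sequence index
    'sales': [10.0, 12.5, 7.3, 8.1],                                  # value to model
})
make_dataset(
    'example_sales',
    example_data,
    entity_columns=['store_id'],
    sequence_index='day',
    datasets_path='.',
)
# This writes ./example_sales/example_sales.csv and ./example_sales/metadata.json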
def test_evaluate_tables_from_demo():
    tables = load_demo(metadata=False)

    new_meta = Metadata()
    new_meta.add_table('users', data=tables['users'], primary_key='user_id')
    new_meta.add_table('sessions', data=tables['sessions'], primary_key='session_id',
                       parent='users', foreign_key='user_id')
    transactions_fields = {
        'timestamp': {
            'type': 'datetime',
            'format': '%Y-%m-%d'
        }
    }
    new_meta.add_table('transactions', tables['transactions'],
                       fields_metadata=transactions_fields,
                       primary_key='transaction_id', parent='sessions')

    sdv = SDV()
    sdv.fit(new_meta, tables=tables)
    sampled = sdv.sample_all()

    table_scores = dict()
    for table in new_meta.get_tables():
        table_scores[table] = evaluate(
            sampled[table], real=tables[table], metadata=new_meta, table_name=table)

    evaluate(sampled, real=tables, metadata=new_meta)
def test_build_demo_metadata_from_tables():
    """Build metadata from the demo tables.

    Then compare the built metadata with the demo one
    to make sure that they are the same.
    """
    tables = load_demo(metadata=False)

    new_meta = Metadata()
    new_meta.add_table('users', data=tables['users'], primary_key='user_id')
    new_meta.add_table('sessions', data=tables['sessions'], primary_key='session_id',
                       parent='users', foreign_key='user_id')
    transactions_fields = {
        'timestamp': {
            'type': 'datetime',
            'format': '%Y-%m-%dT%H:%M'
        }
    }
    new_meta.add_table('transactions', tables['transactions'],
                       fields_metadata=transactions_fields,
                       primary_key='transaction_id', parent='sessions')

    assert DEMO_METADATA == new_meta.to_dict()
def _load_metadata(self):
    dataset_path = os.path.join(DATA_DIR, self.name)
    metadata_path = os.path.join(dataset_path, 'metadata.json')

    try:
        self.metadata = Metadata(metadata_path)
        version = self.metadata.get_table_meta(self.table_name)['deepecho_version']
        assert version == self.VERSION
    except Exception:
        self._download()
        self.metadata = Metadata(metadata_path)
def __init__(self, dataset, table_name=None, max_entities=None, segment_size=None):
    if os.path.isdir(dataset):
        self.name = os.path.basename(dataset)
        self.table_name = table_name or self.name
        self.metadata = Metadata(os.path.join(dataset, 'metadata.json'))
    else:
        self.name = dataset
        self.table_name = table_name or self.name
        self._load_metadata()

    self._load_table()

    table_meta = self.metadata.get_table_meta(self.table_name)
    self.entity_columns = table_meta.get('entity_columns') or []
    self.sequence_index = table_meta.get('sequence_index')
    if 'context_columns' in table_meta:
        self.context_columns = table_meta['context_columns']
    else:
        self.context_columns = self._get_context_columns()

    self.model_columns = [
        column
        for column in self.data.columns
        if column not in self.entity_columns + self.context_columns + [self.sequence_index]
    ]

    if max_entities:
        self._filter_entities(max_entities)

    if not segment_size:
        self.evaluation_data = self.data
    else:
        self.evaluation_data = self._get_evaluation_data(segment_size)
def load_multi_foreign_key():
    parent = pd.DataFrame({
        'parent_id': range(10),
        'value': range(10)
    })
    child = pd.DataFrame({
        'parent_1_id': range(10),
        'parent_2_id': range(10),
        'value': range(10)
    })

    metadata = Metadata()
    metadata.add_table('parent', parent, primary_key='parent_id')
    metadata.add_table('child', child, parent='parent', foreign_key='parent_1_id')
    metadata.add_relationship('parent', 'child', 'parent_2_id')

    return metadata, {'parent': parent, 'child': child}
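# Hedged usage sketch for load_multi_foreign_key above: the extra add_relationship call
# registers parent_2_id as a second foreign key from child to parent.
metadata, tables = load_multi_foreign_key()
print(metadata.get_tables())   # expected to list 'parent' and 'child'
print(metadata.to_dict()['tables']['child'].get('fields', {}).keys())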
def load(dataset, is_path=False):
    """Load an SDMetrics dataset.

    A dataset consists of a metadata object, a set of real tables, a set of
    low quality synthetic tables, and a set of high quality synthetic tables.

    Arguments:
        dataset (str):
            The name of the dataset (or the path to the dataset).
        is_path (bool):
            Whether ``dataset`` is a path to the dataset folder instead of a name.

    Returns:
        (Dataset): An instance of the Dataset object.
    """
    if is_path:
        path_to_dataset = dataset
    else:
        path_to_dataset = os.path.join(_dir_, dataset)

    metadata = Metadata(os.path.join(path_to_dataset, "metadata.json"))
    tables = Dataset._load_tables(path_to_dataset)
    lq_synthetic = Dataset._load_tables(os.path.join(path_to_dataset, "low_quality"))
    hq_synthetic = Dataset._load_tables(os.path.join(path_to_dataset, "high_quality"))
    return Dataset(metadata, tables, lq_synthetic, hq_synthetic)
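# Hedged usage sketch for load above. The dataset name and path are placeholders; the
# two calls only illustrate the name-based and path-based conventions.
dataset = load("some_dataset")                                # resolved relative to _dir_
dataset_from_path = load("/tmp/some_dataset", is_path=True)   # explicit dataset folder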
def load_dataset(dataset, datasets_path=None, bucket=None):
    dataset_path = _get_dataset_path(dataset, datasets_path, bucket)
    metadata = Metadata(str(dataset_path / 'metadata.json'))
    tables = metadata.get_tables()
    if not hasattr(metadata, 'modality'):
        if len(tables) > 1:
            modality = 'multi-table'
        else:
            table = metadata.get_table_meta(tables[0])
            if any(table.get(field) for field in TIMESERIES_FIELDS):
                modality = 'timeseries'
            else:
                modality = 'single-table'

        metadata._metadata['modality'] = modality
        metadata.modality = modality

    if not hasattr(metadata, 'name'):
        metadata._metadata['name'] = dataset_path.name
        metadata.name = dataset_path.name

    return metadata
def load_dataset(dataset, datasets_path=None, bucket=None, aws_key=None,
                 aws_secret=None, max_columns=None):
    dataset_path = _get_dataset_path(dataset, datasets_path, bucket, aws_key, aws_secret)
    with open(dataset_path / 'metadata.json') as metadata_file:
        metadata_content = json.load(metadata_file)

    if max_columns:
        if len(metadata_content['tables']) > 1:
            raise ValueError('max_columns is not supported for multi-table datasets')

        _apply_max_columns_to_metadata(metadata_content, max_columns)

    metadata = Metadata(metadata_content, dataset_path)
    tables = metadata.get_tables()
    if not hasattr(metadata, 'modality'):
        if len(tables) > 1:
            modality = 'multi-table'
        else:
            table = metadata.get_table_meta(tables[0])
            if any(table.get(field) for field in TIMESERIES_FIELDS):
                modality = 'timeseries'
            else:
                modality = 'single-table'

        metadata._metadata['modality'] = modality
        metadata.modality = modality

    if not hasattr(metadata, 'name'):
        metadata._metadata['name'] = dataset_path.name
        metadata.name = dataset_path.name

    return metadata
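# Hedged usage sketch for load_dataset above. The dataset name and path are placeholders;
# the call shape mirrors the signature defined above, trimming a single-table dataset to
# at most 10 columns before loading its metadata.
metadata = load_dataset('example_single_table',
                        datasets_path='/path/to/datasets',
                        max_columns=10)
print(metadata.modality, metadata.name)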
def test_build_demo_metadata_without_tables():
    metadata = Metadata()

    metadata.add_table('users')
    metadata.add_field('users', 'user_id', 'id', 'integer')
    metadata.add_field('users', 'country', 'categorical')
    metadata.add_field('users', 'gender', 'categorical')
    metadata.add_field('users', 'age', 'numerical', 'integer')
    metadata.set_primary_key('users', 'user_id')

    metadata.add_table('sessions')
    metadata.add_field('sessions', 'session_id', 'id', 'integer')
    metadata.add_field('sessions', 'user_id', 'id', 'integer')
    metadata.add_field('sessions', 'device', 'categorical')
    metadata.add_field('sessions', 'os', 'categorical')
    metadata.add_field('sessions', 'minutes', 'numerical', 'integer')
    metadata.set_primary_key('sessions', 'session_id')
    metadata.add_relationship('users', 'sessions')

    metadata.add_table('transactions')
    metadata.add_field('transactions', 'transaction_id', 'id', 'integer')
    metadata.add_field('transactions', 'session_id', 'id', 'integer')
    metadata.add_field('transactions', 'timestamp', 'datetime',
                       properties={'format': '%Y-%m-%dT%H:%M'})
    metadata.add_field('transactions', 'amount', 'numerical', 'float')
    metadata.add_field('transactions', 'cancelled', 'boolean')
    metadata.set_primary_key('transactions', 'transaction_id')
    metadata.add_relationship('sessions', 'transactions')

    assert DEMO_METADATA == metadata.to_dict()
tables = {
    "table1": pd.DataFrame({
        "x": np.random.random(size=size),
        "y": np.random.normal(size=size, loc=10.0)
    })
}
lq_synthetic = {
    "table1": pd.DataFrame({
        "x": np.random.random(size=size) + np.random.normal(size=size),
        "y": np.random.normal(size=size, loc=10.0) + np.random.normal(size=size)
    })
}
hq_synthetic = {
    "table1": pd.DataFrame({
        "x": np.random.random(size=size) + np.random.normal(size=size) / 10.0,
        "y": np.random.normal(size=size, loc=10.0) + np.random.normal(size=size) / 10.0
    })
}

metadata = Metadata()
for table_name, df in tables.items():
    metadata.add_table(table_name, data=df)

dataset = Dataset(metadata, tables, lq_synthetic, hq_synthetic)
dataset.save(os.path.dirname(__file__))
class Dataset:
    """Dataset abstraction for benchmarking.

    This class loads a TimeSeries dataset from an sdv.Metadata
    in the format expected by DeepEcho models.

    It handles the extraction of the context columns by analyzing the
    data and identifying the columns that are constant for each entity_id.

    Args:
        dataset (str):
            Path to the dataset folder, where the metadata.json can be found,
            or name of a dataset to download.
        table_name (str):
            Optionally indicate which table to load. Defaults to the dataset name.
        max_entities (int):
            Optionally restrict the number of entities to the indicated amount.
            If not given, use all the entities from the dataset.
        segment_size (int, pd.Timedelta or str):
            If specified, cut each training sequence into several segments of the
            indicated size. The size can either be passed as an integer value,
            which will be interpreted as the number of data points to put in each
            segment, or as a pd.Timedelta (or equivalent str representation),
            which will be interpreted as the segment length in time. Timedelta
            segment sizes can only be used with sequence indexes of type datetime.
    """

    VERSION = '0.1.1'

    def _load_table(self):
        columns = list(self.metadata.get_fields(self.table_name).keys())
        primary_key = self.metadata.get_primary_key(self.table_name)
        if primary_key:
            columns.remove(primary_key)

        self.data = self.metadata.load_table(self.table_name)[columns]

    @staticmethod
    def _is_constant(column):
        def wrapped(group):
            return len(group[column].unique()) == 1

        return wrapped

    def _get_context_columns(self):
        context_columns = []
        candidate_columns = set(self.data.columns) - set(self.entity_columns)
        if self.entity_columns:
            # A context column is constant within each entity.
            for column in candidate_columns:
                if self.data.groupby(self.entity_columns).apply(
                        self._is_constant(column)).all():
                    context_columns.append(column)
        else:
            # Without entity columns, a context column is constant across the whole table.
            for column in candidate_columns:
                if self.data[column].nunique() == 1:
                    context_columns.append(column)

        return context_columns

    def _download(self):
        os.makedirs(DATA_DIR, exist_ok=True)
        filename = '{}_v{}.zip'.format(self.name, self.VERSION)
        url = urljoin(DATA_URL, filename)
        LOGGER.info('Downloading dataset %s from %s', self.name, url)
        with urlopen(url) as remote:
            with ZipFile(BytesIO(remote.read())) as zipfile:
                zipfile.extractall(DATA_DIR)

    def _filter_entities(self, max_entities):
        entities = self.data[self.entity_columns].drop_duplicates()
        if max_entities < len(entities):
            entities = entities.sample(max_entities)

            data = pd.DataFrame()
            for _, row in entities.iterrows():
                mask = [True] * len(self.data)
                for column in self.entity_columns:
                    mask &= self.data[column] == row[column]

                data = data.append(self.data[mask])

            self.data = data

    def _get_evaluation_data(self, segment_size):
        sequences = assemble_sequences(self.data, self.entity_columns, self.context_columns,
                                       segment_size, self.sequence_index)
        evaluation_data = pd.DataFrame(columns=self.data.columns)
        for idx, sequence in enumerate(sequences):
            sequence_df = pd.DataFrame(sequence['data'], index=self.model_columns).T
            for column, value in zip(self.context_columns, sequence['context']):
                sequence_df[column] = value

            for column in self.entity_columns:
                sequence_df[column] = idx

            evaluation_data = evaluation_data.append(sequence_df)

        return evaluation_data

    def _load_metadata(self):
        dataset_path = os.path.join(DATA_DIR, self.name)
        metadata_path = os.path.join(dataset_path, 'metadata.json')

        try:
            self.metadata = Metadata(metadata_path)
            version = self.metadata.get_table_meta(self.table_name)['deepecho_version']
            assert version == self.VERSION
        except Exception:
            self._download()
            self.metadata = Metadata(metadata_path)

    def __init__(self, dataset, table_name=None, max_entities=None, segment_size=None):
        if os.path.isdir(dataset):
            self.name = os.path.basename(dataset)
            self.table_name = table_name or self.name
            self.metadata = Metadata(os.path.join(dataset, 'metadata.json'))
        else:
            self.name = dataset
            self.table_name = table_name or self.name
            self._load_metadata()

        self._load_table()

        table_meta = self.metadata.get_table_meta(self.table_name)
        self.entity_columns = table_meta.get('entity_columns') or []
        self.sequence_index = table_meta.get('sequence_index')
        if 'context_columns' in table_meta:
            self.context_columns = table_meta['context_columns']
        else:
            self.context_columns = self._get_context_columns()

        self.model_columns = [
            column
            for column in self.data.columns
            if column not in self.entity_columns + self.context_columns + [self.sequence_index]
        ]

        if max_entities:
            self._filter_entities(max_entities)

        if not segment_size:
            self.evaluation_data = self.data
        else:
            self.evaluation_data = self._get_evaluation_data(segment_size)

    def describe(self):
        """Describe this dataset.

        The output is a ``pandas.Series`` containing:
            * ``entities``: Number of entities in the dataset.
            * ``entity_columns``: Number of entity columns.
            * ``context_columns``: Number of context columns.
            * ``model_columns``: Number of model columns.
            * ``max_sequence_len``: Maximum sequence length.
            * ``min_sequence_len``: Minimum sequence length.

        Returns:
            pandas.Series
        """
        groupby = self.data.groupby(self.entity_columns)
        sizes = groupby.size()
        return pd.Series({
            'entities': len(sizes),
            'entity_columns': len(self.entity_columns),
            'context_columns': len(self.context_columns),
            'model_columns': len(self.model_columns),
            'max_sequence_len': sizes.max(),
            'min_sequence_len': sizes.min(),
        })

    def __repr__(self):
        return "Dataset('{}')".format(self.name)
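# Hedged usage sketch for the Dataset class above. The dataset name and the numeric
# arguments are placeholders; a name that is not a local folder triggers a download.
dataset = Dataset('example_timeseries', max_entities=100, segment_size=50)
print(dataset.describe())
print(dataset.evaluation_data.head())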
]
suppliers['Supplier address'] = [
    fake.address().replace('\n', '') for _ in range(suppliers.shape[0])
]

suppliers.to_csv('Suppliers.csv', index=False)

tables = {
    'Products': products,
    'Suppliers': suppliers,
    'Customers': customers,
    'Sales orders': sales_orders,
    'Purchase orders': purchase_orders
}

metadata = Metadata()
metadata.add_table(name='Products', data=tables['Products'],
                   primary_key='Product id')
metadata.add_table(name='Sales orders', data=tables['Sales orders'],
                   primary_key='Sales order id', foreign_key='Product id',
                   parent='Products')
metadata.add_table(name='Purchase orders', data=tables['Purchase orders'],
                   primary_key='Purchase order id', foreign_key='Product id',
                   parent='Products')
metadata.add_table(name='Customers', data=tables['Customers'],
dataset_name = os.path.basename(path_to_train)
dataset_name = dataset_name.replace(".ts", "")
dataset_name = dataset_name.replace("_TRAIN", "")

dataset_dir = "datasets/%s" % dataset_name
os.makedirs(dataset_dir, exist_ok=True)

path_to_test = path_to_train.replace("_TRAIN", "_TEST")
path_to_csv = os.path.join(dataset_dir, "%s.csv" % dataset_name)
path_to_metadata = os.path.join(dataset_dir, "metadata.json")
path_to_readme = os.path.join(dataset_dir, "README.md")
print(path_to_csv, path_to_metadata, path_to_readme)

df = to_our_format(path_to_train, path_to_test)
df.to_csv(path_to_csv, index=False)

metadata = Metadata()
metadata.add_table('data', data=df, primary_key='e_id')
metadata.to_json(path_to_metadata)

with open(os.path.join(dataset_dir, "task.json"), "wt") as fout:
    json.dump({
        "task_type": "classification",
        "key": ["e_id"],
        "target": "ml_class",
        "ignored": ["tt_split", "s_index"]
    }, fout)

with open(path_to_readme, "wt") as fout:
    fout.write("""# %s

This dataset originates from the Time Series Classification