# Shared imports assumed by the snippets below. The matrix-store classes
# (CSVMatrixStore, HDFMatrixStore, InMemoryMatrixStore, MatrixStore,
# ProjectStorage) and the metta archiver are assumed to be imported from the
# project under test; DATA_DICT and METADATA are module-level test constants
# assumed to be defined elsewhere in the suite.
import datetime
import os
import tempfile
from collections import OrderedDict

import boto3
import pandas
import pandas as pd
import yaml
from moto import mock_s3


def sample_metta_csv_diff_order(directory):
    """Stores matrix and metadata in a metta-data-like form

    The train and test matrices will have different column orders

    Args:
        directory (str)
    """
    train_dict = OrderedDict([
        ('entity_id', [1, 2]),
        ('k_feature', [0.5, 0.4]),
        ('m_feature', [0.4, 0.5]),
        ('label', [0, 1])
    ])
    train_matrix = pandas.DataFrame.from_dict(train_dict)
    train_metadata = {
        'feature_start_time': datetime.date(2014, 1, 1),
        'end_time': datetime.date(2015, 1, 1),
        'matrix_id': 'train_matrix',
        'label_name': 'label',
        'label_timespan': '3month',
        'indices': ['entity_id'],
    }

    test_dict = OrderedDict([
        ('entity_id', [3, 4]),
        ('m_feature', [0.4, 0.5]),
        ('k_feature', [0.5, 0.4]),
        ('label', [0, 1])
    ])
    test_matrix = pandas.DataFrame.from_dict(test_dict)
    test_metadata = {
        'feature_start_time': datetime.date(2015, 1, 1),
        'end_time': datetime.date(2016, 1, 1),
        'matrix_id': 'test_matrix',
        'label_name': 'label',
        'label_timespan': '3month',
        'indices': ['entity_id'],
    }

    train_uuid, test_uuid = metta.archive_train_test(
        train_config=train_metadata,
        df_train=train_matrix,
        test_config=test_metadata,
        df_test=test_matrix,
        directory=directory,
        format='csv'
    )

    train_store = CSVMatrixStore(
        matrix_path=os.path.join(directory, '{}.csv'.format(train_uuid)),
        metadata_path=os.path.join(directory, '{}.yaml'.format(train_uuid))
    )
    test_store = CSVMatrixStore(
        matrix_path=os.path.join(directory, '{}.csv'.format(test_uuid)),
        metadata_path=os.path.join(directory, '{}.yaml'.format(test_uuid))
    )
    return train_store, test_store
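
# Hypothetical usage of the helper above (not part of the original suite):
# both stores should load, and the column sets should match even though the
# on-disk column order differs between the train and test matrices.
def test_sample_metta_csv_diff_order_usage():
    with tempfile.TemporaryDirectory() as tmpdir:
        train_store, test_store = sample_metta_csv_diff_order(tmpdir)
        assert set(train_store.matrix.columns) == set(test_store.matrix.columns)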

def test_as_of_dates_entity_index(self):
    data = {
        "entity_id": [1, 2],
        "feature_one": [0.5, 0.6],
        "feature_two": [0.5, 0.6],
    }
    with tempfile.TemporaryDirectory() as tmpdir:
        project_storage = ProjectStorage(tmpdir)
        matrix_store = CSVMatrixStore(project_storage, [], "test")
        matrix_store.matrix = pd.DataFrame.from_dict(data)
        matrix_store.metadata = {"end_time": "2016-01-01", "indices": ["entity_id"]}
        self.assertEqual(matrix_store.as_of_dates, ["2016-01-01"])

def test_as_of_dates_entity_index(project_storage):
    data = {
        "entity_id": [1, 2],
        "feature_one": [0.5, 0.6],
        "feature_two": [0.5, 0.6],
        "label": [0, 1],
    }
    df = pd.DataFrame.from_dict(data)
    labels = df.pop("label")
    matrix_store = CSVMatrixStore(project_storage, [], "test")
    matrix_store.matrix_label_tuple = df, labels
    matrix_store.metadata = {
        "end_time": "2016-01-01",
        "indices": ["entity_id"],
        "label_name": "label",
    }
    assert matrix_store.as_of_dates == ["2016-01-01"]

def test_as_of_dates(project_storage):
    data = {
        "entity_id": [1, 2, 1, 2],
        "feature_one": [0.5, 0.6, 0.5, 0.6],
        "feature_two": [0.5, 0.6, 0.5, 0.6],
        "as_of_date": [
            pd.Timestamp(2016, 1, 1),
            pd.Timestamp(2016, 1, 1),
            pd.Timestamp(2017, 1, 1),
            pd.Timestamp(2017, 1, 1),
        ],
        "label": [1, 0, 1, 0],
    }
    df = pd.DataFrame.from_dict(data)
    matrix_store = CSVMatrixStore(
        project_storage,
        [],
        "test",
        matrix=df,
        metadata={"indices": ["entity_id", "as_of_date"], "label_name": "label"},
    )
    assert matrix_store.as_of_dates == [
        datetime.date(2016, 1, 1),
        datetime.date(2017, 1, 1),
    ]
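
# For reference, a sketch of the logic the as_of_dates property appears to
# encode, judging from the assertions in these tests; this is an assumption,
# not the library's actual implementation. Unique index dates are returned
# when as_of_date is part of the index, otherwise the metadata end_time.
def as_of_dates_sketch(design_matrix, metadata):
    if "as_of_date" in metadata["indices"]:
        values = design_matrix.index.get_level_values("as_of_date").unique()
        # pd.Timestamp entries come back as datetime.date; strings pass through
        return sorted(v.date() if isinstance(v, pd.Timestamp) else v for v in values)
    return [metadata["end_time"]]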

def matrix_stores():
    df = pd.DataFrame.from_dict(DATA_DICT).set_index(MatrixStore.indices)

    with tempfile.TemporaryDirectory() as tmpdir:
        project_storage = ProjectStorage(tmpdir)
        tmpcsv = os.path.join(tmpdir, "df.csv.gz")
        tmpyaml = os.path.join(tmpdir, "df.yaml")
        with open(tmpyaml, "w") as outfile:
            yaml.dump(METADATA, outfile, default_flow_style=False)
        df.to_csv(tmpcsv, compression="gzip")
        csv = CSVMatrixStore(project_storage, [], "df")
        # first test with caching
        with csv.cache():
            yield csv
        # with the caching out of scope the cache will be nuked
        # and this last version will not have any cache
        yield csv
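
# Hypothetical consumer of the generator above (not from the original suite):
# iterating runs each assertion once against the cached store and once
# against the uncached one.
def test_matrix_store_roundtrip_sketch():
    for store in matrix_stores():
        assert not store.design_matrix.empty
        # assumes METADATA survives the yaml round trip unchanged
        assert store.metadata == METADATA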

def matrix_store(self):
    data_dict = OrderedDict([
        ('entity_id', [1, 2]),
        ('k_feature', [0.5, 0.4]),
        ('m_feature', [0.4, 0.5]),
        ('label', [0, 1])
    ])
    df = pd.DataFrame.from_dict(data_dict)
    metadata = {
        'label_name': 'label',
        'indices': ['entity_id'],
    }
    inmemory = InMemoryMatrixStore(matrix=df, metadata=metadata)

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpcsv = os.path.join(tmpdir, 'df.csv')
        tmpyaml = os.path.join(tmpdir, 'metadata.yaml')
        tmphdf = os.path.join(tmpdir, 'df.h5')
        with open(tmpyaml, 'w') as outfile:
            yaml.dump(metadata, outfile, default_flow_style=False)
        df.to_csv(tmpcsv)
        df.to_hdf(tmphdf, 'matrix')
        csv = CSVMatrixStore(matrix_path=tmpcsv, metadata_path=tmpyaml)
        hdf = HDFMatrixStore(matrix_path=tmphdf, metadata_path=tmpyaml)

        assert csv.matrix.to_dict() == inmemory.matrix.to_dict()
        assert hdf.matrix.to_dict() == inmemory.matrix.to_dict()

        assert csv.metadata == inmemory.metadata
        assert hdf.metadata == inmemory.metadata

        assert csv.head_of_matrix.to_dict() == inmemory.head_of_matrix.to_dict()
        assert hdf.head_of_matrix.to_dict() == inmemory.head_of_matrix.to_dict()

        assert csv.empty == inmemory.empty
        assert hdf.empty == inmemory.empty

        assert csv.labels().to_dict() == inmemory.labels().to_dict()
        assert hdf.labels().to_dict() == inmemory.labels().to_dict()

        matrix_store = [inmemory, csv, hdf]
        return matrix_store

def matrix_stores():
    df = pd.DataFrame.from_dict(DATA_DICT).set_index(["entity_id"])

    with tempfile.TemporaryDirectory() as tmpdir:
        project_storage = ProjectStorage(tmpdir)
        tmpcsv = os.path.join(tmpdir, "df.csv")
        tmpyaml = os.path.join(tmpdir, "df.yaml")
        tmphdf = os.path.join(tmpdir, "df.h5")
        with open(tmpyaml, "w") as outfile:
            yaml.dump(METADATA, outfile, default_flow_style=False)
        df.to_csv(tmpcsv)
        df.to_hdf(tmphdf, "matrix")
        csv = CSVMatrixStore(project_storage, [], "df")
        hdf = HDFMatrixStore(project_storage, [], "df")
        assert csv.design_matrix.equals(hdf.design_matrix)
        # first test with caching
        with csv.cache(), hdf.cache():
            yield csv
            yield hdf
        # with the caching out of scope the caches will be nuked
        # and these last two versions will not have any cache
        yield csv
        yield hdf

def matrix_stores(self):
    df = pd.DataFrame.from_dict(self.data_dict).set_index(["entity_id"])

    with tempfile.TemporaryDirectory() as tmpdir:
        project_storage = ProjectStorage(tmpdir)
        tmpcsv = os.path.join(tmpdir, "df.csv")
        tmpyaml = os.path.join(tmpdir, "df.yaml")
        tmphdf = os.path.join(tmpdir, "df.h5")
        with open(tmpyaml, "w") as outfile:
            yaml.dump(self.metadata, outfile, default_flow_style=False)
        df.to_csv(tmpcsv)
        df.to_hdf(tmphdf, "matrix")
        csv = CSVMatrixStore(project_storage, [], "df")
        hdf = HDFMatrixStore(project_storage, [], "df")
        assert csv.matrix.equals(hdf.matrix)
        yield from [csv, hdf]

def test_as_of_dates_entity_date_index(project_storage):
    data = {
        "entity_id": [1, 2, 1, 2],
        "feature_one": [0.5, 0.6, 0.5, 0.6],
        "feature_two": [0.5, 0.6, 0.5, 0.6],
        "as_of_date": ["2016-01-01", "2016-01-01", "2017-01-01", "2017-01-01"],
        "label": [1, 0, 1, 0],
    }
    df = pd.DataFrame.from_dict(data)
    matrix_store = CSVMatrixStore(
        project_storage,
        [],
        "test",
        matrix=df,
        metadata={"indices": ["entity_id", "as_of_date"], "label_name": "label"},
    )
    assert matrix_store.as_of_dates == ["2016-01-01", "2017-01-01"]

def test_s3_save(self):
    with mock_s3():
        client = boto3.client("s3")
        client.create_bucket(Bucket="fake-matrix-bucket", ACL="public-read-write")
        example = next(self.matrix_stores())
        project_storage = ProjectStorage("s3://fake-matrix-bucket")
        tosave = CSVMatrixStore(project_storage, [], "test")
        tosave.matrix = example.matrix
        tosave.metadata = example.metadata
        tosave.save()

        tocheck = CSVMatrixStore(project_storage, [], "test")
        assert tocheck.metadata == example.metadata
        assert tocheck.matrix.to_dict() == example.matrix.to_dict()

def test_s3_save():
    with mock_s3():
        client = boto3.client("s3")
        client.create_bucket(Bucket="fake-matrix-bucket", ACL="public-read-write")
        for example in matrix_stores():
            if not isinstance(example, CSVMatrixStore):
                continue
            project_storage = ProjectStorage("s3://fake-matrix-bucket")
            tosave = CSVMatrixStore(project_storage, [], "test")
            tosave.metadata = example.metadata
            tosave.matrix_label_tuple = example.matrix_label_tuple
            tosave.save()

            tocheck = CSVMatrixStore(project_storage, [], "test")
            assert tocheck.metadata == example.metadata
            assert tocheck.design_matrix.to_dict() == example.design_matrix.to_dict()
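
# moto's mock_s3 patches boto3 in-process, so the buckets in these tests
# never touch real S3. A minimal standalone sketch of that pattern,
# independent of the matrix-store fixtures (hypothetical test, not from the
# original suite):
def test_mock_s3_roundtrip_sketch():
    with mock_s3():
        client = boto3.client("s3")
        client.create_bucket(Bucket="fake-matrix-bucket", ACL="public-read-write")
        client.put_object(Bucket="fake-matrix-bucket", Key="hello.txt", Body=b"hi")
        obj = client.get_object(Bucket="fake-matrix-bucket", Key="hello.txt")
        assert obj["Body"].read() == b"hi"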

def test_s3_save(self):
    import boto3
    with mock_s3():
        client = boto3.client('s3')
        client.create_bucket(Bucket='fake-matrix-bucket', ACL='public-read-write')
        matrix_store_list = self.matrix_store()
        for matrix_store in matrix_store_list:
            if isinstance(matrix_store, CSVMatrixStore):
                matrix_store.save(project_path='s3://fake-matrix-bucket', name='test')

        # CSV
        csv = CSVMatrixStore(
            matrix_path='s3://fake-matrix-bucket/test.csv',
            metadata_path='s3://fake-matrix-bucket/test.yaml'
        )
        assert csv.metadata == matrix_store_list[0].metadata
        assert csv.matrix.to_dict() == matrix_store_list[0].matrix.to_dict()