def test_archived_file_creates_a_new_artifact_when_custom_fields_are_different(
    dbdiskrepo,
):
    """Archiving byte-identical files with different ``custom_fields`` yields
    two distinct artifacts that nevertheless share the same stored value."""
    assert p.get_default_repo() is not None

    workdir = tempfile.mkdtemp('prov_integration_archive_test')
    first_path = os.path.join(workdir, 'data.csv')
    frame = pd.DataFrame({'a': [0, 1, 2], 'b': [10, 11, 12]})
    frame.to_csv(first_path, index=False)
    second_path = os.path.join(workdir, 'data2.csv')
    shutil.copyfile(first_path, second_path)

    first = p.archive_file(
        first_path,
        delete_original=True,
        custom_fields={'data_source': 'provider one'},
    )
    second = p.archive_file(
        second_path,
        delete_original=True,
        custom_fields={'data_source': 'provider two'},
    )

    # Different metadata -> different artifact identities...
    assert first.artifact.id != second.artifact.id
    # ...but identical content -> a single shared value id.
    assert first.artifact.value_id == second.artifact.value_id
    assert first.artifact.custom_fields == {'data_source': 'provider one'}
    assert second.artifact.custom_fields == {'data_source': 'provider two'}
def write_entry():
    """Write the demographics dict as JSON and the matrix rows as CSV into
    ``directory``, then archive both files (deleting the on-disk originals)
    under ``id``-prefixed artifact names.

    NOTE(review): reads ``directory``, ``demographics``, ``matrix`` and ``id``
    from the enclosing scope — confirm their types/values against the caller.
    """
    demo_path = os.path.join(directory, 'demographic.json')
    matrix_path = os.path.join(directory, 'matrix.csv')

    with open(demo_path, 'w') as fh:
        json.dump(demographics, fh)
    with open(matrix_path, 'w') as fh:
        csv.writer(fh).writerows(matrix)

    p.archive_file(demo_path, name=id + '/demographic', delete_original=True)
    p.archive_file(matrix_path, name=id + '/matrix', delete_original=True)
def test_archived_file_becoming_loaded_value_while_persisting_artifact_info(
        dbdiskrepo):
    """Loading an archived file via ``transform_value`` keeps the original
    artifact id, and that id is recorded as the provenance input of any
    downstream provenance-tracked result."""
    workdir = tempfile.mkdtemp('prov_integration_archive_test')
    csv_path = os.path.join(workdir, 'data.csv')
    pd.DataFrame({'a': [0, 1, 2], 'b': [10, 11, 12]}).to_csv(csv_path,
                                                             index=False)
    archived = p.archive_file(csv_path, delete_original=True)

    @p.provenance(archive_file=True, delete_original_file=True)
    def add_col_c_ret_df(df):
        df['c'] = df['a'] + df['b']
        out_path = os.path.join(workdir, 'data2.csv')
        df.to_csv(out_path, index=False)
        return out_path

    def read_csv(af):
        return pd.read_csv(str(af))

    loaded = archived.transform_value(read_csv)
    # transform_value loads the bytes but preserves the artifact identity.
    assert loaded.artifact.id == archived.artifact.id

    result = add_col_c_ret_df(loaded).transform_value(read_csv)
    assert list(result['c'].values) == [10, 12, 14]
    assert (result.artifact.inputs['kargs']['df'].artifact.id
            == archived.artifact.id)
def test_archived_file_used_in_input(dbdiskrepo):
    """An archived file can be passed straight into a provenance-tracked
    function and is recorded verbatim as that call's input."""
    assert p.get_default_repo() is not None

    workdir = tempfile.mkdtemp('prov_integration_archive_test')
    csv_path = os.path.join(workdir, 'data.csv')
    pd.DataFrame({'a': [0, 1, 2], 'b': [10, 11, 12]}).to_csv(csv_path,
                                                             index=False)
    assert os.path.exists(csv_path)

    archived = p.archive_file(csv_path, delete_original=True,
                              custom_fields={'foo': 'bar'})
    # delete_original=True removes the source file once it has been archived.
    assert not os.path.exists(csv_path)
    assert archived.artifact.custom_fields == {'foo': 'bar'}

    @p.provenance()
    def add_col_c_ret_df(filename):
        frame = pd.read_csv(str(filename))
        frame['c'] = frame['a'] + frame['b']
        return frame

    result = add_col_c_ret_df(archived)
    assert list(result['c'].values) == [10, 12, 14]
    assert result.artifact.inputs['kargs']['filename'] == archived
def test_archived_file_cache_hits_when_filename_is_different(dbdiskrepo):
    """Archiving two files with identical contents but different filenames
    deduplicates to a single artifact (content-addressed caching).

    Fix: dropped the unused local ``repo = dbdiskrepo`` — the fixture is only
    needed for its setup side effect.
    """
    assert p.get_default_repo() is not None

    tmp_dir = tempfile.mkdtemp('prov_integration_archive_test')
    data_filename = os.path.join(tmp_dir, 'data.csv')
    pd.DataFrame({'a': [0, 1, 2], 'b': [10, 11, 12]}).to_csv(
        data_filename, index=False)
    data_filename2 = os.path.join(tmp_dir, 'data2.csv')
    shutil.copyfile(data_filename, data_filename2)

    archived_file = p.archive_file(data_filename, delete_original=True)
    assert not os.path.exists(data_filename)
    archived_file2 = p.archive_file(data_filename2, delete_original=True)
    assert not os.path.exists(data_filename2)

    # Same bytes -> same artifact, regardless of the original filename.
    assert archived_file.artifact.id == archived_file2.artifact.id
def test_archived_file_canonicalizes_file_extenstions(dbdiskrepo):
    """With ``preserve_ext=True`` the stored value id carries a canonical
    form of the extension ('.MPEG' becomes '.mpg')."""
    assert p.get_default_repo() is not None

    workdir = tempfile.mkdtemp('prov_integration_archive_test')
    mpeg_path = os.path.join(workdir, 'foo.MPEG')
    spit(mpeg_path, 'blah')

    archived = p.archive_file(mpeg_path, delete_original=True,
                              preserve_ext=True)
    assert archived.artifact.value_id.endswith('.mpg')
def test_archived_file_allows_extensions_to_be_ignored(dbdiskrepo):
    """With ``preserve_ext=False`` the original file extension does not leak
    into the stored value id.

    Fix: dropped the unused local ``repo = dbdiskrepo`` — the fixture is only
    needed for its setup side effect.
    """
    assert p.get_default_repo() is not None

    tmp_dir = tempfile.mkdtemp('prov_integration_archive_test')
    data_filename = os.path.join(tmp_dir, 'data.csv00')
    pd.DataFrame({'a': [0, 1, 2], 'b': [10, 11, 12]}).to_csv(
        data_filename, index=False)

    archived_file = p.archive_file(data_filename, delete_original=True,
                                   preserve_ext=False)
    assert not archived_file.artifact.value_id.endswith('.csv')
def test_output_is_archived_as_file(dbdiskrepo):
    """A function decorated with ``@p.provenance(archive_file=True)`` that
    returns a filename yields an archived-file result whose contents can be
    read back and reflect the function's output.

    Fixes: dropped the unused local ``repo = dbdiskrepo`` (the fixture is only
    needed for its setup side effect) and normalized literal spacing to PEP 8.
    """
    tmp_dir = tempfile.mkdtemp('prov_integration_archive_test')
    data_filename = os.path.join(tmp_dir, 'data.csv')
    pd.DataFrame({'a': [0, 1, 2], 'b': [10, 11, 12]}).to_csv(
        data_filename, index=False)
    archived_file = p.archive_file(data_filename, delete_original=True)

    @p.provenance(archive_file=True, delete_original_file=True)
    def add_col_c_ret_df(filename):
        df = pd.read_csv(str(filename))
        df['c'] = df['a'] + df['b']
        data_filename = os.path.join(tmp_dir, 'data2.csv')
        df.to_csv(data_filename, index=False)
        return data_filename

    ret_file = add_col_c_ret_df(archived_file)
    ret = pd.read_csv(str(ret_file))
    assert list(ret['c'].values) == [10, 12, 14]