def test_archived_file_creates_a_new_artifact_when_custom_fields_are_different(
    dbdiskrepo,
):
    """Archiving byte-identical files with different ``custom_fields`` must
    yield distinct artifacts that nevertheless share one ``value_id``
    (i.e. the content blob is stored once, the metadata twice).
    """
    assert p.get_default_repo() is not None

    work_dir = tempfile.mkdtemp('prov_integration_archive_test')

    first_csv = os.path.join(work_dir, 'data.csv')
    pd.DataFrame({
        'a': [0, 1, 2],
        'b': [10, 11, 12]
    }).to_csv(first_csv, index=False)

    # Exact copy: same content, different path.
    second_csv = os.path.join(work_dir, 'data2.csv')
    shutil.copyfile(first_csv, second_csv)

    first_archive = p.archive_file(
        first_csv,
        delete_original=True,
        custom_fields={'data_source': 'provider one'},
    )
    second_archive = p.archive_file(
        second_csv,
        delete_original=True,
        custom_fields={'data_source': 'provider two'},
    )

    # Differing metadata -> distinct artifact ids ...
    assert first_archive.artifact.id != second_archive.artifact.id
    # ... but identical content -> shared value_id.
    assert first_archive.artifact.value_id == second_archive.artifact.value_id
    assert first_archive.artifact.custom_fields == {
        'data_source': 'provider one'
    }
    assert second_archive.artifact.custom_fields == {
        'data_source': 'provider two'
    }
def test_archived_file_used_in_input(dbdiskrepo):
    """An archived file can be passed as an argument to a provenance-tracked
    function; the resulting artifact records it among its inputs.
    """
    assert p.get_default_repo() is not None

    work_dir = tempfile.mkdtemp('prov_integration_archive_test')
    csv_path = os.path.join(work_dir, 'data.csv')
    pd.DataFrame({
        'a': [0, 1, 2],
        'b': [10, 11, 12]
    }).to_csv(csv_path, index=False)
    assert os.path.exists(csv_path)

    archived = p.archive_file(csv_path, delete_original=True,
                              custom_fields={'foo': 'bar'})

    # The original was removed and the metadata was captured.
    assert not os.path.exists(csv_path)
    assert archived.artifact.custom_fields == {'foo': 'bar'}

    @p.provenance()
    def add_col_c_ret_df(filename):
        frame = pd.read_csv(str(filename))
        frame['c'] = frame['a'] + frame['b']
        return frame

    result = add_col_c_ret_df(archived)

    assert list(result['c'].values) == [10, 12, 14]
    # The archived file shows up as the recorded keyword-argument input.
    assert result.artifact.inputs['kargs']['filename'] == archived
# NOTE(review): "extenstions" in the name is a typo, kept so test-selection
# expressions (e.g. `pytest -k`) that reference it keep working.
def test_archived_file_canonicalizes_file_extenstions(dbdiskrepo):
    """With ``preserve_ext=True`` the stored value_id keeps a file extension,
    normalized to its canonical lowercase form (``.MPEG`` -> ``.mpg``).
    """
    assert p.get_default_repo() is not None

    work_dir = tempfile.mkdtemp('prov_integration_archive_test')
    movie_path = os.path.join(work_dir, 'foo.MPEG')
    spit(movie_path, 'blah')

    archived = p.archive_file(movie_path, delete_original=True,
                              preserve_ext=True)

    assert archived.artifact.value_id.endswith('.mpg')
def test_archived_file_allows_extensions_to_be_ignored(dbdiskrepo):
    """With ``preserve_ext=False`` the stored value_id carries no file
    extension, even when the source filename has an (odd) one.
    """
    # Fix: dropped the unused `repo = dbdiskrepo` alias; the fixture is only
    # needed for its side effect of configuring the default repo.
    assert p.get_default_repo() is not None

    work_dir = tempfile.mkdtemp('prov_integration_archive_test')
    csv_path = os.path.join(work_dir, 'data.csv00')
    pd.DataFrame({'a': [0, 1, 2], 'b': [10, 11, 12]}).\
        to_csv(csv_path, index=False)

    archived = p.archive_file(csv_path, delete_original=True,
                              preserve_ext=False)

    assert not archived.artifact.value_id.endswith('.csv')
def test_archived_file_cache_hits_when_filename_is_different(dbdiskrepo):
    """Archiving two files with identical content (and no custom_fields)
    must hit the cache: both calls resolve to the very same artifact,
    regardless of filename.
    """
    # Fix: dropped the unused `repo = dbdiskrepo` alias; the fixture is only
    # needed for its side effect of configuring the default repo.
    assert p.get_default_repo() is not None

    work_dir = tempfile.mkdtemp('prov_integration_archive_test')
    first_csv = os.path.join(work_dir, 'data.csv')
    pd.DataFrame({'a': [0, 1, 2], 'b': [10, 11, 12]}).\
        to_csv(first_csv, index=False)

    # Same bytes, different filename.
    second_csv = os.path.join(work_dir, 'data2.csv')
    shutil.copyfile(first_csv, second_csv)

    first_archive = p.archive_file(first_csv, delete_original=True)
    assert not os.path.exists(first_csv)

    second_archive = p.archive_file(second_csv, delete_original=True)
    assert not os.path.exists(second_csv)

    # Identical content and metadata -> the exact same artifact.
    assert first_archive.artifact.id == second_archive.artifact.id