def deprecated_data_as_bundle_not_csv(tmpdir):
    # Create Context
    api.context(TEST_CONTEXT)

    # Create test .txt file
    test_txt_path = os.path.join(str(tmpdir), 'test.txt')
    with open(test_txt_path, 'w') as f:
        f.write('this should not create a bundle')

    # Assert the txt file exists
    assert os.path.exists(test_txt_path)

    # Try to add the file to a bundle
    with pytest.raises(AssertionError) as ex:
        api.add(TEST_CONTEXT, 'bad_path', test_txt_path, treat_file_as_bundle=True)

    # Assert the call raised an AssertionError
    assert ex.type == AssertionError

    # Make sure the bundle does not exist
    assert api.get(TEST_CONTEXT, 'test_file_as_bundle_txt_file') is None, 'Bundle should not exist'

    api.delete_context(TEST_CONTEXT)
def test_cat(run_test):
    import tempfile  # TemporaryDirectory is Python 3 only

    with tempfile.TemporaryDirectory() as tmpdir:
        # Create a couple of .csv files to throw in the bundle
        for i in range(3):
            test_csv_path = os.path.join(str(tmpdir), '{}_test.csv'.format(i))
            df = pd.DataFrame({'a': np.random.randint(0, 10, 10), 'b': np.random.randint(0, 10, 10)})
            df.to_csv(test_csv_path)
            assert os.path.exists(test_csv_path)

        # Add the files to the bundle.  Data is list[filepath, ...]
        api.add(TEST_CONTEXT, TEST_BUNDLE_NAME, tmpdir)

        # Retrieve the bundle
        bundle_data = api.cat(TEST_CONTEXT, TEST_BUNDLE_NAME)

        # Assert the bundle files contain the same data as the local files
        for f in bundle_data:
            i = os.path.basename(f).split('_')[0]
            bundle_hash, file_hash = get_hash(f), get_hash(os.path.join(tmpdir, '{}_test.csv'.format(i)))
            assert bundle_hash == file_hash, 'Hashes do not match'
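# The get_hash() helper used throughout these tests is defined elsewhere in the
# test suite and is not shown in this excerpt.  A minimal sketch, assuming it
# returns a hex digest of a file's contents (the choice of MD5 is an assumption
# made here for illustration), could look like this:
def get_hash(path):
    """Return a hex digest of the file at `path` so two files can be compared."""
    import hashlib
    with open(path, 'rb') as f:
        return hashlib.md5(f.read()).hexdigest()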
def test_single_file(tmpdir):
    # Create Context
    api.context(TEST_CONTEXT)

    # Create test .csv file
    test_csv_path = os.path.join(str(tmpdir), 'test.csv')
    df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
    df.to_csv(test_csv_path)

    # Assert the csv file exists
    assert os.path.exists(test_csv_path)

    # Add the file to the bundle
    api.add(TEST_CONTEXT, 'test_single_file', test_csv_path)

    # Retrieve the bundle
    b = api.get(TEST_CONTEXT, 'test_single_file')

    # Assert the bundle contains the same data
    bundle_hash, file_hash = get_hash(b.data), get_hash(test_csv_path)
    assert bundle_hash == file_hash, 'Hashes do not match'

    # Test with tags
    tag = {'test': 'tag'}
    api.add(TEST_CONTEXT, 'test_single_file', test_csv_path, tags=tag)

    # Retrieve the bundle
    b = api.get(TEST_CONTEXT, 'test_single_file')

    # Assert the bundle contains the same data
    bundle_hash, file_hash = get_hash(b.data), get_hash(test_csv_path)
    assert bundle_hash == file_hash, 'Hashes do not match'
    assert b.tags == tag, 'Tags do not match'

    # Remove the test .csv
    os.remove(test_csv_path)

    # Assert that the data still remains in the bundle
    assert api.get(TEST_CONTEXT, 'test_single_file') is not None, 'Bundle should exist'

    api.delete_context(TEST_CONTEXT)
def test_add_bad_path(tmpdir):
    # Create Context
    api.context(TEST_CONTEXT)

    # Create a path to a csv file, but do not create the file itself
    test_csv_path = os.path.join(str(tmpdir), 'test.csv')

    # Assert the csv file does not exist
    assert not os.path.exists(test_csv_path)

    # Try to add the non-existent file to a bundle
    with pytest.raises(AssertionError) as ex:
        api.add(TEST_CONTEXT, 'bad_path', test_csv_path)

    # Assert the call raised an AssertionError
    assert ex.type == AssertionError

    # Make sure the bundle does not exist
    assert api.get(TEST_CONTEXT, 'test_file_as_bundle_txt_file') is None, 'Bundle should not exist'

    api.delete_context(TEST_CONTEXT)
def _add(args):
    """Invoke the api.add() call from the CLI to create a bundle.

    Args:
        args: command line args.

    Returns:
        None
    """
    fs = disdat.fs.DisdatFS()

    if not fs.in_context():
        _logger.warning('Not in a data context')
        return

    _ = api.add(fs._curr_context.get_local_name(),
                args.bundle,
                args.path_name,
                tags=common.parse_args_tags(args.tag))

    return
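# A minimal sketch of driving _add() directly, assuming an argparse-style
# namespace.  The attribute names mirror what _add() reads above (bundle,
# path_name, tag); the example values and the 'key:value' tag format are
# assumptions for illustration, since the argparse wiring is not shown here.
def _example_add_invocation():
    import argparse
    args = argparse.Namespace(bundle='my.first.bundle',
                              path_name='/tmp/data/test.csv',
                              tag=['phase:test'])
    _add(args)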
def test_add_directory(tmpdir):
    # Create Context
    api.context(TEST_CONTEXT)

    # Directory Structure
    # - test.csv
    # - second/test_1.txt
    # - second/test_2.txt
    # - second/third/test_3.txt
    # - second/third/test_4.txt
    level_1 = ''

    level_2 = os.path.join(level_1, 'second')
    os.mkdir(os.path.join(str(tmpdir), level_2))

    level_3 = os.path.join(level_2, 'third')
    os.mkdir(os.path.join(str(tmpdir), level_3))

    # Dictionary to hold paths
    path_dict = {}

    # Create files and save paths
    test_csv_name = 'test.csv'
    test_csv_path = os.path.join(level_1, test_csv_name)
    test_csv_abs_path = os.path.join(str(tmpdir), test_csv_path)
    df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
    df.to_csv(test_csv_abs_path)

    path_dict[test_csv_name] = (test_csv_abs_path, test_csv_path.split('/'))

    test_text_1_name = 'test_1.txt'
    test_text_1_path = os.path.join(level_2, test_text_1_name)
    test_text_name_1_abs_path = os.path.join(str(tmpdir), test_text_1_path)
    with open(test_text_name_1_abs_path, 'w') as f:
        f.write('Hello!')

    path_dict[test_text_1_name] = (test_text_name_1_abs_path, test_text_1_path.split('/'))

    test_text_2_name = 'test_2.txt'
    test_text_2_path = os.path.join(level_2, test_text_2_name)
    test_text_name_2_abs_path = os.path.join(str(tmpdir), test_text_2_path)
    with open(test_text_name_2_abs_path, 'w') as f:
        f.write('Hello!')

    path_dict[test_text_2_name] = (test_text_name_2_abs_path, test_text_2_path.split('/'))

    test_text_3_name = 'test_3.txt'
    test_text_3_path = os.path.join(level_3, test_text_3_name)
    test_text_name_3_abs_path = os.path.join(str(tmpdir), test_text_3_path)
    with open(test_text_name_3_abs_path, 'w') as f:
        f.write('Third Hello!')

    path_dict[test_text_3_name] = (test_text_name_3_abs_path, test_text_3_path.split('/'))

    test_text_4_name = 'test_4.txt'
    test_text_4_path = os.path.join(level_3, test_text_4_name)
    test_text_name_4_abs_path = os.path.join(str(tmpdir), test_text_4_path)
    with open(test_text_name_4_abs_path, 'w') as f:
        f.write('Third World!')

    path_dict[test_text_4_name] = (test_text_name_4_abs_path, test_text_4_path.split('/'))

    # Assert files exist
    assert os.path.exists(test_csv_abs_path)
    assert os.path.exists(test_text_name_1_abs_path)
    assert os.path.exists(test_text_name_2_abs_path)
    assert os.path.exists(test_text_name_3_abs_path)
    assert os.path.exists(test_text_name_4_abs_path)

    # Add the directory to the bundle
    api.add(TEST_CONTEXT, 'test_directory', str(tmpdir))

    # Assert the check sums are the same
    b = api.get(TEST_CONTEXT, 'test_directory')
    for f in b.data:
        bundle_file_name = f.split('/')[-1]
        local_abs_path, local_split_path = path_dict[bundle_file_name]

        # Make sure the file contents match
        assert get_hash(f) == get_hash(local_abs_path), 'Hashes do not match'

        # Make sure the directory structure stays the same
        bundle_path = os.path.join(*f.split('/')[-len(local_split_path):])
        local_path = os.path.join(*local_split_path)
        assert local_path == bundle_path, 'Bundle should have the same directory structure'

    # Add the directory to the bundle with tags
    tag = {'test': 'tag'}
    api.add(TEST_CONTEXT, 'test_directory', str(tmpdir), tags=tag)

    # Assert the check sums are the same
    b = api.get(TEST_CONTEXT, 'test_directory')
    for f in b.data:
        bundle_file_name = f.split('/')[-1]
        local_abs_path, local_split_path = path_dict[bundle_file_name]

        # Make sure the file contents match
        assert get_hash(f) == get_hash(local_abs_path), 'Hashes do not match'

        # Make sure the directory structure stays the same
        local_path = os.path.join(*local_split_path)
        bundle_path = os.path.join(*f.split('/')[-len(local_split_path):])
        assert local_path == bundle_path, 'Bundle should have the same directory structure'

    # Make sure tags exist
    assert b.tags == tag, 'Tags do not match'

    api.delete_context(TEST_CONTEXT)
def test_add_with_treat_as_bundle(tmpdir):
    api.context(context_name=TEST_CONTEXT)

    # Setup moto s3 resources
    s3_client = boto3.client('s3')
    s3_resource = boto3.resource('s3')
    s3_resource.create_bucket(Bucket=TEST_BUCKET)

    # Make sure the bucket is empty
    objects = s3_client.list_objects(Bucket=TEST_BUCKET)
    assert 'Contents' not in objects, 'Bucket should be empty'

    local_paths = []
    s3_paths = []

    # Create and upload the test.csv file
    key = 'test.csv'
    test_csv_path = os.path.join(str(tmpdir), key)
    df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
    df.to_csv(test_csv_path)

    s3_resource.meta.client.upload_file(test_csv_path, TEST_BUCKET, key)
    s3_path = "s3://{}/{}".format(TEST_BUCKET, key)

    local_paths.append(test_csv_path)
    s3_paths.append(s3_path)

    # Create and upload the text.txt file
    key = 'text.txt'
    test_txt_path = os.path.join(str(tmpdir), key)
    with open(test_txt_path, 'w') as f:
        f.write('Test')

    s3_resource.meta.client.upload_file(test_txt_path, TEST_BUCKET, key)
    s3_path = "s3://{}/{}".format(TEST_BUCKET, key)

    local_paths.append(test_txt_path)
    s3_paths.append(s3_path)

    bool_values = [True, False]
    string_values = ['a', 'b']
    float_values = [1.3, 3.5]
    int_values = [4, 5]

    # Build the bundle dataframe
    bundle_df = pd.DataFrame({
        'local_paths': local_paths,
        's3_paths': s3_paths,
        'bools': bool_values,
        'strings': string_values,
        'floats': float_values,
        'ints': int_values
    })
    bundle_df_path = os.path.join(str(tmpdir), 'bundle.csv')
    bundle_df.to_csv(bundle_df_path)

    # Add the bundle dataframe
    api.add(TEST_CONTEXT, 'test_add_bundle', bundle_df_path, treat_file_as_bundle=True)

    # Assert that the data in the bundle is a dataframe
    b = api.get(TEST_CONTEXT, 'test_add_bundle')
    assert isinstance(b.data, pd.DataFrame)

    # Add the bundle dataframe with tags
    tag = {'test': 'tag'}
    api.add(TEST_CONTEXT, 'test_add_bundle', bundle_df_path, treat_file_as_bundle=True, tags=tag)

    # Assert that the data in the bundle is a dataframe
    b = api.get(TEST_CONTEXT, 'test_add_bundle')
    assert isinstance(b.data, pd.DataFrame)
    assert b.tags == tag, 'Tags do not match'

    api.delete_context(TEST_CONTEXT)
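# The boto3 calls in test_add_with_treat_as_bundle assume S3 is mocked for the
# duration of the test (TEST_BUCKET never exists for real).  A minimal sketch
# of such a fixture, assuming the moto library (< 5.0, where the context
# manager is named mock_s3) is installed, is shown below; the fixture name is
# hypothetical and the real fixture may live elsewhere in the test suite.
@pytest.fixture
def mocked_s3():
    from moto import mock_s3
    with mock_s3():
        yield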