def runTest(self): m1 = """. 5348b82a029fd9e971a811ce1f71360b+43 0:43:md5sum.txt . 085c37f02916da1cad16f93c54d899b7+41 0:41:md6sum.txt . 8b22da26f9f433dea0a10e5ec66d73ba+43 0:43:md7sum.txt . 085c37f02916da1cad16f93c54d899b7+41 5348b82a029fd9e971a811ce1f71360b+43 8b22da26f9f433dea0a10e5ec66d73ba+43 47:80:md8sum.txt . 085c37f02916da1cad16f93c54d899b7+41 5348b82a029fd9e971a811ce1f71360b+43 8b22da26f9f433dea0a10e5ec66d73ba+43 40:80:md9sum.txt""" m2 = arvados.CollectionReader(m1) self.assertEqual( m2.manifest_text(), ". 5348b82a029fd9e971a811ce1f71360b+43 085c37f02916da1cad16f93c54d899b7+41 8b22da26f9f433dea0a10e5ec66d73ba+43 0:43:md5sum.txt 43:41:md6sum.txt 84:43:md7sum.txt 6:37:md8sum.txt 84:43:md8sum.txt 83:1:md9sum.txt 0:43:md9sum.txt 84:36:md9sum.txt\n" ) self.assertEqual( arvados.CollectionReader( m1).all_streams()[0].files()['md5sum.txt'].as_manifest(), ". 5348b82a029fd9e971a811ce1f71360b+43 0:43:md5sum.txt\n") self.assertEqual( arvados.CollectionReader( m1).all_streams()[0].files()['md6sum.txt'].as_manifest(), ". 085c37f02916da1cad16f93c54d899b7+41 0:41:md6sum.txt\n") self.assertEqual( arvados.CollectionReader( m1).all_streams()[0].files()['md7sum.txt'].as_manifest(), ". 8b22da26f9f433dea0a10e5ec66d73ba+43 0:43:md7sum.txt\n") self.assertEqual( arvados.CollectionReader( m1).all_streams()[0].files()['md9sum.txt'].as_manifest(), ". 085c37f02916da1cad16f93c54d899b7+41 5348b82a029fd9e971a811ce1f71360b+43 8b22da26f9f433dea0a10e5ec66d73ba+43 40:80:md9sum.txt\n" )
def test_locator_init(self):
    client = self.api_client_mock(200)
    # Ensure Keep will not return anything if asked.
    with tutil.mock_responses(None, 404):
        reader = arvados.CollectionReader(self.DEFAULT_DATA_HASH,
                                          api_client=client)
        self.assertEqual(self.DEFAULT_MANIFEST, reader.manifest_text())
def test_uuid_init_success(self):
    client = self.api_client_mock(200)
    reader = arvados.CollectionReader(self.DEFAULT_UUID, api_client=client,
                                      num_retries=3)
    self.assertEqual(self.DEFAULT_COLLECTION['manifest_text'],
                     reader.manifest_text())
    client.collections().get().execute.assert_called_with(num_retries=3)
def runTest(self):
    n_lines_in = 2**18
    data_in = "abc\n"
    for x in xrange(0, 18):
        data_in += data_in
    p = subprocess.Popen(["gzip", "-1cn"],
                         stdout=subprocess.PIPE,
                         stdin=subprocess.PIPE,
                         stderr=subprocess.PIPE,
                         shell=False,
                         close_fds=True)
    compressed_data_in, stderrdata = p.communicate(data_in)

    cw = arvados.CollectionWriter()
    cw.start_new_file('test.gz')
    cw.write(compressed_data_in)
    gzip_manifest = cw.manifest_text()

    cr = arvados.CollectionReader(gzip_manifest)
    got = 0
    for x in list(cr.all_files())[0].readlines():
        self.assertEqual(x, "abc\n",
                         "decompression returned wrong data: %s" % x)
        got += 1
    self.assertEqual(
        got, n_lines_in,
        "decompression returned %d lines instead of %d" % (got, n_lines_in))
def main(args, stdout, stderr, api_client=None):
    args = parse_args(args)

    if api_client is None:
        api_client = arvados.api('v1')

    try:
        cr = arvados.CollectionReader(args.locator, api_client=api_client,
                                      num_retries=args.retries)
        cr.normalize()
    except (arvados.errors.ArgumentError,
            arvados.errors.NotFoundError) as error:
        print("arv-ls: error fetching collection: {}".format(error),
              file=stderr)
        return 1

    formatters = []
    if args.s:
        formatters.append(size_formatter)
    formatters.append(name_formatter)

    for f in cr.all_files():
        print(*(info_func(f) for info_func in formatters), file=stdout)

    return 0
def prepare_gatk_interval_list_collection(interval_list_coll):
    """
    Checks that the supplied interval_list_collection has the required
    files and only the required files for GATK.

    Returns: a portable data hash for the interval_list collection
    """
    # Ensure we have a single .interval_list file in the collection
    # see: http://gatkforums.broadinstitute.org/discussion/1601/how-can-i-prepare-a-fasta-file-to-use-as-interval_list
    ilcr = arvados.CollectionReader(interval_list_coll)
    interval_list = {}
    for ils in ilcr.all_streams():
        for ilf in ils.all_files():
            if re.search(r'\.interval_list$', ilf.name()):
                interval_list[ils.name(), ilf.name()] = ilf
    if len(interval_list) < 1:
        raise InvalidArgumentError(
            "Expected an interval_list file in interval_list_collection, but found none. Found [%s]"
            % ' '.join(ilf.name() for ilf in ils.all_files()))
    if len(interval_list) > 1:
        raise InvalidArgumentError(
            "Expected a single interval_list file in interval_list_collection, but found multiple. Found [%s]"
            % ' '.join(ilf.name() for ilf in ils.all_files()))
    for ((s_name, f_name), interval_list_f) in interval_list.items():
        ref_input = interval_list_f.as_manifest()
        break

    # Create and return a portable data hash for the ref_input manifest
    r = arvados.api().collections().create(
        body={"manifest_text": ref_input}).execute()
    ref_input_pdh = r["portable_data_hash"]
    return ref_input_pdh
def job_logs(api, job):
    # Returns the contents of the log for this job (as an array of lines).
    if job['log']:
        log_collection = arvados.CollectionReader(job['log'], api)
        log_filename = "{}.log.txt".format(job['uuid'])
        return log_collection.open(log_filename).readlines()
    return []
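# A hedged usage sketch (not from the original source) for job_logs() above:
# fetch a job record and print its log. The job UUID is a fabricated
# placeholder, and jobs() is the deprecated Crunch v1 jobs API, so this only
# applies to clusters that still expose it.
import arvados

def print_job_log(job_uuid='zzzzz-8i9sb-xxxxxxxxxxxxxxx'):
    api = arvados.api('v1')
    job = api.jobs().get(uuid=job_uuid).execute()
    for line in job_logs(api, job):
        print(line.rstrip('\n'))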
def test_init_num_retries_propagated(self):
    # More of an integration test...
    client = self.api_client_mock(200)
    reader = arvados.CollectionReader(self.DEFAULT_UUID, api_client=client,
                                      num_retries=3)
    with tutil.mock_responses('foo', 500, 500, 200):
        self.assertEqual('foo',
                         ''.join(f.read(9) for f in reader.all_files()))
def check_manifest_file_sizes(self, manifest_text, expect_sizes):
    cr = arvados.CollectionReader(manifest_text)
    got_sizes = []
    for f in cr.all_files():
        got_sizes += [f.size()]
    self.assertEqual(
        got_sizes, expect_sizes,
        "got wrong file sizes %s, expected %s" % (got_sizes, expect_sizes))
def _open_remote(file_ref, config=None):
    """Retrieve an open handle to a file in an Arvados Keep collection.
    """
    import arvados
    api_client = _get_api_client(config)
    coll_uuid, coll_ref = _get_uuid_file(file_ref)
    cr = arvados.CollectionReader(coll_uuid, api_client=api_client)
    return cr.open(coll_ref)
def open_remote(file_ref, config=None):
    """Retrieve an open handle to a file in an Arvados Keep collection.
    """
    import arvados
    api_client = _get_api_client(config)
    coll_uuid, coll_ref = file_ref.replace("keep:", "").split("/", 1)
    cr = arvados.CollectionReader(coll_uuid, api_client=api_client)
    return cr.open(coll_ref)
def file_size(file_ref, config=None):
    """Retrieve the size of a file in Keep, in MiB.
    """
    import arvados
    api_client = _get_api_client(config)
    coll_uuid, coll_ref = file_ref.replace("keep:", "").split("/", 1)
    cr = arvados.CollectionReader(coll_uuid, api_client=api_client)
    file = cr[coll_ref]
    return file.size() / (1024.0 * 1024.0)
def file_size(file_ref, config=None):
    """Retrieve the size of a file in Keep, in MiB.
    """
    import arvados
    api_client = _get_api_client(config)
    coll_uuid, coll_ref = _get_uuid_file(file_ref)
    cr = arvados.CollectionReader(coll_uuid, api_client=api_client)
    file = cr.find(coll_ref)
    return file.size() / (1024.0 * 1024.0)
def write_block_or_manifest(dest, src, api_client, args):
    if '+A' in src:
        # block locator
        kc = arvados.keep.KeepClient(api_client=api_client)
        dest.write(kc.get(src, num_retries=args.retries))
    else:
        # collection UUID or portable data hash
        reader = arvados.CollectionReader(src, num_retries=args.retries)
        dest.write(reader.manifest_text(strip=args.strip_manifest).encode())
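# A minimal offline sketch (not from the original source) of the dispatch
# rule in write_block_or_manifest() above: a signed block locator carries a
# "+A..." permission hint, while a portable data hash or collection UUID
# does not. Both locators below are fabricated examples.
signed_block = ('acbd18db4cc2f85cedef654fccc4a4d8+3'
                '+A1f4b0bc7583c2a7f9102c395f4ffc5e3@12345678')
pdh = 'acbd18db4cc2f85cedef654fccc4a4d8+3'
for src in (signed_block, pdh):
    print(src, '-> block' if '+A' in src else '-> collection')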
def test_locator_init_falls_back_to_keep(self):
    # Reading manifests from Keep is deprecated. Feel free to
    # remove this test when we remove the fallback.
    client = self.api_client_mock(200)
    self.mock_get_collection(client, 404, None)
    with tutil.mock_responses(self.DEFAULT_MANIFEST, 200):
        reader = arvados.CollectionReader(self.DEFAULT_DATA_HASH,
                                          api_client=client, num_retries=3)
        self.assertEqual(self.DEFAULT_MANIFEST, reader.manifest_text())
def checkin(source_collection, target_dir, merge=True):
    # delete symlinks, commit directory, merge manifests and return combined
    # collection.
    for root, dirs, files in os.walk(target_dir):
        for f in files:
            s = os.lstat(os.path.join(root, f))
            if stat.S_ISLNK(s.st_mode):
                os.unlink(os.path.join(root, f))

    uuid = robust_put.upload(target_dir)
    if merge:
        cr1 = arvados.CollectionReader(source_collection)
        cr2 = arvados.CollectionReader(uuid)
        combined = arvados.CollectionReader(cr1.manifest_text() +
                                            cr2.manifest_text())
        return combined
    else:
        return arvados.CollectionReader(uuid)
def _test_subset(self, collection, expected):
    cr = arvados.CollectionReader(collection, self.api_client)
    for s in cr.all_streams():
        for ex in expected:
            if ex[0] == s:
                f = s.files()[ex[2]]
                got = [f.size(), f.stream_name(), f.name(),
                       "".join(f.readall(2**26))]
                self.assertEqual(
                    got, ex,
                    'all_files|as_manifest did not preserve manifest contents: got %s expected %s'
                    % (got, ex))
def _collection_files(uuid, config):
    """Retrieve files in the input collection.
    """
    import arvados
    api_client = _get_api_client(config)
    cr = arvados.CollectionReader(uuid, api_client=api_client)
    cr.normalize()
    pdh = cr.portable_data_hash()
    out = [str("%s:%s/%s" % (KEY,
                             os.path.normpath(os.path.join(pdh, x.stream_name())),
                             x.name))
           for x in cr.all_files()]
    return out
def _runTest(self, what_in, what_out):
    cw = arvados.CollectionWriter()
    cw.start_new_file('test.txt')
    cw.write(what_in)
    test1 = cw.finish()
    cr = arvados.CollectionReader(test1)
    got = []
    for x in list(cr.all_files())[0].readlines():
        got += [x]
    self.assertEqual(got, what_out,
                     "readlines did not split lines correctly: %s" % got)
def _runTest(self, collection, expected):
    cr = arvados.CollectionReader(collection)
    manifest_subsets = []
    for s in cr.all_streams():
        for f in s.all_files():
            manifest_subsets += [f.as_manifest()]
    expect_i = 0
    for m in manifest_subsets:
        cr = arvados.CollectionReader(m)
        for f in cr.all_files():
            got = [f.size(), f.stream_name(), f.name(),
                   "".join(f.readall(2**26))]
            self.assertEqual(
                got, expected[expect_i],
                'all_files|as_manifest did not preserve manifest contents: got %s expected %s'
                % (got, expected[expect_i]))
            expect_i += 1
def prepare_gatk_reference_collection(reference_coll):
    """
    Checks that the supplied reference_collection has the required files
    and only the required files for GATK.

    Returns: a portable data hash for the reference collection
    """
    # Ensure we have a .fa reference file with corresponding .fai index and .dict
    # see: http://gatkforums.broadinstitute.org/discussion/1601/how-can-i-prepare-a-fasta-file-to-use-as-reference
    rcr = arvados.CollectionReader(reference_coll)
    ref_fasta = {}
    ref_fai = {}
    ref_dict = {}
    ref_input = None
    dict_reader = None
    for rs in rcr.all_streams():
        for rf in rs.all_files():
            if re.search(r'\.fa$', rf.name()):
                ref_fasta[rs.name(), rf.name()] = rf
            elif re.search(r'\.fai$', rf.name()):
                ref_fai[rs.name(), rf.name()] = rf
            elif re.search(r'\.dict$', rf.name()):
                ref_dict[rs.name(), rf.name()] = rf
    for ((s_name, f_name), fasta_f) in ref_fasta.items():
        fai_f = ref_fai.get(
            (s_name, re.sub(r'fa$', 'fai', f_name)),
            ref_fai.get((s_name, re.sub(r'fa$', 'fa.fai', f_name)), None))
        dict_f = ref_dict.get(
            (s_name, re.sub(r'fa$', 'dict', f_name)),
            ref_dict.get((s_name, re.sub(r'fa$', 'fa.dict', f_name)), None))
        if fasta_f and fai_f and dict_f:
            # found a set of all three!
            ref_input = fasta_f.as_manifest()
            ref_input += fai_f.as_manifest()
            ref_input += dict_f.as_manifest()
            dict_reader = dict_f
            break
    if ref_input is None:
        raise errors.InvalidArgumentError(
            "Expected a reference fasta with fai and dict in reference_collection. Found [%s]"
            % ' '.join(rf.name() for rf in rs.all_files()))
    if dict_reader is None:
        raise errors.InvalidArgumentError(
            "Could not find .dict file in reference_collection. Found [%s]"
            % ' '.join(rf.name() for rf in rs.all_files()))

    # Create and return a portable data hash for the ref_input manifest
    r = arvados.api().collections().create(
        body={"manifest_text": ref_input}).execute()
    ref_input_pdh = r["portable_data_hash"]
    return ref_input_pdh
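# An offline check (not from the original source) of the index-naming
# conventions prepare_gatk_reference_collection() accepts: for "X.fa" it
# looks for "X.fai" or "X.fa.fai", and for "X.dict" or "X.fa.dict". Pure
# re, no Arvados cluster required; "hg19.fa" is just an example name.
import re

f_name = 'hg19.fa'
print([re.sub(r'fa$', 'fai', f_name), re.sub(r'fa$', 'fa.fai', f_name)])
# -> ['hg19.fai', 'hg19.fa.fai']
print([re.sub(r'fa$', 'dict', f_name), re.sub(r'fa$', 'fa.dict', f_name)])
# -> ['hg19.dict', 'hg19.fa.dict']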
def runTest(self):
    m1 = """. 5348b82a029fd9e971a811ce1f71360b+43 0:43:md5sum.txt
. 085c37f02916da1cad16f93c54d899b7+41 0:41:md5sum.txt
. 8b22da26f9f433dea0a10e5ec66d73ba+43 0:43:md5sum.txt"""
    self.assertEqual(
        arvados.CollectionReader(m1).manifest_text(),
        """. 5348b82a029fd9e971a811ce1f71360b+43 085c37f02916da1cad16f93c54d899b7+41 8b22da26f9f433dea0a10e5ec66d73ba+43 0:127:md5sum.txt
""")

    m2 = """. 204e43b8a1185621ca55a94839582e6f+67108864 b9677abbac956bd3e86b1deb28dfac03+67108864 fc15aff2a762b13f521baf042140acec+67108864 323d2a3ce20370c4ca1d3462a344f8fd+25885655 0:227212247:var-GS000016015-ASM.tsv.bz2
"""
    self.assertEqual(arvados.CollectionReader(m2).manifest_text(), m2)

    m3 = """. 5348b82a029fd9e971a811ce1f71360b+43 3:40:md5sum.txt
. 085c37f02916da1cad16f93c54d899b7+41 0:41:md5sum.txt
. 8b22da26f9f433dea0a10e5ec66d73ba+43 0:43:md5sum.txt"""
    self.assertEqual(
        arvados.CollectionReader(m3).manifest_text(),
        """. 5348b82a029fd9e971a811ce1f71360b+43 085c37f02916da1cad16f93c54d899b7+41 8b22da26f9f433dea0a10e5ec66d73ba+43 3:124:md5sum.txt
""")

    m4 = """. 204e43b8a1185621ca55a94839582e6f+67108864 0:3:foo/bar
./zzz 204e43b8a1185621ca55a94839582e6f+67108864 0:999:zzz
./foo 323d2a3ce20370c4ca1d3462a344f8fd+25885655 0:3:bar"""
    self.assertEqual(
        arvados.CollectionReader(m4).manifest_text(),
        """./foo 204e43b8a1185621ca55a94839582e6f+67108864 323d2a3ce20370c4ca1d3462a344f8fd+25885655 0:3:bar 67108864:3:bar
./zzz 204e43b8a1185621ca55a94839582e6f+67108864 0:999:zzz
""")

    m5 = """. 204e43b8a1185621ca55a94839582e6f+67108864 0:3:foo/bar
./zzz 204e43b8a1185621ca55a94839582e6f+67108864 0:999:zzz
./foo 204e43b8a1185621ca55a94839582e6f+67108864 3:3:bar"""
    self.assertEqual(
        arvados.CollectionReader(m5).manifest_text(),
        """./foo 204e43b8a1185621ca55a94839582e6f+67108864 0:6:bar
./zzz 204e43b8a1185621ca55a94839582e6f+67108864 0:999:zzz
""")

    with open('testdata/1000G_ref_manifest') as f6:
        m6 = f6.read()
    self.assertEqual(arvados.CollectionReader(m6).manifest_text(), m6)

    with open('testdata/jlake_manifest') as f7:
        m7 = f7.read()
    self.assertEqual(arvados.CollectionReader(m7).manifest_text(), m7)

    m8 = """./a\\040b\\040c 59ca0efa9f5633cb0371bbc0355478d8+13 0:13:hello\\040world.txt
"""
    self.assertEqual(arvados.CollectionReader(m8).manifest_text(), m8)
def collection_files(uuid, config=None, add_uuid=False):
    """Retrieve files in the input collection.
    """
    import arvados
    api_client = _get_api_client(config)
    cr = arvados.CollectionReader(uuid, api_client=api_client)
    cr.normalize()
    out = ["%s/%s" % (x.stream_name(), x.name) for x in cr.all_files()]
    if add_uuid:
        out = ["keep:%s" % os.path.normpath(os.path.join(uuid, x))
               for x in out]
    return out
def one_task_per_pair_input_file(if_sequence=0, and_end_task=True):
    """
    Queue one task for each pair of fastq files in this job's input
    collection.

    Each new task will have two parameters, named "input_1" and
    "input_2", each being a manifest containing a single fastq file.

    A matching pair of files in the input collection is assumed to
    have names "x_1.y" and "x_2.y".

    Files in the input collection that are not part of a matched pair
    are silently ignored.

    if_sequence and and_end_task arguments have the same significance
    as in arvados.job_setup.one_task_per_input_file().
    """
    if if_sequence != arvados.current_task()['sequence']:
        return

    job_input = arvados.current_job()['script_parameters']['input']
    cr = arvados.CollectionReader(job_input)
    all_files = []
    for s in cr.all_streams():
        all_files += list(s.all_files())
    for s in cr.all_streams():
        for left_file in s.all_files():
            left_name = left_file.name()
            right_file = None
            right_name = re.sub(r'(.*_)1\.', r'\g<1>2.', left_name)
            if right_name == left_name:
                continue
            for f2 in s.all_files():
                if right_name == f2.name():
                    right_file = f2
            if right_file is not None:
                new_task_attrs = {
                    'job_uuid': arvados.current_job()['uuid'],
                    'created_by_job_task_uuid': arvados.current_task()['uuid'],
                    'sequence': if_sequence + 1,
                    'parameters': {
                        'input_1': left_file.as_manifest(),
                        'input_2': right_file.as_manifest()
                    }
                }
                arvados.api().job_tasks().create(body=new_task_attrs).execute()
    if and_end_task:
        arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                         body={'success': True}).execute()
        exit(0)
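# An offline check (not from the original source) of the pairing rule in
# one_task_per_pair_input_file(): "x_1.y" maps to "x_2.y", and a name with
# no "_1." component maps to itself, so it is skipped. File names are made
# up examples.
import re

for name in ['sample_1.fastq', 'sample_2.fastq', 'reads_1.fq.gz', 'README']:
    mate = re.sub(r'(.*_)1\.', r'\g<1>2.', name)
    print(name, '->', mate if mate != name else '(no pair; skipped)')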
def update(self):
    try:
        if self.collection_object is not None and portable_data_hash_pattern.match(
                self.collection_locator):
            return True

        if self.collection_locator is None:
            self.fresh()
            return True

        with llfuse.lock_released:
            coll_reader = arvados.CollectionReader(
                self.collection_locator, self.api, self.api.keep,
                num_retries=self.num_retries)
            new_collection_object = coll_reader.api_response() or {}
            # If the Collection only exists in Keep, there will be no API
            # response. Fill in the fields we need.
            if 'uuid' not in new_collection_object:
                new_collection_object['uuid'] = self.collection_locator
            if "portable_data_hash" not in new_collection_object:
                new_collection_object["portable_data_hash"] = new_collection_object["uuid"]
            if 'manifest_text' not in new_collection_object:
                new_collection_object['manifest_text'] = coll_reader.manifest_text()
            coll_reader.normalize()
        # end with llfuse.lock_released, re-acquire lock

        if (self.collection_object is None or
                self.collection_object["portable_data_hash"] !=
                new_collection_object["portable_data_hash"]):
            self.new_collection(new_collection_object, coll_reader)

        self.fresh()
        return True
    except arvados.errors.NotFoundError:
        _logger.exception("arv-mount %s: error", self.collection_locator)
    except arvados.errors.ArgumentError as detail:
        _logger.warning("arv-mount %s: error %s", self.collection_locator,
                        detail)
        if self.collection_object is not None and "manifest_text" in self.collection_object:
            _logger.warning("arv-mount manifest_text is: %s",
                            self.collection_object["manifest_text"])
    except Exception:
        _logger.exception("arv-mount %s: error", self.collection_locator)
        if self.collection_object is not None and "manifest_text" in self.collection_object:
            _logger.error("arv-mount manifest_text is: %s",
                          self.collection_object["manifest_text"])
    return False
def runTest(self):
    # Create the request handler
    operations = fuse.Operations(os.getuid(), os.getgid())
    e = operations.inodes.add_entry(fuse.Directory(llfuse.ROOT_INODE))
    operations.inodes.load_collection(
        e, arvados.CollectionReader(arvados.Keep.get(self.testcollection)))

    self.mounttmp = tempfile.mkdtemp()

    llfuse.init(operations, self.mounttmp, [])
    t = threading.Thread(None, lambda: llfuse.main())
    t.start()

    # wait until the driver is finished initializing
    operations.initlock.wait()

    # now check some stuff
    d1 = os.listdir(self.mounttmp)
    d1.sort()
    self.assertEqual(d1, ['dir1', 'dir2', 'thing1.txt', 'thing2.txt'])

    d2 = os.listdir(os.path.join(self.mounttmp, 'dir1'))
    d2.sort()
    self.assertEqual(d2, ['thing3.txt', 'thing4.txt'])

    d3 = os.listdir(os.path.join(self.mounttmp, 'dir2'))
    d3.sort()
    self.assertEqual(d3, ['dir3', 'thing5.txt', 'thing6.txt'])

    d4 = os.listdir(os.path.join(self.mounttmp, 'dir2/dir3'))
    d4.sort()
    self.assertEqual(d4, ['thing7.txt', 'thing8.txt'])

    files = {
        'thing1.txt': 'data 1',
        'thing2.txt': 'data 2',
        'dir1/thing3.txt': 'data 3',
        'dir1/thing4.txt': 'data 4',
        'dir2/thing5.txt': 'data 5',
        'dir2/thing6.txt': 'data 6',
        'dir2/dir3/thing7.txt': 'data 7',
        'dir2/dir3/thing8.txt': 'data 8'
    }

    for k, v in files.items():
        with open(os.path.join(self.mounttmp, k)) as f:
            self.assertEqual(f.read(), v)
def one_task_per_bam_file(if_sequence=0, and_end_task=True):
    """
    Queue one task for each bam file in this job's input collection.

    Each new task will have an "input" parameter: a manifest containing
    one .bam file and (if available) the corresponding .bai index file.

    Files in the input collection that are not named *.bam or *.bai (as
    well as *.bai files that do not match any .bam file present) are
    silently ignored.

    if_sequence and and_end_task arguments have the same significance
    as in arvados.job_setup.one_task_per_input_file().
    """
    if if_sequence != arvados.current_task()['sequence']:
        return

    job_input = arvados.current_job()['script_parameters']['input']
    cr = arvados.CollectionReader(job_input)
    bam = {}
    bai = {}
    for s in cr.all_streams():
        for f in s.all_files():
            if re.search(r'\.bam$', f.name()):
                bam[s.name(), f.name()] = f
            elif re.search(r'\.bai$', f.name()):
                bai[s.name(), f.name()] = f
    for ((s_name, f_name), bam_f) in bam.items():
        bai_f = bai.get((s_name, re.sub(r'bam$', 'bai', f_name)), None)
        task_input = bam_f.as_manifest()
        if bai_f:
            task_input += bai_f.as_manifest()
        new_task_attrs = {
            'job_uuid': arvados.current_job()['uuid'],
            'created_by_job_task_uuid': arvados.current_task()['uuid'],
            'sequence': if_sequence + 1,
            'parameters': {
                'input': task_input
            }
        }
        arvados.api().job_tasks().create(body=new_task_attrs).execute()
    if and_end_task:
        arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                         body={'success': True}).execute()
        exit(0)
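# An offline check (not from the original source) of the .bam -> .bai
# lookup key in one_task_per_bam_file(): the index is expected alongside
# the BAM under the same stem. File names are made up examples.
import re

for name in ['sample.bam', 'aligned.sorted.bam']:
    print(name, '->', re.sub(r'bam$', 'bai', name))
# sample.bam -> sample.bai
# aligned.sorted.bam -> aligned.sorted.bai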
def main(sqlite_db):
    # Query for recent Veritas sequenced samples
    query = ("SELECT uploaded_data.human_id, date, name "
             "FROM uploaded_data WHERE "
             "data_type == 'Veritas Genetics' AND "
             "uploaded_data.name GLOB '*VCF'")
    conn = sqlite3.connect(sqlite_db)
    df = pd.read_sql_query(query, conn)
    # Uniquify by sample
    df.sort_values("date", inplace=True)
    df.drop_duplicates(inplace=True)
    print(df.describe())
    # Load files in the Arvados BAM collection, if the arvados client is installed
    if arvados:
        api = arvados.api(
            host="su92l.arvadosapi.com",
            token="42yz0fp9s19djsgkae33khevpzq4or1ile5o7khofzw388lvfl")
        cr = arvados.CollectionReader("su92l-4zz18-1rqqi0kpkfmfite", api)
        bam_coll = [(x.name, x.size()) for x in cr.all_files()]
    else:
        bam_coll = None
    # Find recent samples with more than one data type, emphasizing diverse
    # samples. Require higher depth coverage (> 50Gb).
    for sample in df["human_id"]:
        query = ("SELECT data_type, date FROM uploaded_data WHERE human_id='%s'"
                 % sample)
        df_sample = pd.read_sql_query(query, conn)
        if len(df_sample.data_type.unique()) > 1:
            bam_size = find_bam_size(sample, bam_coll) if bam_coll else 100
            if bam_size > 50:
                query = ("SELECT * from demographics WHERE human_id='%s'"
                         % sample)
                dfd = pd.read_sql_query(query, conn)
                if (len(dfd) and
                        ((dfd["gender"][0] not in ["", "Male"]) or
                         (dfd["race"][0] not in
                          ["", "White", "Caucasian (White)"]))):
                    demo = "%s %s" % (dfd["gender"][0], dfd["race"][0])
                elif len(dfd) == 0:
                    demo = "No demographics"
                else:
                    demo = ""
                print(sample, "%sGb" % bam_size, demo,
                      list(set(df_sample["data_type"])),
                      list(set(df_sample["date"])))
def runTest(self):
    cr = arvados.CollectionReader('d6c3b8e571f1b81ebb150a45ed06c884+114')
    got = []
    for s in cr.all_streams():
        for f in s.all_files():
            got += [[f.size(), f.stream_name(), f.name(), f.read(2**26)]]
    expected = [[3, '.', 'bar.txt', 'bar'],
                [3, '.', 'foo.txt', 'foo'],
                [3, './baz', 'baz.txt', 'baz']]
    self.assertEqual(got, expected)

    stream0 = cr.all_streams()[0]
    self.assertEqual(stream0.readfrom(0, 0), '',
                     'reading zero bytes should have returned empty string')
    self.assertEqual(stream0.readfrom(0, 2**26), 'foobar',
                     'reading entire stream failed')
    self.assertEqual(stream0.readfrom(2**26, 0), '',
                     'reading zero bytes should have returned empty string')
def main(args, stdout, stderr, api_client=None, logger=None):
    args = parse_args(args)

    if api_client is None:
        api_client = arvados.api('v1')

    if logger is None:
        logger = logging.getLogger('arvados.arv-ls')

    try:
        r = re.search(r'^(.*?)(/.*)?$', args.locator)
        collection = r.group(1)
        get_prefix = r.group(2)

        cr = arvados.CollectionReader(collection, api_client=api_client,
                                      num_retries=args.retries)
        if get_prefix:
            if get_prefix[-1] == '/':
                get_prefix = get_prefix[:-1]
            stream_name = '.' + get_prefix
            reader = cr.find(stream_name)
            if not (isinstance(reader, arvados.CollectionReader) or
                    isinstance(reader, arvados.collection.Subcollection)):
                logger.error("'{}' is not a subdirectory".format(get_prefix))
                return 1
        else:
            stream_name = '.'
            reader = cr
    except (arvados.errors.ApiError,
            arvados.errors.ArgumentError,
            arvados.errors.NotFoundError) as error:
        logger.error("error fetching collection: {}".format(error))
        return 1

    formatters = []
    if args.s:
        formatters.append(size_formatter)
    formatters.append(name_formatter)

    for f in files_in_collection(reader, stream_name):
        print(*(info_func(f) for info_func in formatters), file=stdout)

    return 0
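# An offline sketch (not from the original source) of the locator parsing
# in main() above: the lazy regex splits "identifier[/path]" at the first
# slash, leaving group(2) as None when no subdirectory is given. Both
# locators below are fabricated examples.
import re

for locator in ['zzzzz-4zz18-xxxxxxxxxxxxxxx/dir1/dir2',
                'd41d8cd98f00b204e9800998ecf8427e+0']:
    r = re.search(r'^(.*?)(/.*)?$', locator)
    print(r.group(1), r.group(2))
# zzzzz-4zz18-xxxxxxxxxxxxxxx /dir1/dir2
# d41d8cd98f00b204e9800998ecf8427e+0 None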