Example #1
    def runTest(self):
        m1 = """. 5348b82a029fd9e971a811ce1f71360b+43 0:43:md5sum.txt
. 085c37f02916da1cad16f93c54d899b7+41 0:41:md6sum.txt
. 8b22da26f9f433dea0a10e5ec66d73ba+43 0:43:md7sum.txt
. 085c37f02916da1cad16f93c54d899b7+41 5348b82a029fd9e971a811ce1f71360b+43 8b22da26f9f433dea0a10e5ec66d73ba+43 47:80:md8sum.txt
. 085c37f02916da1cad16f93c54d899b7+41 5348b82a029fd9e971a811ce1f71360b+43 8b22da26f9f433dea0a10e5ec66d73ba+43 40:80:md9sum.txt"""

        m2 = arvados.CollectionReader(m1)

        self.assertEqual(
            m2.manifest_text(),
            ". 5348b82a029fd9e971a811ce1f71360b+43 085c37f02916da1cad16f93c54d899b7+41 8b22da26f9f433dea0a10e5ec66d73ba+43 0:43:md5sum.txt 43:41:md6sum.txt 84:43:md7sum.txt 6:37:md8sum.txt 84:43:md8sum.txt 83:1:md9sum.txt 0:43:md9sum.txt 84:36:md9sum.txt\n"
        )

        self.assertEqual(
            arvados.CollectionReader(
                m1).all_streams()[0].files()['md5sum.txt'].as_manifest(),
            ". 5348b82a029fd9e971a811ce1f71360b+43 0:43:md5sum.txt\n")
        self.assertEqual(
            arvados.CollectionReader(
                m1).all_streams()[0].files()['md6sum.txt'].as_manifest(),
            ". 085c37f02916da1cad16f93c54d899b7+41 0:41:md6sum.txt\n")
        self.assertEqual(
            arvados.CollectionReader(
                m1).all_streams()[0].files()['md7sum.txt'].as_manifest(),
            ". 8b22da26f9f433dea0a10e5ec66d73ba+43 0:43:md7sum.txt\n")
        self.assertEqual(
            arvados.CollectionReader(
                m1).all_streams()[0].files()['md9sum.txt'].as_manifest(),
            ". 085c37f02916da1cad16f93c54d899b7+41 5348b82a029fd9e971a811ce1f71360b+43 8b22da26f9f433dea0a10e5ec66d73ba+43 40:80:md9sum.txt\n"
        )
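
The tests above exercise manifest normalization; the same calls work standalone. A minimal sketch, assuming the arvados Python SDK is installed, with a manifest line reused from the test data:

import arvados

manifest = ". 5348b82a029fd9e971a811ce1f71360b+43 0:43:md5sum.txt\n"
reader = arvados.CollectionReader(manifest)
print(reader.manifest_text())   # already normalized, so printed unchanged
for f in reader.all_files():
    print(f.stream_name(), f.name(), f.size())   # . md5sum.txt 43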
Example #2
 def test_locator_init(self):
     client = self.api_client_mock(200)
     # Ensure Keep will not return anything if asked.
     with tutil.mock_responses(None, 404):
         reader = arvados.CollectionReader(self.DEFAULT_DATA_HASH,
                                           api_client=client)
         self.assertEqual(self.DEFAULT_MANIFEST, reader.manifest_text())
Example #3
 def test_uuid_init_success(self):
     client = self.api_client_mock(200)
     reader = arvados.CollectionReader(self.DEFAULT_UUID, api_client=client,
                                       num_retries=3)
     self.assertEqual(self.DEFAULT_COLLECTION['manifest_text'],
                      reader.manifest_text())
     client.collections().get().execute.assert_called_with(num_retries=3)
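
Outside the mocked test, the same constructor pattern reads a real collection. A sketch assuming a reachable Arvados API server; the UUID is a placeholder:

import arvados

api = arvados.api('v1')
reader = arvados.CollectionReader('zzzzz-4zz18-xxxxxxxxxxxxxxx',  # placeholder UUID
                                  api_client=api, num_retries=3)
print(reader.manifest_text())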
Example #4
    def runTest(self):
        n_lines_in = 2**18
        data_in = "abc\n"
        for x in xrange(0, 18):
            data_in += data_in
        p = subprocess.Popen(["gzip", "-1cn"],
                             stdout=subprocess.PIPE,
                             stdin=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             shell=False,
                             close_fds=True)
        compressed_data_in, stderrdata = p.communicate(data_in)

        cw = arvados.CollectionWriter()
        cw.start_new_file('test.gz')
        cw.write(compressed_data_in)
        gzip_manifest = cw.manifest_text()

        cr = arvados.CollectionReader(gzip_manifest)
        got = 0
        for x in list(cr.all_files())[0].readlines():
            self.assertEqual(x, "abc\n",
                             "decompression returned wrong data: %s" % x)
            got += 1
        self.assertEqual(
            got, n_lines_in, "decompression returned %d lines instead of %d" %
            (got, n_lines_in))
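
Example #4 pairs CollectionWriter with CollectionReader for a gzip round trip; the writer half on its own looks like this. A sketch, assuming a writable Keep service (newer SDK versions may expect bytes rather than str for write()):

import arvados

cw = arvados.CollectionWriter()
cw.start_new_file('hello.txt')
cw.write('hello world\n')
locator = cw.finish()   # stores the buffered data and returns a collection locator
print(arvados.CollectionReader(locator).manifest_text())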
Example #5
def main(args, stdout, stderr, api_client=None):
    args = parse_args(args)

    if api_client is None:
        api_client = arvados.api('v1')

    try:
        cr = arvados.CollectionReader(args.locator,
                                      api_client=api_client,
                                      num_retries=args.retries)
        cr.normalize()
    except (arvados.errors.ArgumentError,
            arvados.errors.NotFoundError) as error:
        print("arv-ls: error fetching collection: {}".format(error),
              file=stderr)
        return 1

    formatters = []
    if args.s:
        formatters.append(size_formatter)
    formatters.append(name_formatter)

    for f in cr.all_files():
        print(*(info_func(f) for info_func in formatters), file=stdout)

    return 0
Example #6
def prepare_gatk_interval_list_collection(interval_list_coll):
    """
    Checks that the supplied interval_list_collection has the required
    files and only the required files for GATK.
    Returns: a portable data hash for the interval_list collection
    """
    # Ensure we have an .interval_list file for GATK
    # see: http://gatkforums.broadinstitute.org/discussion/1601/how-can-i-prepare-a-fasta-file-to-use-as-interval_list
    ilcr = arvados.CollectionReader(interval_list_coll)
    interval_list = {}
    for ils in ilcr.all_streams():
        for ilf in ils.all_files():
            if re.search(r'\.interval_list$', ilf.name()):
                interval_list[ils.name(), ilf.name()] = ilf
    if len(interval_list) < 1:
        raise InvalidArgumentError(
            "Expected an interval_list dict in interval_list_collection, but found none. Found [%s]"
            % ' '.join(ilf.name() for ilf in ils.all_files()))
    if len(interval_list) > 1:
        raise InvalidArgumentError(
            "Expected a single interval_list dict in interval_list_collection, but found multiple. Found [%s]"
            % ' '.join(ilf.name() for ilf in ils.all_files()))
    for ((s_name, f_name), interval_list_f) in interval_list.items():
        ref_input = interval_list_f.as_manifest()
        break
    # Create and return a portable data hash for the ref_input manifest
    r = arvados.api().collections().create(body={
        "manifest_text": ref_input
    }).execute()
    ref_input_pdh = r["portable_data_hash"]
    return ref_input_pdh
Example #7
def job_logs(api, job):
    # Returns the contents of the log for this job (as an array of lines).
    if job['log']:
        log_collection = arvados.CollectionReader(job['log'], api)
        log_filename = "{}.log.txt".format(job['uuid'])
        return log_collection.open(log_filename).readlines()
    return []
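
A hypothetical call to job_logs() above: the two dict keys are the only fields the function reads, and both identifiers here are placeholders:

import arvados

api = arvados.api('v1')
job = {
    'uuid': 'zzzzz-8i9sb-xxxxxxxxxxxxxxx',   # placeholder job UUID
    'log': 'zzzzz-4zz18-xxxxxxxxxxxxxxx',    # placeholder log collection
}
for line in job_logs(api, job):
    print(line.rstrip())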
Example #8
 def test_init_num_retries_propagated(self):
     # More of an integration test...
     client = self.api_client_mock(200)
     reader = arvados.CollectionReader(self.DEFAULT_UUID, api_client=client,
                                       num_retries=3)
     with tutil.mock_responses('foo', 500, 500, 200):
         self.assertEqual('foo',
                          ''.join(f.read(9) for f in reader.all_files()))
Example #9
 def check_manifest_file_sizes(self, manifest_text, expect_sizes):
     cr = arvados.CollectionReader(manifest_text)
     got_sizes = []
     for f in cr.all_files():
         got_sizes += [f.size()]
     self.assertEqual(
         got_sizes, expect_sizes,
         "got wrong file sizes %s, expected %s" % (got_sizes, expect_sizes))
Example #10
def _open_remote(file_ref, config=None):
    """Retrieve an open handle to a file in an Arvados Keep collection.
    """
    import arvados
    api_client = _get_api_client(config)
    coll_uuid, coll_ref = _get_uuid_file(file_ref)
    cr = arvados.CollectionReader(coll_uuid, api_client=api_client)
    return cr.open(coll_ref)
Example #11
def open_remote(file_ref, config=None):
    """Retrieve an open handle to a file in an Arvados Keep collection.
    """
    import arvados
    api_client = _get_api_client(config)
    coll_uuid, coll_ref = file_ref.replace("keep:", "").split("/", 1)
    cr = arvados.CollectionReader(coll_uuid, api_client=api_client)
    return cr.open(coll_ref)
Example #12
def file_size(file_ref, config=None):
    """Retrieve file size in Keep, in MiB.
    """
    import arvados
    api_client = _get_api_client(config)
    coll_uuid, coll_ref = file_ref.replace("keep:", "").split("/", 1)
    cr = arvados.CollectionReader(coll_uuid, api_client=api_client)
    file = cr[coll_ref]
    return file.size() / (1024.0 * 1024.0)
Example #13
def file_size(file_ref, config=None):
    """Retrieve file size in Keep, in MiB.
    """
    import arvados
    api_client = _get_api_client(config)
    coll_uuid, coll_ref = _get_uuid_file(file_ref)
    cr = arvados.CollectionReader(coll_uuid, api_client=api_client)
    file = cr.find(coll_ref)
    return file.size() / (1024.0 * 1024.0)
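
Examples #12 and #13 differ only in the lookup style (cr[coll_ref] indexing versus cr.find(coll_ref)); both resolve a path within the collection. The size arithmetic can be checked against an inline manifest, a sketch reusing a block hash from the tests:

import arvados

cr = arvados.CollectionReader(". 5348b82a029fd9e971a811ce1f71360b+43 0:43:md5sum.txt\n")
f = cr.find('md5sum.txt')   # Example #12 would write cr['md5sum.txt']
print("%.6f MiB" % (f.size() / (1024.0 * 1024.0)))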
Example #14
def write_block_or_manifest(dest, src, api_client, args):
    if '+A' in src:
        # block locator
        kc = arvados.keep.KeepClient(api_client=api_client)
        dest.write(kc.get(src, num_retries=args.retries))
    else:
        # collection UUID or portable data hash
        reader = arvados.CollectionReader(src, num_retries=args.retries)
        dest.write(reader.manifest_text(strip=args.strip_manifest).encode())
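
A hypothetical driver for write_block_or_manifest() above; the Namespace fields mirror the attributes the function reads, and the source locator is a placeholder (no '+A', so the collection branch is taken):

import argparse
import sys

import arvados

args = argparse.Namespace(retries=3, strip_manifest=False)
api = arvados.api('v1')
write_block_or_manifest(sys.stdout.buffer, 'zzzzz-4zz18-xxxxxxxxxxxxxxx', api, args)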
Example #15
 def test_locator_init_falls_back_to_keep(self):
     # Reading manifests from Keep is deprecated.  Feel free to
     # remove this test when we remove the fallback.
     client = self.api_client_mock(200)
     self.mock_get_collection(client, 404, None)
     with tutil.mock_responses(self.DEFAULT_MANIFEST, 200):
         reader = arvados.CollectionReader(self.DEFAULT_DATA_HASH,
                                           api_client=client, num_retries=3)
         self.assertEqual(self.DEFAULT_MANIFEST, reader.manifest_text())
Example #16
def checkin(source_collection, target_dir, merge=True):
    # delete symlinks, commit directory, merge manifests and return combined
    # collection.
    for root, dirs, files in os.walk(target_dir):
        for f in files:
            s = os.lstat(os.path.join(root, f))
            if stat.S_ISLNK(s.st_mode):
                os.unlink(os.path.join(root, f))

    uuid = robust_put.upload(target_dir)
    if merge:
        cr1 = arvados.CollectionReader(source_collection)
        cr2 = arvados.CollectionReader(uuid)
        combined = arvados.CollectionReader(cr1.manifest_text() +
                                            cr2.manifest_text())
        return combined
    else:
        return arvados.CollectionReader(uuid)
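
The merge branch above works because CollectionReader normalizes whatever manifest text it is given, so concatenating two manifests yields one combined collection. A standalone sketch with block hashes borrowed from the tests:

import arvados

m_a = ". 5348b82a029fd9e971a811ce1f71360b+43 0:43:a.txt\n"
m_b = ". 085c37f02916da1cad16f93c54d899b7+41 0:41:b.txt\n"
combined = arvados.CollectionReader(m_a + m_b)
print(combined.manifest_text())   # a single stream listing both files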
Example #17
 def _test_subset(self, collection, expected):
     cr = arvados.CollectionReader(collection, self.api_client)
     for s in cr.all_streams():
         for ex in expected:
             if ex[0] == s:
                 f = s.files()[ex[2]]
                 got = [f.size(), f.stream_name(), f.name(), "".join(f.readall(2**26))]
                 self.assertEqual(got,
                                  ex,
                                  'all_files|as_manifest did not preserve manifest contents: got %s expected %s' % (got, ex))
Example #18
def _collection_files(uuid, config):
    """Retrieve files in the input collection.
    """
    import arvados
    api_client = _get_api_client(config)
    cr = arvados.CollectionReader(uuid, api_client=api_client)
    cr.normalize()
    pdh = cr.portable_data_hash()
    out = [str("%s:%s/%s" % (KEY, os.path.normpath(os.path.join(pdh, x.stream_name())), x.name()))
           for x in cr.all_files()]
    return out
Example #19
 def _runTest(self, what_in, what_out):
     cw = arvados.CollectionWriter()
     cw.start_new_file('test.txt')
     cw.write(what_in)
     test1 = cw.finish()
     cr = arvados.CollectionReader(test1)
     got = []
     for x in list(cr.all_files())[0].readlines():
         got += [x]
     self.assertEqual(got, what_out,
                      "readlines did not split lines correctly: %s" % got)
Example #20
 def _runTest(self, collection, expected):
     cr = arvados.CollectionReader(collection)
     manifest_subsets = []
     for s in cr.all_streams():
         for f in s.all_files():
             manifest_subsets += [f.as_manifest()]
     expect_i = 0
     for m in manifest_subsets:
         cr = arvados.CollectionReader(m)
         for f in cr.all_files():
             got = [
                 f.size(),
                 f.stream_name(),
                 f.name(), "".join(f.readall(2**26))
             ]
             self.assertEqual(
                 got, expected[expect_i],
                 'all_files|as_manifest did not preserve manifest contents: got %s expected %s'
                 % (got, expected[expect_i]))
             expect_i += 1
Example #21
def prepare_gatk_reference_collection(reference_coll):
    """
    Checks that the supplied reference_collection has the required
    files and only the required files for GATK.
    Returns: a portable data hash for the reference collection
    """
    # Ensure we have a .fa reference file with corresponding .fai index and .dict
    # see: http://gatkforums.broadinstitute.org/discussion/1601/how-can-i-prepare-a-fasta-file-to-use-as-reference
    rcr = arvados.CollectionReader(reference_coll)
    ref_fasta = {}
    ref_fai = {}
    ref_dict = {}
    ref_input = None
    dict_reader = None
    for rs in rcr.all_streams():
        for rf in rs.all_files():
            if re.search(r'\.fa$', rf.name()):
                ref_fasta[rs.name(), rf.name()] = rf
            elif re.search(r'\.fai$', rf.name()):
                ref_fai[rs.name(), rf.name()] = rf
            elif re.search(r'\.dict$', rf.name()):
                ref_dict[rs.name(), rf.name()] = rf
    for ((s_name, f_name), fasta_f) in ref_fasta.items():
        fai_f = ref_fai.get(
            (s_name, re.sub(r'fa$', 'fai', f_name)),
            ref_fai.get((s_name, re.sub(r'fa$', 'fa.fai', f_name)), None))
        dict_f = ref_dict.get(
            (s_name, re.sub(r'fa$', 'dict', f_name)),
            ref_dict.get((s_name, re.sub(r'fa$', 'fa.dict', f_name)), None))
        if fasta_f and fai_f and dict_f:
            # found a set of all three!
            ref_input = fasta_f.as_manifest()
            ref_input += fai_f.as_manifest()
            ref_input += dict_f.as_manifest()
            dict_reader = dict_f
            break
    if ref_input is None:
        raise errors.InvalidArgumentError(
            "Expected a reference fasta with fai and dict in reference_collection. Found [%s]"
            % ' '.join(rf.name() for rf in rs.all_files()))
    if dict_reader is None:
        raise errors.InvalidArgumentError(
            "Could not find .dict file in reference_collection. Found [%s]" %
            ' '.join(rf.name() for rf in rs.all_files()))
    # Create and return a portable data hash for the ref_input manifest
    r = arvados.api().collections().create(body={
        "manifest_text": ref_input
    }).execute()
    ref_input_pdh = r["portable_data_hash"]
    return ref_input_pdh
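
The index lookup above accepts either naming convention (genome.fai or genome.fa.fai, and likewise for .dict); the substitutions in isolation, with a hypothetical fasta name:

import re

f_name = 'genome.fa'
print(re.sub(r'fa$', 'fai', f_name))      # genome.fai
print(re.sub(r'fa$', 'fa.fai', f_name))   # genome.fa.fai
print(re.sub(r'fa$', 'dict', f_name))     # genome.dict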
Example #22
    def runTest(self):
        m1 = """. 5348b82a029fd9e971a811ce1f71360b+43 0:43:md5sum.txt
. 085c37f02916da1cad16f93c54d899b7+41 0:41:md5sum.txt
. 8b22da26f9f433dea0a10e5ec66d73ba+43 0:43:md5sum.txt"""
        self.assertEqual(
            arvados.CollectionReader(m1).manifest_text(),
            """. 5348b82a029fd9e971a811ce1f71360b+43 085c37f02916da1cad16f93c54d899b7+41 8b22da26f9f433dea0a10e5ec66d73ba+43 0:127:md5sum.txt
""")

        m2 = """. 204e43b8a1185621ca55a94839582e6f+67108864 b9677abbac956bd3e86b1deb28dfac03+67108864 fc15aff2a762b13f521baf042140acec+67108864 323d2a3ce20370c4ca1d3462a344f8fd+25885655 0:227212247:var-GS000016015-ASM.tsv.bz2
"""
        self.assertEqual(arvados.CollectionReader(m2).manifest_text(), m2)

        m3 = """. 5348b82a029fd9e971a811ce1f71360b+43 3:40:md5sum.txt
. 085c37f02916da1cad16f93c54d899b7+41 0:41:md5sum.txt
. 8b22da26f9f433dea0a10e5ec66d73ba+43 0:43:md5sum.txt"""
        self.assertEqual(
            arvados.CollectionReader(m3).manifest_text(),
            """. 5348b82a029fd9e971a811ce1f71360b+43 085c37f02916da1cad16f93c54d899b7+41 8b22da26f9f433dea0a10e5ec66d73ba+43 3:124:md5sum.txt
""")

        m4 = """. 204e43b8a1185621ca55a94839582e6f+67108864 0:3:foo/bar
./zzz 204e43b8a1185621ca55a94839582e6f+67108864 0:999:zzz
./foo 323d2a3ce20370c4ca1d3462a344f8fd+25885655 0:3:bar"""
        self.assertEqual(
            arvados.CollectionReader(m4).manifest_text(),
            """./foo 204e43b8a1185621ca55a94839582e6f+67108864 323d2a3ce20370c4ca1d3462a344f8fd+25885655 0:3:bar 67108864:3:bar
./zzz 204e43b8a1185621ca55a94839582e6f+67108864 0:999:zzz
""")

        m5 = """. 204e43b8a1185621ca55a94839582e6f+67108864 0:3:foo/bar
./zzz 204e43b8a1185621ca55a94839582e6f+67108864 0:999:zzz
./foo 204e43b8a1185621ca55a94839582e6f+67108864 3:3:bar"""
        self.assertEqual(
            arvados.CollectionReader(m5).manifest_text(),
            """./foo 204e43b8a1185621ca55a94839582e6f+67108864 0:6:bar
./zzz 204e43b8a1185621ca55a94839582e6f+67108864 0:999:zzz
""")

        with open('testdata/1000G_ref_manifest') as f6:
            m6 = f6.read()
            self.assertEqual(arvados.CollectionReader(m6).manifest_text(), m6)

        with open('testdata/jlake_manifest') as f7:
            m7 = f7.read()
            self.assertEqual(arvados.CollectionReader(m7).manifest_text(), m7)

        m8 = """./a\\040b\\040c 59ca0efa9f5633cb0371bbc0355478d8+13 0:13:hello\\040world.txt
"""
        self.assertEqual(arvados.CollectionReader(m8).manifest_text(), m8)
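
The m8 case above shows that manifests escape spaces in path names as \040; the round trip can be checked in isolation:

import arvados

m = "./a\\040b\\040c 59ca0efa9f5633cb0371bbc0355478d8+13 0:13:hello\\040world.txt\n"
assert arvados.CollectionReader(m).manifest_text() == m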
Example #23
def collection_files(uuid, config=None, add_uuid=False):
    """Retrieve files in the input collection.
    """
    import arvados
    api_client = _get_api_client(config)
    cr = arvados.CollectionReader(uuid, api_client=api_client)
    cr.normalize()
    out = ["%s/%s" % (x.stream_name(), x.name()) for x in cr.all_files()]
    if add_uuid:
        out = [
            "keep:%s" % os.path.normpath(os.path.join(uuid, x)) for x in out
        ]
    return out
Example #24
def one_task_per_pair_input_file(if_sequence=0, and_end_task=True):
    """
    Queue one task for each pair of fastq files in this job's input
    collection.

    Each new task will have two parameters, named "input_1" and
    "input_2", each being a manifest containing a single fastq file.

    A matching pair of files in the input collection is assumed to
    have names "x_1.y" and "x_2.y".

    Files in the input collection that are not part of a matched pair
    are silently ignored.

    if_sequence and and_end_task arguments have the same significance
    as in arvados.job_setup.one_task_per_input_file().
    """
    if if_sequence != arvados.current_task()['sequence']:
        return
    job_input = arvados.current_job()['script_parameters']['input']
    cr = arvados.CollectionReader(job_input)
    all_files = []
    for s in cr.all_streams():
        all_files += list(s.all_files())
    for s in cr.all_streams():
        for left_file in s.all_files():
            left_name = left_file.name()
            right_file = None
            right_name = re.sub(r'(.*_)1\.', r'\g<1>2.', left_name)
            if right_name == left_name:
                continue
            for f2 in s.all_files():
                if right_name == f2.name():
                    right_file = f2
            if right_file is not None:
                new_task_attrs = {
                    'job_uuid': arvados.current_job()['uuid'],
                    'created_by_job_task_uuid': arvados.current_task()['uuid'],
                    'sequence': if_sequence + 1,
                    'parameters': {
                        'input_1': left_file.as_manifest(),
                        'input_2': right_file.as_manifest()
                    }
                }
                arvados.api().job_tasks().create(body=new_task_attrs).execute()
    if and_end_task:
        arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                         body={
                                             'success': True
                                         }).execute()
        exit(0)
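
The pairing rule in one_task_per_pair_input_file() is a single regex substitution; checked standalone with hypothetical file names:

import re

left_name = 'sample_1.fastq'
right_name = re.sub(r'(.*_)1\.', r'\g<1>2.', left_name)
print(right_name)                                   # sample_2.fastq
print(re.sub(r'(.*_)1\.', r'\g<1>2.', 'README'))    # unchanged, so the file is skipped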
Example #25
    def update(self):
        try:
            if self.collection_object is not None and portable_data_hash_pattern.match(
                    self.collection_locator):
                return True

            if self.collection_locator is None:
                self.fresh()
                return True

            with llfuse.lock_released:
                coll_reader = arvados.CollectionReader(
                    self.collection_locator,
                    self.api,
                    self.api.keep,
                    num_retries=self.num_retries)
                new_collection_object = coll_reader.api_response() or {}
                # If the Collection only exists in Keep, there will be no API
                # response.  Fill in the fields we need.
                if 'uuid' not in new_collection_object:
                    new_collection_object['uuid'] = self.collection_locator
                if "portable_data_hash" not in new_collection_object:
                    new_collection_object[
                        "portable_data_hash"] = new_collection_object["uuid"]
                if 'manifest_text' not in new_collection_object:
                    new_collection_object[
                        'manifest_text'] = coll_reader.manifest_text()
                coll_reader.normalize()
            # end with llfuse.lock_released, re-acquire lock

            if self.collection_object is None or self.collection_object[
                    "portable_data_hash"] != new_collection_object[
                        "portable_data_hash"]:
                self.new_collection(new_collection_object, coll_reader)

            self.fresh()
            return True
        except arvados.errors.NotFoundError:
            _logger.exception("arv-mount %s: error", self.collection_locator)
        except arvados.errors.ArgumentError as detail:
            _logger.warning("arv-mount %s: error %s", self.collection_locator,
                            detail)
            if self.collection_object is not None and "manifest_text" in self.collection_object:
                _logger.warning("arv-mount manifest_text is: %s",
                                self.collection_object["manifest_text"])
        except Exception:
            _logger.exception("arv-mount %s: error", self.collection_locator)
            if self.collection_object is not None and "manifest_text" in self.collection_object:
                _logger.error("arv-mount manifest_text is: %s",
                              self.collection_object["manifest_text"])
        return False
Example #26
    def runTest(self):
        # Create the request handler
        operations = fuse.Operations(os.getuid(), os.getgid())
        e = operations.inodes.add_entry(fuse.Directory(llfuse.ROOT_INODE))
        operations.inodes.load_collection(
            e, arvados.CollectionReader(arvados.Keep.get(self.testcollection)))

        self.mounttmp = tempfile.mkdtemp()

        llfuse.init(operations, self.mounttmp, [])
        t = threading.Thread(None, lambda: llfuse.main())
        t.start()

        # wait until the driver is finished initializing
        operations.initlock.wait()

        # now check some stuff
        d1 = os.listdir(self.mounttmp)
        d1.sort()
        self.assertEqual(d1, ['dir1', 'dir2', 'thing1.txt', 'thing2.txt'])

        d2 = os.listdir(os.path.join(self.mounttmp, 'dir1'))
        d2.sort()
        self.assertEqual(d2, ['thing3.txt', 'thing4.txt'])

        d3 = os.listdir(os.path.join(self.mounttmp, 'dir2'))
        d3.sort()
        self.assertEqual(d3, ['dir3', 'thing5.txt', 'thing6.txt'])

        d4 = os.listdir(os.path.join(self.mounttmp, 'dir2/dir3'))
        d4.sort()
        self.assertEqual(d4, ['thing7.txt', 'thing8.txt'])

        files = {
            'thing1.txt': 'data 1',
            'thing2.txt': 'data 2',
            'dir1/thing3.txt': 'data 3',
            'dir1/thing4.txt': 'data 4',
            'dir2/thing5.txt': 'data 5',
            'dir2/thing6.txt': 'data 6',
            'dir2/dir3/thing7.txt': 'data 7',
            'dir2/dir3/thing8.txt': 'data 8'
        }

        for k, v in files.items():
            with open(os.path.join(self.mounttmp, k)) as f:
                self.assertEqual(f.read(), v)
Example #27
def one_task_per_bam_file(if_sequence=0, and_end_task=True):
    """
    Queue one task for each bam file in this job's input collection.

    Each new task will have an "input" parameter: a manifest
    containing one .bam file and (if available) the corresponding .bai
    index file.

    Files in the input collection that are not named *.bam or *.bai
    (as well as *.bai files that do not match any .bam file present)
    are silently ignored.

    if_sequence and and_end_task arguments have the same significance
    as in arvados.job_setup.one_task_per_input_file().
    """
    if if_sequence != arvados.current_task()['sequence']:
        return
    job_input = arvados.current_job()['script_parameters']['input']
    cr = arvados.CollectionReader(job_input)
    bam = {}
    bai = {}
    for s in cr.all_streams():
        for f in s.all_files():
            if re.search(r'\.bam$', f.name()):
                bam[s.name(), f.name()] = f
            elif re.search(r'\.bai$', f.name()):
                bai[s.name(), f.name()] = f
    for ((s_name, f_name), bam_f) in bam.items():
        bai_f = bai.get((s_name, re.sub(r'bam$', 'bai', f_name)), None)
        task_input = bam_f.as_manifest()
        if bai_f:
            task_input += bai_f.as_manifest()
        new_task_attrs = {
            'job_uuid': arvados.current_job()['uuid'],
            'created_by_job_task_uuid': arvados.current_task()['uuid'],
            'sequence': if_sequence + 1,
            'parameters': {
                'input': task_input
            }
        }
        arvados.api().job_tasks().create(body=new_task_attrs).execute()
    if and_end_task:
        arvados.api().job_tasks().update(uuid=arvados.current_task()['uuid'],
                                         body={
                                             'success': True
                                         }).execute()
        exit(0)
Example #28
def main(sqlite_db):
    # Query for recent Veritas sequenced samples
    query = ("SELECT uploaded_data.human_id, date, name "
             "FROM uploaded_data WHERE "
             "data_type == 'Veritas Genetics' AND "
             "uploaded_data.name GLOB '*VCF'")
    conn = sqlite3.connect(sqlite_db)
    df = pd.read_sql_query(query, conn)

    # Uniquify by sample
    df.sort_values("date", inplace=True)
    df.drop_duplicates(inplace=True)
    print(df.describe())

    # Load files in Arvados BAM collection, if arvados client installed
    if arvados:
        api = arvados.api(
            host="su92l.arvadosapi.com",
            token="42yz0fp9s19djsgkae33khevpzq4or1ile5o7khofzw388lvfl")
        cr = arvados.CollectionReader("su92l-4zz18-1rqqi0kpkfmfite", api)
        bam_coll = [(x.name(), x.size()) for x in cr.all_files()]
    else:
        bam_coll = None

    # Find recent samples with more than 1 data type, emphasizing diverse samples
    # Require higher depth coverage > 50Gb
    for sample in df["human_id"]:
        query = (
            "SELECT data_type, date FROM uploaded_data WHERE human_id='%s'" %
            sample)
        df = pd.read_sql_query(query, conn)
        if len(df.data_type.unique()) > 1:
            bam_size = find_bam_size(sample, bam_coll) if bam_coll else 100
            if bam_size > 50:
                query = ("SELECT * from demographics WHERE human_id='%s'" %
                         sample)
                dfd = pd.read_sql_query(query, conn)
                if (len(dfd)
                        and ((dfd["gender"][0] not in ["", "Male"]) or
                             (dfd["race"][0]
                              not in ["", "White", "Caucasian (White)"]))):
                    demo = "%s %s" % (dfd["gender"][0], dfd["race"][0])
                elif len(dfd) == 0:
                    demo = "No demographics"
                print(sample, "%sGb" % bam_size, demo,
                      list(set(df["data_type"])), list(set(df["date"])))
Example #29
 def runTest(self):
     cr = arvados.CollectionReader('d6c3b8e571f1b81ebb150a45ed06c884+114')
     got = []
     for s in cr.all_streams():
         for f in s.all_files():
             got += [[f.size(), f.stream_name(), f.name(), f.read(2**26)]]
     expected = [[3, '.', 'bar.txt', 'bar'], [3, '.', 'foo.txt', 'foo'],
                 [3, './baz', 'baz.txt', 'baz']]
     self.assertEqual(got, expected)
     stream0 = cr.all_streams()[0]
     self.assertEqual(
         stream0.readfrom(0, 0), '',
         'reading zero bytes should have returned empty string')
     self.assertEqual(stream0.readfrom(0, 2**26), 'foobar',
                      'reading entire stream failed')
     self.assertEqual(
         stream0.readfrom(2**26, 0), '',
         'reading zero bytes should have returned empty string')
Example #30
def main(args, stdout, stderr, api_client=None, logger=None):
    args = parse_args(args)

    if api_client is None:
        api_client = arvados.api('v1')

    if logger is None:
        logger = logging.getLogger('arvados.arv-ls')

    try:
        r = re.search(r'^(.*?)(/.*)?$', args.locator)
        collection = r.group(1)
        get_prefix = r.group(2)

        cr = arvados.CollectionReader(collection,
                                      api_client=api_client,
                                      num_retries=args.retries)
        if get_prefix:
            if get_prefix[-1] == '/':
                get_prefix = get_prefix[:-1]
            stream_name = '.' + get_prefix
            reader = cr.find(stream_name)
            if not (isinstance(reader, arvados.CollectionReader)
                    or isinstance(reader, arvados.collection.Subcollection)):
                logger.error("'{}' is not a subdirectory".format(get_prefix))
                return 1
        else:
            stream_name = '.'
            reader = cr
    except (arvados.errors.ApiError, arvados.errors.ArgumentError,
            arvados.errors.NotFoundError) as error:
        logger.error("error fetching collection: {}".format(error))
        return 1

    formatters = []
    if args.s:
        formatters.append(size_formatter)
    formatters.append(name_formatter)

    for f in files_in_collection(reader, stream_name):
        print(*(info_func(f) for info_func in formatters), file=stdout)

    return 0
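
A hypothetical direct invocation of main() above. The flag spellings are assumptions about parse_args(), which is not shown (only args.s, args.retries and args.locator are visible here), and the locator is a placeholder:

import sys

rc = main(['-s', 'zzzzz-4zz18-xxxxxxxxxxxxxxx/subdir'], sys.stdout, sys.stderr)
sys.exit(rc)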