def gather_chunks(chunks, output_file, nproc=1):
    if len(chunks) == 1:
        datastore = DataStore.load_from_json(op.realpath(chunks[0]))
        log.info("Writing datastore to %s", output_file)
        datastore.write_json(output_file)
        return len(datastore.files)
    file_names_by_bc = defaultdict(list)
    datastore_files_by_bc = {}
    for file_name in chunks:
        log.info("Reading datastore from %s", file_name)
        datastore = DataStore.load_from_json(op.realpath(file_name))
        for ds_file in datastore.files.values():
            ds_file_name = op.realpath(ds_file.path)
            base_name = op.basename(ds_file_name)
            fields = base_name.split(".")
            bc_pair = fields[-3]
            file_names_by_bc[bc_pair].append(ds_file_name)
            datastore_files_by_bc[bc_pair] = ds_file
    log.info("Found %d unique barcode pairs", len(file_names_by_bc))
    _results = []
    pool = multiprocessing.Pool(nproc)
    for bc_pair, file_names in file_names_by_bc.items():
        _results.append(
            pool.apply_async(_merge_chunks,
                             (file_names, datastore_files_by_bc[bc_pair])))
    pool.close()
    pool.join()
    datastore_files = [r.get() for r in _results]
    datastore_out = DataStore(datastore_files)
    log.info("Writing datastore to %s", output_file)
    datastore_out.write_json(output_file)
    return len(datastore_files)
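A minimal usage sketch for gather_chunks above; the chunk file names and nproc value are hypothetical placeholders, not files from this project:

# Hypothetical example: gather two chunked datastores into one output datastore.
n_files = gather_chunks(
    ["chunk-1.datastore.json", "chunk-2.datastore.json"],  # hypothetical chunk paths
    "gathered.datastore.json",                             # hypothetical output path
    nproc=2)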
Example #2
    def test_load_datastore_from_file(self):
        """
        Can load a DataStore from a JSON file.
        """
        ds = DataStore.load_from_json(_to_ds_json(self.job_dir))
        self.assertIsInstance(ds, DataStore)
Example #3
def datastore_to_datastorefile_objs(in_datastore_json,
                                    allowed_types=ALLOWED_TYPES):
    """Return (datastorefile_objs, type_id, cls, ext)
    datastorefile_objs -- a list of DataStoreFile objects.
    type_id -- id
    cls -- e.g., SubreadSet
    ext -- e.g., subreadset.xml
    """
    datastore = DataStore.load_from_json(in_datastore_json)
    allowed_type_ids = [t.file_type_id for t in allowed_types]
    # Is input datastore empty?
    if len(datastore.files) == 0:
        raise ValueError(
            "Expected one or more dataset files in datastore {}".format(
                in_datastore_json))

    # Do all files share the same type?
    observed_type_ids = list(
        set([f.file_type_id for f in datastore.files.values()]))
    if len(observed_type_ids) != 1:
        raise ValueError(
            "Could not handle datastore of mixed types: {}!".format(
                observed_type_ids))

    # Is it an allowed file type?
    type_id = observed_type_ids[0]
    if type_id not in allowed_type_ids:
        raise ValueError(
            "Could not handle {} dataset in datastore file {}, only support {}!"
            .format(type_id, in_datastore_json, allowed_type_ids))

    cls = _type_id_to_cls(type_id)
    ext = _type_id_to_ext(type_id)
    return list(datastore.files.values()), type_id, cls, ext
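A hedged usage sketch for datastore_to_datastorefile_objs; the input path is a placeholder:

# Hypothetical example: unpack the tuple returned by datastore_to_datastorefile_objs.
ds_files, type_id, cls, ext = datastore_to_datastorefile_objs("mydata.datastore.json")
for ds_file in ds_files:
    print(ds_file.path, type_id, ext)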
Example #4
def iterate_datastore_read_set_files(
        datastore_file, allowed_read_types=Constants.ALLOWED_BC_TYPES):
    """
    Iterate over dataset (e.g., SubreadSet or ConsensusReadSet) files listed in a datastore JSON.
    """
    ds = DataStore.load_from_json(datastore_file)
    files = ds.files.values()
    for f in files:
        if f.file_type_id in allowed_read_types:
            yield f
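A minimal sketch of iterating the generator above; the datastore path is a placeholder:

# Hypothetical example: print the type id and path of each allowed read set file.
for read_set_file in iterate_datastore_read_set_files("barcoded.datastore.json"):
    print(read_set_file.file_type_id, read_set_file.path)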
Example #5
    def test_datastore_file_name_and_description(self):
        """
        Make sure output files have non-blank name and description.
        """
        ds = DataStore.load_from_json(_to_ds_json(self.job_dir))
        rx = re.compile(r'[a-zA-Z0-9]{1,}')

        for fd in ds.files.values():
            for x in (fd.name, fd.description):
                self.assertTrue(rx.search(x))
Example #6
def run_args(args):
    dstore = DataStore.load_from_json(os.path.realpath(args.datastore))
    ds_in = ConsensusReadSet(args.ccs_in, trustCounts=True)
    ds_out = ConsensusReadSet(*([f.path for f in dstore.files.values()]),
                              trustCounts=True)
    sanitize_dataset_tags(ds_out, remove_hidden=True)
    ds_out.name = ds_in.name.replace(" (filtered)", "") + " (trimmed)"
    ds_out.subdatasets = []
    ds_out.write("trimmed.consensusreadset.xml")
    return 0
Example #7
    def wrapper(self):
        ds_path = os.path.join(self.job_dir, "workflow", "datastore.json")
        ds = DataStore.load_from_json(ds_path)
        # log.info("Loaded datastore {d}".format(d=ds))
        for ds_file in ds.files.values():
            if ds_file.file_type_id == file_type_id:
                started_at = time.time()
                validator_func(ds_file.path, **kwargs)
                run_time = time.time() - started_at
                log.debug("Successfully validated in {s:.2f} sec {p}".format(p=ds_file.path, s=run_time))
        self.assertTrue(True)
Example #8
    def run_after(self, rtc, output_dir):
        with openDataSet(rtc.task.output_files[0]) as f:
            f.assertIndexed()
            self.assertEqual(len(f.toExternalFiles()), 1)
            # test for bug 33778
            qnames = set()
            for rec in f:
                qnames.add(rec.qName)
            self.assertEqual(len(qnames), len(f))
        ds = DataStore.load_from_json(rtc.task.output_files[1])
        self.assertEqual(len(ds.files), 2)
Example #9
File: base.py  Project: yqin22/pbsmrtpipe
    def wrapper(self):
        ds_path = os.path.join(self.job_dir, "workflow", "datastore.json")
        ds = DataStore.load_from_json(ds_path)
        # log.info("Loaded datastore {d}".format(d=ds))
        for ds_file in ds.files.values():
            if ds_file.file_type_id == file_type_id:
                started_at = time.time()
                validator_func(ds_file.path, **kwargs)
                run_time = time.time() - started_at
                log.debug("Successfully validated in {s:.2f} sec {p}".format(p=ds_file.path, s=run_time))
        self.assertTrue(True)
Example #10
def test_gather_datastore_json():
    import subprocess
    from pbcommand.models import DataStore
    d = '/pbi/dept/secondary/siv/testdata/pbsvtools-unittest/data/test_scatter_align_datastore/'
    if1 = op.join(d, '1.aln.datastore.json')
    if2 = op.join(d, '2.aln.datastore.json')
    of = tempfile.NamedTemporaryFile(suffix=".datastore.json").name
    args = ['python', '-m', 'pbcoretools.tasks.gather', of, if1, if2]
    subprocess.check_call(args)
    out_fns = DataStore.load_from_json(of).to_dict()['files']
    expected_bam_1 = op.join(d, '1.bam')
    expected_bam_2 = op.join(d, '2.bam')
    assert out_fns[0]['path'] == expected_bam_1
    assert out_fns[1]['path'] == expected_bam_2
Example #11
    def test_datastore_paths(self):
        tmpfile = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
        base_dir = os.path.dirname(tmpfile)
        tmp_ds = os.path.join(base_dir, "datastore.json")
        dsf = DataStoreFile(str(uuid.uuid4()), "pbcommand.tasks.dev_task",
                            FileTypes.DS_SUBREADS.file_type_id,
                            os.path.basename(tmpfile), False, "Subreads",
                            "Subread DataSet XML")
        ds = DataStore([dsf])
        ds.write_json(tmp_ds)
        with open(tmp_ds) as json_in:
            d = json.loads(json_in.read())
            self.assertFalse(os.path.isabs(d['files'][0]['path']))
        ds = DataStore.load_from_json(tmp_ds)
        self.assertEqual(list(ds.files.values())[0].path, tmpfile)
Example #12
    def test_integration(self):
        ds_out = op.join(self._output_dir, "datastore.json")
        args = [
            "python", "-m", "pbreports.report.subreads_reports",
            pbtestdata.get_file("subreads-sequel"), ds_out
        ]
        o, c, m = backticks(" ".join(args))
        self.assertEqual(c, 0)
        self.assertTrue(op.exists(ds_out))
        datastore = DataStore.load_from_json(ds_out)
        datastore_files = [f for u, f in datastore.files.items()]
        self.assertEqual(sorted([f.file_id for f in datastore_files]), [
            "pbreports.tasks.adapter_report_xml",
            "pbreports.tasks.filter_stats_report_xml",
            "pbreports.tasks.loading_report_xml"
        ])
Example #13
    def test_datastore_report_file_uuid(self):
        """Test that the DataStore file and the Underlying Report have the same UUID"""
        ds = DataStore.load_from_json(_to_ds_json(self.job_dir))
        n_tested = 0
        for ds_file in ds.files.values():
            if ds_file.file_type_id == FileTypes.REPORT.file_type_id:
                rpt = load_report_from_json(ds_file.path)
                emsg = "{p}: {u1} != {u2}".format(p=ds_file.path,
                                                  u1=rpt.uuid,
                                                  u2=ds_file.uuid)
                # by convention the DS UUID and the Report UUID should be the same value
                self.assertEqual(rpt.uuid, ds_file.uuid, emsg)
                n_tested += 1

        if n_tested == 0:
            raise unittest.SkipTest(
                "Warning. No Report JSON files in datastore.")
Example #14
def _get_barcoded_datasets(reads_file):
    dir_name = os.path.dirname(os.path.abspath(reads_file))
    if reads_file.endswith(".datastore.json"):
        datastore = DataStore.load_from_json(reads_file)
        datasets = [
            _to_abs_path(dir_name, f.path)
            for u, f in datastore.files.items()
            if f.file_type_id in Constants.VALID_FT_IDS
        ]
        if len(datasets) == 0:
            raise ValueError("No datasets containing barcoded reads were " +
                             "present in the input.  This could mean that " +
                             "demultiplexing was run with incorrect inputs " +
                             "or an overly restrictive minimum barcode score.")
        return datasets
    else:
        return [reads_file]
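A hedged usage sketch for _get_barcoded_datasets; the input path is a placeholder:

# Hypothetical example: resolve barcoded dataset paths from a datastore JSON;
# a plain dataset XML input would be returned unchanged as a single-element list.
for dataset_path in _get_barcoded_datasets("lima_output.datastore.json"):
    print(dataset_path)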
Example #15
def _run_auto_ccs_outputs_barcoded(datastore_in, datastore_out, nproc=Constants.MAX_NPROC):
    base_dir = op.dirname(datastore_out)
    files = DataStore.load_from_json(datastore_in).files.values()
    ccs_files = []
    for ds_file in files:
        # FIXME use a better file_id
        if ds_file.file_type_id == FileTypes.DS_CCS.file_type_id and ds_file.file_id == "barcoding.tasks.lima-0":
            ccs_files.append(ds_file.path)
            log.info("Exporting %s", ds_file.path)
    log.info("Exporting %d CCS datasets", len(ccs_files))
    args = [(f, base_dir) for f in ccs_files]
    output_files = list(itertools.chain.from_iterable(
        pool_map(__run_ccs_bam_fastq_exports, args, nproc)))
    output_files.extend([
        _create_zipped_fastq(output_files, "all_barcodes.fastq.tar.gz"),
        _create_zipped_fasta(output_files, "all_barcodes.fasta.tar.gz")
    ])
    DataStore(output_files).write_json(datastore_out)
    return 0
Example #16
    def _validate_datastore_reports(self, validate_func):

        ds = DataStore.load_from_json(_to_ds_json(self.job_dir))

        # set to True once one or more valid Reports are found
        have_reports = False

        for ds_file in ds.files.values():
            if ds_file.file_type_id == FileTypes.REPORT.file_type_id:
                try:
                    _ = validate_func(ds_file.path)
                except ValueError as e:
                    self.fail(
                        "Report validation failed:\n{e}".format(e=str(e)))
                else:
                    have_reports = True

        if not have_reports:
            raise unittest.SkipTest("No Report JSON files in datastore.")
        return have_reports
Example #17
    def test_datastore_dataset_file_uuid(self):
        """Test that the DataStore file and the Underlying Report have the same UUID"""
        dataset_type_ids = FileTypes.ALL_DATASET_TYPES().keys()

        ds = DataStore.load_from_json(_to_ds_json(self.job_dir))

        n_tested = 0
        for ds_file in ds.files.values():
            if ds_file.file_type_id in dataset_type_ids:
                path = ds_file.path
                dsf_uuid = ds_file.uuid
                uuid = getDataSetUuid(path)
                self.assertEqual(
                    uuid, dsf_uuid, "{p}: {u1} != {u2}".format(p=path,
                                                               u1=uuid,
                                                               u2=dsf_uuid))
                n_tested += 1

        if n_tested == 0:
            raise unittest.SkipTest(
                "Warning. No DataSet XML files in datastore.")
Example #18
def update_barcoded_sample_metadata(
        base_dir,
        datastore_file,
        input_reads,
        barcode_set,
        isoseq_mode=False,
        use_barcode_uuids=True,
        nproc=1,
        min_score_filter=Constants.BARCODE_QUALITY_GREATER_THAN):
    """
    Given a datastore JSON of SubreadSets produced by barcoding, apply the
    following updates to each:
    1. Include only the BioSample(s) corresponding to its barcode
    2. Add the BioSample name to the dataset name
    3. Add a ParentDataSet record in the Provenance section.
    """
    barcode_names, bio_samples_d, barcode_uuids_d, update_files, parent_info = _load_files_for_update(
        input_reads, barcode_set, datastore_file)
    pool = multiprocessing.Pool(nproc)
    _results = []
    for ds_file in update_files:
        _results.append(
            pool.apply_async(_update_barcoded_sample_metadata,
                             (base_dir, ds_file, barcode_names, parent_info,
                              isoseq_mode, use_barcode_uuids, bio_samples_d,
                              barcode_uuids_d, min_score_filter)))
    pool.close()
    pool.join()
    datastore_files = [r.get() for r in _results]
    # copy over the un-barcoded reads BAM
    dstore = DataStore.load_from_json(datastore_file)
    files = dstore.files.values()
    for f in files:
        if f.file_id != "barcoding.tasks.lima-0":
            datastore_files.append(f)
    return DataStore(datastore_files)
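A minimal usage sketch for update_barcoded_sample_metadata, assuming hypothetical input paths; the returned DataStore is persisted with write_json as in the other examples:

# Hypothetical example: update barcoded sample metadata and write the result.
updated = update_barcoded_sample_metadata(
    "output_dir",                  # hypothetical base directory
    "barcoded.datastore.json",     # hypothetical datastore of barcoded SubreadSets
    "input.subreadset.xml",        # hypothetical un-barcoded input reads
    "barcodes.barcodeset.xml",     # hypothetical BarcodeSet
    nproc=4)
updated.write_json("updated.datastore.json")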
Example #19
    def _check_datastore(self, file_name):
        ds = DataStore.load_from_json(file_name)
        files = sorted([f.source_id for f in ds.files.values()])
        assert files == ["mapped_bam", "mapped_bam_bai"]
Example #20
def gather_datastore(input_files, output_file, skip_empty=True):
    ds = DataStore([])
    for i_fn in input_files:
        for uuid, f in DataStore.load_from_json(i_fn).files.items():
            ds.add(f)
    ds.write_json(output_file)
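A minimal usage sketch for gather_datastore; the file names are placeholders:

# Hypothetical example: merge two per-chunk datastores into one gathered JSON.
gather_datastore(
    ["chunk-1.datastore.json", "chunk-2.datastore.json"],  # hypothetical inputs
    "gathered.datastore.json")                             # hypothetical output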