Example No. 1
 def test_copy(self):
     ds1 = DataSet(data.getXml())
     ds2 = ds1.copy()
     self.assertFalse(ds1 == ds2)
     self.assertFalse(ds1.uuid == ds2.uuid)
     self.assertFalse(ds1 is ds2)
     self.assertTrue(ds1.name == ds2.name)
     self.assertTrue(ds1.externalResources == ds2.externalResources)
     # The name and UniqueId are different:
     self.assertFalse(ds1.objMetadata == ds2.objMetadata)
     self.assertTrue(ds1.filters == ds2.filters)
     self.assertTrue(ds1.subdatasets == ds2.subdatasets)
     self.assertTrue(len(ds1.subdatasets) == 2)
     self.assertTrue(len(ds2.subdatasets) == 2)
     assert not any(ds1d is ds2d
                    for ds1d in ds1.subdatasets
                    for ds2d in ds2.subdatasets)
     # TODO: once simulated files are indexable, turn on strict:
     ds1 = SubreadSet(data.getXml(no=10), strict=False)
     self.assertEqual(type(ds1.metadata).__name__, 'SubreadSetMetadata')
     ds2 = ds1.copy()
     self.assertEqual(type(ds2.metadata).__name__, 'SubreadSetMetadata')
     # Let's try casting
     ds1 = DataSet(data.getBam())
     self.assertEqual(type(ds1).__name__, 'DataSet')
     ds1 = ds1.copy(asType='SubreadSet')
     self.assertEqual(type(ds1).__name__, 'SubreadSet')
     # Let's do some illicit casting
     with self.assertRaises(TypeError):
         ds1 = ds1.copy(asType='ReferenceSet')
     # Let's try not having to cast
     ds1 = SubreadSet(data.getBam())
     self.assertEqual(type(ds1).__name__, 'SubreadSet')
Example No. 2
 def test_update_barcoded_sample_metadata(self):
     datastore_tmp = tempfile.NamedTemporaryFile(
         suffix=".datastore.json").name
     barcodes = pbtestdata.get_file("barcodeset")
     ds = split_barcoded_dataset(self.SUBREADS)
     ds.write_json(datastore_tmp)
     base_dir = tempfile.mkdtemp()
     datastore = update_barcoded_sample_metadata(base_dir, datastore_tmp,
                                                 self.SUBREADS, barcodes)
     validate_barcoded_datastore_files(self, self.SUBREADS, datastore)
     # now with use_barcode_uuids=False
     datastore = update_barcoded_sample_metadata(base_dir,
                                                 datastore_tmp,
                                                 self.SUBREADS,
                                                 barcodes,
                                                 use_barcode_uuids=False)
     validate_barcoded_datastore_files(self,
                                       self.SUBREADS,
                                       datastore,
                                       use_barcode_uuids=False)
     # test that it doesn't break with no collection metadata
     ss = SubreadSet(self.SUBREADS)
     ss.metadata.collections = None
     ss_tmp = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
     ss.write(ss_tmp)
     ds = split_barcoded_dataset(ss_tmp)
     ds.write_json(datastore_tmp)
     base_dir = tempfile.mkdtemp()
     datastore = update_barcoded_sample_metadata(base_dir, datastore_tmp,
                                                 self.SUBREADS, barcodes)
     validate_barcoded_datastore_files(self,
                                       self.SUBREADS,
                                       datastore,
                                       have_collection_metadata=False,
                                       number_of_expected_collections=0)
Example No. 3
 def test_discard_bio_samples(self):
     ds = SubreadSet(self.SUBREADS)
     discard_bio_samples(ds, "lbc1--lbc1")
     coll = ds.metadata.collections[0]
     bioSamples = ds.metadata.collections[0].wellSample.bioSamples
     assert len(bioSamples) == 1
     assert bioSamples[0].name == "Alice"
     # No matching BioSample records
     ds = SubreadSet(self.SUBREADS)
     ds.metadata.collections[0].wellSample.bioSamples.pop(1)
     ds.metadata.collections[0].wellSample.bioSamples.pop(1)
     bioSample = ds.metadata.collections[0].wellSample.bioSamples[0]
     while len(bioSample.DNABarcodes) > 0:
         bioSample.DNABarcodes.pop(0)
     assert len(ds.metadata.collections[0].wellSample.bioSamples) == 1
     discard_bio_samples(ds, "lbc1--lbc1")
     assert len(ds.metadata.collections[0].wellSample.bioSamples) == 1
     assert ds.metadata.collections[0].wellSample.bioSamples[
         0].name == "lbc1--lbc1"
     assert ds.metadata.collections[0].wellSample.bioSamples[0].DNABarcodes[
         0].name == "lbc1--lbc1"
     # no BioSample records
     ds = SubreadSet(pbtestdata.get_file("subreads-sequel"))
     assert len(ds.metadata.collections[0].wellSample.bioSamples) == 0
     discard_bio_samples(ds, "lbc1--lbc1")
     assert len(ds.metadata.collections[0].wellSample.bioSamples) == 1
     assert ds.metadata.collections[0].wellSample.bioSamples[
         0].name == "lbc1--lbc1"
     assert ds.metadata.collections[0].wellSample.bioSamples[0].DNABarcodes[
         0].name == "lbc1--lbc1"
Example No. 4
 def test_get_barcode_sample_mappings(self):
     with SubreadSet(self._subreads) as ds:
         # just double-checking that the XML defines more samples than are
         # actually present in the BAM
         assert len(ds.metadata.collections[0].wellSample.bioSamples) == 3
     samples = get_barcode_sample_mappings(SubreadSet(self._subreads))
     assert samples == {'lbc3--lbc3': 'Charles', 'lbc1--lbc1': 'Alice'}
Example No. 5
 def setup_class(cls):
     bam_files = []
     with SubreadSet(pbtestdata.get_file("barcoded-subreadset")) as ds_in:
         for er in ds_in.externalResources:
             bam_files.append(er.bam)
     with SubreadSet(*bam_files, strict=True) as ds_out:
         ds_out.write(cls.INPUT_FILE)
Example No. 6
def to_reports(subreads, output_dir):
    output_files = []
    log.info("Loading {f}".format(f=subreads))
    ds = SubreadSet(subreads)
    ds.loadStats()
    for base, module in [("filter_stats_xml", filter_stats_xml),
                         ("adapter_xml", adapter_xml),
                         ("loading_xml", loading_xml),
                         ("control", control)]:
        constants = getattr(module, "Constants")
        task_id = constants.TOOL_ID
        to_report = getattr(module, "to_report_impl")
        try:
            rpt_output_dir = os.path.join(output_dir, base)
            os.mkdir(rpt_output_dir)
            file_name = os.path.join(rpt_output_dir, "{b}.json".format(b=base))
            report = to_report(ds, rpt_output_dir)
            log.info("Writing {f}".format(f=file_name))
            report.write_json(file_name)
            output_files.append(DataStoreFile(
                uuid=report.uuid,
                source_id=task_id,
                type_id=FileTypes.REPORT.file_type_id,
                path=file_name,
                is_chunked=False,
                name=base))
        except InvalidStatsError as e:
            log.error("This dataset lacks some required statistics")
            log.error("Skipping generation of {b} report".format(b=base))
    datastore = DataStore(output_files)
    return datastore
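
As a quick usage sketch (not part of the original example; the paths are placeholders and assume the SubreadSet XML already carries, or can load, sts.xml summary statistics):

# hypothetical invocation of the helper above
ds_store = to_reports("movie.subreadset.xml", "reports_out")
ds_store.write_json("reports_out/datastore.json")  # DataStore comes from pbcommand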
Example No. 7
def _generateSubreadSet(output_bam_file):
    sset = SubreadSet(output_bam_file, generateIndices=True)

    sset_output_name = output_bam_file[:-12] + 'subreadset.xml'
    sset.name = sset_output_name.split('.')[0]
    sset.write(sset_output_name)
    return sset_output_name
Example No. 8
 def test_subreadset_from_bam(self):
     # DONE control experiment for bug 28698
     bam = upstreamData.getUnalignedBam()
     ds1 = SubreadSet(bam, strict=False)
     fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
     log.debug(fn)
     ds1.write(fn)
Example No. 9
    def test_de_novo(self):
        ofn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
        log.info(ofn)
        ss = SubreadSet(data.getXml(10))
        col = CollectionMetadata()
        self.assertFalse(ss.metadata.collections)

        ss.metadata.collections.append(col)
        self.assertTrue(ss.metadata.collections)

        col.cellIndex = 1
        self.assertEqual(ss.metadata.collections[0].cellIndex, 1)

        col.instrumentName = "foo"
        self.assertEqual(ss.metadata.collections[0].instrumentName, "foo")

        col.context = 'bar'
        self.assertEqual(ss.metadata.collections[0].context, "bar")

        ss.metadata.collections[0].runDetails.name = 'foo'
        self.assertEqual('foo', ss.metadata.collections[0].runDetails.name)

        ss.metadata.collections[0].wellSample.name = 'bar'
        self.assertEqual('bar', ss.metadata.collections[0].wellSample.name)

        ss.metadata.collections[0].wellSample.wellName = 'baz'
        self.assertEqual('baz', ss.metadata.collections[0].wellSample.wellName)

        ss.metadata.collections[0].wellSample.concentration = 'baz'
        self.assertEqual('baz',
                         ss.metadata.collections[0].wellSample.concentration)
        ss.write(ofn, validate=False)
Example No. 10
 def test_dataset_create_set_sample_names(self):
     sample_args = ("--well-sample-name WELLSAMPLE "
                    "--bio-sample-name BIOSAMPLE").split()
     outfile = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
     cmd = " ".join([
         "dataset", "create", "--force", outfile,
         pbtestdata.get_file("subreads-bam")
     ] + sample_args)
     self._run_cmd_with_output(cmd, outfile)
     with SubreadSet(outfile) as ds:
         assert len(ds.metadata.collections) == 1
         assert ds.metadata.collections[0].wellSample.name == "WELLSAMPLE"
         assert ds.metadata.collections[0].wellSample.bioSamples[
             0].name == "BIOSAMPLE"
         assert len(ds.metadata.collections[0].wellSample.bioSamples) == 1
     # now with existing samples
     outfile = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
     cmd = " ".join([
         "dataset", "create", "--force", outfile,
         pbtestdata.get_file("barcoded-subreadset")
     ] + sample_args)
     self._run_cmd_with_output(cmd, outfile)
     with SubreadSet(outfile) as ds:
         assert len(ds.metadata.collections) == 1
         assert ds.metadata.collections[0].wellSample.name == "WELLSAMPLE"
         biosamples = {
             s.name
             for s in ds.metadata.collections[0].wellSample.bioSamples
         }
         assert biosamples == {"BIOSAMPLE"}
Example No. 11
 def test_subreadset_split_metadata_element_name(self):
     fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
     log.debug(fn)
     sset = SubreadSet(data.getXml(10), data.getXml(13))
     chunks = sset.split(chunks=5, zmws=False, ignoreSubDatasets=True)
     self.assertEqual(len(chunks), 2)
     chunks[0].write(fn)
Example No. 12
def split_dataset(subreadset, out_prefix):
    """
    Takes an input dataset, and for each entry generates one separate dataset
    file, while maintaining all the filters.
    Returns a list of the generated datasets.

    To create an example filtered dataset for testing:
    dataset create --type SubreadSet test.subreadset.xml subreads1.bam subreads2.bam
    dataset filter test.subreadset.xml test.filtered.subreadset.xml 'length>1000'
    """
    out_prefix_abs = os.path.abspath(out_prefix)

    dset = SubreadSet(subreadset, strict=True, skipCounts=True)
    fns = dset.toFofn()

    log.info('resources in {!r}:\n{}'.format(subreadset, '\n'.join(fns)))

    split_fns = []
    for i, bam_fn in enumerate(fns):
        out_fn = '{}.{:05}.subreadset.xml'.format(out_prefix_abs, i)
        new_dataset = SubreadSet(bam_fn, skipCounts=True)
        new_dataset.newUuid()
        new_dataset._filters = copy.deepcopy(dset._filters)
        new_dataset.write(out_fn)
        split_fns.append(out_fn)

    return split_fns
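
A short, hedged example of driving the function above (the input XML and output prefix are placeholders):

# each returned entry is one per-BAM subreadset.xml carrying the original filters
for xml_fn in split_dataset("test.filtered.subreadset.xml", "split/part"):
    log.info("wrote %s", xml_fn)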
Example No. 13
    def test_loadMetadata(self):
        aln = AlignmentSet(data.getXml(7))
        assert not aln.metadata.collections
        aln.loadMetadata('/pbi/dept/secondary/siv/testdata/'
                         'SA3-Sequel/lambda/roche_SAT/'
                         'm54013_151205_032353.run.metadata.xml')
        assert aln.metadata.collections
        sset_fn = ('/pbi/dept/secondary/siv/testdata/'
                   'SA3-Sequel/lambda/roche_SAT/'
                   'm54013_151205_032353.subreadset.xml')
        sset = SubreadSet(sset_fn)
        orig_metadata = copy.deepcopy(sset.metadata)
        sset.metadata.collections = None
        assert not sset.metadata.collections
        sset.loadMetadata('/pbi/dept/secondary/siv/testdata/'
                          'SA3-Sequel/lambda/roche_SAT/'
                          'm54013_151205_032353.run.metadata.xml')
        fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
        sset.write(fn)
        validateFile(fn)
        validateFile(sset_fn)
        assert sset.metadata == orig_metadata

        # load the wrong thing...
        sset_fn = ('/pbi/dept/secondary/siv/testdata/'
                   'SA3-Sequel/lambda/roche_SAT/'
                   'm54013_151205_032353.subreadset.xml')
        sset = SubreadSet(sset_fn)
        orig_metadata = copy.deepcopy(sset.metadata)
        sset.metadata.collections = None
        assert not sset.metadata.collections
        with pytest.raises(InvalidDataSetIOError):
            sset.loadMetadata('/pbi/dept/secondary/siv/testdata/'
                              'SA3-Sequel/lambda/roche_SAT/'
                              'm54013_151205_032353.sts.xml')
Example No. 14
def get_sequencing_chemistry(entry_points, include_system_type=True):
    """
    Given a list of entry points (eid, path), extract the sequencing chemistry
    (and optionally system name) as a human-readable string.
    """
    chemistries = set()
    is_sequel = is_rsii = False
    for eid, path in entry_points:
        if eid == "eid_subread" and op.isfile(path):
            ds = SubreadSet(path)
            for bam in ds.resourceReaders():
                for rg in bam.readGroupTable:
                    chemistries.add(rg.SequencingChemistry)
                    if rg.SequencingChemistry.startswith("S"):
                        is_sequel = True
                    else:
                        is_rsii = True
    if len(chemistries) == 0:
        return "NA"
    chemistry_str = "; ".join(sorted(list(chemistries)))
    if include_system_type:
        fmt = "{s} ({c})"
        if is_sequel and is_rsii:
            return fmt.format(s="Mixed", c=chemistry_str)
        elif is_sequel:
            return fmt.format(s="Sequel", c=chemistry_str)
        elif is_rsii:
            return fmt.format(s="RSII", c=chemistry_str)
        else:
            raise ValueError("Can't determine system type for {c}".format(
                             c=chemistry_str))
    return chemistry_str
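
For illustration only, a call could look like this (the subreadset path is a placeholder):

# entry points are (eid, path) pairs, as consumed above
eps = [("eid_subread", "m54008_160219_003234.subreadset.xml")]
print(get_sequencing_chemistry(eps))      # e.g. "Sequel (S/P2-C2/5.0)"
print(get_sequencing_chemistry(eps, include_system_type=False))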
Example No. 15
    def test_multi_movie_split_zmws(self):
        N_RECORDS = 1745161
        test_file_1 = ("/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/"
                       "2372215/0007/Analysis_Results/m150404_101626_42"
                       "267_c100807920800000001823174110291514_s1_p0.al"
                       "l.subreadset.xml")
        test_file_2 = ("/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/"
                       "2590980/0008/Analysis_Results/m141115_075238_et"
                       "han_c100699872550000001823139203261572_s1_p0.al"
                       "l.subreadset.xml")
        ds1 = SubreadSet(test_file_1, test_file_2)
        # used to get total:
        #self.assertEqual(sum(1 for _ in ds1), N_RECORDS)
        self.assertEqual(len(ds1), N_RECORDS)
        dss = ds1.split(chunks=1, zmws=True)
        self.assertEqual(len(dss), 1)
        self.assertEqual(sum([len(ds_) for ds_ in dss]), N_RECORDS)

        dss = ds1.split(chunks=12, zmws=True)
        self.assertEqual(len(dss), 12)
        self.assertEqual(sum([len(ds_) for ds_ in dss]), N_RECORDS)
        self.assertEqual(
            dss[0].zmwRanges,
            [('m150404_101626_42267_c100807920800000001823174110291514_s1_p0',
              7, 22099)])
        self.assertEqual(
            dss[-1].zmwRanges,
            [('m141115_075238_ethan_c100699872550000001823139203261572_s1_p0',
              127819, 163468)])
Example No. 16
    def test_movie_split(self):
        N_RECORDS = 1745161
        N_RECORDS_1 = 959539
        N_RECORDS_2 = 785622
        test_file_1 = ("/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/"
                       "2372215/0007/Analysis_Results/m150404_101626_42"
                       "267_c100807920800000001823174110291514_s1_p0.al"
                       "l.subreadset.xml")
        test_file_2 = ("/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/"
                       "2590980/0008/Analysis_Results/m141115_075238_et"
                       "han_c100699872550000001823139203261572_s1_p0.al"
                       "l.subreadset.xml")
        ds1 = SubreadSet(test_file_1, test_file_2)
        # used to get total:
        #self.assertEqual(sum(1 for _ in ds1), N_RECORDS)
        self.assertEqual(len(ds1), N_RECORDS)
        dss = ds1.split_movies(1)
        self.assertEqual(len(dss), 1)
        self.assertEqual(sum([len(ds_) for ds_ in dss]), N_RECORDS)
        self.assertEqual(len(ds1), N_RECORDS)
        self.assertFalse(ds1.filters)

        dss = ds1.split_movies(12)
        self.assertEqual(len(dss), 2)
        self.assertEqual(sum([len(ds_) for ds_ in dss]), N_RECORDS)
        self.assertEqual(len(set(dss[0].index.qId)), 1)
        self.assertEqual(len(set(dss[-1].index.qId)), 1)
        self.assertEqual(
            dss[0].qid2mov[list(set(dss[0].index.qId))[0]],
            'm150404_101626_42267_c100807920800000001823174110291514_s1_p0')
        self.assertEqual(len(dss[0]), N_RECORDS_1)
        self.assertEqual(
            dss[-1].qid2mov[list(set(dss[-1].index.qId))[0]],
            'm141115_075238_ethan_c100699872550000001823139203261572_s1_p0')
        self.assertEqual(len(dss[-1]), N_RECORDS_2)
Example No. 17
 def test_subreadset_from_bam(self):
     # DONE control experiment for bug 28698
     bam = upstreamData.getUnalignedBam()
     ds1 = SubreadSet(bam, strict=False)
     fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
     log.debug(fn)
     ds1.write(fn)
Example No. 18
    def test_loadMetadata(self):
        aln = AlignmentSet(data.getXml(no=8))
        self.assertFalse(aln.metadata.collections)
        aln.loadMetadata('/pbi/dept/secondary/siv/testdata/'
                         'SA3-Sequel/lambda/roche_SAT/'
                         'm54013_151205_032353.run.metadata.xml')
        self.assertTrue(aln.metadata.collections)
        sset_fn = ('/pbi/dept/secondary/siv/testdata/'
                   'SA3-Sequel/lambda/roche_SAT/'
                   'm54013_151205_032353.subreadset.xml')
        sset = SubreadSet(sset_fn)
        orig_metadata = copy.deepcopy(sset.metadata)
        sset.metadata.collections = None
        self.assertFalse(sset.metadata.collections)
        sset.loadMetadata('/pbi/dept/secondary/siv/testdata/'
                          'SA3-Sequel/lambda/roche_SAT/'
                          'm54013_151205_032353.run.metadata.xml')
        fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
        sset.write(fn)
        validateFile(fn)
        validateFile(sset_fn)
        self.assertEqual(sset.metadata, orig_metadata)

        # load the wrong thing...
        sset_fn = ('/pbi/dept/secondary/siv/testdata/'
                   'SA3-Sequel/lambda/roche_SAT/'
                   'm54013_151205_032353.subreadset.xml')
        sset = SubreadSet(sset_fn)
        orig_metadata = copy.deepcopy(sset.metadata)
        sset.metadata.collections = None
        self.assertFalse(sset.metadata.collections)
        with self.assertRaises(InvalidDataSetIOError):
            sset.loadMetadata('/pbi/dept/secondary/siv/testdata/'
                              'SA3-Sequel/lambda/roche_SAT/'
                              'm54013_151205_032353.sts.xml')
Example No. 19
    def test_multi_movie_split_zmws_with_existing_movie_filter(self):
        # TODO: test with three movies and two chunks
        N_RECORDS = 959539
        test_file_1 = ("/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/"
                       "2372215/0007/Analysis_Results/m150404_101626_42"
                       "267_c100807920800000001823174110291514_s1_p0.al"
                       "l.subreadset.xml")
        test_file_2 = ("/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/"
                       "2590980/0008/Analysis_Results/m141115_075238_et"
                       "han_c100699872550000001823139203261572_s1_p0.al"
                       "l.subreadset.xml")
        ds1 = SubreadSet(test_file_1, test_file_2)
        dss = ds1.split_movies(2)
        self.assertEqual(len(dss), 2)
        ds1 = dss[0]
        # used to get total:
        #self.assertEqual(sum(1 for _ in ds1), N_RECORDS)
        self.assertEqual(len(ds1), N_RECORDS)
        dss = ds1.split(chunks=1, zmws=True)
        self.assertEqual(len(dss), 1)
        self.assertEqual(sum([len(ds_) for ds_ in dss]), N_RECORDS)

        dss = ds1.split(chunks=12, zmws=True)
        self.assertEqual(len(dss), 12)
        self.assertEqual(sum([len(ds_) for ds_ in dss]), N_RECORDS)
        for ds in dss:
            self.assertEqual(
                ds.zmwRanges[0][0],
                'm150404_101626_42267_c100807920800000001823174110291514_s1_p0'
            )
Example No. 20
    def test_isBarcoded(self):
        empty = upstreamdata.getEmptyBam()
        nonempty = ('/pbi/dept/secondary/siv/testdata/'
                    'pblaa-unittest/Sequel/Phi29/m54008_160219_003234'
                    '.tiny.subreadset.xml')

        # One empty one non empty
        sset = SubreadSet(nonempty, empty, skipMissing=True)
        self.assertTrue(sset.isBarcoded)

        # Just nonempty
        sset = SubreadSet(nonempty, skipMissing=True)
        self.assertEqual(len(sset), 15133)
        self.assertTrue(sset.isBarcoded)

        # Just empty
        #   This is crazy, the pbi must be out of date:
        sset = SubreadSet(empty)
        self.assertEqual(len(sset), 0)
        self.assertTrue(sset.isBarcoded)
        #   To confirm current behavior, I will regenerate the pbi with a
        #   current pbindex:
        efn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
        log.info("Copying to {}".format(efn))
        sset.copyTo(efn)
        sset.induceIndices(force=True)
        self.assertFalse(sset.isBarcoded)
Example No. 21
    def test_multi_movie_split_zmws_with_existing_movie_filter(self):
        # TODO: test with three movies and two chunks
        N_RECORDS = 959539
        test_file_1 = ("/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/"
                       "2372215/0007/Analysis_Results/m150404_101626_42"
                       "267_c100807920800000001823174110291514_s1_p0.al"
                       "l.subreadset.xml")
        test_file_2 = ("/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/"
                       "2590980/0008/Analysis_Results/m141115_075238_et"
                       "han_c100699872550000001823139203261572_s1_p0.al"
                       "l.subreadset.xml")
        ds1 = SubreadSet(test_file_1, test_file_2)
        dss = ds1.split_movies(2)
        self.assertEqual(len(dss), 2)
        ds1 = dss[0]
        # used to get total:
        #self.assertEqual(sum(1 for _ in ds1), N_RECORDS)
        self.assertEqual(len(ds1), N_RECORDS)
        dss = ds1.split(chunks=1, zmws=True)
        self.assertEqual(len(dss), 1)
        self.assertEqual(sum([len(ds_) for ds_ in dss]),
                         N_RECORDS)

        dss = ds1.split(chunks=12, zmws=True)
        self.assertEqual(len(dss), 12)
        self.assertEqual(sum([len(ds_) for ds_ in dss]),
                         N_RECORDS)
        for ds in dss:
            self.assertEqual(
                ds.zmwRanges[0][0],
                'm150404_101626_42267_c100807920800000001823174110291514_s1_p0')
Example No. 22
    def test_multi_movie_split_zmws(self):
        N_RECORDS = 1745161
        test_file_1 = ("/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/"
                       "2372215/0007/Analysis_Results/m150404_101626_42"
                       "267_c100807920800000001823174110291514_s1_p0.al"
                       "l.subreadset.xml")
        test_file_2 = ("/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/"
                       "2590980/0008/Analysis_Results/m141115_075238_et"
                       "han_c100699872550000001823139203261572_s1_p0.al"
                       "l.subreadset.xml")
        ds1 = SubreadSet(test_file_1, test_file_2)
        # used to get total:
        #self.assertEqual(sum(1 for _ in ds1), N_RECORDS)
        self.assertEqual(len(ds1), N_RECORDS)
        dss = ds1.split(chunks=1, zmws=True)
        self.assertEqual(len(dss), 1)
        self.assertEqual(sum([len(ds_) for ds_ in dss]),
                         N_RECORDS)

        dss = ds1.split(chunks=12, zmws=True)
        self.assertEqual(len(dss), 12)
        self.assertEqual(sum([len(ds_) for ds_ in dss]),
                         N_RECORDS)
        self.assertEqual(
            dss[0].zmwRanges,
            [('m150404_101626_42267_c100807920800000001823174110291514_s1_p0',
              7, 22099)])
        self.assertEqual(
            dss[-1].zmwRanges,
            [('m141115_075238_ethan_c100699872550000001823139203261572_s1_p0',
              127819, 163468)])
Example No. 23
def run_bam_to_bam(subread_set_file, barcode_set_file, output_file_name,
                   nproc=1, score_mode="symmetric"):
    if score_mode not in ["asymmetric", "symmetric"]:
        raise ValueError("Unrecognized score mode '{m}'".format(m=score_mode))
    bc = BarcodeSet(barcode_set_file)
    if len(bc.resourceReaders()) > 1:
        raise NotImplementedError("Multi-FASTA BarcodeSet input is not supported.")
    new_prefix = re.sub(".subreadset.xml$", "", output_file_name)
    args = [
        "bam2bam",
        "-j", str(nproc),
        "-b", str(nproc),
        "-o", new_prefix,
        "--barcodes", barcode_set_file,
        "--scoreMode", score_mode,
        subread_set_file
    ]
    log.info(" ".join(args))
    result = run_cmd(" ".join(args),
                     stdout_fh=sys.stdout,
                     stderr_fh=sys.stderr)
    if result.exit_code != 0:
        return result.exit_code
    assert op.isfile(output_file_name)
    tmp_out = op.join(op.dirname(output_file_name),
                      "tmp_" + op.basename(output_file_name))
    shutil.move(output_file_name, tmp_out)
    with SubreadSet(tmp_out, strict=True) as ds:
        with SubreadSet(subread_set_file) as ds_in:
            ds.metadata = ds_in.metadata
            ds.name = ds_in.name + " (barcoded)"
        ds.updateCounts()
        ds.newUuid()
        ds.write(output_file_name)
    return 0
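
A hedged sketch of calling this wrapper (file names are placeholders; bam2bam must be on the PATH), which returns the bam2bam exit code:

# hypothetical inputs; the output name must end in .subreadset.xml
rc = run_bam_to_bam("movie.subreadset.xml", "barcodes.barcodeset.xml",
                    "movie.barcoded.subreadset.xml", nproc=8)
assert rc == 0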
Example No. 24
def run_bax_to_bam(input_file_name, output_file_name):
    with HdfSubreadSet(input_file_name) as ds_in:
        movies = set()
        for rr in ds_in.resourceReaders():
            movies.add(rr.movieName)
        if len(movies) > 1:
            out_dir = os.path.dirname(output_file_name)
            ds_out_files = []
            for bax_file in ds_in.toExternalFiles():
                output_file_name_tmp = os.path.join(out_dir, ".".join(
                    os.path.basename(bax_file).split(".")[:-2]) +
                    ".hdfsubreadset.xml")
                rc = _run_bax_to_bam(bax_file, output_file_name_tmp)
                if rc != 0:
                    log.error("bax2bam failed")
                    return rc
                ds_out_files.append(output_file_name_tmp)
            ds = SubreadSet(*ds_out_files)
            ds.name = ds_in.name
            if 'Description' in ds_in.objMetadata:
                ds.objMetadata['Description'] = ds_in.objMetadata['Description']
                ds.metadata.merge(ds_in.metadata)
            ds.write(output_file_name)
        else:
            return _run_bax_to_bam(input_file_name, output_file_name)
    return 0
Example No. 25
 def setUpClass(cls):
     super(TestToolContract, cls).setUpClass()
     ds = SubreadSet(BAM_FILE, strict=True)
     ds.write(cls.INPUT_FILES[0])
     with FastaWriter(cls.INPUT_FILES[1]) as fa_out:
         for i in range(1010):
             fa_out.writeRecord("%04d_Forward" % i, "A" * 16)
Example No. 26
    def test_len(self):
        # AlignmentSet
        aln = AlignmentSet(data.getXml(8), strict=True)
        self.assertEqual(len(aln), 92)
        self.assertEqual(aln._length, (92, 123588))
        self.assertEqual(aln.totalLength, 123588)
        self.assertEqual(aln.numRecords, 92)
        aln.totalLength = -1
        aln.numRecords = -1
        self.assertEqual(aln.totalLength, -1)
        self.assertEqual(aln.numRecords, -1)
        aln.updateCounts()
        self.assertEqual(aln.totalLength, 123588)
        self.assertEqual(aln.numRecords, 92)
        self.assertEqual(sum(1 for _ in aln), 92)
        self.assertEqual(sum(len(rec) for rec in aln), 123588)

        # AlignmentSet with filters
        aln = AlignmentSet(data.getXml(15), strict=True)
        self.assertEqual(len(aln), 40)
        self.assertEqual(aln._length, (40, 52023))
        self.assertEqual(aln.totalLength, 52023)
        self.assertEqual(aln.numRecords, 40)
        aln.totalLength = -1
        aln.numRecords = -1
        self.assertEqual(aln.totalLength, -1)
        self.assertEqual(aln.numRecords, -1)
        aln.updateCounts()
        self.assertEqual(aln.totalLength, 52023)
        self.assertEqual(aln.numRecords, 40)

        # SubreadSet
        sset = SubreadSet(data.getXml(10), strict=True)
        self.assertEqual(len(sset), 92)
        self.assertEqual(sset._length, (92, 124093))
        self.assertEqual(sset.totalLength, 124093)
        self.assertEqual(sset.numRecords, 92)
        sset.totalLength = -1
        sset.numRecords = -1
        self.assertEqual(sset.totalLength, -1)
        self.assertEqual(sset.numRecords, -1)
        sset.updateCounts()
        self.assertEqual(sset.totalLength, 124093)
        self.assertEqual(sset.numRecords, 92)
        self.assertEqual(sum(1 for _ in sset), 92)
        self.assertEqual(sum(len(rec) for rec in sset), 124093)

        # ReferenceSet
        sset = ReferenceSet(data.getXml(9), strict=True)
        self.assertEqual(len(sset), 59)
        self.assertEqual(sset.totalLength, 85774)
        self.assertEqual(sset.numRecords, 59)
        sset.totalLength = -1
        sset.numRecords = -1
        self.assertEqual(sset.totalLength, -1)
        self.assertEqual(sset.numRecords, -1)
        sset.updateCounts()
        self.assertEqual(sset.totalLength, 85774)
        self.assertEqual(sset.numRecords, 59)
Example No. 27
 def test_output_subreadset_name(self):
     """
     Verify that the output SubreadSet name is identical to the input name
     plus ' (barcoded)'.
     """
     with SubreadSet(self.entrypoints.data['eid_subread']) as ds_in:
         with SubreadSet(self._get_subreadset_out()) as ds_out:
             self.assertEqual(ds_out.name, ds_in.name + " (barcoded)")
Example No. 28
 def test_subreadset_split_metadata_element_name(self):
     fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
     log.debug(fn)
     sset = SubreadSet(data.getXml(10),
                       data.getXml(13))
     chunks = sset.split(chunks=5, zmws=False, ignoreSubDatasets=True)
     self.assertEqual(len(chunks), 2)
     chunks[0].write(fn)
Example No. 29
 def test_bam2fastx_filtered(self):
     input_file = pbtestdata.get_file("subreads-xml")
     ds = SubreadSet(input_file, strict=True)
     ds.filters.addRequirement(length=[('>=', 1000)])
     input_tmp = get_temp_file(suffix=".subreadset.xml")
     ds.write(input_tmp)
     nrecords_expected = 13
     self.run_and_check_fastx(input_tmp, nrecords_expected)
Example No. 30
def _make_dataset(file_name=None, barcodes=None):
    if file_name is None:
        file_name = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    ds = SubreadSet(BAM_FILE, strict=True)
    if barcodes is not None:
        for er in ds.externalResources:
            er.barcodes = barcodes
    ds.write(file_name)
    return file_name
Example No. 31
def to_report(stats_xml, output_dir, dpi=72):
    """Main point of entry

    :type stats_xml: str
    :type output_dir: str
    :type dpi: int

    :rtype: Report
    """
    log.info("Analyzing XML {f}".format(f=stats_xml))
    # stats_xml should be a dataset:
    dset = SubreadSet(stats_xml)

    dataset_uuids = [dset.uuid]
    # but if it isn't, no problem:
    if not dset.metadata.summaryStats:
        dset.loadStats(stats_xml)
        # an sts file was provided which will generate a new random uuid
        dataset_uuids = []
    if not dset.metadata.summaryStats.readLenDists:
        raise IOError("Pipeline Summary Stats (sts.xml) not found or missing "
                      "key distributions")

    # we want all of the length distributions in this report to look the same,
    # so we make the shaper here and pass it around:
    alldists = (dset.metadata.summaryStats.readLenDists[:] +
                dset.metadata.summaryStats.insertReadLenDists[:])
    len_dist_shaper = continuous_dist_shaper(alldists, trim_excess=True)

    attr = to_read_stats_attributes(
        readLenDists=dset.metadata.summaryStats.readLenDists,
        readQualDists=dset.metadata.summaryStats.readQualDists)
    attr.extend(
        to_insert_stats_attributes(
            readLenDists=dset.metadata.summaryStats.insertReadLenDists,
            readQualDists=dset.metadata.summaryStats.insertReadQualDists))

    plot_groups = to_read_stats_plots(
        readLenDists=dset.metadata.summaryStats.readLenDists,
        readQualDists=dset.metadata.summaryStats.readQualDists,
        output_dir=output_dir,
        lenDistShaper=len_dist_shaper)
    plot_groups.extend(
        to_insert_stats_plots(
            readLenDists=dset.metadata.summaryStats.insertReadLenDists,
            readQualDists=dset.metadata.summaryStats.insertReadQualDists,
            output_dir=output_dir,
            lenDistShaper=len_dist_shaper))

    # build the report:
    report = Report(meta_rpt.id,
                    title=meta_rpt.title,
                    attributes=attr,
                    plotgroups=plot_groups,
                    dataset_uuids=dataset_uuids)

    return meta_rpt.apply_view(report)
Example No. 32
 def test_provenance_record_ordering(self):
     import pbtestdata
     ds = SubreadSet(pbtestdata.get_file("subreads-sequel"), strict=True)
     ds.metadata.addParentDataSet(uuid.uuid4(), ds.datasetType, createdBy="AnalysisJob", timeStampedName="")
     tmp_out = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
     ds.write(tmp_out)
     ds = SubreadSet(tmp_out, strict=True)
     tags = [r['tag'] for r in ds.metadata.record['children']]
     self.assertEqual(tags, ['TotalLength', 'NumRecords', 'Provenance', 'Collections', 'SummaryStats'])
Example No. 33
 def setUpClass(cls):
     tmp_bam = tempfile.NamedTemporaryFile(suffix=".subreads.bam").name
     shutil.copyfile(pbcore.data.getUnalignedBam(), tmp_bam)
     shutil.copyfile(pbcore.data.getUnalignedBam()+".pbi", tmp_bam+".pbi")
     ds = SubreadSet(tmp_bam, pbcore.data.getUnalignedBam(), strict=True)
     ds.write(cls.INPUT_FILES[0])
     _write_fasta_or_contigset(cls.INPUT_FILES[1], make_faidx=True,
                               ds_class=BarcodeSet)
     super(TestScatterSubreadBAMs, cls).setUpClass()
Example No. 34
 def test_subreadset_split_metadata_element_name(self):
     fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
     log.debug(fn)
     sset = SubreadSet("/pbi/dept/secondary/siv/testdata/"
                       "SA3-Sequel/phi29/315/3150101/"
                       "r54008_20160219_002905/1_A01/"
                       "m54008_160219_003234.subreadset.xml")
     chunks = sset.split(chunks=5, zmws=False, ignoreSubDatasets=True)
     chunks[0].write(fn)
Example No. 35
 def _set_up_basic(self):
     input_file = get_temp_file(suffix=".subreadset.xml")
     ds = SubreadSet(data.getXml(9), strict=True)
     ds.metadata.addParentDataSet(uuid.uuid4(),
                                  ds.datasetType,
                                  createdBy="AnalysisJob",
                                  timeStampedName="")
     ds.write(input_file)
     return input_file, len(ds)
Example No. 36
    def test_file_arg(self):
        fn = tempfile.NamedTemporaryFile(suffix="filterVals.txt").name
        log.debug(fn)
        sset = SubreadSet(data.getXml(9))
        assert len(sset) == 92
        size = 10
        qn = [r.qName for r in sset[:size]]
        with open(fn, 'w') as ofh:
            for q in qn:
                ofh.write(q)
                ofh.write('\n')
        good_qn = [('=', fn)]
        sset.filters.addRequirement(qname=good_qn)
        assert size == sum(1 for _ in sset)
        assert size == len(sset)
        og = set(qn)
        for r in sset:
            og.discard(r.qName)
        assert len(og) == 0

        fn = tempfile.NamedTemporaryFile(suffix="filterVals.txt").name
        log.debug(fn)
        sset = SubreadSet(data.getXml(9))
        assert len(sset) == 92
        size = 10
        qn = [r.qName for r in sset[:size]]
        with open(fn, 'w') as ofh:
            for q in qn:
                ofh.write(q)
                ofh.write('\n')
        good_qn = [('=', fn)]
        sset.filters.addRequirement(qname_file=good_qn)
        assert size == sum(1 for _ in sset)
        assert size == len(sset)
        og = set(qn)
        for r in sset:
            og.discard(r.qName)
        assert len(og) == 0

        fn = tempfile.NamedTemporaryFile(suffix="filterVals.txt").name
        log.debug(fn)
        sset = SubreadSet(data.getXml(9))
        assert len(sset) == 92
        size = 4
        hn = [r for r in sorted(list(set(sset.index.holeNumber)))[:size]]
        with open(fn, 'w') as ofh:
            for h in hn:
                ofh.write(str(h))
                ofh.write('\n')
        good_hn = [('=', fn)]
        sset.filters.addRequirement(zm=good_hn)
        assert size == len(set(sset.index.holeNumber))
        og = set(hn)
        for r in sset:
            og.discard(r.holeNumber)
        assert len(og) == 0
Example No. 37
def to_report(stats_xml, output_dir, dpi=72):
    """Main point of entry

    :type stats_xml: str
    :type output_dir: str
    :type dpi: int

    :rtype: Report
    """
    log.info("Analyzing XML {f}".format(f=stats_xml))
    # stats_xml should be a dataset:
    dset = SubreadSet(stats_xml)

    dataset_uuids = [dset.uuid]
    # but if it isn't, no problem:
    if not dset.metadata.summaryStats:
        dset.loadStats(stats_xml)
        # an sts file was provided which will generate a new random uuid
        dataset_uuids = []
    if not dset.metadata.summaryStats.readLenDists:
        raise IOError("Pipeline Summary Stats (sts.xml) not found or missing "
                      "key distributions")

    # we want all of the length distributions in this report to look the same,
    # so we make the shaper here and pass it around:
    alldists = (dset.metadata.summaryStats.readLenDists[:] +
                dset.metadata.summaryStats.insertReadLenDists[:])
    len_dist_shaper = continuous_dist_shaper(alldists, trim_excess=True)

    attr = to_read_stats_attributes(
        readLenDists=dset.metadata.summaryStats.readLenDists,
        readQualDists=dset.metadata.summaryStats.readQualDists)
    attr.extend(to_insert_stats_attributes(
        readLenDists=dset.metadata.summaryStats.insertReadLenDists,
        readQualDists=dset.metadata.summaryStats.insertReadQualDists))

    plot_groups = to_read_stats_plots(
        readLenDists=dset.metadata.summaryStats.readLenDists,
        readQualDists=dset.metadata.summaryStats.readQualDists,
        output_dir=output_dir,
        lenDistShaper=len_dist_shaper)
    plot_groups.extend(to_insert_stats_plots(
        readLenDists=dset.metadata.summaryStats.insertReadLenDists,
        readQualDists=dset.metadata.summaryStats.insertReadQualDists,
        output_dir=output_dir,
        lenDistShaper=len_dist_shaper))

    # build the report:
    report = Report(meta_rpt.id,
                    title=meta_rpt.title,
                    attributes=attr,
                    plotgroups=plot_groups,
                    dataset_uuids=dataset_uuids)

    return meta_rpt.apply_view(report)
Example No. 38
 def setUp(self):
     BAM_IN = pbcore.data.getUnalignedBam()
     ds = SubreadSet(BAM_IN, strict=True)
     chunks = ds.split(zmws=True, chunks=2, targetSize=2)
     assert len(chunks) == 2
     self.zmw_range = chunks[CHUNK_INDEX].zmwRanges[0][1:3]
     logging.info("zmwRanges[CHUNK_INDEX] = {r}".format(
         r=str(chunks[CHUNK_INDEX].zmwRanges)))
     logging.info("SubreadSet = {f}".format(f=self.INPUT_FILES[0]))
     chunks[CHUNK_INDEX].write(self.INPUT_FILES[0])
Example No. 39
 def test_get_dataset_uuid(self):
     ds = SubreadSet(upstreamdata.getUnalignedBam(), strict=True)
     ds_file = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
     ds.write(ds_file)
     uuid = getDataSetUuid(ds_file)
     assert uuid == ds.uuid
     with open(ds_file, "w") as out:
         out.write("hello world!")
     uuid = getDataSetUuid(ds_file)
     assert uuid is None
Example No. 40
 def test_get_dataset_uuid(self):
     ds = SubreadSet(upstreamdata.getUnalignedBam(), strict=True)
     ds_file = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
     ds.write(ds_file)
     uuid = getDataSetUuid(ds_file)
     self.assertEqual(uuid, ds.uuid)
     with open(ds_file, "w") as out:
         out.write("hello world!")
     uuid = getDataSetUuid(ds_file)
     self.assertEqual(uuid, None)
Example No. 41
 def setUp(self):
     BAM_IN = pbcore.data.getUnalignedBam()
     ds = SubreadSet(BAM_IN, strict=True)
     chunks = ds.split(zmws=True, chunks=2, targetSize=2)
     assert len(chunks) == 2
     self.zmw_range = chunks[CHUNK_INDEX].zmwRanges[0][1:3]
     logging.info("zmwRanges[CHUNK_INDEX] = {r}".format(
         r=str(chunks[CHUNK_INDEX].zmwRanges)))
     logging.info("SubreadSet = {f}".format(f=self.INPUT_FILES[0]))
     chunks[CHUNK_INDEX].write(self.INPUT_FILES[0])
Example No. 42
def main(parser):
    args = parser.parse_args()

    filt = Filters()
    dset = SubreadSet(args.inXml)
    names = nameGen(args.inFile, fileType='list' if args.list else 'fasta')
    if args.subreads:
        if args.inverted:
            for name in names:
                filt.addRequirement(QNAME=[('!=', name)])
        else:
            filt.addRequirement(QNAME=[('=', name) for name in names])
    else:
        assert len(dset.movieIds) == 1, (
            'This method only works for single-movie subreadsets. '
            'Use the --subreads option for multi-movie subreadsets.')
        uniqHn = set(map(getZmw, names))
        if args.inverted:
            for hn in uniqHn:
                filt.addRequirement(zm=[('!=', hn)])
        else:
            filt.addRequirement(zm=[('=', hn) for hn in uniqHn])
    dset.addFilters(filt)
    if args.newUuid:
        dset.newUuid()
    if args.name:
        dset.name = args.name
    dset.write(args.outXml)
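
The example assumes an argparse parser that supplies the attributes read above (inXml, inFile, list, subreads, inverted, newUuid, name, outXml). A minimal sketch of such a parser, with guessed help text, might look like this, so the script entry point would be main(_get_parser()):

import argparse

def _get_parser():
    # sketch only: argument names mirror the attributes used in main()
    p = argparse.ArgumentParser(
        description="Filter a SubreadSet by read names or ZMW hole numbers")
    p.add_argument("inXml", help="input .subreadset.xml")
    p.add_argument("inFile", help="FASTA or list file of read names")
    p.add_argument("outXml", help="output .subreadset.xml")
    p.add_argument("--list", action="store_true",
                   help="treat inFile as a plain list of names")
    p.add_argument("--subreads", action="store_true",
                   help="filter by full subread QNAMEs instead of ZMWs")
    p.add_argument("--inverted", action="store_true",
                   help="exclude the named reads instead of keeping them")
    p.add_argument("--newUuid", action="store_true",
                   help="assign a new UUID to the output dataset")
    p.add_argument("--name", help="optional name for the output dataset")
    return p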
Example No. 43
 def test_split_zmws_around_read_groups(self):
     ds1 = pbtestdata.get_file("subreads-xml")
     ds2 = pbtestdata.get_file("subreads-sequel")
     ds = SubreadSet(ds1, ds2)
     assert len(ds) == 137
     # this is still the default behavior
     chunks = list(ds.split(chunks=2, zmws=True, breakReadGroups=True))
     assert len(chunks[0]) == 72
     assert len(chunks[1]) == 65
     # don't break up movies
     chunks = list(ds.split(chunks=2, zmws=True, breakReadGroups=False))
     assert len(chunks[0]) == 20
     assert len(chunks[1]) == 117
     assert np.all(chunks[0].index.qId == -2081539485)
     assert np.all(chunks[1].index.qId == -1197849594)
     chunks = list(
         ds.split(chunks=4, targetSize=1, zmws=True, breakReadGroups=False))
     assert [len(c) for c in chunks] == [8, 12, 54, 63]
     assert np.all(chunks[0].index.qId == -2081539485)
     assert np.all(chunks[1].index.qId == -2081539485)
     assert np.all(chunks[2].index.qId == -1197849594)
     assert np.all(chunks[3].index.qId == -1197849594)
     # control: single-movie dataset
     ds = SubreadSet(ds1)
     chunks1 = list(ds.split(chunks=4, zmws=True, breakReadGroups=False))
     chunks2 = list(ds.split(chunks=4, zmws=True, breakReadGroups=True))
     assert [len(x) for x in chunks1] == [len(y) for y in chunks2]
Example No. 44
    def test_de_novo(self):
        ofn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
        log.info(ofn)
        ss = SubreadSet(data.getXml(10))
        col = CollectionMetadata()
        self.assertFalse(ss.metadata.collections)

        ss.metadata.collections.append(col)
        self.assertTrue(ss.metadata.collections)

        col.cellIndex = 1
        self.assertEqual(ss.metadata.collections[0].cellIndex, 1)

        col.instrumentName = "foo"
        self.assertEqual(ss.metadata.collections[0].instrumentName, "foo")

        col.context = 'bar'
        self.assertEqual(ss.metadata.collections[0].context, "bar")

        ss.metadata.collections[0].runDetails.name = 'foo'
        self.assertEqual('foo', ss.metadata.collections[0].runDetails.name)

        ss.metadata.collections[0].wellSample.name = 'bar'
        self.assertEqual('bar', ss.metadata.collections[0].wellSample.name)

        ss.metadata.collections[0].wellSample.wellName = 'baz'
        self.assertEqual('baz', ss.metadata.collections[0].wellSample.wellName)

        ss.metadata.collections[0].wellSample.concentration = 'baz'
        self.assertEqual('baz',
                         ss.metadata.collections[0].wellSample.concentration)

        # There are no existing biosamples:
        self.assertFalse(
            'BioSamples' in ss.metadata.collections[0].wellSample.tags)
        # Therefore the metadata is falsy
        self.assertFalse(ss.metadata.collections[0].wellSample.bioSamples)

        ss.metadata.collections[0].wellSample.bioSamples.addSample('Clown')
        self.assertEqual(
            'Clown', ss.metadata.collections[0].wellSample.bioSamples[0].name)

        ss.metadata.collections[0].wellSample.bioSamples[
            0].DNABarcodes.addBarcode('Dentist')
        self.assertEqual(
            'Dentist', ss.metadata.collections[0].wellSample.bioSamples[0].
            DNABarcodes[0].name)

        # check that we are adding one additional biosamples element:
        self.assertEqual(
            Counter(ss.metadata.collections[0].wellSample.tags)['BioSamples'],
            1)
        # Therefore the metadata is truthy
        self.assertTrue(ss.metadata.collections[0].wellSample.bioSamples)
        ss.write(ofn, validate=False)
Example No. 45
 def setUpClass(cls):
     tmp_bam = tempfile.NamedTemporaryFile(suffix=".subreads.bam").name
     shutil.copyfile(pbcore.data.getUnalignedBam(), tmp_bam)
     shutil.copyfile(pbcore.data.getUnalignedBam() + ".pbi",
                     tmp_bam + ".pbi")
     ds = SubreadSet(tmp_bam, pbcore.data.getUnalignedBam(), strict=True)
     ds.write(cls.INPUT_FILES[0])
     _write_fasta_or_contigset(cls.INPUT_FILES[1],
                               make_faidx=True,
                               ds_class=BarcodeSet)
     super(TestScatterSubreadBAMs, cls).setUpClass()
Example No. 46
 def test_subreads_parent_dataset(self):
     ds1 = SubreadSet(data.getXml(no=5), skipMissing=True)
     assert ds1.metadata.provenance.parentDataSet.uniqueId == "f81cf391-b3da-41f8-84cb-a0de71f460f4"
     ds2 = SubreadSet(ds1.externalResources[0].bam, skipMissing=True)
     assert ds2.metadata.provenance.parentDataSet.uniqueId is None
     ds2.metadata.addParentDataSet("f81cf391-b3da-41f8-84cb-a0de71f460f4",
                                   "PacBio.DataSet.SubreadSet",
                                   "timestamped_name")
     assert ds2.metadata.provenance.parentDataSet.uniqueId == "f81cf391-b3da-41f8-84cb-a0de71f460f4"
     ds_out = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
     ds2.write(ds_out, validate=False)
Example No. 47
def get_data_stats(entry_points):
    """
    Get basic metrics for input dataset (assumed to be a SubreadSet).
    """
    for eid, path in entry_points:
        if eid == "eid_subread" and op.isfile(path):
            ds = SubreadSet(path)
            n_zmws = 0
            for bam in ds.resourceReaders():
                n_zmws += len(set(bam.pbi.holeNumber))
            return data_stats(n_zmws, ds.numRecords, ds.totalLength)
    return data_stats("NA", "NA", "NA")
Example No. 48
    def test_reports_with_fixed_bins(self):
        # TODO readQualDists are currently unpopulated, turn back on when
        # they're repopulated
        # for dist_name, nbins in zip(['medianInsertDists', 'readLenDists',
        #                             'readQualDists'], [200, 200, 50]):
        for dist_name, nbins in zip(["medianInsertDists", "readLenDists"], [200, 200]):
            ss = SubreadSet()
            ss.loadStats(get_fixed_bin_sts())

            ss2 = SubreadSet()
            ss2.loadStats(get_fixed_bin_sts())

            # shift ss2
            mdist = getattr(ss2.metadata.summaryStats, dist_name)[0].bins
            mdist = [0, 0, 0] + mdist[:-3]
            getattr(ss2.metadata.summaryStats, dist_name)[0].bins = mdist

            ss3 = ss + ss2

            ss4 = SubreadSet()
            ss4.loadStats(get_fixed_bin_sts())

            # shift ss4
            mdist = getattr(ss4.metadata.summaryStats, dist_name)[0].bins
            mdist = [0 for _ in mdist]
            getattr(ss4.metadata.summaryStats, dist_name)[0].bins = mdist

            dists = getattr(ss4.metadata.summaryStats, dist_name)
            self.assertEqual(len(dists), 1)
            for n in [0, 1, 2, 10, 40, 41, 49, 50, 51, 200, 500]:
                ds = continuous_dist_shaper(dists, nbins=n)
                fixed_dists = [ds(dist) for dist in dists]
                self.assertEqual(len(dists[0].bins), nbins)
                self.assertEqual(len(fixed_dists[0].bins), nbins)
                self.assertEqual(sum(dists[0].bins), sum(fixed_dists[0].bins))

            sss = [ss, ss2, ss3]

            for sset in sss:
                dists = getattr(sset.metadata.summaryStats, dist_name)
                self.assertEqual(len(dists), 1)
                # 0, requested nbins > numBins fails back to no-op
                ops = [1, 2, 3, 4, 7, 10, 40, 41, 49, 50, 51, 200, 500]
                no_ops = [0]
                for n in no_ops:
                    ds = continuous_dist_shaper(dists, nbins=n)
                    fixed_dists = [ds(dist) for dist in dists]
                    self.assertEqual(len(dists[0].bins), nbins)
                    self.assertEqual(len(fixed_dists[0].bins), nbins)
                    self.assertEqual(sum(dists[0].bins), sum(fixed_dists[0].bins))

                for n in ops:
                    ds = continuous_dist_shaper(dists, nbins=n)
                    fixed_dists = [ds(dist) for dist in dists]
                    self.assertEqual(len(dists[0].bins), nbins)
                    self.assertEqual(len(fixed_dists[0].bins), n)
                    self.assertEqual(sum(dists[0].bins), sum(fixed_dists[0].bins))
Example No. 49
 def test_subreads_parent_dataset(self):
     ds1 = SubreadSet(data.getXml(no=5), skipMissing=True)
     self.assertEqual(ds1.metadata.provenance.parentDataSet.uniqueId,
                      "f81cf391-b3da-41f8-84cb-a0de71f460f4")
     ds2 = SubreadSet(ds1.externalResources[0].bam, skipMissing=True)
     self.assertEqual(ds2.metadata.provenance.parentDataSet.uniqueId, None)
     ds2.metadata.addParentDataSet("f81cf391-b3da-41f8-84cb-a0de71f460f4",
                                   "PacBio.DataSet.SubreadSet",
                                   "timestamped_name")
     self.assertEqual(ds2.metadata.provenance.parentDataSet.uniqueId,
                      "f81cf391-b3da-41f8-84cb-a0de71f460f4")
     ds_out = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
     ds2.write(ds_out, validate=False)
Example No. 50
    def test_de_novo(self):
        ofn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
        log.info(ofn)
        ss = SubreadSet(data.getXml(10))
        col = CollectionMetadata()
        self.assertFalse(ss.metadata.collections)

        ss.metadata.collections.append(col)
        self.assertTrue(ss.metadata.collections)

        col.cellIndex = 1
        self.assertEqual(ss.metadata.collections[0].cellIndex, 1)

        col.instrumentName = "foo"
        self.assertEqual(ss.metadata.collections[0].instrumentName, "foo")

        col.context = 'bar'
        self.assertEqual(ss.metadata.collections[0].context, "bar")

        ss.metadata.collections[0].runDetails.name = 'foo'
        self.assertEqual('foo', ss.metadata.collections[0].runDetails.name)

        ss.metadata.collections[0].wellSample.name = 'bar'
        self.assertEqual('bar', ss.metadata.collections[0].wellSample.name)

        ss.metadata.collections[0].wellSample.wellName = 'baz'
        self.assertEqual('baz', ss.metadata.collections[0].wellSample.wellName)

        ss.metadata.collections[0].wellSample.concentration = 'baz'
        self.assertEqual('baz',
                         ss.metadata.collections[0].wellSample.concentration)

        # There are no existing biosamples:
        self.assertFalse(
            'BioSamples' in ss.metadata.tags)
        # Therefore the metadata is falsy
        self.assertFalse(ss.metadata.bioSamples)

        ss.metadata.bioSamples.addSample('Clown')
        self.assertEqual('Clown', ss.metadata.bioSamples[0].name)

        ss.metadata.bioSamples[0].DNABarcodes.addBarcode('Dentist')
        self.assertEqual('Dentist',
                         ss.metadata.bioSamples[0].DNABarcodes[0].name)

        # check that we are adding one additional biosamples element:
        self.assertEqual(Counter(ss.metadata.tags)['BioSamples'], 1)
        # Therefore the metadata is truthy
        self.assertTrue(ss.metadata.bioSamples)
        ss.write(ofn, validate=False)
Example No. 51
def split_dataset(subreadset, out_prefix):
    """
    Takes an input dataset, and for each entry generates one separate dataset
    file, while maintaining all the filters.
    Returns a FOFN of the generated datasets.

    To create an example filtered dataset for testing:
    dataset create --type SubreadSet test.subreadset.xml subreads1.bam subreads2.bam
    dataset filter test.subreadset.xml test.filtered.subreadset.xml 'length>1000'
    """
    out_prefix_abs = os.path.abspath(out_prefix)

    dset = SubreadSet(subreadset, strict=True)
    fns = dset.toFofn()

    log.info('resources in {!r}:\n{}'.format(subreadset, '\n'.join(fns)))

    fofn = []
    for i, bam_fn in enumerate(fns):
        out_fn = '{}.{:05}.subreadset.xml'.format(out_prefix_abs, i)
        new_dataset = SubreadSet(bam_fn)
        new_dataset.newUuid()
        new_dataset._filters = copy.deepcopy(dset._filters)
        new_dataset.write(out_fn)
        fofn.append(out_fn)

    return fofn
Example No. 52
def run_bam_to_bam(subread_set_file, barcode_set_file, output_file_name,
                   nproc=1):
    bc = BarcodeSet(barcode_set_file)
    if len(bc.resourceReaders()) > 1:
        raise NotImplementedError("Multi-FASTA BarcodeSet input is not supported.")
    barcode_fasta = bc.toExternalFiles()[0]
    with SubreadSet(subread_set_file) as ds:
        # TODO(nechols)(2016-03-15): replace with BarcodedSubreadSet
        ds_new = SubreadSet(strict=True)
        for ext_res in ds.externalResources:
            subreads_bam = ext_res.bam
            scraps_bam = ext_res.scraps
            assert subreads_bam is not None
            if scraps_bam is None:
                raise TypeError("The input SubreadSet must include scraps.")
            new_prefix = op.join(op.dirname(output_file_name),
                re.sub(".subreads.bam", "_barcoded", op.basename(subreads_bam)))
            if not op.isabs(subreads_bam):
                subreads_bam = op.join(op.dirname(subread_set_file),
                    subreads_bam)
            if not op.isabs(scraps_bam):
                scraps_bam = op.join(op.dirname(subread_set_file), scraps_bam)
            args = [
                "bam2bam",
                "-j", str(nproc),
                "-b", str(nproc),
                "-o", new_prefix,
                "--barcodes", barcode_fasta,
                subreads_bam, scraps_bam
            ]
            print(args)
            log.info(" ".join(args))
            result = run_cmd(" ".join(args),
                             stdout_fh=sys.stdout,
                             stderr_fh=sys.stderr)
            if result.exit_code != 0:
                return result.exit_code
            subreads_bam = new_prefix + ".subreads.bam"
            scraps_bam = new_prefix + ".scraps.bam"
            assert op.isfile(subreads_bam), "Missing {f}".format(f=subreads_bam)
            # FIXME we need a more general method for this
            ext_res_new = ExternalResource()
            ext_res_new.resourceId = subreads_bam
            ext_res_new.metaType = 'PacBio.SubreadFile.SubreadBamFile'
            ext_res_new.addIndices([subreads_bam + ".pbi"])
            ext_res_inner = ExternalResources()
            ext_res_scraps = ExternalResource()
            ext_res_scraps.resourceId = scraps_bam
            ext_res_scraps.metaType = 'PacBio.SubreadFile.ScrapsBamFile'
            ext_res_scraps.addIndices([scraps_bam + ".pbi"])
            ext_res_inner.append(ext_res_scraps)
            ext_res_new.append(ext_res_inner)
            ds_new.externalResources.append(ext_res_new)
        ds._filters.clearCallbacks()
        ds_new._filters = ds._filters
        ds_new._populateMetaTypes()
        ds_new.updateCounts()
        ds_new.write(output_file_name)
    return 0
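
A hedged sketch of invoking run_bam_to_bam(); the input and output names are
assumptions, and the return value is the bam2bam exit code (0 on success):

# Illustrative only; file names are placeholders.
exit_code = run_bam_to_bam("movie.subreadset.xml",
                           "barcodes.barcodeset.xml",
                           "movie_barcoded.subreadset.xml",
                           nproc=8)
if exit_code != 0:
    sys.exit(exit_code)
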
Exemplo n.º 53
0
def to_zmw_chunked_subreadset_files(subreadset_path, max_total_nchunks,
                                    chunk_key, dir_name, base_name, ext):
    """Identical to to_chunked_subreadset_files, but chunks subreads by
    ZMW ranges for input to pbccs."""
    dset = SubreadSet(subreadset_path, strict=True)
    dset_chunks = dset.split(chunks=max_total_nchunks, zmws=True)
    d = {}
    for i, dset in enumerate(dset_chunks):
        chunk_id = '_'.join([base_name, str(i)])
        chunk_name = '.'.join([chunk_id, ext])
        chunk_path = os.path.join(dir_name, chunk_name)
        dset.write(chunk_path)
        d[chunk_key] = os.path.abspath(chunk_path)
        c = PipelineChunk(chunk_id, **d)
        yield c
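
Since this is a generator of PipelineChunk objects, a minimal consumption
sketch might look like the following (the chunk key, directory, and input path
are assumptions):

# Illustrative only; chunk key and paths are placeholders.
for chunk in to_zmw_chunked_subreadset_files(
        "movie.subreadset.xml", max_total_nchunks=24,
        chunk_key="$chunk.subreadset_id", dir_name="/tmp/chunks",
        base_name="chunk_subreadset", ext="subreadset.xml"):
    log.info("emitted chunk %s", chunk.chunk_id)
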
Exemplo n.º 54
0
def to_report(stats_xml, output_dir, dpi=72):
    """Main point of entry

    :type stats_xml: str
    :type output_dir: str
    :type dpi: int

    :rtype: Report
    """
    log.info("Analyzing XML {f}".format(f=stats_xml))
    # stats_xml should be a dataset:
    dset = SubreadSet(stats_xml)
    dataset_uuids = [dset.uuid]
    # but if it isn't, no problem:
    if not dset.metadata.summaryStats:
        dset.loadStats(stats_xml)
        # a raw sts.xml file was provided instead, so the wrapping dataset
        # gets a new random uuid; don't report it
        dataset_uuids = []
    if not dset.metadata.summaryStats.readLenDists:
        raise IOError("Pipeline Summary Stats (sts.xml) not found or missing "
                      "key distributions")

    attr = to_read_stats_attributes(
        readLenDists=dset.metadata.summaryStats.readLenDists,
        readQualDists=dset.metadata.summaryStats.readQualDists)
    attr.extend(to_insert_stats_attributes(
        readLenDists=dset.metadata.summaryStats.insertReadLenDists,
        readQualDists=dset.metadata.summaryStats.insertReadQualDists))

    plot_groups = to_read_stats_plots(
        readLenDists=dset.metadata.summaryStats.readLenDists,
        readQualDists=dset.metadata.summaryStats.readQualDists,
        output_dir=output_dir)
    plot_groups.extend(to_insert_stats_plots(
        readLenDists=dset.metadata.summaryStats.insertReadLenDists,
        readQualDists=dset.metadata.summaryStats.insertReadQualDists,
        output_dir=output_dir))

    # build the report:
    report = Report("raw_data_report",
                    title="Raw Data Report",
                    attributes=attr,
                    plotgroups=plot_groups,
                    dataset_uuids=dataset_uuids)

    return report
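
A small sketch of driving to_report() and persisting the result; the input
path is a placeholder, and write_json is assumed to be available on the
returned Report object as in pbcommand:

# Illustrative only; the input path is a placeholder.
report = to_report("movie.subreadset.xml", output_dir="./plots", dpi=72)
report.write_json("raw_data_report.json")
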
def run(subreadset, fofn):
    dir_name = os.getcwd()
    maxChunks = 0
    dset = SubreadSet(subreadset, strict=True)
    fns = dset.toFofn()
    import pprint
    log.info('resources in {!r}:\n{}'.format(subreadset, pprint.pformat(fns)))
    nrecs = len(dset)
    # HG with 70x coverage => 200G bases total
    #ts = 50000  # @ 20k/read => 1G bases, ~300MB .gz => ~200 chunks for Human
    ts = 500000  # @ 20k/read => 10G bases, ~3GB .gz => ~20 chunks for Human
    # and we expect about 7-10min per chunk.
    chunks = nrecs // ts
    log.info('num_chunks={:g} ({:g} / {:g})'.format(chunks, nrecs, ts))
    log.info('Splitting with dset.split(zmws=False, chunks={}, ignoreSubDatasets=True, maxChunks={},)'.format(
        chunks, maxChunks))
    dset_chunks = dset.split(zmws=False, chunks=chunks, ignoreSubDatasets=True, maxChunks=maxChunks,
            updateCounts=False,
            #targetSize=1, breakContigs=True
    )

    chunk_fns = []
    for i, dset in enumerate(dset_chunks):
        chunk_name = 'chunk_{:03d}.subreadset.xml'.format(i) # TODO: 02
        chunk_fn = os.path.join(dir_name, chunk_name)
        dset.updateCounts()
        dset.write(chunk_fn, validate=False) # , relPaths=True
        chunk_fns.append(chunk_fn)
    with open(fofn, 'w') as ofs:
        for fn in chunk_fns:
            ofs.write('{}\n'.format(fn))
    log.info('Wrote {} chunks into "{}"'.format(len(dset_chunks), fofn))
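
A hypothetical command-line wrapper around run(); the argument names are
assumptions and not part of the original module:

# Hypothetical CLI wrapper; illustrative only.
def main(argv=None):
    import argparse
    p = argparse.ArgumentParser(
        description="Split a SubreadSet into chunked datasets listed in a FOFN")
    p.add_argument("subreadset", help="input .subreadset.xml")
    p.add_argument("fofn", help="output FOFN listing the chunk XMLs")
    args = p.parse_args(argv)
    run(args.subreadset, args.fofn)
    return 0
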
    def test_isBarcoded(self):
        empty = upstreamdata.getEmptyBam()
        nonempty = ('/pbi/dept/secondary/siv/testdata/'
                    'pblaa-unittest/Sequel/Phi29/m54008_160219_003234'
                    '.tiny.subreadset.xml')

        # One empty, one non-empty
        sset = SubreadSet(nonempty, empty, skipMissing=True)
        self.assertTrue(sset.isBarcoded)

        # Just nonempty
        sset = SubreadSet(nonempty, skipMissing=True)
        self.assertEqual(len(sset), 15133)
        self.assertTrue(sset.isBarcoded)

        # Just empty
        #   This is crazy, the pbi must be out of date:
        sset = SubreadSet(empty)
        self.assertEqual(len(sset), 0)
        self.assertTrue(sset.isBarcoded)
        #   To confirm current behavior, I will regenerate the pbi with a
        #   current pbindex:
        efn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
        log.info("Copying to {}".format(efn))
        sset.copyTo(efn)
        sset.induceIndices(force=True)
        self.assertFalse(sset.isBarcoded)
Exemplo n.º 57
0
def get_subread_ZMW_stats(subread_xml, report):
    """
    Fills a dict with:
    'numZMW' -- number of sequencing ZMWs
    'numSubread' -- number of subreads
    'avgZMWlen' -- approximate average ZMW length
    'avgSubreadlen' -- average subread length
    """
    subread_lens = []
    zmw_lens = defaultdict(int)

    ds = SubreadSet(subread_xml)
    for rr in ds.resourceReaders():
        for zmw, qStart, qEnd in zip(rr.holeNumber, rr.qStart, rr.qEnd):
            subread_lens.append(qEnd-qStart)
            zmw_lens[zmw] = max(zmw_lens[zmw], qEnd)

    report['numZMW'] = len(zmw_lens)
    report['numSubread'] = len(subread_lens)
    report['avgZMWlen'] = int(sum(zmw_lens.values()) * 1. / len(zmw_lens))
    report['avgSubreadlen'] = int(sum(subread_lens) * 1. / len(subread_lens))
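
Because the function fills the supplied dict in place, a plain dict works as
the report argument; a short usage sketch with a placeholder path:

# Illustrative only; the subreadset path is a placeholder.
report = {}
get_subread_ZMW_stats("movie.subreadset.xml", report)
print("ZMWs: {numZMW}, subreads: {numSubread}, "
      "avg ZMW len: {avgZMWlen}, avg subread len: {avgSubreadlen}".format(**report))
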
Exemplo n.º 58
0
    def test_subreadset_metadata_element_name(self):
        # without touching the element:
        sset = SubreadSet(data.getXml(10))
        log.debug(data.getXml(10))
        fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
        log.debug(fn)
        sset.write(fn)
        f = ET.parse(fn)
        self.assertEqual(len(f.getroot().findall(
            '{http://pacificbiosciences.com/PacBioDatasets.xsd}'
            'SubreadSetMetadata')),
            0)
        self.assertEqual(len(f.getroot().findall(
            '{http://pacificbiosciences.com/PacBioDatasets.xsd}'
            'DataSetMetadata')),
            1)

        # with touching the element:
        sset = SubreadSet(data.getXml(10))
        sset.metadata.description = 'foo'
        fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
        sset.write(fn, validate=False)
        f = ET.parse(fn)
        self.assertEqual(len(f.getroot().findall(
            '{http://pacificbiosciences.com/PacBioDatasets.xsd}'
            'SubreadSetMetadata')),
            0)
        self.assertEqual(len(f.getroot().findall(
            '{http://pacificbiosciences.com/PacBioDatasets.xsd}'
            'DataSetMetadata')),
            1)
 def test_merge_biosamples(self):
     import pbtestdata
     ds1 = pbtestdata.get_file("subreads-biosample-1")
     ds2 = pbtestdata.get_file("subreads-biosample-2")
     # Case 1: two biosamples
     ds = SubreadSet(ds1, ds2)
     samples = [bs.name for bs in ds.metadata.bioSamples]
     self.assertEqual(samples, ["Alice", "Bob"])
     # Case 2: same biosample in both files
     ds = SubreadSet(ds1, ds1)
     samples = [bs.name for bs in ds.metadata.bioSamples]
     self.assertEqual(samples, ["Alice"])
     self.assertEqual(len(ds.metadata.bioSamples[0].DNABarcodes), 1)
     # Case 3: same biosample, different barcodes
     dsTmp = SubreadSet(ds1)
     dsTmp.metadata.bioSamples[0].DNABarcodes[0].name = "F7--R7"
     tmpFile = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
     dsTmp.write(tmpFile)
     ds = SubreadSet(ds1, tmpFile)
     samples = [bs.name for bs in ds.metadata.bioSamples]
     self.assertEqual(samples, ["Alice"])
     bcs = [bc.name for bc in ds.metadata.bioSamples[0].DNABarcodes]
     self.assertEqual(bcs, ["F1--R1", "F7--R7"])
    def test_movie_split(self):
        N_RECORDS = 1745161
        N_RECORDS_1 = 959539
        N_RECORDS_2 = 785622
        test_file_1 = ("/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/"
                       "2372215/0007/Analysis_Results/m150404_101626_42"
                       "267_c100807920800000001823174110291514_s1_p0.al"
                       "l.subreadset.xml")
        test_file_2 = ("/pbi/dept/secondary/siv/testdata/SA3-DS/lambda/"
                       "2590980/0008/Analysis_Results/m141115_075238_et"
                       "han_c100699872550000001823139203261572_s1_p0.al"
                       "l.subreadset.xml")
        ds1 = SubreadSet(test_file_1, test_file_2)
        # used to get total:
        #self.assertEqual(sum(1 for _ in ds1), N_RECORDS)
        self.assertEqual(len(ds1), N_RECORDS)
        dss = ds1.split_movies(1)
        self.assertEqual(len(dss), 1)
        self.assertEqual(sum([len(ds_) for ds_ in dss]),
                         N_RECORDS)
        self.assertEqual(len(ds1), N_RECORDS)
        self.assertFalse(ds1.filters)

        dss = ds1.split_movies(12)
        self.assertEqual(len(dss), 2)
        self.assertEqual(sum([len(ds_) for ds_ in dss]),
                         N_RECORDS)
        self.assertEqual(len(set(dss[0].index.qId)), 1)
        self.assertEqual(len(set(dss[-1].index.qId)), 1)
        self.assertEqual(
            dss[0].qid2mov[list(set(dss[0].index.qId))[0]],
            'm150404_101626_42267_c100807920800000001823174110291514_s1_p0')
        self.assertEqual(len(dss[0]), N_RECORDS_1)
        self.assertEqual(
            dss[-1].qid2mov[list(set(dss[-1].index.qId))[0]],
            'm141115_075238_ethan_c100699872550000001823139203261572_s1_p0')
        self.assertEqual(len(dss[-1]), N_RECORDS_2)