def test_copy(self):
    ds1 = DataSet(data.getXml())
    ds2 = ds1.copy()
    self.assertFalse(ds1 == ds2)
    self.assertFalse(ds1.uuid == ds2.uuid)
    self.assertFalse(ds1 is ds2)
    self.assertTrue(ds1.name == ds2.name)
    self.assertTrue(ds1.externalResources == ds2.externalResources)
    # The name and UniqueId are different:
    self.assertFalse(ds1.objMetadata == ds2.objMetadata)
    self.assertTrue(ds1.filters == ds2.filters)
    self.assertTrue(ds1.subdatasets == ds2.subdatasets)
    self.assertTrue(len(ds1.subdatasets) == 2)
    self.assertTrue(len(ds2.subdatasets) == 2)
    assert not reduce(lambda x, y: x or y,
                      [ds1d is ds2d
                       for ds1d in ds1.subdatasets
                       for ds2d in ds2.subdatasets])
    # TODO: once simulated files are indexable, turn on strict:
    ds1 = SubreadSet(data.getXml(no=10), strict=False)
    self.assertEquals(type(ds1.metadata).__name__, 'SubreadSetMetadata')
    ds2 = ds1.copy()
    self.assertEquals(type(ds2.metadata).__name__, 'SubreadSetMetadata')
    # Lets try casting
    ds1 = DataSet(data.getBam())
    self.assertEquals(type(ds1).__name__, 'DataSet')
    ds1 = ds1.copy(asType='SubreadSet')
    self.assertEquals(type(ds1).__name__, 'SubreadSet')
    # Lets do some illicit casting
    with self.assertRaises(TypeError):
        ds1 = ds1.copy(asType='ReferenceSet')
    # Lets try not having to cast
    ds1 = SubreadSet(data.getBam())
    self.assertEquals(type(ds1).__name__, 'SubreadSet')

def test_refLengths(self):
    ds = DataSet(data.getBam(0))
    random_few = {'B.cereus.6': 1472,
                  'S.agalactiae.1': 1470,
                  'B.cereus.4': 1472}
    for key, value in random_few.items():
        self.assertEqual(ds.refLengths[key], value)

    # this is a hack to only emit refNames that actually have records
    # associated with them:
    dss = ds.split(contigs=True, chunks=1)[0]
    self.assertEqual(dss.refLengths, {'B.vulgatus.4': 1449,
                                      'B.vulgatus.5': 1449,
                                      'C.beijerinckii.13': 1433,
                                      'C.beijerinckii.14': 1433,
                                      'C.beijerinckii.9': 1433,
                                      'E.coli.6': 1463,
                                      'E.faecalis.1': 1482,
                                      'E.faecalis.2': 1482,
                                      'R.sphaeroides.1': 1386,
                                      'S.epidermidis.2': 1472,
                                      'S.epidermidis.3': 1472,
                                      'S.epidermidis.4': 1472})

def test_filter(self):
    ds2 = DataSet(data.getXml(8))
    ds2.filters.addRequirement(rname=[('=', 'E.faecalis.1')])
    self.assertEqual(len(list(ds2.records)), 20)
    ds2.disableFilters()
    self.assertEqual(len(list(ds2.records)), 92)
    ds2.enableFilters()
    self.assertEqual(len(list(ds2.records)), 20)

def to_report(stats_xml):
    """Main point of entry

    :type stats_xml: str
    :rtype: Report
    """
    log.info("Analyzing XML {f}".format(f=stats_xml))
    dset = DataSet(stats_xml)
    if not dset.metadata.summaryStats:
        dset.loadStats(stats_xml)
    if not dset.metadata.summaryStats.prodDist:
        raise IOError("Pipeline Summary Stats (sts.xml) not found or missing "
                      "key distributions")
    dsets = [dset]
    for subdset in dset.subdatasets:
        if subdset.metadata.summaryStats:
            dsets.append(subdset)

    col_ids = [Constants.C_CONTEXT,
               Constants.C_ZMWS,
               Constants.C_PROD_0,
               Constants.C_PROD_1,
               Constants.C_PROD_2]
    col_values = [[], [], [], [], []]
    for dset in dsets:
        if len(dsets) > 1 and len(col_values[0]) == 0:
            movie_name = "Combined"
        else:
            try:
                collection = list(dset.metadata.collections)[0]
                movie_name = collection.context
            except AttributeError:
                movie_name = "NA"

        productive_zmws = int(dset.metadata.summaryStats.numSequencingZmws)
        empty, productive, other, _ = dset.metadata.summaryStats.prodDist.bins

        prod0 = np.round(100.0 * empty / float(productive_zmws),
                         decimals=Constants.DECIMALS)
        prod1 = np.round(100.0 * productive / float(productive_zmws),
                         decimals=Constants.DECIMALS)
        prod2 = np.round(100.0 * other / float(productive_zmws),
                         decimals=Constants.DECIMALS)
        this_row = [movie_name, productive_zmws, prod0, prod1, prod2]
        map(lambda (x, y): x.append(y), zip(col_values, this_row))

    columns = [Column(cid, values=vals)
               for cid, vals in zip(col_ids, col_values)]
    tables = [Table(Constants.T_LOADING, columns=columns)]
    report = Report(meta_rpt.id,
                    title=meta_rpt.title,
                    tables=tables,
                    attributes=None,
                    plotgroups=None)
    return meta_rpt.apply_view(report)

def test_addFilters(self):
    ds1 = DataSet()
    filt = Filters()
    filt.addRequirement(rq=[('>', '0.85')])
    ds1.addFilters(filt)
    assert str(ds1.filters) == '( rq > 0.85 )'
    # Or added from a source XML
    ds2 = DataSet(data.getXml(15))
    assert str(ds2.filters).startswith('( rname = E.faecalis')

def test_addFilters(self):
    ds1 = DataSet()
    filt = Filters()
    filt.addRequirement(rq=[('>', '0.85')])
    ds1.addFilters(filt)
    self.assertEquals(str(ds1.filters), '( rq > 0.85 )')
    # Or added from a source XML
    ds2 = DataSet(data.getXml(16))
    self.assertTrue(str(ds2.filters).startswith('( rname = E.faecalis'))

def to_report(stats_xml):
    """Main point of entry

    :type stats_xml: str
    :rtype: Report
    """
    log.info("Analyzing XML {f}".format(f=stats_xml))
    dset = DataSet(stats_xml)
    if not dset.metadata.summaryStats:
        dset.loadStats(stats_xml)
    if not dset.metadata.summaryStats.prodDist:
        raise IOError("Pipeline Summary Stats (sts.xml) not found or missing "
                      "key distributions")
    dsets = [dset]
    for subdset in dset.subdatasets:
        if subdset.metadata.summaryStats:
            dsets.append(subdset)

    col_names = ["Collection Context", "Productive ZMWs",
                 "Productivity 0 (%)", "Productivity 1 (%)",
                 "Productivity 2 (%)"]
    col_values = [[], [], [], [], []]
    for dset in dsets:
        if len(dsets) > 1 and len(col_values[0]) == 0:
            movie_name = "Combined"
        else:
            try:
                collection = list(dset.metadata.collections)[0]
                movie_name = collection.context
            except AttributeError:
                movie_name = "NA"

        productive_zmws = int(dset.metadata.summaryStats.numSequencingZmws)
        empty, productive, other, _ = dset.metadata.summaryStats.prodDist.bins

        prod0 = np.round(100.0 * empty / float(productive_zmws),
                         decimals=Constants.DECIMALS)
        prod1 = np.round(100.0 * productive / float(productive_zmws),
                         decimals=Constants.DECIMALS)
        prod2 = np.round(100.0 * other / float(productive_zmws),
                         decimals=Constants.DECIMALS)
        this_row = [movie_name, productive_zmws, prod0, prod1, prod2]
        map(lambda (x, y): x.append(y), zip(col_values, this_row))

    columns = [Column(cn.translate(None, '(%)').strip().replace(' ', '_').lower(),
                      cn, vals)
               for cn, vals in zip(col_names, col_values)]
    tables = [Table("loading_xml_table", "Loading Statistics", columns)]
    report = Report("loading_xml_report", title="Loading Report",
                    tables=tables, attributes=None, plotgroups=None)
    return report

def test_split_by_contigs_with_split(self):
    # test to make sure the refWindows work when chunks == # refs
    ds3 = DataSet(data.getBam())
    dss = ds3.split(contigs=True)
    self.assertEqual(len(dss), 12)
    refWindows = sorted(reduce(lambda x, y: x + y,
                               [ds.refWindows for ds in dss]))
    # not all references have something mapped to them, refWindows doesn't
    # care...
    self.assertNotEqual(refWindows, sorted(ds3.refWindows))
    random_few = [('C.beijerinckii.13', 0, 1433),
                  ('B.vulgatus.4', 0, 1449),
                  ('E.faecalis.1', 0, 1482)]
    for reference in random_few:
        found = False
        for ref in refWindows:
            if ref == reference:
                found = True
        self.assertTrue(found)
    old_refWindows = refWindows

    dss = ds3.split(contigs=True, chunks=1)
    self.assertEqual(len(dss), 1)
    refWindows = sorted(reduce(lambda x, y: x + y,
                               [ds.refWindows for ds in dss]))
    self.assertEqual(refWindows, old_refWindows)

    dss = ds3.split(contigs=True, chunks=24)
    self.assertEqual(len(dss), 24)
    refWindows = sorted(reduce(lambda x, y: x + y,
                               [ds.refWindows for ds in dss]))
    random_few = [('E.faecalis.2', 0, 741),
                  ('E.faecalis.2', 741, 1482)]
    for ref in random_few:
        found = False
        for window in refWindows:
            if ref == window:
                found = True
        if not found:
            log.debug(ref)
        self.assertTrue(found)

    dss = ds3.split(contigs=True, chunks=36)
    self.assertEqual(len(dss), 36)
    refWindows = sorted(reduce(lambda x, y: x + y,
                               [ds.refWindows for ds in dss]))
    random_few = [('E.faecalis.2', 0, 494),
                  ('E.faecalis.2', 494, 988),
                  ('E.faecalis.2', 988, 1482)]
    for ref in random_few:
        found = False
        for window in refWindows:
            if ref == window:
                found = True
        self.assertTrue(found)

def test_setFilters(self):
    ds1 = DataSet()
    filt = Filters()
    filt.addRequirement(rq=[('>', '0.85')])
    ds1.addFilters(filt)
    self.assertEquals(str(ds1.filters), '( rq > 0.85 )')
    # Or added from a source XML
    ds2 = DataSet()
    ds2.filters = ds1.filters
    self.assertEquals(str(ds2.filters), '( rq > 0.85 )')

def to_report(stats_xml):
    """Main point of entry

    :type stats_xml: str
    :rtype: Report
    """
    log.info("Analyzing XML {f}".format(f=stats_xml))
    dset = DataSet(stats_xml)
    if not dset.metadata.summaryStats:
        dset.loadStats(stats_xml)
    if not dset.metadata.summaryStats.prodDist:
        raise IOError("Pipeline Summary Stats (sts.xml) not found or missing "
                      "key distributions")
    dsets = [dset]
    for subdset in dset.subdatasets:
        if subdset.metadata.summaryStats:
            dsets.append(subdset)

    col_ids = [Constants.C_CONTEXT,
               Constants.C_ZMWS,
               Constants.C_PROD_0,
               Constants.C_PROD_1,
               Constants.C_PROD_2]
    col_values = [[], [], [], [], []]
    for dset in dsets:
        if len(dsets) > 1 and len(col_values[0]) == 0:
            movie_name = "Combined"
        else:
            try:
                collection = list(dset.metadata.collections)[0]
                movie_name = collection.context
            except AttributeError:
                movie_name = "NA"

        productive_zmws = int(dset.metadata.summaryStats.numSequencingZmws)
        empty, productive, other, _ = dset.metadata.summaryStats.prodDist.bins

        prod0 = np.round(100.0 * empty / float(productive_zmws),
                         decimals=Constants.DECIMALS)
        prod1 = np.round(100.0 * productive / float(productive_zmws),
                         decimals=Constants.DECIMALS)
        prod2 = np.round(100.0 * other / float(productive_zmws),
                         decimals=Constants.DECIMALS)
        this_row = [movie_name, productive_zmws, prod0, prod1, prod2]
        map(lambda (x, y): x.append(y), zip(col_values, this_row))

    columns = [Column(cid, values=vals)
               for cid, vals in zip(col_ids, col_values)]
    tables = [Table(Constants.T_LOADING, columns=columns)]
    report = Report(Constants.R_ID,
                    tables=tables,
                    attributes=None,
                    plotgroups=None)
    return spec.apply_view(report)

def test_addMetadata(self):
    ds = DataSet()
    ds.addMetadata(None, Name='LongReadsRock')
    self.assertEquals(ds._metadata.getV(container='attrib', tag='Name'),
                      'LongReadsRock')
    ds2 = DataSet(data.getXml(no=8))
    self.assertEquals(ds2._metadata.totalLength, 123588)
    ds2._metadata.totalLength = 100000
    self.assertEquals(ds2._metadata.totalLength, 100000)
    ds2._metadata.totalLength += 100000
    self.assertEquals(ds2._metadata.totalLength, 200000)

def test_checkFilterMatch(self):
    # different resourceIds, compatible filters:
    ds1 = DataSet(data.getXml(no=8))
    ds2 = DataSet(data.getXml(no=11))
    #self.assertTrue(ds1._checkFilterMatch(ds2.filters))
    self.assertTrue(ds1.filters.testCompatibility(ds2.filters))
    # different resourceIds, incompatible filters:
    ds3 = DataSet(data.getXml(no=11))
    ds3.filters.addRequirement(rname=[('=', 'E.faecalis.1')])
    #self.assertFalse(ds1._checkFilterMatch(ds3.filters))
    self.assertFalse(ds1.filters.testCompatibility(ds3.filters))

def test_toExternalFiles(self):
    bogusDS = DataSet("bam1.bam", "bam2.bam", strict=False)
    self.assertEqual(['bam1.bam', 'bam2.bam'],
                     bogusDS.externalResources.resourceIds)
    self.assertEquals(
        DataSet("bam1.bam", "bam2.bam", strict=False).toExternalFiles(),
        ['bam1.bam', 'bam2.bam'])
    realDS = DataSet(data.getXml(8))
    files = realDS.toExternalFiles()
    self.assertEqual(len(files), 1)
    self.assertTrue(os.path.exists(files[0]))
    self.assertTrue(os.path.isabs(files[0]))

def test_build(self):
    # Progs like pbalign provide a .bam file:
    # e.g. d = DataSet("aligned.bam")
    # Something like the test files we have:
    inBam = data.getBam()
    self.assertTrue(inBam.endswith('.bam'))
    d = DataSet(inBam)
    # A UniqueId is generated, despite being a BAM input
    self.assertTrue(d.uuid != '')
    dOldUuid = d.uuid
    # They can write this BAM to an XML:
    # e.g. d.write("alignmentset.xml")
    outdir = tempfile.mkdtemp(suffix="dataset-unittest")
    outXml = os.path.join(outdir, 'tempfile.xml')
    d.write(outXml)
    # And then recover the same XML (or a different one):
    # e.g. d = DataSet("alignmentset.xml")
    d = DataSet(outXml)
    # The UniqueId will be the same
    self.assertTrue(d.uuid == dOldUuid)
    # Inputs can be many and varied
    ds1 = DataSet(data.getXml(11), data.getBam())
    self.assertEquals(ds1.numExternalResources, 2)
    ds1 = DataSet(data.getFofn())
    self.assertEquals(ds1.numExternalResources, 2)
    # New! Use the correct constructor:
    self.assertEquals(type(SubreadSet(data.getSubreadSet())).__name__,
                      'SubreadSet')
    # Even with untyped inputs
    self.assertTrue(str(SubreadSet(data.getBam())).startswith(
        '<SubreadSet'))
    self.assertEquals(type(SubreadSet(data.getBam())).__name__,
                      'SubreadSet')
    self.assertEquals(type(DataSet(data.getBam())).__name__, 'DataSet')
    # You can also cast up and down, but casting between siblings
    # is limited (abuse at your own risk)
    self.assertEquals(
        type(DataSet(data.getBam()).copy(asType='SubreadSet')).__name__,
        'SubreadSet')
    self.assertEquals(
        type(SubreadSet(data.getBam()).copy(asType='DataSet')).__name__,
        'DataSet')
    # Add external Resources:
    ds = DataSet()
    ds.externalResources.addResources(["IdontExist.bam"])
    self.assertTrue(
        ds.externalResources[-1].resourceId == "IdontExist.bam")
    # Add an index file
    ds.externalResources[-1].addIndices(["IdontExist.bam.pbi"])
    self.assertTrue(
        ds.externalResources[-1].indices[0].resourceId ==
        "IdontExist.bam.pbi")

def test_toFofn(self):
    self.assertEquals(
        DataSet("bam1.bam", "bam2.bam", strict=False).toFofn(),
        ['bam1.bam', 'bam2.bam'])
    realDS = DataSet(data.getXml(8))
    files = realDS.toFofn()
    self.assertEqual(len(files), 1)
    self.assertTrue(os.path.exists(files[0]))
    self.assertTrue(os.path.isabs(files[0]))
    files = realDS.toFofn(relative=True)
    self.assertEqual(len(files), 1)
    self.assertTrue(os.path.exists(files[0]))
    self.assertFalse(os.path.isabs(files[0]))

def test_split(self):
    ds1 = DataSet(data.getXml())
    self.assertTrue(ds1.numExternalResources > 1)
    dss = ds1.split()
    self.assertTrue(len(dss) == ds1.numExternalResources)
    dss = ds1.split(chunks=1)
    self.assertTrue(len(dss) == 1)
    dss = ds1.split(chunks=2, ignoreSubDatasets=True)
    self.assertTrue(len(dss) == 2)
    self.assertFalse(dss[0].uuid == dss[1].uuid)
    self.assertTrue(dss[0].name == dss[1].name)
    # Lets try merging and splitting on subdatasets
    ds1 = DataSet(data.getXml(8))
    self.assertEquals(ds1.totalLength, 123588)
    ds1tl = ds1.totalLength
    ds2 = DataSet(data.getXml(11))
    self.assertEquals(ds2.totalLength, 117086)
    ds2tl = ds2.totalLength
    dss = ds1 + ds2
    self.assertTrue(dss.totalLength == (ds1tl + ds2tl))
    ds1, ds2 = sorted(dss.split(2),
                      key=lambda x: x.totalLength,
                      reverse=True)
    self.assertTrue(ds1.totalLength == ds1tl)
    self.assertTrue(ds2.totalLength == ds2tl)

def test_addExternalResources(self):
    ds = DataSet()
    er1 = ExternalResource()
    er1.resourceId = "test1.bam"
    er2 = ExternalResource()
    er2.resourceId = "test2.bam"
    er3 = ExternalResource()
    er3.resourceId = "test1.bam"
    ds.addExternalResources([er1], updateCount=False)
    self.assertEquals(ds.numExternalResources, 1)
    # different resourceId: succeeds
    ds.addExternalResources([er2], updateCount=False)
    self.assertEquals(ds.numExternalResources, 2)
    # same resourceId: fails
    ds.addExternalResources([er3], updateCount=False)
    self.assertEquals(ds.numExternalResources, 2)
    for extRef in ds.externalResources:
        self.assertEqual(type(extRef).__name__, "ExternalResource")
    extRef = ds.externalResources[0]
    self.assertEqual(type(extRef).__name__, "ExternalResource")
    self.assertEqual(extRef.resourceId, 'test1.bam')
    extRef = ds.externalResources[1]
    self.assertEqual(type(extRef).__name__, "ExternalResource")
    self.assertEqual(extRef.resourceId, 'test2.bam')

def test_reads_in_reference(self):
    ds = DataSet(data.getBam())
    refNames = ds.refNames

    # See test_ref_names for why this is expected:
    rn = refNames[15]
    reads = ds.readsInReference(rn)
    self.assertEqual(len(list(reads)), 11)

    ds2 = DataSet(data.getBam(0))
    reads = ds2.readsInReference("E.faecalis.1")
    self.assertEqual(len(list(reads)), 20)

    reads = ds2.readsInReference("E.faecalis.2")
    self.assertEqual(len(list(reads)), 3)

    ds2 = DataSet(data.getXml(8))
    reads = ds2.readsInReference("E.faecalis.1")
    self.assertEqual(len(list(reads)), 20)

    ds2.filters.addRequirement(rname=[('=', 'E.faecalis.1')])

    # Because of the filter!
    reads = ds2.readsInReference("E.faecalis.2")
    self.assertEqual(len(list(reads)), 0)

def loadStatsXml(args):
    dset = DataSet(args.infile, strict=args.strict)
    dset.loadStats(args.statsfile)
    if args.outfile:
        dset.write(args.outfile, validate=False)
    else:
        dset.write(args.infile, validate=False)

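# A minimal invocation sketch for loadStatsXml above, assuming an
# argparse-style namespace. The attribute names (infile, statsfile, outfile,
# strict) mirror the function; the file paths are placeholders, not real data.
from argparse import Namespace

_example_load_args = Namespace(infile="movie.subreadset.xml",
                               statsfile="movie.sts.xml",
                               outfile="movie_with_stats.subreadset.xml",
                               strict=False)
loadStatsXml(_example_load_args)
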
def test_reads_in_subdataset(self):
    ds = DataSet(data.getXml(8))
    #refs = ['E.faecalis.1', 'E.faecalis.2']
    #readRefs = ['E.faecalis.1'] * 2 + ['E.faecalis.2'] * 9
    #ds.filters.removeRequirement('rname')
    dss = ds.split(contigs=True)
    self.assertEqual(len(dss), 12)
    self.assertEqual(['B.vulgatus.4', 'B.vulgatus.5',
                      'C.beijerinckii.13', 'C.beijerinckii.14',
                      'C.beijerinckii.9', 'E.coli.6', 'E.faecalis.1',
                      'E.faecalis.2', 'R.sphaeroides.1',
                      'S.epidermidis.2', 'S.epidermidis.3',
                      'S.epidermidis.4'],
                     sorted([ds.filters[0][0].value for ds in dss]))
    self.assertEqual(len(list(dss[0].readsInSubDatasets())), 3)
    self.assertEqual(len(list(dss[1].readsInSubDatasets())), 20)

def createXml(args):
    if args.dsType is None:
        dset = openDataFile(*args.infile, strict=args.strict,
                            skipCounts=args.skipCounts,
                            generateIndices=args.generateIndices)
    else:
        dsTypes = DataSet.castableTypes()
        dset = dsTypes[args.dsType](*args.infile,
                                    strict=args.strict,
                                    skipCounts=args.skipCounts,
                                    generateIndices=args.generateIndices)
    if args.generateIndices:
        # we generated the indices with the last open, lets capture them with
        # this one:
        dset = dsTypes[args.dsType](*args.infile,
                                    strict=args.strict,
                                    skipCounts=args.skipCounts)
    if args.dsName != '':
        dset.name = args.dsName
    log.debug("Dataset created")
    dset.write(args.outfile, validate=args.novalidate, modPaths=True,
               relPaths=args.relative)
    log.debug("Dataset written")
    return 0

def splitXml(args):
    log.debug("Starting split")
    dataSet = DataSet(args.infile, strict=args.strict)
    chunks = len(args.outfiles)
    if args.chunks:
        chunks = args.chunks
    dss = dataSet.split(chunks=chunks,
                        ignoreSubDatasets=(not args.subdatasets),
                        contigs=args.contigs,
                        maxChunks=args.maxChunks,
                        breakContigs=args.breakContigs)
    log.debug("Split into {i} chunks".format(i=len(dss)))
    infix = 'chunk{i}'
    if args.contigs:
        infix += 'contigs'
    if not args.outfiles:
        if not args.outdir:
            args.outfiles = ['.'.join(args.infile.split('.')[:-1] +
                                      [infix.format(i=chNum), 'xml'])
                             for chNum in range(len(dss))]
        else:
            args.outfiles = ['.'.join(args.infile.split('.')[:-1] +
                                      [infix.format(i=chNum), 'xml'])
                             for chNum in range(len(dss))]
            args.outfiles = [os.path.join(args.outdir,
                                          os.path.basename(outfn))
                             for outfn in args.outfiles]
    num = len(dss)
    end = ''
    if num > 5:
        num = 5
        end = '...'
    log.debug("Emitting {f} {e}".format(f=', '.join(args.outfiles[:num]),
                                        e=end))
    log.debug("Finished splitting, now writing")
    for out_fn, dset in zip(args.outfiles, dss):
        dset.write(out_fn)
    log.debug("Done writing files")

def test_refWindows(self):
    ds = DataSet(data.getBam())
    dss = ds.split(chunks=2, contigs=True)
    self.assertEqual(len(dss), 2)
    log.debug(dss[0].filters)
    log.debug(dss[1].filters)
    self.assertTrue('( rname = E.faecalis.2 ) ' in str(dss[0].filters) or
                    '( rname = E.faecalis.2 ) ' in str(dss[1].filters))
    ds = DataSet(data.getBam())
    ds.filters.addRequirement(rname=[('=', 'lambda_NEB3011'),
                                     ('=', 'lambda_NEB3011')],
                              tStart=[('<', '0'),
                                      ('<', '100')],
                              tEnd=[('>', '99'),
                                    ('>', '299')])
    self.assertEqual(str(ds.filters),
                     '( rname = lambda_NEB3011 AND tstart '
                     '< 0 AND tend > 99 ) OR ( rname = lambd'
                     'a_NEB3011 AND tstart < 100 AND tend > 299 )')

def createXml(args):
    dsTypes = DataSet.castableTypes()
    dset = dsTypes[args.dsType](*args.infile,
                                strict=args.strict,
                                skipCounts=args.skipCounts)
    log.debug("Dataset created")
    dset.write(args.outfile, validate=args.novalidate, modPaths=True,
               relPaths=args.relative)
    log.debug("Dataset written")

def filterXml(args):
    log.error("Adding filters via CLI is temporarily out of order")
    exit(1)
    if args.infile.endswith('xml'):
        dataSet = DataSet(args.infile, strict=args.strict)
        filters = {}
        separators = ['<=', '>=', '!=', '==', '>', '<', '=']
        for filt in args.filters:
            for sep in separators:
                if sep in filt:
                    param, condition = filt.split(sep)
                    condition = sep + condition
                    filters[param] = condition
                    break
        dataSet.addFilters([filters])
        log.info("{i} filters added".format(i=len(filters)))
        dataSet.write(args.outfile)
    else:
        raise IOError("No files found/found to be compatible")

def test_merge(self):
    # xmls with different resourceIds: success
    ds1 = DataSet(data.getXml(no=8))
    ds2 = DataSet(data.getXml(no=11))
    ds3 = ds1 + ds2
    expected = ds1.numExternalResources + ds2.numExternalResources
    self.assertTrue(ds3.numExternalResources == expected)
    # xmls with different resourceIds but conflicting filters:
    # failure to merge
    ds2 = DataSet(data.getXml(no=11))
    ds2.filters.addRequirement(rname=[('=', 'E.faecalis.1')])
    ds3 = ds1 + ds2
    self.assertEqual(ds3, None)
    # xmls with same resourceIds: ignores new inputs
    ds1 = DataSet(data.getXml(no=8))
    ds2 = DataSet(data.getXml(no=8))
    ds3 = ds1 + ds2
    expected = ds1.numExternalResources
    self.assertTrue(ds3.numExternalResources == expected)

def test_reads_in_range(self):
    ds = DataSet(data.getBam())
    refNames = ds.refNames

    rn = refNames[15]
    reads = ds.readsInRange(rn, 10, 100)
    self.assertEqual(len(list(reads)), 10)

    ds2 = DataSet(data.getBam(0))
    reads = ds2.readsInRange("E.faecalis.1", 0, 1400)
    self.assertEqual(len(list(reads)), 20)

    lengths = ds.refLengths
    for rname, rId in ds.refInfo('ID'):
        rn = ds._idToRname(rId)
        self.assertEqual(rname, rn)
        rlen = lengths[rn]
        self.assertEqual(len(list(ds.readsInReference(rn))),
                         len(list(ds.readsInReference(rId))))
        self.assertEqual(len(list(ds.readsInRange(rn, 0, rlen))),
                         len(list(ds.readsInRange(rId, 0, rlen))))

def createXml(args):
    dsTypes = DataSet.castableTypes()
    dset = dsTypes[args.dsType](*args.infile,
                                strict=args.strict,
                                skipCounts=args.skipCounts,
                                generateIndices=args.generateIndices)
    if args.generateIndices:
        # we generated the indices with the last open, lets capture them with
        # this one:
        dset = dsTypes[args.dsType](*args.infile,
                                    strict=args.strict,
                                    skipCounts=args.skipCounts)
    log.debug("Dataset created")
    dset.write(args.outfile, validate=args.novalidate, modPaths=True,
               relPaths=args.relative)
    log.debug("Dataset written")

def test_split_by_contigs_presplit(self):
    # Consumes too much memory for Jenkins

    # Test to make sure the result of a split by contigs has an appropriate
    # number of records (make sure filters are appropriately aggressive)
    ds2 = DataSet(data.getXml(15))
    bams = ds2.externalResources.resourceIds
    self.assertEqual(len(bams), 2)
    refwindows = ds2.refWindows
    self.assertEqual(refwindows, [(0, 0, 224992)])
    res1 = openIndexedAlignmentFile(bams[0][7:])
    res2 = openIndexedAlignmentFile(bams[1][7:])

    def count(iterable):
        count = 0
        for _ in iterable:
            count += 1
        return count

    self.assertEqual(count(res1.readsInRange(*refwindows[0])), 1409)
    self.assertEqual(count(res2.readsInRange(*refwindows[0])), 1375)
    self.assertEqual(count(ds2.readsInRange(*refwindows[0])), 2784)
    self.assertEqual(count(ds2.records), 2784)
    ds2.disableFilters()
    self.assertEqual(count(ds2.records), 53552)
    self.assertEqual(ds2.countRecords(), 53552)

def test_split_by_contigs_presplit(self):
    # Consumes too much memory for Jenkins

    # Test to make sure the result of a split by contigs has an appropriate
    # number of records (make sure filters are appropriately aggressive)
    ds2 = DataSet(data.getXml(14))
    bams = ds2.externalResources.resourceIds
    assert len(bams) == 2
    refwindows = ds2.refWindows
    assert refwindows == [(0, 0, 224992)]
    res1 = openIndexedAlignmentFile(bams[0][7:])
    res2 = openIndexedAlignmentFile(bams[1][7:])

    def count(iterable):
        count = 0
        for _ in iterable:
            count += 1
        return count

    assert count(res1.readsInRange(*refwindows[0])) == 1409
    assert count(res2.readsInRange(*refwindows[0])) == 1375
    assert count(ds2.readsInRange(*refwindows[0])) == 2784
    assert count(ds2.records) == 2784
    ds2.disableFilters()
    assert count(ds2.records) == 53552
    assert ds2.countRecords() == 53552

def createXml(args):
    if os.path.exists(args.outfile) and not args.force:
        raise IOError("Output file {} already exists. Use --force to "
                      "clobber".format(args.outfile))
    if args.dsType is None:
        dset = openDataFile(*args.infile,
                            strict=args.strict,
                            skipCounts=args.skipCounts,
                            trustCounts=args.trustCounts,
                            generateIndices=args.generateIndices,
                            referenceFastaFname=args.reference_fasta_fname)
    else:
        dsTypes = DataSet.castableTypes()
        dset = dsTypes[args.dsType](
            *args.infile,
            strict=args.strict,
            skipCounts=args.skipCounts,
            trustCounts=args.trustCounts,
            generateIndices=args.generateIndices,
            referenceFastaFname=args.reference_fasta_fname)
    if args.dsName != '':
        dset.name = args.dsName
    if args.metadata:
        dset.loadMetadata(args.metadata)
    if args.well_sample_name or args.bio_sample_name:
        if args.metadata:
            log.warning(
                "Setting the WellSample or BioSample name will overwrite "
                "fields pulled from %s", args.metadata)
        n_new_collections = add_mock_collection_metadata(dset)
        if n_new_collections > 0:
            log.warning(
                "Created new CollectionMetadata from blank template for %d "
                "movies", n_new_collections)
        if args.well_sample_name:
            force_set_all_well_sample_names(dset, args.well_sample_name)
        if args.bio_sample_name:
            force_set_all_bio_sample_names(dset, args.bio_sample_name)
    log.debug("Dataset created")
    if isinstance(dset, ContigSet):
        if args.organism:
            dset.metadata.organism = args.organism
        if args.ploidy:
            dset.metadata.ploidy = args.ploidy
    dset.newUuid()
    if args.no_sub_datasets:
        dset.subdatasets = []
    if args.unique_collections:
        uniqueify_collections(dset.metadata)
    dset.write(args.outfile, validate=args.novalidate, relPaths=args.relative)
    log.debug("Dataset written")
    return 0

def test_referenceInfoTableMerging(self):
    log.info("Testing refIds, etc. after merging")
    ds = DataSet(data.getXml(17))
    also_lambda = ds.toExternalFiles()[0]
    aln = AlignmentSet(data.getBam(0), data.getBam(0), also_lambda)
    readers = aln.resourceReaders()

    ids = sorted([i for _, i in aln.refInfo('ID')])
    self.assertEqual(range(len(ids)), ids)

    accNames = aln.refNames
    expNames = reduce(np.append,
                      [reader.referenceInfoTable['Name']
                       for reader in readers])
    expNames = np.unique(expNames)
    self.assertEqual(sorted(expNames), sorted(accNames))

    accNames = aln.fullRefNames
    expNames = reduce(np.append,
                      [reader.referenceInfoTable['FullName']
                       for reader in readers])
    expNames = np.unique(expNames)
    self.assertEqual(sorted(expNames), sorted(accNames))

def test_staggered_reads_in_range(self):
    ds = DataSet(data.getXml(8))
    refNames = ds.refNames

    rn = 'B.vulgatus.5'
    reads = list(ds.readsInRange(rn, 0, 10000))
    ds2 = DataSet(data.getXml(11))
    reads2 = list(ds2.readsInRange(rn, 0, 10000))
    dsBoth = DataSet(data.getXml(8), data.getXml(11))
    readsBoth = list(dsBoth.readsInRange(rn, 0, 10000))
    self.assertEqual(len(reads), 2)
    self.assertEqual(len(reads2), 5)
    self.assertEqual(len(readsBoth), 7)
    read_starts = (0, 1053)
    for read, start in zip(reads, read_starts):
        self.assertEqual(read.tStart, start)
    read2_starts = (0, 0, 3, 3, 4)
    for read, start in zip(reads2, read2_starts):
        self.assertEqual(read.tStart, start)
    readboth_starts = (0, 0, 0, 3, 3, 4, 1053)
    for read, start in zip(readsBoth, readboth_starts):
        self.assertEqual(read.tStart, start)

def test_checkInputFile(self):
    """Test checkInputFile()."""
    fastaFN = path.join(self.rootDir, "data/ecoli.fasta")
    plsFN = self.dataDir + \
        "m121215_065521_richard_c100425710150000001823055001121371_s1_p0.pls.h5"
    self.assertTrue(filecmp.cmp(fastaFN, checkInputFile(fastaFN)))
    self.assertTrue(filecmp.cmp(plsFN, checkInputFile(plsFN)))

    fofnFN = path.join(self.rootDir, "data/ecoli_lp.fofn")
    self.assertTrue(filecmp.cmp(fofnFN, checkInputFile(fofnFN)))

    xmlFN = path.join(self.rootDir, "data/subreads_dataset1.xml")
    ret = checkInputFile(xmlFN)
    self.assertTrue(ret.endswith('.xml'))
    fs = DataSet(ret).toExternalFiles()
    self.assertTrue(fs[0].endswith(
        "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.1.subreads.bam"))
    self.assertTrue(fs[1].endswith(
        "m130406_011850_42141_c100513442550000001823074308221310_s1_p0.1.subreads.bam"))

def _get_dataset_uuid_or_create_uuid(path):
    """
    Extract the uuid from the DataSet or assign a new UUID

    :param path: Path to file
    :rtype: str
    :return: uuid string
    """
    try:
        ds = DataSet(path)
        ds_id = ds.uuid
        # make sure it's a valid uuid
        _ = uuid.UUID(ds_id)
    except ValueError as e:
        log.error("DataSet {p} uuid is malformed. {e}".format(e=e, p=path))
        ds_id = uuid.uuid4()
    except Exception:
        # not a DataSet file
        ds_id = uuid.uuid4()
    return ds_id

def createXml(args):
    if args.dsType is None:
        dset = openDataFile(*args.infile, strict=args.strict,
                            skipCounts=args.skipCounts,
                            generateIndices=args.generateIndices)
    else:
        dsTypes = DataSet.castableTypes()
        dset = dsTypes[args.dsType](*args.infile,
                                    strict=args.strict,
                                    skipCounts=args.skipCounts,
                                    generateIndices=args.generateIndices)
    if args.generateIndices:
        # we generated the indices with the last open, lets capture them with
        # this one:
        dset = dsTypes[args.dsType](*args.infile,
                                    strict=args.strict,
                                    skipCounts=args.skipCounts)
    if args.dsName != '':
        dset.name = args.dsName
    if args.metadata:
        dset.loadMetadata(args.metadata)
    log.debug("Dataset created")
    dset.write(args.outfile, validate=args.novalidate,
               relPaths=args.relative)
    log.debug("Dataset written")
    return 0

def test_reads_in_contig(self):
    log.info("Testing reads in contigs")
    ds = DataSet(data.getXml(8))
    dss = ds.split(contigs=True)
    self.assertEqual(len(dss), 12)
    efaec1TimesFound = 0
    efaec1TotFound = 0
    efaec2TimesFound = 0
    efaec2TotFound = 0
    for ds in dss:
        ef1 = len(list(ds.readsInReference('E.faecalis.1')))
        ef2 = len(list(ds.readsInReference('E.faecalis.2')))
        if ef1:
            efaec1TimesFound += 1
            efaec1TotFound += ef1
        if ef2:
            efaec2TimesFound += 1
            efaec2TotFound += ef2
    self.assertEqual(efaec1TimesFound, 1)
    self.assertEqual(efaec1TotFound, 20)
    self.assertEqual(efaec2TimesFound, 1)
    self.assertEqual(efaec2TotFound, 3)

    ds = DataSet(data.getXml(8))
    filt = Filters()
    filt.addRequirement(length=[('>', '100')])
    ds.addFilters(filt)
    dss = ds.split(contigs=True)
    self.assertEqual(len(dss), 12)
    efaec1TimesFound = 0
    efaec1TotFound = 0
    efaec2TimesFound = 0
    efaec2TotFound = 0
    for ds in dss:
        ef1 = len(list(ds.readsInReference('E.faecalis.1')))
        ef2 = len(list(ds.readsInReference('E.faecalis.2')))
        if ef1:
            efaec1TimesFound += 1
            efaec1TotFound += ef1
        if ef2:
            efaec2TimesFound += 1
            efaec2TotFound += ef2
    self.assertEqual(efaec1TimesFound, 1)
    self.assertEqual(efaec1TotFound, 20)
    self.assertEqual(efaec2TimesFound, 1)
    self.assertEqual(efaec2TotFound, 3)

    ds = DataSet(data.getXml(8))
    filt = Filters()
    filt.addRequirement(length=[('>', '1000')])
    ds.addFilters(filt)
    dss = ds.split(contigs=True)
    self.assertEqual(len(dss), 9)
    efaec1TimesFound = 0
    efaec1TotFound = 0
    efaec2TimesFound = 0
    efaec2TotFound = 0
    for ds in dss:
        ef1 = len(list(ds.readsInReference('E.faecalis.1')))
        ef2 = len(list(ds.readsInReference('E.faecalis.2')))
        if ef1:
            efaec1TimesFound += 1
            efaec1TotFound += ef1
        if ef2:
            efaec2TimesFound += 1
            efaec2TotFound += ef2
    self.assertEqual(efaec1TimesFound, 1)
    self.assertEqual(efaec1TotFound, 20)
    self.assertEqual(efaec2TimesFound, 1)
    self.assertEqual(efaec2TotFound, 1)

def to_report(stats_xml, output_dir, dpi=72):
    """Main point of entry

    :type stats_xml: str
    :type output_dir: str
    :type dpi: int
    :rtype: Report
    """
    log.info("Analyzing XML {f}".format(f=stats_xml))
    # stats_xml should be a dataset:
    dset = DataSet(stats_xml)
    dataset_uuids = [dset.uuid]
    # but if it isn't, no problem:
    if not dset.metadata.summaryStats:
        dset.loadStats(stats_xml)
        # an sts file was provided which will generate a new random uuid
        dataset_uuids = []
    if not dset.metadata.summaryStats.readLenDists:
        raise RuntimeError("No Pipeline Summary Stats (sts.xml) found")

    # Build the stats table:
    nbases = 0
    nreads = 0
    n50 = 0
    readscoretotal = 0
    readscorenumber = 0
    approx_read_lens = []

    # if a merge failed there may be more than one dist:
    for rlendist in dset.metadata.summaryStats.readLenDists:
        nbases += _total_from_bins(rlendist.bins,
                                   rlendist.minBinValue,
                                   rlendist.binWidth)
        nreads += sum(rlendist.bins)

        # N50:
        for i, lbin in enumerate(rlendist.bins):
            # use the average, except for the last bin
            if i != len(rlendist.bins) - 1:
                value = ((i * rlendist.binWidth) + rlendist.minBinValue +
                         rlendist.binWidth / 2)
            # for the last bin, just use the value
            else:
                value = (i * rlendist.binWidth) + rlendist.minBinValue
            approx_read_lens.extend([value] * lbin)
            # TODO(mdsmith)(2016-02-09) make sure maxOutlierValue is updated
            # during a merge /todo
            # but pop off that last value and replace it with the
            # maxOutlierValue:
            # approx_read_lens.pop()
            # approx_read_lens.append(rlendist.maxBinValue)
    n50 = np.round(compute_n50(approx_read_lens))

    for rqualdist in dset.metadata.summaryStats.readQualDists:
        readscoretotal += _total_from_bins(rqualdist.bins,
                                           rqualdist.minBinValue,
                                           rqualdist.binWidth)
        readscorenumber += sum(rqualdist.bins)

    readlen = 0
    if nreads != 0:
        readlen = np.round(nbases / nreads, decimals=2)
    readQuality = 0
    if readscorenumber != 0:
        readQuality = np.round(readscoretotal / readscorenumber, decimals=2)
    row_names = ["Polymerase Read Bases",
                 "Polymerase Reads",
                 "Polymerase Read N50",
                 "Polymerase Read Length",
                 "Polymerase Read Quality"]
    _pre_filter = [np.round(nbases, decimals=2),
                   nreads,
                   n50,
                   readlen,
                   readQuality]

    plots = []
    # ReadLen distribution to barplot:
    for i, rlendist in enumerate(dset.metadata.summaryStats.readLenDists):
        len_fig, len_axes = get_fig_axes_lpr()
        len_axes.bar(rlendist.labels, rlendist.bins,
                     color=get_green(0), edgecolor=get_green(0),
                     width=(rlendist.binWidth * 0.75))
        len_axes.set_xlabel('Read Length')
        len_axes.set_ylabel('Reads')
        png_fn = os.path.join(output_dir, "readLenDist{i}.png".format(i=i))
        png_base, thumbnail_base = save_figure_with_thumbnail(len_fig, png_fn,
                                                              dpi=dpi)
        plots.append(Plot("filter_len_xml_plot_{i}".format(i=i),
                          os.path.relpath(png_base, output_dir),
                          thumbnail=os.path.relpath(thumbnail_base,
                                                    output_dir)))
    plot_groups = [PlotGroup("filter_len_xml_plot_group",
                             title="Polymerase Read Length",
                             plots=plots,
                             thumbnail=os.path.relpath(thumbnail_base,
                                                       output_dir))]

    plots = []
    # ReadQual distribution to barplot:
    for i, rqualdist in enumerate(dset.metadata.summaryStats.readQualDists):
        qual_fig, qual_axes = get_fig_axes_lpr()
        qual_axes.bar(rqualdist.labels, rqualdist.bins,
                      color=get_green(0), edgecolor=get_green(0),
                      width=(rqualdist.binWidth * 0.75))
        qual_axes.set_xlabel('Read Quality')
        qual_axes.set_ylabel('Reads')
        png_fn = os.path.join(output_dir, "readQualDist{i}.png".format(i=i))
        png_base, thumbnail_base = save_figure_with_thumbnail(qual_fig, png_fn,
                                                              dpi=dpi)
        plots.append(Plot("filter_qual_xml_plot_{i}".format(i=i),
                          os.path.relpath(png_base, output_dir),
                          thumbnail=os.path.relpath(thumbnail_base,
                                                    output_dir)))
    plot_groups.append(PlotGroup("filter_qual_xml_plot_group",
                                 title="Polymerase Read Quality",
                                 plots=plots))

    # build the report:
    columns = [Column("filter_names_column", header="Metrics",
                      values=row_names)]
    columns.append(Column("filter_stats_column", header="Values",
                          values=_pre_filter))
    tables = [Table("filter_xml_table", "Filtering Statistics", columns)]
    report = Report("filtering_stats_xml_report",
                    title="Filtering stats XML report",
                    tables=tables,
                    attributes=None,
                    plotgroups=plot_groups,
                    dataset_uuids=dataset_uuids)
    return report

def mergeXml(args):
    dss = []
    for infn in args.infiles:
        dss.append(DataSet(infn, strict=args.strict))
    reduce(lambda ds1, ds2: ds1 + ds2, dss).write(args.outfile)

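# A minimal invocation sketch for mergeXml above, assuming the same
# argparse-style namespace used by the other CLI helpers here (infiles,
# outfile, strict); the XML paths are placeholders, not real test data.
from argparse import Namespace

_example_merge_args = Namespace(infiles=["chunk1.alignmentset.xml",
                                         "chunk2.alignmentset.xml"],
                                outfile="merged.alignmentset.xml",
                                strict=False)
mergeXml(_example_merge_args)
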
def createXml(args):
    dsTypes = DataSet.castableTypes()
    dset = dsTypes[args.dsType](*args.infile)
    log.debug("Dataset created")
    dset.write(args.outfile, validate=args.novalidate,
               relPaths=args.relative)
    log.debug("Dataset written")

def test_stats_metadata(self):
    ds = DataSet(data.getBam())
    ds.loadStats(data.getStats())
    self.assertEqual(ds.metadata.summaryStats.prodDist.numBins, 4)
    self.assertEqual(ds.metadata.summaryStats.prodDist.bins,
                     [1576, 901, 399, 0])
    ds1 = DataSet(data.getXml(8))
    ds1.loadStats(data.getStats())
    ds2 = DataSet(data.getXml(11))
    ds2.loadStats(data.getStats())
    ds3 = ds1 + ds2
    self.assertEqual(ds1.metadata.summaryStats.prodDist.bins,
                     [1576, 901, 399, 0])
    self.assertEqual(ds2.metadata.summaryStats.prodDist.bins,
                     [1576, 901, 399, 0])
    self.assertEqual(ds3.metadata.summaryStats.prodDist.bins,
                     [3152, 1802, 798, 0])
    self.assertEqual(ds1.metadata.summaryStats.readLenDist.bins,
                     [0, 62, 39, 36, 29, 37, 19, 29, 37, 32, 32, 40, 45, 54,
                      73, 77, 97, 95, 49, 17, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                      0, 0])
    self.assertEqual(ds2.metadata.summaryStats.readLenDist.bins,
                     [0, 62, 39, 36, 29, 37, 19, 29, 37, 32, 32, 40, 45, 54,
                      73, 77, 97, 95, 49, 17, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                      0, 0])
    self.assertEqual(ds3.metadata.summaryStats.readLenDist.bins,
                     [0, 124, 78, 72, 58, 74, 38, 58, 74, 64, 64, 80, 90,
                      108, 146, 154, 194, 190, 98, 34, 4, 0, 0, 0, 0, 0, 0,
                      0, 0, 0, 0, 0])
    # Lets check some manual values
    ds1 = DataSet(data.getXml(8))
    ds1.loadStats(data.getStats())
    ds2 = DataSet(data.getXml(11))
    ds2.loadStats(data.getStats())
    ds1.metadata.summaryStats.readLenDist.bins = (
        [0, 10, 9, 8, 7, 6, 4, 2, 1, 0, 0, 1])
    self.assertEqual(ds1.metadata.summaryStats.readLenDist.bins,
                     [0, 10, 9, 8, 7, 6, 4, 2, 1, 0, 0, 1])
    ds1.metadata.summaryStats.readLenDist.minBinValue = 10
    ds1.metadata.summaryStats.readLenDist.binWidth = 10
    ds2.metadata.summaryStats.readLenDist.bins = (
        [0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1])
    self.assertEqual(ds2.metadata.summaryStats.readLenDist.bins,
                     [0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1])
    ds2.metadata.summaryStats.readLenDist.minBinValue = 20
    ds2.metadata.summaryStats.readLenDist.binWidth = 10
    ds3 = ds1 + ds2
    self.assertEqual(ds3.metadata.summaryStats.readLenDist.bins,
                     [0, 10, 10, 9, 8, 7, 5, 3, 2, 1, 0, 1, 1])
    # now lets swap
    ds1 = DataSet(data.getXml(8))
    ds1.loadStats(data.getStats())
    ds2 = DataSet(data.getXml(11))
    ds2.loadStats(data.getStats())
    ds1.metadata.summaryStats.readLenDist.bins = (
        [0, 10, 9, 8, 7, 6, 4, 2, 1, 0, 0, 1])
    self.assertEqual(ds1.metadata.summaryStats.readLenDist.bins,
                     [0, 10, 9, 8, 7, 6, 4, 2, 1, 0, 0, 1])
    ds1.metadata.summaryStats.readLenDist.minBinValue = 20
    ds1.metadata.summaryStats.readLenDist.binWidth = 10
    ds2.metadata.summaryStats.readLenDist.bins = (
        [0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1])
    self.assertEqual(ds2.metadata.summaryStats.readLenDist.bins,
                     [0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1])
    ds2.metadata.summaryStats.readLenDist.minBinValue = 10
    ds2.metadata.summaryStats.readLenDist.binWidth = 10
    ds3 = ds1 + ds2
    self.assertEqual(ds3.metadata.summaryStats.readLenDist.bins,
                     [0, 1, 11, 10, 9, 8, 7, 5, 3, 1, 0, 1, 1])
    # now lets do some non-overlapping
    ds1 = DataSet(data.getXml(8))
    ds1.loadStats(data.getStats())
    ds2 = DataSet(data.getXml(11))
    ds2.loadStats(data.getStats())
    ds1.metadata.summaryStats.readLenDist.bins = (
        [1, 1, 1])
    self.assertEqual(ds1.metadata.summaryStats.readLenDist.bins,
                     [1, 1, 1])
    ds1.metadata.summaryStats.readLenDist.minBinValue = 10
    ds1.metadata.summaryStats.readLenDist.binWidth = 10
    ds2.metadata.summaryStats.readLenDist.bins = (
        [2, 2, 2])
    self.assertEqual(ds2.metadata.summaryStats.readLenDist.bins,
                     [2, 2, 2])
    ds2.metadata.summaryStats.readLenDist.minBinValue = 50
    ds2.metadata.summaryStats.readLenDist.binWidth = 10
    ds3 = ds1 + ds2
    self.assertEqual(ds3.metadata.summaryStats.readLenDist.bins,
                     [1, 1, 1, 0, 2, 2, 2])

    # now lets test the subdataset metadata retention:
    ss = SubreadSet(data.getXml(10))
    ss.loadStats(data.getStats(0))
    ss.loadStats(data.getStats(1))
    self.assertEqual(153168.0, ss.metadata.summaryStats.numSequencingZmws)
    self.assertEqual(
        2876.0, ss.subdatasets[0].metadata.summaryStats.numSequencingZmws)
    self.assertEqual(
        150292.0, ss.subdatasets[1].metadata.summaryStats.numSequencingZmws)