Example No. 1
    def test_referenceset_contigs(self):
        names = [
            'A.baumannii.1', 'A.odontolyticus.1', 'B.cereus.1', 'B.cereus.2',
            'B.cereus.4', 'B.cereus.6', 'B.vulgatus.1', 'B.vulgatus.2',
            'B.vulgatus.3', 'B.vulgatus.4', 'B.vulgatus.5', 'C.beijerinckii.1',
            'C.beijerinckii.2', 'C.beijerinckii.3', 'C.beijerinckii.4',
            'C.beijerinckii.5', 'C.beijerinckii.6', 'C.beijerinckii.7',
            'C.beijerinckii.8', 'C.beijerinckii.9', 'C.beijerinckii.10',
            'C.beijerinckii.11', 'C.beijerinckii.12', 'C.beijerinckii.13',
            'C.beijerinckii.14', 'D.radiodurans.1', 'D.radiodurans.2',
            'E.faecalis.1', 'E.faecalis.2', 'E.coli.1', 'E.coli.2', 'E.coli.4',
            'E.coli.5', 'E.coli.6', 'E.coli.7', 'H.pylori.1', 'L.gasseri.1',
            'L.monocytogenes.1', 'L.monocytogenes.2', 'L.monocytogenes.3',
            'L.monocytogenes.5', 'N.meningitidis.1', 'P.acnes.1',
            'P.aeruginosa.1', 'P.aeruginosa.2', 'R.sphaeroides.1',
            'R.sphaeroides.3', 'S.aureus.1', 'S.aureus.4', 'S.aureus.5',
            'S.epidermidis.1', 'S.epidermidis.2', 'S.epidermidis.3',
            'S.epidermidis.4', 'S.epidermidis.5', 'S.agalactiae.1',
            'S.mutans.1', 'S.mutans.2', 'S.pneumoniae.1']
        seqlens = [1458, 1462, 1472, 1473, 1472, 1472, 1449, 1449, 1449, 1449,
                   1449, 1433, 1433, 1433, 1433, 1433, 1433, 1433, 1433, 1433,
                   1433, 1433, 1433, 1433, 1433, 1423, 1423, 1482, 1482, 1463,
                   1463, 1463, 1463, 1463, 1463, 1424, 1494, 1471, 1471, 1471,
                   1471, 1462, 1446, 1457, 1457, 1386, 1388, 1473, 1473, 1473,
                   1472, 1472, 1472, 1472, 1472, 1470, 1478, 1478, 1467]
        ds = ReferenceSet(data.getXml(9))
        log.debug([contig.id for contig in ds])
        for contig, name, seqlen in zip(ds.contigs, names, seqlens):
            self.assertEqual(contig.id, name)
            self.assertEqual(len(contig.sequence), seqlen)

        for name in names:
            self.assertTrue(ds.get_contig(name))
Example No. 2
def loadFromFile(filename_, cmpH5):
    """
    Reads reference from FASTA file, loading
    lookup tables that can be used any time later.
    """
    # Contigs in FASTA may disagree with those in cmp.h5 ref info
    # table, for instance if the FASTA has been edited.  Here's how we
    # handle things:
    #
    # |fastaContigs \   cmpContigs| > 0 : OK, extra FASTA contigs just ignored
    # |cmpContigs   \ fastaContigs| > 0 : Not necessarily OK---a warning should be
    #                                     issued.  We then proceed to operate on
    #                                     the contigs that are in both.
    # |cmpContigs ^ fastaContigs| == 0  : Nothing to work with.  This is an error.
    #
    # While we formerly used MD5s to vouch for the identity of a
    # contig, we now use the name.  This is an inferior approach, but it
    # is necessary when using the FastaTable.

    # Load contigs
    assert not isLoaded()
    try:
        f = ReferenceSet(filename_)
        f.assertIndexed()
    except IOError as e:
        die(e)

    cmpContigNames = set(cmpH5.refNames)

    for fastaRecord in f.contigs:
        refName = fastaRecord.id
        if refName in cmpContigNames:
            cmpH5RefEntry = cmpH5.referenceInfo(refName)
            refId = cmpH5RefEntry.ID
            pacBioName = cmpH5RefEntry.Name
            refFullName = cmpH5RefEntry.FullName
            sequence = UppercasingMmappedFastaSequence(fastaRecord.sequence)
            length = len(fastaRecord.sequence)
            contig = ReferenceContig(refId, refName, refFullName, sequence, length)
            byId[refId] = contig
            byName[refName] = contig
            byPacBioName[pacBioName] = contig
    loadedFastaContigNames = set(byName.keys())
    logging.info("Loaded %d of %d reference groups from %s " % (len(byName), len(loadedFastaContigNames), filename_))

    if len(byName) == 0:
        die(
            "No reference groups in the FASTA file were aligned against.  "
            "Did you select the wrong reference FASTA file?"
        )
    elif cmpContigNames - loadedFastaContigNames:
        logging.warn(
            "Some reference contigs aligned against are not found in "
            "the reference FASTA.  Will process only those contigs "
            "supported by the reference FASTA."
        )

    global filename
    filename = filename_
    assert isLoaded()
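The comment block at the top of loadFromFile() encodes a small set-difference policy for reconciling FASTA contigs with the contigs recorded in the alignment file. Below is a minimal sketch of that policy in plain Python sets; the names and values are illustrative, not part of the module.

import logging

# Sketch of the reconciliation policy described above (names are illustrative).
fastaContigNames = {"chr1", "chr2", "chr3"}   # contigs present in the FASTA
cmpContigNames = {"chr2", "chr3", "chr4"}     # contigs aligned against

if not (fastaContigNames & cmpContigNames):
    raise ValueError("No contigs in common -- nothing to work with")
if cmpContigNames - fastaContigNames:
    # aligned-against contigs missing from the FASTA: warn, then proceed on the overlap
    logging.warning("Some aligned contigs are absent from the reference FASTA")
# Extra FASTA contigs (fastaContigNames - cmpContigNames) are simply ignored.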
    def test_exit_code_0_referenceset(self):
        """
        Like a cram test. Assert exits with 0 with ReferenceSet XML
        """

        ref = os.path.join(self._data_dir, 'references', 'lambda', 'sequence',
                           'lambda.fasta')
        ref_name = os.path.join(self._output_dir, "refset.xml")
        refset = ReferenceSet(ref)
        refset.write(ref_name)
        ref = ref_name
        gff = os.path.join(self._data_dir, 'alignment_summary.lambda.gff')
        r = 'rpt.json'
        cmd = 'python -m pbreports.report.coverage {o} {r} {c} {g}'.format(o=self._output_dir,
                                                            r=r,
                                                            c=ref, g=gff)

        log.info(cmd)
        o, c, m = backticks(cmd)

        if c != 0:
            log.error(m)
            log.error(o)
            sys.stderr.write(str(m) + "\n")

        self.assertEquals(0, c)
        self.assertTrue(os.path.exists(os.path.join(self._output_dir, r)))
Example No. 4
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input')
    parser.add_argument('-r', '--ref')
    parser.add_argument('-v', dest='verbose', action='store_true')
    args = parser.parse_args()
    if not args.input:
        usage()
        sys.exit(2)

    with AlignmentSet(args.input) as peekCmpH5:
        print "test"
        print peekCmpH5
        logging.info("Peeking at CmpH5 file %s" % (args.input))
        logging.info("Input CmpH5 data: numAlnHits=%d" % len(peekCmpH5))
        cmpContigNames = set(peekCmpH5.refNames)
        print cmpContigNames
        reference.loadFromFile(args.ref, peekCmpH5)
        
    f = ReferenceSet(args.ref)
    f.assertIndexed()
    for fastaRecord in f.contigs:
        refName = fastaRecord.id
        print refName
Example No. 5
def loadFromFile(filename_, alnFile):
    """
    Reads reference from FASTA file, loading
    lookup tables that can be used any time later.
    """
    # Contigs in FASTA may disagree with those in cmp.h5 ref info
    # table, for instance if the FASTA has been edited.  Here's how we
    # handle things:
    #
    # |fastaContigs \   cmpContigs| > 0 : OK, extra FASTA contigs just ignored
    # |cmpContigs   \ fastaContigs| > 0 : Not necessarily OK---a warning should be
    #                                     issued.  We then proceed to operate on
    #                                     the contigs that are in both.
    # |cmpContigs ^ fastaContigs| == 0  : Nothing to work with.  This is an error.
    #
    # While we formerly used MD5s to vouch for the identity of a
    # contig, we now use the name.  This is an inferior approach, but it
    # is necessary when using the FastaTable.

    # Load contigs
    assert not isLoaded()
    try:
        f = ReferenceSet(filename_)
        f.assertIndexed()
    except IOError as e:
        die(e)

    cmpContigNames = set(alnFile.refNames)

    for fastaRecord in f.contigs:
        refName = fastaRecord.id
        if refName in cmpContigNames:
            refEntry = alnFile.referenceInfo(refName)
            refId = refEntry.ID
            pacBioName = refEntry.Name
            refFullName = refEntry.FullName
            sequence = UppercasingMmappedFastaSequence(fastaRecord.sequence)
            length = len(fastaRecord.sequence)
            contig = ReferenceContig(refId, refName, refFullName, sequence,
                                     length)
            byId[refId] = contig
            byName[refName] = contig
            byPacBioName[pacBioName] = contig
    loadedFastaContigNames = set(byName.keys())
    logging.info("Loaded %d of %d reference groups from %s " %
                 (len(byName), len(loadedFastaContigNames), filename_))

    if len(byName) == 0:
        die("No reference groups in the FASTA file were aligned against.  " \
            "Did you select the wrong reference FASTA file?")
    elif (cmpContigNames - loadedFastaContigNames):
        logging.warn(
            "Some reference contigs aligned against are not found in " \
            "the reference FASTA.  Will process only those contigs "   \
            "supported by the reference FASTA.")

    global filename
    filename = filename_
    assert isLoaded()
Example No. 6
def loadReferences(fastaFilename, alnReader):
    # as of 3.0, quiver can be called with a "ReferenceSet" XML
    # instead of just a FASTA.  Let's just unwrap the underlying FASTA
    # file.  This code still works if a FASTA was provided.
    dset = ReferenceSet(fastaFilename)
    fastas = dset.toExternalFiles()
    assert len(fastas) == 1
    return IndexedFastaReader(fastas[0])
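A brief usage sketch for the helper above; the file name is illustrative, and the alignment reader argument is not used by the helper itself.

# Illustrative: unwrap the ReferenceSet's single FASTA and get a pbcore
# IndexedFastaReader over it (a plain FASTA path also works, per the comment above).
reader = loadReferences("lambda.referenceset.xml", alnReader=None)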
Example No. 7
 def test_contigset_split(self):
     ref = ReferenceSet(data.getXml(9))
     exp_n_contigs = len(ref)
     refs = ref.split(10)
     self.assertEqual(len(refs), 10)
     obs_n_contigs = 0
     for r in refs:
         obs_n_contigs += sum(1 for _ in r)
     self.assertEqual(obs_n_contigs, exp_n_contigs)
 def test_contigset_len(self):
     ref = ReferenceSet(data.getXml(9))
     exp_n_contigs = len(ref)
     refs = ref.split(10)
     self.assertEqual(len(refs), 10)
     obs_n_contigs = 0
     for r in refs:
         obs_n_contigs += len(r)
     self.assertEqual(obs_n_contigs, exp_n_contigs)
Example No. 9
 def test_contigset_len(self):
     ref = ReferenceSet(data.getXml(8))
     exp_n_contigs = len(ref)
     refs = ref.split(10)
     assert len(refs) == 10
     obs_n_contigs = 0
     for r in refs:
         obs_n_contigs += len(r)
     assert obs_n_contigs == exp_n_contigs
Example No. 10
 def test_exit_code_0_referenceset(self):
     """
     Like a cram test. Assert exits with 0 with ReferenceSet XML
     """
     ref_name = op.join(self._output_dir, "refset.xml")
     refset = ReferenceSet(self._get_reference_fasta())
     refset.write(ref_name)
     ref = ref_name
     cmd = 'python -m pbreports.report.variants {r} {c} {a} {v}'.format(
         r='rpt.json', c=ref, a=self.ALIGNMENT_SUMMARY,
         v=self.VARIANTS_GFF)
     rcode = run_backticks(cmd)
     self.assertEquals(0, rcode)
     self.assertTrue(op.exists("rpt.json"))
 def test_exit_code_0_referenceset(self):
     """
     Like a cram test. Assert exits with 0 with ReferenceSet XML
     """
     ref_name = op.join(self._output_dir, "refset.xml")
     refset = ReferenceSet(self._get_reference_fasta())
     refset.write(ref_name)
     ref = ref_name
     cmd = 'python -m pbreports.report.variants {o} {r} {c} {a} {v}'.format(
         o=self._output_dir, r='rpt.json', c=ref, a=self.ALIGNMENT_SUMMARY,
         v=self.VARIANTS_GFF)
     rcode = run_backticks(cmd)
     self.assertEquals(0, rcode)
     self.assertTrue(op.exists(op.join(self._output_dir, "rpt.json")))
Example No. 12
    def test_alignment_reference(self):
        rs1 = ReferenceSet(data.getXml(9))
        fasta_res = rs1.externalResources[0]
        fasta_file = urlparse(fasta_res.resourceId).path

        ds1 = AlignmentSet(data.getXml(8), referenceFastaFname=rs1)
        aln_ref = None
        for aln in ds1:
            aln_ref = aln.reference()
            break
        self.assertTrue(aln_ref is not None)

        ds1 = AlignmentSet(data.getXml(8), referenceFastaFname=fasta_file)
        aln_ref = None
        for aln in ds1:
            aln_ref = aln.reference()
            break
        self.assertTrue(aln_ref is not None)

        ds1 = AlignmentSet(data.getXml(8))
        ds1.addReference(fasta_file)
        aln_ref = None
        for aln in ds1:
            aln_ref = aln.reference()
            break
        self.assertTrue(aln_ref is not None)
Example No. 13
    def test_contigset_consolidate_int_names(self):
        # build set to merge
        outdir = tempfile.mkdtemp(suffix="dataset-unittest")

        inFas = os.path.join(outdir, 'infile.fasta')
        outFas1 = os.path.join(outdir, 'tempfile1.fasta')
        outFas2 = os.path.join(outdir, 'tempfile2.fasta')

        # copy fasta reference to hide fai and ensure FastaReader is used
        shutil.copyfile(
            ReferenceSet(data.getXml(8)).toExternalFiles()[0], inFas)
        rs1 = ContigSet(inFas)

        double = 'B.cereus.1'
        exp_double = rs1.get_contig(double)

        # todo: modify the names first:
        with FastaWriter(outFas1) as writer:
            writer.writeRecord('5141', exp_double.sequence)
        with FastaWriter(outFas2) as writer:
            writer.writeRecord('5142', exp_double.sequence)

        exp_double_seqs = [exp_double.sequence, exp_double.sequence]
        exp_names = ['5141', '5142']

        obs_file = ContigSet(outFas1, outFas2)
        log.debug(obs_file.toExternalFiles())
        obs_file.consolidate()
        log.debug(obs_file.toExternalFiles())

        # open obs and compare to exp
        for name, seq in zip(exp_names, exp_double_seqs):
            assert obs_file.get_contig(name).sequence[:] == seq
Example No. 14
def make_topvariants_report(gff, reference, how_many, batch_sort_size, report,
                            output_dir):
    """
    Entry to report.
    :param gff: (str) path to variants.gff (or rare_variants.gff). Note, could also be *.gz
    :param reference: (str) path to reference (ReferenceSet XML or FASTA)
    :param how_many: (int)
    :param batch_sort_size: (int)
    :param report: (str) report name
    :param output_dir: (str) output dir
    """
    _validate_inputs(gff, reference, how_many, batch_sort_size)

    table_builder = VariantTableBuilder()
    vf = VariantFinder(gff, reference, how_many, batch_sort_size)
    top = vf.find_top()
    for v in top:
        table_builder.add_variant(v)

    r = Report(Constants.R_ID,
               tables=[table_builder.table],
               dataset_uuids=(ReferenceSet(reference).uuid, ))
    r = spec.apply_view(r)
    r.write_json(os.path.join(output_dir, report))
    return 0
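An illustrative invocation based on the docstring above; all paths and values are made up for the sketch.

# Hypothetical call; the report JSON is written into output_dir.
make_topvariants_report(
    gff="variants.gff.gz",
    reference="reference.referenceset.xml",
    how_many=100,
    batch_sort_size=10000,
    report="top_variants_report.json",
    output_dir="/tmp/report_out")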
Example No. 15
def make_variants_report(aln_summ_gff, variants_gff, reference, max_contigs_to_plot, report, output_dir, dpi=72, dumpdata=True):
    """
    Entry to report.
    :param aln_summ_gff: (str) path to alignment_summary.gff
    :param variants_gff: (str) path to variants_gff
    :param reference: (str) path to reference (ReferenceSet XML or FASTA)
    :param max_contigs_to_plot: (int) max number of contigs to plot
    """
    _validate_inputs([('aln_summ_gff', aln_summ_gff),
                      ('variants_gff', variants_gff),
                      ('reference', reference)])

    # reference entry & top contigs
    ref = openReference(reference)
    top_contigs = get_top_contigs_from_ref_entry(ref, max_contigs_to_plot)

    # extract gff data from files
    ref_data, contig_variants = _extract_alignment_summ_data(
        aln_summ_gff, top_contigs)
    _append_variants_gff_data(ref_data, variants_gff)

    # make report objects
    table, atts = _get_consensus_table_and_attributes(ref_data, ref)
    plotgroup = _create_variants_plot_grp(
        top_contigs, contig_variants, output_dir)

    rpt = Report(Constants.R_ID,
                 plotgroups=[plotgroup],
                 attributes=atts,
                 tables=[table],
                 dataset_uuids=(ReferenceSet(reference).uuid,))

    rpt = spec.apply_view(rpt)
    rpt.write_json(os.path.join(output_dir, report))
    return rpt
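Likewise, an illustrative invocation of make_variants_report; paths are made up for the sketch.

# Hypothetical call; returns the Report object after writing it as JSON.
rpt = make_variants_report(
    aln_summ_gff="alignment_summary.gff",
    variants_gff="variants.gff",
    reference="reference.referenceset.xml",
    max_contigs_to_plot=25,
    report="variants_report.json",
    output_dir="/tmp/report_out")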
Example No. 16
    def loadReferenceContigs(referencePath, alignmentSet, windows=None):
        # FIXME we should get rid of this entirely, but I think it requires
        # fixing the inconsistency in how contigs are referenced here versus in
        # pbcore.io
        """
        Load the reference contigs, and tag each one with the ref.cmpH5ID it
        was assigned in the alignment file(s).  Return a list of contigs,
        which are used to set up IpdModel.
        """

        # Read contigs from FASTA file (or XML dataset)
        refReader = ReferenceSet(referencePath)
        contigs = []
        if windows is not None:
            refNames = set([rw.refName for rw in windows])
            for contig in refReader:
                if contig.id in refNames:
                    contigs.append(contig)
        else:
            contigs.extend([x for x in refReader])
        contigDict = dict([(x.id, x) for x in contigs])

        # initially each contig has an id of None -- this will be overwritten with the id from the cmp.h5, if there are any
        # reads mapped to it.
        for x in contigs:
            x.cmph5ID = None

        # Mark each contig with its ID from the cmp.h5 - match them up by full name
        for x in alignmentSet.referenceInfoTable:
            if x.FullName in contigDict:
                contigDict[x.FullName].cmph5ID = x.ID

        return contigs
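A minimal usage sketch, assuming the function is callable at module level and an AlignmentSet is already open; file names are illustrative.

from pbcore.io import AlignmentSet

# Illustrative: tag each reference contig with the ID it carries in the alignment file.
aln = AlignmentSet("aligned_reads.alignmentset.xml")
contigs = loadReferenceContigs("reference.referenceset.xml", alignmentSet=aln)
for c in contigs:
    print(c.id, c.cmph5ID)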
Example No. 17
 def setUpClass(cls):
     super(TestModificationsOutput, cls).setUpClass()
     datastore = DataStore.from_job_path(cls.job_dir)
     entrypoints = EntryPoints.from_job_path(cls.job_dir)
     cls.h5_file = None
     cls.bw_file = None
     cls.gff_file = None
     for file_id, file_info in datastore.get_file_dict().iteritems():
         if file_info.is_chunked:
             continue
         if file_info.file_type_id == FileTypes.GFF.file_type_id:
             with GffReader(file_info.path) as gff:
                 for header in gff.headers:
                     if header.startswith("##source ipdSummary"):
                         cls.gff_file = file_info.path
         elif file_info.file_type_id == FileTypes.H5.file_type_id:
             cls.h5_file = file_info.path
         elif file_info.file_type_id == FileTypes.BIGWIG.file_type_id:
             cls.bw_file = file_info.path
     with GffReader(cls.gff_file) as gff:
         cls.gff_records = [rec for rec in gff]
     cls.gff_dict = {}
     for rec in cls.gff_records:
         cls.gff_dict[(rec.seqid, rec.start, rec.strand)] = rec
     ref = entrypoints.data['eid_ref_dataset']
     cls.seqids = []
     with ReferenceSet(ref) as rs:
         for i_ref, ctg in enumerate(rs):
             cls.seqids.append(ctg.id)
Example No. 18
def run_fasta_to_reference(input_file_name,
                           output_file_name,
                           organism=None,
                           reference_name=None,
                           ploidy="haploid"):
    if reference_name is None or reference_name == "":
        reference_name = op.splitext(op.basename(input_file_name))[0]
    ds_in = ContigSet(input_file_name)
    if len(ds_in.externalResources) > 1:
        raise TypeError("Only a single FASTA file is supported as input.")
    fasta_file_name = ds_in.externalResources[0].resourceId
    output_dir_name = op.dirname(output_file_name)
    args = [
        "fasta-to-reference", "--organism",
        str(organism) if organism != "" else "unknown", "--ploidy",
        str(ploidy) if ploidy != "" else "unknown", "--debug", fasta_file_name,
        output_dir_name, reference_name
    ]
    log.info(" ".join(args))
    result = run_cmd(" ".join(args),
                     stdout_fh=sys.stdout,
                     stderr_fh=sys.stderr)
    if result.exit_code != 0:
        return result.exit_code
    ref_file = op.join(output_dir_name, reference_name, "referenceset.xml")
    assert op.isfile(ref_file)
    with ReferenceSet(ref_file, strict=True) as ds_ref:
        ds_ref.makePathsAbsolute()
        log.info("saving final ReferenceSet to {f}".format(f=output_file_name))
        ds_ref.write(output_file_name)
    return 0
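A hedged usage sketch for run_fasta_to_reference(); paths and metadata are illustrative, and the external fasta-to-reference executable must be on PATH.

# Illustrative: build a ReferenceSet XML from a ContigSet wrapping a single FASTA.
exit_code = run_fasta_to_reference(
    input_file_name="genome.contigset.xml",
    output_file_name="genome.referenceset.xml",
    organism="Escherichia coli",
    reference_name="ecoli_example",
    ploidy="haploid")
assert exit_code == 0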
    def test_exit_code_0_referenceset(self):
        """
        Like a cram test. Assert exits with 0 with ReferenceSet XML
        """
        ref = self._get_reference_fasta()
        ref_name = os.path.join(self._output_dir, "refset.xml")
        refset = ReferenceSet(ref)
        refset.write(ref_name)
        ref = ref_name
        j = 'rpt.json'
        cmd = 'python -m pbreports.report.top_variants {j} {g} {r}'.format(
            g=self.VARIANTS_GFF, r=ref, j=j)
        log.info(cmd)

        rcode = run_backticks(cmd)
        self.assertEquals(0, rcode)
Example No. 20
def run_fasta_to_reference(input_file_name, output_file_name,
                           organism, reference_name,
                           ploidy):
    """Copied from pbcoretools/tasks/converters.py:run_fasta_to_reference()
    """
    ds_in = ContigSet(input_file_name)
    if len(ds_in.externalResources) > 1:
        raise TypeError("Only a single FASTA file is supported as input.")
    fasta_file_name = ds_in.externalResources[0].resourceId
    output_dir_name = op.dirname(output_file_name)
    args = [
        "fasta-to-reference",
        "--organism", organism,
        "--ploidy", ploidy,
        "--debug",
        fasta_file_name,
        output_dir_name,
        reference_name
    ]
    log.info(" ".join(args))
    system(" ".join(args))
    ref_file = op.join(output_dir_name, reference_name, "referenceset.xml")
    assert op.isfile(ref_file)
    with ReferenceSet(ref_file, strict=True) as ds_ref:
        ds_ref.makePathsAbsolute()
        log.info("saving final ReferenceSet to {f!r}".format(f=output_file_name))
        ds_ref.write(output_file_name)
Example No. 21
    def test_len(self):
        # AlignmentSet
        aln = AlignmentSet(data.getXml(7), strict=True)
        assert len(aln) == 92
        assert aln._length == (92, 123588)
        assert aln.totalLength == 123588
        assert aln.numRecords == 92
        aln.totalLength = -1
        aln.numRecords = -1
        assert aln.totalLength == -1
        assert aln.numRecords == -1
        aln.updateCounts()
        assert aln.totalLength == 123588
        assert aln.numRecords == 92
        assert sum(1 for _ in aln) == 92
        assert sum(len(rec) for rec in aln) == 123588

        # AlignmentSet with filters
        aln = AlignmentSet(data.getXml(14), strict=True)
        assert len(aln) == 40
        assert aln._length == (40, 52023)
        assert aln.totalLength == 52023
        assert aln.numRecords == 40
        aln.totalLength = -1
        aln.numRecords = -1
        assert aln.totalLength == -1
        assert aln.numRecords == -1
        aln.updateCounts()
        assert aln.totalLength == 52023
        assert aln.numRecords == 40

        # SubreadSet
        sset = SubreadSet(data.getXml(9), strict=True)
        assert len(sset) == 92
        assert sset._length == (92, 124093)
        assert sset.totalLength == 124093
        assert sset.numRecords == 92
        sset.totalLength = -1
        sset.numRecords = -1
        assert sset.totalLength == -1
        assert sset.numRecords == -1
        sset.updateCounts()
        assert sset.totalLength == 124093
        assert sset.numRecords == 92
        assert sum(1 for _ in sset) == 92
        assert sum(len(rec) for rec in sset) == 124093

        # ReferenceSet
        sset = ReferenceSet(data.getXml(8), strict=True)
        assert len(sset) == 59
        assert sset.totalLength == 85774
        assert sset.numRecords == 59
        sset.totalLength = -1
        sset.numRecords = -1
        assert sset.totalLength == -1
        assert sset.numRecords == -1
        sset.updateCounts()
        assert sset.totalLength == 85774
        assert sset.numRecords == 59
Example No. 22
 def test_getitem(self):
     types = [
         AlignmentSet(data.getXml(7)),
         ReferenceSet(data.getXml(8)),
         SubreadSet(data.getXml(9)),
     ]
     for ds in types:
         assert ds[0]
    def test_exit_code_0_referenceset(self):
        """
        Like a cram test. Assert exits with 0 with ReferenceSet XML
        """
        ref = self._get_reference_fasta()
        ref_name = os.path.join(self._output_dir, "refset.xml")
        refset = ReferenceSet(ref)
        refset.write(ref_name)
        ref = ref_name
        j = 'rpt.json'
        cmd = 'python -m pbreports.report.top_variants {o} {j} {g} {r}'.format(
            o=self._output_dir,
            g=self.VARIANTS_GFF, r=ref, j=j)
        log.info(cmd)

        rcode = run_backticks(cmd)
        self.assertEquals(0, rcode)
        self.assertTrue(os.path.exists(os.path.join(self._output_dir, j)))
Example No. 24
    def test_contigset_consolidate(self):
        # build set to merge
        outdir = tempfile.mkdtemp(suffix="dataset-unittest")

        inFas = os.path.join(outdir, 'infile.fasta')
        outFas1 = os.path.join(outdir, 'tempfile1.fasta')
        outFas2 = os.path.join(outdir, 'tempfile2.fasta')

        # copy fasta reference to hide fai and ensure FastaReader is used
        backticks('cp {i} {o}'.format(i=ReferenceSet(
            data.getXml(9)).toExternalFiles()[0],
                                      o=inFas))
        rs1 = ContigSet(inFas)

        singletons = ['A.baumannii.1', 'A.odontolyticus.1']
        double = 'B.cereus.1'
        reader = rs1.resourceReaders()[0]
        exp_double = rs1.get_contig(double)
        exp_singles = [rs1.get_contig(name) for name in singletons]

        # todo: modify the names first:
        with FastaWriter(outFas1) as writer:
            writer.writeRecord(exp_singles[0])
            writer.writeRecord(exp_double.name + '_10_20', exp_double.sequence)
        with FastaWriter(outFas2) as writer:
            writer.writeRecord(exp_double.name + '_0_10',
                               exp_double.sequence + 'ATCGATCGATCG')
            writer.writeRecord(exp_singles[1])

        exp_double_seq = ''.join(
            [exp_double.sequence, 'ATCGATCGATCG', exp_double.sequence])
        exp_single_seqs = [rec.sequence for rec in exp_singles]

        acc_file = ContigSet(outFas1, outFas2)
        acc_file.induceIndices()
        log.debug(acc_file.toExternalFiles())
        self.assertEqual(len(acc_file), 4)
        self.assertEqual(len(list(acc_file)), 4)
        acc_file.consolidate()
        log.debug(acc_file.toExternalFiles())

        # open acc and compare to exp
        for name, seq in zip(singletons, exp_single_seqs):
            self.assertEqual(acc_file.get_contig(name).sequence[:], seq)
        self.assertEqual(
            acc_file.get_contig(double).sequence[:], exp_double_seq)

        self.assertEqual(len(acc_file._openReaders), 1)
        self.assertEqual(len(acc_file.index), 3)
        self.assertEqual(len(acc_file._indexMap), 3)
        self.assertEqual(len(acc_file), 3)
        self.assertEqual(len(list(acc_file)), 3)

        # test merge:
        acc1 = ContigSet(outFas1)
        acc2 = ContigSet(outFas2)
        acc3 = acc1 + acc2
Example No. 25
def openReference(fname):
    """ Take a ReferenceSet, fasta or reference dir path and return a
    referenceSet.
    """
    if os.path.isdir(fname):
        raise ValueError("{r} is a directory, not a ReferenceSet".format(
                         r=fname))
    ref = ReferenceSet(fname)
    return ref
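A short usage sketch for openReference(); the path is illustrative, and directory inputs are rejected by the check above.

# Illustrative: both a ReferenceSet XML and a plain FASTA path are accepted.
ref = openReference("lambda.referenceset.xml")
for contig in ref:
    print(contig.id)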
 def test_exit_code_0_referenceset(self):
     """
     Like a cram test. Assert exits with 0 with ReferenceSet XML
     """
     ref_name = op.join(self._output_dir, "refset.xml")
     refset = ReferenceSet(self.REFERENCE)
     refset.write(ref_name)
     ref = ref_name
     r = op.join(self._output_dir, 'rpt.json')
     cmd = 'python -m pbreports.report.coverage {c} {g} {r}'.format(
         r=r, c=ref, g=self.GFF)
     log.info(cmd)
     o, c, m = backticks(cmd)
      if c != 0:
         log.error(m)
         log.error(o)
         sys.stderr.write(str(m) + "\n")
     self.assertEquals(0, c)
     self.assertTrue(op.exists(r))
 def test_exit_code_0_referenceset(self):
     """
     Like a cram test. Assert exits with 0 with ReferenceSet XML
     """
     ref_name = op.join(self._output_dir, "refset.xml")
     refset = ReferenceSet(self.REFERENCE)
     refset.write(ref_name)
     ref = ref_name
     r = op.join(self._output_dir, 'rpt.json')
     cmd = 'python -m pbreports.report.coverage {c} {g} {r}'.format(
         r=r, c=ref, g=self.GFF)
     log.info(cmd)
     o, c, m = backticks(cmd)
      if c != 0:
         log.error(m)
         log.error(o)
         sys.stderr.write(str(m) + "\n")
     self.assertEquals(0, c)
     self.assertTrue(op.exists(r))
Example No. 28
 def test_file_factory(self):
     # TODO: add ConsensusReadSet, cmp.h5 alignmentSet
     types = [
         AlignmentSet(data.getXml(7)),
         ReferenceSet(data.getXml(8)),
         SubreadSet(data.getXml(9))
     ]
     for ds in types:
         mystery = openDataFile(ds.toExternalFiles()[0])
         assert type(mystery) == type(ds)
Example No. 29
 def test_create_cli_reference_fasta(self):
     tmp_dir = tempfile.mkdtemp(suffix="dataset-unittest")
     fasta = os.path.join(tmp_dir, "reference.fasta")
     with open(fasta, "w") as fasta_out:
         fasta_out.write(">chr1\nacgtacgtacgt")
     ref_xml = os.path.join(tmp_dir, "test.referenceset.xml")
     cmd = "dataset create {d} {f} --generateIndices --type ReferenceSet --name test_reference_name --organism test_reference_organism --ploidy octaploid".format(
         d=ref_xml, f=fasta)
     self._run_cmd_with_output(cmd, ref_xml)
     ref = ReferenceSet(ref_xml)
     assert ref.metadata.organism == "test_reference_organism"
Example No. 30
 def test_init_xml(self):
     """Test PBAlignRunner.__init__() to XML."""
     argumentList = [
         '--minAccuracy', '70', '--maxDivergence', '30', self.queryFile,
         self.referenceFile, self.xmlOut
     ]
     pbobj = PBAlignRunner(argumentList=argumentList)
     self.assertEqual(pbobj.start(), 0)
     aln = AlignmentSet(self.xmlOut)
     self.assertEqual(aln.externalResources[0].reference,
                      ReferenceSet(self.referenceFile).toExternalFiles()[0])
Example No. 31
 def test_file_factory(self):
     # TODO: add ConsensusReadSet, cmp.h5 alignmentSet
     types = [
         AlignmentSet(data.getXml(8)),
         ReferenceSet(data.getXml(9)),
         SubreadSet(data.getXml(10)),
         #ConsensusAlignmentSet(data.getXml(20)),
         HdfSubreadSet(data.getXml(19))
     ]
     for ds in types:
         mystery = openDataFile(ds.toExternalFiles()[0])
         self.assertEqual(type(mystery), type(ds))
Example No. 32
    def test_alignment_reference(self):
        rfn = data.getXml(9)
        rs1 = ReferenceSet(data.getXml(9))
        fasta_res = rs1.externalResources[0]
        fasta_file = urlparse(fasta_res.resourceId).path

        ds1 = AlignmentSet(data.getXml(8), referenceFastaFname=rs1)
        aln_ref = None
        for aln in ds1:
            aln_ref = aln.reference()
            break
        self.assertTrue(aln_ref is not None)
        self.assertEqual(ds1.externalResources[0].reference, fasta_file)
        self.assertEqual(ds1.resourceReaders()[0].referenceFasta.filename,
                         fasta_file)

        ds1 = AlignmentSet(data.getXml(8), referenceFastaFname=fasta_file)
        aln_ref = None
        for aln in ds1:
            aln_ref = aln.reference()
            break
        self.assertTrue(aln_ref is not None)
        self.assertEqual(ds1.externalResources[0].reference, fasta_file)
        self.assertEqual(ds1.resourceReaders()[0].referenceFasta.filename,
                         fasta_file)

        ds1 = AlignmentSet(data.getXml(8))
        ds1.addReference(fasta_file)
        aln_ref = None
        for aln in ds1:
            aln_ref = aln.reference()
            break
        self.assertTrue(aln_ref is not None)
        self.assertEqual(ds1.externalResources[0].reference, fasta_file)
        self.assertEqual(ds1.resourceReaders()[0].referenceFasta.filename,
                         fasta_file)

        fofn_out = tempfile.NamedTemporaryFile(suffix=".fofn").name
        log.debug(fofn_out)
        with open(fofn_out, 'w') as f:
            f.write(data.getXml(8))
            f.write('\n')
            f.write(data.getXml(11))
            f.write('\n')
        ds1 = AlignmentSet(fofn_out, referenceFastaFname=fasta_file)
        aln_ref = None
        for aln in ds1:
            aln_ref = aln.reference()
            break
        self.assertTrue(aln_ref is not None)
        self.assertEqual(ds1.externalResources[0].reference, fasta_file)
        self.assertEqual(ds1.resourceReaders()[0].referenceFasta.filename,
                         fasta_file)
Example No. 33
def test_makeConsensusTensorsRef_EqualStates():
    """
    Test that ConsensusTensorList can be populated in equal-states
    mode
    :return:
    """
    rset = ReferenceSet('data/references/All4mers_InsertOnly.ReferenceSet.xml')
    ref = rset[np.flatnonzero(rset.index['id'] == 'All4mer.V2.105')[0]]
    tpctl = setup_func(ref)
    poa_tensor_list = ConsensusTensorList(tpctl.poa.subreads,
                                          ref=ref,
                                          collection_mode='equal-state',
                                          subsample_count=30)
Example No. 34
def test_makeConsensusTensorsRef_FullMode():
    """
    Test that ConsensusTensorList can be populated in standard mode
    :return:
    """
    rset = ReferenceSet('data/references/All4mers_InsertOnly.ReferenceSet.xml')
    ref = rset[np.flatnonzero(rset.index['id'] == 'All4mer.V2.105')[0]]
    tpctl = setup_func(ref)
    poa_tensor_list = ConsensusTensorList(tpctl.poa.subreads,
                                          ref=ref,
                                          context_width=1,
                                          collection_mode='standard',
                                          subsample_count=15)
Example No. 35
    def test_referenceset_contigs(self):
        names = [
            'A.baumannii.1', 'A.odontolyticus.1', 'B.cereus.1', 'B.cereus.2',
            'B.cereus.4', 'B.cereus.6', 'B.vulgatus.1', 'B.vulgatus.2',
            'B.vulgatus.3', 'B.vulgatus.4', 'B.vulgatus.5', 'C.beijerinckii.1',
            'C.beijerinckii.2', 'C.beijerinckii.3', 'C.beijerinckii.4',
            'C.beijerinckii.5', 'C.beijerinckii.6', 'C.beijerinckii.7',
            'C.beijerinckii.8', 'C.beijerinckii.9', 'C.beijerinckii.10',
            'C.beijerinckii.11', 'C.beijerinckii.12', 'C.beijerinckii.13',
            'C.beijerinckii.14', 'D.radiodurans.1', 'D.radiodurans.2',
            'E.faecalis.1', 'E.faecalis.2', 'E.coli.1', 'E.coli.2', 'E.coli.4',
            'E.coli.5', 'E.coli.6', 'E.coli.7', 'H.pylori.1', 'L.gasseri.1',
            'L.monocytogenes.1', 'L.monocytogenes.2', 'L.monocytogenes.3',
            'L.monocytogenes.5', 'N.meningitidis.1', 'P.acnes.1',
            'P.aeruginosa.1', 'P.aeruginosa.2', 'R.sphaeroides.1',
            'R.sphaeroides.3', 'S.aureus.1', 'S.aureus.4', 'S.aureus.5',
            'S.epidermidis.1', 'S.epidermidis.2', 'S.epidermidis.3',
            'S.epidermidis.4', 'S.epidermidis.5', 'S.agalactiae.1',
            'S.mutans.1', 'S.mutans.2', 'S.pneumoniae.1'
        ]
        seqlens = [
            1458, 1462, 1472, 1473, 1472, 1472, 1449, 1449, 1449, 1449, 1449,
            1433, 1433, 1433, 1433, 1433, 1433, 1433, 1433, 1433, 1433, 1433,
            1433, 1433, 1433, 1423, 1423, 1482, 1482, 1463, 1463, 1463, 1463,
            1463, 1463, 1424, 1494, 1471, 1471, 1471, 1471, 1462, 1446, 1457,
            1457, 1386, 1388, 1473, 1473, 1473, 1472, 1472, 1472, 1472, 1472,
            1470, 1478, 1478, 1467
        ]
        ds = ReferenceSet(data.getXml(8))
        log.debug([contig.id for contig in ds])
        for contig, name, seqlen in zip(ds.contigs, names, seqlens):
            assert contig.id == name
            assert len(contig.sequence) == seqlen

        for name in names:
            assert ds.get_contig(name)

        for name in names:
            assert ds[name].id == name
Example No. 36
 def test_filter_reference_contigs(self):
     ds2 = ReferenceSet(data.getRef())
     self.assertEqual(len(list(ds2.refNames)), 59)
     filt = Filters()
     filt.addRequirement(id=[('==', 'E.faecalis.1')])
     ds2.addFilters(filt)
     self.assertEqual(str(ds2.filters), "( id == E.faecalis.1 )")
     self.assertEqual(len(ds2.refNames), 1)
     self.assertEqual(len(list(ds2.records)), 1)
     ds2.disableFilters()
     self.assertEqual(len(list(ds2.refNames)), 59)
     self.assertEqual(len(list(ds2.records)), 59)
     ds2.enableFilters()
     self.assertEqual(len(list(ds2.refNames)), 1)
     self.assertEqual(len(list(ds2.records)), 1)
Example No. 37
    def test_missing_fai_error_message(self):
        outdir = tempfile.mkdtemp(suffix="dataset-unittest")

        inFas = os.path.join(outdir, 'infile.fasta')

        # copy fasta reference to hide fai and ensure FastaReader is used
        shutil.copyfile(
            ReferenceSet(data.getXml(8)).toExternalFiles()[0], inFas)
        rs1 = ContigSet(inFas)
        with pytest.raises(IOError) as cm:
            rs1.assertIndexed()
        assert str(cm.value) == (
            "Companion FASTA index (.fai) file not found or malformatted! "
            "Use 'samtools faidx' to generate FASTA index.")
Example No. 38
 def test_incorrect_len_getitem(self):
     types = [
         AlignmentSet(data.getXml(7)),
         ReferenceSet(data.getXml(8)),
         SubreadSet(data.getXml(9))
     ]
     fn = tempfile.NamedTemporaryFile(suffix=".xml").name
     for ds in types:
         explen = -2
         with openDataFile(ds.toExternalFiles()[0]) as mystery:
             # try to avoid crashes...
             explen = len(mystery)
             mystery.numRecords = 1000000000
             mystery.write(fn)
         with openDataFile(fn) as mystery:
             assert len(list(mystery)) == explen
Example No. 39
 def test_autofilled_metatypes(self):
     ds = ReferenceSet(data.getXml(8))
     for extRes in ds.externalResources:
         assert extRes.metaType == 'PacBio.ReferenceFile.ReferenceFastaFile'
         assert len(extRes.indices) == 1
         for index in extRes.indices:
             assert index.metaType == "PacBio.Index.SamIndex"
     ds = AlignmentSet(data.getXml(7))
     for extRes in ds.externalResources:
         assert extRes.metaType == 'PacBio.SubreadFile.SubreadBamFile'
         assert len(extRes.indices) == 2
         for index in extRes.indices:
             if index.resourceId.endswith('pbi'):
                 assert index.metaType == "PacBio.Index.PacBioIndex"
             if index.resourceId.endswith('bai'):
                 assert index.metaType == "PacBio.Index.BamIndex"
Example No. 40
 def test_filter_reference_contigs(self):
     ds2 = ReferenceSet(data.getRef())
     self.assertEqual(len(list(ds2.refNames)), 59)
     filt = Filters()
     filt.addRequirement(id=[('==', 'E.faecalis.1')])
     ds2.addFilters(filt)
     self.assertEqual(str(ds2.filters),
                      "( id == E.faecalis.1 )")
     self.assertEqual(len(ds2.refNames), 1)
     self.assertEqual(len(list(ds2.records)), 1)
     ds2.disableFilters()
     self.assertEqual(len(list(ds2.refNames)), 59)
     self.assertEqual(len(list(ds2.records)), 59)
     ds2.enableFilters()
     self.assertEqual(len(list(ds2.refNames)), 1)
     self.assertEqual(len(list(ds2.records)), 1)