Пример #1
0
    def loadReferenceContigs(referencePath, alignmentSet, windows=None):
        """
        Load reference contigs and tag each one with its alignment-file ID.

        ``referencePath`` is opened as a ReferenceSet (FASTA file or XML
        dataset).  When ``windows`` is given, only contigs whose ``id`` equals
        the ``refName`` of one of the windows are kept; otherwise every contig
        is kept.

        Each kept contig receives a ``cmph5ID`` attribute: the ``ID`` of the
        row in ``alignmentSet.referenceInfoTable`` whose ``FullName`` equals
        the contig ``id``, or None when no row matches.

        Returns the list of contigs, which are used to set up IpdModel.
        """
        # FIXME we should get rid of this entirely, but I think it requires
        # fixing the inconsistency in how contigs are referenced here versus in
        # pbcore.io
        reference = ReferenceSet(referencePath)
        if windows is None:
            contigs = [contig for contig in reference]
        else:
            wantedNames = {window.refName for window in windows}
            contigs = [contig for contig in reference
                       if contig.id in wantedNames]

        # Every contig starts without an alignment-file ID; it stays None
        # unless a reference-info row names the contig below.
        contigsById = {}
        for contig in contigs:
            contig.cmph5ID = None
            contigsById[contig.id] = contig

        # Rows are matched to contigs by name (FullName == contig id).
        for refInfo in alignmentSet.referenceInfoTable:
            matched = contigsById.get(refInfo.FullName)
            if matched is not None:
                matched.cmph5ID = refInfo.ID

        return contigs
Пример #2
0
def make_variants_report(aln_summ_gff, variants_gff, reference,
                         max_contigs_to_plot, report, output_dir,
                         dpi=72, dumpdata=True):
    """
    Build the variants report, write it as JSON and return it.

    :param aln_summ_gff: (str) path to alignment_summary.gff
    :param variants_gff: (str) path to variants_gff
    :param reference: (str) reference path; opened through openReference()
        and again as a ReferenceSet to obtain its uuid
    :param max_contigs_to_plot: (int) max number of contigs to plot
    :param report: (str) report file name, joined onto ``output_dir``
    :param output_dir: (str) directory for the plots and the JSON report
    :param dpi: accepted but not used inside this function
    :param dumpdata: accepted but not used inside this function
    :return: the Report object after ``spec.apply_view``
    """
    _validate_inputs([
        ('aln_summ_gff', aln_summ_gff),
        ('variants_gff', variants_gff),
        ('reference', reference),
    ])

    # Reference entry and the contigs that will be plotted.
    reference_entry = openReference(reference)
    plotted_contigs = get_top_contigs_from_ref_entry(
        reference_entry, max_contigs_to_plot)

    # Pull the per-contig data out of both GFF files.
    summary_data, variants_by_contig = _extract_alignment_summ_data(
        aln_summ_gff, plotted_contigs)
    _append_variants_gff_data(summary_data, variants_gff)

    # Assemble the report pieces.
    consensus_table, attributes = _get_consensus_table_and_attributes(
        summary_data, reference_entry)
    variants_plots = _create_variants_plot_grp(
        plotted_contigs, variants_by_contig, output_dir)
    reference_uuid = ReferenceSet(reference).uuid

    viewed_report = spec.apply_view(
        Report(Constants.R_ID,
               plotgroups=[variants_plots],
               attributes=attributes,
               tables=[consensus_table],
               dataset_uuids=(reference_uuid,)))
    viewed_report.write_json(os.path.join(output_dir, report))
    return viewed_report
Пример #3
0
    def test_alignment_reference(self):
        """
        Aligned records must be able to resolve their reference however the
        reference was attached to the AlignmentSet: as a ReferenceSet object,
        as a FASTA path given to the constructor, or through addReference().
        """
        def first_record_reference(alignments):
            # reference() of the first record only; None for an empty set
            for record in alignments:
                return record.reference()
            return None

        reference_set = ReferenceSet(data.getXml(9))
        fasta_resource = reference_set.externalResources[0]
        fasta_path = urlparse(fasta_resource.resourceId).path

        # 1) reference handed over as a ReferenceSet object
        by_object = AlignmentSet(data.getXml(8),
                                 referenceFastaFname=reference_set)
        self.assertTrue(first_record_reference(by_object) is not None)

        # 2) reference handed over as a FASTA file path
        by_path = AlignmentSet(data.getXml(8), referenceFastaFname=fasta_path)
        self.assertTrue(first_record_reference(by_path) is not None)

        # 3) reference attached after construction
        attached_later = AlignmentSet(data.getXml(8))
        attached_later.addReference(fasta_path)
        self.assertTrue(first_record_reference(attached_later) is not None)
Пример #4
0
def run_fasta_to_reference(input_file_name,
                           output_file_name,
                           organism=None,
                           reference_name=None,
                           ploidy="haploid"):
    """
    Convert a single-FASTA ContigSet into a ReferenceSet via the external
    ``fasta-to-reference`` command.

    :param input_file_name: path opened as a ContigSet; it must reference
        exactly one FASTA file
    :param output_file_name: where the final ReferenceSet XML is written; its
        directory is used as the output directory of ``fasta-to-reference``
    :param organism: organism name; None or "" is passed as "unknown"
    :param reference_name: reference name; None or "" falls back to the base
        name of ``input_file_name`` without its extension
    :param ploidy: ploidy label; None or "" is passed as "unknown"
    :return: 0 on success, otherwise the non-zero exit code of the command
    :raises TypeError: if the input dataset has more than one external
        resource
    """
    # Function-scope import keeps the block self-contained; ``pipes.quote``
    # is the Python 2 spelling of ``shlex.quote``.
    try:
        from shlex import quote
    except ImportError:
        from pipes import quote

    if reference_name is None or reference_name == "":
        reference_name = op.splitext(op.basename(input_file_name))[0]
    ds_in = ContigSet(input_file_name)
    if len(ds_in.externalResources) > 1:
        raise TypeError("Only a single FASTA file is supported as input.")
    fasta_file_name = ds_in.externalResources[0].resourceId
    output_dir_name = op.dirname(output_file_name)

    # None used to be stringified to the literal "None"; treat it like the
    # empty string so the default arguments yield "unknown".
    organism_arg = "unknown" if organism in (None, "") else str(organism)
    ploidy_arg = "unknown" if ploidy in (None, "") else str(ploidy)

    args = [
        "fasta-to-reference",
        "--organism", organism_arg,
        "--ploidy", ploidy_arg,
        "--debug",
        fasta_file_name,
        output_dir_name,
        reference_name,
    ]
    # The command is handed over as one string, so every argument is quoted:
    # names or paths containing spaces or shell metacharacters would
    # otherwise be split or interpreted.
    cmd = " ".join(quote(arg) for arg in args)
    log.info(cmd)
    result = run_cmd(cmd, stdout_fh=sys.stdout, stderr_fh=sys.stderr)
    if result.exit_code != 0:
        return result.exit_code

    ref_file = op.join(output_dir_name, reference_name, "referenceset.xml")
    assert op.isfile(ref_file)
    with ReferenceSet(ref_file, strict=True) as ds_ref:
        ds_ref.makePathsAbsolute()
        log.info("saving final ReferenceSet to {f}".format(f=output_file_name))
        ds_ref.write(output_file_name)
    return 0
Пример #5
0
def make_topvariants_report(gff, reference, how_many, batch_sort_size, report,
                            output_dir):
    """
    Build the top-variants report and write it as JSON.

    :param gff: (str) path to variants.gff (or rare_variants.gff). Note, could also be *.gz
    :param reference: (str) reference path; handed to VariantFinder and opened
        as a ReferenceSet to obtain its uuid
    :param how_many: (int) forwarded to VariantFinder
    :param batch_sort_size: (int) forwarded to VariantFinder
    :param report: (str) report name, joined onto ``output_dir``
    :param output_dir: (str) output dir
    :return: 0
    """
    _validate_inputs(gff, reference, how_many, batch_sort_size)

    builder = VariantTableBuilder()
    finder = VariantFinder(gff, reference, how_many, batch_sort_size)
    for variant in finder.find_top():
        builder.add_variant(variant)

    tables = [builder.table]
    reference_uuid = ReferenceSet(reference).uuid
    viewed_report = spec.apply_view(
        Report(Constants.R_ID,
               tables=tables,
               dataset_uuids=(reference_uuid, )))
    viewed_report.write_json(os.path.join(output_dir, report))
    return 0
Пример #6
0
def run_fasta_to_reference(input_file_name, output_file_name,
                           organism, reference_name,
                           ploidy):
    """Copied from pbcoretools/tasks/converters.py:run_fasta_to_reference()

    Runs the external ``fasta-to-reference`` command on the single FASTA file
    referenced by ``input_file_name`` (opened as a ContigSet), using the
    directory of ``output_file_name`` as output directory, then re-saves the
    generated ``referenceset.xml`` with absolute paths to
    ``output_file_name``.

    Raises TypeError when the input dataset has more than one external
    resource.  Returns nothing.
    """
    contig_set = ContigSet(input_file_name)
    if len(contig_set.externalResources) > 1:
        raise TypeError("Only a single FASTA file is supported as input.")
    fasta_path = contig_set.externalResources[0].resourceId
    out_dir = op.dirname(output_file_name)

    command = " ".join([
        "fasta-to-reference",
        "--organism", organism,
        "--ploidy", ploidy,
        "--debug",
        fasta_path,
        out_dir,
        reference_name,
    ])
    log.info(command)
    system(command)

    # fasta-to-reference is expected to leave the XML under
    # <out_dir>/<reference_name>/
    generated_xml = op.join(out_dir, reference_name, "referenceset.xml")
    assert op.isfile(generated_xml)
    with ReferenceSet(generated_xml, strict=True) as reference_set:
        reference_set.makePathsAbsolute()
        log.info("saving final ReferenceSet to {f!r}".format(f=output_file_name))
        reference_set.write(output_file_name)
Пример #7
0
    def test_contigset_consolidate_int_names(self):
        """
        Consolidating two FASTA files whose record names are purely numeric
        ('5141', '5142') must keep both contigs retrievable by name with
        their sequences intact.
        """
        # build set to merge
        workdir = tempfile.mkdtemp(suffix="dataset-unittest")
        source_fasta = os.path.join(workdir, 'infile.fasta')
        first_fasta = os.path.join(workdir, 'tempfile1.fasta')
        second_fasta = os.path.join(workdir, 'tempfile2.fasta')

        # copy fasta reference to hide fai and ensure FastaReader is used
        reference_fasta = ReferenceSet(data.getXml(8)).toExternalFiles()[0]
        shutil.copyfile(reference_fasta, source_fasta)
        source = ContigSet(source_fasta)

        template = source.get_contig('B.cereus.1')

        # todo: modify the names first:
        int_names = ['5141', '5142']
        for fasta_path, record_name in zip([first_fasta, second_fasta],
                                           int_names):
            with FastaWriter(fasta_path) as writer:
                writer.writeRecord(record_name, template.sequence)

        expected_seqs = [template.sequence, template.sequence]

        merged = ContigSet(first_fasta, second_fasta)
        log.debug(merged.toExternalFiles())
        merged.consolidate()
        log.debug(merged.toExternalFiles())

        # open the consolidated set and compare with the expectation
        for record_name, expected in zip(int_names, expected_seqs):
            assert merged.get_contig(record_name).sequence[:] == expected
 def setUpClass(cls):
     """
     Locate the job outputs and cache what the tests need on the class.

     Scans the non-chunked files of the job datastore and records:

     * ``cls.gff_file`` - a GFF file having a header that starts with
       "##source ipdSummary"
     * ``cls.h5_file`` / ``cls.bw_file`` - the H5 and BigWig file paths

     It then loads all records of ``cls.gff_file`` into ``cls.gff_records``,
     indexes them in ``cls.gff_dict`` by ``(seqid, start, strand)`` and
     collects the contig ids of the 'eid_ref_dataset' entry point in
     ``cls.seqids``.
     """
     super(TestModificationsOutput, cls).setUpClass()
     datastore = DataStore.from_job_path(cls.job_dir)
     entrypoints = EntryPoints.from_job_path(cls.job_dir)
     cls.h5_file = None
     cls.bw_file = None
     cls.gff_file = None
     # Only the values are needed; .values() also works on Python 3, where
     # dict.iteritems() no longer exists.
     for file_info in datastore.get_file_dict().values():
         if file_info.is_chunked:
             continue
         type_id = file_info.file_type_id
         if type_id == FileTypes.GFF.file_type_id:
             with GffReader(file_info.path) as gff:
                 if any(header.startswith("##source ipdSummary")
                        for header in gff.headers):
                     cls.gff_file = file_info.path
         elif type_id == FileTypes.H5.file_type_id:
             cls.h5_file = file_info.path
         elif type_id == FileTypes.BIGWIG.file_type_id:
             cls.bw_file = file_info.path
     with GffReader(cls.gff_file) as gff:
         cls.gff_records = [rec for rec in gff]
     cls.gff_dict = {}
     for rec in cls.gff_records:
         cls.gff_dict[(rec.seqid, rec.start, rec.strand)] = rec
     ref = entrypoints.data['eid_ref_dataset']
     cls.seqids = []
     with ReferenceSet(ref) as rs:
         for ctg in rs:
             cls.seqids.append(ctg.id)
Пример #9
0
    def test_len(self):
        """
        Check record/base counts of several dataset types, and that
        updateCounts() restores them after they were overwritten with -1.
        """
        def check_counts(ds, n_records, n_bases,
                         has_length_tuple, count_by_iteration):
            # Counts as loaded
            assert len(ds) == n_records
            if has_length_tuple:
                assert ds._length == (n_records, n_bases)
            assert ds.totalLength == n_bases
            assert ds.numRecords == n_records
            # Overwrite, confirm the overwrite took, then recompute
            ds.totalLength = -1
            ds.numRecords = -1
            assert ds.totalLength == -1
            assert ds.numRecords == -1
            ds.updateCounts()
            assert ds.totalLength == n_bases
            assert ds.numRecords == n_records
            if count_by_iteration:
                assert sum(1 for _ in ds) == n_records
                assert sum(len(rec) for rec in ds) == n_bases

        # AlignmentSet
        check_counts(AlignmentSet(data.getXml(7), strict=True),
                     92, 123588,
                     has_length_tuple=True, count_by_iteration=True)

        # AlignmentSet with filters
        check_counts(AlignmentSet(data.getXml(14), strict=True),
                     40, 52023,
                     has_length_tuple=True, count_by_iteration=False)

        # SubreadSet
        check_counts(SubreadSet(data.getXml(9), strict=True),
                     92, 124093,
                     has_length_tuple=True, count_by_iteration=True)

        # ReferenceSet
        check_counts(ReferenceSet(data.getXml(8), strict=True),
                     59, 85774,
                     has_length_tuple=False, count_by_iteration=False)
Пример #10
0
def loadFromFile(filename_, alnFile):
    """
    Reads reference from FASTA file, loading
    lookup tables that can be used any time later.

    Opens ``filename_`` as an indexed ReferenceSet and, for every contig whose
    id is also one of ``alnFile.refNames``, registers a ReferenceContig in the
    lookup tables ``byId``, ``byName`` and ``byPacBioName`` (defined outside
    this function).  On success the global ``filename`` is set to
    ``filename_``.

    Calls ``die`` when the reference cannot be opened or indexed (IOError),
    or when no contig of the FASTA was aligned against.  Logs a warning when
    some aligned contigs are missing from the FASTA.
    """
    # Contigs in FASTA may disagree with those in cmp.h5 ref info
    # table, for instance if the FASTA has been edited.  Here's how we
    # handle things:
    #
    # |fastaContigs \   cmpContigs| > 0 : OK, extra FASTA contigs just ignored
    # |cmpContigs   \ fastaContigs| > 0 : Not necessarily OK---a warning should be
    #                                     issued.  We then proceed to operate on
    #                                     the contigs that are in both.
    # |cmpContigs & fastaContigs| == 0  : Nothing to work with.  This is an error.
    #
    # While we formerly used MD5s to vouch for the identity of a
    # contig, we now use the name.  This is an inferior approach but
    # is necessary, in using the FastaTable.
    global filename

    # Load contigs
    assert not isLoaded()
    try:
        f = ReferenceSet(filename_)
        f.assertIndexed()
    except IOError as e:
        die(e)

    cmpContigNames = set(alnFile.refNames)

    # Count every FASTA contig so the log line can report "loaded X of Y";
    # the previous code printed len(byName) on both sides.
    fastaContigCount = 0
    for fastaRecord in f.contigs:
        fastaContigCount += 1
        refName = fastaRecord.id
        if refName in cmpContigNames:
            refEntry = alnFile.referenceInfo(refName)
            refId = refEntry.ID
            pacBioName = refEntry.Name
            refFullName = refEntry.FullName
            sequence = UppercasingMmappedFastaSequence(fastaRecord.sequence)
            length = len(fastaRecord.sequence)
            contig = ReferenceContig(refId, refName, refFullName, sequence,
                                     length)
            byId[refId] = contig
            byName[refName] = contig
            byPacBioName[pacBioName] = contig
    loadedFastaContigNames = set(byName.keys())
    logging.info("Loaded %d of %d reference groups from %s ",
                 len(byName), fastaContigCount, filename_)

    if len(byName) == 0:
        die("No reference groups in the FASTA file were aligned against.  "
            "Did you select the wrong reference FASTA file?")
    elif (cmpContigNames - loadedFastaContigNames):
        # logging.warn is a deprecated alias of logging.warning
        logging.warning(
            "Some reference contigs aligned against are not found in "
            "the reference FASTA.  Will process only those contigs "
            "supported by the reference FASTA.")

    filename = filename_
    assert isLoaded()
Пример #11
0
 def test_getitem(self):
     """Indexing with 0 must give a truthy record for each dataset type."""
     datasets = (
         AlignmentSet(data.getXml(7)),
         ReferenceSet(data.getXml(8)),
         SubreadSet(data.getXml(9)),
     )
     for dataset in datasets:
         first_record = dataset[0]
         assert first_record
Пример #12
0
def openReference(fname):
    """ Open ``fname`` as a ReferenceSet and return it.

    Directories are rejected: a ValueError is raised when ``fname`` is an
    existing directory.
    """
    if not os.path.isdir(fname):
        return ReferenceSet(fname)
    raise ValueError(
        "{r} is a directory, not a ReferenceSet".format(r=fname))
Пример #13
0
 def test_contigset_len(self):
     """Splitting into 10 chunks must preserve the total len() of the set."""
     reference = ReferenceSet(data.getXml(8))
     expected_total = len(reference)
     chunks = reference.split(10)
     assert len(chunks) == 10
     observed_total = sum(len(chunk) for chunk in chunks)
     assert observed_total == expected_total
Пример #14
0
    def test_contigset_consolidate(self):
        """
        Write one contig in two pieces ('<name>_10_20' and '<name>_0_10')
        across two FASTA files next to two untouched contigs, consolidate the
        resulting ContigSet and check that the pieces were joined in order
        and the set shrank from 4 to 3 records.
        """
        # build set to merge
        workdir = tempfile.mkdtemp(suffix="dataset-unittest")
        source_fasta = os.path.join(workdir, 'infile.fasta')
        first_fasta = os.path.join(workdir, 'tempfile1.fasta')
        second_fasta = os.path.join(workdir, 'tempfile2.fasta')

        # copy fasta reference to hide fai and ensure FastaReader is used
        reference_fasta = ReferenceSet(data.getXml(9)).toExternalFiles()[0]
        backticks('cp {i} {o}'.format(i=reference_fasta, o=source_fasta))
        source = ContigSet(source_fasta)

        single_names = ['A.baumannii.1', 'A.odontolyticus.1']
        split_name = 'B.cereus.1'
        # The reader itself is not used below; the call is kept as in the
        # original test.
        _reader = source.resourceReaders()[0]
        split_contig = source.get_contig(split_name)
        single_contigs = [source.get_contig(name) for name in single_names]

        # todo: modify the names first:
        extra_bases = 'ATCGATCGATCG'
        with FastaWriter(first_fasta) as writer:
            writer.writeRecord(single_contigs[0])
            writer.writeRecord(split_contig.name + '_10_20',
                               split_contig.sequence)
        with FastaWriter(second_fasta) as writer:
            writer.writeRecord(split_contig.name + '_0_10',
                               split_contig.sequence + extra_bases)
            writer.writeRecord(single_contigs[1])

        # The '_0_10' piece (sequence + extra bases) comes first, followed by
        # the '_10_20' piece.
        expected_split_seq = ''.join(
            [split_contig.sequence, extra_bases, split_contig.sequence])
        expected_single_seqs = [rec.sequence for rec in single_contigs]

        merged = ContigSet(first_fasta, second_fasta)
        merged.induceIndices()
        log.debug(merged.toExternalFiles())
        self.assertEqual(len(merged), 4)
        self.assertEqual(len(list(merged)), 4)
        merged.consolidate()
        log.debug(merged.toExternalFiles())

        # open the consolidated set and compare with the expectation
        for name, expected in zip(single_names, expected_single_seqs):
            self.assertEqual(merged.get_contig(name).sequence[:], expected)
        self.assertEqual(
            merged.get_contig(split_name).sequence[:], expected_split_seq)

        self.assertEqual(len(merged._openReaders), 1)
        self.assertEqual(len(merged.index), 3)
        self.assertEqual(len(merged._indexMap), 3)
        self.assertEqual(len(merged), 3)
        self.assertEqual(len(list(merged)), 3)

        # test merge:
        left = ContigSet(first_fasta)
        right = ContigSet(second_fasta)
        combined = left + right
Пример #15
0
 def test_contigset_split(self):
     """
     Splitting into 10 chunks must preserve the number of contigs obtained
     by iterating the chunks.
     """
     reference = ReferenceSet(data.getXml(9))
     expected_total = len(reference)
     chunks = reference.split(10)
     self.assertEqual(len(chunks), 10)
     observed_total = sum(1 for chunk in chunks for _ in chunk)
     self.assertEqual(observed_total, expected_total)
Пример #16
0
 def test_file_factory(self):
     """
     openDataFile() on a dataset's first external file must give back an
     object of the same type as the dataset.
     """
     # TODO: add ConsensusReadSet, cmp.h5 alignmentSet
     datasets = (
         AlignmentSet(data.getXml(7)),
         ReferenceSet(data.getXml(8)),
         SubreadSet(data.getXml(9)),
     )
     for expected in datasets:
         first_file = expected.toExternalFiles()[0]
         reopened = openDataFile(first_file)
         assert type(reopened) == type(expected)
Пример #17
0
 def test_create_cli_reference_fasta(self):
     """
     ``dataset create --type ReferenceSet`` on a FASTA file must store the
     organism given on the command line in the dataset metadata.
     """
     workdir = tempfile.mkdtemp(suffix="dataset-unittest")
     fasta_path = os.path.join(workdir, "reference.fasta")
     with open(fasta_path, "w") as handle:
         handle.write(">chr1\nacgtacgtacgt")
     xml_path = os.path.join(workdir, "test.referenceset.xml")
     command_template = (
         "dataset create {d} {f} --generateIndices "
         "--type ReferenceSet --name test_reference_name "
         "--organism test_reference_organism --ploidy octaploid")
     command = command_template.format(d=xml_path, f=fasta_path)
     self._run_cmd_with_output(command, xml_path)
     reference = ReferenceSet(xml_path)
     assert reference.metadata.organism == "test_reference_organism"
Пример #18
0
 def test_init_xml(self):
     """Test PBAlignRunner.__init__() to XML.

     After a successful run, the reference recorded on the first external
     resource of the output AlignmentSet must equal the first external file
     of the input reference.
     """
     options = ['--minAccuracy', '70', '--maxDivergence', '30']
     positionals = [self.queryFile, self.referenceFile, self.xmlOut]
     runner = PBAlignRunner(argumentList=options + positionals)
     self.assertEqual(runner.start(), 0)
     alignments = AlignmentSet(self.xmlOut)
     recorded_reference = alignments.externalResources[0].reference
     expected_reference = ReferenceSet(
         self.referenceFile).toExternalFiles()[0]
     self.assertEqual(recorded_reference, expected_reference)
Пример #19
0
 def test_file_factory(self):
     """
     openDataFile() on a dataset's first external file must give back an
     object of the same type as the dataset.
     """
     # TODO: add ConsensusReadSet, cmp.h5 alignmentSet
     datasets = (
         AlignmentSet(data.getXml(8)),
         ReferenceSet(data.getXml(9)),
         SubreadSet(data.getXml(10)),
         # ConsensusAlignmentSet(data.getXml(20)) is not covered yet
         HdfSubreadSet(data.getXml(19)),
     )
     for expected in datasets:
         first_file = expected.toExternalFiles()[0]
         reopened = openDataFile(first_file)
         self.assertEqual(type(reopened), type(expected))
Пример #20
0
    def test_alignment_reference(self):
        """
        The reference must be resolvable from aligned records, and recorded
        both on the first external resource and on the first resource reader,
        for each way of attaching it: as a ReferenceSet object, as a FASTA
        path given to the constructor, through addReference(), and for an
        AlignmentSet built from a FOFN.
        """
        def check_reference(alignments):
            # reference() of the first record must resolve, and the FASTA
            # path must be recorded on resource and reader alike
            first_ref = None
            for record in alignments:
                first_ref = record.reference()
                break
            self.assertIsNotNone(first_ref)
            self.assertEqual(alignments.externalResources[0].reference,
                             fasta_file)
            self.assertEqual(
                alignments.resourceReaders()[0].referenceFasta.filename,
                fasta_file)

        rs1 = ReferenceSet(data.getXml(9))
        fasta_res = rs1.externalResources[0]
        fasta_file = urlparse(fasta_res.resourceId).path

        # reference handed over as a ReferenceSet object
        check_reference(AlignmentSet(data.getXml(8), referenceFastaFname=rs1))

        # reference handed over as a FASTA file path
        check_reference(
            AlignmentSet(data.getXml(8), referenceFastaFname=fasta_file))

        # reference attached after construction
        ds1 = AlignmentSet(data.getXml(8))
        ds1.addReference(fasta_file)
        check_reference(ds1)

        # AlignmentSet built from a FOFN.  delete=False keeps the file on
        # disk after the handle is closed; taking ``.name`` from a discarded
        # NamedTemporaryFile left the file's lifetime to garbage collection.
        with tempfile.NamedTemporaryFile(suffix=".fofn", mode="w",
                                         delete=False) as fofn:
            fofn.write(data.getXml(8))
            fofn.write('\n')
            fofn.write(data.getXml(11))
            fofn.write('\n')
            fofn_out = fofn.name
        log.debug(fofn_out)
        check_reference(
            AlignmentSet(fofn_out, referenceFastaFname=fasta_file))
Пример #21
0
def test_makeConsensusTensorsRef_EqualStates():
    """
    Test that ConsensusTensorList can be populated in equal-states
    mode
    :return:
    """
    reference_set = ReferenceSet(
        'data/references/All4mers_InsertOnly.ReferenceSet.xml')
    # Position of the first index row whose id is 'All4mer.V2.105'
    matching_rows = np.flatnonzero(
        reference_set.index['id'] == 'All4mer.V2.105')
    ref = reference_set[matching_rows[0]]
    tpctl = setup_func(ref)
    poa_tensor_list = ConsensusTensorList(
        tpctl.poa.subreads,
        ref=ref,
        collection_mode='equal-state',
        subsample_count=30)
Пример #22
0
def test_makeConsensusTensorsRef_FullMode():
    """
    Test that ConsensusTensorList can be populated in standard mode
    :return:
    """
    reference_set = ReferenceSet(
        'data/references/All4mers_InsertOnly.ReferenceSet.xml')
    # Position of the first index row whose id is 'All4mer.V2.105'
    matching_rows = np.flatnonzero(
        reference_set.index['id'] == 'All4mer.V2.105')
    ref = reference_set[matching_rows[0]]
    tpctl = setup_func(ref)
    poa_tensor_list = ConsensusTensorList(
        tpctl.poa.subreads,
        ref=ref,
        context_width=1,
        collection_mode='standard',
        subsample_count=15)
Пример #23
0
    def test_missing_fai_error_message(self):
        """
        assertIndexed() on a FASTA copied without its .fai companion must
        raise IOError carrying the hint to run 'samtools faidx'.
        """
        outdir = tempfile.mkdtemp(suffix="dataset-unittest")

        inFas = os.path.join(outdir, 'infile.fasta')

        # copy fasta reference to hide fai and ensure FastaReader is used
        shutil.copyfile(
            ReferenceSet(data.getXml(8)).toExternalFiles()[0], inFas)
        rs1 = ContigSet(inFas)
        with pytest.raises(IOError) as cm:
            rs1.assertIndexed()
        # The message check has to sit outside the ``with`` block: statements
        # after the raising call are never executed.  ``cm.value`` is the
        # raised exception; ``cm`` itself is pytest's ExceptionInfo wrapper.
        assert str(cm.value) == (
            "Companion FASTA index (.fai) file not found or malformatted! "
            "Use 'samtools faidx' to generate FASTA index.")
Пример #24
0
 def test_exit_code_0_referenceset(self):
     """
     Like a cram test. Assert exits with 0 with ReferenceSet XML

     The reference FASTA is first wrapped in a ReferenceSet XML written to
     the test output directory; the report module is then run on that XML
     and must create 'rpt.json'.
     """
     ref_name = op.join(self._output_dir, "refset.xml")
     refset = ReferenceSet(self._get_reference_fasta())
     refset.write(ref_name)
     ref = ref_name
     cmd = 'python -m pbreports.report.variants {r} {c} {a} {v}'.format(
         r='rpt.json', c=ref, a=self.ALIGNMENT_SUMMARY,
         v=self.VARIANTS_GFF)
     rcode = run_backticks(cmd)
     # assertEquals is a deprecated alias that was removed in Python 3.12
     self.assertEqual(0, rcode)
     self.assertTrue(op.exists("rpt.json"))
Пример #25
0
 def test_filter_reference_contigs(self):
     """
     An ``id == E.faecalis.1`` filter must narrow refNames and records from
     59 entries to 1, and toggling the filters must switch between the two
     views.
     """
     all_contigs = 59
     reference = ReferenceSet(data.getRef())
     self.assertEqual(len(list(reference.refNames)), all_contigs)

     only_faecalis = Filters()
     only_faecalis.addRequirement(id=[('==', 'E.faecalis.1')])
     reference.addFilters(only_faecalis)
     self.assertEqual(str(reference.filters), "( id == E.faecalis.1 )")
     self.assertEqual(len(reference.refNames), 1)
     self.assertEqual(len(list(reference.records)), 1)

     # Filters off: the full set is visible again
     reference.disableFilters()
     self.assertEqual(len(list(reference.refNames)), all_contigs)
     self.assertEqual(len(list(reference.records)), all_contigs)

     # Filters back on: only the single contig remains
     reference.enableFilters()
     self.assertEqual(len(list(reference.refNames)), 1)
     self.assertEqual(len(list(reference.records)), 1)
Пример #26
0
def test_generatePoaGraph():
    """
    Demonstrate that a poa-generated MSA
    can be generated from a list of subreads
    from a particular ZMW (using pbcore)
    :return:
    """
    # no reference provided
    without_ref = setup_func()
    without_ref.poa.generatePoaGraph()

    # reference provided
    reference_set = ReferenceSet(
        'data/references/All4mers_InsertOnly.ReferenceSet.xml')
    # Position of the first index row whose id is 'All4mer.V2.105'
    matching_rows = np.flatnonzero(
        reference_set.index['id'] == 'All4mer.V2.105')
    ref = reference_set[matching_rows[0]]
    with_ref = setup_func(ref=ref)
    with_ref.poa.generatePoaGraph()
    def test_exit_code_0_referenceset(self):
        """
        Like a cram test. Assert exits with 0 with ReferenceSet XML

        The reference FASTA is first wrapped in a ReferenceSet XML written to
        the test output directory; the report module is then run on that XML.
        """
        ref = self._get_reference_fasta()
        ref_name = os.path.join(self._output_dir, "refset.xml")
        refset = ReferenceSet(ref)
        refset.write(ref_name)
        ref = ref_name
        j = 'rpt.json'
        cmd = 'python -m pbreports.report.top_variants {j} {g} {r}'.format(
            g=self.VARIANTS_GFF, r=ref, j=j)
        log.info(cmd)

        rcode = run_backticks(cmd)
        # assertEquals is a deprecated alias that was removed in Python 3.12
        self.assertEqual(0, rcode)
Пример #28
0
 def test_autofilled_metatypes(self):
     """
     External resources and their index files must carry the expected
     metatypes for a ReferenceSet and for an AlignmentSet.
     """
     reference = ReferenceSet(data.getXml(8))
     for resource in reference.externalResources:
         assert resource.metaType == 'PacBio.ReferenceFile.ReferenceFastaFile'
         assert len(resource.indices) == 1
         for idx in resource.indices:
             assert idx.metaType == "PacBio.Index.SamIndex"

     # Index metatype expected for each index-file suffix
     expected_by_suffix = (
         ('pbi', "PacBio.Index.PacBioIndex"),
         ('bai', "PacBio.Index.BamIndex"),
     )
     alignments = AlignmentSet(data.getXml(7))
     for resource in alignments.externalResources:
         assert resource.metaType == 'PacBio.SubreadFile.SubreadBamFile'
         assert len(resource.indices) == 2
         for idx in resource.indices:
             for suffix, expected_type in expected_by_suffix:
                 if idx.resourceId.endswith(suffix):
                     assert idx.metaType == expected_type
Пример #29
0
    def test_missing_fai_error_message(self):
        """
        assertIndexed() on a FASTA copied without its .fai companion must
        raise IOError carrying the hint to run 'samtools faidx'.
        """
        workdir = tempfile.mkdtemp(suffix="dataset-unittest")
        unindexed_fasta = os.path.join(workdir, 'infile.fasta')

        # copy fasta reference to hide fai and ensure FastaReader is used
        source_fasta = ReferenceSet(data.getXml(9)).toExternalFiles()[0]
        backticks('cp {i} {o}'.format(i=source_fasta, o=unindexed_fasta))
        contigs = ContigSet(unindexed_fasta)

        with self.assertRaises(IOError) as raised:
            contigs.assertIndexed()

        expected_message = (
            "Companion FASTA index (.fai) file not found or malformatted! "
            "Use 'samtools faidx' to generate FASTA index.")
        self.assertEqual(str(raised.exception), expected_message)
Пример #30
0
 def test_incorrect_len_getitem(self):
     """
     A dataset written with a bogus numRecords (1000000000) must still
     iterate over exactly as many records as the source file reported via
     len().
     """
     types = [
         AlignmentSet(data.getXml(7)),
         ReferenceSet(data.getXml(8)),
         SubreadSet(data.getXml(9))
     ]
     # Reserve one scratch path for all datasets.  delete=False keeps the
     # file on disk after the handle is closed; taking ``.name`` from a
     # discarded NamedTemporaryFile left its lifetime to garbage collection.
     with tempfile.NamedTemporaryFile(suffix=".xml", delete=False) as tmp:
         fn = tmp.name
     for ds in types:
         explen = -2
         with openDataFile(ds.toExternalFiles()[0]) as mystery:
             # try to avoid crashes...
             explen = len(mystery)
             mystery.numRecords = 1000000000
             mystery.write(fn)
         with openDataFile(fn) as mystery:
             assert len(list(mystery)) == explen