示例#1
0
 def setup(self):
     """Create the fixture record plus both reverse-complement variants.

     The expected FASTA text is spelled out literally (sequence rows of 60
     characters) so the assertions do not depend on the code under test.
     """
     fwd_header = "chr1|blah|blah\tblah blah"
     rc_header = "chr1|blah|blah\tblah blah [revcomp]"
     fwd_rows = [
         "GATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATT",
         "ACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAG",
         "ATTACAGATTACAGATTACA",
     ]
     rc_rows = [
         "TGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTA",
         "ATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCT",
         "GTAATCTGTAATCTGTAATC",
     ]

     self.header = fwd_header
     self.rc_header = rc_header
     self.id = "chr1|blah|blah"
     self.comment = "blah blah"
     self.sequence = "GATTACA" * 20
     self.rc_sequence = "TGTAATC" * 20
     self.length = 140

     # Expected str() output: '>' + header, then the wrapped sequence rows.
     self.expected__str__ = "\n".join([">" + fwd_header] + fwd_rows)
     self.rc1_expected__str__ = "\n".join([">" + rc_header] + rc_rows)
     # rc2 keeps the forward header but carries the reverse-complement rows.
     self.rc2_expected__str__ = "\n".join([">" + fwd_header] + rc_rows)

     self.record = FastaRecord(self.header, self.sequence)
     self.rc1_record = self.record.reverseComplement()
     self.rc2_record = self.record.reverseComplement(True)
示例#2
0
def _create_chimera_pair(locus, sequences):
    """
    Create chimeras for a single locus
    """
    assert len(sequences) == 2
    first_5p, first_3p = _split_sequence(sequences[0])
    second_5p, second_3p = _split_sequence(sequences[1])
    A_chimera = FastaRecord(name='Locus%s_ChimeraA' % locus,
                            sequence=first_5p + second_3p)
    B_chimera = FastaRecord(name='Locus%s_ChimeraB' % locus,
                            sequence=second_5p + first_3p)
    return [A_chimera, B_chimera]
示例#3
0
def SummarizeData(indexedFasta, windows, adps):
    """Summarize restriction-site, poly-A/T and Cas9-guide evidence per window.

    Args:
        indexedFasta: value handed to ``IndexedFastaReader``.
        windows: mapping of hole number -> ``(_, tid, start, end, target)``.
        adps: mapping of hole number ->
            ``(leftTc6, leftAlt, rightTc6, rightAlt)``. Holes that are
            missing from it (or whose entry is not a 4-item sequence) are
            skipped.

    Returns:
        A sorted list of 25-field tuples, one per retained hole:
        ``(hn, tid, s, e, e - s, target, len(AT), maxAT, sum(AT), leftTc6,
        rightTc6, leftAlt, rightAlt, fiveEco, insideEco, threeEco, k1, s1,
        a1, k2, s2, a2, hasPolyA, hasLeft, hasRight)``.
    """
    summaries = []

    fa = IndexedFastaReader(indexedFasta)
    for hn, (_, tid, s, e, target) in windows.iteritems():
        # First skip ZMWs with no adp results, i.e. with <= 1 adp.
        # Only lookup/unpack failures are treated as "no result"; a bare
        # except would also hide KeyboardInterrupt and genuine bugs.
        try:
            leftTc6, leftAlt, rightTc6, rightAlt = adps[hn]
        except (LookupError, TypeError, ValueError):
            continue

        chrm = fa[tid]

        # Every lower slice bound is clamped at 0: a negative start would
        # silently index from the END of the chromosome for windows that
        # sit close to its beginning.

        # Search for restriction sites near the ends
        fiveP = chrm.sequence[max(s - 5, 0):s + 6]
        threeP = chrm.sequence[max(e - 5, 0):e + 6]
        fiveEco = HasEcoR1(fiveP)
        threeEco = HasEcoR1(threeP)

        # Search for restriction sites contained within
        inside = chrm.sequence[s + 6:max(e - 5, 0)]
        insideEco = HasEcoR1(inside)

        # Count and summarize any PolyA/T regions
        region = chrm.sequence[s:e]
        AT = LargestAsAndTs(region)
        maxAT = max(AT) if len(AT) else 0

        # Check for Guide RNA matches
        OutFiveP = chrm.sequence[max(s - 33, 0):s + 10]
        InFiveP = FastaRecord(
            "tmp",
            chrm.sequence[max(s - 10, 0):s + 33]).reverseComplement().sequence
        InThreeP = chrm.sequence[max(e - 33, 0):e + 10]
        OutThreeP = FastaRecord(
            "tmp",
            chrm.sequence[max(e - 10, 0):e + 33]).reverseComplement().sequence
        k1, s1, a1 = ScoreCas9SiteSides(OutFiveP, InFiveP)
        k2, s2, a2 = ScoreCas9SiteSides(OutThreeP, InThreeP)

        # Summary columns
        hasPolyA = "T" if maxAT > 0 else "F"
        hasLeft = "T" if (fiveEco == "T" or k1 != "N/A") else "F"
        hasRight = "T" if (threeEco == "T" or k2 != "N/A") else "F"

        summaries.append(
            (hn, tid, s, e, e - s, target, len(AT), maxAT, sum(AT), leftTc6,
             rightTc6, leftAlt, rightAlt, fiveEco, insideEco, threeEco, k1, s1,
             a1, k2, s2, a2, hasPolyA, hasLeft, hasRight))

    return sorted(summaries)
示例#4
0
    def Write(self):
        """Clean up the sequences and write one FASTA file per exon position.

        Each value in ``self._dict`` is split on ``"|"`` into exons. Exon
        ``i`` (1-based) goes to ``<locus>_exon<i>.fasta``. Alignment markup
        (``.``, ``|``, ``*``) is stripped, empty exons are skipped, and a
        given exon sequence is written only once per file (first allele
        encountered wins).

        All writers are closed before returning, including when an error
        interrupts the loop.
        """
        seen_per_exon = []
        writers = []

        try:
            for allele, seq in self._dict.iteritems():
                exons = seq.split("|")

                # Lazily open one writer (and one dedup set) per exon index.
                while len(writers) < len(exons):
                    fasta = "{0}_exon{1}.fasta".format(self._locus,
                                                       len(writers) + 1)
                    writers.append(FastaWriter(fasta))
                    seen_per_exon.append(set())

                for i, exon in enumerate(exons):
                    exon = re.sub("[.|*]", "", exon)
                    if len(exon) == 0 or exon in seen_per_exon[i]:
                        continue

                    writers[i].writeRecord(FastaRecord(allele, exon))
                    seen_per_exon[i].add(exon)
        finally:
            # The original never closed these handles, so buffered output
            # could be lost and file descriptors leaked.
            for writer in writers:
                writer.close()
示例#5
0
def write_temporary_fasta(record_list):
    """Write ``record_list`` to a new ``.fasta`` temp file and return it.

    Each input only needs ``name`` and ``sequence`` attributes. The file is
    created with ``delete=False``, so it is not removed automatically; the
    returned ``NamedTemporaryFile`` object exposes its path as ``.name``.
    """
    tmp_handle = tempfile.NamedTemporaryFile(suffix=".fasta", delete=False)
    with FastaWriter(tmp_handle.name) as writer:
        for entry in record_list:
            writer.writeRecord(FastaRecord(entry.name, entry.sequence))
    return tmp_handle
 def setup(self):
     """Prepare the forward record and its two reverse-complement fixtures.

     Expected FASTA strings are written out literally, with sequence rows
     of 60 characters, rather than derived from the record itself.
     """
     plain_header = "chr1|blah|blah\tblah blah"
     revcomp_header = "chr1|blah|blah\tblah blah [revcomp]"
     plain_rows = [
         "GATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATT",
         "ACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAG",
         "ATTACAGATTACAGATTACA",
     ]
     revcomp_rows = [
         "TGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTA",
         "ATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCT",
         "GTAATCTGTAATCTGTAATC",
     ]

     self.header = plain_header
     self.rc_header = revcomp_header
     self.id = "chr1|blah|blah"
     self.comment = "blah blah"
     self.sequence = "GATTACA" * 20
     self.rc_sequence = "TGTAATC" * 20
     self.length = 140

     # '>' + header on the first line, then the wrapped sequence rows.
     self.expected__str__ = "\n".join([">" + plain_header] + plain_rows)
     self.rc1_expected__str__ = "\n".join(
         [">" + revcomp_header] + revcomp_rows)
     # rc2: forward header combined with the reverse-complement rows.
     self.rc2_expected__str__ = "\n".join(
         [">" + plain_header] + revcomp_rows)

     self.record = FastaRecord(self.header, self.sequence)
     self.rc1_record = self.record.reverseComplement()
     self.rc2_record = self.record.reverseComplement(True)
示例#7
0
def parse_primer_sequences(primers_str):
    """
    Return a list of primer FastaRecords if primers_str only contains
    valid primers. Otherwise raise a ValueError.

    ``primers_str`` is FASTA-formatted text. Records must be named and
    ordered ``F0, R0, F1, R1, ...`` and every sequence may contain only
    A/T/G/C (case-insensitive). Single quotes, double quotes and spaces are
    removed from every line, and blank lines are ignored.

    Raises:
        ValueError: the input is not a string, has no ``'>'``, contains a
            record without a name or without a sequence, lists primers out
            of order, or contains a base other than A/T/G/C.
    """
    # ``unicode`` only exists on Python 2; fall back to ``str`` alone so the
    # type check keeps raising ValueError (not NameError) on Python 3.
    try:
        text_types = (str, unicode)
    except NameError:
        text_types = (str,)

    if not isinstance(primers_str, text_types):
        raise ValueError("Input primers_str %s must be either str or unicode" % type(primers_str))

    primers_str = str(primers_str)
    if '>' not in primers_str:
        raise ValueError("Invalid primer header, could not find leading '>'.")

    valid_bases = frozenset(('A', 'T', 'G', 'C'))
    primer_fasta_records = []
    for str_index, primer_str in enumerate(primers_str.split('>')[1:]):
        lines = []
        for line in primer_str.split('\n'):
            # Chained replace() behaves the same on Python 2 and 3, unlike
            # the two-argument str.translate(None, chars) form.
            cleaned = line.strip().replace("'", "").replace('"', "").replace(" ", "")
            if cleaned:  # remove empty lines
                lines.append(cleaned)

        if not lines:
            # Previously this fell through to lines[0] and raised IndexError.
            raise ValueError("Primer record %d has neither a name nor a sequence." % str_index)
        if len(lines) < 2:
            raise ValueError("Primer %s must have a sequence." % lines[0])

        primer_name = lines[0]
        primer_sequence = ''.join(lines[1:])

        # Even positions are forward primers, odd positions reverse primers.
        primer_index, is_reverse = divmod(str_index, 2)
        primer_strand = 'R' if is_reverse else 'F'
        expected_primer_name = "{s}{i}".format(s=primer_strand, i=primer_index)
        if primer_name != expected_primer_name:
            raise ValueError("Primers should be placed in order F0, R0, F1, R1...")

        for base in primer_sequence:
            if base.upper() not in valid_bases:
                raise ValueError("Primer sequence %s must only contain ATGC" % primer_sequence)

        primer_fasta_records.append(FastaRecord(header=primer_name, sequence=primer_sequence))

    return primer_fasta_records
示例#8
0
def gconFunc(tp):
    """Run gcon (and, unless disabled, quiver) for one barcode.

    Kept as a module-level function taking a single tuple so it can be
    dispatched through ``map`` / ``Pool.map``.

    Args:
        tp: ``(rootDir, barcode)``; all work happens in ``rootDir/barcode``.

    Returns:
        ``FastaRecord(barcode, consensus_sequence)``, or ``None`` when no
        usable consensus exists (gcon output missing, without records or
        with an empty first sequence; or the final consensus file missing
        or without records).

    Reads ``runner.args.noQuiver`` and ``runner.args.inputFofn``.
    """
    rootDir, barcode = tp
    bcdir = "/".join((rootDir, barcode))

    ## call gcon
    logging.info("In gconFunc for: %s" % barcode)

    # NOTE(review): commands are run through the shell from interpolated
    # paths; paths containing spaces or shell metacharacters will break.
    cmd = "gcon.py r --min_cov 3 %s/subreads.fasta %s/seed_read.fasta -d %s" % \
        (bcdir, bcdir, bcdir)
    subprocess.call(cmd, shell=True)

    ## check to see if the file is missing or empty
    gconPath = "%s/g_consensus.fa" % bcdir
    if not os.path.exists(gconPath):
        logging.warning("gcon produced no output for: %s", barcode)
        return None

    # A file with zero records used to raise IndexError on list(r)[0].
    gconRecords = list(FastaReader(gconPath))
    if not gconRecords or not gconRecords[0].sequence:
        return None

    ## check to see if we are going to run quiver
    if not runner.args.noQuiver:
        # setup the blasr / sam / quiver stuff.
        logging.info("Setup regions file, now running blasr through quiver.")

        cmd = ('blasr %s %s/g_consensus.fa -nproc 1 -sam -regionTable %s/region.fofn -out ' + \
                   '%s/aligned_reads.sam') % (runner.args.inputFofn, bcdir, bcdir, bcdir)
        logging.debug(cmd)
        subprocess.call(cmd, shell=True)

        cmd = 'samtoh5 %s/aligned_reads.sam %s/g_consensus.fa %s/aligned_reads.cmp.h5' % \
            (bcdir, bcdir, bcdir)
        logging.debug(cmd)
        subprocess.call(cmd, shell=True)

        cmd = ('loadPulses %s %s/aligned_reads.cmp.h5 -byread -metrics ' + \
                   'QualityValue,InsertionQV,MergeQV,DeletionQV,DeletionTag,SubstitutionTag,' + \
                   'SubstitutionQV') % (runner.args.inputFofn, bcdir)
        logging.debug(cmd)
        subprocess.call(cmd, shell=True)

        cmd = 'cmph5tools.py sort --inPlace %s/aligned_reads.cmp.h5' % bcdir
        logging.debug(cmd)
        subprocess.call(cmd, shell=True)

        cmd = ('quiver -vv --algorithm quiver -p P4-C2.AllQVsMergingByChannelModel ' \
                   '%s/aligned_reads.cmp.h5 --outputFilename %s/q_consensus.fasta ' + \
                   '--referenceFilename %s/g_consensus.fa') % (bcdir, bcdir, bcdir)
        logging.debug(cmd)
        subprocess.call(cmd, shell=True)
        cFilename = 'q_consensus.fasta'
    else:
        cFilename = 'g_consensus.fa'

    ## hand the consensus back to the caller, which appends it to the output
    bcCons = "%s/%s/%s" % (rootDir, barcode, cFilename)
    if not os.path.exists(bcCons):
        return None

    consRecords = list(FastaReader(bcCons))
    if not consRecords:
        # e.g. quiver wrote a file without records
        return None
    return FastaRecord(barcode, consRecords[0].sequence)
示例#9
0
 def Write(self):
     """Strip alignment markup from every allele and write a genomic FASTA.

     Output goes to ``<locus>_genomic.fasta``; one record is written per
     entry of ``self._dict``, named after its key.
     """
     out_path = "{0}_genomic.fasta".format(self._locus)
     with FastaWriter(out_path) as writer:
         for allele_name, aligned in self._dict.iteritems():
             # '.', '|' and '*' mark inserts, exon/intron boundaries and
             # trimmed regions; none of them belong in the output.
             cleaned = re.sub("[.|*]", "", aligned)
             writer.writeRecord(FastaRecord(allele_name, cleaned))
示例#10
0
 def setup(self):
     """Create a named fixture record and its expected FASTA text."""
     record_name = "chr1|blah|blah"
     bases = "GATTACA" * 20

     self.name = record_name
     self.sequence = bases
     # '>' + name, followed by the sequence in rows of 60 characters.
     self.expected__str__ = "\n".join([
         ">chr1|blah|blah",
         "GATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATT",
         "ACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAG",
         "ATTACAGATTACAGATTACA",
     ])
     self.record = FastaRecord(record_name, bases)
示例#11
0
def write_temp_fasta(fastq_file):
    """
    Copy the names and sequences of a Fastq file into a temporary Fasta.

    The temp file is created with ``delete=False`` and its
    ``NamedTemporaryFile`` object is returned (path available as ``.name``).
    """
    tmp_handle = tempfile.NamedTemporaryFile(suffix='.fasta', delete=False)
    with FastaWriter(tmp_handle.name) as writer:
        for fastq_entry in FastqReader(fastq_file):
            writer.writeRecord(
                FastaRecord(fastq_entry.name, fastq_entry.sequence))
    return tmp_handle
示例#12
0
def _fastq_to_fasta(fastq_path, fasta_path):
    """Convert a fastq file to a fasta file, dropping the qualities.

    Always returns 0 once the conversion has finished.
    """
    with FastqReader(fastq_path) as reader, FastaWriter(fasta_path) as writer:
        for entry in reader:
            writer.writeRecord(FastaRecord(entry.name, entry.sequence))

    message = "Completed converting {q} to {f}".format(q=fastq_path, f=fasta_path)
    log.info(message)
    return 0
示例#13
0
def rename_record( record ):
    """Return a copy of ``record`` named without its last '|'-separated field.

    Works for FastaRecord and FastqRecord inputs; any other type is logged
    and rejected with a TypeError.
    """
    fields = record.name.strip().split('|')
    new_name = '|'.join( fields[:-1] )

    if isinstance( record, FastaRecord ):
        return FastaRecord( new_name, record.sequence )
    if isinstance( record, FastqRecord ):
        return FastqRecord( new_name, record.sequence, record.quality )

    msg = "Object must be a valid Fasta or Fastq Record"
    log.error( msg )
    raise TypeError( msg )
示例#14
0
def merge_fasta_sequences(sequences, positions):
    """Stitch slices of named source records into merged FastaRecords.

    ``positions`` holds one list of parts per merged record. Each part is a
    dict with ``'name'`` (a key into ``sequences``), ``'start'`` and
    ``'end'``; the slices are concatenated in order. Records are named
    ``Merged<i>_NumReads100`` with ``i`` starting at 1.
    """
    merged = []
    for number, part_list in enumerate(positions, 1):
        pieces = []
        for part in part_list:
            source, start, end = part['name'], part['start'], part['end']
            pieces.append(sequences[source].sequence[start:end])
        merged.append(FastaRecord(name="Merged%s_NumReads100" % number,
                                  sequence=''.join(pieces)))
    return merged
示例#15
0
def rename_imgt_fasta(input_file, output_file):
    """Copy an IMGT FASTA, replacing spaces in each header with underscores.

    Every record header must start with ``'HLA:'``; this is checked with an
    ``assert``. Sequences are written unchanged.
    """
    with FastaWriter(output_file) as writer:
        for entry in FastaReader(input_file):
            original_header = entry.header

            # Only IMGT-formatted records are expected here.
            assert original_header.startswith('HLA:')

            underscored = original_header.strip().replace(' ', '_')
            writer.writeRecord(FastaRecord(underscored, entry.sequence))
示例#16
0
def _createUnrolledReference(refIdx, reference, adp):
    """Build an unrolled template of at least ``MIN_TEMPLATE_SIZE`` bases.

    Alternates reverse-complement and forward copies of ``reference``
    (reverse-complement first), each followed by the adapter sequence, all
    upper-cased.

    Returns:
        ``(FastaRecord, adpPos)`` where ``adpPos`` holds one
        ``(refName, start, end, 0)`` tuple per adapter copy.
    """
    refName = "Reference{0}".format(refIdx)
    chunks = []
    total = 0  # running length of the template built so far
    adpPos = []
    passNum = 0
    while total < MIN_TEMPLATE_SIZE:
        if passNum % 2:
            insert = reference.sequence.upper()
        else:
            insert = reference.reverseComplement().sequence.upper()
        chunks.append(insert)
        total += len(insert)

        adpPos.append((refName, total, total + len(adp.sequence), 0))
        adapter = adp.sequence.upper()
        chunks.append(adapter)
        total += len(adapter)

        passNum += 1
    return FastaRecord(refName, "".join(chunks)), adpPos
示例#17
0
def _createForwardUnrolledReference(refIdx, reference, adps):
    """Build a forward-first unrolled template cycling through ``adps``.

    While the template is shorter than ``MIN_TEMPLATE_SIZE``, append an
    upper-cased copy of ``reference`` (forward when the adapter index is 0,
    reverse-complement otherwise) followed by the adapter at that index;
    the index wraps back to 0 after the last adapter.

    Returns:
        ``(FastaRecord, adpPos)`` where ``adpPos`` holds one
        ``(refName, start, end, adapterIndex)`` tuple per adapter copy.
    """
    refName = "Reference{0}_Forward".format(refIdx)
    chunks = []
    total = 0  # running length of the template built so far
    adpPos = []
    adpIdx = 0
    while total < MIN_TEMPLATE_SIZE:
        if adpIdx != 0:
            insert = reference.reverseComplement().sequence.upper()
        else:
            insert = reference.sequence.upper()
        chunks.append(insert)
        total += len(insert)

        adpPos.append(
            (refName, total, total + len(adps[adpIdx].sequence), adpIdx))
        adapter = adps[adpIdx].sequence.upper()
        chunks.append(adapter)
        total += len(adapter)

        adpIdx = adpIdx + 1 if adpIdx + 1 < len(adps) else 0
    return FastaRecord(refName, "".join(chunks)), adpPos
示例#18
0
def main(parser):
    """Write the CCS reads of poorly-barcoded ZMWs to a FASTA/FASTQ file.

    A ZMW counts as un-barcoded when ``row[3] / row[1]`` of its ``bestDS``
    row is below ``args.minAvgBarcodeScore``. Its CCS read is written when
    the basecalls are at least ``args.minMaxInsertLength`` long.

    Args:
        parser: argument parser whose ``parse_args()`` result provides
            ``outFile``, ``fasta``, ``barcode_fofn``, ``ccs_fofn``,
            ``minAvgBarcodeScore`` and ``minMaxInsertLength``.
    """
    args = parser.parse_args()

    # Get outfile name
    if args.outFile is None:
        outfile = 'nobarcode.fasta' if args.fasta else 'nobarcode.fastq'
    else:
        outfile = args.outFile

    # Input files
    barcodeFofn = (l.strip('\n') for l in args.barcode_fofn)
    ccsFofn = (l.strip('\n') for l in args.ccs_fofn)

    # Get the read names that are not barcoded
    no_barcode = set()
    for barcodeFile in barcodeFofn:
        bcH5 = BarcodeH5Reader(barcodeFile)
        for row in bcH5.bestDS:
            # NOTE(review): presumably row[0] is the hole number and
            # row[3] / row[1] an average score; with integer columns on
            # Python 2 this division truncates - confirm that is intended.
            if row[3] / row[1] < args.minAvgBarcodeScore:
                no_barcode.add('%s/%d' % (bcH5.movieName, row[0]))

    outh = FastaWriter(outfile) if args.fasta else FastqWriter(outfile)
    try:
        for ccsFile in ccsFofn:
            ccsH5 = BasH5Reader(ccsFile)
            for ccsRead in ccsH5.ccsReads():
                if ccsRead.zmw.zmwName not in no_barcode:
                    continue

                # Fetch the basecalls once and reuse them for the record.
                basecalls = ccsRead.basecalls()
                if len(basecalls) < args.minMaxInsertLength:
                    continue

                if args.fasta:
                    record = FastaRecord(ccsRead.zmw.zmwName, basecalls)
                else:
                    record = FastqRecord(ccsRead.zmw.zmwName, basecalls,
                                         ccsRead.QualityValue())
                outh.writeRecord(record)
    finally:
        # Close (and flush) the output even if reading an input fails.
        outh.close()
示例#19
0
def main(parser):
    """Write the subreads of poorly-barcoded ZMWs to a FASTA/FASTQ file.

    A ZMW counts as un-barcoded when ``row[3] / row[1]`` of its ``bestDS``
    row is below ``args.minAvgBarcodeScore``. For such a ZMW, subreads of at
    least ``args.minSubreadLength`` bases are written, provided its longest
    subread reaches ``args.minMaxInsertLength``.

    Args:
        parser: argument parser whose ``parse_args()`` result provides
            ``outFile``, ``fasta``, ``barcode_fofn``, ``bax_fofn``,
            ``minAvgBarcodeScore``, ``minMaxInsertLength`` and
            ``minSubreadLength``.
    """
    args = parser.parse_args()

    # Get outfile name
    if args.outFile is None:
        outfile = 'nobarcode.fasta' if args.fasta else 'nobarcode.fastq'
    else:
        outfile = args.outFile

    # Input files
    barcodeFofn = (l.strip('\n') for l in args.barcode_fofn)
    baxFofn = (l.strip('\n') for l in args.bax_fofn)

    # Get the hole numbers that are not barcoded, grouped by movie
    no_barcode = defaultdict(set)
    for barcodeFile in barcodeFofn:
        bcH5 = BarcodeH5Reader(barcodeFile)
        for row in bcH5.bestDS:
            # NOTE(review): with integer columns on Python 2 this division
            # truncates - confirm that is intended.
            if row[3] / row[1] < args.minAvgBarcodeScore:
                no_barcode[bcH5.movieName].add(row[0])

    outh = FastaWriter(outfile) if args.fasta else FastqWriter(outfile)
    try:
        for baxFile in baxFofn:
            baxH5 = BasH5Reader(baxFile)
            unbarcoded = no_barcode[baxH5.movieName]
            for holeNum in baxH5.sequencingZmws:
                if holeNum not in unbarcoded:
                    continue

                # Fetch each subread's basecalls once; the original asked
                # for them up to three times per subread.
                called = [(sr, sr.basecalls())
                          for sr in baxH5[holeNum].subreads]
                if not called:
                    continue
                longest = max(len(bases) for _, bases in called)
                if longest < args.minMaxInsertLength:
                    continue

                for subread, bases in called:
                    if len(bases) < args.minSubreadLength:
                        continue
                    if args.fasta:
                        record = FastaRecord(subread.readName, bases)
                    else:
                        record = FastqRecord(subread.readName, bases,
                                             subread.QualityValue())
                    outh.writeRecord(record)
    finally:
        # Close (and flush) the output even if reading an input fails.
        outh.close()
示例#20
0
#! /usr/bin/env python

import sys
from pbcore.io import FastaReader, FastaWriter, FastaRecord

# Copy the FASTA named by the first command-line argument to stdout, keeping
# only the first record seen for each distinct ungapped sequence.
seqs = set()
with FastaWriter(sys.stdout) as handle:
    for rec in FastaReader(sys.argv[1]):
        # '-' and '.' are gap characters; compare sequences without them.
        seq = rec.sequence.replace('-', '').replace('.', '')

        if seq not in seqs:
            # First occurrence: remember it and emit the ungapped record.
            new_rec = FastaRecord(rec.name, seq)
            seqs.add(seq)
            handle.writeRecord(new_rec)
class TestFastaRecord(object):
    """Tests for FastaRecord: construction, formatting, parsing,
    reverse-complement, length and equality."""

    def setup(self):
        """Create the fixture record plus both reverse-complement variants."""
        fwd_header = "chr1|blah|blah\tblah blah"
        rc_header = "chr1|blah|blah\tblah blah [revcomp]"
        fwd_rows = [
            "GATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATT",
            "ACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAG",
            "ATTACAGATTACAGATTACA",
        ]
        rc_rows = [
            "TGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTA",
            "ATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCT",
            "GTAATCTGTAATCTGTAATC",
        ]

        self.header = fwd_header
        self.rc_header = rc_header
        self.id = "chr1|blah|blah"
        self.comment = "blah blah"
        self.sequence = "GATTACA" * 20
        self.rc_sequence = "TGTAATC" * 20
        self.length = 140

        # '>' + header on the first line, then sequence rows of 60 chars.
        self.expected__str__ = "\n".join([">" + fwd_header] + fwd_rows)
        self.rc1_expected__str__ = "\n".join([">" + rc_header] + rc_rows)
        # rc2: forward header combined with the reverse-complement rows.
        self.rc2_expected__str__ = "\n".join([">" + fwd_header] + rc_rows)

        self.record = FastaRecord(self.header, self.sequence)
        self.rc1_record = self.record.reverseComplement()
        self.rc2_record = self.record.reverseComplement(True)

    def test__init__(self):
        """The record exposes header, sequence, id and comment."""
        built = self.record
        assert_equal(self.header, built.header)
        assert_equal(self.sequence, built.sequence)
        assert_equal(self.id, built.id)
        assert_equal(self.comment, built.comment)

    def test__str__(self):
        """str() yields the expected FASTA text."""
        rendered = str(self.record)
        assert_equal(self.expected__str__, rendered)

    def test_fromString(self):
        """fromString() recovers header and sequence from FASTA text."""
        parsed = FastaRecord.fromString(self.expected__str__)
        assert_equal(self.header, parsed.header)
        assert_equal(self.sequence, parsed.sequence)

    def test_reverse_complement1(self):
        """reverseComplement() yields the '[revcomp]' header."""
        flipped = self.rc1_record
        assert_equal(flipped.header, self.rc_header)
        assert_equal(flipped.sequence, self.rc_sequence)
        assert_equal(self.rc1_expected__str__, str(flipped))

    def test_reverse_complement2(self):
        """reverseComplement(True) keeps the original header."""
        flipped = self.rc2_record
        assert_equal(flipped.header, self.header)
        assert_equal(flipped.sequence, self.rc_sequence)
        assert_equal(self.rc2_expected__str__, str(flipped))

    def test_len(self):
        """len() is 140 for the record and both reverse complements."""
        for candidate in (self.record, self.rc1_record, self.rc2_record):
            assert_equal(self.length, len(candidate))

    def test_eq(self):
        """Records with identical header and sequence compare equal."""
        first = FastaRecord('r1', 'ACGT')
        second = FastaRecord('r1', 'ACGT')
        assert_true(first == second)

    def test_not_equal(self):
        """!= is true for differing headers and false for identical records."""
        base = FastaRecord('r1', 'ACGT')
        other_header = FastaRecord('r2', 'ACGT')
        same = FastaRecord('r1', 'ACGT')
        assert_true(base != other_header)
        assert_false(base != same)
 def test_fromString(self):
     """fromString() recovers header and sequence from FASTA text."""
     parsed = FastaRecord.fromString(self.expected__str__)
     expected_and_actual = (
         (self.header, parsed.header),
         (self.sequence, parsed.sequence),
     )
     for expected, actual in expected_and_actual:
         assert_equal(expected, actual)
示例#23
0
def callConsensus():
    """Build a per-barcode consensus and write it to ``<outDir>/consensus.fa``.

    Steps, all driven by ``runner.args``:

    1. group ZMWs by barcode, subsample and filter them;
    2. pick a seed read per barcode (a CCS read when ``ccsFofn`` is given
       and one qualifies, otherwise a raw subread);
    3. write ``seed_read.fasta``, ``subreads.fasta``, per-movie region
       files and ``region.fofn`` into ``<outDir>/<barcode>/``;
    4. run ``gconFunc`` for every barcode, serially or in a process pool;
    5. write all non-empty results to ``consensus.fa`` and, unless
       ``keepTmpDir`` is set, delete the per-barcode directories.

    Raises:
        ImportError: if ``pbtools.pbdagcon`` cannot be imported.
    """
    def makeReadAndReads(zmwsForBC):
        """Return ``(seedRead, subreads)`` for one barcode.

        ``zmwsForBC`` is a list of 3-tuples whose first item supplies the
        subreads and whose third item (possibly None) supplies the CCS
        read. Returns ``(None, None)`` when there are neither subreads nor
        CCS reads.
        """
        # Truthy CCS reads taken from the third tuple slot.
        ccsData = filter(lambda x: x,
                         [zmw.ccsRead for _, _, zmw in zmwsForBC if zmw])
        # All subreads of all ZMWs (first tuple slot), flattened to one list.
        srData = reduce(lambda x, y: x + y,
                        [zmw.subreads for zmw, _, _ in zmwsForBC if zmw], [])
        if not srData and not ccsData:
            return (None, None)

        def getSeedRead(reads,
                        lq=80,
                        uq=90,
                        sLambda=lambda x: -x.zmw.readScore):
            """Pick the best read whose length lies in a percentile band.

            Keeps reads whose length is between the ``lq`` and ``uq``
            percentiles (inclusive), sorts them by ``sLambda`` and returns
            the first one, or None when the band is empty. With the
            default key the highest ``readScore`` sorts first.
            """
            lens = map(len, reads)
            candidateRange = (n.percentile(lens, lq), n.percentile(lens, uq))
            pfReads = [
                read for read, l in zip(reads, lens)
                if l >= candidateRange[0] and l <= candidateRange[1]
            ]
            pfReads.sort(key=sLambda)
            return pfReads[0] if len(pfReads) else None

        if ccsData:
            ## all CCS reads should be the *same* length for an
            ## amplicon. Let's take the middle ones; among those the read
            ## with the most passes sorts first.
            seedRead = getSeedRead(ccsData,
                                   lq=30,
                                   uq=70,
                                   sLambda=lambda x: -x.zmw.numPasses)
            if not seedRead:
                # No CCS read qualified: fall back to a raw subread.
                seedRead = getSeedRead(srData)
                logging.info("Unable to use a CCS read for the seed read.")
            else:
                logging.info("Using a CCS read for the seed read.")
        else:
            logging.info("Using a raw read for the seed read")
            seedRead = getSeedRead(srData)

        return (seedRead, srData)

    # check to make sure that you have the necessary dependencies,
    # i.e., hgap script, blasr, etc. The imported name is not used again in
    # this function; the import only serves as an availability check.
    try:
        import pbtools.pbdagcon
    except ImportError:
        raise ImportError(
            "Unable to find dependency `pbdagcon` - please install.")

    # retrieve ZMWs by barcode
    if runner.args.barcode:
        zmwsForBCs = getZmwsForBarcodes(runner.args.barcode)
    else:
        zmwsForBCs = getZmwsForBarcodes()

    # subsample
    zmwsForBCs = {k: subsampleReads(v) for k, v in zmwsForBCs.items()}

    logging.info("unfiltered average zmws per barcode: %g" %
                 n.round(n.mean(map(len, zmwsForBCs.values()))))

    # filter ZMWs
    zmwsForBCs = filterZmws(zmwsForBCs)

    logging.info("filtered average zmws per barcode: %g" %
                 n.round(n.mean(map(len, zmwsForBCs.values()))))

    # now choose the best subread to seed the assembly
    if runner.args.ccsFofn:
        # XXX: This part depends on the filenames of the ccs and input
        # fofns; it is essentially a workaround for the fact that this
        # mapping isn't part of the API
        ccsReaders = {
            movieNameFromFile(l): BasH5Reader(l)
            for l in open(runner.args.ccsFofn).read().splitlines()
        }

        # fill in the CCS spot: extend each (zmw, lZmw) pair with the entry
        # for the same hole number from the CCS reader of the same movie.
        for k, v in zmwsForBCs.items():
            l = []
            for zmw, lZmw in v:
                r = ccsReaders[movieNameFromFile(zmw.baxH5.file.filename)]
                l.append((zmw, lZmw, r[zmw.holeNumber]))
            zmwsForBCs[k] = l
    else:
        # add none to the CCS spot.
        zmwsForBCs = {
            k: [(zmw, lZmw, None) for zmw, lZmw in v]
            for k, v in zmwsForBCs.iteritems()
        }

    readAndReads = {k: makeReadAndReads(v) for k, v in zmwsForBCs.items()}

    # remove barcodes that don't have a seed read and a set of usable reads.
    readAndReads = {k: v for k, v in readAndReads.items() if v[0] and v[1]}

    # generate FASTA files
    outDir = runner.args.outDir

    for barcode, reads in readAndReads.items():
        bcdir = '/'.join((outDir, barcode))
        if not os.path.exists(bcdir):
            os.makedirs(bcdir)

        # emit the seed to a separate file
        with FastaWriter("%s/seed_read.fasta" % bcdir) as w:
            w.writeRecord(FastaRecord(reads[0].readName, reads[0].basecalls()))

        subreads = reads[1]

        # emit the subreads to a single file
        with FastaWriter("%s/subreads.fasta" % bcdir) as w:
            for r in subreads:
                w.writeRecord(FastaRecord(r.readName, r.basecalls()))

        # construct the region file by subsetting the ZMWs that you
        # are interested in.
        nfofn = []
        for inFof, in zipFofns(runner.args.inputFofn):
            bh5 = BaxH5Reader(inFof)
            reg = bh5.file['/PulseData/Regions']
            # Subreads of this barcode that come from the current movie.
            inMovie = filter(lambda z: z.baxH5.movieName == bh5.movieName,
                             subreads)
            # Boolean mask over region rows whose first column (compared
            # against holeNumber) matches one of those subreads.
            holes = n.in1d(reg[:, 0], n.array([a.holeNumber for a in inMovie]))
            if any(holes):
                nreg = reg[holes, :]
            else:
                # No matching rows: write an empty table with the same
                # number of columns.
                nreg = n.empty(shape=(0, reg.shape[1]), dtype='int32')

            fname = "%s/%s.rgn.h5" % (bcdir, movieNameFromFile(inFof))
            nfile = h5.File(fname, 'w')
            ndset = nfile.create_dataset('/PulseData/Regions',
                                         data=nreg,
                                         maxshape=(None, None))
            copyAttributes(reg, ndset)
            nfile.close()
            nfofn.append(fname)

        # One region-file path per line; this is the region.fofn that
        # gconFunc passes to blasr via -regionTable.
        ofile = open('%s/region.fofn' % bcdir, 'w')
        ofile.writelines("\n".join(nfofn))
        ofile.close()

    ## call gcon; barcodes for which gconFunc returns None are dropped
    outDirs = [(outDir, k) for k in readAndReads.keys()]
    if runner.args.nProcs == 1:
        outFasta = filter(lambda z: z, map(gconFunc, outDirs))
    else:
        pool = Pool(runner.args.nProcs)
        outFasta = filter(lambda z: z, pool.map(gconFunc, outDirs))

    ## write the results
    with FastaWriter('/'.join((outDir, "consensus.fa"))) as w:
        for r in outFasta:
            w.writeRecord(r)

    ## optionally cleanup
    if not runner.args.keepTmpDir:
        for barcode, reads in readAndReads.items():
            bcdir = '/'.join((outDir, barcode))
            shutil.rmtree(bcdir)
示例#24
0
 def setup_class(cls):
     """Build the shared record and both reverse-complement variants once."""
     forward = FastaRecord(cls.HEADER, cls.SEQUENCE)
     cls.record = forward
     cls.rc1_record = forward.reverseComplement()
     cls.rc2_record = forward.reverseComplement(True)
示例#25
0
 def test_fromString(self):
     """fromString() recovers header and sequence from FASTA text."""
     parsed = FastaRecord.fromString(self.EXPECTED__STR__)
     parsed_header = parsed.header
     assert self.HEADER == parsed_header
     parsed_sequence = parsed.sequence
     assert self.SEQUENCE == parsed_sequence
示例#26
0
 def test_not_equal(self):
     """!= is true for differing headers and false for identical records."""
     shared_sequence = 'ACGT'
     base = FastaRecord('r1', shared_sequence)
     other_header = FastaRecord('r2', shared_sequence)
     same = FastaRecord('r1', shared_sequence)
     assert base != other_header
     assert not base != same
示例#27
0
    chrm = fa[tid]

    # Search for restriction sites near
    fiveP = chrm.sequence[s - 5:s + 6]
    threeP = chrm.sequence[e - 5:e + 6]
    fiveEco, fiveBam = HasEcoR1(fiveP), HasBamH1(fiveP)
    threeEco, threeBam = HasEcoR1(threeP), HasBamH1(threeP)

    # Count and summarize any PolyA/T regions
    region = chrm.sequence[s:e]
    AT = LargestAsAndTs(region)
    maxAT = 0 if len(AT) == 0 else max(AT)

    # Check for Guide RNA matches
    OutFiveP = chrm.sequence[s - 33:s + 10]
    InFiveP = FastaRecord(
        "tmp", chrm.sequence[s - 10:s + 33]).reverseComplement().sequence
    InThreeP = chrm.sequence[e - 33:e + 10]
    OutThreeP = FastaRecord(
        "tmp", chrm.sequence[e - 10:e + 33]).reverseComplement().sequence
    k1, s1, a1 = ScoreCas9SiteSides(OutFiveP, InFiveP)
    k2, s2, a2 = ScoreCas9SiteSides(OutThreeP, InThreeP)

    # Summary columns
    hasPolyA = "T" if (polyA == "T" or maxAT > 0) else "F"
    hasLeft = "T" if (fiveEco == "T" or fiveBam == "T" or k1 != "N/A") else "F"
    hasRight = "T" if (threeEco == "T" or threeBam == "T"
                       or k2 != "N/A") else "F"

    print "{0},{1},{2},{3},{4},{5},{6},{7},{8},{9},{10},{11},{12},{13},{14},{15},{16},{17},{18},{19},{20},{21}".format(
        hn, tid, s, e, target, polyA, len(AT), maxAT, sum(AT), fiveEco,
        fiveBam, threeEco, threeBam, k1, s1, a1, k2, s2, a2, hasPolyA, hasLeft,
示例#28
0
 def test_not_equal(self):
     """!= is true for differing headers and false for identical records."""
     shared_sequence = 'ACGT'
     base = FastaRecord('r1', shared_sequence)
     other_header = FastaRecord('r2', shared_sequence)
     same = FastaRecord('r1', shared_sequence)
     assert_true(base != other_header)
     assert_false(base != same)
示例#29
0
 def test_eq(self):
     """Records built from the same name and sequence compare equal."""
     first = FastaRecord('r1', 'ACGT')
     second = FastaRecord('r1', 'ACGT')
     assert_true(first == second)
示例#30
0
 def test_fromString(self):
     """fromString() recovers name and sequence from FASTA text."""
     parsed = FastaRecord.fromString(self.expected__str__)
     expected_and_actual = (
         (self.name, parsed.name),
         (self.sequence, parsed.sequence),
     )
     for expected, actual in expected_and_actual:
         assert_equal(expected, actual)
示例#31
0
 def test_eq(self):
     """Records built from the same header and sequence compare equal."""
     first = FastaRecord('r1', 'ACGT')
     second = FastaRecord('r1', 'ACGT')
     assert_true(first == second)
示例#32
0
class TestFastaRecord:
    """Tests for FastaRecord: construction, formatting, parsing,
    reverse-complement, length and equality."""

    def setup(self):
        """Prepare the forward record and its two reverse complements."""
        plain_header = "chr1|blah|blah\tblah blah"
        revcomp_header = "chr1|blah|blah\tblah blah [revcomp]"
        plain_rows = [
            "GATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATT",
            "ACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAG",
            "ATTACAGATTACAGATTACA",
        ]
        revcomp_rows = [
            "TGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTA",
            "ATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCT",
            "GTAATCTGTAATCTGTAATC",
        ]

        self.header = plain_header
        self.rc_header = revcomp_header
        self.id = "chr1|blah|blah"
        self.comment = "blah blah"
        self.sequence = "GATTACA" * 20
        self.rc_sequence = "TGTAATC" * 20
        self.length = 140

        # '>' + header on the first line, then sequence rows of 60 chars.
        self.expected__str__ = "\n".join([">" + plain_header] + plain_rows)
        self.rc1_expected__str__ = "\n".join(
            [">" + revcomp_header] + revcomp_rows)
        # rc2: forward header combined with the reverse-complement rows.
        self.rc2_expected__str__ = "\n".join(
            [">" + plain_header] + revcomp_rows)

        self.record = FastaRecord(self.header, self.sequence)
        self.rc1_record = self.record.reverseComplement()
        self.rc2_record = self.record.reverseComplement(True)

    def test__init__(self):
        """The record exposes header, sequence, id and comment."""
        built = self.record
        assert_equal(self.header, built.header)
        assert_equal(self.sequence, built.sequence)
        assert_equal(self.id, built.id)
        assert_equal(self.comment, built.comment)

    def test__str__(self):
        """str() yields the expected FASTA text."""
        rendered = str(self.record)
        assert_equal(self.expected__str__, rendered)

    def test_fromString(self):
        """fromString() recovers header and sequence from FASTA text."""
        parsed = FastaRecord.fromString(self.expected__str__)
        assert_equal(self.header, parsed.header)
        assert_equal(self.sequence, parsed.sequence)

    def test_reverse_complement1(self):
        """reverseComplement() yields the '[revcomp]' header."""
        flipped = self.rc1_record
        assert_equal(flipped.header, self.rc_header)
        assert_equal(flipped.sequence, self.rc_sequence)
        assert_equal(self.rc1_expected__str__, str(flipped))

    def test_reverse_complement2(self):
        """reverseComplement(True) keeps the original header."""
        flipped = self.rc2_record
        assert_equal(flipped.header, self.header)
        assert_equal(flipped.sequence, self.rc_sequence)
        assert_equal(self.rc2_expected__str__, str(flipped))

    def test_len(self):
        """len() is 140 for the record and both reverse complements."""
        for candidate in (self.record, self.rc1_record, self.rc2_record):
            assert_equal(self.length, len(candidate))

    def test_eq(self):
        """Records with identical header and sequence compare equal."""
        first = FastaRecord('r1', 'ACGT')
        second = FastaRecord('r1', 'ACGT')
        assert_true(first == second)

    def test_not_equal(self):
        """!= is true for differing headers and false for identical records."""
        base = FastaRecord('r1', 'ACGT')
        other_header = FastaRecord('r2', 'ACGT')
        same = FastaRecord('r1', 'ACGT')
        assert_true(base != other_header)
        assert_false(base != same)
示例#33
0
def _to_fasta_record(header, seq):
    """Wrap a header/sequence pair in a FastaRecord."""
    record = FastaRecord(header, seq)
    return record