Exemplo n.º 1
0
def call_starcode_fastq_file(fastq):
    '''Extract barcodes and spikes from a fastq file and run starcode.

    The sequence line of every 4-line record of `fastq` is matched
    against the GFP pattern (up to 4 errors) and, when that fails,
    against the spike pattern (up to 2 errors). The read is cut at the
    start of the match and the prefix is kept when its length is between
    MIN_BRCD and MAX_BRCD (inclusive). `starcode` is then run once on
    the barcodes and once on the spikes.

    Side effects: the names of the two starcode output files are
    appended to the module-level lists `processed` (barcodes) and
    `spikessed` (spikes). The function returns None.

    NOTE(review): the output names are built from `fname`, which is not
    a parameter of this function; presumably it is a module-level
    variable set by the caller -- confirm that it is not meant to be
    `fastq`.
    '''
    MIN_BRCD = 15
    MAX_BRCD = 25

    brcd_outfname = fname + '_barcodes.tsv'
    spk_outfname = fname + '_spikes.tsv'

    GFP = seeq.compile('CATGCTAGTTGTGGTTTGTCCAAACT', 4)
    SPIKE = seeq.compile('CATGATTACCCTGTTATC', 2)
    barcode_tempf = tempfile.NamedTemporaryFile(delete=False)
    spike_tempf = tempfile.NamedTemporaryFile(delete=False)

    try:
        try:
            with gzopen(fastq) as f:
                for lineno, line in enumerate(f):
                    # The sequence is the second line of each record.
                    if lineno % 4 != 1:
                        continue
                    hit = GFP.match(line)
                    if hit is not None:
                        outf = barcode_tempf
                    else:
                        hit = SPIKE.match(line)
                        if hit is None:
                            continue
                        outf = spike_tempf
                    # Cut the read where the first match starts.
                    pos = hit.matchlist[0][0]
                    if MIN_BRCD <= pos <= MAX_BRCD:
                        outf.write(line[:pos] + '\n')
        finally:
            # Flush the temporary files before starcode reads them.
            barcode_tempf.close()
            spike_tempf.close()

        for tempf, outfname in ((barcode_tempf, brcd_outfname),
                                (spike_tempf, spk_outfname)):
            subprocess.call([
                'starcode',
                '-t4',
                '-i',
                tempf.name,
                '-o',
                outfname,
            ])
    finally:
        # Delete temporary files, also when parsing or starcode failed
        # (they were created with delete=False).
        os.unlink(barcode_tempf.name)
        os.unlink(spike_tempf.name)

    # Save the names of the files processed.
    processed.append(brcd_outfname)
    spikessed.append(spk_outfname)
    return
Exemplo n.º 2
0
def call_starcode_fastq_file(fastq):
   '''Split the reads of `fastq` into barcodes and spikes, then cluster.

   Reads matching the GFP pattern go to the barcode collection, reads
   matching only the spike pattern go to the spike collection. In both
   cases the read is cut at the start of the match and kept only when
   the prefix is 15 to 25 nucleotides long. `starcode` is run on each
   collection and the output names are appended to the module-level
   lists `processed` and `spikessed`.

   NOTE(review): `fname` is not defined in this function; presumably a
   module-level variable -- confirm.
   '''
   MIN_BRCD = 15
   MAX_BRCD = 25

   brcd_outfname = fname + '_barcodes.tsv'
   spk_outfname = fname + '_spikes.tsv'

   gfp_pattern = seeq.compile('CATGCTAGTTGTGGTTTGTCCAAACT', 4)
   spike_pattern = seeq.compile('CATGATTACCCTGTTATC', 2)
   barcode_tempf = tempfile.NamedTemporaryFile(delete=False)
   spike_tempf = tempfile.NamedTemporaryFile(delete=False)

   with gzopen(fastq) as reads:
      for lineno, read in enumerate(reads):
         # Only the second line of each fastq record holds the sequence.
         if lineno % 4 != 1:
            continue
         # Try GFP first, fall back on the spike.
         hit, sink = gfp_pattern.match(read), barcode_tempf
         if hit is None:
            hit, sink = spike_pattern.match(read), spike_tempf
         if hit is None:
            continue
         cut = hit.matchlist[0][0]
         if MIN_BRCD <= cut <= MAX_BRCD:
            sink.write(read[:cut] + '\n')
   barcode_tempf.close()
   spike_tempf.close()

   # Cluster the barcodes, then the spikes.
   barcode_cmd = ['starcode', '-t4', '-i', barcode_tempf.name,
                  '-o', brcd_outfname]
   subprocess.call(barcode_cmd)
   spike_cmd = ['starcode', '-t4', '-i', spike_tempf.name,
                '-o', spk_outfname]
   subprocess.call(spike_cmd)

   # The temporary files were created with delete=False.
   os.unlink(barcode_tempf.name)
   os.unlink(spike_tempf.name)

   # Record the names of the output files.
   processed.append(brcd_outfname)
   spikessed.append(spk_outfname)
   return
Exemplo n.º 3
0
    def __init__(self, cst, fwd, rev, samples, spikes=None):
        '''Validate the demultiplexing specifications and store them.

        Arguments:
          cst: constant sequence, compiled as a seeq pattern with an
              error budget of one fifth of its length.
          fwd, rev: sequences of the forward and reverse primers.
          samples: dictionary whose keys are pairs of integers (a, b),
              used as suffix lengths of `fwd` and `rev` respectively.
          spikes: optional dictionary mapping spike names to spike
              sequences (default: no spikes).

        Raises:
          BadSpecifications: when two primers or two spikes are too
              close to each other, when a spike does not match the
              constant part, or when the sample keys are inconsistent
              with the primer lengths.
        '''

        # Use a fresh dictionary instead of a shared mutable default.
        if spikes is None:
            spikes = dict()

        # Base multiplexing information. The error budget must be an
        # integer, hence the explicit floor division.
        self.cst = seeq.compile(cst, len(cst) // 5)
        self.samples = samples

        # Store the sequences of the primers used
        # in a frozen set for reference.
        self.Lseq = frozenset([fwd[-a:] for (a, b) in self.samples.keys()])
        self.Rseq = frozenset([rev[-b:] for (a, b) in self.samples.keys()])

        # Check that the primers are not too close to each
        # other, otherwise it will be impossible to demultiplex.
        # If all the primers are at a distance greater than
        # twice the cut-off, then no double hit is possible.
        for (a, b) in combinations(self.Lseq, 2):
            if dist_less_than(a, b, 2 * len(fwd) // 5):
                raise BadSpecifications('primer sequences too close', a, b)
        for (a, b) in combinations(self.Rseq, 2):
            if dist_less_than(a, b, 2 * len(rev) // 5):
                raise BadSpecifications('primer sequences too close', a, b)

        # Check that the spikes (if present) are not too close to
        # each other to avoid multiple matches. This does not
        # guarantee that the matches are unique but we have to
        # accommodate the experiments.
        # NOTE(review): an earlier comment required a distance of at
        # least 2 -- confirm the cut-off semantics of dist_less_than.
        for (a, b) in combinations(spikes.values(), 2):
            if dist_less_than(a, b, 1):
                raise BadSpecifications('spike sequences too close', a, b)

        # Check that the spikes can be found by looking for the
        # constant part.
        for spseq in spikes.values():
            if self.cst.match(spseq) is None:
                raise BadSpecifications('spike too divergent', spseq)

        # Spikes are not too close. Replace the values of the
        # dictionary by a compiled seeq pattern allowing 1 error.
        self.spikes = dict()
        for spname in spikes:
            self.spikes[spname] = seeq.compile(spikes[spname], 1)

        # Check that the sample specification corresponds to
        # the primers used for the PCR.
        if len(fwd) != max([a for (a, b) in samples.keys()]):
            raise BadSpecifications('inconsistent sample keys')
        if len(rev) != max([b for (a, b) in samples.keys()]):
            raise BadSpecifications('inconsistent sample keys')
Exemplo n.º 4
0
   def test_matchSuffix(self):
      '''Check matchSuffix() on a text without and with a match.

      Without a match the result is the empty string. With a match,
      the second argument decides whether the matched part is included
      in the returned suffix.
      '''
      pattern = seeq.compile("CGCTAATTAATGGAAT", 3)

      miss = "ATGCTGATGCTGGGGG"
      hit = "GGGGCGCTAATAATGGAATGGGG"

      # (text, second argument of matchSuffix, expected result)
      cases = (
            (miss, True, ""),
            (miss, False, ""),
            (hit, True, "CGCTAATAATGGAATGGGG"),
            (hit, False, "GGGG"),
      )
      for text, flag, expected in cases:
         self.assertEqual(pattern.matchSuffix(text, flag), expected)
Exemplo n.º 5
0
def extract_reads_from_PE_fastq(fname_iPCR_PE1, fname_iPCR_PE2):
    """Write a fasta file associating barcodes to genomic sequences.

    The two paired-end fastq files are read in parallel. The barcode is
    the part of the forward read before the first "CATG"; the genomic
    sequence is the part of the reverse read after the transposon, up to
    the first "CATG" if any. Pairs are written as fasta records with the
    barcode as header. Returns the name of the fasta file; if that file
    already exists, nothing is processed."""

    MIN_BRCD = 15
    MAX_BRCD = 25
    MIN_GENOME = 15

    # End of the transposon, matched on the reverse read with a
    # Levenshtein automaton allowing up to 5 mismatches/indels. On the
    # forward read the only known sequence is the CATG after the
    # barcode, which is matched exactly.
    transposon = seeq.compile('TGTATGTAAACTTCCGACTTCAACTGTA', 5)

    # Derive the output name from the name of the forward file.
    fname_fasta = re.sub(r'[\_F][w\_].fastq(\.gz)?', 'iPCR.fasta',
                         fname_iPCR_PE1)
    if fname_fasta == fname_iPCR_PE1:
        # Nothing was substituted: add a suffix to avoid a collision.
        fname_fasta = fname_iPCR_PE1 + '.fasta'

    # Process the reads only if the output does not exist yet.
    if not os.path.exists(fname_fasta):
        with gzopen(fname_iPCR_PE1) as f, gzopen(fname_iPCR_PE2) as g, \
                open(fname_fasta, 'w') as outf:
            for lineno, (fwd_read, rev_read) in enumerate(izip(f, g)):
                # The sequence is the second line of each record.
                if lineno % 4 != 1:
                    continue
                # Without "CATG" the whole read is taken and rejected
                # below for being too long.
                brcd = fwd_read.rstrip().split('CATG')[0]
                if len(brcd) <= MIN_BRCD or len(brcd) >= MAX_BRCD:
                    continue
                # Part of the reverse read after the transposon.
                downstream = transposon.matchSuffix(rev_read, False)
                if not downstream:
                    continue
                genome = downstream.split('CATG')[0].rstrip()
                if len(genome) >= MIN_GENOME:
                    outf.write('>%s\n%s\n' % (brcd, genome))

    return fname_fasta
Exemplo n.º 6
0
def extract_fingerprint_and_GATCGATC(f):
    '''Print the fingerprint and the GATCGATC readout of every line.

    The design of the oligo is the following:
       o(12) GATCGATC o(12) CGCACTAATGAATTCGTTGC u(20)
    where "o" are oligo-specific random nucleotides and "u" are random
    UMI nucleotides introduced during the linear amplification or the
    PCR.

    The "fingerprint" is the concatenation of the random nucleotides
    with a constant sequence, i.e.
       o(12) o(12) AGATACAGAGATAATACA u(20).

    For every line of `f` that can be parsed, the fingerprint and the
    sequence found at the GATCGATC position are written to stdout,
    separated by a tab. Lines that cannot be parsed are skipped.
    '''

    constant = seeq.compile(r'CGCACTAATGAATTCGTTGCA', 4)
    readout_site = seeq.compile(r'GATCGATC', 1)

    for line in f:
        # Split around the constant part: the left token holds the
        # oligo-specific nucleotides plus GATCGATC, the right token
        # holds the UMI. A missing match or an unexpected number of
        # tokens discards the line.
        try:
            oligo, ignore, umi = constant.match(line.rstrip()).tokenize()
        except (ValueError, AttributeError):
            continue
        # Target length is 32. Allow at most 2 indels.
        if len(oligo) < 30 or len(oligo) > 34:
            continue

        # Locate GATCGATC within positions 10 to 22 of the oligo.
        try:
            start, end, ignore = readout_site.match(oligo[10:22]).matchlist[0]
        except AttributeError:
            continue
        left = 10 + start
        right = 10 + end

        # Output fingerprint and GATCGATC
        fingerprint = oligo[:left] + oligo[right:] + 'AGATACAGAGATAATACA' + umi
        sys.stdout.write('%s\t%s\n' % (fingerprint, oligo[left:right]))
Exemplo n.º 7
0
def extract_reads_from_PE_fastq(fname_iPCR_PE1, fname_iPCR_PE2):
   """Extract (barcode, genomic sequence) pairs from paired-end reads.

   The barcode comes from the first file (sequence before the first
   "CATG") and is kept only if the read of the second file contains the
   transposon. The result is written in fasta format and the name of
   the fasta file is returned. An existing fasta file is not remade."""

   MIN_BRCD = 15
   MAX_BRCD = 25
   MIN_GENOME = 15

   # The end of the transposon is searched on the reverse read with a
   # Levenshtein automaton, allowing up to 5 mismatches/indels. The
   # CATG after the barcode on the forward read is matched exactly.
   pT2 = seeq.compile('TGTATGTAAACTTCCGACTTCAACTGTA', 5)

   # Name of the output file, derived from the name of the first file.
   substituted = re.sub(r'read[1-2].fastq(\.gz)?', 'iPCR.fasta',
         fname_iPCR_PE1)
   if substituted == fname_iPCR_PE1:
      # Substitution failed, append '.fasta' to avoid name collision.
      fname_fasta = fname_iPCR_PE1 + '.fasta'
   else:
      fname_fasta = substituted

   # Skip if file exists.
   if os.path.exists(fname_fasta):
      return fname_fasta

   with gzopen(fname_iPCR_PE1) as f, gzopen(fname_iPCR_PE2) as g, \
      open(fname_fasta, 'w') as outf:
      for lineno, pair in enumerate(izip(f, g)):
         # Take sequence only.
         if lineno % 4 == 1:
            line1, line2 = pair
            # Keep what precedes the first "CATG". Without "CATG" the
            # barcode is the whole read, rejected for being too long.
            brcd = line1.rstrip().partition('CATG')[0]
            if MIN_BRCD < len(brcd) < MAX_BRCD:
               # Part of the reverse read after the transposon.
               genome = pT2.matchSuffix(line2, False)
               if genome:
                  # Stop at the first "CATG", if any.
                  genome = genome.partition('CATG')[0].rstrip()
                  if len(genome) >= MIN_GENOME:
                     outf.write('>%s\n%s\n' % (brcd, genome))

   return fname_fasta
Exemplo n.º 8
0
def main(f):
    '''Print barcode, index1, index2 and UMI for the records of `f`.

    `f` is iterated as a fastq file (4 lines per record). index2 is
    taken from the header line (everything after the last "+"). The
    sequence line is split around the constant sequence (up to 5
    errors): the left part is the barcode and the right part holds the
    UMI (first 4 nucleotides) followed by index1. Records without a
    match, with an unexpected number of tokens, with a right part
    shorter than 8 or with a barcode shorter than 14 are skipped.
    '''
    constant = seeq.compile('TATAGTGAGTCGTATTAAAAGCGAAAGGGAAACCAGAGGAGC', 5)
    for lineno, line in enumerate(f):
        position = lineno % 4
        if position == 0:
            # Header line: keep what follows the last "+".
            index2 = re.sub(r'.*\+', '', line.rstrip())
            continue
        if position != 1:
            continue
        # Sequence line.
        hit = constant.match(line.rstrip())
        if hit is None:
            continue
        try:
            barcode, ignore, tail = hit.tokenize()
        except ValueError:
            continue
        if len(tail) >= 8 and len(barcode) >= 14:
            fields = (barcode, tail[4:12], index2, tail[:4])
            sys.stdout.write('%s %s %s %s\n' % fields)
Exemplo n.º 9
0
def call_starcode_on_fastq_file(fname_fastq):
    '''Extract the gDNA/cDNA reads and the spikes and run starcode.

    Reads matching the spike pattern (up to 2 errors) are cut at the
    start of the match and collected as spikes; for all the other
    reads, the first 20 characters of the sequence line are collected
    as barcodes. `starcode` is run with `--print-clusters` on each
    collection.

    Arguments:
      fname_fastq: name of the fastq file (opened with gzopen).

    Returns:
      The pair (barcode output name, spike output name). If both
      files already exist they are returned without processing.
    '''
    brcd_outfname = re.sub(r'\.fastq.*', '_starcode.txt', fname_fastq)
    spk_outfname = re.sub(r'\.fastq.*', '_spikes_starcode.txt', fname_fastq)
    # Substitution failed, append the suffix to avoid name collision.
    if brcd_outfname == fname_fastq:
        brcd_outfname = fname_fastq + '_starcode.txt'
    if spk_outfname == fname_fastq:
        spk_outfname = fname_fastq + '_spikes_starcode.txt'

    if os.path.exists(brcd_outfname) and os.path.exists(spk_outfname):
        return (brcd_outfname, spk_outfname)

    SPIKE = seeq.compile('CATGATTACCCTGTTATC', 2)
    barcode_tempf = tempfile.NamedTemporaryFile(delete=False)
    spike_tempf = tempfile.NamedTemporaryFile(delete=False)

    try:
        try:
            with gzopen(fname_fastq) as f:
                for lineno, line in enumerate(f):
                    # The sequence is the second line of each record.
                    if lineno % 4 != 1:
                        continue
                    spike = SPIKE.match(line)
                    if spike is not None:
                        spike_tempf.write(
                            line[:spike.matchlist[0][0]] + '\n')
                    else:
                        barcode_tempf.write(line[:20] + '\n')
        finally:
            # Flush the temporary files before starcode reads them.
            barcode_tempf.close()
            spike_tempf.close()

        # Call `starcode` on the barcodes, then on the spikes.
        for tempf, outfname in ((barcode_tempf, brcd_outfname),
                                (spike_tempf, spk_outfname)):
            status = subprocess.call([
                'starcode',
                '-t4',
                '--print-clusters',
                '-i',
                tempf.name,
                '-o',
                outfname,
            ])
            # Any nonzero status is a failure (negative values mean
            # that the process was killed by a signal).
            if status != 0:
                sys.stderr.write(
                    "Error during Starcode call on: %s\n" % outfname)
    finally:
        # Delete temporary files, also when something went wrong.
        os.unlink(barcode_tempf.name)
        os.unlink(spike_tempf.name)

    return (brcd_outfname, spk_outfname)
Exemplo n.º 10
0
                    help="iPCR reverse reads (gzipped+fastq format)")
params = parser.parse_args()

fbrcd = open(params.barcodes, 'w')
fseqs = open(params.reads, 'w')

# A negative distance means that the number of tolerated errors is
# derived from the length of the sequence (at least 1); otherwise the
# value given by the user is taken as is.
if params.bdist < 0:
    bdist = int(max(1, round(0.2 * len(params.b))))
else:
    bdist = params.bdist
if params.ldist < 0:
    ldist = int(max(1, round(0.1 * len(params.l))))
else:
    # Use the LTR distance here (not the HIVRE distance `rdist`).
    ldist = params.ldist

# BHIVE seqs:
# T7 promoter (-b) TATAGTGAGTCGTA
# LTR sequence (-l) AGCCCTTCCA
# HIVRE sequence (-r) CGCTTTTAA
T7 = seeq.compile(params.b, bdist)
LTR = seeq.compile(params.l, ldist)
# The HIVRE pattern is compiled only when a sequence is given.
HIVRE = None
if params.r:
    HIVRE = seeq.compile(params.r, params.rdist)
# Buffer holding the 4 lines of the current record of the reverse file.
fqline = [""] * 4

with gzip.open(params.ipcr_forward) as r1, gzip.open(
        params.ipcr_reverse) as r2:
    for lineno, line in enumerate(r1):
        if lineno % 4 != 1: continue
        for i in range(0, 4):
            fqline[i] = r2.readline()
        # Match 5' LTR on reverse read (required, there must be 20 nt).
        l = LTR.matchBest(fqline[1])
        if not l:
#!/usr/bin/env python
# -*- coding:utf-8 -*-

import sys
import seeq

from gzopen import gzopen

# T7 promoter sequence, matched with up to 3 errors.
T7 = seeq.compile('TATAGTGAGTCGTATTAAAA', 3)

def main(f, indx):
   '''Print the barcodes of the reads of `f` carrying the index `indx`.

   `f` is iterated as a fastq file (4 lines per record). The sequence
   line is split around the T7 match: the part before is the barcode
   and the part after must start with `indx`. Barcodes are printed
   when their length is strictly between 14 and 25. Reads that cannot
   be parsed are silently skipped.
   '''
   for lineno, read in enumerate(f):
      # The sequence is the second line of each record.
      if lineno % 4 == 1:
         try:
            barcode, suffix = T7.match(read).split()
            if 14 < len(barcode) < 25 and suffix.startswith(indx):
               print(barcode)
         except (AttributeError, ValueError, IndexError):
            # No match, or unexpected split: go to the next read.
            pass

# Script entry point: the first command line argument is the input
# file (opened with gzopen), the second is the index given to main().
if __name__ == '__main__':
   with gzopen(sys.argv[1]) as f:
      main(f, sys.argv[2])
Exemplo n.º 12
0
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import pdb
import seeq
import sys
from gzopen import gzopen
from itertools import izip

# End of the transposon, matched with up to 5 errors.
pT2 = seeq.compile('TGTATGTAAACTTCCGACTTCAACTGTA', 5)
# Length limits of the barcode and of the genomic sequence.
MIN_BRCD = 15
MAX_BRCD = 25
MIN_GENOME = 15

# The output name is the part of the first argument before the first
# underscore, with the extension ".tomap".
outfname = sys.argv[1].split("_")[0] +  ".tomap"

with gzopen(sys.argv[1]) as f, gzopen(sys.argv[2]) as g, \
      open(outfname, 'w') as outf:
      # Aggregate iterator of f,g iterators -> izip(f,g).
      for lineno,(line1,line2) in enumerate(izip(f,g)):
         # Take sequence only.
         if lineno % 4 != 1: continue
         # Split on "CATG" and take the first fragment.
         # In case there is no "CATG", the barcode will be rejected
         # for being too long.
         brcd = line1.rstrip().split('CATG')[0]
         if not MIN_BRCD < len(brcd) < MAX_BRCD: continue
         # Use a Levenshtein automaton to find the transposon.
         genome = pT2.matchSuffix(line2, False)
         if not genome: continue
         # Select the region from the end of the transposon to
Exemplo n.º 13
0
def _demultiplex(seqfile, T7, indices, outputs):
    '''Copy the fastq records of `seqfile` with a known index.

    `indices` maps an index to an output file name and `outputs` maps
    an output file name to an open file. Records whose index (as
    returned by getindex) is not in `indices` are dropped.
    '''
    with gzopen(seqfile) as reads:
        # Read fastq file.
        for lineno, line in enumerate(reads):
            # The module-level buffer `lines` holds the current record.
            lines[lineno % 4] = line
            if lineno % 4 == 3:
                index = getindex(T7, lines[1])
                if index in indices:
                    out = outputs[indices[index]]
                    for l in lines:
                        out.write(l)


def main(fname):
    '''Demultiplex the DNA and RNA fastq files described in `fname`.

    The parameter file contains `key = value` lines (stored in the
    module-level dictionary `params`), section headers `dna-index:` and
    `rna-index:`, and, within a section, `index, output file` lines
    (stored in the module-level `IND[0]` for DNA and `IND[1]` for RNA).
    Empty lines and lines starting with '#' are ignored. The parameters
    'bfs', 'dist', 'dna-seqfile' and 'rna-seqfile' are mandatory.

    The records of each sequencing file are then copied to the gzipped
    output file associated to their index.

    Exits with status 1 (after printing a message) if the parameter
    file is malformed or incomplete.
    '''
    mode = 2

    # Parse parameter file
    with open(fname) as cf:
        for lineno, line in enumerate(cf, 1):
            # Remove all whitespace.
            line = ''.join(line.split())
            if line == '' or line[0] == '#':
                continue
            elif len(line.split('=')) == 2:
                [param, value] = line.split('=')
                params[param] = value
            elif len(line.split(':')) == 2:
                section = line.split(':')[0]
                if section == 'dna-index':
                    mode = 0
                elif section == 'rna-index':
                    mode = 1
                else:
                    print("error in parameter file: Uknown section '{}' in {}, line {}.".format(
                        section, fname, lineno))
                    sys.exit(1)
            elif mode < 2 and len(line.split(',')) == 2:
                [index, fout] = line.split(',')
                if index in IND[mode]:
                    print("duplicate index '{}' in {}, line {}".format(
                        index, fname, lineno))
                    sys.exit(1)
                IND[mode][index] = fout
            else:
                print("unknown parameter '{}' in {}, line {}".format(
                    line, fname, lineno))
                sys.exit(1)

    # Check parameters
    error = 0
    for required in ('bfs', 'dist', 'dna-seqfile', 'rna-seqfile'):
        if required not in params:
            print("missing parameter in {}: '{}' must be defined.".format(
                fname, required))
            error = 1
    if error:
        sys.exit(1)

    # Compile flanking sequence
    T7 = seeq.compile(params['bfs'], int(params['dist']))

    FDICT = [dict(), dict()]
    try:
        # Open every output file once per mode. This is done within
        # the try block so that the files already opened get closed
        # if a later one cannot be opened.
        for which in (0, 1):
            for out_name in IND[which].values():
                if out_name not in FDICT[which]:
                    FDICT[which][out_name] = gzip.open(out_name, 'wb')

        # Demultiplex DNA indices
        _demultiplex(params['dna-seqfile'], T7, IND[0], FDICT[0])

        # Demultiplex RNA indices
        _demultiplex(params['rna-seqfile'], T7, IND[1], FDICT[1])
    finally:
        for out in FDICT[0].values():
            out.close()
        for out in FDICT[1].values():
            out.close()
Exemplo n.º 14
0
def call_starcode_on_fastq_file(fname_fastq):
   '''Cluster the barcodes and the spikes of a fastq file with starcode.

   Reads matching the GFP pattern are collected as barcodes, reads
   matching only the spike pattern are collected as spikes; in both
   cases the read is cut at the start of the match and kept when the
   prefix is 15 to 25 nucleotides long. Returns the names of the
   barcode and spike output files. Outputs that already exist are not
   recomputed.
   '''
   MIN_BRCD = 15
   MAX_BRCD = 25

   # Output names: replace the fastq extension, or append the suffix
   # if there is nothing to replace.
   brcd_outfname = re.sub(r'\.fastq.*', '_starcode.txt', fname_fastq)
   spk_outfname = re.sub(r'\.fastq.*', '_spikes_starcode.txt', fname_fastq)
   if brcd_outfname == fname_fastq:
      brcd_outfname = fname_fastq + '_starcode.txt'
   if spk_outfname == fname_fastq:
      spk_outfname = fname_fastq + '_spikes_starcode.txt'
   outputs = (brcd_outfname, spk_outfname)

   # Nothing to do if both outputs are already there.
   if all(os.path.exists(name) for name in outputs):
      return outputs

   gfp_pattern = seeq.compile('CATGCTAGTTGTGGTTTGTCCAAACT', 4)
   spike_pattern = seeq.compile('CATGATTACCCTGTTATC', 2)
   barcode_tempf = tempfile.NamedTemporaryFile(delete=False)
   spike_tempf = tempfile.NamedTemporaryFile(delete=False)
   with gzopen(fname_fastq) as reads:
      for lineno, read in enumerate(reads):
         # The sequence is the second line of each record.
         if lineno % 4 != 1:
            continue
         # Try GFP first, fall back on the spike.
         hit, sink = gfp_pattern.match(read), barcode_tempf
         if hit is None:
            hit, sink = spike_pattern.match(read), spike_tempf
         if hit is None:
            continue
         cut = hit.matchlist[0][0]
         if MIN_BRCD <= cut <= MAX_BRCD:
            sink.write(read[:cut] + '\n')
   barcode_tempf.close()
   spike_tempf.close()

   # Call `starcode` for every output that does not exist yet.
   for tempf, destination in zip((barcode_tempf, spike_tempf), outputs):
      if os.path.exists(destination):
         continue
      subprocess.call(
         ['starcode', '-t4', '-i', tempf.name, '-o', destination])

   # Delete temporary files.
   os.unlink(barcode_tempf.name)
   os.unlink(spike_tempf.name)

   return outputs
Exemplo n.º 15
0
import sys
import re
import seeq
import pdb
import os
#from automata import PatternMatcher
from itertools import izip
from gzopen import gzopen

#pdb.set_trace()
# The two command line arguments are the names of the paired fastq
# files (opened with gzopen below).
fname1 = sys.argv[1]
fname2 = sys.argv[2]

#hind = seeq.compile('AAGCTAGCTT', 1)
# Restriction site, matched without any error.
dpn = seeq.compile('GATC', 0)
# Open 2 files to write. The names are derived from the base names of
# the inputs, so the outputs go to the current directory.
# NOTE(review): the leading '.' of the pattern is not escaped, so it
# matches any character before 'fastq' -- confirm this is intended.
out1 = re.sub(r'.fastq(\.gz)?', 'read1.fasta', os.path.basename(fname1))
out2 = re.sub(r'.fastq(\.gz)?', 'read2.fasta', os.path.basename(fname2))

# We cut in enzyme restriction site GATC (DpnII) and make a fasta file
# Or cut in                         AAGCTAGCTT (HindIII)
with gzopen(fname1) as f, gzopen(fname2) as g, \
     open(out1,'w') as y, open(out2,'w') as z:
    for lineno, (line1, line2) in enumerate(izip(f, g)):
        # Take the sequence line of each 4-line record only.
        if lineno % 4 != 1: continue

        # Use the result of matchPrefix when it is not empty,
        # otherwise fall back on the whole read.
        seq1 = dpn.matchPrefix(line1, False) or line1.rstrip()
        seq2 = dpn.matchPrefix(line2, False) or line2.rstrip()
        # Both sequences must be longer than 16 to be written.
        if len(seq1) > 16 and len(seq2) > 16:
            # The fasta header is the record number.
            y.write('>%d\n' % (lineno / 4))
            y.write(seq1 + '\n')
Exemplo n.º 16
0
import pdb
import sys
import seeq
from collections import defaultdict

# Sequence common to all the reads, matched with up to 3 errors.
COMMON = seeq.compile('CTAGTTGTGGTTTGTCCAAACTCATCGAGCTCGAGA', 3)
# Number of occurrences of every sequence found after the common part.
PROMD = defaultdict(int)

with open(sys.argv[1]) as f:
    for lineno, line in enumerate(f):
        # The sequence is the second line of each fastq record.
        if lineno % 4 == 1:
            barcode = COMMON.matchPrefix(line, False)
            prom = COMMON.matchSuffix(line.rstrip(), False)
            # Reads without a suffix are not counted.
            if prom:
                PROMD[prom] = PROMD[prom] + 1

# Print one "sequence <tab> count" line per distinct suffix.
for k, count in PROMD.items():
    sys.stdout.write('%s\t%d\n' % (k, count))
Exemplo n.º 17
0
def call_starcode_on_fastq_file(fname_fastq):
    '''Extract the gDNA/cDNA reads and the spikes and run starcode.

    The sequence line of every record is matched against the GFP
    pattern (up to 4 errors) and, when that fails, against the spike
    pattern (up to 2 errors). The read is cut at the start of the match
    and the prefix is kept when its length is between MIN_BRCD and
    MAX_BRCD (inclusive).

    Arguments:
      fname_fastq: name of the fastq file (opened with gzopen).

    Returns:
      The pair (barcode output name, spike output name). Output
      files that already exist are not recomputed.
    '''
    MIN_BRCD = 15
    MAX_BRCD = 25

    brcd_outfname = re.sub(r'\.fastq.*', '_starcode.txt', fname_fastq)
    spk_outfname = re.sub(r'\.fastq.*', '_spikes_starcode.txt', fname_fastq)
    # Substitution failed, append the suffix to avoid name collision.
    if brcd_outfname == fname_fastq:
        brcd_outfname = fname_fastq + '_starcode.txt'
    if spk_outfname == fname_fastq:
        spk_outfname = fname_fastq + '_spikes_starcode.txt'

    if os.path.exists(brcd_outfname) and os.path.exists(spk_outfname):
        return (brcd_outfname, spk_outfname)

    GFP = seeq.compile('CATGCTAGTTGTGGTTTGTCCAAACT', 4)
    SPIKE = seeq.compile('CATGATTACCCTGTTATC', 2)
    barcode_tempf = tempfile.NamedTemporaryFile(delete=False)
    spike_tempf = tempfile.NamedTemporaryFile(delete=False)

    try:
        try:
            with gzopen(fname_fastq) as f:
                for lineno, line in enumerate(f):
                    # The sequence is the second line of each record.
                    if lineno % 4 != 1:
                        continue
                    hit = GFP.match(line)
                    if hit is not None:
                        outf = barcode_tempf
                    else:
                        hit = SPIKE.match(line)
                        if hit is None:
                            continue
                        outf = spike_tempf
                    # Cut the read where the first match starts.
                    pos = hit.matchlist[0][0]
                    if MIN_BRCD <= pos <= MAX_BRCD:
                        outf.write(line[:pos] + '\n')
        finally:
            # Flush the temporary files before starcode reads them.
            barcode_tempf.close()
            spike_tempf.close()

        # Skip if file exists.
        if not os.path.exists(brcd_outfname):
            # Call `starcode`.
            subprocess.call([
                'starcode',
                '-t4',
                '-i',
                barcode_tempf.name,
                '-o',
                brcd_outfname,
            ])

        if not os.path.exists(spk_outfname):
            subprocess.call([
                'starcode',
                '-t4',
                '-i',
                spike_tempf.name,
                '-o',
                spk_outfname,
            ])
    finally:
        # Delete temporary files, also when parsing or starcode failed
        # (they were created with delete=False).
        os.unlink(barcode_tempf.name)
        os.unlink(spike_tempf.name)

    return (brcd_outfname, spk_outfname)