Exemplo n.º 1
0
def prepend_barcode(seqfile, bcfile, rc, text=''):
    """Prepend each read's barcode to its sequence, rewriting ``seqfile``.

    ``seqfile`` and ``bcfile`` are FASTQ files read in parallel.  For every
    sequence record, barcode records are consumed until one is found whose
    ID (the header up to the first whitespace) equals the sequence record's
    ID.  The barcode bases and qualities are then written in front of the
    read's bases and qualities.  Output is written to ``seqfile + '.tmp'``,
    which is renamed over ``seqfile`` once processing has finished.

    If the barcode file is exhausted before a match is found, processing
    stops: that read and all later reads are dropped instead of being
    written with an unrelated barcode.

    Args:
        seqfile: Path of the FASTQ file to rewrite.
        bcfile: Path of the FASTQ file holding the barcode reads.
        rc: If true, the barcode sequence is reverse-complemented and its
            quality string reversed before being prepended.
        text: Optional suffix; when non-empty, ``'.' + text`` is appended to
            every output read ID.

    Raises:
        IOError: If a file cannot be opened, read or written.  The error is
            propagated (instead of silently ending the loop) so that a
            partially written temp file never replaces ``seqfile``.
    """
    tmpfile = seqfile + '.tmp'
    with open(seqfile) as seqh, open(bcfile) as bch, open(tmpfile, 'w') as tmph:
        barcodes = FastqGeneralIterator(bch)
        for h1, s1, q1 in FastqGeneralIterator(seqh):
            h1 = h1.split()[0]
            # Always consume at least one barcode per read, then keep
            # skipping barcodes until the IDs agree.
            for h2, s2, q2 in barcodes:
                if h2.split()[0] == h1:
                    break
            else:
                # Barcode file ran out without a match for this read.
                break
            if rc:
                s2 = Seq(s2, generic_dna).reverse_complement()
                q2 = q2[::-1]
            if text:
                h1 = h1 + '.' + text
            tmph.write("@%s\n%s%s\n+\n%s%s\n" % (h1, s2, s1, q2, q1))
    # All handles are closed before the temp file replaces the input.
    os.rename(tmpfile, seqfile)
Exemplo n.º 2
0
def prepend_barcode(seqfile, bcfile, rc, text=''):
    """Rewrite ``seqfile`` so every read starts with its barcode.

    Both inputs are FASTQ files walked in parallel.  For each read in
    ``seqfile`` the next barcode record is taken from ``bcfile``; further
    barcode records are skipped until the barcode ID (header text before the
    first whitespace) equals the read ID.  The merged record is written as
    barcode sequence + read sequence and barcode quality + read quality to
    ``seqfile + '.tmp'``, which finally replaces ``seqfile``.

    A read for which the barcode file holds no matching record ends the
    merge: it is not written with a non-matching barcode, and no later read
    is written either.

    Args:
        seqfile: Path of the FASTQ file to rewrite.
        bcfile: Path of the barcode FASTQ file.
        rc: If true, use the reverse complement of the barcode sequence and
            the reversed barcode quality string.
        text: Optional suffix; when non-empty, ``'.' + text`` is appended to
            each output read ID.

    Raises:
        IOError: On any open/read/write failure.  It is deliberately not
            swallowed, so ``seqfile`` is never replaced by a truncated
            temp file.
    """
    tmpfile = seqfile+'.tmp'
    with open(seqfile) as seqh, open(bcfile) as bch, open(tmpfile, 'w') as tmph:
        barcodes = FastqGeneralIterator(bch)
        for h1, s1, q1 in FastqGeneralIterator(seqh):
            h1 = h1.split()[0]
            found = False
            # One barcode is consumed per read; extra ones are skipped
            # until the IDs line up.
            for h2, s2, q2 in barcodes:
                if h2.split()[0] == h1:
                    found = True
                    break
            if not found:
                # Barcodes exhausted: stop instead of pairing this read
                # with whatever barcode was read last.
                break
            if rc:
                s2 = Seq(s2, generic_dna).reverse_complement()
                q2 = q2[::-1]
            if text:
                h1 = h1+'.'+text
            tmph.write("@%s\n%s%s\n+\n%s%s\n" %(h1, s2, s1, q2, q1))
    # Inputs and temp file are closed at this point, so the rename is safe.
    os.rename(tmpfile, seqfile)
Exemplo n.º 3
0
def main(args):
    """Merge an index FASTQ with a sequence FASTQ into one barcoded FASTQ.

    For each read in the sequence file, index records are consumed until one
    with the same ID (header up to the first whitespace) is found.  The
    output record is index sequence + read sequence and index quality + read
    quality.  Output is written to ``<output>.tmp`` and renamed to
    ``<output>`` when done.

    If the index file is exhausted before a match is found, merging stops
    and the unmatched read is not written.

    Args:
        args: Not used for parsing; ``parser.parse_args()`` is called without
            arguments and therefore reads ``sys.argv``.
            NOTE(review): confirm whether ``args`` was meant to be passed on.

    Returns:
        0 on success.  Invalid or missing options end the program through
        ``parser.error``.

    Raises:
        IOError: On open/read/write failures; no longer swallowed, so a
            truncated temp file is never renamed to the output path.
    """
    usage  = "usage: %prog [options] -i <input index file> -s <input seq file> -o <output merge file>"+__doc__
    parser = OptionParser(usage)
    parser.add_option("-i", "--index", dest="index", default=None, help="Input index fastq file.")
    parser.add_option("-s", "--seq", dest="seq", default=None, help="Input seq fastq file.")
    parser.add_option("-o", "--output", dest="output", default=None, help="Output barcode file.")

    (opts, args) = parser.parse_args()
    if not (opts.index and os.path.isfile(opts.index) and opts.seq and os.path.isfile(opts.seq) and opts.output):
        parser.error("Missing input and/or output")

    tmpfile = opts.output+'.tmp'
    with open(opts.seq) as seqh, open(opts.index) as idxh, open(tmpfile, 'w') as outh:
        index_reads = FastqGeneralIterator(idxh)
        for h1, s1, q1 in FastqGeneralIterator(seqh):
            h1 = h1.split()[0]
            # Take the next index read, skipping ahead until the IDs agree.
            for h2, s2, q2 in index_reads:
                if h2.split()[0] == h1:
                    break
            else:
                # Index file exhausted without a match: stop merging.
                break
            outh.write("@%s\n%s%s\n+\n%s%s\n" %(h1, s2, s1, q2, q1))
    os.rename(tmpfile, opts.output)

    return 0
Exemplo n.º 4
0
def main(args):
    """Combine index reads and sequence reads into a single FASTQ file.

    The sequence file drives the loop.  For each of its reads the index file
    is advanced (by at least one record) until an index read with the same
    ID, i.e. the header text before the first whitespace, is reached.  The
    merged record consists of index sequence followed by read sequence, and
    index quality followed by read quality.  Records go to ``<output>.tmp``,
    which is renamed to ``<output>`` at the end.

    Merging ends early, without writing the current read, when the index
    file has no matching record left.

    Args:
        args: Not used for parsing; ``parser.parse_args()`` is called without
            arguments and therefore reads ``sys.argv``.
            NOTE(review): confirm whether ``args`` was meant to be passed on.

    Returns:
        0 on success.  Invalid or missing options end the program through
        ``parser.error``.

    Raises:
        IOError: On open/read/write failures.  These propagate so that an
            incomplete temp file is never renamed to the output path.
    """
    usage = "usage: %prog [options] -i <input index file> -s <input seq file> -o <output merge file>" + __doc__
    parser = OptionParser(usage)
    parser.add_option("-i",
                      "--index",
                      dest="index",
                      default=None,
                      help="Input index fastq file.")
    parser.add_option("-s",
                      "--seq",
                      dest="seq",
                      default=None,
                      help="Input seq fastq file.")
    parser.add_option("-o",
                      "--output",
                      dest="output",
                      default=None,
                      help="Output barcode file.")

    (opts, args) = parser.parse_args()
    if not (opts.index and os.path.isfile(opts.index) and opts.seq
            and os.path.isfile(opts.seq) and opts.output):
        parser.error("Missing input and/or output")

    tmpfile = opts.output + '.tmp'
    with open(opts.seq) as seqh, open(opts.index) as idxh, \
            open(tmpfile, 'w') as outh:
        index_reads = FastqGeneralIterator(idxh)
        for h1, s1, q1 in FastqGeneralIterator(seqh):
            h1 = h1.split()[0]
            matched = False
            # Consume index reads until one carries the same ID.
            for h2, s2, q2 in index_reads:
                if h2.split()[0] == h1:
                    matched = True
                    break
            if not matched:
                # No index read left for this sequence read: stop here
                # rather than emit a record with a foreign index.
                break
            outh.write("@%s\n%s%s\n+\n%s%s\n" % (h1, s2, s1, q2, q1))
    os.rename(tmpfile, opts.output)

    return 0
Exemplo n.º 5
0
def main():
    """Split FASTQ input into numbered chunk files.

    Configuration comes from the module-level ``paired`` flag and the
    ``arguments`` mapping (keys ``'--output'``, ``'--number'``,
    ``'<file.fastq>'`` and, for paired input, ``'<read2.fastq>'``).

    Reads are copied to ``<output>_NNN.fq`` (single-end) or to
    ``<output>_R1_NNN.fq`` / ``<output>_R2_NNN.fq`` (paired).  Once the
    current chunk holds ``--number`` reads, it is closed and the next
    numbered chunk is started.

    In paired mode no check is made that the two inputs contain the same
    reads; if the read-2 file is shorter, ``StopIteration`` propagates.

    All input and output handles are closed on exit, including the last
    chunk file(s) and when an error occurs.
    """
    prefix = arguments['--output']
    count = 0
    fnum  = 1

    if not paired:

        handle = open( "%s_%03d.fq" % (prefix, fnum), "w")
        try:
            with open(arguments['<file.fastq>'], "r") as reads:
                for t,s,q in FastqGeneralIterator(reads):
                    if count >= arguments['--number']:
                        # Current chunk is full: roll over to the next file.
                        handle.close()
                        count = 0
                        fnum += 1
                        handle = open( "%s_%03d.fq" % (prefix, fnum), "w")
                    handle.write( "@%s\n%s\n+\n%s\n" % (t,s,q) )
                    count += 1
        finally:
            handle.close()

    else:

        #going to assume all reads are in both files and skip error checking
        h1 = open( "%s_R1_%03d.fq" % (prefix, fnum), "w")
        h2 = open( "%s_R2_%03d.fq" % (prefix, fnum), "w")
        try:
            with open(arguments['<read2.fastq>'], "r") as reads2, \
                    open(arguments['<file.fastq>'], "r") as reads1:
                r2_gen = FastqGeneralIterator(reads2)

                for t,s,q in FastqGeneralIterator(reads1):
                    if count >= arguments['--number']:
                        # Roll both mates over together so chunk numbers
                        # stay in sync.
                        h1.close()
                        h2.close()
                        count = 0
                        fnum += 1
                        h1 = open( "%s_R1_%03d.fq" % (prefix, fnum), "w")
                        h2 = open( "%s_R2_%03d.fq" % (prefix, fnum), "w")
                    h1.write( "@%s\n%s\n+\n%s\n" % (t,s,q) )
                    h2.write( "@%s\n%s\n+\n%s\n" % next(r2_gen) )
                    count += 1
        finally:
            h1.close()
            h2.close()
Exemplo n.º 6
0
def distribute_reads(readfiles, read_hit_dict, single=True):
    """Write reads that have hits to their target(s).

    Read IDs are the FASTQ header text before the first whitespace.

    * One file: every read whose ID is a key of ``read_hit_dict`` is passed
      to ``write_single_seqs`` once per target listed for that ID.
    * Two files: the files are read in lockstep.  If the read-1 ID is a key
      of ``read_hit_dict``, the pair is passed to ``write_paired_seqs`` for
      each of its targets; otherwise, if the read-2 ID is a key, the targets
      listed for the read-2 ID are used.

    Args:
        readfiles: Sequence of one or two FASTQ file paths.
        read_hit_dict: Mapping from read ID to an iterable of targets.
        single: Unused by this function; kept for interface compatibility.

    Raises:
        ValueError: If ``readfiles`` does not hold exactly one or two paths
            (previously this surfaced as an IndexError or NameError).
        StopIteration: If the second file has fewer records than the first.
    """
    if len(readfiles) not in (1, 2):
        raise ValueError(
            "readfiles must contain one or two FASTQ paths, got %d"
            % len(readfiles))

    with open(readfiles[0]) as handle1:
        iterator1 = FastqGeneralIterator(handle1)

        if len(readfiles) == 1:
            for ID1_long, Seq1, Qual1 in iterator1:
                ID1 = ID1_long.split()[0]
                if ID1 in read_hit_dict:
                    for target in read_hit_dict[ID1]:
                        write_single_seqs(target, ID1, Seq1)
            return

        with open(readfiles[1]) as handle2:
            iterator2 = FastqGeneralIterator(handle2)

            for ID1_long, Seq1, Qual1 in iterator1:
                ID2_long, Seq2, Qual2 = next(iterator2)

                ID1 = ID1_long.split()[0]
                ID2 = ID2_long.split()[0]

                # A hit on read 1 takes precedence over a hit on read 2.
                if ID1 in read_hit_dict:
                    for target in read_hit_dict[ID1]:
                        write_paired_seqs(target, ID1, Seq1, ID2, Seq2)
                elif ID2 in read_hit_dict:
                    for target in read_hit_dict[ID2]:
                        write_paired_seqs(target, ID1, Seq1, ID2, Seq2)
def lookup_index_cycles(index_fn):
    """Return the sequence length of the first record in a gzipped FASTQ file.

    Args:
        index_fn: Path of the gzip-compressed index-read FASTQ file.  The
            path given here is the one that is opened; the function no
            longer reads the global ``args.index_read_file`` instead.

    Returns:
        Length of the first record's sequence string.

    Raises:
        StopIteration: If the file contains no FASTQ record.
    """
    # Only the first record is needed, so close the handle right after it.
    with gzip.open(index_fn) as handle:
        name, seq, qual = next(FastqGeneralIterator(handle))

    return len(seq)
def distribute_reads(readfiles,read_hit_dict,single=True):
    """Send every read (or read pair) with a hit to its target writers.

    A read's ID is its FASTQ header up to the first whitespace.

    With one input file, each read whose ID is in ``read_hit_dict`` is handed
    to ``write_single_seqs`` for every target stored under that ID.  With two
    input files, mates are read in lockstep and handed to
    ``write_paired_seqs``; the targets are looked up under the read-1 ID
    first and under the read-2 ID only if the read-1 ID has no entry.

    Args:
        readfiles: Sequence holding one or two FASTQ file paths.
        read_hit_dict: Mapping from read ID to an iterable of targets.
        single: Not used in this function; retained so existing calls keep
            working.

    Raises:
        ValueError: If ``readfiles`` does not contain one or two paths
            (previously an IndexError or NameError).
        StopIteration: If the second file runs out before the first.
    """
    n_files = len(readfiles)
    if n_files not in (1, 2):
        raise ValueError(
            "readfiles must contain one or two FASTQ paths, got %d" % n_files)

    with open(readfiles[0]) as handle1:
        iterator1 = FastqGeneralIterator(handle1)

        if n_files == 1:
            for ID1_long, Seq1, Qual1 in iterator1:
                ID1 = ID1_long.split()[0]
                if ID1 in read_hit_dict:
                    for target in read_hit_dict[ID1]:
                        write_single_seqs(target,ID1,Seq1)
            return

        with open(readfiles[1]) as handle2:
            iterator2 = FastqGeneralIterator(handle2)

            for ID1_long, Seq1, Qual1 in iterator1:
                ID2_long, Seq2, Qual2 = next(iterator2)

                ID1 = ID1_long.split()[0]
                ID2 = ID2_long.split()[0]

                # Read-1 hits win; read-2 hits are only a fallback.
                if ID1 in read_hit_dict:
                    for target in read_hit_dict[ID1]:
                        write_paired_seqs(target,ID1,Seq1,ID2,Seq2)
                elif ID2 in read_hit_dict:
                    for target in read_hit_dict[ID2]:
                        write_paired_seqs(target,ID1,Seq1,ID2,Seq2)
Exemplo n.º 9
0
def stitch_seqs(outfile, file1, file2, blen):
    """Join read pairs into single FASTQ records separated by a spacer.

    The two inputs are read in lockstep (IDs are not compared).  Each output
    record uses the read-1 ID (header up to the first whitespace) and is
    built as:

    * sequence: read 1 + ``'N' * blen`` + reverse complement of read 2
    * quality:  read 1 + ``'!' * blen`` + reversed read-2 quality

    Writing stops as soon as either input is exhausted.  Empty inputs yield
    an empty output file instead of an uncaught ``StopIteration``.

    Args:
        outfile: Path of the FASTQ file to write.
        file1: Path of the read-1 FASTQ file.
        file2: Path of the read-2 FASTQ file.
        blen: Number of spacer characters inserted between the two reads.

    Raises:
        IOError: On open/read/write failures; these now propagate instead of
            silently truncating the output.
    """
    bseq  = 'N' * blen
    bqual = '!' * blen
    with open(file1) as in1, open(file2) as in2, open(outfile, 'w') as outh:
        itr2 = FastqGeneralIterator(in2)
        for head1, seq1, qual1 in FastqGeneralIterator(in1):
            try:
                head2, seq2, qual2 = next(itr2)
            except StopIteration:
                # Read-2 file is shorter: stop at the last complete pair.
                break
            rc2 = str(Seq(seq2, generic_dna).reverse_complement())
            outh.write("@%s\n%s%s%s\n+\n%s%s%s\n" %(head1.split()[0], seq1, bseq, rc2, qual1, bqual, qual2[::-1]))
Exemplo n.º 10
0
def stitch_seqs(outfile, file1, file2, blen):
    """Stitch paired reads together with an ``N`` spacer between them.

    Records from ``file1`` and ``file2`` are paired by position only (their
    IDs are not compared).  For every pair one FASTQ record is written:

    * ID: read-1 header up to the first whitespace
    * sequence: read-1 sequence, ``blen`` ``N`` characters, then the reverse
      complement of the read-2 sequence
    * quality: read-1 quality, ``blen`` ``!`` characters, then the read-2
      quality string reversed

    Output ends with the last complete pair.  If either input is empty the
    output file is created empty rather than raising ``StopIteration``.

    Args:
        outfile: Path of the FASTQ file to write.
        file1: Path of the read-1 FASTQ file.
        file2: Path of the read-2 FASTQ file.
        blen: Length of the spacer inserted between the reads.

    Raises:
        IOError: On open/read/write failures.  These are no longer caught,
            so a read error cannot pass as a normal end of input.
    """
    bseq = 'N' * blen
    bqual = '!' * blen
    with open(file1) as in1, open(file2) as in2, open(outfile, 'w') as outh:
        mates = FastqGeneralIterator(in2)
        for head1, seq1, qual1 in FastqGeneralIterator(in1):
            try:
                head2, seq2, qual2 = next(mates)
            except StopIteration:
                # No mate left for this read: nothing more to stitch.
                break
            mate_rc = str(Seq(seq2, generic_dna).reverse_complement())
            outh.write(
                "@%s\n%s%s%s\n+\n%s%s%s\n" %
                (head1.split()[0], seq1, bseq, mate_rc,
                 qual1, bqual, qual2[::-1]))
def parse_2fastq_parallel(file1, file2):
    """ Parse two fastq files in parallel - generator yielding (name, seq1, seq2, qual1, qual2) tuples.

    The name is taken from file1 (header text before the first whitespace).
    Doesn't check that the readnames match.

    The generator ends normally when both files finish at the same record.
    Empty inputs are handled: two empty files yield nothing, and one empty
    file paired with a non-empty one raises DeepseqError like any other
    length mismatch.

    Raises:
        DeepseqError: if one file has more records than the other; the
            message names the first unpaired read.
    """
    from Bio.SeqIO.QualityIO import FastqGeneralIterator    # Bio is the biopython package
    with open(file1) as INFILE1:
        with open(file2) as INFILE2:
            generator1 = FastqGeneralIterator(INFILE1)
            generator2 = FastqGeneralIterator(INFILE2)
            while True:
                # None marks an exhausted file; real records are 3-tuples.
                record1 = next(generator1, None)
                record2 = next(generator2, None)
                if record1 is None and record2 is None:
                    # Plain return: raising StopIteration inside a generator
                    # is turned into RuntimeError by PEP 479.
                    return
                if record1 is None or record2 is None:
                    unpaired = record1 if record2 is None else record2
                    raise DeepseqError("One file finished but the other one didn't! Read name %s"%(
                                                                        unpaired[0].split()[0]))
                name1, seq1, qual1 = record1
                _, seq2, qual2 = record2
                yield (name1.split()[0], seq1, seq2, qual1, qual2)
# Open the optional read-2 / read-3 files.  The loop below tests
# iterator2 / iterator3 against None, so both are presumably initialised to
# None earlier in the script -- TODO confirm.
if args.read2_file is not None:
    iterator2 = FastqGeneralIterator(gzip.open(args.read2_file))
if args.read3_file is not None:
    iterator3 = FastqGeneralIterator(gzip.open(args.read3_file))

# Choose the index matcher: fuzzy when mismatches are allowed, strict
# otherwise.  Either way ``func`` is called as func(index_sequence, indexes).
if args.mismatches > 0:
    func = make_fuzzy_match(args.mismatches, args.n_penalty)
else:
    func = make_strict_match(read_index_length, indexes)

# Read 1 drives the loop; the index read and any extra reads are advanced in
# lockstep with the builtin next(), which works on Python 2.6+ and Python 3.
for rname1, seq1, qual1 in FastqGeneralIterator(gzip.open(args.read1_file)):

    rnamei, seqi, quali = next(iteratori)

    out_index = func(seqi, indexes)

    # Reads whose index sequence matches no known index go to 'unknown'.
    if out_index is None:
        out_index = 'unknown'

    if iterator2 is not None:

        # Advance read 2 so it stays aligned with read 1.
        rname2, seq2, qual2 = next(iterator2)

    if iterator3 is not None:

        rname3, seq3, qual3 = next(iterator3)

        read3_index_out_fh[out_index].write('@' + rname3 + ":" + seq3 + "\n")
Exemplo n.º 13
0
# Usage (positional arguments):
#   1: read-1 FASTQ input    2: read-2 FASTQ input
#   3: read-1 output path    4: read-2 output path
#   5: minimum read length   6: maximum read length
#
# FastqGeneralIterator yields plain (title, sequence, quality) string tuples,
# which is what the slicing below relies on.
read1_iter = FastqGeneralIterator(open(sys.argv[1]))
read2_iter = FastqGeneralIterator(open(sys.argv[2]))

read1_out = open(sys.argv[3], 'w')
read2_out = open(sys.argv[4], 'w')

min_length = int(sys.argv[5])
max_length = int(sys.argv[6])

# Number of leading characters removed from every sequence and quality string
# (presumably a barcode/adapter prefix -- confirm).
LEADING_TRIM = 5

pairs_discarded = 0
for read1_id, read1_seq, read1_qual in read1_iter:
    # Builtin next() works on Python 2.6+ and Python 3; the .next() method
    # exists only on Python 2 iterators.
    read2_id, read2_seq, read2_qual = next(read2_iter)

    read1_seq = read1_seq[LEADING_TRIM:]
    read1_qual = read1_qual[LEADING_TRIM:]
    read2_seq = read2_seq[LEADING_TRIM:]
    read2_qual = read2_qual[LEADING_TRIM:]

    # Drop the whole pair when either mate is too short after trimming.
    if len(read1_seq) < min_length or len(read2_seq) < min_length:
        pairs_discarded += 1
        continue

    # Clip read 1 (sequence and quality together) to the maximum length.
    if len(read1_seq) > max_length:
        read1_seq = read1_seq[:max_length]
        read1_qual = read1_qual[:max_length]
Exemplo n.º 14
0
else:
    handle1 = open(fastq1)
if fastq2.endswith(".gz"):
    sys.stderr.write("Decompressing %s\n" % fastq2)
    handle2 = gzip.open(fastq2)
else:
    handle2 = open(fastq2)
sys.stderr.write("Interlacing paired FASTQ files to stdout...\n")
out_handle = sys.stdout

iter1 = FastqGeneralIterator(handle1)
iter2 = FastqGeneralIterator(handle2)

for title1, seq1, qual1 in iter1:
    try:
        title2, seq2, qual2 = iter2.next()
    except StopIteration:
        sys_exit("More records in %s than %s, e.g. %s" %
                 (fastq1, fastq2, title1))
    id1, descr1 = title1.split(None, 1)
    id2, descr2 = title2.split(None, 1)
    if id1 == id2:
        # Add the /1 and /2, preserve any description after the ID
        if descr1:
            descr1 = " " + descr1
        if descr2:
            descr2 = " " + descr2
        out_handle.write("@%s/1%s\n%s\n+\n%s\n@%s/2%s\n%s\n+\n%s\n"
                         % (id1, descr1, seq1, qual1, id2, descr2, seq2, qual2))
    elif id1.endswith("/1") and id2.endswith("/2") and id1[:-2] == id2[:-2]:
        out_handle.write("@%s\n%s\n+\n%s\n@%s\n%s\n+\n%s\n"
Exemplo n.º 15
0
#!/usr/bin/python

import sys
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.SeqIO.QualityIO import FastqGeneralIterator

# Paired FASTQ inputs (argv[1] = read 1, argv[2] = read 2), read in lockstep.
# FastqGeneralIterator yields (title, sequence, quality) string tuples.
read1_iter = FastqGeneralIterator(open(sys.argv[1]))
read2_iter = FastqGeneralIterator(open(sys.argv[2]))

# One tab-separated line per read pair, written next to the read-1 input.
kv_out_file = open(sys.argv[1] + ".tmp", "w")

for title1, seq1, qual1 in read1_iter:
    # Builtin next() works on Python 2.6+ and Python 3; the .next() method
    # exists only on Python 2 iterators.
    title2, seq2, qual2 = next(read2_iter)

    print(title1)

    # strip off the /1 in read1 - this will be the key for the MR data file
    read_id = title1[:-2]

    kv_out_file.write("\t".join([
        read_id, title1, seq1, qual1, title2, seq2, qual2
    ]) + "\n")