示例#1
1
def readFq(fname, hdict):
    """Append every read of a FASTQ file to ``hdict`` under its read name.

    Args:
        fname: Path of the FASTQ file to read.
        hdict: Mapping from read name to a list of ``(seq, qual)`` tuples.
            Entries are appended to without being created here, so the
            mapping presumably supplies missing lists itself (e.g.
            ``collections.defaultdict(list)``) -- confirm against the caller.
    """
    with open(fname, 'r') as FQ:
        for header, seq, qual in FastqGeneralIterator(FQ):
            if ' ' in header:
                # Header was created using CASAVA 1.8+: the read name is
                # everything before the first space.  Split only once so a
                # description containing further spaces cannot break the
                # lookup (a plain two-way unpack raised ValueError there).
                mhead = header.split(' ', 1)[0]
                hdict[mhead].append((seq, qual))
            else:
                # Header was created using older versions of CASAVA:
                # drop the "/1" or "/2" mate marker.
                header = re.sub('/[1-2]', '', header)
                hdict[header].append((seq, qual))
def distribute_reads(readfiles, read_hit_dict, single=True):
    """Write each read that has hits to the output of every target it hit.

    Args:
        readfiles: List with one (single-end) or two (paired-end) FASTQ paths.
        read_hit_dict: Mapping from read ID (first whitespace-delimited word
            of the FASTQ title) to an iterable of targets.
        single: Unused; kept so existing callers keep working.

    Raises:
        ValueError: If ``readfiles`` does not hold one or two paths.
        StopIteration: If the second file has fewer records than the first
            (same as the original behaviour).
    """
    if len(readfiles) == 1:
        with open(readfiles[0]) as handle1:
            for ID1_long, Seq1, Qual1 in FastqGeneralIterator(handle1):
                ID1 = ID1_long.split()[0]
                if ID1 in read_hit_dict:
                    for target in read_hit_dict[ID1]:
                        write_single_seqs(target, ID1, Seq1)
        return

    if len(readfiles) != 2:
        # Previously this fell through to the paired loop and failed with a
        # NameError on the never-created second iterator.
        raise ValueError(
            "Expected 1 or 2 read files, got %d" % len(readfiles))

    with open(readfiles[0]) as handle1, open(readfiles[1]) as handle2:
        iterator2 = FastqGeneralIterator(handle2)
        for ID1_long, Seq1, Qual1 in FastqGeneralIterator(handle1):
            # next() works on both Python 2 and 3, unlike the .next() method.
            ID2_long, Seq2, Qual2 = next(iterator2)

            ID1 = ID1_long.split()[0]
            ID2 = ID2_long.split()[0]

            # A pair is written when either mate has hits; the forward
            # mate's targets take precedence.
            if ID1 in read_hit_dict:
                for target in read_hit_dict[ID1]:
                    write_paired_seqs(target, ID1, Seq1, ID2, Seq2)
            elif ID2 in read_hit_dict:
                for target in read_hit_dict[ID2]:
                    write_paired_seqs(target, ID1, Seq1, ID2, Seq2)
示例#3
0
def main(args):
    """Merge an index (barcode) FASTQ with a sequence FASTQ.

    For every record of the sequence file, the index file is advanced until
    a record with the same read name (first word of the title) is found; the
    index sequence/quality are then prepended to the read's sequence/quality.
    Output is written to ``<output>.tmp`` and renamed to ``<output>`` at the
    end.

    Args:
        args: Ignored; options are parsed from ``sys.argv`` by
            ``parser.parse_args()`` exactly as before.

    Returns:
        0 on success.

    Raises:
        StopIteration: If either input file contains no records (unchanged).
    """
    usage  = "usage: %prog [options] -i <input index file> -s <input seq file> -o <output merge file>"+__doc__
    parser = OptionParser(usage)
    parser.add_option("-i", "--index", dest="index", default=None, help="Input index fastq file.")
    parser.add_option("-s", "--seq", dest="seq", default=None, help="Input seq fastq file.")
    parser.add_option("-o", "--output", dest="output", default=None, help="Output barcode file.")

    (opts, args) = parser.parse_args()
    if not (opts.index and os.path.isfile(opts.index) and opts.seq and os.path.isfile(opts.seq) and opts.output):
        parser.error("Missing input and/or output")

    tmp_path = opts.output + '.tmp'
    with open(tmp_path, 'w') as outh, \
            open(opts.seq) as seq_handle, \
            open(opts.index) as index_handle:
        itr1 = FastqGeneralIterator(seq_handle)
        itr2 = FastqGeneralIterator(index_handle)
        # next() is portable across Python 2 and 3 (generators have no
        # .next() method on Python 3).
        (h1, s1, q1) = next(itr1)
        (h2, s2, q2) = next(itr2)
        while True:
            h1 = h1.split()[0]
            h2 = h2.split()[0]
            try:
                # Skip index records until the one belonging to this read.
                while h1 != h2:
                    (h2, s2, q2) = next(itr2)
                    h2 = h2.split()[0]
            except (StopIteration, IOError):
                # Index file ran out without a match: stop here instead of
                # writing this read with an unrelated barcode.
                break
            outh.write("@%s\n%s%s\n+\n%s%s\n" % (h1, s2, s1, q2, q1))
            try:
                (h1, s1, q1) = next(itr1)
                (h2, s2, q2) = next(itr2)
            except (StopIteration, IOError):
                break
    os.rename(tmp_path, opts.output)

    return 0
示例#4
0
def prepend_barcode(seqfile, bcfile, rc, text=''):
    """Prepend barcode reads to the reads of ``seqfile``, rewriting it in place.

    For every record of ``seqfile`` the barcode file is advanced until a
    record with the same read name (first word of the title) is found.  The
    result is written to ``seqfile + '.tmp'`` which then replaces ``seqfile``.

    Args:
        seqfile: Path of the FASTQ file to modify.
        bcfile: Path of the FASTQ file holding the barcode reads.
        rc: If true, the barcode is reverse-complemented (and its quality
            string reversed) before being prepended.
        text: Optional suffix appended to the read name as ``name.text``.

    Raises:
        StopIteration: If either input file contains no records (unchanged).
    """
    tmp_path = seqfile + '.tmp'
    with open(tmp_path, 'w') as tmph, \
            open(seqfile) as seq_handle, \
            open(bcfile) as bc_handle:
        itr1 = FastqGeneralIterator(seq_handle)
        itr2 = FastqGeneralIterator(bc_handle)
        # next() is portable across Python 2 and 3.
        (h1, s1, q1) = next(itr1)
        (h2, s2, q2) = next(itr2)
        while True:
            h1 = h1.split()[0]
            h2 = h2.split()[0]
            try:
                # Skip barcode records until the one belonging to this read.
                while h1 != h2:
                    (h2, s2, q2) = next(itr2)
                    h2 = h2.split()[0]
            except (StopIteration, IOError):
                # Barcode file ran out without a match: stop instead of
                # attaching an unrelated barcode to this read.
                break
            if rc:
                rcs = Seq(s2, generic_dna)
                s2 = rcs.reverse_complement()
                q2 = q2[::-1]
            if text:
                h1 = h1 + '.' + text
            tmph.write("@%s\n%s%s\n+\n%s%s\n" % (h1, s2, s1, q2, q1))
            try:
                (h1, s1, q1) = next(itr1)
                (h2, s2, q2) = next(itr2)
            except (StopIteration, IOError):
                break
    os.rename(tmp_path, seqfile)
示例#5
0
def read_fastq(fname):
    """Yield FASTQ records from ``fname``, or endless placeholders without one.

    With a truthy ``fname`` the file is opened and every
    ``(title, seq, qual)`` tuple from ``FastqGeneralIterator`` is yielded.
    With a falsy ``fname`` the generator never ends and keeps yielding
    ``("", None, None)``.
    """
    if not fname:
        placeholder = ("", None, None)
        while True:
            yield placeholder
    with open(fname) as in_handle:
        for record in FastqGeneralIterator(in_handle):
            yield record
def count_reads(in_fastq):
    """Return the number of records in the FASTQ file at ``in_fastq``."""
    # The context manager closes the file even if the parser raises on a
    # malformed record.
    with open(in_fastq) as in_file:
        return sum(1 for _record in FastqGeneralIterator(in_file))
示例#7
0
def _get_fastq_num_records(path_to):
    """Count the records and the distinct read IDs of a FASTQ file.

    The read ID is the part of the title before the first space.

    Returns:
        Tuple ``(total_reads, num_uniq_reads)``.
    """
    record_total = 0
    seen_ids = set()
    with open(path_to) as in_handle:
        for title, _seq, _qual in FastqGeneralIterator(in_handle):
            record_total += 1
            seen_ids.add(title.split(" ")[0])

    return record_total, len(seen_ids)
示例#8
0
def fastqtrimmer(fastq_in, fastq_out, trim=21):
    """Write a copy of a FASTQ file keeping only the first ``trim`` bases.

    Args:
        fastq_in: Path of the FASTQ file to read.
        fastq_out: Path of the FASTQ file to write.
        trim: Number of leading bases (and quality characters) to keep.
    """
    # Read from ``fastq_in`` (the original opened the builtin ``input``) and
    # separate the four FASTQ lines with real newlines (the original format
    # string had lost its backslashes).
    with open(fastq_in) as source, open(fastq_out, "w") as handle:
        for title, seq, qual in FastqGeneralIterator(source):
            handle.write("@%s\n%s\n+\n%s\n" % (title, seq[:trim], qual[:trim]))
示例#9
0
def filter_sample(file, output):
    """Copy the reads whose sample label is not in ``keep_list`` to ``output``.

    The sample label is the text after ``barcodelabel=`` in the title with
    its last character removed.  The module-level counters ``total_count``
    (records seen) and ``keep_count`` (records written) are updated.

    Args:
        file: Path of the FASTQ file to read.
        output: Path of the FASTQ file to write.
    """
    global keep_count, total_count
    # Both handles are managed so the input is closed as well.
    with open(output, 'w') as out, open(file) as source:
        for title, seq, qual in FastqGeneralIterator(source):
            total_count += 1
            sample = title.split('barcodelabel=')[1]
            sample = sample[:-1]  # drop the trailing character of the label
            if sample not in keep_list:
                keep_count += 1
                out.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
示例#10
0
def run_mBP(f1_in, f1_out, min_bp_qual_in_read, min_av_read_qual,
            min_bp_qual_or_N):
    """Keep only reads whose lowest base quality reaches a threshold.

    Args:
        f1_in: Open text handle of the input FASTQ file.
        f1_out: Writable handle receiving the reads that pass.
        min_bp_qual_in_read: Minimum quality every base of a read must have.
        min_av_read_qual: Unused here; kept for a uniform signature.
        min_bp_qual_or_N: Unused here; kept for a uniform signature.
    """
    for (idLine, seqLine, qualLine) in FastqGeneralIterator(f1_in):
        # Decode the quality string with an ASCII offset of 33 (Phred+33).
        # frombuffer replaces the deprecated binary mode of
        # numpy.fromstring.
        npQualLine = numpy.frombuffer(
            qualLine.encode('ascii'), dtype=numpy.uint8) - 33
        lowest = numpy.min(npQualLine)
        if lowest >= min_bp_qual_in_read:
            f1_out.write("@%s\n%s\n%s\n%s\n" %
                         (idLine, seqLine, "+", qualLine))
def deal_fastq_file(afastq):
    """Load a gzip-compressed FASTQ file into a ``{read_id: sequence}`` dict.

    The read ID is the first whitespace-delimited word of the title; a later
    record with the same ID replaces the earlier one.
    """
    fastq_dict = {}
    # Text mode: on Python 3 mode "r" yields bytes, which the FASTQ parser
    # cannot handle.
    handle = gzip.open(afastq, "rt")
    try:
        for title, seq, qual in FastqGeneralIterator(handle):
            fastq_dict[title.split()[0]] = seq
    finally:
        handle.close()

    return fastq_dict
示例#12
0
def fastqreindex(input, output):
    """Rewrite a FASTQ file with sequential read names.

    Each title is split on ``;``; the new title is
    ``R_<n>;<second field>;`` where ``n`` counts records from 1.

    Args:
        input: Path of the FASTQ file to read.
        output: Path of the FASTQ file to write.

    Raises:
        IndexError: If a title contains no ``;`` (unchanged behaviour).
    """
    from Bio.SeqIO.QualityIO import FastqGeneralIterator
    count = 1
    with open(output, 'w') as out:
        # Plain 'r': the 'U' flag is rejected by Python 3.11+, and text mode
        # already applies universal newlines on Python 3.
        with open(input, 'r') as fastq:
            for title, sequence, qual in FastqGeneralIterator(fastq):
                cols = title.split(';')
                header = 'R_' + str(count) + ';' + cols[1] + ';'
                count += 1
                out.write("@%s\n%s\n+\n%s\n" % (header, sequence, qual))
示例#13
0
def getAvgLength(input):
    """Summarise the read lengths of a FASTQ file.

    Args:
        input: Path of the FASTQ file.

    Returns:
        Tuple ``(average, minimum, maximum, percentile)`` where ``percentile``
        is ``int(np.percentile(lengths, 5))``, i.e. the 5th percentile of the
        read lengths.

    Raises:
        ZeroDivisionError: If the file contains no records (unchanged).
    """
    # The context manager closes the input, which was previously leaked.
    with open(input) as handle:
        AvgLength = [len(seq)
                     for title, seq, qual in FastqGeneralIterator(handle)]
    Average = sum(AvgLength) / float(len(AvgLength))
    Min = min(AvgLength)
    Max = max(AvgLength)
    a = np.array(AvgLength)
    # NOTE(review): the variable name suggests 95, but the value computed is
    # the 5th percentile -- confirm which one callers expect.
    nintyfive = np.percentile(a, 5)
    return (Average, Min, Max, int(nintyfive))
示例#14
0
def splitread(args):
    """
    %prog splitread fastqfile

    Split fastqfile into two read fastqfiles, cut in the middle.
    """
    # NOTE: the docstring above doubles as the usage text passed to the
    # option parser, so it must stay as is.
    p = OptionParser(splitread.__doc__)
    p.add_option("-n", dest="n", default=76, type="int",
                 help="Split at N-th base position")
    p.add_option("--rc", default=False, action="store_true",
                 help="Reverse complement second read")
    opts, args = p.parse_args(args)

    # Exactly one positional argument (the FASTQ file) is required.
    if len(args) != 1:
        sys.exit(not p.print_help())

    pairsfastq = args[0]

    # Output names come from the input basename up to its first dot.
    prefix = op.basename(pairsfastq).split(".")[0]
    fq1, fq2 = prefix + ".1.fastq", prefix + ".2.fastq"
    fw1, fw2 = must_open(fq1, "w"), must_open(fq2, "w")

    fp = must_open(pairsfastq)
    n = opts.n
    # Reads shorter than 8/5 of the split position are skipped.
    minsize = n * 8 / 5

    for name, seq, qual in FastqGeneralIterator(fp):
        seqlen = len(seq)
        if seqlen < minsize:
            logging.error("Skipping read {0}, length={1}".format(
                name, seqlen))
            continue

        tagged = "@" + name
        left = FastqLite(tagged, seq[:n], qual[:n])
        right = FastqLite(tagged, seq[n:], qual[n:])
        if opts.rc:
            right.rc()

        print(left, file=fw1)
        print(right, file=fw2)

    logging.debug("Reads split into `{0},{1}`".format(fq1, fq2))
    fw1.close()
    fw2.close()
示例#15
0
def split_fastqs(r1, r2, r1_o, r2_o, barcodes):
    '''Given paired-end fastq data, split reads based off of an inline barcode.

    The first 8 bases of each mate are looked up with ``checkHamming``; when
    both mates resolve to the same barcode, both reads are written with the
    first 11 bases removed and ``&<barcode>`` appended to the title.  A
    summary of the counts is written to stderr as HTML ``<H3>`` lines.

    NOTE(review): the original description spoke of a 10 (or 11) mer barcode
    while the code inspects 8 bases and trims 11 -- confirm the layout.

    Args:
        r1, r2: Open text handles of the forward / reverse FASTQ files.
        r1_o, r2_o: Writable handles for the trimmed forward / reverse reads.
        barcodes: Passed through to ``checkHamming``; its result is used as
            ``(found, barcode_name)`` -- presumably a (bool, str) pair.
    '''
    mismatch = 0
    not_found_R1 = 0
    not_found_R2 = 0
    reads = 0
    # itertools.izip only exists on Python 2; the builtin zip is already
    # lazy on Python 3.
    lazy_zip = getattr(it, 'izip', zip)
    fqr1 = FastqGeneralIterator(r1)
    fqr2 = FastqGeneralIterator(r2)
    for (title1, seq1, qual1), (title2, seq2, qual2) in lazy_zip(fqr1, fqr2):
        # Barcodes SHOULD be the same on both ends of the molecule; both are
        # checked so disagreements can be counted.
        test1 = checkHamming(barcodes, seq1[:8])
        test2 = checkHamming(barcodes, seq2[:8])
        if test1[0] and test2[0]:
            if test1[1] == test2[1]:
                # Barcodes agree: emit the trimmed reads.  write() is used
                # instead of the Python 2 only ``print >>`` statement.
                r1_o.write("@%s&%s\n%s\n+\n%s\n" % (
                    title1, test1[1], seq1[11:], qual1[11:]))
                r2_o.write("@%s&%s\n%s\n+\n%s\n" % (
                    title2, test2[1], seq2[11:], qual2[11:]))
            else:
                mismatch += 1
        elif test1[0]:
            # R1 matched a barcode but R2 did not.
            not_found_R2 += 1
        elif test2[0]:
            not_found_R1 += 1
        else:
            not_found_R1 += 1
            not_found_R2 += 1
        reads += 1
    out_error0 = "<H3>Total number of reads:%s</H3>" % reads
    out_error1 = "<H3>Total number of barcode mismatches:%s</H3>" % mismatch
    out_error2 = "<H3>Total number of missed R1 barcodes:%s</H3>" % not_found_R1
    out_error3 = "<H3>Total number of missed R2 barcodes:%s</H3>" % not_found_R2
    sys.stderr.write(out_error0 + '\n' + out_error1 + '\n' + out_error2 +
                     '\n' + out_error3 + '\n')
示例#16
0
def fastq_filter(in_file, pos_file, neg_file, wanted):
    """Split a FASTQ file by whether each read's cleaned name is in ``wanted``.

    Args:
        in_file: Path of the FASTQ file to read.
        pos_file: Path for matching reads, or None to skip writing them.
        neg_file: Path for non-matching reads, or None to skip writing them.
        wanted: Container of names tested against ``clean_name`` applied to
            the first word of each title.

    Returns:
        Tuple ``(pos_count, neg_count)``.  When both output paths are None
        the input is not scanned and ``(0, 0)`` is returned.
    """
    from Bio.SeqIO.QualityIO import FastqGeneralIterator
    write_pos = pos_file is not None
    write_neg = neg_file is not None
    pos_count = neg_count = 0
    handle = open(in_file, "r")
    if write_pos or write_neg:
        if write_pos and write_neg:
            print("Generating two FASTQ files")
        elif write_pos:
            print("Generating matching FASTQ file")
        else:
            print("Generating non-matching FASTQ file")
        positive_handle = open(pos_file, "w") if write_pos else None
        negative_handle = open(neg_file, "w") if write_neg else None
        if write_pos and write_neg:
            print(in_file)
        for title, seq, qual in FastqGeneralIterator(handle):
            record = "@%s\n%s\n+\n%s\n" % (title, seq, qual)
            if clean_name(title.split(None, 1)[0]) in wanted:
                if write_pos:
                    positive_handle.write(record)
                pos_count += 1
            else:
                if write_neg:
                    negative_handle.write(record)
                neg_count += 1
        if write_pos:
            positive_handle.close()
        if write_neg:
            negative_handle.close()
    handle.close()
    return pos_count, neg_count
示例#17
0
def splitDemux2(input, outputdir):
    """Append each read to ``<outputdir>/<sample>.fastq`` based on its label.

    The sample is the text between ``barcodelabel=`` and the next ``;`` in
    the title.  When the module-level ``args.length`` is set, reads shorter
    than that length are dropped and the others are truncated to it.

    Args:
        input: Path of the demultiplexed FASTQ file to read.
        outputdir: Directory receiving one FASTQ file per sample.
    """
    # Convert the optional length once instead of for every record.
    max_len = int(args.length) if args.length else None
    with open(input) as source:
        for title, seq, qual in FastqGeneralIterator(source):
            sample = title.split('barcodelabel=')[1].split(';')[0]
            if max_len is not None:
                if len(seq) < max_len:
                    continue
                seq, qual = seq[:max_len], qual[:max_len]
            # Text-mode append: the records are str, which a file opened
            # with 'ab' rejects on Python 3.
            with open(os.path.join(outputdir, sample + '.fastq'), 'a') as output:
                output.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
示例#18
0
def _mask_low_quality_bases(seq, qual, min_qual):
    """Return ``seq`` with every base whose quality is below ``min_qual`` set to N."""
    # Quality characters are decoded with an ASCII offset of 33 (Phred+33).
    scores = numpy.frombuffer(qual.encode('ascii'), dtype=numpy.uint8) - 33
    # frombuffer returns a read-only view, so copy before masking.
    bases = numpy.frombuffer(seq.encode('ascii'), dtype='S1').copy()
    bases[scores < min_qual] = b'N'
    return bases.tobytes().decode('ascii')


def run_mBPN_pair(f1_in, f1_out, f2_in, f2_out, min_bp_qual_in_read,
                  min_av_read_qual, min_bp_qual_or_N):
    """Mask low-quality bases with ``N`` in both files of a read pair.

    No read is dropped; quality strings are written unchanged.

    Args:
        f1_in, f2_in: Open text handles of the forward / reverse FASTQ files.
        f1_out, f2_out: Writable handles for the masked reads.
        min_bp_qual_in_read: Unused here; kept for a uniform signature.
        min_av_read_qual: Unused here; kept for a uniform signature.
        min_bp_qual_or_N: Bases with a quality below this value become ``N``.

    Raises:
        StopIteration: If the second file has fewer records than the first
            (unchanged behaviour).
    """
    iter1 = FastqGeneralIterator(f1_in)
    iter2 = FastqGeneralIterator(f2_in)
    for (idLine, seqLine, qualLine) in iter1:
        (idLine2, seqLine2, qualLine2) = next(iter2)
        # numpy.frombuffer / tobytes replace the deprecated
        # numpy.fromstring / ndarray.tostring calls.
        f1_out.write(
            "@%s\n%s\n%s\n%s\n" %
            (idLine,
             _mask_low_quality_bases(seqLine, qualLine, min_bp_qual_or_N),
             "+", qualLine))
        f2_out.write(
            "@%s\n%s\n%s\n%s\n" %
            (idLine2,
             _mask_low_quality_bases(seqLine2, qualLine2, min_bp_qual_or_N),
             "+", qualLine2))
示例#19
0
def quick_fastq_iterator(handle, alphabet=single_letter_alphabet):
    """Yield one SeqRecord per FASTQ entry, leaving the qualities undecoded.

    The raw quality string is stored under ``annotations['quality']``; the
    first word of the title serves as both ``id`` and ``name``, and the full
    title becomes the ``description``.
    """
    for title, sequence, quality in FastqGeneralIterator(handle):
        read_id = title.split()[0]
        record = SeqRecord(
            Seq(sequence, alphabet),
            id=read_id,
            name=read_id,
            description=title,
            annotations={'quality': quality},
        )
        yield record
示例#20
0
def main(end1_fastq):
    """Extract the genomic part of every read of a gzipped FASTQ file.

    Output goes to ``<cleaned basename>-genomic.fastq`` in the current
    directory.  Reads for which ``extract_genomic`` returns a false value
    are skipped.

    Args:
        end1_fastq: Path of the gzip-compressed FASTQ file.
    """
    base = _clean_name(os.path.basename(end1_fastq))
    out_genomic = "%s-genomic.fastq" % base
    # Text mode: gzip.open defaults to binary, which yields bytes on
    # Python 3 and cannot be parsed as FASTQ text.
    with gzip.open(end1_fastq, "rt") as in_handle:
        with open(out_genomic, "w") as genomic_handle:
            for name, seq, qual in FastqGeneralIterator(in_handle):
                seq_qual = extract_genomic(seq, qual)
                if seq_qual:
                    genomic_handle.write(
                        "@%s\n%s\n+\n%s\n" %
                        (name, seq_qual.genomic_seq, seq_qual.genomic_qual))
def trim_file_handle(in_handle, config):
    """Yield ``(name, trimmed_seq, trimmed_qual)`` for reads that trim cleanly.

    Linker regions are built from ``config["linkers"]`` and the anchor sizes
    under ``config["algorithm"]``.  Reads for which ``internal_seq`` returns
    a false value are skipped.
    """
    linkers = config["linkers"]
    algorithm = config["algorithm"]
    link1, link2 = get_linker_regions(linkers, algorithm["anchor_sizes"])
    for name, seq, qual in FastqGeneralIterator(in_handle):
        trimmed = internal_seq(seq, link1, link2,
                               algorithm["anchor_mismatches"],
                               algorithm["min_size"])
        if not trimmed:
            continue
        yield name, trimmed, trim_qual(qual, seq, trimmed)
示例#22
0
def main(in_file, out_file, trim=0):
    """Copy a FASTQ file, removing ``trim`` bases from the end of every read.

    Args:
        in_file: Path of the FASTQ file to read.
        out_file: Path of the FASTQ file to write.
        trim: Number of trailing bases to remove; converted with ``int``.
    """
    trim = int(trim)

    with open(in_file) as in_handle:
        with open(out_file, "w") as out_handle:
            for title, seq, qual in FastqGeneralIterator(in_handle):
                # Clamp at zero: with trim > len(seq) the bare difference is
                # negative and the slice would keep bases instead of none.
                keep_seq = max(len(seq) - trim, 0)
                keep_qual = max(len(qual) - trim, 0)
                out_handle.write("@%s\n%s\n+\n%s\n" %
                                 (title, seq[:keep_seq], qual[:keep_qual]))
示例#23
0
def run_mBP_mRQ_pair(f1_in, f1_out, f2_in, f2_out, min_bp_qual_in_read,
                     min_av_read_qual, min_bp_qual_or_N):
    """Keep read pairs where both mates pass mean and minimum quality limits.

    Args:
        f1_in, f2_in: Open text handles of the forward / reverse FASTQ files.
        f1_out, f2_out: Writable handles for the pairs that pass.
        min_bp_qual_in_read: Minimum quality every base of both mates needs.
        min_av_read_qual: Minimum mean quality both mates need.
        min_bp_qual_or_N: Unused here; kept for a uniform signature.

    Raises:
        StopIteration: If the second file has fewer records than the first
            (unchanged behaviour).
    """
    iter1 = FastqGeneralIterator(f1_in)
    iter2 = FastqGeneralIterator(f2_in)
    for (idLine, seqLine, qualLine) in iter1:
        (idLine2, seqLine2, qualLine2) = next(iter2)
        # Decode with an ASCII offset of 33 (Phred+33); frombuffer replaces
        # the deprecated binary mode of numpy.fromstring.
        npQualLine = numpy.frombuffer(
            qualLine.encode('ascii'), dtype=numpy.uint8) - 33
        npQualLine2 = numpy.frombuffer(
            qualLine2.encode('ascii'), dtype=numpy.uint8) - 33
        mean = numpy.mean(npQualLine)
        mean2 = numpy.mean(npQualLine2)
        if mean >= min_av_read_qual and mean2 >= min_av_read_qual:
            lowest = numpy.min(npQualLine)
            lowest2 = numpy.min(npQualLine2)
            if (lowest >= min_bp_qual_in_read
                    and lowest2 >= min_bp_qual_in_read):
                f1_out.write("@%s\n%s\n%s\n%s\n" %
                             (idLine, seqLine, "+", qualLine))
                f2_out.write("@%s\n%s\n%s\n%s\n" %
                             (idLine2, seqLine2, "+", qualLine2))
def main():
    '''
    Demultiplex one or two synchronized FASTQ files into per-sample files.

    - Read the demux_info table (args.mapping) and the read files in lockstep
    - Write each read to `<sample>_R<i>.fastq` according to the sample
      assigned to its ID in demux_info; reads without a sample are skipped
    - Exit with status 42 if forward and reverse read IDs disagree
    '''

    args = parse_args()

    mapping = pd.read_csv(args.mapping,
                          header=None,
                          index_col=0,
                          sep="\t",
                          dtype=str)
    demux_info = mapping.dropna(axis=1, how='all')
    # All columns except 'rid', 'sample_name' and 'mismatches' are index
    # columns, labelled 'fwd' and then 'rev'.
    n_index_cols = demux_info.shape[1] - 3
    index_orient = ['fwd', 'rev'][:n_index_cols]
    demux_info.columns = ['rid'] + index_orient + ['sample_name', 'mismatches']

    read_orient = ['fwd', 'rev'][:len(args.fastqs)]

    print('Preparing handles.')
    sample_names = [name for name in demux_info['sample_name'].unique()
                    if not pd.isnull(name)]
    handles = {}
    for name in sample_names:
        for file_no, orient in enumerate(read_orient, 1):
            out_path = '{}_R{}.fastq'.format(name, file_no)
            handles[name + orient] = open(out_path, 'w')

    parsers = [FastqGeneralIterator(open(fastq, 'r')) for fastq in args.fastqs]

    print('Starting demultiplexing')
    for seq_nb, sequences in enumerate(zip(*parsers)):
        ids = [record[0].split()[0] for record in sequences]

        # Paired input: both mates must carry the same read ID.
        if len(ids) > 1 and ids[0] != ids[1]:
            print(
                "Sequence #{}: {} (fwd) and {} (rev) do not match. The forward and reverse read files seem to be out of order"
                .format(seq_nb, *ids))
            exit(42)

        sample_assignment = demux_info.loc[ids[0], "sample_name"]

        if pd.isnull(sample_assignment):
            continue

        for orient, record in zip(read_orient, sequences):
            target = handles[sample_assignment + orient]
            target.write('@{}\n{}\n+\n{}\n'.format(*record))

    for name in sample_names:
        for orient in read_orient:
            handles[name + orient].close()

    print("Demultiplexing finished.")
示例#25
0
def get_n_reads(fastx, ftype):
    """Return the number of records in a FASTQ or FASTA file.

    Args:
        fastx: Path of the sequence file.
        ftype: Either ``"fastq"`` or ``"fasta"``.

    Raises:
        ValueError: If ``ftype`` is neither ``"fastq"`` nor ``"fasta"``
            (previously an UnboundLocalError at the return statement).
    """
    if ftype == "fastq":
        parse = FastqGeneralIterator
    elif ftype == "fasta":
        parse = SimpleFastaParser
    else:
        raise ValueError("Unsupported file type: %r" % (ftype,))

    # Count lazily in one pass; the earlier extra pass that only counted
    # lines (and never used the result) is gone, and the handle is closed.
    with open(fastx) as handle:
        return sum(1 for _record in parse(handle))
示例#26
0
def recordsToDict(outputprefix, inFastq1, inFastq2, idxBase, barcodeCutOff, constant_right, constant_left, barcode_dict):
    """Feed every read pair of two gzipped FASTQ files to ``readClustering``.

    ``readClustering`` is called with the fixed arguments bound below plus
    the two reads; its return value is added to the discarded count.  A
    summary is written to stderr.

    Args:
        outputprefix: Unused here.
        inFastq1, inFastq2: Paths of the gzip-compressed FASTQ files.
        idxBase, barcodeCutOff: Passed through to ``readClustering``.
        constant_right, constant_left: Constant regions; their lengths set
            the per-side thresholds (``1 / length``).
        barcode_dict: Mapping passed to ``readClustering``; presumably
            filled by it -- confirm against that function.

    Returns:
        Tuple ``(barcode_dict, read_num, barcode_count)`` where ``read_num``
        is the number of read pairs parsed.

    Raises:
        ZeroDivisionError: If either constant region is empty (unchanged).
    """
    discarded_sequence_count = 0
    constant_left_length = len(constant_left)
    constant_right_length = len(constant_right)
    hamming_left_threshold = float(1)/constant_left_length
    hamming_right_threshold = float(1)/constant_right_length
    usable_left_seq = idxBase + constant_left_length
    usable_right_seq = idxBase + constant_right_length
    func = partial(readClustering, barcode_dict, idxBase, barcodeCutOff,
                constant_left, constant_right, constant_left_length, constant_right_length,
                hamming_left_threshold, hamming_right_threshold, usable_left_seq, usable_right_seq)

    # Start at 0 so empty input reports 0 instead of raising NameError, and
    # count from 1 so read_num is the number of pairs rather than the last
    # zero-based index (which under-reported by one).
    read_num = 0
    with gzip.open(inFastq1,'rb') as fq1, gzip.open(inFastq2,'rb') as fq2:
        pairs = izip(FastqGeneralIterator(fq1), FastqGeneralIterator(fq2))
        for read_num, (read1, read2) in enumerate(pairs, 1):
            discarded_sequence_count += func(read1, read2)
    barcode_count = len(barcode_dict)
    stderr.write('[%s] Extracted: %i barcode group\n' %(programname,barcode_count) +\
                 '[%s] discarded: %i sequences\n' %(programname, discarded_sequence_count) +\
                 '[%s] Parsed:    %i seqeucnes\n' %(programname, read_num))
    return barcode_dict, read_num, barcode_count
示例#27
0
def count_kmers_and_reads(in_fastq, kmer_size):
    """Count k-mers and identical reads in a FASTQ file, ignoring reads with N.

    Returns:
        Tuple ``(ktable, read_counts)``: the table created by
        ``khmer.new_ktable(kmer_size)`` after consuming every N-free read,
        and a plain dict mapping each N-free sequence to its occurrences.
    """
    ktable = khmer.new_ktable(kmer_size)
    read_count = collections.defaultdict(int)
    with open(in_fastq) as in_handle:
        for _title, seq, _qual in FastqGeneralIterator(in_handle):
            if "N" in seq:
                continue
            ktable.consume(seq)
            read_count[seq] += 1
    return ktable, dict(read_count)
示例#28
0
def filter_sample(file, output):
    """Copy the reads whose sample label is not in ``keep_list`` to ``output``.

    The sample label is the text between the first ``=`` and the following
    ``;`` in the title.  Records are written as FASTQ or FASTA according to
    the module-level ``args.format``; any other value writes nothing.  The
    module-level counters ``total_count`` (records seen) and ``keep_count``
    (records selected) are updated.

    Args:
        file: Path of the FASTQ file to read.
        output: Path of the file to write.
    """
    global keep_count, total_count
    # Both handles are managed so the input is closed as well.
    with open(output, 'w') as out, open(file) as source:
        for title, seq, qual in FastqGeneralIterator(source):
            total_count += 1
            sample = title.split('=', 1)[1].split(';')[0]
            if sample not in keep_list:
                keep_count += 1
                if args.format == 'fastq':
                    out.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
                elif args.format == 'fasta':
                    out.write(">%s\n%s\n" % (title, seq))
示例#29
0
def benchmark_biopython_faster(fh):
    """Time ``FastqGeneralIterator`` over ``fh`` and print the throughput.

    Every ``REFRESH_RATE`` records the running rate (millions of bases per
    second, labelled MB/s) is printed on one line; a final line reports the
    number of entries and the elapsed time.

    Args:
        fh: Open text handle of a FASTQ file.
    """
    from Bio.SeqIO.QualityIO import FastqGeneralIterator
    total_seq = 0
    n_entries = 0
    t0 = time.time()
    for title, seq, qual in FastqGeneralIterator(fh):
        total_seq += len(seq)
        if n_entries % REFRESH_RATE == 0:
            elapsed = time.time() - t0
            # On a coarse clock the first sample can have zero elapsed
            # time; skip the progress line rather than divide by zero.
            if elapsed > 0:
                print('\r%.2fMB/s' % (total_seq / 1E6 / elapsed),
                      end='', flush=True)
        n_entries += 1
    print()
    # An explicit counter keeps this line valid for empty input, where the
    # loop variable of an enumerate() would never be bound.
    print('%i entries in %.3f seconds.' % (n_entries, time.time()-t0))
示例#30
0
def findQuality(filename):
    """Return the lowest and highest quality character codes in a FASTQ file.

    Returns:
        Tuple ``(min_value, max_value)`` of ``ord`` values over all quality
        strings; ``(9999, -9999)`` if the file has no records.
    """
    lowest, highest = 9999, -9999
    with open(filename) as handle:
        for _title, _sequence, quality in FastqGeneralIterator(handle):
            codes = [ord(symbol) for symbol in quality]
            lowest = min(lowest, min(codes))
            highest = max(highest, max(codes))
    return (lowest, highest)
示例#31
0
def readFastQ(fastq_path):
    """
    Load a FASTQ file into a dict of ``(seq, qual)`` tuples keyed by read name.

    The key is the header with any "/1" or "/2" removed, cut at the first
    space.  A later record with the same key replaces the earlier one.
    """
    readDict = {}
    with open(fastq_path, 'r') as FASTQ:
        for header, seq, qual in FastqGeneralIterator(FASTQ):
            read_name = re.sub('/[1-2]', '', header).split(' ')[0]
            readDict[read_name] = (seq, qual)

    return (readDict)
示例#32
0
def run_script():
    """Append "/1" to every title of ``fastq_file``, writing ``fixed_fastq_file``.

    Both paths are module-level names; both files are gzip-compressed.  Any
    error is printed rather than raised, as before.
    """
    try:
        # The input is opened first so a missing input does not leave an
        # empty output file behind.
        with gzip.open(fastq_file, "rt") as handle, \
                gzip.open(fixed_fastq_file, "wt", compresslevel=4,
                          newline="\n") as out:
            for title, sequence, quality in FastqGeneralIterator(handle):
                # The parser strips the leading "@", so it is restored here;
                # each record ends with a newline so records do not run
                # together.
                out.write("@%s/1\n%s\n+\n%s\n" % (title, sequence, quality))
    except Exception as e:
        print(e)
示例#33
0
def interleave(prefix):
    """Interleave ``<prefix>_1.fastq`` and ``<prefix>_2.fastq`` into one file.

    Reads are written alternately to ``<prefix>_interleaved.fastq`` with
    "/1" and "/2" appended to the read ID (the title up to its first space).
    Processing stops at the end of the shorter file.

    Raises:
        AssertionError: If the IDs of a forward/reverse pair differ.
    """
    # Setup variables
    file_f = prefix + "_1.fastq"
    file_r = prefix + "_2.fastq"
    file_out = prefix + "_interleaved.fastq"

    # itertools.izip only exists on Python 2; the builtin zip is already
    # lazy on Python 3.
    lazy_zip = getattr(itertools, "izip", zip)
    count = 0

    # Plain text mode: the "U" flag is rejected by Python 3.11+.
    with open(file_out, "w") as handle, \
            open(file_f) as fwd, open(file_r) as rev:
        pairs = lazy_zip(FastqGeneralIterator(fwd), FastqGeneralIterator(rev))
        for (f_id, f_seq, f_q), (r_id, r_seq, r_q) in pairs:
            f_name = f_id.split(' ')[0]
            r_name = r_id.split(' ')[0]
            assert f_name == r_name, \
                "Read IDs differ: %s vs %s" % (f_name, r_name)
            count += 2
            # Write out both reads with "/1" and "/2" suffix on ID
            handle.write(
                "@%s/1\n%s\n+\n%s\n@%s/2\n%s\n+\n%s\n" %
                (f_name, f_seq, f_q, r_name, r_seq, r_q))
    # Parenthesised so the line is valid on Python 2 and Python 3.
    print("%i records written to %s" % (count, file_out))
def parse_2fastq_parallel(file1, file2):
    """ Parse two fastq files in parallel - generator yielding (name, seq1, seq2, qual1, qual2) tuples.

    ``name`` is the first word of the title from ``file1``.
    Doesn't check that the readnames match.

    Raises:
        DeepseqError: If one file ends before the other; the message names
            the first leftover read of the longer file.
    """
    from Bio.SeqIO.QualityIO import FastqGeneralIterator    # Bio is the biopython package
    exhausted = object()    # sentinel marking the end of a file
    with open(file1) as INFILE1:
        with open(file2) as INFILE2:
            generator1 = FastqGeneralIterator(INFILE1)
            generator2 = FastqGeneralIterator(INFILE2)
            while True:
                # next() with a default is portable and avoids relying on
                # names left over from a previous iteration.
                record1 = next(generator1, exhausted)
                record2 = next(generator2, exhausted)
                if record1 is exhausted and record2 is exhausted:
                    # A plain return ends the generator; raising
                    # StopIteration inside a generator is turned into a
                    # RuntimeError by PEP 479.
                    return
                if record1 is exhausted or record2 is exhausted:
                    leftover = record2 if record1 is exhausted else record1
                    raise DeepseqError("One file finished but the other one didn't! Read name %s"%(
                                                                        leftover[0].split()[0]))
                name1, seq1, qual1 = record1
                _name2, seq2, qual2 = record2
                yield (name1.split()[0], seq1, seq2, qual1, qual2)
示例#35
0
def stitch_seqs(outfile, file1, file2, blen):
    """Join paired reads into single reads separated by a spacer of ``N``.

    Each output record is read 1, then ``blen`` ``N`` bases (quality ``!``),
    then the reverse complement of read 2 with its quality string reversed.
    The output title is the first word of read 1's title.  Processing stops
    at the end of the shorter file.

    Args:
        outfile: Path of the FASTQ file to write.
        file1, file2: Paths of the forward / reverse FASTQ files.
        blen: Length of the spacer inserted between the two reads.

    Raises:
        StopIteration: If either input file contains no records (unchanged).
    """
    bseq  = 'N' * blen
    bqual = '!' * blen
    with open(file1) as handle1, open(file2) as handle2:
        itr1 = FastqGeneralIterator(handle1)
        itr2 = FastqGeneralIterator(handle2)
        # next() is portable across Python 2 and 3.
        rec1 = next(itr1)
        rec2 = next(itr2)
        with open(outfile, 'w') as outh:
            while True:
                seq2 = Seq(rec2[1], generic_dna)
                outh.write("@%s\n%s%s%s\n+\n%s%s%s\n" %(rec1[0].split()[0], rec1[1], bseq, str(seq2.reverse_complement()), rec1[2], bqual, rec2[2][::-1]))
                try:
                    rec1 = next(itr1)
                    rec2 = next(itr2)
                except (StopIteration, IOError):
                    break
示例#36
0
sys.stderr.write("Interlacing %s and %s\n" % (fastq1, fastq2))
if fastq1.endswith(".gz"):
    sys.stderr.write("Decompressing %s\n" % fastq1)
    handle1 = gzip.open(fastq1)
else:
    handle1 = open(fastq1)
if fastq2.endswith(".gz"):
    sys.stderr.write("Decompressing %s\n" % fastq2)
    handle2 = gzip.open(fastq2)
else:
    handle2 = open(fastq2)
sys.stderr.write("Interlacing paired FASTQ files to stdout...\n")
out_handle = sys.stdout

iter1 = FastqGeneralIterator(handle1)
iter2 = FastqGeneralIterator(handle2)

for title1, seq1, qual1 in iter1:
    try:
        title2, seq2, qual2 = iter2.next()
    except StopIteration:
        sys_exit("More records in %s than %s, e.g. %s" %
                 (fastq1, fastq2, title1))
    id1, descr1 = title1.split(None, 1)
    id2, descr2 = title2.split(None, 1)
    if id1 == id2:
        # Add the /1 and /2, preserve any description after the ID
        if descr1:
            descr1 = " " + descr1
        if descr2:
            descr2 = " " + descr2