import collections
import datetime
import os
import shlex
import subprocess
import sys

from scilifelab.utils.fastq_utils import FastQParser, FastQWriter


def demultiplex_fastq(index, fastq1, fastq2):
    """Write records matching the given index to stdout (read 1) and stderr (read 2)."""
    index_filter = {'index': index}
    fp1 = FastQParser(fastq1, index_filter)
    if fastq2 is not None:
        fp2 = FastQParser(fastq2, index_filter)
    for r1 in fp1:
        if fastq2 is not None:
            r2 = next(fp2)
            assert is_read_pair(r1, r2), "Mismatching headers for expected read pair"
            sys.stderr.write("{}\n".format("\n".join(r2)))
        sys.stdout.write("{}\n".format("\n".join(r1)))
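# `is_read_pair` is used above but not defined in this snippet. A minimal
# sketch of what such a helper could look like, assuming Casava 1.8-style
# headers where mates share the part of the header before the first space
# (this is an illustrative reconstruction, not the original implementation):
def is_read_pair(r1, r2):
    """Return True if two records appear to form a read pair (sketch)."""
    # Mates share the header up to the first whitespace; the read number
    # (1 or 2) only appears in the trailing "1:N:0:INDEX" field.
    return r1[0].split()[0] == r2[0].split()[0]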
def _split_fastq(fastq_input, outdir, outprefix, outsuffix, samples):
    """Split fastq records into per-sample files keyed on the index in the header."""
    if not os.path.exists(outdir):
        os.mkdir(outdir)
    out_handles = {}
    for fastq_file in fastq_input:
        parser = FastQParser(fastq_file)
        for record in parser:
            # The index sequence follows the last colon in the header line
            pos = record[0].rfind(":")
            i = record[0][pos + 1:].strip()
            # Open a file handle to the index file if it's not already available
            if i not in out_handles:
                out_file = os.path.join(
                    outdir, "%s_%s%s" % (outprefix, samples.get(i, i), outsuffix))
                out_handles[i] = FastQWriter(out_file)
            out_handles[i].write(record)
    # Summarize the written records and close the file handles
    counts = {}
    for i, oh in out_handles.items():
        counts[i] = oh.rwritten()
        oh.close()
    return counts
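# For reference, `_split_fastq` relies on Casava 1.8-style headers keeping the
# index sequence after the last colon, which is what the rfind(":") lookup
# above extracts. A quick illustration with a made-up header:
example_header = "@HWI-ST999:101:C0AVRACXX:1:1101:1552:2213 1:N:0:ACAGTG"
assert example_header[example_header.rfind(":") + 1:].strip() == "ACAGTG"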
def count_top_indexes(count_num, index_file, index_length, progress_interval):
    """Determine the most common indexes, sampling at most 200,000 reads."""
    assert type(count_num) == int and count_num > 0, \
        "Number passed must be a positive integer."
    fqp_ind = FastQParser(index_file)

    # This should perhaps be added to the FastQParser class
    print("Counting total number of lines in fastq file...",
          file=sys.stderr, end="")
    total_lines = int(subprocess.check_output(
        shlex.split("wc -l {}".format(index_file))).split()[0])
    # Four lines per fastq record; use integer division
    total_reads = total_lines // 4
    print(" complete.", file=sys.stderr)

    index_tally = collections.defaultdict(int)
    reads_processed = 0
    # Subsample if the file is large
    if total_reads > 200000:
        print("Subsampling 200,000 reads from index file...", file=sys.stderr)
        fqp_ind = iter_sample_fast(fqp_ind, 200000, total_reads)
        print("Complete.", file=sys.stderr)
        total_reads = 200000
    print("Tallying indexes in {} records...".format(total_reads),
          file=sys.stderr)
    start_time = datetime.datetime.now()
    for index in fqp_ind:
        index_read_seq = index[1]
        index_seq = index_read_seq[:index_length]
        index_tally[index_seq] += 1
        reads_processed += 1
        if reads_processed % progress_interval == 0:
            print_progress(reads_processed, total_reads, start_time)
    print("\n", file=sys.stderr)

    if count_num > len(index_tally):
        print("Number of indexes found ({}) is fewer than those requested ({}). "
              "Printing all indexes found.".format(len(index_tally), count_num),
              file=sys.stderr)
        print("Printing indexes...", file=sys.stderr)
        count_num = len(index_tally)
    print("{:<20} {:>20} {:>11}".format("Index", "Occurrences", "Percentage"))
    for index_seq, tally in sorted(index_tally.items(),
                                   key=lambda x: x[1],
                                   reverse=True)[:count_num]:
        percentage = (100.0 * tally) / total_reads
        print("{:<20} {:>20,} {:>10.2f}%".format(index_seq, tally, percentage))
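# `iter_sample_fast` and `print_progress` are called above but defined
# elsewhere in the original codebase. A minimal reservoir-sampling sketch of
# what iter_sample_fast could look like (an assumption, not the original
# implementation; `total` is unused here but matches the call above):
import random


def iter_sample_fast(iterable, samplesize, total):
    """Return samplesize randomly chosen items from iterable (sketch)."""
    iterator = iter(iterable)
    # Fill the reservoir with the first samplesize items
    results = [next(iterator) for _ in range(samplesize)]
    # Replace reservoir items with decreasing probability so that every
    # item ends up in the sample with equal probability samplesize/total
    for i, item in enumerate(iterator, samplesize):
        j = random.randint(0, i)
        if j < samplesize:
            results[j] = item
    return results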
"""Screen a fastq file for duplicate read sequences using a Bloom filter.

Usage: %s <fastq_file>
"""
# The docstring text above is assumed for this fragment; __doc__ needs
# exactly one "%s" for the substitution below.
import collections
import sys

from bloomfaster import Elf
# Slight modification to read from input file instead of stdin
from scilifelab.utils.fastq_utils import FastQParser, FastQWriter

__doc__ %= sys.argv[0]

if len(sys.argv) != 2:
    print(sys.argv)
    print(__doc__)
    sys.exit()

print("Command: ", " ".join(sys.argv), file=sys.stderr)
infile = sys.argv[1]
fp = FastQParser(infile)

# First pass: count the records so the Bloom filter can be sized up front
for _ in fp:
    pass
records = fp.rread()
print(records, "records in file ", infile, file=sys.stderr)

# say 1 out of 1000 is false positive.
bloom = Elf(records, error_rate=1e-3)
fp.seek(0)
checks = []
for _, seq, _, _ in fp:
    if seq in bloom:
        checks.append(seq)
    bloom.add(seq)
# now checks contains anything that could be a duplicate according to
# the bloom filter
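# The Bloom filter only flags *candidate* duplicates: it can produce false
# positives but no false negatives. A minimal sketch of the exact follow-up
# pass, assuming we re-read the file and tally only the flagged sequences
# (the names below are illustrative, not part of the original script):
candidates = set(checks)
fp.seek(0)
dup_counts = collections.Counter(
    seq for _, seq, _, _ in fp if seq in candidates)
true_duplicates = {seq: n for seq, n in dup_counts.items() if n > 1}
print(len(true_duplicates), "confirmed duplicate sequences", file=sys.stderr)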