def test_fastq_parser():
    print("Test Fastq parser")
    for seq in HTSeq.FastqReader('example_data/fastqEx.fastq'):
        pass
    print("Test passed")

    print("Test Fastq parser on gzip input")
    for seq in HTSeq.FastqReader('example_data/fastqExgzip.fastq.gz'):
        pass
    print("Test passed")

    print("Test Fastq parser on gzip input (raw iterator)")
    for seq in HTSeq.FastqReader('example_data/fastqExgzip.fastq.gz',
                                 raw_iterator=True):
        pass
    print("Test passed")
def sample(self):
    fraction = float(self.rate)
    in1 = iter(HTSeq.FastqReader(self.fq1))
    in2 = iter(HTSeq.FastqReader(self.fq2))
    o1 = open(self.o1, "w")
    o2 = open(self.o2, "w")
    # Keep or drop both mates together so the output files stay in sync.
    for read1, read2 in zip(in1, in2):  # was itertools.izip (Python 2 only)
        if random.random() < fraction:
            read1.write_to_fastq_file(o1)
            read2.write_to_fastq_file(o2)
    o1.close()
    o2.close()
def count_reads(data):
    reads = {'readlengths': [], 'readcount': 0}
    reads['readlengths'] = [
        len(s[0]) for s in HTSeq.FastqReader(data, raw_iterator=True)
    ]
    reads['readcount'] = len(reads['readlengths'])
    return reads
def move_barcode_to_name_in_fastq(filename, out_dir):
    if not os.path.exists(out_dir):
        os.system('mkdir ' + out_dir)
    outfilename = out_dir + '/%s' % os.path.basename(filename)
    if os.path.exists(outfilename):
        print("Output file {} exists... Not overwriting...".format(outfilename))
        return
    else:
        print("Writing {}".format(outfilename))
    outf = open(outfilename, 'w')
    fastq = HTSeq.FastqReader(filename)
    for read in fastq:
        if len(read.seq) < 14:
            continue
        # read.qual holds decoded phred-33 scores; require Q30 across the
        # 9-base barcode.
        if min(read.qual[:9]) < 30:
            continue
        _seq = read.seq.decode()
        # Trim the barcode off the read and append part of it to the name.
        n_read = HTSeq.SequenceWithQualities(
            read.seq[9:],
            read.name.partition(' ')[0] + '#' + _seq[0:3] + _seq[7:9],
            read.qualstr[9:])
        n_read.write_to_fastq_file(outf)
    outf.close()
def catagorize_fastq(matched, filename):
    """
    Categorize each read of a fastq file based on whether it is aligned to
    the reference genome (i.e. its name appears in matched) or not.
    Outputs two files: "filename_aligned" and "filename_unaligned".
    """
    fastq_reader = HTSeq.FastqReader(filename)
    counter = 0
    aligned_output = open(filename[:-6] + "_aligned.fq", "w")
    unaligned_output = open(filename[:-6] + "_unaligned.fq", "w")
    for read in fastq_reader:
        if read.name.split(" ")[0] in matched:
            counter += 1
            # was write_to_fasta_file, which drops qualities and writes
            # FASTA records into .fq files
            read.write_to_fastq_file(aligned_output)
        else:
            read.write_to_fastq_file(unaligned_output)
    aligned_output.close()
    unaligned_output.close()
    # Compress as .gz
    subprocess.call(["gzip", filename[:-6] + "_aligned.fq"])
    subprocess.call(["gzip", filename[:-6] + "_unaligned.fq"])
    print("Found %d aligned reads in fastq" % counter)
    return 0
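# Usage sketch for the function above. The `matched` set of aligned read
# names would normally be parsed from an aligner's output; the names and
# the filename here are made up for illustration.
matched_names = {"read_001", "read_007", "read_042"}
catagorize_fastq(matched_names, "sample1.fastq")
# -> writes sample1_aligned.fq.gz and sample1_unaligned.fq.gz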
def main():
    ifile = sys.argv[1]
    ofile = sys.argv[2]
    r = HTSeq.FastqReader(ifile)
    # write_to_fastq_file emits text records, so open the output in text
    # mode (was 'wb', which fails under Python 3)
    with open(ofile, 'w') as outstream:
        for ent in r:
            ent.write_to_fastq_file(outstream)
def count_barcodes(read1, read2=None):
    fastq_file = HTSeq.FastqReader(read1)
    barcode_counts = collections.defaultdict(int)
    barcode_reg_counts = collections.defaultdict(int)
    # Sample at most the first million reads.
    for n, read in enumerate(fastq_file):
        if n > 1000000:
            break
        seq = read.seq.decode()  # read.seq is bytes under Python 3
        barcode_reg = seq[:10]
        barcode = seq[:3]
        barcode_counts[barcode] += 1
        barcode_reg_counts[barcode_reg] += 1
    with open('barcode_counts.txt', 'w') as f:
        # Report barcodes sorted by frequency, skipping rare ones.
        for barcode in sorted(barcode_counts, key=lambda x: barcode_counts[x]):
            if barcode_counts[barcode] < 100:
                continue
            f.write("{b}\t{c}\n".format(b=barcode, c=barcode_counts[barcode]))
    with open('barcode_region_counts.txt', 'w') as f:
        for barcode in barcode_reg_counts:
            f.write("{b}\t{c}\n".format(b=barcode, c=barcode_reg_counts[barcode]))
def get_counts(sequences, infile, seq2regex):
    """Get the number of reads in a file that contain each sequence
    (sequences should be a list)."""
    fastq_file = HTSeq.FastqReader(infile)
    counts = collections.Counter()
    old_id = None
    old_seq = None
    barcode = False
    for read in fastq_file:
        counts["reads"] += 1
        read_id = read.name.split(' ')[0]
        if old_id == read_id:
            counts["read pairs"] += 1
        else:
            old_id = read_id
            # The previous read pair finished without any barcode match.
            if barcode == False:
                counts["nothin"] += 1
            barcode = False
        seq = read.seq.decode()  # bytes under Python 3
        old_seq = seq
        for sequence in sequences:
            if re.search(seq2regex[sequence], seq):
                if barcode:
                    counts["double"] += 1
                    if barcode == sequence:
                        counts["double same"] += 1
                counts[sequence] += 1
                barcode = sequence
    print("Reads: %d" % counts["reads"])
    print("Read pairs: %d" % counts["read pairs"])
    print("Nothin: %d" % counts["nothin"])
    for sequence in sequences:
        print("%s: %d" % (sequence, counts[sequence]))
    print("Double: %d" % counts["double"])
    print("Double same: %d" % counts["double same"])
    print("\n")
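# Sketch of building the seq2regex argument expected above. The barcode
# sequences and the pattern shape are assumptions, not from the source:
# each barcode is keyed to a pattern allowing up to three leading bases.
import re

barcodes = ["TTGT", "CCGG"]
barcode2regex = {b: re.compile(r"^\w{0,3}" + b) for b in barcodes}
get_counts(barcodes, "reads.fastq", barcode2regex)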
def read_barcodes(bc_file, coder):
    namer = {}
    for r in HTSeq.FastqReader(bc_file):
        if r.seq.decode() in coder:
            namer[r.name.split()[0]] = coder[r.seq.decode()]
    return namer
def demultiplexing(sampleIndex, read1, read2, outdir, mismatch):
    Read1 = HTSeq.FastqReader(read1)
    Read2 = HTSeq.FastqReader(read2)
    # One pair of gzipped outputs per sample, plus a pair for reads whose
    # index matches no sample. Text mode ('wt', was 'wb'): HTSeq writes
    # str records under Python 3.
    for eachSample in sampleIndex:
        sampleIndex[eachSample]['Read1'] = gzip.open(
            os.path.join(outdir, eachSample + '_' +
                         sampleIndex[eachSample]['i7Index'] + '_1.fq.gz'),
            mode='wt', compresslevel=1)
        sampleIndex[eachSample]['Read2'] = gzip.open(
            os.path.join(outdir, eachSample + '_' +
                         sampleIndex[eachSample]['i7Index'] + '_2.fq.gz'),
            mode='wt', compresslevel=1)
    undetermined1 = gzip.open(os.path.join(outdir, 'undetermined_1.fq.gz'),
                              mode='wt', compresslevel=1)
    undetermined2 = gzip.open(os.path.join(outdir, 'undetermined_2.fq.gz'),
                              mode='wt', compresslevel=1)
    for R1, R2 in zip(Read1, Read2):  # was itertools.izip (Python 2 only)
        # The index sequence(s) sit in the last colon-separated field of
        # the read name; dual indices are joined by '+'.
        if "+" in R1.name.split(':')[-1]:
            i7IndexSeq, i5IndexSeq = R1.name.split(':')[-1].strip().split('+')
        else:
            i7IndexSeq = R1.name.split(':')[-1].strip()
        bestMatch = None
        bestMisCount = 1000
        for eachSample in sampleIndex:
            i7HD = hamming_distance(i7IndexSeq,
                                    sampleIndex[eachSample]['i7Index'])
            if i7HD == 0:
                bestMatch = eachSample
                break
            if i7HD <= mismatch and i7HD < bestMisCount:
                bestMatch = eachSample
                bestMisCount = i7HD
        if bestMatch:
            R1.write_to_fastq_file(sampleIndex[bestMatch]['Read1'])
            R2.write_to_fastq_file(sampleIndex[bestMatch]['Read2'])
        else:
            R1.write_to_fastq_file(undetermined1)
            R2.write_to_fastq_file(undetermined2)
    for eachSample in sampleIndex:
        sampleIndex[eachSample]['Read1'].close()
        sampleIndex[eachSample]['Read2'].close()
    undetermined1.close()
    undetermined2.close()
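# Sketch of the sampleIndex structure the demultiplexer above expects: one
# dict per sample, keyed by sample name, carrying at least the i7 index
# sequence. Sample names, index sequences and paths are made up.
sampleIndex = {
    'sampleA': {'i7Index': 'ACGTACGT'},
    'sampleB': {'i7Index': 'TGCATGCA'},
}
demultiplexing(sampleIndex, 'lane1_1.fq.gz', 'lane1_2.fq.gz',
               'demux_out', mismatch=1)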
def head(fastqs, outputs, sequences=100, qual_scale=_default_qual_scale):
    # was `len(fastq) != len(output)`, which referenced undefined names
    if len(fastqs) != len(outputs):
        raise ValueError("Length of fastqs and outputs parameters must match")
    for (i, fastq) in enumerate(fastqs):
        fastq_iterator = HTSeq.FastqReader(fastq, qual_scale)
        with open(outputs[i], 'w') as headFile:
            # Copy only the first `sequences` reads.
            for sequence in itertools.islice(fastq_iterator, sequences):
                sequence.write_to_fastq_file(headFile)
def converting_fasta_to_fastq(align_clusters, merged_reads, seqtk):
    print("Converting Fasta to Fastq step has been started....")
    temp_fastq = os.path.join(align_clusters, "temp.fastq")
    # Start from an empty merged file (os.remove, not shutil.rmtree,
    # since temp.fastq is a file, not a directory).
    if os.path.isfile(temp_fastq):
        os.remove(temp_fastq)
    merged_fq = open(temp_fastq, "a")
    print(merged_reads)
    for fq in merged_reads:
        fq_reads = HTSeq.FastqReader(fq)
        for r in fq_reads:
            r.write_to_fastq_file(merged_fq)
    merged_fq.close()
    fastq_dir = glob.glob(os.path.join(align_clusters, '*_cl*', '*.fq'))
    reads_merged = temp_fastq
    for fq in fastq_dir:
        reads_merged_header_l = os.path.join(align_clusters, "temp.list")
        # Collect the read headers (Illumina names starting with @M).
        proc1 = subprocess.Popen(['grep', '^@M', fq],
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.STDOUT, shell=False)
        (out1, err1) = proc1.communicate()
        if not err1:
            with open(reads_merged_header_l, "w") as f:
                f.write(out1.decode().replace("@", ""))
        else:
            print("Errors:", err1)
        # Pull those reads (with qualities) out of the merged fastq.
        proc2 = subprocess.Popen(
            [seqtk, 'subseq', reads_merged, reads_merged_header_l],
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=False)
        (out2, err2) = proc2.communicate()
        if not err2:
            new_fq_w_q = os.path.splitext(fq)[0] + "_q.fq"
            with open(new_fq_w_q, "w") as f:
                f.write(out2.decode())
    print("Converting Fasta to Fastq step has been finished....")
def __init__(self, original_fastq, trimmed_fastq, unpaired_fastq,
             shorter_file, dropped_file, outputdir=os.getcwd(),
             qual_scale=_default_qual_scale):
    original_iterator = iter(HTSeq.FastqReader(original_fastq, qual_scale))
    trimmed_iterator = iter(HTSeq.FastqReader(trimmed_fastq, qual_scale))
    unpaired_iterator = iter(HTSeq.FastqReader(unpaired_fastq, qual_scale))
    brenninc_comp_counter.Comparer.__init__(self, original_iterator,
                                            trimmed_iterator,
                                            unpaired_iterator)
    self.shorter = collections.Counter()
    self.same = 0
    self.update_factor = 100000
    self.shorter_file = shorter_file
    self.dropped_file = dropped_file
def sub_fraction_reads(fq1, fq2, fraction, fq1_out, fq2_out):
    fraction = float(fraction)
    in1 = iter(HTSeq.FastqReader(fq1))
    in2 = iter(HTSeq.FastqReader(fq2))
    out1 = open(fq1_out, "w")
    out2 = open(fq2_out, "w")
    while True:
        try:
            read1 = next(in1)
            read2 = next(in2)
        except StopIteration:
            break  # was sys.exit(), which skipped the close() calls below
        # Keep or drop both mates together.
        if random.random() < fraction:
            read1.write_to_fastq_file(out1)
            read2.write_to_fastq_file(out2)
    out1.close()
    out2.close()
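# Usage sketch: keep roughly 10% of read pairs. Seeding random makes the
# subsample reproducible; the file names are placeholders.
import random

random.seed(42)
sub_fraction_reads('sample_R1.fastq', 'sample_R2.fastq', 0.1,
                   'sub_R1.fastq', 'sub_R2.fastq')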
def read_fastQ():
    '''Reads 200,000 sequences of a fastQ file and writes every sequence
    to a new file called result.txt'''
    fastq_file = HTSeq.FastqReader(".fastq.gz")
    result_f = open("result.txt", "w")
    for read in itertools.islice(fastq_file, 200000):
        result_f.write(str(read))
    result_f.close()
def split_by_barcode(sample1_filename, sample2_filename, sample3_filename,
                     missing_barcode_filename, initial_filename):
    sample1f = open(sample1_filename, 'w')
    sample2f = open(sample2_filename, 'w')
    sample3f = open(sample3_filename, 'w')
    missingf = open(missing_barcode_filename, 'w')
    fastq_file = HTSeq.FastqReader(initial_filename)
    total_reads = 0
    sample1_num = 0
    sample2_num = 0
    sample3_num = 0
    for read in fastq_file:
        total_reads += 1
        if not total_reads % 100000:
            sum_with_barcode = sample1_num + sample2_num + sample3_num
            print("Read: %i (%i, %i, %i). Barcode in %f reads" % (
                total_reads, sample1_num, sample2_num, sample3_num,
                float(sum_with_barcode) / float(total_reads)))
        seq = read.seq.decode()  # bytes under Python 3
        # NOTE: `strain` is taken from an enclosing scope, not a parameter.
        # Each strain has its own set of 4-base barcodes at read positions 4-7.
        if strain == 'n2':
            if re.match(r'\w{3}TTGT.*', seq):
                sample1_num += 1
                read.write_to_fastq_file(sample1f)
                continue
            if re.match(r'\w{3}CCGG.*', seq):
                sample2_num += 1
                read.write_to_fastq_file(sample2f)
                continue
            if re.match(r'\w{3}GGCA.*', seq):
                sample3_num += 1
                read.write_to_fastq_file(sample3f)
                continue
            read.write_to_fastq_file(missingf)
        if strain == 'fbf':
            if re.match(r'\w{3}TGGC.*', seq):
                sample1_num += 1
                read.write_to_fastq_file(sample1f)
                continue
            if re.match(r'\w{3}GGTT.*', seq):
                sample2_num += 1
                read.write_to_fastq_file(sample2f)
                continue
            if re.match(r'\w{3}CGGA.*', seq):
                sample3_num += 1
                read.write_to_fastq_file(sample3f)
                continue
            read.write_to_fastq_file(missingf)
    print("""Results:
Reads: %i
Sample 1: %i (%f)
Sample 2: %i (%f)
(sum sample 1 + 2): %i
remaining: %i""" % (total_reads,
                    sample1_num, float(sample1_num) / float(total_reads),
                    sample2_num, float(sample2_num) / float(total_reads),
                    sample1_num + sample2_num,
                    # parenthesized: was total_reads - sample1_num + sample2_num
                    total_reads - (sample1_num + sample2_num)))
def head(path, sequences=100, outputdir=None, qual_scale=_default_qual_scale):
    extra = "_head" + str(sequences)
    new_path = brenninc_utils.create_new_file(path, extra,
                                              outputdir=outputdir,
                                              gzipped=False)
    fastq_iterator = HTSeq.FastqReader(path, qual_scale)
    with open(new_path, 'w') as headFile:
        for sequence in itertools.islice(fastq_iterator, sequences):
            sequence.write_to_fastq_file(headFile)
def clip_adapters_if_not_already_clipped(in_dir, out_dir, args):
    # Do we need to remove adapters, or are they already removed?
    # Check the first N sequences to determine.
    n_lines_to_check = 4e3
    need_to_clip = True
    for _file in glob.glob(in_dir + '/*.fastq'):
        fastq_reader = HTSeq.FastqReader(_file)
        read_lens = set()
        for i, read in enumerate(fastq_reader):
            read_lens.add(len(read.seq))
            if i > n_lines_to_check:
                break
        if len([x for x in read_lens if x < 20]) > 2:
            need_to_clip = False
            break
    if not need_to_clip:
        print("Adapters in {0} are apparently already clipped...".format(in_dir))
        return
    # If adapters are not already removed:
    print("Clipping adapters in {0}...".format(in_dir))
    if in_dir == out_dir:
        print("Input and output dir can't be the same.")
        sys.exit()
    args.input_dir = in_dir
    args.output_dir = 'temp_fastq/'
    if os.path.exists('temp_fastq/'):
        os.system('rm -r temp_fastq/')
    # Remove the 3' linker first:
    args.adapter = ''
    args.three_prime_linker = True
    args.rt_primer = False
    clip_adapter.run(args)
    # Remove the RT primer second:
    for k, v in list({'three_prime_linker': False,
                      'rt_primer': True,
                      'input_dir': 'temp_fastq/',
                      'output_dir': out_dir,
                      'adapter': ''}.items()):
        setattr(args, k, v)
    clip_adapter.run(args)
def getInstrument(self):
    for r in HTSeq.FastqReader(self.r1):
        # The instrument serial number is the first colon-separated field
        # of an Illumina read name; only the first read is needed.
        serialNumber = r.name.split(":")[0]
        try:
            instrument = Instruments.objects.get(serial_number=serialNumber)
        # narrowed from a bare except (assumes a Django-style model)
        except Instruments.DoesNotExist:
            sys.exit("Instrument with Serial Number " + serialNumber +
                     " not found in the database!")
        break
    return instrument
def process_file(filename, output_filename):
    outf = open(output_filename, 'w')
    fastq = HTSeq.FastqReader(filename)
    for read in fastq:
        # The last 9 characters of the read name hold the barcode; keep
        # only characters 1-3 and 8-9 of it.
        barcode = read.name[-9:]
        new_barcode = barcode[:3] + barcode[7:]
        n_read = HTSeq.SequenceWithQualities(
            read.seq, read.name[:-9] + new_barcode, read.qualstr)
        n_read.write_to_fastq_file(outf)
    outf.close()
def openDNA(filename):
    extension = os.path.splitext(filename)[1]
    if extension in ['.fna', '.fasta', '.ffn', '.faa', '.frn']:
        print('File ' + filename + ' is type FastA.')
        file = HTSeq.FastaReader(filename)
        num_lines = sum(1 for line in open(filename))
    elif extension in ['.fq', '.fastq']:
        print('File ' + filename + ' is type FastQ.')
        file = HTSeq.FastqReader(filename)
        # 1/4 of the lines are sequences in fastQ
        num_lines = int(sum(1 for line in open(filename)) / 4)
    else:
        raise Exception('Unknown file type, exiting.')
    return file, num_lines
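# Usage sketch for openDNA (the file name is hypothetical). The returned
# reader is iterated lazily; num_lines is a line-based size estimate (an
# exact record count only for fastq).
reader, num_lines = openDNA('contigs.fasta')
print('size estimate:', num_lines)
for rec in reader:
    print(rec.name, len(rec.seq))
    break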
def collapse_reads(infile, outfile=None, min_length=15):
    """Collapse identical reads, writing collapsed reads to a new fasta file.
    Retains the copy number in the fasta headers, so each sequence in the
    resulting file is unique.

    Args:
        infile: input fastq file
        outfile: output fasta file with collapsed reads
        min_length: minimum length of read to include
    Returns:
        True if successful, otherwise False
    """
    if outfile is None:
        outfile = os.path.splitext(infile)[0] + '_collapsed.fa'
    print('collapsing reads %s' % infile)
    ext = os.path.splitext(infile)[1]
    if ext == '.fastq':
        fastfile = HTSeq.FastqReader(infile, "solexa")
    elif ext in ('.fa', '.fasta'):
        fastfile = HTSeq.FastaReader(infile)
    else:
        print('not fasta or fastq')
        return False
    total = 0
    f = {}
    # Tally identical sequences.
    for s in fastfile:
        seq = s.seq.decode()
        if seq in f:
            f[seq]['reads'] += 1
        else:
            f[seq] = {'name': s.name, 'reads': 1}
        total += 1
    df = pd.DataFrame.from_dict(f, orient='index')
    df.index.name = 'seq'
    df = df.reset_index()
    l = df.seq.str.len()
    df = df[l >= min_length]
    df = df.drop(columns=['name'])  # keyword form; positional axis is deprecated
    df = df.sort_values(by='reads', ascending=False).reset_index()
    # Name each unique read "<rank>_<copy number>".
    df['read_id'] = df.index.copy()
    df['read_id'] = df.apply(lambda x: str(x.read_id) + '_' + str(x.reads),
                             axis=1)
    utils.dataframe_to_fasta(df, idkey='read_id', outfile=outfile)
    print('collapsed %s reads to %s' % (total, len(df)))
    return True
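# Hedged usage sketch: collapse a small-RNA fastq, keeping reads of at
# least 18 nt. The input path is hypothetical; the "solexa" quality scale
# is hard-coded inside collapse_reads.
ok = collapse_reads('library1.fastq', outfile='library1_collapsed.fa',
                    min_length=18)
print('collapsed ok:', ok)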
def fastq_to_fasta(infile, rename=True):
    """Fastq to fasta"""
    fastqfile = HTSeq.FastqReader(infile, "solexa")
    outfile = open(os.path.splitext(infile)[0] + '.fa', 'w')
    i = 1
    for s in fastqfile:
        if rename:
            s.name = str(i)
        s.write_to_fasta_file(outfile)
        i += 1
    outfile.close()
    return
def summary(fastq_file, qual_scale="phred"): fastq_iterator = HTSeq.FastqReader(fastq_file, qual_scale) for sequence in itertools.islice(fastq_iterator, 1): qualsum = numpy.zeros(len(sequence), numpy.int) counts = numpy.zeros((len(sequence), 5), numpy.int) nsequence = 0 for sequence in fastq_iterator: qualsum += sequence.qual nsequence += 1 sequence.add_bases_to_count_array(counts) return (qualsum / float(nsequence), counts)
def create_random_fastq(sourcefile, path, sizes=None):
    """Generate multiple random subsets of reads for testing"""
    fastqfile = HTSeq.FastqReader(sourcefile, "solexa")
    sequences = [s for s in fastqfile]
    print('source file has %s seqs' % len(sequences))
    if sizes is None:
        sizes = np.arange(5e5, 7.e6, 5e5)
    for s in sizes:
        label = str(s / 1e6)
        name = os.path.join(path, 'test_%s.fa' % label)
        create_random_subset(sequences=sequences, size=s, outfile=name)
    return
def subSampleFastQSE(ReadFraction, FastQFileIn, FastQFileOut, Zip=False):
    in1 = iter(HTSeq.FastqReader(FastQFileIn))
    out1 = open(FastQFileOut, "w")
    for read1 in in1:
        if random.random() < ReadFraction:
            read1.write_to_fastq_file(out1)
    in1.close()
    out1.close()
    if Zip:
        system('gzip %s' % (FastQFileOut))
def move_barcode_to_name_in_fastq(filename, out_dir):
    if not os.path.exists(out_dir):
        os.system('mkdir ' + out_dir)
    outf = open(out_dir + '/%s' % os.path.basename(filename), 'w')
    fastq = HTSeq.FastqReader(filename)
    for read in fastq:
        if len(read.seq) < 14:
            continue
        # Phred+33 encoding: ASCII 53 ('5') corresponds to quality 20.
        if min(read.qualstr[:9]) < 53:
            continue
        # Move the 9-base barcode from the sequence into the read name
        # (decoded, since bytes cannot be concatenated with the str name).
        _seq = read.seq.decode()
        n_read = HTSeq.SequenceWithQualities(
            read.seq[9:],
            read.name.partition(' ')[0] + '#' + _seq[0:9],
            read.qualstr[9:])
        n_read.write_to_fastq_file(outf)
    outf.close()
def generate_seq_stats(seqfile, header, table=None, fastqfile=True):
    '''
    This function creates the JSON files table.j, hist.j and edges.j, which
    are the basis for the sequence statistics table and graph visualized in
    the Sequence distribution tab.

    If no table object is provided, headers are created and a table object
    is returned with two columns, headers and values. If a table object is
    provided, the function adds a new column to it.

    table: existing table (for adding a column)
    seqfile: path to sequence file (fasta/fastq)
    header: name of the column
    fastqfile: the function assumes a fastq file; "False" will accept fasta
    '''
    if not table:
        table = {
            'Statistic': [
                'Count (#)', 'Length (bp)', 'Over 100 bp', 'Over 500 bp',
                'Over 1000 bp', 'Over 5000 bp', 'Over 10000 bp',
                'Largest (bp)', 'Smallest (bp)', 'Average length (bp)',
                'Median (bp)', 'N50'
            ]
        }
    # Parse sequence file
    if fastqfile:
        seqlengths = [
            len(s[0]) for s in HTSeq.FastqReader(seqfile, raw_iterator=True)
        ]
    else:
        seqlengths = [
            len(s[0]) for s in HTSeq.FastaReader(seqfile, raw_iterator=True)
        ]
    # Calculate statistics (one value per row of the 'Statistic' column)
    table[header] = []
    table[header].append(len(seqlengths))
    table[header].append(sum(seqlengths))
    table[header].append(len([x for x in seqlengths if x > 100]))
    table[header].append(len([x for x in seqlengths if x > 500]))
    table[header].append(len([x for x in seqlengths if x > 1000]))
    table[header].append(len([x for x in seqlengths if x > 5000]))
    table[header].append(len([x for x in seqlengths if x > 10000]))
    table[header].append(max(seqlengths))
    table[header].append(min(seqlengths))
    table[header].append(np.mean(seqlengths))
    # was missing: the 'Median (bp)' row had no corresponding value
    table[header].append(np.median(seqlengths))
    table[header].append(calculate_n50(seqlengths))
    # Create histogram data
    hist, edges = np.histogram(seqlengths, density=False,
                               bins=int(max(seqlengths) / 10))
    return (table, hist.tolist(), edges.tolist())
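# Usage sketch: chain calls to grow the table one column per file; the
# first call creates the table, later calls pass it back in. The file
# names are made up.
table, hist, edges = generate_seq_stats('run1.fastq', 'run1')
table, hist, edges = generate_seq_stats('run2.fastq', 'run2', table=table)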
def demultiplex(infile, outfile, sequences, seq2regex):
    fastq_file = HTSeq.FastqReader(infile)
    with open(outfile, "w+") as outf:
        for read in fastq_file:
            for sequence in sequences:
                if 'r' not in sequence:
                    continue
                # read.seq is bytes under Python 3, so the patterns in
                # seq2regex must be bytes patterns.
                match = re.search(seq2regex[sequence], read.seq)
                if not match:
                    continue
                # Trim the matched barcode off the left end and keep the rest.
                barcode = HTSeq.Sequence(match.group(0))
                read2 = read.trim_left_end_with_quals(barcode)
                read2.write_to_fastq_file(outf)
                break
def readFile(filename, fileType):
    """
    :rtype : the DNA sequence as a list of uppercase characters
    """
    fasta_file = ""  # dummy initialization
    if fileType == FASTA:
        fasta_file = HTSeq.FastaReader(filename)
    elif fileType == FASTQ:
        fasta_file = HTSeq.FastqReader(filename)
    sequence = ""
    for read in fasta_file:
        sequence += read.seq.decode()  # seq is bytes under Python 3
    # list comprehension replaces map(), which returns an iterator in Python 3
    return [x.upper() for x in sequence]