Пример #1
0
def split_large_fastq_file(file_name, output_file_pattern, cutoff=1000000):
	''' Split large fastq file.
	For example:
		output_file_pattern = "/home/akomissarov/data/asian_seabass/raw_reads/500bp_insert_lane1_read1.chunk%s.fastq"	
	'''

	def write_data(reads, chunk, output_file_pattern):
		with open(output_file_pattern % chunk, "w") as fh:
			for read in reads:
				fh.write(read.fastq)

	reads = []
	chunk = 1
	print "Read data..."
	for i, read in enumerate(fastq_reader(file_name)):
		print i, "\r",
		if reads and i % cutoff == 0:
			print "\nSave reads", i, "for chunk", chunk
			write_data(reads, chunk, output_file_pattern)
			chunk += 1
			reads = []
		reads.append(read)
	if reads:
		print "\nLast one. Save reads", i, "for chunk", chunk
		write_data(reads, chunk, output_file_pattern)
Пример #2
0
def clean_single_read_data(fastq1_file, fastq1ok_file, fastq_bad_file, verbose=False, adapters_file=None, cutoff=None, polyG_cutoff=23):
	''' Remove reads containing N, # quality, polyG/polyC tracks and adapters.
	'''
	wh1 = open(fastq1ok_file, "w")
	bad = open(fastq_bad_file, "w")

	statistics = {
		"pe": 0,
		"se": 0,
		"N": 0,
		"zeroQ": 0,
		"polyC%s" % polyG_cutoff: 0,
		"polyG%s" % polyG_cutoff: 0,
		"adapters": 0,
	}

	if cutoff:
		cutoff = int(cutoff)
		cutoff_key = "length%s" % cutoff
		statistics[cutoff_key] = 0


	print "Load adapters file"
	adapters = []
	if adapters_file:
		with open(adapters_file) as fh:
			for line in fh:
				adap = line.strip().split()[0]
				rev_adap = get_revcomp(adap)
				if not adap in adapters:
					adapters.append(adap)
				if not rev_adap in adapters:
					adapters.append(rev_adap)
	else:
		print "Adapter file missing"
	print "Number of adapters:", len(adapters)

	for i, read1 in enumerate(fastq_reader(fastq1_file)):
		error1 = None
		if verbose:
			print i, round(100 * statistics["se"]/float(i+1), 2), "% of good", statistics, "\r",
		if cutoff:
			if read1.length < cutoff:
				error1 = cutoff_key
		if not error1:
			error1 = is_bad_read(read1, adapters, polyG_cutoff)
		if error1:
			bad.write(read1.fastq_with_error(error1))
			statistics[error1] += 1
		else:
			wh1.write(read1.fastq)
			statistics["se"] += 1
			
	wh1.close()
	bad.close()
	if i > 0:
		statistics["fraction"] = statistics["pe"]/float(i)
	print
	print statistics
	return statistics
Пример #3
0
def fastq_to_fasta(fastq_file, fasta_file, verbose=False):
	''' Fastq to fasta.
	'''
	print "Fastq to fasta..."
	with open(fasta_file, "w") as fh:
		for i, read in enumerate(fastq_reader(fastq_file)):
			if verbose:
				print i, "\r",
			fh.write(read.fasta)
Пример #4
0
def filter_reads_by_gc(fastq_file, output_file, min_gc, max_gc):
	''' Filter reads between min_gc and max_gc.
	'''
	with open(output_file, "w") as fh:
		for read in fastq_reader(fastq_file):
			gc = read.gc
			if gc < max_gc and gc > min_gc:
				print round(gc, 3), read.seq
				fh.write(read.fasta)
Пример #5
0
def iter_pe_data(fastq1_file, fastq2_file):
	''' Iterate over two paired-end fastq files in lockstep.

	Yields (read1, read2) tuples taken from fastq_reader(fastq1_file) and
	fastq_reader(fastq2_file). Iteration stops when both readers are
	exhausted at the same step.

	@param fastq1_file: fastq file with the first reads
	@param fastq2_file: fastq file with the second reads
	@raise Exception: when one file runs out of reads before the other
	'''
	left_reader = fastq_reader(fastq1_file)
	right_reader = fastq_reader(fastq2_file)
	# Private marker, so that any value produced by the readers stays valid.
	exhausted = object()
	while True:
		read1 = next(left_reader, exhausted)
		read2 = next(right_reader, exhausted)
		left_done = read1 is exhausted
		right_done = read2 is exhausted
		if left_done and right_done:
			return
		if left_done or right_done:
			raise Exception("Not equal number of reads in PE run") 
		yield read1, read2
Пример #6
0
def main(fastq1_file, fastq2_file, fastq1ok_file, fastq2ok_file, fastq_se_file):
	"""
	"""
	fh1 = fastq_reader(fastq1_file)
	fh2 = fastq_reader(fastq2_file)

	wh1 = open(fastq1ok_file, "w")
	wh2 = open(fastq2ok_file, "w")
	fse = open(fastq_se_file, "w")

	i = 0

	while True:
		read1, read2 = get_next_pair(fh1, fh2, fse)
		if read1 is None:
			break
		wh1.write(read1.fastq)
		wh2.write(read2.fastq)
		i += 1
		print i, "\r",
	print
Пример #7
0
def fix_uncorrect_long_quality(fastq_file, corrected_fastq_output):
	''' Fix too long quality scores in corrupted HiSeq files.

	@param fastq_file: fastq_file
	@param corrected_fastq_output: corrected fastq output
	'''
	bp200 = 0
	total = 0.
	with open(corrected_fastq_output, "w") as fh:
		for i, read in enumerate(fastq_reader(fastq_file)):
			print i, "\r",
			if len(read.seq) != len(read.qual):
				read.qual = read.qual[:len(read.seq)]
				print
				print read.fastq
			fh.write(read.fastq)
Пример #8
0
def clean_short_reads(fastq1_file, fastq1ok_file, fastq_short_file, cutoff, verbose=False):
	''' Remove short reads from fastq file.
	'''
	statistics = {
		"short": 0,
		"ok": 0,
		"fraction": 0.0,
	}
	with open(fastq_short_file, "w") as bad:
		with open(fastq1ok_file, "w") as good:
			for i, read1 in enumerate(fastq_reader(fastq1_file)):
				statistics["fraction"] = statistics["ok"]/float(i+1)
				if verbose:
					print i, statistics, "\r",
				if len(read1.seq) < 100:
					bad.write(read1.fastq_with_error("short"))
					statistics["short"] += 1
				else:
					good.write(read1.fastq)
					statistics["ok"] += 1
	print statistics
	return statistics
Пример #9
0
def separate_reads_witn_n_and_sharps(fastq_file, output_file, reads_with_n_file, reads_with_sharp_file):
	''' Separate reads from fastq file witn N and sharp quality scores.
	'''
	reads = []
	print "Read data..."
	k = 0
	s = 0
	with open(output_file, "w") as good_fh:
		with open(reads_with_n_file, "w") as bad_fh:
			with open(reads_with_sharp_file, "w") as sharp_fh:
				for i, read in enumerate(fastq_reader(fastq_file)):
					if "n" in read.sequence or "N" in read.sequence:
						k += 1 
						print i, k, s, float(k+1)/(i+1), float(s+1)/(i+1), "\r",
						bad_fh.write(read.fastq)
					elif "#" in read.qual:
						s += 1
						print i, k, s, float(k+1)/(i+1), float(s+1)/(i+1), "\r",
						sharp_fh.write(read.fastq)
					else:
						good_fh.write(read.fastq)
	print
Пример #10
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
#@created: 23.02.2011
#@author: Aleksey Komissarov
#@contact: [email protected] 


from trseeker.seqio.sra_file import fastq_reader

fastq_file = "/stripe2/assembly_test/Caenorhabditis_elegans/raw_reads/DRR008443_500bp_1.fastq"
output_fastq_file = "/stripe2/assembly_test/Caenorhabditis_elegans/raw_reads/500K_1.fastq"

with open(output_fastq_file, "w") as fh:
	for i, read in enumerate(fastq_reader(fastq_file)):
		print i, "\r",
		if i == 500000:
			break
		fh.write(read.fastq)

fastq_file = "/stripe2/assembly_test/Caenorhabditis_elegans/raw_reads/DRR008443_500bp_2.fastq"
output_fastq_file = "/stripe2/assembly_test/Caenorhabditis_elegans/raw_reads/500K_2.fastq"

with open(output_fastq_file, "w") as fh:
	for i, read in enumerate(fastq_reader(fastq_file)):
		print i, "\r",
		if i == 500000:
			break
		fh.write(read.fastq)
Пример #11
0
# -*- coding: utf-8 -*-
#
#@created: 10.10.2013
#@author: Aleksey Komissarov
#@contact: [email protected]

import sys
from trseeker.seqio.sra_file import fastq_reader
import argparse

if __name__ == '__main__':
	
	parser = argparse.ArgumentParser(description='Remove reads with N.')
	parser.add_argument('-i','--input', help='Fastq file', required=True)
	parser.add_argument('-o','--output', help='Fastq file', required=True)
	args = vars(parser.parse_args())
	print "Process data"
	data = []
	ok = 0
	l = int(args["length"])
	with open(args["output"], "w") as fh:
		for i, read in enumerate(fastq_reader(args["input"])):
			if i % 10000 == 0:
				print ok, i, ok/(i+1.), "\r",
			if 'N' in read.seq:
				continue
			fh.write(read.fastq)
			ok += 1
	print
	print "Final: saved %s (%s) from %s" % (ok, ok/(i+1.), i)