def main(config_file, fastq_dir):
    """Trim barcode-specific sequences from FASTQ files, then length-filter them.

    Steps, as performed below:

    1. Load the YAML configuration from ``config_file``.
    2. Collect every ``*.fastq`` file in ``fastq_dir`` and group them with
       ``combine_pairs``.
    3. For each group, look up the barcode derived from the first file name
       in ``config["barcodes"]``; groups whose barcode is missing are skipped.
    4. Trim each file with ``_trim_from_front`` using the sequences found in
       ``config["to_trim"][<sample type>]`` (index 0 for the first file,
       index 1 for the second file when the group has more than one).
    5. Re-group the trimmed outputs and filter them by read length using the
       "fastq-sanger" quality format.

    Args:
        config_file: path to the YAML configuration file. The code reads the
            "barcodes" and "to_trim" keys; each barcode entry is indexed with
            [0] (printed as the sample ID) and [1] (the sample type).
        fastq_dir: directory searched for ``*.fastq`` files.
    """
    with open(config_file) as in_handle:
        # NOTE(review): yaml.load can construct arbitrary Python objects;
        # prefer yaml.safe_load if the config can come from an untrusted source.
        config = yaml.load(in_handle)
    barcode_info = config["barcodes"]

    print("Processing %s." % (fastq_dir))
    in_files = glob.glob(os.path.join(fastq_dir, "*.fastq"))
    print("Found %s in %s. " % (in_files, fastq_dir))
    print("Combining paired-end files, if found.")
    pairs = combine_pairs(in_files)
    print("Calulcated pairs: %s." % (pairs))

    out_files = []
    for pair in pairs:
        barcode = _determine_barcode_from_filename(pair[0])
        print("Detected barcode: %s" % barcode)
        if barcode not in barcode_info:
            print("barcode %s not found in the YAML file, skipping." % (
                barcode))
            continue
        print("Sample ID: %s" % (barcode_info[barcode][0]))
        # Named sample_type rather than ``type`` to avoid shadowing the builtin.
        sample_type = barcode_info[barcode][1]
        print("Sample type: %s" % (sample_type))
        to_trim = config["to_trim"][sample_type]

        print("Trimming off %s and any bases before it from %s."
              % (to_trim[0], pair[0]))
        out_files.append(_trim_from_front(pair[0], to_trim[0]))
        if len(pair) > 1:
            print("Trimming off %s and any bases before it from %s."
                  % (to_trim[1], pair[1]))
            out_files.append(_trim_from_front(pair[1], to_trim[1]))

    # Flatten whatever _trim_from_front returned into one list before
    # regrouping the trimmed files.
    out_files = list(flatten(out_files))
    out_files = combine_pairs(out_files)
    for trimmed in out_files:
        if len(trimmed) > 1:
            filter_reads_by_length(trimmed[0], trimmed[1], "fastq-sanger")
        else:
            filter_single_reads_by_length(trimmed[0], "fastq-sanger")
def main(config_file, fastq_dir):
    """Run barcode-driven trimming and length filtering over a FASTQ directory.

    The YAML file at ``config_file`` is loaded and its "barcodes" and
    "to_trim" entries are used as follows: every ``*.fastq`` file found in
    ``fastq_dir`` is grouped via ``combine_pairs``; the barcode derived from
    the first file name of each group selects an entry in
    ``config["barcodes"]`` (element 0 is printed as the sample ID, element 1
    is the sample type). The sample type selects the sequences in
    ``config["to_trim"]`` passed to ``_trim_from_front`` (element 0 for the
    first file, element 1 for the second file if present). Groups with an
    unknown barcode are reported and skipped. The trimmed files are then
    regrouped and filtered by read length with the "fastq-sanger" format.

    Args:
        config_file: path to the YAML configuration file.
        fastq_dir: directory searched for ``*.fastq`` files.
    """
    with open(config_file) as in_handle:
        # NOTE(review): yaml.load can instantiate arbitrary Python objects.
        # Switch to yaml.safe_load if this config is not fully trusted.
        config = yaml.load(in_handle)
    barcode_info = config["barcodes"]

    print("Processing %s." % (fastq_dir))
    in_files = glob.glob(os.path.join(fastq_dir, "*.fastq"))
    print("Found %s in %s. " % (in_files, fastq_dir))
    print("Combining paired-end files, if found.")
    pairs = combine_pairs(in_files)
    print("Calulcated pairs: %s." % (pairs))

    out_files = []
    for pair in pairs:
        barcode = _determine_barcode_from_filename(pair[0])
        print("Detected barcode: %s" % barcode)
        # Membership is tested on the mapping itself; no need for .keys().
        if barcode not in barcode_info:
            print("barcode %s not found in the YAML file, skipping."
                  % (barcode))
            continue

        sample = barcode_info[barcode]
        print("Sample ID: %s" % (sample[0]))
        # ``sample_type`` instead of ``type`` so the builtin is not shadowed.
        sample_type = sample[1]
        print("Sample type: %s" % (sample_type))
        to_trim = config["to_trim"][sample_type]

        # Trim each file of the group with the sequence at the same index.
        print("Trimming off %s and any bases before it from %s."
              % (to_trim[0], pair[0]))
        out_files.append(_trim_from_front(pair[0], to_trim[0]))
        if len(pair) > 1:
            print("Trimming off %s and any bases before it from %s."
                  % (to_trim[1], pair[1]))
            out_files.append(_trim_from_front(pair[1], to_trim[1]))

    # Collapse the collected results into a flat list, then regroup them.
    out_files = combine_pairs(list(flatten(out_files)))
    for trimmed in out_files:
        if len(trimmed) > 1:
            filter_reads_by_length(trimmed[0], trimmed[1], "fastq-sanger")
        else:
            filter_single_reads_by_length(trimmed[0], "fastq-sanger")
def remove_short_reads(fastq_files, dirs, lane_config):
    """Filter out reads below a minimum length, then delete the input files.

    The length threshold is taken from
    ``lane_config["algorithm"]["min_read_length"]`` and defaults to 20 bases
    when that key is absent.

    Args:
        fastq_files: sequence of FASTQ file paths; treated as paired-end when
            ``is_pair(fastq_files)`` is true, otherwise only the first entry
            is filtered.
        dirs: not used by this function; kept so the signature is unchanged.
        lane_config: configuration mapping; must contain an "algorithm"
            mapping and is also passed to ``_get_quality_format``.

    Returns:
        For paired input, the value returned by
        ``fastq.filter_reads_by_length``; for single-end input, a one-element
        list holding the result of ``fastq.filter_single_reads_by_length``.
    """
    min_length = int(lane_config["algorithm"].get("min_read_length", 20))
    supplied_quality_format = _get_quality_format(lane_config)
    if supplied_quality_format == "illumina":
        quality_format = "fastq-illumina"
    else:
        quality_format = "fastq-sanger"

    if is_pair(fastq_files):
        fastq1, fastq2 = fastq_files
        out_files = fastq.filter_reads_by_length(fastq1, fastq2,
                                                 quality_format, min_length)
    else:
        out_files = [fastq.filter_single_reads_by_length(fastq_files[0],
                                                         quality_format,
                                                         min_length)]

    # Explicit loop instead of map(): map() is lazy on Python 3, so using it
    # only for its side effect would silently leave the inputs on disk.
    for fastq_file in fastq_files:
        os.remove(fastq_file)
    return out_files
def _remove_short_reads(fastq_files, dirs, lane_config):
    """Filter out reads shorter than 20 bases from one or two FASTQ files.

    Args:
        fastq_files: sequence of FASTQ files; treated as paired-end when
            ``is_pair(fastq_files)`` is true, otherwise only the first entry
            is filtered.
        dirs: not used by this function; kept so the signature is unchanged.
        lane_config: configuration passed to ``_get_quality_format`` to pick
            the quality encoding ("fastq-illumina" when it reports
            "illumina", otherwise "fastq-sanger").

    Returns:
        For paired input, the value returned by
        ``fastq.filter_reads_by_length``; for single-end input, a one-element
        list holding the result of ``fastq.filter_single_reads_by_length``.
    """
    min_length = 20
    supplied_quality_format = _get_quality_format(lane_config)
    if supplied_quality_format == "illumina":
        quality_format = "fastq-illumina"
    else:
        quality_format = "fastq-sanger"

    if is_pair(fastq_files):
        fastq1, fastq2 = fastq_files
        out_files = fastq.filter_reads_by_length(fastq1, fastq2,
                                                 quality_format, min_length)
    else:
        # Wrap the single result in a list, matching the other read-filtering
        # helpers in this file, so callers always receive a collection.
        out_files = [fastq.filter_single_reads_by_length(fastq_files[0],
                                                         quality_format,
                                                         min_length)]
    return out_files
def _remove_short_reads(fastq_files, dirs, lane_config):
    """Drop reads shorter than 20 bases from a single or paired FASTQ input.

    The quality encoding is "fastq-illumina" when ``_get_quality_format``
    reports "illumina" for ``lane_config`` and "fastq-sanger" otherwise.
    ``dirs`` is accepted but not used.

    Returns the result of ``fastq.filter_reads_by_length`` for paired input,
    or a one-element list with the result of
    ``fastq.filter_single_reads_by_length`` for single-end input.
    """
    threshold = 20
    is_illumina = _get_quality_format(lane_config) == "illumina"
    encoding = "fastq-illumina" if is_illumina else "fastq-sanger"

    # Single-end input: only the first file is filtered.
    if not is_pair(fastq_files):
        filtered = fastq.filter_single_reads_by_length(
            fastq_files[0], encoding, threshold)
        return [filtered]

    # Paired-end input: both mates are filtered together.
    first_mate, second_mate = fastq_files
    return fastq.filter_reads_by_length(
        first_mate, second_mate, encoding, threshold)
def remove_short_reads(fastq_files, dirs, lane_config):
    """Remove reads shorter than a configurable length and delete the inputs.

    The minimum read length comes from
    ``lane_config["algorithm"]["min_read_length"]`` (default: 20 bases).
    After filtering, every path in ``fastq_files`` is removed from disk.

    Args:
        fastq_files: sequence of FASTQ file paths; treated as paired-end when
            ``is_pair(fastq_files)`` is true, otherwise only the first entry
            is filtered.
        dirs: not used by this function; kept so the signature is unchanged.
        lane_config: configuration mapping; must contain an "algorithm"
            mapping and is also passed to ``_get_quality_format``.

    Returns:
        For paired input, the value returned by
        ``fastq.filter_reads_by_length``; for single-end input, a one-element
        list holding the result of ``fastq.filter_single_reads_by_length``.
    """
    min_length = int(lane_config["algorithm"].get("min_read_length", 20))
    if _get_quality_format(lane_config) == "illumina":
        quality_format = "fastq-illumina"
    else:
        quality_format = "fastq-sanger"

    if is_pair(fastq_files):
        fastq1, fastq2 = fastq_files
        out_files = fastq.filter_reads_by_length(fastq1, fastq2,
                                                 quality_format, min_length)
    else:
        out_files = [
            fastq.filter_single_reads_by_length(fastq_files[0],
                                                quality_format, min_length)
        ]

    # A plain loop is used for the deletions: map() returns a lazy iterator
    # on Python 3, so map(os.remove, ...) would never actually run there.
    for fastq_file in fastq_files:
        os.remove(fastq_file)
    return out_files