def get_study_state(study_id='.'):
    """Summarize a study: which workflows can run, and checksums of its data.

    Parameters
    ----------
    study_id : str
        Path/identifier of the study; defaults to the current directory.

    Returns
    -------
    dict
        ``{'study-id': ..., 'workflow': {'exe': [...], 'nexe': [...]},
        'data': {filepath: md5hex}}`` where 'exe' lists workflows whose
        inputs are all present and 'nexe' those that are blocked.
    """
    available = get_existing_data_types(study_id)

    # Partition workflows by whether every required input type exists.
    runnable, blocked = [], []
    for wf_id in workflows:
        if workflows[wf_id].inputs.issubset(available):
            runnable.append(wf_id)
        else:
            blocked.append(wf_id)

    # Fingerprint each existing data file for change detection.
    # NOTE: md5 is used here; sha256 has been floated as an alternative.
    checksums = {}
    for dtype in available:
        fp = get_data_filepath(dtype, study_id)
        with open(fp, 'rb') as fh:
            checksums[fp] = safe_md5(fh).hexdigest()

    return {'study-id': study_id,
            'workflow': {'exe': runnable, 'nexe': blocked},
            'data': checksums}
def test_safe_md5(self):
    """safe_md5 produces the known digest for a fixed byte payload."""
    expected = 'ab07acbb1e496801937adfa772424bf7'
    stream = BytesIO(b'foo bar baz')
    result = safe_md5(stream)
    self.assertEqual(result.hexdigest(), expected)
    stream.close()
def test_safe_md5(self):
    """Digest of a known payload matches the reference md5 hex string."""
    fd = BytesIO(b'foo bar baz')
    self.assertEqual(safe_md5(fd).hexdigest(),
                     'ab07acbb1e496801937adfa772424bf7')
    fd.close()
def generate_run_xml(self):
    """Generates the run XML file

    Builds one RUN element per sample (sorted by sample name), each
    referencing its EBI experiment (by accession when one already exists,
    otherwise by alias) and describing the demultiplexed fastq file with
    its MD5 checksum.

    Returns
    -------
    ET.Element
        Object with run XML values (the root RUN_SET element)
    """
    run_set = ET.Element(
        'RUN_SET', {
            'xmlns:xsi': self.xmlns_xsi,
            "xsi:noNamespaceSchemaLocation": self.xsi_noNSL % "run"
        })
    for sample_name, sample_prep in sorted(viewitems(self.samples_prep)):
        # Prefer an already-assigned EBI accession; fall back to
        # referencing the experiment by its alias.
        if self._ebi_experiment_accessions[sample_name]:
            experiment_ref_dict = {
                'accession': self._ebi_experiment_accessions[sample_name]
            }
        else:
            experiment_alias = self._get_experiment_alias(sample_name)
            experiment_ref_dict = {'refname': experiment_alias}
        # We only submit fastq
        file_type = 'fastq'
        file_path = self.sample_demux_fps[sample_name]
        # Hash the raw bytes: text mode would decode/translate newlines and
        # corrupt the checksum (and breaks safe_md5 on Python 3). This also
        # matches _add_file_subelement, which already opens with 'rb'.
        with open(file_path, 'rb') as fp:
            md5 = safe_md5(fp).hexdigest()
        run = ET.SubElement(
            run_set, 'RUN', {
                'alias': self._get_run_alias(sample_name),
                'center_name': qiita_config.ebi_center_name
            })
        ET.SubElement(run, 'EXPERIMENT_REF', experiment_ref_dict)
        data_block = ET.SubElement(run, 'DATA_BLOCK')
        files = ET.SubElement(data_block, 'FILES')
        ET.SubElement(
            files, 'FILE', {
                'filename': join(self.ebi_dir, basename(file_path)),
                'filetype': file_type,
                'quality_scoring_system': 'phred',
                'checksum_method': 'MD5',
                'checksum': md5
            })
    return run_set
def generate_run_xml(self):
    """Generates the run XML file

    One RUN element is emitted per sample (sorted by name). Each RUN
    references its experiment — by EBI accession when available, else by
    alias — and lists the demultiplexed fastq file with an MD5 checksum.

    Returns
    -------
    ET.Element
        Object with run XML values (the root RUN_SET element)
    """
    run_set = ET.Element('RUN_SET', {
        'xmlns:xsi': self.xmlns_xsi,
        "xsi:noNamespaceSchemaLocation": self.xsi_noNSL % "run"})
    for sample_name, sample_prep in sorted(viewitems(self.samples_prep)):
        # Use the existing EBI accession when one was assigned; otherwise
        # reference the experiment by alias.
        if self._ebi_experiment_accessions[sample_name]:
            experiment_ref_dict = {
                'accession': self._ebi_experiment_accessions[sample_name]}
        else:
            experiment_alias = self._get_experiment_alias(sample_name)
            experiment_ref_dict = {'refname': experiment_alias}
        # We only submit fastq
        file_type = 'fastq'
        file_path = self.sample_demux_fps[sample_name]
        # Open in binary mode for hashing: text mode can translate newlines
        # (wrong checksum) and hands str to the hasher on Python 3.
        # Consistent with _add_file_subelement, which uses 'rb'.
        with open(file_path, 'rb') as fp:
            md5 = safe_md5(fp).hexdigest()
        run = ET.SubElement(run_set, 'RUN', {
            'alias': self._get_run_alias(sample_name),
            'center_name': qiita_config.ebi_center_name}
        )
        ET.SubElement(run, 'EXPERIMENT_REF', experiment_ref_dict)
        data_block = ET.SubElement(run, 'DATA_BLOCK')
        files = ET.SubElement(data_block, 'FILES')
        ET.SubElement(files, 'FILE', {
            'filename': join(self.ebi_dir, basename(file_path)),
            'filetype': file_type,
            'quality_scoring_system': 'phred',
            'checksum_method': 'MD5',
            'checksum': md5}
        )
    return run_set
def generate_run_xml(self):
    """Generates the run XML file

    Emits one RUN per sample (sorted by name), referencing the sample's
    experiment by alias and describing its prep file with an MD5 checksum.

    Returns
    -------
    xml.etree.Element
        The root element of the generated ``ElementTree``
    """
    run_set = ET.Element(
        'RUN_SET', {
            "xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance",
            "xsi:noNamespaceSchemaLocation": "ftp://ftp.sra.ebi.ac.uk/meta/xsd"
                                             "/sra_1_3/SRA.run.xsd"
        })
    for sample_name, sample_info in sorted(viewitems(self.samples)):
        experiment_alias = self._get_experiment_alias(sample_name)
        file_type = sample_info['prep']['file_type']
        file_path = sample_info['prep']['file_path']
        # Hash the raw bytes: text mode would translate newlines (wrong
        # checksum) and yields str, not bytes, on Python 3.
        with open(file_path, 'rb') as fp:
            md5 = safe_md5(fp).hexdigest()
        run = ET.SubElement(
            run_set, 'RUN', {
                'alias': self._get_run_alias(basename(file_path)),
                'center_name': qiita_config.ebi_center_name
            })
        ET.SubElement(run, 'EXPERIMENT_REF', {'refname': experiment_alias})
        data_block = ET.SubElement(run, 'DATA_BLOCK')
        files = ET.SubElement(data_block, 'FILES')
        ET.SubElement(
            files, 'FILE', {
                'filename': join(self.ebi_dir, basename(file_path)),
                'filetype': file_type,
                'quality_scoring_system': 'phred',
                'checksum_method': 'MD5',
                'checksum': md5
            })
    return run_set
def _add_file_subelement(self, add_file, file_type, sample_name, is_forward):
    """Helper for generate_run_xml: describe one demux read file.

    Picks the forward/reverse suffix, checksums the file, and hands the
    FILE attribute dict to *add_file*.
    """
    suffix = self.FWD_READ_SUFFIX if is_forward else self.REV_READ_SUFFIX
    file_path = self.sample_demux_fps[sample_name] + suffix
    with open(file_path, 'rb') as fp:
        checksum = safe_md5(fp).hexdigest()
    add_file({
        'filetype': file_type,
        'quality_scoring_system': 'phred',
        'checksum_method': 'MD5',
        'checksum': checksum,
        'filename': join(self.ebi_dir, basename(file_path)),
    })
def generate_run_xml(self):
    """Generates the run XML file

    For each sample (sorted by name) a RUN element is created that points
    at its experiment alias and records the prep file with its MD5 digest.

    Returns
    -------
    xml.etree.Element
        The root element of the generated ``ElementTree``
    """
    run_set = ET.Element('RUN_SET', {
        "xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance",
        "xsi:noNamespaceSchemaLocation": "ftp://ftp.sra.ebi.ac.uk/meta/xsd"
                                         "/sra_1_3/SRA.run.xsd"})
    for sample_name, sample_info in sorted(viewitems(self.samples)):
        experiment_alias = self._get_experiment_alias(sample_name)
        file_type = sample_info['prep']['file_type']
        file_path = sample_info['prep']['file_path']
        # Binary mode for hashing: text mode can alter the byte stream
        # (newline translation) and breaks safe_md5 on Python 3.
        with open(file_path, 'rb') as fp:
            md5 = safe_md5(fp).hexdigest()
        run = ET.SubElement(run_set, 'RUN', {
            'alias': self._get_run_alias(basename(file_path)),
            'center_name': qiita_config.ebi_center_name}
        )
        ET.SubElement(run, 'EXPERIMENT_REF', {
            'refname': experiment_alias}
        )
        data_block = ET.SubElement(run, 'DATA_BLOCK')
        files = ET.SubElement(data_block, 'FILES')
        ET.SubElement(files, 'FILE', {
            'filename': join(self.ebi_dir, basename(file_path)),
            'filetype': file_type,
            'quality_scoring_system': 'phred',
            'checksum_method': 'MD5',
            'checksum': md5}
        )
    return run_set
def _add_file_subelement(self, add_file, file_type, sample_name, is_forward):
    """generate_run_xml helper: build one FILE description and register it.

    Chooses the read-direction suffix, md5-hashes the demultiplexed file,
    and passes the resulting attribute dict to *add_file*.
    """
    if is_forward:
        read_suffix = self.FWD_READ_SUFFIX
    else:
        read_suffix = self.REV_READ_SUFFIX
    demux_fp = self.sample_demux_fps[sample_name] + read_suffix
    with open(demux_fp, 'rb') as handle:
        digest = safe_md5(handle).hexdigest()
    details = dict(
        filetype=file_type,
        quality_scoring_system='phred',
        checksum_method='MD5',
        checksum=digest,
        filename=join(self.ebi_dir, basename(demux_fp)))
    add_file(details)
def get_study_state(study_id='.'):
    """Return the current state of a study as a plain dict.

    The state records the study id, which workflows are executable
    ('exe': all required input data types present) versus not ('nexe'),
    and an md5 fingerprint for every existing data file.

    Parameters
    ----------
    study_id : str
        Study path/identifier (defaults to the current directory).

    Returns
    -------
    dict
        Keys: 'study-id', 'workflow' ({'exe': [...], 'nexe': [...]}),
        'data' ({filepath: md5 hex digest}).
    """
    data_types = get_existing_data_types(study_id)
    state = {
        'study-id': study_id,
        'workflow': {'exe': [], 'nexe': []},
        'data': {},
    }
    for wf_id in workflows:
        key = ('exe' if workflows[wf_id].inputs.issubset(data_types)
               else 'nexe')
        state['workflow'][key].append(wf_id)
    for dt in data_types:
        fp = get_data_filepath(dt, study_id)
        # md5 chosen for speed; sha256 has been suggested as an alternative.
        with open(fp, 'rb') as fh:
            digest = safe_md5(fh).hexdigest()
        state['data'][fp] = digest
    return state
def main():
    """Demultiplex Illumina fastq reads into seqs.fna (+ optional qual/fastq).

    Command-line entry point: validates options, then for each lane
    (sequence/barcode/mapping file triple) streams reads through the
    quality filter and writes demultiplexed output under --output_dir.
    Outputs are written to '*.incomplete' files and renamed only on
    success, so a crash never leaves a truncated final file.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    read_arguments_from_file = opts.read_arguments_from_file
    # these arguments can optionally be read from a file, reasoning is to
    # allow arguments that would span over hundreds of samples and would be
    # prohibitive to execute as a command line call
    if read_arguments_from_file:
        # sample_ids is the only one of these arguments that's returned as a
        # string, the rest of them are lists
        if opts.sample_ids:
            opts.sample_ids = ','.join(parse_items(opts.sample_ids))
        if opts.sequence_read_fps:
            opts.sequence_read_fps = parse_items(opts.sequence_read_fps[0])
        if opts.barcode_read_fps:
            opts.barcode_read_fps = parse_items(opts.barcode_read_fps[0])
        if opts.mapping_fps:
            opts.mapping_fps = parse_items(opts.mapping_fps[0])
    # Unpack options into locals.
    sequence_read_fps = opts.sequence_read_fps
    barcode_read_fps = opts.barcode_read_fps
    sample_ids = None
    if opts.sample_ids is not None:
        sample_ids = opts.sample_ids.split(',')
    mapping_fps = opts.mapping_fps
    phred_quality_threshold = opts.phred_quality_threshold
    retain_unassigned_reads = opts.retain_unassigned_reads
    min_per_read_length_fraction = opts.min_per_read_length_fraction
    max_bad_run_length = opts.max_bad_run_length
    rev_comp = opts.rev_comp
    rev_comp_barcode = opts.rev_comp_barcode
    rev_comp_mapping_barcodes = opts.rev_comp_mapping_barcodes
    seq_max_N = opts.sequence_max_n
    start_seq_id = opts.start_seq_id
    # NEED TO FIX THIS FUNCTIONALITY - CURRENTLY READING THE WRONG FIELD
    # opts.filter_bad_illumina_qual_digit
    filter_bad_illumina_qual_digit = False
    store_qual_scores = opts.store_qual_scores
    store_demultiplexed_fastq = opts.store_demultiplexed_fastq
    barcode_type = opts.barcode_type
    max_barcode_errors = opts.max_barcode_errors
    # if this is not a demultiplexed run,
    if barcode_type == 'not-barcoded':
        # Non-barcoded data: one explicit sample id per sequence file, and
        # no barcode/mapping files (placeholders keep the lists parallel).
        if sample_ids is None:
            option_parser.error(
                "If not providing barcode reads (because "
                "your data is not multiplexed), must provide --sample_ids.")
        if len(sample_ids) != len(sequence_read_fps):
            option_parser.error(
                "If providing --sample_ids (because "
                "your data is not multiplexed), must provide the same number "
                "of sample ids as sequence read filepaths.")
        barcode_read_fps = [None] * len(sequence_read_fps)
        mapping_fps = [None] * len(sequence_read_fps)
    elif barcode_read_fps is None:
        option_parser.error("Must provide --barcode_read_fps if "
                            "--barcode_type is not 'not-barcoded'")
    elif mapping_fps is None:
        option_parser.error("Must provide --mapping_fps if "
                            "--barcode_type is not 'not-barcoded'")
    phred_offset = opts.phred_offset
    if phred_offset is not None:
        try:
            phred_offset = int(phred_offset)
        except ValueError:
            # shouldn't be able to get here...
            option_parser.error(
                "If --phred_offset is provided, it must be a valid integer.")
    if opts.last_bad_quality_char is not None:
        option_parser.error(
            '--last_bad_quality_char is no longer supported. '
            'Use -q instead (see option help text by passing -h)')
    if not (0 < min_per_read_length_fraction <= 1):
        option_parser.error('--min_per_read_length_fraction must be greater '
                            'than 0 and less than or equal to 1. You passed '
                            '%1.5f.' % min_per_read_length_fraction)
    # None => no barcode error correction for this barcode type.
    barcode_correction_fn = BARCODE_DECODER_LOOKUP.get(barcode_type, None)
    # A single mapping file may be shared across all lanes.
    if len(mapping_fps) == 1 and len(sequence_read_fps) > 1:
        mapping_fps = mapping_fps * len(sequence_read_fps)
    if len(set([len(sequence_read_fps), len(barcode_read_fps),
                len(mapping_fps)])) > 1:
        option_parser.error("Same number of sequence, barcode, and mapping "
                            "files must be provided.")
    output_dir = opts.output_dir
    create_dir(output_dir)
    output_fp_temp = '%s/seqs.fna.incomplete' % output_dir
    output_fp = '%s/seqs.fna' % output_dir
    output_f = open(output_fp_temp, 'w')
    # NOTE(review): the temp name 'qual.fna.incomplete' is renamed to
    # 'seqs.qual' at the end — works, but the names are inconsistent.
    qual_fp_temp = '%s/qual.fna.incomplete' % output_dir
    qual_fp = '%s/seqs.qual' % output_dir
    output_fastq_fp_temp = '%s/seqs.fastq.incomplete' % output_dir
    output_fastq_fp = '%s/seqs.fastq' % output_dir
    if store_qual_scores:
        qual_f = open(qual_fp_temp, 'w')
        # define a qual writer whether we're storing
        # qual strings or not so we don't have to check
        # every time through the for loop below

        def qual_writer(h, q):
            qual_f.write('>%s\n%s\n' % (h, q))
    else:
        def qual_writer(h, q):
            pass
    if store_demultiplexed_fastq:
        output_fastq_f = open(output_fastq_fp_temp, 'w')
        # define a fastq writer whether we're storing
        # qual strings or not so we don't have to check
        # every time through the for loop below

        def fastq_writer(h, s, q):
            output_fastq_f.write(format_fastq_record(h, s, q))
    else:
        def fastq_writer(h, s, q):
            pass
    log_fp = '%s/split_library_log.txt' % output_dir
    log_f = open(log_fp, 'w')
    histogram_fp = '%s/histograms.txt' % output_dir
    histogram_f = open(histogram_fp, 'w')
    # Process each lane (sequence/barcode/mapping triple) in turn,
    # appending all demultiplexed reads to the same output files.
    for i in range(len(sequence_read_fps)):
        sequence_read_fp = sequence_read_fps[i]
        barcode_read_fp = barcode_read_fps[i]
        mapping_fp = mapping_fps[i]
        if mapping_fp is not None:
            mapping_f = open(mapping_fp, 'U')
            _, _, barcode_to_sample_id, _, _, _, _ = check_map(
                mapping_f,
                disable_primer_check=True,
                has_barcodes=barcode_read_fp is not None)
        else:
            mapping_f = None
            barcode_to_sample_id = {}
        if rev_comp_mapping_barcodes:
            barcode_to_sample_id = {
                str(DNA(k).rc()): v
                for k, v in barcode_to_sample_id.iteritems()
            }
        if barcode_type == 'golay_12':
            invalid_golay_barcodes = get_invalid_golay_barcodes(
                barcode_to_sample_id.keys())
            if len(invalid_golay_barcodes) > 0:
                option_parser.error(
                    "Some or all barcodes are not valid golay "
                    "codes. Do they need to be reverse complemented? If these "
                    "are not golay barcodes pass --barcode_type 12 to disable "
                    "barcode error correction, or pass --barcode_type # if "
                    "the barcodes are not 12 base pairs, where # is the size "
                    "of the barcodes. Invalid codes:\n\t%s"
                    % ' '.join(invalid_golay_barcodes))
        # Record input provenance (paths + md5) in the run log.
        # NOTE(review): the open() handles passed to safe_md5 here are
        # never explicitly closed (CPython GC reclaims them).
        log_f.write("Input file paths\n")
        if mapping_fp is not None:
            log_f.write('Mapping filepath: %s (md5: %s)\n' %
                        (mapping_fp, safe_md5(open(mapping_fp)).hexdigest()))
        log_f.write('Sequence read filepath: %s (md5: %s)\n' %
                    (sequence_read_fp,
                     str(safe_md5(open(sequence_read_fp)).hexdigest())))
        if sequence_read_fp.endswith('.gz'):
            sequence_read_f = gzip_open(sequence_read_fp)
        else:
            sequence_read_f = open(sequence_read_fp, 'U')
        seq_id = start_seq_id
        if barcode_read_fp is not None:
            log_f.write(
                'Barcode read filepath: %s (md5: %s)\n\n' %
                (barcode_read_fp,
                 safe_md5(open(barcode_read_fp)).hexdigest()))
            if barcode_read_fp.endswith('.gz'):
                barcode_read_f = gzip_open(barcode_read_fp)
            else:
                barcode_read_f = open(barcode_read_fp, 'U')
            seq_generator = process_fastq_single_end_read_file(
                sequence_read_f,
                barcode_read_f,
                barcode_to_sample_id,
                store_unassigned=retain_unassigned_reads,
                max_bad_run_length=max_bad_run_length,
                phred_quality_threshold=phred_quality_threshold,
                min_per_read_length_fraction=min_per_read_length_fraction,
                rev_comp=rev_comp,
                rev_comp_barcode=rev_comp_barcode,
                seq_max_N=seq_max_N,
                start_seq_id=start_seq_id,
                filter_bad_illumina_qual_digit=filter_bad_illumina_qual_digit,
                log_f=log_f,
                histogram_f=histogram_f,
                barcode_correction_fn=barcode_correction_fn,
                max_barcode_errors=max_barcode_errors,
                phred_offset=phred_offset)
        else:
            seq_generator = process_fastq_single_end_read_file_no_barcode(
                sequence_read_f,
                sample_ids[i],
                store_unassigned=retain_unassigned_reads,
                max_bad_run_length=max_bad_run_length,
                phred_quality_threshold=phred_quality_threshold,
                min_per_read_length_fraction=min_per_read_length_fraction,
                rev_comp=rev_comp,
                seq_max_N=seq_max_N,
                start_seq_id=start_seq_id,
                filter_bad_illumina_qual_digit=filter_bad_illumina_qual_digit,
                log_f=log_f,
                histogram_f=histogram_f,
                phred_offset=phred_offset)
        # Stream reads to the outputs; the no-op writers above make the
        # optional outputs free when disabled.
        for fasta_header, sequence, quality, seq_id in seq_generator:
            output_f.write('>%s\n%s\n' % (fasta_header, sequence))
            qual_writer(fasta_header, quality)
            fastq_writer(fasta_header, sequence, quality)
        # Continue numbering where this lane left off.
        start_seq_id = seq_id + 1
        log_f.write('\n---\n\n')
    # Atomically publish results: close, then rename .incomplete -> final.
    output_f.close()
    rename(output_fp_temp, output_fp)
    # process the optional output files, as necessary
    if store_qual_scores:
        qual_f.close()
        rename(qual_fp_temp, qual_fp)
    if store_demultiplexed_fastq:
        output_fastq_f.close()
        rename(output_fastq_fp_temp, output_fastq_fp)
def main():
    """Demultiplex Illumina fastq reads into seqs.fna (+ optional qual/fastq).

    Command-line entry point: validates options, then for each lane
    (sequence/barcode/mapping file triple) streams reads through the
    quality filter and writes demultiplexed output under --output_dir.
    Outputs go to '*.incomplete' files and are renamed only on success,
    so a crash never leaves a truncated final file.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    read_arguments_from_file = opts.read_arguments_from_file
    # these arguments can optionally be read from a file, reasoning is to
    # allow arguments that would span over hundreds of samples and would be
    # prohibitive to execute as a command line call
    if read_arguments_from_file:
        # sample_ids is the only one of these arguments that's returned as a
        # string, the rest of them are lists
        if opts.sample_ids:
            opts.sample_ids = ','.join(parse_items(opts.sample_ids))
        if opts.sequence_read_fps:
            opts.sequence_read_fps = parse_items(opts.sequence_read_fps[0])
        if opts.barcode_read_fps:
            opts.barcode_read_fps = parse_items(opts.barcode_read_fps[0])
        if opts.mapping_fps:
            opts.mapping_fps = parse_items(opts.mapping_fps[0])
    # Unpack options into locals.
    sequence_read_fps = opts.sequence_read_fps
    barcode_read_fps = opts.barcode_read_fps
    sample_ids = None
    if opts.sample_ids is not None:
        sample_ids = opts.sample_ids.split(',')
    mapping_fps = opts.mapping_fps
    phred_quality_threshold = opts.phred_quality_threshold
    retain_unassigned_reads = opts.retain_unassigned_reads
    min_per_read_length_fraction = opts.min_per_read_length_fraction
    max_bad_run_length = opts.max_bad_run_length
    rev_comp = opts.rev_comp
    rev_comp_barcode = opts.rev_comp_barcode
    rev_comp_mapping_barcodes = opts.rev_comp_mapping_barcodes
    seq_max_N = opts.sequence_max_n
    start_seq_id = opts.start_seq_id
    # NEED TO FIX THIS FUNCTIONALITY - CURRENTLY READING THE WRONG FIELD
    # opts.filter_bad_illumina_qual_digit
    filter_bad_illumina_qual_digit = False
    store_qual_scores = opts.store_qual_scores
    store_demultiplexed_fastq = opts.store_demultiplexed_fastq
    barcode_type = opts.barcode_type
    max_barcode_errors = opts.max_barcode_errors
    # if this is not a demultiplexed run,
    if barcode_type == 'not-barcoded':
        # Non-barcoded data: one explicit sample id per sequence file, and
        # no barcode/mapping files (placeholders keep the lists parallel).
        if sample_ids is None:
            option_parser.error("If not providing barcode reads (because "
                                "your data is not multiplexed), must provide --sample_ids.")
        if len(sample_ids) != len(sequence_read_fps):
            option_parser.error("If providing --sample_ids (because "
                                "your data is not multiplexed), must provide the same number "
                                "of sample ids as sequence read filepaths.")
        barcode_read_fps = [None] * len(sequence_read_fps)
        mapping_fps = [None] * len(sequence_read_fps)
    elif barcode_read_fps is None:
        option_parser.error("Must provide --barcode_read_fps if "
                            "--barcode_type is not 'not-barcoded'")
    elif mapping_fps is None:
        option_parser.error("Must provide --mapping_fps if "
                            "--barcode_type is not 'not-barcoded'")
    phred_offset = opts.phred_offset
    if phred_offset is not None:
        try:
            phred_offset = int(phred_offset)
        except ValueError:
            # shouldn't be able to get here...
            option_parser.error(
                "If --phred_offset is provided, it must be a valid integer.")
    if opts.last_bad_quality_char is not None:
        option_parser.error('--last_bad_quality_char is no longer supported. '
                            'Use -q instead (see option help text by passing -h)')
    if not (0 < min_per_read_length_fraction <= 1):
        option_parser.error('--min_per_read_length_fraction must be greater '
                            'than 0 and less than or equal to 1. You passed '
                            '%1.5f.' % min_per_read_length_fraction)
    # None => no barcode error correction for this barcode type.
    barcode_correction_fn = BARCODE_DECODER_LOOKUP.get(barcode_type, None)
    # A single mapping file may be shared across all lanes.
    if len(mapping_fps) == 1 and len(sequence_read_fps) > 1:
        mapping_fps = mapping_fps * len(sequence_read_fps)
    if len(set([len(sequence_read_fps),
                len(barcode_read_fps),
                len(mapping_fps)])) > 1:
        option_parser.error("Same number of sequence, barcode, and mapping "
                            "files must be provided.")
    output_dir = opts.output_dir
    create_dir(output_dir)
    output_fp_temp = '%s/seqs.fna.incomplete' % output_dir
    output_fp = '%s/seqs.fna' % output_dir
    output_f = open(output_fp_temp, 'w')
    # NOTE(review): temp name 'qual.fna.incomplete' is renamed to
    # 'seqs.qual' at the end — works, but the names are inconsistent.
    qual_fp_temp = '%s/qual.fna.incomplete' % output_dir
    qual_fp = '%s/seqs.qual' % output_dir
    output_fastq_fp_temp = '%s/seqs.fastq.incomplete' % output_dir
    output_fastq_fp = '%s/seqs.fastq' % output_dir
    if store_qual_scores:
        qual_f = open(qual_fp_temp, 'w')
        # define a qual writer whether we're storing
        # qual strings or not so we don't have to check
        # every time through the for loop below

        def qual_writer(h, q):
            qual_f.write('>%s\n%s\n' % (h, q))
    else:
        def qual_writer(h, q):
            pass
    if store_demultiplexed_fastq:
        output_fastq_f = open(output_fastq_fp_temp, 'w')
        # define a fastq writer whether we're storing
        # qual strings or not so we don't have to check
        # every time through the for loop below

        def fastq_writer(h, s, q):
            output_fastq_f.write(format_fastq_record(h, s, q))
    else:
        def fastq_writer(h, s, q):
            pass
    log_fp = '%s/split_library_log.txt' % output_dir
    log_f = open(log_fp, 'w')
    histogram_fp = '%s/histograms.txt' % output_dir
    histogram_f = open(histogram_fp, 'w')
    # Process each lane (sequence/barcode/mapping triple) in turn,
    # appending all demultiplexed reads to the same output files.
    for i in range(len(sequence_read_fps)):
        sequence_read_fp = sequence_read_fps[i]
        barcode_read_fp = barcode_read_fps[i]
        mapping_fp = mapping_fps[i]
        if mapping_fp is not None:
            mapping_f = open(mapping_fp, 'U')
            _, _, barcode_to_sample_id, _, _, _, _ = check_map(
                mapping_f,
                disable_primer_check=True,
                has_barcodes=barcode_read_fp is not None)
        else:
            mapping_f = None
            barcode_to_sample_id = {}
        if rev_comp_mapping_barcodes:
            barcode_to_sample_id = {str(DNA(k).rc()): v
                                    for k, v in barcode_to_sample_id.iteritems()}
        if barcode_type == 'golay_12':
            invalid_golay_barcodes = get_invalid_golay_barcodes(
                barcode_to_sample_id.keys())
            if len(invalid_golay_barcodes) > 0:
                option_parser.error("Some or all barcodes are not valid golay "
                                    "codes. Do they need to be reverse complemented? If these "
                                    "are not golay barcodes pass --barcode_type 12 to disable "
                                    "barcode error correction, or pass --barcode_type # if "
                                    "the barcodes are not 12 base pairs, where # is the size "
                                    "of the barcodes. Invalid codes:\n\t%s"
                                    % ' '.join(invalid_golay_barcodes))
        # Record input provenance (paths + md5) in the run log.
        # NOTE(review): the open() handles passed to safe_md5 here are
        # never explicitly closed (CPython GC reclaims them).
        log_f.write("Input file paths\n")
        if mapping_fp is not None:
            log_f.write('Mapping filepath: %s (md5: %s)\n' %
                        (mapping_fp, safe_md5(open(mapping_fp)).hexdigest()))
        log_f.write('Sequence read filepath: %s (md5: %s)\n' %
                    (sequence_read_fp,
                     str(safe_md5(open(sequence_read_fp)).hexdigest())))
        if sequence_read_fp.endswith('.gz'):
            sequence_read_f = gzip_open(sequence_read_fp)
        else:
            sequence_read_f = open(sequence_read_fp, 'U')
        seq_id = start_seq_id
        if barcode_read_fp is not None:
            log_f.write('Barcode read filepath: %s (md5: %s)\n\n' %
                        (barcode_read_fp,
                         safe_md5(open(barcode_read_fp)).hexdigest()))
            if barcode_read_fp.endswith('.gz'):
                barcode_read_f = gzip_open(barcode_read_fp)
            else:
                barcode_read_f = open(barcode_read_fp, 'U')
            seq_generator = process_fastq_single_end_read_file(
                sequence_read_f,
                barcode_read_f,
                barcode_to_sample_id,
                store_unassigned=retain_unassigned_reads,
                max_bad_run_length=max_bad_run_length,
                phred_quality_threshold=phred_quality_threshold,
                min_per_read_length_fraction=min_per_read_length_fraction,
                rev_comp=rev_comp,
                rev_comp_barcode=rev_comp_barcode,
                seq_max_N=seq_max_N,
                start_seq_id=start_seq_id,
                filter_bad_illumina_qual_digit=filter_bad_illumina_qual_digit,
                log_f=log_f,
                histogram_f=histogram_f,
                barcode_correction_fn=barcode_correction_fn,
                max_barcode_errors=max_barcode_errors,
                phred_offset=phred_offset)
        else:
            seq_generator = process_fastq_single_end_read_file_no_barcode(
                sequence_read_f,
                sample_ids[i],
                store_unassigned=retain_unassigned_reads,
                max_bad_run_length=max_bad_run_length,
                phred_quality_threshold=phred_quality_threshold,
                min_per_read_length_fraction=min_per_read_length_fraction,
                rev_comp=rev_comp,
                seq_max_N=seq_max_N,
                start_seq_id=start_seq_id,
                filter_bad_illumina_qual_digit=filter_bad_illumina_qual_digit,
                log_f=log_f,
                histogram_f=histogram_f,
                phred_offset=phred_offset)
        # Stream reads to the outputs; the no-op writers above make the
        # optional outputs free when disabled.
        for fasta_header, sequence, quality, seq_id in seq_generator:
            output_f.write('>%s\n%s\n' % (fasta_header, sequence))
            qual_writer(fasta_header, quality)
            fastq_writer(fasta_header, sequence, quality)
        # Continue numbering where this lane left off.
        start_seq_id = seq_id + 1
        log_f.write('\n---\n\n')
    # Atomically publish results: close, then rename .incomplete -> final.
    output_f.close()
    rename(output_fp_temp, output_fp)
    # process the optional output files, as necessary
    if store_qual_scores:
        qual_f.close()
        rename(qual_fp_temp, qual_fp)
    if store_demultiplexed_fastq:
        output_fastq_f.close()
        rename(output_fastq_fp_temp, output_fastq_fp)