def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    sequence_read_fps = opts.sequence_read_fps
    barcode_read_fps = opts.barcode_read_fps
    sample_id = opts.sample_id
    mapping_fps = opts.mapping_fps
    phred_quality_threshold = opts.phred_quality_threshold
    retain_unassigned_reads = opts.retain_unassigned_reads
    min_per_read_length_fraction = opts.min_per_read_length_fraction
    max_bad_run_length = opts.max_bad_run_length
    rev_comp = opts.rev_comp
    rev_comp_barcode = opts.rev_comp_barcode
    rev_comp_mapping_barcodes = opts.rev_comp_mapping_barcodes
    seq_max_N = opts.sequence_max_n
    start_seq_id = opts.start_seq_id
    # NEED TO FIX THIS FUNCTIONALITY - CURRENTLY READING THE WRONG FIELD
    # opts.filter_bad_illumina_qual_digit
    filter_bad_illumina_qual_digit = False
    store_qual_scores = opts.store_qual_scores
    store_demultiplexed_fastq = opts.store_demultiplexed_fastq
    barcode_type = opts.barcode_type
    max_barcode_errors = opts.max_barcode_errors

    # if this is not a demultiplexed run,
    if barcode_type == 'not-barcoded':
        if sample_id is None:
            option_parser.error("If not providing barcode reads (because "
                                "your data is not multiplexed), must provide "
                                "a --sample_id.")
        barcode_read_fps = [None] * len(sequence_read_fps)
    elif barcode_read_fps is None:
        option_parser.error("Must provide --barcode_read_fps if "
                            "--barcode_type is not 'not-barcoded'")

    phred_offset = opts.phred_offset
    if phred_offset is not None:
        try:
            phred_to_ascii_f = phred_to_ascii_fs[phred_offset]
        except KeyError:
            # shouldn't be able to get here, but we'll stay on the safe side
            option_parser.error("Only valid phred offsets are: %s" %
                                ' '.join(phred_to_ascii_fs.keys()))
    else:
        # let split_libraries_fastq.process_fastq_single_end_read_file
        # figure it out...
        phred_to_ascii_f = None

    if opts.last_bad_quality_char is not None:
        option_parser.error('--last_bad_quality_char is no longer supported. '
                            'Use -q instead (see option help text by passing -h)')

    if not (0 <= min_per_read_length_fraction <= 1):
        option_parser.error('--min_per_read_length_fraction must be between '
                            '0 and 1 (inclusive). You passed %1.5f' %
                            min_per_read_length_fraction)

    try:
        barcode_correction_fn = BARCODE_DECODER_LOOKUP[barcode_type]
    except KeyError:
        barcode_correction_fn = None

    if len(mapping_fps) == 1 and len(sequence_read_fps) > 1:
        mapping_fps = mapping_fps * len(sequence_read_fps)

    if len(set([len(sequence_read_fps), len(barcode_read_fps),
                len(mapping_fps)])) > 1:
        option_parser.error("Same number of sequence, barcode, and mapping "
                            "files must be provided.")

    output_dir = opts.output_dir
    create_dir(output_dir)

    output_fp_temp = '%s/seqs.fna.incomplete' % output_dir
    output_fp = '%s/seqs.fna' % output_dir
    output_f = open(output_fp_temp, 'w')
    qual_fp_temp = '%s/qual.fna.incomplete' % output_dir
    qual_fp = '%s/seqs.qual' % output_dir
    output_fastq_fp_temp = '%s/seqs.fastq.incomplete' % output_dir
    output_fastq_fp = '%s/seqs.fastq' % output_dir

    if store_qual_scores:
        qual_f = open(qual_fp_temp, 'w')

        # define a qual writer whether we're storing qual strings or not so
        # we don't have to check every time through the for loop below
        def qual_writer(h, q):
            qual_f.write('>%s\n%s\n' % (h, q))
    else:
        def qual_writer(h, q):
            pass

    if store_demultiplexed_fastq:
        output_fastq_f = open(output_fastq_fp_temp, 'w')

        # define a fastq writer whether we're storing qual strings or not so
        # we don't have to check every time through the for loop below
        def fastq_writer(h, s, q):
            output_fastq_f.write('@%s\n%s\n+\n%s\n' % (h, s, q))
    else:
        def fastq_writer(h, s, q):
            pass

    log_fp = '%s/split_library_log.txt' % output_dir
    log_f = open(log_fp, 'w')
    histogram_fp = '%s/histograms.txt' % output_dir
    histogram_f = open(histogram_fp, 'w')

    for sequence_read_fp, barcode_read_fp, mapping_fp in\
            zip(sequence_read_fps, barcode_read_fps, mapping_fps):
        mapping_f = open(mapping_fp, 'U')
        h, i, barcode_to_sample_id, warnings, errors, p, a =\
            check_map(mapping_f,
                      disable_primer_check=True,
                      has_barcodes=barcode_read_fp is not None)

        if rev_comp_mapping_barcodes:
            barcode_to_sample_id = \
                dict([(DNA.rc(k), v)
                      for k, v in barcode_to_sample_id.items()])

        if barcode_type == 'golay_12':
            invalid_golay_barcodes = \
                get_invalid_golay_barcodes(barcode_to_sample_id.keys())
            if len(invalid_golay_barcodes) > 0:
                option_parser.error(
                    "Some or all barcodes are not valid golay codes. "
                    "Do they need to be reverse complemented? If these are not "
                    "golay barcodes pass --barcode_type 12 to disable barcode "
                    "error correction, or pass --barcode_type # if the barcodes "
                    "are not 12 base pairs, where # is the size of the "
                    "barcodes. Invalid codes:\n\t%s" %
                    ' '.join(invalid_golay_barcodes))

        log_f.write("Input file paths\n")
        log_f.write('Mapping filepath: %s (md5: %s)\n' %
                    (mapping_fp, safe_md5(open(mapping_fp)).hexdigest()))
        log_f.write('Sequence read filepath: %s (md5: %s)\n' %
                    (sequence_read_fp,
                     str(safe_md5(open(sequence_read_fp)).hexdigest())))

        if sequence_read_fp.endswith('.gz'):
            sequence_read_f = gzip_open(sequence_read_fp)
        else:
            sequence_read_f = open(sequence_read_fp, 'U')

        seq_id = start_seq_id

        if barcode_read_fp is not None:
            log_f.write('Barcode read filepath: %s (md5: %s)\n\n' %
                        (barcode_read_fp,
                         safe_md5(open(barcode_read_fp)).hexdigest()))

            if barcode_read_fp.endswith('.gz'):
                barcode_read_f = gzip_open(barcode_read_fp)
            else:
                barcode_read_f = open(barcode_read_fp, 'U')

            seq_generator = process_fastq_single_end_read_file(
                sequence_read_f,
                barcode_read_f,
                barcode_to_sample_id,
                store_unassigned=retain_unassigned_reads,
                max_bad_run_length=max_bad_run_length,
                phred_quality_threshold=phred_quality_threshold,
                min_per_read_length_fraction=min_per_read_length_fraction,
                rev_comp=rev_comp,
                rev_comp_barcode=rev_comp_barcode,
                seq_max_N=seq_max_N,
                start_seq_id=start_seq_id,
                filter_bad_illumina_qual_digit=filter_bad_illumina_qual_digit,
                log_f=log_f,
                histogram_f=histogram_f,
                barcode_correction_fn=barcode_correction_fn,
                max_barcode_errors=max_barcode_errors,
                phred_to_ascii_f=phred_to_ascii_f)
        else:
            seq_generator = process_fastq_single_end_read_file_no_barcode(
                sequence_read_f,
                sample_id,
                store_unassigned=retain_unassigned_reads,
                max_bad_run_length=max_bad_run_length,
                phred_quality_threshold=phred_quality_threshold,
                min_per_read_length_fraction=min_per_read_length_fraction,
                rev_comp=rev_comp,
                seq_max_N=seq_max_N,
                start_seq_id=start_seq_id,
                filter_bad_illumina_qual_digit=filter_bad_illumina_qual_digit,
                log_f=log_f,
                histogram_f=histogram_f,
                phred_to_ascii_f=phred_to_ascii_f)

        for fasta_header, sequence, quality, seq_id in seq_generator:
            output_f.write('>%s\n%s\n' % (fasta_header, sequence))
            qual_writer(fasta_header, quality)
            fastq_writer(fasta_header, sequence, quality)

        start_seq_id = seq_id + 1
        log_f.write('\n---\n\n')

    output_f.close()
    rename(output_fp_temp, output_fp)

    # process the optional output files, as necessary
    if store_qual_scores:
        qual_f.close()
        rename(qual_fp_temp, qual_fp)
    if store_demultiplexed_fastq:
        output_fastq_f.close()
        rename(output_fastq_fp_temp, output_fastq_fp)
def log_input_md5s(logger, fps):
    logger.write("Input file md5 sums:\n")
    for fp in fps:
        if fp is not None:
            logger.write("%s: %s\n" % (fp, safe_md5(open(fp)).hexdigest()))
    logger.write("\n")
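# Example usage for log_input_md5s (a sketch; the log path and input
# filepaths are hypothetical):
#
#   log_f = open('preprocess_log.txt', 'w')
#   log_input_md5s(log_f, ['run1_reads.fastq', 'run1_barcodes.fastq', None])
#   log_f.close()
#
# None entries are skipped, so optional inputs (e.g. a run with no barcode
# file) can be passed straight through.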
def submit_fasta_and_split_lib(data_access, fasta_files, metadata_study_id,
                               input_dir):
    """
    FASTA Loading: This function takes the fasta filenames and, using that
    path, determines the location of the split-library and picked-otu files.
    Once file locations have been determined, it moves the files to the DB
    machine and loads the files into the DB.
    """
    # get DB connection and cursor
    con = data_access.getSFFDatabaseConnection()
    cur = con.cursor()

    # check if study exists
    study_id_exists = data_access.checkIfStudyIdExists(metadata_study_id)
    print "Study ID exists: " + str(study_id_exists)

    # get temp filename
    alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    alphabet += alphabet.lower()
    alphabet += "0123456789"
    random_fname = ''.join([choice(alphabet) for i in range(10)])
    tmp_filename = '_' + random_fname + '_' + strftime("%Y_%m_%d_%H_%M_%S")

    # get fasta filenames
    fasta_filenames = fasta_files.split(',')
    seq_run_id = 0
    analysis_id = 0
    split_lib_input_checksums = []

    ### by disabling constraints you can speed up loading as well, but
    ### shouldn't be necessary
    #valid = data_access.disableTableConstraints()
    #print "Disabled table constraints"

    # split the fasta filenames and determine filepaths
    for fasta_fname in fasta_filenames:
        input_fname, input_ext = splitext(split(fasta_fname)[-1])
        input_basename, input_ext = splitext(fasta_fname)

        # define analysis notes
        analysis_notes = split(input_basename)[0]

        # get md5 for raw fasta files
        fasta_md5 = safe_md5(open(fasta_fname)).hexdigest()
        print 'MD5 is: %s' % str(fasta_md5)

        # create an analysis row in analysis table
        if analysis_id == 0:
            analysis_id = data_access.createAnalysis(metadata_study_id)

        # check if fasta info already loaded
        fasta_exists = data_access.checkIfSFFExists(fasta_md5)
        print 'fasta in database? %s' % str(fasta_exists)

        # if fasta info not loaded, then insert into DB
        if not fasta_exists:
            if seq_run_id == 0:
                seq_run_id = data_access.createSequencingRun(True, 'FASTA',
                                                             None, seq_run_id)

            # get sequence count
            count_seqs_cmd = "grep '^>' %s | wc -l" % (fasta_fname)
            o, e, r = qiime_system_call(count_seqs_cmd)
            seq_counts = o.strip()

            # add fasta info
            valid = data_access.addSFFFileInfo(True, input_fname,
                                               seq_counts,
                                               None, None, None, None,
                                               None, None,
                                               fasta_md5, seq_run_id)
        else:
            seq_run_id = data_access.getSeqRunIDUsingMD5(fasta_md5)

    print 'sequence_run_id is: %s' % str(seq_run_id)

    # get md5 sum for input to split-libraries
    split_lib_input_md5sum = safe_md5(MD5Wrap(fasta_filenames)).hexdigest()
    print split_lib_input_md5sum
    print 'Finished loading the processed FASTA data!'
    print 'Run ID: %s' % seq_run_id
    print 'Analysis ID: %s' % analysis_id

    # update analysis table with seq_run_id
    valid = data_access.updateAnalysisWithSeqRunID(True, analysis_id,
                                                   seq_run_id)
    if not valid:
        raise ValueError, 'Error: Unable to append SEQ_RUN_ID into ANALYSIS table!'

    return analysis_id, input_dir, seq_run_id, split_lib_input_md5sum
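# Hedged usage sketch for submit_fasta_and_split_lib: data_access is
# whatever object exposes the DB API used above; the study id and paths
# here are hypothetical.
#
#   analysis_id, input_dir, seq_run_id, md5 = submit_fasta_and_split_lib(
#       data_access, 'run1.fna,run2.fna', 1234, '/path/to/processed')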
def submit_illumina_and_split_lib(data_access, fastq_files, metadata_study_id,
                                  input_dir):
    """
    Illumina Loading: This function takes the fastq filenames and, using that
    path, determines the location of the split-library and picked-otu files.
    Once file locations have been determined, it moves the files to the DB
    machine and loads the files into the DB.
    """
    # get DB connection and cursor
    con = data_access.getSFFDatabaseConnection()
    cur = con.cursor()

    ### this may help in speeding up loading but shouldn't be necessary
    #print 'Rebuilding PK_SPLIT_LIBRARY_READ_MAP...'
    #cur.execute('alter index "SFF"."PK_SPLIT_LIBRARY_READ_MAP" rebuild ')
    #cur = con.cursor()

    # check if study exists
    study_id_exists = data_access.checkIfStudyIdExists(metadata_study_id)
    print "Study ID exists: " + str(study_id_exists)

    # get temp filename
    alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    alphabet += alphabet.lower()
    alphabet += "0123456789"
    random_fname = ''.join([choice(alphabet) for i in range(10)])
    tmp_filename = '_' + random_fname + '_' + strftime("%Y_%m_%d_%H_%M_%S")

    # get fastq filenames
    fastq_filenames = fastq_files.split(',')
    seq_run_id = 0
    analysis_id = 0
    split_lib_input_checksums = []

    ### by disabling constraints you can speed up loading as well, but
    ### shouldn't be necessary
    #valid = data_access.disableTableConstraints()
    #print "Disabled table constraints"

    # split the fastq filenames and determine filepaths
    for fastq_fname in fastq_filenames:
        input_fname, input_ext = splitext(split(fastq_fname)[-1])
        input_basename, input_ext = splitext(fastq_fname)

        # get analysis notes
        analysis_notes = split(input_basename)[0]

        # get md5 for raw fastq files
        fastq_md5 = safe_md5(open(fastq_fname)).hexdigest()
        print 'MD5 is: %s' % str(fastq_md5)

        # create an analysis row in analysis table
        if analysis_id == 0:
            analysis_id = data_access.createAnalysis(metadata_study_id)

        # check if fastq info already loaded
        fastq_exists = data_access.checkIfSFFExists(fastq_md5)
        print 'fastq in database? %s' % str(fastq_exists)

        # if fastq info not loaded, then insert into DB
        if not fastq_exists:
            if seq_run_id == 0:
                seq_run_id = data_access.createSequencingRun(True, 'ILLUMINA',
                                                             None, seq_run_id)

            # get sequence count
            if fastq_fname.endswith('.gz'):
                count_seqs_cmd = "zcat %s | grep ^@ | wc -l" % (fastq_fname)
            else:
                count_seqs_cmd = "grep ^@ %s | wc -l" % (fastq_fname)
            o, e, r = qiime_system_call(count_seqs_cmd)
            seq_counts = o.strip()

            # get header length and # of flows (length of seq)
            fastq_fname_open = open(fastq_fname)
            first_seq_fastq = get_top_fastq_two_lines(fastq_fname_open)
            header_length = len(first_seq_fastq[1])
            num_flows = len(first_seq_fastq[1])

            # insert fastq info
            valid = data_access.addSFFFileInfo(True, input_fname,
                                               seq_counts,
                                               header_length,
                                               None,
                                               num_flows,
                                               None, None, None,
                                               fastq_md5, seq_run_id)
        else:
            seq_run_id = data_access.getSeqRunIDUsingMD5(fastq_md5)

    print 'sequence_run_id is: %s' % str(seq_run_id)

    # get md5 for split-library input
    split_lib_input_md5sum = safe_md5(MD5Wrap(fastq_filenames)).hexdigest()
    print split_lib_input_md5sum
    print 'Finished loading the processed ILLUMINA data!'
    print 'Run ID: %s' % seq_run_id
    print 'Analysis ID: %s' % analysis_id

    # update analysis table with seq_run_id
    valid = data_access.updateAnalysisWithSeqRunID(True, analysis_id,
                                                   seq_run_id)
    if not valid:
        raise ValueError, 'Error: Unable to append SEQ_RUN_ID into ANALYSIS table!'

    return analysis_id, input_dir, seq_run_id, split_lib_input_md5sum
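# Note on the sequence count above: counting lines that start with '@' can
# overcount reads, because FASTQ quality strings may themselves begin with
# '@'. A safer, self-contained sketch that assumes well-formed 4-line
# records (count_fastq_records is a hypothetical helper, not part of this
# module):
import gzip

def count_fastq_records(fp):
    # gzip-aware open, mirroring the .gz check used above
    f = gzip.open(fp) if fp.endswith('.gz') else open(fp)
    n_lines = sum(1 for _ in f)
    f.close()
    return n_lines / 4  # each record is exactly four lines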
def submit_sff_and_split_lib(data_access, fasta_files, metadata_study_id):
    """
    SFF Loading: This function takes the fasta filenames and, using that
    path, determines the location of the split-library and picked-otu files.
    Once file locations have been determined, it moves the files to the DB
    machine and loads the files into the DB.
    """
    # get database connection and cursor
    con = data_access.getSFFDatabaseConnection()
    cur = con.cursor()

    ### this may help in speeding up loading but shouldn't be necessary
    #print 'Rebuilding PK_SPLIT_LIBRARY_READ_MAP...'
    #cur.execute('alter index "SFF"."PK_SPLIT_LIBRARY_READ_MAP" rebuild ')
    #cur = con.cursor()

    # check if study exists
    study_id_exists = data_access.checkIfStudyIdExists(metadata_study_id)
    print "Study ID exists: " + str(study_id_exists)

    # create a temp filename
    alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    alphabet += alphabet.lower()
    alphabet += "0123456789"
    random_fname = ''.join([choice(alphabet) for i in range(10)])
    tmp_filename = '_' + random_fname + '_' + strftime("%Y_%m_%d_%H_%M_%S")

    # get a list of filenames
    fasta_filenames = fasta_files.split(',')
    seq_run_id = 0
    analysis_id = 0
    split_lib_input_checksums = []
    fasta_qual_files = []

    ### by disabling constraints you can speed up loading as well, but
    ### shouldn't be necessary
    #valid = data_access.disableTableConstraints()
    #print "Disabled table constraints"

    # split the fasta filenames and determine filepaths
    for fasta_fname in fasta_filenames:
        input_fname, input_ext = splitext(split(fasta_fname)[-1])
        input_basename, input_ext = splitext(fasta_fname)
        input_dir = split(input_basename)[:-1][0]

        # get the sff basename: strip a trailing run-number suffix (e.g.
        # '01'), then a trailing '0#_FLX' suffix, if present
        if re.search(r'0\d$', input_fname) is None:
            sff_basename = input_fname
        else:
            sff_basename = input_fname[:-2]
        if re.search(r'0\d_FLX$', sff_basename) is not None:
            sff_basename = sff_basename[:-6]
        print 'sff_basename: %s' % sff_basename

        # get analysis notes
        analysis_notes = split(input_basename)[0]

        # using the fasta basename, define qual and flow files
        qual_fname = join(input_basename + '.qual')
        flow_fname = join(input_basename + '.txt')
        fasta_qual_files.append(fasta_fname)
        fasta_qual_files.append(qual_fname)

        # Run the Oracle process_sff_files load package
        ## Get the location and name of the SFF file, get its MD5. .SFF is
        ## one directory up from the other files
        rev = dirname(fasta_fname)[::-1]

        # check for sffs in the processed folder...only occurs for Ti
        # processing
        sffs_in_processed_folder = glob(join(input_dir, '*_FLX.sff'))
        if len(sffs_in_processed_folder) == 0:
            sff_file_dir = split(input_dir)[0]
        else:
            sff_file_dir = input_dir

        # get SFF file
        sff_file = join(sff_file_dir, input_fname + '.sff')

        # get md5 of SFF
        sff_md5 = safe_md5(open(sff_file)).hexdigest()
        print 'MD5 is: %s' % str(sff_md5)

        # create an analysis
        if analysis_id == 0:
            analysis_id = data_access.createAnalysis(metadata_study_id)

        # check if SFF info was already loaded into DB
        sff_exists = data_access.checkIfSFFExists(sff_md5)
        print 'sff in database? %s' % str(sff_exists)

        if not sff_exists:
            print 'flow_fname: %s' % flow_fname
            sff_header = get_header_info(open(flow_fname))

            # get instrument info
            if sff_header['# of Flows'] == '400':
                instrument_code = 'GS FLX'
            elif sff_header['# of Flows'] == '168':
                instrument_code = 'GS2-'
            elif sff_header['# of Flows'] == '800':
                instrument_code = 'Titanium'
            else:
                instrument_code = 'UNKNOWN'
            print 'Instrument Code: %s' % instrument_code

            # load SFF info (the two original branches issued the same
            # addSFFFileInfo call, so it is hoisted out of the if below)
            if seq_run_id == 0:
                seq_run_id = data_access.createSequencingRun(
                    True, instrument_code, sff_header['Version'], seq_run_id)
            valid = data_access.addSFFFileInfo(True, sff_basename,
                                               sff_header['# of Reads'],
                                               sff_header['Header Length'],
                                               sff_header['Key Length'],
                                               sff_header['# of Flows'],
                                               sff_header['Flowgram Code'],
                                               sff_header['Flow Chars'],
                                               sff_header['Key Sequence'],
                                               sff_md5, seq_run_id)
        else:
            seq_run_id = data_access.getSeqRunIDUsingMD5(sff_md5)

    print 'sequence_run_id is: %s' % str(seq_run_id)

    # get md5 of fna/qual files
    print fasta_qual_files
    split_lib_input_md5sum = safe_md5(MD5Wrap(fasta_qual_files)).hexdigest()
    print split_lib_input_md5sum
    print 'Finished loading the processed SFF data!'
    print 'Run ID: %s' % seq_run_id
    print 'Analysis ID: %s' % analysis_id

    # add seq_run_id to Analysis table
    valid = data_access.updateAnalysisWithSeqRunID(True, analysis_id,
                                                   seq_run_id)
    if not valid:
        raise ValueError, 'Error: Unable to append SEQ_RUN_ID into ANALYSIS table!'

    return analysis_id, input_dir, seq_run_id, split_lib_input_md5sum
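# The instrument lookup in submit_sff_and_split_lib is a flow-count switch;
# the same mapping expressed as a dict (a sketch, not part of the original
# module):
FLOWS_TO_INSTRUMENT_CODE = {'400': 'GS FLX', '168': 'GS2-', '800': 'Titanium'}
# equivalent to the if/elif chain above:
# instrument_code = FLOWS_TO_INSTRUMENT_CODE.get(sff_header['# of Flows'],
#                                                'UNKNOWN')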
def load_otu_mapping(data_access, input_dir, analysis_id):
    """ Load the OTU table into the DB """

    # For OTU Tables
    # read in the workflow log file and determine timestamp and svn version
    # of QIIME used for the analysis
    pOTUs_threshold = '97'
    ref_set_threshold = '97'
    pOTUs_method = 'UCLUST_REF'
    reference_set_name = 'GREENGENES_REFERENCE'
    otus_log_str = open(join(input_dir, 'gg_97_otus', 'log.txt')).read()
    log_str = open(join(input_dir, 'gg_97_otus', 'log.txt')).readlines()

    # from the workflow log file get the pick-otus cmd
    for substr in log_str:
        if 'parallel_pick_otus_uclust_ref.py' in substr:
            pick_otus_cmd = substr
        elif 'pick_otus.py' in substr:
            pick_otus_cmd = substr

    # define values for otu_picking_run table
    otu_run_set_id = 0
    # temporarily hard-coded; a script will eventually determine this value
    svn_version = '1418'
    run_date = datetime.now().strftime("%d/%m/%Y/%H/%M/%S")
    pick_otus_map = join(input_dir, 'gg_97_otus', 'exact_uclust_ref_otus.txt')

    # get md5 for split-lib seq file
    split_lib_seqs = join(input_dir, 'split_libraries', 'seqs.fna')
    split_lib_seqs_md5 = safe_md5(open(split_lib_seqs)).hexdigest()

    # insert the otu-picking log information in the DB
    print 'calling loadAllOTUInfo with analysis_id %s' % str(analysis_id)
    valid, new_otu_run_set_id, otu_picking_run_id = data_access.loadAllOTUInfo(
        True, otu_run_set_id, run_date, pOTUs_method, pOTUs_threshold,
        svn_version, pick_otus_cmd, otus_log_str, split_lib_seqs_md5,
        reference_set_name, ref_set_threshold, analysis_id)
    if not valid:
        raise ValueError, 'Error: Unable to load OTU run data into database!'
    else:
        print "Finished registering OTU run!"

    # define OTU mapping
    otu_map = []
    otu_to_seqid = fields_to_dict(open(pick_otus_map, 'U'))
    for otu in otu_to_seqid:
        for sample in otu_to_seqid[otu]:
            otu_map.append('%s\t%s\t%s\t%s' % (otu, sample,
                                               new_otu_run_set_id,
                                               reference_set_name))
    print 'Finished setting otu_map.'

    # define oracle data types
    types = ['s', 's', 'i', 's']
    con = data_access.getSFFDatabaseConnection()
    cur = con.cursor()
    #print 'Starting PK_SPLIT_LIBRARY_READ_MAP index rebuild...'
    #cur.execute('alter index "SFF"."PK_SPLIT_LIBRARY_READ_MAP" rebuild ')
    #print 'Finished rebuilding index PK_SPLIT_LIBRARY_READ_MAP.'
    cur = con.cursor()
    set_count = 1

    # prepare the OTU table for loading
    print 'Loading OTU Table into the database!'
    pick_otus_table = join(input_dir, 'gg_97_otus',
                           'exact_uclust_ref_otu_table.txt')
    otu_table_lines = open(pick_otus_table).readlines()
    sample_ids, otu_ids, otu_table, lineages = \
        parse_classic_otu_table(otu_table_lines)

    # convert OTU table to tab-delimited list
    otu_table_load = []
    for i, otu in enumerate(otu_ids):
        for j, sample in enumerate(sample_ids):
            if otu_table[i][j] > 0:
                otu_table_load.append("%s\t%s\t%s\t%s" %
                                      (otu, sample, new_otu_run_set_id,
                                       otu_table[i][j]))

    # get DB connection
    con = data_access.getSFFDatabaseConnection()
    cur = con.cursor()

    # load otu table into DB
    data_types = ['s', 's', 'i', 'f']
    set_count = 0
    for input_set in input_set_generator(otu_table_load, cur, data_types,
                                         buffer_size=1000):
        valid = data_access.loadOTUTable(True, input_set)
        if not valid:
            raise ValueError, 'Error: Unable to load OTU table!'
        print "loading OTU Table: %s" % set_count
        set_count += 1

    print 'Successfully loaded the OTU Table into the database!'
    print 'End of function'
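# Hedged usage sketch for load_otu_mapping: input_dir must contain the
# gg_97_otus/ and split_libraries/ outputs referenced above; the values are
# hypothetical.
#
#   load_otu_mapping(data_access, '/path/to/processed', analysis_id)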
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    sequence_read_fps = opts.sequence_read_fps
    barcode_read_fps = opts.barcode_read_fps
    sample_ids = None
    if opts.sample_ids is not None:
        sample_ids = opts.sample_ids.split(',')
    mapping_fps = opts.mapping_fps
    phred_quality_threshold = opts.phred_quality_threshold
    retain_unassigned_reads = opts.retain_unassigned_reads
    min_per_read_length_fraction = opts.min_per_read_length_fraction
    max_bad_run_length = opts.max_bad_run_length
    rev_comp = opts.rev_comp
    rev_comp_barcode = opts.rev_comp_barcode
    rev_comp_mapping_barcodes = opts.rev_comp_mapping_barcodes
    seq_max_N = opts.sequence_max_n
    start_seq_id = opts.start_seq_id
    # NEED TO FIX THIS FUNCTIONALITY - CURRENTLY READING THE WRONG FIELD
    # opts.filter_bad_illumina_qual_digit
    filter_bad_illumina_qual_digit = False
    store_qual_scores = opts.store_qual_scores
    store_demultiplexed_fastq = opts.store_demultiplexed_fastq
    barcode_type = opts.barcode_type
    max_barcode_errors = opts.max_barcode_errors

    # if this is not a demultiplexed run,
    if barcode_type == 'not-barcoded':
        if sample_ids is None:
            option_parser.error(
                "If not providing barcode reads (because "
                "your data is not multiplexed), must provide --sample_ids.")
        if len(sample_ids) != len(sequence_read_fps):
            option_parser.error(
                "If providing --sample_ids (because "
                "your data is not multiplexed), must provide the same number "
                "of sample ids as sequence read filepaths.")
        barcode_read_fps = [None] * len(sequence_read_fps)
        mapping_fps = [None] * len(sequence_read_fps)
    elif barcode_read_fps is None:
        option_parser.error("Must provide --barcode_read_fps if "
                            "--barcode_type is not 'not-barcoded'")
    elif mapping_fps is None:
        option_parser.error("Must provide --mapping_fps if "
                            "--barcode_type is not 'not-barcoded'")

    phred_offset = opts.phred_offset
    if phred_offset is not None:
        try:
            phred_to_ascii_f = phred_to_ascii_fs[phred_offset]
        except KeyError:
            # shouldn't be able to get here, but we'll stay on the safe side
            option_parser.error("Only valid phred offsets are: %s" %
                                ' '.join(phred_to_ascii_fs.keys()))
    else:
        # let split_libraries_fastq.process_fastq_single_end_read_file
        # figure it out...
        phred_to_ascii_f = None

    if opts.last_bad_quality_char is not None:
        option_parser.error(
            '--last_bad_quality_char is no longer supported. '
            'Use -q instead (see option help text by passing -h)')

    if not (0 <= min_per_read_length_fraction <= 1):
        option_parser.error('--min_per_read_length_fraction must be between '
                            '0 and 1 (inclusive). You passed %1.5f' %
                            min_per_read_length_fraction)

    barcode_correction_fn = BARCODE_DECODER_LOOKUP.get(barcode_type, None)

    if len(mapping_fps) == 1 and len(sequence_read_fps) > 1:
        mapping_fps = mapping_fps * len(sequence_read_fps)

    if len(set([len(sequence_read_fps), len(barcode_read_fps),
                len(mapping_fps)])) > 1:
        option_parser.error("Same number of sequence, barcode, and mapping "
                            "files must be provided.")

    output_dir = opts.output_dir
    create_dir(output_dir)

    output_fp_temp = '%s/seqs.fna.incomplete' % output_dir
    output_fp = '%s/seqs.fna' % output_dir
    output_f = open(output_fp_temp, 'w')
    qual_fp_temp = '%s/qual.fna.incomplete' % output_dir
    qual_fp = '%s/seqs.qual' % output_dir
    output_fastq_fp_temp = '%s/seqs.fastq.incomplete' % output_dir
    output_fastq_fp = '%s/seqs.fastq' % output_dir

    if store_qual_scores:
        qual_f = open(qual_fp_temp, 'w')

        # define a qual writer whether we're storing qual strings or not so
        # we don't have to check every time through the for loop below
        def qual_writer(h, q):
            qual_f.write('>%s\n%s\n' % (h, q))
    else:
        def qual_writer(h, q):
            pass

    if store_demultiplexed_fastq:
        output_fastq_f = open(output_fastq_fp_temp, 'w')

        # define a fastq writer whether we're storing qual strings or not so
        # we don't have to check every time through the for loop below
        def fastq_writer(h, s, q):
            output_fastq_f.write('@%s\n%s\n+\n%s\n' % (h, s, q))
    else:
        def fastq_writer(h, s, q):
            pass

    log_fp = '%s/split_library_log.txt' % output_dir
    log_f = open(log_fp, 'w')
    histogram_fp = '%s/histograms.txt' % output_dir
    histogram_f = open(histogram_fp, 'w')

    for i in range(len(sequence_read_fps)):
        sequence_read_fp = sequence_read_fps[i]
        barcode_read_fp = barcode_read_fps[i]
        mapping_fp = mapping_fps[i]

        if mapping_fp is not None:
            mapping_f = open(mapping_fp, 'U')
            _, _, barcode_to_sample_id, _, _, _, _ = check_map(
                mapping_f,
                disable_primer_check=True,
                has_barcodes=barcode_read_fp is not None)
        else:
            mapping_f = None
            barcode_to_sample_id = {}

        if rev_comp_mapping_barcodes:
            barcode_to_sample_id = {
                DNA.rc(k): v for k, v in barcode_to_sample_id.iteritems()}

        if barcode_type == 'golay_12':
            invalid_golay_barcodes = get_invalid_golay_barcodes(
                barcode_to_sample_id.keys())
            if len(invalid_golay_barcodes) > 0:
                option_parser.error(
                    "Some or all barcodes are not valid golay "
                    "codes. Do they need to be reverse complemented? If these "
                    "are not golay barcodes pass --barcode_type 12 to disable "
                    "barcode error correction, or pass --barcode_type # if "
                    "the barcodes are not 12 base pairs, where # is the size "
                    "of the barcodes. Invalid codes:\n\t%s" %
                    ' '.join(invalid_golay_barcodes))

        log_f.write("Input file paths\n")
        if mapping_fp is not None:
            log_f.write('Mapping filepath: %s (md5: %s)\n' %
                        (mapping_fp, safe_md5(open(mapping_fp)).hexdigest()))
        log_f.write('Sequence read filepath: %s (md5: %s)\n' %
                    (sequence_read_fp,
                     str(safe_md5(open(sequence_read_fp)).hexdigest())))

        if sequence_read_fp.endswith('.gz'):
            sequence_read_f = gzip_open(sequence_read_fp)
        else:
            sequence_read_f = open(sequence_read_fp, 'U')

        seq_id = start_seq_id

        if barcode_read_fp is not None:
            log_f.write('Barcode read filepath: %s (md5: %s)\n\n' %
                        (barcode_read_fp,
                         safe_md5(open(barcode_read_fp)).hexdigest()))

            if barcode_read_fp.endswith('.gz'):
                barcode_read_f = gzip_open(barcode_read_fp)
            else:
                barcode_read_f = open(barcode_read_fp, 'U')

            seq_generator = process_fastq_single_end_read_file(
                sequence_read_f,
                barcode_read_f,
                barcode_to_sample_id,
                store_unassigned=retain_unassigned_reads,
                max_bad_run_length=max_bad_run_length,
                phred_quality_threshold=phred_quality_threshold,
                min_per_read_length_fraction=min_per_read_length_fraction,
                rev_comp=rev_comp,
                rev_comp_barcode=rev_comp_barcode,
                seq_max_N=seq_max_N,
                start_seq_id=start_seq_id,
                filter_bad_illumina_qual_digit=filter_bad_illumina_qual_digit,
                log_f=log_f,
                histogram_f=histogram_f,
                barcode_correction_fn=barcode_correction_fn,
                max_barcode_errors=max_barcode_errors,
                phred_to_ascii_f=phred_to_ascii_f)
        else:
            seq_generator = process_fastq_single_end_read_file_no_barcode(
                sequence_read_f,
                sample_ids[i],
                store_unassigned=retain_unassigned_reads,
                max_bad_run_length=max_bad_run_length,
                phred_quality_threshold=phred_quality_threshold,
                min_per_read_length_fraction=min_per_read_length_fraction,
                rev_comp=rev_comp,
                seq_max_N=seq_max_N,
                start_seq_id=start_seq_id,
                filter_bad_illumina_qual_digit=filter_bad_illumina_qual_digit,
                log_f=log_f,
                histogram_f=histogram_f,
                phred_to_ascii_f=phred_to_ascii_f)

        for fasta_header, sequence, quality, seq_id in seq_generator:
            output_f.write('>%s\n%s\n' % (fasta_header, sequence))
            qual_writer(fasta_header, quality)
            fastq_writer(fasta_header, sequence, quality)

        start_seq_id = seq_id + 1
        log_f.write('\n---\n\n')

    output_f.close()
    rename(output_fp_temp, output_fp)

    # process the optional output files, as necessary
    if store_qual_scores:
        qual_f.close()
        rename(qual_fp_temp, qual_fp)
    if store_demultiplexed_fastq:
        output_fastq_f.close()
        rename(output_fastq_fp_temp, output_fastq_fp)
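# These main() variants are QIIME script entry points; in the script files
# they are invoked via the usual guard (shown as a sketch rather than live
# code, since several variants of main() appear in this section):
#
#   if __name__ == "__main__":
#       main()
#
# Example invocation (file names are hypothetical; flags follow the
# split_libraries_fastq.py option names used above):
#
#   split_libraries_fastq.py -i lane1_read1.fastq.gz -b lane1_barcode.fastq.gz \
#       -m map.txt -o slout/ --barcode_type golay_12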
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    sequence_read_fps = opts.sequence_read_fps
    barcode_read_fps = opts.barcode_read_fps
    mapping_fps = opts.mapping_fps
    retain_unassigned_reads = opts.retain_unassigned_reads
    max_bad_run_length = opts.max_bad_run_length
    last_bad_quality_char = opts.last_bad_quality_char
    min_per_read_length = opts.min_per_read_length
    rev_comp = opts.rev_comp
    rev_comp_barcode = opts.rev_comp_barcode
    seq_max_N = opts.sequence_max_n
    start_seq_id = opts.start_seq_id
    # NEED TO FIX THIS FUNCTIONALITY - CURRENTLY READING THE WRONG FIELD
    # opts.filter_bad_illumina_qual_digit
    filter_bad_illumina_qual_digit = False
    barcode_type = opts.barcode_type
    max_barcode_errors = opts.max_barcode_errors

    try:
        barcode_correction_fn = BARCODE_DECODER_LOOKUP[barcode_type]
    except KeyError:
        barcode_correction_fn = None

    if len(sequence_read_fps) != len(barcode_read_fps):
        option_parser.error("Same number of sequence and barcode files must "
                            "be provided.")

    output_dir = opts.output_dir
    create_dir(output_dir)

    output_fp_temp = '%s/seqs.fna.incomplete' % output_dir
    output_fp = '%s/seqs.fna' % output_dir
    output_f = open(output_fp_temp, 'w')

    log_fp = '%s/split_library_log.txt' % output_dir
    log_f = open(log_fp, 'w')
    histogram_fp = '%s/histograms.txt' % output_dir
    histogram_f = open(histogram_fp, 'w')

    for sequence_read_fp, barcode_read_fp, mapping_fp in\
            zip(sequence_read_fps, barcode_read_fps, mapping_fps):
        mapping_f = open(mapping_fp, 'U')
        h, i, barcode_to_sample_id, warnings, errors, p, a =\
            check_map(mapping_f, disable_primer_check=True)

        if barcode_type == 'golay_12':
            invalid_golay_barcodes = \
                get_invalid_golay_barcodes(barcode_to_sample_id.keys())
            if len(invalid_golay_barcodes) > 0:
                option_parser.error(
                    "Some or all barcodes are not valid golay codes. "
                    "Do they need to be reverse complemented? If these are not "
                    "golay barcodes pass --barcode_type 12 to disable barcode "
                    "error correction. Invalid codes:\n\t%s" %
                    ' '.join(invalid_golay_barcodes))

        log_f.write("Input file paths\n")
        log_f.write('Mapping filepath: %s (md5: %s)\n' %
                    (mapping_fp, safe_md5(open(mapping_fp)).hexdigest()))
        log_f.write('Sequence read filepath: %s (md5: %s)\n' %
                    (sequence_read_fp,
                     str(safe_md5(open(sequence_read_fp)).hexdigest())))
        log_f.write('Barcode read filepath: %s (md5: %s)\n\n' %
                    (barcode_read_fp,
                     safe_md5(open(barcode_read_fp)).hexdigest()))

        sequence_read_f = open(sequence_read_fp, 'U')
        barcode_read_f = open(barcode_read_fp, 'U')
        seq_id = start_seq_id

        for fasta_header, sequence, quality, seq_id in \
                process_fastq_single_end_read_file(
                    sequence_read_f,
                    barcode_read_f,
                    barcode_to_sample_id,
                    store_unassigned=retain_unassigned_reads,
                    max_bad_run_length=max_bad_run_length,
                    last_bad_quality_char=last_bad_quality_char,
                    min_per_read_length=min_per_read_length,
                    rev_comp=rev_comp,
                    rev_comp_barcode=rev_comp_barcode,
                    seq_max_N=seq_max_N,
                    start_seq_id=start_seq_id,
                    filter_bad_illumina_qual_digit=filter_bad_illumina_qual_digit,
                    log_f=log_f,
                    histogram_f=histogram_f,
                    barcode_correction_fn=barcode_correction_fn,
                    max_barcode_errors=max_barcode_errors):
            output_f.write('>%s\n%s\n' % (fasta_header, sequence))

        start_seq_id = seq_id + 1
        log_f.write('\n---\n\n')

    output_f.close()
    rename(output_fp_temp, output_fp)