def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # get cmd-line options
    fasta_fp = opts.input_fasta_fp
    qual_fp = opts.input_qual_fp
    output_dir = opts.output_dir

    # create output dir
    create_dir(output_dir)

    output_fps = {}

    # open sequence files
    sequences = MinimalFastaParser(open(fasta_fp, 'U'))
    qual_sequences = MinimalFastaParser(open(qual_fp, 'U'))

    # iterate over seqs, pulling the matching qual record for each
    for seq_name, seq in sequences:
        qual_seq_name, qual_seq = qual_sequences.next()

        # verify headers from seq and qual match
        if seq_name == qual_seq_name:
            # get the SampleID
            samp_id = '_'.join(seq_name.split()[0].split('_')[:-1])
            samp_filename = 'seqs_%s' % str(samp_id)

            # open a per-sample output file on first use
            if not output_fps.has_key(str(samp_filename)):
                output_fps[str(samp_filename)] = open(
                    join(output_dir, '%s.fastq' % str(samp_filename)), 'w')

            # write out the record in FASTQ format
            output_fps[str(samp_filename)].write(
                '@%s\n%s\n+\n%s\n' % (seq_name, seq, qual_seq))
        else:
            print seq_name

    # close the files
    for s_id in output_fps:
        output_fps[str(s_id)].close()
def generate_full_split_lib_fastq(study, study_input_dir, zip_fname,
                                  files_to_remove, output_dir):
    """ Generate the full split-library fastq file """

    # define sequence output files
    seq_fname = 'study_%s_split_library_seqs.fastq.gz' % str(study)
    fna_fname = 'study_%s_split_library_seqs.fna.gz' % str(study)
    output_seq_fp = join(output_dir, seq_fname)
    output_fna_fp = join(output_dir, fna_fname)

    # add to list of files to remove
    files_to_remove.append(output_seq_fp)
    files_to_remove.append(output_fna_fp)

    output_seqs = gzip.open(output_seq_fp, 'w')
    output_fna = gzip.open(output_fna_fp, 'w')
    iterator = 0

    # get a list of all files in study_dir
    processed_folders = listdir(study_input_dir)
    samples = {}
    biom_files = []
    for processed_folder in processed_folders:
        # only handle folders whose names start with the word "processed"
        if processed_folder.startswith('processed'):
            # define split-lib seq fp
            split_lib_seqs = join(study_input_dir, processed_folder,
                                  'split_libraries', 'seqs.fna')

            try:
                # for illumina
                split_lib_qual = join(study_input_dir, processed_folder,
                                      'split_libraries', 'seqs.qual')
                qual_sequences = MinimalFastaParser(open(split_lib_qual, 'U'))
            except IOError:
                # for 454
                split_lib_qual = join(study_input_dir, processed_folder,
                                      'split_libraries',
                                      'seqs_filtered.qual')
                qual_sequences = MinimalFastaParser(open(split_lib_qual, 'U'))

            # open split-lib seq fp
            seqs = MinimalFastaParser(open(split_lib_seqs, 'U'))

            # iterate over sequences
            for seq_name, seq in seqs:
                # renumber sequences, since names may collide across
                # multiple split-lib runs
                qual_seq_name, qual_seq = qual_sequences.next()
                if seq_name == qual_seq_name:
                    full_seq_name_list = seq_name.split()
                    seq_name_prefix = '_'.join(
                        full_seq_name_list[0].split('_')[:-1])

                    # get per-sample sequence counts
                    if seq_name_prefix in samples:
                        samples[seq_name_prefix] += 1
                    else:
                        samples[seq_name_prefix] = 1

                    # update the sequence name, but retain barcode info
                    updated_seq_name = seq_name_prefix + '_' + \
                        str(iterator) + ' ' + \
                        ' '.join(full_seq_name_list[1:])

                    # write the sequence out in FASTQ format
                    output_seqs.write('@%s\n%s\n+\n%s\n' %
                                      (str(updated_seq_name), str(seq),
                                       str(qual_seq)))

                    # write the sequence out in FASTA format
                    output_fna.write('>%s\n%s\n' % (str(updated_seq_name),
                                                    str(seq)))
                    iterator += 1
                else:
                    print seq_name

            # get list of biom files
            gg_biom_fp = join(study_input_dir, processed_folder,
                              'gg_97_otus',
                              'exact_uclust_ref_otu_table.biom')
            if exists(gg_biom_fp) and getsize(gg_biom_fp) > 0:
                biom_files.append(gg_biom_fp)

    output_seqs.close()
    output_fna.close()

    # zip the full split-library sequence file
    #cmd_call='cd %s; tar rzvf %s %s' % (study_input_dir,zip_fname,seq_fname)
    #cmd_call='cd %s; tar rzvf %s %s' % (study_input_dir,zip_fname,fna_fname)
    #system(cmd_call)

    return files_to_remove, biom_files, samples
def assign_dna_reads_to_protein_database(query_fasta_fp, database_fasta_fp,
                                         output_fp, temp_dir="/tmp",
                                         params=None):
    """Assign DNA reads to a database fasta of protein sequences.

    Wraps assign_reads_to_database, setting database and query types. All
    parameters are set to default unless params is passed. A temporary
    file must be written containing the translated sequences from the
    input query fasta file because BLAT cannot do this automatically.

    query_fasta_fp: absolute path to the query fasta file containing DNA
                    sequences.
    database_fasta_fp: absolute path to the database fasta file containing
                       protein sequences.
    output_fp: absolute path where the output file will be generated.
    temp_dir: optional. Change the location where the translated sequences
              will be written before being used as the query. Defaults to
              /tmp.
    params: optional. dict containing parameter settings to be used
            instead of default values. Cannot change database or query
            file types from protein and dna, respectively.

    This method returns an open file object. The output format defaults to
    blast9 and should be parsable by the PyCogent BLAST parsers.
    """
    if params is None:
        params = {}

    my_params = {'-t': 'prot', '-q': 'prot'}

    # make sure temp_dir specifies an absolute path
    if not isabs(temp_dir):
        raise ApplicationError("temp_dir must be an absolute path.")

    # if the user specified parameters other than default, then use them.
    # However, if they try to change the database or query types, raise an
    # application error.
    if '-t' in params or '-q' in params:
        raise ApplicationError(
            "Cannot change database or query types "
            "when using assign_dna_reads_to_protein_database. Use "
            "assign_reads_to_database instead.")

    if 'genetic_code' in params:
        my_genetic_code = GeneticCodes[params['genetic_code']]
        del params['genetic_code']
    else:
        my_genetic_code = GeneticCodes[1]

    my_params.update(params)

    # get six-frame translation of the input DNA sequences and write them
    # to a temporary file.
    tmp = get_tmp_filename(tmp_dir=temp_dir, result_constructor=str)
    tmp_out = open(tmp, 'w')

    for label, sequence in MinimalFastaParser(open(query_fasta_fp)):
        seq_id = label.split()[0]

        s = DNA.makeSequence(sequence)
        translations = my_genetic_code.sixframes(s)
        frames = [1, 2, 3, -1, -2, -3]
        translations = dict(zip(frames, translations))

        for frame, translation in sorted(translations.iteritems()):
            entry = '>{seq_id}_frame_{frame}\n{trans}\n'
            entry = entry.format(seq_id=seq_id, frame=frame,
                                 trans=translation)
            tmp_out.write(entry)

    tmp_out.close()
    result = assign_reads_to_database(tmp, database_fasta_fp, output_fp,
                                      params=my_params)

    remove(tmp)

    return result
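# A minimal call sketch for the function above. The paths and the choice
# of genetic code 11 are hypothetical illustrations only; BLAT and the
# supporting dependencies are assumed to be installed.
result = assign_dna_reads_to_protein_database(
    '/data/reads.fna',           # DNA reads (hypothetical path)
    '/data/proteins.fasta',      # protein reference (hypothetical path)
    '/data/hits.bl9',            # blast9-formatted output
    temp_dir='/tmp',
    params={'genetic_code': 11})  # optional; defaults to the standard code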
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    parameters = {}
    param_dict = {}

    # get the tree insertion method to use
    module = opts.insertion_method

    # create output directory
    output_dir = opts.output_dir
    create_dir(output_dir)

    # list of tree insertion methods
    tree_insertion_module_names = \
        {'raxml_v730': cogent.app.raxml_v730,
         'parsinsert': cogent.app.parsinsert,
         'pplacer': cogent.app.pplacer}

    # load input sequences and convert to phylip since the tools require
    # the query sequences to have phylip-compliant names
    load_aln = MinimalFastaParser(open(opts.input_fasta_fp, 'U'))
    aln = DenseAlignment(load_aln)
    seqs, align_map = aln.toPhylip()

    if opts.method_params_fp:
        param_dict = parse_qiime_parameters(open(opts.method_params_fp,
                                                 'U'))

    if module == 'raxml_v730':
        # load the reference sequences
        load_ref_aln = \
            DenseAlignment(MinimalFastaParser(open(opts.refseq_fp, 'U')))

        # combine and load the reference plus query
        combined_aln = MinimalFastaParser(
            StringIO(load_ref_aln.toFasta() + '\n' + aln.toFasta()))

        # overwrite the alignment map
        aln = DenseAlignment(combined_aln)
        seqs, align_map = aln.toPhylip()

        try:
            parameters = param_dict['raxml']
        except KeyError:
            parameters = {}

        tree = convert_tree_tips(align_map, opts.starting_tree_fp)

        # write out the tree with phylip labels
        updated_tree_fp = join(output_dir,
                               '%s_phylip_named_tree.tre' % module)
        write_updated_tree_file(updated_tree_fp, tree)

        # set the primary parameters for raxml
        parameters['-w'] = abspath(output_dir) + '/'
        parameters["-n"] = split(splitext(get_tmp_filename())[0])[-1]
        parameters["-t"] = updated_tree_fp

        if "-f" not in parameters:
            parameters["-f"] = 'v'
        if "-m" not in parameters:
            parameters["-m"] = 'GTRGAMMA'

    elif module == 'pplacer':
        try:
            parameters = param_dict['pplacer']
        except KeyError:
            parameters = {}

        # make sure stats file is passed
        if not opts.stats_fp:
            raise IOError(
                'When using pplacer, the RAxML-produced info file is '
                'required.')

        # set the primary parameters for pplacer - allow for user-defined
        parameters['--out-dir'] = abspath(output_dir) + '/'
        parameters["-t"] = opts.starting_tree_fp
        parameters['-r'] = opts.refseq_fp
        parameters['-s'] = opts.stats_fp

    elif module == 'parsinsert':
        try:
            parameters = param_dict['parsinsert']
        except KeyError:
            parameters = {}

        # define log fp
        log_fp = join(output_dir, 'parsinsert.log')

        # define tax assignment values fp
        tax_assign_fp = join(output_dir, 'parsinsert_assignments.log')
        parameters["-l"] = log_fp
        parameters["-o"] = tax_assign_fp
        parameters["-s"] = opts.refseq_fp
        parameters["-t"] = opts.starting_tree_fp

    # call the module and return a tree object
    result = \
        tree_insertion_module_names[module].insert_sequences_into_tree(
            seqs, moltype=DNA, params=parameters)

    result_tree = strip_and_rename_unwanted_labels_from_tree(align_map,
                                                             result)

    # write out the resulting tree
    final_tree = join(output_dir, '%s_final_placement.tre' % module)
    write_updated_tree_file(final_tree, result_tree)
from sys import argv

from cogent.parse.fasta import MinimalFastaParser

if __name__ == '__main__':
    metric_lines = [l.strip().split('\t')
                    for l in open(argv[1]) if not l.startswith('#')]
    chi_lines = [l.strip().split('\t')
                 for l in open(argv[2]) if not l.startswith('#')]

    chi = set([r[0] for r in chi_lines])
    metrics = dict([(id_, (float(inv), float(nonacgt)))
                    for id_, inv, nonacgt in metric_lines])

    seqs = MinimalFastaParser(open(argv[3]))
    output_seqs = open(argv[4], 'w')
    output_reasons = open(argv[4] + '.filtered', 'w')
    output_reasons.write("#ncbi_acc_w_ver\treason\n")

    for i, s in seqs:
        if i in chi:
            output_reasons.write("%s\tchimeric\n" % i)
            continue

        inv, nonacgt = metrics[i]

        if inv < 0.9:
            output_reasons.write(i + "\tinvariants < 90%")
            output_reasons.write("\n")
            continue
def parse_fasta(lines):
    """lightweight parser for KEGG FASTA format sequences"""
    for label, seq in MinimalFastaParser(lines):
        yield '\t'.join(list(kegg_label_fields(label)) + [seq] + ["\n"])
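# A minimal sketch of consuming the generator above. 'kegg.fasta' is a
# hypothetical input file; the exact field layout of each record depends
# on what kegg_label_fields returns for a KEGG label.
for record in parse_fasta(open('kegg.fasta')):
    # each record is tab-delimited: label fields, then the sequence
    print record.rstrip()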
def convert_fastq(fasta_file_path, qual_file_path, output_directory='.',
                  multiple_output_files=False, ascii_increment=33,
                  full_fastq=False, full_fasta_headers=False,
                  per_file_buffer_size=100000):
    '''Takes a FASTA and QUAL file, generates FASTQ file(s)

    fasta_file_path: filepath of input FASTA file.
    qual_file_path: filepath of input QUAL file (needed for making FASTQ
     files)
    output_directory: Directory to output converted files.
    multiple_output_files: Make one file per SampleID.
    ascii_increment: Conversion value for fastq ascii character to numeric
     quality score.
    full_fastq: Write labels to both sequence and quality score lines.
    full_fasta_headers: Retain all data on fasta label, instead of
     breaking at first whitespace.'''

    fasta_file = open(fasta_file_path, 'U')
    qual_file = open(qual_file_path, 'U')

    # if we're not using multiple output files, we can open the one (and
    # only) output file right now
    if not multiple_output_files:
        output_file_path = get_filename_with_new_ext(fasta_file_path,
                                                     '.fastq',
                                                     output_directory)
        fastq_file = open(output_file_path, 'w')
    else:
        fastq_lookup = defaultdict(str)

    # iterate through the FASTA and QUAL files entry by entry (assume the
    # entries are synchronized)
    for fasta_data, qual_data in izip(MinimalFastaParser(fasta_file),
                                      MinimalQualParser(qual_file)):
        qual_header = qual_data[0]
        fasta_header = fasta_data[0]

        label = fasta_header.split()[0]
        sample_id = label.split('_')[0]

        sequence = fasta_data[1]
        qual = qual_data[1]

        # check whether the entries are actually (at least nominally)
        # synch'd
        if qual_header != label:
            raise KeyError, ("QUAL header (%s) does not match "
                             "FASTA header (%s)") % (qual_header, label)

        if len(sequence) != len(qual):
            raise KeyError, ("Sequence length does not match QUAL length "
                             "for label (%s)") % label

        if multiple_output_files:
            output_file_path = get_filename_with_new_ext(
                fasta_file_path,
                '_' + sample_id + '.fastq',
                output_directory)
            # when we use multiple output files, records are buffered in
            # fastq_lookup and flushed in batches (opening each file in
            # append mode at flush time) to avoid using up all the file
            # handles

        if full_fasta_headers:
            fastq_sequence_header = fasta_header
        else:
            fastq_sequence_header = label

        if full_fastq:
            fastq_quality_header = fastq_sequence_header
        else:
            fastq_quality_header = ''

        # write the FASTQ header and sequence lines
        record = '@%s\n%s\n+%s\n' % (fastq_sequence_header, sequence,
                                     fastq_quality_header)
        if multiple_output_files:
            fastq_lookup[output_file_path] += record
        else:
            fastq_file.write(record)

        for qual_score in qual:
            # increment the qual score by the ascii_increment (default
            # 33), and write the corresponding character, which represents
            # that position's quality
            qual_score += ascii_increment

            if qual_score < 32 or qual_score > 126:
                raise ValueError, (
                    "Cannot convert quality score to ASCII code"
                    " between 32 and 126: " +
                    str(qual_score - ascii_increment) +
                    " using ascii_increment = " + str(ascii_increment))

            if multiple_output_files:
                fastq_lookup[output_file_path] += chr(qual_score)
            else:
                fastq_file.write(chr(qual_score))

        if multiple_output_files:
            fastq_lookup[output_file_path] += '\n'
        else:
            fastq_file.write('\n')

        if multiple_output_files:
            if len(fastq_lookup[output_file_path]) >= per_file_buffer_size:
                fastq_file = open(output_file_path, 'a')
                fastq_file.write(fastq_lookup[output_file_path])
                fastq_lookup[output_file_path] = ''
                fastq_file.close()

    # write last seqs to output files, or close the output file if there
    # is only one
    if multiple_output_files:
        for output_file_path, records in fastq_lookup.iteritems():
            if records:
                fastq_file = open(output_file_path, 'a')
                fastq_file.write(records)
                fastq_file.close()
    else:
        fastq_file.close()
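# Quick sanity check of the offset arithmetic used above: the conversion
# is plain Phred+33 with the default ascii_increment, so a quality of 40
# becomes chr(73), the letter 'I'.
ascii_increment = 33
for score in (0, 20, 40):
    print score, '->', chr(score + ascii_increment)
# prints: 0 -> !   20 -> 5   40 -> I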
def __call__(self, seq_path, otu_path, reference_path, result_path=None,
             log_path=None, sort_by='otu'):
    """Returns dict mapping {otu_id: (rep_id, rep_seq)} for each otu.

    Parameters:
    seq_path: path to file of sequences
    otu_path: path to file of OTUs
    result_path: path to file of results. If specified, dumps the result
     to the desired path instead of returning it.
    log_path: path to log, which includes dump of params.
    sort_by: sort by otu or seq_id
    """
    # Load the seq path. We may want to change that in the future
    # to avoid the overhead of loading large sequence collections
    # during this step.
    if seq_path:
        seq_f = open(seq_path, 'U')
        seqs = dict(MinimalFastaParser(seq_f, label_to_name=label_to_name))
        seq_f.close()
    else:
        # allows the user to not pass seqs, which can be useful when
        # all otus are based on reference sequences
        seqs = {}

    # Load the reference_path. We may want to change that in the future
    # to avoid the overhead of loading large sequence collections
    # during this step.
    reference_f = open(reference_path, 'U')
    reference_seqs = dict(
        MinimalFastaParser(reference_f, label_to_name=label_to_name))
    reference_f.close()

    # Load the otu file
    otu_f = open(otu_path, 'U')
    otus = fields_to_dict(otu_f)
    otu_f.close()

    if self.Params['ChoiceFRequiresSeqs']:
        choice_f = self.Params['ChoiceF'](seqs)
    else:
        choice_f = self.Params['ChoiceF']

    # actually pick the set
    result = {}
    for set_id, ids in otus.items():
        if set_id in reference_seqs:
            result[set_id] = (reference_seqs, set_id)
        elif seqs:
            result[set_id] = (seqs, choice_f(ids, seqs))
        else:
            raise KeyError,\
                "Unknown reference sequence identifier: %s\n" % set_id +\
                "Have you provided the correct reference sequence file? " +\
                "Did you forget to provide a seqs filepath for de novo " +\
                "OTUs?"

    if result_path:
        of = open(result_path, 'w')

        if sort_by == 'seq_id':
            def key(s):
                try:
                    return int(s[1].split('_', 1)[-1])
                except ValueError:
                    return s
        else:
            key = lambda s: s

        for cluster, rep in sorted(result.items(), key=key):
            seq_lookup, id_ = rep
            try:
                of.write('>%s %s\n%s\n' % (cluster, id_, seq_lookup[id_]))
            except KeyError:
                raise KeyError,\
                    "Sequence identifiers (%s and %s) " % (cluster, id_) +\
                    "not found in reference or sequence collection."
        of.close()
        result = None
        log_str = 'Result path: %s' % result_path
    else:
        # The return value here differs from GenericRepSetPicker
        # because it is possible for the representative sequences
        # to be ambiguous. For example, if the identifiers in
        # seq_path and reference_path are both integers, returning
        # a sequence identifier is not sufficient to determine which
        # sequence collection the reference sequence came from.
        # Therefore if the user did not provide a result_path, store
        # the result in a dict of {otu_id: (rep_id, rep_seq)}.
        log_str = 'Result path: None, returned as dict.'
        for cluster, rep in result.items():
            seq_lookup, id_ = rep
            try:
                result[cluster] = (id_, seq_lookup[id_])
            except KeyError:
                raise KeyError,\
                    "Sequence identifiers (%s and %s) " % (cluster, id_) +\
                    "not found in reference or sequence collection."

    if log_path:
        # if the user provided a log file path, log the run
        log_file = open(log_path, 'w')
        log_file.write(str(self))
        log_file.write('\n')
        log_file.write('%s\n' % log_str)

    # return the result (note this is None if the data was
    # written to file)
    return result
def __call__(self, seq_path=None, seqs=None, result_path=None,
             log_path=None):
    """Returns dict mapping {seq_id:(taxonomy, confidence)} for each seq.
    """
    assert seq_path or seqs, \
        "Must provide either seqs or seq_path when calling a " \
        "BlastTaxonAssigner."

    # initialize the logger
    logger = self._get_logger(log_path)
    logger.info(str(self))

    # assign the blast database, either as a pre-existing database
    # specified as self.Params['blast_db'] or by creating a
    # temporary database from the sequence file specified
    # as self.Params['reference_seqs_filepath']
    try:
        blast_db = self.Params['blast_db']
    except KeyError:
        # build a temporary blast_db
        reference_seqs_path = self.Params['reference_seqs_filepath']
        refseqs_dir, refseqs_name = os.path.split(reference_seqs_path)
        blast_db, db_files_to_remove = \
            build_blast_db_from_fasta_path(reference_seqs_path)

    # build the mapping of sequence identifier
    # (wrt to the blast db seqs) to taxonomy id
    id_to_taxonomy_map = self._parse_id_to_taxonomy_file(
        open(self.Params['id_to_taxonomy_filepath'], 'U'))

    ## Iterate over the input self.SeqsPerBlastRun seqs at a time.
    # There are two competing issues here when dealing with very large
    # inputs. If all sequences are read in at once, the containing object
    # can be very large, causing the system to page. On the other hand,
    # in such cases it would be very slow to treat each sequence
    # individually, since blast requires a filepath. Each call would
    # therefore involve writing a single sequence to file, opening/closing
    # and removing the file. To balance this, sequences are read in and
    # blasted in chunks of self.SeqsPerBlastRun (default: 1000) at a time.
    # This appears to solve the problem with the largest sets I've worked
    # with so far.

    if seq_path:
        # Get a seq iterator
        seqs = MinimalFastaParser(open(seq_path))

    # Build objects to keep track of the current set of sequences to be
    # blasted, and the results (i.e., seq_id -> (taxonomy, quality score)
    # mapping)
    current_seqs = []
    result = {}

    # Iterate over the (seq_id, seq) pairs
    for seq_id, seq in seqs:
        # append the current seq_id,seq to list of seqs to be blasted
        current_seqs.append((seq_id, seq))

        # When there are self.SeqsPerBlastRun seqs in the list, blast them
        if len(current_seqs) == self.SeqsPerBlastRun:
            # update the result object
            result.update(self._seqs_to_taxonomy(
                current_seqs, blast_db, id_to_taxonomy_map))
            # reset the list of seqs to be blasted
            current_seqs = []

    # Assign taxonomy to the remaining sequences
    result.update(self._seqs_to_taxonomy(
        current_seqs, blast_db, id_to_taxonomy_map))
    ## End iteration over the input self.SeqsPerBlastRun seqs at a time.

    # Write log data if we have a path (while the logger can handle
    # being called if we are not logging, some of these steps are slow).
    if log_path is not None:
        num_inspected = len(result)
        logger.info('Number of sequences inspected: %s' % num_inspected)
        num_null_hits = [r[1] for r in result.values()].count(None)
        logger.info('Number with no blast hits: %s' % num_null_hits)

    if result_path:
        # if the user provided a result_path, write the
        # results to file
        of = open(result_path, 'w')
        for seq_id, (lineage, confidence, blast_hit_id) in result.items():
            of.write('%s\t%s\t%s\t%s\n' %
                     (seq_id, lineage, confidence, blast_hit_id))
        of.close()
        result = None
        logger.info('Result path: %s' % result_path)
    else:
        # if no result_path was provided, return the data as a dict
        logger.info('Result path: None, returned as dict.')

    # clean-up temp blastdb files, if a temp blastdb was created
    if 'reference_seqs_filepath' in self.Params:
        map(remove, db_files_to_remove)

    # return the result
    return result
def denoise_per_sample(sff_fp, fasta_fp, tmpoutdir, cluster=False,
                       num_cpus=1, squeeze=True, percent_id=0.97, bail=1,
                       primer="", low_cutoff=3.75, high_cutoff=4.5,
                       log_fp="denoiser.log", low_memory=False,
                       verbose=False,
                       error_profile=DENOISER_DATA_DIR +
                       'FLX_error_profile.dat',
                       max_num_rounds=None, titanium=False):
    """Denoise each sample separately"""

    # abort early if binary is missing
    check_flowgram_ali_exe()

    log_fh = None
    if log_fp:
        # switch off buffering for global log file
        log_fh = open(tmpoutdir + "/" + log_fp, "w", 0)

    # overwrite settings if titanium is set
    # This flag is only used from qiime. Remove after qiime integration
    if titanium:
        error_profile = DENOISER_DATA_DIR + "Titanium_error_profile.dat"
        low_cutoff = 4
        high_cutoff = 5

    if verbose:
        log_fh.write("Denoiser version: %s\n" % __version__)
        log_fh.write("SFF file: %s\n" % sff_fp)
        log_fh.write("Fasta file: %s\n" % fasta_fp)
        log_fh.write("Cluster: %s\n" % cluster)
        log_fh.write("Num CPUs: %d\n" % num_cpus)
        log_fh.write("Squeeze Seqs: %s\n" % squeeze)
        log_fh.write("tmpdir: %s\n\n" % tmpoutdir)
        log_fh.write("percent_id threshold: %.2f\n" % percent_id)
        log_fh.write("Minimal sequence coverage for first phase: %d\n" %
                     bail)
        log_fh.write("Error profile: %s\n" % error_profile)
        log_fh.write("Maximal number of iterations: %s\n\n" %
                     max_num_rounds)

    # here we go ...
    sff_files = split_sff(open(sff_fp), open(fasta_fp), tmpoutdir)

    combined_mapping = {}
    result_centroids = []
    result_singletons_files = []

    # denoise each sample separately
    for i, sff_file in enumerate(sff_files):
        if not exists(tmpoutdir + ("/%d" % i)):
            makedirs(tmpoutdir + ("/%d" % i))
        out_fp = tmpoutdir + ("/%d/" % i)

        denoise_seqs(sff_file, fasta_fp, out_fp, None, cluster, num_cpus,
                     squeeze, percent_id, bail, primer, low_cutoff,
                     high_cutoff, log_fp, low_memory, verbose,
                     error_profile, max_num_rounds)

        # collect partial results
        this_rounds_mapping = read_denoiser_mapping(
            open(out_fp + "/denoiser_mapping.txt"))
        combined_mapping.update(this_rounds_mapping)
        result_centroids.append(
            MinimalFastaParser(open(out_fp + "/centroids.fasta")))
        result_singletons_files.append(out_fp + "/singletons.fasta")

    # write the combined files
    store_mapping(combined_mapping, tmpoutdir, "denoiser")
    seqs = chain(*result_centroids)
    fasta_fh = open(tmpoutdir + "/denoised.fasta", "w")

    # write centroids sorted by cluster size
    write_Fasta_from_name_seq_pairs(
        sort_seqs_by_clustersize(seqs, combined_mapping), fasta_fh)

    for singleton_file in result_singletons_files:
        write_Fasta_from_name_seq_pairs(
            MinimalFastaParser(open(singleton_file, "r")), fasta_fh)
    fasta_fh.close()

    # return outdir for tests/test_denoiser
    return tmpoutdir
def compute_min_alignment_length(seqs_f, fraction=0.75):
    """ compute the minimum alignment length as a fraction of the median
    sequence length
    """
    med_length = median([len(s) for _, s in MinimalFastaParser(seqs_f)])
    return int(med_length * fraction)
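# An illustrative call on an in-memory fasta (hypothetical data): three
# sequences of lengths 4, 6, and 8 have median length 6, so the default
# fraction of 0.75 gives int(6 * 0.75) == 4.
fasta = ['>a', 'ACGT', '>b', 'ACGTAC', '>c', 'ACGTACGT']
print compute_min_alignment_length(fasta)       # median 6 * 0.75 -> 4
print compute_min_alignment_length(fasta, 0.5)  # median 6 * 0.5  -> 3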
def __call__(self, seq_path, result_path=None, log_path=None,
             failure_path=None):
    # load candidate sequences
    seq_file = open(seq_path, 'U')
    candidate_sequences = MinimalFastaParser(seq_file)

    # load template sequences
    template_alignment = []
    template_alignment_fp = self.Params['template_filepath']
    for seq_id, seq in MinimalFastaParser(open(template_alignment_fp)):
        # replace '.' characters with '-' characters
        template_alignment.append((seq_id, seq.replace('.', '-').upper()))
    try:
        template_alignment = LoadSeqs(data=template_alignment, moltype=DNA,
                                      aligned=DenseAlignment)
    except KeyError as e:
        raise KeyError(
            'Only ACGT-. characters can be contained in template '
            'alignments. The offending character was: %s' % e)

    # initialize logger
    logger = NastLogger(log_path)

    # get function for pairwise alignment method
    pairwise_alignment_f = pairwise_alignment_methods[
        self.Params['pairwise_alignment_method']]

    pynast_aligned, pynast_failed = pynast_seqs(
        candidate_sequences,
        template_alignment,
        min_pct=self.Params['min_pct'],
        min_len=self.Params['min_len'],
        align_unaligned_seqs_f=pairwise_alignment_f,
        logger=logger,
        temp_dir=get_qiime_temp_dir())

    logger.record(str(self))

    if failure_path is not None:
        fail_file = open(failure_path, 'w')
        for seq in pynast_failed:
            fail_file.write(seq.toFasta())
            fail_file.write('\n')
        fail_file.close()

    if result_path is not None:
        result_file = open(result_path, 'w')
        for seq in pynast_aligned:
            result_file.write(seq.toFasta())
            result_file.write('\n')
        result_file.close()
        return None
    else:
        try:
            return LoadSeqs(data=pynast_aligned, aligned=DenseAlignment)
        except ValueError:
            return {}
def __call__(self, seq_path, result_path=None, log_path=None,
             failure_path=None, cmbuild_params=None, cmalign_params=None):

    log_params = []
    # load candidate sequences
    candidate_sequences = dict(MinimalFastaParser(open(seq_path, 'U')))

    # load template sequences
    try:
        info, template_alignment, struct = list(
            MinimalRfamParser(open(self.Params['template_filepath'], 'U'),
                              seq_constructor=ChangedSequence))[0]
    except RecordError:
        raise ValueError(
            "Template alignment must be in Stockholm format with "
            "corresponding secondary structure annotation when using "
            "InfernalAligner.")

    moltype = self.Params['moltype']

    # Need to make separate mapping for unaligned sequences
    unaligned = SequenceCollection(candidate_sequences, MolType=moltype)
    int_map, int_keys = unaligned.getIntMap(prefix='unaligned_')
    int_map = SequenceCollection(int_map, MolType=moltype)

    # Turn on --gapthresh option in cmbuild to force alignment to full
    # model
    if cmbuild_params is None:
        cmbuild_params = {}
    cmbuild_params.update({'--gapthresh': 1.0})

    # record cmbuild parameters
    log_params.append('cmbuild parameters:')
    log_params.append(str(cmbuild_params))

    # Turn on --sub option in Infernal, since we know the unaligned
    # sequences are fragments.
    # Also turn on --gapthresh to use same gapthresh as was used to build
    # the model
    if cmalign_params is None:
        cmalign_params = {}
    cmalign_params.update({'--sub': True, '--gapthresh': 1.0})

    # record cmalign parameters
    log_params.append('cmalign parameters:')
    log_params.append(str(cmalign_params))

    # Align sequences to alignment including alignment gaps.
    aligned, struct_string = cmalign_from_alignment(
        aln=template_alignment,
        structure_string=struct,
        seqs=int_map,
        moltype=moltype,
        include_aln=True,
        params=cmalign_params,
        cmbuild_params=cmbuild_params)

    # Pull out original sequences from full alignment.
    infernal_aligned = {}
    aligned_dict = aligned.NamedSeqs
    for key in int_map.Names:
        infernal_aligned[int_keys.get(key, key)] = aligned_dict[key]

    # Create an Alignment object from alignment dict
    infernal_aligned = Alignment(infernal_aligned, MolType=moltype)

    if log_path is not None:
        log_file = open(log_path, 'w')
        log_file.write('\n'.join(log_params))
        log_file.close()

    if result_path is not None:
        result_file = open(result_path, 'w')
        result_file.write(infernal_aligned.toFasta())
        result_file.close()
        return None
    else:
        try:
            return infernal_aligned
        except ValueError:
            return {}
def apply_lane_mask_and_gap_filter(fastalines, lane_mask,
                                   allowed_gap_frac=1 - eps,
                                   verbose=False):
    """Applies lanemask and gap filter to fasta file, yielding filtered
    seqs.
    """
    if lane_mask:
        # convert lane_mask to a numpy index array
        p = mask_to_positions(lane_mask)

        # special case: lanemask is all zeros
        if sum(p) == 0:
            for line in fastalines:
                if line.startswith(">"):
                    yield line + '\n'
                else:
                    yield '\n'
            return

    # random temporary file for first-pass results
    tmpfilename = "/tmp/" + "".join(sample(lowercase, 20)) + ".tmp"
    try:
        tmpfile = open(tmpfilename, 'w')
    except IOError:
        raise IOError, "Can't open temporary file for writing: %s" % \
            tmpfilename

    # the number of gaps seen in each position (length may be unknown
    # here)
    gapcounts = None

    # First pass: apply filter, and track gaps
    if verbose:
        print "First pass: applying lanemask..."
    seq_count = 0
    for k, v in MinimalFastaParser(fastalines):
        seq_count += 1

        # print progress in verbose mode
        if verbose and (seq_count % 100) == 0:
            status(seq_count)

        # apply lanemask if there is one
        if lane_mask:
            masked = get_masked_string(v, p)
        else:
            masked = v

        # initialize gapcount array to proper length
        if gapcounts is None:
            gapcounts = zeros(len(masked))

        # increment gap counts if requested
        if allowed_gap_frac < 1:
            gapcounts[find_gaps(masked)] += 1

        # write masked sequence to temporary file
        tmpfile.write('>%s\n%s\n' % (k, masked))

    if verbose:
        print
        print

    tmpfile.close()
    tmpfile = open(tmpfilename, 'U')

    # if we're not removing gaps, we're done; yield the temp file contents
    if allowed_gap_frac == 1:
        for line in tmpfile:
            yield line

    # else we are removing gaps; do second pass
    else:
        # convert gapcounts to a true/false keep mask
        gapcounts = (gapcounts / float(seq_count)) <= allowed_gap_frac

        # Second pass: remove all-gap positions
        if verbose:
            print "Second pass: remove all-gap positions..."
        seq_count = 0
        for k, v in MinimalFastaParser(tmpfile):
            seq_count += 1

            # print progress in verbose mode
            if verbose and (seq_count % 100) == 0:
                status(seq_count)

            masked = get_masked_string(v, gapcounts)
            yield '>%s\n' % k
            yield '%s\n' % masked
        if verbose:
            print

    # delete temporary file
    tmpfile.close()
    remove(tmpfilename)
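# Because the function above is a generator, the typical pattern is to
# stream the filtered records straight to an output file. A minimal
# sketch with hypothetical filenames and lanemask (positions marked '1'
# in the mask are kept):
lane_mask = '0110'
out = open('filtered.fasta', 'w')
for chunk in apply_lane_mask_and_gap_filter(open('aln.fasta'), lane_mask):
    out.write(chunk)
out.close()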
help="Number of CPUs \ (Default 1)") args = parser.parse_args() basefolder = args.f if basefolder[-1] != "/": basefolder += "/" knownfile = args.i rnd = args.r if args.c < 1: raise ValueError("CPUs must be greater than 0!") knownfile = open(knownfile, 'U') knownseqs = [] #build the storage vector holding known seq and output file for seq for header, seq in MinimalFastaParser(knownfile): seqfile = open(basefolder + header + ".fasta", 'w') seqfile.write(">%s\n%s\n" % (header, seq)) knownseqs.append((seq, seqfile)) knownfile.close() #loop over all the rounds to find sequence matches for currrnd in range(1, rnd + 1): rndname = "R" + str(currrnd) print rndname #print round info to each output file for info in knownseqs: info[1].write(">%s\n%s\n" % (rndname, rndname)) #multiprocess each round manager = Manager() hold = manager.dict()
def usearch61_chimera_check(input_seqs_fp,
                            output_dir,
                            reference_seqs_fp=None,
                            suppress_usearch61_intermediates=False,
                            suppress_usearch61_ref=False,
                            suppress_usearch61_denovo=False,
                            split_by_sampleid=False,
                            non_chimeras_retention="union",
                            usearch61_minh=0.28,
                            usearch61_xn=8.0,
                            usearch61_dn=1.4,
                            usearch61_mindiffs=3,
                            usearch61_mindiv=0.8,
                            usearch61_abundance_skew=2.0,
                            percent_id_usearch61=0.97,
                            minlen=64,
                            word_length=8,
                            max_accepts=1,
                            max_rejects=8,
                            verbose=False,
                            HALT_EXEC=False):
    """ Main convenience function for usearch61 chimera checking

    input_seqs_fp: filepath of input fasta file.
    output_dir: output directory
    reference_seqs_fp: fasta filepath for reference chimera detection.
    suppress_usearch61_intermediates: Suppress retention of .uc and log
     files.
    suppress_usearch61_ref: Suppress usearch61 reference chimera
     detection.
    suppress_usearch61_denovo: Suppress usearch61 de novo chimera
     detection.
    split_by_sampleid: Split by sample ID for de novo chimera detection.
    non_chimeras_retention: Set to "union" or "intersection" to retain
     non-chimeras between de novo and reference based results.
    usearch61_minh: Minimum score (h) to be classified as chimera.
     Increasing this value tends to reduce the number of false positives
     (and also sensitivity).
    usearch61_xn: Weight of "no" vote. Increasing this value tends to
     reduce the number of false positives (and also sensitivity).
    usearch61_dn: Pseudo-count prior for "no" votes (n). Increasing this
     value tends to reduce the number of false positives (and also
     sensitivity).
    usearch61_mindiffs: Minimum number of diffs in a segment. Increasing
     this value tends to reduce the number of false positives while
     reducing sensitivity to very low-divergence chimeras.
    usearch61_mindiv: Minimum divergence, i.e. 100% - identity between the
     query and closest reference database sequence. Expressed as a
     percentage, so the default is 0.8%, which allows chimeras that are up
     to 99.2% similar to a reference sequence.
    usearch61_abundance_skew: abundance skew for de novo chimera
     comparisons.
    percent_id_usearch61: identity to cluster sequences at
    minlen: minimum sequence length for use with usearch61
    word_length: length of nucleotide 'words' for usearch61
    max_accepts: max number of accepts for hits with usearch61
    max_rejects: max number of rejects for usearch61; increasing allows
     more sensitivity at a cost of speed
    HALT_EXEC: application controller option to halt execution and print
     the command
    """

    """ Need to cluster sequences de novo first to get 1. abundance
    information and 2. a consensus sequence for each cluster. Using
    dereplication followed by clustering does not appear to automatically
    update complete cluster size, so we directly cluster the raw seqs with
    the small_mem clustering option. This means that, without additional
    parsing steps to recalculate actual cluster sizes, the sizeorder
    option can't be used for de novo clustering and downstream chimera
    detection."""

    files_to_remove = []

    # Get absolute paths to avoid issues with calling usearch
    input_seqs_fp = abspath(input_seqs_fp)
    output_dir = abspath(output_dir)
    if reference_seqs_fp:
        reference_seqs_fp = abspath(reference_seqs_fp)

    log_fp = join(output_dir, "identify_chimeric_seqs.log")
    chimeras_fp = join(output_dir, "chimeras.txt")
    non_chimeras_fp = join(output_dir, "non_chimeras.txt")

    non_chimeras = []
    chimeras = []
    log_lines = {'denovo_chimeras': 0,
                 'denovo_non_chimeras': 0,
                 'ref_chimeras': 0,
                 'ref_non_chimeras': 0}

    if split_by_sampleid:
        if verbose:
            print "Splitting fasta according to SampleID..."

        full_seqs = open(input_seqs_fp, "U")
        sep_fastas = \
            split_fasta_on_sample_ids_to_files(MinimalFastaParser(full_seqs),
                                               output_dir)
        full_seqs.close()

        if suppress_usearch61_intermediates:
            files_to_remove += sep_fastas

        for curr_fasta in sep_fastas:
            curr_chimeras, curr_non_chimeras, files_to_remove, log_lines = \
                identify_chimeras_usearch61(curr_fasta, output_dir,
                                            reference_seqs_fp,
                                            suppress_usearch61_intermediates,
                                            suppress_usearch61_ref,
                                            suppress_usearch61_denovo,
                                            non_chimeras_retention,
                                            usearch61_minh, usearch61_xn,
                                            usearch61_dn,
                                            usearch61_mindiffs,
                                            usearch61_mindiv,
                                            usearch61_abundance_skew,
                                            percent_id_usearch61, minlen,
                                            word_length, max_accepts,
                                            max_rejects, files_to_remove,
                                            HALT_EXEC, log_lines, verbose)
            chimeras += curr_chimeras
            non_chimeras += curr_non_chimeras

    else:
        chimeras, non_chimeras, files_to_remove, log_lines = \
            identify_chimeras_usearch61(input_seqs_fp, output_dir,
                                        reference_seqs_fp,
                                        suppress_usearch61_intermediates,
                                        suppress_usearch61_ref,
                                        suppress_usearch61_denovo,
                                        non_chimeras_retention,
                                        usearch61_minh, usearch61_xn,
                                        usearch61_dn, usearch61_mindiffs,
                                        usearch61_mindiv,
                                        usearch61_abundance_skew,
                                        percent_id_usearch61, minlen,
                                        word_length, max_accepts,
                                        max_rejects, files_to_remove,
                                        HALT_EXEC, log_lines, verbose)

    # write log, non chimeras, chimeras.
    write_usearch61_log(log_fp, input_seqs_fp, output_dir,
                        reference_seqs_fp,
                        suppress_usearch61_intermediates,
                        suppress_usearch61_ref, suppress_usearch61_denovo,
                        split_by_sampleid, non_chimeras_retention,
                        usearch61_minh, usearch61_xn, usearch61_dn,
                        usearch61_mindiffs, usearch61_mindiv,
                        usearch61_abundance_skew, percent_id_usearch61,
                        minlen, word_length, max_accepts, max_rejects,
                        HALT_EXEC, log_lines)

    chimeras_f = open(chimeras_fp, "w")
    non_chimeras_f = open(non_chimeras_fp, "w")

    for curr_chimera in chimeras:
        chimeras_f.write("%s\n" % curr_chimera)
    for curr_non_chimera in non_chimeras:
        non_chimeras_f.write("%s\n" % curr_non_chimera)

    chimeras_f.close()
    non_chimeras_f.close()

    remove_files(files_to_remove)
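# A minimal call sketch for the convenience function above (hypothetical
# paths; the output directory is assumed to exist). De novo detection
# runs by default, and passing reference_seqs_fp supplies the database
# for the reference-based step as well:
usearch61_chimera_check('/data/seqs.fna',
                        '/data/usearch61_chimera',
                        reference_seqs_fp='/refs/gold.fa',
                        non_chimeras_retention='intersection',
                        verbose=True)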
def __call__(self, seq_path, otu_path, result_path=None, log_path=None,
             sort_by='otu'):
    """Returns dict mapping {otu_id: rep_id} for each otu.

    Parameters:
    seq_path: path to file of sequences
    otu_path: path to file of OTUs
    result_path: path to file of results. If specified, dumps the result
     to the desired path instead of returning it.
    log_path: path to log, which includes dump of params.
    sort_by: sort by otu or seq_id
    """
    # Load the seq path. We may want to change that in the future
    # to avoid the overhead of loading large sequence collections
    # during this step.
    seq_f = open(seq_path, 'U')
    seqs = dict(MinimalFastaParser(seq_f, label_to_name=label_to_name))
    seq_f.close()

    # Load the otu file
    otu_f = open(otu_path, 'U')
    otus = fields_to_dict(otu_f)
    otu_f.close()

    if self.Params['ChoiceFRequiresSeqs']:
        choice_f = self.Params['ChoiceF'](seqs)
    else:
        choice_f = self.Params['ChoiceF']

    # actually pick the set
    result = {}
    for set_id, ids in otus.items():
        result[set_id] = choice_f(ids, seqs)

    if result_path:
        # if the user provided a result_path, write the
        # results to file in fasta format, one record per cluster
        of = open(result_path, 'w')

        if sort_by == 'seq_id':
            def key(s):
                try:
                    return int(s[1].split('_', 1)[-1])
                except ValueError:
                    return s
        else:
            key = lambda s: s

        for cluster, id_ in sorted(result.items(), key=key):
            of.write('>%s %s\n%s\n' % (cluster, id_, seqs[id_]))
        of.close()
        result = None
        log_str = 'Result path: %s' % result_path
    else:
        # if the user did not provide a result_path, store
        # the result in a dict of {otu_id: rep_id}
        log_str = 'Result path: None, returned as dict.'

    if log_path:
        # if the user provided a log file path, log the run
        log_file = open(log_path, 'w')
        log_file.write(str(self))
        log_file.write('\n')
        log_file.write('%s\n' % log_str)

    # return the result (note this is None if the data was
    # written to file)
    return result
def assign_seqs(file_data, ids_bcs_added_field, bc_lens, all_bcs,
                keep_barcode=False, barcode_type="golay_12",
                max_bc_errors=1.5, start_index=1,
                write_unassigned_reads=False, disable_bc_correction=False,
                added_demultiplex_field=None):
    """ Demultiplexes, writes seqs/qual files, returns log data

    file_data: dict of open file objects, contains input fasta, qual, and
     mapping files, and output filepaths for partially demultiplexed fasta
     and qual files, and unassigned sequence output file.
    ids_bcs_added_field: dict of (barcode,added_demultiplex): SampleID
    bc_lens: Lengths of all barcodes from largest to smallest.
    all_bcs: List of all barcode sequences.
    keep_barcode: If True, will not remove barcode from output files.
    barcode_type: Specified barcode, can be golay_12, hamming_8,
     variable_length, or an integer specifying length.
    max_bc_errors: Number of changes allowed for error correcting
     barcodes; for generic barcodes, specifies the number of mismatches
     allowed.
    start_index: Specifies the first number used to enumerate output
     sequences.
    write_unassigned_reads: If True, will write sequences that could not
     be demultiplexed into a separate output file.
    disable_bc_correction: Only tests for exact matches to barcodes.
    added_demultiplex_field: Uses data supplied in metadata mapping field
     and demultiplexes according to data in fasta labels.
    """

    log_data = initialize_log_data(ids_bcs_added_field)
    bc_freqs = defaultdict(int)
    seq_counts = 0
    enum_val = start_index
    corrected_bc_count = [0, 0]

    if file_data['qual_files']:
        for curr_fasta, curr_qual in zip(file_data['fasta_files'],
                                         file_data['qual_files']):
            for fasta_data, qual_data in izip(
                    MinimalFastaParser(curr_fasta),
                    MinimalQualParser(curr_qual, full_header=True)):

                seq_counts += 1
                fasta_label, fasta_seq = fasta_data
                qual_label, qual_seq = qual_data

                bc, corrected_bc, num_errors, added_field = \
                    get_demultiplex_data(ids_bcs_added_field,
                                         fasta_label, fasta_seq, bc_lens,
                                         all_bcs, barcode_type,
                                         max_bc_errors,
                                         disable_bc_correction,
                                         added_demultiplex_field)

                bc_freqs[bc] += 1

                sample_id, log_id, bc_corrected_result = \
                    get_output_ids(ids_bcs_added_field, corrected_bc,
                                   num_errors, added_field, max_bc_errors,
                                   enum_val)

                if bc_corrected_result == 'corrected':
                    corrected_bc_count[0] += 1
                if bc_corrected_result == 'not_corrected':
                    corrected_bc_count[1] += 1

                label_line = get_label_line(sample_id, fasta_label, bc,
                                            corrected_bc, num_errors)

                if sample_id.startswith("Unassigned") and \
                        write_unassigned_reads:
                    write_fasta_line(file_data['unassigned_seqs_f'],
                                     fasta_seq, label_line, True, len(bc))
                    write_qual_line(file_data['unassigned_qual_f'],
                                    list(qual_seq), label_line, True,
                                    len(bc))
                elif not sample_id.startswith("Unassigned"):
                    write_fasta_line(file_data['demultiplexed_seqs_f'],
                                     fasta_seq, label_line, keep_barcode,
                                     len(bc))
                    write_qual_line(file_data['demultiplexed_qual_f'],
                                    list(qual_seq), label_line,
                                    keep_barcode, len(bc))
                    if log_id:
                        log_data[log_id] += 1

                enum_val += 1
    else:
        for curr_fasta in file_data['fasta_files']:
            for fasta_label, fasta_seq in MinimalFastaParser(curr_fasta):
                seq_counts += 1

                bc, corrected_bc, num_errors, added_field = \
                    get_demultiplex_data(ids_bcs_added_field,
                                         fasta_label, fasta_seq, bc_lens,
                                         all_bcs, barcode_type,
                                         max_bc_errors,
                                         disable_bc_correction,
                                         added_demultiplex_field)

                bc_freqs[bc] += 1

                sample_id, log_id, bc_corrected_result = \
                    get_output_ids(ids_bcs_added_field, corrected_bc,
                                   num_errors, added_field, max_bc_errors,
                                   enum_val)

                if bc_corrected_result == 'corrected':
                    corrected_bc_count[0] += 1
                if bc_corrected_result == 'not_corrected':
                    corrected_bc_count[1] += 1

                label_line = get_label_line(sample_id, fasta_label, bc,
                                            corrected_bc, num_errors)

                if sample_id.startswith("Unassigned") and \
                        write_unassigned_reads:
                    write_fasta_line(file_data['unassigned_seqs_f'],
                                     fasta_seq, label_line, True, len(bc))
                elif not sample_id.startswith("Unassigned"):
                    write_fasta_line(file_data['demultiplexed_seqs_f'],
                                     fasta_seq, label_line, keep_barcode,
                                     len(bc))
                    if log_id:
                        log_data[log_id] += 1

                enum_val += 1

    return log_data, bc_freqs, seq_counts, corrected_bc_count
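# Sketch of the file_data layout expected by assign_seqs, using only the
# keys the function reads above (hypothetical filenames; the qual handles
# can be omitted by passing an empty 'qual_files' list):
file_data = {
    'fasta_files': [open('reads.fna', 'U')],
    'qual_files': [open('reads.qual', 'U')],
    'demultiplexed_seqs_f': open('demultiplexed_seqs.fna', 'w'),
    'demultiplexed_qual_f': open('demultiplexed_seqs.qual', 'w'),
    'unassigned_seqs_f': open('unassigned_seqs.fna', 'w'),
    'unassigned_qual_f': open('unassigned_seqs.qual', 'w'),
}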
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_fp = opts.input_fp
    output_fp = opts.output_fp

    min_count = opts.min_count
    max_count = opts.max_count
    min_count_fraction = opts.min_count_fraction
    if min_count_fraction < 0. or min_count_fraction > 1.:
        option_parser.error("min_count_fraction must be between 0 and 1")
    if min_count != 0 and min_count_fraction != 0:
        option_parser.error(
            "cannot specify both min_count and min_count_fraction")

    min_samples = opts.min_samples
    max_samples = opts.max_samples

    otu_ids_to_exclude_fp = opts.otu_ids_to_exclude_fp
    negate_ids_to_exclude = opts.negate_ids_to_exclude

    if not (min_count != 0 or
            min_count_fraction != 0 or
            not isinf(max_count) or
            otu_ids_to_exclude_fp is not None or
            min_samples != 0 or not isinf(max_samples)):
        option_parser.error(
            "No filtering requested. Must provide either "
            "min counts, max counts, min samples, max samples, "
            "min_count_fraction, or exclude_fp (or some combination of "
            "those).")

    otu_table = parse_biom_table(open(opts.input_fp, 'U'))

    if min_count_fraction > 0:
        min_count = otu_table.sum() * min_count_fraction
        # report the total count and the derived min_count
        print otu_table.sum(), min_count

    output_f = open(opts.output_fp, 'w')

    otu_ids_to_keep = set(otu_table.ObservationIds)

    if otu_ids_to_exclude_fp:
        if otu_ids_to_exclude_fp.endswith('.fasta') or \
           otu_ids_to_exclude_fp.endswith('.fna'):
            otu_ids_to_exclude = set([
                id_.strip().split()[0]
                for id_, seq in MinimalFastaParser(
                    open(otu_ids_to_exclude_fp, 'U'))])
        else:
            otu_ids_to_exclude = set([
                l.strip().split('\t')[0]
                for l in open(otu_ids_to_exclude_fp, 'U')])

        otu_ids_to_keep -= otu_ids_to_exclude

    filtered_otu_table = filter_otus_from_otu_table(otu_table,
                                                    otu_ids_to_keep,
                                                    min_count,
                                                    max_count,
                                                    min_samples,
                                                    max_samples,
                                                    negate_ids_to_exclude)
    output_f.write(format_biom_table(filtered_otu_table))
    output_f.close()
def get_seqs_to_keep_lookup_from_fasta_file(fasta_f):
    """return the sequence ids within the fasta file"""
    return set([seq_id.split()[0]
                for seq_id, seq in MinimalFastaParser(fasta_f)])
def test_empty(self):
    """MinimalFastaParser should return empty list from 'file' w/o
    labels"""
    self.assertEqual(list(MinimalFastaParser(self.empty)), [])
    self.assertEqual(list(MinimalFastaParser(self.nolabels, strict=False)),
                     [])
    self.assertRaises(RecordError, list, MinimalFastaParser(self.nolabels))
if args.c < 1:
    raise ValueError("ERROR: CPU count must be at least 1!")
if args.sim <= 0.0 or args.sim > 1.0:
    raise ValueError("ERROR: clustering similarity must be > 0 and <= 1!")

clustscore = args.csc
outfolder = args.o.strip()
basefolder = args.f.strip()
if not exists(basefolder):
    raise IOError("Basefolder does not exist!")

# calculate minseqs if necessary
if args.minseqs == -1:
    with open(args.i) as fin:
        args.minseqs = int(
            count_seqs([h for h, s in MinimalFastaParser(fin)]) * 0.001)

if basefolder[-1] != "/":
    basefolder += "/"
if outfolder[-1] != "/":
    outfolder += "/"
if not exists(outfolder):
    mkdir(outfolder)

date = str(datetime.now())
print "Program started ", date

# print out run info to a file
infofile = open(outfolder + "runparams.txt", 'w')
infofile.write(''.join(["Program started ", date, "\n",
                        "FASTA file:\t", args.i, "\n",
def get_chimeras_from_Nast_aligned(seqs_fp, ref_db_aligned_fp=None,
                                   ref_db_fasta_fp=None, HALT_EXEC=False,
                                   min_div_ratio=None,
                                   keep_intermediates=False):
    """remove chimeras from seqs_fp using chimeraSlayer.

    seqs_fp: a filepath with the seqs to check in the file

    ref_db_aligned_fp: fp to (pynast) aligned reference sequences

    ref_db_fasta_fp: same seqs as above, just unaligned. Will be computed
     on the fly if not provided.

    HALT_EXEC: stop execution if true

    min_div_ratio: passed to ChimeraSlayer App
    """
    files_to_remove = []

    # might come in as FilePath object with quotes
    seqs_fp = str(seqs_fp)
    seqs_fp = seqs_fp.rstrip('"')
    seqs_fp = seqs_fp.lstrip('"')

    seqs_dir, new_seqs_fp = split(seqs_fp)

    # if fp is in current dir, we fake a dir change
    if seqs_dir == "":
        seqs_dir = "./"

    # ChimeraSlayer puts some temp files in the current dir and some in
    # the dir of the input file; use exec_dir to change to the dir of the
    # input file, so all tmp files end up in one place
    params = {'--query_NAST': new_seqs_fp,
              '--exec_dir': seqs_dir}

    if ref_db_aligned_fp is None and ref_db_fasta_fp is None:
        # use default db, whose relative position to the
        # ChimeraSlayer binary is hardcoded
        pass
    else:
        if not ref_db_fasta_fp:
            # make degapped reference file
            ref_db_fasta_fp = write_degapped_fasta_to_file(
                MinimalFastaParser(open(ref_db_aligned_fp)))
            files_to_remove.append(ref_db_fasta_fp)
        # use user db
        params.update({'--db_NAST': ref_db_aligned_fp,
                       '--db_FASTA': ref_db_fasta_fp})

    if min_div_ratio is not None:
        params.update({'-R': min_div_ratio})

    app = ChimeraSlayer(params=params, HALT_EXEC=HALT_EXEC)
    app_results = app()

    # this is a FilePath object in case of success.
    # How can we test for failure here?
    # if not exists(app_results['CPS']):
    #     raise ApplicationError, "ChimeraSlayer failed. No output file."

    chimeras = parse_CPS_file(app_results['CPS'])
    if not keep_intermediates:
        app.remove_intermediate_files()
    remove_files(files_to_remove)

    return chimeras
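# A minimal call sketch for the function above (hypothetical paths);
# the unaligned reference is derived on the fly from the aligned one
# when ref_db_fasta_fp is not supplied, and omitting both reference
# arguments falls back to ChimeraSlayer's bundled default database.
chimeras = get_chimeras_from_Nast_aligned(
    '/data/aligned_seqs.fasta',
    ref_db_aligned_fp='/refs/core_set_aligned.fasta')
for chim in chimeras:
    print chim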
def setUp(self):
    self.seqs = Alignment(dict(MinimalFastaParser(test_seqs.split())))
def preprocess(sff_fps, log_fh, fasta_fp=None, out_fp="/tmp/",
               verbose=False, squeeze=False,
               primer=STANDARD_BACTERIAL_PRIMER):
    """Quality filtering and truncation of flowgrams, followed by denoiser
    phase I.

    sff_fps: List of paths to flowgram files

    log_fh: log messages are written to log_fh if it is set to something
     other than None

    fasta_fp: Path to fasta file, formatted as from split_libraries.py.
     This file is used to filter the flowgrams in sff_fps. Only reads in
     fasta_fp are pulled from sff_fps.

    out_fp: path to output directory

    verbose: a binary verbose flag

    squeeze: a flag that controls if sequences are squeezed before
     phase I. Squeezing means consecutive identical nucs are collapsed to
     one.

    primer: The primer sequence of the amplification process. This seq
     will be removed from all reads during the preprocessing.
    """
    flowgrams, header = cat_sff_files(map(open, sff_fps))

    if fasta_fp:
        # remove barcodes and sequences tossed by split_libraries,
        # i.e. not in fasta_fp
        labels = imap(lambda a_b: a_b[0],
                      MinimalFastaParser(open(fasta_fp)))
        barcode_mapping = extract_barcodes_from_mapping(labels)
        (trunc_sff_fp, l) = truncate_flowgrams_in_SFF(
            flowgrams, header, outdir=out_fp,
            barcode_mapping=barcode_mapping, primer=primer)
        if verbose:
            log_fh.write("Sequences in barcode mapping: %d\n" %
                         len(barcode_mapping))
            log_fh.write("Truncated flowgrams written: %d\n" % l)
    else:
        # just do a simple clean and truncate
        (clean_sff_fp, l) = cleanup_sff(flowgrams, header, outdir=out_fp)
        if verbose:
            log_fh.write("Cleaned flowgrams written: %d\n" % l)
        flowgrams, header = lazy_parse_sff_handle(open(clean_sff_fp))
        (trunc_sff_fp, l) = truncate_flowgrams_in_SFF(flowgrams, header,
                                                      outdir=out_fp,
                                                      primer=primer)
        if verbose:
            log_fh.write("Truncated flowgrams written: %d\n" % l)
        remove(clean_sff_fp)

    if l == 0:
        raise ValueError("No flowgrams left after preprocessing.\n" +
                         "Check your primer sequence")

    # Phase I - cluster seqs which are exact prefixes
    if verbose:
        log_fh.write("Filter flowgrams by prefix matching\n")

    (flowgrams, header) = lazy_parse_sff_handle(open(trunc_sff_fp))
    l, orig_l, mapping = \
        prefix_filter_flowgrams(flowgrams, squeeze=squeeze)

    averaged_sff_fp, seqs = build_averaged_flowgrams(
        mapping, trunc_sff_fp,
        # averaging produces too-good flowgrams, such that the greedy
        # clustering clusters too much. Use the cluster centroid instead
        # by using min_coverage 1
        min_coverage=1,
        out_fp=out_fp + "/prefix_dereplicated.sff.txt")
    remove(trunc_sff_fp)

    if verbose:
        log_fh.write("Prefix matching: removed %d out of %d seqs\n" %
                     (orig_l - l, orig_l))
        log_fh.write("Remaining number of sequences: %d\n" % l)
        log_fh.write(make_stats(mapping) + "\n")

    # print representative sequences and mapping
    print_rep_seqs(mapping, seqs, out_fp)
    store_mapping(mapping, out_fp, "prefix")

    return (averaged_sff_fp, l, mapping, seqs)
def assign_taxonomy(data, min_confidence=0.80, output_fp=None,
                    training_data_fp=None, max_memory=None):
    """ Assign taxonomy to each sequence in data with the RDP classifier

    data: open fasta file object or list of fasta lines
    min_confidence: minimum support threshold to assign taxonomy to a
     sequence
    output_fp: path to write output; if not provided, result will be
     returned in a dict of {seq_id:(taxonomy_assignment,confidence)}
    """
    data = list(data)

    # build a map of seq identifiers as the RDP classifier doesn't
    # preserve these perfectly
    identifier_lookup = {}
    for seq_id, seq in MinimalFastaParser(data):
        identifier_lookup[seq_id.split()[0]] = seq_id

    # build the classifier object
    app = RdpClassifier20()
    if max_memory is not None:
        app.Parameters['-Xmx'].on(max_memory)
    if training_data_fp is not None:
        app.Parameters['-training-data'].on(training_data_fp)

    # apply the rdp app controller
    rdp_result = app('\n'.join(data))

    # grab assignment output
    result_lines = rdp_result['Assignments']

    # start a dict to store the assignments
    results = {}

    # ShortSequenceException messages are written to stdout
    # Tag these IDs as unassignable
    stdout_lines = rdp_result['StdOut']
    for line in stdout_lines:
        if line.startswith('ShortSequenceException'):
            matchobj = re.search('recordID=(\S+)', line)
            if matchobj:
                rdp_id = matchobj.group(1)
                orig_id = identifier_lookup[rdp_id]
                results[orig_id] = ('Unassignable', 1.0)

    # iterate over the identifier, assignment strings (this is a bit
    # of an abuse of the MinimalFastaParser, as these are not truly
    # fasta lines)
    for identifier, assignment_str in MinimalFastaParser(result_lines):
        # get the original identifier from the one in the rdp result
        identifier = identifier_lookup[
            identifier[:identifier.index('reverse=')].strip()]

        # build a list to store the assignments we're confident in
        # (i.e., the ones that have a confidence greater than
        # min_confidence)
        confident_assignments = []

        # keep track of the lowest acceptable confidence value that
        # has been encountered
        lowest_confidence = 0.0

        # split the taxonomy assignment string
        assignment_fields = assignment_str.split(';')

        # iterate over (assignment, assignment confidence) pairs
        for i in range(0, len(assignment_fields), 2):
            assignment = assignment_fields[i]
            try:
                assignment_confidence = float(assignment_fields[i + 1])
            except IndexError:
                break
            # check the confidence of the current assignment
            if assignment_confidence >= min_confidence:
                # if the current assignment confidence is greater than
                # the min, store the assignment and confidence value
                confident_assignments.append(assignment.strip())
                lowest_confidence = assignment_confidence
            else:
                # otherwise, we've reached the first assignment that
                # doesn't meet the confidence threshold, so bail out of
                # the loop
                break

        # store the identifier, the semicolon-separated assignments, and
        # the confidence for the last assignment
        results[identifier] = \
            (';'.join(confident_assignments), lowest_confidence)

    if output_fp:
        try:
            output_file = open(output_fp, 'w')
        except OSError:
            raise OSError, \
                "Can't open output file for writing: %s" % output_fp
        for seq_id, values in results.items():
            output_file.write('%s\t%s\t%1.3f\n' %
                              (seq_id, values[0], values[1]))
        output_file.close()
        return None
    else:
        return results
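# A self-contained re-creation of the inner confidence walk above, to
# make the threshold logic concrete (the assignment string is a
# hypothetical example): with min_confidence 0.80 we keep only the
# levels whose confidence stays at or above 0.80 and report the
# confidence of the deepest kept level.
assignment_str = 'Bacteria;1.0;Firmicutes;0.92;Clostridia;0.55'
min_confidence = 0.80
fields = assignment_str.split(';')
kept, lowest = [], 0.0
for i in range(0, len(fields), 2):
    try:
        conf = float(fields[i + 1])
    except IndexError:
        break
    if conf >= min_confidence:
        kept.append(fields[i].strip())
        lowest = conf
    else:
        break
print ';'.join(kept), lowest  # prints: Bacteria;Firmicutes 0.92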
from sys import argv

from cogent.parse.fasta import MinimalFastaParser

silva_taxa = open(argv[1], "U")

id_to_taxa = {}

for line in silva_taxa:
    curr_id = line.split()[0].strip()
    curr_taxa = " ".join(line.split()[1:]).strip()
    id_to_taxa[curr_id] = curr_taxa

rep_set_fasta = open(argv[2], "U")

ordered_ids = []

for label, seq in MinimalFastaParser(rep_set_fasta):
    ordered_ids.append(label)

ordered_ids = set(ordered_ids)

otu_mapping = open(argv[3], "U")

matched_ids = {}

for line in otu_mapping:
    if len(line.strip()) == 0:
        continue
    curr_line = line.strip().split('\t')
    curr_otu = curr_line[0]
    all_seqs = curr_line[1:]
    for seq in all_seqs:
def parse_and_submit_params(key, project_id, seq_file, output_dir,
                            submit_to_server=True):
    '''This function takes the input options from the user and generates a
    url and request header for submitting to the MG-RAST cgi script'''

    # Verify that the user's computer can connect to the internet
    try:
        check_internet = urlopen('http://www.google.com')
    except:
        raise OSError, \
            "This script is having trouble connecting to the internet!"

    # parse and split fasta file into individual sample fastas
    fasta_file = MinimalFastaParser(open(seq_file))
    split_fasta_on_sample_ids_to_files(fasta_file, output_dir)

    # set the MG-RAST link for QIIME
    host = 'metagenomics.anl.gov'

    # open the log html
    log_file = open(os.path.join(output_dir, 'log.html'), 'w')
    log_data = ['<h3>The following jobs were submitted to MG-RAST.</h3>']
    log_data.append('<table border=1><tr><th>Fasta File</th>'
                    '<th>Job ID</th>')
    log_data.append('<th>md5</th></tr>')

    num = 0
    # iterate over the fasta files in the given directory
    fasta_filepaths = glob('%s/*.fasta' % output_dir)
    fasta_filepaths.sort()
    for i in fasta_filepaths:

        # Get the sample id from the fasta filename
        sample_id = os.path.split(os.path.splitext(i)[0])[-1]

        # set the parameters
        params = [('key', key), ('sample', sample_id),
                  ('project', project_id)]

        # get the full path and short name for the fasta file to be
        # uploaded
        file_to_submit = os.path.abspath(i)
        fasta_shortname = os.path.split(file_to_submit)[-1]

        # open and read file to be put in post form
        file_object = open(file_to_submit).read()

        # set the file
        files = [('file', fasta_shortname, file_object)]

        # Post the file and parameters
        response = post_multipart(host, params, files, submit_to_server)

        # check the response for MG-RAST errors
        job = re.findall(r'<id>.*</id>', response)
        md5 = re.findall(r'<md5>.*</md5>', response)

        # if job successful, write to log html; otherwise post an error
        # message in the log file
        if job and md5:
            job_id = job[0].strip('<id>').strip('</id>')
            md5_id = md5[0].strip('<md5>').strip('</md5>')
            log_data.append('<tr><td>%s</td><td>%s</td><td>%s</td></tr>' %
                            (fasta_shortname, job_id, md5_id))
        else:
            response_error = re.findall(r'Can\'t call method "login" ',
                                        response)
            if response_error:
                log_data.append('</table><br><h3 style="color:red">')
                log_data.append('Web-service authorization key is not '
                                'valid!')
                log_data.append('</h3>')
            else:
                log_data.append('</table><br><h3 style="color:red">%s'
                                '</h3>' % response)

    log_data.append('</table>')
    log_info = '\n'.join(log_data)

    # write and close the log html
    log_file.write(log_html % log_info)
    log_file.close()

    return log_info
def check_fasta_seqs(input_fasta_fp, barcodes, linkerprimerseqs,
                     total_seq_count,
                     valid_chars=frozenset(['A', 'T', 'C', 'G', 'N',
                                            'a', 't', 'c', 'g', 'n'])):
    """ Returns perc of seqs w/ invalid chars, barcodes, or primers present

    input_fasta_fp: fasta filepath
    barcodes: set of barcodes from the mapping file
    linkerprimerseqs: set of linkerprimersequences from the mapping file
    total_seq_count: total number of sequences in fasta file
    valid_chars: currently allowed DNA chars
    """
    input_fasta_f = open(input_fasta_fp, "U")

    invalid_chars_count = 0
    barcodes_count = 0
    linkerprimers_count = 0
    barcodes_at_start = 0

    # Get the max barcode length, for checking the beginning of each seq
    # for a barcode
    max_bc_len = max([len(bc) for bc in barcodes])

    for label, seq in MinimalFastaParser(input_fasta_f):

        # Only count one offending problem per sequence
        for curr_nt in seq:
            if curr_nt not in valid_chars:
                invalid_chars_count += 1
                break

        sliced_seq = seq[0:max_bc_len]

        for curr_bc in barcodes:
            if curr_bc in sliced_seq:
                barcodes_at_start += 1
                break

        for curr_bc in barcodes:
            if curr_bc in seq:
                barcodes_count += 1
                break

        for curr_primer in linkerprimerseqs:
            if curr_primer in seq:
                linkerprimers_count += 1
                break

    invalid_chars_count = float(invalid_chars_count)
    barcodes_count = float(barcodes_count)
    linkerprimers_count = float(linkerprimers_count)
    total_seq_count = float(total_seq_count)
    barcodes_at_start_count = float(barcodes_at_start)

    perc_invalid_chars = "%1.3f" % (invalid_chars_count / total_seq_count)
    perc_barcodes_detected = "%1.3f" % (barcodes_count / total_seq_count)
    perc_primers_detected = "%1.3f" % \
        (linkerprimers_count / total_seq_count)
    perc_barcodes_at_start_detected = "%1.3f" % \
        (barcodes_at_start_count / total_seq_count)

    return perc_invalid_chars, perc_barcodes_detected, \
        perc_primers_detected, perc_barcodes_at_start_detected
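# An illustrative call for the checker above (hypothetical path and
# mapping-file data); each returned value is a percentage formatted as a
# string, e.g. '0.500' if half the sequences contain a barcode:
percs = check_fasta_seqs('/data/seqs.fna',
                         barcodes=set(['ACGTACGTACGT']),
                         linkerprimerseqs=set(['CATGCTGCCTCCCGTAGGAGT']),
                         total_seq_count=1000)
perc_invalid, perc_bc, perc_primer, perc_bc_at_start = percs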
        result['SS'] = ResultPath(Path=self.WorkingDir + 'alirna.ps',
                                  IsWritten=True)
        return result


def rnaalifold_from_alignment(aln, moltype=RNA, params=None):
    """Returns seq, pairs, folding energy for alignment.
    """
    # Create Alignment object. Object will handle if seqs are unaligned.
    aln = Alignment(aln, MolType=RNA)
    int_map, int_keys = aln.getIntMap()

    app = RNAalifold(WorkingDir='/tmp',
                     InputHandler='_input_as_multiline_string',
                     params=params)
    res = app(clustal_from_alignment(int_map))
    #seq,pairs,energy = rnaalifold_parser(res['StdOut'].readlines())
    pairs_list = MinimalRnaalifoldParser(res['StdOut'].readlines())
    res.cleanUp()
    return pairs_list


if __name__ == "__main__":
    from sys import argv
    aln_file = argv[1]
    aln = dict(MinimalFastaParser(open(aln_file, 'U')))
    res = rnaalifold_from_alignment(aln)
    print res
def convert_fastq(fasta_file_path, qual_file_path, output_directory='.',
                  multiple_output_files=False, ascii_increment=33,
                  full_fastq=False, full_fasta_headers=False):
    '''Takes a FASTA and QUAL file, generates FASTQ file(s)

    fasta_file_path: filepath of input FASTA file.
    qual_file_path: filepath of input QUAL file (needed for making FASTQ
     files)
    output_directory: Directory to output converted files.
    multiple_output_files: Make one file per SampleID.
    ascii_increment: Conversion value for fastq ascii character to numeric
     quality score.
    full_fastq: Write labels to both sequence and quality score lines.
    full_fasta_headers: Retain all data on fasta label, instead of
     breaking at first whitespace.'''

    output_files = {}

    fasta_file = open(fasta_file_path, 'U')
    qual_file = open(qual_file_path, 'U')

    # Need to open each file the first time as "w", thereafter open as "a"
    sample_ids_written = {}

    for fasta_data, qual_data in izip(MinimalFastaParser(fasta_file),
                                      MinimalQualParser(qual_file)):
        qual_header = qual_data[0]
        fasta_header = fasta_data[0]

        label = fasta_header.split()[0]
        sample_id = label.split('_')[0]

        sequence = fasta_data[1]
        qual = qual_data[1]

        try:
            quality_scores = qual_data[1]
        except KeyError:
            raise KeyError, ("No entry in QUAL file for label: %s\n" %
                             label)

        if qual_header != label:
            raise KeyError, ("Fasta(%s) and qual(%s) headers don't match" %
                             (label, qual_header))

        if len(qual) != len(sequence):
            raise KeyError, ("Number of quality scores "
                             "(%d) does not match number of positions "
                             "(%d) for label: %s" %
                             (len(qual), len(sequence), label))

        if not multiple_output_files:
            output_file_path = path.join(
                output_directory,
                path.splitext(path.split(fasta_file_path)[1])[0] +
                '.fastq')

            if output_file_path in sample_ids_written.keys():
                sample_ids_written[output_file_path] = True
            else:
                sample_ids_written[output_file_path] = False

            try:
                # Create new file if first time writing, else append
                if sample_ids_written[output_file_path]:
                    fastq_file = open(output_file_path, 'a')
                else:
                    fastq_file = open(output_file_path, 'w')
            except IOError:
                qual_file.close()
                fasta_file.close()
                raise IOError, ("Could not open FASTQ file for writing: " +
                                output_file_path + '\n')

            output_files[sample_id] = output_file_path

        if multiple_output_files:
            if sample_id not in output_files:
                output_file_path = path.join(
                    output_directory,
                    path.splitext(path.split(fasta_file_path)[1])[0] +
                    '_' + sample_id + '.fastq')

                if output_file_path in sample_ids_written.keys():
                    sample_ids_written[output_file_path] = True
                else:
                    sample_ids_written[output_file_path] = False

                try:
                    # Create new file if first time writing, else append
                    if sample_ids_written[output_file_path]:
                        output_files[sample_id] = open(output_file_path,
                                                       'a')
                    else:
                        output_files[sample_id] = open(output_file_path,
                                                       'w')
                except IOError:
                    raise IOError, ("Could not open FASTQ file for "
                                    "writing: " + output_file_path + '\n')

                output_files[sample_id] = output_file_path

            fastq_file = open(output_files[sample_id], 'a')

        if full_fasta_headers:
            fastq_sequence_header = fasta_header
        else:
            fastq_sequence_header = label

        if full_fastq:
            fastq_quality_header = fastq_sequence_header
        else:
            fastq_quality_header = ''

        # Writing to FASTQ file
        fastq_file.write('@' + fastq_sequence_header + '\n')
        fastq_file.write(sequence + '\n')
        fastq_file.write('+' + fastq_quality_header + '\n')

        qual_scores = list(qual)

        for qual_score in qual_scores:
            # increment the qual score by the ascii_increment (default
            # 33), and write the corresponding character, which represents
            # that position's quality
            qual_score += ascii_increment

            if qual_score < 32 or qual_score > 126:
                raise ValueError, (
                    "Cannot convert quality score to ASCII code"
                    " between 32 and 126: " +
                    str(qual_score - ascii_increment) +
                    " using ascii_increment = " + str(ascii_increment))

            fastq_file.write(chr(qual_score))

        fastq_file.write('\n')

        if multiple_output_files:
            fastq_file.close()
def process_silva(seqs, tax_out, seq_out):
    for label, seq in MinimalFastaParser(seqs):
        new_header, taxonomy = parse_label(label)
        fixed_seq = parse_seq(seq)
        tax_out.write(new_header + '\t' + taxonomy + '\n')
        seq_out.write('>' + new_header + '\n' + fixed_seq + '\n')
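# A minimal driver sketch for the function above (hypothetical
# filenames), showing the expected handles: a fasta input plus two
# writable outputs for the taxonomy map and the cleaned sequences.
seqs = open('silva.fasta', 'U')
tax_out = open('silva_taxonomy.txt', 'w')
seq_out = open('silva_fixed.fasta', 'w')
process_silva(seqs, tax_out, seq_out)
tax_out.close()
seq_out.close()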