def main():
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    split_fasta_on_sample_ids_to_files(
        MinimalFastaParser(open(opts.input_fasta_fp, 'U')),
        opts.output_dir,
        opts.buffer_size)
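# For reference, a minimal sketch of the splitting logic that
# split_fasta_on_sample_ids_to_files presumably performs, assuming
# QIIME-style post-split-libraries labels ("SampleID_ReadNumber ..."),
# where the sample ID is everything up to the last underscore of the first
# whitespace-delimited field. The function name, the
# one-file-handle-per-sample strategy, and the return value here are
# illustrative assumptions, not the actual implementation (which also
# supports a write buffer via buffer_size).
import os


def split_fasta_on_sample_ids_to_files_sketch(seqs, output_dir):
    out_files = {}
    for label, seq in seqs:
        # e.g. "S1_42 orig_bc=ACGT" -> sample ID "S1"
        sample_id = label.split()[0].rsplit('_', 1)[0]
        if sample_id not in out_files:
            out_files[sample_id] = open(
                os.path.join(output_dir, '%s.fasta' % sample_id), 'w')
        out_files[sample_id].write('>%s\n%s\n' % (label, seq))
    for f in out_files.values():
        f.close()
    # return the per-sample fasta filepaths that were written
    return sorted(f.name for f in out_files.values())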
def parse_and_submit_params(key, project_id, seq_file, output_dir,
                            submit_to_server=True):
    '''This function takes the input options from the user and generates a
    url and request header for submitting to the MG-RAST cgi script'''
    # Verify that the user's computer can connect to the internet
    try:
        check_internet = urlopen('http://www.google.com')
    except:
        raise OSError(
            "This script is having trouble connecting to the internet!")

    # parse and split the fasta file into individual per-sample fastas
    fasta_file = parse_fasta(open(seq_file))
    split_fasta_on_sample_ids_to_files(fasta_file, output_dir)

    # set the MG-RAST host for QIIME
    host = 'metagenomics.anl.gov'

    # open the log html
    log_file = open(os.path.join(output_dir, 'log.html'), 'w')
    log_data = ['<h3>The following jobs were submitted to MG-RAST.</h3>']
    log_data.append('<table border=1><tr><th>Fasta File</th><th>Job ID</th>')
    log_data.append('<th>md5</th></tr>')

    # iterate over the fasta files in the given directory
    fasta_filepaths = sorted(glob('%s/*.fasta' % output_dir))
    for fasta_fp in fasta_filepaths:
        # get the sample id from the fasta filename
        sample_id = os.path.split(os.path.splitext(fasta_fp)[0])[-1]

        # set the parameters
        params = [('key', key), ('sample', sample_id),
                  ('project', project_id)]

        # get the full path and short name for the fasta file to be uploaded
        file_to_submit = os.path.abspath(fasta_fp)
        fasta_shortname = os.path.split(file_to_submit)[-1]

        # open and read the file to be put in the post form
        file_object = open(file_to_submit).read()

        # set the file
        files = [('file', fasta_shortname, file_object)]

        # post the file and parameters
        response = post_multipart(host, params, files, submit_to_server)

        # check the response for MG-RAST errors; capture groups avoid
        # relying on str.strip, which removes a character set rather than
        # a substring
        job = re.findall(r'<id>(.*)</id>', response)
        md5 = re.findall(r'<md5>(.*)</md5>', response)

        # if the job was submitted successfully, write a row to the log
        # html; otherwise post an error message in the log file
        if job and md5:
            job_id = job[0]
            md5_id = md5[0]
            log_data.append('<tr><td>%s</td><td>%s</td><td>%s</td></tr>'
                            % (fasta_shortname, job_id, md5_id))
        else:
            response_error = re.findall(
                r'Can\'t call method "login" ', response)
            if response_error:
                log_data.append('</table><br><h3 style="color:red">')
                log_data.append('Web-service authorization key is not valid!')
                log_data.append('</h3>')
            else:
                log_data.append('</table><br><h3 style="color:red">%s</h3>'
                                % response)

    log_data.append('</table>')
    log_info = '\n'.join(log_data)

    # write and close the log html
    log_file.write(log_html % log_info)
    log_file.close()

    return log_info
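# post_multipart is assumed above to wrap an HTTP multipart/form-data POST
# to the MG-RAST cgi script. A minimal sketch of encoding such a request
# body, following the standard multipart wire format (boundary delimiters,
# Content-Disposition headers per field, raw file content); the boundary
# string and the octet-stream content type are illustrative assumptions,
# not MG-RAST requirements. fields is a list of (name, value) pairs and
# files is a list of (name, filename, content) triples, matching the
# params/files structures built in parse_and_submit_params.
def encode_multipart_formdata_sketch(fields, files):
    boundary = '----------bound_of_HTTP_parts_$'
    lines = []
    for (key, value) in fields:
        lines.append('--' + boundary)
        lines.append('Content-Disposition: form-data; name="%s"' % key)
        lines.append('')
        lines.append(value)
    for (key, filename, content) in files:
        lines.append('--' + boundary)
        lines.append('Content-Disposition: form-data; name="%s"; '
                     'filename="%s"' % (key, filename))
        lines.append('Content-Type: application/octet-stream')
        lines.append('')
        lines.append(content)
    # the closing boundary has a trailing "--"
    lines.append('--' + boundary + '--')
    lines.append('')
    body = '\r\n'.join(lines)
    content_type = 'multipart/form-data; boundary=%s' % boundary
    return content_type, body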
def usearch61_chimera_check(input_seqs_fp,
                            output_dir,
                            reference_seqs_fp=None,
                            suppress_usearch61_intermediates=False,
                            suppress_usearch61_ref=False,
                            suppress_usearch61_denovo=False,
                            split_by_sampleid=False,
                            non_chimeras_retention="union",
                            usearch61_minh=0.28,
                            usearch61_xn=8.0,
                            usearch61_dn=1.4,
                            usearch61_mindiffs=3,
                            usearch61_mindiv=0.8,
                            usearch61_abundance_skew=2.0,
                            percent_id_usearch61=0.97,
                            minlen=64,
                            word_length=8,
                            max_accepts=1,
                            max_rejects=8,
                            verbose=False,
                            threads=1.0,
                            HALT_EXEC=False):
    """ Main convenience function for usearch61 chimera checking

    input_seqs_fp: filepath of input fasta file.
    output_dir: output directory
    reference_seqs_fp: fasta filepath for reference chimera detection.
    suppress_usearch61_intermediates: Suppress retention of .uc and log files.
    suppress_usearch61_ref: Suppress usearch61 reference chimera detection.
    suppress_usearch61_denovo: Suppress usearch61 de novo chimera detection.
    split_by_sampleid: Split by sample ID for de novo chimera detection.
    non_chimeras_retention: Set to "union" or "intersection" to retain
     non-chimeras between de novo and reference based results.
    usearch61_minh: Minimum score (h) to be classified as chimera.
     Increasing this value tends to reduce the number of false positives
     (and also sensitivity).
    usearch61_xn: Weight of "no" vote. Increasing this value tends to
     reduce the number of false positives (and also sensitivity).
    usearch61_dn: Pseudo-count prior for "no" votes (n). Increasing this
     value tends to reduce the number of false positives (and also
     sensitivity).
    usearch61_mindiffs: Minimum number of diffs in a segment. Increasing
     this value tends to reduce the number of false positives while
     reducing sensitivity to very low-divergence chimeras.
    usearch61_mindiv: Minimum divergence, i.e. 100% - identity between the
     query and closest reference database sequence. Expressed as a
     percentage, so the default is 0.8%, which allows chimeras that are up
     to 99.2% similar to a reference sequence.
    usearch61_abundance_skew: abundance skew for de novo chimera comparisons.
    percent_id_usearch61: identity to cluster sequences at
    minlen: minimum sequence length for use with usearch61
    word_length: length of nucleotide 'words' for usearch61
    max_accepts: max number of accepts for hits with usearch61
    max_rejects: max number of rejects for usearch61; increasing allows
     more sensitivity at a cost of speed
    threads: Specify number of threads used per core per CPU
    HALT_EXEC: application controller option to halt execution and print
     the command
    """
    # Sequences need to be clustered de novo first to get (1) abundance
    # information and (2) a consensus sequence for each cluster. Using
    # dereplication followed by clustering does not appear to automatically
    # update the complete cluster size, so raw seqs are clustered directly
    # with the small_mem clustering option. This means that, without
    # additional parsing steps to recalculate actual cluster sizes, the
    # sizeorder option can't be used for de novo clustering and downstream
    # chimera detection.

    files_to_remove = []

    # Get absolute paths to avoid issues with calling usearch
    input_seqs_fp = abspath(input_seqs_fp)
    output_dir = abspath(output_dir)
    if reference_seqs_fp:
        reference_seqs_fp = abspath(reference_seqs_fp)

    log_fp = join(output_dir, "identify_chimeric_seqs.log")
    chimeras_fp = join(output_dir, "chimeras.txt")
    non_chimeras_fp = join(output_dir, "non_chimeras.txt")

    non_chimeras = []
    chimeras = []
    log_lines = {'denovo_chimeras': 0,
                 'denovo_non_chimeras': 0,
                 'ref_chimeras': 0,
                 'ref_non_chimeras': 0}

    if split_by_sampleid:
        if verbose:
            print "Splitting fasta according to SampleID..."
        full_seqs = open(input_seqs_fp, "U")
        sep_fastas =\
            split_fasta_on_sample_ids_to_files(MinimalFastaParser(full_seqs),
                                               output_dir)
        full_seqs.close()

        if suppress_usearch61_intermediates:
            files_to_remove += sep_fastas

        for curr_fasta in sep_fastas:
            curr_chimeras, curr_non_chimeras, files_to_remove, log_lines =\
                identify_chimeras_usearch61(
                    curr_fasta, output_dir, reference_seqs_fp,
                    suppress_usearch61_intermediates, suppress_usearch61_ref,
                    suppress_usearch61_denovo, non_chimeras_retention,
                    usearch61_minh, usearch61_xn, usearch61_dn,
                    usearch61_mindiffs, usearch61_mindiv,
                    usearch61_abundance_skew, percent_id_usearch61, minlen,
                    word_length, max_accepts, max_rejects, files_to_remove,
                    HALT_EXEC, log_lines, verbose, threads)
            chimeras += curr_chimeras
            non_chimeras += curr_non_chimeras

    else:
        chimeras, non_chimeras, files_to_remove, log_lines =\
            identify_chimeras_usearch61(
                input_seqs_fp, output_dir, reference_seqs_fp,
                suppress_usearch61_intermediates, suppress_usearch61_ref,
                suppress_usearch61_denovo, non_chimeras_retention,
                usearch61_minh, usearch61_xn, usearch61_dn,
                usearch61_mindiffs, usearch61_mindiv,
                usearch61_abundance_skew, percent_id_usearch61, minlen,
                word_length, max_accepts, max_rejects, files_to_remove,
                HALT_EXEC, log_lines, verbose, threads)

    # write the log, non-chimeras, and chimeras
    write_usearch61_log(
        log_fp, input_seqs_fp, output_dir, reference_seqs_fp,
        suppress_usearch61_intermediates, suppress_usearch61_ref,
        suppress_usearch61_denovo, split_by_sampleid,
        non_chimeras_retention, usearch61_minh, usearch61_xn, usearch61_dn,
        usearch61_mindiffs, usearch61_mindiv, usearch61_abundance_skew,
        percent_id_usearch61, minlen, word_length, max_accepts,
        max_rejects, HALT_EXEC, log_lines)

    chimeras_f = open(chimeras_fp, "w")
    non_chimeras_f = open(non_chimeras_fp, "w")

    for curr_chimera in chimeras:
        chimeras_f.write("%s\n" % curr_chimera)
    for curr_non_chimera in non_chimeras:
        non_chimeras_f.write("%s\n" % curr_non_chimera)

    chimeras_f.close()
    non_chimeras_f.close()

    remove_files(files_to_remove)
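# non_chimeras_retention controls how the de novo and reference results are
# combined when both detection modes run. A minimal sketch of the idea
# using sets (the real bookkeeping lives in identify_chimeras_usearch61,
# so this helper name and signature are illustrative assumptions):
# "union" retains a sequence if either mode calls it non-chimeric, while
# "intersection" retains it only if both modes agree it is non-chimeric.
def combine_non_chimeras_sketch(denovo_non_chimeras, ref_non_chimeras,
                                retention="union"):
    denovo_non_chimeras = set(denovo_non_chimeras)
    ref_non_chimeras = set(ref_non_chimeras)
    if retention == "union":
        # a sequence flagged as chimeric by only one mode is still retained
        return denovo_non_chimeras | ref_non_chimeras
    else:
        # "intersection": must pass both de novo and reference detection
        return denovo_non_chimeras & ref_non_chimeras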