def get_extract_otu_tax_rank_command(config, fixed_rank, taxa_otu_rank, otu_taxa_labels,
                                     abund_table, significance_threshold, rdp_database,
                                     rdp_depth):
    """Build the command that runs the OTU tax-rank extractor script.

    The argument list always carries the input/output paths, the significance
    threshold, the taxa-label output and the abundance table. On top of that:

    - ``--deeper_taxa proteobacteria`` is added for the '16S' database at
      'phylum' depth.
    - ``--depth`` is 1 for 'phylum' and 2 for 'class' with the '16S' database,
      and one level more with the '18S' database. For any other database no
      ``--depth`` argument is passed.

    Args:
        config: Nested mapping; ``config['scripts']['taxrank_extractor']`` is
            used as the executable.
        fixed_rank: Value passed to ``--input``.
        taxa_otu_rank: Value passed to ``--output``.
        otu_taxa_labels: Value passed to ``--out_taxlabel``.
        abund_table: Value passed to ``--abundance_table``.
        significance_threshold: Value passed to ``--significance_threshold``.
        rdp_database: '16S' or '18S'; selects the depth offset.
        rdp_depth: 'phylum' or 'class'.

    Returns:
        A ``program_module.ProgramCommand`` labelled 'OTU taxextr' / 'te'.

    Raises:
        Exception: If ``rdp_depth`` is neither 'phylum' nor 'class'.
    """
    arguments = [
        config['scripts']['taxrank_extractor'],
        '--input', fixed_rank,
        '--output', taxa_otu_rank,
        '--significance_threshold', significance_threshold,
        '--out_taxlabel', otu_taxa_labels,
        '--abundance_table', abund_table,
    ]

    # Reject unsupported depths before deriving any depth-dependent argument.
    if rdp_depth not in ('phylum', 'class'):
        raise Exception('Unknown RDP depth encountered: {}'.format(rdp_depth))

    at_phylum_level = rdp_depth == 'phylum'
    base_depth = 1 if at_phylum_level else 2
    uses_16s = rdp_database == '16S'

    if uses_16s and at_phylum_level:
        arguments.extend(['--deeper_taxa', 'proteobacteria'])

    if uses_16s:
        arguments.extend(['--depth', base_depth])
    elif rdp_database == '18S':
        # The 18S database is queried one rank deeper than 16S.
        arguments.extend(['--depth', base_depth + 1])

    return program_module.ProgramCommand('OTU taxextr', 'te', arguments)
def get_prinseq_command(config_file, input_file, good_output, bad_output):
    """Build the Prinseq read-cleaning command.

    The generated call asks Prinseq to:

    - quality-trim both read ends (``-trim_qual_left`` / ``-trim_qual_right``,
      both set from ``TRIM_QUAL``),
    - enforce a minimum remaining length (``-min_len`` from ``MIN_LEN``),
    - enforce a minimum mean quality (``-min_qual_mean`` from ``MIN_QUAL``),
    - limit the share of Ns (``-ns_max_p`` from ``MAX_NS``).

    Args:
        config_file: Nested mapping; ``config_file['programs']['prinseq']`` is
            used as the executable.
        input_file: Value passed to ``-fastq``.
        good_output: Value passed to ``-out_good``.
        bad_output: Value passed to ``-out_bad``.

    Returns:
        A ``program_module.ProgramCommand`` labelled 'Prinseq' / 'pq'.
    """
    arguments = [
        config_file['programs']['prinseq'],
        # Input and the two output targets.
        '-fastq', input_file,
        '-out_good', good_output,
        '-out_bad', bad_output,
        # Filtering thresholds, stringified from the module-level constants.
        '-trim_qual_left', str(TRIM_QUAL),
        '-trim_qual_right', str(TRIM_QUAL),
        '-min_len', str(MIN_LEN),
        '-min_qual_mean', str(MIN_QUAL),
        '-ns_max_p', str(MAX_NS),
    ]
    return program_module.ProgramCommand('Prinseq', 'pq', arguments)
def get_timeplot_command(config, log_table_fp, timeplot_fp):
    """Build the command that runs the time-plot script.

    Args:
        config: Nested mapping; ``config['scripts']['time_script']`` is used
            as the executable.
        log_table_fp: Value passed to ``-i``.
        timeplot_fp: Value passed to ``-o``.

    Returns:
        A ``program_module.ProgramCommand`` labelled 'Timeplot' / 'tp'.
    """
    script = config['scripts']['time_script']
    arguments = [script, '-i', log_table_fp, '-o', timeplot_fp]
    return program_module.ProgramCommand('Timeplot', 'tp', arguments)
def get_generate_otu_names_command(config, raw_otus, otu_name_map):
    """Build the command that maps old OTU names to newly generated ones.

    Args:
        config: Nested mapping; ``config['scripts']['generate_otu_names']`` is
            used as the executable.
        raw_otus: Value passed to ``--input``.
        otu_name_map: Value passed to ``--output``.

    Returns:
        A ``program_module.ProgramCommand`` labelled 'Generate otu names' /
        'GOn'.
    """
    script = config['scripts']['generate_otu_names']
    arguments = [script, '--input', raw_otus, '--output', otu_name_map]
    return program_module.ProgramCommand('Generate otu names', 'GOn', arguments)
def get_convert_to_phylip_command(config, pynast_alignment_fasta, pynast_alignment_phylip):
    """Build the command that converts a fasta alignment to phylip format.

    Args:
        config: Nested mapping; ``config['scripts']['fasta_to_phylip']`` is
            used as the executable.
        pynast_alignment_fasta: Value passed to ``--input_fasta``.
        pynast_alignment_phylip: Value passed to ``--output_phylip``.

    Returns:
        A ``program_module.ProgramCommand`` labelled 'Convert fas/phy' / 'Cfp'.
    """
    converter = config['scripts']['fasta_to_phylip']
    arguments = [converter]
    arguments += ['--input_fasta', pynast_alignment_fasta]
    arguments += ['--output_phylip', pynast_alignment_phylip]
    return program_module.ProgramCommand('Convert fas/phy', 'Cfp', arguments)
def get_fast_tree_command(config, input_alignment_fp, output_tree_fp):
    """Build the command that produces a tree file from an alignment via FastTree.

    The executable is the wrapper script from ``config['scripts']['fasttree']``;
    the FastTree program itself is handed to it through ``--fasttree_path``.

    Args:
        config: Nested mapping providing ``config['scripts']['fasttree']`` and
            ``config['programs']['fasttree']``.
        input_alignment_fp: Value passed to ``--input``.
        output_tree_fp: Value passed to ``--output``.

    Returns:
        A ``program_module.ProgramCommand`` labelled 'FastTree' / 'FT'.
    """
    wrapper_script = config['scripts']['fasttree']
    fasttree_program = config['programs']['fasttree']
    arguments = [
        wrapper_script,
        '--input', input_alignment_fp,
        '--output', output_tree_fp,
        '--fasttree_path', fasttree_program,
    ]
    return program_module.ProgramCommand('FastTree', 'FT', arguments)
def get_create_color_tables_command(config, fixed_rdp_output_fp, otu_color_taxa_table,
                                    taxa_color_table):
    """Build the command that produces the color-strap and color-definition tables.

    Args:
        config: Nested mapping; ``config['scripts']['colors_from_phyla']`` is
            used as the executable.
        fixed_rdp_output_fp: Value passed to ``--input``.
        otu_color_taxa_table: Value passed to ``--otu_color_taxa``.
        taxa_color_table: Value passed to ``--taxa_color``.

    Returns:
        A ``program_module.ProgramCommand`` labelled 'Create color-tab' / 'cct'.
    """
    script = config['scripts']['colors_from_phyla']
    arguments = [
        script,
        '--input', fixed_rdp_output_fp,
        '--otu_color_taxa', otu_color_taxa_table,
        '--taxa_color', taxa_color_table,
    ]
    return program_module.ProgramCommand('Create color-tab', 'cct', arguments)
def get_reduce_phylip_command(config, phylip_alignment, phylip_alignment_reduced):
    """Build the command that removes empty columns from a phylip alignment.

    Args:
        config: Nested mapping; ``config['scripts']['reduce_phylip']`` is used
            as the executable.
        phylip_alignment: Value passed to ``-i``.
        phylip_alignment_reduced: Value passed to ``-o``.

    Returns:
        A ``program_module.ProgramCommand`` labelled 'Reduce phylip' / 'rp'.
    """
    script = config['scripts']['reduce_phylip']
    arguments = [script, '-i', phylip_alignment, '-o', phylip_alignment_reduced]
    return program_module.ProgramCommand('Reduce phylip', 'rp', arguments)
def get_decompression_command(config_file, compressed_input_fp, decompressed_output_base):
    """Build the command that runs the decompression script in 'gz' mode.

    Args:
        config_file: Nested mapping;
            ``config_file['scripts']['decompression_script']`` is used as the
            executable.
        compressed_input_fp: Value passed to ``--input``.
        decompressed_output_base: Value passed to ``--output_base``.

    Returns:
        A ``program_module.ProgramCommand`` labelled 'decompress' / 'dc'.
    """
    script = config_file['scripts']['decompression_script']
    arguments = [script]
    arguments.extend(['--input', compressed_input_fp])
    arguments.extend(['--output_base', decompressed_output_base])
    # The mode is fixed: this builder only ever requests gz decompression.
    arguments.extend(['--decompression_mode', 'gz'])
    return program_module.ProgramCommand('decompress', 'dc', arguments)
def get_derep_command(config, raw_reads_fp, dereplicated_fp):
    """Build the command that dereplicates full-length sequences.

    The dereplication program is called with two positional arguments: the
    raw reads path followed by the dereplicated output path.

    Args:
        config: Nested mapping; ``config['programs']['dereplicate']`` is used
            as the executable.
        raw_reads_fp: First positional argument (input reads).
        dereplicated_fp: Second positional argument (dereplicated output).

    Returns:
        A ``program_module.ProgramCommand`` labelled 'Dereplicate' / 'dr'.
    """
    program = config['programs']['dereplicate']
    arguments = [program, raw_reads_fp, dereplicated_fp]
    return program_module.ProgramCommand('Dereplicate', 'dr', arguments)
def get_merge_command(config_file, input_fastq_files_fp, merged_output_fp, labels):
    """Build the command that runs the merge script on the input fastq files.

    Args:
        config_file: Nested mapping; ``config_file['scripts']['merge']`` is
            used as the executable.
        input_fastq_files_fp: Value passed to ``--input_files``.
        merged_output_fp: Value passed to ``--output``.
        labels: Value passed to ``--labels``.

    Returns:
        A ``program_module.ProgramCommand`` labelled 'merge' / 'mr'.
    """
    # Imported locally so this function does not depend on the file's
    # top-level import block.
    import logging

    # Diagnostic only: emitted at debug level instead of being printed to
    # stdout on every call.
    logging.getLogger(__name__).debug('input labels %s', labels)

    command = [
        config_file['scripts']['merge'],
        '--input_files', input_fastq_files_fp,
        '--output', merged_output_fp,
        '--labels', labels,
    ]
    return program_module.ProgramCommand('merge', 'mr', command)
def get_fastq_to_fasta_command(config, fastq_fp, fasta_fp):
    """Build the command that converts a fastq file to fasta format.

    The conversion script is always called with the ``--extract_label`` flag.

    Args:
        config: Nested mapping; ``config['scripts']['fasta_to_fastq']`` is
            used as the executable.
        fastq_fp: Value passed to ``--input``.
        fasta_fp: Value passed to ``--output``.

    Returns:
        A ``program_module.ProgramCommand`` labelled 'Fq to Fa' / 'FQA'.
    """
    # NOTE(review): the config key reads 'fasta_to_fastq' although this
    # builder converts fastq -> fasta; confirm the key against the config file.
    converter = config['scripts']['fasta_to_fastq']
    arguments = [converter, '--input', fastq_fp, '--output', fasta_fp]
    arguments.append('--extract_label')
    return program_module.ProgramCommand('Fq to Fa', 'FQA', arguments)
def get_create_barplot_data_command(config, taxa_otu_rank, otu_sample_table, tax_count_table,
                                    tax_abund_table):
    """Build the command that derives taxa barplot data from the OTU/taxa tables.

    Args:
        config: Nested mapping; ``config['scripts']['create_barplot_table']``
            is used as the executable.
        taxa_otu_rank: Value passed to ``--taxa_table``.
        otu_sample_table: Value passed to ``--otu_sample_table``.
        tax_count_table: Value passed to ``--barplot_cluster``.
        tax_abund_table: Value passed to ``--barplot_abund``.

    Returns:
        A ``program_module.ProgramCommand`` labelled 'Create bar-data' / 'cbd'.
    """
    script = config['scripts']['create_barplot_table']
    input_arguments = ['--taxa_table', taxa_otu_rank,
                       '--otu_sample_table', otu_sample_table]
    output_arguments = ['--barplot_cluster', tax_count_table,
                        '--barplot_abund', tax_abund_table]
    arguments = [script] + input_arguments + output_arguments
    return program_module.ProgramCommand('Create bar-data', 'cbd', arguments)
def get_generate_alpha_plots_command(config, plot_rar, plot_chao, otu_mapping_table):
    """Build the command that runs the chao1/rarefaction plotting script.

    Sampling is configured from the module-level ``SAMPLE_STEPS`` and
    ``SAMPLE_REPLICATES`` constants.

    Args:
        config: Nested mapping; ``config['scripts']['alpha_plots']`` is used
            as the executable.
        plot_rar: Value passed to ``--plot_rarefaction``.
        plot_chao: Value passed to ``--plot_chao``.
        otu_mapping_table: Value passed to ``--otu_mapping_table``.

    Returns:
        A ``program_module.ProgramCommand`` labelled 'Chao1' / 'PC'.
    """
    script = config['scripts']['alpha_plots']
    plot_arguments = ['--plot_rarefaction', plot_rar, '--plot_chao', plot_chao]
    sampling_arguments = ['--samplepoints', SAMPLE_STEPS,
                          '--replicates', SAMPLE_REPLICATES]
    table_arguments = ['--otu_mapping_table', otu_mapping_table]
    arguments = [script] + plot_arguments + sampling_arguments + table_arguments
    return program_module.ProgramCommand('Chao1', 'PC', arguments)
def get_filter_otu_command(config, complete_otu_fp, otu_abundancy_fp, output_fp, abund_filt_fp,
                           filter_threshold):
    """Build the command that filters OTUs by mapped read count.

    Args:
        config: Nested mapping; ``config['scripts']['filter_otus']`` is used
            as the executable.
        complete_otu_fp: Value passed to ``-i``.
        otu_abundancy_fp: Value passed to ``-m``.
        output_fp: Value passed to ``-o``.
        abund_filt_fp: Value passed to ``-O``.
        filter_threshold: Value passed to ``-t``.

    Returns:
        A ``program_module.ProgramCommand`` labelled 'Abund. filtering' / 'af'.
    """
    script = config['scripts']['filter_otus']
    # Flag/value pairs in the order the script is invoked with.
    flag_values = (
        ('-i', complete_otu_fp),
        ('-m', otu_abundancy_fp),
        ('-o', output_fp),
        ('-t', filter_threshold),
        ('-O', abund_filt_fp),
    )
    arguments = [script]
    for flag, value in flag_values:
        arguments.extend((flag, value))
    return program_module.ProgramCommand('Abund. filtering', 'af', arguments)
def get_create_otu_table_command(config, cluster_mapping, derep_mapping, name_mapping_table,
                                 otu_table):
    """Build the command that creates the per-sample OTU count table.

    Args:
        config: Nested mapping; ``config['scripts']['create_otu_table']`` is
            used as the executable.
        cluster_mapping: Value passed to ``--cluster_mapping``.
        derep_mapping: Value passed to ``--derep_mapping``.
        name_mapping_table: Value passed to ``--name_mapping``.
        otu_table: Value passed to ``--output``.

    Returns:
        A ``program_module.ProgramCommand`` labelled 'OTU-table' / 'OT'.
    """
    script = config['scripts']['create_otu_table']
    mapping_arguments = [
        '--cluster_mapping', cluster_mapping,
        '--derep_mapping', derep_mapping,
        '--name_mapping', name_mapping_table,
    ]
    arguments = [script] + mapping_arguments + ['--output', otu_table]
    return program_module.ProgramCommand('OTU-table', 'OT', arguments)
def get_ete_command(config, input_tree_fp, output_tree_pic_fp, otu_abund_fp, color_strap_fp,
                    labels_fp):
    """Build the command that renders a tree picture with the ETE script.

    The ETE script is launched through ``xvfb-run``
    (presumably so rendering works without a display -- confirm).

    Args:
        config: Nested mapping; ``config['scripts']['ete']`` is the script
            started via ``xvfb-run``.
        input_tree_fp: Value passed to ``--input``.
        output_tree_pic_fp: Value passed to ``--output``.
        otu_abund_fp: Value passed to ``--abundancies``.
        color_strap_fp: Value passed to ``--color_taxa``.
        labels_fp: Value passed to ``--labels``.

    Returns:
        A ``program_module.ProgramCommand`` labelled 'ETE' / 'ete'.
    """
    launcher = ['xvfb-run', config['scripts']['ete']]
    script_arguments = [
        '--input', input_tree_fp,
        '--output', output_tree_pic_fp,
        '--labels', labels_fp,
        '--abundancies', otu_abund_fp,
        '--color_taxa', color_strap_fp,
    ]
    return program_module.ProgramCommand('ETE', 'ete', launcher + script_arguments)
def get_chimera_checking_command(config, unchecked_fp, non_chimeric_fp):
    """Build the vsearch reference-based chimera checking command.

    Non-chimeric reads are written to ``non_chimeric_fp``; the ``-uchimeout``
    report goes to the same path with '.OUTPUT' appended.

    Args:
        config: Nested mapping providing ``config['programs']['vsearch']`` and
            ``config['databases']['vsearch_16S_ref']``.
        unchecked_fp: Value passed to ``-uchime_ref``.
        non_chimeric_fp: Value passed to ``-nonchimeras``.

    Returns:
        A ``program_module.ProgramCommand`` labelled 'Chim. checking' / 'cc'.
    """
    vsearch = config['programs']['vsearch']
    # NOTE(review): the reference is always the 16S database here; confirm
    # this is intended for non-16S runs.
    reference_db = config['databases']['vsearch_16S_ref']
    report_fp = str(non_chimeric_fp + '.OUTPUT')
    arguments = [
        vsearch,
        '-uchime_ref', unchecked_fp,
        '-db', reference_db,
        '-uchimeout', report_fp,
        '-nonchimeras', non_chimeric_fp,
    ]
    return program_module.ProgramCommand('Chim. checking', 'cc', arguments)
def get_cdhit_parser_command(config, input_mapping_matrix_fp, output_mapping_table_fp,
                             cluster_mapping_fp):
    """Build the command that turns CD-HIT mapping output into an abundancy matrix.

    The parser script is always called with ``--count_dereplicated``.

    Args:
        config: Nested mapping; ``config['scripts']['cdhit_output_parser']``
            is used as the executable.
        input_mapping_matrix_fp: Value passed to ``-i``.
        output_mapping_table_fp: Value passed to ``-o``.
        cluster_mapping_fp: Value passed to ``--seq_matrix``.

    Returns:
        A ``program_module.ProgramCommand`` labelled 'Extract otu table' /
        'eOt'.
    """
    parser_script = config['scripts']['cdhit_output_parser']
    arguments = [parser_script]
    arguments += ['-i', input_mapping_matrix_fp]
    arguments += ['-o', output_mapping_table_fp]
    arguments += ['--count_dereplicated']
    arguments += ['--seq_matrix', cluster_mapping_fp]
    return program_module.ProgramCommand('Extract otu table', 'eOt', arguments)
def get_rename_otus_command(config, input_otus, input_table, renamed_fasta, renamed_table,
                            name_mapping):
    """Build the command that renames OTUs in a fasta file and abundancy matrix.

    Args:
        config: Nested mapping; ``config['scripts']['rename_otus']`` is used
            as the executable.
        input_otus: Value passed to ``--fasta``.
        input_table: Value passed to ``--table``.
        renamed_fasta: Value passed to ``--output_fasta``.
        renamed_table: Value passed to ``--output_table``.
        name_mapping: Value passed to ``--name_mapping``.

    Returns:
        A ``program_module.ProgramCommand`` labelled 'Rename otus' / 'rO'.
    """
    script = config['scripts']['rename_otus']
    input_arguments = ['--fasta', input_otus, '--table', input_table]
    output_arguments = ['--output_fasta', renamed_fasta, '--output_table', renamed_table]
    mapping_arguments = ['--name_mapping', name_mapping]
    arguments = [script] + input_arguments + output_arguments + mapping_arguments
    return program_module.ProgramCommand('Rename otus', 'rO', arguments)
def get_label_fasta_header_command(config, raw_reads_fp, labelled_fp):
    """Build the command that labels fasta headers with ';size=1;'.

    Intended for runs where the dereplication command is not used, so the
    headers still carry a size annotation.

    Args:
        config: Nested mapping; ``config['scripts']['label_fasta_headers']``
            is used as the executable.
        raw_reads_fp: Value passed to ``-i``.
        labelled_fp: Value passed to ``-o``.

    Returns:
        A ``program_module.ProgramCommand`` labelled 'Label reads' / 'lr'.
    """
    script = config['scripts']['label_fasta_headers']
    size_label = ';size=1;'
    arguments = [script, '-i', raw_reads_fp, '-o', labelled_fp, '-l', size_label]
    return program_module.ProgramCommand('Label reads', 'lr', arguments)
def get_script_dereplicator_command(config, raw_reads_fp, dereplicated_fp, mapping_fp):
    """Build the command that runs the in-house dereplication script.

    Args:
        config: Nested mapping; ``config['scripts']['script_dereplicator']``
            is used as the executable.
        raw_reads_fp: Value passed to ``--input``.
        dereplicated_fp: Value passed to ``--output``.
        mapping_fp: Value passed to ``--mapping_file``.

    Returns:
        A ``program_module.ProgramCommand`` labelled 'Dereplicate script' /
        'dr'.
    """
    script = config['scripts']['script_dereplicator']
    arguments = [script]
    arguments.extend(['--input', raw_reads_fp])
    arguments.extend(['--output', dereplicated_fp])
    arguments.extend(['--mapping_file', mapping_fp])
    return program_module.ProgramCommand('Dereplicate script', 'dr', arguments)
def get_annotate_otus_command(config, raw_otus, raw_abundance, annotated_otus,
                              annotated_abundance, taxa_otu_rank, fixed_rank_annotation):
    """Build the command that annotates an OTU fasta file and abundancy matrix.

    Args:
        config: Nested mapping; ``config['scripts']['annotate_otu']`` is used
            as the executable.
        raw_otus: Value passed to ``--input_fasta``.
        raw_abundance: Value passed to ``--input_abundancy``.
        annotated_otus: Value passed to ``--annotated_fasta``.
        annotated_abundance: Value passed to ``--annotated_abundancy``.
        taxa_otu_rank: Value passed to ``--input_taxa``.
        fixed_rank_annotation: Value passed to ``--fixed_rank_annotation``.

    Returns:
        A ``program_module.ProgramCommand`` labelled 'Annotate otu' / 'ao'.
    """
    script = config['scripts']['annotate_otu']
    input_arguments = [
        '--input_fasta', raw_otus,
        '--input_abundancy', raw_abundance,
        '--input_taxa', taxa_otu_rank,
    ]
    output_arguments = [
        '--annotated_fasta', annotated_otus,
        '--annotated_abundancy', annotated_abundance,
        '--fixed_rank_annotation', fixed_rank_annotation,
    ]
    arguments = [script] + input_arguments + output_arguments
    return program_module.ProgramCommand('Annotate otu', 'ao', arguments)
def get_filter_bad_taxa_command(config, raw_otus, abundancy_matrix, otu_taxa_table, output_dir):
    """Build the command that filters out OTUs with poorly supported taxa.

    The filter is applied to both the OTU fasta file and the abundancy matrix;
    the output suffix comes from the module-level ``TAX_FILTER_SUFFIX``.

    Args:
        config: Nested mapping; ``config['scripts']['filter_poor_taxa']`` is
            used as the executable.
        raw_otus: Value passed to ``--input``.
        abundancy_matrix: Value passed to ``--abund_matrix``.
        otu_taxa_table: Value passed to ``--taxa_table``.
        output_dir: Value passed to ``--output_dir``.

    Returns:
        A ``program_module.ProgramCommand`` labelled 'Filter taxa' / 'ft'.
    """
    script = config['scripts']['filter_poor_taxa']
    arguments = [script]
    arguments += ['--input', raw_otus]
    arguments += ['--taxa_table', otu_taxa_table]
    arguments += ['--suffix', TAX_FILTER_SUFFIX]
    arguments += ['--abund_matrix', abundancy_matrix]
    arguments += ['--output_dir', output_dir]
    return program_module.ProgramCommand('Filter taxa', 'ft', arguments)
def get_raxml_command(config, input_alignment_fp, output_dir):
    """Build the RAxML command for an input alignment.

    The run uses a fixed seed (12345), the 'GTRGAMMA' substitution model and
    the fixed run name 'raxml_tree.tre'.

    Args:
        config: Nested mapping; ``config['programs']['raxml']`` is used as the
            executable.
        input_alignment_fp: Value passed to ``-s``.
        output_dir: Value passed to ``-w``.

    Returns:
        A ``program_module.ProgramCommand`` labelled 'RAxML' / 'Rx'.
    """
    raxml = config['programs']['raxml']
    # Fixed run settings; the seed is kept as an int, as before.
    fixed_settings = ['-p', 12345, '-m', 'GTRGAMMA']
    io_settings = ['-s', input_alignment_fp, '-n', 'raxml_tree.tre', '-w', output_dir]
    arguments = [raxml] + fixed_settings + io_settings
    return program_module.ProgramCommand('RAxML', 'Rx', arguments)
def get_taxa_barplot_command(config, tax_count_table_fp, matlibplot_out_fp, taxa_color_table,
                             plot_relative_abundance=False, title='MODTITL',
                             ylabel='MODYLABL'):
    """Build the command that runs the taxa barplot script.

    Args:
        config: Nested mapping; ``config['scripts']['make_barplot']`` is used
            as the executable.
        tax_count_table_fp: Value passed to ``--input``.
        matlibplot_out_fp: Value passed to ``--output``.
        taxa_color_table: Value passed to ``--color_table``.
        plot_relative_abundance: When truthy, ``--relative_abundance`` is
            appended to the arguments.
        title: Value passed to ``--title``.
        ylabel: Value passed to ``--ylabel``.

    Returns:
        A ``program_module.ProgramCommand`` labelled 'Taxa-barplot' / 'tb'.
    """
    script = config['scripts']['make_barplot']
    arguments = [script, '--input', tax_count_table_fp, '--output', matlibplot_out_fp]
    arguments += ['--title', title, '--ylabel', ylabel]
    arguments += ['--color_table', taxa_color_table]
    # Optional switch, always placed last.
    arguments += ['--relative_abundance'] if plot_relative_abundance else []
    return program_module.ProgramCommand('Taxa-barplot', 'tb', arguments)
def get_rdp_command(config, input_fp, fixed_rank_fp, significant_taxa_fp, significance_threshold,
                    chosen_database):
    """Build the java command that runs the RDP classifier in 'classify' mode.

    The JVM heap limit is taken from the module-level ``MEMORY_SIZE_GB``.
    For the '18S' database the training set from
    ``config['databases']['rdpclassifier_18S']`` is passed via ``-t``; for any
    other database no ``-t`` argument is added. The input file is always the
    last argument.

    Args:
        config: Nested mapping providing
            ``config['programs']['rdpclassifier']`` (the jar) and, for 18S,
            ``config['databases']['rdpclassifier_18S']``.
        input_fp: Input path, appended as the final positional argument.
        fixed_rank_fp: Value passed to ``-o``.
        significant_taxa_fp: Value passed to ``-h``.
        significance_threshold: Value passed to ``-c``.
        chosen_database: '18S' selects the custom training set.

    Returns:
        A ``program_module.ProgramCommand`` labelled 'RDP Classifier' / 'RDP'.
    """
    training_arguments = []
    if chosen_database == '18S':
        training_arguments = ['-t', config['databases']['rdpclassifier_18S']]

    heap_flag = '-Xmx{}g'.format(MEMORY_SIZE_GB)
    arguments = [
        'java', heap_flag,
        '-jar', config['programs']['rdpclassifier'],
        'classify',
        '-c', significance_threshold,
        '-f', 'fixrank',
        '-o', fixed_rank_fp,
        '-h', significant_taxa_fp,
    ]
    arguments.extend(training_arguments)
    arguments.append(input_fp)
    return program_module.ProgramCommand('RDP Classifier', 'RDP', arguments)
def get_run_cdhit_command(config, input_reads_fp, output_otus_fp, clustering_identity):
    """Build the CD-HIT command that clusters sequences into OTUs.

    Word size, thread count and memory limit come from the module-level
    ``CDHIT_WORD_SIZE``, ``CDHIT_THREADS`` and ``CDHIT_MEMORY`` constants.
    ``-d`` is fixed to 0, and ``-g 1`` is appended when
    ``ACCURATE_CLUSTERING`` is truthy. The final argument list is printed to
    stdout before the command object is returned.

    Args:
        config: Nested mapping; ``config['programs']['cdhit']`` is used as the
            executable.
        input_reads_fp: Value passed to ``-i``.
        output_otus_fp: Value passed to ``-o``.
        clustering_identity: Value passed to ``-c``.

    Returns:
        A ``program_module.ProgramCommand`` labelled 'CD-HIT' / 'CH'.
    """
    description_length = 0
    arguments = [config['programs']['cdhit']]
    arguments += ['-i', input_reads_fp, '-o', output_otus_fp]
    arguments += ['-c', clustering_identity, '-n', CDHIT_WORD_SIZE]
    arguments += ['-T', CDHIT_THREADS, '-M', CDHIT_MEMORY]
    arguments += ['-d', description_length]

    if ACCURATE_CLUSTERING:
        arguments.extend(['-g', 1])

    print('CD-HIT command: {}'.format(arguments))
    return program_module.ProgramCommand('CD-HIT', 'CH', arguments)
def get_pynast_command(config, filtered_otus, pynast_alignment_fasta, pynast_log, pynast_failed,
                       min_alignment_similarity_percentage, rdp_database):
    """Build the PyNAST alignment command.

    The template alignment passed to ``-t`` depends on ``rdp_database``:

    - '16S' uses ``config['databases']['pynast_16S']``.
    - '18S' uses ``config['databases']['pynast_18S']`` when that key is
      configured, and otherwise falls back to the 16S template. Previously
      the 18S branch always used the 16S template, so configurations without
      an 18S entry behave exactly as before.

    The minimum input length (``-l``) comes from the module-level
    ``MIN_INPUT_TO_ALIGNMENT_LENGTH``.

    Args:
        config: Nested mapping providing ``config['programs']['pynast']`` and
            the ``config['databases']`` template entries.
        filtered_otus: Value passed to ``-i``.
        pynast_alignment_fasta: Value passed to ``-a``.
        pynast_log: Value passed to ``-g``.
        pynast_failed: Value passed to ``-f``.
        min_alignment_similarity_percentage: Value passed to ``-p``.
        rdp_database: '16S' or '18S'.

    Returns:
        A ``program_module.ProgramCommand`` labelled 'PyNAST' / 'PN'.

    Raises:
        AttributeError: If ``rdp_database`` is neither '16S' nor '18S'.
    """
    description = 'PyNAST'
    short = 'PN'

    if rdp_database == '16S':
        pynast_database = config['databases']['pynast_16S']
    elif rdp_database == '18S':
        databases = config['databases']
        # Prefer a dedicated 18S template; keep the old 16S behaviour when
        # the configuration does not define one.
        if 'pynast_18S' in databases:
            pynast_database = databases['pynast_18S']
        else:
            pynast_database = databases['pynast_16S']
    else:
        raise AttributeError(
            'Chosen RDP database: {} doesn\'t exist!'.format(rdp_database))

    command = [
        config['programs']['pynast'],
        '-i', filtered_otus,
        '-t', pynast_database,
        '-l', MIN_INPUT_TO_ALIGNMENT_LENGTH,
        '-p', min_alignment_similarity_percentage,
        '-a', pynast_alignment_fasta,
        '-g', pynast_log,
        '-f', pynast_failed,
    ]
    return program_module.ProgramCommand(description, short, command)