def test_write(self):
    output_file = tempfile.mktemp()
    output_lines = [["this", "is", "a", "header"], ["Content", 1, 2, 3]]
    expected_list = ["this\tis\ta\theader\n", "Content\t1\t2\t3\n"]

    Writer.write(output_lines, output_file)

    with open(output_file) as output_file_io:
        for idx, line in enumerate(output_file_io):
            self.assertEqual(line, expected_list[idx])

    self.assertEqual(idx + 1, len(expected_list))
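# A minimal sketch of the tab-delimited contract the test above assumes:
# Writer.write joins each row on tabs (stringifying non-string cells) and
# terminates it with a newline. The helper name write_tsv is hypothetical,
# not part of the package.
def write_tsv(rows, output_path):
    with open(output_path, 'w') as out_io:
        for row in rows:
            out_io.write('\t'.join(str(cell) for cell in row) + '\n')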
def generate_gff_files(self, genomes_list):
    '''
    Write a GFF file for each of the Genome objects in genomes_list.

    Parameters
    ----------
    genomes_list - List. List of Genome objects.
    '''
    output_directory_path = path.join(self.output_directory, self.GENOME_GFF)
    mkdir(output_directory_path)

    for genome in genomes_list:
        logging.info(' - Generating .gff file for %s', genome.name)
        gff_output = path.join(output_directory_path,
                               genome.name + self.GFF_SUFFIX)
        Writer.write_gff(genome, gff_output)
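# A rough sketch of the nine-column GFF3 layout that Writer.write_gff
# presumably emits, one row per feature. The column set is standard GFF3;
# the example_feature values below are illustrative assumptions, not the
# package's output.
GFF3_COLUMNS = ['seqid', 'source', 'type', 'start', 'end',
                'score', 'strand', 'phase', 'attributes']

example_feature = {'seqid': 'contig_1', 'source': 'prodigal', 'type': 'CDS',
                   'start': 1, 'end': 300, 'score': '.', 'strand': '+',
                   'phase': '0', 'attributes': 'ID=gene_1'}

gff_line = '\t'.join(str(example_feature[column]) for column in GFF3_COLUMNS)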
def uses_pipeline(self, compounds_list_path, annotation_matrix_path,
                  metadata_path, output, count):
    '''
    Tally genes that use the specified compounds and calculate the
    enrichment of those compounds between metadata groups.

    Parameters
    ----------
    compounds_list_path - String. Path to single-column file listing compounds.
    annotation_matrix_path - String. Path to genome-annotation matrix.
    metadata_path - String. Path to metadata matrix.
    output - String. Path to directory to write results to.
    count - Passed through to the uses() tally.
    '''
    logging.info('Parsing input compounds list')
    compound_list = Parser.parse_single_column_text_file(compounds_list_path)

    logging.info('Parsing input annotations')
    annotations_dict, column_names, annotations = Parser.parse_simple_matrix(annotation_matrix_path)

    logging.info('Parsing input metadata')
    metadata, metadata_value_lists, attribute_dict = Parser.parse_metadata_matrix(metadata_path)

    logging.info('Tallying genes that use specified compounds')
    output_lines_abundance, enrichment_tallys = self.uses(compound_list,
                                                          annotations_dict,
                                                          column_names, count)

    # NOTE: abundance_output and enrichment_output are assumed attribute
    # names for the two output files; the file name attribute must be
    # distinct from the enrichment() method called below.
    logging.info('Writing file: %s', self.abundance_output)
    Writer.write(output_lines_abundance, os.path.join(output, self.abundance_output))

    logging.info('Calculating enrichment between groups for each compound')
    output_lines_enrichment = self.enrichment(enrichment_tallys, attribute_dict)

    logging.info('Writing file: %s', self.enrichment_output)
    Writer.write(output_lines_enrichment, os.path.join(output, self.enrichment_output))

    logging.info('Finished the use pipeline')
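# A minimal sketch of the kind of tally uses() might produce, assuming a
# lookup from each compound to the annotations (e.g. KO ids) that use it.
# Both compound_to_annotations and the function name are hypothetical.
def tally_compound_usage(compound_list, annotations_dict, compound_to_annotations):
    tally = {}
    for genome, genome_annotations in annotations_dict.items():
        tally[genome] = {}
        for compound in compound_list:
            users = compound_to_annotations.get(compound, set())
            # Sum the counts of every annotation in this genome that uses
            # the compound.
            tally[genome][compound] = sum(
                genome_annotations.get(annotation, 0) for annotation in users)
    return tally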
def predict_pipeline(self, forester_model_directory, input_matrix_path,
                     output_directory):
    '''
    Inputs
    ------
    forester_model_directory - String. Path to directory containing a
        previously generated model (see generate_pipeline).
    input_matrix_path - String. Path to matrix of samples to predict on.
    output_directory - String. Path to directory to write predictions to.

    Outputs
    -------
    A predictions file within output_directory.
    '''
    forester_model = ParseGenerate(forester_model_directory)
    logging.info('Loading model: %s', forester_model.rf_model)

    logging.info('Parsing data')
    features, _, _ = Parser.parse_simple_matrix(input_matrix_path)

    sample_list = list()
    content_list = list()

    for sample, content in features.items():
        sample_list.append(sample)
        sample_content = list()
        # Align each sample's features to the attribute order the model was
        # trained on, filling missing attributes with '0'.
        for attribute in forester_model.attributes:
            if attribute in content:
                sample_content.append(content[attribute])
            else:
                sample_content.append('0')
        content_list.append(sample_content)

    logging.info('Making predictions')
    output_lines = self.make_predictions(forester_model.model, sample_list,
                                         content_list, forester_model.labels)
    Writer.write(output_lines,
                 os.path.join(output_directory, self.predictions_output_file))
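# A minimal sketch of the feature-alignment step above: every sample vector
# must follow the attribute order the model was trained on, with absent
# attributes zero-filled. The helper name align_features is hypothetical.
def align_features(content, trained_attributes, fill='0'):
    return [content.get(attribute, fill) for attribute in trained_attributes]

# e.g. align_features({'K00001': 2}, ['K00001', 'K00002']) -> [2, '0']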
def generate_pipeline(self, input_matrix_path, groups_path, model_type,
                      testing_portion, grid_search, threads, output_directory):
    '''
    Inputs
    ------
    input_matrix_path - String. Path to feature matrix.
    groups_path - String. Path to metadata matrix assigning samples to groups.
    model_type - String. Either self.regressor or self.classifier.
    testing_portion - Float. Fraction of the input data held out for testing.
    grid_search - Bool. Whether to tune hyperparameters by grid search.
    threads - Int. Number of threads to use.
    output_directory - String. Path to directory to write results to.

    Outputs
    -------
    A pickled model, pickled group labels, a model accuracy summary and
    attribute importances, all within output_directory.
    '''
    logging.info('Using %f%% of the input data for testing',
                 testing_portion * 100)

    if model_type == self.regressor:
        model = RandomForestRegressor()
    elif model_type == self.classifier:
        model = RandomForestClassifier()
    else:
        raise Exception("Model type not recognised: %s" % model_type)

    logging.info('Parsing inputs:')
    labels, _, _ = Parser.parse_metadata_matrix(groups_path)
    features, _, attribute_list = Parser.parse_simple_matrix(input_matrix_path)
    labels_list, features_list = self.transpose(labels, features, attribute_list)
    labels_dict, labels_list_numeric = self.numerify(labels_list)

    logging.info("Tuning hyperparameters")
    random_forest_model, test_features, test_labels, best_params_list = self.tune(
        features_list, labels_list_numeric, testing_portion, grid_search,
        threads, model)

    logging.info('Making predictions on test data:')
    predictions = random_forest_model.predict(test_features)
    errors = abs(predictions - test_labels)
    logging.info('\t\tMean Absolute Error: %f', round(np.mean(errors), 2))
    accuracy = self.estimate_correctness(predictions, test_labels)
    logging.info('\t\tAccuracy: %f%%', accuracy)
    best_params_list.append(["Accuracy", str(accuracy)])

    logging.info("Generating model accuracy summary file")
    Writer.write(best_params_list,
                 os.path.join(output_directory, self.model_accuracy))

    logging.info("Generating attribute importances")
    output_attribute_importances = self.get_importances(random_forest_model,
                                                        attribute_list)
    Writer.write(output_attribute_importances,
                 os.path.join(output_directory, self.attribute_importances))

    logging.info("Preserving model")
    with open(os.path.join(output_directory, self.model_pickle), 'wb') as model_io:
        pickle.dump(random_forest_model, model_io)

    logging.info("Preserving group labels")
    with open(os.path.join(output_directory, self.labels_dict), 'wb') as labels_io:
        pickle.dump(labels_dict, labels_io)
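# A minimal sketch of what numerify() above presumably does: scikit-learn
# estimators need numeric targets, so string group labels are mapped to
# integers, keeping the reverse mapping to decode predictions later. The
# helper name and return layout are assumptions.
def numerify(labels_list):
    label_to_int = {}
    numeric_labels = []
    for label in labels_list:
        if label not in label_to_int:
            label_to_int[label] = len(label_to_int)
        numeric_labels.append(label_to_int[label])
    # Reverse mapping (int -> original label) for decoding model output.
    int_to_label = {number: label for label, number in label_to_int.items()}
    return int_to_label, numeric_labels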
def do(self, custom_modules, cutoff, aggregate, genome_and_annotation_file,
       genome_and_annotation_matrix, output_directory):
    '''
    Parameters
    ----------
    custom_modules - String. Path to file containing custom module
        definitions, consistent with KEGG module nomenclature
        (http://www.genome.jp/kegg/module.html).
    cutoff - Float. Fraction of a module needed in order to be included
        in the output.
    aggregate - Bool. Additionally report the average abundance of each
        module in each genome.
    genome_and_annotation_file - String. Path to two-column file, the
        first column holding the genome name and the second an annotation
        within that genome.
    genome_and_annotation_matrix - String. Path to file containing a
        genome-annotation matrix.
    output_directory - String. Path to directory to output results to.
    '''
    pathway = dict()

    if custom_modules:
        logging.info('Reading in custom modules: %s', custom_modules)
        self.update_with_custom_modules(custom_modules)

    # TODO: remove the duplicated parse of genome_and_annotation_matrix
    # below (it is read again when aggregate is set).
    if genome_and_annotation_file:
        genome_to_annotation_sets = Parser.parse_genome_and_annotation_file_lf(
            genome_and_annotation_file)
    elif genome_and_annotation_matrix:
        genome_to_annotation_sets = Parser.parse_genome_and_annotation_file_matrix(
            genome_and_annotation_matrix)

    if aggregate:
        logging.info('Reading in abundances: %s', genome_and_annotation_matrix)
        abundances, _, _ = Parser.parse_simple_matrix(genome_and_annotation_matrix)
        abundance_result = dict()

    logging.info("Read in annotations for %i genomes",
                 len(genome_to_annotation_sets))

    # NOTE: "KO_path" is an assumed name for the fourth header field; the
    # rows appended to genome_output_lines below carry four fields.
    output_lines = [[
        "Genome_name", "Module_id", "Module_name", "Steps_found",
        "Steps_needed", "Percent_steps_found"
    ]]
    genome_output_lines = [["Genome_name", "Module_id", "Module_name", "KO_path"]]

    for name, pathway_string in self.m2def.items():
        if name not in self.signature_modules:
            path = ModuleDescription(pathway_string)
            pathway[name] = path

            for genome, annotations in genome_to_annotation_sets.items():
                num_covered, _, _, ko_path = path.num_covered_steps(annotations)
                num_all = path.num_steps()
                perc_covered = num_covered / float(num_all)
                ko_path_list = list(chain(*ko_path.values()))

                if perc_covered >= cutoff:
                    if path.is_single_step:
                        if perc_covered != 1:
                            if cutoff < 1:
                                num_all = 1
                                num_covered = 0
                                perc_covered = 0.0
                            else:
                                continue
                        else:
                            num_all = 1
                            num_covered = 1

                    if aggregate:
                        if genome not in abundance_result:
                            abundance_result[genome] = dict()
                        pathway_abundance = [abundances[genome][ko]
                                             for ko in ko_path_list]
                        pathway_average_abundance = (sum(pathway_abundance) /
                                                     len(pathway_abundance))
                        abundance_result[genome][name] = pathway_average_abundance

                    genome_output_lines.append(
                        [genome, name, self.m[name], ','.join(ko_path_list)])

                    output_line = [
                        genome, name, self.m[name], str(num_covered),
                        str(num_all), str(round(perc_covered * 100, 2))
                    ]
                    output_lines.append(output_line)

    Writer.write(output_lines, os.path.join(output_directory, self.KO_OUTPUT))
    Writer.write(genome_output_lines,
                 os.path.join(output_directory, self.MODULE_PATHS))

    if aggregate:
        samples = list(abundance_result.keys())
        output_lines = [["ID"] + samples]

        for module in self.m2def.keys():
            if module not in self.signature_modules:
                output_line = [module]
                for sample in samples:
                    if module in abundance_result[sample]:
                        output_line.append(str(abundance_result[sample][module]))
                    else:
                        output_line.append('0.0')
                output_lines.append(output_line)

        Writer.write(output_lines,
                     os.path.join(output_directory, self.AGGREGATE_OUTPUT))
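# A minimal sketch of the completeness test applied above: a module counts
# as present in a genome when the fraction of its steps covered by the
# genome's annotations reaches the cutoff. ModuleDescription's internals are
# not shown here; steps is assumed to be a list of sets of alternative KOs
# per step, and the helper name is hypothetical.
def module_completeness(steps, annotations, cutoff):
    covered = sum(1 for alternatives in steps if alternatives & annotations)
    fraction = covered / float(len(steps))
    return fraction >= cutoff, fraction

# e.g. module_completeness([{'K00001'}, {'K00002', 'K00003'}],
#                          {'K00001', 'K00003'}, 0.7) -> (True, 1.0)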
def network_pipeline(self, subparser_name, matrix, genome_metadata_path,
                     transcriptome_abundances_path, transcriptome_metadata_path,
                     metagenome_abundances, metagenome_metadata_path,
                     metabolome, enrichment_output, depth, filter, limit,
                     queries, output_directory):
    '''
    Parameters
    ----------
    subparser_name - String. Subcommand to run (self.EXPLORE or self.PATHWAY).
    matrix - String. Path to genome-orthology matrix.
    genome_metadata_path - String. Path to metadata matrix grouping genomes.
    transcriptome_abundances_path - String. Path to detectM TPM values.
    transcriptome_metadata_path - String. Path to metadata matrix grouping
        transcriptome samples.
    metagenome_abundances - String. Path to genome abundance matrix.
    metagenome_metadata_path - String. Path to metadata matrix grouping
        metagenome samples.
    metabolome - String. Path to metabolome abundance matrix.
    enrichment_output - String. Path to enrichment (Fisher) results.
    depth - Int. Traversal depth passed to query_matrix (explore subcommand).
    filter - Passed to pathway_matrix (pathway subcommand).
    limit - Passed to pathway_matrix (pathway subcommand).
    queries - Queries passed to query_matrix (explore subcommand).
    output_directory - String. Path to directory to output results to.
    '''
    orthology_matrix, genome_names, _ = Parser.parse_simple_matrix(matrix)

    if genome_metadata_path:
        genome_to_group, genome_groups, group_to_genome = \
            Parser.parse_metadata_matrix(genome_metadata_path)
    else:
        genome_to_group, genome_groups, group_to_genome = \
            self.mock_metadata(genome_names)

    reaction_matrix = self.aggregate_dictionary(self.reaction_to_ko,
                                                orthology_matrix)

    # Read in Fisher enrichment results
    if enrichment_output:
        logging.info('Parsing input enrichment results')
        fisher_results = Parser.parse_enrichment_output(enrichment_output)
    else:
        logging.info('No enrichment results provided')
        fisher_results = None

    # Read in metabolome abundances
    if metabolome:
        logging.info('Parsing metabolome abundances')
        abundances_metabolome = Parser.parse_simple_matrix(metabolome)
    else:
        logging.info('No metabolome abundances provided')
        abundances_metabolome = None

    # Read in genome abundances from the metagenome
    if metagenome_abundances:
        logging.info('Parsing input genome abundances')
        sample_abundance = Parser.parse_simple_matrix(metagenome_abundances)[0]
        sample_metadata = Parser.parse_metadata_matrix(metagenome_metadata_path)[2]
    else:
        # FIXME: there's always a better way than faking it.
        logging.info('No genome abundances provided')
        sample_abundance = {'MOCK': {x: 1 for x in list(reaction_matrix.keys())}}
        sample_metadata = {"abundance": ['MOCK']}

    median_sample_abundances = self.median_genome_abundance(sample_abundance,
                                                            sample_metadata)
    normalised_abundance_dict = self.normalise_by_abundance(
        median_sample_abundances, reaction_matrix, group_to_genome,
        genome_to_group, genome_groups)
    abundances_metagenome = self.average(normalised_abundance_dict)

    # Read in expression (TPM) values
    if transcriptome_abundances_path:
        logging.info("Parsing detectM TPM abundances")
        transcriptome_metadata = Parser.parse_metadata_matrix(transcriptome_metadata_path)[2]
        transcriptome_abundance_dict = self.average_tpm_by_sample(
            Parser.parse_tpm_values(transcriptome_abundances_path),
            transcriptome_metadata)
        transcriptome_abundances = self.average_tpm_values(
            transcriptome_abundance_dict, group_to_genome)
    else:
        transcriptome_abundances = None

    network_builder = NetworkBuilder(group_to_genome, abundances_metagenome,
                                     transcriptome_abundances,
                                     abundances_metabolome, fisher_results)

    # Run the specified subcommand
    if subparser_name == self.EXPLORE:
        network_lines, node_metadata = network_builder.query_matrix(queries, depth)
    elif subparser_name == self.PATHWAY:
        network_lines, node_metadata = network_builder.pathway_matrix(limit, filter)

    # Write the outputs
    Writer.write(network_lines,
                 os.path.join(output_directory, self.NETWORK_OUTPUT_FILE))
    Writer.write(node_metadata,
                 os.path.join(output_directory, self.METADATA_OUTPUT_FILE))

    logging.info('Finished the %s pipeline', subparser_name)
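# A minimal sketch of what mock_metadata() above plausibly returns when no
# genome metadata is supplied: every genome is assigned to one placeholder
# group, so the downstream grouping logic runs unchanged. The group name and
# return layout are assumptions.
def mock_metadata(genome_names, group='MOCK_GROUP'):
    genome_to_group = {genome: [group] for genome in genome_names}
    genome_groups = {group}
    group_to_genome = {group: set(genome_names)}
    return genome_to_group, genome_groups, group_to_genome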
def do(# Input options
       self, annotate_output, annotation_matrix, metadata_path,
       abundances_path, abundance_metadata_path, transcriptome_path,
       transcriptome_metadata_path,
       # Runtime options
       pval_cutoff, proportions_cutoff, threshold, multi_test_correction,
       batchfile, processes, allow_negative_values, ko, pfam, tigrfam,
       cluster, ortholog, cazy, ec, ko_hmm,
       # Output options
       output_directory):

    plot = Plot()
    database = Databases()

    if annotate_output:
        logging.info('Parsing annotate output: %s', annotate_output)
        pa = ParseAnnotate(annotate_output, processes)

        if ko:
            annotation_matrix = pa.ko
        elif ko_hmm:
            annotation_matrix = pa.ko_hmm
        elif pfam:
            annotation_matrix = pa.pfam
        elif tigrfam:
            annotation_matrix = pa.tigrfam
        elif cluster:
            annotation_matrix = pa.cluster
        elif ortholog:
            annotation_matrix = pa.ortholog
        elif cazy:
            annotation_matrix = pa.cazy
        elif ec:
            annotation_matrix = pa.ec

    annotations_dict, _, annotations = Parser.parse_simple_matrix(annotation_matrix)
    annotation_type = self.check_annotation_type(annotations)

    logging.info('Parsing metadata: %s', metadata_path)
    metadata, metadata_value_lists, attribute_dict = Parser.parse_metadata_matrix(metadata_path)

    if abundances_path:
        logging.info('Running abundances pipeline')
        logging.info('Parsing sample abundance')
        abundances_dict, _, _ = Parser.parse_simple_matrix(abundances_path)

        logging.info('Parsing sample metadata')
        _, _, ab_attribute_dict = Parser.parse_metadata_matrix(abundance_metadata_path)

        test = Test(annotations_dict, None, annotation_type, threshold,
                    multi_test_correction, processes, database)
        weighted_abundance = self.weight_annotation_matrix(abundances_dict,
                                                           annotations_dict,
                                                           ab_attribute_dict,
                                                           annotations)
        results = test.test_weighted_abundances(weighted_abundance, annotations)

        for result in results:
            test_result_lines, test_result_output_file = result
            test_result_output_path = os.path.join(output_directory,
                                                   test_result_output_file)
            Writer.write(test_result_lines, test_result_output_path)

    else:
        if batchfile:
            gtdb_annotation_matrix = self.get_gtdb_database_path(annotation_type, database)
            batchfile_metadata, batchfile_metadata_value_lists, batchfile_attribute_dict = \
                Parser.parse_metadata_matrix(batchfile)
            genomes_set = set(batchfile_metadata.keys())
            reference_genome_annotations, genomes_set = \
                Parser.filter_large_matrix(genomes_set, gtdb_annotation_matrix)
            annotations_dict.update(reference_genome_annotations)

            # Keep only batchfile groups that still have members after filtering
            new_batchfile_attribute_dict = dict()
            for group_name, accession_id_list in batchfile_attribute_dict.items():
                filtered_accession_id_list = [accession_id
                                              for accession_id in accession_id_list
                                              if accession_id in genomes_set]
                if filtered_accession_id_list:
                    new_batchfile_attribute_dict[group_name] = filtered_accession_id_list

            attribute_dict.update(new_batchfile_attribute_dict)
            batchfile_metadata = {group_name: batchfile_metadata[group_name]
                                  for group_name in genomes_set}
            metadata.update(batchfile_metadata)
            batchfile_metadata_value_lists = set(new_batchfile_attribute_dict.keys())
            metadata_value_lists = metadata_value_lists.union(batchfile_metadata_value_lists)

        logging.info("Comparing sets of genomes")
        combination_dict = dict()

        for combination in product(metadata_value_lists):
            genome_list = list()
            for genome, attributes in metadata.items():
                for feature in combination:
                    if feature in attributes:
                        genome_list.append(genome)
            combination_dict['_'.join(combination)] = genome_list

        test = Test(annotations_dict, combination_dict, annotation_type,
                    threshold, multi_test_correction, processes, database)
        results = test.do(attribute_dict)
        for result in results:
            test_result_lines, test_result_output_file = result
            test_result_output_path = os.path.join(output_directory,
                                                   test_result_output_file)
            Writer.write(test_result_lines, test_result_output_path)

        raw_proportions_output_lines = self.calculate_portions(
            annotations, combination_dict, annotations_dict, genome_list,
            proportions_cutoff)
        Writer.write(raw_proportions_output_lines,
                     os.path.join(output_directory, self.PROPORTIONS))

    logging.info('Generating summary plots')

    if annotation_type == self.KEGG:
        logging.info('Finding module completeness in differentially abundant KOs')

        for result_file in os.listdir(output_directory):
            if result_file.endswith("fisher.tsv") or result_file.endswith("cdf.tsv"):
                plot.draw_barplots(os.path.join(output_directory, result_file),
                                   pval_cutoff, output_directory)
                module_output, prefix = self.module_completeness(
                    database, os.path.join(output_directory, result_file),
                    pval_cutoff)
                Writer.write(module_output,
                             os.path.join(output_directory,
                                          prefix + '_' + self.MODULE_COMPLETENESS))

    plot.draw_pca_plot(annotation_matrix, metadata_path, output_directory)
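# A minimal sketch of the genome-grouping step above: each metadata value
# becomes a key in combination_dict mapping to the genomes carrying that
# value. The data below is illustrative only.
from itertools import product

metadata = {'genome_a': {'aerobic'}, 'genome_b': {'anaerobic'},
            'genome_c': {'aerobic'}}
metadata_value_lists = {'aerobic', 'anaerobic'}

combination_dict = {}
for combination in product(metadata_value_lists):
    combination_dict['_'.join(combination)] = [
        genome for genome, attributes in metadata.items()
        if any(feature in attributes for feature in combination)]
# -> {'aerobic': ['genome_a', 'genome_c'], 'anaerobic': ['genome_b']}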
def classify_pipeline(self, custom_modules, cutoff, aggregate,
                      genome_and_annotation_matrix, module_rules_json,
                      gff_file, output_directory):
    '''
    Parameters
    ----------
    custom_modules - String. Path to file containing custom module
        definitions, consistent with KEGG module nomenclature
        (http://www.genome.jp/kegg/module.html).
    cutoff - Float. Fraction of a module needed in order to be included
        in the output.
    aggregate - Bool. Additionally report the average abundance of each
        module in each genome.
    genome_and_annotation_matrix - String. Path to file containing a
        genome-annotation matrix.
    module_rules_json - String. Path to JSON file of classification rules.
    gff_file - String. Path to GFF file to read annotations from instead
        of a matrix.
    output_directory - String. Path to directory to output results to.
    '''
    pathway = dict()

    if module_rules_json:
        cc = ClassifyChecks(RulesJson().load(module_rules_json))

    if custom_modules:
        logging.info(f'Reading in custom modules: {custom_modules}')
        modules_to_classify = self.update_with_custom_modules(custom_modules)
    else:
        modules_to_classify = self.m2def

    if gff_file:
        logging.info("Reading in annotations from an input GFF file")
        annotation_results, genome_to_annotation_sets = Parser.parse_gff(gff_file)
    else:
        logging.info("Reading in annotations from an input matrix")
        genome_to_annotation_sets, _, _ = Parser.parse_simple_matrix(
            genome_and_annotation_matrix)

    if aggregate:
        logging.info(f'Reading in abundances: {genome_and_annotation_matrix}')
        abundances, _, _ = Parser.parse_simple_matrix(genome_and_annotation_matrix)
        abundance_result = dict()

    logging.info(f"Read in annotations for {len(genome_to_annotation_sets)} genomes")

    # NOTE: "KO_path" is an assumed name for the fourth header field; the
    # rows appended to genome_output_lines below carry four fields.
    output_lines = [[
        "Genome_name", "Module_id", "Module_name", "Steps_found",
        "Steps_needed", "Percent_steps_found"
    ]]
    genome_output_lines = [["Genome_name", "Module_id", "Module_name", "KO_path"]]

    for name, pathway_string in modules_to_classify.items():
        if name not in self.signature_modules:
            path = ModuleDescription(pathway_string)
            pathway[name] = path

            for genome, annotation_frequency in genome_to_annotation_sets.items():
                annotations = get_present_annotations(annotation_frequency)
                num_covered, _, _, ko_path = path.num_covered_steps(annotations)
                num_all = path.num_steps()
                perc_covered = num_covered / float(num_all)
                ko_path_list = list(chain(*ko_path.values()))

                if perc_covered >= cutoff:
                    if module_rules_json:
                        rule_check_result = cc.check(name, annotation_results[genome])
                    else:
                        rule_check_result = True

                    if rule_check_result:
                        if path.is_single_step:
                            if perc_covered != 1:
                                if cutoff < 1:
                                    num_all = 1
                                    num_covered = 0
                                    perc_covered = 0.0
                                else:
                                    continue
                            else:
                                num_all = 1
                                num_covered = 1

                        if aggregate:
                            if genome not in abundance_result:
                                abundance_result[genome] = dict()
                            pathway_abundance = [abundances[genome][ko]
                                                 for ko in ko_path_list]
                            if len(pathway_abundance) > 0:
                                pathway_average_abundance = (sum(pathway_abundance) /
                                                             len(pathway_abundance))
                            else:
                                pathway_average_abundance = 0
                            abundance_result[genome][name] = pathway_average_abundance

                        genome_output_lines.append(
                            [genome, name, self.modules[name], ','.join(ko_path_list)])

                        output_line = [
                            genome, name, self.modules[name], str(num_covered),
                            str(num_all), str(round(perc_covered * 100, 2))
                        ]
                        output_lines.append(output_line)

    Writer.write(output_lines, os.path.join(output_directory, self.ko_output))
    Writer.write(genome_output_lines,
                 os.path.join(output_directory, self.module_paths))

    if aggregate:
        samples = list(abundance_result.keys())
        output_lines = [["ID"] + samples]

        for module in modules_to_classify.keys():
            if module not in self.signature_modules:
                output_line = [module]
                for sample in samples:
                    if module in abundance_result[sample]:
                        output_line.append(str(abundance_result[sample][module]))
                    else:
                        output_line.append('0.0')
                output_lines.append(output_line)

        Writer.write(output_lines,
                     os.path.join(output_directory, self.aggregate_output))
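# A minimal sketch of the aggregate table assembled above: one row per
# module, one column per genome, cells holding the average abundance of the
# KOs on the module's path ('0.0' where a module was not called). The data
# here is illustrative only.
abundance_result = {'genome_a': {'M00001': 3.5}, 'genome_b': {}}

samples = list(abundance_result.keys())
table = [["ID"] + samples]
for module in ['M00001']:
    row = [module]
    for sample in samples:
        row.append(str(abundance_result[sample].get(module, '0.0')))
    table.append(row)
# table -> [['ID', 'genome_a', 'genome_b'], ['M00001', '3.5', '0.0']]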