def __init__(self):
    databases = Databases()

    self.signature_modules = databases.signature_modules
    self.m2def = databases.m2def()
    self.modules = databases.m()

    self.ko_output = "module_completeness.tsv"
    self.module_paths = "module_paths.tsv"
    self.aggregate_output = "aggregate_output.tsv"
def __init__(self, output_directory, annotate_ko, annotate_ko_hmm, annotate_pfam,
             annotate_tigrfam, annotate_cluster, annotate_ortholog, annotate_cazy,
             annotate_ec, annotate_orthogroup, evalue, bit, percent_id_cutoff,
             aln_query, aln_reference, fraction_aligned, cut_ga_pfam, cut_nc_pfam,
             cut_tc_pfam, cut_ga_tigrfam, cut_nc_tigrfam, cut_tc_tigrfam, cut_hmm,
             inflation, chunk_number, chunk_max, count_domains, threads, parallel,
             suffix, light):

    # Define inputs and outputs
    self.output_directory = output_directory

    # Define type of annotation to be carried out
    self.annotate_ko = annotate_ko
    self.annotate_ko_hmm = annotate_ko_hmm
    self.annotate_pfam = annotate_pfam
    self.annotate_tigrfam = annotate_tigrfam
    self.annotate_cluster = annotate_cluster
    self.annotate_ortholog = annotate_ortholog
    self.annotate_orthogroup = annotate_orthogroup
    self.annotate_cazy = annotate_cazy
    self.annotate_ec = annotate_ec

    # Cutoffs
    self.evalue = evalue
    self.bit = bit
    self.percent_id_cutoff = percent_id_cutoff
    self.aln_query = aln_query
    self.aln_reference = aln_reference
    self.fraction_aligned = fraction_aligned
    self.cut_ga_pfam = cut_ga_pfam
    self.cut_nc_pfam = cut_nc_pfam
    self.cut_tc_pfam = cut_tc_pfam
    self.cut_ga_tigrfam = cut_ga_tigrfam
    self.cut_nc_tigrfam = cut_nc_tigrfam
    self.cut_tc_tigrfam = cut_tc_tigrfam
    self.cut_hmm = cut_hmm
    self.inflation = inflation
    self.chunk_number = chunk_number
    self.chunk_max = chunk_max
    self.count_domains = count_domains

    # Parameters
    self.threads = threads
    self.parallel = parallel
    self.suffix = suffix
    self.light = light

    # Set up multiprocessing pool
    self.pool = mp.Pool(processes=int(self.parallel))

    # Load databases
    self.databases = Databases()
def __init__(self):
    d = Databases()

    self.ko_re = re.compile(r'^K\d+$')
    self.signature_modules = d.signature_modules
    self.m2def = d.m2def
    self.m = d.m
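# Illustrative sketch (not part of the original source): the compiled pattern
# above matches KEGG Orthology identifiers of the form "K" followed by digits.
# The helper below is hypothetical and only demonstrates how such a pattern
# separates KO ids from other annotation ids.
import re

_ko_re = re.compile(r'^K\d+$')

def _split_ko_ids(annotation_ids):
    '''Partition annotation ids into KO ids and everything else.'''
    ko_ids = [i for i in annotation_ids if _ko_re.match(i)]
    other_ids = [i for i in annotation_ids if not _ko_re.match(i)]
    return ko_ids, other_ids

# _split_ko_ids(["K00001", "PF00001", "K12345"]) -> (["K00001", "K12345"], ["PF00001"])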
def __init__(self, annotation_type, clusters=None):
    '''
    Interpret which annotation type to write a matrix for.

    Parameters
    ----------
    annotation_type - String.
    clusters - List of cluster or ortholog ids. Only used for the
               HYPOTHETICAL and ORTHOLOG annotation types.
    '''
    self.annotation_type = annotation_type
    self.databases = Databases()

    if self.annotation_type in (self.KO, self.EC, self.PFAM, self.TIGRFAM, self.CAZY):
        # The ids file for each of these annotation types is named after the type itself
        ids_path = os.path.join(self.databases.IDS_DIR, self.annotation_type)
        with open(ids_path) as ids_io:
            self.annotation_list = [x.strip() for x in ids_io]
    elif self.annotation_type in (self.HYPOTHETICAL, self.ORTHOLOG):
        self.annotation_list = clusters
    else:
        raise Exception("Annotation type not found: %s" % (self.annotation_type))
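# Usage sketch (mirrors how the annotate pipeline later in this section drives
# this class; the genome list, count_domains flag and output path here are
# placeholders, not recommended values):
#
#   matrix_generator = MatrixGenerator(MatrixGenerator.KO)
#   freq_table = os.path.join(output_directory, 'ko_frequency_table.tsv')
#   matrix_generator.write_matrix(genomes_list, count_domains, freq_table)
#
# For cluster/ortholog matrices the ids are passed in explicitly:
#
#   matrix_generator = MatrixGenerator(MatrixGenerator.HYPOTHETICAL, cluster_ids)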
def __init__(self, metadata_keys):
    self.d = Databases()
    self.metadata_keys = list(metadata_keys)

    self.matrix_header = ["compound", "reaction", 'type']
    self.transcriptome_header = \
        [key + '_reaction_transcriptome' for key in self.metadata_keys] + \
        [key + '_reaction_expression' for key in self.metadata_keys]
    self.compound_header = [key + '_compound' for key in self.metadata_keys]
    self.metadata_header = ['node', 'description', 'type', 'module', 'module_descr',
                            'pathway', 'pathway_descr', 'node_type']
    self.query_header = ['query', 'step']
    self.compound_reaction_index_header = []
    self.step_header = ['step']
def __init__(self, matrix, transcriptome):
    d = Databases()
    self.r2k = d.r2k

    logging.info("Parsing input matrix: %s" % matrix)
    self.orthology_matrix = self._parse_matrix(matrix)

    logging.info("Calculating reaction abundances")
    self.reaction_matrix = self._calculate_abundances(self.r2k, self.orthology_matrix)

    if transcriptome:
        logging.info("Parsing input transcriptome: %s" % transcriptome)
        self.orthology_matrix_transcriptome = self._parse_matrix(transcriptome)

        logging.info("Calculating reaction transcriptome abundances")
        self.reaction_matrix_transcriptome = \
            self._calculate_abundances(self.r2k, self.orthology_matrix_transcriptome)

        logging.info("Calculating normalized expression abundances")
        self.orthology_matrix_expression = \
            self._calculate_expression_matrix(self.orthology_matrix,
                                              self.orthology_matrix_transcriptome)

        logging.info("Calculating reaction expression abundances")
        self.reaction_matrix_expression = \
            self._calculate_abundances(self.r2k, self.orthology_matrix_expression)
def __init__(self):
    databases = Databases()

    self.reaction_to_ko = databases.r2k()
    self.compound_to_reaction = databases.c2r()
    self.compounds = databases.c()

    self.positive = 'positive'
    self.negative = 'negative'

    self.abundance = "frequency_matrix.tsv"
    self.enrichment = "enrichment_results.tsv"
    self.abundance_header = ["Compound"]
    self.enrichment_header = ["Compound", "Group_1", "Group_2", "group_1_mean",
                              "group_2_mean", "score", "pvalue", "description"]
def __init__(self):
    self.databases = Databases()

    path_to_scripts = os.path.split(os.path.realpath(__file__))[0]
    self.draw_pca_script_path = os.path.join(path_to_scripts, "PLOT_ko_pca.r")
    self.draw_heatmap_script_path = os.path.join(path_to_scripts, "PLOT_ko_heatmap.r")
    self.draw_barplots_script_path = os.path.join(path_to_scripts, "PLOT_ko_breakdown.r")
    self.ko00000 = os.path.join(Data.DATABASE_DIR, self.databases.DB_VERSION, 'ko00000.tsv')

    self.output_pca_plot = 'presence_absence_pca_plot.svg'
    # Heatmap output file (distinct from the PCA plot output)
    self.output_heatmap_plot = 'presence_absence_heatmap_plot.svg'
def __init__(self):
    d = Databases()

    self.m2def = d.m2def()
    self.m2c = d.m2c()
    self.c = d.c()
    self.m = d.m()
    self.c2m = d.c2m()
    self.signature_modules = d.signature_modules

    self.output_file = 'linkages.tsv'
def __init__(self):
    d = Databases()

    self.m2def = d.m2def
    self.m2c = d.m2c

    self.c2m = dict()
    for module, compounds in self.m2c.items():
        substrates = compounds[0]
        for substrate in substrates:
            if substrate in self.c2m:
                self.c2m[substrate].append(module)
            else:
                self.c2m[substrate] = [module]

    self.c = d.c
    self.m = d.m
    self.signature_modules = d.signature_modules

    self.output_file = 'linkages.tsv'
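# Sketch (illustrative only, toy data): the loop above inverts a
# module -> compounds mapping into a compound -> modules lookup keyed on the
# substrate side (the first element of each compounds entry). The module and
# compound ids below are made up.
def _invert_substrates(m2c):
    c2m = dict()
    for module, compounds in m2c.items():
        for substrate in compounds[0]:  # first element holds the substrate ids
            c2m.setdefault(substrate, []).append(module)
    return c2m

# _invert_substrates({'M00001': (['C00022'], ['C00024']),
#                     'M00002': (['C00022', 'C00031'], ['C00033'])})
# -> {'C00022': ['M00001', 'M00002'], 'C00031': ['M00002']}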
def __init__(self, output_directory, ko, pfam, tigrfam, hypothetical, cazy, ec,
             evalue, bit, id, aln_query, aln_reference, c, cut_ga, cut_nc, cut_tc,
             inflation, chunk_number, chunk_max, count_domains, threads, parallel,
             suffix, light):

    # Define inputs and outputs
    self.output_directory = output_directory

    # Define type of annotation to be carried out
    self.ko = ko
    self.pfam = pfam
    self.tigrfam = tigrfam
    self.hypothetical = hypothetical
    self.cazy = cazy
    self.ec = ec

    # Cutoffs
    self.evalue = evalue
    self.bit = bit
    self.id = id
    self.aln_query = aln_query
    self.aln_reference = aln_reference
    self.c = c
    self.cut_ga = cut_ga
    self.cut_nc = cut_nc
    self.cut_tc = cut_tc
    self.inflation = inflation
    self.chunk_number = chunk_number
    self.chunk_max = chunk_max
    self.count_domains = count_domains

    # Parameters
    self.threads = threads
    self.parallel = parallel
    self.suffix = suffix
    self.light = light

    # Set up multiprocessing pool
    self.pool = mp.Pool(processes=int(self.parallel))

    # Load databases
    self.databases = Databases()
def parse_tpm_values(tpm_values):
    from enrichm.databases import Databases

    k2r = Databases().k2r()

    output_dict = dict()
    annotation_types = set()
    genome_types = set()

    tpm_values_io = open(tpm_values, 'rb')
    tpm_values_io.readline()  # skip the header line

    for line in tpm_values_io:
        gene, _, _, _, _, _, _, _, _, _, tpm, _, _, annotation, sample \
            = line.strip().split(b'\t')
        annotation_list = annotation.split(b',')
        tpm = float(tpm)
        genome = '_'.join(str(gene, "utf-8").split('_')[:2])  # temporary
        genome_types.add(genome)

        if sample not in output_dict:
            output_dict[sample] = dict()

        if genome not in output_dict[sample]:
            output_dict[sample][genome] = dict()

        for annotation_type in annotation_list:
            if str(annotation_type, "utf-8") in k2r:
                reactions = k2r[str(annotation_type, "utf-8")]

                for reaction in reactions:
                    reaction = str.encode(reaction)

                    if reaction not in output_dict[sample][genome]:
                        output_dict[sample][genome][reaction] = 0.0
                        annotation_types.add(reaction)

                    output_dict[sample][genome][reaction] += tpm

    return output_dict, annotation_types, genome_types
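# Input format sketch (inferred from the unpacking above, not a formal spec):
# a tab-separated file with a single header row and 15 columns per data row,
# where column 1 is the gene id, column 11 the TPM value, column 14 a
# comma-separated annotation list and column 15 the sample name. The row below
# is fabricated purely to illustrate the shape:
#
#   gene_1_00042  ...  12.5  ...  K00001,K00002  sample_A
#
# The genome id is currently recovered from the first two '_'-separated fields
# of the gene id ("gene_1" in the fabricated row above).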
def __init__(self, metadata, abundances_metagenome, abundances_transcriptome, abundances_metabolome, fisher_results): self.abundances_metagenome = abundances_metagenome self.abundances_transcriptome = abundances_transcriptome self.abundances_metabolome = abundances_metabolome self.fisher_results = fisher_results self.metadata_keys = list(metadata.keys()) databases = Databases() self.reaction_to_module = databases.r2m() self.module_to_reaction = databases.m2r() self.module_descriptions = databases.m() self.reaction_to_pathway = databases.r2p() self.pathway_to_reaction = databases.p2r() self.pathway_descriptions = databases.p() self.compound_desc_dict = databases.compound_desc_dict() self.compound_descriptions = databases.c() self.reaction_descriptions = databases.r() self.reactions_to_compounds = databases.r2c() self.reactions_to_kos = databases.r2k() self.matrix_header = ["compound", "reaction", 'type'] self.transcriptome_header = [ key + '_reaction_transcriptome' for key in self.metadata_keys ] self.compound_header = [ key + '_compound' for key in self.metadata_keys ] self.metadata_header = [ 'node', 'description', 'type', 'module', 'module_descr', 'pathway', 'pathway_descr', 'node_type' ] self.query_header = ['query', 'step'] self.step_header = ['step'] self.to_omit = set([ "C00828", # Menaquinone "C00534", # Pyridoxamine "C00006", # NADP+ "C00003", # NAD+ "C00002", # ATP "C00314", # Pyridoxine "C00864", # Pantothenate "C00504", # Folate "C00032", # Heme "C05443", # Vitamin D3 "C00253", # Nicotinate "C00250", # Pyridoxal "C11378", # Ubiquinone-10 "C05777", # Coenzyme F430 "C00072", # Ascorbate "C00378", # Thiamine "C00101", # Tetrahydrofolate "C00029", # UDP-glucose "C00068", # Thiamin diphosphate "C00061", # FMN "C00063", # CTP "C05776", # Vitamin B12 "C00113", # PQQ "C18237", # Molybdoenzyme molybdenum cofactor "C00051", # Glutathione "C00010", # CoA "C00016", # FAD "C00018", # Pyridoxal phosphate "C00019", # S-Adenosyl-L-methionine "C00153", # Nicotinamide "C04628", # Coenzyme B "C00862", # Methanofuran "C15672", # Heme O "C15670", # Heme A "C02059", # Phylloquinone "C03576", # Coenzyme M "C05441", # Vitamin D2 "C00272", # Tetrahydrobiopterin "C02477", # alpha-Tocopherol "C00473", # Retinol "C00120", # Biotin "C00725", # Lipoate "C00053", # 3'-Phosphoadenylyl sulfate "C00194", # Cobamide coenzyme "C00255", # Riboflavin 'C00001', # H2O 'C00008', # ADP 'C00013', # Diphosphate 'C00004', # NADH 'C00005', # NADPH 'C00080', # H+ 'C00009', # Orthophosphate 'C00008', # ADP 'C00004', # NADH 'C00020', # AMP 'C00007', # Oxygen 'C00015' ]) # UDP
class Tests(unittest.TestCase): genome_annotation_simple_example = { "genome_1": { "K00001": 1, "K00002": 2, "K00003": 0 }, "genome_2": { "K00001": 0, "K00002": 0, "K00003": 1 }, "genome_3": { "K00001": 5, "K00002": 4, "K00003": 5 } } genome_groups_simple_example = { "group_1": ["genome_1"], "group_2": ["genome_2", "genome_3"] } simple_test_object = Test(genome_annotation_simple_example, genome_groups_simple_example, "kegg", 0.1, 'fdr_bh', 1, Databases()) sample_to_annotation = { 'sample_group_1': { 'K00001': [16.0, 25.5, 30.1], 'K00002': [14.0, 21.0, 24.2], 'K00003': [15.5, 26.2, 31.1] }, 'sample_group_2': { 'K00001': [6.0, 6.5, 7.0], 'K00002': [10.8, 12.4, 14.0], 'K00003': [6.2, 5.4, 5.0] } } annotations = ["K00001", "K00002", "K00003"] def test_test_chooser(self): groups_1 = [[1, 2, 3, 4, 5], [1, 2, 3, 4, 5]] groups_2 = [[1], [1, 2, 3, 4, 5]] test_instance_1 = self.simple_test_object.test_chooser(groups_1) test_instance_2 = self.simple_test_object.test_chooser(groups_2) self.assertEqual(test_instance_1[0], stats.fisher_exact) self.assertEqual(test_instance_1[1], stats.mannwhitneyu) self.assertEqual(test_instance_2[0], self.simple_test_object.PA) self.assertEqual(test_instance_2[1], stats.norm.cdf) def test_count(self): ''' test both frequency and presence absence counting in Test. ''' self.assertEqual( self.simple_test_object.count("K00001", "group_1", False), (1, 0)) self.assertEqual( self.simple_test_object.count("K00001", "group_1", True), ([1], 0)) self.assertEqual( self.simple_test_object.count("K00001", "group_2", False), (1, 1)) self.assertEqual( self.simple_test_object.count("K00001", "group_2", True), ([0, 5], 0)) self.assertEqual( self.simple_test_object.count("K00003", "group_2", False), (2, 0)) self.assertEqual( self.simple_test_object.count("K00003", "group_2", True), ([1, 5], 0)) def test_gene_frequencies(self): expect_1 = [['K00002', 'group_1', 'group_2', [[2], 0], [[0, 4], 0]], ['K00003', 'group_1', 'group_2', [[0], 0], [[1, 5], 0]], ['K00001', 'group_1', 'group_2', [[1], 0], [[0, 5], 0]]] expect_2 = [['K00002', 'group_1', 'group_2', [1, 0], [1, 1]], ['K00003', 'group_1', 'group_2', [0, 1], [2, 0]], ['K00001', 'group_1', 'group_2', [1, 0], [1, 1]]] for result in self.simple_test_object.gene_frequencies( "group_1", "group_2", True): if result in expect_1: expect_1.pop(expect_1.index(result)) self.assertEqual(expect_1, list()) for result in self.simple_test_object.gene_frequencies( "group_1", "group_2", False): if result in expect_2: expect_2.pop(expect_2.index(result)) self.assertEqual(expect_2, list()) def test_test_weighted_abundances(self): expect = [[[ [ 'annotation', 'group_1', 'group_2', 'enriched_in', 'group_1_mean', 'group_2_mean', 'score', 'pvalue', 'corrected_pvalue', 'description' ], [ 'K00001', 'sample_group_1', 'sample_group_2', 'sample_group_1', '23.866666666666664', '6.5', 0.0, 0.04042779918502612, '0.060591636418731595', 'E1.1.1.1, adh; alcohol dehydrogenase [EC:1.1.1.1]' ], [ 'K00002', 'sample_group_1', 'sample_group_2', 'sample_group_1', '19.733333333333334', '12.4', 0.5, 0.060591636418731595, '0.060591636418731595', 'AKR1A1, adh; alcohol dehydrogenase (NADP+) [EC:1.1.1.2]' ], [ 'K00003', 'sample_group_1', 'sample_group_2', 'sample_group_1', '24.26666666666667', '5.533333333333334', 0.0, 0.04042779918502612, '0.060591636418731595', 'hom; homoserine dehydrogenase [EC:1.1.1.3]' ] ], 'sample_group_1_vs_sample_group_2_gvg_results.mannwhitneyu.tsv']] result = self.simple_test_object.test_weighted_abundances( self.sample_to_annotation, self.annotations) 
self.assertEqual(expect, result)
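# Standard unittest entry point so the checks above can be run directly with
# "python <this test file>"; assumes no custom test runner is required.
if __name__ == '__main__':
    unittest.main()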
def do(# Input options self, annotate_output, annotation_matrix, metadata_path, abundances_path, abundance_metadata_path, transcriptome_path, transcriptome_metadata_path, # Runtime options pval_cutoff, proportions_cutoff, threshold, multi_test_correction, batchfile, processes, allow_negative_values, ko, pfam, tigrfam, cluster, ortholog, cazy, ec, ko_hmm, # Output options output_directory): plot = Plot() database = Databases() if annotate_output: logging.info('Parsing annotate output: %s' % (annotate_output)) pa = ParseAnnotate(annotate_output, processes) if ko: annotation_matrix = pa.ko elif ko_hmm: annotation_matrix = pa.ko_hmm elif pfam: annotation_matrix = pa.pfam elif tigrfam: annotation_matrix = pa.tigrfam elif cluster: annotation_matrix = pa.cluster elif ortholog: annotation_matrix = pa.ortholog elif cazy: annotation_matrix = pa.cazy elif ec: annotation_matrix = pa.ec annotations_dict, _, annotations, = Parser.parse_simple_matrix(annotation_matrix) annotation_type = self.check_annotation_type(annotations) logging.info('Parsing metadata: %s' % metadata_path) metadata, metadata_value_lists, attribute_dict = Parser.parse_metadata_matrix(metadata_path) if abundances_path: logging.info('Running abundances pipeline') logging.info('Parsing sample abundance') abundances_dict, _, _ = Parser.parse_simple_matrix(abundances_path) logging.info('Parsing sample metadata') _, _, ab_attribute_dict = Parser.parse_metadata_matrix(abundance_metadata_path) test = Test(annotations_dict, None, annotation_type, threshold, multi_test_correction, processes, database) weighted_abundance = self.weight_annotation_matrix(abundances_dict, annotations_dict, ab_attribute_dict, annotations) results = test.test_weighted_abundances(weighted_abundance, annotations) for result in results: test_result_lines, test_result_output_file = result test_result_output_path = os.path.join(output_directory, test_result_output_file) Writer.write(test_result_lines, test_result_output_path) else: if batchfile: gtdb_annotation_matrix = self.get_gtdb_database_path(annotation_type, database) batchfile_metadata, batchfile_metadata_value_lists, batchfile_attribute_dict = Parser.parse_metadata_matrix(batchfile) genomes_set = set(batchfile_metadata.keys()) reference_genome_annotations, genomes_set = Parser.filter_large_matrix(genomes_set, gtdb_annotation_matrix) annotations_dict.update(reference_genome_annotations) new_batchfile_attribute_dict = dict() for group_name, accession_id_list in batchfile_attribute_dict.items(): filtered_accession_id_list = [accession_id for accession_id in accession_id_list if accession_id in genomes_set] if len(filtered_accession_id_list)>0: new_batchfile_attribute_dict[group_name] = filtered_accession_id_list attribute_dict.update(new_batchfile_attribute_dict) batchfile_metadata={group_name:batchfile_metadata[group_name] for group_name in genomes_set} metadata.update(batchfile_metadata) batchfile_metadata_value_lists = set(new_batchfile_attribute_dict.keys()) metadata_value_lists = metadata_value_lists.union(batchfile_metadata_value_lists) logging.info("Comparing sets of genomes") combination_dict = dict() for combination in product(*list([metadata_value_lists])): genome_list = list() for genome, attributes in metadata.items(): for feature in combination: if feature in attributes: genome_list.append(genome) combination_dict['_'.join(combination)] = genome_list test = Test(annotations_dict, combination_dict, annotation_type, threshold, multi_test_correction, processes, database) results = test.do(attribute_dict) 
for result in results: test_result_lines, test_result_output_file = result test_result_output_path = os.path.join(output_directory, test_result_output_file) Writer.write(test_result_lines, test_result_output_path) raw_proportions_output_lines = self.calculate_portions(annotations, combination_dict, annotations_dict, genome_list, proportions_cutoff) Writer.write(raw_proportions_output_lines, os.path.join(output_directory, self.PROPORTIONS)) logging.info('Generating summary plots') if annotation_type==self.KEGG: logging.info('Finding module completeness in differentially abundant KOs') for result_file in os.listdir(output_directory): if(result_file.endswith("fisher.tsv") or result_file.endswith("cdf.tsv")): plot.draw_barplots(os.path.join(output_directory, result_file), pval_cutoff, output_directory) module_output, prefix = self.module_completeness(database, os.path.join(output_directory, result_file), pval_cutoff) Writer.write(module_output, os.path.join(output_directory, prefix +'_'+ self.MODULE_COMPLETENESS)) plot.draw_pca_plot(annotation_matrix, metadata_path, output_directory)
def __init__(self):
    databases = Databases()

    self.signature_modules = databases.signature_modules
    self.m2def = databases.m2def()
    self.m = databases.m()
def do( # Input options self, annotate_output, metadata_path, input_modules, abundances, # Runtime options genomes_to_compare_with_group_file, pval_cutoff, proportions_cutoff, threshold, multi_test_correction, batchfile, processes, ko, pfam, tigrfam, hypothetical, cazy, ec, # Output options output_directory): p = Plot() c = Compare() d = Databases() if genomes_to_compare_with_group_file: self.genomes_to_compare_with_group = self.parse_genomes_to_compare( genomes_to_compare_with_group_file) else: self.genomes_to_compare_with_group = None logging.info('Parsing annotate output: %s' % (annotate_output)) pa = ParseAnnotate(annotate_output, processes) logging.info('Parsing annotations') if ko: annotation_matrix = pa.ko gtdb_annotation_matrix = d.GTDB_KO elif pfam: annotation_matrix = pa.pfam gtdb_annotation_matrix = d.GTDB_PFAM elif tigrfam: annotation_matrix = pa.tigrfam gtdb_annotation_matrix = d.GTDB_TIGRFAM elif hypothetical: annotation_matrix = pa.hypothetical_cluster gtdb_annotation_matrix = None elif cazy: annotation_matrix = pa.cazy gtdb_annotation_matrix = d.GTDB_CAZY elif ec: annotation_matrix = pa.ec gtdb_annotation_matrix = d.GTDB_EC annotations_dict, modules, genomes \ = self._parse_annotation_matrix(annotation_matrix) if input_modules: logging.info('Limiting to %i modules' % len(modules)) modules = input_modules logging.info('Parsing metadata') metadata, metadata_value_lists, attribute_dict \ = self.parse_metadata_matrix(metadata_path) if batchfile: genomes_set = set() batchfile_metadata, batchfile_metadata_value_lists, batchfile_attribute_dict \ = self.parse_metadata_matrix(batchfile) genomes_set = genomes_set.union(set(batchfile_metadata.keys())) reference_genome_annotations, genomes_set = self.parse_gtdb_matrix( genomes_set, gtdb_annotation_matrix) annotations_dict.update(reference_genome_annotations) new_batchfile_attribute_dict = dict() for x, y in batchfile_attribute_dict.items(): s = [z for z in y if z in genomes_set] if len(s) > 0: new_batchfile_attribute_dict[x] = s attribute_dict.update(new_batchfile_attribute_dict) batchfile_metadata = { x: batchfile_metadata[x] for x in genomes_set } metadata.update(batchfile_metadata) batchfile_metadata_value_lists = set( new_batchfile_attribute_dict.keys()) metadata_value_lists = metadata_value_lists.union( batchfile_metadata_value_lists) logging.info("Comparing sets of genomes") combination_dict = dict() for combination in product(*list([metadata_value_lists])): genome_list = list() for genome, attributes in metadata.items(): for feature in combination: if feature in attributes: genome_list.append(genome) combination_dict['_'.join(combination)] = genome_list annotation_type = self.check_annotation_type(modules) t = Test(annotations_dict, modules, genomes, combination_dict, annotation_type, threshold, multi_test_correction, pval_cutoff, processes, d) results = t.do(attribute_dict) for result in results: test_result_lines, test_result_output_file = result test_result_output_path = os.path.join(output_directory, test_result_output_file) self._write(test_result_lines, test_result_output_path) raw_portions_path \ = os.path.join(output_directory, self.PROPORTIONS) unique_to_groups_path \ = os.path.join(output_directory, self.UNIQUE_TO_GROUPS) raw_proportions_output_lines \ = self.calculate_portions(modules, combination_dict, annotations_dict, genome_list, proportions_cutoff) self._write(raw_proportions_output_lines, raw_portions_path) logging.info('Generating summary plots') if annotation_type == self.KEGG: logging.info( 'Finding module 
completeness in differentially abundant KOs') for result_file in os.listdir(output_directory): if (result_file.endswith("fisher.tsv") or result_file.endswith("cdf.tsv")): p.draw_barplots( os.path.join(output_directory, result_file), pval_cutoff, output_directory) g1_sig_kos = set() g2_sig_kos = set() result_file_io = open( os.path.join(output_directory, result_file)) header = result_file_io.readline() for line in result_file_io: sline = line.strip().split('\t') if float(sline[-2]) < pval_cutoff: if result_file.endswith("fisher.tsv"): g1 = float( sline[3]) / (int(sline[3]) + int(sline[4])) g2 = float( sline[5]) / (int(sline[5]) + int(sline[6])) elif result_file.endswith("cdf.tsv"): g1 = float(sline[3]) g2 = float(sline[5]) if g1 > g2: g1_sig_kos.add(sline[0]) else: g2_sig_kos.add(sline[0]) module_output = [[ "Module", "Lineage", "Total steps", "Steps covered", "Percentage covered", "Module description" ]] for module, definition in d.m2def.items(): if module not in d.signature_modules: pathway = ModuleDescription(definition) num_all = pathway.num_steps() g1_num_covered, g1_ko_covered, g1_ko_total, g1_ko_path = pathway.num_covered_steps( g1_sig_kos) g1_perc_covered = g1_num_covered / float(num_all) g2_num_covered, g2_ko_covered, g2_ko_total, g2_ko_path = pathway.num_covered_steps( g2_sig_kos) g2_perc_covered = g2_num_covered / float(num_all) if g1_perc_covered > 0: output_line = [ module, sline[1], num_all, g1_num_covered, g1_perc_covered, d.m[module] ] module_output.append(output_line) if g2_perc_covered > 0: output_line = [ module, sline[2], num_all, g2_num_covered, g2_perc_covered, d.m[module] ] module_output.append(output_line) prefix = '_vs_'.join([sline[1], sline[2]]).replace(' ', '_') self._write( module_output, os.path.join(output_directory, prefix + '_' + self.MODULE_COMPLETENESS)) p.draw_pca_plot(annotation_matrix, metadata_path, output_directory)
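# Sketch (illustrative, using the ModuleDescription calls exercised above; the
# module definition string and KO set are made up): the completeness of a
# module for a set of significant KOs is the number of covered steps over the
# total number of steps in the module definition.
#
#   pathway = ModuleDescription('K00001 K00002 (K00003,K00004)')
#   num_all = pathway.num_steps()
#   num_covered, _, _, _ = pathway.num_covered_steps({'K00001', 'K00003'})
#   perc_covered = num_covered / float(num_all)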
def __init__(self):
    self.databases = Databases()
    self.reactions = self.databases.r()
    self.reaction_to_ko = self.databases.r2k()
class Annotate: ''' Annotates proteins, and MAGs ''' GENOME_BIN = 'genome_bin' GENOME_PROTEINS = 'genome_proteins' GENOME_GENES = 'genome_genes' GENOME_KO = 'annotations_ko' GENOME_KO_HMM = 'annotations_ko_hmm' GENOME_EC = 'annotations_ec' GENOME_PFAM = 'annotations_pfam' GENOME_TIGRFAM = 'annotations_tigrfam' GENOME_HYPOTHETICAL = 'annotations_hypothetical' GENOME_CAZY = 'annotations_cazy' GENOME_GFF = 'annotations_gff' GENOME_OBJ = 'annotations_genomes' OUTPUT_KO = 'ko_frequency_table.tsv' OUTPUT_KO_HMM = 'ko_hmm_frequency_table.tsv' OUTPUT_EC = 'ec_frequency_table.tsv' OUTPUT_PFAM = 'pfam_frequency_table.tsv' OUTPUT_TIGRFAM = 'tigrfam_frequency_table.tsv' OUTPUT_CAZY = 'cazy_frequency_table.tsv' OUTPUT_CLUSTER = 'cluster_frequency_table.tsv' OUTPUT_ORTHOLOG = 'ortholog_frequency_table.tsv' OUTPUT_HYPOTHETICAL_ANNOTATIONS = 'hypothetical_annotations.tsv' OUTPUT_DIAMOND = "DIAMOND_search" GFF_SUFFIX = '.gff' PROTEINS_SUFFIX = '.faa' ANNOTATION_SUFFIX = '.tsv' PICKLE_SUFFIX = '.pickle' def __init__(self, output_directory, annotate_ko, annotate_ko_hmm, annotate_pfam, annotate_tigrfam, annoatate_cluster, annotate_ortholog, annotate_cazy, annotate_ec, evalue, bit, percent_id_cutoff, aln_query, aln_reference, fraction_aligned, cut_ga, cut_nc, cut_tc, cut_hmm, inflation, chunk_number, chunk_max, count_domains, threads, parallel, suffix, light): # Define inputs and outputs self.output_directory = output_directory # Define type of annotation to be carried out self.annotate_ko = annotate_ko self.annotate_ko_hmm = annotate_ko_hmm self.annotate_pfam = annotate_pfam self.annotate_tigrfam = annotate_tigrfam self.annotate_cluster = annoatate_cluster self.annotate_ortholog = annotate_ortholog self.annotate_cazy = annotate_cazy self.annotate_ec = annotate_ec # Cutoffs self.evalue = evalue self.bit = bit self.percent_id_cutoff = percent_id_cutoff self.aln_query = aln_query self.aln_reference = aln_reference self.fraction_aligned = fraction_aligned self.cut_ga = cut_ga self.cut_nc = cut_nc self.cut_tc = cut_tc self.cut_hmm = cut_hmm self.inflation = inflation self.chunk_number = chunk_number self.chunk_max = chunk_max self.count_domains = count_domains # Parameters self.threads = threads self.parallel = parallel self.suffix = suffix self.light = light # Set up multiprocesses pool self.pool = mp.Pool(processes=int(self.parallel)) # Load databases self.databases = Databases() def prep_genome(self, genome_file_list, genome_directory): ''' Do any preparation specific to the genome annotation pipeline. Inputs ------ genome_file_list - List. list of strings, each a path to a file containing a genome Outputs ------- returns the directory with all genome ids sym-linked into it. ''' # link all the genomes into one file logging.info('Preparing genomes for annotation') if genome_file_list: mkdir(genome_directory) genome_paths = list() for genome_path in genome_file_list: if genome_path.endswith(self.suffix): genome_paths.append(genome_path) cmd = "xargs --arg-file=/dev/stdin cp --target-directory=%s" % genome_directory logging.debug(cmd) process = subprocess.Popen(["bash", "-c", cmd], stdin=subprocess.PIPE, stdout=subprocess.PIPE, universal_newlines=True) process.communicate(input=str('\n'.join(genome_paths))) return genome_directory def call_proteins(self, genome_directory): ''' Use prodigal to call proteins within the genomes Parameters ---------- genome_directory - string. 
Directory containing .fna files for each input genome Outputs ------- returns the directory containing an .faa file for each input genomes ''' protein_directory_path = path.join(self.output_directory, self.GENOME_PROTEINS) gene_directory_path = path.join(self.output_directory, self.GENOME_GENES) mkdir(protein_directory_path) mkdir(gene_directory_path) genome_list = list() genome_paths = list() for genome in listdir(genome_directory): if genome.endswith(self.suffix): genome_paths.append(path.splitext(genome)[0]) logging.info(" - Calling proteins for %i genomes", len(genome_paths)) cmd = "ls %s/*%s | \ sed 's/%s//g' | \ grep -o '[^/]*$' | \ parallel -j %s \ prodigal \ -q \ -p meta \ -o /dev/null \ -d %s/{}%s \ -a %s/{}%s \ -i %s/{}%s \ > /dev/null 2>&1" \ % (genome_directory, self.suffix, self.suffix, self.parallel, gene_directory_path, self.suffix, protein_directory_path, self.PROTEINS_SUFFIX, genome_directory, self.suffix) logging.debug(cmd) subprocess.call(cmd, shell=True) logging.debug('Finished') protein_directory_files = listdir(protein_directory_path) genome_directory_files = listdir(genome_directory) for genome_protein, genome_nucl in zip(protein_directory_files, genome_directory_files): genome_protein_base = genome_protein.replace( self.PROTEINS_SUFFIX, self.suffix) output_genome_protein_path = path.join(protein_directory_path, genome_protein) output_genome_nucl_path = path.join(genome_directory, genome_nucl) output_genome_gene_path = path.join(gene_directory_path, genome_protein_base) genome = (self.light, output_genome_protein_path, output_genome_nucl_path, output_genome_gene_path) genome_list.append(genome) return genome_list def annotate_diamond(self, genomes_list, database, parser_type, ids_type, output_subdirectory): ''' Annotate the proteins encoded by each genome with KO ids using either BLAST or using HMM searches (no implemented yet). Parameters ---------- genome_faa_directory - string. Directory containing .faa files for each input genome Outputs ------- returns a directory containing the search results for each of the input population genomes, and a frequency matrix contining with the KOs as rows, and the genomes as columns. ''' output_directory_path = path.join(self.output_directory, output_subdirectory) genome_dict = {genome.name: genome for genome in genomes_list} mkdir(output_directory_path) specific_cutoffs = None with tempfile.NamedTemporaryFile() as temp: to_write = str() for genome in genomes_list: to_write += f"sed \"s/>/>{genome.name}~/g\" {genome.path}\n" temp.write(str.encode(to_write)) temp.flush() output_annotation_path = path.join(output_directory_path, self.OUTPUT_DIAMOND) + \ self.ANNOTATION_SUFFIX logging.info(' - BLASTing genomes') self.diamond_search(temp.name, output_annotation_path, database) for genome_name, batch in self.get_batches(output_annotation_path): if batch: genome = genome_dict[genome_name] genome.add(batch, self.evalue, self.bit, self.aln_query, self.aln_reference, specific_cutoffs, parser_type, ids_type) def get_batches(self, input_file): ''' Separate DIAMOND blast results into batches, where a batch is all the hits for a genome. Parameters ---------- input_file - string. Directory to search for blast results. 
''' last = None input_file_io = open(input_file) for line in input_file_io: split_line = line.strip().split('\t') genome_id, _ = split_line[0].split('~') if last is None: last = genome_id batch = [split_line] else: if last == genome_id: batch.append(split_line) else: yield last, batch batch = [split_line] last = genome_id if last is None: yield None, None else: yield last, batch def diamond_search(self, tmp_name, output_path, database): ''' Carry out a diamond blastp search. Parameters ---------- input_genome_path - string. Path to file containing .faa file for an input genome output_path - string. Path to file to output results into databases - string. Path to HMM to use for searching ''' cmd = f'bash {tmp_name} | diamond blastp \ --quiet \ --outfmt 6 \ --max-target-seqs 1 \ --query /dev/stdin \ --out {output_path} \ --db {database} \ --threads {self.threads} ' if self.evalue: cmd += f'--evalue {self.evalue} ' if self.bit: cmd += f'--min-score {self.bit} ' if self.percent_id_cutoff: cmd += f'--id {self.percent_id_cutoff*100} ' if self.aln_query: cmd += f"--query-cover {self.aln_query*100} " if self.aln_reference: cmd += f"--subject-cover {self.aln_reference*100} " logging.debug(cmd) subprocess.call(cmd, shell=True) logging.debug('Finished') def hmmsearch_annotation(self, genomes_list, output_directory_path, database, ids_type, parser): ''' Annotate the proteins encoded by each genome with pfam ids using HMM searches. Parameters ---------- genomes_list - list. list of Genome objects ''' mkdir(output_directory_path) genome_dict = {genome.name: genome for genome in genomes_list} if ids_type in (AnnotationParser.TIGRFAM, AnnotationParser.PFAM): hmmcutoff = True else: hmmcutoff = False if ids_type == AnnotationParser.KO_HMM: specific_cutoffs = self.databases.parse_ko_cutoffs() else: specific_cutoffs = None self.hmm_search(output_directory_path, database, hmmcutoff) for genome_annotation in listdir(output_directory_path): genome_id = path.splitext(genome_annotation)[0] genome = genome_dict[genome_id] output_annotation_path = path.join(output_directory_path, genome_annotation) genome.add(output_annotation_path, self.evalue, self.bit, self.aln_query, self.aln_reference, specific_cutoffs, parser, ids_type) def annotate_hypothetical(self, genomes_list): ''' Sort proteins coded by each genome into homologous clusters. Inputs ------ genomes_list - list. 
list of Genome objects ''' output_directory_path = path.join(self.output_directory, self.GENOME_HYPOTHETICAL) mkdir(output_directory_path) with tempfile.NamedTemporaryFile() as temp: to_write = str() for genome in genomes_list: to_write += f"sed \"s/>/>{genome.name}~/g\" {genome.path}\n" temp.flush() tmp_dir = tempfile.mkdtemp() db_path = path.join(output_directory_path, "db") clu_path = path.join(output_directory_path, "clu") align_path = path.join(output_directory_path, "alignDb") blast_output_path = path.join(output_directory_path, "alignDb.m8") formatted_blast_output_path = path.join(output_directory_path, "alignDb.formatted.m8") clu_tsv_path = path.join(output_directory_path, "hypothetical_clusters.tsv") logging.info(' - Generating MMSeqs2 database') cmd = "bash %s | sponge | mmseqs createdb /dev/stdin %s -v 0 > /dev/null 2>&1" % ( temp.name, db_path) logging.debug(cmd) subprocess.call(cmd, shell=True) logging.debug('Finished') logging.info(' - Clustering genome proteins') cmd = f"mmseqs cluster \ {db_path} \ {clu_path} \ {tmp_dir} \ --max-seqs 1000 \ --threads {self.threads} \ --min-seq-id {self.percent_id_cutoff} \ -e {self.evalue} \ -c {self.fraction_aligned} \ -v 0 " logging.debug(cmd) subprocess.call(cmd, shell=True) logging.debug('Finished') logging.info(' - Extracting clusters') cmd = 'mmseqs createtsv %s %s %s %s -v 0 > /dev/null 2>&1' % ( db_path, db_path, clu_path, clu_tsv_path) logging.debug(cmd) subprocess.call(cmd, shell=True) logging.debug('Finished') logging.info( ' - Computing Smith-Waterman alignments for clustering results' ) cmd = "mmseqs alignall %s %s %s --alignment-mode 3 -v 0 " % ( db_path, clu_path, align_path) logging.debug(cmd) subprocess.call(cmd, shell=True) logging.debug('Finished') logging.info(' - Converting to BLAST-like output') cmd = "mmseqs createtsv %s %s %s %s -v 0 > /dev/null 2>&1 " % ( db_path, db_path, align_path, blast_output_path) # --format-output query,target,bits logging.debug(cmd) subprocess.call(cmd, shell=True) logging.debug('Finished') logging.info(' - Reformatting BLAST output') cmd = "OFS=\"\t\" awk 'FNR==NR{a[$1]=$2;next}{$3=a[$3]; \ $1=\"\"; for(i=2;i<NF;i++){printf(\"%s\t\",$i)} \ printf(\"\\n\")}' %s %s | cut -f1,2,5 > %s" \ % ("%s", db_path + '.lookup', blast_output_path, formatted_blast_output_path) logging.debug(cmd) subprocess.call(cmd, shell=True) logging.debug('Finished') ortholog_dict = self.run_mcl(formatted_blast_output_path, output_directory_path) ortholog_ids = ortholog_dict.keys() cluster_ids = self.parse_cluster_results(clu_tsv_path, genomes_list, ortholog_dict, output_directory_path) return cluster_ids, ortholog_ids def run_mcl(self, blast_abc, output_directory_path): ''' Parse the protein clusters producedf from Mmseqs2 using mcl Parameters ---------- blast_abc - string. an abc file for mcl to run on. More information on the format of abc files can be found at https://micans.org/mcl/man/clmprotocols.html output_directory_path - string. Path to write the results of mcl parsing to. 
''' dict_path = path.join(output_directory_path, "alignDb.dict") mci_path = path.join(output_directory_path, "alignDb.mci") cluster_path = path.join(output_directory_path, "mcl_clusters.tsv") output_path = path.join(output_directory_path, "mcl_clusters.convert.tsv") logging.info(' - Preparing network') ortholog_dict = dict() cmd = f"mcxload \ -abc {blast_abc} \ -write-tab {dict_path} \ -o {mci_path} \ --stream-mirror \ --stream-neg-log10 \ > /dev/null 2>&1" logging.debug(cmd) subprocess.call(cmd, shell=True) logging.debug('Finished') logging.info(' - Finding orthologs') ortholog_dict = dict() cmd = f'mcl \ {mci_path} \ -te {self.threads} \ -I {self.inflation} \ -o {cluster_path} \ > /dev/null 2>&1' logging.debug(cmd) subprocess.call(cmd, shell=True) logging.debug('Finished') logging.info(' - Reformatting output') ortholog_dict = dict() cmd = f'mcxdump \ -icl {cluster_path} \ -o {output_path} \ -tabr {dict_path} \ > /dev/null 2>&1' logging.debug(cmd) subprocess.call(cmd, shell=True) logging.debug('Finished') ortholog = 1 for line in open(output_path): ortholog_idx = "ortholog_%i" % ortholog ortholog_dict[ortholog_idx] = set() for protein in line.strip().split('\t'): ortholog_dict[ortholog_idx].add(protein) ortholog += 1 return ortholog_dict def parse_cluster_results(self, cluster_output_path, genomes_list, ortholog_dict, output_directory_path): ''' Parse cluster output in tab format. Inputs ------ from_cluster_results - String. Path to mmseqs2 clustering output file Yields ------- A cluster name, and a list of sequences in that cluster. ''' logging.info(' - Parsing input cluster file: %s', cluster_output_path) cluster_ids = set() previous_cluster_name = None counter = 0 genome_dictionary = {genome.name: genome for genome in genomes_list} output_hypothetical_annotations = path.join( output_directory_path, self.OUTPUT_HYPOTHETICAL_ANNOTATIONS) with open(output_hypothetical_annotations, 'w') as out_io: for line in open(cluster_output_path): cluster_id, member = line.strip().split('\t') genome_id, sequence_id = member.split('~') if cluster_id == previous_cluster_name: genome_dictionary[genome_id].add_cluster( sequence_id, "cluster_%i" % counter) else: counter += 1 previous_cluster_name = cluster_id cluster_ids.add("cluster_%i" % counter) genome_dictionary[genome_id].add_cluster( sequence_id, "cluster_%i" % counter) out_io.write( '\t'.join([genome_id, sequence_id, "cluster_%i" % counter]) + '\n') for ortholog, group in ortholog_dict.items(): for member in group: genome, protein = member.split('~') genome_dictionary[genome].add_ortholog(protein, ortholog) return cluster_ids def _default_hmmsearch_options(self): cmd = '' if self.bit: cmd += '-T %s ' % (str(self.bit)) else: cmd += '-E %s ' % (str(self.evalue)) return cmd def hmm_search(self, output_path, database, hmmcutoff): ''' Carry out a hmmsearch. Parameters ---------- input_genome_path - string. Path to file containing .faa file for an input genome output_path - string. Path to file to output results into databases - string. 
Path to HMM to use for searching ''' input_genome_path = path.join(self.output_directory, self.GENOME_PROTEINS) cmd = "ls %s | sed 's/%s//g' | parallel -j %s\ hmmsearch \ --cpu %s \ -o /dev/null \ --noali \ --domtblout %s/{}%s " \ % (input_genome_path, self.PROTEINS_SUFFIX, self.parallel, self.threads, output_path, self.ANNOTATION_SUFFIX) if hmmcutoff: if (self.cut_ga or self.cut_nc or self.cut_tc): if self.cut_ga: cmd += " --cut_ga " if self.cut_nc: cmd += " --cut_nc " if self.cut_tc: cmd += " --cut_tc " else: cmd += self._default_hmmsearch_options() else: cmd += self._default_hmmsearch_options() cmd += "%s %s/{}.faa 2> /dev/null" % (database, input_genome_path) logging.debug(cmd) subprocess.call(cmd, shell=True) logging.debug('Finished') def generate_gff_files(self, genomes_list): ''' Write GFF files for each of the genome objects in genomes_list Parameters ---------- genomes_list - List. List of Genome objects ''' output_directory_path = path.join(self.output_directory, self.GENOME_GFF) mkdir(output_directory_path) for genome in genomes_list: logging.info(' - Generating .gff file for %s', genome.name) gff_output = path.join(output_directory_path, genome.name + self.GFF_SUFFIX) Writer.write_gff(genome, gff_output) def rename_fasta(self, genomes_list): ''' Rename the called proteins with annotation ids. Parameters ---------- genomes_list - List. List of Genome objects ''' seqio = SequenceIO() for genome in genomes_list: file_object, fname = tempfile.mkstemp(suffix='.faa', text=True) if genome.gene: fd_gene, fname_gene = tempfile.mkstemp(suffix='.fna', text=True) with open(fname_gene, 'w') as out_gene_io: for description, sequence in seqio.each(open(genome.gene)): name = description.partition(' ')[0] annotations = ' '.join( genome.sequences[name].all_annotations()) out_gene_io.write(">%s %s\n" % (name, annotations)) out_gene_io.write(genome.sequences[name].gene + '\n') close(fd_gene) logging.debug('Moving %s to %s', fname_gene, genome.gene) shutil.move(fname_gene, genome.gene) with open(fname, 'w') as out_io: for description, sequence in seqio.each(open(genome.path)): name = description.partition(' ')[0] annotations = ' '.join( genome.sequences[name].all_annotations()) out_io.write(">%s %s\n" % (name, annotations)) out_io.write(str(sequence) + '\n') close(file_object) logging.debug('Moving %s to %s', fname, genome.path) shutil.move(fname, genome.path) def pickle_objects(self, genomes_list): ''' Store annotated genome objects as pickles. Parameters ---------- genomes_list - List. List of Genome objects ''' output_directory_path = path.join(self.output_directory, self.GENOME_OBJ) mkdir(output_directory_path) for genome in genomes_list: genome_pickle_path = path.join(output_directory_path, genome.name + self.PICKLE_SUFFIX) with open(genome_pickle_path, 'wb') as output: pickle.dump(genome, output) def list_splitter(self, input_list, chunk_number, chunk_max): """ An iterator that separates a list into a number of smaller lists (chunk_number). 
Maximum size for the sub-lists can also be specified (chunk_max) """ list_size = float(len(input_list)) chunk_size = int(round((list_size / chunk_number), 0)) if chunk_size > chunk_max: chunk_size = chunk_max elif chunk_size < 1: chunk_size = list_size while list_size > 0: if len(input_list) <= chunk_size: yield input_list del input_list else: yield input_list[:chunk_size] del input_list[:chunk_size] try: list_size = len(input_list) except NameError: list_size = 0 def parse_genome_inputs(self, genome_directory, protein_directory, genome_files, protein_files): ''' Inputs ------ Outputs ------- ''' prep_genomes_list = list() genomes_list = list() if protein_directory: logging.info("Using provided proteins") protein_genome_list = list() for protein_file in listdir(protein_directory): protein_genome_list.append( path.join(protein_directory, protein_file)) directory = self.prep_genome( protein_genome_list, path.join(self.output_directory, self.GENOME_PROTEINS)) for genome_proteins_file in listdir(directory): if genome_proteins_file.endswith(self.suffix): genome = (self.light, path.join(directory, genome_proteins_file), None, None) prep_genomes_list.append(genome) elif protein_files: logging.info("Using provided proteins") genome_proteins_path = path.join(self.output_directory, self.GENOME_PROTEINS) directory = self.prep_genome(protein_files, genome_proteins_path) for protein_file in listdir(directory): protein_file_path = path.join(directory, path.basename(protein_file)) prep_genomes_list.append( (self.light, protein_file_path, None, None)) elif genome_directory: logging.info("Calling proteins for annotation") prep_genomes_list = self.call_proteins(genome_directory) directory = genome_directory elif genome_files: logging.info("Calling proteins for annotation") directory = self.prep_genome( genome_files, path.join(self.output_directory, self.GENOME_BIN)) prep_genomes_list = self.call_proteins(directory) for chunk in self.list_splitter(prep_genomes_list, self.chunk_number, self.chunk_max): genomes_list += self.pool.map(parse_genomes, chunk) return genomes_list def annotate_pipeline(self, genome_directory, protein_directory, genome_files, protein_files): ''' Run Annotate pipeline for enrichM Parameters ---------- genome_directory - String. Path to directory containing genomes protein_directory - String. Path to directory containing proteins (.faa files) for genomes genome_files - List. List of strings, each to a .fna genome file. protein_files - List. List of strings, each to a .faa proteins file. 
''' logging.info("Running pipeline: annotate") logging.info("Setting up for genome annotation") genomes_list = self.parse_genome_inputs(genome_directory, protein_directory, genome_files, protein_files) if genomes_list: logging.info("Starting annotation:") if (self.annotate_cluster or self.annotate_ortholog): logging.info( ' - Annotating genomes with hypothetical clusters') cluster_ids, ortholog_ids = self.annotate_hypothetical( genomes_list) logging.info(' - Generating hypotheticals frequency table') matrix_generator = MatrixGenerator( MatrixGenerator.HYPOTHETICAL, cluster_ids) freq_table = path.join(self.output_directory, self.OUTPUT_CLUSTER) matrix_generator.write_matrix(genomes_list, self.count_domains, freq_table) if self.annotate_ortholog: matrix_generator = MatrixGenerator( MatrixGenerator.ORTHOLOG, ortholog_ids) freq_table = path.join(self.output_directory, self.OUTPUT_ORTHOLOG) matrix_generator.write_matrix(genomes_list, self.count_domains, freq_table) if self.annotate_ko: annotation_type = AnnotationParser.BLASTPARSER logging.info( ' - Annotating genomes with ko ids using DIAMOND') self.annotate_diamond(genomes_list, self.databases.KO_DB, annotation_type, AnnotationParser.KO, self.GENOME_KO) logging.info(' - Generating ko frequency table') matrix_generator = MatrixGenerator(MatrixGenerator.KO) freq_table = path.join(self.output_directory, self.OUTPUT_KO) matrix_generator.write_matrix(genomes_list, self.count_domains, freq_table) if self.annotate_ko_hmm: annotation_type = AnnotationParser.HMMPARSER logging.info(' - Annotating genomes with ko ids using HMMs') self.hmmsearch_annotation( genomes_list, path.join(self.output_directory, self.GENOME_KO_HMM), self.databases.KO_HMM_DB, AnnotationParser.KO, annotation_type) logging.info(' - Generating ko frequency table') matrix_generator = MatrixGenerator(MatrixGenerator.KO) freq_table = path.join(self.output_directory, self.OUTPUT_KO_HMM) matrix_generator.write_matrix(genomes_list, self.count_domains, freq_table) if self.annotate_ec: annotation_type = AnnotationParser.BLASTPARSER logging.info(' - Annotating genomes with ec ids') self.annotate_diamond(genomes_list, self.databases.EC_DB, annotation_type, AnnotationParser.EC, self.GENOME_EC) logging.info(' - Generating ec frequency table') matrix_generator = MatrixGenerator(MatrixGenerator.EC) freq_table = path.join(self.output_directory, self.OUTPUT_EC) matrix_generator.write_matrix(genomes_list, self.count_domains, freq_table) if self.annotate_pfam: annotation_type = AnnotationParser.HMMPARSER logging.info(' - Annotating genomes with pfam ids') self.hmmsearch_annotation( genomes_list, path.join(self.output_directory, self.GENOME_PFAM), self.databases.PFAM_DB, AnnotationParser.PFAM, annotation_type) logging.info(' - Generating pfam frequency table') matrix_generator = MatrixGenerator(MatrixGenerator.PFAM) freq_table = path.join(self.output_directory, self.OUTPUT_PFAM) matrix_generator.write_matrix(genomes_list, self.count_domains, freq_table) if self.annotate_tigrfam: annotation_type = AnnotationParser.HMMPARSER logging.info(' - Annotating genomes with tigrfam ids') self.hmmsearch_annotation( genomes_list, path.join(self.output_directory, self.GENOME_TIGRFAM), self.databases.TIGRFAM_DB, AnnotationParser.TIGRFAM, annotation_type) logging.info(' - Generating tigrfam frequency table') matrix_generator = MatrixGenerator(MatrixGenerator.TIGRFAM) freq_table = path.join(self.output_directory, self.OUTPUT_TIGRFAM) matrix_generator.write_matrix(genomes_list, self.count_domains, freq_table) if 
self.annotate_cazy: annotation_type = AnnotationParser.HMMPARSER logging.info(' - Annotating genomes with CAZY ids') self.hmmsearch_annotation( genomes_list, path.join(self.output_directory, self.GENOME_CAZY), self.databases.CAZY_DB, AnnotationParser.CAZY, annotation_type) logging.info(' - Generating CAZY frequency table') matrix_generator = MatrixGenerator(MatrixGenerator.CAZY) freq_table = path.join(self.output_directory, self.OUTPUT_CAZY) matrix_generator.write_matrix(genomes_list, self.count_domains, freq_table) if hasattr(list(genomes_list[0].sequences.values())[0], "prod_id"): logging.info('Generating .gff files:') self.generate_gff_files(genomes_list) logging.info('Renaming protein headers') self.rename_fasta(genomes_list) if not self.light: logging.info('Storing genome objects') self.pickle_objects(genomes_list) logging.info('Finished annotation') else: logging.error('No files found with %s suffix in input directory', self.suffix)
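# Round-trip sketch (illustrative; the path below is a placeholder): Genome
# objects written by Annotate.pickle_objects() above can be reloaded for
# downstream parsing.
import pickle

def load_genome_pickle(genome_pickle_path):
    '''Load a Genome object previously written by Annotate.pickle_objects().'''
    with open(genome_pickle_path, 'rb') as input_io:
        return pickle.load(input_io)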
class NetworkAnalyser: """ Prepare metagenome, metatranscriptome, metabolomic data for constructing SIF network files. """ MATRIX = 'matrix' NETWORK = 'network' EXPLORE = 'explore' DEGRADE = 'degrade' PATHWAY = 'pathway' ANNOTATE = 'annotate' ENRICHMENT = 'enrichment' MODULE_AB = 'module_ab' TRAVERSE = 'traverse' NETWORK_OUTPUT_FILE = 'network.tsv' METADATA_OUTPUT_FILE = 'metadata.tsv' TRAVERSE_OUTPUT_FILE = 'traverse.tsv' def __init__(self): self.databases = Databases() self.reactions = self.databases.r() self.reaction_to_ko = self.databases.r2k() def average(self, input_dictionary): ''' Take the average of the values of a dictionary of dictionaries ''' for sample_group, group_dict in input_dictionary.items(): for group, reaction_dict in group_dict.items(): for reaction, value in reaction_dict.items(): input_dictionary[sample_group][group][reaction] = sum(value) / len(value) return input_dictionary def median_genome_abundance(self, sample_abundance_dict, sample_metadata): """ Create a dictionary with the median abundance in sample_abundance_dict using sample_metadata as a reference. """ median_sample_abundance = dict() for group, samples in sample_metadata.items(): median_sample_abundance[group] = dict() sample_dictionaries = [sample_abundance_dict[sample] for sample in samples] genomes = set(list(itertools.chain(*[list(sample_dictionary.keys()) for sample_dictionary in sample_dictionaries]))) for genome in genomes: abundances = [sample_dictionary[genome] for sample_dictionary in sample_dictionaries] median_sample_abundance[group][genome] = statistics.median(abundances) return median_sample_abundance def normalise_by_abundance(self, median_sample_abundances, reaction_abundance_dict, group_to_genome, genome_to_group, genome_groups): normalised_abundance_dict = dict() for sample_group in list(median_sample_abundances.keys()): normalised_abundance_dict[sample_group] = dict() for genome_group in genome_groups: normalised_abundance_dict[sample_group][genome_group] = dict() for sample_group, genome_abundances in median_sample_abundances.items(): for genome, genome_abundance in genome_abundances.items(): if(genome in genome_to_group and genome in reaction_abundance_dict): for reaction in list(reaction_abundance_dict[genome].keys()): normalised_value = reaction_abundance_dict[genome][reaction]*genome_abundance genome_group = next(iter(genome_to_group[genome])) if reaction in normalised_abundance_dict[sample_group][genome_group]: normalised_abundance_dict[sample_group][genome_group][reaction].append( normalised_value ) else: normalised_abundance_dict[sample_group][genome_group][reaction] = [normalised_value] return normalised_abundance_dict def average_tpm_by_sample(self, tpm_results, sample_metadata): output_dict = dict() tpm_dict, annotations, genomes = tpm_results for group, samples in sample_metadata.items(): output_dict[group] = dict() for sample in samples: for annotation in annotations: if str.encode(sample) in tpm_dict: for genome in genomes: if genome not in output_dict[group]: output_dict[group][genome] = dict() if annotation not in output_dict[group][genome]: output_dict[group][genome][annotation] = list() if genome in tpm_dict[str.encode(sample)]: if annotation in tpm_dict[str.encode(sample)][genome]: output_dict[group][genome][annotation].append(tpm_dict[str.encode(sample)][genome][annotation]) else: output_dict[group][genome][annotation].append(0.0) else: output_dict[group][genome][annotation].append(0.0) for genome, values in output_dict[group].items(): for annotation in values: 
output_dict[group][genome][annotation] = sum(output_dict[group][genome][annotation])/len(output_dict[group][genome][annotation]) return output_dict def average_tpm_values(self, transriptome_abundance_dict, group_metadata): output_dict = dict() reactions = list(self.reactions.keys()) for genome_group_name, group_reaction_abundance_dict in transriptome_abundance_dict.items(): output_dict[genome_group_name] = dict() for group, members in group_metadata.items(): output_dict[genome_group_name][group] = dict() for reaction in reactions: to_average = list() for member in members: if member in group_reaction_abundance_dict: if str.encode(reaction) in group_reaction_abundance_dict[member]: to_average.append(group_reaction_abundance_dict[member][str.encode(reaction)]) else: to_average.append(0.0) else: to_average.append(0.0) average_value = sum(to_average) / len(to_average) output_dict[genome_group_name][group][reaction] = average_value return output_dict def aggregate_dictionary(self, reference_dict, matrix_dict): output_dict_mean = dict() for sample, ko_abundances in matrix_dict.items(): output_dict_mean[sample] = dict() for reaction, ko_list in reference_dict.items(): abundances = list() for ko in ko_list: if ko in ko_abundances: if ko_abundances[ko]>0: abundances.append(ko_abundances[ko]) else: logging.debug("ID not found in input matrix: %s" % ko) if any(abundances): abundance_mean = sum(abundances)/len(abundances) # average of the abundances... else: abundance_mean = 0 output_dict_mean[sample][reaction] = abundance_mean return output_dict_mean def mock_metadata(self, genomes): genome_to_group = {genome:set([genome]) for genome in genomes} genome_groups = set(genomes) group_to_genome = dict(genome_to_group) # Make a copy here return genome_to_group, genome_groups, group_to_genome def network_pipeline(self, subparser_name, matrix, genome_metadata_path, transcriptome_abundances_path, transcriptome_metadata_path, metagenome_abundances, metagenome_metadata_path, metabolome, enrichment_output, depth, filter, limit, queries, output_directory): ''' Parameters ---------- matrix transcriptome_abundances_path metagenome_abundances metagenome_metadata_path metabolome enrichment_output depth filter limit queries output_directory ''' orthology_matrix, genome_names, _ = Parser.parse_simple_matrix(matrix) if genome_metadata_path: genome_to_group, genome_groups, group_to_genome = \ Parser.parse_metadata_matrix(genome_metadata_path) else: genome_to_group, genome_groups, group_to_genome = \ self.mock_metadata(genome_names) reaction_matrix = self.aggregate_dictionary(self.reaction_to_ko, orthology_matrix) # Read in fisher results if enrichment_output: logging.info('Parsing input enrichment results') fisher_results = Parser.parse_enrichment_output(enrichment_output) else: logging.info('No enrichment results provided') fisher_results = None # Read in metabolome abundances if metabolome: logging.info('Parsing metabolome abundances') abundances_metabolome = Parser.parse_simple_matrix(metabolome) else: logging.info('No metabolome abundances provided') abundances_metabolome = None # Read in genome metagenome_abundances if metagenome_abundances: logging.info('Parsing input genome abundances') sample_abundance = Parser.parse_simple_matrix(metagenome_abundances)[0] sample_metadata = Parser.parse_metadata_matrix(metagenome_metadata_path)[2] else: # FIXME : There's always a better way than faking it. 
logging.info('No genome abundances provided') sample_abundance = {'MOCK': {x:1 for x in list(reaction_matrix.keys())} } sample_metadata = {"abundance": ['MOCK']} median_sample_abundances = self.median_genome_abundance(sample_abundance, sample_metadata) normalised_abundance_dict = self.normalise_by_abundance(median_sample_abundances, reaction_matrix, group_to_genome, genome_to_group, genome_groups) abundances_metagenome = self.average(normalised_abundance_dict) # Read in expression (TPM) values if transcriptome_abundances_path: logging.info("Parsing detectM TPM abundances") transcriptome_metadata = Parser.parse_metadata_matrix(transcriptome_metadata_path)[2] transcriptome_abundance_dict = self.average_tpm_by_sample(Parser.parse_tpm_values(transcriptome_abundances_path), transcriptome_metadata) transcriptome_abundances = self.average_tpm_values(transcriptome_abundance_dict, group_to_genome) else: transcriptome_abundances = None network_builder = NetworkBuilder(group_to_genome, abundances_metagenome, transcriptome_abundances, abundances_metabolome, fisher_results) # Run the subcommand specified if subparser_name == self.EXPLORE: network_lines, node_metadata = network_builder.query_matrix(queries, depth) elif subparser_name == self.PATHWAY: network_lines, node_metadata = network_builder.pathway_matrix(limit, filter) # Write the outputs Writer.write(network_lines, os.path.join(output_directory, self.NETWORK_OUTPUT_FILE)) Writer.write(node_metadata, os.path.join(output_directory, self.METADATA_OUTPUT_FILE)) logging.info('Finished the %s pipeline' % subparser_name)
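# Toy walk-through (illustrative only, fabricated ids) of the aggregation
# performed by aggregate_dictionary(): each reaction's abundance in a sample
# is the mean of the non-zero abundances of the KOs mapped to that reaction.
#
#   reference_dict = {'R00001': ['K00001', 'K00002'], 'R00002': ['K00003']}
#   matrix_dict = {'sample_A': {'K00001': 4.0, 'K00002': 0.0, 'K00003': 2.0}}
#
#   NetworkAnalyser().aggregate_dictionary(reference_dict, matrix_dict)
#   -> {'sample_A': {'R00001': 4.0, 'R00002': 2.0}}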