def init_r_numbers(kgml_dir, step, total_steps): """Returns a RNumbers object representing associations in KGML files between R numbers, species, genes and pathways, for every species in kgml_dir. The RNumbers object is retrieved from a pickle file if it exists. :param kgml_dir: directory with a subdirectory for every species that CoMetGeNe was ran on, where each subdirectory contains metabolic pathways for the species in question in KGML format :param step: current step during initialization :param total_steps: total number of initialization steps :return: RNumbers object """ print '\t(%d/%d) Parsing KGML files ...' % (step, total_steps), if not os.path.exists(PICKLE_RN): r_numbers = get_r_numbers(kgml_dir) pickle(PICKLE_RN, r_numbers) else: r_numbers = unpickle(PICKLE_RN) for r_number in r_numbers.r_numbers: for organism in r_number.organisms: if organism.name not in os.listdir(kgml_dir): raise TrailGroupingError print 'done' return r_numbers
def init_species_dict(kgml_dir, step, total_steps): """Returns a dictionary storing genomic information for every species in kgml_dir. The dict is retrieved from a pickle file if it exists. :param kgml_dir: directory with a subdirectory for every species that CoMetGeNe was ran on, where each subdirectory contains metabolic pathways for the species in question in KGML format :param step: current step during initialization :param total_steps: total number of initialization steps :return: dict storing genome information for every species in the data set with the exception of the reference species """ print '\t(%d/%d) Building genomes for all species ...' % (step, total_steps), if not os.path.exists(PICKLE_GEN): species_dict = get_species_dict(kgml_dir) pickle(PICKLE_GEN, species_dict) else: species_dict = unpickle(PICKLE_GEN) for org in os.listdir(kgml_dir): if org not in species_dict: raise TrailGroupingError print 'done' return species_dict
def remove_non_neighboring_genes(organism):
    """Drops 'singleton' genes from every chromosome and strand of the given
    Species object, i.e. genes that are not neighbors of other genes present
    in the Species object.

    Singleton genes are almost always present in Chromosome objects of
    Species objects: a Species object stores genomic information for genes
    involved in CoMetGeNe trails common to the reference species (for which
    the Species object is created) and at least one other species in the data
    set.

    The Chromosome objects of 'organism' are modified in place: genes are
    removed from their 'plus' and 'minus' lists, and their 'groups_plus' and
    'groups_minus' attributes are (re)assigned.

    :param organism: Species object with chromosomes and gene lists for every
        chromosomal strand
    """
    genomes = unpickle(PICKLE_GENOME)
    species = organism.name
    species_genes = genomes[species]

    for chr_name in get_species_chromosomes(genomes, species):
        # All genes of the species on this chromosome, per strand (True
        # selects the plus strand, False the minus strand), ordered by
        # order_genes.
        all_plus = get_genes_on_strand(
            genomes, species, chr_name, species_genes, True)
        all_minus = get_genes_on_strand(
            genomes, species, chr_name, species_genes, False)
        reference_plus = order_genes(all_plus, species, genomes)
        reference_minus = order_genes(all_minus, species, genomes)

        target = organism.find_chromosome(chr_name)
        assert target is not None

        # Work out the removals for both strands before touching either
        # gene list.
        singletons_plus = determine_genes_to_remove(
            target.plus, reference_plus)
        singletons_minus = determine_genes_to_remove(
            target.minus, reference_minus)

        for gene in singletons_plus:
            target.plus.remove(gene)
        for gene in singletons_minus:
            target.minus.remove(gene)

        # Group whatever genes remain on each strand.
        target.groups_plus = delimit_gene_groups(target.plus, reference_plus)
        target.groups_minus = delimit_gene_groups(
            target.minus, reference_minus)
def init_reaction_sets(results, step, total_steps): """Returns a ReactionSets object storing CoMetGeNe trail as CoMetGeNe reaction sets. The ReactionSets objects is retrieved from a pickle file if it exists. :param results: parsed CoMetGeNe results :param step: current step during initialization :param total_steps: total number of initialization steps :return: ReactionSets object """ print '\t(%d/%d) Generating reaction sets ...' % (step, total_steps), if not os.path.exists(PICKLE_RS): reaction_sets = get_reaction_sets(results) pickle(PICKLE_RS, reaction_sets) else: reaction_sets = unpickle(PICKLE_RS) print 'done' return reaction_sets
def main():
    """Runs CoMetGeNe.py for every species in the data set.

    For every species in the data set, metabolic pathways and genomic
    information are retrieved from KEGG. EC number associations are then
    extracted. Finally, CoMetGeNe.py is run for all species, for all
    combinations of the gap parameters.

    Output files and directories are created as needed.
    """
    retrieve_pathways()

    # Start from the previously pickled genomes when available, from an empty
    # dict otherwise.
    genomes = unpickle(PICKLE_GENOME) if os.path.exists(PICKLE_GENOME) \
        else dict()
    retrieve_genomes(genomes)

    retrieve_ec_numbers()
    run_CoMetGeNe()
def get_species_object(species, species_genes=None):
    """Creates and returns a Species object with the appropriate Chromosome
    list and gene contents.

    Genomic information is read from the pickle file PICKLE_GENOME.

    :param species: KEGG organism code
    :param species_genes: genes of 'species' involved in trails common to
        'species' and at least another species; only its keys are used. If
        None, all genes for 'species' are used
    :raises TrailGroupingError: if species_genes is None and 'species' is
        absent from the pickled genomes
    :return: a Species object with chromosomes and gene lists for every
        chromosomal strand
    """
    org = Species(species)
    genomes = unpickle(PICKLE_GENOME)

    # Validate before the first lookup that uses 'species', so that an
    # unknown species is reported as TrailGroupingError instead of whatever
    # get_species_chromosomes does with a species missing from 'genomes'.
    if species_genes is None and species not in genomes:
        raise TrailGroupingError

    chromosomes = get_species_chromosomes(genomes, species)
    genes = genomes[species] if species_genes is None else species_genes.keys()

    for chromosome in chromosomes:
        current_chr = Chromosome(chromosome)
        org.add_chromosome(current_chr)

        # Select the genes of interest on each strand (True selects the plus
        # strand, False the minus strand), then order them with order_genes
        # before adding them to the chromosome.
        genes_plus = get_genes_on_strand(
            genomes, species, chromosome, genes, True)
        genes_minus = get_genes_on_strand(
            genomes, species, chromosome, genes, False)
        genes_plus = order_genes(genes_plus, species, genomes)
        genes_minus = order_genes(genes_minus, species, genomes)

        for gene in genes_plus:
            current_chr.add_gene_plus(gene)
        for gene in genes_minus:
            current_chr.add_gene_minus(gene)

    return org