def init_r_numbers(kgml_dir, step, total_steps):
    """Returns a RNumbers object representing associations in KGML files between
    R numbers, species, genes and pathways, for every species in kgml_dir.

    The RNumbers object is retrieved from a pickle file if it exists.

    :param kgml_dir: directory with a subdirectory for every species that
        CoMetGeNe was ran on, where each subdirectory contains metabolic
        pathways for the species in question in KGML format
    :param step: current step during initialization
    :param total_steps: total number of initialization steps
    :return: RNumbers object
    """
    print '\t(%d/%d) Parsing KGML files ...' % (step, total_steps),
    if not os.path.exists(PICKLE_RN):
        r_numbers = get_r_numbers(kgml_dir)
        pickle(PICKLE_RN, r_numbers)
    else:
        r_numbers = unpickle(PICKLE_RN)
        for r_number in r_numbers.r_numbers:
            for organism in r_number.organisms:
                if organism.name not in os.listdir(kgml_dir):
                    raise TrailGroupingError
    print 'done'

    return r_numbers
def init_species_dict(kgml_dir, step, total_steps):
    """Returns a dictionary storing genomic information for every species in
    kgml_dir.

    The dict is retrieved from a pickle file if it exists.

    :param kgml_dir: directory with a subdirectory for every species that
        CoMetGeNe was ran on, where each subdirectory contains metabolic
        pathways for the species in question in KGML format
    :param step: current step during initialization
    :param total_steps: total number of initialization steps
    :return: dict storing genome information for every species in the data set
        with the exception of the reference species
    """
    print '\t(%d/%d) Building genomes for all species ...' % (step,
                                                              total_steps),
    if not os.path.exists(PICKLE_GEN):
        species_dict = get_species_dict(kgml_dir)
        pickle(PICKLE_GEN, species_dict)
    else:
        species_dict = unpickle(PICKLE_GEN)
        for org in os.listdir(kgml_dir):
            if org not in species_dict:
                raise TrailGroupingError
    print 'done'

    return species_dict
# Example #3
# 0
def remove_non_neighboring_genes(organism):
    """Drops 'singleton' genes from every chromosome and strand of the given
    Species object 'organism', i.e. genes that are not neighbors of other
    genes present in the Species object.

    Singleton genes are almost always present in Chromosome objects of Species
    objects: a Species object stores genomic information for genes involved in
    CoMetGeNe trails common to the reference species (for which the Species
    object is created) and at least one other species in the data set.

    For each chromosome, the full ordered gene list of each strand is obtained
    from the genomes unpickled from PICKLE_GENOME. It is passed to
    determine_genes_to_remove to select the genes to drop, and then to
    delimit_gene_groups to fill in the groups_plus and groups_minus attributes.

    The Chromosome objects of 'organism' are modified in place; nothing is
    returned.

    :param organism: Species object with chromosomes and gene lists for every
        chromosomal strand
    """
    species = organism.name
    all_genomes = unpickle(PICKLE_GENOME)
    species_genes = all_genomes[species]

    for chr_name in get_species_chromosomes(all_genomes, species):
        # Genes of the species on each strand of this chromosome, then the
        # same genes in the order produced by order_genes.
        on_plus = get_genes_on_strand(
            all_genomes, species, chr_name, species_genes, True)
        on_minus = get_genes_on_strand(
            all_genomes, species, chr_name, species_genes, False)
        reference_plus = order_genes(on_plus, species, all_genomes)
        reference_minus = order_genes(on_minus, species, all_genomes)

        target = organism.find_chromosome(chr_name)
        assert target is not None

        # Both removal lists are computed before either strand is modified.
        singletons_plus = determine_genes_to_remove(
            target.plus, reference_plus)
        singletons_minus = determine_genes_to_remove(
            target.minus, reference_minus)

        for strand_genes, singletons in ((target.plus, singletons_plus),
                                         (target.minus, singletons_minus)):
            for gene in singletons:
                strand_genes.remove(gene)

        target.groups_plus = delimit_gene_groups(target.plus, reference_plus)
        target.groups_minus = delimit_gene_groups(
            target.minus, reference_minus)
def init_reaction_sets(results, step, total_steps):
    """Returns a ReactionSets object storing CoMetGeNe trail as CoMetGeNe
    reaction sets.

    The ReactionSets objects is retrieved from a pickle file if it exists.

    :param results: parsed CoMetGeNe results
    :param step: current step during initialization
    :param total_steps: total number of initialization steps
    :return: ReactionSets object
    """
    print '\t(%d/%d) Generating reaction sets ...' % (step, total_steps),
    if not os.path.exists(PICKLE_RS):
        reaction_sets = get_reaction_sets(results)
        pickle(PICKLE_RS, reaction_sets)
    else:
        reaction_sets = unpickle(PICKLE_RS)
    print 'done'

    return reaction_sets
# Example #5
# 0
def main():
    """Runs CoMetGeNe.py for every species in the data set.

    The steps are executed in this order:

    1. retrieve_pathways() - metabolic pathways are retrieved from KEGG;
    2. retrieve_genomes(genomes) - genomic information is retrieved, starting
       from the genomes unpickled from PICKLE_GENOME when that file exists and
       from an empty dict otherwise;
    3. retrieve_ec_numbers() - EC number associations are extracted;
    4. run_CoMetGeNe() - CoMetGeNe.py is run for all species, for all
       combinations of the gap parameters.

    Output files and directories are created as needed.
    """
    retrieve_pathways()

    if os.path.exists(PICKLE_GENOME):
        genomes = unpickle(PICKLE_GENOME)
    else:
        genomes = {}
    retrieve_genomes(genomes)

    retrieve_ec_numbers()

    run_CoMetGeNe()
def get_species_object(species, species_genes=None):
    """Creates and returns a Species object with the appropriate Chromosome
    list and gene contents.

    Genomes are unpickled from PICKLE_GENOME. For every chromosome reported by
    get_species_chromosomes, a Chromosome is added to the Species object and
    filled with the selected genes of the plus and minus strands, in the order
    produced by order_genes.

    :param species: KEGG organism code
    :param species_genes: genes of 'species' involved in trails common to
        'species' and at least another species; if None, all genes for 'species'
        are used
    :return: a Species object with chromosomes and gene lists for every
        chromosomal strand
    :raises TrailGroupingError: if species_genes is None and 'species' is not
        present in the unpickled genomes
    """
    org = Species(species)
    genomes = unpickle(PICKLE_GENOME)

    # Validate before the genomes are queried for this species, so that the
    # documented error is raised rather than whatever
    # get_species_chromosomes does for a species it does not know.
    # NOTE(review): the helper's behavior for a missing species is not visible
    # here - confirm.
    if species_genes is None and species not in genomes:
        raise TrailGroupingError

    chromosomes = get_species_chromosomes(genomes, species)
    genes = genomes[species] if species_genes is None else species_genes.keys()

    for chromosome in chromosomes:
        current_chr = Chromosome(chromosome)
        org.add_chromosome(current_chr)

        # Selected genes located on each strand of this chromosome ...
        genes_plus = get_genes_on_strand(genomes, species, chromosome, genes,
                                         True)
        genes_minus = get_genes_on_strand(genomes, species, chromosome, genes,
                                          False)

        # ... put in the order defined by order_genes before being stored.
        genes_plus = order_genes(genes_plus, species, genomes)
        genes_minus = order_genes(genes_minus, species, genomes)

        for gene in genes_plus:
            current_chr.add_gene_plus(gene)
        for gene in genes_minus:
            current_chr.add_gene_minus(gene)

    return org