Exemplo n.º 1
0
    def test_gene_sets(self):
        """Check how a GeneSets collection behaves when its members disagree.

        One set is built from the fixture attributes and two more are created
        with the ('Test', 'test') hierarchy and organism '3702'. The test
        expects the collection to report no common organism and no common
        hierarchy, to expose more than one hierarchy, and to split into fewer
        groups than it has members.
        """
        fixture_set = GeneSet(gs_id=self.test_gs_id,
                              name=self.test_name,
                              genes=self.test_genes,
                              hierarchy=self.test_hierarchy,
                              organism=self.test_organism,
                              link='')

        # Two extra sets that share a hierarchy and an organism with each other.
        extra_sets = [
            GeneSet(gs_id=set_id,
                    name=set_name,
                    hierarchy=('Test', 'test'),
                    organism='3702')
            for set_id, set_name in (('test2', 'test_name2'),
                                     ('test3', 'test_name3'))
        ]

        collection = GeneSets([fixture_set] + extra_sets)
        self.assertIsNotNone(collection)

        # Asking for a single organism/hierarchy must fail for this mix.
        with self.assertRaises(GeneSetException):
            collection.common_org()
        with self.assertRaises(GeneSetException):
            collection.common_hierarchy()

        self.assertGreater(len(collection.hierarchies()), 1)

        groups = collection.split_by_hierarchy()
        self.assertLess(len(groups), len(collection))
    def test_gene_set(self):
        """Check GeneSet's GMT description string and its equality operators."""
        full_set = GeneSet(
            gs_id=self.test_gs_id,
            name=self.test_name,
            genes=self.test_genes,
            hierarchy=self.test_hierarchy,
            organism=self.test_organism,
            link='',
        )
        minimal_set = GeneSet(gs_id='test2', name='test_name2')

        expected_description = 'test_gs,GO-biological_process,9606,test_name,_,_,_'
        self.assertEqual(full_set.gmt_description(), expected_description)

        # Both `==` and `!=` are exercised on purpose: assertNotEqual goes
        # through `!=`, the other two checks go through `==`.
        differs = full_set == minimal_set
        self.assertFalse(differs)
        self.assertNotEqual(full_set, minimal_set)
        same = full_set == full_set
        self.assertTrue(same)
Exemplo n.º 3
0
def dicty_mutant_gene_sets(org):
    """Build Dictybase mutant-phenotype gene sets.

    Only taxonomy id '352472' is handled; for any other ``org`` the function
    returns ``None``.

    For every phenotype, the first gene of each mutant is collected, the
    symbols are run through a ``GeneMatcher`` and the NCBI ids of the matched
    genes (as ints) become the members of one ``GeneSet``.

    :param org: taxonomy id as a string.
    :return: ``GeneSets`` with one set per phenotype, or ``None``.
    """
    dicty_tax_id = '352472'
    if org != dicty_tax_id:
        return None

    matcher = GeneMatcher(dicty_tax_id)
    phenotype_sets = []

    for phenotype, mutants in dicty.phenotypes.phenotype_mutants().items():
        symbols = [dicty.phenotypes.mutant_genes(mutant)[0]
                   for mutant in mutants]

        matcher.genes = symbols
        matcher.run_matcher()

        # Keep only genes for which the matcher found an NCBI id.
        ncbi_ids = [int(gene.ncbi_id)
                    for gene in matcher.genes
                    if gene.ncbi_id is not None]

        # Diagnostic output: some symbols were not matched.
        if len(symbols) != len(ncbi_ids):
            print(len(symbols), len(ncbi_ids))

        phenotype_sets.append(
            GeneSet(gs_id=phenotype,
                    name=phenotype,
                    genes=ncbi_ids,
                    hierarchy=('Dictybase', 'Phenotypes'),
                    organism='352472',
                    link=''))

    return GeneSets(phenotype_sets)
Exemplo n.º 4
0
def kegg_gene_sets(org):
    """Collect gene sets from the KEGG pathways of one organism.

    The cache is cleared first, then every pathway of the organism that has
    pathway attributes is turned into a ``GeneSet`` whose genes are the NCBI
    ids obtained through the organism's KEGG-to-NCBI mapper.

    :param org: taxonomy id; resolved to an organism name via ``taxonomy.name``.
    :return: ``GeneSets`` with one set per usable pathway.
    """
    caching.clear_cache()
    organism = kegg.KEGGOrganism(taxonomy.name(org))
    kegg_to_ncbi = organism.kegg_to_ncbi_mapper()
    hierarchy = ('KEGG', 'pathways')
    pathway_sets = []

    for pathway_id in organism.pathways():
        pathway = kegg.KEGGPathway(pathway_id)
        if not pathway.pathway_attributes():
            continue

        ncbi_genes = []
        for kegg_name in organism.get_genes_by_pathway(pathway_id):
            try:
                ncbi_id = kegg_to_ncbi[kegg_name.upper()]
            except KeyError:
                # KEGG names without an NCBI mapping are left out of the set.
                continue
            ncbi_genes.append(ncbi_id)

        pathway_sets.append(
            GeneSet(gs_id=pathway_id,
                    name=pathway.title,
                    genes=ncbi_genes,
                    hierarchy=hierarchy,
                    organism=org,
                    link=pathway.link))

    return GeneSets(pathway_sets)
Exemplo n.º 5
0
def omim_gene_sets(org):
    """Build gene sets from OMIM (Online Mendelian Inheritance in Man) diseases.

    Only taxonomy id '9606' is handled; for any other ``org`` the function
    returns ``None``.

    :param org: taxonomy id as a string.
    :return: ``GeneSets`` with one set per OMIM disease, or ``None``.
    """
    if org != '9606':
        return None

    matcher = GeneMatcher('9606')
    disease_sets = []

    for disease in omim.diseases():
        matcher.genes = omim.disease_genes(disease)
        matcher.run_matcher()

        # Keep only genes for which the matcher found an NCBI id.
        ncbi_ids = [int(gene.ncbi_id)
                    for gene in matcher.genes
                    if gene.ncbi_id is not None]

        # A link is only produced for diseases that carry an id.
        if disease.id:
            disease_link = OMIM_LINK.format(disease.id)
        else:
            disease_link = None

        disease_sets.append(
            GeneSet(gs_id=disease.id,
                    name=disease.name,
                    genes=ncbi_ids,
                    hierarchy=('OMIM', ),
                    organism='9606',
                    link=disease_link))

    return GeneSets(disease_sets)
Exemplo n.º 6
0
def reactome_gene_sets(org):
    """Prepare human pathway gene sets from Reactome pathways.

    Only taxonomy id '9606' is handled; for any other ``org`` the function
    returns ``None``.

    The zipped GMT file is downloaded into memory and read line by line. Each
    line is tab-separated: the first field is used as both the id and the name
    of the gene set, and every field from the third one on is treated as a
    gene symbol. Symbols are run through a ``GeneMatcher`` and the NCBI ids of
    the matched genes (as ints) become the members of the set.

    :param org: taxonomy id as a string.
    :return: ``GeneSets`` with one set per line of the file, or ``None``.
    """
    if org != '9606':
        return None

    gene_matcher = GeneMatcher('9606')

    # Read the whole archive into memory; ZipFile needs a seekable file object.
    with urlopen(REACTOME_DOWNLOAD_LINK) as url:
        memfile = io.BytesIO(url.read())

    # The archive member is opened in a `with` block so its handle is closed
    # deterministically instead of being left to the garbage collector.
    with ZipFile(memfile, 'r') as myzip:
        with myzip.open(REACTOME_FILE_NAME) as gmt_file:
            content = gmt_file.read().decode().splitlines()

    genesets = []
    for path in content:
        # Split once per line; slicing past the end already yields [].
        fields = path.split('\t')
        # NOTE(review): fields[1] is not used here - presumably it is the
        # Reactome pathway identifier; confirm whether it should be gs_id.
        gene_matcher.genes = fields[2:]
        gene_matcher.run_matcher()

        genes = [int(gene.ncbi_id)
                 for gene in gene_matcher.genes
                 if gene.ncbi_id is not None]

        genesets.append(
            GeneSet(gs_id=fields[0],
                    name=fields[0],
                    genes=genes,
                    hierarchy=('Reactome', 'Pathways'),
                    organism='9606',
                    link=''))

    return GeneSets(genesets)
Exemplo n.º 7
0
def cytoband_gene_sets(org):
    """Create cytoband gene sets from the Stanford Microarray Database file.

    Only taxonomy id '9606' is handled; for any other ``org`` the function
    returns ``None``.

    Each downloaded line is decoded and split on tabs: field 0 is the set id,
    field 1 the set name and the remaining fields are gene symbols, which are
    matched to NCBI ids (stored as ints).

    :param org: taxonomy id as a string.
    :return: ``GeneSets`` with one set per line, or ``None``.
    """
    if org != '9606':
        return None

    matcher = GeneMatcher('9606')

    with urlopen(CYTOBAND_DOWNLOAD_LINK) as stream:
        band_sets = []

        for raw_line in stream.read().splitlines():
            fields = raw_line.decode().split('\t')
            symbols = fields[2:]

            matcher.genes = symbols
            matcher.run_matcher()

            matched_ids = [int(gene.ncbi_id)
                           for gene in matcher.genes
                           if gene.ncbi_id is not None]

            # A line without any symbols always yields an empty gene list.
            band_genes = matched_ids if symbols else []

            band_sets.append(
                GeneSet(gs_id=fields[0],
                        name=fields[1],
                        genes=band_genes,
                        hierarchy=('Cytobands', ),
                        organism='9606',
                        link=''))

        return GeneSets(band_sets)
Exemplo n.º 8
0
    def add_custom_sets(self,
                        gene_sets_names,
                        gene_names,
                        hierarchy_title=None,
                        select_customs_flag=False):
        """Replace the custom gene sets with ones built from paired names.

        ``gene_sets_names`` and ``gene_names`` are iterated in lockstep: the
        i-th gene is added to the set named by the i-th set name. Existing
        custom sets are cleared first.

        :param gene_sets_names: iterable of set names, one per gene
            (the previous type comment declared ``np.ndarray``).
        :param gene_names: iterable of gene names, parallel to
            ``gene_sets_names`` (the previous type comment declared
            ``np.ndarray``).
        :param hierarchy_title: hierarchy assigned to every created set; also
            stored in ``self.custom_set_hier``.
        :param select_customs_flag: forwarded to ``update_gs_hierarchy``.
        :return: None
        """
        self.custom_set_hier = hierarchy_title
        self.clear_custom_sets()

        # Group gene names under the set they belong to.
        members_by_set = defaultdict(list)
        for set_name, gene_name in zip(gene_sets_names, gene_names):
            members_by_set[set_name].append(gene_name)

        # common_org() is evaluated per created set, so it is not called at
        # all when there is nothing to add.
        custom_sets = [
            GeneSet(gs_id=set_name,
                    hierarchy=self.custom_set_hier,
                    organism=self.gs_object.common_org(),
                    name=set_name,
                    genes=set(members))
            for set_name, members in members_by_set.items()
        ]

        self.gs_object.update(custom_sets)
        self.update_gs_hierarchy(select_customs_flag=select_customs_flag)
Exemplo n.º 9
0
    def test_gmt_file_format(self):
        """Round-trip a single gene set through the GMT file format.

        The set is written to a temporary file, the first line is checked to
        contain at least one tab-separated column, and the file is read back
        and compared against the fixture hierarchy and organism.
        """
        gs = GeneSet(
            gs_id=self.test_gs_id,
            name=self.test_name,
            genes=self.test_genes,
            hierarchy=self.test_hierarchy,
            organism=self.test_organism,
            link='',
        )

        fd, file_name = mkstemp()
        # Only the path is needed; release the descriptor right away so it is
        # not held open while the file is reopened by name below.
        os.close(fd)

        try:
            # write to file
            write_sets = GeneSets([gs])
            write_sets.to_gmt_file_format(file_name)

            with open(file_name, 'r') as temp_f:
                line = temp_f.readline()
                columns = line.strip().split('\t')
                self.assertGreater(len(columns), 0)

            # read from file
            read_sets = GeneSets.from_gmt_file_format(file_name)
            self.assertIsNotNone(read_sets)
            self.assertGreater(len(read_sets), 0)
            self.assertEqual(read_sets.common_hierarchy(), self.test_hierarchy)
            self.assertEqual(read_sets.common_org(), self.test_organism)
        finally:
            # clean-up runs even when an assertion above fails, so the
            # temporary file is never left behind
            os.remove(file_name)
Exemplo n.º 10
0
def dicty_mutant_gene_sets(tax_id: str):
    """Write Dictybase mutant-phenotype gene sets to GMT files.

    Only taxonomy id '44689' is handled; for any other ``tax_id`` nothing is
    done. Nothing is returned: the sets are grouped by hierarchy and each
    group is written under ``{data_path}/gene_sets/``.

    :param tax_id: taxonomy id as a string.
    """
    if tax_id != '44689':
        return None

    matcher = GeneMatcher('44689')
    phenotype_sets = []

    for raw_phenotype, mutants in phenotypes.phenotype_mutants().items():
        # Commas are replaced by spaces in the phenotype used as id and name.
        phenotype = raw_phenotype.replace(",", " ")

        matcher.genes = [phenotypes.mutant_genes(mutant)[0]
                         for mutant in mutants]

        # Keep only genes that ended up with a gene id, stored as strings.
        gene_ids = {str(gene.gene_id)
                    for gene in matcher.genes
                    if gene.gene_id is not None}

        phenotype_sets.append(
            GeneSet(gs_id=phenotype,
                    name=phenotype,
                    genes=gene_ids,
                    hierarchy=('Dictybase', 'Phenotypes'),
                    organism=tax_id,
                    link=''))

    for group in GeneSets(phenotype_sets).split_by_hierarchy():
        group_hierarchy = group.common_hierarchy()
        target = f'{data_path}/gene_sets/{filename(group_hierarchy, tax_id)}'
        group.to_gmt_file_format(target)
Exemplo n.º 11
0
def cytoband_gene_sets(tax_id: str) -> None:
    """Write cytoband gene sets from the Stanford Microarray Database file.

    Only taxonomy id '9606' is handled; for any other ``tax_id`` nothing is
    done. Each downloaded line is decoded and split on tabs: field 0 is the
    set id, field 1 the set name and the remaining fields are gene symbols.
    The resulting sets are grouped by hierarchy and each group is written
    under ``{data_path}/gene_sets/``.

    :param tax_id: taxonomy id as a string.
    """
    if tax_id != '9606':
        return None

    download_link = 'http://statweb.stanford.edu/~tibs/GSA/cytobands-stanford.gmt'
    matcher = GeneMatcher('9606')

    with urlopen(download_link) as stream:
        band_sets = []

        for raw_line in stream.read().splitlines():
            fields = raw_line.decode().split('\t')
            symbols = fields[2:]
            matcher.genes = symbols

            # NOTE(review): gene ids are stored as-is here, without str().
            matched_ids = {gene.gene_id
                           for gene in matcher.genes
                           if gene.gene_id is not None}

            # A line without any symbols always yields an empty gene set.
            band_genes = matched_ids if symbols else set()

            band_sets.append(
                GeneSet(gs_id=fields[0],
                        name=fields[1],
                        genes=band_genes,
                        hierarchy=('Cytobands', ),
                        organism='9606',
                        link=''))

    for group in GeneSets(band_sets).split_by_hierarchy():
        group_hierarchy = group.common_hierarchy()
        target = f'{data_path}/gene_sets/{filename(group_hierarchy, tax_id)}'
        group.to_gmt_file_format(target)
Exemplo n.º 12
0
    def to_gene_set(term: go.Term) -> Optional[GeneSet]:
        """Turn one GO term into a GeneSet, or return None if it has no genes.

        ``annotations`` and ``tax_id`` are taken from the enclosing scope.
        """
        term_genes = annotations.get_genes_by_go_term(term.id)

        # Terms without any annotated gene produce no gene set.
        if len(term_genes) == 0:
            return None

        amigo_link = f'http://amigo.geneontology.org/amigo/term/{term.id}'
        return GeneSet(gs_id=term.id,
                       name=term.name,
                       genes=set(term_genes),
                       hierarchy=('GO', term.namespace),
                       organism=tax_id,
                       link=amigo_link)
Exemplo n.º 13
0
def reactome_gene_sets(tax_id: str) -> None:
    """Write human pathway gene sets built from Reactome pathways.

    Only taxonomy id '9606' is handled; for any other ``tax_id`` nothing is
    done. The zipped GMT file is downloaded into memory and read line by line.
    Each line is tab-separated: field 0 is the pathway name, field 1 the
    pathway id and every further field a gene symbol. The resulting sets are
    grouped by hierarchy and each group is written under
    ``{data_path}/gene_sets/``.

    :param tax_id: taxonomy id as a string.
    """
    if tax_id != '9606':
        return None

    download_link = 'http://www.reactome.org/download/current/ReactomePathways.gmt.zip'
    file_name = 'ReactomePathways.gmt'
    detail_link = 'https://reactome.org/content/detail/{}'

    gene_matcher = GeneMatcher('9606')

    # Read the whole archive into memory; ZipFile needs a seekable file object.
    with urlopen(download_link) as url:
        memfile = io.BytesIO(url.read())

    # The archive member is opened in a `with` block so its handle is closed
    # deterministically instead of being left to the garbage collector.
    with ZipFile(memfile, 'r') as myzip:
        with myzip.open(file_name) as gmt_file:
            content = gmt_file.read().decode().splitlines()

    genesets = []
    for path in content:
        # Split once per line; slicing past the end already yields [].
        fields = path.split('\t')
        gene_matcher.genes = fields[2:]

        genes = {str(gene.gene_id)
                 for gene in gene_matcher.genes
                 if gene.gene_id is not None}

        # Commas are replaced by spaces in the name and id - presumably
        # because the GMT description is comma-delimited; confirm.
        pathway = fields[0].replace(',', ' ')
        pathway_id = fields[1].replace(',', ' ')

        genesets.append(
            GeneSet(gs_id=pathway_id,
                    name=pathway,
                    genes=genes,
                    hierarchy=('Reactome', 'pathways'),
                    organism='9606',
                    link=detail_link.format(pathway_id)))

    for gs_group in GeneSets(genesets).split_by_hierarchy():
        hierarchy = gs_group.common_hierarchy()
        gs_group.to_gmt_file_format(
            f'{data_path}/gene_sets/{filename(hierarchy, tax_id)}')
def go_gene_sets(org):
    """Collect gene sets from the Gene Ontology for one organism.

    Every ontology term that has at least one annotated gene becomes a
    ``GeneSet`` placed in the ('GO', <term namespace>) hierarchy.

    :param org: organism passed to ``go.Annotations`` and stored on each set.
    :return: ``GeneSets`` with one set per annotated term.
    """
    ontology = go.Ontology()
    annotations = go.Annotations(org, ontology=ontology)

    term_sets = []
    for term_id, term in ontology.terms.items():
        term_genes = annotations.get_genes_by_go_term(term_id)
        hierarchy = ('GO', term.namespace)

        # Terms without any annotated gene are skipped.
        if len(term_genes) == 0:
            continue

        term_sets.append(
            GeneSet(gs_id=term_id,
                    name=term.name,
                    genes=term_genes,
                    hierarchy=hierarchy,
                    organism=org,
                    link=GO_TERM_LINK.format(term_id)))

    return GeneSets(term_sets)
Exemplo n.º 15
0
def kegg_gene_sets(tax_id: str) -> None:
    """Write gene sets built from the KEGG pathways of one organism.

    The cache is cleared first, then every pathway of the organism that has
    pathway attributes is turned into a ``GeneSet`` whose genes are the NCBI
    ids obtained through the organism's KEGG-to-NCBI mapper. Nothing is
    returned: the sets are grouped by hierarchy and each group is written
    under ``{data_path}/gene_sets/``.

    :param tax_id: taxonomy id; resolved to an organism name via
        ``taxonomy.name``.
    """
    caching.clear_cache()
    organism = kegg.KEGGOrganism(taxonomy.name(tax_id))
    kegg_to_ncbi = organism.kegg_to_ncbi_mapper()
    pathway_hierarchy = ('KEGG', 'Pathways')
    pathway_sets = []

    for pathway_id in organism.pathways():
        pathway = kegg.KEGGPathway(pathway_id)
        if not pathway.pathway_attributes():
            continue

        ncbi_genes = set()
        for kegg_name in organism.get_genes_by_pathway(pathway_id):
            try:
                ncbi_id = kegg_to_ncbi[kegg_name.upper()]
            except KeyError:
                # KEGG names without an NCBI mapping are left out of the set.
                continue
            ncbi_genes.add(ncbi_id)

        pathway_sets.append(
            GeneSet(gs_id=pathway_id,
                    name=pathway.title,
                    genes=ncbi_genes,
                    hierarchy=pathway_hierarchy,
                    organism=tax_id,
                    link=pathway.link))

    for group in GeneSets(pathway_sets).split_by_hierarchy():
        group_hierarchy = group.common_hierarchy()
        target = f'{data_path}/gene_sets/{filename(group_hierarchy, tax_id)}'
        group.to_gmt_file_format(target)
Exemplo n.º 16
0
def gene_marker_sets():
    """Write marker-gene sets from the Panglao and CellMarker tables.

    For each source table under ``{data_path}/marker_genes/`` the rows are
    grouped by organism ('Human' -> '9606', 'Mouse' -> '10090') and cell type.
    Every cell type becomes one ``GeneSet`` in the
    ('Marker Genes', <source>) hierarchy; genes equal to '?' are dropped.
    The sets are then grouped by hierarchy and written under
    ``{data_path}/gene_sets/``.
    """
    # (table file, hierarchy label) pairs, processed in this order.
    sources = (
        ('panglao_gene_markers.tab', 'Panglao'),
        ('cellMarker_gene_markers.tab', 'CellMarker'),
    )
    name_to_tax = {'Human': '9606', 'Mouse': '10090'}

    for file_name, source_label in sources:
        file_path = f'{data_path}/marker_genes/{file_name}'

        # Fresh grouping per table: tax id -> cell type -> list of gene ids.
        sets_by_org = {'9606': defaultdict(list), '10090': defaultdict(list)}

        for row in Table(file_path):
            organism_id = name_to_tax[row['Organism']]
            cell_type = row['Cell Type']
            entrez_id = row['Entrez ID']
            sets_by_org[organism_id][cell_type].append(entrez_id)

        for tax_id, cell_types in sets_by_org.items():
            marker_sets = [
                GeneSet(
                    gs_id=str(cell_type),
                    name=str(cell_type),
                    genes={str(gene) for gene in genes if gene != '?'},
                    hierarchy=('Marker Genes', source_label),
                    organism=tax_id,
                    link='')
                for cell_type, genes in cell_types.items()
            ]

            for group in GeneSets(marker_sets).split_by_hierarchy():
                group_hierarchy = group.common_hierarchy()
                target = f'{data_path}/gene_sets/{filename(group_hierarchy, tax_id)}'
                group.to_gmt_file_format(target)