示例#1
0
 def __init__(self, genes, resource_manager=None):
     """Set up an empty multigraph for the given genes.

     A default ResourceManager is created when no (truthy) resource
     manager is passed in. The GO DAG and the GO annotations are not
     loaded here; both attributes start out as None.
     """
     self.genes = genes
     self.graph = nx.MultiGraph()
     # Fall back to a freshly constructed manager if none was supplied
     self.resource_manager = (resource_manager if resource_manager
                              else ResourceManager())
     self.go_dag = self.goa = None
示例#2
0
 def __init__(self, genes, resource_manager=None):
     """Set up an empty multigraph and load the GO resources.

     A default ResourceManager is created when no (truthy) resource
     manager is passed in. The GO DAG is built from the OBO file reported
     by the resource manager and the GO annotations are loaded through
     ``_load_goa_gaf``.
     """
     self.genes = genes
     self.graph = nx.MultiGraph()
     # Fall back to a freshly constructed manager if none was supplied
     self.resource_manager = (resource_manager if resource_manager
                              else ResourceManager())
     obo_file = self.resource_manager.get_go_obo()
     self.go_dag = GODag(obo_file)
     self.goa = self._load_goa_gaf()
示例#3
0
def test_read_gene_list_entrez_mouse():
    """Check that a mouse Entrez ID is mapped to its MGI ID and HGNC symbol.

    The gene list is written into a temporary directory so that the test
    does not leave a stray file behind in the current working directory.
    """
    import os
    import tempfile

    rm = ResourceManager(base_folder=default_base_folder)
    with tempfile.TemporaryDirectory() as tmp_dir:
        gene_list_file = os.path.join(tmp_dir,
                                      'test_gene_list_entrez_mouse.txt')
        with open(gene_list_file, 'w') as fh:
            fh.write('14433')
        refs = read_gene_list(gene_list_file, 'entrez_mouse', rm)
    assert len(refs) == 1
    assert refs[0]['MGI'] == '95640'
    assert refs[0]['HGNC_SYMBOL'] == 'GAPDH'
示例#4
0
def test_read_gene_list_rgd():
    """Check that RGD IDs are mapped with and without the 'RGD:' prefix.

    The gene list is written into a temporary directory so that the test
    does not leave a stray file behind in the current working directory.
    """
    import os
    import tempfile

    rm = ResourceManager(base_folder=default_base_folder)
    with tempfile.TemporaryDirectory() as tmp_dir:
        gene_list_file = os.path.join(tmp_dir, 'test_gene_list_rgd.txt')
        with open(gene_list_file, 'w') as fh:
            # One bare ID and one prefixed ID, one per line
            fh.write('2561\n')
            fh.write('RGD:69323')
        refs = read_gene_list(gene_list_file, 'rgd_id', rm)
    assert len(refs) == 2, refs
    assert refs[0]['RGD'] == '2561'
    assert refs[0]['HGNC_SYMBOL'] == 'ERBB2'
    assert refs[1]['RGD'] == '69323'
    assert refs[1]['HGNC_SYMBOL'] == 'ERBB3'
示例#5
0
def run_main(args):
    """Run the GeneWalk processing stage(s) selected by ``args.stage``.

    Depending on the stage ('all', 'node_vectors', 'null_distribution',
    'statistics' or 'visual'), this assembles the network and computes node
    vectors, builds the null distribution on randomized graphs, generates
    the results table, and/or produces plots. Intermediate results are
    exchanged between stages as pickles in the project folder.

    Parameters
    ----------
    args : object
        Object exposing the parsed command line options as attributes. The
        attributes read here are base_folder, project, stage, random_seed,
        genes, id_type, network_source, network_file, nreps_graph,
        nreps_null, nproc, dim_rep, save_dw and alpha_fdr.
    """
    # Now we run the relevant stage of processing
    project_folder = create_folder(args.base_folder, args.project)

    # Add a logger specific to the project and processing stage
    log_file = os.path.join(project_folder, 'genewalk_%s.log' % args.stage)
    formatter = logging.Formatter(default_logger_format,
                                  datefmt=default_date_format)
    project_log_handler = logging.FileHandler(log_file)
    project_log_handler.setFormatter(formatter)
    root_logger.addHandler(project_log_handler)

    # Compare against None explicitly: a seed of 0 is a valid seed and must
    # not be skipped just because it is falsy.
    if args.random_seed is not None:
        logger.info('Running with random seed %d', args.random_seed)
        random.seed(a=int(args.random_seed))

    # Make sure we have all the resource files
    rm = ResourceManager(base_folder=args.base_folder)
    rm.download_all()

    if args.stage in ('all', 'node_vectors'):
        genes = read_gene_list(args.genes, args.id_type, rm)
        save_pickle(genes, project_folder, 'genes')
        MG = load_network(args.network_source, args.network_file, genes,
                          resource_manager=rm)
        save_pickle(MG.graph, project_folder, 'multi_graph')
        for i in range(args.nreps_graph):
            logger.info('%s/%s', i + 1, args.nreps_graph)
            DW = run_walks(MG.graph, workers=args.nproc, size=args.dim_rep)

            # Pickle the node vectors (embeddings) and DW object
            if args.save_dw:
                save_pickle(DW, project_folder, 'deepwalk_%d' % (i + 1))
            nv = copy.deepcopy(DW.model.wv)
            save_pickle(nv, project_folder,
                        'deepwalk_node_vectors_%d' % (i + 1))

            # Delete the DeepWalk object to clear memory
            del DW, nv
            gc.collect()

    if args.stage in ('all', 'null_distribution'):
        # The pickle holds the graph itself (MG.graph was saved above)
        MG = load_pickle(project_folder, 'multi_graph')
        srd = []
        for i in range(args.nreps_null):
            logger.info('%s/%s', i + 1, args.nreps_null)
            RG = get_rand_graph(MG)
            DW = run_walks(RG, workers=args.nproc, size=args.dim_rep)

            # Pickle the node vectors (embeddings) and DW object
            if args.save_dw:
                save_pickle(DW, project_folder, 'deepwalk_rand_%d' % (i + 1))
            nv = copy.deepcopy(DW.model.wv)
            save_pickle(nv, project_folder, 'deepwalk_node_vectors_rand_%d'
                                            % (i + 1))
            # Delete the DeepWalk object to clear memory
            del DW
            gc.collect()

            # Calculate the null distributions
            srd += get_null_distributions(RG, nv)
            del nv
            gc.collect()
        srd = np.asarray(sorted(srd))
        save_pickle(srd, project_folder, 'genewalk_rand_simdists')

    if args.stage in ('all', 'statistics'):
        MG = load_pickle(project_folder, 'multi_graph')
        genes = load_pickle(project_folder, 'genes')
        nvs = [load_pickle(project_folder,
                           'deepwalk_node_vectors_%d' % (i + 1))
               for i in range(args.nreps_graph)]
        null_dist = load_pickle(project_folder, 'genewalk_rand_simdists')
        GW = GeneWalk(MG, genes, nvs, null_dist)
        df = GW.generate_output(alpha_fdr=args.alpha_fdr,
                                base_id_type=args.id_type)
        fname = os.path.join(project_folder, 'genewalk_results.csv')
        logger.info('Saving final results into %s', fname)
        df.to_csv(fname, index=False, float_format='%.3e')

    if args.stage in ('all', 'visual'):
        # Plots are generated from the results table written by the
        # statistics stage
        fname = os.path.join(project_folder, 'genewalk_results.csv')
        dGW = pd.read_csv(fname)
        figure_folder = create_folder(project_folder, 'figures')
        create_folder(figure_folder, 'barplots')
        GWp = GW_Plotter(figure_folder, dGW, args.alpha_fdr)
        GWp.generate_plots()
示例#6
0
class NxMgAssembler(object):
    """Class which assembles a networkx MultiGraph based on a list of genes.

    Parameters
    ----------
    genes : list of dict
        A list of gene references based on which the graph is assembled.
    resource_manager : Optional[ResourceManager]
        The resource manager used to locate the GO OBO file and the GOA GAF
        file. If not provided, a default ResourceManager is created.

    Attributes
    ----------
    graph : networkx.MultiGraph
        The assembled graph containing links for interactions between genes,
        GO annotations for genes, and the GO ontology.
    go_dag : GODag
        The GO ontology loaded from the OBO file given by the resource
        manager.
    goa : pandas.DataFrame
        The filtered gene/GO annotation table loaded from the GAF file given
        by the resource manager.
    """

    def __init__(self, genes, resource_manager=None):
        self.genes = genes
        self.graph = nx.MultiGraph()
        self.resource_manager = resource_manager or ResourceManager()
        self.go_dag = GODag(self.resource_manager.get_go_obo())
        self.goa = self._load_goa_gaf()

    def _add_go_node(self, go_term):
        """Add a node for the given GO term with its name, ID and domain."""
        self.graph.add_node(go_term.id,
                            name=go_term.name,
                            GO=go_term.id,
                            domain=go_term.namespace)

    def _get_go_terms_for_gene(self, gene):
        """Return the sorted, unique GO IDs annotated to the given gene.

        An empty list is returned if the gene reference lacks a UniProt ID
        ('UP') or an HGNC symbol, or if its HGNC symbol is not a node in the
        graph.
        """
        if 'UP' not in gene or 'HGNC_SYMBOL' not in gene:
            return []
        if gene['HGNC_SYMBOL'] not in self.graph:
            return []
        # Filter to rows with the given gene's UniProt ID
        annotations = self.goa[self.goa['DB_ID'] == gene['UP']]
        return sorted(set(annotations['GO_ID']))

    def add_go_annotations(self):
        """Add edges between gene nodes and GO nodes based on GO
        annotations."""
        logger.info('Adding GO annotations for genes in graph.')
        for gene in self.genes:
            for go_id in self._get_go_terms_for_gene(gene):
                # Skip annotations to terms missing from the ontology
                if go_id not in self.go_dag:
                    continue
                go_term = self.go_dag[go_id]
                if go_term.is_obsolete:
                    continue
                self._add_go_node(go_term)
                self.graph.add_edge(gene['HGNC_SYMBOL'], go_term.id,
                                    label='GO:annotation')

    def add_go_ontology(self):
        """Add edges between GO nodes based on the GO ontology."""
        logger.info('Adding GO ontology edges to graph.')
        for go_term in self.go_dag.values():
            if go_term.is_obsolete:
                continue
            # The term's node only needs to be added once, not once per
            # parent; non-obsolete parents get their attributes when they
            # are visited as terms themselves.
            self._add_go_node(go_term)
            for parent_term in go_term.parents:
                if parent_term.is_obsolete:
                    continue
                self.graph.add_edge(go_term.id, parent_term.id,
                                    label='GO:is_a')

    def node2edges(self, node_key):
        """Return the edges corresponding to a node."""
        return self.graph.edges(node_key, keys=True)

    def save_graph(self, fname):
        """Save the file into a GraphML file.

        Parameters
        ----------
        fname : str
            The name of the file to save the graph into.
        """
        nx.write_graphml(self.graph, fname)

    def _load_goa_gaf(self):
        """Load the gene/GO annotations as a pandas data frame.

        The table is sorted by DB_ID and GO_ID, rows whose qualifier starts
        with "NOT" are dropped, and only rows with one of the accepted
        evidence codes are kept.
        """
        goa_ec = {'EXP', 'IDA', 'IPI', 'IMP', 'IGI', 'IEP', 'HTP', 'HDA',
                  'HMP', 'HGI', 'HEP', 'IBA', 'IBD'}
        # NOTE(review): skiprows=23 presumably matches the number of header
        # lines in the GAF file -- confirm against the downloaded resource.
        goa = pd.read_csv(self.resource_manager.get_goa_gaf(), sep='\t',
                          skiprows=23, dtype=str,
                          header=None,
                          names=['DB',
                                 'DB_ID',
                                 'DB_Symbol',
                                 'Qualifier',
                                 'GO_ID',
                                 'DB_Reference',
                                 'Evidence_Code',
                                 'With_From',
                                 'Aspect',
                                 'DB_Object_Name',
                                 'DB_Object_Synonym',
                                 'DB_Object_Type',
                                 'Taxon',
                                 'Date',
                                 'Assigned',
                                 'Annotation_Extension',
                                 'Gene_Product_Form_ID'])
        goa = goa.sort_values(by=['DB_ID', 'GO_ID'])
        # Filter out all "NOT" negative evidences. The column is reassigned
        # rather than filled in place, since an in-place fillna on a column
        # selection is a chained assignment that newer pandas versions warn
        # about or ignore.
        goa['Qualifier'] = goa['Qualifier'].fillna('')
        goa = goa[~goa['Qualifier'].str.startswith('NOT')]
        # Filter to rows with evidence code corresponding to experimental
        # evidence
        goa = goa[goa['Evidence_Code'].isin(goa_ec)]
        return goa
示例#7
0
def test_read_custom_list():
    """Read the custom gene list resource and check the returned refs."""
    manager = ResourceManager(base_folder=default_base_folder)
    refs = read_gene_list(
        os.path.join(TEST_RESOURCES, 'custom_gene_list.txt'),
        'custom',
        manager,
    )
    # Three entries are expected, the first being the custom ID ABC
    n_refs = len(refs)
    assert n_refs == 3, refs
    first_ref = refs[0]
    assert first_ref == {'ID': 'CUSTOM:ABC'}, refs
示例#8
0
import os
from nose.tools import raises
from genewalk.gene_lists import *
from genewalk.cli import default_base_folder
from genewalk.resources import ResourceManager
from .util import TEST_RESOURCES

# Module-level fixtures: a resource manager created with default settings
# and a gene mapper built on top of it, used by the mapping tests below.
rm = ResourceManager()
gm = GeneMapper(rm)


def test_map_lists():
    """Check mapping of HGNC symbols, HGNC IDs and MGI IDs to gene refs."""
    # BRAF and KRAS must resolve to the same references regardless of
    # whether they are given as HGNC symbols or as HGNC IDs.
    expected_human = (
        (('HGNC', '1097'), ('UP', 'P15056'), ('HGNC_SYMBOL', 'BRAF')),
        (('HGNC', '6407'), ('UP', 'P01116'), ('HGNC_SYMBOL', 'KRAS')),
    )
    human_cases = (
        (map_hgnc_symbols, ['BRAF', 'KRAS']),
        (map_hgnc_ids, ['1097', '6407']),
    )
    for mapper, identifiers in human_cases:
        refs = mapper(identifiers, gm)
        for position, expected_items in enumerate(expected_human):
            for key, value in expected_items:
                assert refs[position][key] == value, refs

    # A mouse MGI ID is mapped to its human HGNC counterpart
    refs = map_mgi_ids(['MGI:892970'], gm)
    mouse_ref = refs[0]
    assert mouse_ref['HGNC'] == '6817', refs
    assert mouse_ref['HGNC_SYMBOL'] == 'MAL', refs
示例#9
0
import os
from genewalk.gene_lists import read_gene_list
from genewalk.nx_mg_assembler import UserNxMgAssembler
from genewalk.resources import ResourceManager
from .util import TEST_RESOURCES, TEST_BASE_FOLDER


# Module-level fixtures: a resource manager rooted in the test base folder,
# paths to the SIF test networks, and the gene references read from the
# HGNC symbol list in the test resources.
rm = ResourceManager(TEST_BASE_FOLDER)
sif_genes = os.path.join(TEST_RESOURCES, 'test_sif.sif')
sif_annots = os.path.join(TEST_RESOURCES, 'test_sif_annot.sif')
sif_full = os.path.join(TEST_RESOURCES, 'test_sif_full.sif')
genes = read_gene_list(os.path.join(TEST_RESOURCES, 'hgnc_symbols.txt'),
                       id_type='hgnc_symbol', resource_manager=rm)


def test_gene_only_sif():
    """Assemble a graph from the gene-only SIF file and check its content."""
    assembler = UserNxMgAssembler(genes, resource_manager=rm,
                                  filepath=sif_genes, gwn_format='sif')
    graph = assembler.graph

    # The graph must consist of exactly these gene and GO nodes
    expected_nodes = {
        'KRAS', 'BRAF', 'MAP2K2', 'MAPK1', 'PIK3CA', 'AKT1',
        'GO:0001934', 'GO:0005515', 'GO:0000186', 'GO:0000001',
        'GO:0032147', 'GO:0003924',
    }
    assert set(graph.nodes()) == expected_nodes

    # Make sure we have GO node annotations as expected
    go_node = graph.nodes['GO:0000186']
    expected_attributes = (
        ('GO', 'GO:0000186'),
        ('domain', 'biological_process'),
        ('name', 'activation of MAPKK activity'),
    )
    for attribute, value in expected_attributes:
        assert go_node[attribute] == value, go_node

    # Make sure we have GO annotation edges
    assert ('BRAF', 'GO:0000186') in graph.edges
示例#10
0
文件: cli.py 项目: pythseq/genewalk
def _str2bool(value):
    """Convert a command line string such as 'True' or 'false' to a bool."""
    if isinstance(value, bool):
        return value
    normalized = value.strip().lower()
    if normalized in ('true', 't', 'yes', 'y', '1'):
        return True
    if normalized in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError(
        'Expected a boolean value (True or False), got %r' % value)


def main():
    """Parse the command line arguments and run the selected stage(s).

    Depending on the --stage argument ('all', 'node_vectors',
    'null_distribution' or 'statistics'), this assembles the network and
    computes node vectors, builds the null distribution on randomized
    graphs, and/or generates the final results table. Intermediate results
    are exchanged between stages as pickles in the project folder.
    """
    parser = argparse.ArgumentParser(
        description='Run GeneWalk on a list of genes provided in a text '
        'file.')
    parser.add_argument('--version',
                        action='version',
                        version='GeneWalk %s' % __version__,
                        help='Print the version of GeneWalk and exit.')
    parser.add_argument('--project',
                        help='A name for the project which '
                        'determines the folder within the '
                        'base folder in which the '
                        'intermediate and final results '
                        'are written. Must contain only '
                        'characters that are valid in '
                        'folder names.',
                        required=True)
    parser.add_argument('--genes',
                        help='Path to a text file with a list of '
                        'genes of interest, for example '
                        'differentially expressed genes. '
                        'The type of gene identifiers used in '
                        'the text file are provided in the '
                        'id_type argument.',
                        required=True)
    parser.add_argument('--id_type',
                        help='The type of gene IDs provided in the text file '
                        'in the genes argument. Possible values are: '
                        'hgnc_symbol, hgnc_id, ensembl_id, mgi_id, '
                        'entrez_human, and entrez_mouse.',
                        choices=[
                            'hgnc_symbol', 'hgnc_id', 'ensembl_id', 'mgi_id',
                            'entrez_human', 'entrez_mouse'
                        ],
                        required=True)
    parser.add_argument(
        '--stage',
        default='all',
        help='The stage of processing to run. Default: '
        '%(default)s',
        choices=['all', 'node_vectors', 'null_distribution', 'statistics'])
    parser.add_argument('--base_folder',
                        default=default_base_folder,
                        help='The base folder used to store GeneWalk '
                        'temporary and result files for a given project.'
                        ' Default: %(default)s')
    parser.add_argument('--network_source',
                        default='pc',
                        help='The source of the network to be used. '
                        'Possible values are: pc, indra, edge_list, and '
                        'sif. In case of indra, edge_list, and sif, '
                        'the network_file argument must be specified.'
                        ' Default: %(default)s',
                        choices=['pc', 'indra', 'edge_list', 'sif'])
    parser.add_argument('--network_file',
                        default=None,
                        help='If network_source is indra, this argument '
                        'points to a Python pickle file in which a list '
                        'of INDRA Statements constituting the network '
                        'is contained. In case network_source is '
                        'edge_list or sif, '
                        'the network_file argument points to a text file '
                        'representing the network.')
    parser.add_argument('--nproc',
                        default=1,
                        type=int,
                        help='The number of processors to use in a '
                        'multiprocessing environment. Default: '
                        '%(default)s')
    parser.add_argument('--nreps_graph',
                        default=3,
                        type=int,
                        help='The number of repeats to run when calculating '
                        'node vectors on the GeneWalk graph. '
                        'Default: %(default)s')
    parser.add_argument('--nreps_null',
                        default=3,
                        type=int,
                        help='The number of repeats to run when calculating '
                        'node vectors on the random network graphs '
                        'for constructing the null distribution. '
                        'Default: %(default)s')
    parser.add_argument('--alpha_fdr',
                        default=1,
                        type=float,
                        help='The false discovery rate to use when '
                        'outputting the final statistics table. '
                        'If 1 (default), all similarities are output, '
                        'otherwise only the ones whose false discovery '
                        'rate are below this parameter are included. '
                        'Default: %(default)s')
    # type=bool would turn any non-empty string (including "False") into
    # True, so an explicit string-to-bool converter is used instead. With
    # nargs='?' and const=True, a bare --save_dw also enables the option.
    parser.add_argument('--save_dw',
                        default=False,
                        type=_str2bool,
                        nargs='?',
                        const=True,
                        help='If True, the full DeepWalk object for each '
                        'repeat is saved in the project folder. This can '
                        'be useful for debugging but the files are '
                        'typically very large. Default: %(default)s')
    parser.add_argument('--random_seed',
                        default=None,
                        type=int,
                        help='If provided, the random number generator is '
                        'seeded with the given value. This should only '
                        'be used if the goal is to deterministically '
                        'reproduce a prior result obtained with the same '
                        'random seed.')
    args = parser.parse_args()

    # Now we run the relevant stage of processing
    project_folder = create_project_folder(args.base_folder, args.project)

    # Add a logger specific to the project and processing stage
    log_file = os.path.join(project_folder, 'genewalk_%s.log' % args.stage)
    formatter = logging.Formatter(default_logger_format,
                                  datefmt=default_date_format)
    project_log_handler = logging.FileHandler(log_file)
    project_log_handler.setFormatter(formatter)
    root_logger.addHandler(project_log_handler)

    # Compare against None explicitly: a seed of 0 is a valid seed and must
    # not be skipped just because it is falsy.
    if args.random_seed is not None:
        logger.info('Running with random seed %d', args.random_seed)
        random.seed(a=int(args.random_seed))

    # Make sure we have all the resource files
    rm = ResourceManager(base_folder=args.base_folder)
    rm.download_all()

    if args.stage in ('all', 'node_vectors'):
        genes = read_gene_list(args.genes, args.id_type, rm)
        save_pickle(genes, project_folder, 'genes')
        MG = load_network(args.network_source,
                          args.network_file,
                          genes,
                          resource_manager=rm)
        save_pickle(MG.graph, project_folder, 'multi_graph')
        for i in range(args.nreps_graph):
            logger.info('%s/%s', i + 1, args.nreps_graph)
            DW = run_walks(MG.graph, workers=args.nproc)

            # Pickle the node vectors (embeddings) and DW object
            if args.save_dw:
                save_pickle(DW, project_folder, 'deepwalk_%d' % (i + 1))
            nv = copy.deepcopy(DW.model.wv)
            save_pickle(nv, project_folder,
                        'deepwalk_node_vectors_%d' % (i + 1))

            # Delete the DeepWalk object to clear memory
            del DW, nv
            gc.collect()

    if args.stage in ('all', 'null_distribution'):
        # The pickle holds the graph itself (MG.graph was saved above)
        MG = load_pickle(project_folder, 'multi_graph')
        srd = []
        for i in range(args.nreps_null):
            logger.info('%s/%s', i + 1, args.nreps_null)
            RG = get_rand_graph(MG)
            DW = run_walks(RG, workers=args.nproc)

            # Pickle the node vectors (embeddings) and DW object
            if args.save_dw:
                save_pickle(DW, project_folder, 'deepwalk_rand_%d' % (i + 1))
            nv = copy.deepcopy(DW.model.wv)
            save_pickle(nv, project_folder,
                        'deepwalk_node_vectors_rand_%d' % (i + 1))
            # Delete the DeepWalk object to clear memory
            del DW
            gc.collect()

            # Calculate the null distributions
            srd += get_null_distributions(RG, nv)
            del nv
            gc.collect()
        srd = np.asarray(sorted(srd))
        save_pickle(srd, project_folder, 'genewalk_rand_simdists')

    if args.stage in ('all', 'statistics'):
        MG = load_pickle(project_folder, 'multi_graph')
        genes = load_pickle(project_folder, 'genes')
        nvs = [
            load_pickle(project_folder, 'deepwalk_node_vectors_%d' % (i + 1))
            for i in range(args.nreps_graph)
        ]
        null_dist = load_pickle(project_folder, 'genewalk_rand_simdists')
        GW = GeneWalk(MG, genes, nvs, null_dist)
        df = GW.generate_output(alpha_fdr=args.alpha_fdr,
                                base_id_type=args.id_type)
        fname = os.path.join(project_folder, 'genewalk_results.csv')
        logger.info('Saving final results into %s', fname)
        df.to_csv(fname, index=False, float_format='%.3e')