예제 #1
0
    def __init__(self, src_name, base_url, aliases, args=None):
        """Store the source description supplied by an extending class.

        Every class extending SrcClass is expected to call this constructor
        with its own name, base url and alias dictionary.

        Args:
            src_name (str): The name of the remote source to be included in
                the KN. Stored as self.name.
            base_url (str): The base url of the remote source, which may need
                additional processing to provide an actual download link (see
                get_remote_url). Stored as self.url_base.
            aliases (dict): Maps each subset of the source to be included in
                the KN (e.g. different species, data types, or interaction
                types) to a short string with information about that alias.
            args (Namespace): args as populated namespace or 'None' for
                defaults (cf.config_args() is used in that case).
        """
        # Fall back to the default configuration when the caller gives none.
        self.args = cf.config_args() if args is None else args
        self.name, self.url_base, self.aliases = src_name, base_url, aliases
        # Placeholders; this constructor only initialises them to empty values.
        self.remote_file = ''
        self.version = {}
        self.chunk_size = 500000
예제 #2
0
    def get_aliases(self, args=None):
        """Helper function for producing the alias dictionary.

        Reads the species mapping stored at
        <working_dir>/<data_path>/<cf.DEFAULT_MAP_PATH>/species/species.json
        (per the original documentation this file is produced by ensembl.py
        during setup) and, for every species in it, requests the url returned
        by self.get_remote_url(taxid). A taxid becomes an alias only when
        that request answers with HTTP status 200; otherwise a warning is
        printed and the species is skipped.

        Args:
            args (Namespace): args as populated namespace or 'None' for
                defaults. The default is resolved at call time.

        Returns:
            dict: A dictionary of taxid: abbreviation values, where the
                abbreviation is the first character of the species name
                followed by the first three characters of its second word.
        """
        if args is None:
            args = cf.config_args()
        sp_file = os.path.join(args.working_dir, args.data_path,
                               cf.DEFAULT_MAP_PATH, 'species', 'species.json')
        # Context manager so the file handle is closed deterministically.
        with open(sp_file) as infile:
            sp_dict = json.load(infile)
        alias_dict = dict()
        for species, taxid in sp_dict.items():
            # e.g. 'homo_sapiens' -> 'Homo sapiens' -> 'Hsap'
            species = species.capitalize().replace('_', ' ')
            sp_abbrev = species[0] + species.split(' ')[1][:3]
            url = self.get_remote_url(taxid)
            req = requests.get(url)
            if req.status_code == 200:
                alias_dict[taxid] = sp_abbrev
            else:
                print("warning: string species {} error {}".format(taxid, req.status_code))
        return alias_dict
예제 #3
0
    def __init__(self, args=None):
        """Init a Intact with the statically defined parameters.

        This calls the SrcClass constructor (see utilities.SrcClass), then
        loads the species mapping from
        <working_dir>/<data_path>/<cf.DEFAULT_MAP_PATH>/species/species.json
        and keeps its values as self.taxid_list.

        Args:
            args (Namespace): args as populated namespace or 'None' for
                defaults. The default is resolved at call time rather than
                when the class body is executed.
        """
        if args is None:
            args = cf.config_args()
        name = 'intact'
        url_base = 'ftp.ebi.ac.uk'
        aliases = {"PPI": "PPI"}
        super(Intact, self).__init__(name, url_base, aliases, args)
        self.remote_file = 'intact.txt'
        self.chunk_size = 50000
        src_data_dir = os.path.join(args.working_dir, args.data_path,
                                    cf.DEFAULT_MAP_PATH)
        sp_dir = os.path.join(src_data_dir, 'species', 'species.json')
        # Context manager so the file handle is closed deterministically.
        with open(sp_dir) as infile:
            sp_dict = json.load(infile)
        self.taxid_list = sp_dict.values()

        # Descriptive metadata about the source.
        self.source_url = "http://www.ebi.ac.uk/intact/"
        self.image = "http://www.ebi.ac.uk/intact/images/IntAct_logo.png"
        self.reference = (
            "Orchard S, Ammari M, Aranda B, et al. The MIntAct project--IntAct as a "
            "common curation platform for 11 molecular interaction databases. "
            "Nucleic Acids Res. 2014;42(Database issue):D358-63.")
        self.pmid = 24234451
        self.license = (
            'IntAct is released monthly. All IntAct data and software is freely '
            'available to all users, academic or commercial, under the terms of the '
            'Apache License, Version 2.0.')
예제 #4
0
    def get_aliases(self, args=None):
        """Helper function for producing the alias dictionary.

        Reads the species mapping stored at
        <working_dir>/<data_path>/<cf.DEFAULT_MAP_PATH>/species/species.json
        (per the original documentation this file is produced by ensembl.py
        during setup) and downloads the organism list from
        self.url_base + 'list/organism'. Every species name present in both
        contributes two aliases: the organism code itself and
        '<code>_map' for the corresponding ID map.

        Args:
            args (Namespace): args as populated namespace or 'None' for
                defaults. The default is resolved at call time.

        Returns:
            dict: Always contains "pathway": "pathways". For each matched
                species it also contains org: species name and
                org + '_map': abbreviation + '_IDmap'.
        """
        if args is None:
            args = cf.config_args()
        src_data_dir = os.path.join(args.working_dir, args.data_path,
                                    cf.DEFAULT_MAP_PATH)
        sp_dir = os.path.join(src_data_dir, 'species', 'species.json')
        # Context manager so the file handle is closed deterministically.
        with open(sp_dir) as infile:
            sp_dict = json.load(infile)
        alias_dict = {"pathway": "pathways"}
        kegg_url = self.url_base + 'list/organism'
        kegg_dict = dict()
        # Close the HTTP response as soon as the listing has been consumed.
        with urllib.request.urlopen(kegg_url) as kegg_resp:
            for line in kegg_resp:
                # Each line is expected to hold exactly four tab-separated
                # fields; the second is the organism code, the third the name.
                (_, org, species, _) = line.decode().split('\t')
                # Keep only the first two words of the species name.
                species = ' '.join(species.split(' ')[:2])
                kegg_dict[species] = org
        # NOTE(review): species keys are compared verbatim against the names
        # from the listing; confirm species.json uses the same spelling.
        for species in sp_dict:
            if species in kegg_dict:
                org = kegg_dict[species]
                sp_abbrev = species[0] + species.split(' ')[1][:3]
                alias_dict[org] = species
                alias_dict[org + '_map'] = sp_abbrev + '_IDmap'
        return alias_dict
예제 #5
0
def check(module, args=None):
    """Compare versions for one source module and record the file metadata.

    The directory <code_path>/<src_path> is appended to sys.path, the module
    named by 'module' is imported, and its get_SrcClass(args) result is
    handed to compare_versions. Every per-alias entry of the resulting
    dictionary is then passed to iu.import_filemeta.

    Args:
        module (str): string name of module defining source specific class
        args (Namespace): args as populated namespace or 'None' for defaults

    Returns:
        dict: A nested dictionary describing the version information for each
            alias described in source.
    """
    if args is None:
        args = cf.config_args()
    # Make the source-specific modules importable by bare name.
    sys.path.append(os.path.join(args.code_path, args.src_path))
    source = __import__(module).get_SrcClass(args)
    version_dict = compare_versions(source, args)
    for alias_version in version_dict.values():
        iu.import_filemeta(alias_version, args)
    return version_dict
예제 #6
0
def fetch(version_dict, args=None):
    """Fetches all mysql tables and syntax for alias described by version_dict.

    Downloads the file at version_dict['remote_url'] and stores it as
    'schema.sql' in the current directory, then downloads '<table>.txt.gz'
    from the same remote directory for every table in TABLE_LIST and stores
    each result as '<table>.txt' (see fetch_utilities.download). Finally the
    downloaded files are imported with db_import; if that raises a
    mysql.connector.DatabaseError the import is retried once.

    Note that version_dict['remote_url'] is overwritten for each table, so
    the caller's dictionary ends up pointing at the last table's url.

    Args:
        version_dict (dict): version dictionary describing the source:alias
        args (Namespace): args as populated namespace or 'None' for
            defaults. The default is resolved at call time.

    Returns:
        None
    """
    if args is None:
        args = cf.config_args()
    shutil.move(download(version_dict), 'schema.sql')
    base_url = version_dict['remote_url']
    # Keep everything up to and including the final '/'.
    base_url = base_url[:base_url.rfind('/') + 1]
    for table in TABLE_LIST:
        version_dict['remote_url'] = base_url + table + '.txt.gz'
        shutil.move(download(version_dict), table + '.txt')
    try:
        db_import(version_dict, args)
    except mysql.connector.DatabaseError as err:
        # Single retry; a second failure propagates to the caller.
        print('Encountered error: ' + str(err))
        print('Trying operation again')
        db_import(version_dict, args)
예제 #7
0
    def __init__(self, args=None):
        """Init a Biogrid with the statically defined parameters.

        This calls the SrcClass constructor (see utilities.SrcClass), then
        loads the species mapping from
        <working_dir>/<data_path>/<cf.DEFAULT_MAP_PATH>/species/species.json
        and keeps its values as self.taxid_list.

        Args:
            args (Namespace): args as populated namespace or 'None' for
                defaults. The default is resolved at call time rather than
                when the class body is executed.
        """
        if args is None:
            args = cf.config_args()
        name = 'biogrid'
        url_base = ('http://thebiogrid.org/downloads/archives/'
                    'Latest%20Release/BIOGRID-ALL-LATEST.mitab.zip')
        aliases = {"PPI": "PPI"}
        super(Biogrid, self).__init__(name, url_base, aliases, args)
        # NOTE(review): credential committed in source; consider moving it
        # into configuration.
        self.access_key = '2fe900033b39209b8f63d531fcb24790'
        self.chunk_size = 50000
        src_data_dir = os.path.join(args.working_dir, args.data_path,
                                    cf.DEFAULT_MAP_PATH)
        sp_dir = os.path.join(src_data_dir, 'species', 'species.json')
        # Context manager so the file handle is closed deterministically.
        with open(sp_dir) as infile:
            sp_dict = json.load(infile)
        self.taxid_list = sp_dict.values()

        # Descriptive metadata about the source.
        self.source_url = "https://thebiogrid.org/"
        self.image = "https://pbs.twimg.com/profile_images/875385819422437376/HQv1quNo_400x400.jpg"
        self.reference = (
            "Chatr-aryamontri A, Oughtred R, Boucher L, et al. The BioGRID "
            "interaction database: 2017 update. Nucleic Acids Res. "
            "2017;45(D1):D369-D379.")
        self.pmid = 27980099
        self.license = (
            'BioGRID interaction data are 100% freely available to both commercial and '
            'academic users.')
예제 #8
0
    def __init__(self, database=None, args=None):
        """Open a MySQL connection and cursor from the configured credentials.

        Host, port, user and password are taken from args (mysql_host,
        mysql_port, mysql_user, mysql_pass). When a database name is given
        the connection selects it; otherwise the connection is made without
        selecting one. In both cases the LOCAL_FILES client flag is set.

        Args:
            database (str): the MySQL database to connect to (optional)
            args (Namespace): args as populated namespace or 'None' for defaults
        """
        if args is None:
            args = cf.config_args()
        self.user, self.host = args.mysql_user, args.mysql_host
        self.port, self.passw = args.mysql_port, args.mysql_pass
        self.database = database
        self.args = args
        # Shared connection parameters; 'db' is only added when requested.
        conn_kwargs = {
            'host': self.host,
            'port': self.port,
            'user': self.user,
            'password': self.passw,
            'client_flags': [sql.ClientFlag.LOCAL_FILES],
        }
        if self.database is not None:
            conn_kwargs['db'] = self.database
        self.conn = sql.connect(**conn_kwargs)
        self.cursor = self.conn.cursor()
예제 #9
0
    def __init__(self, args=None):
        """Init a Reactome with the statically defined parameters.

        This calls the SrcClass constructor (see utilities.SrcClass) with the
        fixed name, base url and alias dictionary defined below.

        Args:
            args (Namespace): args as populated namespace or 'None' for
                defaults. The default is resolved at call time rather than
                when the class body is executed.
        """
        if args is None:
            args = cf.config_args()
        name = 'reactome'
        url_base = 'http://www.reactome.org/'
        aliases = {
            "Ensembl2Reactome_All_Levels": "genes2pathways",
            "ReactomePathways": "reactomePathways",
            "reactome.homo_sapiens.interactions.tab-delimited":
            "pathwayInteractions",
            "ReactomePathwaysRelation": "ReactomeRelations"
        }
        super(Reactome, self).__init__(name, url_base, aliases, args)

        # Descriptive metadata about the source.
        self.source_url = "http://www.reactome.org/"
        self.image = "http://blog.openhelix.eu/wp-content/uploads/2011/01/Reactome_logo.jpg"
        self.reference = (
            "Fabregat A, Sidiropoulos K, Garapati P, et al. The Reactome pathway "
            "Knowledgebase. Nucleic Acids Res. 2016;44(D1):D481-7.")
        self.pmid = 26656494
        self.license = (
            'The Reactome data and source code continues to be publicly accessible '
            'under the terms of a Creative Commons Attribution 3.0 Unported License.'
        )
예제 #10
0
def import_status(statusfile, args=None):
    """Import a status file and its sibling table files via import_file.

    For each of the tables node, node_meta, edge2line, status and edge_meta
    a file name is derived from 'statusfile' by replacing 'status' with the
    table name (the status table uses 'statusfile' unchanged). If a
    'unique.<table>' variant of that file exists it is used instead. Tables
    whose file does not exist are skipped; the others are handed to
    import_file with empty load and duplicate commands.

    Args:
        statusfile (str): path to the status file to be imported
        args (Namespace): args as populated namespace or 'None' for defaults
    """
    if args is None:
        args = cf.config_args()
    for table in ('node', 'node_meta', 'edge2line', 'status', 'edge_meta'):
        if table == 'status':
            candidate = statusfile
        else:
            candidate = statusfile.replace('status', table)
        # Prefer the 'unique.' variant of the file when it is present.
        unique_candidate = candidate.replace(table, 'unique.' + table)
        if os.path.isfile(unique_candidate):
            candidate = unique_candidate
        elif not os.path.isfile(candidate):
            continue
        import_file(candidate, table, '', '', args)
예제 #11
0
    def __init__(self, args=None):
        """Init a Msigdb with the statically defined parameters.

        This calls the SrcClass constructor (see utilities.SrcClass) with the
        fixed name, base url and alias dictionary defined below.

        Args:
            args (Namespace): args as populated namespace or 'None' for
                defaults. The default is resolved at call time rather than
                when the class body is executed.
        """
        if args is None:
            args = cf.config_args()
        name = 'msigdb'
        url_base = 'http://www.broadinstitute.org/gsea/'
        aliases = {
            "c2.cgp": "curated_genes_cpg",
            "c3.mir": "motif_gene_mir",
            "c4.cgn": "comp_genes_cgn",
            "c4.cm": "onco_sigs_cm",
            "c6.all": "oncogenic_signatures_all",
            "c7.all": "immunologic_signatures_all"
        }
        super(Msigdb, self).__init__(name, url_base, aliases, args)
        self.date_modified = 'unknown'

        # Descriptive metadata about the source.
        self.source_url = "http://software.broadinstitute.org/gsea/msigdb/"
        self.image = "http://software.broadinstitute.org/gsea/images/MSigDB-logo1.gif"
        self.reference = (
            "Subramanian A, Tamayo P, Mootha VK, et al. Gene set enrichment "
            "analysis: a knowledge-based approach for interpreting genome-wide "
            "expression profiles. Proc Natl Acad Sci USA. 2005;102(43):15545-50."
        )
        self.pmid = 16199517
        self.license = (
            'MSigDB v6.0 is available under a Creative Commons style license, plus '
            'additional terms for some gene sets. The full license terms are available '
            '<a href="http://software.broadinstitute.org/gsea/msigdb_license_terms.jsp"'
            '>here</a>.')
예제 #12
0
    def __init__(self, args=None):
        """Init a Dip with the statically defined parameters.

        This calls the SrcClass constructor (see utilities.SrcClass), then
        loads the species mapping from
        <working_dir>/<data_path>/<cf.DEFAULT_MAP_PATH>/species/species.json
        and keeps its values as self.taxid_list.

        Args:
            args (Namespace): args as populated namespace or 'None' for
                defaults. The default is resolved at call time rather than
                when the class body is executed.
        """
        if args is None:
            args = cf.config_args()
        name = 'dip'
        url_base = 'http://dip.doe-mbi.ucla.edu/dip/script/files/'
        aliases = {"PPI": "PPI"}
        super(Dip, self).__init__(name, url_base, aliases, args)
        self.year = ''
        src_data_dir = os.path.join(args.working_dir, args.data_path,
                                    cf.DEFAULT_MAP_PATH)
        sp_dir = os.path.join(src_data_dir, 'species', 'species.json')
        # Context manager so the file handle is closed deterministically.
        with open(sp_dir) as infile:
            sp_dict = json.load(infile)
        self.taxid_list = sp_dict.values()

        # Descriptive metadata about the source.
        self.source_url = "http://dip.doe-mbi.ucla.edu/"
        self.image = "https://www.virtuallyimmune.org/wp-content/uploads/2014/07/dip_logo.png"
        self.reference = (
            "Salwinski L, Miller CS, Smith AJ, Pettit FK, Bowie JU, Eisenberg D. The "
            "Database of Interacting Proteins: 2004 update. Nucleic Acids Res. "
            "2004;32(Database issue):D449-51.")
        self.pmid = 14681454
        # NOTE(review): the mailto address below looks like a scraping
        # placeholder; confirm the intended contact address.
        self.license = (
            '<a href="http://creativecommons.org/licenses/by-nd/3.0/">Creative Commons '
            'Attribution-NoDerivs License</a>. However, if you intend to distribute '
            'data from our database, you must <a href="mailto:[email protected]">ask '
            'us</a> for permission first.')
예제 #13
0
def main(chunkfile, version_json, args=None):
    """Table one chunk of the source:alias described by version_json.

    Loads the version dictionary from 'version_json', appends
    <code_path>/<src_path> to sys.path, imports the module named by the
    dictionary's 'source' entry and builds its source object through
    get_SrcClass(args). When the dictionary's 'is_map' entry is false the
    source object's table(chunkfile, version_dict) method is called; for
    mapping files nothing further happens.

    The formats involved, as described by the original documentation:

        raw_line (line_hash, line_num, file_id, raw_line)
        table_file (line_hash, n1name, n1hint, n1type, n1spec,
                    n2name, n2hint, n2type, n2spec, et_hint, score,
                    table_hash)
        edge_meta (line_hash, info_type, info_desc)
        node_meta (node_id,
                   info_type (evidence, relationship, experiment, or link),
                   info_desc (text))
        node (node_id, n_alias, n_type)

    Args:
        chunkfile (str): path to a chunk file in raw_line format
        version_json (str): path to a json file describing the source:alias
        args (Namespace): args as populated namespace or 'None' for defaults
    """
    if args is None:
        args = cf.config_args()
    with open(version_json, 'r') as infile:
        version_dict = json.load(infile)
    # Make the source-specific modules importable by bare name.
    sys.path.append(os.path.join(args.code_path, args.src_path))
    source = __import__(version_dict['source']).get_SrcClass(args)
    if version_dict['is_map']:
        # Mapping files are not tabled.
        return
    source.table(chunkfile, version_dict)
예제 #14
0
def import_ensembl(alias, args=None):
    """Imports the ensembl data for the provided alias into the Redis database.

    Loads <working_dir>/<data_path>/<cf.DEFAULT_MAP_PATH>/<alias>_all.json
    and, for every entry, splits the key on '::' into five fields of which
    the first (taxid), fourth (hint) and fifth (foreign key) are used. The
    upper-cased mapped value is then stored under four Redis keys:

        unique::<foreign_key>
        hint::<foreign_key>::<hint>
        taxon::<foreign_key>::<taxid>
        triplet::<foreign_key>::<taxid>::<hint>

    If one of those keys already held a different value, it is overwritten
    with 'unmapped-many'. For entries whose hint is WIKIGENE the key
    stable::<stable_id>::alias is additionally set to the foreign key,
    unless it currently holds a value that is not an integer.

    Args:
        alias (str): An alias defined in ensembl.aliases.
        args (Namespace): args as populated namespace or 'None' for defaults
    """
    if args is None:
        args = cf.config_args()
    rdb = get_database(args)
    map_dir = os.path.join(args.working_dir, args.data_path, cf.DEFAULT_MAP_PATH)
    with open(os.path.join(map_dir, alias + '_all.json')) as infile:
        map_dict = json.load(infile)
    for key, stable_id in map_dict.items():
        (taxid, _, _, hint, foreign_key) = key.split('::')
        hint = hint.upper()
        ens_id = stable_id.upper()
        foreign_key = foreign_key.upper()

        lookup_keys = (
            'unique::' + foreign_key,
            'hint::' + foreign_key + '::' + hint,
            'taxon::' + foreign_key + '::' + taxid,
            'triplet::' + foreign_key + '::' + taxid + '::' + hint,
        )
        for keystr in lookup_keys:
            previous = rdb.getset(keystr, ens_id)
            # A different earlier value means the key is ambiguous.
            if previous is not None and previous.decode() != ens_id:
                rdb.set(keystr, 'unmapped-many')

        if hint != 'WIKIGENE':
            continue
        # Replace integer aliases with strings.
        alias_key = '::'.join(['stable', ens_id, 'alias'])
        try:
            int(rdb.get(alias_key))
        except ValueError:
            # Existing alias is not an integer: keep it.
            continue
        except TypeError:
            # No alias stored yet (get returned None): fall through and set.
            pass
        rdb.set(alias_key, foreign_key)
예제 #15
0
def deploy_container(args=None):
    """Deploys a container with marathon running MySQL using the specified
    args.

    Loads the job template <code_path>/marathon/mysql.json, fills in the id,
    cpu, memory, host constraint, volume host paths and root password
    parameter, and writes the result to
    <working_dir>/<logs_path>/marathon_jobs/kn_mysql-<mysql_port>.json.
    Along the way it creates the MySQL configuration directory (under
    storage_dir when set, otherwise under working_dir), makes its
    grandparent data directory world-accessible, copies my.cnf from the
    code tree into it and writes args.mysql_pass to a 'password' file there.
    The job is posted to marathon with curl unless args.test_mode is set, in
    which case the curl command is only printed.

    Note that the MYSQL_ROOT_PASSWORD container parameter is set to a fixed
    literal rather than to args.mysql_pass.

    Args:
        args (Namespace): args as populated namespace or 'None' for defaults
    """
    if args is None:
        args = cf.config_args()
    deploy_dir = os.path.join(args.working_dir, args.logs_path, 'marathon_jobs')
    if not os.path.exists(deploy_dir):
        os.makedirs(deploy_dir)
    with open(os.path.join(args.code_path, 'marathon', 'mysql.json'), 'r') as infile:
        deploy_dict = json.load(infile)
    deploy_dict.update({
        "id": os.path.basename(args.mysql_dir),
        "cpus": float(args.mysql_cpu),
        "mem": int(args.mysql_mem),
        "constraints": ([["hostname", "CLUSTER", args.mysql_curl]]
                        if args.mysql_curl else []),
    })
    conf_template = os.path.join(args.code_path, 'mysql', args.mysql_conf)
    # Prefer the dedicated storage directory when one is configured.
    data_root = args.storage_dir if args.storage_dir else args.working_dir
    mysql_dir = os.path.join(data_root, args.data_path, 'mysql')
    conf_path = os.path.join(mysql_dir, args.mysql_conf)
    if not os.path.exists(conf_path):
        os.makedirs(conf_path)
    os.chmod(os.path.dirname(mysql_dir), 0o777)
    shutil.copy(os.path.join(conf_template, 'my.cnf'), os.path.join(conf_path, 'my.cnf'))
    with open(os.path.join(conf_path, 'password'), 'w') as pass_file:
        pass_file.write(args.mysql_pass)
    container = deploy_dict["container"]
    container["volumes"][0]["hostPath"] = args.mysql_dir
    container["volumes"][1]["hostPath"] = conf_path
    container["docker"]["parameters"][0]["value"] = "MYSQL_ROOT_PASSWORD=KnowEnG"
    out_path = os.path.join(deploy_dir, "kn_mysql-" + args.mysql_port + '.json')
    payload = json.dumps(deploy_dict)
    with open(out_path, 'w') as outfile:
        outfile.write(payload)
    job = ('curl -X POST -H "Content-type: application/json" ' + args.marathon
           + " -d '" + payload + "'")
    if args.test_mode:
        print(job)
        return
    try:
        subprocess.check_output(job, shell=True)
    except subprocess.CalledProcessError as ex1:
        print(ex1.output)
예제 #16
0
def import_gene_nodes(node_table, args=None):
    """Import gene node metadata into redis.

    Each row of 'node_table' must unpack into (node_id, description, type).
    The description and type are stored under the Redis keys
    stable::<NODE_ID>::desc and stable::<NODE_ID>::type, with the node id
    upper-cased.

    Args:
        node_table (iterable): rows of (node_id, node_desc, node_type)
        args (Namespace): args as populated namespace or 'None' for defaults
    """
    if args is None:
        args = cf.config_args()
    rdb = get_database(args)
    for node_id, node_desc, node_type in node_table:
        stable_id = node_id.upper()
        for field, value in (('desc', node_desc), ('type', node_type)):
            rdb.set('::'.join(['stable', stable_id, field]), value)
예제 #17
0
    def __init__(self, args=None):
        """Init a Ppi with the statically defined parameters.

        Sets empty descriptive metadata and then calls the SrcClass
        constructor (see utilities.SrcClass) with the fixed name, url and
        alias dictionary defined below.

        Args:
            args (Namespace): args as populated namespace or 'None' for
                defaults. The default is resolved at call time rather than
                when the class body is executed.
        """
        if args is None:
            args = cf.config_args()
        name = 'ppi'
        url_base = ('http://ontologies.berkeleybop.org/mi.obo')
        aliases = {"obo_map": "map file for PPI edge tyeps"}
        # No descriptive metadata is defined for this source.
        self.reference = ''
        self.image = ''
        self.source_url = ''
        self.pmid = 0
        self.license = ''
        super(Ppi, self).__init__(name, url_base, aliases, args)
예제 #18
0
def get_database(args=None):
    """Build a Redis connection object from the configured settings.

    Host, port and password are read from args.redis_host, args.redis_port
    and args.redis_pass.

    Args:
        args (Namespace): args as populated namespace or 'None' for defaults
    Returns:
        StrictRedis: a redis connection object
    """
    settings = cf.config_args() if args is None else args
    connection_params = {
        'host': settings.redis_host,
        'port': settings.redis_port,
        'password': settings.redis_pass,
    }
    return redis.StrictRedis(**connection_params)
예제 #19
0
    def __init__(self, args=None):
        """Init a Blast with the statically defined parameters.

        This calls the SrcClass constructor (see utilities.SrcClass) with an
        empty alias dictionary and then replaces self.aliases with the
        result of self.get_aliases(args).

        Args:
            args (Namespace): args as populated namespace or 'None' for
                defaults. The default is resolved at call time rather than
                when the class body is executed.
        """
        if args is None:
            args = cf.config_args()
        name = 'blast'
        # Previous static configuration, kept for reference:
        #        url_base = 'http://veda.cs.uiuc.edu/blast/'
        #        aliases = {"mm9_Atha10": "10090_3702",
        #                   "mm9_Scer64": "10090_4932",
        #                   "mm9_Cele235": "10090_6239",
        #                   "mm9_Dmel5": "10090_7227",
        #                   "mm9_hg19": "10090_9606",
        #                   "mm9_mm9": "10090_10090",
        #                   "hg19_Atha10": "9606_3702",
        #                   "hg19_Scer64": "9606_4932",
        #                   "hg19_Cele235": "9606_6239",
        #                   "hg19_Dmel5": "9606_7227",
        #                   "hg19_hg19": "9606_9606",
        #                   "Dmel5_Atha10": "7227_3702",
        #                   "Dmel5_Scer64": "7227_4932",
        #                   "Dmel5_Cele235": "7227_6239",
        #                   "Dmel5_Dmel5": "7227_7227",
        #                   "Cele235_Atha10": "6239_3702",
        #                   "Cele235_Scer64": "6239_4932",
        #                   "Cele235_Cele235": "6239_6239",
        #                   "Scer64_Atha10": "4932_3702",
        #                   "Scer64_Scer64": "4932_4932",
        #                   "Atha10_Atha10": "3702_3702"}

        url_base = 'http://knowredis.knoweng.org:8082/'
        aliases = dict()
        super(Blast, self).__init__(name, url_base, aliases, args)
        # Pass the resolved args so get_aliases never receives None.
        self.aliases = self.get_aliases(args)
        self.sc_max = 100  # may want to load these
        self.sc_min = 2  # may want to load these

        # Descriptive metadata about the source.
        self.source_url = "https://blast.ncbi.nlm.nih.gov/"
        self.image = "https://blast.ncbi.nlm.nih.gov/images/protein-blast-cover.png"
        self.reference = (
            "Altschul SF, Gish W, Miller W, Myers EW, Lipman DJ. Basic local "
            "alignment search tool. J Mol Biol. 1990;215(3):403-10.")
        self.pmid = 2231712
        self.license = (
            'NCBI itself places no restrictions on the use or distribution of the data '
            'contained therein. Nor do we accept data when the submitter has requested '
            'restrictions on reuse or redistribution. Full disclaimer can be found <a '
            'href="https://www.ncbi.nlm.nih.gov/home/about/policies/#data.">here</a>.'
        )
예제 #20
0
def get_database(db=None, args=None):
    """Construct a MySQL object for the given database.

    Convenience factory so that importing modules can obtain a MySQL
    object without building one themselves.

    Args:
        db (str): optional db to connect to
        args (Namespace): args as populated namespace or 'None' for defaults

    Returns:
        MySQL: a MySQL object created with 'db' and the resolved args
    """
    settings = cf.config_args() if args is None else args
    return MySQL(db, settings)
예제 #21
0
def query_all_mappings(version_dict, args=None):
    """Creates the all mappings dictionary for the provided alias.

    Queries the '<alias>_mappings' table of the 'ensembl_<alias>' database
    and builds a dictionary whose keys are '<alias_info>::<hint>::<id>' and
    whose values are stable ids. Stable ids map to themselves under the
    hint ENSEMBL_STABLE_ID; display labels and primary accessions map to
    their stable id under their db_name. Whenever an id or its mapped value
    appears among the ENS_LRG_GENE accessions, the mapped value is replaced
    by the stable id recorded for that accession. The dictionary is saved
    as '<alias>_all.json' in the map directory, which is created if missing.

    Args:
        version_dict (dict): the version dictionary describing the
            source:alias
        args (Namespace): args as populated namespace or 'None' for defaults
    """
    if args is None:
        args = cf.config_args()
    alias = version_dict['alias']
    taxid = version_dict['alias_info']
    table = alias + '_mappings'
    map_dir = os.path.join(args.data_path, cf.DEFAULT_MAP_PATH)
    # Only anchor the map directory under working_dir when that exists.
    if os.path.isdir(args.working_dir):
        map_dir = os.path.join(args.working_dir, map_dir)
    if not os.path.isdir(map_dir):
        os.mkdir(map_dir)
    db = MySQL('ensembl_' + alias, args)
    lrg_rows = db.query_distinct('dbprimary_acc, stable_id', table,
                                 "WHERE db_name='ENS_LRG_GENE'")
    lrg_dict = create_dictionary(lrg_rows)

    def through_lrg(raw, mapped):
        """Return the mapped id as a string after the two LRG substitutions."""
        if str(raw) in lrg_dict:
            mapped = lrg_dict[str(raw)]
        if str(mapped) in lrg_dict:
            mapped = lrg_dict[str(mapped)]
        return str(mapped)

    map_dict = dict()
    stable_rows = db.query_distinct('stable_id, stable_id', table)
    for (raw, mapped) in stable_rows:
        map_dict[taxid + '::ENSEMBL_STABLE_ID::' + str(raw)] = through_lrg(raw, mapped)
    hinted_rows = db.query_distinct(
        'display_label AS dbprimary_acc, db_name, stable_id', table)
    hinted_rows.extend(
        db.query_distinct('dbprimary_acc, db_name, stable_id', table))
    for (raw, hint, mapped) in hinted_rows:
        map_dict['::'.join([taxid, str(hint), str(raw)])] = through_lrg(raw, mapped)
    with open(os.path.join(map_dir, alias + '_all.json'), 'w') as outfile:
        json.dump(map_dict, outfile, indent=4)
예제 #22
0
    def get_aliases(self, args=None):
        """Helper function for producing the alias dictionary.

        This implementation defines no aliases and always returns an empty
        dictionary. The 'args' parameter is accepted for signature
        compatibility with the other sources' get_aliases and is not used.

        Args:
            args (Namespace): args as populated namespace or 'None' for
                defaults (ignored here)

        Returns:
            dict: An empty dictionary
        """
        return dict()
예제 #23
0
def import_nodes(version_dict, args=None):
    """Imports the gene nodes into the KnowNet node tables.

    Using the 'ensembl_<alias>' database, four INSERT ... SELECT statements
    are issued from the gene table, in this order:

        KnowNet.node          stable id, truncated description, 'Gene'
        KnowNet.node_species  stable id, taxid
        KnowNet.node_meta     stable id, 'biotype', gene biotype
        KnowNet.node_meta     stable id, 'taxid', taxid

    Each uses ON DUPLICATE KEY UPDATE node_id=node_id. The taxid is the
    first '::'-separated field of version_dict['alias_info'] and is spliced
    into the SQL text as-is.

    Args:
        version_dict (dict): the version dictionary describing the
            source:alias
        args (Namespace): args as populated namespace or 'None' for defaults

    Returns:
        The result of running a final SELECT of stable id, full description
        and 'Gene' for every gene through db.run.
    """
    if args is None:
        args = cf.config_args()
    alias = version_dict['alias']
    taxid = version_dict['alias_info'].split('::')[0]
    db = MySQL('ensembl_' + alias, args)
    # Shared head and tail of every statement below.
    select_id = "SELECT DISTINCT UCASE(gene.stable_id) AS node_id, "
    upsert_tail = "FROM gene ON DUPLICATE KEY UPDATE node_id=node_id"
    inserts = [
        ('KnowNet.node',
         select_id + "SUBSTRING(gene.description, 1, 512) AS n_alias, "
         "'Gene' AS n_type_id " + upsert_tail),
        ('KnowNet.node_species',
         select_id + taxid + " AS taxon " + upsert_tail),
        ('KnowNet.node_meta',
         select_id + "'biotype' AS info_type, "
         "gene.biotype AS info_desc " + upsert_tail),
        ('KnowNet.node_meta',
         select_id + "'taxid' AS info_type, " + taxid +
         " AS info_desc " + upsert_tail),
    ]
    for tablename, cmd in inserts:
        db.insert(tablename, cmd)
    return db.run(select_id + "gene.description AS n_alias, "
                  "'Gene' AS n_type_id "
                  "FROM gene")
예제 #24
0
def deploy_container(args=None):
    """Deploys a container with marathon running nginx using the specified
    args.

    This replaces the placeholder args in the json describing how to deploy a
    container running Nginx with those supplied in the user's arguments. The
    filled-in json is written to
    <working_dir>/<logs_path>/marathon_jobs/kn_nginx-<nginx_port>.json and is
    POSTed to args.marathon with curl. In test mode the curl command is
    printed instead of being executed.

    Args:
        args (Namespace): args as populated namespace or 'None' for defaults
    """
    import shlex  # local import: only needed to quote the curl payload

    if args is None:
        args = cf.config_args()
    deploy_dir = os.path.join(args.working_dir, args.logs_path,
                              'marathon_jobs')
    if not os.path.exists(deploy_dir):
        os.makedirs(deploy_dir)

    # Load the job template and fill in the user-specific values.
    template_job = os.path.join(args.code_path, 'marathon', 'nginx.json')
    with open(template_job, 'r') as infile:
        deploy_dict = json.load(infile)
    deploy_dict["id"] = os.path.basename(args.nginx_dir)
    if args.nginx_curl:
        deploy_dict["constraints"] = [["hostname", "CLUSTER", args.nginx_curl]]
    else:
        deploy_dict["constraints"] = []
    volumes = deploy_dict["container"]["volumes"]
    volumes[0]["hostPath"] = args.nginx_dir
    volumes[1]["hostPath"] = os.path.join(args.working_dir, 'KnowNet_Pipeline',
                                          'docs', '_build', 'html')
    volumes[2]["hostPath"] = os.path.join(args.code_path, 'nginx',
                                          args.nginx_conf)
    deploy_dict["container"]["docker"]["portMappings"][0]["hostPort"] = int(
        args.nginx_port)

    # Serialize once so the saved job file and the POSTed body are identical.
    payload = json.dumps(deploy_dict)
    out_path = os.path.join(deploy_dir,
                            "kn_nginx-" + args.nginx_port + '.json')
    with open(out_path, 'w') as outfile:
        outfile.write(payload)

    # The command is run through the shell, so the payload must be quoted
    # safely: a bare '...' wrapper breaks as soon as any value (e.g. a path)
    # contains a single quote. shlex.quote yields the same '...' form when
    # the payload has no single quote.
    job = ('curl -X POST -H "Content-type: application/json" ' +
           args.marathon + ' -d ' + shlex.quote(payload))
    if not args.test_mode:
        try:
            subprocess.check_output(job, shell=True)
        except subprocess.CalledProcessError as ex1:
            # Best effort: report curl's output rather than abort.
            print(ex1.output)
    else:
        print(job)
예제 #25
0
def create_KnowNet(args=None):
    """Returns a MySQL object after calling init_knownet() on it.

    The object is created without a database name (None is passed), its
    init_knownet() method is called, and it is then handed back so that
    callers importing this module can use its functions.

    Args:
        args (Namespace): args as populated namespace or 'None' for defaults

    Returns:
        MySQL: the MySQL object on which init_knownet() was called
    """
    config = cf.config_args() if args is None else args
    knownet = MySQL(None, config)
    knownet.init_knownet()
    return knownet
예제 #26
0
def import_pnode(filename, args=None):
    """Imports the provided property node file into the node table.

    The work is delegated to import_file, which is given the target table
    'node', a load clause that reads the columns (node_id, n_alias) and sets
    n_type to "Property", and a duplicate clause that assigns node.node_id
    to itself (presumably so that already existing nodes are left unchanged;
    confirm against import_file).

    Args:
        filename (str): path to the file to be imported
        args (Namespace): args as populated namespace or 'None' for defaults
    """
    config = cf.config_args() if args is None else args
    import_file(filename, 'node',
                '(node_id, n_alias) SET n_type="Property"',
                'node.node_id = node.node_id', config)
예제 #27
0
    def __init__(self, args=None):
        """Init a Pathcom with the statically defined parameters.

        This calls the SrcClass constructor (see utilities.SrcClass) with the
        name 'pathcom', the Pathway Commons archive url and a single 'all'
        alias, then sets the descriptive attributes of the source.

        Args:
            args (Namespace): args as populated namespace or 'None' for
                defaults
        """
        # Resolve the default at call time. A `cf.config_args()` default in
        # the signature is evaluated only once, when the method is defined,
        # and the same object would then be shared by every instance.
        if args is None:
            args = cf.config_args()
        name = 'pathcom'
        url_base = 'http://www.pathwaycommons.org/archives/PC2/'
        aliases = {"all": ""}
        super(Pathcom, self).__init__(name, url_base, aliases, args)

        # Descriptive metadata about the source.
        self.source_url = "http://www.pathwaycommons.org/"
        self.image = "https://pbs.twimg.com/profile_images/862675480281042944/PblJi9Va.jpg"
        self.reference = ("Cerami EG, Gross BE, Demir E, et al. Pathway Commons, a web resource "
                          "for biological pathway data. Nucleic Acids Res. "
                          "2011;39(Database issue):D685-90.")
        self.pmid = 21071392
        self.license = ('Full list of data sources are available <a '
                        'href="http://www.pathwaycommons.org/pc2/datasources">here</a>.')
예제 #28
0
파일: go.py 프로젝트: cblatti3/KN_Builder
    def __init__(self, args=None):
        """Init a Go with the statically defined parameters.

        This calls the SrcClass constructor (see utilities.SrcClass) with the
        name 'go' and the gene-associations url, then replaces the empty
        alias dictionary with the result of get_aliases and sets the
        descriptive attributes of the source.

        Args:
            args (Namespace): args as populated namespace or 'None' for
                defaults
        """
        # Resolve the default at call time. A `cf.config_args()` default in
        # the signature is evaluated only once, when the method is defined,
        # and the same object would then be shared by every instance.
        if args is None:
            args = cf.config_args()
        name = 'go'
        url_base = 'http://geneontology.org/gene-associations/'
        aliases = dict()
        super(Go, self).__init__(name, url_base, aliases, args)
        self.aliases = self.get_aliases(args)
        self.chunk_size = 250000

        # Descriptive metadata about the source.
        self.source_url = "http://www.geneontology.org/"
        self.image = "https://avatars3.githubusercontent.com/u/7750835?v=3&s=200"
        self.reference = ("Gene Ontology Consortium: going forward. Nucleic Acids Res. "
                          "2015;43(Database issue):D1049-56.")
        self.pmid = 25428369
        self.license = 'Creative commons license attribution 4.0 international'
예제 #29
0
def import_nodemeta(nmfile, args=None):
    """Imports the provided node_meta file into the node_meta table.

    The work is delegated to import_file, which is given the target table
    'node_meta', an empty load clause, and a duplicate clause that assigns
    node_meta.node_id to itself (presumably so that already existing rows
    are left unchanged; confirm against import_file).

    Args:
        nmfile (str): path to the file to be imported
        args (Namespace): args as populated namespace or 'None' for defaults
    """
    config = cf.config_args() if args is None else args
    import_file(nmfile, 'node_meta', '',
                'node_meta.node_id = node_meta.node_id', config)
예제 #30
0
    def __init__(self, args=None):
        """Init a Stringdb with the statically defined parameters.

        This calls the SrcClass constructor (see utilities.SrcClass) with the
        name 'stringdb' and the STRING url, then replaces the empty alias
        dictionary with the result of get_aliases and sets the descriptive
        attributes of the source.

        Args:
            args (Namespace): args as populated namespace or 'None' for
                defaults
        """
        # Resolve the default at call time. A `cf.config_args()` default in
        # the signature is evaluated only once, when the method is defined,
        # and the same object would then be shared by every instance.
        if args is None:
            args = cf.config_args()
        name = 'stringdb'
        url_base = 'https://string-db.org/'
        aliases = dict()
        super(Stringdb, self).__init__(name, url_base, aliases, args)
        self.aliases = self.get_aliases(args)
        self.chunk_size = 250000

        # Descriptive metadata about the source.
        self.source_url = "https://string-db.org/"
        self.image = "http://meringlab.org/logos/string.png"
        self.reference = ("Szklarczyk D, Franceschini A, Wyder S, et al. STRING v10: "
                          "protein-protein interaction networks, integrated over the tree of life. "
                          "Nucleic Acids Res. 2015;43(Database issue):D447-52.")
        self.pmid = 25352553
        self.license = ('The dataset obtained from STRING is distributed under '
                        'Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA 4.0)')