예제 #1
0
파일: tcga.py 프로젝트: ndexbio/ndex-nav
def build_mapping(file):
    """Build identifier lookup tables from a two-column mapping file.

    Every non-blank line of the file must hold exactly two
    whitespace-separated tokens.  The second token becomes the key and the
    first token becomes the value of ``id_to_symbol``.

    Args:
        file: Path of the mapping file to read.

    Returns:
        A tuple ``(id_to_symbol, id_to_ensembl)``.  ``id_to_ensembl`` maps
        each key of ``id_to_symbol`` to whatever
        ``genemania.id_lookup_table`` returned for its symbol (presumably an
        Ensembl id, judging by the name); keys whose symbol is missing from
        that table are left out.

    Raises:
        ValueError: If a non-blank line does not contain exactly two tokens.
    """
    id_to_symbol = {}
    with open(file) as fid:
        for line in fid:
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline at EOF)
                # instead of failing on the tuple unpack below.
                continue
            target, src = fields
            id_to_symbol[src] = target

    # list(...) keeps the argument a plain list on both Python 2 and 3.
    symbol_to_id = genemania.id_lookup_table(list(id_to_symbol.values()))

    # .items() works on Python 2 and 3; .iteritems() is Python 2 only.
    id_to_ensembl = {
        src: symbol_to_id[symbol]
        for src, symbol in id_to_symbol.items()
        if symbol in symbol_to_id
    }

    return id_to_symbol, id_to_ensembl
예제 #2
0
def build_mapping(file):
    """Read a two-column mapping file and derive two lookup dictionaries.

    Each non-blank line is split on whitespace into ``target`` and ``src``
    (in that order); ``src`` is used as the dictionary key and ``target`` as
    the value.

    Args:
        file: Path of the mapping file to read.

    Returns:
        ``(id_to_symbol, id_to_ensembl)`` where ``id_to_symbol`` maps each
        ``src`` to its ``target`` and ``id_to_ensembl`` maps each ``src`` to
        the entry ``genemania.id_lookup_table`` holds for that ``target``.
        Entries whose ``target`` is not a key of the lookup table are
        omitted from ``id_to_ensembl``.

    Raises:
        ValueError: If a non-blank line does not contain exactly two tokens.
    """
    id_to_symbol = {}
    with open(file) as fid:
        for line in fid:
            tokens = line.split()
            if not tokens:
                # Skip blank lines rather than crash while unpacking.
                continue
            target, src = tokens
            id_to_symbol[src] = target

    # Materialise the values so the helper receives a list on Python 3 too.
    symbols = list(id_to_symbol.values())
    symbol_to_id = genemania.id_lookup_table(symbols)

    # dict.items() is available on both Python 2 and 3 (iteritems is not).
    id_to_ensembl = {
        key: symbol_to_id[symbol]
        for key, symbol in id_to_symbol.items()
        if symbol in symbol_to_id
    }

    return id_to_symbol, id_to_ensembl
예제 #3
0
파일: go.py 프로젝트: ucsd-ccbb/Oncolist
def load_go_genes():
    """Rebuild the ``go.genes`` MongoDB collection from the GO annotation file.

    The gzipped human gene association file is downloaded with ``wget`` into
    a temporary directory and decompressed with ``gunzip``.  The third
    tab-separated field of every non-comment line (lines starting with ``!``
    are skipped) is resolved through ``genemania.id_lookup_table``; one
    document ``{'gene': <resolved id or None>, 'go': <fifth field>}`` is then
    stored per annotation line.  Finally ``update_info`` is called and
    indexes on ``go`` and ``gene`` are created.

    Raises:
        subprocess.CalledProcessError: If ``wget`` or ``gunzip`` exits with a
            non-zero status.
    """
    info = {
        'database': 'go',
        'collection': 'genes',
        'url':
        'http://geneontology.org/gene-associations/gene_association.goa_human.gz',
        'timestamp': time.time()
    }
    client = pymongo.MongoClient()
    collection = client[info['database']][info['collection']]
    with mktemp() as pathname:
        filename = os.path.join(pathname, 'gene_association.goa_human.gz')
        log.debug('downloading %s to %s', info['url'], filename)
        # check_call surfaces a failed download immediately instead of
        # letting it show up later as a confusing open() error.
        subprocess.check_call(['wget', info['url'], '-O', filename])
        log.debug('gunzip %s', filename)
        subprocess.check_call(['gunzip', filename])
        # gunzip replaces "<name>.gz" with "<name>".
        filename, _ = os.path.splitext(filename)

        with open(filename, 'rt') as fid:
            log.debug(
                'creating a name to emsembl id lookup table from go genes...')
            go_genes = {
                line.split('\t')[2] for line in fid if not line.startswith('!')
            }

        name_to_id = genemania.id_lookup_table(go_genes)

        # Drop the old data only once the new data is ready to be loaded, so
        # a failed download or lookup does not leave the collection empty.
        collection.drop()

        with open(filename, 'rt') as fid:
            status = Status(filename, log).fid(fid).start()
            for line in fid:
                status.log()
                if not line.startswith('!'):
                    tokens = line.split('\t')
                    obj = {'gene': name_to_id.get(tokens[2]), 'go': tokens[4]}
                    # insert_one replaces the deprecated Collection.insert.
                    collection.insert_one(obj)
            status.stop()

    update_info(info)
    collection.create_index('go')
    collection.create_index('gene')
예제 #4
0
파일: go.py 프로젝트: ndexbio/ndex-nav
def load_go_genes():
    """Reload GO gene annotations into the ``go.genes`` MongoDB collection.

    Steps:
      1. Download the gzipped human gene association file with ``wget`` into
         a temporary directory and decompress it with ``gunzip``.
      2. Collect the third tab-separated field of every line that does not
         start with ``!`` and resolve those names through
         ``genemania.id_lookup_table``.
      3. Drop the existing collection and insert one document per
         annotation line, holding the resolved id (``None`` when the name is
         not in the lookup table) under ``gene`` and the fifth field under
         ``go``.
      4. Call ``update_info`` and create indexes on ``go`` and ``gene``.

    Raises:
        subprocess.CalledProcessError: If ``wget`` or ``gunzip`` exits with a
            non-zero status.
    """
    info = {
        'database': 'go',
        'collection': 'genes',
        'url': 'http://geneontology.org/gene-associations/gene_association.goa_human.gz',
        'timestamp': time.time()
    }
    client = pymongo.MongoClient()
    collection = client[info['database']][info['collection']]
    with mktemp() as pathname:
        filename = os.path.join(pathname, 'gene_association.goa_human.gz')
        log.debug('downloading %s to %s', info['url'], filename)
        # Fail loudly if the download or decompression does not succeed;
        # subprocess.call would silently ignore the exit status.
        subprocess.check_call(['wget', info['url'], '-O', filename])
        log.debug('gunzip %s', filename)
        subprocess.check_call(['gunzip', filename])
        # After gunzip the file lives under the same name without ".gz".
        filename, _ = os.path.splitext(filename)

        with open(filename, 'rt') as fid:
            log.debug('creating a name to emsembl id lookup table from go genes...')
            go_genes = {line.split('\t')[2] for line in fid if not line.startswith('!')}

        name_to_id = genemania.id_lookup_table(go_genes)

        # Only discard the previous contents now that the replacement data
        # has been fetched and the lookup table built.
        collection.drop()

        with open(filename, 'rt') as fid:
            status = Status(filename, log).fid(fid).start()
            for line in fid:
                status.log()
                if line.startswith('!'):
                    # Comment/header line of the annotation file.
                    continue
                tokens = line.split('\t')
                obj = {
                    'gene': name_to_id.get(tokens[2]),
                    'go': tokens[4]
                }
                # Collection.insert is deprecated; use insert_one instead.
                collection.insert_one(obj)
            status.stop()

    update_info(info)
    collection.create_index('go')
    collection.create_index('gene')
예제 #5
0
def main():
    """Load the HumanNet v1 network into the ``networks`` MongoDB database.

    One metadata document per evidence column is inserted into ``meta``
    (with its status set through ``set_status``), the joined HumanNet file
    is downloaded, and the records produced by ``parse`` are inserted into
    ``edges`` in batches of 1000.  Before insertion the ``source`` and
    ``target`` of every record are replaced by the ids returned from
    ``genemania.id_lookup_table``; records with an unresolved endpoint are
    logged and skipped.  Afterwards the new metadata documents are marked
    ``success``, metadata from earlier loads is deleted and
    ``cleanup_edges`` is called.

    Returns:
        0 on completion.

    Raises:
        requests.HTTPError: If the download answers with an error status.
    """
    client = pymongo.MongoClient()
    db = client.networks

    # collection stores metadata about source networks
    meta = db.meta

    # collection stores edge data
    edges = db.edges

    # create index, if necessary
    create_edges_index()

    # get list of previously loaded networks to delete, if any
    _ids = [result['_id'] for result in meta.find({'collection': 'humannet'})]

    # From http://www.functionalnet.org/humannet/HumanNet.v1.evidence_code.txt:
    # File format: [gene1] [gene2] [CE-CC] [CE-CX] [CE-GT] [CE-LC] [CE-YH] [DM-PI] [HS-CC] [HS-CX] [HS-DC] [HS-GN] [HS-LC] [HS-MS] [HS-PG] [HS-YH] [SC-CC] [SC-CX] [SC-GT] [SC-LC] [SC-MS] [SC-TS] [SC-YH] [IntNet]
    # CE-CC = Co-citation of worm gene
    # CE-CX = Co-expression among worm genes
    # CE-GT = Worm genetic interactions
    # CE-LC = Literature curated worm protein physical interactions
    # CE-YH = High-throughput yeast 2-hybrid assays among worm genes
    # DM-PI = Fly protein physical interactions
    # HS-CC = Co-citation of human genes
    # HS-CX = Co-expression among human genes
    # HS-DC = Co-occurrence of domains among human proteins
    # HS-GN = Gene neighbourhoods of bacterial and archaeal orthologs of human genes
    # HS-LC = Literature curated human protein physical interactions
    # HS-MS = human protein complexes from affinity purification/mass spectrometry
    # HS-PG = Co-inheritance of bacterial and archaeal orthologs of human genes
    # HS-YH = High-throughput yeast 2-hybrid assays among human genes
    # SC-CC = Co-citation of yeast genes
    # SC-CX = Co-expression among yeast genes
    # SC-GT = Yeast genetic interactions
    # SC-LC = Literature curated yeast protein physical interactions
    # SC-MS = Yeast protein complexes from affinity purification/mass spectrometry
    # SC-TS = Yeast protein interactions inferred from tertiary structures of complexes
    # SC-YH = High-throughput yeast 2-hybrid assays among yeast genes
    # IntNet = Integrated network (HumanNet)

    columns = [
        'co-citation of worm gene', 'co-expression among worm genes',
        'worm genetic interactions',
        'literature curated worm protein physical interactions',
        'high-throughput yeast 2-hybrid assays among worm genes',
        'fly protein physical interactions', 'co-citation of human genes',
        'co-expression among human genes',
        'co-occurrence of domains among human proteins',
        'gene neighbourhoods of bacterial and archaeal orthologs of human genes',
        'literature curated human protein physical interactions',
        'human protein complexes from affinity purification/mass spectrometry',
        'co-inheritance of bacterial and archaeal orthologs of human genes',
        'high-throughput yeast 2-hybrid assays among human genes',
        'co-citation of yeast genes', 'co-expression among yeast genes',
        'yeast genetic interactions',
        'literature curated yeast protein physical interactions',
        'yeast protein complexes from affinity purification/mass spectrometry',
        'yeast protein interactions inferred from tertiary structures of complexes',
        'high-throughput yeast 2-hybrid assays among yeast genes'
    ]

    # one metadata document per evidence column, keyed by column name
    metadata = {}

    for column in columns:
        m = {'collection': 'humannet', 'name': column, 'count': 0}
        set_status(m, 'parsing')
        m['_id'] = meta.insert_one(m).inserted_id
        metadata[column] = m

    url = 'http://www.functionalnet.org/humannet/HumanNet.v1.join.txt'
    log.info('reading network list from %s', url)
    r = requests.get(url)
    # do not feed an HTTP error page to the parser
    r.raise_for_status()
    lines = list(r.iter_lines())

    count = 0
    batch_size = 1000

    iterator = parse(columns, metadata, lines)
    while True:
        records = list(islice(iterator, batch_size))
        if not records:
            break

        # resolve every gene name of the batch with a single lookup
        names = set()
        for record in records:
            names.add(record['source'])
            names.add(record['target'])
        name_to_id = genemania.id_lookup_table(names)

        for record in records:
            # .get treats a name that is absent from the table the same way
            # as one that is mapped to None
            source = name_to_id.get(record['source'])
            if source is None:
                log.warning('unknown source %s', record['source'])
            record['source'] = source

            target = name_to_id.get(record['target'])
            if target is None:
                log.warning('unknown target %s', record['target'])
            record['target'] = target

        records = [
            record for record in records
            if record['source'] is not None and record['target'] is not None
        ]
        count += len(records)
        # insert_many rejects an empty list, which happens when every record
        # of a batch had an unresolved endpoint
        if records:
            edges.insert_many(records)
        log.debug('inserted %d edges (%d total)', len(records), count)

    # .values() works on Python 2 and 3 (itervalues is Python 2 only)
    for m in metadata.values():
        set_status(m, 'success')
        meta.replace_one({'_id': m['_id']}, m)

    if len(_ids) > 0:
        log.info('dropping old network metadata')
        meta.delete_many({'_id': {'$in': _ids}})

    cleanup_edges()

    return 0
예제 #6
0
def main():
    """Import the HumanNet v1 network into the ``networks`` MongoDB database.

    The function registers one ``meta`` document per evidence column
    (status set via ``set_status``), downloads the joined HumanNet file and
    feeds its lines to ``parse``.  The resulting records are processed in
    batches of 1000: their ``source`` and ``target`` names are translated
    with ``genemania.id_lookup_table``, records with an unresolved endpoint
    are logged and discarded, and the rest are written to ``edges``.  At the
    end the new metadata is marked ``success``, metadata documents that
    existed before this run are deleted and ``cleanup_edges`` is invoked.

    Returns:
        0 on completion.

    Raises:
        requests.HTTPError: If the download answers with an error status.
    """
    client = pymongo.MongoClient()
    db = client.networks

    # collection stores metadata about source networks
    meta = db.meta

    # collection stores edge data
    edges = db.edges

    # create index, if necessary
    create_edges_index()

    # get list of previously loaded networks to delete, if any
    _ids = [result['_id'] for result in meta.find({'collection': 'humannet'})]

    # From http://www.functionalnet.org/humannet/HumanNet.v1.evidence_code.txt:
    # File format: [gene1] [gene2] [CE-CC] [CE-CX] [CE-GT] [CE-LC] [CE-YH] [DM-PI] [HS-CC] [HS-CX] [HS-DC] [HS-GN] [HS-LC] [HS-MS] [HS-PG] [HS-YH] [SC-CC] [SC-CX] [SC-GT] [SC-LC] [SC-MS] [SC-TS] [SC-YH] [IntNet]
    # CE-CC = Co-citation of worm gene
    # CE-CX = Co-expression among worm genes
    # CE-GT = Worm genetic interactions
    # CE-LC = Literature curated worm protein physical interactions
    # CE-YH = High-throughput yeast 2-hybrid assays among worm genes
    # DM-PI = Fly protein physical interactions
    # HS-CC = Co-citation of human genes
    # HS-CX = Co-expression among human genes
    # HS-DC = Co-occurrence of domains among human proteins
    # HS-GN = Gene neighbourhoods of bacterial and archaeal orthologs of human genes
    # HS-LC = Literature curated human protein physical interactions
    # HS-MS = human protein complexes from affinity purification/mass spectrometry
    # HS-PG = Co-inheritance of bacterial and archaeal orthologs of human genes
    # HS-YH = High-throughput yeast 2-hybrid assays among human genes
    # SC-CC = Co-citation of yeast genes
    # SC-CX = Co-expression among yeast genes
    # SC-GT = Yeast genetic interactions
    # SC-LC = Literature curated yeast protein physical interactions
    # SC-MS = Yeast protein complexes from affinity purification/mass spectrometry
    # SC-TS = Yeast protein interactions inferred from tertiary structures of complexes
    # SC-YH = High-throughput yeast 2-hybrid assays among yeast genes
    # IntNet = Integrated network (HumanNet)

    columns = [
        'co-citation of worm gene',
        'co-expression among worm genes',
        'worm genetic interactions',
        'literature curated worm protein physical interactions',
        'high-throughput yeast 2-hybrid assays among worm genes',
        'fly protein physical interactions',
        'co-citation of human genes',
        'co-expression among human genes',
        'co-occurrence of domains among human proteins',
        'gene neighbourhoods of bacterial and archaeal orthologs of human genes',
        'literature curated human protein physical interactions',
        'human protein complexes from affinity purification/mass spectrometry',
        'co-inheritance of bacterial and archaeal orthologs of human genes',
        'high-throughput yeast 2-hybrid assays among human genes',
        'co-citation of yeast genes',
        'co-expression among yeast genes',
        'yeast genetic interactions',
        'literature curated yeast protein physical interactions',
        'yeast protein complexes from affinity purification/mass spectrometry',
        'yeast protein interactions inferred from tertiary structures of complexes',
        'high-throughput yeast 2-hybrid assays among yeast genes'
    ]

    # metadata documents of this run, keyed by column name
    metadata = {}

    for column in columns:
        m = {
            'collection': 'humannet',
            'name': column,
            'count': 0
        }
        set_status(m, 'parsing')
        m['_id'] = meta.insert_one(m).inserted_id
        metadata[column] = m

    url = 'http://www.functionalnet.org/humannet/HumanNet.v1.join.txt'
    log.info('reading network list from %s', url)
    r = requests.get(url)
    # abort on an HTTP error instead of parsing the error response body
    r.raise_for_status()
    lines = list(r.iter_lines())

    count = 0
    batch_size = 1000

    iterator = parse(columns, metadata, lines)
    while True:
        records = list(islice(iterator, batch_size))
        if not records:
            # the parser is exhausted
            break

        # look up all names that occur in this batch at once
        sources = set(it['source'] for it in records)
        targets = set(it['target'] for it in records)
        name_to_id = genemania.id_lookup_table(sources | targets)

        for record in records:
            # a name missing from the table is handled like one mapped to
            # None rather than raising KeyError
            source = name_to_id.get(record['source'])
            if source is None:
                log.warning('unknown source %s', record['source'])
            record['source'] = source

            target = name_to_id.get(record['target'])
            if target is None:
                log.warning('unknown target %s', record['target'])
            record['target'] = target

        records = [record for record in records if record['source'] is not None and record['target'] is not None]
        count += len(records)
        # a batch may end up empty after filtering, and insert_many refuses
        # an empty document list
        if records:
            edges.insert_many(records)
        log.debug('inserted %d edges (%d total)', len(records), count)

    # dict.values() is portable; dict.itervalues() exists on Python 2 only
    for m in metadata.values():
        set_status(m, 'success')
        meta.replace_one({'_id': m['_id']}, m)

    if len(_ids) > 0:
        log.info('dropping old network metadata')
        meta.delete_many({'_id': {'$in': _ids}})

    cleanup_edges()

    return 0