Пример #1
0
def main():
    """Load the HumanNet v1 evidence networks into the ``networks`` database.

    One metadata record (collection 'humannet') is created per evidence
    column, the joined HumanNet file is downloaded and parsed, and the
    resulting edges are inserted in batches of up to 1000 after their gene
    names have been translated through ``genemania.id_lookup_table``.  Edges
    whose source or target cannot be resolved are logged and skipped.  Once
    every edge is stored the new metadata records are marked 'success' and
    the metadata records that existed before this run are deleted.

    Returns:
        int: always 0.

    Raises:
        requests.HTTPError: if the HumanNet file cannot be downloaded.
    """
    client = pymongo.MongoClient()
    db = client.networks

    # collection stores metadata about source networks
    meta = db.meta

    # collection stores edge data
    edges = db.edges

    # create index, if necessary
    create_edges_index()

    # get list of previously loaded networks to delete, if any
    _ids = [result['_id'] for result in meta.find({'collection': 'humannet'})]

    # From http://www.functionalnet.org/humannet/HumanNet.v1.evidence_code.txt:
    # File format: [gene1] [gene2] [CE-CC] [CE-CX] [CE-GT] [CE-LC] [CE-YH] [DM-PI] [HS-CC] [HS-CX] [HS-DC] [HS-GN] [HS-LC] [HS-MS] [HS-PG] [HS-YH] [SC-CC] [SC-CX] [SC-GT] [SC-LC] [SC-MS] [SC-TS] [SC-YH] [IntNet]
    # CE-CC = Co-citation of worm gene
    # CE-CX = Co-expression among worm genes
    # CE-GT = Worm genetic interactions
    # CE-LC = Literature curated worm protein physical interactions
    # CE-YH = High-throughput yeast 2-hybrid assays among worm genes
    # DM-PI = Fly protein physical interactions
    # HS-CC = Co-citation of human genes
    # HS-CX = Co-expression among human genes
    # HS-DC = Co-occurrence of domains among human proteins
    # HS-GN = Gene neighbourhoods of bacterial and archaeal orthologs of human genes
    # HS-LC = Literature curated human protein physical interactions
    # HS-MS = human protein complexes from affinity purification/mass spectrometry
    # HS-PG = Co-inheritance of bacterial and archaeal orthologs of human genes
    # HS-YH = High-throughput yeast 2-hybrid assays among human genes
    # SC-CC = Co-citation of yeast genes
    # SC-CX = Co-expression among yeast genes
    # SC-GT = Yeast genetic interactions
    # SC-LC = Literature curated yeast protein physical interactions
    # SC-MS = Yeast protein complexes from affinity purification/mass spectrometry
    # SC-TS = Yeast protein interactions inferred from tertiary structures of complexes
    # SC-YH = High-throughput yeast 2-hybrid assays among yeast genes
    # IntNet = Integrated network (HumanNet)

    # one human-readable network name per evidence column, in file order
    # (the trailing IntNet column has no entry here)
    columns = [
        'co-citation of worm gene', 'co-expression among worm genes',
        'worm genetic interactions',
        'literature curated worm protein physical interactions',
        'high-throughput yeast 2-hybrid assays among worm genes',
        'fly protein physical interactions', 'co-citation of human genes',
        'co-expression among human genes',
        'co-occurrence of domains among human proteins',
        'gene neighbourhoods of bacterial and archaeal orthologs of human genes',
        'literature curated human protein physical interactions',
        'human protein complexes from affinity purification/mass spectrometry',
        'co-inheritance of bacterial and archaeal orthologs of human genes',
        'high-throughput yeast 2-hybrid assays among human genes',
        'co-citation of yeast genes', 'co-expression among yeast genes',
        'yeast genetic interactions',
        'literature curated yeast protein physical interactions',
        'yeast protein complexes from affinity purification/mass spectrometry',
        'yeast protein interactions inferred from tertiary structures of complexes',
        'high-throughput yeast 2-hybrid assays among yeast genes'
    ]

    # network name -> metadata record; the records are stored immediately in
    # the 'parsing' state so that edges can refer to their _id
    metadata = {}

    for column in columns:
        m = {'collection': 'humannet', 'name': column, 'count': 0}
        set_status(m, 'parsing')
        m['_id'] = meta.insert_one(m).inserted_id
        metadata[column] = m

    url = 'http://www.functionalnet.org/humannet/HumanNet.v1.join.txt'
    log.info('reading network list from %s', url)
    r = requests.get(url)
    # fail loudly on an HTTP error: otherwise an error page would be parsed as
    # data and the previously loaded networks would be dropped further down
    r.raise_for_status()
    lines = list(r.iter_lines())

    count = 0

    # NOTE(review): parse() is defined elsewhere; it presumably yields edge
    # dicts with 'source'/'target' gene names and maintains the per-network
    # 'count' in metadata -- confirm against its definition
    iterator = parse(columns, metadata, lines)
    while True:
        records = list(islice(iterator, 1000))
        if not records:
            break

        names = set(it['source'] for it in records)
        names |= set(it['target'] for it in records)
        name_to_id = genemania.id_lookup_table(names)

        resolved = []
        for record in records:
            source = name_to_id[record['source']]
            if source is None:
                log.warning('unknown source %s', record['source'])
            record['source'] = source

            target = name_to_id[record['target']]
            if target is None:
                log.warning('unknown target %s', record['target'])
            record['target'] = target

            if source is not None and target is not None:
                resolved.append(record)

        # insert_many() rejects an empty list, which happens whenever every
        # edge of a batch refers to an unknown gene
        if resolved:
            count += len(resolved)
            edges.insert_many(resolved)
        log.debug('inserted %d edges (%d total)', len(resolved), count)

    # values() instead of itervalues() so the loop runs on Python 2 and 3
    for m in metadata.values():
        set_status(m, 'success')
        meta.replace_one({'_id': m['_id']}, m)

    if len(_ids) > 0:
        log.info('dropping old network metadata')
        meta.delete_many({'_id': {'$in': _ids}})

    cleanup_edges()

    return 0
Пример #2
0
def main():
    """Load the GeneMANIA Homo sapiens networks into the ``networks`` database.

    Command-line flags:
        --id         load identifiers only; the network download is skipped
        --batch      batch size passed to ``load_network`` (default 10000)
        --warmstart  skip ``load_identifiers()`` and skip every network that
                     already has a matching metadata record with status
                     'success'

    For each line of the GeneMANIA network list a metadata record is stored
    in the 'parsing' state, the network file is loaded via ``load_network``,
    the record is updated with the edge count and marked 'success', and any
    older metadata records describing the same network are deleted.

    Returns:
        int: always 0.

    Raises:
        requests.HTTPError: if the network list cannot be downloaded.
        ValueError: if a line of the network list does not consist of
            exactly five tab-separated fields.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--id', action='store_true', help='load identifiers only')
    parser.add_argument('--batch', type=int, default=10000, help='insert records batch size')
    parser.add_argument('--warmstart', action='store_true', help='warmstart')
    args = parser.parse_args()

    if not args.warmstart:
        load_identifiers()

    if args.id:
        # identifiers only: nothing else to do
        return 0

    client = pymongo.MongoClient()
    db = client.networks

    # collection stores metadata about source networks
    meta = db.meta

    # collection stores edge data
    edges = db.edges

    create_edges_index()

    url = 'http://genemania.org/data/current/Homo_sapiens/networks.txt'
    log.info('reading network list from %s', url)
    r = requests.get(url)
    # fail with a clear HTTP error instead of trying to parse an error page
    r.raise_for_status()
    lines = list(r.iter_lines())[1:] # ignore header line

    status = Status('networks', logger=log).n(len(lines)).start()
    for idx, line in enumerate(lines):
        status.log(idx)
        file_name, network_group_name, network_name, source, pubmed_id = line.split('\t')

        metadata = {
            'collection': 'genemania',
            'type': network_group_name.lower(),
            'source': source,
            'name': network_name,
            'pubmed': int(pubmed_id) if pubmed_id else 0
        }

        # on a warm start, networks that were already loaded successfully are
        # left alone; dict(mapping, key=value) works on Python 2 and 3
        if args.warmstart and meta.find_one(dict(metadata, status='success')) is not None:
            continue

        # old metadata records and their associated edges will be dropped after the new network is finished processing
        _ids = [result['_id'] for result in meta.find(metadata)]
        log.info('found %d matching network(s) that will be replaced: %s', len(_ids), ', '.join([str(_id) for _id in _ids]))

        set_status(metadata, 'parsing')
        _id = meta.insert_one(metadata).inserted_id

        metadata['count'] = load_network('http://genemania.org/data/current/Homo_sapiens/' + file_name, _id, args.batch)
        log.info('%s %s %s network has %d edges', metadata['source'], metadata['name'], metadata['type'], metadata['count'])

        set_status(metadata, 'success')
        # Collection.save() is deprecated in PyMongo 3 and removed in PyMongo 4;
        # replace_one() with upsert=True is what save() did for a document
        # that already carries an _id
        meta.replace_one({'_id': _id}, metadata, upsert=True)

        if len(_ids) > 0:
            log.info('dropping old network metadata')
            meta.delete_many({'_id': {'$in': _ids}})

    # NOTE(review): cleanup_edges() is defined elsewhere; presumably it removes
    # edges whose metadata record no longer exists -- confirm
    cleanup_edges()

    status.stop()

    return 0
Пример #3
0
def main():
    """Load the GeneMANIA Homo sapiens networks into the ``networks`` database.

    Command-line flags:
        --warmstart  skip ``load_identifiers()`` and skip every network that
                     already has a matching metadata record with status
                     'success'

    For each line of the GeneMANIA network list a metadata record is stored
    in the 'parsing' state, the network file is loaded via ``load_network``,
    the record is updated with the edge count and marked 'success', and any
    older metadata records describing the same network are deleted.  At the
    end, every edge whose 'meta' value is not the _id of an existing metadata
    record is deleted.

    Returns:
        int: always 0.

    Raises:
        requests.HTTPError: if the network list cannot be downloaded.
        ValueError: if a line of the network list does not consist of
            exactly five tab-separated fields.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--warmstart', action='store_true', help='warmstart')
    args = parser.parse_args()

    if not args.warmstart:
        load_identifiers()

    client = pymongo.MongoClient()
    db = client.networks

    # collection stores metadata about source networks
    meta = db.meta

    # collection stores edge data
    edges = db.edges

    create_edges_index()

    url = 'http://genemania.org/data/current/Homo_sapiens/networks.txt'
    log.info('reading network list from %s', url)
    r = requests.get(url)
    # fail with a clear HTTP error instead of trying to parse an error page
    r.raise_for_status()
    lines = list(r.iter_lines())[1:] # ignore header line

    status = Status('networks', logger=log).n(len(lines)).start()
    for idx, line in enumerate(lines):
        status.log(idx)
        file_name, network_group_name, network_name, source, pubmed_id = line.split('\t')

        # NOTE(review): these are GeneMANIA networks but they are tagged with
        # collection 'identifiers' -- confirm that this tag is intended
        metadata = {
            'collection': 'identifiers',
            'type': network_group_name.lower(),
            'source': source,
            'name': network_name,
            'pubmed': int(pubmed_id) if pubmed_id else 0
        }

        # on a warm start, networks that were already loaded successfully are
        # left alone; dict(mapping, key=value) works on Python 2 and 3
        if args.warmstart and meta.find_one(dict(metadata, status='success')) is not None:
            continue

        # old metadata records and their associated edges will be dropped after the new network is finished processing
        _ids = [result['_id'] for result in meta.find(metadata)]
        log.info('found %d matching network(s) that will be replaced: %s', len(_ids), ', '.join([str(_id) for _id in _ids]))

        set_status(metadata, 'parsing')
        _id = meta.insert_one(metadata).inserted_id

        metadata['count'] = load_network('http://genemania.org/data/current/Homo_sapiens/' + file_name, _id)
        log.info('%s %s %s network has %d edges', metadata['source'], metadata['name'], metadata['type'], metadata['count'])

        set_status(metadata, 'success')
        # Collection.save() is deprecated in PyMongo 3 and removed in PyMongo 4;
        # replace_one() with upsert=True is what save() did for a document
        # that already carries an _id
        meta.replace_one({'_id': _id}, metadata, upsert=True)

        if len(_ids) > 0:
            log.info('dropping old network metadata')
            meta.delete_many({'_id': {'$in': _ids}})

    # remove every edge that does not belong to a surviving metadata record
    log.info('dropping old edge data')
    edges.delete_many({'meta': {'$nin': [it['_id'] for it in meta.find()]}})

    status.stop()
    return 0
Пример #4
0
def main():
    """Load the HumanNet v1 evidence networks into the ``networks`` database.

    One metadata record (collection 'humannet') is created per evidence
    column, the joined HumanNet file is downloaded and parsed, and the
    resulting edges are inserted in batches of up to 1000 after their gene
    names have been translated through ``genemania.id_lookup_table``.  Edges
    whose source or target cannot be resolved are logged and skipped.  Once
    every edge is stored the new metadata records are marked 'success' and
    the metadata records that existed before this run are deleted.

    Returns:
        int: always 0.

    Raises:
        requests.HTTPError: if the HumanNet file cannot be downloaded.
    """
    client = pymongo.MongoClient()
    db = client.networks

    # collection stores metadata about source networks
    meta = db.meta

    # collection stores edge data
    edges = db.edges

    # create index, if necessary
    create_edges_index()

    # get list of previously loaded networks to delete, if any
    _ids = [result['_id'] for result in meta.find({'collection': 'humannet'})]

    # From http://www.functionalnet.org/humannet/HumanNet.v1.evidence_code.txt:
    # File format: [gene1] [gene2] [CE-CC] [CE-CX] [CE-GT] [CE-LC] [CE-YH] [DM-PI] [HS-CC] [HS-CX] [HS-DC] [HS-GN] [HS-LC] [HS-MS] [HS-PG] [HS-YH] [SC-CC] [SC-CX] [SC-GT] [SC-LC] [SC-MS] [SC-TS] [SC-YH] [IntNet]
    # CE-CC = Co-citation of worm gene
    # CE-CX = Co-expression among worm genes
    # CE-GT = Worm genetic interactions
    # CE-LC = Literature curated worm protein physical interactions
    # CE-YH = High-throughput yeast 2-hybrid assays among worm genes
    # DM-PI = Fly protein physical interactions
    # HS-CC = Co-citation of human genes
    # HS-CX = Co-expression among human genes
    # HS-DC = Co-occurrence of domains among human proteins
    # HS-GN = Gene neighbourhoods of bacterial and archaeal orthologs of human genes
    # HS-LC = Literature curated human protein physical interactions
    # HS-MS = human protein complexes from affinity purification/mass spectrometry
    # HS-PG = Co-inheritance of bacterial and archaeal orthologs of human genes
    # HS-YH = High-throughput yeast 2-hybrid assays among human genes
    # SC-CC = Co-citation of yeast genes
    # SC-CX = Co-expression among yeast genes
    # SC-GT = Yeast genetic interactions
    # SC-LC = Literature curated yeast protein physical interactions
    # SC-MS = Yeast protein complexes from affinity purification/mass spectrometry
    # SC-TS = Yeast protein interactions inferred from tertiary structures of complexes
    # SC-YH = High-throughput yeast 2-hybrid assays among yeast genes
    # IntNet = Integrated network (HumanNet)

    # one human-readable network name per evidence column, in file order
    # (the trailing IntNet column has no entry here)
    columns = [
        'co-citation of worm gene',
        'co-expression among worm genes',
        'worm genetic interactions',
        'literature curated worm protein physical interactions',
        'high-throughput yeast 2-hybrid assays among worm genes',
        'fly protein physical interactions',
        'co-citation of human genes',
        'co-expression among human genes',
        'co-occurrence of domains among human proteins',
        'gene neighbourhoods of bacterial and archaeal orthologs of human genes',
        'literature curated human protein physical interactions',
        'human protein complexes from affinity purification/mass spectrometry',
        'co-inheritance of bacterial and archaeal orthologs of human genes',
        'high-throughput yeast 2-hybrid assays among human genes',
        'co-citation of yeast genes',
        'co-expression among yeast genes',
        'yeast genetic interactions',
        'literature curated yeast protein physical interactions',
        'yeast protein complexes from affinity purification/mass spectrometry',
        'yeast protein interactions inferred from tertiary structures of complexes',
        'high-throughput yeast 2-hybrid assays among yeast genes'
    ]

    # network name -> metadata record; the records are stored immediately in
    # the 'parsing' state so that edges can refer to their _id
    metadata = {}

    for column in columns:
        m = {
            'collection': 'humannet',
            'name': column,
            'count': 0
        }
        set_status(m, 'parsing')
        m['_id'] = meta.insert_one(m).inserted_id
        metadata[column] = m

    url = 'http://www.functionalnet.org/humannet/HumanNet.v1.join.txt'
    log.info('reading network list from %s', url)
    r = requests.get(url)
    # fail loudly on an HTTP error: otherwise an error page would be parsed as
    # data and the previously loaded networks would be dropped further down
    r.raise_for_status()
    lines = list(r.iter_lines())

    count = 0

    # NOTE(review): parse() is defined elsewhere; it presumably yields edge
    # dicts with 'source'/'target' gene names and maintains the per-network
    # 'count' in metadata -- confirm against its definition
    iterator = parse(columns, metadata, lines)
    while True:
        records = list(islice(iterator, 1000))
        if not records:
            break

        names = set(it['source'] for it in records)
        names |= set(it['target'] for it in records)
        name_to_id = genemania.id_lookup_table(names)

        resolved = []
        for record in records:
            source = name_to_id[record['source']]
            if source is None:
                log.warning('unknown source %s', record['source'])
            record['source'] = source

            target = name_to_id[record['target']]
            if target is None:
                log.warning('unknown target %s', record['target'])
            record['target'] = target

            if source is not None and target is not None:
                resolved.append(record)

        # insert_many() rejects an empty list, which happens whenever every
        # edge of a batch refers to an unknown gene
        if resolved:
            count += len(resolved)
            edges.insert_many(resolved)
        log.debug('inserted %d edges (%d total)', len(resolved), count)

    # values() instead of itervalues() so the loop runs on Python 2 and 3
    for m in metadata.values():
        set_status(m, 'success')
        meta.replace_one({'_id': m['_id']}, m)

    if len(_ids) > 0:
        log.info('dropping old network metadata')
        meta.delete_many({'_id': {'$in': _ids}})

    cleanup_edges()

    return 0