Example #1
def extract(path=None, uri=[], **kwargs):
    # Get mongo uri
    mongo_uri = first(u for u in uri if 'mongodb' in u.scheme.split('+'))
    # Get extract mongo db name
    db_path = mongo_uri.path[1:]
    del mongo_uri.path
    # Instantiate mongo client
    mongo = pymongo.MongoClient(str(mongo_uri))
    # Get mongo db
    db = getattr(mongo, db_path)
    # Export each geneset dataset's signatures to a per-dataset GMT file:
    # signature uuid, blank description, then entity ids
    for dataset in db.datasets.find({'@type': 'geneset'}, {'_id': 1}):
        with open(
                os.path.join(
                    path, '{}_{}.data.uuid.gmt'.format(db_path,
                                                       dataset['_id'])),
                'w') as fw:
            for signature in db.signature_data.find(
                {
                    'dataset': dataset['_id'],
                    'data.set': {
                        '$ne': None
                    }
                }, {
                    '_id': 1,
                    'data.set': 1
                }):
                print(str(signature['_id']),
                      '',
                      *signature['data']['set'].keys(),
                      sep='\t',
                      file=fw)
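Every example below picks a backend out of the `uri` list with a `first` helper that is not shown in these listings. A minimal sketch, assuming it simply returns the first element of an iterable (with an optional default), might look like:

def first(iterable, default=None):
    # Hypothetical helper (not the original): return the first item of
    # `iterable`, or `default` when the iterable is empty.
    return next(iter(iterable), default)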
Example #2
def ingest(input_files, uri=[], limit=1000, **kwargs):
    input_file, = input_files
    # Get mongo uri
    mongo_uri = first(u for u in uri if 'mongodb' in u.scheme.split('+'))
    # Get extract mongo db name
    db_path = mongo_uri.path[1:]
    del mongo_uri.path
    # Instantiate mongo client
    mongo = pymongo.MongoClient(str(mongo_uri))
    # Get mongo db
    db = getattr(mongo, db_path)

    # Stream (filter, update) pairs built from the JSON-lines input
    def generate_signatures():
        with open(input_file, 'r') as fr:
            for signature in map(json.loads, fr):
                yield {
                    '_id': signature['@id'],
                }, {
                    '$set': {
                        'library': signature['library'],
                        'meta': signature['meta'],
                    },
                }

    # Upsert the signature metadata in batches of at most `limit` documents
    mongo_bulk_upsert(
        db.signature_meta,
        generate_signatures(),
        limit=limit,
    )
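The ingest above hands its (filter, update) pairs to a `mongo_bulk_upsert` helper that is not shown here. A minimal sketch, assuming it groups the pairs into pymongo bulk upserts of at most `limit` operations, might look like:

import pymongo

def mongo_bulk_upsert(collection, filter_update_pairs, limit=1000):
    # Hypothetical helper (not the original): apply each (filter, update)
    # pair as an upsert, flushing in batches of at most `limit` operations.
    ops = []
    for filter_doc, update_doc in filter_update_pairs:
        ops.append(pymongo.UpdateOne(filter_doc, update_doc, upsert=True))
        if len(ops) >= limit:
            collection.bulk_write(ops)
            ops = []
    if ops:
        collection.bulk_write(ops)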
Example #3
def ingest(input_files, uri=[], limit=1000, **kwargs):
    signatures_meta, = input_files
    # Get the meta_uri only
    meta_uri = first(u for u in uri if 'meta' in u.scheme.split('+'))
    # Extract token
    metadata_token = base64.b64encode('{username}:{password}'.format(
        username=meta_uri.username,
        password=meta_uri.password).encode()).decode()
    # Prepare uri
    del meta_uri.username
    meta_uri.scheme = ''.join(
        set(['http', 'https']) & set(meta_uri.scheme.split('+')))
    meta_uri.path = meta_uri.path + '/bulk'
    # POST the signatures to the metadata API in chunks of at most `limit`
    with open(signatures_meta, 'r') as fr:
        for objs in chunk(map(json.loads, fr), limit=limit):
            urlopen(
                Request(
                    str(meta_uri),
                    data=json.dumps([{
                        'operationId':
                        'Signature.find_or_create',
                        'requestBody': [_prepare_obj(obj) for obj in objs],
                    }]).encode(),
                    headers={
                        'Content-Type': 'application/json',
                        'Authorization': 'Basic {}'.format(metadata_token)
                    },
                ))
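The ingest above posts its request bodies in batches produced by a `chunk` helper that is not shown here. A minimal sketch, assuming it yields lists of at most `limit` items from an iterable, might look like:

import itertools

def chunk(iterable, limit=1000):
    # Hypothetical helper (not the original): yield successive lists of up
    # to `limit` items from `iterable`.
    it = iter(iterable)
    while True:
        batch = list(itertools.islice(it, limit))
        if not batch:
            break
        yield batch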
Example #4
def ingest(input_files, uri=[], limit=1000, **kwargs):
    input_file, = input_files
    # Get mongo uri
    mongo_uri = first(u for u in uri if 'mongodb' in u.scheme.split('+'))
    # Get extract mongo db name
    db_path = mongo_uri.path[1:]
    del mongo_uri.path
    # Instantiate mongo client
    mongo = pymongo.MongoClient(str(mongo_uri))
    # Get mongo db
    db = getattr(mongo, db_path)

    # Stream (filter, update) pairs parsed from the GMT input
    def generate_signatures():
        with open(input_file, 'r') as fr:
            for line in fr:
                sigid, ents = line.split('\t\t', maxsplit=1)
                entids = ents.strip().split('\t')
                yield {
                    '_id': sigid,
                }, {
                    '$set': {
                        'data.set': {entid: 1
                                     for entid in entids},
                        'data.size': len(entids),
                    },
                }

    # Upsert the signature set data in batches of at most `limit` documents
    mongo_bulk_upsert(
        db.signature_data,
        generate_signatures(),
        limit=limit,
    )
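For reference, the ingest above consumes the double-tab GMT layout written by Example #1. A tiny worked example (all ids invented) of how one input line is parsed:

# Worked example with invented ids: one line of the GMT output of Example #1.
line = 'sig-1\t\tent-a\tent-b\tent-c\n'
sigid, ents = line.split('\t\t', maxsplit=1)
entids = ents.strip().split('\t')
assert sigid == 'sig-1'
assert {entid: 1 for entid in entids} == {'ent-a': 1, 'ent-b': 1, 'ent-c': 1}
assert len(entids) == 3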
Example #5
def extract(path=None, uri=[], **kwargs):
    #
    # Get the data_uri
    data_uri = first(u for u in uri if 'data' in u.scheme.split('+'))
    data_token = data_uri.username
    del data_uri.username
    data_uri.scheme = ''.join(
        set(['http', 'https']) & set(data_uri.scheme.split('+')))
    data_uri_base = data_uri.path
    # Get repositories
    data_uri.path = data_uri_base + '/api/v1/listdata'
    repos = json.load(
        urlopen(
            Request(
                str(data_uri),
                headers={
                    'Content-Type': 'application/json',
                    'Authorization': 'Token {}'.format(data_token)
                },
            )))['repositories']
    with open(os.path.join(path, '_.datasets.meta.jsonld'), 'w') as fw:
        for repo in repos:
            print(json.dumps({
                '@id': repo['uuid'],
                '@type': repo['datatype'],
            }),
                  file=fw)
Example #6
def apply(uri=[], **kwargs):
  # Get s3 and data api uris
  s3_uri = first(u for u in uri if 's3' in u.scheme.split('+'))
  data_uri = first(u for u in uri if 'data' in u.scheme.split('+'))
  s3_username = s3_uri.username
  s3_password = s3_uri.password
  del s3_uri.username
  # Connect to s3
  s3_client = minio.Minio(
    s3_uri.netloc,
    access_key=s3_username,
    secret_key=s3_password,
    secure='https' in s3_uri.scheme,
  )
  s3_bucket = s3_uri.path[1:]
  # Get data uri token
  data_token = data_uri.username
  # Format the data_uri
  del data_uri.username
  data_uri.path = data_uri.path + '/api/v1/load'
  data_uri.scheme = ''.join(set(['http', 'https']) & set(data_uri.scheme.split('+')))
  # For each object in the s3 bucket
  for obj in s3_client.list_objects(s3_bucket):
    # Only take .so files
    if obj.object_name.endswith('.so'):
      # Trigger data-api load
      urlopen(
        Request(
          str(data_uri),
          headers={
            'Authorization': 'Token {}'.format(data_token)
          },
          data=json.dumps({
            'bucket': s3_bucket,
            'file': obj.object_name,
            'datasetname': obj.object_name.split('.', maxsplit=1)[0],
          }).encode(),
        )
      )
Example #7
def extract(path=None, uri=[], limit=1000, **kwargs):
    # Get the meta_uri only
    meta_uri = first(u for u in uri if 'meta' in u.scheme.split('+'))
    # Extract the credentials
    metadata_token = base64.b64encode('{username}:{password}'.format(
        username=meta_uri.username,
        password=meta_uri.password).encode()).decode()
    # Format the meta_uri
    del meta_uri.username
    meta_uri.scheme = ''.join(
        set(['http', 'https']) & set(meta_uri.scheme.split('+')))
    meta_base_path = meta_uri.path
    # Dump the signatures table to a JSON-lines file
    tbl = 'signatures'
    with open(os.path.join(path, '_.{}.jsonld'.format(tbl)), 'w') as fw:
        meta_uri.path = meta_base_path + '/{}/count'.format(tbl)
        # Count the records so the download can be paginated
        n_objs = json.load(
            urlopen(
                Request(
                    str(meta_uri),
                    headers={
                        'Content-Type': 'application/json',
                        'Authorization': 'Basic {}'.format(metadata_token)
                    },
                )))['count']
        #
        meta_uri.path = meta_base_path + '/{}'.format(tbl)
        # Page through the table, writing one JSON object per line
        for skip, limit in pagination(n_objs, limit=limit):
            meta_uri.query = {
                'filter': json.dumps({
                    'skip': skip,
                    'limit': limit
                })
            }
            objs = json.load(
                urlopen(
                    Request(
                        str(meta_uri),
                        headers={
                            'Content-Type': 'application/json',
                            'Authorization': 'Basic {}'.format(metadata_token)
                        },
                    )))
            print(len(objs))
            for obj in objs:
                print(json.dumps(_process_obj(obj)), file=fw)
        # Clear the query once all pages have been written
        del meta_uri.query
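The extract above walks the table with a `pagination` helper that is not shown here. A minimal sketch, assuming it yields (skip, limit) pairs covering `n_objs` records in pages of at most `limit`, might look like:

def pagination(n_objs, limit=1000):
    # Hypothetical helper (not the original): yield (skip, limit) pairs
    # that together cover n_objs records.
    for skip in range(0, n_objs, limit):
        yield skip, min(limit, n_objs - skip)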
Example #8
def extract(path=None, uri=[], **kwargs):
  # Get mongo uri
  mongo_uri = first(u for u in uri if 'mongodb' in u.scheme.split('+'))
  # Get extract mongo db name
  db_path = mongo_uri.path[1:]
  del mongo_uri.path
  # Instantiate mongo client
  mongo = pymongo.MongoClient(str(mongo_uri))
  # Get mongo db
  db = getattr(mongo, db_path)
  # Dump the entities collection to a JSON-lines file
  tbl = 'entities'
  with open(os.path.join(path, '{}.{}.jsonld'.format(db_path, tbl)), 'w') as fw:
    collection = getattr(db, tbl)
    for signature in collection.find():
      print(json.dumps(_process_obj(signature)), file=fw)
Example #9
def ingest(input_files, uri=[], limit=1000, **kwargs):
    input_file, = input_files
    # Get the psql_uri only
    psql_uri = first(u for u in uri if 'psql' in u.scheme.split('+'))
    # Connect to db
    con = psycopg2.connect(
        database=psql_uri.path[1:],
        user=psql_uri.username,
        password=psql_uri.password,
        host=psql_uri.hostname,
        port=psql_uri.port,
    )
    cur = con.cursor()
    cur.execute('''
        create table libraries_tmp
        as table libraries
        with no data;
    ''')
    with open(input_file, 'r') as fr:
        cur.copy_from(
            fr,
            'libraries_tmp',
            columns=('uuid', 'resource', 'dataset', 'dataset_type', 'meta'),
            null='',
            sep='\t',
        )
    cur.execute('''
        insert into libraries (uuid, resource, dataset, dataset_type, meta)
        select uuid, resource, dataset, dataset_type, meta
        from libraries_tmp
        on conflict (uuid)
          do update
          set
            resource = excluded.resource,
            dataset = excluded.dataset,
            dataset_type = excluded.dataset_type,
            meta = excluded.meta
        ;
    ''')
    cur.execute('drop table libraries_tmp;')
    con.commit()
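The copy_from above expects one tab-separated row per line, in the declared column order, with empty fields loaded as NULL. An invented row for illustration (every value here is made up):

# Hypothetical input row: uuid, resource, dataset, dataset_type, meta,
# joined by tabs; an empty field would be loaded as NULL.
row = '\t'.join([
    'uuid-1',                # uuid
    'resource-1',            # resource
    'dataset-1',             # dataset
    'geneset_library',       # dataset_type
    '{"title": "example"}',  # meta (serialized JSON)
])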
Example #10
def extract(path=None, uri=[], **kwargs):
    # Get mongo uri
    mongo_uri = first(u for u in uri if 'mongodb' in u.scheme.split('+'))
    # Get extract mongo db name
    db_path = mongo_uri.path[1:]
    del mongo_uri.path
    # Instantiate mongo client
    mongo = pymongo.MongoClient(str(mongo_uri))
    # Get mongo db
    db = getattr(mongo, db_path)
    # Export each rank dataset as a TSV matrix: one row per signature,
    # one column per entity
    for dataset in db.datasets.find({'@type': 'rank'}, {
            '_id': 1,
            'entities': 1
    }):
        with open(
                os.path.join(
                    path,
                    '{}_{}.data.uuid.T.tsv'.format(db_path, dataset['_id'])),
                'w') as fw:
            writer = csv.writer(fw, delimiter='\t')
            writer.writerow(['', *dataset['entities']])
            for signature in db.signature_data.find(
                {
                    'dataset': dataset['_id'],
                    'data.rank': {
                        '$ne': None
                    }
                }, {
                    '_id': 1,
                    'data.rank': 1
                }):
                print(
                    str(signature['_id']),
                    *[
                        signature['data']['rank'].get(ent, '')
                        for ent in dataset['entities']
                    ],
                    sep='\t',
                    file=fw,
                )
Example #11
def apply(uri=[], **kwargs):
    # Get the meta_uri only
    meta_uri = first(u for u in uri if 'meta' in u.scheme.split('+'))
    # Extract the credentials
    metadata_token = base64.b64encode('{username}:{password}'.format(
        username=meta_uri.username,
        password=meta_uri.password).encode()).decode()
    # Format the meta_uri
    del meta_uri.username
    meta_uri.path = meta_uri.path + '/summary/refresh'
    meta_uri.scheme = ''.join(
        set(['http', 'https']) & set(meta_uri.scheme.split('+')))
    # Make the request
    backoff = 2
    while True:
        try:
            req = urlopen(
                Request(
                    str(meta_uri),
                    headers={
                        'Content-Type': 'application/json',
                        'Authorization': 'Basic {}'.format(metadata_token)
                    },
                ))
            print('[refresh_summary]: done')
            return req
        except HTTPError as e:
            if e.code == 409:
                backoff *= 2
                print(
                    '[refresh_summary]: An operation is already in progress: {}, trying again in {}s'
                    .format(e.read(), backoff))
                time.sleep(backoff)
                continue
            else:
                print('[refresh_summary]: HTTP Error {}: {}'.format(
                    e.code, e.read()))
            break
Example #12
def extract(path=None, uri=[], **kwargs):
    # Get the psql_uri only
    psql_uri = first(u for u in uri if 'psql' in u.scheme.split('+'))
    # Connect to db
    con = psycopg2.connect(
        database=psql_uri.path[1:],
        user=psql_uri.username,
        password=psql_uri.password,
        host=psql_uri.hostname,
        port=psql_uri.port,
    )
    cur = con.cursor()
    tbl = 'entities'
    with open(
            os.path.join(path, '{}.{}.psql.tsv'.format(psql_uri.path[1:],
                                                       tbl)), 'w') as fw:
        cur.copy_to(
            fw,
            tbl,
            columns=('uuid', 'meta'),
            null='',
            sep='\t',
        )
Example #13
def ingest(input_files, uri=[], limit=1000, **kwargs):
    input_file, = input_files
    # Get mongo uri
    mongo_uri = first(u for u in uri if 'mongodb' in u.scheme.split('+'))
    # Get extract mongo db name
    db_path = mongo_uri.path[1:]
    del mongo_uri.path
    # Instantiate mongo client
    mongo = pymongo.MongoClient(str(mongo_uri))
    # Get mongo db
    db = getattr(mongo, db_path)

    # Stream (filter, update) pairs parsed from the TSV expression matrix
    def generate_signatures():
        with open(input_file, 'r') as fr:
            reader = csv.reader(fr, delimiter='\t')
            entities = next(iter(reader))[1:]
            # Each remaining row holds one signature's expression values
            for line in reader:
                sigid = line[0]
                yield {
                    '_id': sigid,
                }, {
                    '$set': {
                        'data.expression': {
                            entid: float(expression)
                            for entid, expression in zip(entities, line[1:])
                        },
                    },
                }

    # Upsert the expression data in batches of at most `limit` documents
    mongo_bulk_upsert(
        db.signature_data,
        generate_signatures(),
        limit=limit,
    )
Example #14
def extract(path=None, uri=[], limit=1000, **kwargs):
    # Get the meta_uri
    meta_uri = first(u for u in uri if 'meta' in u.scheme.split('+'))
    metadata_token = base64.b64encode('{username}:{password}'.format(
        username=meta_uri.username,
        password=meta_uri.password).encode()).decode()
    del meta_uri.username
    meta_uri.scheme = ''.join(
        set(['http', 'https']) & set(meta_uri.scheme.split('+')))
    meta_uri_base = meta_uri.path
    #
    # Get the data_uri
    data_uri = first(u for u in uri if 'data' in u.scheme.split('+'))
    data_token = data_uri.username
    del data_uri.username
    data_uri.scheme = ''.join(
        set(['http', 'https']) & set(data_uri.scheme.split('+')))
    data_uri_base = data_uri.path
    # Get repositories
    data_uri.path = data_uri_base + '/api/v1/listdata'
    repos = json.load(
        urlopen(
            Request(
                str(data_uri),
                headers={
                    'Content-Type': 'application/json',
                    'Authorization': 'Token {}'.format(data_token)
                },
            )))['repositories']
    #
    # Get genelist data
    #
    data_uri.path = data_uri_base + '/api/v1/fetch/set'
    for repo in repos:
        if repo['datatype'] != 'geneset_library':
            continue
        repo_id = repo['uuid']
        with open(os.path.join(path, '{}.data.uuid.gmt'.format(repo_id)),
                  'w') as fw:
            meta_uri.path = meta_uri_base + '/libraries'
            meta_uri.query = {
                'filter':
                json.dumps({
                    'fields': ['id'],
                    'where': {
                        'dataset': repo_id
                    },
                }),
            }
            library_ids = [
                lib['id'] for lib in json.load(
                    urlopen(
                        Request(
                            str(meta_uri),
                            headers={
                                'Content-Type': 'application/json',
                                'Authorization': 'Basic {}'.format(
                                    metadata_token)
                            },
                        )))
            ]
            meta_uri.path = meta_uri_base + '/signatures/count'
            meta_uri.query = {
                'where': json.dumps({'library': {
                    'inq': library_ids
                }})
            }
            n_signatures = json.load(
                urlopen(
                    Request(
                        str(meta_uri),
                        headers={
                            'Content-Type': 'application/json',
                            'Authorization': 'Basic {}'.format(metadata_token)
                        },
                    )))['count']
            meta_uri.path = meta_uri_base + '/signatures'
            for skip, limit in pagination(n_signatures, limit=limit):
                meta_uri.query = {
                    'filter':
                    json.dumps({
                        'fields': ['id'],
                        'where': {
                            'library': {
                                'inq': library_ids
                            },
                        },
                        'skip': skip,
                        'limit': limit,
                    }),
                }
                signature_ids = [
                    sig['id'] for sig in json.load(
                        urlopen(
                            Request(
                                str(meta_uri),
                                headers={
                                    'Content-Type':
                                    'application/json',
                                    'Authorization':
                                    'Basic {}'.format(metadata_token)
                                },
                            )))
                ]
                signature_data = json.load(
                    urlopen(
                        Request(
                            str(data_uri),
                            data=json.dumps({
                                'database': repo_id,
                                'signatures': signature_ids,
                            }).encode('utf8'),
                            headers={
                                'Content-Type': 'application/json',
                                'Authorization': 'Token {}'.format(data_token)
                            },
                        )))['signatures']
                for signature in signature_data:
                    print(signature['uid'],
                          '',
                          *signature['entities'],
                          sep='\t',
                          file=fw)
        # Clear the query before moving on to the next repository
        del meta_uri.query
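A note on the `uri` objects used throughout: the snippets mutate them in place (`del mongo_uri.path`, `meta_uri.query = {...}`) and then render them with `str(...)`, so they cannot be plain `urllib.parse` results. The class they come from is not shown; a minimal sketch of a stand-in with the behavior these examples assume (names and details invented) might look like:

from urllib.parse import urlencode

class MutableUri:
    # Hypothetical stand-in (not the original class): URI components are
    # plain attributes that can be reassigned or deleted, and str()
    # re-renders the URI from whatever components remain.
    def __init__(self, scheme, username=None, password=None,
                 hostname=None, port=None, path='', query=None):
        self.scheme = scheme
        self.username = username
        self.password = password
        self.hostname = hostname
        self.port = port
        self.path = path
        self.query = query or {}

    def __delattr__(self, name):
        # `del uri.username`, `del uri.path`, ... simply blank the component.
        object.__setattr__(self, name, None)

    @property
    def netloc(self):
        # user:password@host:port, omitting whatever has been deleted.
        auth = ''
        if self.username:
            auth = self.username
            if self.password:
                auth += ':' + self.password
            auth += '@'
        host = self.hostname or ''
        if self.port:
            host += ':{}'.format(self.port)
        return auth + host

    def __str__(self):
        rendered = '{}://{}{}'.format(self.scheme, self.netloc, self.path or '')
        if self.query:
            rendered += '?' + urlencode(self.query)
        return rendered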