def extract(path=None, uri=[], **kwargs):
    """Dump every geneset-type dataset from mongo as a GMT file under *path*.

    One ``<db>_<dataset-id>.data.uuid.gmt`` file is written per dataset;
    each row is the signature id, an empty description column, and the
    entity ids found in ``data.set``.
    """
    # Locate the mongodb URI among the configured URIs
    mongo_uri = first(u for u in uri if 'mongodb' in u.scheme.split('+'))
    # The database name is the URI path with its leading '/' stripped
    db_name = mongo_uri.path[1:]
    del mongo_uri.path
    # Connect and select the database
    client = pymongo.MongoClient(str(mongo_uri))
    db = getattr(client, db_name)
    # One output file per geneset dataset
    for dataset in db.datasets.find({'@type': 'geneset'}, {'_id': 1}):
        out_file = os.path.join(
            path, '{}_{}.data.uuid.gmt'.format(db_name, dataset['_id']))
        sig_filter = {
            'dataset': dataset['_id'],
            'data.set': {
                '$ne': None
            }
        }
        sig_fields = {'_id': 1, 'data.set': 1}
        with open(out_file, 'w') as fw:
            for signature in db.signature_data.find(sig_filter, sig_fields):
                # GMT row: id, empty description column, then entity ids
                print(
                    str(signature['_id']),
                    '',
                    *signature['data']['set'].keys(),
                    sep='\t',
                    file=fw)
def ingest(input_files, uri=[], limit=1000, **kwargs):
    """Upsert signature metadata from a JSON-lines file into mongo.

    input_files: single-element sequence holding the path of a JSON-lines
      file with one signature object per line (keys: '@id', 'library',
      'meta').
    uri: iterable of URI objects; the one whose scheme contains 'mongodb'
      is used.
    limit: batch size forwarded to mongo_bulk_upsert.
    """
    # Fix: the catch-all was misspelled '**kawrgs'
    input_file, = input_files
    # Get mongo uri
    mongo_uri = first(u for u in uri if 'mongodb' in u.scheme.split('+'))
    # Get extract mongo db name (URI path without the leading '/')
    db_path = mongo_uri.path[1:]
    del mongo_uri.path
    # Instantiate mongo client
    mongo = pymongo.MongoClient(str(mongo_uri))
    # Get mongo db
    db = getattr(mongo, db_path)

    def generate_signatures():
        # Yield (filter, update) pairs suitable for a bulk upsert
        with open(input_file, 'r') as fr:
            for signature in map(json.loads, fr):
                yield {
                    '_id': signature['@id'],
                }, {
                    '$set': {
                        'library': signature['library'],
                        'meta': signature['meta'],
                    },
                }

    mongo_bulk_upsert(
        db.signature_meta,
        generate_signatures(),
        limit=limit,
    )
def ingest(input_files, uri=[], limit=1000, **kwargs):
    """POST signature metadata to the metadata API's bulk endpoint.

    Reads a JSON-lines file and submits `Signature.find_or_create`
    operations in chunks of *limit* objects, authenticating with HTTP
    basic auth taken from the 'meta' URI's credentials.
    """
    signatures_meta, = input_files
    # Get the meta_uri only
    meta_uri = first(u for u in uri if 'meta' in u.scheme.split('+'))
    # Extract token for HTTP basic auth
    metadata_token = base64.b64encode('{username}:{password}'.format(
        username=meta_uri.username,
        password=meta_uri.password).encode()).decode()
    # Prepare uri: drop credentials, keep only the http(s) part of the scheme
    del meta_uri.username
    meta_uri.scheme = ''.join(
        {'http', 'https'} & set(meta_uri.scheme.split('+')))
    meta_uri.path = meta_uri.path + '/bulk'
    #
    with open(signatures_meta, 'r') as fr:
        for objs in chunk(map(json.loads, fr), limit=limit):
            # Fix: close each response instead of leaking a socket per chunk
            with urlopen(
                    Request(
                        str(meta_uri),
                        data=json.dumps([{
                            'operationId': 'Signature.find_or_create',
                            'requestBody':
                            [_prepare_obj(obj) for obj in objs],
                        }]).encode(),
                        headers={
                            'Content-Type': 'application/json',
                            'Authorization':
                            'Basic {}'.format(metadata_token)
                        },
                    )):
                pass
def ingest(input_files, uri=[], limit=1000, **kwargs):
    """Upsert geneset membership data from a TSV file into mongo.

    Each input line is ``<sig-id>\\t\\t<entity>\\t<entity>...``; the entity
    list is stored as ``data.set`` (entity id -> 1) together with
    ``data.size``.
    """
    # Fix: the catch-all was misspelled '**kawrgs'
    input_file, = input_files
    # Get mongo uri
    mongo_uri = first(u for u in uri if 'mongodb' in u.scheme.split('+'))
    # Get extract mongo db name (URI path without the leading '/')
    db_path = mongo_uri.path[1:]
    del mongo_uri.path
    # Instantiate mongo client
    mongo = pymongo.MongoClient(str(mongo_uri))
    # Get mongo db
    db = getattr(mongo, db_path)

    def generate_signatures():
        # Yield (filter, update) pairs suitable for a bulk upsert
        with open(input_file, 'r') as fr:
            for line in fr:
                # Signature id and entity list are separated by a double tab
                sigid, ents = line.split('\t\t', maxsplit=1)
                entids = ents.strip().split('\t')
                yield {
                    '_id': sigid,
                }, {
                    '$set': {
                        'data.set': {entid: 1 for entid in entids},
                        'data.size': len(entids),
                    },
                }

    mongo_bulk_upsert(
        db.signature_data,
        generate_signatures(),
        limit=limit,
    )
def extract(path=None, uri=[], **kwargs):
    """Dump the data API's repository list as a JSON-lines file.

    Writes ``_.datasets.meta.jsonld`` under *path*, one
    ``{"@id": uuid, "@type": datatype}`` object per repository.
    """
    # Get the data_uri; its username doubles as the API token
    data_uri = first(u for u in uri if 'data' in u.scheme.split('+'))
    data_token = data_uri.username
    del data_uri.username
    data_uri.scheme = ''.join(
        {'http', 'https'} & set(data_uri.scheme.split('+')))
    data_uri_base = data_uri.path
    # Get repositories
    data_uri.path = data_uri_base + '/api/v1/listdata'
    # Fix: close the HTTP response instead of leaking the socket
    with urlopen(
            Request(
                str(data_uri),
                headers={
                    'Content-Type': 'application/json',
                    'Authorization': 'Token {}'.format(data_token)
                },
            )) as resp:
        repos = json.load(resp)['repositories']
    with open(os.path.join(path, '_.datasets.meta.jsonld'), 'w') as fw:
        for repo in repos:
            print(
                json.dumps({
                    '@id': repo['uuid'],
                    '@type': repo['datatype'],
                }),
                file=fw)
def apply(uri=[], **kwargs):
    """Trigger a data-API load for every ``.so`` object in the s3 bucket.

    Lists the bucket named by the 's3' URI path and POSTs a load request
    to the data API's ``/api/v1/load`` endpoint for each ``.so`` file.
    """
    # Get s3 and data api uris
    s3_uri = first(u for u in uri if 's3' in u.scheme.split('+'))
    data_uri = first(u for u in uri if 'data' in u.scheme.split('+'))
    s3_username = s3_uri.username
    s3_password = s3_uri.password
    del s3_uri.username
    # Connect to s3
    s3_client = minio.Minio(
        s3_uri.netloc,
        access_key=s3_username,
        secret_key=s3_password,
        secure='https' in s3_uri.scheme,
    )
    s3_bucket = s3_uri.path[1:]
    # Get data uri token (stored in the URI's username slot)
    data_token = data_uri.username
    # Format the data_uri
    del data_uri.username
    data_uri.path = data_uri.path + '/api/v1/load'
    data_uri.scheme = ''.join(
        {'http', 'https'} & set(data_uri.scheme.split('+')))
    # For each object in the s3 bucket
    for obj in s3_client.list_objects(s3_bucket):
        # Only take .so files
        if obj.object_name.endswith('.so'):
            # Trigger data-api load.
            # Fix: close each response — the originals were never closed,
            # leaking one socket per object.
            with urlopen(
                    Request(
                        str(data_uri),
                        headers={
                            'Authorization': 'Token {}'.format(data_token)
                        },
                        data=json.dumps({
                            'bucket': s3_bucket,
                            'file': obj.object_name,
                            'datasetname':
                            obj.object_name.split('.', maxsplit=1)[0],
                        }).encode(),
                    )):
                pass
def extract(path=None, uri=[], limit=1000, **kwargs):
    """Page through the metadata API's signatures and dump them as JSON-lines.

    Counts the objects, then fetches them in pages of *limit*, writing one
    processed object per line to ``_.signatures.jsonld`` under *path*.
    """
    # Get the meta_uri only
    meta_uri = first(u for u in uri if 'meta' in u.scheme.split('+'))
    # Extract the credentials as a basic-auth token
    metadata_token = base64.b64encode('{username}:{password}'.format(
        username=meta_uri.username,
        password=meta_uri.password).encode()).decode()
    # Format the meta_uri
    del meta_uri.username
    meta_uri.scheme = ''.join(
        {'http', 'https'} & set(meta_uri.scheme.split('+')))
    meta_base_path = meta_uri.path
    #
    tbl = 'signatures'
    with open(os.path.join(path, '_.{}.jsonld'.format(tbl)), 'w') as fw:
        meta_uri.path = meta_base_path + '/{}/count'.format(tbl)
        # Fix: close HTTP responses (they were leaked before)
        with urlopen(
                Request(
                    str(meta_uri),
                    headers={
                        'Content-Type': 'application/json',
                        'Authorization': 'Basic {}'.format(metadata_token)
                    },
                )) as resp:
            n_objs = json.load(resp)['count']
        #
        meta_uri.path = meta_base_path + '/{}'.format(tbl)
        # Fix: use a distinct loop name so the 'limit' parameter is not
        # shadowed by the per-page limit yielded by pagination()
        for skip, page_limit in pagination(n_objs, limit=limit):
            meta_uri.query = {
                'filter': json.dumps({
                    'skip': skip,
                    'limit': page_limit
                })
            }
            with urlopen(
                    Request(
                        str(meta_uri),
                        headers={
                            'Content-Type': 'application/json',
                            'Authorization':
                            'Basic {}'.format(metadata_token)
                        },
                    )) as resp:
                objs = json.load(resp)
            # Progress report: objects fetched in this page
            print(len(objs))
            for obj in objs:
                print(json.dumps(_process_obj(obj)), file=fw)
        #
        del meta_uri.query
def extract(path=None, uri=[], **kwargs):
    """Dump the mongo 'entities' collection as a JSON-lines file under *path*."""
    # Pick out the mongodb URI
    mongo_uri = first(u for u in uri if 'mongodb' in u.scheme.split('+'))
    # Database name is the URI path without its leading '/'
    db_name = mongo_uri.path[1:]
    del mongo_uri.path
    # Connect and select the database
    client = pymongo.MongoClient(str(mongo_uri))
    database = getattr(client, db_name)
    #
    tbl = 'entities'
    out_path = os.path.join(path, '{}.{}.jsonld'.format(db_name, tbl))
    with open(out_path, 'w') as fw:
        # One processed document per output line
        for document in getattr(database, tbl).find():
            print(json.dumps(_process_obj(document)), file=fw)
def ingest(input_files, uri=[], limit=1000, **kwargs):
    """Bulk-upsert library rows from a TSV file into postgres.

    Loads the file into a temporary clone of ``libraries`` via COPY, then
    merges it into ``libraries`` with an ``on conflict (uuid) do update``.
    *limit* is accepted for interface consistency but unused here.
    """
    input_file, = input_files
    # Get the psql_uri only
    psql_uri = first(u for u in uri if 'psql' in u.scheme.split('+'))
    # Connect to db
    con = psycopg2.connect(
        database=psql_uri.path[1:],
        user=psql_uri.username,
        password=psql_uri.password,
        host=psql_uri.hostname,
        port=psql_uri.port,
    )
    # Fix: always release the connection, even if the COPY/merge fails
    try:
        cur = con.cursor()
        cur.execute('''
            create table libraries_tmp
            as table libraries
            with no data;
        ''')
        with open(input_file, 'r') as fr:
            cur.copy_from(
                fr,
                'libraries_tmp',
                columns=('uuid', 'resource', 'dataset', 'dataset_type',
                         'meta'),
                null='',
                sep='\t',
            )
        cur.execute('''
            insert into libraries (uuid, resource, dataset, dataset_type, meta)
            select uuid, resource, dataset, dataset_type, meta
            from libraries_tmp
            on conflict (uuid)
            do update set
                resource = excluded.resource,
                dataset = excluded.dataset,
                dataset_type = excluded.dataset_type,
                meta = excluded.meta
            ;
        ''')
        cur.execute('drop table libraries_tmp;')
        con.commit()
    finally:
        con.close()
def extract(path=None, uri=[], **kwargs):
    """Dump every rank-type dataset from mongo as a transposed TSV.

    One ``<db>_<dataset-id>.data.uuid.T.tsv`` file per dataset: a header
    row of entity ids, then one row per signature with that signature's
    rank for each entity ('' when absent).
    """
    # Pick out the mongodb URI
    mongo_uri = first(u for u in uri if 'mongodb' in u.scheme.split('+'))
    # Database name is the URI path without its leading '/'
    db_name = mongo_uri.path[1:]
    del mongo_uri.path
    # Connect and select the database
    client = pymongo.MongoClient(str(mongo_uri))
    db = getattr(client, db_name)
    # One output file per rank dataset
    rank_datasets = db.datasets.find({'@type': 'rank'}, {
        '_id': 1,
        'entities': 1
    })
    for dataset in rank_datasets:
        entities = dataset['entities']
        out_file = os.path.join(
            path, '{}_{}.data.uuid.T.tsv'.format(db_name, dataset['_id']))
        with open(out_file, 'w') as fw:
            # Header: empty corner cell followed by the entity ids
            csv.writer(fw, delimiter='\t').writerow(['', *entities])
            sig_filter = {
                'dataset': dataset['_id'],
                'data.rank': {
                    '$ne': None
                }
            }
            sig_fields = {'_id': 1, 'data.rank': 1}
            for signature in db.signature_data.find(sig_filter, sig_fields):
                ranks = signature['data']['rank']
                # Missing entities become empty cells
                print(
                    str(signature['_id']),
                    *(ranks.get(ent, '') for ent in entities),
                    sep='\t',
                    file=fw,
                )
def apply(uri=[], **kwargs):
    """Ask the metadata API to refresh its summary, retrying on 409.

    A 409 means another refresh is already running: wait with exponential
    backoff and try again. Any other HTTP error is reported and the
    function returns None; on success the open response is returned.
    """
    # Get the meta_uri only
    meta_uri = first(u for u in uri if 'meta' in u.scheme.split('+'))
    # Build the basic-auth token from the URI credentials
    metadata_token = base64.b64encode('{username}:{password}'.format(
        username=meta_uri.username,
        password=meta_uri.password).encode()).decode()
    # Format the meta_uri
    del meta_uri.username
    meta_uri.path = meta_uri.path + '/summary/refresh'
    meta_uri.scheme = ''.join(
        set(['http', 'https']) & set(meta_uri.scheme.split('+')))
    # Retry loop with exponential backoff on 409 Conflict
    backoff = 2
    while True:
        try:
            req = urlopen(
                Request(
                    str(meta_uri),
                    headers={
                        'Content-Type': 'application/json',
                        'Authorization': 'Basic {}'.format(metadata_token)
                    },
                ))
        except HTTPError as e:
            if e.code != 409:
                # Unexpected failure: report and give up
                print('[refresh_summary]: HTTP Error {}: {}'.format(
                    e.code, e.read()))
                break
            # Refresh already in progress: back off and retry
            backoff *= 2
            print(
                '[refresh_summary]: An operation is already in progress: {}, trying again in {}s'
                .format(e.read(), backoff))
            time.sleep(backoff)
        else:
            print('[refresh_summary]: done')
            return req
def extract(path=None, uri=[], **kwargs):
    """COPY the postgres 'entities' table out to a TSV file under *path*.

    Writes ``<db>.entities.psql.tsv`` with columns (uuid, meta), using ''
    for NULLs.
    """
    # Get the psql_uri only
    psql_uri = first(u for u in uri if 'psql' in u.scheme.split('+'))
    # Connect to db
    con = psycopg2.connect(
        database=psql_uri.path[1:],
        user=psql_uri.username,
        password=psql_uri.password,
        host=psql_uri.hostname,
        port=psql_uri.port,
    )
    # Fix: always release the connection, even if the COPY fails
    try:
        cur = con.cursor()
        tbl = 'entities'
        with open(
                os.path.join(path,
                             '{}.{}.psql.tsv'.format(psql_uri.path[1:], tbl)),
                'w') as fw:
            cur.copy_to(
                fw,
                tbl,
                columns=('uuid', 'meta'),
                null='',
                sep='\t',
            )
    finally:
        con.close()
def ingest(input_files, uri=[], limit=1000, **kwargs):
    """Upsert expression matrices from a TSV file into mongo.

    The first TSV row is a header: an empty corner cell followed by entity
    ids. Each following row is a signature id and one expression value per
    entity; values are stored as floats under ``data.expression``.
    """
    # Fix: the catch-all was misspelled '**kawrgs'
    input_file, = input_files
    # Get mongo uri
    mongo_uri = first(u for u in uri if 'mongodb' in u.scheme.split('+'))
    # Get extract mongo db name (URI path without the leading '/')
    db_path = mongo_uri.path[1:]
    del mongo_uri.path
    # Instantiate mongo client
    mongo = pymongo.MongoClient(str(mongo_uri))
    # Get mongo db
    db = getattr(mongo, db_path)

    def generate_signatures():
        # Yield (filter, update) pairs suitable for a bulk upsert
        with open(input_file, 'r') as fr:
            reader = csv.reader(fr, delimiter='\t')
            # Header row: skip the corner cell, keep the entity ids
            # (csv.reader is already an iterator; no iter() wrapper needed)
            entities = next(reader)[1:]
            for line in reader:
                sigid = line[0]
                yield {
                    '_id': sigid,
                }, {
                    '$set': {
                        'data.expression': {
                            entid: float(expression)
                            for entid, expression in zip(entities, line[1:])
                        },
                    },
                }

    mongo_bulk_upsert(
        db.signature_data,
        generate_signatures(),
        limit=limit,
    )
def extract(path=None, uri=[], limit=1000, **kwargs):
    """Export every geneset library from the data API as GMT files.

    For each repository of datatype 'geneset_library' reported by the data
    API, resolves its library ids and signature ids through the metadata
    API (paginated by *limit*), fetches the geneset data, and writes one
    ``<repo-uuid>.data.uuid.gmt`` file under *path*.
    """
    # Get the meta_uri and build its basic-auth token
    meta_uri = first(u for u in uri if 'meta' in u.scheme.split('+'))
    metadata_token = base64.b64encode('{username}:{password}'.format(
        username=meta_uri.username,
        password=meta_uri.password).encode()).decode()
    del meta_uri.username
    meta_uri.scheme = ''.join(
        set(['http', 'https']) & set(meta_uri.scheme.split('+')))
    meta_uri_base = meta_uri.path
    #
    # Get the data_uri; its username doubles as the API token
    data_uri = first(u for u in uri if 'data' in u.scheme.split('+'))
    data_token = data_uri.username
    del data_uri.username
    data_uri.scheme = ''.join(
        set(['http', 'https']) & set(data_uri.scheme.split('+')))
    data_uri_base = data_uri.path
    # Get repositories (fix: close the response instead of leaking it)
    data_uri.path = data_uri_base + '/api/v1/listdata'
    with urlopen(
            Request(
                str(data_uri),
                headers={
                    'Content-Type': 'application/json',
                    'Authorization': 'Token {}'.format(data_token)
                },
            )) as resp:
        repos = json.load(resp)['repositories']
    #
    # Get genelist data
    #
    data_uri.path = data_uri_base + '/api/v1/fetch/set'
    for repo in repos:
        if repo['datatype'] != 'geneset_library':
            continue
        repo_id = repo['uuid']
        with open(os.path.join(path, '{}.data.uuid.gmt'.format(repo_id)),
                  'w') as fw:
            # Resolve the library ids belonging to this repository
            meta_uri.path = meta_uri_base + '/libraries'
            meta_uri.query = {
                'filter':
                json.dumps({
                    'fields': ['id'],
                    'where': {
                        'dataset': repo_id
                    },
                }),
            }
            with urlopen(
                    Request(
                        str(meta_uri),
                        headers={
                            'Content-Type': 'application/json',
                            'Authorization':
                            'Basic {}'.format(metadata_token)
                        },
                    )) as resp:
                library_ids = [lib['id'] for lib in json.load(resp)]
            # Count the signatures in those libraries for pagination
            meta_uri.path = meta_uri_base + '/signatures/count'
            meta_uri.query = {
                'where': json.dumps({'library': {
                    'inq': library_ids
                }})
            }
            with urlopen(
                    Request(
                        str(meta_uri),
                        headers={
                            'Content-Type': 'application/json',
                            'Authorization':
                            'Basic {}'.format(metadata_token)
                        },
                    )) as resp:
                n_signatures = json.load(resp)['count']
            meta_uri.path = meta_uri_base + '/signatures'
            # Fix: use distinct loop names so the 'limit' parameter is not
            # clobbered — the original shadowed it, which shrank the page
            # size for every repository after the first one.
            for page_skip, page_limit in pagination(
                    n_signatures, limit=limit):
                meta_uri.query = {
                    'filter':
                    json.dumps({
                        'fields': ['id'],
                        'where': {
                            'library': {
                                'inq': library_ids
                            },
                        },
                        'skip': page_skip,
                        'limit': page_limit,
                    }),
                }
                with urlopen(
                        Request(
                            str(meta_uri),
                            headers={
                                'Content-Type': 'application/json',
                                'Authorization':
                                'Basic {}'.format(metadata_token)
                            },
                        )) as resp:
                    signature_ids = [sig['id'] for sig in json.load(resp)]
                # Fetch the geneset data for this page of signatures
                with urlopen(
                        Request(
                            str(data_uri),
                            data=json.dumps({
                                'database': repo_id,
                                'signatures': signature_ids,
                            }).encode('utf8'),
                            headers={
                                'Content-Type': 'application/json',
                                'Authorization':
                                'Token {}'.format(data_token)
                            },
                        )) as resp:
                    signature_data = json.load(resp)['signatures']
                for signature in signature_data:
                    # GMT row: uid, empty description column, entity ids
                    print(
                        signature['uid'],
                        '',
                        *signature['entities'],
                        sep='\t',
                        file=fw)
            #
            del meta_uri.query