def setUp(self):
    """Flush the test database and prepare request helpers for a test.

    A ``/flushall`` request is issued against the configured host, the
    ``get`` and ``script_exec`` helpers are stored on the test case, and
    all scripts (not only the read-only ones) are loaded.
    """
    hostname = redbiom.get_config()['hostname']
    self.host = hostname

    # every test starts by asking the server to flush its contents
    response = requests.get(self.host + '/flushall')
    assert response.status_code == 200

    self.get = redbiom._requests.make_get(redbiom.get_config())
    self.se = redbiom._requests.make_script_exec(redbiom.get_config())
    redbiom.admin.ScriptManager.load_scripts(read_only=False)
def data_from_features(context, features, exact):
    """Retrieve sample data for the samples associated with some features.

    Parameters
    ----------
    context : str
        The name of the context to retrieve sample data from.
    features : Iterable of str
        The features of interest.
    exact : bool
        If True, only samples in which all features exist are obtained.
        Otherwise, all samples with at least one feature are obtained.

    Returns
    -------
    biom.Table
        A Table populated with the found samples.
    dict
        A map of {sample_id_in_table: original_id}. This map can be used to
        identify what samples are ambiguous based off their original IDs.

    Notes
    -----
    The context is validated with ``redbiom._requests.valid`` before any
    sample lookup is performed.
    """
    import redbiom
    import redbiom.util
    import redbiom._requests

    get = redbiom._requests.make_get(redbiom.get_config())
    redbiom._requests.valid(context, get)

    # find every sample in this context which carries the requested features
    matching = redbiom.util.ids_from(features, exact, 'feature', [context])

    return _biom_from_samples(context, iter(matching), get=get)
def category_exists(category, get=None):
    """Test if a metadata category is known

    Parameters
    ----------
    category : str
        The category to test for
    get : function, optional
        A get method. If None, one is constructed from the current
        configuration.

    Returns
    -------
    bool
        True if the category exists, False otherwise. The value is whatever
        the get method returns for the membership test.

    Redis Command Summary
    ---------------------
    SISMEMBER metadata:categories-represented <category>
    """
    if get is not None:
        fetch = get
    else:
        import redbiom
        import redbiom._requests
        fetch = redbiom._requests.make_get(redbiom.get_config())

    # this use highlights how get is being abused at the moment. this is a
    # command which takes two arguments, the key and the member to test,
    # so both are packed into the single data argument.
    key_and_member = 'categories-represented/%s' % category
    return fetch('metadata', 'SISMEMBER', key_and_member)
def sample_counts_per_category(categories=None):
    """Count the samples which have metadata for each category

    Parameters
    ----------
    categories : list of str, optional
        The specific categories to summarize. If None, then summarize all of
        the categories in the database.

    Returns
    -------
    pandas.Series
        A series keyed by the category and valued by the number of samples
        which have metadata for that category.

    Redis command summary
    ---------------------
    SMEMBERS metadata:categories-represented
    HLEN metadata:category:<category>
    """
    import redbiom
    import redbiom._requests
    import pandas as pd

    get = redbiom._requests.make_get(redbiom.get_config())

    if categories is None:
        # no explicit request, so summarize everything the database knows
        categories = list(get('metadata', 'SMEMBERS',
                              'categories-represented'))

    # one HLEN request per category
    counts = [int(get('metadata', 'HLEN', 'category:%s' % name))
              for name in categories]

    return pd.Series(counts, index=categories)
def create_context(name, description):
    """Register a context within the cache

    Parameters
    ----------
    name : str
        The name of the context, e.g., deblur@150nt
    description : str
        A brief description about the context, e.g., "Default quality
        filtering, followed by application of Deblur with a trim length of
        150nt."

    Notes
    -----
    After the context is recorded, ``ScriptManager.load_scripts()`` is
    called with its default arguments.

    Redis command summary
    ---------------------
    HSET state:contexts <name> <description>
    HSET <context>:state db-version <current-db-version>
    """
    import redbiom
    import redbiom._requests

    post = redbiom._requests.make_post(redbiom.get_config())

    # record the context itself
    context_entry = "contexts/%s/%s" % (name, description)
    post('state', 'HSET', context_entry)

    # stamp the context with the database version in use
    version_entry = "state/db-version/%s" % redbiom.__db_version__
    post(name, 'HSET', version_entry)

    ScriptManager.load_scripts()
def whereeval(str_, get=None):
    """Evaluate a where-style expression, where each Name is fetched

    The expression is parsed in ``eval`` mode, every node of the resulting
    tree is checked against a whitelist of node types, and the dumped tree
    is then evaluated within this module's namespace.

    Parameters
    ----------
    str_ : str
        The expression to evaluate.
    get : function, optional
        A get method. If None, one is constructed from the current
        configuration.

    Returns
    -------
    object
        Whatever the evaluation of the dumped expression produces.

    Raises
    ------
    TypeError
        If the expression contains a node type which is not whitelisted.

    Notes
    -----
    A ``Load`` callable built from `get` is injected into the module globals
    for the duration of the call and is always removed afterwards, including
    when parsing, validation or evaluation raises.
    """
    if get is None:
        import redbiom
        # import the submodule explicitly rather than relying on some other
        # module having imported it already
        import redbiom._requests
        config = redbiom.get_config()
        get = redbiom._requests.make_get(config)

    # Load is subject to indirection to simplify testing
    globals()['Load'] = make_Load(get)

    try:
        formed = ast.parse(str_, mode='eval')

        # NOTE(review): ast.Str, ast.Num and ast.NameConstant are legacy
        # aliases which newer Python releases no longer provide -- confirm
        # which interpreter versions must be supported.
        node_types = [ast.Compare, ast.In, ast.NotIn, ast.BoolOp, ast.And,
                      ast.Name, ast.Or, ast.Eq, ast.Lt, ast.LtE, ast.Gt,
                      ast.GtE, ast.NotEq, ast.Str, ast.Num, ast.Load,
                      ast.Expression, ast.Tuple, ast.Is, ast.IsNot]

        if sys.version_info.major == 3:
            node_types.append(ast.NameConstant)

        node_types = tuple(node_types)

        for node in ast.walk(formed):
            if not isinstance(node, node_types):
                raise TypeError("Unsupported node type: %s" % ast.dump(node))

        # NOTE(security): eval() is only reached once every node has passed
        # the whitelist above; do not relax the whitelist without reviewing
        # what the dumped expression can then invoke.
        return eval(ast.dump(formed))
    finally:
        # clean up the injected global, even if something above failed
        globals().pop('Load', None)
def test_resolve_ambiguities(self):
    """Untagged sample IDs resolve to their UNTAGGED redbiom IDs."""
    import redbiom._requests
    import redbiom

    get = redbiom._requests.make_get(redbiom.get_config())

    redbiom.admin.create_context('test', 'foo')
    redbiom.admin.load_sample_metadata(metadata)
    redbiom.admin.ScriptManager.load_scripts(read_only=False)
    redbiom.admin.load_sample_data(table, 'test', tag=None)

    # all samples as ambiguous
    samples = {'10317.000047188', '10317.000046868', '10317.000051129',
               '10317.000012975', '10317.000033804', '10317.000001405',
               '10317.000022252', '10317.000001378', '10317.000005080',
               '10317.000003302'}

    # build the three expected mappings in a single pass over the samples
    exp_stable = {}
    exp_ambiguous = {}
    exp_ri = {}
    for sample_id in samples:
        stable_id = "%s.UNTAGGED" % sample_id
        redbiom_id = "UNTAGGED_%s" % sample_id
        exp_stable[stable_id] = sample_id
        exp_ambiguous[sample_id] = [redbiom_id]
        exp_ri[redbiom_id] = stable_id
    exp_unobserved = []

    observed = resolve_ambiguities('test', samples, get)
    obs_stable, obs_unobserved, obs_ambiguous, obs_ri = observed

    self.assertEqual(obs_stable, exp_stable)
    self.assertEqual(obs_unobserved, exp_unobserved)
    self.assertEqual(obs_ambiguous, exp_ambiguous)
    self.assertEqual(obs_ri, exp_ri)
def test_resolve_ambiguities_ambigbug(self):
    """One sample loaded under two tags resolves to both tagged IDs."""
    import redbiom._requests
    import redbiom

    get = redbiom._requests.make_get(redbiom.get_config())

    redbiom.admin.create_context('test', 'foo')
    redbiom.admin.load_sample_metadata(metadata)
    redbiom.admin.ScriptManager.load_scripts(read_only=False)
    redbiom.admin.load_sample_data(table, 'test', tag='fromtest')

    # load the same sample a second time under a different tag
    redbiom.admin.load_sample_metadata(metadata_with_alt)
    redbiom.admin.load_sample_data(table_with_alt, 'test', tag='fromalt')

    samples = {'fromtest_10317.000005080', 'fromalt_10317.000005080'}

    exp_stable = {
        '10317.000005080.fromtest': 'fromtest_10317.000005080',
        '10317.000005080.fromalt': 'fromalt_10317.000005080',
    }
    exp_unobserved = []
    exp_ambiguous = {
        '10317.000005080': ['fromtest_10317.000005080',
                            'fromalt_10317.000005080'],
    }
    # the reverse index is the inverse of the stable mapping
    exp_ri = {rb_id: stable for stable, rb_id in exp_stable.items()}

    observed = resolve_ambiguities('test', samples, get)
    obs_stable, obs_unobserved, obs_ambiguous, obs_ri = observed

    self.assertEqual(obs_stable, exp_stable)
    self.assertEqual(obs_unobserved, exp_unobserved)

    # the order of the ambiguous IDs is not significant
    obs_ambiguous_sets = {k: set(v) for k, v in obs_ambiguous.items()}
    exp_ambiguous_sets = {k: set(v) for k, v in exp_ambiguous.items()}
    self.assertEqual(obs_ambiguous_sets, exp_ambiguous_sets)

    self.assertEqual(obs_ri, exp_ri)
def buffered(it, prefix, cmd, context, get=None, buffer_size=10,
             multikey=None):
    """Bulk fetch data

    Many of the commands within REDIS accept multiple arguments (e.g., MGET).
    This method facilitates the use of these bulk commands over an iterable of
    items. The method will additionally "chunk" by a buffer_size as to limit
    the size of the URL being constructed. The URLs have an upper bound of
    around 100kb from testing -- this limit is dependent on the client and
    the server. It is not clear what the actual limit is for Webdis. As a rule
    of thumb, the aim is to target requests for a few kb at a time.

    Parameters
    ----------
    it : iterable
        The items to query for. Each item is stripped of surrounding
        whitespace before use.
    prefix : string
        A key prefix such as "data"
    cmd : string, a Redis command
        The command to request be executed
    context : string
        The context to operate under (ie another prefix).
    get : function, optional
        An existing get function
    buffer_size: int, optional
        The number of items to query for at once. It is important to avoid
        having a buffer size which may result in a URL exceeding 100kb as in
        testing, that was not well supported unsurprisingly. Expected to be a
        positive integer.
    multikey: string, optional
        For hashbucket commands, like HMGET, where there is an outer and inner
        key.

    Yields
    ------
    tuple of (list of str, object)
        The stripped items of one chunk and the result of the request made
        for that chunk. No request is made for an empty chunk.
    """
    import itertools

    if get is None:
        import redbiom
        config = redbiom.get_config()
        get = make_get(config)

    it = iter(it)
    exhausted = False
    while not exhausted:
        items = [i.strip() for i in itertools.islice(it, buffer_size)]

        # a short chunk means the iterator ran dry; remember that so the
        # iterator is not polled again
        exhausted = len(items) < buffer_size

        if not items:
            # nothing left to request. Without this guard a multikey query
            # would still issue a request for "<context>:<multikey>/" with
            # no fields, and a buffer_size of 0 would loop forever.
            break

        # it may be possible to use _format_request here
        if multikey is None:
            keys = ['%s:%s:%s' % (context, prefix, i) for i in items]
        else:
            # the outer key is prepended once below; items are inner keys
            keys = items
        bulk = '/'.join(keys)

        if multikey:
            bulk = "%s:%s/%s" % (context, multikey, bulk)

        if bulk:
            yield items, get(None, cmd, bulk)
def make_post(config, redis_protocol=None):
    """Factory function: produce a post() method

    Parameters
    ----------
    config : dict
        NOTE: this argument is currently replaced by the result of
        ``redbiom.get_config()`` and is therefore not consulted.
    redis_protocol : object, optional
        If truthy, the produced method writes commands to stdout in the
        native Redis protocol instead of posting them over HTTP.

    Returns
    -------
    function
        A method accepting ``(context, cmd, payload)``.
    """
    import redbiom
    s = get_session()
    config = redbiom.get_config()

    def write_native(context, cmd, payload):
        # for expensive load operations like feature data, it is potentially
        # faster to use the native protocol. this writes out the redis
        # commands in their native form for feeding into redis-cli --pipe.
        # More information can be found here:
        # https://redis.io/topics/mass-insert
        import sys

        fields = payload.split('/')
        fields[0] = ':'.join([context, fields[0]])
        fields.insert(0, cmd)

        # https://gist.github.com/laserson/2689744
        pieces = ['*' + str(len(fields)) + '\r\n']
        for field in fields:
            text = str(field)
            # the length header counts encoded bytes, not characters
            pieces.append('$' + str(len(bytes(text, 'utf-8'))) + '\r\n')
            pieces.append(text + '\r\n')

        sys.stdout.write(''.join(pieces))
        sys.stdout.flush()

    def post_http(context, cmd, payload):
        body = _format_request(context, cmd, payload)
        req = s.post(config['hostname'], data=body)
        return _parse_validate_request(req, cmd)

    if redis_protocol:
        return write_native
    return post_http
def get(name):
    """Retrieve the SHA1 of a script

    The SHA1 is looked up in ``ScriptManager._cache`` first; on a miss it is
    requested from the server and then cached.

    Parameters
    ----------
    name : str
        The name of the script to fetch

    Returns
    -------
    str
        The SHA1 recorded for the script.

    Raises
    ------
    ValueError
        If the script name is not recognized
    """
    cache = ScriptManager._cache

    if name not in cache:
        import redbiom
        import redbiom._requests

        fetch = redbiom._requests.make_get(redbiom.get_config())
        sha = fetch('state', 'HGET', 'scripts/%s' % name)

        if sha is None:
            raise ValueError('Unknown script')

        cache[name] = sha

    return cache[name]
def ids_from(it, exact, axis, contexts):
    """Collect the IDs associated with an iterable of search IDs

    Parameters
    ----------
    it : iterable of str
        The IDs to search for
    exact : boolean
        If True, compute the intersection of results per context. If False,
        compute the union of results per context.
    axis : {'feature', 'sample'}
        The axis to operate over.
    contexts : list of str
        The contexts to search in. A value which is not a list, set or tuple
        is treated as a single context.

    Notes
    -----
    Contexts are evaluated independently, and the results of each context
    are unioned.

    Returns
    -------
    set
        The IDs associated with the search IDs.

    Raises
    ------
    ValueError
        If the axis is not 'feature' or 'sample'.
    """
    import redbiom
    import redbiom._requests
    import redbiom.admin

    se = redbiom._requests.make_script_exec(redbiom.get_config())

    if axis not in {'feature', 'sample'}:
        raise ValueError("Unknown axis: %s" % axis)

    if not isinstance(contexts, (list, set, tuple)):
        contexts = [contexts]

    # materialize once as the IDs are walked again for every context
    search_ids = list(it)
    fetcher = redbiom.admin.ScriptManager.get('fetch-%s' % axis)

    retrieved = set()
    for context in contexts:
        found = None

        for id_ in search_ids:
            block = se(fetcher, 0, context, id_)

            if found is None:
                # the first block seeds the result in either mode
                found = set(block)
            elif exact:
                found = found.intersection(block)
            else:
                found.update(block)

        if found:
            retrieved |= found

    return retrieved
def load_scripts(read_only=True):
    """Load scripts into Redis

    Each script is uploaded, its name is mapped to its locally computed
    SHA1, and the stored mapping is read back and compared to that SHA1.

    Parameters
    ----------
    read_only : bool, optional
        If True, only load read-only scripts. If False, load writable scripts
        as well.
    """
    import redbiom
    import redbiom._requests
    import hashlib

    config = redbiom.get_config()
    session = redbiom._requests.get_session()
    post = redbiom._requests.make_post(config)
    get = redbiom._requests.make_get(config)

    for name, script in ScriptManager._scripts.items():
        skip = read_only and name in ScriptManager._admin_scripts
        if skip:
            continue

        digest = hashlib.sha1(script.encode('ascii')).hexdigest()

        # load the script
        session.put(config['hostname'] + '/SCRIPT/LOAD', data=script)

        # create a mapping
        post('state', 'HSET', 'scripts/%s/%s' % (name, digest))

        # verify we've correctly computed the hash
        stored = get('state', 'HGET', 'scripts/%s' % name)
        assert stored == digest
def valid(context, get=None):
    """Test if a context exists

    Parameters
    ----------
    context : str
        The name of the context to check.
    get : function, optional
        A get method. If None, one is constructed from the current
        configuration.

    Raises
    ------
    ValueError
        If the context is not known.
    """
    if get is None:
        import redbiom
        get = make_get(redbiom.get_config())

    known = get('state', 'HEXISTS', 'contexts/%s' % context)
    if known:
        return

    raise ValueError("Unknown context: %s" % context)
def drop_scripts():
    """Flush the loaded scripts in the redis database

    Issues ``SCRIPT FLUSH``, deletes the ``state:scripts`` key and resets
    the local ``ScriptManager._cache``.
    """
    import redbiom
    import redbiom._requests

    config = redbiom.get_config()
    session = redbiom._requests.get_session()

    # flush the scripts first, then drop the name -> SHA1 mapping
    for endpoint in ('/SCRIPT/FLUSH', '/DEL/state:scripts'):
        session.get(config['hostname'] + endpoint)

    ScriptManager._cache = {}
def search_samples(from_, exact, context, samples):
    """Get features present in samples."""
    import redbiom
    import redbiom._requests
    import redbiom.util

    get = redbiom._requests.make_get(redbiom.get_config())

    # only the last element of the resolution result is needed here
    resolved = redbiom.util.resolve_ambiguities(context, samples, get)
    _, _, _, redbiom_ids = resolved

    # materialize first, then hand a fresh iterator to the axis search
    redbiom_ids = list(redbiom_ids)
    _axis_search(from_, exact, context, iter(redbiom_ids), 'sample')
def make_get(config):
    """Factory function: produce a get() method

    Parameters
    ----------
    config : dict
        NOTE: this argument is currently replaced by the result of
        ``redbiom.get_config()`` and is therefore not consulted.

    Returns
    -------
    function
        A method accepting ``(context, cmd, data)`` which issues a GET
        request and returns the parsed, validated response.
    """
    import redbiom
    session = get_session()
    config = redbiom.get_config()

    def f(context, cmd, data):
        request_path = _format_request(context, cmd, data)
        response = session.get('/'.join([config['hostname'], request_path]))
        return _parse_validate_request(response, cmd)

    return f
def test_resolve_ambiguities_mixed(self):
    """Mixed input: plain IDs, an already tagged ID and unknown IDs."""
    import redbiom._requests
    import redbiom

    get = redbiom._requests.make_get(redbiom.get_config())

    redbiom.admin.create_context('test', 'foo')
    redbiom.admin.load_sample_metadata(metadata)
    redbiom.admin.ScriptManager.load_scripts(read_only=False)
    redbiom.admin.load_sample_data(table, 'test', tag=None)

    # IDs given without a tag
    plain_ids = ['10317.000047188', '10317.000046868', '10317.000051129',
                 '10317.000012975', '10317.000033804', '10317.000001405',
                 '10317.000022252', '10317.000001378']
    # an ID given in its redbiom (tagged) form
    tagged_id = 'UNTAGGED_10317.000003302'
    # IDs which are not expected to be found
    unknown_ids = ['foo', 'UNTAGGED_bar']

    samples = set(plain_ids) | set(unknown_ids) | {tagged_id}

    exp_stable = {"%s.UNTAGGED" % k: k for k in plain_ids}
    exp_stable['10317.000003302.UNTAGGED'] = tagged_id

    exp_unobserved = ['foo', 'UNTAGGED_bar']

    # every found sample, keyed by its ID without a tag
    found_ids = plain_ids + ['10317.000003302']
    exp_ambiguous = {k: ['UNTAGGED_%s' % k] for k in found_ids}
    exp_ri = {'UNTAGGED_%s' % k: '%s.UNTAGGED' % k for k in found_ids}

    observed = resolve_ambiguities('test', samples, get)
    obs_stable, obs_unobserved, obs_ambiguous, obs_ri = observed

    self.assertEqual(obs_stable, exp_stable)
    self.assertEqual(obs_unobserved, exp_unobserved)
    self.assertEqual(obs_ambiguous, exp_ambiguous)
    self.assertEqual(obs_ri, exp_ri)
def assert_test_env():
    """Refuse to run the tests against a host other than localhost

    Raises
    ------
    ValueError
        If the configured hostname does not start with ``http://127.0.0.1``
        and the ``REDBIOM_OVERRIDE_HOST_AND_TEST`` environment variable is
        not set to a non-empty value.
    """
    import os
    import redbiom

    hostname = redbiom.get_config()['hostname']

    if hostname.startswith('http://127.0.0.1'):
        return

    # an explicit opt-in permits testing against another host
    if os.environ.get('REDBIOM_OVERRIDE_HOST_AND_TEST', False):
        return

    raise ValueError("It appears the REDBIOM_HOST is not 127.0.0.1. "
                     "By default, the tests will not run on outside "
                     "of localhost, however if you're sure you want "
                     "to run the tests against the set host, please "
                     "set the environment variable "
                     "REDBIOM_OVERRIDE_HOST_AND_TEST")
def get_timestamps():
    """Obtain the stored timestamps

    Returns
    -------
    object
        The result of the LRANGE request over the full range of the list.

    Redis command summary
    ---------------------
    LRANGE state:timestamps 0 -1
    """
    import redbiom
    import redbiom._requests

    fetch = redbiom._requests.make_get(redbiom.get_config())

    # 0 through -1 spans the whole list
    full_range = 'timestamps/0/-1'
    return fetch('state', 'LRANGE', full_range)
def make_script_exec(config):
    """Factory function: produce a script_exec() method

    Parameters
    ----------
    config : dict
        NOTE: this argument is currently replaced by the result of
        ``redbiom.get_config()`` and is therefore not consulted.

    Returns
    -------
    function
        A method accepting ``(sha, *args)`` which issues an EVALSHA request
        and returns the JSON-decoded payload of the validated response.
    """
    import redbiom
    import json
    session = get_session()
    config = redbiom.get_config()

    def f(sha, *args):
        # every argument becomes one path component of the request URL
        parts = [config['hostname'], 'EVALSHA', sha] + [str(a) for a in args]
        response = session.get('/'.join(parts))
        return json.loads(_parse_validate_request(response, 'EVALSHA'))

    return f
def features(from_, context, exact, samples):
    """Given samples, select the features associated."""
    import redbiom
    import redbiom._requests
    import redbiom.util

    get = redbiom._requests.make_get(redbiom.get_config())

    iterator = redbiom.util.from_or_nargs(from_, samples)

    # only the last element of the resolution result is needed here
    resolved = redbiom.util.resolve_ambiguities(context, iterator, get)
    _, _, _, redbiom_ids = resolved

    found = redbiom.util.ids_from(redbiom_ids, exact, 'sample', context)
    for feature_id in found:
        click.echo(feature_id)
def seteval(str_, get=None, stemmer=None, target=None):
    """Evaluate a set operation string, where each Name is fetched

    The query is parsed in ``eval`` mode, every node of the resulting tree
    is checked against a whitelist of node types, and the dumped tree is
    then evaluated within this module's namespace.

    Parameters
    ----------
    str_ : str
        The query to evaluate
    get : function, optional
        A getting method, defaults to instantiating one from _requests
    stemmer : function, optional
        A method to stem a query Name. If None, defaults to passthrough.
    target : str, optional
        A subcontext to query against. If None, defaults to text-search.

    Returns
    -------
    object
        Whatever the evaluation of the dumped expression produces.

    Raises
    ------
    TypeError
        If the query contains a node type which is not whitelisted.

    Notes
    -----
    ``Load``, ``stemmer`` and ``target`` are injected into the module
    globals. ``Load`` is always removed before returning, including when
    parsing, validation or evaluation raises.
    """
    if get is None:
        import redbiom
        # import the submodule explicitly rather than relying on some other
        # module having imported it already
        import redbiom._requests
        config = redbiom.get_config()
        get = redbiom._requests.make_get(config)

    if stemmer is None:
        stemmer = passthrough

    if target is None:
        target = 'text-search'

    # Load is subject to indirection to simplify testing
    globals()['Load'] = make_Load(get)

    # this seems right now to be the easiest way to inject parameters
    # into Name
    globals()['stemmer'] = stemmer
    globals()['target'] = target

    try:
        formed = ast.parse(str_, mode='eval')

        node_types = (ast.BitAnd, ast.BitOr, ast.BitXor, ast.Name, ast.Sub,
                      ast.Expression, ast.BinOp, ast.Load)

        for node in ast.walk(formed):
            if not isinstance(node, node_types):
                raise TypeError("Unsupported node type: %s" % ast.dump(node))

        # NOTE(security): eval() is only reached once every node has passed
        # the whitelist above; do not relax the whitelist without reviewing
        # what the dumped expression can then invoke.
        return eval(ast.dump(formed))
    finally:
        # clean up the injected Load, even if something above failed. The
        # stemmer and target globals stay as they were set, which matches
        # the behavior prior to this cleanup being made exception safe.
        globals().pop('Load', None)
def category_sample_values(category, samples=None):
    """Obtain the samples and their corresponding category values

    Parameters
    ----------
    category : str
        A metadata column of interest.
    samples : Iterable of str, optional
        If provided, only the specified samples and their values are obtained.

    Returns
    -------
    pandas.Series
        A Series indexed by the Sample ID and valued by the metadata value
        for that sample for the specified category.

    Redis command summary
    ---------------------
    HGETALL metadata:category:<category>
    HMGET metadata:category:<category> <sample_id> ... <sample_id>
    """
    import redbiom
    import redbiom._requests
    # redbiom.util is used below; import it explicitly rather than relying
    # on some other module having imported it already
    import redbiom.util
    import pandas as pd

    get = redbiom._requests.make_get(redbiom.get_config())

    key = 'category:%s' % category
    if samples is None:
        # no restriction, so pull the entire hash for the category
        keys_vals = list(get('metadata', 'HGETALL', key).items())
    else:
        untagged, _, _, tagged_clean = \
            redbiom.util.partition_samples_by_tags(samples)
        samples = untagged + tagged_clean
        getter = redbiom._requests.buffered(iter(samples), None, 'HMGET',
                                            'metadata', get=get,
                                            buffer_size=100, multikey=key)

        # pair each requested sample with the value returned for it, chunk
        # by chunk
        keys_vals = [(sample, obs_val)
                     for idx, vals in getter
                     for sample, obs_val in zip(idx, vals)]

    index = [sample for sample, _ in keys_vals]
    data = [value for _, value in keys_vals]
    return pd.Series(data=data, index=index)
def make_put(config):
    """Factory function: produce a put() method

    Within Webdis, PUT is generally used to provide content in the body for
    use as a file upload.

    Parameters
    ----------
    config : dict
        NOTE: this argument is currently replaced by the result of
        ``redbiom.get_config()`` and is therefore not consulted.

    Returns
    -------
    function
        A method accepting ``(context, cmd, key, data)`` which issues a PUT
        request with `data` as the body and returns the parsed, validated
        response.
    """
    import redbiom
    session = get_session()
    config = redbiom.get_config()

    def f(context, cmd, key, data):
        hostname = config['hostname']
        request_path = _format_request(context, cmd, key)
        response = session.put('/'.join([hostname, request_path]), data=data)
        return _parse_validate_request(response, cmd)

    return f
def get_index(context, key, axis):
    """Get a unique integer value for a key within a context

    Parameters
    ----------
    context : str
        The context to operate in
    key : str
        The key to get a unique index for
    axis : str
        Either feature or sample

    Notes
    -----
    This method is an atomic equivalent of:

        def get_or_set(d, item):
            if item not in d:
                d[item] = len(d)
            return d[item]

    Returns
    -------
    int
        A unique integer index within the context for the key

    Raises
    ------
    ValueError
        If the request does not return with a status code of 200.
    """
    import redbiom
    import redbiom._requests

    config = redbiom.get_config()

    # we need to issue the request directly as the command structure is
    # rather different than other commands
    session = redbiom._requests.get_session()
    sha = ScriptManager.get('get-index')

    # the script operates on a single key, the index for this context/axis
    index_key = "%s:%s-index" % (context, axis)
    url = '/'.join([config['hostname'], 'EVALSHA', sha, '1', index_key, key])

    req = session.get(url)
    if req.status_code == 200:
        return int(req.json()['EVALSHA'])

    raise ValueError("Unable to obtain index; %d; %s" % (req.status_code,
                                                         req.content))
def has_sample_metadata(samples, get=None):
    """Test if all samples have sample metadata

    Parameters
    ----------
    samples : Iterable of str
        The samples to check.
    get : function, optional
        A get method. If None, one is constructed from the current
        configuration.

    Returns
    -------
    bool
        True if every untagged sample and every tagged sample, with its tag
        removed, is among the samples represented in the metadata.
    """
    import redbiom._requests

    if get is None:
        import redbiom
        get = redbiom._requests.make_get(redbiom.get_config())

    untagged, _, _, tagged_clean = partition_samples_by_tags(samples)

    # make sure all samples have metadata
    represented = set(get('metadata', 'SMEMBERS', 'samples-represented'))

    return all(set(group).issubset(represented)
               for group in (untagged, tagged_clean))
def samples_in_context(context, unambiguous, get=None):
    """Fetch samples in a context

    Parameters
    ----------
    context : str
        The context to obtain samples from.
    unambiguous : bool
        If True, return the identifiers as stored in the context. If False,
        return the identifiers with their tags removed.
    get : a make_get instance, optional
        A constructed get method.

    Returns
    -------
    set
        The set of sample identifiers within a context.

    Raises
    ------
    ValueError
        If the requested context is not known.

    Redis Command Summary
    ---------------------
    SMEMBERS <context>:samples-represented
    """
    import redbiom
    import redbiom._requests
    import redbiom.util

    if get is None:
        get = redbiom._requests.make_get(redbiom.get_config())

    redbiom._requests.valid(context, get)

    stored = get(context, 'SMEMBERS', 'samples-represented')

    if unambiguous:
        return set(stored)

    # strip the tags off of the stored identifiers
    _, _, _, tagged_clean = redbiom.util.partition_samples_by_tags(stored)
    return set(tagged_clean)
def select_samples_from_metadata(from_, context, query, samples):
    """Given samples, select based on metadata"""
    import redbiom.util
    import redbiom.search
    import redbiom
    import redbiom._requests

    get = redbiom._requests.make_get(redbiom.get_config())

    iterator = redbiom.util.from_or_nargs(from_, samples)

    # only the ambiguity map is needed here
    resolved = redbiom.util.resolve_ambiguities(context, iterator, get)
    _, _, ambig, _ = resolved

    full_search = redbiom.search.metadata_full(query)

    # keep the samples which satisfy the query and were also requested
    selected = full_search & set(ambig)
    for sample_id in selected:
        for rid in ambig[sample_id]:  # get unambiguous redbiom id
            click.echo(rid)
def contexts(detail=True):
    """Obtain the name and description of known contexts

    Parameters
    ----------
    detail : bool, optional
        If True, obtain additional context detail.

    Returns
    -------
    DataFrame
        Containing context information. Without detail, a single
        ``ContextName`` column. With detail, the columns ``ContextName``,
        ``SamplesWithData``, ``FeaturesWithData`` and ``Description``.

    Redis command summary
    ---------------------
    HKEYS state:contexts
    HGETALL state:contexts
    SCARD <context>:samples-represented
    SCARD <context>:features-represented
    """
    import pandas as pd
    import redbiom
    import redbiom._requests

    get = redbiom._requests.make_get(redbiom.get_config())

    if not detail:
        # names only, no per-context requests are needed
        names = get('state', 'HKEYS', 'contexts')
        return pd.DataFrame(names, columns=['ContextName'])

    described = get('state', 'HGETALL', 'contexts')

    rows = []
    for name, desc in described.items():
        n_samples = get(name, 'SCARD', 'samples-represented')
        n_features = get(name, 'SCARD', 'features-represented')
        rows.append((name, int(n_samples), int(n_features), desc))

    columns = ['ContextName', 'SamplesWithData', 'FeaturesWithData',
               'Description']
    return pd.DataFrame(rows, columns=columns)