def stat(where, limit=16, **kwargs):
    """
    Run a :class:`StatPipe` job over the blobs selected by `where` and
    return the weighted average of the per-column values it reports.

    :type where: object accepted by ``_get_blobs`` that exposes ``_name``
    :param where: selects the blobs to inspect; its ``_name`` is used to
        build the job name

    :type limit: int
    :param limit: forwarded to ``_get_blobs``; default value is 16

    :param kwargs: overrides used to build the :class:`Settings` object

    Return a dict mapping each column to an integer percentage (values that
    truncate to 0 are omitted), plus the key ``'_'`` holding the integer sum
    of the ``'_'`` entries of all results.  When that sum is zero the result
    is ``{'_': 0}``.
    """
    from hustle.core.settings import Settings
    from hustle.core.stat import StatPipe
    from disco.core import result_iterator
    from collections import defaultdict

    settings = Settings(**kwargs)
    ddfs = settings['ddfs']
    # Normalise every entry to a sorted tuple so the set drops duplicates
    # regardless of the order of the elements inside an entry.
    job_blobs = set(tuple(sorted(w)) for w in _get_blobs(where, ddfs, limit))

    job = StatPipe(settings['server'])
    job.run(name="stat_" + where._name, input=job_blobs, **settings)
    res = job.wait()

    # Read the job output once; both passes below work on this list instead
    # of iterating the job results a second time.
    results = [cols for _, cols in result_iterator(res)]

    # First we need the total, so that we can calculate a weighted average.
    total = float(sum(cols['_'] for cols in results))
    if not total:
        # Nothing to weight by: avoid dividing by zero and return the same
        # shape an empty result set produces.
        return {'_': 0}

    final = defaultdict(float)
    for cols in results:
        weight = cols.pop('_') / total
        for col, card in cols.items():
            final[col] += card * weight

    # Scale to an integer percentage (truncated) and drop columns that end
    # up at 0.
    really_final = {}
    for key, value in final.items():
        card = int(value * 100)
        if card > 0:
            really_final[key] = card
    really_final['_'] = int(total)
    return really_final
def stat(where, limit=16, **kwargs):
    """
    Fetch statistical information of a collection of selected
    `Table <hustle.Table>`.

    :type where: sequence of :class:`Table <hustle.Table>` | :class:`Expr <hustle.core.marble.Expr>`
    :param where: the Tables to fetch data from, as well as the conditions
        in the *where clause*

    :type limit: int
    :param limit: the maximum number of blobs from the where clause, default
        value is 16

    :param kwargs: overrides used to build the :class:`Settings` object

    Return a dict of column key cardinalities [0-100] for indexed columns in
    a table.  Columns whose value truncates to 0 are omitted.  The extra key
    ``'_'`` holds the integer sum of the ``'_'`` entries of all results; when
    that sum is zero the result is ``{'_': 0}``.
    """
    from hustle.core.settings import Settings
    from hustle.core.stat import StatPipe
    from disco.core import result_iterator
    from collections import defaultdict

    settings = Settings(**kwargs)
    ddfs = settings['ddfs']
    # Sorted tuples are hashable and order-insensitive, so the set removes
    # duplicate entries before the job is started.
    job_blobs = set(tuple(sorted(w)) for w in _get_blobs(where, ddfs, limit))

    job = StatPipe(settings['server'])
    job.run(name="stat_" + where._name, input=job_blobs, **settings)
    res = job.wait()

    # Materialise the job output once so it is not iterated twice.
    results = [cols for _, cols in result_iterator(res)]

    # The total is needed first so each result can be weighted by its share.
    total = float(sum(cols['_'] for cols in results))
    if not total:
        # A zero total would make every weight a division by zero; report
        # an empty statistic instead, as for an empty result set.
        return {'_': 0}

    final = defaultdict(float)
    for cols in results:
        weight = cols.pop('_') / total
        for col, card in cols.items():
            final[col] += card * weight

    # Convert to integer percentages (int() truncates) and keep only the
    # columns with a non-zero value.
    really_final = {}
    for key, value in final.items():
        card = int(value * 100)
        if card > 0:
            really_final[key] = card
    really_final['_'] = int(total)
    return really_final