示例#1
0
def stat(where, limit=16, **kwargs):
    """
    Compute weighted-average column statistics for the blobs selected by `where`.

    A ``StatPipe`` job is run over the blobs that ``_get_blobs`` returns for
    `where`. Every job result is a dict of per-column values plus a ``'_'``
    entry; the ``'_'`` entry is used as that result's weight (presumably a
    row count -- confirm against ``StatPipe``).

    :param where: object passed to ``_get_blobs``; it must expose ``_name``,
        which is used to build the job name
    :param limit: forwarded to ``_get_blobs`` (presumably caps the number of
        blobs examined -- confirm against ``_get_blobs``); default is 16
    :param kwargs: overrides forwarded to ``Settings``

    :return: dict mapping each column to ``int(weighted average * 100)``,
        keeping only columns whose value is greater than zero, plus a ``'_'``
        entry holding the integer sum of all weights. When the weights sum to
        zero the result is ``{'_': 0}``.
    """
    from hustle.core.settings import Settings
    from hustle.core.stat import StatPipe
    from disco.core import result_iterator
    from collections import defaultdict

    settings = Settings(**kwargs)
    ddfs = settings['ddfs']
    # Sort each blob's entries and de-duplicate so the same blob is not
    # submitted twice.
    job_blobs = set(tuple(sorted(w)) for w in _get_blobs(where, ddfs, limit))
    job = StatPipe(settings['server'])
    job.run(name="stat_" + where._name, input=job_blobs, **settings)
    res = job.wait()

    # Read the job output once: the grand total is needed before any weight
    # can be computed, so the rows are kept for the second pass.
    rows = [cols for _, cols in result_iterator(res)]
    total = float(sum(cols['_'] for cols in rows))
    if not total:
        # No results, or every weight is zero: there is nothing to average
        # and dividing by the total would raise ZeroDivisionError.
        return {'_': 0}

    final = defaultdict(float)
    for cols in rows:
        weight = cols['_'] / total
        for col, card in cols.items():
            if col != '_':
                final[col] += card * weight

    # Scale by 100 and truncate to an integer; drop columns that end up at 0.
    really_final = {}
    for key, value in final.items():
        card = int(value * 100)
        if card > 0:
            really_final[key] = card
    really_final['_'] = int(total)

    return really_final
示例#2
0
def stat(where, limit=16, **kwargs):
    """
    Run a ``StatPipe`` job for `where` and merge its per-blob results.

    The job's results are dicts of per-column values with an extra ``'_'``
    entry. The ``'_'`` values are summed into a total and each result's
    columns are averaged, weighted by that result's share of the total.
    (``'_'`` is presumably a row count -- confirm against ``StatPipe``.)

    :param where: object passed to ``_get_blobs``; its ``_name`` attribute is
        appended to ``"stat_"`` to form the job name
    :param limit: forwarded to ``_get_blobs`` (presumably the maximum number
        of blobs to use -- confirm against ``_get_blobs``); default is 16
    :param kwargs: overrides forwarded to ``Settings``

    :return: dict of ``column -> int(weighted average * 100)`` for every
        column whose scaled value is greater than zero, plus ``'_'`` mapped
        to the integer total of the weights. Returns ``{'_': 0}`` when the
        total weight is zero.
    """
    from hustle.core.settings import Settings
    from hustle.core.stat import StatPipe
    from disco.core import result_iterator
    from collections import defaultdict

    settings = Settings(**kwargs)
    ddfs = settings['ddfs']
    # Normalise each blob to a sorted tuple so duplicates collapse in the set.
    job_blobs = set(tuple(sorted(w)) for w in _get_blobs(where, ddfs, limit))
    job = StatPipe(settings['server'])
    job.run(name="stat_" + where._name, input=job_blobs, **settings)
    res = job.wait()

    # Materialise the results so they are streamed only once; the total must
    # be known before the weighted average can be accumulated.
    rows = [cols for _, cols in result_iterator(res)]
    total = float(sum(cols['_'] for cols in rows))
    if not total:
        # Guard against ZeroDivisionError when all weights are zero (and
        # keep the empty-result answer the same).
        return {'_': 0}

    final = defaultdict(float)
    for cols in rows:
        weight = cols['_'] / total
        for col, card in cols.items():
            if col != '_':
                final[col] += card * weight

    # Scale to a percentage-like integer (truncated) and drop zero entries.
    really_final = {}
    for key, value in final.items():
        card = int(value * 100)
        if card > 0:
            really_final[key] = card
    really_final['_'] = int(total)

    return really_final
示例#3
0
def stat(where, limit=16, **kwargs):
    """
    Fetch statistical information of a collection of selected `Table <hustle.Table>`.

    :type where: :class:`Table <hustle.Table>` | :class:`Expr <hustle.core.marble.Expr>`
    :param where: the Table to fetch data from, as well as the conditions in the *where clause*.
        The job name is built from ``where._name``, so the object passed in must expose
        that attribute (NOTE(review): a plain sequence of tables would not -- confirm
        which inputs callers actually pass).

    :type limit: int
    :param limit: the maximum number of blobs from the where clause, default value is 16

    :param kwargs: overrides forwarded to ``Settings``

    Return a dict of column key cardinalities [0-100] for indexed columns in a table.
    Columns whose scaled cardinality truncates to 0 are omitted. The extra ``'_'`` key
    holds the integer sum of the per-blob ``'_'`` values used as weights. If that sum
    is zero, ``{'_': 0}`` is returned.
    """
    from hustle.core.settings import Settings
    from hustle.core.stat import StatPipe
    from disco.core import result_iterator
    from collections import defaultdict

    settings = Settings(**kwargs)
    ddfs = settings['ddfs']
    # Sort each blob's entries and de-duplicate so the same blob is not
    # submitted twice.
    job_blobs = set(tuple(sorted(w)) for w in _get_blobs(where, ddfs, limit))
    job = StatPipe(settings['server'])
    job.run(name="stat_" + where._name, input=job_blobs, **settings)
    res = job.wait()

    # first we need the total, so that we can calculate weighted average;
    # keep the rows so the job output is only read once
    rows = [cols for _, cols in result_iterator(res)]
    total = float(sum(cols['_'] for cols in rows))
    if not total:
        # every weight is zero (or there are no results): avoid dividing by zero
        return {'_': 0}

    final = defaultdict(float)
    for cols in rows:
        weight = cols['_'] / total
        for col, card in cols.items():
            if col != '_':
                final[col] += card * weight

    # scale everything to an integer between 0 .. 100 (truncating), dropping zeros
    really_final = {}
    for key, value in final.items():
        card = int(value * 100)
        if card > 0:
            really_final[key] = card
    really_final['_'] = int(total)

    return really_final
示例#4
0
def stat(where, limit=16, **kwargs):
    """
    Fetch statistical information of a collection of selected `Table <hustle.Table>`.

    :type where: :class:`Table <hustle.Table>` | :class:`Expr <hustle.core.marble.Expr>`
    :param where: the Table to fetch data from, as well as the conditions in the *where clause*.
        ``where._name`` is read to name the job, so the argument must provide it
        (NOTE(review): a bare sequence of tables would fail here -- confirm intended inputs).

    :type limit: int
    :param limit: the maximum number of blobs from the where clause, default value is 16

    :param kwargs: overrides forwarded to ``Settings``

    Return a dict of column key cardinalities [0-100] for indexed columns in a table.
    Only columns whose scaled, truncated cardinality is greater than 0 are included.
    The ``'_'`` key carries the integer total of the per-blob ``'_'`` weights; when
    that total is zero the function returns ``{'_': 0}``.
    """
    from hustle.core.settings import Settings
    from hustle.core.stat import StatPipe
    from disco.core import result_iterator
    from collections import defaultdict

    settings = Settings(**kwargs)
    ddfs = settings['ddfs']
    # Normalise each blob to a sorted tuple so duplicates collapse in the set.
    job_blobs = set(tuple(sorted(w)) for w in _get_blobs(where, ddfs, limit))
    job = StatPipe(settings['server'])
    job.run(name="stat_" + where._name, input=job_blobs, **settings)
    res = job.wait()

    # The total is required before the weighted average can be computed, so
    # the results are collected once and reused instead of being re-read.
    rows = [cols for _, cols in result_iterator(res)]
    total = float(sum(cols['_'] for cols in rows))
    if not total:
        # Nothing to weight by; dividing would raise ZeroDivisionError.
        return {'_': 0}

    final = defaultdict(float)
    for cols in rows:
        weight = cols['_'] / total
        for col, card in cols.items():
            if col != '_':
                final[col] += card * weight

    # Scale to 0 .. 100 and truncate to int; entries that reach 0 are dropped.
    really_final = {}
    for key, value in final.items():
        card = int(value * 100)
        if card > 0:
            really_final[key] = card
    really_final['_'] = int(total)

    return really_final