示例#1
0
    def test_limit(self):
        """Check how check_query reacts to the limit argument.

        ``limit_single_invalid`` must make check_query raise ValueError,
        while ``limit_single`` must yield a truthy result.
        """
        # Everything except the limit is shared by both calls.
        select, where = self.single_select, self.single_where

        self.assertRaises(ValueError, check_query,
                          select, [], [], self.limit_single_invalid, where)

        accepted = check_query(select, [], [], self.limit_single, where)
        self.assertTrue(accepted)
示例#2
0
 def test_order_by(self):
     """Check how check_query reacts to the order_by argument.

     Ordering by ``self.order_by`` must raise ValueError, while ordering
     by ``albums.name`` on the same select must yield a truthy result.
     """
     # should raise if select columns don't contain the order column
     self.assertRaises(ValueError, check_query, self.single_select, [],
                       self.order_by, None, self.single_where)

     ordered_by_name = check_query(self.single_select, [],
                                   [self.albums.name], None,
                                   self.single_where)
     self.assertTrue(ordered_by_name)
示例#3
0
 def test_where_clauses(self):
     """Check how check_query reacts to the wheres argument.

     A where expression on ``self.transaction`` combined with
     ``self.single_select`` must raise ValueError (presumably because the
     two refer to different tables -- confirm against the fixtures), while
     ``self.single_where`` must yield a truthy result.
     """
     # should raise if where and select are from different tables
     # NOTE(review): the "single table shows up in multi-wheres" case that
     # the original comment mentioned is not exercised by this test.
     with self.assertRaises(ValueError):
         # The 4th positional argument of check_query is the limit, so it
         # is None here; passing an order_by value in that slot could make
         # the call fail for a reason unrelated to the where clause.
         check_query(self.single_select, [], [], None,
                     [self.transaction.id == 1000])
     self.assertTrue(
         check_query(self.single_select, [], [], None, self.single_where))
示例#4
0
 def test_order_by(self):
     """Exercise check_query with a rejected and an accepted order_by.

     Both calls share the select and where arguments; only the order_by
     value differs.
     """
     select, where = self.single_select, self.single_where

     # should raise if select columns don't contain the order column
     rejected = (select, [], self.order_by, None, where)
     self.assertRaises(ValueError, check_query, *rejected)

     accepted = (select, [], [self.albums.name], None, where)
     self.assertTrue(check_query(*accepted))
示例#5
0
 def test_select_clauses(self):
     """Exercise check_query with an empty, a duplicated and a plain select.

     The empty and the duplicated select must raise ValueError; the plain
     ``single_select`` must yield a truthy result.
     """
     where = self.single_where
     duplicated = self.single_select + self.single_select

     # test empty select, then test duplicate select
     # NOTE(review): both failing calls also pass self.order_by, which
     # test_order_by expects to raise with single_select on its own, so
     # these cases may not isolate the select check -- confirm.
     for bad_select in ([], duplicated):
         self.assertRaises(ValueError, check_query,
                           bad_select, [], self.order_by, None, where)

     self.assertTrue(check_query(self.single_select, [], [], None, where))
示例#6
0
    def test_join(self):
        """Exercise check_query with rejected and accepted join arguments.

        ``self.join`` must raise ValueError when paired with
        ``single_where``; each of the three ``join_invalid*`` fixtures must
        raise with ``cross_wheres``; ``self.join`` with ``cross_wheres``
        must yield a truthy result.
        """
        select = self.single_select

        # test join with single table
        self.assertRaises(ValueError, check_query,
                          select, self.join, [], None, self.single_where)

        # test invalid joins
        for bad_join in (self.join_invalid,
                         self.join_invalid_1,
                         self.join_invalid_2):
            self.assertRaises(ValueError, check_query,
                              select, bad_join, [], None, self.cross_wheres)

        joined = check_query(select, self.join, [], None, self.cross_wheres)
        self.assertTrue(joined)
示例#7
0
 def test_where_clauses(self):
     """Check how check_query reacts to the wheres argument.

     A where expression on ``self.transaction`` combined with
     ``self.single_select`` must raise ValueError (presumably because the
     two refer to different tables -- confirm against the fixtures), while
     ``self.single_where`` must yield a truthy result.
     """
     # should raise if where and select are from different tables
     # NOTE(review): the "single table shows up in multi-wheres" case that
     # the original comment mentioned is not exercised by this test.
     with self.assertRaises(ValueError):
         # The 4th positional argument of check_query is the limit, so it
         # is None here; passing an order_by value in that slot could make
         # the call fail for a reason unrelated to the where clause.
         check_query(self.single_select,
                     [],
                     [],
                     None,
                     [self.transaction.id == 1000])
     self.assertTrue(check_query(self.single_select, [], [],
                                 None, self.single_where))
示例#8
0
    def test_limit(self):
        """Check how check_query reacts to the limit argument.

        ``limit_single_invalid`` must raise ValueError; ``limit_single``
        must yield a truthy result.
        """
        def run(limit):
            # Only the limit varies between the two calls below.
            return check_query(self.single_select, [], [], limit,
                               self.single_where)

        with self.assertRaises(ValueError):
            run(self.limit_single_invalid)

        self.assertTrue(run(self.limit_single))
示例#9
0
 def test_select_clauses(self):
     """Exercise check_query with an empty, a duplicated and a plain select.

     The empty and the duplicated select must raise ValueError; the plain
     ``single_select`` must yield a truthy result.
     """
     # Arguments shared by the two failing calls (join, order_by, limit,
     # wheres); only the select in front of them changes.
     tail = ([], self.order_by, None, self.single_where)

     # test empty select
     with self.assertRaises(ValueError):
         check_query([], *tail)

     # test duplicate select
     with self.assertRaises(ValueError):
         check_query(self.single_select + self.single_select, *tail)

     plain = check_query(self.single_select, [], [], None,
                         self.single_where)
     self.assertTrue(plain)
示例#10
0
 def test_full_query(self):
     """check_query must accept a query that supplies every argument."""
     # Positional order: select, join, order_by, limit, wheres.
     query = (self.cross_select,
              self.join,
              self.single_select,
              self.limit_single,
              self.cross_wheres)
     self.assertTrue(check_query(*query))
示例#11
0
def _create_job(*project, **kwargs):
    """
    Build (but do not run) a SelectPipe job for the given projection.

    :param project: positional list of columns / aggregations to select
    :param kwargs: query options (``where``, ``order_by``, ``join``,
        ``distinct``, ``desc``, ``limit``, ``partition``, ``nest``) plus
        any other settings accepted by ``Settings``

    :return: a ``(job, job_blobs, name)`` tuple, or ``None`` when the query
        fails validation (the reason is printed)
    """
    from hustle import _get_blobs
    from hustle.core.settings import Settings
    from hustle.core.pipeline import SelectPipe
    from hustle.core.util import ensure_list

    settings = Settings(**kwargs)

    # Query clauses are removed from the settings; the rest is only read.
    wheres = ensure_list(settings.pop('where', ()))
    order_by = ensure_list(settings.pop('order_by', ()))
    join = settings.pop('join', ())
    distinct = settings.pop('distinct', False)
    desc = settings.pop('desc', False)
    limit = settings.pop('limit', None)
    ddfs = settings['ddfs']
    # Negative partition counts are clamped to zero.
    partition = max(settings.get('partition', 0), 0)
    nest = settings.get('nest', False)

    try:
        # A string join is resolved to real columns first, because the
        # query checker expects columns.
        join = _resolve_join(wheres, join)
        check_query(project, join, order_by, limit, wheres)
    except ValueError as err:
        print("  Invalid query:\n    %s" % err)
        return None

    # Job name: the table names joined by '-', capped at 64 characters.
    name = '-'.join(clause._name for clause in wheres)[:64]

    # Collect the blobs of every where clause, de-duplicated as sorted tuples.
    job_blobs = set(tuple(sorted(blob))
                    for clause in wheres
                    for blob in _get_blobs(clause, ddfs))

    pipe_options = dict(wheres=wheres,
                        project=project,
                        order_by=order_by,
                        join=join,
                        distinct=distinct,
                        desc=desc,
                        limit=limit,
                        partition=partition,
                        nest=nest)
    job = SelectPipe(settings['server'], **pipe_options)
    return job, job_blobs, name
示例#12
0
    def test_join(self):
        """Exercise check_query with rejected and accepted join arguments.

        Four (join, wheres) combinations must raise ValueError; ``self.join``
        together with ``cross_wheres`` must yield a truthy result.
        """
        rejected = [
            # test join with single table
            (self.join, self.single_where),
            # test invalid joins
            (self.join_invalid, self.cross_wheres),
            (self.join_invalid_1, self.cross_wheres),
            (self.join_invalid_2, self.cross_wheres),
        ]
        for join, wheres in rejected:
            with self.assertRaises(ValueError):
                check_query(self.single_select, join, [], None, wheres)

        accepted = check_query(self.single_select, self.join, [], None,
                               self.cross_wheres)
        self.assertTrue(accepted)
示例#13
0
def select(*project, **kwargs):
    """
    Perform a relational query, by selecting rows and columns from one or more tables.

    The return value is one of:

    * a :class:`QueryResult` over the resulting records (blocking call, :code:`nest==False`, :code:`dump==False`)
    * a :class:`Table <hustle.Table>` instance when :code:`nest==True`
    * None in the case of :code:`nest==False and dump==True`; the records are printed to stdout instead
      (this is the default CLI interaction)
    * a :class:`Future <hustle.Future>` object when :code:`block==False`
    * None when the query is invalid or the requested *tag* already exists (a message is printed)

    For all of the examples below, *imps* and *pix* are instances of :class:`Table <hustle.Table>`.

    :type project: list of :class:`Column <hustle.core.marble.Column>` | :class:`Aggregation <hustle.core.marble.Aggregation>`
    :param project: a positional argument list of columns and aggregate expressions to return in the result

        A simple projection::

            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps)

        Selects three columns from the *imps* table.

        Hustle also allows for *aggregation functions* such as :func:`h_sum() <hustle.h_sum>`,
        :func:`h_count <hustle.h_count>`, :func:`h_min() <hustle.h_min>`, :func:`h_max() <hustle.h_max>`,
        :func:`h_avg <hustle.h_avg>` as in this example which sums the :code:`imps.cpm_millis`
        column::

            select(imps.ad_id, h_sum(imps.cpm_millis), h_count(), where=imps.date == '2014-01-27')

        Note that Hustle doesn't have a *group by* clause.  In this query, the output will be *grouped* by the
        :code:`imps.ad_id` column implicitly.  Note that in Hustle, if there is an aggregation function present in the
        :code:`project` param, the query results will be *grouped* by all non-aggregation columns present.

    :type where: (optional) sequence of :class:`Table <hustle.Table>` | :class:`Expr <hustle.core.marble.Expr>`
    :param where: the Tables to fetch data from, as well as the conditions in the *where clause*

        This serves two purposes: to specify the tables that are to be queried and to allow for the
        selection of data under specific criteria with our Python DSL selection syntax, much like SQL's *where
        clause*::

            # simple projection with restriction
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.date == '2014-01-27')

        Note the :code:`==` operation between the :code:`imps.date` column and the date string.
        The :class:`Column <hustle.core.marble.Column>`
        class overrides all of Python's comparison operators, which, along with the *&*, *|* and *~* logical
        operators allows you to build arbitrarily complex column selection expressions like this::

            select(imps.ad_id, imps.date, imps.cpm_millis,
                    where=((imps.date >= '2014-01-21') & (imps.date <= '2014-01-23')) |
                          ~(imps.site_id == 'google.com'))

        Note that for these expressions, the column must come first.  This means that the following expression is
        **illegal**::

            select(imps.ad_id, imps.date, imps.cpm_millis, where='2014-01-27' == imps.date)

        Where clause also supports *in* and *not in* statements by using special operators "<<" and ">>" respectively::

            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.ad_id << [1000, 1005])
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.ad_id >> [1000, 1005])

        Note that the right operand of "<<" and ">>" can be any type of iterable whose elements are each
        a valid single right value.

        In addition, multiple tables can be specified in the where clause like this::

            select(imps.ad_id, pix.amount, where=(imps.date < '2014-01-13', pix))

        which specifies an expression, :code:`imps.date < '2014-01-13'` and a :class:`Table <hustle.Table>` tuple.
        This query will simply return all of the *ad_id* values in *imps* for dates less than January 13th followed
        by all of the *amount* values in the *pix* table.

        Using multiple tables is typically reserved for when you use a *join clause*

    :type join: string | sequence of exactly length 2 of :class:`Column <hustle.core.marble.Column>`
    :param join: specifies the columns to perform a relational join operation on for the query

        The join columns can be specified either as a list of 2 columns, or a list of 2 strings. In particular, if
        two columns have the same names, a single string is valid as well.

        Here's an example of a Hustle join::

            select(imps.ad_id, imps.site_id, h_sum(pix.amount), h_count(),
                   where=(imps.date < '2014-01-13', pix.date < '2014-01-13'),
                   join=(imps.site_id, pix.site_id))

        or equivalently::

            select(imps.ad_id, imps.site_id, h_sum(pix.amount), h_count(),
                   where=(imps.date < '2014-01-13', pix.date < '2014-01-13'),
                   join='site_id')

        which joins the *imps* and *pix* tables on their common *site_id* column, then returns the sum of the
        *pix.amount* columns and a count, grouped by the *ad_id* and the *site_id*.  The equivalent query in SQL
        is::

            select i.ad_id, i.site_id, sum(p.amount), count(*)
            from imps i
            join pix p on i.site_id = p.site_id
            where i.date < '2014-01-13' and p.date < '2014-01-13'
            group by i.ad_id, i.site_id

    :type full_join: bool
    :param full_join:

        if True, specifies that a full join between the specified tables in the *where clause* should
        be joined in a full cross-product.  Note that if both *full_join* and *join* are specified, *join* will
        be ignored.

    :type order_by: string | :class:`Column <hustle.core.marble.Column>` | int |
        (sequence of string | :class:`Column <hustle.core.marble.Column>` | int)
    :param order_by: the column(s) to sort the result by

        The sort columns can be specified either as a Column or a list of Columns.  Alternatively, you can specify
        a column by using a string with either the name of the column or the *table.column* string notation.
        Furthermore, you can also represent the column using a zero based index of the *projected* columns.  This
        last case would be used for *Aggregations*.  Here are a few examples::

            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by=imps.date)
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by=(imps.date, imps.ad_id))
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by='date')
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by='imps.date')
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by=('date', imps.ad_id))
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by=('date', 2))
            select(imps.ad_id, imps.date, h_sum(imps.cpm_millis), where=imps, order_by=2)

    :type desc: boolean
    :param desc: affects sort order of the *order_by clause* to descending (default ascending)

    :type distinct: boolean
    :param distinct: indicates whether to remove duplicates in results

    :type limit: int
    :param limit: limits the total number of records in the output

    :type block: boolean
    :param block: make select call either blocking (default) or non-blocking.  If False, causes select() to return
        a :class:`Future <hustle.Future>` object

    :type nest: boolean (default = False)
    :param nest: specify that the return value is a :class:`Table <hustle.Table>` to be used in another query

        This allows us to build nested queries.  You may want to do this to join more than two tables, or to reuse
        the results of a query in more than one subsequent query.  For example::

            active_pix = select(*star(pix), where=pix.isActive > 0, nest=True)
            select(h_sum(active_pix.amount), where=active_pix)

    :type tag: string (default = None)
    :param tag: specify the tag name for a nested query, note it must be used with option "nest". If this option
        is not specified, a random name will be given to the result of this nested query.  If a table with this
        tag already exists, a message is printed and None is returned.

    :type dump: boolean (default = False)
    :param dump: when True (and :code:`nest` is False) a blocking call prints the result to stdout and returns None

    :type max_cores: int (default = 0)
    :param max_cores: specify the max number of cores (disco workers) this query could utilize. 0 means no limit

    :type profile: boolean (default = False)
    :param profile: specify whether to generate the disco job's profile

    :type purge: boolean (default = True)
    :param purge: specify whether to purge the query related data. This only works when "dump = True" and "profile = False".

    :type kwargs: dict
    :param kwargs: custom settings for this query see :mod:`hustle.core.settings`.  The *wide* and
        *pre_order_stage* options are also accepted and are forwarded unchanged to the select pipeline.

    """

    from hustle.core.settings import Settings
    from hustle.core.pipeline import SelectPipe
    from hustle.core.util import ensure_list

    settings = Settings(**kwargs)
    # Query options are popped so that only generic settings are left to be
    # forwarded to job.run() below.
    wheres = ensure_list(settings.pop('where', ()))
    order_by = ensure_list(settings.pop('order_by', ()))
    join = settings.pop('join', ())
    full_join = settings.pop('full_join', False)
    distinct = settings.pop('distinct', False)
    desc = settings.pop('desc', False)
    limit = settings.pop('limit', None)
    wide = settings.pop('wide', False)
    nest = settings.pop('nest', False)
    tag = settings.pop('tag', None)
    block = settings.pop('block', True)
    autodump = settings.pop('dump', False)
    pre_order_stage = settings.pop('pre_order_stage', ())
    ddfs = settings['ddfs']
    partition = settings.pop('partition', 0)
    max_cores = settings.pop('max_cores', 0)
    profile = settings.pop('profile', False)
    purge = settings.pop('purge', True)
    if partition < 0:
        partition = 0
    if tag:
        t = Table.from_tag(tag)
        if t is not None:
            # The parentheses keep both literals in a single print and let
            # them be concatenated before '%' is applied.  Without them the
            # first half was printed unformatted and the second half raised
            # TypeError when formatted on its own.
            print("The tag name %s is already existed. Try another tag name"
                  " or drop the old one" % tag)
            return

    try:
        # if join is a string, extract the actual join columns.
        # do it here to make the query checker happy.
        join = _resolve_join(wheres, join)
        check_query(project, join, order_by, limit, wheres)
    except ValueError as e:
        print("  Invalid query:\n    %s" % e)
        return None

    # Job name: the table names joined by '-', capped at 64 characters.
    name = '-'.join([where._name for where in wheres])[:64]
    job_blobs = set()
    for where in wheres:
        job_blobs.update(tuple(sorted(w)) for w in _get_blobs(where, ddfs))

    job = SelectPipe(settings['server'],
                     wheres=wheres,
                     project=project,
                     order_by=order_by,
                     join=join,
                     full_join=full_join,
                     distinct=distinct,
                     desc=desc,
                     limit=limit,
                     partition=partition,
                     wide=wide,
                     nest=nest,
                     tag=tag,
                     pre_order_stage=pre_order_stage,
                     max_cores=max_cores,
                     profile=profile)

    job.run(name='select_from_%s' % name, input=job_blobs, **settings)
    if block:
        blobs = job.wait()
        if nest:
            # Nested query: hand back a Table backed by the result blobs.
            rtab = job.get_result_schema(project)
            rtab._blobs = blobs
            return rtab
        elif autodump:
            # the result will be just dumped to stdout
            cols = [c.name for c in project]
            _print_separator(80)
            _print_line(cols,
                        width=80,
                        cols=len(cols),
                        alignments=[
                            _ALG_RIGHT if c.is_numeric else _ALG_LEFT
                            for c in project
                        ])
            _print_separator(80)
            cat(_query_iterator(blobs), 80)
            if purge and not profile:
                settings['server'].purge(_safe_str(job.name))
            return
        return QueryResult(job.name, blobs, settings['server'])
    else:
        return Future(job.name, job, settings['server'], nest, *project)
示例#14
0
def select(*project, **kwargs):
    """
    Perform a relational query, by selecting rows and columns from one or more tables.

    The return value is one of:

    * a :class:`QueryResult` over the resulting records (blocking call, :code:`nest==False`, :code:`dump==False`)
    * a :class:`Table <hustle.Table>` instance when :code:`nest==True`
    * None in the case of :code:`nest==False and dump==True`; the records are printed to stdout instead
      (this is the default CLI interaction)
    * a :class:`Future <hustle.Future>` object when :code:`block==False`
    * None when the query is invalid or the requested *tag* already exists (a message is printed)

    For all of the examples below, *imps* and *pix* are instances of :class:`Table <hustle.Table>`.

    :type project: list of :class:`Column <hustle.core.marble.Column>` | :class:`Aggregation <hustle.core.marble.Aggregation>`
    :param project: a positional argument list of columns and aggregate expressions to return in the result

        A simple projection::

            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps)

        Selects three columns from the *imps* table.

        Hustle also allows for *aggregation functions* such as :func:`h_sum() <hustle.h_sum>`,
        :func:`h_count <hustle.h_count>`, :func:`h_min() <hustle.h_min>`, :func:`h_max() <hustle.h_max>`,
        :func:`h_avg <hustle.h_avg>` as in this example which sums the :code:`imps.cpm_millis`
        column::

            select(imps.ad_id, h_sum(imps.cpm_millis), h_count(), where=imps.date == '2014-01-27')

        Note that Hustle doesn't have a *group by* clause.  In this query, the output will be *grouped* by the
        :code:`imps.ad_id` column implicitly.  Note that in Hustle, if there is an aggregation function present in the
        :code:`project` param, the query results will be *grouped* by all non-aggregation columns present.

    :type where: (optional) sequence of :class:`Table <hustle.Table>` | :class:`Expr <hustle.core.marble.Expr>`
    :param where: the Tables to fetch data from, as well as the conditions in the *where clause*

        This serves two purposes: to specify the tables that are to be queried and to allow for the
        selection of data under specific criteria with our Python DSL selection syntax, much like SQL's *where
        clause*::

            # simple projection with restriction
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.date == '2014-01-27')

        Note the :code:`==` operation between the :code:`imps.date` column and the date string.
        The :class:`Column <hustle.core.marble.Column>`
        class overrides all of Python's comparison operators, which, along with the *&*, *|* and *~* logical
        operators allows you to build arbitrarily complex column selection expressions like this::

            select(imps.ad_id, imps.date, imps.cpm_millis,
                    where=((imps.date >= '2014-01-21') & (imps.date <= '2014-01-23')) |
                          ~(imps.site_id == 'google.com'))

        Note that for these expressions, the column must come first.  This means that the following expression is
        **illegal**::

            select(imps.ad_id, imps.date, imps.cpm_millis, where='2014-01-27' == imps.date)

        Where clause also supports *in* and *not in* statements by using special operators "<<" and ">>" respectively::

            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.ad_id << [1000, 1005])
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.ad_id >> [1000, 1005])

        Note that the right operand of "<<" and ">>" can be any type of iterable whose elements are each
        a valid single right value.

        In addition, multiple tables can be specified in the where clause like this::

            select(imps.ad_id, pix.amount, where=(imps.date < '2014-01-13', pix))

        which specifies an expression, :code:`imps.date < '2014-01-13'` and a :class:`Table <hustle.Table>` tuple.
        This query will simply return all of the *ad_id* values in *imps* for dates less than January 13th followed
        by all of the *amount* values in the *pix* table.

        Using multiple tables is typically reserved for when you use a *join clause*

    :type join: string | sequence of exactly length 2 of :class:`Column <hustle.core.marble.Column>`
    :param join: specifies the columns to perform a relational join operation on for the query

        The join columns can be specified either as a list of 2 columns, or a list of 2 strings. In particular, if
        two columns have the same names, a single string is valid as well.

        Here's an example of a Hustle join::

            select(imps.ad_id, imps.site_id, h_sum(pix.amount), h_count(),
                   where=(imps.date < '2014-01-13', pix.date < '2014-01-13'),
                   join=(imps.site_id, pix.site_id))

        or equivalently::

            select(imps.ad_id, imps.site_id, h_sum(pix.amount), h_count(),
                   where=(imps.date < '2014-01-13', pix.date < '2014-01-13'),
                   join='site_id')

        which joins the *imps* and *pix* tables on their common *site_id* column, then returns the sum of the
        *pix.amount* columns and a count, grouped by the *ad_id* and the *site_id*.  The equivalent query in SQL
        is::

            select i.ad_id, i.site_id, sum(p.amount), count(*)
            from imps i
            join pix p on i.site_id = p.site_id
            where i.date < '2014-01-13' and p.date < '2014-01-13'
            group by i.ad_id, i.site_id

    :type full_join: bool
    :param full_join:

        if True, specifies that a full join between the specified tables in the *where clause* should
        be joined in a full cross-product.  Note that if both *full_join* and *join* are specified, *join* will
        be ignored.

    :type order_by: string | :class:`Column <hustle.core.marble.Column>` | int |
        (sequence of string | :class:`Column <hustle.core.marble.Column>` | int)
    :param order_by: the column(s) to sort the result by

        The sort columns can be specified either as a Column or a list of Columns.  Alternatively, you can specify
        a column by using a string with either the name of the column or the *table.column* string notation.
        Furthermore, you can also represent the column using a zero based index of the *projected* columns.  This
        last case would be used for *Aggregations*.  Here are a few examples::

            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by=imps.date)
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by=(imps.date, imps.ad_id))
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by='date')
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by='imps.date')
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by=('date', imps.ad_id))
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by=('date', 2))
            select(imps.ad_id, imps.date, h_sum(imps.cpm_millis), where=imps, order_by=2)

    :type desc: boolean
    :param desc: affects sort order of the *order_by clause* to descending (default ascending)

    :type distinct: boolean
    :param distinct: indicates whether to remove duplicates in results

    :type limit: int
    :param limit: limits the total number of records in the output

    :type block: boolean
    :param block: make select call either blocking (default) or non-blocking.  If False, causes select() to return
        a :class:`Future <hustle.Future>` object

    :type nest: boolean (default = False)
    :param nest: specify that the return value is a :class:`Table <hustle.Table>` to be used in another query

        This allows us to build nested queries.  You may want to do this to join more than two tables, or to reuse
        the results of a query in more than one subsequent query.  For example::

            active_pix = select(*star(pix), where=pix.isActive > 0, nest=True)
            select(h_sum(active_pix.amount), where=active_pix)

    :type tag: string (default = None)
    :param tag: specify the tag name for a nested query, note it must be used with option "nest". If this option
        is not specified, a random name will be given to the result of this nested query.  If a table with this
        tag already exists, a message is printed and None is returned.

    :type dump: boolean (default = False)
    :param dump: when True (and :code:`nest` is False) a blocking call prints the result to stdout and returns None

    :type max_cores: int (default = 0)
    :param max_cores: specify the max number of cores (disco workers) this query could utilize. 0 means no limit

    :type profile: boolean (default = False)
    :param profile: specify whether to generate the disco job's profile

    :type purge: boolean (default = True)
    :param purge: specify whether to purge the query related data. This only works when "dump = True" and "profile = False".

    :type kwargs: dict
    :param kwargs: custom settings for this query see :mod:`hustle.core.settings`.  The *wide* and
        *pre_order_stage* options are also accepted and are forwarded unchanged to the select pipeline.

    """

    from hustle.core.settings import Settings
    from hustle.core.pipeline import SelectPipe
    from hustle.core.util import ensure_list

    settings = Settings(**kwargs)
    # Query options are popped so that only generic settings are left to be
    # forwarded to job.run() below.
    wheres = ensure_list(settings.pop('where', ()))
    order_by = ensure_list(settings.pop('order_by', ()))
    join = settings.pop('join', ())
    full_join = settings.pop('full_join', False)
    distinct = settings.pop('distinct', False)
    desc = settings.pop('desc', False)
    limit = settings.pop('limit', None)
    wide = settings.pop('wide', False)
    nest = settings.pop('nest', False)
    tag = settings.pop('tag', None)
    block = settings.pop('block', True)
    autodump = settings.pop('dump', False)
    pre_order_stage = settings.pop('pre_order_stage', ())
    ddfs = settings['ddfs']
    partition = settings.pop('partition', 0)
    max_cores = settings.pop('max_cores', 0)
    profile = settings.pop('profile', False)
    purge = settings.pop('purge', True)
    if partition < 0:
        partition = 0
    if tag:
        t = Table.from_tag(tag)
        if t is not None:
            # The parentheses keep both literals in a single print and let
            # them be concatenated before '%' is applied.  Without them the
            # first half was printed unformatted and the second half raised
            # TypeError when formatted on its own.
            print("The tag name %s is already existed. Try another tag name"
                  " or drop the old one" % tag)
            return

    try:
        # if join is a string, extract the actual join columns.
        # do it here to make the query checker happy.
        join = _resolve_join(wheres, join)
        check_query(project, join, order_by, limit, wheres)
    except ValueError as e:
        print("  Invalid query:\n    %s" % e)
        return None

    # Job name: the table names joined by '-', capped at 64 characters.
    name = '-'.join([where._name for where in wheres])[:64]
    job_blobs = set()
    for where in wheres:
        job_blobs.update(tuple(sorted(w)) for w in _get_blobs(where, ddfs))

    job = SelectPipe(settings['server'],
                     wheres=wheres,
                     project=project,
                     order_by=order_by,
                     join=join,
                     full_join=full_join,
                     distinct=distinct,
                     desc=desc,
                     limit=limit,
                     partition=partition,
                     wide=wide,
                     nest=nest,
                     tag=tag,
                     pre_order_stage=pre_order_stage,
                     max_cores=max_cores,
                     profile=profile)

    job.run(name='select_from_%s' % name, input=job_blobs, **settings)
    if block:
        blobs = job.wait()
        if nest:
            # Nested query: hand back a Table backed by the result blobs.
            rtab = job.get_result_schema(project)
            rtab._blobs = blobs
            return rtab
        elif autodump:
            # the result will be just dumped to stdout
            cols = [c.name for c in project]
            _print_separator(80)
            _print_line(cols, width=80, cols=len(cols),
                        alignments=[_ALG_RIGHT if c.is_numeric else _ALG_LEFT
                                    for c in project])
            _print_separator(80)
            cat(_query_iterator(blobs), 80)
            if purge and not profile:
                settings['server'].purge(_safe_str(job.name))
            return
        return QueryResult(job.name, blobs, settings['server'])
    else:
        return Future(job.name, job, settings['server'], nest, *project)
示例#15
0
def select(*project, **kwargs):
    """
    Run a relational query that selects rows and columns from one or more tables.

    The return value is one of:

    * a list of urls containing the result records.  This is the same as normal results from Disco
    * a :class:`Table <hustle.Table>` instance when :code:`nest==True`
    * :code:`None` when the query fails validation (the error is printed), or when the :code:`dump` setting
      is enabled (the result is printed to stdout instead of being returned)

    For all of the examples below, *imps* and *pix* are instances of :class:`Table <hustle.Table>`.

    :type project: list of :class:`Column <hustle.core.marble.Column>` | :class:`Aggregation <hustle.core.marble.Aggregation>`
    :param project: a positional argument list of columns and aggregate expressions to return in the result

        A simple projection::

            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps)

        Selects three columns from the *imps* table.

        Hustle also allows for *aggregation functions* such as :func:`h_sum() <hustle.h_sum>`,
        :func:`h_count <hustle.h_count>`, :func:`h_min() <hustle.h_min>`, :func:`h_max() <hustle.h_max>`,
        :func:`h_avg <hustle.h_avg>` as in this example which sums the :code:`imps.cpm_millis`
        column::

            select(imps.ad_id, h_sum(imps.cpm_millis), h_count(), where=imps.date == '2014-01-27')

        Note that Hustle doesn't have a *group by* clause.  In this query, the output will be *grouped* by the
        :code:`imps.ad_id` column implicitly.  Note that in Hustle, if there is an aggregation function present in
        the :code:`project` param, the query results will be *grouped* by all non-aggregation columns present.

    :type where: (optional) sequence of :class:`Table <hustle.Table>` | :class:`Expr <hustle.core.marble.Expr>`
    :param where: the Tables to fetch data from, as well as the conditions in the *where clause*

        This serves two purposes: to specify the tables that are to be queried and to allow for the
        selection of data under specific criteria with our Python DSL selection syntax, much like SQL's *where
        clause*::

            # simple projection with restriction
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.date == '2014-01-27')

        Note the :code:`==` operation between the :code:`imps.date` column and the date string.
        The :class:`Column <hustle.core.marble.Column>`
        class overrides all of Python's comparison operators, which, along with the *&*, *|* and *~* logical
        operators allows you to build arbitrarily complex column selection expressions like this::

            select(imps.ad_id, imps.date, imps.cpm_millis,
                    where=((imps.date >= '2014-01-21') & (imps.date <= '2014-01-23')) |
                          ~(imps.site_id == 'google.com'))

        Note that for these expressions, the column must come first.  This means that the following expression is
        **illegal**::

            select(imps.ad_id, imps.date, imps.cpm_millis, where='2014-01-27' == imps.date)

        In addition, multiple tables can be specified in the where clause like this::

            select(imps.ad_id, pix.amount, where=(imps.date < '2014-01-13', pix))

        which specifies a tuple made of an expression, :code:`imps.date < '2014-01-13'`, and a
        :class:`Table <hustle.Table>`.
        This query will simply return all of the *ad_id* values in *imps* for dates less than January 13th followed
        by all of the *amount* values in the *pix* table.

        Using multiple tables is typically reserved for when you use a *join clause*

    :type join: sequence of exactly length 2 of :class:`Column <hustle.core.marble.Column>`
    :param join: specifies the columns to perform a relational join operation on for the query

        Here's an example of a Hustle join::

            select(imps.ad_id, imps.site_id, h_sum(pix.amount), h_count(),
                   where=(imps.date < '2014-01-13', pix.date < '2014-01-13'),
                   join=(imps.site_id, pix.site_id))

        which joins the *imps* and *pix* tables on their common *site_id* column, then returns the sum of the
        *pix.amount* columns and a count, grouped by the *ad_id* and the *site_id*.  The equivalent query in SQL
        is::

            select i.ad_id, i.site_id, sum(p.amount), count(*)
            from imps i
            join pix p on i.site_id = p.site_id
            where i.date < '2014-01-13' and p.date < '2014-01-13'
            group by i.ad_id, i.site_id

    :type order_by: string | :class:`Column <hustle.core.marble.Column>` | int |
        (sequence of string | :class:`Column <hustle.core.marble.Column>` | int)
    :param order_by: the column(s) to sort the result by

        The sort columns can be specified either as a Column or a list of Columns.  Alternatively, you can specify
        a column by using a string with either the name of the column or the *table.column* string notation.
        Furthermore, you can also represent the column using a zero based index of the *projected* columns.  This
        last case would be used for *Aggregations*.  Here are a few examples::

            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by=imps.date)
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by=(imps.date, imps.ad_id))
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by='date')
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by='imps.date')
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by=('date', imps.ad_id))
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by=('date', 2))
            select(imps.ad_id, imps.date, h_sum(imps.cpm_millis), where=imps, order_by=2)

    :type desc: boolean
    :param desc: affects sort order of the *order_by clause* to descending (default ascending)

    :type distinct: boolean
    :param distinct: indicates whether to remove duplicates in results

    :type limit: int
    :param limit: limits the total number of records in the output

    :type nest: boolean (default = False)
    :param nest: specify that the return value is a :class:`Table <hustle.Table>` to be used in another query

        This allows us to build nested queries.  You may want to do this to join more than two tables, or to reuse
        the results of a query in more than one subsequent query.  For example::

            active_pix = select(*star(pix), where=pix.isActive > 0, nest=True)
            select(h_sum(active_pix.amount), where=active_pix)

    :type kwargs: dict
    :param kwargs: custom settings for this query see :mod:`hustle.core.settings`

    """

    from hustle import _get_blobs
    from hustle.core.settings import Settings
    from hustle.core.pipeline import SelectPipe
    from hustle.core.util import ensure_list

    config = Settings(**kwargs)

    # The query clauses are popped out of the settings; whatever remains in
    # ``config`` is handed to ``job.run()`` further down.
    wheres = ensure_list(config.pop('where', ()))
    order_by = ensure_list(config.pop('order_by', ()))
    join = config.pop('join', ())
    distinct = config.pop('distinct', False)
    desc = config.pop('desc', False)
    limit = config.pop('limit', None)

    # These values are only read, so they stay in ``config``.
    ddfs = config['ddfs']
    dump_to_stdout = config['dump']
    partition = config.get('partition', 0)
    partition = 0 if partition < 0 else partition  # negative values fall back to 0
    nest = config.get('nest', False)

    # Validate the query up front; an invalid query is reported and yields None.
    try:
        check_query(project, join, order_by, limit, wheres)
    except ValueError as err:
        print("  Invalid query:\n    %s" % err)
        return None

    # The job name is built from the names of the where clauses, capped at 64 characters.
    job_suffix = '-'.join(w._name for w in wheres)[:64]

    # Gather the input blobs of every where clause, de-duplicated through a set.
    inputs = set(tuple(sorted(blob))
                 for where in wheres
                 for blob in _get_blobs(where, ddfs))

    pipe_options = {
        'wheres': wheres,
        'project': project,
        'order_by': order_by,
        'join': join,
        'distinct': distinct,
        'desc': desc,
        'limit': limit,
        'partition': partition,
        'nest': nest,
    }
    job = SelectPipe(config['server'], **pipe_options)

    job.run(name='select_from_%s' % job_suffix, input=inputs, **config)
    blobs = job.wait()

    if nest:
        # Nested query: attach the result blobs to the result schema table and return it.
        result_table = job.get_result_schema(project)
        result_table._blobs = blobs
        return result_table

    if not dump_to_stdout:
        return blobs

    # Dump mode: print a header row followed by the records; nothing is returned.
    headers = [c.name for c in project]
    _print_separator(80)
    _print_line(headers, width=80, cols=len(headers),
                alignments=[_ALG_RIGHT if c.is_numeric else _ALG_LEFT for c in project])
    _print_separator(80)
    dump(blobs, 80)
    return None
示例#16
0
 def test_full_query(self):
     """check_query returns a truthy value for a query with select, join, order-by, limit and where arguments."""
     outcome = check_query(self.cross_select,
                           self.join,
                           self.single_select,
                           self.limit_single,
                           self.cross_wheres)
     self.assertTrue(outcome)