def test_limit(self):
    """Check that the query checker validates the ``limit`` argument."""
    columns = self.single_select
    wheres = self.single_where

    # An invalid limit is expected to be rejected with ValueError.
    self.assertRaises(ValueError, check_query,
                      columns, [], [], self.limit_single_invalid, wheres)

    # The same query with a valid limit must be accepted.
    accepted = check_query(columns, [], [], self.limit_single, wheres)
    self.assertTrue(accepted)
def test_order_by(self):
    """Check how the query checker validates the ``order_by`` columns."""
    columns = self.single_select
    wheres = self.single_where

    # should raise if select columns don't contain the order column
    self.assertRaises(ValueError, check_query,
                      columns, [], self.order_by, None, wheres)

    # Ordering by ``albums.name`` is expected to be accepted for this select.
    accepted = check_query(columns, [], [self.albums.name], None, wheres)
    self.assertTrue(accepted)
def test_where_clauses(self):
    """Check that the where clause and the select list must agree on tables."""
    # Should raise if where and select are from different tables.
    # ``order_by`` is empty and ``limit`` is None (argument order is
    # select, join, order_by, limit, wheres) so that the table mismatch
    # is the only thing that can make the checker raise.
    with self.assertRaises(ValueError):
        check_query(self.single_select, [], [], None,
                    [self.transaction.id == 1000])

    # NOTE(review): the "single table shows up in multi-wheres" case is
    # not exercised by this test -- consider adding an assertion for it.

    # A where clause on the selected table must be accepted.
    self.assertTrue(
        check_query(self.single_select, [], [], None, self.single_where))
def test_order_by(self):
    """Check how the query checker validates the ``order_by`` columns."""
    columns = self.single_select
    wheres = self.single_where

    # should raise if select columns don't contain the order column
    self.assertRaises(ValueError, check_query,
                      columns, [], self.order_by, None, wheres)

    # Ordering by ``albums.name`` is expected to be accepted for this select.
    accepted = check_query(columns, [], [self.albums.name], None, wheres)
    self.assertTrue(accepted)
def test_select_clauses(self):
    """Check that empty and duplicated select lists are rejected."""
    wheres = self.single_where

    # test empty select
    with self.assertRaises(ValueError):
        check_query([], [], self.order_by, None, wheres)

    # test duplicate select
    # NOTE(review): ``self.order_by`` is also passed here, and that
    # order_by is expected to raise on its own with ``single_select``;
    # this assertion may therefore pass without exercising the duplicate
    # check -- confirm against check_query.
    with self.assertRaises(ValueError):
        doubled = self.single_select + self.single_select
        check_query(doubled, [], self.order_by, None, wheres)

    # A plain, non-empty select without ordering must be accepted.
    accepted = check_query(self.single_select, [], [], None, wheres)
    self.assertTrue(accepted)
def test_join(self):
    """Check that the query checker validates the ``join`` argument."""
    columns = self.single_select

    # test join with single table
    with self.assertRaises(ValueError):
        check_query(columns, self.join, [], None, self.single_where)

    # test invalid joins: each of these must be rejected
    invalid_joins = (self.join_invalid,
                     self.join_invalid_1,
                     self.join_invalid_2)
    for bad_join in invalid_joins:
        with self.assertRaises(ValueError):
            check_query(columns, bad_join, [], None, self.cross_wheres)

    # The valid join across the cross wheres must be accepted.
    accepted = check_query(columns, self.join, [], None, self.cross_wheres)
    self.assertTrue(accepted)
def test_where_clauses(self):
    """Check that the where clause and the select list must agree on tables."""
    # Should raise if where and select are from different tables.
    # ``order_by`` is empty and ``limit`` is None (argument order is
    # select, join, order_by, limit, wheres) so that the table mismatch
    # is the only thing that can make the checker raise.
    with self.assertRaises(ValueError):
        check_query(self.single_select, [], [], None,
                    [self.transaction.id == 1000])

    # NOTE(review): the "single table shows up in multi-wheres" case is
    # not exercised by this test -- consider adding an assertion for it.

    # A where clause on the selected table must be accepted.
    self.assertTrue(
        check_query(self.single_select, [], [], None, self.single_where))
def test_select_clauses(self):
    """Check that empty and duplicated select lists are rejected."""
    wheres = self.single_where

    # test empty select
    with self.assertRaises(ValueError):
        check_query([], [], self.order_by, None, wheres)

    # test duplicate select
    # NOTE(review): ``self.order_by`` is also passed here, and that
    # order_by is expected to raise on its own with ``single_select``;
    # this assertion may therefore pass without exercising the duplicate
    # check -- confirm against check_query.
    with self.assertRaises(ValueError):
        doubled = self.single_select + self.single_select
        check_query(doubled, [], self.order_by, None, wheres)

    # A plain, non-empty select without ordering must be accepted.
    accepted = check_query(self.single_select, [], [], None, wheres)
    self.assertTrue(accepted)
def test_full_query(self):
    """A query that supplies every clause at once must pass the checker."""
    verdict = check_query(
        self.cross_select,    # select
        self.join,            # join
        self.single_select,   # order_by
        self.limit_single,    # limit
        self.cross_wheres)    # wheres
    self.assertTrue(verdict)
def _create_job(*project, **kwargs):
    """
    Validate a query and build the ``SelectPipe`` job for it without running it.

    :param project: positional list of the columns / aggregations to select
    :param kwargs: query options (``where``, ``order_by``, ``join``,
        ``distinct``, ``desc``, ``limit``) plus custom settings handed to
        :class:`Settings`

    :return: a ``(job, job_blobs, name)`` tuple, or None (after printing the
        reason) when the query does not pass ``check_query``
    """
    from hustle import _get_blobs
    from hustle.core.settings import Settings
    from hustle.core.pipeline import SelectPipe
    from hustle.core.util import ensure_list

    settings = Settings(**kwargs)

    # Query options are popped off the settings; the rest is only read.
    wheres = ensure_list(settings.pop('where', ()))
    order_by = ensure_list(settings.pop('order_by', ()))
    join = settings.pop('join', ())
    distinct = settings.pop('distinct', False)
    desc = settings.pop('desc', False)
    limit = settings.pop('limit', None)
    ddfs = settings['ddfs']

    # Negative partition counts are clamped to 0.
    partition = settings.get('partition', 0)
    if partition < 0:
        partition = 0
    nest = settings.get('nest', False)

    try:
        # if join is a string, extract the actual join columns.
        # do it here to make the query checker happy.
        join = _resolve_join(wheres, join)
        check_query(project, join, order_by, limit, wheres)
    except ValueError as e:
        print(" Invalid query:\n %s" % e)
        return None

    # Job name: the where-clause names joined by '-', capped at 64 chars.
    name = '-'.join(clause._name for clause in wheres)[:64]

    # Collect the de-duplicated input blobs for every where clause.
    job_blobs = set()
    for clause in wheres:
        for blob in _get_blobs(clause, ddfs):
            job_blobs.add(tuple(sorted(blob)))

    job = SelectPipe(settings['server'],
                     wheres=wheres,
                     project=project,
                     order_by=order_by,
                     join=join,
                     distinct=distinct,
                     desc=desc,
                     limit=limit,
                     partition=partition,
                     nest=nest)
    return job, job_blobs, name
def test_join(self):
    """Check that the query checker validates the ``join`` argument."""
    columns = self.single_select

    # test join with single table
    with self.assertRaises(ValueError):
        check_query(columns, self.join, [], None, self.single_where)

    # test invalid joins: each of these must be rejected
    invalid_joins = (self.join_invalid,
                     self.join_invalid_1,
                     self.join_invalid_2)
    for bad_join in invalid_joins:
        with self.assertRaises(ValueError):
            check_query(columns, bad_join, [], None, self.cross_wheres)

    # The valid join across the cross wheres must be accepted.
    accepted = check_query(columns, self.join, [], None, self.cross_wheres)
    self.assertTrue(accepted)
def select(*project, **kwargs):
    """
    Perform a relational query, by selecting rows and columns from one or more tables.

    The return value is one of:

    * an iterator over the resulting tuples (a ``QueryResult``) when :code:`nest==False`
    * a :class:`Table <hustle.Table>` instance when :code:`nest==True`
    * None in the case of :code:`nest==False and dump==True` (this is the default CLI interaction)
    * a :class:`Future <hustle.Future>` object when :code:`block==False`
    * None, after printing a message, when the query is invalid or the requested tag already exists

    For all of the examples below, *imps* and *pix* are instances of :class:`Table <hustle.Table>`.

    :type project: list of :class:`Column <hustle.core.marble.Column>` | :class:`Aggregation <hustle.core.marble.Aggregation>`
    :param project: a positional argument list of columns and aggregate expressions to return in the result

        A simple projection::

            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps)

        Selects three columns from the *imps* table.

        Hustle also allows for *aggregation functions* such as :func:`h_sum() <hustle.h_sum>`,
        :func:`h_count <hustle.h_count>`, :func:`h_min() <hustle.h_min>`, :func:`h_max() <hustle.h_max>`,
        :func:`h_avg <hustle.h_avg>` as in this example which sums the :code:`imps.cpm_millis` column::

            select(imps.ad_id, h_sum(imps.cpm_millis), h_count(), where=imps.date == '2014-01-27')

        Note that Hustle doesn't have a *group by* clause. In this query, the output will be *grouped* by the
        :code:`imps.ad_id` column implicitly. Note that in Hustle, if there is an aggregation function present
        in the :code:`project` param, the query results will be *grouped* by all non-aggregation columns present.

    :type where: (optional) sequence of :class:`Table <hustle.Table>` | :class:`Expr <hustle.core.marble.Expr>`
    :param where: the Tables to fetch data from, as well as the conditions in the *where clause*

        This serves two purposes: to specify the tables that are to be queried and to allow for the selection
        of data under specific criteria with our Python DSL selection syntax, much like SQL's *where clause*::

            # simple projection with restriction
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.date == '2014-01-27')

        Note the :code:`==` operation between the :code:`imps.date` column and the date string.
        The :class:`Column <hustle.core.marble.Column>` class overrides all of Python's comparison operators,
        which, along with the *&*, *|* and *~* logical operators allows you to build arbitrarily complex
        column selection expressions like this::

            select(imps.ad_id, imps.date, imps.cpm_millis,
                   where=((imps.date >= '2014-01-21') & (imps.date <= '2014-01-23')) |
                         ~(imps.site_id == 'google.com'))

        Note that for these expressions, the column must come first. This means that the following
        expression is **illegal**::

            select(imps.ad_id, imps.date, imps.cpm_millis, where='2014-01-27' == imps.date)

        The where clause also supports *in* and *not in* statements by using the special operators "<<"
        and ">>" respectively::

            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.ad_id << [1000, 1005])
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.ad_id >> [1000, 1005])

        Note that the right operand of "<<" and ">>" can be any type of iterable, each element of which
        must be a valid single right value.

        In addition, multiple tables can be specified in the where clause like this::

            select(imps.ad_id, pix.amount, where=(imps.date < '2014-01-13', pix))

        which specifies an expression, :code:`imps.date < '2014-01-13'` and a :class:`Table <hustle.Table>`
        tuple. This query will simply return all of the *ad_id* values in *imps* for dates less than
        January 13th followed by all of the *amount* values in the *pix* table.

        Using multiple tables is typically reserved for when you use a *join clause*

    :type join: string | sequence of exactly length 2 of :class:`Column <hustle.core.marble.Column>`
    :param join: specifies the columns to perform a relational join operation on for the query

        The join columns can be specified either as a list of 2 columns, or a list of 2 strings. In
        particular, if two columns have the same names, a single string is valid as well.

        Here's an example of a Hustle join::

            select(imps.ad_id, imps.site_id, h_sum(pix.amount), h_count(),
                   where=(imps.date < '2014-01-13', pix.date < '2014-01-13'),
                   join=(imps.site_id, pix.site_id))

        or equivalently::

            select(imps.ad_id, imps.site_id, h_sum(pix.amount), h_count(),
                   where=(imps.date < '2014-01-13', pix.date < '2014-01-13'),
                   join='site_id')

        which joins the *imps* and *pix* tables on their common *site_id* column, then returns the sum of
        the *pix.amount* columns and a count, grouped by the *ad_id* and the *site_id*. The equivalent
        query in SQL is::

            select i.ad_id, i.site_id, sum(p.amount), count(*)
            from imps i
            join pix p on i.site_id = p.site_id
            where i.date < '2014-01-13' and p.date < '2014-01-13'
            group by i.ad_id, i.site_id

    :type full_join: bool
    :param full_join: if True, specifies that a full join between the specified tables in the
        *where clause* should be joined in a full cross-product. Note that if both *full_join* and
        *join* are specified, *join* will be ignored.

    :type order_by: string | :class:`Column <hustle.core.marble.Column>` | int |
        (sequence of string | :class:`Column <hustle.core.marble.Column>` | int)
    :param order_by: the column(s) to sort the result by

        The sort columns can be specified either as a Column or a list of Columns. Alternatively, you can
        specify a column by using a string with either the name of the column or the *table.column*
        string notation. Furthermore, you can also represent the column using a zero based index of the
        *projected* columns. This last case would be used for *Aggregations*. Here are a few examples::

            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by=imps.date)
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by=(imps.date, imps.ad_id))
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by='date')
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by='imps.date')
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by=('date', imps.ad_id))
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by=('date', 2))
            select(imps.ad_id, imps.date, h_sum(imps.cpm_millis), where=imps, order_by=2)

    :type desc: boolean
    :param desc: affects sort order of the *order_by clause* to descending (default ascending)

    :type distinct: boolean
    :param distinct: indicates whether to remove duplicates in results

    :type limit: int
    :param limit: limits the total number of records in the output

    :type block: boolean
    :param block: make select call either blocking (default) or non-blocking. If False, causes select()
        to return a :class:`Future <hustle.Future>` object

    :type nest: boolean (default = False)
    :param nest: specify that the return value is a :class:`Table <hustle.Table>` to be used in another query

        This allows us to build nested queries. You may want to do this to join more than two tables, or
        to reuse the results of a query in more than one subsequent query. For example::

            active_pix = select(*star(pix), where=pix.isActive > 0, nest=True)
            select(h_sum(active_pix.amount), where=active_pix)

    :type tag: string (default = None)
    :param tag: specify the tag name for a nested query, note it must be used with option "nest".
        If this option is not specified, a random name will be given to the result of this nested query.

    :type max_cores: int (default = 0)
    :param max_cores: specify the max number of cores (disco workers) this query could utilize.
        0 means no limit

    :type profile: boolean (default = False)
    :param profile: specify whether to generate the disco job's profile

    :type purge: boolean (default = True)
    :param purge: specify whether to purge the query related data. This only works when "dump = True"
        and "profile = False".

    :type kwargs: dict
    :param kwargs: custom settings for this query see :mod:`hustle.core.settings`
    """
    from hustle.core.settings import Settings
    from hustle.core.pipeline import SelectPipe
    from hustle.core.util import ensure_list

    settings = Settings(**kwargs)

    # Query options are popped off the settings; what is left is forwarded
    # to ``job.run`` further down.
    wheres = ensure_list(settings.pop('where', ()))
    order_by = ensure_list(settings.pop('order_by', ()))
    join = settings.pop('join', ())
    full_join = settings.pop('full_join', False)
    distinct = settings.pop('distinct', False)
    desc = settings.pop('desc', False)
    limit = settings.pop('limit', None)
    wide = settings.pop('wide', False)
    nest = settings.pop('nest', False)
    tag = settings.pop('tag', None)
    block = settings.pop('block', True)
    autodump = settings.pop('dump', False)
    pre_order_stage = settings.pop('pre_order_stage', ())
    ddfs = settings['ddfs']
    partition = settings.pop('partition', 0)
    max_cores = settings.pop('max_cores', 0)
    profile = settings.pop('profile', False)
    purge = settings.pop('purge', True)

    # Negative partition counts are clamped to 0.
    if partition < 0:
        partition = 0

    # Refuse to reuse a tag that already resolves to a table.
    if tag:
        t = Table.from_tag(tag)
        if t is not None:
            print("The tag name %s is already existed. Try another tag name"
                  " or drop the old one" % tag)
            return

    try:
        # if join is a string, extract the actual join columns.
        # do it here to make the query checker happy.
        join = _resolve_join(wheres, join)
        check_query(project, join, order_by, limit, wheres)
    except ValueError as e:
        print(" Invalid query:\n %s" % e)
        return None

    # Job name: the where-clause names joined by '-', capped at 64 chars.
    name = '-'.join([where._name for where in wheres])[:64]

    # Collect the de-duplicated input blobs for every where clause.
    job_blobs = set()
    for where in wheres:
        job_blobs.update(tuple(sorted(w)) for w in _get_blobs(where, ddfs))

    job = SelectPipe(settings['server'],
                     wheres=wheres,
                     project=project,
                     order_by=order_by,
                     join=join,
                     full_join=full_join,
                     distinct=distinct,
                     desc=desc,
                     limit=limit,
                     partition=partition,
                     wide=wide,
                     nest=nest,
                     tag=tag,
                     pre_order_stage=pre_order_stage,
                     max_cores=max_cores,
                     profile=profile)
    job.run(name='select_from_%s' % name, input=job_blobs, **settings)

    # Non-blocking call: hand back a Future instead of waiting for the job.
    if not block:
        return Future(job.name, job, settings['server'], nest, *project)

    blobs = job.wait()
    if nest:
        # Nested query: return a table backed by the result blobs.
        rtab = job.get_result_schema(project)
        rtab._blobs = blobs
        return rtab
    elif autodump:
        # the result will be just dumped to stdout
        cols = [c.name for c in project]
        _print_separator(80)
        _print_line(cols, width=80, cols=len(cols),
                    alignments=[_ALG_RIGHT if c.is_numeric else _ALG_LEFT
                                for c in project])
        _print_separator(80)
        cat(_query_iterator(blobs), 80)
        # The job data is only purged after a dump, and never when profiling.
        if purge and not profile:
            settings['server'].purge(_safe_str(job.name))
        return
    return QueryResult(job.name, blobs, settings['server'])
def select(*project, **kwargs):
    """
    Perform a relational query, by selecting rows and columns from one or more tables.

    The return value is one of:

    * an iterator over the resulting tuples (a ``QueryResult``) when :code:`nest==False`
    * a :class:`Table <hustle.Table>` instance when :code:`nest==True`
    * None in the case of :code:`nest==False and dump==True` (this is the default CLI interaction)
    * a :class:`Future <hustle.Future>` object when :code:`block==False`
    * None, after printing a message, when the query is invalid or the requested tag already exists

    For all of the examples below, *imps* and *pix* are instances of :class:`Table <hustle.Table>`.

    :type project: list of :class:`Column <hustle.core.marble.Column>` | :class:`Aggregation <hustle.core.marble.Aggregation>`
    :param project: a positional argument list of columns and aggregate expressions to return in the result

        A simple projection::

            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps)

        Selects three columns from the *imps* table.

        Hustle also allows for *aggregation functions* such as :func:`h_sum() <hustle.h_sum>`,
        :func:`h_count <hustle.h_count>`, :func:`h_min() <hustle.h_min>`, :func:`h_max() <hustle.h_max>`,
        :func:`h_avg <hustle.h_avg>` as in this example which sums the :code:`imps.cpm_millis` column::

            select(imps.ad_id, h_sum(imps.cpm_millis), h_count(), where=imps.date == '2014-01-27')

        Note that Hustle doesn't have a *group by* clause. In this query, the output will be *grouped* by the
        :code:`imps.ad_id` column implicitly. Note that in Hustle, if there is an aggregation function present
        in the :code:`project` param, the query results will be *grouped* by all non-aggregation columns present.

    :type where: (optional) sequence of :class:`Table <hustle.Table>` | :class:`Expr <hustle.core.marble.Expr>`
    :param where: the Tables to fetch data from, as well as the conditions in the *where clause*

        This serves two purposes: to specify the tables that are to be queried and to allow for the selection
        of data under specific criteria with our Python DSL selection syntax, much like SQL's *where clause*::

            # simple projection with restriction
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.date == '2014-01-27')

        Note the :code:`==` operation between the :code:`imps.date` column and the date string.
        The :class:`Column <hustle.core.marble.Column>` class overrides all of Python's comparison operators,
        which, along with the *&*, *|* and *~* logical operators allows you to build arbitrarily complex
        column selection expressions like this::

            select(imps.ad_id, imps.date, imps.cpm_millis,
                   where=((imps.date >= '2014-01-21') & (imps.date <= '2014-01-23')) |
                         ~(imps.site_id == 'google.com'))

        Note that for these expressions, the column must come first. This means that the following
        expression is **illegal**::

            select(imps.ad_id, imps.date, imps.cpm_millis, where='2014-01-27' == imps.date)

        The where clause also supports *in* and *not in* statements by using the special operators "<<"
        and ">>" respectively::

            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.ad_id << [1000, 1005])
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.ad_id >> [1000, 1005])

        Note that the right operand of "<<" and ">>" can be any type of iterable, each element of which
        must be a valid single right value.

        In addition, multiple tables can be specified in the where clause like this::

            select(imps.ad_id, pix.amount, where=(imps.date < '2014-01-13', pix))

        which specifies an expression, :code:`imps.date < '2014-01-13'` and a :class:`Table <hustle.Table>`
        tuple. This query will simply return all of the *ad_id* values in *imps* for dates less than
        January 13th followed by all of the *amount* values in the *pix* table.

        Using multiple tables is typically reserved for when you use a *join clause*

    :type join: string | sequence of exactly length 2 of :class:`Column <hustle.core.marble.Column>`
    :param join: specifies the columns to perform a relational join operation on for the query

        The join columns can be specified either as a list of 2 columns, or a list of 2 strings. In
        particular, if two columns have the same names, a single string is valid as well.

        Here's an example of a Hustle join::

            select(imps.ad_id, imps.site_id, h_sum(pix.amount), h_count(),
                   where=(imps.date < '2014-01-13', pix.date < '2014-01-13'),
                   join=(imps.site_id, pix.site_id))

        or equivalently::

            select(imps.ad_id, imps.site_id, h_sum(pix.amount), h_count(),
                   where=(imps.date < '2014-01-13', pix.date < '2014-01-13'),
                   join='site_id')

        which joins the *imps* and *pix* tables on their common *site_id* column, then returns the sum of
        the *pix.amount* columns and a count, grouped by the *ad_id* and the *site_id*. The equivalent
        query in SQL is::

            select i.ad_id, i.site_id, sum(p.amount), count(*)
            from imps i
            join pix p on i.site_id = p.site_id
            where i.date < '2014-01-13' and p.date < '2014-01-13'
            group by i.ad_id, i.site_id

    :type full_join: bool
    :param full_join: if True, specifies that a full join between the specified tables in the
        *where clause* should be joined in a full cross-product. Note that if both *full_join* and
        *join* are specified, *join* will be ignored.

    :type order_by: string | :class:`Column <hustle.core.marble.Column>` | int |
        (sequence of string | :class:`Column <hustle.core.marble.Column>` | int)
    :param order_by: the column(s) to sort the result by

        The sort columns can be specified either as a Column or a list of Columns. Alternatively, you can
        specify a column by using a string with either the name of the column or the *table.column*
        string notation. Furthermore, you can also represent the column using a zero based index of the
        *projected* columns. This last case would be used for *Aggregations*. Here are a few examples::

            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by=imps.date)
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by=(imps.date, imps.ad_id))
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by='date')
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by='imps.date')
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by=('date', imps.ad_id))
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by=('date', 2))
            select(imps.ad_id, imps.date, h_sum(imps.cpm_millis), where=imps, order_by=2)

    :type desc: boolean
    :param desc: affects sort order of the *order_by clause* to descending (default ascending)

    :type distinct: boolean
    :param distinct: indicates whether to remove duplicates in results

    :type limit: int
    :param limit: limits the total number of records in the output

    :type block: boolean
    :param block: make select call either blocking (default) or non-blocking. If False, causes select()
        to return a :class:`Future <hustle.Future>` object

    :type nest: boolean (default = False)
    :param nest: specify that the return value is a :class:`Table <hustle.Table>` to be used in another query

        This allows us to build nested queries. You may want to do this to join more than two tables, or
        to reuse the results of a query in more than one subsequent query. For example::

            active_pix = select(*star(pix), where=pix.isActive > 0, nest=True)
            select(h_sum(active_pix.amount), where=active_pix)

    :type tag: string (default = None)
    :param tag: specify the tag name for a nested query, note it must be used with option "nest".
        If this option is not specified, a random name will be given to the result of this nested query.

    :type max_cores: int (default = 0)
    :param max_cores: specify the max number of cores (disco workers) this query could utilize.
        0 means no limit

    :type profile: boolean (default = False)
    :param profile: specify whether to generate the disco job's profile

    :type purge: boolean (default = True)
    :param purge: specify whether to purge the query related data. This only works when "dump = True"
        and "profile = False".

    :type kwargs: dict
    :param kwargs: custom settings for this query see :mod:`hustle.core.settings`
    """
    from hustle.core.settings import Settings
    from hustle.core.pipeline import SelectPipe
    from hustle.core.util import ensure_list

    settings = Settings(**kwargs)

    # Query options are popped off the settings; what is left is forwarded
    # to ``job.run`` further down.
    wheres = ensure_list(settings.pop('where', ()))
    order_by = ensure_list(settings.pop('order_by', ()))
    join = settings.pop('join', ())
    full_join = settings.pop('full_join', False)
    distinct = settings.pop('distinct', False)
    desc = settings.pop('desc', False)
    limit = settings.pop('limit', None)
    wide = settings.pop('wide', False)
    nest = settings.pop('nest', False)
    tag = settings.pop('tag', None)
    block = settings.pop('block', True)
    autodump = settings.pop('dump', False)
    pre_order_stage = settings.pop('pre_order_stage', ())
    ddfs = settings['ddfs']
    partition = settings.pop('partition', 0)
    max_cores = settings.pop('max_cores', 0)
    profile = settings.pop('profile', False)
    purge = settings.pop('purge', True)

    # Negative partition counts are clamped to 0.
    if partition < 0:
        partition = 0

    # Refuse to reuse a tag that already resolves to a table.
    if tag:
        t = Table.from_tag(tag)
        if t is not None:
            print("The tag name %s is already existed. Try another tag name"
                  " or drop the old one" % tag)
            return

    try:
        # if join is a string, extract the actual join columns.
        # do it here to make the query checker happy.
        join = _resolve_join(wheres, join)
        check_query(project, join, order_by, limit, wheres)
    except ValueError as e:
        print(" Invalid query:\n %s" % e)
        return None

    # Job name: the where-clause names joined by '-', capped at 64 chars.
    name = '-'.join([where._name for where in wheres])[:64]

    # Collect the de-duplicated input blobs for every where clause.
    job_blobs = set()
    for where in wheres:
        job_blobs.update(tuple(sorted(w)) for w in _get_blobs(where, ddfs))

    job = SelectPipe(settings['server'],
                     wheres=wheres,
                     project=project,
                     order_by=order_by,
                     join=join,
                     full_join=full_join,
                     distinct=distinct,
                     desc=desc,
                     limit=limit,
                     partition=partition,
                     wide=wide,
                     nest=nest,
                     tag=tag,
                     pre_order_stage=pre_order_stage,
                     max_cores=max_cores,
                     profile=profile)
    job.run(name='select_from_%s' % name, input=job_blobs, **settings)

    # Non-blocking call: hand back a Future instead of waiting for the job.
    if not block:
        return Future(job.name, job, settings['server'], nest, *project)

    blobs = job.wait()
    if nest:
        # Nested query: return a table backed by the result blobs.
        rtab = job.get_result_schema(project)
        rtab._blobs = blobs
        return rtab
    elif autodump:
        # the result will be just dumped to stdout
        cols = [c.name for c in project]
        _print_separator(80)
        _print_line(cols, width=80, cols=len(cols),
                    alignments=[_ALG_RIGHT if c.is_numeric else _ALG_LEFT
                                for c in project])
        _print_separator(80)
        cat(_query_iterator(blobs), 80)
        # The job data is only purged after a dump, and never when profiling.
        if purge and not profile:
            settings['server'].purge(_safe_str(job.name))
        return
    return QueryResult(job.name, blobs, settings['server'])
def select(*project, **kwargs):
    """
    Perform a relational query, by selecting rows and columns from one or more tables.

    The return value is one of:

    * a list of urls containing the result records. This is the same as normal results from Disco
    * a :class:`Table <hustle.Table>` instance when :code:`nest==True`
    * None when the ``dump`` setting is on (the result is printed to stdout instead)
    * None, after printing a message, when the query is invalid

    For all of the examples below, *imps* and *pix* are instances of :class:`Table <hustle.Table>`.

    :type project: list of :class:`Column <hustle.core.marble.Column>` | :class:`Aggregation <hustle.core.marble.Aggregation>`
    :param project: a positional argument list of columns and aggregate expressions to return in the result

        A simple projection::

            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps)

        Selects three columns from the *imps* table.

        Hustle also allows for *aggregation functions* such as :func:`h_sum() <hustle.h_sum>`,
        :func:`h_count <hustle.h_count>`, :func:`h_min() <hustle.h_min>`, :func:`h_max() <hustle.h_max>`,
        :func:`h_avg <hustle.h_avg>` as in this example which sums the :code:`imps.cpm_millis` column::

            select(imps.ad_id, h_sum(imps.cpm_millis), h_count(), where=imps.date == '2014-01-27')

        Note that Hustle doesn't have a *group by* clause. In this query, the output will be *grouped* by the
        :code:`imps.ad_id` column implicitly. Note that in Hustle, if there is an aggregation function present
        in the :code:`project` param, the query results will be *grouped* by all non-aggregation columns present.

    :type where: (optional) sequence of :class:`Table <hustle.Table>` | :class:`Expr <hustle.core.marble.Expr>`
    :param where: the Tables to fetch data from, as well as the conditions in the *where clause*

        This serves two purposes: to specify the tables that are to be queried and to allow for the selection
        of data under specific criteria with our Python DSL selection syntax, much like SQL's *where clause*::

            # simple projection with restriction
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.date == '2014-01-27')

        Note the :code:`==` operation between the :code:`imps.date` column and the date string.
        The :class:`Column <hustle.core.marble.Column>` class overrides all of Python's comparison operators,
        which, along with the *&*, *|* and *~* logical operators allows you to build arbitrarily complex
        column selection expressions like this::

            select(imps.ad_id, imps.date, imps.cpm_millis,
                   where=((imps.date >= '2014-01-21') & (imps.date <= '2014-01-23')) |
                         ~(imps.site_id == 'google.com'))

        Note that for these expressions, the column must come first. This means that the following
        expression is **illegal**::

            select(imps.ad_id, imps.date, imps.cpm_millis, where='2014-01-27' == imps.date)

        In addition, multiple tables can be specified in the where clause like this::

            select(imps.ad_id, pix.amount, where=(imps.date < '2014-01-13', pix))

        which specifies an expression, :code:`imps.date < '2014-01-13'` and a :class:`Table <hustle.Table>`
        tuple. This query will simply return all of the *ad_id* values in *imps* for dates less than
        January 13th followed by all of the *amount* values in the *pix* table.

        Using multiple tables is typically reserved for when you use a *join clause*

    :type join: sequence of exactly length 2 of :class:`Column <hustle.core.marble.Column>`
    :param join: specifies the columns to perform a relational join operation on for the query

        Here's an example of a Hustle join::

            select(imps.ad_id, imps.site_id, h_sum(pix.amount), h_count(),
                   where=(imps.date < '2014-01-13', pix.date < '2014-01-13'),
                   join=(imps.site_id, pix.site_id))

        which joins the *imps* and *pix* tables on their common *site_id* column, then returns the sum of
        the *pix.amount* columns and a count, grouped by the *ad_id* and the *site_id*. The equivalent
        query in SQL is::

            select i.ad_id, i.site_id, sum(p.amount), count(*)
            from imps i
            join pix p on i.site_id = p.site_id
            where i.date < '2014-01-13' and p.date < '2014-01-13'
            group by i.ad_id, i.site_id

    :type order_by: string | :class:`Column <hustle.core.marble.Column>` | int |
        (sequence of string | :class:`Column <hustle.core.marble.Column>` | int)
    :param order_by: the column(s) to sort the result by

        The sort columns can be specified either as a Column or a list of Columns. Alternatively, you can
        specify a column by using a string with either the name of the column or the *table.column*
        string notation. Furthermore, you can also represent the column using a zero based index of the
        *projected* columns. This last case would be used for *Aggregations*. Here are a few examples::

            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by=imps.date)
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by=(imps.date, imps.ad_id))
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by='date')
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by='imps.date')
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by=('date', imps.ad_id))
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by=('date', 2))
            select(imps.ad_id, imps.date, h_sum(imps.cpm_millis), where=imps, order_by=2)

    :type desc: boolean
    :param desc: affects sort order of the *order_by clause* to descending (default ascending)

    :type distinct: boolean
    :param distinct: indicates whether to remove duplicates in results

    :type limit: int
    :param limit: limits the total number of records in the output

    :type nest: boolean (default = False)
    :param nest: specify that the return value is a :class:`Table <hustle.Table>` to be used in another query

        This allows us to build nested queries. You may want to do this to join more than two tables, or
        to reuse the results of a query in more than one subsequent query. For example::

            active_pix = select(*star(pix), where=pix.isActive > 0, nest=True)
            select(h_sum(active_pix.amount), where=active_pix)

    :type kwargs: dict
    :param kwargs: custom settings for this query see :mod:`hustle.core.settings`
    """
    from hustle import _get_blobs
    from hustle.core.settings import Settings
    from hustle.core.pipeline import SelectPipe
    from hustle.core.util import ensure_list

    settings = Settings(**kwargs)

    # Query options are popped off the settings; the remaining settings
    # (including 'partition' and 'nest', which are only read) are forwarded
    # to ``job.run`` further down.
    wheres = ensure_list(settings.pop('where', ()))
    order_by = ensure_list(settings.pop('order_by', ()))
    join = settings.pop('join', ())
    distinct = settings.pop('distinct', False)
    desc = settings.pop('desc', False)
    limit = settings.pop('limit', None)
    ddfs = settings['ddfs']
    autodump = settings['dump']

    # Negative partition counts are clamped to 0.
    partition = settings.get('partition', 0)
    if partition < 0:
        partition = 0
    nest = settings.get('nest', False)

    try:
        check_query(project, join, order_by, limit, wheres)
    except ValueError as e:
        print(" Invalid query:\n %s" % e)
        return None

    # Job name: the where-clause names joined by '-', capped at 64 chars.
    name = '-'.join([where._name for where in wheres])[:64]

    # Collect the de-duplicated input blobs for every where clause.
    job_blobs = set()
    for where in wheres:
        job_blobs.update(tuple(sorted(w)) for w in _get_blobs(where, ddfs))

    job = SelectPipe(settings['server'],
                     wheres=wheres,
                     project=project,
                     order_by=order_by,
                     join=join,
                     distinct=distinct,
                     desc=desc,
                     limit=limit,
                     partition=partition,
                     nest=nest)
    job.run(name='select_from_%s' % name, input=job_blobs, **settings)
    blobs = job.wait()

    if nest:
        # Nested query: return a table backed by the result blobs.
        rtab = job.get_result_schema(project)
        rtab._blobs = blobs
        return rtab
    elif autodump:
        # the result will be just dumped to stdout
        cols = [c.name for c in project]
        _print_separator(80)
        _print_line(cols, width=80, cols=len(cols),
                    alignments=[_ALG_RIGHT if c.is_numeric else _ALG_LEFT
                                for c in project])
        _print_separator(80)
        dump(blobs, 80)
        return
    return blobs
def test_full_query(self):
    """A query that supplies every clause at once must pass the checker."""
    verdict = check_query(
        self.cross_select,    # select
        self.join,            # join
        self.single_select,   # order_by
        self.limit_single,    # limit
        self.cross_wheres)    # wheres
    self.assertTrue(verdict)