def setUp(self):
    """Point Hustle at a local Disco server and build the fixture tables."""
    # Force a known-good local configuration before Settings() is read.
    for key, value in (('server', 'disco://localhost'),
                       ('dump', False),
                       ('nest', False)):
        overrides[key] = value
    self.settings = Settings()
    self.ddfs = self.settings['ddfs']
    self.table = ensure_tables()
def ensure_tables():
    """Create the IMPS and PIXELS fixture tables and load them once.

    The tables are (re)created with force=True; the fixture files are only
    inserted when no DDFS data tags exist yet, so repeated calls are cheap.
    """
    overrides['server'] = 'disco://localhost'
    overrides['dump'] = False
    overrides['nest'] = False
    settings = Settings()
    ddfs = settings['ddfs']

    imp_fields = ['=$token', '%url', '+%site_id', '@cpm_millis',
                  '+#ad_id', '+$date', '+@time']
    imps = Table.create(IMPS, fields=imp_fields, partition='date',
                        force=True)

    pixel_fields = ['=$token', '+@1isActive', '+%site_id', '@amount',
                    '+#account_id', '+%city', '+%2state', '+#2metro',
                    '$ip', '*keyword', '+$date']
    pixels = Table.create(PIXELS, fields=pixel_fields, partition='date',
                          force=True)

    # Only load a fixture file when the table has no data tags yet.
    if not ddfs.list("hustle:%s:" % IMPS):
        insert(imps, phile='fixtures/imps.json', preprocess=imp_process)
    if not ddfs.list("hustle:%s:" % PIXELS):
        insert(pixels, phile='fixtures/pixel.json')
def stat(where, limit=16, **kwargs):
    """Return per-column key cardinality estimates (0-100) for *where*.

    Runs a StatPipe job over at most *limit* blobs and combines the
    per-blob cardinalities into a weighted average, weighted by each
    blob's record count (stored under the '_' key).
    """
    from hustle.core.settings import Settings
    from hustle.core.stat import StatPipe
    from disco.core import result_iterator
    from collections import defaultdict

    settings = Settings(**kwargs)
    ddfs = settings['ddfs']
    job_blobs = set(tuple(sorted(blob))
                    for blob in _get_blobs(where, ddfs, limit))
    job = StatPipe(settings['server'])
    job.run(name="stat_" + where._name, input=job_blobs, **settings)
    res = job.wait()

    # Grand total of records across all blobs; the weight denominator.
    total = float(sum(v['_'] for _, v in result_iterator(res)))
    acc = defaultdict(int)
    for _, cols in result_iterator(res):
        weight = cols.pop('_') / total
        for name, card in cols.iteritems():
            acc[name] += card * weight

    # Scale to integer percentages, dropping columns that round to zero.
    rval = {}
    for name in acc:
        pct = int(acc[name] * 100)
        if pct > 0:
            rval[name] = pct
    rval['_'] = int(total)
    return rval
def delete(table_or_expr, **kwargs):
    """
    Delete data and partitions for a given table, keep the table definition.

    :type table_or_expr: :class:`Table <hustle.Table>` | :class:`Expr <hustle.core.marble.Expr>`
    :param table_or_expr: A table object or an expression with only a partition column

    :type kwargs: dict
    :param kwargs: custom settings for this query see :mod:`hustle.core.settings`

    .. warning::
        Given a table object, all partitions will be deleted. Use a Hustle
        expression to delete a specific range of partitions, e.g.
        'impression.date < 2014-01-01'.
    """
    from hustle.core.settings import Settings
    settings = Settings(**kwargs)
    ddfs = settings["ddfs"]

    if not isinstance(table_or_expr, (Expr, Table)):
        # Fixed typo in the error message ("exprssion" -> "expression").
        raise ValueError("The first argument must be a table or an expression.")
    if isinstance(table_or_expr, Expr) and not table_or_expr.is_partition:
        raise ValueError(
            "Column in the expression must be a partition column.")

    # Only the data tags are removed; the base tag (schema) is kept.
    tags = _get_tags(table_or_expr, ddfs)
    for tag in tags:
        ddfs.delete(tag)
def ensure_tables():
    """Create and lazily populate the IMPS, PIXELS, PIXELS_HLL and IPS
    fixture tables against a local Disco server.

    Every table is (re)created with force=True; fixture files are only
    inserted when no DDFS data tags exist for that table yet.
    """
    overrides['server'] = 'disco://localhost'
    overrides['dump'] = False
    overrides['nest'] = False
    settings = Settings()
    ddfs = settings['ddfs']

    imps = Table.create(
        IMPS,
        columns=['wide index string token', 'trie url',
                 'index trie site_id', 'uint cpm_millis', 'index int ad_id',
                 'index string date', 'index uint time', 'bit click',
                 'index bit impression', 'bit conversion'],
        partition='date', force=True)
    pixels = Table.create(
        PIXELS,
        columns=['wide index string token', 'index bit isActive',
                 'index trie site_id', 'uint amount', 'index int account_id',
                 'index trie city', 'index trie16 state', 'index int16 metro',
                 'string ip', 'lz4 keyword', 'index string date'],
        partition='date', force=True)
    pixel_hlls = Table.create(
        PIXELS_HLL,
        columns=['index bit isActive', 'index trie site_id',
                 'index int account_id', 'index trie city',
                 'index trie16 state', 'index string date', 'binary hll'],
        partition='date', force=True)
    ips = Table.create(
        IPS,
        columns=['index trie16 exchange_id', 'index uint32 ip'],
        force=True)

    # Insert fixture files only for tables that have no data tags yet.
    if not ddfs.list("hustle:%s:" % IMPS):
        insert(imps, File='fixtures/imps.json', preprocess=imp_process)
    if not ddfs.list("hustle:%s:" % PIXELS):
        insert(pixels, File='fixtures/pixel.json')
    if not ddfs.list("hustle:%s:" % IPS):
        insert(ips, File='fixtures/ip.json')
    if not ddfs.list("hustle:%s:" % PIXELS_HLL):
        # NOTE(review): insert_hll takes lowercase 'file' while insert
        # takes 'File' -- confirm against insert_hll's signature.
        insert_hll(pixel_hlls, file='./fixtures/pixel.json',
                   hll_field='token')
def create(cls, name, columns=(), fields=(), partition=None, force=False, **kwargs): """ Create a new :class:`Table <hustle.Table>`, replace existing table if force=True. :type name: string :param name: the name of the table to create :type columns: sequence of string :param columns: the list of *columns* and their extended index/type information :type fields: sequence of string :param fields: the list of *columns* and their encoded index/type information :type partition: string :param partition: the name of the column to act as the partition for this table :type force: bool :param force: overwrite the existing DDFS base tag with this schema If *columns* is set, the *fields* parameter is ignored. Example:: pixels = Table.create('pixels', columns=['index string token', 'index uint8 isActive', 'index site_id', 'uint32 amount', 'index int32 account_id', 'index city', 'index trie16 state', 'index int16 metro', 'string ip', 'lz4 keyword', 'index string date'], partition='date', force=True) .. warning:: This function will not delete or update existing data in any way. If you use :code:`force=True` to change the schema, make sure you either make the change backward compatible (by only adding new columns), or by deleting and reloading your data. .. seealso:: For a good example of creating a partitioned Hustle database see :ref:`integrationtests` For detailed schema design docs look no further than :ref:`schemadesign` """ from hustle.core.settings import Settings settings = Settings(**kwargs) ddfs = settings['ddfs'] if ddfs.exists(cls.base_tag(name)): print "Table already exists..." if force: print " Overwriting schema..." else: return None if len(columns): fields = cls.parse_column_specs(columns) ddfs.setattr(cls.base_tag(name), '_fields_', ujson.dumps(fields)) ddfs.setattr(cls.base_tag(name), '_partition_', ujson.dumps(partition)) return cls(name=name, fields=fields, partition=partition)
def _create_job(*project, **kwargs): from hustle import _get_blobs from hustle.core.settings import Settings from hustle.core.pipeline import SelectPipe from hustle.core.util import ensure_list settings = Settings(**kwargs) wheres = ensure_list(settings.pop('where', ())) order_by = ensure_list(settings.pop('order_by', ())) join = settings.pop('join', ()) distinct = settings.pop('distinct', False) desc = settings.pop('desc', False) limit = settings.pop('limit', None) ddfs = settings['ddfs'] partition = settings.get('partition', 0) if partition < 0: partition = 0 nest = settings.get('nest', False) try: # if join is a string, extract the actual join columns. # do it here to make the query checker happy. join = _resolve_join(wheres, join) check_query(project, join, order_by, limit, wheres) except ValueError as e: print " Invalid query:\n %s" % e return None name = '-'.join([where._name for where in wheres])[:64] job_blobs = set() for where in wheres: job_blobs.update(tuple(sorted(w)) for w in _get_blobs(where, ddfs)) job = SelectPipe(settings['server'], wheres=wheres, project=project, order_by=order_by, join=join, distinct=distinct, desc=desc, limit=limit, partition=partition, nest=nest) return job, job_blobs, name
def from_tag(cls, name, **kwargs):
    """
    Instantiate a named :class:`Table <hustle.Table>` from the schema
    metadata stored on its *DDFS* base tag.

    :type name: string
    :param name: the name of the table
    """
    from hustle.core.settings import Settings
    settings = Settings(**kwargs)
    ddfs = settings['ddfs']

    base = cls.base_tag(name)
    # The schema and partition column live as JSON tag attributes.
    partition = ujson.loads(ddfs.getattr(base, '_partition_'))
    fields = ujson.loads(ddfs.getattr(base, '_fields_'))
    return cls(name=name, fields=fields, partition=partition)
def tag(self, **kwargs):
    """Push this table's blobs to DDFS under its base tag.

    Ensures the table schema exists first (force=False leaves any existing
    schema untouched), then tags the collected blobs. Idempotent: once
    ``self.tagged`` is set, subsequent calls do nothing and return None.
    """
    from hustle.core.settings import Settings
    if not self.tagged:
        settings = Settings(**kwargs)
        ddfs = settings['ddfs']
        # check whether the table already exists
        t = self.create(self._name, fields=self._fields, force=False,
                        **kwargs)
        try:
            ddfs.tag(self.base_tag(self._name), self._blobs or [])
            self.tagged = True
        except Exception:
            # Bug fix: the message was passed as a second argument to
            # print, which printed a tuple instead of interpolating the
            # table name into the message.
            print('Error tagging result %s' % self._name)
            raise
        return t
def ensure_tables():
    """Create the IMPS fixture table, load it if empty, and return it."""
    overrides['server'] = 'disco://localhost'
    overrides['dump'] = False
    overrides['nest'] = False
    settings = Settings()
    ddfs = settings['ddfs']

    imp_fields = ['=$token', '%url', '+%site_id', '@cpm_millis',
                  '+#ad_id', '+$date', '+@time']
    imps = Table.create(IMPS, fields=imp_fields, partition='date',
                        force=True)

    # Only load the fixture file when the table has no data tags yet.
    if not ddfs.list("hustle:%s:" % IMPS):
        insert(imps, File='fixtures/imps.json', preprocess=imp_process)
    return imps
def drop(table, **kwargs):
    """
    Drop all data, partitions, and the table definition for a given table.

    :type table: :class:`Table <hustle.Table>`
    :param table: a table object

    :type kwargs: dict
    :param kwargs: custom settings for this query see :mod:`hustle.core.settings`
    """
    from hustle.core.settings import Settings
    settings = Settings(**kwargs)
    ddfs = settings["ddfs"]

    if not isinstance(table, Table):
        raise ValueError("Only table is allowed here.")

    # Remove all partition data first, then the schema's base tag itself.
    delete(table, **kwargs)
    ddfs.delete(Table.base_tag(table._name))
def stat(where, limit=16, **kwargs):
    """
    Fetch statistical information of a collection of selected `Table <hustle.Table>`.

    :type where: sequence of :class:`Table <hustle.Table>` | :class:`Expr <hustle.core.marble.Expr>`
    :param where: the Tables to fetch data from, as well as the conditions in the *where clause*

    :type limit: int
    :param limit: the maximum number of blobs from the where clause, default value is 16

    Return a dict of column key cardinalities [0-100] for indexed columns in a table.
    The special key '_' holds the total record count.
    """
    from hustle.core.settings import Settings
    from hustle.core.stat import StatPipe
    from disco.core import result_iterator
    from collections import defaultdict

    settings = Settings(**kwargs)
    ddfs = settings['ddfs']
    # Cap the number of blobs inspected at `limit`.
    job_blobs = set(tuple(sorted(w)) for w in _get_blobs(where, ddfs, limit))
    job = StatPipe(settings['server'])
    job.run(name="stat_" + where._name, input=job_blobs, **settings)
    res = job.wait()

    # first we need the total, so that we can calculate weighted average
    total = float(sum(v['_'] for _, v in result_iterator(res)))
    final = defaultdict(int)
    for _, cols in result_iterator(res):
        # '_' holds this blob's record count; use it to weight its columns.
        weight = cols.pop('_') / total
        for col, card in cols.iteritems():
            final[col] += card * weight

    # round everything up to a number between 0 .. 100
    really_final = {}
    for key in final:
        card = int(final[key] * 100)
        if card > 0:
            # columns whose weighted cardinality rounds to zero are dropped
            really_final[key] = card
    really_final['_'] = int(total)
    return really_final
def create(cls, name, fields=(), partition=None, force=False, **kwargs):
    """
    Create a new :class:`Table <hustle.Table>`, replace existing table if force=True.

    :type name: string
    :param name: the name of the table to create

    :type fields: sequence of string
    :param fields: the list of *columns* and their encoded index/type information

    :type partition: string
    :param partition: the name of the column to act as the partition for this table

    :type force: bool
    :param force: overwrite the existing DDFS base tag with this schema

    .. warning::
        This function will not delete or update existing data in any way. If you
        use :code:`force=True` to change the schema, make sure you either make the
        change backward compatible (by only adding new columns), or by deleting and
        reloading your data.

    .. seealso::
        For a good example of creating a partitioned Hustle database see
        :ref:`integrationtests`
    """
    from hustle.core.settings import Settings
    settings = Settings(**kwargs)
    ddfs = settings['ddfs']
    if ddfs.exists(cls.base_tag(name)):
        print "Table already exists..."
        if force:
            print " Overwriting schema..."
        else:
            # leave the existing schema untouched
            return None
    # the schema and partition column are stored as JSON tag attributes
    ddfs.setattr(cls.base_tag(name), '_fields_', ujson.dumps(fields))
    ddfs.setattr(cls.base_tag(name), '_partition_', ujson.dumps(partition))
    return cls(name=name, fields=fields, partition=partition)
def get_tables(**kwargs):
    """
    Return the visible Hustle tables on the currently configured DDFS server.

    Hustle finds tables by looking for DDFS tags that have a *hustle:* prefix.

    :type kwargs: dict
    :param kwargs: custom settings for this query see :mod:`hustle.core.settings`
    """
    from hustle.core.settings import Settings
    settings = Settings(**kwargs)
    tags = settings["ddfs"].list(_TAG_PREFIX)

    names = set()
    for tag in tags:
        # Tags look like "hustle:<table>[:<partition>]"; keep the table part.
        prefix, sep, rest = tag.partition(':')
        if sep and prefix:
            table, sep2, _ = rest.partition(':')
            names.add(table if sep2 and table else rest)
    return sorted(names)
def get_partitions(table, **kwargs):
    """
    Get partitions for a given table.

    :type kwargs: dict
    :param kwargs: custom settings for this query see :mod:`hustle.core.settings`
    """
    from hustle.core.settings import Settings
    settings = Settings(**kwargs)
    ddfs = settings["ddfs"]

    # Accept either a table/marble object or a bare table name.
    tablename = table._name if isinstance(table, Marble) else table

    tags = ddfs.list(Table.base_tag(tablename) + ":")
    # A partition tag contains more than one ':' (base tag plus partition).
    parts = set(tag for tag in tags if tag.find(':') != tag.rfind(':'))
    return sorted(parts)
def insert(table, phile=None, streams=None, preprocess=None,
           maxsize=100 * 1024 * 1024, tmpdir='/tmp', decoder=None,
           lru_size=10000, **kwargs):
    """
    Insert data into a Hustle :class:`Table <hustle.Table>`.

    Create a :class:`Marble <hustle.core.marble.Marble>` file given the input
    file or streams according to the schema of the table. Push this (these)
    file(s) into *DDFS* under the appropriated (possibly) partitioned *DDFS*
    tags.

    Note that a call to :func:`insert() <hustle.insert>` may actually create
    and push more than one file, depending on how many partition values exist
    in the input. Be careful.

    For a good example of inserting into a partitioned Hustle database see
    :ref:`insertguide`

    :type table: :class:`Table <hustle.Table>`
    :param table: the table to perform the insert on

    :type phile: string
    :param phile: the file path to open

    :type streams: sequence of iterable
    :param streams: as an alternative to the *phile* argument, you can specify
        a list of generators as input

    :type preprocess: function
    :param preprocess: a function that accepts and returns a dict()

        The input is transformed into a :class:`dict` by the *decoder* param,
        then the *preprocess* function is called for every record. This gives
        you the opportunity to transform, filter or otherwise clean your data
        before it is inserted into the
        :class:`Marble <hustle.core.marble.Marble>`

    :type maxsize: int
    :param maxsize: the initial size in bytes of the *LMDB* memory mapped file

        Note that the actual underlying LMDB file will grow as data is added
        to it - this setting is just for its initial size.

    :type tmpdir: string
    :param tmpdir: the temporary directory to write the LMDB memory mapped file

        Note that choosing a directory on an SSD drive will nicely increase
        throughput.

    :type decoder: function
    :param decoder: accepts a line of raw input from the input and returns a
        :class:`dict`

        The dict is expected to have keys that correspond to the column names
        in the table you are inserting to. There are two built-in decoders in
        Hustle: :func:`json_decoder() <hustle.core.marble.json_decoder>`
        (default) and :func:`kv_decoder() <hustle.core.marble.kv_decoder>` for
        processing JSON and Disco *chain* input files, respectively.

    :type lru_size: int
    :param lru_size: the size in records of the LRU cache for holding
        bitmapped indexes

        You probably won't have to worry about this unless you find your
        insert is running out of memory or is too slow when inserting gigantic
        files or on nodes with limited memory resources.
    """
    from hustle.core.settings import Settings
    settings = Settings(**kwargs)
    ddfs = settings['ddfs']
    if not decoder:
        # JSON input is the default wire format.
        decoder = json_decoder

    def part_tag(name, partition=None):
        # DDFS tag for a (possibly partitioned) table: "hustle:name[:part]".
        rval = "hustle:" + name
        if partition:
            rval += ':' + str(partition)
        return rval

    if phile:
        streams = [open(phile)]
    lines, partition_files = table._insert(streams, preprocess=preprocess,
                                           maxsize=maxsize, tmpdir=tmpdir,
                                           decoder=decoder, lru_size=lru_size)
    if partition_files is not None:
        for part, pfile in partition_files.iteritems():
            tag = part_tag(table._name, part)
            # Push each partition's marble file, then remove the local copy.
            ddfs.push(tag, [pfile])
            print 'pushed %s, %s to %s' % (part, tag, ddfs)
            os.unlink(pfile)
    return table._name, lines
def select(*project, **kwargs):
    """
    Perform a relational query, by selecting rows and columns from one or
    more tables.

    The return value is one of:

    * a :class:`QueryResult` when :code:`block==True` (the default)
    * a :class:`Table <hustle.Table>` instance when :code:`nest==True`
    * a :class:`Future <hustle.Future>` when :code:`block==False`
    * ``None`` when :code:`dump==True` (results are printed to stdout)

    For all of the examples below, *imps* and *pix* are instances of
    :class:`Table <hustle.Table>`.

    :type project: list of :class:`Column <hustle.core.marble.Column>` | :class:`Aggregation <hustle.core.marble.Aggregation>`
    :param project: a positional argument list of columns and aggregate
        expressions to return in the result

        A simple projection::

            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps)

        Hustle also allows for *aggregation functions* such as
        :func:`h_sum() <hustle.h_sum>`, :func:`h_count() <hustle.h_count>`,
        :func:`h_min() <hustle.h_min>`, :func:`h_max() <hustle.h_max>`,
        :func:`h_avg() <hustle.h_avg>`::

            select(imps.ad_id, h_sum(imps.cpm_millis), h_count(),
                   where=imps.date == '2014-01-27')

        Hustle has no *group by* clause; when an aggregation function is
        present in :code:`project`, results are implicitly *grouped* by all
        non-aggregating columns in the projection.

    :type where: (optional) sequence of :class:`Table <hustle.Table>` | :class:`Expr <hustle.core.marble.Expr>`
    :param where: the Tables to fetch data from, as well as the conditions in
        the *where clause*

        This has two purposes: to specify the tables to query and to select
        data with Hustle's Python DSL, much like SQL's *where clause*::

            select(imps.ad_id, imps.date, imps.cpm_millis,
                   where=imps.date == '2014-01-27')

        The :class:`Column <hustle.core.marble.Column>` class overrides all of
        Python's comparison operators which, along with the *&*, *|* and *~*
        logical operators, allows arbitrarily complex expressions::

            select(imps.ad_id, imps.date, imps.cpm_millis,
                   where=((imps.date >= '2014-01-21') &
                          (imps.date <= '2014-01-23')) |
                         ~(imps.site_id == 'google.com'))

        The column must come first, so ``where='2014-01-27' == imps.date`` is
        **illegal**. The ``<<`` and ``>>`` operators express *in* and
        *not in* respectively::

            select(imps.ad_id, where=imps.ad_id << [1000, 1005])
            select(imps.ad_id, where=imps.ad_id >> [1000, 1005])

        Multiple tables can be specified in the where clause::

            select(imps.ad_id, pix.amount,
                   where=(imps.date < '2014-01-13', pix))

    :type join: string | sequence of exactly length 2 of :class:`Column <hustle.core.marble.Column>`
    :param join: the columns to perform a relational join operation on

        Either a list of 2 columns, or (when both columns share a name) a
        single string::

            select(imps.ad_id, imps.site_id, h_sum(pix.amount), h_count(),
                   where=(imps.date < '2014-01-13', pix.date < '2014-01-13'),
                   join='site_id')

    :type full_join: bool
    :param full_join: if True, join the tables in the *where clause* with a
        full cross-product; when set, *join* is ignored

    :type order_by: string | :class:`Column <hustle.core.marble.Column>` | int | sequence thereof
    :param order_by: the column(s) to sort the result by

        A Column object, a column name (optionally in ``table.column``
        notation), or a zero-based index of the *projected* columns (useful
        for *Aggregations*)::

            select(imps.ad_id, imps.date, where=imps, order_by='imps.date')
            select(imps.ad_id, h_sum(imps.cpm_millis), where=imps, order_by=1)

    :type desc: boolean
    :param desc: sort the *order_by clause* in descending order
        (default ascending)

    :type distinct: boolean
    :param distinct: indicates whether to remove duplicates in results

    :type limit: int
    :param limit: limits the total number of records in the output

    :type block: boolean
    :param block: make select call either blocking (default) or non-blocking.
        If False, select() returns a :class:`Future <hustle.Future>` object

    :type nest: boolean (default = False)
    :param nest: return a :class:`Table <hustle.Table>` to be used in another
        (nested) query::

            active_pix = select(*star(pix), where=pix.isActive > 0, nest=True)
            select(h_sum(active_pix.amount), where=active_pix)

    :type tag: string (default = None)
    :param tag: the tag name for a nested query; must be used with "nest".
        If not specified, a random name is given to the nested result

    :type max_cores: int (default = 0)
    :param max_cores: the max number of cores (disco workers) this query may
        utilize; 0 means no limit

    :type profile: boolean (default = False)
    :param profile: specify whether generate disco job's profile

    :type purge: boolean (default = True)
    :param purge: specify whether purge the query related data. This only
        works when "dump = True" and "profile = False".

    :type kwargs: dict
    :param kwargs: custom settings for this query see :mod:`hustle.core.settings`
    """
    from hustle.core.settings import Settings
    from hustle.core.pipeline import SelectPipe
    from hustle.core.util import ensure_list

    # Pop every query option off the settings so only engine settings remain
    # when they are forwarded to job.run(**settings) below.
    settings = Settings(**kwargs)
    wheres = ensure_list(settings.pop('where', ()))
    order_by = ensure_list(settings.pop('order_by', ()))
    join = settings.pop('join', ())
    full_join = settings.pop('full_join', False)
    distinct = settings.pop('distinct', False)
    desc = settings.pop('desc', False)
    limit = settings.pop('limit', None)
    # NOTE(review): 'wide' and 'pre_order_stage' are accepted and forwarded
    # to SelectPipe but are not part of the documented public options.
    wide = settings.pop('wide', False)
    nest = settings.pop('nest', False)
    tag = settings.pop('tag', None)
    block = settings.pop('block', True)
    autodump = settings.pop('dump', False)
    pre_order_stage = settings.pop('pre_order_stage', ())
    ddfs = settings['ddfs']
    partition = settings.pop('partition', 0)
    max_cores = settings.pop('max_cores', 0)
    profile = settings.pop('profile', False)
    purge = settings.pop('purge', True)
    if partition < 0:
        partition = 0

    if tag:
        # Refuse to clobber an existing nested-query tag.
        t = Table.from_tag(tag)
        if t is not None:
            print "The tag name %s is already existed. Try another tag name" \
                  " or drop the old one" % tag
            return

    try:
        # if join is a string, extract the actual join columns.
        # do it here to make the query checker happy.
        join = _resolve_join(wheres, join)
        check_query(project, join, order_by, limit, wheres)
    except ValueError as e:
        print " Invalid query:\n %s" % e
        return None

    name = '-'.join([where._name for where in wheres])[:64]
    job_blobs = set()
    for where in wheres:
        job_blobs.update(tuple(sorted(w)) for w in _get_blobs(where, ddfs))

    job = SelectPipe(settings['server'],
                     wheres=wheres,
                     project=project,
                     order_by=order_by,
                     join=join,
                     full_join=full_join,
                     distinct=distinct,
                     desc=desc,
                     limit=limit,
                     partition=partition,
                     wide=wide,
                     nest=nest,
                     tag=tag,
                     pre_order_stage=pre_order_stage,
                     max_cores=max_cores,
                     profile=profile)
    job.run(name='select_from_%s' % name, input=job_blobs, **settings)

    if block:
        blobs = job.wait()
        if nest:
            # Wrap the result blobs in a schema-bearing Table.
            rtab = job.get_result_schema(project)
            rtab._blobs = blobs
            return rtab
        elif autodump:
            # the result will be just dumped to stdout
            cols = [c.name for c in project]
            _print_separator(80)
            _print_line(cols, width=80, cols=len(cols),
                        alignments=[_ALG_RIGHT if c.is_numeric else _ALG_LEFT
                                    for c in project])
            _print_separator(80)
            cat(_query_iterator(blobs), 80)
            if purge and not profile:
                settings['server'].purge(_safe_str(job.name))
            return
        return QueryResult(job.name, blobs, settings['server'])
    else:
        return Future(job.name, job, settings['server'], nest, *project)
def insert(table, File=None, streams=None, preprocess=None,
           maxsize=100 * 1024 * 1024, tmpdir='/tmp', decoder=None,
           lru_size=10000, header=False, partition_filter=None,
           purge_local=True, **kwargs):
    """
    Insert data into a Hustle :class:`Table <hustle.Table>`.

    Create a :class:`Marble <hustle.core.marble.Marble>` file given the input
    file or streams according to the schema of the table. Push this (these)
    file(s) into *DDFS* under the appropriated (possibly) partitioned *DDFS*
    tags.

    Note that a call to :func:`insert() <hustle.insert>` may actually create
    and push more than one file, depending on how many partition values exist
    in the input. Be careful.

    For a good example of inserting into a partitioned Hustle database see
    :ref:`insertguide`

    :type table: :class:`Table <hustle.Table>`
    :param table: the table to perform the insert on

    :type File: string
    :param File: the file path to open

    :type streams: sequence of iterable
    :param streams: as an alternative to the *File* argument, you can specify
        a list of generators as input

    :type preprocess: function
    :param preprocess: a function that acts as transformer or filter and
        returns a boolean or None

        The input is transformed into a :class:`dict` by the *decoder* param,
        then the *preprocess* function is called for every record. This gives
        you the opportunity to transform, filter or otherwise clean your data
        before it is inserted into the
        :class:`Marble <hustle.core.marble.Marble>`.

        As transformer: it modifies the original data in place; the return
        value should be either None or True

        As filter: it returns a boolean to flag whether the current data
        record should be inserted or not

    :type maxsize: int
    :param maxsize: the initial size in bytes of the *LMDB* memory mapped file

        Note that the actual underlying LMDB file will grow as data is added
        to it - this setting is just for its initial size.

    :type tmpdir: string
    :param tmpdir: the temporary directory to write the LMDB memory mapped file

        Note that choosing a directory on an SSD drive will nicely increase
        throughput.

    :type decoder: function
    :param decoder: accepts a line of raw input from the input and returns a
        :class:`dict`

        The dict is expected to have keys that correspond to the column names
        in the table you are inserting to. There are two built-in decoders in
        Hustle: :func:`json_decoder() <hustle.core.marble.json_decoder>`
        (default) and :func:`kv_decoder() <hustle.core.marble.kv_decoder>` for
        processing JSON and Disco *chain* input files, respectively.

    :type lru_size: int
    :param lru_size: the size in records of the LRU cache for holding
        bitmapped indexes

        You probably won't have to worry about this unless you find your
        insert is running out of memory or is too slow when inserting gigantic
        files or on nodes with limited memory resources.

    :type header: boolean
    :param header: whether or not the streams contain a header (as with CSV)

        If you are using CSV and it contains a header with the column names,
        set this so it gets skipped. Only works if the header is on the first
        line, otherwise you will skip the first line of data.

    :type partition_filter: a single value or a list of values
    :param partition_filter: a single value or a list of partition values you
        want to filter your *streams*

        This list will filter the insert to only acknowledge the partition(s)
        defined if set. Useful for reloads where single files may hold data
        for multiple partitions.

    :type purge_local: boolean
    :param purge_local: whether or not to delete the local marble after
        creation

        If you want to do additional processing with the marble after it has
        been pushed to DDFS, set this flag to False and it will not be
        automatically cleaned up after successful insertion.
    """
    from hustle.core.settings import Settings
    settings = Settings(**kwargs)
    ddfs = settings['ddfs']
    if not decoder:
        # JSON input is the default wire format.
        decoder = json_decoder

    def part_tag(name, partition=None):
        # DDFS tag for a (possibly partitioned) table: "hustle:name[:part]".
        rval = "hustle:" + name
        if partition:
            rval += ':' + str(partition)
        return rval

    if File:
        streams = [open(File)]
    lines, partition_files = table._insert(streams, preprocess=preprocess,
                                           maxsize=maxsize, tmpdir=tmpdir,
                                           decoder=decoder,
                                           lru_size=lru_size,
                                           header=header,
                                           partition_filter=partition_filter)
    if partition_files is not None:
        for part, pfile in partition_files.iteritems():
            tag = part_tag(table._name, part)
            # Log the pushed size in gigabytes for operator visibility.
            st = os.stat(pfile)
            ddfs.push(tag, [pfile])
            print 'pushed %s(%.2fG), %s to %s' % \
                (part, st.st_size * 1.0 / 1073741824, tag, ddfs)
            if purge_local:
                os.unlink(pfile)
    return table._name, lines, partition_files
def select(*project, **kwargs): """ Perform a relational query, by selecting rows and columns from one or more tables. The return value is either: * a list of urls containing the result records. This is the same as normal results from Disco * a :class:`Table <hustle.Table>` instance when :code:`nest==True` For all of the examples below, *imps* and *pix* are instances of :class:`Table <hustle.Table>`. :type project: list of :class:`Column <hustle.core.marble.Column>` | :class:`Aggregation <hustle.core.marble.Aggregation>` :param project: a positional argument list of columns and aggregate expressions to return in the result A simple projection:: select(imps.ad_id, imps.date, imps.cpm_millis, where=imps) Selects three columns from the *imps* table. Hustle also allows for *aggregation functions* such as :func:`h_sum() <hustle.h_sum>`, :func:`h_count <hustle.h_count>`, :func:`h_min() <hustle.h_min>`, :func:`h_max() <hustle.h_max>`, :func:`h_avg <hustle.h_avg>` as in this example which sums the :code:`imps.cpm_millis` column:: select(imps.ad_id, h_sum(imps.cpm_millis), h_count(), where=imps.date == '2014-01-27') Note that Hustle doesn't have a *group by* clause. In this query, the output will be *grouped* by the :code:`imps.ad_id` column implicitly. Note that in Hustle, if there is an aggregation function present in the :code:`project` param, the query results will be *grouped* by all non-aggregation present. 
:type where: (optional) sequence of :class:`Table <hustle.Table>` | :class:`Expr <hustle.core.marble.Expr>` :param where: the Tables to fetch data from, as well as the conditions in the *where clause* This two purposes: to specify the tables that are to be queried and to allow for the selection of data under specific criteria with our Python DSL selection syntax, much the like SQL's *where clause*:: # simple projection with restriction select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.date == '2014-01-27') Note the :code:`==` operation between the :code:`imps.date` column and the date string. The :class:`Column <hustle.core.marble.Column>` class overrides all of Python's comparison operators, which, along with the *&*, *|* and *~* logical operators allows you to build arbitrarily complex column selection expressions like this:: select(imps.ad_id, imps.date, imps.cpm_millis, where=((imps.date >= '2014-01-21') & (imps.date <= '2014-01-23')) | ~(imps.site_id == 'google.com)) Note that for these expressions, the column must come first. This means that the following expression is **illegal**:: select(imps.ad_id, imps.date, imps.cpm_millis, where='2014-01-27' == imps.date) In addition, multiple tables can be specified in the where clause like this:: select(imps.ad_id, pix.amount, where=(imps.date < '2014-01-13', pix)) which specifies an expression, :code:`imps.date < '2014-01-13'` and a :class:`Table <hustle.Table>` tuple. This query will simply return all of the *ad_id* values in *imps* for dates less than January 13th followed by all of the *amount* values in the *pix* table. 
Using multiple columns is typically reserved for when you use a *join clause* :type join: sequence of exactly length 2 of :class:`Column <hustle.core.marble.Column>` :param join: specified the columns to perform a relational join operation on for the query Here's an example of a Hustle join:: select(imps.ad_id, imps.site_id, h_sum(pix.amount), h_count(), where=(imps.date < '2014-01-13', pix.date < '2014-01-13'), join=(imps.site_id, pix.site_id)) which joins the *imps* and *pix* tables on their common *site_id* column, then returns the sum of the *pix.amount* columns and a count, grouped by the *ad_id* and the *site_id*. The equivalent query in SQL is:: select i.ad_id, i.site_id, sum(p.amount), count(*) from imps i join pix p on p.site_id = p.site_id where i.date < '2014-01-13' and i.date < '2014-01-13' group by i.ad_id, i.site_id :type order_by: string | :class:`Column <hustle.core.marble.Column>` | int | (sequence of string | :class:`Column <hustle.core.marble.Column>` | int) :param order_by: the column(s) to sort the result by The sort columns can be specified either as a Column or a list of Columns. Alternatively, you can specify a column by using a string with either the name of the column or the *table.column* string notation. Furthermore, you can also represent the column using a zero based index of the *projected* columns. This last case would be used for *Aggregations*. 
Here are a few examples:: select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by=imps.date) select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by=(imps.date, imps.ad_id)) select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by='date') select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by='imps.date') select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by=('date', imps.ad_id)) select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by=('date', 2)) select(imps.ad_id, imps.date, h_sum(imps.cpm_millis), where=imps, order_by=2) :type desc: boolean :param desc: affects sort order of the *order_by clause* to descending (default ascending) :type distinct: boolean :param distinct: indicates whether to remove duplicates in results :type limit: int :param limit: limits the total number of records in the output :type nest: boolean (default = False) :param nest: specify that the return value is a :class:`Table <hustle.Table>` to be used in another query This allows us to build nested queries. You may want to do this to join more than two tables, or to reuse the results of a query in more than one subsequent query. 
For example:: active_pix = select(*star(pix), where=pix.isActive > 0, nest=True) select(h_sum(active_pix.amount), where=active_pix) :type kwargs: dict :param kwargs: custom settings for this query see :mod:`hustle.core.settings` """ from hustle import _get_blobs from hustle.core.settings import Settings from hustle.core.pipeline import SelectPipe from hustle.core.util import ensure_list settings = Settings(**kwargs) wheres = ensure_list(settings.pop('where', ())) order_by = ensure_list(settings.pop('order_by', ())) join = settings.pop('join', ()) distinct = settings.pop('distinct', False) desc = settings.pop('desc', False) limit = settings.pop('limit', None) ddfs = settings['ddfs'] autodump = settings['dump'] partition = settings.get('partition', 0) if partition < 0: partition = 0 nest = settings.get('nest', False) try: check_query(project, join, order_by, limit, wheres) except ValueError as e: print " Invalid query:\n %s" % e return None name = '-'.join([where._name for where in wheres])[:64] job_blobs = set() for where in wheres: job_blobs.update(tuple(sorted(w)) for w in _get_blobs(where, ddfs)) job = SelectPipe(settings['server'], wheres=wheres, project=project, order_by=order_by, join=join, distinct=distinct, desc=desc, limit=limit, partition=partition, nest=nest) job.run(name='select_from_%s' % name, input=job_blobs, **settings) blobs = job.wait() if nest: rtab = job.get_result_schema(project) rtab._blobs = blobs return rtab elif autodump: # the result will be just dumped to stdout cols = [c.name for c in project] _print_separator(80) _print_line(cols, width=80, cols=len(cols), alignments=[_ALG_RIGHT if c.is_numeric else _ALG_LEFT for c in project]) _print_separator(80) dump(blobs, 80) return return blobs
def select(*project, **kwargs):
    """
    Perform a relational query, by selecting rows and columns
    from one or more tables.

    The return value is either::

    * an iterator over the resulting tuples when :code:`nest==False`
    * a :class:`Table <hustle.Table>` instance when :code:`nest==True`
    * in the case of :code:`nest==False and dump==True` return None
      (this is the default CLI interaction)

    For all of the examples below, *imps* and *pix* are instances of
    :class:`Table <hustle.Table>`.

    :type project: list of :class:`Column <hustle.core.marble.Column>` |
        :class:`Aggregation <hustle.core.marble.Aggregation>`
    :param project: a positional argument list of columns and aggregate
        expressions to return in the result

        A simple projection::

            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps)

        Selects three columns from the *imps* table.

        Hustle also allows for *aggregation functions* such as
        :func:`h_sum() <hustle.h_sum>`, :func:`h_count <hustle.h_count>`,
        :func:`h_min() <hustle.h_min>`, :func:`h_max() <hustle.h_max>`,
        :func:`h_avg <hustle.h_avg>` as in this example which sums the
        :code:`imps.cpm_millis` column::

            select(imps.ad_id, h_sum(imps.cpm_millis), h_count(),
                   where=imps.date == '2014-01-27')

        Note that Hustle doesn't have a *group by* clause.  In this query,
        the output will be *grouped* by the :code:`imps.ad_id` column
        implicitly.  Note that in Hustle, if there is an aggregation
        function present in the :code:`project` param, the query results
        will be *grouped* by all non-aggregation columns present.

    :type where: (optional) sequence of :class:`Table <hustle.Table>` |
        :class:`Expr <hustle.core.marble.Expr>`
    :param where: the Tables to fetch data from, as well as the conditions
        in the *where clause*

        This has two purposes: to specify the tables that are to be queried
        and to allow for the selection of data under specific criteria with
        our Python DSL selection syntax, much like SQL's *where clause*::

            # simple projection with restriction
            select(imps.ad_id, imps.date, imps.cpm_millis,
                   where=imps.date == '2014-01-27')

        Note the :code:`==` operation between the :code:`imps.date` column
        and the date string.  The
        :class:`Column <hustle.core.marble.Column>` class overrides all of
        Python's comparison operators, which, along with the *&*, *|* and
        *~* logical operators allows you to build arbitrarily complex
        column selection expressions like this::

            select(imps.ad_id, imps.date, imps.cpm_millis,
                   where=((imps.date >= '2014-01-21') &
                          (imps.date <= '2014-01-23')) |
                         ~(imps.site_id == 'google.com'))

        Note that for these expressions, the column must come first.  This
        means that the following expression is **illegal**::

            select(imps.ad_id, imps.date, imps.cpm_millis,
                   where='2014-01-27' == imps.date)

        Where clause also supports *in* and *not in* statements by using
        special operators "<<" and ">>" respectively::

            select(imps.ad_id, imps.date, imps.cpm_millis,
                   where=imps.ad_id << [1000, 1005])
            select(imps.ad_id, imps.date, imps.cpm_millis,
                   where=imps.ad_id >> [1000, 1005])

        Note that the right value of "<<" and ">>" could be any type of
        iterable, where each element must be a valid single right value.

        In addition, multiple tables can be specified in the where clause
        like this::

            select(imps.ad_id, pix.amount,
                   where=(imps.date < '2014-01-13', pix))

        which specifies an expression, :code:`imps.date < '2014-01-13'`
        and a :class:`Table <hustle.Table>` tuple.  This query will simply
        return all of the *ad_id* values in *imps* for dates less than
        January 13th followed by all of the *amount* values in the *pix*
        table.

        Using multiple columns is typically reserved for when you use a
        *join clause*

    :type join: string | sequence of exactly length 2 of
        :class:`Column <hustle.core.marble.Column>`
    :param join: specifies the columns to perform a relational join
        operation on for the query

        The join columns can be specified either as a list of 2 columns,
        or a list of 2 strings.  In particular, if two columns have the
        same names, a single string is valid as well.

        Here's an example of a Hustle join::

            select(imps.ad_id, imps.site_id, h_sum(pix.amount), h_count(),
                   where=(imps.date < '2014-01-13', pix.date < '2014-01-13'),
                   join=(imps.site_id, pix.site_id))

        or equivalently::

            select(imps.ad_id, imps.site_id, h_sum(pix.amount), h_count(),
                   where=(imps.date < '2014-01-13', pix.date < '2014-01-13'),
                   join='site_id')

        which joins the *imps* and *pix* tables on their common *site_id*
        column, then returns the sum of the *pix.amount* columns and a
        count, grouped by the *ad_id* and the *site_id*.  The equivalent
        query in SQL is::

            select i.ad_id, i.site_id, sum(p.amount), count(*)
            from imps i
            join pix p on i.site_id = p.site_id
            where i.date < '2014-01-13' and p.date < '2014-01-13'
            group by i.ad_id, i.site_id

    :type full_join: bool
    :param full_join: if True, specifies that a full join between the
        specified tables in the *where clause* should be joined in a full
        cross-product.  Note that if both *full_join* and *join* are
        specified, *join* will be ignored.

    :type order_by: string | :class:`Column <hustle.core.marble.Column>` |
        int | (sequence of string |
        :class:`Column <hustle.core.marble.Column>` | int)
    :param order_by: the column(s) to sort the result by

        The sort columns can be specified either as a Column or a list of
        Columns.  Alternatively, you can specify a column by using a string
        with either the name of the column or the *table.column* string
        notation.  Furthermore, you can also represent the column using a
        zero based index of the *projected* columns.  This last case would
        be used for *Aggregations*.  Here are a few examples::

            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps,
                   order_by=imps.date)
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps,
                   order_by=(imps.date, imps.ad_id))
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps,
                   order_by='date')
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps,
                   order_by='imps.date')
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps,
                   order_by=('date', imps.ad_id))
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps,
                   order_by=('date', 2))
            select(imps.ad_id, imps.date, h_sum(imps.cpm_millis),
                   where=imps, order_by=2)

    :type desc: boolean
    :param desc: affects sort order of the *order_by clause* to descending
        (default ascending)

    :type distinct: boolean
    :param distinct: indicates whether to remove duplicates in results

    :type limit: int
    :param limit: limits the total number of records in the output

    :type wide: boolean (default = False)
    :param wide: forwarded to the underlying pipeline; presumably selects
        a wider (multi-stage) partitioning strategy -- confirm in
        :mod:`hustle.core.pipeline`

    :type block: boolean
    :param block: make select call either blocking (default) or
        non-blocking.  If True, causes select() to return a
        :class:`Future <hustle.Future>` object

    :type nest: boolean (default = False)
    :param nest: specify that the return value is a
        :class:`Table <hustle.Table>` to be used in another query

        This allows us to build nested queries.  You may want to do this
        to join more than two tables, or to reuse the results of a query
        in more than one subsequent query.  For example::

            active_pix = select(*star(pix), where=pix.isActive > 0, nest=True)
            select(h_sum(active_pix.amount), where=active_pix)

    :type tag: string (default = None)
    :param tag: specify the tag name for a nested query; note it must be
        used with option "nest".  If this option is not specified, a
        random name will be given to the result of this nested query.

    :type max_cores: int (default = 0)
    :param max_cores: specify the max number of cores (disco workers) this
        query could utilize.  0 means no limit

    :type profile: boolean (default = False)
    :param profile: specify whether to generate the disco job's profile

    :type purge: boolean (default = True)
    :param purge: specify whether to purge the query related data.  This
        only works when "dump = True" and "profile = False".

    :type kwargs: dict
    :param kwargs: custom settings for this query see
        :mod:`hustle.core.settings`
    """
    from hustle.core.settings import Settings
    from hustle.core.pipeline import SelectPipe
    from hustle.core.util import ensure_list

    settings = Settings(**kwargs)
    # Pop every query option off the settings so that only generic job
    # configuration is forwarded to job.run() via **settings below.
    wheres = ensure_list(settings.pop('where', ()))
    order_by = ensure_list(settings.pop('order_by', ()))
    join = settings.pop('join', ())
    full_join = settings.pop('full_join', False)
    distinct = settings.pop('distinct', False)
    desc = settings.pop('desc', False)
    limit = settings.pop('limit', None)
    wide = settings.pop('wide', False)
    nest = settings.pop('nest', False)
    tag = settings.pop('tag', None)
    block = settings.pop('block', True)
    autodump = settings.pop('dump', False)
    pre_order_stage = settings.pop('pre_order_stage', ())
    ddfs = settings['ddfs']
    partition = settings.pop('partition', 0)
    max_cores = settings.pop('max_cores', 0)
    profile = settings.pop('profile', False)
    purge = settings.pop('purge', True)
    # negative partition counts make no sense; clamp to 0
    if partition < 0:
        partition = 0

    if tag:
        # refuse to clobber an existing nested-query result tag
        t = Table.from_tag(tag)
        if t is not None:
            print "The tag name %s is already existed. Try another tag name" \
                " or drop the old one" % tag
            return

    try:
        # if join is a string, extract the actual join columns.
        # do it here to make the query checker happy.
        join = _resolve_join(wheres, join)
        check_query(project, join, order_by, limit, wheres)
    except ValueError as e:
        print " Invalid query:\n %s" % e
        return None

    # Job name built from queried table names, capped at 64 chars.
    name = '-'.join([where._name for where in wheres])[:64]
    # Distinct (sorted) replica sets for every input blob.
    job_blobs = set()
    for where in wheres:
        job_blobs.update(tuple(sorted(w)) for w in _get_blobs(where, ddfs))

    job = SelectPipe(settings['server'],
                     wheres=wheres,
                     project=project,
                     order_by=order_by,
                     join=join,
                     full_join=full_join,
                     distinct=distinct,
                     desc=desc,
                     limit=limit,
                     partition=partition,
                     wide=wide,
                     nest=nest,
                     tag=tag,
                     pre_order_stage=pre_order_stage,
                     max_cores=max_cores,
                     profile=profile)
    job.run(name='select_from_%s' % name, input=job_blobs, **settings)
    if block:
        blobs = job.wait()
        if nest:
            # wrap the result blobs in a Table for use in another query
            rtab = job.get_result_schema(project)
            rtab._blobs = blobs
            return rtab
        elif autodump:
            # the result will be just dumped to stdout
            cols = [c.name for c in project]
            _print_separator(80)
            _print_line(cols, width=80, cols=len(cols),
                        alignments=[_ALG_RIGHT if c.is_numeric else _ALG_LEFT
                                    for c in project])
            _print_separator(80)
            cat(_query_iterator(blobs), 80)
            if purge and not profile:
                settings['server'].purge(_safe_str(job.name))
            return
        return QueryResult(job.name, blobs, settings['server'])
    else:
        # non-blocking: hand back a Future the caller can wait on
        return Future(job.name, job, settings['server'], nest, *project)
def setUp(self):
    """Point the test settings at a local Disco master with dump/nest off."""
    for key, value in (('server', 'disco://localhost'),
                       ('dump', False),
                       ('nest', False)):
        overrides[key] = value
    self.settings = Settings()
def insert_hll(table, file=None, streams=None, preprocess=None, maxsize=100 * 1024 * 1024, tmpdir='/tmp', decoder=ujson.decode, lru_size=10000, hll_field=None, **kwargs): from cardunion import Cardunion import os settings = Settings(**kwargs) ddfs = settings['ddfs'] def part_tag(name, partition=None): rval = "hustle:" + name if partition: rval += ':' + str(partition) return rval def hll_iter(strms): buf = {} fields = table._field_names fields.remove('hll') # fields.remove('maxhash') for stream in strms: for line in stream: try: data = decoder(line) except Exception as e: print "Exception decoding record (skipping): %s %s" % ( e, line) else: if preprocess: if not preprocess(data): continue key = ujson.dumps([data[f] for f in fields]) if key not in buf: hll = Cardunion(12) buf[key] = hll else: hll = buf[key] hll.add(data[hll_field]) for key, hll in buf.iteritems(): data = dict(zip(fields, ujson.loads(key))) data['hll'] = hll.dumps() yield data if file: streams = [open(file)] lines, partition_files = table._insert([hll_iter(streams)], maxsize=maxsize, tmpdir=tmpdir, decoder=lambda x: x, lru_size=lru_size) if partition_files is not None: for part, pfile in partition_files.iteritems(): tag = part_tag(table._name, part) ddfs.push(tag, [pfile]) print 'pushed %s, %s' % (part, tag) os.unlink(pfile) return table._name, lines