def __init__(self, master):
    """Create a one-stage pipeline that runs ``process_stat``.

    :param master: forwarded unchanged to the base-class constructor,
        together with a freshly created ``Worker``.
    """
    super(StatPipe, self).__init__(master=master, worker=Worker())

    # The only stage: inputs pass through ``task_input_stream`` and then
    # ``stat_input_stream`` before being handed to ``process_stat``.
    stat_stage = HustleStage(
        'stat',
        process=process_stat,
        input_chain=[task_input_stream, stat_input_stream],
    )
    self.pipeline = [('split', stat_stage)]
def __init__(self,
             master,
             wheres,
             project=(),
             order_by=(),
             join=(),
             full_join=False,
             distinct=False,
             desc=False,
             limit=0,
             partition=0,
             nest=False,
             wide=False,
             pre_order_stage=(),
             tag=None,
             max_cores=0,
             profile=False):
    """Assemble the stage pipeline for a select query.

    The resulting ``self.pipeline`` is, in order: a 'restrict-select'
    stage, an optional 'join' stage, optional 'group-combine' /
    'group-reduce' stages, the caller supplied ``pre_order_stage`` stages,
    and optional 'order-combine' / 'order-reduce' stages.

    :param master: forwarded to the base-class constructor.
    :param wheres: stored on ``self.wheres`` and passed to
        ``hustle_input_stream``.
    :param project: sequence of ``Column`` / ``Aggregation`` objects to
        select; must not be empty.
    :param order_by: ordering columns; resolved against ``project`` with
        ``self._resolve`` and stored on ``self.order_by``.
    :param join: join columns; a truthy value enables the 'join' stage and
        is also handed to ``self._get_key_names``.
    :param full_join: also enables the 'join' stage; forwarded to
        ``process_join``.
    :param distinct: forwarded to the order stages, or to the
        'restrict-select' stage when no join, aggregation or ordering is
        involved.
    :param desc: forwarded to both order stages.
    :param limit: maximum number of records; a falsy value means no limit.
    :param partition: partition count given to ``_tuple_hash``; a falsy
        value falls back to ``_NPART``.
    :param nest: when truthy, the last stage's output chain is replaced by
        ``hustle_output_stream`` targeting
        ``self.get_result_schema(project, tag)``.
    :param wide: adds the 'group-combine' stage when aggregating and marks
        the 'group-reduce' input as already sorted.
    :param pre_order_stage: extra stages placed just before the order
        stages.
    :param tag: forwarded to ``self.get_result_schema`` when ``nest`` is
        set.
    :param max_cores: when positive, stored as
        ``self.scheduler = {"max_cores": max_cores}``; zero and negative
        values leave the scheduler untouched.
    :param profile: stored on ``self.profile``.
    :raises ValueError: if ``project`` is empty.
    """
    # Kept as a function-level import, exactly like the original code
    # (presumably to avoid a circular import -- confirm before moving it).
    from hustle.core.pipeworker import Worker

    super(SelectPipe, self).__init__(master=master, worker=Worker())

    # Only a positive value configures the scheduler; anything else is
    # treated as "no restriction".
    if max_cores > 0:
        self.scheduler = {"max_cores": max_cores}

    # "No limit" sentinel.  ``sys.maxint`` only exists on Python 2, so fall
    # back to ``sys.maxsize`` elsewhere while keeping the Python 2 value.
    no_limit = getattr(sys, 'maxint', sys.maxsize)

    self.profile = profile
    self.wheres = wheres
    self.order_by = self._resolve(order_by, project)
    partition = partition or _NPART
    joining = join or full_join

    binaries = [
        i for i, c in enumerate(project)
        if isinstance(c, (Column, Aggregation)) and c.is_binary
    ]

    # if nest is true, use output_schema to store the output table
    self.output_table = None

    # Aggregation functions and their defaults, one entry per projected
    # column; plain columns get the module-level default functions.
    agg_specs = [
        (c.f, c.g, c.h, c.default) if isinstance(c, Aggregation)
        else (dflt_f, dflt_gh, dflt_gh, dflt_default)
        for c in project
    ]
    if not agg_specs:
        # Previously this surfaced as an opaque tuple-unpacking ValueError.
        raise ValueError(
            "SelectPipe requires at least one column in 'project'")
    efs, gees, ehches, dflts = zip(*agg_specs)

    # need_agg: at least one aggregation has to be committed.
    # all_agg: every projected column is an aggregation.
    need_agg = any(isinstance(c, Aggregation) for c in project)
    all_agg = all(isinstance(c, Aggregation) for c in project)
    _agg_fn = _aggregate_fast if all_agg else _aggregate

    # build the pipeline
    select_hash_cols = ()
    sort_range = _get_sort_range(0, project, self.order_by)

    join_stage = []
    if joining:
        # Binary column offsets are shifted by two for the join stage
        # (presumably two extra leading fields per record -- confirm
        # against process_join).
        joinbins = [i + 2 for i in binaries]
        join_stage = [
            (GROUP_LABEL,
             HustleStage('join',
                         sort=(1, 0),
                         binaries=joinbins,
                         process=partial(process_join,
                                         full_join=full_join,
                                         ffuncs=efs,
                                         ghfuncs=ehches,
                                         deffuncs=dflts,
                                         wide=wide,
                                         need_agg=need_agg,
                                         agg_fn=_agg_fn,
                                         label_fn=partial(_tuple_hash,
                                                          cols=sort_range,
                                                          p=partition)))),
        ]
        select_hash_cols = (1,)

    group_by_stage = []
    if need_agg:
        # If all columns in project are aggregations, use process_skip_group
        # to skip the internal groupby
        if all_agg:
            process_group_fn = process_skip_group
            group_by_range = []
        else:
            process_group_fn = process_group
            group_by_range = [
                i for i, c in enumerate(project) if isinstance(c, Column)
            ]

        if wide:
            group_by_stage.append(
                (GROUP_LABEL_NODE,
                 HustleStage('group-combine',
                             sort=group_by_range,
                             binaries=binaries,
                             process=partial(process_group_fn,
                                             ffuncs=efs,
                                             ghfuncs=ehches,
                                             deffuncs=dflts,
                                             label_fn=partial(
                                                 _tuple_hash,
                                                 cols=group_by_range,
                                                 p=partition)))))

        # A hack here that overrides disco stage's default option 'combine'.
        # Hustle needs all inputs with the same label to be combined.
        group_by_stage.append(
            (GROUP_LABEL,
             HustleStage('group-reduce',
                         combine=True,
                         input_sorted=wide,
                         sort=group_by_range,
                         binaries=binaries,
                         process=partial(process_group_fn,
                                         ffuncs=efs,
                                         ghfuncs=gees,
                                         deffuncs=dflts))))

    # Re-scan the binary columns/aggregations for the order-by stage, since
    # some aggregations could have changed the type of the original column.
    # For instance, `h_cardinality(hll)` would generate a result of integer
    # type from the binary type `hll`.  The type from `result_spec` must be
    # used instead of the original one.
    order_by_binaries = []
    for i, c in enumerate(project):
        if isinstance(c, Column) and c.is_binary:
            order_by_binaries.append(i)
        if isinstance(c, Aggregation) and c.result_spec.is_binary:
            order_by_binaries.append(i)

    # process the order_by/distinct stage
    order_stage = []
    if self.order_by or distinct or limit:
        order_stage = [
            (GROUP_LABEL_NODE,
             HustleStage('order-combine',
                         sort=sort_range,
                         binaries=order_by_binaries,
                         desc=desc,
                         process=partial(process_order,
                                         distinct=distinct,
                                         limit=limit or no_limit))),
            (GROUP_ALL,
             HustleStage('order-reduce',
                         sort=sort_range,
                         desc=desc,
                         input_sorted=True,
                         combine_labels=True,
                         process=partial(process_order,
                                         distinct=distinct,
                                         limit=limit or no_limit))),
        ]

    if not select_hash_cols:
        select_hash_cols = sort_range

    key_names = self._get_key_names(project, join)

    # With no join, aggregation or ordering, distinct/limit can be applied
    # early: in the restrict-select stage when ``distinct`` is set,
    # otherwise directly in the hustle input stream.
    restrict_distinct = False
    restrict_limit = 0
    input_stream_limit = 0
    if not joining and not need_agg and not self.order_by:
        if distinct:
            restrict_distinct = True
            restrict_limit = limit or 0
        else:
            input_stream_limit = limit or 0

    # NOTE: 'combine' cannot be set on the restrict-select stage -- see the
    # #hack in the restrict-select phase.
    restrict_stage = HustleStage(
        'restrict-select',
        process=partial(process_restrict,
                        ffuncs=efs,
                        ghfuncs=ehches,
                        deffuncs=dflts,
                        wide=wide or joining,
                        need_agg=need_agg,
                        agg_fn=_agg_fn,
                        distinct=restrict_distinct,
                        limit=restrict_limit or no_limit,
                        label_fn=partial(_tuple_hash,
                                         cols=select_hash_cols,
                                         p=partition)),
        input_chain=[
            task_input_stream,
            partial(hustle_input_stream,
                    wheres=wheres,
                    gen_where_index=joining,
                    key_names=key_names,
                    limit=input_stream_limit or no_limit),
        ])

    pipeline = ([(SPLIT, restrict_stage)] + join_stage + group_by_stage +
                list(pre_order_stage) + order_stage)

    # determine the style of output (ie. if it is a Hustle Table),
    # and modify the last stage accordingly
    if nest:
        pipeline[-1][1].output_chain = [
            partial(hustle_output_stream,
                    result_table=self.get_result_schema(project, tag))
        ]
    self.pipeline = pipeline
def __init__(self,
             master,
             wheres,
             project=(),
             order_by=(),
             join=(),
             distinct=False,
             desc=False,
             limit=0,
             partition=0,
             nest=False,
             pre_order_stage=()):
    """Assemble the stage pipeline for a select query.

    The resulting ``self.pipeline`` is, in order: a 'restrict-select'
    stage, an optional 'join' stage, optional 'group-combine' /
    'group-reduce' stages, the caller supplied ``pre_order_stage`` stages,
    and optional 'order-combine' / 'order-reduce' stages.

    :param master: forwarded to the base-class constructor.
    :param wheres: stored on ``self.wheres`` and passed to
        ``hustle_input_stream``.
    :param project: sequence of ``Column`` / ``Aggregation`` objects to
        select; must not be empty.
    :param order_by: ordering columns; resolved against ``project`` with
        ``self._resolve`` and stored on ``self.order_by``.
    :param join: join columns; a truthy value enables the 'join' stage and
        is also handed to ``self._get_key_names``.
    :param distinct: forwarded to both order stages.
    :param desc: forwarded to both order stages.
    :param limit: maximum number of records; a falsy value means no limit.
    :param partition: partition count given to ``_tuple_hash``; a falsy
        value falls back to ``_NPART``.
    :param nest: when truthy, the last stage's output chain is replaced by
        ``hustle_output_stream`` targeting
        ``self.get_result_schema(project)``.
    :param pre_order_stage: extra stages placed just before the order
        stages.
    :raises ValueError: if ``project`` is empty.
    """
    # Kept as a function-level import, exactly like the original code
    # (presumably to avoid a circular import -- confirm before moving it).
    from hustle.core.pipeworker import Worker

    super(SelectPipe, self).__init__(master=master, worker=Worker())

    # "No limit" sentinel.  ``sys.maxint`` only exists on Python 2, so fall
    # back to ``sys.maxsize`` elsewhere while keeping the Python 2 value.
    no_limit = getattr(sys, 'maxint', sys.maxsize)

    self.wheres = wheres
    self.order_by = self._resolve(order_by, project)
    partition = partition or _NPART

    binaries = [
        i for i, c in enumerate(project)
        if isinstance(c, (Column, Aggregation)) and c.is_binary
    ]

    # if nest is true, use output_schema to store the output table
    self.output_table = None

    # build the pipeline
    select_hash_cols = ()
    sort_range = _get_sort_range(0, project, self.order_by)

    join_stage = []
    if join:
        # Binary column offsets are shifted by two for the join stage
        # (presumably two extra leading fields per record -- confirm
        # against process_join).
        joinbins = [i + 2 for i in binaries]
        join_stage = [
            (GROUP_LABEL,
             HustleStage('join',
                         sort=(1, 0),
                         binaries=joinbins,
                         process=partial(process_join,
                                         label_fn=partial(_tuple_hash,
                                                          cols=sort_range,
                                                          p=partition)))),
        ]
        select_hash_cols = (1,)

    # Aggregation functions and their defaults, one entry per projected
    # column; plain columns contribute ``None`` placeholders.
    agg_specs = [
        (c.f, c.g, c.h, c.default) if isinstance(c, Aggregation)
        else (None, None, None, None)
        for c in project
    ]
    if not agg_specs:
        # Previously this surfaced as an opaque tuple-unpacking ValueError.
        raise ValueError(
            "SelectPipe requires at least one column in 'project'")
    efs, gees, ehches, dflts = zip(*agg_specs)

    group_by_stage = []
    if any(efs):
        # If all columns in project are aggregations, use process_skip_group
        # to skip the internal groupby
        if all(isinstance(c, Aggregation) for c in project):
            process_group_fn = process_skip_group
            group_by_range = []
        else:
            process_group_fn = process_group
            group_by_range = [
                i for i, c in enumerate(project) if isinstance(c, Column)
            ]

        group_by_stage = [
            (GROUP_LABEL_NODE,
             HustleStage('group-combine',
                         sort=group_by_range,
                         binaries=binaries,
                         process=partial(process_group_fn,
                                         ffuncs=efs,
                                         ghfuncs=ehches,
                                         deffuncs=dflts,
                                         label_fn=partial(
                                             _tuple_hash,
                                             cols=group_by_range,
                                             p=partition)))),
            # A hack here that overrides disco stage's default option
            # 'combine'.  Hustle needs all inputs with the same label to
            # be combined.
            (GROUP_LABEL,
             HustleStage('group-reduce',
                         input_sorted=True,
                         combine=True,
                         sort=group_by_range,
                         process=partial(process_group_fn,
                                         ffuncs=efs,
                                         ghfuncs=gees,
                                         deffuncs=dflts))),
        ]

    # process the order_by/distinct stage
    order_stage = []
    if self.order_by or distinct or limit:
        order_stage = [
            (GROUP_LABEL_NODE,
             HustleStage('order-combine',
                         sort=sort_range,
                         binaries=binaries,
                         desc=desc,
                         process=partial(process_order,
                                         distinct=distinct,
                                         limit=limit or no_limit))),
            (GROUP_ALL,
             HustleStage('order-reduce',
                         sort=sort_range,
                         desc=desc,
                         input_sorted=True,
                         combine_labels=True,
                         process=partial(process_order,
                                         distinct=distinct,
                                         limit=limit or no_limit))),
        ]

    if not select_hash_cols:
        select_hash_cols = sort_range

    restrict_stage = HustleStage(
        'restrict-select',
        process=partial(process_restrict,
                        label_fn=partial(_tuple_hash,
                                         cols=select_hash_cols,
                                         p=partition)),
        input_chain=[
            task_input_stream,
            partial(hustle_input_stream,
                    wheres=wheres,
                    gen_where_index=join,
                    key_names=self._get_key_names(project, join)),
        ])

    pipeline = ([(SPLIT, restrict_stage)] + join_stage + group_by_stage +
                list(pre_order_stage) + order_stage)

    # determine the style of output (ie. if it is a Hustle Table), and
    # modify the last stage accordingly
    if nest:
        pipeline[-1][1].output_chain = [
            partial(hustle_output_stream,
                    result_table=self.get_result_schema(project))
        ]
    self.pipeline = pipeline