예제 #1
0
파일: stat.py 프로젝트: zhengyuv/hustle
    def __init__(self, master):
        """Set up the one-stage pipeline used for stat jobs.

        The parent constructor receives ``master`` together with a freshly
        created ``Worker``.  ``self.pipeline`` then holds a single entry,
        keyed ``'split'``, whose stage is named ``'stat'``, runs
        ``process_stat`` and is given ``task_input_stream`` followed by
        ``stat_input_stream`` as its input chain.

        :param master: forwarded unchanged to the parent constructor
        """
        super(StatPipe, self).__init__(master=master, worker=Worker())

        stat_stage = HustleStage(
            'stat',
            process=process_stat,
            input_chain=[task_input_stream, stat_input_stream])
        self.pipeline = [('split', stat_stage)]
예제 #2
0
파일: pipeline.py 프로젝트: cih-y2k/hustle
    def __init__(self,
                 master,
                 wheres,
                 project=(),
                 order_by=(),
                 join=(),
                 full_join=False,
                 distinct=False,
                 desc=False,
                 limit=0,
                 partition=0,
                 nest=False,
                 wide=False,
                 pre_order_stage=(),
                 tag=None,
                 max_cores=0,
                 profile=False):
        """Assemble the stage pipeline for a select query.

        ``self.pipeline`` always starts with a 'restrict-select' stage and is
        followed, in this order, by the optional join stage, the optional
        group stage(s), the caller supplied ``pre_order_stage`` entries and
        the optional order stages.

        :param master: forwarded to the parent constructor
        :param wheres: stored on ``self.wheres`` and passed to
            ``hustle_input_stream``
        :param project: selected columns; ``Aggregation`` entries contribute
            their ``f``, ``g``, ``h`` and ``default`` members, every other
            entry contributes the module level defaults
        :param order_by: resolved against ``project`` through
            ``self._resolve`` and kept on ``self.order_by``
        :param join: when truthy a 'join' stage is added; also passed to
            ``self._get_key_names``
        :param full_join: when truthy a 'join' stage is added; also handed
            to ``process_join``
        :param distinct: passed to the order stages, or to 'restrict-select'
            when there is no join, aggregation or order_by
        :param desc: passed to both order stages
        :param limit: row limit; a falsy value is replaced by ``sys.maxint``
        :param partition: partition count given to ``_tuple_hash``; a falsy
            value falls back to ``_NPART``
        :param nest: when truthy the last stage gets an output chain built
            from ``hustle_output_stream``
        :param wide: when truthy (and aggregating) a 'group-combine' stage
            precedes 'group-reduce'
        :param pre_order_stage: extra stages placed right before the order
            stages
        :param tag: passed to ``self.get_result_schema`` when ``nest`` is set
        :param max_cores: a positive value is recorded in ``self.scheduler``
        :param profile: stored on ``self.profile``
        """
        from hustle.core.pipeworker import Worker

        super(SelectPipe, self).__init__(master=master, worker=Worker())

        # zero or negative means "no limit": the scheduler is left untouched
        if max_cores > 0:
            self.scheduler = {"max_cores": max_cores}

        self.profile = profile
        self.wheres = wheres
        self.order_by = self._resolve(order_by, project)
        num_partitions = partition or _NPART

        # positions of the projected columns/aggregations flagged as binary
        binary_cols = [
            idx for idx, col in enumerate(project)
            if isinstance(col, (Column, Aggregation)) and col.is_binary
        ]

        # starts out empty; the original note ties its use to ``nest``
        self.output_table = None

        # Per-column aggregation callables and defaults.
        # NOTE(review): an empty ``project`` leaves nothing to unpack here,
        # so the unpacking below raises -- callers must project something.
        agg_flags = [isinstance(col, Aggregation) for col in project]
        f_funcs, g_funcs, h_funcs, defaults = zip(*[
            (col.f, col.g, col.h, col.default) if is_agg
            else (dflt_f, dflt_gh, dflt_gh, dflt_default)
            for col, is_agg in zip(project, agg_flags)])

        need_agg = any(agg_flags)  # at least one aggregate is selected
        all_agg = all(agg_flags)  # nothing but aggregates is selected
        agg_fn = _aggregate_fast if all_agg else _aggregate

        select_hash_cols = ()
        sort_range = _get_sort_range(0, project, self.order_by)

        # ---- join ---------------------------------------------------------
        join_stage = []
        if join or full_join:
            join_process = partial(process_join,
                                   full_join=full_join,
                                   ffuncs=f_funcs,
                                   ghfuncs=h_funcs,
                                   deffuncs=defaults,
                                   wide=wide,
                                   need_agg=need_agg,
                                   agg_fn=agg_fn,
                                   label_fn=partial(_tuple_hash,
                                                    cols=sort_range,
                                                    p=num_partitions))
            join_stage = [
                (GROUP_LABEL,
                 HustleStage('join',
                             sort=(1, 0),
                             # binary positions are shifted by two here
                             binaries=[idx + 2 for idx in binary_cols],
                             process=join_process))
            ]
            select_hash_cols = (1, )

        # ---- group by -----------------------------------------------------
        group_by_stage = []
        if need_agg:
            if all_agg:
                # only aggregates selected: no internal group-by is needed
                process_group_fn = process_skip_group
                group_by_range = []
            else:
                process_group_fn = process_group
                group_by_range = [
                    idx for idx, col in enumerate(project)
                    if isinstance(col, Column)
                ]

            if wide:
                combine_process = partial(process_group_fn,
                                          ffuncs=f_funcs,
                                          ghfuncs=h_funcs,
                                          deffuncs=defaults,
                                          label_fn=partial(
                                              _tuple_hash,
                                              cols=group_by_range,
                                              p=num_partitions))
                group_by_stage.append(
                    (GROUP_LABEL_NODE,
                     HustleStage('group-combine',
                                 sort=group_by_range,
                                 binaries=binary_cols,
                                 process=combine_process)))

            # HACK: combine=True overrides the disco stage default so that
            # all inputs carrying the same label get combined.
            reduce_process = partial(process_group_fn,
                                     ffuncs=f_funcs,
                                     ghfuncs=g_funcs,
                                     deffuncs=defaults)
            group_by_stage.append(
                (GROUP_LABEL,
                 HustleStage('group-reduce',
                             combine=True,
                             input_sorted=wide,
                             sort=group_by_range,
                             binaries=binary_cols,
                             process=reduce_process)))

        # ---- order by / distinct / limit ----------------------------------
        # Binary positions are recomputed for ordering because an aggregation
        # may yield a different type than its source column (the original
        # example: `h_cardinality(hll)` gives an integer from a binary
        # `hll`), so the aggregation's `result_spec` is consulted instead.
        order_by_binaries = []
        for idx, col in enumerate(project):
            if isinstance(col, Column) and col.is_binary:
                order_by_binaries.append(idx)
            if isinstance(col, Aggregation) and col.result_spec.is_binary:
                order_by_binaries.append(idx)

        order_stage = []
        if self.order_by or distinct or limit:
            order_limit = limit or sys.maxint
            order_combine = HustleStage('order-combine',
                                        sort=sort_range,
                                        binaries=order_by_binaries,
                                        desc=desc,
                                        process=partial(process_order,
                                                        distinct=distinct,
                                                        limit=order_limit))
            order_reduce = HustleStage('order-reduce',
                                       sort=sort_range,
                                       desc=desc,
                                       input_sorted=True,
                                       combine_labels=True,
                                       process=partial(process_order,
                                                       distinct=distinct,
                                                       limit=order_limit))
            order_stage = [
                (GROUP_LABEL_NODE, order_combine),
                (GROUP_ALL, order_reduce),
            ]

        # without a join the restrict stage hashes on the sort columns
        if not select_hash_cols:
            select_hash_cols = sort_range

        key_names = self._get_key_names(project, join)

        # With no join, no aggregation and no order_by, distinct/limit are
        # applied already in 'restrict-select' (distinct queries) or in the
        # input stream (non-distinct queries).
        plain_scan = not (join or full_join or need_agg or self.order_by)
        restrict_distinct = bool(plain_scan and distinct)
        restrict_limit = (limit or 0) if restrict_distinct else 0
        input_stream_limit = (limit or 0) if plain_scan and not distinct else 0

        # ---- restrict / select --------------------------------------------
        restrict_process = partial(process_restrict,
                                   ffuncs=f_funcs,
                                   ghfuncs=h_funcs,
                                   deffuncs=defaults,
                                   wide=wide or join or full_join,
                                   need_agg=need_agg,
                                   agg_fn=agg_fn,
                                   distinct=restrict_distinct,
                                   limit=restrict_limit or sys.maxint,
                                   label_fn=partial(_tuple_hash,
                                                    cols=select_hash_cols,
                                                    p=num_partitions))
        input_reader = partial(hustle_input_stream,
                               wheres=wheres,
                               gen_where_index=join or full_join,
                               key_names=key_names,
                               limit=input_stream_limit or sys.maxint)
        # NOTE: 'combine' is deliberately not set on this stage (the original
        # remark points at the "#hack in restrict-select phase").
        restrict_stage = HustleStage('restrict-select',
                                     process=restrict_process,
                                     input_chain=[task_input_stream,
                                                  input_reader])

        pipeline = ([(SPLIT, restrict_stage)] + join_stage + group_by_stage +
                    list(pre_order_stage) + order_stage)

        # a nested query writes its result through hustle_output_stream, so
        # the final stage gets a matching output chain
        if nest:
            output_writer = partial(
                hustle_output_stream,
                result_table=self.get_result_schema(project, tag))
            pipeline[-1][1].output_chain = [output_writer]
        self.pipeline = pipeline
예제 #3
0
파일: pipeline.py 프로젝트: pooya/hustle
    def __init__(self,
                 master,
                 wheres,
                 project=(),
                 order_by=(),
                 join=(),
                 distinct=False,
                 desc=False,
                 limit=0,
                 partition=0,
                 nest=False,
                 pre_order_stage=()):
        """Assemble the stage pipeline for a select query.

        ``self.pipeline`` always starts with a 'restrict-select' stage and is
        followed, in this order, by the optional join stage, the optional
        'group-combine'/'group-reduce' pair, the caller supplied
        ``pre_order_stage`` entries and the optional order stages.

        :param master: forwarded to the parent constructor
        :param wheres: stored on ``self.wheres`` and passed to
            ``hustle_input_stream``
        :param project: selected columns; ``Aggregation`` entries contribute
            their ``f``, ``g``, ``h`` and ``default`` members, every other
            entry contributes ``None`` placeholders
        :param order_by: resolved against ``project`` through
            ``self._resolve`` and kept on ``self.order_by``
        :param join: when truthy a 'join' stage is added; also passed to
            ``hustle_input_stream`` and ``self._get_key_names``
        :param distinct: passed to both order stages
        :param desc: passed to both order stages
        :param limit: row limit for the order stages; a falsy value is
            replaced by ``sys.maxint``
        :param partition: partition count given to ``_tuple_hash``; a falsy
            value falls back to ``_NPART``
        :param nest: when truthy the last stage gets an output chain built
            from ``hustle_output_stream``
        :param pre_order_stage: extra stages placed right before the order
            stages
        """
        from hustle.core.pipeworker import Worker

        super(SelectPipe, self).__init__(master=master, worker=Worker())
        self.wheres = wheres
        self.order_by = self._resolve(order_by, project)
        num_partitions = partition or _NPART

        # positions of the projected columns/aggregations flagged as binary
        binary_cols = [
            idx for idx, col in enumerate(project)
            if isinstance(col, (Column, Aggregation)) and col.is_binary
        ]

        # starts out empty; the original note ties its use to ``nest``
        self.output_table = None

        select_hash_cols = ()
        sort_range = _get_sort_range(0, project, self.order_by)

        # ---- join ---------------------------------------------------------
        join_stage = []
        if join:
            join_process = partial(process_join,
                                   label_fn=partial(_tuple_hash,
                                                    cols=sort_range,
                                                    p=num_partitions))
            join_stage = [
                (GROUP_LABEL,
                 HustleStage('join',
                             sort=(1, 0),
                             # binary positions are shifted by two here
                             binaries=[idx + 2 for idx in binary_cols],
                             process=join_process))
            ]
            select_hash_cols = (1, )

        # Per-column aggregation callables and defaults (None for columns
        # that are not aggregations).
        # NOTE(review): an empty ``project`` leaves nothing to unpack here,
        # so the unpacking below raises -- callers must project something.
        no_agg = (None, None, None, None)
        f_funcs, g_funcs, h_funcs, defaults = zip(*[
            (col.f, col.g, col.h, col.default)
            if isinstance(col, Aggregation) else no_agg
            for col in project])

        # ---- group by -----------------------------------------------------
        group_by_stage = []
        if any(f_funcs):
            only_aggregates = all(
                [isinstance(col, Aggregation) for col in project])
            if only_aggregates:
                # only aggregates selected: no internal group-by is needed
                process_group_fn = process_skip_group
                group_by_range = []
            else:
                process_group_fn = process_group
                group_by_range = [
                    idx for idx, col in enumerate(project)
                    if isinstance(col, Column)
                ]

            combine_stage = HustleStage(
                'group-combine',
                sort=group_by_range,
                binaries=binary_cols,
                process=partial(process_group_fn,
                                ffuncs=f_funcs,
                                ghfuncs=h_funcs,
                                deffuncs=defaults,
                                label_fn=partial(_tuple_hash,
                                                 cols=group_by_range,
                                                 p=num_partitions)))
            # HACK: combine=True overrides the disco stage default so that
            # all inputs carrying the same label get combined.
            reduce_stage = HustleStage(
                'group-reduce',
                input_sorted=True,
                combine=True,
                sort=group_by_range,
                process=partial(process_group_fn,
                                ffuncs=f_funcs,
                                ghfuncs=g_funcs,
                                deffuncs=defaults))
            group_by_stage = [
                (GROUP_LABEL_NODE, combine_stage),
                (GROUP_LABEL, reduce_stage),
            ]

        # ---- order by / distinct / limit ----------------------------------
        order_stage = []
        if self.order_by or distinct or limit:
            order_limit = limit or sys.maxint
            order_combine = HustleStage('order-combine',
                                        sort=sort_range,
                                        binaries=binary_cols,
                                        desc=desc,
                                        process=partial(process_order,
                                                        distinct=distinct,
                                                        limit=order_limit))
            order_reduce = HustleStage('order-reduce',
                                       sort=sort_range,
                                       desc=desc,
                                       input_sorted=True,
                                       combine_labels=True,
                                       process=partial(process_order,
                                                       distinct=distinct,
                                                       limit=order_limit))
            order_stage = [
                (GROUP_LABEL_NODE, order_combine),
                (GROUP_ALL, order_reduce),
            ]

        # without a join the restrict stage hashes on the sort columns
        if not select_hash_cols:
            select_hash_cols = sort_range

        # ---- restrict / select --------------------------------------------
        restrict_process = partial(process_restrict,
                                   label_fn=partial(_tuple_hash,
                                                    cols=select_hash_cols,
                                                    p=num_partitions))
        input_reader = partial(hustle_input_stream,
                               wheres=wheres,
                               gen_where_index=join,
                               key_names=self._get_key_names(project, join))
        restrict_stage = HustleStage('restrict-select',
                                     process=restrict_process,
                                     input_chain=[task_input_stream,
                                                  input_reader])

        pipeline = ([(SPLIT, restrict_stage)] + join_stage + group_by_stage +
                    list(pre_order_stage) + order_stage)

        # a nested query writes its result through hustle_output_stream, so
        # the final stage gets a matching output chain
        if nest:
            output_writer = partial(
                hustle_output_stream,
                result_table=self.get_result_schema(project))
            pipeline[-1][1].output_chain = [output_writer]
        self.pipeline = pipeline