示例#1
0
    def start(self):
        """Generate ``num_rows`` tuples from the column definitions and send them.

        A field names tuple built from the column definitions is sent first,
        followed by one tuple of generated values per row. The loop stops early
        if the operator is completed while it is running; otherwise the
        operator completes itself once every row has been sent.

        :return: None
        """

        self.op_metrics.timer_start()

        # The header tuple goes out before any generated values
        header = IndexedTuple.build_default(self.col_defs)

        if self.log_enabled:
            print("{}('{}') | Sending field names: {}".format(
                self.__class__.__name__, self.name, header.field_names()))

        self.send(TupleMessage(Tuple(header.field_names())), self.consumers)

        for _ in range(self.num_rows):

            # A consumer may have asked us to stop (e.g. a limit was reached)
            if self.is_completed():
                break

            self.op_metrics.rows_returned += 1

            row = Tuple()
            for column in self.col_defs:
                row.append(column.generate())

            if self.log_enabled:
                print("{}('{}') | Sending field values: {}".format(
                    self.__class__.__name__, self.name, row))

            self.send(TupleMessage(row), self.consumers)

        if not self.is_completed():
            self.complete()

        self.op_metrics.timer_stop()
示例#2
0
    def complete(self):
        """Emit the top-k result to the consumers and complete the operator.

        Tuple path: if the heap holds fewer than ``max_tuples`` entries, the
        sample tuples are fed back through ``on_receive`` first; the sorted
        top-k tuples are then sent one at a time and the heap is cleared.

        Pandas path: the global top-k frame is reduced to the ``max_tuples``
        smallest ('ASC') or largest ('DESC') rows by the sort column and sent
        as a whole.

        :return: None
        """
        if self.use_pandas:
            k = self.max_tuples
            sort_column = self.sort_expression.col_index
            sort_order = self.sort_expression.sort_order

            if sort_order == 'ASC':
                self.global_topk_df = self.global_topk_df.nsmallest(k, sort_column).head(k)
            elif sort_order == 'DESC':
                self.global_topk_df = self.global_topk_df.nlargest(k, sort_column).head(k)

            self.send(self.global_topk_df, self.consumers)
        else:
            # If fewer than k tuples made it past the cut-off value, top the
            # heap up with tuples from the sample set
            if len(self.heap) < self.max_tuples:
                sampled_messages = [TupleMessage(t) for t in self.sample_tuples]
                self.on_receive(sampled_messages, self.name)

            for entry in self.heap.get_topk(self.max_tuples, sort=True):
                if self.is_completed():
                    break
                self.send(TupleMessage(entry.tuple), self.consumers)

            self.heap.clear()

        super(TopKTableScan, self).complete()

        self.op_metrics.timer_stop()
示例#3
0
    def execute_py_query(op):
        """Run the operator's query through a cursor and forward every record.

        Before the first record is sent, a field names tuple derived from that
        record is sent. Iteration stops early if the operator is completed.

        :param op: The operator whose ``s3``, ``s3key`` and ``s3sql`` define the query
        :return: The cursor that executed the query
        """
        cur = Cursor(op.s3).select(op.s3key, op.s3sql)

        header_pending = True
        for record in cur.execute():

            if op.is_completed():
                break

            op.op_metrics.rows_returned += 1

            if header_pending:
                # The field names are derived from the first record
                header = IndexedTuple.build_default(record)
                header_pending = False

                if op.log_enabled:
                    print("{}('{}') | Sending field names: {}".format(
                        op.__class__.__name__, op.name, header.field_names()))

                op.send(TupleMessage(Tuple(header.field_names())), op.consumers)

            op.send(TupleMessage(Tuple(record)), op.consumers)

        return cur
示例#4
0
    def __on_receive_tuple(self, tuple_, producer_name):
        """Route a received tuple to one consumer chosen by the map field.

        The first tuple ever received is stored as the field names and sent to
        all consumers. The first tuple from each further producer is treated as
        that producer's field names and dropped. Every other tuple is sent to
        the single consumer at index ``int(map field value) % len(consumers)``.

        :param tuple_: The received tuple
        :param producer_name: The producer the tuple came from
        :return: None
        """

        if self.field_names is None:
            self.field_names = tuple_
            self.send(TupleMessage(tuple_), self.consumers)
            self.producers_received[producer_name] = True
            return

        if producer_name not in self.producers_received:
            # First tuple from this producer: its field names, which we already have
            self.producers_received[producer_name] = True
            return

        indexed = IndexedTuple.build(tuple_, self.field_names)
        consumer_index = int(indexed[self.map_field_name]) % len(self.consumers)

        self.op_metrics.rows_mapped += 1

        target_consumers = [self.consumers[consumer_index]]
        self.send(TupleMessage(tuple_), target_consumers)
示例#5
0
    def on_producer_completed(self, producer_name):
        """Event handler for a producer completion event.

        The producer is marked as completed if it is a tracked producer. Once
        every tracked producer has completed, and this operator has not itself
        completed, the aggregate result is sent to the consumers:

        * pandas path: the column sums of ``agg_df`` as a data frame message,
          or an empty data frame when ``agg_df`` has no rows;
        * tuple path: the field names tuple, followed by the field values
          tuple when expression contexts exist.

        The event is always passed on to ``Operator.on_producer_completed``.

        :param producer_name: The producer that completed.
        :return: None
        """

        if producer_name in self.producer_completions:
            self.producer_completions[producer_name] = True

        if self.use_pandas:
            if not self.is_completed() and all(self.producer_completions.values()):
                # Row count of the frame. The previous ``len(self.agg_df > 0)``
                # compared every cell with 0 before taking the length, which
                # only worked by accident and fails on non-numeric columns.
                if len(self.agg_df) > 0:
                    self.send(DataFrameMessage(self.agg_df.agg(['sum'])), self.consumers)
                else:
                    self.send(DataFrameMessage(pd.DataFrame()), self.consumers)
        else:
            if not self.is_completed() and all(self.producer_completions.values()):
                # Build and send the field names
                field_names = self.__build_field_names()
                self.send(TupleMessage(Tuple(field_names)), self.consumers)

                # Send the field values, if there are any
                if self.__expression_contexts is not None:
                    field_values = self.__build_field_values()
                    self.send(TupleMessage(Tuple(field_values)), self.consumers)

        Operator.on_producer_completed(self, producer_name)
示例#6
0
    def on_producer_completed(self, producer_name):
        """Record a producer's completion and emit the grouped results once all are done.

        Nothing is emitted until every tracked producer has completed. In the
        tuple path the field names are sent, followed by one tuple per group
        made of the group fields and the aggregate results. In the pandas path
        the aggregate frame is aggregated once more when there is more than one
        producer, its index is reset, it is sent as a data frame message and
        the attribute is then deleted.

        :param producer_name: The producer that has completed
        :return: None
        :raises Exception: If the producer is not one of the tracked producers
        """
        if producer_name not in self.producer_completions:
            raise Exception(
                "Unrecognized producer {} has completed".format(producer_name))

        self.producer_completions[producer_name] = True

        # Wait until the last producer reports in
        if not all(self.producer_completions.values()):
            return

        if self.use_pandas:
            # for groupby_reducer, aggregate one more time.
            if not self.is_completed() and len(self.producers) > 1:
                self.aggregate_df = self.pd_expr(self.aggregate_df)

            if not self.is_completed() and self.aggregate_df is not None:
                self.aggregate_df.reset_index(drop=True, inplace=True)
                self.send(DataFrameMessage(self.aggregate_df), self.consumers)

                del self.aggregate_df
        else:
            # Send the field names
            header = IndexedTuple.build_default(
                self.group_field_names + self.aggregate_expressions)
            self.send(TupleMessage(Tuple(header.field_names())), self.consumers)

            for group_key, aggregate_contexts in self.group_contexts.items():

                if self.is_completed():
                    break

                # Group fields first, then each aggregate context's result
                row = list(group_key) + [
                    context.result for context in aggregate_contexts.values()]
                self.send(TupleMessage(Tuple(row)), self.consumers)

        Operator.on_producer_completed(self, producer_name)
示例#7
0
    def __on_receive_tuple(self, tuple_, producer_name):
        """Forward a received tuple to the consumers, dropping repeated headers.

        The first tuple received is stored as the field names and forwarded.
        After that, any tuple for which ``is_header`` is true is dropped and
        every other tuple is forwarded unchanged.

        :param tuple_: The received tuple
        :param producer_name: The producer the tuple came from (not used here)
        :return: None
        """
        if not self.field_names:
            # Collect the field names; they are sent through below
            self.field_names = tuple_
        elif is_header(tuple_):
            return

        self.send(TupleMessage(tuple_), self.consumers)
示例#8
0
    def download_part(self, part, records_queue, stats):
        """Download one range partition and append its records to a shared queue.

        The operator's SQL is extended with a predicate restricting the
        partitioning key to ``self.ranges[part]`` (both bounds inclusive). Each
        returned record is wrapped in a tuple message and appended to
        ``records_queue``; the metrics collected for the part are stored in
        ``stats[part]``.

        :param part: Index of the part to download
        :param records_queue: Container the tuple messages are appended to
        :param stats: Mapping that receives this part's metrics under ``part``
        :return: None
        """
        print('Started downloading part {}'.format(part))

        bounds = self.ranges[part]
        range_predicate = ' and CAST({} AS int) >= {} AND CAST({} AS int) <= {}'.format(
            self.partitioning_key, bounds[0], self.partitioning_key, bounds[1])
        part_sql = self.s3sql + range_predicate

        part_metrics = SQLTableScanMetrics()
        part_metrics.timer_start()

        cur = Cursor(self.query_plan.s3).select(self.s3key, part_sql)
        records = cur.execute()

        part_metrics.query_bytes = cur.query_bytes
        part_metrics.time_to_first_response = part_metrics.elapsed_time()

        for record in records:
            part_metrics.rows_returned += 1
            records_queue.append(TupleMessage(Tuple(record)))

        del records

        # Byte counters and record timings are copied from the cursor after iteration
        part_metrics.bytes_scanned = cur.bytes_scanned
        part_metrics.bytes_processed = cur.bytes_processed
        part_metrics.bytes_returned = cur.bytes_returned
        part_metrics.time_to_first_record_response = cur.time_to_first_record_response
        part_metrics.time_to_last_record_response = cur.time_to_last_record_response

        part_metrics.timer_stop()
        stats[part] = part_metrics

        print('Finished downloading part {} read {} records'.format(part, part_metrics.rows_returned))
示例#9
0
    def download_part(self, part, part_key, records_queue, stats):
        """Download one part object and append its records to a shared queue.

        The operator's SQL is run against ``part_key``. Each returned record is
        wrapped in a tuple message and appended to ``records_queue``; the
        metrics collected for the part are stored in ``stats[part]``.

        :param part: Index of the part, used for reporting and as the stats key
        :param part_key: Key of the object to query
        :param records_queue: Container the tuple messages are appended to
        :param stats: Mapping that receives this part's metrics under ``part``
        :return: None
        """
        print('Started downloading part {} key {}'.format(part, part_key))

        part_metrics = SQLTableScanMetrics()
        part_metrics.timer_start()

        cur = Cursor().select(part_key, self.s3sql)
        records = cur.execute()

        part_metrics.query_bytes = cur.query_bytes
        part_metrics.time_to_first_response = part_metrics.elapsed_time()

        for record in records:
            part_metrics.rows_returned += 1
            records_queue.append(TupleMessage(Tuple(record)))

        del records

        # Byte counters and record timings are copied from the cursor after iteration
        part_metrics.bytes_scanned = cur.bytes_scanned
        part_metrics.bytes_processed = cur.bytes_processed
        part_metrics.bytes_returned = cur.bytes_returned
        part_metrics.time_to_first_record_response = cur.time_to_first_record_response
        part_metrics.time_to_last_record_response = cur.time_to_last_record_response

        part_metrics.timer_stop()
        stats[part] = part_metrics

        print('Finished downloading part {} read {} records'.format(
            part, part_metrics.rows_returned))
示例#10
0
    def join_field_names(self):
        """Send the joined field names tuple: left field names followed by right.

        Nothing is sent unless field names were collected for both sides of
        the join.

        :return: None
        """

        # Field names can only be emitted when both sides supplied theirs
        if self.__l_field_names is None or self.__r_field_names is None:
            return

        joined_names = list(self.__l_field_names) + list(self.__r_field_names)

        if self.log_enabled:
            print("{}('{}') | Sending field names [{}]".format(
                self.__class__.__name__,
                self.name,
                {'field_names': joined_names}))

        self.send(TupleMessage(Tuple(joined_names)), self.consumers)
示例#11
0
    def nested_loop(self):
        """Join the collected left and right tuples with a nested loop and send each match.

        Every left tuple is compared with every right tuple on the join
        expression's fields; when the values are equal the concatenated tuple
        (left followed by right) is sent. Both loops stop as soon as the
        operator is completed.

        :return: None
        """

        # Positions of the join fields. They are the same for every pair, so
        # they are looked up once instead of once per pair. The lookup is
        # deferred until the first pair exists so that an empty side (whose
        # field names may never have arrived) still results in a no-op.
        l_field_index = None
        r_field_index = None

        for l_tuple in self.__l_tuples:

            if self.is_completed():
                break

            for r_tuple in self.__r_tuples:

                if self.is_completed():
                    break

                if l_field_index is None:
                    l_field_index = self.__l_field_names.index(self.join_expr.l_field)
                    r_field_index = self.__r_field_names.index(self.join_expr.r_field)

                if l_tuple[l_field_index] == r_tuple[r_field_index]:
                    t = l_tuple + r_tuple

                    if self.log_enabled:
                        print("{}('{}') | Sending field values [{}]".format(
                            self.__class__.__name__,
                            self.name,
                            {'data': t}))

                    self.op_metrics.rows_joined += 1

                    self.send(TupleMessage(Tuple(t)), self.consumers)
示例#12
0
    def join_field_names(self):
        """Send the joined field names tuple: build field names followed by tuple field names.

        Nothing is sent unless field names were collected for both sides of
        the join.

        :return: None
        """

        # Either side may be missing its field names, since some reads return
        # an empty record set; in that case there is nothing to emit
        if self.build_field_names is None or self.tuple_field_names is None:
            return

        joined_names = list(self.build_field_names) + list(self.tuple_field_names)

        if self.log_enabled:
            print("{} | {}('{}') | Sending field names [{}]".format(
                time.time(), self.__class__.__name__, self.name,
                {'field_names': joined_names}))

        self.send(TupleMessage(Tuple(joined_names)), self.consumers)
示例#13
0
    def start(self):
        """Download every part, stack the records and send the first ten of them.

        With a single part the download runs locally; otherwise one remote
        task per part is launched and the records, part number and metrics of
        each are fetched with ``ray.get``. The collected records are stacked
        with ``np.vstack``, the first ten are sent as tuple messages, the
        operator completes and its stats are written to a file.

        :return: None
        """
        self.op_metrics.timer_start()

        if self.parts != 1:
            pending = [download_part_remote.remote(self.s3sql, part, self.get_part_key('sf1000-lineitem', part))
                       for part in range(self.parts)]

            # Each remote call yields three object ids: records, part number, metrics
            for records_id, part_id, metrics_id in pending:
                part_records = ray.get(records_id)
                part = ray.get(part_id)
                self.worker_metrics[part] = ray.get(metrics_id)
                self.records.append(part_records)
                print('got {} records from part {}'.format(len(part_records), part))
        else:
            self.records, part, local_metrics = download_part_local(
                self.s3sql, 0, self.s3key, self.records, self.worker_metrics)
            self.worker_metrics[part] = local_metrics

        self.records = np.vstack(self.records)
        print("All parts finished")
        print('got {} records'.format(len(self.records)))

        # Only the first ten records are forwarded to the consumers
        for record in self.records[0:10]:
            self.send(TupleMessage(Tuple(record)), self.consumers)

        self.complete()
        self.op_metrics.timer_stop()

        stats_file = self.s3key + '.' + str(self.parts) + '.stats.txt'
        self.print_stats(to_file=stats_file)
示例#14
0
    def __send_field_names(self, tuple_):
        """Wrap the field names tuple in a tuple message and send it to the consumers.

        :param tuple_: The field names tuple
        :return: None
        """

        field_names_message = TupleMessage(tuple_)
        self.send(field_names_message, self.consumers)
示例#15
0
    def __on_receive_tuple(self, tuple_, producer_name):
        """Forward tuples from several producers, sending the field names only once.

        The first tuple ever received is stored as the field names and
        forwarded. The first tuple from each further producer is treated as
        that producer's field names and dropped. Every other tuple is
        forwarded unchanged.

        :param tuple_: The received tuple, which must not be empty
        :param producer_name: The producer the tuple came from
        :return: None
        """

        assert (len(tuple_) > 0)

        if self.field_names is None:
            self.field_names = tuple_
            self.producers_received[producer_name] = True
            self.send(TupleMessage(tuple_), self.consumers)
            return

        if producer_name not in self.producers_received:
            # First tuple from this producer: its field names, already sent once
            self.producers_received[producer_name] = True
            return

        self.send(TupleMessage(tuple_), self.consumers)
示例#16
0
    def send_field_values(self, tuple_):
        """Send a field values tuple to the consumers, logging it when logging is enabled.

        :param tuple_: The field values
        :return: None
        """

        if self.log_enabled:
            operator_label = self.__class__.__name__
            print("{}('{}') | Sending field values [{}]".format(
                operator_label, self.name, {'data': tuple_}))

        values_message = TupleMessage(Tuple(tuple_))
        self.send(values_message, self.consumers)
示例#17
0
    def send_field_names(self, tuple_):
        """Derive default field names for a tuple and send them to the consumers.

        The names are those reported by ``IndexedTuple.build_default`` for the
        given tuple.

        :param tuple_: The tuple the field names are derived from
        :return: None
        """

        field_names = Tuple(IndexedTuple.build_default(tuple_).field_names())

        if self.log_enabled:
            operator_label = self.__class__.__name__
            print("{}('{}') | Sending field names [{}]".format(
                operator_label, self.name, {'field_names': field_names}))

        self.send(TupleMessage(field_names), self.consumers)
示例#18
0
    def join_field_values(self):
        """Probe the hash table with each collected tuple and send the joined tuples.

        For every tuple in ``self.tuples`` the value of the join expression's
        right field is looked up in ``self.hashtable``. Each matching hashed
        tuple is concatenated in front of the probing tuple and sent. The loop
        over the probing tuples stops if the operator is completed.

        :return: None
        """

        # Producers may not have produced anything, in which case there are no
        # field names to join on and nothing to do
        if self.tuple_field_names is None:
            return

        probe_field_index = self.tuple_field_names.index(self.join_expr.r_field)

        for probe_tuple in self.tuples:

            if self.is_completed():
                break

            matches = self.hashtable.get(probe_tuple[probe_field_index])
            if matches is None:
                continue

            for hashed_tuple in matches:

                # Hashed (build side) fields come first
                joined = hashed_tuple + probe_tuple

                if self.log_enabled:
                    print("{} | {}('{}') | Sending field values [{}]".
                          format(time.time(), self.__class__.__name__,
                                 self.name, {'data': joined}))

                self.op_metrics.rows_joined += 1

                self.send(TupleMessage(Tuple(joined)), self.consumers)
示例#19
0
    def on_receive_tuple(self, tuple_):
        """Store a received tuple for sorting, or forward it if it is the field names.

        The first tuple received is kept as the field names and sent straight
        to the consumers. Every later tuple is wrapped together with the field
        names and sort expressions and pushed onto the heap.

        :param tuple_: The received tuple
        :return: None
        """

        if self.field_names:
            # Field values: keep them on the heap until the producer completes
            heappush(
                self.heap,
                HeapSortableTuple(tuple_, self.field_names, self.sort_expressions))
            return

        # Field names: collect them and pass them through
        self.field_names = tuple_
        self.send(TupleMessage(tuple_), self.consumers)
示例#20
0
    def on_producer_completed(self, producer_name):
        """Handles the event when a producer completes. When this happens the sorted tuples are emitted.

        Tuples are popped from the heap and sent one at a time, stopping early
        if the operator is completed. The heap is then released and the event
        is passed on to ``Operator.on_producer_completed``.

        :param producer_name: The producer that completed
        :return: None
        """

        while self.heap:

            if self.is_completed():
                break

            t = heappop(self.heap).tuple
            self.send(TupleMessage(t), self.consumers)

        # Release the stored tuples but keep the attribute. Deleting it made
        # any later completion event fail with AttributeError on the
        # ``while self.heap`` test above.
        self.heap = []

        Operator.on_producer_completed(self, producer_name)
示例#21
0
    def on_receive_tuple(self, tuple_, _producer_name):
        """Add a received tuple to the hash table, keyed on the configured field.

        The first tuple ever received is used to build the field names index
        and is forwarded to the consumers. The first tuple from each further
        producer is treated as that producer's field names and dropped. Every
        other tuple is appended to the list stored under its ``self.key``
        value; the hash table is created on first use.

        :param tuple_: The received tuple
        :param _producer_name: The producer the tuple came from
        :return: None
        """

        if not self.field_names_index:
            self.field_names_index = IndexedTuple.build_field_names_index(tuple_)
            self.send(TupleMessage(tuple_), self.consumers)
            self.producers_received[_producer_name] = True
            return

        if _producer_name not in self.producers_received:
            # First tuple from this producer: its field names, skip
            self.producers_received[_producer_name] = True
            return

        if self.hashtable is None:
            self.hashtable = {}

        self.op_metrics.rows_processed += 1

        indexed = IndexedTuple(tuple_, self.field_names_index)
        self.hashtable.setdefault(indexed[self.key], []).append(tuple_)
示例#22
0
    def start(self):
        """Download all parts, then send the field names and every downloaded record.

        With a single part the download runs in this process using fresh
        ``records`` and ``worker_metrics`` containers. Otherwise one process
        per part is started and all of them are joined before anything is
        sent. Field names derived from the first record are sent ahead of it.
        Afterwards the operator completes, writes its stats to a file and
        empties the records container in place.

        :return: None
        """
        self.op_metrics.timer_start()

        if self.parts != 1:
            workers = []
            for part in range(self.parts):
                worker = Process(target=self.download_part,
                                 args=(part, self.records, self.worker_metrics))
                worker.start()
                workers.append(worker)

            for worker in workers:
                worker.join()
        else:
            self.records = []
            self.worker_metrics = {}
            self.download_part(0, self.records, self.worker_metrics)

        print("All parts finished with {} records".format(len(self.records)))

        header_pending = True
        for msg in self.records:

            if header_pending:
                # Create and send the record field names
                header = IndexedTuple.build_default(msg.tuple_)
                header_pending = False

                if self.log_enabled:
                    print("{}('{}') | Sending field names: {}"
                          .format(self.__class__.__name__, self.name, header.field_names()))

                self.send(TupleMessage(Tuple(header.field_names())), self.consumers)

            self.send(msg, self.consumers)

        self.complete()
        self.op_metrics.timer_stop()

        stats_file = self.s3key + '.' + str(self.parts) + '.stats.txt'
        self.print_stats(to_file=stats_file)

        # Empty the container in place rather than rebinding the attribute
        self.records[:] = []
示例#23
0
    def on_receive_tuple(self, tuple_):
        """Handles the receipt of a tuple. When the number of tuples reaches max it informs the producer to stop
        producing. This allows table scans to stop once tuples limit has been reached. It also informs any consumers
        that it is done producing tuples.

        The first tuple received (the field names) is not counted. Tuples are forwarded while the count is within
        ``max_tuples``; as soon as the count reaches ``max_tuples`` the operator is completed.

        :param tuple_: The received tuple
        :return: None
        """

        if not self.first_tuple:
            self.current += 1
        else:
            self.first_tuple = False

        if self.current <= self.max_tuples:
            self.send(TupleMessage(tuple_), self.consumers)

        # This check used to be an ``elif`` of the ``<=`` test above, which made it unreachable, so the operator
        # never completed when the limit was hit. It is now evaluated on its own, after the final tuple is sent.
        if self.current >= self.max_tuples and not self.is_completed():
            self.complete()
示例#24
0
    def execute_pandas_query(op):
        """Run the operator's query through a pandas cursor and forward every data frame.

        The column names of the first data frame are sent as a field names
        tuple before that frame. Each frame must be non-empty; its length is
        added to the rows returned metric and the frame is sent as is.

        :param op: The operator whose ``s3``, ``s3key`` and ``s3sql`` define the query
        :return: The cursor that executed the query
        """
        cur = PandasCursor(op.s3).select(op.s3key, op.s3sql)
        frames = cur.execute()

        op.op_metrics.query_bytes = cur.query_bytes
        op.op_metrics.time_to_first_response = op.op_metrics.elapsed_time()

        header_pending = True
        for frame in frames:

            assert (len(frame) > 0)

            if header_pending:
                column_names = frame.columns.values
                assert (len(column_names) > 0)
                op.send(TupleMessage(Tuple(column_names)), op.consumers)
                header_pending = False

                if op.log_enabled:
                    print("{}('{}') | Sending field names: {}".format(
                        op.__class__.__name__, op.name, column_names))

            op.op_metrics.rows_returned += len(frame)

            op.send(frame, op.consumers)

        return cur
示例#25
0
            def on_numpy_array(np_array):
                """Convert a received numpy array to a data frame and send it to the consumers.

                On the first array the frame's column labels are sent as a field
                names tuple and the time to first response is recorded. Every
                array adds its row count to the rows returned metric.

                :param np_array: The received numpy array
                :return: None
                """

                df = pd.DataFrame(np_array)

                if closure['first_tuple']:
                    assert (len(df.columns.values) > 0)
                    op.send(TupleMessage(Tuple(df.columns.values)),
                            op.consumers)
                    closure['first_tuple'] = False

                    if op.log_enabled:
                        print("{}('{}') | Sending field names: {}".format(
                            op.__class__.__name__, op.name, df.columns.values))

                    # Recorded for the first array only. It used to be
                    # overwritten on every array, so the metric ended up
                    # holding the time of the last response instead.
                    op.op_metrics.time_to_first_response = op.op_metrics.elapsed_time(
                    )

                op.op_metrics.rows_returned += len(df)

                if op.log_enabled:
                    print("{}('{}') | Sending field values:".format(
                        op.__class__.__name__, op.name))
                    print(df)

                op.send(df, op.consumers)
示例#26
0
    def join_field_values(self):
        """Join the collected left and right tuples with a hash join and send each joined tuple.

        The relation with fewer tuples (the right one on a tie) is hashed on
        its join field; the other relation is then streamed and probed against
        that hash. Joined tuples always carry the left fields first. Nothing
        happens if either relation is empty, and probing stops if the operator
        is completed.

        :return: None
        """

        if len(self.l_tuples) == 0 or len(self.r_tuples) == 0:
            return

        # Stream the larger relation, hash the smaller one
        l_to_r = len(self.l_tuples) > len(self.r_tuples)

        if l_to_r:
            probe_tuples = self.l_tuples
            build_tuples = self.r_tuples
            build_field_name = self.join_expr.r_field
            build_field_names = self.r_field_names
            probe_field_index = self.l_field_names.index(self.join_expr.l_field)
        else:
            probe_tuples = self.r_tuples
            build_tuples = self.l_tuples
            build_field_name = self.join_expr.l_field
            build_field_names = self.l_field_names
            probe_field_index = self.r_field_names.index(self.join_expr.r_field)

        # Hash the tuples of the smaller relation on the join field
        buckets = {}
        for build_tuple in build_tuples:
            indexed = IndexedTuple.build(build_tuple, build_field_names)
            buckets.setdefault(indexed[build_field_name], []).append(build_tuple)

        for probe_tuple in probe_tuples:

            if self.is_completed():
                break

            matches = buckets.get(probe_tuple[probe_field_index])
            if matches is None:
                continue

            for build_tuple in matches:

                # Keep left fields ahead of right fields whichever side was hashed
                if l_to_r:
                    joined = probe_tuple + build_tuple
                else:
                    joined = build_tuple + probe_tuple

                self.op_metrics.rows_joined += 1

                self.send(TupleMessage(Tuple(joined)), self.consumers)
示例#27
0
    def on_receive_tuple(self, tuple_, producer_name):
        """Project a received tuple through the projection expressions and send the result.

        The first tuple ever received is used to build the field names index;
        the new field names from the projection expressions are sent in its
        place. The first tuple from each further producer is treated as that
        producer's field names and dropped. Every other tuple is evaluated
        against each projection expression and the resulting values are sent.

        :param tuple_: The received tuple, which must not be empty
        :param producer_name: The producer the tuple came from
        :return: None
        """

        assert (len(tuple_) > 0)

        if not self.field_names_index:

            self.field_names_index = IndexedTuple.build_field_names_index(
                tuple_)

            # The outgoing field names are the new names given by the expressions
            new_field_names = [
                expression.new_field_name for expression in self.project_exprs]

            if self.log_enabled:
                print(
                    "{}('{}') | Sending projected field names: from: {} to: {}"
                    .format(self.__class__.__name__, self.name, tuple_,
                            new_field_names))

            self.producers_received[producer_name] = True

            assert (len(new_field_names) == len(self.project_exprs))

            self.send(TupleMessage(Tuple(new_field_names)), self.consumers)
            return

        assert (len(tuple_) == len(self.field_names_index))

        if producer_name not in self.producers_received:
            # First tuple from this producer: its field names, skip it
            self.producers_received[producer_name] = True
            return

        # Evaluate every projection expression against the indexed tuple
        indexed = IndexedTuple(tuple_, self.field_names_index)
        new_field_values = [
            expression.expr(indexed) for expression in self.project_exprs]

        self.op_metrics.rows_projected += 1

        if self.log_enabled:
            print(
                "{}('{}') | Sending projected field values: from: {} to: {}"
                .format(self.__class__.__name__, self.name, tuple_,
                        new_field_values))

        assert (len(new_field_values) == len(self.project_exprs))

        self.send(TupleMessage(Tuple(new_field_values)), self.consumers)
示例#28
0
    def execute_pandas_query(op):
        """Execute the operator's cursor and forward every data frame it yields.

        The column names of the first data frame are sent as a field names
        tuple before that frame; each frame is then sent as a data frame
        message. When logging is enabled a dot is written per frame and the
        running row count is printed every 100 frames. After iteration the
        cursor's byte counters, record timings and HTTP GET count are copied
        into the operator's metrics, and the operator is completed if it is
        not already.

        :param op: The operator whose ``cur`` cursor is executed
        :return: The operator's cursor
        """

        dfs = op.cur.execute()
        op.op_metrics.query_bytes = op.cur.query_bytes
        op.op_metrics.time_to_first_response = op.op_metrics.elapsed_time()

        header_pending = True

        # frame_number starts at 1 so the progress line prints on every 100th frame
        for frame_number, df in enumerate(dfs, 1):

            if header_pending:
                assert (len(df.columns.values) > 0)
                op.send(TupleMessage(Tuple(df.columns.values)), op.consumers)
                header_pending = False

            op.op_metrics.rows_returned += len(df)

            if op.log_enabled:
                sys.stdout.write('.')
                if frame_number % 100 == 0:
                    print("Rows {}".format(op.op_metrics.rows_returned))

            op.send(DataFrameMessage(df), op.consumers)

        op.op_metrics.bytes_scanned = op.cur.bytes_scanned
        op.op_metrics.bytes_processed = op.cur.bytes_processed
        op.op_metrics.bytes_returned = op.cur.bytes_returned
        op.op_metrics.time_to_first_record_response = op.cur.time_to_first_record_response
        op.op_metrics.time_to_last_record_response = op.cur.time_to_last_record_response
        op.op_metrics.num_http_get_requests = op.cur.num_http_get_requests

        if not op.is_completed():
            op.complete()
        return op.cur