Пример #1
0
    def on_producer_completed(self, producer_name):
        """Marks the given producer as done and, once every producer has
        finished, sends the built hash table to all consumers.

        :param producer_name: Name of the producer that completed
        :return: None
        """

        self.producer_completions[producer_name] = True

        all_producers_done = all(self.producer_completions.values())
        if all_producers_done and not self.is_completed():

            if self.log_enabled:
                print("{}('{}') | Hashtable is:\n py: {}, pandas: {}".format(
                    self.__class__.__name__,
                    self.name,
                    self.hashtable,
                    self.hashtable_df))

            # Prefer the pandas hash table when present, otherwise fall back to
            # the plain python one. Neither being set is a legitimate state: it
            # simply means no tuples were received (an empty hash table).
            if self.hashtable_df is not None:
                self.hashtable_df = self.hashtable_df.set_index(self.key)
                self.send(HashTableMessage(self.hashtable_df), self.consumers)
                del self.hashtable_df
            elif self.hashtable is not None:
                self.send(HashTableMessage(self.hashtable), self.consumers)
                del self.hashtable

        Operator.on_producer_completed(self, producer_name)
Пример #2
0
    def on_producer_completed(self, producer_name):
        """Event handler for a producer completion event.

        Once all producers are done, sends the aggregation result downstream:
        a summed dataframe (pandas mode) or field-name/field-value tuples.

        :param producer_name: The producer that completed.
        :return: None
        """

        if producer_name in self.producer_completions.keys():
            self.producer_completions[producer_name] = True
        if self.use_pandas:
            if not self.is_completed() and all(self.producer_completions.values()):
                # Bug fix: the parenthesis was misplaced — `len(self.agg_df > 0)`
                # built an elementwise comparison frame (and would raise for
                # non-numeric columns) just to take its length. Test the row
                # count directly instead.
                if len(self.agg_df) > 0:
                    self.send(DataFrameMessage(self.agg_df.agg(['sum'])), self.consumers)
                else:
                    self.send(DataFrameMessage(pd.DataFrame()), self.consumers)
        else:
            if not self.is_completed() and all(self.producer_completions.values()):
                # Build and send the field names
                field_names = self.__build_field_names()
                self.send(TupleMessage(Tuple(field_names)), self.consumers)

                # Send the field values, if there are any
                if self.__expression_contexts is not None:
                    field_values = self.__build_field_values()
                    self.send(TupleMessage(Tuple(field_values)), self.consumers)

        Operator.on_producer_completed(self, producer_name)
Пример #3
0
    def on_producer_completed(self, producer_name):
        """Handles the event where the producer has completed producing all the tuples it will produce. Once this
        occurs the tuples can be sent to consumers downstream.

        Tuple mode emits a field-name header followed by one tuple per group;
        pandas mode emits the accumulated aggregate dataframe in a single
        message.

        :param producer_name: The producer that has completed
        :return: None
        """
        if producer_name in self.producer_completions.keys():
            self.producer_completions[producer_name] = True
        else:
            raise Exception(
                "Unrecognized producer {} has completed".format(producer_name))

        # NOTE(review): returning early here skips the
        # Operator.on_producer_completed call at the bottom until the LAST
        # producer finishes — confirm the base handler is meant to fire only
        # once, unlike the other operators in this file.
        is_all_producers_done = all(self.producer_completions.values())
        if not is_all_producers_done:
            return

        if not self.use_pandas:
            # Send the field names
            lt = IndexedTuple.build_default(self.group_field_names +
                                            self.aggregate_expressions)
            self.send(TupleMessage(Tuple(lt.field_names())), self.consumers)

            for group_tuple, group_aggregate_contexts in self.group_contexts.items(
            ):

                # Stop emitting as soon as consumers signal completion.
                if self.is_completed():
                    break

                # Convert the aggregate contexts to their results
                group_fields = list(group_tuple)

                group_aggregate_values = list(
                    v.result for v in group_aggregate_contexts.values())

                # Each emitted tuple is the group key fields followed by the
                # aggregate results for that group.
                t_ = group_fields + group_aggregate_values
                self.send(TupleMessage(Tuple(t_)), self.consumers)
        else:
            # for groupby_reducer, aggregate one more time.
            if not self.is_completed() and len(self.producers) > 1:
                self.aggregate_df = self.pd_expr(self.aggregate_df)

            if not self.is_completed() and self.aggregate_df is not None:
                self.aggregate_df.reset_index(drop=True, inplace=True)

                # if self.log_enabled:
                #     with pd.option_context('display.max_rows', None, 'display.max_columns', None):
                #         print("{}('{}') | Sending grouped field values: \n{}"
                #               .format(self.__class__.__name__, self.name, self.aggregate_df))

                #self.send(TupleMessage(Tuple(list(self.aggregate_df))), self.consumers)
                self.send(DataFrameMessage(self.aggregate_df), self.consumers)

                # Release the (potentially large) dataframe once sent.
                del self.aggregate_df

        Operator.on_producer_completed(self, producer_name)
Пример #4
0
    def on_producer_completed(self, producer_name):
        """Event handler for a completed producer. Logs the hash table once
        all producers have finished.

        :param producer_name: The producer that completed
        :return: None
        """

        self.producer_completions[producer_name] = True

        if all(self.producer_completions.values()):

            if self.log_enabled:
                print("{}{}".format(self, self.hashtable))

        # Bug fix: the base completion handler was nested inside the
        # all-producers-done branch, so completion events arriving before the
        # last producer finished were never propagated. Call it
        # unconditionally, matching every other operator.
        Operator.on_producer_completed(self, producer_name)
Пример #5
0
    def on_producer_completed(self, producer_name):
        """Handles completion of either a build-side or a tuple-side producer.

        When all build producers are done the join is enabled; when ALL
        producers (build and tuple) are done the join is performed and the
        results sent downstream.

        :param producer_name: The producer that completed
        :return: None
        :raises Exception: If the producer is not a registered build or tuple
            producer, or if all producers finished without a hash table being
            received.
        """

        if producer_name in self.build_producers.keys():
            self.build_producer_completions[producer_name] = True
        elif producer_name in self.tuple_producers.keys():
            self.tuple_producer_completions[producer_name] = True
        else:
            raise Exception(
                "Unrecognized producer {} has completed".format(producer_name))

        # Check that we have received a completed event from all the producers
        is_all_build_producers_done = all(
            self.build_producer_completions.values())

        is_all_producers_done = is_all_build_producers_done and \
                                all(self.tuple_producer_completions.values())

        if is_all_build_producers_done:
            # if self.log_enabled:
            #     print("{} | {}('{}') | All build producers complete, enabling join".format(
            #         time.time(),
            #         self.__class__.__name__,
            #         self.name))

            # Need to build index here rather than build job
            # if self.hashtable_df is not None:
            #     self.hashtable_df.set_index(self.join_expr.l_field, inplace=True, drop=False)

            # Flag read elsewhere to allow tuple-side joining to begin.
            self.do_join = True

        if is_all_producers_done and not self.is_completed():

            # The pandas path and the plain-python path are mutually
            # exclusive; field names are joined at most once per operator.
            if self.hashtable_df is not None:
                if not self.field_names_joined:
                    self.join_field_names()
                    self.field_names_joined = True
                self.join_field_values_pd()
            elif self.hashtable is not None:
                if not self.field_names_joined:
                    self.join_field_names()
                    self.field_names_joined = True
                self.join_field_values()
            else:
                raise Exception(
                    "All producers done but have not received a hashtable")

            # Drop references so the joined data can be garbage collected.
            self.hashtable_df = None
            self.tuples_df = None

        Operator.on_producer_completed(self, producer_name)
Пример #6
0
    def connect(self, consumer, tag=0):
        """Overrides the generic connect method so that only operators able
        to consume bloom filters can be attached downstream.

        :param consumer: The consumer to connect
        :param tag: Connection tag (unused by this override)
        :return: None
        :raises Exception: If the consumer is not a SQLTableScanBloomUse.
        """

        # Exact type check (not isinstance) — only the concrete bloom-use
        # scan operator is accepted.
        if type(consumer) is not SQLTableScanBloomUse:
            message = "Illegal consumer. {} operator may only be connected to {} operators" \
                .format(self.__class__.__name__, SQLTableScanBloomUse.__name__)
            raise Exception(message)

        Operator.connect(self, consumer)
Пример #7
0
    def on_producer_completed(self, producer_name):
        """This event is overridden because we don't want the normal operator
        completion procedure to run. We want this operator to complete when
        all the tuples have been retrieved or consumers indicate they need no
        more tuples.

        :param producer_name: The completed producer
        :return: None
        """

        if producer_name in self.producer_completions:
            self.producer_completions[producer_name] = True

        every_producer_done = all(self.producer_completions.values())
        if every_producer_done:
            # All upstream producers finished — kick off this operator's work.
            self.start()

        Operator.on_producer_completed(self, producer_name)
Пример #8
0
    def on_producer_completed(self, producer_name):
        """Event handler for a completed producer. When producers complete the bloom filter can be sent.

        Builds a bloom filter sized to keep the generated S3 Select SQL below
        the expression length limit, populates it from the collected tuples,
        and sends it to the bloom-consuming operators.

        :param producer_name: The producer that completed.
        :return: None
        """

        self.producer_completions[producer_name] = True

        if all(self.producer_completions.values()):

            # Get the SQL from the bloom use operators. Comprehensions replace
            # the lambda-wrapped filter/map chain: same result, clearer, and a
            # real list rather than a one-shot iterator on Python 3.
            bloom_use_operators = [o for o in self.consumers
                                   if isinstance(o, SQLTableScanBloomUse)]
            max_bloom_use_sql_strings = max(
                len(o.s3sql) for o in bloom_use_operators)

            # Build bloom filter
            best_possible_fp_rate = SlicedSQLBloomFilter.calc_best_fp_rate(
                len(self.__tuples), max_bloom_use_sql_strings)

            # A too-low fp rate would make the serialized filter exceed the
            # S3 Select SQL length limit; raise it to the best feasible rate.
            if best_possible_fp_rate > self.fp_rate:
                print("{}('{}') | Bloom filter fp rate ({}) too low, "
                      "will exceed max S3 Select SQL expression length ({}). "
                      "Raising to best possible ({})".format(
                          self.__class__.__name__, self.name, self.fp_rate,
                          MAX_S3_SELECT_EXPRESSION_LEN, best_possible_fp_rate))
                fp_rate_to_use = best_possible_fp_rate
            else:
                fp_rate_to_use = self.fp_rate

            bloom_filter = self.build_bloom_filter(len(self.__tuples),
                                                   fp_rate_to_use)

            for t in self.__tuples:
                lt = IndexedTuple.build(t, self.__field_names)
                bloom_filter.add(int(lt[self.bloom_field_name]))

            # Release the collected tuples once folded into the filter.
            del self.__tuples

            # Send the bloom filter
            self.__send_bloom_filter(bloom_filter)

        Operator.on_producer_completed(self, producer_name)
Пример #9
0
    def connect_right_producer(self, producer):
        """Connects a producer as the producer of right tuples in the join expression

        :param producer: The right producer
        :return: None
        :raises Exception: If a right producer is already connected, or if the
            producer is already connected as the left producer.
        """

        if self.__r_producer_name is not None:
            raise Exception("Only 1 right Producer can be added. Right producer '{}' already added"
                            .format(self.__r_producer_name))

        # Bug fix: names must be compared with '==' (value equality); 'is'
        # tests object identity and only works by accident when strings happen
        # to be interned.
        if producer.name == self.__l_producer_name:
            raise Exception("Producer cannot be added as both right and left producer. "
                            "Producer '{}' already added as left producer"
                            .format(self.__l_producer_name))

        self.__r_producer_name = producer.name
        Operator.connect(producer, self)
Пример #10
0
    def on_producer_completed(self, producer_name):
        """Handles the event when a producer completes. When this happens the
        sorted tuples are emitted.

        :param producer_name: The producer that completed
        :return: None
        """

        # Drain the heap in sorted order, stopping early if consumers have
        # signalled completion.
        while self.heap and not self.is_completed():
            sorted_tuple = heappop(self.heap).tuple
            self.send(TupleMessage(sorted_tuple), self.consumers)

        del self.heap

        Operator.on_producer_completed(self, producer_name)
Пример #11
0
    def on_producer_completed(self, producer_name):
        """Handles the event where a producer has completed producing all the tuples it will produce. Note that the
        Join operator may have multiple producers. Once all producers are complete the operator can send the tuples
        it contains to downstream consumers.

        :param producer_name: The producer that has completed
        :return: None
        :raises Exception: If the producer is neither the left nor the right
            producer.
        """

        if producer_name == self.l_producer_name:
            self.l_producer_completed = True
        elif producer_name == self.r_producer_name:
            self.r_producer_completed = True
        else:
            raise Exception(
                "Unrecognized producer {} has completed".format(producer_name))

        # Check that we have received a completed event from all the producers.
        # Bug fix: use logical 'and' rather than bitwise '&' — the intent is a
        # short-circuiting boolean conjunction, not bit arithmetic.
        is_all_producers_done = self.l_producer_completed and self.r_producer_completed

        if self.log_enabled:
            print("{}('{}') | Producer completed [{}]".format(
                self.__class__.__name__, self.name, {
                    'completed_producer': producer_name,
                    'all_producers_completed': is_all_producers_done
                }))

        if is_all_producers_done and not self.is_completed():

            # Join and send the field names first
            self.join_field_names()

            # Join and send the joined data tuples
            self.join_field_values()

        Operator.on_producer_completed(self, producer_name)
Пример #12
0
    def on_producer_completed(self, producer_name):
        """Marks a producer as complete and, once every producer has finished,
        emits the accumulated global top-k dataframe to consumers.

        :param producer_name: The producer that completed
        :return: None
        :raises Exception: If the producer is not recognized.
        """

        if producer_name not in self.producer_completions:
            raise Exception(
                "Unrecognized producer {} has completed".format(producer_name))
        self.producer_completions[producer_name] = True

        if not all(self.producer_completions.values()):
            return

        # The global dataframe is (presumably) already kept sorted as each
        # partial result arrives, so it can be forwarded as-is here.
        self.send(DataFrameMessage(self.global_topk_df), self.consumers)

        Operator.on_producer_completed(self, producer_name)
Пример #13
0
    def on_producer_completed(self, producer_name):
        """Start accessing S3 only after the producer (index scan) finishes.

        :param producer_name: The producer that completed
        :return: None
        """
        # NOTE(review): `self` is also passed explicitly here, so if
        # execute_pandas_query is an ordinary bound method it receives this
        # operator twice (receiver and argument) — confirm it is a plain
        # function stored on the class/instance, or whether the argument is
        # intentional.
        self.execute_pandas_query(self)

        Operator.on_producer_completed(self, producer_name)
Пример #14
0
 def on_producer_completed(self, producer_name):
     """Delegates producer-completion handling to the base Operator class;
     this operator needs no extra work on completion.

     :param producer_name: The producer that completed
     :return: None
     """
     Operator.on_producer_completed(self, producer_name)
Пример #15
0
 def connect_bloom_consumer(self, consumer):
     """Registers a consumer that should receive the bloom filter, then
     performs the generic operator connection.

     :param consumer: The consumer to connect
     :return: None
     """
     # Record the bloom recipient before wiring the generic connection.
     self.__bloom_consumers.append(consumer)
     Operator.connect(self, consumer)