def on_producer_completed(self, producer_name):
    """Marks the given producer as finished and, once every producer has
    completed, sends the built hashtable downstream.

    :param producer_name: The producer that completed.
    :return: None
    """
    self.producer_completions[producer_name] = True

    if not self.is_completed() and all(self.producer_completions.values()):
        if self.log_enabled:
            print("{}('{}') | Hashtable is:\n py: {}, pandas: {}".format(
                self.__class__.__name__, self.name, self.hashtable,
                self.hashtable_df))

        if self.hashtable_df is not None:
            # Index the dataframe on the join key before handing it off,
            # then release the local reference.
            self.hashtable_df = self.hashtable_df.set_index(self.key)
            self.send(HashTableMessage(self.hashtable_df), self.consumers)
            del self.hashtable_df
        elif self.hashtable is not None:
            self.send(HashTableMessage(self.hashtable), self.consumers)
            del self.hashtable
        # Receiving no tuples at all is a legitimate state — it simply means
        # an empty hash table — so no exception is raised when neither
        # container is populated.

    Operator.on_producer_completed(self, producer_name)
def on_producer_completed(self, producer_name):
    """Event handler for a producer completion event.

    Once all producers have completed, the final aggregate (or the buffered
    field names/values) is sent to consumers.

    :param producer_name: The producer that completed.
    :return: None
    """
    if producer_name in self.producer_completions.keys():
        self.producer_completions[producer_name] = True

    if self.use_pandas:
        if not self.is_completed() and all(self.producer_completions.values()):
            # Bug fix: was `len(self.agg_df > 0)`, which takes the length of
            # the boolean comparison frame (always equal to len(agg_df))
            # instead of testing whether any rows were accumulated.
            if len(self.agg_df) > 0:
                self.send(DataFrameMessage(self.agg_df.agg(['sum'])),
                          self.consumers)
            else:
                # No tuples were received; emit an empty frame so consumers
                # still get a completion payload.
                self.send(DataFrameMessage(pd.DataFrame()), self.consumers)
    else:
        if not self.is_completed() and all(self.producer_completions.values()):
            # Build and send the field names
            field_names = self.__build_field_names()
            self.send(TupleMessage(Tuple(field_names)), self.consumers)

            # Send the field values, if there are any
            if self.__expression_contexts is not None:
                field_values = self.__build_field_values()
                self.send(TupleMessage(Tuple(field_values)), self.consumers)

    Operator.on_producer_completed(self, producer_name)
def on_producer_completed(self, producer_name):
    """Handles the event where the producer has completed producing all the
    tuples it will produce. Once this occurs the tuples can be sent to
    consumers downstream.

    :param producer_name: The producer that has completed
    :return: None
    :raises Exception: If the producer is not one registered in
        producer_completions.
    """
    if producer_name in self.producer_completions.keys():
        self.producer_completions[producer_name] = True
    else:
        raise Exception(
            "Unrecognized producer {} has completed".format(producer_name))

    # Nothing is emitted until every registered producer has finished.
    # NOTE(review): returning here also skips the base-class completion
    # call below for all but the last producer — confirm this is intended.
    is_all_producers_done = all(self.producer_completions.values())
    if not is_all_producers_done:
        return

    if not self.use_pandas:
        # Send the field names
        lt = IndexedTuple.build_default(self.group_field_names +
                                        self.aggregate_expressions)
        self.send(TupleMessage(Tuple(lt.field_names())), self.consumers)

        # Emit one tuple per group: the group key fields followed by the
        # aggregate results.
        for group_tuple, group_aggregate_contexts in self.group_contexts.items(
        ):

            if self.is_completed():
                break

            # Convert the aggregate contexts to their results
            group_fields = list(group_tuple)

            group_aggregate_values = list(
                v.result for v in group_aggregate_contexts.values())

            t_ = group_fields + group_aggregate_values
            self.send(TupleMessage(Tuple(t_)), self.consumers)
    else:
        # for groupby_reducer, aggregate one more time (partial aggregates
        # from multiple producers need a final combine pass).
        if not self.is_completed() and len(self.producers) > 1:
            self.aggregate_df = self.pd_expr(self.aggregate_df)

        if not self.is_completed() and self.aggregate_df is not None:
            self.aggregate_df.reset_index(drop=True, inplace=True)
            self.send(DataFrameMessage(self.aggregate_df), self.consumers)
            # Release the accumulated frame once it has been handed off.
            del self.aggregate_df

    Operator.on_producer_completed(self, producer_name)
def on_producer_completed(self, producer_name):
    """Records the producer's completion and, once all producers are done,
    logs the hashtable when logging is enabled.

    :param producer_name: The producer that completed.
    :return: None
    """
    self.producer_completions[producer_name] = True

    everyone_done = all(self.producer_completions.values())
    if everyone_done and self.log_enabled:
        print("{}{}".format(self, self.hashtable))

    Operator.on_producer_completed(self, producer_name)
def on_producer_completed(self, producer_name):
    """Handles completion of either a build producer (hashtable side) or a
    tuple producer (probe side) of the join.

    Once all build producers are done the join is enabled; once all
    producers are done the buffered tuples are joined and emitted.

    :param producer_name: The producer that has completed
    :return: None
    :raises Exception: If the producer is not a registered build or tuple
        producer, or if all producers finished without a hashtable arriving.
    """
    if producer_name in self.build_producers.keys():
        self.build_producer_completions[producer_name] = True
    elif producer_name in self.tuple_producers.keys():
        self.tuple_producer_completions[producer_name] = True
    else:
        raise Exception(
            "Unrecognized producer {} has completed".format(producer_name))

    # Check that we have received a completed event from all the producers
    is_all_build_producers_done = all(
        self.build_producer_completions.values())
    is_all_producers_done = is_all_build_producers_done and \
        all(self.tuple_producer_completions.values())

    if is_all_build_producers_done:
        # All build producers complete: probe tuples may now be joined.
        self.do_join = True

    if is_all_producers_done and not self.is_completed():

        if self.hashtable_df is not None:
            # Pandas path: join via dataframes.
            if not self.field_names_joined:
                self.join_field_names()
                self.field_names_joined = True
            self.join_field_values_pd()
        elif self.hashtable is not None:
            # Plain-Python path: join via the dict-based hashtable.
            if not self.field_names_joined:
                self.join_field_names()
                self.field_names_joined = True
            self.join_field_values()
        else:
            raise Exception(
                "All producers done but have not received a hashtable")

        # Release buffered state now that the join output has been produced.
        self.hashtable_df = None
        self.tuples_df = None

    Operator.on_producer_completed(self, producer_name)
def connect(self, consumer, tag=0):
    """Overrides the generic connect method to make sure that the connecting
    operator is an operator that consumes bloom filters.

    :param consumer: The consumer to connect
    :param tag: Connection tag (accepted for interface compatibility; not
        forwarded to the base connect call)
    :return: None
    :raises Exception: If the consumer is not a SQLTableScanBloomUse.
    """
    # Idiom fix: use isinstance rather than an exact type() identity check,
    # so subclasses of SQLTableScanBloomUse — which are equally capable of
    # consuming bloom filters — are accepted too.
    if not isinstance(consumer, SQLTableScanBloomUse):
        raise Exception(
            "Illegal consumer. {} operator may only be connected to {} operators"
            .format(self.__class__.__name__, SQLTableScanBloomUse.__name__))

    Operator.connect(self, consumer)
def on_producer_completed(self, producer_name):
    """Overridden so the normal operator completion procedure does not run.

    This operator should only complete when all tuples have been retrieved
    or when consumers indicate they need no more tuples; producer completion
    instead triggers the operator to start.

    :param producer_name: The completed producer
    :return: None
    """
    if producer_name in self.producer_completions:
        self.producer_completions[producer_name] = True

    everyone_done = all(self.producer_completions.values())
    if everyone_done:
        self.start()

    Operator.on_producer_completed(self, producer_name)
def on_producer_completed(self, producer_name):
    """Event handler for a completed producer. Once every producer has
    finished, the bloom filter is built from the buffered tuples and sent.

    :param producer_name: The producer that completed.
    :return: None
    """
    self.producer_completions[producer_name] = True

    if all(self.producer_completions.values()):
        # The longest SQL string among the bloom-using consumers bounds how
        # much room remains for the serialized filter.
        bloom_use_operators = [
            o for o in self.consumers if isinstance(o, SQLTableScanBloomUse)
        ]
        bloom_use_sql_strings = [o.s3sql for o in bloom_use_operators]
        max_bloom_use_sql_strings = max(
            len(s) for s in bloom_use_sql_strings)

        # Build bloom filter, relaxing the false-positive rate if the
        # requested one would not fit in an S3 Select expression.
        best_possible_fp_rate = SlicedSQLBloomFilter.calc_best_fp_rate(
            len(self.__tuples), max_bloom_use_sql_strings)

        if best_possible_fp_rate > self.fp_rate:
            print("{}('{}') | Bloom filter fp rate ({}) too low, "
                  "will exceed max S3 Select SQL expression length ({}). "
                  "Raising to best possible ({})".format(
                      self.__class__.__name__, self.name, self.fp_rate,
                      MAX_S3_SELECT_EXPRESSION_LEN, best_possible_fp_rate))
            fp_rate_to_use = best_possible_fp_rate
        else:
            fp_rate_to_use = self.fp_rate

        bloom_filter = self.build_bloom_filter(len(self.__tuples),
                                               fp_rate_to_use)

        for t in self.__tuples:
            lt = IndexedTuple.build(t, self.__field_names)
            bloom_filter.add(int(lt[self.bloom_field_name]))

        # The buffered tuples are no longer needed once folded into the
        # filter.
        del self.__tuples

        # Send the bloom filter
        self.__send_bloom_filter(bloom_filter)

    Operator.on_producer_completed(self, producer_name)
def connect_right_producer(self, producer):
    """Connects a producer as the producer of right tuples in the join
    expression.

    :param producer: The right producer
    :return: None
    :raises Exception: If a right producer is already connected, or if this
        producer is already the left producer.
    """
    if self.__r_producer_name is not None:
        raise Exception("Only 1 right Producer can be added. Right producer '{}' already added"
                        .format(self.__r_producer_name))

    # Bug fix: compare names with '==' rather than 'is'. Identity comparison
    # of strings only succeeds when CPython happens to intern both values,
    # so 'is' could silently miss a duplicate producer.
    if producer.name == self.__l_producer_name:
        raise Exception("Producer cannot be added as both right and left producer. "
                        "Producer '{}' already added as left producer"
                        .format(self.__l_producer_name))

    self.__r_producer_name = producer.name

    Operator.connect(producer, self)
def on_producer_completed(self, producer_name):
    """Handles the event when a producer completes. When this happens the
    sorted tuples are emitted in heap order.

    :param producer_name: The producer that completed
    :return: None
    """
    while self.heap and not self.is_completed():
        smallest = heappop(self.heap).tuple
        self.send(TupleMessage(smallest), self.consumers)

    # The heap is no longer needed once drained; free it.
    del self.heap

    Operator.on_producer_completed(self, producer_name)
def on_producer_completed(self, producer_name):
    """Handles the event where a producer has completed producing all the
    tuples it will produce. Note that the Join operator may have multiple
    producers. Once all producers are complete the operator can send the
    tuples it contains to downstream consumers.

    :param producer_name: The producer that has completed
    :return: None
    :raises Exception: If the producer is neither the left nor the right
        producer.
    """
    if producer_name == self.l_producer_name:
        self.l_producer_completed = True
    elif producer_name == self.r_producer_name:
        self.r_producer_completed = True
    else:
        raise Exception(
            "Unrecognized producer {} has completed".format(producer_name))

    # Check that we have received a completed event from all the producers.
    # Idiom fix: logical 'and' replaces bitwise '&'; both flags are plain
    # bools so the value is identical, but 'and' short-circuits and states
    # the intent.
    is_all_producers_done = (self.l_producer_completed
                             and self.r_producer_completed)

    if self.log_enabled:
        print("{}('{}') | Producer completed [{}]".format(
            self.__class__.__name__, self.name, {
                'completed_producer': producer_name,
                'all_producers_completed': is_all_producers_done
            }))

    if is_all_producers_done and not self.is_completed():
        # Join and send the field names first
        self.join_field_names()

        # Join and send the joined data tuples
        self.join_field_values()

    Operator.on_producer_completed(self, producer_name)
def on_producer_completed(self, producer_name):
    """Emits the globally accumulated top-k dataframe once every producer
    has signalled completion.

    :param producer_name: The producer that completed
    :return: None
    :raises Exception: If the producer is not a registered producer.
    """
    if producer_name not in self.producer_completions:
        raise Exception(
            "Unrecognized producer {} has completed".format(producer_name))
    self.producer_completions[producer_name] = True

    # Wait until every producer has finished before emitting anything
    # (the early return deliberately defers the base completion call too,
    # matching the original control flow).
    if not all(self.producer_completions.values()):
        return

    # NOTE: The global dataframe is kept ordered as each producer's frame
    # arrives, so no final re-sort/nsmallest/nlargest pass is performed here.
    self.send(DataFrameMessage(self.global_topk_df), self.consumers)

    Operator.on_producer_completed(self, producer_name)
def on_producer_completed(self, producer_name):
    """Start accessing S3 only after the producer (index scan) finishes.

    :param producer_name: The producer that completed
    :return: None
    """
    # NOTE(review): 'self' is passed as an explicit argument to a bound
    # method call, so execute_pandas_query receives the operator twice
    # (as the receiver and as a positional argument). This only works if
    # execute_pandas_query is declared to take the operator as a parameter
    # (e.g. a static/free function) — confirm against its definition.
    self.execute_pandas_query(self)

    Operator.on_producer_completed(self, producer_name)
def on_producer_completed(self, producer_name):
    """Event handler for a producer completion event.

    No operator-specific work is needed here; completion handling is
    delegated entirely to the base Operator implementation.

    :param producer_name: The producer that completed
    :return: None
    """
    Operator.on_producer_completed(self, producer_name)
def connect_bloom_consumer(self, consumer):
    """Registers a consumer that should receive the bloom filter and then
    connects it via the generic Operator connect mechanism.

    :param consumer: The operator to receive the bloom filter
    :return: None
    """
    self.__bloom_consumers.append(consumer)
    Operator.connect(self, consumer)