def load_udf(self, module_name, func_name):
    """Import ``module_name`` and return the object it binds to ``func_name``.

    Args:
        module_name: Name of the module to import (dotted names allowed).
        func_name: Name to look up in the imported module's namespace.

    Returns:
        The object stored under ``func_name`` in the module's ``__dict__``.
        If the lookup fails and ``self.close_controller`` returns instead of
        terminating the process, the result is implicitly ``None``.

    Any failure while importing or looking up the name is reported with
    ``write_user_exception`` on ``self.stream_error`` and followed by
    ``self.close_controller(-1)``.
    """
    try:
        # No explicit ``level`` argument: Python 2 already defaults to -1,
        # whereas Python 3 rejects -1 with ValueError, which the handler
        # below would then misreport as a failure in user code.
        module = __import__(module_name, globals(), locals(), [func_name])
        return module.__dict__[func_name]
    except:
        # These errors should always be caused by user code.
        write_user_exception(module_name, self.stream_error, NUM_LINES_OFFSET_TRACE)
        self.close_controller(-1)
def load_udaf(self, module_name, class_name, func_name):
    """Return attribute ``func_name`` of the cached aggregate-function instance.

    When ``self.udaf_instance`` is ``None``, the class ``class_name`` is
    imported from ``module_name``, instantiated with no arguments and stored
    on ``self.udaf_instance``. Otherwise the stored instance is reused and
    no import takes place.

    Any failure is reported with ``write_user_exception`` on
    ``self.stream_error`` and followed by ``self.close_controller(-1)``; if
    that call returns, the result is implicitly ``None``.
    """
    try:
        if self.udaf_instance is None:
            udaf_module = __import__(module_name, globals(), locals(), [class_name])
            udaf_class = udaf_module.__dict__[class_name]
            self.udaf_instance = udaf_class()
        return getattr(self.udaf_instance, func_name)
    except:
        # These errors should always be caused by user code.
        write_user_exception(module_name, self.stream_error, NUM_LINES_OFFSET_TRACE)
        self.close_controller(-1)
def process_input(self, func_name, func, input_str):
    """Evaluate ``func`` for one serialized record and write the result.

    Steps, in order:
      1. Deserialize ``input_str`` with ``deserialize_input``; on failure the
         error is reported and ``self.close_controller(-3)`` is called.
      2. Call ``func``: with no arguments when ``func_name`` equals
         ``GET_PARTIAL_RESULT_FUNC`` or ``GET_FINAL_RESULT_FUNC``, otherwise
         with the deserialized inputs unpacked. The partial result is encoded
         with ``json.dumps``; every other result goes through
         ``serialize_output`` with ``self.output_schema``. On failure the
         error is reported and ``self.close_controller(-2)`` is called.
      3. Write the serialized output followed by ``END_RECORD_DELIM`` to
         ``self.stream_output``.

    Any other ``Exception`` prints a traceback to ``self.stream_error`` and
    exits with status -3. On normal completion all four streams are flushed.
    """
    try:
        try:
            if self.should_log:
                self.log_message("Serialized Input: %s" % input_str)
            inputs = deserialize_input(input_str)
            if self.should_log:
                self.log_message("Deserialized Input: %s" % unicode(inputs))
        except:
            # Capture errors where the user passes in bad data.
            write_user_exception(self.module_name, self.stream_error, NUM_LINES_OFFSET_TRACE)
            self.close_controller(-3)

        try:
            if func_name == GET_PARTIAL_RESULT_FUNC:
                # Partial aggregation state is emitted as JSON.
                output = json.dumps(func())
            else:
                if func_name == GET_FINAL_RESULT_FUNC:
                    result = func()
                else:
                    result = func(*inputs)
                output = serialize_output(result, self.output_schema)

            if self.should_log:
                self.log_message("Serialized Output: %s" % output)
        except:
            # These errors should always be caused by user code.
            write_user_exception(self.module_name, self.stream_error, NUM_LINES_OFFSET_TRACE)
            self.close_controller(-2)

        self.stream_output.write("%s%s" % (output, END_RECORD_DELIM))
    except Exception:
        # This should only catch internal exceptions with the controller
        # and pig- not with user code.
        import traceback
        traceback.print_exc(file=self.stream_error)
        sys.exit(-3)

    sys.stdout.flush()
    sys.stderr.flush()
    self.stream_output.flush()
    self.stream_error.flush()
def main(self, module_name, file_path, func_name, cache_path,
         output_stream_path, error_stream_path, log_file_name, output_schema):
    """Run the streaming loop: read records, apply the UDF, write results.

    Args:
        module_name: Module that defines the user function.
        file_path: Directory appended to ``sys.path`` before importing.
        func_name: Name of the function to look up in ``module_name``.
        cache_path: Second directory appended to ``sys.path``.
        output_stream_path: Currently unused (see the TODO below).
        error_stream_path: Currently unused (see the TODO below).
        log_file_name: Log file passed to ``logging.basicConfig`` when
            logging is enabled.
        output_schema: Passed to ``serialize_output`` for every result.

    The loop runs until ``self.get_next_input()`` returns ``END_OF_STREAM``.
    Import failures call ``self.close_controller(-1)``, bad input calls
    ``close_controller(-3)`` and UDF failures call ``close_controller(-2)``.
    Any other ``Exception`` prints a traceback to ``self.stream_error`` and
    exits with status -3.
    """
    # Reopen stdin unbuffered in binary mode.
    sys.stdin = os.fdopen(sys.stdin.fileno(), 'rb', 0)

    # Need to ensure that user functions can't write to the streams we use to communicate with pig.
    self.stream_output = os.fdopen(sys.stdout.fileno(), 'wb', 0)
    self.stream_error = os.fdopen(sys.stderr.fileno(), 'wb', 0)

    self.input_stream = sys.stdin
    # TODO: support controller logging
    # self.log_stream = open(output_stream_path, 'a')
    # sys.stderr = open(error_stream_path, 'w')

    sys.path.append(file_path)
    sys.path.append(cache_path)
    sys.path.append('.')

    # Hard-coded off, so every ``if should_log`` branch below is inactive.
    should_log = False
    if should_log:
        logging.basicConfig(filename=log_file_name,
                            format="%(asctime)s %(levelname)s %(message)s",
                            level=udf_logging.udf_log_level)
        logging.info("To reduce the amount of information being logged only a small subset of rows are logged at the "
                     "INFO level.  Call udf_logging.set_log_level_debug in tajo_util to see all rows being processed.")

    # The first record is read before the user module is imported.
    input_str = self.get_next_input()

    try:
        # No explicit ``level`` argument: Python 2 already defaults to -1,
        # whereas Python 3 rejects -1 with ValueError, which the handler
        # below would then misreport as a failure in user code.
        func = __import__(module_name, globals(), locals(), [func_name]).__dict__[func_name]
    except:
        # These errors should always be caused by user code.
        write_user_exception(module_name, self.stream_error, NUM_LINES_OFFSET_TRACE)
        self.close_controller(-1)

    log_message = logging.info
    if udf_logging.udf_log_level == logging.DEBUG:
        log_message = logging.debug

    while input_str != END_OF_STREAM:
        try:
            try:
                if should_log:
                    log_message("Serialized Input: %s" % (input_str))
                inputs = deserialize_input(input_str)
                if should_log:
                    log_message("Deserialized Input: %s" % (unicode(inputs)))
            except:
                # Capture errors where the user passes in bad data.
                write_user_exception(module_name, self.stream_error, NUM_LINES_OFFSET_TRACE)
                self.close_controller(-3)

            try:
                func_output = func(*inputs)
                if should_log:
                    log_message("UDF Output: %s" % (unicode(func_output)))
            except:
                # These errors should always be caused by user code.
                write_user_exception(module_name, self.stream_error, NUM_LINES_OFFSET_TRACE)
                self.close_controller(-2)

            output = serialize_output(func_output, output_schema)
            if should_log:
                log_message("Serialized Output: %s" % (output))

            self.stream_output.write("%s%s" % (output, END_RECORD_DELIM))
        except Exception:
            # This should only catch internal exceptions with the controller
            # and pig- not with user code.
            import traceback
            traceback.print_exc(file=self.stream_error)
            sys.exit(-3)

        sys.stdout.flush()
        sys.stderr.flush()
        self.stream_output.flush()
        self.stream_error.flush()

        input_str = self.get_next_input()