def process_tokenization_kf():
    """Kafka consumer loop for the tokenisation workflow.

    Consumes messages from the configured input topic, builds a workflow
    response for each message, and pushes successful responses to the
    output topic.  Consumer/producer failures are routed to the error
    handler and logged.
    """
    file_ops = FileOperation()
    DOWNLOAD_FOLDER = file_ops.file_download(config.download_folder)
    # instantiation of consumer for respective topic
    try:
        consumer_class = Consumer(config.input_topic, config.bootstrap_server)
        consumer = consumer_class.consumer_instantiate()
        log_info("process_tokenization_kf : trying to receive value from consumer ", None)
        for msg in consumer:
            data = msg.value
            log_info("process_tokenization_kf : received input json from input topic consumer ", data)
            # 13-digit millisecond-style timestamp used as a unique task id
            task_id = str("TOK-" + str(time.time()).replace('.', '')[0:13])
            # int() replaces the original eval(): the sliced timestamp is a
            # plain digit string, so eval was an unnecessary code-execution risk.
            task_starttime = int(str(time.time()).replace('.', '')[0:13])
            input_files, workflow_id, jobid, tool_name, step_order, user_id = file_ops.json_input_format(data)
            response_gen = Response(data, DOWNLOAD_FOLDER)
            file_value_response = response_gen.workflow_response(task_id, task_starttime)
            if "errorID" not in file_value_response.keys():
                producer = Producer()
                producer.push_data_to_queue(config.output_topic, file_value_response, data, task_id)
            else:
                log_error("process_tokenization_kf : error send to error handler", data, None)
    except KafkaConsumerError as e:
        response_custom = CustomResponse(Status.ERR_STATUS.value, None, None)
        response_custom.status_code['message'] = str(e)
        file_ops.error_handler(response_custom.status_code, "KAFKA_CONSUMER_ERROR", True)
        log_exception("process_tokenization_kf : Consumer didn't instantiate", None, e)
    except KafkaProducerError as e:
        # KafkaProducerError carries a prebuilt response dict in e.code
        response_custom = e.code
        response_custom['message'] = e.message
        file_ops.error_handler(response_custom, "KAFKA_PRODUCER_ERROR", True)
        log_exception("process_tokenization_kf : response send to topic %s" % (config.output_topic), data, e)
def workflow_response_block_tokeniser(self, task_id, task_starttime):
    """Generate the workflow response for block tokenisation.

    Validates the workflow keys and input, tokenises the input blocks in
    place, and returns a deep copy of the success response.  On a known
    error, stamps taskID/message onto the input json and returns the
    error handler's response instead.
    """
    input_key, workflow_id, jobid, tool_name, step_order, user_id = file_ops.json_input_format(self.json_data)
    log_info("workflow_response : started the block tokenisation response generation", self.json_data)
    error_validator = ValidationResponse(self.DOWNLOAD_FOLDER)
    tokenisation = Tokenisation(self.DOWNLOAD_FOLDER, self.json_data)
    try:
        error_validator.wf_keyerror(jobid, workflow_id, tool_name, step_order)
        error_validator.inputfile_list_empty(input_key)
        # Only in_locale is consumed here; the other unpacked values are
        # kept for clarity of the helper's return contract.
        blocks_list, record_id, model_id, in_locale = file_ops.get_input_values_for_block_tokenise(input_key)
        input_key = tokenisation.adding_tokenised_text_blockmerger(input_key, in_locale, 0)
        # int() replaces the original eval(): the sliced timestamp is a
        # plain digit string, so eval was an unnecessary code-execution risk.
        task_endtime = int(str(time.time()).replace('.', '')[0:13])
        response_true = CustomResponse(Status.SUCCESS.value, jobid, task_id)
        response_success = response_true.success_response(workflow_id, task_starttime, task_endtime, tool_name, step_order, input_key)
        response = copy.deepcopy(response_success)
        log_info("workflow_response : successfully generated response for workflow", self.json_data)
        return response
    except WorkflowkeyError as e:
        response_custom = self.json_data
        response_custom['taskID'] = task_id
        response_custom['message'] = str(e)
        response = file_ops.error_handler(response_custom, "WORKFLOWKEY-ERROR", True)
        log_exception("workflow_response : workflow key error: key value missing", self.json_data, e)
        response = copy.deepcopy(response)
        return response
    except FileErrors as e:
        response_custom = self.json_data
        response_custom['taskID'] = task_id
        response_custom['message'] = e.message
        response = file_ops.error_handler(response_custom, e.code, True)
        log_exception("workflow_response : some error occured while validating file", self.json_data, e)
        response = copy.deepcopy(response)
        return response
    except ServiceError as e:
        response_custom = self.json_data
        response_custom['taskID'] = task_id
        response_custom['message'] = str(e)
        response = file_ops.error_handler(response_custom, "SERVICE_ERROR", True)
        log_exception("workflow_response : Error occured during tokenisation or file writing", self.json_data, e)
        response = copy.deepcopy(response)
        return response
    except Exception as e:
        # Catch-all boundary: any unexpected failure is reported as a
        # service error rather than crashing the consumer loop.
        response_custom = self.json_data
        response_custom['taskID'] = task_id
        response_custom['message'] = str(e)
        response = file_ops.error_handler(response_custom, "SERVICE_ERROR", True)
        log_exception("workflow_response : Any random exception", self.json_data, e)
        response = copy.deepcopy(response)
        return response
def process_fc_kf():
    """Kafka consumer loop for the file-converter workflow.

    Consumes messages, builds a workflow response for each, and forwards
    successful responses to the tokeniser output topic via push_output.
    """
    file_ops = FileOperation()
    DOWNLOAD_FOLDER = file_ops.file_download(config.download_folder)
    task_id = str("FC-" + str(time.time()).replace('.', ''))
    task_starttime = str(time.time()).replace('.', '')
    producer_tok = Producer(config.bootstrap_server)
    # instantiation of consumer for respective topic
    try:
        consumer = consumer_validator()
        log_info("process_fc_kf", "trying to receive value from consumer ", None)
        for msg in consumer:
            log_info("process_fc_kf", "value received from consumer", None)
            data = msg.value
            # fresh task id / start time per message
            task_id = str("FC-" + str(time.time()).replace('.', ''))
            task_starttime = str(time.time()).replace('.', '')
            input_files, workflow_id, jobid, tool_name, step_order = file_ops.json_input_format(data)
            response_gen = Response(data, DOWNLOAD_FOLDER)
            file_value_response = response_gen.workflow_response(task_id, task_starttime)
            if "errorID" not in file_value_response.keys():
                push_output(producer_tok, config.tok_output_topic, file_value_response, jobid, task_id)
                log_info("process_fc_kf", "response send to topic %s" % (config.tok_output_topic), None)
            else:
                log_info("process_fc_kf", "error send to error handler", jobid)
    except KafkaConsumerError as e:
        response_custom = CustomResponse(Status.ERR_STATUS.value, None, None)
        # Fix: CustomResponse is not subscriptable — write through its
        # status_code dict as every other handler in this module does.
        response_custom.status_code['message'] = str(e)
        file_ops.error_handler(response_custom.status_code, "KAFKA_CONSUMER_ERROR", True)
        log_exception("process_fc_kf", "Consumer didn't instantiate", None, e)
    except KafkaProducerError as e:
        # KafkaProducerError carries a prebuilt response dict in e.code
        response_custom = e.code
        response_custom['message'] = e.message
        file_ops.error_handler(response_custom, "KAFKA_PRODUCER_ERROR", True)
        log_exception("process_fc_kf", "response send to topic %s" % (config.tok_output_topic), response_custom['jobID'], e)
def process_HTML_kf():
    """Kafka loop for the HTML-to-JSON service.

    Instantiates a consumer, processes each message via CheckingResponse,
    and pushes the result to the output topic.  On consumer failure an
    error value is flushed and the function returns; on loop failure each
    pending message is routed to the error handler.
    """
    file_ops = FileOperation()
    DOWNLOAD_FOLDER = file_ops.create_file_download_dir(config.download_folder)
    task_id = str("HTML2JSON-" + str(time.time()).replace('.', ''))
    task_starttime = str(time.time()).replace('.', '')
    # instantiation of consumer for respective topic
    try:
        consumer_class = Consumer(config.html_input_topic, config.bootstrap_server)
        consumer = consumer_class.consumer_instantiate()  # Consumer
        log.info("--- consumer running -----")
    except Exception:  # was a bare except: — narrowed to Exception
        response = Status.ERR_Consumer.value
        producer_html2json = Producer(config.bootstrap_server)
        producer = producer_html2json.producer_fn()
        producer.send(config.html_output_topic, value=response)
        producer.flush()
        log.error(
            "error in kafka opertation producer flushed value on topic %s"
            % (config.html_input_topic))
        # Fix: the original fell through and used the undefined `consumer`
        # below (NameError); bail out once the error has been flushed.
        return
    try:
        log.info("trying to receive value from consumer ")
        for msg in consumer:
            log.info("value received from consumer")
            data = msg.value
            task_id = str("HTML2JSON-" + str(time.time()).replace('.', ''))
            task_starttime = str(time.time()).replace('.', '')
            checking_response = CheckingResponse(data, task_id, task_starttime, DOWNLOAD_FOLDER)
            file_value_response = checking_response.main_response_wf()
            try:
                producer_html2json = Producer(config.bootstrap_server)
                producer = producer_html2json.producer_fn()
                producer.send(config.html_output_topic, value=file_value_response)
                producer.flush()
                log.info("producer flushed value on topic %s" % (config.html_output_topic))
            except Exception:  # was a bare except: — narrowed to Exception
                log.info(
                    "error occured in file operation of workflow and it is pushed to error queue"
                )
    except Exception as e:
        log.error(
            "error occured while listening message from consumer or flushing data to another queue %s"
            % e)
        # Drain remaining messages and route each to the error handler.
        for msg in consumer:
            log.info("value received from consumer")
            data = msg.value
            input_files, workflow_id, jobid, tool_name, step_order = file_ops.input_format(data)
            task_id = str("HTML2JSON-" + str(time.time()).replace('.', ''))
            task_starttime = str(time.time()).replace('.', '')
            response_custom = CustomResponse(Status.ERR_Producer.value, jobid, task_id)
            # NOTE(review): error_handler is called with 2 args here but 3
            # elsewhere in this file — confirm the intended signature.
            file_ops.error_handler(response_custom.status_code, True)
            log.info("error in kafka opertation producer flushed value on error topic")
def push_output(producer, topic_name, output, jobid, task_id):
    """Push `output` to `topic_name` through the given producer wrapper.

    Raises KafkaProducerError (carrying a CustomResponse) if the push
    fails, after logging the failure.
    """
    try:
        producer.push_data_to_queue(topic_name, output)
        log_info("push_output", "producer flushed value on topic %s" % (topic_name), jobid)
    except Exception as e:  # was a bare except: — narrow and keep the cause
        response_custom = CustomResponse(Status.ERR_STATUS.value, jobid, task_id)
        # Fix: log the actual exception instead of None so the failure
        # cause is preserved in the logs.
        log_exception("push_output", "Response can't be pushed to queue %s" % (topic_name), jobid, e)
        raise KafkaProducerError(response_custom, "data Not pushed to queue: %s" % (topic_name))
def push_data_to_queue(self, topic_name, push_data, jobid, task_id):
    """Send `push_data` to `topic_name` and flush the producer.

    Raises KafkaProducerError (carrying the error status_code dict) if
    the send or flush fails.
    """
    producer = self.producer_fn()
    try:
        producer.send(topic_name, value=push_data)
        producer.flush()
        log_info("push_data_to_queue", "successfully pushed data to output queue", None)
    except Exception as e:  # was a bare except: — narrow and keep the cause
        response_custom = CustomResponse(Status.ERR_STATUS.value, jobid, task_id)
        # Fix: corrected log-tag typo ("push_data_to queue") and log the
        # actual exception instead of None.
        log_exception(
            "push_data_to_queue",
            "Response can't be pushed to queue %s" % (topic_name), jobid, e)
        raise KafkaProducerError(
            response_custom.status_code,
            "data Not pushed to queue: %s" % (topic_name))
def push_output(producer, topic_name, output, jobid, task_id):
    """Push `output` to `topic_name`, logging with a FILE-CONVERTER context.

    Raises KafkaProducerError (carrying a CustomResponse) if the push fails.
    """
    try:
        producer.push_data_to_queue(topic_name, output)
        ctx = {
            "jobID": jobid,
            "taskID": task_id,
            "metadata": {
                "module": "FILE-CONVERTER"
            }
        }
        log_info(
            "push_output : producer flushed value on topic %s" % (topic_name),
            ctx)
    except Exception as e:
        response_custom = CustomResponse(Status.ERR_STATUS.value, jobid, task_id)
        # Fix: pass the caught exception `e` (was None) so the failure
        # cause is preserved in the logs.
        log_exception(
            "push_output : Response can't be pushed to queue %s" % (topic_name),
            jobid, e)
        raise KafkaProducerError(response_custom,
                                 "data Not pushed to queue: %s" % (topic_name))
def workflow_response(self, task_id, task_starttime):
    """Generate the tokenisation workflow response for self.json_data.

    Handles two input shapes: a dict with a 'files' key (txt/json files
    to tokenise) or a dict with 'text'/'locale' keys (plain-text input).
    Returns a deep copy of the success response, or the error handler's
    response when a known validation/encoding/service error occurs.
    """
    input_key, workflow_id, jobid, tool_name, step_order = file_ops.json_input_format(
        self.json_data)
    log_info("workflow_response : started the response generation",
             self.json_data)
    error_validator = ValidationResponse(self.DOWNLOAD_FOLDER)
    tokenisation = Tokenisation(self.DOWNLOAD_FOLDER, self.json_data)
    try:
        # Validate workflow keys and that the input is non-empty.
        error_validator.wf_keyerror(jobid, workflow_id, tool_name, step_order)
        error_validator.inputfile_list_empty(input_key)
        if 'files' in input_key.keys():
            output_file_response = list()
            for i, item in enumerate(input_key['files']):
                input_filename, in_file_type, in_locale = file_ops.accessing_files(
                    item)
                if in_file_type == "txt":
                    input_file_data = file_ops.read_txt_file(input_filename)
                    error_validator.file_encoding_error(input_file_data)
                    output_filename = tokenisation.tokenisation_response(
                        input_file_data, in_locale, i)
                elif in_file_type == "json":
                    input_jsonfile_data = file_ops.read_json_file(
                        input_filename)
                    # Tokenise each page's blocks (block-merger format).
                    input_jsonfile_data['result'] = [
                        tokenisation.adding_tokenised_text_blockmerger(
                            item, in_locale, page_id) for page_id, item in
                        enumerate(input_jsonfile_data['result'])
                    ]
                    output_filename = tokenisation.writing_json_file_blockmerger(
                        i, input_jsonfile_data)
                # NOTE(review): if in_file_type is neither "txt" nor "json",
                # output_filename is unbound here — confirm upstream
                # validation only ever passes these two types.
                file_res = file_ops.one_filename_response(
                    input_filename, output_filename, in_locale, in_file_type)
                output_file_response.append(file_res)
        else:
            # Plain-text input: tokenise each paragraph under one locale.
            input_paragraphs = input_key['text']
            input_locale = input_key['locale']
            tokenised_sentences = [
                tokenisation.tokenisation_core([input_paragraph],
                                               input_locale)
                for input_paragraph in input_paragraphs
            ]
            output_list_text = [{
                "inputText": x,
                "tokenisedSentences": y
            } for x, y in zip(input_paragraphs, tokenised_sentences)]
            output_file_response = {
                'tokenisedText': output_list_text,
                'locale': input_locale
            }
        # End time kept as a digit string in this variant (other variants
        # in this file use a numeric timestamp).
        task_endtime = str(time.time()).replace('.', '')
        response_true = CustomResponse(Status.SUCCESS.value, jobid, task_id)
        response_success = response_true.success_response(
            workflow_id, task_starttime, task_endtime, tool_name, step_order,
            output_file_response)
        response = copy.deepcopy(response_success)
        log_info(
            "workflow_response : successfully generated response for workflow",
            self.json_data)
        return response
    except WorkflowkeyError as e:
        # Missing workflow key-values in the incoming job payload.
        response_custom = self.json_data
        response_custom['taskID'] = task_id
        response_custom['message'] = str(e)
        response = file_ops.error_handler(response_custom,
                                          "WORKFLOWKEY-ERROR", True)
        log_exception(
            "workflow_response : workflow key error: key value missing",
            self.json_data, e)
        response = copy.deepcopy(response)
        return response
    except FileErrors as e:
        # Input-file validation failures (missing/invalid file entries).
        response_custom = self.json_data
        response_custom['taskID'] = task_id
        response_custom['message'] = e.message
        response = file_ops.error_handler(response_custom, e.code, True)
        log_exception(
            "workflow_response : some error occured while validating file",
            self.json_data, e)
        response = copy.deepcopy(response)
        return response
    except FileEncodingError as e:
        # Unsupported text encoding in an input file.
        response_custom = self.json_data
        response_custom['taskID'] = task_id
        response_custom['message'] = str(e)
        response = file_ops.error_handler(response_custom, "ENCODING_ERROR",
                                          True)
        log_exception(
            "workflow_response : service supports only utf-16 encoded file",
            self.json_data, e)
        response = copy.deepcopy(response)
        return response
    except ServiceError as e:
        # Failures inside tokenisation or output-file writing.
        response_custom = self.json_data
        response_custom['taskID'] = task_id
        response_custom['message'] = str(e)
        response = file_ops.error_handler(response_custom, "SERVICE_ERROR",
                                          True)
        log_exception(
            "workflow_response : Error occured during tokenisation or file writing",
            self.json_data, e)
        response = copy.deepcopy(response)
        return response
def workflow_response(self, task_id, task_starttime):
    """Generate the tokenisation workflow response for self.json_data.

    Supports two input shapes:
      * dict with a 'files' key — txt files are tokenised to a new output
        file; json (block-merger) files are tokenised and rewritten in
        place through the handle returned by read_json_file;
      * list of {'text': [...], 'locale': ...} objects — each paragraph
        list is tokenised directly.

    Returns a deep copy of the success response, or the error handler's
    response when a known error occurs.
    """
    input_key, workflow_id, jobid, tool_name, step_order, user_id = file_ops.json_input_format(
        self.json_data)
    log_info("workflow_response : started the response generation", self.json_data)
    error_validator = ValidationResponse(self.DOWNLOAD_FOLDER)
    tokenisation = Tokenisation(self.DOWNLOAD_FOLDER, self.json_data)
    try:
        error_validator.wf_keyerror(jobid, workflow_id, tool_name,
                                    step_order)  # Validating Workflow key-values
        error_validator.inputfile_list_empty(
            input_key)  # Validating Input key for files input and only text input
        # input key is a dictionary data for files input, "files" as a key
        if not isinstance(input_key, list):
            # NOTE(review): when input_key is a dict without a 'files' key,
            # output_file_response is never assigned and the success path
            # below raises NameError — confirm validation rules out that case.
            if 'files' in input_key.keys():
                output_file_response = list()
                for i, item in enumerate(input_key['files']):
                    input_filename, in_file_type, in_locale = file_ops.accessing_files(
                        item)
                    if in_file_type == "txt":
                        input_file_data = file_ops.read_txt_file(input_filename)
                        error_validator.file_encoding_error(input_file_data)
                        output_filename = tokenisation.tokenisation_response(
                            input_file_data, in_locale, i)
                    elif in_file_type == "json":
                        # read_json_file returns parsed data plus an open,
                        # writable handle so the file is rewritten in place.
                        input_jsonfile_data, file_write = file_ops.read_json_file(
                            input_filename)
                        input_jsonfile_data['result'] = [
                            tokenisation.adding_tokenised_text_blockmerger(
                                item, in_locale, page_id) for page_id, item in
                            enumerate(input_jsonfile_data['result'])
                        ]
                        input_jsonfile_data['result'] = tokenisation.getting_incomplete_text_merging_blocks(
                            input_jsonfile_data['result'])
                        input_jsonfile_data['file_locale'] = in_locale
                        json_data_write = json.dumps(input_jsonfile_data)
                        file_write.seek(0)
                        file_write.truncate()
                        file_write.write(json_data_write)
                        # NOTE(review): file_write is never closed here —
                        # confirm read_json_file's contract before adding
                        # a close()/with block.
                        output_filename = input_filename
                    file_res = file_ops.one_filename_response(
                        input_filename, output_filename, in_locale, in_file_type)
                    output_file_response.append(file_res)
        # input key is a list data of objects, object contain text and language code
        else:
            output_file_response = []
            for paragraph in input_key:
                input_paragraphs = paragraph['text']
                input_locale = paragraph['locale']
                tokenised_sentences = [
                    tokenisation.tokenisation_core([input_paragraph],
                                                   input_locale)
                    for input_paragraph in input_paragraphs
                ]
                output_list_text = [{
                    "inputText": x,
                    "tokenisedSentences": y
                } for x, y in zip(input_paragraphs, tokenised_sentences)]
                output_per_para = {
                    'tokenisedText': output_list_text,
                    'locale': input_locale
                }
                output_file_response.append(output_per_para)
        # int() replaces the original eval(): the sliced timestamp is a
        # plain digit string, so eval was an unnecessary code-execution risk.
        task_endtime = int(str(time.time()).replace('.', '')[0:13])
        response_true = CustomResponse(Status.SUCCESS.value, jobid, task_id)
        response_success = response_true.success_response(
            workflow_id, task_starttime, task_endtime, tool_name, step_order,
            output_file_response)
        response = copy.deepcopy(response_success)
        log_info(
            "workflow_response : successfully generated response for workflow",
            self.json_data)
        return response
    # exceptions for workflow key error
    except WorkflowkeyError as e:
        response_custom = self.json_data
        response_custom['taskID'] = task_id
        response_custom['message'] = str(e)
        response = file_ops.error_handler(response_custom, "WORKFLOWKEY-ERROR",
                                          True)
        log_exception(
            "workflow_response : workflow key error: key value missing",
            self.json_data, e)
        response = copy.deepcopy(response)
        return response
    # exceptions for input key data validation
    except FileErrors as e:
        response_custom = self.json_data
        response_custom['taskID'] = task_id
        response_custom['message'] = e.message
        response = file_ops.error_handler(response_custom, e.code, True)
        log_exception(
            "workflow_response : some error occured while validating file",
            self.json_data, e)
        response = copy.deepcopy(response)
        return response
    # checking filedata unicodes and null data
    except FileEncodingError as e:
        response_custom = self.json_data
        response_custom['taskID'] = task_id
        response_custom['message'] = str(e)
        response = file_ops.error_handler(response_custom, "ENCODING_ERROR",
                                          True)
        log_exception(
            "workflow_response : service supports only utf-16 encoded file",
            self.json_data, e)
        response = copy.deepcopy(response)
        return response
    # exceptions for tokenisation core logic and file writing of tokenised output
    except ServiceError as e:
        response_custom = self.json_data
        response_custom['taskID'] = task_id
        response_custom['message'] = str(e)
        response = file_ops.error_handler(response_custom, "SERVICE_ERROR",
                                          True)
        log_exception(
            "workflow_response : Error occured during tokenisation or file writing",
            self.json_data, e)
        response = copy.deepcopy(response)
        return response
    # any other exception i.e. not covered in above exceptions
    except Exception as e:
        response_custom = self.json_data
        response_custom['taskID'] = task_id
        response_custom['message'] = str(e)
        response = file_ops.error_handler(response_custom, "SERVICE_ERROR",
                                          True)
        log_exception("workflow_response : Any random exception",
                      self.json_data, e)
        response = copy.deepcopy(response)
        return response
def workflow_response(self, task_id, task_starttime):
    """Convert each input file to PDF and build the workflow response.

    Files already ending in '.pdf' are passed through unchanged; others
    are converted via LibreOffice (convert_to) and copied to the download
    folder under a fresh uuid.  Returns the success response, or the
    error handler's response when a known error occurs.
    """
    input_files, workflow_id, jobid, tool_name, step_order = file_ops.json_input_format(
        self.json_data)
    log_info("workflow_response : started the response generation",
             self.json_data)
    error_validator = ValidationResponse(self.DOWNLOAD_FOLDER)
    try:
        error_validator.wf_keyerror(jobid, workflow_id, tool_name, step_order)
        error_validator.inputfile_list_empty(input_files)
        output_file_response = list()
        for item in input_files:  # loop index was unused; enumerate dropped
            upload_id = str(uuid4())
            input_filename, in_file_type, in_locale = file_ops.accessing_files(
                item)
            filepath = os.path.join(config.download_folder, input_filename)
            log_info(
                "workflow_response : input filename received %s" %
                (input_filename), self.json_data)
            if input_filename.endswith('.pdf'):
                # Already a PDF: pass it through untouched.
                file_res = file_ops.one_filename_response(
                    input_filename, input_filename, in_locale, 'pdf')
                output_file_response.append(file_res)
            else:
                result = convert_to(os.path.join(config.download_folder, 'pdf',
                                                 upload_id),
                                    filepath,
                                    timeout=60)
                copyfile(
                    result,
                    os.path.join(config.download_folder, upload_id + '.pdf'))
                file_res = file_ops.one_filename_response(
                    input_filename, upload_id + '.pdf', in_locale, 'pdf')
                output_file_response.append(file_res)
        # int() replaces the original eval(): the sliced timestamp is a
        # plain digit string, so eval was an unnecessary code-execution risk.
        task_endtime = int(str(time.time()).replace('.', '')[0:13])
        response_true = CustomResponse(Status.SUCCESS.value, jobid, task_id)
        response_success = response_true.success_response(
            workflow_id, task_starttime, task_endtime, tool_name, step_order,
            output_file_response)
        log_info(
            "workflow_response : successfully generated response for workflow",
            self.json_data)
        return response_success
    except LibreOfficeError as e:
        response_custom = self.json_data
        response_custom['taskID'] = task_id
        response_custom[
            'message'] = 'workflow_response : Error when converting file to PDF: LibreOfficeError'
        response = file_ops.error_handler(response_custom, "SERVICE_ERROR",
                                          True)
        log_exception("workflow_response : Error when converting file to PDF",
                      self.json_data, e)
        return response
    except TimeoutExpired as e:
        response_custom = self.json_data
        response_custom['taskID'] = task_id
        response_custom[
            'message'] = 'workflow_response : Timeout when converting file to PDF: TimeoutExpired'
        response = file_ops.error_handler(response_custom, "SERVICE_ERROR",
                                          True)
        log_exception(
            "workflow_response : Timeout when converting file to PDF",
            self.json_data, e)
        return response
    except WorkflowkeyError as e:
        response_custom = self.json_data
        response_custom['taskID'] = task_id
        response_custom['message'] = str(e)
        response = file_ops.error_handler(response_custom, "WORKFLOWKEY-ERROR",
                                          True)
        log_exception(
            "workflow_response : workflow key error: key value missing",
            self.json_data, e)
        return response
    except FileErrors as e:
        response_custom = self.json_data
        response_custom['taskID'] = task_id
        response_custom['message'] = e.message
        response = file_ops.error_handler(response_custom, e.code, True)
        log_exception(
            "workflow_response : some error occured while validating file",
            self.json_data, e)
        return response
    except FileEncodingError as e:
        response_custom = self.json_data
        response_custom['taskID'] = task_id
        response_custom['message'] = str(e)
        response = file_ops.error_handler(response_custom, "ENCODING_ERROR",
                                          True)
        log_exception(
            "workflow_response : service supports only utf-16 encoded file",
            self.json_data, e)
        return response
    except ServiceError as e:
        response_custom = self.json_data
        response_custom['taskID'] = task_id
        response_custom['message'] = str(e)
        response = file_ops.error_handler(response_custom, "SERVICE_ERROR",
                                          True)
        log_exception(
            "workflow_response : Error occured during file conversion or file writing",
            self.json_data, e)
        return response
def workflow_response(self, task_id, task_starttime):
    """Build the pdf-to-html workflow response for the incoming job.

    Converts every input file with Pdf2HtmlService and, on success,
    returns a deep copy of the success response.  On a known failure the
    error handler's response is returned instead.
    """
    input_files, workflow_id, jobid, tool_name, step_order = file_ops.json_input_format(
        self.json_data)
    log_info("workflow_response", "started the response generation", jobid)
    validator = ValidationResponse(self.DOWNLOAD_FOLDER)
    converter = Pdf2HtmlService(self.DOWNLOAD_FOLDER)

    def _fail(message, code, exc, log_msg):
        # Shared failure path: stamp the message on an error response,
        # hand off to the error handler, log, and return a copy.
        err = CustomResponse(Status.ERR_STATUS.value, jobid, task_id)
        err.status_code['message'] = message
        handled = file_ops.error_handler(err.status_code, code, True)
        log_exception("workflow_response", log_msg, jobid, exc)
        return copy.deepcopy(handled)

    try:
        validator.wf_keyerror(jobid, workflow_id, tool_name, step_order)
        validator.inputfile_list_error(input_files)
        results = []
        for entry in input_files:
            name, file_type, locale = file_ops.accessing_files(entry)
            source_path = file_ops.input_path(name)
            html_paths, png_paths = converter.pdf2html(source_path, jobid)
            results.append(
                file_ops.one_filename_response(name, html_paths, png_paths,
                                               locale, file_type))
        finished_at = str(time.time()).replace('.', '')
        ok = CustomResponse(Status.SUCCESS.value, jobid, task_id)
        success = ok.success_response(workflow_id, task_starttime, finished_at,
                                      tool_name, step_order, results)
        response = copy.deepcopy(success)
        log_info("workflow_response",
                 "successfully generated response for workflow", jobid)
        return response
    except WorkflowkeyError as e:
        return _fail(str(e), "WORKFLOWKEY-ERROR", e,
                     "workflow key error: key value missing")
    except FileErrors as e:
        return _fail(e.message, e.code, e,
                     "some error occured while validating file")
    except ServiceError as e:
        return _fail(str(e), "SERVICE_ERROR", e,
                     "Something went wrong during pdf to html conversion.")