Пример #1
0
def vision_ocr_request_worker():
    """Drain ``processQueue`` forever, running the OCR workflow per request.

    For every payload taken from ``processQueue`` the worker builds a
    ``Response``, runs ``workflow_response`` and, when the result carries no
    ``errorID`` key, publishes it to ``config.output_topic`` via
    ``push_output``.  After each request (successful or not) the queue item is
    marked done and a token is put on ``controlQueue``.

    The function never returns; it is meant to run in its own thread.
    """
    file_ops            = FileOperation()
    DOWNLOAD_FOLDER     = file_ops.create_file_download_dir(config.download_folder)
    producer_tok        = Producer(config.bootstrap_server)
    log_info("vision_ocr_request_worker : starting thread ", LOG_WITHOUT_CONTEXT)

    while True:
        data            = processQueue.get(block=True)
        task_id         = str("vision_ocr" + str(time.time()).replace('.', ''))
        task_starttime  = str(time.time()).replace('.', '')

        try:
            # Parsing happens inside the try so a malformed payload is logged
            # instead of killing the worker thread.
            _, _, jobid, _, _ = file_ops.json_input_format(data)
            log_info("vision_ocr_request_worker processing -- received message "+str(jobid), data)

            response_gen        = Response(data, DOWNLOAD_FOLDER)
            file_value_response = response_gen.workflow_response(task_id, task_starttime, False)

            if file_value_response is not None:
                if "errorID" not in file_value_response:
                    push_output(producer_tok, config.output_topic, file_value_response, jobid, task_id,data)
                    log_info("vision_ocr_request_worker : response send to topic %s"%(config.output_topic), LOG_WITHOUT_CONTEXT)
                else:
                    log_info("vision_ocr_request_worker : error send to error handler", data)

            # Report the backlog of the queue this worker actually consumes.
            log_info('vision_ocr_request_worker - request in internal queue {}'.format(processQueue.qsize()), data)
        except Exception as e:
            log_exception("vision_ocr_request_worker ",  LOG_WITHOUT_CONTEXT, e)
        finally:
            # Exactly one task_done() per get(), even when processing failed,
            # so that a join() on the queue cannot hang.
            processQueue.task_done()

        # NOTE(review): presumably this token lets the Kafka consumer fetch the
        # next message - confirm against the code that reads controlQueue.
        controlQueue.put(1)
Пример #2
0
def processRequest(data):
    """Run the workflow for a single request payload and publish its result.

    Args:
        data: request payload; it is passed to ``json_input_format``,
            ``Response`` and used as logging context.

    A result without an ``errorID`` key is pushed to ``config.output_topic``.
    Exceptions raised while generating or publishing the response are logged
    and swallowed; the function always returns ``None``.
    """
    file_ops = FileOperation()
    producer_tok = Producer(config.bootstrap_server)
    DOWNLOAD_FOLDER = file_ops.file_download(config.download_folder)

    # Two separate clock reads, as before: the id and the start time are
    # independent timestamps with the decimal point stripped.
    task_id = "ANNO-" + str(time.time()).replace('.', '')
    task_starttime = str(time.time()).replace('.', '')

    # Only the job id is needed here; the remaining fields are discarded.
    _, _, jobid, _, _ = file_ops.json_input_format(data)
    log_info("processing -- received message " + str(jobid), data)

    try:
        generator = Response(data, DOWNLOAD_FOLDER)
        result_response = generator.workflow_response(task_id, task_starttime)

        # Nothing to publish.
        if result_response is None:
            return

        # The workflow produced an error payload; only log it.
        if "errorID" in result_response.keys():
            log_info("processing failed, informed WFM", data)
            return

        push_output(producer_tok, config.output_topic, result_response, jobid,
                    task_id, data)
        log_info(
            "processing completed successfully, published at %s" %
            (config.output_topic), data)
    except Exception as e:
        log_exception("exception encountered ", LOG_WITHOUT_CONTEXT, e)
Пример #3
0
def word_detector_request_worker():
    """Drain the internal ``Queue`` forever, running word detection per request.

    For each payload the worker builds a ``Response`` and calls
    ``workflow_response``.  A non-``None`` result is published to
    ``config.output_topic``; a ``None`` result causes a 400 error object to be
    pushed to ``config.KAFKA_ANUVAAD_ETL_WF_ERROR_TOPIC``.

    Any exception raised while handling a request is logged and the loop
    continues.  The function never returns; it is meant to run in a thread.
    """
    file_ops = FileOperation()
    DOWNLOAD_FOLDER = file_ops.create_file_download_dir(config.download_folder)
    producer_tok = Producer(config.bootstrap_server)
    log_info("word_detector_request_worker : starting thread ",
             LOG_WITHOUT_CONTEXT)

    while True:
        data = Queue.get(block=True)
        task_id = str("word_detector" + str(time.time()).replace('.', ''))
        task_starttime = str(time.time()).replace('.', '')

        try:
            # Parsing happens inside the try so a malformed payload is logged
            # instead of killing the worker thread.
            _, _, jobid, _, _ = file_ops.json_input_format(data)

            log_info(
                "word_detector_request_worker processing -- received message " +
                str(jobid), data)

            response_gen = Response(data, DOWNLOAD_FOLDER)
            file_value_response = response_gen.workflow_response(
                task_id, task_starttime, False)

            if file_value_response is not None:
                push_output(producer_tok, config.output_topic,
                            file_value_response, jobid, task_id, data)
                log_info(
                    "word_detector_request_worker : response send to topic %s"
                    % (config.output_topic), LOG_WITHOUT_CONTEXT)
            else:
                # No response at all: report the failure on the error topic.
                erro_obj = {
                    'code': 400,
                    'jobID': jobid,
                    'message': "Word detector failed"
                }
                producer_tok.push_data_to_queue(
                    config.KAFKA_ANUVAAD_ETL_WF_ERROR_TOPIC, erro_obj)

                log_info(
                    "word_detector_request_worker : error send to error handler",
                    data)

            log_info(
                'word_detector_request_worker - request in internal queue {}'.
                format(Queue.qsize()), data)
        except Exception as e:
            log_exception("word_detector_request_worker ", LOG_WITHOUT_CONTEXT,
                          e)
        finally:
            # Exactly one task_done() per get(), even when processing failed,
            # so that a join() on the queue cannot hang.
            Queue.task_done()
Пример #4
0
def process_block_segmenter_kf():
    """Read requests from Kafka and hand them to the internal ``Queue``.

    Every consumed message is decoded with ``Consumer.get_json_data``;
    messages that decode to ``None`` are logged and skipped, all others are
    put on ``Queue`` for the worker thread.

    ``KafkaConsumerError`` and ``KafkaProducerError`` are reported through
    ``file_ops.error_handler`` and logged; either one ends the function.

    NOTE(review): the log messages mention ``process_document_segmenter_kf``
    and ``process_layout_detector_kf`` rather than this function's name; they
    are left untouched because they are runtime output.
    """
    file_ops = FileOperation()
    DOWNLOAD_FOLDER = file_ops.create_file_download_dir(config.download_folder)
    producer_tok = Producer(config.bootstrap_server)

    # Instantiate the consumer for the configured topic.
    try:
        consumer = consumer_validator()
        log_info(
            "process_document_segmenter_kf : trying to receive value from consumer ",
            LOG_WITHOUT_CONTEXT)

        for msg in consumer:
            # Decode once and reuse the result for both the validity check
            # and the payload.
            data = Consumer.get_json_data(msg.value)
            if data is None:
                log_info(
                    'process_document_segmenter_kf - received invalid data {}'.
                    format(msg.value), None)
                continue

            log_info(
                'process_document_segmenter_kf - received message from kafka, dumping into internal queue',
                data)
            # The parsed fields are not used here; the call is kept so that a
            # payload it rejects still fails at this point.
            file_ops.json_input_format(data)

            Queue.put(data)
            log_info(
                'process_document_segmenter_kf - request in internal queue {}'.
                format(Queue.qsize()), data)

            # TODO: reject Kafka requests when the internal queue grows too
            # large.

    except KafkaConsumerError as e:
        response_custom = {}
        response_custom['message'] = str(e)
        file_ops.error_handler(response_custom, "KAFKA_CONSUMER_ERROR", True)
        log_exception(
            "process_layout_detector_kf : Consumer didn't instantiate", None,
            e)
    except KafkaProducerError as e:
        response_custom = {}
        response_custom['message'] = e.message
        file_ops.error_handler(response_custom, "KAFKA_PRODUCER_ERROR", True)
        log_exception(
            "process_layout_detector_kf : response send to topic %s" %
            (config.output_topic), None, e)
Пример #5
0
def process_vision_ocr_kf():
    """Feed Kafka requests to ``processQueue`` one at a time.

    The outer loop blocks on ``controlQueue`` and, once a token arrives,
    consumes messages until one valid request has been queued.  Messages that
    decode to ``None`` are logged and skipped.  The offset is committed as
    soon as a valid message has been decoded, i.e. before the request is
    processed.

    ``KafkaConsumerError`` and ``KafkaProducerError`` are reported through
    ``file_ops.error_handler`` and logged; either one ends the function.
    """
    file_ops            = FileOperation()
    DOWNLOAD_FOLDER     = file_ops.create_file_download_dir(config.download_folder)
    producer_tok        = Producer(config.bootstrap_server)

    # Instantiate the consumer for the configured topic.
    try:
        consumer = consumer_validator()
        log_info("process_vision_ocr_kf : trying to receive value from consumer ", LOG_WITHOUT_CONTEXT)

        while True:
            # Block until a token is available before pulling more work.
            controlQueue.get(block=True)

            for msg in consumer:
                # Decode once and reuse the result for both the validity
                # check and the payload.
                data = Consumer.get_json_data(msg.value)
                if data is None:
                    log_info('process_vision_ocr_kf - received invalid data {}'.format(msg.value), None)
                    continue

                consumer.commit()
                # Optional sanity check of the committed offset.
                print('New Kafka offset: %s' % consumer.committed(TopicPartition(config.input_topic, msg.partition)))

                log_info('process_vision_ocr_kf - received message from kafka, dumping into internal queue', data)
                # The parsed fields are not used here; the call is kept so
                # that a payload it rejects still fails at this point.
                file_ops.json_input_format(data)

                processQueue.put(data)
                # Report the backlog of the queue that was just written to.
                log_info('process_vision_ocr_kf - request in internal queue {}'.format(processQueue.qsize()),
                            data)
                # One request per control token: go back to waiting.
                break

            # TODO: reject Kafka requests when the internal queue grows too
            # large.

    except KafkaConsumerError as e:
        response_custom = {}
        response_custom['message'] = str(e)
        file_ops.error_handler(response_custom, "KAFKA_CONSUMER_ERROR", True)
        log_exception("process_vision_ocr_kf : Consumer didn't instantiate", None, e)
    except KafkaProducerError as e:
        response_custom = {}
        response_custom['message'] = e.message
        file_ops.error_handler(response_custom, "KAFKA_PRODUCER_ERROR", True)
        log_exception("process_vision_ocr_kf : response send to topic %s"%(config.output_topic), None, e)
Пример #6
0
def process_merger_kf():
    """Consume block-merger requests from Kafka, one thread per message.

    For every message a ``Response`` is built and its
    ``multi_thred_block_merger`` method is started in a new thread named
    ``BM-thread-<n>``.  A failure while handling a single message is logged
    and the loop moves on to the next message.

    ``KafkaConsumerError`` and ``KafkaProducerError`` are reported through
    ``file_ops.error_handler`` and logged; either one ends the function.
    """
    file_ops = FileOperation()
    DOWNLOAD_FOLDER = file_ops.create_file_download_dir(config.download_folder)
    # Instantiate the consumer for the configured topic.
    try:
        consumer_class = Consumer(config.input_topic, config.bootstrap_server)
        consumer = consumer_class.consumer_instantiate()
        log_info("process_merger_kf", "trying to receive value from consumer",
                 None)
        thread_instance = 0
        for msg in consumer:
            # Reset per message so the except branch below never sees an
            # unbound name or the job id of a previous message.
            jobid = None
            try:
                data = msg.value
                task_id = str("BM-" + str(time.time()).replace('.', ''))
                task_starttime = str(time.time()).replace('.', '')
                _, _, jobid, _, _ = file_ops.json_input_format(data)
                log_info("process_merger_kf", "kafka request arrived ", jobid)
                response_gen = Response(data, DOWNLOAD_FOLDER)
                t1 = threading.Thread(
                    target=response_gen.multi_thred_block_merger,
                    args=(task_id, task_starttime, jobid),
                    name='BM-thread-' + str(thread_instance))
                t1.start()
                thread_instance += 1
                log_info("multithread", "block-merger running on multithread",
                         None)
            except Exception as e:
                log_exception("process_pdf_kf",
                              "exception while consuming the records", jobid,
                              e)

    except KafkaConsumerError as e:
        response_custom = CustomResponse(Status.ERR_STATUS.value, None, None)
        response_custom.status_code['message'] = str(e)
        file_ops.error_handler(response_custom.status_code,
                               "KAFKA_CONSUMER_ERROR", True)
        log_exception("process_pdf_kf", "Consumer didn't instantiate", None, e)
    except KafkaProducerError as e:
        response_custom = e.code
        response_custom['message'] = e.message
        file_ops.error_handler(response_custom, "KAFKA_PRODUCER_ERROR", True)
        log_exception("process_pdf_kf",
                      "response send to topic %s" % (config.output_topic),
                      response_custom['jobID'], e)
Пример #7
0
def process_annotation_kf():
    """Consume NER requests from Kafka and publish the workflow responses.

    If the consumer cannot be instantiated, ``Status.ERR_Consumer.value`` is
    sent to ``config.ner_output_topic`` and the function returns.

    Otherwise each message is run through ``CheckingResponse`` and the result
    is sent to ``config.ner_output_topic``.  A failure while publishing a
    single result is logged and the loop continues.  Any other exception in
    the loop switches to the fallback path, which reports a producer error
    through ``file_ops.error_handler`` for every message consumed afterwards.
    """
    file_ops = FileOperation()
    DOWNLOAD_FOLDER = file_ops.create_file_upload_dir(config.download_folder)
    # Instantiate the consumer for the configured topic.
    try:
        consumer_class = Consumer(config.ner_input_topic,
                                  config.bootstrap_server)
        consumer = consumer_class.consumer_instantiate()
        log.info("--- consumer running -----")
    except Exception:
        response = Status.ERR_Consumer.value
        producer_html2json = Producer(config.bootstrap_server)
        producer = producer_html2json.producer_fn()
        producer.send(config.ner_output_topic, value=response)
        producer.flush()
        log.error(
            "error in kafka opertation while listening to consumer on topic %s"
            % (config.ner_input_topic))
        log.info("response send to topic %s" % (config.ner_output_topic))
        # Without a consumer there is nothing to iterate over; falling
        # through would only fail on the unbound ``consumer`` name.
        return

    try:
        log.info("trying to receive data from consumer")
        for msg in consumer:
            log.info("received data from consumer")
            data = msg.value
            task_id = str("NER-" + str(time.time()).replace('.', ''))
            task_starttime = str(time.time()).replace('.', '')
            checking_response = CheckingResponse(data, task_id, task_starttime,
                                                 DOWNLOAD_FOLDER)
            file_value_response = checking_response.main_response_wf()
            try:
                producer_ner = Producer(config.bootstrap_server)
                producer = producer_ner.producer_fn()
                producer.send(config.ner_output_topic,
                              value=file_value_response)
                producer.flush()
                log.info("producer flushed for topic %s" %
                         (config.ner_output_topic))
            except Exception as e:
                # Keep consuming, but do not lose the reason for the failure.
                log.error("error while publishing to topic %s: %s" %
                          (config.ner_output_topic, e))
                log.info(
                    "error occured in file operation of workflow and it is pushed to error queue"
                )
    except Exception as e:
        log.error(
            "error occured during consumer running or flushing data to another queue %s"
            % e)
        # NOTE(review): from here on every further message is answered with a
        # producer error instead of being processed - confirm this fallback
        # is intended.
        for msg in consumer:
            log.info("value received from consumer")
            data = msg.value
            _, _, jobid, _, _ = file_ops.json_input_format(data)
            task_id = str("NER-" + str(time.time()).replace('.', ''))
            response = CustomResponse(Status.ERR_Producer.value, jobid,
                                      task_id)
            file_ops.error_handler(response, True)
            log.info(
                "error in kafka opertation producer flushed value on error topic"
            )
Пример #8
0
def block_merger_request_worker():
    """Drain ``blockMergerQueue`` forever, running the block merger per request.

    Each payload is turned into a ``Response`` whose ``workflow_response`` is
    evaluated.  A result without an ``errorID`` key is published to
    ``config.output_topic`` via ``push_output``; a result with one is only
    logged.  The queue item is marked done after each request.

    NOTE(review): there is no exception handling in the loop, so an exception
    raised while handling a request propagates out of this function.
    """
    file_ops = FileOperation()
    DOWNLOAD_FOLDER = file_ops.create_file_download_dir(config.download_folder)
    producer_tok = Producer(config.bootstrap_server)

    while True:
        data = blockMergerQueue.get(block=True)

        # Two separate clock reads: task id and start time are independent
        # timestamps with the decimal point stripped.
        task_id = "BM-" + str(time.time()).replace('.', '')
        task_starttime = str(time.time()).replace('.', '')

        # Only the job id is needed here; the remaining fields are discarded.
        _, _, jobid, _, _ = file_ops.json_input_format(data)

        received_msg = ("block_merger_request_worker processing -- received message "
                        + str(jobid))
        log_info(received_msg, data)

        merger = Response(data, DOWNLOAD_FOLDER)
        file_value_response = merger.workflow_response(task_id, task_starttime,
                                                       False)

        if file_value_response is not None:
            if "errorID" in file_value_response.keys():
                log_info(
                    "process_block_merger_kf : error send to error handler",
                    jobid)
            else:
                push_output(producer_tok, config.output_topic,
                            file_value_response, jobid, task_id)
                log_info(
                    "process_block_merger_kf : response send to topic %s" %
                    (config.output_topic), None)

        backlog = blockMergerQueue.qsize()
        log_info(
            'block_merger_request_worker - request in internal queue {}'.
            format(backlog), jobid)

        blockMergerQueue.task_done()
Пример #9
0
def process_kf_request_payload():
    """Consume requests from Kafka and process each one synchronously.

    Every message is decoded with ``Consumer.get_json_data``; messages that
    decode to ``None`` are logged and skipped, all others are passed to
    ``processRequest``.  The job id of the current request is stored in
    ``LOG_WITHOUT_CONTEXT['jobID']`` before it is logged.

    ``KafkaConsumerError``, ``KafkaProducerError`` and any other exception
    are reported through ``file_ops.error_handler`` and logged; each of them
    ends the function.
    """
    file_ops = FileOperation()

    # Instantiate the consumer for the configured topic.
    try:
        consumer = consumer_validator()
        log_info("trying to receive value from consumer ", LOG_WITHOUT_CONTEXT)

        for msg in consumer:
            # Decode once and reuse the result for both the validity check
            # and the payload.
            data = Consumer.get_json_data(msg.value)
            if data is None:
                log_info('received invalid data {}'.format(msg.value),
                         LOG_WITHOUT_CONTEXT)
                continue

            LOG_WITHOUT_CONTEXT['jobID'] = data['jobID']
            log_info(
                "received input request from Kafka queue for JobID: %s " %
                (data['jobID']), LOG_WITHOUT_CONTEXT)
            processRequest(data)

    except KafkaConsumerError as e:
        response_custom = {}
        response_custom['message'] = str(e)
        file_ops.error_handler(response_custom, "KAFKA_CONSUMER_ERROR", True)
        log_exception("Consumer didn't instantiate", None, e)
    except KafkaProducerError as e:
        response_custom = {}
        response_custom['message'] = e.message
        file_ops.error_handler(response_custom, "KAFKA_PRODUCER_ERROR", True)
        log_exception("response send to topic %s" % (config.output_topic),
                      None, e)
    except Exception as e:
        # Build the payload here: no earlier branch has run on this path, so
        # the name would otherwise be unbound.
        response_custom = {}
        response_custom['message'] = str(e)
        file_ops.error_handler(response_custom, "KAFKA_CONSUMER_ERROR", True)
        log_exception("response send to topic %s" % (config.output_topic),
                      None, e)
Пример #10
0
from src.services.main import DocumentStructure
from src.utilities.model_response import CustomResponse
from src.utilities.model_response import Status
from src.errors.errors_exception import WorkflowkeyError
from src.errors.errors_exception import FileErrors
from src.errors.errors_exception import ServiceError
from src.errors.error_validator import ValidationResponse
from anuvaad_auditor.loghandler import log_info
from anuvaad_auditor.loghandler import log_exception
import time
import config
import copy
import threading
from src.kafka_module.producer import Producer

# Module-level FileOperation instance used by Response.workflow_response to
# parse the incoming request.
# NOTE(review): no import of FileOperation is visible in this excerpt - confirm
# it is imported above.
file_ops = FileOperation()


class Response(object):
    def __init__(self, json_data, DOWNLOAD_FOLDER):
        self.json_data = json_data
        self.DOWNLOAD_FOLDER = DOWNLOAD_FOLDER

    def workflow_response(self, task_id, task_starttime):
        input_files, workflow_id, jobid, tool_name, step_order = file_ops.json_input_format(
            self.json_data)
        log_info("workflow_response", "started the response generation", jobid)
        error_validator = ValidationResponse(self.DOWNLOAD_FOLDER)
        try:
            error_validator.wf_keyerror(jobid, workflow_id, tool_name,
                                        step_order)
Пример #11
0
from flask_restful import Resource
from flask.json import jsonify
from flask import request
from src.utilities.utils import FileOperation
from src.resources.response_gen import Response
from src.errors.error_validator import ValidationResponse
from src.errors.errors_exception import FormatError
from src.utilities.model_response import Status
from anuvaad_auditor.loghandler import log_info
from anuvaad_auditor.loghandler import log_error
import config
import time
import src.utilities.app_context as app_context

# Module-level helpers, evaluated once at import time: the file utility and
# the download folder obtained from create_file_download_dir for
# config.download_folder.
file_ops = FileOperation()
DOWNLOAD_FOLDER = file_ops.create_file_download_dir(config.download_folder)


# rest request for block merging workflow service
class Tesseract_OCR_WF(Resource):

    # reading json request and reurnung final response
    def post(self):
        json_data = request.get_json(force=True)
        app_context.init()
        app_context.application_context = json_data
        log_info("Resource Tesseract_OCR_WF  Tesseract_OCR service started",
                 app_context.application_context)
        task_id = str("TESSOCR-" + str(time.time()).replace('.', '')[0:13])
        task_starttime = eval(str(time.time()).replace('.', '')[0:13])
Пример #12
0
def process_vision_ocr_kf():
    file_ops            = FileOperation()
    DOWNLOAD_FOLDER     = file_ops.create_file_download_dir(config.download_folder)
    producer_tok        = Producer(config.bootstrap_server)