def __init__(self, nlprp_request: JsonObjectType) -> None:
    """
    Parse and validate an NLPRP request.

    Args:
        nlprp_request: dictionary from the (entire) JSON NLPRP request

    Raises:
        :exc:`NlprpError` for malformed requests
    """
    self.nlprp_request = nlprp_request
    toplevel_args = json_get_toplevel_args(nlprp_request)

    # The processors being requested. Fetch them all up front, so they
    # can be iterated through quickly for each document.
    proc_dicts = json_get_array(toplevel_args, NKeys.PROCESSORS,
                                required=True)
    self.processors = [ServerProcessor.get_processor_nlprp(pd)
                       for pd in proc_dicts]

    # Queue the request rather than processing immediately?
    self.queue = json_get_bool(toplevel_args, NKeys.QUEUE, default=False)

    # Client job ID (optional; empty string if absent).
    self.client_job_id = json_get_str(toplevel_args, NKeys.CLIENT_JOB_ID,
                                      default="")

    # Include the source text in the reply?
    self.include_text = json_get_bool(toplevel_args, NKeys.INCLUDE_TEXT)

    # Content: list of objects (each with text and metadata).
    self.content = json_get_array(toplevel_args, NKeys.CONTENT,
                                  required=True)
def show_queue(self) -> JsonObjectType:
    """
    Finds the queue entries associated with the client, optionally
    restricted to one client job id.

    Returns:
        NLPRP response dictionary whose ``queue`` member lists, for each
        distinct queue_id belonging to this user, its status and the
        submitted/completed timestamps.
    """
    args = json_get_toplevel_args(self.body, required=False)
    if args:
        client_job_id = json_get_str(args, NKeys.CLIENT_JOB_ID, default="",
                                     required=False)
    else:
        client_job_id = ""
    # Queue IDs that are of interest
    queue_id_wheres = [
        Document.username == self.username
    ]  # type: List[ClauseElement]
    if client_job_id:
        queue_id_wheres.append(Document.client_job_id == client_job_id)
    # noinspection PyUnresolvedReferences
    queue_ids = fetch_all_first_values(
        dbsession,
        select([Document.queue_id])
        .select_from(Document.__table__)
        .where(and_(*queue_id_wheres))
        .distinct()
        .order_by(Document.queue_id)
    )  # type: List[str]
    queue_answer = []  # type: JsonArrayType
    for queue_id in queue_ids:
        # DocProcRequest objects that are of interest
        dprs = list(
            dbsession.query(DocProcRequest).join(Document).filter(
                Document.queue_id == queue_id).all()
        )  # type: List[DocProcRequest]
        # BUGFIX: assert non-emptiness BEFORE using dprs. Previously this
        # assertion sat after the max() call below, so an empty dprs would
        # raise ValueError from max([]) first and the assertion was
        # unreachable in exactly the case it was meant to catch.
        assert dprs, "No DocProcRequests found; bug?"
        busy = not all(dpr.done for dpr in dprs)
        if busy:
            # Placeholder only; DATETIME_COMPLETED is reported as None when
            # busy, so this value is never serialized.
            max_time = datetime.datetime.min
        else:
            max_time = max(dpr.when_done_utc for dpr in dprs)
        dt_submitted = dprs[0].document.datetime_submitted_pendulum
        queue_answer.append({
            NKeys.QUEUE_ID: queue_id,
            NKeys.CLIENT_JOB_ID: client_job_id,
            NKeys.STATUS: NlprpValues.BUSY if busy else NlprpValues.READY,
            NKeys.DATETIME_SUBMITTED:
                pendulum_to_nlprp_datetime(dt_submitted, to_utc=True),
            NKeys.DATETIME_COMPLETED: (
                None if busy
                else pendulum_to_nlprp_datetime(max_time, to_utc=True)
            ),
        })
    return self.create_response(status=HttpStatus.OK,
                                extra_info={NKeys.QUEUE: queue_answer})
def handle_nlprp_request(self) -> JsonObjectType:
    """
    The main function. Authenticates user and checks the request is not
    malformed, then calls the function relating to the command specified
    by the user.
    """
    # Order matters: authenticate first, then parse the body.
    self._authenticate()
    self._set_body_json_from_request()
    cmd = json_get_str(self.body, NKeys.COMMAND, required=True)
    log.debug(
        f"NLPRP request received from {self.request.remote_addr}: "
        f"username={self.username}, command={cmd}"
    )
    if DEBUG_SHOW_REQUESTS:
        log.debug(f"Request: {self.body!r}")
    return self.parse_command(cmd)
def gen_text_metadataobj(
        self) -> Generator[Tuple[str, JsonValueType], None, None]:
    """
    Generates text and metadata pairs from the request, with the metadata
    in JSON object (Python dictionary) format.

    Yields:
        tuple: ``(text, metadata)``, as above
    """
    for doc in self.content:
        yield (
            json_get_str(doc, NKeys.TEXT, required=True),
            json_get_value(doc, NKeys.METADATA, default=None,
                           required=False),
        )
def gen_text_metadatastr(self) -> Generator[Tuple[str, str], None, None]:
    """
    Generates text and metadata pairs from the request, with the metadata
    in string (serialized JSON) format.

    Yields:
        tuple: ``(text, metadata)``, as above
    """
    try:
        for doc in self.content:
            text = json_get_str(doc, NKeys.TEXT, required=True)
            raw_metadata = json_get_value(doc, NKeys.METADATA,
                                          default=None, required=False)
            # Serialize compactly; a missing METADATA becomes "null".
            yield text, json.dumps(raw_metadata,
                                   separators=JSON_SEPARATORS_COMPACT)
    except KeyError:
        raise key_missing_error(key=NKeys.TEXT)
def fetch_from_queue(self) -> JsonObjectType:
    """
    Fetches requests for all document-processor pairs for the queue_id
    supplied by the user (if complete).

    Returns a "busy" (HTTP PROCESSING) response if any DocProcRequest for
    the queue is not yet done; otherwise returns the collected results and
    deletes the queue entries.

    Raises:
        :exc:`NlprpError` (via ``mkerror``) if the queue_id is unknown.
    """
    # ---------------------------------------------------------------------
    # Args
    # ---------------------------------------------------------------------
    args = json_get_toplevel_args(self.body)
    queue_id = json_get_str(args, NKeys.QUEUE_ID, required=True)
    # ---------------------------------------------------------------------
    # Start with the DocProcRequests, because if some are still busy,
    # we will return a "busy" response.
    # ---------------------------------------------------------------------
    # Both queries are restricted to this user's documents, so one user
    # cannot fetch (or delete) another user's queue entries.
    q_dpr = (dbsession.query(DocProcRequest).join(Document).filter(
        Document.username == self.username).filter(
            Document.queue_id == queue_id))
    q_doc = (dbsession.query(Document).filter(
        Document.username == self.username).filter(
            Document.queue_id == queue_id))
    dprs = list(q_dpr.all())  # type: List[DocProcRequest]
    if not dprs:
        raise mkerror(NOT_FOUND, "The queue_id given was not found")
    busy = not all([dpr.done for dpr in dprs])
    if busy:
        response = self.create_response(HttpStatus.PROCESSING, {})
        # todo: is it correct (from previous comments) that we can't
        # return JSON via Pyramid with a status of HttpStatus.PROCESSING?
        # If that's true, we have to force as below, but then we need to
        # alter the NLPRP docs (as these state the JSON code and HTTP code
        # should always be the same).
        self.set_http_response_status(HttpStatus.OK)
        return response
    # ---------------------------------------------------------------------
    # Make it easy to look up processors
    # ---------------------------------------------------------------------
    processor_cache = {}  # type: Dict[str, ServerProcessor]

    def get_processor_cached(_processor_id: str) -> ServerProcessor:
        """
        Cache lookups for speed. (All documents will share the same set
        of processors, so there'll be a fair bit of duplication.)
        """
        nonlocal processor_cache
        try:
            return processor_cache[_processor_id]
        except KeyError:
            _processor = ServerProcessor.get_processor_from_id(
                _processor_id)  # may raise  # noqa
            processor_cache[_processor_id] = _processor
            return _processor

    # ---------------------------------------------------------------------
    # Collect results by document
    # ---------------------------------------------------------------------
    doc_results = []  # type: JsonArrayType
    client_job_id = None  # type: Optional[str]
    # NOTE(review): the first document encountered supplies client_job_id
    # for the whole response — presumably all documents in one queue share
    # the same client job; confirm against the enqueue path.
    docs = set(dpr.document for dpr in dprs)
    for doc in docs:
        if client_job_id is None:
            client_job_id = doc.client_job_id
        processor_data = []  # type: JsonArrayType
        # ... data for *all* the processors for this doc
        for dpr in doc.docprocrequests:
            procresult = json.loads(dpr.results)  # type: Dict[str, Any]
            # A null NAME marks a stored result lacking processor info;
            # fill it in from the (cached) processor record.
            if procresult[NKeys.NAME] is None:
                processor = get_processor_cached(dpr.processor_id)
                procresult[NKeys.NAME] = processor.name
                procresult[NKeys.TITLE] = processor.title
                procresult[NKeys.VERSION] = processor.version
            processor_data.append(procresult)
        metadata = json.loads(doc.client_metadata)
        doc_result = {
            NKeys.METADATA: metadata,
            NKeys.PROCESSORS: processor_data
        }
        if doc.include_text:
            doc_result[NKeys.TEXT] = doc.doctext
        doc_results.append(doc_result)
    # ---------------------------------------------------------------------
    # Delete leftovers
    # ---------------------------------------------------------------------
    with sqla_transaction_commit():
        # NOTE(review): a bulk Query.delete() does NOT run ORM-level
        # relationship cascades; the claimed cascade to DocProcRequests
        # presumably relies on DB-level ON DELETE CASCADE — confirm in the
        # table definitions.
        q_doc.delete(synchronize_session=False)
        # ... will also delete the DocProcRequests via a cascade
    response_info = {
        NKeys.CLIENT_JOB_ID: (
            client_job_id if client_job_id is not None else ""),
        NKeys.RESULTS: doc_results
    }
    return self.create_response(status=HttpStatus.OK,
                                extra_info=response_info)