示例#1
0
 def _api(self, get_or_post, url, headers, data = None, polling = False,
          path = None):
     '''Call the service at "url" and return the HTTP response.

     "get_or_post", "url", "headers", "data" and "polling" are handed to
     handprint.network.net().  "path" is optional and is only used to fill
     in the "path" field of the TRResult returned on network failure.

     Returns the response object on success.  On a NetworkFailure, returns
     a TRResult whose "error" field holds the failure text (callers detect
     this case by checking whether the returned value is a tuple).  On a
     RateLimitExceeded, sleeps and retries by calling itself.

     Raises ServiceFailure, with the error code and message taken from the
     JSON body when the body provides them, or else whatever other error
     net() reported.
     '''
     from handprint.network import net
     response, error = net(get_or_post, url, headers = headers,
                           data = data, polling = polling)
     if isinstance(error, NetworkFailure):
         if __debug__: log(f'network exception: {str(error)}')
         return TRResult(path = path, data = {}, text = '', boxes = [],
                         error = str(error))
     elif isinstance(error, RateLimitExceeded):
         # https://docs.microsoft.com/en-us/azure/azure-resource-manager/resource-manager-request-limits
         # The headers have a Retry-After number in seconds in some cases
         # but not others, so we default to something just in case.
         sleep_time = 20
         if response is not None and 'Retry-After' in response.headers:
             try:
                 sleep_time = int(response.headers['Retry-After'])
             except (TypeError, ValueError):
                 # Retry-After may also be an HTTP date rather than a number
                 # of seconds; in that case, keep the default pause.
                 pass
         if __debug__: log(f'sleeping for {sleep_time} s and retrying')
         wait(sleep_time)
         return self._api(get_or_post, url, headers, data, polling, path)
     elif error:
         if isinstance(error, ServiceFailure):
             # If it was an error generated by the Microsoft service, there
             # may be additional details in the response.  Looking for them
             # is best-effort: any problem reading the body just means we
             # fall back to raising the original error below.
             details = None
             try:
                 json_response = response.json()
                 if json_response:
                     details = json_response.get('error', None)
             except Exception:
                 details = None
             # Raise outside the try block so the detailed exception is not
             # swallowed, and keep "error" bound to the exception object.
             if isinstance(details, dict) and 'code' in details:
                 code = details['code']
                 message = details.get('message', '')
                 raise ServiceFailure(
                     f'Microsoft returned error code {code} -- {message}')
         raise error
     else:
         return response
示例#2
0
def timed_request(get_or_post, url, session=None, timeout=20, **kwargs):
    '''Perform a network "get" or "post", handling timeouts and retries.

    If "session" is not None, it is used as a requests.Session object.
    "Timeout" is a timeout (in seconds) on the network requests get or post.
    Other keyword arguments are passed to the network call.  Certificate
    verification is turned off unless the caller passes "verify" explicitly.

    Returns the response object from the first call that succeeds.  After
    _MAX_FAILURES consecutive exceptions, the function pauses and starts
    over, up to _MAX_RETRIES times; once those are used up, it raises the
    first exception that was recorded.  Raises UserCancelled if interrupted()
    reports an interruption; KeyboardInterrupt and UserCancelled raised by
    the network call are passed through unchanged.
    '''
    # Default matches the historical behavior (no certificate checks), but a
    # caller-supplied "verify" now takes effect instead of causing a
    # duplicate-keyword TypeError.
    kwargs.setdefault('verify', False)
    failures = 0
    retries = 0
    error = None
    while failures < _MAX_FAILURES and not interrupted():
        try:
            with warnings.catch_warnings():
                # The underlying urllib3 library used by the Python requests
                # module will issue a warning about missing SSL certificates.
                # We don't care here.  See also this for a discussion:
                # https://github.com/kennethreitz/requests/issues/2214
                warnings.simplefilter("ignore", InsecureRequestWarning)
                if __debug__: log(f'doing http {get_or_post} on {url}')
                if session:
                    method = getattr(session, get_or_post)
                else:
                    method = requests.get if get_or_post == 'get' else requests.post
                response = method(url, timeout=timeout, **kwargs)
                if __debug__: log('response received')
                return response
        except (KeyboardInterrupt, UserCancelled) as ex:
            # Log the verb, not the local "method": the latter may not be
            # bound yet if the interruption came before it was assigned.
            if __debug__: log(f'network {get_or_post} interrupted by {str(ex)}')
            raise
        except Exception as ex:
            # Problem might be transient.  Don't quit right away.
            failures += 1
            if __debug__: log(f'exception (failure #{failures}): {str(ex)}')
            # Record the first error we get, not the subsequent ones, because
            # in the case of network outages, the subsequent ones will be
            # about being unable to reconnect and not the original problem.
            if not error:
                error = ex
        if failures >= _MAX_FAILURES:
            # Pause with a growing back-off (10 s times the square of the
            # retry number), reset the failure count, and try again.
            if retries < _MAX_RETRIES:
                retries += 1
                failures = 0
                if __debug__: log('pausing because of consecutive failures')
                wait(10 * retries * retries)
            else:
                # All pause-and-retry rounds are used up; give up.
                raise error
    if interrupted():
        if __debug__: log('interrupted -- raising UserCancelled')
        raise UserCancelled(f'Network request has been interrupted for {url}')
    else:
        # In theory, we should never reach this point.  If we do, then:
        raise InternalError(f'Unexpected case in timed_request for {url}')
示例#3
0
def net(get_or_post, url, session=None, polling=False, recursing=0, **kwargs):
    '''Gets or posts the 'url' with optional keyword arguments provided.
    Returns a tuple of (response, exception), where the first element is
    the response from the get or post http call, and the second element is
    an exception object if an exception occurred.  If no exception occurred,
    the second element will be None.  This allows the caller to inspect the
    response even in cases where exceptions are raised.

    If keyword 'session' is not None, it's assumed to be a requests session
    object to use for the network call.

    If keyword 'polling' is True, certain statuses like 404 are ignored and
    the response is returned; otherwise, they are considered errors.

    Keyword 'recursing' counts how many times this function has already
    called itself to retry (after a connection reset or an HTTP 429); the
    retries stop once it reaches _MAX_RECURSIVE_CALLS.

    This method hands allow_redirects = True to the underlying Python requests
    network call.

    Note that two connection-error cases are raised rather than returned:
    NetworkFailure is raised when the host cannot be resolved or when the
    network connection has been lost.
    '''
    def addurl(text):
        return f'{text} for {url}'

    req = None
    try:
        req = timed_request(get_or_post,
                            url,
                            session,
                            allow_redirects=True,
                            **kwargs)
    except requests.exceptions.ConnectionError as ex:
        if recursing >= _MAX_RECURSIVE_CALLS:
            return (req, NetworkFailure(addurl('Too many connection errors')))
        # Guard the indexing: an exception may carry no arguments at all.
        arg0 = ex.args[0] if ex.args else None
        if isinstance(arg0, urllib3.exceptions.MaxRetryError):
            if __debug__: log(str(arg0))
            original = unwrapped_urllib3_exception(arg0)
            if isinstance(original, str) and 'unreachable' in original:
                return (req,
                        NetworkFailure(addurl('Unable to connect to server')))
            elif network_available():
                raise NetworkFailure(addurl('Unable to resolve host'))
            else:
                raise NetworkFailure(
                    addurl('Lost network connection with server'))
        elif (isinstance(arg0, urllib3.exceptions.ProtocolError)
              and len(arg0.args) > 1
              and isinstance(arg0.args[1], ConnectionResetError)):
            if __debug__: log('net() got ConnectionResetError; will recurse')
            wait(1)  # Sleep a short time and try again.
            return net(get_or_post, url, session, polling, recursing + 1,
                       **kwargs)
        else:
            return (req, NetworkFailure(str(ex)))
    except requests.exceptions.ReadTimeout as ex:
        if network_available():
            return (req,
                    ServiceFailure(
                        addurl('Timed out reading data from server')))
        else:
            return (req,
                    NetworkFailure(
                        addurl('Timed out reading data over network')))
    except requests.exceptions.InvalidSchema as ex:
        return (req, NetworkFailure(addurl('Unsupported network protocol')))
    except Exception as ex:
        # Anything else is handed back to the caller instead of being raised.
        return (req, ex)

    # Interpret the response.  Note that the requests library handles code 301
    # and 302 redirects automatically, so we don't need to do it here.
    code = req.status_code
    error = None
    if code == 400:
        error = ServiceFailure('Server rejected the request')
    elif code in [401, 402, 403, 407, 451, 511]:
        error = AuthFailure(addurl('Access is forbidden'))
    elif code in [404, 410] and not polling:
        error = NoContent(addurl("No content found"))
    elif code in [405, 406, 409, 411, 412, 414, 417, 428, 431, 505, 510]:
        error = InternalError(addurl(f'Server returned code {code}'))
    elif code in [415, 416]:
        error = ServiceFailure(addurl('Server rejected the request'))
    elif code == 429:
        if recursing < _MAX_RECURSIVE_CALLS:
            pause = 5 * (recursing + 1)  # +1 b/c we start with recursing = 0.
            if __debug__: log(f'rate limit hit -- sleeping {pause}')
            wait(pause)  # 5 s, then 10 s, then 15 s, etc.
            return net(get_or_post, url, session, polling, recursing + 1,
                       **kwargs)
        error = RateLimitExceeded(
            'Server blocking further requests due to rate limits')
    elif code == 503:
        error = ServiceFailure('Server is unavailable -- try again later')
    elif code in [500, 501, 502, 506, 507, 508]:
        error = ServiceFailure(f'Dimensions server error (HTTP code {code})')
    elif not (200 <= code < 400):
        # With polling=True, 404 and 410 also end up here.
        error = NetworkFailure(f'Unable to resolve {url}')
    return (req, error)
示例#4
0
def download(url, user, password, local_destination, recursing=0):
    '''Download the 'url' to the file 'local_destination'.

    The request is made with the credentials 'user' and 'password', and the
    response body is streamed to disk in 1 KB chunks.  'recursing' counts
    how many times this function has already called itself to retry; retries
    after connection errors stop once it reaches _MAX_RECURSIVE_CALLS.

    Returns None.  Raises NetworkFailure, ServiceFailure, AuthFailure,
    NoContent, RateLimitExceeded or InternalError depending on the kind of
    connection problem or HTTP status code encountered.
    '''
    def addurl(text):
        return f'{text} for {url}'

    try:
        req = timed_request('get', url, stream=True, auth=(user, password))
    except requests.exceptions.ConnectionError as ex:
        if recursing >= _MAX_RECURSIVE_CALLS:
            raise NetworkFailure(addurl('Too many connection errors')) from ex
        # Guard the indexing: an exception may carry no arguments at all.
        arg0 = ex.args[0] if ex.args else None
        if isinstance(arg0, urllib3.exceptions.MaxRetryError):
            if __debug__: log(str(arg0))
            original = unwrapped_urllib3_exception(arg0)
            if isinstance(original, str) and 'unreachable' in original:
                # This function reports problems by raising, and there is no
                # response object to hand back at this point.
                raise NetworkFailure(
                    addurl('Unable to connect to server')) from ex
            elif network_available():
                raise NetworkFailure(addurl('Unable to resolve host')) from ex
            else:
                raise NetworkFailure(
                    addurl('Lost network connection with server')) from ex
        elif (isinstance(arg0, urllib3.exceptions.ProtocolError)
              and len(arg0.args) > 1
              and isinstance(arg0.args[1], ConnectionResetError)):
            if __debug__:
                log('download() got ConnectionResetError; will recurse')
            wait(1)  # Sleep a short time and try again.
            # Return here: the retry does all the work, and falling through
            # would use a response object that was never assigned.
            return download(url, user, password, local_destination,
                            recursing + 1)
        else:
            raise NetworkFailure(str(ex)) from ex
    except requests.exceptions.ReadTimeout as ex:
        if network_available():
            raise ServiceFailure(
                addurl('Timed out reading data from server')) from ex
        else:
            raise NetworkFailure(
                addurl('Timed out reading data over network')) from ex
    except requests.exceptions.InvalidSchema as ex:
        raise NetworkFailure(addurl('Unsupported network protocol')) from ex

    # Interpret the response.
    code = req.status_code
    if code == 202:
        # Code 202 = Accepted, "received but not yet acted upon."
        req.close()  # Release the streamed connection before retrying.
        wait(1)  # Sleep a short time and try again.
        if __debug__: log('calling download() recursively for http code 202')
        return download(url, user, password, local_destination, recursing + 1)

    # The request was made with stream=True, so the response is closed
    # explicitly on every path, including the ones that raise.
    try:
        if 200 <= code < 400:
            # This started as code in https://stackoverflow.com/a/13137873/743730
            # Note: I couldn't get the shutil.copyfileobj approach to work; the
            # file always ended up zero-length.  I couldn't figure out why.
            with open(local_destination, 'wb') as f:
                for chunk in req.iter_content(chunk_size=1024):
                    f.write(chunk)
        elif code in [401, 402, 403, 407, 451, 511]:
            raise AuthFailure(addurl('Access is forbidden'))
        elif code in [404, 410]:
            raise NoContent(addurl('No content found'))
        elif code in [405, 406, 409, 411, 412, 414, 417, 428, 431, 505, 510]:
            raise InternalError(addurl(f'Server returned code {code}'))
        elif code in [415, 416]:
            raise ServiceFailure(addurl('Server rejected the request'))
        elif code == 429:
            raise RateLimitExceeded(
                'Server blocking further requests due to rate limits')
        elif code == 503:
            raise ServiceFailure('Server is unavailable -- try again later')
        elif code in [500, 501, 502, 506, 507, 508]:
            raise ServiceFailure(
                addurl(f'Internal server error (HTTP code {code})'))
        else:
            raise NetworkFailure(f'Unable to resolve {url}')
    finally:
        req.close()

    # Only the success branch reaches this point.
    size = os.stat(local_destination).st_size
    if __debug__: log(f'wrote {size} bytes to file {local_destination}')
示例#5
0
    def _result_from_api(self, path):
        '''Submit the image at "path" for analysis and poll for the outcome.

        Returns the parsed JSON analysis once its "status" is "succeeded".
        Returns early with whatever self._image_from_file() reported if the
        image could not be read, with the tuple handed back by self._api()
        if a call failed, or with a TRResult carrying an error message if
        the analysis failed or the reply was not in the expected form.
        Raises ServiceFailure if the reply to the submission has no
        "Operation-Location" header.
        '''
        # Read the image first; nothing can be sent without it.
        (image, problem) = self._image_from_file(path)
        if problem:
            return problem

        credentials = self._credentials
        url = f"{credentials['endpoint']}/vision/v3.2/read/analyze"
        headers = {'Ocp-Apim-Subscription-Key': credentials['subscription_key'],
                   'Content-Type': 'application/octet-stream'}

        # Phase 1: submit the image.  The reply says where to poll.
        if __debug__: log(f'contacting Microsoft for {relative(path)}')
        submitted = self._api('post', url, headers, image)
        if isinstance(submitted, tuple):
            # A tuple coming back from _api() signals an error result.
            return submitted

        if 'Operation-Location' not in submitted.headers:
            if __debug__: log('no operation-location in response headers')
            raise ServiceFailure('Unexpected response from Microsoft server')
        poll_url = submitted.headers['Operation-Location']

        # Phase 2: poll until the service reports a final status.
        if __debug__: log('polling MS for results ...')
        while True:
            raise_for_interrupts()
            # Polling counts against the rate limit, and results have never
            # been seen to arrive within 1 s, so pause 2 s between calls.
            wait(2)
            reply = self._api('get', poll_url, headers = headers, polling = True)
            if isinstance(reply, tuple):
                # A tuple coming back from _api() signals an error result.
                return reply

            # An empty body is sometimes returned for unknown reasons; the
            # only thing to do is ask again.
            if not reply.text:
                if __debug__: log('received empty result from Microsoft.')
                continue

            analysis = reply.json()
            if 'status' not in analysis:
                # Without a status key the reply cannot be interpreted.
                return TRResult(path = path, data = {}, text = '', boxes = [],
                                error = 'Error: Microsoft results not in expected format')

            status = analysis['status']
            if status in ('notStarted', 'running'):
                if __debug__: log('Microsoft still processing image')
                continue
            if status == 'succeeded':
                if __debug__: log('Microsoft returned success code')
                break

            if status == 'failed':
                reason = 'Microsoft analysis failed'
            else:
                reason = 'Error: Microsoft returned unexpected result'
            return TRResult(path = path, data = {}, text = '',
                            boxes = [], error = reason)

        if __debug__: log(f'results received from Microsoft for {relative(path)}')
        return analysis
示例#6
0
    def _send(self, image, service):
        '''Get results from service named "service" for the "image".

        If reuse of saved JSON is enabled and a readable JSON file exists
        for this image and service, the saved results are handed to the
        service instead of contacting it.  Otherwise the service is called.
        An annotated image is then written, plus (depending on the settings
        on self) the raw JSON, the extracted text, and a comparison against
        a ground-truth file.

        Returns a Result, or None if the service reported an error in its
        output.  Raises AuthFailure (with the service name added to the
        message) if the service rejects the credentials.  Raises
        RateLimitExceeded if the service reports a rate limit even though
        at least 1/max_rate seconds were spent on the call; if less time
        was spent, the method pauses for the remainder and tries again.
        '''

        service_name = f'[{service.name_color()}]{service.name()}[/]'
        base_path = path.join(image.dest_dir, path.basename(image.file))
        json_file = self._renamed(base_path, str(service), 'json')

        saved_results = None
        if self._reuse_json and readable(json_file):
            inform(
                f'Reading saved results for {service_name} from {relative(json_file)}'
            )
            with open(json_file, 'r') as f:
                saved_results = json.load(f)
            output = service.result(image.file, saved_results)
        else:
            inform(f'Sending to {service_name} and waiting for response ...')
            last_time = timer()
            try:
                output = service.result(image.file, None)
            except AuthFailure as ex:
                raise AuthFailure(f'Service {service}: {str(ex)}') from ex
            except RateLimitExceeded:
                time_passed = timer() - last_time
                min_interval = 1 / service.max_rate()
                if time_passed >= min_interval:
                    # Pausing would not help here, and there is no output to
                    # continue with, so let the caller see the rate limit.
                    raise
                warn(f'Pausing {service_name} due to rate limits')
                wait(min_interval - time_passed)
                warn(f'Continuing {service_name}')
                return self._send(image, service)
            if output.error:
                # Sanitize the error string in case it contains '{' characters.
                msg = output.error.replace('{', '{{{{').replace('}', '}}}}')
                alert(f'{service_name} failed: {msg}')
                warn(
                    f'No result from {service_name} for {relative(image.file)}'
                )
                return None
            inform(f'Got result from {service_name}.')

        raise_for_interrupts()
        inform(f'Creating annotated image for {service_name}.')
        annot_path = self._renamed(base_path, str(service), 'png')
        report_path = None
        from handprint.images import annotated_image
        # Image creation and saving are done while holding the shared lock.
        with self._lock:
            img = annotated_image(image.file, output.boxes, service,
                                  self._text_size, self._text_color,
                                  self._text_shift, self._display,
                                  self._confidence)
            self._save(img, annot_path)

        # Results that were read back from disk are not written out again.
        if self._extended_results and (saved_results is None):
            inform(f'Saving all data for {service_name}.')
            raw_json = json.dumps(output.data, sort_keys=True, indent=2)
            self._save(raw_json, json_file)
            inform(f'Saving extracted text for {service_name}.')
            txt_file = self._renamed(base_path, str(service), 'txt')
            self._save(output.text, txt_file)
        if self._compare:
            gt_file = alt_extension(image.item_file, 'gt.txt')
            gt_path = relative(gt_file)
            report_path = self._renamed(image.item_file, str(service), 'tsv')
            relaxed = (self._compare == 'relaxed')
            # Test readability first so that a missing or unreadable file is
            # reported as unavailable and is never handed to nonempty().
            if not readable(gt_file):
                warn(
                    f'Skipping {service_name} comparison because {gt_path} not available'
                )
            elif not nonempty(gt_file):
                warn(
                    f'Skipping {service_name} comparison because {gt_path} is empty'
                )
            else:
                if __debug__: log(f'reading ground truth from {gt_file}')
                with open(gt_file, 'r') as gt:
                    gt_text = gt.read()
                inform(f'Saving {service_name} comparison to ground truth')
                from handprint.comparison import text_comparison
                self._save(text_comparison(output.text, gt_text, relaxed),
                           report_path)
        return Result(service, image, annot_path, report_path)