def create_request_interceptor(event):
    """
    Intercept the "op" (opaque parameters) request parameter, decode it
    and serve it as the ``request.opaque`` dictionary.
    """
    request = event.request
    request.opaque = {}
    request.opaque_meta = {}

    # Extract opaque parameters token from request
    op_token = request.params.get('op')

    # Do nothing if no token given
    if not op_token:
        return

    registry = event.request.registry
    signer = registry.getUtility(ISigner)
    try:
        data, meta = signer.unsign(op_token)

        if data:
            request.opaque.update(data)
        else:
            log.error('opaque parameter token is empty. data=%s, token=%s', data, op_token)

        if meta:
            request.opaque_meta.update(meta)
            request.opaque_meta.update({'status': 'ok'})
        else:
            log.error('metadata of opaque parameter token is empty. meta=%s, token=%s', meta, op_token)

    except JwtExpiryError as ex:
        expiry_unixtime = ex.message['jwt_expiry']
        expiry_iso = datetime_iso(unixtime_to_datetime(expiry_unixtime))
        ex.message['jwt_expiry_iso'] = expiry_iso
        log.error('Opaque parameter token expired: expiry=%s, message=%s', expiry_iso, ex.message)
        request.opaque_meta.update({'status': 'error', 'errors': [ex.message]})
        # TODO: log/send full stacktrace
        log.error(exception_traceback())

    except JwtVerifyError as ex:
        log.error('Error while decoding opaque parameter token: %s', ex.message)
        request.opaque_meta.update({'status': 'error', 'errors': [ex.message]})
        # TODO: log/send full stacktrace
        log.error(exception_traceback())

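# Example (sketch, not from the original sources): the interceptor above expects a
# Pyramid request event, so it would typically be wired up as a ``NewRequest``
# subscriber during application startup. The function name below is hypothetical.
def _example_register_interceptor(config):
    from pyramid.events import NewRequest
    config.add_subscriber(create_request_interceptor, NewRequest)
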
def pdf_universal(patent):

    pdf = None
    datasource = None
    meta = {}

    document = decode_patent_number(patent)
    number_normalized = normalize_patent(patent)

    # first, try archive
    try:
        # Skip requests for documents w/o kindcode
        if not document.kind:
            raise ValueError(u'No kindcode for patent: {}'.format(patent))

        pdf = archive_fetch_pdf(number_normalized)
        datasource = 'archive'

    except Exception as ex:
        if not isinstance(ex, HTTPNotFound):
            log.error(exception_traceback())

        """
        # second, try archive again after running acquisition
        try:
            # Skip requests for documents w/o kindcode
            if not document.kind:
                raise ValueError(u'No kindcode')
            run_acquisition(number_normalized, 'pdf')
            pdf = archive_fetch_pdf(number_normalized, 2)
            datasource = 'archive'
        except Exception as ex:
        """
        if True:
            if not isinstance(ex, HTTPNotFound):
                log.error(exception_traceback())

            if document:
                pdf = pdf_from_ops(patent, document, meta)
                datasource = 'ops'
            else:
                log.error('Locating a document at the domestic office requires '
                          'a decoded document number for "{}"'.format(patent))

    return {'pdf': pdf, 'datasource': datasource, 'meta': meta}

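# Example (sketch, not from the original sources): pdf_universal() returns a dictionary
# with 'pdf', 'datasource' and 'meta' keys. The document number is only illustrative,
# borrowed from the CA2702893C example mentioned in the comments of pdf_from_ops below.
def _example_pdf_universal():
    result = pdf_universal('CA2702893C')
    if result['pdf']:
        log.info('Got PDF from datasource "%s" (%s bytes)', result['datasource'], len(result['pdf']))
    else:
        log.warning('No PDF acquired, meta: %s', result['meta'])
    return result
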
def pdf_from_ops(patent, document, meta):

    # third, try building from OPS single images
    try:
        # 2016-04-21: Amend document number for CA documents, e.g. CA2702893C -> CA2702893A1
        # TODO: Reenable feature, but only when prefixing document with a custom page
        #       informing the user about recent changes not yet arrived at EPO.
        #if document.country == 'CA':
        #    patent = document.country + document.number

        log.info('PDF OPS attempt for {0}'.format(patent))
        return ops_build_pdf(patent)

    except Exception as ex:
        if not isinstance(ex, HTTPNotFound):
            log.error(exception_traceback())

        if document.country == 'US':
            log.info('PDF USPTO attempt for {0}'.format(patent))
            images_location = get_images_view_url(document)
            if images_location:
                meta.update(images_location)
            else:
                log.warning('PDF USPTO not available for {}'.format(patent))

def depatisconnect_claims_handler_real(patent):
    try:
        claims = depatisconnect_claims(patent)

    except KeyError as ex:
        log.error('No details at DEPATISconnect: %s %s', type(ex), ex)
        raise HTTPNotFound(ex)

    except ValueError as ex:
        log.error('Fetching details from DEPATISconnect failed: %s %s', type(ex), ex)
        raise HTTPBadRequest(ex)

    except Exception as ex:
        log.error('Unknown error from DEPATISconnect: %s %s.', type(ex), ex)
        log.error(exception_traceback())
        raise HTTPBadRequest(ex)

    return claims

def depatisconnect_description_handler_real(patent):
    try:
        description = depatisconnect_description(patent)
        if not description['xml']:
            raise KeyError('Description is empty')

    except KeyError as ex:
        log.error('No details at DEPATISconnect: %s %s', type(ex), ex)
        raise HTTPNotFound(ex)

    except ValueError as ex:
        log.error('Fetching details from DEPATISconnect failed: %s %s', type(ex), ex)
        raise HTTPBadRequest(ex)

    except Exception as ex:
        log.error('Unknown error from DEPATISconnect: %s %s.', type(ex), ex)
        log.error(exception_traceback())
        raise HTTPBadRequest(ex)

    return description

def pdf_universal_real(patent, response):

    document = decode_patent_number(patent)
    number_normalized = normalize_patent(patent)

    # Sanity checks.
    if document is None:
        log.error('Locating a document at the domestic office requires '
                  'a decoded document number for "{}"'.format(patent))
        raise ValueError('Unable to decode document number {}'.format(patent))

    # 1. If it's an EP document, try the European publication server first.
    if response.pdf is None and document.country == 'EP':
        try:
            response.pdf = publicationserver_fetch_pdf(patent)
            response.datasource = 'epo-publication-server'

        except Exception as ex:
            log.warning('PDF {}: Not available from EPO. {}'.format(patent, ex))
            if not isinstance(ex, HTTPError):
                log.error(exception_traceback())

    # 2. Next, try USPTO servers if it's a US document.
    if response.pdf is None and document.country == 'US':
        try:
            response.pdf = uspto_fetch_pdf(patent)
            response.datasource = 'uspto'

        except Exception as ex:
            log.warning('PDF {}: Not available from USPTO. {}'.format(patent, ex))
            if not isinstance(ex, HTTPError):
                log.error(exception_traceback())

    # 3. Next, try DPMA servers.
    if response.pdf is None:
        try:
            # Skip requests for documents w/o kindcode
            if not document.kind:
                raise ValueError('No kindcode for patent: {}'.format(patent))

            response.pdf = depatisconnect_fetch_pdf(number_normalized)
            response.datasource = 'dpma'

        except Exception as ex:
            log.warning('PDF {}: Not available from DPMA. {}'.format(patent, ex))

            # Evaluate exception.
            if isinstance(ex, NotConfiguredError):
                log.warning(ex)
            elif not isinstance(ex, HTTPNotFound):
                log.error(exception_traceback())

    # 4. Next, try the EPO OPS service.
    # Note this will assemble the PDF out of single pages requested
    # from EPO OPS, which is a rather expensive operation.
    if response.pdf is None:

        # 2016-04-21: Amend document number for CA documents, e.g. CA2702893C -> CA2702893A1
        # TODO: Reenable feature, but only when prefixing document with a custom page
        #       informing the user about recent changes not yet arrived at EPO.
        # if document.country == 'CA':
        #     patent = document.country + document.number

        try:
            response.pdf = ops_build_pdf(patent)
            response.datasource = 'epo-ops'

        except Exception as ex:
            log.warning('PDF {}: Not available from OPS. {}'.format(patent, ex))
            if not isinstance(ex, HTTPError):
                log.error(exception_traceback())

    # 5. Last but not least, try to redirect to the USPTO server.
    # TODO: Move elsewhere as deactivated on 2019-02-19.
    if False and response.pdf is None and document.country == 'US':
        log.info('PDF {}: USPTO attempt'.format(patent))

        uspto_found = False
        reason = None
        try:
            images_location = uspto_pdfview_url(document)
            if images_location:
                response.meta.update(images_location)
                response.datasource = 'uspto'
                uspto_found = True

        except Exception as ex:
            reason = ex
            if not isinstance(ex, HTTPError):
                log.error(exception_traceback())

        if not uspto_found:
            log.warning('PDF {}: Not available on USPTO. {}'.format(patent, reason))

    return True

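# Example (sketch, not from the original sources): pdf_universal_real() fills in a
# mutable response object carrying ``pdf``, ``datasource`` and ``meta`` attributes.
# A minimal container like the following would satisfy that contract; the class
# and function names are hypothetical.
class _ExamplePdfResponse(object):
    def __init__(self):
        self.pdf = None
        self.datasource = None
        self.meta = {}

def _example_pdf_universal_real(patent):
    response = _ExamplePdfResponse()
    pdf_universal_real(patent, response)
    return response
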
def export_util_handler(request):

    #print 'request.matchdict:', request.matchdict
    output_kind = request.matchdict['kind']
    output_format = request.matchdict['format']

    # Convert numberlists to Excel
    if output_kind == 'numberlist':

        numberlist = parse_numberlist(request.params.get('numberlist'))
        payload = create_xlsx({'numberlist': numberlist})

        # Export buffer to HTTP response
        filename = '{0}.xlsx'.format('numberlist')
        mimetype, encoding = mimetypes.guess_type(filename, strict=False)
        request.response.content_type = mimetype
        request.response.charset = None
        request.response.headers['Content-Disposition'] = 'attachment; filename={filename}'.format(filename=filename)

        #response['numberlist'] = numberlist

        # Send as response
        return payload

    elif output_kind == 'dossier':

        log.info('Starting dossier export to format "{format}"'.format(format=output_format))

        data = bunchify(json.loads(request.params.get('json')))

        # Debugging
        #print 'dossier-data:'; pprint(data.toDict())

        payload = None
        try:
            if output_format == 'xlsx':
                # Generate Office Open XML Workbook
                payload = DossierXlsx(data).create()

            elif output_format == 'pdf':
                # Generate Office Open XML Workbook and convert to PDF
                dossier = DossierXlsx(data)
                payload = dossier.to_pdf()

            elif output_format == 'csv':
                # TODO: Add comments inline into numberlist
                dossier = Dossier(data)
                payload = dossier.to_csv(dossier.df_documents)

            elif output_format == 'zip':
                dossier = Dossier(data)
                payload = dossier.to_zip(request=request, options=data.get('options'))

            else:
                return HTTPBadRequest('Export format "{format}" is unknown.'.format(format=output_format))

        except Exception as ex:
            message = 'Exporting format "{format}" failed.'.format(format=output_format)
            log.error('{message}. Exception:\n{trace}'.format(message=message, trace=exception_traceback()))
            return HTTPServerError(message)

        # Send HTTP response
        filename = 'dossier_{name}_{timestamp}.{format}'.format(
            name=data.get('name', 'default'),
            timestamp=data.get('project', {}).get('modified'),
            format=output_format)
        mimetype, encoding = mimetypes.guess_type(filename, strict=False)
        request.response.content_type = mimetype
        request.response.charset = None
        request.response.headers['Content-Disposition'] = 'attachment; filename={filename}'.format(filename=filename)

        return payload

    # TODO: Log request
    log.error('Data export error')

    # TODO: Proper error page for user to report this problem.
    raise HTTPServerError('Data export error. Please contact support.')

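# Example (sketch, not from the original sources): export_util_handler() reads ``kind``
# and ``format`` from ``request.matchdict``, so its route pattern must provide both
# placeholders. Route name and URL pattern are assumptions; the real registration
# (renderer, permissions) may differ.
def _example_register_export_route(config):
    config.add_route('export-util', '/export/{kind}/{format}')
    config.add_view(export_util_handler, route_name='export-util')
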
def handle_exception(self, ex, service_name, document):
    if isinstance(ex, (_JSONError, HTTPError)) and hasattr(ex, 'status_int') and ex.status_int == 404:
        log.warning(u'XML({service_name}, {document}) not found'.format(
            service_name=service_name, document=document))

        # Signal exception has been handled (ignored)
        return True

    else:
        log.warning(u'XML({service_name}, {document}) failed. '
                    u'Exception:\n{trace}'.format(
                        service_name=service_name, document=document, trace=exception_traceback()))

        # Signal exception should be re-raised, maybe
        return False

def to_zip(self, request=None, options=None):
    """
    u'options': {u'media': {u'biblio': False,
                            u'claims': False,
                            u'description': False,
                            u'pdf': True,
                            u'register': False},
                 u'report': {u'csv': True,
                             u'json': True,
                             u'pdf': False,
                             u'xlsx': False}},
    """

    # TODO: Text representations for biblio, register, family
    # TODO: PDF Extracts

    options = options or bunchify({'report': {}, 'media': {}})

    # Remove entries with empty/undefined document numbers
    self.df_documents.dropna(subset=['document'], inplace=True)

    # Reject entries with seen == True
    filtered = self.df_documents[(self.df_documents.seen == False)]
    documents = list(filtered.document)

    buffer = BytesIO()
    with ZipFile(buffer, 'w', ZIP_DEFLATED) as zipfile:

        # FIXME: Add TERMS (liability waiver) and more...
        zipfile.writestr('@readme.txt', u'Zip archive created by IP Navigator.')

        # Add text summary
        zipfile.writestr('@metadata.txt', self.get_metadata().encode('utf-8'))
        zipfile.writestr('@summary.txt', self.get_summary().encode('utf-8'))

        # Report files
        # ------------

        # Add Workbook
        workbook_payload = None
        if options.report.xlsx:
            workbook_payload = DossierXlsx(self.data).create()
            zipfile.writestr('report/@dossier.xlsx', workbook_payload)

        # Add Workbook in PDF format
        if options.report.pdf:
            try:
                zipfile.writestr('report/@dossier.pdf', DossierXlsx(self.data).to_pdf(payload=workbook_payload))
            except Exception as ex:
                log.error(u'Rendering dossier to PDF failed. '
                          u'Exception: {ex}\n{trace}'.format(ex=ex, trace=exception_traceback()))

        # Add CSV
        if options.report.csv:
            zipfile.writestr('report/csv/01-queries.csv', self.to_csv(self.df_queries))
            zipfile.writestr('report/csv/02-documents.csv', self.to_csv(self.df_documents))
            zipfile.writestr('report/csv/03-comments.csv', self.to_csv(self.df_comments))

        # Add JSON
        if options.report.json:
            zipfile.writestr('report/json/01-queries.json', self.to_json(self.df_queries))
            zipfile.writestr('report/json/02-documents.json', self.to_json(self.df_documents))
            zipfile.writestr('report/json/03-comments.json', self.to_json(self.df_comments))

        # Media files
        # -----------

        # FIXME: This should go to some configuration setting.
        fulltext_countries_excluded_ops = [
            'BE', 'CN', 'DD', 'DE', 'DK', 'FR', 'GR', 'HU', 'JP',
            'LU', 'KR', 'RU', 'PT', 'SE', 'TR', 'SK', 'US']

        # Add full PDF documents
        if options.media.pdf:
            pdf_ziparchive_add(zipfile, documents, path='media/pdf')

        # Add XML data
        # TODO: Add @report.txt for reflecting missing documents, differentiate between different XML kinds.
        # TODO: Add more TEXT formats (.abstract.txt, .biblio.txt, .register.txt)
        # TODO: Add ST.36 XML; e.g. from https://register.epo.org/download?number=EP08835045&tab=main&xml=st36
        #       via https://register.epo.org/application?number=EP08835045
        # TODO: Add equivalents, e.g.
        #       http://ops.epo.org/3.1/rest-services/published-data/publication/epodoc/EP1000000/equivalents/biblio
        status = OrderedDict()
        for document in documents:

            if not document or not document.strip():
                continue

            log.info('Data acquisition for document {document}'.format(document=document))

            status.setdefault(document, OrderedDict())
            patent = decode_patent_number(document)

            # Add XML "bibliographic" data (full-cycle)
            if options.media.biblio:
                try:
                    biblio_payload = get_ops_biblio_data('publication', document, xml=True)
                    zipfile.writestr('media/xml/{document}.biblio.xml'.format(document=document), biblio_payload)
                    status[document]['biblio'] = True

                except Exception as ex:
                    status[document]['biblio'] = False
                    self.handle_exception(ex, 'biblio', document)

                self.clear_request_errors(request)

            # Add XML "description" full text data
            # OPS does not have full texts for DE, US, ...
            if options.media.description:
                status[document]['description'] = False
                if patent.country not in fulltext_countries_excluded_ops:
                    try:
                        # Write XML
                        document_number = encode_epodoc_number(patent)
                        description_payload = ops_description(document_number, xml=True)
                        zipfile.writestr('media/xml/{document}.description.xml'.format(document=document), description_payload)
                        status[document]['description'] = True

                        # Write TEXT
                        with ignored():
                            text_payload = self.get_fulltext(description_payload, 'description')
                            if text_payload:
                                zipfile.writestr('media/txt/{document}.description.txt'.format(document=document), text_payload.encode('utf-8'))

                    except Exception as ex:
                        self.handle_exception(ex, 'description', document)

                    self.clear_request_errors(request)

            # Add XML "claims" full text data
            # OPS does not have full texts for DE, US, ...
            if options.media.claims:
                status[document]['claims'] = False
                if patent.country not in fulltext_countries_excluded_ops:
                    try:
                        # Write XML
                        document_number = encode_epodoc_number(patent)
                        claims_payload = ops_claims(document_number, xml=True)
                        zipfile.writestr('media/xml/{document}.claims.xml'.format(document=document), claims_payload)
                        status[document]['claims'] = True

                        # Write TEXT
                        with ignored():
                            text_payload = self.get_fulltext(
                                claims_payload.replace('<claim-text>', '<p>').replace('</claim-text>', '</p>'), 'claims')
                            if text_payload:
                                zipfile.writestr('media/txt/{document}.claims.txt'.format(document=document), text_payload.encode('utf-8'))

                    except Exception as ex:
                        self.handle_exception(ex, 'claims', document)

                    self.clear_request_errors(request)

            # Add XML register data
            if options.media.register:
                try:
                    register_payload = ops_register('publication', document, xml=True)
                    zipfile.writestr('media/xml/{document}.register.xml'.format(document=document), register_payload)
                    status[document]['register'] = True

                except Exception as ex:
                    status[document]['register'] = False
                    self.handle_exception(ex, 'register', document)

                self.clear_request_errors(request)

            # Add XML family data
            if options.media.family:
                try:
                    document_number = encode_epodoc_number(patent, {'nokind': True})
                    family_payload = ops_family_inpadoc('publication', document_number, 'biblio', xml=True)
                    zipfile.writestr('media/xml/{document}.family.xml'.format(document=document), family_payload)
                    status[document]['family'] = True

                except Exception as ex:
                    status[document]['family'] = False
                    self.handle_exception(ex, 'family', document)

                self.clear_request_errors(request)

        #from pprint import pprint; print '====== status:'; pprint(status)

        # Generate report
        # ---------------
        # TODO: Format more professionally incl. generator description
        # TODO: Unify with "pdf_universal_multi"

        delivered_items = []
        missing_items = []
        for document, kinds in status.iteritems():
            delivered = []
            missing = []
            for kind, ok in kinds.iteritems():
                if ok:
                    delivered.append(kind)
                else:
                    missing.append(kind)

            if delivered:
                item = u'{document:20}{delivered}'.format(document=document, delivered=u', '.join(delivered))
                delivered_items.append(item)

            if missing:
                item = u'{document:20}{missing}'.format(document=document, missing=u', '.join(missing))
                missing_items.append(item)

        if delivered_items or missing_items:
            report_template = dedent("""
                Delivered artifacts ({delivered_count}):
                {delivered_files}

                Missing artifacts ({missing_count}):
                {missing_files}
            """).strip()

            report = report_template.format(
                delivered_count=len(delivered_items),
                missing_count=len(missing_items),
                delivered_files='\n'.join(delivered_items),
                missing_files='\n'.join(missing_items),
            )

            log.info('Export report:\n{report}'.format(report=report))
            zipfile.writestr('media/xml/@report.txt', report)

    payload = buffer.getvalue()
    return payload

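# Example (sketch, not from the original sources): invoking to_zip() with an options
# structure mirroring the docstring above and writing the archive to disk.
# ``dossier_data`` stands in for the bunchified dossier payload and the output
# filename is arbitrary.
def _example_dossier_to_zip(dossier_data, request=None):
    options = bunchify({
        'report': {'xlsx': True, 'pdf': False, 'csv': True, 'json': True},
        'media': {'biblio': True, 'claims': False, 'description': False,
                  'pdf': True, 'register': False, 'family': False},
    })
    payload = Dossier(dossier_data).to_zip(request=request, options=options)
    with open('dossier.zip', 'wb') as f:
        f.write(payload)
    return payload
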