def get_images_view_url(document):
    """
    Resolve the USPTO PDF image viewer URL for a patent document.

    The document number is normalized first; the reference type is then
    guessed from the number length (<= 9 digits: granted publication,
    >= 10 digits: application publication — TODO confirm against
    normalize_patent's output format).

    :param document: Document number or document object accepted by
                     ``normalize_patent``.
    :return: Dict ``{'location': url, 'origin': 'USPTO'}`` when the
             upstream pre-flight check succeeds, ``None`` otherwise.
    """
    document = normalize_patent(document, for_ops=False)

    # Distinguish publication vs. application by number length.
    reference_type = None
    if len(document.number) <= 9:
        reference_type = 'publication'
    elif len(document.number) >= 10:
        reference_type = 'application'

    url_tpl = None
    if reference_type == 'application':
        # AppFT image server
        # http://pdfaiw.uspto.gov/.aiw?docid=20160105912
        url_tpl = 'http://pdfaiw.uspto.gov/.aiw?docid={docid}'
    elif reference_type == 'publication':
        # PatFT image server
        # http://pdfpiw.uspto.gov/.piw?docid=9317610
        url_tpl = 'http://pdfpiw.uspto.gov/.piw?docid={docid}'

    if url_tpl:
        url = url_tpl.format(docid=document.number)

        # Pre-flight check upstream url for existence of document.
        try:
            response = requests.get(url)
            # Fix: compare against the decoded text body. ``response.content``
            # is ``bytes`` on Python 3, so a ``str in bytes`` membership test
            # would raise TypeError and the bare except would silently hide it.
            if 'is not a valid ID' not in response.text:
                return {'location': url, 'origin': 'USPTO'}
        except requests.RequestException:
            # Best-effort probe: treat network/transport failures as
            # "document not available" instead of propagating.
            pass
def pair_to_term(cls, key, value):
    """
    Translate a (key, value) search pair into a term dictionary
    using the field mapping declared on the class.

    Returns ``{'parameter': ..., 'term': ...}``, or ``None`` when the
    key is unknown to ``cls.fieldmap``.
    """
    try:
        mapping = cls.fieldmap[key]
        fieldname = mapping['name']
        parameter = mapping['parameter']
    except KeyError:
        return

    if not fieldname:
        # No field name configured: pass the raw value through.
        term = value
    else:
        # Field-specific value mogrification.
        if key == 'country':
            value = value.lower()
        elif key == 'patentnumber':
            normalized = normalize_patent(value)
            if normalized:
                value = normalized
        term = '{0}:{1}'.format(fieldname, value)

    return {
        'parameter': parameter,
        'term': term,
    }
def action(token, index, binop, term):
    """
    Rewrite a query token's term in place, normalizing patent numbers
    for publication-number indexes.
    """
    # Strip quoting before any further processing.
    cleaned = term.replace('"', '')

    # Apply document number normalization to values of certain indexes only.
    if index.lower() in indexes_publication_number:
        cleaned = normalize_patent(cleaned, fix_kindcode=True)

    # Only write back when the result is non-empty.
    if cleaned:
        token[2] = cleaned
def examples_ok(self):
    """
    Yield ``(input, expected, computed)`` triples for each entry of the
    "OK" normalization test corpus.
    """
    for raw_number, expected in test_numbers_normalized_ok.items():
        computed = normalize_patent(raw_number, fix_kindcode=True, for_ops=True)
        yield raw_number, expected, computed
def document_to_number(self, document):
    """
    Derive a publication number from the document's ``ucid`` field
    (format: ``CC-NNNNN-KK``), applying number normalization when possible.
    """
    country, serial, kindcode = document[u'ucid'].split('-')
    number = country + serial + kindcode
    # Fall back to the raw concatenation when normalization yields nothing.
    return normalize_patent(number) or number
def normalize_numbers(entries):
    """
    Normalize a list of document numbers.

    Whitespace is stripped from each entry before normalization.
    Returns a dict with keys ``valid`` (normalized numbers), ``invalid``
    (entries normalization could not handle) and ``all`` (both, in order).
    """
    response = {'valid': [], 'invalid': [], 'all': []}
    for entry in (item.replace(u' ', u'') for item in entries):
        normalized = normalize_patent(entry, fix_kindcode=True)
        if normalized:
            response['valid'].append(normalized)
            response['all'].append(normalized)
        else:
            response['invalid'].append(entry)
            response['all'].append(entry)
    return response
def espacenet_fetch(document_number, section, element_id):
    """
    Fetch a document section (e.g. description, claims) from Espacenet.

    :param document_number: Publication number, normalized via
                            ``normalize_patent(..., provider='espacenet')``.
    :param section: Espacenet section name used in the URL path.
    :param element_id: HTML ``id`` of the container element to extract.
    :return: Dict with ``xml`` (prettified HTML), ``lang`` and ``source``.
    :raises KeyError: When the section/document is not found upstream.
    :raises ValueError: On other upstream failures.
    """
    patent = normalize_patent(document_number, as_dict=True, provider='espacenet')

    # Blueprint: https://worldwide.espacenet.com/publicationDetails/biblio?CC=EP&NR=0666666&KC=A3
    url_tpl = 'https://worldwide.espacenet.com/data/publicationDetails/{section}?CC={country}&NR={number}'
    if 'kind' in patent and patent['kind']:
        url_tpl += '&KC={kind}'
    url = url_tpl.format(section=section, **patent)

    logger.info('Accessing Espacenet: {}'.format(url))
    response = requests.get(url, headers={'User-Agent': regular_user_agent})

    message_404 = 'No section "{section}" at Espacenet for "{document_number}"'.format(**locals())
    message_fail = 'Fetching section "{section}" from Espacenet for "{document_number}" failed'.format(**locals())

    if response.status_code == 200:
        # TODO: when no result, "Claims not available" appears in response body
        soup = BeautifulSoup(response.content)
        element = soup.find('div', {'id': element_id})
        if element:
            # NOTE(review): assumes the container always holds a <p> child;
            # ``find('p')`` returning None would raise here — TODO confirm.
            element = element.find('p')
            lang = element['lang']
            del element['class']
            content = element.prettify()
        else:
            raise KeyError(message_404)

        data = {
            'xml': content,
            'lang': lang,
            'source': 'espacenet',
        }
        return data

    elif response.status_code == 404:
        raise KeyError(message_404)

    else:
        # Fix: inspect the decoded text body. ``response.content`` is
        # ``bytes`` on Python 3, so ``str in bytes`` raises TypeError.
        if 'Entity not found' in response.text:
            raise KeyError(message_404)
        else:
            raise ValueError(message_fail)
def pdf_universal(patent):
    """
    Acquire a PDF for the given patent document, trying the local archive
    first and falling back to assembling it from EPO OPS.

    :param patent: Raw document number string.
    :return: Dict with keys ``pdf`` (payload or None), ``datasource``
             ('archive', 'ops' or None) and ``meta``.
    """
    pdf = None
    datasource = None
    meta = {}

    document = decode_patent_number(patent)
    number_normalized = normalize_patent(patent)

    # first, try archive
    try:
        # Skip requests for documents w/o kindcode
        if not document.kind:
            raise ValueError(u'No kindcode for patent: {}'.format(patent))
        pdf = archive_fetch_pdf(number_normalized)
        datasource = 'archive'

    except Exception as ex:
        # HTTPNotFound from the archive is an expected miss; anything else
        # gets its traceback logged.
        if not isinstance(ex, HTTPNotFound):
            log.error(exception_traceback())

        """
        # second, try archive again after running acquisition
        try:
            # Skip requests for documents w/o kindcode
            if not document.kind:
                raise ValueError(u'No kindcode')
            run_acquisition(number_normalized, 'pdf')
            pdf = archive_fetch_pdf(number_normalized, 2)
            datasource = 'archive'
        except Exception as ex:
        """
        # NOTE: the "if True:" scaffold stands in for the disabled second
        # attempt above, keeping the OPS fallback inside this except block
        # so that ``ex`` is still in scope.
        if True:
            if not isinstance(ex, HTTPNotFound):
                log.error(exception_traceback())

            # Fallback: build the PDF from EPO OPS, which needs a decoded
            # document number.
            if document:
                pdf = pdf_from_ops(patent, document, meta)
                datasource = 'ops'
            else:
                log.error('Locating a document at the domestic office requires ' \
                    'a decoded document number for "{}"'.format(patent))

    return {'pdf': pdf, 'datasource': datasource, 'meta': meta}
def fetch_pdf(document_number):
    """
    Retrieve PDF document from the European publication server.
    https://data.epo.org/publication-server/

    Blueprint address:
    https://data.epo.org/publication-server/pdf-document?cc=EP&pn=nnnnnn&ki=nn

    :raises HTTPNotFound: When no PDF payload could be obtained.
    """
    logger.info('PDF {}: European publication server attempt'.format(document_number))

    patent = normalize_patent(document_number, as_dict=True, provider='espacenet')
    url = 'https://data.epo.org/publication-server/pdf-document?cc=EP&pn={number}&ki={kind}'.format(**patent)

    logger.info('Accessing EPO publication server: {}'.format(url))
    response = requests.get(url, headers={'User-Agent': regular_user_agent})

    # Guard: anything but HTTP 200 means the document is absent upstream.
    if response.status_code != 200:
        msg = 'No document found at European publication ' \
              'server for "{document_number}"'.format(**locals())
        logger.warn(msg)
        raise HTTPNotFound(msg)

    # Success path: only accept genuine PDF payloads.
    if response.headers['Content-Type'] == 'application/pdf':
        return response.content

    # Sometimes, an appropriate HTML document is returned,
    # pointing to the corresponding WIPO document.
    #
    # Example: EP2706864A2
    # https://data.epo.org/publication-server/pdf-document?cc=EP&pn=2706864&ki=A2
    # http://www.wipo.int/patentscope/search/en/WO2012153305
    #
    # TODO: Unlock this again by leveraging the WIPO URL.
    msg = 'No PDF document returned from European ' \
          'publication server for "{document_number}".'.format(**locals())
    logger.warn(msg)
    raise HTTPNotFound(msg)
def get_xml(number):
    """
    Fetch XML from EPD archive service, trying alternative
    document number representations until one succeeds.

    :raises KeyError: When no alternative yields a document.
    """
    normalized = normalize_patent(number)

    # 2015-01-13: apply patentnumber fixes for getting more out of DEPATISconnect
    for candidate in depatisconnect_alternatives(normalized):
        try:
            return get_xml_real(candidate)
        except KeyError:
            continue

    raise KeyError('No XML document for "{0}" at DPMA'.format(number))
def pdf_url(document_number):
    """
    Compute the USPTO PDF download URL for a US document number.

    # Application
    >>> pdf_url('US2016101909A1')
    'http://pdfaiw.uspto.gov/fdd/09/2016/19/010/0.pdf'

    # Grant I
    >>> pdf_url('US10194689B2')
    'http://pdfpiw.uspto.gov/fdd/89/946/101/0.pdf'

    # Grant II
    >>> pdf_url('US2548918')
    'http://pdfpiw.uspto.gov/fdd/18/489/025/0.pdf'
    """
    document = normalize_patent(document_number, for_ops=False, as_dict=True, provider='uspto')
    if not document:
        return

    n = document.number

    # Application numbers are 11 digits, e.g. US20160101909A1
    # http://pdfaiw.uspto.gov/fdd/09/2016/19/010/0.pdf
    if len(n) == 11:
        return 'http://pdfaiw.uspto.gov/fdd/{}/{}/{}/{}/0.pdf'.format(
            n[9:11], n[0:4], n[7:9], n[4:7])

    # Grant numbers are 8 digits, e.g. US10194689B2
    # http://pdfpiw.uspto.gov/fdd/89/946/101/0.pdf
    if len(n) == 8:
        return 'http://pdfpiw.uspto.gov/fdd/{}/{}/{}/0.pdf'.format(
            n[6:8], n[3:6], n[0:3])

    raise ValueError(
        'US document number "{}" has unexpected length'.format(
            document_number))
def get_drawing_png(document, page, kind):
    """
    Fetch a drawing page for a document and convert it to PNG.

    Tries EPO OPS first; on HTTPNotFound, falls back to USPTO for US
    documents and CIPO for CA documents.

    :raises HTTPNotFound: When no image could be obtained anywhere.
    """
    # 2. Try to fetch drawing from OPS, fall back to other patent offices
    try:
        payload = get_ops_image(document, page, kind, 'tiff')

    except HTTPNotFound:
        prefix = document.upper()

        if prefix.startswith('US'):
            # fallback to USPTO (U.S.)
            document_id = normalize_patent(split_patent_number(document), for_ops=False)
            try:
                payload = get_uspto_image_cached(document_id)
            except PayloadEmpty:
                raise HTTPNotFound(
                    'No drawing for "{0}" at OPS or USPTO'.format(document))

        elif prefix.startswith('CA'):
            # fallback to CIPO (Canada)
            document_id = split_patent_number(document)
            try:
                payload = get_cipo_image_cached(document_id)
            except PayloadEmpty:
                raise HTTPNotFound(
                    'No drawing for "{0}" at OPS or CIPO'.format(document))

        else:
            # otherwise, pass through exception
            raise

    # 3. Croak if no image available
    if not payload:
        msg = 'No image available for document={document}, kind={kind}, page={page}'.format(
            **locals())
        log.warn(msg)
        raise HTTPNotFound(msg)

    # 4. Convert image from TIFF to PNG format
    return to_png(BytesIO(payload))
def read_documents(self):
    """
    Post-process all loaded documents: derive and normalize the
    publication number and tag each record with the upstream provider.
    """
    for record in self.documents:

        # Derive the raw number; tolerate malformed records.
        try:
            number = self.document_to_number(record)
        except (KeyError, TypeError):
            number = None

        # Whether kindcodes should be fixed on number normalization
        fix_kindcode = 'normalize_fix_kindcode' in self.options and self.options.normalize_fix_kindcode

        # Apply number normalization
        # TODO: Check how we can decouple from "for_ops=True" here
        normalized = normalize_patent(
            number, fix_kindcode=fix_kindcode, for_ops=True)

        # Be graceful if this didn't work
        if normalized:
            number = normalized

        record['publication_number'] = number
        record['upstream_provider'] = self.meta.upstream.name
def toCQL(self):
    """
    Serialize this search clause (prefixes, index, relation, term and
    optional sortKeys) back into a CQL string.
    """
    text = []
    # Emit prefix declarations; the empty prefix gets the short form.
    for p in list(self.prefixes.keys()):
        if (p != ''):
            text.append('>%s="%s"' % (p, self.prefixes[p]))
        else:
            text.append('>"%s"' % (self.prefixes[p]))

    # add some smartness:

    # 1. for certain attributes, apply document number normalization to value
    term_vanilla = term = self.term.toCQL()
    if str(self.index).lower() in ['pn', 'num']:
        term = normalize_patent(str(term))

    # 2. fallback to original value, if number normalization couldn't handle this value
    if not term:
        term = term_vanilla

    # 3. exclude some values from being quoted (Error code: 1107 - Quote marks not applicable for this index)
    if str(self.index).lower() in [
            'pa', 'in', 'pc', 'ac', 'prc', 'py', 'ay', 'pry', 'pub', 'ad', 'prd'
    ]:
        pass
    else:
        term = '"%s"' % term

    text.append('%s %s %s' % (self.index, self.relation.toCQL(), term))

    # Add sortKeys
    if self.sortKeys:
        text.append("sortBy")
        for sk in self.sortKeys:
            text.append(sk.toCQL())

    return ' '.join(text)
    # NOTE(review): the following line is unreachable dead code — it sits
    # after an unconditional return. Presumably a leftover from an earlier
    # delegation to the base class; confirm and remove.
    return SearchClause.toCQL(self)
def pdf_universal_real(patent, response):
    """
    Acquire a PDF for ``patent``, populating ``response.pdf``,
    ``response.datasource`` and ``response.meta`` along a chain of
    fallback sources: EPO publication server (EP), USPTO (US), DPMA,
    and finally EPO OPS page-wise assembly.

    :param patent: Raw document number string.
    :param response: Mutable result object with ``pdf``, ``datasource``
                     and ``meta`` attributes (assumption from usage —
                     TODO confirm against caller).
    :raises ValueError: When the document number cannot be decoded.
    :return: True (always, when no exception is raised).
    """
    document = decode_patent_number(patent)
    number_normalized = normalize_patent(patent)

    # Sanity checks.
    if document is None:
        log.error('Locating a document at the domestic office requires ' \
            'a decoded document number for "{}"'.format(patent))
        raise ValueError('Unable to decode document number {}'.format(patent))

    # 1. If it's an EP document, try European publication server first.
    if response.pdf is None and document.country == 'EP':
        try:
            response.pdf = publicationserver_fetch_pdf(patent)
            response.datasource = 'epo-publication-server'
        except Exception as ex:
            log.warning('PDF {}: Not available from EPO. {}'.format(
                patent, ex))
            # Only log tracebacks for unexpected (non-HTTP) failures.
            if not isinstance(ex, HTTPError):
                log.error(exception_traceback())

    # 2. Next, try USPTO servers if it's an US document.
    if response.pdf is None and document.country == 'US':
        try:
            response.pdf = uspto_fetch_pdf(patent)
            response.datasource = 'uspto'
        except Exception as ex:
            log.warning('PDF {}: Not available from USPTO. {}'.format(
                patent, ex))
            if not isinstance(ex, HTTPError):
                log.error(exception_traceback())

    # 3. Next, try DPMA servers.
    if response.pdf is None:
        try:
            # Skip requests for documents w/o kindcode
            if not document.kind:
                raise ValueError('No kindcode for patent: {}'.format(patent))
            response.pdf = depatisconnect_fetch_pdf(number_normalized)
            response.datasource = 'dpma'
        except Exception as ex:
            log.warning('PDF {}: Not available from DPMA. {}'.format(
                patent, ex))
            # Evaluate exception.
            if isinstance(ex, NotConfiguredError):
                log.warning(ex)
            elif not isinstance(ex, HTTPNotFound):
                log.error(exception_traceback())

    # 4. Next, try EPO OPS service.
    # Note this will assemble PDF out of single pages requested
    # from EPO OPS, which is a rather expensive operation.
    if response.pdf is None:
        # 2016-04-21: Amend document number for CA documents, e.g. CA2702893C -> CA2702893A1
        # TODO: Reenable feature, but only when prefixing document with a custom page
        #       informing the user about recent changes not yet arrived at EPO.
        # if document.country == 'CA':
        #     patent = document.country + document.number
        try:
            response.pdf = ops_build_pdf(patent)
            response.datasource = 'epo-ops'
        except Exception as ex:
            log.warning('PDF {}: Not available from OPS. {}'.format(
                patent, ex))
            if not isinstance(ex, HTTPError):
                log.error(exception_traceback())

    # 5. Last but not least, try to redirect to USPTO server.
    # TODO: Move elsewhere as deactivated on 2019-02-19.
    # NOTE: deliberately disabled via the leading "False and"; kept for
    # reference until moved.
    if False and response.pdf is None and document.country == 'US':
        log.info('PDF {}: USPTO attempt'.format(patent))
        uspto_found = False
        reason = None
        try:
            images_location = uspto_pdfview_url(document)
            if images_location:
                response.meta.update(images_location)
                response.datasource = 'uspto'
                uspto_found = True
        except Exception as ex:
            reason = ex
            if not isinstance(ex, HTTPError):
                log.error(exception_traceback())

        if not uspto_found:
            log.warning('PDF {}: Not available on USPTO. {}'.format(
                patent, reason))

    return True
def jump_office(request):
    """
    Resolve a document number to a deep link at an external patent office
    service (DPMA register, USPTO biblio/images/global-dossier, Google
    Patents) and either redirect there or return the URL.

    :param request: Pyramid-style request with ``matchdict`` keys
                    ``office``, ``service``, ``document_type``,
                    ``document_number`` and query param ``redirect``.
    :return: HTTPFound redirect, the plain URL string, or HTTPNotFound.
    """
    office = request.matchdict.get('office')
    service = request.matchdict.get('service')
    document_type = request.matchdict.get('document_type')
    document_number = request.matchdict.get('document_number')
    redirect = request.params.get('redirect')

    if document_number:
        url = None

        if office == 'dpma' and service == 'register':
            dra = DpmaRegisterAccess()
            try:
                url = dra.get_document_url(document_number)
            # Fix: narrowed from a bare "except:", which would also have
            # swallowed SystemExit/KeyboardInterrupt.
            except Exception:
                return HTTPNotFound('Document number {} not found.'.format(document_number))

            # TODO: application number vs. file number, e.g.
            # - EP666666 vs. E95480005.8
            # - DE19630877 vs. 196308771

        elif office == 'uspto' and service == 'biblio':
            if document_type == 'publication':
                # http://patft.uspto.gov/netacgi/nph-Parser?Sect1=PTO1&Sect2=HITOFF&d=PALL&p=1&u=%2Fnetahtml%2FPTO%2Fsrchnum.htm&r=1&f=G&l=50&s1=9317610
                document = normalize_patent(document_number, as_dict=True, for_ops=False)
                url = 'http://patft.uspto.gov/netacgi/nph-Parser'\
                    '?Sect1=PTO1&Sect2=HITOFF&d=PALL&p=1&u=%2Fnetahtml%2FPTO%2Fsrchnum.htm&r=1&f=G&l=50&s1={number}.PN.'.format(**document)
            elif document_type == 'application':
                # http://appft.uspto.gov/netacgi/nph-Parser?Sect1=PTO1&Sect2=HITOFF&d=PG01&p=1&u=%2Fnetahtml%2FPTO%2Fsrchnum.html&r=1&f=G&l=50&s1=20160105912
                document = normalize_patent(document_number, as_dict=True, for_ops=False)
                url = 'http://appft.uspto.gov/netacgi/nph-Parser'\
                    '?Sect1=PTO1&Sect2=HITOFF&d=PG01&p=1&u=%2Fnetahtml%2FPTO%2Fsrchnum.html&r=1&f=G&l=50&s1={number}'.format(**document)

        elif office == 'uspto' and service == 'images':
            if document_type == 'publication':
                # http://pdfpiw.uspto.gov/.piw?docid=9317610
                document = normalize_patent(document_number, as_dict=True, for_ops=False)
                url = 'http://pdfpiw.uspto.gov/.piw?docid={number}'.format(**document)
            elif document_type == 'application':
                # http://pdfaiw.uspto.gov/.aiw?docid=20160105912
                document = normalize_patent(document_number, as_dict=True, for_ops=False)
                url = 'http://pdfaiw.uspto.gov/.aiw?docid={number}'.format(**document)

        elif office == 'uspto' and service == 'global-dossier':
            # https://globaldossier.uspto.gov/#/result/publication/DE/112015004959/1
            normalized = normalize_patent(document_number, as_dict=True, for_ops=False)
            url = 'https://globaldossier.uspto.gov/#/result/{document_type}/{country}/{number}/1'.format(
                document_type=document_type, **normalized)

        elif office == 'google' and service == 'patents':
            # https://www.google.com/patents/EP0666666B1
            # https://patents.google.com/patent/EP0666666B1
            normalized = normalize_patent(document_number, for_ops=False)
            url = 'https://patents.google.com/patent/{}'.format(normalized)

        # Add Google Prior Art search again. See "priorArtKeywords" and "priorArtDate" in HTML response.

        if url:
            if redirect:
                return HTTPFound(location=url)
            else:
                return url

    return HTTPNotFound(u'Could not locate document "{document_number}" at {office}/{service}.'.format(
        document_number=document_number, office=office, service=service))
def document_to_number(self, document):
    """
    Derive a normalized publication number from the record's ``_id``
    field (format: ``CC.NNNNN.KK``).
    """
    country, serial, kindcode = document[u'_id'].split('.')
    return normalize_patent(country + serial + kindcode)
def pair_to_solr(cls, key, value, modifiers=None):
    """
    Translate a (key, value) search pair into an IFI CLAIMS (Solr-style)
    query expression.

    :return: ``{'query': expression}`` on success,
             ``{'error': True, 'message': ...}`` on parse failure,
             ``None`` for unknown keys.
    """
    try:
        fieldname = cls.datasource_indexnames[key]
    except KeyError:
        return

    expression = None
    format = u'{0}:{1}'

    # ------------------------------------------
    #   value mogrifiers
    # ------------------------------------------
    if key == 'patentnumber':
        # TODO: parse more sophisticated to make things like "EP666666 or EP666667" or "?query=pn%3AEP666666&datasource=ifi" possible
        # TODO: use different normalization flavor for IFI, e.g. JP01153210A will not work as JPH01153210A, which is required by OPS
        value = normalize_patent(value, for_ops=False)

    elif key == 'pubdate':
        """
        - pd:[19800101 TO 19851231]
        - pd:[* TO 19601231]
        - pdyear:[1980 TO 1985]
        - pdyear:[* TO 1960]
        """
        try:
            parsed = False

            # e.g. 1991 — a bare year searches the "pdyear" field.
            if len(value) == 4 and value.isdigit():
                fieldname = 'pdyear'
                parsed = True

            # e.g. 1990-2014, 1990 - 2014
            value = year_range_to_within(value)

            # e.g.
            # within 1978,1986
            # within 1900,2009-08-20
            # within 2009-08-20,2011-03-03
            if 'within' in value:
                within_dates = parse_date_within(value)
                # Pure year ranges also go against "pdyear".
                elements_are_years = all([len(value) == 4 and value.isdigit() for value in within_dates.values()])
                if elements_are_years:
                    fieldname = 'pdyear'
                else:
                    if within_dates['startdate']:
                        within_dates['startdate'] = parse_date_universal(within_dates['startdate']).format('YYYYMMDD')
                    if within_dates['enddate']:
                        within_dates['enddate'] = parse_date_universal(within_dates['enddate']).format('YYYYMMDD')
                # Open-ended ranges use the Solr wildcard.
                if not within_dates['startdate']:
                    within_dates['startdate'] = '*'
                if not within_dates['enddate']:
                    within_dates['enddate'] = '*'
                expression = '{fieldname}:[{startdate} TO {enddate}]'.format(fieldname=fieldname, **within_dates)

            elif not parsed:
                value_date = parse_date_universal(value)
                if value_date:
                    value = value_date.format('YYYYMMDD')
                else:
                    raise ValueError(value)

        except Exception as ex:
            message = 'IFI CLAIMS query: Invalid date or range expression "{0}". Reason: {1}.'.format(value, ex)
            logger.warn(message + '\nException was:\n{0}'.format(_exception_traceback()))
            return {'error': True, 'message': message}

    elif key == 'inventor' or key == 'applicant':
        if not has_booleans(value) and should_be_quoted(value):
            value = u'"{0}"'.format(value)

    elif key == 'class':
        # v1: Naive implementation can only handle single values
        #value = ifi_convert_class(value)

        # v2: Advanced implementation can handle expressions on field "class"
        # Translate class expression from "H04L12/433 or H04L12/24"
        # to "(ic:H04L0012433 OR cpc:H04L0012433) OR (ic:H04L001224 OR cpc:H04L001224)"
        try:
            # Put value into parenthesis, to properly capture expressions
            if value:
                value = u'({value})'.format(value=value)

            # Parse value as simple query expression
            query_object = CQL(cql=value)

            # Rewrite all patent classifications in query expression ast from OPS format to IFI format
            rewrite_classes_ifi(query_object, format, fieldname)

            # Serialize into appropriate upstream datasource query expression syntax
            expression = query_object.dumps()

        except pyparsing.ParseException as ex:
            return {'error': True, 'message': '<pre>' + str(ex.explanation) + '</pre>'}

    # ------------------------------------------
    #   surround with parentheses
    # ------------------------------------------
    if key in ['fulltext', 'inventor', 'applicant', 'country', 'citation']:
        if has_booleans(value) and not should_be_quoted(value) and not '{!complexphrase' in value:
            value = u'({0})'.format(value)

    # ------------------------------------------
    #   expression formatter
    # ------------------------------------------
    # Serialize into appropriate upstream datasource query expression syntax
    if not expression:
        if key == 'fulltext' and '{!complexphrase' in value:
            expression = value
        else:
            expression = format_expression(format, fieldname, value)
    #print 'expression:', expression

    # ------------------------------------------
    #   final polishing
    # ------------------------------------------
    # Solr(?) syntax: boolean operators must be uppercase
    if has_booleans(expression):
        boolis = [' or ', ' and ', ' not ']
        for booli in boolis:
            expression = expression.replace(booli, booli.upper())

    return {'query': expression}
def pair_to_elasticsearch(cls, key, value, modifiers=None):
    """
    Translate a (key, value) search pair into a depatech (Elasticsearch)
    query expression.

    :return: ``{'query': expression}`` on success,
             ``{'error': True, 'message': ...}`` on parse failure,
             ``None`` for unknown keys.
    """
    try:
        fieldname = cls.datasource_indexnames[key]
    except KeyError:
        return

    expression = None
    format = u'{0}:{1}'

    # ------------------------------------------
    #   value mogrifiers
    # ------------------------------------------
    if key == 'patentnumber':
        # Transform into distinct fields PC, DE, KI
        #if has_booleans(value):
        #    value = '({})'.format(value)

        expression_parts = []

        # Publication number
        patent = split_patent_number(value)
        patent_normalized = normalize_patent(patent, for_ops=False)
        if patent_normalized:
            patent = patent_normalized
        if patent:
            subexpression = u'PC:{country} AND DE:{number}'.format(**patent)
            if patent['kind']:
                subexpression += u' AND KI:{kind}'.format(**patent)
            expression_parts.append(u'({})'.format(subexpression))

        # Application number
        subexpression = u'AN:{}'.format(value)
        expression_parts.append(subexpression)
        # NOTE(review): this join is redundant — it is recomputed below
        # after the priority number part is appended; the later assignment
        # wins. Kept as-is.
        expression = u' OR '.join(expression_parts)

        # Priority number
        subexpression = u'NP:{}'.format(value)
        expression_parts.append(subexpression)
        expression = u' OR '.join(expression_parts)

    elif key == 'pubdate':
        """
        - DP:[19800101 TO 19851231]
        - DP:[* TO 19601231]
        """
        try:
            parsed = False

            # e.g. 1991 — expand a bare year to a full-year range.
            if len(value) == 4 and value.isdigit():
                value = u'within {}0101,{}1231'.format(value, value)

            # e.g. 1990-2014, 1990 - 2014
            value = year_range_to_within(value)

            # e.g.
            # within 1978,1986
            # within 1900,2009-08-20
            # within 2009-08-20,2011-03-03
            if 'within' in value:
                within_dates = parse_date_within(value)

                if within_dates['startdate']:
                    # Pad bare years to the first day of the year.
                    if len(within_dates['startdate']) == 4:
                        within_dates['startdate'] += '0101'
                    within_dates['startdate'] = parse_date_universal(
                        within_dates['startdate']).format('YYYYMMDD')
                else:
                    within_dates['startdate'] = '*'

                if within_dates['enddate']:
                    # Pad bare years to the last day of the year.
                    if len(within_dates['enddate']) == 4:
                        within_dates['enddate'] += '1231'
                    within_dates['enddate'] = parse_date_universal(
                        within_dates['enddate']).format('YYYYMMDD')
                else:
                    within_dates['enddate'] = '*'

                expression = '{fieldname}:[{startdate} TO {enddate}]'.format(
                    fieldname=fieldname, **within_dates)

            elif not parsed:
                value_date = parse_date_universal(value)
                if value_date:
                    value = value_date.format('YYYYMMDD')
                else:
                    raise ValueError(value)

        except Exception as ex:
            message = 'depatech query: Invalid date or range expression "{0}". Reason: {1}.'.format(
                value, ex)
            logger.warn(
                message + ' Exception was: {0}'.format(_exception_traceback()))
            return {'error': True, 'message': message}

    elif key == 'inventor' or key == 'applicant':
        if not has_booleans(value) and should_be_quoted(value):
            value = u'"{0}"'.format(value)

    elif key == 'class':
        # v1: Naive implementation can only handle single values
        #value = lucene_convert_class(value)

        # v2: Advanced implementation can handle expressions on field "class"
        # Translate class expression from "H04L12/433 or H04L12/24"
        # to "(ic:H04L0012433 OR cpc:H04L0012433) OR (ic:H04L001224 OR cpc:H04L001224)"
        try:
            # Put value into parenthesis, to properly capture expressions
            if value:
                value = u'({value})'.format(value=value)

            # Parse value as simple query expression
            query_object = CQL(cql=value)

            # Rewrite all patent classifications in query expression ast from OPS format to Lucene format
            rewrite_classes_lucene(query_object, format, fieldname)

            # Serialize into appropriate upstream datasource query expression syntax
            expression = query_object.dumps()

        except pyparsing.ParseException as ex:
            return {
                'error': True,
                'message': '<pre>' + str(ex.explanation) + '</pre>'
            }

    elif key == 'country':
        value = value.upper()

    # ------------------------------------------
    #   surround with parentheses
    # ------------------------------------------
    if key in ['fulltext', 'inventor', 'applicant', 'country', 'citation']:
        if has_booleans(value) and not should_be_quoted(value):
            value = u'({0})'.format(value)

    # ------------------------------------------
    #   expression formatter
    # ------------------------------------------
    # Serialize into appropriate upstream datasource query expression syntax
    if not expression:
        expression = format_expression(format, fieldname, value)
    #print 'expression:', expression

    # ------------------------------------------
    #   final polishing
    # ------------------------------------------
    # Solr(?) syntax: boolean operators must be uppercase
    if has_booleans(expression):
        boolis = [' or ', ' and ', ' not ']
        for booli in boolis:
            expression = expression.replace(booli, booli.upper())

    return {'query': expression}
def invalidate_xml(number):
    """
    Drop the cached ``get_xml`` result for the given document number.
    """
    region_invalidate(get_xml, None, 'get_xml', normalize_patent(number))