def match_filter(item, filter):
    """
    Decide whether ``item`` passes ``filter``.

    - A callable ``filter`` receives the result of ``split_patent_number(item)``
      and its return value is handed back unchanged.
    - Any other ``filter`` is used as a prefix for ``item.startswith()``.
    """
    # Plain prefix matching is the simple case, so handle it first.
    if not callable(filter):
        return item.startswith(filter)

    # Predicate filters operate on the split document number.
    return filter(split_patent_number(item))
def depatisconnect_alternatives(number):
    """
    Reverse "fix_patent" for DE documents.

    Returns a list of candidate document numbers. The list always starts
    with the original ``number``, except for DE documents with a non-numeric
    number part, where only the re-joined number is returned.
    """

    # The original number always comes first.
    candidates = [number]

    document = split_patent_number(number)

    # Only German documents get alternative kind codes.
    if document['country'] != 'DE':
        return candidates

    # Non-numeric DE numbers cannot be classified by range.
    if not document['number'].isdigit():
        return [join_patent(document)]

    serial = int(document['number'])

    if serial < 1000000:
        # e.g. DE000000121107A, DE000000801283B
        if document['kind'] == 'C':
            for kindcode in ('B', 'A'):
                document['kind'] = kindcode
                candidates.append(join_patent(document))

    elif serial < 1400000:
        # e.g. DE000001020931A: no alternatives for this range
        pass

    else:
        # e.g. DE000002363448A
        if document['kind'] == 'A1':
            document['kind'] = 'A'
            candidates.append(join_patent(document))

    return candidates
def normalize_patent(number, as_dict=False, as_string=False, fix_kindcode=False, for_ops=True, provider=None):
    """
    Normalize a patent document number.

    ``number`` is either a patent dictionary or a string, which gets split
    using ``split_patent_number``. The document is run through
    ``patch_patent`` for the given ``provider``, which defaults to ``'ops'``
    when ``for_ops`` is true and no provider is given. With ``fix_kindcode``,
    ``fix_patent_kindcode_ops`` is applied to the normalized document.

    By default, the result has the same flavour as the input: a dictionary
    for dictionary input, a joined string otherwise. ``as_dict`` forces a
    dictionary result, ``as_string`` a string result; ``as_dict`` wins when
    both are set.
    """

    if for_ops is True and provider is None:
        provider = 'ops'

    # 1. Use patent dictionaries as-is, split strings
    given_as_dict = isinstance(number, types.DictionaryType)
    document = number if given_as_dict else split_patent_number(number)

    # 2.a. Normalize the patent dictionary
    normalized = patch_patent(document, provider=provider)

    # 2.b. Apply fixes
    if fix_kindcode:
        fix_patent_kindcode_ops(normalized)

    # 3.a. Default mechanism: mirror the flavour of the input
    outcome = normalized if given_as_dict else join_patent(normalized)

    # 3.b. Extended mechanism: honour explicitly requested result flavour
    if as_dict:
        return normalized
    if as_string:
        return join_patent(normalized)

    return outcome
def get_drawing_png(document, page, kind):
    """
    Return a drawing of ``document`` as PNG payload.

    The image is requested from OPS in TIFF format first. When OPS answers
    with ``HTTPNotFound``, US documents fall back to USPTO and CA documents
    fall back to CIPO; for all other documents the OPS error is passed
    through. Raises ``HTTPNotFound`` when no image could be obtained.
    """

    # Try to fetch drawing from OPS, fall back to other patent offices
    try:
        payload = get_ops_image(document, page, kind, 'tiff')

    except HTTPNotFound:

        document_upper = document.upper()

        if document_upper.startswith('US'):
            # Fallback to USPTO (U.S.)
            uspto_id = normalize_patent(split_patent_number(document), for_ops=False)
            try:
                payload = get_uspto_image_cached(uspto_id)
            except PayloadEmpty:
                raise HTTPNotFound('No drawing for "{0}" at OPS or USPTO'.format(document))

        elif document_upper.startswith('CA'):
            # Fallback to CIPO (Canada)
            cipo_id = split_patent_number(document)
            try:
                payload = get_cipo_image_cached(cipo_id)
            except PayloadEmpty:
                raise HTTPNotFound('No drawing for "{0}" at OPS or CIPO'.format(document))

        else:
            # No fallback available: pass through the OPS exception
            raise

    # Croak if no image is available
    if not payload:
        msg = 'No image available for document={document}, kind={kind}, page={page}'.format(
            document=document, kind=kind, page=page)
        log.warn(msg)
        raise HTTPNotFound(msg)

    # Convert image from TIFF to PNG format
    return to_png(BytesIO(payload))
def test_denormalization():
    """
    Print a two-column table of WO numbers and their denormalized form.

    Each non-empty payload line is split, denormalized and joined again.
    Lines starting with ``---`` are echoed as separators.
    """

    payload = """
WO2002051230
WO2002051231
WO2006113621A3
WO1998016331A3
WO2000001014A1
WO2001002000A3
WO1999012345
WO1999123456
WO2001012345
WO2001098623A1
WO2001098623A1
WO2001098623A1
WO2001098623A1
WO2003107732
WO2003107732
WO2004000001
WO1999013800
WO1999023997
WO1990004917
WO2000027301
WO2000000748
WO2003043359
WO2003107520
WO2007054055
---
WO1990004917
"""

    ruler = "-" * 30
    print(ruler)
    print("original\tdenormalized")
    print(ruler)

    for line in payload.split("\n"):

        # Skip blank lines; splitting on "\n" never yields a bare newline.
        if not line:
            continue

        # Echo separator lines verbatim.
        if line.startswith('---'):
            print(line)
            continue

        document = denormalize_patent(split_patent_number(line))
        print("%s\t%s" % (line, join_patent(document)))
def ops_register(reference_type, document_number, constituents=None, xml=False):
    """
    Request register information from OPS in JSON or XML format.

    reference_type = publication|application|priority
    constituents defaults to "biblio,legal"

    Examples:
    - http://ops.epo.org/3.1/rest-services/register/publication/epodoc/EP2485810/biblio
    - http://ops.epo.org/3.1/rest-services/register/publication/epodoc/EP2485810/biblio,legal.json
    """

    requested = 'biblio,legal' if constituents is None else constituents

    # Build the epodoc identifier from country + number, plus kind code.
    document = split_patent_number(document_number)
    epodoc_number = document.country + document.number
    ops_id = epo_ops.models.Epodoc(epodoc_number, document.kind)

    # Acquire register information from OPS.
    with ops_client(xml=xml) as client:
        response = client.register(reference_type, ops_id, constituents=to_list(requested))
        return handle_response(response, 'ops-register')
def ops_family_inpadoc(reference_type, document_number, constituents, xml=False):
    """
    Request family information from OPS, using an epodoc identifier.

    reference_type = publication|application|priority
    constituents   = biblio|legal

    Examples:
    - http://ops.epo.org/3.1/rest-services/family/publication/docdb/EP.1491501.A1/biblio,legal
    - http://ops.epo.org/3.1/rest-services/family/publication/docdb/EP0666666/biblio
    - http://ops.epo.org/3.1/rest-services/family/publication/docdb/EP0666666.A2/biblio
    - http://ops.epo.org/3.1/rest-services/family/publication/docdb/EP0666666.B1/biblio
    """

    # Build the epodoc identifier from country + number, plus kind code.
    document = split_patent_number(document_number)
    epodoc_number = document.country + document.number
    ops_id = epo_ops.models.Epodoc(epodoc_number, document.kind)

    # Acquire family information from OPS.
    with ops_client(xml=xml) as client:
        response = client.family(reference_type, ops_id, constituents=to_list(constituents))
        return handle_response(response, 'ops-family')
def ops_family_publication_docdb_xml(reference_type, document_number, constituents):
    """
    Request family information from OPS in XML format.

    reference_type = publication|application|priority
    constituents   = biblio|legal

    Examples:
    - http://ops.epo.org/3.1/rest-services/family/publication/docdb/EP.1491501.A1/biblio,legal

    The client's ``accept_type`` is switched to XML for the duration of the
    request and always reset to JSON afterwards, also when the request fails.
    """

    # Compute document identifier.
    document_id = split_patent_number(document_number)
    ops_id = epo_ops.models.Docdb(document_id.number, document_id.country, document_id.kind)

    # Acquire family information from OPS.
    ops = get_ops_client()

    # FIXME: Better use "accept_type" on a per-request basis supported by ``python-epo-ops-client``.
    ops.accept_type = 'application/xml'
    try:
        response = ops.family(reference_type, ops_id, constituents=to_list(constituents))
    finally:
        # Reset even when the request raises, so a failed call does not
        # leave the client object in XML mode for subsequent requests.
        ops.accept_type = 'application/json'

    return handle_response(response, 'ops-family')
def normalize_patent_wo_pct(patent):
    """
    Normalizes to "WIPO Application Number" format, e.g. PCT/US2005/009417

    Takes inputs like WOPCT/US02/03226, PCT/US1999/9417 or WOEP/2004/008531

    see "International Application No.":
    http://www.wipo.int/pctdb/en/wo.jsp?IA=PCT/US2005/009417
    http://www.wipo.int/pctdb/en/wo.jsp?IA=US2005009417

    see also:
    http://www.wipo.int/edocs/pctdocs/en/2005/pct_2005_42-section3.pdf

    ``patent`` is a patent dictionary whose ``country`` must be ``'WO'``.
    The input is not modified; a shallow copy is patched and returned.

    Returns:
    - the result of ``normalize_patent_wo`` when the number actually
      carries a WO publication number (e.g. "WOPCT/WO9831467")
    - ``None`` when the number does not consist of three segments
    - otherwise the patched copy, with an empty ``country`` and the number
      rewritten to ``<pct>/<country+year>/<six-digit sequence>``
    """

    assert patent['country'] == 'WO'

    patched = copy(patent)

    # Split on "/", "|" or "-". The pattern is a raw string because "\/" is
    # not a valid string escape and triggers warnings on modern Python;
    # the compiled pattern is identical to the former non-raw literal.
    r = re.compile(r'[\/|-]')
    parts = r.split(patched['number'])

    # handle special formatting like "WOPCT/WO9831467": convert to WO publication number
    if len(parts) == 2:
        patent_number = parts[1]
        if patent_number.startswith('WO'):
            wo_patent = split_patent_number(patent_number)
            return normalize_patent_wo(wo_patent)

    # only allow numbers containing three segments
    if not len(parts) == 3:
        return

    # assign segment names
    pct = parts[0]
    country_year = parts[1]
    seqnumber = parts[2]

    # handle special formatting like "WOPCT-WO97/29690": convert to WO publication number
    if country_year.startswith('WO'):
        wo_patent = split_patent_number(country_year + seqnumber)
        return normalize_patent_wo(wo_patent)

    # handle special formatting like "WOEP/2004/008531"
    if pct.startswith('WO') and len(pct) == 4:
        country_year = pct[2:4] + country_year

    # assume s.th. like "EP02": expand year to full year
    if len(country_year) == 4:
        # assume for century: 78-99 => 19, otherwise => 20
        # build fullyear from (2-digit) year
        fullyear = fullyear_from_year(country_year[2:])
        country_year = country_year[0:2] + fullyear

    # pad sequential number to six digits with leading zeros
    seqnumber = pad_left(seqnumber, '0', 6)

    # delete country,
    patched['country'] = ''
    patched['number'] = ('%s/%s/%s' % (pct, country_year, seqnumber))

    return patched
images_index_url = None for anchor in anchors: if "Drawings" in str(anchor): images_index_url = cipo_baseurl + anchor['href'] break if not images_index_url: return # 2. fetch and parse images index page images_index_html = fetch_images_index(images_index_url) soup = BeautifulSoup(images_index_html) # <img src="/opic-cipo/cpd/page/141597_20130713_drawings_page1_scale25_rotate0.gif?page=3&section=drawings&scale=25&rotation=0&type=" alt="Canadian Patent Document 141597. Drawings page. Image 1 of 3" /> first_drawing_url = cipo_baseurl + soup.find('img', src=re.compile(ur'/opic-cipo/cpd/page'))['src'] return first_drawing_url if __name__ == '__main__': numbers = [ 'CA141597A' ] for number in numbers: payload = fetch_first_drawing(split_patent_number(number)) if payload: #print "payload length:", len(payload) print payload else: print "not found"
def generate(self, data):
    """
    Yield one ``(number, expected, computed)`` triple per entry of ``data``.

    ``data`` maps document numbers to their expected results; ``computed``
    is what ``split_patent_number`` returns for the number.
    """
    for raw_number, expected in data.iteritems():
        computed = split_patent_number(raw_number)
        yield raw_number, expected, computed
def compute(self):
    """Split ``self.original`` with ``split_patent_number`` and store the result in ``self.document``."""
    parsed = split_patent_number(self.original)
    self.document = parsed
def pair_to_elasticsearch(cls, key, value, modifiers=None):
    """
    Translate a single ``key``/``value`` criterion into a query expression.

    ``key`` is looked up in ``cls.datasource_indexnames`` to obtain the
    upstream field name. ``modifiers`` is accepted but not used by this
    function.

    Returns:
    - ``None`` when ``key`` is not in ``cls.datasource_indexnames``
    - ``{'error': True, 'message': ...}`` when a "pubdate" value cannot be
      parsed or a "class" expression raises ``pyparsing.ParseException``
    - ``{'query': expression}`` otherwise
    """

    # Unknown criteria are silently ignored.
    try:
        fieldname = cls.datasource_indexnames[key]
    except KeyError:
        return

    # Branches below either compute ``expression`` directly or just
    # massage ``value`` and leave expression building to the formatter.
    expression = None
    format = u'{0}:{1}'

    # ------------------------------------------
    #   value mogrifiers
    # ------------------------------------------
    if key == 'patentnumber':

        # Transform into distinct fields PC, DE, KI

        #if has_booleans(value):
        #    value = '({})'.format(value)

        expression_parts = []

        # Publication number
        patent = split_patent_number(value)

        # Prefer the normalized document, fall back to the split one.
        patent_normalized = normalize_patent(patent, for_ops=False)
        if patent_normalized:
            patent = patent_normalized

        if patent:
            subexpression = u'PC:{country} AND DE:{number}'.format(**patent)
            if patent['kind']:
                subexpression += u' AND KI:{kind}'.format(**patent)
            expression_parts.append(u'({})'.format(subexpression))

        # Application number
        subexpression = u'AN:{}'.format(value)
        expression_parts.append(subexpression)
        # NOTE(review): this intermediate join is overwritten a few lines below.
        expression = u' OR '.join(expression_parts)

        # Priority number
        subexpression = u'NP:{}'.format(value)
        expression_parts.append(subexpression)
        expression = u' OR '.join(expression_parts)

    elif key == 'pubdate':

        """ - DP:[19800101 TO 19851231] - DP:[* TO 19601231] """

        try:

            # NOTE(review): ``parsed`` is never set to True in this function,
            # so the ``elif not parsed`` branch below acts like a plain ``else``.
            parsed = False

            # e.g. 1991
            if len(value) == 4 and value.isdigit():
                value = u'within {}0101,{}1231'.format(value, value)

            # e.g. 1990-2014, 1990 - 2014
            value = year_range_to_within(value)

            # e.g.
            # within 1978,1986
            # within 1900,2009-08-20
            # within 2009-08-20,2011-03-03
            if 'within' in value:
                within_dates = parse_date_within(value)

                # Four-character start dates are taken as a year and
                # expanded to January 1st; a missing start date is open-ended.
                if within_dates['startdate']:
                    if len(within_dates['startdate']) == 4:
                        within_dates['startdate'] += '0101'
                    within_dates['startdate'] = parse_date_universal(within_dates['startdate']).format('YYYYMMDD')
                else:
                    within_dates['startdate'] = '*'

                # Four-character end dates are taken as a year and
                # expanded to December 31st; a missing end date is open-ended.
                if within_dates['enddate']:
                    if len(within_dates['enddate']) == 4:
                        within_dates['enddate'] += '1231'
                    within_dates['enddate'] = parse_date_universal(within_dates['enddate']).format('YYYYMMDD')
                else:
                    within_dates['enddate'] = '*'

                expression = '{fieldname}:[{startdate} TO {enddate}]'.format(fieldname=fieldname, **within_dates)

            elif not parsed:
                # Single date: reformat the value, the expression itself
                # is built by the generic formatter further down.
                value_date = parse_date_universal(value)
                if value_date:
                    value = value_date.format('YYYYMMDD')
                else:
                    raise ValueError(value)

        except Exception as ex:
            # Any failure while parsing is reported as an error result
            # instead of being raised to the caller.
            message = 'depatech query: Invalid date or range expression "{0}". Reason: {1}.'.format(value, ex)
            logger.warn(message + ' Exception was: {0}'.format(_exception_traceback()))
            return {'error': True, 'message': message}

    elif key == 'inventor' or key == 'applicant':
        if not has_booleans(value) and should_be_quoted(value):
            value = u'"{0}"'.format(value)

    elif key == 'class':

        # v1: Naive implementation can only handle single values
        #value = lucene_convert_class(value)

        # v2: Advanced implementation can handle expressions on field "class"
        # Translate class expression from "H04L12/433 or H04L12/24"
        # to "(ic:H04L0012433 OR cpc:H04L0012433) OR (ic:H04L001224 OR cpc:H04L001224)"
        try:

            # Put value into parenthesis, to properly capture expressions
            if value:
                value = u'({value})'.format(value=value)

            # Parse value as simple query expression
            query_object = CQL(cql=value)

            # Rewrite all patent classifications in query expression ast from OPS format to Lucene format
            rewrite_classes_lucene(query_object, format, fieldname)

            # Serialize into appropriate upstream datasource query expression syntax
            expression = query_object.dumps()

        except pyparsing.ParseException as ex:
            return {'error': True, 'message': '<pre>' + str(ex.explanation) + '</pre>'}

    elif key == 'country':
        value = value.upper()

    # ------------------------------------------
    #   surround with parentheses
    # ------------------------------------------
    if key in ['fulltext', 'inventor', 'applicant', 'country', 'citation']:
        if has_booleans(value) and not should_be_quoted(value):
            value = u'({0})'.format(value)

    # ------------------------------------------
    #   expression formatter
    # ------------------------------------------
    # Serialize into appropriate upstream datasource query expression syntax
    # Only applies when no branch above computed an expression already.
    if not expression:
        expression = format_expression(format, fieldname, value)
        #print 'expression:', expression

    # ------------------------------------------
    #   final polishing
    # ------------------------------------------
    # Solr(?) syntax: boolean operators must be uppercase
    # Only space-delimited lowercase operators are replaced.
    if has_booleans(expression):
        boolis = [' or ', ' and ', ' not ']
        for booli in boolis:
            expression = expression.replace(booli, booli.upper())

    return {'query': expression}