def denormalize_patent_wo(patent):
    """
    Shorten the number of a WO publication, based on the length of
    ``patent['number']``:

    - 10 characters (4-digit year + 6-digit serial):
      for years 19xx the result is the 2-digit year followed by the serial
      padded to five digits; for years 20xx the conversion is delegated to
      ``denormalize_patent_wo_algo``; any other century is left untouched.
    - 8 characters (2-digit year + 6-digit serial):
      the year is expanded through ``fullyear_from_year`` and the conversion
      is delegated to ``denormalize_patent_wo_algo``.
    - 9 characters (wrong format, assumed 4-digit year + 5-digit serial):
      delegated to ``denormalize_patent_wo_algo``.
    - any other length: the number is left untouched.

    :param patent: mapping with a ``country`` entry (must be ``'WO'``) and
        a ``number`` entry
    :return: a copy of ``patent`` (made with ``patent.copy()``) carrying the
        rewritten ``number``
    :raises AssertionError: if ``patent['country']`` is not ``'WO'``
    """
    assert patent['country'] == 'WO'

    result = patent.copy()
    number = patent['number']
    size = len(number)

    # nothing to do for lengths we do not know how to decode
    if size not in (8, 9, 10):
        return result

    # 4+6 layout
    if size == 10:
        fullyear, serial = number[:4], number[4:]
        century = fullyear[:2]
        if century == '19':
            # strip leading zeros from the serial, then pad it to five digits
            result['number'] = fullyear[2:] + pad_left(str(int(serial)), '0', 5)
        elif century == '20':
            result['number'] = denormalize_patent_wo_algo(int(fullyear), int(serial))
        return result

    if size == 8:
        # 2+6 layout: expand the two-digit year first
        fullyear = fullyear_from_year(number[:2])
        serial = number[2:]
    else:
        # 9 characters: wrong format, assume 4+5
        fullyear, serial = number[:4], number[4:]

    result['number'] = denormalize_patent_wo_algo(int(fullyear), int(serial))
    return result
def normalize_patent_wo_pct(patent):
    """
    Normalizes to "WIPO Application Number" format, e.g. PCT/US2005/009417

    Takes inputs like WOPCT/US02/03226, PCT/US1999/9417 or WOEP/2004/008531

    see "International Application No.":
    http://www.wipo.int/pctdb/en/wo.jsp?IA=PCT/US2005/009417
    http://www.wipo.int/pctdb/en/wo.jsp?IA=US2005009417

    see also:
    http://www.wipo.int/edocs/pctdocs/en/2005/pct_2005_42-section3.pdf

    The number is split on "/", "|" or "-" and handled by segment count:

    - two segments, the second starting with "WO" (formatting like
      "WOPCT/WO9831467"): treated as a WO publication number and passed on
      to ``normalize_patent_wo``
    - three segments, the second starting with "WO" (formatting like
      "WOPCT-WO97/29690"): second and third segment are joined and passed
      on to ``normalize_patent_wo``
    - three segments otherwise: a two-digit year in the second segment is
      expanded to four digits, the third segment is padded to six digits,
      and the segments are re-joined with "/" (the first segment is
      emitted unchanged)
    - anything else: not handled

    :param patent: mapping with a ``country`` entry (must be ``'WO'``) and
        a ``number`` entry
    :return: ``None`` if the number is not handled; the result of
        ``normalize_patent_wo`` for the WO publication cases; otherwise a
        copy of ``patent`` with ``country`` set to ``''`` and ``number``
        set to the normalized application number
    :raises AssertionError: if ``patent['country']`` is not ``'WO'``
    """
    assert patent['country'] == 'WO'

    patched = copy(patent)

    # Raw string literal: "\/" is not a valid *string* escape, so the non-raw
    # spelling triggers an invalid-escape warning on current Python 3.
    # The regular expression itself is unchanged.
    parts = re.split(r'[\/|-]', patched['number'])
    segment_count = len(parts)

    # handle special formatting like "WOPCT/WO9831467": convert to WO publication number
    if segment_count == 2 and parts[1].startswith('WO'):
        return normalize_patent_wo(split_patent_number(parts[1]))

    # only allow numbers containing three segments
    if segment_count != 3:
        return None

    pct, country_year, seqnumber = parts

    # handle special formatting like "WOPCT-WO97/29690": convert to WO publication number
    if country_year.startswith('WO'):
        return normalize_patent_wo(split_patent_number(country_year + seqnumber))

    # handle special formatting like "WOEP/2004/008531":
    # move the country code from the first segment in front of the year
    if pct.startswith('WO') and len(pct) == 4:
        country_year = pct[2:4] + country_year

    # assume s.th. like "EP02": expand the two-digit year to a full year
    # (original note: century is assumed 19 for 78-99, otherwise 20;
    # the actual rule lives in fullyear_from_year)
    if len(country_year) == 4:
        country_year = country_year[0:2] + fullyear_from_year(country_year[2:])

    # pad sequential number to six digits with leading zeros
    seqnumber = pad_left(seqnumber, '0', 6)

    # the country is blanked, the application number carries everything
    patched['country'] = ''
    patched['number'] = '%s/%s/%s' % (pct, country_year, seqnumber)

    return patched
def normalize_patent_wo(patent):
    """
    Normalizes to "WIPO Publication Number" format, e.g. WO2005092324

    see "Pub. No.":
    http://www.wipo.int/pctdb/en/wo.jsp?IA=WO/2005/092324
    http://www.wipo.int/pctdb/en/wo.jsp?IA=WO0067640

    Handling depends on ``patent['number']``:

    - numbers starting with a non-digit character (special documents with
      an alphanumeric prefix) are returned untouched
    - 7 or 8 characters (2-digit year + 5- or 6-digit serial): the year is
      expanded through ``fullyear_from_year`` and the serial is padded to
      six characters
    - 9 characters (wrong format, 4-digit year + 5-digit serial): the
      serial is padded to six characters
    - other lengths are not restructured

    Except for the untouched special documents, the resulting number is
    finally passed through ``trim_leading_zeros``.

    :param patent: mapping with a ``country`` entry (must be ``'WO'``) and
        a ``number`` entry
    :return: a copy of ``patent`` carrying the normalized ``number``
    :raises AssertionError: if ``patent['country']`` is not ``'WO'``
    """
    assert patent['country'] == 'WO'

    patched = copy(patent)
    number = patched['number']

    # filter: leave special documents untouched (with alphanumeric prefix).
    # Raw string literal: "\D" is not a valid *string* escape, so the non-raw
    # spelling triggers an invalid-escape warning on current Python 3.
    if re.match(r'^\D+', number):
        return patched

    length = len(number)

    # convert from 2+5 or 2+6 to 4+6
    if length in (7, 8):
        # build fullyear from (2-digit) year
        # (original note: century is assumed 19 for 78-99, otherwise 20;
        # the actual rule lives in fullyear_from_year)
        fullyear = fullyear_from_year(number[0:2])

        # NOTE: 8-character inputs with a leading zero are ambiguous. An
        # ordering system ("Bestellsystem") that pads everything to 8
        # characters may emit 1 zero + 2-digit year + 5-digit serial, so
        # e.g. WO00101000A1 could be WO2000101000A1 or WO2001001000A1.
        # No special decoding is applied here: such inputs are read as 2+6.
        # TODO: what about WO09802618A2, WO00202618A2, WO00402618A2,
        #       WO09201000A1, WO09901000A3, WO00101000A1?

        # pad sequential number to 6 characters
        patched['number'] = fullyear + pad_left(number[2:], '0', 6)

    # convert from 4+5 to 4+6 (wrong format)
    elif length == 9:
        # pad sequential number to 6 characters
        patched['number'] = number[0:4] + pad_left(number[4:], '0', 6)

    patched['number'] = trim_leading_zeros(patched['number'])
    return patched