Пример #1
0
def normalize_patent_au(patent):
    """
    Bring an "Australian" document number into canonical shape,
    e.g. AU2003212220A1, AU200042655B2, AU00784257B2.

    Patent Application Number:
      old: 4+5 digits  (Patadmin, before 5 July 2002)
      new: 4+6 digits  (PAMS, after 5 July 2002)
      http://apa.hpa.com.au:8080/ipapa/intro
      http://pericles.ipaustralia.gov.au/aub/aub_pages_1.intro
    Patent Number:
      6 digits
      http://pericles.ipaustralia.gov.au/aub/aub_pages_1.intro

    The input dict is left alone; a copy (made with ``copy``) carrying the
    rewritten ``number`` is returned.
    """

    assert patent['country'] == 'AU'

    result = copy(patent)
    number = result['number']

    if len(number) == 9:
        # old 4+5 application number: keep the 4-digit year and widen the
        # sequential part to 6 characters to obtain the new 4+6 layout
        year_part, serial_part = number[:4], number[4:]
        number = year_part + pad_left(serial_part, '0', 6)
    else:
        # anything else: drop leading zeros first
        number = trim_leading_zeros(number)

    # plain patent numbers are at least 6 characters wide
    if len(number) < 6:
        number = pad_left(number, '0', 6)

    result['number'] = number
    return result
Пример #2
0
def normalize_patent_wo(patent):
    """
    Normalizes to "WIPO Publication Number" format, e.g. WO2005092324

    see "Pub. No.":
    http://www.wipo.int/pctdb/en/wo.jsp?IA=WO/2005/092324
    http://www.wipo.int/pctdb/en/wo.jsp?IA=WO0067640

    :param patent: dict with ``country`` (must be ``'WO'``) and ``number`` keys.
    :return: a copy of ``patent`` (made with ``copy``) with a rewritten
             ``number``. Numbers that start with a non-digit character are
             returned unchanged in the copy.
    :raises AssertionError: if ``patent['country']`` is not ``'WO'``.
    """

    assert patent['country'] == 'WO'

    patched = copy(patent)

    # filter: leave special documents untouched (with alphanumeric prefix)
    # NOTE: raw string, so that "\D" reaches the regex engine as-is instead of
    # being an invalid string escape sequence.
    if re.match(r'^\D+', patched['number']):
        return patched

    length = len(patched['number'])

    # convert from 2+5 or 2+6 to 4+6
    if length == 7 or length == 8:

        year = patched['number'][0:2]
        seqnumber = patched['number'][2:]

        # assume for century: 78-99 => 19, otherwise => 20
        # build fullyear from (2-digit) year
        fullyear = fullyear_from_year(year)

        # TODO: 8-character numbers may also be "1 zero + 2 year + 5 seqnumber"
        #       (wrong format due to "pad everything to 8 characters" logic of
        #       Bestellsystem). This alternative decoding is currently disabled.
        #       What about WO09802618A2, WO00202618A2, WO00402618A2,
        #       WO09201000A1, WO09901000A3, WO00101000A1?
        #   1. numbers like WO00101000A1 are ambiguous, could be
        #      WO2000101000A1 or WO2001001000A1
        #   2. same with 8 digit numbers >= 2004, starting with "WO004...";
        #      hint: WO00402618A2 can not be WO2000402618A2 (due to format 2+6
        #      and release date), so must be WO2004002618A2

        # pad sequential number to 6 characters
        patched['number'] = fullyear + pad_left(seqnumber, '0', 6)

    # convert from 4+5 to 4+6 (wrong format)
    elif length == 9:
        fullyear = patched['number'][0:4]
        seqnumber = patched['number'][4:]

        # pad sequential number to 6 characters
        patched['number'] = fullyear + pad_left(seqnumber, '0', 6)

    patched['number'] = trim_leading_zeros(patched['number'])
    return patched
Пример #3
0
def normalize_patent_us(patent, provider=None):
    """
    Normalize the ``number`` of a US document dict for a given data provider.

    :param patent: dict with ``country`` (must be ``'US'``) and ``number``;
                   optional keys read here are ``number-type``,
                   ``number-real`` and ``kind``.
    :param provider: ``'ops'`` or ``'espacenet'`` selects the compact 4+6
                     application format with leading zeros stripped; any
                     other value selects the 4+7 / zero-padded format.
    :return: a copy of ``patent`` (made with ``copy``) with a rewritten
             ``number``; the input dict is not modified.
    :raises AssertionError: if ``patent['country']`` is not ``'US'``.
    """

    # USPTO number formats

    # PATFT - Issued Patents:
    # http://patft.uspto.gov/netahtml/PTO/srchnum.htm
    #
    #   Utility                           --   5,146,634 6923014 0000001
    #   Design                            --    D339,456 D321987 D000152
    #   Plant                             --    PP08,901 PP07514 PP00003
    #   Reissue                           --    RE35,312 RE12345 RE00007
    #   Defensive Publication             --    T109,201 T855019 T100001
    #   Statutory Invention Registration  --    H001,523 H001234 H000001
    #   Re-examination                    --    RX12
    #   Additional Improvement            --    AI00,002 AI000318 AI00007
    subtype_prefixes = ['D', 'PP', 'RD', 'RE', 'T', 'H', 'AI']

    # AppFT - Patent Applications
    # http://appft.uspto.gov/netahtml/PTO/srchnum.html
    #
    #   Utility: 20010000044

    assert patent['country'] == 'US'

    patched = copy(patent)

    length = len(patched['number'])

    # Prefixed documents carry the prefix and the bare sequence number
    # in two separate keys.
    has_subtype = 'number-type' in patched and 'number-real' in patched

    if provider == 'ops' or provider == 'espacenet':

        # OPS accepts US patent application publication numbers in 4+6=10 format
        # Examples: US2015322651A1, US2017250417A1, US2017285092A1

        # 2017-10-25
        # DEPATISnet started delivering application publication numbers in 5+7 format
        # with a leading zero after the country, e.g. US000006166174A, US020170285092A1
        # around October 2017. Account for that.
        if length == 12:
            patched['number'] = patched['number'].lstrip('0')
            length = len(patched['number'])

        # US application publication numbers: Convert from 4+5=9 to 4+6=10
        if length == 9:
            padding = '0' * (10 - length)
            patched['number'] = patched['number'][0:4] + padding + patched['number'][4:]

        # US application publication numbers: Convert from 4+7=11 to 4+6=10
        # 2015-12-20: Normalize responses from SIP like "US20150322651A1" to "US2015322651A1"
        elif length == 11:
            if patched['number'][4] == '0':
                patched['number'] = patched['number'][0:4] + patched['number'][5:]

        # US patents: Handle document numbers with character prefixes
        # Trim leading zeros for OPS
        elif has_subtype:
            subtype = patched['number-type']
            seqnumber = patched['number-real']
            if subtype in subtype_prefixes:
                patched['number'] = subtype + trim_leading_zeros(seqnumber)

        # US patents: Strip leading zeros
        else:
            patched['number'] = patched['number'].lstrip('0')

    else:

        # US patents: Handle document numbers with character prefixes
        # Pad patent number with zeros to get total length of 7 characters
        if has_subtype:
            subtype = patched['number-type']
            seqnumber = patched['number-real']
            if subtype in subtype_prefixes:
                patched['number'] = subtype + seqnumber.zfill(7)

        # Convert from 4+5=9 or 4+6=10 to 4+7=11
        # US20170000054A1
        elif length == 9 or length == 10:
            padding = '0' * (11 - length)
            patched['number'] = patched['number'][0:4] + padding + patched['number'][4:]

    # 2018-04-23: Espacenet changed behavior, handle edge case for
    # USD813591S to yield https://worldwide.espacenet.com/publicationDetails/claims?CC=US&NR=D813591S&KC=S
    if provider == 'espacenet':
        # Use .get() for the kind code: a design document without a "kind"
        # key must not blow up with a KeyError, it simply gets no suffix.
        if patched.get('number-type') == 'D' and patched.get('kind') == 'S':
            patched['number'] += patched['kind']

    return patched
Пример #4
0
def patch_patent(patent, provider=None):
    """
    Apply country-specific normalization to the ``number`` of a document dict.

    The input is not modified; the work happens on a copy made with ``copy``.
    Countries without a dedicated rule only get their leading zeros trimmed.

    :param patent: dict with at least ``country`` and ``number`` keys.
                   A falsy value (``None``, empty dict) short-circuits.
    :param provider: optional data provider name; only forwarded to
                     ``normalize_patent_us``.
    :return: the patched copy, or ``None`` if ``patent`` is falsy.
    """

    if not patent:
        return None

    # Length of the number as delivered, before any rewriting takes place.
    number_length = len(patent['number'])

    patched = copy(patent)
    country = patched['country']

    # strip leading zeros
    # examples: publication: AT401234; application: AT 967/1994 => AT96794
    # (a former variant additionally re-padded slash-less numbers longer
    #  than 6 characters to 6 digits; it is disabled)
    if country == 'AT':
        patched['number'] = trim_leading_zeros(patched['number'])

    # pad to 6 characters with leading zeros
    elif country == 'AR':
        patched['number'] = patched['number'].lstrip('0').rjust(6, '0')

    elif country == 'AU':
        patched = normalize_patent_au(patched)

    elif country == 'BR':
        patched['number'] = patched['number'].lstrip('0')

    # strip leading zeros
    # (padding to 7 digits like EP for kindcode == T1, "Veröffentlichung der
    #  europäischen Patentanmeldung", is disabled)
    elif country == 'DE':
        patched['number'] = trim_leading_zeros(patched['number'])

    # The Eurasian Patent Organization (EAPO)
    # Pad to 6 characters with leading zeros.
    # Numbers of 9 or more characters fall through to the generic rule below.
    elif country == 'EA' and number_length < 9:
        patched['number'] = trim_leading_zeros(patched['number'])
        patched['number'] = pad_left(patched['number'], '0', 6)

    # pad to 7 characters with leading zeros
    elif country == 'EP':
        patched['number'] = trim_leading_zeros(patched['number'])
        patched['number'] = pad_left(patched['number'], '0', 7)

    elif country == 'GE':
        patched['number'] = patched['number'].lstrip('0')

        # e.g.
        # GE00U200501210Y = GEU20051210Y
        # GE00P200503700B = GEP20053700B
        # Drop the zero at index 5 (right after prefix letter + 4 digits).
        # The length guard keeps short numbers from raising an IndexError.
        if len(patched['number']) > 5 and patched['number'][5] == '0':
            patched['number'] = patched['number'][:5] + patched['number'][6:]

    elif country == 'IT':
        patched['number'] = patched['number'].lstrip('0')
        patched = normalize_patent_it(patched)

    # 2017-09-06: KR numbers
    # e.g. KR1020150124192A => KR20150124192A
    elif country == 'KR':
        patched['number'] = trim_leading_zeros(patched['number'])
        if len(patched['number']) > 11 and patched['number'][:2] == '10':
            patched['number'] = patched['number'][2:]

    # 2009-11-09: JP numbers
    elif country == 'JP':
        patched = normalize_patent_jp(patched)

    # 2015-09-01: SE numbers
    elif country == 'SE':
        patched = normalize_patent_se(patched)
        patched['number'] = trim_leading_zeros(patched['number'])

    # 2007-07-26: US applications are 4+7
    elif country == 'US':
        patched = normalize_patent_us(patched, provider=provider)

    # normalize wo numbers to 4+6 format
    elif country == 'WO':
        # WOPCT/US86/01765 or WOEP/2004/008531
        if patched['number'].startswith('PCT'):
            patched = normalize_patent_wo_pct(patched)
        else:
            patched = normalize_patent_wo(patched)

    # strip leading zeros
    else:
        patched['number'] = trim_leading_zeros(patched['number'])

    return patched