예제 #1
0
def import_ipc_classes(xlsfile, force=False):
    """Import the SIP IPC class map from an XLSX concordance file.

    The first row of the active worksheet is skipped; every following row
    is read as ``(itid, c1, c2, c3, c4, c5)``. The five class components
    are fed into an ``IpcDecoder`` dictionary (components equal to ``'0'``
    or ``'-'`` are left unset), formatted with ``formatOPS()`` and stored
    as one ``SipIpcClass`` document per row.

    :param xlsfile: Path of the XLSX workbook handed to ``load_workbook``.
    :param force: When true, an already populated collection is dropped
        and re-imported. Otherwise a non-empty collection makes the
        function return without importing anything.
    """

    log.info('SIP IPC class map: Starting import, this might take some time')

    count = SipIpcClass.objects().count()
    if count > 0:
        if force:
            log.info(
                'SIP IPC class map: Dropping collection because of "--force"')
            SipIpcClass.drop_collection()
        else:
            log.info(
                'SIP IPC class map: Database contains {0} entries, will not import again'
                .format(count))
            return

    # Component values which mark "no value at this level".
    terminator_chars = ['0', '-']

    log.info('SIP IPC class map: Opening concordance file {}'.format(xlsfile))
    try:
        wb = load_workbook(filename=xlsfile)
    except InvalidFileException as ex:
        # ``ex.message`` does not exist on Python 3 exceptions; ``str(ex)``
        # works everywhere.
        log.error('SIP IPC class map: Reading XLSX file {} failed: {}'.format(
            xlsfile, str(ex)))
        return

    log.info('SIP IPC class map: Importing data')
    ws = wb.active

    # ``ws.rows`` may be a generator (newer openpyxl) which cannot be
    # sliced directly. Materialize it first, then skip the header row.
    data_rows = list(ws.rows)[1:]

    # Keys filled from columns 2..5, in column order.
    optional_keys = ('class', 'subclass', 'group', 'subgroup')

    for row in tqdm(data_rows):
        itid = row[0].value
        components = [str(row[column].value) for column in range(1, 6)]

        ipc_dict = IpcDecoder.getempty()
        ipc_dict['section'] = components[0]
        for key, component in zip(optional_keys, components[1:]):
            if component not in terminator_chars:
                ipc_dict[key] = component

        ipc = IpcDecoder(ipc_dict=ipc_dict)
        ipc_docdb = ipc.formatOPS()

        ipc_class = SipIpcClass(itid=itid, ipc=ipc_docdb)
        ipc_class.save()

    count = SipIpcClass.objects().count()
    log.info('SIP IPC class map: Imported {0} entries'.format(count))
예제 #2
0
def lucene_convert_class(value):
    """Return the Lucene representation of a classification symbol.

    The input is passed to ``IpcDecoder`` and rendered with
    ``formatLucene()``. If the input ended with ``*``, a ``*`` is appended
    to the rendered value as well.

    :param value: Classification symbol, optionally ending with ``*``.
    :return: The formatted symbol, with ``*`` appended when the input
        carried one.
    """
    had_trailing_star = value.endswith('*')

    formatted = IpcDecoder(value).formatLucene()

    if not had_trailing_star:
        return formatted
    return formatted + '*'
예제 #3
0
    def triple_callback(token, index, binop, term):
        """Rewrite the term of an ``ic``/``cpc`` triple into OPS class format.

        For any other ``index`` the token is left untouched. The rewrite is
        best-effort: if the term cannot be decoded or formatted, the token
        keeps its original term.

        :param token: Mutable triple; ``token[2]`` holds the term and is
            replaced in place on success.
        :param index: Field name of the triple; only ``'ic'`` and ``'cpc'``
            trigger the conversion.
        :param binop: Operator of the triple (not used here).
        :param term: Class symbol to decode.
        """

        if index in ['ic', 'cpc']:
            try:
                # Decode IPC or CPC class from format "G01F000184"
                patent_class = IpcDecoder(term)

                # Encode IPC or CPC class to format "G01F1/84"
                # token[2] has a reference to "term"
                token[2] = patent_class.formatOPS()

            except Exception:
                # Deliberately ignore undecodable terms, but do not swallow
                # KeyboardInterrupt/SystemExit like a bare ``except:`` would.
                pass
예제 #4
0
def import_cpc_classes(filename, force=False):
    """Import the SIP CPC class map from a CSV or XLSX concordance file.

    The CSV variant can be produced like this::

        mdb-schema IPC_CPC.mdb
        mdb-export IPC_CPC.mdb cpcterm > IPC_CPC.csv

    CSV rows are read by column name (``ID``, ``C1`` .. ``C5``); XLSX rows
    are read by position (columns 0 .. 5) from the active worksheet, with
    the first row skipped. The five class components are fed into an
    ``IpcDecoder`` dictionary (components equal to ``'0'`` or ``'-'`` are
    left unset), formatted with ``formatOPS()`` and stored as one
    ``SipCpcClass`` document per row.

    :param filename: Path of the input file; must end with ``.csv`` or
        ``.xlsx``. Any other extension is logged as an error and nothing
        is imported.
    :param force: When true, an already populated collection is dropped
        and re-imported. Otherwise a non-empty collection makes the
        function return without importing anything.
    """

    log.info('SIP CPC class map: Starting import, this might take some time')

    count = SipCpcClass.objects().count()
    if count > 0:
        if force:
            log.info(
                'SIP CPC class map: Dropping collection because of "--force"')
            SipCpcClass.drop_collection()
        else:
            log.info(
                'SIP CPC class map: Database contains {0} entries, will not import again'
                .format(count))
            return

    # Component values which mark "no value at this level".
    terminator_chars = ['0', '-']

    log.info('SIP CPC class map: Opening concordance file {}'.format(filename))

    # Only set for CSV input; closed once the import loop is done.
    csvfile = None

    # Progress bar total. The CSV reader is a stream of unknown length, so
    # the hard-coded row count from the original code is kept as an
    # estimate. For XLSX the real number of rows is used instead.
    total = 255628

    if filename.endswith('.csv'):

        def decode_row(row):
            return {
                'cpcid': row['ID'],
                'c1': str(row['C1']),
                'c2': str(row['C2']),
                'c3': str(row['C3']),
                'c4': str(row['C4']),
                'c5': str(row['C5']),
            }

        try:
            csvfile = open(filename)
        except IOError as ex:
            log.error('SIP CPC class map: Opening file {} failed: {}'.format(
                filename, str(ex)))
            return

        try:
            stream = DictReader(csvfile)
            print(stream.fieldnames)
        except Exception as ex:
            csvfile.close()
            # ``ex.message`` does not exist on Python 3 exceptions.
            log.error(
                'SIP CPC class map: Reading CSV file {} failed: {}'.format(
                    filename, str(ex)))
            return

    elif filename.endswith('.xlsx'):

        def decode_row(row):
            return {
                'cpcid': row[0].value,
                'c1': str(row[1].value),
                'c2': str(row[2].value),
                'c3': str(row[3].value),
                'c4': str(row[4].value),
                'c5': str(row[5].value),
            }

        try:
            wb = load_workbook(filename=filename)
        except InvalidFileException as ex:
            log.error(
                'SIP CPC class map: Reading XLSX file {} failed: {}'.format(
                    filename, str(ex)))
            return

        ws = wb.active

        # ``ws.rows`` may be a generator (newer openpyxl) which cannot be
        # indexed or sliced directly, so materialize it first.
        rows = list(ws.rows)
        print('XLSX row 1:', [cell.value for cell in rows[0]] if rows else [])

        # Skip the header row and import all remaining rows. The previous
        # ``[1:20]`` slice was a debugging leftover that limited the
        # import to 19 rows.
        stream = rows[1:]
        total = len(stream)

    else:
        log.error(
            'SIP CPC class map: Unsupported file type {}, expected .csv or .xlsx'
            .format(filename))
        return

    # Keys filled from components c2..c5, in that order.
    optional_keys = (
        ('class', 'c2'),
        ('subclass', 'c3'),
        ('group', 'c4'),
        ('subgroup', 'c5'),
    )

    log.info('SIP CPC class map: Importing data')
    try:
        for row in tqdm(stream, total=total):
            item = decode_row(row)
            cpcid = int(item['cpcid'])

            ipc_dict = IpcDecoder.getempty()
            ipc_dict['section'] = item['c1']
            for key, column in optional_keys:
                if item[column] not in terminator_chars:
                    ipc_dict[key] = item[column]

            ipc = IpcDecoder(ipc_dict=ipc_dict)
            ipc_docdb = ipc.formatOPS()

            ipc_class = SipCpcClass(cpcid=cpcid, cpc=ipc_docdb)
            ipc_class.save()
    finally:
        # Release the CSV file handle even if a row fails to import.
        if csvfile is not None:
            csvfile.close()

    count = SipCpcClass.objects().count()
    log.info('SIP CPC class map: Imported {0} entries'.format(count))
예제 #5
0
    def expand_class(self, value):
        """Expand a classification symbol into a SIP XML class expression.

        Trailing characters from ``wildcards`` as well as ``/``, space and
        ``.`` are stripped from ``value`` before it is decoded with
        ``IpcDecoder``. If the decoded class has no subgroup, the subgroup
        is set to ``'00'`` and the expression is emitted with
        ``SmartSelect="true"``; otherwise with ``SmartSelect="false"``.
        The OPS-formatted symbol is registered through
        ``self.keyword_add()`` and looked up in both ``SipIpcClass`` and
        ``SipCpcClass``.

        :param value: Raw class symbol from the query.
        :return: XML string containing an ``<ipc>`` element, a ``<cpc>``
            element, or both wrapped in ``<or>`` when the symbol is found
            in both collections.
        :raises ClassDecodingError: If the symbol cannot be decoded, or if
            it is found in neither collection.
        """

        ipc_raw_stripped = value.rstrip(wildcards + '/ .')

        # check for right truncated ipc classes
        right_truncation = False

        try:
            ipc = IpcDecoder(ipc_raw_stripped)
            if ipc.ipc['subgroup'] is None:
                ipc.ipc['subgroup'] = '00'
                right_truncation = True
            ipc_ops = ipc.formatOPS()
            self.keyword_add(ipc_ops)

        except Exception:
            # ``except Exception`` instead of a bare ``except:`` so that
            # KeyboardInterrupt/SystemExit are not turned into a
            # ClassDecodingError.
            message = 'SIP query: Class "{0}" could not be decoded.'.format(
                ipc_raw_stripped)
            logger.warning(message)
            raise ClassDecodingError(message)

        if right_truncation:
            modifier = 'SmartSelect="true"'
        else:
            modifier = 'SmartSelect="false"'

        sip_ipc = SipIpcClass.objects(ipc=ipc_ops).first()
        sip_cpc = SipCpcClass.objects(cpc=ipc_ops).first()
        if not sip_ipc and not sip_cpc:
            message = 'SIP query: Class "{0}" could not be resolved.'.format(
                ipc_ops)
            logger.warning(message)
            raise ClassDecodingError(message)

        expression_entries = []

        if sip_ipc:
            expression_entries.append(
                '<ipc {0}>\n<ipcid>{1}</ipcid>\n</ipc>'.format(
                    modifier, sip_ipc.itid))

        if sip_cpc:
            expression_entries.append(
                '<cpc {0}>\n<cpcid>{1}</cpcid>\n</cpc>'.format(
                    modifier, sip_cpc.cpcid))

        # At least one entry exists here, since the "could not be resolved"
        # check above raises when both lookups come back empty.
        if len(expression_entries) == 1:
            return expression_entries[0]

        # regular implementation
        return '<or>' + '\n'.join(expression_entries) + '</or>'