def import_ipc_classes(xlsfile, force=False):
    """Import the SIP IPC class concordance from an XLSX workbook.

    Every data row of the active worksheet (the first row is skipped as a
    header) is stored as one ``SipIpcClass`` document. Column 1 supplies the
    ``itid``; columns 2-6 supply section, class, subclass, group and subgroup,
    which are assembled into a class symbol via ``IpcDecoder.formatOPS()``.

    :param xlsfile: Path of the XLSX concordance file.
    :param force: If true, an already populated collection is dropped and
        imported again. Otherwise a non-empty collection makes this function
        return without touching the database.
    :return: ``None``. Failures to read the workbook are logged, not raised.
    """
    log.info('SIP IPC class map: Starting import, this might take some time')

    count = SipIpcClass.objects().count()
    if count > 0:
        if not force:
            log.info(
                'SIP IPC class map: Database contains {0} entries, will not import again'
                .format(count))
            return
        log.info('SIP IPC class map: Dropping collection because of "--force"')
        SipIpcClass.drop_collection()

    # Cell values that mark a classification level as "not set".
    terminator_chars = ('0', '-')
    # Levels below the section, in the column order of the worksheet.
    level_keys = ('class', 'subclass', 'group', 'subgroup')

    log.info('SIP IPC class map: Opening concordance file {}'.format(xlsfile))
    try:
        wb = load_workbook(filename=xlsfile)
    except InvalidFileException as ex:
        # str(ex) instead of ex.message: the latter does not exist on
        # Python 3 exceptions and would raise AttributeError right here.
        log.error('SIP IPC class map: Reading XLSX file {} failed: {}'.format(
            xlsfile, str(ex)))
        return

    log.info('SIP IPC class map: Importing data')
    ws = wb.active

    # ``ws.rows`` is a generator in current openpyxl releases and therefore
    # not sliceable; materialising it works for both the old tuple and the
    # new generator flavour and keeps a known length for the progress bar.
    data_rows = list(ws.rows)[1:]

    for row in tqdm(data_rows):
        itid = row[0].value
        section = str(row[1].value)
        levels = [str(cell.value) for cell in row[2:6]]

        ipc_dict = IpcDecoder.getempty()
        ipc_dict['section'] = section
        for key, level in zip(level_keys, levels):
            # Terminator values leave the key as provided by getempty().
            if level not in terminator_chars:
                ipc_dict[key] = level

        ipc_docdb = IpcDecoder(ipc_dict=ipc_dict).formatOPS()
        SipIpcClass(itid=itid, ipc=ipc_docdb).save()

    count = SipIpcClass.objects().count()
    log.info('SIP IPC class map: Imported {0} entries'.format(count))
def lucene_convert_class(value):
    """Re-encode a class symbol via ``IpcDecoder.formatLucene()``.

    A trailing ``*`` on the input is carried over to the result. The value is
    handed to ``IpcDecoder`` unchanged, i.e. including the asterisk.

    :param value: Class symbol string, optionally ending in ``*``.
    :return: The output of ``formatLucene()``, with ``*`` appended when the
        input ended in ``*``.
    """
    keep_wildcard = value.endswith('*')
    converted = IpcDecoder(value).formatLucene()
    if not keep_wildcard:
        return converted
    return converted + '*'
def triple_callback(token, index, binop, term):
    """Rewrite the term of a classification query triple in place.

    Only triples whose index is ``ic`` or ``cpc`` are touched; for those the
    term is decoded with ``IpcDecoder`` and ``token[2]`` is replaced by the
    result of ``formatOPS()``. This is best-effort: when decoding fails the
    token is left as it was.

    :param token: Mutable triple; slot 2 holds the term and is overwritten.
    :param index: Name of the query index the triple refers to.
    :param binop: Binary operator of the triple (unused here).
    :param term: Term of the triple, e.g. a class like "G01F000184".
    :return: ``None``; the result is written into ``token``.
    """
    if index not in ('ic', 'cpc'):
        return

    try:
        # Decode IPC or CPC class from format "G01F000184" ...
        patent_class = IpcDecoder(term)
        # ... and encode it to format "G01F1/84".
        # token[2] has a reference to "term".
        token[2] = patent_class.formatOPS()
    except Exception:
        # Deliberately ignore undecodable terms and keep the original token.
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt and SystemExit are not swallowed.
        pass
def import_cpc_classes(filename, force=False):
    """Import the SIP CPC class concordance from a CSV or XLSX file.

    Commands noted for producing the CSV input from the MDB source::

        mdb-schema IPC_CPC.mdb
        mdb-export IPC_CPC.mdb cpcterm > IPC_CPC.csv

    Every data row is stored as one ``SipCpcClass`` document. The row's id
    (CSV column ``ID`` / first XLSX column) becomes ``cpcid``; the five level
    values (CSV columns ``C1``..``C5`` / XLSX columns 2-6) are assembled into
    a class symbol via ``IpcDecoder.formatOPS()``.

    :param filename: Path of the concordance file; must end in ``.csv`` or
        ``.xlsx``.
    :param force: If true, an already populated collection is dropped and
        imported again. Otherwise a non-empty collection makes this function
        return without touching the database.
    :return: ``None``. Failures to open or read the file are logged, not
        raised.
    """
    log.info('SIP CPC class map: Starting import, this might take some time')

    count = SipCpcClass.objects().count()
    if count > 0:
        if not force:
            log.info(
                'SIP CPC class map: Database contains {0} entries, will not import again'
                .format(count))
            return
        log.info('SIP CPC class map: Dropping collection because of "--force"')
        SipCpcClass.drop_collection()

    # Cell values that mark a classification level as "not set".
    terminator_chars = ('0', '-')
    # Levels below the section, in column order.
    level_keys = ('class', 'subclass', 'group', 'subgroup')

    log.info('SIP CPC class map: Opening concordance file {}'.format(filename))

    csvfile = None
    try:
        if filename.endswith('.csv'):

            def decode_row(row):
                # Return (id, [c1..c5]) for one DictReader row.
                return row['ID'], [
                    str(row[name]) for name in ('C1', 'C2', 'C3', 'C4', 'C5')]

            try:
                csvfile = open(filename)
            except IOError as ex:
                log.error('SIP CPC class map: Opening file {} failed: {}'.format(
                    filename, str(ex)))
                return

            try:
                stream = DictReader(csvfile)
                # Accessing fieldnames reads the header line, so reading
                # problems surface here rather than in the import loop.
                log.info('SIP CPC class map: CSV columns: {}'.format(
                    stream.fieldnames))
            except Exception as ex:
                # str(ex) instead of ex.message, which Python 3 lacks.
                log.error(
                    'SIP CPC class map: Reading CSV file {} failed: {}'.format(
                        filename, str(ex)))
                return

            # Progress-bar total only; hard-coded value carried over as is,
            # presumably the row count of the reference export -- TODO confirm.
            total = 255628

        elif filename.endswith('.xlsx'):

            def decode_row(row):
                # Return (id, [c1..c5]) for one worksheet row of cells.
                return row[0].value, [str(cell.value) for cell in row[1:6]]

            try:
                wb = load_workbook(filename=filename)
            except InvalidFileException as ex:
                log.error(
                    'SIP CPC class map: Reading XLSX file {} failed: {}'.format(
                        filename, str(ex)))
                return

            # ``ws.rows`` is a generator in current openpyxl releases and
            # cannot be indexed or sliced; materialise it once.
            all_rows = list(wb.active.rows)
            if all_rows:
                log.info('SIP CPC class map: XLSX row 1: {}'.format(
                    [cell.value for cell in all_rows[0]]))

            # Import every data row. A former debugging limit of 19 rows
            # ([1:20]) silently truncated the import.
            stream = all_rows[1:]
            total = len(stream)

        else:
            # Previously this fell through to a NameError on ``stream``.
            log.error(
                'SIP CPC class map: Unsupported file type for {}, expected .csv or .xlsx'
                .format(filename))
            return

        log.info('SIP CPC class map: Importing data')
        for row in tqdm(stream, total=total):
            raw_id, levels = decode_row(row)
            cpcid = int(raw_id)

            ipc_dict = IpcDecoder.getempty()
            ipc_dict['section'] = levels[0]
            for key, level in zip(level_keys, levels[1:]):
                # Terminator values leave the key as provided by getempty().
                if level not in terminator_chars:
                    ipc_dict[key] = level

            ipc_docdb = IpcDecoder(ipc_dict=ipc_dict).formatOPS()
            SipCpcClass(cpcid=cpcid, cpc=ipc_docdb).save()

    finally:
        # Release the CSV handle on every exit path (success, early return
        # or exception); the XLSX branch never opens it.
        if csvfile is not None:
            csvfile.close()

    count = SipCpcClass.objects().count()
    log.info('SIP CPC class map: Imported {0} entries'.format(count))
def expand_class(self, value):
    """Translate a class symbol into a SIP ``<ipc>``/``<cpc>`` XML expression.

    The value is stripped of trailing wildcard characters, ``/``, blanks and
    dots, decoded with ``IpcDecoder`` and looked up in the ``SipIpcClass``
    and ``SipCpcClass`` collections. A symbol without subgroup gets subgroup
    ``00`` and is flagged as right-truncated, which switches the emitted
    ``SmartSelect`` attribute to ``"true"``. The OPS-formatted symbol is also
    registered through ``self.keyword_add()``.

    :param value: Class symbol string as found in the query.
    :return: XML string: a single ``<ipc>`` or ``<cpc>`` element, or both
        wrapped in ``<or>`` when the symbol exists in both collections.
    :raises ClassDecodingError: If the symbol cannot be decoded, or is found
        in neither collection.
    """
    ipc_raw_stripped = value.rstrip(wildcards + '/ .')

    # Check for right-truncated classes.
    right_truncation = False
    try:
        ipc = IpcDecoder(ipc_raw_stripped)
        if ipc.ipc['subgroup'] is None:
            ipc.ipc['subgroup'] = '00'
            right_truncation = True
        ipc_ops = ipc.formatOPS()
        self.keyword_add(ipc_ops)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so KeyboardInterrupt
        # and SystemExit are not turned into a decoding error.
        message = 'SIP query: Class "{0}" could not be decoded.'.format(
            ipc_raw_stripped)
        logger.warning(message)
        raise ClassDecodingError(message)

    modifier = 'SmartSelect="true"' if right_truncation else 'SmartSelect="false"'

    sip_ipc = SipIpcClass.objects(ipc=ipc_ops).first()
    sip_cpc = SipCpcClass.objects(cpc=ipc_ops).first()
    if not sip_ipc and not sip_cpc:
        message = 'SIP query: Class "{0}" could not be resolved.'.format(ipc_ops)
        logger.warning(message)
        raise ClassDecodingError(message)

    expression_entries = []
    if sip_ipc:
        expression_entries.append(
            '<ipc {0}>\n<ipcid>{1}</ipcid>\n</ipc>'.format(modifier, sip_ipc.itid))
    if sip_cpc:
        expression_entries.append(
            '<cpc {0}>\n<cpcid>{1}</cpcid>\n</cpc>'.format(modifier, sip_cpc.cpcid))

    # At least one entry exists here, otherwise we raised above.
    if len(expression_entries) == 1:
        return expression_entries[0]
    return '<or>' + '\n'.join(expression_entries) + '</or>'