def upload(granted_ids, pregranted_ids, config):
    """Load location disambiguation results into ``location_disambiguation_mapping``.

    Reads the tab-separated file at ``config['LOCATION_UPLOAD']['input']``
    (column 0: mention id, column 1: disambiguated location id), splits the
    rows into pregranted and granted pairs by membership in the given id
    collections, and bulk-inserts each group in batches of 100000 rows.

    Args:
        granted_ids: Container of granted mention ids (membership-tested).
        pregranted_ids: Container of pregranted mention ids. Checked first, so
            an id present in both collections is treated as pregranted.
        config: Configuration mapping; must provide
            ``config['LOCATION_UPLOAD']['input']`` and is passed to
            ``pvdb.granted_table`` to open the connections.
    """
    logging.info('granted_ids size %s', len(granted_ids))
    logging.info('pregranted_ids size %s', len(pregranted_ids))

    pairs_pregranted = []
    pairs_granted = []
    with open(config['LOCATION_UPLOAD']['input'], 'r') as fin:
        for line in fin:
            splt = line.strip().split('\t')
            mention_id = splt[0]
            if mention_id in pregranted_ids:
                target = pairs_pregranted
            elif mention_id in granted_ids:
                target = pairs_granted
            else:
                logging.warning('missing id %s', mention_id)
                continue
            if len(splt) < 2:
                # A known id without a location column used to raise
                # IndexError and abort the whole upload; skip the row instead.
                logging.warning('malformed line for id %s', mention_id)
                continue
            target.append((mention_id, splt[1]))
    logging.info('pairs granted size %s', len(pairs_granted))
    logging.info('pairs pregranted size %s', len(pairs_pregranted))

    batch_size = 100000
    # Every value is a bound parameter (including the two flags) so that the
    # driver does the quoting and can still collapse executemany() into a
    # single multi-row INSERT per batch.
    insert_sql = (
        "INSERT INTO location_disambiguation_mapping "
        "(uuid, location_id, in_granted, in_pregrant) VALUES (%s, %s, %s, %s)"
    )

    def insert_pairs(cnx, pairs, in_granted, in_pregrant, desc):
        # Insert `pairs` in batches, commit once, and always release `cnx`.
        try:
            cursor = cnx.cursor()
            for start in tqdm(range(0, len(pairs), batch_size), desc):
                rows = [
                    (uuid, location_id, in_granted, in_pregrant)
                    for uuid, location_id in pairs[start:start + batch_size]
                ]
                cursor.executemany(insert_sql, rows)
            cnx.commit()
        finally:
            cnx.close()

    # NOTE(review): both groups are written through pvdb.granted_table(); the
    # in_granted / in_pregrant flags suggest a single shared table, but confirm
    # that the pregranted rows are not meant to go to pvdb.pregranted_table().
    insert_pairs(pvdb.granted_table(config), pairs_granted, 1, 0,
                 'adding granted')
    insert_pairs(pvdb.granted_table(config), pairs_pregranted, 0, 1,
                 'adding pregranted')
def __init__(self, pregranted_canopies, granted_canopies, config):
    """Store the canopy collections and open the four database connections.

    Args:
        pregranted_canopies: Canopies for pregranted records, kept as given.
        granted_canopies: Canopies for granted records, kept as given.
        config: Configuration passed to each ``pvdb`` connection factory.
    """
    # Canopy collections are stored by reference, not copied.
    self.pregranted_canopies = pregranted_canopies
    self.granted_canopies = granted_canopies

    # Full granted / pregranted databases.
    self.cnx_g = pvdb.granted_table(config)
    self.cnx_pg = pvdb.pregranted_table(config)

    # Incremental granted / pregranted databases.
    self.cnx_g_inc = pvdb.incremental_granted_table(config)
    self.cnx_pg_inc = pvdb.incremental_pregranted_table(config)
def build_granted(fout, config):
    """Write one ``uuid<TAB>patent_id-sequence`` line per ``rawinventor`` row.

    Args:
        fout: Writable text file-like object; one line is written per row.
        config: Configuration passed to ``pvdb.granted_table``.
    """
    cnx = pvdb.granted_table(config)
    try:
        cursor = cnx.cursor()
        cursor.execute("SELECT uuid, patent_id, sequence FROM rawinventor;")
        # `total` is a hard-coded expected row count used only by the
        # progress bar.
        for uuid, patent_id, sequence in tqdm(cursor, 'process',
                                              total=17000000):
            fout.write('%s\t%s-%s\n' % (uuid, patent_id, sequence))
    finally:
        # The connection was previously never released.
        cnx.close()
def collection_location_mentions_granted(config):
    """Group granted ``rawinventor`` location ids by disambiguated entity.

    Args:
        config: Configuration passed to ``load_disambiguation`` and
            ``pvdb.granted_table``.

    Returns:
        A ``defaultdict(list)`` mapping ``uuid2entityid[uuid]`` to the list of
        ``rawlocation_id`` values of the rows carrying that ``uuid``.
    """
    canopy2uuids = collections.defaultdict(list)
    uuid2entityid = load_disambiguation(config)
    cnx = pvdb.granted_table(config)
    try:
        cursor = cnx.cursor()
        cursor.execute("SELECT uuid, rawlocation_id FROM rawinventor;")
        # `total` is a hard-coded expected row count for the progress bar.
        for uuid, rawlocation_id in tqdm(cursor, 'process', total=18000000):
            # NOTE(review): a uuid missing from uuid2entityid raises KeyError
            # unless load_disambiguation returns a defaulting mapping.
            canopy2uuids[uuid2entityid[uuid]].append(rawlocation_id)
    finally:
        # The connection was previously never released.
        cnx.close()
    return canopy2uuids
def drop_tables(config):
    """Empty ``temp_assignee_disambiguation_mapping`` in both databases.

    Despite the name, the table is truncated (``TRUNCATE TABLE``), not
    dropped. Both connections are opened before either statement runs.

    Args:
        config: Configuration passed to the ``pvdb`` connection factories.
    """
    statement = "TRUNCATE TABLE temp_assignee_disambiguation_mapping"
    cnx_g = pvdb.granted_table(config)
    cnx_pg = None
    try:
        cnx_pg = pvdb.pregranted_table(config)
        for cnx in (cnx_g, cnx_pg):
            cursor = cnx.cursor()
            cursor.execute(statement)
            cursor.close()
    finally:
        # Previously only the cursors were closed and the connections leaked.
        for cnx in (cnx_g, cnx_pg):
            if cnx is not None:
                cnx.close()
def create_tables(config):
    """Create ``location_disambiguation_mapping`` in both databases.

    The table has two ``VARCHAR(255)`` columns, ``uuid`` and ``location_id``.
    Both connections are opened before either statement runs.

    Args:
        config: Configuration passed to the ``pvdb`` connection factories.
    """
    # NOTE(review): only (uuid, location_id) are created here; any loader that
    # also writes in_granted / in_pregrant columns needs a wider schema.
    statement = ("CREATE TABLE location_disambiguation_mapping "
                 "(uuid VARCHAR(255), location_id VARCHAR(255))")
    cnx_g = pvdb.granted_table(config)
    cnx_pg = None
    try:
        cnx_pg = pvdb.pregranted_table(config)
        for cnx in (cnx_g, cnx_pg):
            cursor = cnx.cursor()
            cursor.execute(statement)
            cursor.close()
    finally:
        # Previously only the cursors were closed and the connections leaked.
        for cnx in (cnx_g, cnx_pg):
            if cnx is not None:
                cnx.close()
def build_granted(canopy2mentions, granted_uuid2canopy, config):
    """Append granted ``rawlocation`` mentions to their canopies.

    Every row of ``rawlocation`` is turned into a ``LocationMention``; rows
    whose mention ``uuid`` is a key of ``granted_uuid2canopy`` are appended to
    ``canopy2mentions[canopy]``. Other rows are ignored.

    Args:
        canopy2mentions: Mapping from canopy to list of mentions; mutated in
            place. Indexing a new canopy must yield a list (e.g. a
            ``defaultdict(list)``).
        granted_uuid2canopy: Mapping from mention uuid to canopy.
        config: Configuration passed to ``pvdb.granted_table``.

    Returns:
        The same ``canopy2mentions`` object that was passed in.
    """
    cnx = pvdb.granted_table(config)
    try:
        cursor = cnx.cursor()
        query = "SELECT id , location_id , city , state, country, country_transformed, location_id_transformed FROM rawlocation;"
        cursor.execute(query)
        # `total` is a hard-coded expected row count for the progress bar.
        for rec in tqdm(cursor, 'working on granted patents', total=29032921):
            lm = LocationMention.from_granted_sql_record(rec)
            mention_id = lm.uuid
            if mention_id in granted_uuid2canopy:
                canopy2mentions[granted_uuid2canopy[mention_id]].append(lm)
    finally:
        # The connection was previously never released.
        cnx.close()
    return canopy2mentions
def drop_tables(config):
    """Drop ``location_disambiguation_mapping`` from both databases.

    Both connections are opened before either statement runs.

    Args:
        config: Configuration passed to the ``pvdb`` connection factories.
    """
    statement = "DROP TABLE location_disambiguation_mapping"
    cnx_g = pvdb.granted_table(config)
    cnx_pg = None
    try:
        cnx_pg = pvdb.pregranted_table(config)
        for cnx in (cnx_g, cnx_pg):
            cursor = cnx.cursor()
            cursor.execute(statement)
            cursor.close()
    finally:
        # Previously only the cursors were closed and the connections leaked.
        for cnx in (cnx_g, cnx_pg):
            if cnx is not None:
                cnx.close()
def build_granted(granted_uuid2canopy, config):
    """Collect granted ``rawlocation`` mentions grouped by canopy.

    Every row of ``rawlocation`` is turned into a ``LocationMention``; rows
    whose mention ``uuid`` is a key of ``granted_uuid2canopy`` are grouped
    under the corresponding canopy. Other rows are ignored.

    Args:
        granted_uuid2canopy: Mapping from mention uuid to canopy.
        config: Configuration passed to ``pvdb.granted_table``.

    Returns:
        A ``defaultdict(list)`` mapping canopy to its list of mentions.
    """
    canopy2mentions = collections.defaultdict(list)
    cnx = pvdb.granted_table(config)
    try:
        cursor = cnx.cursor()
        query = "SELECT id , location_id , city , state , country , country_transformed , location_id_transformed FROM rawlocation;"
        cursor.execute(query)
        # `total` is a hard-coded expected row count for the progress bar.
        for rec in tqdm(cursor, 'process', total=18000000):
            lm = LocationMention.from_granted_sql_record(rec)
            mention_id = lm.uuid
            if mention_id in granted_uuid2canopy:
                canopy2mentions[granted_uuid2canopy[mention_id]].append(lm)
    finally:
        # The connection was previously never released.
        cnx.close()
    return canopy2mentions
def build_granted(granted_uuids, pgranted_uuids, config):
    """Group granted ``rawassignee`` location ids by disambiguated entity.

    Args:
        granted_uuids: Passed through to ``load_disambiguation``.
        pgranted_uuids: Passed through to ``load_disambiguation``.
        config: Configuration passed to ``load_disambiguation`` and
            ``pvdb.granted_table``.

    Returns:
        A pair ``(canopy2uuids, uuid2canopy)``: a ``defaultdict(list)`` from
        entity id to the ``rawlocation_id`` values assigned to it, and a dict
        from ``rawlocation_id`` back to its entity id. If a
        ``rawlocation_id`` occurs on several rows, the last row wins in
        ``uuid2canopy``.
    """
    canopy2uuids = collections.defaultdict(list)
    uuid2canopy = dict()
    uuid2entityid = load_disambiguation(granted_uuids, pgranted_uuids, config)
    cnx = pvdb.granted_table(config)
    try:
        cursor = cnx.cursor()
        cursor.execute("SELECT uuid, rawlocation_id FROM rawassignee;")
        # `total` is a hard-coded expected row count for the progress bar.
        for uuid, rawlocation_id in tqdm(cursor, 'process', total=6789244):
            # Look the entity up once instead of twice per row.
            entity_id = uuid2entityid[uuid]
            canopy2uuids[entity_id].append(rawlocation_id)
            uuid2canopy[rawlocation_id] = entity_id
    finally:
        # The connection was previously never released.
        cnx.close()
    return canopy2uuids, uuid2canopy
def create_tables(config):
    """Create the temporary inventor disambiguation tables.

    Creates ``tmp_inventor_disambiguation_granted2`` in the granted database
    and ``tmp_inventor_disambiguation_pregranted2`` in the pregranted
    database, each with ``uuid`` and ``disambiguated_id`` ``VARCHAR(255)``
    columns. Both connections are opened before either statement runs.

    Args:
        config: Configuration passed to the ``pvdb`` connection factories.
    """
    columns = " (uuid VARCHAR(255), disambiguated_id VARCHAR(255))"
    cnx_g = pvdb.granted_table(config)
    cnx_pg = None
    try:
        cnx_pg = pvdb.pregranted_table(config)
        targets = (
            (cnx_g, "tmp_inventor_disambiguation_granted2"),
            (cnx_pg, "tmp_inventor_disambiguation_pregranted2"),
        )
        for cnx, table in targets:
            cursor = cnx.cursor()
            cursor.execute("CREATE TABLE " + table + columns)
            cursor.close()
    finally:
        # Previously only the cursors were closed and the connections leaked.
        for cnx in (cnx_g, cnx_pg):
            if cnx is not None:
                cnx.close()
def create_uuid_map(config):
    """Build lookup tables from assignee mention keys to row identifiers.

    Args:
        config: Configuration passed to the ``pvdb`` connection factories.

    Returns:
        A pair ``(granted_uuids, pgranted_uuids)`` of dicts:
        ``granted_uuids`` maps ``'<patent_id>-<sequence>'`` to the row's
        ``uuid`` in the granted ``rawassignee`` table; ``pgranted_uuids`` maps
        ``'pg-<document_number>-<sequence - 1>'`` to the row's ``id`` in the
        pregranted ``rawassignee`` table.
    """
    granted_uuids = dict()
    pgranted_uuids = dict()
    cnx_g = pvdb.granted_table(config)
    cnx_pg = None
    try:
        cnx_pg = pvdb.pregranted_table(config)

        g_cursor = cnx_g.cursor()
        g_cursor.execute("SELECT uuid, patent_id, sequence FROM rawassignee;")
        for uuid, patent_id, seq in tqdm(g_cursor, 'granted uuids'):
            granted_uuids['%s-%s' % (patent_id, seq)] = uuid

        pg_cursor = cnx_pg.cursor()
        # The query itself subtracts 1 from the pregranted sequence.
        pg_cursor.execute(
            "SELECT id, document_number, sequence-1 as sequence FROM rawassignee;")
        for uuid, doc_id, seq in tqdm(pg_cursor, 'pregranted uuids'):
            pgranted_uuids['pg-%s-%s' % (doc_id, seq)] = uuid
    finally:
        # The connections were previously never released.
        for cnx in (cnx_g, cnx_pg):
            if cnx is not None:
                cnx.close()
    return granted_uuids, pgranted_uuids
def build_granted(config):
    """Group granted assignee mentions by their first name feature.

    Args:
        config: Configuration passed to ``pvdb.granted_table``.

    Returns:
        A ``defaultdict(list)`` mapping ``am.name_features()[0]`` to the list
        of ``AssigneeMention`` objects sharing that feature.
    """
    # Column layout of each selected row:
    # | uuid | patent_id | assignee_id | rawlocation_id | type | name_first | name_last | organization | sequence |
    feature_map = collections.defaultdict(list)
    cnx = pvdb.granted_table(config)
    try:
        cursor = cnx.cursor()
        query = "SELECT uuid , patent_id , assignee_id , rawlocation_id , type , name_first , name_last , organization , sequence FROM rawassignee;"
        cursor.execute(query)
        # Count from 1 so the logged value is the number of rows processed.
        for idx, rec in enumerate(cursor, start=1):
            am = AssigneeMention.from_granted_sql_record(rec)
            feature_map[am.name_features()[0]].append(am)
            logging.log_every_n(logging.INFO,
                                'Processed %s granted records - %s features',
                                10000, idx, len(feature_map))
    finally:
        # The connection was previously never released.
        cnx.close()
    return feature_map
def upload(granted_ids, pregranted_ids, config, version_indicator="20201229"):
    """Load assignee disambiguation results into the temporary mapping table.

    Reads the tab-separated file at ``config['ASSIGNEE_UPLOAD']['input']``
    (column 0: mention key, column 1: disambiguated assignee id), translates
    each mention key to its database uuid, and bulk-inserts the rows into
    ``temp_assignee_disambiguation_mapping`` in batches of 100000. Granted
    rows go to the granted database, pregranted rows to the pregranted one.
    Keys found in neither mapping are ignored.

    Args:
        granted_ids: Mapping from mention key to uuid for granted records.
        pregranted_ids: Mapping from mention key to uuid for pregranted
            records. Checked first, so a key present in both is pregranted.
        config: Configuration mapping; must provide
            ``config['ASSIGNEE_UPLOAD']['input']`` and is passed to the
            ``pvdb`` connection factories.
        version_indicator: Value written to the ``version_indicator`` column
            of every row. Defaults to the previously hard-coded value.
    """
    pairs_pregranted = []
    pairs_granted = []
    with open(config['ASSIGNEE_UPLOAD']['input'], 'r') as fin:
        for line in fin:
            splt = line.strip().split('\t')
            mention_key = splt[0]
            if mention_key in pregranted_ids:
                target, uuid = pairs_pregranted, pregranted_ids[mention_key]
            elif mention_key in granted_ids:
                target, uuid = pairs_granted, granted_ids[mention_key]
            else:
                continue
            if len(splt) < 2:
                # A known key without an assignee column used to raise
                # IndexError and abort the whole upload; skip the row instead.
                continue
            target.append((uuid, splt[1]))

    batch_size = 100000
    # All values are bound parameters so that the driver does the quoting and
    # can still collapse executemany() into one multi-row INSERT per batch.
    insert_sql = (
        "INSERT INTO temp_assignee_disambiguation_mapping "
        "(uuid, assignee_id, version_indicator) VALUES (%s, %s, %s)"
    )

    def insert_pairs(cnx, pairs, desc):
        # Insert `pairs` in batches, commit once, and always release `cnx`.
        try:
            cursor = cnx.cursor()
            for start in tqdm(range(0, len(pairs), batch_size), desc):
                rows = [
                    (uuid, assignee_id, version_indicator)
                    for uuid, assignee_id in pairs[start:start + batch_size]
                ]
                cursor.executemany(insert_sql, rows)
            cnx.commit()
        finally:
            cnx.close()

    insert_pairs(pvdb.granted_table(config), pairs_granted, 'adding granted')
    insert_pairs(pvdb.pregranted_table(config), pairs_pregranted,
                 'adding pregranted')
def create_uuid_map(config):
    """Collect the ``rawlocation`` ids of the granted and pregranted databases.

    Args:
        config: Configuration passed to the ``pvdb`` connection factories.

    Returns:
        A pair ``(granted_uuids, pgranted_uuids)`` of sets holding the ``id``
        column of ``rawlocation`` in the granted and pregranted database
        respectively.
    """
    query = "SELECT id FROM rawlocation;"
    cnx_g = pvdb.granted_table(config)
    cnx_pg = None
    try:
        cnx_pg = pvdb.pregranted_table(config)

        g_cursor = cnx_g.cursor()
        g_cursor.execute(query)
        # Each row is a 1-tuple; keep only the id itself.
        granted_uuids = {row[0] for row in tqdm(g_cursor, 'granted uuids')}

        pg_cursor = cnx_pg.cursor()
        pg_cursor.execute(query)
        pgranted_uuids = {row[0] for row in tqdm(pg_cursor, 'pregranted uuids')}
    finally:
        # The connections were previously never released.
        for cnx in (cnx_g, cnx_pg):
            if cnx is not None:
                cnx.close()
    return granted_uuids, pgranted_uuids
def build_granted(config):
    """Map every granted patent id to its title.

    Args:
        config: Configuration passed to ``pvdb.granted_table``.

    Returns:
        A dict from the string form of ``patent.id`` to ``patent.title``.
        Empty when ``pvdb.granted_table`` returns ``None``.
    """
    feature_map = dict()
    cnx = pvdb.granted_table(config)
    if cnx is None:
        return feature_map
    idx = 0
    try:
        cursor = cnx.cursor()
        cursor.execute("SELECT id,title FROM patent;")
        for idx, rec in enumerate(cursor, start=1):
            feature_map['%s' % rec[0]] = rec[1]
            logging.log_every_n(logging.INFO,
                                'Processed %s grant records - %s features',
                                10000, idx, len(feature_map))
    finally:
        # The connection was previously never released.
        cnx.close()
    # Final summary, emitted regardless of the every-n throttling above.
    logging.log(logging.INFO, 'Processed %s grant records - %s features', idx,
                len(feature_map))
    return feature_map
def build_granted(config):
    """Group granted inventor uuids into canopies.

    The canopy key of each ``rawinventor`` row is
    ``first_letter_last_name(...)`` applied to an ``InventorMention`` built
    from the row's uuid and names; missing names are replaced by ``''``.

    Args:
        config: Configuration passed to ``pvdb.granted_table``.

    Returns:
        A ``defaultdict(list)`` mapping canopy key to the list of uuids in
        that canopy. Empty when ``pvdb.granted_table`` returns ``None``.
    """
    canopy2uuids = collections.defaultdict(list)
    cnx = pvdb.granted_table(config)
    # cnx is None if we haven't specified a granted table
    if cnx is None:
        return canopy2uuids
    idx = 0
    try:
        cursor = cnx.cursor()
        cursor.execute("SELECT uuid, name_first, name_last FROM rawinventor;")
        for idx, (uuid, name_first, name_last) in enumerate(cursor, start=1):
            # Only the uuid and names are filled in; every other
            # InventorMention field gets a placeholder.
            im = InventorMention(uuid, '0', '', name_first or '',
                                 name_last or '', '', '', '')
            canopy2uuids[first_letter_last_name(im)].append(uuid)
            logging.log_every_n(logging.INFO,
                                'Processed %s granted records - %s canopies',
                                10000, idx, len(canopy2uuids))
    finally:
        # The connection was previously never released.
        cnx.close()
    # Final summary, emitted regardless of the every-n throttling above.
    logging.log(logging.INFO, 'Processed %s granted records - %s canopies',
                idx, len(canopy2uuids))
    return canopy2uuids
def upload(config):
    """Load inventor disambiguation results into the temporary tables.

    Reads the tab-separated file at ``config['INVENTOR_UPLOAD']['input']``
    (column 0: mention uuid, column 1: disambiguated id). Rows whose uuid
    belongs to a pregranted canopy go to
    ``tmp_inventor_disambiguation_pregranted2`` in the pregranted database;
    rows whose uuid belongs to a granted canopy go to
    ``tmp_inventor_disambiguation_granted2`` in the granted database. Other
    rows are ignored. After each table is loaded, ``uuid`` is made its
    primary key.

    Args:
        config: Configuration mapping; must provide
            ``config['INVENTOR_UPLOAD']['input']`` and is passed to
            ``load_mysql.Loader.from_config`` and the ``pvdb`` factories.
    """
    loader = load_mysql.Loader.from_config(config)
    pregranted_ids = {
        uuid for canopy in loader.pregranted_canopies.values()
        for uuid in canopy
    }
    granted_ids = {
        uuid for canopy in loader.granted_canopies.values() for uuid in canopy
    }

    pairs_pregranted = []
    pairs_granted = []
    # Single pass over the input; the former first pass only filled a dict
    # that was never read.
    with open(config['INVENTOR_UPLOAD']['input'], 'r') as fin:
        for line in fin:
            splt = line.strip().split('\t')
            if len(splt) != 2:
                print('error %s' % str(splt))
                if len(splt) < 2:
                    # No disambiguated id available; indexing splt[1] used to
                    # raise IndexError for known uuids.
                    continue
            if splt[0] in pregranted_ids:
                pairs_pregranted.append((splt[0], splt[1]))
            elif splt[0] in granted_ids:
                pairs_granted.append((splt[0], splt[1]))

    batch_size = 100000

    def insert_pairs(cnx, table, pairs, desc):
        # Insert `pairs` in batches, commit, add the primary key, and always
        # release `cnx`. Values are bound parameters so the driver quotes them.
        insert_sql = ("INSERT INTO " + table +
                      " (uuid, disambiguated_id) VALUES (%s, %s)")
        try:
            cursor = cnx.cursor()
            for start in tqdm(range(0, len(pairs), batch_size), desc):
                cursor.executemany(insert_sql, pairs[start:start + batch_size])
            cnx.commit()
            cursor.execute('alter table ' + table + ' add primary key (uuid)')
        finally:
            cnx.close()

    insert_pairs(pvdb.granted_table(config),
                 'tmp_inventor_disambiguation_granted2', pairs_granted,
                 'adding granted')
    insert_pairs(pvdb.pregranted_table(config),
                 'tmp_inventor_disambiguation_pregranted2', pairs_pregranted,
                 'adding pregranted')