Пример #1
0
 def __init__(self, pregranted_canopies, granted_canopies, config):
     """Keep the canopy collections and the four ``pvdb`` table handles.

     Args:
         pregranted_canopies: stored unchanged as ``self.pregranted_canopies``.
         granted_canopies: stored unchanged as ``self.granted_canopies``.
         config: handed unchanged to each ``pvdb`` factory below.
     """
     self.pregranted_canopies, self.granted_canopies = (
         pregranted_canopies,
         granted_canopies,
     )

     # Full tables first (granted, then pregranted) ...
     self.cnx_g = pvdb.granted_table(config)
     self.cnx_pg = pvdb.pregranted_table(config)

     # ... followed by their incremental counterparts, in the same order.
     self.cnx_g_inc = pvdb.incremental_granted_table(config)
     self.cnx_pg_inc = pvdb.incremental_pregranted_table(config)
def drop_tables(config):
    """Empty ``temp_assignee_disambiguation_mapping`` in both databases.

    Despite the function's name, the statement issued is ``TRUNCATE TABLE``:
    the table itself is kept and only its rows are removed.

    Both connections are obtained before any statement runs, so a failure to
    connect aborts before either table is touched. Cursors and connections
    are closed even when a statement raises.

    Args:
        config: configuration object passed unchanged to
            ``pvdb.granted_table`` and ``pvdb.pregranted_table``.
    """
    statement = "TRUNCATE TABLE temp_assignee_disambiguation_mapping"
    connections = []
    try:
        connections.append(pvdb.granted_table(config))
        connections.append(pvdb.pregranted_table(config))
        for cnx in connections:
            cursor = cnx.cursor()
            try:
                cursor.execute(statement)
            finally:
                cursor.close()
    finally:
        # NOTE(review): assumes every pvdb factory call hands back a
        # connection owned by the caller (not a shared one) - confirm.
        for cnx in connections:
            if cnx is not None:
                cnx.close()
def drop_tables(config):
    """Drop ``location_disambiguation_mapping`` from both databases.

    Both connections are obtained before any statement runs, so a failure to
    connect aborts before either table is dropped. Cursors and connections
    are closed even when a statement raises (for example because the table
    does not exist).

    Args:
        config: configuration object passed unchanged to
            ``pvdb.granted_table`` and ``pvdb.pregranted_table``.
    """
    statement = "DROP TABLE location_disambiguation_mapping"
    connections = []
    try:
        connections.append(pvdb.granted_table(config))
        connections.append(pvdb.pregranted_table(config))
        for cnx in connections:
            cursor = cnx.cursor()
            try:
                cursor.execute(statement)
            finally:
                cursor.close()
    finally:
        # NOTE(review): assumes every pvdb factory call hands back a
        # connection owned by the caller (not a shared one) - confirm.
        for cnx in connections:
            if cnx is not None:
                cnx.close()
def create_tables(config):
    """Create ``location_disambiguation_mapping`` in both databases.

    The table has two ``VARCHAR(255)`` columns, ``uuid`` and ``location_id``.
    Both connections are obtained before any statement runs; cursors and
    connections are closed even when a statement raises (for example because
    the table already exists).

    Args:
        config: configuration object passed unchanged to
            ``pvdb.granted_table`` and ``pvdb.pregranted_table``.
    """
    statement = (
        "CREATE TABLE location_disambiguation_mapping (uuid VARCHAR(255), location_id VARCHAR(255))"
    )
    connections = []
    try:
        connections.append(pvdb.granted_table(config))
        connections.append(pvdb.pregranted_table(config))
        for cnx in connections:
            cursor = cnx.cursor()
            try:
                cursor.execute(statement)
            finally:
                cursor.close()
    finally:
        # NOTE(review): assumes every pvdb factory call hands back a
        # connection owned by the caller (not a shared one) - confirm.
        for cnx in connections:
            if cnx is not None:
                cnx.close()
Пример #5
0
def build_pregrants(canopy2mentions, pregranted_uuid2canopy, config):
    """Append pregranted location mentions to the canopies they belong to.

    Every row of ``rawlocation`` in the pregranted database is converted with
    ``LocationMention.from_application_sql_record``. A mention whose ``uuid``
    is a key of ``pregranted_uuid2canopy`` is appended to
    ``canopy2mentions[<that canopy>]``; all other mentions are ignored.

    Args:
        canopy2mentions: container indexed by canopy whose values support
            ``append``; it is modified in place.
        pregranted_uuid2canopy: lookup from mention uuid to canopy.
        config: configuration object passed to ``pvdb.pregranted_table``.

    Returns:
        The same ``canopy2mentions`` object that was passed in.
    """
    select_locations = "SELECT id, city, state, country, latitude, longitude, filename, created_date, updated_date FROM rawlocation;"

    # Keep the connection bound to a local for as long as the cursor is read.
    cnx = pvdb.pregranted_table(config)
    cursor = cnx.cursor()
    cursor.execute(select_locations)

    # The progress-bar total is a fixed row-count estimate, not a live count.
    for row in tqdm(cursor, 'working on pregrants', total=10866744):
        mention = LocationMention.from_application_sql_record(row)
        if mention.uuid not in pregranted_uuid2canopy:
            continue
        canopy2mentions[pregranted_uuid2canopy[mention.uuid]].append(mention)

    return canopy2mentions
def build_pregrants(config):
    """Group pregranted raw-location ids by the entity of their inventor row.

    Reads ``(id, rawlocation_id)`` from ``rawinventor`` in the pregranted
    database and resolves each ``id`` through the mapping returned by
    ``load_disambiguation(config)``. A ``KeyError`` propagates if an ``id``
    is missing from that mapping.

    Args:
        config: configuration object passed to ``load_disambiguation`` and
            ``pvdb.pregranted_table``.

    Returns:
        A pair ``(canopy2uuids, uuid2canopy)``: a ``defaultdict(list)`` from
        entity id to the ``rawlocation_id`` values seen for it, and a dict
        from ``rawlocation_id`` to entity id (the last row wins on repeats).
    """
    canopy2uuids = collections.defaultdict(list)
    uuid2canopy = {}
    uuid2entityid = load_disambiguation(config)

    cnx = pvdb.pregranted_table(config)
    cursor = cnx.cursor()
    cursor.execute("SELECT id, rawlocation_id FROM rawinventor;")

    # The progress-bar total is a fixed row-count estimate, not a live count.
    for inventor_uuid, location_id in tqdm(cursor, 'process', total=8100000):
        entity_id = uuid2entityid[inventor_uuid]
        canopy2uuids[entity_id].append(location_id)
        uuid2canopy[location_id] = entity_id

    return canopy2uuids, uuid2canopy
Пример #7
0
def create_tables(config):
    """Create the temporary inventor disambiguation tables.

    ``tmp_inventor_disambiguation_granted2`` is created through the granted
    connection and ``tmp_inventor_disambiguation_pregranted2`` through the
    pregranted one; both have ``uuid`` and ``disambiguated_id`` columns of
    type ``VARCHAR(255)``.

    Both connections are obtained before any statement runs; cursors and
    connections are closed even when a statement raises (for example because
    a table already exists).

    Args:
        config: configuration object passed unchanged to
            ``pvdb.granted_table`` and ``pvdb.pregranted_table``.
    """
    # Statements are listed in the same order the connections are opened.
    statements = (
        "CREATE TABLE tmp_inventor_disambiguation_granted2 (uuid VARCHAR(255), disambiguated_id VARCHAR(255))",
        "CREATE TABLE tmp_inventor_disambiguation_pregranted2 (uuid VARCHAR(255), disambiguated_id VARCHAR(255))",
    )
    connections = []
    try:
        connections.append(pvdb.granted_table(config))
        connections.append(pvdb.pregranted_table(config))
        for cnx, statement in zip(connections, statements):
            cursor = cnx.cursor()
            try:
                cursor.execute(statement)
            finally:
                cursor.close()
    finally:
        # NOTE(review): assumes every pvdb factory call hands back a
        # connection owned by the caller (not a shared one) - confirm.
        for cnx in connections:
            if cnx is not None:
                cnx.close()
def build_pregrants(config):
    """Bucket pregranted assignee mentions by their first name feature.

    Each ``rawassignee`` row from the pregranted database is turned into an
    ``AssigneeMention`` and appended to the list stored under
    ``mention.name_features()[0]``. The ``sequence`` column is shifted by -1
    in the query itself.

    Args:
        config: configuration object passed to ``pvdb.pregranted_table``.

    Returns:
        A ``defaultdict(list)`` from feature to the mentions carrying it.
    """
    # Column layout noted by the original author (presumably of rawassignee):
    # | id | document_number | sequence | name_first | name_last | organization | type | rawlocation_id | city | state | country | filename | created_date | updated_date |
    select_assignees = "SELECT id, document_number, sequence -1 as sequence, name_first, name_last, organization, type, rawlocation_id, city, state, country FROM rawassignee"

    cnx = pvdb.pregranted_table(config)
    cursor = cnx.cursor()
    cursor.execute(select_assignees)

    feature_map = collections.defaultdict(list)
    for seen, row in enumerate(cursor, start=1):
        mention = AssigneeMention.from_application_sql_record(row)
        feature_map[mention.name_features()[0]].append(mention)
        # Progress line every 10000 rows: rows read so far and bucket count.
        logging.log_every_n(logging.INFO,
                            'Processed %s pregrant records - %s features',
                            10000, seen, len(feature_map))
    return feature_map
Пример #9
0
def create_uuid_map(config):
    """Build lookups from composite record keys to ``rawassignee`` row ids.

    Args:
        config: configuration object passed to ``pvdb.granted_table`` and
            ``pvdb.pregranted_table``.

    Returns:
        A pair ``(granted_uuids, pgranted_uuids)`` of dicts:

        * granted: ``'<patent_id>-<sequence>'`` -> ``uuid``
        * pregranted: ``'pg-<document_number>-<sequence>'`` -> ``id``, where
          ``sequence`` is the stored value minus one (done in the query).

        When a key repeats, the row read last wins.
    """
    cnx_g = pvdb.granted_table(config)
    cnx_pg = pvdb.pregranted_table(config)

    granted_cursor = cnx_g.cursor()
    granted_cursor.execute("SELECT uuid, patent_id, sequence FROM rawassignee;")
    granted_uuids = {
        '%s-%s' % (patent_id, seq): uuid
        for uuid, patent_id, seq in tqdm(granted_cursor, 'granted uuids')
    }

    pregranted_cursor = cnx_pg.cursor()
    pregranted_cursor.execute("SELECT id, document_number, sequence-1 as sequence FROM rawassignee;")
    pgranted_uuids = {
        'pg-%s-%s' % (doc_id, seq): uuid
        for uuid, doc_id, seq in tqdm(pregranted_cursor, 'pregranted uuids')
    }

    return granted_uuids, pgranted_uuids
def _insert_assignee_batches(cnx, pairs, description, batch_size=100000):
    """Insert ``(uuid, assignee_id)`` pairs through ``cnx`` in batches, then commit."""
    # Values are bound by the driver rather than pasted into the SQL text, so
    # ids containing quotes or backslashes can no longer break (or inject
    # into) the statement.
    # NOTE(review): assumes the driver behind pvdb uses the "%s" paramstyle,
    # as the common MySQL drivers do - confirm.
    sql = ("INSERT INTO temp_assignee_disambiguation_mapping "
           "(uuid, assignee_id, version_indicator) VALUES (%s, %s, %s)")
    cursor = cnx.cursor()
    try:
        for start in tqdm(range(0, len(pairs), batch_size), description):
            rows = [(uuid, assignee_id, "20201229")
                    for uuid, assignee_id in pairs[start:start + batch_size]]
            cursor.executemany(sql, rows)
        cnx.commit()
    finally:
        cursor.close()


def upload(granted_ids, pregranted_ids, config):
    """Load assignee disambiguation results into both mapping tables.

    The tab-separated file at ``config['ASSIGNEE_UPLOAD']['input']`` is read
    line by line. The first field is looked up in ``pregranted_ids`` first and
    in ``granted_ids`` second; on a hit, the mapped uuid and the second field
    are queued for ``temp_assignee_disambiguation_mapping`` in the matching
    database. Lines whose first field is in neither mapping are ignored.
    A line with a known first field but no second field is reported with
    ``print`` and skipped instead of raising ``IndexError``.

    Rows are written in batches of 100000 with ``version_indicator`` fixed
    to ``"20201229"``. Both connections are opened before anything is
    written and are closed even if an insert fails.

    Args:
        granted_ids: mapping from record key to granted uuid.
        pregranted_ids: mapping from record key to pregranted uuid.
        config: configuration mapping; also passed to the ``pvdb`` factories.
    """
    pairs_pregranted = []
    pairs_granted = []
    with open(config['ASSIGNEE_UPLOAD']['input'], 'r') as fin:
        for line in fin:
            fields = line.strip().split('\t')
            key = fields[0]
            if key in pregranted_ids:
                target, uuid = pairs_pregranted, pregranted_ids[key]
            elif key in granted_ids:
                target, uuid = pairs_granted, granted_ids[key]
            else:
                continue
            if len(fields) < 2:
                # Known record without an assignee id: nothing to upload.
                print('error %s' % str(fields))
                continue
            target.append((uuid, fields[1]))

    cnx_g = pvdb.granted_table(config)
    try:
        cnx_pg = pvdb.pregranted_table(config)
        try:
            _insert_assignee_batches(cnx_g, pairs_granted, 'adding granted')
            _insert_assignee_batches(cnx_pg, pairs_pregranted,
                                     'adding pregranted')
        finally:
            cnx_pg.close()
    finally:
        cnx_g.close()
def create_uuid_map(config):
    """Collect every ``rawlocation.id`` from the granted and pregranted databases.

    Args:
        config: configuration object passed to ``pvdb.granted_table`` and
            ``pvdb.pregranted_table``.

    Returns:
        A pair ``(granted_uuids, pgranted_uuids)`` of sets holding the first
        column of each fetched row.
    """
    select_ids = "SELECT id FROM rawlocation;"

    cnx_g = pvdb.granted_table(config)
    cnx_pg = pvdb.pregranted_table(config)

    granted_cursor = cnx_g.cursor()
    granted_cursor.execute(select_ids)
    granted_uuids = {row[0] for row in tqdm(granted_cursor, 'granted uuids')}

    pregranted_cursor = cnx_pg.cursor()
    pregranted_cursor.execute(select_ids)
    pgranted_uuids = {
        row[0] for row in tqdm(pregranted_cursor, 'pregranted uuids')
    }

    return granted_uuids, pgranted_uuids
def build_pregrants(config):
    """Map pregranted applications to their invention titles.

    Args:
        config: configuration object passed to ``pvdb.pregranted_table``.

    Returns:
        A dict from ``'pg-<document_number>'`` to ``invention_title``. It is
        empty when ``pvdb.pregranted_table`` returns ``None``.
    """
    titles = {}
    cnx = pvdb.pregranted_table(config)
    if cnx is None:
        return titles

    cursor = cnx.cursor()
    cursor.execute("select document_number,invention_title from application;")

    seen = 0  # stays 0 when the query yields no rows
    for seen, (document_number, title) in enumerate(cursor, start=1):
        titles['pg-%s' % document_number] = title
        logging.log_every_n(logging.INFO,
                            'Processed %s pregrant records - %s features',
                            10000, seen, len(titles))

    # Final summary, emitted regardless of the every-n throttling above.
    logging.log(logging.INFO, 'Processed %s pregrant records - %s features',
                seen, len(titles))
    return titles
def build_pregrants(config):
    """Group pregranted inventor ids into canopies.

    Each ``rawinventor`` row is wrapped in an ``InventorMention`` (missing
    first/last names become empty strings) and its ``id`` is appended to the
    list stored under ``first_letter_last_name(mention)``.

    Args:
        config: configuration object passed to ``pvdb.pregranted_table``.

    Returns:
        A ``defaultdict(list)`` from canopy key to inventor ids. It is empty
        when ``pvdb.pregranted_table`` returns ``None``.
    """
    canopy2uuids = collections.defaultdict(list)

    cnx = pvdb.pregranted_table(config)
    if cnx is None:
        # No pregranted table has been specified; nothing to read.
        return canopy2uuids

    cursor = cnx.cursor()
    cursor.execute("SELECT id, name_first, name_last FROM rawinventor;")

    seen = 0  # stays 0 when the query yields no rows
    for seen, (uuid, name_first, name_last) in enumerate(cursor, start=1):
        mention = InventorMention(uuid, '0', '', name_first or '',
                                  name_last or '', '', '', '')
        canopy2uuids[first_letter_last_name(mention)].append(uuid)
        logging.log_every_n(logging.INFO,
                            'Processed %s pregrant records - %s canopies',
                            10000, seen, len(canopy2uuids))

    # Final summary, emitted regardless of the every-n throttling above.
    logging.log(logging.INFO, 'Processed %s pregrant records - %s canopies',
                seen, len(canopy2uuids))

    return canopy2uuids
Пример #14
0
def _load_inventor_table(cnx, table, pairs, description, batch_size=100000):
    """Insert ``(uuid, disambiguated_id)`` pairs into ``table``, commit, then add the primary key."""
    # ``table`` is one of the fixed names chosen by ``upload``; only the row
    # values are bound by the driver, so ids containing quotes or backslashes
    # can no longer break (or inject into) the statement.
    # NOTE(review): assumes the driver behind pvdb uses the "%s" paramstyle,
    # as the common MySQL drivers do - confirm.
    sql = "INSERT INTO " + table + " (uuid, disambiguated_id) VALUES (%s, %s)"
    cursor = cnx.cursor()
    try:
        for start in tqdm(range(0, len(pairs), batch_size), description):
            cursor.executemany(sql, pairs[start:start + batch_size])
        cnx.commit()
        # The key is added only after the bulk load has been committed.
        cursor.execute('alter table ' + table + ' add primary key (uuid)')
    finally:
        cursor.close()


def upload(config):
    """Load inventor disambiguation results into the temporary tables.

    The ids known to ``load_mysql.Loader.from_config(config)`` (all members
    of its pregranted and granted canopies) decide where each line of the
    tab-separated file ``config['INVENTOR_UPLOAD']['input']`` goes:
    pregranted ids are checked first and end up in
    ``tmp_inventor_disambiguation_pregranted2``, granted ids in
    ``tmp_inventor_disambiguation_granted2``; other lines are ignored.

    The file is read once. Lines that do not have exactly two fields are
    reported with ``print``; those with fewer than two fields are skipped
    (they used to raise ``IndexError`` when their id was known), while longer
    lines still contribute their first two fields.

    Rows are written in batches of 100000. Both connections are opened
    before anything is written and are closed even if a statement fails.

    Args:
        config: configuration mapping; also passed to the loader and to the
            ``pvdb`` factories.
    """
    loader = load_mysql.Loader.from_config(config)
    pregranted_ids = {
        uuid
        for canopy in loader.pregranted_canopies.values()
        for uuid in canopy
    }
    granted_ids = {
        uuid for canopy in loader.granted_canopies.values() for uuid in canopy
    }

    pairs_pregranted = []
    pairs_granted = []
    with open(config['INVENTOR_UPLOAD']['input'], 'r') as fin:
        for line in fin:
            fields = line.strip().split('\t')
            if len(fields) != 2:
                print('error %s' % str(fields))
                if len(fields) < 2:
                    continue
            pair = (fields[0], fields[1])
            if pair[0] in pregranted_ids:
                pairs_pregranted.append(pair)
            elif pair[0] in granted_ids:
                pairs_granted.append(pair)

    cnx_g = pvdb.granted_table(config)
    try:
        cnx_pg = pvdb.pregranted_table(config)
        try:
            _load_inventor_table(cnx_g,
                                 'tmp_inventor_disambiguation_granted2',
                                 pairs_granted, 'adding granted')
            _load_inventor_table(cnx_pg,
                                 'tmp_inventor_disambiguation_pregranted2',
                                 pairs_pregranted, 'adding pregranted')
        finally:
            cnx_pg.close()
    finally:
        cnx_g.close()