def create_assignee(update_config):
    engine = create_engine(get_connection_string(update_config, "NEW_DB"))
    assignee_name_with_count = pd.read_sql_query(
        "SELECT assignee_id, name_first, name_last, type, count(1) name_count from rawassignee "
        "where assignee_id is not null and organization is null "
        "group by assignee_id, name_first, name_last, type;",
        con=engine)
    # For each assignee_id keep the most frequent (name_first, name_last, type) combination
    assignee_name_data = assignee_name_with_count.sort_values(
        "name_count", ascending=False).groupby("assignee_id").head(1).reset_index(drop=True)
    assignee_name_data = assignee_name_data.drop(
        "name_count", axis=1).assign(organization=None)
    assignee_name_data.rename({
        "assignee_id": "id"
    }, axis=1).to_sql(name='assignee', con=engine, if_exists='append', index=False)

    assignee_organization_with_count = pd.read_sql_query(
        "SELECT assignee_id, organization, type, count(1) org_count from rawassignee "
        "where assignee_id is not null and organization is not null "
        "group by assignee_id, organization, type;",
        con=engine)
    # For each assignee_id keep the most frequent organization
    assignee_org_data = assignee_organization_with_count.sort_values(
        "org_count", ascending=False).groupby("assignee_id").head(1).reset_index(drop=True)
    assignee_org_data = assignee_org_data.drop(
        "org_count", axis=1).assign(name_first=None, name_last=None)
    assignee_org_data.rename({
        "assignee_id": "id"
    }, axis=1).to_sql(name='assignee', con=engine, if_exists='append', index=False)
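
# Toy illustration (not part of the pipeline, data made up) of the "most frequent value per
# assignee_id" selection used in create_assignee above: sort by the count column descending,
# then keep the first row of each group.
import pandas as pd

_toy = pd.DataFrame({
    "assignee_id": ["A", "A", "B"],
    "name_first": ["Jo", "Joe", "Kim"],
    "name_count": [2, 5, 3],
})
_top = _toy.sort_values("name_count", ascending=False) \
           .groupby("assignee_id").head(1).reset_index(drop=True)
print(_top)  # keeps ("A", "Joe", 5) and ("B", "Kim", 3)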
def process_and_upload_wipo(config):
    myengine = create_engine(get_connection_string(config, "NEW_DB"))
    wipo_output = '{}/{}'.format(config['FOLDERS']['WORKING_FOLDER'], 'wipo_output')
    if not os.path.exists(wipo_output):
        os.mkdir(wipo_output)
    persistent_files = config['FOLDERS']['PERSISTENT_FILES']
    ipc_tech_file = '{}/ipc_technology.csv'.format(persistent_files)
    ipc_tech_field_map = get_ipc_tech_code_field_map(ipc_tech_file)
    concordance_file = '{}/{}/{}'.format(config['FOLDERS']['WORKING_FOLDER'],
                                         'cpc_input', 'ipc_concordance.txt')
    cpc_ipc_concordance_map = get_ipc_cpc_ipc_concordance_map(concordance_file)
    limit = 10000
    offset = 0
    batch_counter = 0
    base_query_template = "SELECT id from patent order by id limit {limit} offset {offset}"
    cpc_query_template = "SELECT c.patent_id, c.subgroup_id from cpc_current c " \
                         "join ({base_query}) p on p.id = c.patent_id"
    while True:
        start = time.time()
        batch_counter += 1
        base_query = base_query_template.format(limit=limit, offset=offset)
        cpc_join_query = cpc_query_template.format(base_query=base_query)
        cpc_current_data = pd.read_sql_query(con=myengine, sql=cpc_join_query)
        if cpc_current_data.shape[0] < 1:
            break
        wipo_chunk_processor(cpc_current_data, ipc_tech_field_map,
                             cpc_ipc_concordance_map, config)
        offset = offset + limit
        end = time.time()
        print("Chunk Time:" + str(round(end - start)))
    consolidate_wipo(config)
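
# Minimal sketch (illustration only, assuming an in-memory SQLite database) of the same
# limit/offset batching loop used in process_and_upload_wipo: fetch a page, process it,
# advance the offset, and stop when a page comes back empty.
import pandas as pd
from sqlalchemy import create_engine as _create_engine

_toy_engine = _create_engine("sqlite://")
pd.DataFrame({"id": range(25)}).to_sql("patent", _toy_engine, index=False)

_limit, _offset = 10, 0
while True:
    _chunk = pd.read_sql_query(
        "SELECT id FROM patent ORDER BY id LIMIT {limit} OFFSET {offset}".format(
            limit=_limit, offset=_offset), con=_toy_engine)
    if _chunk.shape[0] < 1:
        break
    print("processing {} rows starting at offset {}".format(_chunk.shape[0], _offset))
    _offset += _limit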
def session_generator(dbtype='grant'):
    """
    Read from ../project/Development/config.ini file and load appropriate database

    @dbtype: string indicating if we are fetching the session for the grant database
             or the application database

    session_generator will return an object that can be called to retrieve more sessions, e.g.
        sg = session_generator(dbtype='grant')
        session1 = sg()
        session2 = sg()
        etc.
    These sessions will be protected with the ping refresher above
    """
    # config = get_config()
    # echo = config.get('GLOBAL').get('echo')
    echo = True
    read_database = "NEW_DB"
    cstr = get_connection_string(config, read_database)
    engine = create_engine(cstr)
    # engine = create_engine('mysql+mysqldb://{0}:{1}@{2}/{3}?charset=utf8mb4'.format(
    #     config.get('DATABASE').get('user'),
    #     config.get('DATABASE').get('password'),
    #     config.get('DATABASE').get('host'),
    #     config.get('DATABASE').get(read_database), echo=echo),
    #     pool_size=3, pool_recycle=3600, echo_pool=True)
    schema.GrantBase.metadata.create_all(engine)
    Session = sessionmaker(bind=engine, _enable_transaction_accounting=False)
    return scoped_session(Session)
def rename_old_db(update_config):
    old_database = update_config["DATABASE"]["OLD_DB"]
    new_database = update_config["DATABASE"]["NEW_DB"]
    connection_string = get_connection_string(update_config, "OLD_DB")
    engine = create_engine(connection_string)
    tables = [
        t[0] for t in engine.execute('show tables from {}'.format(old_database))
        if not t[0].startswith('temp')
    ]
    tables_to_truncate = [
        'inventor_disambiguation_mapping', 'assignee_disambiguation_mapping',
        'assignee', 'cpc_current', 'cpc_group', 'cpc_subgroup', 'cpc_subsection',
        'inventor', 'location', 'location_assignee', 'location_inventor',
        'patent_assignee', 'patent_inventor', 'patent_lawyer'
    ]
    for table in tables:
        con = engine.connect()
        con.execute('alter table {0}.{2} rename {1}.{2}'.format(
            old_database, new_database, table))
        if table in tables_to_truncate:
            con.execute('truncate table {}.{}'.format(new_database, table))
        con.close()
def test_column_encoding(self):
    new_database = self.config['DATABASE']["TEMP_UPLOAD_DB"]
    connection_string = get_connection_string(self.config, database='TEMP_UPLOAD_DB')
    engine = create_engine(connection_string)
    collation_query = '''
        SELECT TABLE_NAME, COLUMN_NAME, character_set_name, collation_name
        FROM information_schema.columns
        WHERE table_schema='{new_db}'
        AND data_type IN ('varchar', 'longtext', 'mediumtext', 'text', 'enum', 'char', 'set', 'int')
    '''.format(new_db=new_database)
    collation_cursor = engine.execute(collation_query)
    for column_collation_name in collation_cursor:
        if column_collation_name[2] != 'utf8mb4':
            raise AssertionError(
                "Table character set should be utf8mb4 instead found {cset} for table {tbl}, column {col}".format(
                    cset=column_collation_name[2],
                    tbl=column_collation_name[0],
                    col=column_collation_name[1]))
        if column_collation_name[3] != 'utf8mb4_unicode_ci':
            raise AssertionError(
                "Table collation should be utf8mb4_unicode_ci instead found {collation} for table {tbl}, column {col}".format(
                    collation=column_collation_name[3],
                    tbl=column_collation_name[0],
                    col=column_collation_name[1]))
def test_inventor_table_generator(self, config):
    from updater.post_processing.post_process_inventor import create_inventor
    create_inventor(config)
    from sqlalchemy import create_engine
    import pandas as pd
    engine = create_engine(get_connection_string(config, "NEW_DB"))
    count_query = "SELECT count(1) as `rows` from inventor"
    distinct_id_query = "SELECT count(distinct inventor_id) inventors from rawinventor"
    column_query = """
        SELECT COLUMN_NAME
        from information_schema.COLUMNS
        where TABLE_SCHEMA='{schema}' and TABLE_NAME='inventor'""".format(
        schema=config["DATABASE"]["NEW_DB"])
    column_data = pd.read_sql_query(column_query, con=engine)
    expected_columns = ['id', 'name_first', 'name_last']
    actual_column_names = column_data.COLUMN_NAME.tolist()
    assert len(actual_column_names) == len(expected_columns)
    assert set(actual_column_names) == set(expected_columns)
    inv_count_data = pd.read_sql_query(count_query, con=engine)
    rinv_distinct_count_data = pd.read_sql_query(distinct_id_query, con=engine)
    inv_count = inv_count_data.rows.tolist()[0]
    rinv_distinct_counts = rinv_distinct_count_data.inventors.tolist()[0]
    assert inv_count == rinv_distinct_counts
    duplicate_query = "SELECT count(1) dups from (SELECT count(1) from inventor group by id having count(1)>1) x"
    dup_count = pd.read_sql_query(duplicate_query, con=engine)
    assert (dup_count.dups.tolist()[0] == 0)
    engine.execute("TRUNCATE TABLE inventor")
def fetch_session(dbtype='grant'):
    """
    Read from ../project/Development/config.ini file and load appropriate database

    @dbtype: string indicating if we are fetching the session for the grant database
             or the application database
    """
    # config = get_config()
    echo = True
    if dbtype == 'grant':  # this is here so we can port to work for applications also
        read_database = "NEW_DB"
    cstr = get_connection_string(config, read_database)
    engine = create_engine(cstr)
    # engine = create_engine('mysql+mysqldb://{0}:{1}@{2}/{3}?charset=utf8'.format(
    #     config['DATABASE']['USERNAME'],
    #     config['DATABASE']['PASSWORD'],
    #     config['DATABASE']['HOST'],
    #     config['DATABASE'][read_database], echo=echo))
    schema.GrantBase.metadata.create_all(engine)
    Session = sessionmaker(bind=engine, _enable_transaction_accounting=False)
    session = Session()
    return session
def prepare_tables(config):
    cstr = get_connection_string(config, 'NEW_DB')
    engine = create_engine(cstr + "&local_infile=1")
    timestamp = str(int(time.time()))
    with engine.connect() as connection:
        connection.execute(
            "CREATE TABLE rawlawyer_copy_backup_{} LIKE rawlawyer;".format(timestamp))
        connection.execute(
            "INSERT INTO rawlawyer_copy_backup_{} SELECT * FROM rawlawyer".format(timestamp))
    with engine.connect() as connection:
        connection.execute(
            "ALTER TABLE rawlawyer ADD COLUMN alpha_lawyer_id varchar(128) AFTER organization;")
    with engine.connect() as connection:
        connection.execute(
            "UPDATE rawlawyer rc SET rc.alpha_lawyer_id = rc.organization WHERE rc.organization IS NOT NULL;")
        connection.execute(
            "UPDATE rawlawyer rc SET rc.alpha_lawyer_id = concat(rc.name_first, '|', rc.name_last) WHERE "
            "rc.name_first IS NOT NULL AND rc.name_last IS NOT NULL;")
        connection.execute(
            "UPDATE rawlawyer rc SET rc.alpha_lawyer_id = '' WHERE rc.alpha_lawyer_id IS NULL;")
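
# Illustration only: the alpha_lawyer_id precedence produced by the UPDATE statements above,
# expressed on a made-up pandas frame. A complete name pair overrides an organization value
# (the second UPDATE runs after the first), and anything still null becomes an empty string.
import pandas as pd

_toy = pd.DataFrame({
    "organization": ["Acme LLP", None, None],
    "name_first": [None, "Ada", None],
    "name_last": [None, "Lovelace", None],
})
_alpha = _toy.name_first.str.cat(_toy.name_last, sep="|") \
             .fillna(_toy.organization).fillna("")
print(_alpha.tolist())  # ['Acme LLP', 'Ada|Lovelace', '']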
def upload_disambig_results(update_config):
    engine = create_engine(get_connection_string(update_config, "NEW_DB"))
    disambig_output_file = "{wkfolder}/disambig_output/{disamb_file}".format(
        wkfolder=update_config['FOLDERS']['WORKING_FOLDER'],
        disamb_file="inventor_disambiguation.tsv")
    disambig_output = pd.read_csv(disambig_output_file,
                                  sep="\t",
                                  chunksize=300000,
                                  header=None,
                                  quoting=csv.QUOTE_NONE,
                                  names=[
                                      'unknown_1', 'uuid', 'inventor_id', 'name_first',
                                      'name_middle', 'name_last', 'name_suffix'
                                  ])
    count = 0
    for disambig_chunk in disambig_output:
        engine.connect()
        start = time.time()
        count += disambig_chunk.shape[0]
        disambig_chunk[["uuid", "inventor_id"]].to_sql(name='inventor_disambiguation_mapping',
                                                       con=engine,
                                                       if_exists='append',
                                                       index=False,
                                                       method='multi')
        end = time.time()
        print("It took {duration} seconds to get to {cnt}".format(
            duration=round(end - start, 3), cnt=count))
    engine.dispose()
def upload_disambig_results(update_config):
    engine = create_engine(get_connection_string(update_config, "NEW_DB"))
    disambig_output_file = "{wkfolder}/disambig_output/{disamb_file}".format(
        wkfolder=update_config['FOLDERS']['WORKING_FOLDER'],
        disamb_file="assignee_disambiguation.tsv")
    disambig_output = pd.read_csv(
        disambig_output_file,
        sep="\t",
        chunksize=300000,
        header=None,
        quoting=csv.QUOTE_NONE,
        names=['uuid', 'organization_id', 'person_id', 'organization', 'name'])
    count = 0
    engine.execute("TRUNCATE TABLE assignee_disambiguation_mapping")
    for disambig_chunk in disambig_output:
        disambig_chunk = disambig_chunk.assign(
            assignee_id=disambig_chunk.organization_id)
        disambig_chunk.assignee_id.fillna(disambig_chunk.person_id, inplace=True)
        disambig_chunk.drop(["organization_id", "person_id"], axis=1, inplace=True)
        count += disambig_chunk.shape[0]
        engine.connect()
        start = time.time()
        disambig_chunk[["uuid", "assignee_id"]].to_sql(name='assignee_disambiguation_mapping',
                                                       con=engine,
                                                       if_exists='append',
                                                       index=False,
                                                       method='multi')
        end = time.time()
        print("It took {duration} seconds to get to {cnt}".format(
            duration=round(end - start, 3), cnt=count))
    engine.dispose()
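
# Minimal sketch (illustration only, using an in-memory SQLite engine and made-up rows) of
# the chunked read_csv -> to_sql append pattern used by both upload_disambig_results variants.
import io
import pandas as pd
from sqlalchemy import create_engine as _create_engine

_toy_engine = _create_engine("sqlite://")
_tsv = io.StringIO("u1\tA1\nu2\tA2\nu3\tA3\n")
for _chunk in pd.read_csv(_tsv, sep="\t", header=None,
                          names=["uuid", "assignee_id"], chunksize=2):
    _chunk.to_sql("assignee_disambiguation_mapping", _toy_engine,
                  if_exists="append", index=False, method="multi")
print(pd.read_sql_query("SELECT count(*) n FROM assignee_disambiguation_mapping",
                        _toy_engine).n[0])  # 3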
def test_inventor_precache(self, config):
    from updater.post_processing.post_process_inventor import precache_inventors
    from sqlalchemy import create_engine
    import pandas as pd
    count_query = """
        SELECT count(1) as `rows` from disambiguated_inventor_ids;
    """
    distinct_count_query = """
        SELECT count(distinct inventor_id) distinct_inventor_ids from rawinventor;
    """
    duplicate_query = """
        SELECT count(1) as duplicates
        from (SELECT inventor_id from disambiguated_inventor_ids group by inventor_id having count(1) > 1) x
    """
    engine = create_engine(get_connection_string(config, "NEW_DB"))
    pre_count = pd.read_sql_query(count_query, engine)
    assert (pre_count.rows.tolist()[0] == 0)
    precache_inventors(config)
    post_count = pd.read_sql_query(count_query, engine)
    distinct_count = pd.read_sql_query(distinct_count_query, engine)
    assert (post_count.rows.tolist()[0] == distinct_count.distinct_inventor_ids.tolist()[0])
    duplicate_count = pd.read_sql_query(duplicate_query, engine)
    assert (duplicate_count.duplicates.tolist()[0] == 0)
def test_inventor_generator(self, config, limit, offset, expected):
    from updater.post_processing.post_process_inventor import generate_disambiguated_inventors
    from sqlalchemy import create_engine
    engine = create_engine(get_connection_string(config, "NEW_DB"))
    inventors = generate_disambiguated_inventors(engine, limit, offset)
    assert (len(inventors.inventor_id.unique()) == expected)
    # disambiguated id, name first, name last, patent date
    assert (inventors.shape[1] == 4)
def upload_cpc_classes(config):
    cstr = get_connection_string(config, "NEW_DB")
    db_con = create_engine(cstr)
    cpc_folder = '{}/{}'.format(config['FOLDERS']['WORKING_FOLDER'], 'cpc_output')
    upload_cpc_small_tables(db_con, config['DATABASE']['NEW_DB'], cpc_folder)
    upload_cpc_subgroup(db_con, config['DATABASE']['NEW_DB'], cpc_folder)
def clean_data():
    yield
    config = get_config()
    config["DATABASE"]["NEW_DB"] = 'sarvo_test_db'
    from sqlalchemy import create_engine
    engine = create_engine(get_connection_string(config, "NEW_DB"))
    engine.execute("TRUNCATE TABLE inventor;")
    engine.execute("TRUNCATE TABLE disambiguated_inventor_ids;")
def extract_wipo_data(cpc_chunk, cpc_ipc_concordance, ipc_tech_map, config):
    # Obtain IPC concordance for each patent based on CPC subgroup ID
    cpc_current_with_concordance = cpc_chunk.merge(right=cpc_ipc_concordance,
                                                   how='left',
                                                   left_on='subgroup_id',
                                                   right_on='cpc_code').drop("cpc_code", axis=1)
    # If concordance does not exist, use subgroup id as IPC code
    cpc_current_with_concordance.ipc_code.fillna(
        cpc_current_with_concordance.subgroup_id, inplace=True)
    # Create lookup fields for IPC WIPO technology field id
    cpc_current_with_concordance = cpc_current_with_concordance.assign(
        section=cpc_current_with_concordance.ipc_code.str.slice(0, 4))
    cpc_current_with_concordance = cpc_current_with_concordance.assign(
        group=cpc_current_with_concordance.ipc_code.str.split("/", expand=True)[0])
    # Look up IPC tech field ID (WIPO field id)
    # First lookup using "section" column
    cpc_current_with_wipo_merge_1 = cpc_current_with_concordance.merge(
        right=ipc_tech_map, how='left', left_on='section',
        right_on='IPC_Code').drop('IPC_Code', axis=1).rename({"Field_number": "field_id"}, axis=1)
    # For failed lookups use "group" field
    secondary_lookup = cpc_current_with_wipo_merge_1.merge(right=ipc_tech_map,
                                                           how='left',
                                                           left_on='group',
                                                           right_on='IPC_Code')
    # Merge the secondary lookup with main dataset (merge by index)
    wipo_data_with_merge = cpc_current_with_wipo_merge_1.join(secondary_lookup[["Field_number"]])
    wipo_data_with_merge.field_id.fillna(wipo_data_with_merge.Field_number, inplace=True)
    # Clean up
    wipo_data = wipo_data_with_merge.dropna(subset=['field_id'], axis=0).drop(
        ["subgroup_id", "ipc_code", "section", "group", "Field_number"], axis=1)
    # Count each field ID for each patent
    wipo_count = wipo_data.groupby(["patent_id", "field_id"]).size().to_frame('wipo_count')
    wipo_count = wipo_count.reset_index()
    # Retain top 3 most frequent WIPO field IDs
    wipo_filtered_data = wipo_count.groupby("patent_id").apply(
        lambda _df: _df.nlargest(3, 'wipo_count', keep='all')).reset_index(drop=True)
    # Assign sequence
    wipo_filtered_data_sequenced = wipo_filtered_data.drop(["wipo_count"], axis=1).assign(
        sequence=wipo_filtered_data.groupby(['patent_id']).cumcount())
    cstr = get_connection_string(config, "TEMP_UPLOAD_DB")
    print(cstr)
    engine = create_engine(cstr)
    with engine.begin() as conn:
        wipo_filtered_data_sequenced.to_sql('wipo', conn, if_exists='append',
                                            index=False, method="multi")
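
# Toy illustration (made-up counts) of the last two steps in extract_wipo_data: keep the
# three most frequent field_ids per patent (ties kept by keep='all'), then number them with
# a 0-based sequence via cumcount.
import pandas as pd

_wipo_count = pd.DataFrame({
    "patent_id": ["p1", "p1", "p1", "p1", "p2"],
    "field_id": [1, 2, 3, 4, 7],
    "wipo_count": [5, 4, 3, 1, 2],
})
_top3 = _wipo_count.groupby("patent_id").apply(
    lambda _df: _df.nlargest(3, "wipo_count", keep="all")).reset_index(drop=True)
_sequenced = _top3.drop(["wipo_count"], axis=1).assign(
    sequence=_top3.groupby(["patent_id"]).cumcount())
print(_sequenced)  # p1 -> field_ids 1, 2, 3 with sequence 0-2; p2 -> field_id 7 with sequence 0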
def create_new_database(config):
    new_database = config["DATABASE"]["NEW_DB"]
    connection_string = get_connection_string(config, "OLD_DB")
    engine = create_engine(connection_string)
    connection = engine.connect()
    connection.execute(
        'create schema {} default character set=utf8mb4 default collate=utf8mb4_unicode_ci'
        .format(new_database))
    connection.close()
def prepare_cpc_table(config, drop_indexes):
    """
    Prepare the CPC Current table by dropping indexes
    :param config: Config file containing various runtime parameters
    :param drop_indexes: List of DROP INDEX statements
    """
    engine = create_engine(get_connection_string(config, "NEW_DB"))
    for drop_statement in drop_indexes:
        engine.execute(drop_statement[0])
def __init__(self, config):
    self.config = config
    self.new_db = config['DATABASE']['NEW_DB']
    self.old_db = config['DATABASE']['OLD_DB']
    self.old_id_col = 'disamb_inventor_id_{}'.format(self.old_db[-8:])
    self.new_id_col = 'disamb_inventor_id_{}'.format(self.new_db[-8:])
    self.cstr = get_connection_string(config, database='NEW_DB')
    self.id_to_gender = {}
def precache_assignee(config):
    assignee_cache_query = """
        INSERT INTO disambiguated_assignee_ids (assignee_id)
        SELECT distinct assignee_id from rawassignee
        UNION
        SELECT distinct assignee_id from {pgpubs_database}.rawassignee;
    """.format(pgpubs_database=config['DATABASE']['PGPUBS_DATABASE'])
    engine = create_engine(get_connection_string(config, "NEW_DB"))
    engine.execute(assignee_cache_query)
def __init__(self, config):
    self.config = config
    self.qa_connection_string = get_connection_string(self.config, 'QA_DATABASE')
    self.connection = pymysql.connect(host=self.config['DATABASE']['HOST'],
                                      user=self.config['DATABASE']['USERNAME'],
                                      password=self.config['DATABASE']['PASSWORD'],
                                      db=self.config['DATABASE']['NEW_DB'],
                                      charset='utf8mb4',
                                      cursorclass=pymysql.cursors.SSCursor,
                                      defer_connect=True)
    self.qa_data = {"DataMonitor_patentwithdrawncount": []}
def generate_timestamp_uploads(update_config):
    working_folder = update_config['FOLDERS']['WORKING_FOLDER']
    connection_string = get_connection_string(update_config, "TEMP_UPLOAD_DB")
    parsed_data_folder = "{working_folder}/{parsed_folder}".format(
        working_folder=working_folder, parsed_folder="parsed_data")
    for timestamp_folder in os.listdir(parsed_data_folder):
        timestamp_folder_full_path = "{source_root}/{folder_name}/".format(
            source_root=parsed_data_folder, folder_name=timestamp_folder)
        upload_from_timestamp_folder(timestamp_folder_full_path, connection_string)
def __init__(self, config, database_section, start_date, end_date):
    """
    Do not instantiate. Overridden class constructor
    :param config: ConfigParser object containing update parameters
    :param database_section: Section in config that indicates which database to use. NEW_DB or TEMP_UPLOAD_DB
    :param start_date: Database update start date
    :param end_date: Database update end date
    """
    # Tables that do not directly link to patent table
    self.patent_exclusion_list = [
        'mainclass', 'mainclass_current', 'subclass', 'subclass_current',
        'patent', 'rawlocation'
    ]
    # Update start and end date
    self.start_date = start_date
    self.end_date = end_date
    # Indicator for Upload/Patents database
    self.database_section = database_section
    self.qa_connection_string = get_connection_string(config, 'QA_DATABASE')
    self.connection = pymysql.connect(
        host=config['DATABASE']['HOST'],
        user=config['DATABASE']['USERNAME'],
        password=config['DATABASE']['PASSWORD'],
        db=config['DATABASE'][database_section],
        charset='utf8mb4',
        cursorclass=pymysql.cursors.SSCursor,
        defer_connect=True)
    # self.database_connection_string = get_connection_string(config, database_section)
    self.config = config
    # Placeholder for saving QA counts - keys map to table names in patent_QA
    self.qa_data = {
        "DataMonitor_count": [],
        'DataMonitor_nullcount': [],
        'DataMonitor_patentyearlycount': [],
        'DataMonitor_categorycount': [],
        'DataMonitor_floatingpatentcount': [],
        'DataMonitor_maxtextlength': [],
        'DataMonitor_prefixedentitycount': []
    }
    database_type, version = self.config["DATABASE"][self.database_section].split("_")
    self.version = version
    self.database_type = database_type
    # Following variables are overridden by inherited classes
    # Dict of tables involved in QC checks
    self.table_config = {}
    # Prefix indicates database where patent table is available
    # (current database if prefix is None); used for text databases
    self.patent_db_prefix = None
def consolidate_cpc_data(config, add_indexes):
    """
    Finalize CPC Current table by removing patents not in patent database and re-adding indexes
    :param config: Config file containing various runtime parameters
    :param add_indexes: List of ADD INDEX statements
    """
    engine = create_engine(get_connection_string(config, "NEW_DB"))
    delete_query = "DELETE cpc FROM cpc_current cpc LEFT JOIN patent p on p.id = cpc.patent_id WHERE p.id is null"
    engine.execute(delete_query)
    for add_statement in add_indexes:
        engine.execute(add_statement[0])
def start_lawyer_disambiguation(config):
    prepare_tables(config)
    clean_rawlawyer(config)
    load_clean_rawlawyer(config)
    disambiguator = LawyerDisambiguator(config)
    disambiguator.run_disambiguation()
    cstr = get_connection_string(config, 'NEW_DB')
    engine = create_engine(cstr + "&local_infile=1")
    engine.execute("ALTER TABLE rawlawyer DROP alpha_lawyer_id")
    engine.dispose()
def post_process_location(config):
    engine = create_engine(get_connection_string(config, "NEW_DB"))
    disambiguated_folder = "{}/disambig_output".format(config['FOLDERS']['WORKING_FOLDER'])
    print('here!', flush=True)
    id_lat_long_lookup, lat_long_canonical_name = make_lookup(disambiguated_folder)
    print('made lookup', flush=True)
    fips_lookups = create_fips_lookups(config['FOLDERS']['PERSISTENT_FILES'])
    upload_location(engine, lat_long_canonical_name, disambiguated_folder, fips_lookups)
    print('done locupload', flush=True)
    process_rawlocation(engine, lat_long_canonical_name, id_lat_long_lookup, disambiguated_folder)
    print('done process', flush=True)
    upload_rawloc(engine, disambiguated_folder, config['DATABASE']['NEW_DB'])
def update_rawinventor(update_config):
    engine = create_engine(get_connection_string(update_config, "NEW_DB"))
    update_statement = "UPDATE rawinventor ri " \
                       "left join inventor_disambiguation_mapping idm on idm.uuid = ri.uuid " \
                       "set ri.inventor_id = idm.inventor_id"
    engine.execute(update_statement)
def update_rawassignee(update_config):
    engine = create_engine(get_connection_string(update_config, "NEW_DB"))
    update_statement = "UPDATE rawassignee ra " \
                       "left join assignee_disambiguation_mapping adm on adm.uuid = ra.uuid " \
                       "set ra.assignee_id = adm.assignee_id"
    engine.execute(update_statement)
def test_table_encoding(self):
    new_database = self.config['DATABASE']["TEMP_UPLOAD_DB"]
    connection_string = get_connection_string(self.config, database='TEMP_UPLOAD_DB')
    engine = create_engine(connection_string)
    collation_query = "SELECT TABLE_NAME, TABLE_COLLATION from information_schema.tables where TABLE_SCHEMA='{new_db}'".format(
        new_db=new_database)
    collation_cursor = engine.execute(collation_query)
    for table_collation_row in collation_cursor:
        if table_collation_row[1] != 'utf8mb4_unicode_ci':
            raise AssertionError(
                "Table collation should be utf8mb4_unicode_ci instead found {collation} for table {tbl}".format(
                    collation=table_collation_row[1],
                    tbl=table_collation_row[0]))
def create_persistent_wide_entity(config, entity):
    db_con = create_engine(get_connection_string(config, 'NEW_DB') + '&local_infile=1')
    disambig_folder = "{}/disambig_output/".format(config['FOLDERS']['WORKING_FOLDER'])
    old_db = config['DATABASE']['OLD_DB']
    new_db = config['DATABASE']['NEW_DB']
    new_db_timestamp = new_db.replace('patent_', '')

    # set of values that change depending on entity
    persistent_long_table = 'persistent_{0}_disambig_long'.format(entity)
    raw_table = 'raw{0}'.format(entity)
    id_col = '{0}_id'.format(entity)
    outfile_name_wide = 'persistent_{}_wide.tsv'.format(entity)
    outfile_fp_wide = disambig_folder + outfile_name_wide
    persistent_disambig_table = 'persistent_{0}_disambig'.format(entity)

    # get disambig cols from old db's persistent_inventor_disambig
    disambig_cols = get_wide_entity_disambig_cols(db_con, old_db, persistent_disambig_table)

    # Add new column for this data update:
    raw_cols = ['current_{0}_id'.format(raw_table), 'old_{0}_id'.format(raw_table)]
    header_wide = [raw_cols[0], raw_cols[1]] + disambig_cols + [
        'disamb_{0}_id_'.format(entity) + new_db_timestamp
    ]
    print(header_wide)
    header_df = pd.DataFrame(columns=header_wide)
    header_df.to_csv(outfile_fp_wide, index=False, header=True, sep='\t')

    # get total rows in raw entity table
    result = db_con.execute('select count(*) from {0}.{1}'.format(new_db, raw_table))
    total_rows = [r[0] for r in result][0]

    write_wide_outfile(db_con, new_db, old_db, entity, persistent_long_table,
                       raw_table, id_col, total_rows, outfile_fp_wide, header_df)

    # 3. create table in database
    create_wide_table_database(db_con, entity, persistent_disambig_table, outfile_fp_wide)

    # 4. load data
    db_con.execute(
        "LOAD DATA LOCAL INFILE '{0}' INTO TABLE {1}.{2} FIELDS TERMINATED BY '\t' "
        "NULL DEFINED BY '' IGNORE 1 lines;".format(outfile_fp_wide, new_db,
                                                    persistent_disambig_table))
    return True
def test_database_encoding(self):
    new_database = self.config['DATABASE']["TEMP_UPLOAD_DB"]
    connection_string = get_connection_string(self.config, database='TEMP_UPLOAD_DB')
    engine = create_engine(connection_string)
    collation_query = "SELECT DEFAULT_CHARACTER_SET_NAME, DEFAULT_COLLATION_NAME from information_schema.schemata where SCHEMA_NAME='{new_db}'".format(
        new_db=new_database)
    collation_cursor = engine.execute(collation_query)
    cset, collation = collation_cursor.fetchall()[0]
    if cset != 'utf8mb4':
        raise AssertionError(
            "Database character set should be utf8mb4 instead found {cset}".format(cset=cset))
    if collation != 'utf8mb4_unicode_ci':
        raise AssertionError(
            "Database collation should be utf8mb4_unicode_ci instead found {collation}".format(
                collation=collation))