def consolidate_entities(read_only):
    db = DatabaseConnection(path_config='utils/db_config.yaml', search_path='mysql')
    ids1, eids1, edges = consolidate_people(db)
    ids2, eids2 = consolidate_companies(db)
    cur = db.dict_cursor()

    # 1. Reset eids to be equal to ids in entities.
    print "Reset eids"
    cur.execute("update entities set eid = id;")

    # 2. Consolidate people
    print "Update DB with people eids"
    update_eids_of_ids(cur, ids1, eids1)

    # 3. Consolidate companies
    print "Update DB with companies eids"
    update_eids_of_ids(cur, ids2, eids2)

    # 4. Remove neighbour edges
    print "Delete neighbour edges"
    cur.execute("DELETE from related where source=%s", ("neighbour",))

    # 5. Add new neighbour edges
    add_neighbour_edges(cur, edges)

    # 6. Update related table
    print "Updating related eids"
    cur.execute(
        "UPDATE related SET eid1=entities.eid FROM entities WHERE related.id1=entities.id;")
    cur.execute(
        "UPDATE related SET eid2=entities.eid FROM entities WHERE related.id2=entities.id;")
    cur.close()

    if not read_only:
        db.commit()
    db.close()
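# Illustrative sketch (assumption, not from the original source): one way the
# update_eids_of_ids helper used above could assign new eids to the given
# entity ids in a single round trip, assuming `ids` and `eids` are parallel
# lists and `cur` is a psycopg2 cursor. The real helper may differ.
from psycopg2.extras import execute_values

def update_eids_of_ids_sketch(cur, ids, eids):
    # Send all (id, eid) pairs as a VALUES list and join against entities.
    execute_values(
        cur,
        """UPDATE entities SET eid = v.eid
           FROM (VALUES %s) AS v(id, eid)
           WHERE entities.id = v.id;""",
        zip(ids, eids))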
def main(args_dict):
    test_mode = not args_dict['disable_test_mode']
    if test_mode:
        print "======================="
        print "=======TEST MODE======="
        print "======================="

    timestamp = datetime.now().strftime('%Y%m%d%H%M%S')

    # Write output into prod_schema_name
    prod_schema_name = "prod_" + timestamp
    print "prod_schema_name", prod_schema_name

    # Create database connections:
    # Read / write address cache from this one
    db_address_cache = DatabaseConnection(
        path_config='db_config_update_source.yaml',
        search_path='address_cache')
    # Write prod tables into this one
    db_prod = DatabaseConnection(path_config='db_config_update_source.yaml')
    CreateAndSetProdSchema(db_prod, prod_schema_name)

    # Initialize geocoder
    geocoder = geocoder_lib.Geocoder(db_address_cache, db_prod, test_mode)
    # Initialize entity lookup
    entities_lookup = entities.Entities(db_prod)

    # The file prod_tables.yaml defines the specification of SQL selects used to
    # read source data and a description of the additional tables to be created.
    with open('prod_tables.yaml', 'r') as stream:
        config = yaml.load(stream)

    # This is where all the population happens!
    # Go through all the specified data sources in order and process them,
    # adding data as needed.
    for key in sorted(config.keys()):
        config_per_source = config[key]
        print "Working on source:", key
        ProcessSource(db_prod, geocoder, entities_lookup, config_per_source, test_mode)
    print "GEOCODER STATS"
    geocoder.PrintStats()

    # Grant apps read-only access to the newly created schema and the tables within
    db_prod.grant_usage_and_select_on_schema(prod_schema_name, 'data')
    db_prod.grant_usage_and_select_on_schema(prod_schema_name, 'verejne')
    db_prod.grant_usage_and_select_on_schema(prod_schema_name, 'kataster')
    db_prod.grant_usage_and_select_on_schema(prod_schema_name, 'prepojenia')
    db_prod.grant_usage_and_select_on_schema(prod_schema_name, 'obstaravania')

    # Commit database changes and close database connections
    db_address_cache.commit()
    db_address_cache.close()
    if test_mode:
        db_prod.conn.rollback()
    else:
        db_prod.commit()
    db_prod.close()
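# Illustrative sketch (assumption, not from the original source): how the
# args_dict consumed by main() might be built with argparse; the flag name
# mirrors the 'disable_test_mode' key read above.
import argparse

def parse_args_sketch():
    parser = argparse.ArgumentParser()
    parser.add_argument('--disable_test_mode', action='store_true', default=False,
                        help='Commit into the new prod schema instead of rolling back')
    return vars(parser.parse_args())

# Usage: main(parse_args_sketch())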
def update_SQL_source(source, timestamp, dry_run, verbose):
    # Check that the (temporary) schema names created by this data source
    # do not conflict with existing schemas in the database
    db = DatabaseConnection(path_config='db_config_update_source.yaml')
    q = """SELECT schema_name FROM information_schema.schemata
           WHERE schema_name IN %s LIMIT 1;"""
    q_data = (tuple(source['schemas']),)
    res = db.query(q, q_data, return_dicts=False)
    db.close()
    if len(res) >= 1:
        raise Exception('Schema "%s" that source "%s" reads into already exists'
                        % (res[0][0], source['name']))
    if verbose:
        print('[OK] No conflicting schema names found')

    # Download online resource if a URL is specified, storing it at the
    # location specified in source['path']
    if 'url' in source:
        urllib.urlretrieve(source['url'], source['path'])
        if verbose:
            print('[OK] Downloaded from %s to %s' % (source['url'], source['path']))

    if dry_run:
        print('[WARNING] --dry_run option not implemented for the entire pipeline of updating an SQL source')
        return

    # Load into postgres, unzipping along the way
    if source['path'].endswith('.sql.gz'):
        p1 = subprocess.Popen(['gunzip', '-c', source['path']], stdout=subprocess.PIPE)
        subprocess.check_output(['psql', '-d', 'vd', '-q'], stdin=p1.stdout)
    # Load into postgres directly
    else:
        # The options -q -o /dev/null just suppress output
        subprocess.call(['psql', '-d', 'vd', '-f', source['path'], '-q', '-o', '/dev/null'])

    # Rename loaded schema(s) to the desired schema name(s).
    # If there is a single schema, rename it to source_NAME_TIMESTAMP;
    # if there are multiple schemas, rename them to source_NAME_SCHEMA_TIMESTAMP.
    db = DatabaseConnection(path_config='db_config_update_source.yaml')
    if len(source['schemas']) == 1:
        schema_old = source['schemas'][0]
        schema_new = 'source_' + source['name'] + '_' + timestamp
        db.rename_schema(schema_old, schema_new, verbose)
        # Grant privileges to user data for data/SourceDataInfo to work properly
        db.grant_usage_and_select_on_schema(schema_new, 'data')
    else:
        for schema_old in source['schemas']:
            schema_new = 'source_' + source['name'] + '_' + schema_old + '_' + timestamp
            db.rename_schema(schema_old, schema_new, verbose)
            # Grant privileges to user data for data/SourceDataInfo to work properly
            db.grant_usage_and_select_on_schema(schema_new, 'data')

    # Commit and close database connection
    db.commit()
    db.close()
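# Illustrative example (assumption): the shape of a `source` entry that
# update_SQL_source expects, inferred from the keys accessed above; all
# values here are hypothetical.
example_sql_source = {
    'name': 'example_register',                 # used to build the target schema name
    'schemas': ['example_schema'],              # schema(s) the dump creates when loaded
    'path': '/tmp/example_register.sql.gz',     # local dump location (.sql or .sql.gz)
    'url': 'https://example.com/dump.sql.gz',   # optional: downloaded to `path` if present
}
# update_SQL_source(example_sql_source, timestamp='20180101000000',
#                   dry_run=True, verbose=True)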
def update_CSV_source(source, timestamp, dry_run, verbose):
    # Load the CSV file
    with open(source['path'], 'r') as f:
        delimiter = str(source['delimiter'])  # requires string, not unicode
        reader = csv.reader(f, delimiter=delimiter)

        # Extract column names from the header line and then the actual data
        header = next(reader)
        column_names = [column_name.decode('utf-8') for column_name in header]
        data = [tuple(row) for row in reader]
    if verbose:
        print('Loaded CSV file with %d columns and %d data rows' % (len(column_names), len(data)))

    # Create postgres schema
    db = DatabaseConnection(path_config='db_config_update_source.yaml')
    schema = 'source_' + source['name'] + '_' + timestamp
    q = 'CREATE SCHEMA %s; SET search_path="%s";' % (schema, schema)
    db.execute(q)

    # Compute normalised column names, saving original names in a separate table
    column_names_normalised = map(normalise_CSV_column_name, column_names)
    q = 'CREATE TABLE column_names (name_original text, name_normalised text);'
    db.execute(q)
    q = """INSERT INTO column_names VALUES %s;"""
    q_data = [(original, normalised) for original, normalised
              in zip(column_names, column_names_normalised)]
    db.execute_values(q, q_data)

    # Create a table containing the actual data from the CSV file
    table = source['table_name']
    table_columns = ', '.join(['%s text' % (name) for name in column_names_normalised])
    q = 'CREATE TABLE %s (%s);' % (table, table_columns)
    db.execute(q)

    # Populate the table with data
    q = 'INSERT INTO ' + table + ' VALUES %s;'
    db.execute_values(q, data)
    if verbose:
        print('Inserted %d rows into %s.%s%s' % (
            len(data), schema, table, ' (dry run)' if dry_run else ''))

    # Grant privileges to user data for data/SourceDataInfo to work properly
    db.grant_usage_and_select_on_schema(schema, 'data')

    # Commit and close database connection
    if not dry_run:
        db.commit()
    db.close()
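# Illustrative sketch (assumption, not from the original source): a possible
# implementation of the normalise_CSV_column_name helper used above, which
# turns original CSV headers into names safe to use as postgres column
# identifiers. The real helper may differ (e.g. in how it handles diacritics).
import re

def normalise_CSV_column_name_sketch(name):
    # Lowercase, collapse runs of non-alphanumeric characters into single
    # underscores, and strip leading/trailing underscores.
    return re.sub(r'[^a-z0-9]+', '_', name.lower()).strip('_')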
def update_JSON_source(source, timestamp, dry_run, verbose):
    # Load the JSON file
    data = json_load(source['path'])

    # Obtain column names appearing anywhere in the JSON
    columns = sorted(list(set(chain.from_iterable([datum.keys() for datum in data]))))
    if verbose:
        print('Loaded JSON file with %d columns and %d data rows' % (len(columns), len(data)))

    # Reorganise data into a list of tuples
    data = [tuple(datum[column] if column in datum else "" for column in columns)
            for datum in data]

    # Create postgres schema
    db = DatabaseConnection(path_config='db_config_update_source.yaml')
    schema = 'source_' + source['name'] + '_' + timestamp
    q = 'CREATE SCHEMA "%s"; SET search_path="%s";' % (schema, schema)
    db.execute(q)

    # Create a table containing the actual data from the JSON file
    table = source['table_name']
    table_columns = ', '.join(['%s text' % (name) for name in columns])
    q = 'CREATE TABLE %s (%s);' % (table, table_columns)
    db.execute(q)

    # Populate the table with data
    q = 'INSERT INTO ' + table + ' VALUES %s;'
    db.execute_values(q, data)
    if verbose:
        print('Inserted %d rows into %s.%s%s' % (
            len(data), schema, table, ' (dry run)' if dry_run else ''))

    # Grant privileges to user data for data/SourceDataInfo to work properly
    db.grant_usage_and_select_on_schema(schema, 'data')

    # Commit and close database connection
    if not dry_run:
        db.commit()
    db.close()
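# Illustrative example (assumption): the shape of a `source` entry for
# update_JSON_source, inferred from the keys accessed above; the file at
# `path` is expected to hold a list of flat JSON objects. Values are
# hypothetical.
example_json_source = {
    'name': 'example_dataset',
    'path': '/tmp/example_dataset.json',   # e.g. [{"id": "1", "label": "a"}, ...]
    'table_name': 'example_table',
}
# update_JSON_source(example_json_source, timestamp='20180101000000',
#                    dry_run=True, verbose=True)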