示例#1
0
def consolidate_entities(read_only):
    db = DatabaseConnection(path_config='utils/db_config.yaml',
                            search_path='mysql')

    ids1, eids1, edges = consolidate_people(db)
    ids2, eids2 = consolidate_companies(db)

    cur = db.dict_cursor()
    # 1. Reset eids to be equal to ids in entities.
    print "Reset eids"
    cur.execute("update entities set eid = id;")
    # 2. Consolidate people
    print "Update DB with people eids"
    update_eids_of_ids(cur, ids1, eids1)
    # 3. Consolidate companies
    print "Update DB with companies eids"
    update_eids_of_ids(cur, ids2, eids2)
    # 4. Remove neighbour edges
    print "Delete neighbour edges"
    cur.execute("DELETE from related where source=%s", ("neighbour", ))
    # 5. Add new neighbour edges
    add_neighbour_edges(cur, edges)
    # 6. Update related table
    print "Updating related eids"
    cur.execute(
        "UPDATE related SET eid1=entities.eid FROM entities WHERE related.id1=entities.id;"
    )
    cur.execute(
        "UPDATE related SET eid2=entities.eid FROM entities WHERE related.id2=entities.id;"
    )
    cur.close()
    if not read_only:
        db.commit()
    db.close()
示例#2
0
def main(args_dict):
    test_mode = not args_dict['disable_test_mode']
    if test_mode:
        print "======================="
        print "=======TEST MODE======="
        print "======================="

    timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
    # Write output into prod_schema_name
    prod_schema_name = "prod_" + timestamp
    print "prod_schema_name", prod_schema_name

    # Create database connections:
    # Read / write address cache from this one
    db_address_cache = DatabaseConnection(
        path_config='db_config_update_source.yaml',
        search_path='address_cache')
    # Write prod tables into this one
    db_prod = DatabaseConnection(path_config='db_config_update_source.yaml')
    CreateAndSetProdSchema(db_prod, prod_schema_name)

    # Initialize geocoder
    geocoder = geocoder_lib.Geocoder(db_address_cache, db_prod, test_mode)
    # Initialize entity lookup
    entities_lookup = entities.Entities(db_prod)
    # Table prod_tables.yaml defines a specifications of SQL selects to read
    # source data and describtion of additional tables to be created.
    with open('prod_tables.yaml', 'r') as stream:
        config = yaml.load(stream)
    # This is where all the population happens!!!
    # Go through all the specified data sources and process them, adding data
    # as needed. We process them in the order!
    for key in sorted(config.keys()):
        config_per_source = config[key]
        print "Working on source:", key
        ProcessSource(db_prod, geocoder, entities_lookup, config_per_source,
                      test_mode)
        print "GEOCODER STATS"
        geocoder.PrintStats()

    # Grant apps read-only access to the newly created schema and tables within
    db_prod.grant_usage_and_select_on_schema(prod_schema_name, 'data')
    db_prod.grant_usage_and_select_on_schema(prod_schema_name, 'verejne')
    db_prod.grant_usage_and_select_on_schema(prod_schema_name, 'kataster')
    db_prod.grant_usage_and_select_on_schema(prod_schema_name, 'prepojenia')
    db_prod.grant_usage_and_select_on_schema(prod_schema_name, 'obstaravania')

    # Commit database changes and close database connections
    db_address_cache.commit()
    db_address_cache.close()
    if test_mode:
        db_prod.conn.rollback()
    else:
        db_prod.commit()
        db_prod.close()
示例#3
0
def update_SQL_source(source, timestamp, dry_run, verbose):
    # Check that the (temporary) schema names created by this data source
    # do not conflict with existing schemas in the database
    db = DatabaseConnection(path_config='db_config_update_source.yaml')
    q = """SELECT schema_name FROM information_schema.schemata WHERE schema_name IN %s LIMIT 1;"""
    q_data = (tuple(source['schemas']),)
    res = db.query(q, q_data, return_dicts=False)
    db.close()
    if len(res) >= 1:
        raise Exception('Schema "%s" that source "%s" reads into already exists' % (res[0][0], source['name']))
    if verbose:
        print('[OK] No conflicting schema names found')

    # Download online resource if a URL is specified, storing it at the
    # location specified in source['path']
    if ('url' in source):
        urllib.urlretrieve(source['url'], source['path'])
        if verbose:
            print('[OK] Downloaded from %s to %s' % (source['url'], source['path']))

    if dry_run:
        print('[WARNING] --dry_run option not implemented for entire pipeline of updating an SQL source')
        db.close()
        return

    # Load into postgres, unzipping along the way
    if source['path'].endswith('.sql.gz'):
        p1 = subprocess.Popen(['gunzip', '-c', source['path']], stdout=subprocess.PIPE)
        subprocess.check_output(['psql', '-d', 'vd', '-q'], stdin=p1.stdout)
    # Load into postgres directly
    else:
        # The options -q -o /dev/null just suppress output
        subprocess.call(['psql', '-d', 'vd', '-f', source['path'], '-q', '-o', '/dev/null'])

    # Rename loaded schema(s) to the desired schema name(s)
    # If there is a single schema, rename it to source_NAME_TIMESTAMP
    # If there are multiple schemas, rename them to source_NAME_SCHEMA_TIMESTAMP
    db = DatabaseConnection(path_config='db_config_update_source.yaml')
    if len(source['schemas']) == 1:
        schema_old = source['schemas'][0]
        schema_new = 'source_' + source['name'] + '_' + timestamp
        db.rename_schema(schema_old, schema_new, verbose)
        # Grant privileges to user data for data/SourceDataInfo to work properly
        db.grant_usage_and_select_on_schema(schema_new, 'data')
    else:
        for schema_old in source['schemas']:
            schema_new = 'source_' + source['name'] + '_' + schema_old + '_' + timestamp
            db.rename_schema(schema_old, schema_new, verbose)
            # Grant privileges to user data for data/SourceDataInfo to work properly
            db.grant_usage_and_select_on_schema(schema_new, 'data')

    # Commit and close database connection
    db.commit()
    db.close()
示例#4
0
def update_CSV_source(source, timestamp, dry_run, verbose):
    # Load the CSV file
    with open(source['path'], 'r') as f:
        delimiter = str(source['delimiter']) # requires string, not unicode
        reader = csv.reader(f, delimiter=delimiter)

        # Extract column names from header line and then the actual data
        header = next(reader)
        column_names = [column_name.decode('utf-8') for column_name in header]
        data = [tuple(row) for row in reader]
    if verbose:
        print('Loaded CSV file with %d columns and %d data rows' % (len(column_names), len(data)))

    # Create postgres schema
    db = DatabaseConnection(path_config='db_config_update_source.yaml')
    schema = 'source_' + source['name'] + '_' + timestamp
    q = 'CREATE SCHEMA %s; SET search_path="%s";' % (schema, schema)
    db.execute(q)

    # Compute normalised column names, saving original names in a separate table
    column_names_normalised = map(normalise_CSV_column_name, column_names)
    q = 'CREATE TABLE column_names (name_original text, name_normalised text);'
    db.execute(q)
    q = """INSERT INTO column_names VALUES %s;"""
    q_data = [(original, normalised) for original, normalised in zip(column_names, column_names_normalised)]
    db.execute_values(q, q_data)

    # Create table containing the actual data from the CSV file
    table = source['table_name']
    table_columns = ', '.join(['%s text' % (name) for name in column_names_normalised])
    q = 'CREATE TABLE %s (%s);' % (table, table_columns)
    db.execute(q)

    # Populate the table with data
    q = 'INSERT INTO ' + table + ' VALUES %s;'
    db.execute_values(q, data)
    if verbose:
        print('Inserted %d rows into %s.%s%s' % (len(data), schema, table, ' (dry run)' if dry_run else ''))

    # Grant privileges to user data for data/SourceDataInfo to work properly
    db.grant_usage_and_select_on_schema(schema, 'data')

    # Commit and close database connection
    if not dry_run:
        db.commit()
    db.close()
示例#5
0
def update_JSON_source(source, timestamp, dry_run, verbose):
    # Load the JSON file
    data = json_load(source['path'])

    # Obtain column names appearing anywhere in the JSON
    columns = sorted(list(set(chain.from_iterable([datum.keys() for datum in data]))))
    if verbose:
        print('Loaded JSON files with %d columns and %d data rows' % (len(columns), len(data)))

    # Reorganise data into a list of tuples
    data = [tuple(datum[column] if column in datum else "" for column in columns) for datum in data]

    # Create postgres schema
    db = DatabaseConnection(path_config='db_config_update_source.yaml')
    schema = 'source_' + source['name'] + '_' + timestamp
    q = 'CREATE SCHEMA "%s"; SET search_path="%s";' % (schema, schema)
    db.execute(q)

    # Create table containing the actual data from the CSV file
    table = source['table_name']
    table_columns = ', '.join(['%s text' % (name) for name in columns])
    q = 'CREATE TABLE %s (%s);' % (table, table_columns)
    db.execute(q)

    # Populate the table with data
    q = 'INSERT INTO ' + table + ' VALUES %s;'
    db.execute_values(q, data)
    if verbose:
        print('Inserted %d rows into %s.%s%s' % (len(data), schema, table, ' (dry run)' if dry_run else ''))

    # Grant privileges to user data for data/SourceDataInfo to work properly
    db.grant_usage_and_select_on_schema(schema, 'data')

    # Commit and close database connection
    if not dry_run:
        db.commit()
    db.close()