示例#1
0
def setup_country_tables(dsn, sql_dir, ignore_partitions=False):
    """ Create and populate the tables with basic static data that provides
        the background for geocoding. Data is assumed to not yet exist.
    """
    db_utils.execute_file(dsn, sql_dir / 'country_name.sql')
    db_utils.execute_file(dsn, sql_dir / 'country_osm_grid.sql.gz')

    # Build one (country_code, partition, default_language) row per
    # country entry; a default language is only set when the country
    # has exactly one language.
    params = [
        (ccode,
         0 if ignore_partitions else props.get('partition'),
         props['languages'][0] if len(props['languages']) == 1 else None)
        for ccode, props in _COUNTRY_INFO.items()
        if ccode is not None and props is not None
    ]

    with connect(dsn) as conn:
        with conn.cursor() as cur:
            cur.execute_values(
                """ UPDATE country_name
                    SET partition = part, country_default_language_code = lang
                    FROM (VALUES %s) AS v (cc, part, lang)
                    WHERE country_code = v.cc""", params)
        conn.commit()
示例#2
0
def import_wikipedia_articles(dsn, data_path, ignore_errors=False):
    """ Replaces the wikipedia importance tables with new data.
        The import is run in a single transaction so that the new data
        is replaced seamlessly.

        `dsn` is the database connection string and `data_path` the
        directory expected to contain 'wikimedia-importance.sql.gz'.

        Returns 0 if all was well and 1 if the importance file could not
        be found. Throws an exception if there was an error reading the file.
    """
    datafile = data_path / 'wikimedia-importance.sql.gz'

    if not datafile.exists():
        return 1

    # Drop the old tables and load the new data inside one explicit
    # transaction, so readers never observe a partially imported state.
    pre_code = """BEGIN;
                  DROP TABLE IF EXISTS "wikipedia_article";
                  DROP TABLE IF EXISTS "wikipedia_redirect"
               """
    post_code = "COMMIT"
    execute_file(dsn,
                 datafile,
                 ignore_errors=ignore_errors,
                 pre_code=pre_code,
                 post_code=post_code)

    return 0
示例#3
0
def test_execute_file_with_pre_code(dsn, tmp_path, temp_db_cursor):
    # The pre_code must run before the file contents, so the INSERT
    # can rely on the table created by the pre_code.
    sqlfile = tmp_path / 'test.sql'
    sqlfile.write_text('INSERT INTO test VALUES(4)')

    db_utils.execute_file(dsn, sqlfile, pre_code='CREATE TABLE test (id INT)')

    assert temp_db_cursor.row_set('SELECT * FROM test') == {(4, )}
示例#4
0
def test_execute_file_success(dsn, temp_db_cursor, tmp_path):
    # A well-formed SQL file is executed completely.
    sqlfile = tmp_path / 'test.sql'
    sqlfile.write_text('CREATE TABLE test (id INT);\n'
                       'INSERT INTO test VALUES(56);')

    db_utils.execute_file(dsn, sqlfile)

    assert temp_db_cursor.row_set('SELECT * FROM test') == {(56, )}
示例#5
0
def test_execute_file_success(dsn, temp_db_cursor, tmp_path):
    # A well-formed SQL file is executed completely.
    sqlfile = tmp_path / 'test.sql'
    sqlfile.write_text('CREATE TABLE test (id INT);\nINSERT INTO test VALUES(56);')

    db_utils.execute_file(dsn, sqlfile)

    temp_db_cursor.execute('SELECT * FROM test')
    assert temp_db_cursor.rowcount == 1

    row = temp_db_cursor.fetchone()
    assert row[0] == 56
示例#6
0
    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            SQLPreprocessor(conn, config).run_sql_file(
                conn, 'tokenizer/legacy_tokenizer_tables.sql')
            conn.commit()

        LOG.warning("Precomputing word tokens")
        db_utils.execute_file(self.dsn, config.lib_dir.data / 'words.sql')
示例#7
0
def test_execute_file_with_post_code(dsn, tmp_path, temp_db_cursor):
    # The post_code runs after the file contents, so it may use the
    # table the file created.
    sqlfile = tmp_path / 'test.sql'
    sqlfile.write_text('CREATE TABLE test (id INT)')

    db_utils.execute_file(dsn, sqlfile,
                          post_code='INSERT INTO test VALUES(23)')

    temp_db_cursor.execute('SELECT * FROM test')
    assert temp_db_cursor.rowcount == 1

    row = temp_db_cursor.fetchone()
    assert row[0] == 23
示例#8
0
def import_base_data(dsn, sql_dir, ignore_partitions=False):
    """ Create and populate the tables with basic static data that provides
        the background for geocoding. Data is assumed to not yet exist.
    """
    for fname in ('country_name.sql', 'country_osm_grid.sql.gz'):
        db_utils.execute_file(dsn, sql_dir / fname)

    if not ignore_partitions:
        return

    # Partitioning disabled: collapse everything into partition 0.
    with connect(dsn) as conn:
        with conn.cursor() as cur:
            cur.execute('UPDATE country_name SET partition = 0')
        conn.commit()
示例#9
0
def load_data(dsn, data_dir, threads):
    """ Copy data into the word and placex table.

        `threads` is the total number of database connections to use;
        one connection is reserved for the interpolation import below,
        the rest copy place rows in parallel.
    """
    # Pre-calculate the most important terms in the word list.
    db_utils.execute_file(dsn, data_dir / 'words.sql')

    sel = selectors.DefaultSelector()
    # Then copy data from place to placex in <threads - 1> chunks.
    place_threads = max(1, threads - 1)
    for imod in range(place_threads):
        # Each worker connection copies the rows whose osm_id falls into
        # its modulo class. House interpolations are excluded here and
        # handled separately below. perform() is asynchronous; completion
        # is detected via the selector once the socket becomes readable.
        conn = DBConnection(dsn)
        conn.connect()
        conn.perform("""INSERT INTO placex ({0})
                         SELECT {0} FROM place
                         WHERE osm_id % {1} = {2}
                           AND NOT (class='place' and type='houses')
                           AND ST_IsValid(geometry)
                     """.format(_COPY_COLUMNS, place_threads, imod))
        sel.register(conn, selectors.EVENT_READ, conn)

    # Address interpolations go into another table.
    conn = DBConnection(dsn)
    conn.connect()
    conn.perform("""INSERT INTO location_property_osmline (osm_id, address, linegeo)
                      SELECT osm_id, address, geometry FROM place
                      WHERE class='place' and type='houses' and osm_type='W'
                            and ST_GeometryType(geometry) = 'ST_LineString'
                 """)
    sel.register(conn, selectors.EVENT_READ, conn)

    # Now wait for all of them to finish.
    todo = place_threads + 1
    while todo > 0:
        # A 1-second timeout so a progress dot is printed even while no
        # connection has finished yet.
        for key, _ in sel.select(1):
            conn = key.data
            sel.unregister(conn)
            conn.wait()
            conn.close()
            todo -= 1
        print('.', end='', flush=True)
    print('\n')

    with connect(dsn) as conn:
        with conn.cursor() as cur:
            # Refresh planner statistics after the bulk load. (ANALYSE is
            # the British spelling accepted by PostgreSQL.)
            cur.execute('ANALYSE')
示例#10
0
def test_execute_file_bad_sql_ignore_errors(dsn, tmp_path):
    # With ignore_errors=True a syntax error in the file must not raise.
    broken = tmp_path / 'test.sql'
    broken.write_text('CREATE STABLE test (id INT)')

    db_utils.execute_file(dsn, broken, ignore_errors=True)
示例#11
0
def test_execute_file_bad_sql(dsn, tmp_path):
    # A syntax error in the SQL file surfaces as a UsageError.
    broken = tmp_path / 'test.sql'
    broken.write_text('CREATE STABLE test (id INT)')

    with pytest.raises(UsageError):
        db_utils.execute_file(dsn, broken)
示例#12
0
def test_execute_file_bad_file(dsn, tmp_path):
    # A missing SQL file raises FileNotFoundError.
    missing = tmp_path / 'test2.sql'

    with pytest.raises(FileNotFoundError):
        db_utils.execute_file(dsn, missing)
示例#13
0
def recompute_word_counts(dsn, sql_dir):
    """ Compute the frequency of full-word search terms.
    """
    sqlfile = sql_dir / 'words_from_search_name.sql'
    execute_file(dsn, sqlfile)
示例#14
0
def update_postcodes(dsn, sql_dir):
    """ Recalculate postcode centroids and add, remove and update entries
        in the location_postcode table.

        `dsn` is the connection string of the database to update and
        `sql_dir` the directory containing the 'update-postcodes.sql'
        script.
    """
    execute_file(dsn, sql_dir / 'update-postcodes.sql')
示例#15
0
def import_postcodes(dsn, project_dir, tokenizer):
    """ Set up the initial list of postcodes.

        Creates helper tables for external GB and US postcode data,
        loads the data files from `project_dir` when present and then
        rebuilds the location_postcode table from placex and the helper
        tables. Finally lets the tokenizer pick up the new postcodes.
    """

    with connect(dsn) as conn:
        # Start from a clean slate: remove any leftover helper tables.
        conn.drop_table('gb_postcode')
        conn.drop_table('us_postcode')

        with conn.cursor() as cur:
            cur.execute("""CREATE TABLE gb_postcode (
                            id integer,
                            postcode character varying(9),
                            geometry GEOMETRY(Point, 4326))""")

        with conn.cursor() as cur:
            cur.execute("""CREATE TABLE us_postcode (
                            postcode text,
                            x double precision,
                            y double precision)""")
        conn.commit()

        # Optional external data files; silently skipped when absent.
        gb_postcodes = project_dir / 'gb_postcode_data.sql.gz'
        if gb_postcodes.is_file():
            execute_file(dsn, gb_postcodes)

        us_postcodes = project_dir / 'us_postcode_data.sql.gz'
        if us_postcodes.is_file():
            execute_file(dsn, us_postcodes)

        with conn.cursor() as cur:
            cur.execute("TRUNCATE location_postcode")
            # Primary source: normalized postcodes from OSM objects in
            # placex, one centroid per (country, postcode) pair.
            cur.execute("""
                INSERT INTO location_postcode
                 (place_id, indexed_status, country_code, postcode, geometry)
                SELECT nextval('seq_place'), 1, country_code,
                       token_normalized_postcode(address->'postcode') as pc,
                       ST_Centroid(ST_Collect(ST_Centroid(geometry)))
                  FROM placex
                 WHERE address ? 'postcode'
                       and token_normalized_postcode(address->'postcode') is not null
                       AND geometry IS NOT null
                 GROUP BY country_code, pc
            """)

            # Fill in US postcodes that OSM does not know about.
            cur.execute("""
                INSERT INTO location_postcode
                 (place_id, indexed_status, country_code, postcode, geometry)
                SELECT nextval('seq_place'), 1, 'us',
                       token_normalized_postcode(postcode),
                       ST_SetSRID(ST_Point(x,y),4326)
                  FROM us_postcode WHERE token_normalized_postcode(postcode) NOT IN
                        (SELECT postcode FROM location_postcode
                          WHERE country_code = 'us')
            """)

            # Same for GB postcodes from the external data set.
            cur.execute("""
                INSERT INTO location_postcode
                 (place_id, indexed_status, country_code, postcode, geometry)
                SELECT nextval('seq_place'), 1, 'gb',
                       token_normalized_postcode(postcode), geometry
                  FROM gb_postcode WHERE token_normalized_postcode(postcode) NOT IN
                           (SELECT postcode FROM location_postcode
                             WHERE country_code = 'gb')
            """)

            # Remove search tokens for postcodes that no longer exist.
            cur.execute("""
                    DELETE FROM word WHERE class='place' and type='postcode'
                    and word NOT IN (SELECT postcode FROM location_postcode)
            """)
        conn.commit()

        # Let the tokenizer (re)create tokens for the new postcode set.
        with tokenizer.name_analyzer() as analyzer:
            analyzer.add_postcodes_from_db()
示例#16
0
def import_postcodes(dsn, project_dir):
    """ Set up the initial list of postcodes.

        Creates helper tables for external GB and US postcode data,
        loads the data files from `project_dir` when present and then
        rebuilds the location_postcode table from placex and the helper
        tables.
    """

    with connect(dsn) as conn:
        # Start from a clean slate: remove any leftover helper tables.
        conn.drop_table('gb_postcode')
        conn.drop_table('us_postcode')

        with conn.cursor() as cur:
            cur.execute("""CREATE TABLE gb_postcode (
                            id integer,
                            postcode character varying(9),
                            geometry GEOMETRY(Point, 4326))""")

        with conn.cursor() as cur:
            cur.execute("""CREATE TABLE us_postcode (
                            postcode text,
                            x double precision,
                            y double precision)""")
        conn.commit()

        # Optional external data files; silently skipped when absent.
        gb_postcodes = project_dir / 'gb_postcode_data.sql.gz'
        if gb_postcodes.is_file():
            execute_file(dsn, gb_postcodes)

        us_postcodes = project_dir / 'us_postcode_data.sql.gz'
        if us_postcodes.is_file():
            execute_file(dsn, us_postcodes)

        with conn.cursor() as cur:
            cur.execute("TRUNCATE location_postcode")
            # Primary source: postcodes from OSM objects in placex, one
            # centroid per (country, postcode). Values containing ',' or
            # ';' look like lists of postcodes and are skipped.
            cur.execute("""
                INSERT INTO location_postcode
                 (place_id, indexed_status, country_code, postcode, geometry)
                SELECT nextval('seq_place'), 1, country_code,
                       upper(trim (both ' ' from address->'postcode')) as pc,
                       ST_Centroid(ST_Collect(ST_Centroid(geometry)))
                  FROM placex
                 WHERE address ? 'postcode' AND address->'postcode' NOT SIMILAR TO '%(,|;)%'
                       AND geometry IS NOT null
                 GROUP BY country_code, pc
            """)

            # Fill in US postcodes that OSM does not know about.
            cur.execute("""
                INSERT INTO location_postcode
                 (place_id, indexed_status, country_code, postcode, geometry)
                SELECT nextval('seq_place'), 1, 'us', postcode,
                       ST_SetSRID(ST_Point(x,y),4326)
                  FROM us_postcode WHERE postcode NOT IN
                        (SELECT postcode FROM location_postcode
                          WHERE country_code = 'us')
            """)

            # Same for GB postcodes from the external data set.
            cur.execute("""
                INSERT INTO location_postcode
                 (place_id, indexed_status, country_code, postcode, geometry)
                SELECT nextval('seq_place'), 1, 'gb', postcode, geometry
                  FROM gb_postcode WHERE postcode NOT IN
                           (SELECT postcode FROM location_postcode
                             WHERE country_code = 'gb')
            """)

            # Remove search tokens for postcodes that no longer exist.
            cur.execute("""
                    DELETE FROM word WHERE class='place' and type='postcode'
                    and word NOT IN (SELECT postcode FROM location_postcode)
            """)

            # Create word tokens for every distinct postcode now present.
            cur.execute("""
                SELECT count(getorcreate_postcode_id(v)) FROM
                (SELECT distinct(postcode) as v FROM location_postcode) p
            """)
        conn.commit()