def run(self, args):
    from ..tools import refresh, postcodes
    from ..indexer.indexer import Indexer

    if args.postcodes:
        if postcodes.can_compute(args.config.get_libpq_dsn()):
            LOG.warning("Update postcodes centroid")
            tokenizer = self._get_tokenizer(args.config)
            postcodes.update_postcodes(args.config.get_libpq_dsn(),
                                       args.project_dir, tokenizer)
            indexer = Indexer(args.config.get_libpq_dsn(), tokenizer,
                              args.threads or 1)
            indexer.index_postcodes()
        else:
            LOG.error("The place table doesn't exist. "
                      "Postcode updates on a frozen database are not possible.")

    if args.word_counts:
        LOG.warning('Recompute word statistics')
        self._get_tokenizer(args.config).update_statistics()

    if args.address_levels:
        LOG.warning('Updating address levels')
        with connect(args.config.get_libpq_dsn()) as conn:
            refresh.load_address_levels_from_config(conn, args.config)

    if args.functions:
        LOG.warning('Create functions')
        with connect(args.config.get_libpq_dsn()) as conn:
            refresh.create_functions(conn, args.config,
                                     args.diffs, args.enable_debug_statements)
            self._get_tokenizer(args.config).update_sql_functions(args.config)

    if args.wiki_data:
        data_path = Path(args.config.WIKIPEDIA_DATA_PATH
                         or args.project_dir)
        LOG.warning('Import wikipedia article importance from %s', data_path)
        if refresh.import_wikipedia_articles(args.config.get_libpq_dsn(),
                                             data_path) > 0:
            LOG.fatal('FATAL: Wikipedia importance dump file not found')
            return 1

    # Attention: importance MUST come after wiki data import.
    if args.importance:
        LOG.warning('Update importance values for database')
        with connect(args.config.get_libpq_dsn()) as conn:
            refresh.recompute_importance(conn)

    if args.website:
        webdir = args.project_dir / 'website'
        LOG.warning('Setting up website directory at %s', webdir)
        with connect(args.config.get_libpq_dsn()) as conn:
            refresh.setup_website(webdir, args.config, conn)

    return 0
def _update(args):
    from ..tools import replication
    from ..indexer.indexer import Indexer

    params = args.osm2pgsql_options(default_cache=2000, default_threads=1)
    params.update(base_url=args.config.REPLICATION_URL,
                  update_interval=args.config.get_int('REPLICATION_UPDATE_INTERVAL'),
                  import_file=args.project_dir / 'osmosischange.osc',
                  max_diff_size=args.config.get_int('REPLICATION_MAX_DIFF'),
                  indexed_only=not args.once)

    # Sanity check to not overwhelm the Geofabrik servers.
    if 'download.geofabrik.de' in params['base_url']\
       and params['update_interval'] < 86400:
        LOG.fatal("Update interval too low for download.geofabrik.de.\n"
                  "Please check install documentation "
                  "(https://nominatim.org/release-docs/latest/admin/Import-and-Update#"
                  "setting-up-the-update-process).")
        raise UsageError("Invalid replication update interval setting.")

    if not args.once:
        if not args.do_index:
            LOG.fatal("Indexing cannot be disabled when running updates continuously.")
            raise UsageError("Bad argument '--no-index'.")
        recheck_interval = args.config.get_int('REPLICATION_RECHECK_INTERVAL')

    while True:
        with connect(args.config.get_libpq_dsn()) as conn:
            start = dt.datetime.now(dt.timezone.utc)
            state = replication.update(conn, params)
            if state is not replication.UpdateState.NO_CHANGES:
                status.log_status(conn, start, 'import')
            batchdate, _, _ = status.get_status(conn)

        if state is not replication.UpdateState.NO_CHANGES and args.do_index:
            index_start = dt.datetime.now(dt.timezone.utc)
            indexer = Indexer(args.config.get_libpq_dsn(),
                              args.threads or 1)
            indexer.index_boundaries(0, 30)
            indexer.index_by_rank(0, 30)

            with connect(args.config.get_libpq_dsn()) as conn:
                status.set_indexed(conn, True)
                status.log_status(conn, index_start, 'index')
        else:
            index_start = None

        if LOG.isEnabledFor(logging.WARNING):
            UpdateReplication._report_update(batchdate, start, index_start)

        if args.once:
            break

        if state is replication.UpdateState.NO_CHANGES:
            LOG.warning("No new changes. Sleeping for %d sec.", recheck_interval)
            time.sleep(recheck_interval)
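
# Note on the Geofabrik sanity check above: 86400 seconds is 24 hours, matching
# the daily diff cadence of download.geofabrik.de. A hypothetical project .env
# that passes the check could look like the following sketch; the URL and
# interval values are illustrative assumptions, not taken from the source.
#
#     NOMINATIM_REPLICATION_URL="https://download.geofabrik.de/europe/monaco-updates"
#     NOMINATIM_REPLICATION_UPDATE_INTERVAL=86400
#     NOMINATIM_REPLICATION_RECHECK_INTERVAL=900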
def run(args):
    from ..tools import refresh
    from ..tokenizer import factory as tokenizer_factory

    if args.postcodes:
        LOG.warning("Update postcodes centroid")
        refresh.update_postcodes(args.config.get_libpq_dsn(), args.sqllib_dir)

    if args.word_counts:
        LOG.warning('Recompute frequency of full-word search terms')
        refresh.recompute_word_counts(args.config.get_libpq_dsn(), args.sqllib_dir)

    if args.address_levels:
        cfg = Path(args.config.ADDRESS_LEVEL_CONFIG)
        LOG.warning('Updating address levels from %s', cfg)
        with connect(args.config.get_libpq_dsn()) as conn:
            refresh.load_address_levels_from_file(conn, cfg)

    if args.functions:
        LOG.warning('Create functions')
        with connect(args.config.get_libpq_dsn()) as conn:
            refresh.create_functions(conn, args.config, args.diffs,
                                     args.enable_debug_statements)
            tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
            tokenizer.update_sql_functions(args.config)

    if args.wiki_data:
        data_path = Path(args.config.WIKIPEDIA_DATA_PATH
                         or args.project_dir)
        LOG.warning('Import wikipedia article importance from %s', data_path)
        if refresh.import_wikipedia_articles(args.config.get_libpq_dsn(),
                                             data_path) > 0:
            LOG.fatal('FATAL: Wikipedia importance dump file not found')
            return 1

    # Attention: importance MUST come after wiki data import.
    if args.importance:
        LOG.warning('Update importance values for database')
        with connect(args.config.get_libpq_dsn()) as conn:
            refresh.recompute_importance(conn)

    if args.website:
        webdir = args.project_dir / 'website'
        LOG.warning('Setting up website directory at %s', webdir)
        refresh.setup_website(webdir, args.config)

    return 0
def create_db(dsn, rouser=None):
    """ Create a new database for the given DSN. Fails when the database
        already exists or the PostgreSQL version is too old.
        Uses `createdb` to create the database.

        If 'rouser' is given, then the function also checks that the user
        with that given name exists.

        The caller must have superuser rights.
    """
    proc = subprocess.run(['createdb'], env=get_pg_env(dsn), check=False)

    if proc.returncode != 0:
        raise UsageError('Creating new database failed.')

    with connect(dsn) as conn:
        postgres_version = conn.server_version_tuple()
        if postgres_version < POSTGRESQL_REQUIRED_VERSION:
            LOG.fatal('Minimum supported version of PostgreSQL is %d.%d. '
                      'Found version %d.%d.',
                      POSTGRESQL_REQUIRED_VERSION[0], POSTGRESQL_REQUIRED_VERSION[1],
                      postgres_version[0], postgres_version[1])
            raise UsageError('PostgreSQL server is too old.')

        if rouser is not None:
            with conn.cursor() as cur:
                cnt = cur.scalar('SELECT count(*) FROM pg_user where usename = %s',
                                 (rouser, ))
                if cnt == 0:
                    LOG.fatal("Web user '%s' does not exist. Create it with:\n"
                              "\n      createuser %s", rouser, rouser)
                    raise UsageError('Missing read-only user.')
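
def _example_create_project_db():
    """ Illustrative call only, not part of the original module: the database
        name and read-only web user below are assumptions. create_db() aborts
        with a UsageError if the database already exists, the server is too
        old, or the given web user is missing.
    """
    create_db('dbname=nominatim', rouser='www-data')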
def check_database(self, _):
    """ Check that the tokenizer is set up correctly.
    """
    hint = """\
           The PostgreSQL extension nominatim.so was not correctly loaded.

           Error: {error}

           Hints:
           * Check the output of the CMake/make installation step
           * Does nominatim.so exist?
           * Does nominatim.so exist on the database server?
           * Can nominatim.so be accessed by the database user?
           """
    with connect(self.dsn) as conn:
        with conn.cursor() as cur:
            try:
                out = cur.scalar("SELECT make_standard_name('a')")
            except psycopg2.Error as err:
                return hint.format(error=str(err))

    if out != 'a':
        return hint.format(error='Unexpected result for make_standard_name()')

    return None
def setup_country_tables(dsn, sql_dir, ignore_partitions=False):
    """ Create and populate the tables with basic static data that provides
        the background for geocoding. Data is assumed to not yet exist.
    """
    db_utils.execute_file(dsn, sql_dir / 'country_name.sql')
    db_utils.execute_file(dsn, sql_dir / 'country_osm_grid.sql.gz')

    params = []
    for ccode, props in _COUNTRY_INFO.items():
        if ccode is not None and props is not None:
            if ignore_partitions:
                partition = 0
            else:
                partition = props.get('partition')
            lang = props['languages'][0] if len(props['languages']) == 1 else None
            params.append((ccode, partition, lang))

    with connect(dsn) as conn:
        with conn.cursor() as cur:
            cur.execute_values(
                """ UPDATE country_name
                    SET partition = part, country_default_language_code = lang
                    FROM (VALUES %s) AS v (cc, part, lang)
                    WHERE country_code = v.cc""", params)
        conn.commit()
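
# Shape of the rows handed to execute_values() above (the values are made up
# for illustration, not taken from the source): one tuple per country code,
# with the partition forced to 0 when ignore_partitions is set and the default
# language filled in only when exactly one language is configured.
#
#     params = [('ad', 12, 'ca'), ('be', 3, None)]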
def index_full(self, analyse=True):
    """ Index the complete database. This will first index boundaries
        followed by all other objects. When `analyse` is True, then the
        database will be analysed at the appropriate places to
        ensure that database statistics are updated.
    """
    with connect(self.dsn) as conn:
        conn.autocommit = True

        if analyse:
            def _analyze():
                with conn.cursor() as cur:
                    cur.execute('ANALYZE')
        else:
            def _analyze():
                pass

        self.index_by_rank(0, 4)
        _analyze()

        self.index_boundaries(0, 30)
        _analyze()

        self.index_by_rank(5, 25)
        _analyze()

        self.index_by_rank(26, 30)
        _analyze()

        self.index_postcodes()
        _analyze()
def can_compute(dsn):
    """ Check that the place table exists so that
        postcodes can be computed.
    """
    with connect(dsn) as conn:
        return conn.table_exists('place')
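
def _example_refresh_postcodes(dsn, project_dir, tokenizer):
    """ Minimal usage sketch, not part of the original module: mirrors the
        guard used by the refresh command, skipping postcode recomputation on
        frozen databases where the place table has been dropped. All arguments
        are assumed to be supplied by the caller.
    """
    if can_compute(dsn):
        update_postcodes(dsn, project_dir, tokenizer)
    else:
        LOG.error("The place table doesn't exist. "
                  "Postcode updates on a frozen database are not possible.")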
def __init__(self, dsn, sanitizer, token_analysis):
    self.conn = connect(dsn).connection
    self.conn.autocommit = True
    self.sanitizer = sanitizer
    self.token_analysis = token_analysis

    self._cache = _TokenCache()
def init_from_project(self, config):
    """ Initialise the tokenizer from the project directory.
    """
    self.loader = ICURuleLoader(config)

    with connect(self.dsn) as conn:
        self.loader.load_config_from_db(conn)
def finalize_import(self, config):
    """ Do any required postprocessing to make the tokenizer data ready
        for use.
    """
    with connect(self.dsn) as conn:
        sqlp = SQLPreprocessor(conn, config)
        sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
def run(args):
    if args.import_from_wiki:
        LOG.warning('Special phrases importation starting')
        with connect(args.config.get_libpq_dsn()) as db_connection:
            SpecialPhrasesImporter(
                args.config, args.phplib_dir, db_connection
            ).import_from_wiki()
    return 0
def __init__(self, dsn, normalizer):
    self.conn = connect(dsn).connection
    self.conn.autocommit = True
    self.normalizer = normalizer
    psycopg2.extras.register_hstore(self.conn)

    self._cache = _TokenCache(self.conn)
def import_osm_data(osm_files, options, drop=False, ignore_errors=False):
    """ Import the given OSM files. 'options' contains the default settings
        for osm2pgsql.
    """
    options['import_file'] = osm_files
    options['append'] = False
    options['threads'] = 1

    if not options['flatnode_file'] and options['osm2pgsql_cache'] == 0:
        # Make some educated guesses about cache size based on the size
        # of the import file and the available memory.
        mem = psutil.virtual_memory()
        fsize = 0
        if isinstance(osm_files, list):
            for fname in osm_files:
                fsize += os.stat(str(fname)).st_size
        else:
            fsize = os.stat(str(osm_files)).st_size
        options['osm2pgsql_cache'] = int(min((mem.available + mem.cached) * 0.75,
                                             fsize * 2) / 1024 / 1024) + 1

    run_osm2pgsql(options)

    with connect(options['dsn']) as conn:
        if not ignore_errors:
            with conn.cursor() as cur:
                cur.execute('SELECT * FROM place LIMIT 1')
                if cur.rowcount == 0:
                    raise UsageError('No data imported by osm2pgsql.')

        if drop:
            conn.drop_table('planet_osm_nodes')

    if drop and options['flatnode_file']:
        Path(options['flatnode_file']).unlink()
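
def _example_cache_size():
    """ Worked example of the cache-size heuristic above, not part of the
        original module. The numbers are assumptions: a 10 GiB import file
        and 32 GiB of available + cached memory.
    """
    available_plus_cached = 32 * 1024**3
    file_size = 10 * 1024**3
    # min(24 GiB, 20 GiB) picks twice the file size; converted to MB plus one.
    return int(min(available_plus_cached * 0.75, file_size * 2) / 1024 / 1024) + 1  # 20481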
def update_status_table(self):
    """ Update the status in the status table to 'indexed'.
    """
    with connect(self.dsn) as conn:
        with conn.cursor() as cur:
            cur.execute('UPDATE import_status SET indexed = true')

        conn.commit()
def _init_db_tables(self, config):
    """ Set up the word table and fill it with pre-computed word
        frequencies.
    """
    with connect(self.dsn) as conn:
        sqlp = SQLPreprocessor(conn, config)
        sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
        conn.commit()
def run(args):
    from ..tools import freeze

    with connect(args.config.get_libpq_dsn()) as conn:
        freeze.drop_update_tables(conn)
    freeze.drop_flatnode_file(str(args.config.get_path('FLATNODE_FILE')))
    return 0
def __init__(self, dsn, normalizer, transliterator, abbreviations):
    self.conn = connect(dsn).connection
    self.conn.autocommit = True
    self.normalizer = normalizer
    self.transliterator = transliterator
    self.abbreviations = abbreviations

    self._cache = _TokenCache()
def init_from_project(self):
    """ Initialise the tokenizer from the project directory.
    """
    with connect(self.dsn) as conn:
        self.normalization = get_property(conn, DBCFG_NORMALIZATION)
        self.transliteration = get_property(conn, DBCFG_TRANSLITERATION)
        self.abbreviations = json.loads(get_property(conn, DBCFG_ABBREVIATIONS))
def update_sql_functions(self, config):
    """ Reimport the SQL functions for this tokenizer.
    """
    with connect(self.dsn) as conn:
        max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
        sqlp = SQLPreprocessor(conn, config)
        sqlp.run_sql_file(conn, 'tokenizer/legacy_icu_tokenizer.sql',
                          max_word_freq=max_word_freq)
def has_pending(self):
    """ Check if any data still needs indexing.
        This function must only be used after the import has finished.
        Otherwise it will be very expensive.
    """
    with connect(self.dsn) as conn:
        with conn.cursor() as cur:
            cur.execute("SELECT 'a' FROM placex WHERE indexed_status > 0 LIMIT 1")
            return cur.rowcount > 0
def _save_config(self, config):
    """ Save the configuration that needs to remain stable for the given
        database as database properties.
    """
    with connect(self.dsn) as conn:
        set_property(conn, DBCFG_NORMALIZATION, self.normalization)
        set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
        set_property(conn, DBCFG_TRANSLITERATION, self.transliteration)
        set_property(conn, DBCFG_ABBREVIATIONS, json.dumps(self.abbreviations))
def add_tiger_data(data_dir, config, threads):
    """ Import TIGER data from the directory or tar file `data_dir`.
    """
    dsn = config.get_libpq_dsn()
    sql_files, tar = handle_tarfile_or_directory(data_dir)

    if not sql_files:
        return

    with connect(dsn) as conn:
        sql = SQLPreprocessor(conn, config)
        sql.run_sql_file(conn, 'tiger_import_start.sql')

    # Reading sql_files and then for each file line handling
    # sql_query in <threads - 1> chunks.
    sel = selectors.DefaultSelector()
    place_threads = max(1, threads - 1)

    # Creates a pool of database connections
    for _ in range(place_threads):
        conn = DBConnection(dsn)
        conn.connect()
        sel.register(conn, selectors.EVENT_WRITE, conn)

    for sql_file in sql_files:
        if not tar:
            file = open(sql_file)
        else:
            file = tar.extractfile(sql_file)

        handle_threaded_sql_statements(sel, file)

    # Unregistering pool of database connections
    handle_unregister_connection_pool(sel, place_threads)

    if tar:
        tar.close()
    print('\n')

    LOG.warning("Creating indexes on Tiger data")
    with connect(dsn) as conn:
        sql = SQLPreprocessor(conn, config)
        sql.run_sql_file(conn, 'tiger_import_finish.sql')
def run(args):
    from ..tokenizer import factory as tokenizer_factory

    if args.import_from_wiki:
        LOG.warning('Special phrases importation starting')
        tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
        with connect(args.config.get_libpq_dsn()) as db_connection:
            SpecialPhrasesImporter(
                args.config, args.phplib_dir, db_connection
            ).import_from_wiki(tokenizer)
    return 0
def update_postcodes(dsn, project_dir, tokenizer):
    """ Update the table of artificial postcodes.

        Computes artificial postcode centroids from the placex table,
        potentially enhances it with external data and then updates the
        postcodes in the table 'location_postcode'.
    """
    with tokenizer.name_analyzer() as analyzer:
        with connect(dsn) as conn:
            # First get the list of countries that currently have postcodes.
            # (Doing this before starting to insert, so it is fast on import.)
            with conn.cursor() as cur:
                cur.execute("SELECT DISTINCT country_code FROM location_postcode")
                todo_countries = set((row[0] for row in cur))

            # Recompute the list of valid postcodes from placex.
            with conn.cursor(name="placex_postcodes") as cur:
                cur.execute("""
                SELECT cc as country_code, pc, ST_X(centroid), ST_Y(centroid)
                FROM (SELECT
                        COALESCE(plx.country_code,
                                 get_country_code(ST_Centroid(pl.geometry))) as cc,
                        token_normalized_postcode(pl.address->'postcode') as pc,
                        ST_Centroid(ST_Collect(COALESCE(plx.centroid,
                                                        ST_Centroid(pl.geometry)))) as centroid
                      FROM place AS pl LEFT OUTER JOIN placex AS plx
                             ON pl.osm_id = plx.osm_id AND pl.osm_type = plx.osm_type
                    WHERE pl.address ? 'postcode' AND pl.geometry IS NOT null
                    GROUP BY cc, pc) xx
                WHERE pc IS NOT null AND cc IS NOT null
                ORDER BY country_code, pc""")

                collector = None

                for country, postcode, x, y in cur:
                    if collector is None or country != collector.country:
                        if collector is not None:
                            collector.commit(conn, analyzer, project_dir)
                        collector = _CountryPostcodesCollector(country)
                        todo_countries.discard(country)
                    collector.add(postcode, x, y)

                if collector is not None:
                    collector.commit(conn, analyzer, project_dir)

            # Now handle any countries that are only in the postcode table.
            for country in todo_countries:
                _CountryPostcodesCollector(country).commit(conn, analyzer, project_dir)

            conn.commit()

        analyzer.update_postcodes_from_db()
def _init_replication(args):
    from ..tools import replication, refresh

    LOG.warning("Initialising replication updates")
    with connect(args.config.get_libpq_dsn()) as conn:
        replication.init_replication(conn, base_url=args.config.REPLICATION_URL)
        if args.update_functions:
            LOG.warning("Create functions")
            refresh.create_functions(conn, args.config, args.sqllib_dir,
                                     True, False)
    return 0
def _init_db_tables(self, config):
    """ Set up the word table and fill it with pre-computed word
        frequencies.
    """
    with connect(self.dsn) as conn:
        sqlp = SQLPreprocessor(conn, config)
        sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
        conn.commit()

    LOG.warning("Precomputing word tokens")
    db_utils.execute_file(self.dsn, config.lib_dir.data / 'words.sql')
def start_import(args, loader):
    """ Create the SPImporter object containing the right
        sp loader and then start the import of special phrases.
    """
    from ..tokenizer import factory as tokenizer_factory

    tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
    should_replace = not args.no_replace
    with connect(args.config.get_libpq_dsn()) as db_connection:
        SPImporter(
            args.config, db_connection, loader
        ).import_phrases(tokenizer, should_replace)
def import_base_data(dsn, sql_dir, ignore_partitions=False):
    """ Create and populate the tables with basic static data that provides
        the background for geocoding. Data is assumed to not yet exist.
    """
    db_utils.execute_file(dsn, sql_dir / 'country_name.sql')
    db_utils.execute_file(dsn, sql_dir / 'country_osm_grid.sql.gz')

    if ignore_partitions:
        with connect(dsn) as conn:
            with conn.cursor() as cur:
                cur.execute('UPDATE country_name SET partition = 0')
            conn.commit()
def update_sql_functions(self, config):
    """ Reimport the SQL functions for this tokenizer.
    """
    with connect(self.dsn) as conn:
        max_word_freq = properties.get_property(conn, DBCFG_MAXWORDFREQ)
        modulepath = config.DATABASE_MODULE_PATH or \
                     str((config.project_dir / 'module').resolve())
        sqlp = SQLPreprocessor(conn, config)
        sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer.sql',
                          max_word_freq=max_word_freq,
                          modulepath=modulepath)