def test_get_transliteration_rules(self):
    self.config_rules()
    loader = ICURuleLoader(self.project_env)
    rules = loader.get_transliteration_rules()
    trans = Transliterator.createFromRules("test", rules)

    assert trans.transliterate(" проспект-Prospekt ") == " prospekt Prospekt "
def test_empty_rule_set(self):
    self.write_config("""\
        normalization:
        transliteration:
        token-analysis:
          - analyzer: generic
            variants:
        """)

    rules = ICURuleLoader(self.project_env)

    assert rules.get_search_rules() == ''
    assert rules.get_normalization_rules() == ''
    assert rules.get_transliteration_rules() == ''
def test_get_search_rules(cfgrules):
    loader = ICURuleLoader(cfgrules())
    rules = loader.get_search_rules()
    trans = Transliterator.createFromRules("test", rules)

    assert trans.transliterate(" Baum straße ") == " baum straße "
    assert trans.transliterate(" Baumstraße ") == " baumstraße "
    assert trans.transliterate(" Baumstrasse ") == " baumstrasse "
    assert trans.transliterate(" Baumstr ") == " baumstr "
    assert trans.transliterate(" Baumwegstr ") == " baumwegstr "
    assert trans.transliterate(" Αθήνα ") == " athēna "
    assert trans.transliterate(" проспект ") == " prospekt "
def test_empty_rule_set(test_config):
    (test_config.project_dir / 'icu_tokenizer.yaml').write_text(dedent("""\
        normalization:
        transliteration:
        token-analysis:
          - analyzer: generic
            variants:
        """))

    rules = ICURuleLoader(test_config)

    assert rules.get_search_rules() == ''
    assert rules.get_normalization_rules() == ''
    assert rules.get_transliteration_rules() == ''
def test_get_search_rules(self):
    self.config_rules()
    loader = ICURuleLoader(self.project_env)
    rules = loader.get_search_rules()
    trans = Transliterator.createFromRules("test", rules)

    assert trans.transliterate(" Baum straße ") == " baum straße "
    assert trans.transliterate(" Baumstraße ") == " baumstraße "
    assert trans.transliterate(" Baumstrasse ") == " baumstrasse "
    assert trans.transliterate(" Baumstr ") == " baumstr "
    assert trans.transliterate(" Baumwegstr ") == " baumwegstr "
    assert trans.transliterate(" Αθήνα ") == " athēna "
    assert trans.transliterate(" проспект ") == " prospekt "
def test_missing_section(section, test_config):
    rule_cfg = {s: [] for s in CONFIG_SECTIONS if s != section}
    (test_config.project_dir / 'icu_tokenizer.yaml').write_text(yaml.dump(rule_cfg))

    with pytest.raises(UsageError):
        ICURuleLoader(test_config)
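The section parameter above implies the test is parametrized over a module-level tuple of configuration sections. A minimal sketch of that constant, assuming it simply mirrors the three top-level keys used by the empty-rule-set configs in this file:

# Assumed definition: the three top-level keys exercised by the YAML configs
# above. The test itself is presumably parametrized over this tuple, e.g.
# @pytest.mark.parametrize("section", CONFIG_SECTIONS).
CONFIG_SECTIONS = ('normalization', 'transliteration', 'token-analysis')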
def test_search_rules(self):
    self.config_rules('~street => s,st', 'master => mstr')
    proc = ICURuleLoader(self.project_env).make_token_analysis()

    assert proc.search.transliterate('Master Street').strip() == 'master street'
    assert proc.search.transliterate('Earnes St').strip() == 'earnes st'
    assert proc.search.transliterate('Nostreet').strip() == 'nostreet'
def test_search_rules(cfgrules):
    config = cfgrules('~street => s,st', 'master => mstr')
    proc = ICURuleLoader(config).make_token_analysis()

    assert proc.search.transliterate('Master Street').strip() == 'master street'
    assert proc.search.transliterate('Earnes St').strip() == 'earnes st'
    assert proc.search.transliterate('Nostreet').strip() == 'nostreet'
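For context, a sketch of the configuration that the cfgrules fixture presumably writes for these two variant rules. The surrounding keys mirror the other configs in this file; the words: nesting under the generic analyzer is an assumption, not taken from the source.

from textwrap import dedent

# Hypothetical equivalent of cfgrules('~street => s,st', 'master => mstr').
# The "words:" nesting under the generic analyzer is an assumption.
EXAMPLE_VARIANT_CONFIG = dedent("""\
    normalization:
    transliteration:
    token-analysis:
      - analyzer: generic
        variants:
          - words:
              - ~street => s,st
              - master => mstr
    """)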
def test_transliteration_rules_from_file(self):
    self.write_config("""\
        normalization:
        transliteration:
          - "'ax' > 'b'"
          - !include transliteration.yaml
        token-analysis:
          - analyzer: generic
            variants:
        """)
    transpath = self.project_env.project_dir / ('transliteration.yaml')
    transpath.write_text('- "x > y"')

    loader = ICURuleLoader(self.project_env)
    rules = loader.get_transliteration_rules()
    trans = Transliterator.createFromRules("test", rules)

    assert trans.transliterate(" axxt ") == " byt "
def test_transliteration_rules_from_file(test_config):
    cfgpath = test_config.project_dir / ('icu_tokenizer.yaml')
    cfgpath.write_text(dedent("""\
        normalization:
        transliteration:
          - "'ax' > 'b'"
          - !include transliteration.yaml
        token-analysis:
          - analyzer: generic
            variants:
        """))
    transpath = test_config.project_dir / ('transliteration.yaml')
    transpath.write_text('- "x > y"')

    loader = ICURuleLoader(test_config)
    rules = loader.get_transliteration_rules()
    trans = Transliterator.createFromRules("test", rules)

    assert trans.transliterate(" axxt ") == " byt "
def test_get_normalization_rules(cfgrules):
    loader = ICURuleLoader(cfgrules())
    rules = loader.get_normalization_rules()
    trans = Transliterator.createFromRules("test", rules)

    assert trans.transliterate(" проспект-Prospekt ") == " проспект prospekt "
def test_invalid_variant_description(self, variant):
    with pytest.raises(UsageError):
        ICURuleLoader(self.cfgrules(variant))
def get_replacements(self, *variants):
    loader = ICURuleLoader(self.cfgrules(*variants))
    rules = loader.analysis[None].config['replacements']

    return sorted((k, sorted(v)) for k, v in rules)
class LegacyICUTokenizer(AbstractTokenizer):
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.loader = None


    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        self.loader = ICURuleLoader(config)

        self._install_php(config.lib_dir.php)
        self._save_config()

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)


    def init_from_project(self, config):
        """ Initialise the tokenizer from the project directory.
        """
        self.loader = ICURuleLoader(config)

        with connect(self.dsn) as conn:
            self.loader.load_config_from_db(conn)


    def finalize_import(self, config):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')


    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')


    def check_database(self, config):
        """ Check that the tokenizer is set up correctly.
        """
        # Will throw an error if there is an issue.
        self.init_from_project(config)


    def update_statistics(self):
        """ Recompute frequencies for all name words.
        """
        with connect(self.dsn) as conn:
            if conn.table_exists('search_name'):
                with conn.cursor() as cur:
                    cur.drop_table("word_frequencies")
                    LOG.info("Computing word frequencies")
                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                     SELECT unnest(name_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute("CREATE INDEX ON word_frequencies(id)")
                    LOG.info("Update word table with recomputed frequencies")
                    cur.execute("""UPDATE word
                                   SET info = info || jsonb_build_object('count', count)
                                   FROM word_frequencies WHERE word_id = id""")
                    cur.drop_table("word_frequencies")
            conn.commit()


    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

            ```
            with tokenizer.name_analyzer() as analyzer:
                analyzer.tokenize()
            ```

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
                                     self.loader.make_token_analysis())


    def _install_php(self, phpdir):
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent(f"""\
            <?php
            @define('CONST_Max_Word_Frequency', 10000000);
            @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
            @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
            require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))


    def _save_config(self):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        with connect(self.dsn) as conn:
            self.loader.save_config_to_db(conn)


    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
            conn.commit()
def test_missing_section(self, section):
    rule_cfg = {s: [] for s in CONFIG_SECTIONS if s != section}
    self.write_config(yaml.dump(rule_cfg))

    with pytest.raises(UsageError):
        ICURuleLoader(self.project_env)
def test_invalid_variant_description(self, variant):
    self.config_rules(variant)

    with pytest.raises(UsageError):
        ICURuleLoader(self.project_env)