def sql_functions(temp_db_conn, def_config, src_dir):
    # Temporarily point the configuration at the SQL library in the
    # source tree, load the utility and ICU tokenizer functions, then
    # restore the original path.
    orig_sql = def_config.lib_dir.sql
    def_config.lib_dir.sql = src_dir / 'lib-sql'
    sqlproc = SQLPreprocessor(temp_db_conn, def_config)
    sqlproc.run_sql_file(temp_db_conn, 'functions/utils.sql')
    sqlproc.run_sql_file(temp_db_conn, 'tokenizer/icu_tokenizer.sql')
    def_config.lib_dir.sql = orig_sql
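# A sketch of how a test might consume the fixture above, assuming the
# usual pytest fixture conventions and a cursor fixture on the same
# temporary database; 'temp_db_cursor' and the probed SQL function are
# placeholders, not names taken from the snippet.
def test_sql_functions_installed(sql_functions, temp_db_cursor):
    # Substitute any function defined by functions/utils.sql or
    # tokenizer/icu_tokenizer.sql.
    temp_db_cursor.execute('SELECT some_function_from_utils()')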
def _init_db_tables(self, config):
    """ Set up the word table and fill it with pre-computed word
        frequencies.
    """
    with connect(self.dsn) as conn:
        sqlp = SQLPreprocessor(conn, config)
        sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
        conn.commit()
def create_functions(conn, config, enable_diff_updates=True, enable_debug=False):
    """ (Re)create the PL/pgSQL functions.
    """
    sql = SQLPreprocessor(conn, config)
    sql.run_sql_file(conn, 'functions.sql',
                     disable_diff_updates=not enable_diff_updates,
                     debug=enable_debug)
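# Hypothetical call site for create_functions(), assuming a psycopg2-style
# connection from connect() (as used throughout these snippets) and a
# loaded configuration object; 'dsn' and 'config' are stand-ins supplied
# by the caller.
with connect(dsn) as conn:
    create_functions(conn, config, enable_diff_updates=False, enable_debug=True)
    conn.commit()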
def _init_db_tables(self, config):
    """ Set up the word table and fill it with pre-computed word
        frequencies.
    """
    with connect(self.dsn) as conn:
        sqlp = SQLPreprocessor(conn, config)
        sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
        conn.commit()

    LOG.warning("Precomputing word tokens")
    db_utils.execute_file(self.dsn, config.lib_dir.data / 'words.sql')
def update_sql_functions(self, config):
    """ Reimport the SQL functions for this tokenizer.
    """
    with connect(self.dsn) as conn:
        max_word_freq = properties.get_property(conn, DBCFG_MAXWORDFREQ)
        modulepath = config.DATABASE_MODULE_PATH or \
                     str((config.project_dir / 'module').resolve())
        sqlp = SQLPreprocessor(conn, config)
        sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer.sql',
                          max_word_freq=max_word_freq,
                          modulepath=modulepath)
def _init_db_tables(self, config):
    """ Set up the word table and fill it with pre-computed word
        frequencies.
    """
    with connect(self.dsn) as conn:
        sqlp = SQLPreprocessor(conn, config)
        sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
        conn.commit()

        LOG.warning("Precomputing word tokens")

        # get partial words and their frequencies
        words = Counter()
        with self.name_analyzer() as analyzer:
            with conn.cursor(name="words") as cur:
                cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")

                for name, cnt in cur:
                    term = analyzer.make_standard_word(name)
                    if term:
                        for word in term.split():
                            words[word] += cnt

        # copy them back into the word table
        copystr = io.StringIO(''.join(
            ('{}\t{}\n'.format(*args) for args in words.items())))

        with conn.cursor() as cur:
            copystr.seek(0)
            cur.copy_from(copystr, 'word',
                          columns=['word_token', 'search_name_count'])
            cur.execute("""UPDATE word SET word_id = nextval('seq_word')
                           WHERE word_id is null""")

        conn.commit()
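# Self-contained illustration of the Counter-to-COPY round trip used in
# _init_db_tables above, with made-up word counts. Only the buffer
# construction runs here; streaming it into the word table via
# cur.copy_from() requires a live connection.
import io
from collections import Counter

words = Counter({'main': 42, 'street': 17})
buf = io.StringIO(''.join('{}\t{}\n'.format(*item) for item in words.items()))
# buf now holds tab-separated rows, e.g. "main\t42\nstreet\t17\n", which
# copy_from() can stream into word(word_token, search_name_count).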
def add_tiger_data(data_dir, config, threads, tokenizer):
    """ Import tiger data from directory or tar file `data_dir`.
    """
    dsn = config.get_libpq_dsn()
    files, tar = handle_tarfile_or_directory(data_dir)

    if not files:
        return

    with connect(dsn) as conn:
        sql = SQLPreprocessor(conn, config)
        sql.run_sql_file(conn, 'tiger_import_start.sql')

    # Read the files and hand the SQL statements from each line to
    # <threads - 1> workers, keeping one thread for the reading loop.
    place_threads = max(1, threads - 1)

    with WorkerPool(dsn, place_threads, ignore_sql_errors=True) as pool:
        with tokenizer.name_analyzer() as analyzer:
            for fname in files:
                if not tar:
                    fd = open(fname)
                else:
                    fd = io.TextIOWrapper(tar.extractfile(fname))

                handle_threaded_sql_statements(pool, fd, analyzer)

                fd.close()

    if tar:
        tar.close()
    print('\n')

    LOG.warning("Creating indexes on Tiger data")
    with connect(dsn) as conn:
        sql = SQLPreprocessor(conn, config)
        sql.run_sql_file(conn, 'tiger_import_finish.sql')
def update_sql_functions(self, config):
    """ Reimport the SQL functions for this tokenizer.
    """
    with connect(self.dsn) as conn:
        sqlp = SQLPreprocessor(conn, config)
        sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
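# All snippets above use the same two-step pattern: construct
# SQLPreprocessor with a connection and a configuration, then call
# run_sql_file() with a file name and optional template variables.
# Below is a minimal test double capturing that implied interface; it is
# a sketch for experimentation, not the real class (which, judging by
# keyword arguments such as max_word_freq and modulepath, renders the SQL
# files as templates before executing them).
class RecordingSQLPreprocessor:
    def __init__(self, conn, config):
        self.conn = conn
        self.config = config
        self.calls = []

    def run_sql_file(self, conn, fname, **kwargs):
        # Record which file would have been rendered and executed,
        # together with its template variables.
        self.calls.append((fname, kwargs))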