# Script prelude: configure logging, open a Postgres-backed EstNLTK storage
# and verify that the source collection exists, listing all other collections.
# External names (imports / constants defined outside this chunk):
#   logging, read_info_from_pgpass_file, PostgresStorage,
#   PGPASS_FILE, SRC_SCHEMA, SRC_ROLE, SRC_COLLECTION
logging_level = 'info'
logging.basicConfig(level=(logging_level).upper())
log = logging.getLogger(__name__)

# ===========================================
#   Create access
# ===========================================
# Load the access info (DB credentials are read from a pgpass-format file)
access_info = read_info_from_pgpass_file(PGPASS_FILE)
storage1 = PostgresStorage(dbname=access_info['dbname'],
                           user=access_info['user'],
                           password=access_info['passwd'],
                           host=access_info['host'],
                           port=access_info['port'],
                           schema=SRC_SCHEMA,
                           role=SRC_ROLE)

# ===========================================
#   Prepare input & output corpus
# ===========================================
in_collection = storage1.get_collection(SRC_COLLECTION)
# NOTE(review): `assert` is stripped under `python -O`; sibling scripts use an
# explicit exists()-check with exit(1) instead — consider aligning.
assert in_collection.exists()
log.info(' Collection {!r} exists. '.format(SRC_COLLECTION))

# Show what else is available in this schema (diagnostic output only)
print('Other existing collections:')
for collection in storage1.collections:
    print(' ', collection)
# Script prelude: configure logging, load a precomputed hash index for the
# source collection, then open source and target Postgres storages.
# External names: logging, load_hash_index, read_info_from_pgpass_file,
#   PostgresStorage, PGPASS_FILE, SOURCE_COLLECTION, SOURCE_SCHEMA,
#   SOURCE_ROLE, TARGET_SCHEMA
logging_level = 'info'
logging.basicConfig(level=(logging_level).upper())
log = logging.getLogger(__name__)

# The hash index file name is derived from the collection name by convention
print('Loading hash index for {!r}...'.format(SOURCE_COLLECTION))
src_hash_index_file = SOURCE_COLLECTION + '__hash_index.txt'
src_hash_index = load_hash_index(src_hash_index_file)
print('Done.')

# Load the access info (DB credentials are read from a pgpass-format file)
access_info = read_info_from_pgpass_file(PGPASS_FILE)
src_storage = PostgresStorage(dbname=access_info['dbname'],
                              user=access_info['user'],
                              password=access_info['passwd'],
                              host=access_info['host'],
                              port=access_info['port'],
                              schema=SOURCE_SCHEMA,
                              role=SOURCE_ROLE)
src_collection = src_storage.get_collection(SOURCE_COLLECTION)
# NOTE(review): `assert` is stripped under `python -O`; an explicit check
# with exit(1) would be more robust for a command-line script.
assert src_collection.exists(), '(!) Collection {!r} does not exist.'.format(
    SOURCE_COLLECTION)
log.info(' Source collection {!r} exists. '.format(src_collection.name))

# Target storage shares the same credentials but writes into TARGET_SCHEMA
trg_storage = PostgresStorage(dbname=access_info['dbname'],
                              user=access_info['user'],
                              password=access_info['passwd'],
                              host=access_info['host'],
                              port=access_info['port'],
                              schema=TARGET_SCHEMA,
                              # (constructor call continues beyond this chunk)
# Tail of the argparse setup plus the TEXTA export driver: connect to the
# collection and stream documents into a TEXTA instance via TextaExporter.
# `parser` is created earlier in the file (outside this chunk).
parser.add_argument('--collection_meta', dest='collection_meta', action='store',
                    nargs='*',
                    help='list of collection meta data columns to include')
args = parser.parse_args()

# NOTE(review): imports placed after argument parsing — presumably deliberate,
# so `--help` stays fast by deferring the heavy estnltk import; confirm.
from estnltk.storage.postgres import PostgresStorage
from estnltk.converters import TextaExporter
from estnltk import logger

logger.info('start script')

# user=None: the user is resolved from the pgpass file by PostgresStorage
storage = PostgresStorage(dbname=args.dbname,
                          user=None,
                          pgpass_file=args.pgpass,
                          schema=args.schema,
                          role=args.role)
collection = storage.get_collection(args.collection)

# TEXTA index/mapping names fall back to the Postgres schema/collection names
exporter = TextaExporter(index=args.textaindex or args.schema,
                         doc_type=args.textamapping or args.collection,
                         fact_mapping=args.fact_mapping,
                         textaurl=args.textaurl,
                         textapass=args.textapass,
                         sessionpass=args.sessionpass)
try:
    # buffered_export batches documents before posting them to TEXTA
    with exporter.buffered_export() as buffered_export:
        for collection_id, text, meta in collection.select(
            # (select() call and loop body continue beyond this chunk)
# Tail of the argparse setup plus the opening of the morph-layer difference
# evaluation: validate layer names, open the collection and report its size.
# External names: parser, logger, PostgresStorage (defined outside this chunk).
parser.add_argument('-f', '--file_pick', dest='file_pick', action='store', type=str, \
                    help="name of the file containing indexes of the documents that need to be processed "+\
                         "in the difference evaluation. if specified, then only documents listed in the "+\
                         "file will be processed (instead of processing the whole corpus). note: each "+\
                         "document id must be on a separate line in the index file. (default: None)" )
args = parser.parse_args()

# Log level comes from the command line (e.g. 'info' -> 'INFO')
logger.setLevel( (args.logging).upper() )
log = logger

# Chunking of large documents is on by default; --no_chunking disables it
chunk_large_texts = not args.no_chunking
if not chunk_large_texts:
    log.info(' Chunking of large documents disabled.' )

storage = PostgresStorage(pgpass_file=args.pgpass,
                          schema=args.schema,
                          role=args.role)
try:
    # Check layer names: comparing a layer against itself makes no sense
    if args.morph_layer == args.new_morph_layer:
        log.error("(!) Invalid layer names: morph_layer cannot be identical to new_morph_layer: {!r}".format(args.morph_layer))
        exit(1)
    collection = storage.get_collection( args.collection )
    if not collection.exists():
        log.error(' (!) Collection {!r} does not exist...'.format(args.collection))
        exit(1)
    else:
        docs_in_collection = len( collection )
        log.info(' Collection {!r} exists and has {} documents. '.format( args.collection,
            # (format call, else-branch and try-block continue beyond this chunk)
    # Tail of the reserved-column list (`collection_columns = [` opens before
    # this chunk): these names are generated by the collection itself, so
    # user-supplied source_columns must not collide with them.
    'id', 'data', 'source_id', 'start', 'paragraph_nr', 'sentence_nr' ]

# Refuse source columns that clash with the collection's own columns
if set(source_columns) & set(collection_columns):
    logger.error('source_columns can not include: {}'.format(
        ', '.join(set(source_columns) & set(collection_columns))))
    exit(1)

# Exactly one of --source_text / --source_data must be given.
# NOTE(review): `is` on the two booleans works here (True/False are
# singletons) but `==` would state the intent more plainly.
if (source_text_column is None) is (source_data is None):
    logger.error(
        'exactly one of --source_text (given: {}) or --source_data (given: {}) expected'
        .format(source_text_column, source_data))
    exit(1)

storage = PostgresStorage(dbname=args.dbname,
                          user=args.user,
                          host=args.host,
                          pgpass_file=args.pgpass,
                          schema=schema,
                          role=args.role)

# Optional WHERE clause restricting the copy to one chunk of the source table;
# built with psycopg2 sql composition, so identifiers/values are escaped safely
condition = SQL('')
if args.chunk_column:
    condition = SQL('where {}={}').format(Identifier(args.chunk_column),
                                          Literal(args.chunk_value))

# Count the source rows up front so progress can be reported against a total
with storage.conn.cursor() as c:
    c.execute(
        SQL('SELECT count({}) FROM {}.{}').format(Identifier(source_id),
                                                  Identifier(source_schema),
                                                  Identifier(source_table)))
    total = c.fetchone()[0]
logger.debug('total number of rows in the source table: {}'.format(total))