def tag_collection(tagger, args):
    logger.setLevel(args.logging)
    logger.info('start script')
    # Connect to the database and fetch the target collection
    storage = PostgresStorage(dbname=args.dbname,
                              user=args.user,
                              host=args.host,
                              port=args.port,
                              pgpass_file=args.pgpass,
                              schema=args.schema,
                              role=args.role)
    collection = storage.get_collection(args.collection)
    overwrite = (args.mode == 'overwrite')
    try:
        # Tag the new layer; replace an existing layer only in 'overwrite' mode
        collection.create_layer(tagger=tagger,
                                overwrite=overwrite,
                                progressbar=args.progressbar)
    except Exception as e:
        logger.error(e)
        exit(1)
    finally:
        storage.close()
    logger.info('end script')
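# Usage sketch (illustrative, not part of the original script): tag_collection()
# only reads attributes off `args`, so any object exposing dbname, user, host,
# port, pgpass, schema, role, collection, mode, progressbar and logging will do.
# The connection values and the collection/layer names below are assumptions;
# argparse.Namespace and EstNLTK's VabamorfTagger are real APIs.
#
#   from argparse import Namespace
#   from estnltk.taggers import VabamorfTagger
#
#   example_args = Namespace(dbname='estnltk_db', user='reader', host='localhost',
#                            port=5432, pgpass=None, schema='public', role=None,
#                            collection='my_collection', mode='overwrite',
#                            progressbar='ascii', logging='INFO')
#   tag_collection(VabamorfTagger(output_layer='morph_analysis'), example_args)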
"and then all documents will be processed as whole. "+ "(default: False)", \ ) parser.add_argument('-r', '--rand_pick', dest='rand_pick', action='store', type=int, \ help="integer value specifying the amount of documents to be randomly chosen for "+\ "difference evaluation. if specified, then the given amount of documents will be "+\ "processed (instead of processing the whole corpus). if the amount exceeds the "+\ "corpus size, then the whole corpus is processed. (default: None)" ) parser.add_argument('-f', '--file_pick', dest='file_pick', action='store', type=str, \ help="name of the file containing indexes of the documents that need to be processed "+\ "in the difference evaluation. if specified, then only documents listed in the "+\ "file will be processed (instead of processing the whole corpus). note: each "+\ "document id must be on a separate line in the index file. (default: None)" ) args = parser.parse_args() logger.setLevel( (args.logging).upper() ) log = logger chunk_large_texts = not args.no_chunking if not chunk_large_texts: log.info(' Chunking of large documents disabled.' ) storage = PostgresStorage(pgpass_file=args.pgpass, schema=args.schema, role=args.role) try: # Check layer names if args.morph_layer == args.new_morph_layer: log.error("(!) Invalid layer names: morph_layer cannot be identical to new_morph_layer: {!r}".format(args.morph_layer)) exit(1)
                    nargs='?', help='only rows with this value in the chunk column are selected')
args = parser.parse_args()

from collections import OrderedDict
from psycopg2.sql import SQL, Identifier, Literal
import tqdm

from estnltk import Text
from estnltk import logger
from estnltk.converters import dict_to_text
from estnltk.storage.postgres import PostgresStorage
from estnltk.layer_operations import split_by
from estnltk.storage.postgres import table_exists

logger.setLevel(args.logging)
logger.info('start script')

schema = args.schema
source_schema = args.source_schema
source_table = args.source_table
source_id = args.source_id
source_text_column = args.source_text
source_columns = [c.strip() for c in args.source_columns or []]
source_data = args.source_data

# chunk_column and chunk_value must be given together: both set or both None
assert (args.chunk_column is None) is (args.chunk_value is None), (args.chunk_column,
                                                                   args.chunk_value)

collection_columns = [