import logging

from estnltk.storage.postgres import PostgresStorage

logging_level = 'info'
logging.basicConfig(level=logging_level.upper())
log = logging.getLogger(__name__)

# ===========================================
#   Create access
# ===========================================

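# `read_info_from_pgpass_file` is not defined in this snippet. A minimal sketch,
# assuming PGPASS_FILE follows the standard .pgpass format
# (hostname:port:database:username:password), could look like this:
def read_info_from_pgpass_file(pgpass_file):
    with open(pgpass_file, 'r') as f:
        host, port, dbname, user, passwd = f.readline().rstrip().split(':')
    return {'host': host, 'port': port, 'dbname': dbname,
            'user': user, 'passwd': passwd}
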
# Load the access info
access_info = read_info_from_pgpass_file(PGPASS_FILE)
storage1 = PostgresStorage(dbname=access_info['dbname'],
                           user=access_info['user'],
                           password=access_info['passwd'],
                           host=access_info['host'],
                           port=access_info['port'],
                           schema=SRC_SCHEMA,
                           role=SRC_ROLE)

# ===========================================
#   Prepare input & output corpus
# ===========================================
in_collection = storage1.get_collection(SRC_COLLECTION)
assert in_collection.exists()
log.info(' Collection {!r} exists. '.format(SRC_COLLECTION))

print('Other existing collections:')
for collection in storage1.collections:
    print('  ', collection)
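
# A minimal usage sketch (an assumption, not part of the original snippet):
# select() on an estnltk PgCollection yields (key, Text) pairs, and the storage
# connection can be closed once the work is done.
for key, text in in_collection.select():
    log.info('Document {} has layers: {}'.format(key, list(text.layers)))
storage1.close()
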
# ===========================================
#   Example #2
# ===========================================

import logging

from estnltk.storage.postgres import PostgresStorage

logging_level = 'info'
logging.basicConfig(level=logging_level.upper())
log = logging.getLogger(__name__)

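# `load_hash_index` is not defined in this snippet. A minimal sketch, assuming the
# index file stores one "<document_id>\t<hash>" pair per line (hypothetical format):
def load_hash_index(fname):
    hash_index = {}
    with open(fname, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                doc_id, doc_hash = line.split('\t')
                hash_index[int(doc_id)] = doc_hash
    return hash_index
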
print('Loading hash index for {!r}...'.format(SOURCE_COLLECTION))
src_hash_index_file = SOURCE_COLLECTION + '__hash_index.txt'
src_hash_index = load_hash_index(src_hash_index_file)
print('Done.')

# Load the access info
access_info = read_info_from_pgpass_file(PGPASS_FILE)

src_storage = PostgresStorage(dbname=access_info['dbname'],
                              user=access_info['user'],
                              password=access_info['passwd'],
                              host=access_info['host'],
                              port=access_info['port'],
                              schema=SOURCE_SCHEMA,
                              role=SOURCE_ROLE)

src_collection = src_storage.get_collection(SOURCE_COLLECTION)
assert src_collection.exists(), '(!) Collection {!r} does not exist.'.format(
    SOURCE_COLLECTION)
log.info(' Source collection {!r} exists. '.format(src_collection.name))

trg_storage = PostgresStorage(dbname=access_info['dbname'],
                              user=access_info['user'],
                              password=access_info['passwd'],
                              host=access_info['host'],
                              port=access_info['port'],
                              schema=TARGET_SCHEMA,
                              role=TARGET_ROLE)  # assumed: parallel to src_storage above
# ===========================================
#   Example #3
# ===========================================

parser.add_argument('--collection_meta',
                    dest='collection_meta',
                    action='store',
                    nargs='*',
                    help='list of collection meta data columns to include')
args = parser.parse_args()

from estnltk.storage.postgres import PostgresStorage
from estnltk.converters import TextaExporter
from estnltk import logger

logger.info('start script')

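# Note: user is left as None below; with a pgpass_file given, PostgresStorage is
# expected to read the missing connection parameters from that file.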
storage = PostgresStorage(dbname=args.dbname,
                          user=None,
                          pgpass_file=args.pgpass,
                          schema=args.schema,
                          role=args.role)

collection = storage.get_collection(args.collection)

exporter = TextaExporter(index=args.textaindex or args.schema,
                         doc_type=args.textamapping or args.collection,
                         fact_mapping=args.fact_mapping,
                         textaurl=args.textaurl,
                         textapass=args.textapass,
                         sessionpass=args.sessionpass)

try:
    with exporter.buffered_export() as buffered_export:
        # assumed continuation: export each selected document with its metadata
        for collection_id, text, meta in collection.select(collection_meta=args.collection_meta):
            buffered_export(text, meta=meta)
finally:
    storage.close()

    parser.add_argument('-f', '--file_pick', dest='file_pick', action='store', type=str, \
                        help="name of the file containing indexes of the documents that need to be processed "+\
                             "in the difference evaluation. if specified, then only documents listed in the "+\
                             "file will be processed (instead of processing the whole corpus). note: each "+\
                             "document id must be on a separate line in the index file. (default: None)" )
    args = parser.parse_args()

    logger.setLevel( args.logging.upper() )
    log = logger
    
    chunk_large_texts = not args.no_chunking
    if not chunk_large_texts:
        log.info(' Chunking of large documents disabled.' )
    
    storage = PostgresStorage(pgpass_file=args.pgpass,
                              schema=args.schema,
                              role=args.role)
    try:

        # Check layer names
        if args.morph_layer == args.new_morph_layer:
            log.error("(!) Invalid layer names: morph_layer cannot be identical to new_morph_layer: {!r}".format(args.morph_layer))
            exit(1)
        
        collection = storage.get_collection( args.collection )
        if not collection.exists():
            log.error(' (!) Collection {!r} does not exist...'.format(args.collection))
            exit(1)
        else:
            docs_in_collection = len( collection )
            log.info(' Collection {!r} exists and has {} documents. '.format( args.collection,
                                                                               docs_in_collection ))

collection_columns = [
    'id', 'data', 'source_id', 'start', 'paragraph_nr', 'sentence_nr'
]
if set(source_columns) & set(collection_columns):
    logger.error('source_columns can not include: {}'.format(
        ', '.join(set(source_columns) & set(collection_columns))))
    exit(1)

if (source_text_column is None) is (source_data is None):
    logger.error(
        'exactly one of --source_text (given: {}) or --source_data (given: {}) expected'
        .format(source_text_column, source_data))
    exit(1)

storage = PostgresStorage(dbname=args.dbname,
                          user=args.user,
                          host=args.host,
                          pgpass_file=args.pgpass,
                          schema=schema,
                          role=args.role)

condition = SQL('')
if args.chunk_column:
    condition = SQL('where {}={}').format(Identifier(args.chunk_column),
                                          Literal(args.chunk_value))

with storage.conn.cursor() as c:
    c.execute(
        SQL('SELECT count({}) FROM {}.{}').format(Identifier(source_id),
                                                  Identifier(source_schema),
                                                  Identifier(source_table)))
    total = c.fetchone()[0]
    logger.debug('total number of rows in the source table: {}'.format(total))
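    # Hypothetical follow-up: if the count should also respect the optional chunking
    # condition built above, the `condition` Composable can be formatted directly
    # into the query:
    #
    #   c.execute(SQL('SELECT count({}) FROM {}.{} {}').format(
    #       Identifier(source_id), Identifier(source_schema),
    #       Identifier(source_table), condition))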