                             pgpass_file=args.pgpass, schema=schema, role=args.role)

condition = SQL('')
if args.chunk_column:
    condition = SQL('where {}={}').format(Identifier(args.chunk_column),
                                          Literal(args.chunk_value))

with storage.conn.cursor() as c:
    c.execute(
        SQL('SELECT count({}) FROM {}.{}').format(Identifier(source_id),
                                                  Identifier(source_schema),
                                                  Identifier(source_table)))
    total = c.fetchone()[0]
    logger.debug('total number of rows in the source table: {}'.format(total))
    c.execute(
        SQL('SELECT count(DISTINCT {}) FROM {}.{}').format(
            Identifier(source_id), Identifier(source_schema),
            Identifier(source_table)))
    distinct = c.fetchone()[0]
    if total != distinct:
        logger.error(
            'values in the source table column {!r} are not unique, {} distinct values in total'
            .format(source_id, distinct))
        exit(1)
    if args.chunk_column is not None:
        c.execute(
            SQL('SELECT count({}) FROM {}.{} {}').format(
                Identifier(source_id), Identifier(source_schema),
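# For reference, a minimal self-contained sketch of the psycopg2 sql-composition
# pattern used above. The helper name and its arguments are hypothetical; only
# the SQL/Identifier/Literal composition mirrors the code in this script.
from psycopg2.sql import SQL, Identifier, Literal

def count_rows_in_chunk(conn, schema, table, id_column,
                        chunk_column=None, chunk_value=None):
    # Compose an optional WHERE clause safely: identifiers and literals are
    # quoted by psycopg2 instead of being concatenated as raw strings.
    condition = SQL('')
    if chunk_column is not None:
        condition = SQL('WHERE {} = {}').format(Identifier(chunk_column),
                                                Literal(chunk_value))
    query = SQL('SELECT count({}) FROM {}.{} {}').format(
        Identifier(id_column), Identifier(schema), Identifier(table), condition)
    with conn.cursor() as c:
        c.execute(query)
        return c.fetchone()[0]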
def process_files(rootdir, doc_iterator, collection, focus_input_files=None,
                  encoding='utf-8', create_empty_docs=False, logger=None,
                  tokenization=None, use_sentence_sep_newlines=False,
                  orig_tokenization_layer_name_prefix='',
                  splittype='no_splitting', metadata_extent='complete',
                  insert_query_size=5000000,
                  skippable_documents=None):
    """ Uses the given doc_iterator (iter_packed_xml or iter_unpacked_xml) to
        extract texts from the files in the folder rootdir.
        Optionally, adds tokenization layers to the created Text objects.
        Extracted Text objects will be stored in the given PostgreSQL collection.

        Parameters
        ----------
        rootdir: str
            The root directory which contains XML TEI files that doc_iterator
            can extract;
        doc_iterator: iter_packed_xml or iter_unpacked_xml
            Iterator function that can extract Text objects from (packed or
            unpacked) files in rootdir;
        collection: estnltk.storage.postgres.collection.PgCollection
            EstNLTK's PgCollection where extracted Texts should be stored;
        focus_input_files: set of str
            Set of input XML files that should be exclusively processed from
            rootdir. If provided, then only files from this set will be
            processed, and all other files will be skipped.
            If None, then all files returned by doc_iterator will be processed.
        encoding: str
            Encoding of the XML files. (default: 'utf-8')
        create_empty_docs: boolean
            If True, then documents are also created if there is no textual
            content, but only metadata content. (default: False)
        logger: logging.Logger
            Logger used for debugging messages;
        tokenization: ['none', 'preserve', 'estnltk']
            Specifies if tokenization will be added to Texts, and if so, how
            it will be added:
            * 'none'     -- text will be created without any tokenization layers;
            * 'preserve' -- original tokenization from XML files will be
                            preserved in layers of the text;
            * 'estnltk'  -- text's original tokenization will be overwritten
                            by estnltk's tokenization;
        orig_tokenization_layer_name_prefix: str
            Prefix that will be added to names of layers of original
            tokenization, if tokenization=='preserve'. (default: '')
        use_sentence_sep_newlines: boolean
            If set, then during the reconstruction of a text string, sentences
            from the original XML mark-up will always be separated from each
            other by a newline, regardless of the tokenization option used. As
            a result, sentence endings can also be noticed in the reconstructed
            text string. Otherwise, a single space character will be used as a
            sentence separator. (default: False)
        splittype: ['no_splitting', 'sentences', 'paragraphs']
            Specifies if and how texts should be split before inserting into
            the database:
            * 'no_splitting' -- insert full texts, do not split;
            * 'sentences'    -- split into sentences (a Text object for each
                                sentence), and insert sentences into the database;
            * 'paragraphs'   -- split into paragraphs (a Text object for each
                                paragraph), and insert paragraphs into the database;
        metadata_extent: ['minimal', 'complete']
            Specifies to which extent the created Text objects should be
            populated with metadata. (default: 'complete')
        insert_query_size: int (default: 5000000)
            Maximum insert query size used during the database insert;
        skippable_documents: set of str (default: None)
            A set of XML document names corresponding to documents that have
            already been processed and inserted into the database. All
            documents in this set will be skipped.
            An XML document name is a string in the format:
                XML_file_name + ':' + subdocument_number + ':' +
                paragraph_number + ':' + sentence_number
            Paragraph_number and sentence_number can be missing, if the
            database does not contain the corresponding fields.
            If skippable_documents is None or empty, all processed files will
            be inserted into the database.
            Note: skippable_documents is a more fine-grained set than
            focus_input_files, and thus overrides the skipping directed by
            the latter set.
    """
    global special_tokens_tagger
    global special_compound_tokens_tagger
    global special_sentence_tokenizer
    assert doc_iterator in [iter_unpacked_xml, iter_packed_xml]
    assert tokenization in [None, 'none', 'preserve', 'estnltk']
    assert splittype in ['no_splitting', 'sentences', 'paragraphs']
    assert metadata_extent in ['minimal', 'complete']
    add_tokenization = False
    preserve_tokenization = False
    paragraph_separator = '\n\n'
    if skippable_documents is None:
        skippable_documents = set()
    if tokenization:
        if tokenization == 'none':
            tokenization = None
        if tokenization == 'preserve':
            add_tokenization = True
            preserve_tokenization = True
        elif tokenization == 'estnltk':
            add_tokenization = True
            preserve_tokenization = False
    sentence_separator = ' '
    if use_sentence_sep_newlines:
        sentence_separator = '\n'
    # Choose how the loaded document will be split before the insertion
    split = to_text
    if splittype == 'no_splitting':
        split = partial(to_text, layer_prefix=orig_tokenization_layer_name_prefix)
    elif splittype == 'sentences':
        split = partial(to_sentences, layer_prefix=orig_tokenization_layer_name_prefix)
    elif splittype == 'paragraphs':
        split = partial(to_paragraphs, layer_prefix=orig_tokenization_layer_name_prefix)
    last_xml_file = ''
    doc_nr = 1
    total_insertions = 0
    xml_files_processed = 0
    with collection.insert(query_length_limit=insert_query_size) as buffered_insert:
        for doc in doc_iterator(rootdir, focus_input_files=focus_input_files,
                                encoding=encoding,
                                create_empty_docs=create_empty_docs,
                                orig_tokenization_layer_name_prefix=orig_tokenization_layer_name_prefix,
                                add_tokenization=add_tokenization,
                                preserve_tokenization=preserve_tokenization,
                                sentence_separator=sentence_separator,
                                paragraph_separator=paragraph_separator):
            # Get subcorpus name
            subcorpus = ''
            if '_xml_file' in doc.meta:
                subcorpus = get_text_subcorpus_name(None, doc.meta['_xml_file'],
                                                    doc, expand_names=False)
            # Reset the document counter if we have a new file coming up
            xml_file = doc.meta.get('_xml_file', '')
            if last_xml_file != xml_file:
                doc_nr = 1
            # Split the loaded document into smaller units if required
            for doc_fragment, para_nr, sent_nr in split(doc):
                meta = {}
                # Gather metadata
                # 1) minimal metadata:
                meta['file'] = xml_file
                doc_fragment.meta['file'] = meta['file']
                # Remove the redundant attribute '_xml_file'
                if doc_fragment.meta.get('_xml_file', '') == meta['file']:
                    del doc_fragment.meta['_xml_file']
                doc_fragment.meta['subcorpus'] = subcorpus
                meta['subcorpus'] = subcorpus
                if para_nr is not None:
                    meta['document_nr'] = doc_nr
                    doc_fragment.meta['doc_nr'] = doc_nr
                    meta['paragraph_nr'] = para_nr
                    doc_fragment.meta['para_nr'] = para_nr
                if sent_nr is not None:
                    meta['sentence_nr'] = sent_nr
                    doc_fragment.meta['sent_nr'] = sent_nr
                # 2) complete metadata:
                if metadata_extent == 'complete':
                    for key, value in doc.meta.items():
                        doc_fragment.meta[key] = value
                    # Collect remaining metadata
                    for key in ['title', 'type']:
                        meta[key] = doc_fragment.meta.get(key, '')
                # Create an identifier of the insertable chunk:
                #    XML file + subdocument nr + paragraph nr + sentence nr
                file_chunk_lst = [meta['file']]
                file_chunk_lst.append(':')
                file_chunk_lst.append(str(doc_nr))
                if 'paragraph_nr' in meta:
                    file_chunk_lst.append(':')
                    file_chunk_lst.append(str(meta['paragraph_nr']))
                if 'sentence_nr' in meta:
                    file_chunk_lst.append(':')
                    file_chunk_lst.append(str(meta['sentence_nr']))
                file_chunk_str = ''.join(file_chunk_lst)
                # Finally, insert the document (if it is not skippable)
                if file_chunk_str not in skippable_documents:
                    row_id = buffered_insert(text=doc_fragment, meta_data=meta)
                    total_insertions += 1
                if logger:
                    # Debugging stuff:
                    # listing of annotation layers added to the Text
                    with_layers = list(doc_fragment.layers)
                    if with_layers:
                        with_layers = ' with layers ' + str(with_layers)
                    else:
                        with_layers = ''
                    if file_chunk_str not in skippable_documents:
                        logger.debug(' {} inserted as Text{}.'.format(
                            file_chunk_str, with_layers))
                    else:
                        logger.debug(' {} skipped (already in the database).'.format(
                            file_chunk_str))
                    #logger.debug(' Metadata: {}'.format(doc_fragment.meta))
                doc_nr += 1
            if last_xml_file != xml_file:
                xml_files_processed += 1
            last_xml_file = xml_file
            #print('.', end='')
            #sys.stdout.flush()
    if logger:
        logger.info('Total {} XML files processed.'.format(xml_files_processed))
        logger.info('Total {} estnltk texts inserted into the database.'.format(
            total_insertions))
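# A rough usage sketch (not taken from the original workflow script):
# `collection` is assumed to be an already created PgCollection, and
# `iter_packed_xml` the iterator imported by this module; the directory
# path below is a placeholder.
import logging

logger = logging.getLogger('koondkorpus_import')
logging.basicConfig(level=logging.DEBUG)

process_files('corpora/koondkorpus_zipped/', iter_packed_xml, collection,
              tokenization='preserve',
              splittype='sentences',
              metadata_extent='complete',
              insert_query_size=5000000,
              logger=logger)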
def fetch_skippable_documents(storage, schema, collection, meta_fields, logger):
    """ Fetches names of existing / skippable documents from the PostgreSQL storage.
        Returns a set of existing document names.
        A document name is represented as a string in the format:
            XML_file_name + ':' + subdocument_number + ':' +
            paragraph_number + ':' + sentence_number
        Paragraph_number and sentence_number are skipped if they are not in
        meta_fields.

        Parameters
        ----------
        storage: PostgresStorage
            PostgresStorage to be queried for column names of the collection;
        schema: str
            Name of the schema;
        collection: str
            Name of the collection / db table;
        meta_fields: OrderedDict
            Current fields of the collection / database table.
        logger: logging.Logger
            Logger for debug and error messages.

        Returns
        -------
        set of str
            Set of document names corresponding to documents already existing
            in the collection;
    """
    # Filter fields: keep only fields that correspond to the fields of
    # the current table
    query_fields = ['file', 'id', 'document_nr', 'paragraph_nr', 'sentence_nr']
    query_fields = [
        f for f in query_fields if f == 'id' or f in meta_fields.keys()
    ]
    prev_fname = None
    fname_doc_nr = 1
    file_chunks_in_db = set()
    # Construct the query
    sql_str = 'SELECT ' + (','.join(query_fields)) + \
              ' FROM {}.{} ORDER BY ' + (','.join(query_fields))
    with storage.conn as conn:
        # Named cursors: http://initd.org/psycopg/docs/usage.html#server-side-cursors
        with conn.cursor('read_fname_chunks', withhold=True) as read_cursor:
            try:
                read_cursor.execute(
                    SQL(sql_str).format(Identifier(schema),
                                        Identifier(collection)))
            except Exception as e:
                logger.error(e)
                raise
            finally:
                logger.debug(read_cursor.query.decode())
            for items in read_cursor:
                fname = items[0]
                doc_id = items[1]
                if prev_fname and prev_fname != fname:
                    # Reset document number (in case of a new file)
                    fname_doc_nr = 1
                doc_nr = items[2] if 'document_nr' in query_fields else fname_doc_nr
                paragraph_nr = items[3] if 'paragraph_nr' in query_fields else None
                sentence_nr = items[4] if 'sentence_nr' in query_fields else None
                # Reconstruct the file name chunk
                file_chunk_lst = [fname]
                file_chunk_lst.append(':')
                file_chunk_lst.append(str(doc_nr))
                if paragraph_nr:
                    file_chunk_lst.append(':')
                    file_chunk_lst.append(str(paragraph_nr))
                if sentence_nr:
                    file_chunk_lst.append(':')
                    file_chunk_lst.append(str(sentence_nr))
                file_chunk_str = ''.join(file_chunk_lst)
                # Sanity check: file_chunk_str should be unique;
                # if not, then we cannot expect skipping to be consistent ...
                assert file_chunk_str not in file_chunks_in_db, \
                    ' (!) Document chunk {!r} appears more than once in the database.'.format(file_chunk_str)
                file_chunks_in_db.add(file_chunk_str)
                prev_fname = fname
                fname_doc_nr += 1
    return file_chunks_in_db
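# A sketch of how fetch_skippable_documents() and process_files() can be
# combined to make the import resumable. The variables `storage`, `schema`,
# `collection_name`, `meta_fields`, `collection` and `logger` are assumed to
# have been set up by the surrounding script; none of them are defined here.
skip_set = fetch_skippable_documents(storage, schema, collection_name,
                                     meta_fields, logger)
logger.info('{} document chunks already in the database.'.format(len(skip_set)))
process_files('corpora/koondkorpus_zipped/', iter_packed_xml, collection,
              tokenization='preserve', splittype='sentences',
              skippable_documents=skip_set, logger=logger)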
def process_files(in_file, collection, focus_doc_ids=None,
                  encoding='utf-8', discard_empty_paragraphs=True, logger=None,
                  tokenization=None, insert_query_size=5000000,
                  skippable_documents=None, doc_id_to_texttype=None):
    """ Reads the etTenTen 2013 corpus from in_file, extracts documents,
        reconstructs the corresponding Text objects, and stores the results
        in the given database collection.
        Optionally, adds tokenization layers to the created Text objects.

        Parameters
        ----------
        in_file: str
            Full name of the etTenTen corpus file (name with path);
        collection: estnltk.storage.postgres.collection.PgCollection
            EstNLTK's PgCollection where extracted Texts should be stored;
        focus_doc_ids: set of str
            Set of document id-s corresponding to the documents which need to
            be extracted from in_file. If provided, then only documents with
            the given id-s will be processed, and all other documents will be
            skipped. If None or empty, then all documents in the file will be
            processed;
        encoding: str
            Encoding of in_file. Defaults to 'utf-8';
        discard_empty_paragraphs: boolean
            If set, then empty paragraphs will be discarded. (default: True)
        logger: logging.Logger
            Logger used for debugging etc. messages;
        tokenization: ['none', 'preserve', 'estnltk']
            Specifies if tokenization will be added to Texts, and if so, how
            it will be added:
            * 'none'     -- text will be created without any tokenization layers;
            * 'preserve' -- original tokenization from XML files will be
                            preserved in layers of the text. Note that etTenTen
                            only has original tokenization for paragraphs, and
                            thus Texts will only have the original_paragraphs
                            layer, nothing more.
            * 'estnltk'  -- text's original tokenization will be overwritten
                            by estnltk's tokenization;
        insert_query_size: int (default: 5000000)
            Maximum insert query size used during the database insert;
        skippable_documents: set of str (default: None)
            A set of web document ids corresponding to the documents that have
            already been processed and inserted into the database. All
            documents in this set will be skipped.
            A web document id is a string in the format:
                original_doc_id + ':' + subdocument_number + ':' +
                paragraph_number + ':' + sentence_number
            Subdocument_number, paragraph_number and sentence_number are
            skipped, if the database does not contain the corresponding fields.
            If skippable_documents is None or empty, all processed files will
            be inserted into the database.
            Note: skippable_documents is a more fine-grained set than
            focus_doc_ids, and thus overrides the skipping directed by the
            latter set.
        doc_id_to_texttype: dict (default: None)
            A mapping from document ids (strings) to their texttypes.
            Should cover all documents listed in focus_doc_ids, or, if
            focus_doc_ids==None, all documents in in_file;
    """
    assert tokenization in [None, 'none', 'preserve', 'estnltk']
    add_tokenization = False
    preserve_tokenization = False
    if skippable_documents is None:
        skippable_documents = set()
    if tokenization:
        if tokenization == 'none':
            add_tokenization = False
            preserve_tokenization = False
        if tokenization == 'preserve':
            add_tokenization = False
            preserve_tokenization = True
        elif tokenization == 'estnltk':
            add_tokenization = True
            preserve_tokenization = False
    doc_nr = 1
    last_original_doc_id = None
    total_insertions = 0
    docs_processed = 0
    with collection.insert(query_length_limit=insert_query_size) as buffered_insert:
        for web_doc in parse_ettenten_corpus_file_iterator(
                in_file, encoding=encoding,
                focus_doc_ids=focus_doc_ids,
                discard_empty_paragraphs=discard_empty_paragraphs,
                add_tokenization=add_tokenization,
                store_paragraph_attributes=True,
                paragraph_separator='\n\n'):
            # Rename id to original_doc_id (to avoid confusion with DB id-s)
            original_doc_id = web_doc.meta.get('id')
            web_doc.meta['original_doc_id'] = original_doc_id
            del web_doc.meta['id']
            # Reset the subdocument counter (if required)
            if last_original_doc_id != original_doc_id:
                doc_nr = 1
            # Delete the original_paragraphs layer (if tokenization == None)
            if not add_tokenization and not preserve_tokenization:
                # Remove the layer from the text
                delattr(web_doc, 'original_paragraphs')
            # Add texttype (if the mapping is available)
            if doc_id_to_texttype and original_doc_id in doc_id_to_texttype:
                web_doc.meta['texttype'] = doc_id_to_texttype[original_doc_id]
            # Gather metadata
            meta = {}
            for key, value in web_doc.meta.items():
                meta[key] = value
            # Create an identifier of the insertable chunk:
            #    original_doc_id + ':' + subdocument_number
            #    (+ ':' + paragraph_number + ':' + sentence_number)
            file_chunk_lst = [web_doc.meta['original_doc_id']]
            file_chunk_lst.append(':')
            file_chunk_lst.append(str(doc_nr))
            file_chunk_str = ''.join(file_chunk_lst)
            # Finally, insert the document (if it is not skippable)
            if file_chunk_str not in skippable_documents:
                row_id = buffered_insert(text=web_doc, meta_data=meta)
                total_insertions += 1
            if logger:
                # Debugging stuff:
                # listing of annotation layers added to the Text
                with_layers = list(web_doc.layers)
                if with_layers:
                    with_layers = ' with layers ' + str(with_layers)
                else:
                    with_layers = ''
                if file_chunk_str not in skippable_documents:
                    logger.debug(' {}:{} inserted as Text{}.'.format(
                        meta['web_domain'], file_chunk_str, with_layers))
                else:
                    logger.debug(' {}:{} skipped (already in the database).'.format(
                        meta['web_domain'], file_chunk_str))
            doc_nr += 1
            last_original_doc_id = original_doc_id
            docs_processed += 1
            #print('.', end='')
            #sys.stdout.flush()
    if logger:
        logger.info('Total {} input documents processed.'.format(docs_processed))
        logger.info('Total {} estnltk texts inserted into the database.'.format(
            total_insertions))
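# A rough usage sketch (not taken from the original workflow script):
# `collection` is assumed to be an already created PgCollection; the corpus
# path and the texttype mapping below are placeholders.
import logging

doc_id_to_texttype = {'100001': 'blog', '100002': 'forum'}

process_files('corpora/etTenTen.vert', collection,
              tokenization='preserve',
              discard_empty_paragraphs=True,
              doc_id_to_texttype=doc_id_to_texttype,
              logger=logging.getLogger('ettenten_import'))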