Example #1
import zipfile
import pandas as pd
import text_analytic_tools.utility as utility

logger = utility.getLogger("text_analytic_tools")


def store_tokenized_corpus_as_archive(tokenized_docs, target_filename):
    """Stores a tokenized (string) corpus to a zip archive

    Parameters
    ----------
    tokenized_docs : [type]
        [description]
    corpus_source_filepath : [type]
        [description]

    Returns
    -------
    [type]
        [description]
    """

    file_stats = []
    process_count = 0

    # TODO: Enable store of all documents line-by-line in a single file
    with zipfile.ZipFile(target_filename, "w") as zf:

        for document_id, document_name, chunk_index, tokens in tokenized_docs:

            # Store each chunk as a separate plain text entry in the archive
            store_name = '{}_{}.txt'.format(document_name, chunk_index)
            zf.writestr(store_name, ' '.join(tokens))

            file_stats.append((document_id, document_name, chunk_index, len(tokens)))
            process_count += 1

            if process_count % 100 == 0:
                logger.info('%s documents stored...', process_count)

    return file_stats
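
A minimal usage sketch, assuming the completion above that returns per-chunk statistics; the tokenized chunks and the archive name are made up for illustration:

tokenized_docs = [
    (1, 'document_a', 0, ['first', 'chunk', 'of', 'document', 'a']),
    (1, 'document_a', 1, ['second', 'chunk', 'of', 'document', 'a']),
    (2, 'document_b', 0, ['a', 'single', 'chunk', 'document']),
]

stats = store_tokenized_corpus_as_archive(tokenized_docs, 'tokenized_corpus.zip')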
Example #2
# -*- coding: utf-8 -*-
import re
import zipfile

import ftfy
import textacy

import text_analytic_tools.utility as utility

from . import utils

logger = utility.getLogger('corpus_text_analysis')

HYPHEN_REGEXP = re.compile(r'\b(\w+)-\s*\r?\n\s*(\w+)\b', re.UNICODE)


def preprocess_text(source_filename, target_filename, tick=utility.noop):
    '''
    Pre-processes a zipped archive that contains text documents.

    Returns
    -------
    str
        Path of the resulting zip archive (`target_filename`).
    '''

    filenames = utility.zip_get_filenames(source_filename)
    texts = ((filename, utility.zip_get_text(source_filename, filename))
             for filename in filenames)
    logger.info('Preparing text corpus...')
    tick(0, len(filenames))
    with zipfile.ZipFile(target_filename, 'w', zipfile.ZIP_DEFLATED) as zf:
        for filename, text in texts:
            # Repair broken unicode and join words hyphenated across line breaks
            text = ftfy.fix_text(text)
            text = HYPHEN_REGEXP.sub(r'\1\2', text)
            zf.writestr(filename, text)
            tick()

    logger.info('Text corpus prepared!')
    return target_filename
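
A minimal usage sketch; both archive paths below are placeholders:

preprocess_text('corpus_raw.txt.zip', 'corpus_prepared.txt.zip')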
Example #3

import text_analytic_tools.domain.common_logic as common_logic

import text_analytic_tools
import text_analytic_tools.utility as utility
import text_analytic_tools.common.textacy_utility as textacy_utility
import text_analytic_tools.common as common

logger = utility.getLogger('tCoIR')

current_domain = text_analytic_tools.CURRENT_DOMAIN
container = None

source_path = '/home/roger/source/text_analytic_tools/data/tCoIR/tCoIR_en_45-72.txt.zip'

if container is None:
    container = textacy_utility.load_or_create(source_path=source_path,
                                               language='en',
                                               document_index=None,
                                               merge_entities=False,
                                               overwrite=False,
                                               use_compression=True,
                                               disabled_pipes=("ner", "parser", "textcat"))

corpus = container.textacy_corpus
min_freq_stats = {
    k: textacy_utility.generate_word_count_score(corpus, k, 10)
    for k in ['lemma', 'lower', 'orth']
}
max_doc_freq_stats = {