Example #1
def nest_corpus(path_in: pathlib.Path, path_out: pathlib.Path,
                control: pathlib.Path) -> None:
    u.assert_folder_is_readable(path_in)
    u.assert_folder_is_writable(path_out)
    i = 1
    widgets = [
        'Re-nesting Corpus # ',
        pb.Counter(), ' ',
        pb.Timer(), ' ',
        pb.BouncingBar(marker='.', left='[', right=']')
    ]
    with pb.ProgressBar(widgets=widgets) as bar:
        with control.open('r', encoding='utf-8', newline='') as control_file:
            reader = csv.DictReader(control_file,
                                    delimiter=',',
                                    quotechar='"',
                                    quoting=csv.QUOTE_ALL)
            for row in reader:
                bar.update(i)
                i = i + 1
                source_path = path_in.joinpath(row['filename'])
                dest_path = path_out.joinpath(row['relative path'])
                dest_path.parent.mkdir(parents=True, exist_ok=True)
                shutil.copy(source_path, dest_path)
Example #2
def unnest_corpus(path_in: pathlib.Path, path_out: pathlib.Path,
                  control: pathlib.Path) -> None:
    u.assert_folder_is_readable(path_in)
    u.assert_folder_is_writable(path_out)
    i = 1
    widgets = [
        'Unnesting Corpus # ',
        pb.Counter(), ' ',
        pb.Timer(), ' ',
        pb.BouncingBar(marker='.', left='[', right=']')
    ]
    with pb.ProgressBar(widgets=widgets) as bar:
        with control.open('w', encoding='utf-8', newline='') as control_file:
            writer = csv.writer(control_file,
                                delimiter=',',
                                quotechar='"',
                                quoting=csv.QUOTE_ALL)
            writer.writerow(['filename', 'relative path'])
            for file_name in path_in.rglob("*"):
                if u.is_corpus_document(file_name):
                    bar.update(i)
                    i = i + 1
                    # Copy under a collision-proof UUID name; the control file
                    # records the original relative path for nest_corpus.
                    dest_path = path_out.joinpath(
                        f'{uuid.uuid4()}{file_name.suffix}')
                    shutil.copy(file_name, dest_path)
                    relative_path = file_name.relative_to(path_in)
                    writer.writerow([dest_path.name, str(relative_path)])
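
Taken together, the two functions above form a round trip: unnest_corpus flattens a nested corpus into a single folder of UUID-named files and writes the control CSV, and nest_corpus uses that CSV to restore the original layout. A minimal usage sketch; the paths below are placeholders, not paths from the original project:

import pathlib

nested = pathlib.Path('./corpus/raw')            # placeholder input
flat = pathlib.Path('./corpus/flat')             # placeholder flattened copy
control = pathlib.Path('./corpus/control.csv')   # placeholder control file

# Flatten, recording filename -> relative path in the control file ...
unnest_corpus(nested, flat, control)
# ... then rebuild the original nested layout elsewhere.
nest_corpus(flat, pathlib.Path('./corpus/restored'), control)
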
Example #3
def documents_to_corpus(path_in: pathlib.Path, path_out: pathlib.Path) -> None:
    u.assert_folder_is_readable(path_in)
    u.assert_folder_is_writable(path_out)
    i = 1
    widgets = [
        'Formatting Document # ',
        pb.Counter(), ' ',
        pb.Timer(), ' ',
        pb.BouncingBar(marker='.', left='[', right=']')
    ]
    with pb.ProgressBar(widgets=widgets) as bar:
        for file_name in path_in.iterdir():
            if u.is_corpus_document(file_name):
                bar.update(i)
                i = i + 1
                sentences = __tokenize_document(file_name)
                u.write_document(path_out, file_name, sentences)
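
The helper __tokenize_document is not shown in the source. A minimal sketch of one plausible implementation, assuming NLTK's punkt sentence tokenizer and that u.write_document expects a list of sentence strings:

import pathlib

import nltk

def __tokenize_document(file_name: pathlib.Path) -> list:
    # Assumes nltk.download('punkt') has been run once beforehand.
    text = file_name.read_text(encoding='utf-8')
    return nltk.tokenize.sent_tokenize(text)
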
Example #4
def corpus_to_reading_level(path_in: pathlib.Path, path_out: pathlib.Path) -> None:
    u.assert_folder_is_readable(path_in)
    u.assert_folder_is_writable(path_out)
    start = time.time()
    __clean_grade_level_files(path_out)
    doc_cnt = 0
    for file_name in path_in.iterdir():
        if u.is_corpus_document(file_name):
            print(file_name.stem)
            sentences = __document_to_sentences(file_name)
            __save_new_documents(path_out, sentences)
            doc_cnt = doc_cnt + 1
    seconds = time.time() - start
    if doc_cnt > 0:
        print(f'Processed (seconds/document): {int(seconds):,}/{doc_cnt:,} = {int(seconds/doc_cnt):,} spd')
    else:
        print(f'No text documents found in {path_in}')
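
Neither __clean_grade_level_files nor __save_new_documents is shown in the source. A sketch of the former, assuming the pipeline appends sentences to one accumulating file per reading level (the grade_*.txt naming is hypothetical), so stale output from a previous run must be deleted first:

import pathlib

def __clean_grade_level_files(path_out: pathlib.Path) -> None:
    # Hypothetical naming scheme: one accumulating file per grade level.
    for old_file in path_out.glob('grade_*.txt'):
        old_file.unlink()
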
Example #5
def normalize_corpus_by_truncation(path_in: pathlib.Path,
                                   path_out: pathlib.Path,
                                   max_length: int) -> None:
    u.assert_folder_is_readable(path_in)
    u.assert_folder_is_writable(path_out)
    i = 1
    widgets = [
        'Normalize Document # ',
        pb.Counter(), ' ',
        pb.Timer(), ' ',
        pb.BouncingBar(marker='.', left='[', right=']')
    ]
    with pb.ProgressBar(widgets=widgets) as bar:
        for file_name in path_in.iterdir():
            if u.is_corpus_document(file_name):
                bar.update(i)
                i = i + 1
                sentences = __normalize_document_by_truncation(
                    file_name, max_length)
                u.write_document(path_out, file_name, sentences)
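
The helper __normalize_document_by_truncation is also not shown. A sketch under two assumptions that go beyond the source: documents are stored one sentence per line, and "length" counts whitespace-separated tokens:

import pathlib

def __normalize_document_by_truncation(file_name: pathlib.Path,
                                       max_length: int) -> list:
    sentences = []
    for line in file_name.read_text(encoding='utf-8').splitlines():
        tokens = line.split()
        # Keep at most max_length tokens from each sentence.
        sentences.append(' '.join(tokens[:max_length]))
    return sentences
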
Example #6
def remove_stopwords_from_corpus(path_in: pathlib.Path,
                                 path_out: pathlib.Path) -> None:
    u.assert_folder_is_readable(path_in)
    u.assert_folder_is_writable(path_out)
    i = 1
    # A set gives O(1) membership tests when filtering tokens.
    stopwords = set(nltk.corpus.stopwords.words('english'))
    widgets = [
        'Pre-Processing Document # ',
        pb.Counter(), ' ',
        pb.Timer(), ' ',
        pb.BouncingBar(marker='.', left='[', right=']')
    ]
    with pb.ProgressBar(widgets=widgets) as bar:
        for file_name in path_in.iterdir():
            if u.is_corpus_document(file_name):
                bar.update(i)
                i = i + 1
                sentences = __remove_stopwords_from_document(
                    file_name, stopwords)
                u.write_document(path_out, file_name, sentences)
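
The helper __remove_stopwords_from_document is likewise not shown. A minimal sketch under the same one-sentence-per-line assumption; tokens are lowercased only for the membership test so the surviving text keeps its original case:

import pathlib

def __remove_stopwords_from_document(file_name: pathlib.Path,
                                     stopwords: set) -> list:
    sentences = []
    for line in file_name.read_text(encoding='utf-8').splitlines():
        kept = [t for t in line.split() if t.lower() not in stopwords]
        sentences.append(' '.join(kept))
    return sentences
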
Example #7
def vectorize_corpus(path_in: pathlib.Path, path_out: pathlib.Path,
                     path_control: pathlib.Path) -> None:
    u.assert_folder_is_readable(path_in)
    u.assert_folder_is_writable(path_out)
    i = 1
    widgets = [
        'Collecting Tokens # ',
        pb.Counter(), ' ',
        pb.Timer(), ' ',
        pb.BouncingBar(marker='.', left='[', right=']')
    ]
    tokens = dict()
    with pb.ProgressBar(widgets=widgets) as bar:
        for file_name in path_in.iterdir():
            if u.is_corpus_document(file_name):
                bar.update(i)
                i = i + 1
                __append_tokens(tokens, file_name)
    token_map = __map_tokens(tokens)
    __save_token_file(tokens, token_map, path_control)
    i = 1
    widgets = [
        'Vectorizing Document # ',
        pb.Counter(), ' ',
        pb.Timer(), ' ',
        pb.BouncingBar(marker='.', left='[', right=']')
    ]
    with pb.ProgressBar(widgets=widgets) as bar:
        for file_name in path_in.iterdir():
            if u.is_corpus_document(file_name):
                bar.update(i)
                i = i + 1
                vector = __vectorise_document(file_name, token_map)
                u.write_document(path_out, file_name, vector)
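
None of the private helpers in this example are shown. One plausible contract, assuming `tokens` maps token -> corpus frequency: __map_tokens assigns dense integer ids (most frequent token first), and __vectorise_document builds a bag-of-words count vector over that fixed vocabulary. Both bodies are illustrative guesses:

import pathlib

def __map_tokens(tokens: dict) -> dict:
    # Rank tokens by corpus frequency so id 0 is the most common token.
    ranked = sorted(tokens, key=tokens.get, reverse=True)
    return {token: idx for idx, token in enumerate(ranked)}

def __vectorise_document(file_name: pathlib.Path, token_map: dict) -> list:
    # One count per vocabulary entry; out-of-vocabulary tokens are dropped.
    vector = [0] * len(token_map)
    for token in file_name.read_text(encoding='utf-8').split():
        if token in token_map:
            vector[token_map[token]] += 1
    return vector
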
Example #8
def tokenize_corpus(corpus_in: pathlib.Path, corpus_out: pathlib.Path) -> None:
    u.assert_folder_is_readable(corpus_in)
    u.ensure_folder_is_writable(corpus_out)
    pass
Example #9
def extract_fold(corpus_in: pathlib.Path, corpus_out: pathlib.Path, subset: pathlib.Path) -> None:
    u.assert_folder_is_readable(corpus_in)
    u.assert_file_is_readable(subset)
    u.ensure_folder_is_writable(corpus_out)
    pass
Example #10
def validate_model(corpus_in: pathlib.Path,
                   state_folder: pathlib.Path) -> None:
    u.assert_folder_is_readable(corpus_in)
    u.ensure_folder_is_writable(state_folder.joinpath('./results'))
    pass
Example #11
def create_corpus_folds(corpus_in: pathlib.Path,
                        state_folder: pathlib.Path) -> None:
    u.assert_folder_is_readable(corpus_in)
    u.ensure_folder_is_writable(state_folder.joinpath('./sub'))
    pass
Example #12
def train_model(corpus_in: pathlib.Path, state_folder: pathlib.Path) -> None:
    u.assert_folder_is_readable(corpus_in)
    u.ensure_folder_is_writable(state_folder.joinpath('./weights'))
    pass
Example #13
def hyper_tune_model(train_in: pathlib.Path, test_in: pathlib.Path,
                     state_folder: pathlib.Path) -> None:
    u.assert_folder_is_readable(train_in)
    u.assert_folder_is_readable(test_in)
    u.ensure_folder_is_writable(state_folder)
    pass
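
Examples #8 through #13 are stubs, but their signatures suggest a k-fold experiment. A hypothetical sketch of how the pieces might be wired together; the fold file naming and folder layout are assumptions, not taken from the source:

import pathlib

corpus = pathlib.Path('./corpus/vectorized')   # placeholder
state = pathlib.Path('./state')                # placeholder

create_corpus_folds(corpus, state)             # expected to populate ./state/sub
for subset in sorted(state.joinpath('./sub').glob('*.csv')):
    fold_corpus = state.joinpath(subset.stem)
    extract_fold(corpus, fold_corpus, subset)
    train_model(fold_corpus, state)            # expected to populate ./state/weights
    validate_model(fold_corpus, state)         # expected to populate ./state/results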