def segment(path, model, word2vec, output_folder, wiki=False):
    """Segment every ``*.txt`` file under ``path`` and write the results.

    For each input file, the whole text is cleaned into a single "sentence",
    the model predicts cutoff points, and the predicted segments (separated by
    ``section_delimiter``) are written to a file of the same name inside
    ``output_folder``.

    Args:
        path: Directory prefix to glob for ``*.txt`` files (must end with a
            path separator for the glob to work as intended).
        model: Trained segmentation model passed to ``evaluate.predict_cutoffs``.
        word2vec: Word-embedding lookup passed to ``evaluate.predict_cutoffs``.
        output_folder: Directory where segmented files are written.
        wiki: Unused here; kept for signature parity with the wiki variant.
    """
    for filename in glob.glob(path + '*.txt'):
        # Open read-only: the input is never written back.
        with open(filename, "r") as f:
            paragraph = f.read()
            sentences = [clean_paragraph(paragraph)]

            cutoffs = evaluate.predict_cutoffs(sentences, model, word2vec)
            total = []
            segment = []
            for i, (sentence, cutoff) in enumerate(zip(sentences, cutoffs)):
                segment.append(sentence)
                if cutoff:
                    full_segment = '.'.join(segment) + '.'
                    full_segment = full_segment + '\n' + section_delimiter + '\n'
                    total.append(full_segment)
                    segment = []

        # Cross-platform basename (the original split on '/' only).
        file_id = Path(filename).name

        # Model does not return prediction for last sentence, so the final
        # segment is flushed here without a trailing delimiter.
        segment.append(sentences[-1])
        total.append('.'.join(segment))

        output_file_content = "".join(total)
        output_file_full_path = Path(output_folder).joinpath(Path(file_id))
        with output_file_full_path.open('w') as f:
            f.write(output_file_content)
# --- Exemplo n.º 2 (scraped-snippet separator; was bare text breaking the file) ---
def treat_text(raw_text):
    """Split raw text into sentences and group them into predicted segments.

    Args:
        raw_text: The untreated input text.

    Returns:
        A list of segments, each segment being a list of consecutive
        sentences ending at a predicted cutoff. Sentences after the last
        cutoff are dropped (the model returns no prediction for the tail).

    NOTE(review): relies on module-level globals ``model`` and ``word2vec``
    being defined elsewhere in the file — confirm before reuse.
    """
    # 123 is presumably a max-length/word-count cap for the splitter —
    # TODO confirm against split_sentences' signature.
    sentences = split_sentences(raw_text, 123)

    cutoffs = evaluate.predict_cutoffs(sentences, model, word2vec)
    total = []
    segment = []
    for i, (sentence, cutoff) in enumerate(zip(sentences, cutoffs)):
        segment.append(sentence)
        if cutoff:
            total.append(segment)
            segment = []

    return total
def segment(path, model, word2vec, output_folder, wiki=False):
    """Segment one corpus file and write the result to ``output_folder``.

    The file is read either as a wiki document or a Choi document, the model
    predicts cutoffs per sentence, and the output marks predicted boundaries
    with ``wiki_loader.section_delimiter`` and reference (gold) boundaries
    with ``goldset_delimiter``.

    Args:
        path: Path to the input corpus file.
        model: Trained segmentation model for ``evaluate.predict_cutoffs``.
        word2vec: Word-embedding lookup used by the loaders/predictor.
        output_folder: Directory receiving the segmented output file.
        wiki: When True, read via ``wiki_loader``; otherwise via ``choiloader``.
    """
    file_id = str(path).split('/')[-1]

    if wiki:
        splited_sentences, target, _ = wiki_loader.read_wiki_file(
            path, None, remove_preface_segment=True, return_w2v_tensors=False)
    else:
        splited_sentences, target, _ = choiloader.read_choi_file(path, word2vec, False, False)

    sentences = [' '.join(words) for words in splited_sentences]

    # gold_set[i] == 1 marks a reference boundary after sentence i.
    gold_set = np.zeros(len(splited_sentences), dtype=int)
    gold_set[np.asarray(target)] = 1

    cutoffs = evaluate.predict_cutoffs(sentences, model, word2vec)

    pieces = []
    current = []
    for idx, (sent, is_cut) in enumerate(zip(sentences, cutoffs)):
        current.append(sent)
        is_gold = gold_set[idx] == 1
        if not (is_cut or is_gold):
            continue

        chunk = '.'.join(current) + '.'
        if is_cut:
            # Predicted boundary; also annotate when it coincides with gold.
            chunk = chunk + '\n' + wiki_loader.section_delimiter + '\n'
            if is_gold:
                chunk = chunk + goldset_delimiter + '\n'
        else:
            # Gold-only boundary (model did not predict a cut here).
            chunk = chunk + '\n' + goldset_delimiter + '\n'
        pieces.append(chunk)
        current = []

    # Model does not return prediction for last sentence
    current.append(sentences[-1])
    pieces.append('.'.join(current))

    output_file_full_path = Path(output_folder).joinpath(Path(file_id))
    with output_file_full_path.open('w') as out:
        out.write("".join(pieces))