def segment(path, model, word2vec, output_folder, wiki=False):
    """Segment every ``*.txt`` file directly under *path* and write results.

    Each input file is read whole, cleaned into a single "sentence" list,
    run through the model's cutoff predictions, and written to
    *output_folder* under the same basename, with ``section_delimiter``
    lines inserted at every predicted cutoff.

    :param path: directory prefix searched as ``path + '*.txt'``
                 (note: *path* must therefore end with a separator).
    :param model: segmentation model passed to ``evaluate.predict_cutoffs``.
    :param word2vec: word-embedding object passed to ``evaluate.predict_cutoffs``.
    :param output_folder: directory that receives one output file per input.
    :param wiki: unused here; kept for signature compatibility with the
                 wiki/choi variant of ``segment``.
    """
    for filename in glob.glob(path + '*.txt'):
        # Read-only is sufficient; the original opened with "r+" needlessly.
        with open(filename, "r") as in_file:
            paragraph = in_file.read()

        # NOTE(review): the whole file is collapsed to ONE cleaned entry,
        # so the loop below sees at most one (sentence, cutoff) pair —
        # presumably clean_paragraph handles the splitting; confirm upstream.
        sentences = [clean_paragraph(paragraph)]
        cutoffs = evaluate.predict_cutoffs(sentences, model, word2vec)

        total = []
        # Renamed from `segment`: the original shadowed this function's name.
        current_segment = []
        for sentence, cutoff in zip(sentences, cutoffs):
            current_segment.append(sentence)
            if cutoff:
                full_segment = '.'.join(current_segment) + '.'
                full_segment = full_segment + '\n' + section_delimiter + '\n'
                total.append(full_segment)
                current_segment = []

        # Portable basename (the original split on '/' only).
        file_id = Path(filename).name

        # Model does not return a prediction for the last sentence,
        # so append it to the trailing segment explicitly.
        current_segment.append(sentences[-1])
        total.append('.'.join(current_segment))

        output_file_content = "".join(total)
        output_file_full_path = Path(output_folder).joinpath(file_id)
        with output_file_full_path.open('w') as out_file:
            out_file.write(output_file_content)
def treat_text(raw_text, sentence_limit=123):
    """Split *raw_text* into sentences and group them into predicted segments.

    Returns a list of segments, where each segment is a list of sentence
    strings, split at the model's predicted cutoffs.

    :param raw_text: the raw input text to segment.
    :param sentence_limit: second argument forwarded to ``split_sentences``
        (was the hard-coded magic number ``123``; its exact meaning depends
        on ``split_sentences`` — TODO confirm).

    NOTE(review): relies on module-level ``model`` and ``word2vec`` globals;
    confirm they are initialized before this is called.
    """
    sentences = split_sentences(raw_text, sentence_limit)
    # Debug print(sentences) removed.
    cutoffs = evaluate.predict_cutoffs(sentences, model, word2vec)
    total = []
    segment = []
    for sentence, cutoff in zip(sentences, cutoffs):
        segment.append(sentence)
        if cutoff:
            total.append(segment)
            segment = []
    # Fix: the model returns no prediction for the last sentence, so the
    # original silently dropped the trailing segment (and always the final
    # sentence). Mirror the sibling `segment` functions and keep it.
    if sentences:
        segment.append(sentences[-1])
    if segment:
        total.append(segment)
    return total
def segment(path, model, word2vec, output_folder, wiki=False):
    """Segment a single wiki/choi file and write it with two delimiter kinds.

    Reads gold-standard boundaries (``target``) from the dataset loader and
    interleaves them with the model's predicted cutoffs: predicted boundaries
    get ``wiki_loader.section_delimiter``, gold boundaries additionally get
    ``goldset_delimiter``, so the output file shows both segmentations.

    :param path: path to a single dataset file.
    :param model: segmentation model passed to ``evaluate.predict_cutoffs``.
    :param word2vec: word-embedding object used by the loaders and predictor.
    :param output_folder: directory receiving the annotated output file.
    :param wiki: if True read via ``wiki_loader``, else via ``choiloader``.

    NOTE(review): this redefines the earlier batch ``segment`` in the same
    module — only this version is visible after import; confirm intended.
    """
    # Portable basename (the original split on '/' only).
    file_id = Path(path).name

    if wiki:
        splited_sentences, target, _ = wiki_loader.read_wiki_file(
            path, None, remove_preface_segment=True, return_w2v_tensors=False)
    else:
        splited_sentences, target, _ = choiloader.read_choi_file(
            path, word2vec, False, False)

    sentences = [' '.join(words) for words in splited_sentences]

    # gold_set[i] == 1 marks a gold-standard segment boundary at sentence i.
    gold_set = np.zeros(len(splited_sentences)).astype(int)
    gold_set[np.asarray(target)] = 1

    cutoffs = evaluate.predict_cutoffs(sentences, model, word2vec)

    total = []
    # Renamed from `segment`: the original shadowed this function's name.
    current_segment = []
    for i, (sentence, cutoff) in enumerate(zip(sentences, cutoffs)):
        current_segment.append(sentence)
        if cutoff or gold_set[i] == 1:
            full_segment = '.'.join(current_segment) + '.'
            if cutoff:
                # Predicted boundary; a coinciding gold boundary is marked too.
                full_segment = full_segment + '\n' + wiki_loader.section_delimiter + '\n'
                if gold_set[i] == 1:
                    full_segment = full_segment + goldset_delimiter + '\n'
            else:
                # Gold-only boundary (outer condition guarantees gold here).
                full_segment = full_segment + '\n' + goldset_delimiter + '\n'
            total.append(full_segment)
            current_segment = []

    # Model does not return a prediction for the last sentence,
    # so append it to the trailing segment explicitly.
    current_segment.append(sentences[-1])
    total.append('.'.join(current_segment))

    output_file_content = "".join(total)
    output_file_full_path = Path(output_folder).joinpath(file_id)
    with output_file_full_path.open('w') as out_file:
        out_file.write(output_file_content)