def text_to_word2vec(sentences, word2vec):
    new_text = []
    for sentence in sentences:
        words = text_manipulation.extract_sentence_words(sentence)
        new_text.append([word_model(w, word2vec) for w in words])
    return new_text
def read_wiki_file(path, word2vec, remove_preface_segment=True, ignore_list=False, remove_special_tokens=False,
                   return_as_sentences=False, high_granularity=True, only_letters=False):
    data = []
    targets = []
    all_sections = get_sections(path, high_granularity)
    required_sections = all_sections[1:] if remove_preface_segment and len(all_sections) > 0 else all_sections
    required_non_empty_sections = [section for section in required_sections if len(section) > 0 and section != "\n"]

    for section in required_non_empty_sections:
        sentences = section.split('\n')
        if sentences:
            for sentence in sentences:
                is_list_sentence = wiki_utils.get_list_token() + "." == str(sentence)
                if ignore_list and is_list_sentence:
                    continue
                if not return_as_sentences:
                    sentence_words = extract_sentence_words(sentence, remove_special_tokens=remove_special_tokens)
                    if 1 <= len(sentence_words):
                        data.append([word_model(word, word2vec) for word in sentence_words])
                    else:
                        logger.info('Sentence in wikipedia file is empty')
                else:
                    # For annotation: keep the sentence as raw text.
                    if only_letters:
                        sentence = re.sub('[^a-zA-Z0-9 ]+', '', sentence)
                    data.append(sentence)
            if data:
                # The last sentence of each section marks a segment boundary.
                targets.append(len(data) - 1)

    return data, targets, path
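# Hedged usage sketch (not part of the original loader): shows how read_wiki_file
# might be driven end-to-end with a gensim word2vec model. The gensim import, the
# helper name _demo_read_wiki_file and both paths it receives are assumptions made
# purely for illustration.
def _demo_read_wiki_file(word2vec_path, doc_path):
    from gensim.models import KeyedVectors

    # Load pretrained word vectors (binary word2vec format is assumed here).
    word2vec = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

    data, targets, path = read_wiki_file(doc_path, word2vec,
                                         ignore_list=True, high_granularity=False)
    # data: one list of word vectors per sentence;
    # targets: index of the last sentence of every section, i.e. the gold boundaries.
    return data, targets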
def visualize_document(path, pred, golden, remove_preface_segment=True, ignore_list=True, remove_special_tokens=True,
                       return_as_sentences=False, high_granularity=False):
    dir_path = "/" + "/".join(path.strip("/").split("/")[:-1])
    all_sections = get_sections(path, high_granularity)
    required_sections = all_sections[1:] if remove_preface_segment and len(all_sections) > 0 else all_sections
    required_non_empty_sections = [section for section in required_sections if len(section) > 0 and section != "\n"]

    # Collect the sentences exactly as the data loader would see them.
    final_sentence_lst = []
    for section in required_non_empty_sections:
        sentences = section.split('\n')
        if sentences:
            for sentence in sentences:
                is_list_sentence = wiki_utils.get_list_token() + "." == str(sentence)
                if ignore_list and is_list_sentence:
                    continue
                if not return_as_sentences:
                    sentence_words = extract_sentence_words(sentence, remove_special_tokens=remove_special_tokens)
                    if 1 > len(sentence_words):
                        continue
                    final_sentence_lst.append(sentence)

    def write_document(sentences, path, seg, name):
        # Write the sentences to a file, marking segment boundaries with "=" lines.
        with open(os.path.join(path, name), "w") as f:
            seg_list = seg.split(" ")
            cot = 0
            for i in seg_list:
                if cot == 0:
                    f.write("=" * 10 + "beginning" + "\n")
                    cot = 1
                else:
                    if i == "|":  # segment boundary
                        f.write("=" * 10 + "\n")
                    else:
                        f.write(sentences[int(i) - 1] + "\n")

    write_document(final_sentence_lst, dir_path, pred, "pred")
    write_document(final_sentence_lst, dir_path, golden, "golden")
def read_choi_file(path, word2vec, sent_bert_vec, train, return_w2v_tensors=True, manifesto=False):
    separator = '========' if manifesto else '=========='
    with Path(path).open('r') as f:
        raw_text = f.read()
    paragraphs = [clean_paragraph(p) for p in raw_text.strip().split(separator) if len(p) > 5 and p != "\n"]
    if train:
        random.shuffle(paragraphs)

    targets = []
    new_text = []
    text = []
    lastparagraphsentenceidx = 0

    for paragraph in paragraphs:
        if manifesto:
            sentences = split_sentences(paragraph, 0)
        else:
            sentences = [s for s in paragraph.split('\n') if len(s.split()) > 0]

        if sentences:
            # Count the sentences in this paragraph; its last sentence is where we need to split.
            sentences_count = 0
            for sentence in sentences:
                words, sentence_str = extract_sentence_words(sentence)
                if len(words) == 0:
                    continue
                sentences_count += 1
                if return_w2v_tensors:
                    text.append(words)
                    new_text.append([word_model(w, word2vec) for w in words])
                else:
                    text.append(words)
                    new_text.append(words)

            lastparagraphsentenceidx += sentences_count
            targets.append(lastparagraphsentenceidx - 1)

    return new_text, targets, path, sent_bert_vec
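# Hedged usage sketch (not part of the original loader): shows the expected call
# shape for read_choi_file. The helper name _demo_read_choi_file is an assumption;
# sent_bert_vec is returned unchanged by the loader, so any precomputed
# per-sentence vectors (or None) can be supplied for illustration.
def _demo_read_choi_file(choi_file_path, word2vec):
    new_text, targets, path, sent_vecs = read_choi_file(
        choi_file_path, word2vec, sent_bert_vec=None, train=False, return_w2v_tensors=False)
    # With return_w2v_tensors=False, new_text is a list of token lists and
    # targets holds the index of the last sentence of every paragraph.
    return new_text, targets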
def main(args):
    utils.read_config_file(args.config)
    utils.config.update(args.__dict__)

    algo_delimeter = graphseg_delimeter

    files = get_files(args.folder)
    acc = accuracy.Accuracy()
    for file_path in files:
        with open(str(file_path), "r") as file:
            raw_content = file.read()

        sentences = [s for s in raw_content.decode('utf-8').strip().split("\n") if len(s) > 0 and s != "\n"]

        sentences_length = []
        h = []
        t = []
        is_first_sentence = True
        for sentence in sentences:
            if sentence == truth:
                if not is_first_sentence:
                    t[-1] = 1  # close the previous gold segment
                continue
            if sentence == algo_delimeter:
                if not is_first_sentence:
                    h[-1] = 1  # close the previous predicted segment
                continue
            words = extract_sentence_words(sentence)
            sentences_length.append(len(words))
            t.append(0)
            h.append(0)
            is_first_sentence = False
        t[-1] = 1  # end of the last gold segment
        h[-1] = 1  # the algorithm already closes the last segment correctly
        acc.update(h, t)

    calculated_pk, calculated_windiff = acc.calc_accuracy()

    print('Pk: {:.4}.'.format(calculated_pk))
    print('Win_diff: {:.4}.'.format(calculated_windiff))
def process_section(section, id):
    global num_sentneces_for_avg
    global sum_sentneces_for_avg

    sentences = text_manipulation.split_sentences(section, id)
    section_sentences = []
    num_lists = 0
    num_sentences = 0
    num_formulas = 0
    num_codes = 0
    last_sentence_was_list = False
    for sentence in sentences:
        is_list_sentence = wiki_utils.get_list_token() + "." == sentence.encode('utf-8')
        if '\n' in sentence:
            logger.info("DocId: " + str(id) + " back slash in sentence: " + sentence)
        if (wiki_utils.get_list_token() in sentence) and (wiki_utils.get_list_token() + ".") != sentence.encode('utf-8'):
            # TODO: delete this branch, since it is not supposed to happen any more - but it still does
            num_lists += 1
            last_sentence_was_list = True
            logger.info("DocId: " + str(id) + " Special case 1: " + sentence)
            continue
        elif is_list_sentence:
            if last_sentence_was_list:
                continue
            last_sentence_was_list = True
            num_lists += 1
        else:
            last_sentence_was_list = False
        sentence_words = text_manipulation.extract_sentence_words(sentence)
        if len(sentence_words) < wiki_thresholds.min_words_in_sentence:
            # ignore sentences that are too short
            continue

        sum_sentneces_for_avg += len(sentence_words)
        num_sentneces_for_avg += 1

        num_formulas += count_str_occurrences(sentence, wiki_utils.get_formula_token())
        num_codes += count_str_occurrences(sentence, wiki_utils.get_codesnipet_token())
        num_sentences += 1
        section_sentences.append(sentence)

    # Validate the section against the limits defined in wiki_thresholds.
    valid_section = True
    error_message = None
    if num_sentences < wiki_thresholds.min_sentence_in_section:
        valid_section = False
        error_message = "sentence count in section is too low"
    if num_sentences > 0:
        lists_percentage = float(num_lists) / float(num_sentences)
        if lists_percentage >= wiki_thresholds.max_list_in_section_percentage:
            valid_section = False
            error_message = "list percentage in section is too high: " + str(lists_percentage)
    section_text = ''.join(section_sentences)
    if len(section_text) < wiki_thresholds.min_section_char_count:
        valid_section = False
        error_message = "char count in section is too low"
    if num_formulas >= wiki_thresholds.max_section_formulas_count:
        valid_section = False
        error_message = "number of formulas in section is too high: " + str(num_formulas)
    if num_codes >= wiki_thresholds.max_section_code_snipet_count:
        valid_section = False
        error_message = "number of code snippets in section is too high: " + str(num_codes)

    return valid_section, section_sentences, error_message