import os
from collections import defaultdict

import process
import read


def get_list_name(file_list_name):
    """Save the basenames of all THYMEColonFinal paths in a file list as JSON."""
    file_names = read.textfile2list(file_list_name)
    file_simple = [
        file_name.split("/")[-1]
        for file_name in file_names
        if "THYMEColonFinal" in file_name
    ]
    read.savein_json(file_list_name.replace(".txt", "_simple"), file_simple)
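# Usage sketch for get_list_name (the path below is a hypothetical example;
# read.textfile2list is expected to return one path per line of the file):
#
#   get_list_name("data/train_all.txt")
#   # -> writes data/train_all_simple (JSON list of THYMEColonFinal basenames)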
def get_train():
    """Build the training file list: every training file not in the dev set."""
    file_dev = read.readfrom_json("data/dev_file_simple")
    train_all_simple = read.readfrom_json("data/train_all_simple")
    train = [
        train_file for train_file in train_all_simple
        if train_file not in file_dev
    ]
    read.savein_json("data/train_simple", train)
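# Usage sketch for get_train (assumes get_list_name has already produced
# "data/dev_file_simple" and "data/train_all_simple"):
#
#   get_train()
#   # -> writes data/train_simple = train_all_simple minus the dev files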
# Variant of document_level_2_sentence_level for a flat corpus layout where
# raw texts sit directly under raw_data_path instead of in per-document
# subdirectories. Renamed (the "_flat" suffix is ours) so it no longer shadows,
# or is shadowed by, the full definition below.
def document_level_2_sentence_level_flat(file_dir, raw_data_path,
                                         preprocessed_path, xml_path,
                                         file_format):
    max_len_all = list()
    char_vocab = defaultdict(float)
    for file_name in file_dir:
        # Flat layout: the file name is used directly, with no nested folder.
        raw_text_path = os.path.join(raw_data_path, file_name)
        preprocessed_file_path = os.path.join(preprocessed_path, file_name)
        raw_text = read.readfrom_txt(raw_text_path)
        raw_text = process.text_normalize(raw_text)
        sent_span_list_file, max_len_file, char_vocab = split_by_sentence(
            raw_text, char_vocab)
        max_len_all += max_len_file
        read.savein_json(preprocessed_file_path + "_sent", sent_span_list_file)
        if xml_path != "":
            # The gold XML is still expected in a per-document subdirectory,
            # matching the Anafora layout.
            xml_file_path = os.path.join(xml_path, file_name,
                                         file_name + file_format)
            posi_info_dict = process.extract_xmltag_anafora(
                xml_file_path, raw_text)
            sent_tag_list_file = xml_tag_in_sentence(sent_span_list_file,
                                                     posi_info_dict)
            read.savein_json(preprocessed_file_path + "_tag",
                             sent_tag_list_file)
    # Longest sentences first; saved one level above preprocessed_path.
    max_len_all.sort(reverse=True)
    max_len_file_name = "/".join(
        preprocessed_path.split('/')[:-1]) + "/max_len_sent"
    read.savein_json(max_len_file_name, max_len_all)
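# Usage sketch for the flat-layout variant (directory names are hypothetical;
# passing xml_path="" skips the gold-tag extraction step entirely):
#
#   file_dir = read.readfrom_json("data/train_simple")
#   document_level_2_sentence_level_flat(
#       file_dir, "data/raw_flat", "data/preprocessed_flat", "", "")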
def document_level_2_sentence_level(file_dir, raw_data_path, preprocessed_path,
                                    xml_path, file_format):
    """Split raw documents into sentences and save per-sentence features.

    For every document this writes sentence spans, character-level POS tags,
    Unicode-category tags, and words as JSON; if xml_path is non-empty, it
    also writes the gold Anafora tags aligned to sentences.
    """
    max_len_all = list()
    char_vocab = defaultdict(float)
    pos_vocab = defaultdict(float)
    unicode_vocab = defaultdict(float)
    word_vocab = defaultdict(float)
    for file_name in file_dir:
        # THYME layout: each document lives in a folder of the same name.
        raw_text_path = os.path.join(raw_data_path, file_name, file_name)
        preprocessed_file_path = os.path.join(preprocessed_path, file_name,
                                              file_name)
        raw_text = read.readfrom_txt(raw_text_path)
        raw_text = process.text_normalize(raw_text)
        sent_span_list_file, max_len_file, char_vocab = split_by_sentence(
            raw_text, char_vocab)
        max_len_all += max_len_file
        pos_sentences, pos_vocab = process.get_pos_sentence(
            sent_span_list_file, pos_vocab)
        word_sentences, word_vocab = process.get_words(sent_span_list_file,
                                                       word_vocab)
        # Project word-level POS tags down to the character level.
        pos_sentences_character = process.word_pos_2_character_pos(
            sent_span_list_file, pos_sentences)
        unicode_sentences_character, unicode_vocab = process.get_unicode(
            sent_span_list_file, unicode_vocab)
        read.savein_json(preprocessed_file_path + "_sent", sent_span_list_file)
        read.savein_json(preprocessed_file_path + "_pos",
                         pos_sentences_character)
        read.savein_json(preprocessed_file_path + "_unicodecategory",
                         unicode_sentences_character)
        read.savein_json(preprocessed_file_path + "_words", word_sentences)
        if xml_path != "":
            xml_file_path = os.path.join(xml_path, file_name,
                                         file_name + file_format)
            posi_info_dict = process.extract_xmltag_anafora(
                xml_file_path, raw_text)
            sent_tag_list_file = xml_tag_in_sentence(sent_span_list_file,
                                                     posi_info_dict)
            read.savein_json(preprocessed_file_path + "_tag",
                             sent_tag_list_file)
    # Longest sentences first; saved one level above preprocessed_path.
    max_len_all.sort(reverse=True)
    max_len_file_name = "/".join(
        preprocessed_path.split('/')[:-1]) + "/max_len_sent"
    read.savein_json(max_len_file_name, max_len_all)
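# End-to-end sketch of the preprocessing pipeline. All concrete paths and the
# Anafora file suffix below are assumptions for illustration; substitute the
# locations used by your copy of the THYME corpus.
if __name__ == "__main__":
    file_dir = read.readfrom_json("data/train_simple")  # from get_train()
    document_level_2_sentence_level(
        file_dir,
        "data/raw",                          # hypothetical raw-text root
        "data/preprocessed/train",           # hypothetical output root
        "data/xml",                          # "" to skip gold-tag extraction
        ".TimeNorm.gold.completed.xml",      # assumed Anafora file suffix
    )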