def seg_long_phrases(origin_long_txt, seged_long_txt2):
    segs_list = file_utils.read_line(origin_long_txt,
                                     lambda line: segment.seg_text(line))
    seg_list = [
        seg for segs in segs_list for seg in segs.split(' ') if len(seg) > 1
    ]
    file_utils.save_list2file(list(set(seg_list)), seged_long_txt2)
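# The functions in this file all go through a small file_utils module that is
# not shown here. A minimal sketch of the two helpers, with behavior inferred
# from the call sites below (the real module may differ; append mode is
# guessed from create_corpus, which writes several batches into one file):
def read_line(path, work_func=lambda line: line, split=None):
    """Read non-empty lines, optionally split each on `split`, map work_func."""
    results = []
    with open(path, encoding='utf-8') as f:
        for line in f:
            line = line.rstrip('\n')
            if not line:
                continue
            results.append(work_func(line.split(split) if split else line))
    return results


def save_list2file(items, path, work_func=lambda item: item, filter_func=None):
    """Write one item per line: filter on the raw item first, then format it."""
    with open(path, 'a', encoding='utf-8') as f:
        for item in items:
            if filter_func and not filter_func(item):
                continue
            f.write(f'{work_func(item)}\n')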
def load_data(data_path):
    datas = file_utils.read_line(
        data_path,
        lambda line_contents: (line_contents[0], line_contents[1]),
        split='\t')
    # return zip(*datas)
    return datas
def find_words_not_in_vec(word_index, vectors_file, include_file, exclude_file):
    # Use a set: membership tests against a large vector vocabulary are O(1).
    vec_words = set(
        file_utils.read_line(vectors_file, lambda line: line.split()[0]))
    exclude_words = [word for word in word_index if word not in vec_words]
    include_words = [word for word in word_index if word in vec_words]
    file_utils.save_list2file(exclude_words, exclude_file)
    file_utils.save_list2file(include_words, include_file)
def concat_all(clf_dir, dest_dir, portion):
    file_names = ['train', 'val', 'test']
    clf_name_file = os.path.join(dest_dir, 'clf_name.txt')
    clf_names = set()
    for clf_file in os.listdir(clf_dir):
        # File names look like '<4-char clf name>_<doc count>.txt'.
        clf_name = clf_file[0:4]
        clf_count = int(clf_file[5:-4])
        clf_file_path = os.path.join(clf_dir, clf_file)
        texts = list(
            file_utils.read_line(clf_file_path,
                                 lambda line: json.loads(line)['abs']))
        random.shuffle(texts)
        # Process the texts in 20 batches of roughly 5% each.
        count2read = int(clf_count * 0.05)
        for i in range(20):
            start = count2read * i
            end = count2read * (i + 1) if len(texts) - 1 > count2read * (
                i + 1) else len(texts) - 1
            splits = split_list(texts[start:end], portion)
            if splits:
                clf_names.add(clf_name)
                print(f'write clf {clf_name}')
                for index, list2write in enumerate(splits):
                    dest_file = os.path.join(dest_dir,
                                             f'{file_names[index]}{i}.txt')
                    file_utils.save_list2file(
                        list2write,
                        dest_file,
                        work_func=lambda text: f'{clf_name}\t{text}',
                        filter_func=lambda item: len(item) > 1)
            else:
                print('not split')
    file_utils.save_list2file(list(clf_names), clf_name_file)
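# split_list is not shown in this file. A plausible sketch, assuming `portion`
# is a sequence of fractions such as (0.8, 0.1, 0.1) matching the
# train/val/test file names used in concat_all above:
def split_list(items, portion):
    if not items:
        return []
    splits, start = [], 0
    for p in portion:
        end = start + int(len(items) * p)
        splits.append(items[start:end])
        start = end
    return splits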
def process_file(filename, word_to_id, cat_to_id, max_length=600):
    """Convert a file to its id representation."""

    def judge(line_contents):
        return (line_contents[0], line_contents[1].split()) if len(
            line_contents) > 1 else ('', '')

    data2train = file_utils.read_line(
        filename, lambda line_contents: judge(line_contents), split='\t')
    data_id, label_id = [], []
    for label, content in data2train:
        if len(label) == 0 or len(content) == 0:
            continue
        data_id.append(
            [word_to_id[word] for word in content if word in word_to_id])
        label_id.append(cat_to_id[label])
    # Use keras pad_sequences to pad every text to a fixed length.
    x_pad = kr.preprocessing.sequence.pad_sequences(data_id,
                                                    max_length,
                                                    truncating='post')
    # Convert labels to a one-hot representation.
    y_pad = kr.utils.to_categorical(label_id, num_classes=len(cat_to_id))
    return x_pad, y_pad
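# Example wiring (hypothetical file names): build word_to_id from the
# vocabulary written by build_vocab below, then vectorize a labeled corpus.
# words = list(file_utils.read_line('vocab.txt'))
# word_to_id = dict(zip(words, range(len(words))))
# categories, cat_to_id = read_category('clf_name.txt')
# x_train, y_train = process_file('train.txt', word_to_id, cat_to_id)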
def seg_clf_file(clf_file_pair):
    raw_clf_file, seged_clf_file = clf_file_pair
    print(f'seg file {raw_clf_file} to {seged_clf_file}')
    seged_lines = file_utils.read_line(
        raw_clf_file, lambda line: segment.seg_raw_doc(json.loads(line)))
    file_utils.save_list2file(
        seged_lines, seged_clf_file,
        lambda doc_json: json.dumps(doc_json, ensure_ascii=False))
def clean(raw_phrase_txt):
    reged_phrases = file_utils.read_line(raw_phrase_txt,
                                         lambda line: extract_chn(line))
    clean_phrases = [
        words for reged_phrase in reged_phrases for words in reged_phrase
    ]
    # clean_phrases.sort()
    return set(clean_phrases)
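# extract_chn is not defined in this file. A minimal sketch, assuming it
# returns the runs of Chinese characters found in a line (using the CJK
# Unified Ideographs range):
import re

chinese_pattern = re.compile(r'[\u4e00-\u9fff]+')


def extract_chn(line):
    return chinese_pattern.findall(line)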
def ans_score(my_ans_dir, right_ans_dir):
    que_count = 0
    right_answer_count = 0
    for ans_file_name in os.listdir(my_ans_dir):
        my_ans_file = os.path.join(my_ans_dir, ans_file_name)
        right_ans_file = os.path.join(right_ans_dir, ans_file_name)
        my_ans_dict = dict(
            file_utils.read_line(my_ans_file,
                                 lambda split: (split[0], split[1]),
                                 split=':'))
        right_ans_dict = dict(
            file_utils.read_line(right_ans_file,
                                 lambda split: (split[0], split[1]),
                                 split=':'))
        for _q, my_ans in my_ans_dict.items():
            right_ans = right_ans_dict[_q]
            print(f'{_q}, my:{my_ans}, right:{right_ans}')
            que_count += 1
            if my_ans == right_ans:
                right_answer_count += 1
    total_score = right_answer_count / que_count
    print(f'total score is {total_score}')
def get_clf_str_from_file(clf_names_file_path: str):
    """
    Read classification name strings from a file and return a generator
    whose items are Classification objects.
    :param clf_names_file_path: path of the file storing classification infos
    :return:
    """
    return file_utils.read_line(clf_names_file_path,
                                lambda line: gen_from_clf_str(line))
def get_clf_info_dict(clf_count_info_file: str) -> dict:
    """
    :type clf_count_info_file: str
    """
    info_list = file_utils.read_line(
        clf_count_info_file,
        lambda info: (info[0], ClfInfo(info[0], info[1])), ':')
    return dict(info_list)
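# ClfInfo is defined elsewhere. A minimal sketch consistent with the call
# above, which passes the classification name and its count string:
class ClfInfo:

    def __init__(self, clf_name, doc_count):
        self.clf_name = clf_name
        self.doc_count = int(doc_count)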
def create_corpus(seged_clf_dir, corpus_file):
    for seged_clf_file in os.listdir(seged_clf_dir):
        print(f'add clf {seged_clf_file}')
        file2read = os.path.join(seged_clf_dir, seged_clf_file)
        texts = file_utils.read_line(file2read,
                                     lambda line: json.loads(line)['abs'])
        file_utils.save_list2file(
            texts, corpus_file,
            filter_func=lambda text: text and len(text) > 0)
    print('create corpus complete')
def get_content_dict(answers_dir, splitter):
    total_content_dict = {}
    for content_file in file_utils.get_files(answers_dir):
        content_dict = dict(
            file_utils.read_line(content_file,
                                 lambda content: (content[0], content[1]),
                                 split=splitter))
        total_content_dict.update(content_dict)
    return total_content_dict
def join_phrases(phrase_union_txt, *phrase_txts):
    print('start join...')
    phrase_set = set()
    for phrase_txt in phrase_txts:
        for phrase in file_utils.read_line(phrase_txt):
            if len(phrase) < 6:
                phrase_set.add(phrase)
    print(f'set phrases is: {phrase_set}')
    phrases = sorted(phrase_set)
    file_utils.save_list2file(phrases, phrase_union_txt)
def build_vocab(train_txt_path, vocab_txt_path, vocab_size=5000):
    """Build the vocabulary from the training set and store it."""
    contents = file_utils.read_line(
        train_txt_path,
        lambda line_contents: line_contents[1]
        if len(line_contents) > 1 else '',
        split='\t')
    counter = Counter(
        [word for content in contents for word in content.split()])
    count_pairs = counter.most_common(vocab_size - 1)
    words, _ = list(zip(*count_pairs))
    # Add a <PAD> token so all texts can be padded to the same length.
    words = ['<PAD>'] + list(words)
    file_utils.save_list2file(words, vocab_txt_path)
def process_question_file(filepath, word_to_id, max_length=600):
    data2train = file_utils.read_line(
        filepath,
        lambda line_contents: (line_contents[0], line_contents[1].split()),
        split='\t')
    data_id, pub_ids = [], []
    for pub_id, content in data2train:
        data_id.append(
            [word_to_id[word] for word in content if word in word_to_id])
        pub_ids.append(pub_id)
    x_pad = kr.preprocessing.sequence.pad_sequences(data_id,
                                                    max_length,
                                                    truncating='post')
    return x_pad, pub_ids
def get_df(path, show_df_info=False):
    """
    Get a DataFrame; if show_df_info is True, log the DataFrame head,
    null value count and sample data.
    :param path:
    :param show_df_info:
    :return:
    """
    logger.info(f'get data frame from file {path}')
    contents = file_utils.read_line(path,
                                    lambda content: (content[0], content[1]),
                                    split='\t')
    df = pd.DataFrame(contents, columns=['clf', 'text'])
    if show_df_info:
        logger.info(f'df head is \n {df.head()}')
        logger.info(f'isnull count:\n {df.isnull().sum()}')
        logger.info(f'train sample clf: {df["clf"].iloc[0]}, '
                    f'text: {df["text"].iloc[0]}')
    return df
def group_phrases(origin_file, short_file, median_file, long_file):
    phrases = file_utils.read_line(origin_file)
    short_phrases = []
    median_phrases = []
    long_phrases = []
    for phrase in phrases:
        print(f'{phrase}')
        if len(phrase) < 6:
            short_phrases.append(phrase)
        elif len(phrase) > 10:
            long_phrases.append(phrase)
        else:
            median_phrases.append(phrase)
    file_utils.save_list2file(short_phrases, short_file)
    file_utils.save_list2file(median_phrases, median_file)
    file_utils.save_list2file(long_phrases, long_file)
def select_sample(seged_dir, select_dir):
    # seged_dir = 'E:/ip_data/clfs/new_seged/no_limit'
    # select_dir = 'E:/ip_data/clfs/new_seged/no_limit_t'
    clf_dict, total_count = get_clf_info(seged_dir)
    for clf_file in os.listdir(seged_dir):
        clf_name = clf_file[0:4]
        clf_count = clf_dict[clf_name]
        # Sample each classification in proportion to its share of the corpus.
        read_count = int(clf_count / total_count * 10000)
        if read_count > 20:
            file2read = os.path.join(seged_dir, clf_file)
            lines = list(file_utils.read_line(file2read))
            random.shuffle(lines)
            print(f'clf {clf_name}, clf count {clf_count}, '
                  f'write count {read_count}')
            save_file = f'{clf_name}_{read_count}.txt'
            file_utils.save_list2file(lines[0:read_count],
                                      os.path.join(select_dir, save_file))
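# get_clf_info is not shown in this file. A plausible sketch, assuming it maps
# each 4-character classification name to its line count in seged_dir and also
# returns the total line count:
def get_clf_info(seged_dir):
    clf_dict = {}
    for clf_file in os.listdir(seged_dir):
        with open(os.path.join(seged_dir, clf_file), encoding='utf-8') as f:
            clf_dict[clf_file[0:4]] = sum(1 for _ in f)
    return clf_dict, sum(clf_dict.values())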
def right_ans_distribution(right_ans_dir, clf_count_file):
    clf_info_dict = dict(
        file_utils.read_line(clf_count_file,
                             lambda split: (split[0], split[1]),
                             split=':'))
    total_doc_count = 0
    total_que_count = 0
    for k, v in clf_info_dict.items():
        # print(f'clf info k: {k}, v: {v}')
        total_doc_count += int(v)
    clf_que_count_dict = {clf: 0 for clf, count in clf_info_dict.items()}
    all_que_dict = get_all_ans_dict(right_ans_dir)
    for k, v in all_que_dict.items():
        # print(f'all ans k {k},v {v}')
        total_que_count += 1
        clf_que_count_dict[v] += 1
    clf_que_count_dict = {
        k: int(v) * 100 / total_que_count
        for (k, v) in clf_que_count_dict.items()
    }
    clf_info_list = [(k, int(v) * 100 / total_doc_count)
                     for (k, v) in clf_info_dict.items()]
    clf_info_list.sort(key=lambda ele: ele[1], reverse=True)
    print(f'total_que_count {total_que_count}, '
          f'total_doc_count {total_doc_count}')
    for k, v in clf_info_list:
        doc_portion = '{0:.3f}%'.format(v)
        que_portion = '{0:.3f}%'.format(clf_que_count_dict.get(k))
        print(f'clf : {k}, doc portion {doc_portion}, '
              f'que portion: {que_portion}')
def extract_abs(clf_file_pair):
    raw_clf_file, seged_clf_file = clf_file_pair
    print(f'extract abs file {raw_clf_file} to {seged_clf_file}')
    abs_lines = file_utils.read_line(
        raw_clf_file, lambda line: segment.seg_text(json.loads(line)['abs']))
    file_utils.save_list2file(abs_lines, seged_clf_file)
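# segment.seg_text is assumed to be a thin wrapper around a Chinese word
# segmenter such as jieba, returning the tokens joined by single spaces
# (the .split(' ') calls elsewhere in this file rely on that); a sketch:
import jieba


def seg_text(text):
    return ' '.join(jieba.cut(text))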
def read_category(clf_name_file):
    """Read the classification categories; their order is fixed."""
    categories = list(file_utils.read_line(clf_name_file, lambda line: line))
    cat_to_id = dict(zip(categories, range(len(categories))))
    return categories, cat_to_id
def verify(word: str):
    word_len = len(word)
    if word_len % 2 == 0:
        # Collapse doubled words like 'abcabc' down to 'abc'.
        half_word = word[0:int(word_len / 2)]
        return half_word if word.count(half_word) == 2 else word
    return word


def collect_new_dict(dict_dir: str, dest_dict_file: str):
    new_dict_files = file_utils.get_files(dict_dir)
    word_counter = Counter([
        verify(word) for dict_file in new_dict_files
        for word in file_utils.read_line(dict_file)
    ])
    # list_utils.print_list(word_counter.keys())
    file_utils.save_list2file(word_counter.keys(), dest_dict_file)


if __name__ == '__main__':
    # collect('E:/dict/new_words', 'E:/dict/new_words.txt')
    # file_utils.remove_redundant('E:/dict/new_words.txt', 'E:/dict/new_words2.txt')
    clf_names_file_path = '/home/tqhy/ip_nlp/resources/clfs/class_needed.txt'
    clf_raw_dir = '/home/tqhy/ip_nlp/resources/clfs/raw/no_limit'
    lower_score_dir = '/home/tqhy/ip_nlp/resources/clfs/raw/lower_score'
    clf_to_collect = list(file_utils.read_line(clf_names_file_path))
    for file_names in os.listdir(clf_raw_dir):
        if file_names[0:4] in clf_to_collect:
            src_file = os.path.join(clf_raw_dir, file_names)
            dest_file = os.path.join(lower_score_dir, file_names)
            shutil.copyfile(src_file, dest_file)
def read_ans(ans_file):
    return file_utils.read_line(ans_file,
                                lambda split: (split[0], split[1]),
                                split=':')
def extract_eng(raw_phrase_txt, eng_file):
    engs_lists = file_utils.read_line(
        raw_phrase_txt, lambda line: english_pattern.findall(line))
    file_utils.save_list2file(engs_lists, eng_file,
                              lambda engs: '\n'.join(engs))
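# english_pattern is assumed to be a module-level regex matching runs of
# Latin letters; a minimal sketch:
import re

english_pattern = re.compile(r'[A-Za-z]+')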