def find_new_words(root, file_pair):
    abs_file, new_words_file = file_pair[0], file_pair[1]
    if os.path.exists(new_words_file):
        print(f'clf {new_words_file} has already found new words ...')
        return
    print(f'start find new word in {abs_file}')
    datas = load_data(abs_file)
    model = root
    topN = 2
    if len(datas) > 0:
        tmp = []
        count = 0
        words2add = set()
        for item in datas:
            tmp.append(item)
            count += 1
            if count % 40 == 0:
                load_data_2_root(tmp, model)
                result, add_word = model.find_word(topN)
                words2add.update(add_word.keys())
                print(f'words2add: {words2add}, {count}')
                tmp.clear()
        if len(tmp) > 0:
            print(f'{words2add}')
            load_data_2_root(tmp, model)
            result, add_word = model.find_word(topN)
            words2add.update(add_word.keys())
            print(f'words2add: {words2add}, {count}')
            tmp.clear()
        file_utils.save_list2file(words2add, new_words_file)
def concat_all(clf_dir, dest_dir, portion):
    file_names = ['train', 'val', 'test']
    clf_name_file = os.path.join(dest_dir, 'clf_name.txt')
    clf_names = set()
    for clf_file in os.listdir(clf_dir):
        clf_name = clf_file[0:4]
        clf_count = int(clf_file[5:-4])
        clf_file_path = os.path.join(clf_dir, clf_file)
        texts = list(
            file_utils.read_line(clf_file_path,
                                 lambda line: json.loads(line)['abs']))
        random.shuffle(texts)
        count2read = int(clf_count * 0.05)
        for i in range(20):
            start = count2read * i
            end = count2read * (i + 1) if len(texts) - 1 > count2read * (
                i + 1) else len(texts) - 1
            splits = split_list(texts[start:end], portion)
            if splits:
                clf_names.add(clf_name)
                print(f'write clf {clf_name}')
                for index, list2write in enumerate(splits):
                    dest_file = os.path.join(dest_dir,
                                             f'{file_names[index]}{i}.txt')
                    file_utils.save_list2file(
                        list2write,
                        dest_file,
                        work_func=lambda text: f'{clf_name}\t{text}',
                        filter_func=lambda item: len(item) > 1)
            else:
                print('not split')
    file_utils.save_list2file(list(clf_names), clf_name_file)
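# split_list is defined elsewhere in the project. The sketch below is only a
# plausible reading of its contract (an assumption, not the original
# implementation): portion is taken to be a sequence of fractions such as
# (0.8, 0.1, 0.1) matching the train/val/test files written by concat_all.
def split_list_sketch(items, portion):
    splits, start = [], 0
    for fraction in portion:
        end = start + int(len(items) * fraction)
        splits.append(items[start:end])
        start = end
    return splits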
def find_words_not_in_vec(word_index, vectors_file, include_file, exclude_file):
    vec_words = list(
        file_utils.read_line(vectors_file, lambda line: line.split()[0]))
    print(f'{vec_words[0]}')
    vec_word_set = set(vec_words)  # set for O(1) membership checks
    exclude_words = [word for word in word_index if word not in vec_word_set]
    include_words = [word for word in word_index if word in vec_word_set]
    file_utils.save_list2file(exclude_words, exclude_file)
    file_utils.save_list2file(include_words, include_file)
def seg_long_phrases(origin_long_txt, seged_long_txt2):
    segs_list = file_utils.read_line(origin_long_txt,
                                     lambda line: segment.seg_text(line))
    seg_list = [
        seg for segs in segs_list for seg in segs.split(' ') if len(seg) > 1
    ]
    file_utils.save_list2file(list(set(seg_list)), seged_long_txt2)
def seg_clf_file(clf_file_pair):
    raw_clf_file, seged_clf_file = clf_file_pair
    print(f'seg file {raw_clf_file} to {seged_clf_file}')
    seged_lines = file_utils.read_line(
        raw_clf_file, lambda line: segment.seg_raw_doc(json.loads(line)))
    file_utils.save_list2file(
        seged_lines, seged_clf_file,
        lambda doc_json: json.dumps(doc_json, ensure_ascii=False))
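# seg_clf_file takes a single (src, dest) tuple, which suggests it is meant to
# be mapped over many file pairs. A hedged driver sketch; the directory layout
# and pool size below are assumptions, not part of the original code.
def seg_clf_dir_sketch(raw_dir, seged_dir, processes=4):
    import multiprocessing
    pairs = [(os.path.join(raw_dir, name), os.path.join(seged_dir, name))
             for name in os.listdir(raw_dir)]
    with multiprocessing.Pool(processes) as pool:
        pool.map(seg_clf_file, pairs)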
def create_corpus(seged_clf_dir, corpus_file):
    for seged_clf_file in os.listdir(seged_clf_dir):
        print(f'add clf {seged_clf_file}')
        file2read = os.path.join(seged_clf_dir, seged_clf_file)
        texts = file_utils.read_line(file2read,
                                     lambda line: json.loads(line)['abs'])
        file_utils.save_list2file(
            texts, corpus_file,
            filter_func=lambda text: text and len(text) > 0)
    print('create corpus complete')
def join_phrases(phrase_union_txt, *phrase_txts):
    print('start join...')
    phrase_set = set()
    for phrase_txt in phrase_txts:
        for phrase in file_utils.read_line(phrase_txt):
            if len(phrase) < 6:
                phrase_set.add(phrase)
    print(f'set phrases is: {phrase_set}')
    phrases = sorted(phrase_set)
    file_utils.save_list2file(phrases, phrase_union_txt)
def gen_train_text(answers_dir, seged_texts_dir, train_file):
    total_answers = get_content_dict(answers_dir, ':')
    # list_utils.print_dict(total_answers)
    total_seged_texts = get_content_dict(seged_texts_dir, '\t')
    # list_utils.print_dict(total_seged_texts)
    train_list = [(clf, total_seged_texts.get(_id))
                  for _id, clf in total_answers.items()]
    # list_utils.print_list(train_list)
    file_utils.save_list2file(train_list, train_file,
                              lambda pair: f'{pair[0]}\t{pair[1]}')
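# get_content_dict is defined elsewhere in this module. A minimal sketch of
# what it presumably does (an assumption, not the original implementation):
# walk every file in a directory, split each line once on the given delimiter,
# and collect an {id: content} mapping.
def get_content_dict_sketch(content_dir, delimiter):
    content_dict = {}
    for name in os.listdir(content_dir):
        for line in file_utils.read_line(os.path.join(content_dir, name)):
            parts = line.split(delimiter, 1)
            if len(parts) == 2:
                content_dict[parts[0]] = parts[1]
    return content_dict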
def build_vocab(train_txt_path, vocab_txt_path, vocab_size=5000):
    """Build the vocabulary from the training set and save it to disk."""
    contents = file_utils.read_line(
        train_txt_path,
        lambda line_contents: line_contents[1]
        if len(line_contents) > 1 else '',
        split='\t')
    counter = Counter(
        [word for content in contents for word in content.split()])
    count_pairs = counter.most_common(vocab_size - 1)
    words, _ = list(zip(*count_pairs))
    # Prepend a <PAD> token so every text can be padded to the same length.
    words = ['<PAD>'] + list(words)
    file_utils.save_list2file(words, vocab_txt_path)
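# A small companion sketch (not in the original module): load the vocabulary
# written by build_vocab back into a word -> id mapping, which is what most
# downstream encoders need. Uses plain file IO so it does not assume anything
# about file_utils beyond what is shown above.
def read_vocab_sketch(vocab_txt_path):
    with open(vocab_txt_path, encoding='utf-8') as vocab_file:
        words = [line.strip() for line in vocab_file if line.strip()]
    return words, {word: word_id for word_id, word in enumerate(words)}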
def select_sample(seged_dir, select_dir):
    # seged_dir = 'E:/ip_data/clfs/new_seged/no_limit'
    # select_dir = 'E:/ip_data/clfs/new_seged/no_limit_t'
    clf_dict, total_count = get_clf_info(seged_dir)
    for clf_file in os.listdir(seged_dir):
        clf_name = clf_file[0:4]
        clf_count = clf_dict[clf_name]
        read_count = int(clf_count / total_count * 10000)
        if read_count > 20:
            file2read = os.path.join(seged_dir, clf_file)
            lines = list(file_utils.read_line(file2read))
            random.shuffle(lines)
            print(f'clf {clf_name}, clf count {clf_count}, '
                  f'write count {read_count}')
            save_file = f'{clf_name}_{read_count}.txt'
            file_utils.save_list2file(lines[0:read_count],
                                      os.path.join(select_dir, save_file))
def write_docs(store_dir: str, clf: Classification, docs, count):
    """
    Write ip doc json strings to a local file, one per line. The file is
    named in a format like 'A_01_B_300.txt', where the trailing number is
    the count of docs stored in the file.

    :param store_dir: directory the file is written to
    :param clf: classification the docs belong to
    :param docs: docs fetched from mongo, each item is a Bson obj
    :param count: number of docs, used as the file name suffix
    :return:
    """
    file_suffix = f'{count}.txt'
    logger.info(f'start write tasks {clf} with suffix {file_suffix}')
    file_name = f'{clf}_{file_suffix}'
    file_path = path.join(store_dir, file_name)
    logger.info(f'tasks docs store file path is {file_path}')
    file_utils.save_list2file(docs, file_path,
                              lambda doc: json_encoder.doc2json(doc))
def group_phrases(origin_file, short_file, median_file, long_file):
    phrases = file_utils.read_line(origin_file)
    short_phrases = []
    median_phrases = []
    long_phrases = []
    for phrase in phrases:
        print(f'{phrase}')
        if len(phrase) < 6:
            short_phrases.append(phrase)
        elif len(phrase) > 10:
            long_phrases.append(phrase)
        else:
            median_phrases.append(phrase)
    file_utils.save_list2file(short_phrases, short_file)
    file_utils.save_list2file(median_phrases, median_file)
    file_utils.save_list2file(long_phrases, long_file)
def extract_abs(clf_file_pair):
    raw_clf_file, seged_clf_file = clf_file_pair
    print(f'extract abs file {raw_clf_file} to {seged_clf_file}')
    abs_lines = file_utils.read_line(
        raw_clf_file, lambda line: segment.seg_text(json.loads(line)['abs']))
    file_utils.save_list2file(abs_lines, seged_clf_file)
def collect_new_dict(dict_dir: str, dest_dict_file: str):
    new_dict_files = file_utils.get_files(dict_dir)
    word_counter = Counter([
        verify(word) for dict_file in new_dict_files
        for word in file_utils.read_line(dict_file)
    ])
    # list_utils.print_list(word_counter.keys())
    file_utils.save_list2file(word_counter.keys(), dest_dict_file)
        yield get_words_from_ctg_page(ctg_page)


def get_page_count(ctg_pg0):
    soup = BeautifulSoup(ctg_pg0, features='lxml')
    page_info = soup.find('span', text=re.compile('^共.*')).text
    # e.g. '共16页 共[306]词汇' ("16 pages in total, [306] terms in total")
    print('page info is {}'.format(page_info))
    matcher = re.search('[0-9]+', page_info)
    return int(matcher.group(0)) if matcher else 0


def get_words_from_ctg_page(ctg_pg):
    soup = BeautifulSoup(ctg_pg, features='lxml')
    tr_tags = soup.find(id='lblcon').find_all('tr')
    words = [tr_tag.find_all('td')[1].a.text for tr_tag in tr_tags
             if not tr_tag.has_attr('class')]
    return words


if __name__ == '__main__':
    html = get_html(catalogue_url)
    # print('text {}'.format(html))
    ctg_uris = get_categories_uri(html)
    words = get_words_from_ctg(ctg_uris)
    dict_cnki_path = 'F:/temp/ip_nlp/cnki_dict.txt'
    # get_words_from_ctg yields one list of words per category page
    for word in words:
        # print(word)
        file_utils.save_list2file(word, dict_cnki_path)
    print('all tasks complete...')
def extract_eng(raw_phrase_txt, eng_file):
    engs_lists = file_utils.read_line(
        raw_phrase_txt, lambda line: english_pattern.findall(line))
    file_utils.save_list2file(engs_lists, eng_file,
                              lambda engs: '\n'.join(engs))
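# english_pattern is a module-level regex defined elsewhere. A plausible
# equivalent (an assumption, not the original definition) that extracts runs
# of Latin letters from mixed Chinese/English phrases:
import re

english_pattern_sketch = re.compile(r'[A-Za-z]+')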
def process_raw_answers(raw_answers_dir, processed_answer_dir):
    for raw_answer in os.listdir(raw_answers_dir):
        raw_answer_file = os.path.join(raw_answers_dir, raw_answer)
        processed_answers = process_raw_answer(raw_answer_file)
        store_answer_file = os.path.join(processed_answer_dir, raw_answer)
        file_utils.save_list2file(processed_answers, store_answer_file)
            yield word


def grab_failed_page():
    while len(fail_pages) > 0:
        for fail_page_url in fail_pages[:]:
            fail_page_html = get_html(fail_page_url)
            word = get_word(fail_page_html)
            if word:
                fail_pages.remove(fail_page_url)
                yield word


def test():
    # html_page = get_html(base_url + 'h_5286500000.html')
    html_page = get_html('http://dict.cnki.net/h_9999999000.html')
    print('html page is:')
    print(html_page)
    soup = BeautifulSoup(html_page, features='lxml')
    input_value = soup.find(id='txt2').attrs['value']
    print('input value is {}'.format(len(input_value)))


if __name__ == '__main__':
    words = grab_words(max_page_num)
    file_utils.save_list2file(words, 'F:/temp/ip_nlp/cnki_trans.txt')
    if len(fail_pages) > 0:
        # Retry the pages that failed on the first pass and save their words
        # (the original saved `words` again here, which had already been
        # consumed; save the retried results instead).
        supply_words = grab_failed_page()
        file_utils.save_list2file(supply_words, 'F:/temp/ip_nlp/cnki_trans.txt')