def get_semantic_tokens_multi(file_path):
    """Build per-POS-label token frequency dictionaries over all files under file_path.

    Files are distributed across worker processes running get_semantic_tokens;
    each worker's per-label IdFreqDict results are merged, rare words are pruned
    (and hashtag tokens are pruned from the non-hashtag dictionaries), and every
    label's dictionary is dumped to its configured file.
    """
    pos_type_info = {
        ark.prop_label: {K_IFD: IdFreqDict(), K_FILE: getcfg().pre_prop_dict_file},
        ark.comm_label: {K_IFD: IdFreqDict(), K_FILE: getcfg().pre_comm_dict_file},
        ark.verb_label: {K_IFD: IdFreqDict(), K_FILE: getcfg().pre_verb_dict_file},
        ark.hstg_label: {K_IFD: IdFreqDict(), K_FILE: getcfg().pre_hstg_dict_file},
    }
    file_path = fi.add_sep_if_needed(file_path)
    children = fi.listchildren(file_path, children_type=fi.TYPE_FILE)
    full_paths = [file_path + child for child in children]
    path_blocks = mu.split_multi_format(full_paths, process_num=20)
    res_list = mu.multi_process(get_semantic_tokens, [(paths,) for paths in path_blocks])
    total_doc_num = 0
    for res_type_info, doc_num in res_list:
        total_doc_num += doc_num
        for label in res_type_info.keys():
            pos_type_info[label][K_IFD].merge_freq_from(res_type_info[label][K_IFD])
    print('total_doc_num', total_doc_num)
    for label in pos_type_info.keys():
        ifd, file_name = pos_type_info[label][K_IFD], pos_type_info[label][K_FILE]
        ifd.drop_words_by_condition(3)
        if label != ark.hstg_label:
            # only the hashtag dictionary keeps '#'-prefixed tokens
            ifd.drop_words_by_condition(lambda word, _: word.startswith('#'))
        ifd.dump_dict(file_name)
        print('{}; vocab size:{}'.format(file_name, ifd.vocabulary_size()))
def generate_train_matrices(ft_model_file, lbl_txt_file, mtx_lbl_file_list):
    """Vectorize a label-text file into per-block matrix/label files via subprocesses.

    The lines of lbl_txt_file are split into len(mtx_lbl_file_list) blocks; each
    block is handed to _generate_matrices together with the fastText model path
    and the corresponding (matrix_file, label_file) pair from mtx_lbl_file_list.
    """
    lbl_txt_lines = fu.read_lines(lbl_txt_file)
    line_blocks = mu.split_multi_format(lbl_txt_lines, len(mtx_lbl_file_list))
    args_list = [
        (ft_model_file, line_blocks[idx], mtx_file, lbl_file)
        for idx, (mtx_file, lbl_file) in enumerate(mtx_lbl_file_list)
    ]
    print([len(block) for block in line_blocks])
    mu.multi_process_batch(_generate_matrices, 10, args_list)
def twarr2filter(twarr):
    """Split one tweet list into batches and pass them to the filter & classify module.

    :param twarr: list of tweets; each element is a dict (an object parsed
        from a JSON-format string)
    :return: None
    """
    batches = mu.split_multi_format(twarr, flt_pool_size)
    bflt.input_twarr_batch(batches)
def twarr2filter(twarr):
    """Feed the tweet array into the filter in pool-sized batches.

    :param twarr: list of tweets to be filtered
    :return: None
    """
    split_batches = mu.split_multi_format(twarr, flt_pool_size)
    bflt.input_twarr_batch(split_batches)
def query_from_files_multi(file_list, query, n_process=10):
    """Run query_from_files over file_list in parallel and merge the results.

    As there may be many files, they are split into n_process blocks, each
    handled by a separate worker process.
    """
    blocks = mu.split_multi_format(file_list, n_process)
    res_list = mu.multi_process(query_from_files, args_list=[(blk, query) for blk in blocks])
    twarr = au.merge_array(res_list)
    print('len(res_list):{}, len(twarr):{}'.format(len(res_list), len(twarr)), end=', ')
    return twarr
def pairwise_score_multi(array, process_num, pair_func):
    """Score every unordered index pair of array with pair_func.

    Small inputs (fewer than 40 elements) or process_num <= 1 run serially;
    otherwise the pairs are split across process_num workers and the per-worker
    results are merged. Returns the list produced by pairwise_score.
    """
    n = len(array)
    pairs = [(i, j) for i in range(n - 1) for j in range(i + 1, n)]
    if process_num <= 1 or n < 40:
        return pairwise_score(array, pairs, pair_func)
    pair_blocks = mu.split_multi_format(pairs, process_num)
    arg_list = [(array, block, pair_func) for block in pair_blocks]
    score_blocks = mu.multi_process(pairwise_score, arg_list)
    return merge_array(score_blocks)
def get_tokens_multi(file_path):
    """Count token frequencies over all files under file_path using worker processes.

    Merges the per-process IdFreqDict results, prints totals, prunes rare words,
    and dumps the merged dictionary to the configured post-dict file.
    """
    file_path = fi.add_sep_if_needed(file_path)
    children = fi.listchildren(file_path, children_type=fi.TYPE_FILE)
    full_paths = [file_path + child for child in children]
    path_blocks = mu.split_multi_format(full_paths, process_num=20)
    res_list = mu.multi_process(get_tokens, [(paths,) for paths in path_blocks])
    merged_ifd, total_doc_num = IdFreqDict(), 0
    for ifd, doc_num in res_list:
        total_doc_num += doc_num
        merged_ifd.merge_freq_from(ifd)
    print('total_doc_num', total_doc_num, 'total vocabulary_size', merged_ifd.vocabulary_size())
    merged_ifd.drop_words_by_condition(3)
    merged_ifd.dump_dict(getcfg().post_dict_file)
def make_neg_event_bad_text_2016():
    """Extract bad-text tweets from the 2016 origin files into negative-event JSON files.

    Splits the origin file list across 4 worker processes; each worker writes
    its share to an indexed output file and returns (bad_count, tweet_count).
    Prints the per-process counts and the overall bad-tweet ratio.
    """
    files = fi.listchildren("/home/nfs/cdong/tw/origin/", fi.TYPE_FILE, concat=True)
    files_blocks = mu.split_multi_format(files, 4)
    output_file = neg_event_pattern.format("neg_2016_bad_text_{}.json")
    args_list = [(block, output_file.format(idx)) for idx, block in enumerate(files_blocks)]
    res_list = mu.multi_process(extract_bad_tweets_into, args_list)
    n_num_list, tw_num_list = zip(*res_list)
    total_n, total_tw = sum(n_num_list), sum(tw_num_list)
    # guard against ZeroDivisionError when the workers processed no tweets at all
    ratio = round(total_n / total_tw, 6) if total_tw else 0.0
    print(n_num_list, tw_num_list, total_n, total_tw, ratio)
def summary_files_in_path(from_path, into_path=None):
    """Read all .json under from_path and extract their tweets into one file under into_path.

    The output file is named after the year_month_date_hour ("ymdh") digits taken
    from the tail of from_path; directories whose ymdh does not match the target
    filter are skipped entirely.
    """
    # of the trailing 13 chars: [-13:] -> hour, [-13:-3] -> day, [-13:-5] -> month
    from_path = fi.add_sep_if_needed(from_path)
    ymdh_arr = pu.split_digit_arr(fi.get_parent_path(from_path)[-13:])
    if not is_target_ymdh(ymdh_arr):
        return
    into_file = '{}{}'.format(fi.add_sep_if_needed(into_path), '_'.join(ymdh_arr) + '.sum')
    fi.remove_file(into_file)
    children = fi.listchildren(from_path, children_type=fi.TYPE_FILE)
    path_blocks = mu.split_multi_format([from_path + child for child in children], process_num=20)
    twarr_blocks = mu.multi_process(sum_files, [(paths, tflt.FILTER_LEVEL_LOW) for paths in path_blocks])
    all_tweets = au.merge_array(twarr_blocks)
    if all_tweets:
        fu.dump_array(into_file, all_tweets, overwrite=True)
def generate_train_matrices(ft_model_file, lbl_txt_file, mtx_lbl_file_list):
    """Pre-compute training matrices from a label-text file using a fastText model.

    Given the path of a fastText model file, read the label-text file and split
    its content into blocks, passing each block to a subprocess; every subprocess
    converts its texts and labels into vector lists (i.e. matrices) written to
    the files in mtx_lbl_file_list. Pre-computing avoids regenerating the text
    vectors every time a classifier is trained on a large corpus.

    :param ft_model_file: str, path of the fastText model file
    :param lbl_txt_file: str, path of the label-text file
    :param mtx_lbl_file_list: list of tuples of str; in each tuple the first
        str names the file storing a matrix, the second names the file storing
        that matrix's label list
    :return: None
    """
    lbl_txt_lines = fu.read_lines(lbl_txt_file)
    line_blocks = mu.split_multi_format(lbl_txt_lines, len(mtx_lbl_file_list))
    args_list = [
        (ft_model_file, line_blocks[idx], mtx_file, lbl_file)
        for idx, (mtx_file, lbl_file) in enumerate(mtx_lbl_file_list)
    ]
    print([len(block) for block in line_blocks])
    mu.multi_process_batch(_generate_matrices, 10, args_list)