def get_quality_phrase(twarr, threshold):
    quality_list = list()
    if len(twarr) == 0:
        return quality_list
    fu.write_lines(raw_train_file, [tw[tk.key_text] for tw in twarr])
    autophrase(raw_train_file, model_base)
    lines = fu.read_lines(output_file)
    for line in lines:
        confidence, phrase = line.strip().split(maxsplit=1)
        if float(confidence) > threshold:
            quality_list.append(phrase)
    return quality_list
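
# A minimal, self-contained sketch of the confidence-filtering step used above, assuming
# AutoPhrase's output format of "<confidence><whitespace><phrase>" per line. The sample
# input and the 0.7 threshold in the usage note are illustrative, not from the pipeline.
def filter_phrases_by_confidence_sketch(lines, threshold):
    quality_list = list()
    for line in lines:
        confidence, phrase = line.strip().split(maxsplit=1)
        if float(confidence) > threshold:
            quality_list.append(phrase)
    return quality_list

# e.g. filter_phrases_by_confidence_sketch(["0.93\tquality phrase", "0.12\tnoise"], 0.7) -> ["quality phrase"]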
def make_train_test():
    p_file = ft_data_pattern.format("pos_2016.txt")
    n_bad_files = fi.listchildren(ft_data_pattern.format(''), fi.TYPE_FILE, concat=True, pattern='2016_bad')
    n_2017_files = fi.listchildren(ft_data_pattern.format(''), fi.TYPE_FILE, concat=True, pattern='2017')
    # n_2012_fulls = fi.listchildren(ft_data_pattern.format(''), fi.TYPE_FILE, concat=True, pattern='2012_full')[:12]
    n_2012_fulls = fi.listchildren(ft_data_pattern.format(''), fi.TYPE_FILE, concat=True, pattern='2012_full')
    n_2016_files = fi.listchildren(ft_data_pattern.format(''), fi.TYPE_FILE, concat=True, pattern='2016_queried')
    print(len(n_bad_files), len(n_2017_files), len(n_2012_fulls), len(n_2016_files))
    n_files = n_bad_files + n_2017_files + n_2012_fulls + n_2016_files

    p_txtarr = fu.read_lines(p_file)
    p_prefix_txtarr = prefix_textarr(label_t, p_txtarr)
    n_txtarr_blocks = [fu.read_lines(file) for file in n_files]
    n_prefix_txtarr_blocks = [prefix_textarr(label_f, txtarr) for txtarr in n_txtarr_blocks]

    train_test = list()
    bad = len(n_bad_files)
    bad_blocks, n_blocks = n_prefix_txtarr_blocks[:bad], n_prefix_txtarr_blocks[bad:]
    train_test.append(split_train_test(p_prefix_txtarr))
    train_test.extend([split_train_test(block) for block in n_blocks])
    print("len(train_test)", len(train_test))

    train_list, test_list = zip(*train_test)
    train_list = list(train_list) + bad_blocks
    train_txtarr = au.merge_array(train_list)
    test_txtarr = au.merge_array(test_list)
    fu.write_lines(fasttext_train, train_txtarr)
    fu.write_lines(fasttext_test, test_txtarr)
    print("len(train_list)", len(train_list), "len(train_txtarr)", len(train_txtarr),
          "len(test_txtarr)", len(test_txtarr))
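
# `prefix_textarr` and `split_train_test` are used above but defined elsewhere in the repo.
# The sketches below are assumptions based on how make_train_test calls them: fastText
# expects every training line to start with its label token, and each text block is split
# into a (train, test) pair. The 0.8 ratio is an illustrative default, not from the source.
import random

def prefix_textarr_sketch(label, txtarr):
    # Prepend the fastText label token to every text line.
    return ['{} {}'.format(label, text) for text in txtarr]

def split_train_test_sketch(txtarr, train_ratio=0.8):
    # Shuffle one block of labeled lines and split it into (train, test).
    txtarr = list(txtarr)
    random.shuffle(txtarr)
    cut = int(len(txtarr) * train_ratio)
    return txtarr[:cut], txtarr[cut:]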
def autophrase_wrapper(process_code, textarr):
    # process_code identifies the working directory owned by this process; textarr is a list of texts
    process_base = fi.join(autophrase_output_base, str(process_code))
    copy_into_process_base(process_base)
    commander = fi.join(process_base, "auto_phrase.sh")
    input_text_file = fi.join(process_base, "raw_train.txt")
    output_keyword_file = fi.join(process_base, "AutoPhrase.txt")
    # write the text list to a file and run AutoPhrase on it
    fu.write_lines(input_text_file, textarr)
    min_sup = determine_min_sup(len(textarr))
    autophrase(input_text_file, process_base, commander, process_base, min_sup)
    # read the AutoPhrase output as (confidence, phrase) pairs
    lines = fu.read_lines(output_keyword_file)
    conf_word_list = list()
    for line in lines:
        conf, word = line.split(maxsplit=1)
        conf_word_list.append((float(conf), word))
    # fi.rmtree(os.path.join(process_base, 'tmp'))
    return conf_word_list
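
# `determine_min_sup` is called above but not shown in this section. A plausible sketch:
# scale AutoPhrase's minimum-support parameter with the size of the per-process batch so
# that small batches can still yield candidate phrases. The breakpoints are assumptions.
def determine_min_sup_sketch(text_num):
    if text_num < 1000:
        return 3
    elif text_num < 50000:
        return 10
    else:
        return 30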
def make_text_files():
    for idx, file in enumerate(neg_2012_full_files):
        twarr = fu.load_array(file)
        txtarr = list()
        for tw in twarr:
            text = pu.text_normalization(tw[tk.key_text])
            if pu.is_empty_string(text) or len(text) < 20:
                continue
            txtarr.append(text)
        print('len delta', len(twarr) - len(txtarr))
        path = Path(file)
        out_file_name = '_'.join([path.parent.name, path.name]).replace('json', 'txt')
        out_file = ft_data_pattern.format(out_file_name)
        print(out_file)
        fu.write_lines(out_file, txtarr)
    return
    # NOTE: the early return above short-circuits the function; the block below only runs if it is removed.
    p_twarr_blocks = map(fu.load_array, pos_files)
    p_txtarr_blocks = map(twarr2textarr, p_twarr_blocks)
    p_txtarr = au.merge_array(list(p_txtarr_blocks))
    p_out_file = ft_data_pattern.format('pos_2016.txt')
    fu.write_lines(p_out_file, p_txtarr)

    for f in neg_files:
        in_file = neg_event_pattern.format(f)
        out_file = ft_data_pattern.format(f.replace("json", "txt"))
        twarr = fu.load_array(in_file)
        txtarr = twarr2textarr(twarr)
        print(len(twarr), '->', len(txtarr), len(twarr) - len(txtarr))
        fu.write_lines(out_file, txtarr)
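
# `twarr2textarr` is used above but defined elsewhere; this sketch mirrors the filtering
# loop at the top of make_text_files: normalize each tweet's text and drop texts that are
# empty or shorter than 20 characters. Treat any difference from the real helper as an assumption.
def twarr2textarr_sketch(twarr):
    txtarr = list()
    for tw in twarr:
        text = pu.text_normalization(tw[tk.key_text])
        if pu.is_empty_string(text) or len(text) < 20:
            continue
        txtarr.append(text)
    return txtarr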
def write_cic_list(path, cic_list):
    """
    Call construct_od on each element of cic_list to obtain an OrderedDict,
    and persist every OrderedDict to a file under the given path.
    :param path: str, output directory
    :param cic_list: list, each element is a ClusterInfoCarrier
    :return:
    """
    fi.mkdir(path, remove_previous=True)
    cic_list = sorted(cic_list, key=lambda item: len(item.twarr), reverse=True)
    print(' bext: output cic list, len={}'.format(len(cic_list)))
    for idx, cic in enumerate(cic_list):
        cluid = cic.cluid
        cic.twarr = ClusterInfoGetter.group_similar_tweets(cic.twarr, process_num=10)
        od = cic.construct_od()
        json_str = fu.dumps(od)
        cluster_file = fi.join(path, '{}_cluid:{}.json'.format(idx, cluid))
        fu.write_lines(cluster_file, [json_str])
        # textarr_file = fi.join(path, '{}_text.json'.format(idx))
        # textarr = [tw[tk.key_text] for tw in cic.twarr]
        # fu.write_lines(textarr_file, textarr)
    print(' bext: output into files over')
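
# Usage sketch (assumed): fetch the current batch of clusters from the batch extractor and
# persist them under a run directory. The output path below is illustrative only.
# _cic_list = bext.get_batch_output()
# write_cic_list('/home/nfs/cdong/tw/src/calling/tmp/clusters', _cic_list)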
_cic_list = bext.get_batch_output()
print('get cic outputs, type:{}'.format(type(_cic_list)))
for cic in _cic_list:
    twnum = len(cic.twarr)
    _geo_list = [geo['address'] for geo in cic.od['geo_infer'] if geo['quality'] == 'locality']
    print('cluid:{}, twarr len:{}'.format(cic.cluid, twnum))
    print(cic.od['summary']['keywords'])
    print(_geo_list)
    print('\n')
    if len(_geo_list) == 0:
        _top_geo = 'NOGPE'
    else:
        _top_geo = '`'.join(_geo_list)
    _out_file = '/home/nfs/cdong/tw/src/calling/tmp/id{}_tw{}_{}.txt'.format(cic.cluid, twnum, _top_geo)
    _txtarr = [tw[tk.key_text] for tw in cic.twarr]
    _idx_g, _txt_g = au.group_similar_items(_txtarr, score_thres=0.3, process_num=20)
    _txt_g = [sorted(g, key=lambda t: len(t), reverse=True) for g in _txt_g]
    _txtarr = au.merge_array(_txt_g)
    fu.write_lines(_out_file, _txtarr)
tmu.check_time()
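
# `au.merge_array` is used throughout this file to flatten a list of lists into a single
# list; its real implementation lives in the `au` utility module, so the sketch below is
# an assumed equivalent for reference only.
def merge_array_sketch(array_of_arrays):
    merged = list()
    for arr in array_of_arrays:
        merged.extend(arr)
    return merged

# e.g. merge_array_sketch([[1, 2], [3], []]) -> [1, 2, 3]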