def exec_pre_test(test_data_path):
    subfiles = fi.listchildren(test_data_path, children_type='file')
    # file_list = fu.split_multi_format(
    #     [(test_data_path + file) for file in subfiles if file.endswith('.json')], process_num=6)
    # twarr_blocks = fu.multi_process(fi.summary_unzipped_tweets_multi,
    #                                 [(file_list_slice,) for file_list_slice in file_list])
    # prepend the directory so load_array receives a full path (the listing returns bare file names)
    twarr_blocks = filter_twarr(
        [fu.load_array(test_data_path + file) for file in subfiles if file.endswith('.json')])
    twarr = au.merge_array(twarr_blocks)
    tu.start_ner_service(pool_size=16)
    tu.twarr_ner(twarr)
    tu.end_ner_service()
    all_ids = set(fu.load_array(test_data_path + 'test_ids_all.csv'))
    pos_ids = set(fu.load_array(test_data_path + 'test_ids_pos.csv'))
    non_pos_ids = all_ids.difference(pos_ids)
    pos_twarr = list()
    non_pos_twarr = list()
    for tw in twarr:
        twid = tw[tk.key_id]
        if twid in pos_ids:
            pos_twarr.append(tw)
        elif twid in non_pos_ids:
            non_pos_twarr.append(tw)
    fu.dump_array(getcfg().pos_data_file, pos_twarr)
    fu.dump_array(getcfg().non_pos_data_file, non_pos_twarr)
def parse_query_list(from_path, into_path, query_list, n_process):
    from_path = fi.add_sep_if_needed(from_path)
    into_path = fi.add_sep_if_needed(into_path)
    all_sub_files = [
        file for file in fi.listchildren(from_path, children_type=fi.TYPE_FILE, pattern='.sum$')
    ]
    tw_num_sum = 0
    for query in query_list:
        query = SeedQuery(*query)
        query_sub_files = [
            os.path.join(from_path, f) for f in all_sub_files
            if query.is_time_desired(tw_ymd=query.time_of_tweet(f, source='filename'))
        ]
        print('{} files from {} to {}'.format(
            len(query_sub_files),
            query_sub_files[0][query_sub_files[0].rfind('/') + 1:],
            query_sub_files[-1][query_sub_files[-1].rfind('/') + 1:],
        ))
        twarr = query_from_files_multi(query_sub_files, query, n_process)
        tw_num_sum += len(twarr)
        file_name = query.to_string() + '.json'
        if len(twarr) > 20:
            print('file {} written\n'.format(file_name))
            fu.dump_array(os.path.join(into_path, file_name), twarr)
        else:
            print('twarr not long enough')
            for tw in twarr:
                print(tw[tk.key_text], '\n')
    print('total tweet number: {}'.format(tw_num_sum))
def dump_dict(self, file_name):
    self.reset_id()
    for word in self.vocabulary():
        if type(word) is not str:
            self.drop_word(word)
    word_id_freq_arr = [(word.strip(), int(self.word2id(word)), int(self.freq_of_word(word)))
                        for word in sorted(self.vocabulary())]
    fu.dump_array(file_name, word_id_freq_arr)
def test(self, test_file):
    textarr, labelarr = file2label_text_array(test_file)
    featurearr = self.textarr2featurearr(textarr)
    probarr = self.predict_proba(featurearr)
    au.precision_recall_threshold(
        labelarr, probarr, file="performance.csv",
        thres_range=[i / 100 for i in range(1, 10)] + [i / 20 for i in range(2, 20)])
    fu.dump_array("result.json", (labelarr, probarr))
def dump_dict(self, file_name):
    self.reset_id()
    for word in self.vocabulary():
        if type(word) is not str:
            self.drop_word(word)
    fu.dump_array(file_name, [
        K_DELIMITER.join(['{}'] * 3).format(word.strip(), int(self.freq_of_word(word)), int(self.word2id(word)))
        for word in sorted(self.vocabulary()) if type(word) is str
    ])
def test(self, test_file):
    textarr, labelarr = file2label_text_array(test_file)
    """"""
    # docarr = su.textarr_nlp(textarr, self.get_nlp())
    # featurearr = self.textarr2featurearr(textarr, docarr)
    featurearr = self.textarr2featurearr_no_gpe(textarr)
    """"""
    probarr = self.predict_proba(featurearr)
    au.precision_recall_threshold(
        labelarr, probarr,
        thres_range=[i / 100 for i in range(1, 10)] + [i / 20 for i in range(2, 20)])
    fu.dump_array("result.json", (labelarr, probarr))
def merge_events_2016():
    base = '/home/nfs/cdong/tw/seeding/Terrorist/queried/event_corpus/'
    subs = fi.listchildren(base, fi.TYPE_FILE)
    twarr_list = []
    for sub in subs:
        twarr = fu.load_array(base + sub)
        # twarr = tu.twarr_ner(twarr)
        # twarr = ark.twarr_ark(twarr)
        twarr_list.append(twarr)
    fu.dump_array('/home/nfs/cdong/tw/seeding/Terrorist/queried/event2016.txt', twarr_list)
def extract_bad_tweets_into(files, output_file):
    total_tw_num = 0
    neg_twarr = list()
    for file in files:
        twarr = fu.load_array(file)
        total_tw_num += len(twarr)
        for tw in twarr:
            text = tw[tk.key_text]
            if len(text) < 20 or not pu.has_enough_alpha(text, 0.6):
                neg_twarr.append(tw)
    fu.dump_array(output_file, neg_twarr)
    return len(neg_twarr), total_tw_num
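# A minimal usage sketch of extract_bad_tweets_into; the directory and output file
# below are hypothetical, and fi.listchildren(..., concat=True) is assumed to return
# full paths, as it is used elsewhere in this repo.
# _files = fi.listchildren('/home/nfs/cdong/tw/origin/', fi.TYPE_FILE, concat=True)
# _bad_num, _total_num = extract_bad_tweets_into(_files, 'bad_tweets.json')
# print('{} / {} tweets judged bad'.format(_bad_num, _total_num))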
def summary_files_in_path_into_blocks(from_path, into_path, file_name):
    from_path = fi.add_sep_if_needed(from_path)
    sub_files = fi.listchildren(from_path, children_type=fi.TYPE_FILE, pattern='.json$')
    into_file = fi.add_sep_if_needed(into_path) + file_name
    twarr_block = list()
    for idx, file in enumerate(sub_files):
        from_file = from_path + file
        twarr = fu.load_array_catch(from_file)
        if len(twarr) <= 0:
            continue
        twarr = tflt.filter_twarr(twarr, tflt.FILTER_LEVEL_HIGH)
        twarr_block.append(twarr)
    print(sorted([('id' + str(idx), len(twarr)) for idx, twarr in enumerate(twarr_block)], key=lambda x: x[1]))
    print('event number in total: {}'.format(len(twarr_block)))
    fu.dump_array(into_file, twarr_block)
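# Hedged usage sketch: gather every per-event .json file under a directory into one
# block file. The source directory matches the one used in merge_events_2016 above;
# the target directory and output file name are hypothetical.
# summary_files_in_path_into_blocks(
#     '/home/nfs/cdong/tw/seeding/Terrorist/queried/event_corpus/',
#     '/home/nfs/cdong/tw/seeding/Terrorist/queried/', 'event_blocks.json')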
def summary_files_in_path(from_path, into_path=None):
    """ Read all .json files under from_path and extract their tweets into a single file under into_path. """
    # [-13:] -> hour, [-13:-3] -> day, [-13:-5] -> month; "ymdh" is short for "year-month-day-hour"
    from_path = fi.add_sep_if_needed(from_path)
    file_ymdh_arr = pu.split_digit_arr(fi.get_parent_path(from_path)[-13:])
    if not is_target_ymdh(file_ymdh_arr):
        return
    into_file = '{}{}'.format(fi.add_sep_if_needed(into_path), '_'.join(file_ymdh_arr) + '.sum')
    fi.remove_file(into_file)
    subfiles = fi.listchildren(from_path, children_type=fi.TYPE_FILE)
    file_block = mu.split_multi_format([(from_path + subfile) for subfile in subfiles], process_num=20)
    twarr_blocks = mu.multi_process(sum_files, [(file_list, tflt.FILTER_LEVEL_LOW) for file_list in file_block])
    twarr = au.merge_array(twarr_blocks)
    if twarr:
        fu.dump_array(into_file, twarr, overwrite=True)
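# Hedged example of the ymdh parsing above: assuming the raw tweet directories are
# laid out as .../<year>/<month>/<day>/<hour>/, the last 13 characters of the parent
# path would look like "2016/05/12/09" (a hypothetical date), and pu.split_digit_arr
# is assumed to pull out the runs of digits.
# pu.split_digit_arr('2016/05/12/09')  # expected to yield ['2016', '05', '12', '09']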
def test(self, test_file):
    """
    Given a file of labelled texts, read the text-label pairs, call the vectorization
    interface and the classifier, and evaluate the classifier's performance on the test set.
    :param test_file: str, path of the text file used for testing.
    :return:
    """
    textarr, labelarr = file2label_text_array(test_file)
    """"""
    # docarr = su.textarr_nlp(textarr, self.get_nlp())
    # featurearr = self.textarr2featurearr(textarr, docarr)
    featurearr = self.textarr2featurearr_no_gpe(textarr)
    """"""
    probarr = self.predict_proba(featurearr)
    au.precision_recall_threshold(
        labelarr, probarr,
        thres_range=[i / 100 for i in range(1, 10)] + [i / 20 for i in range(2, 20)])
    fu.dump_array("result.json", (labelarr, probarr))
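# Hedged sketch of inspecting the dumped evaluation output; it mirrors the
# prb_lbl_arr.txt round trip used elsewhere in this repo and assumes fu.load_array
# returns the (labelarr, probarr) pair exactly as dumped above.
# _labelarr, _probarr = fu.load_array("result.json")
# au.precision_recall_threshold(_labelarr, _probarr)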
def refilter_twarr(in_file, out_file):
    twarr = fu.load_array(in_file)[:200000]
    origin_len = len(twarr)
    print(origin_len)
    clf_filter = ClassifierTerror()
    # for idx in range(len(twarr) - 1, -1, -1):
    #     text = twarr[idx][tk.key_text]
    #     if not pu.has_enough_alpha(text, 0.6):
    #         print(text)
    #         twarr.pop(idx)
    # text_filter_len = len(twarr)
    # print("delta by text =", origin_len - text_filter_len)
    tmu.check_time("refilter_twarr")
    twarr = clf_filter.filter(twarr, 0.2)
    tmu.check_time("refilter_twarr")
    print(len(twarr))
    fu.dump_array(out_file, twarr[:100000])
def make_tw_batches(self, batch_size):
    ordered_twarr = self.order_twarr_through_time()
    tw_batches = split_array_into_batches(ordered_twarr, batch_size)
    self.twarr_info(au.merge_array(tw_batches))
    fu.dump_array(self.labelled_batch_file, tw_batches)
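# Minimal usage sketch; "_holder" stands for whatever instance owns make_tw_batches
# and labelled_batch_file, and the batch size is hypothetical. The dumped batches can
# be reloaded with fu.load_array like the other arrays in this repo.
# _holder.make_tw_batches(batch_size=100)
# _tw_batches = fu.load_array(_holder.labelled_batch_file)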
exit()


""" When there are fewer than 30 texts, the keyword quality is already rather poor;
    try to keep the number of input texts above a certain threshold. """
""" The content inside __main__ stays unchanged; it is the final form of the interface. """
_keyword_file = 'keyword_results.json'
_file_name_keywords_list = fu.load_array(_keyword_file)
# for filename, keyword in _file_name_keywords_list:
#     print(filename)
#     print(filter_keywords(keyword, 20), '\n')
exit()

_base = "/home/nfs/cdong/tw/seeding/Terrorist/queried/positive/"
_files = fi.listchildren(_base, fi.TYPE_FILE, concat=True)
_twarr_list = [fu.load_array(file) for file in _files]
tmu.check_time()
_file_name_list = [fi.get_name(file) for file in _files]
_keywords_list = autophrase_multi(_twarr_list, process_num=8)  # mainly to verify that this output is correct and whether there is a speedup
tmu.check_time()
_res = list(zip(_file_name_list, _keywords_list))
fu.dump_array(_keyword_file, _res)

# _keywords_list = fu.load_array(_keyword_file)
# print(len(_keywords_list))
# print([len(_keywords) for _keywords in _keywords_list])
# assert len(_twarr_list) == len(_keywords_list)
# for _idx, _keywords in enumerate(_keywords_list):
#     # print('word num', len(_keywords), ', tw num', len(_twarr_list[idx]))
#     if len(_keywords) < 20:
#         print(len(_twarr_list[_idx]))
def clusters_tfidf_similarity(self, file_name):
    import pandas as pd
    import utils.function_utils as fu
    # print('clusters_tfidf_similarity')
    # tmu.check_time()
    # """ construct tf vector for every cluster """
    # cluid_arr = sorted(self.cludict.keys())
    # valid_corpus_token_set = self.valid_corpus_token_set
    # cluid2vec = dict([(cluid, None) for cluid in cluid_arr])
    # for cluid, cluster in self.cludict.items():
    #     if cluster.twnum == 0:
    #         raise ValueError('cluster should have at least one document to make sense')
    #     clu_vec = np.array([])
    #     for k_type in TokenSet.KEY_LIST:
    #         valid_ifd = valid_corpus_token_set.get(k_type)
    #         vocab_size = valid_ifd.vocabulary_size()
    #         type_tf_vec = np.zeros([vocab_size])
    #         clu_ifd = cluster.token_set.get(k_type)
    #         for word, freq in clu_ifd.word_freq_enumerate():
    #             type_tf_vec[valid_ifd.word2id(word)] = freq
    #         clu_vec = np.concatenate([clu_vec, type_tf_vec])
    #     cluid2vec[cluid] = clu_vec
    # """ make idf """
    # vec_len = sum([valid_corpus_token_set.get(k_type).vocabulary_size() for k_type in TokenSet.KEY_LIST])
    # print('vector length sum({})={}'.format(
    #     [valid_corpus_token_set.get(k_type).vocabulary_size() for k_type in TokenSet.KEY_LIST], vec_len))
    # d = len(cluid2vec)
    # for i in range(vec_len):
    #     df = 1
    #     for clu_vec in cluid2vec.values():
    #         if clu_vec[i] > 0:
    #             df += 1
    #     idf = np.log(d / df)
    #     for clu_vec in cluid2vec.values():
    #         clu_vec[i] *= idf
    #
    # tmu.check_time(print_func=lambda dt: print('construct tf-idf vector dt={}'.format(dt)))
    #
    # """ cosine similarity matrix """
    # cosine_matrix = au.cosine_matrix_multi([cluid2vec[cluid].reshape([-1]) for cluid in cluid_arr], process_num=16)
    # sim_matrix = pd.DataFrame(index=cluid_arr, columns=cluid_arr, data=0.0, dtype=np.float32)
    # for i in range(len(cluid_arr)):
    #     cluidi = cluid_arr[i]
    #     for j in range(i + 1, len(cluid_arr)):
    #         cluidj = cluid_arr[j]
    #         cos_sim = au.cosine_similarity(cluid2vec[cluidi], cluid2vec[cluidj])
    #         sim_matrix.loc[cluidi, cluidj] = sim_matrix.loc[cluidj, cluidi] = cos_sim
    # tmu.check_time(print_func=lambda dt: print('cosine similarity single dt={}'.format(dt)))
    
    # TODO for each cluster, get the four similarities with other clusters,
    # use them as features to make classification whether two clusters are of same label
    """ one matrix per type """
    type2vecarr = dict([(k_type, None) for k_type in TokenSet.KEY_LIST])
    cluid_arr = sorted(self.cludict.keys())
    valid_corpus_token_set = self.valid_corpus_token_set
    for k_type in TokenSet.KEY_LIST:
        valid_ifd = valid_corpus_token_set.get(k_type)
        vec_len = valid_ifd.vocabulary_size()
        for cluid in cluid_arr:
            clu_vec = np.zeros([vec_len])
            clu_ifd = self.cludict[cluid].token_set.get(k_type)
            for word, freq in clu_ifd.word_freq_enumerate():
                clu_vec[valid_ifd.word2id(word)] = freq
            type2vecarr[k_type] = np.concatenate([type2vecarr[k_type], clu_vec.reshape([1, -1])]) \
                if type2vecarr[k_type] is not None else clu_vec.reshape([1, -1])
        print(k_type, type2vecarr[k_type].shape)
    """ a matrix per type """
    w_dict = {su.pos_prop: 0.4, su.pos_comm: 0.3, su.pos_verb: 0.2, su.pos_hstg: 0.1}
    cosine_matrix = np.zeros([len(cluid_arr), len(cluid_arr)])
    for k_type in TokenSet.KEY_LIST:
        if 0 in type2vecarr[k_type].shape:
            continue
        cosmtx = au.cosine_similarity([vec.reshape([-1]) for vec in type2vecarr[k_type]], process_num=16)
        cosine_matrix += cosmtx * w_dict[k_type]
    """ ### ### """
    """   ||    """
    """   __    """
    sim_matrix = pd.DataFrame(index=cluid_arr, columns=cluid_arr, data=cosine_matrix, dtype=np.float32)
    # tmu.check_time(print_func=lambda dt: print('cosine similarity multiple dt={}'.format(dt)))
    """ for each cluster, find top k similar clusters """
    top_k = 3
    cluid2topsim = dict()
    for cluid, row in sim_matrix.iterrows():
        top_sim_cluids = row.index[np.argsort(row.values)[::-1][:top_k]]
        cluid2topsim[cluid] = {'cluidarr': top_sim_cluids, 'scorearr': row[top_sim_cluids].tolist()}
    # tmu.check_time(print_func=lambda dt: print('find top 5 similar dt={}'.format(dt)))
    
    """ find representative label for every cluster """
    cluid2label = dict()
    rep_score = 0.7
    df = cs.cluid_label_table([int(i) for i in self.label], [int(i) for i in self.z])
    for cluid, row in df.iterrows():
        clu_twnum = sum(row.values)
        assert clu_twnum == self.cludict[cluid].twnum
        rep_label = int(row.index[np.argmax(row.values)])
        rep_twnum = row[rep_label]
        if rep_twnum == 0 or rep_twnum < clu_twnum * rep_score:
            cluid2label[cluid] = -1
        else:
            cluid2label[cluid] = rep_label
    # tmu.check_time(print_func=lambda dt: print('find representative label dt={}'.format(dt)))
    
    """ verify top sim. and rep. label """
    assert len(set(cluid2topsim.keys()).difference(set(cluid_arr))) == 0
    assert len(set(cluid2label.keys()).difference(set(cluid_arr))) == 0
    sim_info = list()
    for cluid in cluid_arr:
        clu_replb = cluid2label[cluid]
        clu_twnum = self.cludict[cluid].twnum
        sim_cluid_arr = cluid2topsim[cluid]['cluidarr']
        sim_score_arr = cluid2topsim[cluid]['scorearr']
        top_sim_cluid = sim_cluid_arr[0]
        top_sim_replb = cluid2label[top_sim_cluid]
        top_sim_twnum = self.cludict[top_sim_cluid].twnum
        top_sim_score = sim_score_arr[0]
        if top_sim_score < 0.4 or cluid >= top_sim_cluid:
            continue
        # print('\ncid {}, lb [{}], twnum {}'.format(cluid, clu_replb, clu_twnum))
        info = 'lb {:3} tw {:3} <-> lb {:3} tw {:3}, score {}'.format(
            clu_replb, clu_twnum, top_sim_replb, top_sim_twnum, round(top_sim_score, 2))
        sim_info.append(info)
        # for idx in range(top_k):
        #     sim_cluid = sim_cluid_arr[idx]
        #     if sim_cluid <= cluid:
        #         continue
        #     sim_clu_twnum = self.cludict[sim_cluid].twnum
        #     print('    cid {:4}, lb [{:3}], score {}, twnum {}'.format(
        #         sim_cluid, cluid2label[sim_cluid], round(sim_score_arr[idx], 2), sim_clu_twnum))
    fu.dump_array(file_name, sim_info, False)
probarr = my_filter.predict_proba(fu.load_array(file))
pos_probarr.extend(probarr)
# post_twarr = list()
# for idx in range(len(probarr)):
#     if probarr[idx] >= 0.35:
#         post_twarr.append(twarr[idx])
#     else:
#         print(twarr[idx][tk.key_text])
# post_twarr = [tw for idx, tw in enumerate(twarr) if probarr[idx] >= 0.4]
# post_total_len += len(post_twarr)
# print(len(post_twarr) / len(twarr), '\n\n\n')
tmu.check_time()

lblarr = [1 for _ in range(len(pos_probarr))] + [0 for _ in range(len(neg_probarr))]
prbarr = pos_probarr + neg_probarr
fu.dump_array("prb_lbl_arr.txt", (lblarr, prbarr))
lblarr, prbarr = fu.load_array("prb_lbl_arr.txt")
au.precision_recall_threshold(lblarr, prbarr)
# print('total portion = {} / {} = {}'.format(post_total_len, pre_total_len, post_total_len / pre_total_len))
tmu.check_time()
exit()

sub_files = fi.listchildren('/home/nfs/cdong/tw/origin/', fi.TYPE_FILE, concat=True)[18:19]
twarr = au.merge_array([fu.load_array(file) for file in sub_files])
print(len(twarr))
tmu.check_time(print_func=None)
for idx, tw in enumerate(twarr[14000:15000]):
    if (idx + 1) % 1000 == 0:
        print(idx)
    try:
        my_filter.get_features(tw)
def dump_cluidarr(self, cluidarr):
    fu.dump_array(self.filtered_cluidarr_file, cluidarr)