def exec_pre_test(test_data_path):
    subfiles = fi.listchildren(test_data_path, children_type='file')
    # file_list = fu.split_multi_format(
    #     [(test_data_path + file) for file in subfiles if file.endswith('.json')], process_num=6)
    # twarr_blocks = fu.multi_process(fi.summary_unzipped_tweets_multi,
    #                                 [(file_list_slice,) for file_list_slice in file_list])
    # listchildren returns bare file names, so the path prefix has to be re-attached
    twarr_blocks = filter_twarr(
        [fu.load_array(test_data_path + file) for file in subfiles if file.endswith('.json')])
    twarr = au.merge_array(twarr_blocks)
    
    tu.start_ner_service(pool_size=16)
    tu.twarr_ner(twarr)
    tu.end_ner_service()
    
    all_ids = set(fu.load_array(test_data_path + 'test_ids_all.csv'))
    pos_ids = set(fu.load_array(test_data_path + 'test_ids_pos.csv'))
    non_pos_ids = all_ids.difference(pos_ids)
    pos_twarr = list()
    non_pos_twarr = list()
    for tw in twarr:
        twid = tw[tk.key_id]
        if twid in pos_ids:
            pos_twarr.append(tw)
        elif twid in non_pos_ids:
            non_pos_twarr.append(tw)
    fu.dump_array(getcfg().pos_data_file, pos_twarr)
    fu.dump_array(getcfg().non_pos_data_file, non_pos_twarr)
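# filter_twarr is called above but not defined in this file. A minimal sketch of
# its assumed contract (hypothetical implementation, not the real filter): blocks
# of tweets in, blocks with malformed entries dropped out. The actual filter may
# additionally apply language or spam checks.
def filter_twarr_sketch(twarr_blocks):
    return [[tw for tw in block if tk.key_id in tw and tk.key_text in tw]
            for block in twarr_blocks]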
def get_semantic_tokens_multi(file_path):
    pos_type_info = {
        ark.prop_label: {K_IFD: IdFreqDict(), K_FILE: getcfg().pre_prop_dict_file},
        ark.comm_label: {K_IFD: IdFreqDict(), K_FILE: getcfg().pre_comm_dict_file},
        ark.verb_label: {K_IFD: IdFreqDict(), K_FILE: getcfg().pre_verb_dict_file},
        ark.hstg_label: {K_IFD: IdFreqDict(), K_FILE: getcfg().pre_hstg_dict_file},
    }
    total_doc_num = 0
    file_path = fi.add_sep_if_needed(file_path)
    # subfiles = au.random_array_items(fi.listchildren(file_path, children_type=fi.TYPE_FILE), 40)
    subfiles = fi.listchildren(file_path, children_type=fi.TYPE_FILE)
    file_list_block = mu.split_multi_format(
        [(file_path + subfile) for subfile in subfiles], process_num=20)
    res_list = mu.multi_process(get_semantic_tokens,
                                [(file_list,) for file_list in file_list_block])
    for res_type_info, doc_num in res_list:
        total_doc_num += doc_num
        for label in res_type_info.keys():
            pos_type_info[label][K_IFD].merge_freq_from(res_type_info[label][K_IFD])
    print('total_doc_num', total_doc_num)
    for label in pos_type_info.keys():
        ifd, file_name = pos_type_info[label][K_IFD], pos_type_info[label][K_FILE]
        ifd.drop_words_by_condition(3)
        if label != ark.hstg_label:
            ifd.drop_words_by_condition(lambda word, _: word.startswith('#'))
        ifd.dump_dict(file_name)
        print('{}; vocab size:{}'.format(file_name, ifd.vocabulary_size()))
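# The get_semantic_tokens worker passed to mu.multi_process above is not shown in
# this fragment. A sketch of its assumed contract, inferred from how res_list is
# consumed: each worker returns (label -> {K_IFD: IdFreqDict()}, processed doc
# count). fu.load_array is assumed available; the per-tweet counting is elided.
def get_semantic_tokens_sketch(file_list):
    res_type_info = {label: {K_IFD: IdFreqDict()} for label in
                     (ark.prop_label, ark.comm_label, ark.verb_label, ark.hstg_label)}
    doc_num = 0
    for file in file_list:
        twarr = fu.load_array(file)
        doc_num += len(twarr)
        # per-tweet POS tagging and per-label word counting would go here
    return res_type_info, doc_num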
def main():
    # fi.iterate_file_tree(getconfig().data_path, summary_files_in_path,
    #                      summary_path='/home/nfs/cdong/tw/testdata/cdong/non')
    # summarization.get_semantic_tokens_multi(getcfg().origin_path)
    summarization.get_tokens_multi(getcfg().origin_path)
def main(args):
    input_base = getcfg().origin_path
    output_base = '/home/nfs/cdong/tw/seeding/Terrorist/queried/positive/'
    import utils.timer_utils as tmu
    tmu.check_time()
    parse_query_list(input_base, output_base, seed_queries, n_process=15)
    tmu.check_time()
def parse_args():
    parser = argparse.ArgumentParser(description="Seeding information")
    parser.add_argument('--summary_path', default=getcfg().origin_path,
                        help='Filtered tweets organized by day as files XX_XX_XX_XX.sum under this path.')
    parser.add_argument('--seed_path', default=getcfg().seed_path,
                        help='Path for seed instances extracted according to a particular query.')
    parser.add_argument('--unlb', action='store_true', default=False,
                        help='Whether the query is performed over unlabeled tweets.')
    parser.add_argument('--cntr', action='store_true', default=False,
                        help='Whether the query is performed over counter tweets.')
    # parser.add_argument('--query', action='store_true', default=False,
    #                     help='Whether to query tweets from summarized tw files.')
    # parser.add_argument('--ner', action='store_true', default=False,
    #                     help='Whether to perform NER on the queried file.')
    # parser.add_argument('--train', action='store_true', default=False,
    #                     help='Whether to train the model on the queried tweets, with internal logic.')
    # parser.add_argument('--temp', action='store_true', default=False,
    #                     help='Just a temp function.')
    # parser.add_argument('--matrix', action='store_true', default=False,
    #                     help='Obtain the matrix for both train and test twarr.')
    #
    # parser.add_argument('--test_data_path', default=getcfg().test_data_path,
    #                     help='Path for test data from dzs.')
    # parser.add_argument('--pre_test', action='store_true', default=False,
    #                     help='Just a temp function to preprocess data from dzs.')
    return parser.parse_args()
def get_tokens_multi(file_path):
    file_path = fi.add_sep_if_needed(file_path)
    # subfiles = au.random_array_items(fi.listchildren(file_path, children_type=fi.TYPE_FILE), 20)
    subfiles = fi.listchildren(file_path, children_type=fi.TYPE_FILE)
    file_list_block = mu.split_multi_format(
        [(file_path + subfile) for subfile in subfiles], process_num=20)
    res_list = mu.multi_process(get_tokens, [(file_list,) for file_list in file_list_block])
    id_freq_dict, total_doc_num = IdFreqDict(), 0
    for ifd, doc_num in res_list:
        total_doc_num += doc_num
        id_freq_dict.merge_freq_from(ifd)
    print('total_doc_num', total_doc_num, 'total vocabulary_size', id_freq_dict.vocabulary_size())
    id_freq_dict.drop_words_by_condition(3)
    id_freq_dict.dump_dict(getcfg().post_dict_file)
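# Example invocation, matching how main() calls this module. Run under a
# __main__ guard, since mu.multi_process spawns worker processes:
if __name__ == '__main__':
    get_tokens_multi(getcfg().origin_path)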
import math
import subprocess
import multiprocessing as mp

from config.configure import getcfg

service_command = getcfg().ner_service_command


class ServiceException(Exception):
    def __init__(self, message):
        Exception.__init__(self)
        self.message = message


class NerServiceProxy:
    def __init__(self):
        self.service = None
    
    def clear_service(self):
        if self.service is not None:
            self.service.terminate()
            del self.service
        self.service = None
    
    def is_service_open(self):
        return self.service is not None and self.service.poll() is None
    
    def open_ner_service(self, classify, pos):
        if self.is_service_open():
            return
        self.service = subprocess.Popen(service_command, shell=True, close_fds=True,
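# A usage sketch for NerServiceProxy, assuming open_ner_service completes the
# Popen call truncated above (classify/pos presumably toggle the service's
# annotators). This is an illustration of the intended lifecycle, not code
# from the source.
def _ner_proxy_demo():
    proxy = NerServiceProxy()
    proxy.open_ner_service(classify=True, pos=True)  # no-op if already running
    assert proxy.is_service_open()
    proxy.clear_service()  # terminates the underlying subprocess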
def input_twarr_with_label(twarr, label):
    alpha_range = beta_range = [i / 100 for i in range(1, 10, 2)] + [i / 10 for i in range(1, 10, 2)]
    """cluster using different hyperparams in multiprocess way"""
    iter_num = 100
    process_num = 20
    hyperparams = [(a, b) for a in alpha_range for b in beta_range]
    params = [(None, twarr, *param, iter_num, label) for param in hyperparams]
    res_list = ClusterService.clustering_multi(GSDPMM.GSDPMM_twarr, params, process_num)
    param_num = len(hyperparams)
    """group the data by alpha"""
    frame = pd.DataFrame(index=np.arange(0, param_num), columns=['alpha', 'beta'])
    for i in range(param_num):
        frame.loc[i] = hyperparams[i]
    """start plotting figures"""
    for alpha, indices in frame.groupby('alpha').groups.items():
        fig = plt.figure()
        fig.set_figheight(8)
        fig.set_figwidth(8)
        ax1 = fig.add_subplot(211)
        ax2 = fig.add_subplot(212)
        for i in indices:
            beta = frame.loc[i]['beta']
            topic_word_dstrb, tw_cluster_pred, iter_x, nmi_y, k_y = res_list[i]
            ax1.plot(iter_x, nmi_y, '-', lw=1.5, label='beta=' + str(round(beta, 2)))
            ax2.plot(iter_x, k_y, '^', lw=1.5, label='beta=' + str(round(beta, 2)))
        title = 'alpha=' + str(round(alpha, 2))
        ax1.set_title(title)
        ax1.set_ylabel('NMI')
        ax1.set_ylim(0.25, 0.75)
        ax1.legend(loc='lower left')
        ax1.text(iter_num * 0.6, 0.70,
                 'final nmi: ' + str(round(max([res_list[i][3][-1] for i in indices]), 4)),
                 fontsize=15)
        ax2.set_xlabel('iteration')
        ax2.set_ylabel('K num')
        ax2.legend(loc='lower left')
        plt.grid(True, linestyle='-', color='#333333', lw=0.8)
        plt.savefig(getcfg().dc_test + 'GSDPMM/GSDPMM_' + title + '.png')
    
    # top_K = 20
    # alpha_idx = 0
    # beta_idx = 1
    # tw_cluster_pred_idx = 3
    # nmi_idx = 5
    # table_idx = 7
    # recall_idx = 8
    #
    # event_cluster_label = [i for i in range(12)]
    # summary_list = [hyperparams[i] + res_list[i] +
    #                 ClusterService.event_table_recall(label, res_list[i][1], event_cluster_label)
    #                 for i in range(param_num)]
    # top_recall_summary_list = [summary_list[i] for i in
    #                            np.argsort([summary[recall_idx] for summary in summary_list])[::-1][:top_K]]
    # top_nmi_summary_list = [summary_list[i] for i in
    #                         np.argsort([summary[nmi_idx][-1] for summary in summary_list])[::-1][:top_K]]
    #
    # top_nmi_path = getcfg().dc_test + 'GSDPMM/max_nmis/'
    # top_recall_path = getcfg().dc_test + 'GSDPMM/max_recalls/'
    # fi.rmtree(top_nmi_path)
    # fi.rmtree(top_recall_path)
    #
    # def dump_cluster_info(summary_list, base_path):
    #     for rank, summary in enumerate(summary_list):
    #         res_dir = base_path + '{}_recall_{}_nmi_{}_alpha_{}_beta_{}/'. \
    #             format(rank, round(summary[recall_idx], 6), round(summary[nmi_idx][-1], 6),
    #                    summary[alpha_idx], summary[beta_idx])
    #         fi.makedirs(res_dir)
    #         tw_topic_arr = ClusterService.create_clusters_with_labels(twarr, summary[tw_cluster_pred_idx])
    #         for i, _twarr in enumerate(tw_topic_arr):
    #             if not len(_twarr) == 0:
    #                 fu.dump_array(res_dir + str(i) + '.txt', [tw[tk.key_text] for tw in _twarr])
    #         table = summary[table_idx]
    #         table.to_csv(res_dir + 'table.csv')
    #
    # dump_cluster_info(top_recall_summary_list, top_recall_path)
    # dump_cluster_info(top_nmi_summary_list, top_nmi_path)
    return None, None
import os
import shutil
from subprocess import Popen, PIPE

from config.configure import getcfg
import utils.function_utils as fu
import utils.array_utils as au
import utils.file_iterator as fi
import utils.multiprocess_utils as mu
import utils.tweet_keys as tk
import utils.timer_utils as tmu

autophrase_base = getcfg().autophrase_path
# ensure that any AutoPhrase output is confined to some corner under output_base
autophrase_output_base = fi.join(autophrase_base, "OUTPUTS/")
command = fi.join(autophrase_base, "auto_phrase.sh")
fi.mkdir(autophrase_output_base)


def autophrase(input_text_file, output_path, commander, process_base, min_sup):
    p = Popen(commander, shell=True, bufsize=1, stdin=PIPE, stdout=PIPE, stderr=PIPE,
              cwd=process_base)
    p.stdin.flush()
    # auto_phrase.sh reads its parameters from stdin, one per line
    p.stdin.write((input_text_file + '\n').encode("utf8"))
    p.stdin.write((output_path + '\n').encode("utf8"))
    p.stdin.write((min_sup + '\n').encode("utf8"))
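# A hypothetical invocation: AutoPhrase reads its three parameters from stdin in
# the order written above (input corpus, output directory, minimal support).
# The file names below are placeholders, not paths from the source.
def _autophrase_demo():
    input_file = fi.join(autophrase_output_base, 'corpus.txt')
    output_dir = fi.join(autophrase_output_base, 'demo/')
    autophrase(input_file, output_dir, command, autophrase_base, min_sup='10')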
def input_twarr_with_label(self, twarr, label):
    # def GSDMM_twarr(self, alpha, etap, etac, etav, etah, K, iter_num, ref_labels=None)
    # self.GSDMM_twarr(0.01, 0.01, 0.01, 0.01, 0.01, 5, 30)
    base_path = getcfg().dc_test + 'SEMANTIC/'
    a_range = etap_range = etac_range = etav_range = etah_range = [0.01, 0.05, 0.1]
    K_range = [30, 40]
    iter_num = 50
    """cluster using different hyperparams in multiprocess way"""
    process_num = 19
    hyperparams = [(a, ep, ec, ev, eh, k)
                   for a in a_range for ep in etap_range for ec in etac_range
                   for ev in etav_range for eh in etah_range for k in K_range]
    param_num = len(hyperparams)
    res_list = cs.clustering_multi(SemanticClusterer.GSDMM_twarr,
                                   [(self, *param, iter_num, label) for param in hyperparams],
                                   process_num)
    column_name = ['alpha', 'etap', 'etac', 'etav', 'etah', 'K']
    # """start plotting figures"""
    # frame = pd.DataFrame(index=np.arange(0, param_num), columns=column_name, data=hyperparams)
    # for (alpha, K), indices in frame.groupby(['alpha', 'K']).groups.items():
    #     fig = plt.figure()
    #     fig.set_figheight(8)
    #     fig.set_figwidth(8)
    #     for i in indices:
    #         clu_word_distrb, tw_cluster_pred, iter_x, nmi_y = res_list[i]
    #         legend_params = ('etap', 'etac', 'etav', 'etah')
    #         plt_label = ','.join([p_name + str(frame.loc[i][p_name]) for p_name in legend_params])
    #         plt.plot(iter_x, nmi_y, '-', lw=1.5, label=plt_label)
    #     title = 'alpha=' + str(alpha) + ',K=' + str(K)
    #     plt.title(title)
    #     plt.ylabel('NMI')
    #     plt.ylim(0.25, 0.75)
    #     plt.legend(loc='lower left')
    #     plt.text(iter_num * 0.6, 0.70,
    #              'final nmi: ' + str(round(max([res_list[i][3][-1] for i in indices]), 4)), fontsize=15)
    #     plt.grid(True, '-', color='#333333', lw=0.8)
    #     plt.savefig(base_path + 'SEMANTIC' + title + '.png')
    """start dumping cluster information"""
    # def concat_param_name_values(param_names, param_values):
    #     if not len(param_names) == len(param_values):
    #         raise ValueError('inconsistent param number')
    #     return '_'.join(['{}_{:<3}'.format(param_names[i], param_values[i]) for i in range(len(param_names))])
    #
    # top_rank = 30
    # true_cluster = [i for i in range(12)]
    # tbl_recall_list = [ClusterService.event_table_recall(label, res_list[i][1], true_cluster)
    #                    for i in range(param_num)]
    # top_recall_idx = pd.DataFrame(data=[(i, tbl_recall_list[i][1], res_list[i][3][-1]) for i in range(param_num)]) \
    #     .sort_values(by=[1, 2], ascending=False).loc[:, 0][:top_rank]
    # top_nmi_idx = np.argsort([res_list[i][3][-1] for i in range(param_num)])[-1:-top_rank-1:-1]
    #
    # def dump_cluster_info(top_idx_list_, base_path_):
    #     for rank, idx in enumerate(top_idx_list_):
    #         res_dir = '{}{}_recall_{:0<6}_nmi_{:0<6}_{}/'. \
    #             format(base_path_, rank, round(tbl_recall_list[idx][1], 4), round(res_list[idx][3][-1], 4),
    #                    concat_param_name_values(column_name, hyperparams[idx]))
    #         fi.makedirs(res_dir)
    #         tw_topic_arr = ClusterService.create_clusters_with_labels(twarr, res_list[idx][1])
    #         for i, _twarr in enumerate(tw_topic_arr):
    #             if not len(_twarr) == 0:
    #                 fu.dump_array(res_dir + str(i) + '.txt', [tw[tk.key_text] for tw in _twarr])
    #         cluster_table = tbl_recall_list[idx][0]
    #         cluster_table.to_csv(res_dir + 'table.csv')
    #
    # top_recall_path = base_path + 'max_recalls/'
    # fi.rmtree(top_recall_path)
    # dump_cluster_info(top_recall_idx, top_recall_path)
    # top_nmi_path = base_path + 'max_nmis/'
    # fi.rmtree(top_nmi_path)
    # dump_cluster_info(top_nmi_idx, top_nmi_path)
    return 0, 0
def input_twarr_with_label(twarr, label):
    # alpha_range = beta_range = [i/100 for i in range(1, 10, 3)] + [i/10 for i in range(1, 10, 3)] + \
    #               [i for i in range(1, 10, 3)]
    # K_range = [30, 40, 50]
    alpha_range = beta_range = [i / 100 for i in range(1, 10, 4)] + [i / 10 for i in range(1, 10, 4)]
    K_range = [30, 40, 50]
    """cluster using different hyperparams in multiprocess way"""
    iter_num = 100
    process_num = 20
    hyperparams = [(a, b, K) for a in alpha_range for b in beta_range for K in K_range]
    res_list = list()
    for i in range(int(math.ceil(len(hyperparams) / process_num))):
        param_list = [(twarr, *param, iter_num)
                      for param in hyperparams[i * process_num:(i + 1) * process_num]]
        res_list += utils.multiprocess_utils.multi_process(GSDMM.GSDMM_twarr, param_list)
        print('{:<4} /'.format((i + 1) * process_num), len(hyperparams), 'params processed')
    """group the data by K"""
    frame = pd.DataFrame(index=np.arange(0, len(hyperparams)), columns=['alpha', 'beta', 'K'])
    for i in range(len(hyperparams)):
        frame.loc[i] = hyperparams[i]
    print('\n', frame, '\n')
    """start plotting figures"""
    for (alpha, K), indices in frame.groupby(['alpha', 'K']).groups.items():
        fig = plt.figure()
        fig.set_figheight(8)
        fig.set_figwidth(8)
        all_nmi = list()
        for i in indices:
            beta = frame.loc[i]['beta']
            tw_cluster_pred_iter = res_list[i]
            iter_x = range(len(tw_cluster_pred_iter))
            nmi_y = [au.score(label, pred, 'nmi') for pred in tw_cluster_pred_iter]
            all_nmi.append(nmi_y)
            plt.plot(iter_x, nmi_y, '-', lw=1.5, label='beta={}'.format(round(beta, 2)))
        plt.xlabel('iteration')
        plt.ylabel('NMI')
        plt.ylim(0.0, 0.75)
        plt.title('K=' + str(K))
        plt.legend(loc='lower right')
        plt.grid(True, linestyle='-', color='#333333', lw=0.8)
        plt.text(iter_num - 40, 0.70,
                 'final nmi: ' + str(round(max([nmi[-1] for nmi in all_nmi]), 6)),
                 fontsize=14, verticalalignment='bottom', horizontalalignment='left')
        plt.savefig(getcfg().dc_test + 'GSDMM/' + 'alpha={},K={}.png'.format(round(alpha, 2), K))
class ExtractSubProcess(DaemonProcess):
    def start(self, func):
        self.process = mp.Process(target=func, args=(self.inq, self.outq))
        self.process.daemon = False
        self.process.start()
        # print('ExtractSubProcess', self.process.pid)


extract_sub_process = ExtractSubProcess()

END_PROCESS = -1
SET_PARAMS = 0
INPUT_LIST = 1
OUTPUT_LIST = 3
OUT_BASE = getcfg().output_path
fi.mkdir(OUT_BASE, remove_previous=True)


def extract_sub_main(inq, outq):
    """
    Main function of the cluster-information-extraction subprocess. This process in
    turn owns a group of child processes that perform the actual work: it reads input
    from the main process, calls the child pool once for location extraction, merges
    clusters by location according to the returned results, then calls the child pool
    again to fully process the merged result, which is written to file as the final output.
    :param inq: mp.Queue, input queue from the main process to this subprocess
    :param outq: mp.Queue, output queue from this subprocess back to the main process
    :return:
    """
    pool_size, event_type = inq.get()
    extract_pool = CustomDaemonPool()
    extract_pool.start(extract_pool_main, pool_size)
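# A sketch of the assumed main-process side of this protocol: the first message
# is the untagged (pool_size, event_type) pair read by extract_sub_main, and
# later messages are (tag, payload) pairs using the constants above. The payload
# shapes are assumptions, not confirmed by this fragment.
def _extract_demo(cluster_twarr):
    extract_sub_process.start(extract_sub_main)
    extract_sub_process.inq.put((16, 'terror'))             # pool_size, event_type
    extract_sub_process.inq.put((INPUT_LIST, cluster_twarr))  # hypothetical payload
    extract_sub_process.inq.put((END_PROCESS, None))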
        if IfdGetter.K_IFD_FILE in kwargs:
            self.ifd_file = kwargs.get(IfdGetter.K_IFD_FILE)
        if self.ifd_file is None:
            raise ValueError('An id freq dict should be specified.')
        if self.ifd is None:
            self.ifd = IdFreqDict()
            self.ifd.load_dict(self.ifd_file)
        return self.ifd
    
    def reload(self, ifd_file):
        if self.ifd is not None:
            self.ifd.load_dict(ifd_file)


# pre_dict_file = getcfg().pre_dict_file
post_dict_file = getcfg().post_dict_file
token_dict = IfdGetter(post_dict_file)

# pre_list/post_list are needed by the __main__ block below
pre_list = [getcfg().pre_prop_file, getcfg().pre_comm_file, getcfg().pre_verb_file, getcfg().pre_hstg_file]
post_list = [getcfg().post_prop_file, getcfg().post_comm_file, getcfg().post_verb_file, getcfg().post_hstg_file]
# prop_dict, comm_dict, verb_dict, hstg_dict = [IfdGetter(post_file) for post_file in post_list]


if __name__ == '__main__':
    import utils.pattern_utils as pu
    
    def word_remove(word, freq):
        if pu.search_pattern(r'!?<>.,&\'`\^*', word) is not None or freq < 10:
            return True
        return False
    
    pre2post = dict(zip(pre_list, post_list))
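    # Usage sketch (hypothetical): IfdGetter defers disk IO until first call, so
    # token_dict() below loads post_dict_file once and then caches the result.
    demo_ifd = token_dict()
    print(demo_ifd.vocabulary_size())
    token_dict.reload(post_dict_file)  # force a re-read of the same file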
from config.configure import getcfg

# TODO: file names are identified via absolute paths for now; they are written
# as absolute paths in the config file.
afinn_file = getcfg().afinn_file

ark_marks = {'N': 0, 'O': 1, '^': 2, 'S': 3, 'Z': 4, 'V': 5, 'A': 6, 'R': 7, '!': 8, 'D': 9,
             'P': 10, '&': 11, 'T': 12, 'X': 13, '$': 14, ',': 15, 'G': 16, 'L': 17, 'M': 18, 'Y': 19}

# load the AFINN sentiment lexicon: one "word \t valence" pair per line
affinn_dict = {}
with open(afinn_file, 'r') as fp:
    for line in fp.readlines():
        l1 = line.split('\t')
        affinn_dict[l1[0].strip()] = int(l1[1].strip())


def count_ark_mark(ark_list):
    mark_list = [0] * 20
    for pos_mark in ark_list:
        if pos_mark[1] in ark_marks:
            mark_list[ark_marks[pos_mark[1]]] += 1
    return mark_list


def count_sentiment(text):
    words = text.split()
    sentiment = 0
    for word in words:
        if word in affinn_dict:
            sentiment += affinn_dict[word]  # completion implied by the lexicon lookup
    return sentiment
import os
import pickle
import sys
import traceback

from config.configure import getcfg
from preprocess.filter.filter_utils import get_all_substrings
from preprocess.filter.pos_tag_process import count_sentiment
import utils.array_utils as au
import utils.function_utils as fu
import utils.file_iterator as fi
import utils.pattern_utils as pu
import utils.tweet_keys as tk
import utils.timer_utils as tmu

sys.path.append(os.path.abspath(os.path.dirname(__file__)))

chat_filter_file = getcfg().chat_filter_file
is_noise_dict_file = getcfg().is_noise_dict_file
clf_model_file = getcfg().clf_model_file
black_list_file = getcfg().black_list_file


class UseGSDMM:
    def __init__(self):
        try:
            with open(chat_filter_file, 'rb') as f:
                self.c = pickle.load(f)
            with open(is_noise_dict_file, 'rb') as f:
                self.is_noise_dict = set(pickle.load(f))
        except Exception:
            print('load error')
            traceback.print_exc()
results will contain a list of lists (one per tweet) of triples, each triple
represents (term, type, confidence)
"""
import shlex
import subprocess

import utils.pattern_utils as pu
import utils.tweet_keys as tk
from config.configure import getcfg

# The only relevant source I've found is here:
# http://m1ked.com/post/12304626776/pos-tagger-for-twitter-successfully-implemented-in
# which is a very simple implementation; my implementation is a bit more useful (but not much).

# NOTE this command is directly lifted from runTagger.sh
# RUN_TAGGER_CMD = "java -XX:ParallelGCThreads=2 -Xmx500m -jar {}".format(getconfig().ark_service_command)
RUN_TAGGER_CMD = getcfg().ark_service_command


def _split_results(wordtags):
    """
    Parse the tab-delimited returned lines, modified from:
    https://github.com/brendano/ark-tweet-nlp/blob/master/scripts/show.py
    :param wordtags: a set of words and their tags (String) within one tweet.
    """
    word_tag_arr = list()
    for wordtag in wordtags:
        wordtag = wordtag.strip()  # remove '\n'
        if len(wordtag) > 0:
            parts = wordtag.split('\t')
            tokens, tags, confidence = parts[0], parts[1], float(parts[2])
            word_tag_arr.append((tokens, tags, confidence))
    # results are collected and returned at once, rather than yielded per request
    return word_tag_arr
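# A usage sketch for _split_results with fabricated tagger lines of the
# tab-delimited form the ARK tagger emits (token \t tag \t confidence):
def _split_results_demo():
    lines = ['hello\t!\t0.9861', 'world\tN\t0.9154', '']
    print(_split_results(lines))  # [('hello', '!', 0.9861), ('world', 'N', 0.9154)]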
import numpy as np

import utils.array_utils as au
import utils.file_iterator as fi
import utils.function_utils as fu
import classifying.fast_text_utils as ftu
import utils.multiprocess_utils as mu
import utils.pattern_utils as pu
import utils.tweet_keys as tk
import utils.timer_utils as tmu
from config.configure import getcfg
from classifying.terror.classifier_terror import \
    ClassifierTerror, file2label_text_array, text2label_text_array

value_t, value_f = ftu.value_t, ftu.value_f
nd_ft_model_file = getcfg().nd_ft_model_file
nd_clf_model_file = getcfg().nd_lr_model_file


class ClassifierNaturalDisaster(ClassifierTerror):
    def __init__(self, ft_model_file=nd_ft_model_file, clf_model_file=nd_clf_model_file):
        ClassifierTerror.__init__(self, ft_model_file, clf_model_file)
        self.ft_model = self.clf_model = None
        if ft_model_file:
            self.load_ft_model(ft_model_file)
        if clf_model_file:
            self.load_clf_model(clf_model_file)
    
    def textarr2featurearr(self, textarr):
# from sklearn.neural_network import MLPClassifier
# from sklearn.ensemble import GradientBoostingClassifier
from config.configure import getcfg
import classifying.fast_text_utils as ftu
import utils.array_utils as au
import utils.file_iterator as fi
import utils.function_utils as fu
import utils.multiprocess_utils as mu
import utils.pattern_utils as pu
import utils.spacy_utils as su
import utils.tweet_keys as tk
import utils.timer_utils as tmu

value_t, value_f = ftu.value_t, ftu.value_f
terror_ft_model_file = getcfg().terror_ft_model_file
terror_clf_model_file = getcfg().terror_lr_model_file


class ClassifierTerror:
    nlp = None
    sensitive_words = {'shooting', 'wounded', 'shots', 'attack', 'shooter', 'wounds', 'dead',
                       'terrorist', 'hurt', 'terror', 'police', 'killed', 'gunman', 'weapon',
                       'injured', 'attacked', 'bomb', 'bombed', 'attacker'}
    
    # @staticmethod
    # def get_nlp():
    #     if ClassifierTerror.nlp is None:
    #         ClassifierTerror.nlp = su.get_nlp_disable_for_ner()
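# A hedged sketch of one way the sensitive-word set could feed a feature,
# counting lexicon hits per text. This helper is hypothetical; the real
# featurization is not included in this fragment.
def _sensitive_hits(text):
    return sum(1 for token in text.lower().split()
               if token in ClassifierTerror.sensitive_words)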
from os import listdir
from os.path import isfile, join
import pickle
import traceback

import pandas as pd

from config.configure import getcfg
from preprocess.filter.ChatFilter import ChatFilter
from preprocess.filter.filter_utils import readFilesAsJsonList
import utils.pattern_utils as pu
import utils.tweet_keys as tk

# TODO: file names are identified via absolute paths for now; they are written
# as absolute paths in the config file.
class_dist_file = getcfg().class_dist_file
chat_filter_file = getcfg().chat_filter_file
is_noise_dict_file = getcfg().is_noise_dict_file
orgn_predict_label_file = getcfg().orgn_predict_label_file


class UseGSDMM:
    def __init__(self, trainning=None):
        self.c = ChatFilter()
        self.orgn_predict_label = None
        self.class_dist = None
        self.is_noise_dict = None
        if trainning is None:
            try:
                with open(chat_filter_file, 'rb') as f:
                    self.c = pickle.load(f)
import numpy as np

import utils.file_iterator as fi
import classifying.fast_text_utils as ftu
import utils.timer_utils as tmu
from config.configure import getcfg
from classifying.natural_disaster.classifier_nd import \
    ClassifierNaturalDisaster, text2label_text_array, recover_train_matrix, generate_train_matrices

value_t, value_f = ftu.value_t, ftu.value_f
k_ft_model_file = getcfg().k_ft_model_file
k_clf_model_file = getcfg().k_lr_model_file


class ClassifierK(ClassifierNaturalDisaster):
    def __init__(self, ft_model_file=k_ft_model_file, clf_model_file=k_clf_model_file):
        ClassifierNaturalDisaster.__init__(self, ft_model_file, clf_model_file)


def _generate_matrices(ft_model_file, lbl_txt_arr, mtx_file, lbl_file):
    print(len(lbl_txt_arr), mtx_file, lbl_file)
    textarr, labelarr = text2label_text_array(lbl_txt_arr)
    clf = ClassifierK(ft_model_file, None)
    featurearr = clf.textarr2featurearr(textarr)
    np.save(mtx_file, featurearr)
    np.save(lbl_file, labelarr)


if __name__ == "__main__":
from collections import Counter

from config.configure import getcfg
import classifying.fast_text_utils as ftu
import utils.array_utils as au

label2value = ftu.binary_label2value
model_file = getcfg().ft_add_model_file


def predict(target, threshold=0.5):
    """ Returns a value / value array for the given input text / text array;
    the value(s) depend on the threshold. """
    model = ftu.get_model(model_file)
    pred_value_arr, score_arr = ftu.binary_predict(target, model, threshold)
    return pred_value_arr, score_arr


def train(train_file, model_file):
    model = ftu.FastText()
    model.train_supervised(input=train_file, epoch=50, lr=2, wordNgrams=2, verbose=2, minCount=10)
    ftu.save_model(model_file, model)
    return model


def test(test_file, model_file):
    textarr, labelarr = list(), list()
    with open(test_file) as testfp:
        lines = testfp.readlines()
        for line in lines:
            label, text = line.strip().split(' ', 1)
            textarr.append(text)
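# Usage sketch: inputs follow fastText supervised conventions (lines prefixed
# with a label value, then the text). The training path is a placeholder.
def _fasttext_demo():
    train('/path/to/train.txt', model_file)
    values, scores = predict(['some tweet text'], threshold=0.5)
    print(values, scores)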
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

from preprocess.filter.filter_utils import readFilesAsJsonList, get_all_substrings
from preprocess.filter.pos_tag_process import count_ark_mark, count_sentiment
from preprocess.filter.pre_process import filterArray
from preprocess.filter.use_GSDMM import UseGSDMM
from utils.ark_service_proxy import twarr_ark
import utils.tweet_keys as tk
import utils.pattern_utils as pu
from config.configure import getcfg

# TODO: file names are identified via absolute paths for now; they are written
# as absolute paths in the config file.
clf_model_file = getcfg().clf_model_file
black_list_file = getcfg().black_list_file


class EffectCheck:
    def __init__(self, T_dir=None, F_dir=None):
        if T_dir is not None or F_dir is not None:
            self.T_corpus = readFilesAsJsonList(T_dir)
            self.F_corpus = readFilesAsJsonList(F_dir)
        self.gsdmm = None
        with open(black_list_file, 'r') as fp:
            self.spam_words = set([line.strip() for line in fp.readlines()])
    
    def run_a_function_list(self, function_list, print_pos_matchcase=False):
        for function in function_list:
            T_filtered = 0