def is_valid_word(word):
    """Return True iff *word* passes the length, stop-word and alpha-ratio filters.

    Relies on module-level settings: ``len_min`` / ``len_max`` bound the word
    length (a falsy bound disables that check), ``stop_corpus`` is the
    stop-word collection, and ``alpha_thres`` is the minimum alphabetic ratio
    enforced via ``pu.has_enough_alpha``.
    """
    word = word.lower().strip()
    # Guard clauses: reject as soon as any enabled filter fails.
    if len_min and len(word) < len_min:
        return False
    if len_max and len(word) > len_max:
        return False
    if stop_corpus and word in stop_corpus:
        return False
    if alpha_thres and not pu.has_enough_alpha(word, alpha_thres):
        return False
    return True
def extract_bad_tweets_into(files, output_file, min_len=20, alpha_thres=0.6):
    """Collect low-quality tweets from *files* and dump them to *output_file*.

    A tweet is considered bad when its text is shorter than *min_len*
    characters or its alphabetic ratio is below *alpha_thres* (as judged by
    ``pu.has_enough_alpha``). The previously hard-coded thresholds (20, 0.6)
    are now keyword parameters with the same defaults, so existing callers
    are unaffected.

    :param files: iterable of file paths readable by ``fu.load_array``
    :param output_file: path the rejected tweets are dumped to
    :param min_len: minimum acceptable text length (default 20)
    :param alpha_thres: minimum acceptable alphabetic ratio (default 0.6)
    :return: tuple ``(rejected_count, total_count)``
    """
    total_tw_num = 0
    neg_twarr = list()
    for file in files:
        twarr = fu.load_array(file)
        total_tw_num += len(twarr)
        for tw in twarr:
            text = tw[tk.key_text]
            if len(text) < min_len or not pu.has_enough_alpha(text, alpha_thres):
                neg_twarr.append(tw)
    fu.dump_array(output_file, neg_twarr)
    return len(neg_twarr), total_tw_num
def filter_twarr_text(twarr):
    """ This function only suits for tweets that are not processed

    Normalizes each tweet's text, drops tweets whose normalized text is empty
    or has an alphabetic ratio below 0.65, and for kept tweets stores the raw
    text under ``tk.key_orgntext`` and the normalized text under ``tk.key_text``.

    :param twarr: list of tweet dicts
    :return: list of tweets that passed normalization and filtering
    """
    flt_twarr = list()
    for tw in twarr:
        # TODO text_orgn = tw.get(tk.key_text, '').strip()
        # BUGFIX: the innermost default was None, so a tweet carrying neither
        # key crashed with AttributeError on None.strip(); default to '' so
        # such tweets are skipped by the emptiness guard below instead.
        text_orgn = tw.get(tk.key_orgntext, tw.get(tk.key_text, '')).strip()
        if not text_orgn:
            continue
        text_norm = pu.text_normalization(text_orgn).strip()
        if pu.is_empty_string(text_norm) or not pu.has_enough_alpha(text_norm, 0.65):
            continue
        tw[tk.key_orgntext] = text_orgn
        tw[tk.key_text] = text_norm
        flt_twarr.append(tw)
    return flt_twarr
def filter_twarr_text(twarr):
    """Pre-process the text of every tweet in *twarr* and drop tweets whose
    processed text does not qualify; each kept tweet keeps its raw text in
    the ``tk.key_orgntext`` field and the normalized result in ``tk.key_text``.

    :param twarr: list of tweet dicts
    :return: list of tweets that passed normalization and filtering
    """
    kept = list()
    for tw in twarr:
        raw = tw.get(tk.key_text, '').strip()
        if not raw:
            continue
        normalized = pu.text_normalization(raw).strip()
        if pu.is_empty_string(normalized) or not pu.has_enough_alpha(normalized, 0.65):
            continue
        tw[tk.key_orgntext] = raw
        tw[tk.key_text] = normalized
        kept.append(tw)
    return kept
def merge_events_2016():
    """Load every per-event corpus file under the event_corpus folder and dump
    the resulting list of tweet arrays into a single event2016.txt file."""
    base = '/home/nfs/cdong/tw/seeding/Terrorist/queried/event_corpus/'
    merged = [fu.load_array(base + name) for name in fi.listchildren(base, fi.TYPE_FILE)]
    fu.dump_array('/home/nfs/cdong/tw/seeding/Terrorist/queried/event2016.txt', merged)


if __name__ == '__main__':
    # merge_events_2016()
    import utils.pattern_utils as pu
    base = "/home/nfs/cdong/tw/seeding/Terrorist/queried/event_corpus/"
    for path in fi.listchildren(base, fi.TYPE_FILE, concat=True):
        twarr = fu.load_array(path)
        count_before = len(twarr)
        # Walk backwards so popping an element never shifts an index we have
        # yet to visit; low-alpha texts are printed as they are discarded.
        for idx in reversed(range(len(twarr))):
            text = twarr[idx][tk.key_text]
            if not pu.has_enough_alpha(text, 0.6):
                print(text)
                twarr.pop(idx)
        print(count_before, '->', len(twarr), '\n\n')
        # fu.dump_array(path, twarr)