예제 #1
0
 def is_valid_word(word):
     """Return True when *word* passes every enabled filter.

     Each check (min/max length, stop-word membership, alpha ratio) is
     applied only when its governing setting is truthy; a falsy setting
     disables that check entirely.
     """
     word = word.lower().strip()
     if len_min and len(word) < len_min:
         return False
     if len_max and len(word) > len_max:
         return False
     if stop_corpus and word in stop_corpus:
         return False
     if alpha_thres and not pu.has_enough_alpha(word, alpha_thres):
         return False
     return True
예제 #2
0
def extract_bad_tweets_into(files, output_file):
    """Collect low-quality tweets from *files* and dump them to *output_file*.

    A tweet is considered bad when its text is shorter than 20 characters
    or fails the 0.6 alpha-ratio check.

    :param files: iterable of file paths, each loadable via fu.load_array
    :param output_file: path the bad tweets are dumped to
    :return: (number of bad tweets, total number of tweets scanned)
    """
    scanned = 0
    bad_tweets = []
    for path in files:
        tweets = fu.load_array(path)
        scanned += len(tweets)
        bad_tweets.extend(
            tw for tw in tweets
            if len(tw[tk.key_text]) < 20
            or not pu.has_enough_alpha(tw[tk.key_text], 0.6)
        )
    fu.dump_array(output_file, bad_tweets)
    return len(bad_tweets), scanned
예제 #3
0
def filter_twarr_text(twarr):
    """ This function only suits for tweets that are not processed.

    Normalizes each tweet's text and drops tweets whose text is empty or
    fails the 0.65 alpha-ratio check.  Surviving tweets are mutated in
    place: tk.key_orgntext keeps the original text, tk.key_text the
    normalized text.

    :param twarr: list of tweet dicts
    :return: list of tweets that passed normalization and filtering
    """
    flt_twarr = list()
    for tw in twarr:
        # TODO text_orgn = tw.get(tk.key_text, '').strip()
        # Default to '' (not None): with None, a tweet missing both keys
        # would crash on None.strip() with AttributeError.
        text_orgn = tw.get(tk.key_orgntext, tw.get(tk.key_text, '')).strip()
        if not text_orgn:
            continue
        text_norm = pu.text_normalization(text_orgn).strip()
        if pu.is_empty_string(text_norm) or not pu.has_enough_alpha(text_norm, 0.65):
            continue
        tw[tk.key_orgntext] = text_orgn
        tw[tk.key_text] = text_norm
        flt_twarr.append(tw)
    return flt_twarr
예제 #4
0
def filter_twarr_text(twarr):
    """
    Pre-process the text of every tweet in *twarr*, discarding tweets whose
    processed text is invalid; each surviving tweet keeps its original text
    in the tk.key_orgntext field and the pre-processed result in tk.key_text.

    :param twarr: list, tweets to filter
    :return: list, tweets that passed text pre-processing and filtering
    """
    kept = []
    for tw in twarr:
        raw = tw.get(tk.key_text, '').strip()
        if not raw:
            continue
        norm = pu.text_normalization(raw).strip()
        # Drop tweets whose normalized text is empty or mostly non-alpha.
        if pu.is_empty_string(norm) or not pu.has_enough_alpha(norm, 0.65):
            continue
        tw[tk.key_orgntext] = raw
        tw[tk.key_text] = norm
        kept.append(tw)
    return kept
예제 #5
0

def merge_events_2016():
    """Load every file under the event-corpus directory and dump all of
    them, as a list of tweet arrays, into a single event2016.txt file."""
    base = '/home/nfs/cdong/tw/seeding/Terrorist/queried/event_corpus/'
    twarr_list = [fu.load_array(base + sub)
                  for sub in fi.listchildren(base, fi.TYPE_FILE)]
    fu.dump_array('/home/nfs/cdong/tw/seeding/Terrorist/queried/event2016.txt',
                  twarr_list)


if __name__ == '__main__':
    # merge_events_2016()
    import utils.pattern_utils as pu
    base = "/home/nfs/cdong/tw/seeding/Terrorist/queried/event_corpus/"
    # Report (and remove in memory) tweets failing the 0.6 alpha-ratio
    # check in every corpus file; the dump back to disk stays disabled.
    for file in fi.listchildren(base, fi.TYPE_FILE, concat=True):
        twarr = fu.load_array(file)
        n_before = len(twarr)
        # Walk backwards so pop() never shifts an index we have yet to visit.
        for i in reversed(range(len(twarr))):
            text = twarr[i][tk.key_text]
            if not pu.has_enough_alpha(text, 0.6):
                print(text)
                twarr.pop(i)
        print(n_before, '->', len(twarr), '\n\n')
        # fu.dump_array(file, twarr)