def get_GSDMM_Feature(self, json):
     """ is tw a chat or not """
     if tk.key_orgntext in json:
         json[tk.key_text] = pu.text_normalization(json[tk.key_orgntext])
     else:
         text = json[tk.key_text]
         json[tk.key_orgntext] = text
         json[tk.key_text] = pu.text_normalization(text)
     topic_num = self.c.sample_cluster(json)
     if topic_num in self.is_noise_dict:
         return 1
     return 0
Example #2
def make_text_files():
    for idx, file in enumerate(neg_2012_full_files):
        twarr = fu.load_array(file)
        txtarr = list()
        for tw in twarr:
            text = pu.text_normalization(tw[tk.key_text])
            if pu.is_empty_string(text) or len(text) < 20:
                continue
            txtarr.append(text)
        print('len delta', len(twarr) - len(txtarr))
        path = Path(file)
        out_file_name = '_'.join([path.parent.name,
                                  path.name]).replace('json', 'txt')
        out_file = ft_data_pattern.format(out_file_name)
        print(out_file)
        fu.write_lines(out_file, txtarr)
    return  # NOTE: the code below this early return never executes
    p_twarr_blocks = map(fu.load_array, pos_files)
    p_txtarr_blocks = map(twarr2textarr, p_twarr_blocks)
    p_txtarr = au.merge_array(list(p_txtarr_blocks))
    p_out_file = ft_data_pattern.format('pos_2016.txt')
    fu.write_lines(p_out_file, p_txtarr)

    for f in neg_files:
        in_file = neg_event_pattern.format(f)
        out_file = ft_data_pattern.format(f.replace("json", "txt"))
        twarr = fu.load_array(in_file)
        txtarr = twarr2textarr(twarr)
        print(len(twarr), '->', len(txtarr), len(twarr) - len(txtarr))
        fu.write_lines(out_file, txtarr)
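For reference, the output file name above is built by joining the parent directory name to the file name and swapping the extension; a tiny standalone illustration (the path is made up):

from pathlib import Path

path = Path('/data/neg_2012_full/block_03.json')  # hypothetical input file
out_file_name = '_'.join([path.parent.name, path.name]).replace('json', 'txt')
print(out_file_name)  # neg_2012_full_block_03.txt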
Example #3
def chatFilter(orgn):
    global reason
    c_f = res.NORMAL
    corpus = pu.text_normalization(orgn).lower().split()
    vec_bow = dictionary.doc2bow(corpus)
    vec_lda = lda[vec_bow]
    maxSim = 0.35
    topicNum = -1
    for sim in vec_lda:
        if sim[1] > maxSim:
            topicNum = sim[0]
            maxSim = sim[1]
    #         print(topicNum)
    if topicNum != -1:
        #         print(nd_text[count])
        #         print (nd_corpus[count])
        if labeled_topic[topicNum] == 3:
            c_f = res.NOISY
            reason = "topic: " + str(topicNum)
        elif labeled_topic[topicNum] == 2:
            c_f = res.SUSPICIOUS
        else:
            c_f = res.NORMAL
        pass
    return c_f
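chatFilter relies on a gensim dictionary, a trained lda model and a labeled_topic table that are built elsewhere; a minimal sketch of how such objects might be prepared (the toy corpus and topic labels are placeholders, not the author's data):

from gensim import corpora, models

# Toy corpus standing in for the normalized, lower-cased, whitespace-split texts.
texts = [doc.lower().split() for doc in ['good morning everyone have a nice day',
                                         'breaking explosion reported downtown']]
dictionary = corpora.Dictionary(texts)
bow_corpus = [dictionary.doc2bow(t) for t in texts]
lda = models.LdaModel(bow_corpus, id2word=dictionary, num_topics=10, passes=5)
# labeled_topic maps each topic id to a manual label (3 = noisy, 2 = suspicious, else normal).
labeled_topic = {topic_id: 1 for topic_id in range(10)}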
Example #4
def twarr2textarr(twarr):
    textarr = list()
    for tw in twarr:
        text = tw.get(tk.key_text).strip()
        if tk.key_orgntext not in tw:
            text = pu.text_normalization(text)
        if pu.is_empty_string(text):
            continue
        textarr.append(text)
    return textarr
Example #5
 def textarr2featurearr(self, textarr):
     vecarr = list()
     for text in textarr:
         try:
             ft_vec = self.get_ft_vector(text)
         except:
             text = pu.text_normalization(text)
             ft_vec = self.get_ft_vector(text)
         vecarr.append(ft_vec)
     return np.array(vecarr)
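get_ft_vector is not shown in these snippets; assuming the embedding comes from the official fasttext Python package (an assumption, not confirmed by the source), it could look roughly like this:

import fasttext

class FtEmbedder:
    def __init__(self, model_path='/path/to/model.bin'):  # hypothetical model path
        self.model = fasttext.load_model(model_path)

    def get_ft_vector(self, text):
        # fastText sentence vectors expect single-line input; a newline makes
        # get_sentence_vector raise, which the try/except in textarr2featurearr
        # handles by re-normalizing the text and retrying.
        return self.model.get_sentence_vector(text)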
Example #6
 def textarr2featurearr_no_gpe(self, textarr):
     vecarr = list()
     for text in textarr:
         try:
             ft_vec = self.get_ft_vector(text)
         except:
             text = pu.text_normalization(text)
             ft_vec = self.get_ft_vector(text)
         ft_vec = np.append(ft_vec, self.has_keyword_feature(text))
         vecarr.append(ft_vec)
     return np.array(vecarr)
def filter_twarr_text(twarr):
    """ This function only suits for tweets that are not processed """
    flt_twarr = list()
    for tw in twarr:
        # TODO text_orgn = tw.get(tk.key_text, '').strip()
        text_orgn = tw.get(tk.key_orgntext, tw.get(tk.key_text, '')).strip()
        if not text_orgn:
            continue
        text_norm = pu.text_normalization(text_orgn).strip()
        if pu.is_empty_string(text_norm) or not pu.has_enough_alpha(text_norm, 0.65):
            continue
        tw[tk.key_orgntext] = text_orgn
        tw[tk.key_text] = text_norm
        flt_twarr.append(tw)
    return flt_twarr
Example #8
 def textarr2featurearr_no_gpe(self, textarr):
     """
     Convert the input list of texts into a list of vectors; besides the
     fastText vector, each text's vector is concatenated with the
     sensitive/trigger-word counts returned by self.has_keyword_feature.
     :param textarr: list of str, plain texts
     :return: 2-d np.array whose elements are 1-d np.array vectors
     """
     vecarr = list()
     for text in textarr:
         try:
             ft_vec = self.get_ft_vector(text)
         except:
             text = pu.text_normalization(text)
             ft_vec = self.get_ft_vector(text)
         ft_vec = np.append(ft_vec, self.has_keyword_feature(text))
         vecarr.append(ft_vec)
     return np.array(vecarr)
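The docstring above says has_keyword_feature contributes sensitive/trigger-word counts; a minimal hypothetical stand-in under that assumption (the keyword list is made up):

import numpy as np

def has_keyword_feature(text, keywords=('attack', 'explosion', 'shooting')):
    # Hypothetical helper: one occurrence count per keyword, which
    # textarr2featurearr_no_gpe appends to the fastText vector.
    tokens = text.lower().split()
    return np.array([tokens.count(k) for k in keywords], dtype=float)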
Example #9
def filter_twarr_text(twarr):
    """
    Preprocess the text of every tweet in the input list and discard tweets
    whose preprocessed text is invalid; for each remaining tweet, the
    tk.key_orgntext field keeps the original text and the tk.key_text field
    keeps the preprocessed text.
    :param twarr: list, a list of tweets
    :return: list, the tweets that passed text preprocessing and filtering
    """
    flt_twarr = list()
    for tw in twarr:
        # TODO text_orgn = tw.get(tk.key_text, '').strip()
        text_orgn = tw.get(tk.key_text, '').strip()
        # text_orgn = tw.get(tk.key_orgntext, tw.get(tk.key_text, None)).strip()
        if not text_orgn:
            continue
        text_norm = pu.text_normalization(text_orgn).strip()
        if pu.is_empty_string(text_norm) or not pu.has_enough_alpha(
                text_norm, 0.65):
            continue
        tw[tk.key_orgntext] = text_orgn
        tw[tk.key_text] = text_norm
        flt_twarr.append(tw)
    return flt_twarr
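pu.has_enough_alpha is defined elsewhere; a plausible stand-in, assuming it checks that the ratio of alphabetic characters reaches the given threshold:

def has_enough_alpha(text, threshold):
    # Hypothetical re-implementation: the fraction of alphabetic characters
    # among all non-whitespace characters must reach `threshold`.
    chars = [c for c in text if not c.isspace()]
    if not chars:
        return False
    return sum(c.isalpha() for c in chars) / len(chars) >= threshold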
 def get_features(self, json):
     user = json[tk.key_user]
     if tk.key_description in user and user[tk.key_description] is not None:
         l_profile_description = len(user[tk.key_description])
     else:
         l_profile_description = 0
     FI = user[tk.key_friends_count]
     FE = user[tk.key_followers_count]
     num_tweet_posted = user[tk.key_statuses_count]
     
     tw_time = json[tk.key_created_at]
     user_born_time = json[tk.key_user][tk.key_created_at]
     # TODO some tweets carry malformed time fields (e.g. missing minute/second info) and need to be detected and handled
     tw_d = datetime.datetime.strptime(tw_time, '%a %b %d %H:%M:%S %z %Y')
     user_d = datetime.datetime.strptime(user_born_time, '%a %b %d %H:%M:%S %z %Y')
     time_delta = tw_d - user_d
     AU = time_delta.seconds / 3600.0 + time_delta.days * 24  # account age in hours
     FE_FI_ratio = 0
     if FI != 0:
         FE_FI_ratio = FE / float(FI)
     
     reputation = FE / (FI + FE) if (FI + FE) != 0 else 0
     
     following_rate = FI / AU if AU != 0 else 0
     tweets_per_day = num_tweet_posted / (AU / 24) if AU != 0 else 0
     tweets_per_week = num_tweet_posted / (AU / (24 * 7)) if AU != 0 else 0
     
     user_features = [l_profile_description, FI, FE, num_tweet_posted, AU, FE_FI_ratio,
                      reputation, following_rate, tweets_per_day, tweets_per_week]
     """ content features """
     if tk.key_orgntext not in json:
         orgn_text = json[tk.key_orgntext] = json[tk.key_text]
         json[tk.key_text] = pu.text_normalization(orgn_text)
     text = json[tk.key_text]
     words = text.split()
     num_words = len(words)
     num_character = len(text)
     num_white_space = len(re.findall(r'(\s)', text))
     num_capitalization_word = len(re.findall(r'(\b[A-Z]([a-z])*\b)', text))
     num_capital_per_word = num_capitalization_word / num_words
     
     max_word_length = 0
     mean_word_length = 0
     for word in words:
         if len(word) > max_word_length:
             max_word_length = len(word)
         mean_word_length += len(word)
     mean_word_length /= len(words)
     
     orgn = json[tk.key_orgntext]
     num_exclamation_marks = orgn.count('!')
     num_question_marks = orgn.count('?')
     num_urls = len(json['entities']['urls'])
     num_urls_per_word = num_urls / num_words
     num_hashtags = len(json['entities']['hashtags'])
     num_hashtags_per_word = num_hashtags / num_words
     num_mentions = len(json['entities']['user_mentions'])
     num_mentions_per_word = num_mentions / num_words
     
     substrings = get_all_substrings(text)
     num_spam_words = 0
     for sub in substrings:
         if sub in self.spam_words:
             num_spam_words += 1
     num_spam_words_per_word = num_spam_words / num_words
     content_features = [num_words, num_character, num_white_space, num_capitalization_word,
                         num_capital_per_word, max_word_length, mean_word_length,
                         num_exclamation_marks, num_question_marks, num_urls, num_urls_per_word,
                         num_hashtags, num_hashtags_per_word, num_mentions,
                         num_mentions_per_word, num_spam_words, num_spam_words_per_word]
     sentiment_feature = count_sentiment(text)
     chat_feature = self.gsdmm.get_GSDMM_Feature(json)
     total_features = list()
     total_features.extend(user_features)
     total_features.extend(content_features)
     total_features.append(sentiment_feature)
     total_features.append(chat_feature)
     return total_features
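get_all_substrings and count_sentiment are helpers defined elsewhere; for reference, a straightforward version of get_all_substrings that enumerates every contiguous substring (an assumption about its behavior):

def get_all_substrings(text):
    # Naive enumeration of all contiguous substrings; the spam-word loop in
    # get_features only checks membership, so duplicates are harmless.
    subs = []
    for i in range(len(text)):
        for j in range(i + 1, len(text) + 1):
            subs.append(text[i:j])
    return subs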
 tmu.check_time()
 exit()  # NOTE: nothing below this exit() executes
 
 sub_files = fi.listchildren('/home/nfs/cdong/tw/origin/', fi.TYPE_FILE, concat=True)[18:19]
 twarr = au.merge_array([fu.load_array(file) for file in sub_files])
 print(len(twarr))
 tmu.check_time(print_func=None)
 for idx, tw in enumerate(twarr[14000:15000]):
     if (idx + 1) % 1000 == 0:
         print(idx)
     try:
         my_filter.get_features(tw)
     except:
         # print(tw[tk.key_text])
         # print(tw[tk.key_orgntext])
         print('-', pu.text_normalization(tw[tk.key_orgntext]))
 tmu.check_time(print_func=lambda dt: print('pos filter time elapsed {}s'.format(dt)))
 
 exit()
 
 pos_base = '/home/nfs/cdong/tw/seeding/Terrorist/queried/event_corpus/'
 sub_files = fi.listchildren(pos_base, fi.TYPE_FILE, 'txt$', concat=True)
 pos_twarr = au.merge_array([fu.load_array(file) for file in sub_files])
 print(len(pos_twarr))
 tmu.check_time(print_func=None)
 pos_proba = my_filter.predict_proba(pos_twarr)
 tmu.check_time(print_func=lambda dt: print('pos filter time elapsed {}s'.format(dt)))
 
 neg_files = [
     '/home/nfs/yying/data/crawlTwitter/Crawler1/test.json',
     '/home/nfs/yying/data/crawlTwitter/Crawler2/crawl2.json',
Example #12
def text_normalization(text):
    text = pu.text_normalization(text)
    text = ' '.join(tokenize(text))
    return text
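tokenize here is imported from elsewhere; if it is a plain word tokenizer (an assumption), nltk's word_tokenize would be a drop-in sketch:

from nltk.tokenize import word_tokenize as tokenize  # nltk.download('punkt') may be needed first

print(' '.join(tokenize("RT @user: Breaking news, explosion reported!")))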
Example #13
def filter_text(text):
    return pu.text_normalization(text)
Example #14
    def __init__(self, training=None):
        self.c = ChatFilter()
        self.orgn_predict_label = None
        self.class_dist = None
        self.is_noise_dict = None

        if training is None:
            try:
                with open(chat_filter_file, 'rb') as f:
                    self.c = pickle.load(f)
                with open(orgn_predict_label_file, 'rb') as f:
                    self.orgn_predict_label = pickle.load(f)
                with open(class_dist_file, 'rb') as f:
                    self.class_dist = pickle.load(f)
                with open(is_noise_dict_file, 'rb') as f:
                    self.is_noise_dict = set(pickle.load(f))
            except:
                print('load error')
                traceback.print_exc()
        else:
            # prepare data
            mypath = '../data/'
            onlyfiles = [
                mypath + f for f in listdir(mypath) if isfile(join(mypath, f))
            ]
            print(onlyfiles)
            twarrF = readFilesAsJsonList(onlyfiles)
            twarrT = training
            for idx, tw in enumerate(twarrF):
                twarrF[idx]['label'] = 0

            for idx, tw in enumerate(twarrT):
                twarrT[idx]['label'] = 1
            twarr = list()
            twarr.extend(twarrT)
            twarr.extend(twarrF)
            for tw in twarr:
                tw['text'] = pu.text_normalization(tw['orgn'])

            # train
            self.c = ChatFilter()
            self.c.set_twarr(twarr)
            self.c.set_hyperparams(0.9, 0.01, 55)
            # recommended hyperparameters; the paper uses alpha=0.1 * len(twarr), beta=0.02
            class_dist, orgn_predict_label = self.c.recluster_using_GSDMM()

            try:
                with open(chat_filter_file, 'wb') as f:
                    pickle.dump(self.c, f)
                with open(orgn_predict_label_file, 'wb') as f:
                    pickle.dump(orgn_predict_label, f)
                with open(class_dist_file, 'wb') as f:
                    pickle.dump(class_dist, f)
            except:
                print('save error')
                traceback.print_exc()

            # get isNoiseDict
            label = [tw['label'] for tw in twarr]
            table = pd.DataFrame(index=set(orgn_predict_label),
                                 columns=set(label),
                                 data=0)
            for i in range(len(label)):
                table.loc[orgn_predict_label[i], label[i]] += 1
            print(table)
            multiple_times = 30
            self.is_noise_dict = []
            zero_total = float(table[0].sum())
            one_total = float(table[1].sum())
            for index, row in table.iterrows():
                if row[1] == 0:
                    if row[0] > multiple_times:
                        self.is_noise_dict.append(index)
                    else:
                        continue
                elif (row[0] / zero_total) / (row[1] /
                                              one_total) > multiple_times:
                    self.is_noise_dict.append(index)
            with open(is_noise_dict_file, 'wb') as f:
                pickle.dump(self.is_noise_dict, f)
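The noise-cluster selection above builds a cluster-by-label contingency table and flags clusters heavily skewed toward label 0; the same logic on toy data, for illustration (predictions and labels are made up, and sorted() is used for a deterministic index):

import pandas as pd

orgn_predict_label = [0, 0, 0, 1, 1, 2, 2, 2, 2]  # toy GSDMM cluster id per tweet
label = [0, 0, 0, 1, 1, 0, 0, 0, 1]               # toy chat(0) / event(1) label per tweet

table = pd.DataFrame(index=sorted(set(orgn_predict_label)), columns=sorted(set(label)), data=0)
for cluster, lab in zip(orgn_predict_label, label):
    table.loc[cluster, lab] += 1

multiple_times = 2  # far smaller than the 30 above, just to fit the toy data
zero_total, one_total = float(table[0].sum()), float(table[1].sum())
is_noise_dict = [idx for idx, row in table.iterrows()
                 if (row[1] == 0 and row[0] > multiple_times)
                 or (row[1] > 0 and (row[0] / zero_total) / (row[1] / one_total) > multiple_times)]
print(is_noise_dict)  # cluster 0 is flagged as a noise (chat) cluster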