def cal_text_work(item):
    """Accumulate per-user, per-day hashtag counts for one weibo item.

    Extracts #...# hashtags from item['text'] and merges their counts into
    the redis hash 'hashtag_<day_ts>' under the field str(uid).

    NOTE(review): this block is a damaged scrape fragment. Everything from
    the `item['keywords_string'] = ...` line downward sits inside the bare
    `except` of the redis merge and references names never defined here
    (`keywords_string`, `searchWord`, `DFA`, `now_index_name_date`) — it
    appears mis-pasted from another function. The final `if` at the bottom
    has no body, so the function is syntactically truncated as-is.
    """
    uid = item['uid']
    timestamp = item['timestamp']
    # Truncate the post timestamp to the start of its day — redis keys are per-day.
    date = ts2datetime(timestamp)
    ts = datetime2ts(date)
    text = item['text']
    # Python 2: plain str is bytes — decode to unicode so the CJK regex matches.
    if isinstance(text, str):
        text = text.decode('utf-8', 'ignore')
    # Hashtag pattern: text between a '#' pair, allowing ASCII word chars plus
    # CJK ideograph / radical / compatibility ranges.
    RE = re.compile(u'#([a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+)#', re.UNICODE)
    hashtag_list = RE.findall(text)
    if hashtag_list:
        # all hashtag keys below are unicode
        # Local frequency count for this single item (EAFP-style increment).
        hashtag_dict = dict()
        for hashtag in hashtag_list:
            try:
                hashtag_dict[hashtag] += 1
            except:
                hashtag_dict[hashtag] = 1
        try:
            # Merge into the existing per-day counts in redis. If the field is
            # absent, hget returns None and json.loads raises, dropping us
            # into the except branch below (first write for this uid/day).
            hashtag_count_string = r_cluster.hget('hashtag_'+str(ts), str(uid))
            hashtag_count_dict = json.loads(hashtag_count_string)
            for hashtag in hashtag_dict:
                count = hashtag_dict[hashtag]
                try:
                    hashtag_count_dict[hashtag] += count
                except:
                    hashtag_count_dict[hashtag] = count
            r_cluster.hset('hashtag_'+str(ts), str(uid), json.dumps(hashtag_count_dict))
        except:
            # No prior counts: store this item's counts directly.
            r_cluster.hset('hashtag_'+str(ts), str(uid), json.dumps(hashtag_dict))
            item['keywords_string'] = keywords_string         # use to search  # NOTE(review): keywords_string is undefined here — NameError if this line is reached

            # NOTE(review): from here down looks mis-pasted — it only runs
            # when the hashtag merge above fails, which cannot be intended.
            # DFA-scan the raw utf-8 bytes of the text for sensitive words.
            sensitive_words_dict = searchWord(text.encode('utf-8', 'ignore'), DFA)
            if sensitive_words_dict:
                # '&'-joined word list becomes a searchable string field.
                item['sensitive_words_string'] = "&".join(sensitive_words_dict.keys())
                item['sensitive_words_dict'] = json.dumps(sensitive_words_dict)
            else:
                item['sensitive_words_string'] = ""
                item['sensitive_words_dict'] = json.dumps({})

            # Recompute the per-day timestamp for the sensitive-count redis key.
            timestamp = item['timestamp']
            date = ts2datetime(timestamp)
            ts = datetime2ts(date)
            if sensitive_words_dict:
                print sensitive_words_dict.keys()[0]  # Python 2 debug print
                sensitive_count_string = r_cluster.hget('sensitive_'+str(ts), str(uid))
                if sensitive_count_string: # redis returned prior counts (hget may give None/empty)
                    sensitive_count_dict = json.loads(sensitive_count_string)
                    for word in sensitive_words_dict.keys():
                        if sensitive_count_dict.has_key(word):  # dict.has_key is Python 2 only
                            sensitive_count_dict[word] += sensitive_words_dict[word]
                        else:
                            sensitive_count_dict[word] = sensitive_words_dict[word]
                    r_cluster.hset('sensitive_'+str(ts), str(uid), json.dumps(sensitive_count_dict))
                else:
                    # First sensitive-word write for this uid/day.
                    r_cluster.hset('sensitive_'+str(ts), str(uid), json.dumps(sensitive_words_dict))

            # Identify whether the item should be mapped to a new ES index (by date).
            weibo_timestamp = item['timestamp']
            should_index_name_date = ts2datetime(weibo_timestamp)
            if should_index_name_date != now_index_name_date:  # NOTE(review): body lost in extraction; `now_index_name_date` is undefined here
# Пример #3 ("Example #3" — separator left by the code scraper)
# 0 (stray artifact from the scrape)
            # NOTE(review): orphaned fragment — its enclosing def is not
            # visible in this chunk (`text`, `item`, `uid`, `searchWord`,
            # `DFA`, `r_cluster` come from the missing outer scope). It
            # duplicates the sensitive-word accounting of the block above:
            # DFA-scan the text, store results on `item`, and merge per-day
            # counts into redis under 'sensitive_<day_ts>' keyed by uid.
            sensitive_words_dict = searchWord(text.encode('utf-8', 'ignore'),
                                              DFA)
            if sensitive_words_dict:
                # '&'-joined word list becomes a searchable string field.
                item['sensitive_words_string'] = "&".join(
                    sensitive_words_dict.keys())
                item['sensitive_words_dict'] = json.dumps(sensitive_words_dict)
            else:
                item['sensitive_words_string'] = ""
                item['sensitive_words_dict'] = json.dumps({})

            # Per-day timestamp for the redis key.
            timestamp = item['timestamp']
            date = ts2datetime(timestamp)
            ts = datetime2ts(date)
            if sensitive_words_dict:
                print sensitive_words_dict.keys()[0]  # Python 2 debug print
                sensitive_count_string = r_cluster.hget(
                    'sensitive_' + str(ts), str(uid))
                if sensitive_count_string:  # redis returned prior counts (hget may give None/empty)
                    sensitive_count_dict = json.loads(sensitive_count_string)
                    for word in sensitive_words_dict.keys():
                        # dict.has_key is Python 2 only.
                        if sensitive_count_dict.has_key(word):
                            sensitive_count_dict[word] += sensitive_words_dict[
                                word]
                        else:
                            sensitive_count_dict[word] = sensitive_words_dict[
                                word]
                    r_cluster.hset('sensitive_' + str(ts), str(uid),
                                   json.dumps(sensitive_count_dict))
                else:
                    # First sensitive-word write for this uid/day.
                    r_cluster.hset('sensitive_' + str(ts), str(uid),
                                   json.dumps(sensitive_words_dict))
# Пример #4 ("Example #4" — separator left by the code scraper)
# 0 (stray artifact from the scrape)
         statusnum = 0  # fallback: monthly weibo count missing from the history doc
 else:
     # NOTE(review): orphaned fragment — the `if` matching this `else`, plus
     # `date`, `status`, `user_type`, `influence`, `bci_history_dict`,
     # `sensitive_history_dict`, `top_sensitive`, `sensitive_string`,
     # `results`, and `DAY` are all defined outside this chunk.
     # No profile found: fall back to the uid as the display name.
     uname = uid
     location = ''
     try:
         fansnum = bci_history_dict['fields']['user_fansnum'][0]
     except:
         fansnum = 0
     try:
         statusnum = bci_history_dict['fields']['weibo_month_sum'][0]
     except:
         statusnum = 0
 if status == 'show_in':
     if user_type == "sensitive":
         # Look one day back for this uid's accumulated sensitive-word counts.
         tmp_ts = datetime2ts(date) - DAY
         tmp_data = r_cluster.hget("sensitive_"+str(tmp_ts), uid)
         if tmp_data:
             sensitive_dict = json.loads(tmp_data)
             sensitive_words = sensitive_dict.keys()
         else:
             sensitive_words = []
         if sensitive_history_dict.get('fields',0):
             #print sensitive_history_dict['fields'][sensitive_string][0]
             #print top_sensitive
             # Log-scale the score onto 0-100 relative to the day's top value:
             # log10(x/top * 9 + 1) * 100 gives 100 when x == top, 0 when x == 0.
             sensitive_value = math.log(sensitive_history_dict['fields'][sensitive_string][0]/float(top_sensitive)*9+1, 10)*100
             #print "sensitive_value", sensitive_value
         else:
             sensitive_value = 0
         results.append([uid, uname, location, fansnum, statusnum, influence, sensitive_words, sensitive_value])
     else:
         # Non-sensitive users: row without the sensitive-word columns.
         results.append([uid, uname, location, fansnum, statusnum, influence])