def cal_text_work(item):
    """Extract '#...#' hashtags from a weibo item and accumulate per-user,
    per-day hashtag counts in the redis cluster.

    Parameters:
        item: dict with at least 'uid', 'timestamp' and 'text' keys.
              'text' may be a byte string (Python 2) or unicode.

    Side effects:
        Updates the redis hash 'hashtag_<day_ts>' field <uid> with a JSON
        mapping of hashtag -> count, merging into any previously stored
        counts for that user and day. Returns None.
    """
    uid = item['uid']
    timestamp = item['timestamp']
    date = ts2datetime(timestamp)
    # bucket all counts by the start-of-day timestamp
    ts = datetime2ts(date)
    text = item['text']
    if isinstance(text, str):
        # Python 2 byte string: decode so the unicode regex below can match
        text = text.decode('utf-8', 'ignore')
    # hashtags are delimited by '#...#'; the character class covers ASCII
    # word characters plus CJK unicode ranges
    RE = re.compile(u'#([a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+)#', re.UNICODE)
    hashtag_list = RE.findall(text)
    if not hashtag_list:
        return
    # count occurrences of each hashtag in this single weibo
    # (all hashtags are unicode here)
    hashtag_dict = {}
    for hashtag in hashtag_list:
        hashtag_dict[hashtag] = hashtag_dict.get(hashtag, 0) + 1
    # merge into the counts already stored for this user/day, if any.
    # hget returns None when the field does not exist yet, so test
    # explicitly instead of catching every exception (the old bare
    # except also swallowed genuine redis/JSON errors).
    hashtag_count_string = r_cluster.hget('hashtag_' + str(ts), str(uid))
    if hashtag_count_string:
        hashtag_count_dict = json.loads(hashtag_count_string)
        for hashtag, count in hashtag_dict.items():
            hashtag_count_dict[hashtag] = hashtag_count_dict.get(hashtag, 0) + count
        r_cluster.hset('hashtag_' + str(ts), str(uid),
                       json.dumps(hashtag_count_dict))
    else:
        # first hashtags seen for this user today: store them as-is
        r_cluster.hset('hashtag_' + str(ts), str(uid),
                       json.dumps(hashtag_dict))
item['keywords_string'] = keywords_string # use to search sensitive_words_dict = searchWord(text.encode('utf-8', 'ignore'), DFA) if sensitive_words_dict: item['sensitive_words_string'] = "&".join(sensitive_words_dict.keys()) item['sensitive_words_dict'] = json.dumps(sensitive_words_dict) else: item['sensitive_words_string'] = "" item['sensitive_words_dict'] = json.dumps({}) timestamp = item['timestamp'] date = ts2datetime(timestamp) ts = datetime2ts(date) if sensitive_words_dict: print sensitive_words_dict.keys()[0] sensitive_count_string = r_cluster.hget('sensitive_'+str(ts), str(uid)) if sensitive_count_string: #redis取空 sensitive_count_dict = json.loads(sensitive_count_string) for word in sensitive_words_dict.keys(): if sensitive_count_dict.has_key(word): sensitive_count_dict[word] += sensitive_words_dict[word] else: sensitive_count_dict[word] = sensitive_words_dict[word] r_cluster.hset('sensitive_'+str(ts), str(uid), json.dumps(sensitive_count_dict)) else: r_cluster.hset('sensitive_'+str(ts), str(uid), json.dumps(sensitive_words_dict)) #identify whether to mapping new es weibo_timestamp = item['timestamp'] should_index_name_date = ts2datetime(weibo_timestamp) if should_index_name_date != now_index_name_date:
# Detect sensitive words in the weibo text (DFA-based search) and record
# them both on the item itself and as per-user, per-day counts in redis.
# NOTE(review): this is the interior of a larger function — `text`, `item`,
# `uid`, `DFA` and `r_cluster` are bound by the enclosing scope (not visible
# here); indentation relative to that scope should be confirmed.
sensitive_words_dict = searchWord(text.encode('utf-8', 'ignore'), DFA)
if sensitive_words_dict:
    # store the matched words on the item: '&'-joined string plus the
    # full word->count mapping as JSON
    item['sensitive_words_string'] = "&".join(
        sensitive_words_dict.keys())
    item['sensitive_words_dict'] = json.dumps(sensitive_words_dict)
else:
    item['sensitive_words_string'] = ""
    item['sensitive_words_dict'] = json.dumps({})
timestamp = item['timestamp']
date = ts2datetime(timestamp)
# bucket counts by the start-of-day timestamp
ts = datetime2ts(date)
if sensitive_words_dict:
    print sensitive_words_dict.keys()[0]
    sensitive_count_string = r_cluster.hget(
        'sensitive_' + str(ts), str(uid))
    if sensitive_count_string:  # redis returned a value (field exists)
        # merge this weibo's counts into the stored per-day counts
        sensitive_count_dict = json.loads(sensitive_count_string)
        for word in sensitive_words_dict.keys():
            if sensitive_count_dict.has_key(word):
                sensitive_count_dict[word] += sensitive_words_dict[
                    word]
            else:
                sensitive_count_dict[word] = sensitive_words_dict[
                    word]
        r_cluster.hset('sensitive_' + str(ts), str(uid),
                       json.dumps(sensitive_count_dict))
    else:
        # no counts stored yet for this user/day: store as-is
        r_cluster.hset('sensitive_' + str(ts), str(uid),
                       json.dumps(sensitive_words_dict))
statusnum = 0 else: uname = uid location = '' try: fansnum = bci_history_dict['fields']['user_fansnum'][0] except: fansnum = 0 try: statusnum = bci_history_dict['fields']['weibo_month_sum'][0] except: statusnum = 0 if status == 'show_in': if user_type == "sensitive": tmp_ts = datetime2ts(date) - DAY tmp_data = r_cluster.hget("sensitive_"+str(tmp_ts), uid) if tmp_data: sensitive_dict = json.loads(tmp_data) sensitive_words = sensitive_dict.keys() else: sensitive_words = [] if sensitive_history_dict.get('fields',0): #print sensitive_history_dict['fields'][sensitive_string][0] #print top_sensitive sensitive_value = math.log(sensitive_history_dict['fields'][sensitive_string][0]/float(top_sensitive)*9+1, 10)*100 #print "sensitive_value", sensitive_value else: sensitive_value = 0 results.append([uid, uname, location, fansnum, statusnum, influence, sensitive_words, sensitive_value]) else: results.append([uid, uname, location, fansnum, statusnum, influence])