def createWordTree():
    """Build the byte-level trie ``wordTree`` from the sensitive words
    stored in the Redis hash "sensitive_words".

    Trie layout (as established by the node shape below): ``wordTree`` is a
    256-slot children array.  An interior node is ``[children, flag]`` where
    ``flag`` is 1 when a word ends at that node; a terminating leaf is the
    bare value 1.  Assumes words are byte strings so each character indexes
    0-255 via ``ord`` — TODO confirm encoding of the stored words.
    """
    # hgetall returns a dict whose keys are the sensitive words; iterate the
    # keys directly.  (The old code json.dumps()'ed the dict and then walked
    # the *characters* of the JSON string, building a trie of garbage.)
    sensitive_word_list = r_sensitive.hgetall("sensitive_words")
    awords = list(sensitive_word_list.keys())

    for word in awords:
        temp = wordTree
        word_len = len(word)
        for a in range(word_len):
            index = ord(word[a])
            if a < word_len - 1:
                if temp[index] is None:
                    # No child yet: create a fresh interior node.
                    node = [[None for x in range(256)], 0]
                    temp[index] = node
                elif temp[index] == 1:
                    # A shorter word ends here; promote the leaf marker to an
                    # interior node that keeps the end-of-word flag set.
                    node = [[None for x in range(256)], 1]
                    temp[index] = node
                temp = temp[index][0]
            else:
                # Last byte of the word: mark end-of-word.  If a longer word
                # already created an interior node here, set its flag rather
                # than overwriting it with 1 (the old code clobbered the
                # subtree, losing every longer word sharing this prefix).
                if isinstance(temp[index], list):
                    temp[index][1] = 1
                else:
                    temp[index] = 1
def cal_text_sensitive(item):
    """Count sensitive-word hits for one item and accumulate per-user,
    per-day counters in the Redis hash ``sensitive_<ts>``.

    item: dict with at least 'text', 'uid' and 'timestamp'.
    Side effect: hset on r_cluster; returns nothing.
    """
    text = item['text']
    uid = item['uid']
    timestamp = item['timestamp']
    # Normalize the timestamp to the start of its day.
    date = ts2datetime(timestamp)
    ts = datetime2ts(date)
    if isinstance(text, str):
        # Python 2 byte string: decode so substring tests match the words.
        text = text.decode('utf-8', 'ignore')

    # The sensitive words are the keys of the Redis hash.  (The old code
    # json.dumps()'ed the dict and then tested the JSON string's individual
    # *characters* against the text, so real words were never matched.)
    sensitive_words = r_sensitive.hgetall("sensitive_words").keys()
    sensitive_result = [word for word in sensitive_words if word in text]
    if not sensitive_result:
        return

    # Occurrence count per matched word (each key appears once in the
    # comprehension above, so each count is 1 — preserved from the original).
    sensitive_dict = dict()
    for word in sensitive_result:
        sensitive_dict[word] = sensitive_dict.get(word, 0) + 1

    redis_key = 'sensitive_' + str(ts)
    sensitive_count_string = r_cluster.hget(redis_key, str(uid))
    if sensitive_count_string:
        # Merge today's hits into the user's existing counters; fall back to
        # an empty dict if the stored payload is corrupt (best-effort, as in
        # the original bare-except flow).
        try:
            sensitive_count_dict = json.loads(sensitive_count_string)
        except ValueError:
            sensitive_count_dict = {}
        for word, count in sensitive_dict.items():
            sensitive_count_dict[word] = sensitive_count_dict.get(word, 0) + count
        r_cluster.hset(redis_key, str(uid), json.dumps(sensitive_count_dict))
    else:
        # First sighting of this user today: store the fresh counts.
        r_cluster.hset(redis_key, str(uid), json.dumps(sensitive_dict))