# -*- coding: utf-8 -*-
# Shared setup for the snippets below (Python 2). The redis clients
# (r_cluster, r, r_sensitive), the es client, and helpers such as
# ts2datetime / datetime2ts / ip2geo / ip2city / ip2school come from the
# project's global config modules, which are not shown here.
import re
import csv
import json
import math
import time

import numpy as np


def save_activity(uid, ts, time_segment, sensitive):
    # Bump one 15-minute time-segment counter in the user's per-day activity
    # profile: a Redis hash ('activity_<day_ts>' or 'sensitive_activity_<day_ts>')
    # maps uid to a JSON dict of {time_segment: count}.
    key = str(ts)
    if sensitive:
        hash_name = 'sensitive_activity_' + key
    else:
        hash_name = 'activity_' + key
    try:
        activity_count_dict = json.loads(r_cluster.hget(hash_name, str(uid)))
        try:
            activity_count_dict[str(time_segment)] += 1
        except KeyError:
            activity_count_dict[str(time_segment)] = 1
        r_cluster.hset(hash_name, str(uid), json.dumps(activity_count_dict))
    except (TypeError, ValueError):
        # no record yet for this uid on this day
        r_cluster.hset(hash_name, str(uid), json.dumps({str(time_segment): 1}))
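# Usage sketch for save_activity (assumes r_cluster is the module's redis
# client and 1378512000 is a day-start timestamp; values illustrative only):
#
#   >>> save_activity('12345', 1378512000, 37, False)
#   >>> r_cluster.hget('activity_1378512000', '12345')
#   '{"37": 1}'
#
# i.e. one hash per day, one field per uid, one JSON counter dict per field.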
def cal_text_work(item):
    uid = item['uid']
    timestamp = item['timestamp']
    date = ts2datetime(timestamp)
    ts = datetime2ts(date)
    text = item['text']
    if isinstance(text, str):
        text = text.decode('utf-8', 'ignore')
    RE = re.compile(u'#([a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+)#', re.UNICODE)
    hashtag_list = RE.findall(text)
    if hashtag_list:
        # hashtags are handled as unicode throughout
        hashtag_dict = dict()
        for hashtag in hashtag_list:
            try:
                hashtag_dict[hashtag] += 1
            except KeyError:
                hashtag_dict[hashtag] = 1
        try:
            hashtag_count_string = r_cluster.hget('hashtag_' + str(ts), str(uid))
            hashtag_count_dict = json.loads(hashtag_count_string)
            for hashtag in hashtag_dict:
                count = hashtag_dict[hashtag]
                try:
                    hashtag_count_dict[hashtag] += count
                except KeyError:
                    hashtag_count_dict[hashtag] = count
            r_cluster.hset('hashtag_' + str(ts), str(uid), json.dumps(hashtag_count_dict))
        except (TypeError, ValueError):
            # first hashtags for this uid today
            r_cluster.hset('hashtag_' + str(ts), str(uid), json.dumps(hashtag_dict))
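# Standalone check of the hashtag pattern above (doctest-style sketch; the
# CJK character ranges are unchanged from cal_text_work):
#
#   >>> RE.findall(u'test #tag_one# 和 #话题#')
#   [u'tag_one', u'\u8bdd\u9898']
#
# The pattern requires a closing '#', so a trailing unmatched '#tag' is ignored.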
def save_at(uid, at_uid, timestamp, sensitive):
    # day key as 'YYYYMMDD'
    ts = ts2datetime(timestamp).replace('-', '')
    if sensitive:
        hash_name = 'sensitive_at_' + str(ts)
    else:
        hash_name = 'at_' + str(ts)
    try:
        ruid_count_dict = json.loads(r_cluster.hget(hash_name, str(uid)))
        try:
            ruid_count_dict[str(at_uid)] += 1
        except KeyError:
            ruid_count_dict[str(at_uid)] = 1
        r_cluster.hset(hash_name, str(uid), json.dumps(ruid_count_dict))
    except (TypeError, ValueError):
        r_cluster.hset(hash_name, str(uid), json.dumps({str(at_uid): 1}))
def save_city(uid, ip, timestamp, sensitive):
    # day key as 'YYYYMMDD'
    ts = ts2datetime(timestamp).replace('-', '')
    if sensitive:
        hash_name = 'sensitive_ip_' + str(ts)
    else:
        hash_name = 'ip_' + str(ts)
    try:
        ip_count_dict = json.loads(r_cluster.hget(hash_name, str(uid)))
        try:
            ip_count_dict[str(ip)] += 1
        except KeyError:
            ip_count_dict[str(ip)] = 1
        r_cluster.hset(hash_name, str(uid), json.dumps(ip_count_dict))
    except (TypeError, ValueError):
        r_cluster.hset(hash_name, str(uid), json.dumps({str(ip): 1}))
def cal_text_sensitive(item):
    text = item['text']
    uid = item['uid']
    timestamp = item['timestamp']
    date = ts2datetime(timestamp)
    ts = datetime2ts(date)
    if isinstance(text, str):
        text = text.decode('utf-8', 'ignore')
    # SENSITIVE_WORD is the module-level list of monitored words
    sensitive_result = [word for word in SENSITIVE_WORD if word in text]
    if sensitive_result:
        sensitive_dict = dict()
        for word in sensitive_result:
            try:
                sensitive_dict[word] += 1
            except KeyError:
                sensitive_dict[word] = 1
        try:
            sensitive_count_string = r_cluster.hget('sensitive_' + str(ts), str(uid))
            sensitive_count_dict = json.loads(sensitive_count_string)
            for word in sensitive_dict:
                count = sensitive_dict[word]
                try:
                    sensitive_count_dict[word] += count
                except KeyError:
                    sensitive_count_dict[word] = count
            r_cluster.hset('sensitive_' + str(ts), str(uid), json.dumps(sensitive_count_dict))
        except (TypeError, ValueError):
            r_cluster.hset('sensitive_' + str(ts), str(uid), json.dumps(sensitive_dict))
def cal_class_ratio():
    ratio_results = {}
    date = '2013-09-07'
    ts = datetime2ts(date)
    scan_count = 0
    scan_cursor = 0
    while 1:
        if scan_count == 1000000:
            break
        results = r_cluster.hscan('activity_' + str(ts), scan_cursor, count=1000)
        scan_cursor = results[0]
        scan_count += 1000
        for uid in results[1]:
            activity_dict_string = r_cluster.hget('activity_' + str(ts), uid)
            activity_dict = json.loads(activity_dict_string)
            weibo_count = 0
            for time_seg in activity_dict:
                weibo_count += int(activity_dict[time_seg])
            if weibo_count >= 6:
                indic_3 = '1'
            else:
                indic_3 = '0'
            retweet_results = r.hgetall('retweet_' + str(uid))
            retweet_count = len(retweet_results)
            if retweet_count >= 8:
                indic_1 = '1'
            else:
                indic_1 = '0'
            be_retweet_results = r.hgetall('be_retweet_' + str(uid))
            be_retweet_count = len(be_retweet_results)
            if be_retweet_count >= 9:
                indic_2 = '1'
            else:
                indic_2 = '0'
            key = indic_1 + indic_2 + indic_3
            try:
                ratio_results[key] += 1
            except KeyError:
                ratio_results[key] = 1
            # write the eight user types out to per-class csv files
            '''
            if key == '001':
                writer1.writerow([uid, retweet_count, be_retweet_count, weibo_count])
            elif key == '111':
                writer2.writerow([uid, retweet_count, be_retweet_count, weibo_count])
            elif key == '101':
                writer3.writerow([uid, retweet_count, be_retweet_count, weibo_count])
            elif key == '011':
                writer4.writerow([uid, retweet_count, be_retweet_count, weibo_count])
            elif key == '110':
                writer5.writerow([uid, retweet_count, be_retweet_count, weibo_count])
            elif key == '010':
                writer6.writerow([uid, retweet_count, be_retweet_count, weibo_count])
            '''
        if scan_cursor == 0:
            # full pass over the hash complete
            break
    print 'ratio_results:', ratio_results
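# The class key built above is a 3-bit string read as:
#   indic_1: retweet breadth    (>= 8 distinct users retweeted by uid)
#   indic_2: be-retweet breadth (>= 9 distinct users who retweeted uid)
#   indic_3: activity volume    (>= 6 weibo posted on the day)
# so e.g. '011' marks a user who rarely retweets but is widely retweeted and
# posts actively; ratio_results tallies how many users fall into each key.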
def cal_sensitive_words_work(item, sw_list):
    uid = item['uid']
    # day key as 'YYYYMMDD'
    ts = ts2datetime(item['timestamp']).replace('-', '')
    word_count_dict = {}  # renamed from `map` to avoid shadowing the builtin
    for w in sw_list:
        # each entry of sw_list is a sequence of byte values; rebuild the word
        word = "".join([chr(x) for x in w])
        word = word.decode('utf-8')
        if word not in word_count_dict:
            word_count_dict[word] = 1
        else:
            word_count_dict[word] += 1
    try:
        sensitive_count_string = r_cluster.hget('sensitive_' + str(ts), str(uid))
        sensitive_count_dict = json.loads(sensitive_count_string)
        for word in word_count_dict:
            count = word_count_dict[word]
            if word in sensitive_count_dict:
                sensitive_count_dict[word] += count
            else:
                sensitive_count_dict[word] = count
        r_cluster.hset('sensitive_' + str(ts), str(uid), json.dumps(sensitive_count_dict))
    except (TypeError, ValueError):
        r_cluster.hset('sensitive_' + str(ts), str(uid), json.dumps(word_count_dict))
def cal_ave_weibo():
    date = '2013-09-07'
    timestamp = datetime2ts(date)
    scan_count = 0
    scan_cursor = 0
    all_count = 0
    while 1:
        if scan_count == 1000000:
            break
        results = r_cluster.hscan('activity_' + str(timestamp), scan_cursor, count=1000)
        scan_cursor = results[0]
        scan_count += 1000
        for uid in results[1]:
            activity_dict_string = r_cluster.hget('activity_' + str(timestamp), uid)
            if activity_dict_string:
                activity_dict = json.loads(activity_dict_string)
                weibo_count = 0
                for time_seg in activity_dict:
                    weibo_count += int(activity_dict[time_seg])
                all_count += weibo_count
        if scan_cursor == 0:
            break
    # scan_count approximates the number of users scanned (1000 per hscan page)
    ave_count = float(all_count) / scan_count
    print 'ave_count:', ave_count
def save_activity(uid, ts, time_segment):
    # non-sensitive variant: bump one time-segment counter for the day
    key = str(ts)
    try:
        activity_count_dict = json.loads(r_cluster.hget('activity_' + key, str(uid)))
        try:
            activity_count_dict[str(time_segment)] += 1
        except KeyError:
            activity_count_dict[str(time_segment)] = 1
        r_cluster.hset('activity_' + key, str(uid), json.dumps(activity_count_dict))
    except (TypeError, ValueError):
        r_cluster.hset('activity_' + key, str(uid), json.dumps({str(time_segment): 1}))
def save_city(uid, ip, timestamp):
    date = ts2datetime(timestamp)
    ts = datetime2ts(date)
    try:
        ip_count_dict = json.loads(r_cluster.hget('ip_' + str(ts), str(uid)))
        try:
            ip_count_dict[str(ip)] += 1
        except KeyError:
            ip_count_dict[str(ip)] = 1
        r_cluster.hset('ip_' + str(ts), str(uid), json.dumps(ip_count_dict))
    except (TypeError, ValueError):
        r_cluster.hset('ip_' + str(ts), str(uid), json.dumps({str(ip): 1}))
def save_at(uid, at_uid, timestamp):
    date = ts2datetime(timestamp)
    ts = datetime2ts(date)
    try:
        ruid_count_dict = json.loads(r_cluster.hget('at_' + str(ts), str(uid)))
        try:
            ruid_count_dict[str(at_uid)] += 1
        except KeyError:
            ruid_count_dict[str(at_uid)] = 1
        r_cluster.hset('at_' + str(ts), str(uid), json.dumps(ruid_count_dict))
    except (TypeError, ValueError):
        r_cluster.hset('at_' + str(ts), str(uid), json.dumps({str(at_uid): 1}))
def save_city_timestamp(uid, ip, timestamp):
    date = ts2datetime(timestamp)
    ts = datetime2ts(date)
    try:
        ip_timestamp_string = r_cluster.hget('new_ip_' + str(ts), str(uid))
        ip_timestamp_string_dict = json.loads(ip_timestamp_string)
        try:
            # append this visit to the '&'-joined timestamp string for the ip
            ip_timestamp_string_dict[str(ip)] += '&' + str(timestamp)
        except KeyError:
            ip_timestamp_string_dict[str(ip)] = str(timestamp)
        r_cluster.hset('new_ip_' + str(ts), str(uid), json.dumps(ip_timestamp_string_dict))
    except (TypeError, ValueError):
        r_cluster.hset('new_ip_' + str(ts), str(uid), json.dumps({str(ip): str(timestamp)}))
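# Storage sketch for the 'new_ip_<ts>' hashes written above: each field is a
# uid and each value a JSON dict mapping ip -> '&'-joined visit timestamps, e.g.
#   {"1.2.3.4": "1378512000&1378512300", "5.6.7.8": "1378513000"}
# Readers further down (update_day_geo, get_school) recover per-ip visit
# counts with len(value.split('&')).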
def filter_activity(user_set):
    results = []
    # run_type
    if RUN_TYPE == 1:
        now_date = ts2datetime(time.time())
    else:
        now_date = RUN_TEST_TIME
    ts = datetime2ts(now_date) - DAY
    date = ts2datetime(ts)
    timestamp = datetime2ts(date)
    for user in user_set:
        over_count = 0
        for i in range(0, 7):
            ts = timestamp - DAY * i
            result = r_cluster.hget('activity_' + str(ts), str(user))
            if result:
                items_dict = json.loads(result)
                for item in items_dict:
                    weibo_count = items_dict[item]
                    if weibo_count > activity_threshold:
                        over_count += 1
        if over_count == 0:
            results.append(user)
        else:
            writer.writerow([user, 'activity'])
    return results
def filter_activity(user_set):
    results = []
    now_date = ts2datetime(time.time())
    now_date = '2013-09-08'  # test date
    ts = datetime2ts(now_date) - 24 * 3600
    date = ts2datetime(ts)
    timestamp = datetime2ts(date)
    for user in user_set:
        over_count = 0
        for i in range(0, 7):
            ts = timestamp - 3600 * 24 * i
            result = r_cluster.hget('activity_' + str(ts), str(user))
            if result:
                items_dict = json.loads(result)
                for item in items_dict:
                    weibo_count = items_dict[item]
                    if weibo_count > activity_threshold:
                        over_count += 1
        if over_count == 0:
            results.append(user)
        else:
            writer.writerow([user, 'activity'])
    print 'after filter activity:', len(results)
    return results
def filter_activity(user_set):
    results = []
    now_date = ts2datetime(time.time())
    now_date = '2013-09-08'  # test date
    ts = datetime2ts(now_date) - 24 * 3600
    date = ts2datetime(ts)
    timestamp = datetime2ts(date)
    for user in user_set:
        over_count = 0
        for i in range(0, 7):
            ts = timestamp - 3600 * 24 * i
            result = r_cluster.hget('activity_' + str(ts), str(user))
            if result:
                item_dict = json.loads(result)
                # only the busiest segment of the day needs checking
                sorted_dict = sorted(item_dict.iteritems(), key=lambda asd: asd[1], reverse=True)
                if sorted_dict[0][1] > activity_threshold:
                    over_count = 1
        if over_count == 0:
            results.append(user)
        else:
            writer.writerow([user, 'activity'])
    print 'after filter activity: ', len(results)
    return results
def get_attr_geo_track(uid_list):
    # results like {'2013-09-01': [(geo1, count1), (geo2, count2)], ...} over 7 days
    date_results = []
    now_ts = time.time()
    now_date = ts2datetime(now_ts)
    # test
    now_date = '2013-09-08'
    ts = datetime2ts(now_date)
    for i in range(7, 0, -1):
        timestamp = ts - i * 24 * 3600
        ip_dict = dict()
        results = r_cluster.hmget('ip_' + str(timestamp), uid_list)
        for item in results:
            if item:
                item_dict = json.loads(item)
                for ip_item in item_dict:
                    try:
                        ip_dict[ip_item] += item_dict[ip_item]
                    except KeyError:
                        ip_dict[ip_item] = item_dict[ip_item]
        geo_dict = ip2geo(ip_dict)
        sort_geo_dict = sorted(geo_dict.items(), key=lambda x: x[1], reverse=True)
        date_key = ts2datetime(timestamp)
        date_results.append([date_key, sort_geo_dict[:2]])
    return {'geo_track': json.dumps(date_results)}
def update_day_hashtag(uid_list):
    results = {}
    all_results = {}
    now_ts = time.time()
    # run_type
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = test_ts
    for i in range(WEEK, 0, -1):
        ts = now_date_ts - DAY * i
        count = 0
        hashtag_results = r_cluster.hmget('hashtag_' + str(ts), uid_list)
        for uid in uid_list:
            if uid not in results:
                results[uid] = {}
            hashtag_item = hashtag_results[count]
            count += 1  # keep hmget results aligned with uid_list
            if hashtag_item:
                hashtag_dict = json.loads(hashtag_item)
            else:
                hashtag_dict = {}
            for hashtag in hashtag_dict:
                try:
                    results[uid][hashtag] += 1
                except KeyError:
                    results[uid][hashtag] = 1
    for uid in uid_list:
        user_hashtag_dict = results[uid]
        hashtag_string = '&'.join(user_hashtag_dict.keys())
        all_results[uid] = {
            'hashtag': hashtag_string,
            'hashtag_dict': user_hashtag_dict
        }
    return all_results
def cal_hashtag_work(item, sensitive):
    text = item['text']
    uid = item['uid']
    timestamp = item['timestamp']
    # day key as 'YYYYMMDD'
    ts = ts2datetime(timestamp).replace('-', '')
    if isinstance(text, str):
        text = text.decode('utf-8', 'ignore')
    RE = re.compile(u'#([a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+)#', re.UNICODE)
    hashtag_list = RE.findall(text)
    if hashtag_list:
        hashtag_dict = {}
        for hashtag in hashtag_list:
            try:
                hashtag_dict[hashtag] += 1
            except KeyError:
                hashtag_dict[hashtag] = 1
        if sensitive:
            hash_name = 'sensitive_hashtag_' + str(ts)
        else:
            hash_name = 'hashtag_' + str(ts)
        try:
            hashtag_count_dict = json.loads(r_cluster.hget(hash_name, str(uid)))
            for hashtag in hashtag_dict:
                count = hashtag_dict[hashtag]
                try:
                    hashtag_count_dict[hashtag] += count
                except KeyError:
                    hashtag_count_dict[hashtag] = count
            r_cluster.hset(hash_name, str(uid), json.dumps(hashtag_count_dict))
        except (TypeError, ValueError):
            r_cluster.hset(hash_name, str(uid), json.dumps(hashtag_dict))
def update_day_geo(uid_list, user_info_list):
    results = {}
    now_ts = time.time()
    # run_type
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = test_ts
    ip_results = r_cluster.hmget('new_ip_' + str(now_date_ts - DAY), uid_list)
    count = 0
    for uid in uid_list:
        if uid not in results:
            results[uid] = {'activity_geo': {}, 'activity_geo_dict': []}
        uid_ip_results = ip_results[count]
        count += 1
        if uid_ip_results:
            uid_ip_dict = json.loads(uid_ip_results)
        else:
            uid_ip_dict = {}
        day_results = {}
        for ip in uid_ip_dict:
            # the stored value is a '&'-joined timestamp string per ip
            ip_count = len(uid_ip_dict[ip].split('&'))
            geo = ip2city(ip)
            geo = geo.decode('utf-8')
            try:
                day_results[geo] += ip_count
            except KeyError:
                day_results[geo] = ip_count
        # update the activity_geo_dict (keep at most 30 days)
        activity_geo_history_list = json.loads(user_info_list[uid]['activity_geo_dict'])
        activity_geo_history_list.append(day_results)
        results[uid]['activity_geo_dict'] = json.dumps(activity_geo_history_list[-30:])
        # update the activity_geo from the last 7 days
        week_activity_geo_list = activity_geo_history_list[-7:]
        week_geo_list = []
        for activity_geo_item in week_activity_geo_list:
            week_geo_list.extend(activity_geo_item.keys())
        week_geo_list = list(set(week_geo_list))
        week_geo_string = '&'.join(['&'.join((item.encode('utf-8')).split('\t'))
                                    for item in week_geo_list])
        try:
            week_geo_aggs_string = '&'.join([(item.encode('utf-8')).split('\t')[-1]
                                             for item in week_geo_list])
        except:
            week_geo_aggs_string = ''
        results[uid]['activity_geo'] = week_geo_string
        results[uid]['activity_geo_aggs'] = week_geo_aggs_string
    return results
def cal_core_class():
    date = '2013-09-07'
    timestamp = datetime2ts(date)
    f_r = open('/home/ubuntu8/huxiaoqian/user_portrait/user_portrait/cron/recommentation_in/core_list.csv', 'rb')
    reader = csv.reader(f_r)
    f_w = open('/home/ubuntu8/huxiaoqian/user_portrait/user_portrait/cron/recommentation_in/core_class.csv', 'wb')
    writer = csv.writer(f_w)
    result_list = []
    count011 = 0
    for line in reader:
        uid = line[0]
        retweet_results = r.hgetall('retweet_' + str(uid))
        retweet_count = len(retweet_results)
        be_retweet_results = r.hgetall('be_retweet_' + str(uid))
        be_retweet_count = len(be_retweet_results)
        weibo_count = 0
        for i in range(0, 7):
            ts = timestamp - 24 * 3600 * i
            activity_string = r_cluster.hget('activity_' + str(ts), str(uid))
            if activity_string:
                activity_dict = json.loads(activity_string)
            else:
                activity_dict = {}
            for time_seg in activity_dict:
                weibo_count += activity_dict[time_seg]
        ave_weibo_count = float(weibo_count) / 7
        if retweet_count >= 8:
            indic_1 = '1'
        else:
            indic_1 = '0'
        if be_retweet_count >= 9:
            indic_2 = '1'
        else:
            indic_2 = '0'
        if ave_weibo_count >= 6:
            indic_3 = '1'
        else:
            indic_3 = '0'
        key = indic_1 + indic_2 + indic_3
        if key == '011':
            count011 += 1
        result_list.append([uid, key, retweet_count, be_retweet_count, ave_weibo_count])
    f_r.close()
    sort_result = sorted(result_list, key=lambda x: x[3], reverse=True)
    for item in sort_result:
        writer.writerow(list(item))
    f_w.close()
    print 'count011:', count011
def main():
    now_ts = time.time()
    delete_ts = datetime2ts(ts2datetime(now_ts - EXPIRE_TIME))  # day timestamp to expire
    delete_date = ts2datetime(now_ts - EXPIRE_TIME)
    # delete @ records
    r_cluster.delete('at_' + str(delete_ts))
    # delete ip records
    r_cluster.delete('new_ip_' + str(delete_ts))
    # delete activity records
    r_cluster.delete('activity_' + str(delete_ts))
    # delete hashtag records
    r_cluster.delete('hashtag_' + str(delete_ts))
    # delete sensitive-word records
    r_cluster.delete('sensitive_' + str(delete_ts))
    # delete recommendation records
    r.delete('recomment_' + str(delete_date))
def get_activity_time(uid_list):
    results = {}
    now_ts = time.time()
    now_date = ts2datetime(now_ts)
    # run_type
    if RUN_TYPE == 1:
        timestamp = datetime2ts(now_date)
    else:
        timestamp = datetime2ts(RUN_TEST_TIME)
    activity_list_dict = {}  # {uid: [count per 15-minute segment], ...}
    for i in range(1, WEEK + 1):
        ts = timestamp - DAY * i
        r_result = r_cluster.hmget('activity_' + str(ts), uid_list)
        if r_result:
            for j in range(0, len(uid_list)):
                uid = uid_list[j]
                if uid not in activity_list_dict:
                    activity_list_dict[uid] = []
                user_r_result = r_result[j]
                if user_r_result:
                    user_activity_dict = json.loads(user_r_result)
                else:
                    user_activity_dict = {}
                # append the day's 96 segments in order, so days without
                # data stay aligned as zeros
                for seg in range(0, 96):
                    try:
                        count = user_activity_dict[str(seg)]
                    except KeyError:
                        count = 0
                    activity_list_dict[uid].append(count)
    for uid in uid_list:
        activity_list = activity_list_dict[uid]
        statusnum = sum(activity_list)
        signal = np.array(activity_list)
        fftResult = np.abs(np.fft.fft(signal)) ** 2
        freq = np.fft.fftfreq(signal.size, d=1)
        max_val = 0
        max_freq = 0
        for i, val in enumerate(fftResult):
            # keep the strongest strictly positive frequency
            if val > max_val and freq[i] > 0:
                max_val = val
                max_freq = freq[i]
        results[uid] = {
            'statusnum': statusnum,
            'activity_time': math.log(max_freq + 1)
        }
    return results
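# Minimal sketch of the dominant-frequency step above on a synthetic weekly
# signal (assumes numpy as np, as in get_activity_time; values illustrative):
#
#   signal = np.array(([0] * 32 + [5] * 32 + [0] * 32) * 7)  # active midday, 7 days
#   power = np.abs(np.fft.fft(signal)) ** 2
#   freq = np.fft.fftfreq(signal.size, d=1)
#   pos = freq > 0
#   max_freq = freq[pos][np.argmax(power[pos])]
#   # max_freq == 1.0 / 96: one cycle per 96-segment day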
def update_day_geo(uid_list, user_info_list):
    results = {}
    now_ts = time.time()
    # run_type
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = test_ts
    ip_results = r_cluster.hmget('new_ip_' + str(now_date_ts - DAY), uid_list)
    count = 0
    for uid in uid_list:
        if uid not in results:
            results[uid] = {'activity_geo': {}, 'activity_geo_dict': []}
        uid_ip_results = ip_results[count]
        count += 1
        if uid_ip_results:
            uid_ip_dict = json.loads(uid_ip_results)
        else:
            uid_ip_dict = {}
        day_results = {}
        for ip in uid_ip_dict:
            ip_count = len(uid_ip_dict[ip].split('&'))
            geo, school = ip2city(ip)
            if geo:
                geo = geo.decode('utf-8')
                try:
                    day_results[geo] += ip_count
                except KeyError:
                    day_results[geo] = ip_count
        # update the activity_geo_dict (keep at most 30 days)
        activity_geo_history_list = json.loads(user_info_list[uid]['activity_geo_dict'])
        activity_geo_history_list.append(day_results)
        results[uid]['activity_geo_dict'] = json.dumps(activity_geo_history_list[-30:])
        # update the activity_geo from the last 7 days
        week_activity_geo_list = activity_geo_history_list[-7:]
        week_geo_list = []
        for activity_geo_item in week_activity_geo_list:
            week_geo_list.extend(activity_geo_item.keys())
        week_geo_list = list(set(week_geo_list))
        week_geo_string = '&'.join(['&'.join(item.split('\t')) for item in week_geo_list])
        try:
            week_geo_aggs_string = '&'.join([item.split('\t')[-1] for item in week_geo_list])
        except:
            week_geo_aggs_string = ''
        results[uid]['activity_geo'] = week_geo_string
        results[uid]['activity_geo_aggs'] = week_geo_aggs_string
    return results
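# Assumed shape of the geo strings handled above (illustrative only): ip2city
# is taken to return a '\t'-separated location such as u'中国\t江苏\t南京',
# so activity_geo becomes '中国&江苏&南京' and activity_geo_aggs keeps only
# the finest component ('南京') via split('\t')[-1].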
def get_user_at():
    # step1: get uid list
    uid_list = get_uid_list()
    date = ts2datetime(time.time())
    ts = datetime2ts(date)
    f = open('/home/user_portrait_0320/revised_user_portrait/user_portrait/user_portrait/attribute/uid_at.txt', 'w')
    # step2: dump each user's @ records for the previous 7 days
    for i in range(1, 8):
        ts = ts - DAY
        for uid in uid_list:
            result_string = r_cluster.hget('at_' + str(ts), uid)
            if result_string:
                save_dict = {'ts': ts, 'result': result_string}
                f.write('%s\n' % json.dumps(save_dict))
    f.close()
def get_activeness(uid, activity_geo):
    result = 0
    now_ts = time.time()
    now_date = ts2datetime(now_ts)
    timestamp = datetime2ts(now_date)
    # test
    # timestamp = datetime2ts('2013-09-08')
    # activity_time: fft over the week's 15-minute activity segments
    activity_list = []
    for day in range(1, 8):
        ts = timestamp - 24 * 3600 * day
        r_result = r_cluster.hget('activity_' + str(ts), uid)
        if r_result:
            r_result = json.loads(r_result)
            for seg in range(0, 96):
                try:
                    count = r_result[str(seg)]
                except KeyError:
                    count = 0
                activity_list.append(float(count))
    statusnum = sum(activity_list)
    signal = np.array(activity_list)
    fftResult = np.abs(np.fft.fft(signal)) ** 2
    freq = np.fft.fftfreq(signal.size, d=1)
    max_val = 0
    max_freq = 0
    for i, val in enumerate(fftResult):
        if val > max_val and freq[i] > 0:
            max_val = val
            max_freq = freq[i]
    # activity_geo arrives as a 'geo&geo' string
    activity_geo_count = len(activity_geo.split('&'))
    result = activeness_weight_dict['activity_time'] * math.log(max_freq + 1) + \
        activeness_weight_dict['activity_geo'] * math.log(activity_geo_count + 1) + \
        activeness_weight_dict['statusnum'] * math.log(statusnum + 1)
    return result
def get_activity_geo(uid):
    ip_result = []
    now_ts = time.time()
    now_date = ts2datetime(now_ts)
    ts = datetime2ts(now_date)
    # test
    ts = datetime2ts('2013-09-08')
    for i in range(1, 8):
        ts = ts - 24 * 3600
        r_result = r_cluster.hget('ip_' + str(ts), uid)
        if r_result:
            ip_list = json.loads(r_result).keys()
            ip_result.extend(ip_list)
    ip_set = set(ip_result)
    geo_string = '&'.join(ip2geo(ip_set))
    return geo_string
def attr_hash(uid):
    hashtag_results = {}
    now_ts = time.time()
    # test
    now_ts = datetime2ts('2013-09-08')
    date = ts2datetime(now_ts)
    ts = datetime2ts(date)
    for i in range(1, 8):
        ts = ts - 24 * 3600
        result_string = r_cluster.hget('hashtag_' + str(ts), str(uid))
        if result_string:
            result_dict = json.loads(result_string)
            for hashtag in result_dict:
                count = result_dict[hashtag]
                try:
                    hashtag_results[hashtag] += count
                except KeyError:
                    hashtag_results[hashtag] = count
    return hashtag_results
def filter_mention(user_set):
    results = []
    now_date = ts2datetime(time.time())
    now_date = '2013-09-08'  # test date
    timestamp = datetime2ts(now_date) - 24 * 3600
    for user in user_set:
        mention_set = set()
        for i in range(0, 7):
            ts = timestamp - 3600 * 24 * i
            result = r_cluster.hget('at_' + str(ts), str(user))
            if result:
                item_dict = json.loads(result)
                for at_user in item_dict:
                    mention_set.add(at_user)
        at_count = len(mention_set)  # distinct users mentioned this week
        if at_count < mention_threshold:
            results.append(user)
        else:
            writer.writerow([user, 'mention'])
    print 'after filter mention: ', len(results)
    return results
def get_school(uid_list):
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    school_results = {}
    for i in range(WEEK, 0, -1):
        ts = now_date_ts - DAY * i
        ip_results = r_cluster.hmget('new_ip_' + str(ts), uid_list)
        count = 0
        for uid in uid_list:
            if uid not in school_results:
                school_results[uid] = {}
            ip_item = ip_results[count]
            if ip_item:
                uid_ip_dict = json.loads(ip_item)
            else:
                uid_ip_dict = {}
            for ip in uid_ip_dict:
                ip_count = len(uid_ip_dict[ip].split('&'))
                school = ip2school(ip)
                if school:
                    try:
                        school_results[uid][school] += ip_count
                    except KeyError:
                        school_results[uid][school] = ip_count
            count += 1
    results = {}
    for uid in uid_list:
        school_dict = school_results[uid]
        school_string = '&'.join(school_dict.keys())
        if school_dict != {}:
            is_school = '1'
        else:
            is_school = '0'
        results[uid] = {
            'is_school': is_school,
            'school_string': school_string,
            'school_dict': json.dumps(school_dict)
        }
    return results
def get_attr_trend(uid_list):
    result = {}
    now_ts = time.time()
    date = ts2datetime(now_ts - 24 * 3600)
    timestamp = datetime2ts(date)
    # test
    timestamp = datetime2ts('2013-09-08')
    time_result = dict()
    segment_result = dict()
    for i in range(1, 8):
        ts = timestamp - i * 24 * 3600
        r_result = r_cluster.hmget('activity_' + str(ts), uid_list)
        for item in r_result:
            if item:
                item = json.loads(item)
                for segment in item:
                    # map a 15-minute segment to the second offset of its
                    # 4-hour bucket (Python 2 integer division)
                    bucket = int(segment) / 16 * 15 * 60 * 16
                    try:
                        time_result[bucket + ts] += item[segment]
                    except KeyError:
                        time_result[bucket + ts] = item[segment]
                    try:
                        segment_result[bucket] += item[segment]
                    except KeyError:
                        segment_result[bucket] = item[segment]
    trend_list = []
    for i in range(1, 8):
        ts = timestamp - i * 24 * 3600
        for j in range(0, 6):
            time_seg = ts + j * 15 * 60 * 16
            if time_seg in time_result:
                trend_list.append((time_seg, time_result[time_seg]))
            else:
                trend_list.append((time_seg, 0))
    sort_trend_list = sorted(trend_list, key=lambda x: x[0], reverse=False)
    result['activity_trend'] = json.dumps(sort_trend_list)
    result['activity_time'] = json.dumps(segment_result)
    return result
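# Bucket arithmetic used above (Python 2 integer division): a day holds 96
# fifteen-minute segments; int(segment) / 16 selects one of six 4-hour
# buckets, and * (15 * 60 * 16) = * 14400 converts the bucket index into a
# second offset, e.g. segment '37' -> 37 / 16 = 2 -> 28800 s, the third
# (08:00-12:00) bucket; the j loop then walks those six buckets per day.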
def update_day_sensitive(uid_list):
    results = {}
    all_results = {}
    now_ts = time.time()
    # run_type
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = test_ts
    for i in range(WEEK, 0, -1):
        ts = now_date_ts - DAY * i
        count = 0
        sensitive_results = r_cluster.hmget('sensitive_' + str(ts), uid_list)
        for uid in uid_list:
            if uid not in results:
                results[uid] = {}
            sensitive_item = sensitive_results[count]
            count += 1  # keep hmget results aligned with uid_list
            if sensitive_item:
                sensitive_dict = json.loads(sensitive_item)
            else:
                sensitive_dict = {}
            for sensitive in sensitive_dict:
                try:
                    results[uid][sensitive] += 1
                except KeyError:
                    results[uid][sensitive] = 1
    for uid in uid_list:
        user_sensitive_dict = results[uid]
        sensitive_score = 0
        for k, v in user_sensitive_dict.iteritems():
            tmp_stage = r_sensitive.hget('sensitive_words', k)
            if tmp_stage:
                sensitive_score += v * sensitive_score_dict[str(tmp_stage)]
        sensitive_string = '&'.join(user_sensitive_dict.keys())
        all_results[uid] = {
            'sensitive_string': sensitive_string,
            'sensitive_dict': user_sensitive_dict,
            'sensitive': sensitive_score
        }
    return all_results
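# Scoring sketch for the loop above: tmp_stage is the severity stage stored
# in the 'sensitive_words' hash, and sensitive_score_dict (defined elsewhere;
# the mapping below is hypothetical) converts it to a weight:
#
#   sensitive_score_dict = {'1': 1, '2': 2, '3': 3}   # hypothetical values
#   # a user seen with {u'word_a': 2} where word_a is stage '3'
#   # contributes 2 * 3 = 6 to sensitive_score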
def filter_ip(user_set):
    results = []
    now_date = ts2datetime(time.time())
    now_date = '2013-09-08'  # test date
    ts = datetime2ts(now_date) - 24 * 3600
    for user in user_set:
        ip_set = set()
        for i in range(0, 7):
            timestamp = ts - 3600 * 24 * i
            ip_result = r_cluster.hget('ip_' + str(timestamp), str(user))
            if ip_result:
                result_dict = json.loads(ip_result)
            else:
                result_dict = {}
            for ip in result_dict:
                ip_set.add(ip)
        if len(ip_set) < ip_threshold:
            results.append(user)
        else:
            writer.writerow([user, 'ip'])
    print 'after filter ip: ', len(results)
    return results
def filter_ip(user_set):
    results = []
    # run_type
    if RUN_TYPE == 1:
        now_date = ts2datetime(time.time())
    else:
        now_date = RUN_TEST_TIME
    ts = datetime2ts(now_date) - DAY
    for user in user_set:
        ip_set = set()
        for i in range(0, 7):
            timestamp = ts - DAY * i
            ip_result = r_cluster.hget('ip_' + str(timestamp), str(user))
            if ip_result:
                result_dict = json.loads(ip_result)
            else:
                result_dict = {}
            for ip in result_dict:
                ip_set.add(ip)
        if len(ip_set) < ip_threshold:
            results.append(user)
        else:
            writer.writerow([user, 'ip'])
    return results
def filter_mention(user_set):
    results = []
    # run_type
    if RUN_TYPE == 1:
        now_date = ts2datetime(time.time())
    else:
        now_date = RUN_TEST_TIME
    timestamp = datetime2ts(now_date) - DAY
    for user in user_set:
        mention_set = set()
        for i in range(0, 7):
            ts = timestamp - DAY * i
            result = r_cluster.hget('at_' + str(ts), str(user))
            if result:
                item_dict = json.loads(result)
                for at_user in item_dict:
                    mention_set.add(at_user)
        at_count = len(mention_set)
        if at_count < mention_threshold:
            results.append(user)
        else:
            writer.writerow([user, 'mention'])
    return results
# Excerpt of a flow-text processing loop (assumes `item`, `text`, `uid` and
# the DFA word tree from the surrounding scan, as in test() below):
sensitive_words_dict = searchWord(text.encode('utf-8', 'ignore'), DFA)
if sensitive_words_dict:
    item['sensitive_words_string'] = "&".join(sensitive_words_dict.keys())
    item['sensitive_words_dict'] = json.dumps(sensitive_words_dict)
else:
    item['sensitive_words_string'] = ""
    item['sensitive_words_dict'] = json.dumps({})
timestamp = item['timestamp']
date = ts2datetime(timestamp)
ts = datetime2ts(date)
if sensitive_words_dict:
    sensitive_count_string = r_cluster.hget('sensitive_' + str(ts), str(uid))
    if sensitive_count_string:  # redis may return empty
        sensitive_count_dict = json.loads(sensitive_count_string)
        for word in sensitive_words_dict.keys():
            if word in sensitive_count_dict:
                sensitive_count_dict[word] += sensitive_words_dict[word]
            else:
                sensitive_count_dict[word] = sensitive_words_dict[word]
        r_cluster.hset('sensitive_' + str(ts), str(uid),
                       json.dumps(sensitive_count_dict))
    else:
        r_cluster.hset('sensitive_' + str(ts), str(uid),
                       json.dumps(sensitive_words_dict))
def test(ft_type):
    print ft_type
    if ft_type == 'facebook':
        index_name_pre = facebook_flow_text_index_name_pre
        index_type = facebook_flow_text_index_type
        user_index_name = facebook_user_index_name
        user_index_type = facebook_user_index_type
    else:
        index_name_pre = twitter_flow_text_index_name_pre
        index_type = twitter_flow_text_index_type
        user_index_name = twitter_user_index_name
        user_index_type = twitter_user_index_type
    # date_list = load_date_list(True)
    date_list = load_date_list()
    DFA = createWordTree()
    query_body = {
        'post_filter': {
            'missing': {
                'field': 'keywords_string'
            }
        },
        'query': {
            'filtered': {
                'filter': {
                    'bool': {
                        'must': [{
                            'range': {
                                'flag_ch': {
                                    'gte': -1
                                }
                            }
                        }]
                    }
                }
            }
        }
    }
    for date in date_list:
        count = 0
        bulk_action = []
        index_name = index_name_pre + date
        try:
            es_scan_results = scan(es, query=query_body, size=1000,
                                   index=index_name, doc_type=index_type)
            while True:
                try:
                    scan_data = es_scan_results.next()
                    item = scan_data['_source']
                    text = item['text_ch']
                    uid = item['uid']
                    if ft_type == 'facebook':
                        _id = item['fid']
                    else:
                        _id = item['tid']
                    ts = datetime2ts(date)
                    # add sentiment field to weibo
                    sentiment, keywords_list = triple_classifier(item)
                    # add key words to weibo
                    keywords_dict, keywords_string = get_weibo_keywords(keywords_list)
                    # sensitive_words_dict
                    sensitive_words_dict = searchWord(text.encode('utf-8', 'ignore'), DFA)
                    if sensitive_words_dict:
                        sensitive_words_string_data = "&".join(sensitive_words_dict.keys())
                        sensitive_words_dict_data = json.dumps(sensitive_words_dict)
                    else:
                        sensitive_words_string_data = ""
                        sensitive_words_dict_data = json.dumps({})
                    # accumulate per-user sensitive-word counts in redis
                    if sensitive_words_dict:
                        sensitive_count_string = r_cluster.hget('sensitive_' + str(ts), str(uid))
                        if sensitive_count_string:  # redis may return empty
                            sensitive_count_dict = json.loads(sensitive_count_string)
                            for word in sensitive_words_dict.keys():
                                if word in sensitive_count_dict:
                                    sensitive_count_dict[word] += sensitive_words_dict[word]
                                else:
                                    sensitive_count_dict[word] = sensitive_words_dict[word]
                            r_cluster.hset('sensitive_' + str(ts), str(uid),
                                           json.dumps(sensitive_count_dict))
                        else:
                            r_cluster.hset('sensitive_' + str(ts), str(uid),
                                           json.dumps(sensitive_words_dict))
                    # sensitive score
                    sensitive_score = 0
                    if sensitive_words_dict:
                        for k, v in sensitive_words_dict.iteritems():
                            tmp_stage = r_sensitive.hget("sensitive_words", k)
                            if tmp_stage:
                                sensitive_score += v * sensitive_score_dict[str(tmp_stage)]
                    # directed_uid
                    directed_uid_data = 0
                    directed_uid, directed_uname = get_root_retweet(text, uid, ft_type)
                    if directed_uid:
                        directed_uid_data = long(directed_uid)
                    # hashtag
                    hashtag = ''
                    RE = re.compile(u'#([0-9a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+)[ ," =.。: :、]')
                    hashtag_list = re.findall(RE, text)
                    if hashtag_list:
                        hashtag = '&'.join(hashtag_list)
                    # bulk update action for this document
                    action = {'update': {'_id': _id}}
                    action_data = {
                        'sentiment': str(sentiment),
                        'keywords_dict': json.dumps(keywords_dict),
                        'keywords_string': keywords_string,
                        'sensitive_words_string': sensitive_words_string_data,
                        'sensitive_words_dict': sensitive_words_dict_data,
                        'sensitive': sensitive_score,
                        'directed_uid': directed_uid_data,
                        'directed_uname': directed_uname,
                        'hashtag': hashtag,
                    }
                    bulk_action.extend([action, {'doc': action_data}])
                    count += 1
                    if count % 1000 == 0 and count != 0:
                        if bulk_action:
                            es.bulk(bulk_action, index=index_name,
                                    doc_type=index_type, timeout=600)
                        bulk_action = []
                        count = 0
                except StopIteration:
                    break
            if bulk_action:
                es.bulk(bulk_action, index=index_name,
                        doc_type=index_type, timeout=600)
        except Exception, e:
            # the ES index/document may not exist
            print e