def get_msg_type_aggs(data_dict, date):
    """Aggregate per-user message behavior for one day and bulk-insert it.

    Parameters:
        data_dict: dict mapping uid -> list of message dicts for that user
                   (each list is turned into a DataFrame for get_msg_aggs).
        date: day to aggregate, as a "YYYY-MM-DD" string.

    Side effects: inserts one row per uid into the UserBehavior table via
    sql_insert_many using the module-level cursor.
    """
    # Unix timestamps for the first and last second of the day.
    day_start = int(time.mktime(
        time.strptime(date + " 00:00:00", "%Y-%m-%d %H:%M:%S")))
    day_end = int(time.mktime(
        time.strptime(date + " 23:59:59", "%Y-%m-%d %H:%M:%S")))
    user_behavior_dict = {}
    for uid, mid_dict_list in data_dict.items():
        df = DataFrame(mid_dict_list)
        behavior_dict = get_msg_aggs(df)
        # Count of this user's sensitive messages within the day.
        # NOTE(review): timestamp__lt=day_end excludes the final second
        # (23:59:59) itself — kept as-is to preserve existing behavior.
        sensitivenum = Information.objects.filter(
            uid=uid,
            timestamp__gte=day_start,
            timestamp__lt=day_end).count()
        behavior_dict["sensitivenum"] = sensitivenum
        behavior_dict["timestamp"] = day_start
        behavior_dict["uid"] = uid
        behavior_dict["store_date"] = date
        user_behavior_dict["%s_%s" % (str(day_start), uid)] = behavior_dict
    sql_insert_many(cursor, "UserBehavior", "ub_id", user_behavior_dict)
def get_user_domain(word_dict, date):
    """Compute each user's domain probabilities and bulk-insert them.

    Parameters:
        word_dict: dict mapping uid -> word data (consumed by get_p).
        date: object with a .timestamp() method (NOTE(review): siblings such
              as get_msg_type_aggs take a "YYYY-MM-DD" string instead —
              confirm what the caller passes here).

    Side effects: inserts one row per uid into UserDomain via sql_insert_many.
    """
    domain_dict = domain_tfidf()
    user_domain = {}
    ts = date.timestamp()
    # Per-user list of (domain, probability) pairs, best match first.
    domain_p = get_p(domain_dict, word_dict)
    for uid in word_dict:
        ranking = domain_p[uid]
        # Top-ranked entry is the user's main domain; "other" when no
        # domain matched at all.
        main_domain = ranking[0][0] if ranking else "other"
        user_domain["%s_%s" % (str(ts), uid)] = {
            "uid": uid,
            "timestamp": ts,
            "main_domain": main_domain,
            "domains": json.dumps(ranking),
            "store_date": date,
        }
    sql_insert_many(cursor, "UserDomain", "ud_id", user_domain)
def get_user_activity_aggs(data_dict, date):
    """Aggregate per-user, per-geo activity for one day and bulk-insert it.

    Parameters:
        data_dict: dict mapping uid -> list of message dicts; each message
                   dict must carry a "geo" field.
        date: day to aggregate, as a "YYYY-MM-DD" string.

    Side effects: inserts one row per (uid, geo) pair into UserActivity via
    sql_insert_many using the module-level cursor.
    """
    fmt = '%Y-%m-%d %H:%M:%S'
    end_time = datetime.datetime.strptime(date + " 23:59:59", fmt).timestamp()
    # BUG FIX: the old lower bound was end_time - 24*60*60, i.e. 23:59:59 of
    # the *previous* day, so the filter window was shifted by a day's last
    # second.  Use this day's own midnight, matching the [00:00:00, 23:59:59)
    # convention used by get_msg_type_aggs.
    start_time = datetime.datetime.strptime(date + " 00:00:00", fmt).timestamp()
    user_activity_dict = {}
    for uid, mid_dict_list in data_dict.items():
        df = DataFrame(mid_dict_list)
        # Message count per geo location for this user.
        geo_dict = df.groupby([df["geo"]]).size().to_dict()
        for geo, statusnum in geo_dict.items():
            # Sensitive-message count for this user/geo within the day.
            sensitivenum = Information.objects.filter(
                uid=uid,
                timestamp__gte=start_time,
                timestamp__lt=end_time,
                geo=geo).count()
            user_activity_dict["%s_%s_%s" % (str(end_time), uid, geo)] = {
                "uid": uid,
                "timestamp": end_time,
                "geo": geo,
                "send_ip": None,  # no IP info yet — TODO fill in later
                "statusnum": statusnum,
                "sensitivenum": sensitivenum,
                "store_date": date,
            }
    sql_insert_many(cursor, "UserActivity", "ua_id", user_activity_dict)
def get_user_topic(word_dict, date):
    """Compute each user's topic probabilities and bulk-insert them.

    Parameters:
        word_dict: dict mapping uid -> word data (consumed by get_p).
        date: object with a .timestamp() method (NOTE(review): siblings such
              as get_msg_type_aggs take a "YYYY-MM-DD" string instead —
              confirm what the caller passes here).

    Side effects: inserts one row per uid into UserTopic via sql_insert_many.
    """
    topic_dict = topic_tfidf()
    ts = date.timestamp()
    user_topic = {}
    # Per-user list of (topic, probability) pairs.
    topic_p = get_p(topic_dict, word_dict)
    for uid in word_dict:
        user_topic["%s_%s" % (str(ts), uid)] = {
            "uid": uid,
            "timestamp": ts,
            "topics": json.dumps(topic_p[uid]),
            "store_date": date,
        }
    sql_insert_many(cursor, "UserTopic", "ut_id", user_topic)
# Hashtag pattern: text between two '#' marks made of latin word characters
# or CJK ranges.  Compiled once instead of once per text (the old code also
# ran a no-op text.encode('utf-8').decode('utf-8') round trip — removed).
_HASHTAG_RE = re.compile(
    r'#([a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+)#',
    re.UNICODE)


def get_user_keywords(text_list, word_dict, date, keywords_num=5):
    """Extract per-user TextRank keywords, hashtag counts and sensitive-word
    probabilities, then bulk-insert them.

    Parameters:
        text_list: dict mapping uid -> list of message texts (non-str
                   entries are skipped).
        word_dict: dict mapping uid -> word data; consumed by get_p and used
                   to enumerate the uids that get a row.
        date: day being processed, as a "YYYY-MM-DD" string.
        keywords_num: number of TextRank keywords to keep per text.

    Side effects: inserts one row per uid into UserKeyWord via
    sql_insert_many using the module-level cursor.
    """
    hastag_dict = defaultdict(dict)
    keywords_dict = defaultdict(dict)
    user_kw = {}
    tr4w = TextRank4Keyword()
    # Unix timestamp of the day's midnight (used in the row key).
    ts = int(time.mktime(
        time.strptime(date + " 00:00:00", "%Y-%m-%d %H:%M:%S")))
    for uid, texts in text_list.items():
        # BUG FIX: the hashtag counter used to be created once, outside this
        # loop, and stored by reference for every user — so each user ended
        # up with the cumulative hashtag counts of ALL users.  Count per
        # user instead.
        hashtag_counts = {}
        for text in texts:
            if not isinstance(text, str):
                continue
            for tag in _HASHTAG_RE.findall(text):
                hashtag_counts[tag] = hashtag_counts.get(tag, 0) + 1
            # TextRank keyword weights accumulate across the user's texts.
            tr4w.analyze(text=text, lower=True, window=2)
            kw = keywords_dict[uid]
            for item in tr4w.get_keywords(keywords_num, word_min_len=1):
                kw[item['word']] = kw.get(item['word'], 0) + item['weight']
        hastag_dict[uid] = hashtag_counts
    sensitive_words_weight = sensitive_word()
    # Per-user sensitive-word probabilities.
    stw_dict = get_p(sensitive_words_weight, word_dict)
    for uid in word_dict:
        user_kw["%s_%s" % (str(ts), uid)] = {
            "uid": uid,
            "timestamp": ts,
            "keywords": json.dumps(keywords_dict[uid], ensure_ascii=False),
            "hastags": json.dumps(hastag_dict[uid], ensure_ascii=False),
            "sensitive_words": json.dumps(stw_dict[uid], ensure_ascii=False),
            "store_date": date,
        }
    sql_insert_many(cursor, "UserKeyWord", "ukw_id", user_kw)