示例#1
0
def get_msg_type_aggs(data_dict, date):
    """Aggregate each user's message statistics for one day and store them.

    For every uid in ``data_dict`` the user's records are loaded into a
    DataFrame and summarised by ``get_msg_aggs``.  The summary is extended
    with the number of ``Information`` rows for that uid whose timestamp
    falls inside the day, plus the day's start timestamp, the uid and the
    date string.  All rows are then written to the ``UserBehavior`` table in
    one ``sql_insert_many`` call, keyed by ``"<day start>_<uid>"``.

    Args:
        data_dict: mapping of uid to the records passed to ``DataFrame``
            (presumably a list of message dicts; confirm against the caller).
        date: the day to aggregate, formatted ``"YYYY-MM-DD"``; interpreted
            in the local timezone by ``time.mktime``.

    Returns:
        None. The result is persisted through ``sql_insert_many``.
    """
    time_format = "%Y-%m-%d %H:%M:%S"
    day_start = int(time.mktime(time.strptime(date + " 00:00:00", time_format)))
    day_end = int(time.mktime(time.strptime(date + " 23:59:59", time_format)))

    user_behavior_dict = {}
    for uid, mid_dict_list in data_dict.items():
        behavior_dict = get_msg_aggs(DataFrame(mid_dict_list))
        # day_end is the last second of the day, so the upper bound has to be
        # inclusive; a strict "<" would drop rows stamped 23:59:59.
        behavior_dict["sensitivenum"] = Information.objects.filter(
            uid=uid,
            timestamp__gte=day_start,
            timestamp__lte=day_end).count()
        behavior_dict["timestamp"] = day_start
        behavior_dict["uid"] = uid
        behavior_dict["store_date"] = date
        user_behavior_dict["%s_%s" % (str(day_start), uid)] = behavior_dict
    sql_insert_many(cursor, "UserBehavior", "ub_id", user_behavior_dict)
def get_user_domain(word_dict, date):
    """Compute per-user domain scores and store them in ``UserDomain``.

    The domain weights returned by ``domain_tfidf()`` are combined with
    ``word_dict`` through ``get_p``.  For every uid in ``word_dict`` the
    resulting entry is serialised to JSON, and the first item of its first
    element is recorded as the main domain (``"other"`` when the entry is
    empty).  Rows are keyed by ``"<timestamp>_<uid>"`` and written with a
    single ``sql_insert_many`` call.

    Args:
        word_dict: mapping keyed by uid; passed unchanged to ``get_p``.
        date: object exposing ``timestamp()`` (presumably a
            ``datetime.datetime``); also stored as ``store_date``.

    Returns:
        None. The result is persisted through ``sql_insert_many``.
    """
    domain_weights = domain_tfidf()
    rows = {}
    stamp = date.timestamp()
    scores_by_user = get_p(domain_weights, word_dict)

    for user_id in word_dict:
        ranked = scores_by_user[user_id]
        serialized = json.dumps(ranked)
        # Fall back to "other" when no domain was scored for this user.
        main_domain = ranked[0][0] if len(ranked) else "other"
        row = {}
        row["uid"] = user_id
        row["timestamp"] = stamp
        row["main_domain"] = main_domain
        row["domains"] = serialized
        row["store_date"] = date
        rows["%s_%s" % (str(stamp), user_id)] = row

    sql_insert_many(cursor, "UserDomain", "ud_id", rows)
示例#3
0
def get_user_activity_aggs(data_dict, date):
    """Aggregate each user's per-location activity for one day and store it.

    For every uid in ``data_dict`` the user's records are grouped by their
    ``geo`` column to obtain a message count per location.  For each
    (uid, geo) pair the number of ``Information`` rows inside the day is
    counted as well.  Rows are keyed by ``"<end_time>_<uid>_<geo>"`` and
    written to the ``UserActivity`` table with one ``sql_insert_many`` call.

    Args:
        data_dict: mapping of uid to the records passed to ``DataFrame``;
            the records must provide a ``geo`` column.
        date: the day to aggregate, formatted ``"YYYY-MM-DD"``; parsed as a
            naive local datetime.

    Returns:
        None. The result is persisted through ``sql_insert_many``.
    """
    time_format = '%Y-%m-%d %H:%M:%S'
    # Parse both ends of the day explicitly rather than subtracting 24h from
    # the end: that would start one second before midnight and is off by an
    # hour on days where the local UTC offset changes.
    start_time = datetime.datetime.strptime(date + " 00:00:00",
                                            time_format).timestamp()
    end_time = datetime.datetime.strptime(date + " 23:59:59",
                                          time_format).timestamp()

    user_activity_dict = {}
    for uid in data_dict:
        df = DataFrame(data_dict[uid])
        geo_dict = df.groupby([df["geo"]]).size().to_dict()
        for geo, statusnum in geo_dict.items():
            # end_time is the last second of the day, so it is included.
            sensitivenum = Information.objects.filter(
                uid=uid,
                timestamp__gte=start_time,
                timestamp__lte=end_time,
                geo=geo).count()
            user_activity_dict["%s_%s_%s" % (str(end_time), uid, geo)] = {
                "uid": uid,
                "timestamp": end_time,
                "geo": geo,
                # No IP information is available yet; to be filled in later.
                "send_ip": None,
                "statusnum": statusnum,
                "sensitivenum": sensitivenum,
                "store_date": date
            }
    sql_insert_many(cursor, "UserActivity", "ua_id", user_activity_dict)
def get_user_topic(word_dict, date):
    """Compute per-user topic scores and store them in ``UserTopic``.

    The topic weights returned by ``topic_tfidf()`` are combined with
    ``word_dict`` through ``get_p``; each user's entry is serialised to JSON
    and saved under the key ``"<timestamp>_<uid>"`` with a single
    ``sql_insert_many`` call.

    Args:
        word_dict: mapping keyed by uid; passed unchanged to ``get_p``.
        date: object exposing ``timestamp()`` (presumably a
            ``datetime.datetime``); also stored as ``store_date``.

    Returns:
        None. The result is persisted through ``sql_insert_many``.
    """
    topic_weights = topic_tfidf()
    stamp = date.timestamp()
    scores_by_user = get_p(topic_weights, word_dict)

    rows = {}
    for user_id in word_dict:
        row_key = "%s_%s" % (str(stamp), user_id)
        rows[row_key] = dict([
            ("uid", user_id),
            ("timestamp", stamp),
            ("topics", json.dumps(scores_by_user[user_id])),
            ("store_date", date),
        ])

    sql_insert_many(cursor, "UserTopic", "ut_id", rows)
def get_user_keywords(text_list,word_dict,date, keywords_num=5):
    """Extract per-user keywords, hashtags and sensitive words and store them.

    For every uid in ``text_list`` each ``str`` text is scanned for
    ``#hashtag#`` occurrences, which are counted per user, and run through
    ``TextRank4Keyword``; the weights of the returned keywords are summed per
    user.  Sensitive-word scores come from ``get_p`` applied to the weights
    returned by ``sensitive_word()``.  One row per uid in ``word_dict`` is
    written to the ``UserKeyWord`` table, keyed by ``"<day start>_<uid>"``,
    with all three results serialised as JSON.

    Args:
        text_list: mapping of uid to an iterable of texts; non-``str``
            entries are skipped.
        word_dict: mapping keyed by uid; passed unchanged to ``get_p`` and
            used to decide which uids get a row.
        date: the day being stored, formatted ``"YYYY-MM-DD"``; interpreted
            in the local timezone by ``time.mktime``.
        keywords_num: how many keywords to request from TextRank per text.

    Returns:
        None. The result is persisted through ``sql_insert_many``.
    """
    # Compiled once per call instead of once per text.
    hashtag_re = re.compile(r'#([a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+)#', re.UNICODE)
    tr4w = TextRank4Keyword()
    ts = int(time.mktime(time.strptime(date + " 00:00:00",
                                       "%Y-%m-%d %H:%M:%S")))

    keywords_dict = defaultdict(dict)
    # Users without any text fall back to an empty list, serialised as "[]".
    hastag_dict = defaultdict(list)
    for uid, texts in text_list.items():
        # A fresh counter per user: one shared dict would leak the hashtags
        # of previously processed users into every later user's row.
        user_hashtags = {}
        user_keywords = keywords_dict[uid]
        for text in texts:
            if not isinstance(text, str):
                continue
            for tag in hashtag_re.findall(text):
                user_hashtags[tag] = user_hashtags.get(tag, 0) + 1
            # Per the original note, analyze() expects a str (or UTF-8
            # encoded bytes) on Python 3.
            tr4w.analyze(text=text, lower=True, window=2)
            for item in tr4w.get_keywords(keywords_num, word_min_len=1):
                word = item['word']
                user_keywords[word] = user_keywords.get(word, 0) + item['weight']
        hastag_dict[uid] = user_hashtags

    sensitive_words_weight = sensitive_word()
    stw_dict = get_p(sensitive_words_weight, word_dict)

    user_kw = {}
    for uid in word_dict:
        user_kw["%s_%s" % (str(ts), uid)] = {
            "uid": uid,
            "timestamp": ts,
            "keywords": json.dumps(keywords_dict[uid], ensure_ascii=False),
            "hastags": json.dumps(hastag_dict[uid], ensure_ascii=False),
            "sensitive_words": json.dumps(stw_dict[uid], ensure_ascii=False),
            "store_date": date,
        }
    sql_insert_many(cursor, "UserKeyWord", "ukw_id", user_kw)