Example #1
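All of these examples appear to come from the same user-profiling module. A minimal sketch of the imports they rely on, inferred from the snippets themselves; the project-specific helpers they call (cursor, Information, sql_insert_many, get_msg_aggs, get_p, domain_tfidf, topic_tfidf, sensitive_word, stopwordslist, triple_classifier, date2ts, update) are referenced but not defined in any of the examples:

# Assumed module-level imports, inferred from the snippets below.
import datetime
import json
import pickle
import re
import time
from collections import Counter, defaultdict

import joblib
import pandas as pd
from pandas import DataFrame
from textrank4zh import TextRank4Keyword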
def get_msg_type_aggs(data_dict, date):
    # Unix timestamps for the start (ts) and end (te) of the given day.
    tb = date + " 00:00:00"
    ta = time.strptime(tb, "%Y-%m-%d %H:%M:%S")
    ts = int(time.mktime(ta))
    te = date + " 23:59:59"
    ta = time.strptime(te, "%Y-%m-%d %H:%M:%S")
    te = int(time.mktime(ta))
    user_behavior_dict = {}
    for uid in data_dict:
        #sql = 'select * from Information where uid = %s and ' % uid
        #cursor.execute(sql)
        #cursor.fetchall()
        mid_dict_list = data_dict[uid]
        df = DataFrame(mid_dict_list)
        behavior_dict = get_msg_aggs(df)
        # Number of sensitive messages this user posted during the day.
        sensitivenum = Information.objects.filter(uid=uid,
                                                  timestamp__gte=ts,
                                                  timestamp__lt=te).count()
        behavior_dict["sensitivenum"] = sensitivenum
        behavior_dict["timestamp"] = ts
        behavior_dict["uid"] = uid
        behavior_dict["store_date"] = date
        #print(behavior_dict)
        user_behavior_dict["%s_%s" % (str(ts), uid)] = behavior_dict
    sql_insert_many(cursor, "UserBehavior", "ub_id", user_behavior_dict)
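A hypothetical call, only to show the expected shapes: data_dict maps each uid to the day's message records, and date is a "YYYY-MM-DD" string. The field names inside each record are made up here; get_msg_aggs() (not shown) determines the real ones.

# Illustrative input; the record fields are assumptions.
data_dict = {
    "1234567890": [
        {"mid": "m1", "msg_type": "original", "timestamp": 1546308000},
        {"mid": "m2", "msg_type": "repost", "timestamp": 1546311600},
    ],
}
get_msg_type_aggs(data_dict, "2019-01-01")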
def get_user_domain(word_dict, date):
    #time1 = time.time()
    domain_dict = domain_tfidf()
    #time2 = time.time()
    #print("loading domains took", time2 - time1)
    user_domain = {}
    ts = date.timestamp()  # here `date` appears to be a datetime object, unlike the "YYYY-MM-DD" strings used elsewhere
    #thedate = datetime.date.today()
    domain_p = get_p(domain_dict, word_dict)
    #time3 = time.time()
    #print("computing probabilities took", time3 - time2)
    for k in word_dict.keys():
        domain_json = json.dumps(domain_p[k])
        if len(domain_p[k]):
            # The first entry is the user's highest-ranked domain.
            md = domain_p[k][0][0]
            #print(md)
        else:
            md = "other"
        user_domain["%s_%s" % (str(ts), k)] = {
            "uid": k,
            "timestamp": ts,
            "main_domain": md,
            "domains": domain_json,
            "store_date": date
        }
    sql_insert_many(cursor, "UserDomain", "ud_id", user_domain)
    update(user_domain)
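get_p() is not included in these examples. A rough sketch of what such a helper might compute, assuming its first argument maps a category (domain, topic, or sensitive-word group) to {word: weight} and word_dict maps uid to {word: count} as built by wordcount() in Example #3; the real implementation may differ.

def get_p(category_dict, word_dict):
    # Hypothetical stand-in for illustration only.
    result = {}
    for uid, counts in word_dict.items():
        scores = {}
        for category, weights in category_dict.items():
            s = sum(weights.get(word, 0.0) * n
                    for word, n in counts.items() if word != "count")
            if s > 0:
                scores[category] = s
        # Sorted (category, score) pairs, so result[uid][0][0] is the best match,
        # matching how get_user_domain() reads domain_p[k][0][0].
        result[uid] = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
    return result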
Example #3
def wordcount(text_dict, date):
    stopwords = stopwordslist()
    word_dict = {}  # format: {uid: {word: word frequency}}
    user_wc = {}
    for k, v in text_dict.items():
        word_list = {}
        count = 0
        for item in v:             # each item is one tokenized weibo
            for item1 in item:     # each item1 is one token
                if item1 not in stopwords and item1 != " ":
                    count += 1
                    try:
                        word_list[item1] += 1
                    except KeyError:
                        word_list[item1] = 1
        word_list["count"] = count  # total number of counted tokens
        word_dict[k] = word_list
    # Unix timestamp of the start of the day.
    td = date + " 00:00:00"
    ta = time.strptime(td, "%Y-%m-%d %H:%M:%S")
    ts = int(time.mktime(ta))
    for k in word_dict.keys():
        word_json = json.dumps(word_dict[k])
        wc_id = "%s_%s" % (str(int(time.time())), k)
        user_wc[wc_id] = {
            "uid": k,
            "timestamp": ts,
            "wordcount": word_json,
            "store_date": date
        }
    sql_insert_many("WordCount", "uwc_id", user_wc)
    return word_dict
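A hypothetical call showing the expected input: text_dict maps each uid to that day's weibos, each already segmented into a list of tokens (the tokens are made up, and the commented result assumes none of them appear in the stopword list).

text_dict = {
    "1234567890": [
        ["今天", "天气", "不错"],
        ["天气", "好"],
    ],
}
word_dict = wordcount(text_dict, "2019-01-01")
# word_dict["1234567890"] would then be roughly
# {"今天": 1, "天气": 2, "不错": 1, "好": 1, "count": 5}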
Example #4
def get_user_activity_aggs(data_dict, date):
    #end_time = int(time.time())
    # Unix timestamps for the end and start of the given day.
    end_time = datetime.datetime.strptime(date + " 23:59:59",
                                          '%Y-%m-%d %H:%M:%S').timestamp()
    start_time = datetime.datetime.strptime(date + " 00:00:00",
                                            '%Y-%m-%d %H:%M:%S').timestamp()
    user_activity_dict = {}
    ip_dict = {}
    for uid in data_dict:
        #geo_ip_dict = defaultdict(set)
        mid_dict_list = data_dict[uid]
        #print(mid_dict_list)
        df = DataFrame(mid_dict_list)
        # Replace NaN with None so missing values can be tested with `is None` below.
        df = df.astype(object).where(pd.notnull(df), None)
        geo_dict = df.groupby([df["geo"]]).size().to_dict()
        #print(geo_dict)
        #print(uid)
        '''No IP information yet; to be added later:
        activity_dict = df.groupby([df["geo"], df["send_ip"]]).size().to_dict()
        for k, v in activity_dict.items():
            geo_ip_dict[k[0]].add(k[1][:(k[1].rindex(".") + 1)] + "*")
        '''
        # Remember the last seen IP for each geo location.
        for index, row in df.iterrows():
            ip_dict[row["geo"]] = row["ip"]
            #print(row['geo'])
            #try:
            #ip_dict[row["geo"]] = row["ip"]
            #except:
            #continue
        #print(ip_dict)
        for k in geo_dict:
            #print(k)
            #ips = ",".join(list(geo_ip_dict[k]))  # no IP info yet; to be added later
            ip = ip_dict[k]
            if ip is None:
                ip = "未知"  # "unknown"
            #ip = geo_ip_dict[k]
            statusnum = geo_dict[k]  # number of statuses sent from this location
            #print(geo_dict[k])
            # Number of sensitive messages sent from this location during the day.
            sensitivenum = Information.objects.filter(
                uid=uid,
                timestamp__gte=start_time,
                timestamp__lt=end_time,
                geo=k).count()
            user_activity_dict["%s_%s_%s" % (str(end_time), uid, k)] = {
                "uid": uid,
                "timestamp": end_time,
                "geo": k,
                "send_ip": ip,
                "statusnum": statusnum,
                "sensitivenum": sensitivenum,
                "store_date": date
            }
    sql_insert_many(cursor, "UserActivity", "ua_id", user_activity_dict)
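Another hypothetical input, showing only the fields the function actually reads: each record needs at least "geo" and "ip" values; any other fields are simply carried into the DataFrame.

data_dict = {
    "1234567890": [
        {"mid": "m1", "geo": "北京", "ip": "10.0.0.1"},
        {"mid": "m2", "geo": "上海", "ip": None},  # a missing IP is stored as "未知" (unknown)
    ],
}
get_user_activity_aggs(data_dict, "2019-01-01")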
def cal_user_emotion(word_dict, thedate):
    '''
    User sentiment calculation: 0 = negative, 1 = neutral, 2 = positive.
    :param word_dict:
    :return: None
    '''
    # Load the word vectors.
    with open('../profile_cal/sentiment_model_data/weibo_vector.pkl',
              'rb') as f:
        weibo_dic = pickle.load(f)
    # Load the sentiment model.
    l_m = joblib.load(
        '../profile_cal/sentiment_model_data/sentiment_logical.model')
    user_sentiment_dict = {}
    for uid, weibo_list in word_dict.items():  # key is the uid, value is a list of weibos
        # Key spellings ("negtive", "nuetral") are kept as-is to match the
        # UserSentiment table columns.
        sentiment_dict = {
            'timestamp': date2ts(thedate),
            'uid': uid,
            'negtive': 0,
            'nuetral': 0,
            'positive': 0,
            'store_date': thedate
        }
        if len(weibo_list):  # the user has weibo data for this day
            sentiment = triple_classifier(weibo_list, weibo_dic, l_m)
            c = dict(Counter(sentiment).most_common())
            sentiment_dict['negtive'] = c.get('0', 0)
            sentiment_dict['nuetral'] = c.get('1', 0)
            sentiment_dict['positive'] = c.get('2', 0)
        else:
            print("no data")
        user_sentiment_dict['%s_%s' % (str(sentiment_dict['timestamp']),
                                       uid)] = sentiment_dict
    sql_insert_many("UserSentiment", "us_id", user_sentiment_dict)
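date2ts() is not shown either. A minimal sketch, assuming thedate is a "YYYY-MM-DD" string and the intended result is that day's 00:00:00 Unix timestamp, mirroring how the other snippets build timestamps; the real helper may differ.

def date2ts(thedate):
    # Hypothetical helper for illustration only.
    ta = time.strptime(thedate + " 00:00:00", "%Y-%m-%d %H:%M:%S")
    return int(time.mktime(ta))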
Example #6
def get_user_topic(word_dict, date):
    #time1 = time.time()
    topic_dict = topic_tfidf()
    #time2 = time.time()
    #print("loading topics took:", time2 - time1)
    #thedate = datetime.date.today()
    #td = date + " 00:00:00"
    #ta = time.strptime(td, "%Y-%m-%d %H:%M:%S")
    #ts = int(time.mktime(ta))
    #print(topic_dict)
    ts = date.timestamp()  # here `date` appears to be a datetime object, unlike the "YYYY-MM-DD" strings used elsewhere
    user_topic = {}
    topic_p = get_p(topic_dict, word_dict)
    #time3 = time.time()
    #print("computing probabilities took:", time3 - time1)
    for k in word_dict.keys():
        topic_json = json.dumps(topic_p[k])
        user_topic["%s_%s" % (str(ts), k)] = {
            "uid": k,
            "timestamp": ts,
            "topics": topic_json,
            "store_date": date
        }
    sql_insert_many(cursor, "UserTopic", "ut_id", user_topic)
def get_user_keywords(text_list, word_dict, date, keywords_num=5):
    hastag_dict = defaultdict(list)
    user_kw = {}
    keywords_dict = defaultdict(dict)
    #thedate = datetime.date.today()
    tr4w = TextRank4Keyword()
    # Pattern for Chinese-style hashtags of the form #话题# ("#topic#").
    HASHTAG_RE = re.compile(
        r'#([a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+)#',
        re.UNICODE)
    #time11 = time.time()
    # Unix timestamp of the start of the day.
    td = date + " 00:00:00"
    ta = time.strptime(td, "%Y-%m-%d %H:%M:%S")
    ts = int(time.mktime(ta))
    for k, v in text_list.items():
        hastag = {}
        for text in v:
            if isinstance(text, str):
                # Collect #hashtag# occurrences for this user.
                ht = HASHTAG_RE.findall(text)
                for h in ht:
                    if h in hastag:
                        hastag[h] += 1
                    else:
                        hastag[h] = 1
                # In py2 `text` must be a utf-8 encoded str or a unicode object;
                # in py3 a utf-8 encoded bytes or str object.
                tr4w.analyze(text=text, lower=True, window=2)
                for item in tr4w.get_keywords(keywords_num, word_min_len=1):
                    #print(item.word, item.weight)
                    try:
                        keywords_dict[k][item['word']] += item['weight']
                    except KeyError:
                        keywords_dict[k][item['word']] = item['weight']
                #print(json.dumps(keywords_dict[k], ensure_ascii=False))
        hastag_dict[k] = hastag
        #print(hastag)
    #print(hastag_dict)
    #time22 = time.time()
    #print("keyword and hashtag extraction took:", time22 - time11)
    #time2 = time.time()
    #print("wordcount took:", time2 - time22)
    sensitive_words_weight = sensitive_word()
    #time3 = time.time()
    #print("loading sensitive words took:", time3 - time2)
    stw_dict = get_p(sensitive_words_weight, word_dict)
    #time4 = time.time()
    #print("computing probabilities took:", time4 - time3)
    for k in word_dict:
        keyword_json = json.dumps(keywords_dict[k], ensure_ascii=False)
        hastag_json = json.dumps(hastag_dict[k], ensure_ascii=False)
        stw_json = json.dumps(stw_dict[k], ensure_ascii=False)
        user_kw["%s_%s" % (str(ts), k)] = {
            "uid": k,
            "timestamp": ts,
            "keywords": keyword_json,
            "hastags": hastag_json,
            "sensitive_words": stw_json,
            "store_date": date
        }
    sql_insert_many(cursor, "UserKeyWord", "ukw_id", user_kw)
    #time5 = time.time()
    #print("inserting keywords took:", time5 - time4)
    #return keywords_dict, hastag_dict
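A quick self-contained check of the hashtag pattern used above (it extracts Chinese-style #话题# tags):

import re

HASHTAG_RE = re.compile(
    r'#([a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+)#',
    re.UNICODE)
print(HASHTAG_RE.findall("转发微博 #新年快乐# and #hello#"))  # ['新年快乐', 'hello']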