Example #1
def get_attr_geo_track(uid_list):
    date_results = [] # [[date, [(geo1, count1), (geo2, count2)]], ...] for the past 7 days
    now_ts = time.time()
    now_date = ts2datetime(now_ts)
    #test
    now_date = '2013-09-08'
    ts = datetime2ts(now_date)
    for i in range(7, 0, -1):
        timestamp = ts - i*24*3600
        #print 'timestamp:', ts2datetime(timestamp)
        ip_dict = dict()
        results = r_cluster.hmget('ip_'+str(timestamp), uid_list)
        #print 'results:',results
        for item in results:
            if item:
                item_dict = json.loads(item)
                #print 'item_dict:', item_dict
                for ip_item in item_dict:
                        ip_dict[ip_item] = ip_dict.get(ip_item, 0) + item_dict[ip_item]
        geo_dict = ip2geo(ip_dict)
        sort_geo_dict = sorted(geo_dict.items(), key=lambda x:x[1], reverse=True)
        date_key = ts2datetime(timestamp)
        date_results.append([date_key, sort_geo_dict[:2]])
    #print 'results:', date_results
    return {'geo_track': json.dumps(date_results)}
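# A standalone sketch (not from the original source) of the per-day IP merge above,
# using hmget-style JSON strings with hypothetical values; the merged dict is what
# ip2geo() receives.
def _demo_ip_merge():
    results = ['{"1.2.3.4": 2, "5.6.7.8": 1}', '{"1.2.3.4": 3}', None]
    ip_dict = dict()
    for item in results:
        if item:
            item_dict = json.loads(item)
            for ip_item in item_dict:
                ip_dict[ip_item] = ip_dict.get(ip_item, 0) + item_dict[ip_item]
    print ip_dict  # {u'1.2.3.4': 5, u'5.6.7.8': 1}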
def get_activity_time(uid_list):
    results = {}
    now_ts = time.time()
    now_date = ts2datetime(now_ts)
    #run_type
    if RUN_TYPE == 1:
        timestamp = datetime2ts(now_date)
    else:
        timestamp = datetime2ts("2013-09-08")
    activity_list_dict = {} # {uid:[activity_list], uid:[]}
    for i in range(1,WEEK+1):
        ts = timestamp - DAY*i
        if WORK_TYPE != 0:
            r_result = redis_activity.hmget('activity_'+str(ts), uid_list)
        else:
            r_result = []
            index_name = "activity_" + str(ts2datetime(ts))
            exist_bool = es_cluster.indices.exists(index=index_name)
            if exist_bool:
                es_results = es_cluster.mget(index=index_name, doc_type="activity", body={"ids":uid_list})["docs"]
                for item in es_results:
                    if item['found']:
                        r_result.append(item['_source']['activity_dict'])
                    else:
                        r_result.append(json.dumps({}))
            else:
                r_result = [json.dumps(dict())]*len(uid_list)

        if r_result:
            for j in range(0, len(uid_list)):
                uid = uid_list[j]
                if uid not in activity_list_dict:
                    activity_list_dict[uid] = [0] * 96
                user_r_result = r_result[j]
                if user_r_result:
                    user_activity_dict = json.loads(user_r_result)
                    for slot in range(0, 96):
                        activity_list_dict[uid].append(user_activity_dict.get(str(slot), 0))
    for uid in uid_list:
        activity_list = activity_list_dict[uid]
        statusnum = sum(activity_list)
        signal = np.array(activity_list)
        fftResult = np.abs(np.fft.fft(signal))**2
        n = signal.size
        freq = np.fft.fftfreq(n, d=1)
        i = 0
        max_val = 0
        max_freq = 0
        for val in fftResult:
            if val>max_val and freq[i]>0:
                max_val = val
                max_freq = freq[i]
            i += 1
        results[uid] = {'statusnum': statusnum, 'activity_time': math.log(max_freq + 1)}
    
    return results
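# A self-contained sketch of the periodicity feature above: 96 bins per day
# (15-minute slots), a synthetic signal with one cycle per day, and the dominant
# positive FFT frequency of 1/96 cycles per bin (synthetic data, not from the source).
def _demo_activity_fft():
    import numpy as np
    import math
    n_bins = 96 * 7
    signal = 5 + np.sin(2 * np.pi * np.arange(n_bins) / 96.0)  # daily rhythm
    fftResult = np.abs(np.fft.fft(signal)) ** 2
    freq = np.fft.fftfreq(n_bins, d=1)  # cycles per 15-minute bin
    max_val, max_freq = 0, 0
    for val, f in zip(fftResult, freq):
        if val > max_val and f > 0:
            max_val, max_freq = val, f
    print max_freq                 # 1/96 ~= 0.0104, i.e. one cycle per day
    print math.log(max_freq + 1)   # the 'activity_time' feature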
def filter_activity(user_set):
    results = []
    #run_type
    if RUN_TYPE == 1:
        now_date = ts2datetime(time.time())
    else:
        now_date = RUN_TEST_TIME
    ts = datetime2ts(now_date) - DAY
    date = ts2datetime(ts)
    timestamp = datetime2ts(date)
    for user in user_set:
        over_count = 0
        for i in range(0,7):
            ts = timestamp - DAY*i
            result = redis_activity.hget('activity_'+str(ts), str(user))
            if result:
                items_dict = json.loads(result)
                for item in items_dict:
                    weibo_count = items_dict[item]
                    if weibo_count > activity_threshold:
                        over_count += 1
        if over_count == 0:
            results.append(user)
        else:
            writer.writerow([user, 'activity'])
            
    return results
def filter_activity(user_set):
    results = []
    now_date = ts2datetime(time.time())
    # test
    now_date = '2013-09-08'
    ts = datetime2ts(now_date) - 24*3600
    date = ts2datetime(ts)
    timestamp = datetime2ts(date)
    for user in user_set:
        over_count = 0
        for i in range(0,7):
            ts = timestamp - 3600*24*i
            result = r_cluster.hget('activity_'+str(ts), str(user))
            if result:
                item_dict = json.loads(result)
                sorted_dict = sorted(item_dict.iteritems(), key=lambda asd:asd[1], reverse=True)
                if sorted_dict[0][1] > activity_threshold:
                    over_count = 1
        if over_count == 0:
            results.append(user)
        else:
            writer.writerow([user, 'activity'])

    print 'after filter activity: ', len(results)
    return results
def key_words_search(search_type, pre, during, start_time, keyword, search_key='', sort_norm='', sort_scope='', time=1, isall=False):
    keywords = keyword.split(",")
    should = []
    for key in keywords:
        if search_type == "hashtag":
            should.append({"prefix":{"text.text": "#" +  key + "#"}})
        else:    
            should.append({"prefix":{"text.text":key}})    
    date = start_time 
    index_name = pre + start_time
    while not es_9206.indices.exists(index=index_name):
        new_time = datetime2ts(date) + DAY
        date = ts2datetime(new_time)
        index_name = pre + date
        during -= 1

    
    uid_set = set()
    for i in range(during):
        print index_name
        query = {"query":{"bool":{"must":[],"must_not":[],"should":should}},"size":MAX_ITEMS,"sort":[],"facets":{},"fields":['uid']}
        try:
            temp = es_9206.search(index=index_name, doc_type='text', body=query)
            result = temp['hits']['hits']
            print "Fetch " + str(len(result))
            for item in result:
                uid_set.add(item['fields']['uid'][0].encode("utf-8"))
        except Exception, e:
            print e
            raise Exception('user_list failed!')
        new_time = datetime2ts(date) + DAY
        date = ts2datetime(new_time)
        index_name = pre + date
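# For reference, a sketch of the should-clause built above: with
# search_type == "hashtag" and keyword "tag1,tag2", the prefix filters get
# wrapped in #...# markers (hypothetical keywords).
def _demo_should_clause():
    keywords = "tag1,tag2".split(",")
    should = [{"prefix": {"text.text": "#" + key + "#"}} for key in keywords]
    print should  # [{'prefix': {'text.text': '#tag1#'}}, {'prefix': {'text.text': '#tag2#'}}]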
Example #6
def get_interval_count(topic, start_ts, end_ts):
    results = [0]
    ts_list = []
    #unit = 900
    #during = Day
    during = interval_count_during
    start_ts = datetime2ts(ts2datetime(start_ts))
    ts_list.append(start_ts)
    #end_ts = datetime2ts(ts2datetime(end_ts))
    # round end_ts up to the next whole day if it is not already one
    print 'before deal end_ts:', ts2date(end_ts)
    if end_ts - datetime2ts(ts2datetime(end_ts))!= 0:
        end_ts = datetime2ts(ts2datetime(end_ts)) + 3600 * 24
    print 'get_interval_count start_ts:', ts2date(start_ts)
    print 'get_interval_count end_ts:', ts2date(end_ts)

    windowsize = (end_ts - start_ts) / Day
    interval = (end_ts - start_ts) / during
    for i in range(interval, 0, -1):
        begin_ts = end_ts - during * i
        over_ts = begin_ts + during
        ts_list.append(over_ts)

        items = db.session.query(PropagateCountNews).filter(PropagateCountNews.topic==topic,
                                                            PropagateCountNews.end<=over_ts,
                                                            PropagateCountNews.end>begin_ts,
                                                            PropagateCountNews.range==unit).all()
        if items:
            result = Merge_propagate(items)
        else:
            result = 0
        results.append(float(result))

    return ts_list, results
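# A worked sketch of the bucketing arithmetic above, with plain UTC integer
# seconds standing in for datetime2ts/ts2datetime (timezone handling is ignored;
# all values are hypothetical).
def _demo_interval_buckets():
    DAY_SECONDS = 24 * 3600
    during = 3600
    start_ts = 1378080000                        # already a midnight
    end_ts = 1378125000                          # mid-day, so snap up
    if end_ts % DAY_SECONDS != 0:
        end_ts = end_ts - end_ts % DAY_SECONDS + DAY_SECONDS
    interval = (end_ts - start_ts) / during      # number of buckets (Python 2 int division)
    ts_list = [start_ts]
    for i in range(interval, 0, -1):
        begin_ts = end_ts - during * i
        ts_list.append(begin_ts + during)        # bucket right edges
    print interval, ts_list[1], ts_list[-1]      # 24, start_ts + 3600, end_ts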
def search_weibo_task(user_name):
    c_result = {}
    query = {"query":{"bool":{"must":[{"term":{"user_rank_task.submit_user":user_name}}]}},"size":MAX_ITEMS,"sort":[{"create_time":{"order":"desc"}}],"fields":["status","search_type","keyword","submit_user","sort_scope","sort_norm","start_time","user_ts","end_time", "create_time", 'number']}
    return_list = []
    result = es.search(index=WEIBO_RANK_KEYWORD_TASK_INDEX, doc_type=WEIBO_RANK_KEYWORD_TASK_TYPE, body=query)['hits']
    c_result['flag'] = True
    for item in result['hits']:
        result_temp = {}
        result_temp['submit_user'] = item['fields']['submit_user'][0]
        result_temp['search_type'] = item['fields']['search_type'][0]
        result_temp['keyword'] = json.loads(item['fields']['keyword'][0])
        result_temp['sort_scope'] = item['fields']['sort_scope'][0]
        result_temp['sort_norm'] = item['fields']['sort_norm'][0]
        result_temp['start_time'] = ts2datetime(item['fields']['start_time'][0])
        result_temp['end_time'] = ts2datetime(item['fields']['end_time'][0])
        result_temp['status'] = item['fields']['status'][0]
        result_temp['create_time'] = ts2date(item['fields']['create_time'][0])
        result_temp['search_id'] = item['fields']['user_ts'][0]
        tmp = item['fields'].get('number', 0)
        if tmp:
            result_temp['number'] = int(tmp[0])
        else:
            result_temp['number'] = 100
        return_list.append(result_temp)
    c_result['data'] = return_list
    return c_result
def main():
    RUN_TYPE = 0
    if RUN_TYPE == 1:
        now_ts = time.time()
    else:
        now_ts = datetime2ts(RUN_TEST_TIME)
        now_ts = datetime2ts('2013-09-02')  # test override
    date = ts2datetime(now_ts - DAY)
    # auto recommendation: step 1:4
    #step1: read from top es_daily_rank
    top_user_set, user_dict = search_from_es(date)
    #step2: filter black_uid
    black_user_set = read_black_user()
    subtract_user_set = top_user_set - black_user_set
    #step3: filter users have been in
    subtract_user_set = list(subtract_user_set)
    candidate_results = filter_in(subtract_user_set)
    #step4: filter rules about ip count& reposts/bereposts count&activity count
    results = filter_rules(candidate_results)
    #step5: get sensitive user
    sensitive_user = list(get_sensitive_user(date))
    results = results - set(sensitive_user) # influence user - sensitive user
    new_date = ts2datetime(now_ts)
    hashname_influence = "recomment_" + new_date + "_influence"
    if results:
        for uid in results:
            #print uid
            r.hset(hashname_influence, uid, "0")

    hashname_sensitive = "recomment_" + new_date + "_sensitive"
    if sensitive_user:
        for uid in sensitive_user:
            #print "sensitive"
            r.hset(hashname_sensitive, uid, "0")
    """
def query_hot_weibo(ts, origin_mid_list, time_segment):
    query_all_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [
                            {"range": {
                                "timestamp":{
                                    "gte": ts - time_segment,
                                    "lt": ts
                                }
                            }},
                            {"terms":{"root_mid":origin_mid_list}}
                        ]
                    }
                }
            }
        },
        "aggs":{
            "all_mid":{
                "terms":{"field": "root_mid", "size":400},
                "aggs":{
                    "message_type":{
                        "terms":{
                            "field":"message_type"
                        }
                    }
                }
            }
        }
    }

    return_results = dict()
    datetime_1 = ts2datetime(ts)
    datetime_2 = ts2datetime(ts-24*3600)
    index_name_1 = flow_text_index_name_pre + datetime_1
    index_name_2 = flow_text_index_name_pre + datetime_2
    index_list = []
    exist_es_1 = es_text.indices.exists(index_name_1)
    exist_es_2 = es_text.indices.exists(index_name_2)
    if exist_es_1:
        index_list.append(index_name_1)
    if exist_es_2:
        index_list.append(index_name_2)
    if index_list:
        results = es_text.search(index=index_list, doc_type=flow_text_index_type,body=query_all_body)['aggregations']['all_mid']['buckets']
        if results:
            for item in results:
                temp_dict = dict()
                temp_dict[item['key']] = item['doc_count']
                detail = item['message_type']['buckets']
                detail_dict = dict()
                for iter_item in detail:
                    detail_dict[iter_item['key']] = iter_item['doc_count']
                temp_dict['retweeted'] = detail_dict.get(3, 0)
                temp_dict['comment'] = detail_dict.get(2, 0)
                return_results[item['key']] = temp_dict

    return return_results
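# A sketch of the aggregation shape the loop above consumes, with a hand-written
# bucket list standing in for the es_text response (hypothetical mids and counts).
def _demo_parse_buckets():
    buckets = [
        {"key": "mid_1", "doc_count": 12,
         "message_type": {"buckets": [{"key": 2, "doc_count": 4},
                                      {"key": 3, "doc_count": 8}]}},
    ]
    return_results = dict()
    for item in buckets:
        temp_dict = {item["key"]: item["doc_count"]}
        detail_dict = dict((b["key"], b["doc_count"]) for b in item["message_type"]["buckets"])
        temp_dict["retweeted"] = detail_dict.get(3, 0)
        temp_dict["comment"] = detail_dict.get(2, 0)
        return_results[item["key"]] = temp_dict
    print return_results  # {'mid_1': {'mid_1': 12, 'retweeted': 8, 'comment': 4}}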
Example #10
def filter_activity(user_set):
    results = []
    now_date = ts2datetime(time.time())
    # test
    now_date = '2013-09-08'
    ts = datetime2ts(now_date) - 24*3600
    date = ts2datetime(ts)
    #print 'date:', date
    timestamp = datetime2ts(date)
    for user in user_set:
        over_count = 0
        for i in range(0,7):
            ts = timestamp - 3600*24*i
            result = r_cluster.hget('activity_'+str(ts), str(user))
            if result:
                items_dict = json.loads(result)
                for item in items_dict:
                    weibo_count = items_dict[item]
                    if weibo_count > activity_threshold:
                        over_count += 1
        if over_count == 0:
            results.append(user)
        else:
            writer.writerow([user, 'activity'])
            
    print 'after filter activity:', len(results)    
    return results
def key_words_search(pre, during, start_time, keyword, search_type='in'):
    # 'during' (number of days to scan) was originally the parameter 'time',
    # which shadowed the time module and was clobbered by the loops below.
    date = start_time
    index_name = pre + start_time
    while not es.indices.exists(index=index_name):
        new_time = datetime2ts(date) + DAY
        date = ts2datetime(new_time)
        index_name = pre + date
        during -= 1

    uid_set = set()
    for i in range(during):
        print index_name
        query = {"query":{"bool":{"must":[{"prefix":{"text.text":keyword}}],"must_not":[],"should":[]}},"size":MAX_ITEMS,"sort":[],"facets":{},"fields":['uid']}
        try:
            temp = es.search(index=index_name, doc_type='text', body=query)
            result = temp['hits']['hits']
            print "Fetch " + str(len(result))
            for item in result:
                uid_set.add(item['fields']['uid'][0].encode("utf-8"))
        except Exception, e:
            print e
            raise Exception('user_list failed!')
        new_time = datetime2ts(date) + DAY
        date = ts2datetime(new_time)
        index_name = pre + date
def read_flow_text(uid_list):
    '''
        Read users' weibo posts (results carry no sentiment labels):
        input:  uid_list (list of uid strings)
        output: word_dict (per-user word-frequency dict), weibo_list (list of weibo records)
        word_dict example:  {uid1:{'w1':f1,'w2':f2...}...}
        weibo_list example: [[uid1,text1,ts1],[uid2,text2,ts2],...] (each record holds uid, text, timestamp)
    '''
    word_dict = dict()  # per-user word-frequency dict
    weibo_list = []  # list of [uid, text, timestamp] records
    online_pattern_dict = {}  # {uid: {online_pattern: count, ...}, ...}
    now_ts = time.time()
    #run_type
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = datetime2ts(RUN_TEST_TIME)
    
    start_date_ts = now_date_ts - DAY * WEEK
    for i in range(0,WEEK):
        iter_date_ts = start_date_ts + DAY * i
        flow_text_index_date = ts2datetime(iter_date_ts)
        flow_text_index_name = flow_text_index_name_pre + flow_text_index_date
        print flow_text_index_name
        try:
            flow_text_exist = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,\
                    body={'query':{'filtered':{'filter':{'terms':{'uid': uid_list}}}}, 'size': MAX_VALUE}, _source=False,  fields=['text','uid','keywords_dict','timestamp'])['hits']['hits']
        except:
            flow_text_exist = []

        for flow_text_item in flow_text_exist:
            uid = flow_text_item['fields']['uid'][0].encode('utf-8')
            text = flow_text_item['fields']['text'][0].encode('utf-8')
            ts = flow_text_item['fields']['timestamp'][0]
            # json.loads already yields unicode keys; the old dumps/eval round-trip was redundant
            keywords_dict = json.loads(flow_text_item['fields']['keywords_dict'][0])

            if uid in word_dict:
                item_dict = Counter(word_dict[uid])
                keywords_dict = Counter(keywords_dict)
                item_dict = dict(item_dict + keywords_dict)
                word_dict[uid] = item_dict
            else:
                word_dict[uid] = keywords_dict

            weibo_list.append([uid,text,ts])
            #test online pattern
            online_pattern = u'weibo.com'
            if uid not in online_pattern_dict:
                online_pattern_dict[uid] = {}
            online_pattern_dict[uid][online_pattern] = online_pattern_dict[uid].get(online_pattern, 0) + 1
    
    return word_dict, weibo_list, online_pattern_dict, start_date_ts
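# The per-user keyword merge above relies on Counter addition; a tiny standalone
# sketch with hypothetical frequencies:
def _demo_keyword_merge():
    from collections import Counter
    day1 = {u'w1': 3, u'w2': 1}
    day2 = {u'w2': 2, u'w3': 5}
    merged = dict(Counter(day1) + Counter(day2))
    print merged  # {u'w1': 3, u'w2': 3, u'w3': 5}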
def main():
    scan_cursor = 0
    count = 0
    bulk_action = []
    number = r.scard('user_set')
    print number

    if RUN_TYPE:
        ts = time.time() - DAY
        date = ts2datetime(ts)
        start_time = str(ts2datetime(time.time()))
        print "/cron/push_mid2redis.py&start&%s" %start_time
    else:
        date = '2013-09-05'
    index_name = flow_text_index_name_pre+date
    print index_name

    ts = time.time()
    while 1:
        re_scan = r.sscan("user_set", scan_cursor, count=3000)
        scan_cursor = re_scan[0]
        uid_list = re_scan[1]  # the uids returned by this scan step
        if len(uid_list):
            for uid in uid_list:
                detail_dict = r.hgetall(uid)
                for k,v in detail_dict.iteritems():
                    update_dict = dict()
                    if "_origin_weibo_retweeted" in k and v:
                        mid = k.split('_')[0]
                        update_dict["retweeted"] = int(v)
                    elif "_origin_weibo_comment" in k and v:
                        mid = k.split('_')[0]
                        update_dict["comment"] = int(v)
                    elif '_retweeted_weibo_comment' in k and v:
                        mid = k.split('_')[0]
                        update_dict["comment"] = int(v)
                    elif '_retweeted_weibo_retweeted' in k and v:
                        mid = k.split('_')[0]
                        update_dict["retweeted"] = int(v)
                    else:
                        pass
                    if update_dict:
                        action = {"update": {"_id": mid}}
                        xdata = {"doc": update_dict}
                        bulk_action.extend([action, xdata])
                        count += 1
                        if count % 400 == 0:
                            r_flow.lpush('update_mid_list', json.dumps(bulk_action))
                            bulk_action = []
                            tp = time.time()
                            #print "%s cost %s" %(count, tp-ts)
                            ts = tp
        if int(scan_cursor) == 0:
            break

    if bulk_action:
        r_flow.lpush('update_mid_list', json.dumps(bulk_action))

    print count
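# The loop above only produces 'update_mid_list' entries; a hedged sketch of the
# consumer side (the queue name and paired-action format follow the code above,
# while the function itself and its arguments are assumptions):
def _demo_flush_update_queue(r_flow, es, index_name, doc_type):
    data = r_flow.rpop('update_mid_list')
    if not data:
        return 0
    bulk_action = json.loads(data)  # [{"update": {"_id": mid}}, {"doc": {...}}, ...]
    es.bulk(bulk_action, index=index_name, doc_type=doc_type)
    return len(bulk_action) / 2     # number of documents updated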
def aggregation_sentiment_related_weibo(ts, origin_mid_list, time_segment, message_type=1, uid_list=[]):
    if message_type == 1:
        query_all_body = {
            "query": {
                "filtered": {
                    "filter": {
                        "bool": {
                            "must": [
                                {"range": {"timestamp": {"gte": ts - time_segment, "lt": ts}}},
                                {"terms": {"root_mid": origin_mid_list}},
                            ]
                        }
                    }
                }
            },
            "aggs": {"all_sentiments": {"terms": {"field": "sentiment"}}},
        }
    else:
        query_all_body = {
            "query": {
                "filtered": {
                    "filter": {
                        "bool": {
                            "must": [
                                {"range": {"timestamp": {"gte": ts - time_segment, "lt": ts}}},
                                {"terms": {"root_mid": origin_mid_list}},
                                {"terms": {"directed_uid": uid_list}},
                            ]
                        }
                    }
                }
            },
            "aggs": {"all_sentiments": {"terms": {"field": "sentiment"}}},
        }

    results = {"0": 0, "1": 0, "2": 0, "3": 0, "4": 0, "5": 0, "6": 0}
    datetime_1 = ts2datetime(ts)
    datetime_2 = ts2datetime(ts - 24 * 3600)
    index_name_1 = flow_text_index_name_pre + datetime_1
    index_name_2 = flow_text_index_name_pre + datetime_2
    index_list = []
    exist_es_1 = es_text.indices.exists(index_name_1)
    exist_es_2 = es_text.indices.exists(index_name_2)
    if exist_es_1:
        index_list.append(index_name_1)
    if exist_es_2:
        index_list.append(index_name_2)
    if index_list:
        search_results = es_text.search(index=index_list, doc_type=flow_text_index_type, body=query_all_body)[
            "aggregations"
        ]["all_sentiments"]["buckets"]
        if search_results:
            for item in search_results:
                key = item["key"]
                count = item["doc_count"]
                results[key] = count
    print "results: ", results, sum(results.values())
    return results
def main():
    now_ts = time.time()
    delete_ts = datetime2ts(ts2datetime(now_ts-EXPIRE_TIME))  # timestamp to be expired
    delete_date = ts2datetime(now_ts-EXPIRE_TIME)
    del_day = ts2datetime(now_ts-MONTH_TIME)

    index_name = flow_text_index_name_pre + del_day
    exist_es = es_flow_text.indices.exists(index=index_name)
    if exist_es:
        es_flow_text.indices.delete(index=index_name)
    index_bci = "bci_" + del_day.replace('-', '')
    exist_bci = ES_CLUSTER_FLOW1.indices.exists(index=index_bci)
    if exist_bci:
        ES_CLUSTER_FLOW1.indices.delete(index=index_bci)


    #delete @
    redis_cluster.delete("at_"+str(delete_ts))
    redis_cluster.delete("sensitive_at_"+str(delete_ts))

    #delete ip
    redis_ip.delete('ip_'+str(delete_ts))
    if WORK_TYPE == 0:
        exist_ip = es_cluster.indices.exists(index="ip_"+delete_date)
        if exist_ip:
            es_cluster.indices.delete(index="ip_"+delete_date)
    redis_ip.delete('sensitive_ip_'+str(delete_ts))
    if WORK_TYPE == 0:
        exist_ip = es_cluster.indices.exists(index="sensitive_ip_"+delete_date)
        if exist_ip:
            es_cluster.indices.delete(index="sensitive_ip_"+delete_date)

    #delete activity
    redis_activity.delete('activity_'+str(delete_ts))
    if WORK_TYPE == 0:
        exist_activity = es_cluster.indices.exists(index="activity_"+delete_date)
        if exist_activity:
            es_cluster.indices.delete(index="activity_"+delete_date)
    redis_activity.delete('sensitive_activity_'+str(delete_ts))
    if WORK_TYPE == 0:
        exist_activity = es_cluster.indices.exists(index="sensitive_activity_"+delete_date)
        if exist_activity:
            es_cluster.indices.delete(index="sensitive_activity_"+delete_date)

    #delete hashtag
    redis_cluster.delete('hashtag_'+str(delete_ts))
    redis_cluster.delete('sensitive_hashtag_'+str(delete_ts))

    #delete sensitive words
    redis_cluster.delete('sensitive_'+str(delete_ts))

    #delete recommendation
    r.delete('recomment_'+str(delete_date)+"_influence")
    r.delete('recomment_'+str(delete_date)+"_sensitive")
    r.delete("identify_in_sensitive_" + str(delete_date))
    r.delete("identify_in_influence_" + str(delete_date)))
Example #16
def query_hot_weibo(ts, origin_mid_list, time_segment, keywords_list, aggregation_field="root_mid", size=100):
    query_all_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [
                            {"range": {
                                "timestamp":{
                                    "gte": ts - time_segment,
                                    "lt": ts
                                }
                            }}],
                        "should": [
                            {"terms":{
                                "keywords_string": keywords_list
                                }
                            }
                        ]
                    }
                }
            }
        },
        "aggs":{
            "all_count":{
                "terms":{"field": aggregation_field, "size": size}
            }
        }
    }

    datetime = ts2datetime(ts)
    # test
    #datetime = "2013-09-07"
    hot_mid_dict = dict()
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    if origin_mid_list and exist_es:
        query_all_body["query"]["filtered"]["filter"]["bool"]["should"].append({"terms": {"root_mid": origin_mid_list}})
        query_all_body["query"]["filtered"]["filter"]["bool"]["should"].append({"terms": {"mid": origin_mid_list}})
        results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=query_all_body)['aggregations']['all_count']['buckets']
        if results:
            for item in results:
                hot_mid_dict[item['key']] = item['doc_count']

        datetime_1 = ts2datetime(ts-time_segment)
        index_name_1 = flow_text_index_name_pre + datetime_1
        exist_es_1 = es_text.indices.exists(index_name_1)
        # query_all_body already carries the root_mid/mid terms appended above
        if datetime_1 != datetime and exist_es_1:
            results_1 = es_text.search(index=index_name_1, doc_type=flow_text_index_type, body=query_all_body)['aggregations']['all_count']['buckets']
            if results_1:
                for item in results_1:
                    hot_mid_dict[item['key']] = item['doc_count']

    return hot_mid_dict
def update_his_item(history_item , today_bci , today_date):
    last_day = ts2datetime(today_date - DAY)
    warehousing_time = history_item['_source']['warehousing_time'] #get the warehousing time  yyyy-mm-dd
    #get the days of warehousing
    day = int((today_date - datetime2ts(warehousing_time))/DAY + 1)
    item = history_item['_source']
    try:
        item['bci_day_change'] = today_bci - item['bci_' + ts2datetime(today_date - 2 * DAY)]
    except Exception, e:
        print history_item['_id'] + ":" + e.message
Example #18
def sort_task(user, keyword, status, start_time, end_time, submit_time):
    query_body = {
        "query":{
            "filtered":{
                "filter":{
                    "bool":{
                        "must":[
                            {"term":{"submit_user": user}}
                        ]
                    }
                }
            }
        },
        "size": 10000,
        "sort":{"submit_time":{"order":"desc"}}
    }

    query_list = []
    if keyword:
        keyword_list = keyword.split(',')
        query_list.append({"terms":{"keyword_string":keyword_list}})
    if status != 2:
        query_list.append({"term":{"status": status}})
    if start_time and end_time:
        start_ts = datetime2ts(start_time)
        end_ts = datetime2ts(end_time)
        query_list.append({"range":{"start_time":{"gte":start_ts, "lte":end_ts}}})
        query_list.append({"range":{"end_time":{"gte":start_ts, "lte":end_ts}}})
    if submit_time:
        query_list.append({"term":{"submit_time": submit_time}})

    if query_list:
        query_body["query"]["filtered"]["filter"]["bool"]["must"].extend(query_list)

    #print query_body
    search_results = es.search(index=USER_RANK_KEYWORD_TASK_INDEX, doc_type=USER_RANK_KEYWORD_TASK_TYPE, body=query_body)["hits"]["hits"]
    results = []
    if search_results:
        for item in search_results:
            iter_item = item['_source']
            tmp = []
            tmp.append(iter_item['search_type'])
            tmp.append(json.loads(iter_item['keyword']))
            tmp.append(ts2datetime(iter_item['start_time']))
            tmp.append(ts2datetime(iter_item['end_time']))
            tmp.append(iter_item['range'])
            tmp.append(ts2date(iter_item['create_time']))
            tmp.append(iter_item['status'])
            tmp.append(iter_item['sort_norm'])
            tmp.append(iter_item['sort_scope'])
            tmp.append(item['_id']) # task_name
            results.append(tmp)

    return results
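# A hypothetical call: status=2 acts as "any status", and an empty keyword or
# submit_time skips that filter entirely.
def _demo_sort_task_usage():
    rows = sort_task('some_user', 'keyword1,keyword2', 2, '2013-09-01', '2013-09-07', '')
    for row in rows:
        print row[0], row[1], row[-1]  # search_type, keyword list, task_name (_id)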
def query_related_weibo(ts, origin_mid_list, time_segment):
    query_all_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [
                            {"range": {
                                "timestamp":{
                                    "gte": ts - time_segment,
                                    "lt": ts
                                }
                            }},
                            {"terms":{"root_mid":origin_mid_list}}
                        ]
                    }
                }
            }
        },
        "aggs":{
            "all_count":{
                "terms":{"field": "message_type"}
            }
        }
    }

    return_results = {"origin": 0, "retweeted": 0, "comment": 0}
    datetime_1 = ts2datetime(ts)
    datetime_2 = ts2datetime(ts-24*3600)
    index_name_1 = flow_text_index_name_pre + datetime_1
    index_name_2 = flow_text_index_name_pre + datetime_2
    index_list = []
    exist_es_1 = es_text.indices.exists(index_name_1)
    exist_es_2 = es_text.indices.exists(index_name_2)
    if exist_es_1:
        index_list.append(index_name_1)
    if exist_es_2:
        index_list.append(index_name_2)
    if index_list:
        results = es_text.search(index=index_list, doc_type=flow_text_index_type,body=query_all_body)['aggregations']['all_count']['buckets']
        if results:
            for item in results:
                if int(item['key']) == 1:
                    return_results['origin'] = item['doc_count']
                elif int(item['key']) == 3:
                    return_results['retweeted'] = item['doc_count']
                elif int(item['key']) == 2:
                    return_results['comment'] = item['doc_count']
                else:
                    pass

    return_results['total_count'] = sum(return_results.values())
    print "return_results: ", return_results
    return return_results
def main():
    filter_uid = all_delete_uid()
    #record_time = time.strftime("%Y%m%d", time.localtime(time.time()))
    record_time = ts2datetime(time.time()) # 2013-09-08
    former_time = ts2datetime(time.time() - 24*3600) # 2013-09-07
    recommend_list = search_low_number(threshould) # recommended user to delete in portrait database
    print len(recommend_list)
    recommend_list = list(set(recommend_list).difference(filter_uid))
    recommend_redis.hset("recommend_delete_list", record_time, json.dumps(recommend_list))  # today's recommended removal list
    recommend_redis.hdel("recommend_delete_list", former_time)  # drop yesterday's list; only one removal list is kept

    return 1
def get_influence(uid_list):
    result = {}
    now_ts = time.time()
    #run_type
    if RUN_TYPE == 1:
        now_date = ts2datetime(now_ts - DAY)
    else:
        now_date = ts2datetime(datetime2ts(RUN_TEST_TIME) - DAY)

    index_time = 'bci_' + ''.join(now_date.split('-'))
    index_type = 'bci'
    try:
        es_result = es.mget(index=index_time, doc_type=index_type, body={'ids': uid_list})['docs']
    except Exception, e:
        raise e
def getResult(search_id):
    item = es.get(index=WEIBO_RANK_KEYWORD_TASK_INDEX, doc_type=WEIBO_RANK_KEYWORD_TASK_TYPE, id=search_id)
    try:
        result_obj = {}
        result_obj['keyword'] = json.loads(item['_source']['keyword'])
        result_obj['sort_scope'] = item['_source']['sort_scope']
        result_obj['sort_norm'] = item['_source']['sort_norm']
        result_obj['start_time'] = ts2datetime(item['_source']['start_time'])
        result_obj['end_time'] = ts2datetime(item['_source']['end_time'])
        result_obj['result'] = json.loads(item['_source']['result'])
        result_obj['text_results'] = json.loads(item['_source']['text_results'])
        result_obj['number'] = item['_source']['number']
        return result_obj
    except:
        return []
def query_mid_list(ts, social_sensors, time_segment, message_type=1):
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must":[
                            {"range": {
                                "timestamp": {
                                    "gte": ts - time_segment,
                                    "lt": ts
                                }
                            }},
                            {"terms":{"uid": social_sensors}},
                            {"term":{"message_type": message_type}}
                        ]
                    }
                }
            }
        },
        "sort": {"sentiment": {"order": "desc"}},
        "size": 10000
    }

    datetime_1 = ts2datetime(ts)
    datetime_2 = ts2datetime(ts-24*3600)
    index_name_1 = flow_text_index_name_pre + datetime_1
    index_name_2 = flow_text_index_name_pre + datetime_2
    index_list = []
    exist_es_1 = es_text.indices.exists(index_name_1)
    exist_es_2 = es_text.indices.exists(index_name_2)
    if exist_es_1:
        index_list.append(index_name_1)
    if exist_es_2:
        index_list.append(index_name_2)
    if index_list:
        search_results = es_text.search(index=index_list, doc_type=flow_text_index_type, body=query_body)["hits"]["hits"]
    else:
        search_results = []
    origin_mid_list = set()
    if search_results:
        for item in search_results:
            if message_type == 1:
                origin_mid_list.add(item["_id"])
            else:
                origin_mid_list.add(item['_source']['root_mid'])

    return list(origin_mid_list)
def scan_mapper(pre, sen_pre, r):
    if RUN_TYPE:
        ts = datetime2ts(ts2datetime(time.time() - DAY))
    else:
        ts = datetime2ts('2013-09-01')
    ts = str(ts)
    hash_name = pre + ts
    sen_hash_name = sen_pre + ts
    cursor = 0
    count = 0
    tb = time.time()

    while 1:
        re_scan = r.hscan(hash_name, cursor, count=1000)
        cursor = re_scan[0]
        ip_dict = re_scan[1]
        uid_list = ip_dict.keys()
        if uid_list:
            r.lpush('act_uid_list', json.dumps(uid_list))
            count += len(uid_list)
            now = time.time()
            print '%s : %s' % (count, now - tb)
            tb = now
        if cursor == 0:
            print count
            break
def get_influence(uid):
    result = 0
    now_ts = time.time()
    now_date = ts2datetime(now_ts - 3600*24)
    # test
    now_date = '2013-09-07'
    index_time = ''.join(now_date.split('-'))
    index_type = 'bci'
    try:
        result = es.get(index=index_time, id=uid, doc_type=index_type)['_source']['user_index']
        #print 'result_dict:', result
        '''
        query_body = {
        'query':{
            'filtered':{
                'query':{
                    'match_all':{}
                    },
                'filter':{
                    'range':{
                        'user_index':{
                            'gte':result
                            }
                        }
                    }
            }
        }
        }
        rank = es.count(index=index_time, doc_type=index_type, body=query_body)['count']
        #print 'rank:', rank
        '''
    except:
        return 0
    return result
def update_day_sensitive(uid_list):
    results = {}
    for uid in uid_list:
        results[uid] = {"sensitive": 0, "sensitive_string": "", "sensitive_dict": json.dumps({})}
    all_results = {}
    now_ts = time.time()
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = datetime2ts("2013-09-03")
    today_sensitive_dict = {}
    sensitive_results = redis_cluster.hmget("sensitive_" + str(now_date_ts), uid_list)
    count = 0
    for item in sensitive_results:
        uid = uid_list[count]
        count += 1
        if not item:
            continue
        words_dict = json.loads(item)  # hmget returns one JSON string per uid
        sensitive_index = 0
        for word, word_count in words_dict.iteritems():
            tmp_stage = r_sensitive.hget("sensitive_words", word)
            if tmp_stage:
                tmp = json.loads(tmp_stage)
                sensitive_index += sensitive_score_dict[str(tmp[0])] * word_count
        sensitive_words_string = "&".join(words_dict.keys())
        results[uid] = {
            "sensitive": sensitive_index,
            "sensitive_words_string": sensitive_words_string,
            "sensitive_words_dict": json.dumps(words_dict),
        }

    return results
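# A worked sketch of the scoring step with hypothetical stage data standing in
# for r_sensitive ('sensitive_words' maps a word to a JSON list whose first
# element is its stage; the weights here are assumed):
def _demo_sensitive_score():
    sensitive_score_dict = {'1': 1, '2': 2, '3': 3}     # stage -> weight (assumed)
    words_dict = {u'word_a': 2, u'word_b': 1}           # one user's word counts
    stage_lookup = {u'word_a': '["2"]'}                 # hget stand-in
    sensitive_index = 0
    for word, word_count in words_dict.iteritems():
        tmp_stage = stage_lookup.get(word)
        if tmp_stage:
            tmp = json.loads(tmp_stage)
            sensitive_index += sensitive_score_dict[str(tmp[0])] * word_count
    print sensitive_index  # 2 * 2 = 4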
def update_day_hashtag(uid_list):
    results = {}
    all_results = {}
    now_ts = time.time()
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = datetime2ts("2013-09-02")

    for i in range(WEEK, 0, -1):
        ts = now_date_ts - DAY * i
        count = 0
        hashtag_results = redis_cluster.hmget("hashtag_" + str(ts), uid_list)
        for uid in uid_list:
            if uid not in results:
                results[uid] = {}
            hashtag_item = hashtag_results[count]
            if hashtag_item:
                hashtag_dict = json.loads(hashtag_item)
            else:
                hashtag_dict = {}
            for hashtag in hashtag_dict:
                results[uid][hashtag] = results[uid].get(hashtag, 0) + 1
            count += 1

    for uid in uid_list:
        user_hashtag_dict = results[uid]
        hashtag_string = "&".join(user_hashtag_dict.keys())
        all_results[uid] = {"hashtag_string": hashtag_string, "hashtag_dict": json.dumps(user_hashtag_dict)}

    return all_results
def update_day_sensitive(uid_list):
    results = {}
    count = 0
    for uid in uid_list:
        results[uid] = {"sensitive": 0, 'sensitive_string': "", 'sensitive_dict': json.dumps({})}
    all_results = {}
    now_ts = time.time()
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = datetime2ts('2013-09-02')
    today_sensitive_dict = {}
    sensitive_results = redis_cluster.hmget("sensitive_"+str(now_date_ts), uid_list)
    for item in sensitive_results:
        if not item:
            count += 1
            continue
        uid = uid_list[count]
        item = json.loads(item)
        sensitive_index = 0
        # the inner loop must not reuse 'count', which indexes uid_list
        for word, word_count in item.iteritems():
            tmp_stage = r_sensitive.hget("sensitive_words", word)
            if tmp_stage:
                tmp = json.loads(tmp_stage)
                sensitive_index += sensitive_score_dict[str(tmp[0])] * word_count
        sensitive_words_string = "&".join(item.keys())
        results[uid] = {'sensitive': sensitive_index, "sensitive_words_string": sensitive_words_string, "sensitive_words_dict": item}
        count += 1

    return results
def get_importance(uid, domain, topic):
    result = 0
    domain_result = 0
    domain_list = domain.split(' ')
    #print 'domain_list:', domain_list
    for domain in domain_list:
        domain_result += domain_weight_dict.get(domain, 0)
    topic_result = 0
    topic_list = topic.split(' ')
    #print 'topic_list:', topic_list
    for topic in topic_list:
        topic_result += topic_weight_dict.get(topic, 0)
    #get fansnum, origin_weibo_retweeted_total_number, retweeted_weibo_retweeted_total_number
    now_ts = time.time()
    date = ts2datetime(now_ts-3600*24)
    #test 
    date = '2013-09-07'
    index_time = ''.join(date.split('-'))
    index_type = 'bci'
    try:
        es_result = es.get(index=index_time, doc_type=index_type, id=uid)['_source']
        fansnum = es_result['user_fansnum']
        retweetednum = es_result['origin_weibo_retweeted_total_number'] + es_result['retweeted_weibo_retweeted_total_number']
        result = importance_weight_dict['fansnum']*fansnum + importance_weight_dict['retweeted_num']*retweetednum + \
             importance_weight_dict['domain']*domain_result + importance_weight_dict['topic']*topic_result
        #print 'importance result:', result
        return result
    except:
        return 0
def get_important_user(ts, origin_mid_list, time_segment):
    query_all_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [{"range": {"timestamp": {"gte": ts - time_segment, "lt": ts}}}],
                        "should": [{"terms": {"root_mid": origin_mid_list}}, {"terms": {"mid": origin_mid_list}}],
                    }
                }
            }
        },
        "sort": {"user_fansnum": {"order": "desc"}},
        "size": 1000,
    }

    datetime = ts2datetime(ts - time_segment)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    results = []
    if origin_mid_list and exist_es:
        search_results = es_text.search(
            index=index_name, doc_type=flow_text_index_type, body=query_all_body, _source=False
        )["hits"]["hits"]
        if search_results:
            for item in search_results:
                results.append(item["_id"])

    return results
Example #31
            user_list.append(item['_id'])

    return user_list


def main():
    filter_uid = all_delete_uid()
    #record_time = time.strftime("%Y%m%d", time.localtime(time.time()))
    record_time = ts2datetime(time.time()) # 2013-09-08
    former_time = ts2datetime(time.time() - 24*3600) # 2013-09-07
    recommend_list = search_low_number(threshould) # recommended user to delete in portrait database
    print len(recommend_list)
    recommend_list = list(set(recommend_list).difference(filter_uid))
    recommend_redis.hset("recommend_delete_list", record_time, json.dumps(recommend_list))  # today's recommended removal list
    recommend_redis.hdel("recommend_delete_list", former_time)  # drop yesterday's list; only one removal list is kept

    return 1

if __name__ == "__main__":
    current_path = os.getcwd()
    file_path = os.path.join(current_path, 'recommend_to_delete.py')
    now_ts = ts2datetime(time.time())
    print_log = "&".join([file_path, "start", now_ts])
    print print_log  # log start

    main()

    now_ts = ts2datetime(time.time())
    print_log = "&".join([file_path, "end", now_ts])
    print print_log  # log end
Example #32
                    },
                }
            }
        }
    }

    #current_time = time.time()
    #facebook_feedback_friends_index_name = facebook_feedback_friends_index_name_pre + ts2datetime(current_time)

    if not es.indices.exists(index=facebook_feedback_friends_index_name):
        es.indices.create(index=facebook_feedback_friends_index_name,
                          body=index_info,
                          ignore=400)


if __name__ == '__main__':

    current_time = time.time()
    date = ts2datetime(current_time + 24 * 3600)

    facebook_feedback_like_mappings(facebook_feedback_like_index_name_pre +
                                    date)
    facebook_feedback_retweet_mappings(
        facebook_feedback_retweet_index_name_pre + date)
    facebook_feedback_at_mappings(facebook_feedback_at_index_name_pre + date)
    facebook_feedback_comment_mappings(
        facebook_feedback_comment_index_name_pre + date)
    facebook_feedback_private_mappings(
        facebook_feedback_private_index_name_pre + date)
    facebook_feedback_friends_mappings()
def read_flow_text(uid_list):
    '''
        Read users' weibo posts (results carry no sentiment labels):
        input:  uid_list (list of uid strings)
        output: word_dict (per-user word-frequency dict), weibo_list (list of weibo records)
        word_dict example:  {uid1:{'w1':f1,'w2':f2...}...}
        weibo_list example: [[uid1,text1,ts1],[uid2,text2,ts2],...] (each record holds uid, text, timestamp)
    '''
    word_dict = dict()  # per-user word-frequency dict
    weibo_list = []  # list of [uid, text, timestamp] records
    online_pattern_dict = {}  # {uid: {online_pattern: count, ...}, ...}
    now_ts = time.time()
    #run_type
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = datetime2ts(RUN_TEST_TIME)

    start_date_ts = now_date_ts - DAY * WEEK
    for i in range(0, WEEK):
        iter_date_ts = start_date_ts + DAY * i
        flow_text_index_date = ts2datetime(iter_date_ts)
        flow_text_index_name = flow_text_index_name_pre + flow_text_index_date
        print flow_text_index_name
        try:
            flow_text_exist = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,\
                    body={'query':{'filtered':{'filter':{'terms':{'uid': uid_list}}}}, 'size': MAX_VALUE}, _source=False,  fields=['text','uid','keywords_dict','timestamp'])['hits']['hits']
        except:
            flow_text_exist = []

        for flow_text_item in flow_text_exist:
            uid = flow_text_item['fields']['uid'][0].encode('utf-8')
            text = flow_text_item['fields']['text'][0].encode('utf-8')
            ts = flow_text_item['fields']['timestamp'][0]
            # json.loads already yields unicode keys; the old dumps/eval round-trip was redundant
            keywords_dict = json.loads(flow_text_item['fields']['keywords_dict'][0])

            if uid in word_dict:
                item_dict = Counter(word_dict[uid])
                keywords_dict = Counter(keywords_dict)
                item_dict = dict(item_dict + keywords_dict)
                word_dict[uid] = item_dict
            else:
                word_dict[uid] = keywords_dict

            weibo_list.append([uid, text, ts])
            #test online pattern
            online_pattern = u'weibo.com'
            if uid not in online_pattern_dict:
                online_pattern_dict[uid] = {}
            online_pattern_dict[uid][online_pattern] = online_pattern_dict[uid].get(online_pattern, 0) + 1

    return word_dict, weibo_list, online_pattern_dict, start_date_ts
Example #34
def create_event_warning(xnr_user_no,today_datetime,write_mark):
    # get the event names
    hashtag_list = get_hashtag(today_datetime)
    #print 'hashtag_list::',hashtag_list

    flow_text_index_name = get_day_flow_text_index_list(today_datetime)

    # the virtual user's fans list and followers list
    try:
        es_xnr_result=es_xnr.get(index=weibo_xnr_fans_followers_index_name,doc_type=weibo_xnr_fans_followers_index_type,id=xnr_user_no)['_source']
        followers_list=es_xnr_result['followers_list']
        fans_list=es_xnr_result['fans_list']
    except:
        followers_list=[]
        fans_list=[]

    event_warming_list=[]
    event_num=0
    for event_item in hashtag_list:
        event_sensitive_count=0
        event_warming_content=dict()     # event name, main participants, representative weibos, event influence, average event time
        event_warming_content['event_name']=event_item['event_name']
        print 'event_name:',event_item
        event_num=event_num+1
        print 'event_num:::',event_num
        print 'first_time:::',int(time.time())
        event_influence_sum=0
        event_time_sum=0       
        query_body={
            'query':{
                # 'bool':{
                #     'must':[{'wildcard':{'text':'*'+event_item[0]+'*'}},
                #     {'range':{'sensitive':{'gte':1}}}]
                # }
                'filtered':{
                    'filter':{
                        'bool':{
                            'must':[
                                {'term':{'hashtag':event_item['event_name']}},
                                {'range':{'sensitive':{'gte':1}}}
                            ]
                        }
                    }
                }
            },
            'size':MAX_WARMING_SIZE,
            'sort':{'sensitive':{'order':'desc'}}
        }
        #try:         
        event_results=es_flow_text.search(index=flow_text_index_name,doc_type=flow_text_index_type,body=query_body)['hits']['hits']
        if event_results:
            weibo_result=[]
            fans_num_dict=dict()
            followers_num_dict=dict()
            alluser_num_dict=dict()
            print 'sencond_time:::',int(time.time())
            for item in event_results:
                #print 'event_content:',item['_source']['text']          
                
                # tally per-user participation
                if alluser_num_dict.has_key(str(item['_source']['uid'])):
                    followers_mark=set_intersection(item['_source']['uid'],followers_list)
                    if followers_mark > 0:
                        alluser_num_dict[str(item['_source']['uid'])]=alluser_num_dict[str(item['_source']['uid'])]+1*2
                    else:
                        alluser_num_dict[str(item['_source']['uid'])]=alluser_num_dict[str(item['_source']['uid'])]+1
                else:
                    alluser_num_dict[str(item['_source']['uid'])]=1                

                # compute influence
                origin_influence_value=(1+item['_source']['comment']+item['_source']['retweeted'])*(1+item['_source']['sensitive'])
                # fans_value=judge_user_type(item['_source']['uid'],fans_list)
                followers_value=judge_user_type(item['_source']['uid'],followers_list)
                item['_source']['weibo_influence_value']=origin_influence_value*(followers_value)
                
                item['_source']['nick_name']=get_user_nickname(item['_source']['uid'])

                weibo_result.append(item['_source'])

                # accumulate influence and timestamps
                event_influence_sum=event_influence_sum+item['_source']['weibo_influence_value']
                event_time_sum=event_time_sum+item['_source']['timestamp']            
        
            print 'third_time:::',int(time.time())
            # representative weibo info
            weibo_result.sort(key=lambda k:(k.get('weibo_influence_value',0)),reverse=True)
            event_warming_content['main_weibo_info']=json.dumps(weibo_result)

            # event influence and average event time
            number=len(event_results)
            event_warming_content['event_influence']=event_influence_sum/number
            event_warming_content['event_time']=event_time_sum/number

        # except:
        #     event_warming_content['main_weibo_info']=[]
        #     event_warming_content['event_influence']=0
        #     event_warming_content['event_time']=0
        
        # try:
            # rank the users
            alluser_num_dict=sorted(alluser_num_dict.items(),key=lambda d:d[1],reverse=True)
            main_userid_list=[]
            for i in xrange(0,len(alluser_num_dict)):
                main_userid_list.append(alluser_num_dict[i][0])

            # main participating users' info
            main_user_info=[]
            user_es_result=es_user_profile.mget(index=profile_index_name,doc_type=profile_index_type,body={'ids':main_userid_list})['docs']
            for item in user_es_result:

                user_dict=dict()
                if item['found']:
                    user_dict['photo_url']=item['_source']['photo_url']
                    user_dict['uid']=item['_id']
                    user_dict['nick_name']=item['_source']['nick_name']
                    user_dict['favoritesnum']=item['_source']['favoritesnum']
                    user_dict['fansnum']=item['_source']['fansnum']
                else:
                    user_dict['photo_url']=''
                    user_dict['uid']=item['_id']
                    user_dict['nick_name']=''
                    user_dict['favoritesnum']=0
                    user_dict['fansnum']=0
                main_user_info.append(user_dict)
            event_warming_content['main_user_info']=json.dumps(main_user_info)


        # except:
            # event_warming_content['main_user_info']=[]
            print 'fourth_time:::',int(time.time())
            event_warming_content['xnr_user_no']=xnr_user_no
            event_warming_content['validity']=0
            event_warming_content['timestamp']=today_datetime
            now_time=int(time.time())
            task_id=xnr_user_no+'_'+str(now_time) 
        
          
            if write_mark:
                print 'today_datetime:::',ts2datetime(today_datetime)
                mark=write_envent_warming(today_datetime,event_warming_content,task_id)
                event_warming_list.append(mark)
            else:
                event_warming_list.append(event_warming_content)

        else:
            pass
        print 'fifth_time:::',int(time.time())
    return event_warming_list
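# A worked sketch of the per-weibo influence formula above, with hypothetical
# numbers (judge_user_type returning 2 for a followed uid is an assumption):
def _demo_weibo_influence():
    comment, retweeted, sensitive = 4, 10, 2
    origin_influence_value = (1 + comment + retweeted) * (1 + sensitive)  # 45
    followers_value = 2
    print origin_influence_value * followers_value  # 90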
Example #35
reload(sys)
sys.path.append('../../')
from time_utils import ts2datetime, datetime2ts, ts2yeartime
from parameter import MAX_VALUE, DAY, WARMING_DAY
from global_config import S_TYPE, FACEBOOK_FLOW_START_DATE
from elasticsearch import Elasticsearch
from global_utils import es_xnr_2 as es, es_xnr
from global_utils import facebook_user_warning_index_name_pre,facebook_user_warning_index_type,\
      facebook_event_warning_index_name_pre,facebook_event_warning_index_type,\
      facebook_speech_warning_index_name_pre,facebook_speech_warning_index_type,\
      facebook_timing_warning_index_name_pre,facebook_timing_warning_index_type,\
      weibo_date_remind_index_name,weibo_date_remind_index_type,\
      facebook_warning_corpus_index_name,facebook_warning_corpus_index_type

NOW_DATE = ts2datetime(int(time.time()) - 8 * DAY)
print 'NOW_DATE:', NOW_DATE


def facebook_user_warning_mappings(TODAY_DATE):
    index_info = {
        'settings': {
            'number_of_replicas': 0,
            'number_of_shards': 5
        },
        'mappings': {
            facebook_user_warning_index_type: {
                'properties': {
                    'xnr_user_no': {  # virtual user
                        'type': 'string',
                        'index': 'not_analyzed'
Example #36
def create_speech_warning(xnr_user_no, today_datetime):
    # look up the followers list
    lookup_type = 'followers_list'
    followers_list = lookup_xnr_fans_followers(xnr_user_no, lookup_type)

    query_body = {
        'query': {
            'filtered': {
                'filter': {
                    'bool': {
                        'must': {
                            'range': {
                                'sensitive': {
                                    'gte': 1
                                }
                            }
                        }
                    }
                }
            }
        },
        'size': MAX_SEARCH_SIZE,
        'sort': {
            'sensitive': {
                'order': 'desc'
            }
        }
    }
    twitter_flow_text_index_name = get_timets_set_indexset_list(
        twitter_flow_text_index_name_pre, today_datetime, today_datetime)
    #print twitter_flow_text_index_name
    results = es_xnr_2.search(index=twitter_flow_text_index_name,
                              doc_type=twitter_flow_text_index_type,
                              body=query_body)['hits']['hits']
    #print results
    result = []
    for item in results:
        if item['_source']['uid'] in followers_list:
            item['_source']['content_type'] = 'follow'
        else:
            item['_source']['content_type'] = 'unfollow'

        item['_source']['validity'] = 0
        item['_source']['xnr_user_no'] = xnr_user_no

        # look up the three metric fields
        tid_result = lookup_tid_attend_index(item['_source']['tid'],
                                             today_datetime)
        if tid_result:
            item['_source']['comment'] = tid_result['comment']
            item['_source']['share'] = tid_result['share']
            item['_source']['favorite'] = tid_result['favorite']
        else:
            item['_source']['comment'] = 0
            item['_source']['share'] = 0
            item['_source']['favorite'] = 0

        # look up the user's nickname
        item['_source']['nick_name'] = get_user_nickname(
            item['_source']['uid'])

        task_id = xnr_user_no + '_' + item['_source']['tid']

        # write to the database
        today_date = ts2datetime(today_datetime)
        twitter_speech_warning_index_name = twitter_speech_warning_index_name_pre + today_date
        # try:
        es_xnr_2.index(index=twitter_speech_warning_index_name,
                       doc_type=twitter_speech_warning_index_type,
                       body=item['_source'],
                       id=task_id)
        mark = True
        # except:
        #     mark=False

        result.append(mark)
    return result
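
The helper get_timets_set_indexset_list is not shown in these examples; judging from how it is called, it expands a timestamp range into a list of day-suffixed index names. A minimal sketch under that assumption (the real helper may also skip indices that do not exist):

ONE_DAY = 24 * 3600

def get_timets_set_indexset_list(index_name_pre, start_ts, end_ts):
    # expand [start_ts, end_ts] into one day-suffixed index name per day,
    # matching the '<prefix><yyyy-mm-dd>' convention used throughout
    index_list = []
    ts = start_ts
    while ts <= end_ts:
        index_list.append(index_name_pre + ts2datetime(ts))
        ts += ONE_DAY
    return index_list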
Example #37
def trace_xnr_community(community, trace_datetime):  # trace_datetime is a unix timestamp
    # step 1: get the list of communities to trace
    # community_list = get_trace_community(trace_datetime)

    # process each community
    all_influence = get_evaluate_max(weibo_bci_history_index_name,
                                     weibo_bci_history_index_type,
                                     'bci_week_ave')
    all_sensitive = get_evaluate_max(weibo_sensitive_history_index_name,
                                     weibo_sensitive_history_index_type,
                                     'sensitive_week_ave')
    result_mark = []
    # for community in community_list:
    community_detail = dict()
    community_detail['xnr_user_no'] = community['xnr_user_no']
    community_detail['community_id'] = community['community_id']
    community_detail['community_name'] = community['community_name']
    community_detail['create_time'] = community['create_time']
    community_detail['trace_time'] = trace_datetime
    community_detail['trace_date'] = ts2datetime(trace_datetime)
    community_detail['num'] = community['num']
    community_detail['nodes'] = community['nodes']

    # for a community created today, use the metrics computed at creation time
    create_date = ts2datetime(community['create_time'])
    trace_date = ts2datetime(trace_datetime)
    if create_date == trace_date:
        print 'new community!'
        community_detail['density'] = community['density']
        community_detail['cluster'] = community['cluster']
        community_detail['max_influence'] = community['max_influence']
        community_detail['mean_influence'] = community['mean_influence']
        community_detail['max_sensitive'] = community['max_sensitive']
        community_detail['mean_sensitive'] = community['mean_sensitive']

        community_detail['warning_type'] = community['warning_type']

        community_detail['num_warning'] = 0
        community_detail['num_warning_descrp'] = ""
        community_detail['num_warning_content'] = ""

        community_detail['sensitive_warning'] = 0
        community_detail['sensitive_warning_descrp'] = ""
        community_detail['sensitive_warning_content'] = ""

        community_detail['influence_warning'] = 0
        community_detail['influence_warning_descrp'] = ""
        community_detail['influence_warning_content'] = ""

        community_detail['density_warning'] = 0
        community_detail['density_warning_descrp'] = ""
        community_detail['density_warning_content'] = ""

        for item in community['warning_type']:
            if item == '人物突增预警':  # "member surge" warning
                community_detail['num_warning'] = 1
                community_detail['num_warning_descrp'],\
                community_detail['num_warning_content'] = get_person_warning(community['community_id'],community['nodes'],community['xnr_user_no'],trace_datetime)
            elif item == '影响力剧增预警':  # "influence surge" warning
                community_detail['influence_warning'] = 1
                community_detail['influence_warning_descrp'],\
                community_detail['influence_warning_content'] = get_influence_warning(community,trace_datetime)
            elif item == '敏感度剧增预警':  # "sensitivity surge" warning
                community_detail['sensitive_warning'] = 1
                community_detail['sensitive_warning_descrp'],\
                community_detail['sensitive_warning_content'] = get_sensitive_warning(community,trace_datetime)
            elif item == '社区聚集预警':  # "community clustering" warning
                community_detail['density_warning'] = 1
                community_detail['density_warning_descrp'],\
                community_detail['density_warning_content'] = get_density_warning(community,trace_datetime)

    else:
        print 'existing community!'
        #trace_index_result = group_evaluate(community['xnr_user_no'],community['nodes'],all_influence,all_sensitive)
        trace_index_result = group_evaluate_trace(community['xnr_user_no'],
                                                  community['nodes'],
                                                  all_influence,
                                                  all_sensitive,
                                                  trace_datetime,
                                                  G=None)
        community_detail['density'] = trace_index_result['density']
        community_detail['cluster'] = trace_index_result['cluster']
        community_detail['max_influence'] = trace_index_result['max_influence']
        community_detail['mean_influence'] = trace_index_result[
            'mean_influence']
        community_detail['max_sensitive'] = trace_index_result['max_sensitive']
        community_detail['mean_sensitive'] = trace_index_result[
            'mean_sensitive']

        # warning processing
        warning_result = get_warning_reslut(community_detail, trace_datetime)
        community_detail['warning_type'] = warning_result['warning_type']

        community_detail['num_warning'] = warning_result['num_warning']
        community_detail['num_warning_descrp'] = warning_result[
            'num_warning_descrp']
        community_detail['num_warning_content'] = warning_result[
            'num_warning_content']

        community_detail['sensitive_warning'] = warning_result[
            'sensitive_warning']
        community_detail['sensitive_warning_descrp'] = warning_result[
            'sensitive_warning_descrp']
        community_detail['sensitive_warning_content'] = warning_result[
            'sensitive_warning_content']

        community_detail['influence_warning'] = warning_result[
            'influence_warning']
        community_detail['influence_warning_descrp'] = warning_result[
            'influence_warning_descrp']
        community_detail['influence_warning_content'] = warning_result[
            'influence_warning_content']

        community_detail['density_warning'] = warning_result['density_warning']
        community_detail['density_warning_descrp'] = warning_result[
            'density_warning_descrp']
        community_detail['density_warning_content'] = warning_result[
            'density_warning_content']

        community_detail[
            'warning_rank'] = warning_result['num_warning'] + warning_result[
                'sensitive_warning'] + warning_result[
                    'influence_warning'] + warning_result['density_warning']
        # update the displayed ranking
        update_warningrank_mark = update_warning_rank(community_detail,
                                                      trace_datetime)

    # save to the database
    save_community_mark = save_community_detail(community_detail,
                                                community['xnr_user_no'])

    result_mark.append(save_community_mark)

    return result_mark
Example #38
    redis_cluster.delete('sensitive_hashtag_'+str(delete_ts))

    #delete sensitive words
    redis_cluster.delete('sensitive_'+str(delete_ts))

    #delete recommendation
    r.delete('recomment_'+str(delete_date)+"_influence")
    r.delete('recomment_'+str(delete_date)+"_sensitive")
    r.delete("identify_in_sensitive_" + str(delete_date))
    r.delete("identify_in_influence_" + str(delete_date)))

if __name__ == "__main__":
    now_ts = time.time()
    current_path = os.getcwd()
    file_path_redis = os.path.join(current_path, 'delete_redis.py')
    print_log = "&".join([file_path_redis, "start", ts2datetime(now_ts)])
    print print_log

    now_datetime = datetime2ts(ts2datetime(now_ts))
    new_ip_number = r_cluster.hlen('new_ip_'+str(now_datetime))
    new_hashtag_number = r_cluster.hlen('hashtag_'+str(now_datetime))

    #if new_ip_number and new_hashtag_number: # flow2/flow4 have written new data, so data from 8 days back can be cleared
    #    main()

    now_ts = time.time()
    print_log = "&".join([file_path_redis, "end", ts2datetime(now_ts)])
    print print_log
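
main() is not included in this fragment, and the call to it is commented out above. The commented guard suggests old data should only be cleared once flow2/flow4 have written the current day's hashes; a minimal sketch of that check, reusing the r_cluster client and key names from above (the helper name is hypothetical):

def flow_data_ready(now_datetime):
    # only clear 8-day-old data after today's ip/hashtag hashes exist,
    # i.e. flow2/flow4 have already written new data
    new_ip_number = r_cluster.hlen('new_ip_' + str(now_datetime))
    new_hashtag_number = r_cluster.hlen('hashtag_' + str(now_datetime))
    return bool(new_ip_number and new_hashtag_number)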


def get_related_recommendation(task_detail):

    avg_sort_uid_dict = {}

    xnr_user_no = task_detail['xnr_user_no']
    sort_item = task_detail['sort_item']
    es_result = es.get(index=weibo_xnr_index_name,
                       doc_type=weibo_xnr_index_type,
                       id=xnr_user_no)['_source']
    uid = es_result['uid']

    monitor_keywords = es_result['monitor_keywords']

    monitor_keywords_list = monitor_keywords.split(',')

    nest_query_list = []
    #print 'monitor_keywords_list::',monitor_keywords_list
    for monitor_keyword in monitor_keywords_list:
        #print 'monitor_keyword::::',monitor_keyword
        nest_query_list.append(
            {'wildcard': {
                'keywords_string': '*' + monitor_keyword + '*'
            }})

    # else:
    try:
        recommend_list = es.get(index=weibo_xnr_fans_followers_index_name,
                                doc_type=weibo_xnr_fans_followers_index_type,
                                id=xnr_user_no)['_source']['followers_list']
    except:
        recommend_list = []

    recommend_set_list = list(set(recommend_list))

    if S_TYPE == 'test':
        current_date = S_DATE
    else:
        current_date = ts2datetime(int(time.time() - 24 * 3600))

    flow_text_index_name = flow_text_index_name_pre + current_date

    if sort_item != 'friend':

        uid_list = []
        #uid_list = recommend_set_list
        if sort_item == 'influence':
            sort_item = 'user_fansnum'
        query_body_rec = {
            'query': {
                'bool': {
                    'should': nest_query_list
                }
            },
            'aggs': {
                'uid_list': {
                    'terms': {
                        'field': 'uid',
                        'size': TOP_ACTIVE_SOCIAL,
                        'order': {
                            'avg_sort': 'desc'
                        }
                    },
                    'aggs': {
                        'avg_sort': {
                            'avg': {
                                'field': sort_item
                            }
                        }
                    }
                }
            }
        }

        es_rec_result = es_flow_text.search(
            index=flow_text_index_name, doc_type='text',
            body=query_body_rec)['aggregations']['uid_list']['buckets']
        #print 'es_rec_result///',es_rec_result
        for item in es_rec_result:
            uid = item['key']
            uid_list.append(uid)

            avg_sort_uid_dict[uid] = {}

            if sort_item == 'user_fansnum':
                avg_sort_uid_dict[uid]['sort_item_value'] = int(
                    item['avg_sort']['value'])
            else:
                avg_sort_uid_dict[uid]['sort_item_value'] = round(
                    item['avg_sort']['value'], 2)

    else:
        if S_TYPE == 'test':
            uid_list = FRIEND_LIST
            #sort_item = 'sensitive'
        else:
            uid_list = []
            '''
            friends_list_results = es_user_profile.mget(index=profile_index_name,doc_type=profile_index_type,body={'ids':recommend_set_list})['docs']
            for result in friends_list_results:
                friends_list = friends_list + result['friend_list']
            '''
            friends_list = get_friends_list(recommend_set_list)

            friends_set_list = list(set(friends_list))

            #uid_list = friends_set_list

            sort_item_new = 'fansnum'

            query_body_rec = {
                'query': {
                    'bool': {
                        'must': [{
                            'terms': {
                                'uid': friends_set_list
                            }
                        }, {
                            'bool': {
                                'should': nest_query_list
                            }
                        }]
                    }
                },
                'aggs': {
                    'uid_list': {
                        'terms': {
                            'field': 'uid',
                            'size': TOP_ACTIVE_SOCIAL,
                            'order': {
                                'avg_sort': 'desc'
                            }
                        },
                        'aggs': {
                            'avg_sort': {
                                'avg': {
                                    'field': sort_item_new
                                }
                            }
                        }
                    }
                }
            }
            es_friend_result = es_flow_text.search(
                index=flow_text_index_name,
                doc_type='text',
                body=query_body_rec)['aggregations']['uid_list']['buckets']

            for item in es_friend_result:
                uid = item['key']
                uid_list.append(uid)

                avg_sort_uid_dict[uid] = {}

                if not item['avg_sort']['value']:
                    avg_sort_uid_dict[uid]['sort_item_value'] = 0
                else:
                    avg_sort_uid_dict[uid]['sort_item_value'] = int(
                        item['avg_sort']['value'])

    results_all = []

    for uid in uid_list:
        #if sort_item == 'friend':
        query_body = {
            'query': {
                'filtered': {
                    'filter': {
                        'term': {
                            'uid': uid
                        }
                    }
                }
            }
        }

        es_results = es_user_portrait.search(index=portrait_index_name,
                                             doc_type=portrait_index_type,
                                             body=query_body)['hits']['hits']

        if es_results:
            #print 'portrait--',es_results[0]['_source'].keys()
            for item in es_results:
                uid = item['_source']['uid']
                #nick_name,photo_url = uid2nick_name_photo(uid)
                item['_source']['nick_name'] = uid  #nick_name
                item['_source']['photo_url'] = ''  #photo_url
                weibo_type = judge_follow_type(xnr_user_no, uid)
                sensor_mark = judge_sensing_sensor(xnr_user_no, uid)

                item['_source']['weibo_type'] = weibo_type
                item['_source']['sensor_mark'] = sensor_mark
                try:
                    del item['_source']['group']
                    del item['_source']['activity_geo_dict']
                except:
                    pass

                if sort_item == 'friend':
                    if S_TYPE == 'test':
                        item['_source']['fansnum'] = item['_source']['fansnum']
                    else:
                        item['_source']['fansnum'] = avg_sort_uid_dict[uid][
                            'sort_item_value']
                elif sort_item == 'sensitive':
                    item['_source']['sensitive'] = avg_sort_uid_dict[uid][
                        'sort_item_value']
                    item['_source']['fansnum'] = item['_source']['fansnum']
                else:
                    item['_source']['fansnum'] = avg_sort_uid_dict[uid][
                        'sort_item_value']

                if S_TYPE == 'test':
                    current_time = datetime2ts(S_DATE)
                else:
                    current_time = int(time.time())

                index_name = get_flow_text_index_list(current_time)

                query_body = {
                    'query': {
                        'bool': {
                            'must': [{
                                'term': {
                                    'uid': uid
                                }
                            }, {
                                'terms': {
                                    'message_type': [1, 3]
                                }
                            }]
                        }
                    },
                    'sort': {
                        'retweeted': {
                            'order': 'desc'
                        }
                    },
                    'size': 5
                }

                es_weibo_results = es_flow_text.search(
                    index=index_name,
                    doc_type=flow_text_index_type,
                    body=query_body)['hits']['hits']

                weibo_list = []
                for weibo in es_weibo_results:
                    weibo = weibo['_source']
                    weibo_list.append(weibo)
                item['_source']['weibo_list'] = weibo_list
                item['_source']['portrait_status'] = True
                results_all.append(item['_source'])
        else:
            item_else = dict()
            item_else['uid'] = uid
            #nick_name,photo_url = uid2nick_name_photo(uid)
            item_else['nick_name'] = uid  #nick_name
            item_else['photo_url'] = ''  #photo_url
            weibo_type = judge_follow_type(xnr_user_no, uid)
            sensor_mark = judge_sensing_sensor(xnr_user_no, uid)
            item_else['weibo_type'] = weibo_type
            item_else['sensor_mark'] = sensor_mark
            item_else['portrait_status'] = False
            #if sort_item != 'friend':
            #item_else['sort_item_value'] = avg_sort_uid_dict[uid]['sort_item_value']
            # else:
            #     item_else['sort_item_value'] = ''

            if S_TYPE == 'test':
                current_time = datetime2ts(S_DATE)
            else:
                current_time = int(time.time())

            index_name = get_flow_text_index_list(current_time)

            query_body = {
                'query': {
                    'term': {
                        'uid': uid
                    }
                },
                'sort': {
                    'retweeted': {
                        'order': 'desc'
                    }
                }
            }

            es_weibo_results = es_flow_text.search(
                index=index_name,
                doc_type=flow_text_index_type,
                body=query_body)['hits']['hits']

            weibo_list = []
            for weibo in es_weibo_results:
                item_else['fansnum'] = weibo['_source']['user_fansnum']
                weibo = weibo['_source']
                weibo_list.append(weibo)
            item_else['weibo_list'] = weibo_list
            item_else['friendsnum'] = 0
            item_else['statusnum'] = 0
            if sort_item == 'sensitive':
                item_else['sensitive'] = avg_sort_uid_dict[uid][
                    'sort_item_value']
            else:
                item_else['fansnum'] = avg_sort_uid_dict[uid][
                    'sort_item_value']

            results_all.append(item_else)

    return results_all
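
Both branches of get_related_recommendation lean on the same Elasticsearch shape: a terms aggregation on uid whose buckets are ordered by an avg sub-aggregation over the chosen sort field. Isolated as a sketch (client, index, and field names are placeholders):

def top_uids_by_avg(es_client, index_name, sort_field, size=20):
    # bucket documents by uid; order buckets by the average of sort_field
    query_body = {
        'aggs': {
            'uid_list': {
                'terms': {
                    'field': 'uid',
                    'size': size,
                    'order': {'avg_sort': 'desc'}
                },
                'aggs': {
                    'avg_sort': {'avg': {'field': sort_field}}
                }
            }
        }
    }
    buckets = es_client.search(index=index_name, doc_type='text',
                               body=query_body)['aggregations']['uid_list']['buckets']
    return [(bucket['key'], bucket['avg_sort']['value']) for bucket in buckets]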
            if count % 1000 == 0 and count != 0:
                R_TOPIC.hmset(r_topic_name, hmset_dict)
                end_ts = time.time()
                #print '%s sec count 1000' % (end_ts - start_ts)
        except StopIteration:
            if hmset_dict:
                R_TOPIC.hmset(r_topic_name, hmset_dict)
                hmset_dict = {}
            break
        except Exception as e:
            raise e
    if hmset_dict:
        R_TOPIC.hmset(r_topic_name, hmset_dict)
    #print 'all count:', count


if __name__ == '__main__':
    log_time_ts = time.time()
    log_time_date = ts2datetime(log_time_ts)
    print 'cron/flow4/scan_topic2senitment.py&start&' + log_time_date

    del_topic_redis()
    scan_topic2redis()

    log_time_ts = time.time()
    log_time_date = ts2datetime(log_time_ts)
    print 'cron/flow4/scan_topic2senitment.py&end&' + log_time_date
    #topic_string = R_TOPIC.hget(r_topic_name, '2010832710')
    #print 'topic_string:', topic_string, type(topic_string)
def xnr_keywords_compute(xnr_user_no):
    # look up the friends list
    friends_list = lookup_xnr_friends(xnr_user_no)
    lookup_condition_list = []
    print 'xnr_user_no, friends_list:', xnr_user_no, friends_list
    lookup_condition_list.append({
        'filtered': {
            'filter': {
                'bool': {
                    'must': {
                        'terms': {
                            'uid': friends_list
                        }
                    }
                }
            }
        }
    })

    # choose the index to query based on the date
    if S_TYPE == 'test':
        date_time = test_date
    else:
        now_time = int(time.time())
        date_time = ts2datetime(now_time)
    flow_text_index_name = facebook_flow_text_index_name_pre + date_time

    # aggregate by date
    # print lookup_condition_list
    for item_condition in lookup_condition_list:
        query_body = {
            # 'query':item_condition,
            'aggs': {
                'keywords': {
                    'terms': {
                        'field': 'keywords_string',
                        'size': 1000
                    }
                }
            }
        }

        flow_text_exist=es_xnr_2.search(index=flow_text_index_name,doc_type=facebook_flow_text_index_type,\
               body=query_body)['aggregations']['keywords']['buckets']

        # print 'flow_text_exist:',flow_text_exist
        word_dict = dict()

        word_dict_new = dict()

        keywords_string = ''
        for item in flow_text_exist:
            word = item['key']
            count = item['doc_count']
            word_dict[word] = count

            keywords_string += '&'
            keywords_string += item['key']

        k_dict = extract_keywords(keywords_string)

        for item_item in k_dict:
            keyword = item_item.word
            # print 'keyword::',type(keyword)
            word_dict_new[keyword] = word_dict[keyword]

    return word_dict_new
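
extract_keywords is imported from elsewhere; the loop above only requires that it return items exposing a .word attribute whose words come from the input string. A hypothetical stand-in that satisfies that contract (the real helper likely uses a proper keyword extractor):

from collections import namedtuple

Keyword = namedtuple('Keyword', ['word', 'weight'])

def extract_keywords(keywords_string, top_k=50):
    # split the '&'-joined string back into words and rank by frequency
    words = [w for w in keywords_string.split('&') if w]
    freq = {}
    for w in words:
        freq[w] = freq.get(w, 0) + 1
    ranked = sorted(freq.items(), key=lambda x: x[1], reverse=True)[:top_k]
    return [Keyword(word=w, weight=c) for w, c in ranked]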
Example #42
def scan_retweet():
    count = 0
    scan_cursor = 0
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    #get redis db number
    db_number = get_db_num(now_date_ts)
    #retweet/be_retweet es mappings
    '''
    retweet_es_mappings(str(db_number))
    be_retweet_es_mappings(str(db_number))
    '''
    #get redis db
    retweet_redis = retweet_redis_dict[str(db_number)]
    retweet_bulk_action = []
    be_retweet_bulk_action = []
    start_ts = time.time()
    #retweet count/be_retweet count
    retweet_count = 0
    be_retweet_count = 0
    while True:
        re_scan = retweet_redis.scan(scan_cursor, count=100)
        re_scan_cursor = re_scan[0]
        '''
        if re_scan_cursor == 0:
            print 'scan finish'
            if retweet_bulk_action != []:
                es.bulk(retweet_bulk_action, index='retweet_'+str(db_number), doc_type='user')
            if be_retweet_bulk_action != []:
                es.bulk(be_retweet_bulk_action, index='be_retweet_'+str(db_number), doc_type='user')
            break
        '''
        for item in re_scan[1]:
            count += 1
            item_list = item.split('_')
            save_dict = {}
            if len(item_list) == 2:
                retweet_count += 1
                uid = item_list[1]
                item_result = retweet_redis.hgetall(item)
                save_dict['uid'] = uid
                save_dict['uid_retweet'] = json.dumps(item_result)
                retweet_bulk_action.extend([{
                    'index': {
                        '_id': uid
                    }
                }, save_dict])
            elif len(item_list) == 3:
                be_retweet_count += 1
                uid = item_list[2]
                item_result = retweet_redis.hgetall(item)
                save_dict['uid'] = uid
                save_dict['uid_be_retweet'] = json.dumps(item_result)
                be_retweet_bulk_action.extend([{
                    'index': {
                        '_id': uid
                    }
                }, save_dict])
        es.bulk(retweet_bulk_action,
                index='1225_retweet_' + str(db_number),
                doc_type='user')
        es.bulk(be_retweet_bulk_action,
                index='1225_be_retweet_' + str(db_number),
                doc_type='user')
        retweet_bulk_action = []
        be_retweet_bulk_action = []
        end_ts = time.time()
        print '%s sec scan %s count user' % (end_ts - start_ts, count)
        start_ts = end_ts
        scan_cursor = re_scan[0]
        if scan_cursor == 0:
            break
    print 'count:', count
    print 'end'
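
scan_retweet and scan_comment share one cursor pattern: call SCAN repeatedly, process each page of keys, and stop when the cursor returns to 0. The pattern in isolation, assuming a redis-py client whose scan() returns a (cursor, keys) pair:

def scan_all_keys(redis_client, page_size=100):
    # walk the whole keyspace one page at a time
    cursor = 0
    keys = []
    while True:
        cursor, page = redis_client.scan(cursor, count=page_size)
        keys.extend(page)
        if cursor == 0:  # a zero cursor means the scan has wrapped around
            break
    return keys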
Example #43
def scan_comment():
    count = 0
    scan_cursor = 0
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    #get redis db number
    db_number = get_db_num(now_date_ts)
    #comment/be_comment es mappings
    '''
    comment_es_mappings(str(db_number))
    be_comment_es_mappings(str(db_number))
    '''
    #get redis db
    comment_redis = comment_redis_dict[str(db_number)]
    comment_bulk_action = []
    be_comment_bulk_action = []
    start_ts = time.time()
    #comment count/be_comment count
    comment_count = 0
    be_comment_count = 0
    while True:
        re_scan = comment_redis.scan(scan_cursor, count=100)
        re_scan_cursor = re_scan[0]
        for item in re_scan[1]:
            count += 1
            item_list = item.split('_')
            save_dict = {}
            if len(item_list) == 2:
                comment_count += 1
                uid = item_list[1]
                item_result = comment_redis.hgetall(item)
                save_dict['uid'] = uid
                save_dict['uid_comment'] = json.dumps(item_result)
                comment_bulk_action.extend([{
                    'index': {
                        '_id': uid
                    }
                }, save_dict])
            '''
            elif len(item_list)==3:
                be_comment_count += 1
                uid = item_list[2]
                item_result = comment_redis.hgetall(item)
                save_dict['uid'] = uid
                save_dict['uid_be_comment'] = json.dumps(item_result)
                be_comment_bulk_action.extend([{'index':{'_id': uid}}, save_dict])
            '''
        try:
            es.bulk(comment_bulk_action,
                    index='1225_comment_' + str(db_number),
                    doc_type='user')
        except:
            index_name = '1225_comment_' + str(db_number)
            split_bulk_action(comment_bulk_action, index_name)
        '''
        try:
            es.bulk(be_comment_bulk_action, index='1225_be_comment_'+str(db_number), doc_type='user')
        except:
            index_name = '1225_be_comment_'+str(db_number)
            split_bulk_action(be_comment_bulk_action, index_name)
        '''
        comment_bulk_action = []
        #be_comment_bulk_action = []
        end_ts = time.time()
        print '%s sec scan %s count user' % (end_ts - start_ts, count)
        start_ts = end_ts
        scan_cursor = re_scan[0]
        if scan_cursor == 0:
            break
    print 'count:', count
    print 'end'
Example #44
def create_personal_warning(xnr_user_no,today_datetime):
    # look up the followers list
    lookup_type='followers_list'
    followers_list=lookup_xnr_fans_followers(xnr_user_no,lookup_type)

    # look up the virtual user's uid
    xnr_uid=lookup_xnr_uid(xnr_user_no)

    # compute the users ranked highest by sensitivity
    query_body={
        # 'query':{
        #     'filtered':{
        #         'filter':{
        #             'terms':{'uid':followers_list}
        #         }
        #     }
        # },
        'aggs':{
            'followers_sensitive_num':{
                'terms':{'field':'uid'},
                'aggs':{
                    'sensitive_num':{
                        'sum':{'field':'sensitive'}
                    }
                }                        
            }
            },
        'size':MAX_SEARCH_SIZE
    }

    flow_text_index_name=get_day_flow_text_index_list(today_datetime)
    
    try:   
        first_sum_result=es_flow_text.search(index=flow_text_index_name,doc_type=flow_text_index_type,\
        body=query_body)['aggregations']['followers_sensitive_num']['buckets']
    except:
        first_sum_result=[]

    #print first_sum_result
    top_userlist=[]
    for i in xrange(0,len(first_sum_result)):
        user_sensitive=first_sum_result[i]['sensitive_num']['value']
        if user_sensitive > 0:
            user_dict=dict()
            user_dict['uid']=first_sum_result[i]['key']
            followers_mark=judge_user_type(user_dict['uid'],followers_list)
            user_dict['sensitive']=user_sensitive*followers_mark
            top_userlist.append(user_dict)
        else:
            pass

    ####################################
    # a follower gets a boosted sensitivity score
    ####################################
    # query the sensitive weibo content of the sensitive users
    results=[]
    for user in top_userlist:
        #print user
        user_detail=dict()
        user_detail['uid']=user['uid']
        user_detail['user_sensitive']=user['sensitive']
        # user_lookup_id=xnr_uid+'_'+user['uid']
        # print user_lookup_id
        # try:
        #     #user_result=es_xnr.get(index=weibo_feedback_follow_index_name,doc_type=weibo_feedback_follow_index_type,id=user_lookup_id)['_source']
        #     user_result=es_user_profile.get(index=profile_index_name,doc_type=profile_index_type,id=user['uid'])['_source']
        #     user_detail['user_name']=user_result['nick_name']
        # except:
        user_detail['user_name']=get_user_nickname(user['uid'])

        query_body={
            'query':{
                'filtered':{
                    'filter':{
                        'bool':{
                            'must':[
                                {'term':{'uid':user['uid']}},
                                {'range':{'sensitive':{'gte':1}}}
                            ]
                        }
                    }
                }
            },
            'size':MAX_WARMING_SIZE,
            'sort':{'sensitive':{'order':'desc'}}
        }

        try:
            second_result=es_flow_text.search(index=flow_text_index_name,doc_type=flow_text_index_type,body=query_body)['hits']['hits']
        except:
            second_result=[]

        s_result=[]
        #tem_word_one = '静坐'  # 'sit-in'
        #tem_word_two = '集合'  # 'assemble'
        for item in second_result:
            #sensitive_words=item['_source']['sensitive_words_string']
            #if ((sensitive_words==tem_word_one) or (sensitive_words==tem_word_two)):
            #    pass
            #else:
            # look up the user's nickname
            item['_source']['nick_name']=get_user_nickname(item['_source']['uid'])
            s_result.append(item['_source'])

        s_result.sort(key=lambda k:(k.get('sensitive',0)),reverse=True)
        user_detail['content']=json.dumps(s_result)

        user_detail['xnr_user_no']=xnr_user_no
        user_detail['validity']=0
        user_detail['timestamp']=today_datetime

        # write to the database
        today_date=ts2datetime(today_datetime)
        weibo_user_warning_index_name=weibo_user_warning_index_name_pre+today_date

        task_id=xnr_user_no+'_'+user_detail['uid']
        #print weibo_user_warning_index_name
        #print user_detail
        if s_result:
            try:
                es_xnr.index(index=weibo_user_warning_index_name,doc_type=weibo_user_warning_index_type,body=user_detail,id=task_id)
                mark=True
            except:
                mark=False
        else:
            mark = False  # nothing to write for this user

        results.append(mark)

    return results
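
judge_user_type is not shown; going by the banner comment above (and the explicit 1.5 factor in Example #47), it plausibly returns a multiplier that boosts the sensitivity of followed users. A sketch under that assumption:

def judge_user_type(uid, followers_list):
    # followers of the virtual account get a boosted sensitivity score
    if uid in followers_list:
        return 1.5
    return 1.0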
Example #45
def create_date_warning(today_datetime):
    query_body = {
        'query': {
            'match_all': {}
        },
        'size': MAX_VALUE,
        'sort': {
            'date_time': {
                'order': 'asc'
            }
        }
    }
    try:
        result = es_xnr.search(index=weibo_date_remind_index_name,
                               doc_type=weibo_date_remind_index_type,
                               body=query_body)['hits']['hits']
        date_result = []
        for item in result:
            # compute how many days away the remind date is
            date_time = item['_source']['date_time']
            year = ts2yeartime(today_datetime)
            warming_date = year + '-' + date_time
            today_date = ts2datetime(today_datetime)
            countdown_num = (datetime2ts(warming_date) -
                             datetime2ts(today_date)) / DAY

            if abs(countdown_num) < WARMING_DAY:
                # query warning posts using the given keywords
                print 'date_time:', date_time
                keywords = item['_source']['keywords']
                date_warming = lookup_twitter_date_warming(
                    keywords, today_datetime)
                item['_source']['twitter_date_warming_content'] = json.dumps(
                    date_warming)
                item['_source']['validity'] = 0
                item['_source']['timestamp'] = today_datetime

                task_id = str(
                    item['_source']['create_time']) + '_' + str(today_datetime)
                #print 'task_id',task_id
                #print 'date_warming',date_warming
                # write to the database

                twitter_timing_warning_index_name = twitter_timing_warning_index_name_pre + warming_date

                if date_warming:
                    print twitter_timing_warning_index_name
                    try:

                        es_xnr_2.index(
                            index=twitter_timing_warning_index_name,
                            doc_type=twitter_timing_warning_index_type,
                            body=item['_source'],
                            id=task_id)
                        mark = True
                    except:
                        mark = False
                else:
                    mark = False  # no warning content for this date

                date_result.append(mark)

    except:
        date_result = []
    return date_result
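
The countdown above is plain timestamp arithmetic: both dates are normalized to midnight timestamps and the difference is divided by DAY (86400 seconds). A worked example with assumed dates, using the time_utils helpers imported elsewhere in these examples:

DAY = 24 * 3600  # 86400 seconds

warming_date = '2016-11-20'
today_date = '2016-11-15'
countdown_num = (datetime2ts(warming_date) - datetime2ts(today_date)) / DAY
# countdown_num == 5, and abs(countdown_num) < WARMING_DAY decides the warning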
def active_social_recommend_daily(current_date):

    # 1. get all completed virtual users

    all_xnrs = get_all_xnrs()
    print 'all_xnrs', all_xnrs
    # 2. for each virtual user, compute and save three rankings: by fans count, by sensitivity, by friend circle
    for xnr_user_no in all_xnrs:
        for sort_item in ['influence', 'sensitive', 'friend']:
            task_detail = {}
            print 'sort_item..', sort_item
            task_detail['xnr_user_no'] = xnr_user_no
            task_detail['sort_item'] = sort_item

            # compute
            result = get_related_recommendation(task_detail)
            print 'result', len(result)
            # save
            save_results_to_es(xnr_user_no, current_date, sort_item, result)


if __name__ == '__main__':

    current_time = time.time()
    current_date = ts2datetime(current_time)
    start_ts = time.time()
    active_social_recommend_daily(current_date)
    end_ts = time.time()
    print 'cost..', end_ts - start_ts
Example #47
def create_personal_warning(xnr_user_no, today_datetime):
    # look up the followers list
    lookup_type = 'followers_list'
    followers_list = lookup_xnr_fans_followers(xnr_user_no, lookup_type)

    # look up the virtual user's uid
    xnr_uid = lookup_xnr_uid(xnr_user_no)

    # compute the users ranked highest by sensitivity
    query_body = {
        # 'query':{
        #     'filtered':{
        #         'filter':{
        #             'terms':{'uid':followers_list}
        #         }
        #     }
        # },
        'aggs': {
            'friends_sensitive_num': {
                'terms': {
                    'field': 'uid'
                },
                'aggs': {
                    'sensitive_num': {
                        'sum': {
                            'field': 'sensitive'
                        }
                    }
                }
            }
        },
        'size': MAX_SEARCH_SIZE
    }

    twitter_flow_text_index_name = get_timets_set_indexset_list(
        twitter_flow_text_index_name_pre, today_datetime, today_datetime)

    try:
        first_sum_result=es_xnr_2.search(index=twitter_flow_text_index_name,doc_type=twitter_flow_text_index_type,\
        body=query_body)['aggregations']['friends_sensitive_num']['buckets']
    except:
        first_sum_result = []

    #print 'first_sum_result',first_sum_result
    top_userlist = []
    for i in xrange(0, len(first_sum_result)):
        user_sensitive = first_sum_result[i]['sensitive_num']['value']
        if user_sensitive > 0:
            user_dict = dict()
            user_dict['uid'] = first_sum_result[i]['key']
            followers_mark = judge_user_type(user_dict['uid'], followers_list)
            user_dict['sensitive'] = user_sensitive * followers_mark
            top_userlist.append(user_dict)
        else:
            pass
    #####################
    # a follower's computed sensitivity is multiplied by 1.5
    #####################
    # query the sensitive content of the sensitive users
    results = []
    for user in top_userlist:
        #print user
        user_detail = dict()
        user_detail['uid'] = user['uid']
        user_detail['user_sensitive'] = user['sensitive']
        user_lookup_id = user['uid']
        print user_lookup_id
        # look up the user's nickname
        user_detail['user_name'] = get_user_nickname(user['uid'])

        query_body = {
            'query': {
                'filtered': {
                    'filter': {
                        'bool': {
                            'must': [{
                                'term': {
                                    'uid': user['uid']
                                }
                            }, {
                                'range': {
                                    'sensitive': {
                                        'gte': 1
                                    }
                                }
                            }]
                        }
                    }
                }
            },
            'size': MAX_WARMING_SIZE,
            'sort': {
                'sensitive': {
                    'order': 'desc'
                }
            }
        }

        try:
            second_result = es_xnr_2.search(
                index=twitter_flow_text_index_name,
                doc_type=twitter_flow_text_index_type,
                body=query_body)['hits']['hits']
        except:
            second_result = []

        s_result = []
        for item in second_result:
            # look up the three engagement metrics (comment/share/favorite)
            tid_result = lookup_tid_attend_index(item['_source']['tid'],
                                                 today_datetime)
            if tid_result:
                item['_source']['comment'] = tid_result['comment']
                item['_source']['share'] = tid_result['share']
                item['_source']['favorite'] = tid_result['favorite']
            else:
                item['_source']['comment'] = 0
                item['_source']['share'] = 0
                item['_source']['favorite'] = 0

            # look up the user's nickname
            item['_source']['nick_name'] = get_user_nickname(
                item['_source']['uid'])

            s_result.append(item['_source'])

        s_result.sort(key=lambda k: (k.get('sensitive', 0)), reverse=True)
        user_detail['content'] = json.dumps(s_result)

        user_detail['xnr_user_no'] = xnr_user_no
        user_detail['validity'] = 0
        user_detail['timestamp'] = today_datetime

        # write to the database
        today_date = ts2datetime(today_datetime)
        twitter_user_warning_index_name = twitter_user_warning_index_name_pre + today_date

        task_id = xnr_user_no + '_' + user_detail['uid']
        if s_result:
            try:
                es_xnr_2.index(index=twitter_user_warning_index_name,
                               doc_type=twitter_user_warning_index_type,
                               body=user_detail,
                               id=task_id)
                mark = True
            except:
                mark = False
        else:
            mark = False  # nothing to write for this user

        results.append(mark)

    return results
Example #48
def group_evaluate_trace(xnr_user_no,
                         nodes,
                         all_influence,
                         all_sensitive,
                         date_time,
                         G=None):
    result = {}
    result['xnr_user_no'] = xnr_user_no
    result['nodes'] = nodes
    result['num'] = len(nodes)

    # fetch the community retweet network from redis
    count = 0
    scan_cursor = 1
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    #get redis db number
    db_number = get_db_num(now_date_ts)
    print 'db_number:', str(db_number)
    #get redis db
    print 'retweet_dict::', retweet_redis_dict
    retweet_redis = retweet_redis_dict[str(db_number)]
    comment_redis = comment_redis_dict[str(db_number)]

    retweet_result = []
    for uid in nodes:
        item_1 = str('retweet_' + uid)
        # print 'item_lookup::',item_1,type(item_1)
        re_result = retweet_redis.hgetall(item_1)
        if re_result:
            save_dict = dict()
            save_dict['uid'] = uid
            save_dict['uid_retweet'] = re_result
            retweet_result.append(save_dict)
    # print 'test_result::',retweet_result
    # print 'aaa:::', retweet_redis.hgetall('retweet_'+str(nodes[-1]))
    # print 'retweet_redis::',retweet_redis
    # print 'comment_redis::',comment_redis
    ''' 
    re_scan = retweet_redis.scan(scan_cursor,count=10)
    for item in re_scan[1]:
       # item_list = item.split('_')
        print 'item::',item,type(item)
        item_result = retweet_redis.hgetall(item)
        print 'item_result::',item_result
   # print 'hlen::',retweet_redis.hlen()
   # print 'hgetall::',retweet_redis.hgetall()
    retweet_result = retweet_redis.hgetall(nodes)
    comment_result = comment_redis.hgetall(nodes)
    '''
    # print 'retweet_result:::',retweet_result
    #print 'comment_result:::',comment_result

    G_i = nx.Graph()
    for i in retweet_result:
        # print 'i:',i
        # if not i['found']:
        #     continue
        uid_retweet = i['uid_retweet']
        max_count = max([int(n) for n in uid_retweet.values()])
        G_i.add_weighted_edges_from([
            (i['uid'], j, float(uid_retweet[j]) / max_count)
            for j in uid_retweet.keys() if j != i['uid'] and j and i['uid']
        ])
    '''
    for i in comment_result:
        # print 'comment_i:',i
        if not i['found']:
            continue
        uid_comment = json.loads(i['_source']['uid_comment'])
        max_count = max([int(n) for n in uid_comment.values()])
        G_i.add_weighted_edges_from([(i['_source']['uid'],j,float(uid_comment[j])/max_count) for j in uid_comment.keys() if j != i['_source']['uid'] and j and i['_source']['uid']])
    '''

    sub_g = G_i.subgraph(nodes)

    result['density'] = round(nx.density(sub_g), 4)
    #print 'ave_cluster::',nx.average_clustering(sub_g)
    try:
        result['cluster'] = round(nx.average_clustering(sub_g), 4)
    except:
        result['cluster'] = 0
    result['transitivity'] = round(nx.transitivity(sub_g), 4)

    ## replace the metrics with the values computed for the current day
    influence_field = 'user_index'
    sensitive_field = 'sensitive'
    influence_result = get_influence_value(date_time, influence_field, nodes)
    sensitive_result = get_sensitive_value(date_time, sensitive_field, nodes)

    result['max_influence'] = round(
        (max(influence_result) / float(all_influence)) * 100, 4)
    result['mean_influence'] = round(
        ((sum(influence_result) / len(influence_result)) /
         float(all_influence)) * 100, 4)

    max_sensitive = round((max(sensitive_result) / float(all_sensitive)) * 1,
                          4)
    if max_sensitive > 100:
        result['max_sensitive'] = 100.0000
    else:
        result['max_sensitive'] = max_sensitive
    result['mean_sensitive'] = round(
        ((sum(sensitive_result) / len(sensitive_result)) /
         float(all_sensitive)) * 1, 4)

    return result
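
The structural metrics above come straight from networkx on the induced subgraph. A minimal self-contained illustration with a toy weighted retweet network:

import networkx as nx

G = nx.Graph()
# weighted retweet edges: (retweeter, retweeted, normalized count)
G.add_weighted_edges_from([
    ('u1', 'u2', 1.0),
    ('u2', 'u3', 0.5),
    ('u3', 'u1', 0.25),
    ('u3', 'u4', 1.0),
])
sub_g = G.subgraph(['u1', 'u2', 'u3'])  # the community's induced subgraph
print round(nx.density(sub_g), 4)             # 1.0: the triangle is complete
print round(nx.average_clustering(sub_g), 4)  # 1.0
print round(nx.transitivity(sub_g), 4)        # 1.0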
Example #49
    s_re = scan(es_9200,
                query=es_query,
                index=BCIHIS_INDEX_NAME,
                doc_type=BCIHIS_INDEX_TYPE)
    count = 0
    array = []
    while 1:
        try:
            temp = s_re.next()
            one_item = {}
            one_item['id'] = temp['_id'].encode("utf-8")
            one_item['total_num'] = 0
            one_item['today_bci'] = 0
            one_item['update_time'] = TODAY_TIME
            array.append(one_item)
            count += 1
            if count % 1000 == 0:
                r_flow.lpush('update_bci_list', json.dumps(array))
                array = []
                count = 0
        except StopIteration:
            print "all done"
            r_flow.lpush('update_bci_list', json.dumps(array))
            break


if __name__ == '__main__':
    todaydate = ts2datetime(time.time())
    mapper_bci_today(todaydate)
    mapper_bci_history(todaydate)
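
The fragment above drains an elasticsearch-py helpers.scan generator and pushes work items to a Redis list in batches of 1000. The same batching idea as a standalone sketch (clients and the list key are placeholders):

import json
from elasticsearch.helpers import scan

def push_ids_in_batches(es_client, redis_client, index_name, batch_size=1000):
    batch = []
    for doc in scan(es_client, query={'query': {'match_all': {}}}, index=index_name):
        batch.append({'id': doc['_id']})
        if len(batch) >= batch_size:
            redis_client.lpush('update_bci_list', json.dumps(batch))
            batch = []
    if batch:  # flush the tail batch
        redis_client.lpush('update_bci_list', json.dumps(batch))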
Example #50
        word_dict example: {uid1:{'w1':f1,'w2':f2...}...}
        weibo_list example: [[uid1,text1,ts1],[uid2,text2,ts2],...] (each record holds three values: uid, text, timestamp)
    '''
    word_dict = dict()  # word-frequency dict
    weibo_list = []  # weibo list
    now_ts = time.time()
    #run_type
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = datetime2ts(RUN_TEST_TIME)

    start_date_ts = now_date_ts - DAY * WEEK
    for i in range(0,WEEK):
        iter_date_ts = start_date_ts + DAY * i
        flow_text_index_date = ts2datetime(iter_date_ts)
        flow_text_index_name = flow_text_index_name_pre + flow_text_index_date
        print flow_text_index_name
        try:
            flow_text_exist = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,\
                    body={'query':{'filtered':{'filter':{'terms':{'uid': uid_list}}}}, 'size': MAX_VALUE}, _source=False,  fields=['text','uid','keywords_dict','timestamp'])['hits']['hits']
        except:
            flow_text_exist = []

        for flow_text_item in flow_text_exist:
            uid = flow_text_item['fields']['uid'][0].encode('utf-8')
            text = flow_text_item['fields']['text'][0].encode('utf-8')
            ts = flow_text_item['fields']['timestamp'][0]
            keywords_dict = json.loads(flow_text_item['fields']['keywords_dict'][0])
            keywords_dict = json.dumps(keywords_dict, encoding="UTF-8", ensure_ascii=False)
            keywords_dict = eval(keywords_dict)
Example #51
def get_flow_information_v2(uid_list, all_user_keywords_dict):
    results = {}
    #results = {uid:{'hashtag_dict':{},'hashtag':'', 'keywords_dict':{}, 'keywords_string':'', 'activity_geo':'', 'activity_geo_dict':dict, 'activity_geo_aggs':''}}
    iter_results = {
    }  # iter_results = {uid:{'hashtag': hashtag_dict, 'geo':geo_dict, 'keywords':keywords_dict}}
    now_ts = time.time()
    #run_type
    today_sensitive_results = {}
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = test_ts
    print 'run_type:', RUN_TYPE
    for i in range(WEEK, 0, -1):
        ts = now_date_ts - DAY * i
        uid_day_geo = {}
        #compute hashtag and geo
        hashtag_results = r_cluster_3.hmget('hashtag_' + str(ts), uid_list)
        ip_results = r_cluster.hmget('new_ip_' + str(ts), uid_list)
        print 'ip_results:', ip_results
        #compute sensitive_words
        sensitive_results = r_cluster_3.hmget('sensitive_' + str(ts), uid_list)
        count = 0
        for uid in uid_list:
            #init iter_results[uid]
            if uid not in iter_results:
                iter_results[uid] = {
                    'hashtag': {},
                    'geo': {},
                    'geo_track': [],
                    'keywords': {},
                    'sensitive': {},
                    'school': {},
                    'week_ip': {
                        0: {},
                        1: {},
                        2: {},
                        3: {},
                        4: {},
                        5: {}
                    },
                    'ip': {}
                }
            if uid not in today_sensitive_results:
                today_sensitive_results[uid] = {}
            #compute hashtag
            hashtag_item = hashtag_results[count]
            if hashtag_item:
                uid_hashtag_dict = json.loads(hashtag_item)
            else:
                uid_hashtag_dict = {}
            for hashtag in uid_hashtag_dict:
                try:
                    iter_results[uid]['hashtag'][hashtag] += uid_hashtag_dict[
                        hashtag]
                except:
                    iter_results[uid]['hashtag'][hashtag] = uid_hashtag_dict[
                        hashtag]
            #compute sensitive
            sensitive_item = sensitive_results[count]
            if sensitive_item:
                uid_sensitive_dict = json.loads(sensitive_item)
            else:
                uid_sensitive_dict = {}
            for sensitive_word in uid_sensitive_dict:
                try:
                    iter_results[uid]['sensitive'][
                        sensitive_word] += uid_sensitive_dict[sensitive_word]
                except:
                    iter_results[uid]['sensitive'][
                        sensitive_word] = uid_sensitive_dict[sensitive_word]
                if ts == now_date_ts - DAY:
                    try:
                        today_sensitive_results[uid][
                            sensitive_word] += uid_sensitive_dict[
                                sensitive_word]
                    except:
                        today_sensitive_results[uid][
                            sensitive_word] = uid_sensitive_dict[
                                sensitive_word]
            #compute geo
            uid_day_geo[uid] = {}
            ip_item = ip_results[count]
            if ip_item:
                uid_ip_dict = json.loads(ip_item)
            else:
                uid_ip_dict = {}
            for ip in uid_ip_dict:
                ip_count = len(uid_ip_dict[ip].split('&'))
                geo, school = ip2city(ip)
                if geo:
                    try:
                        iter_results[uid]['geo'][geo] += ip_count
                    except:
                        iter_results[uid]['geo'][geo] = ip_count
                    try:
                        uid_day_geo[uid][geo] += ip_count
                    except:
                        uid_day_geo[uid][geo] = ip_count
                if school:
                    try:
                        iter_results[uid]['school'][school] += ip_count
                    except:
                        iter_results[uid]['school'][school] = ip_count
                #deal ip: job_ip&home_ip&active_ip
                ip_time_list = uid_ip_dict[ip].split('&')
                try:
                    iter_results[uid]['ip'][ip] += ip_count
                except:
                    iter_results[uid]['ip'][ip] = ip_count
                for ip_time_item in ip_time_list:
                    ip_timesegment = (int(ip_time_item) - ts) / IP_TIME_SEGMENT
                    try:
                        iter_results[uid]['week_ip'][ip_timesegment][ip] += 1
                    except:
                        iter_results[uid]['week_ip'][ip_timesegment][ip] = 1
                #end deal ip
            iter_results[uid]['geo_track'].append(uid_day_geo[uid])
            count += 1
    #get keywords top
    for uid in uid_list:
        #print 'test iter_results_ip:', iter_results[uid]['week_ip']
        results[uid] = {}
        #hashtag
        hashtag_dict = iter_results[uid]['hashtag']
        results[uid]['hashtag_dict'] = json.dumps(hashtag_dict)
        results[uid]['hashtag'] = '&'.join(hashtag_dict.keys())
        #sensitive words
        sensitive_word_dict = iter_results[uid]['sensitive']
        results[uid]['sensitive_dict'] = json.dumps(sensitive_word_dict)
        results[uid]['sensitive_string'] = '&'.join(sensitive_word_dict.keys())
        sensitive_score = 0
        today_sensitive_results_user = today_sensitive_results[uid]
        for sensitive_item in today_sensitive_results_user:
            k = sensitive_item
            v = today_sensitive_results_user[sensitive_item]
            tmp_stage = r_sensitive.hget('sensitive_words', k)
            if tmp_stage:
                sensitive_score += v * sensitive_score_dict[str(tmp_stage)]
        results[uid]['sensitive'] = sensitive_score
        #geo
        geo_dict = iter_results[uid]['geo']
        geo_track_list = iter_results[uid]['geo_track']
        results[uid]['activity_geo_dict'] = json.dumps(geo_track_list)
        geo_dict_keys = geo_dict.keys()
        results[uid]['activity_geo'] = '&'.join(
            ['&'.join(item.split('\t')) for item in geo_dict_keys])
        try:
            results[uid]['activity_geo_aggs'] = '&'.join(
                [item.split('\t')[-1] for item in geo_dict_keys])
        except:
            results[uid]['activity_geo_aggs'] = ''
        #keywords
        keywords_dict = all_user_keywords_dict[uid]
        keywords_top50 = sorted(keywords_dict.items(),
                                key=lambda x: x[1],
                                reverse=True)[:50]
        keywords_top50_string = '&'.join(
            [keyword_item[0] for keyword_item in keywords_top50])
        results[uid]['keywords'] = json.dumps(keywords_top50)
        results[uid]['keywords_string'] = keywords_top50_string
        #school dict
        school_dict = iter_results[uid]['school']
        school_string = '&'.join(school_dict.keys())
        if school_dict != {}:
            is_school = '1'
        else:
            is_school = '0'
        results[uid]['is_school'] = is_school
        results[uid]['school_string'] = school_string
        results[uid]['school_dict'] = json.dumps(school_dict)
        #ip: job_ip&home_ip&activity_ip
        #activity_ip
        all_ip_dict = iter_results[uid]['ip']
        sort_all_ip = sorted(all_ip_dict.items(),
                             key=lambda x: x[1],
                             reverse=True)
        try:
            activity_ip = sort_all_ip[0][0]
        except:
            activity_ip = ''
        results[uid]['activity_ip'] = str(activity_ip)
        #job_ip & home_ip
        week_time_ip_dict = iter_results[uid]['week_ip']
        for i in range(0, 6):
            try:
                segment_dict = week_time_ip_dict[i]
            except:
                week_time_ip_dict[i] = {}
        home_ip, job_ip = get_ip_description(week_time_ip_dict)
        results[uid]['home_ip'] = str(home_ip)
        results[uid]['job_ip'] = str(job_ip)

    return results
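
week_ip buckets each IP sighting into one of six intra-day segments via (timestamp - day_start) / IP_TIME_SEGMENT, which implies IP_TIME_SEGMENT is 4 hours (6 buckets per day, keys 0-5). get_ip_description is not shown; a hypothetical sketch of the inference it might perform (the night/day split is an assumption):

IP_TIME_SEGMENT = 4 * 3600  # six 4-hour buckets per day, indexed 0..5

def get_ip_description(week_time_ip_dict):
    # assumption: night buckets (0-1, 00:00-08:00) point at the home IP,
    # daytime buckets (2-4, 08:00-20:00) at the job IP
    def top_ip(bucket_ids):
        counts = {}
        for b in bucket_ids:
            for ip, n in week_time_ip_dict.get(b, {}).items():
                counts[ip] = counts.get(ip, 0) + n
        if not counts:
            return ''
        return max(counts.items(), key=lambda x: x[1])[0]

    home_ip = top_ip([0, 1])
    job_ip = top_ip([2, 3, 4])
    return home_ip, job_ip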
Example #52
import sys
import time

reload(sys)
sys.path.append('../../')
from time_utils import ts2datetime,ts2yeartime,datetime2ts
from parameter import WARMING_DAY,MAX_VALUE,DAY
from global_config import S_TYPE,S_DATE_BCI,S_DATE_WARMING
from elasticsearch import Elasticsearch
from global_utils import es_xnr as es
from global_utils import weibo_user_warning_index_name_pre,weibo_user_warning_index_type,\
						weibo_event_warning_index_name_pre,weibo_event_warning_index_type,\
						weibo_speech_warning_index_name_pre,weibo_speech_warning_index_type,\
						weibo_timing_warning_index_name_pre,weibo_timing_warning_index_type,\
						weibo_date_remind_index_name,weibo_date_remind_index_type,\
						weibo_warning_corpus_index_name,weibo_warning_corpus_index_type

NOW_DATE=ts2datetime(int(time.time())-DAY)

def weibo_user_warning_mappings(index_name):
	index_info = {
		'settings':{
			'number_of_replicas':0,
			'number_of_shards':5
		},
		'mappings':{
			weibo_user_warning_index_type:{
				'properties':{
					'xnr_user_no':{  # virtual user (XNR)
						'type':'string',
						'index':'not_analyzed'
					},
					'user_name':{    # warned user's nickname
def organize_feature(mid, topic):
    if RUN_TYPE:
        ts = time.time()
    else:
        ts = datetime2ts("2016-11-17")
    index_list = []
    for i in range(7):
        index_list.append("flow_text_" + ts2datetime(ts - i * 24 * 3600))

    result = dict()
    for iter_index in index_list:
        if not es.indices.exists(index=iter_index):
            continue
        try:
            result = es.get(index=iter_index, doc_type="text",
                            id=mid)["_source"]
            break
        except:
            pass
    if not result:
        return [0, 0, 0, 0, 0, 0, 0]

    ts = result["timestamp"]

    query_body = {"query": {"term": {"root_mid": mid}}}
    #total_weibo
    #count = es.count(index=index_list, doc_type="text", body=query_body)["count"]

    query_body_uid = {
        "query": {
            "term": {
                "root_mid": mid
            }
        },
        "aggs": {
            "uid_count": {
                "cardinality": {
                    "field": "uid"
                }
            }
        }
    }
    # total_uid
    #total_uid_count = es.search(index=index_list, doc_type="text", body=query_body_uid)['aggregations']["uid_count"]["value"]

    feature_list = []
    feature_list.append(math.log(result["user_fansnum"] + 1))
    query_body_ts = {
        "query": {
            "bool": {
                "must": [{
                    "term": {
                        "root_mid": mid
                    }
                }, {
                    "range": {
                        "timestamp": {
                            "lt": ts + 3600 * 10
                        }
                    }
                }]
            }
        },
        "aggs": {
            "weibo_type": {
                "terms": {
                    "field": "message_type"
                }
            }
        }
    }
    comment = 0
    retweet = 0
    tmp_count = es.search(
        index=index_list, doc_type="text",
        body=query_body_ts)['aggregations']["weibo_type"]["buckets"]
    if tmp_count:
        for item in tmp_count:
            if int(item["key"]) == 2:
                comment = item["doc_count"]
            elif int(item["key"]) == 3:
                retweet = item["doc_count"]
    feature_list.append(comment + retweet)
    feature_list.append(retweet)
    feature_list.append(comment)
    feature_list.append(retweet / float(comment + retweet + 1))
    feature_list.append(comment / float(comment + retweet + 1))
    query_body_uid = {
        "query": {
            "bool": {
                "must": [{
                    "term": {
                        "root_mid": mid
                    }
                }, {
                    "range": {
                        "timestamp": {
                            "lt": ts + 3600 * 10
                        }
                    }
                }]
            }
        },
        "aggs": {
            "uid_count": {
                "cardinality": {
                    "field": "uid"
                }
            }
        }
    }
    uid_count = es.search(
        index=index_list, doc_type="text",
        body=query_body_uid)['aggregations']["uid_count"]["value"]
    feature_list.append(uid_count)
    #feature_list.append(topic_field_dict[topic])

    return feature_list
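A minimal usage sketch for organize_feature (not from the source; the mid and topic values are placeholders, and it assumes this module's es client, RUN_TYPE, and the time helpers are already configured):

if __name__ == '__main__':
    # hedged sketch: the mid string and topic below are placeholder values
    features = organize_feature('3456789012345678', u'placeholder_topic')
    # seven numbers: log(fans+1), comment+retweet, retweet, comment,
    # retweet ratio, comment ratio, distinct interacting uids
    print 'features:', features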
示例#54
0
def Get_pm(g, ratio_pm, topic, windowsize, date):
    k = int(len(g.nodes()) * ratio_pm)
    items = db.session.query(BetweenessCentralityUser).filter(BetweenessCentralityUser.topic==topic ,\
                                                              BetweenessCentralityUser.rank<=k ,\
                                                              BetweenessCentralityUser.windowsize==windowsize ,\
                                                              BetweenessCentralityUser.date==date).all()
    positive_m = []
    for item in items:
        positive_m.append(item.userid)
        
    print 'len(positive_m):', len(positive_m)
    print 'positive_m:', positive_m
    return positive_m


if __name__=='__main__':
    g = nx.read_gexf('test_graph.gexf')
    positive_m = []
    source = []
    topic = u'东盟,博览会'
    windowsize = 6
    end_ts = datetime2ts('2013-09-08')
    date = ts2datetime(end_ts)
    ratio_pm = 1 / float(30)
    positive_m = Get_pm(g, ratio_pm, topic, windowsize, date)
    while positive_m:
        #print 'positive_m, source:', len(positive_m), len(source)
        source, positive_m = Get_s(g, positive_m, source)
        print 'positive_m, source:', len(positive_m), len(source)
    save_source(source)
    
示例#55
0
def delete_files():
    localtime = int(time.time()) - 24 * 3600  # delete data from the previous day
    print "time to delete files ..."
    count = 0
    file_list = os.listdir(BIN_FILE_PATH)
    for each in file_list:
        file_name = each.split('.')[0]
        file_timestamp = int(file_name.split('_')[0])
        if file_timestamp < localtime:
            os.remove(os.path.join(BIN_FILE_PATH, each))
            count += 1
    print 'deleted %s files at time %s' % (count, localtime)
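A hedged sketch of the file-name convention delete_files relies on; the sample name below is illustrative, not from the source:

import time
sample_name = '1378569600_flow_data.bin'   # hypothetical name under BIN_FILE_PATH
sample_ts = int(sample_name.split('.')[0].split('_')[0])
print sample_ts < int(time.time()) - 24 * 3600   # True once the file is a day old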


if __name__ == "__main__":

    context = zmq.Context()

    controller = context.socket(zmq.PUB)
    controller.bind("tcp://%s:%s" %
                    (ZMQ_CTRL_HOST_FLOW1, ZMQ_CTRL_VENT_PORT_FLOW5))

    for i in range(20):
        time.sleep(0.1)
        controller.send("PAUSE")
        # send repeatedly so every subscriber is sure to receive it

    ts = ts2datetime(time.time())
    print "stop_zmq&stop-flow2*%s" % ts
    #delete_files()
示例#56
0
def create_user_log():
    now_time=int(time.time())
    today_datetime=datetime2ts(ts2datetime(now_time))
    start_time=today_datetime-DAY    # midnight of the previous day
    end_time=today_datetime          # midnight when the scheduled job fires
    operate_date=ts2datetime(start_time)

    #fetch the account list
    user_name_list=get_user_account_list()

    mark_list=[]
    #fetch the virtual personas managed by each account
    for user_account in user_name_list:
        #log ID for this account
        #print 'user_account',user_account,type(user_account)
        #print 'operate_date',operate_date,type(operate_date)
        user_account=list(user_account)[0]
        log_id=str(user_account)+'_'+operate_date
        print log_id
        log_content_dict=dict()

###########################################################################
#Weibo log section
###########################################################################

        #did the account create any weibo virtual personas?
        xnr_number=create_xnr_number(user_account,start_time,end_time)
        if xnr_number > 0:
            log_content_dict[u'创建微博虚拟人']=xnr_number

        xnr_user_no_list=get_user_xnr_list(user_account)
        xnr_uid_list=get_xnr_uid_list(user_account)

        #walk each module and check whether the managed personas performed operations
        ##################posting operations#################
        #daily posting
        daily_post_type='daily_post'
        daily_post_num=count_type_posting(daily_post_type,operate_date,xnr_user_no_list)
        if daily_post_num > 0:
            log_content_dict[u'微博-日常发帖']=daily_post_num

        #business posting
        business_post_type='business_post'
        business_post_num=count_type_posting(business_post_type,operate_date,xnr_user_no_list)
        if business_post_num > 0:
            log_content_dict[u'微博-业务发帖']=business_post_num

        #hot-topic following
        hot_post_type='hot_post'
        hot_post_num=count_type_posting(hot_post_type,operate_date,xnr_user_no_list)
        if hot_post_num > 0:
            log_content_dict[u'微博-热点跟随']=hot_post_num

        #tracked retweeting
        retweet_timing_num=count_tweet_retweet(start_time,end_time,xnr_user_no_list)
        if retweet_timing_num > 0:
            log_content_dict[u'微博-跟踪转发']=retweet_timing_num

        ##################social operations: retweet, comment, like#################
        #retweets
        retweet_type='3'
        retweet_num=count_retweet_comment_operate(retweet_type,operate_date,xnr_uid_list)
        if retweet_num > 0:
            log_content_dict[u'微博-转发']=retweet_num

        #comments
        comment_type='2'
        comment_num=count_retweet_comment_operate(comment_type,operate_date,xnr_uid_list)
        if comment_num > 0:
            log_content_dict[u'微博-评论']=comment_num

        #likes
        like_num=count_like_operate(start_time,end_time,xnr_uid_list)
        if like_num > 0:
            log_content_dict[u'微博-点赞']=like_num

        #private messages
        private_message_num=count_private_message(start_time,end_time,xnr_uid_list)
        if private_message_num > 0:
            log_content_dict[u'微博-私信']=private_message_num

        ##################corpus additions#################
        add_corpus_num=count_add_corpus(start_time,end_time,xnr_user_no_list)
        if add_corpus_num > 0:
            log_content_dict[u'微博-加入语料']=add_corpus_num

        ##################reports#################
        report_num=count_report_type(start_time,end_time,xnr_user_no_list)
        if report_num > 0:
            log_content_dict[u'微博-上报']=report_num

        ##################warning-library additions#################
        add_warming_num=count_add_warming_speech(start_time,end_time,xnr_user_no_list)
        if add_warming_num > 0:
            log_content_dict[u'微博-加入预警库']=add_warming_num

        ##################scheduled tasks#################
        timing_task_num=count_add_timing_task(start_time,end_time,xnr_user_no_list)
        if timing_task_num > 0:
            log_content_dict[u'微博-创建定时任务']=timing_task_num

###########################################################################
###########################################################################

        ##################domain creation#################
        create_domain_num=count_create_domain(user_account,start_time,end_time)
        if create_domain_num > 0:
            log_content_dict[u'领域创建']=create_domain_num

        ##################business knowledge base#################
        #sensitive-word creation
        create_sensitive_words_num=count_create_business(user_account,start_time,end_time,weibo_sensitive_words_index_name,weibo_sensitive_words_index_type)
        if create_sensitive_words_num > 0:
            log_content_dict[u'创建敏感词']=create_sensitive_words_num

        #time-node creation
        create_date_remind_num=count_create_business(user_account,start_time,end_time,weibo_date_remind_index_name,weibo_date_remind_index_type)
        if create_date_remind_num > 0:
            log_content_dict[u'创建时间节点']=create_date_remind_num

        #metaphorical-expression creation
        create_hidden_expression_num=count_create_business(user_account,start_time,end_time,weibo_hidden_expression_index_name,weibo_hidden_expression_index_type)
        if create_hidden_expression_num > 0:
            log_content_dict[u'创建隐喻式表达']=create_hidden_expression_num


###########################################################################
#QQ log section
###########################################################################
        #did the account create any QQ virtual personas?
        qq_xnr_number=create_qqxnr_number(user_account,start_time,end_time)
        if qq_xnr_number > 0:
            log_content_dict[u'创建QQ虚拟人']=qq_xnr_number

        qq_xnr_user_no_list=get_user_qqxnr_list(user_account)

        #today's post count
        qqxnr_daily_post = count_qqxnr_daily_post(operate_date,qq_xnr_user_no_list)
        if qqxnr_daily_post > 0:
            log_content_dict[u'QQ-发言']=qqxnr_daily_post

        #report count
        qq_report_number=count_qq_report_number(start_time,end_time,qq_xnr_user_no_list)
        if qq_report_number > 0:
            log_content_dict[u'QQ-上报']=qq_report_number

        log_content=json.dumps(log_content_dict)

        #write the log entry
        #update() only succeeds when the log ID already exists
        try:
            es_xnr.update(index=weibo_log_management_index_name,doc_type=weibo_log_management_index_type,id=log_id,body={'doc':{'operate_content':log_content}})
            mark=True
        except Exception:
            mark=False
        mark_list.append(mark)
    return mark_list
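A hedged invocation sketch, assuming create_user_log is run by a daily scheduler shortly after midnight; the crontab line and script path are illustrative, not from the source:

#   5 0 * * * cd /path/to/project && python create_user_log.py
if __name__ == '__main__':
    mark_list = create_user_log()
    # one boolean per account: True if its daily log doc was written to ES
    print 'updated %s of %s account logs' % (sum(mark_list), len(mark_list))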
示例#57
0
def scan_retweet(ft_type):
    count = 0
    scan_cursor = 0
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    #get redis db number
    db_number = get_db_num(now_date_ts)

    if ft_type == 'fb':
        retweet_redis_dict = fb_retweet_dict
        index_name = 'fb_be_retweet_' +str(db_number)
    else:
        retweet_redis_dict = tw_retweet_dict
        index_name = 'tw_be_retweet_' +str(db_number)

    #get redis db
    retweet_redis = retweet_redis_dict[str(db_number)]
    # # 1. check whether the db we are about to switch to has data
    # while 1:
    #     redis_host_list.pop(db_number)
    #     other_db_number = retweet_redis_dict[redis_host_list[0]] # the matching redis
    #     current_dbsize = other_db_number.dbsize()
    #     if current_dbsize:
    #         break # writes have moved to the new db, so yesterday's data is complete
    #     else:
    #         time.sleep(60)

    # 2. reset the previous ES index (recreate its mappings)
    be_retweet_es_mappings(str(db_number),ft_type)
    
    # 3. scan

    retweet_bulk_action = []
    be_retweet_bulk_action = []
    start_ts = time.time()
    #retweet count/be_retweet count
    #retweet_count = 0
    be_retweet_count = 0
    while True:
        re_scan = retweet_redis.scan(scan_cursor, count=100)
        scan_cursor = re_scan[0]  # advance the SCAN cursor for the next pass
        '''
        if re_scan_cursor == 0:
            print 'scan finish'
            if retweet_bulk_action != []:
                es.bulk(retweet_bulk_action, index='retweet_'+str(db_number), doc_type='user')
            if be_retweet_bulk_action != []:
                es.bulk(be_retweet_bulk_action, index='be_retweet_'+str(db_number), doc_type='user')
            break
        '''
        for item in re_scan[1]:
            count += 1
            item_list = item.split('_')
            save_dict = {}
           
            if len(item_list)==3:
                be_retweet_count += 1
                uid = item_list[2]
                item_result = retweet_redis.hgetall(item)
                save_dict['uid'] = uid
                save_dict['uid_be_retweet'] = json.dumps(item_result)
                be_retweet_bulk_action.extend([{'index':{'_id':uid}}, save_dict])
        #print 'be_retweet_bulk_action...',be_retweet_bulk_action
        if be_retweet_bulk_action:
            es.bulk(be_retweet_bulk_action, index=index_name, doc_type='user')
        else:
            break
        retweet_bulk_action = []
        be_retweet_bulk_action = []
        end_ts = time.time()
        print '%s sec scan, user count: %s' % (end_ts - start_ts, count)

    
    # flushdb

    retweet_redis.flushdb()
    
    print 'end'
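A hedged usage sketch; note that scan_retweet flushes the redis db when it finishes, so it is meant for a db whose day has already rolled over:

if __name__ == '__main__':
    scan_retweet('fb')   # 'fb' selects facebook; any other value selects twitter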
示例#58
0
                    	'type':'long'
                    },
                    'socail_keyword':{  #initial community keywords
                    	'type':'string',
                    	'index':'not_analyzed'
                    },
                    'warning_type':{ #warning type: user surge; influence surge; sensitivity surge; community clustering
                          'type':'string',
                          'index':'not_analyzed'
                    }

				}
			}
		}
	}


	weibo_community_index_name = weibo_community_index_name_pre + date_name
	if not es.indices.exists(index=weibo_community_index_name):
		es.indices.create(index=weibo_community_index_name,body=index_info,ignore=400)


if __name__ == '__main__':
    if S_TYPE == 'test':
        date_name = WEIBO_COMMUNITY_DATE
    else:
        now_time = int(time.time())
        date_name = ts2datetime(now_time)

    weibo_community_mappings(date_name)
示例#59
0
            f.write(json.dumps(group_evaluate(xnr_user_no,v,all_influence,all_sensitive,G))+'\n')
    #print 'total time:',time.time()-s
    #print 'eq:',ExtendQ(allG,coms_list)
        mark = True
    except Exception:
        mark = False
    return mark



if __name__ == '__main__':
    start_time = int(time.time())
    
    if S_TYPE == 'test':
        today_time = datetime2ts(WEIBO_COMMUNITY_DATE)
        xnr_user_no_list = ['WXNR0004']
    else:
        today_time = time.time() - 1*DAY
        #today_time = datetime2ts('2018-07-15')
        print ts2datetime(today_time)
        xnr_user_no_list = get_compelete_wbxnr()

    # xnr_user_no_list = ['WXNR0004']
    for xnr_user_no in xnr_user_no_list:
        create_weibo_community(xnr_user_no,today_time)


    end_time = int(time.time())
    print 'cost_time:', end_time - start_time

示例#60
0
def get_hot_sensitive_recommend_at_user(sort_item):

    if S_TYPE == 'test':
        now_ts = datetime2ts(S_DATE_FB)    
    else:
        now_ts = int(time.time())
    date = ts2datetime(now_ts-24*3600)

    #sort_item = 'sensitive'
    sort_item_2 = 'timestamp'
    index_name = facebook_flow_text_index_name_pre + date

    query_body = {
        'query':{
            'match_all':{}
        },
        'sort':{sort_item:{'order':'desc'}},
        'size':HOT_EVENT_TOP_USER,
        '_source':['uid','user_fansnum','retweeted','timestamp']
    }

    # if sort_item == 'retweeted':
    #     sort_item_2 = 'timestamp'
    # else:
    #     sort_item_2 = 'retweeted'

    es_results = es.search(index=index_name,doc_type=facebook_flow_text_index_type,body=query_body)['hits']['hits']
    
    uid_fansnum_dict = dict()
    if es_results:
        for result in es_results:
            result = result['_source']
            uid = result['uid']
            uid_fansnum_dict[uid] = {}
            uid_fansnum_dict[uid][sort_item_2] = result[sort_item_2]

    uid_fansnum_dict_sort_top = sorted(uid_fansnum_dict.items(),key=lambda x:x[1][sort_item_2],reverse=True)

    # keep the sorted order (a set would discard it); dict keys are already unique
    uid_list = [item[0] for item in uid_fansnum_dict_sort_top]


    ## look up nick_name in the facebook user index by uid
    uid_nick_name_dict = dict()  # uid never changes, but nick_name may change
    es_results_user = es.mget(index=facebook_user_index_name,doc_type=facebook_user_index_type,body={'ids':uid_list})['docs']
    i = 0
    for result in es_results_user:
        if result['found']:
            result = result['_source']
            uid = result['uid']
            nick_name = result['name']
            if nick_name:
                i += 1
                uid_nick_name_dict[uid] = nick_name
        if i >= HOT_AT_RECOMMEND_USER_TOP:
            break

    return uid_nick_name_dict
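A hedged usage sketch; 'sensitive' matches the commented-out hint in the function, and 'retweeted' is presumably also valid, but the exact sort fields depend on the index mapping:

if __name__ == '__main__':
    at_users = get_hot_sensitive_recommend_at_user('sensitive')
    for uid, nick_name in at_users.items():
        print uid, nick_name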