def scan_mapper(pre, sen_pre, r):
    if RUN_TYPE:
        ts = datetime2ts(ts2datetime(time.time() - DAY))
    else:
        ts = datetime2ts('2013-09-01')
    ts = str(ts)
    hash_name = pre + ts
    sen_hash_name = sen_pre + ts
    cursor = 0
    count = 0
    tb = time.time()

    while 1:
        re_scan = r.hscan(hash_name, cursor, count=1000)
        cursor = re_scan[0]
        ip_dict = re_scan[1]
        uid_list = ip_dict.keys()
        if uid_list:
            r.lpush('act_uid_list', json.dumps(uid_list))
            count += len(uid_list)
            ts = time.time()
            print '%s : %s' %(count, ts - tb)
            tb = ts
        if cursor == 0:
            print count
            break
def scan_mapper():
    if RUN_TYPE:
        ts = datetime2ts(ts2datetime(time.time() - DAY))
    else:
        ts = datetime2ts('2016-05-14')
    ts = str(ts)
    hash_name = sen_pre_ip + ts
    cursor = 0
    count = 0
    tb = time.time()

    while 1:
        re_scan = redis_ip.hscan(hash_name, cursor, count=1000)
        cursor = re_scan[0]
        ip_dict = re_scan[1]
        uid_list = ip_dict.keys()
        if uid_list:
            redis_ip.lpush('sensitive_ip_uid_list', json.dumps(uid_list))
            count += len(uid_list)
            ts = time.time()
            print '%s : %s' %(count, ts - tb)
            tb = ts
        if cursor == 0:
            print count
            break
def filter_activity(user_set):
    results = []
    now_date = ts2datetime(time.time())
    # test
    now_date = '2013-09-08'
    ts = datetime2ts(now_date) - 24*3600
    date = ts2datetime(ts)
    timestamp = datetime2ts(date)
    for user in user_set:
        over_count = 0
        for i in range(0,7):
            ts = timestamp - 3600*24*i
            result = r_cluster.hget('activity_'+str(ts), str(user))
            if result:
                item_dict = json.loads(result)
                sorted_dict = sorted(item_dict.iteritems(), key=lambda asd:asd[1], reverse=True)
                if sorted_dict[0][1] > activity_threshold:
                    over_count = 1
        if over_count == 0:
            results.append(user)
        else:
            writer.writerow([user, 'activity'])

    print 'after filter activity: ', len(results)
    return results
def weibo_sort_interface(username , time, sort_scope, sort_norm, arg, st, et, task_number, number):
    task_number = int(task_number)
    print "user_interface:", number

    weibo_list = []
    during = (datetime2ts(et) - datetime2ts(st)) / DAY + 1
    time = 1
    if during > 16:
        time = 30
    elif during > 3:
        time = 7

    query_body = {
        "query":{
            "terms":{
                "status": [0, -1]
            }
        }
    }

    if sort_scope == 'all_limit_keyword':
        running_number = es_weibo_portrait.count(index=WEIBO_RANK_KEYWORD_TASK_INDEX, doc_type=WEIBO_RANK_KEYWORD_TASK_TYPE, body=query_body)['count']
        if running_number > task_number - 1:
            return "more than limit"
        search_id = add_task(username, type="keyword", during=during, st=st, et=et, arg=arg, sort_norm=sort_norm, sort_scope=sort_scope, time=time, number=number)
        #deal with the offline task   
        return {"flag": True , "search_id": search_id}

    elif sort_scope == 'all_nolimit':
        pass

    return weibo_list
def main():
    RUN_TYPE = 0
    if RUN_TYPE == 1:
        now_ts = time.time()
    else:
        now_ts = datetime2ts(RUN_TEST_TIME)
        now_ts = datetime2ts('2013-09-02')
    date = ts2datetime(now_ts - DAY)
    # auto recommendation: step 1:4
    #step1: read from top es_daily_rank
    top_user_set, user_dict = search_from_es(date)
    #step2: filter black_uid
    black_user_set = read_black_user()
    subtract_user_set = top_user_set - black_user_set
    #step3: filter users have been in
    subtract_user_set = list(subtract_user_set)
    candidate_results = filter_in(subtract_user_set)
    #step4: filter rules about ip count& reposts/bereposts count&activity count
    results = filter_rules(candidate_results)
    #step5: get sensitive user
    sensitive_user = list(get_sensitive_user(date))
    results = results - set(sensitive_user) # influence user - sensitive user
    new_date = ts2datetime(now_ts)
    hashname_influence = "recomment_" + new_date + "_influence"
    if results:
        for uid in results:
            #print uid
            r.hset(hashname_influence, uid, "0")

    hashname_sensitive = "recomment_" + new_date + "_sensitive"
    if sensitive_user:
        for uid in sensitive_user:
            #print "sensitive"
            r.hset(hashname_sensitive, uid, "0")
    """
def update_day_sensitive(uid_list):
    results = {}
    for uid in uid_list:
        results[uid] = {"sensitive": 0, "sensitive_string": "", "sensitive_dict": json.dumps({})}
    all_results = {}
    now_ts = time.time()
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = datetime2ts("2013-09-03")
    today_sensitive_dict = {}
    sensitive_results = redis_cluster.hmget("sensitive_" + str(now_date_ts), uid_list)
    count = 0
    for item in sensitive_results:
        uid = uid_list[count]
        count += 1
        if not item:
            continue
        words_dict = json.loads(item)
        sensitive_index = 0
        sensitive_words_dict = {}
        if words_dict:
            sensitive_words_dict = words_dict
            for word, word_count in words_dict.iteritems():
                tmp_stage = r_sensitive.hget("sensitive_words", word)
                if tmp_stage:
                    tmp = json.loads(tmp_stage)
                    sensitive_index += sensitive_score_dict[tmp[0]] * word_count
        sensitive_words_string = "&".join(sensitive_words_dict.keys())
        results[uid] = {
            "sensitive": sensitive_index,
            "sensitive_words_string": sensitive_words_string,
            "sensitive_words_dict": sensitive_words_dict,
        }

    return results
def key_words_search( pre , time , start_time , keyword , type  = 'in'  ):
    date = start_time 
    index_name = pre + start_time
    while not es.indices.exists(index=index_name):
        new_time = datetime2ts(date) + DAY
        date = ts2datetime(new_time)
        index_name = pre + date
        time -= 1

    uid_set = set()
    for i in range(time):
        print index_name
        query = {"query":{"bool":{"must":[{"prefix":{"text.text":keyword}}],"must_not":[],"should":[]}},"size":MAX_ITEMS,"sort":[],"facets":{},"fields":['uid']}
        try :
            temp = es.search(index = index_name , doc_type = 'text' , body = query)
            result = temp['hits']['hits']
            print "Fetch " + str(len(result))
            for item in result :
                uid_set.add(item['fields']['uid'][0].encode("utf-8") )
        except Exception,e:
            print e
            raise  Exception('user_list failed!')        
        new_time = datetime2ts(date) + DAY
        date = ts2datetime(new_time)
        index_name = pre + date
        i += 1
def key_words_search( search_type , pre , during , start_time , keyword , search_key = '' , sort_norm = '', sort_scope = ''  ,time = 1 , isall = False):
    keywords = keyword.split(",")
    should = []
    for key in keywords:
        if search_type == "hashtag":
            should.append({"prefix":{"text.text": "#" +  key + "#"}})
        else:    
            should.append({"prefix":{"text.text":key}})    
    date = start_time 
    index_name = pre + start_time
    while not es_9206.indices.exists(index= index_name) :
        new_time = datetime2ts(date) + DAY
        date = ts2datetime(new_time)
        index_name = pre + date
        during -= 1

    
    uid_set = set()
    for i in range(during):
        print index_name
        query = {"query":{"bool":{"must":[],"must_not":[],"should":should}},"size":MAX_ITEMS,"sort":[],"facets":{},"fields":['uid']}
        try :
            temp = es_9206.search(index = index_name , doc_type = 'text' , body = query)
            result = temp['hits']['hits']
            print "Fetch " + str(len(result))
            for item in result :
                uid_set.add(item['fields']['uid'][0].encode("utf-8") )
        except Exception,e:
            print e
            raise  Exception('user_list failed!')        
        new_time = datetime2ts(date) + DAY
        date = ts2datetime(new_time)
        index_name = pre + date
        i += 1
def get_interval_count(topic, start_ts, end_ts):
    results = [0]
    ts_list = []
    #unit = 900
    #during = Day
    during = interval_count_during
    start_ts = datetime2ts(ts2datetime(start_ts))
    ts_list.append(start_ts)
    #end_ts = datetime2ts(ts2datetime(end_ts))
    # deal with the time is not the whole day
    print 'before deal end_ts:', ts2date(end_ts)
    if end_ts - datetime2ts(ts2datetime(end_ts))!= 0:
        end_ts = datetime2ts(ts2datetime(end_ts)) + 3600 * 24
    print 'get_interval_count start_ts:', ts2date(start_ts)
    print 'get_interval_count end_ts:', ts2date(end_ts)

    windowsize = (end_ts - start_ts) / Day
    interval = (end_ts - start_ts) / During
    for i in range(interval, 0, -1):
        begin_ts = end_ts - during * i
        over_ts = begin_ts + during
        ts_list.append(over_ts)

        items = db.session.query(PropagateCountNews).filter(PropagateCountNews.topic==topic ,\
                                                                                              PropagateCountNews.end<=over_ts ,\
                                                                                              PropagateCountNews.end>begin_ts ,\
                                                                                              PropagateCountNews.range==unit).all()
        if items:
            result = Merge_propagate(items)
        else:
            result = 0
        results.append(float(result))

    return ts_list, results
def filter_activity(user_set):
    results = []
    now_date = ts2datetime(time.time())
    # test
    now_date = '2013-09-08'
    ts = datetime2ts(now_date) - 24*3600
    date = ts2datetime(ts)
    #print 'date:', date
    timestamp = datetime2ts(date)
    for user in user_set:
        over_count = 0
        for i in range(0,7):
            ts = timestamp - 3600*24*i
            result = r_cluster.hget('activity_'+str(ts), str(user))
            if result:
                items_dict = json.loads(result)
                for item in items_dict:
                    weibo_count = items_dict[item]
                    if weibo_count > activity_threshold:
                        over_count += 1
        if over_count == 0:
            results.append(user)
        else:
            writer.writerow([user, 'activity'])
            
    print 'after filter activity:', len(results)    
    return results
def filter_activity(user_set):
    results = []
    #run_type
    if RUN_TYPE == 1:
        now_date = ts2datetime(time.time())
    else:
        now_date = RUN_TEST_TIME
    ts = datetime2ts(now_date) - DAY
    date = ts2datetime(ts)
    timestamp = datetime2ts(date)
    for user in user_set:
        over_count = 0
        for i in range(0,7):
            ts = timestamp - DAY*i
            result = redis_activity.hget('activity_'+str(ts), str(user))
            if result:
                items_dict = json.loads(result)
                for item in items_dict:
                    weibo_count = items_dict[item]
                    if weibo_count > activity_threshold:
                        over_count += 1
        if over_count == 0:
            results.append(user)
        else:
            writer.writerow([user, 'activity'])
            
    return results
def update_day_sensitive(uid_list):
    results = {}
    count = 0
    for uid in uid_list:
        results[uid] = {"sensitive": 0, 'sensitive_string': "", 'sensitive_dict': json.dumps({})}
    all_results = {}
    now_ts = time.time()
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = datetime2ts('2013-09-02')
    today_sensitive_dict = {}
    sensitive_results = redis_cluster.hmget("sensitive_"+str(now_date_ts), uid_list)
    for item in sensitive_results:
        if not item:
            count += 1
            continue
        print type(item)
        uid = uid_list[count]
        item = json.loads(item)
        sensitive_index = 0
        sensitive_words_dict = {}
        for word, word_count in item.iteritems():
            tmp_stage = r_sensitive.hget("sensitive_words", word)
            if tmp_stage:
                tmp = json.loads(tmp_stage)
                sensitive_index += sensitive_score_dict[str(tmp[0])] * word_count
        sensitive_words_string = "&".join(item.keys())
        results[uid] = {'sensitive': sensitive_index, "sensitive_words_string": sensitive_words_string, "sensitive_words_dict": item}
        count += 1

    return results
def get_activity_time(uid_list):
    results = {}
    now_ts = time.time()
    now_date = ts2datetime(now_ts)
    #run_type
    if RUN_TYPE == 1:
        timestamp = datetime2ts(now_date)
    else:
        timestamp = datetime2ts("2013-09-08")
    activity_list_dict = {} # {uid:[activity_list], uid:[]}
    for i in range(1,WEEK+1):
        ts = timestamp - DAY*i
        if WORK_TYPE != 0:
            r_result = redis_activity.hmget('activity_'+str(ts), uid_list)
        else:
            r_result = []
            index_name = "activity_" + str(ts2datetime(ts))
            exist_bool = es_cluster.indices.exists(index=index_name)
            if exist_bool:
                es_results = es_cluster.mget(index=index_name, doc_type="activity", body={"ids":uid_list})["docs"]
                for item in es_results:
                    if item['found']:
                        r_result.append(item['_source']['activity_dict'])
                    else:
                        r_result.append(json.dumps({}))
            else:
                r_result = [json.dumps(dict())]*len(uid_list)

        if r_result:
            for j in range(0, len(uid_list)):
                uid = uid_list[j]
                if uid not in activity_list_dict:
                    activity_list_dict[uid] = []
                user_r_result = r_result[j]
                if user_r_result:
                    user_activity_dict = json.loads(user_r_result)
                    for i in range(0, 96):
                        try:
                            count = user_activity_dict[str(i)]
                        except:
                            count = 0
                        activity_list_dict[uid].append(count)
    for uid in uid_list:
        activity_list = activity_list_dict[uid]
        statusnum = sum(activity_list)
        signal = np.array(activity_list)
        fftResult = np.abs(np.fft.fft(signal))**2
        n = signal.size
        freq = np.fft.fftfreq(n, d=1)
        i = 0
        max_val = 0
        max_freq = 0
        for val in fftResult:
            if val>max_val and freq[i]>0:
                max_val = val
                max_freq = freq[i]
            i += 1
        results[uid] = {'statusnum': statusnum, 'activity_time': math.log(max_freq + 1)}
    
    return results
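# The activity_time score above comes from a single trick: take the power
# spectrum (|FFT|^2) of the per-15-minute activity counts and keep the
# strongest strictly positive frequency, so regularly scheduled posting gets a
# higher score. A minimal, standalone sketch of that step with synthetic
# counts (no Redis/ES access; the slot positions below are made up):
import math
import numpy as np

activity_list = [1 if slot % 96 in (36, 37, 80) else 0 for slot in range(7 * 96)]

signal = np.array(activity_list, dtype=float)
fft_result = np.abs(np.fft.fft(signal)) ** 2     # power spectrum
freq = np.fft.fftfreq(signal.size, d=1)          # frequency per 15-minute slot

max_val = 0
max_freq = 0
for val, f in zip(fft_result, freq):
    if val > max_val and f > 0:                  # skip the DC term and the negative half
        max_val = val
        max_freq = f

print 'statusnum:', int(signal.sum())
print 'activity_time score:', math.log(max_freq + 1)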
def update_day_hashtag(uid_list):
    results = {}
    all_results = {}
    now_ts = time.time()
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = datetime2ts("2013-09-02")

    for i in range(WEEK, 0, -1):
        ts = now_date_ts - DAY * i
        count = 0
        hashtag_results = redis_cluster.hmget("hashtag_" + str(ts), uid_list)
        for uid in uid_list:
            if uid not in results:
                results[uid] = {}
            hashtag_item = hashtag_results[count]
            if hashtag_item:
                hashtag_dict = json.loads(hashtag_item)
            else:
                hashtag_dict = {}
            for hashtag in hashtag_dict:
                try:
                    results[uid][hashtag] += 1
                except:
                    results[uid][hashtag] = 1
            count += 1

    for uid in uid_list:
        user_hashtag_dict = results[uid]
        hashtag_string = "&".join(user_hashtag_dict.keys())
        all_results[uid] = {"hashtag_string": hashtag_string, "hashtag_dict": json.dumps(user_hashtag_dict)}

    return all_results
def add_task( user_name ,type = "keyword",range = "all"  ,pre ='flow_text_' , during = '1' , start_time ='2013-09-07' ,end_time ='2013-09-07', keyword = 'hello,world' , sort_norm = 'bci' , sort_scope  = 'in_limit_keyword', time = 7, isall = False, number=100 ):
    time_now = int(TIME.time())
    task_id = user_name + "-" + str(time_now)
    tmp_list = keyword.split(',')
    keyword_list = []
    for item in tmp_list:
        if item:
            keyword_list.append(item)
       
    body_json = {
                'submit_user' : user_name ,
                'keyword' : json.dumps(keyword_list),
                'keyword_string': "&".join(keyword_list),
                'submit_time' : ts2datetime(time_now),
                'create_time': time_now,
                'end_time' : datetime2ts(end_time),
                'search_type' : type,
                'status':0,
                'range' : range , 
                'user_ts' : user_name + '-'+ str(time_now),
                'pre' : pre,
                'during' : during ,
                'start_time' : datetime2ts(start_time) ,
                'sort_norm' : sort_norm ,
                'sort_scope' : sort_scope,
                'time' : time ,
                'isall' : isall,
                'number': number
            }
    es.index(index = USER_RANK_KEYWORD_TASK_INDEX , doc_type=USER_RANK_KEYWORD_TASK_TYPE , id=task_id, body=body_json)
    return body_json["user_ts"]
def read_flow_text(uid_list):
    '''
        Read user weibos (results carry no sentiment label):
        Input: uid_list (list of strings)
        Output: word_dict (per-user word segmentation results), weibo_list (list of user weibos)
        word_dict example: {uid1:{'w1':f1,'w2':f2...}...}
        weibo_list example: [[uid1,text1,ts1],[uid2,text2,ts2],...] (each record holds three values: uid, text, timestamp)
    '''
    word_dict = dict()   # per-user word frequency dict
    weibo_list = []      # list of weibo records
    online_pattern_dict = {} # {uid:[online_pattern1, ..],...}
    now_ts = time.time()
    #run_type
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = datetime2ts(RUN_TEST_TIME)
    
    start_date_ts = now_date_ts - DAY * WEEK
    for i in range(0,WEEK):
        iter_date_ts = start_date_ts + DAY * i
        flow_text_index_date = ts2datetime(iter_date_ts)
        flow_text_index_name = flow_text_index_name_pre + flow_text_index_date
        print flow_text_index_name
        try:
            flow_text_exist = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,\
                    body={'query':{'filtered':{'filter':{'terms':{'uid': uid_list}}}}, 'size': MAX_VALUE}, _source=False,  fields=['text','uid','keywords_dict','timestamp'])['hits']['hits']
        except:
            flow_text_exist = []

        for flow_text_item in flow_text_exist:
            uid = flow_text_item['fields']['uid'][0].encode('utf-8')
            text = flow_text_item['fields']['text'][0].encode('utf-8')
            ts = flow_text_item['fields']['timestamp'][0]
            keywords_dict = json.loads(flow_text_item['fields']['keywords_dict'][0])
            keywords_dict = json.dumps(keywords_dict, encoding="UTF-8", ensure_ascii=False)
            keywords_dict = eval(keywords_dict)

            if word_dict.has_key(uid):
                item_dict = Counter(word_dict[uid])
                keywords_dict = Counter(keywords_dict)
                item_dict = dict(item_dict + keywords_dict)
                word_dict[uid] = item_dict
            else:
                word_dict[uid] = keywords_dict

            weibo_list.append([uid,text,ts])
            #test online pattern
            online_pattern = u'weibo.com'
            try:
                user_online_pattern_dict = online_pattern_dict[uid]
            except:
                online_pattern_dict[uid] = {}
            try:
                online_pattern_dict[uid][online_pattern] += 1
            except:
                online_pattern_dict[uid][online_pattern] = 1
    
    return  word_dict,weibo_list, online_pattern_dict, start_date_ts
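# The docstring of read_flow_text fixes the shapes of its return values, so a
# caller only needs plain dict/list handling. A hedged usage sketch (the uid
# values are made up; the function and its ES handles are defined above):
uid_list_demo = ['1111111111', '2222222222']
word_dict, weibo_list, online_pattern_dict, start_date_ts = read_flow_text(uid_list_demo)

# word_dict: {uid: {'w1': f1, 'w2': f2, ...}, ...} -- take each user's top 5 words
for uid, word_freq in word_dict.iteritems():
    top_words = sorted(word_freq.iteritems(), key=lambda x: x[1], reverse=True)[:5]
    print uid, top_words

# weibo_list: [[uid, text, timestamp], ...]
for uid, text, ts in weibo_list:
    print uid, ts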
def get_db_num():
    date = ts2datetime(time.time())
    date_ts = datetime2ts(date)
    r_begin_ts = datetime2ts(R_BEGIN_TIME)
    db_number = ((date_ts - r_begin_ts) / (DAY * 7 )) % 2 + 1
    #run_type
    if RUN_TYPE == 0:
        db_number = 1
    return db_number
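# get_db_num() simply alternates between db 1 and db 2 on 7-day boundaries
# relative to R_BEGIN_TIME. A quick worked check, assuming R_BEGIN_TIME is
# '2013-09-01' (the real value comes from the project config):
demo_begin_ts = datetime2ts('2013-09-01')
for d in ['2013-09-03', '2013-09-10', '2013-09-17', '2013-09-24']:
    demo_ts = datetime2ts(d)
    demo_db = ((demo_ts - demo_begin_ts) / (DAY * 7)) % 2 + 1
    print d, '->', demo_db   # prints 1, 2, 1, 2: the weeks alternate between the two dbs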
def main():
    #step1: get task from redis queue (rpop)
    #step2: get monitor task time record from redis----data: {'monitor_task_time_record':{task_name, compute_start_ts}}
    #step3: identify the compute_start_ts can be compute
    #setp4: get task user from es---group_result
    #step5: according task user count do differently computing
    #step6: compute task mid-result
    #step7: save the mid-result in mid-result es----timestamp as field
    #step8: identify the track task is doing ,not end/delete  from group_result es status==1 not 0
    #step8: if track_task is doing: update the compute_start_ts
    #step9: if track_task is doing: lpush task name to redis queue (keep the task in queue)
    #step10: if track_task is not doing: delete the compute_start_ts from redis
    while True:
        task_name = get_task_name()
        if task_name:
            start_ts = r_task.hget('monitor_task_time_record', task_name)
            start_ts = int(start_ts)
            #now_ts = time.time()
            #test
            now_ts = datetime2ts('2013-09-08')
            if start_ts == now_ts:
                status = add_task_name(task_name)
                if status == 0:
                    print 'add task to redis fail'
                    break

            if start_ts + 900 <= now_ts:
                task_user  = get_task_user(task_name)
                
                if len(task_user)==1:
                    print 'compute %s start_ts %s' % (task_name, ts2date(start_ts))
                    status = compute_mid_result_one(task_name, task_user, start_ts)
                else:
                    print 'compute %s start_ts %s' % (task_name, ts2date(start_ts))
                    status = compute_mid_result_group(task_name, task_user, start_ts)
                    #compute group polarization----compute once a day
                    if datetime2ts(ts2datetime(start_ts)) == start_ts:
                        print 'start commpute group inner %s' % ts2date(start_ts)
                        group_status = compute_group_inner(task_name, task_user, start_ts)
                        status += group_status

                if status == 0:
                    print 'there is a bug about %s task' % task_name
                else:
                    #update the record time
                    start_ts += 900
                    task_doing_status = identify_task_doing(task_name)
                    print 'task_doing_status:', task_doing_status
                    if task_doing_status == True:
                        r_task.hset('monitor_task_time_record', task_name, start_ts)
                        status = add_task_name(task_name)
                        if status==0:
                            print 'add task name to redis fail'
                    else:
                        r_task.hdel('monitor_task_time_record', task_name)
def sort_task(user, keyword, status, start_time, end_time, submit_time):
    query_body = {
        "query":{
            "filtered":{
                "filter":{
                    "bool":{
                        "must":[
                            {"term":{"submit_user": user}}
                        ]
                    }
                }
            }
        },
        "size": 10000,
        "sort":{"submit_time":{"order":"desc"}}
    }

    query_list = []
    if keyword:
        keyword_list = keyword.split(',')
        query_list.append({"terms":{"keyword_string":keyword_list}})
    if status != 2:
        query_list.append({"term":{"status": status}})
    if start_time and end_time:
        start_ts = datetime2ts(start_time)
        end_ts = datetime2ts(end_time)
        query_list.append({"range":{"start_time":{"gte":start_ts, "lte":end_ts}}})
        query_list.append({"range":{"end_time":{"gte":start_ts, "lte":end_ts}}})
    if submit_time:
        query_list.append({"term":{"submit_time": submit_time}})

    if query_list:
        query_body["query"]["filtered"]["filter"]["bool"]["must"].extend(query_list)

    #print query_body
    search_results = es.search(index=USER_RANK_KEYWORD_TASK_INDEX, doc_type=USER_RANK_KEYWORD_TASK_TYPE, body=query_body)["hits"]["hits"]
    results = []
    if search_results:
        for item in search_results:
            iter_item = item['_source']
            tmp = []
            tmp.append(iter_item['search_type'])
            tmp.append(json.loads(iter_item['keyword']))
            tmp.append(ts2datetime(iter_item['start_time']))
            tmp.append(ts2datetime(iter_item['end_time']))
            tmp.append(iter_item['range'])
            tmp.append(ts2date(iter_item['create_time']))
            tmp.append(iter_item['status'])
            tmp.append(iter_item['sort_norm'])
            tmp.append(iter_item['sort_scope'])
            tmp.append(item['_id']) # task_name
            results.append(tmp)

    return results
def hot_uid_by_word(starttime, endtime, count=50):
    '''Select bloggers who have more than 50 records in the words table
    '''
    startdate = ts2datetime(datetime2ts(starttime))
    enddate =  ts2datetime(datetime2ts(endtime))
    uids = set()
    uids_count = db.session.query(Words.uid, func.count(Words.id)).\
                               filter(Words.postDate>startdate, Words.postDate<enddate).\
                               group_by(Words.uid).\
                               having(func.count(Words.id) > count).all()
    for uid, count in uids_count:
        uids.add(uid)
    return uids
def user_sort_interface(time = 1 , sort_norm = 'imp' , sort_scope = 'in_nolimit' , arg = None , start_time = '2013-09-01' , end_time = '2013-09-07'):
    
    uid_list = []
    return_data = {}
    
    
    try:
        first_stage_time = datetime.datetime.now()
        #find the userid which in the scope
        if sort_scope == 'all_nolimit' :
            uid_list = all_sort_filter(time,sort_norm,sort_scope,arg)
        elif sort_scope == 'all_limit_keyword':
            during = ( datetime2ts(end_time) - datetime2ts(start_time) ) / DAY
            time = 1
            if during > 16:
                time = 30
            elif during > 3:
                time = 7
            uid_list = key_words_search('flow_text_2013_',during,start_time,arg,'all')
            uid_list = sort_norm_filter(uid_list,sort_norm ,time)

        elif sort_scope == "in_limit_keyword":
            during = ( datetime2ts(end_time) - datetime2ts(start_time) ) / DAY
            time = 1
            if during > 16:
                time = 30
            elif during > 3:
                time = 7
            uid_list = key_words_search('flow_text_',during,start_time,arg,'all')   
            uid_list = sort_norm_filter(uid_list,sort_norm ,time)         
        else :
            uid_list = in_sort_filter(time , sort_norm , sort_scope, arg)

        #make up the result with userid list
        user_info_list = make_up_user_info(uid_list)
        second_stage_time = datetime.datetime.now()
        print "info-makeup's mission complete,  Time-consuming: " + str(second_stage_time - first_stage_time)

        #make up the JSON return data
        return_data['flag'] = True 
        return_data['data'] = user_info_list
        third_stage_time = datetime.datetime.now()
        print "JSON-maker's mission complete,  Time-consuming: " + str(third_stage_time - second_stage_time)
        return return_data
    except RuntimeError , e1:
        print "RuntimeError : " + str(e1)
        return_data['flag'] = False
        return_data['error_msg'] = "time out"
        return return_data
def get_group_history(admin_user, now_date):
    results = set()
    now_ts = datetime2ts(now_date)
    start_ts = now_ts - DAY * RECOMMEND_IN_AUTO_DATE
    end_ts = now_ts
    #search group task
    query_body = {
        'query':{
            'bool':{
                'must':[
                    #{'range': {'submit_date':{'gte': start_ts, 'lt': end_ts}}},
                    {'term': {'submit_user': admin_user}},
                    {'term': {'task_type': 'analysis'}}
                    ]
                }
            },
        'size': RECOMMEND_IN_AUTO_GROUP
        }
    try:
        group_results = es_group_result.search(index=group_index_name, doc_type=group_index_type,\
                body=query_body, _source=False, fields=['uid_list'])['hits']['hits']
    except:
        group_results = []
    all_user_list = []
    for group_item in group_results:
        try:
            uid_list = group_item['fields']['uid_list']
        except:
            uid_list = []
        all_user_list.extend(uid_list)
    results = set(all_user_list)
    return results
 def test_hadoop_job_id(self):
     date = '2013-03-01'
     ts = datetime2ts(date)
     window_size = 1
     topic_id = 1
     job_id = generate_job_id(ts, window_size, topic_id)
     self.assertEqual(job_id, '2013_03_01_1_1', 'wrong job id')        
def mapper_bci_today(todaydate=None):
    if todaydate:
        BCI_INDEX_NAME = BCI_INDEX_NAME_PRE + ts2datetime((datetime2ts(todaydate) - DAY)).replace("-","")
        TODAY_TIME = todaydate
    else :
        BCI_INDEX_NAME = BCI_INDEX_NAME_PRE + '20130901'
        TODAY_TIME = '2013-09-02'
    s_re = scan(es_9200, query={"query":{"match_all":{}},"size":MAX_ITEMS ,"fields":[TOTAL_NUM,TODAY_BCI]}, index=BCI_INDEX_NAME, doc_type=BCI_INDEX_TYPE)
    count = 0
    array = []
    while 1:
        try:
            temp = s_re.next()
            one_item = {}
            one_item['id'] = temp['_id'].encode("utf-8")
            one_item['total_num'] = temp['fields'][TOTAL_NUM][0]
            one_item['today_bci'] = temp['fields'][TODAY_BCI][0]
            one_item['update_time'] = TODAY_TIME
            array.append(one_item)
            count += 1
            if count % 1000 == 0:
                r_flow.lpush('update_bci_list', json.dumps(array))
                array = []
                count = 0
        except StopIteration:
            print "all done"
            r_flow.lpush('update_bci_list', json.dumps(array))
            break
def get_attr_geo_track(uid_list):
    date_results = [] # results = {'2013-09-01':[(geo1, count1), (geo2, track2)], '2013-09-02'...} 7day
    now_ts = time.time()
    now_date = ts2datetime(now_ts)
    #test
    now_date = '2013-09-08'
    ts = datetime2ts(now_date)
    for i in range(7, 0, -1):
        timestamp = ts - i*24*3600
        #print 'timestamp:', ts2datetime(timestamp)
        ip_dict = dict()
        results = r_cluster.hmget('ip_'+str(timestamp), uid_list)
        #print 'results:',results
        for item in results:
            if item:
                item_dict = json.loads(item)
                #print 'item_dict:', item_dict
                for ip_item in item_dict:
                    try:
                        ip_dict[ip_item] += item_dict[ip_item]
                    except:
                        ip_dict[ip_item] = item_dict[ip_item]
        geo_dict = ip2geo(ip_dict)
        sort_geo_dict = sorted(geo_dict.items(), key=lambda x:x[1], reverse=True)
        date_key = ts2datetime(timestamp)
        date_results.append([date_key, sort_geo_dict[:2]])
    #print 'results:', date_results
    return {'geo_track': json.dumps(date_results)}
def scan_retweet(tmp_file):
    count = 0
    ret_count = 0
    scan_cursor = 0
    now_ts = time.time()
    now_date_ts = datetime2ts(ts2datetime(now_ts))
    retweet_redis = daily_retweet_redis
    start_ts = time.time()
    while True:
        re_scan = retweet_redis.scan(scan_cursor, count=100)
        re_scan_cursor = re_scan[0]
        for item in re_scan[1]:
            count += 1
            item_list = item.split('_')
            if len(item_list)==2:
                ret_count += 1
                uid = item_list[1]
                item_result = retweet_redis.hgetall(item)
                write_tmp_file(tmp_file, uid, item_result)
        end_ts = time.time()
        #run_type
        # if RUN_TYPE == 0:
        #     print '%s sec scan %s count user' % (end_ts - start_ts, count)
        scan_cursor = re_scan_cursor
        if re_scan_cursor == 0:
            break
    print 'total %s sec scan %s count user and %s retweet count' % (end_ts - now_ts, count, ret_count)
def save_at(uid, at_uid, timestamp, sensitive):
    date = ts2datetime(timestamp)
    ts = datetime2ts(date)
    key = str(uid)
    ruid_count_dict = dict()
    sensitive_ruid_count_dict = dict()
    ruid_count_string = redis_cluster.hget('at_'+str(ts), str(uid))
    if ruid_count_string:
        ruid_count_dict = json.loads(ruid_count_string)
        if ruid_count_dict.has_key(str(at_uid)):
            ruid_count_dict[str(at_uid)] += 1
        else:
            ruid_count_dict[str(at_uid)] = 1
    else:
        ruid_count_dict[str(at_uid)] = 1
    redis_cluster.hset('at_'+str(ts), str(uid), json.dumps(ruid_count_dict))


    if sensitive:
        sensitive_ruid_count_string = redis_cluster.hget('sensitive_at_'+str(ts), str(uid))
        if sensitive_ruid_count_string:
            sensitive_ruid_count_dict = json.loads(sensitive_ruid_count_string)
            if sensitive_ruid_count_dict.has_key(str(at_uid)):
                sensitive_ruid_count_dict[str(at_uid)] += 1
            else:
                sensitive_ruid_count_dict[str(at_uid)] = 1
        else:
            sensitive_ruid_count_dict[str(at_uid)] = 1
        redis_cluster.hset('sensitive_at_'+str(ts), str(uid), json.dumps(sensitive_ruid_count_dict))
def save_city(uid, ip, timestamp, sensitive):
    date = ts2datetime(timestamp)
    ts = datetime2ts(date)
    key = str(uid)
    ip_count_dict = dict()
    sensitive_ip_count_dict = dict()
    ip_count_string = redis_ip.hget('ip_'+str(ts), str(uid))
    if ip_count_string:
        ip_count_dict = json.loads(ip_count_string)
        if ip_count_dict.has_key(str(ip)):
            ip_count_dict[str(ip)] += 1
        else:
            ip_count_dict[str(ip)] = 1
    else:
        ip_count_dict[str(ip)] = 1
    redis_ip.hset('ip_'+str(ts), str(uid), json.dumps(ip_count_dict))

    if sensitive:
        sensitive_ip_count_string = redis_ip.hget('sensitive_ip_'+str(ts), str(uid))
        if sensitive_ip_count_string:
            sensitive_ip_count_dict = json.loads(sensitive_ip_count_string)
            if sensitive_ip_count_dict.has_key(str(ip)):
                sensitive_ip_count_dict[str(ip)] += 1
            else:
                sensitive_ip_count_dict[str(ip)] = 1
        else:
            sensitive_ip_count_dict[str(ip)] = 1
        redis_ip.hset('sensitive_ip_'+str(ts), str(uid), json.dumps(sensitive_ip_count_dict))
def save_activity(uid, timestamp, time_segment, sensitive):
    date = ts2datetime(timestamp)
    ts = datetime2ts(date)
    key = str(ts)
    activity_count_dict = dict()
    sensitive_activity_count_dict = dict()
    activity_count_string = redis_activity.hget('activity_' + key, str(uid))
    if activity_count_string:
        activity_count_dict = json.loads(activity_count_string)
        if activity_count_dict.has_key(str(time_segment)):
            activity_count_dict[str(time_segment)] += 1
        else:
            activity_count_dict[str(time_segment)] = 1
    else:
        activity_count_dict[str(time_segment)] = 1
    redis_activity.hset('activity_' + key, str(uid), json.dumps(activity_count_dict))

    if sensitive:
        sensitive_activity_count_string = redis_activity.hget('sensitive_activity_' + key, str(uid))
        if sensitive_activity_count_string:
            sensitive_activity_count_dict = json.loads(sensitive_activity_count_string)
            if sensitive_activity_count_dict.has_key(str(time_segment)):
                sensitive_activity_count_dict[str(time_segment)] += 1
            else:
                sensitive_activity_count_dict[str(time_segment)] = 1
        else:
            sensitive_activity_count_dict[str(time_segment)] = 1
        redis_activity.hset('sensitive_activity_' + key, str(uid), json.dumps(sensitive_activity_count_dict))
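# save_at, save_city and save_activity above all repeat the same
# read-modify-write pattern on a Redis hash: load the JSON counter dict for
# (day, uid), bump one key, dump it back. A hedged generic helper showing just
# that pattern (redis_conn and the key names here are placeholders, not the
# project's own handles):
def incr_json_counter(redis_conn, hash_prefix, day_ts, uid, counter_key):
    hash_name = hash_prefix + str(day_ts)        # e.g. 'ip_' + str(ts)
    raw = redis_conn.hget(hash_name, str(uid))
    count_dict = json.loads(raw) if raw else {}
    count_dict[str(counter_key)] = count_dict.get(str(counter_key), 0) + 1
    redis_conn.hset(hash_name, str(uid), json.dumps(count_dict))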
def get_sensing_history(admin_user, now_date):
    results = set()
    now_ts = datetime2ts(now_date)
    start_ts = now_ts - DAY * RECOMMEND_IN_AUTO_DATE
    end_ts = now_ts
    #search social sensing task
    query_body = {
        'query':{
            'bool':{
                'must':[
                    #{'range': {'create_at': {'gte': start_ts, 'lt': end_ts}}},
                    {'term': {'create_by': admin_user}}
                    ]
                }
            },
        'size': RECOMMEND_IN_AUTO_GROUP
        }
    try:
        sensing_result = es_social_sensing.search(index=sensing_index_name, doc_type=sensing_doc_type,\
                body=query_body, _source=False, fields=['social_sensors'])['hits']['hits']
    except:
        sensing_result = []
    sensing_user_list = []
    for task_item in sensing_result:
        user_list = json.loads(task_item['fields']['social_sensors'][0])
        sensing_user_list.extend(user_list)
    results = set(sensing_user_list)
    return results
def uid_lists2fb_from_flow_text(monitor_keywords_list, uid_list):

    nest_query_list = []
    for monitor_keyword in monitor_keywords_list:
        nest_query_list.append(
            {'wildcard': {
                'keywords_string': '*' + monitor_keyword + '*'
            }})

    query_body = {
        'query': {
            'bool': {
                'should': nest_query_list,
                'must': [{
                    'terms': {
                        'uid': uid_list
                    }
                }]
            }
        },
        'size': TOP_WEIBOS_LIMIT,
        'sort': {
            'timestamp': {
                'order': 'desc'
            }
        }
    }

    if S_TYPE == 'test':
        now_ts = datetime2ts(S_DATE_FB)
    else:
        now_ts = int(time.time())
    datetime = ts2datetime(now_ts - 24 * 3600)

    index_name_flow = facebook_flow_text_index_name_pre + datetime

    es_results = es.search(index=index_name_flow,
                           doc_type=facebook_flow_text_index_type,
                           body=query_body)['hits']['hits']

    results_all = []
    for result in es_results:
        result = result['_source']
        uid = result['uid']
        nick_name, photo_url = fb_uid2nick_name_photo(uid)
        result['nick_name'] = nick_name
        result['photo_url'] = photo_url
        results_all.append(result)
    return results_all
def create_facebook_warning():
    # time setup
    if S_TYPE == 'test':
        test_day_date = FACEBOOK_FLOW_START_DATE
        today_datetime = datetime2ts(test_day_date) - DAY
        start_time = today_datetime
        end_time = today_datetime
        operate_date = ts2datetime(start_time)
    else:
        now_time = int(time.time())
        today_datetime = datetime2ts(ts2datetime(now_time)) - 8 * DAY
        start_time = today_datetime  # 00:00 of the previous day
        end_time = today_datetime  # 00:00 when the scheduled job starts
        operate_date = ts2datetime(start_time)

    account_list = get_user_account_list()
    # account_list = ['*****@*****.**']
    for account in account_list:
        xnr_list = get_user_xnr_list(account)
        # print xnr_list
        #xnr_list=['FXNR0005']
        for xnr_user_no in xnr_list:
            print 'xnr_user_no:', xnr_user_no
            # personal behavior warning
            personal_mark = create_personal_warning(xnr_user_no,
                                                    today_datetime)
            # speech content warning
            speech_mark = create_speech_warning(xnr_user_no, today_datetime)
            speech_mark = True
            # emerging event warning
            create_event_warning(xnr_user_no, today_datetime, write_mark=True)

    # date warning
    date_mark = create_date_warning(today_datetime)

    return True
def get_influence(uid_list):
    result = {}
    now_ts = time.time()
    #run_type
    if RUN_TYPE == 1:
        now_date = ts2datetime(now_ts - DAY)
    else:
        now_date = ts2datetime(datetime2ts(RUN_TEST_TIME) - DAY)

    index_time = 'bci_' + ''.join(now_date.split('-'))
    index_type = 'bci'
    try:
        es_result = es.mget(index=index_time, doc_type=index_type, body={'ids': uid_list}, _source=False, fields=['user_index'])['docs']
    except Exception, e:
        raise e
def compute_keywords_mark():
    xnr_user_list = lookup_xnr_user_list()
    # xnr_user_list=['FXNR0001']
    print 'xnr_user_list:', xnr_user_list
    now_time = int(time.time()) - DAY
    date_time = ts2datetime(now_time)

    mark_list = []
    for xnr_user_no in xnr_user_list:
        keywords_task_detail = dict()
        keyword_value_string = json.dumps(xnr_keywords_compute(xnr_user_no))
        keywords_task_detail['keyword_value_string'] = keyword_value_string
        keywords_task_detail['xnr_user_no'] = xnr_user_no
        #keywords_task_detail['date_time']=date_time
        #keywords_task_detail['timestamp']=datetime2ts(date_time)
        if S_TYPE == 'test':
            keywords_task_id = xnr_user_no + '_' + test_date
            keywords_task_detail['timestamp'] = datetime2ts(test_date)
            keywords_task_detail['date_time'] = test_date
            print 'keywords_task_detail:', test_date
        else:
            keywords_task_id = xnr_user_no + '_' + date_time
            keywords_task_detail['timestamp'] = datetime2ts(date_time)
            keywords_task_detail['date_time'] = date_time
            print 'keywords_task_detail:', date_time
        try:
            es_xnr_2.index(index=facebook_keyword_count_index_name,
                           doc_type=facebook_keyword_count_index_type,
                           body=keywords_task_detail,
                           id=keywords_task_id)
            mark = True
        except:
            mark = False
        mark_list.append(mark)
    print 'mark_list:', mark_list
    return mark_list
def create_xnr_targetuser(xnr_user_no):
    # #step1: look up the list of virtual persons (xnr)
    # xnr_user_no_list = get_compelete_fbxnr()

    #step2: set the time range
    if S_TYPE == 'test':
        now_time = datetime2ts(FACEBOOK_COMMUNITY_DATE)
    else:
        now_time = int(time.time())
    end_ts = datetime2ts(ts2datetime(now_time))
    start_ts = end_ts - COMMUNITY_TERM * DAY
    datetime_list = []
    if start_ts != end_ts:
        iter_date_ts = end_ts
        while iter_date_ts >= start_ts:
            start_date = ts2datetime(iter_date_ts)
            datetime_list.append(start_date)
            iter_date_ts = iter_date_ts - DAY
    else:
        start_date = ts2datetime(start_ts)
        datetime_list.append(start_date)

    #step3: create seed users for each virtual person
    # for xnr_user_no in xnr_user_no_list:

    #step3.1: look up keywords posted by the virtual person
    xnr_keywords = get_xnr_keywords(xnr_user_no, datetime_list)

    #step3.2: look up the virtual person's followees or friends
    xnr_relationer = get_xnr_relationer(xnr_user_no)

    #step3.3: expand the user set based on keywords and seed users
    expand_userid_list = get_expand_userid_list(xnr_keywords, xnr_relationer,
                                                datetime_list)

    return expand_userid_list
def update_day_sensitive(uid_list):
    results = {}
    for uid in uid_list:
        results[uid] = {
            "sensitive": 0,
            'sensitive_string': "",
            'sensitive_dict': json.dumps({})
        }
    all_results = {}
    now_ts = time.time()
    if RUN_TYPE == 1:
        now_date_ts = datetime2ts(ts2datetime(now_ts))
    else:
        now_date_ts = datetime2ts('2013-09-03')
    today_sensitive_dict = {}
    sensitive_results = redis_cluster.hmget("sensitive_" + str(now_date_ts),
                                            uid_list)
    count = 0
    for item in sensitive_results:
        uid = uid_list[count]
        count += 1
        if not item:
            continue
        words_dict = json.loads(item)
        sensitive_index = 0
        sensitive_words_dict = {}
        if words_dict:
            sensitive_words_dict = words_dict
            for word, word_count in words_dict.iteritems():
                tmp_stage = r_sensitive.hget("sensitive_words", word)
                if tmp_stage:
                    tmp = json.loads(tmp_stage)
                    sensitive_index += sensitive_score_dict[tmp[0]] * word_count
        sensitive_words_string = "&".join(sensitive_words_dict.keys())
        results[uid] = {
            'sensitive': sensitive_index,
            "sensitive_words_string": sensitive_words_string,
            "sensitive_words_dict": sensitive_words_dict
        }

    return results
def compute_network_task(network_task_information):
    results = {}
    #step1: get task information
    start_date = network_task_information['start_date']
    start_ts = datetime2ts(start_date)
    end_date = network_task_information['end_date']
    end_ts = datetime2ts(end_date)
    iter_date_ts = start_ts
    to_date_ts = end_ts
    iter_query_date_list = []  # ['2013-09-01', '2013-09-02']
    while iter_date_ts <= to_date_ts:
        iter_date = ts2datetime(iter_date_ts)
        iter_query_date_list.append(iter_date)
        iter_date_ts += DAY
    #step2: get iter search flow_text_index_name
    #step2.1: get search keywords list
    query_must_list = []
    keyword_nest_body_list = []
    keywords_string = network_task_information['query_keywords']
    keywords_list = keywords_string.split('&')
    for keywords_item in keywords_list:
        keyword_nest_body_list.append(
            {'wildcard': {
                'text': '*' + keywords_item + '*'
            }})
    query_must_list.append({'bool': {'should': keyword_nest_body_list}})
    query_must_list.append({'term': {'message_type': '3'}})
    #step2.2: iter search by date
    results = []
    for iter_date in iter_query_date_list:
        flow_text_index_name = flow_text_index_name_pre + iter_date
        query_body = {'query': {'bool': {'must': query_must_list}}}
        flow_text_result = es_flow_text.search(index=flow_text_index_name, doc_type=flow_text_index_type,\
                    body=query_body)['hits']['hits']
        results.extend(flow_text_result)
    return results
def get_recommend_at_user(xnr_user_no):
    #_id  = user_no2_id(user_no)
    es_result = es.get(index=tw_xnr_index_name,
                       doc_type=tw_xnr_index_type,
                       id=xnr_user_no)['_source']
    #print 'es_result:::',es_result
    if es_result:
        uid = es_result['uid']
        daily_interests = es_result['daily_interests']
    if S_TYPE == 'test':
        now_ts = datetime2ts(S_DATE_TW)
    else:
        now_ts = int(time.time())
    datetime = ts2datetime(now_ts - 24 * 3600)

    index_name = twitter_flow_text_index_name_pre + datetime
    nest_query_list = []
    daily_interests_list = daily_interests.split('&')

    es_results_daily = es.search(index=index_name,doc_type=twitter_flow_text_index_type,\
                        body={'query':{'match_all':{}},'size':200,\
                        'sort':{'timestamp':{'order':'desc'}}})['hits']['hits']

    uid_list = []
    if es_results_daily:
        for result in es_results_daily:
            result = result['_source']
            uid_list.append(result['uid'])

    ## use uid to get nick_name from weibo_user
    uid_nick_name_dict = dict()  # uid never changes, but nick_name may change
    es_results_user = es.mget(index=twitter_user_index_name,
                              doc_type=twitter_user_index_type,
                              body={'ids': uid_list})['docs']
    i = 0
    for result in es_results_user:

        if result['found'] == True:
            result = result['_source']
            uid = result['uid']
            nick_name = result['name']
            if nick_name:
                i += 1
                uid_nick_name_dict[uid] = nick_name
        if i >= DAILY_AT_RECOMMEND_USER_TOP:
            break

    return uid_nick_name_dict
def create_twitter_bci_data(uid, date):
    tw_bci_index_name = tw_bci_index_name_pre + date
    tw_bci_mappings(tw_bci_index_name)
    data = {
        'active': random.choice([1, 1, 1, 1, 1, 2, 3]),
        'propagate': random.choice([1, 1, 1, 1, 1, 1]),
        'cover': random.choice([1, 1, 1, 1, 12, 18, 31, 43, 90, 201]),
        'trust': random.choice([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
        'influence': random.choice([10, 10, 20, 12]),
        'uid': uid,
        'timestamp': datetime2ts(date),
    }
    print es.index(index=tw_bci_index_name,
                   doc_type=tw_bci_index_type,
                   id=uid,
                   body=data)
def save_results_to_es(xnr_user_no, current_date, sort_item, result):

    item_body = {}
    item_body['xnr_user_no'] = xnr_user_no
    item_body['sort_item'] = sort_item
    item_body['result'] = json.dumps(result)
    item_body['timestamp'] = datetime2ts(current_date)

    _id = xnr_user_no + '_' + sort_item

    index_name = active_social_index_name_pre + current_date

    es.index(index=index_name,
             doc_type=active_social_index_type,
             body=item_body,
             id=_id)
def get_activeness(uid, activity_geo):
    result = 0
    now_ts = time.time()
    now_date = ts2datetime(now_ts)
    timestamp = datetime2ts(now_date)
    #test
    #timestamp = datetime2ts('2013-09-08')
    # deal activity_time fft and statusnum
    activity_list = []
    statusnum = 0
    for i in range(1,8):
        ts = timestamp - 24*3600*i
        r_result = r_cluster.hget('activity_'+str(ts), uid)
        if r_result:
            r_result = json.loads(r_result)
        #print 'r_result:', r_result
        for i in range(0,96):
            try:
                count = r_result[str(i)]
            except:
                count = 0
            activity_list.append(float(count))
    #print 'activity_list:', activity_list
    statusnum = sum(activity_list)
    signal = np.array(activity_list)
    fftResult = np.abs(np.fft.fft(signal)) ** 2
    n = signal.size
    freq = np.fft.fftfreq(n, d=1)
    i = 0
    max_val = 0
    max_freq = 0
    for val in fftResult:
        #print str(1/freq[i]) + ',' + str(val)
        if val>max_val and freq[i]>0:
            max_val = val
            max_freq = freq[i]
        i = i + 1
    #print 'i:', i
    #print 'max_freq, max_val:', max_freq, max_val
    # deal avtivity_geo input: 'geo&geo'
    activity_geo_count = len(activity_geo.split('&'))
    result = activeness_weight_dict['activity_time'] * math.log(max_freq  + 1) + \
             activeness_weight_dict['activity_geo'] * math.log(activity_geo_count + 1) +\
             activeness_weight_dict['statusnum'] * math.log(statusnum + 1)
    #print 'activeness:', result
    return result
def save_city_timestamp(uid, ip, timestamp):
    date = ts2datetime(timestamp)
    ts = datetime2ts(date)
    try:
        ip_timestamp_string = r_cluster.hget('new_ip_' + str(ts), str(uid))
        ip_timestamp_string_dict = json.loads(ip_timestamp_string)
        try:
            add_string = '&' + str(timestamp)
            ip_timestamp_string_dict[str(ip)] += add_string
        except:
            ip_timestamp_string_dict[str(ip)] = str(timestamp)
        r_cluster.hset('new_ip_' + str(ts), str(uid),
                       json.dumps(ip_timestamp_string_dict))

    except:
        r_cluster.hset('new_ip_' + str(ts), str(uid),
                       json.dumps({str(ip): str(timestamp)}))
def save_user_warning(xnr_user_no,start_time,end_time):

    # check whether the warning index already exists:
    today_date=ts2datetime(end_time)
    today_datetime = datetime2ts(today_date)
    weibo_user_warning_index_name=weibo_user_warning_index_name_pre+today_date
    if not es_xnr.indices.exists(index=weibo_user_warning_index_name):
        weibo_user_warning_mappings(weibo_user_warning_index_name)

    
    new_user_warning = create_personal_warning(xnr_user_no,start_time,end_time)

    today_history_user_warning,old_uid_list = lookup_history_user_warming(xnr_user_no,today_datetime,end_time)

    results = []
    if new_user_warning:
        for item in new_user_warning:
            id_mark = set_intersection(item['uid'],old_uid_list)
            if id_mark == 1:
                # merge with the existing record and update the index
                task_id = xnr_user_no+'_'+item['uid']
                old_user = es_xnr.get(index=weibo_user_warning_index_name,doc_type=weibo_user_warning_index_type,id=task_id)['_source']
                old_user['content'] = json.loads(old_user['content'])
                old_user['content'].extend(item['content'])
                old_user['user_sensitive'] = old_user['user_sensitive'] + item['user_sensitive']
                #old_user['user_influence'] = old_user['user_influence'] + item['user_influence']
                try:
                    es_xnr.index(index=weibo_user_warning_index_name,doc_type=weibo_user_warning_index_type,body=old_user,id=task_id)
                    mark=True
                except:
                    mark=False

            else:
                # store directly
                task_id=xnr_user_no+'_'+item['uid']
                try:
                    es_xnr.index(index=weibo_user_warning_index_name,doc_type=weibo_user_warning_index_type,body=item,id=task_id)
                    mark=True
                except:
                    mark=False

            results.append(mark)
    else:
        pass
    print 'person_mark::',results
    return results
def get_hot_recommend_tweets(xnr_user_no,topic_field,sort_item):

    topic_field_en = topic_ch2en_dict[topic_field]
    es_results = []
    if sort_item != 'compute_status':
        query_body = {
            'query':{
                'bool':{
                    'must':[
                        {
                            'filtered':{
                                'filter':{
                                    'term':{'topic_field':topic_field_en}
                                }
                            }
                        }
                    ]
                }
                
            },
            'sort':{sort_item:{'order':'desc'}},
            'size':TOP_WEIBOS_LIMIT
        }
        
        current_time = time.time()

        if S_TYPE == 'test':
            current_time = datetime2ts(S_DATE_FB)
            

        #fb_social_sensing_index_name = fb_social_sensing_index_name_pre + ts2datetime(current_time)

        es_results = es.search(index=fb_social_sensing_index_name,doc_type=fb_social_sensing_index_type,body=query_body)['hits']['hits']

        if not es_results:    
            es_results = es.search(index=fb_social_sensing_index_name,doc_type=fb_social_sensing_index_type,\
                                    body={'query':{'match_all':{}},'size':TOP_WEIBOS_LIMIT,\
                                    'sort':{sort_item:{'order':'desc'}}})['hits']['hits']
    results_all = []
    for result in es_results:
        result = result['_source']
        uid = result['uid']
        nick_name,photo_url = fb_uid2nick_name_photo(uid)
        result['nick_name'] = nick_name
        result['photo_url'] = photo_url
        results_all.append(result)
    return results_all
def save_results(save_type, recomment_results):
    save_mark = False
    #run_type
    if RUN_TYPE == 1:
        now_date = ts2datetime(time.time())
    else:
        now_date = ts2datetime(datetime2ts(RUN_TEST_TIME) - DAY)
    recomment_hash_name = 'recomment_' + now_date + '_auto'
    if save_type == 'hotspot':
        #print 'save hotspot results'
        R_RECOMMENTATION.hset(recomment_hash_name, 'auto',
                              json.dumps(recomment_results))
        save_mark = True
    elif save_type == 'operation':
        #print 'save operation results'
        R_RECOMMENTATION.hmset(recomment_hash_name, recomment_results)
        save_mark = True
    return save_mark
def make_up_user_info(user_list = []):
    result_info = []

    today = str(datetime.date.today())
    today = '2013-09-07'
    timestamp = datetime2ts(today)
    print len(user_list)
    if user_list:
        for id in user_list:
            item = {}
            item['uid'] = id
            item['is_warehousing'] , item['uname'], item['weibo_num'] , item['location'] , item['fansnum'] = user_portrait_info(id)
            item['bci_day_last'] = history_info(BCIHISTORY_INDEX_NAME,BCIHISTORY_INDEX_TYPE,id,['bci_day_last'])
            item['sen_day_last'] = history_info(SESHISTORY_INDEX_NAME,BCIHISTORY_INDEX_TYPE,id,['sensitive_score_' + str(timestamp) ])
            result_info.append(item)
        return result_info
    else:
        return []
Example #47
def compute_history_number(xnr_qq_number):
    query_body_history = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [{
                            "term": {
                                "xnr_qq_number": xnr_qq_number
                            }
                        }]
                    }
                }
            }
        },
        "size": MAX_VALUE,
        # "sort":{"timestamp":{"order":"desc"}}
    }

    enddate = datetime.datetime.now().strftime('%Y-%m-%d')
    startdate = ts2datetime(
        datetime2ts(enddate) - group_message_windowsize * DAY)
    index_names = get_groupmessage_index_list(startdate, enddate)
    print index_names
    results = {}
    for index_name in index_names:
        # if not es_xnr.indices.exists(index=index_name):
        #     continue
        try:
            # prefix the index name with 'sent_' to hit the sent-message table, e.g. sent_group_message_2017-07-07
            result = es.search(index='sent_' + index_name,
                               doc_type=group_message_index_type,
                               body=query_body_history)
            if results != {}:
                results['hits']['hits'].extend(result['hits']['hits'])
            else:
                results = result.copy()
        except:
            pass
    if results != {}:
        history_num = len(results['hits']['hits'])
    else:
        return 0
    return history_num
def inner_group_retweet(item):
    root_uid = str(item['root_uid'])
    uid = str(item['uid'])
    timestamp = item['timestamp']
    date = ts2datetime(timestamp)
    date_ts = datetime2ts(date)
    time_segment = int((timestamp - date_ts) / 900)
    start_ts = date_ts + time_segment * 900
    key = 'inner_' + str(start_ts)
    inner_retweet_exist = monitor_inner_r.hget(root_uid, key)
    if not inner_retweet_exist:
        monitor_inner_r.hset(root_uid, key, json.dumps({uid: 1}))
    else:
        inner_retweet_dict = json.loads(inner_retweet_exist)
        if uid in inner_retweet_dict:
            inner_retweet_dict[uid] += 1
        else:
            inner_retweet_dict[uid] = 1
        monitor_inner_r.hset(root_uid, key, json.dumps(inner_retweet_dict))
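# Worked example of the 15-minute bucketing above (assuming the project's
# ts2datetime/datetime2ts helpers): a retweet at 08:37:21 on some day falls
# 31041 seconds into that day, so time_segment = int(31041 / 900) = 34 and the
# hash field key is 'inner_' + str(date_ts + 34 * 900), i.e. the 08:30 bucket.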
Example #49
def get_all_filed(sort_norm, time):
    ts = datetime2ts(ts2datetime(TIME.time() - DAY))
    field_bci = 'bci_day_last'
    field_weibo = "weibo_month_sum"
    field_sen = 'sensitive_score_%s' % ts
    if sort_norm == "weibo_num":
        if time == 1:
            field_weibo = 'weibo_day_last'
        if time == 7:
            field_weibo = "weibo_week_sum"
        elif time == 30:
            field_weibo = "weibo_month_sum"
        else:
            pass
    if sort_norm == 'bci':
        if time == 1:
            field_bci = 'bci_day_last'
        elif time == 7:
            field_bci = 'bci_week_ave'
        else:
            field_bci = 'bci_month_ave'
    elif sort_norm == 'bci_change':
        if time == 1:
            field_bci = 'bci_day_change'
        elif time == 7:
            field_bci = 'bci_week_change'
        else:
            field_bci = 'bci_month_change'
    elif sort_norm == 'ses':
        if time == 1:
            field_sen = 'sensitive_score_%s' % ts
        elif time == 7:
            field_sen = 'sensitive_week_ave'
        else:
            field_sen = 'sensitive_month_ave'
    elif sort_norm == 'ses_change':
        if time == 1:
            field_sen = 'sensitive_day_change'
        elif time == 7:
            field_sen = 'sensitive_week_change'
        else:
            field_sen = 'sensitive_month_change'
    return field_bci, field_sen, field_weibo
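# Usage sketch (field names assumed to match the ES mappings referenced above):
#
#   field_bci, field_sen, field_weibo = get_all_filed('bci_change', 7)
#   # -> ('bci_week_change', 'sensitive_score_<yesterday ts>', 'weibo_month_sum')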
Example #50
def main():
    uid_list = []
    count = 0
    with open('uid_list_0520.txt', 'rb') as f:
        for item in f:
            uid_list.append(item.strip())
    print "uid_list: ", len(uid_list)
    print uid_list[:3]

    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "terms": {
                        "uid": uid_list
                    }
                }
            }
        },
        "size": 100000
    }

    with open('uid_text_0523.txt', 'wb') as f_txt:
        #ts = datetime2ts(ts2datetime(time.time()-24*3600))
        ts = datetime2ts(ts2datetime(time.time()))  #today
        while 1:
            date = ts2datetime(ts)
            index_name = "flow_text_" + str(date)
            print index_name
            exist_bool = es_flow_text.indices.exists(index=index_name)
            if not exist_bool:
                break
            search_results = es_flow_text.search(
                index=index_name, doc_type="text",
                body=query_body)["hits"]["hits"]
            print len(search_results)
            if search_results:
                for item in search_results:
                    f_txt.write(json.dumps(item['_source']) + "\n")
                    count += 1
            ts = ts - 24 * 3600
            break
    print count
Example #51
def get_user_at():
    #step1: get_uid_list
    uid_list = get_uid_list()
    date = ts2datetime(time.time())
    ts = datetime2ts(date)
    f = open(
        '/home/user_portrait_0320/revised_user_portrait/user_portrait/user_portrait/attribute/uid_at.txt',
        'w')
    for i in range(1, 8):
        ts = ts - DAY
        for uid in uid_list:
            #try:
            result_string = r_cluster.hget('at_' + str(ts), uid)
            #except:
            #    result_string = ''
            if result_string:
                save_dict = {'ts': ts, 'result': result_string}
                f.write('%s\n' % json.dumps(save_dict))
    f.close()
def mapper_bci_today(todaydate):
    BCI_INDEX_NAME = "bci_" + ts2datetime(
        (datetime2ts(todaydate) - DAY)).replace("-", "")
    TODAY_TIME = todaydate
    print BCI_INDEX_NAME
    s_re = scan(es_9200,
                query={
                    "query": {"match_all": {}},
                    "size": MAX_ITEMS,
                    "fields": [TOTAL_NUM, TODAY_BCI, "user_fansnum", 'user_friendsnum']
                },
                index=BCI_INDEX_NAME,
                doc_type=BCI_INDEX_TYPE)
    count = 0
    array = []
    while 1:
        try:
            temp = s_re.next()
            one_item = {}
            one_item['id'] = temp['_id'].encode("utf-8")
            one_item['user_fansnum'] = temp['fields']["user_fansnum"][0]
            one_item['user_friendsnum'] = temp['fields']['user_friendsnum'][0]
            one_item['total_num'] = temp['fields'][TOTAL_NUM][0]
            one_item['today_bci'] = temp['fields'][TODAY_BCI][0]
            one_item['update_time'] = TODAY_TIME
            array.append(one_item)
            count += 1
            if count % 1000 == 0:
                r_flow.lpush('update_bci_list', json.dumps(array))
                array = []
                #if count % 100000 == 0:
                #    print count
        except StopIteration:
            print "all done"
            if array:
                r_flow.lpush('update_bci_list', json.dumps(array))
            break

    print count
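# Companion sketch for draining the 'update_bci_list' queue that mapper_bci_today
# fills in batches of up to 1000 items (assumption: r_flow is the same Redis
# connection; the real consumer that bulk-writes these items back to ES lives
# elsewhere in the project).
def pop_bci_batches():
    batches = []
    while 1:
        raw = r_flow.rpop('update_bci_list')
        if not raw:
            break
        batches.append(json.loads(raw))
    return batches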
Example #53
def create_fb_warning():
    # time setup
    now_time = int(time.time())
    # truncate to the start of the current day (the current warning period)
    start_time = datetime2ts(ts2datetime(now_time))

    # create the per-day warning tables
    for i in range(0, 3, 1):
        datetime = start_time - i * DAY
        datename = ts2datetime(datetime)
        facebook_user_warning_mappings(datename)
        facebook_event_warning_mappings(datename)
        facebook_speech_warning_mappings(datename)

        date_result = lookup_date_info(datetime)
        facebook_timing_warning_mappings(date_result)

    account_list = get_user_account_list()
    for account in account_list:
        xnr_list = get_user_xnr_list(account)

        for xnr_user_no in xnr_list:
            for i in range(0, 3, 1):
                task_dict = dict()
                task_dict['xnr_user_no'] = xnr_user_no
                task_dict['today_datetime'] = start_time - i * DAY
                # push the computation tasks onto the queues
                r_warning.lpush(fb_user_warning_task_queue_name,
                                json.dumps(task_dict))

                r_warning.lpush(fb_speech_warning_task_queue_name,
                                json.dumps(task_dict))

                r_warning.lpush(fb_event_warning_task_queue_name,
                                json.dumps(task_dict))

# time-based warning tasks
    time_task = dict()
    for i in range(0, 3, 1):
        time_task['today_datetime'] = start_time - i * DAY
        r_warning.lpush(fb_time_warning_task_queue_name, json.dumps(time_task))
    return True
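# Each queued task above is a JSON-encoded dict of the form
# {'xnr_user_no': ..., 'today_datetime': start_time - i * DAY}, pushed once per
# xnr account for today and the previous two days; the time-warning queue gets
# only the 'today_datetime' field.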
Example #54
def get_tweets_from_bci(monitor_keywords_list, sort_item_new):

    if S_TYPE == 'test':
        now_ts = datetime2ts(S_DATE_BCI_TW)
    else:
        now_ts = int(time.time())

    datetime = ts2datetime(now_ts - 24 * 3600)
    # datetime_new = datetime[0:4]+datetime[5:7]+datetime[8:10]
    datetime_new = datetime

    index_name = tw_bci_index_name_pre + datetime_new

    query_body = {
        'query': {
            'match_all': {}
        },
        'sort': {
            sort_item_new: {
                'order': 'desc'
            }
        },
        'size': BCI_USER_NUMBER
    }

    es_results_bci = es.search(index=index_name,
                               doc_type=tw_bci_index_type,
                               body=query_body)['hits']['hits']
    #print 'es_results_bci::',es_results_bci
    #print 'index_name::',index_name
    #print ''
    uid_set = set()

    if es_results_bci:
        for result in es_results_bci:
            uid = result['_id']
            uid_set.add(uid)
    uid_list = list(uid_set)

    es_results = uid_lists2tw_from_flow_text(monitor_keywords_list, uid_list)

    return es_results
Example #55
def retweet(xnr_info, date):
    global EXCEPTION
    ts = datetime2ts(date)
    facebook_feedback_retweet_mappings(
        facebook_feedback_retweet_index_name_pre + date)
    redis_key = 'facebook_feedback_retweet_data'
    xnr_user_no = xnr_info['xnr_user_no']
    lis = load_data(xnr_user_no, redis_key)

    # {'uid', 'nick_name', 'mid', 'timestamp', 'text', 'update_time', 'root_text', 'root_mid'}
    data = []
    for item in lis:
        try:
            uid = item['uid']
            text = item['text']
            if uid in xnr_info['friends_list']:
                facebook_type = u"好友"  # friend
            else:
                facebook_type = u"陌生人"  # stranger
            sensitive_info, sensitive_user = sensitive_func(ts, text, uid)
            d = {
                'uid': uid,
                'text': text,
                'nick_name': item['nick_name'],
                'mid': item['mid'],
                'timestamp': item['timestamp'],
                'update_time': item['update_time'],
                'root_text': item['root_text'],
                'root_mid': item['root_mid'],
                'photo_url': '',
                'root_uid': xnr_info['root_uid'],
                'root_nick_name': xnr_info['root_nick_name'],
                'facebook_type': facebook_type,
                'sensitive_info': sensitive_info,
                'sensitive_user': sensitive_user,
                'retweet': 0,
                'comment': 0,
                'like': 0
            }
            data.append(d)
        except Exception, e:
            EXCEPTION += '\n retweet Exception: ' + str(e)
Example #56
def create_task_list():
    # 1. search from manage_sensing_task
    # 2. push to redis list-----task_work

    # print start info
    current_path = os.getcwd()
    file_path = os.path.join(current_path, 'task_list.py')
    if S_TYPE == 'test':
        now_ts = datetime2ts(S_DATE) - 3600 * 10
    else:
        now_ts = datehour2ts(ts2datehour(time.time() - 3600))
    print_log = "&".join([file_path, "start", ts2date(now_ts)])
    print print_log
    #ts = ts - 3600

    query_body = {"query": {"match_all": {}}}

    search_results = es.search(index=index_sensing,
                               doc_type=type_sensing,
                               body=query_body)['hits']['hits']

    count = 0
    if search_results:
        for iter_item in search_results:
            _id = iter_item['_id']
            item = iter_item['_source']
            task = []
            task.append(item['task_name'])  # task_name
            try:
                task.append(json.loads(
                    item['social_sensors']))  # social sensors
            except:
                task.append(item['social_sensors'])  # social sensors
            task.append(now_ts)
            #task.append(item['xnr_user_no'])
            #task.append(given_ts)
            r.lpush("task_name", json.dumps(task))
            count += 1

    print count
    print_log = "&".join([file_path, "end", ts2date(time.time())])
    print print_log
def filter_mention(user_set):
    results = []
    now_date = ts2datetime(time.time())
    now_date = '2013-09-08'
    timestamp = datetime2ts(now_date) - 24 * 3600
    for user in user_set:
        mention_set = set()
        for i in range(0, 7):
            ts = timestamp - 3600 * 24 * i
            result = r_cluster.hget('at_' + str(ts), str(user))
            if result:
                item_dict = json.loads(result)
                for at_user in item_dict:
                    mention_set.add(at_user)
        if len(mention_set) < mention_threshold:
            results.append(user)
        else:
            writer.writerow([user, 'mention'])
    print 'after filter mention: ', len(results)
    return results
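# Note: mention_threshold and writer are module-level objects configured
# elsewhere; a user who @-mentions fewer than mention_threshold distinct
# accounts over the 7-day window (the len(mention_set) check above, assuming
# that was the intended count) is kept, otherwise the uid is logged to the csv
# with the reason 'mention'.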
Example #58
def get_all_data():
    topic_list = [
        u'东盟,博览会', u'全军政治工作会议', u'外滩踩踏', u'高校思想宣传', u'APEC', u'张灵甫遗骨疑似被埋羊圈'
    ]
    time_range_list = [('2013-09-08', 6), ('2014-11-16', 17),
                       ('2015-01-10', 10), ('2015-02-01', 9),
                       ('2014-11-20', 15), ('2015-02-02', 10)]
    result = {}
    result_list = []
    for i in range(len(topic_list)):
        topic_name = topic_list[i]
        end_date = time_range_list[i][0]
        windowsize = time_range_list[i][1]
        end_ts = datetime2ts(end_date)
        start_ts = end_ts - DAY * windowsize
        print 'start compute topic:', topic_name
        result = get_topic_data(topic_name, start_ts, end_ts)
        result_list.append(result)

    return json.dumps(result_list)
Example #59
def make_up_user_info(user_list=[], isall=False, time=1, sort_norm="bci"):
    result_info = []

    if RUN_TYPE:
        today = str(datetime.date.today())
    else:
        today = '2013-09-07'
    timestamp = datetime2ts(today)
    #print len(user_list)
    if user_list:
        for id in user_list:
            item = {}
            if isall:
                item = all_makeup_info(id, sort_norm, time)
            else:
                item = in_makeup_info(id, sort_norm, time)
            result_info.append(item)
        return result_info
    else:
        return []
def cal_hashtag_work(uid, hashtag_list, timestamp, sensitive):
    date = ts2datetime(timestamp)
    ts = datetime2ts(date)
    key = str(uid)

    hashtag_dict = {}
    sensitive_hashtag_dict = dict()
    for hashtag in hashtag_list:
        try:
            hashtag_dict[hashtag] += 1
        except:
            hashtag_dict[hashtag] = 1
    hashtag_count_string = redis_cluster.hget('hashtag_' + str(ts), str(uid))
    if hashtag_count_string:
        hashtag_count_dict = json.loads(hashtag_count_string)
        for item in hashtag_list:
            if hashtag_count_dict.has_key(item):
                hashtag_count_dict[item] += 1
            else:
                hashtag_count_dict[item] = 1
    else:
        hashtag_count_dict = hashtag_dict
    redis_cluster.hset('hashtag_' + str(ts), str(uid),
                       json.dumps(hashtag_count_dict))

    if sensitive:
        sensitive_hashtag_count_string = redis_cluster.hget(
            'sensitive_hashtag_' + str(ts), str(uid))
        if sensitive_hashtag_count_string:
            sensitive_hashtag_count_dict = json.loads(
                sensitive_hashtag_count_string)
            for hashtag in hashtag_list:
                if sensitive_hashtag_count_dict.has_key(hashtag):
                    sensitive_hashtag_count_dict[hashtag] += 1
                else:
                    sensitive_hashtag_count_dict[hashtag] = 1
        else:
            sensitive_hashtag_count_dict = hashtag_dict

        redis_cluster.hset('sensitive_hashtag_' + str(ts), str(uid),
                           json.dumps(sensitive_hashtag_count_dict))
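# Usage sketch (uid, hashtags and timestamp below are made-up illustration
# values; redis_cluster is the project's cluster connection): calling the
# function twice for the same uid on the same day accumulates per-hashtag counts.
#
#   cal_hashtag_work('2021345678', [u'topicA', u'topicB'], 1378000000, False)
#   cal_hashtag_work('2021345678', [u'topicA'], 1378000000, False)
#   # redis hash 'hashtag_<day ts>' field '2021345678' -> {"topicA": 2, "topicB": 1}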