Example #1
def pagerank_rank():
    timestamp = time.time()
    es_num = get_es_num(timestamp)
    if es_num == 0:
        network_es_mappings()
        network_count_es_mappings()

    tmp_file = tempfile.NamedTemporaryFile(delete=False)

    print 'step 1: scan', ts2date(timestamp)
    scan_retweet(tmp_file)
    tmp_file.close()
    if not tmp_file:
        return
    input_tmp_path = tmp_file.name
    print input_tmp_path

    ITER_COUNT = 10
    TOP_N = 50
    print 'step 2: pagerank', ts2date(time.time())
    all_uids_count, dg_sorted_uids, pr_sorted_uids = pagerank(ITER_COUNT, input_tmp_path, TOP_N, 'all')
    print 'step 3: save', ts2date(time.time())
    save_count_results(all_uids_count, es_num)
    save_dg_pr_results(dg_sorted_uids, es_num, 'dg')    
    save_dg_pr_results(pr_sorted_uids, es_num, 'pr')    
    print 'save done', ts2date(time.time())
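
Every example on this page leans on a small set of timestamp helpers (ts2date, ts2datetime, datetime2ts, date2ts) whose definitions are not shown. The following is a minimal sketch, assuming they are thin wrappers around the standard time module; the real project may define them differently.

# Hypothetical stand-ins for the conversion helpers used throughout this page.
import time

def ts2date(ts):
    # unix timestamp -> 'YYYY-MM-DD HH:MM:SS'
    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(int(ts)))

def ts2datetime(ts):
    # unix timestamp -> 'YYYY-MM-DD'
    return time.strftime('%Y-%m-%d', time.localtime(int(ts)))

def datetime2ts(date_str):
    # 'YYYY-MM-DD' -> unix timestamp of local midnight that day
    return int(time.mktime(time.strptime(date_str, '%Y-%m-%d')))

def date2ts(date_str):
    # 'YYYY-MM-DD HH:MM:SS' -> unix timestamp
    return int(time.mktime(time.strptime(date_str, '%Y-%m-%d %H:%M:%S')))
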
Example #2
def main():
    #step1: get a task from the redis queue (rpop)
    #step2: get the monitor task time record from redis----data: {'monitor_task_time_record':{task_name, compute_start_ts}}
    #step3: check whether compute_start_ts can be computed yet
    #step4: get the task users from es---group_result
    #step5: compute differently according to the task user count
    #step6: compute the task mid-result
    #step7: save the mid-result in the mid-result es----timestamp as field
    #step8: check whether the track task is still running (not ended/deleted): group_result es status==1, not 0
    #step9: if the track task is running: update compute_start_ts
    #step10: if the track task is running: lpush the task name back to the redis queue (keep the task in the queue)
    #step11: if the track task is not running: delete compute_start_ts from redis
    while True:
        task_name = get_task_name()
        if task_name:
            start_ts = r_task.hget('monitor_task_time_record', task_name)
            start_ts = int(start_ts)
            #now_ts = time.time()
            #test
            now_ts = date2ts('2013-09-08 00:15:00')
            if start_ts == now_ts:
                status = add_task_name(task_name)
                if status == 0:
                    print 'add task to redis fail'
                    break

            if start_ts + 900 <= now_ts:
                task_user = get_task_user(task_name)

                if len(task_user) == 1:
                    print 'compute %s start_ts %s' % (task_name,
                                                      ts2date(start_ts))
                    #status = compute_mid_result_one(task_name, task_user, start_ts)
                else:
                    #print 'compute %s start_ts %s' % (task_name, ts2date(start_ts))
                    #status = compute_mid_result_group(task_name, task_user, start_ts)
                    #compute group polarization----compute once a day
                    if datetime2ts(ts2datetime(start_ts)) == start_ts:
                        print 'start compute group inner %s' % ts2date(
                            start_ts)
                        group_status = compute_group_inner(
                            task_name, task_user, start_ts)
                        status = group_status
                #test
                status = 1
                if status == 0:
                    print 'there is a bug about %s task' % task_name
                else:
                    #update the record time
                    start_ts += 900
                    task_doing_status = identify_task_doing(task_name)
                    #print 'task_doing_status:', task_doing_status
                    if task_doing_status == True:
                        r_task.hset('monitor_task_time_record', task_name,
                                    start_ts)
                        status = add_task_name(task_name)
                        if status == 0:
                            print 'add task name to redis fail'
                    else:
                        r_task.hdel('monitor_task_time_record', task_name)
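
main() above relies on a small Redis protocol: task names are popped from the 'task_name' list, each task's next compute timestamp lives in the 'monitor_task_time_record' hash, and unfinished tasks are pushed back. A minimal sketch of that bookkeeping, assuming redis-py and the key names used in the example; get_task_name and add_task_name are hypothetical reconstructions.

# Sketch of the queue helpers assumed by main(); key names follow the example.
import redis

r_task = redis.StrictRedis(host='localhost', port=6379, db=0)

def get_task_name():
    # rpop returns None when the 'task_name' queue is empty
    return r_task.rpop('task_name')

def add_task_name(task_name):
    # push the task back so it stays in the queue; return 0 on failure
    try:
        r_task.lpush('task_name', task_name)
        return 1
    except redis.RedisError:
        return 0
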
Example #3
def get_maker(topic, new_peaks, new_bottom, ts_list, collection):
    begin_ts = ts_list[new_bottom[0]]
    end_ts = ts_list[new_peaks[0]]
    print 'get_maker news_bottom:', new_bottom[0]
    print 'get_maker news_peak:', new_peaks[0]
    print 'get_maker ts_list:', ts2date(ts_list[0])
    print 'get_maker start_ts:', ts2date(begin_ts)
    print 'get_maker end_ts:', ts2date(end_ts)
    if begin_ts > end_ts:
        begin_ts = ts_list[0]
    
    begin_ts = begin_ts - Hour
    filter_dict = get_filter_dict()
    query_dict = {'timestamp':{'$gte':begin_ts, '$lte':end_ts}}
    '''
    maker_list = collection.find(query_dict, filter_dict).sort('weight').limit(maker_news_count)
    if not maker_list:
        return []
    else:
        return maker_list
    '''
    input_news_list = collection.find(query_dict, filter_dict)
    # tokenize all news in the first wave
    news_cut_list = cut_news(input_news_list)
    # compute the top-50 keywords
    keywords_list = get_news_keywords(news_cut_list)
    # compute each news item's keyword-coverage weight within the wave
    weight_list = get_news_weight(news_cut_list, keywords_list)
    # sort and take the top-20 news by weight
    maker_list = get_top_weight_news(weight_list)
    
    if not maker_list:
        return []
    else:
        return maker_list
Example #4
def get_maker(topic, new_peaks, new_bottom, ts_list, collection):
    begin_ts = ts_list[new_bottom[0]]
    end_ts = ts_list[new_peaks[0]]
    print 'get_maker news_bottom:', new_bottom[0]
    print 'get_maker news_peak:', new_peaks[0]
    print 'get_maker ts_list:', ts2date(ts_list[0])
    print 'get_maker start_ts:', ts2date(begin_ts)
    print 'get_maker end_ts:', ts2date(end_ts)
    if begin_ts > end_ts:
        begin_ts = ts_list[0]

    begin_ts = begin_ts - Hour
    filter_dict = get_filter_dict()
    query_dict = {'timestamp': {'$gte': begin_ts, '$lte': end_ts}}
    '''
    maker_list = collection.find(query_dict, filter_dict).sort('weight').limit(maker_news_count)
    if not maker_list:
        return []
    else:
        return maker_list
    '''
    input_news_list = collection.find(query_dict, filter_dict)
    # tokenize all news in the first wave
    news_cut_list = cut_news(input_news_list)
    # compute the top-50 keywords
    keywords_list = get_news_keywords(news_cut_list)
    # compute each news item's keyword-coverage weight within the wave
    weight_list = get_news_weight(news_cut_list, keywords_list)
    # sort and take the top-20 news by weight
    maker_list = get_top_weight_news(weight_list)

    if not maker_list:
        return []
    else:
        return maker_list
Example #5
def get_interval_count(topic, start_ts, end_ts):
    results = [0]
    ts_list = []
    #unit = 900
    #during = Day
    during = interval_count_during
    start_ts = datetime2ts(ts2datetime(start_ts))
    ts_list.append(start_ts)
    #end_ts = datetime2ts(ts2datetime(end_ts))
    # handle end_ts values that do not fall on a whole-day boundary
    print 'before deal end_ts:', ts2date(end_ts)
    if end_ts - datetime2ts(ts2datetime(end_ts)) != 0:
        end_ts = datetime2ts(ts2datetime(end_ts)) + 3600 * 24
    print 'get_interval_count start_ts:', ts2date(start_ts)
    print 'get_interval_count end_ts:', ts2date(end_ts)

    windowsize = (end_ts - start_ts) / Day
    interval = (end_ts - start_ts) / During
    for i in range(interval, 0, -1):
        begin_ts = end_ts - during * i
        over_ts = begin_ts + during
        ts_list.append(over_ts)

        items = db.session.query(PropagateCountNews).filter(
            PropagateCountNews.topic == topic,
            PropagateCountNews.end <= over_ts,
            PropagateCountNews.end > begin_ts,
            PropagateCountNews.range == unit).all()
        if items:
            result = Merge_propagate(items)
        else:
            result = 0
        results.append(float(result))

    return ts_list, results
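
The only subtle step in get_interval_count() is rounding end_ts up to the next whole day via datetime2ts(ts2datetime(end_ts)) + 3600 * 24. A self-contained sketch of that rounding using only the standard library:

# Round a timestamp up to the next local midnight unless it already is one,
# mirroring the end_ts handling in get_interval_count (sketch only).
import time

def ceil_to_day(ts):
    day_str = time.strftime('%Y-%m-%d', time.localtime(int(ts)))
    day_start = int(time.mktime(time.strptime(day_str, '%Y-%m-%d')))
    return day_start if ts == day_start else day_start + 24 * 3600
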
Example #6
def create_task_list(given_ts):
    # 1. search from manage_sensing_task
    # 2. push to redis list-----task_work

    # print start info
    current_path = os.getcwd()
    file_path = os.path.join(current_path, 'task_list.py')
    now_ts = datehour2ts(ts2datehour(time.time() - 3600))
    print_log = "&".join([file_path, "start", ts2date(now_ts)])
    print print_log
    #ts = ts - 3600

    query_body = {"query": {"match_all": {}}}

    search_results = es.search(index=index_sensing,
                               doc_type=type_sensing,
                               body=query_body)['hits']['hits']

    count = 0
    if search_results:
        for iter_item in search_results:
            _id = iter_item['_id']
            item = iter_item['_source']
            task = []
            task.append(item['task_name'])  # task_name
            task.append(json.loads(item['social_sensors']))  # social sensors
            #task.append(now_ts)
            task.append(given_ts)
            r.lpush('task_name', json.dumps(task))
            count += 1

    print count
    print_log = "&".join([file_path, "end", ts2date(time.time())])
    print print_log
Example #7
def bursty_event_detection(ts,mid_list):
    #print mid_list
    results = es_user_portrait.mget(index=monitor_index_name, doc_type=monitor_index_type, body={"ids":mid_list})["docs"]
    if len(mid_list) >=3:
        print results
        with open('burst_3.txt', 'a') as f:
            for item in results:
                item = item['_source']
                mid = item['mid']
                f.write(str(ts2date(ts))+str(ts2date(item['timestamp']))+str(item['uid'])+item["text"].encode("utf-8", "ignore")+"\n")
                item['detection'] = 1
                es_user_portrait.index(index=monitor_index_name, doc_type=monitor_index_type,id=item['mid'], body=item)
    

    time_series = dict()
    for item in results:
        for k,v in item['_source'].iteritems():
            if "ts-" in k:
                k = k.split('-')[1]
                if int(k) in time_series:
                    time_series[int(k)] += v 
                else:
                    time_series[int(k)] = v
    sorted_result = sorted(time_series.items(), key=lambda x:x[0], reverse=False)
    if len(sorted_result) > 4 and len(mid_list) >=2:
        timestamp = sorted_result[-1][0]
        retweet_number = sorted_result[-1][1]
        average_list = [item[1] for item in sorted_result[:-1]]
        average = np.mean(average_list)
        std = np.std(average_list)
        former_three = sum(average_list[-4:-1])
        #print average_list, retweet_number
        #if retweet_number > average + 1.96*std:
        if retweet_number > former_three:
            print sorted_result
            print "detect burst event"
            print "timestamp: ", timestamp
            print "weibo list: ", mid_list
            #take the top 2 weibos sorted by timestamp
            text_list = []
            for item in results:
                text_list.append(item['_source'])
            #sorted_by_ts = sorted(text_list, key=operator.itemgetter("timestamp"), reverse=False)
            #print "最早的两个微博:", sorted_by_ts[:2]
            #sorted_by_retweet = sorted(text_list, key=operator.itemgetter("sum_retweet"), reverse=True)
            #print sorted_by_retweet[:2]
            #mining_results = []
            #mining_results.extend(sorted_by_ts[:2])
            #mining_results.extend(sorted_by_retweet[:2])
            with open("burst_3.txt", "a") as f:
                for item in text_list:
                    mid = item['mid']
                    item['detection'] = 1
                    es_user_portrait.index(index=monitor_index_name, doc_type=monitor_index_type,id=item['mid'], body=item)
                    f.write(str(ts2date(ts))+str(ts2date(item['timestamp']))+str(item['uid'])+item["text"].encode('utf-8', 'ignore')+"\n")

        else:
            results = []

    return results
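
The burst test in bursty_event_detection() boils down to comparing the newest interval's retweet count against the three preceding intervals (a mean + 1.96*std z-score test is present but commented out). Extracted as a stand-alone sketch:

# Burst criterion from bursty_event_detection, isolated for clarity (sketch).
import numpy as np

def is_burst(sorted_counts, use_zscore=False):
    # sorted_counts: [(timestamp, count), ...] in ascending time order
    if len(sorted_counts) <= 4:
        return False
    latest = sorted_counts[-1][1]
    history = [count for _, count in sorted_counts[:-1]]
    if use_zscore:
        return latest > np.mean(history) + 1.96 * np.std(history)
    # same slice as the example: the three intervals before the last historical one
    return latest > sum(history[-4:-1])
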
Example #8
def pagerank_rank():
    timestamp = time.time()
    net_dic_pr = get_net_dic_pr()
    '''
    es_num = get_es_num(timestamp)
    
    if es_num == 0:
        network_es_mappings()
        network_count_es_mappings()
    '''
    tmp_file = tempfile.NamedTemporaryFile(delete=False)

    print 'step 1: write', ts2date(timestamp)
    for key in net_dic_pr:

        write_tmp_file(tmp_file, key, net_dic_pr[key])

    tmp_file.close()
    if not tmp_file:
        return
    input_tmp_path = tmp_file.name
    print input_tmp_path

    print 'step 2: pagerank', ts2date(time.time())
    all_uids_count, dg_sorted_uids, pr_sorted_uids = pagerank(
        ITER_COUNT, input_tmp_path, TOP_N, 'all')
    print 'pr_sorted_uids:', pr_sorted_uids
    print 'step 3: save', ts2date(time.time())
    save_count_results(all_uids_count, es_num)
    save_dg_pr_results(dg_sorted_uids, es_num, 'dg')
    save_dg_pr_results(pr_sorted_uids, es_num, 'pr')
    print 'save done', ts2date(time.time())
Example #9
def get_interval_count(topic, start_ts, end_ts):
    results = [0]
    ts_list = []
    #unit = 900
    #during = Day
    during = interval_count_during
    start_ts = datetime2ts(ts2datetime(start_ts))
    ts_list.append(start_ts)
    #end_ts = datetime2ts(ts2datetime(end_ts))
    # handle end_ts values that do not fall on a whole-day boundary
    print 'before deal end_ts:', ts2date(end_ts)
    if end_ts - datetime2ts(ts2datetime(end_ts))!= 0:
        end_ts = datetime2ts(ts2datetime(end_ts)) + 3600 * 24
    print 'get_interval_count start_ts:', ts2date(start_ts)
    print 'get_interval_count end_ts:', ts2date(end_ts)

    windowsize = (end_ts - start_ts) / Day
    interval = (end_ts - start_ts) / During
    for i in range(interval, 0, -1):
        begin_ts = end_ts - during * i
        over_ts = begin_ts + during
        ts_list.append(over_ts)

        items = db.session.query(PropagateCountNews).filter(
            PropagateCountNews.topic == topic,
            PropagateCountNews.end <= over_ts,
            PropagateCountNews.end > begin_ts,
            PropagateCountNews.range == unit).all()
        if items:
            result = Merge_propagate(items)
        else:
            result = 0
        results.append(float(result))

    return ts_list, results
Example #10
def pagerank_rank():
    timestamp = time.time()
    es_num = get_es_num(timestamp)
    if es_num == 0:
        network_es_mappings()
        network_count_es_mappings()

    tmp_file = tempfile.NamedTemporaryFile(delete=False)

    print 'step 1: scan', ts2date(timestamp)
    scan_retweet(tmp_file)
    tmp_file.close()
    if not tmp_file:
        return
    input_tmp_path = tmp_file.name
    print input_tmp_path

    ITER_COUNT = 10
    TOP_N = 50
    print 'step 2: pagerank', ts2date(time.time())
    all_uids_count, dg_sorted_uids, pr_sorted_uids = pagerank(
        ITER_COUNT, input_tmp_path, TOP_N, 'all')
    print 'step 3: save', ts2date(time.time())
    save_count_results(all_uids_count, es_num)
    save_dg_pr_results(dg_sorted_uids, es_num, 'dg')
    save_dg_pr_results(pr_sorted_uids, es_num, 'pr')
    print 'save done', ts2date(time.time())
Example #11
def create_task_list():
    # 1. search from manage_sensing_task
    # 2. push to redis list-----task_work

    # print start info
    current_path = os.getcwd()
    file_path = os.path.join(current_path, 'task_list.py')
    if RUN_TYPE == 0:
        now_ts = 1463241600  # 1378008000
    else:
        i = int(sys.argv[1])
        now_ts = 1463241600 + 3600 * i
        #now_ts = date_hour2ts(ts2date_hour(time.time()))
    print_log = "&".join([file_path, "start", ts2date(now_ts)])
    print print_log
    #ts = ts - 3600

    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [{
                            "term": {
                                "finish": "0"
                            }
                        }, {
                            "term": {
                                "processing_status": "1"
                            }
                        }]
                    }
                }
            }
        }
    }

    search_results = es.search(index=index_name,
                               doc_type=task_doc_type,
                               body=query_body)['hits']['hits']

    count = 0
    if search_results:
        for iter_item in search_results:
            _id = iter_item['_id']
            item = iter_item['_source']
            task = []
            task.append(item['task_name'])  # task_name
            task.append(item['keywords'])  # keywords
            task.append(item['stop_time'])  # stop time
            task.append(item['create_by'])
            task.append(now_ts)
            r.lpush('task_name', json.dumps(task))
            count += 1

    print count
    print_log = "&".join([file_path, "end", ts2date(time.time())])
    print print_log
Example #12
def main():
    #step1: get a task from the redis queue (rpop)
    #step2: get the monitor task time record from redis----data: {'monitor_task_time_record':{task_name, compute_start_ts}}
    #step3: check whether compute_start_ts can be computed yet
    #step4: get the task users from es---group_result
    #step5: compute differently according to the task user count
    #step6: compute the task mid-result
    #step7: save the mid-result in the mid-result es----timestamp as field
    #step8: check whether the track task is still running (not ended/deleted): group_result es status==1, not 0
    #step9: if the track task is running: update compute_start_ts
    #step10: if the track task is running: lpush the task name back to the redis queue (keep the task in the queue)
    #step11: if the track task is not running: delete compute_start_ts from redis
    while True:
        task_name = get_task_name()
        if task_name:
            start_ts = r_task.hget('monitor_task_time_record', task_name)
            start_ts = int(start_ts)
            #now_ts = time.time()
            #test
            now_ts = date2ts('2013-09-08 00:15:00')
            if start_ts == now_ts:
                status = add_task_name(task_name)
                if status == 0:
                    print 'add task to redis fail'
                    break

            if start_ts + 900 <= now_ts:
                task_user  = get_task_user(task_name)
                
                if len(task_user)==1:
                    print 'compute %s start_ts %s' % (task_name, ts2date(start_ts))
                    #status = compute_mid_result_one(task_name, task_user, start_ts)
                else:
                    #print 'compute %s start_ts %s' % (task_name, ts2date(start_ts))
                    #status = compute_mid_result_group(task_name, task_user, start_ts)
                    #compute group polarization----compute once a day
                    if datetime2ts(ts2datetime(start_ts)) == start_ts:
                        print 'start compute group inner %s' % ts2date(start_ts)
                        group_status = compute_group_inner(task_name, task_user, start_ts)
                        status = group_status
                #test
                status = 1
                if status == 0:
                    print 'there is a bug about %s task' % task_name
                else:
                    #update the record time
                    start_ts += 900
                    task_doing_status = identify_task_doing(task_name)
                    #print 'task_doing_status:', task_doing_status
                    if task_doing_status == True:
                        r_task.hset('monitor_task_time_record', task_name, start_ts)
                        status = add_task_name(task_name)
                        if status==0:
                            print 'add task name to redis fail'
                    else:
                        r_task.hdel('monitor_task_time_record', task_name)
Example #13
def create_task_list():
    # 1. search from manage_sensing_task
    # 2. push to redis list-----task_work

    # print start info
    current_path = os.getcwd()
    file_path = os.path.join(current_path, 'task_list.py')
    if RUN_TYPE == 0:
        now_ts = 1463241600 # 1378008000
    else:
        i = int(sys.argv[1])
        now_ts = 1463241600 + 3600 * i
        #now_ts = date_hour2ts(ts2date_hour(time.time()))
    print_log = "&".join([file_path, "start", ts2date(now_ts)])
    print print_log
    #ts = ts - 3600

    query_body = {
        "query":{
            "filtered":{
                "filter":{
                    "bool":{
                        "must":[
                            {"term":{"finish": "0"}},
                            {"term":{"processing_status": "1"}}
                        ]
                    }
                }
            }
        }
    }

    search_results = es.search(index=index_name, doc_type=task_doc_type, body=query_body)['hits']['hits']

    count = 0
    if search_results:
        for iter_item in search_results:
            _id = iter_item['_id']
            item = iter_item['_source']
            task = []
            task.append(item['task_name']) # task_name
            task.append(item['keywords']) # keywords
            task.append(item['stop_time']) # stop time
            task.append(item['create_by'])
            task.append(now_ts)
            r.lpush('task_name', json.dumps(task))
            count += 1

    print count
    print_log = "&".join([file_path, "end", ts2date(time.time())])
    print print_log
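
Both create_task_list() variants above serialize each task as a JSON list before lpush-ing it onto 'task_name'. On the consumer side a worker pops and unpacks it in the same field order, roughly like this (sketch, assuming redis-py and the five-field layout of the variant directly above):

# Consumer-side sketch for the queue filled by create_task_list.
import json
import redis

r = redis.StrictRedis(host='localhost', port=6379, db=0)

raw = r.rpop('task_name')
if raw:
    # field order follows the task.append(...) calls above
    task_name, keywords, stop_time, create_by, submit_ts = json.loads(raw)
    print task_name, stop_time
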
Example #14
def get_tsu(new_peaks, new_bottom, ts_list, topic_xapian_id):
    #print 'new_peaks:', new_peaks
    #print 'new_bottom:', new_bottom
    #print 'ts_list:', ts_list
    end_ts = ts_list[new_peaks[0]]
    begin_ts = ts_list[new_bottom[0]]
    if begin_ts > end_ts:
        begin_ts = ts_list[0]
    query_dict = {
        'timestamp': {
            '$gt': begin_ts,
            '$lt': end_ts
        },
        'message_type': 3
    }
    print 'query_dict:', query_dict
    print 'begin_ts:', ts2date(begin_ts)
    print 'end_ts:', ts2date(end_ts)
    xapian_search_weibo = getXapianWeiboByTopic(topic_xapian_id)  # the topic id needs to be taken into account here
    count, results = xapian_search_weibo.search(
        query=query_dict, fields=['retweeted_uid', 'retweeted_mid'])
    print 'count:', count
    ruid_count = {}
    ruid_mid = {}
    for result in results():
        r_uid = result['retweeted_uid']
        if (r_uid == 0) or (not r_uid):
            continue
        ruid_mid[r_uid] = result['retweeted_mid']
        try:
            ruid_count[r_uid] += 1
        except KeyError:
            ruid_count[r_uid] = 1
    sorted_result = sorted(ruid_count.items(),
                           key=lambda d: d[1],
                           reverse=True)
    print 'top_source_user:'******'''
    count, top_weibo = xapian_search_weibo.search(query={'_id':top_source_mid}, fields=['timestamp'])
    print 'count:', count
    for i in top_weibo():
        timestamp = i['timestamp']
        print 'timestamp:', ts2date(int(timestamp))
    '''
    return sorted_result
Example #15
def get_theme(theme_name, submit_user):
    if theme_name == '': 
        theme_detail = es_special_event.search(index=special_event_name, doc_type=special_event_type,\
            body={'query':{'term':{'user':submit_user}}})['hits']['hits']
    else:
        query_body = {
            "query":{
                'bool':{
                    'must':[
                        {'match':   {"user":submit_user}},         
                        {'match':   {"topic_name":theme_name}},         
                    ]

                }

            },
            'size':100
        }
        theme_detail = es_event.search(index=special_event_name, doc_type=special_event_type,\
            body=query_body)['hits']['hits']
    theme_result = []
    for i in theme_detail:
        topic_id = i['_id']
        theme_name = i['_source']['topic_name']
        contain_event = i['_source']['event_count']
        auto_label = i['_source']['label'].split('&')[:5]
        try:
            work_tag = i['_source']['k_label'].split('&')
        # work_tag = deal_event_tag(work_tag, submit_user)[0]
        except:
            work_tag = []
        submit_ts = ts2date(i['_source']['create_ts'])
        theme_result.append([topic_id, theme_name, contain_event, auto_label, work_tag, submit_ts])
    return theme_result
Example #16
def add_task(user_name, type="keyword", range="all", pre='flow_text_',
             during='1', start_time='2013-09-07', end_time='2013-09-07',
             keyword='hello,world', sort_norm='bci',
             sort_scope='in_limit_keyword', time=1, isall=False):
    time_now = TIME.time()

    body_json = {
        'submit_user': user_name,
        'keyword': keyword,
        'submit_time': str(ts2date(time_now)),
        'end_time': end_time,
        'search_type': type,
        'status': 0,
        'range': range,
        'user_ts': user_name + str(time_now),
        'pre': pre,
        'during': during,
        'start_time': start_time,
        'sort_norm': sort_norm,
        'sort_scope': sort_scope,
        'time': time,
        'isall': isall
    }
    try:
        es.index(index=USER_RANK_KEYWORD_TASK_INDEX,
                 doc_type=USER_RANK_KEYWORD_TASK_TYPE,
                 body=body_json)
        return body_json["user_ts"]
    except Exception, e1:
        print e1
Example #17
def profile_group_status_count(fieldEnName):
    start_ts = request.args.get('start_ts', None)
    end_ts = request.args.get('end_ts', None)

    if not start_ts or not end_ts:
        start_ts, end_ts = _time_zone(_utf_encode(default_timerange))

    if start_ts:
        start_ts = int(start_ts)

    if end_ts:
        end_ts = int(end_ts)
    
    interval = (end_ts - start_ts) / (24 * 3600) + 1
    datestr = ts2datetimestr(end_ts) # '20130907'

    date_list = last_week_to_date(datestr, interval)
    domainid = DOMAIN_LIST.index(fieldEnName)

    time_arr = []
    total_arr = []
    repost_arr = []
    fipost_arr = []

    for datestr in date_list:
        active, important, reposts, original = getDomainCountData(domainid, datestr)
        sumcount = reposts + original
        time_arr.append(ts2date(datetimestr2ts(datestr)).isoformat())
        total_arr.append(sumcount)
        repost_arr.append(reposts)
        fipost_arr.append(original)

    return json.dumps({'time': time_arr, 'count': total_arr, 'repost': repost_arr, 'fipost': fipost_arr})
Example #18
def compute_group_inner(task_name, task_user, start_ts):
    #step1: get task_user in-monitor task user retweet relation from monitor_inner_r
    #step2: get task_user in-task user retweet relation
    #step3: compute every inner user be-retweet ratio in task
    #step4: save the top5 to es--monitor_result, doc_type=task_name, _id='inner_'+date, e.g. 'inner_2013-09-01'
    group_status = 0
    time_segment = 3600 * 24
    iter_time_segment = 900
    iter_ts = start_ts - time_segment
    inner_group_dict = {}
    user_count_dict = {}
    print 'group inner ask_user:'******'''
            if iter_ts >= start_ts:
                break
            '''
            key = 'inner_' + str(iter_ts)
            print 'iter_ts:', ts2date(iter_ts)
            inner_retweet_string = monitor_inner_r.hget(root_uid, key)
            print 'root_uid, key, inner_retweet_string:', root_uid, key, inner_retweet_string
            if inner_retweet_string:
                print 'yes'
                inner_retweet_dict = json.loads(inner_retweet_string)
            else:
                inner_retweet_dict = None
            if inner_retweet_dict:
                inner_group_dict[root_uid] = merge_dict(
                    inner_group_dict[root_uid], inner_retweet_dict)
            iter_ts += iter_time_segment
        user_inner_retweet_count = sum(inner_group_dict[root_uid].values())
        user_count_dict[root_uid] = user_inner_retweet_count
    all_be_retweet_count = sum(user_count_dict.values())
    if all_be_retweet_count == 0:
        group_status = 1
        return group_status
    sort_user_inner_retweet_count = sorted(user_count_dict.items(),
                                           key=lambda x: x[1],
                                           reverse=True)
    top5_user = sort_user_inner_retweet_count[:5]

    # timestamp: '2013-09-01'
    date = ts2datetime(start_ts - 24 * 3600)
    index_body = {'date': date}
    for rank in range(1, 6):
        key = 'top' + str(rank)
        index_body[key] = json.dumps(top5_user[rank - 1])
    key = 'inner_' + date
    # save inner-retweet graph by dict {root_uid1:{uid1:count1, uid2:count2}, ...}
    index_body['inner_graph'] = json.dumps(inner_group_dict)

    es.index(index=monitor_index_name,
             doc_type=task_name,
             id=key,
             body=index_body)
    group_status = 1
    return group_status
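
compute_group_inner() accumulates per-user retweet counters with a merge_dict() helper that is not shown on this page. Below is a minimal sketch consistent with how it is called (merging two {uid: count} dicts by summing counts); the project's actual helper may differ.

# Hypothetical merge_dict: sum per-uid counts from two dictionaries (sketch).
def merge_dict(base, extra):
    merged = dict(base)
    for uid, count in extra.items():
        merged[uid] = merged.get(uid, 0) + count
    return merged
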
Example #19
def search_user_task(user_name):
    c_result = {}
    query = {"query":{"bool":{"must":[{"term":{"submit_user":str(user_name)}}]}},"size":MAX_ITEMS,"sort":[{"create_time":{"order":"desc"}}],"fields":["status","search_type","keyword","submit_user","sort_scope","sort_norm","start_time","user_ts","end_time","create_time",'number']}#"sort":[{"create_time":{"order":"desc"}}],;;field:"create_time", 'number'
    if 1:
        return_list = []
        result = es.search(index=USER_RANK_KEYWORD_TASK_INDEX , doc_type=USER_RANK_KEYWORD_TASK_TYPE , body=query)['hits']
        c_result['flag'] = True
        for item in result['hits']:
            result_temp = {}
            result_temp['submit_user'] = item['fields']['submit_user'][0]
            result_temp['search_type'] = item['fields']['search_type'][0]
            #jln
            #result_temp['keyword'] = json.loads(item['fields']['keyword'][0])
            result_temp['keyword'] = json.loads(item['fields']['keyword'][0])
            result_temp['sort_scope'] = item['fields']['sort_scope'][0]
            result_temp['sort_norm'] = item['fields']['sort_norm'][0]
            # result_temp['start_time'] = ts2datetime(item['fields']['start_time'][0])
            # result_temp['end_time'] = ts2datetime(item['fields']['end_time'][0])
            result_temp['start_time'] = item['fields']['start_time'][0]
            result_temp['end_time'] = item['fields']['end_time'][0]

            result_temp['status'] = item['fields']['status'][0]
            result_temp['create_time'] = ts2date(item['fields']['create_time'][0])
            result_temp['search_id'] = item['fields']['user_ts'][0]
            tmp = item['fields'].get('number', 0)
            if tmp:
                result_temp['number'] = int(tmp[0])
            else:
                result_temp['number'] = 100
            return_list.append(result_temp)
        c_result['data'] = return_list
        return c_result
Example #20
def get_news_trend_pusher(topic, start_ts, end_ts, rank_method, news_skip,
                          news_limit_count):
    results = []
    print 'topic, start_ts, end_ts, rank_method:', topic.encode(
        'utf-8'), ts2date(start_ts), ts2date(end_ts), rank_method

    items = db.session.query(TrendPusherNews).filter(
        TrendPusherNews.topic == topic,
        TrendPusherNews.start_ts == start_ts,
        TrendPusherNews.end_ts == end_ts).all()
    if not items or items == []:
        return []

    for item in items:
        row = []
        news_id = item.news_id
        news_id = deal_with(news_id)
        timestamp = item.timestamp
        comments_count = item.comments_count
        news_info = json.loads(item.news_info)

        url = news_info['url']
        summary = news_info['summary']
        datetime = news_info['datetime']
        source_from_name = news_info['source_from_name']
        content168 = news_info['content168']
        title = news_info['title']
        #weight = news_info['weight']
        transmit_name = news_info['transmit_name']
        #if len(transmit_name)==0:
        #    transmit_name = u'未知'
        same_news_num = news_info['same_news_num']
        row = [
            news_id, url, summary, timestamp, datetime, source_from_name,
            content168, title, same_news_num, transmit_name, comments_count
        ]
        results.append(row)

    if rank_method == 'comments_count':
        sort_results = sorted(results, key=lambda x: x[10],
                              reverse=True)  # sort by comment count, descending
    elif rank_method == 'timestamp':
        sort_results = sorted(results, key=lambda x: x[3])  # sort by timestamp, ascending
    #elif rank_method=='weight':
    #    sort_results = sorted(results, key=lambda x:x[10], reverse=True) # sort by relevance, descending

    return sort_results[news_skip:news_limit_count + news_skip]
Example #21
def scan_network_keywords_task():
    #step1: read task information from the redis queue
    #step2: check whether the task information exists in es
    #step3: compute the network trend task
    while True:
        #read task information from the redis queue
        network_task_information = get_task_information()
        print network_task_information
        #break when the redis queue is empty
        if not network_task_information:
            break
        #check whether the task exists in es
        exist_mark = identify_task_exist(network_task_information)
        print 'exist_mark:', exist_mark
        if exist_mark:
            print 'step 1: compute', ts2date(time.time())
            results = compute_network_task(network_task_information)
            if results:
                tmp_file = tempfile.NamedTemporaryFile(delete=False)
                write_tmp_file(tmp_file, results)
                tmp_file.close()

                if not tmp_file:
                    return
                input_tmp_path = tmp_file.name
                print input_tmp_path

                ITER_COUNT = 10
                TOP_N = 50
                print 'step 2: pagerank', ts2date(time.time())
                all_uids_count, dg_sorted_uids, pr_sorted_uids = pagerank(
                    ITER_COUNT, input_tmp_path, TOP_N, 'keywords')
                #save results
                print 'step 3: save', ts2date(time.time())
                save_mark = save_task_results(dg_sorted_uids, pr_sorted_uids,
                                              network_task_information)
                print 'save done', ts2date(time.time())
                #identify save status
                if not save_mark:
                    #status fail: push task information to redis queue
                    push_mark = push_task_information(network_task_information)
                    if not push_mark:
                        print 'error push task queue'
        else:
            #if it does not exist - skip
            pass
Example #22
def get_max_k_timestamp(results, p_ts_list):
    # find the maximum slope, requiring the increment to exceed the average increment
    length = len(results)
    smooth_results = []
    incre_dict = {}
    k_dict = {}
    # smoothing -- this probably loses some information!
    for i in range(length):
        if i > 1:
            smooth = sum(results[i - 2:i + 1]) / 3.0
            smooth_results.append(smooth)
            #print 'smooth_results:',i ,results[i-2:i+1], smooth_results
        l = len(smooth_results)
        if l >= 2:
            '''
            if smooth_results[l-2]!=0:
                k = (smooth_results[l-1] - smooth_results[l-2]) / smooth_results[l-2]
                k_dict[l-1] = k
            else:
                k_dict[l-1] = 0
            '''
            k = (smooth_results[l - 1] - smooth_results[l - 2]) / Hour
            k_dict[l - 1] = k

    #print 'smooth_results:', smooth_results
    sort_k_list = sorted(k_dict.items(), key=lambda c: c[1], reverse=True)
    #print 'sort_k_list:', sort_k_list
    smooth_length = len(smooth_results)
    all_average = 0
    for j in range(smooth_length):
        if j > 0:
            incre = float(smooth_results[j] - smooth_results[j - 1])
            all_average += incre
            incre_dict[j - 1] = incre
    try:
        average_incre = all_average / len(incre_dict)
    except:
        average_incre = all_average
    remove_list = []
    #print 'incre_dict:', incre_dict
    # filter out points whose increment is below the average
    for k in incre_dict:
        if incre_dict[k] <= average_incre:
            remove_list.append(k)
    after_remove_k_list = []
    for sort_k in sort_k_list:
        if not sort_k[0] in remove_list:
            index = sort_k[0]
            timestamp = p_ts_list[index + 1]
            k_value = sort_k[1]
            after_remove_k_list.append((index + 1, timestamp, k_value))

    max_k_timestamp = after_remove_k_list[0][1]
    #print 'after_remove_k_list:', after_remove_k_list
    print 'max_k_timestamp:', max_k_timestamp
    print 'max_k_timestamp:', ts2date(max_k_timestamp)
    return max_k_timestamp
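
get_max_k_timestamp() first smooths the series with a 3-point moving average and then takes a finite-difference slope per Hour. The two steps, isolated into a small sketch; Hour is assumed to be 3600 seconds, as the day-based arithmetic elsewhere in these examples suggests.

# Smoothing and slope steps from get_max_k_timestamp, isolated (sketch).
Hour = 3600  # assumed value

def smooth_and_slope(counts):
    # 3-point moving average, then slope between consecutive smoothed points
    smooth = [sum(counts[i - 2:i + 1]) / 3.0 for i in range(2, len(counts))]
    slopes = dict((i, (smooth[i] - smooth[i - 1]) / Hour)
                  for i in range(1, len(smooth)))
    return smooth, slopes
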
Example #23
def search_user_task(user_name):
    c_result = {}
    query = {
        "query": {
            "bool": {
                "must": [{
                    "term": {
                        "submit_user": str(user_name)
                    }
                }]
            }
        },
        "size":
        MAX_ITEMS,
        "sort": [{
            "create_time": {
                "order": "desc"
            }
        }],
        "fields": [
            "status", "search_type", "keyword", "submit_user", "sort_scope",
            "sort_norm", "start_time", "user_ts", "end_time", "create_time",
            'number'
        ]
    }  #"sort":[{"create_time":{"order":"desc"}}],;;field:"create_time", 'number'
    if 1:
        return_list = []
        result = es.search(index=USER_RANK_KEYWORD_TASK_INDEX,
                           doc_type=USER_RANK_KEYWORD_TASK_TYPE,
                           body=query)['hits']
        c_result['flag'] = True
        for item in result['hits']:
            result_temp = {}
            result_temp['submit_user'] = item['fields']['submit_user'][0]
            result_temp['search_type'] = item['fields']['search_type'][0]
            #jln
            #result_temp['keyword'] = json.loads(item['fields']['keyword'][0])
            result_temp['keyword'] = json.loads(item['fields']['keyword'][0])
            result_temp['sort_scope'] = item['fields']['sort_scope'][0]
            result_temp['sort_norm'] = item['fields']['sort_norm'][0]
            # result_temp['start_time'] = ts2datetime(item['fields']['start_time'][0])
            # result_temp['end_time'] = ts2datetime(item['fields']['end_time'][0])
            result_temp['start_time'] = item['fields']['start_time'][0]
            result_temp['end_time'] = item['fields']['end_time'][0]

            result_temp['status'] = item['fields']['status'][0]
            result_temp['create_time'] = ts2date(
                item['fields']['create_time'][0])
            result_temp['search_id'] = item['fields']['user_ts'][0]
            tmp = item['fields'].get('number', 0)
            if tmp:
                result_temp['number'] = int(tmp[0])
            else:
                result_temp['number'] = 100
            return_list.append(result_temp)
        c_result['data'] = return_list
        return c_result
Example #24
def scan_network_keywords_task():
    #step1: read task information from the redis queue
    #step2: check whether the task information exists in es
    #step3: compute the network trend task
    while True:
        #read task information from the redis queue
        network_task_information = get_task_information()
        print network_task_information
        #break when the redis queue is empty
        if not network_task_information:
            break
        #check whether the task exists in es
        exist_mark = identify_task_exist(network_task_information)
        print 'exist_mark:', exist_mark
        if exist_mark:
            print 'step 1: compute', ts2date(time.time())
            results = compute_network_task(network_task_information)
            if results:
                tmp_file = tempfile.NamedTemporaryFile(delete=False)
                write_tmp_file(tmp_file, results)
                tmp_file.close()

                if not tmp_file:
                    return
                input_tmp_path = tmp_file.name
                print input_tmp_path

                ITER_COUNT = 10
                TOP_N = 50
                print 'step 2: pagerank', ts2date(time.time())
                all_uids_count, dg_sorted_uids, pr_sorted_uids = pagerank(ITER_COUNT, input_tmp_path, TOP_N, 'keywords')
                #save results
                print 'step 3: save', ts2date(time.time())
                save_mark = save_task_results(dg_sorted_uids, pr_sorted_uids, network_task_information)
                print 'save done', ts2date(time.time())
                #identify save status
                if not save_mark:
                    #status fail: push task information to redis queue
                    push_mark = push_task_information(network_task_information)
                    if not push_mark:
                        print 'error push task queue'
        else:
            #if it does not exist - skip
            pass
Example #25
def user_weibo():
    """微博列表页面
    """
    # analysis aspect
    yaosu = 'moodlens'

    # topic keyword
    topic = request.args.get('query', default_topic)

    # time range: 20130901-20130901
    time_range = request.args.get('time_range', default_timerange)

    # time granularity: 3600
    point_interval = request.args.get('point_interval', None)
    if not point_interval:
        point_interval = default_pointInterval
    else:
        for pi in pointIntervals:
            if pi['en'] == int(point_interval):
                point_interval = pi
                break

    weibos = []
    tar_location = u'地域未知'
    tar_name = u'昵称未知'
    tar_profile_image_url = '#'
    tar_followers_count = u'粉丝数未知'
    tar_friends_count = u'关注数未知'
    tar_user_url = '#'
    uid = request.args.get('uid', None)

    if uid:
        count, results = xapian_search_weibo.search(query={'user': int(uid)}, sort_by=['timestamp'], \
            fields=['id', 'user', 'text', 'reposts_count', 'comments_count', 'geo', 'timestamp'])

        for r in results():
            r['weibo_url'] = 'http://weibo.com/'
            r['user_url'] = 'http://weibo.com/u/' + str(uid)
            r['created_at'] = ts2date(r['timestamp'])
            weibos.append(r)

        user_info = acquire_user_by_id(uid)
        if user_info:
            tar_name = user_info['name']
            tar_location = user_info['location']
            tar_profile_image_url = user_info['profile_image_url']
            tar_friends_count = user_info['friends_count']
            tar_followers_count = user_info['followers_count']
            tar_user_url = 'http://weibo.com/u/' + str(uid)

    return render_template('index/weibolist.html', yaosu=yaosu, time_range=time_range, \
            topic=topic, pointInterval=point_interval, pointIntervals=pointIntervals, \
            gaishu_yaosus=gaishu_yaosus, deep_yaosus=deep_yaosus, tar_location=tar_location, \
            tar_profile_image_url=tar_profile_image_url, \
            statuses=weibos, tar_name=tar_name, tar_friends_count=tar_friends_count, \
            tar_followers_count=tar_followers_count, tar_user_url=tar_user_url)
Example #26
def get_tsu(new_peaks, new_bottom, ts_list, topic_xapian_id):
    #print 'new_peaks:', new_peaks
    #print 'new_bottom:', new_bottom
    #print 'ts_list:', ts_list
    end_ts = ts_list[new_peaks[0]]
    begin_ts = ts_list[new_bottom[0]]
    if begin_ts>end_ts:
        begin_ts = ts_list[0]
    query_dict = {
        'timestamp':{'$gt':begin_ts, '$lt':end_ts},
        'message_type':3
        }
    print 'query_dict:', query_dict
    print 'begin_ts:', ts2date(begin_ts)
    print 'end_ts:', ts2date(end_ts)
    xapian_search_weibo = getXapianWeiboByTopic(topic_xapian_id)  # the topic id needs to be taken into account here
    count, results = xapian_search_weibo.search(query=query_dict, fields=['retweeted_uid','retweeted_mid'])
    print 'count:', count
    ruid_count = {}
    ruid_mid = {}
    for result in results():
        r_uid = result['retweeted_uid']
        if (r_uid == 0) or (not r_uid):
            continue
        ruid_mid[r_uid] = result['retweeted_mid']
        try:
            ruid_count[r_uid] += 1
        except KeyError:
            ruid_count[r_uid] = 1
    sorted_result = sorted(ruid_count.items(), key=lambda d:d[1], reverse=True)
    print 'top_source_user:'******'''
    count, top_weibo = xapian_search_weibo.search(query={'_id':top_source_mid}, fields=['timestamp'])
    print 'count:', count
    for i in top_weibo():
        timestamp = i['timestamp']
        print 'timestamp:', ts2date(int(timestamp))
    '''    
    return sorted_result
Example #27
def sort_task(user, keyword, status, start_time, end_time, submit_time):
    query_body = {
        "query":{
            "filtered":{
                "filter":{
                    "bool":{
                        "must":[
                            {"term":{"submit_user": user}}
                        ]
                    }
                }
            }
        },
        "size": 10000,
        "sort":{"submit_time":{"order":"desc"}}
    }

    query_list = []
    if keyword:
        keyword_list = keyword.split(',')
        query_list.append({"terms":{"keyword_string":keyword_list}})
    if status != 2:
        query_list.append({"term":{"status": status}})
    if start_time and end_time:
        start_ts = datetime2ts(start_time)
        end_ts = datetime2ts(end_time)
        query_list.append({"range":{"start_time":{"gte":start_ts, "lte":end_ts}}})
        query_list.append({"range":{"end_time":{"gte":start_ts, "lte":end_ts}}})
    if submit_time:
        query_list.append({"term":{"submit_time": submit_time}})

    if query_list:
        query_body["query"]["filtered"]["filter"]["bool"]["must"].extend(query_list)

    #print query_body
    search_results = es.search(index=USER_RANK_KEYWORD_TASK_INDEX, doc_type=USER_RANK_KEYWORD_TASK_TYPE, body=query_body)["hits"]["hits"]
    results = []
    if search_results:
        for item in search_results:
            iter_item = item['_source']
            tmp = []
            tmp.append(iter_item['search_type'])
            tmp.append(json.loads(iter_item['keyword']))
            tmp.append(ts2datetime(iter_item['start_time']))
            tmp.append(ts2datetime(iter_item['end_time']))
            tmp.append(iter_item['range'])
            tmp.append(ts2date(iter_item['create_time']))
            tmp.append(iter_item['status'])
            tmp.append(iter_item['sort_norm'])
            tmp.append(iter_item['sort_scope'])
            tmp.append(item['_id']) # task_name
            results.append(tmp)

    return results
Example #28
def compute_group_inner(task_name, task_user, start_ts):
    #step1: get task_user in-monitor task user retweet relation from monitor_inner_r
    #step2: get task_user in-task user retweet relation
    #step3: compute every inner user be-retweet ratio in task
    #step4: save the top5 to es--monitor_result, doc_type=task_name, _id='inner_'+date, e.g. 'inner_2013-09-01'
    group_status = 0
    time_segment = 3600*24
    iter_time_segment = 900
    iter_ts = start_ts - time_segment
    inner_group_dict = {}
    user_count_dict = {}
    print 'group inner ask_user:'******'''
            if iter_ts >= start_ts:
                break
            '''
            key = 'inner_' + str(iter_ts)
            print 'iter_ts:', ts2date(iter_ts)
            inner_retweet_string = monitor_inner_r.hget(root_uid, key)
            print 'root_uid, key, inner_retweet_string:', root_uid, key, inner_retweet_string
            if inner_retweet_string:
                print 'yes'
                inner_retweet_dict = json.loads(inner_retweet_string)
            else:
                inner_retweet_dict = None
            if inner_retweet_dict:
                inner_group_dict[root_uid] = merge_dict(inner_group_dict[root_uid], inner_retweet_dict)
            iter_ts += iter_time_segment
        user_inner_retweet_count = sum(inner_group_dict[root_uid].values())
        user_count_dict[root_uid] = user_inner_retweet_count
    all_be_retweet_count = sum(user_count_dict.values())
    if all_be_retweet_count==0:
        group_status = 1
        return group_status
    sort_user_inner_retweet_count = sorted(user_count_dict.items(), key=lambda x:x[1], reverse=True)
    top5_user = sort_user_inner_retweet_count[:5]

    # timestamp: '2013-09-01'
    date = ts2datetime(start_ts - 24*3600)
    index_body = {'date': date}
    for rank in range(1,6):
        key = 'top' + str(rank)
        index_body[key] = json.dumps(top5_user[rank-1])
    key = 'inner_' + date
    # save inner-retweet graph by dict {root_uid1:{uid1:count1, uid2:count2}, ...}
    index_body['inner_graph'] = json.dumps(inner_group_dict)
    
    es.index(index=monitor_index_name, doc_type=task_name, id=key, body=index_body)
    group_status = 1
    return group_status
Example #29
def social_sensing_task():

    count = 0
    now_ts = ts2date(time.time())

    while 1:
        temp = r.rpop("task_name")
        if temp:
            print "current_task:", json.loads(temp)[0]

        if not temp:
            print 'the last task:', count
            now_date = ts2date(time.time())
            print 'All tasks Finished:', now_date
            break

        task_detail = json.loads(temp)
        count += 1
        social_sensing(task_detail)
        print json.loads(temp)[0], ':Finished'
Example #30
def read_uid_weibos(topic, date, windowsize, uid):
    # change
    end_ts = datetime2ts(date)
    start_ts = end_ts - Day * windowsize
    xapian_search_weibo = getXapianWeiboByTopic(topic, start_ts, end_ts)

    query_dict = {'user': uid}
    count, results = xapian_search_weibo.search(query=query_dict,
                                                fields=weibo_fields_list)
    if count == 0:
        weibo_list = []
    else:
        weibo_list = []
        for weibo in results():
            wid = weibo['_id']
            uid = weibo['user']
            result = user_search.search_by_id(uid, fields=user_fields_list)
            if result:
                name = result['name']
                location = result['location']
                friends_count = result['friends_count']
                followers_count = result['followers_count']
                created_at = result['created_at']
                statuses_count = result['statuses_count']
                profile_image_url = result['profile_image_url']
            else:
                name = u'未知'
                location = u'未知'
                friends_count = u'未知'
                followers_count = u'未知'
                created_at = u'未知'
                statuses_count = u'未知'
                profile_image_url = u'no'

            text = weibo['text']
            geo = weibo['geo']
            source = weibo['source']
            timestamp = weibo['timestamp']
            date = ts2date(timestamp)
            reposts_count = weibo['reposts_count']
            comments_count = weibo['comments_count']
            weibo_link = weiboinfo2url(uid, wid)
            domain = uid2domain(uid)

            row = [
                wid, uid, name, location, friends_count, followers_count,
                created_at, statuses_count, profile_image_url, date, text, geo,
                source, reposts_count, comments_count, weibo_link
            ]
            weibo_list.append(row)

    sort_weibo_list = sorted(weibo_list, key=lambda x: x[9])
    return sort_weibo_list
Example #31
def get_max_k_timestamp(results, p_ts_list):
    # find the maximum slope, requiring the increment to exceed the average increment
    length = len(results)
    smooth_results = []
    incre_dict = {}
    k_dict = {}
    # smoothing -- this probably loses some information!
    for i in range(length):
        if i>1:
            smooth = sum(results[i-2:i+1]) / 3.0
            smooth_results.append(smooth)
            #print 'smooth_results:',i ,results[i-2:i+1], smooth_results
        l = len(smooth_results)
        if l>=2:
            '''
            if smooth_results[l-2]!=0:
                k = (smooth_results[l-1] - smooth_results[l-2]) / smooth_results[l-2]
                k_dict[l-1] = k
            else:
                k_dict[l-1] = 0
            '''
            k = (smooth_results[l-1] - smooth_results[l-2]) / Hour
            k_dict[l-1] = k

    #print 'smooth_results:', smooth_results
    sort_k_list = sorted(k_dict.items(), key=lambda c:c[1], reverse=True)
    #print 'sort_k_list:', sort_k_list
    smooth_length = len(smooth_results)
    all_average = 0
    for j in range(smooth_length):
        if j>0:
            incre = float(smooth_results[j] - smooth_results[j-1])
            all_average += incre
            incre_dict[j-1] = incre
    average_incre = all_average / len(incre_dict)    
    remove_list = []
    #print 'incre_dict:', incre_dict
    # filter out points whose increment is below the average
    for k in incre_dict:
        if incre_dict[k]<=average_incre:
            remove_list.append(k)
    after_remove_k_list = []
    for sort_k in sort_k_list:
        if not sort_k[0] in remove_list:
            index = sort_k[0]
            timestamp = p_ts_list[index+1]
            k_value = sort_k[1]
            after_remove_k_list.append((index+1, timestamp, k_value))
    max_k_timestamp = after_remove_k_list[0][1]
    #print 'after_remove_k_list:', after_remove_k_list
    print 'max_k_timestamp:', max_k_timestamp
    print 'max_k_timestamp:', ts2date(max_k_timestamp)
    return max_k_timestamp
Example #32
def social_sensing_task():
    while 1:
        temp = r.rpop("task_name")

        if not temp:
            now_date = ts2date(time.time())
            print 'All tasks Finished:',now_date
            break  
            
        task_detail = json.loads(temp)
        social_sensing(task_detail)
        print json.loads(temp)[0],':Finished'
Example #33
def parseNews(news):
    news_dict = {}
    news = _json_loads(news)

    if not news:
        return {}

    for weibo in news:
        try:
            _id = deal_with(weibo['_id'])
            replies = 1
            weibo['timestamp'] = ts2date(weibo['timestamp'])
            weibo['content168'] = weibo['content168']
            news_dict[_id] = [replies, weibo]
        except:
            continue

    return news_dict
Example #34
def social_sensing_task():
    # 1. print start info
    count = 0
    current_path = os.getcwd()
    file_path = os.path.join(current_path, 'social_sensing.py')
    now_ts = ts2date(time.time())
    print_log = "&".join([file_path, "start", now_ts])
#    print print_log  # print the start info

    while 1:
        temp = r.rpop("task_name")
        if not temp:
            print count
            now_ts = str(int(time.time()))
            print_log = "&".join([file_path, "end", now_ts])
            break  # finished all tasks in task_list
        task_detail = json.loads(temp)
        count += 1
        social_sensing(task_detail)
Example #35
def parseWeibos(weibos):
    weibo_dict = {}
    weibos = _json_loads(weibos)

    if not weibos:
        return {}

    for weibo in weibos:
        try:
            _id = weibo['_id']
            username, profileimage = getuserinfo(
                weibo['user'])  # get username and profile_image_url
            reposts_count = weibo['reposts_count']
            weibo['weibo_link'] = weiboinfo2url(weibo['user'], _id)
            weibo['name'] = username
            weibo['profile_image_url'] = profileimage
            weibo['date'] = ts2date(weibo['timestamp'])
            weibo_dict[_id] = [reposts_count, weibo]
        except:
            continue

    return weibo_dict
Example #36
def add_task(user_name,
             type="keyword",
             range="all",
             pre='flow_text_',
             during='1',
             start_time='2013-09-07',
             end_time='2013-09-07',
             keyword='hello,world',
             sort_norm='bci',
             sort_scope='in_limit_keyword',
             time=1,
             isall=False):
    time_now = TIME.time()

    body_json = {
        'submit_user': user_name,
        'keyword': keyword,
        'submit_time': str(ts2date(time_now)),
        'end_time': end_time,
        'search_type': type,
        'status': 0,
        'range': range,
        'user_ts': user_name + str(time_now),
        'pre': pre,
        'during': during,
        'start_time': start_time,
        'sort_norm': sort_norm,
        'sort_scope': sort_scope,
        'time': time,
        'isall': isall
    }
    try:
        es.index(index=USER_RANK_KEYWORD_TASK_INDEX,
                 doc_type=USER_RANK_KEYWORD_TASK_TYPE,
                 body=body_json)
        return body_json["user_ts"]
    except Exception, e1:
        print e1
Example #37
def get_propagate_peak_news(topic, start_ts, end_ts):
    lis = []
    ts_lis = []
    total_days = (end_ts - start_ts) / During
    for i in range(total_days + 1):
        ts = start_ts + During * i
        count = 0
        for k, v in mtype_kv.iteritems():
            dcount = ReadPropagateNews(topic, ts, During, v)
            if dcount:
                count += sum(dcount['dcount'].values())
        lis.append(float(count))
        ts_lis.append(ts2date(ts))

    if not lis or not len(lis):
        return {}

    new_zeros = detect_peaks(lis)
    time_lis = {}
    for idx, point_idx in enumerate(new_zeros):
        timestamp = ts_lis[point_idx]
        time_lis[idx] = {'ts': timestamp, 'title': 'E' + str(idx)}
    return {'ts': ts_lis, 'count_list': lis, 'peak': time_lis}
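detect_peaks is defined elsewhere; a minimal sketch of a compatible helper, assuming it only has to return the indices of local maxima in the count series (the real implementation may smooth or threshold the series first):

def detect_peaks(series):
    # indices of simple local maxima; the two end points are ignored
    peak_idx = []
    for i in range(1, len(series) - 1):
        if series[i] > series[i - 1] and series[i] >= series[i + 1]:
            peak_idx.append(i)
    return peak_idx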
Пример #38
0
def key_words_search(task_id,
                     search_type,
                     pre,
                     during,
                     start_time,
                     keyword_list,
                     search_key='',
                     sort_norm='',
                     sort_scope='',
                     time=1,
                     isall=False,
                     number=100):
    number = int(number)
    should = []
    for key in keyword_list:
        if search_type == "hashtag":
            should.append({"prefix": {"text": "#" + key + "#"}})
        else:
            should.append({"wildcard": {"text": "*" + key + "*"}})
    index_list = []
    date = ts2datetime(start_time)
    index_name = pre + date
    while during:
        if es_flow_text.indices.exists(index=index_name):
            index_list.append(index_name)
        # always advance to the next day, otherwise a missing index loops forever
        start_time = start_time + DAY
        date = ts2datetime(start_time)
        index_name = pre + date
        during -= 1

    print index_list
    uid_set = set()
    text_results = []

    query_body = {
        "query": {
            "bool": {
                "must": should
            }
        },
        "sort": {
            "user_fansnum": {
                "order": "desc"
            }
        },
        "size": 5000
    }

    results = es_flow_text.search(index=index_list,
                                  doc_type='text',
                                  body=query_body,
                                  _source=False,
                                  fields=[
                                      "uid", "user_fansnum", "text",
                                      "message_type", "sentiment", "timestamp",
                                      "geo", "retweeted", "comment"
                                  ])["hits"]["hits"]

    id_index = 0
    index_list = []
    un_uid_list = []
    for item in results:
        if item['fields']['uid'][0] not in uid_set:
            uid_set.add(item['fields']['uid'][0])
            un_uid_list.append(item['fields']['uid'][0])
            index_list.append(id_index)
        id_index += 1

    uid_list = []
    print "un_uid_list: ", len(un_uid_list)
    portrait_list = []
    count = 0
    in_index = 0
    if not isall and un_uid_list:  # users already in the portrait library
        portrait_results = es_user_portrait.mget(index=USER_INDEX_NAME,
                                                 doc_type=USER_INDEX_TYPE,
                                                 body={"ids": un_uid_list},
                                                 _source=False,
                                                 fields=['uname'])["docs"]
        for item in portrait_results:
            if item["found"]:
                portrait_list.append(item['_id'])
                nick_name = item['fields']['uname'][0]
                if nick_name == 'unknown':
                    nick_name = item['_id']
                index = index_list[in_index]
                weibo_url = weiboinfo2url(results[index]['fields']['uid'][0],
                                          results[index]['_id'])
                text_results.extend([
                    results[index]['fields']['uid'][0],
                    results[index]['fields']['user_fansnum'][0],
                    results[index]['fields']['text'][0],
                    results[index]['fields']['message_type'][0],
                    results[index]['fields']['sentiment'][0],
                    ts2date(results[index]['fields']['timestamp'][0]),
                    results[index]['fields']['geo'][0],
                    results[index]['fields']['retweeted'][0],
                    results[index]['fields']['comment'][0], nick_name,
                    weibo_url
                ])
                count += 1
                if count == number:
                    break
                print "portrait_len, ", len(portrait_list)
            in_index += 1
        if portrait_list:
            uid_list = in_sort_filter(time, sort_norm, sort_scope, None,
                                      portrait_list, True, number)  # sort
    elif un_uid_list:
        profile_result = es_user_profile.mget(index="weibo_user",
                                              doc_type="user",
                                              body={"ids": un_uid_list},
                                              fields=['nick_name'])["docs"]
        for i in range(len(profile_result)):
            index = index_list[i]
            try:
                nick_name = profile_result[i]['fields']['nick_name'][0]
            except:
                nick_name = un_uid_list[i]
            item = results[index]
            weibo_url = weiboinfo2url(item['fields']['uid'][0],
                                      results[index]['_id'])
            text_results.append([
                item['fields']['uid'][0], item['fields']['user_fansnum'][0],
                item['fields']['text'][0], item['fields']['message_type'][0],
                item['fields']['sentiment'][0],
                ts2date(item['fields']['timestamp'][0]),
                results[index]['fields']['geo'][0],
                results[index]['fields']['retweeted'][0],
                results[index]['fields']['comment'][0], nick_name, weibo_url
            ])
            if i == number:
                break
        uid_list = all_sort_filter(un_uid_list[:number], sort_norm, time, True,
                                   number)

    print "filter_uid_list: ", len(uid_list)
    if uid_list:
        results = make_up_user_info(uid_list, isall, time, sort_norm)
    else:
        results = []
    print "results: ", len(results)
    # update the task status
    task_detail = es_user_portrait.get(index=USER_RANK_KEYWORD_TASK_INDEX,
                                       doc_type=USER_RANK_KEYWORD_TASK_TYPE,
                                       id=task_id)
    item = task_detail['_source']
    item['status'] = 1
    item['result'] = json.dumps(results)
    item['text_results'] = json.dumps(text_results)
    item['number'] = len(results)
    es_user_portrait.index(index=USER_RANK_KEYWORD_TASK_INDEX,
                           doc_type=USER_RANK_KEYWORD_TASK_TYPE,
                           id=task_id,
                           body=item)

    return "1"
Пример #39
0
def community_result(community_user_list, topic, date, windowsize):
    #change
    end_ts = datetime2ts(date)
    start_ts = end_ts - windowsize * Day
    xapian_search_weibo = getXapianWeiboByTopic(topic, start_ts, end_ts)
    query_dict = {'$or': []}
    for uid in community_user_list:
        query_dict['$or'].append({'user': int(uid)})
    community_info = []
    count, weibo_results = xapian_search_weibo.search(query=query_dict,
                                                      fields=weibo_fields_list)
    if count == 0:
        return None, None, None, None  # keep the arity consistent with the normal return below
    sentiment_count = {}
    for weibo in weibo_results():
        uid = weibo['user']
        _id = weibo['_id']
        result = user_search.search_by_id(uid, fields=user_fields_list)
        if result:
            name = result['name']
            location = result['location']
            friends_count = result['friends_count']
            followers_count = result['followers_count']
            created_at = result['created_at']
            statuses_count = result['statuses_count']
            profile_image_url = result['profile_image_url']
        else:
            name = u'未知'
            location = u'未知'
            friends_count = u'未知'
            followers_count = u'未知'
            created_at = u'未知'
            statuses_count = u'未知'
            profile_image_url = u'no'

        text = weibo['text']
        timestamp = weibo['timestamp']
        date = ts2date(timestamp)
        reposts_count = weibo['reposts_count']
        source = weibo['source']
        geo = weibo['geo']
        comments_count = weibo['comments_count']
        sentiment = weibo['sentiment']
        sentiment_name = emotions_kv[sentiment]
        weibo_link = weiboinfo2url(uid, _id)
        domain = uid2domain(uid)

        try:
            sentiment_count[sentiment] += 1
        except KeyError:
            sentiment_count[sentiment] = 1
        community_info.append([
            _id, name, location, friends_count, followers_count, created_at,
            statuses_count, profile_image_url, text, date, reposts_count,
            source, geo, comments_count, sentiment_name, weibo_link, domain
        ])

    sort_community_info = sorted(community_info,
                                 key=lambda x: x[10],
                                 reverse=True)  # sort by reposts count

    mset = xapian_search_weibo.search(query=query_dict,
                                      max_offset=50,
                                      mset_direct=True)
    top_keyword = top_keywords(gen_mset_iter(xapian_search_weibo,
                                             mset,
                                             fields=['terms']),
                               top=50)

    sort_top_keyword = sorted(top_keyword, key=lambda x: x[1], reverse=True)

    new_sentiment_list = []
    for sentiment in sentiment_count:
        sentiment_ch = emotions_kv[int(sentiment)]
        num = sentiment_count[sentiment]
        ratio = float(num) / float(count)
        new_sentiment_list.append([sentiment_ch, num, ratio])

    return sort_community_info, sort_top_keyword, new_sentiment_list, query_dict
Пример #40
0
def key_words_search(task_id, search_type , pre , during , start_time , keyword_list , search_key = '' , sort_norm = '', sort_scope = ''  ,time = 7 , isall = False, number = 100):
    number = int(number)
    should = []
    for key in keyword_list:
        if search_type == "hashtag":
            should.append({"prefix":{"text": "#" +  key + "#"}})
        else:    
            should.append({"wildcard":{"text": "*" +key + "*"}})    
    index_list = []
    date = ts2datetime(start_time)
    index_name = pre + date
    while during:
        if es_flow_text.indices.exists(index=index_name):
            index_list.append(index_name)
        # always advance to the next day, otherwise a missing index loops forever
        start_time = start_time + DAY
        date = ts2datetime(start_time)
        index_name = pre + date
        during -= 1

    print index_list
    uid_set = set()
    text_results = []
    sorted_text_results = []

    query_body = {
        "query":{
            "bool":{
                "must":should
             }
        },
        "sort":{"user_fansnum":{"order":"desc"}},
        "size":5000
    }
                    
    results = es_flow_text.search(index = index_list , doc_type = 'text' , body = query_body, _source=False, fields=["uid", "user_fansnum","text", "message_type", "sentiment","timestamp", "geo", "retweeted", "comment"])["hits"]["hits"]
    id_index = 0
    index_list = []
    un_uid_list = []
    for item in results :
        if item['fields']['uid'][0] not in uid_set:
            uid_set.add(item['fields']['uid'][0])
            un_uid_list.append(item['fields']['uid'][0])
            index_list.append(id_index)
        id_index += 1
    
    #get_all_filed(sort_norm , time)
    uid_list = []
    print "un_uid_list: ", len(un_uid_list)
    portrait_list = []
    count = 0
    in_index = 0
    if not isall and un_uid_list:  # users already in the portrait library
        portrait_results = es_user_portrait.mget(index=USER_INDEX_NAME, doc_type=USER_INDEX_TYPE, body={"ids":un_uid_list}, _source=False, fields=['uname'])["docs"]
        for item in portrait_results:
            if item["found"]:
                portrait_list.append(item['_id'])    
                nick_name = item['fields']['uname'][0]
                if nick_name == 'unknown':
                    nick_name = item['_id']
                index = index_list[in_index]
                weibo_url = weiboinfo2url(results[index]['fields']['uid'][0], results[index]['_id'])
                # keep one record per weibo so it can be indexed by position below
                text_results.append([
                    results[index]['fields']['uid'][0],
                    results[index]['fields']['user_fansnum'][0],
                    results[index]['fields']['text'][0],
                    results[index]['fields']['message_type'][0],
                    results[index]['fields']['sentiment'][0],
                    ts2date(results[index]['fields']['timestamp'][0]),
                    results[index]['fields']['geo'][0],
                    results[index]['fields']['retweeted'][0],
                    results[index]['fields']['comment'][0], nick_name, weibo_url
                ])
                count += 1
                if count == number:
                    break
                print "portrait_len, ", len(portrait_list)
            in_index += 1
        if portrait_list:
            uid_list = in_sort_filter(time,sort_norm ,sort_scope ,None , portrait_list , True, number) # sort
            for iter_uid in uid_list:
                iter_index = portrait_list.index(iter_uid)
                sorted_text_results.append(text_results[iter_index])

    elif un_uid_list:
        profile_result = es_user_profile.mget(index="weibo_user", doc_type="user", body={"ids":un_uid_list}, fields=['nick_name'])["docs"]
        for i in range(len(profile_result)):
            index = index_list[i]
            try:
                nick_name = profile_result[i]['fields']['nick_name'][0]
            except:
                nick_name = un_uid_list[i]
            item = results[index]
            weibo_url = weiboinfo2url(item['fields']['uid'][0], results[index]['_id'])
            text_results.append([item['fields']['uid'][0], item['fields']['user_fansnum'][0], item['fields']['text'][0], item['fields']['message_type'][0], item['fields']['sentiment'][0], ts2date(item['fields']['timestamp'][0]), results[index]['fields']['geo'][0], results[index]['fields']['retweeted'][0], results[index]['fields']['comment'][0], nick_name, weibo_url])
            if i == number:
                break
        uid_list = all_sort_filter(un_uid_list[:number] , sort_norm , time ,True, number)
        sorted_text_results = []
        f = open("small.txt", "wb")
        for iter_uid in uid_list:
            iter_index = un_uid_list.index(iter_uid)
            f.write(str(iter_uid)+"\n")
            sorted_text_results.append(text_results[iter_index])
        f.close()
    print "filter_uid_list: ", len(uid_list)
    if uid_list:
        results = make_up_user_info(uid_list,isall,time,sort_norm)
    else:
        results = []
    print "results: ", len(results)
    # update the task status
    task_detail = es_user_portrait.get(index=USER_RANK_KEYWORD_TASK_INDEX , doc_type=USER_RANK_KEYWORD_TASK_TYPE, id=task_id)
    item = task_detail['_source']
    item['status'] = 1
    item['result'] = json.dumps(results)
    item['text_results'] = json.dumps(sorted_text_results)
    item['number'] = len(results)
    es_user_portrait.index(index = USER_RANK_KEYWORD_TASK_INDEX , doc_type=USER_RANK_KEYWORD_TASK_TYPE , id=task_id,  body=item)

    return "1"
Пример #41
0
def get_pushers(topic, new_peaks, new_bottom, ts_list, topic_xapian_id):
    #unit = 900
    #p_during = Hour
    p_ts_list = []
    results = []
    end_ts = ts_list[new_peaks[0]]
    begin_ts = ts_list[new_bottom[0]]
    print 'pusher_start_ts:', ts2date(begin_ts)
    print 'pusher_end_ts:', ts2date(end_ts)
    if begin_ts>end_ts:
        begin_ts = ts_list[0]
    interval = (end_ts - begin_ts) / p_during
    for i in range(interval, 0, -1):
        begin_ts = end_ts - p_during * i
        over_ts = begin_ts + p_during
        p_ts_list.append(over_ts)
        items = db.session.query(PropagateCount).filter(PropagateCount.topic==topic ,\
                                                        PropagateCount.end<=over_ts ,\
                                                        PropagateCount.end>begin_ts ,\
                                                        PropagateCount.range==unit).all()
        if items:
            result = Merge_propagate(items)
        else:
            result = 0
        results.append(float(result))
    #print 'pusher_line:', results
    max_k_timestamp = get_max_k_timestamp(results, p_ts_list)  # timestamp with the steepest growth
    #save max_k_timestamp
    # save_mak_k(max_k_timestamp)
    end = max_k_timestamp
    start = max_k_timestamp - p_during
    xapian_search_weibo = getXapianWeiboByTopic(topic_xapian_id)
    query_dict = {
        'timestamp':{'$gt':end, '$lt':end+3600}
        }
    '''
    count , results = xapian_search_weibo.search(query=query_dict, fields=['_id', 'user','retweeted_uid','retweeted_mid', 'timestamp'])
    ruid_count = {}
    ruid_mid = {}
    for result in results():
        r_uid = result['retweeted_uid']
        if (r_uid == 0) or (not r_uid):
            continue
        ruid_mid[r_uid] = result['retweeted_mid']
        try:
            ruid_count[r_uid] += 1
        except KeyError:
            ruid_count[r_uid] = 1
    sorted_pushers = sorted(ruid_count.items(), key=lambda d:d[1], reverse=True)
    print 'top_trend_pusher_uid:',sorted_pushers
    pusher_list = []
    for pusher in sorted_pushers:
        uid = pusher[0]
        mid = ruid_mid[uid]
        value = pusher[1]
    '''
    # The block above finds the source users that retweets concentrate on within the steepest-growth interval;
    # since their related information cannot be retrieved, the approach below is used instead.
    # The block below picks the users with the highest repost counts among all weibos in that interval.
    count ,results = xapian_search_weibo.search(query=query_dict, sort_by=['reposts_count'], fields=['_id', 'user', 'reposts_count'])
    print 'pusher_search_count:', count
    print 'pusher_query_dict:', query_dict
    pusher_list = []
    count = 0
    for result in results():
        count += 1
        if count>100:
            break
        wid = result['_id']
        uid = result['user']
        value = result['reposts_count']
        pusher_list.append((uid, wid, value))
    # sort by reposts_count
    # sort_by_rc(pusher_list)
    return pusher_list
        user_set = r_flow.rpop('update_bci_list')
        bulk_action = []
        if user_set:
            items = json.loads(user_set)
            uid_list = []
            for item in items:
                uid_list.append(item['id'])
            if uid_list:
                search_results = es.mget(index=BCIHIS_INDEX_NAME, doc_type=BCIHIS_INDEX_TYPE, body={"ids":uid_list})["docs"]
                cal_num_for_bci_history(uid_list, items, search_results)
                count += len(uid_list)
                if count % 10000 == 0:
                    te = time.time()
                    #print "count: %s, cost time: %s" %(count, te-ts)
                    ts = te
        else:
            print count
            break


if __name__ == "__main__":
    time.sleep(100)
    ts = time.time()
    print "all_bci&start&%s"%ts2date(ts)
    try:
        reducer()
    except Exception, e:
        print e, '&error&', ts2date(time.time())
    print "all_bci&end&%s"%ts2date(time.time())
 
import sys
import time
import math

from parameter import TIME_INTERVAL

def get_queue_index(timestamp):
    time_struc = time.gmtime(float(timestamp))
    hour = time_struc.tm_hour
    minute = time_struc.tm_min
    index = hour*4+math.ceil(minute/15.0) #every 15 minutes
    return int(index)
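get_queue_index maps a timestamp to one of the day's quarter-hour slots (hour*4 plus the ceiling of minute/15, in UTC because of time.gmtime). A quick usage sketch with 2013-09-08 00:15:00 UTC (1378599300):

print get_queue_index(1378599300)         # 00:15 -> 0*4 + ceil(15/15.0) = 1
print get_queue_index(1378599300 + 3600)  # 01:15 -> 1*4 + 1 = 5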

if __name__ == "__main__":
    now_ts = time.time()
    date_ts = datetime2ts(ts2datetime(now_ts))
    if now_ts - TIME_INTERVAL < date_ts:
        sys.exit(0)

    tmp_date = ts2date(now_ts)
    print "cron_influence_start_" + tmp_date
    index = get_queue_index(now_ts)  # time slot index for the current timestamp
    influence_ts = "influence_timestamp_" + str(index)
    scan_cursor = 0
    count = 0
    while 1:
        re_scan = r.hscan(influence_ts, scan_cursor, count=1000)
        scan_cursor = re_scan[0]
        detail = re_scan[1]
        if len(detail):
            for k,v in detail.iteritems():
                r.zadd(influence_ts, v, k)
                count += 1
        if int(scan_cursor) == 0:
            break
Пример #44
0
def get_pushers(topic, new_peaks, new_bottom, ts_list):
    #unit = 900
    #p_during = Hour
    p_ts_list = []
    results = []
    end_ts = ts_list[new_peaks[0]]
    begin_ts = ts_list[new_bottom[0]]
    print 'pusher_start_ts:', ts2date(begin_ts)
    print 'pusher_end_ts:', ts2date(
        end_ts)  # two timestamps (start/end, peak/trough); they appear to be swapped here, reason unknown
    if begin_ts > end_ts:
        begin_ts = ts_list[0]
    interval = (end_ts - begin_ts) / p_during
    print end_ts - begin_ts
    print p_during
    print interval
    for i in range(interval, 0, -1):
        begin_ts = end_ts - p_during * i
        over_ts = begin_ts + p_during
        #print '383',begin_ts,over_ts
        p_ts_list.append(over_ts)
        items = db.session.query(PropagateCount).filter(PropagateCount.topic==topic ,\
                                                        PropagateCount.end<=over_ts ,\
                                                        PropagateCount.end>begin_ts ,\
                                                        PropagateCount.range==unit).all()

        if items:
            result = Merge_propagate(items)
        else:
            result = 0
        results.append(float(result))
    #print 'pusher_line:', results
    #try:
    print results
    print p_ts_list
    try:
        max_k_timestamp = get_max_k_timestamp(results, p_ts_list)  # timestamp with the steepest growth
    except:
        max_k_timestamp = end_ts
    #save max_k_timestamp
    # save_mak_k(max_k_timestamp)
    end = max_k_timestamp
    start = max_k_timestamp - p_during
    query_body = {
        'query': {
            'bool': {
                'must':
                # {'term':{'name': topic}},
                {
                    'range': {
                        'timestamp': {
                            'gte': end,
                            'lt': end + 3600
                        }  #3600
                    }
                }
            }
        },
        'size': 1000000,  # cap on returned hits; to be removed
        'sort': {
            "timestamp": {
                "order": "asc"
            }
        }
    }
    es_search_weibos = weibo_es.search(index=topic,
                                       doc_type=weibo_index_type,
                                       body=query_body)['hits']['hits']
    #xapian_search_weibo = getXapianWeiboByTopic(topic_xapian_id)
    #query_dict = {
    #'timestamp':{'$gt':end, '$lt':end+3600}
    #}
    # The block below picks the users with the highest repost counts among all weibos in the steepest-growth interval
    #results = xapian_search_weibo.search(query=query_dict, sort_by=['reposts_count'], fields=['_id', 'user', 'reposts_count'])

    results = es_search_weibos
    print 'pusher_search_count:', len(results)
    #print 'pusher_query_dict:', query_dict
    pusher_list = []
    count = 0
    for result in results:
        count += 1
        if count > 100:
            break
        wid = result['_source']['mid']
        uid = result['_source']['uid']
        value = result['_source']['retweeted']
        pusher_list.append((uid, wid, value))
    # sort by reposts_count
    # sort_by_rc(pusher_list)
    return pusher_list
            one_item['today_bci'] = 0
            one_item['update_time'] = TODAY_TIME
            array.append(one_item)
            count += 1
            if count % 1000 == 0:
                r_flow.lpush('update_bci_list', json.dumps(array))
                array = []
                if count % 100000 == 0:
                    print count
        except StopIteration: 
                print "all done" 
                if array:
                    r_flow.lpush('update_bci_list', json.dumps(array))
                break 

    print count


if __name__ == '__main__':
    todaydate = ts2datetime(time.time())
    #todaydate = '2016-04-19'
    print todaydate
    print "push_bci_redis&start&%s" %ts2date(time.time())
    ts = datetime2ts(todaydate)
    #print es_user_profile.indices.put_mapping(index="bci_history", doc_type="bci", body={'properties':{"user_friendsnum":{"type":"long"}}})
    try:
        mapper_bci_today(todaydate)
    except Exception, e:
        print e, '&error&', ts2date(time.time())
    print "push_bci_redis&end%s" %ts2date(time.time())
                one_item['user_fansnum'] = temp['fields']["user_fansnum"][0]
            else:
                one_item['user_fansnum'] = 0
            array.append(one_item)
            count += 1
            if count % 1000 == 0:
                r_flow.lpush('update_bci_list', json.dumps(array))
                array = []
                if count % 100000 == 0:
                    print count
        except StopIteration: 
                print "all done" 
                if array:
                    r_flow.lpush('update_bci_list', json.dumps(array))
                break 

    print count


if __name__ == '__main__':
    todaydate = ts2datetime(time.time())
    todaydate = "2016-04-18"
    print todaydate
    lenth = r_flow.llen('update_bci_list')
    if not lenth:
        print "cron/scan/history_bci&start&%s" %ts2date(time.time())
        mapper_bci_history(todaydate)
        print "cron/scan/history_bci&end&%s" %ts2date(time.time())
    else:
        time.sleep(60)
    new_mapping_dict = {}
    for uid in mapping_dict:
        user_list = json.loads(mapping_dict[uid])
        user_list[1] = '4'
        new_mapping_dict[uid] = json.dumps(user_list)
    r.hmset(hash_name, new_mapping_dict)

#use to deal compute fail situation
def change_status_compute_fail(mapping_dict):
    hash_name = 'compute'
    status = 1
    new_mapping_dict = {}
    for uid in mapping_dict:
        user_list = json.loads(mapping_dict[uid])
        user_list[1] = '1'
        new_mapping_dict[uid] = json.dumps(user_list)
    r.hmset(hash_name, new_mapping_dict)
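Both status helpers above treat the Redis hash 'compute' as uid -> JSON-encoded list, with the second element holding a status flag. A hedged sketch of reading that mapping back, inferred from the json.loads/user_list[1] handling above (helper name is illustrative):

import json

def load_compute_mapping(r):
    # each value is a JSON-encoded list; user_list[1] is the status flag ('1', '4', ...)
    mapping_dict = r.hgetall('compute')
    return dict((uid, json.loads(value)) for uid, value in mapping_dict.items())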


if __name__=='__main__':
    log_time_ts = int(time.time())
    print 'cron/text_attribute/scan_compute_redis_imm.py&start&' + str(log_time_ts)
    
    try:
        scan_compute_redis()
    except Exception, e:
        print e, '&error&', ts2date(time.time())

    log_time_ts = int(time.time())
    print 'cron/text_attribute/scan_compute_redis_imm.py&end&' + str(log_time_ts)
Пример #48
0
        network_es_mappings()
        network_count_es_mappings()

    tmp_file = tempfile.NamedTemporaryFile(delete=False)

    print 'step 1: scan', ts2date(timestamp)
    scan_retweet(tmp_file)
    tmp_file.close()
    if not tmp_file:
        return
    input_tmp_path = tmp_file.name
    print input_tmp_path

    ITER_COUNT = 10
    TOP_N = 50
    print 'step 2: pagerank', ts2date(time.time())
    all_uids_count, dg_sorted_uids, pr_sorted_uids = pagerank(
        ITER_COUNT, input_tmp_path, TOP_N, 'all')
    print 'step 3: save', ts2date(time.time())
    save_count_results(all_uids_count, es_num)
    save_dg_pr_results(dg_sorted_uids, es_num, 'dg')
    save_dg_pr_results(pr_sorted_uids, es_num, 'pr')
    print 'save done', ts2date(time.time())


if __name__ == '__main__':
    try:
        pagerank_rank()
    except Exception, e:
        print e, '&error&', ts2date(time.time())
def update_flow_information(user_info):
    results = {} # results ={uid: {'activity_geo_dict':'', 'activity_geo':'', 'hashtag_dict':'', 'hashtag':'', 'online_pattern_dict':'', 'online_pattern':''}}
    uid_list = user_info.keys()
    now_ts = time.time()
    now_date = ts2datetime(now_ts)
    timestamp = datetime2ts(now_date)
    #test
    timestamp = datetime2ts('2013-09-08')
    user_hashtag_dict = dict()
    user_online_dict = dict()
    ip_user_count_dict = {}
    new_day_ip_dict = dict()
    for i in range(7,0,-1):
        ts = timestamp - 24*3600*i
        print 'iter date:', ts2date(ts)
        results = r_cluster.hmget('hashtag_'+str(ts), uid_list)
        online_pattern_results = r_cluster.hmget('online_'+str(ts), uid_list)

        if i == 1:  # most recent day in the window; i never reaches 0 in range(7, 0, -1)
            ip_result = r_cluster.hmget('ip_'+str(ts), uid_list)

        for j in range(0, len(uid_list)):
            uid = uid_list[j]
            #attr: hashtag
            if results[j]:
                hashtag_dict = json.loads(results[j])
                for hashtag in hashtag_dict:
                    if uid in user_hashtag_dict:
                        try:
                            user_hashtag_dict[uid][hashtag] += hashtag_dict[hashtag]
                        except:
                            user_hashtag_dict[uid][hashtag] = hashtag_dict[hashtag]
                    else:
                        user_hashtag_dict[uid] = {hashtag: hashtag_dict[hashtag]}
            '''
            #attr: online_pattern
            if online_pattern_results[j]:
                online_pattern_dict = json.loads(online_pattern_results[j])
                for online_pattern in online_pattern_dict:
                    if uid in user_online_dict:
                        try:
                            user_online_dict[uid][online_pattern] += online_pattern_dict[online_pattern]
                        except:
                            user_online_dict[uid][online_pattern] = online_pattern_dict[online_pattern]
                    else:
                        user_online_dict[uid] = {online_pattern: online_pattern_dict[online_pattern]}
            '''
            
            #attr: activity_geo by ip-timestamp
            if i == 1 and ip_result[j]:  # only the most recent day carries ip/geo updates
                ip_timestamp_dict = json.loads(ip_result[j])
                old_flow_information = user_info[uid]
                old_day_geo_list = json.loads(old_flow_information['activity_geo_dict'])
                for ip in ip_timestamp_dict:
                    ip_count = len(ip_timestamp_dict[ip].split('&'))
                    new_day_ip_dict.setdefault(uid, {})[ip] = ip_count
                geo_dict = ip2city(new_day_ip_dict[uid])
                # keep at most 30 days of geo dicts; list.append returns None, so append after slicing
                if len(old_day_geo_list) >= 30:
                    new_day_geo_list = old_day_geo_list[1:]
                else:
                    new_day_geo_list = old_day_geo_list
                new_day_geo_list.append(geo_dict)
                week_geo_list = []
                week_day_geo_list = new_day_geo_list[-7:]
                for day_geo_dict in week_day_geo_list:
                    week_geo_list.extend(day_geo_dict.keys())
                week_geo_list = list(set(week_geo_list))
                activity_geo_string = ''
                new_week_geo_list = []
                for geo_string in week_geo_list:
                    day_geo_string = '&'.join(geo_string.split('\t'))
                    new_week_geo_list.append(day_geo_string)
                activity_geo_string = '&'.join(new_week_geo_list)
                print 'activity_geo_string:', activity_geo_string
                

    result = {}  # final per-uid flow information
    for uid in uid_list:
        #attr: hashtag
        try:
            hashtag_dict = user_hashtag_dict[uid]
            hashtag_string = json.dumps(hashtag_dict)
            hashtag_list = '&'.join(hashtag_dict.keys())
        except KeyError:
            hashtag_string = ''
            hashtag_list = ''
        '''
        #attr: online_pattern
        try:
            online_dict = user_online_dict[uid]
            online_string = json.dumps(online_dict)
            online_list = '&'.join(online_dict.keys())
        except KeyError:
            online_string = ''
            online_list = ''
        '''
        # the online_pattern block above is commented out, so store empty values for it
        online_pattern_string = ''
        online_pattern_list = ''
        result[uid] = {'hashtag_dict': hashtag_string, 'hashtag': hashtag_list, \
                       'activity_geo_dict': json.dumps(new_day_geo_list), 'activity_geo': activity_geo_string, \
                       'online_pattern_dict': online_pattern_string, 'online_pattern': online_pattern_list}
    return result
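update_flow_information reads per-day Redis hashes keyed by timestamp ('hashtag_<ts>', 'online_<ts>', 'ip_<ts>'), where each field is a uid and each value a JSON dict of per-item counts. A hedged sketch of the writer side under that layout (key and value format inferred from the hmget/json.loads calls above; the function name is illustrative):

import json

def save_day_hashtags(r_cluster, day_ts, uid, hashtag_count_dict):
    # e.g. hashtag_count_dict = {u'some_hashtag': 3, u'another_hashtag': 1}
    r_cluster.hset('hashtag_' + str(day_ts), uid, json.dumps(hashtag_count_dict))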
def social_sensing(task_detail):
    # task name, sensors, stop time, creator, timestamp
    task_name = task_detail[0]
    social_sensors = task_detail[1]
    stop_time = task_detail[2]
    create_by = task_detail[3]
    ts = int(task_detail[4])

    print ts2date(ts)
    # PART 1
    
    #forward_result = get_forward_numerical_info(task_name, ts, create_by)
    # original / retweeted weibo mid lists from the previous time window
    forward_origin_weibo_list = query_mid_list(ts-time_interval, social_sensors, forward_time_range)
    forward_retweeted_weibo_list = query_mid_list(ts-time_interval, social_sensors, forward_time_range, 3)
    # original weibo mid list in the current window
    current_mid_list = query_mid_list(ts, social_sensors, time_interval)
    current_retweeted_mid_list = query_mid_list(ts, social_sensors, time_interval, 3)
    all_mid_list = []
    all_mid_list.extend(current_mid_list)
    all_mid_list.extend(current_retweeted_mid_list)
    all_mid_list.extend(forward_origin_weibo_list)
    all_mid_list.extend(forward_retweeted_weibo_list)
    all_origin_list = []
    all_origin_list.extend(current_mid_list)
    all_origin_list.extend(forward_origin_weibo_list)
    all_retweeted_list = []
    all_retweeted_list.extend(current_retweeted_mid_list)
    all_retweeted_list.extend(forward_retweeted_weibo_list)  # mid / root-mid of retweeted weibos
    print "all mid list: ", len(all_mid_list)
    #print "all_origin_list", all_origin_list
    #print "all_retweeted_list", all_retweeted_list

    # count reposts and comments of these weibos in the current window, aggregated by message_type
    statistics_count = query_related_weibo(ts, all_mid_list, time_interval)
    if all_origin_list:
        origin_weibo_detail = query_hot_weibo(ts, all_origin_list, time_interval)  # details of original weibos
    else:
        origin_weibo_detail = {}
    if all_retweeted_list:
        retweeted_weibo_detail = query_hot_weibo(ts, all_retweeted_list, time_interval)  # details of retweeted weibos
    else:
        retweeted_weibo_detail = {}
    current_total_count = statistics_count['total_count']

    # totals for all weibos in the current window
    current_retweeted_count = statistics_count['retweeted']
    current_comment_count = statistics_count['comment']


    # aggregate important users in the current window
    important_uid_list = []
    datetime = ts2datetime(ts-time_interval)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    if exist_es:
        search_results = get_important_user(ts, all_mid_list, time_interval)
        important_uid_list = search_results
    # match important users from the portrait library against the obtained uid list
    if important_uid_list:
        important_results = es_user_portrait.mget(index=portrait_index_name,doc_type=portrait_index_type, body={"ids": important_uid_list})['docs']
    else:
        important_results = []
    filter_important_list = [] # uid_list
    if important_results:
        for item in important_results:
            if item['found']:
                #if item['_source']['importance'] > IMPORTANT_USER_THRESHOULD:
                filter_important_list.append(item['_id'])

    print "filter_important_list", filter_important_list
    print "important_results", important_uid_list

    # sensing / warning decision
    finish = unfinish_signal # "0"
    process_status = "1"


    if int(stop_time) <= ts:  # check whether the task has finished
        finish = finish_signal
        process_status = "0"

    # sensed content, based on all_mid_list
    sensitive_text_list = []

    # start once an event occurs
    if 1:
        index_list = []
        important_words = []
        datetime_1 = ts2datetime(ts)
        index_name_1 = flow_text_index_name_pre + datetime_1
        exist_es = es_text.indices.exists(index=index_name_1)
        if exist_es:
            index_list.append(index_name_1)
        datetime_2 = ts2datetime(ts-DAY)
        index_name_2 = flow_text_index_name_pre + datetime_2
        exist_es = es_text.indices.exists(index=index_name_2)
        if exist_es:
            index_list.append(index_name_2)
        if index_list and all_mid_list:
            query_body = {
                "query":{
                    "filtered":{
                        "filter":{
                            "terms":{"mid": all_mid_list}
                        }
                    }
                },
                "size": 5000
            }
            search_results = es_text.search(index=index_list, doc_type="text", body=query_body)['hits']['hits']
            tmp_sensitive_warning = ""
            text_dict = dict()  # text information
            mid_value = dict()  # per-mid topic value
            duplicate_dict = dict()  # duplicate mapping
            portrait_dict = dict()  # portrait (background) information
            classify_text_dict = dict()  # texts to classify
            classify_uid_list = []
            duplicate_text_list = []
            sensitive_words_dict = dict()
            if search_results:
                for item in search_results:
                    iter_uid = item['_source']['uid']
                    iter_mid = item['_source']['mid']
                    iter_text = item['_source']['text'].encode('utf-8', 'ignore')
                    iter_sensitive = item['_source'].get('sensitive', 0)

                    duplicate_text_list.append({"_id":iter_mid, "title": "", "content":iter_text})

                    if iter_sensitive:
                        tmp_sensitive_warning = signal_sensitive_variation  # weibo containing sensitive words
                        sensitive_words_dict[iter_mid] = iter_sensitive

                    keywords_dict = json.loads(item['_source']['keywords_dict'])
                    personal_keywords_dict = dict()
                    for k, v in keywords_dict.iteritems():
                        k = k.encode('utf-8', 'ignore')
                        personal_keywords_dict[k] = v
                    classify_text_dict[iter_mid] = personal_keywords_dict
                    classify_uid_list.append(iter_uid)

                # deduplicate
                if duplicate_text_list:
                    dup_results = duplicate(duplicate_text_list)
                    for item in dup_results:
                        if item['duplicate']:
                            duplicate_dict[item['_id']] = item['same_from']

                # classify
                if classify_text_dict:
                     classify_results = topic_classfiy(classify_uid_list, classify_text_dict)
                     mid_value = dict()
                     #print "classify_results: ", classify_results
                     for k,v in classify_results.iteritems(): # mid:value
                        mid_value[k] = topic_value_dict[v[0]]

            sensitive_weibo_detail = {}
            if sensitive_words_dict:
                sensitive_mid_list = sensitive_words_dict.keys()
                sensitive_weibo_detail = query_hot_weibo(ts, sensitive_mid_list, time_interval)


    results = dict()
    results['mid_topic_value'] = json.dumps(mid_value)
    results['duplicate_dict'] = json.dumps(duplicate_dict)
    results['sensitive_words_dict'] = json.dumps(sensitive_words_dict)
    results['sensitive_weibo_detail'] = json.dumps(sensitive_weibo_detail)
    results['origin_weibo_number'] = len(all_origin_list)
    results['retweeted_weibo_number'] = len(all_retweeted_list)
    results['origin_weibo_detail'] = json.dumps(origin_weibo_detail)
    results['retweeted_weibo_detail'] = json.dumps(retweeted_weibo_detail)
    results['retweeted_weibo_count'] = current_retweeted_count
    results['comment_weibo_count'] = current_comment_count
    results['weibo_total_number'] = current_total_count
    results['important_users'] = json.dumps(filter_important_list)
    results['unfilter_users'] = json.dumps(important_uid_list)
    results['timestamp'] = ts
    #results['clustering_topic'] = json.dumps(topic_list)
    # store this time slice's results in ES
    doctype = create_by + '-' + task_name
    es_user_portrait.index(index=index_sensing_task, doc_type=doctype, id=ts, body=results)

    # update the manage-social-sensing record in ES
    temporal_result = es_user_portrait.get(index=index_manage_social_task, doc_type=task_doc_type, id=doctype)['_source']
    temporal_result['finish'] = finish
    temporal_result['processing_status'] = process_status
    history_status = json.loads(temporal_result['history_status'])
    history_status.append(ts)
    temporal_result['history_status'] = json.dumps(history_status)
    es_user_portrait.index(index=index_manage_social_task, doc_type=task_doc_type, id=doctype, body=temporal_result)
    return "1"
                print 'step 2: pagerank', ts2date(time.time())
                all_uids_count, dg_sorted_uids, pr_sorted_uids = pagerank(ITER_COUNT, input_tmp_path, TOP_N, 'keywords')
                #save results
                print 'step 3: save', ts2date(time.time())
                save_mark = save_task_results(dg_sorted_uids, pr_sorted_uids, network_task_information)
                print 'save done', ts2date(time.time())
                #identify save status
                if not save_mark:
                    #status fail: push task information to redis queue
                    push_mark = push_task_information(network_task_information)
                    if not push_mark:
                        print 'error push task queue'
        else:
            #if no exist - pass
            pass


if __name__=='__main__':
    log_time_ts = time.time()
    log_time_date = ts2date(log_time_ts)
    print 'cron/network/cron_network.py&start&' + log_time_date
    
    try: 
        scan_network_keywords_task()
    except Exception, e:
        print e, '&error&', ts2date(time.time())

    log_time_ts = time.time()
    log_time_date = ts2date(log_time_ts)
    print 'cron/network/cron_network.py&end&' + log_time_date
# -*- coding: utf-8 -*-

import os
import time
import sys
reload(sys)
sys.path.append('./../../')
from time_utils import ts2datetime, ts2date
from global_utils import R_SPARK

path = "/home/ubuntu01/txt"
file_list = os.listdir(path)
for each in file_list:
    filename = each.split('.')[0]
    if filename.split('_')[-1] == 'yes3':
        os.remove(path+'/'+each)

R_SPARK.flushdb()
ts = ts2date(time.time())

print "/cron/flow3/del_file_yes.py&end&%s" %ts
print "/cron/flow3/flushdb.py&end&%s" %ts
def social_sensing(task_detail):
    # task name, sensors, stop time, previous warning status, creator, timestamp, new flag
    task_name = task_detail[0]
    social_sensors = task_detail[1]
    stop_time = task_detail[2]
    forward_warning_status = task_detail[3]
    create_by = task_detail[4]
    ts = int(task_detail[5])
    new = int(task_detail[6])

    print ts2date(ts)
    # PART 1

    forward_result = get_forward_numerical_info(task_name, ts, create_by)
    # original / retweeted weibo mid lists from the previous time window
    forward_origin_weibo_list = query_mid_list(ts - time_interval, social_sensors, forward_time_range)
    forward_retweeted_weibo_list = query_mid_list(ts - time_interval, social_sensors, forward_time_range, 3)
    # original weibo mid list in the current window
    current_mid_list = query_mid_list(ts, social_sensors, time_interval)
    current_retweeted_mid_list = query_mid_list(ts, social_sensors, time_interval, 3)
    all_mid_list = []
    all_mid_list.extend(current_mid_list)
    all_mid_list.extend(current_retweeted_mid_list)
    all_mid_list.extend(forward_origin_weibo_list)
    all_mid_list.extend(forward_retweeted_weibo_list)
    all_origin_list = []
    all_origin_list.extend(current_mid_list)
    all_origin_list.extend(forward_origin_weibo_list)
    all_retweeted_list = []
    all_retweeted_list.extend(current_retweeted_mid_list)
    all_retweeted_list.extend(forward_retweeted_weibo_list)  # mid / root-mid of retweeted weibos
    print "all mid list: ", len(all_mid_list)
    # print "all_origin_list", all_origin_list
    # print "all_retweeted_list", all_retweeted_list

    # count reposts and comments of these weibos in the current window, aggregated by message_type
    statistics_count = query_related_weibo(ts, all_mid_list, time_interval)
    if all_origin_list:
        origin_weibo_detail = query_hot_weibo(ts, all_origin_list, time_interval)  # details of original weibos
    else:
        origin_weibo_detail = {}
    if all_retweeted_list:
        retweeted_weibo_detail = query_hot_weibo(ts, all_retweeted_list, time_interval)  # details of retweeted weibos
    else:
        retweeted_weibo_detail = {}
    current_total_count = statistics_count["total_count"]

    # totals for all weibos in the current window
    current_retweeted_count = statistics_count["retweeted"]
    current_comment_count = statistics_count["comment"]

    # PART 2
    # aggregate the positive / neutral / sad / angry sentiment distribution in the current window
    # sentiment_dict = {"0": "neutral", "1":"positive", "2":"sad", "3": "anger"}
    sentiment_count = {"0": 0, "1": 0, "2": 0, "3": 0}
    search_results = aggregation_sentiment_related_weibo(ts, all_mid_list, time_interval)
    sentiment_count = search_results
    print "sentiment_count: ", sentiment_count
    negetive_key = ["2", "3", "4", "5", "6"]
    negetive_count = 0
    for key in negetive_key:
        negetive_count += sentiment_count.get(key, 0)  # keys beyond "3" may be absent from the aggregation

    # aggregate important users in the current window
    important_uid_list = []
    datetime = ts2datetime(ts - time_interval)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    if exist_es:
        search_results = get_important_user(ts, all_mid_list, time_interval)
        important_uid_list = search_results
    # match important users from the portrait library against the obtained uid list
    if important_uid_list:
        important_results = es_user_portrait.mget(
            index=portrait_index_name, doc_type=portrait_index_type, body={"ids": important_uid_list}
        )["docs"]
    else:
        important_results = []
    filter_important_list = []  # uid_list
    if important_results:
        for item in important_results:
            if item["found"]:
                # if item['_source']['importance'] > IMPORTANT_USER_THRESHOULD:
                filter_important_list.append(item["_id"])

    # sensing / warning decision
    burst_reason = signal_nothing_variation
    warning_status = signal_nothing
    finish = unfinish_signal  # "0"
    process_status = "1"

    if forward_result[0]:
        # use the moving average to decide whether an anomaly has occurred
        mean_count = forward_result[1]
        std_count = forward_result[2]
        mean_sentiment = forward_result[3]
        std_sentiment = forward_result[4]
        if (
            mean_count >= MEAN_COUNT
            and current_total_count > mean_count + 1.96 * std_count
            or current_total_count >= len(all_mid_list) * AVERAGE_COUNT
        ):  # anomaly detected
            if forward_warning_status == signal_brust:  # an event already exists, switch to tracking
                warning_status = signal_track
            else:
                warning_status = signal_brust
            burst_reason += signal_count_varition  # count anomaly

        if (
            negetive_count > mean_sentiment + 1.96 * std_sentiment
            and mean_sentiment >= MEAN_COUNT
            or negetive_count >= len(all_mid_list) * AVERAGE_COUNT
        ):
            warning_status = signal_brust
            burst_reason += signal_sentiment_varition  # negative-sentiment anomaly; "12" means both are anomalous
            if forward_warning_status == signal_brust:  # an event already exists, switch to tracking
                warning_status = signal_track

    if int(stop_time) <= ts:  # check whether the task has finished
        finish = finish_signal
        process_status = "0"

    # sensed content, based on all_mid_list
    tmp_burst_reason = burst_reason
    topic_list = []
    sensitive_text_list = []

    # start once an event occurs
    # if warning_status:
    if 1:
        index_list = []
        important_words = []
        datetime_1 = ts2datetime(ts)
        index_name_1 = flow_text_index_name_pre + datetime_1
        exist_es = es_text.indices.exists(index=index_name_1)
        if exist_es:
            index_list.append(index_name_1)
        datetime_2 = ts2datetime(ts - DAY)
        index_name_2 = flow_text_index_name_pre + datetime_2
        exist_es = es_text.indices.exists(index=index_name_2)
        if exist_es:
            index_list.append(index_name_2)
        if index_list and all_mid_list:
            query_body = {"query": {"filtered": {"filter": {"terms": {"mid": all_mid_list}}}}, "size": 5000}
            search_results = es_text.search(index=index_list, doc_type="text", body=query_body)["hits"]["hits"]
            tmp_sensitive_warning = ""
            text_dict = dict()  # text information
            mid_value = dict()  # per-mid topic value
            duplicate_dict = dict()  # duplicate mapping
            portrait_dict = dict()  # portrait (background) information
            classify_text_dict = dict()  # texts to classify
            classify_uid_list = []
            duplicate_text_list = []
            sensitive_words_dict = dict()
            if search_results:
                for item in search_results:
                    iter_uid = item["_source"]["uid"]
                    iter_mid = item["_source"]["mid"]
                    iter_text = item["_source"]["text"].encode("utf-8", "ignore")
                    iter_sensitive = item["_source"].get("sensitive", 0)

                    duplicate_text_list.append({"_id": iter_mid, "title": "", "content": iter_text})

                    if iter_sensitive:
                        tmp_sensitive_warning = signal_sensitive_variation  # weibo containing sensitive words
                        sensitive_words_dict[iter_mid] = iter_sensitive

                    keywords_dict = json.loads(item["_source"]["keywords_dict"])
                    personal_keywords_dict = dict()
                    for k, v in keywords_dict.iteritems():
                        k = k.encode("utf-8", "ignore")
                        personal_keywords_dict[k] = v
                    classify_text_dict[iter_mid] = personal_keywords_dict
                    classify_uid_list.append(iter_uid)

                # deduplicate
                if duplicate_text_list:
                    dup_results = duplicate(duplicate_text_list)
                    for item in dup_results:
                        if item["duplicate"]:
                            duplicate_dict[item["_id"]] = item["same_from"]

                # classify
                if classify_text_dict:
                    classify_results = topic_classfiy(classify_uid_list, classify_text_dict)
                    mid_value = dict()
                    # print "classify_results: ", classify_results
                    for k, v in classify_results.iteritems():  # mid:value
                        mid_value[k] = topic_value_dict[v[0]]

            if tmp_sensitive_warning:
                warning_status = signal_brust
                burst_reason += signal_sensitive_variation
            sensitive_weibo_detail = {}
            if sensitive_words_dict:
                sensitive_mid_list = sensitive_words_dict.keys()
                sensitive_weibo_detail = query_hot_weibo(ts, sensitive_mid_list, time_interval)

    results = dict()
    results["mid_topic_value"] = json.dumps(mid_value)
    results["duplicate_dict"] = json.dumps(duplicate_dict)
    results["sensitive_words_dict"] = json.dumps(sensitive_words_dict)
    results["sensitive_weibo_detail"] = json.dumps(sensitive_weibo_detail)
    results["origin_weibo_number"] = len(all_origin_list)
    results["retweeted_weibo_number"] = len(all_retweeted_list)
    results["origin_weibo_detail"] = json.dumps(origin_weibo_detail)
    results["retweeted_weibo_detail"] = json.dumps(retweeted_weibo_detail)
    results["retweeted_weibo_count"] = current_retweeted_count
    results["comment_weibo_count"] = current_comment_count
    results["weibo_total_number"] = current_total_count
    results["sentiment_distribution"] = json.dumps(sentiment_count)
    results["important_users"] = json.dumps(filter_important_list)
    results["unfilter_users"] = json.dumps(important_uid_list)
    results["burst_reason"] = tmp_burst_reason
    results["timestamp"] = ts
    # results['clustering_topic'] = json.dumps(topic_list)
    # store this time slice's results in ES
    doctype = create_by + "-" + task_name
    es_user_portrait.index(index=index_sensing_task, doc_type=doctype, id=ts, body=results)

    # update the manage-social-sensing record in ES
    if not new:
        temporal_result = es_user_portrait.get(index=index_manage_social_task, doc_type=task_doc_type, id=doctype)[
            "_source"
        ]
        temporal_result["warning_status"] = warning_status
        temporal_result["burst_reason"] = tmp_burst_reason
        temporal_result["finish"] = finish
        temporal_result["processing_status"] = process_status
        history_status = json.loads(temporal_result["history_status"])
        history_status.append([ts, task_name, warning_status])
        temporal_result["history_status"] = json.dumps(history_status)
        es_user_portrait.index(index=index_manage_social_task, doc_type=task_doc_type, id=doctype, body=temporal_result)
    else:
        print "test"
    return "1"
Пример #54
0
def save_weibos(excel_name, topic, child_topic_list, w_limit):  # the related weibo ids should be looked up from the text content here
    data = xlrd.open_workbook(excel_name)
    weibos_dict = {}
    for i in child_topic_list:
        #if i == '0':
        #    continue
        weibos_dict[i] = []
        table_weibos = data.sheet_by_name(str(int(i)))
        n_row_weibos = table_weibos.nrows
        if n_row_weibos <= w_limit:
            n_rows = n_row_weibos
        else:
            n_rows = w_limit  # rows are already sorted by weight in descending order
        for j in range(n_rows):
            line = table_weibos.row_values(j)  # missing: look up the full weibo record matching this text
            weibo_text = line[1]
            weibo_weight = line[0]
            try:
                weibos_dict[i].append((weibo_text, weibo_weight))  # what should really be appended is the full weibo record, with username etc. already filled in
            except:
                weibos_dict[i]=[(weibo_text, weibo_weight)]
    #print 'weibos_dict:', weibos_dict
    # fetch concrete weibo data; for testing only
    s = XapianSearch(path='/home/ubuntu3/huxiaoqian/case/20140724/20140804/', name='master_timeline_weibo',schema_version='5')
    begin_ts = 1378050300
    end_ts = 1378051200
    query_dict = {
        'timestamp': {'$gt':begin_ts, '$lt': end_ts},
        'message_type' : 2
    }
    weibos_dict_new = {}
    scount, weibo_results =s.search(query=query_dict, fields=fields_list)
    #print 'scount:', scount
    i = 0
    j = 0
    for weibo in weibo_results():
        if i==11:
            break
        weibo['text'] = weibos_dict[str(i)][j][0]
        # get username, profile image and weibo url
        username, profileimage = getuserinfo(weibo['user'])
        weibo['username'] = username
        weibo['profile_image_url'] = profileimage
        weibo['timestamp'] = ts2date(weibo['timestamp'])
        weibo['weibo_link'] = weiboinfo2url(weibo['user'],weibo['_id'])
        # end of username / profile image / url lookup
        weight = weibos_dict[str(i)][j][1]
        try:
            weibos_dict_new[i].append((weibo, weight))
        except:
            weibos_dict_new[i] = [(weibo, weight)]
        if j==4:
            j = 0
            i += 1
        else:
            j +=1
            
        # separator
    for i in range(len(child_topic_list)):
        item = OpinionTestWeibos(topic, i, json.dumps(weibos_dict_new[i]))
        item_exist = db.session.query(OpinionTestWeibos).filter(OpinionTestWeibos.topic==topic, \
                                                                OpinionTestWeibos.child_topic==i).first()
        if item_exist:
            db.session.delete(item_exist)
        db.session.add(item)
    db.session.commit()