def user_sort_interface(username, time, sort_scope, sort_norm, arg=None,
                        st=None, et=None, isall=False, task_number=0,
                        number=100):
    """Rank users online, or queue an offline ranking task, by sort_scope.

    Offline scopes ('all_limit_keyword' when isall is true,
    'in_limit_keyword' / 'in_limit_hashtag' otherwise) are registered through
    add_task.  Every other case is answered immediately through
    all_sort_filter / in_sort_filter followed by make_up_user_info.

    Args:
        username: forwarded to query_task_number and add_task.
        time: window forwarded to the online filters; for offline scopes it
            is replaced by a value derived from the st..et span.
        sort_scope: selects the branch (see above).
        sort_norm: forwarded to the filters, add_task and make_up_user_info.
        arg: forwarded to add_task / in_sort_filter.
        st, et: start and end of the period, converted with datetime2ts
            (only read for offline scopes).
        isall: True to work on all users, False to work inside a scope.
        task_number: task limit; converted with int().
        number: forwarded to the filters and add_task.

    Returns:
        The string "more than limit" when the count obtained for
        query_task_number(username) is greater than task_number - 1;
        {"flag": True, "search_id": ...} when an offline task was added;
        otherwise the value returned by make_up_user_info.
    """
    task_number = int(task_number)
    # Single pre-formatted argument: prints the same text on Python 2 and 3.
    print("user_interface: %s" % (number,))

    # (task type, user range) handed to add_task for the offline scopes.
    offline_task = None
    if isall:
        if sort_scope == 'all_limit_keyword':
            offline_task = ("keyword", "all")
    elif sort_scope == 'in_limit_keyword':
        offline_task = ("keyword", "in")
    elif sort_scope == 'in_limit_hashtag':
        offline_task = ("hashtag", "in")

    if offline_task is not None:
        task_type, task_range = offline_task
        during = (datetime2ts(et) - datetime2ts(st)) / DAY + 1
        # Largest threshold first: the previous "> 3" test ran before
        # "> 16", which made the 30 branch unreachable.
        if during > 16:
            time = 30
        elif during > 3:
            time = 7
        else:
            time = 1
        running_number = es_user_portrait.count(
            index='user_rank_keyword_task',
            doc_type='user_rank_task',
            body=query_task_number(username))['count']
        if running_number > task_number - 1:
            return "more than limit"
        search_id = add_task(username, task_type, task_range, 'flow_text_',
                             during, st, et, arg, sort_norm, sort_scope,
                             time, isall, number)
        return {"flag": True, "search_id": search_id}

    # Online job.  With isall set and an unknown scope the list stays empty.
    user_list = []
    if isall:
        if sort_scope == 'all_nolimit':
            print("all_sort,  %s" % (number,))
            user_list = all_sort_filter(None, sort_norm, time, False, number)
    else:
        user_list = in_sort_filter(time, sort_norm, sort_scope, arg, [],
                                   False, number)

    result = make_up_user_info(user_list, isall, time, sort_norm)
    print("user_list: %s" % (len(user_list),))
    return result
# Example #2
# 0
def get_user_count():
    """Return the 'count' reported for a match_all query on the portrait index."""
    match_everything = {'query': {'match_all': {}}}
    response = es_user_portrait.count(index=portrait_index_name,
                                      doc_type=portrait_index_type,
                                      body=match_everything)
    return response['count']
def weibo_sort_interface(username, time, sort_scope, sort_norm, arg, st, et,
                         task_number, number):
    """Queue an offline weibo ranking task for the 'all_limit_keyword' scope.

    Args:
        username: forwarded to add_task.
        time: ignored on input; always replaced by a value derived from the
            st..et span.
        sort_scope: 'all_limit_keyword' adds a task; any other value
            (including 'all_nolimit') falls through to an empty list.
        sort_norm, arg, number: forwarded to add_task.
        st, et: start and end of the period, converted with datetime2ts.
        task_number: task limit; converted with int().

    Returns:
        "more than limit" when the number of tasks whose status is 0 or -1
        is greater than task_number - 1; {"flag": True, "search_id": ...}
        when a task was added; otherwise an empty list.
    """
    task_number = int(task_number)
    # Single pre-formatted argument: prints the same text on Python 2 and 3.
    print("user_interface: %s" % (number,))

    weibo_list = []
    during = (datetime2ts(et) - datetime2ts(st)) / DAY + 1
    # Largest threshold first: the previous "> 3" test ran before "> 16",
    # which made the 30 branch unreachable.
    if during > 16:
        time = 30
    elif during > 3:
        time = 7
    else:
        time = 1

    if sort_scope == 'all_limit_keyword':
        # Count the tasks whose status is 0 or -1.
        query_body = {
            "query": {
                "terms": {
                    "status": [0, -1]
                }
            }
        }
        running_number = es_weibo_portrait.count(
            index=WEIBO_RANK_KEYWORD_TASK_INDEX,
            doc_type=WEIBO_RANK_KEYWORD_TASK_TYPE,
            body=query_body)['count']
        if running_number > task_number - 1:
            return "more than limit"
        search_id = add_task(username, type="keyword", during=during, st=st,
                             et=et, arg=arg, sort_norm=sort_norm,
                             sort_scope=sort_scope, time=time, number=number)
        return {"flag": True, "search_id": search_id}

    # 'all_nolimit' has no implementation yet: the empty list is returned.
    return weibo_list
# Example #4
# 0
def get_user_count():
    """Return the 'count' that es reports for a match_all query on index_name."""
    query_body = dict(query=dict(match_all=dict()))
    return es.count(index=index_name,
                    doc_type=index_type,
                    body=query_body)['count']
# Example #5
# 0
def get_index_rank(index_value, index_name):
    """Count 'user_portrait' documents whose field lies in [index_value, 100000].

    Args:
        index_value: lower bound ('from') of the range query.
        index_name: name of the document field the range is applied to.

    Returns:
        The 'count' of the response, or 0 (after printing an error message)
        when the response reports no successful shard.
    """
    query_body = {
        'query': {
            'range': {
                index_name: {
                    'from': index_value,
                    'to': 100000
                }
            }
        }
    }
    index_rank = es.count(index='user_portrait', doc_type='user',
                          body=query_body)
    if index_rank['_shards']['successful'] != 0:
        return index_rank['count']
    # The old error branch assigned to a misspelled `results` variable; the
    # fallback value is now returned explicitly.
    print('es index rank error')
    return 0
# Example #6
# 0
def get_influence_top_count(top_threshold, user_count):
    """Return the share of users whose influence is in [top_threshold, 3000).

    The matching documents are counted in the 'user_portrait' index and the
    count is divided by user_count; the ratio is returned under the key
    'top_influence_ratio'.
    """
    influence_range = {'gte': top_threshold, 'lt': 3000}
    range_filter = {'range': {'influence': influence_range}}
    filtered_query = {'query': {'match_all': {}}, 'filter': range_filter}
    response = es.count(index='user_portrait',
                        doc_type='user',
                        body={'query': {'filtered': filtered_query}})
    top_total = response['count']
    ratio = float(top_total) / user_count
    return {'top_influence_ratio': ratio}
# Example #7
# 0
def get_influence_top_count(top_threshold, user_count):
    """Compute the ratio of portrait users with influence in [top_threshold, 3000).

    Returns a dict with the single key 'top_influence_ratio', holding the
    matching document count divided by user_count.
    """
    body = {'query': {'filtered': {}}}
    body['query']['filtered']['query'] = {'match_all': {}}
    body['query']['filtered']['filter'] = {
        'range': {'influence': {'gte': top_threshold, 'lt': 3000}}
    }
    matched = es_user_portrait.count(index=portrait_index_name,
                                     doc_type=portrait_index_type,
                                     body=body)
    return {'top_influence_ratio': float(matched['count']) / user_count}
def weibo_sort_interface(username, time, sort_scope, sort_norm, arg, st, et,
                         task_number, number):
    """Register an offline weibo ranking task when sort_scope asks for one.

    Args:
        username: forwarded to add_task.
        time: overwritten before use with 1, 7 or 30 depending on the length
            of the st..et span.
        sort_scope: only 'all_limit_keyword' triggers a task; 'all_nolimit'
            and every other value return an empty list.
        sort_norm, arg, number: forwarded to add_task.
        st, et: period boundaries, converted with datetime2ts.
        task_number: task limit; converted with int().

    Returns:
        "more than limit" if the count of tasks with status 0 or -1 exceeds
        task_number - 1, {"flag": True, "search_id": ...} once a task has
        been added, and an empty list in every other case.
    """
    task_number = int(task_number)
    # Single pre-formatted argument: prints the same text on Python 2 and 3.
    print("user_interface: %s" % (number,))

    during = (datetime2ts(et) - datetime2ts(st)) / DAY + 1
    # Thresholds are tested from the largest down.  The earlier order
    # ("> 3" before "> 16") could never select 30.
    time = 1
    for lower_bound, window in ((16, 30), (3, 7)):
        if during > lower_bound:
            time = window
            break

    if sort_scope != 'all_limit_keyword':
        # 'all_nolimit' is not implemented: nothing to rank.
        return []

    query_body = {"query": {"terms": {"status": [0, -1]}}}
    running_number = es_weibo_portrait.count(
        index=WEIBO_RANK_KEYWORD_TASK_INDEX,
        doc_type=WEIBO_RANK_KEYWORD_TASK_TYPE,
        body=query_body)['count']
    if running_number > task_number - 1:
        return "more than limit"

    search_id = add_task(username,
                         type="keyword",
                         during=during,
                         st=st,
                         et=et,
                         arg=arg,
                         sort_norm=sort_norm,
                         sort_scope=sort_scope,
                         time=time,
                         number=number)
    return {"flag": True, "search_id": search_id}
# Example #9
# 0
def user_sort_interface(username,
                        time,
                        sort_scope,
                        sort_norm,
                        arg=None,
                        st=None,
                        et=None,
                        isall=False,
                        task_number=0,
                        number=100):
    """Answer a user ranking request online or by queueing an offline task.

    Scopes 'all_limit_keyword' (isall true) and 'in_limit_keyword' /
    'in_limit_hashtag' (isall false) add a task with add_task.  All other
    requests are served through all_sort_filter / in_sort_filter and
    make_up_user_info.

    Args:
        username: forwarded to add_task.
        time: window forwarded to the online filters; replaced by 7 or 30
            for offline scopes.
        sort_scope: selects the branch (see above).
        sort_norm: forwarded to the filters, add_task and make_up_user_info.
        arg: forwarded to add_task / in_sort_filter.
        st, et: period boundaries, converted with datetime2ts (offline
            scopes only).
        isall: True for all users, False for a restricted scope.
        task_number: task limit; converted with int().
        number: forwarded to the filters and add_task.

    Returns:
        "more than limit" when the task count is greater than
        task_number - 1, {"flag": True, "search_id": ...} after a task was
        added, otherwise the value returned by make_up_user_info.
    """
    task_number = int(task_number)
    # Single pre-formatted argument: prints the same text on Python 2 and 3.
    print("user_interface: %s" % (number,))

    # Offline scopes mapped to the (task type, user range) given to add_task.
    if isall:
        offline_scopes = (('all_limit_keyword', "keyword", "all"),)
    else:
        offline_scopes = (('in_limit_keyword', "keyword", "in"),
                          ('in_limit_hashtag', "hashtag", "in"))

    for scope_name, task_type, task_range in offline_scopes:
        if sort_scope != scope_name:
            continue
        during = (datetime2ts(et) - datetime2ts(st)) / DAY + 1
        # The window used to be stuck at 7: "> 16" was tested only after
        # "> 3" had already matched.  Spans of 16 or less still use 7.
        time = 30 if during > 16 else 7
        # NOTE(review): query_body is not defined in this function; it is
        # presumably a module-level query - confirm it exists.
        running_number = es_user_portrait.count(
            index='user_rank_keyword_task',
            doc_type='user_rank_task',
            body=query_body)['count']
        if running_number > task_number - 1:
            return "more than limit"
        search_id = add_task(username, task_type, task_range, 'flow_text_',
                             during, st, et, arg, sort_norm, sort_scope,
                             time, isall, number)
        return {"flag": True, "search_id": search_id}

    # Online job.  With isall set and an unknown scope the list stays empty.
    user_list = []
    if isall:
        if sort_scope == 'all_nolimit':
            print("all_sort,  %s" % (number,))
            user_list = all_sort_filter(None, sort_norm, time, False, number)
    else:
        user_list = in_sort_filter(time, sort_norm, sort_scope, arg, [],
                                   False, number)

    result = make_up_user_info(user_list, isall, time, sort_norm)
    print("user_list: %s" % (len(user_list),))
    return result
# Example #10
# 0
def _portrait_terms_buckets(field, size=None):
    """Return the buckets of a terms aggregation on *field* over the portrait index."""
    terms = {"field": field}
    if size is not None:
        terms["size"] = size
    query_body = {
        "query": {
            "match_all": {}
        },
        "aggs": {
            "all_interests": {
                "terms": terms
            }
        }
    }
    response = es_user_portrait.search(index=portrait_index_name,
                                       doc_type=portrait_index_type,
                                       body=query_body)
    return response["aggregations"]['all_interests']['buckets']


def _bucket_pairs(buckets):
    """Turn buckets into [key, doc_count] pairs; no buckets yields {} as before."""
    return [[item['key'], item['doc_count']] for item in buckets] or {}


def get_scan_results_v2():
    """Build overview statistics of the portrait index from aggregations.

    Returns:
        A dict with the JSON-encoded entries 'gender_ratio',
        'verified_ratio', 'location_top', 'activity_geo_top',
        'keywords_top', 'hashtag_top', 'topic_top', 'domain_top',
        'domain_top_user', 'topic_top_user' and 'online_pattern_top', plus
        the float 'activity_count' (1 minus the share of documents whose
        influence is 0).
    """
    result_dict = dict()

    # gender ratio count
    gender_result = dict()
    for item in _portrait_terms_buckets("gender"):
        gender_result[str(item['key'])] = item['doc_count']
    count = sum(gender_result.values())
    if count == 0:
        gender_ratio = {'1': 0.5, '2': 0.5}
    else:
        # .get: a gender with no bucket used to raise KeyError.
        gender_ratio = {
            '1': float(gender_result.get('1', 0)) / count,
            '2': float(gender_result.get('2', 0)) / count
        }
    result_dict['gender_ratio'] = json.dumps(gender_ratio)

    # verified ratio count
    verified_result = dict()
    for item in _portrait_terms_buckets("verified"):
        verified_result[item['key']] = item['doc_count']
    count = sum(verified_result.values())
    if count == 0:
        verified_ratio = {'yes': 0.5, 'no': 0.5}
    else:
        # NOTE(review): the bucket keys '' and 'unknown' are kept from the
        # previous code - confirm them against the index content.
        verified_ratio = {
            'yes': float(verified_result.get('', 0)) / count,
            'no': float(verified_result.get('unknown', 0)) / count
        }
    result_dict['verified_ratio'] = json.dumps(verified_ratio)

    # location top
    location_top = _bucket_pairs(_portrait_terms_buckets("location", 5))
    result_dict['location_top'] = json.dumps(location_top)

    # activity geo, ordered by descending doc_count
    activity_geo_top = sorted(
        _bucket_pairs(_portrait_terms_buckets("activity_geo_aggs", 50)),
        key=lambda pair: pair[1],
        reverse=True)
    # Single pre-formatted argument: prints the same text on Python 2 and 3.
    print("activity_geo_top: %s" % (activity_geo_top,))
    result_dict['activity_geo_top'] = json.dumps(activity_geo_top)

    # keywords
    keywords_top = _bucket_pairs(
        _portrait_terms_buckets("keywords_string", 50))
    result_dict['keywords_top'] = json.dumps(keywords_top)

    # hashtag top, ordered by descending doc_count
    hashtag_top = sorted(
        _bucket_pairs(_portrait_terms_buckets("hashtag", 50)),
        key=lambda pair: pair[1],
        reverse=True)
    result_dict['hashtag_top'] = json.dumps(hashtag_top)

    # topic top
    topic_top = _bucket_pairs(_portrait_terms_buckets("topic_string", 50))
    result_dict['topic_top'] = json.dumps(topic_top)

    # domain top
    domain_top = _bucket_pairs(_portrait_terms_buckets("domain", 20))
    result_dict['domain_top'] = json.dumps(domain_top)
    result_dict['domain_top_user'] = json.dumps(
        get_domain_top_user(domain_top))
    result_dict['topic_top_user'] = json.dumps(get_topic_top_user(topic_top))

    # online pattern top
    online_pattern_top = _bucket_pairs(
        _portrait_terms_buckets("online_pattern_aggs", 50))
    result_dict['online_pattern_top'] = json.dumps(online_pattern_top)

    # activity_count
    no_activity_count = es_user_portrait.count(
        index=portrait_index_name,
        doc_type=portrait_index_type,
        body={'query': {'filtered': {'filter': {'term': {'influence': 0}}}}}
    )['count']
    all_count = es_user_portrait.count(
        index=portrait_index_name,
        doc_type=portrait_index_type,
        body={'query': {'match_all': {}}})['count']
    if all_count:
        result_dict['activity_count'] = (
            1 - float(no_activity_count) / all_count)
    else:
        # Empty index: report no activity instead of dividing by zero.
        result_dict['activity_count'] = 0.0

    return result_dict
# Example #11
# 0
def get_scan_results():
    """Scan every portrait document and build overview statistics.

    Each scanned document updates per-field counters (gender, verified,
    location, activity geo, keywords, hashtag, topic, online pattern,
    domain).  When the scan iterator is exhausted the counters are turned
    into ratios / top lists, stored JSON-encoded in result_dict (the
    'activity_count' entry is a plain float) and returned.

    If any other exception escapes the per-document processing, it is
    printed and result_dict is returned as it is at that point; it is only
    filled in the StopIteration handler, so it is still empty.
    """
    result_dict = {}
    # Only the keys preset here are counted for gender / verified; any other
    # value raises KeyError below and is tallied in the matching no_* counter.
    gender_result = {'1': 0, '2': 0}
    verified_result = {'yes': 0, 'no': 0}
    location_result = {}
    activity_geo_result = {}
    keywords_result = {}
    hashtag_result = {}
    topic_result = {}
    online_pattern_result = {}
    domain_result = {}
    # Counters of documents for which a field could not be processed.
    # They are incremented below but never read in the visible code.
    no_gender_count = 0
    no_verified_count = 0
    no_location_count = 0
    no_activity_geo_count = 0
    no_keywords_count = 0
    no_hashtag_count = 0
    no_topic_count = 0
    no_online_pattern_count = 0
    no_domain_count = 0
    s_re = scan(es_user_portrait,
                query={
                    'query': {
                        'match_all': {}
                    },
                    'size': 100
                },
                index=portrait_index_name,
                doc_type=portrait_index_type)
    print 's_re:', s_re
    activity_count = 0
    while True:
        portrait_uid_list = []
        # The inner loop has no break: it only ends through one of the
        # return statements in the handlers below.
        while True:
            try:
                scan_re = s_re.next()['_source']
                # gender ratio count
                portrait_uid_list.append(scan_re['uid'])
                try:
                    gender_result[str(scan_re['gender'])] += 1
                except:
                    no_gender_count += 1
                # verified ratio count
                try:
                    verified_result[str(scan_re['verified'])] += 1
                except:
                    no_verified_count += 1
                # location top
                try:
                    location = scan_re['location']
                    # keep only the part before the first space
                    if len(location.split(' ')) > 1:
                        location = location.split(' ')[0]
                    try:
                        location_result[location] += 1
                    except:
                        location_result[location] = 1
                except:
                    no_location_count += 1
                # activity geo
                try:
                    activity_geo = scan_re['activity_geo_dict']
                    if activity_geo:
                        # only the last element of the decoded JSON is used
                        activity_geo_dict = json.loads(activity_geo)[-1]
                        for geo in activity_geo_dict:
                            # keys are tab-separated; only entries whose first
                            # part is u'中国' are counted, under their second part
                            geo_list = geo.split('\t')
                            if geo_list[0] == u'中国' and len(geo_list) >= 2:
                                province = geo_list[1]
                                try:
                                    activity_geo_result[
                                        province] += activity_geo_dict[geo]
                                except:
                                    activity_geo_result[
                                        province] = activity_geo_dict[geo]
                except:
                    no_activity_geo_count += 1
                # keywords
                try:
                    keywords = json.loads(scan_re['keywords'])
                    if keywords:
                        for word in keywords:
                            try:
                                keywords_result[word] += keywords[word]
                            except:
                                keywords_result[word] = keywords[word]
                except:
                    no_keywords_count += 1
                # hashtag top
                try:
                    hashtag_dict = json.loads(scan_re['hashtag_dict'])
                    if hashtag_dict:
                        for tag in hashtag_dict:
                            try:
                                hashtag_result[tag] += hashtag_dict[tag]
                            except:
                                hashtag_result[tag] = hashtag_dict[tag]
                except:
                    no_hashtag_count += 1
                # topic top
                try:
                    topic = scan_re['topic_string']
                    if topic:
                        # topics are '&'-separated inside one string
                        topic_list = topic.split('&')
                        for item in topic_list:
                            try:
                                topic_result[item] += 1
                            except:
                                topic_result[item] = 1
                except:
                    no_topic_count += 1
                # online pattern top
                try:
                    online_pattern = json.loads(scan_re['online_pattern'])
                    if online_pattern:
                        for item in online_pattern:
                            try:
                                online_pattern_result[item] += online_pattern[
                                    item]
                            except:
                                online_pattern_result[item] = online_pattern[
                                    item]
                except:
                    no_online_pattern_count += 1
                # domain top
                try:
                    domain = scan_re['domain']
                    if domain:
                        try:
                            domain_result[domain] += 1
                        except:
                            domain_result[domain] = 1
                except:
                    no_domain_count += 1
            except StopIteration:
                # Scan exhausted: build the final statistics and return.
                print 'all done'
                now_ts = time.time()
                now_date = ts2datetime(now_ts - DAY)
                index_time = ''.join(now_date.split('-'))
                # test: the date computed above is overridden by a fixed one
                index_time = '20130907'
                # gender ratio count
                #count = sum(gender_result.values())
                all_count = es_user_portrait.count(index=portrait_index_name,doc_type=portrait_index_type,\
                    body={'query':{'match_all':{}}})['count']
                count = all_count
                print "count:", count
                # NOTE(review): count == 0 raises ZeroDivisionError here; it is
                # raised inside this handler, so the `except Exception` clause
                # below does not catch it.
                gender_ratio = {
                    '1': float(gender_result['1']) / count,
                    '2': float(gender_result['2']) / count
                }
                #print 'gender ratio:', gender_ratio
                # activity ratio: share of scanned uids found in the bci index
                activity_result = es_user_portrait.mget(
                    index='bci_' + index_time,
                    doc_type='bci',
                    body={'ids': portrait_uid_list})['docs']
                for activity_item in activity_result:
                    if activity_item['found']:
                        activity_count += 1
                #print 'activity_count:', activity_count
                result_dict['activity_count'] = float(activity_count) / count
                result_dict['gender_ratio'] = json.dumps(gender_ratio)
                # verified ratio count
                count = sum(verified_result.values())
                if count == 0:
                    verified_ratio = {'yes': 0.5, 'no': 0.5}
                else:
                    verified_ratio = {
                        'yes': float(verified_result['yes']) / count,
                        'no': float(verified_result['no']) / count
                    }
                #print 'verified ratio:', verified_ratio
                result_dict['verified_ratio'] = json.dumps(verified_ratio)
                # location top (5 most frequent)
                if location_result:
                    sort_location = sorted(location_result.items(),
                                           key=lambda x: x[1],
                                           reverse=True)
                    location_top = sort_location[:5]
                else:
                    location_top = {}
                #print 'location top:', location_top
                result_dict['location_top'] = json.dumps(location_top)
                # activity geo top (50 largest)
                if activity_geo_result:
                    sort_activity_geo = sorted(activity_geo_result.items(),
                                               key=lambda x: x[1],
                                               reverse=True)
                    activity_geo_top = sort_activity_geo[:50]
                else:
                    activity_geo_top = {}
                #print 'activity_geo_top:', activity_geo_top
                result_dict['activity_geo_top'] = json.dumps(activity_geo_top)
                # keywords top (50 largest)
                if keywords_result:
                    sort_keywords = sorted(keywords_result.items(),
                                           key=lambda x: x[1],
                                           reverse=True)
                    keywords_top = sort_keywords[:50]
                else:
                    keywords_top = {}
                #print 'keywords_top:', keywords_top
                result_dict['keywords_top'] = json.dumps(keywords_top)
                # hashtag top (50 largest)
                if hashtag_result:
                    sort_hashtag = sorted(hashtag_result.items(),
                                          key=lambda x: x[1],
                                          reverse=True)
                    hashtag_top = sort_hashtag[:50]
                else:
                    hashtag_top = {}
                #print 'hashtag top:', hashtag_top
                result_dict['hashtag_top'] = json.dumps(hashtag_top)
                # topic top (50 largest)
                if topic_result:
                    sort_topic = sorted(topic_result.items(),
                                        key=lambda x: x[1],
                                        reverse=True)
                    topic_top = sort_topic[:50]
                else:
                    topic_top = {}
                #print 'topic top:', topic_top
                result_dict['topic_top'] = json.dumps(topic_top)
                # online_pattern top (50 largest)
                if online_pattern_result:
                    sort_online_pattern = sorted(online_pattern_result.items(),
                                                 key=lambda x: x[1],
                                                 reverse=True)
                    online_pattern_top = sort_online_pattern[:50]
                else:
                    online_pattern_top = {}
                #print 'online pattern top:', online_pattern_top
                result_dict['online_pattern_top'] = json.dumps(
                    online_pattern_top)
                # domain top (20 largest)
                if domain_result:
                    sort_domain = sorted(domain_result.items(),
                                         key=lambda x: x[1],
                                         reverse=True)
                    domain_top = sort_domain[:20]
                else:
                    domain_top = {}
                result_dict['domain_top'] = json.dumps(domain_top)
                result_dict['domain_top_user'] = json.dumps(
                    get_domain_top_user(domain_top))
                result_dict['topic_top_user'] = json.dumps(
                    get_topic_top_user(topic_top))
                return result_dict
            except Exception, r:
                # Any other failure while reading a document aborts the scan.
                print Exception, r
                return result_dict
        # NOTE(review): the statements below are never reached, because the
        # inner `while True` above only exits through `return`.
        activity_result = es.mget(index='20130907',
                                  doc_type='bci',
                                  body={'ids': portrait_uid_list})['docs']
        for activity_item in activity_result:
            if activity_item['found']:
                activity_count += 1