def identify_weibo_exist(mid, weibo_timestamp):
    """Check whether a weibo is stored in the flow-text index of its day.

    The index name is ``flow_text_index_name_pre`` followed by the date
    string that ``ts2datetime(weibo_timestamp)`` returns.

    Args:
        mid: Document id of the weibo to fetch.
        weibo_timestamp: Timestamp handed to ``ts2datetime`` to pick the
            index.

    Returns:
        A ``(exist_mark, weibo_info)`` tuple. ``(True, source)`` when the
        lookup returns a non-empty ``_source``; ``(False, {})`` when the
        lookup fails or the source is empty.
    """
    index_name = flow_text_index_name_pre + ts2datetime(weibo_timestamp)
    try:
        weibo_result = es_flow_text.get(index=index_name,
                                        doc_type=flow_text_index_type,
                                        id=mid)['_source']
    except Exception:
        # Best-effort lookup: any failure of the get is reported as
        # "does not exist".  ``Exception`` (rather than a bare except) lets
        # KeyboardInterrupt and SystemExit propagate.
        weibo_result = {}

    if not weibo_result:
        return False, {}
    return True, weibo_result
Пример #2
0
def identify_weibo_exist(mid, weibo_timestamp):
    """Check whether a weibo is stored in the flow-text index of its day.

    The index name is ``flow_text_index_name_pre`` followed by the date
    string that ``ts2datetime(weibo_timestamp)`` returns.

    Args:
        mid: Document id of the weibo to fetch.
        weibo_timestamp: Timestamp handed to ``ts2datetime`` to pick the
            index.

    Returns:
        A ``(exist_mark, weibo_info)`` tuple. ``(True, source)`` when the
        lookup returns a non-empty ``_source``; ``(False, {})`` when the
        lookup fails or the source is empty.
    """
    index_name = flow_text_index_name_pre + ts2datetime(weibo_timestamp)
    try:
        weibo_result = es_flow_text.get(index=index_name,
                                        doc_type=flow_text_index_type,
                                        id=mid)['_source']
    except Exception:
        # Best-effort lookup: any failure of the get is reported as
        # "does not exist".  ``Exception`` (rather than a bare except) lets
        # KeyboardInterrupt and SystemExit propagate.
        weibo_result = {}

    if not weibo_result:
        return False, {}
    return True, weibo_result
Пример #3
0
def influenced_people(uid, mid, influence_style, date, default_number=20):
    """Summarise the users who retweeted or commented on one weibo.

    The weibo ``mid`` is fetched from the flow-text index of ``date``.  If it
    carries a non-empty ``root_mid`` it is treated as a non-original weibo and
    the search is run against that root weibo; otherwise ``mid`` itself is
    used.  The uids found are then looked up in the user-portrait index.

    Args:
        uid: Id of the user whose weibo is analysed; hits posted by this uid
            are dropped from the result.
        mid: Id of the weibo to analyse.
        influence_style: ``0`` selects retweets (``message_type`` 3); any
            other integer value selects comments (``message_type`` 2).
        date: Date string appended to ``pre_text_index`` to form the
            flow-text index name.
        default_number: Maximum number of user ids per group handed to
            ``get_user_url``.

    Returns:
        A dict with two keys:

        * ``"influence_users"``: ``[in_portrait_url, out_portrait_url]``,
          the ``get_user_url`` results for users found / not found in the
          portrait index.  Found users are ordered by descending importance.
        * ``"influence_distribution"``: dict with the top five
          ``(key, value)`` pairs under ``"domian"``, ``"topic"`` and
          ``"geo"``, the average ``"influence"`` of the found users, and the
          counts ``"in_portrait_number"`` / ``"out_portrait_number"``.
    """
    index_flow_text = pre_text_index + date
    text_result = es.get(index=index_flow_text,
                         doc_type=flow_text_index_type,
                         id=mid)["_source"]
    # A non-empty root_mid means the weibo is not an original weibo.
    root_mid = text_result.get("root_mid", '')

    is_retweet = int(influence_style) == 0
    if root_mid:
        # Non-original weibo: search under the root weibo for reactions
        # directed at uid.
        uid_field = "directed_uid"
        target_mid = root_mid
    else:
        # Original weibo: retweets are matched by root_uid, comments by
        # directed_uid.
        uid_field = "root_uid" if is_retweet else "directed_uid"
        target_mid = mid
    message_type = 3 if is_retweet else 2

    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [
                            {"term": {uid_field: uid}},
                            {"term": {"message_type": message_type}},
                            {"term": {"root_mid": target_mid}},
                        ]
                    }
                }
            }
        },
        "size": 100000
    }
    search_results = es.search(index=index_flow_text,
                               doc_type=flow_text_index_type,
                               body=query_body,
                               fields=["uid"],
                               timeout=30)["hits"]["hits"]

    # Distinct uids of the reacting users, excluding uid itself.
    influenced_uids = set()
    for hit in search_results:
        hit_uid = hit["fields"]["uid"][0]
        if int(hit_uid) != int(uid):
            influenced_uids.add(hit_uid)
    results = list(influenced_uids)

    portrait_results = []
    if results:
        portrait_results = es_user_portrait.mget(
            index=user_portrait,
            doc_type=portrait_index_type,
            body={"ids": results},
            fields=[
                "domain", "topic_string", "activity_geo_dict", "importance",
                "influence"
            ])["docs"]

    in_portrait = []  # [user id, importance] pairs of users with a portrait
    out_portrait = []  # ids of users without a portrait
    retweeted_domain = {}
    retweeted_topic = {}
    retweeted_geo = {}
    total_influence = 0

    for item in portrait_results:
        if not item["found"]:
            out_portrait.append(item['_id'])
            continue
        fields = item["fields"]
        in_portrait.append([item['_id'], fields["importance"][0]])
        total_influence += fields["influence"][0]
        user_domains = fields["domain"][0].split('&')
        user_topics = fields["topic_string"][0].split('&')
        # Only the keys of the last entry of the geo list are counted.
        user_geos = json.loads(fields["activity_geo_dict"][0])[-1].keys()
        retweeted_domain = aggregation(user_domains, retweeted_domain)
        retweeted_topic = aggregation(user_topics, retweeted_topic)
        retweeted_geo = aggregation(user_geos, retweeted_geo)

    if portrait_results:
        retweeted_domain = proportion(retweeted_domain)
        retweeted_topic = proportion(retweeted_topic)
        retweeted_geo = proportion(retweeted_geo)

    # Every found user contributed one entry, so this is the found count.
    found_count = len(in_portrait)
    if found_count:
        average_influence = total_influence / found_count
    else:
        average_influence = 0

    sorted_retweeted_domain = sorted(retweeted_domain.items(),
                                     key=lambda x: x[1],
                                     reverse=True)
    sorted_retweeted_topic = sorted(retweeted_topic.items(),
                                    key=lambda x: x[1],
                                    reverse=True)
    sorted_retweeted_geo = sorted(retweeted_geo.items(),
                                  key=lambda x: x[1],
                                  reverse=True)

    in_portrait.sort(key=lambda x: x[1], reverse=True)
    in_portrait_ids = [pair[0] for pair in in_portrait]

    retweeted_results = dict()
    # NOTE(review): "domian" is misspelled but is part of the returned
    # structure; presumably consumers read this exact key, so it is kept.
    retweeted_results["domian"] = sorted_retweeted_domain[:5]
    retweeted_results["topic"] = sorted_retweeted_topic[:5]
    retweeted_results["geo"] = sorted_retweeted_geo[:5]
    retweeted_results["influence"] = average_influence
    retweeted_results['in_portrait_number'] = len(in_portrait_ids)
    retweeted_results['out_portrait_number'] = len(out_portrait)

    in_portrait_url = get_user_url(in_portrait_ids[:default_number])
    out_portrait_url = get_user_url(out_portrait[:default_number])

    return_results = dict()
    return_results["influence_users"] = [in_portrait_url, out_portrait_url]
    return_results["influence_distribution"] = retweeted_results
    return return_results
def influenced_people(uid, mid, influence_style, date, default_number=20):
    """Summarise the users who retweeted or commented on one weibo.

    The weibo ``mid`` is fetched from the flow-text index of ``date``.  If it
    carries a non-empty ``root_mid`` it is treated as a non-original weibo and
    the search is run against that root weibo; otherwise ``mid`` itself is
    used.  The uids found are then looked up in the user-portrait index.

    Args:
        uid: Id of the user whose weibo is analysed; hits posted by this uid
            are dropped from the result.
        mid: Id of the weibo to analyse.
        influence_style: ``0`` selects retweets (``message_type`` 3); any
            other integer value selects comments (``message_type`` 2).
        date: Date string appended to ``pre_text_index`` to form the
            flow-text index name.
        default_number: Maximum number of user ids per group handed to
            ``get_user_url``.

    Returns:
        A dict with two keys:

        * ``"influence_users"``: ``[in_portrait_url, out_portrait_url]``,
          the ``get_user_url`` results for users found / not found in the
          portrait index.  Found users are ordered by descending importance.
        * ``"influence_distribution"``: dict with the top five
          ``(key, value)`` pairs under ``"domian"``, ``"topic"`` and
          ``"geo"``, the average ``"influence"`` of the found users, and the
          counts ``"in_portrait_number"`` / ``"out_portrait_number"``.
    """
    index_flow_text = pre_text_index + date
    text_result = es.get(index=index_flow_text,
                         doc_type=flow_text_index_type,
                         id=mid)["_source"]
    # A non-empty root_mid means the weibo is not an original weibo.
    root_mid = text_result.get("root_mid", '')

    is_retweet = int(influence_style) == 0
    if root_mid:
        # Non-original weibo: search under the root weibo for reactions
        # directed at uid.
        uid_field = "directed_uid"
        target_mid = root_mid
    else:
        # Original weibo: retweets are matched by root_uid, comments by
        # directed_uid.
        uid_field = "root_uid" if is_retweet else "directed_uid"
        target_mid = mid
    message_type = 3 if is_retweet else 2

    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [
                            {"term": {uid_field: uid}},
                            {"term": {"message_type": message_type}},
                            {"term": {"root_mid": target_mid}},
                        ]
                    }
                }
            }
        },
        "size": 100000
    }
    search_results = es.search(index=index_flow_text,
                               doc_type=flow_text_index_type,
                               body=query_body,
                               fields=["uid"],
                               timeout=30)["hits"]["hits"]

    # Distinct uids of the reacting users, excluding uid itself.
    influenced_uids = set()
    for hit in search_results:
        hit_uid = hit["fields"]["uid"][0]
        if int(hit_uid) != int(uid):
            influenced_uids.add(hit_uid)
    results = list(influenced_uids)

    portrait_results = []
    if results:
        portrait_results = es_user_portrait.mget(
            index=user_portrait,
            doc_type=portrait_index_type,
            body={"ids": results},
            fields=[
                "domain", "topic_string", "activity_geo_dict", "importance",
                "influence"
            ])["docs"]

    in_portrait = []  # [user id, importance] pairs of users with a portrait
    out_portrait = []  # ids of users without a portrait
    retweeted_domain = {}
    retweeted_topic = {}
    retweeted_geo = {}
    total_influence = 0

    for item in portrait_results:
        if not item["found"]:
            out_portrait.append(item['_id'])
            continue
        fields = item["fields"]
        in_portrait.append([item['_id'], fields["importance"][0]])
        total_influence += fields["influence"][0]
        user_domains = fields["domain"][0].split('&')
        user_topics = fields["topic_string"][0].split('&')
        # Only the keys of the last entry of the geo list are counted.
        user_geos = json.loads(fields["activity_geo_dict"][0])[-1].keys()
        retweeted_domain = aggregation(user_domains, retweeted_domain)
        retweeted_topic = aggregation(user_topics, retweeted_topic)
        retweeted_geo = aggregation(user_geos, retweeted_geo)

    if portrait_results:
        retweeted_domain = proportion(retweeted_domain)
        retweeted_topic = proportion(retweeted_topic)
        retweeted_geo = proportion(retweeted_geo)

    # Every found user contributed one entry, so this is the found count.
    found_count = len(in_portrait)
    if found_count:
        average_influence = total_influence / found_count
    else:
        average_influence = 0

    sorted_retweeted_domain = sorted(retweeted_domain.items(),
                                     key=lambda x: x[1],
                                     reverse=True)
    sorted_retweeted_topic = sorted(retweeted_topic.items(),
                                    key=lambda x: x[1],
                                    reverse=True)
    sorted_retweeted_geo = sorted(retweeted_geo.items(),
                                  key=lambda x: x[1],
                                  reverse=True)

    in_portrait.sort(key=lambda x: x[1], reverse=True)
    in_portrait_ids = [pair[0] for pair in in_portrait]

    retweeted_results = dict()
    # NOTE(review): "domian" is misspelled but is part of the returned
    # structure; presumably consumers read this exact key, so it is kept.
    retweeted_results["domian"] = sorted_retweeted_domain[:5]
    retweeted_results["topic"] = sorted_retweeted_topic[:5]
    retweeted_results["geo"] = sorted_retweeted_geo[:5]
    retweeted_results["influence"] = average_influence
    retweeted_results['in_portrait_number'] = len(in_portrait_ids)
    retweeted_results['out_portrait_number'] = len(out_portrait)

    in_portrait_url = get_user_url(in_portrait_ids[:default_number])
    out_portrait_url = get_user_url(out_portrait[:default_number])

    return_results = dict()
    return_results["influence_users"] = [in_portrait_url, out_portrait_url]
    return_results["influence_distribution"] = retweeted_results
    return return_results