Пример #1
0
def personTopic2leveldb(keyword_limit=50):
    """Accumulate per-user term counts from every weibo into LevelDB.

    Iterates the documents returned by ``xapian_search_weibo.iter_all_docs``,
    segments each text with ``cut(scws, text, f='n')`` and stores, under the
    key ``str(uid)`` in ``daily_profile_person_topic_db``, a JSON object that
    maps each term to the number of times it has been seen for that user.

    Writes are grouped into LevelDB write batches that are committed every
    10000 weibos and once more after the loop, so the trailing partial batch
    is persisted as well.

    Args:
        keyword_limit: Not used by this function; kept so that callers which
            pass it keep working.
    """
    # Original timing note: about 0.6 seconds per 10000 weibos.
    weibos = xapian_search_weibo.iter_all_docs(fields=['user', 'text'])

    count = 0
    ts = te = time.time()
    batch = leveldb.WriteBatch()
    # Term counts queued in ``batch`` but not committed yet.  ``Get`` only
    # sees committed data, so without this cache a user appearing twice in
    # one batch would have the first update overwritten by the second.
    pending = {}
    for weibo in weibos:
        if count % 10000 == 0:
            te = time.time()
            daily_profile_person_topic_db.Write(batch, sync=True)
            batch = leveldb.WriteBatch()
            pending = {}
            print('%s %s sec' % (count, te - ts))
            ts = te

        key = str(weibo['user'])
        text = _utf_encode(weibo['text'])
        terms = cut(scws, text, f='n')

        try:
            term_counts = pending[key]
        except KeyError:
            try:
                term_counts = json.loads(daily_profile_person_topic_db.Get(key))
            except KeyError:
                term_counts = {}
            pending[key] = term_counts

        for term in terms:
            # json.loads returns unicode keys.  Presumably ``cut`` yields
            # UTF-8 byte strings (TODO confirm); a non-ASCII byte string
            # never matches its unicode twin, which would restart the count
            # at 1, so normalise before the lookup.
            if isinstance(term, bytes):
                term = term.decode('utf-8', 'ignore')
            term_counts[term] = term_counts.get(term, 0) + 1

        batch.Put(key, json.dumps(term_counts))

        count += 1

    # Commit whatever is left since the last multiple of 10000.
    daily_profile_person_topic_db.Write(batch, sync=True)
Пример #2
0
def make_network(topic, date, window_size, max_size=100000, ts=False):
    """Build the directed repost graph for a topic inside a time window.

    The window ends at ``datetime2ts(date)`` and spans
    ``window2time(window_size)`` before it.  Statuses matching the segmented
    topic are fetched through ``statuses_search.search``; for each status
    that has a ``retweeted_status`` an edge ``reposter -> source author`` is
    added, unless either user is reported by ``is_in_trash_list``.

    Args:
        topic: Topic text; it is UTF-8 encoded and segmented with ``cut``.
        date: End of the window, converted with ``datetime2ts``.
        window_size: Window length, converted with ``window2time``.
        max_size: Passed to the search as ``max_offset``.
        ts: When true, the ``timestamp`` field is fetched too and the
            smallest timestamp seen for every user is tracked.

    Returns:
        ``(uid_ts, g)`` when ``ts`` is true, otherwise just ``g``, where
        ``g`` is an ``nx.DiGraph`` and ``uid_ts`` maps uid -> earliest
        timestamp.
    """
    end_time = datetime2ts(date)
    start_time = end_time - window2time(window_size)

    g = nx.DiGraph()

    # need repost index
    topic = cut(s, topic.encode('utf-8'))
    query_dict = {'text': topic, 'timestamp': {'$gt': start_time, '$lt': end_time}}

    if ts:
        wanted_fields = ['text', 'user', 'timestamp', 'retweeted_status']
    else:
        wanted_fields = ['text', 'user', 'retweeted_status']
    count, get_statuses_results = statuses_search.search(
        query=query_dict, field=wanted_fields, max_offset=max_size)
    print('topic statuses count %s' % count)

    uid_ts = {}
    for status in get_statuses_results():
        # Statuses with missing fields or an unresolvable source are skipped.
        try:
            if not status['retweeted_status']:
                continue
            repost_uid = status['user']
            rt_mid = status['retweeted_status']
            if ts:
                repost_ts = int(status['timestamp'])
            source_status = acquire_status_by_id(rt_mid)
            source_uid = source_status['user']
            if ts:
                source_ts = int(source_status['timestamp'])
            if is_in_trash_list(repost_uid) or is_in_trash_list(source_uid):
                continue
            if ts:
                # Keep the earliest timestamp per user, reposter first.
                for seen_uid, seen_at in ((repost_uid, repost_ts), (source_uid, source_ts)):
                    if seen_uid not in uid_ts or uid_ts[seen_uid] > seen_at:
                        uid_ts[seen_uid] = seen_at
            g.add_edge(repost_uid, source_uid)
        except (TypeError, KeyError):
            continue

    if ts:
        return uid_ts, g
    return g
Пример #3
0
def batch_handle_domain():
    """Aggregate per-domain activity counters and keyword counts.

    For every weibo from ``xapian_search_weibo.iter_all_docs`` the author's
    domain is resolved with ``userLeveldb2Domain`` and two things are updated:

    * ``daily_profile_domain_db[str(domain)]``: four integer counters
      (active, important, reposts, original) joined into one string.  These
      updates go through LevelDB write batches that are committed every
      10000 weibos and once more after the loop.
    * ``daily_domain_keywords_db[int(domain)]``: one counter per term
      returned by ``cut(scws, text, f='n')``, written immediately with
      ``Put``.
    """
    weibos = xapian_search_weibo.iter_all_docs(fields=[
        'user', 'text', 'retweeted_mid', 'reposts_count', 'comments_count'])
    # Same value as the '_\/' literal, spelled without the invalid escape.
    sep = '_\\/'

    count = 0
    ts = te = time.time()
    batch = leveldb.WriteBatch()
    # Counters queued in ``batch`` but not committed yet.  ``Get`` only sees
    # committed data, so without this cache every weibo of a batch would
    # start from the same stale value and all but the last update per domain
    # would be lost.
    pending = {}
    for weibo in weibos:
        if count % 10000 == 0:
            te = time.time()
            daily_profile_domain_db.Write(batch, sync=True)
            batch = leveldb.WriteBatch()
            pending = {}
            print('%s %s sec  %s daily domain' % (count, te - ts, batch_date_1))
            ts = te

        uid = weibo['user']
        domain = userLeveldb2Domain(uid)
        retweeted_mid = weibo['retweeted_mid']
        reposts_count = weibo['reposts_count']
        comments_count = weibo['comments_count']
        text = weibo['text']

        key = str(domain)
        try:
            stats = pending[key]
        except KeyError:
            try:
                active, important, reposts, original = \
                    daily_profile_domain_db.Get(key).split(sep)
                stats = {
                    'active': int(active),
                    'important': int(important),
                    'reposts': int(reposts),
                    'original': int(original),
                }
            except KeyError:
                # Domain not stored yet: start from zero.
                stats = {'active': 0, 'important': 0, 'reposts': 0, 'original': 0}
            pending[key] = stats

        stats['active'] += 1
        stats['important'] += reposts_count + comments_count

        # A retweeted_mid of 0 marks the weibo as an original post.
        if retweeted_mid != 0:
            stats['reposts'] += 1
        else:
            stats['original'] += 1

        value = sep.join([
            str(stats['active']),
            str(stats['important']),
            str(stats['reposts']),
            str(stats['original']),
        ])
        batch.Put(key, value)

        keywords_db = daily_domain_keywords_db[int(domain)]
        terms = cut(scws, _utf_encode(text), f='n')
        for term in terms:
            term_key = str(term)
            try:
                seen = int(keywords_db.Get(term_key))
            except KeyError:
                seen = 0
            keywords_db.Put(term_key, str(seen + 1))

        count += 1

    # Commit whatever is left since the last multiple of 10000.
    daily_profile_domain_db.Write(batch, sync=True)
Пример #4
0
def calc_roeik(
    retweeted_mid, retweeted_uid, text, reposts, original, emoticon, direct_interact, retweeted_interact, keywords_dict
):
    """Update repost/original/emoticon counters and interaction/keyword strings.

    Args:
        retweeted_mid: Compared against 0; a non-zero value counts the weibo
            as a repost, 0 counts it as an original.
        retweeted_uid: When non-zero it is appended to ``retweeted_interact``.
        text: Weibo text; byte strings are decoded as UTF-8 (errors ignored).
        reposts, original, emoticon: Running counters to increment.
        direct_interact: Running separator-joined string of the first
            ``//@name:`` mention found in each text.
        retweeted_interact: Running separator-joined string of retweeted uids.
        keywords_dict: Running separator-joined string of terms from ``cut``.

    Returns:
        ``(reposts, original, emoticon, interact_dict, keywords_dict)`` where
        ``interact_dict`` has the keys ``"direct"`` and ``"retweeted"``.
    """
    sep = "\_/"

    if retweeted_mid != 0:
        reposts += 1
    else:
        original += 1

    if emoticon_find(text):
        emoticon += 1

    if isinstance(text, str):
        text = text.decode("utf-8", "ignore")

    mention_re = re.compile(u"//@([a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+):", re.UNICODE)
    mentioned = mention_re.findall(text)

    if mentioned:
        # Only the first mention in the text is recorded.
        first_mention = mentioned[0]
        direct_interact = str(direct_interact.encode("utf-8")) + sep + str(first_mention.encode("utf-8"))

    if retweeted_uid != 0:
        retweeted_interact = str(retweeted_interact) + sep + str(retweeted_uid)

    interact_dict = {"direct": direct_interact, "retweeted": retweeted_interact}

    terms = cut(scws, _utf_encode(text), f="n")
    keywords_dict += sep + sep.join(terms)

    return reposts, original, emoticon, interact_dict, keywords_dict
Пример #5
0
def triple_classifier(tweet):
    """Classify the sentiment of ``tweet['text']``.

    The result of ``emoticon(text)`` wins whenever it is non-zero.  Otherwise
    the text is segmented with ``cut`` and scored in two stages: per-class
    products of ``step1_score`` over the ``dictionary_1`` bag of words decide
    whether stage two runs, and stage two does the same with ``step2_score``
    and ``dictionary_2`` over three classes.

    Returns:
        The emoticon result, or ``HAPPY`` / ``SAD`` / ``ANGRY`` when one
        stage-two product is strictly the largest, or 0 otherwise.
    """
    text = tweet['text']

    emoticon_label = emoticon(text)
    if emoticon_label != 0:
        return emoticon_label
    if text == u'':
        return 0

    tokens = [piece.decode('utf-8', 'ignore')
              for piece in cut(cut_str, text.encode('utf-8'))]

    # Stage one: two classes; stage two only runs when class 0 does not win.
    first, second = 1, 1
    for pair in dictionary_1.doc2bow(tokens):
        weights = step1_score[pair[0]]
        first = first * (weights[0] ** pair[1])
        second = second * (weights[1] ** pair[1])
    if not (first <= second):
        return 0

    # Stage two: three classes, the strict maximum decides.
    happy, sad, angry = 1, 1, 1
    for pair in dictionary_2.doc2bow(tokens):
        weights = step2_score[pair[0]]
        happy = happy * (weights[0] ** pair[1])
        sad = sad * (weights[1] ** pair[1])
        angry = angry * (weights[2] ** pair[1])

    if happy > sad and happy > angry:
        return HAPPY
    if sad > happy and sad > angry:
        return SAD
    if angry > sad and angry > happy:
        return ANGRY
    return 0
def triple_classifier(tweet):
    """Return a sentiment label for ``tweet['text']``.

    A non-zero ``emoticon(text)`` result is returned directly.  For any other
    non-empty text the segmented terms are scored by multiplying, per class,
    ``score ** term_frequency`` over the bag of words: first with
    ``dictionary_1`` / ``step1_score`` (two classes), then, if the first
    product does not exceed the second, with ``dictionary_2`` /
    ``step2_score`` (three classes mapped to ``HAPPY``, ``SAD``, ``ANGRY``).
    The label stays 0 unless one of the three products is strictly largest.
    """
    def class_products(bow, table, n_classes):
        # Per-class product of table[term_id][k] ** frequency over the bag.
        products = [1] * n_classes
        for pair in bow:
            for k in range(n_classes):
                products[k] = products[k] * (table[pair[0]][k] ** pair[1])
        return products

    label = 0
    content = tweet['text']

    by_emoticon = emoticon(content)
    if by_emoticon != 0:
        # The emoticon decides; blank the text so the model is skipped.
        label = by_emoticon
        content = u''

    if content != u'':
        words = [w.decode('utf-8', 'ignore') for w in cut(cut_str, content.encode('utf-8'))]
        stage_one = class_products(dictionary_1.doc2bow(words), step1_score, 2)
        if stage_one[0] <= stage_one[1]:
            p_happy, p_sad, p_angry = class_products(
                dictionary_2.doc2bow(words), step2_score, 3)
            if p_happy > p_sad and p_happy > p_angry:
                label = HAPPY
            elif p_sad > p_happy and p_sad > p_angry:
                label = SAD
            elif p_angry > p_sad and p_angry > p_happy:
                label = ANGRY

    return label
Пример #7
0
def bi_classification(mid_text):
    """Classify every text in ``mid_text`` and write the labels to a file.

    One line ``"<mid> <sentiment>"`` is written to ``bi_sentiment.txt`` (in
    the current working directory, truncating any previous content) for each
    entry.  The sentiment is 0 unless the two-stage scoring below picks
    ``HAPPY`` or ``ANGRY``.  Progress and elapsed time are printed every
    10000 entries.

    Args:
        mid_text: Dict mapping a message id to its text.
    """
    dictionary_1, dictionary_2, step1_score, step2_score = get_dictionary()
    iter_count = 0
    ts = te = time.time()
    # ``with`` guarantees the file is closed even if scoring raises.
    with open('bi_sentiment.txt', 'w') as f_senti:
        for mid, text in mid_text.iteritems():
            if iter_count % 10000 == 0:
                te = time.time()
                print('%s %s sec' % (iter_count, te - ts))
                ts = te
            iter_count += 1
            sentiment = 0
            id_str = str(mid)

            if text != '':
                entry = cut(cut_str, text)

                # Stage one: per-class product of score ** term frequency.
                bow = dictionary_1.doc2bow(entry)
                sub_score = [1, 1]
                for pair in bow:
                    sub_score[0] *= (step1_score[pair[0]][0] ** pair[1])
                    sub_score[1] *= (step1_score[pair[0]][1] ** pair[1])

                # Stage two only runs when class 1 strictly beats class 0.
                if sub_score[0] < sub_score[1]:
                    s_bow = dictionary_2.doc2bow(entry)
                    score2 = [1, 1]
                    for pair in s_bow:
                        score2[0] *= (step2_score[pair[0]][0] ** pair[1])
                        score2[1] *= (step2_score[pair[0]][1] ** pair[1])
                    if score2[0] > score2[1]:
                        sentiment = HAPPY
                    elif score2[1] > score2[0]:
                        sentiment = ANGRY

            f_senti.write('%s %s\n' % (id_str, sentiment))
Пример #8
0
 def cut_text(item):
     """Segment ``item['text']`` and store the result under ``item['terms']``.

     The text is UTF-8 encoded before being handed to ``cut`` (called with
     ``cx=False``).  The same ``item`` object is returned after the update.
     """
     encoded = item['text'].encode('utf-8')
     item['terms'] = cut(s, encoded, cx=False)
     return item
        # It really does happen that the retweeted_status is missing from our data
        misskey_err_count += 1

    if if_emoticoned:
        if_emoticoned = int(if_emoticoned)
    if if_emoticoned == 1:
        emoticon_sentiment = emoticon(zan, angry, sad, r['text'])
        if emoticon_sentiment == 1 or emoticon_sentiment == 2:
            sentiment = emoticon_sentiment
        else:
            text = r['text']
    else:
        text = r['text']

    if text != '':
        entries = cut(cut_str, text)
        entry = [e.decode('utf-8') for e in entries]
        bow = dictionary.doc2bow(entry)
        s = [1, 1, 1]
        for pair in bow:
            s[0] = s[0] * (p_senti[pair[0]][0] ** pair[1])
            s[1] = s[1] * (p_senti[pair[0]][1] ** pair[1])
            s[2] = s[2] * (p_senti[pair[0]][2] ** pair[1])
        if s[0] > s[1] and s[0] > s[2]:
            sentiment = HAPPY
        elif s[1] > s[0] and s[1] > s[2]:
            sentiment = ANGRY
        elif s[2] > s[1] and s[2] > s[0]:
            sentiment = SAD

    # Whether the weibo is a repost
Пример #10
0
def profile_person_cal(itemdict):
    """Fold one weibo into the stored daily profile of its author.

    Three LevelDB stores keyed by ``str(uid)`` are read, updated and written
    back with ``Put``:

    * ``daily_profile_keywords_db``: JSON object, term -> occurrence count
      for the terms returned by ``cut(scws, text, f='n')``.
    * ``daily_profile_interact_db``: JSON object with ``'direct'`` (first
      ``//@name:`` mention in the text -> count) and ``'retweeted'``
      (retweeted uid -> count).
    * ``daily_profile_counts_db``: the active, important, reposts, original
      and emoticon counters joined into one separator-delimited string.

    A missing or unparsable keywords/interact record is treated as empty; a
    missing counts record starts from zero.

    Args:
        itemdict: Mapping providing ``'user'``, ``'retweeted_mid'``,
            ``'reposts_count'``, ``'comments_count'``, ``'text'`` and
            ``'retweeted_uid'``.
    """
    uid = itemdict['user']
    retweeted_mid = itemdict['retweeted_mid']
    reposts_count = itemdict['reposts_count']
    # NOTE(review): read but never used; only reposts_count feeds
    # ``important`` here.  Confirm whether comments should count as well.
    comments_count = itemdict['comments_count']
    text = itemdict['text']
    retweeted_uid = itemdict['retweeted_uid']

    key = str(uid)
    # Same value as the '_\/' literal, spelled without the invalid escape.
    sep = '_\\/'

    try:
        keywords_dict = json.loads(daily_profile_keywords_db.Get(key))
    except (KeyError, ValueError):
        # No record yet, or the stored value is not valid JSON.
        keywords_dict = {}

    try:
        interact_dict = json.loads(daily_profile_interact_db.Get(key))
        direct_interact = interact_dict['direct']
        retweeted_interact = interact_dict['retweeted']
    except (KeyError, ValueError, TypeError):
        # No record yet, invalid JSON, or JSON of an unexpected shape.
        direct_interact = {}
        retweeted_interact = {}

    try:
        active, important, reposts, original, emoticon = [
            int(field) for field in daily_profile_counts_db.Get(key).split(sep)]
    except KeyError:
        active = important = reposts = original = emoticon = 0

    active += 1
    important += reposts_count

    # A retweeted_mid of 0 marks the weibo as an original post.
    if retweeted_mid != 0:
        reposts += 1
    else:
        original += 1

    if emoticon_find(text):
        emoticon += 1

    if isinstance(text, bytes):
        text = text.decode('utf-8', 'ignore')

    RE = re.compile(u'//@([a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+):', re.UNICODE)
    repost_users = RE.findall(text)

    if repost_users:
        # Only the first mention in the text is counted.
        repost_user = repost_users[0]
        direct_interact[repost_user] = direct_interact.get(repost_user, 0) + 1

    if retweeted_uid != 0:
        # JSON object keys are always strings.  Looking up a non-string uid
        # in a dict reloaded from JSON never matches the stored key, so the
        # count would restart at 1 on every call; use the string form.
        retweeted_key = str(retweeted_uid)
        retweeted_interact[retweeted_key] = retweeted_interact.get(retweeted_key, 0) + 1

    interact_dict = {'direct': direct_interact, 'retweeted': retweeted_interact}

    terms = cut(scws, _utf_encode(text), f='n')
    for term in terms:
        # json.loads returns unicode keys.  Presumably ``cut`` yields UTF-8
        # byte strings (TODO confirm); normalise so non-ASCII terms match
        # their stored key instead of restarting at 1.
        if isinstance(term, bytes):
            term = term.decode('utf-8', 'ignore')
        keywords_dict[term] = keywords_dict.get(term, 0) + 1

    daily_profile_keywords_db.Put(key, json.dumps(keywords_dict))
    daily_profile_interact_db.Put(key, json.dumps(interact_dict))
    daily_profile_counts_db.Put(key, sep.join([
        str(active), str(important), str(reposts), str(original), str(emoticon)]))
Пример #11
0
 def cut_text(item):
     """Segment ``item["text"]`` and store the result under ``item["terms"]``.

     The text is UTF-8 encoded before being handed to ``cut`` (called with
     ``f="n"`` and ``cx=False``).  The same ``item`` object is returned after
     the update.
     """
     encoded = item["text"].encode("utf-8")
     item["terms"] = cut(s, encoded, f="n", cx=False)
     return item
Пример #12
0
def batch_handle():
    """Build the per-user daily profile for every weibo and store it in LevelDB.

    For each weibo from ``xapian_search_weibo.iter_all_docs`` the record
    stored under ``str(uid)`` in ``daily_profile_person_db`` is updated.  A
    record is one string made of seven separator-joined parts: the active,
    important, reposts, original and emoticon counters, a JSON object with
    the ``'direct'`` and ``'retweeted'`` interaction counts, and a JSON
    object with term counts from ``cut(scws, text, f='n')``.

    Writes go through LevelDB write batches that are committed every 10000
    weibos and once more after the loop, so the trailing partial batch is
    persisted as well.
    """
    weibos = xapian_search_weibo.iter_all_docs(fields=[
        'user', 'text', 'retweeted_uid', 'retweeted_mid', 'reposts_count',
        'comments_count'])
    # Same value as the '_\/' literal, spelled without the invalid escape.
    sep = '_\\/'
    # Compiled once instead of on every iteration.
    repost_re = re.compile(u'//@([a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+):', re.UNICODE)

    count = 0
    ts = te = time.time()
    batch = leveldb.WriteBatch()
    # Profiles queued in ``batch`` but not committed yet.  ``Get`` only sees
    # committed data, so without this cache a user appearing twice in one
    # batch would have the first update overwritten by the second.
    pending = {}
    for weibo in weibos:
        if count % 10000 == 0:
            te = time.time()
            daily_profile_person_db.Write(batch, sync=True)
            batch = leveldb.WriteBatch()
            pending = {}
            print('%s %s sec  profile person calc %s' % (count, te - ts, batch_date_1))
            ts = te

        uid = weibo['user']
        retweeted_mid = weibo['retweeted_mid']
        reposts_count = weibo['reposts_count']
        comments_count = weibo['comments_count']
        text = weibo['text']
        retweeted_uid = weibo['retweeted_uid']

        key = str(uid)
        try:
            profile = pending[key]
        except KeyError:
            try:
                (active, important, reposts, original, emoticon,
                 interact_json, keywords_json) = daily_profile_person_db.Get(key).split(sep)
                interact_dict = json.loads(interact_json)
                profile = {
                    'active': int(active),
                    'important': int(important),
                    'reposts': int(reposts),
                    'original': int(original),
                    'emoticon': int(emoticon),
                    'direct': interact_dict['direct'],
                    'retweeted': interact_dict['retweeted'],
                    'keywords': json.loads(keywords_json),
                }
            except KeyError:
                # User not stored yet: start from an empty profile.
                profile = {
                    'active': 0, 'important': 0, 'reposts': 0, 'original': 0,
                    'emoticon': 0, 'direct': {}, 'retweeted': {}, 'keywords': {},
                }
            pending[key] = profile

        profile['active'] += 1
        profile['important'] += reposts_count + comments_count

        # A retweeted_mid of 0 marks the weibo as an original post.
        if retweeted_mid != 0:
            profile['reposts'] += 1
        else:
            profile['original'] += 1

        if emoticon_find(text):
            profile['emoticon'] += 1

        if isinstance(text, bytes):
            text = text.decode('utf-8', 'ignore')

        repost_users = repost_re.findall(text)
        if repost_users:
            # Only the first mention in the text is counted.
            direct_interact = profile['direct']
            repost_user = repost_users[0]
            direct_interact[repost_user] = direct_interact.get(repost_user, 0) + 1

        if retweeted_uid != 0:
            # JSON object keys are always strings.  Looking up a non-string
            # uid in a dict reloaded from JSON never matches the stored key,
            # so the count would restart at 1; use the string form.
            retweeted_interact = profile['retweeted']
            retweeted_key = str(retweeted_uid)
            retweeted_interact[retweeted_key] = retweeted_interact.get(retweeted_key, 0) + 1

        keywords_dict = profile['keywords']
        for term in cut(scws, _utf_encode(text), f='n'):
            # json.loads returns unicode keys.  Presumably ``cut`` yields
            # UTF-8 byte strings (TODO confirm); normalise so non-ASCII
            # terms match their stored key instead of restarting at 1.
            if isinstance(term, bytes):
                term = term.decode('utf-8', 'ignore')
            keywords_dict[term] = keywords_dict.get(term, 0) + 1

        interact_dict = {'direct': profile['direct'], 'retweeted': profile['retweeted']}
        value = sep.join([
            str(profile['active']),
            str(profile['important']),
            str(profile['reposts']),
            str(profile['original']),
            str(profile['emoticon']),
            json.dumps(interact_dict),
            json.dumps(keywords_dict),
        ])
        batch.Put(key, value)
        count += 1

    # Commit whatever is left since the last multiple of 10000.
    daily_profile_person_db.Write(batch, sync=True)