def personTopic2leveldb(keyword_limit=50):
    """Accumulate per-user term counts for every weibo into LevelDB.

    Iterates all documents returned by ``xapian_search_weibo.iter_all_docs``,
    segments each text with ``cut(scws, text, f='n')`` and adds one to the
    count of every returned term in the JSON dict stored under ``str(uid)``
    in ``daily_profile_person_topic_db``.

    Writes are grouped into a ``leveldb.WriteBatch`` that is written every
    10000 weibos and once more after the loop, so the trailing partial batch
    is not lost.

    :param keyword_limit: accepted for interface compatibility; not used by
        this function.
    """
    # test 0.6 seconds per 10000 weibos
    weibos = xapian_search_weibo.iter_all_docs(fields=['user', 'text'])
    count = 0
    ts = te = time.time()
    batch = leveldb.WriteBatch()
    # Values queued in `batch` but not yet written. Get() only sees written
    # data, so without this a user appearing twice within one batch would
    # have the first update overwritten by the second.
    pending = {}

    for weibo in weibos:
        if count % 10000 == 0:
            te = time.time()
            daily_profile_person_topic_db.Write(batch, sync=True)
            batch = leveldb.WriteBatch()
            pending = {}
            print('%s %s sec' % (count, te - ts))
            ts = te

        key = str(weibo['user'])
        text = _utf_encode(weibo['text'])
        terms = cut(scws, text, f='n')

        stored = pending.get(key)
        if stored is None:
            try:
                stored = daily_profile_person_topic_db.Get(key)
            except KeyError:
                # first weibo seen for this user
                stored = None
        ori_dict = json.loads(stored) if stored is not None else {}

        # NOTE(review): json.loads yields unicode keys on Python 2; if cut()
        # returns UTF-8 byte strings, non-ASCII terms will not match keys
        # loaded from the database -- confirm against cut()'s return type.
        for term in terms:
            ori_dict[term] = ori_dict.get(term, 0) + 1

        value = json.dumps(ori_dict)
        batch.Put(key, value)
        pending[key] = value
        count += 1

    # Flush whatever was queued since the last multiple of 10000.
    daily_profile_person_topic_db.Write(batch, sync=True)
def make_network(topic, date, window_size, max_size=100000, ts=False):
    """Build a directed repost graph for statuses matching *topic*.

    The search window ends at ``datetime2ts(date)`` and starts
    ``window2time(window_size)`` earlier. For every matching status that has
    a truthy ``retweeted_status``, an edge ``reposting uid -> source uid`` is
    added unless either uid is in the trash list. Statuses that raise
    ``TypeError`` or ``KeyError`` while being processed are skipped.

    :param topic: topic text; it is UTF-8 encoded and segmented with ``cut``
        before being used as the ``text`` query.
    :param date: value passed to ``datetime2ts`` to obtain the window end.
    :param window_size: value passed to ``window2time`` for the window length.
    :param max_size: forwarded to the search as ``max_offset``.
    :param ts: when true, the ``timestamp`` field is also fetched and the
        smallest timestamp seen per uid is tracked.
    :returns: ``g`` when *ts* is false, otherwise ``(uid_ts, g)`` where
        ``uid_ts`` maps each uid to its smallest observed timestamp.
    """
    window_end = datetime2ts(date)
    window_start = window_end - window2time(window_size)
    g = nx.DiGraph()

    # need repost index
    topic = cut(s, topic.encode('utf-8'))
    query_dict = {
        'text': topic,
        'timestamp': {'$gt': window_start, '$lt': window_end},
    }
    if ts:
        wanted_fields = ['text', 'user', 'timestamp', 'retweeted_status']
    else:
        wanted_fields = ['text', 'user', 'retweeted_status']
    count, get_statuses_results = statuses_search.search(
        query=query_dict, field=wanted_fields, max_offset=max_size)
    print('topic statuses count %s' % count)

    if not ts:
        for status in get_statuses_results():
            try:
                if not status['retweeted_status']:
                    continue
                repost_uid = status['user']
                rt_mid = status['retweeted_status']
                source_uid = acquire_status_by_id(rt_mid)['user']
                if is_in_trash_list(repost_uid) or is_in_trash_list(source_uid):
                    continue
                g.add_edge(repost_uid, source_uid)
            except (TypeError, KeyError):
                continue
        return g

    uid_ts = {}
    for status in get_statuses_results():
        try:
            if not status['retweeted_status']:
                continue
            repost_uid = status['user']
            rt_mid = status['retweeted_status']
            repost_ts = int(status['timestamp'])
            source_status = acquire_status_by_id(rt_mid)
            source_uid = source_status['user']
            source_ts = int(source_status['timestamp'])
            if is_in_trash_list(repost_uid) or is_in_trash_list(source_uid):
                continue
            # keep the smallest timestamp seen for each participant
            for seen_uid, seen_ts in ((repost_uid, repost_ts),
                                      (source_uid, source_ts)):
                if seen_uid not in uid_ts or uid_ts[seen_uid] > seen_ts:
                    uid_ts[seen_uid] = seen_ts
            g.add_edge(repost_uid, source_uid)
        except (TypeError, KeyError):
            continue
    return uid_ts, g
def batch_handle_domain():
    """Aggregate per-domain counters and keyword counts over all weibos.

    For each document from ``xapian_search_weibo.iter_all_docs`` the author's
    domain is looked up with ``userLeveldb2Domain`` and four counters stored
    in ``daily_profile_domain_db`` under ``str(domain)`` are updated:

    * ``active``    -- incremented once per weibo
    * ``important`` -- increased by ``reposts_count + comments_count``
    * ``reposts``   -- incremented when ``retweeted_mid != 0``
    * ``original``  -- incremented otherwise

    The counters are stored as one string joined by ``_\\/``. Counter writes
    go through a ``leveldb.WriteBatch`` written every 10000 weibos and once
    more after the loop. Term counts from ``cut(scws, text, f='n')`` are
    written directly (unbatched) to ``daily_domain_keywords_db[int(domain)]``.
    """
    separator = r'_\/'  # same three characters as the '_\/' literal used by stored values

    weibos = xapian_search_weibo.iter_all_docs(fields=['user', 'text', \
        'retweeted_mid', 'reposts_count', 'comments_count', 'text'])
    count = 0
    ts = te = time.time()
    batch = leveldb.WriteBatch()
    # Counter strings queued in `batch` but not yet written. Get() only sees
    # written data; since many weibos share a domain key, reading without
    # this cache would drop almost every update made inside a batch.
    pending = {}

    for weibo in weibos:
        if count % 10000 == 0:
            te = time.time()
            daily_profile_domain_db.Write(batch, sync=True)
            batch = leveldb.WriteBatch()
            pending = {}
            print('%s %s sec  %s daily domain' % (count, te - ts, batch_date_1))
            ts = te

        uid = weibo['user']
        domain = userLeveldb2Domain(uid)
        retweeted_mid = weibo['retweeted_mid']
        reposts_count = weibo['reposts_count']
        comments_count = weibo['comments_count']
        text = weibo['text']

        key = str(domain)
        try:
            if key in pending:
                results = pending[key]
            else:
                results = daily_profile_domain_db.Get(key)
            active, important, reposts, original = [
                int(part) for part in results.split(separator)]
        except KeyError:
            # nothing stored for this domain yet
            active = important = reposts = original = 0

        active += 1
        important += reposts_count + comments_count
        if retweeted_mid != 0:
            reposts += 1
        else:
            original += 1

        value = separator.join(
            [str(active), str(important), str(reposts), str(original)])
        batch.Put(key, value)
        pending[key] = value

        daily_profile_domain_keywords_db = daily_domain_keywords_db[int(domain)]
        terms = cut(scws, _utf_encode(text), f='n')
        for term in terms:
            term_key = str(term)
            try:
                kcount = int(daily_profile_domain_keywords_db.Get(term_key))
            except KeyError:
                kcount = 0
            daily_profile_domain_keywords_db.Put(term_key, str(kcount + 1))

        count += 1

    # Flush whatever was queued since the last multiple of 10000.
    daily_profile_domain_db.Write(batch, sync=True)
def calc_roeik(
    retweeted_mid, retweeted_uid, text, reposts, original, emoticon,
    direct_interact, retweeted_interact, keywords_dict
):
    """Update repost/original/emoticon counters and string accumulators.

    * ``reposts`` is incremented when ``retweeted_mid != 0``, otherwise
      ``original`` is incremented.
    * ``emoticon`` is incremented when ``emoticon_find(text)`` is truthy.
    * The first ``//@name:`` match in *text* is appended to
      ``direct_interact``; a non-zero ``retweeted_uid`` is appended to
      ``retweeted_interact``. Both accumulators are strings using the
      separator bound to ``sep`` below.
    * Terms from ``cut(scws, text, f="n")`` are appended to ``keywords_dict``
      with the same separator.

    :returns: ``(reposts, original, emoticon, interact_dict, keywords_dict)``
        where ``interact_dict`` has the keys ``"direct"`` and ``"retweeted"``.
    """
    sep = "\_/"

    if retweeted_mid != 0:
        reposts += 1
    else:
        original += 1

    if emoticon_find(text):
        emoticon += 1

    if isinstance(text, str):
        text = text.decode("utf-8", "ignore")

    mention_re = re.compile(u"//@([a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+):", re.UNICODE)
    mentioned = mention_re.findall(text)
    if mentioned:
        first_mention = mentioned[0]
        direct_interact = (
            str(direct_interact.encode("utf-8"))
            + sep
            + str(first_mention.encode("utf-8"))
        )

    if retweeted_uid != 0:
        retweeted_interact = str(retweeted_interact) + sep + str(retweeted_uid)

    interact_dict = {"direct": direct_interact, "retweeted": retweeted_interact}

    terms = cut(scws, _utf_encode(text), f="n")
    keywords_dict += sep + sep.join(terms)

    return reposts, original, emoticon, interact_dict, keywords_dict
def triple_classifier(tweet):
    """Classify the sentiment of ``tweet['text']``.

    If ``emoticon(text)`` returns a non-zero value, that value is the result.
    Otherwise the text is segmented with ``cut`` and scored in two stages:
    stage one multiplies ``step1_score`` entries for each bag-of-words pair;
    only when the first product is not greater than the second does stage
    two run, multiplying ``step2_score`` entries into three products and
    picking ``HAPPY``, ``SAD`` or ``ANGRY`` for the strictly largest one.

    :param tweet: mapping with a ``'text'`` entry.
    :returns: the emoticon sentiment, ``HAPPY``, ``SAD``, ``ANGRY``, or ``0``
        when nothing applies (empty text, stage one rejects, or a tie).
    """
    text = tweet['text']

    emoticon_sentiment = emoticon(text)
    if emoticon_sentiment != 0:
        return emoticon_sentiment
    if text == u'':
        return 0

    tokens = [tok.decode('utf-8', 'ignore')
              for tok in cut(cut_str, text.encode('utf-8'))]

    # Stage one: two running products over the bag-of-words pairs.
    first, second = 1, 1
    for pair in dictionary_1.doc2bow(tokens):
        weights = step1_score[pair[0]]
        first = first * (weights[0] ** pair[1])
        second = second * (weights[1] ** pair[1])
    if not first <= second:
        return 0

    # Stage two: three running products, one per sentiment class.
    happy, sad, angry = 1, 1, 1
    for pair in dictionary_2.doc2bow(tokens):
        weights = step2_score[pair[0]]
        happy = happy * (weights[0] ** pair[1])
        sad = sad * (weights[1] ** pair[1])
        angry = angry * (weights[2] ** pair[1])

    if happy > sad and happy > angry:
        return HAPPY
    if sad > happy and sad > angry:
        return SAD
    if angry > sad and angry > happy:
        return ANGRY
    return 0
def triple_classifier(tweet):
    """Return a sentiment label for ``tweet['text']``.

    A non-zero result from ``emoticon(text)`` wins outright. Otherwise the
    text is segmented with ``cut`` and run through a two-stage product
    scorer built on ``dictionary_1``/``step1_score`` and
    ``dictionary_2``/``step2_score``; stage two only runs when the first
    stage-one product does not exceed the second.

    :param tweet: mapping with a ``'text'`` entry.
    :returns: the emoticon sentiment, ``HAPPY``, ``SAD``, ``ANGRY``, or ``0``.
    """

    def _products(bow, table, width):
        # Multiply table[term][i] ** freq into `width` running products.
        totals = [1] * width
        for pair in bow:
            row = table[pair[0]]
            for i in range(width):
                totals[i] = totals[i] * (row[i] ** pair[1])
        return totals

    sentiment = 0
    text = tweet['text']

    emoticon_sentiment = emoticon(text)
    if emoticon_sentiment != 0:
        sentiment = emoticon_sentiment
        text = u''

    if text != u'':
        entries = cut(cut_str, text.encode('utf-8'))
        entry = [piece.decode('utf-8', 'ignore') for piece in entries]

        stage_one = _products(dictionary_1.doc2bow(entry), step1_score, 2)
        if stage_one[0] <= stage_one[1]:
            stage_two = _products(dictionary_2.doc2bow(entry), step2_score, 3)
            if stage_two[0] > stage_two[1] and stage_two[0] > stage_two[2]:
                sentiment = HAPPY
            elif stage_two[1] > stage_two[0] and stage_two[1] > stage_two[2]:
                sentiment = SAD
            elif stage_two[2] > stage_two[1] and stage_two[2] > stage_two[0]:
                sentiment = ANGRY

    return sentiment
def bi_classification(mid_text):
    """Classify every text in *mid_text* and write results to a file.

    For each ``(mid, text)`` pair a line ``"<mid> <sentiment>"`` is written
    to ``bi_sentiment.txt`` in the current working directory. The sentiment
    is ``0`` unless the text is non-empty, the first stage-one product is
    smaller than the second, and the stage-two products differ, in which
    case it is ``HAPPY`` or ``ANGRY``.

    Progress (items processed and seconds since the previous report) is
    printed every 10000 items.

    :param mid_text: mapping of message id to text; iterated with
        ``iteritems()``.
    """
    dictionary_1, dictionary_2, step1_score, step2_score = get_dictionary()
    iter_count = 0
    ts = te = time.time()

    # `with` guarantees the file is closed even if classification raises.
    with open('bi_sentiment.txt', 'w') as f_senti:
        for mid, text in mid_text.iteritems():
            if iter_count % 10000 == 0:
                te = time.time()
                print('%s %s sec' % (iter_count, te - ts))
                ts = te
            iter_count += 1

            sentiment = 0
            id_str = str(mid)
            if text != '':
                entry = cut(cut_str, text)

                # Stage one: two running products over the bag of words.
                sub_score = [1, 1]
                for pair in dictionary_1.doc2bow(entry):
                    sub_score[0] *= (step1_score[pair[0]][0] ** pair[1])
                    sub_score[1] *= (step1_score[pair[0]][1] ** pair[1])

                if sub_score[0] < sub_score[1]:
                    # Stage two: only the first two columns of step2_score
                    # are used here.
                    score2 = [1, 1]
                    for pair in dictionary_2.doc2bow(entry):
                        score2[0] *= (step2_score[pair[0]][0] ** pair[1])
                        score2[1] *= (step2_score[pair[0]][1] ** pair[1])
                    if score2[0] > score2[1]:
                        sentiment = HAPPY
                    elif score2[1] > score2[0]:
                        sentiment = ANGRY

            f_senti.write('%s %s\n' % (id_str, sentiment))
def cut_text(item):
    """Segment ``item['text']`` and store the result under ``item['terms']``.

    The text is UTF-8 encoded before being passed to ``cut(s, ..., cx=False)``.
    *item* is modified in place and also returned.
    """
    encoded = item['text'].encode('utf-8')
    terms = cut(s, encoded, cx=False)
    item['terms'] = terms
    return item
# 确实存在retweeted_status在我们的数据中不存在的情况 misskey_err_count += 1 if if_emoticoned: if_emoticoned = int(if_emoticoned) if if_emoticoned == 1: emoticon_sentiment = emoticon(zan, angry, sad, r['text']) if emoticon_sentiment == 1 or emoticon_sentiment == 2: sentiment = emoticon_sentiment else: text = r['text'] else: text = r['text'] if text != '': entries = cut(cut_str, text) entry = [e.decode('utf-8') for e in entries] bow = dictionary.doc2bow(entry) s = [1, 1, 1] for pair in bow: s[0] = s[0] * (p_senti[pair[0]][0] ** pair[1]) s[1] = s[1] * (p_senti[pair[0]][1] ** pair[1]) s[2] = s[2] * (p_senti[pair[0]][2] ** pair[1]) if s[0] > s[1] and s[0] > s[2]: sentiment = HAPPY elif s[1] > s[0] and s[1] > s[2]: sentiment = ANGRY elif s[2] > s[1] and s[2] > s[0]: sentiment = SAD # 微博是否为转发微博
def profile_person_cal(itemdict):
    """Fold one weibo into its author's stored daily profile.

    Reads the current state for ``str(itemdict['user'])`` from three
    databases, updates it and writes it back:

    * ``daily_profile_keywords_db`` -- JSON dict of term counts, updated with
      the terms from ``cut(scws, text, f='n')``.
    * ``daily_profile_interact_db`` -- JSON dict with ``'direct'`` (counts per
      first ``//@name:`` mention) and ``'retweeted'`` (counts per
      ``retweeted_uid``).
    * ``daily_profile_counts_db`` -- ``active, important, reposts, original,
      emoticon`` joined by ``_\\/``.

    :param itemdict: mapping with the keys ``user``, ``retweeted_mid``,
        ``reposts_count``, ``comments_count``, ``text`` and ``retweeted_uid``.
    :raises KeyError: if one of those keys is missing.
    """
    separator = r'_\/'  # same three characters as the '_\/' literal used by stored values

    uid = itemdict['user']
    retweeted_mid = itemdict['retweeted_mid']
    reposts_count = itemdict['reposts_count']
    # NOTE(review): comments_count is read but not added to `important`
    # below -- confirm whether that is intended.
    comments_count = itemdict['comments_count']
    text = itemdict['text']
    retweeted_uid = itemdict['retweeted_uid']
    key = str(uid)

    # Best-effort loads: any failure falls back to an empty state. Catching
    # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.
    try:
        keywords_dict = json.loads(daily_profile_keywords_db.Get(key))
    except Exception:
        keywords_dict = {}

    try:
        interact_dict = json.loads(daily_profile_interact_db.Get(key))
        direct_interact = interact_dict['direct']
        retweeted_interact = interact_dict['retweeted']
    except Exception:
        direct_interact = {}
        retweeted_interact = {}

    try:
        active, important, reposts, original, emoticon = \
            daily_profile_counts_db.Get(key).split(separator)
        active = int(active)
        important = int(important)
        reposts = int(reposts)
        original = int(original)
        emoticon = int(emoticon)
    except KeyError:
        active = important = reposts = original = emoticon = 0

    active += 1
    important += reposts_count
    if retweeted_mid != 0:
        reposts += 1
    else:
        original += 1

    _emoticons = emoticon_find(text)
    if _emoticons:
        emoticon += 1

    if isinstance(text, str):
        text = text.decode('utf-8', 'ignore')
    RE = re.compile(u'//@([a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+):', re.UNICODE)
    repost_users = RE.findall(text)
    if len(repost_users):
        repost_user = repost_users[0]
        direct_interact[repost_user] = direct_interact.get(repost_user, 0) + 1

    if retweeted_uid != 0:
        # JSON object keys always come back as strings, so the raw uid would
        # never match a stored entry and the count would restart at 1 on
        # every call. Count under the string form instead.
        retweeted_key = str(retweeted_uid)
        retweeted_interact[retweeted_key] = \
            retweeted_interact.get(retweeted_key, 0) + 1

    interact_dict = {'direct': direct_interact, 'retweeted': retweeted_interact}

    # NOTE(review): json.loads yields unicode keys on Python 2; if cut()
    # returns UTF-8 byte strings, non-ASCII terms will not match keys loaded
    # from the database -- confirm against cut()'s return type.
    terms = cut(scws, _utf_encode(text), f='n')
    for term in terms:
        keywords_dict[term] = keywords_dict.get(term, 0) + 1

    daily_profile_keywords_db.Put(key, json.dumps(keywords_dict))
    daily_profile_interact_db.Put(key, json.dumps(interact_dict))
    daily_profile_counts_db.Put(key, separator.join(
        [str(active), str(important), str(reposts), str(original), str(emoticon)]))
def cut_text(item):
    """Add a ``"terms"`` entry to *item* and return it.

    ``item["text"]`` is UTF-8 encoded and segmented with
    ``cut(s, ..., f="n", cx=False)``; the result is stored in place under
    ``item["terms"]``.
    """
    item["terms"] = cut(s, item["text"].encode("utf-8"), f="n", cx=False)
    return item
def batch_handle():
    """Aggregate per-user profile data over all weibos into LevelDB.

    For each document from ``xapian_search_weibo.iter_all_docs`` the record
    stored in ``daily_profile_person_db`` under ``str(uid)`` is updated. A
    record is seven fields joined by ``_\\/``: ``active``, ``important``,
    ``reposts``, ``original``, ``emoticon``, a JSON interaction dict
    (``'direct'`` / ``'retweeted'``) and a JSON dict of term counts from
    ``cut(scws, text, f='n')``.

    Writes go through a ``leveldb.WriteBatch`` written every 10000 weibos and
    once more after the loop, so the trailing partial batch is not lost.
    """
    separator = r'_\/'  # same three characters as the '_\/' literal used by stored values
    # Compiled once; the pattern does not change between weibos.
    RE = re.compile(u'//@([a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+):', re.UNICODE)

    weibos = xapian_search_weibo.iter_all_docs(fields=['user', 'text', 'retweeted_uid', \
        'retweeted_mid', 'reposts_count', 'comments_count', 'text'])
    count = 0
    ts = te = time.time()
    batch = leveldb.WriteBatch()
    # Records queued in `batch` but not yet written. Get() only sees written
    # data, so without this a user appearing twice within one batch would
    # have the first update overwritten by the second.
    pending = {}

    for weibo in weibos:
        if count % 10000 == 0:
            te = time.time()
            daily_profile_person_db.Write(batch, sync=True)
            batch = leveldb.WriteBatch()
            pending = {}
            print('%s %s sec  profile person calc %s' % (count, te - ts, batch_date_1))
            ts = te

        uid = weibo['user']
        retweeted_mid = weibo['retweeted_mid']
        reposts_count = weibo['reposts_count']
        comments_count = weibo['comments_count']
        text = weibo['text']
        retweeted_uid = weibo['retweeted_uid']
        key = str(uid)

        try:
            if key in pending:
                results = pending[key]
            else:
                results = daily_profile_person_db.Get(key)
            active, important, reposts, original, emoticon, interact_dict, keywords_dict = \
                results.split(separator)
            active = int(active)
            important = int(important)
            reposts = int(reposts)
            original = int(original)
            emoticon = int(emoticon)
            interact_dict = json.loads(interact_dict)
            direct_interact = interact_dict['direct']
            retweeted_interact = interact_dict['retweeted']
            keywords_dict = json.loads(keywords_dict)
        except KeyError:
            # no record yet (or the interaction dict lacks an expected key)
            active = important = reposts = original = emoticon = 0
            direct_interact = {}
            retweeted_interact = {}
            keywords_dict = {}

        active += 1
        important += reposts_count + comments_count
        if retweeted_mid != 0:
            reposts += 1
        else:
            original += 1

        _emoticons = emoticon_find(text)
        if _emoticons and len(_emoticons):
            emoticon += 1

        if isinstance(text, str):
            text = text.decode('utf-8', 'ignore')
        repost_users = RE.findall(text)
        if len(repost_users):
            repost_user = repost_users[0]
            direct_interact[repost_user] = direct_interact.get(repost_user, 0) + 1

        if retweeted_uid != 0:
            # JSON object keys always come back as strings, so the raw uid
            # would never match a stored entry and the count would restart
            # at 1 after every round trip. Count under the string form.
            retweeted_key = str(retweeted_uid)
            retweeted_interact[retweeted_key] = \
                retweeted_interact.get(retweeted_key, 0) + 1

        interact_dict = {'direct': direct_interact, 'retweeted': retweeted_interact}

        # NOTE(review): json.loads yields unicode keys on Python 2; if cut()
        # returns UTF-8 byte strings, non-ASCII terms will not match keys
        # loaded from the database -- confirm against cut()'s return type.
        terms = cut(scws, _utf_encode(text), f='n')
        for term in terms:
            keywords_dict[term] = keywords_dict.get(term, 0) + 1

        value = separator.join([str(active), str(important), str(reposts),
                                str(original), str(emoticon),
                                json.dumps(interact_dict), json.dumps(keywords_dict)])
        batch.Put(key, value)
        pending[key] = value
        count += 1

    # Flush whatever was queued since the last multiple of 10000.
    daily_profile_person_db.Write(batch, sync=True)