def cal_class_ratio():
    ratio_results = {}
    date = '2013-09-07'
    ts = datetime2ts(date)
    scan_count = 0
    scan_cursor = 0
    all_count = 0
    while 1:
        if scan_count == 1000000:
            break
        results = r_cluster.hscan('activity_' + str(ts),
                                  scan_cursor,
                                  count=1000)
        scan_cursor = results[0]
        scan_count += 1000
        for uid in results[1]:
            activity_dict_string = r_cluster.hget('activity_' + str(ts), uid)
            activity_dict = json.loads(activity_dict_string)
            weibo_count = 0
            for time_seg in activity_dict:
                weibo_count += int(activity_dict[time_seg])
            if weibo_count >= 6:
                indic_3 = '1'
            else:
                indic_3 = '0'
            retweet_results = r.hgetall('retweet_' + str(uid))
            retweet_count = len(retweet_results)
            if retweet_count >= 8:
                indic_1 = '1'
            else:
                indic_1 = '0'
            be_retweet_results = r.hgetall('be_retweet_' + str(uid))
            be_retweet_count = len(be_retweet_results)
            #print 'be_retweet_count:', be_retweet_count
            if be_retweet_count >= 9:
                indic_2 = '1'
            else:
                indic_2 = '0'
            #print 'indic_2:', indic_2
            key = indic_1 + indic_2 + indic_3
            try:
                ratio_results[key] += 1
            except:
                ratio_results[key] = 1
            # write eight type users
            '''
            if key=='001':
                writer1.writerow([uid, retweet_count, be_retweet_count, weibo_count])
            elif key=='111':
                writer2.writerow([uid, retweet_count, be_retweet_count, weibo_count])
            elif key=='101':
                writer3.writerow([uid, retweet_count, be_retweet_count, weibo_count])
            elif key=='011':
                writer4.writerow([uid, retweet_count, be_retweet_count, weibo_count])
            elif key=='110':
                writer5.writerow([uid, retweet_count, be_retweet_count, weibo_count])
            if key=='010':
                writer6.writerow([uid, retweet_count, be_retweet_count, weibo_count])
            '''
    print 'ratio_results:', ratio_results
def cal_ave_weibo():
    date = '2013-09-07'
    timestamp = datetime2ts(date)
    scan_count = 0
    scan_cursor = 0
    all_count = 0
    while 1:
        if scan_count == 1000000:
            break
        results = r_cluster.hscan('activity_' + str(timestamp),
                                  scan_cursor,
                                  count=1000)
        scan_cursor = results[0]
        scan_count += 1000
        for uid in results[1]:
            for i in range(0, 1):
                ts = timestamp - 24 * 3600 * i
                activity_dict_string = r_cluster.hget('activity_' + str(ts),
                                                      uid)
                if activity_dict_string:
                    activity_dict = json.loads(activity_dict_string)
                    weibo_count = 0
                    for time_seg in activity_dict:
                        weibo_count += int(activity_dict[time_seg])
                    all_count += weibo_count
    ave_count = float(all_count) / scan_count
    print 'ave_count:', ave_count
def cal_class_ratio():
    ratio_results = {}
    date = '2013-09-07'
    ts = datetime2ts(date)
    scan_count = 0
    scan_cursor = 0
    all_count = 0
    while 1:
        if scan_count == 1000000:
            break
        results = r_cluster.hscan('activity_'+str(ts), scan_cursor, count=1000)
        scan_cursor = results[0]
        scan_count += 1000
        for uid in results[1]:
            activity_dict_string = r_cluster.hget('activity_'+str(ts), uid)
            activity_dict = json.loads(activity_dict_string)
            weibo_count = 0
            for time_seg in activity_dict:
                weibo_count += int(activity_dict[time_seg])
            if weibo_count >= 6:
                indic_3 = '1'
            else:
                indic_3 = '0'
            retweet_results = r.hgetall('retweet_'+str(uid))
            retweet_count = len(retweet_results)
            if retweet_count >= 8:
                indic_1 = '1'
            else:
                indic_1 = '0'
            be_retweet_results = r.hgetall('be_retweet_'+str(uid))
            be_retweet_count = len(be_retweet_results)
            #print 'be_retweet_count:', be_retweet_count
            if be_retweet_count >= 9:
                indic_2 = '1'
            else:
                indic_2 = '0'
            #print 'indic_2:', indic_2
            key = indic_1 + indic_2 + indic_3
            try:
                ratio_results[key] += 1
            except:
                ratio_results[key] = 1
            # write eight type users
            '''
            if key=='001':
                writer1.writerow([uid, retweet_count, be_retweet_count, weibo_count])
            elif key=='111':
                writer2.writerow([uid, retweet_count, be_retweet_count, weibo_count])
            elif key=='101':
                writer3.writerow([uid, retweet_count, be_retweet_count, weibo_count])
            elif key=='011':
                writer4.writerow([uid, retweet_count, be_retweet_count, weibo_count])
            elif key=='110':
                writer5.writerow([uid, retweet_count, be_retweet_count, weibo_count])
            if key=='010':
                writer6.writerow([uid, retweet_count, be_retweet_count, weibo_count])
            '''
    print 'ratio_results:', ratio_results
def cal_ave_weibo():
    date = '2013-09-07'
    timestamp = datetime2ts(date)
    scan_count = 0
    scan_cursor = 0
    all_count = 0
    while 1:
        if scan_count == 1000000:
            break
        results = r_cluster.hscan('activity_'+str(timestamp), scan_cursor, count=1000)
        scan_cursor = results[0]
        scan_count += 1000
        for uid in results[1]:
            for i in range(0,1):
                ts = timestamp - 24*3600*i
                activity_dict_string = r_cluster.hget('activity_'+str(ts), uid)
                if activity_dict_string:
                    activity_dict = json.loads(activity_dict_string)
                    weibo_count = 0
                    for time_seg in activity_dict:
                        weibo_count += int(activity_dict[time_seg])
                    all_count += weibo_count
    ave_count = float(all_count) / scan_count
    print 'ave_count:', ave_count
# Example #5
# 0
def main():
    if RUN_TYPE:
        now_ts = time.time() - DAY  # 前一天
        ts = str(datetime2ts(ts2datetime(now_ts)))
    else:
        ts = str(datetime2ts('2016-03-16'))
    now_ts = int(ts)
    print now_ts
    sensitive_string = "sensitive_" + ts
    date_string = ts
    update_sensitive_key = "sensitive_score_" + ts  # 更新的键
    sensitive_dict_key = "sensitive_dict_" + ts
    sensitive_string_key = "sensitive_string_" + ts
    sensitive_day_change_key = "sensitive_" + ts + "_day_change"
    del_month = datetime2ts(ts2datetime(now_ts - MONTH))
    del_sensitive_key = "sensitive_score_" + str(del_month)  # 要删除的键

    former_ts = int(ts) - DAY
    former_date = str(datetime2ts(ts2datetime(former_ts)))
    former_sensitive_key = "sensitive_score_" + former_date

    iter_count = 0
    bulk_action = []

    mappings(ES_SENSITIVE_INDEX)
    total_number = r.hlen(sensitive_string)
    scan_cursor = 0
    print total_number

    while 1:
        re_scan = r.hscan(sensitive_string, scan_cursor, count=1000)
        scan_cursor = re_scan[0]
        if len(re_scan[1]) != 0:
            sensitive_info = re_scan[1]  # 字典形式,uid:sensitive_words_dict
            uid_list = sensitive_info.keys()
            sensitive_results = es.mget(index=ES_SENSITIVE_INDEX,
                                        doc_type=DOCTYPE_SENSITIVE_INDEX,
                                        body={"ids": uid_list})['docs']
            if sensitive_results:
                for item in sensitive_results:
                    uid = item['_id']
                    sensitive_words_dict = json.loads(
                        sensitive_info[uid])  # json.loads
                    current_sensitive_score = 0
                    for k, v in sensitive_words_dict.iteritems():
                        tmp_stage = r_sensitive.hget("sensitive_words", k)
                        if tmp_stage:
                            current_sensitive_score += v * sensitive_score_dict[
                                str(tmp_stage)]
                    if item['found']:  # 之前存在相关信息
                        revise_item = item["_source"]
                        if del_sensitive_key in revise_item:
                            item.remove(del_sensitive_key)
                        revise_item['uid'] = uid
                        # 新更新的敏感度
                        revise_item[
                            update_sensitive_key] = current_sensitive_score
                        # 新更新的敏感词
                        revise_item[sensitive_dict_key] = sensitive_info[uid]
                        # 新更新的string
                        revise_item[sensitive_string_key] = "&".join(
                            sensitive_words_dict.keys())
                        # 当天和之前一天、一周和一月均值的差异
                        revise_item[
                            'sensitive_day_change'] = current_sensitive_score - revise_item.get(
                                former_sensitive_key, 0)
                        revise_item[
                            'sensitive_week_change'] = current_sensitive_score - revise_item.get(
                                'sensitive_week_ave', 0)
                        revise_item[
                            'sensitive_month_change'] = current_sensitive_score - revise_item.get(
                                'sensitive_month_ave', 0)
                        # 更新后week、month的均值和方差
                        revise_item['sensitive_week_ave'], revise_item[
                            'sensitive_week_var'], revise_item[
                                'sensitive_week_sum'] = compute_week(
                                    revise_item, now_ts)
                        revise_item['senstiive_month_ave'], revise_item[
                            'sensitive_month_var'], revise_item[
                                'sensitive_month_sum'] = compute_month(
                                    revise_item, now_ts)

                    else:
                        revise_item = dict()
                        revise_item['uid'] = uid
                        revise_item[
                            update_sensitive_key] = current_sensitive_score
                        revise_item[sensitive_dict_key] = sensitive_info[uid]
                        revise_item[sensitive_string_key] = "&".join(
                            sensitive_words_dict.keys())
                        revise_item[
                            'sensitive_day_change'] = current_sensitive_score
                        revise_item[
                            'sensitive_week_change'] = current_sensitive_score
                        revise_item[
                            'sensitive_month_change'] = current_sensitive_score
                        revise_item['sensitive_week_ave'], revise_item[
                            'sensitive_week_var'], revise_item[
                                'sensitive_week_sum'] = compute_week(
                                    revise_item, now_ts)
                        revise_item['senstiive_month_ave'], revise_item[
                            'sensitive_month_var'], revise_item[
                                'sensitive_month_sum'] = compute_month(
                                    revise_item, now_ts)
                    action = {'index': {'_id': uid}}
                    bulk_action.extend([action, revise_item])
                    iter_count += 1
                    if iter_count % 1000 == 0:
                        es.bulk(bulk_action,
                                index=ES_SENSITIVE_INDEX,
                                doc_type=DOCTYPE_SENSITIVE_INDEX)
                        bulk_action = []
                        print iter_count
        if int(scan_cursor) == 0:
            break
    if bulk_action:
        es.bulk(bulk_action,
                index=ES_SENSITIVE_INDEX,
                doc_type=DOCTYPE_SENSITIVE_INDEX)

    print iter_count
def main():
    if RUN_TYPE:
        now_ts = time.time()-DAY # 前一天
        ts = str(datetime2ts(ts2datetime(now_ts)))
    else:
        ts = str(datetime2ts('2013-09-07'))
    now_ts = int(ts)
    sensitive_string = "sensitive_" + ts
    date_string = ts
    update_sensitive_key = "sensitive_score_" + ts # 更新的键
    sensitive_dict_key = "sensitive_dict_" + ts
    sensitive_string_key = "sensitive_string_" + ts
    sensitive_day_change_key = "sensitive_" + ts +"_day_change"
    del_month = datetime2ts(ts2datetime(now_ts - MONTH))
    del_sensitive_key = "sensitive_score_"+str(del_month) # 要删除的键

    former_ts = int(ts) - DAY
    former_date = str(datetime2ts(ts2datetime(former_ts)))
    former_sensitive_key = "sensitive_score_" + former_date

    iter_count = 0
    bulk_action = []

    mappings(ES_SENSITIVE_INDEX)
    total_number = r.hlen(sensitive_string)
    scan_cursor = 0
    print total_number

    while 1:
        re_scan = r.hscan(sensitive_string, scan_cursor, count=1000)
        scan_cursor = re_scan[0]
        if len(re_scan[1]) != 0:
            sensitive_info = re_scan[1] # 字典形式,uid:sensitive_words_dict
            uid_list = sensitive_info.keys()
            sensitive_results = es.mget(index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX, body={"ids":uid_list})['docs']
            if sensitive_results:
                for item in sensitive_results:
                    uid = item['_id']
                    sensitive_words_dict = json.loads(sensitive_info[uid]) # json.loads
                    current_sensitive_score = 0
                    for k,v in sensitive_words_dict.iteritems():
                        tmp_stage = r_sensitive.hget("sensitive_words", k)
                        if tmp_stage:
                            current_sensitive_score += v*sensitive_score_dict[str(tmp_stage)]
                    if item['found']: # 之前存在相关信息
                        revise_item = item["_source"]
                        if del_sensitive_key in revise_item:
                            item.pop(del_sensitive_key)
                        revise_item['uid'] = uid
                        # 新更新的敏感度
                        revise_item[update_sensitive_key] = current_sensitive_score
                        # 新更新的敏感词
                        revise_item[sensitive_dict_key] = sensitive_info[uid]
                        # 新更新的string
                        revise_item[sensitive_string_key] = "&".join(sensitive_words_dict.keys())
                        # 当天和之前一天、一周和一月均值的差异
                        revise_item['sensitive_day_change'] = current_sensitive_score - revise_item.get(former_sensitive_key, 0)
                        revise_item['sensitive_week_change'] = current_sensitive_score - revise_item.get('sensitive_week_ave', 0)
                        revise_item['sensitive_month_change'] = current_sensitive_score - revise_item.get('sensitive_month_ave', 0)
                        # 更新后week、month的均值和方差
                        revise_item['sensitive_week_ave'], revise_item['sensitive_week_var'], revise_item['sensitive_week_sum'] = compute_week(revise_item, now_ts)
                        revise_item['senstiive_month_ave'], revise_item['sensitive_month_var'], revise_item['sensitive_month_sum'] = compute_month(revise_item, now_ts)

                    else:
                        revise_item = dict()
                        revise_item['uid'] = uid
                        revise_item[update_sensitive_key] = current_sensitive_score
                        revise_item[sensitive_dict_key] = sensitive_info[uid]
                        revise_item[sensitive_string_key] = "&".join(sensitive_words_dict.keys())
                        revise_item['sensitive_day_change'] = current_sensitive_score
                        revise_item['sensitive_week_change'] = current_sensitive_score
                        revise_item['sensitive_month_change'] = current_sensitive_score
                        revise_item['sensitive_week_ave'], revise_item['sensitive_week_var'], revise_item['sensitive_week_sum'] = compute_week(revise_item, now_ts)
                        revise_item['senstiive_month_ave'], revise_item['sensitive_month_var'], revise_item['sensitive_month_sum'] = compute_month(revise_item, now_ts)
                    action = {'index':{'_id': uid}}
                    bulk_action.extend([action, revise_item])
                    iter_count += 1
                    if iter_count % 1000 == 0:
                        es.bulk(bulk_action, index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX)
                        bulk_action = []
                        print iter_count
        if int(scan_cursor) == 0:
            break
    if bulk_action:
        es.bulk(bulk_action, index=ES_SENSITIVE_INDEX, doc_type=DOCTYPE_SENSITIVE_INDEX)