def export_data():
    """Export per-cohort monthly activity stats to CSV.

    For each registration-month cohort (Jan-Dec 2012) and each tracked
    month (Mar-Oct 2012), writes one row per user: read activity flag,
    write activity flag, friend count, and the number of feed items the
    user's friends produced inside the tracked month.
    """
    dau_log = {}
    populate_DAU(datetime.datetime(2012, 2, 20), datetime.datetime(2012, 11, 1), dau_log)
    writer = CommonCsvWriter('stat_HARDCODE-1024')
    writer.write_header([u'注册月份', u'追踪月份', u'has_read', u'has_write', u'friends count', 'feed'])
    for reg_month in range(1, 13):
        cohort = get_target_users_by_month(reg_month)
        for track_month in range(3, 11):
            window_start = datetime.datetime(2012, track_month, 1)
            window_end = datetime.datetime(2012, track_month + 1, 1)
            start_ts = stat_util.convert_datetime_to_timestamp(window_start)
            end_ts = stat_util.convert_datetime_to_timestamp(window_end)
            # last local day of the tracked month (8h shift aligns to UTC+8)
            month_last_day = window_end - datetime.timedelta(days=1, hours=8)
            for member in cohort:
                uid = member.id
                read_flag = 'YES' if is_readactivity(uid, window_start, window_end, dau_log) else 'NO'
                write_flag = 'YES' if is_writeactivity(uid, window_start, window_end) else 'NO'
                friends = get_friends_and_counts(uid, month_last_day)
                # total feed items (types 1/2/3/5/7) produced by friends in the window
                seen_feed = sum(
                    crab.user_post[fid].find(
                        R.type.in_([1, 2, 3, 5, 7])
                        & (R.created_on >= start_ts)
                        & (R.created_on < end_ts)).count()
                    for fid in friends)
                writer.write_onerow([reg_month, track_month, read_flag, write_flag, len(friends), seen_feed])
    writer.end_write()
def export_csv(uids):
    """Export per-user check-in rows (Sep 30 - Oct 30, 2012) plus an
    aid/guid summary body built from the module-level ``aid_data`` map.

    For each uid, writes one CSV row per location-tagged post in the
    window: user info, check-in date/location/city, post body, photo
    link, and Sina Weibo binding info.

    NOTE(review): this function is shadowed by a later ``export_csv``
    definition in this module — confirm which one callers intend.
    """
    csw = CommonCsvWriter(filename='./output/stat_HARDCORE-983')
    csw.write_header([u'aid','guid','guid_name','chechin No','view','click','share'])
    csv_body = []
    # Window boundaries shifted so day edges fall on local (UTC+8) midnight.
    start_date = datetime.datetime(2012,9,30) - datetime.timedelta(hours = 8)
    end_date = datetime.datetime(2012,10,30) + datetime.timedelta(hours = 16)
    start = stat_util.convert_datetime_to_timestamp(start_date)
    end = stat_util.convert_datetime_to_timestamp(end_date)
    for uid in uids:
        sina = utils.get_weibo_info(uid)
        friends_count = get_f_count(uid)
        user = user_get(uid)
        user_name = user['name']
        user_point = user['points_total']
        checkin_cond = ((R.created_on >= start) & (R.created_on < end)
                        & (R.location_id > 0))
        # Hoisted out of the loop: the original recomputed this count on the
        # first iteration of the cursor below; the value is identical.
        check_in_count = crab.user_post[uid].find(checkin_cond).count()
        for p in crab.user_post[uid].find(checkin_cond):
            created_on = p['created_on']
            checkin_date = datetime.datetime.fromtimestamp(created_on)
            post = db_slave.post.find_one({'_id':p['post_id']})
            loc = location_get(post['l'], 'basic')
            loc_name = loc['name']
            # BUG FIX: the original did `city = city_get(...) or 'N/A'` and
            # then unconditionally took city['name'] — indexing the string
            # 'N/A' with 'name' raises TypeError when the lookup fails.
            city = city_get(p['city'])
            city = city['name'] if city else 'N/A'
            body = post['b']
            photo = db.photo.find_one({'p':p['post_id']})
            has_photo = 'YES' if photo else 'NO'
            photo_link = get_photo_url(photo) + '?size=500&style=1&quality=high' if photo else 'N/A'
            sina_weibo = 'YES' if sina else 'N/A'
            sina_name = sina['screen_name'] if sina else 'N/A'
            sina_url = 'http://weibo.com/u/%d' % sina['id'] if sina else 'N/A'
            csw.write_onerow([user_name,uid,user_point,checkin_date,loc_name,check_in_count,friends_count,city,body,has_photo,photo_link , sina_weibo,sina_name ,sina_url])
    # Summary section keyed by aid -> list of {guid: (view, click, share)} dicts.
    keylist = aid_data.keys()
    keylist.sort()
    for key in keylist:
        aid = key
        datas = aid_data.get(key)
        for data in datas:
            d_key_list = data.keys()
            d_key_list.sort()
            for d_key in d_key_list:
                guid = d_key
                checkin_c = get_post_count_by_guid(guid)
                loc_name = get_location(guid)
                num = data.get(d_key)
                csv_body.append([aid,guid,loc_name,checkin_c,num[0],num[1],num[2]])
    # NOTE(review): csv_body is accumulated but never written — the
    # write_body call was commented out in the original; left as-is.
    #csw.write_body(csv_body)
    csw.end_write()
def get_post_count_by_guid(guid): end = stat_util.convert_datetime_to_timestamp(datetime.datetime(2012,11,1) - datetime.timedelta(hours = 8)) start = stat_util.convert_datetime_to_timestamp(datetime.datetime(2012,9,30) - datetime.timedelta(hours = 8)) r = crab.location_post[guid_to_int(guid)].find((R.type.in_([1,3,7,10])) & (R.created_on > start)& (R.created_on < end)).count() #r += crab.location_post[guid_to_int(guid)].find( R.type==7& R.created_on > start& R.created_on < end).count() #r += crab.location_post[guid_to_int(guid)].find( R.type==10& R.created_on > start& R.created_on < end).count() print 'get_post_count_by_guid',guid,r return r
def get_user_posts(uid,start_date,end_date,all_posts): crab_cond = [R.privacy != 1] #非【仅自己可见】 crab_cond.append(R.location_id) if start_date: start_timestamp = stat_util.convert_datetime_to_timestamp(start_date) crab_cond.append(R.created_on >= start_timestamp) if end_date: end_timestamp = stat_util.convert_datetime_to_timestamp(end_date) crab_cond.append(R.created_on <= end_timestamp) posts_by_month = [[] for r in range(0,12)] print crab.user_post[uid].find(*crab_cond).count() for crab_post in crab.user_post[uid].find(*crab_cond): pid = crab_post['post_id'] #mongo_post = db_slave.post.find_one({'_id':pid}) #"nl"= number of likes;"nc" = number of comments mongo_post = None if all_posts and all_posts.get(pid): mongo_post = all_posts.get(pid) #mongo_post = db_slave.post.find_one({'_id':pid}) if not mongo_post: mongo_post = db_slave.post.find_one({'_id':pid}) if not mongo_post: continue guid = mongo_post.get('l') if __is_virtual_loc(guid): continue has_photo = 1 if crab_post['has_photo'] == -1 else 0 if mongo_post and not mongo_post.get('del'): created_on = mongo_post.get('c') week_idx = (created_on + datetime.timedelta(hours=8)).month - 1 total_feedback = (mongo_post.get('nc') or 0 )+ (mongo_post.get('nl') or 0) mongo_post['total_feedback'] = total_feedback mongo_post['has_photo'] = has_photo posts_by_month[week_idx].append(mongo_post) # sort fetched posts for idx in range(0,12): posts_by_month[idx] = sorted(posts_by_month[idx],key=itemgetter('has_photo','total_feedback'),reverse=True) # sort in DESC #posts_by_month[idx] = sorted(posts_by_month[idx],key=lambda post: post.get('has_photo')) posts_by_month[idx] = posts_by_month[idx][:3] # truncate number of elements to 3 posts_by_month[idx] = sorted(posts_by_month[idx],key=itemgetter('c')) # sort by created_on in ASC #print posts_by_month return posts_by_month
def stat_user_lists(target_guids):
    """Return the set of user ids who posted at any of *target_guids*
    within the last 60 days (measured from UTC now)."""
    cutoff = stat_util.convert_datetime_to_timestamp(
        datetime.datetime.utcnow() - datetime.timedelta(days=60))
    visitors = set()
    for guid in target_guids:
        rows = crab.location_post[guid_to_int(guid)].find(
            R.created_on > cutoff).group(R.user_id)
        visitors.update(row["user_id"] for row in rows)
    return visitors
def is_writeactivity(uid, start_month, end_month):
    """Return True if the user created at least one post in
    [start_month, end_month), with both bounds shifted back 8h so they
    align to local (UTC+8) midnights.
    """
    start = stat_util.convert_datetime_to_timestamp(start_month - datetime.timedelta(hours=8))
    end = stat_util.convert_datetime_to_timestamp(end_month - datetime.timedelta(hours=8))
    # BUG FIX: the original had `.count()` inside find(), applied to the
    # (R.created_on < end) condition object, and then compared the raw
    # cursor with 0 — the count now applies to the query result.
    return crab.user_post[uid].find(
        (R.created_on >= start) & (R.created_on < end)).count() > 0
def create_cate_data_monthly():
    # Recompute each user's top-3 visited location categories per month and
    # persist them into stat_journal_2012.mc. Only users that already have
    # an 'mc' field are processed, and only for December (month 12) — see
    # the hard-coded start_month/end_month below.
    start_month = 12
    end_month = 12
    guid_cate = fetch_all_loc_cate()  # {short_guid: category dict with 'id'}
    user_cate = {}
    exists_count = 0  # users that already carry an 'mc' field
    cursor = db_slave.stat_journal_2012.find({},{'_id':1,'mc':1},timeout = False)
    try:
        for r in cursor:
            if 'mc' not in r:
                continue
            old_mc = r['mc']
            exists_count += 1
            uid = r['_id']
            user_cate[uid] = [[]for r in range(0,12)]
            for month in range(start_month,end_month + 1):
                # Month window in local (UTC+8) time; step down from 31 days
                # until end_date still falls in the same calendar month.
                start_date = datetime.datetime(2012,month,1) - datetime.timedelta(hours = 8)
                end_date = start_date + datetime.timedelta(days = 31)
                if end_date.month != month:
                    end_date = start_date + datetime.timedelta(days = 30)
                if end_date.month != month:
                    end_date = start_date + datetime.timedelta(days = 29)
                start_timestamp = stat_util.convert_datetime_to_timestamp(start_date)
                end_timestamp = stat_util.convert_datetime_to_timestamp(end_date)
                cate_count_dict = {}  # {category id: visit count this month}
                for p in crab.user_post[uid].find( (R.privacy != 1) & R.location_id & (R.created_on >= start_timestamp) & (R.created_on <= end_timestamp)).group(R.location_id):
                    loc_id = p['location_id']
                    #pid = p['post_id']
                    #mongo_post = db_slave.post.find_one({'_id':pid}) or {}
                    #guid = mongo_post.get('l')
                    # Derive the guid directly from the integer location id
                    # instead of a per-post mongo lookup (see commented code).
                    guid ='%x' % loc_id
                    if not guid:
                        continue
                    # NOTE(review): '%x' emits no leading zeros — confirm the
                    # first 14 hex chars still line up with guid_cate keys.
                    short_guid = guid[:14]
                    short_guid = short_guid.upper()
                    #print short_guid
                    #categories = db_slave.locations_categories_2.find_one({'_id':short_guid}) or {}
                    #cate = categories.get('cat')[0] if 'cat' in categories and len(categories['cat']) > 0 else ''
                    cate = guid_cate.get(short_guid)
                    if not cate:
                        continue
                    cate_id = cate.get('id') or None
                    if not cate_id:
                        continue
                    if cate_id not in cate_count_dict:
                        cate_count_dict[cate_id] = 0
                    cate_count_dict[cate_id] = cate_count_dict[cate_id] + 1
                frequently_cate = []
                if cate_count_dict:
                    #find 3 gone frequently categories
                    # Repeatedly extract the max-count category (destroying
                    # cate_count_dict) until 3 are collected or it runs out.
                    while True:
                        if len(frequently_cate) > 2 or len(frequently_cate) > len(cate_count_dict):
                            break
                        max_cate ,max_count = max(cate_count_dict.iteritems(),key = lambda cate:cate[1])
                        del cate_count_dict[max_cate]
                        #frequently_cate[max_cate] = max_count
                        frequently_cate.append({'cid':max_cate,'c':max_count})
                    #print cate_count_dict
                user_cate[uid][month - 1] = frequently_cate
                #max_locations = []
                #while True:
                #    if len(max_locations) >= 3
            print uid
            # Persist only when December produced data. If the stored mc is
            # not already a 12-slot list, replace it wholesale; otherwise
            # update only the December slot (mc.11) in place.
            if user_cate[uid][11]:
                if type(old_mc) != list:
                    db.stat_journal_2012.update({'_id': uid}, {'$set': {'mc':user_cate[uid]}})
                else:
                    db.stat_journal_2012.update({'_id': uid}, {'$set': {'mc.11':user_cate[uid][11]}})
    finally:
        # NOTE(review): exists_count counts users that already HAD 'mc', so
        # "newly appended" in this message is misleading.
        print 'newly appended user count is ',exists_count
        cursor.close()
def export_csv(guids): csw = CommonCsvWriter(filename="./output/stat_HARDCORE-990-1") csw.write_header([u"aid", "guid", "guid_name", "chechin No", "view", "click", "share"]) csv_body = [] start_time = datetime.datetime(2012, 6, 14) - datetime.timedelta(hours=8) start = stat_util.convert_datetime_to_timestamp(start_time) end_time = datetime.datetime(2012, 11, 14) + datetime.timedelta(hours=16) end = stat_util.convert_datetime_to_timestamp(end_time) for guid in guids: _checkin_count = crab.location_post[guid_to_int(guid)].find(R.type.in_([1, 3, 7, 10])).count() _user_count = crab.location_post[guid_to_int(guid)].find(R.type.in_([1, 3, 7, 10])).group(R.user_id).count() _photo_count = crab.location_post[guid_to_int(guid)].find((R.has_photo) & R.type.in_([1, 3, 7, 10])).count() checkin_count = ( crab.location_post[guid_to_int(guid)] .find(R.type.in_([1, 3, 7, 10]) & (R.created_on >= start) & (R.created_on < end)) .count() ) user_count = ( crab.location_post[guid_to_int(guid)] .find(R.type.in_([1, 3, 7, 10]) & (R.created_on >= start) & (R.created_on < end)) .group(R.user_id) .count() ) photo_count = ( crab.location_post[guid_to_int(guid)] .find(R.type.in_([1, 3, 7, 10]) & (R.has_photo) & (R.created_on >= start) & (R.created_on < end)) .count() ) # pid = crab.location_post[int(guid,16)].find(R.type.in_([1,3,7,10]))[0]['post_id'] location = location_get(guid) if location: # row = [guid,location['name'],checkin_count,user_count,photo_count,' ',_checkin_count,_user_count,_photo_count] row = [ guid, location["name"], _checkin_count, _user_count, _photo_count, " ", checkin_count, user_count, photo_count, ] print row csw.write_onerow(row) csw.end_write() csw = CommonCsvWriter(filename="./output/stat_HARDCORE-990-2") csw.write_header([u"aid", "guid", "guid_name", "chechin No", "view", "click", "share"]) csv_body = [] start_date = datetime.datetime(2012, 6, 14) end_date = datetime.datetime(2012, 11, 14) for i in range(0, (end_date - start_date).days + 1): _day = start_date + 
datetime.timedelta(days=i) start_time = _day - datetime.timedelta(hours=8) end_time = _day + datetime.timedelta(hours=16) start = stat_util.convert_datetime_to_timestamp(start_time) end = stat_util.convert_datetime_to_timestamp(end_time) checkin_count = 0 user_count = 0 photo_count = 0 for guid in guids: checkin_count += ( crab.location_post[guid_to_int(guid)] .find(R.type.in_([1, 3, 7, 10]) & (R.created_on >= start) & (R.created_on < end)) .count() ) user_count += ( crab.location_post[guid_to_int(guid)] .find(R.type.in_([1, 3, 7, 10]) & (R.created_on >= start) & (R.created_on < end)) .group(R.user_id) .count() ) photo_count += ( crab.location_post[guid_to_int(guid)] .find(R.type.in_([1, 3, 7, 10]) & (R.has_photo) & (R.created_on >= start) & (R.created_on < end)) .count() ) row = [_day, checkin_count, user_count, photo_count] print row csw.write_onerow(row) csw.end_write()