def stat_user(search_time=None, force_update=False):
    """Compute and store per-user posting statistics for one day.

    For every user listed in the ``Content`` record of ``search_time``, an
    entry ``user.stat_info[search_time]`` is written with:

    * ``send_count`` -- number of posts stored for that user on that day;
    * ``create_at``  -- for each post that has a ``created_at`` value, the
      text before the first ':' of its fourth space-separated field
      (presumably the hour of posting -- confirm against the API format).

    Args:
        search_time: Date key of the ``Content`` record, e.g. "2013-5-7".
            Defaults to today's date when empty.
        force_update: Recompute the statistics even if the user already
            has an entry for ``search_time``.

    Returns:
        None. Returns early when no ``Content`` record exists for the day.
    """
    if not search_time:
        # The original read an undefined ``now_time`` here (NameError).
        now_time = datetime.datetime.now()
        # NOTE(review): month/day are not zero-padded here; confirm this
        # matches the keys get_content stores under (it uses the day field
        # of created_at verbatim).
        search_time = "%s-%s-%s" % (now_time.year, now_time.month,
                                    now_time.day)

    content = Content.get(search_time)
    if not content:
        return

    # content.weibo maps category -> {user id -> [posts]}.
    for types, ids in content.weibo.iteritems():
        for user_id, context in ids.iteritems():
            user = Weibo.get(user_id)

            # Skip users that already have statistics for this day unless
            # a refresh was requested.
            if user.stat_info.get(search_time) and not force_update:
                continue

            create_at = []
            for tmp_content in context:
                created = tmp_content.get('created_at')
                if not created:
                    continue
                print("%s %s" % (types, created))
                create_at.append(created.split(' ')[3].split(':')[0])

            user.stat_info[search_time] = {
                'send_count': len(context),
                'create_at': create_at,
            }
            user.put()
def stat_user(search_time=None, force_update=False):
    """Compute and store per-user posting statistics for one day.

    For every user listed in the ``Content`` record of ``search_time``, an
    entry ``user.stat_info[search_time]`` is written with:

    * ``send_count`` -- number of posts stored for that user on that day;
    * ``create_at``  -- for each post that has a ``created_at`` value, the
      text before the first ':' of its fourth space-separated field
      (presumably the hour of posting -- confirm against the API format).

    Args:
        search_time: Date key of the ``Content`` record, e.g. "2013-5-7".
            Defaults to today's date when empty.
        force_update: Recompute the statistics even if the user already
            has an entry for ``search_time``.

    Returns:
        None. Returns early when no ``Content`` record exists for the day.
    """
    if not search_time:
        # The original read an undefined ``now_time`` here (NameError).
        now_time = datetime.datetime.now()
        # NOTE(review): month/day are not zero-padded here; confirm this
        # matches the keys get_content stores under (it uses the day field
        # of created_at verbatim).
        search_time = "%s-%s-%s" % (now_time.year, now_time.month,
                                    now_time.day)

    content = Content.get(search_time)
    if not content:
        return

    # content.weibo maps category -> {user id -> [posts]}.
    for types, ids in content.weibo.iteritems():
        for user_id, context in ids.iteritems():
            user = Weibo.get(user_id)

            # Skip users that already have statistics for this day unless
            # a refresh was requested.
            if user.stat_info.get(search_time) and not force_update:
                continue

            create_at = []
            for tmp_content in context:
                created = tmp_content.get('created_at')
                if not created:
                    continue
                print("%s %s" % (types, created))
                create_at.append(created.split(' ')[3].split(':')[0])

            user.stat_info[search_time] = {
                'send_count': len(context),
                'create_at': create_at,
            }
            user.put()
#-*- coding: utf-8 -*-
def get_content(weibo_type, user_id, debug=False, count=200,
                force_update=False, content_type=0):
    """Fetch a user's Weibo timeline, filter it and store the posts by date.

    Posts without a picture, posts that look like advertisements and posts
    of five characters or fewer are dropped. The remaining posts are grouped
    by a date key built from ``created_at`` and merged into the ``Content``
    record of that date under ``weibo[weibo_type][user_id]``.

    Args:
        weibo_type: Category name the posts are filed under (the original
            notes say it must be an already existing category).
        user_id: Weibo user id (the numeric id, not the screen name).
        debug: When true, nothing is written to the database.
        count: Number of timeline entries requested from the API.
        force_update: Replace the user's stored posts for each fetched date
            instead of appending only the new ones.
        content_type: API ``feature`` filter: 0 all, 1 original, 2 picture,
            3 video, 4 music.

    Returns:
        None.
    """
    content_dict = {}
    result = client.statuses.user_timeline.get(
        uid=user_id, count=count, feature=content_type)
    contents = dict(result)

    for s_item in contents['statuses']:
        retweeted = s_item.get('retweeted_status') or {}

        # A repost carries its picture on the retweeted status; posts with
        # no picture at all are skipped.
        if not s_item.get('original_pic'):
            if not retweeted.get('original_pic'):
                continue
            s_item['original_pic'] = retweeted['original_pic']

        text = s_item['text']

        # Heuristic advertisement filter: links, price tags, shopping
        # phrases, mentions/hashtags and heavy punctuation.
        if ("http://" in text or "包邮" in text
                or "去评论中找链接哦" in text
                or "www." in text or re.findall('[0-9]元', text)
                or text.count(" ★") >= 3 or text.count("(") >= 3
                or text.count(":") > 5 or text.count("【") > 2
                or text.count("、") > 5 or '@' in text
                or '#' in text):
            continue

        # Drop very short posts. Checked before any network access below so
        # that discarded posts cost no extra requests.
        text_size = len(text.decode('utf-8'))
        if text_size <= 5:
            continue

        # GIF pictures: record the file size in MB (one decimal place),
        # taken from the Content-Length header.
        if '.gif' in s_item.get('original_pic', ''):
            response = urllib.urlopen(url=s_item['original_pic'])
            try:
                file_size = dict(response.headers).get('content-length')
            finally:
                # Only the headers are needed; release the connection.
                response.close()
            if file_size:
                file_size = float(file_size) / 1000.0 / 1000.0
                file_size = decimal.Decimal(file_size).quantize(
                    decimal.Decimal('0.0'))
                s_item['file_size'] = file_size

        # Video timelines: extract the video link and its segment count.
        if content_type in [3, '3']:
            # NOTE(review): the ad filter above already skips posts whose
            # own text contains 'http://', so in practice only the
            # retweeted text can match here.
            video_text = None
            if 'http://' in text:
                video_text = text
            elif 'http://' in (retweeted.get('text') or ''):
                video_text = retweeted['text']

            # The original indexed an undefined name ``b`` and assumed a
            # link was always present; posts without one are now kept
            # without video fields.
            if video_text is not None:
                video_index = video_text.index('http')
                s_item['video_url'] = video_text[video_index:].split(' ')[0]
                video_count = utils.get_video_count(s_item['video_url'])
                s_item['video_count'] = video_count
                print("%s %s" % (s_item['video_url'], video_count))

        # Picture-size probing is disabled, so 'width'/'height' below are
        # only set if the API item already carries them.

        # Date key: <last field>-<time_dict[second field]>-<third field>
        # of the space-separated created_at string.
        created_at = s_item['created_at'].split(' ')
        time_str = (created_at[-1] + "-" + str(time_dict[created_at[1]])
                    + '-' + created_at[2])

        author = s_item.get('user') or {}
        need_data = {
            'id': s_item['id'],
            'screen_name': weibo_user.userids[int(user_id)],
            'type': weibo_type,
            'text': text,
            'bmiddle_pic': s_item.get('bmiddle_pic'),
            'original_pic': s_item.get('original_pic'),
            'thumbnail_pic': s_item.get('thumbnail_pic'),
            'reposts_count': s_item.get('reposts_count'),
            'comments_count': s_item.get('comments_count'),
            'attitudes_count': s_item.get('attitudes_count'),
            'mlevel': s_item.get('mlevel'),
            'width': s_item.get('width'),
            'height': s_item.get('height'),
            'text_size': text_size,
            'created_at': s_item['created_at'],
            'file_size': s_item.get('file_size'),
            'video_url': s_item.get('video_url'),
            'avatar_large': author.get('avatar_large'),
            'profile_image_url': author.get('profile_image_url'),
        }

        # content_dict[date][user id] = [post, post, ...]
        content_dict.setdefault(time_str, {}).setdefault(
            user_id, []).append(need_data)

    # Merge each date bucket into its Content record.
    for k, v in content_dict.iteritems():
        cont_obj = Content.get(k)
        if not cont_obj:
            cont_obj = Content._install(k)

        if weibo_type not in cont_obj.weibo:
            # First posts of this category for the date.
            cont_obj.weibo[weibo_type] = v
        else:
            stored = cont_obj.weibo[weibo_type]
            for u_id, item_value in v.iteritems():
                if u_id not in stored or force_update:
                    # New user for this date, or a forced refresh.
                    stored[u_id] = item_value
                else:
                    # Append only posts that are not stored yet.
                    now_ids = set(va['id'] for va in stored[u_id])
                    for cont in item_value:
                        if cont['id'] not in now_ids:
                            stored[u_id].append(cont)
                            now_ids.add(cont['id'])

        if not debug:
            a = time.time()
            cont_obj.put()
            print("%s %s" % ('result', time.time() - a))
def get_content(weibo_type, user_id, debug=False, count=200,
                force_update=False, content_type=0):
    """Fetch a user's Weibo timeline, filter it and store the posts by date.

    Posts without a picture, posts that look like advertisements and posts
    of five characters or fewer are dropped. The remaining posts are grouped
    by a date key built from ``created_at`` and merged into the ``Content``
    record of that date under ``weibo[weibo_type][user_id]``.

    Args:
        weibo_type: Category name the posts are filed under (the original
            notes say it must be an already existing category).
        user_id: Weibo user id (the numeric id, not the screen name).
        debug: When true, nothing is written to the database.
        count: Number of timeline entries requested from the API.
        force_update: Replace the user's stored posts for each fetched date
            instead of appending only the new ones.
        content_type: API ``feature`` filter: 0 all, 1 original, 2 picture,
            3 video, 4 music.

    Returns:
        None.
    """
    content_dict = {}
    result = client.statuses.user_timeline.get(
        uid=user_id, count=count, feature=content_type)
    contents = dict(result)

    for s_item in contents['statuses']:
        retweeted = s_item.get('retweeted_status') or {}

        # A repost carries its picture on the retweeted status; posts with
        # no picture at all are skipped.
        if not s_item.get('original_pic'):
            if not retweeted.get('original_pic'):
                continue
            s_item['original_pic'] = retweeted['original_pic']

        text = s_item['text']

        # Heuristic advertisement filter: links, price tags, shopping
        # phrases, mentions/hashtags and heavy punctuation.
        if ("http://" in text or "包邮" in text
                or "去评论中找链接哦" in text
                or "www." in text or re.findall('[0-9]元', text)
                or text.count(" ★") >= 3 or text.count("(") >= 3
                or text.count(":") > 5 or text.count("【") > 2
                or text.count("、") > 5 or '@' in text
                or '#' in text):
            continue

        # Drop very short posts. Checked before any network access below so
        # that discarded posts cost no extra requests.
        text_size = len(text.decode('utf-8'))
        if text_size <= 5:
            continue

        # GIF pictures: record the file size in MB (one decimal place),
        # taken from the Content-Length header.
        if '.gif' in s_item.get('original_pic', ''):
            response = urllib.urlopen(url=s_item['original_pic'])
            try:
                file_size = dict(response.headers).get('content-length')
            finally:
                # Only the headers are needed; release the connection.
                response.close()
            if file_size:
                file_size = float(file_size) / 1000.0 / 1000.0
                file_size = decimal.Decimal(file_size).quantize(
                    decimal.Decimal('0.0'))
                s_item['file_size'] = file_size

        # Video timelines: extract the video link and its segment count.
        if content_type in [3, '3']:
            # NOTE(review): the ad filter above already skips posts whose
            # own text contains 'http://', so in practice only the
            # retweeted text can match here.
            video_text = None
            if 'http://' in text:
                video_text = text
            elif 'http://' in (retweeted.get('text') or ''):
                video_text = retweeted['text']

            # The original indexed an undefined name ``b`` and assumed a
            # link was always present; posts without one are now kept
            # without video fields.
            if video_text is not None:
                video_index = video_text.index('http')
                s_item['video_url'] = video_text[video_index:].split(' ')[0]
                video_count = utils.get_video_count(s_item['video_url'])
                s_item['video_count'] = video_count
                print("%s %s" % (s_item['video_url'], video_count))

        # Picture-size probing is disabled, so 'width'/'height' below are
        # only set if the API item already carries them.

        # Date key: <last field>-<time_dict[second field]>-<third field>
        # of the space-separated created_at string.
        created_at = s_item['created_at'].split(' ')
        time_str = (created_at[-1] + "-" + str(time_dict[created_at[1]])
                    + '-' + created_at[2])

        author = s_item.get('user') or {}
        need_data = {
            'id': s_item['id'],
            'screen_name': weibo_user.userids[int(user_id)],
            'type': weibo_type,
            'text': text,
            'bmiddle_pic': s_item.get('bmiddle_pic'),
            'original_pic': s_item.get('original_pic'),
            'thumbnail_pic': s_item.get('thumbnail_pic'),
            'reposts_count': s_item.get('reposts_count'),
            'comments_count': s_item.get('comments_count'),
            'attitudes_count': s_item.get('attitudes_count'),
            'mlevel': s_item.get('mlevel'),
            'width': s_item.get('width'),
            'height': s_item.get('height'),
            'text_size': text_size,
            'created_at': s_item['created_at'],
            'file_size': s_item.get('file_size'),
            'video_url': s_item.get('video_url'),
            'avatar_large': author.get('avatar_large'),
            'profile_image_url': author.get('profile_image_url'),
        }

        # content_dict[date][user id] = [post, post, ...]
        content_dict.setdefault(time_str, {}).setdefault(
            user_id, []).append(need_data)

    # Merge each date bucket into its Content record.
    for k, v in content_dict.iteritems():
        cont_obj = Content.get(k)
        if not cont_obj:
            cont_obj = Content._install(k)

        if weibo_type not in cont_obj.weibo:
            # First posts of this category for the date.
            cont_obj.weibo[weibo_type] = v
        else:
            stored = cont_obj.weibo[weibo_type]
            for u_id, item_value in v.iteritems():
                if u_id not in stored or force_update:
                    # New user for this date, or a forced refresh.
                    stored[u_id] = item_value
                else:
                    # Append only posts that are not stored yet.
                    now_ids = set(va['id'] for va in stored[u_id])
                    for cont in item_value:
                        if cont['id'] not in now_ids:
                            stored[u_id].append(cont)
                            now_ids.add(cont['id'])

        if not debug:
            a = time.time()
            cont_obj.put()
            print("%s %s" % ('result', time.time() - a))