def store_bbs_post(url, bbs_user_screen_name, title, content, info_source_id, keyword_id, created_at, read_count, comment_count): try: sql_post = session.query(BBSPost).filter(BBSPost.url == url).first() if not sql_post: sql_post = BBSPost() sql_post.info_source_id = info_source_id sql_post.url = url sql_post.keyword_id = keyword_id sql_post.bbs_user_screen_name = bbs_user_screen_name sql_post.created_at = created_at sql_post.title = title sql_post.content = content sql_post.read_count = read_count sql_post.comment_count = comment_count session.merge(sql_post) #merge session.flush() session.commit() sql_post = session.query(BBSPost).filter(BBSPost.url == url).first() if sql_post: store_category('bbs', str(sql_post.id)) except: print "store bbs post error!"
def add_news_to_session(url, source_name, title, content, info_source_id,
                        created_at, keyword_id):
    """Persist a News row for `url` unless one already exists, then index it.

    Duplicate crawls of the same URL are no-ops (existing rows are left
    untouched).
    """
    # Skip URLs we have already stored.
    if session.query(News).filter(News.url == url).first():
        return
    news = News()
    news.url = url
    news.source_name = source_name
    news.title = title
    news.content = content
    news.info_source_id = info_source_id
    news.keyword_id = keyword_id
    news.created_at = created_at
    session.merge(news)  #merge
    session.flush()
    session.commit()
    # Fetch the committed row to learn its primary key.
    stored = session.query(News).filter(
        News.url == url, News.info_source_id == info_source_id).first()
    if stored:
        store_category('news', str(stored.id))
def update_by_weibo_id(id, origin_id):
    """Refresh repost/comment counters of Status row `id` from the counts API.

    `origin_id` is the weibo's id on the source platform; the first element
    of the API response carries its counters.
    """
    counts = get_count_by_ids(str(origin_id))
    if not counts:
        # API returned nothing for this id; previously this raised IndexError.
        return
    api_status = counts[0]
    sql_status = session.query(Status).get(id)
    if sql_status is None:
        # Row disappeared between crawl and update; previously AttributeError.
        return
    sql_status.repost_count = api_status['transmit_count']
    sql_status.comment_count = api_status['comments_count']
    session.commit()
    store_category('weibo', str(sql_status.id))
def store_by_tudou_video_url(url, keyword_id, title, video_user_screen_name, created_at, playcount):
    """Upsert a VideoPost for a Tudou video and index it.

    Scrapes the video page for its internal item id (iid), then queries
    Tudou's itemSum endpoint for up/down/comment counters. `playcount`
    comes from the caller (search listing).
    """
    sql_post = session.query(VideoPost).filter(VideoPost.url == url).first()
    if not sql_post:
        sql_post = VideoPost()
    sql_post.url = url
    sql_post.title = title
    sql_post.keyword_id = keyword_id
    sql_post.video_user_screen_name = video_user_screen_name
    sql_post.created_at = created_at
    sql_post.info_source_id = TUDOU_INFO_SOURCE_ID
    sql_post.source_name = u"土豆"
    # Browser-like headers so Tudou serves the normal desktop page.
    headers = {
        'Host': 'www.tudou.com',
        #'Referer': 'http://www.soku.com/search_video/',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/536.26.17 (KHTML, like Gecko) Version/6.0.2 Safari/536.26.17',
    }
    req = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(req)
    content = response.read()
    # Internal item id needed by the stats endpoint below.
    iid = getIID(content)
    # stat_url = "http://v.youku.com/v_vpactionInfo/id/" + video_id + "/pm/1?__rt=1&__ro=info_stat"
    stat_url = 'http://www.tudou.com/tvp/itemSum.action?jsoncallback=__TVP_itemSum&iabcdefg=' + iid + '&uabcdefg=0&juabcdefg=&showArea=true&app=1'
    req = urllib2.Request(stat_url, headers=headers)
    response = urllib2.urlopen(req)
    content = response.read()
    # Parse the counters out of the JSONP response.
    up_count = getDigNum(content)
    down_count = getBuryNum(content)
    comment_count = getCommentNum(content)
    sql_post.watch_count = playcount
    sql_post.up_count = int(up_count)
    sql_post.down_count = int(down_count)
    sql_post.comment_count = int(comment_count)
    #print playcount,up_count,down_count,comment_count
    session.merge(sql_post)  #merge
    session.flush()
    session.commit()
    # Re-query to learn the committed row's primary key for indexing.
    sql_post = session.query(VideoPost).filter(VideoPost.url == url).first()
    if sql_post:
        store_category('video', str(sql_post.id))
def update_by_weibo_id(id, origin_id):
    """Refresh the repost/comment counters of Status row `id`.

    Fetches the live status `origin_id` through the API client `t` and
    writes its current counters back to the database row.
    """
    raw = t.statuses_show({'id': str(origin_id)})
    fresh = json.loads(raw)
    row = session.query(Status).get(id)
    row.repost_count = fresh['retweet_count']
    row.comment_count = fresh['comments_count']
    session.commit()
    store_category('weibo', str(row.id))
def store_by_wiki_url(url, comment_count, answered, keyword_id):
    """Upsert a WikiPost for a Soso Wenwen question page and index it.

    Fetches the question page and scrapes asker name, ask time, title and
    body; `comment_count` and `answered` come from the caller (listing
    page). Sleeps 5s afterwards to throttle the crawl.
    """
    sql_post = session.query(WikiPost).filter(WikiPost.url==url).first()
    if not sql_post:
        sql_post = WikiPost()
    sql_post.url = url
    sql_post.keyword_id = keyword_id
    sql_post.info_source_id = SOSO_WENWEN_INFO_SOURCE_ID
    sql_post.comment_count = comment_count
    sql_post.answered = answered
    # Browser-like headers so the site serves the normal desktop page.
    headers = {
        'Host': 'wenwen.soso.com',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/536.26.17 (KHTML, like Gecko) Version/6.0.2 Safari/536.26.17',
    }
    req = urllib2.Request(url, headers = headers)
    response = urllib2.urlopen(req)
    content = response.read()
    soup = BeautifulSoup(content)
    # Anonymous questions carry no user link; fall back to "anonymous".
    wiki_user_screen_name = soup.find('a', attrs={'class':"user_name"})
    if wiki_user_screen_name == None:
        wiki_user_screen_name = u'匿名'
    else:
        wiki_user_screen_name = wiki_user_screen_name.text
    date_str = soup.find('span', attrs={'class':"question_time"}).text
    # NOTE(review): reuses the Baidu date parser on a Soso page — presumably
    # both sites use the same date format; confirm.
    created_at = baidu_date_str_to_datetime(date_str)
    title = soup.find('h3', attrs={'id':"questionTitle"}).text
    # The question body div may be absent (title-only questions).
    content_div = soup.find('div', attrs={'class':"question_con"})
    if content_div is None:
        content = ""
    else:
        content = content_div.text
    sql_post.read_count = 0  # no read counter scraped here; default to 0
    sql_post.wiki_user_screen_name = wiki_user_screen_name
    sql_post.title = title
    sql_post.content = content
    sql_post.created_at = created_at
    session.merge(sql_post)  #merge
    session.flush()
    session.commit()
    # Re-query to learn the committed row's primary key for indexing.
    sql_post = session.query(WikiPost).filter(WikiPost.url==url).first()
    if sql_post:
        #print "stored"
        store_category('wiki', str(sql_post.id))
    time.sleep(5)
def locationId2Str(province_id, city_id):
    """Map numeric province/city ids to display names via the CITIES table.

    Returns {'province': ..., 'city': ...}; ids missing from CITIES fall
    back to the catch-all bucket with an empty city name.
    """
    try:
        cities = CITIES[province_id]
        return {'province':cities['name'], 'city':cities[city_id]}
    except KeyError:
        return {'province':u'其它', 'city':u''}

# Module-level backfill: (re)index every stored Status into the category
# store. NOTE(review): the per-row print looks like leftover debug output.
statuses = session.query(Status)
for row in statuses:
    print "sss"
    store_category('weibo', str(row.id))
def update_by_weibo_id(id, origin_id):
    """Refresh repost/comment counters of Status row `id` from the QQ API.

    API failures are recorded via store_error and logged; they never
    propagate to the caller.
    """
    try:
        fresh = api.tweet.show(origin_id)
        row = session.query(Status).get(id)
        row.repost_count = fresh.count
        row.comment_count = fresh.mcount
        session.commit()
        store_category('weibo', str(row.id))
    except Exception as e:  #APIError
        store_error(QQ_WEIBO_INFO_SOURCE_ID)
        weibo_logger.exception(e)
def update_by_weibo_id(id, origin_id):
    """Refresh repost/comment/attitude counters of Status row `id` from the
    Sina statuses/show API.

    Failures are logged and suppressed (best-effort update).
    """
    try:
        api_status = client.statuses.show.get(id=origin_id)
        sql_status = session.query(Status).get(id)
        sql_status.repost_count = api_status['reposts_count']
        sql_status.comment_count = api_status['comments_count']
        sql_status.attitude_count = api_status['attitudes_count']
        session.commit()
        store_category('weibo', str(sql_status.id))
    except Exception as e:  #APIError
        # Was a silent `pass`, which hid every failure; log it the same way
        # the other weibo updaters do while remaining best-effort.
        weibo_logger.exception(e)
def store_by_video_url(url, keyword_id, title, source_name):
    """Upsert a VideoPost for `url` (aggregate video source) and index it.

    created_at is stamped with the storage time, since the aggregate
    listing carries no publish date.
    """
    post = session.query(VideoPost).filter(VideoPost.url == url).first()
    if post is None:
        post = VideoPost()
    post.url = url
    post.title = title
    post.keyword_id = keyword_id
    post.created_at = datetime.now()
    post.info_source_id = ALL_VIDEO_INFO_SOURCE_ID
    post.source_name = source_name
    session.merge(post)  #merge
    session.flush()
    session.commit()
    # Re-fetch to learn the committed primary key.
    stored = session.query(VideoPost).filter(VideoPost.url == url).first()
    if stored:
        store_category('video', str(stored.id))
def store_blog_post(url, blog_user_screen_name, title, content, info_source_id, keyword_id, created_at, read_count, comment_count):
    """Upsert the BlogPost row keyed by `url` and index it.

    Re-crawls refresh every field, including read/comment counters.
    """
    post = session.query(BlogPost).filter(BlogPost.url == url).first()
    if post is None:
        post = BlogPost()
    post.info_source_id = info_source_id
    post.url = url
    post.keyword_id = keyword_id
    post.blog_user_screen_name = blog_user_screen_name
    post.created_at = created_at
    post.title = title
    post.content = content
    post.read_count = read_count
    post.comment_count = comment_count
    session.merge(post)  #merge
    session.flush()
    session.commit()
    # Re-fetch to learn the committed primary key.
    stored = session.query(BlogPost).filter(BlogPost.url == url).first()
    if stored:
        store_category('blog', str(stored.id))
def store_by_sina_video_url(url, keyword_id, title, video_user_screen_name, created_at, play_count):
    """Upsert a VideoPost for a Sina video and index it."""
    post = session.query(VideoPost).filter(VideoPost.url == url).first()
    if post is None:
        post = VideoPost()
    post.url = url
    post.title = title
    post.keyword_id = keyword_id
    post.video_user_screen_name = video_user_screen_name
    post.info_source_id = SINA_VIDEO_INFO_SOURCE_ID
    post.source_name = u"新浪视频"
    post.created_at = created_at
    post.watch_count = play_count
    session.merge(post)  #merge
    session.flush()
    session.commit()
    # Re-fetch to learn the committed primary key.
    stored = session.query(VideoPost).filter(VideoPost.url == url).first()
    if stored:
        store_category('video', str(stored.id))
def store_by_wiki_url(url, comment_count, keyword_id):
    """Upsert a WikiPost for a Baidu Zhidao question page and index it.

    Scrapes the question page for title, asker, body and answered state,
    and hits Baidu's view-counter endpoint (JSONP) for the read count.
    If the title element is missing the page is treated as blocked/broken:
    the error is logged, the crawler backs off 600s, and nothing is stored.
    Sleeps 10s afterwards to throttle the crawl.
    """
    #print url
    sql_post = session.query(WikiPost).filter(WikiPost.url==url).first()
    if not sql_post:
        sql_post = WikiPost()
    sql_post.url = url
    sql_post.keyword_id = keyword_id
    sql_post.info_source_id = BAIDU_ZHIDAO_INFO_SOURCE_ID
    # Browser-like headers so Baidu serves the normal desktop page.
    headers = {
        'Host': 'zhidao.baidu.com',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/536.26.17 (KHTML, like Gecko) Version/6.0.2 Safari/536.26.17',
    }
    req = urllib2.Request(url, headers = headers)
    response = urllib2.urlopen(req)
    content = response.read()
    soup = BeautifulSoup(content)
    title = soup.find('span', attrs={'class': "ask-title"})
    if title is None:
        # Missing title usually means an anti-crawl page or layout change;
        # back off hard before the next attempt.
        wiki_logger.error("open baidu zhidao url: " + url + " error. can't find title")
        time.sleep(600)
        return
    sql_post.title = title.text
    asker_soup = soup.find('div', attrs={'id': "ask-info"})
    asker = asker_soup.find('a', attrs={'class': "user-name"})
    if asker == None:
        # Anonymous questions have no user link.
        sql_post.wiki_user_screen_name = u'匿名'
    else:
        sql_post.wiki_user_screen_name = asker.text
    # Extract the numeric question id from the URL path.
    # NOTE(review): the magic 25 assumes the fixed prefix length of
    # "http://zhidao.baidu.com/" style URLs — confirm against callers.
    address = url[25:]
    start = address.find('/')
    end = address.find('.html')
    idstr = address[start+1:end]
    # The view counter lives behind a separate JSONP endpoint.
    read_count_url = "http://cp.zhidao.baidu.com/v.php?q=" + idstr + "&callback=bd__cbs__onzolk"
    headers = {
        'Host': 'cp.zhidao.baidu.com',
        'Referer': 'http://zhidao.baidu.com/question/523194115.html',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/536.26.17 (KHTML, like Gecko) Version/6.0.2 Safari/536.26.17',
    }
    req = urllib2.Request(read_count_url, headers = headers)
    response = urllib2.urlopen(req)
    read_count = response.read()
    # Pull the number out of the JSONP wrapper: callback(<count>).
    start = read_count.find("(")
    end = read_count.find(")")
    sql_post.read_count = int(read_count[start+1:end])
    ask_time = soup.find('span', attrs={'class': "grid-r ask-time"})
    sql_post.created_at = wiki_date_str_to_datetime(ask_time.text)
    # Question body may be absent (title-only questions).
    content = soup.find('pre', attrs={'accuse': "qContent"})
    if content is None:
        sql_post.content = ""
    else:
        sql_post.content = content.renderContents()
    # Presence of the answer header span marks the question as answered.
    editor = soup.find('span',
        attrs={'class': "answer-title h2 grid"})
    if editor is None:
        sql_post.answered = False
    else:
        sql_post.answered = True
    sql_post.comment_count = comment_count
    session.merge(sql_post)  #merge
    session.flush()
    session.commit()
    # Re-query to learn the committed row's primary key for indexing.
    sql_post = session.query(WikiPost).filter(WikiPost.url==url).first()
    if sql_post:
        store_category('wiki', str(sql_post.id))
    time.sleep(10)
def add_status_and_user_to_session(status, keyword_id):
    """Upsert a NetEase weibo status and its author, then index the status.

    `status` is the decoded API dict with a nested 'user' dict; rows are
    keyed by origin id + NetEase info source. Statuses lacking a user or
    text are skipped.
    """
    user = status['user']
    if user is None or status['text'] is None:  #Exception
        return
    sql_user = session.query(User).filter(
        User.user_origin_id == str(user['id']),
        User.info_source_id == NETEASE_WEIBO_INFO_SOURCE_ID).first()
    if not sql_user:
        sql_user = User()
    sql_user.user_origin_id = str(user['id'])
    sql_user.info_source_id = NETEASE_WEIBO_INFO_SOURCE_ID
    sql_user.screen_name = user['screen_name']
    sql_user.profile_image_url = user['profile_image_url']
    sql_user.status_count = user['statuses_count']
    sql_user.follower_count = user['followers_count']
    sql_user.following_count = user['friends_count']
    sql_user.verified = user['verified']
    # NetEase encodes gender as '1' = male, '2' = female.
    if user['gender'] == '1':
        sql_user.gender = 'm'
    elif user['gender'] == '2':
        sql_user.gender = 'f'
    else:
        sql_user.gender = 'n'
    location = location_split(user['location'])
    sql_user.geo_info_province = location['province']
    sql_user.geo_info_city = location['city']
    sql_status = session.query(Status).filter(
        Status.weibo_origin_id == status['id'],
        Status.info_source_id == NETEASE_WEIBO_INFO_SOURCE_ID).first()
    if not sql_status:
        sql_status = Status()
    sql_status.weibo_origin_id = int(status['id'])
    # str() both ids: elsewhere this code treats them as possibly ints
    # (str(user['id']), int(status['id'])), and bare concatenation would
    # raise TypeError for an int. str() is a no-op for strings.
    sql_status.url = "http://t.163.com/" + str(user['id']) + "/status/" + str(status['id'])
    sql_status.weibo_user_screen_name = user['screen_name']
    sql_status.keyword_id = keyword_id
    sql_status.info_source_id = NETEASE_WEIBO_INFO_SOURCE_ID
    sql_status.text = status['text']
    sql_status.created_at = weibo_date_str_to_datetime(status['created_at'])
    sql_status.repost_count = status['retweet_count']
    sql_status.comment_count = status['comments_count']
    sql_status.attitude_count = 0  # not provided by this API
    # A reply id marks this as a repost/retweet.
    if status['in_reply_to_status_id'] is not None:
        sql_status.retweeted = True
    else:
        sql_status.retweeted = False
    sql_status.with_pic = False
    sql_status.geo_info_province = location['province']
    sql_status.geo_info_city = location['city']
    sql_status.user = sql_user  #foreign key
    session.merge(sql_status)  #merge
    session.flush()
    session.commit()
    # Re-query to learn the committed row's primary key for indexing.
    sql_status = session.query(Status).filter(
        Status.weibo_origin_id == status['id'],
        Status.info_source_id == NETEASE_WEIBO_INFO_SOURCE_ID).first()
    if sql_status:
        store_category('weibo', str(sql_status.id))
def add_status_and_user_to_session(status, count, keyword_id):
    """Upsert a Sohu weibo status and its author, then index the status.

    `status` is the decoded API dict with a nested 'user' dict; `count`
    carries the transmit/comment counters fetched separately. Statuses with
    no user, no text, or the placeholder text "@#@_@#@" are skipped.
    """
    user = status['user']
    if user is None or status['text'] is None or status[
            'text'] == "@#@_@#@":  #Exception
        return
    sql_user = session.query(User).filter(
        User.user_origin_id == str(user['id']),
        User.info_source_id == SOHU_WEIBO_INFO_SOURCE_ID).first()
    if not sql_user:
        sql_user = User()
    # str() so the stored key matches the str() used in the lookup filter
    # above; previously the raw value was stored, so a non-string id would
    # never be found again on re-crawl.
    sql_user.user_origin_id = str(user['id'])
    sql_user.info_source_id = SOHU_WEIBO_INFO_SOURCE_ID
    sql_user.screen_name = user['screen_name']
    sql_user.profile_image_url = user['profile_image_url']
    sql_user.status_count = user['statuses_count']
    sql_user.follower_count = user['followers_count']
    sql_user.following_count = user['friends_count']
    sql_user.verified = user['verified']
    # No usable gender/location from this source; store neutral defaults.
    sql_user.gender = 'n'
    sql_user.geo_info_province = u'其它'
    sql_user.geo_info_city = u''
    sql_status = session.query(Status).filter(
        Status.weibo_origin_id == status['id'],
        Status.info_source_id == SOHU_WEIBO_INFO_SOURCE_ID).first()
    if not sql_status:
        sql_status = Status()
    sql_status.weibo_origin_id = int(status['id'])
    # str() guards against the API delivering the id as an int.
    sql_status.url = "http://t.sohu.com/m/" + str(status['id'])
    sql_status.weibo_user_screen_name = user['screen_name']
    sql_status.keyword_id = keyword_id
    sql_status.info_source_id = SOHU_WEIBO_INFO_SOURCE_ID
    sql_status.text = status['text']
    sql_status.created_at = weibo_date_str_to_datetime(status['created_at'])
    sql_status.repost_count = count['transmit_count']
    sql_status.comment_count = count['comments_count']
    sql_status.attitude_count = 0
    # `in` instead of the deprecated dict.has_key().
    if 'retweeted_status' in status:
        sql_status.retweeted = True
    else:
        sql_status.retweeted = False
    if status['middle_pic'] != '':
        sql_status.with_pic = True
        sql_status.pic_address = status['middle_pic']
    else:
        sql_status.with_pic = False
    sql_status.geo_info_province = u'其它'
    sql_status.geo_info_city = u''
    sql_status.user = sql_user  #foreign key
    session.merge(sql_status)  #merge
    session.flush()
    session.commit()
    # Re-query to learn the committed row's primary key for indexing.
    sql_status = session.query(Status).filter(
        Status.weibo_origin_id == status['id'],
        Status.info_source_id == SOHU_WEIBO_INFO_SOURCE_ID).first()
    if sql_status:
        store_category('weibo', str(sql_status.id))
def store_by_video_url(url, keyword_id, title, video_user_screen_name, created_at):
    """Upsert a VideoPost for a Youku video and index it.

    Scrapes the watch page for the internal videoId, then scrapes the
    stats page for watch/up/down/comment (and optional repost) counters.
    Returns silently when no videoId can be found on the page.
    """
    sql_post = session.query(VideoPost).filter(VideoPost.url == url).first()
    if not sql_post:
        sql_post = VideoPost()
    sql_post.url = url
    sql_post.title = title
    sql_post.keyword_id = keyword_id
    sql_post.video_user_screen_name = video_user_screen_name
    sql_post.created_at = created_at
    sql_post.info_source_id = YOUKU_INFO_SOURCE_ID
    sql_post.source_name = u"优酷"
    #sql_post.watch_count = watch_count
    # Browser-like headers so Youku serves the normal desktop page.
    headers = {
        'Host': 'v.youku.com',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/536.26.17 (KHTML, like Gecko) Version/6.0.2 Safari/536.26.17',
    }
    req = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(req)
    # Scan the page source for the line assigning videoId = '<id>';
    video_id = ''
    for line in response.readlines():
        if line.find('videoId') > 0:
            start = line.find("= '")
            end = line.find("';")
            video_id = line[start + 3:end]
            break
    if video_id == '':
        # Page layout changed or video removed; nothing we can do.
        return
    stat_url = "http://v.youku.com/v_vpactionInfo/id/" + video_id + "/pm/1?__rt=1&__ro=info_stat"
    req = urllib2.Request(stat_url, headers=headers)
    response = urllib2.urlopen(req)
    content = response.read()
    soup = BeautifulSoup(content)
    # Counters appear as a fixed sequence of span.num elements; strip the
    # thousands separators before converting.
    nums = soup.findAll('span', attrs={'class': "num"})
    sql_post.watch_count = int(nums[0].text.replace(',', ''))
    # Second span holds "up / down" combined.
    up_down = nums[1].text.replace(',', '').split(' / ')
    sql_post.up_count = int(up_down[0])
    sql_post.down_count = int(up_down[1])
    sql_post.comment_count = int(nums[3].text.replace(',', ''))
    # The repost counter is not always present; best-effort with 0 fallback.
    try:
        sql_post.repost_count = int(nums[4].text.replace(',', ''))
    except:
        sql_post.repost_count = 0
    #print sql_post.watch_count, sql_post.up_count, sql_post.down_count, sql_post.comment_count
    session.merge(sql_post)  #merge
    session.flush()
    session.commit()
    # Re-query to learn the committed row's primary key for indexing.
    sql_post = session.query(VideoPost).filter(VideoPost.url == url).first()
    if sql_post:
        store_category('video', str(sql_post.id))
def add_status_and_user_to_session(status, keyword_id):
    """Upsert a QQ weibo status and its author, then index the status.

    `status` is an API object with attribute access (not a dict); the full
    author profile is fetched separately via api.user.userinfo. Users are
    keyed by account name, statuses by origin id + QQ info source.
    """
    user = api.user.userinfo(status.name)
    sql_user = session.query(User).filter(
        User.user_origin_id == user.name,
        User.info_source_id == QQ_WEIBO_INFO_SOURCE_ID).first()
    if not sql_user:
        sql_user = User()
    sql_user.user_origin_id = user.name
    sql_user.info_source_id = QQ_WEIBO_INFO_SOURCE_ID
    sql_user.screen_name = user.nick
    # '/100' selects the 100px avatar variant.
    if user.head == "":
        sql_user.profile_image_url = ""
    else:
        sql_user.profile_image_url = user.head + '/100'
    sql_user.status_count = user.tweetnum
    sql_user.follower_count = user.fansnum
    sql_user.following_count = user.idolnum
    sql_user.verified = user.isvip
    # QQ encodes sex as 1 = male, 2 = female.
    if user.sex == 1:
        sql_user.gender = 'm'
    elif user.sex == 2:
        sql_user.gender = 'f'
    else:
        sql_user.gender = 'n'
    location = location_split(user.location)
    sql_user.geo_info_province = location['province']
    sql_user.geo_info_city = location['city']
    sql_status = session.query(Status).filter(
        Status.weibo_origin_id == status.id,
        Status.info_source_id == QQ_WEIBO_INFO_SOURCE_ID).first()
    if not sql_status:
        sql_status = Status()
    sql_status.weibo_origin_id = status.id
    sql_status.url = "http://t.qq.com/p/t/" + str(status.id)
    sql_status.weibo_user_screen_name = user.nick
    sql_status.keyword_id = keyword_id
    sql_status.info_source_id = QQ_WEIBO_INFO_SOURCE_ID
    sql_status.text = status.origtext
    sql_status.created_at = datetime.fromtimestamp(status.timestamp)
    sql_status.repost_count = status.count
    sql_status.comment_count = status.mcount
    sql_status.attitude_count = 0  # not provided by this API
    # NOTE(review): type 1 presumably means an original post; anything else
    # is treated as a repost — confirm against the QQ API docs.
    if status.type != 1:
        sql_status.retweeted = True
    else:
        sql_status.retweeted = False
    if status.image is None:
        sql_status.with_pic = False
    else:
        sql_status.with_pic = True
        sql_status.pic_address = status.image[0]
    sql_status.geo_info_province = location['province']
    sql_status.geo_info_city = location['city']
    sql_status.user = sql_user  #foreign key
    session.merge(sql_status)  #merge
    session.flush()
    session.commit()
    # Re-query to learn the committed row's primary key for indexing.
    sql_status = session.query(Status).filter(
        Status.weibo_origin_id == status.id,
        Status.info_source_id == QQ_WEIBO_INFO_SOURCE_ID).first()
    if sql_status:
        store_category('weibo', str(sql_status.id))
def add_status_and_user_to_session(status, keyword_id):
    """Upsert a Sina-search status and its author, then index the status.

    `status` is the decoded API dict with a nested 'user' dict; rows are
    keyed by origin id + search info source. Statuses lacking a user or
    text are skipped.
    """
    user = status['user']
    if user is None or status['text'] is None:  #Exception
        return
    uid = str(user['id'])
    sql_user = session.query(User).filter(
        User.user_origin_id == uid,
        User.info_source_id == SEARCH_INFO_SOURCE_ID).first()
    if not sql_user:
        sql_user = User()
    sql_user.user_origin_id = uid
    sql_user.info_source_id = SEARCH_INFO_SOURCE_ID
    sql_user.screen_name = user['screen_name']
    sql_user.profile_image_url = user['profile_image_url']
    sql_user.status_count = user['statuses_count']
    sql_user.follower_count = user['followers_count']
    sql_user.following_count = user['friends_count']
    sql_user.verified = user['verified']
    sql_user.gender = user['gender']
    location = locationId2Str(user['province'], user['city'])
    sql_user.geo_info_province = location['province']
    sql_user.geo_info_city = location['city']
    sql_status = session.query(Status).filter(
        Status.weibo_origin_id == status['id'],
        Status.info_source_id == SEARCH_INFO_SOURCE_ID).first()
    if not sql_status:
        sql_status = Status()
    sql_status.weibo_origin_id = status['id']
    sql_status.url = "http://weibo.com/" + uid + "/" + id2mid(status['idstr'])
    sql_status.weibo_user_screen_name = user['screen_name']
    sql_status.keyword_id = keyword_id
    sql_status.info_source_id = SEARCH_INFO_SOURCE_ID
    sql_status.text = status['text']
    sql_status.created_at = weibo_date_str_to_datetime(status['created_at'])
    sql_status.repost_count = status['reposts_count']
    sql_status.comment_count = status['comments_count']
    sql_status.attitude_count = status['attitudes_count']
    sql_status.retweeted = 'retweeted_status' in status
    if 'thumbnail_pic' in status:
        sql_status.with_pic = True
        sql_status.pic_address = status['thumbnail_pic']
    else:
        sql_status.with_pic = False
    sql_status.geo_info_province = location['province']
    sql_status.geo_info_city = location['city']
    sql_status.user = sql_user  #foreign key
    session.merge(sql_status)  #merge
    session.flush()
    session.commit()
    # Re-query to learn the committed row's primary key for indexing.
    stored = session.query(Status).filter(
        Status.weibo_origin_id == status['id'],
        Status.info_source_id == SEARCH_INFO_SOURCE_ID).first()
    if stored:
        store_category('weibo', str(stored.id))