Пример #1
0
def store_bbs_post(url, bbs_user_screen_name, title, content, info_source_id,
                   keyword_id, created_at, read_count, comment_count):
    """Insert or update the BBSPost row identified by ``url`` and categorize it.

    The row is looked up by URL; when none exists a new ``BBSPost`` is
    created.  Every field is overwritten with the supplied value, the change
    is committed, and ``store_category('bbs', <row id>)`` is called for the
    stored row.

    This is a best-effort store: ordinary errors are reported on stdout and
    are not re-raised.
    """
    try:
        sql_post = session.query(BBSPost).filter(BBSPost.url == url).first()
        if not sql_post:
            sql_post = BBSPost()

        sql_post.info_source_id = info_source_id
        sql_post.url = url
        sql_post.keyword_id = keyword_id
        sql_post.bbs_user_screen_name = bbs_user_screen_name
        sql_post.created_at = created_at
        sql_post.title = title
        sql_post.content = content
        sql_post.read_count = read_count
        sql_post.comment_count = comment_count

        session.merge(sql_post)

        session.flush()
        session.commit()

        # Query the row again after the commit and use that instance's id
        # for categorization.
        sql_post = session.query(BBSPost).filter(BBSPost.url == url).first()
        if sql_post:
            store_category('bbs', str(sql_post.id))
    except Exception:
        # Catch ordinary errors only, so KeyboardInterrupt/SystemExit still
        # propagate.  Roll back so a failed flush/commit does not leave the
        # shared session unusable for the next call.
        print("store bbs post error!")
        session.rollback()
Пример #2
0
def add_news_to_session(url, source_name, title, content, info_source_id,
                        created_at, keyword_id):
    """Store a News row for ``url`` unless one exists, then categorize it.

    If a News row with the same URL is already present, nothing is done.
    Otherwise a new row is filled from the arguments, merged and committed,
    and ``store_category('news', <row id>)`` is called for the row found by
    URL and info source.
    """
    # Guard clause: an already stored URL is left untouched.
    if session.query(News).filter(News.url == url).first():
        return

    news = News()
    news.url = url
    news.source_name = source_name
    news.title = title
    news.content = content
    news.info_source_id = info_source_id
    news.keyword_id = keyword_id
    news.created_at = created_at

    session.merge(news)
    session.flush()
    session.commit()

    stored = session.query(News).filter(
        News.url == url, News.info_source_id == info_source_id).first()
    if stored:
        store_category('news', str(stored.id))
Пример #3
0
def update_by_weibo_id(id, origin_id):
    """Refresh the repost/comment counters of one stored Status row.

    ``origin_id`` is passed (as a string) to ``get_count_by_ids``; the first
    element of its result supplies ``'transmit_count'`` and
    ``'comments_count'``.  ``id`` is the primary key of the Status row to
    update.  The change is committed and the row is re-categorized.
    """
    counts = get_count_by_ids(str(origin_id))[0]
    row = session.query(Status).get(id)

    row.repost_count = counts['transmit_count']
    row.comment_count = counts['comments_count']
    session.commit()

    store_category('weibo', str(row.id))
Пример #4
0
def store_by_tudou_video_url(url, keyword_id, title, video_user_screen_name,
                             created_at, playcount):
    """Upsert the VideoPost for a Tudou video URL and refresh its counters.

    The page at ``url`` is downloaded and handed to ``getIID``; the returned
    item id is used to build the ``itemSum.action`` statistics URL.  The
    statistics response is parsed with ``getDigNum``, ``getBuryNum`` and
    ``getCommentNum`` to fill the up, down and comment counts, while
    ``playcount`` is stored as the watch count.  The row is then merged,
    committed and passed to ``store_category('video', <row id>)``.

    Network and parsing errors propagate to the caller.
    """
    sql_post = session.query(VideoPost).filter(VideoPost.url == url).first()
    if not sql_post:
        sql_post = VideoPost()

    sql_post.url = url
    sql_post.title = title
    sql_post.keyword_id = keyword_id
    sql_post.video_user_screen_name = video_user_screen_name
    sql_post.created_at = created_at
    sql_post.info_source_id = TUDOU_INFO_SOURCE_ID
    sql_post.source_name = u"土豆"

    headers = {
        'Host':
        'www.tudou.com',
        'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/536.26.17 (KHTML, like Gecko) Version/6.0.2 Safari/536.26.17',
    }

    def _fetch(address):
        # Download ``address`` and always close the response afterwards.
        response = urllib2.urlopen(urllib2.Request(address, headers=headers))
        try:
            return response.read()
        finally:
            response.close()

    iid = getIID(_fetch(url))

    stat_url = 'http://www.tudou.com/tvp/itemSum.action?jsoncallback=__TVP_itemSum&iabcdefg=' + iid + '&uabcdefg=0&juabcdefg=&showArea=true&app=1'
    content = _fetch(stat_url)

    up_count = getDigNum(content)
    down_count = getBuryNum(content)
    comment_count = getCommentNum(content)

    sql_post.watch_count = playcount
    sql_post.up_count = int(up_count)
    sql_post.down_count = int(down_count)
    sql_post.comment_count = int(comment_count)

    session.merge(sql_post)

    session.flush()
    session.commit()

    # Query the row again after the commit and use that instance's id.
    sql_post = session.query(VideoPost).filter(VideoPost.url == url).first()

    if sql_post:
        store_category('video', str(sql_post.id))
Пример #5
0
def update_by_weibo_id(id, origin_id):
    """Refresh the repost/comment counters of one stored Status row.

    The status identified by ``origin_id`` is fetched through
    ``t.statuses_show`` and decoded from JSON; its ``'retweet_count'`` and
    ``'comments_count'`` values are written to the Status row whose primary
    key is ``id``.  The change is committed and the row is re-categorized.
    """
    remote = json.loads(t.statuses_show({'id': str(origin_id)}))
    row = session.query(Status).get(id)

    row.repost_count = remote['retweet_count']
    row.comment_count = remote['comments_count']
    session.commit()

    store_category('weibo', str(row.id))
Пример #6
0
def store_by_wiki_url(url, comment_count, answered, keyword_id):
    """Scrape a question page at ``url`` and upsert it as a WikiPost.

    The page is downloaded and parsed with BeautifulSoup to extract the
    asker name (falling back to an "anonymous" label when the ``user_name``
    link is absent), the question time, the title and the question body
    (empty when the ``question_con`` div is absent).  ``comment_count`` and
    ``answered`` are stored as given and ``read_count`` is set to 0.  The
    row is merged, committed and passed to ``store_category('wiki', ...)``.

    The function sleeps for 5 seconds before returning.  Network errors
    propagate to the caller.
    """
    sql_post = session.query(WikiPost).filter(WikiPost.url == url).first()
    if not sql_post:
        sql_post = WikiPost()

    sql_post.url = url

    sql_post.keyword_id = keyword_id
    sql_post.info_source_id = SOSO_WENWEN_INFO_SOURCE_ID
    sql_post.comment_count = comment_count
    sql_post.answered = answered

    headers = {
        'Host': 'wenwen.soso.com',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/536.26.17 (KHTML, like Gecko) Version/6.0.2 Safari/536.26.17',
    }

    response = urllib2.urlopen(urllib2.Request(url, headers=headers))
    try:
        content = response.read()
    finally:
        # Always close the response, even if reading fails.
        response.close()

    soup = BeautifulSoup(content)

    user_link = soup.find('a', attrs={'class': "user_name"})
    if user_link is None:
        wiki_user_screen_name = u'匿名'
    else:
        wiki_user_screen_name = user_link.text

    # NOTE(review): the time and title lookups are not guarded; if either
    # element is missing, ``.text`` raises AttributeError.
    date_str = soup.find('span', attrs={'class': "question_time"}).text
    created_at = baidu_date_str_to_datetime(date_str)
    title = soup.find('h3', attrs={'id': "questionTitle"}).text

    content_div = soup.find('div', attrs={'class': "question_con"})
    if content_div is None:
        content = ""
    else:
        content = content_div.text

    sql_post.read_count = 0
    sql_post.wiki_user_screen_name = wiki_user_screen_name
    sql_post.title = title
    sql_post.content = content
    sql_post.created_at = created_at

    session.merge(sql_post)

    session.flush()
    session.commit()

    # Query the row again after the commit and use that instance's id.
    sql_post = session.query(WikiPost).filter(WikiPost.url == url).first()
    if sql_post:
        store_category('wiki', str(sql_post.id))

    time.sleep(5)
Пример #7
0
def locationId2Str(province_id, city_id):
    """Translate a province/city id pair into display names.

    Looks up ``CITIES[province_id]`` and returns a dict whose ``'province'``
    is that entry's ``'name'`` and whose ``'city'`` is the value stored under
    ``city_id``.  If any of these keys is missing, a fallback dict with the
    "other" province label and an empty city is returned.
    """
    try:
        province = CITIES[province_id]
        return {'province': province['name'], 'city': province[city_id]}
    except KeyError:
        return {'province': u'其它', 'city': u''}
Пример #8
0
def update_by_weibo_id(id, origin_id):
    """Refresh the repost/comment counters of one stored Status row.

    The status identified by ``origin_id`` is fetched with
    ``api.tweet.show``; its ``count`` and ``mcount`` attributes are written
    to the Status row whose primary key is ``id``.  The change is committed
    and the row is re-categorized.

    Any ordinary error is recorded via ``store_error`` and logged with its
    traceback; it is not re-raised.
    """
    try:
        api_status = api.tweet.show(origin_id)
        sql_status = session.query(Status).get(id)

        sql_status.repost_count = api_status.count
        sql_status.comment_count = api_status.mcount

        session.commit()

        store_category('weibo', str(sql_status.id))

    except Exception as e:  # e.g. an API error
        # Discard half-applied changes and end any failed transaction before
        # the error is recorded.
        session.rollback()
        store_error(QQ_WEIBO_INFO_SOURCE_ID)
        weibo_logger.exception(e)
Пример #9
0
def update_by_weibo_id(id, origin_id):
    """Refresh the repost/comment/attitude counters of one stored Status row.

    The status identified by ``origin_id`` is fetched with
    ``client.statuses.show.get``; its ``'reposts_count'``,
    ``'comments_count'`` and ``'attitudes_count'`` values are written to the
    Status row whose primary key is ``id``.  The change is committed and the
    row is re-categorized.

    This is a best-effort refresh: ordinary errors are logged and the
    session is rolled back, but nothing is re-raised.
    """
    try:
        api_status = client.statuses.show.get(id=origin_id)
        sql_status = session.query(Status).get(id)

        sql_status.repost_count = api_status['reposts_count']
        sql_status.comment_count = api_status['comments_count']
        sql_status.attitude_count = api_status['attitudes_count']

        session.commit()

        store_category('weibo', str(sql_status.id))

    except Exception:  # e.g. an API error
        import logging

        # Still swallow the error, but log it and drop any partially applied
        # counter changes so a later commit cannot persist them.
        logging.getLogger(__name__).exception(
            "update_by_weibo_id failed (id=%s, origin_id=%s)", id, origin_id)
        session.rollback()
Пример #10
0
def store_by_video_url(url, keyword_id, title, source_name):
    """Upsert a VideoPost row for ``url`` and categorize it.

    An existing row with the same URL is reused, otherwise a new one is
    created.  ``created_at`` is set to the current local time and the info
    source to ``ALL_VIDEO_INFO_SOURCE_ID``.  After the commit the row is
    passed to ``store_category('video', <row id>)``.
    """
    def _find_by_url():
        return session.query(VideoPost).filter(VideoPost.url == url).first()

    video = _find_by_url() or VideoPost()

    video.url = url
    video.title = title
    video.keyword_id = keyword_id
    video.created_at = datetime.now()
    video.info_source_id = ALL_VIDEO_INFO_SOURCE_ID
    video.source_name = source_name

    session.merge(video)
    session.flush()
    session.commit()

    stored = _find_by_url()
    if stored:
        store_category('video', str(stored.id))
Пример #11
0
def store_blog_post(url, blog_user_screen_name, title, content, info_source_id,
                    keyword_id, created_at, read_count, comment_count):
    """Insert or update the BlogPost row identified by ``url`` and categorize it.

    An existing row with the same URL is reused, otherwise a new one is
    created.  Every field is overwritten with the supplied value, the change
    is committed, and ``store_category('blog', <row id>)`` is called.
    """
    def _find_by_url():
        return session.query(BlogPost).filter(BlogPost.url == url).first()

    post = _find_by_url() or BlogPost()

    # Assign the columns in a fixed order.
    for column, value in (
            ('info_source_id', info_source_id),
            ('url', url),
            ('keyword_id', keyword_id),
            ('blog_user_screen_name', blog_user_screen_name),
            ('created_at', created_at),
            ('title', title),
            ('content', content),
            ('read_count', read_count),
            ('comment_count', comment_count)):
        setattr(post, column, value)

    session.merge(post)
    session.flush()
    session.commit()

    stored = _find_by_url()
    if stored:
        store_category('blog', str(stored.id))
Пример #12
0
def store_by_sina_video_url(url, keyword_id, title, video_user_screen_name,
                            created_at, play_count):
    """Upsert the VideoPost row for a Sina video URL and categorize it.

    An existing row with the same URL is reused, otherwise a new one is
    created.  The info source is set to ``SINA_VIDEO_INFO_SOURCE_ID`` and
    ``play_count`` is stored as the watch count.  After the commit the row
    is passed to ``store_category('video', <row id>)``.
    """
    def _find_by_url():
        return session.query(VideoPost).filter(VideoPost.url == url).first()

    video = _find_by_url() or VideoPost()

    # Assign the columns in a fixed order.
    for column, value in (
            ('url', url),
            ('title', title),
            ('keyword_id', keyword_id),
            ('video_user_screen_name', video_user_screen_name),
            ('info_source_id', SINA_VIDEO_INFO_SOURCE_ID),
            ('source_name', u"新浪视频"),
            ('created_at', created_at),
            ('watch_count', play_count)):
        setattr(video, column, value)

    session.merge(video)
    session.flush()
    session.commit()

    stored = _find_by_url()
    if stored:
        store_category('video', str(stored.id))
Пример #13
0
def store_by_wiki_url(url, comment_count, keyword_id):
    """Scrape a Baidu Zhidao question page and upsert it as a WikiPost.

    The page at ``url`` is downloaded and parsed for the title, asker name,
    ask time, question body and whether an answer section is present.  The
    read count comes from a second request to ``cp.zhidao.baidu.com`` using
    the question id taken from the URL.  The row is merged, committed and
    passed to ``store_category('wiki', <row id>)``.

    If the title element is missing, an error is logged, the function sleeps
    for 600 seconds and returns without storing anything.  On the normal
    path it sleeps for 10 seconds before returning.  Network errors
    propagate to the caller.
    """
    def _fetch(address, request_headers):
        # Download ``address`` and always close the response afterwards.
        response = urllib2.urlopen(
            urllib2.Request(address, headers=request_headers))
        try:
            return response.read()
        finally:
            response.close()

    sql_post = session.query(WikiPost).filter(WikiPost.url == url).first()
    if not sql_post:
        sql_post = WikiPost()

    sql_post.url = url

    sql_post.keyword_id = keyword_id
    sql_post.info_source_id = BAIDU_ZHIDAO_INFO_SOURCE_ID

    headers = {
        'Host': 'zhidao.baidu.com',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/536.26.17 (KHTML, like Gecko) Version/6.0.2 Safari/536.26.17',
    }

    soup = BeautifulSoup(_fetch(url, headers))

    title = soup.find('span', attrs={'class': "ask-title"})
    if title is None:
        wiki_logger.error("open baidu zhidao url: " + url + " error. can't find title")
        time.sleep(600)
        return

    sql_post.title = title.text

    # A missing "ask-info" block is treated like a missing user link instead
    # of raising AttributeError.
    asker_soup = soup.find('div', attrs={'id': "ask-info"})
    asker = None
    if asker_soup is not None:
        asker = asker_soup.find('a', attrs={'class': "user-name"})
    if asker is None:
        sql_post.wiki_user_screen_name = u'匿名'
    else:
        sql_post.wiki_user_screen_name = asker.text

    # Question id: drop the first 25 characters of the URL, then take the
    # text between the next '/' and '.html'.
    address = url[25:]
    start = address.find('/')
    end = address.find('.html')
    idstr = address[start + 1:end]
    read_count_url = "http://cp.zhidao.baidu.com/v.php?q=" + idstr + "&callback=bd__cbs__onzolk"
    stat_headers = {
        'Host': 'cp.zhidao.baidu.com',
        'Referer': 'http://zhidao.baidu.com/question/523194115.html',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/536.26.17 (KHTML, like Gecko) Version/6.0.2 Safari/536.26.17',
    }

    # The count is the text between the first '(' and ')' of the response.
    read_count = _fetch(read_count_url, stat_headers)
    start = read_count.find("(")
    end = read_count.find(")")
    sql_post.read_count = int(read_count[start + 1:end])

    # NOTE(review): the ask-time lookup is not guarded; a page without it
    # raises AttributeError here.
    ask_time = soup.find('span', attrs={'class': "grid-r ask-time"})
    sql_post.created_at = wiki_date_str_to_datetime(ask_time.text)

    content = soup.find('pre', attrs={'accuse': "qContent"})
    if content is None:
        sql_post.content = ""
    else:
        sql_post.content = content.renderContents()

    # Answered means the answer-title span is present on the page.
    editor = soup.find('span', attrs={'class': "answer-title h2 grid"})
    sql_post.answered = editor is not None

    sql_post.comment_count = comment_count

    session.merge(sql_post)

    session.flush()
    session.commit()

    # Query the row again after the commit and use that instance's id.
    sql_post = session.query(WikiPost).filter(WikiPost.url == url).first()
    if sql_post:
        store_category('wiki', str(sql_post.id))

    time.sleep(10)
Пример #14
0
def add_status_and_user_to_session(status, keyword_id):
    """Upsert one status and its author, then categorize it.

    ``status`` is a mapping with an embedded ``'user'`` mapping.  Nothing is
    stored when the user or the status text is ``None``.  Otherwise the User
    and Status rows are looked up by origin id and
    ``NETEASE_WEIBO_INFO_SOURCE_ID`` (created when absent), filled from the
    payload, merged and committed; the stored status is then passed to
    ``store_category('weibo', <row id>)``.
    """
    user = status['user']
    if user is None or status['text'] is None:
        return

    user_row = session.query(User).filter(
        User.user_origin_id == str(user['id']),
        User.info_source_id == NETEASE_WEIBO_INFO_SOURCE_ID).first() or User()

    user_row.user_origin_id = str(user['id'])
    user_row.info_source_id = NETEASE_WEIBO_INFO_SOURCE_ID
    user_row.screen_name = user['screen_name']
    user_row.profile_image_url = user['profile_image_url']
    user_row.status_count = user['statuses_count']
    user_row.follower_count = user['followers_count']
    user_row.following_count = user['friends_count']
    user_row.verified = user['verified']
    # Gender code '1' maps to 'm', '2' to 'f', anything else to 'n'.
    user_row.gender = ('m' if user['gender'] == '1'
                       else 'f' if user['gender'] == '2'
                       else 'n')
    location = location_split(user['location'])
    user_row.geo_info_province = location['province']
    user_row.geo_info_city = location['city']

    status_row = session.query(Status).filter(
        Status.weibo_origin_id == status['id'],
        Status.info_source_id == NETEASE_WEIBO_INFO_SOURCE_ID).first() or Status()

    status_row.weibo_origin_id = int(status['id'])
    status_row.url = "http://t.163.com/" + user['id'] + "/status/" + status[
        'id']
    status_row.weibo_user_screen_name = user['screen_name']
    status_row.keyword_id = keyword_id
    status_row.info_source_id = NETEASE_WEIBO_INFO_SOURCE_ID
    status_row.text = status['text']
    status_row.created_at = weibo_date_str_to_datetime(status['created_at'])
    status_row.repost_count = status['retweet_count']
    status_row.comment_count = status['comments_count']
    status_row.attitude_count = 0
    # Flagged as retweeted when the status is a reply to another status.
    status_row.retweeted = status['in_reply_to_status_id'] is not None

    status_row.with_pic = False

    status_row.geo_info_province = location['province']
    status_row.geo_info_city = location['city']

    status_row.user = user_row  # relationship to the author row

    session.merge(status_row)
    session.flush()
    session.commit()

    stored = session.query(Status).filter(
        Status.weibo_origin_id == status['id'],
        Status.info_source_id == NETEASE_WEIBO_INFO_SOURCE_ID).first()
    if stored:
        store_category('weibo', str(stored.id))
Пример #15
0
def add_status_and_user_to_session(status, count, keyword_id):
    """Upsert one status and its author, then categorize it.

    ``status`` is a mapping with an embedded ``'user'`` mapping; ``count``
    supplies ``'transmit_count'`` and ``'comments_count'``.  Nothing is
    stored when the user or text is ``None`` or the text equals the
    ``"@#@_@#@"`` marker.  Otherwise the User and Status rows are looked up
    by origin id and ``SOHU_WEIBO_INFO_SOURCE_ID`` (created when absent),
    filled from the payload, merged and committed; the stored status is then
    passed to ``store_category('weibo', <row id>)``.
    """
    user = status['user']
    if user is None or status['text'] is None or status[
            'text'] == "@#@_@#@":
        return

    user_origin_id = str(user['id'])

    sql_user = session.query(User).filter(
        User.user_origin_id == user_origin_id,
        User.info_source_id == SOHU_WEIBO_INFO_SOURCE_ID).first()
    if not sql_user:
        sql_user = User()

    # Store the id in the same string form the lookup above filters on.
    sql_user.user_origin_id = user_origin_id
    sql_user.info_source_id = SOHU_WEIBO_INFO_SOURCE_ID
    sql_user.screen_name = user['screen_name']
    sql_user.profile_image_url = user['profile_image_url']
    sql_user.status_count = user['statuses_count']
    sql_user.follower_count = user['followers_count']
    sql_user.following_count = user['friends_count']
    sql_user.verified = user['verified']
    # Gender and location are not read from the payload; fixed placeholders
    # are stored instead.
    sql_user.gender = 'n'
    sql_user.geo_info_province = u'其它'
    sql_user.geo_info_city = u''

    sql_status = session.query(Status).filter(
        Status.weibo_origin_id == status['id'],
        Status.info_source_id == SOHU_WEIBO_INFO_SOURCE_ID).first()
    if not sql_status:
        sql_status = Status()

    sql_status.weibo_origin_id = int(status['id'])
    sql_status.url = "http://t.sohu.com/m/" + status['id']
    sql_status.weibo_user_screen_name = user['screen_name']
    sql_status.keyword_id = keyword_id
    sql_status.info_source_id = SOHU_WEIBO_INFO_SOURCE_ID
    sql_status.text = status['text']
    sql_status.created_at = weibo_date_str_to_datetime(status['created_at'])
    sql_status.repost_count = count['transmit_count']
    sql_status.comment_count = count['comments_count']
    sql_status.attitude_count = 0
    # ``in`` replaces dict.has_key(), which no longer exists on Python 3.
    sql_status.retweeted = 'retweeted_status' in status

    if status['middle_pic'] != '':
        sql_status.with_pic = True
        sql_status.pic_address = status['middle_pic']
    else:
        sql_status.with_pic = False

    sql_status.geo_info_province = u'其它'
    sql_status.geo_info_city = u''

    sql_status.user = sql_user  # relationship to the author row

    session.merge(sql_status)

    session.flush()
    session.commit()

    # Query the row again after the commit and use that instance's id.
    sql_status = session.query(Status).filter(
        Status.weibo_origin_id == status['id'],
        Status.info_source_id == SOHU_WEIBO_INFO_SOURCE_ID).first()
    if sql_status:
        store_category('weibo', str(sql_status.id))
Пример #16
0
def store_by_video_url(url, keyword_id, title, video_user_screen_name,
                       created_at):
    """Upsert the VideoPost for a Youku video URL and refresh its counters.

    The page at ``url`` is scanned for the first line containing
    ``videoId``; the quoted value on that line is the video id.  If no id is
    found the function returns without storing anything.  Otherwise the
    ``v_vpactionInfo`` statistics page is downloaded and its ``span.num``
    elements supply the watch, up/down, comment and repost counts.  The row
    is merged, committed and passed to ``store_category('video', ...)``.

    Network errors and unexpected statistics markup propagate to the caller,
    except for a missing or malformed repost count, which is stored as 0.
    """
    sql_post = session.query(VideoPost).filter(VideoPost.url == url).first()
    if not sql_post:
        sql_post = VideoPost()

    sql_post.url = url
    sql_post.title = title
    sql_post.keyword_id = keyword_id
    sql_post.video_user_screen_name = video_user_screen_name
    sql_post.created_at = created_at
    sql_post.info_source_id = YOUKU_INFO_SOURCE_ID
    sql_post.source_name = u"优酷"

    headers = {
        'Host':
        'v.youku.com',
        'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/536.26.17 (KHTML, like Gecko) Version/6.0.2 Safari/536.26.17',
    }

    video_id = ''
    response = urllib2.urlopen(urllib2.Request(url, headers=headers))
    try:
        for line in response.readlines():
            # ``in`` also matches at column 0, which ``find(...) > 0`` missed.
            if 'videoId' in line:
                start = line.find("= '")
                end = line.find("';")
                video_id = line[start + 3:end]
                break
    finally:
        response.close()

    if video_id == '':
        return

    stat_url = "http://v.youku.com/v_vpactionInfo/id/" + video_id + "/pm/1?__rt=1&__ro=info_stat"

    response = urllib2.urlopen(urllib2.Request(stat_url, headers=headers))
    try:
        content = response.read()
    finally:
        response.close()

    soup = BeautifulSoup(content)

    # Counters are read by position from the "num" spans; commas are
    # stripped before converting to int.
    nums = soup.findAll('span', attrs={'class': "num"})
    sql_post.watch_count = int(nums[0].text.replace(',', ''))

    up_down = nums[1].text.replace(',', '').split(' / ')
    sql_post.up_count = int(up_down[0])
    sql_post.down_count = int(up_down[1])

    sql_post.comment_count = int(nums[3].text.replace(',', ''))

    try:
        sql_post.repost_count = int(nums[4].text.replace(',', ''))
    except (IndexError, ValueError):
        # The fifth span is absent or not numeric: store 0.
        sql_post.repost_count = 0

    session.merge(sql_post)

    session.flush()
    session.commit()

    # Query the row again after the commit and use that instance's id.
    sql_post = session.query(VideoPost).filter(VideoPost.url == url).first()
    if sql_post:
        store_category('video', str(sql_post.id))
Пример #17
0
def add_status_and_user_to_session(status, keyword_id):
    """Upsert one status and its author, then categorize it.

    The author profile is fetched with ``api.user.userinfo(status.name)``.
    The User and Status rows are looked up by origin id and
    ``QQ_WEIBO_INFO_SOURCE_ID`` (created when absent), filled from the
    profile and status objects, merged and committed; the stored status is
    then passed to ``store_category('weibo', <row id>)``.
    """
    user = api.user.userinfo(status.name)

    user_row = session.query(User).filter(
        User.user_origin_id == user.name,
        User.info_source_id == QQ_WEIBO_INFO_SOURCE_ID).first() or User()

    user_row.user_origin_id = user.name
    user_row.info_source_id = QQ_WEIBO_INFO_SOURCE_ID
    user_row.screen_name = user.nick
    # A non-empty avatar URL gets the '/100' suffix appended.
    user_row.profile_image_url = "" if user.head == "" else user.head + '/100'
    user_row.status_count = user.tweetnum
    user_row.follower_count = user.fansnum
    user_row.following_count = user.idolnum
    user_row.verified = user.isvip
    # Sex code 1 maps to 'm', 2 to 'f', anything else to 'n'.
    user_row.gender = ('m' if user.sex == 1
                       else 'f' if user.sex == 2
                       else 'n')
    location = location_split(user.location)
    user_row.geo_info_province = location['province']
    user_row.geo_info_city = location['city']

    status_row = session.query(Status).filter(
        Status.weibo_origin_id == status.id,
        Status.info_source_id == QQ_WEIBO_INFO_SOURCE_ID).first() or Status()

    status_row.weibo_origin_id = status.id
    status_row.url = "http://t.qq.com/p/t/" + str(status.id)
    status_row.weibo_user_screen_name = user.nick
    status_row.keyword_id = keyword_id
    status_row.info_source_id = QQ_WEIBO_INFO_SOURCE_ID
    status_row.text = status.origtext
    status_row.created_at = datetime.fromtimestamp(status.timestamp)
    status_row.repost_count = status.count
    status_row.comment_count = status.mcount
    status_row.attitude_count = 0

    # Any type other than 1 is flagged as retweeted.
    status_row.retweeted = True if status.type != 1 else False

    has_picture = status.image is not None
    status_row.with_pic = has_picture
    if has_picture:
        status_row.pic_address = status.image[0]

    status_row.geo_info_province = location['province']
    status_row.geo_info_city = location['city']

    status_row.user = user_row  # relationship to the author row

    session.merge(status_row)
    session.flush()
    session.commit()

    stored = session.query(Status).filter(
        Status.weibo_origin_id == status.id,
        Status.info_source_id == QQ_WEIBO_INFO_SOURCE_ID).first()
    if stored:
        store_category('weibo', str(stored.id))
Пример #18
0
def add_status_and_user_to_session(status, keyword_id):
    """Upsert one status and its author, then categorize it.

    ``status`` is a mapping with an embedded ``'user'`` mapping.  Nothing is
    stored when the user or the status text is ``None``.  Otherwise the User
    and Status rows are looked up by origin id and ``SEARCH_INFO_SOURCE_ID``
    (created when absent), filled from the payload, merged and committed;
    the stored status is then passed to ``store_category('weibo', ...)``.
    """
    user = status['user']
    if user is None or status['text'] is None:
        return

    sql_user = session.query(User).filter(
        User.user_origin_id == str(user['id']),
        User.info_source_id == SEARCH_INFO_SOURCE_ID).first()
    if not sql_user:
        sql_user = User()

    sql_user.user_origin_id = str(user['id'])
    sql_user.info_source_id = SEARCH_INFO_SOURCE_ID
    sql_user.screen_name = user['screen_name']
    sql_user.profile_image_url = user['profile_image_url']
    sql_user.status_count = user['statuses_count']
    sql_user.follower_count = user['followers_count']
    sql_user.following_count = user['friends_count']
    sql_user.verified = user['verified']
    sql_user.gender = user['gender']
    location = locationId2Str(user['province'], user['city'])
    sql_user.geo_info_province = location['province']
    sql_user.geo_info_city = location['city']

    sql_status = session.query(Status).filter(
        Status.weibo_origin_id == status['id'],
        Status.info_source_id == SEARCH_INFO_SOURCE_ID).first()
    if not sql_status:
        sql_status = Status()

    sql_status.weibo_origin_id = status['id']
    sql_status.url = "http://weibo.com/" + str(user['id']) + "/" + id2mid(status['idstr'])
    sql_status.weibo_user_screen_name = user['screen_name']
    sql_status.keyword_id = keyword_id
    sql_status.info_source_id = SEARCH_INFO_SOURCE_ID
    sql_status.text = status['text']
    sql_status.created_at = weibo_date_str_to_datetime(status['created_at'])
    sql_status.repost_count = status['reposts_count']
    sql_status.comment_count = status['comments_count']
    sql_status.attitude_count = status['attitudes_count']
    # ``in`` replaces dict.has_key(), which no longer exists on Python 3.
    sql_status.retweeted = 'retweeted_status' in status

    if 'thumbnail_pic' in status:
        sql_status.with_pic = True
        sql_status.pic_address = status['thumbnail_pic']
    else:
        sql_status.with_pic = False

    sql_status.geo_info_province = location['province']
    sql_status.geo_info_city = location['city']

    sql_status.user = sql_user  # relationship to the author row

    session.merge(sql_status)

    session.flush()
    session.commit()

    # Query the row again after the commit and use that instance's id.
    sql_status = session.query(Status).filter(
        Status.weibo_origin_id == status['id'],
        Status.info_source_id == SEARCH_INFO_SOURCE_ID).first()

    if sql_status:
        store_category('weibo', str(sql_status.id))