def main():
    try:
        search_for_google_news_posts(OPPONENT_KEYWORDS,
                                     OPPONENT_GOOGLE_NEWS_INFO_SOURCE_ID)
    except Exception as e:
        news_logger.exception(e)
        store_error(OPPONENT_GOOGLE_NEWS_INFO_SOURCE_ID)
Example #2
def main():
    try:
        search_for_tianya_bbs_posts()
        search_for_tianya_bbs_315_posts()
    except Exception as e:
        store_error(TIANYA_INFO_SOURCE_ID)
        bbs_logger.exception(e) 
Example #3
def main():
    try:
        search_for_new_statuses()
        refresh_monitoring_status()
    except Exception as e:
        store_error(QQ_WEIBO_INFO_SOURCE_ID)
        weibo_logger.exception(e)
Example #4
def search_for_baidu_zhidao_posts():
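    # Query Baidu Zhidao's search page once per tracked keyword and store each
    # result via store_by_wiki_url(); failures are logged against the source id.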
    previous_real_count = session.query(WikiPost).filter(WikiPost.info_source_id==BAIDU_ZHIDAO_INFO_SOURCE_ID).count()
    count = 0
    last_count = 0

    sql_job = Job()
    sql_job.previous_executed = datetime.now()
    sql_job.info_source_id = BAIDU_ZHIDAO_INFO_SOURCE_ID

    for keyword in KEYWORDS:
        data = {'word': keyword.str.encode('utf8'),
                'ie': 'utf-8',
                'sort': 1,
                'lm': 0,
                'date': 2,
                'oa': 0,
                'sites': -1,
               }
        
        url = "http://zhidao.baidu.com/search?" + urllib.urlencode(data)
        # print url

        headers = {
            'Host': 'zhidao.baidu.com',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/536.26.17 (KHTML, like Gecko) Version/6.0.2 Safari/536.26.17',
        }
    
        req = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(req)
        content = response.read()
    
        soup = BeautifulSoup(content)


        posts = soup.findAll('dl', attrs={'class': "dl"})
        count = count + len(posts)

        #print count

        for post in posts:
            try:
                url = post.dt.a['href']
                comment_count = 0
                # The answer-count link text reads like "12个回答" ("12 answers");
                # take the digits before the suffix.
                if post.find('a', attrs={'log': "pos:ans"}):
                    comment_count_str = post.find('a', attrs={'log': "pos:ans"}).text
                    tail = comment_count_str.find(u'个回答')
                    comment_count = int(comment_count_str[:tail])

                store_by_wiki_url(url, comment_count, keyword.id)
            except Exception as e:
                store_error(BAIDU_ZHIDAO_INFO_SOURCE_ID)
                wiki_logger.exception(e) 


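        # Throttle: once ~30 new posts have accumulated since the last pause,
        # sleep 30 minutes before moving on to the next keyword.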
        if count - last_count > 30:
            time.sleep(1800)
            last_count = count
Example #5
def main():
    try:
        start_time = datetime.now()
        search_for_baidu_tieba_posts()
        end_time = datetime.now()
        consume_time = end_time - start_time
        bbs_logger.info("baidu tieba consume time: " + str(consume_time))
    except Exception as e:
        store_error(BAIDU_TIEBA_INFO_SOURCE_ID)
        bbs_logger.exception(e) 
Example #6
def main():
    try:
        start_time = datetime.now()
        do_login('yoyo_worms', 'bI9eK4NF')
        search_for_baidu_zhidao_posts()
        end_time = datetime.now()
        consume_time = end_time - start_time
        wiki_logger.info("baidu zhidao consume time: " + str(consume_time))
    except Exception as e:
        store_error(BAIDU_ZHIDAO_INFO_SOURCE_ID)
        wiki_logger.exception(e) 
Example #7
def main():
    try:
        start_time = datetime.now()
        search_for_soso_wenwen_posts()
        end_time = datetime.now()
        consume_time = end_time - start_time
        wiki_logger.info("soso wenwen consume time: " + str(consume_time))
    except Exception as e:
        store_error(SOSO_WENWEN_INFO_SOURCE_ID)
        wiki_logger.exception(e) 
Example #8
def update_by_weibo_id(id, origin_id):
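    # Refresh a stored status's repost/comment counters from the QQ Weibo API,
    # then re-run category tagging for it.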
    try:
        api_status = api.tweet.show(origin_id)
        sql_status = session.query(Status).get(id)

        sql_status.repost_count = api_status.count
        sql_status.comment_count = api_status.mcount

        session.commit()

        store_category('weibo', str(sql_status.id))

    except Exception as e:  # APIError
        store_error(QQ_WEIBO_INFO_SOURCE_ID)
        weibo_logger.exception(e)
Example #9
def main():
    try:
        search_for_tudou_video_posts()
    except Exception as e:
        store_error(TUDOU_INFO_SOURCE_ID)
        video_logger.exception(e)
Example #10
def search_for_tudou_video_posts():
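    # Crawl Soku (Tudou's video search) for each tracked keyword, walking up to
    # 10 date-sorted result pages, and store every post that passes the
    # keyword re-check.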
    previous_real_count = session.query(VideoPost).filter(
        VideoPost.info_source_id == TUDOU_INFO_SOURCE_ID).count()

    count = 0
    sql_job = Job()
    sql_job.previous_executed = datetime.now()
    sql_job.info_source_id = TUDOU_INFO_SOURCE_ID

    html_parser = HTMLParser.HTMLParser()  # for unescaping HTML entities in titles

    for keyword in KEYWORDS:
        page = 1
        finished = False
        while not finished and page <= 10:
            url = ("http://www.soku.com/t/nisearch/"
                   + urllib.quote_plus(keyword.str.encode('utf8'))
                   + '/_cid__sort_date_display_album_time_0_page_'
                   + str(page) + '?sfilter=1')
            page = page + 1
            #print url
            headers = {
                'Host': 'www.soku.com',
                'Referer': 'http://www.soku.com/search_video/',
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/536.26.17 (KHTML, like Gecko) Version/6.0.2 Safari/536.26.17',
            }

            req = urllib2.Request(url, headers=headers)
            response = urllib2.urlopen(req)
            content = response.read()

            soup = BeautifulSoup(content)

            posts = soup.findAll('div', attrs={'class': "v"})
            count = count + len(posts)
            if len(posts) == 0:
                finished = True
                break

            for post in posts:
                try:
                    video_user_screen_name = post.find(
                        'span', attrs={'class': "username"}).text
                    deltatime = post.find('span', attrs={'class': "pub"}).text
                    v_meta_title = post.find(
                        'div', attrs={'class': "v-meta-title"})
                    title = v_meta_title.a['title']
                    title = html_parser.unescape(title)
                    url = v_meta_title.a['href']

                    try:
                        v_meta_entry = post.find(
                            'div', attrs={'class': "v-meta-entry"})
                        v_meta_datas = v_meta_entry.findAll(
                            'div', attrs={'class': "v-meta-data"})
                        # The play-count cell has a label before a colon and
                        # comma-separated digits after it.
                        playcount = v_meta_datas[1].text
                        playcount = playcount[playcount.find(":") + 1:]
                        playcount = playcount.replace(',', '')
                        playcount = int(playcount)
                    except Exception:
                        playcount = 0
                    # Re-check the title against the keyword (second-pass filter)
                    if not recheck_title(keyword, title):
                        continue

                    try:
                        created_at = convertTime(deltatime)
                        if created_at == -1:
                            continue
                    except Exception:
                        created_at = datetime.now()
                    #print video_user_screen_name,created_at,title,url,playcount
                    store_by_tudou_video_url(url, keyword.id, title,
                                             video_user_screen_name,
                                             created_at, playcount)
                    time.sleep(5)

                except Exception as e:
                    store_error(TUDOU_INFO_SOURCE_ID)
                    video_logger.exception(e)
                    time.sleep(5)
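
Both video crawlers gate results through recheck_title() before storing; that helper is not shown in these examples. A minimal sketch, assuming it simply checks whether the keyword text occurs in the title (the keyword.str attribute is inferred from the calls above):

def recheck_title(keyword, title):
    # Hypothetical sketch: keep a post only if the keyword string
    # actually appears in the result title (case-insensitive).
    return keyword.str.lower() in title.lower()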
Example #11
def main():
    try:
        search_for_youku_video_posts()
    except Exception as e:
        store_error(YOUKU_INFO_SOURCE_ID)
        video_logger.exception(e)
Example #12
def main():
    try:
        search_for_baidu_news_posts(KEYWORDS, BAIDU_NEWS_INFO_SOURCE_ID)
    except Exception as e:
        news_logger.exception(e)
        store_error(BAIDU_NEWS_INFO_SOURCE_ID)
Example #13
def main():
    try:
        search_for_sina_video_posts()
    except Exception as e:
        store_error(SINA_VIDEO_INFO_SOURCE_ID)
        video_logger.exception(e)
Example #14
def search_for_sina_video_posts():
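    # Crawl Sina Video search for each tracked keyword (up to 10 pages) and
    # store posts that pass the keyword/time re-check.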
    previous_real_count = session.query(VideoPost).filter(
        VideoPost.info_source_id == SINA_VIDEO_INFO_SOURCE_ID).count()

    count = 0
    sql_job = Job()
    sql_job.previous_executed = datetime.now()
    sql_job.info_source_id = SINA_VIDEO_INFO_SOURCE_ID

    for keyword in KEYWORDS:
        page = 1
        finished = False
        while not finished and page <= 10:
            url = ("http://video.sina.com.cn/search/index.php?k="
                   + urllib.quote_plus(keyword.str.encode('utf8'))
                   + "&m1=a&m3=a2&page=" + str(page))
            page = page + 1
            #print url

            headers = {
                'Host': 'video.sina.com.cn',
                'Referer': 'http://video.sina.com.cn/search/index.php?',
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/536.26.17 (KHTML, like Gecko) Version/6.0.2 Safari/536.26.17',
            }

            req = urllib2.Request(url, headers=headers)
            response = urllib2.urlopen(req)
            content = response.read()

            soup = BeautifulSoup(content)
            video_list = soup.find('div', id="contentH")
            if video_list is None:
                finished = True
                break
            divs = video_list.findAll('div')
            if len(divs) == 0:
                finished = True
                break
            tr_arr = video_list.findAll('tr')
            temp_arr = []

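            # Result rows come in pairs: an even row holds the title/URL cells,
            # the odd row that follows holds uploader, post time and play count.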
            for i, tr in enumerate(tr_arr):
                try:
                    if i % 2 == 0:
                        temp_arr = []
                        td_divs = tr.findAll('div', attrs={'class': "v_Info"})
                        for j, td_div in enumerate(td_divs):
                            name_div = td_div.find(
                                'div', attrs={'class': 'name'})
                            a_tag = name_div.findAll('a')[1]
                            video_url = a_tag['href']
                            video_title = a_tag['title']
                            temp_arr.append({
                                'video_url': video_url,
                                'video_title': video_title
                            })
                            #print video_url,video_title
                    else:
                        #print tr.prettify()
                        td_divs = tr.findAll('div', attrs={'class': "v_Info"})
                        for j, td_div in enumerate(td_divs):
                            li_arr = td_div.findAll('li')
                            try:
                                video_user = li_arr[0].a['title']
                            except Exception:
                                video_user = li_arr[0].a.text
                            video_createAt = li_arr[1].text
                            created_at = convertTime(video_createAt)

                            video_url = temp_arr[j]['video_url']
                            video_title = temp_arr[j]['video_title']

                            try:
                                # Play-count text starts with a 3-character
                                # label; drop it and strip comma separators.
                                play_count = li_arr[2].text
                                play_count = play_count[3:]
                                play_count = play_count.replace(',', '')
                                play_count = int(play_count)
                            except Exception:
                                play_count = 0
                            #print "###"+video_title
                            # Second-pass filter on keyword and post time
                            if created_at != -1 and recheck_title(
                                    keyword, video_title):
                                #print video_title,video_url,video_user,created_at,play_count
                                store_by_sina_video_url(
                                    video_url, keyword.id, video_title,
                                    video_user, created_at, play_count)

                except Exception as e:
                    store_error(SINA_VIDEO_INFO_SOURCE_ID)
                    video_logger.exception(e)
                    time.sleep(5)

            time.sleep(5)
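
convertTime() is also assumed: judging from its call sites it converts the page's timestamp text into a datetime and returns -1 when the post falls outside the monitored window. A minimal sketch under those assumptions (the date format and the 30-day window are illustrative, not taken from the source):

from datetime import datetime, timedelta

def convertTime(timestr):
    # Hypothetical sketch: parse "YYYY-MM-DD"-style text and flag anything
    # older than 30 days as out of range (-1), as the callers expect.
    created_at = datetime.strptime(timestr.strip()[:10], '%Y-%m-%d')
    if datetime.now() - created_at > timedelta(days=30):
        return -1
    return created_at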
Example #15
def main():
    try:
        search_for_sina_blog_posts()
    except Exception as e:
        store_error(SINA_BLOG_INFO_SOURCE_ID)
        blog_logger.exception(e)
Example #16
def main():
    try:
        search_for_youku_global_video_posts()
    except Exception as e:
        store_error(ALL_VIDEO_INFO_SOURCE_ID)
        video_logger.exception(e)