def main():
    """Entry point for the Tudou crawl: run the search and never propagate.

    Any failure is recorded against the Tudou info source and logged with a
    full traceback so the scheduler that invokes main() keeps running.
    """
    try:
        search_for_tudou_video_posts()
    except Exception as e:  # was `except Exception, e` -- Py2.6+/Py3-compatible form
        store_error(TUDOU_INFO_SOURCE_ID)
        video_logger.exception(e)
def search_for_tudou_video_posts():
    """Crawl soku.com keyword searches for Tudou videos and store each hit.

    For every keyword in KEYWORDS, walks up to 10 result pages. Each result
    entry yields (author, publish delta-time, title, url, play count) which is
    passed to store_by_tudou_video_url(). A per-post failure is recorded via
    store_error() and logged, without aborting the rest of the crawl.
    """
    # NOTE(review): previous_real_count, count and sql_job are computed/built
    # but never persisted or read in this function -- kept for behavioural
    # parity; verify whether a session.add()/commit() was lost upstream.
    previous_real_count = session.query(VideoPost).filter(
        VideoPost.info_source_id == TUDOU_INFO_SOURCE_ID).count()
    count = 0
    sql_job = Job()
    sql_job.previous_executed = datetime.now()
    sql_job.info_source_id = TUDOU_INFO_SOURCE_ID
    html_parser = HTMLParser.HTMLParser()
    for keyword in KEYWORDS:
        page = 1
        finished = False
        while not finished and page <= 10:
            url = "http://www.soku.com/t/nisearch/" + urllib.quote_plus(
                keyword.str.encode('utf8')
            ) + '/_cid__sort_date_display_album_time_0_page_' + str(
                page) + '?sfilter=1'
            page = page + 1
            headers = {
                'Host': 'www.soku.com',
                'Referer': 'http://www.soku.com/search_video/',
                'User-Agent':
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/536.26.17 (KHTML, like Gecko) Version/6.0.2 Safari/536.26.17',
            }
            req = urllib2.Request(url, headers=headers)
            response = urllib2.urlopen(req)
            content = response.read()
            soup = BeautifulSoup(content)
            posts = soup.findAll('div', attrs={'class': "v"})
            count = count + len(posts)
            if len(posts) == 0:
                # An empty page means we walked past the last result page.
                finished = True
                break
            for post in posts:
                try:
                    video_user_screen_name = post.find(
                        'span', attrs={'class': "username"}).text
                    deltatime = post.find('span', attrs={'class': "pub"}).text
                    v_meta_title = post.find(
                        'div', attrs={'class': "v-meta-title"})
                    title = v_meta_title.a['title']
                    # Titles arrive HTML-escaped; unescape entities once.
                    title = html_parser.unescape(title)
                    url = v_meta_title.a['href']
                    try:
                        v_meta_entry = post.find(
                            'div', attrs={'class': "v-meta-entry"})
                        v_meta_datas = v_meta_entry.findAll(
                            'div', attrs={'class': "v-meta-data"})
                        # Play count is rendered like "...: 1,234".
                        playcount = v_meta_datas[1].text
                        playcount = playcount[playcount.find(":") + 1:]
                        playcount = playcount.replace(',', '')
                        playcount = int(playcount)
                    except Exception:  # was a bare except; odd markup => 0
                        playcount = 0
                    # 对关键词进行重新过滤 -- re-filter: soku search is fuzzy.
                    if not recheck_title(keyword, title):
                        continue
                    try:
                        created_at = convertTime(deltatime)
                        if created_at == -1:
                            # convertTime's sentinel for "outside wanted range".
                            continue
                    except Exception:  # was a bare except; unparsable => now
                        created_at = datetime.now()
                    store_by_tudou_video_url(url, keyword.id, title,
                                             video_user_screen_name,
                                             created_at, playcount)
                    time.sleep(5)
                except Exception as e:
                    store_error(TUDOU_INFO_SOURCE_ID)
                    video_logger.exception(e)
            # NOTE(review): the original (collapsed) source made this sleep's
            # placement ambiguous; throttling once per page is assumed here.
            time.sleep(5)
def main():
    """Entry point for the Youku crawl: run the search and never propagate.

    Any failure is recorded against the Youku info source and logged with a
    full traceback so the scheduler that invokes main() keeps running.
    """
    try:
        search_for_youku_video_posts()
    except Exception as e:  # was `except Exception, e` -- Py2.6+/Py3-compatible form
        store_error(YOUKU_INFO_SOURCE_ID)
        video_logger.exception(e)
def main():
    """Entry point for the Sina video crawl: run the search and never propagate.

    Any failure is recorded against the Sina video info source and logged with
    a full traceback so the scheduler that invokes main() keeps running.
    """
    try:
        search_for_sina_video_posts()
    except Exception as e:  # was `except Exception, e` -- Py2.6+/Py3-compatible form
        store_error(SINA_VIDEO_INFO_SOURCE_ID)
        video_logger.exception(e)
def search_for_sina_video_posts():
    """Crawl video.sina.com.cn keyword searches and store matching videos.

    For every keyword in KEYWORDS, walks up to 10 result pages. The result
    table alternates rows: even rows carry (url, title) pairs, odd rows carry
    (uploader, publish time, play count) for the same columns, so even rows
    are buffered in temp_arr and joined with the following odd row by column
    index. Matching entries go to store_by_sina_video_url(); per-row failures
    are recorded via store_error() and logged without aborting the crawl.
    """
    # NOTE(review): previous_real_count, count and sql_job are computed/built
    # but never persisted or read in this function -- kept for behavioural
    # parity; verify whether a session.add()/commit() was lost upstream.
    previous_real_count = session.query(VideoPost).filter(
        VideoPost.info_source_id == SINA_VIDEO_INFO_SOURCE_ID).count()
    count = 0
    sql_job = Job()
    sql_job.previous_executed = datetime.now()
    sql_job.info_source_id = SINA_VIDEO_INFO_SOURCE_ID
    for keyword in KEYWORDS:
        page = 1
        finished = False
        while not finished and page <= 10:
            url = "http://video.sina.com.cn/search/index.php?k=" + urllib.quote_plus(
                keyword.str.encode('utf8')) + "&m1=a&m3=a2&page=" + str(page)
            page = page + 1
            headers = {
                'Host': 'video.sina.com.cn',
                'Referer': 'http://video.sina.com.cn/search/index.php?',
                'User-Agent':
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/536.26.17 (KHTML, like Gecko) Version/6.0.2 Safari/536.26.17',
            }
            req = urllib2.Request(url, headers=headers)
            response = urllib2.urlopen(req)
            content = response.read()
            soup = BeautifulSoup(content)
            video_list = soup.find('div', id="contentH")
            if video_list is None:  # was `== None`
                finished = True
                break
            divs = video_list.findAll('div')
            if len(divs) == 0:
                # An empty container means we walked past the last result page.
                finished = True
                break
            tr_arr = video_list.findAll('tr')
            temp_arr = []
            for i, tr in enumerate(tr_arr):
                try:
                    if i % 2 == 0:
                        # Even row: buffer (url, title) per column for the
                        # matching odd row below.
                        temp_arr = []
                        td_divs = tr.findAll('div', attrs={'class': "v_Info"})
                        for j, td_div in enumerate(td_divs):
                            name_div = td_div.find('div',
                                                   attrs={'class': 'name'})
                            a_tag = name_div.findAll('a')[1]
                            video_url = a_tag['href']
                            video_title = a_tag['title']
                            temp_arr.append({
                                'video_url': video_url,
                                'video_title': video_title
                            })
                    else:
                        # Odd row: uploader / publish time / play count,
                        # column-aligned with the buffered even row.
                        td_divs = tr.findAll('div', attrs={'class': "v_Info"})
                        for j, td_div in enumerate(td_divs):
                            li_arr = td_div.findAll('li')
                            try:
                                video_user = li_arr[0].a['title']
                            except Exception:  # was bare except; no title attr
                                video_user = li_arr[0].a.text
                            video_createAt = li_arr[1].text
                            created_at = convertTime(video_createAt)
                            video_url = temp_arr[j]['video_url']
                            video_title = temp_arr[j]['video_title']
                            try:
                                # Play count text has a 3-char prefix label.
                                play_count = li_arr[2].text
                                play_count = play_count[3:]
                                play_count = play_count.replace(',', '')
                                play_count = int(play_count)
                            except Exception:  # was bare except; no count => 0
                                play_count = 0
                            # 二次过滤关键词和时间 -- second-pass keyword/time
                            # filter; -1 is convertTime's out-of-range sentinel.
                            if created_at != -1 and recheck_title(
                                    keyword, video_title):
                                store_by_sina_video_url(
                                    video_url, keyword.id, video_title,
                                    video_user, created_at, play_count)
                except Exception as e:
                    store_error(SINA_VIDEO_INFO_SOURCE_ID)
                    video_logger.exception(e)
                    time.sleep(5)
            # NOTE(review): the original (collapsed) source made the two
            # sleeps' placement ambiguous; back-off in the except handler plus
            # one per-page throttle is assumed here.
            time.sleep(5)
def main():
    """Entry point for the global Youku crawl: run the search, never propagate.

    Any failure is recorded against the all-video info source and logged with
    a full traceback so the scheduler that invokes main() keeps running.
    """
    try:
        search_for_youku_global_video_posts()
    except Exception as e:  # was `except Exception, e` -- Py2.6+/Py3-compatible form
        store_error(ALL_VIDEO_INFO_SOURCE_ID)
        video_logger.exception(e)