def fetch_list(self, cat_id, page=1):
    """Scrape one page of a category's video listing.

    Downloads ``self.video_list_url % (cat_id, page)``, parses it and
    collects every ``<td class="listOff">`` cell that links to a video.
    Each video's thumbnail is downloaded to
    ``<base_data_path>/<cat_id>/<id>.tbn`` unless that file already
    exists.

    Args:
        cat_id: Category identifier; substituted into the listing URL and
            used as the sub-directory name for cached thumbnails.
        page: Listing page number substituted into the URL.

    Returns:
        A dict keyed by the video id string taken from the link's href.
        Each value is a dict with the keys ``id`` (int), ``date_string``
        ("day-month-year"), ``posted_date`` (formatted with
        ``self.date_format``), ``year``, ``title``, ``description`` and
        ``image_url`` (the local thumbnail path). Empty when the page has
        no usable rows.
    """
    response = urllib2.urlopen(self.video_list_url % (cat_id, page))
    try:
        contents = response.read()
    finally:
        # Release the connection even if the read fails.
        response.close()

    soup = BeautifulSoup(contents)
    category_dir = os.path.join(self.base_data_path, cat_id)
    vids = {}

    for cell in soup.findAll('td', 'listOff'):
        primary_link = cell.find('a', 'vodlink')
        if primary_link is None:
            # Row without a video link: nothing to index.
            continue

        vid_id = primary_link['href'].replace('./', '')
        if vid_id.startswith('javascript'):
            # javascript: hrefs do not carry a video id.
            continue

        # The posting date lives in a sibling "sect" cell of the same row.
        date_cell = cell.parent.find('td', 'sect')
        if date_cell is None:
            continue
        posted = re.search(r'Posted: (\d+) (\d+)/(\d+)<',
                           date_cell.renderContents())
        if posted is None:
            # One malformed row should not abort the whole listing.
            continue
        year, month, day = posted.groups()

        date_string = "%s-%s-%s" % (day, month, year)
        posted_date = datetime.date(
            int(year), int(month), int(day)).strftime(self.date_format)

        image_url = 'http://www.gomtv.net' + cell.find('img')['src']
        local_image_path = os.path.join(category_dir, str(vid_id) + '.tbn')
        if not os.path.exists(category_dir):
            os.makedirs(category_dir)
        if not os.path.isfile(local_image_path):
            # Only fetch thumbnails that are not cached yet.
            urllib.urlretrieve(image_url, local_image_path)

        vids[vid_id] = {
            'id': int(vid_id),
            'date_string': date_string,
            'posted_date': posted_date,
            'year': year,
            'title': str(primary_link.string),
            'description': str(cell.find('div', 'vodinfo').renderContents()),
            'image_url': local_image_path,
        }

    return vids
def fetch_page_count(self, cat_id):
    """Return the number of the last listing page for a category.

    Downloads page 1 of ``self.video_list_url`` for ``cat_id`` and looks
    inside the ``<table id="bbsnum">`` element for the "Last >>" link,
    whose ``page=`` query value is the page count.

    Args:
        cat_id: Category identifier substituted into the listing URL.

    Returns:
        The page number captured from the "Last >>" link, as the string
        of digits found in the markup. Returns ``0`` when the pagination
        table or the "Last >>" link is not present.
    """
    response = urllib2.urlopen(self.video_list_url % (cat_id, 1))
    try:
        contents = response.read()
    finally:
        # Release the connection even if the read fails.
        response.close()

    link_table = BeautifulSoup(contents).find('table', {'id': 'bbsnum'})
    if link_table is None:
        return 0

    last_link = re.search(
        r"<a href=\"\./\?page=(\d+)&[^>]*>Last >></a>",
        link_table.renderContents())
    if last_link is None:
        # No "Last >>" link in the pagination table.
        return 0

    return last_link.group(1)
def fetch_video(self, cat_id, vid_id):
    """Resolve the download details for a single video page.

    Downloads ``self.video_url % (cat_id, vid_id)`` and searches the
    rendered page for a ``.swf?link=<digits>`` reference. When one is
    found, the category's data directory is created if missing.

    Args:
        cat_id: Category identifier; substituted into the video URL and
            used as the sub-directory name under ``self.base_data_path``.
        vid_id: Video identifier; substituted into the video URL and used
            as the local file's base name.

    Returns:
        ``None`` when the page contains no ``.swf?link=`` reference.
        Otherwise a dict with ``title`` (text of the ``<h3>`` inside
        ``<div id="bbsDetail">``), ``file_url`` (the .flv URL built from
        the captured link id) and ``local_vid_path``
        (``<base_data_path>/<cat_id>/<vid_id>.flv``).
    """
    response = urllib2.urlopen(self.video_url % (cat_id, vid_id))
    try:
        contents = response.read()
    finally:
        # Release the connection even if the read fails.
        response.close()

    soup = BeautifulSoup(contents)
    player_link = re.search(r'\.swf\?link=(\d+)', soup.renderContents())
    if not player_link:
        return None

    file_id = player_link.group(1)
    vid = {}
    vid['title'] = soup.find('div', {'id': 'bbsDetail'}).h3.string
    vid['file_url'] = 'http://flvdn.gomtv.net/viewer/%s.flv' % file_id

    category_dir = os.path.join(self.base_data_path, cat_id)
    if not os.path.exists(category_dir):
        os.makedirs(category_dir)
    vid['local_vid_path'] = os.path.join(category_dir, str(vid_id) + '.flv')

    return vid
def fetch(self):
    """Scrape the category (show) index.

    Downloads ``self.category_url`` and walks every ``<dl>`` found inside
    the ``<div id="Channels">`` elements. Entries whose ``<dt>`` has no
    link are ignored.

    Returns:
        A dict keyed by category id. The id is the show link's href with
        its surrounding slashes removed. Each value is a dict with the
        keys ``id``, ``title`` (the link text), ``description`` (contents
        of ``<dd class="txt">``, or ``''`` when absent) and ``image_url``
        (site-relative paths are prefixed with ``http://www.gomtv.net``).
    """
    response = urllib2.urlopen(self.category_url)
    try:
        contents = response.read()
    finally:
        # Release the connection even if the read fails.
        response.close()

    soup = BeautifulSoup(contents)
    cats = {}

    for channel in soup.findAll('div', {'id': 'Channels'}):
        for show in channel.findAll('dl'):
            heading = show.dt
            if heading is None or not heading.a:
                # No <dt> link means there is nothing to identify the show by.
                continue
            link = heading.a

            cat_id = re.sub(r'/(.*)/', r'\1', link['href'])

            image_url = show.find('dd', 'img').img['src']
            if image_url.startswith('/'):
                # Site-relative path: make it absolute.
                image_url = 'http://www.gomtv.net' + image_url

            text_block = show.find('dd', 'txt')
            description = text_block.renderContents() if text_block else ''

            cats[cat_id] = {
                'id': cat_id,
                'title': link.string,
                'description': description,
                'image_url': image_url,
            }

    return cats