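#Scraping flow: display_subjects (mode '0') lists the subjects, display_topics
#(mode '1') lists the topics for a subject, display_courses (mode '2') lists
#the courses and lectures for a topic, display_lectures (mode '3') lists the
#playable lectures of a course, and display_allresults (mode '4') flattens the
#results of every topic for a subject into a single listing.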
import re

#BeautifulSoup 3 provides the parseOnlyThese/SoupStrainer API used below
from BeautifulSoup import BeautifulSoup as BS, SoupStrainer as SS

#The remaining names (XBMCVideoPlugin, DialogProgress, urlread, async_urlread,
#IGNORE_LIST) come from the add-on's own support modules; the exact import
#path depends on how the add-on is laid out.


class AcademicEarth(XBMCVideoPlugin):

    base_url = 'http://academicearth.org'
    subjects_url = '%s/subjects' % base_url

    def display_subjects(self, url):
        """Takes a url and displays subjects."""
        html = urlread(url)
        div_tags = BS(html,
                      parseOnlyThese=SS('div', {'class': 'institution-list'}))
        #Build the list of subjects. Sometimes there is more than one div_tag,
        #so loop through each div_tag, and then for each div_tag, loop through
        #all the <a> tags and parse the subject information.
        dirs = [{'name': a.text,
                 'url': self._urljoin(a['href']),
                 'mode': '1'} for div in div_tags for a in div('a')]
        #Filter out the paid course subjects
        dirs = [d for d in dirs if d['name'] not in IGNORE_LIST]
        self.add_dirs(dirs)

    def display_topics(self, url):
        """Takes a subject url and displays a list of all topics on the page"""
        html = urlread(url)
        #get the div which contains all of the topic <a> tags
        div_topics = BS(html,
                        parseOnlyThese=SS('div', {'class': 'results-side'}))
        #create the list of dirs by parsing all the <a> tags in the div
        dirs = [{'name': a.text,
                 'url': self._urljoin(a['href']),
                 'mode': '2'} for a in div_topics('a')]
        #filter out paid courses and the 'All' listing, since we build our own
        dirs = [d for d in dirs
                if not d['name'].startswith('Online')
                and 'Courses for Credit' not in d['name']
                and not d['name'].startswith('All')]
        #make the first choice on the list 'View All'
        dirs.insert(0, {'name': self.getString(30100),
                        'url': url,
                        'mode': '4'})
        self.add_dirs(dirs)

    def display_courses(self, url):
        """Takes a topic url and displays all courses"""
        html = urlread(url)
        courses, lectures = self._get_courses_lectures(html)
        #add listings to the UI, courses first, lectures at the bottom
        self.add_dirs(courses, end=False)
        self.add_videos(lectures)

    def display_lectures(self, url):
        """Displays the lectures for a given course url"""
        html = urlread(url)
        #get the div which contains all of the <li> lecture tags
        div_tag = BS(html,
                     parseOnlyThese=SS('div', {'class': 'results-list'}))
        #parse the name, url, description and thumbnail for each lecture
        dirs = [{'name': li.h4.a.text,
                 'htmlurl': self._urljoin(li.h4.a['href']),
                 'info': {'plot': li.p.text, 'title': li.h4.a.text},
                 'tn': self._urljoin(
                     li.find('img', {'class': 'thumb-144'})['src'])}
                for li in div_tag('li')]
        #for each dir, download the lecture's html page and parse the video url
        self.dp = DialogProgress(self.getString(30000),
                                 line1=self.getString(30101),
                                 num_steps=len(dirs))
        urls = [d['htmlurl'] for d in dirs]
        responses = async_urlread(urls, self.dp)
        for d, response in zip(dirs, responses):
            d['url'] = self._get_video_url(response)
        #filter out lectures that don't have urls, currently a fix for a chem
        #course which contains a bad link to a lecture
        dirs = [d for d in dirs if d['url'] is not None]
        self.dp.update(100)
        self.dp.close()
        self.add_videos(dirs)

    def display_allresults(self, url):
        """Displays all results for a given url. Used on a subject page to
        list all video results without having to drill down into each
        category."""
        html = urlread(url)
        #get the div which contains all of the topic <a> tags
        div_topics = BS(html,
                        parseOnlyThese=SS('div', {'class': 'results-side'}))
        #create a list of urls for all topics
        topic_urls = [self._urljoin(a['href']) for a in div_topics('a')
                      if not a.text.startswith('Online')
                      and 'Credit' not in a.text
                      and not a.text.startswith('All')]
        self.dp = DialogProgress(self.getString(30000),
                                 line1=self.getString(30102),
                                 num_steps=2 * len(topic_urls))
        topic_htmls = async_urlread(topic_urls, self.dp)
        courses, lectures = self._get_courses_lectures(topic_htmls)
        self.dp.update(100)
        self.dp.close()
        courses = sorted(courses, key=lambda c: c['name'])
        lectures = sorted(lectures, key=lambda l: l['name'])
        self.add_dirs(courses, end=False)
        self.add_videos(lectures)

    def _get_courses_lectures(self, htmls):
        """Returns a tuple of lists: (courses_list, lectures_list). Takes the
        html source(s) of a topic page and parses all results by visiting
        each page of results."""
        if isinstance(htmls, str):
            htmls = [htmls]
        #Each topic page displays only 12 results to a page. So to get all
        #results for a topic, parse the result page urls from the topic page,
        #then download each of the extra pages of results, then parse the
        #video results.
        pagination_urls = [url for html in htmls
                           for url in self._get_pagination_urls(html)]
        #Download every pagination page. If a dialog progress box exists,
        #update the step for each increment. Allocate 50% of the bar for
        #downloading the pagination urls. The other 50% is allocated to
        #downloading all of the topic pages when choosing 'View All' for a
        #subject.
        if self.dp and len(pagination_urls) != 0:
            self.dp.step = int(50 / len(pagination_urls))
            page_htmls = async_urlread(pagination_urls, self.dp)
        else:
            page_htmls = async_urlread(pagination_urls)
        #extend the list of pagination htmls with the given htmls
        page_htmls.extend(htmls)
        #get a complete list of video results by parsing results from all pages
        results = self._get_video_results(page_htmls)
        #split courses and lectures so they can be displayed in groups
        courses = [r for r in results if '/courses/' in r['url']]
        lectures = [r for r in results if '/lectures/' in r['url']]
        #add the mode argument to courses; lectures don't need it since they
        #will contain a direct url to the video
        for c in courses:
            c['mode'] = 3
        #For each lecture, download its page and parse the actual video url.
        #This ensures that the display link plays a video and doesn't go to
        #another level of directory listings.
        for l in lectures:
            l['url'] = self._get_video_url(urlread(l['url']))
            l['name'] = self.getString(30103) + l['name']
        #filter out lectures with no video url (pages where the regex in
        #_get_video_url finds no match)
        lectures = [l for l in lectures if l['url'] is not None]
        return courses, lectures

    def _get_video_url(self, html):
        """Takes the html for a video page and returns the url of the video"""
        m = re.search(r'flashVars.flvURL = "(.+?)"', html)
        if m:
            return m.group(1)
        return None

    def _get_pagination_urls(self, html):
        """Returns a list of urls for the other results pages in the given
        html."""
        #get the pagination <ul> tags
        ul_tags = BS(html, parseOnlyThese=SS('ul', {'class': 'pagination'}))
        #use the first pagination <ul> tag since both <ul>s are identical
        ul = ul_tags('ul', limit=1)[0]
        #return the complete url for each link in the <ul>. Ignore the last
        #url in the list because it is the 'next page' link, which is already
        #included.
        return [self._urljoin(a['href']) for a in ul('a')[:-1]]

    def _get_video_results(self, htmls):
        """Takes html source(s) and returns a list of video results."""
        video_results = []
        #if htmls is a single html page, convert it to a list with a single
        #item so the loop below works in both cases
        if isinstance(htmls, str):
            htmls = [htmls]
        for html in htmls:
            div_results = BS(html,
                             parseOnlyThese=SS('div',
                                               {'class': 'video-results'}))
            #filter out the empty 'break' <li> tags
            lis = [li for li in div_results('li')
                   if li.get('class') != 'break']
            #build the list of results, a dict for each result
            res = [{'name': li.h3.text,
                    'url': self._urljoin(li.a['href']),
                    'tn': self._urljoin(
                        li.find('img', {'class': 'thumb-144'})['src'])}
                   for li in lis]
            video_results.extend(res)
        return video_results

    def run(self, mode, url):
        #the caller must pass default values for mode and url: mode defaults
        #to '0' and url to ''
        mode_functions = {'0': self.display_subjects,
                          '1': self.display_topics,
                          '2': self.display_courses,
                          '3': self.display_lectures,
                          '4': self.display_allresults}
        mode_functions[mode](url)
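
#Minimal usage sketch, assuming the add-on's entry-point script parses the
#plugin url into a (mode, url) pair before calling run(); the bare
#constructor call below is hypothetical, since XBMCVideoPlugin's __init__
#signature is not shown here.
#
#    plugin = AcademicEarth()
#    plugin.run('0', '')    #default mode '0' and empty url show the subjects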