def getVideoDownloadLink(self, url):
    """Return a dict with the video title and its download link.

    The page at ``url`` is searched for the download button: first an
    anchor with the ``bouton-telecharger`` class, then (older layout) the
    anchor wrapping the button image.  When the button target ends with
    ``.avi`` that target is loaded in turn and the anchor around the
    "cliquer ici" text provides the final link.

    Returns ``{'Title': ..., 'url': ...}``; a value stays ``None`` when the
    corresponding piece was not found.
    """
    result = {'Title': None, 'url': None}
    soup = BeautifulSoup(getHTML(url))

    # New version: the anchor itself carries the class.
    button = soup.find('a', attrs={'class': 'bouton-telecharger'})
    if button:
        downloadPage = button['href']
    else:
        # Old version: the anchor is the parent of the button image.
        image = soup.find(
            'img',
            attrs={'src': 'http://www.arretsurimages.net/images/boutons/bouton-telecharger.png'})
        downloadPage = image.findParent()['href'] if image else ''

    if not downloadPage.endswith('.avi'):
        print("bouton-telecharger not found")
        return result

    print(downloadPage)
    # The title is the last path component of the download page URL.
    result['Title'] = downloadPage.rsplit('/', 1)[-1]
    print(result['Title'])

    click = BeautifulSoup(getHTML(downloadPage)).find(text=re.compile('cliquer ici'))
    if click:
        result['url'] = click.findParent()['href']
        print(result['url'])
    else:
        print("No \"cliquer ici\" found")
    return result
def __init__(self, url=None):
    """Load a listing page into ``self.html`` and extract ``self.navItems``.

    When no ``url`` is given and the 'subtitleEnable' setting equals
    'true', the '/page/' part of ``URLNEW`` is replaced by a query string
    carrying ``Subtitle().language`` and the page is requested with
    ``savesessioncookie=True``.  In every other case ``url`` (or ``URLNEW``
    when ``url`` is None) is requested without extra arguments.
    """
    subtitlesEnabled = __settings__.getSetting('subtitleEnable') == 'true'
    if subtitlesEnabled and url is None:
        query = '?lang=%s&event=0&duration=0&orderedby=NEWEST&tagid=0' % Subtitle().language
        self.html = getHTML(URLNEW.replace('/page/', query),
                            savesessioncookie=True)
    else:
        target = URLNEW if url is None else url
        self.html = getHTML(target)
    self.navItems = TedTalks().getNavItems(self.html)
def getVideoDetails(self, url):
    """Scrape the page at ``url`` and return the video details as a dict.

    Keys of the returned dict: Title, Director, Genre, Plot, PlotOutline,
    id, url.
    """
    # TODO: get 'related tags' and list them under genre
    soup = BeautifulSoup(getHTML(url))

    headline = soup.find('span', attrs={'id': 'altHeadline'}).string
    # The speaker is whatever precedes the first colon of the headline.
    presenter = headline.split(':', 1)[0]
    tagline = soup.find('p', attrs={'id': 'tagline'}).string

    # No break on purpose: when several anchors match, the last one wins;
    # when none matches, ``url`` keeps the value that was passed in.
    for anchor in soup.findAll('a'):
        if re.match('Watch.*high-res', str(anchor.string)):
            url = URLTED + anchor['href']

    # The id is the last path component of the (possibly replaced) url.
    talkId = url.split('/')[-1]
    return {
        'Title': headline,
        'Director': presenter,
        'Genre': 'TED',
        'Plot': tagline,
        'PlotOutline': tagline,
        'id': talkId,
        'url': url,
    }
def genLangAbbr(self):
    """Fill ``self.langAbbr`` with a language-name -> abbreviation mapping.

    The mapping starts with English/'eng' and is extended with every
    (title, abbreviation) pair matched in the page returned for URLLANG.
    """
    pattern = '<a title="(.*?)" href="/translate/languages/(.*?)">'
    self.langAbbr = {'English': 'eng'}
    page = getHTML(URLLANG)
    # findall yields (title, abbreviation) tuples, which update() accepts
    # directly; later pairs overwrite earlier ones with the same title.
    self.langAbbr.update(re.findall(pattern, page))
def __init__(self, url=None):
    """Load a speakers page into ``self.html``.

    url -- page to load; when None, ``URLSPEAKERS + '9999'`` is used.

    ``self.navItems`` is only set when ``URLSPEAKERS`` is part of the url.
    """
    # adding 9999 to the url takes the script to the very last page of the
    # list, providing the total # of pages.
    if url is None:
        url = URLSPEAKERS + '9999'
    self.html = getHTML(url)
    # only bother with navItems where they have a chance to appear.
    if URLSPEAKERS in url:
        self.navItems = TedTalks().getNavItems(self.html)
def removeFromFavorites(self, user, url):
    """Request removal of the talk at ``url`` from the favorites.

    user must be TedTalks().User object with .id attribute.

    Returns True when the removal request produces a truthy response;
    in every other case the method falls through and returns None.
    """
    if user.id is None:
        print('[%s] %s invalid user object' % (pluginName, __name__))
        return

    talkId = TedTalks().getVideoDetails(url)['id']
    print(talkId)
    if getHTML(URLREMFAV % (talkId)):
        print('[%s] %s removeFromFavorites success' % (pluginName, __name__))
        return True
def getAllSpeakers(self):
    """Yield a ``{'url': ..., 'Title': ...}`` dict for every speaker link.

    Pages 1 to ``self.navItems['selected']`` are walked in order.  Every
    anchor whose href matches the speakers pattern yields one dict.

    The page numbered ``selected`` is taken from ``self.html`` instead of
    being requested again (presumably that is the page ``self.html`` was
    loaded from -- confirm against the constructor).  The previous code
    hard-coded index 8 for it and compared with ``is not``, which only
    matched the already-loaded page when there were exactly nine pages.
    """
    speakerContainers = SoupStrainer(
        attrs={'href': re.compile(r'/speakers/\S.+?.html')})
    pageCount = self.navItems['selected']
    lastIndex = pageCount - 1
    for i in range(pageCount):
        # don't parse the last page twice.
        if i != lastIndex:
            html = getHTML(URLSPEAKERS + str(i + 1))
        else:
            html = self.html
        for speaker in BeautifulSoup(html, parseOnlyThese=speakerContainers):
            title = speaker.string
            link = URLTED + speaker['href']
            yield {'url': link, 'Title': title}
def getFavoriteTalks(self, user, url=URLFAVORITES):
    """Yield ``{'url', 'Title', 'Thumb'}`` dicts for the user's favorites.

    user must be TedTalks().User object with .id attribute.  When
    ``user.id`` is None a message is printed and nothing is yielded.
    """
    if user.id is None:
        print('[%s] %s invalid user object' % (pluginName, __name__))
        return

    page = getHTML(url + user.id)
    boxes = SoupStrainer(attrs={'class': re.compile('box clearfix')})
    jpgSource = re.compile(r'.+?\.jpg')
    for box in BeautifulSoup(page, parseOnlyThese=boxes):
        title = box.h4.a.string
        link = URLTED + box.dt.a['href']
        # The thumbnail is the first image whose src matches the jpg pattern.
        image = box.find('img', attrs={'src': jpgSource})
        yield {'url': link, 'Title': title, 'Thumb': resizeImage(image['src'])}
def getLoginResponse(self, url=URLLOGIN):
    """Fill in and submit the sign-in form found at ``url``.

    The second form of the page (index 1) receives ``self.username``,
    ``self.password`` and the "remember me" flag.  Returns whatever
    getHTML returns for the submitted form request.
    """
    # clientform doesn't like HTML, and I don't want to monkey patch it,
    # so getUrllib2ResponseObject was born.
    response = getUrllib2ResponseObject(url)
    try:
        forms = ParseResponse(response, backwards_compat=False)
    finally:
        # Release the response even when form parsing raises.
        response.close()

    # set username & password in the signin form
    form = forms[1]
    form["users[username]"] = self.username
    form["users[password]"] = self.password
    form["users[rememberme]"] = ['1']
    # click submit
    return getHTML(form.click())
def isLoggedIn(self, username):
    """Return True if @username is already logged in, False otherwise.

    The account page (URLMONCOMPTE) is loaded; a session is considered
    open when its title is the "Mon compte" one, and the user matches when
    the element following the e-mail sentence has ``username`` as string.
    """
    soup = BeautifulSoup(getHTML(URLMONCOMPTE))
    if soup.title.string != u'Arrêt sur images – Mon compte':
        # Not on the account page: nobody is logged in.
        return False

    # Already logged in, check that the username is still the same
    marker = soup.find(text=re.compile(u'L’e-mail que vous utilisez pour @si est.*'))
    if marker and marker.next.string == username:
        return True
    print("Already logged in, but username does not match...")
    return False
def getTalks(self):
    """Yield ``{'url', 'Title', 'Thumb'}`` dicts for the talks of a theme.

    The JSON address is read from the first ``DataSource("...")`` call in
    ``self.html``; each entry of its result list carries an HTML fragment
    under 'markup' from which link, title and picture are extracted.
    """
    # themes loaded with a json call. Why are they not more consistant?
    from simplejson import loads

    # It is easier to use regex here than BS to find the "api" link.
    dataSource = re.findall(r'DataSource\("(.+?)"', self.html)[0]
    payload = loads(getHTML(URLTED + dataSource))

    jpgSource = re.compile(r'.+?\.jpg')
    for entry in payload['resultSet']['result']:
        talk = BeautifulSoup(entry['markup'])
        link = URLTED + talk.dt.a['href']
        title = cleanHTML(talk.dt.a['title'])
        image = talk.find('img', attrs={'src': jpgSource})
        yield {'url': link, 'Title': title, 'Thumb': resizeImage(image['src'])}
def isLoggedIn(self, username):
    """Return True if @username is already logged in, False otherwise.

    Loads the account page (URLMONCOMPTE) and compares both the page title
    and the string of the element that follows the e-mail sentence.
    """
    accountPage = BeautifulSoup(getHTML(URLMONCOMPTE))
    sessionOpen = accountPage.title.string == u'Arrêt sur images – Mon compte'
    if sessionOpen:
        # A session exists; make sure it belongs to the requested user.
        emailPattern = re.compile(u'L’e-mail que vous utilisez pour @si est.*')
        emailText = accountPage.find(text=emailPattern)
        if emailText and emailText.next.string == username:
            return True
        print("Already logged in, but username does not match...")
    return False
def getVideoDownloadLink(self, url):
    """Return the video title and download link as ``{'Title', 'url'}``.

    Two pages are involved: the page at ``url`` holds the download button,
    and the button target (when it ends with ``.avi``) holds the anchor
    around the "cliquer ici" text, whose href is the download link.
    Either value is None when it could not be determined.
    """
    title = link = None
    firstPage = BeautifulSoup(getHTML(url))

    downloadPage = ''
    # Look for the "bouton-telecharger" class (new version)
    anchor = firstPage.find('a', attrs={'class': 'bouton-telecharger'})
    if anchor:
        downloadPage = anchor['href']
    else:
        # Look for the "bouton-telecharger" image (old version)
        buttonSrc = 'http://www.arretsurimages.net/images/boutons/bouton-telecharger.png'
        buttonImage = firstPage.find('img', attrs={'src': buttonSrc})
        if buttonImage:
            downloadPage = buttonImage.findParent()['href']

    if downloadPage.endswith('.avi'):
        print(downloadPage)
        # The file name at the end of the URL doubles as the title.
        title = downloadPage.split('/')[-1]
        print(title)
        secondPage = BeautifulSoup(getHTML(downloadPage))
        clickText = secondPage.find(text=re.compile('cliquer ici'))
        if not clickText:
            print("No \"cliquer ici\" found")
        else:
            link = clickText.findParent()['href']
            print(link)
    else:
        print("bouton-telecharger not found")
    return dict(Title=title, url=link)
def getVideoDetails(self, url):
    """Return the details of the talk page at ``url``.

    The returned dict has the keys Title, Director, Genre, Plot,
    PlotOutline, id and url.
    """
    # TODO: get 'related tags' and list them under genre
    page = BeautifulSoup(getHTML(url))
    details = {'Genre': 'TED'}

    # Title, and the speaker taken from the part before the first colon.
    details['Title'] = page.find('span', attrs={'id': 'altHeadline'}).string
    details['Director'] = details['Title'].split(':', 1)[0]

    # Description, used for both plot fields.
    description = page.find('p', attrs={'id': 'tagline'}).string
    details['Plot'] = description
    details['PlotOutline'] = description

    # Every matching anchor overrides the url, so the last match is kept;
    # without any match the url argument is used unchanged.
    videoUrl = url
    for candidate in page.findAll('a'):
        if re.match('Watch.*high-res', str(candidate.string)):
            videoUrl = URLTED + candidate['href']
    details['url'] = videoUrl

    # The id is the text after the last slash of the url.
    details['id'] = videoUrl.split('/')[-1]
    return details
def login(self, username=None, password=None):
    """Try to login using @username and @password.

    Return True if successful, False otherwise.  Nothing is requested when
    either credential is missing or empty.  Success is detected by the
    title of the page returned after submitting the third form (index 2)
    of the login page.
    """
    if username and password:
        response = getUrllib2ResponseObject(URLLOGIN)
        try:
            forms = ParseResponse(response, backwards_compat=False)
        finally:
            # Release the response even when form parsing raises.
            response.close()
        # Set username & password in the signin form
        form = forms[2]
        form["username"] = username
        form["password"] = password
        # Click submit
        html = getHTML(form.click())
        soup = BeautifulSoup(html)
        if soup.title.string == u'Le Forum Arrêt Sur Images':
            # We are on the forum page - login successful
            return True
    return False
def login(self, username=None, password=None):
    """Try to login using @username and @password.

    Return True if successful, False otherwise.  Without both credentials
    no request is made and False is returned.  The third form (index 2) of
    the login page is filled in and submitted; the login counts as
    successful when the resulting page has the forum title.
    """
    if not (username and password):
        return False

    response = getUrllib2ResponseObject(URLLOGIN)
    try:
        forms = ParseResponse(response, backwards_compat=False)
    finally:
        # Close the response whether or not the forms could be parsed.
        response.close()

    # Set username & password in the signin form
    form = forms[2]
    form["username"] = username
    form["password"] = password

    # Click submit
    soup = BeautifulSoup(getHTML(form.click()))
    if soup.title.string == u'Le Forum Arrêt Sur Images':
        # We are on the forum page - login successful
        return True
    return False
def getVideoDetails(self, url, streams):
    """Return the video title and link as ``{'Title': ..., 'url': ...}``.

    url     -- video id, substituted into JSONREQUEST
    streams -- keys of the JSON result to try, in order of preference

    The link is the first truthy ``result[stream]``; when none is truthy
    the link is the string 'None'.
    """
    # The json request is built from the video id passed in url.
    details = simplejson.loads(getHTML(JSONREQUEST % url))

    # The preferred quality might be unavailable, so walk the streams in
    # order and keep the first one that has a value.
    for quality in streams:
        candidate = details[quality]
        if candidate:
            print("Found %s link" % quality)
            link = candidate
            break
    else:
        print("No video link found for this video id")
        link = 'None'

    return {'Title': details["title"], 'url': link}
def getVideoDetails(self, url, streams):
    """Return the video title and link.

    ``url`` is the video id used to build the JSON request; ``streams``
    lists the result keys to try in order.  The returned dict has the keys
    'Title' and 'url'; 'url' is the string 'None' when no stream key holds
    a truthy value.
    """
    response = getHTML(JSONREQUEST % url)
    info = simplejson.loads(response)

    # Fall back through the stream qualities in the order given.
    link = 'None'
    found = False
    for name in streams:
        if info[name]:
            print("Found %s link" % name)
            link = info[name]
            found = True
            break
    if not found:
        print("No video link found for this video id")

    title = info["title"]
    return dict(Title=title, url=link)
def getProgramParts(self, url, name, icon):
    """Return all parts of a program as a list of dicts.

    Each dict has the keys 'url' (the video id, which allows to get the
    video url with a json request), 'Title' and 'Thumb'.

    url  -- address of the program page
    name -- program name, used as prefix of every part title
    icon -- thumbnail used when none can be found next to the video
    """
    html = getHTML(url)
    soup = BeautifulSoup(html)
    parts = []

    # Every <param name="movie"> whose parent has an id is one part.
    for param in soup.findAll('param', attrs={'name': 'movie'}):
        holder = param.parent
        try:
            videoId = holder["id"]
        except KeyError:
            continue
        # Parts are numbered from 1 in the order they are kept.
        title = name + ' - Acte %d' % (len(parts) + 1)
        # Try to get the icon linked to the iPhone video on that page.
        # That's faster than getting it from the json request (see
        # getVideoDetails), which would require one extra HTML request
        # for each part.
        try:
            media = holder.parent.find(text=re.compile(u'img src='))
            thumb = URLASI + re.search(u'img src="(.*?)"', media).group(1)
        except (TypeError, AttributeError):
            thumb = icon
        parts.append({'url': videoId, 'Title': title, 'Thumb': thumb})

    if u'ux sources' in soup.title.string and len(parts) == 2:
        # '@ux sources' is not cut in parts but getting the title is not
        # easy as it's not in a field linked to the video.
        # Use a hack: since 20111110, "version intégrale" is first
        if re.search('Voici la version intégrale', html):
            suffixes = (u' - intégrale', u' - aperçu')
        else:
            # Before 20111104, the short video (version montée) was first
            suffixes = (u' - montée', u' - intégrale')
        for entry, suffix in zip(parts, suffixes):
            entry['Title'] = name + suffix.encode('utf-8')
    return parts
def getProgramParts(self, url, name, icon):
    """Return all parts of a program (video id).

    The result is a list of ``{'url', 'Title', 'Thumb'}`` dicts where 'url'
    holds the video id; the video id allows to get the video url with a
    json request.  ``icon`` is the fallback thumbnail.
    """
    pageSource = getHTML(url)
    page = BeautifulSoup(pageSource)
    thumbMarker = re.compile(u'img src=')
    found = []
    actNumber = 1

    # Get all movie id
    for movieParam in page.findAll('param', attrs={'name': 'movie'}):
        try:
            videoId = movieParam.parent["id"]
        except KeyError:
            # No id on the parent: this param is not a usable part.
            continue
        partTitle = name + ' - Acte %d' % actNumber
        # The icon linked to the iPhone video on the page is preferred:
        # getting it from the json request (see getVideoDetails) would
        # cost one extra HTML request for each part.
        try:
            mediaText = movieParam.parent.parent.find(text=thumbMarker)
            srcMatch = re.search(u'img src="(.*?)"', mediaText)
            partThumb = URLASI + srcMatch.group(1)
        except (TypeError, AttributeError):
            partThumb = icon
        found.append(dict(url=videoId, Title=partTitle, Thumb=partThumb))
        actNumber += 1

    # actNumber == 3 means exactly two parts were kept.
    if u'ux sources' in page.title.string and actNumber == 3:
        # '@ux sources' is not cut in parts but getting the title is not
        # easy as it's not in a field linked to the video.
        first, second = found[0], found[1]
        if re.search('Voici la version intégrale', pageSource):
            # Hack: since 20111110, "version intégrale" is first
            first['Title'] = name + u' - intégrale'.encode('utf-8')
            second['Title'] = name + u' - aperçu'.encode('utf-8')
        else:
            # Before 20111104, the short video (version montée) was first
            first['Title'] = name + u' - montée'.encode('utf-8')
            second['Title'] = name + u' - intégrale'.encode('utf-8')
    return found
def __init__(self, url=None):
    """Load ``url`` (URLNEW when None) and extract the navigation items.

    Sets ``self.html`` to the result of getHTML and ``self.navItems`` to
    what ``TedTalks().getNavItems`` returns for it.
    """
    target = URLNEW if url is None else url
    self.html = getHTML(target)
    self.navItems = TedTalks().getNavItems(self.html)
def __init__(self, url):
    """Load ``url`` into ``self.html`` and derive ``self.navItems`` from it."""
    page = getHTML(url)
    self.html = page
    # Get the navigation items
    self.navItems = ArretSurImages().getNavItems(page)
def __init__(self, url=None):
    """Load a themes page into ``self.html``.

    url -- page to load; URLTHEMES is used when it is None.
    """
    if url is None:
        url = URLTHEMES
    self.html = getHTML(url)
def genLangAbbr(self):
    """Populate ``self.langAbbr`` (language name -> abbreviation).

    English/'eng' is always present; every language link matched in the
    page returned for URLLANG adds or overwrites one entry.
    """
    self.langAbbr = {"English": "eng"}
    linkPattern = re.compile('<a title="(.*?)" href="/translate/languages/(.*?)">')
    for hit in linkPattern.finditer(getHTML(URLLANG)):
        # Group 1 is the link title, group 2 the abbreviation in the href.
        languageName, abbreviation = hit.groups()
        self.langAbbr[languageName] = abbreviation
def getTEDSubtitlesByTalkID(self, id):
    """Return getHTML's result for the subtitles of talk ``id``.

    The request URL combines the talk id with ``self.language`` and is
    printed before the request is made.
    """
    pieces = (
        'http://www.ted.com/talks/subtitles/id/',
        str(id),
        '/lang/',
        self.language,
    )
    tedSubtitleUrl = ''.join(pieces)
    print(tedSubtitleUrl)
    return getHTML(tedSubtitleUrl)
def getTEDSubtitlesByTalkID(self, id):
    """Request the subtitles of talk ``id`` in ``self.language``.

    The URL is printed, then passed to getHTML whose result is returned.
    """
    # Talk part of the address first, language part second.
    tedSubtitleUrl = "http://www.ted.com/talks/subtitles/id/" + str(id)
    tedSubtitleUrl += "/lang/" + self.language
    print(tedSubtitleUrl)
    return getHTML(tedSubtitleUrl)