Python getHTML примеры, util.getHTML Python примеры использования

Пример #1

0

Показать файл

Файл: asi_scraper.py Проект: beenje/plugin.video.arretsurimages

 def getVideoDownloadLink(self, url):
     """Scrape the page at ``url`` for a downloadable ``.avi`` video.

     Returns a dict with the keys ``'Title'`` and ``'url'``. Both values
     are None when no ``.avi`` download page is found; ``'url'`` alone is
     None when the download page has no "cliquer ici" link.
     """
     page = BeautifulSoup(getHTML(url))
     downloadUrl = ''
     # New layout: the anchor itself carries the "bouton-telecharger" class.
     anchor = page.find('a', attrs={'class': 'bouton-telecharger'})
     if anchor:
         downloadUrl = anchor['href']
     else:
         # Old layout: the button is an image; its parent holds the href.
         button = page.find(
             'img',
             attrs={'src': 'http://www.arretsurimages.net/images/boutons/bouton-telecharger.png'})
         if button:
             downloadUrl = button.findParent()['href']
     if not downloadUrl.endswith('.avi'):
         print("bouton-telecharger not found")
         return {'Title': None, 'url': None}
     print(downloadUrl)
     # The last path component of the download URL is used as the title.
     title = downloadUrl.split('/')[-1]
     print(title)
     confirmPage = BeautifulSoup(getHTML(downloadUrl))
     clickHere = confirmPage.find(text=re.compile('cliquer ici'))
     link = None
     if clickHere:
         link = clickHere.findParent()['href']
         print(link)
     else:
         print("No \"cliquer ici\" found")
     return {'Title': title, 'url': link}

Пример #2

0

Показать файл

Файл: ted_talks_scraper.py Проект: cjrules/xbmc-korean

        def __init__(self, url=None):
            """Fetch a talk listing page and extract its navigation items.

            When the ``subtitleEnable`` setting is ``'true'`` and no ``url``
            is given, the listing URL is built from ``URLNEW`` with the
            subtitle language in its query string and fetched with
            ``savesessioncookie=True``. Otherwise ``url`` (``URLNEW`` when
            omitted) is fetched as-is.

            Sets ``self.html`` and ``self.navItems``.
            """
            # Indentation is spaces-only: the previous mix of tabs and spaces
            # only parsed under Python 2's tab-expansion rule.
            if __settings__.getSetting('subtitleEnable') == 'true' and url is None:
                url = URLNEW.replace(
                    '/page/',
                    '?lang=%s&event=0&duration=0&orderedby=NEWEST&tagid=0' % Subtitle().language)
                self.html = getHTML(url, savesessioncookie=True)
            else:
                if url is None:
                    url = URLNEW
                self.html = getHTML(url)
            self.navItems = TedTalks().getNavItems(self.html)

Пример #3

0

Показать файл

Файл: ted_talks_scraper.py Проект: whf839/xbmc-addons

 def getVideoDetails(self, url):
     """Scrape the talk page at ``url`` and return its metadata.

     The returned dict has the keys Title, Director, Genre, Plot,
     PlotOutline, id and url.
     """
     #TODO: get 'related tags' and list them under genre
     soup = BeautifulSoup(getHTML(url))
     headline = soup.find('span', attrs={'id': 'altHeadline'}).string
     # The speaker is whatever precedes the first colon of the headline.
     presenter = headline.split(':', 1)[0]
     tagline = soup.find('p', attrs={'id': 'tagline'}).string
     # Switch to the high-res link when the page has one; if several
     # anchors match, the last one wins.
     hiResLinks = [URLTED + anchor['href']
                   for anchor in soup.findAll('a')
                   if re.match('Watch.*high-res', str(anchor.string))]
     if hiResLinks:
         url = hiResLinks[-1]
     # The id is the last path component of the (possibly replaced) url.
     talkId = url.split('/')[-1]
     details = {}
     details['Title'] = headline
     details['Director'] = presenter
     details['Genre'] = 'TED'
     details['Plot'] = tagline
     details['PlotOutline'] = tagline
     details['id'] = talkId
     details['url'] = url
     return details

Пример #4

0

Показать файл

 def genLangAbbr(self):
     """Build ``self.langAbbr``, a mapping of language name to abbreviation.

     Starts from the single entry English -> eng and adds every
     (title, code) pair matched in the page fetched from ``URLLANG``.
     """
     self.langAbbr = {'English': 'eng'}
     page = getHTML(URLLANG)
     languageLink = re.compile(
         '<a title="(.*?)" href="/translate/languages/(.*?)">')
     # findall returns (full name, abbreviation) tuples, which
     # dict.update consumes as key/value pairs in order.
     self.langAbbr.update(languageLink.findall(page))

Пример #5

0

Показать файл

Файл: ted_talks_scraper.py Проект: cjrules/xbmc-korean

 def __init__(self, url=None):
     """Fetch a speakers page into ``self.html``.

     ``url`` defaults to ``URLSPEAKERS + '9999'``. ``self.navItems`` is
     only set when ``URLSPEAKERS`` occurs in the url.
     """
     # adding 9999 to the url takes the script to the very last page of the list, providing the total # of pages.
     if url is None:
         url = URLSPEAKERS+'9999'
     self.html = getHTML(url)
     # only bother with navItems where they have a chance to appear.
     # NOTE(review): for any other url self.navItems stays unset, so code
     # reading it afterwards would raise AttributeError - confirm intended.
     if URLSPEAKERS in url:
         self.navItems = TedTalks().getNavItems(self.html)

Пример #6

0

Показать файл

Файл: ted_talks_scraper.py Проект: drrlramsey/xbmc-addons

 def __init__(self, url=None):
     """Fetch a speakers page into ``self.html``.

     ``url`` defaults to ``URLSPEAKERS + '9999'``. ``self.navItems`` is
     only set when ``URLSPEAKERS`` occurs in the url.
     """
     # adding 9999 to the url takes the script to the very last page of the list, providing the total # of pages.
     if url is None:
         url = URLSPEAKERS+'9999'
     self.html = getHTML(url)
     # only bother with navItems where they have a chance to appear.
     # NOTE(review): for any other url self.navItems stays unset, so code
     # reading it afterwards would raise AttributeError - confirm intended.
     if URLSPEAKERS in url:
         self.navItems = TedTalks().getNavItems(self.html)

Пример #7

0

Показать файл

Файл: ted_talks_scraper.py Проект: cjrules/xbmc-korean

 def removeFromFavorites(self, user, url):
     """Remove the talk at ``url`` from the user's favorites.

     ``user`` must be a TedTalks().User object with an ``.id`` attribute.
     Returns True when the removal request yields a truthy response;
     otherwise falls through and returns None.
     """
     if user.id is None:
         print('[%s] %s invalid user object' % (pluginName, __name__))
         return
     # The removal URL is keyed by the talk id, looked up from the talk page.
     talkId = TedTalks().getVideoDetails(url)['id']
     print(talkId)
     if getHTML(URLREMFAV % (talkId)):
         print('[%s] %s removeFromFavorites success' % (pluginName, __name__))
         return True

Пример #8

0

Показать файл

Файл: ted_talks_scraper.py Проект: drrlramsey/xbmc-addons

 def removeFromFavorites(self, user, url):
     """Remove the talk at ``url`` from the user's favorites.

     ``user`` must be a TedTalks().User object with an ``.id`` attribute.
     Returns True when the removal request yields a truthy response;
     otherwise falls through and returns None.
     """
     if user.id is None:
         print('[%s] %s invalid user object' % (pluginName, __name__))
         return
     # The removal URL is keyed by the talk id, looked up from the talk page.
     talkId = TedTalks().getVideoDetails(url)['id']
     print(talkId)
     if getHTML(URLREMFAV % (talkId)):
         print('[%s] %s removeFromFavorites success' % (pluginName, __name__))
         return True

Пример #9

0

Показать файл

Файл: ted_talks_scraper.py Проект: cjrules/xbmc-korean

 def getAllSpeakers(self):
     """Yield a ``{'url': ..., 'Title': ...}`` dict per speaker link.

     Walks ``range(self.navItems['selected'])`` listing pages, fetching
     each one except index 8, for which the already-loaded ``self.html``
     is reused.
     """
     # Raw string so that \S reaches the regex engine verbatim.
     speakerContainers = SoupStrainer(attrs={'href': re.compile(r'/speakers/\S.+?.html')})
     for i in range(self.navItems['selected']):
         # don't parse the last page twice.
         # Compare by value: `is not` on an int literal tests identity and
         # only works by accident of CPython's small-int caching.
         # NOTE(review): the literal 8 presumably matched the page count
         # when this was written - confirm whether it should be derived
         # from self.navItems['selected'] instead.
         if i != 8:
             html = getHTML(URLSPEAKERS + str(i + 1))
         else:
             html = self.html
         for speaker in BeautifulSoup(html, parseOnlyThese=speakerContainers):
             title = speaker.string
             link = URLTED + speaker['href']
             yield {'url': link, 'Title': title}

Пример #10

0

Показать файл

Файл: ted_talks_scraper.py Проект: drrlramsey/xbmc-addons

 def getFavoriteTalks(self, user, url=URLFAVORITES):
     """Yield the favorite talks of ``user``, one dict per talk.

     ``user`` must be a TedTalks().User object with an ``.id`` attribute.
     Each dict has the keys 'url', 'Title' and 'Thumb'. When ``user.id``
     is None a message is printed and nothing is yielded.
     """
     if user.id is None:
         print('[%s] %s invalid user object' % (pluginName, __name__))
         return
     page = getHTML(url + user.id)
     jpgSource = re.compile('.+?\.jpg')
     # Restrict parsing to the elements whose class matches 'box clearfix'.
     boxes = SoupStrainer(attrs={'class': re.compile('box clearfix')})
     for box in BeautifulSoup(page, parseOnlyThese=boxes):
         heading = box.h4.a.string
         target = URLTED + box.dt.a['href']
         thumb = resizeImage(box.find('img', attrs={'src': jpgSource})['src'])
         yield {'url': target, 'Title': heading, 'Thumb': thumb}

Пример #11

0

Показать файл

Файл: ted_talks_scraper.py Проект: cjrules/xbmc-korean

 def getFavoriteTalks(self, user, url=URLFAVORITES):
     """Yield the favorite talks of ``user``, one dict per talk.

     ``user`` must be a TedTalks().User object with an ``.id`` attribute.
     Each dict has the keys 'url', 'Title' and 'Thumb'. When ``user.id``
     is None a message is printed and nothing is yielded.
     """
     if user.id is None:
         print('[%s] %s invalid user object' % (pluginName, __name__))
         return
     page = getHTML(url + user.id)
     jpgSource = re.compile('.+?\.jpg')
     # Restrict parsing to the elements whose class matches 'box clearfix'.
     boxes = SoupStrainer(attrs={'class': re.compile('box clearfix')})
     for box in BeautifulSoup(page, parseOnlyThese=boxes):
         heading = box.h4.a.string
         target = URLTED + box.dt.a['href']
         thumb = resizeImage(box.find('img', attrs={'src': jpgSource})['src'])
         yield {'url': target, 'Title': heading, 'Thumb': thumb}

Пример #12

0

Показать файл

Файл: ted_talks_scraper.py Проект: drrlramsey/xbmc-addons

 def getAllSpeakers(self):
     """Yield a ``{'url': ..., 'Title': ...}`` dict per speaker link.

     Walks ``range(self.navItems['selected'])`` listing pages, fetching
     each one except index 8, for which the already-loaded ``self.html``
     is reused.
     """
     # Raw string so that \S reaches the regex engine verbatim.
     speakerContainers = SoupStrainer(attrs={'href': re.compile(r'/speakers/\S.+?.html')})
     for i in range(self.navItems['selected']):
         # don't parse the last page twice.
         # Compare by value: `is not` on an int literal tests identity and
         # only works by accident of CPython's small-int caching.
         # NOTE(review): the literal 8 presumably matched the page count
         # when this was written - confirm whether it should be derived
         # from self.navItems['selected'] instead.
         if i != 8:
             html = getHTML(URLSPEAKERS + str(i + 1))
         else:
             html = self.html
         for speaker in BeautifulSoup(html, parseOnlyThese=speakerContainers):
             title = speaker.string
             link = URLTED + speaker['href']
             yield {'url': link, 'Title': title}

Пример #13

0

Показать файл

Файл: ted_talks_scraper.py Проект: drrlramsey/xbmc-addons

 def getLoginResponse(self, url = URLLOGIN):
     """Submit the sign-in form found at ``url`` and return the reply.

     Fills the second form on the page (``forms[1]``) with
     ``self.username`` / ``self.password``, ticks "remember me", and
     returns whatever ``getHTML`` yields for the submit request.
     """
     #clientform doesn't like HTML, and I don't want to monkey patch it, so getUrllib2ResponseObject was born.
     response = getUrllib2ResponseObject(url)
     try:
         forms = ParseResponse(response, backwards_compat=False)
     finally:
         # Close the response even when form parsing raises.
         response.close()
     #set username & password in the signin form
     form = forms[1]
     form["users[username]"] = self.username
     form["users[password]"] = self.password
     form["users[rememberme]"] = ['1']
     #click submit
     return getHTML(form.click())

Пример #14

0

Показать файл

Файл: ted_talks_scraper.py Проект: cjrules/xbmc-korean

 def getLoginResponse(self, url = URLLOGIN):
     """Submit the sign-in form found at ``url`` and return the reply.

     Fills the second form on the page (``forms[1]``) with
     ``self.username`` / ``self.password``, ticks "remember me", and
     returns whatever ``getHTML`` yields for the submit request.
     """
     #clientform doesn't like HTML, and I don't want to monkey patch it, so getUrllib2ResponseObject was born.
     response = getUrllib2ResponseObject(url)
     try:
         forms = ParseResponse(response, backwards_compat=False)
     finally:
         # Close the response even when form parsing raises.
         response.close()
     #set username & password in the signin form
     form = forms[1]
     form["users[username]"] = self.username
     form["users[password]"] = self.password
     form["users[rememberme]"] = ['1']
     #click submit
     return getHTML(form.click())

Пример #15

0

Показать файл

Файл: asi_scraper.py Проект: beenje/plugin.video.arretsurimages

 def isLoggedIn(self, username):
     """Tell whether ``username`` is the account currently logged in.

     Returns True only when the page at ``URLMONCOMPTE`` has the account
     page title and the text following the e-mail label equals
     ``username``; returns False otherwise.
     """
     page = BeautifulSoup(getHTML(URLMONCOMPTE))
     if page.title.string != u'Arrêt sur images – Mon compte':
         # Not the account page, so nobody is logged in.
         return False
     # Logged in: make sure it is with the expected username.
     label = page.find(text=re.compile(u'L’e-mail que vous utilisez pour @si est.*'))
     if label and label.next.string == username:
         return True
     print("Already logged in, but username does not match...")
     return False

Пример #16

0

Показать файл

Файл: ted_talks_scraper.py Проект: drrlramsey/xbmc-addons

 def getTalks(self):
     """Yield a dict ('url', 'Title', 'Thumb') for each talk listed.

     ``self.html`` only holds a ``DataSource("...")`` reference to a JSON
     resource; that resource is fetched and each entry under
     ``resultSet.result`` supplies an HTML snippet in its 'markup' field.
     """
     from simplejson import loads
     # A regex is simpler than BeautifulSoup for pulling out the JSON path.
     dataSourcePath = re.findall('DataSource\("(.+?)"', self.html)[0]
     payload = loads(getHTML(URLTED + dataSourcePath))
     jpgSource = re.compile('.+?\.jpg')
     for entry in payload['resultSet']['result']:
         snippet = BeautifulSoup(entry['markup'])
         anchor = snippet.dt.a
         link = URLTED + anchor['href']
         title = cleanHTML(anchor['title'])
         thumb = resizeImage(snippet.find('img', attrs={'src': jpgSource})['src'])
         yield {'url': link, 'Title': title, 'Thumb': thumb}

Пример #17

0

Показать файл

Файл: asi_scraper.py Проект: mossroy/plugin.video.arretsurimages

 def isLoggedIn(self, username):
     """Tell whether ``username`` is the account currently logged in.

     Returns True only when the page at ``URLMONCOMPTE`` has the account
     page title and the text following the e-mail label equals
     ``username``; returns False otherwise.
     """
     page = BeautifulSoup(getHTML(URLMONCOMPTE))
     if page.title.string != u'Arrêt sur images – Mon compte':
         # Not the account page, so nobody is logged in.
         return False
     # Logged in: make sure it is with the expected username.
     labelPattern = re.compile(u'L’e-mail que vous utilisez pour @si est.*')
     label = page.find(text=labelPattern)
     if label and label.next.string == username:
         return True
     print("Already logged in, but username does not match...")
     return False

Пример #18

0

Показать файл

Файл: ted_talks_scraper.py Проект: cjrules/xbmc-korean

 def getTalks(self):
     """Yield a dict ('url', 'Title', 'Thumb') for each talk listed.

     ``self.html`` only holds a ``DataSource("...")`` reference to a JSON
     resource; that resource is fetched and each entry under
     ``resultSet.result`` supplies an HTML snippet in its 'markup' field.
     """
     from simplejson import loads
     # A regex is simpler than BeautifulSoup for pulling out the JSON path.
     dataSourcePath = re.findall('DataSource\("(.+?)"', self.html)[0]
     payload = loads(getHTML(URLTED + dataSourcePath))
     jpgSource = re.compile('.+?\.jpg')
     for entry in payload['resultSet']['result']:
         snippet = BeautifulSoup(entry['markup'])
         anchor = snippet.dt.a
         link = URLTED + anchor['href']
         title = cleanHTML(anchor['title'])
         thumb = resizeImage(snippet.find('img', attrs={'src': jpgSource})['src'])
         yield {'url': link, 'Title': title, 'Thumb': thumb}

Пример #19

0

Показать файл

Файл: asi_scraper.py Проект: mossroy/plugin.video.arretsurimages

 def getVideoDownloadLink(self, url):
     """Scrape the page at ``url`` for a downloadable ``.avi`` video.

     Returns a dict with the keys ``'Title'`` and ``'url'``. Both values
     are None when no ``.avi`` download page is found; ``'url'`` alone is
     None when the download page has no "cliquer ici" link.
     """
     page = BeautifulSoup(getHTML(url))
     downloadUrl = ''
     # New layout: the anchor itself carries the "bouton-telecharger" class.
     anchor = page.find('a', attrs={'class': 'bouton-telecharger'})
     if anchor:
         downloadUrl = anchor['href']
     else:
         # Old layout: the button is an image; its parent holds the href.
         buttonSrc = 'http://www.arretsurimages.net/images/boutons/bouton-telecharger.png'
         button = page.find('img', attrs={'src': buttonSrc})
         if button:
             downloadUrl = button.findParent()['href']
     if not downloadUrl.endswith('.avi'):
         print("bouton-telecharger not found")
         return {'Title': None, 'url': None}
     print(downloadUrl)
     # The last path component of the download URL is used as the title.
     title = downloadUrl.split('/')[-1]
     print(title)
     confirmPage = BeautifulSoup(getHTML(downloadUrl))
     clickHere = confirmPage.find(text=re.compile('cliquer ici'))
     link = None
     if clickHere:
         link = clickHere.findParent()['href']
         print(link)
     else:
         print("No \"cliquer ici\" found")
     return {'Title': title, 'url': link}

Пример #20

0

Показать файл

Файл: ted_talks_scraper.py Проект: drrlramsey/xbmc-addons

 def getVideoDetails(self, url):
     """Scrape the talk page at ``url`` and return its metadata.

     The returned dict has the keys Title, Director, Genre, Plot,
     PlotOutline, id and url.
     """
     #TODO: get 'related tags' and list them under genre
     soup = BeautifulSoup(getHTML(url))
     headline = soup.find('span', attrs={'id': 'altHeadline'}).string
     # The speaker is whatever precedes the first colon of the headline.
     presenter = headline.split(':', 1)[0]
     tagline = soup.find('p', attrs={'id': 'tagline'}).string
     # Switch to the high-res link when the page has one; if several
     # anchors match, the last one wins.
     hiResLinks = [URLTED + anchor['href']
                   for anchor in soup.findAll('a')
                   if re.match('Watch.*high-res', str(anchor.string))]
     if hiResLinks:
         url = hiResLinks[-1]
     # The id is the last path component of the (possibly replaced) url.
     talkId = url.split('/')[-1]
     return {
         'Title': headline,
         'Director': presenter,
         'Genre': 'TED',
         'Plot': tagline,
         'PlotOutline': tagline,
         'id': talkId,
         'url': url,
     }

Пример #21

0

Показать файл

Файл: asi_scraper.py Проект: beenje/plugin.video.arretsurimages

 def login(self, username = None, password = None):
     """Try to log in with ``username`` and ``password``.

     Fills the third form (``forms[2]``) of the page at ``URLLOGIN`` and
     submits it. Returns True when the resulting page carries the forum
     title, False otherwise (including when either credential is empty).
     """
     if username and password:
         response = getUrllib2ResponseObject(URLLOGIN)
         try:
             forms = ParseResponse(response, backwards_compat=False)
         finally:
             # Close the response even when form parsing raises.
             response.close()
         # Set username & password in the signin form
         form = forms[2]
         form["username"] = username
         form["password"] = password
         # Click submit
         html = getHTML(form.click())
         soup = BeautifulSoup(html)
         if soup.title.string == u'Le Forum Arrêt Sur Images':
             # We are on the forum page - login successful
             return True
     return False

Пример #22

0

Показать файл

Файл: asi_scraper.py Проект: mossroy/plugin.video.arretsurimages

 def login(self, username=None, password=None):
     """Try to log in with ``username`` and ``password``.

     Fills the third form (``forms[2]``) of the page at ``URLLOGIN`` and
     submits it. Returns True when the resulting page carries the forum
     title, False otherwise (including when either credential is empty).
     """
     if username and password:
         response = getUrllib2ResponseObject(URLLOGIN)
         try:
             forms = ParseResponse(response, backwards_compat=False)
         finally:
             # Close the response even when form parsing raises.
             response.close()
         # Set username & password in the signin form
         form = forms[2]
         form["username"] = username
         form["password"] = password
         # Click submit
         html = getHTML(form.click())
         soup = BeautifulSoup(html)
         if soup.title.string == u'Le Forum Arrêt Sur Images':
             # We are on the forum page - login successful
             return True
     return False

Пример #23

0

Показать файл

Файл: asi_scraper.py Проект: beenje/plugin.video.arretsurimages

 def getVideoDetails(self, url, streams):
     """Look up the title and a playable link for a video id.

     ``url`` is the video id inserted into ``JSONREQUEST``. ``streams``
     lists stream qualities in order of preference; the first one with a
     truthy entry in the JSON reply is used. When none has one, the
     'url' value is the string 'None'.
     """
     reply = getHTML(JSONREQUEST % url)
     result = simplejson.loads(reply)
     # Default for the case where no requested quality is available.
     link = 'None'
     for quality in streams:
         candidate = result[quality]
         if candidate:
             print("Found %s link" % quality)
             link = candidate
             break
     else:
         print("No video link found for this video id")
     return {'Title': result["title"], 'url': link}

Пример #24

0

Показать файл

Файл: asi_scraper.py Проект: mossroy/plugin.video.arretsurimages

 def getVideoDetails(self, url, streams):
     """Look up the title and a playable link for a video id.

     ``url`` is the video id inserted into ``JSONREQUEST``. ``streams``
     lists stream qualities in order of preference; the first one with a
     truthy entry in the JSON reply is used. When none has one, the
     'url' value is the string 'None'.
     """
     reply = getHTML(JSONREQUEST % url)
     result = simplejson.loads(reply)
     # Default for the case where no requested quality is available.
     link = 'None'
     for quality in streams:
         candidate = result[quality]
         if candidate:
             print("Found %s link" % quality)
             link = candidate
             break
     else:
         print("No video link found for this video id")
     return {'Title': result["title"], 'url': link}

Пример #25

0

Показать файл

Файл: asi_scraper.py Проект: mossroy/plugin.video.arretsurimages

    def getProgramParts(self, url, name, icon):
        """Collect the video parts embedded in the program page at ``url``.

        Returns a list of dicts with the keys 'url' (the video id, usable
        in a later json request), 'Title' and 'Thumb'. ``name`` prefixes
        every title; ``icon`` is the fallback thumbnail.
        """
        html = getHTML(url)
        soup = BeautifulSoup(html)
        parts = []
        for param in soup.findAll('param', attrs={'name': 'movie'}):
            holder = param.parent
            try:
                videoId = holder["id"]
            except KeyError:
                # No id on the enclosing element: not a usable video.
                continue
            # Parts are numbered from 1 in the order they are kept.
            title = name + ' - Acte %d' % (len(parts) + 1)
            # Take the thumbnail from the iPhone video markup on this page
            # rather than from the json request, which would cost one
            # extra HTML request per part.
            try:
                media = holder.parent.find(text=re.compile(u'img src='))
                found = re.search(u'img src="(.*?)"', media)
                thumb = URLASI + found.group(1)
            except (TypeError, AttributeError):
                thumb = icon
            parts.append({'url': videoId, 'Title': title, 'Thumb': thumb})
        if u'ux sources' in soup.title.string and len(parts) == 2:
            # '@ux sources' is not cut in acts, and its titles are not in
            # a field linked to the video, so derive them from page order.
            if re.search('Voici la version int&eacute;grale', html):
                # Since 20111110 the "version intégrale" comes first.
                labels = (u' - intégrale', u' - aperçu')
            else:
                # Before 20111104 the short video (version montée) was first.
                labels = (u' - montée', u' - intégrale')
            for entry, label in zip(parts, labels):
                entry['Title'] = name + label.encode('utf-8')
        return parts

Пример #26

0

Показать файл

Файл: asi_scraper.py Проект: beenje/plugin.video.arretsurimages

    def getProgramParts(self, url, name, icon):
        """Collect the video parts embedded in the program page at ``url``.

        Returns a list of dicts with the keys 'url' (the video id, usable
        in a later json request), 'Title' and 'Thumb'. ``name`` prefixes
        every title; ``icon`` is the fallback thumbnail.
        """
        html = getHTML(url)
        soup = BeautifulSoup(html)
        parts = []
        for param in soup.findAll('param', attrs={'name': 'movie'}):
            holder = param.parent
            try:
                videoId = holder["id"]
            except KeyError:
                # No id on the enclosing element: not a usable video.
                continue
            # Parts are numbered from 1 in the order they are kept.
            title = name + ' - Acte %d' % (len(parts) + 1)
            # Take the thumbnail from the iPhone video markup on this page
            # rather than from the json request, which would cost one
            # extra HTML request per part.
            try:
                media = holder.parent.find(text=re.compile(u'img src='))
                found = re.search(u'img src="(.*?)"', media)
                thumb = URLASI + found.group(1)
            except (TypeError, AttributeError):
                thumb = icon
            parts.append({'url': videoId, 'Title': title, 'Thumb': thumb})
        if u'ux sources' in soup.title.string and len(parts) == 2:
            # '@ux sources' is not cut in acts, and its titles are not in
            # a field linked to the video, so derive them from page order.
            if re.search('Voici la version int&eacute;grale', html):
                # Since 20111110 the "version intégrale" comes first.
                labels = (u' - intégrale', u' - aperçu')
            else:
                # Before 20111104 the short video (version montée) was first.
                labels = (u' - montée', u' - intégrale')
            for entry, label in zip(parts, labels):
                entry['Title'] = name + label.encode('utf-8')
        return parts

Пример #27

0

Показать файл

Файл: ted_talks_scraper.py Проект: drrlramsey/xbmc-addons

 def __init__(self, url=None):
     """Fetch ``url`` (``URLNEW`` when omitted) and parse its nav items.

     Stores the fetched page in ``self.html`` and the result of
     ``TedTalks().getNavItems`` in ``self.navItems``.
     """
     target = URLNEW if url is None else url
     self.html = getHTML(target)
     self.navItems = TedTalks().getNavItems(self.html)

Пример #28

0

Показать файл

Файл: asi_scraper.py Проект: beenje/plugin.video.arretsurimages

 def __init__(self, url):
     """Fetch ``url`` into ``self.html`` and parse its navigation items."""
     page = getHTML(url)
     self.html = page
     # Navigation items are extracted from the page that was just fetched.
     self.navItems = ArretSurImages().getNavItems(page)

Пример #29

0

Показать файл

Файл: ted_talks_scraper.py Проект: drrlramsey/xbmc-addons

 def __init__(self, url=None):
     """Fetch ``url`` (``URLTHEMES`` when omitted) into ``self.html``."""
     # Identity test: None is a singleton, so `is` is the correct check.
     if url is None:
         url = URLTHEMES
     self.html = getHTML(url)

Пример #30

0

Показать файл

Файл: asi_scraper.py Проект: mossroy/plugin.video.arretsurimages

 def __init__(self, url):
     """Fetch ``url`` into ``self.html`` and parse its navigation items."""
     page = getHTML(url)
     self.html = page
     # Navigation items are extracted from the page that was just fetched.
     self.navItems = ArretSurImages().getNavItems(page)

Пример #31

0

Показать файл

Файл: subtitleDownloader.py Проект: cjrules/xbmc-korean

 def genLangAbbr(self):
     """Build ``self.langAbbr``, a mapping of language name to abbreviation.

     Starts from the single entry English -> eng and adds every
     (title, code) pair matched in the page fetched from ``URLLANG``.
     """
     self.langAbbr = {"English": "eng"}
     page = getHTML(URLLANG)
     languageLink = re.compile('<a title="(.*?)" href="/translate/languages/(.*?)">')
     # findall returns (full name, abbreviation) tuples, which
     # dict.update consumes as key/value pairs in order.
     self.langAbbr.update(languageLink.findall(page))

Пример #32

0

Показать файл

 def getTEDSubtitlesByTalkID(self, id):
     """Fetch the subtitles of talk ``id`` in ``self.language``.

     Builds the subtitles URL, prints it, and returns what ``getHTML``
     yields for it.
     """
     pieces = [
         'http://www.ted.com/talks/subtitles/id/',
         str(id),
         '/lang/',
         self.language,
     ]
     subtitleUrl = ''.join(pieces)
     print(subtitleUrl)
     return getHTML(subtitleUrl)

Пример #33

0

Показать файл

Файл: ted_talks_scraper.py Проект: cjrules/xbmc-korean

 def __init__(self, url=None):
     """Fetch ``url`` (``URLTHEMES`` when omitted) into ``self.html``."""
     # Identity test: None is a singleton, so `is` is the correct check.
     if url is None:
         url = URLTHEMES
     self.html = getHTML(url)

Пример #34

0

Показать файл

Файл: subtitleDownloader.py Проект: cjrules/xbmc-korean

 def getTEDSubtitlesByTalkID(self, id):
     """Fetch the subtitles of talk ``id`` in ``self.language``.

     Builds the subtitles URL, prints it, and returns what ``getHTML``
     yields for it.
     """
     pieces = [
         "http://www.ted.com/talks/subtitles/id/",
         str(id),
         "/lang/",
         self.language,
     ]
     subtitleUrl = "".join(pieces)
     print(subtitleUrl)
     return getHTML(subtitleUrl)

Пример #35

0

Показать файл

Файл: ted_talks_scraper.py Проект: whf839/xbmc-addons

 def __init__(self, url=None):
     """Fetch ``url`` (``URLNEW`` when omitted) and parse its nav items.

     Stores the fetched page in ``self.html`` and the result of
     ``TedTalks().getNavItems`` in ``self.navItems``.
     """
     target = URLNEW if url is None else url
     self.html = getHTML(target)
     self.navItems = TedTalks().getNavItems(self.html)

Python getHTML примеры использования