Example No. 1
    def addSeasonList( self ):
        tree=MinimalSoup(common.getHTML(common.args.url))  
        seasons=tree.findAll('td', attrs={"class":re.compile('^vex')})
        #flatten seasons by settings
        if common.settings['flat_season'] == 1 or (len(seasons) == 1 and common.settings['flat_season'] == 0):
            common.args.mode='TV_Episodes'
            seasonNums=[]
            for season in seasons:
                common.args.name = season.contents[0]
                seasonNums.append(season.contents[0])
                self.addEpisodeList( )
            #add clips folder
            rss=tree.findAll('a', attrs={'class':'rss-link'})
            clipRSS = None
            for feed in rss:
                if feed['href'].split('/')[-1]=='clips':
                    clipRSS = feed['href']
            if clipRSS != None:
                common.addDirectory(xbmc.getLocalizedString(30095), clipRSS, "TV_Clips")
            xbmcplugin.endOfDirectory( handle=int( sys.argv[ 1 ] ))

        else:
            #add one folder for each season
            for season in seasons:
                name=season.contents[0]
                p=re.compile('"(http://.+?)"')
                url=p.findall(season['onclick'])
                url=url[0].replace('&amp;','&')
                #note: the season folder is added with the page URL; addEpisodeList
                #filters episodes by the season name later
                ok=common.addDirectory(name, common.args.url, "TV_Episodes")
            #add clips folder
            rss=tree.findAll('a', attrs={'class':'rss-link'})
            clipRSS = None
            for feed in rss:
                if feed['href'].split('/')[-1]=='clips': clipRSS = feed['href']
            if clipRSS != None:
                common.addDirectory(xbmc.getLocalizedString(30095), clipRSS, "TV_Clips")
            xbmcplugin.endOfDirectory( handle=int( sys.argv[ 1 ] ))
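
A minimal sketch of the clips-feed lookup both branches above share, assuming BeautifulSoup 3 (whose MinimalSoup variant the example uses) and a stand-in for the HTML that common.getHTML() returns:

from BeautifulSoup import MinimalSoup

SAMPLE = '''
<a class="rss-link" href="http://example.com/feed/episodes">Episodes</a>
<a class="rss-link" href="http://example.com/feed/clips">Clips</a>
'''

def find_clip_feed(tree):
    #return the href of the first rss-link anchor whose path ends in 'clips'
    for feed in tree.findAll('a', attrs={'class': 'rss-link'}):
        if feed['href'].split('/')[-1] == 'clips':
            return feed['href']
    return None

print(find_clip_feed(MinimalSoup(SAMPLE)))  # http://example.com/feed/clips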
Example No. 2
def firstPage():
    html = getHTML(urllib.unquote_plus(BASE_URL))
    # https://bugs.launchpad.net/beautifulsoup/+bug/838022
    BeautifulSoup.NESTABLE_TAGS['td'] = ['tr', 'table']
    soup = BeautifulSoup(html)
    # Favorites
    for links in soup.findAll('a','iconlink'):
        try:
            title = links.contents[0]
        except:
            title = "No title"
        try:
            link = links['href']
        except:
            link = None
        if link and title and "img" not in str(title):
            addPosts(('Most faved ' + str(title)), urllib.quote_plus(link.replace('&amp;','&')))
    # Topics
    for table in soup.findAll('table'):
        for line in table.findAll('tr'):
            try:
                title = line.find('a').contents[0]
            except:
                title = None
            try:
                link = line.find('a')['href']
            except:
                link = None
            if title and link:
                if BASE_URL in link:
                    addPosts(str(title), urllib.quote_plus(link.replace('&amp;','&')))
    # Search
    addPosts('Search..', '&search=True')
    return
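
A hedged sketch of the NESTABLE_TAGS workaround the example leans on (launchpad bug 838022, linked in the comment): in BeautifulSoup 3, NESTABLE_TAGS controls how far the parser pops the tag stack when it meets a new tag, and registering 'td' under 'tr'/'table' is the usual workaround for cells being reparented in malformed tables. The markup below is a made-up illustration:

from BeautifulSoup import BeautifulSoup

BeautifulSoup.NESTABLE_TAGS['td'] = ['tr', 'table']
soup = BeautifulSoup('<table><tr><td>a<td>b</tr></table>')
print(len(soup.findAll('td')))  # 2 -- the unclosed cells stay inside the row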
Example No. 3
def firstPage(url):
    html = getHTML(urllib.unquote_plus(url))
    # https://bugs.launchpad.net/beautifulsoup/+bug/838022
    BeautifulSoup.NESTABLE_TAGS['td'] = ['tr', 'table']
    soup = BeautifulSoup(html)
    thumbs = soup.findAll('div', 'thumb')
    lcount = 0
    # Items
    for links in soup.findAll('h2', 'post-title entry-title'):
        script = thumbs[lcount].find('script')
        try:
            thumbnail_container = script.contents[0]
        except:
            thumbnail = "DefaultFolder.png"
        try:
            tmatch = re.compile(
                'document.write\(bp_thumbnail_resize\(\"(.+?)\",').findall(
                    thumbnail_container)
        except:
            thumbnail = "DefaultFolder.png"
        try:
            thumbnail = tmatch[0]
        except:
            thumbnail = "DefaultFolder.png"
        lcount = lcount + 1
        for line in links.findAll('a'):
            try:
                title = links.find('a').contents[0].strip()
            except:
                title = "No title"
            try:
                link = links.find('a')['href']
            except:
                link = None
            if title and link:
                if BASE_URL in link:
                    addPosts(str(title),
                             urllib.quote_plus(link.replace('&amp;', '&')),
                             thumbnail, 0)
    olderlinks = soup.find('a', 'blog-pager-older-link')
    try:
        title = olderlinks.contents[0]
    except:
        title = "Mga Lumang mga Post"
    try:
        link = olderlinks.attrs[1][1]
    except:
        link = None
    if title and link:
        addPosts(str(title), urllib.quote_plus(link.replace('&amp;', '&')),
                 "DefaultFolder.png", 1)
    return
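
A standalone sketch of the thumbnail extraction above, run against a made-up inline script in the shape the blog template emits:

import re

thumbnail_container = 'document.write(bp_thumbnail_resize("http://example.com/t.jpg", 160))'
tmatch = re.compile(
    'document.write\(bp_thumbnail_resize\(\"(.+?)\",').findall(thumbnail_container)
print(tmatch[0])  # http://example.com/t.jpg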
Example No. 4
def listPage(url):
    html = getHTML(urllib.unquote_plus(url))
    soup = BeautifulSoup(html) 
    for videobox in soup.findAll('div', 'videobox'):
        thumb = videobox.find('img', 'thumbnail')['src']
        try:
            title = videobox.find('a', 'title').contents
            title = title[0].encode("utf-8")
        except:
            title = "No title"
        RE_ID = 'jpg-s/(\d*)_\d.jpg'
        RE_ID_obj = re.compile(RE_ID, re.IGNORECASE)
        url = RE_ID_obj.sub(r"mp4/\g<1>.mp4?start=0", thumb)
        listitem=xbmcgui.ListItem(title, iconImage="DefaultFolder.png", thumbnailImage=thumb)
        listitem.setInfo(type="Video", infoLabels={ "Title": title })
        listitem.setPath(url)
        listitem.setProperty("IsPlayable", "true")
        xbmcplugin.addDirectoryItem(handle=int(sys.argv[1]), url=url, listitem=listitem)
    nav_page = soup.find('div', 'nav_page')
    for next in nav_page.findAll('a'):
        line = next.contents
        line = line[0].encode("utf-8")
        if 'sta' in line:
            url = next['href']
            url = BASE_SITE_URL + url
            addPosts(__language__(30000), urllib.quote_plus(url))
    return
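
A standalone sketch of the thumbnail-to-video rewrite above; the src value is hypothetical but follows the jpg-s/<id>_<n>.jpg shape the regex expects:

import re

thumb = 'http://cdn.example.com/jpg-s/12345_2.jpg'
RE_ID_obj = re.compile('jpg-s/(\d*)_\d.jpg', re.IGNORECASE)
print(RE_ID_obj.sub(r"mp4/\g<1>.mp4?start=0", thumb))
# http://cdn.example.com/mp4/12345.mp4?start=0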
Example No. 5
    def getVideoDetails(self, url):
        """self.videoDetails={Title, Director, Genre, Plot, id, url}"""
        #TODO: get 'related tags' and list them under genre
        html = self.fetcher.getHTML(url)
        url = ""
        soup = BeautifulSoup(html)
        #get title
        title = soup.find('span', attrs={'id':'altHeadline'}).string
        #get speaker from title
        speaker = title.split(':', 1)[0]
        #get description:
        plot = soup.find('p', attrs={'id':'tagline'}).string
        #get url
        #detectors for link to video in order of preference
        linkDetectors = [
            lambda l: re.compile('High-res video \(MP4\)').match(str(l.string)),
            lambda l: re.compile('http://download.ted.com/talks/.+.mp4').match(str(l['href'])),
        ]
        for link in soup.findAll('a', href=True):
            for detector in linkDetectors:
                if detector(link):
                    url = link['href']
                    linkDetectors = linkDetectors[:linkDetectors.index(detector)] # Only look for better matches than what we have
                    break

        if url == "":
          # look for utub link
          utublinks = re.compile('http://(?:www.)?youtube.com/v/([^\&]*)\&').findall(html)
          for link in utublinks:
            url = 'plugin://plugin.video.youtube/?action=play_video&videoid=%s' %(link)
        #get id from url
        id = url.split('/')[-1]
        return {'Title':title, 'Director':speaker, 'Genre':'TED', 'Plot':plot, 'PlotOutline':plot, 'id':id, 'url':url}
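
A hedged sketch of the detector-preference trick above, with plain strings standing in for anchor tags: once a detector fires, the list is truncated so only higher-preference (earlier) detectors can still override the match on later links:

detectors = ['high-res mp4', 'any mp4']  # stand-ins, best match first
links = [('any mp4', 'http://example.com/b.mp4'),
         ('high-res mp4', 'http://example.com/a.mp4')]
url = ''
for kind, href in links:
    for detector in detectors:
        if detector == kind:
            url = href
            detectors = detectors[:detectors.index(detector)]
            break
print(url)  # http://example.com/a.mp4 -- the high-res link wins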
Example No. 6
 def getVideoDetails(self, url):
     """self.videoDetails={Title, Director, Genre, Plot, id, url}"""
     #TODO: get 'related tags' and list them under genre
     html = getHTML(url)
     soup = BeautifulSoup(html)
     #get title
     title = soup.find('span', attrs={'id': 'altHeadline'}).string
     #get speaker from title
     speaker = title.split(':', 1)[0]
     #get description:
     plot = soup.find('p', attrs={'id': 'tagline'}).string
     #get url
     for link in soup.findAll('a'):
         if re.match('Watch.*high-res', str(link.string)):
             url = URLTED + link['href']
     #get id from url
     id = url.split('/')[-1]
     return {
         'Title': title,
         'Director': speaker,
         'Genre': 'TED',
         'Plot': plot,
         'PlotOutline': plot,
         'id': id,
         'url': url
     }
Example No. 7
def get_vidlink_dailymotion(url):
    vidlink = ''
    # URLs may start with just // instead of the usual http: or https:; add 'http' accordingly
    if url.startswith('//'):
        url = "http:" + url
    html = getHTML(url)
    soup = BeautifulSoup(html)
    scripts = soup.findAll('script')
    scode = scripts[8].contents[0]
    matchconfig = re.compile('var config = (\{.+?\})\;').findall(scode)
    json_string = matchconfig[0]
    parsed_json = json.loads(json_string)
    fileurl = parsed_json['metadata']['qualities']['auto'][0]['url']
    lastquality = 0
    for q in parsed_json['metadata']['qualities']:
        if q == 'auto':
            continue
        if int(lastquality) > int(q):
            continue
        else:
            try:
                fileurl = parsed_json['metadata']['qualities'][q][1]['url']
            except:
                try:
                    fileurl = parsed_json['metadata']['qualities'][q][0]['url']
                except:
                    continue
            lastquality = int(q)
    vidlink = fileurl
    return vidlink
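
A minimal sketch of the quality walk above, against a made-up metadata blob shaped like the Dailymotion player config (only the first candidate per quality is tried here):

import json

parsed_json = json.loads('''{"metadata": {"qualities": {
    "auto": [{"url": "http://example.com/auto.m3u8"}],
    "240":  [{"url": "http://example.com/240.mp4"}],
    "720":  [{"url": "http://example.com/720.mp4"}]}}}''')
fileurl = parsed_json['metadata']['qualities']['auto'][0]['url']
lastquality = 0
for q in parsed_json['metadata']['qualities']:
    if q == 'auto' or int(q) < int(lastquality):
        continue
    fileurl = parsed_json['metadata']['qualities'][q][0]['url']
    lastquality = int(q)
print(fileurl)  # http://example.com/720.mp4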
Example No. 8
def listPage(url):
    html = getHTML(urllib.unquote_plus(url))
    soup = BeautifulSoup(html) 
    currentPage = soup.find('li', 'current-page').a['href']
    nextPage = soup.find('li', 'next-page').a['href']
    maxPage = soup.find('li', 'next-page').findPrevious('li').a['href']
    for inItem in soup.findAll('div', 'inItem'):
        try:
            title = inItem.findAll('a')[1].contents[0].replace('&amp;','&')
        except:
            title = "No title"
        link = inItem.find('a')['href']
        re_pinkbike = 'video/(\d+)/'
        id = re.findall(re_pinkbike, link)[0]
        id = int(id)
        partId = int(math.fabs(id/10000))
        url = 'http://lv1.pinkbike.org/vf/' + str(partId) + '/pbvid-' + str(id) + '.mp4'
        thumb = inItem.find('img', 'thimg')['src']
        time = inItem.find('span', 'fblack').contents[0]
        plot = inItem.find('p', 'uFullInfo f10 fgrey3').contents[0].strip()
        listitem=xbmcgui.ListItem(title, iconImage="DefaultFolder.png", thumbnailImage=thumb)
        listitem.setInfo(type="Video", infoLabels={ "Title": title, "Plot" : plot, "Duration" : time })
        listitem.setPath(url)
        listitem.setProperty("IsPlayable", "true")
        xbmcplugin.addDirectoryItem(handle=int(sys.argv[1]), url=url, listitem=listitem)
    if currentPage != maxPage:
        item=xbmcgui.ListItem('Next page...', iconImage="DefaultFolder.png")
        xurl = sys.argv[0] + '?' + "next=true" + "&url=" + urllib.quote_plus(nextPage.replace('&amp;','&'))
        item.setInfo(type="Video", infoLabels={ "Title": ""})
        item.setPath(xurl)
        folder = True
        xbmcplugin.addDirectoryItem(handle=int(sys.argv[1]), url=xurl, listitem=item, isFolder=folder)
    return
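
A standalone sketch of the video-URL derivation above: the numeric id from the page link selects a shard directory (id / 10000, truncated) on the CDN; the link itself is a made-up example:

import re, math

link = 'http://www.pinkbike.com/video/287465/'
id = int(re.findall('video/(\d+)/', link)[0])
partId = int(math.fabs(id / 10000))
print('http://lv1.pinkbike.org/vf/' + str(partId) + '/pbvid-' + str(id) + '.mp4')
# http://lv1.pinkbike.org/vf/28/pbvid-287465.mp4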
Example No. 9
    def addShowsList( self ):
        xbmcplugin.addSortMethod(int(sys.argv[1]), xbmcplugin.SORT_METHOD_LABEL)
        xbmcplugin.addSortMethod(int(sys.argv[1]), xbmcplugin.SORT_METHOD_GENRE)

        html=common.getHTML(common.args.url)
        tree=MinimalSoup(html)
        shows=tree.findAll('a', attrs={"class":"show-thumb info_hover"})
        del html
        del tree
        # with clips
        for show in shows:
            name  = show.contents[0].replace('&quot;','"').replace('&amp;','&')
            url   = show['href']
            tmp   = show['href'].split('/')[3]
            art   = "http://assets.hulu.com/shows/key_art_"+tmp.replace('-','_')+".jpg"
            #thumb = "http://assets.hulu.com/shows/show_thumbnail_"+tmp.replace('-','_')+".jpg"
            #icon  = "http://assets.hulu.com/shows/show_thumbnail_"+tmp.replace('-','_')+".jpg"
            #Use higher res fanart (key_art) instead of lower res thumbs & icons
            thumb = art
            icon = art
            if common.settings['get_show_plot'] == True:
                json = common.getHTML("http://www.hulu.com/shows/info/"+tmp)
                try:
                    #this needs better regex, or maybe some sort of json parser
                    p = re.compile('description: "(.+?)"[,}]')
                    match = p.findall(json)
                    plot = match[0].replace('\\','')
                except:
                    plot=xbmc.getLocalizedString(30090)
                try:
                    p = re.compile('channel: "(.+?)"[,}]')
                    match = p.findall(json)
                    genre = match[0]
                except:
                    genre=xbmc.getLocalizedString(30090)
                #hopefully deleting this will help with xbox memory problems
                del json
            else:
                plot=genre=xbmc.getLocalizedString(30090)
            try:
                if show.parent['class'] != "full-episode-icon":
                    name += ' '+xbmc.getLocalizedString(30091)
                    genre += ' '+xbmc.getLocalizedString(30091)
                elif common.args.url != common.BASE_TV_URL:
                    common.addDirectory(name, url, "TV_Seasons", art, icon, art, plot, genre)
            except:
                name += ' '+xbmc.getLocalizedString(30091)
                genre += ' '+xbmc.getLocalizedString(30091)
                if common.settings['only_full_episodes'] == False:
                    common.addDirectory(name, url, "TV_Seasons", art, icon, art, plot, genre)
        
        #if we're doing both clips & full episodes, we need to run through the function again.
        if common.args.url == common.BASE_TV_URL :
            common.args.url = common.BASE_FULLTV_URL
            self.addShowsList()
        
        xbmcplugin.endOfDirectory( handle=int( sys.argv[ 1 ] ))
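
A minimal sketch of the artwork-URL derivation above; the show href is a hypothetical example in the /<slug> shape the code expects:

href = 'http://www.hulu.com/the-simple-show'
tmp = href.split('/')[3]
art = "http://assets.hulu.com/shows/key_art_" + tmp.replace('-', '_') + ".jpg"
print(art)  # http://assets.hulu.com/shows/key_art_the_simple_show.jpg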
Example No. 10
 def addEpisodeList( self ):
     #initialize variables
     p=re.compile('(\d+)')#gets the season number from "Season ##"
     currentSeason=p.findall(common.args.name)[0]
     epRSS=None
     #parse html tree
     tree=MinimalSoup(common.getHTML(common.args.url))
     rss=tree.findAll('a', attrs={'class':'rss-link'})
     for feed in rss:
         if feed['href'].split('/')[-1]=='episodes':
             tree=BeautifulStoneSoup(common.getHTML(feed['href']))
             items=tree.findAll('item')
             for episode in items:
                 p=re.compile('\(s([0-9]*).+?\|.+?e([0-9]*)\)')
                 match=p.findall(episode.title.contents[0])[0]
                 seasonNum  = match[0]
                 episodeNum = match[1]
                 if seasonNum == currentSeason:
                     #add this episode to list
                     name    = episode.title.contents[0].split('(')[0]
                     if len(seasonNum)<2:seasonNum='0'+seasonNum
                     if len(episodeNum)<2:episodeNum='0'+episodeNum
                     name = 's'+seasonNum+'e'+episodeNum+' '+name
                     url = episode.link.contents[0].split('#')[0]
                     try:
                         thumb = episode.findAll('media:thumbnail')[0]['url']
                     except:
                         thumb = ''
                     try:
                         airdate = episode.pubdate.contents[0]
                     except:
                         airdate = ''
                     try:
                         p=re.compile('<p>(.+?)</p>.+?Added: ')
                         plot =''.join(p.findall(str(episode.findAll('description'))))
                         try:
                             p=re.compile('Duration: (.+?)\n')
                             duration=p.findall(plot)[0].split(':')
                             duration=(int(duration[0])*60)+int(duration[1])
                         except:
                             duration=1
                     except:
                         plot = ''
                     common.addDirectory(name,url,'TV_play', thumb, thumb, common.args.fanart, plot, 'genre')
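
A hedged sketch of the RSS-title parsing above, using a made-up feed title in the "(s# ... | ... e#)" shape the regex targets:

import re

title = 'The Fight (s2 | e6)'
p = re.compile('\(s([0-9]*).+?\|.+?e([0-9]*)\)')
seasonNum, episodeNum = p.findall(title)[0]
if len(seasonNum) < 2: seasonNum = '0' + seasonNum
if len(episodeNum) < 2: episodeNum = '0' + episodeNum
print('s' + seasonNum + 'e' + episodeNum + ' ' + title.split('(')[0])
# s02e06 The Fight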
Example No. 11
 def getPrograms(self):
     """Return all programs in self.html"""
     # Couldn't parse properly the file using "'div', {'class':'bloc-contenu-8'}"
     # BeautifulSoup returns nothing in that class
     # So use 'contenu-descr-8 ' and find previous tag
     soup = BeautifulSoup(cleanHTML(self.html))
     for media in soup.findAll('div', {'class':'contenu-descr-8 '}):
         aTag = media.findPrevious('a')
         # Get link, title and thumb
         mediaLink = URLASI + aTag['href']
         mediaTitle = aTag['title'].encode('utf-8')
         mediaThumb = URLASI + aTag.find('img', attrs = {'src':re.compile('.+?\.(?:png|jpg)')})['src']
         yield {'url':mediaLink, 'Title':mediaTitle, 'Thumb':mediaThumb}
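
A minimal sketch of the findPrevious('a') workaround above, assuming BeautifulSoup 3 and stand-in markup for the listing (note the class string really does carry a trailing space):

from BeautifulSoup import BeautifulSoup

SAMPLE = '''<a href="/prog1" title="Program 1"><img src="/p1.png" /></a>
<div class="contenu-descr-8 ">description</div>'''
soup = BeautifulSoup(SAMPLE)
media = soup.find('div', {'class': 'contenu-descr-8 '})
print(media.findPrevious('a')['title'])  # Program 1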
Example No. 12
    def getVideoDetails(self, url):
        """self.videoDetails={Title, Director, Genre, Plot, id, url}"""
        #TODO: get 'related tags' and list them under genre
        html = self.getHTML(url)
        url = ""
        soup = BeautifulSoup(html)
        #get title
        title = soup.find('span', attrs={'id': 'altHeadline'}).string
        #get speaker from title
        speaker = title.split(':', 1)[0]
        #get description:
        plot = soup.find('p', attrs={'id': 'tagline'}).string
        #get url
        #detectors for link to video in order of preference
        linkDetectors = [
            lambda l: re.compile('High-res video \(MP4\)').match(str(l.string)
                                                                 ),
            lambda l: re.compile('http://download.ted.com/talks/.+.mp4').match(
                str(l['href'])),
        ]
        for link in soup.findAll('a', href=True):
            for detector in linkDetectors:
                if detector(link):
                    url = link['href']
                    linkDetectors = linkDetectors[:linkDetectors.index(
                        detector
                    )]  # Only look for better matches than what we have
                    break

        if url == "":
            # look for utub link
            utublinks = re.compile(
                'http://(?:www.)?youtube.com/v/([^\&]*)\&').findall(html)
            for link in utublinks:
                url = 'plugin://plugin.video.youtube/?action=play_video&videoid=%s' % (
                    link)
        #get id from url
        id = url.split('/')[-1]
        return {
            'Title': title,
            'Director': speaker,
            'Genre': 'TED',
            'Plot': plot,
            'PlotOutline': plot,
            'id': id,
            'url': url
        }
Example No. 13
 def getVideoDetails(self, url):
     """self.videoDetails={Title, Director, Genre, Plot, id, url}"""
     #TODO: get 'related tags' and list them under genre
     html = getHTML(url)
     soup = BeautifulSoup(html)
     #get title
     title = soup.find('span', attrs={'id':'altHeadline'}).string
     #get speaker from title
     speaker = title.split(':', 1)[0]
     #get description:
     plot = soup.find('p', attrs={'id':'tagline'}).string
     #get url
     for link in soup.findAll('a'):
         if re.match('Watch.*high-res' , str(link.string)):
             url = URLTED+link['href']
     #get id from url
     id = url.split('/')[-1]
     return {'Title':title, 'Director':speaker, 'Genre':'TED', 'Plot':plot, 'PlotOutline':plot, 'id':id, 'url':url}
Example No. 14
 def getPrograms(self):
     """Return all programs in self.html"""
     # Couldn't parse properly the file using "'div', {'class':'bloc-contenu-8'}"
     # BeautifulSoup returns nothing in that class
     # So use 'contenu-descr-8 ' and find previous tag
     soup = BeautifulSoup(cleanHTML(self.html))
     for media in soup.findAll('div', {'class': 'contenu-descr-8 '}):
         aTag = media.findPrevious('a')
         # Get link, title and thumb
         mediaLink = URLASI + aTag['href']
         mediaTitle = aTag['title'].encode('utf-8')
         mediaThumb = URLASI + aTag.find(
             'img', attrs={'src': re.compile('.+?\.(?:png|jpg)')})['src']
         yield {
             'url': mediaLink,
             'Title': mediaTitle,
             'Thumb': mediaThumb
         }
Example No. 15
def firstPage(url):
    html = getHTML(urllib.unquote_plus(url))
    # https://bugs.launchpad.net/beautifulsoup/+bug/838022
    BeautifulSoup.NESTABLE_TAGS['td'] = ['tr', 'table']
    soup = BeautifulSoup(html)

    for article in soup.findAll('article', 'latestPost excerpt layout-1'):
        h2 = article.find('h2', 'title front-view-title')
        try:
            title = h2.find('a')['title']
        except:
            title = "No title"
        try:
            link = h2.find('a')['href']
        except:
            link = None
        try:
            div = article.find('div', 'featured-thumbnail')
            try:
                thumbnail = div.find('img')['data-layzr']
            except:
                thumbnail = "DefaultFolder.png"
        except:
            div = None
            thumbnail = "DefaultFolder.png"

        if title and link:
            if BASE_URL in link:
                addPosts(title, link, thumbnail, 0)

    # Mga lumang mga post
    olderlinks = soup.find('a', 'next page-numbers')
    title = "Next Page"
    try:
        link = olderlinks.attrs[1][1]
    except:
        link = None
    if title and link:
        addPosts(str(title), urllib.quote_plus(link.replace('&amp;', '&')),
                 "DefaultFolder.png", 1)
    return
Example No. 16
def getfirstPage_teleserye(url,useragent,referer):
    tlinks = {}
    llinks = {}
    html = getHTML(urllib.unquote_plus(str(url)).replace(' ','%20'),useragent,referer)
    #BeautifulSoup.NESTABLE_TAGS['td'] = ['tr', 'table']
    soup = BeautifulSoup(str(html))
    for article in soup.findAll('div','cat-hadding'):
            try:
                title = article.find('a')['title']
            except:
                title = "No title"
            try:
                link = article.find('a')['href']
            except:
                link = None
            try:
                thumbnail = article.find('img')['data-layzr']
            except:
                thumbnail = "DefaultFolder.png"
            if title and link:
            #    addPosts(title, link, thumbnail, 0)
                match_url = re.compile('http://www.teleserye.su/([^/]+?)/.+?$').findall(link)
                articleid = match_url[0]
                #alinks[title] = link
                #ilinks[title] = articleid
                tlinks[articleid] = title
                llinks[articleid] = link
            
    olderlinks = soup.find('a', 'blog-pager-older-link')
    try:
        title = olderlinks.contents[0]
    except:
        title = "Older Posts"
    try:
        link = olderlinks.attrs[1][1]
    except:
        link = None
    #if title and link:
        #addPosts(str(title), urllib.quote_plus(link.replace('&amp;','&')), "DefaultFolder.png", 1)
    return(tlinks,llinks)
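
A standalone sketch of the article-id extraction above; the link is a made-up example in the site's /<slug>/... URL shape:

import re

link = 'http://www.teleserye.su/some-show-episode/full-video/'
match_url = re.compile('http://www.teleserye.su/([^/]+?)/.+?$').findall(link)
print(match_url[0])  # some-show-episode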
Example No. 17
    def getProgramParts(self, url, name, icon):
        """Return all parts of a program (video id)

        video id allows to get video url with a json request"""
        html = getHTML(url)
        soup = BeautifulSoup(html)
        parts = []
        part = 1
        # Get all movie id
        for param in soup.findAll('param', attrs={'name': 'movie'}):
            try:
                videoId = param.parent["id"]
            except KeyError:
                continue
            title = name + ' - Acte %d' % part
            # Try to get the icon linked to the iPhone video on that page
            # That's faster than getting it from the json request (see getVideoDetails),
            # which would require one extra HTML request for each part
            try:
                media = param.parent.parent.find(text=re.compile(u'img src='))
                match = re.search(u'img src="(.*?)"', media)
                thumb = URLASI + match.group(1)
            except (TypeError, AttributeError):
                thumb = icon
            parts.append({'url': videoId, 'Title': title, 'Thumb': thumb})
            part += 1
        if u'ux sources' in soup.title.string and part == 3:
            # '@ux sources' is not cut in parts but getting the title is not
            # easy as it's not in a field linked to the video
            # Use a hack: since 20111110, "version intégrale" is first
            if re.search('Voici la version int&eacute;grale', html):
                parts[0]['Title'] = name + u' - intégrale'.encode('utf-8')
                parts[1]['Title'] = name + u' - aperçu'.encode('utf-8')
            else:
                # Before 20111104, the short video (version montée) was first
                parts[0]['Title'] = name + u' - montée'.encode('utf-8')
                parts[1]['Title'] = name + u' - intégrale'.encode('utf-8')
        return parts
Example No. 18
    def getProgramParts(self, url, name, icon):
        """Return all parts of a program (video id)

        video id allows to get video url with a json request"""
        html = getHTML(url)
        soup = BeautifulSoup(html)
        parts = []
        part = 1
        # Get all movie id
        for param in soup.findAll('param', attrs = {'name':'movie'}):
            try:
                videoId = param.parent["id"]
            except KeyError:
                continue
            title = name + ' - Acte %d' % part
            # Try to get the icon linked to the iPhone video on that page
            # That's faster than getting it from the json request (see getVideoDetails),
            # which would require one extra HTML request for each part
            try:
                media = param.parent.parent.find(text=re.compile(u'img src='))
                match = re.search(u'img src="(.*?)"', media)
                thumb = URLASI + match.group(1)
            except (TypeError, AttributeError):
                thumb = icon
            parts.append({'url':videoId, 'Title':title, 'Thumb':thumb})
            part += 1
        if u'ux sources' in soup.title.string and part == 3:
            # '@ux sources' is not cut in parts but getting the title is not
            # easy as it's not in a field linked to the video
            # Use a hack: since 20111110, "version intégrale" is first
            if re.search('Voici la version int&eacute;grale', html):
                parts[0]['Title'] = name + u' - intégrale'.encode('utf-8')
                parts[1]['Title'] = name + u' - aperçu'.encode('utf-8')
            else:
                # Before 20111104, the short video (version montée) was first
                parts[0]['Title'] = name + u' - montée'.encode('utf-8')
                parts[1]['Title'] = name + u' - intégrale'.encode('utf-8')
        return parts
Example No. 19
from sys import argv
from os.path import dirname
from BeautifulSoup import BeautifulSoup
import demjson
import zlib

folder = dirname(argv[0])

cache = Cache(debug=False)

pages = []

for index in range(1,11):
        index = cache.get("http://www.escapistmagazine.com/videos/view/zero-punctuation?page=%d"%index, max_age=60*60*2).read()
        index = index.replace("''>","'>")
        index = BeautifulSoup(index)

        for link in index.findAll("a"):
                if not link.has_key("href"):
                        continue
                if link["href"].find("http://www.escapistmagazine.com/videos/view/zero-punctuation/")!=-1:
                        short_href = link["href"]
                        slash = short_href.rfind("/")
                        if short_href[slash:].find("-")!=-1:
                                short_href = short_href[slash+1:slash+short_href[slash:].find("-")]
                        else:
                                short_href = short_href[slash+1:]

                        assert len(short_href)>0, link["href"]

                        if short_href not in pages:
                                pages.append(short_href)
        break
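
A hedged sketch of the short_href slug extraction above, run against two made-up zero-punctuation URLs (one with a -title suffix, one bare):

for href in ['http://www.escapistmagazine.com/videos/view/zero-punctuation/123-Some-Game',
             'http://www.escapistmagazine.com/videos/view/zero-punctuation/456']:
    short_href = href
    slash = short_href.rfind("/")
    if short_href[slash:].find("-") != -1:
        short_href = short_href[slash + 1:slash + short_href[slash:].find("-")]
    else:
        short_href = short_href[slash + 1:]
    print(short_href)  # prints 123, then 456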
Example No. 20
        fp = open(output_filename, 'w')
        fp.write(ur.read())
        fp.close()
        ur.close()

for year in range(1999, currentyear + 1):

    year_index_filename = output_directory + str(year) + ".html"
    if not os.path.exists(year_index_filename):
        raise Exception, "Missing the year index: '%s'" % year_index_filename
    fp = open(year_index_filename)
    html = fp.read()
    fp.close()

    soup = MinimalSoup(html)
    link_tags = soup.findAll('a')

    contents_pages = set()
    daily_pages = set()

    contents_hash = {}

    for t in link_tags:

        if t.has_key('href'):
            m = re.search('(^|/)(bb-[0-9]+/.*)$', t['href'])
            if m:
                page = m.group(2)

                subdir, leaf = page.split("/")
                if options.verbose: print "  == %s / %s ==" % (subdir, leaf)
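
A minimal sketch of the year-index link matching above; the hrefs are made-up examples of the bb-<n>/<leaf> paths the crawler collects:

import re

for href in ['bb-12/contents.htm', '/archive/bb-34/d19990607.htm', 'other.htm']:
    m = re.search('(^|/)(bb-[0-9]+/.*)$', href)
    if m:
        subdir, leaf = m.group(2).split("/")
        print("  == %s / %s ==" % (subdir, leaf))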
Example No. 21
        fp = open(output_filename, 'w')
        fp.write(ur.read())
        fp.close()
        ur.close()

for year in range(1999,currentyear+1):

    year_index_filename = output_directory  + str(year) + ".html"
    if not os.path.exists(year_index_filename):
        raise Exception, "Missing the year index: '%s'" % year_index_filename
    fp = open(year_index_filename)
    html = fp.read()
    fp.close()

    soup = MinimalSoup( html )
    link_tags = soup.findAll( 'a' )

    contents_pages = set()
    daily_pages = set()

    contents_hash = {}

    for t in link_tags:

        if t.has_key('href'):
            m = re.search('(^|/)(bb-[0-9]+/.*)$',t['href'])
            if m:
                page = m.group(2)

                subdir, leaf = page.split("/")
                if options.verbose: print "  == %s / %s ==" % (subdir,leaf)
Example No. 22
def listPage(url):
    html = getHTML(urllib.unquote_plus(url))
    soup = BeautifulSoup(html)
    links = []
    # Items
    thumbnail_meta = soup.find('meta', attrs={'property': 'og:image'})
    try:
        thumbnail = thumbnail_meta['content']
    except:
        thumbnail = "DefaultFolder.png"
    title_tag = soup.find('title')
    try:
        title = title_tag.contents[0]
    except:
        title = "no title"
    iframes = soup.findAll('iframe')
    hcnt = 0
    for iframe in iframes:
        lurl = iframe['src']
        url = get_vidlink(lurl)
        links.append(str(url))
        hcnt = hcnt + 1
    if (len(links) > 1):
        durl = build_url({
            'url': links,
            'mode': 'playAllVideos',
            'foldername': title,
            'thumbnail': thumbnail,
            'title': title
        })
        itemname = 'Play All Parts'
        li = xbmcgui.ListItem(itemname, iconImage=thumbnail)
        li.setInfo(type="Video",
                   infoLabels={
                       "Title": title,
                       "Plot": "All parts of" + title
                   })
        li.setProperty('fanart_image', thumbnail)
        xbmcplugin.addDirectoryItem(handle=int(sys.argv[1]),
                                    url=durl,
                                    listitem=li)
    hcnt = 0
    for iframe in iframes:
        partcnt = hcnt + 1
        ititle = "Part " + str(partcnt)
        url = links[hcnt]
        thumb = thumbnail
        plot = ititle + ' of ' + title
        listitem = xbmcgui.ListItem(ititle,
                                    iconImage=thumb,
                                    thumbnailImage=thumb)
        listitem.setInfo(type="Video",
                         infoLabels={
                             "Title": title,
                             "Plot": plot
                         })
        listitem.setPath(url)
        listitem.setProperty("IsPlayable", "true")
        listitem.setProperty("fanart_image", thumb)
        xbmcplugin.addDirectoryItem(handle=int(sys.argv[1]),
                                    url=url,
                                    listitem=listitem)
        hcnt = hcnt + 1
    return
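
A hedged sketch of the build_url helper the example calls but does not show; it assumes the common urlencode-over-a-plugin-URL pattern, and the plugin base path here is hypothetical:

import urllib

def build_url(query, base='plugin://plugin.video.example/'):
    #doseq=True expands the list of part links into repeated url= parameters
    return base + '?' + urllib.urlencode(query, doseq=True)

print(build_url({'mode': 'playAllVideos',
                 'url': ['http://example.com/a.mp4', 'http://example.com/b.mp4']}))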