Example #1
File: imdbtv.py Project: meastp/veefire
 def downloadShowList ( self, Shows ) :
     '''
     Download episode data for a list of Show objects.

     :param Shows: Shows to fetch data for
     :type Shows: list of api.dbapi.Show objects
     :returns: dictionary { Show : { season : list of episode info dicts } }
     :rtype: dict
     '''
     conn = httplib.HTTPConnection("www.imdb.com")
     #totalbytes = 0
     gzippedfiles = [ ]
     Showdict = { }
     
     for Show in Shows :
         headers = {'User-Agent' : 'veefire/1.0', 'Accept-encoding' : 'gzip' }
         params = ''
         conn.request("GET", "/title/" + Show.url + "/episodes", params , headers)
         
         r1 = conn.getresponse()
         data1 = r1.read()
         
         seasons_select_form_id = BeautifulSoup.SoupStrainer('select', { "id" : "bySeason" })
         seasons_select_form = BeautifulSoup.BeautifulSoup( gzip.GzipFile(fileobj=StringIO.StringIO(data1)).read(), parseOnlyThese=seasons_select_form_id).findAll('option')
         
         seasonsraw = [seasonnumber["value"] for seasonnumber in seasons_select_form] 
         
         seasons = dict()
         
         for season in seasonsraw :
             conn.request("GET", "/title/" + Show.url + "/episodes?season="+season, params , headers)
             r = conn.getresponse()
             d = r.read()
             episode_filter = BeautifulSoup.SoupStrainer('div', { "class" : "info" })  # renamed to avoid shadowing the builtin filter()
             
             if season not in seasons :
                 seasons[season] = list()
             
             seasons[season].extend([ { "season" : season, "episode" : episode } for episode in BeautifulSoup.BeautifulSoup( gzip.GzipFile(fileobj=StringIO.StringIO(d)).read(), parseOnlyThese=episode_filter) ])
             
             
             
         #print seasons
         
         #print r1.status, r1.reason, ' [ gzipped: ' + str(len(data1)) + ' bytes ]'
         #totalbytes += len(data1)
         #gzippedfiles.append(data1)
         
         Showdict[Show] = seasons
     
     conn.close()
     
     return Showdict
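A short usage sketch for the method above. The Show constructor arguments and the downloader object are assumptions for illustration; the real api.dbapi.Show signature and the enclosing class are not shown in this snippet:

# Hypothetical driver; 'downloader' is assumed to own downloadShowList, and
# Show is assumed to expose the .url attribute (an IMDb title id) used above.
shows = [Show(name='Firefly', url='tt0303461')]
show_dict = downloader.downloadShowList(shows)
for show, seasons in show_dict.items():
    for season, episodes in seasons.items():
        print '%s season %s: %d episode blocks' % (show.url, season, len(episodes))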
Example #2
def retrieveTrailerStream(request_obj, response_obj):
    soup = None
    title = request_obj.get_data()['movieTitle']
    if request_obj.get_data().has_key('movieInfo'):
        soup = BeautifulSoup.BeautifulSoup(request_obj.get_data()['movieInfo'])
    elif request_obj.get_data().has_key('moviePageUrl'):
        contentDiv = BeautifulSoup.SoupStrainer('div', {'dir': 'ltr'})
        soup = HttpUtils.HttpClient().getBeautifulSoup(
            url=request_obj.get_data()['moviePageUrl'],
            parseOnlyThese=contentDiv)
    if soup is None:
        return
    videoLink = None
    Logger.logDebug(soup.prettify())
    frameTag = soup.findChild('iframe', recursive=True)
    if frameTag is not None:
        videoLink = frameTag['src']
    else:
        paramTag = soup.findChild('param',
                                  attrs={'name': 'movie'},
                                  recursive=True)
        if paramTag is not None:
            videoLink = paramTag['value']
        else:
            videoLink = soup.findChild('embed', recursive=True)['src']
    request_obj.set_data({'videoLink': videoLink, 'videoTitle': title})
Example #3
    def rafraichir(self):
        # Reset the list of shows
        self.listeEmissions.clear()

        # Fetch the description XML for every show
        for (chaine, urlChaine) in self.listeChaines.items():
            self.listeEmissions[chaine] = {}
            pageHtml = self.getPage(urlChaine)
            soupStrainer = BeautifulSoup.SoupStrainer("a", {"class": "visuel"})
            pageSoup = BeautifulSoup.BeautifulSoup(pageHtml,
                                                   parseOnlyThese=soupStrainer)
            # List of the show pages
            listePagesUrl = map(
                lambda x: "%s%s" % ("http://www.franceinter.fr", x["href"]),
                pageSoup.contents)
            # Fetch all the pages
            listePagesData = self.getPages(listePagesUrl)
            for emission in pageSoup.contents:
                try:
                    nomEmission = emission["title"]
                    urlPageEmission = "%s%s" % ("http://www.franceinter.fr",
                                                emission["href"])
                    # Extract the XML link from the show's page
                    urlXml = re.findall(
                        "http://radiofrance-podcast.net/podcast09/rss_\d+?.xml",
                        listePagesData[urlPageEmission])[0]
                    # Add the show to the list
                    self.listeEmissions[chaine][nomEmission] = urlXml
                except:
                    continue

        # Save the list to the cache
        self.sauvegarderCache(self.listeEmissions)
Example #4
def displayRecentMovies(request_obj, response_obj):
    contentDiv = BeautifulSoup.SoupStrainer('div', {'id':'sub-sidebar'})
    soup = HttpClient().getBeautifulSoup(url='http://www.pinoymovie.co/', parseOnlyThese=contentDiv)
    soup = soup.findChild('div', {'class':'right'})
    movieLinkTags = soup.findChildren('a')
    recentMoviesItems = XBMCInterfaceUtils.callBackDialogProgressBar(
        getattr(sys.modules[__name__], '__retrieveRecentMovies__'), movieLinkTags,
        'Retrieving recent movies and their information',
        'Failed to retrieve video information, please try again later',
        line1='Takes about 5 minutes')
    response_obj.extendItemList(recentMoviesItems)
Example #5
def __retrieve_tv_shows__(tv_channel_url):
    tv_channel = {}
    tv_channel["running_tvshows"] = []
    tv_channel["finished_tvshows"] = []

    logging.getLogger().debug('TV Channel URL: ' + tv_channel_url)
    tv_shows = tv_channel["running_tvshows"]
    if tv_channel_url is None:
        return tv_shows
    tv_channel_url = BASE_WSITE_URL + tv_channel_url
    logging.getLogger().debug(tv_channel_url)
    contentDiv = BeautifulSoup.SoupStrainer('li', {'class': 'categories'})
    soup = HttpClient().get_beautiful_soup(url=tv_channel_url,
                                           parseOnlyThese=contentDiv)
    #     soup = BeautifulSoup.BeautifulSoup(HttpClient().get_html_content(url=tv_channel_url)).findAll('div', {'id':'forumbits', 'class':'forumbits'})[0]
    for title_tag in soup.findAll('li'):
        aTag = title_tag.findNext('a')
        tv_show_url = str(aTag['href'])
        if tv_show_url[0:4] != "http":
            tv_show_url = BASE_WSITE_URL + '/' + tv_show_url
        tv_show_name = aTag.getText()
        if not re.search('Completed Shows', tv_show_name, re.IGNORECASE):
            tv_shows.append({
                "name": http.unescape(tv_show_name),
                "url": tv_show_url,
                "iconimage": ""
            })
        else:
            tv_shows = tv_channel["finished_tvshows"]
    return tv_channel
Example #6
def displayTVShowEpisodes(request_obj, response_obj):
    url = request_obj.get_data()['tvChannelUrl']
    contentDiv = GetContent(url)
    newcontent = ''.join(contentDiv.encode("utf-8").splitlines()).replace('\t','')
    contentDiv = BeautifulSoup.SoupStrainer('div', {'id':'content'})
    soup = HttpClient().getBeautifulSoup(url=url, parseOnlyThese=contentDiv)
    videoBoxes = re.compile("<div id='videobox'>(.+?)</h3><div style='clear: both;'>").findall(newcontent)
    for videoBox in videoBoxes:
        #imgTag = videoBox.findChild('img')
        imageUrl = re.compile('<img [^>]*src=["\']?([^>^"^\']+)["\']?[^>]*>').findall(str(videoBox))[0]
        match = re.compile('createSummaryThumb\("(.+?)","(.+?)","(.+?)",').findall(str(videoBox))
        if len(match) > 0:
            episodeName = match[0][1]
            episodeUrl = str(match[0][2])
            
            item = ListItem()
            item.add_request_data('episodeName', episodeName)
            item.add_request_data('episodeUrl', episodeUrl)
            item.set_next_action_name('Show_Episode_VLinks')
            xbmcListItem = xbmcgui.ListItem(label=episodeName, iconImage=imageUrl, thumbnailImage=imageUrl)
            item.set_xbmc_list_item_obj(xbmcListItem)
            response_obj.addListItem(item)
    pageTag = soup.findChild('div', {'class':'postnav'})
    if pageTag is not None:
        olderPageTag = pageTag.findChild('a', {'class':'blog-pager-older-link'})
    else:
        olderPageTag = None
    if olderPageTag is not None:
        item = ListItem()
        item.add_request_data('tvChannelUrl', str(olderPageTag['href']))
        pageName = AddonUtils.getBoldString('              ->              Next Page')
        item.set_next_action_name('Show_Episodes_Next_Page')
        xbmcListItem = xbmcgui.ListItem(label=pageName)
        item.set_xbmc_list_item_obj(xbmcListItem)
        response_obj.addListItem(item)
Example #7
def find_artist_url(artist_name):
    normalized = urllib.quote(artist_name, '')
    url = 'http://www.musicbrainz.org/ws/2/artist/?query=artist:"%s"&limit=10' % normalized
    print('  Searching MusicBrainz at URL %s' % url)
    resp = requests.get(url, headers=HEADERS)
    tree = ET.fromstring(resp.text.encode('utf-8'))[0]

    artist_node = tree[0]
    # prefer exact matches
    for node in tree:
        if node[0].text == artist_name:
            artist_node = node
            break

    mb_id = artist_node.attrib['id']
    print('  Found MusicBrainz ID %s' % mb_id)

    the_ugly = requests.get('http://musicbrainz.org/artist/%s' % mb_id,
                            headers=HEADERS)
    for link in bs.BeautifulSoup(the_ugly.text,
                                 parseOnlyThese=bs.SoupStrainer('a')):
        if link.has_key('href') and WIKI_LINK.match(link['href']):
            if 'discography' not in link['href']:
                return link['href']
    raise NoWikiForArtistError(
        '  MusicBrainz does not have a wikipedia page for %s' % artist_name)
Example #8
def extract_links(html):
    """
    >>> html = '<a href="go">hey</a><br href="not" /><a href="w"></a>'
    >>> links = [str(link) for link in extract_links(html)] #unicode -> str
    >>> links
    ['go', 'w']
    >>> h = '<a href="javascript:poptastic(\\'event.php?eventID=922\\')"></a>'
    >>> l = [str(link) for link in extract_links(h)] #unicode -> str
    >>> l
    ['event.php?eventID=922']
    >>> h = "<a href='javascript:poptastic(\\"event.php?eventID=922\\")'></a>"
    >>> l = [str(link) for link in extract_links(h)] #unicode -> str
    >>> l #also works for double-quoted javascript
    ['event.php?eventID=922']
    >>> html = '<a name="bla" id="q">hello anchor</a>'
    >>> links = extract_links(html)
    >>> links
    []
    """
    a = BeautifulSoup.SoupStrainer('a')
    links = BeautifulSoup.BeautifulSoup(html, parseOnlyThese=a)
    hrefs = [link['href'] for link in links if link.has_key('href')]

    # also extract javascript popup links
    def extract_js(link):
        if link.lower().startswith('javascript'):
            return get_match(link, r"\([\'\"](.*)[\'\"]\)")
        else:
            return link
    hrefs = [extract_js(href) for href in hrefs]

    return hrefs
Example #9
def displayMovies(request_obj, response_obj):
    url = request_obj.get_data()['movieCategoryUrl']
    print "indisplay" + url
    if request_obj.get_data().has_key('page'):
        url_parts = url.split('?')
        
        url_part_A = ''
        url_part_B = ''
        if len(url_parts) == 2:
            url_part_A = url_parts[0]
            url_part_B = '?' + url_parts[1]
        else:
            url_part_A = url
        if url_part_A[len(url_part_A) - 1] != '/':
            url_part_A = url_part_A + '/'
        url = url_part_A + 'page/' + request_obj.get_data()['page'] + url_part_B

    contentDiv = BeautifulSoup.SoupStrainer('div', {'id':'content'})
    soup = HttpClient().getBeautifulSoup(url=url, parseOnlyThese=contentDiv)

    movieTags = soup.findChildren('div', {'class':'post'})
    print "intags" + str(movieTags)
    if len(movieTags) == 0:
        movieTags = soup.findChildren('div', {'class':'videopost'})
    for movieTag in movieTags:
        item = __retrieveAndCreateMovieItem__(movieTag)
        response_obj.addListItem(item)
    
    response_obj.set_xbmc_content_type('movies')
    try:
        pagesInfoTag = soup.findChild('div', {'class':'navigation'})

        current_page = int(pagesInfoTag.find('span', {'class':'page current'}).getText())
        #print current_page
        pages = pagesInfoTag.findChildren('a', {'class':'page'})
        #print pages
        last_page = int(pages[len(pages) - 1].getText())
    
        if current_page < last_page:
            for page in range(current_page + 1, last_page + 1):
                createItem = False
                if page == last_page:
                    pageName = AddonUtils.getBoldString('              ->              Last Page #' + str(page))
                    createItem = True
                elif page <= current_page + 4:
                    pageName = AddonUtils.getBoldString('              ->              Page #' + str(page))
                    createItem = True
                if createItem:
                    item = ListItem()
                    item.add_request_data('movieCategoryUrl', request_obj.get_data()['movieCategoryUrl'])
                    item.add_request_data('page', str(page))
                
                    
                    item.set_next_action_name('Movies_List_Next_Page')
                    xbmcListItem = xbmcgui.ListItem(label=pageName)
                    item.set_xbmc_list_item_obj(xbmcListItem)
                    response_obj.addListItem(item)
    except: pass
Example #10
 def _parse_pagetitle(self, page, url):
     ''' Get the page title '''
     head_tag = BeautifulSoup.SoupStrainer('head')
     soup = BeautifulSoup.BeautifulSoup(page,
         parseOnlyThese=head_tag, convertEntities=['html', 'xml'])
     if soup.title is None:
         return '%s -- no title found' % url
     title = unicode(soup.title.string).encode('utf-8')
     return '%s -- "%s"' % (url, title)
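A minimal call sketch for the method above, assuming page is the raw HTML already fetched for url and bot is an instance of the (unshown) class that owns _parse_pagetitle; both names are assumptions:

# Hypothetical usage; 'bot' and the fetch step are assumptions.
import urllib2
url = 'http://example.com/'
page = urllib2.urlopen(url).read()
print bot._parse_pagetitle(page, url)
# -> 'http://example.com/ -- "Example Domain"'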
Example #11
def load_tv_show_episodes(req_attrib, modelMap):
    logging.getLogger().debug('load tv show episodes...')
    url = req_attrib['tv-show-url']
    tv_show_url = req_attrib['tv-show-url']
    tv_show_name = req_attrib['tv-show-name']
    channel_type = req_attrib['channel-type']
    channel_name = req_attrib['channel-name']
    currentPage = 1

    if req_attrib.has_key('tv-show-page') and req_attrib['tv-show-page'] != '':
        currentPage = int(req_attrib['tv-show-page'])
        if currentPage != 1:
            url = url + 'page/' + req_attrib['tv-show-page'] + '/'
    logging.getLogger().debug('load tv show episodes...' + url)
    contentDiv = BeautifulSoup.SoupStrainer('div', {'id': 'left-div'})
    soup = HttpClient().get_beautiful_soup(url=url + '?tag=video',
                                           parseOnlyThese=contentDiv)
    #     soup = BeautifulSoup.BeautifulSoup(HttpClient().get_html_content(url=url)).findAll('div', {'id':'contentBody'})[0]

    tv_show_episode_items = []

    threads = soup.findAll('h2', {'class': 'titles'})
    tv_show_episode_items.extend(
        __retrieveTVShowEpisodes__(threads, tv_show_name, channel_type,
                                   channel_name))
    logging.getLogger().debug('In DTB: total tv show episodes: %s' %
                              str(len(tv_show_episode_items)))

    pagesDiv = soup.findChild('p', {'class': 'pagination'})
    if pagesDiv is not None:
        pagesInfoTags = pagesDiv.findAllNext('a')
        for pagesInfoTag in pagesInfoTags:
            logging.getLogger().debug(pagesInfoTag)
            pageInfo = re.compile('page/(.+?)/').findall(pagesInfoTag['href'])

            if len(pageInfo) > 0:
                if re.search('Old', pagesInfoTag.getText(), re.IGNORECASE):
                    item = xbmcgui.ListItem(label='<< Older Entries')
                elif re.search('Next', pagesInfoTag.getText(), re.IGNORECASE):
                    item = xbmcgui.ListItem(label='Next Entries >>')
                else:
                    continue  # neither link type matched; skip to avoid an unbound 'item'
                item.setProperty('tv-show-page', pageInfo[0][0])
                item.setProperty('channel-type', channel_type)
                item.setProperty('channel-name', channel_name)
                item.setProperty('tv-show-name', tv_show_name)
                item.setProperty('tv-show-url', tv_show_url)
                tv_show_episode_items.append(item)
            else:
                item = xbmcgui.ListItem(label='Newest Entries >>')
                item.setProperty('tv-show-page', '1')
                item.setProperty('channel-type', channel_type)
                item.setProperty('channel-name', channel_name)
                item.setProperty('tv-show-name', tv_show_name)
                item.setProperty('tv-show-url', tv_show_url)
                tv_show_episode_items.append(item)

    modelMap['tv_show_episode_items'] = tv_show_episode_items
Example #12
 def get_urls(cls, content):
     # retrieve all link hrefs from html
     links = []
     try:
         link_soup = BeautifulSoup.BeautifulSoup(
             content, parseOnlyThese=BeautifulSoup.SoupStrainer('a'))
     except UnicodeEncodeError:
         return links
     for link in link_soup:
         if link.has_key('href'):
             links.append(link.get('href'))
     return cls.clean_urls(links)
Example #13
File: test.py Project: zuberv/xceltv
def retrieve_tv_shows(link):
    contentDiv = BeautifulSoup.SoupStrainer('div', {'class': 'all-tv-shows'})
    soup = http.HttpClient().get_beautiful_soup(url=link,
                                                parseOnlyThese=contentDiv,
                                                accept_500_error=True)
    ul_tag = soup.find('ul')  # avoid shadowing the builtin list()
    for item in ul_tag.findChildren('li'):
        tv_show = item.findChild('a')
        link = tv_show['href']
        name = tv_show.getText()
        print '>>>>>>>>' + name
        print '>>>>>>>>' + link
Example #14
 def render(self, entry):
     soup = BeautifulSoup.BeautifulSoup(entry.body,
         parseOnlyThese=BeautifulSoup.SoupStrainer("img"))
     imgs = soup.findAll("img")
     thumbnails = []
     for img in imgs:
         if "nomediarss" in img.get("class", "").split():
             continue
         thumbnails.append({
             "url": img["src"],
             "title": img.get("title", img.get("alt", "")),
             "width": img.get("width", ""),
             "height": img.get("height", ""),
         })
     return self.render_string("modules/mediarss.html", entry=entry,
         thumbnails=thumbnails) 
Example #15
def displayAllTVShows(request_obj, response_obj):
    url = request_obj.get_data()['tvChannelUrl']
    contentDiv = BeautifulSoup.SoupStrainer('div', {'class':'rightwidget'})
    soup = HttpClient().getBeautifulSoup(url=url, parseOnlyThese=contentDiv)
    tvshows = soup.findChildren('a')
    for tvshow in tvshows:
        tvshowName = tvshow.getText()
        tvshowUrl = str(tvshow['href'])
        
        item = ListItem()
        item.add_request_data('tvshowName', tvshowName)
        item.add_request_data('tvshowUrl', tvshowUrl)
        item.add_request_data('tvChannelUrl', tvshowUrl)
        item.set_next_action_name('Show_Episodes')
        xbmcListItem = xbmcgui.ListItem(label=tvshowName)
        item.set_xbmc_list_item_obj(xbmcListItem)
        response_obj.addListItem(item)
Example #16
	def rafraichir( self ):
		self.afficher( u"Récupération de la liste des émissions..." )
		# Reset
		self.listeEmissions.clear()
		# Fetch the main page
		page         = self.API.getPage( self.pageEmissions )
		soupStrainer = BeautifulSoup.SoupStrainer( "div", { "class" : "unit size1of5" } )
		pageSoup     = BeautifulSoup.BeautifulSoup( page, parseOnlyThese = soupStrainer )
		# Extract the shows
		for emissionBlock in pageSoup.contents:
			try:
				nomEmission = unicodedata.normalize( 'NFKD', emissionBlock.div.p.a[ "title" ] ).encode( 'ASCII', 'ignore' )
				idEmission  = emissionBlock.div.p.input[ "value" ]
				self.listeEmissions[ nomEmission ] = idEmission
			except:
				continue
		self.sauvegarderCache( self.listeEmissions )
		self.afficher( u"Liste des émissions sauvegardées" )
Example #17
	def listerFichiers( self, emission ):
		if( self.listeEmissions.has_key( emission ) ):
			# Fetch the page that lists the files
			pageFichiers     = self.API.getPage( "http://www.lcp.fr/spip.php?page=lcp_page_videos_ajax&parent=%s" %( self.listeEmissions[ emission ] ) )
			soupStrainer     = BeautifulSoup.SoupStrainer( "div", { "class" : "video-item" } )
			pageFichiersSoup = BeautifulSoup.BeautifulSoup( pageFichiers, parseOnlyThese = soupStrainer )
			# First extract all the links to the pages that contain the files
			listeUrls = map( lambda x : "http://www.lcp.fr/%s" %( x[ "href" ] ), pageFichiersSoup.findAll( "a" ) )
			# Fetch all the pages
			dicoPageFichier = self.API.getPages( listeUrls )
			# Extract the files
			for fichiersBlock in pageFichiersSoup.contents:
				try:
					urlPageFichier = "http://www.lcp.fr/%s" %( fichiersBlock.strong.a[ "href" ] )
					urlImage       = "http://www.lcp.fr/%s" %( fichiersBlock.strong.img[ "src" ] )
					descriptif     = fichiersBlock.p.contents[ 0 ].replace( "\n", "" ).replace( "\t", "" )
					pageFichier    = dicoPageFichier[ urlPageFichier ]
					if( pageFichier == "" ):
						continue
					soup = BeautifulSoup.BeautifulSoup( pageFichier )
					
					#
					# Code k3c
					#
					
					nom = urlPageFichier.split('/')[-1:][0]
					player = soup.find('param', {'name': 'movie'})['value']
					info_video = soup.find('param', attrs={'name' : 'flashvars' })['value']
					host = info_video.split('rtmp://')[1].split('/')[0]
					app = info_video.split('rtmp://')[1].split('/')[1]
					s2 = host+"/"+app+"/"
					playpath = info_video.split(s2)[1].split('/mp4')[0]
					playpath = playpath[:-4]
					cmds = "rtmpdump"+" --resume  --live 0 --host "+host+" --swfVfy "+ player+" --swfAge 0 -v --app "+app+" --playpath "+playpath+" -e -k 1 --flv "+str(nom)+".mp4"
					
					#
					# End of k3c code
					#
							
					# Add the file
					self.ajouterFichier( emission, Fichier( nom = "%s - %s" %( emission, descriptif ), lien = cmds, nomFichierSortie = "%s %s.mp4" %( emission, descriptif ), urlImage = urlImage, descriptif = descriptif ) )
				except:
					continue
Example #18
def displayMoviesMenu(request_obj, response_obj):
    # ALL Movies
    movies_icon_filepath = AddonUtils.getCompleteFilePath(baseDirPath=AddonContext().addonPath, extraDirPath=AddonUtils.ADDON_ART_FOLDER, filename='movies.png')
    item = ListItem()
    item.set_next_action_name('Movies_List')
    item.add_request_data('movieCategoryUrl', 'http://www.pinoymovie.co/video')
    xbmcListItem = xbmcgui.ListItem(label='All Movies', iconImage=movies_icon_filepath, thumbnailImage=movies_icon_filepath)
    item.set_xbmc_list_item_obj(xbmcListItem)
    response_obj.addListItem(item)
    # Recently Added
    item = ListItem()
    item.set_next_action_name('Recent_Movies_List')
    item.add_request_data('movieCategoryUrl', 'http://www.pinoymovie.co/video')
    xbmcListItem = xbmcgui.ListItem(label='Recently Added', iconImage=movies_icon_filepath, thumbnailImage=movies_icon_filepath)
    item.set_xbmc_list_item_obj(xbmcListItem)
    #response_obj.addListItem(item)
    
    contentDiv = BeautifulSoup.SoupStrainer('div', {'id':'sub-sidebar'})
    soup = HttpClient().getBeautifulSoup(url='http://www.pinoymovie.co/video', parseOnlyThese=contentDiv)
    soup = soup.findChild('div', {'class':'right'})
    
    for liItemTag in soup.findChildren('li', {'class':re.compile(r'\bcat-item\b')}):
        aTag = liItemTag.findChild('a')
        categoryUrl = aTag['href']
        categoryName = aTag.getText()
        
        item = ListItem()
        item.set_next_action_name('Movies_List')
        item.add_request_data('movieCategoryUrl', categoryUrl)
        xbmcListItem = xbmcgui.ListItem(label=categoryName, iconImage=movies_icon_filepath, thumbnailImage=movies_icon_filepath)
        item.set_xbmc_list_item_obj(xbmcListItem)
        response_obj.addListItem(item)
    
    # Search Movies
    search_icon_filepath = AddonUtils.getCompleteFilePath(baseDirPath=AddonContext().addonPath, extraDirPath=AddonUtils.ADDON_ART_FOLDER, filename='search.png')
    item = ListItem()
    item.set_next_action_name('Search_Movies_List')
    item.add_request_data('movieCategoryUrl', 'http://www.pinoymovie.co/?s=')
    xbmcListItem = xbmcgui.ListItem(label='Search Movies', iconImage=search_icon_filepath, thumbnailImage=search_icon_filepath)
    item.set_xbmc_list_item_obj(xbmcListItem)
    response_obj.addListItem(item)
Example #19
def get_top_K_pages(phrase, K):
    """
    In which we coax a mighty search engine into giving us what we want.
    TODO:
    References:
      - http://en.wikibooks.org/wiki/Python_Programming/Internet
      - http://docs.python.org/library/urllib2.html
  """
    global W, T_to_be_visited
    # TODO: use urllib.quote instead of str.replace
    search_url = yahoo_url % (phrase.replace(' ', '+'), str(K))
    # Sleep for a few seconds, just in case we are calling the search engine too frequently
    time.sleep(search_lag_time)
    search_results = urllib2.urlopen(urllib2.Request(search_url, None,
                                                     headers))
    clickurls = BeautifulSoup.SoupStrainer('clickurl')
    results_soup = BeautifulSoup.BeautifulStoneSoup(search_results,
                                                    parseOnlyThese=clickurls)
    logging.debug('Search results: ' + results_soup.prettify())
    # order of W is not important at the moment
    W = set([link.string for link in results_soup.findAll('clickurl')])
    T_to_be_visited = list(W.copy())
Example #20
def __retrieve_tv_shows__(tv_channel_url):
    tv_shows = []
    if tv_channel_url is None:
        return tv_shows
    tv_channel_url = BASE_WSITE_URL + tv_channel_url
    contentDiv = BeautifulSoup.SoupStrainer('div', {'class': 'all-tv-shows'})
    soup = HttpClient().get_beautiful_soup(url=tv_channel_url,
                                           parseOnlyThese=contentDiv,
                                           accept_500_error=True)
    list_item = soup.find('ul')
    for item in list_item.findChildren('li'):
        aTag = item.findChild('a')

        tv_show_url = str(aTag['href'])
        if tv_show_url[0:4] != "http":
            tv_show_url = BASE_WSITE_URL + '/' + tv_show_url
        tv_show_name = aTag.getText()
        tv_shows.append({
            "name": http.unescape(tv_show_name),
            "url": tv_show_url,
            "iconimage": ""
        })
    return tv_shows
Example #21
def soupify(url, model):
    """
	Gets html from string url
	Passes html through strainer rules contained in dict model
	Feeds strainer results back into model
	If we got redirected during the search, return redirected url
	Else, return model with strainer results
	"""
    assert type(url) is str and type(model) is dict
    contents = urllib2.urlopen(url)
    # check for redirect
    if url == contents.geturl():
        # if no redirect, run soup through strainers
        for k, v in model.iteritems():
            try:
                if v:
                    strainer = BeautifulSoup.SoupStrainer(v[0], attrs=v[1])
                    model[k] = BeautifulSoup.BeautifulSoup(contents, strainer)
            except KeyError:
                print k, v
        return model
    else:
        return None
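Each truthy value in model is unpacked as v[0] (tag name) and v[1] (attrs dict) and fed to SoupStrainer, so a caller would pass something like the sketch below; the keys and selectors are made up for illustration:

# Hypothetical model dict: every truthy value is a (tag, attrs) pair.
model = {
    'headlines': ('h2', {'class': 'headline'}),
    'links': ('a', {}),
    'unused': None,  # falsy entries are left untouched by soupify
}
result = soupify('http://example.com/', model)
if result is not None:
    print result['links']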
Example #22
File: test.py Project: zuberv/xceltv
    contentDiv = BeautifulSoup.SoupStrainer('div', {'class': 'all-tv-shows'})
    soup = http.HttpClient().get_beautiful_soup(url=link,
                                                parseOnlyThese=contentDiv,
                                                accept_500_error=True)
    ul_tag = soup.find('ul')  # avoid shadowing the builtin list()
    for item in ul_tag.findChildren('li'):
        tv_show = item.findChild('a')
        link = tv_show['href']
        name = tv_show.getText()
        print '>>>>>>>>' + name
        print '>>>>>>>>' + link


if __name__ == '__main__':
    print 'DTF actions...'
    contentDiv = BeautifulSoup.SoupStrainer('div',
                                            {'class': 'tv-channel-list'})
    soup = http.HttpClient().get_beautiful_soup(
        url='http://desitvforum.net/television/',
        parseOnlyThese=contentDiv,
        accept_500_error=True)
    ul_tag = soup.find('ul')  # avoid shadowing the builtin list()
    for item in ul_tag.findChildren('li'):
        channel = item.findChild('a')
        link = channel['href']
        name = channel.getText()
        print name
        print link
        print '------------------------'
        try:
            retrieve_tv_shows(link)
        except:
            pass
Example #23
def retrieveVideoLinks(request_obj, response_obj):

    url = request_obj.get_data()['movieUrl']
    contentDiv = BeautifulSoup.SoupStrainer('div', {'class':'video'})
    soup = HttpClient().getBeautifulSoup(url=url, parseOnlyThese=contentDiv)
    if len(str(soup)) == 0:
        contentDiv = BeautifulSoup.SoupStrainer('div', {'id':'content'})
        soup = HttpClient().getBeautifulSoup(url=url, parseOnlyThese=contentDiv)
    decodedSoup = urllib.unquote(str(soup))

    videoFrameLinks = re.compile('http://www.pinoymovie.c(o|a)/ajaxtabs/(.+?).htm').findall(decodedSoup)
    if len(videoFrameLinks) > 0:
        video_source_id = 1
        for ignoreIt, videoFrameLink in videoFrameLinks: #@UnusedVariable
            try:
                soup = HttpClient().getBeautifulSoup(url='http://www.pinoymovie.co/ajaxtabs/' + videoFrameLink + '.htm')
                video_url = str(soup.find('iframe')['src'])
                video_hosting_info = SnapVideo.findVideoHostingInfo(video_url)
                if video_hosting_info is None:
                    print 'UNKNOWN streaming link found: ' + video_url
                else:
                    video_source_img = video_hosting_info.get_video_hosting_image()
                    video_title = 'Source #' + str(video_source_id) + ' :: ' + video_hosting_info.get_video_hosting_name()
                    
                    item = ListItem()
                    item.add_request_data('videoLink', video_url)
                    item.add_request_data('videoTitle', video_title)
                    item.set_next_action_name('SnapAndPlayVideo')
                    xbmcListItem = xbmcgui.ListItem(label=video_title, iconImage=video_source_img, thumbnailImage=video_source_img)
                    item.set_xbmc_list_item_obj(xbmcListItem)
                    response_obj.addListItem(item)
                    video_source_id = video_source_id + 1
            except:
                print 'UNKNOWN streaming link found'
    else:
        videoLinks = re.compile('flashvars=(.+?)file=(.+?)&').findall(decodedSoup)
        
        moreLinks = re.compile('<iframe(.+?)src="(.+?)"', flags=re.I).findall(decodedSoup)
        if len(moreLinks) > 0:
            videoLinks.extend(moreLinks)
        
        moreLinks = re.compile('<a(.+?)href="(.+?)"', flags=re.I).findall(decodedSoup)
        if len(moreLinks) > 0:
            videoLinks.extend(moreLinks)
        if len(videoLinks) > 0:
            
            video_source_id = 1
            video_source_img = None
            video_part_index = 0
            video_playlist_items = []
            for ignoreIt, videoLink in videoLinks: #@UnusedVariable
                try:
                    if re.search('http://media.pinoymovie.ca/playlist/(.+?).xml', videoLink, re.I):
                        soupXml = HttpClient().getBeautifulSoup(url=videoLink)
                        for media in soupXml.findChildren('track'):
                            video_url = media.findChild('location').getText()
                            video_hosting_info = SnapVideo.findVideoHostingInfo(video_url)
                            if video_hosting_info is None:
                                print 'UNKNOWN streaming link found: ' + video_url
                            else:
                                
                                video_part_index = video_part_index + 1
                                video_link = {}
                                video_link['videoTitle'] = 'Source #' + str(video_source_id) + ' | ' + 'Part #' + str(video_part_index)
                                video_link['videoLink'] = video_url
                                video_link['videoSourceImg'] = video_hosting_info.get_video_hosting_image()
                                
                                video_playlist_items.append(video_link)
                                video_source_img = video_link['videoSourceImg']
                                
                                
                                item = ListItem()
                                item.add_request_data('videoLink', video_link['videoLink'])
                                item.add_request_data('videoTitle', video_link['videoTitle'])
                                item.set_next_action_name('SnapAndPlayVideo')
                                xbmcListItem = xbmcgui.ListItem(label=video_link['videoTitle'], iconImage=video_source_img, thumbnailImage=video_source_img)
                                item.set_xbmc_list_item_obj(xbmcListItem)
                                response_obj.addListItem(item)
                        
                        if len(video_playlist_items) > 0:
                            response_obj.addListItem(__preparePlayListItem__(video_source_id, video_source_img, video_playlist_items))
                            video_source_id = video_source_id + 1
                            video_source_img = None
                            video_part_index = 0
                            video_playlist_items = []
                    
                    else:
                        print "insecond"
                        if re.search('http://media.pinoymovie.ca/playlist/(.+?).htm', videoLink, re.I):
                            html = HttpClient().getHtmlContent(url=videoLink).replace('\'', '"')
                            videoLink = re.compile('<iframe(.+?)src="(.+?)"', flags=re.I).findall(html)[0][1]  # group 2 holds the src URL

                        video_hosting_info = SnapVideo.findVideoHostingInfo(videoLink)

                        if video_hosting_info is None:
                            print 'UNKNOWN streaming link found: ' + videoLink
                            
                        else:
                            item = ListItem()
                            item.add_request_data('videoLink', videoLink)
                            print "source:" + videoLink
                            item.add_request_data('videoTitle', 'Source #' + str(video_source_id))
                            item.set_next_action_name('SnapAndPlayVideo')
                            xbmcListItem = xbmcgui.ListItem(label='Source #' + str(video_source_id), iconImage=video_hosting_info.get_video_hosting_image(), thumbnailImage=video_hosting_info.get_video_hosting_image())
                            item.set_xbmc_list_item_obj(xbmcListItem)
                            response_obj.addListItem(item)
                            
                            video_source_id = video_source_id + 1
                except:
                    print 'UNKNOWN streaming link found'
                    video_source_img = None
                    video_part_index = 0
                    video_playlist_items = []
Example #24
def retrieveVideoLinks(request_obj, response_obj):
    
    video_source_id = 1
    video_source_img = None
    video_part_index = 0
    video_playlist_items = []
    #ignoreAllLinks = False
    
    url = request_obj.get_data()['episodeUrl']
    contentDiv = BeautifulSoup.SoupStrainer('div', {'class':'entry'})
    soup = HttpClient().getBeautifulSoup(url=url, parseOnlyThese=contentDiv)
    soup = soup.findChild('div')
    for child in soup.findChildren():
        if child.name in ('img', 'param', 'object', 'b', 'font', 'br'):
            pass
        elif child.name == 'span' and re.search('ALTERNATIVE VIDEO', child.getText(), re.IGNORECASE):
            if len(video_playlist_items) > 0:
                response_obj.addListItem(__preparePlayListItem__(video_source_id, video_source_img, video_playlist_items))
                
            video_source_id = video_source_id + 1
            video_source_img = None
            video_part_index = 0
            video_playlist_items = []
            #ignoreAllLinks = False
        elif child.name == 'embed' or child.name == 'iframe':
            
            if re.search('http://gdata.youtube.com/feeds/api/playlists/', str(child)) or re.search('http://www.youtubereloaded.com/playlists/', str(child)):
                playlistId = re.compile('/playlists/(.+?)(\&|\.xml)').findall(str(child))[0][0]
                
                videoUrls = YouTube.retrievePlaylistVideoItems(playlistId)
                for videoUrl in videoUrls:
                    try:
                        video_part_index = video_part_index + 1
                        video_link = {}
                        video_link['videoTitle'] = 'Source #' + str(video_source_id) + ' | ' + 'Part #' + str(video_part_index)
                        video_link['videoLink'] = videoUrl
                        print "myvidlink"+videoUrl
                        video_hosting_info = SnapVideo.findVideoHostingInfo(video_link['videoLink'])
                        video_link['videoSourceImg'] = video_hosting_info.get_video_hosting_image()
                        
                        video_playlist_items.append(video_link)
                        video_source_img = video_link['videoSourceImg']
                        
                        item = ListItem()
                        item.add_request_data('videoLink', video_link['videoLink'])
                        item.add_request_data('videoTitle', video_link['videoTitle'])
                        item.set_next_action_name('SnapAndPlayVideo')
                        xbmcListItem = xbmcgui.ListItem(label='Source #' + str(video_source_id) + ' | ' + 'Part #' + str(video_part_index) , iconImage=video_source_img, thumbnailImage=video_source_img)
                        item.set_xbmc_list_item_obj(xbmcListItem)
                        response_obj.addListItem(item)
                    except:
                        print 'Unable to recognize a source = ' + video_link['videoLink']
                        video_source_img = None
                        video_part_index = 0
                        video_playlist_items = []
                        #ignoreAllLinks = True
                    
            else:

                videoUrl = str(child['src'])
                
                try:
                    video_part_index = video_part_index + 1
                    video_link = {}
                    video_link['videoTitle'] = 'Source #' + str(video_source_id) + ' | ' + 'Part #' + str(video_part_index)
                    video_link['videoLink'] = videoUrl
                    print "myvidlink"+videoUrl
                    video_hosting_info = SnapVideo.findVideoHostingInfo(video_link['videoLink'])
                    video_link['videoSourceImg'] = video_hosting_info.get_video_hosting_image()
                    
                    video_playlist_items.append(video_link)
                    video_source_img = video_link['videoSourceImg']
                    
                    item = ListItem()
                    item.add_request_data('videoLink', video_link['videoLink'])
                    item.add_request_data('videoTitle', video_link['videoTitle'])
                    item.set_next_action_name('SnapAndPlayVideo')
                    xbmcListItem = xbmcgui.ListItem(label='Source #' + str(video_source_id) + ' | ' + 'Part #' + str(video_part_index) , iconImage=video_source_img, thumbnailImage=video_source_img)
                    item.set_xbmc_list_item_obj(xbmcListItem)
                    response_obj.addListItem(item)
                except:
                    print 'Unable to recognize a source = ' + video_link['videoLink']
                    video_source_img = None
                    video_part_index = 0
                    video_playlist_items = []
                    #ignoreAllLinks = True
        else:
            print 'UNKNOWN child name'
            print child
            
    if len(video_playlist_items) > 0:
        response_obj.addListItem(__preparePlayListItem__(video_source_id, video_source_img, video_playlist_items))
Example #25
def _retrieve_video_links_(req_attrib, modelMap):

    modelMap['channel-name'] = req_attrib['channel-name']
    modelMap['tv-show-name'] = req_attrib['tv-show-name']
    modelMap['episode-name'] = req_attrib['episode-name']

    video_source_id = 1
    video_source_img = None
    video_source_name = None
    video_part_index = 0
    video_playlist_items = []
    ignoreAllLinks = False

    list_items = []
    contentDiv = BeautifulSoup.SoupStrainer('div', {'id': 'left-div'})
    soup = HttpClient().get_beautiful_soup(url=req_attrib['episode-url'],
                                           parseOnlyThese=contentDiv)
    #     soup = BeautifulSoup.BeautifulSoup(HttpClient().get_html_content(url=req_attrib['episode-url'])).findAll('blockquote', {'class':re.compile(r'\bpostcontent\b')})[0]

    centerTag = soup.findNext('center')
    logging.getLogger().debug(centerTag)
    prevChild = ''
    prevAFont = None
    isHD = 'false'
    videoSource = ''
    for child in soup.findChildren():
        if child.name == 'span':
            if len(video_playlist_items) > 0:
                list_items.append(
                    __preparePlayListItem__(video_source_id, video_source_img,
                                            video_source_name,
                                            video_playlist_items, modelMap,
                                            isHD))

            logging.getLogger().debug(videoSource)
            videoSource = child.getText()
            if (re.search('720p', videoSource, re.I)):
                isHD = 'true'
            else:
                isHD = 'false'
            if video_source_img is not None:
                video_source_id = video_source_id + 1
                video_source_img = None
                video_source_name = None
                video_part_index = 0
                video_playlist_items = []
            ignoreAllLinks = False
        elif not ignoreAllLinks and child.name == 'a':
            if (str(child['href']) != 'https://www.facebook.com/iamdesirulez'):
                video_part_index = video_part_index + 1
                video_link = {}
                video_link['videoTitle'] = 'Source #' + str(
                    video_source_id) + ' | ' + 'Part #' + str(
                        video_part_index) + ' | ' + child.getText()
                video_link['videoLink'] = str(child['href'])
                video_link['videoSource'] = videoSource
                try:
                    try:
                        __prepareVideoLink__(video_link)
                    except Exception, e:
                        logging.getLogger().error(e)
                        video_hosting_info = SnapVideo().findVideoHostingInfo(
                            video_link['videoLink'])
                        if (video_hosting_info is None or
                                video_hosting_info.get_name() == 'UrlResolver by t0mm0'):
                            raise
                        video_link['videoSourceImg'] = video_hosting_info.get_icon()
                        video_link['videoSourceName'] = video_hosting_info.get_name()
                    video_playlist_items.append(video_link)
                    video_source_img = video_link['videoSourceImg']
                    video_source_name = video_link['videoSourceName']

                    item = xbmcgui.ListItem(label='Source #' +
                                            str(video_source_id) + ' | ' +
                                            'Part #' + str(video_part_index),
                                            iconImage=video_source_img,
                                            thumbnailImage=video_source_img)
                    item.setProperty('videoLink', video_link['videoLink'])
                    item.setProperty('videoTitle', video_link['videoTitle'])
                    item.setProperty('videoSourceName', video_source_name)
                    item.setProperty('isContinuousPlayItem', 'false')
                    list_items.append(item)

                    prevAFont = child.findChild('font')
                except:
                    logging.getLogger().error(
                        'Unable to recognize a source = ' +
                        str(video_link['videoLink']))
                    video_source_img = None
                    video_source_name = None
                    video_part_index = 0
                    video_playlist_items = []
                    ignoreAllLinks = True
                    prevAFont = None
Example #26
    def scrape_episode(self,
                       title,
                       show_year,
                       year,
                       season,
                       episode,
                       imdb,
                       tvdb,
                       debrid=False):
        try:
            query = '%s %s' % (title, episode)
            query = self.search_link % (urllib.quote_plus(query))

            result = client.request(self.base_link + query)

            result = result.decode('iso-8859-1').encode('utf-8')

            result = result.replace('\n', '').replace('\t', '')

            items = client.parseDOM(result, 'content:encoded')[0]

            items = re.compile('class=\"single-heading\">(.+?)<span').findall(
                items)

            for i in range(0, len(items)):
                try:
                    if '720p' in items[i]:
                        quality = 'HD'
                    else:
                        quality = 'SD'

                    urls = client.parseDOM(items[i], "a", ret="href")
                    for j in range(0, len(urls)):

                        result = client.request(urls[j])

                        item = BeautifulSoup.BeautifulSoup(
                            result,
                            parseOnlyThese=BeautifulSoup.SoupStrainer(
                                "iframe"))

                        if len(item) == 0:
                            item = re.compile('data-config="(.+?)"').findall(
                                result)[0]
                            item = [{"src": item}]

                        for links in item:
                            rUrl = links["src"]

                            if rUrl.startswith('//'):
                                rUrl = 'http:%s' % rUrl

                            urls[j] = rUrl
                            host = client.host(urls[0])
                    url = "##".join(urls)

                    self.srcs.append({
                        'source': host,
                        'parts': str(len(urls)),
                        'quality': quality,
                        'scraper': self.name,
                        'url': url,
                        'direct': False
                    })
                    urls = []
                except:
                    pass
            return self.srcs
        except:
            return self.srcs
Example #27
import BeautifulSoup
from scraperwiki import scrape, sqlite
import datetime, re, json

html = scrape('http://tinyurl.com/tulevat-tanssit')
restrict = BeautifulSoup.SoupStrainer(["h2", "dl"])

conversion = BeautifulSoup.BeautifulStoneSoup.ALL_ENTITIES
page = BeautifulSoup.BeautifulStoneSoup(html,
                                        parseOnlyThese=restrict,
                                        convertEntities=conversion)

dates = page.findAll("h2")

for d in dates:
    wday, dinfo = d.a.contents[0].split()
    record = {'weekday': wday}
    mdate = re.match("(\d+)\.(\d+)\.(\d\d\d\d)", dinfo)
    date = datetime.date(int(mdate.group(3)), int(mdate.group(2)),
                         int(mdate.group(1)))
    dthandler = lambda obj: obj.isoformat() if isinstance(
        obj, (datetime.date, datetime.datetime)) else None  # handle date objects too
    date = json.dumps(date, default=dthandler)
    record['date'] = date
    data = d.nextSibling
    try:
        record['place'] = data.dt.a.text
    except Exception, e:
        print data
    record['artists'] = data.dd.text.replace('&', ' & ').replace(',', ', ')
    sqlite.save(unique_keys=['place', 'date'], data=record, date=date)
Example #28
def __retrieveRecentMovies__(movieLinkTag):
    movieLink = movieLinkTag['href']
    contentDiv = BeautifulSoup.SoupStrainer('div', {'id':'content'})
    soup = HttpClient().getBeautifulSoup(url=movieLink, parseOnlyThese=contentDiv)
    movieTag = soup.findChild('div', {'class':'post'})
    return __retrieveAndCreateMovieItem__(movieTag)
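Every example above follows the same BeautifulSoup 3 idiom: build a SoupStrainer for just the tags of interest and hand it to the parser via parseOnlyThese, so only the matching part of the document is parsed at all. A minimal self-contained sketch of the pattern:

import BeautifulSoup

html = ('<div id="content"><a href="/keep">keep</a></div>'
        '<div id="sidebar"><a href="/skip">skip</a></div>')
# Only tags matched by the strainer enter the parse tree; the sidebar is skipped.
strainer = BeautifulSoup.SoupStrainer('div', {'id': 'content'})
soup = BeautifulSoup.BeautifulSoup(html, parseOnlyThese=strainer)
print [a['href'] for a in soup.findAll('a')]  # -> [u'/keep']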