def get_page(sender, instance, **kwargs):
    """Signal handler: download instance.url and populate scraped fields.

    Fills instance.page with the (permissively decoded) page body, then
    scrapes instance.title from <title> and instance.description /
    instance.keywords from the corresponding <meta> tags.  A missing meta
    tag leaves the field as "".
    """
    # BUG FIX: the urlopen handle was never closed, leaking the socket
    # until garbage collection; close it as soon as the body is read.
    data = urllib.urlopen(instance.url)
    try:
        instance.page = unicode(data.read(), errors="ignore")
    finally:
        data.close()
    soup = BeautifulSoup(instance.page)
    instance.title = soup.html.head.title.string
    desc = soup.find("meta", {"name": "description"})
    if desc:
        instance.description = desc["content"]
    else:
        instance.description = ""
    keywords = soup.find("meta", {"name": "keywords"})
    if keywords:
        instance.keywords = keywords["content"]
    else:
        instance.keywords = ""
def Play(self, stream_name, stream_id, subtitle):
    """Resolve an omroep.nl (Uitzending Gemist) episode page to a playable stream.

    stream_name -- display name (unused in this method)
    stream_id   -- URL of the episode page containing the load_player() call
    subtitle    -- when truthy, attach a SAMI subtitle track via GetSubtitle
    Returns a ba.CreatePlay() object, or None when the security key
    cannot be fetched.
    """
    data = ba.FetchUrl(stream_id, 3600)
    soup = BeautifulSoup(data, convertEntities="xml", smartQuotesTo="xml")
    # The player id is embedded in a javascript load_player('...') call.
    streamid = re.compile("load_player\('(.*?)'", re.DOTALL + re.IGNORECASE).search(str(soup)).group(1)
    if streamid == "":
        # "No stream available..." — note: execution still continues below.
        mc.ShowDialogNotification("Geen stream beschikbaar...")
    # Fetch the per-session security token (uncached: ttl 0).
    data = ba.FetchUrl('http://player.omroep.nl/info/security', 0)
    soup = BeautifulSoup(data, convertEntities="xml", smartQuotesTo="xml")
    try:
        key = soup.session.key.contents[0]
    except:
        # "Cannot fetch the security key"
        mc.ShowDialogNotification("Kan de security key niet ophalen")
        return
    # The key is base64; the token after '|' is combined with the stream id
    # and MD5-hashed (uppercased) to build the stream-info URL.
    security = base64.b64decode(key)
    securitystr = str(security).split('|')[1]
    md5code = streamid + '|' + securitystr
    md5code = md5.md5(md5code).hexdigest()
    streamdataurl = 'http://player.omroep.nl/info/stream/aflevering/' + str(streamid) + '/' + str(md5code).upper()
    data = ba.FetchUrl(streamdataurl, 0).decode('utf-8')
    xmlSoup = BeautifulSoup(data)
    # Pick the WVC1 (Windows Media) variant and strip all whitespace
    # the XML pretty-printing may have injected into the URL.
    streamurl = xmlSoup.find(attrs={"compressie_formaat" : "wvc1"})
    url_play = streamurl.streamurl.contents[0].replace(" ","").replace("\n","").replace("\t","")
    play = ba.CreatePlay()
    play.SetPath(url_play)
    if subtitle:
        play.SetSubtitle(self.GetSubtitle(security, streamid))
        play.SetSubtitle_type('sami')
    return play
def Episode(self, stream_name, stream_id, page, totalpage):
    """Return the ITV episode list for the programme identified by stream_id.

    Scrapes the episode table (<tbody> rows) of the mercury.itv.com
    programme page and produces one ba.CreateEpisode() per row.  When the
    service returns no usable payload, shows a notification and returns
    an empty list.
    """
    url = "http://mercury.itv.com/api/html/dotcom/Episode/Programme/" + quote(stream_id)
    data = ba.FetchUrl(url, 3600)
    soup = BeautifulSoup(data)
    if len(data) < 10:
        mc.ShowDialogNotification("No episode found for " + str(stream_name))
        episodelist = list()
        return episodelist
    table = soup.find("tbody")
    episodelist = list()
    for row in table.findAll("tr"):
        # Each row carries three cells: airtime, duration, and details
        # (link, description span, thumbnail).
        cell_time = row.find("td", {"class": "t_time"})
        cell_duration = row.find("td", {"class": "t_duration"})
        cell_details = row.find("td", {"class": "t_details"})
        item = ba.CreateEpisode()
        item.SetName(stream_name)
        item.SetId(self.url_base + cell_details.a["href"])
        item.SetDescription(cell_duration.contents[0] + " - " + cell_details.span.contents[0])
        item.SetThumbnails(cell_details.a.img["src"])
        item.SetDate(cell_time.contents[2])
        item.SetPage(page)
        item.SetTotalpage(totalpage)
        episodelist.append(item)
    return episodelist
def Play(self, stream_name, stream_id, subtitle):
    """Resolve a ZDF Mediathek item to a playable WMV stream.

    stream_id -- ZDF beitrag id, appended to the xmlservice details URL
    subtitle  -- when truthy and a caption URL exists, attach it as a
                 'flashxml' subtitle track
    Returns a ba.CreatePlay() object.
    """
    url = 'http://www.zdf.de/ZDFmediathek/xmlservice/web/beitragsDetails?ak=web&id='+stream_id
    data = ba.FetchUrl(url)
    soup = BeautifulSoup(data, convertEntities="xml", smartQuotesTo="xml")
    # Pick the ASF/MMS (Windows Media) format variant.
    url = soup.find('formitaet',{'basetype':'wmv3_wma9_asf_mms_asx_http'})
    url = url.url.contents[0]
    sub = soup.find('caption')
    try:
        sub = sub.url.contents[0]
    except (AttributeError, IndexError):
        # BUG FIX: was a bare 'except:' — only the expected failures are
        # caught now: no <caption> element (sub is None -> AttributeError)
        # or an empty <url> element (IndexError).
        sub = ''
    play = ba.CreatePlay()
    play.SetPath(url)
    if subtitle:
        if sub:
            play.SetSubtitle(str(sub))
            play.SetSubtitle_type('flashxml')
    return play
def Genre(self, genre, filter, page, totalpage):
    """Build the ITV day schedule listing for one day (genre) and channel filter.

    genre  -- day name, matched as a CSS-class prefix on the schedule <li>
    filter -- a single channel id, or ""/"None" to include every channel
              from self.filter
    Returns a list of ba.CreateEpisode() items sorted by time, newest first.
    """
    url = "http://mercury.itv.com/api/html/dotcom/Schedule/"
    data = ba.FetchUrl(url, 3600)
    soup = BeautifulSoup(data)
    if len(data) < 10:
        # BUG FIX: the original referenced the undefined name 'stream_name'
        # here, so this error path raised NameError instead of notifying.
        mc.ShowDialogNotification("No episode found for " + str(genre))
        episodelist = list()
        return episodelist
    day = soup.find("li", {"class": re.compile("^" + genre)})
    # Build the list of channel ids to scan.
    net = []
    if filter and filter != "None":
        net.append(filter)
    else:
        for id in self.filter:
            net.append(id)
        if "None" in net:
            net.remove("None")
    data = {}
    data_sorted = []
    for i in net:
        netdata = day.find("li", {"class": re.compile("^" + i)})
        for info in netdata.findAll(attrs={"class": re.compile("^whatsOnTime")}):
            if info.a:
                title = info.find("span", {"class": "title"})
                time = info.find("span", {"class": "time"})
                # date:[name,id,filter]
                data[time.contents[0]] = [title.contents[0], self.url_base + info.a["href"], i]
    # Sort by the time string, newest first (sorted() works on py2 and py3,
    # unlike the original keys().sort()).
    for i in sorted(data.keys(), reverse=True):
        data_sorted.append({"name": data[i][0], "id": data[i][1], "filter": data[i][2], "date": i})
    genrelist = list()
    for info_sorted in data_sorted:
        genreitem = ba.CreateEpisode()
        genreitem.SetName(info_sorted["name"])
        genreitem.SetId(info_sorted["id"])
        genreitem.SetDate(info_sorted["date"])
        genreitem.SetFilter(info_sorted["filter"])
        genreitem.SetPage(page)
        genreitem.SetTotalpage(totalpage)
        genrelist.append(genreitem)
    return genrelist
def downloadCue(self):
    """Download the cue sheet for self.showName / self.episode from cuenation.com.

    Scrapes the show folder page for a link matching the episode, follows
    it with mechanize (the site checks the Referer header), and saves the
    cue sheet under self.outputDirectory.
    Returns the saved path (also stored in self.pathToCue).
    Raises Exception("No cue found!") when no matching episode link exists.
    """
    cueNation = "http://cuenation.com/"
    if self.showName:
        url = self.cueNationFolder
        # Close the urlopen handles explicitly (they were previously left
        # to the garbage collector).
        page = urllib.urlopen(url)
        try:
            html = BeautifulSoup(page.read())
        finally:
            page.close()
        # BUG FIX: taking .parent of a failed find() raised AttributeError,
        # making the "No cue found!" branch unreachable; guard for None.
        link = html.find(
            "a",
            text=re.compile(self.showName + " (?:Podcast\s)?(?:Episode\s)?" + self.episode, re.I)
        )
        episodeLinkHtml = link.parent if link else None
        if episodeLinkHtml:
            # if every show is a different artist (like anjunabeats worldwide)
            # try to determine it from the link text
            if not self.artist:
                self.artist = episodeLinkHtml.string.split("-")[0].strip()
            # parse the html to get the cue filename
            episodeLink = episodeLinkHtml["href"]
            page = urllib.urlopen(cueNation + episodeLink)
            try:
                html = BeautifulSoup(page)
            finally:
                page.close()
            cueLink = html.find("a", text="Download Cuesheet!").parent["href"]
            cueFileName = cueLink.split("=")[-1]
            self.pathToCue = os.path.join(self.outputDirectory, cueFileName)
            # use a browser to follow the link because the site checks the referer
            browser = mechanize.Browser()
            browser.open(cueNation + episodeLink)
            req = browser.click_link(text="Download Cuesheet!")
            browser.open(req)
            # BUG FIX: the output file was never closed, so the cue data was
            # not guaranteed to be flushed before callers read self.pathToCue.
            cue = open(self.pathToCue, "w")
            try:
                cue.write(browser.response().read())
            finally:
                cue.close()
            return self.pathToCue
        else:
            raise Exception("No cue found!")
def Genre(self, genre, filter, page, totalpage):
    """List last-7-days broadcasts for a genre on Uitzending Gemist.

    genre     -- genre path segment for the /7dagen/ overview
    filter    -- optional broadcaster filter appended to the URL (",<filter>")
    page      -- 1-based results page
    totalpage -- "" to auto-detect the page count from the pagination links
    Returns a list of ba.CreateEpisode() items (empty on a fetch failure).
    """
    url = self.url_base + '/7dagen/' + genre
    if filter != "":
        url = url + ',' + str(filter)
    url = url + '?weergave=detail&page=' + str(page)
    data = ba.FetchUrl(url, 3600)
    if data == "":
        mc.ShowDialogNotification("No genre found for " + str(genre))
        genrelist = list()
        return genrelist
    soup = BeautifulSoup(data, convertEntities="xml", smartQuotesTo="xml")
    if totalpage == "":
        try:
            # The second-to-last pagination anchor holds the last page number.
            pagediv = soup.findAll( 'div', {'class' : 'pagination'})[0]
            apage = pagediv.findAll("a")
            totalpage = int(apage[len(apage)-2].contents[0])
        except:
            # No pagination block -> single page of results.
            totalpage = 1
    div_show = soup.find( 'table', {'class' : 'broadcasts detail'})
    genrelist = list()
    for info in div_show.findAll("tr"):
        # Map the broadcaster logo's alt text to a short channel code.
        omroep = info.findAll(attrs={"class" : "broadcaster-logo"})[0]['alt']
        if omroep == "Nederland 1":
            omroep = "nl1"
        elif omroep == "Nederland 2":
            omroep = "nl2"
        elif omroep == "Nederland 3":
            omroep = "nl3"
        try:
            thumb = info.findAll(attrs={"class" : "thumbnail"})[0]['src']
        except:
            # Rows without a real thumbnail carry a placeholder image instead.
            thumb = info.findAll(attrs={"class" : "thumbnail placeholder"})[0]['src']
        path = self.url_base + info.find(attrs={"class" : "thumbnail_wrapper"})['href']
        # Strip the whitespace the HTML pretty-printing injects into the time.
        date = info.find(attrs={"class" : "time"}).time.contents[0].replace(' ','').replace('\n','').replace('\t','')
        title = info.findAll(attrs={"class" : "series"})[0].contents[0]
        desc = info.find('div', {'class' : 'description'}).p.contents[0]
        genreitem = ba.CreateEpisode()
        genreitem.SetName(title)
        genreitem.SetId(path)
        genreitem.SetDescription(desc)
        genreitem.SetThumbnails(thumb)
        genreitem.SetDate(date)
        genreitem.SetFilter(str(omroep).upper())
        genreitem.SetPage(page)
        genreitem.SetTotalpage(totalpage)
        genrelist.append(genreitem)
    return genrelist
def Episode(self, stream_name, stream_id, page, totalpage):
    """List episodes of an ARD Mediathek programme via the ajax-cache view.

    stream_id -- ARD documentId of the programme
    page      -- 1-based page number passed as the 'goto' URL segment
    totalpage -- "" to auto-detect the page count from the paging widget
    Returns a list of ba.CreateEpisode() items (empty when nothing found).
    """
    url = self.url_base + '/ard/servlet/ajax-cache/3516962/view=list/documentId='+stream_id+'/goto='+str(page)+'/index.html'
    data = ba.FetchUrl(url, 3600)
    soup = BeautifulSoup(data, convertEntities="xml", smartQuotesTo="xml")
    # BUG FIX: was 'if data < 20' — a str-vs-int comparison that is always
    # False on Python 2, so the empty-result path never triggered.
    if len(data) < 20:
        mc.ShowDialogNotification("No episode found for " + str(stream_name))
        episodelist = list()
        return episodelist
    if totalpage == "":
        try:
            pages = soup.find( 'li', {'class' : 'mt-paging ajax-paging-li'})
            pages = pages.findAll('span')[2]
            # The last two characters of the span text hold the page count.
            pages = pages.contents[0][-2:].replace(' ','')
            totalpage = int(pages)
        except (AttributeError, IndexError, ValueError):
            # No paging widget (or unexpected text) -> single page.
            # (Was a bare except; also dropped a leftover debug print.)
            totalpage = 1
    episodelist = list()
    for info in soup.findAll( 'div', {'class' : 'mt-media_item'}):
        # Only rows carrying the video icon are playable episodes.
        if info.findAll( 'span', {'class' : 'mt-icon mt-icon_video'}):
            detail = info.find('a')
            title = stream_name
            airtime = info.find('span', {'class' : 'mt-airtime'})
            thumb = info.find('img')
            episode = ba.CreateEpisode()
            episode.SetName(stream_name)
            # The documentId is the value after '=' in the detail link.
            episode.SetId(detail['href'].split('=')[1])
            episode.SetDescription(detail.contents[0])
            episode.SetThumbnails(self.url_base + thumb['data-src'])
            episode.SetDate(airtime.contents[0])
            episode.SetPage(page)
            episode.SetTotalpage(totalpage)
            episodelist.append(episode)
    return episodelist
def Search(self, search):
    """Search the site's programme index and return matching streams.

    search -- user query, urlencoded into the POST parameters
    Returns a list of ba.CreateStream() items; empty when the result page
    contains no <ul> result list.
    """
    url = self.url_base + '/programmas/search'
    params = 'query=' + quote_plus(search)
    data = ba.FetchUrl(url, 0, True, params)
    soup = BeautifulSoup(data, convertEntities="xml", smartQuotesTo="xml")
    div_page = soup.find("ul")
    streamlist = list()
    # BUG FIX: was a try/bare-except probe calling findAll once and
    # discarding the result; an explicit None check is equivalent and does
    # not swallow unrelated errors.
    if div_page is None:
        return streamlist
    for info in div_page.findAll('a'):
        stream = ba.CreateStream()
        stream.SetName(info.contents[0])
        # The programme id is the third path segment of the link.
        stream.SetId(info['href'].split('/')[2])
        streamlist.append(stream)
    return streamlist
def crawl(self, albumArtist, album, releaseDate):
    """Search Beatport for albumArtist/album and scrape release metadata.

    Matches search results against releaseDate; on a hit, opens the release
    page and extracts label/catalog/date/artwork details.
    Returns a 5-tuple (album, label, catalogNumber, releaseDate, albumArtUrl),
    each text field passed through self.decodeHtml, or None when no release
    matches (implicit return preserved from the original).
    """
    releaseFound = False
    # BUG FIX: urlopen handles were never closed; read then close explicitly.
    searchUrl = "{0}/search?query={1}&facets[]=fieldType:release".format(
        self.domain, urllib.quote(albumArtist + " " + album))
    handle = urllib.urlopen(searchUrl)
    try:
        searchPage = handle.read()
    finally:
        handle.close()
    searchHtml = BeautifulSoup(searchPage)
    # Each result tile is an <li name="tiles-list_release_<n>">.
    releases = searchHtml.findAll('li', { 'name' : re.compile('tiles-list_release_[0-9]+') })
    for release in releases:
        thisTitle = release.find('a', { 'name' : 'unit_title' })
        thisAlbum = thisTitle.string
        thisUrl = thisTitle['href']
        # The date sits after a " | " separator in the minor-info span.
        thisDate = release.find('span', { 'class' : 'itemRenderer-minor' }).contents[1].replace(" | ", "").strip()
        if releaseDate and releaseDate == thisDate:
            releaseUrl = thisUrl
            beatportAlbum = thisAlbum
            releaseFound = True
            break
    if releaseFound:
        # Open the release page (closed explicitly, as above).
        handle = urllib.urlopen("{0}{1}".format(self.domain, releaseUrl))
        try:
            releasePage = handle.read()
        finally:
            handle.close()
        releaseHtml = BeautifulSoup(releasePage)
        # The metadata table alternates label cells and value cells:
        # [0] release date, [1] label (anchor), [2] catalog number.
        releaseInfoLabels = releaseHtml.findAll('td', { 'class' : 'meta-data-label' })
        beatportReleaseDate = releaseInfoLabels[0].nextSibling.string
        beatportLabel = releaseInfoLabels[1].nextSibling.a.string
        beatportCatalogNumber = releaseInfoLabels[2].nextSibling.string
        beatportAlbumArtUrl = releaseHtml.find('img', { 'class' : 'tile-image' })['src']
        return (self.decodeHtml(beatportAlbum),
                self.decodeHtml(beatportLabel),
                self.decodeHtml(beatportCatalogNumber),
                self.decodeHtml(beatportReleaseDate),
                beatportAlbumArtUrl)
def Play(self, stream_name, stream_id, subtitle):
    """Resolve a BBC iPlayer episode page to an RTMP stream.

    stream_id -- iPlayer episode URL; the version pid is scraped from the
                 page and resolved through the mediaselector service
    subtitle  -- currently unused (subtitle handling is commented out)
    Returns a ba.CreatePlay() object configured with RTMP path/domain/auth.
    """
    id = re.compile('episode\/(.*?)\/', re.DOTALL + re.IGNORECASE).search(str(stream_id)).group(1)
    url = self.url_base + '/iplayer/episode/' + id + '/'
    data = ba.FetchUrl(stream_id)
    # The version pid is embedded in a javascript ep.setVersionPid("...") call.
    pid = re.compile('ep.setVersionPid\("(.*?)"\)', re.DOTALL + re.IGNORECASE).search(str(data)).group(1)
    surl = 'http://www.bbc.co.uk/mediaselector/4/mtis/stream/' + pid
    bitrate = []
    data = ba.FetchUrl(surl)
    soup = BeautifulSoup(data, convertEntities="xml", smartQuotesTo="xml")
    # Collect all advertised bitrates and pick the highest-quality <media>.
    for info in soup.findAll('media', {'bitrate':True}):
        bitrate.append(int(info['bitrate']))
    bitrate.sort()
    max = str(bitrate[-1])
    media = soup.find('media', {'bitrate':max})
    print media
    # Prefer the akamai CDN, fall back to limelight.
    connection = media.find('connection', {'supplier':'akamai'})
    if not connection:
        connection = media.find('connection', {'supplier':'limelight'})
    identifier = connection['identifier']
    server = connection['server']
    supplier = connection['supplier']
    # The attribute name's casing varies between responses.
    try:
        auth = connection['authString']
    except:
        auth = connection['authstring']
    try:
        application = connection['application']
    except:
        application = 'live'
    #if subtitle:
    #    sub_url = soup.find('media', {'kind':'captions'})
    #    sub_url = sub_url.connection['href']
    timeout = 600
    swfplayer = 'http://www.bbc.co.uk/emp/10player.swf'
    #params = dict(protocol = "rtmp", port = "1935", server = server, auth = auth, ident = identifier, app = application)
    #if supplier == "akamai":
    #    url = "%(protocol)s://%(server)s:%(port)s/%(app)s?%(auth)s playpath=%(ident)s" % params
    #if supplier == "akamai":
    # note that librtmp has a small issue with constructing the tcurl here.
    # we construct it ourselves for now (fixed in later librtmp)
    #    url = "%(protocol)s://%(server)s:%(port)s/ app=%(app)s?%(auth)s tcurl=%(protocol)s://%(server)s:%(port)s/%(app)s?%(auth)s playpath=%(ident)s" % params
    #    url += " swfurl=%s swfvfy=true timeout=%s" % (swfplayer, timeout)
    play = ba.CreatePlay()
    play.SetRTMPPath(identifier)
    # The two CDNs want slightly different domain URLs; auth is the same shape.
    if supplier == "akamai":
        play.SetRTMPDomain('rtmp://'+server+'/'+application)
        play.SetRTMPAuth('rtmp://'+server+'/'+application +'?'+ auth)
    elif supplier == "limelight":
        play.SetRTMPDomain('rtmp://'+server)
        play.SetRTMPAuth('rtmp://'+server+'/'+application +'?'+ auth)
    play.SetRTMPSwf(swfplayer)
    #play.SetPath(url)
    #url = 'http://www.bartsidee.nl/flowplayer2/index.html?net=' + str(domain) + '&id=mp4:' + str(id)
    #play = ba.CreatePlay()
    #play.SetPath(quote_plus(url))
    #play.SetDomain('bartsidee.nl')
    #play.SetJSactions(quote_plus('http://bartsidee.nl/boxee/apps/flow.js'))
    #if subtitle:
    #    play = ba.CreatePlay()
    #    play.SetPath(quote_plus(url))
    #    play.SetDomain('bbc.co.uk')
    #    play.SetJSactions(quote_plus('http://bartsidee.nl/boxee/apps/js/bbc1.js'))
    #else:
    #    play = ba.CreatePlay()
    #    play.SetPath(quote_plus(url))
    #    play.SetDomain('bbc.co.uk')
    #    play.SetJSactions(quote_plus('http://bartsidee.nl/boxee/apps/js/bbc0.js'))
    return play