def get_page(sender, instance, **kwargs):
    """Signal handler: download instance.url and populate scraped fields.

    Fills instance.page with the (permissively decoded) page body, then
    scrapes instance.title from <title> and instance.description /
    instance.keywords from the corresponding <meta> tags.  A missing meta
    tag leaves the field as "".
    """
    # BUG FIX: the urlopen handle was never closed, leaking the socket
    # until garbage collection; close it as soon as the body is read.
    data = urllib.urlopen(instance.url)
    try:
        instance.page = unicode(data.read(), errors="ignore")
    finally:
        data.close()
    soup = BeautifulSoup(instance.page)
    instance.title = soup.html.head.title.string
    desc = soup.find("meta", {"name": "description"})
    if desc:
        instance.description = desc["content"]
    else:
        instance.description = ""
    keywords = soup.find("meta", {"name": "keywords"})
    if keywords:
        instance.keywords = keywords["content"]
    else:
        instance.keywords = ""
def Play(self, stream_name, stream_id, subtitle):
    """Resolve an omroep.nl (Uitzending Gemist) episode page to a playable stream.

    stream_name -- display name (unused in this method)
    stream_id   -- URL of the episode page containing the load_player() call
    subtitle    -- when truthy, attach a SAMI subtitle track via GetSubtitle
    Returns a ba.CreatePlay() object, or None when the security key
    cannot be fetched.
    """
    data = ba.FetchUrl(stream_id, 3600)
    soup = BeautifulSoup(data, convertEntities="xml", smartQuotesTo="xml")
    # The player id is embedded in a javascript load_player('...') call.
    streamid = re.compile("load_player\('(.*?)'", re.DOTALL + re.IGNORECASE).search(str(soup)).group(1)
    if streamid == "":
        # "No stream available..." — note: execution still continues below.
        mc.ShowDialogNotification("Geen stream beschikbaar...")
    # Fetch the per-session security token (uncached: ttl 0).
    data = ba.FetchUrl('http://player.omroep.nl/info/security', 0)
    soup = BeautifulSoup(data, convertEntities="xml", smartQuotesTo="xml")
    try:
        key = soup.session.key.contents[0]
    except:
        # "Cannot fetch the security key"
        mc.ShowDialogNotification("Kan de security key niet ophalen")
        return
    # The key is base64; the token after '|' is combined with the stream id
    # and MD5-hashed (uppercased) to build the stream-info URL.
    security = base64.b64decode(key)
    securitystr = str(security).split('|')[1]
    md5code = streamid + '|' + securitystr
    md5code = md5.md5(md5code).hexdigest()
    streamdataurl = 'http://player.omroep.nl/info/stream/aflevering/' + str(streamid) + '/' + str(md5code).upper()
    data = ba.FetchUrl(streamdataurl, 0).decode('utf-8')
    xmlSoup = BeautifulSoup(data)
    # Pick the WVC1 (Windows Media) variant and strip all whitespace
    # the XML pretty-printing may have injected into the URL.
    streamurl = xmlSoup.find(attrs={"compressie_formaat" : "wvc1"})
    url_play = streamurl.streamurl.contents[0].replace(" ","").replace("\n","").replace("\t","")
    play = ba.CreatePlay()
    play.SetPath(url_play)
    if subtitle:
        play.SetSubtitle(self.GetSubtitle(security, streamid))
        play.SetSubtitle_type('sami')
    return play
def Episode(self, stream_name, stream_id, page, totalpage):
    """Return the ITV episode list for the programme identified by stream_id.

    Scrapes the episode table (<tbody> rows) of the mercury.itv.com
    programme page and produces one ba.CreateEpisode() per row.  When the
    service returns no usable payload, shows a notification and returns
    an empty list.
    """
    url = "http://mercury.itv.com/api/html/dotcom/Episode/Programme/" + quote(stream_id)
    data = ba.FetchUrl(url, 3600)
    soup = BeautifulSoup(data)
    if len(data) < 10:
        mc.ShowDialogNotification("No episode found for " + str(stream_name))
        episodelist = list()
        return episodelist
    table = soup.find("tbody")
    episodelist = list()
    for row in table.findAll("tr"):
        # Each row carries three cells: airtime, duration, and details
        # (link, description span, thumbnail).
        cell_time = row.find("td", {"class": "t_time"})
        cell_duration = row.find("td", {"class": "t_duration"})
        cell_details = row.find("td", {"class": "t_details"})
        item = ba.CreateEpisode()
        item.SetName(stream_name)
        item.SetId(self.url_base + cell_details.a["href"])
        item.SetDescription(cell_duration.contents[0] + " - " + cell_details.span.contents[0])
        item.SetThumbnails(cell_details.a.img["src"])
        item.SetDate(cell_time.contents[2])
        item.SetPage(page)
        item.SetTotalpage(totalpage)
        episodelist.append(item)
    return episodelist
def Play(self, stream_name, stream_id, subtitle):
    """Resolve a ZDF Mediathek item to a playable WMV stream.

    stream_id -- ZDF beitrag id, appended to the xmlservice details URL
    subtitle  -- when truthy and a caption URL exists, attach it as a
                 'flashxml' subtitle track
    Returns a ba.CreatePlay() object.
    """
    url = 'http://www.zdf.de/ZDFmediathek/xmlservice/web/beitragsDetails?ak=web&id='+stream_id
    data = ba.FetchUrl(url)
    soup = BeautifulSoup(data, convertEntities="xml", smartQuotesTo="xml")
    # Pick the ASF/MMS (Windows Media) format variant.
    url = soup.find('formitaet',{'basetype':'wmv3_wma9_asf_mms_asx_http'})
    url = url.url.contents[0]
    sub = soup.find('caption')
    try:
        sub = sub.url.contents[0]
    except (AttributeError, IndexError):
        # BUG FIX: was a bare 'except:' — only the expected failures are
        # caught now: no <caption> element (sub is None -> AttributeError)
        # or an empty <url> element (IndexError).
        sub = ''
    play = ba.CreatePlay()
    play.SetPath(url)
    if subtitle:
        if sub:
            play.SetSubtitle(str(sub))
            play.SetSubtitle_type('flashxml')
    return play
def Genre(self, genre, filter, page, totalpage):
    """Build the ITV day schedule listing for one day (genre) and channel filter.

    genre  -- day name, matched as a CSS-class prefix on the schedule <li>
    filter -- a single channel id, or ""/"None" to include every channel
              from self.filter
    Returns a list of ba.CreateEpisode() items sorted by time, newest first.
    """
    url = "http://mercury.itv.com/api/html/dotcom/Schedule/"
    data = ba.FetchUrl(url, 3600)
    soup = BeautifulSoup(data)
    if len(data) < 10:
        # BUG FIX: the original referenced the undefined name 'stream_name'
        # here, so this error path raised NameError instead of notifying.
        mc.ShowDialogNotification("No episode found for " + str(genre))
        episodelist = list()
        return episodelist
    day = soup.find("li", {"class": re.compile("^" + genre)})
    # Build the list of channel ids to scan.
    net = []
    if filter and filter != "None":
        net.append(filter)
    else:
        for id in self.filter:
            net.append(id)
        if "None" in net:
            net.remove("None")
    data = {}
    data_sorted = []
    for i in net:
        netdata = day.find("li", {"class": re.compile("^" + i)})
        for info in netdata.findAll(attrs={"class": re.compile("^whatsOnTime")}):
            if info.a:
                title = info.find("span", {"class": "title"})
                time = info.find("span", {"class": "time"})
                # date:[name,id,filter]
                data[time.contents[0]] = [title.contents[0], self.url_base + info.a["href"], i]
    # Sort by the time string, newest first (sorted() works on py2 and py3,
    # unlike the original keys().sort()).
    for i in sorted(data.keys(), reverse=True):
        data_sorted.append({"name": data[i][0], "id": data[i][1], "filter": data[i][2], "date": i})
    genrelist = list()
    for info_sorted in data_sorted:
        genreitem = ba.CreateEpisode()
        genreitem.SetName(info_sorted["name"])
        genreitem.SetId(info_sorted["id"])
        genreitem.SetDate(info_sorted["date"])
        genreitem.SetFilter(info_sorted["filter"])
        genreitem.SetPage(page)
        genreitem.SetTotalpage(totalpage)
        genrelist.append(genreitem)
    return genrelist
def downloadCue(self):
    """Download the cue sheet for self.showName / self.episode from cuenation.com.

    Scrapes the show folder page for a link matching the episode, follows
    it with mechanize (the site checks the Referer header), and saves the
    cue sheet under self.outputDirectory.
    Returns the saved path (also stored in self.pathToCue).
    Raises Exception("No cue found!") when no matching episode link exists.
    """
    cueNation = "http://cuenation.com/"
    if self.showName:
        url = self.cueNationFolder
        # Close the urlopen handles explicitly (they were previously left
        # to the garbage collector).
        page = urllib.urlopen(url)
        try:
            html = BeautifulSoup(page.read())
        finally:
            page.close()
        # BUG FIX: taking .parent of a failed find() raised AttributeError,
        # making the "No cue found!" branch unreachable; guard for None.
        link = html.find(
            "a",
            text=re.compile(self.showName + " (?:Podcast\s)?(?:Episode\s)?" + self.episode, re.I)
        )
        episodeLinkHtml = link.parent if link else None
        if episodeLinkHtml:
            # if every show is a different artist (like anjunabeats worldwide)
            # try to determine it from the link text
            if not self.artist:
                self.artist = episodeLinkHtml.string.split("-")[0].strip()
            # parse the html to get the cue filename
            episodeLink = episodeLinkHtml["href"]
            page = urllib.urlopen(cueNation + episodeLink)
            try:
                html = BeautifulSoup(page)
            finally:
                page.close()
            cueLink = html.find("a", text="Download Cuesheet!").parent["href"]
            cueFileName = cueLink.split("=")[-1]
            self.pathToCue = os.path.join(self.outputDirectory, cueFileName)
            # use a browser to follow the link because the site checks the referer
            browser = mechanize.Browser()
            browser.open(cueNation + episodeLink)
            req = browser.click_link(text="Download Cuesheet!")
            browser.open(req)
            # BUG FIX: the output file was never closed, so the cue data was
            # not guaranteed to be flushed before callers read self.pathToCue.
            cue = open(self.pathToCue, "w")
            try:
                cue.write(browser.response().read())
            finally:
                cue.close()
            return self.pathToCue
        else:
            raise Exception("No cue found!")
def Genre(self, genre, filter, page, totalpage):
    """List last-7-days broadcasts for a genre on Uitzending Gemist.

    genre     -- genre path segment for the /7dagen/ overview
    filter    -- optional broadcaster filter appended to the URL (",<filter>")
    page      -- 1-based results page
    totalpage -- "" to auto-detect the page count from the pagination links
    Returns a list of ba.CreateEpisode() items (empty on a fetch failure).
    """
    url = self.url_base + '/7dagen/' + genre
    if filter != "":
        url = url + ',' + str(filter)
    url = url + '?weergave=detail&page=' + str(page)
    data = ba.FetchUrl(url, 3600)
    if data == "":
        mc.ShowDialogNotification("No genre found for " + str(genre))
        genrelist = list()
        return genrelist
    soup = BeautifulSoup(data, convertEntities="xml", smartQuotesTo="xml")
    if totalpage == "":
        try:
            # The second-to-last pagination anchor holds the last page number.
            pagediv = soup.findAll( 'div', {'class' : 'pagination'})[0]
            apage = pagediv.findAll("a")
            totalpage = int(apage[len(apage)-2].contents[0])
        except:
            # No pagination block -> single page of results.
            totalpage = 1
    div_show = soup.find( 'table', {'class' : 'broadcasts detail'})
    genrelist = list()
    for info in div_show.findAll("tr"):
        # Map the broadcaster logo's alt text to a short channel code.
        omroep = info.findAll(attrs={"class" : "broadcaster-logo"})[0]['alt']
        if omroep == "Nederland 1":
            omroep = "nl1"
        elif omroep == "Nederland 2":
            omroep = "nl2"
        elif omroep == "Nederland 3":
            omroep = "nl3"
        try:
            thumb = info.findAll(attrs={"class" : "thumbnail"})[0]['src']
        except:
            # Rows without a real thumbnail carry a placeholder image instead.
            thumb = info.findAll(attrs={"class" : "thumbnail placeholder"})[0]['src']
        path = self.url_base + info.find(attrs={"class" : "thumbnail_wrapper"})['href']
        # Strip the whitespace the HTML pretty-printing injects into the time.
        date = info.find(attrs={"class" : "time"}).time.contents[0].replace(' ','').replace('\n','').replace('\t','')
        title = info.findAll(attrs={"class" : "series"})[0].contents[0]
        desc = info.find('div', {'class' : 'description'}).p.contents[0]
        genreitem = ba.CreateEpisode()
        genreitem.SetName(title)
        genreitem.SetId(path)
        genreitem.SetDescription(desc)
        genreitem.SetThumbnails(thumb)
        genreitem.SetDate(date)
        genreitem.SetFilter(str(omroep).upper())
        genreitem.SetPage(page)
        genreitem.SetTotalpage(totalpage)
        genrelist.append(genreitem)
    return genrelist
def Episode(self, stream_name, stream_id, page, totalpage):
    """List episodes of an ARD Mediathek programme via the ajax-cache view.

    stream_id -- ARD documentId of the programme
    page      -- 1-based page number passed as the 'goto' URL segment
    totalpage -- "" to auto-detect the page count from the paging widget
    Returns a list of ba.CreateEpisode() items (empty when nothing found).
    """
    url = self.url_base + '/ard/servlet/ajax-cache/3516962/view=list/documentId='+stream_id+'/goto='+str(page)+'/index.html'
    data = ba.FetchUrl(url, 3600)
    soup = BeautifulSoup(data, convertEntities="xml", smartQuotesTo="xml")
    # BUG FIX: was 'if data < 20' — a str-vs-int comparison that is always
    # False on Python 2, so the empty-result path never triggered.
    if len(data) < 20:
        mc.ShowDialogNotification("No episode found for " + str(stream_name))
        episodelist = list()
        return episodelist
    if totalpage == "":
        try:
            pages = soup.find( 'li', {'class' : 'mt-paging ajax-paging-li'})
            pages = pages.findAll('span')[2]
            # The last two characters of the span text hold the page count.
            pages = pages.contents[0][-2:].replace(' ','')
            totalpage = int(pages)
        except (AttributeError, IndexError, ValueError):
            # No paging widget (or unexpected text) -> single page.
            # (Was a bare except; also dropped a leftover debug print.)
            totalpage = 1
    episodelist = list()
    for info in soup.findAll( 'div', {'class' : 'mt-media_item'}):
        # Only rows carrying the video icon are playable episodes.
        if info.findAll( 'span', {'class' : 'mt-icon mt-icon_video'}):
            detail = info.find('a')
            title = stream_name
            airtime = info.find('span', {'class' : 'mt-airtime'})
            thumb = info.find('img')
            episode = ba.CreateEpisode()
            episode.SetName(stream_name)
            # The documentId is the value after '=' in the detail link.
            episode.SetId(detail['href'].split('=')[1])
            episode.SetDescription(detail.contents[0])
            episode.SetThumbnails(self.url_base + thumb['data-src'])
            episode.SetDate(airtime.contents[0])
            episode.SetPage(page)
            episode.SetTotalpage(totalpage)
            episodelist.append(episode)
    return episodelist
def Search(self, search):
    """Search the site's programme index and return matching streams.

    search -- user query, urlencoded into the POST parameters
    Returns a list of ba.CreateStream() items; empty when the result page
    contains no <ul> result list.
    """
    url = self.url_base + '/programmas/search'
    params = 'query=' + quote_plus(search)
    data = ba.FetchUrl(url, 0, True, params)
    soup = BeautifulSoup(data, convertEntities="xml", smartQuotesTo="xml")
    div_page = soup.find("ul")
    streamlist = list()
    # BUG FIX: was a try/bare-except probe calling findAll once and
    # discarding the result; an explicit None check is equivalent and does
    # not swallow unrelated errors.
    if div_page is None:
        return streamlist
    for info in div_page.findAll('a'):
        stream = ba.CreateStream()
        stream.SetName(info.contents[0])
        # The programme id is the third path segment of the link.
        stream.SetId(info['href'].split('/')[2])
        streamlist.append(stream)
    return streamlist
def crawl(self, albumArtist, album, releaseDate):
    """Search Beatport for albumArtist/album and scrape release metadata.

    Matches search results against releaseDate; on a hit, opens the release
    page and extracts label/catalog/date/artwork details.
    Returns a 5-tuple (album, label, catalogNumber, releaseDate, albumArtUrl),
    each text field passed through self.decodeHtml, or None when no release
    matches (implicit return preserved from the original).
    """
    releaseFound = False
    # BUG FIX: urlopen handles were never closed; read then close explicitly.
    searchUrl = "{0}/search?query={1}&facets[]=fieldType:release".format(
        self.domain, urllib.quote(albumArtist + " " + album))
    handle = urllib.urlopen(searchUrl)
    try:
        searchPage = handle.read()
    finally:
        handle.close()
    searchHtml = BeautifulSoup(searchPage)
    # Each result tile is an <li name="tiles-list_release_<n>">.
    releases = searchHtml.findAll('li', { 'name' : re.compile('tiles-list_release_[0-9]+') })
    for release in releases:
        thisTitle = release.find('a', { 'name' : 'unit_title' })
        thisAlbum = thisTitle.string
        thisUrl = thisTitle['href']
        # The date sits after a " | " separator in the minor-info span.
        thisDate = release.find('span', { 'class' : 'itemRenderer-minor' }).contents[1].replace(" | ", "").strip()
        if releaseDate and releaseDate == thisDate:
            releaseUrl = thisUrl
            beatportAlbum = thisAlbum
            releaseFound = True
            break
    if releaseFound:
        # Open the release page (closed explicitly, as above).
        handle = urllib.urlopen("{0}{1}".format(self.domain, releaseUrl))
        try:
            releasePage = handle.read()
        finally:
            handle.close()
        releaseHtml = BeautifulSoup(releasePage)
        # The metadata table alternates label cells and value cells:
        # [0] release date, [1] label (anchor), [2] catalog number.
        releaseInfoLabels = releaseHtml.findAll('td', { 'class' : 'meta-data-label' })
        beatportReleaseDate = releaseInfoLabels[0].nextSibling.string
        beatportLabel = releaseInfoLabels[1].nextSibling.a.string
        beatportCatalogNumber = releaseInfoLabels[2].nextSibling.string
        beatportAlbumArtUrl = releaseHtml.find('img', { 'class' : 'tile-image' })['src']
        return (self.decodeHtml(beatportAlbum),
                self.decodeHtml(beatportLabel),
                self.decodeHtml(beatportCatalogNumber),
                self.decodeHtml(beatportReleaseDate),
                beatportAlbumArtUrl)
def Play(self, stream_name, stream_id, subtitle):
    """Resolve a BBC iPlayer episode page to an RTMP stream.

    stream_id -- iPlayer episode URL; the version pid is scraped from the
                 page and resolved through the mediaselector service
    subtitle  -- currently unused (subtitle handling is commented out)
    Returns a ba.CreatePlay() object configured with RTMP path/domain/auth.
    """
    id = re.compile('episode\/(.*?)\/', re.DOTALL + re.IGNORECASE).search(str(stream_id)).group(1)
    url = self.url_base + '/iplayer/episode/' + id + '/'
    data = ba.FetchUrl(stream_id)
    # The version pid is embedded in a javascript ep.setVersionPid("...") call.
    pid = re.compile('ep.setVersionPid\("(.*?)"\)', re.DOTALL + re.IGNORECASE).search(str(data)).group(1)
    surl = 'http://www.bbc.co.uk/mediaselector/4/mtis/stream/' + pid
    bitrate = []
    data = ba.FetchUrl(surl)
    soup = BeautifulSoup(data, convertEntities="xml", smartQuotesTo="xml")
    # Collect all advertised bitrates and pick the highest-quality <media>.
    for info in soup.findAll('media', {'bitrate':True}):
        bitrate.append(int(info['bitrate']))
    bitrate.sort()
    max = str(bitrate[-1])
    media = soup.find('media', {'bitrate':max})
    print media
    # Prefer the akamai CDN, fall back to limelight.
    connection = media.find('connection', {'supplier':'akamai'})
    if not connection:
        connection = media.find('connection', {'supplier':'limelight'})
    identifier = connection['identifier']
    server = connection['server']
    supplier = connection['supplier']
    # The attribute name's casing varies between responses.
    try:
        auth = connection['authString']
    except:
        auth = connection['authstring']
    try:
        application = connection['application']
    except:
        application = 'live'
    #if subtitle:
    #    sub_url = soup.find('media', {'kind':'captions'})
    #    sub_url = sub_url.connection['href']
    timeout = 600
    swfplayer = 'http://www.bbc.co.uk/emp/10player.swf'
    #params = dict(protocol = "rtmp", port = "1935", server = server, auth = auth, ident = identifier, app = application)
    #if supplier == "akamai":
    #    url = "%(protocol)s://%(server)s:%(port)s/%(app)s?%(auth)s playpath=%(ident)s" % params
    #if supplier == "akamai":
    # note that librtmp has a small issue with constructing the tcurl here.
    # we construct it ourselves for now (fixed in later librtmp)
    #    url = "%(protocol)s://%(server)s:%(port)s/ app=%(app)s?%(auth)s tcurl=%(protocol)s://%(server)s:%(port)s/%(app)s?%(auth)s playpath=%(ident)s" % params
    #    url += " swfurl=%s swfvfy=true timeout=%s" % (swfplayer, timeout)
    play = ba.CreatePlay()
    play.SetRTMPPath(identifier)
    # The two CDNs want slightly different domain URLs; auth is the same shape.
    if supplier == "akamai":
        play.SetRTMPDomain('rtmp://'+server+'/'+application)
        play.SetRTMPAuth('rtmp://'+server+'/'+application +'?'+ auth)
    elif supplier == "limelight":
        play.SetRTMPDomain('rtmp://'+server)
        play.SetRTMPAuth('rtmp://'+server+'/'+application +'?'+ auth)
    play.SetRTMPSwf(swfplayer)
    #play.SetPath(url)
    #url = 'http://www.bartsidee.nl/flowplayer2/index.html?net=' + str(domain) + '&id=mp4:' + str(id)
    #play = ba.CreatePlay()
    #play.SetPath(quote_plus(url))
    #play.SetDomain('bartsidee.nl')
    #play.SetJSactions(quote_plus('http://bartsidee.nl/boxee/apps/flow.js'))
    #if subtitle:
    #    play = ba.CreatePlay()
    #    play.SetPath(quote_plus(url))
    #    play.SetDomain('bbc.co.uk')
    #    play.SetJSactions(quote_plus('http://bartsidee.nl/boxee/apps/js/bbc1.js'))
    #else:
    #    play = ba.CreatePlay()
    #    play.SetPath(quote_plus(url))
    #    play.SetDomain('bbc.co.uk')
    #    play.SetJSactions(quote_plus('http://bartsidee.nl/boxee/apps/js/bbc0.js'))
    return play