def _get_photos(self, album_url):
    self._photos[album_url] = []
    html = self._get_html(album_url)
    album_title = parseDOM(html, 'title')[0]
    images = parseDOM(html, 'div', attrs={'class': 'photo'})
    descs = parseDOM(html, 'article', attrs={'class': 'pcaption'})
    for _id, photo in enumerate(images):
        pic = urllib2.quote(parseDOM(photo, 'img', ret='src')[0])
        description = stripTags(self._parser.unescape(
            parseDOM(descs[_id], 'div', attrs={'class': 'gcaption geor'})[0]))
        self._photos[album_url].append({
            'title': '%d - %s' % (_id + 1, album_title),
            'album_title': album_title,
            'photo_id': _id,
            'pic': 'http:' + pic,
            'description': description,
            'album_url': album_url
        })
    return self._photos[album_url]

def ListMovies():
    cookie = cache.cache_get('dramaqueen_cookie')['value']
    headersget.update({'Cookie': cookie})
    url = params['url']
    rM = str(requests.get(url, headers=headersget, timeout=15).content)
    rM = CleanHTML(rM)
    result = parseDOM(rM, 'div', attrs={'id': 'av_section_1'})[0]
    results = re.findall('flex_column av_one_fourth(.+?)</div></div></div>', result)
    Titles = re.findall('><p>(.+?)</p>', result)
    # [\s\S] matches the single character between the closing and opening <p>
    # (the original class [\s,\S,.] was an equivalent but confusing spelling)
    Plot = re.findall('/p>[\s\S]<p>(.+?)</p>', result)
    obrazy = parseDOM(results, 'img', ret='src')
    linki = parseDOM(results, 'a', ret='href')
    for item in zip(linki, Titles, obrazy, Plot):
        addon.addLink(str(item[1]), str(item[0]), mode=5, thumb=str(item[2]),
                      fanart=str(item[2]), plot=str(item[3]))

def _get_albums(self):
    self._albums = []
    url = 'https://www.theatlantic.com/infocus/'
    html = self._get_html(url)
    pattern = r'@media\(min-width:\s*1632px\)\s*{\s*#river1 \.lead-image\s*{\s*background-image:\s*url\((.+?)\)'
    for _id, li in enumerate(parseDOM(html, 'li', attrs={'class': 'article'})):
        headline = parseDOM(li, 'h1')[0]
        match = re.search(pattern.replace('river1', 'river%d' % (_id + 1)), html)
        if match:
            self._albums.append({
                'title': parseDOM(headline, 'a')[0],
                'album_id': _id,
                'pic': match.group(1),
                'description': stripTags(self._parser.unescape(
                    parseDOM(li, 'p', attrs={'class': 'dek'})[0])),
                'album_url': 'https://www.theatlantic.com' +
                             parseDOM(headline, 'a', ret='href')[0]
            })
    return self._albums

def getSWstreams(url):
    out = []
    html, basurl = getUrl2(url)
    try:
        result = parseDOM(html, 'font', attrs={'size': '3'})[0]
        if '<center><b>' in result:
            result = parseDOM(html, 'font', attrs={'size': '3'})[1]
        result = result.replace('\n', '').replace('</a> |', ' |') \
                       .replace('<b>', '').replace('</b>', '')
        try:
            xx = re.findall('(\w+: <a class.+?</a>)', result, re.DOTALL)
            for x in xx:
                lang = re.findall('^(\w+)', x, re.DOTALL)[0]
                hreftyt = re.findall('href="(.+?)".+?>(Source \d \w+)', x)
                for href, tyt in hreftyt:
                    href = basurl + href
                    tyt = '%s - [B]%s[/B]' % (lang, tyt)
                    out.append({'href': href, 'title': tyt})
        except Exception:
            # fallback: the '|'-separated layout
            results = result.split('|')
            for result in results:
                href, name = re.findall('href="(.+?)".+?>(.+?)<\/a>', result)[0]
                href = url + href
                out.append({'href': href,
                            'title': name.replace('<b>', '').replace('</b>', '')})
    except Exception:
        pass
    return out

def ListEpisodes():
    section = params['section']
    name = params['name']
    url = params['url']
    result = requests.get(url, timeout=15).content
    results = parseDOM(result, 'section', attrs={'id': 'anime-header'})
    poster = parseDOM(results, 'img', ret='src')[0]
    link = parseDOM(results, 'a', ret='href')
    title = parseDOM(results, 'a')
    tags = parseDOM(result, 'div', attrs={'class': 'field field-name-field-tags'})
    try:
        plot = re.findall('p><p>(.+?)</p>', result)[0]
        # strip <span> markup when present (the original tested
        # len(re.findall(...)) >= 0, which was always true)
        if re.findall('<span', plot):
            plot = re.sub('<span(.+?)/span>', '', plot)
    except Exception:
        plot = ''
    for i in zip(title, link):
        addon.addLink(str(i[0]), str(i[1]), mode='AOListLinks', section='links',
                      thumb=str(poster), plot=str(plot), fanart=custom_background)

def Browse_Seasons():
    url = params['url']
    section = params['section']
    page = params['page']
    img = params['img']
    if section == 'polecane':
        html = requests.get(url, timeout=15).content
        result = parseDOM(html, 'ul', attrs={'class': 'pmenu'})[1]
        result = parseDOM(result, 'li')
        for item in result:
            link = parseDOM(item, 'a', ret='href')[0]
            nazwa = parseDOM(item, 'a')[0]
            if 'Kolejno' in str(nazwa):
                continue
            addon.addDir(str(nazwa), url + str(link), mode='List_Episodes',
                         isFolder=True, thumb=fanartAol, fanart=default_background,
                         page=str(url), section='polecane')
    elif section == 'other':
        html = requests.get(url, timeout=15).content
        result = parseDOM(html, 'h1', attrs={'class': 'pod_naglowek'})
        if len(result) > 1:
            for item in result:
                addon.addDir(str(item), url, mode='List_Episodes', isFolder=True,
                             thumb=str(img), fanart=default_background,
                             page=str(item), section='multi')
        else:
            List_Episodes()

def ListDramas():
    url = params['url']
    rT = requests.get(url, timeout=15).content
    rT = CleanHTML(rT)
    result = parseDOM(rT, 'div', attrs={'id': 'av_section_1'})[0]
    results = re.findall('flex_column av_one_fourth(.+?)</div></div></div>', result)
    Titles = re.findall('><p>(.+?)</p>', result)
    # [\s\S] matches the single character between </p> and <p> (see ListMovies)
    Plot = re.findall('/p>[\s\S]<p>(.+?)</p>', result)
    obrazy = parseDOM(results, 'img', ret='src')
    linki = parseDOM(results, 'a', ret='href')
    for item in zip(linki, Titles, obrazy, Plot):
        addon.addDir(str(item[1]), str(item[0]), mode=4, plot=str(item[3]),
                     fanart=str(item[2]), isFolder=True, thumb=str(item[2]),
                     section='')

def Browse_Titles():
    url = params['url']
    name = params['name']
    html = requests.get(url, timeout=15).content
    if name in html:
        mark1 = '>' + name + '</div>'
        mark2 = '</ul>'
        data = GetDataBeetwenMarkers(html, mark1, mark2, False)[1]
        data = re.findall('<a href="(.+?)"(.+?)">(.+?)</a></li>', data)
        data.sort()
        # recommended ("Polecane") titles
        if len(data) > 0:
            for item in data:
                link = item[0]
                title = item[2]
                if 'inne.wbijam' in str(item[0]).lower():
                    continue
                addon.addDir(title, link, mode='Browse_Seasons', thumb=fanartAol,
                             fanart=default_background, section='polecane',
                             page=str(url))
        # remaining ("Pozostałe") titles
        else:
            data2 = GetDataBeetwenMarkers(html, mark1, mark2, False)[1]
            data2 = re.findall('<a href="(.+?)">(.+?)</a></li>', data2)
            data2.sort()
            for item in data2:
                link = url + item[0]
                # renamed from 'set' to avoid shadowing the builtin
                page_html = requests.get(link, timeout=15).content
                image = parseDOM(
                    [i for i in parseDOM(page_html, 'center') if 'img' in i][0],
                    'img', ret='src')[0]
                title = item[1]
                addon.addDir(title, link, mode='Browse_Seasons',
                             thumb=url + str(image), fanart=default_background,
                             section='other', page=str(url))

def _get_albums(self):
    self._albums = []
    home_url = 'https://www.readingthepictures.org'
    url = home_url + '/category/notes/'
    html = self._get_html(url)
    articles = parseDOM(html, 'div', attrs={'class': 'article'})
    for _id, article in enumerate(articles):
        title = parseDOM(article, 'a', ret='title')[0]
        picture = parseDOM(article, 'img', ret='src')[0]
        description = parseDOM(article, 'p')[0]
        self._albums.append({
            'title': self._parser.unescape(title),
            'album_id': _id,
            'pic': picture,
            'description': stripTags(self._parser.unescape(description)),
            'album_url': parseDOM(article, 'a', ret='href')[0]
        })
    return self._albums

def ShindenGetVideoLink(url):
    headers = {
        'Accept': '*/*',
        'Origin': 'https://shinden.pl',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/73.0.3683.46 Safari/537.36',
        'DNT': '1',
    }
    if str(url).startswith('//'):
        url = 'https://' + url
    session = requests.session()
    session.get(url, headers=headers, timeout=15)
    time.sleep(5)  # wait before requesting player_show
    video = session.get(url.replace('player_load', 'player_show') + '&width=508',
                        timeout=5).content
    video_url = ''
    try:
        video_url = parseDOM(video, 'iframe', ret='src')[0]
    except Exception:
        pass
    if not video_url:
        try:
            video_url = parseDOM(video, 'a', ret='href')[0]
        except Exception:
            pass
    if not video_url:
        try:
            video_url = re.findall('src="(.*?)"', video)[0]
        except Exception:
            pass
    if str(video_url).startswith('//'):
        video_url = 'http:' + video_url
    return video_url

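# Usage sketch (not from the original source): resolving a protocol-relative
# Shinden player URL. The URL below is a hypothetical placeholder; real
# player_load URLs come from an episode page.
def _demo_shinden_resolve():
    sample = '//example.invalid/player_load?online_id=123'
    link = ShindenGetVideoLink(sample)
    print('resolved video url: %r' % link)
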
def getSWlink(url):
    stream = ''
    playt = True
    html = getUrl(url, BASEURL3)
    if 'streamamg.com' in html:
        iframes = parseDOM(html, 'iframe', ret='src')
        for iframe in iframes:
            if 'streamamg.' in iframe:
                html2 = getUrl(iframe, url)
                xx = re.findall('"partnerId":(\d+)', html2, re.DOTALL)[0]
                xx2 = re.findall('"rootEntryId":"(.+?)"', html2, re.DOTALL)[0]
                m3u8 = ('http://open.http.mp.streamamg.com/p/%s/playManifest/'
                        'entryId/%s/format/applehttp' % (xx, xx2))
                return m3u8 + '|User-Agent=' + UA + '&Referer=' + iframe, False
    elif 'unblocked.is' in html:
        iframes = parseDOM(html, 'iframe', ret='src')
        for iframe in iframes:
            if 'unblocked.is' in iframe:
                if 'nullrefer.com' in iframe or 'href.li/' in iframe:
                    iframe = urlparse.urlparse(iframe).query
                html2 = getUrl(iframe, url)
                stream = getUnblocked(html2)
                return stream, False
    else:
        stream = re.findall('source: "(.+?)"', html, re.DOTALL)
        if stream:
            stream = stream[0]
        else:
            stream = re.findall('source src="(.+?)"', html, re.DOTALL)[0]
        playt = False
    return stream + '|User-Agent=' + UA + '&Referer=' + url, playt

def ListTVCOMdzis(url):
    out = []
    html = getUrl(url)
    # <div id="calendar-owl" class="owl-carousel">
    result = parseDOM(html, 'div', attrs={'id': 'calendar-owl'})[0]
    dzis = parseDOM(result, 'div', attrs={'class': 'item today'})
    if dzis:
        dat = re.findall('<a href="\/Den\/\?d=(.+?)">DZI', dzis[0])
        if dat:
            nagr = re.findall('"badge primary">(.+?)<', dzis[0])
            live = re.findall('"badge secondary">(.+?)<', dzis[0])
            wkrot = re.findall('"badge inverse">(.+?)<', dzis[0])
            nagr = nagr[0] if nagr else '0'
            live = live[0] if live else '0'
            wkrot = wkrot[0] if wkrot else '0'
            dod = ' - (%s, %s, %s)' % (nagr, live, wkrot)
            out.append({'href': dat[0], 'title': 'DZIŚ' + dod})
    days = parseDOM(result, 'div', attrs={'class': 'item'})
    for day in days:
        hrefday = re.findall('href="\/Den\/\?d=(.+?)">(.+?)<', day)[0]
        nagr = re.findall('"badge primary">(.+?)<', day)
        live = re.findall('"badge secondary">(.+?)<', day)
        wkrot = re.findall('"badge inverse">(.+?)<', day)
        nagr = nagr[0] if nagr else '0'
        live = live[0] if live else '0'
        wkrot = wkrot[0] if wkrot else '0'
        dod = ' - (%s, %s, %s)' % (nagr, live, wkrot)
        out.append({'href': hrefday[0], 'title': '%s%s' % (hrefday[1], dod)})
    return out

def _get_photos(self, album_url):
    self._photos[album_url] = []
    html = self._get_html(album_url)
    pattern = r'source data-srcset=\"(.+?)\"'
    match_image = re.findall(pattern, html)
    album_title = self._parser.unescape(parseDOM(html, 'title')[0])
    for _id, p in enumerate(parseDOM(html, 'p', attrs={'class': 'caption'})):
        match_description = re.search('<span>(.+?)</span>', p)
        if match_description:
            self._photos[album_url].append({
                'title': '%d - %s' % (_id + 1, album_title),
                'album_title': album_title,
                'photo_id': _id,
                # each photo contributes five <source> entries; take the first
                'pic': match_image[_id * 5],
                'description': stripTags(self._parser.unescape(
                    match_description.group(1))),
                'album_url': album_url
            })
    return self._photos[album_url]

def get_videos(url, description, ref_id, resolution_option=0, page=0):
    '''For a given topic url, returns a list of associated videos using the
    nyt REST API.'''
    if ref_id == '':
        html = _get_html(url)
        menu = parseDOM(html, 'div', attrs={'class': 'recent-episodes'})
        links = parseDOM(menu, 'a', attrs={'class': 'thumb-holder'}, ret='href')
        if description == 'New York':
            # This section has no direct json classification, so the video
            # ids are extracted straight from the html.
            videos = []
            for i, link in enumerate(links):
                video_id = re.search(r'^.+?/(\d{10,})/.+', link).group(1)
                videos.append(find_video_by_video_id(video_id, resolution_option))
        else:
            for i, link in enumerate(links):
                # Videos can be classified in more than one category, and the
                # main one may not be the one we are searching for (description).
                ref_id = link.split('=')[-1]
                videos = find_playlist_by_reference_id(ref_id, description,
                                                       resolution_option, page)
                if videos != []:
                    # Correct classification: the json contains a Show whose
                    # display_name == description.
                    break
    else:
        # No time wasted examining various json urls, as we know the received
        # ref_id is good.
        videos = find_playlist_by_reference_id(ref_id, description,
                                               resolution_option, page)
    return (videos, ref_id)

def getScheduleCR():
    out = []
    html = getUrl(BASEURL2)
    divs = parseDOM(html, 'div', attrs={'class': 'panel_mid_body'})
    for div in divs:
        day = parseDOM(div, 'h2')
        if day:
            day = 'kiedy|%s' % day[0]
            out.append({'href': day})
        trs = parseDOM(div, 'tr')
        for tr in trs:
            online = ('[COLOR lime]► [/COLOR]' if tr.find('images/live.gif') > 0
                      else '[COLOR orangered]■ [/COLOR]')
            if '>VS</td>' in tr:
                czas, dysc, team1, team2, href = re.findall(
                    '>(\d+:\d+)</td>.+?<span title="(.+?)".+?href=.+?>(.+?)<.+?>VS<.+?a href.+?>(.+?)</a>.+?<a class="watch_btn" href="(.+?)"',
                    tr, re.DOTALL)[0]
                mecz = '%s vs %s' % (team1, team2)
                # shift the listed hour by two, wrapping 24 to 00
                czas = czas.split(':')
                hrs = int(czas[0]) + 2
                if hrs == 24:
                    hrs = '00'
                mins = czas[1]
                czas = '%s:%s' % (str(hrs), mins)
            else:
                czas, dysc, team1, href = re.findall(
                    '>(\d+:\d+)</td>.+?<span title="(.+?)".+?href=.+?>(.+?)<.+?<a class="watch_btn" href="(.+?)"',
                    tr, re.DOTALL)[0]
                mecz = team1
            title = '[B][COLOR khaki]%s%s : [/COLOR][/B][COLOR gold][B]%s[/B][/COLOR]' % (
                online, czas, mecz)
            out.append({'title': title, 'href': href, 'code': dysc})
    return out

def Kategorie():
    cookie = cache.cache_get('dramaqueen_cookie')['value']
    headersget.update({'Cookie': cookie})
    url = params['url']
    rG = requests.get(url, headers=headersget, timeout=15).content
    # LoginCheck(url=rG)
    result = parseDOM(rG, 'div', attrs={'class': 'tagcloud'})[0]
    links = parseDOM(result, 'a', ret='href')
    label = parseDOM(result, 'a')
    count = [re.findall('\d+', i)[0]
             for i in parseDOM(result, 'a', ret='aria-label')]
    for item in zip(label, links, count):
        addon.addDir(str(item[0]) + ' ' +
                     '[COLOR green]%s pozycji[/COLOR]' % str(item[2]),
                     str(item[1]), mode=7, fanart='', plot='', thumb='')

def _get_photos(self, album_url):
    self._photos[album_url] = []
    html = self._get_html(album_url)
    pattern = r'@media\(min-width:1592px\){#img01 \.img{background-image:url\((.+?)\)'
    id_pattern = re.compile(r'#img(\d\d)')
    album_title = parseDOM(html, 'title')[0]
    for _id, p in enumerate(parseDOM(html, 'p', attrs={'class': 'caption'})):
        match = re.search(id_pattern, p)
        if match:
            img_id = match.group(1)
            match = re.search(pattern.replace('img01', 'img%s' % img_id), html)
            if match:
                self._photos[album_url].append({
                    'title': '%d - %s' % (_id + 1, album_title),
                    'album_title': album_title,
                    'photo_id': _id,
                    'pic': match.group(1),
                    'description': stripTags(
                        self._parser.unescape(p)).replace('\n #', ''),
                    'album_url': album_url,
                })
    return self._photos[album_url]

def _get_albums(self):
    self._albums = []
    url = 'http://www.bostonglobe.com/news/bigpicture'
    html = self._get_html(url)
    for _id, album in enumerate(parseDOM(html, 'section')):
        title = parseDOM(album, 'a')[0]
        album_url = 'http://www.bostonglobe.com' + \
            parseDOM(album, 'a', ret='href')[0]
        d = parseDOM(album, 'div', attrs={'class': 'subhead geor'})[0]
        if not d:
            continue
        description = stripTags(self._parser.unescape(d))
        pic = urllib2.quote(parseDOM(album, 'img', ret='src')[0])
        if not pic:
            continue
        self._albums.append({
            'title': title,
            'album_id': _id,
            'pic': 'http:' + pic,
            'description': description,
            'album_url': album_url
        })
    return self._albums

def _get_albums(self):
    self._albums = []
    home_url = 'https://time.com'
    url = home_url + '/tag/photography/'
    html = self._get_html(url)
    articles = parseDOM(html, 'div', attrs={'class': 'taxonomy-tout'})
    for _id, article in enumerate(articles):
        title = parseDOM(article, 'h2')[0]
        picture = parseDOM(article, 'img', ret='src')[0]
        try:
            description = parseDOM(article, 'h3')[0]
        except Exception:
            description = ''
        self._albums.append({
            'title': self._parser.unescape(title),
            'album_id': _id,
            'pic': picture,
            'description': stripTags(self._parser.unescape(description)),
            'album_url': home_url + parseDOM(article, 'a', ret='href')[0]
        })
    return self._albums

def getScheduleSW():
    out = []
    html = getUrl(BASEURL3)
    first = parseDOM(html, 'div', attrs={'class': 'tab'})[0]  # <div class="tab">
    iddaydate = re.findall("event, '(.+?)'\).+?<b>(.+?)</b>.+?<b>(.+?)</b>",
                           first, re.DOTALL)
    for id, day, date in iddaydate:
        result = parseDOM(html, 'div', attrs={'id': id})[0]
        result = result.replace('a class=""', 'a class=" "')
        xxx = re.findall('(\d+:\d+).*<a class="([^"]+)" href="([^"]+)">([^>]+)</a>',
                         result)
        if xxx:
            # the site misspells FRIDAY as FIRDAY; fix it for display
            day = ('kiedy|%s %s' % (day, date)).replace('FIRDAY', 'FRIDAY')
            out.append({'href': day})
        for czas, ikona, href, tyt in xxx:
            if '\xf0\x9f\x8e\xb1' in ikona:  # UTF-8 bytes of the 🎱 emoji
                ikona = 'snooker'
            tyt = re.sub('<font color=.+?>', '', tyt).replace('</font>', '')
            if '<a href' in tyt or '<br><br' in tyt:
                continue
            tyt = '[B][COLOR khaki]%s : [/COLOR][/B][COLOR gold][B]%s[/B][/COLOR]' % (
                czas, tyt)
            href2 = ('http://strims.world' + href if href.startswith('/')
                     else 'http://strims.world/' + href)
            out.append({'title': tyt, 'href': href2, 'image': ikona})
    return out

def getSWstreamsx(url):
    out = []
    html = getUrl(url)
    try:
        result = parseDOM(html, 'font', attrs={'size': '3'})[0]
        if '<center><b>' in result:
            result = parseDOM(html, 'font', attrs={'size': '3'})[1]
        t = re.sub('--.*?>', '', result)
        result = t.replace('\r\n\r\n', '')
        try:
            xx = re.findall('(\w+)\: <a(.+?)adsbygoogle', result, re.DOTALL)
            if not xx:
                # no labelled sources; fall through to the '|'-separated layout
                # (the original achieved this with a dead 'b = xx[0]' lookup)
                raise IndexError
            for x in xx:
                tit = x[0]
                aa = re.findall('href="(.+?)".+?>(.+?)</a>', x[1], re.DOTALL)
                for a in aa:
                    if 'vjs' in a[0]:
                        continue
                    href = a[0]
                    tytul = a[1].replace('<b>', '').replace('</b>', '')
                    tyt = '%s - [B]%s[/B]' % (tytul, tit)
                    href = url + href
                    out.append({'href': href, 'title': tyt})
        except Exception:
            results = result.split('|')
            for result in results:
                href, name = re.findall('href="(.+?)".+?>(.+?)<\/a>', result)[0]
                href = url + href
                out.append({'href': href,
                            'title': name.replace('<b>', '').replace('</b>', '')})
    except Exception:
        pass
    return out

def _get_albums(self):
    self._albums = []
    home_url = 'https://www.bbc.com'
    url = home_url + '/news/in_pictures'
    html = self._get_html(url)
    articles = parseDOM(html, 'div', attrs={'class': 'gs-o-media__body'})
    pictures = parseDOM(html, 'div',
                        attrs={'class': 'gs-u-mb\+ gel-body-copy qa-post-body'})
    descriptions = parseDOM(html, 'div', attrs={'class': 'gel-5/8@l'})
    timestamp = parseDOM(html, 'span', attrs={'class': 'qa-post-auto-meta'})
    for _id, article in enumerate(articles):
        title = parseDOM(parseDOM(article, 'a')[0], 'span')[0]
        try:
            picture = parseDOM(pictures[_id], 'img', ret='srcset')[0]
            # take the last (largest) candidate from the srcset list
            picture = re.search(r', (?P<bigger_url>https://[^ ]+) \d+w$',
                                picture).group('bigger_url')
            description = parseDOM(descriptions[_id], 'p')[0]
        except Exception:
            continue
        self._albums.append({
            'title': self._parser.unescape(title),
            'album_id': _id,
            'pic': picture,
            'description': stripTags(self._parser.unescape(description)) +
                           '\n\nPosted @' + timestamp[_id],
            'album_url': home_url + parseDOM(article, 'a', ret='href')[0]
        })
    return self._albums

def getLiveSport():
    out = []
    html = getUrl(BASEURL5, BASEURL5)
    result = parseDOM(html, 'ul', attrs={'class': 'drop-list'})
    acts = parseDOM(result, 'li', attrs={'class': 'active'})
    for act in acts:
        # e.g. >12 September, Today</span></a>
        kiedy = re.findall('"text">(.+?)<\/span><\/a>', act)[0]
        day = 'kiedy|%s' % kiedy
        out.append({'href': day})
        act = act.replace("\'", '"')
        links = parseDOM(act, 'li')
        for link in links:
            href = parseDOM(link, 'a', ret='href')[0]
            href = 'https://livesport.ws' + href if href.startswith('/') else href
            try:
                team1 = re.findall('right;">(.+?)<\/div>', link)[0]
                team2 = re.findall('left;">(.+?)<\/div>', link)[0]
                mecz = '%s vs %s' % (team1, team2)
            except Exception:
                mecz = re.findall('center;.+?>(.+?)<', link)[0]
            dysc = re.findall('"competition">(.+?)</', link)
            dysc = dysc[0] if dysc else ''
            ikon = parseDOM(link, 'img', ret='src')[0]
            datas = parseDOM(link, 'span', attrs={'class': 'date'})[0]
            liv = parseDOM(datas, 'i')[0]
            online = ('[COLOR lime]► [/COLOR]' if 'live' in liv.lower()
                      else '[COLOR orangered]■ [/COLOR]')
            id = parseDOM(link, 'i', ret='id')
            if id:
                postid = re.findall('(\d+)', href)[0]
                eventid = id[0]
                href += '|event_id=%s|post_id=%s|' % (eventid, postid)
            czas = parseDOM(datas, 'i', ret='data-datetime')[0]
            st = re.findall('(\d+:\d+)', czas)[0]
            # shift the listed hour back by one
            czas1 = str(int(st.split(':')[0]) - 1)
            czas = re.sub('\d+:', czas1 + ':', czas)
            title = '[B][COLOR khaki]%s%s : [/COLOR][/B][COLOR gold][B]%s[/B][/COLOR]' % (
                online, czas, mecz)
            out.append({'title': title, 'href': href, 'image': ikon, 'code': dysc})
    return out

def get_domain_icon(entry_name, domain):
    import requests
    from CommonFunctions import parseDOM
    subs_dict = {}
    req = 'http://%s' % domain
    r = requests.get(req)
    if r.status_code == requests.codes.ok:
        try:
            # <meta content="https://www.blogger.com" property="og:url">
            og_url = parseDOM(r.text, "meta", attrs={"property": "og:url"},
                              ret="content")[0]
        except Exception:
            og_url = req
        a = parseDOM(r.text, "meta", attrs={"property": "og:image"}, ret="content")
        b = parseDOM(r.text, "link", attrs={"rel": "apple-touch-icon"}, ret="href")
        c = parseDOM(r.text, "link", attrs={"rel": "apple-touch-icon-precomposed"},
                     ret="href")
        d = parseDOM(r.text, "link", attrs={"rel": "icon"}, ret="href")
        i = next((item for item in [a, b, c, d] if item), '')
        if i:
            try:
                icon = urlparse.urljoin(og_url, i[-1])  # handle relative or absolute
                subs_dict.update({
                    'entry_name': entry_name,
                    'display_name': domain,
                    'icon_img': icon,
                })
                return subs_dict
            except IndexError:
                pass
        else:
            log(" can't parse icon: get_domain_icon (%s)" % (domain))
    else:
        log(' getting get_domain_icon (%s) info:%s' % (domain, r.status_code))

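# Usage sketch (not from the original source): fetch the best available icon
# for a bare domain. 'example.com' and the entry name are placeholders.
def _demo_domain_icon():
    info = get_domain_icon('example_entry', 'example.com')
    if info:
        print('%s -> %s' % (info['display_name'], info['icon_img']))
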
def get_topics():
    '''Returns a list of (topic_name, url) of available topics'''
    html = _get_html(BASE_URL)
    menu = parseDOM(html, 'div', attrs={'class': 'header-container[^\'"]*'})
    topics_url = parseDOM(menu, 'a', ret='href')
    topics_description = parseDOM(menu, 'a')
    links_indexes = [x for x, y in enumerate(topics_url)
                     if y.startswith('/video/')]
    topics = [(stripTags(topics_description[i]), NYT_URL_BASE + topics_url[i][1:])
              for i in links_indexes]
    topics.insert(0, (LATEST_VIDEOS, _url('/video/latest-video/')))
    return topics

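# Usage sketch (illustrative): print the scraped topic menu. Assumes the
# module-level BASE_URL, NYT_URL_BASE and LATEST_VIDEOS constants are defined.
def _demo_list_topics():
    for topic_name, topic_url in get_topics():
        print('%s: %s' % (topic_name, topic_url))
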
def places(item):
    if (DEBUG):
        logger.info("[channel.py] places")
    itemlist = []
    html = _get_html(item.url)
    places = parseDOM(html, 'a', attrs={'class': 'locationLink'})
    places_url = parseDOM(html, 'a', attrs={'class': 'locationLink'}, ret='href')
    for _id, place in enumerate(places):
        title = place
        url = PLACES_URL + places_url[_id]
        item = Item(action='place', title=title, url=url,
                    thumbnail='', fanart='', plot='')
        itemlist.append(item)
    return itemlist

def get_domain_icon(entry_name, domain, check_this_url_instead_of_domain=None):
    import requests
    from CommonFunctions import parseDOM
    subs_dict = {}
    if check_this_url_instead_of_domain:
        req = check_this_url_instead_of_domain
    else:
        req = 'http://%s' % domain
    r = requests.get(req)
    if r.status_code == requests.codes.ok:
        try:
            # <meta content="https://www.blogger.com" property="og:url">
            og_url = parseDOM(r.text, "meta", attrs={"property": "og:url"},
                              ret="content")[0]
        except Exception:
            og_url = req
        # "shortcut icon" is skipped: it returns an .ico file
        a = parseDOM(r.text, "meta", attrs={"property": "og:image"}, ret="content")
        b = parseDOM(r.text, "link", attrs={"rel": "apple-touch-icon"}, ret="href")
        c = parseDOM(r.text, "link", attrs={"rel": "apple-touch-icon-precomposed"},
                     ret="href")
        d = parseDOM(r.text, "link", attrs={"rel": "icon"}, ret="href")
        i = next((item for item in [a, b, c, d] if item), '')
        if i:
            try:
                icon = urlparse.urljoin(og_url, i[-1])  # handle relative or absolute
                # same structure as that returned by get_subreddit_info()
                subs_dict.update({
                    'entry_name': entry_name,
                    'display_name': domain,
                    'icon_img': icon,
                })
                return subs_dict
            except IndexError:
                pass
        else:
            log(" can't parse icon: get_domain_icon (%s)" % (domain))
    else:
        log(' getting get_domain_icon (%s) info:%s' % (domain, r.status_code))

def getChannelsCR():
    out = []
    html = getUrl(BASEURL2)
    result = parseDOM(html, 'ul', attrs={'class': 'nav-sidebar'})[0]
    channels = parseDOM(result, 'li')
    for channel in channels:
        if '<ul class="nav-submenu">' in channel:
            continue
        try:
            href = parseDOM(channel, 'a', ret='href')[0]
            title = parseDOM(channel, 'a', ret='title')[0]
            out.append({'href': href,
                        'title': '[COLOR lime]► [/COLOR] [B][COLOR gold]' +
                                 title + '[/COLOR][/B]'})
        except Exception:
            pass
    return out

def _get_photos(self, album_url):
    self._photos[album_url] = []
    html = self._get_html(album_url)
    html = html.replace('srcSet', 'srcset')
    album_title = self._parser.unescape(parseDOM(html, 'title')[0])
    pictures = parseDOM(html, 'img', attrs={'class': '.+Image[^"]+'}, ret='srcset')
    descriptions = parseDOM(html, 'figcaption')
    if len(descriptions) == 0:
        descriptions = [''] * len(pictures)
    id_picture = 0
    for _id, description in enumerate(descriptions):
        try:
            description = stripTags(self._parser.unescape(description)).replace(
                'image caption', '')
            # skip transparent spacers and divider-line images
            condition = True
            while condition:
                picture = pictures[id_picture]
                picture = re.search(r', (?P<bigger_url>https://[^ ]+) \d+w$',
                                    picture).group('bigger_url')
                id_picture += 1
                if re.search(r'(transparent|line)[^\."]+\.png', picture) is None:
                    condition = False
            if description == '' and \
                    re.search(r'banner[^\."]+\.png', picture) is not None:
                continue
            self._photos[album_url].append({
                'title': '%d - %s' % (_id + 1, album_title),
                'album_title': album_title,
                'photo_id': _id,
                'pic': picture,
                'description': self._parser.unescape(description),
                'album_url': album_url
            })
        except Exception:
            continue
    return self._photos[album_url]

def ListEpisodes():
    cookie = cache.cache_get('dramaqueen_cookie')['value']
    headersget.update({'Cookie': cookie})
    name = params['name']
    thumb = params['img']
    url = params['url']
    rE = str(requests.get(url, headers=headersget, timeout=15).content)
    LoginCheck(rE)
    rE = rE.replace('–', '-')
    rE = rE.replace(' ', ' ')  # the first argument is a non-breaking space
    result = parseDOM(rE, 'div', attrs={'class': 'container'})[1]
    results = re.findall('av_toggle_section(.+?)<span', result)
    episodes = parseDOM(results, 'p')
    plot = parseDOM(rE, 'em')[0]
    plot = CleanHTML(plot)
    fanart = re.findall('background-image: url\((.+?)\);', rE)[1]
    inprogress = '[COLOR=red][I] w tłumaczeniu[/COLOR][/I]'  # "being translated"
    incorrection = '[COLOR=red][I] korekta[/COLOR][/I]'      # "proofreading"
    for item in episodes:
        if 'tłumaczenie' in item:
            addon.addLink(str(inprogress), url, mode=5, fanart=str(fanart),
                          plot=str(plot), thumb=str(fanart))
        elif 'korekta' in item:
            addon.addLink(str(incorrection), url, mode=5, fanart=str(fanart),
                          plot=str(plot), thumb=str(fanart))
        else:
            addon.addLink(str(item), url, mode=5, fanart=str(fanart),
                          plot=str(plot), thumb=str(fanart))

def ListTVCOMlinksDysc2(html):
    out = []
    videos = parseDOM(html, 'div', attrs={'id': 'video-selector'})[0]
    vids = parseDOM(videos, 'div', attrs={'class': 'media'})
    for vid in vids:
        try:
            href, tyt = re.findall('href="(.+?)">(.+?)<\/a>', vid)[0]
        except Exception:
            tyt = re.findall('>(.+?)<\/h4>', vid)[0]
            href = re.findall('href="(.+?)"', vid)[0]
        href = 'https://www.tvcom.pl' + href if href.startswith('/') else href
        imag = re.findall('src="(.+?)"', vid)[0]
        dat = re.findall('<h5>(.+?)<\/h5>', vid)[0]
        tytul = '(%s) %s' % (dat, tyt)
        out.append({'href': href, 'title': tytul, 'imag': imag})
    return out

def _get_albums(self):
    self._albums = []
    url = 'http://www.theatlantic.com/infocus/'
    html = self._get_html(url)
    pattern = r'@media\(min-width:1632px\){#river1 \.lead-image{background-image:url\((.+?)\)'
    for _id, li in enumerate(parseDOM(html, 'li', attrs={'class': 'article'})):
        headline = parseDOM(li, 'h1')[0]
        match = re.search(pattern.replace('river1', 'river%d' % (_id + 1)), html)
        if match:
            self._albums.append({
                'title': parseDOM(headline, 'a')[0],
                'album_id': _id,
                'pic': match.group(1),
                'description': stripTags(self._parser.unescape(
                    parseDOM(li, 'p', attrs={'class': 'dek'})[0])),
                'album_url': 'http://www.theatlantic.com' +
                             parseDOM(headline, 'a', ret='href')[0],
            })
    return self._albums

def _get_photos(self, album_url):
    self._photos[album_url] = []
    html = self._get_html(album_url)
    album_title = parseDOM(html, 'title')[0]
    images = parseDOM(html, 'div', attrs={'class': 'photo'})
    descs = parseDOM(html, 'article', attrs={'class': 'pcaption'})
    for _id, photo in enumerate(images):
        pic = urllib2.quote(parseDOM(photo, 'img', ret='src')[0])
        description = stripTags(
            parseDOM(descs[_id], 'div', attrs={'class': 'gcaption geor'})[0])
        self._photos[album_url].append({
            'title': '%d - %s' % (_id + 1, album_title),
            'album_title': album_title,
            'photo_id': _id,
            'pic': 'http:' + pic,
            'description': description,
            'album_url': album_url
        })
    return self._photos[album_url]

def cams(item):
    if (DEBUG):
        logger.info("[channel.py] cams")
    itemlist = []
    if (DEBUG):
        logger.info("url=" + item.url)
    html = _get_html(item.url)
    divs = parseDOM(html, 'div', attrs={'class': r'[^\'"]*?col\-xs\-12'})
    for _id, div in enumerate(divs):
        thumbnail = parseDOM(div, 'img', ret='src')[0].replace(
            '256x144', '512x288').replace('128x72', '256x144')
        url = parseDOM(div, 'a', ret='href')[0]
        if 'www.earthcam.com' not in url or 'alexa' in url or 'myearthcam' in url:
            continue
        title = parseDOM(div, 'span', attrs={'class': 'featuredTitle'})[0]
        location = parseDOM(div, 'div', attrs={'class': 'featuredCity'})[0]
        plot = title + "\n(" + location + ')'
        if plot is None:
            plot = ''
        if (DEBUG):
            logger.info("%s, %s, %s, %s, %s" % (title, thumbnail, url, location, plot))
        item = Item(action="play", title=title, url=url, thumbnail=thumbnail,
                    fanart=thumbnail, plot=plot)
        itemlist.append(item)
    # more cameras from the front page
    if (DEBUG):
        logger.info("url=" + URL)
    html = _get_html(URL)
    divs = parseDOM(html, 'div', attrs={'class': '[^\'"]*?camera_block[^\'"]*?'})
    for _id, div in enumerate(divs):
        if not re.search(r'//www.earthcam.com/[^"}\']+?\?cam=', div):
            continue
        try:
            title = parseDOM(div, 'img', ret='title')[0].replace('EarthCam: ', '')
            thumbnail = parseDOM(div, 'img', ret='src')[0].replace(
                '256x144', '512x288').replace('128x72', '256x144')
            url = URL + re.search(r'//www.earthcam.com/([^"}\']+)', div).group(1)
            location = parseDOM(div, 'div',
                                attrs={'class': '[^\'"]*?thumbnailTitle[^\'"]*?'})[0]
            plot = title
            if (DEBUG):
                logger.info("cams : %s, %s, %s, %s, %s" % (title, thumbnail, url,
                                                           location, plot))
        except Exception:
            continue
        item = Item(action="play", title=title, url=url, thumbnail=thumbnail,
                    fanart=thumbnail, plot=plot)
        itemlist.append(item)
    return itemlist

def _get_category(item, category):
    itemlist = []
    if (DEBUG):
        logger.info("url=" + item.url)
    html = _get_html(item.url)
    divs = parseDOM(html, 'div',
                    attrs={'class': '[^\'"]*?col\-xs\-[^\'"]+?result_column_[AB][^\'"]*'})
    (title, thumbnail, url, location, plot) = ('', '', '', '', '')
    if divs:
        for _id, div in enumerate(divs):
            try:
                # column_A (even) contains the thumbnail whilst column_B (odd)
                # contains the rest of the infos...
                if (_id % 2 == 0):
                    # column_A: thumbnail
                    thumbnail = parseDOM(
                        div, 'img', attrs={'class': '[^\'"]*thumbnailImage[^\'"]*'},
                        ret='src')[0].replace('256x144', '512x288').replace(
                            '128x72', '256x144')
                else:
                    # column_B
                    url = parseDOM(div, 'a', attrs={'class': 'camTitle'},
                                   ret='href')[0]
                    # discard (almost all) the external links:
                    if not re.search(r'(//www.earthcam.com/|//(www.)?youtube.com/)', url):
                        continue
                    title = parseDOM(parseDOM(div, 'a', attrs={'class': 'camTitle'}),
                                     'span')[0].replace('EarthCam: ', '')
                    location = parseDOM(div, 'div', attrs={'class': 'cam_location'})[0]
                    plot = parseDOM(div, 'div', attrs={'class': 'cam_description'})[0]
                    if plot is None:
                        plot = ''
                    if (DEBUG):
                        logger.info("%s, %s, %s, %s, %s" % (title, thumbnail, url,
                                                            location, plot))
                    item = Item(action="play", title=title, url=url,
                                thumbnail=thumbnail, fanart=thumbnail, plot=plot)
                    itemlist.append(item)
            except Exception:
                continue
    else:
        divs = parseDOM(html, 'div', attrs={'class': r'[^\'"]*?col\-xs\-12'})
        zone = parseDOM(html, 'p', attrs={'class': 'pageTitle'})[0].replace(':', '')
        for _id, div in enumerate(divs):
            thumbnail = parseDOM(div, 'img', ret='src')[0].replace(
                '256x144', '512x288').replace('128x72', '256x144')
            url = parseDOM(div, 'a', ret='href')[0]
            title = parseDOM(div, 'span', attrs={'class': 'featuredTitle'})[0]
            location = parseDOM(div, 'div', attrs={'class': 'featuredCity'})[0] + \
                ', ' + zone
            plot = title + "\n(" + location + ')'
            if plot is None:
                plot = ''
            if (DEBUG):
                logger.info("%s, %s, %s, %s, %s" % (title, thumbnail, url,
                                                    location, plot))
            item = Item(action="play", title=title, url=url, thumbnail=thumbnail,
                        fanart=thumbnail, plot=plot)
            itemlist.append(item)
    # pagination: add a 'Next >>' entry when the bottom pager links forward
    try:
        links = parseDOM(parseDOM(html, 'div', attrs={'id': 'pagination_bottom'}),
                         'a', ret='href')
        links_text = parseDOM(parseDOM(html, 'div', attrs={'id': 'pagination_bottom'}),
                              'a')
        link = links[-1]
        if re.search(r'^Next', links_text[-1]):
            url = link
            if category.startswith('search'):
                url = URL + RESULTS_URL + url[1:]
                category = 'search_results'
            else:
                url = URL + PREFIX_PATCH + url[1:]
            if (DEBUG):
                logger.info(url)
            item = Item(action=category, title='Next >>', url=url,
                        thumbnail='', fanart='', plot='')
            itemlist.append(item)
    except Exception:
        pass
    return itemlist