    def _get_photos(self, album_url):
        self._photos[album_url] = []
        html = self._get_html(album_url)
        album_title = parseDOM(html, 'title')[0]
        images = parseDOM(html, 'div', attrs={'class': 'photo'})
        descs = parseDOM(html, 'article', attrs={'class': 'pcaption'})
        for _id, photo in enumerate(images):
            pic = urllib2.quote(parseDOM(photo, 'img', ret='src')[0])
            description = stripTags(
                self._parser.unescape(
                    parseDOM(descs[_id],
                             'div',
                             attrs={'class': 'gcaption geor'})[0]))
            self._photos[album_url].append({
                'title': '%d - %s' % (_id + 1, album_title),
                'album_title': album_title,
                'photo_id': _id,
                'pic': 'http:' + pic,
                'description': description,
                'album_url': album_url
            })

        return self._photos[album_url]
Example #2
def ListMovies():

    cookie = cache.cache_get('dramaqueen_cookie')['value']
    headersget.update({'Cookie': cookie})

    url = params['url']
    rM = str(requests.get(url, headers=headersget, timeout=15).content)
    rM = CleanHTML(rM)

    result = parseDOM(rM, 'div', attrs={'id': 'av_section_1'})[0]
    results = re.findall('flex_column av_one_fourth(.+?)</div></div></div>',
                         result)

    Titles = re.findall('><p>(.+?)</p>', result)
    Plot = re.findall('/p>[\s,\S,.]<p>(.+?)</p>', result)
    obrazy = parseDOM(results, 'img', ret='src')
    linki = parseDOM(results, 'a', ret='href')

    for item in zip(linki, Titles, obrazy, Plot):
        addon.addLink(str(item[1]),
                      str(item[0]),
                      mode=5,
                      thumb=str(item[2]),
                      fanart=str(item[2]),
                      plot=str(item[3]))
Example #3
    def _get_albums(self):
        self._albums = []
        url = 'https://www.theatlantic.com/infocus/'
        html = self._get_html(url)
        pattern = r'@media\(min-width:\s*1632px\)\s*{\s*#river1 \.lead-image\s*{\s*background-image:\s*url\((.+?)\)'
        for _id, li in enumerate(
                parseDOM(html, 'li', attrs={'class': 'article'})):
            headline = parseDOM(li, 'h1')[0]
            match = re.search(pattern.replace('river1', 'river%d' % (_id + 1)),
                              html)
            if match:
                self._albums.append({
                    'title': parseDOM(headline, 'a')[0],
                    'album_id': _id,
                    'pic': match.group(1),
                    'description': stripTags(
                        self._parser.unescape(
                            parseDOM(li, 'p', attrs={'class': 'dek'})[0])),
                    'album_url': 'https://www.theatlantic.com' +
                    parseDOM(headline, 'a', ret='href')[0]
                })

        return self._albums
Example #4
def getSWstreams(url):
	out=[]
	html,basurl=getUrl2(url)
	try:
		result = parseDOM(html,'font',attrs = {'size':'3'})[0]
		
		if '<center><b>' in result:
			result = parseDOM(html,'font',attrs = {'size':'3'})[1]
		result=result.replace('\n','').replace('</a> |',' |').replace('<b>','').replace('</b>','')
		
		try:
			xx=re.findall('(\w+: <a class.+?</a>)',result,re.DOTALL)
			for x in xx:
				lang=re.findall('^(\w+)',x,re.DOTALL)[0]
				hreftyt=re.findall('href="(.+?)".+?>(Source \d \w+)',x)
				for href,tyt in hreftyt:
					
					href=basurl+href
					tyt='%s - [B]%s[/B]'%(lang,tyt)
					out.append({'href':href,'title':tyt})

		except:
			results=result.split('|')
			for result in results:
				href,name=re.findall('href="(.+?)".+?>(.+?)<\/a>',result)[0]
				href=url+href
				out.append({'href':href,'title':name.replace('<b>','').replace('</b>','')})		
		
	except:
		pass
	return out
Example #5
def ListEpisodes():

    section = params['section']
    name = params['name']
    url = params['url']
    result = requests.get(url, timeout=15).content
    results = parseDOM(result, 'section', attrs={'id': 'anime-header'})
    poster = parseDOM(results, 'img', ret='src')[0]
    link = parseDOM(results, 'a', ret='href')
    title = parseDOM(results, 'a')
    tags = parseDOM(result,
                    'div',
                    attrs={'class': 'field field-name-field-tags'})
    try:
        plot = re.findall('p><p>(.+?)</p>', result)[0]
        if len(re.findall('<span', plot)) > 0:
            plot = re.sub('<span(.+?)/span>', '', plot)
    except:
        plot = ''

    for i in zip(title, link):

        addon.addLink(str(i[0]),
                      str(i[1]),
                      mode='AOListLinks',
                      section='links',
                      thumb=str(poster),
                      plot=str(plot),
                      fanart=custom_background)
Example #6
def Browse_Seasons():

    url = params['url']
    section = params['section']
    page = params['page']
    img = params['img']
    
    if section == 'polecane':
        html = requests.get(url, timeout=15).content
        result = parseDOM(html, 'ul', attrs={'class': 'pmenu'})[1]

        result = parseDOM(result, 'li')
        for item in result:
            link = parseDOM(item, 'a', ret='href')[0]
            nazwa = parseDOM(item, 'a')[0]
            if "Kolejno" in str(nazwa):
                continue
            addon.addDir(str(nazwa), url + str(link), mode='List_Episodes',
                         isFolder=True, thumb=fanartAol,
                         fanart=default_background, page=str(url),
                         section='polecane')
    elif section == 'other':
        html = requests.get(url, timeout=15).content
        result = parseDOM(html, 'h1', attrs={'class': 'pod_naglowek'})
        if len(result) > 1:
            for item in result:
                addon.addDir(str(item), url, mode='List_Episodes',
                             isFolder=True, thumb=str(img),
                             fanart=default_background, page=str(item),
                             section='multi')
        else:
            List_Episodes()
Example #7
def ListDramas():

    url = params['url']
    rT = requests.get(url, timeout=15).content

    rT = CleanHTML(rT)

    result = parseDOM(rT, 'div', attrs={'id': 'av_section_1'})[0]
    results = re.findall('flex_column av_one_fourth(.+?)</div></div></div>',
                         result)

    Titles = re.findall('><p>(.+?)</p>', result)
    Plot = re.findall('/p>[\s,\S,.]<p>(.+?)</p>', result)
    obrazy = parseDOM(results, 'img', ret='src')
    linki = parseDOM(results, 'a', ret='href')

    for item in zip(linki, Titles, obrazy, Plot):
        addon.addDir(str(item[1]),
                     str(item[0]),
                     mode=4,
                     plot=(str(item[3])),
                     fanart=(str(item[2])),
                     isFolder=True,
                     thumb=(str(item[2])),
                     section='')
Example #8
def Browse_Titles():

    url = params['url']
    name = params['name']
    html = requests.get(url, timeout=15).content
    if name not in html:
        return  # the markers below would otherwise be undefined (NameError)
    mark1 = '>' + name + '</div>'
    mark2 = '</ul>'

    data = GetDataBeetwenMarkers(html, mark1, mark2, False)[1]
    data = re.findall('<a href="(.+?)"(.+?)">(.+?)</a></li>', data)
    data.sort()
##### Featured ("Polecane") #####
    if len(data) > 0:  
        for item in data:
            link = item[0]
            title = item[2]
            if 'inne.wbijam' in str(item[0]).lower():
                continue
            addon.addDir(title, link, mode='Browse_Seasons', thumb=fanartAol, fanart=default_background, section='polecane', page=str(url))
##### Remaining ("Pozostałe") #####
    elif len(data) == 0:
        data2 = GetDataBeetwenMarkers(html, mark1, mark2, False)[1]
        data2 = re.findall('<a href="(.+?)">(.+?)</a></li>', data2)
        data2.sort()
        for item in data2:
            link = url + item[0]
            page_html = requests.get(link, timeout=15).content  # was 'set', which shadowed the builtin
            image = parseDOM([i for i in parseDOM(page_html, 'center') if 'img' in i][0], 'img', ret='src')[0]
            title = item[1]
            addon.addDir(title, link, mode='Browse_Seasons', thumb=url + str(image),
                         fanart=default_background, section='other', page=str(url))
Example #9
    def _get_albums(self):
        self._albums = []
        home_url = 'https://www.readingthepictures.org'
        url = home_url + '/category/notes/'
        html = self._get_html(url)

        articles = parseDOM(html, 'div', attrs={'class': 'article'})
        for _id, article in enumerate(articles):
            title = parseDOM(article, 'a', ret='title')[0]
            picture = parseDOM(article, 'img', ret='src')[0]
            description = parseDOM(article, 'p')[0]
            self._albums.append({
                'title': self._parser.unescape(title),
                'album_id': _id,
                'pic': picture,
                'description': stripTags(self._parser.unescape(description)),
                'album_url': parseDOM(article, 'a', ret='href')[0]
            })

        return self._albums
Example #10
def ShindenGetVideoLink(url):

    headers = {
        'Accept': '*/*',
        'Origin': 'https://shinden.pl',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.46 Safari/537.36',
        'DNT': '1',
    }

    if str(url).startswith("//"): url = "https://" + url
    session = requests.session()
    session.get(url, headers=headers, timeout=15)
    time.sleep(5)
    video = session.get(url.replace("player_load", "player_show") +
                        "&width=508",
                        timeout=5).content
    video_url = ''
    try:
        video_url = parseDOM(video, 'iframe', ret='src')[0]
    except:
        pass
    if not video_url:
        try:
            video_url = parseDOM(video, 'a', ret='href')[0]
        except:
            pass
    if not video_url:
        try:
            video_url = re.findall("src=\"(.*?)\"", video)[0]
        except:
            pass
    if str(video_url).startswith("//"): video_url = "http:" + video_url
    return video_url
Example #11
def getSWlink(url):
    stream = ''
    playt = True
    html = getUrl(url, BASEURL3)
    if 'streamamg.com' in html:
        iframes = parseDOM(html, 'iframe', ret='src')  #[0]
        for iframe in iframes:
            if 'streamamg.' in iframe:
                html2 = getUrl(iframe, url)
                xx = re.findall('"partnerId":(\d+)', html2, re.DOTALL)[0]
                xx2 = re.findall('"rootEntryId":"(.+?)"', html2, re.DOTALL)[0]
                m3u8 = 'http://open.http.mp.streamamg.com/p/%s/playManifest/entryId/%s/format/applehttp' % (
                    xx, xx2)
                return m3u8 + '|User-Agent=' + UA + '&Referer=' + iframe, False
    elif 'unblocked.is' in html:
        iframes = parseDOM(html, 'iframe', ret='src')  #[0]
        for iframe in iframes:
            if 'unblocked.is' in iframe:
                if 'nullrefer.com' in iframe or 'href.li/' in iframe:
                    iframe = urlparse.urlparse(iframe).query
                html2 = getUrl(iframe, url)
                stream = getUnblocked(html2)
                return stream, False
    else:
        stream = re.findall('source: "(.+?)"', html, re.DOTALL)
    if stream:
        stream = stream[0]
    else:
        stream = re.findall('source src="(.+?)"', html, re.DOTALL)[0]
        playt = False
    return stream + '|User-Agent=' + UA + '&Referer=' + url, playt
Example #12
def ListTVCOMdzis(url):
    out = []
    html = getUrl(url)
    result = parseDOM(html, 'div', attrs={
        'id': 'calendar-owl'
    })[0]  #<div id="calendar-owl" class="owl-carousel">
    dzis = parseDOM(result, 'div', attrs={'class': "item today"})

    if dzis:
        dat = re.findall('<a href="\/Den\/\?d=(.+?)">DZI', dzis[0])  #[0]
        if dat:
            nagr = re.findall('"badge primary">(.+?)<', dzis[0])
            live = re.findall('"badge secondary">(.+?)<', dzis[0])
            wkrot = re.findall('"badge inverse">(.+?)<', dzis[0])
            nagr = nagr[0] if nagr else '0'
            live = live[0] if live else '0'
            wkrot = wkrot[0] if wkrot else '0'
            dod = ' - (%s, %s, %s)' % (nagr, live, wkrot)
            out.append({'href': dat[0], 'title': 'DZIŚ' + dod})
    days = parseDOM(result, 'div', attrs={'class': 'item'})
    for day in days:
        hrefday = re.findall('href="\/Den\/\?d=(.+?)">(.+?)<', day)[0]
        nagr = re.findall('"badge primary">(.+?)<', day)
        live = re.findall('"badge secondary">(.+?)<', day)
        wkrot = re.findall('"badge inverse">(.+?)<', day)
        nagr = nagr[0] if nagr else '0'
        live = live[0] if live else '0'
        wkrot = wkrot[0] if wkrot else '0'
        dod = ' - (%s, %s, %s)' % (nagr, live, wkrot)

        out.append({'href': hrefday[0], 'title': '%s%s' % (hrefday[1], dod)})
    return out
Example #13
    def _get_photos(self, album_url):
        self._photos[album_url] = []
        html = self._get_html(album_url)
        pattern = r'source data-srcset=\"(.+?)\"'
        match_image = re.findall(pattern, html)
        album_title = self._parser.unescape(parseDOM(html, 'title')[0])
        for _id, p in enumerate(parseDOM(html, 'p', attrs={'class':
                                                           'caption'})):
            match_description = re.search('<span>(.+?)</span>', p)
            if match_description:
                self._photos[album_url].append({
                    'title': '%d - %s' % (_id + 1, album_title),
                    'album_title': album_title,
                    'photo_id': _id,
                    # the page appears to emit five <source data-srcset> entries
                    # per photo, hence the stride of 5
                    'pic': match_image[_id * 5],
                    'description': stripTags(
                        self._parser.unescape(match_description.group(1))),
                    'album_url': album_url
                })

        return self._photos[album_url]
Example #14
def get_videos(url, description, ref_id, resolution_option=0, page=0):
    '''For a given topic url, returns a list of associated videos using the
    nyt REST API.
    '''
    videos = []  # ensure 'videos' is bound even if no link below yields a match
    if ref_id == '':
        html = _get_html(url)
        menu = parseDOM(html, 'div', attrs={'class': 'recent-episodes'})
        links = parseDOM(menu,
                         'a',
                         attrs={'class': 'thumb-holder'},
                         ret='href')
        if description == 'New York':
            # this section does not have direct correspondent json classification
            # so it is directly extracted from html
            videos = []
            for i, link in enumerate(links):
                video_id = re.search(r'^.+?/(\d{10,})/.+', link).group(1)
                videos.append(
                    find_video_by_video_id(video_id, resolution_option))
        else:
            for i, link in enumerate(links):
                # videos can be classified in more than one category, and the main one may not be the one we're searching for (description)
                ref_id = link.split('=')[-1]
                videos = find_playlist_by_reference_id(ref_id, description,
                                                       resolution_option, page)
                if videos != []:
                    # correct classification! (json contains Show display_name == description)
                    break
    else:
        # time not wasted examining various json urls, as we know that the received ref_id is good
        videos = find_playlist_by_reference_id(ref_id, description,
                                               resolution_option, page)
    return (videos, ref_id)
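A hedged driver sketch for the branching above: passing ref_id='' triggers the link-probing loop, while reusing the returned ref_id on later pages skips it (topic_url and the description string are illustrative values):

# First page: ref_id is unknown, so the function probes candidate playlists.
videos, ref_id = get_videos(topic_url, 'Latest Video', '', resolution_option=0, page=0)
# Next page: the discovered ref_id makes this a single lookup.
more_videos, _ = get_videos(topic_url, 'Latest Video', ref_id, resolution_option=0, page=1)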
Example #15
def getScheduleCR():
    out = []
    html = getUrl(BASEURL2)
    divs = parseDOM(html, 'div', attrs={'class': 'panel_mid_body'})
    for div in divs:
        day = parseDOM(div, 'h2')  #[0]
        if day:
            day = 'kiedy|%s' % day[0]
            out.append({'href': day})
        trs = parseDOM(div, 'tr')  #[0]
        for tr in trs:
            online = '[COLOR lime]► [/COLOR]' if tr.find(
                'images/live.gif') > 0 else '[COLOR orangered]■ [/COLOR]'
            if '>VS</td>' in tr:
                czas, dysc, team1, team2, href = re.findall(
                    '>(\d+:\d+)</td>.+?<span title="(.+?)".+?href=.+?>(.+?)<.+?>VS<.+?a href.+?>(.+?)</a>.+?<a class="watch_btn" href="(.+?)"',
                    tr, re.DOTALL)[0]
                mecz = '%s vs %s' % (team1, team2)

                czas = czas.split(':')
                hrs = int(czas[0]) + 2  # shift the listed time by two hours (site times appear to be UTC)
                if hrs == 24:
                    hrs = '00'
                mins = czas[1]
                czas = '%s:%s' % (str(hrs), mins)
            else:
                czas, dysc, team1, href = re.findall(
                    '>(\d+:\d+)</td>.+?<span title="(.+?)".+?href=.+?>(.+?)<.+?<a class="watch_btn" href="(.+?)"',
                    tr, re.DOTALL)[0]
                mecz = team1
            title = '[B][COLOR khaki]%s%s : [/COLOR][/B][COLOR gold][B]%s[/B][/COLOR]' % (
                online, czas, mecz)
            out.append({'title': title, 'href': href, 'code': dysc})
    return out
Example #16
def Kategorie():

    cookie = cache.cache_get('dramaqueen_cookie')['value']
    headersget.update({'Cookie': cookie})

    url = params['url']
    rG = requests.get(url, headers=headersget, timeout=15).content

    #    LoginCheck(url=rG)
    result = parseDOM(rG, 'div', attrs={'class': 'tagcloud'})[0]
    links = parseDOM(result, 'a', ret='href')
    label = parseDOM(result, 'a')

    count = [
        re.findall('\d+', i)[0]
        for i in parseDOM(result, 'a', ret='aria-label')
    ]

    for item in zip(label, links, count):

        addon.addDir(str(item[0]) + '   ' + '[COLOR %s]%s[/COLOR]' %
                     ('green', str(item[2]) + ' pozycji'),
                     str(item[1]),
                     mode=7,
                     fanart='',
                     plot='',
                     thumb='')
Example #17
    def _get_photos(self, album_url):
        self._photos[album_url] = []
        html = self._get_html(album_url)
        pattern = r'@media\(min-width:1592px\){#img01 \.img{background-image:url\((.+?)\)'
        id_pattern = re.compile(r'#img(\d\d)')
        album_title = parseDOM(html, 'title')[0]
        for _id, p in enumerate(parseDOM(html, 'p', attrs={'class': 'caption'})):
            match = re.search(id_pattern, p)
            if match:
                img_id = match.group(1)
                match = re.search(pattern.replace('img01', 'img%s' % img_id),
                                  html)
                if match:
                    self._photos[album_url].append({
                        'title': '%d - %s' % (_id + 1, album_title),
                        'album_title': album_title,
                        'photo_id': _id,
                        'pic': match.group(1),
                        'description': stripTags(self._parser.unescape(p)).replace(
                            '\n                #', ''),
                        'album_url': album_url,
                    })
        return self._photos[album_url]
Example #18
    def _get_albums(self):
        self._albums = []
        url = 'http://www.bostonglobe.com/news/bigpicture'

        html = self._get_html(url)

        for _id, album in enumerate(parseDOM(html, 'section')):
            title = parseDOM(album, 'a')[0]
            album_url = 'http://www.bostonglobe.com' + parseDOM(
                album, 'a', ret='href')[0]
            d = parseDOM(album, 'div', attrs={'class': 'subhead geor'})
            if not d:  # parseDOM returns [] on no match; indexing it first would raise IndexError
                continue
            description = stripTags(self._parser.unescape(d[0]))
            imgs = parseDOM(album, 'img', ret='src')
            if not imgs:
                continue
            pic = urllib2.quote(imgs[0])
            self._albums.append({
                'title': title,
                'album_id': _id,
                'pic': 'http:' + pic,
                'description': description,
                'album_url': album_url
            })

        return self._albums
Example #19
    def _get_albums(self):
        self._albums = []
        home_url = 'https://time.com'
        url = home_url + '/tag/photography/'
        html = self._get_html(url)

        articles = parseDOM(html, 'div', attrs={'class': 'taxonomy-tout'})
        for _id, article in enumerate(articles):
            title = parseDOM(article, 'h2')[0]
            picture = parseDOM(article, 'img', ret='src')[0]
            try:
                description = parseDOM(article, 'h3')[0]
            except Exception:
                description = ''
            self._albums.append({
                'title': self._parser.unescape(title),
                'album_id': _id,
                'pic': picture,
                'description': stripTags(self._parser.unescape(description)),
                'album_url': home_url + parseDOM(article, 'a', ret='href')[0]
            })

        return self._albums
Example #20
def getScheduleSW():
    out = []
    html = getUrl(BASEURL3)
    first = parseDOM(html, 'div', attrs={'class':
                                         'tab'})[0]  #<div class="tab">
    iddaydate = re.findall("event, '(.+?)'\).+?<b>(.+?)</b>.+?<b>(.+?)</b>",
                           first, re.DOTALL)
    for id, day, date in iddaydate:

        result = parseDOM(html, 'div', attrs={'id': id})[0]
        result = result.replace('a class=""', 'a class=" "')
        xxx = re.findall(
            '(\d+:\d+).*<a class="([^"]+)" href="([^"]+)">([^>]+)</a>', result)
        if xxx:
            day = ('kiedy|%s %s' % (day, date)).replace('FIRDAY', 'FRIDAY')
            out.append({'href': day})
            for czas, ikona, href, tyt in xxx:
                if '\xf0\x9f\x8e\xb1' in ikona:
                    ikona = 'snooker'
                tyt = re.sub('<font color=.+?>', '',
                             tyt).replace('</font>', '')
                if '<a href' in tyt or '<br><br' in tyt:
                    continue
                tyt = '[B][COLOR khaki]%s : [/COLOR][/B][COLOR gold][B]%s[/B][/COLOR]' % (
                    czas, tyt)
                href2 = 'http://strims.world' + href if href.startswith(
                    '/') else 'http://strims.world/' + href
                out.append({'title': tyt, 'href': href2, 'image': ikona})
    return out
Example #21
def getSWstreamsx(url):
	out=[]
	html=getUrl(url)
	try:
		result = parseDOM(html,'font',attrs = {'size':'3'})[0]
		if '<center><b>' in result:
			result = parseDOM(html,'font',attrs = {'size':'3'})[1]
		t = re.sub('--.*?>', '', result)
		result= t.replace('\r\n\r\n','')	
		try:
			xx=re.findall('(\w+)\: <a(.+?)adsbygoogle',result,re.DOTALL)
			b=xx[0]
			for x in xx:
				tit='%s'%x[0]
				aa=re.findall('href="(.+?)".+?>(.+?)</a>',x[1],re.DOTALL)
				for a in aa:
					if 'vjs' in a[0]:
						continue				
					href= a[0]
					tytul= a[1].replace('<b>','').replace('</b>','')
					tyt='%s - [B]%s[/B]'%(tytul,tit)
					href=url+href
					out.append({'href':href,'title':tyt})

		except:
			results=result.split('|')
			for result in results:
				href,name=re.findall('href="(.+?)".+?>(.+?)<\/a>',result)[0]
				href=url+href
				out.append({'href':href,'title':name.replace('<b>','').replace('</b>','')})		
		
	except:
		pass
	return out
Example #22
    def _get_albums(self):
        self._albums = []
        home_url = 'https://www.bbc.com'
        url = home_url + '/news/in_pictures'
        html = self._get_html(url)

        articles = parseDOM(html, 'div', attrs={'class': 'gs-o-media__body'})
        pictures  = parseDOM( html, 'div', attrs={'class': \
                        'gs-u-mb\+ gel-body-copy qa-post-body'} )
        descriptions = parseDOM(html, 'div', attrs={'class': 'gel-5/8@l'})
        timestamp = parseDOM(html,
                             'span',
                             attrs={'class': 'qa-post-auto-meta'})
        for _id, article in enumerate(articles):
            title = parseDOM(parseDOM(article, 'a')[0], 'span')[0]
            try:
                picture = parseDOM(pictures[_id], 'img', ret='srcset')[0]
                picture = re.search(r', (?P<bigger_url>https://[^ ]+) \d+w$',
                                    picture).group('bigger_url')
                description = parseDOM(descriptions[_id], 'p')[0]
            except Exception:
                continue
            self._albums.append({
                'title': self._parser.unescape( title ),
                'album_id': _id,
                'pic': picture,
                'description': stripTags( self._parser.unescape( description ) ) + \
                                "\n\nPosted @" + timestamp[_id],
                'album_url': home_url + parseDOM(article, 'a', ret='href')[0]
                })

        return self._albums
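The srcset handling above keeps only the last (largest) candidate; a self-contained worked example of that regex, assuming comma-separated "url width" pairs as in typical srcset markup:

import re

srcset = ('https://x.test/img-320.jpg 320w, '
          'https://x.test/img-640.jpg 640w, '
          'https://x.test/img-976.jpg 976w')
match = re.search(r', (?P<bigger_url>https://[^ ]+) \d+w$', srcset)
print(match.group('bigger_url'))  # -> https://x.test/img-976.jpg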
Example #23
def getLiveSport():
    out = []

    html = getUrl(BASEURL5, BASEURL5)

    result = parseDOM(html, 'ul', attrs={'class': "drop-list"})

    acts = parseDOM(result, 'li', attrs={'class': "active"})
    for act in acts:
        kiedy = re.findall('"text">(.+?)<\/span><\/a>',
                           act)[0]  #>12 September, Today</span></a>
        day = 'kiedy|%s' % kiedy
        out.append({'href': day})

        act = act.replace("\'", '"')
        links = parseDOM(act, 'li')  #[0]
        for link in links:
            #	print link
            href = parseDOM(link, 'a', ret='href')[0]
            href = 'https://livesport.ws' + href if href.startswith(
                '/') else href
            try:
                team1 = re.findall('right;">(.+?)<\/div>', link)[0]
                team2 = re.findall('left;">(.+?)<\/div>', link)[0]
                mecz = '%s vs %s' % (team1, team2)
            except:
                mecz = re.findall('center;.+?>(.+?)<', link)[0]
            dysc = re.findall('"competition">(.+?)</', link)  #[0]
            dysc = dysc[0] if dysc else ''
            ikon = parseDOM(link, 'img', ret='src')[0]
            datas = parseDOM(link, 'span',
                             attrs={'class': "date"})[0]  #<span class="date">
            liv = parseDOM(datas, 'i')[0]

            online = '[COLOR lime]► [/COLOR]' if 'live' in liv.lower(
            ) else '[COLOR orangered]■ [/COLOR]'
            id = parseDOM(link, 'i', ret='id')  #[0]
            if id:
                postid = re.findall('(\d+)', href)[0]
                eventid = id[0]
                href += '|event_id=%s|post_id=%s|' % (eventid, postid)
            #if 'live' in liv.lower():
            #	online =
            czas = parseDOM(datas, 'i',
                            ret='data-datetime')[0]  #attrs = {'class':"date"})
            st = re.findall('(\d+:\d+)', czas)[0]
            czas1 = str(int(st.split(':')[0]) - 1)
            czas = re.sub('\d+:', czas1 + ':', czas)
            title = '[B][COLOR khaki]%s%s : [/COLOR][/B][COLOR gold][B]%s[/B][/COLOR]' % (
                online, czas, mecz)
            out.append({
                'title': title,
                'href': href,
                'image': ikon,
                'code': dysc
            })
    #except:
    #	pass
    return out
Example #24
def get_domain_icon(entry_name, domain):
    import requests
    from CommonFunctions import parseDOM
    subs_dict = {}

    req = 'http://%s' % domain

    r = requests.get(req)

    if r.status_code == requests.codes.ok:
        try:
            og_url = parseDOM(
                r.text, "meta", attrs={"property": "og:url"}, ret="content"
            )[0]  #<meta content="https://www.blogger.com" property="og:url">
        except:
            og_url = req

        a = parseDOM(r.text,
                     "meta",
                     attrs={"property": "og:image"},
                     ret="content")
        b = parseDOM(r.text,
                     "link",
                     attrs={"rel": "apple-touch-icon"},
                     ret="href")
        c = parseDOM(r.text,
                     "link",
                     attrs={"rel": "apple-touch-icon-precomposed"},
                     ret="href")
        d = parseDOM(r.text, "link", attrs={"rel": "icon"}, ret="href")

        i = next((item for item in [a, b, c, d] if item), '')
        if i:

            try:
                icon = urlparse.urljoin(og_url,
                                        i[-1])  #handle relative or absolute

                subs_dict.update({
                    'entry_name': entry_name,
                    'display_name': domain,
                    'icon_img': icon,
                })

                return subs_dict

            except IndexError:
                pass
        else:
            log("    can't parse icon: get_domain_icon (%s)" % (domain))
    else:
        log('    getting get_domain_icon (%s) info:%s' %
            (domain, r.status_code))
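An illustrative call (the entry name is a placeholder): on success the function returns a dict shaped like the one built above, and it implicitly returns None when no icon can be parsed:

info = get_domain_icon('my_entry', 'blogger.com')  # 'my_entry' is hypothetical
if info:
    icon_url = info['icon_img']  # absolute URL, resolved against the page's og:url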
Example #25
def get_topics():
    '''Returns a list of (topic_name, url) of available topics'''
    html = _get_html(BASE_URL)
    menu = parseDOM(html, 'div', attrs={'class': 'header-container[^\'"]*'})
    topics_url = parseDOM(menu, 'a', ret='href')
    topics_description = parseDOM(menu, 'a')
    links_indexes = [
        x for x, y in enumerate(topics_url) if y.startswith('/video/')
    ]
    topics = [(stripTags(topics_description[i]),
               NYT_URL_BASE + topics_url[i][1:]) for i in links_indexes]
    topics.insert(0, (LATEST_VIDEOS, _url('/video/latest-video/')))
    return topics
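A minimal consumption sketch for the (topic_name, url) pairs returned above; printing is illustrative, a Kodi add-on would feed them into a directory listing instead:

for topic_name, topic_url in get_topics():
    print('%s -> %s' % (topic_name, topic_url))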
Example #26
def places(item):
    if (DEBUG): logger.info("[channel.py] places")
    itemlist = []
    html = _get_html( item.url )
    places = parseDOM( html, 'a', attrs={'class': 'locationLink'} )
    places_url = parseDOM( html, 'a', attrs={'class': 'locationLink'}, ret='href' )
    for _id, place in enumerate( places ):
        title = place
        url   = PLACES_URL + places_url[_id]
        item=Item(action='place', title=title , url=url, thumbnail='',
                    fanart='', plot='' )
        itemlist.append( item )
    return itemlist
Example #27
def get_domain_icon( entry_name, domain, check_this_url_instead_of_domain=None ):
    import requests
    from CommonFunctions import parseDOM
    subs_dict={}
    #import pprint
    if check_this_url_instead_of_domain:
        req=check_this_url_instead_of_domain
    else:
        req='http://%s' %domain

    #log('get_domain_icon request='+req)
    #log('headers:' + repr(headers))
    r = requests.get( req )
    #log(repr(r.text))
    if r.status_code == requests.codes.ok:
        try:
            og_url = parseDOM(r.text, "meta", attrs={"property": "og:url"}, ret="content")[0]  #<meta content="https://www.blogger.com" property="og:url">
        except:
            og_url = req
        #a=parseDOM(r.text, "link", attrs = { "rel": "shortcut icon" }, ret="href" ) #returns an ico file. we skip this
        a=parseDOM(r.text, "meta", attrs = { "property": "og:image" }, ret="content" )
        b=parseDOM(r.text, "link", attrs = { "rel": "apple-touch-icon" }, ret="href" )
        c=parseDOM(r.text, "link", attrs = { "rel": "apple-touch-icon-precomposed" }, ret="href" )
        d=parseDOM(r.text, "link", attrs = { "rel": "icon" }, ret="href" )

        i=next((item for item in [a,b,c,d] if item ), '')
        if i:
            #log( "    icon candidates:" + repr(i))
            try:
                icon=urlparse.urljoin(og_url, i[-1]) #handle relative or absolute
                #make structure same as that returned by get_subreddit_info()
                subs_dict.update( {'entry_name':entry_name,
                                   'display_name':domain,
                                   'icon_img': icon,
#                                   'header_img': j.get('header_img'), #not used? usually similar to with icon_img
#                                   'title':j.get('title'),
#                                   'header_title':j.get('header_title'),
#                                   'public_description':j.get('public_description'),
#                                   'subreddit_type':j.get('subreddit_type'),
#                                   'subscribers':j.get('subscribers'),
#                                   'created':j.get('created'),        #public, private
#                                   'over18':j.get('over18'),
                                   } )
                #log( pprint.pformat(subs_dict, indent=1) )
                return subs_dict

            except IndexError:
                pass
        else:
            log( "    can't parse icon: get_domain_icon (%s)" %(domain) )
    else:
        log( '    getting get_domain_icon (%s) info:%s' %(domain, r.status_code) )
Example #29
def getChannelsCR():
	out=[]
	html=getUrl(BASEURL2)
	result = parseDOM(html,'ul',attrs = {'class':"nav-sidebar"})[0]#<div class="arrowgreen">
	channels = parseDOM(result,'li')
	for channel in channels:
		if '<ul class="nav-submenu">' in channel:
			continue
		try:
			href = parseDOM(channel,'a',ret='href')[0]
			title = parseDOM(channel,'a',ret='title')[0]
			out.append({'href':href,'title':'[COLOR lime]► [/COLOR] [B][COLOR gold]'+title+'[/COLOR][/B]'})
		except:
			pass
	return out	
Example #30
    def _get_photos(self, album_url):
        self._photos[album_url] = []
        html = self._get_html(album_url)
        html = html.replace('srcSet', 'srcset')
        album_title = self._parser.unescape(parseDOM(html, 'title')[0])
        pictures = parseDOM(html,
                            'img',
                            attrs={'class': '.+Image[^"]+'},
                            ret='srcset')
        descriptions = parseDOM(html, 'figcaption')
        if (len(descriptions) == 0):
            descriptions = [''] * len(pictures)
        id_picture = 0
        for _id, description in enumerate(descriptions):
            try:
                description = stripTags( self._parser.unescape( description ) ).\
                                replace( 'image caption','' )
                condition = True
                while (condition):
                    picture = pictures[id_picture]
                    picture = re.search(
                        r', (?P<bigger_url>https://[^ ]+) \d+w$',
                        picture).group('bigger_url')
                    id_picture += 1
                    if (re.search(r'(transparent|line)[^\."]+\.png',
                                  picture) == None):
                        condition = False
                if (description == ''
                        and re.search(r'banner[^\."]+\.png', picture) != None):
                    continue
                self._photos[album_url].append({
                    'title': '%d - %s' % (_id + 1, album_title),
                    'album_title': album_title,
                    'photo_id': _id,
                    'pic': picture,
                    'description': self._parser.unescape(description),
                    'album_url': album_url
                })
            except Exception:
                continue

        return self._photos[album_url]
Example #31
def ListEpisodes():

    cookie = cache.cache_get('dramaqueen_cookie')['value']
    headersget.update({'Cookie': cookie})

    name = params['name']
    thumb = params['img']
    url = params['url']

    rE = str(requests.get(url, headers=headersget, timeout=15).content)
    LoginCheck(rE)

    rE = rE.replace('&#8211;', '-')
    rE = rE.replace('&nbsp;', ' ')
    result = parseDOM(rE, 'div', attrs={'class': 'container'})[1]
    results = re.findall('av_toggle_section(.+?)<span', result)
    episodes = parseDOM(results, 'p')

    plot = parseDOM(rE, 'em')[0]
    plot = CleanHTML(plot)

    fanart = re.findall('background-image: url\((.+?)\);', rE)[1]

    inprogress = '[COLOR=red][I]  w tłumaczeniu[/COLOR][/I]'
    incorrection = '[COLOR=red][I]  korekta[/COLOR][/I]'

    for item in episodes:
        if 'tłumaczenie' in item:
            title = inprogress
        elif 'korekta' in item:
            title = incorrection
        else:
            title = item
        addon.addLink(str(title),
                      url,
                      mode=5,
                      fanart=str(fanart),
                      plot=str(plot),
                      thumb=str(fanart))
Example #32
def ListTVCOMlinksDysc2(html):
    out = []
    videos = parseDOM(html, 'div', attrs={'id': "video-selector"})[0]
    vids = parseDOM(videos, 'div', attrs={'class': "media"})
    for vid in vids:
        try:
            href, tyt = re.findall('href="(.+?)">(.+?)<\/a>', vid)[0]
        except:
            tyt = re.findall('>(.+?)<\/h4>', vid)[0]
            href = re.findall('href="(.+?)"', vid)[0]
        href = 'https://www.tvcom.pl' + href if href.startswith('/') else href
        imag = re.findall('src="(.+?)"', vid)[0]
        dat = re.findall('<h5>(.+?)<\/h5>', vid)[0]
        tytul = '(%s) %s' % (dat, tyt)
        out.append({'href': href, 'title': tytul, 'imag': imag})
    return out
Example #33
    def _get_albums(self):
        self._albums = []
        url = 'http://www.theatlantic.com/infocus/'
        html = self._get_html(url)
        pattern = r'@media\(min-width:1632px\){#river1 \.lead-image{background-image:url\((.+?)\)'
        for _id, li in enumerate(parseDOM(html, 'li', attrs={'class': 'article'})):
            headline = parseDOM(li, 'h1')[0]
            match = re.search(pattern.replace('river1', 'river%d' % (_id + 1)), html)
            if match:
                self._albums.append({
                    'title': parseDOM(headline, 'a')[0],
                    'album_id': _id,
                    'pic': match.group(1),
                    'description': stripTags(self._parser.unescape(parseDOM(li, 'p', attrs={'class': 'dek'})[0])),
                    'album_url': 'http://www.theatlantic.com' + parseDOM(headline, 'a', ret='href')[0],
                })
        return self._albums
Example #34
    def _get_photos(self, album_url):
        self._photos[album_url] = []
        html = self._get_html(album_url)
        album_title = parseDOM(html, 'title')[0]
        images = parseDOM(html, 'div', attrs={'class': 'photo'})
        descs = parseDOM(html, 'article', attrs={'class': 'pcaption'})

        for _id, photo in enumerate(images):
            pic = urllib2.quote(parseDOM(photo, 'img', ret='src')[0])
            description = stripTags(parseDOM(descs[_id], 'div', attrs={'class': 'gcaption geor'})[0])
            self._photos[album_url].append({
                'title': '%d - %s' % (_id + 1, album_title),
                'album_title': album_title,
                'photo_id': _id,
                'pic': 'http:' + pic,
                'description': description,
                'album_url': album_url
            })
        return self._photos[album_url]
Example #35
def cams(item):
    if (DEBUG): logger.info("[channel.py] cams")
    itemlist = []

    if (DEBUG): logger.info("url=" + item.url)

    html = _get_html( item.url )
    divs = parseDOM( html, 'div', attrs={'class': r'[^\'"]*?col\-xs\-12' } )
    for _id, div in enumerate( divs ):
        thumbnail = parseDOM( div, 'img', ret='src' )[0].replace('256x144', '512x288').replace('128x72', '256x144')
        url       = parseDOM( div, 'a', ret='href' )[0]
        if 'www.earthcam.com' not in url or 'alexa' in url or 'myearthcam' in url:
            continue
        title     = parseDOM( div, 'span', attrs={'class': 'featuredTitle'} )[0]
        location  = parseDOM( div, 'div', attrs={ 'class': 'featuredCity' } )[0]
        plot      = title + "\n(" + location + ')'
        if plot is None: plot = ''
        if (DEBUG): logger.info("%s, %s, %s, %s, %s" % (title, thumbnail, url, location, plot))
        item=Item(action="play", title=title, url=url, thumbnail=thumbnail,
            fanart=thumbnail, plot=plot )
        itemlist.append( item )

    # more cameras from front page
    if (DEBUG): logger.info("url=" + URL)

    html = _get_html( URL )
    divs = parseDOM( html, 'div', attrs={ 'class': '[^\'"]*?camera_block[^\'"]*?' } )

    for _id, div in enumerate(divs):
        if not re.search( r'//www.earthcam.com/[^"}\']+?\?cam=', div ):
            continue
        try:
            title     = parseDOM( div, 'img', ret='title')[0].replace('EarthCam: ','')
            thumbnail = parseDOM( div, 'img', ret='src')[0].replace('256x144', '512x288').replace('128x72', '256x144')
            url       = URL + re.search( r'//www.earthcam.com/([^"}\']+)', div ).group(1)
            location  = parseDOM( div, 'div', attrs={ 'class': '[^\'"]*?thumbnailTitle[^\'"]*?' } )[0]
            plot      = title
            if (DEBUG): logger.info("cams : %s, %s, %s, %s, %s" % (title, thumbnail, url, location, plot))
        except:
            continue
        item=Item(action="play", title=title , url=url, thumbnail=thumbnail,
                fanart=thumbnail, plot=plot )
        itemlist.append( item )
        #if _id >= 12:
        #    break

    return itemlist
Example #36
    def _get_photos(self, album_url):
        self._photos[album_url] = []
        html = self._get_html(album_url)
        pattern = r'@media\(min-width:1592px\){#img01 \.img{background-image:url\((.+?)\)'
        id_pattern = re.compile(r'#img(\d\d)')
        album_title = parseDOM(html, 'title')[0]
        for _id, p in enumerate(parseDOM(html, 'p', attrs={'class': 'caption'})):
            match = re.search(id_pattern, p)
            if match:
                img_id = match.group(1)
                match = re.search(pattern.replace('img01', 'img%s' % img_id), html)
                if match:
                    self._photos[album_url].append({
                        'title': '%d - %s' % (_id + 1, album_title),
                        'album_title': album_title,
                        'photo_id': _id,
                        'pic': match.group(1),
                        'description': stripTags(self._parser.unescape(p)).replace('\n                #', ''),
                        'album_url': album_url,
                    })
        return self._photos[album_url]
Example #37
def get_domain_icon( entry_name, domain ):
    import requests
    from CommonFunctions import parseDOM
    subs_dict={}

    req='http://%s' %domain

    r = requests.get( req )

    if r.status_code == requests.codes.ok:
        try:
            og_url = parseDOM(r.text, "meta", attrs={"property": "og:url"}, ret="content")[0]  #<meta content="https://www.blogger.com" property="og:url">
        except:
            og_url = req

        a=parseDOM(r.text, "meta", attrs = { "property": "og:image" }, ret="content" )
        b=parseDOM(r.text, "link", attrs = { "rel": "apple-touch-icon" }, ret="href" )
        c=parseDOM(r.text, "link", attrs = { "rel": "apple-touch-icon-precomposed" }, ret="href" )
        d=parseDOM(r.text, "link", attrs = { "rel": "icon" }, ret="href" )

        i=next((item for item in [a,b,c,d] if item ), '')
        if i:

            try:
                icon=urlparse.urljoin(og_url, i[-1]) #handle relative or absolute

                subs_dict.update( {'entry_name':entry_name,
                                   'display_name':domain,
                                   'icon_img': icon,

                                   } )

                return subs_dict

            except IndexError:
                pass
        else:
            log( "    can't parse icon: get_domain_icon (%s)" %(domain) )
    else:
        log( '    getting get_domain_icon (%s) info:%s' %(domain, r.status_code) )
Example #38
    def _get_albums(self):
        self._albums = []
        url = 'http://www.bostonglobe.com/news/bigpicture'

        html = self._get_html(url)

        for _id, album in enumerate(parseDOM(html, 'section')):
            title = parseDOM(album, 'a')[0]
            album_url = 'http://www.bostonglobe.com' + parseDOM(album, 'a', ret='href')[0]
            d = parseDOM(album, 'div', attrs={'class': 'subhead geor'})
            if not d:  # guard before indexing; parseDOM returns [] on no match
                continue
            description = stripTags(self._parser.unescape(d[0]))
            imgs = parseDOM(album, 'img', ret='src')
            if not imgs:
                continue
            pic = urllib2.quote(imgs[0])
            self._albums.append({
                'title': title,
                'album_id': _id,
                'pic': 'http:' + pic,
                'description': description,
                'album_url': album_url})

        return self._albums
Example #39
def _get_category(item, category):
    itemlist = []

    if (DEBUG): logger.info("url=" + item.url)

    html = _get_html( item.url )

    divs = parseDOM( html, 'div', attrs={'class': '[^\'"]*?col\-xs\-[^\'"]+?result_column_[AB][^\'"]*'})

    (title, thumbnail, url, location, plot) = ('', '', '', '', '')

    if divs:
        for _id, div in enumerate( divs ):
            try:
                # column_A (even) contains thumbnail whilst column_B (odd) contains the rest of infos...
                if ( _id % 2 == 0 ):
                    # column_A: thumbnail
                    thumbnail = parseDOM( div, 'img', attrs={'class': '[^\'"]*thumbnailImage[^\'"]*'}, ret='src' )[0].replace('256x144', '512x288').replace('128x72', '256x144')
                else:
                    # column_B
                    url       = parseDOM( div, 'a', attrs={'class': 'camTitle'}, ret='href' )[0]
                    # discard (almost all) the external links:
                    if not re.search( r'(//www.earthcam.com/|//(www.)?youtube.com/)', url ):
                        #bStopNavigation = True
                        #break
                        continue
                    title     = parseDOM( parseDOM( div, 'a', attrs={'class': 'camTitle'} ), 'span' )[0].replace('EarthCam: ', '')
                    location  = parseDOM( div, 'div', attrs={'class': 'cam_location'} )[0]
                    plot      = parseDOM( div, 'div', attrs={'class': 'cam_description'} )[0]
                    if plot is None: plot = ''
                    if (DEBUG): logger.info("%s, %s, %s, %s, %s" % (title, thumbnail, url, location, plot))
                    item=Item(action="play", title=title, url=url, thumbnail=thumbnail,
                        fanart=thumbnail, plot=plot )
                    itemlist.append( item )
            except:
                continue
    else:
        divs = parseDOM( html, 'div', attrs={'class': r'[^\'"]*?col\-xs\-12' } )
        zone = parseDOM( html, 'p', attrs={ 'class': 'pageTitle' } )[0].replace(':', '')
        for _id, div in enumerate( divs ):
            thumbnail = parseDOM( div, 'img', ret='src' )[0].replace('256x144', '512x288').replace('128x72', '256x144')
            url       = parseDOM( div, 'a', ret='href' )[0]
            title     = parseDOM( div, 'span', attrs={'class': 'featuredTitle'} )[0]
            location  = parseDOM( div, 'div', attrs={ 'class': 'featuredCity' } )[0] + ', ' + zone
            plot      = title + "\n(" + location + ')'
            if plot is None: plot = ''
            if (DEBUG): logger.info("%s, %s, %s, %s, %s" % (title, thumbnail, url, location, plot))
            item=Item(action="play", title=title, url=url, thumbnail=thumbnail,
                fanart=thumbnail, plot=plot )
            itemlist.append( item )

    try:
        links = parseDOM( parseDOM( html, 'div', attrs={'id': 'pagination_bottom'} ), 'a', ret='href' )
        links_text = parseDOM( parseDOM( html, 'div', attrs={'id': 'pagination_bottom'} ), 'a' )
        link = links[-1]
        if re.search(r'^Next', links_text[-1]):
            url = link
            if category.startswith('search'):
                url = URL + RESULTS_URL + url[1:]
                category = 'search_results'
            else:
                url = URL + PREFIX_PATCH + url[1:]
            if (DEBUG): logger.info(url)
            item=Item(action=category, title='Next >>' , url=url, thumbnail='',
                    fanart='', plot='' )
            itemlist.append( item )
    except:
        pass

    return itemlist
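The even/odd branching above carries the column_A thumbnail across loop iterations; an equivalent pairing idiom (a sketch, not the add-on's code) makes the column_A/column_B coupling explicit:

# Pair each thumbnail column (even index) with its info column (odd index).
for col_a, col_b in zip(divs[0::2], divs[1::2]):
    thumbnail = parseDOM(col_a, 'img', attrs={'class': '[^\'"]*thumbnailImage[^\'"]*'}, ret='src')[0]
    url = parseDOM(col_b, 'a', attrs={'class': 'camTitle'}, ret='href')[0]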