def getSearchResult(self, baseUrl, cat, searchType):
    """Scrape a search-results page and register the entries found.

    baseUrl    -- URL of the search results page to fetch
    cat        -- category name; when empty each hit is added as a video,
                  otherwise as a directory tagged with this category
    searchType -- 'filmy' selects the movies section, anything else series
    """
    printDBG("getSearchResult for url[%s] searchType[%s]" % (baseUrl, searchType))
    sts, data = self.cm.getPage(baseUrl)
    if not sts:
        printDBG("getSearchResult problem")
        return

    # Section delimiters: pick the movies or the series header.
    if 'filmy' == searchType:
        marer1 = '<div class="results_title">[^<]*?Filmy:[^<]*?</div>'
    else:
        marer1 = '<div class="results_title">[^<]*?Seriale:[^<]*?</div>'
    marer2 = '<div class="results_title">'

    sts, data = CParsingHelper.getDataBeetwenReMarkers(
        data, re.compile(marer1), re.compile(marer2), False)
    if not sts:
        printDBG("getSearchResult problem no data beetween markers")
        return

    items = data.split('<div class="result box pl-round"')
    if len(items) > 1:
        del items[0]  # drop the preamble before the first result box

    for item in items:
        item = item.replace('<br/>', '')
        # url & title are mandatory -- skip the entry otherwise
        m = re.search('<a href="([^"]+?)" class="en pl-white">([^<]+?)</a>', item)
        if not m:
            continue
        url = self.MAINURL + m.group(1)
        title = m.group(2).replace('\n', '').replace('\r', '').strip()
        # optional thumbnail
        m = re.search('<img src="([^"]+?)"', item)
        img = m.group(1) if m else ''
        # optional plot
        m = re.search('<p>([^<]+?)</p>', item)
        plot = m.group(1) if m else ''
        params = {'title': title, 'url': url, 'icon': img, 'plot': plot}
        if cat == '':
            self.addVideo(params)
        else:
            params['name'] = 'category'
            params['category'] = cat
            self.addDir(params)
def getFilmTab(self, url, category, pager):
    """List movie items from a listing page, adding a next-page dir if present.

    url      -- listing page URL to fetch
    category -- category name propagated to the next-page item
    pager    -- current page number (int or numeric string)
    """
    sts, data = self.cm.getPage(url, {'header': self.HEADER})
    if not sts:
        return
    # FIX: the '?' must be escaped -- unescaped it is a regex quantifier
    # making the preceding 'y' optional instead of matching the literal
    # '?' that starts the query string in the next-page href.
    nextPage = re.search('<li><a href="/filmy\?.+?" rel="next">»</a></li>', data)
    data = CParsingHelper.getDataBeetwenMarkers(
        data, '<div class="row-fluid movie-item">', '<div class="container">', False)[1]
    data = data.split('<div class="row-fluid movie-item">')
    # compile once -- reused for every item below
    titleA = re.compile('<a class="title"[^>]+?>')
    titleB = re.compile('</small>')
    plotA = re.compile('<p class="desc">')
    plotB = re.compile('</div>')
    for item in data:
        title = CParsingHelper.getDataBeetwenReMarkers(item, titleA, titleB, False)[1]
        page = self.MAINURL + CParsingHelper.getSearchGroups(item, 'class="title" href="([^"]+?)"', 1)[0]
        plot = CParsingHelper.getDataBeetwenReMarkers(item, plotA, plotB, False)[1]
        img = CParsingHelper.getSearchGroups(item, 'src="([^"]+?)"', 1)[0]
        # title and target URL are mandatory; icon/plot may be empty
        if '' != title and '' != page:
            params = {'title': title, 'page': page, 'icon': img, 'plot': plot}
            self.addVideo(params)
    if nextPage:
        params = {'name': 'nextpage', 'category': category,
                  'title': 'Następna strona', 'page': str(int(pager) + 1)}
        self.addDir(params)
def listItems(self, cItem, category):
    """List entries from the page referenced by cItem.

    cItem    -- item dict carrying at least 'url' and optionally 'page'
    category -- 'video' adds playable items, any other value directories
    """
    printDBG("SeansikTV.listItems")
    page = cItem.get('page', 1)
    url = self._addPage(cItem.get('url'), page)
    sts, data = self.cm.getPage(url)
    if not sts:
        return

    # Next-page detection: the pager fragment following the active page
    # number must still contain a 'page' link.
    pagerBlock = CParsingHelper.getDataBeetwenMarkers(
        data, '<b class="active">%d</b>' % page, '</div>', False)[1]
    hasNextPage = 'page' in pagerBlock
    if hasNextPage:
        page += 1

    sts, data = CParsingHelper.getDataBeetwenMarkers(
        data, '<div class="content table-sofi', '<div class="content">', False)
    for item in data.split('<div class="content table-sofi'):
        icon = self._getFullUrl(
            CParsingHelper.getSearchGroups(item, 'src="([^"]+?jpg)"')[0])
        sts, tmp = CParsingHelper.getDataBeetwenReMarkers(
            item, re.compile('<td colspan="2"[^>]+?>'), re.compile('</td>'), False)
        url = self._getFullUrl(
            CParsingHelper.getSearchGroups(tmp, 'href="([^"]+?)"')[0])
        # text before the closing anchor is the title, the tail the description
        parts = tmp.split('</a>')
        title = self.cleanHtmlStr(parts[0])
        desc = self.cleanHtmlStr(parts[-1])
        # skip entries lacking the mandatory fields
        if '' == url or '' == title:
            continue
        params = {'name': 'category', 'category': category, 'title': title,
                  'url': url, 'icon': icon, 'desc': desc}
        if 'video' != category:
            self.addDir(params)
        else:
            self.addVideo(params)

    if hasNextPage:
        params = dict(cItem)
        params.update({'title': 'Następna strona', 'page': page})
        self.addDir(params)
def getSearchResult(self, baseUrl, cat, searchType):
    """Fetch baseUrl, cut out the section matching searchType and add items.

    With an empty cat every match becomes a video entry; otherwise each
    becomes a directory carrying 'category': cat.
    """
    printDBG("getSearchResult for url[%s] searchType[%s]" % (baseUrl, searchType))
    sts, data = self.cm.getPage(baseUrl)
    if not sts:
        printDBG("getSearchResult problem")
        return

    # The section start marker depends on the requested search type.
    marker1 = ('<div class="results_title">[^<]*?Filmy:[^<]*?</div>'
               if searchType == 'filmy'
               else '<div class="results_title">[^<]*?Seriale:[^<]*?</div>')
    marker2 = '<div class="results_title">'
    sts, data = CParsingHelper.getDataBeetwenReMarkers(
        data, re.compile(marker1), re.compile(marker2), False)
    if not sts:
        printDBG("getSearchResult problem no data beetween markers")
        return

    entries = data.split('<div class="result box pl-round"')
    if len(entries) > 1:
        entries = entries[1:]  # skip the preamble before the first box

    for entry in entries:
        entry = entry.replace('<br/>', '')
        # mandatory link + title; bail out on this entry otherwise
        linkMatch = re.search('<a href="([^"]+?)" class="en pl-white">([^<]+?)</a>', entry)
        if linkMatch is None:
            continue
        url = self.MAINURL + linkMatch.group(1)
        title = linkMatch.group(2).replace('\n', '').replace('\r', '').strip()
        imgMatch = re.search('<img src="([^"]+?)"', entry)
        img = imgMatch.group(1) if imgMatch else ''
        plotMatch = re.search('<p>([^<]+?)</p>', entry)
        plot = plotMatch.group(1) if plotMatch else ''
        params = {'title': title, 'url': url, 'icon': img, 'plot': plot}
        if '' == cat:
            self.addVideo(params)
        else:
            params.update({'name': 'category', 'category': cat})
            self.addDir(params)
def parseListBase(self, data, type='video'):
    """Parse a list of YouTube HTML fragments into item-param dicts.

    data -- list of HTML snippets, one per listed item
    type -- key into urlPatterns selecting how the item URL is extracted
            ('video', 'channel', 'playlist', 'movie', 'live', 'tray')

    Returns a list of dicts with keys: type, category, title, url, icon,
    time, desc. Fragments missing title, url or icon are skipped.
    """
    printDBG("parseListBase----------------")
    # per-type triple: [item type tag, url-extraction regex, url prefix]
    urlPatterns = {
        'video': ['video', 'href="[ ]*?(/watch\?v=[^"]+?)"', ''],
        'channel': ['category', 'href="(/[^"]+?)"', ''],
        'playlist': ['category', 'list=([^"]+?)"', '/playlist?list='],
        'movie': ['video', 'data-context-item-id="([^"]+?)"', '/watch?v='],
        'live': ['video', 'href="(/watch\?v=[^"]+?)"', ''],
        'tray': ['video', 'data-video-id="([^"]+?)"', '/watch?v='],
    }
    currList = []
    for i in range(len(data)):
        # get required params
        url = urlPatterns[type][2] + self.getAttributes(urlPatterns[type][1], data[i])
        # get title -- try several page markups in turn
        title = ''
        if '' == title:
            title = self.getAttributes('data-context-item-title="([^"]+?)"', data[i])
        if '' == title:
            title = self.getAttributes('data-video-title="([^"]+?)"', data[i])
        if '' == title:
            sts, title = CParsingHelper.getDataBeetwenMarkers(
                data[i], '<h3 class="yt-lockup-title">', '</h3>', False)
        if '' == title:
            sts, title = CParsingHelper.getDataBeetwenReMarkers(
                data[i], re.compile('<span [^>]*?class="title[^>]*?>'),
                re.compile('</span>'), False)
        if '' == title:
            sts, title = CParsingHelper.getDataBeetwenReMarkers(
                data[i], re.compile('class="pl-video-title-link[^>]*?>'),
                re.compile('<'), False)
        if '' == title:
            # last resort: find the opening tag carrying the
            # yt-lockup-title class and cut up to its closing tag
            titleMarker = self.cm.ph.getSearchGroups(
                data[i], '(<[^">]+?"yt-lockup-title[^"]*?"[^>]*?>)')[0]
            if '' != titleMarker:
                tidx = titleMarker.find(' ')
                if tidx > 0:
                    tmarker = titleMarker[1:tidx]
                    title = self.cm.ph.getDataBeetwenMarkers(
                        data[i], titleMarker, '</%s>' % tmarker)[1]
        if '' != title:
            title = CParsingHelper.cleanHtmlStr(title)
        if i == 0:
            printDBG(data[i])
        # thumbnail -- several fallbacks, .gif placeholders rejected
        img = self.getAttributes('data-thumb="([^"]+?\.jpg[^"]*?)"', data[i])
        if '' == img:
            img = self.getAttributes('src="([^"]+?\.jpg[^"]*?)"', data[i])
        if '' == img:
            img = self.getAttributes('<img[^>]+?data\-thumb="([^"]+?)"', data[i])
        if '' == img:
            img = self.getAttributes('<img[^>]+?src="([^"]+?)"', data[i])
        if '.gif' in img:
            img = ''
        # duration
        time = self.getAttributes('data-context-item-time="([^"]+?)"', data[i])
        if '' == time:
            time = self.getAttributes('class="video-time">([^<]+?)</span>', data[i])
        if '' == time:
            sts, time = CParsingHelper.getDataBeetwenReMarkers(
                data[i], re.compile('pl-video-time"[^>]*?>'), re.compile('<'), False)
        if '' == time:
            sts, time = CParsingHelper.getDataBeetwenReMarkers(
                data[i], re.compile('timestamp"[^>]*?>'), re.compile('<'), False)
        time = time.strip()
        # desc -- collect several candidate fragments
        descTab = []
        desc = self.cm.ph.getDataBeetwenMarkers(
            data[i], '<div class="yt-lockup-meta', '</div>')[1]
        if desc != '':
            descTab.append(desc)
        desc = self.cm.ph.getDataBeetwenMarkers(
            data[i], '<span class="formatted-video-count', '</span>')[1]
        if desc != '':
            descTab.append(desc)
        desc = self.cm.ph.getDataBeetwenReMarkers(
            data[i], re.compile('class="video-description[^>]+?>'),
            re.compile('</p>'), False)[1]
        if '' == desc:
            desc = self.cm.ph.getDataBeetwenReMarkers(
                data[i], re.compile('class="yt-lockup-description[^>]+?>'),
                re.compile('</div>'), False)[1]
        if desc != '':
            descTab.append(desc)
        newDescTab = []
        for desc in descTab:
            desc = CParsingHelper.cleanHtmlStr(desc)
            if desc != '':
                newDescTab.append(desc)
        # drop anything after a ';' and, for videos, extra query params
        urlTmp = url.split(';')
        if len(urlTmp) > 0:
            url = urlTmp[0]
        if type == 'video':
            url = url.split('&')[0]
        if title != '' and url != '' and img != '':
            correctUrlTab = [url, img]
            # FIX: this index used to be 'i', shadowing the outer loop
            # variable; renamed to 'j'.
            for j in range(len(correctUrlTab)):
                if not correctUrlTab[j].startswith('http:') and not correctUrlTab[j].startswith('https:'):
                    if correctUrlTab[j].startswith("//"):
                        correctUrlTab[j] = 'http:' + correctUrlTab[j]
                    else:
                        correctUrlTab[j] = 'http://www.youtube.com' + correctUrlTab[j]
            title = CParsingHelper.cleanHtmlStr(title)
            params = {
                'type': urlPatterns[type][0],
                'category': type,
                'title': title,
                'url': correctUrlTab[0],
                # NOTE(review): the original applied .replace('&', '&')
                # here -- a no-op, removed; presumably it was meant to
                # decode '&amp;' -- confirm before changing behavior.
                'icon': correctUrlTab[1],
                'time': time,
                'desc': '[/br]'.join(newDescTab)
            }
            currList.append(params)
    return currList
def parseListBase(self, data, type='video'):
    """Parse a list of YouTube HTML fragments into item-param dicts.

    data -- list of HTML snippets, one per listed item
    type -- key into urlPatterns selecting how the item URL is extracted
            ('video', 'channel', 'playlist', 'movie', 'live', 'tray')

    Returns a list of dicts with keys: type, category, title, url, icon,
    time, desc. Fragments missing title, url or img are skipped.
    Note: .decode/.encode calls below are Python 2 byte-string handling.
    """
    printDBG("parseListBase----------------")
    # per-type triple: [item type tag, url-extraction regex, url prefix]
    urlPatterns = {
        'video': ['video', 'href="[ ]*?(/watch\?v=[^"]+?)"', ''],
        'channel': ['category', 'href="(/[^"]+?)"', ''],
        'playlist': ['category', 'list=([^"]+?)"', '/playlist?list='],
        'movie': ['video', 'data-context-item-id="([^"]+?)"', '/watch?v='],
        'live': ['video', 'href="(/watch\?v=[^"]+?)"', ''],
        'tray': ['video', 'data-video-id="([^"]+?)"', '/watch?v='],
    }
    currList = []
    for i in range(len(data)):
        # get required params
        url = urlPatterns[type][2] + self.getAttributes(urlPatterns[type][1], data[i])
        # get title -- try several page markups in turn
        title = ''
        if '' == title:
            title = self.getAttributes('data-context-item-title="([^"]+?)"', data[i])
        if '' == title:
            title = self.getAttributes('data-video-title="([^"]+?)"', data[i])
        if '' == title:
            sts, title = CParsingHelper.getDataBeetwenMarkers(
                data[i], '<h3 class="yt-lockup-title">', '</h3>', False)
        if '' == title:
            sts, title = CParsingHelper.getDataBeetwenReMarkers(
                data[i], re.compile('<span [^>]*?class="title[^>]*?>'),
                re.compile('</span>'), False)
        if '' == title:
            sts, title = CParsingHelper.getDataBeetwenReMarkers(
                data[i], re.compile('class="pl-video-title-link[^>]*?>'),
                re.compile('<'), False)
        if '' == title:
            # last resort: find the opening tag carrying the
            # yt-lockup-title class and cut up to its closing tag
            titleMarker = self.cm.ph.getSearchGroups(
                data[i], '(<[^">]+?"yt-lockup-title[^"]*?"[^>]*?>)')[0]
            if '' != titleMarker:
                tidx = titleMarker.find(' ')
                if tidx > 0:
                    tmarker = titleMarker[1:tidx]
                    title = self.cm.ph.getDataBeetwenMarkers(
                        data[i], titleMarker, '</%s>' % tmarker)[1]
        if '' != title:
            title = CParsingHelper.removeDoubles(remove_html_markup(title, ' '), ' ')
        # thumbnail
        img = self.getAttributes('data-thumb="([^"]+?\.jpg)"', data[i])
        if '' == img:
            img = self.getAttributes('src="([^"]+?\.jpg)"', data[i])
        # duration
        time = self.getAttributes('data-context-item-time="([^"]+?)"', data[i])
        if '' == time:
            time = self.getAttributes('class="video-time">([^<]+?)</span>', data[i])
        if '' == time:
            sts, time = CParsingHelper.getDataBeetwenReMarkers(
                data[i], re.compile('pl-video-time"[^>]*?>'), re.compile('<'), False)
        if '' == time:
            sts, time = CParsingHelper.getDataBeetwenReMarkers(
                data[i], re.compile('timestamp"[^>]*?>'), re.compile('<'), False)
        time = time.strip()
        # desc
        sts, desc = CParsingHelper.getDataBeetwenReMarkers(
            data[i], re.compile('class="video-description[^>]+?>'),
            re.compile('</p>'), False)
        if '' == desc:
            sts, desc = CParsingHelper.getDataBeetwenReMarkers(
                data[i], re.compile('class="yt-lockup-description[^>]+?>'),
                re.compile('</div>'), False)
        desc = CParsingHelper.removeDoubles(remove_html_markup(desc, ' '), ' ')
        # drop anything after a ';' and, for videos, extra query params
        urlTmp = url.split(';')
        if len(urlTmp) > 0:
            url = urlTmp[0]
        if type == 'video':
            url = url.split('&')[0]
        if title != '' and url != '' and img != '':
            correctUrlTab = [url, img]
            # FIX: this index used to be 'i', shadowing the outer loop
            # variable; renamed to 'j'.
            for j in range(len(correctUrlTab)):
                if not correctUrlTab[j].startswith('http:') and not correctUrlTab[j].startswith('https:'):
                    if correctUrlTab[j].startswith("//"):
                        correctUrlTab[j] = 'http:' + correctUrlTab[j]
                    else:
                        correctUrlTab[j] = 'http://www.youtube.com' + correctUrlTab[j]
                else:
                    # downgrade https to plain http
                    if correctUrlTab[j].startswith('https:'):
                        correctUrlTab[j] = "http:" + correctUrlTab[j][6:]
            title = clean_html(title.decode("utf-8")).encode("utf-8")
            desc = clean_html(desc.decode("utf-8")).encode("utf-8")
            params = {'type': urlPatterns[type][0], 'category': type,
                      'title': title, 'url': correctUrlTab[0],
                      'icon': correctUrlTab[1], 'time': time, 'desc': desc}
            currList.append(params)
    return currList
def parseListBase(self, data, type='video'):
    """Parse a list of YouTube HTML fragments into item-param dicts.

    data -- list of HTML snippets, one per listed item
    type -- key into urlPatterns selecting how the item URL is extracted
            ('video', 'channel', 'playlist', 'movie', 'live', 'tray')

    Returns a list of dicts with keys: type, category, title, url, icon,
    time, desc. Fragments missing title, url or img are skipped.
    Note: .decode/.encode calls below are Python 2 byte-string handling.
    """
    printDBG("parseListBase----------------")
    # per-type triple: [item type tag, url-extraction regex, url prefix]
    urlPatterns = {
        'video': ['video', 'href="[ ]*?(/watch\?v=[^"]+?)"', ''],
        'channel': ['category', 'href="(/[^"]+?)"', ''],
        'playlist': ['category', 'list=([^"]+?)"', '/playlist?list='],
        'movie': ['video', 'data-context-item-id="([^"]+?)"', '/watch?v='],
        'live': ['video', 'href="(/watch\?v=[^"]+?)"', ''],
        'tray': ['video', 'data-video-id="([^"]+?)"', '/watch?v='],
    }
    currList = []
    for i in range(len(data)):
        # get required params
        url = urlPatterns[type][2] + self.getAttributes(urlPatterns[type][1], data[i])
        # get title -- try several page markups in turn
        title = ''
        if '' == title:
            title = self.getAttributes('data-context-item-title="([^"]+?)"', data[i])
        if '' == title:
            title = self.getAttributes('data-video-title="([^"]+?)"', data[i])
        if '' == title:
            sts, title = CParsingHelper.getDataBeetwenMarkers(
                data[i], '<h3 class="yt-lockup-title">', '</h3>', False)
        if '' == title:
            sts, title = CParsingHelper.getDataBeetwenReMarkers(
                data[i], re.compile('<span [^>]*?class="title[^>]*?>'),
                re.compile('</span>'), False)
        if '' == title:
            sts, title = CParsingHelper.getDataBeetwenReMarkers(
                data[i], re.compile('class="pl-video-title-link[^>]*?>'),
                re.compile('<'), False)
        if '' != title:
            title = CParsingHelper.removeDoubles(remove_html_markup(title, ' '), ' ')
        # thumbnail
        img = self.getAttributes('data-thumb="([^"]+?\.jpg)"', data[i])
        if '' == img:
            img = self.getAttributes('src="([^"]+?\.jpg)"', data[i])
        # duration
        time = self.getAttributes('data-context-item-time="([^"]+?)"', data[i])
        if '' == time:
            time = self.getAttributes('class="video-time">([^<]+?)</span>', data[i])
        if '' == time:
            sts, time = CParsingHelper.getDataBeetwenReMarkers(
                data[i], re.compile('pl-video-time"[^>]*?>'), re.compile('<'), False)
        if '' == time:
            sts, time = CParsingHelper.getDataBeetwenReMarkers(
                data[i], re.compile('timestamp"[^>]*?>'), re.compile('<'), False)
        time = time.strip()
        # desc
        sts, desc = CParsingHelper.getDataBeetwenReMarkers(
            data[i], re.compile('class="video-description[^>]+?>'),
            re.compile('</p>'), False)
        if '' == desc:
            sts, desc = CParsingHelper.getDataBeetwenReMarkers(
                data[i], re.compile('class="yt-lockup-description[^>]+?>'),
                re.compile('</div>'), False)
        desc = CParsingHelper.removeDoubles(remove_html_markup(desc, ' '), ' ')
        # drop anything after a ';' and, for videos, extra query params
        urlTmp = url.split(';')
        if len(urlTmp) > 0:
            url = urlTmp[0]
        if type == 'video':
            url = url.split('&')[0]
        if title != '' and url != '' and img != '':
            correctUrlTab = [url, img]
            # FIX: this index used to be 'i', shadowing the outer loop
            # variable; renamed to 'j'.
            for j in range(len(correctUrlTab)):
                if not correctUrlTab[j].startswith('http:') and not correctUrlTab[j].startswith('https:'):
                    if correctUrlTab[j].startswith("//"):
                        correctUrlTab[j] = 'http:' + correctUrlTab[j]
                    else:
                        correctUrlTab[j] = 'http://www.youtube.com' + correctUrlTab[j]
                else:
                    # downgrade https to plain http
                    if correctUrlTab[j].startswith('https:'):
                        correctUrlTab[j] = "http:" + correctUrlTab[j][6:]
            title = clean_html(title.decode("utf-8")).encode("utf-8")
            desc = clean_html(desc.decode("utf-8")).encode("utf-8")
            params = {
                'type': urlPatterns[type][0],
                'category': type,
                'title': title,
                'url': correctUrlTab[0],
                'icon': correctUrlTab[1],
                'time': time,
                'desc': desc
            }
            currList.append(params)
    return currList