Пример #1
0
    def getSearchResult(self, baseUrl, cat, searchType):
        """List the hits found in one section of a search results page.

        The page at ``baseUrl`` is downloaded and narrowed to the section
        headed "Filmy:" when ``searchType`` is ``'filmy'``, and to the one
        headed "Seriale:" for any other value.  Every hit that carries a
        title link is added with ``addVideo`` when ``cat`` is the empty
        string, otherwise with ``addDir`` tagged with ``cat`` as category.
        Nothing is returned; failures are only reported through printDBG.
        """
        printDBG("getSearchResult for url[%s] searchType[%s]" %
                 (baseUrl, searchType))

        sts, html = self.cm.getPage(baseUrl)
        if not sts:
            printDBG("getSearchResult problem")
            return

        # The section always ends where the next results header starts;
        # only the opening header depends on the requested kind.
        sectionEnd = '<div class="results_title">'
        if searchType == 'filmy':
            sectionStart = '<div class="results_title">[^<]*?Filmy:[^<]*?</div>'
        else:
            sectionStart = '<div class="results_title">[^<]*?Seriale:[^<]*?</div>'

        sts, section = CParsingHelper.getDataBeetwenReMarkers(
            html, re.compile(sectionStart), re.compile(sectionEnd), False)
        if not sts:
            printDBG("getSearchResult problem no data beetween markers")
            return

        # Whatever precedes the first result box is not a hit, so skip it.
        chunks = section.split('<div class="result box pl-round"')
        for chunk in chunks[1:]:
            chunk = chunk.replace('<br/>', '')

            # url & title -- a hit without a title link is ignored
            link = re.search(
                '<a href="([^"]+?)" class="en pl-white">([^<]+?)</a>', chunk)
            if not link:
                continue
            url = self.MAINURL + link.group(1)
            title = link.group(2).replace('\n', '').replace('\r', '').strip()

            # img
            found = re.search('<img src="([^"]+?)"', chunk)
            img = found.group(1) if found else ''

            # plot
            found = re.search('<p>([^<]+?)</p>', chunk)
            plot = found.group(1) if found else ''

            params = {'title': title, 'url': url, 'icon': img, 'plot': plot}
            if cat == '':
                self.addVideo(params)
            else:
                params['name'] = 'category'
                params['category'] = cat
                self.addDir(params)
Пример #2
0
 def getFilmTab(self, url, category, pager):
     """Add every movie listed on the page at ``url`` as a video item.

     The page is fetched with ``self.HEADER`` as request header.  Each
     movie block that yields a non-empty title and page link is passed
     to ``addVideo``.  When the page contains a "next" pager link, a
     'nextpage' directory entry for ``category`` is appended whose page
     number is ``pager`` + 1 (as a string).  Returns nothing.
     """
     sts, html = self.cm.getPage(url, {'header': self.HEADER})
     if not sts:
         return

     # Searched on the full page, before it is cut down to the item list.
     hasNext = re.search('<li><a href="/filmy?.+?" rel="next">&raquo;</a></li>', html)

     itemMarker = '<div class="row-fluid  movie-item">'
     listing = CParsingHelper.getDataBeetwenMarkers(
         html, itemMarker, '<div class="container">', False)[1]

     titleStart = re.compile('<a class="title"[^>]+?>')
     titleEnd = re.compile('</small>')
     plotStart = re.compile('<p class="desc">')
     plotEnd = re.compile('</div>')

     for block in listing.split(itemMarker):
         title = CParsingHelper.getDataBeetwenReMarkers(
             block, titleStart, titleEnd, False)[1]
         href = CParsingHelper.getSearchGroups(
             block, 'class="title" href="([^"]+?)"', 1)[0]
         page = self.MAINURL + href
         plot = CParsingHelper.getDataBeetwenReMarkers(
             block, plotStart, plotEnd, False)[1]
         img = CParsingHelper.getSearchGroups(block, 'src="([^"]+?)"', 1)[0]

         # Blocks without a title or link are not real entries.
         if title == '' or page == '':
             continue
         self.addVideo({'title': title, 'page': page, 'icon': img, 'plot': plot})

     if hasNext:
         self.addDir({
             'name': 'nextpage',
             'category': category,
             'title': 'Następna strona',
             'page': str(int(pager) + 1),
         })
Пример #3
0
    def listItems(self, cItem, category):
        """List the table rows of one result page described by ``cItem``.

        ``cItem`` supplies the base 'url' and, optionally, the 'page'
        number (1 when absent).  Every row that yields a URL and a title
        is added with ``addVideo`` when ``category`` is ``'video'`` and
        with ``addDir`` otherwise.  If the pager shows more pages after
        the active one, a copy of ``cItem`` pointing at the following
        page is appended as a directory entry.  Returns nothing.
        """
        printDBG("SeansikTV.listItems")

        page = cItem.get('page', 1)
        pageUrl = self._addPage(cItem.get('url'), page)
        sts, html = self.cm.getPage(pageUrl)
        if not sts:
            return

        # check next page: look at what follows the active page marker
        pagerTail = CParsingHelper.getDataBeetwenMarkers(
            html, '<b class="active">%d</b>' % page, '</div>', False)[1]
        hasNextPage = 'page' in pagerTail
        if hasNextPage:
            page += 1

        rowMarker = '<div class="content table-sofi'
        listing = CParsingHelper.getDataBeetwenMarkers(
            html, rowMarker, '<div class="content">', False)[1]

        cellStart = re.compile('<td colspan="2"[^>]+?>')
        cellEnd = re.compile('</td>')
        for row in listing.split(rowMarker):
            icon = self._getFullUrl(
                CParsingHelper.getSearchGroups(row, 'src="([^"]+?jpg)"')[0])

            cell = CParsingHelper.getDataBeetwenReMarkers(
                row, cellStart, cellEnd, False)[1]
            url = self._getFullUrl(
                CParsingHelper.getSearchGroups(cell, 'href="([^"]+?)"')[0])

            # str.split always yields at least one piece, so the last
            # piece exists even when the cell holds no '</a>' at all.
            pieces = cell.split('</a>')
            title = self.cleanHtmlStr(pieces[0])
            desc = self.cleanHtmlStr(pieces[-1])

            # validate data
            if url == '' or title == '':
                continue

            params = {
                'name': 'category',
                'category': category,
                'title': title,
                'url': url,
                'icon': icon,
                'desc': desc
            }
            if category == 'video':
                self.addVideo(params)
            else:
                self.addDir(params)

        if hasNextPage:
            nextItem = dict(cItem)
            nextItem['title'] = 'Następna strona'
            nextItem['page'] = page
            self.addDir(nextItem)
    def getSearchResult(self, baseUrl, cat, searchType):
        """Download a search page and publish the hits of one section.

        ``searchType`` equal to ``'filmy'`` selects the section whose
        header contains "Filmy:"; any other value selects the "Seriale:"
        section.  Hits are published through ``addVideo`` when ``cat`` is
        the empty string, and through ``addDir`` (with ``cat`` stored as
        the category) otherwise.  Returns nothing; problems are only
        logged with printDBG.
        """
        def firstGroup(pattern, text):
            # First capture group of the first match, '' when none.
            hit = re.search(pattern, text)
            return hit.group(1) if hit else ''

        printDBG("getSearchResult for url[%s] searchType[%s]" % (baseUrl, searchType))

        sts, pageData = self.cm.getPage(baseUrl)
        if not sts:
            printDBG("getSearchResult problem")
            return

        if searchType == 'filmy':
            headerRe = '<div class="results_title">[^<]*?Filmy:[^<]*?</div>'
        else:
            headerRe = '<div class="results_title">[^<]*?Seriale:[^<]*?</div>'
        # The wanted section stops at the next results header of any kind.
        nextHeaderRe = '<div class="results_title">'

        sts, block = CParsingHelper.getDataBeetwenReMarkers(
            pageData, re.compile(headerRe), re.compile(nextHeaderRe), False)
        if not sts:
            printDBG("getSearchResult problem no data beetween markers")
            return

        results = block.split('<div class="result box pl-round"')
        # The leading piece sits before the first result box: drop it.
        for entry in results[1:]:
            entry = entry.replace('<br/>', '')

            # url & title
            anchor = re.search('<a href="([^"]+?)" class="en pl-white">([^<]+?)</a>', entry)
            if anchor is None:
                continue

            params = {
                'title': anchor.group(2).replace('\n', '').replace('\r', '').strip(),
                'url': self.MAINURL + anchor.group(1),
                'icon': firstGroup('<img src="([^"]+?)"', entry),
                'plot': firstGroup('<p>([^<]+?)</p>', entry),
            }
            if cat == '':
                self.addVideo(params)
                continue
            params['name'] = 'category'
            params['category'] = cat
            self.addDir(params)
Пример #5
0
    def parseListBase(self, data, type='video'):
        """Turn raw HTML item fragments into a list of item dictionaries.

        Args:
            data: sequence of HTML fragments, one per listed item; each
                fragment is only handed to the regex/marker helpers.
            type: key selecting the URL extraction rule; one of 'video',
                'channel', 'playlist', 'movie', 'live' or 'tray'.  Any
                other key raises KeyError on the first fragment.

        Returns:
            A list of dicts with the keys 'type', 'category', 'title',
            'url', 'icon', 'time' and 'desc'.  Fragments for which no
            title, URL or thumbnail could be extracted are left out.
        """
        printDBG("parseListBase----------------")
        # type -> [item type, regex capturing the id/path, URL prefix]
        urlPatterns = {
            'video': ['video', r'href="[ ]*?(/watch\?v=[^"]+?)"', ''],
            'channel': ['category', r'href="(/[^"]+?)"', ''],
            'playlist': ['category', r'list=([^"]+?)"', '/playlist?list='],
            'movie': ['video', r'data-context-item-id="([^"]+?)"', '/watch?v='],
            'live': ['video', r'href="(/watch\?v=[^"]+?)"', ''],
            'tray': ['video', r'data-video-id="([^"]+?)"', '/watch?v='],
        }

        # Loop-invariant patterns, compiled once instead of per fragment.
        tagOpen = re.compile('<')
        spanTitleStart = re.compile('<span [^>]*?class="title[^>]*?>')
        spanEnd = re.compile('</span>')
        plTitleStart = re.compile('class="pl-video-title-link[^>]*?>')
        plTimeStart = re.compile('pl-video-time"[^>]*?>')
        timestampStart = re.compile('timestamp"[^>]*?>')
        videoDescStart = re.compile('class="video-description[^>]+?>')
        paragraphEnd = re.compile('</p>')
        lockupDescStart = re.compile('class="yt-lockup-description[^>]+?>')
        divEnd = re.compile('</div>')

        currList = []
        for idx, item in enumerate(data):
            # get required params
            itemType, urlRegex, urlPrefix = urlPatterns[type]
            url = urlPrefix + self.getAttributes(urlRegex, item)

            # get title: try each known markup variant until one matches
            title = self.getAttributes(
                r'data-context-item-title="([^"]+?)"', item)
            if title == '':
                title = self.getAttributes(r'data-video-title="([^"]+?)"', item)
            if title == '':
                title = CParsingHelper.getDataBeetwenMarkers(
                    item, '<h3 class="yt-lockup-title">', '</h3>', False)[1]
            if title == '':
                title = CParsingHelper.getDataBeetwenReMarkers(
                    item, spanTitleStart, spanEnd, False)[1]
            if title == '':
                title = CParsingHelper.getDataBeetwenReMarkers(
                    item, plTitleStart, tagOpen, False)[1]
            if title == '':
                # Last resort: any tag carrying a yt-lockup-title class;
                # its tag name is used to build the closing marker.
                titleMarker = self.cm.ph.getSearchGroups(
                    item, r'(<[^">]+?"yt-lockup-title[^"]*?"[^>]*?>)')[0]
                if titleMarker != '':
                    tidx = titleMarker.find(' ')
                    if tidx > 0:
                        tagName = titleMarker[1:tidx]
                        title = self.cm.ph.getDataBeetwenMarkers(
                            item, titleMarker, '</%s>' % tagName)[1]

            if title != '':
                title = CParsingHelper.cleanHtmlStr(title)
            if idx == 0:
                # Debug aid: dump the first fragment only.
                printDBG(item)

            # thumbnail: prefer .jpg sources, then any <img> source
            img = self.getAttributes(r'data-thumb="([^"]+?\.jpg[^"]*?)"', item)
            if img == '':
                img = self.getAttributes(r'src="([^"]+?\.jpg[^"]*?)"', item)
            if img == '':
                img = self.getAttributes(
                    r'<img[^>]+?data\-thumb="([^"]+?)"', item)
            if img == '':
                img = self.getAttributes(r'<img[^>]+?src="([^"]+?)"', item)
            if '.gif' in img:
                img = ''

            # duration label
            duration = self.getAttributes(
                r'data-context-item-time="([^"]+?)"', item)
            if duration == '':
                duration = self.getAttributes(
                    r'class="video-time">([^<]+?)</span>', item)
            if duration == '':
                duration = CParsingHelper.getDataBeetwenReMarkers(
                    item, plTimeStart, tagOpen, False)[1]
            if duration == '':
                duration = CParsingHelper.getDataBeetwenReMarkers(
                    item, timestampStart, tagOpen, False)[1]
            duration = duration.strip()

            # desc: collect the raw pieces, then keep the non-empty
            # ones after HTML cleaning
            rawDescParts = []
            piece = self.cm.ph.getDataBeetwenMarkers(
                item, '<div class="yt-lockup-meta', '</div>')[1]
            if piece != '':
                rawDescParts.append(piece)
            piece = self.cm.ph.getDataBeetwenMarkers(
                item, '<span class="formatted-video-count', '</span>')[1]
            if piece != '':
                rawDescParts.append(piece)
            piece = self.cm.ph.getDataBeetwenReMarkers(
                item, videoDescStart, paragraphEnd, False)[1]
            if piece == '':
                piece = self.cm.ph.getDataBeetwenReMarkers(
                    item, lockupDescStart, divEnd, False)[1]
            if piece != '':
                rawDescParts.append(piece)

            descParts = []
            for piece in rawDescParts:
                piece = CParsingHelper.cleanHtmlStr(piece)
                if piece != '':
                    descParts.append(piece)

            # Cut everything from the first ';' on; for videos also drop
            # any extra query parameters after the first '&'.
            url = url.split(';')[0]
            if type == 'video':
                url = url.split('&')[0]

            if title == '' or url == '' or img == '':
                continue

            # Make both links absolute; links that already carry an
            # http:/https: scheme are left untouched.
            absolute = []
            for link in (url, img):
                if not link.startswith(('http:', 'https:')):
                    if link.startswith('//'):
                        link = 'http:' + link
                    else:
                        link = 'http://www.youtube.com' + link
                absolute.append(link)

            title = CParsingHelper.cleanHtmlStr(title)
            currList.append({
                'type': itemType,
                'category': type,
                'title': title,
                'url': absolute[0],
                'icon': absolute[1].replace('&amp;', '&'),
                'time': duration,
                'desc': '[/br]'.join(descParts)
            })

        return currList
    def parseListBase(self, data, type='video'):
        """Turn raw HTML item fragments into a list of item dictionaries.

        Args:
            data: sequence of HTML fragments, one per listed item; each
                fragment is only handed to the regex/marker helpers.
            type: key selecting the URL extraction rule; one of 'video',
                'channel', 'playlist', 'movie', 'live' or 'tray'.  Any
                other key raises KeyError on the first fragment.

        Returns:
            A list of dicts with the keys 'type', 'category', 'title',
            'url', 'icon', 'time' and 'desc'.  Fragments for which no
            title, URL or .jpg thumbnail could be extracted are left out.
            https: links are rewritten to http:.

        NOTE(review): the final decode("utf-8")/encode("utf-8") round
        trip assumes title and desc are byte strings -- confirm.
        """
        printDBG("parseListBase----------------")
        # type -> [item type, regex capturing the id/path, URL prefix]
        urlPatterns = {
            'video': ['video', r'href="[ ]*?(/watch\?v=[^"]+?)"', ''],
            'channel': ['category', r'href="(/[^"]+?)"', ''],
            'playlist': ['category', r'list=([^"]+?)"', '/playlist?list='],
            'movie': ['video', r'data-context-item-id="([^"]+?)"', '/watch?v='],
            'live': ['video', r'href="(/watch\?v=[^"]+?)"', ''],
            'tray': ['video', r'data-video-id="([^"]+?)"', '/watch?v='],
        }

        # Loop-invariant patterns, compiled once instead of per fragment.
        tagOpen = re.compile('<')
        spanTitleStart = re.compile('<span [^>]*?class="title[^>]*?>')
        spanEnd = re.compile('</span>')
        plTitleStart = re.compile('class="pl-video-title-link[^>]*?>')
        plTimeStart = re.compile('pl-video-time"[^>]*?>')
        timestampStart = re.compile('timestamp"[^>]*?>')
        videoDescStart = re.compile('class="video-description[^>]+?>')
        paragraphEnd = re.compile('</p>')
        lockupDescStart = re.compile('class="yt-lockup-description[^>]+?>')
        divEnd = re.compile('</div>')

        currList = []
        for item in data:
            # get required params
            itemType, urlRegex, urlPrefix = urlPatterns[type]
            url = urlPrefix + self.getAttributes(urlRegex, item)

            # get title: try each known markup variant until one matches
            title = self.getAttributes(
                r'data-context-item-title="([^"]+?)"', item)
            if title == '':
                title = self.getAttributes(r'data-video-title="([^"]+?)"', item)
            if title == '':
                title = CParsingHelper.getDataBeetwenMarkers(
                    item, '<h3 class="yt-lockup-title">', '</h3>', False)[1]
            if title == '':
                title = CParsingHelper.getDataBeetwenReMarkers(
                    item, spanTitleStart, spanEnd, False)[1]
            if title == '':
                title = CParsingHelper.getDataBeetwenReMarkers(
                    item, plTitleStart, tagOpen, False)[1]
            if title == '':
                # Last resort: any tag carrying a yt-lockup-title class;
                # its tag name is used to build the closing marker.
                titleMarker = self.cm.ph.getSearchGroups(
                    item, r'(<[^">]+?"yt-lockup-title[^"]*?"[^>]*?>)')[0]
                if titleMarker != '':
                    tidx = titleMarker.find(' ')
                    if tidx > 0:
                        tagName = titleMarker[1:tidx]
                        title = self.cm.ph.getDataBeetwenMarkers(
                            item, titleMarker, '</%s>' % tagName)[1]

            if title != '':
                title = CParsingHelper.removeDoubles(
                    remove_html_markup(title, ' '), ' ')

            # thumbnail: only .jpg sources are accepted
            img = self.getAttributes(r'data-thumb="([^"]+?\.jpg)"', item)
            if img == '':
                img = self.getAttributes(r'src="([^"]+?\.jpg)"', item)

            # duration label
            duration = self.getAttributes(
                r'data-context-item-time="([^"]+?)"', item)
            if duration == '':
                duration = self.getAttributes(
                    r'class="video-time">([^<]+?)</span>', item)
            if duration == '':
                duration = CParsingHelper.getDataBeetwenReMarkers(
                    item, plTimeStart, tagOpen, False)[1]
            if duration == '':
                duration = CParsingHelper.getDataBeetwenReMarkers(
                    item, timestampStart, tagOpen, False)[1]
            duration = duration.strip()

            # desc
            desc = CParsingHelper.getDataBeetwenReMarkers(
                item, videoDescStart, paragraphEnd, False)[1]
            if desc == '':
                desc = CParsingHelper.getDataBeetwenReMarkers(
                    item, lockupDescStart, divEnd, False)[1]
            desc = CParsingHelper.removeDoubles(
                remove_html_markup(desc, ' '), ' ')

            # Cut everything from the first ';' on; for videos also drop
            # any extra query parameters after the first '&'.
            url = url.split(';')[0]
            if type == 'video':
                url = url.split('&')[0]

            if title == '' or url == '' or img == '':
                continue

            # Make both links absolute http: URLs.
            absolute = []
            for link in (url, img):
                if link.startswith('https:'):
                    # len('https:') == 6, so this keeps the '//host/...' part
                    link = "http:" + link[6:]
                elif not link.startswith('http:'):
                    if link.startswith("//"):
                        link = 'http:' + link
                    else:
                        link = 'http://www.youtube.com' + link
                absolute.append(link)

            title = clean_html(title.decode("utf-8")).encode("utf-8")
            desc = clean_html(desc.decode("utf-8")).encode("utf-8")
            currList.append({
                'type': itemType,
                'category': type,
                'title': title,
                'url': absolute[0],
                'icon': absolute[1],
                'time': duration,
                'desc': desc
            })

        return currList
Пример #7
0
    def parseListBase(self, data, type='video'):
        """Turn raw HTML item fragments into a list of item dictionaries.

        Args:
            data: sequence of HTML fragments, one per listed item; each
                fragment is only handed to the regex/marker helpers.
            type: key selecting the URL extraction rule; one of 'video',
                'channel', 'playlist', 'movie', 'live' or 'tray'.  Any
                other key raises KeyError on the first fragment.

        Returns:
            A list of dicts with the keys 'type', 'category', 'title',
            'url', 'icon', 'time' and 'desc'.  Fragments for which no
            title, URL or .jpg thumbnail could be extracted are left out.
            https: links are rewritten to http:.

        NOTE(review): the final decode("utf-8")/encode("utf-8") round
        trip assumes title and desc are byte strings -- confirm.
        """
        printDBG("parseListBase----------------")
        # type -> [item type, regex capturing the id/path, URL prefix]
        urlPatterns = {
            'video': ['video', r'href="[ ]*?(/watch\?v=[^"]+?)"', ''],
            'channel': ['category', r'href="(/[^"]+?)"', ''],
            'playlist': ['category', r'list=([^"]+?)"', '/playlist?list='],
            'movie': ['video', r'data-context-item-id="([^"]+?)"', '/watch?v='],
            'live': ['video', r'href="(/watch\?v=[^"]+?)"', ''],
            'tray': ['video', r'data-video-id="([^"]+?)"', '/watch?v='],
        }

        # Loop-invariant patterns, compiled once instead of per fragment.
        tagOpen = re.compile('<')
        spanTitleStart = re.compile('<span [^>]*?class="title[^>]*?>')
        spanEnd = re.compile('</span>')
        plTitleStart = re.compile('class="pl-video-title-link[^>]*?>')
        plTimeStart = re.compile('pl-video-time"[^>]*?>')
        timestampStart = re.compile('timestamp"[^>]*?>')
        videoDescStart = re.compile('class="video-description[^>]+?>')
        paragraphEnd = re.compile('</p>')
        lockupDescStart = re.compile('class="yt-lockup-description[^>]+?>')
        divEnd = re.compile('</div>')

        currList = []
        for fragment in data:
            # get required params
            itemType, urlRegex, urlPrefix = urlPatterns[type]
            url = urlPrefix + self.getAttributes(urlRegex, fragment)

            # get title: try each known markup variant until one matches
            title = self.getAttributes(
                r'data-context-item-title="([^"]+?)"', fragment)
            if title == '':
                title = self.getAttributes(
                    r'data-video-title="([^"]+?)"', fragment)
            if title == '':
                title = CParsingHelper.getDataBeetwenMarkers(
                    fragment, '<h3 class="yt-lockup-title">', '</h3>',
                    False)[1]
            if title == '':
                title = CParsingHelper.getDataBeetwenReMarkers(
                    fragment, spanTitleStart, spanEnd, False)[1]
            if title == '':
                title = CParsingHelper.getDataBeetwenReMarkers(
                    fragment, plTitleStart, tagOpen, False)[1]

            if title != '':
                title = CParsingHelper.removeDoubles(
                    remove_html_markup(title, ' '), ' ')

            # thumbnail: only .jpg sources are accepted
            img = self.getAttributes(r'data-thumb="([^"]+?\.jpg)"', fragment)
            if img == '':
                img = self.getAttributes(r'src="([^"]+?\.jpg)"', fragment)

            # duration label
            duration = self.getAttributes(
                r'data-context-item-time="([^"]+?)"', fragment)
            if duration == '':
                duration = self.getAttributes(
                    r'class="video-time">([^<]+?)</span>', fragment)
            if duration == '':
                duration = CParsingHelper.getDataBeetwenReMarkers(
                    fragment, plTimeStart, tagOpen, False)[1]
            if duration == '':
                duration = CParsingHelper.getDataBeetwenReMarkers(
                    fragment, timestampStart, tagOpen, False)[1]
            duration = duration.strip()

            # desc
            desc = CParsingHelper.getDataBeetwenReMarkers(
                fragment, videoDescStart, paragraphEnd, False)[1]
            if desc == '':
                desc = CParsingHelper.getDataBeetwenReMarkers(
                    fragment, lockupDescStart, divEnd, False)[1]
            desc = CParsingHelper.removeDoubles(
                remove_html_markup(desc, ' '), ' ')

            # Cut everything from the first ';' on; for videos also drop
            # any extra query parameters after the first '&'.
            url = url.split(';')[0]
            if type == 'video':
                url = url.split('&')[0]

            if title == '' or url == '' or img == '':
                continue

            # Make both links absolute http: URLs.
            absolute = []
            for link in (url, img):
                if link.startswith('https:'):
                    # len('https:') == 6, so this keeps the '//host/...' part
                    link = "http:" + link[6:]
                elif not link.startswith('http:'):
                    if link.startswith("//"):
                        link = 'http:' + link
                    else:
                        link = 'http://www.youtube.com' + link
                absolute.append(link)

            title = clean_html(title.decode("utf-8")).encode("utf-8")
            desc = clean_html(desc.decode("utf-8")).encode("utf-8")
            currList.append({
                'type': itemType,
                'category': type,
                'title': title,
                'url': absolute[0],
                'icon': absolute[1],
                'time': duration,
                'desc': desc
            })

        return currList