示例#1
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse a saved page and build the matching ``ParseResult``.

        Six parser rules are registered and run over ``fname``.  The first
        case below that matches decides what is returned:

        * an ``iframe`` found under ``div#video`` -> a ``'video'`` result; links
          found under ``div#extras`` are attached as controls;
        * a ``div`` found under ``div#video`` (but no iframe) -> an empty
          result, after printing a "broken video" notice;
        * entries found under ``div.thumb`` -> an ``'hrefs'`` result with
          thumbnails, pagination links and category controls.

        :param fname: path of the downloaded page, handed to
            ``proceed_parcing``.
        :param base_url: URL the page was loaded from; forwarded to
            ``get_href`` for every collected ``href``.
        :return: a ``ParseResult``; it stays empty when nothing usable is
            found (including a player page that lists no ``.mp4`` source).
        """
        parser = SiteParser()

        def absolute(href):
            # All collected links are passed through get_href with base_url.
            return self.get_href(href, base_url)

        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([('div', 'class', 'thumb')])
        startpage_rule.add_process_rule_level('a', {'title', 'href'})
        startpage_rule.add_process_rule_level('img', {'src', 'alt'})
        startpage_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(startpage_rule)

        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level(
            [('div', 'class', 'wp-pagenavi')])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(startpage_pages_rule)

        startpage_hrefs_rule = ParserRule()
        startpage_hrefs_rule.add_activate_rule_level([('ul', 'class', 'list')])
        startpage_hrefs_rule.add_process_rule_level('a', {'href'})
        startpage_hrefs_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(startpage_hrefs_rule)

        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('div', 'id', 'video')])
        video_rule.add_process_rule_level('iframe', {'src'})
        parser.add_rule(video_rule)

        # Same container as video_rule but matching a plain div: used below to
        # recognise a video page whose player iframe was not found.
        fake_video_rule = ParserRule()
        fake_video_rule.add_activate_rule_level([('div', 'id', 'video')])
        fake_video_rule.add_process_rule_level('div', {})
        parser.add_rule(fake_video_rule)

        gallery_href_rule = ParserRule()
        gallery_href_rule.add_activate_rule_level([('div', 'id', 'extras')])
        gallery_href_rule.add_process_rule_level('a', {'href'})
        gallery_href_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(gallery_href_rule)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        if video_rule.is_result():
            src = video_rule.get_result()[0]['src']
            # Index 4 of the urlparse() result is the query string.
            query = parse_qs(urlparse(src)[4])

            alternates = []

            if 'f' in query:
                # The iframe carries an 'f' token: POST it to the loader
                # script and take the first entry of the 'l' list it returns.
                php_url = 'http://donfreeporn.com/wp-content/themes/detube/Htplugins/Loader.php*'
                url = URL(php_url, 'POST', post_data={'data': query['f'][0]})

                r = load(url)
                video_url = URL(r.json()['l'][0])
            else:
                # Otherwise load the iframe page and read the jwplayer setup
                # call, collecting every source entry that mentions '.mp4'.
                r = load(URL(src))
                setup = self.quotes(r.text, 'jwplayer("vplayer").setup(',
                                    ');').replace(' ', '')
                sources = self.quotes(setup, 'sources:[{', '}],').split('},{')
                for item in sources:
                    if '.mp4' in item:
                        file_url = self.quotes(item, 'file:"', '"')
                        label = self.quotes(item, 'label:"', '"')
                        alternates.append(
                            dict(text=label, url=URL(file_url + '*')))
                if not alternates:
                    return result
                video_url = alternates[0]['url']

            video = MediaData(video_url)
            for item in alternates:
                video.add_alternate(item)

            result.set_type('video')
            result.set_video(video)

            for f in gallery_href_rule.get_result(['data', 'href']):
                result.add_control(ControlInfo(f['data'], URL(f['href'])))
            return result

        if fake_video_rule.is_result():
            print('Broken video on this url')
            return result

        if startpage_rule.is_result():
            result.set_type('hrefs')

            for item in startpage_rule.get_result():
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']),
                              href=URL(item['href']),
                              popup=item.get('title', '')))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

            for item in startpage_hrefs_rule.get_result(['href']):
                href = item['href']
                # The control label is the second-to-last '/'-separated part
                # of the link.
                label = href.split('/')[-2]
                result.add_control(ControlInfo(label, URL(href)))

        return result
示例#2
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse a saved page into a ``ParseResult``.

        A page with a ``script`` under ``body`` whose text contains
        ``'angular.'`` is treated as a video page: the ``host:'...'`` value in
        that script is loaded as JSON and its ``mediaSources`` become the
        video and its alternates.  Otherwise, if the thumbnail rule matched,
        an ``'hrefs'`` result is built with thumbnails, pagination, category
        and tag controls.

        :param fname: path of the downloaded page, handed to
            ``proceed_parcing``.
        :param base_url: URL the page was loaded from; forwarded to
            ``get_href`` when links are made absolute.
        :return: the populated (or empty) ``ParseResult``.
        """
        parser = SiteParser()

        def absolute(link):
            return self.get_href(link, base_url)

        def css_url(style):
            # Keep only what sits between "url('" and "')" in the style text.
            tail = style.partition("url('")[2]
            return tail.partition("')")[0]

        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level(
            [('div', 'class', 'fixed-content')])
        startpage_rule.add_process_rule_level('a', {'href', 'class'})
        startpage_rule.add_process_rule_level('div', {'style'})
        startpage_rule.set_attribute_filter_function(
            'class', lambda value: value == 'thumbnail')
        startpage_rule.set_attribute_modifier_function('href', absolute)
        startpage_rule.set_attribute_modifier_function('style', css_url)
        parser.add_rule(startpage_rule)

        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level(
            [('div', 'class', 'col-xs-12 content-pagination')])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(startpage_pages_rule)

        tags_rule = ParserRule()
        tags_rule.add_activate_rule_level([('section', 'id', 'footer-tag')])
        tags_rule.add_process_rule_level('a', {'href'})
        tags_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(tags_rule)

        categories_rule = ParserRule()
        categories_rule.add_activate_rule_level(
            [('ul', 'class', 'nav navbar-nav')])
        categories_rule.add_process_rule_level('a', {'href'})
        categories_rule.set_attribute_filter_function(
            'href', lambda value: '/Category/' in value and "#" not in value)
        categories_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(categories_rule)

        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('body', '', '')])
        video_rule.add_process_rule_level('script', {})
        video_rule.set_attribute_filter_function(
            'data', lambda text: 'angular.' in text)
        parser.add_rule(video_rule)

        gallery_href_rule = ParserRule()
        gallery_href_rule.add_activate_rule_level(
            [('div', 'class', 'row tag-area')])
        gallery_href_rule.add_process_rule_level('a', {'href'})
        gallery_href_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(gallery_href_rule)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        if video_rule.is_result():
            script = video_rule.get_result()[0]['data'].replace(' ', '')
            host = self.quotes(script, "host:'", "'")
            json_file_url = self.get_href(host, base_url)

            from requests_loader import load, LoaderError

            json_file = Setting.base_dir + 'tsp_video.json'

            urls = []
            result.set_type('video')

            try:
                response = load(URL(json_file_url), json_file)

                # Each distinct source link is listed once, in document order.
                seen = set()
                for source in response.json()['mediaSources']:
                    link = source['source']
                    if link in seen:
                        continue
                    urls.append(
                        dict(text=source['quality'], url=URL(link + '*')))
                    seen.add(link)

                if not urls:
                    return result

                # The first source is the main video; alternates are only
                # registered when there is more than one source.
                video = MediaData(urls[0]['url'])
                if len(urls) > 1:
                    for alternate in urls:
                        video.add_alternate(alternate)

                result.set_video(video)

            except LoaderError as err:
                print(err)

            for link in gallery_href_rule.get_result(['data', 'href']):
                control = ControlInfo(link['data'].strip(), URL(link['href']))
                result.add_control(control)
            return result

        if startpage_rule.is_result():
            result.set_type('hrefs')

            for entry in startpage_rule.get_result(['href']):
                thumb = ThumbInfo(thumb_url=URL(entry['style']),
                                  href=URL(entry['href']),
                                  popup=entry.get('alt', ''))
                result.add_thumb(thumb)

            for entry in startpage_pages_rule.get_result(['href', 'data']):
                label = entry['data'].replace(' ', '')
                # Pagination links with no text once spaces are removed are
                # skipped.
                if label:
                    result.add_page(ControlInfo(label, URL(entry['href'])))

            if categories_rule.is_result(['href']):
                for entry in categories_rule.get_result(['href', 'data']):
                    control = ControlInfo(entry['data'], URL(entry['href']))
                    result.add_control(control)

            if tags_rule.is_result(['href']):
                for entry in tags_rule.get_result(['href', 'data']):
                    control = ControlInfo(entry['data'], URL(entry['href']))
                    result.add_control(control)

        return result
示例#3
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse a saved page into a ``ParseResult``.

        When an ``iframe`` whose ``src`` contains ``'fileone.tv'`` is found
        under ``div.player``, every such frame is loaded and the ``file`` value
        of its ``jwplayer('player').setup(...)`` call becomes a video URL;
        links under ``div#extras`` are added as controls.  Otherwise, if
        ``div.link-3col`` entries were found, thumbnails, pagination links and
        tag controls are added.

        :param fname: path of the downloaded page, handed to
            ``proceed_parcing``.
        :param base_url: URL the page was loaded from; forwarded to
            ``get_href`` when links are made absolute.
        :return: the populated (or empty) ``ParseResult``.
        """
        parser = SiteParser()

        def absolute(link):
            return self.get_href(link, base_url)

        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([('div', 'class', 'link-3col')])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level('img', {'src', 'alt'})
        startpage_rule.set_attribute_modifier_function('href', absolute)
        startpage_rule.set_attribute_modifier_function('src', absolute)
        parser.add_rule(startpage_rule)

        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level(
            [('div', 'class', 'wp-pagenavi')])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(startpage_pages_rule)

        tags_rule = ParserRule()
        tags_rule.add_activate_rule_level([('div', 'class', 'tagcloud')])
        tags_rule.add_process_rule_level('a', {'href'})
        tags_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(tags_rule)

        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('div', 'class', 'player')])
        video_rule.add_process_rule_level('iframe', {'src'})
        video_rule.set_attribute_filter_function(
            'src', lambda value: 'fileone.tv' in value)
        video_rule.set_attribute_modifier_function('src', absolute)
        parser.add_rule(video_rule)

        gallery_href_rule = ParserRule()
        gallery_href_rule.add_activate_rule_level([('div', 'id', 'extras')])
        gallery_href_rule.add_process_rule_level('a', {'href'})
        gallery_href_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(gallery_href_rule)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        if video_rule.is_result():
            urls = UrlList()
            for frame in video_rule.get_result():
                print(frame)
                try:
                    response = load(URL(frame['src']))
                    setup_call = self.quotes(
                        response.text, "jwplayer('player').setup(", ")")
                    setup_call = setup_call.replace(' ', '')
                    media = self.quotes(setup_call, "file:'", "'")
                    urls.add("default", URL(media + '*'))
                except LoaderError as err:
                    # A frame that fails to load is reported and skipped.
                    print(err)

            result.set_video(urls.get_media_data())

            for link in gallery_href_rule.get_result(['data', 'href']):
                control = ControlInfo(link['data'].strip(), URL(link['href']))
                result.add_control(control)
            return result

        if startpage_rule.is_result():
            for entry in startpage_rule.get_result(['href']):
                thumb = ThumbInfo(thumb_url=URL(entry['src']),
                                  href=URL(entry['href']),
                                  popup=entry.get('alt', ''))
                result.add_thumb(thumb)

            for entry in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(entry['data'], URL(entry['href'])))

            for entry in tags_rule.get_result(['href', 'data']):
                control = ControlInfo(entry['data'], URL(entry['href']))
                result.add_control(control)

        return result
示例#4
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse a saved page into a ``ParseResult``.

        When an ``iframe`` is found under ``div.videoContainer``, each frame
        is loaded, the URL passed to ``jwplayer().load('...')`` in it is
        loaded in turn, and every ``<jwplayer:source file="..." label="...">``
        inside the first ``<item>`` of that response is registered as a video
        URL; links under ``div#extras`` are added as controls.  Otherwise, if
        ``div.thumb`` entries were found, thumbnails, pagination links and tag
        controls are added.

        :param fname: path of the downloaded page, handed to
            ``proceed_parcing``.
        :param base_url: URL the page was loaded from; forwarded to
            ``get_href`` when links are made absolute.
        :return: the populated (or empty) ``ParseResult``.
        """
        parser = SiteParser()

        def absolute(link):
            return self.get_href(link, base_url)

        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([('div', 'class', 'thumb')])
        startpage_rule.add_process_rule_level('a', {'href', 'title'})
        startpage_rule.add_process_rule_level('img', {'src', 'alt'})
        startpage_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(startpage_rule)

        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([('div', 'class', 'wp-pagenavi')])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(startpage_pages_rule)

        tags_rule = ParserRule()
        tags_rule.add_activate_rule_level([('div', 'class', 'tagcloud')])
        tags_rule.add_process_rule_level('a', {'href'})
        tags_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(tags_rule)

        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('div', 'class', 'videoContainer')])
        video_rule.add_process_rule_level('iframe', {'src'})
        video_rule.set_attribute_modifier_function('src', absolute)
        parser.add_rule(video_rule)

        gallery_href_rule = ParserRule()
        gallery_href_rule.add_activate_rule_level([('div', 'id', 'extras')])
        gallery_href_rule.add_process_rule_level('a', {'href'})
        gallery_href_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(gallery_href_rule)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        if video_rule.is_result():
            urls = UrlList()
            for item in video_rule.get_result():
                try:
                    # Two hops: the iframe page, then the playlist it loads.
                    r = load(URL(item['src']))
                    r = load(URL(self.quotes(r.text, "jwplayer().load('", "'") + '*'))
                    source = self.quotes(r.text, '<item>', '</item>').strip()
                    for chunk in source.split('<jwplayer:source file="'):
                        # Compare by value: the old identity test against a
                        # string literal ("is ''") is implementation-dependent
                        # and raises SyntaxWarning on Python 3.8+.
                        if not chunk:
                            continue
                        # Each chunk starts right after 'file="', so the URL
                        # runs up to the next double quote.
                        url = chunk.partition('"')[0]
                        label = self.quotes(chunk, 'label="', '"')
                        urls.add(label, URL(url + '*'))

                except LoaderError as err:
                    # A frame that fails to load is reported and skipped.
                    print(err)

            result.set_video(urls.get_media_data())

            for f in gallery_href_rule.get_result(['data', 'href']):
                result.add_control(ControlInfo(f['data'].strip(), URL(f['href'])))
            return result

        if startpage_rule.is_result():
            for item in startpage_rule.get_result(['href']):
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']), href=URL(item['href']), popup=item.get('alt', '')))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

            for item in tags_rule.get_result(['href', 'data']):
                result.add_control(ControlInfo(item['data'], URL(item['href'])))

        return result
示例#5
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse a saved page (or XHR response) into a ``ParseResult``.

        When ``base_url.method`` is ``'POST'`` the file at ``fname`` is read
        as JSON; if it reports success and carries non-empty content, that
        content is written back over ``fname`` so the parser rules can be run
        on it.  Any other method is treated as the first page of a listing.

        Outcomes:

        * an ``iframe`` under ``div.video_wrapper`` -> the frame is loaded and
          the ``"file"``/``"label"`` pairs of its ``vc.player_setup`` playlist
          become the video and its alternates; links from
          ``div.single_description_item_info`` are added as controls;
        * thumbnail entries -> thumbs plus 'Main', 'Sorted by', 'Prev' and
          'Next' page controls that POST to the load-more endpoint.

        :param fname: path of the downloaded file; may be overwritten in place
            for POST responses.
        :param base_url: URL the file came from.  For POST requests its
            ``post_data`` and ``xhr_data`` are read to compute the offsets.
        :return: a ``ParseResult``; empty when a ``ValueError`` is raised
            while reading/parsing the file, or when no video source is found.
        """
        parser = SiteParser()

        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([
            ('div', 'class', 'video_item_wrapper video_item_medium')
        ])
        startpage_rule.add_process_rule_level('a', {'href', 'class', 'title'})
        startpage_rule.add_process_rule_level('img', {'src', 'alt'})
        # startpage_rule.set_attribute_filter_function('class',lambda x: x == 'thumbnail')
        startpage_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        startpage_rule.set_attribute_modifier_function(
            'src', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_rule)

        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('div', 'class', 'video_wrapper')])
        video_rule.add_process_rule_level('iframe', {'src'})
        # video_rule.set_attribute_filter_function('data', lambda text: 'angular.' in text)
        video_rule.set_attribute_modifier_function(
            'src', lambda x: self.get_href(x, base_url))
        parser.add_rule(video_rule)
        #
        video_href_rule = ParserRule()
        video_href_rule.add_activate_rule_level([
            ('div', 'class', 'single_description_item_info')
        ])
        video_href_rule.add_process_rule_level('a', {'href'})
        video_href_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        parser.add_rule(video_href_rule)

        try:
            if base_url.method == 'POST':
                # POST response: the file holds JSON, not HTML.
                has_more = False
                first_page = False

                with open(fname, encoding='utf-8', errors='ignore') as fd:
                    j = json.load(fd)
                success = j.get('success', False)
                if success:
                    next_data = j['data']
                    content = next_data['content']
                    if len(content) > 0:
                        has_more = next_data['has_more']
                        print('has_more:', has_more)
                        # Replace the JSON file with its 'content' payload so
                        # proceed_parcing below parses that instead.
                        with open(fname, 'w', encoding='utf-8') as fd:
                            fd.write(content)
                # NOTE(review): when 'success' is false or 'content' is empty
                # the file is left as-is and still passed to the parser.

            else:
                first_page = True
                has_more = True

            self.proceed_parcing(parser, fname)

        except ValueError:
            # NOTE(review): presumably meant for invalid JSON; a missing
            # 'data'/'content'/'has_more' key (KeyError) is not caught here.
            return ParseResult()

        result = ParseResult()

        if video_rule.is_result():  # len(video_rule.get_result()) > 0:

            # print(video_rule.get_result())

            # Only the first matched iframe is used.
            frame = URL(video_rule.get_result()[0]['src'])
            print(frame)

            from requests_loader import load, LoaderError, get_last_index_cookie

            frame_file = Setting.base_dir + 'frame.html'
            cookie = get_last_index_cookie()
            # print(cookie)

            # urls = list()
            # result.set_type('video')

            try:
                r = load(frame, frame_file, cookie=cookie)
                print(r.text)

                urls = list()

                # print(r.text)

                # Strip spaces, turn '\/' into '/', then keep the text between
                # 'vc.player_setup=' and the next ';'.
                setup = r.text.replace(' ', '').replace(
                    '\\/',
                    '/').partition('vc.player_setup=')[2].partition(';')[0]
                playlist = setup.partition('"playlist":')[2]

                # Each piece after a '"file":"' marker starts with the URL.
                split = playlist.split('"file":"')

                for item in split:
                    # Pieces without a label (e.g. the text before the first
                    # marker) are ignored.
                    if '"label":' in item:
                        part = item.partition('"')
                        url = part[0]
                        label = part[2].partition('"label":"')[2].partition(
                            '"')[0]
                        print(label, url)
                        next_data = dict(text=label, url=URL(url + '*'))
                        urls.append(next_data)

                # The first entry is the main video; alternates are only
                # registered when there is more than one entry.
                if len(urls) == 1:
                    video = MediaData(urls[0]['url'])
                elif len(urls) > 1:
                    video = MediaData(urls[0]['url'])
                    for item in urls:
                        video.add_alternate(item)
                else:
                    return result

                result.set_video(video)

            except LoaderError as err:
                # A failed frame load is reported; the controls below are
                # still added to the result.
                print(err)

            def add_categories(parcer_result, text):
                # Add a control for every collected link whose href contains
                # `text`.
                for f in parcer_result:
                    if text in f['href']:
                        result.add_control(
                            ControlInfo(f['data'].strip(), URL(f['href'])))

            parcer_result = video_href_rule.get_result(['data', 'href'])

            # Three passes so controls are grouped: studios, pornstars,
            # channels.
            add_categories(parcer_result, '/studios/')
            add_categories(parcer_result, '/pornstars/')
            add_categories(parcer_result, '/channels/')

            return result

        if startpage_rule.is_result():

            for item in startpage_rule.get_result(['href']):
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']),
                              href=URL(item['href']),
                              popup=item.get('alt', '')))

            prev_data = None
            if first_page:
                print(base_url.get())

                # xhr_data travels with every generated URL: the originating
                # page and the offset step.
                xhr_data = {'base_url': base_url, 'step': 100}
                # Default POST payload; adjusted below from the page URL.
                next_data = {
                    'main_category_id': '1',
                    'type': 'post',
                    'filters[filter_type]': 'date',
                    'filters[filter_period]': ''
                }

                # These checks are independent ifs, so several may apply and
                # later ones override earlier values.
                if base_url.contain('/video/'):
                    next_data['name'] = 'all_videos'
                if base_url.contain('/amateur/videos/'):
                    next_data['main_category_id'] = '4'
                    next_data['name'] = 'all_videos'
                if base_url.contain('-amateur'):
                    next_data['main_category_id'] = '4'
                if base_url.contain('/channels/'):
                    next_data['name'] = 'category_videos'
                    next_data['category_id[]'] = self.quotes(
                        base_url.get(), '/channels/', '/')
                if base_url.contain('/pornstars/'):
                    next_data['name'] = 'pornstar_related_videos'
                    next_data['content_id'] = self.quotes(
                        base_url.get(), '/pornstars/', '/')
                    xhr_data['step'] = 65
                if base_url.contain('/studios/'):
                    next_data['name'] = 'studio_related_videos'
                    next_data['content_id'] = self.quotes(
                        base_url.get(), '/studios/', '/')
                    xhr_data['step'] = 65

                # The next request starts one step after the first page.
                next_data['offset'] = str(xhr_data['step'])

            else:
                # Follow-up page: derive next/previous offsets from the
                # request that produced this response.
                next_data = base_url.post_data.copy()
                xhr_data = base_url.xhr_data.copy()
                curr = int(base_url.post_data['offset'])
                next_data['offset'] = str(curr + xhr_data['step'])
                # A 'Prev' link is only offered when the current offset
                # exceeds 100.
                if curr > 100:
                    prev_data = base_url.post_data.copy()
                    prev_data['offset'] = str(curr - xhr_data['step'])

            xhr_href = 'https://www.porndig.com/posts/load_more_posts/'

            result.add_page(ControlInfo('Main', xhr_data['base_url']))

            # Payload used for the 'Sorted by' links: same as next_data but
            # starting from offset 0.
            sorted_data = next_data.copy()
            sorted_data['offset'] = '0'

            # For each sort method emit a 'Sorted by' link and, when
            # applicable, 'Prev' and 'Next' links.
            for method in ['date', 'views', 'rating', 'duration', 'ctr']:
                data = sorted_data.copy()
                data['filters[filter_type]'] = method
                sorted_url = URL(xhr_href,
                                 method='POST',
                                 post_data=data,
                                 xhr_data=xhr_data)
                result.add_page(
                    ControlInfo('Sorted by {0}(0)'.format(method), sorted_url))

                if prev_data is not None:
                    data = prev_data.copy()
                    data['filters[filter_type]'] = method
                    prev_url = URL(xhr_href,
                                   method='POST',
                                   post_data=data,
                                   xhr_data=xhr_data)
                    result.add_page(
                        ControlInfo(
                            'Prev {0}({1})'.format(method, data['offset']),
                            prev_url))
                if has_more:
                    data = next_data.copy()
                    data['filters[filter_type]'] = method
                    next_url = URL(xhr_href,
                                   method='POST',
                                   post_data=data,
                                   xhr_data=xhr_data)
                    result.add_page(
                        ControlInfo(
                            'Next {0}({1})'.format(method, data['offset']),
                            next_url))

        return result
示例#6
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse a saved page into a ``ParseResult``.

        When a ``script`` mentioning ``'jwplayer'`` is found under
        ``div#player`` the page is treated as a video page.  Sources are read
        either from an inline ``sources:[{...}]`` list (parsed as JSON) or,
        when the script only references ``sources:``, from the ``'bitrates'``
        list of the external script matched under ``div#video``.  Otherwise,
        if video or channel thumbnails were found, an ``'hrefs'`` result is
        built with thumbnails, pagination links and navigation controls.

        :param fname: path of the downloaded page, handed to
            ``proceed_parcing``.
        :param base_url: URL the page was loaded from; forwarded to
            ``get_href`` when links are made absolute.
        :return: a ``ParseResult``; empty when a video page yields no source.
        """
        parser = SiteParser()

        def absolute(link):
            return self.get_href(link, base_url)

        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([('div', 'class', 'video')])
        startpage_rule.add_process_rule_level('a', {'href', 'title'})
        startpage_rule.add_process_rule_level('img', {'src', 'alt'})
        startpage_rule.set_attribute_modifier_function('href', absolute)
        startpage_rule.set_attribute_modifier_function('src', absolute)
        parser.add_rule(startpage_rule)

        channels_rule = ParserRule()
        channels_rule.add_activate_rule_level([('ul', 'class', 'channels')])
        channels_rule.add_process_rule_level('a', {'href', 'title'})
        channels_rule.add_process_rule_level('div', {})
        channels_rule.add_process_rule_level('img', {'src', 'alt'})
        # Channel links additionally have every '*' replaced by '/'.
        channels_rule.set_attribute_modifier_function(
            'href', lambda x: absolute(x).replace('*', '/'))
        channels_rule.set_attribute_modifier_function('src', absolute)
        parser.add_rule(channels_rule)

        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([
            ('ul', 'class', 'pagination pagination-lg')
        ])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(startpage_pages_rule)

        startpage_hrefs_rule = ParserRule()
        startpage_hrefs_rule.add_activate_rule_level([
            ('ul', 'class', 'nav nav-stacked navigation')
        ])
        startpage_hrefs_rule.add_process_rule_level('a', {'href'})
        startpage_hrefs_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(startpage_hrefs_rule)

        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('div', 'id', 'player')])
        video_rule.add_process_rule_level('script', {})
        video_rule.set_attribute_filter_function(
            'data', lambda text: 'jwplayer' in text)
        parser.add_rule(video_rule)

        video2_rule = ParserRule()
        video2_rule.add_activate_rule_level([('div', 'id', 'video')])
        video2_rule.add_process_rule_level('script', {'src'})
        video2_rule.set_attribute_filter_function(
            'src', lambda text: 'pornbraze.com/' in text)
        video2_rule.set_attribute_modifier_function('src', absolute)
        parser.add_rule(video2_rule)

        gallery_href_rule = ParserRule()
        gallery_href_rule.add_activate_rule_level([
            ('div', 'class', 'col-xs-12 col-sm-12 col-md-12')
        ])
        gallery_href_rule.add_process_rule_level('a', {'href'})
        gallery_href_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(gallery_href_rule)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        if video_rule.is_result():
            urls = []
            for item in video_rule.get_result():
                script = item['data'].replace(' ', '')
                if 'sources:[{' in script:
                    # Inline source list: rebuild the bracketed text and parse
                    # it as JSON.
                    txt = '[{' + self.quotes(script, 'sources:[{', '}]') + '}]'
                    for j_data in json.loads(txt):
                        # Compare by value: the old identity test against a
                        # string literal ("is not ''") is implementation-
                        # dependent and raises SyntaxWarning on Python 3.8+.
                        if j_data['file'] != '':
                            urls.append(
                                dict(text=j_data['label'],
                                     url=URL(j_data['file'] + '*')))
                elif 'sources:' in script and video2_rule.is_result(['src']):
                    # Sources live in an external script: load it and read
                    # its 'bitrates' list.
                    php_url = URL(video2_rule.get_result(['src'])[0]['src'])
                    res = load(php_url)
                    bitrates = self.quotes(res.text, "'bitrates':[{",
                                           "}]").split('},{')
                    for line in bitrates:
                        print(line)
                        video_url = self.quotes(line, "'file':'", "'")
                        # NOTE(review): 'file' is matched with single quotes
                        # but 'label' with double quotes and no key quotes --
                        # confirm against a real response.
                        label = self.quotes(line, 'label:"', '"')
                        urls.append(dict(text=label, url=URL(video_url + '*')))

            if not urls:
                return result

            # The first source is the main video; alternates are only
            # registered when there is more than one source.
            video = MediaData(urls[0]['url'])
            if len(urls) > 1:
                for item in urls:
                    video.add_alternate(item)

            result.set_type('video')
            result.set_video(video)

            for f in gallery_href_rule.get_result(['data', 'href']):
                href = f['href'].replace('*', '/')
                label = f['data']
                if '/users/' in href:
                    # User links point at the user's public video list and
                    # get a quoted label.
                    href = href + '/videos/public/'
                    label = '"' + label + '"'

                result.add_control(ControlInfo(label, URL(href)))

            return result

        if startpage_rule.is_result() or channels_rule.is_result():
            result.set_type('hrefs')

            for item in startpage_rule.get_result(['href']):
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']),
                              href=URL(item['href']),
                              popup=item.get('alt', item.get('title', ''))))

            for item in channels_rule.get_result(['href']):
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']),
                              href=URL(item['href']),
                              popup=item.get('alt', item.get('title', ''))))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

            for item in startpage_hrefs_rule.get_result():
                # Label is the last path segment once surrounding '*' and '/'
                # characters are stripped.
                label = item['href'].strip('*/').rpartition('/')[2]
                result.add_control(ControlInfo(label, URL(item['href'])))

        return result

        return result
示例#7
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse a downloaded page into a video, multi-part video or listing.

        One ParserRule is registered per page region, the rules are run over
        ``fname`` with ``self.proceed_parcing`` and the result is built from
        the first group of rules that matched, in this order:

        1. ``<video src>`` inside ``div[itemprop=video]``: single video;
        2. ``div#videos_container`` entries: multi-part video. The link of
           the selected part is requested with a POST to
           ``/php/get_vlink.php`` and the response text is used as its URL;
        3. ``vid_container`` / ``combo_post_wrap`` blocks: listing page with
           thumbs, pagination links and menu links.

        :param fname: page file handed to ``self.proceed_parcing``.
        :param base_url: URL of the parsed page. It is passed to
            ``self.get_href`` for every collected href and its ``?s=<n>``
            suffix selects the part of a multi-part video.
        :return: a ParseResult. It is returned empty when no rule matched or
            when a multi-part page has no "usss" script to take the uid from.
        """
        parser = SiteParser()

        def resolve(href):
            # Shared 'href' modifier for every rule below.
            return self.get_href(href, base_url)

        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([('div', 'class',
                                                 'vid_container')])
        startpage_rule.add_process_rule_level('img', {'src'})
        startpage_rule.add_process_rule_level('a', {'href', 'title'})
        startpage_rule.set_attribute_modifier_function('href', resolve)
        parser.add_rule(startpage_rule)

        startpage_combo_rule = ParserRule()
        startpage_combo_rule.add_activate_rule_level([('div', 'class',
                                                       'combo_post_wrap')])
        startpage_combo_rule.add_process_rule_level('a', {'href', 'title'})
        startpage_combo_rule.add_process_rule_level('img', {'src'})
        startpage_combo_rule.set_attribute_modifier_function('href', resolve)
        parser.add_rule(startpage_combo_rule)

        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([('div', 'id',
                                                       'center_control')])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function('href', resolve)
        parser.add_rule(startpage_pages_rule)

        startpage_hrefs_rule = ParserRule()
        startpage_hrefs_rule.add_activate_rule_level([
            ('ul', 'class', 'dropdown-menu columns')
        ])
        startpage_hrefs_rule.add_process_rule_level('a', {'href'})
        startpage_hrefs_rule.set_attribute_modifier_function('href', resolve)
        parser.add_rule(startpage_hrefs_rule)

        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('div', 'itemprop', 'video')])
        video_rule.add_process_rule_level('video', {'src'})
        parser.add_rule(video_rule)

        video_multipart_rule = ParserRule()
        video_multipart_rule.add_activate_rule_level([('div', 'id',
                                                       'videos_container')])
        video_multipart_rule.add_process_rule_level(
            'div',
            {'data-source', 'data-hash', 'data-x', 'data-oid', 'data-pid'})
        parser.add_rule(video_multipart_rule)

        # Any <script> in <body> whose text mentions "usss"; the uid posted
        # for multi-part videos is cut out of it.
        video_usss_rule = ParserRule()
        video_usss_rule.add_activate_rule_level([('body', '', '')])
        video_usss_rule.add_process_rule_level('script', {})
        video_usss_rule.set_attribute_filter_function('data',
                                                      lambda x: 'usss' in x)
        parser.add_rule(video_usss_rule)

        gallery_href_rule = ParserRule()
        gallery_href_rule.add_activate_rule_level([
            ('div', 'class', 'popular_block_header_rl')
        ])
        gallery_href_rule.add_process_rule_level('a', {'href'})
        gallery_href_rule.set_attribute_modifier_function('href', resolve)
        parser.add_rule(gallery_href_rule)

        gallery_author_rule = ParserRule()
        gallery_author_rule.add_activate_rule_level([
            ('div', 'id', 'posts_container')
        ])
        gallery_author_rule.add_activate_rule_level([
            ('div', 'class', 'post_author_name')
        ])
        gallery_author_rule.add_process_rule_level('a', {'href'})
        gallery_author_rule.set_attribute_modifier_function('href', resolve)
        parser.add_rule(gallery_author_rule)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        def add_gallery_controls():
            # Author links first, with their label wrapped in double quotes,
            # then the remaining gallery links with a plain label.
            for f in gallery_author_rule.get_result(['data', 'href']):
                result.add_control(
                    ControlInfo('"' + f['data'].strip() + '"', URL(f['href'])))
            for f in gallery_href_rule.get_result(['data', 'href']):
                result.add_control(
                    ControlInfo(f['data'].strip(), URL(f['href'])))

        def add_thumbs(items):
            for item in items:
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']),
                              href=URL(item['href']),
                              popup=item.get('title', '')))

        if video_rule.is_result():
            video = MediaData(URL(video_rule.get_result()[0]['src']))
            result.set_video(video)
            add_gallery_controls()
            return result

        if video_multipart_rule.is_result():
            parts = video_multipart_rule.get_result()
            series = len(parts)

            page_url = base_url.get()

            # "?s=<n>" selects the part to play. A missing, non-numeric or
            # out-of-range value falls back to the first part instead of
            # raising or silently indexing from the end of the list.
            selector = page_url.partition('?s=')[2]
            try:
                serie = int(selector) if selector else 1
            except ValueError:
                serie = 1
            if not 1 <= serie <= series:
                serie = 1

            usss_scripts = video_usss_rule.get_result()
            if not usss_scripts:
                # No script to read the uid from, so the link request cannot
                # be built; hand back the empty result rather than crash.
                return result

            uid = self.quotes(usss_scripts[0]['data'].replace(' ', ''),
                              'usss[0]="', '"')
            current = parts[serie - 1]

            data = {
                'uid': uid,
                'source': current['data-source'],
                'hash': current['data-hash'],
                'x': current['data-x'],
                'oid': current['data-oid'],
                'pid': current['data-pid']
            }

            url = URL(self.get_href('/php/get_vlink.php', base_url),
                      'POST',
                      post_data=data)

            r = load(url)

            video = MediaData(URL(r.text))

            result.set_type('video')
            result.set_video(video)

            # One control per part; the part being played is marked.
            plain_url = page_url.partition('?')[0]
            for i in range(1, series + 1):
                label = 'S{0}'.format(i)
                if i == serie:
                    label += '(this)'
                url_i = plain_url + '?s={0}'.format(i)
                result.add_control(ControlInfo(label, URL(url_i + '*')))

            add_gallery_controls()
            return result

        if startpage_rule.is_result() or startpage_combo_rule.is_result():
            add_thumbs(startpage_combo_rule.get_result())
            add_thumbs(startpage_rule.get_result(['href']))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                href = item['href']
                # The page label is the last path segment of the link with
                # everything from the first '.' cut off, not the link text.
                n = href.rpartition('/')[2].partition('.')[0]
                result.add_page(ControlInfo(n, URL(href)))

            for item in startpage_hrefs_rule.get_result(['href', 'data']):
                result.add_control(ControlInfo(item['data'],
                                               URL(item['href'])))

        return result
示例#8
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse a downloaded page into either a video or a thumbnail listing.

        One ParserRule is registered per page region and the rules are run
        over ``fname`` with ``self.proceed_parcing``.

        * If an ``<iframe src>`` was found under ``div.section-content`` /
          ``div#video``, every iframe whose src contains ``.video/embed`` or
          ``javfinder.com/`` is downloaded and its ``.mp4`` sources are
          collected into a UrlList that becomes the result's video. Links
          from ``div#extras`` are added as controls.
        * Otherwise, if ``div.thumb`` blocks were found, thumbs, pagination
          links (``div.loop-nav-inner``) and menu links (``ul.menu``) are
          added to the result.

        :param fname: page file handed to ``self.proceed_parcing``.
        :param base_url: URL of the parsed page; passed to ``self.get_href``
            for every collected ``href`` and ``src``.
        :return: a ParseResult, left empty when no rule matched.
        """
        parser = SiteParser()

        def resolve(link):
            # Shared 'href' / 'src' modifier for every rule below.
            return self.get_href(link, base_url)

        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([('div', 'class', 'thumb')])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level('img', {'src', 'alt'})
        startpage_rule.set_attribute_modifier_function('href', resolve)
        startpage_rule.set_attribute_modifier_function('src', resolve)
        parser.add_rule(startpage_rule)

        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([('div', 'class',
                                                       'loop-nav-inner')])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function('href', resolve)
        parser.add_rule(startpage_pages_rule)

        tags_rule = ParserRule()
        tags_rule.add_activate_rule_level([('ul', 'class', 'menu')])
        tags_rule.add_process_rule_level('a', {'href'})
        tags_rule.set_attribute_modifier_function('href', resolve)
        parser.add_rule(tags_rule)

        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('div', 'class',
                                             'section-content'),
                                            ('div', 'id', 'video')])
        video_rule.add_process_rule_level('iframe', {'src'})
        video_rule.set_attribute_modifier_function('src', resolve)
        parser.add_rule(video_rule)

        gallery_href_rule = ParserRule()
        gallery_href_rule.add_activate_rule_level([('div', 'id', 'extras')])
        gallery_href_rule.add_process_rule_level('a', {'href'})
        gallery_href_rule.set_attribute_modifier_function('href', resolve)
        parser.add_rule(gallery_href_rule)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        if video_rule.is_result():
            urls = UrlList()
            for frame in video_rule.get_result():
                src = frame['src']
                is_jwplayer = '.video/embed' in src
                if not is_jwplayer and 'javfinder.com/' not in src:
                    # Unknown embed host: nothing to extract.
                    continue

                # Only the download can raise LoaderError; a failed embed is
                # reported and skipped so the other iframes are still tried.
                try:
                    text = load(URL(src)).text
                except LoaderError as err:
                    print(err)
                    continue

                if is_jwplayer:
                    # Sources sit inside the jwplayer setup call as
                    # sources:[{file:"...",label:"..."},{...}],
                    setup = self.quotes(text,
                                        'jwplayer("vplayer").setup(',
                                        ")").replace(' ', '')
                    sources = self.quotes(setup, 'sources:[{',
                                          '}],').split('},{')
                    for source in sources:
                        if '.mp4' in source:
                            video_file = self.quotes(source, 'file:"', '"')
                            label = self.quotes(source, 'label:"', '"')
                            urls.add(label, URL(video_file + '*'))
                else:
                    # Sources are plain tags: <source src="..." res="...">
                    for chunk in text.split('<source src="')[1:]:
                        tag = chunk.partition('>')[0]
                        if '.mp4' in tag:
                            video_file = tag.partition('"')[0]
                            label = self.quotes(tag, 'res="', '"')
                            urls.add(label, URL(video_file + '*'))

            result.set_video(urls.get_media_data())

            for f in gallery_href_rule.get_result(['data', 'href']):
                result.add_control(
                    ControlInfo(f['data'].strip(), URL(f['href'])))
            return result

        if startpage_rule.is_result():
            for item in startpage_rule.get_result(['href']):
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']),
                              href=URL(item['href']),
                              popup=item.get('alt', '')))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

            for item in tags_rule.get_result(['href', 'data']):
                result.add_control(ControlInfo(item['data'],
                                               URL(item['href'])))

        return result