Пример #1
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse the saved page ``fname`` into a ``ParseResult``.

        Five rules are registered on a ``SiteParser`` and run over the file
        through ``self.proceed_parcing``.  When the video rule matched, each
        collected iframe ``src`` is fetched with ``load`` and scanned for
        ``.mp4`` sources; the result then carries the media data plus the
        links found in ``div#extras``.  Otherwise, when thumbnails were found,
        the result carries thumbs, pager links and menu links.

        :param fname: file handed to ``self.proceed_parcing``.
        :param base_url: forwarded to ``self.get_href`` for every collected link.
        :return: the ``ParseResult``; nothing is added to it when no rule matched.
        """

        def absolute(link):
            # Single resolver shared by every rule that rewrites a link.
            return self.get_href(link, base_url)

        parser = SiteParser()

        # Thumbnails: anchor + image inside div.thumb.
        thumb_rule = ParserRule()
        thumb_rule.add_activate_rule_level([('div', 'class', 'thumb')])
        thumb_rule.add_process_rule_level('a', {'href'})
        thumb_rule.add_process_rule_level('img', {'src', 'alt'})
        thumb_rule.set_attribute_modifier_function('href', absolute)
        thumb_rule.set_attribute_modifier_function('src', absolute)
        parser.add_rule(thumb_rule)

        # Pager links inside div.loop-nav-inner.
        pager_rule = ParserRule()
        pager_rule.add_activate_rule_level(
            [('div', 'class', 'loop-nav-inner')])
        pager_rule.add_process_rule_level('a', {'href'})
        pager_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(pager_rule)

        # Menu links inside ul.menu.
        menu_rule = ParserRule()
        menu_rule.add_activate_rule_level([('ul', 'class', 'menu')])
        menu_rule.add_process_rule_level('a', {'href'})
        menu_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(menu_rule)

        # Player iframes inside div.section-content or div#video.
        player_rule = ParserRule()
        player_rule.add_activate_rule_level(
            [('div', 'class', 'section-content'), ('div', 'id', 'video')])
        player_rule.add_process_rule_level('iframe', {'src'})
        player_rule.set_attribute_modifier_function('src', absolute)
        parser.add_rule(player_rule)

        # Links shown next to the video, inside div#extras.
        extras_rule = ParserRule()
        extras_rule.add_activate_rule_level([('div', 'id', 'extras')])
        extras_rule.add_process_rule_level('a', {'href'})
        extras_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(extras_rule)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        if player_rule.is_result():
            urls = UrlList()
            for frame in player_rule.get_result():
                print(frame)
                src = frame['src']
                if '.video/embed' in src:
                    # Embedded player: sources live in the jwplayer setup call.
                    try:
                        response = load(URL(frame['src']))
                        setup = self.quotes(
                            response.text,
                            'jwplayer("vplayer").setup(',
                            ")").replace(' ', '')
                        source_items = self.quotes(
                            setup, 'sources:[{', '}],').split('},{')
                        for source in source_items:
                            if '.mp4' in source:
                                file = self.quotes(source, 'file:"', '"')
                                label = self.quotes(source, 'label:"', '"')
                                urls.add(label, URL(file + '*'))
                    except LoaderError as err:
                        print(err)
                elif 'javfinder.com/' in src:
                    # Plain HTML player: one <source> tag per quality.
                    try:
                        response = load(URL(frame['src']))
                        chunks = response.text.split('<source src="')[1:]
                        for chunk in chunks:
                            tag = chunk.partition('>')[0]
                            if '.mp4' in tag:
                                file = tag.partition('"')[0]
                                label = self.quotes(tag, 'res="', '"')
                                urls.add(label, URL(file + '*'))
                    except LoaderError as err:
                        print(err)

            result.set_video(urls.get_media_data())

            for link in extras_rule.get_result(['data', 'href']):
                control = ControlInfo(link['data'].strip(), URL(link['href']))
                result.add_control(control)
            return result

        if not thumb_rule.is_result():
            return result

        for thumb in thumb_rule.get_result(['href']):
            result.add_thumb(
                ThumbInfo(thumb_url=URL(thumb['src']),
                          href=URL(thumb['href']),
                          popup=thumb.get('alt', '')))

        for page in pager_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(page['data'], URL(page['href'])))

        for entry in menu_rule.get_result(['href', 'data']):
            result.add_control(ControlInfo(entry['data'], URL(entry['href'])))

        return result
Пример #2
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse the saved page ``fname`` into a ``ParseResult``.

        Six rules are registered on a ``SiteParser`` and run over the file
        through ``self.proceed_parcing``.  When a player script containing
        ``flashvars`` was collected, its ``"quality_<label>":"<url>"`` entries
        are turned into media URLs and the result is typed ``'video'``;
        otherwise, when thumbnails were found, the result is typed ``'hrefs'``.

        :param fname: file handed to ``self.proceed_parcing``.
        :param base_url: not used by this implementation.
        :return: the ``ParseResult``; nothing is added to it when no rule
            matched or when the player script yields no ``http://`` URL.
        """
        parser = SiteParser()

        # Thumbnails: anchor + image inside div.video_box / div.box-thumbnail.
        thumb_rule = ParserRule(debug=False)
        thumb_rule.add_activate_rule_level(
            [('div', 'class', 'video_box'), ('div', 'class', 'box-thumbnail')])
        thumb_rule.add_process_rule_level('a', {'href'})
        thumb_rule.add_process_rule_level('img', {'src', 'alt'})
        parser.add_rule(thumb_rule)

        # Pager links inside ul#pagination.
        pager_rule = ParserRule()
        pager_rule.add_activate_rule_level([('ul', 'id', 'pagination')])
        pager_rule.add_process_rule_level('a', {'href'})
        parser.add_rule(pager_rule)

        # Category links inside div#videos_categories.
        category_rule = ParserRule()
        category_rule.add_activate_rule_level(
            [('div', 'id', 'videos_categories')])
        category_rule.add_process_rule_level('a', {'href'})
        parser.add_rule(category_rule)

        # Player script inside div#playerContainer.
        player_rule = ParserRule()
        player_rule.add_activate_rule_level([('div', 'id', 'playerContainer')])
        player_rule.add_process_rule_level('script', {})
        player_rule.set_attribute_filter_function(
            'data', lambda body: 'flashvars' in body)
        parser.add_rule(player_rule)

        # Tag / category links shown on a video page.
        tag_rule = ParserRule()
        tag_rule.add_activate_rule_level(
            [('li', 'class', 'tag-list'), ('li', 'class', 'video-category')])
        tag_rule.add_process_rule_level('a', {'href'})
        parser.add_rule(tag_rule)

        # Uploader link; rewritten to point at the uploader's video list.
        uploader_rule = ParserRule()
        uploader_rule.add_activate_rule_level(
            [('span', 'id', 'videoUsername')])
        uploader_rule.add_process_rule_level('a', {'href'})
        uploader_rule.set_attribute_modifier_function(
            'href', lambda link: link.replace('/user/', '/user-videos/'))
        parser.add_rule(uploader_rule)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        if len(player_rule.get_result()) > 0:
            raw_script = player_rule.get_result()[0]['data']
            script = raw_script.replace(' ', '').replace('\\', '')
            flashvars = script.partition('flashvars={')[2].partition('};')[0]

            # Walk the text, consuming one "quality_<label>":"<url>" entry
            # per iteration.
            marker = '"quality_'
            urls = []
            while marker in flashvars:
                tail = flashvars.partition(marker)[2]
                label, _, rest = tail.partition('":"')
                file = rest.partition('",')[0]
                if file.startswith('http://'):
                    urls.append({'text': label, 'url': URL(file + '*')})
                flashvars = tail

            if not urls:
                return result

            if len(urls) == 1:
                video = MediaData(urls[0]['url'])
            else:
                # The last entry is the default; all entries are alternates.
                video = MediaData(urls[-1]['url'])
                for alternate in urls:
                    video.add_alternate(alternate)

            result.set_type('video')
            result.set_video(video)

            for link in uploader_rule.get_result(['data', 'href']):
                username = '******' + link['href'].split('/')[-2] + '"'
                result.add_control(ControlInfo(username, URL(link['href'])))

            for link in tag_rule.get_result(['data', 'href']):
                result.add_control(ControlInfo(link['data'], URL(link['href'])))
            return result

        if len(thumb_rule.get_result()) == 0:
            return result

        result.set_type('hrefs')

        for thumb in thumb_rule.get_result(['href']):
            result.add_thumb(
                ThumbInfo(thumb_url=URL(thumb['src']),
                          href=URL(thumb['href']),
                          popup=thumb.get('alt', '')))

        for page in pager_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(page['data'], URL(page['href'])))

        for entry in category_rule.get_result(['href', 'data']):
            result.add_control(ControlInfo(entry['data'], URL(entry['href'])))

        return result
Пример #3
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse the saved page ``fname`` into a ``ParseResult``.

        Six rules are registered on a ``SiteParser`` and run over the file
        through ``self.proceed_parcing``.  The first branch whose rule produced
        results decides the result type: ``'video'`` (player found),
        ``'pictures'`` (gallery images found) or ``'hrefs'`` (thumbnails
        found).

        :param fname: file handed to ``self.proceed_parcing``.
        :param base_url: forwarded to ``self.get_href`` for rewritten links.
        :return: the ``ParseResult``; nothing is added to it when no rule matched.
        """

        def absolute(link):
            # Single resolver shared by every rule that rewrites an href.
            return self.get_href(link, base_url)

        def is_section_link(link):
            # Only channel and photo-niche links are kept from the left menu.
            return '/channels/' in link or '/photos/niches/' in link

        parser = SiteParser()

        # Thumbnails: anchor + image inside the video list / gallery boxes.
        thumb_rule = ParserRule(debug=True)
        thumb_rule.add_activate_rule_level(
            [('div', 'class', 'boxC videoList clearfix'),
             ('div', 'class', 'gallery')])
        thumb_rule.add_process_rule_level('a', {'href'})
        thumb_rule.add_process_rule_level('img', {'src', 'alt'})
        parser.add_rule(thumb_rule)

        # Pager links inside div.pager.
        pager_rule = ParserRule()
        pager_rule.add_activate_rule_level([('div', 'class', 'pager')])
        pager_rule.add_process_rule_level('a', {'href'})
        pager_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(pager_rule)

        # Section links inside div#menuLeft.
        section_rule = ParserRule()
        section_rule.add_activate_rule_level([('div', 'id', 'menuLeft')])
        section_rule.add_process_rule_level('a', {'href'})
        section_rule.set_attribute_modifier_function('href', absolute)
        section_rule.set_attribute_filter_function('href', is_section_link)
        parser.add_rule(section_rule)

        # <video file=...> inside div#player.
        player_rule = ParserRule()
        player_rule.add_activate_rule_level([('div', 'id', 'player')])
        player_rule.add_process_rule_level('video', {'file'})
        parser.add_rule(player_rule)

        # Links in the info box of a video or gallery page.
        info_rule = ParserRule()
        info_rule.add_activate_rule_level(
            [('div', 'id', 'videoInfoBox'), ('div', 'id', 'galleryInfoBox')])
        info_rule.add_activate_rule_level([('td', 'class', 'btnList')])
        info_rule.add_process_rule_level('a', {'href', 'title'})
        info_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(info_rule)

        # Gallery images; the '_160' part of the src is swapped for '_1000'.
        image_rule = ParserRule()
        image_rule.add_activate_rule_level(
            [('div', 'class', 'gallery iItem ')])
        image_rule.add_activate_rule_level([('div', 'class', 'img vam')])
        image_rule.add_process_rule_level('img', {'src'})
        image_rule.set_attribute_modifier_function(
            'src', lambda src: src.replace('_160', '_1000'))
        parser.add_rule(image_rule)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        if len(player_rule.get_result()) > 0:
            video_file = player_rule.get_result()[0]['file']
            result.set_video(MediaData(URL(video_file + '*')))
            result.set_type('video')

            for link in info_rule.get_result():
                result.add_control(ControlInfo(link['data'], URL(link['href'])))
            return result

        if len(image_rule.get_result()) > 0:
            result.set_type('pictures')
            # Pictures are named by their 1-based position: 001.jpg, 002.jpg...
            for number, image in enumerate(image_rule.get_result(), start=1):
                picture = FullPictureInfo(abs_href=URL(image['src']),
                                          rel_name='%03d.jpg' % number)
                result.add_full(picture)

            for link in info_rule.get_result():
                result.add_control(ControlInfo(link['data'], URL(link['href'])))
            return result

        if len(thumb_rule.get_result()) == 0:
            return result

        result.set_type('hrefs')
        for thumb in thumb_rule.get_result():
            result.add_thumb(
                ThumbInfo(thumb_url=URL(thumb['src']),
                          href=URL(thumb['href']),
                          popup=thumb['alt']))

        for page in pager_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(page['data'], URL(page['href'])))

        for entry in section_rule.get_result(['href', 'data']):
            result.add_control(ControlInfo(entry['data'], URL(entry['href'])))

        return result
Пример #4
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse the saved page ``fname`` into a ``ParseResult``.

        Six rules are registered on a ``SiteParser`` and run over the file
        through ``self.proceed_parcing``.  When a player script containing
        ``flashvars`` was collected, the ``video_url:'...'`` value of each
        script becomes a media URL and uploader / tag links are added as
        controls.  Otherwise, when thumbnails were found, the result carries
        thumbs, pager links and sub-menu links.

        :param fname: file handed to ``self.proceed_parcing``.
        :param base_url: forwarded to ``self.get_href`` for rewritten links.
        :return: the ``ParseResult``; nothing is added to it when no rule matched.
        """

        def absolute(link):
            # Single resolver shared by every rule that rewrites an href.
            return self.get_href(link, base_url)

        def is_tag_or_category(link):
            return '/tags/' in link or '/categories/' in link

        def is_member(link):
            return '/members/' in link

        parser = SiteParser()

        # Thumbnails: anchor + image inside div.image (trailing space is part
        # of the class value being matched).
        thumb_rule = ParserRule()
        thumb_rule.add_activate_rule_level([('div', 'class', 'image ')])
        thumb_rule.add_process_rule_level('a', {'href'})
        thumb_rule.add_process_rule_level('img', {'src', 'alt'})
        thumb_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(thumb_rule)

        # Pager links inside div.pagination.
        pager_rule = ParserRule()
        pager_rule.add_activate_rule_level([('div', 'class', 'pagination')])
        pager_rule.add_process_rule_level('a', {'href'})
        pager_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(pager_rule)

        # Sub-menu links; both spellings of the class name are accepted.
        submenu_rule = ParserRule()
        submenu_rule.add_activate_rule_level(
            [('div', 'class', 'sub_menu dark-menu'),
             ('div', 'class', 'sub-menu dark-menu')])
        submenu_rule.add_process_rule_level('a', {'href', 'title'})
        submenu_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(submenu_rule)

        # Player script inside div.player.
        player_rule = ParserRule()
        player_rule.add_activate_rule_level([('div', 'class', 'player')])
        player_rule.add_process_rule_level('script', {})
        player_rule.set_attribute_filter_function(
            'data', lambda body: 'flashvars' in body)
        parser.add_rule(player_rule)

        # Tag / category links inside div.block_content.
        tag_rule = ParserRule()
        tag_rule.add_activate_rule_level([('div', 'class', 'block_content')])
        tag_rule.add_process_rule_level('a', {'href'})
        tag_rule.set_attribute_filter_function('href', is_tag_or_category)
        tag_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(tag_rule)

        # Uploader link inside the same div.block_content.
        member_rule = ParserRule()
        member_rule.add_activate_rule_level(
            [('div', 'class', 'block_content')])
        member_rule.add_process_rule_level('a', {'href'})
        member_rule.set_attribute_filter_function('href', is_member)
        parser.add_rule(member_rule)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        if player_rule.is_result():
            urls = UrlList()
            for script in player_rule.get_result():
                compact = script['data'].replace(' ', '')
                file = self.quotes(compact, "video_url:'", "'")
                urls.add('default', URL(file))

            result.set_video(urls.get_media_data())

            if member_rule.is_result():
                username = member_rule.get_result()[0].get('data', '***')
                member_href = member_rule.get_result()[0]['href']
                # Last path segment of the member link identifies the user.
                user = member_href.rstrip('/').rpartition('/')[2]
                videos_url = URL('http://gobdsm.com/members/' + user +
                                 '/public_videos/')
                result.add_control(
                    ControlInfo('"' + username + '"', videos_url))

            for link in tag_rule.get_result(['data', 'href']):
                result.add_control(ControlInfo(link['data'], URL(link['href'])))
            return result

        if not thumb_rule.is_result():
            return result

        for thumb in thumb_rule.get_result(['href']):
            result.add_thumb(
                ThumbInfo(thumb_url=URL(thumb['src']),
                          href=URL(thumb['href']),
                          popup=thumb.get('alt', '')))

        for page in pager_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(page['data'], URL(page['href'])))

        for entry in submenu_rule.get_result(['href', 'data']):
            # Prefer the title attribute, fall back to the link text.
            caption = entry.get('title', entry.get('data', ''))
            result.add_control(ControlInfo(caption, URL(entry['href'])))

        return result
Пример #5
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse the saved page ``fname`` into a ``ParseResult``.

        Five rules are registered on a ``SiteParser`` and run over the file
        through ``self.proceed_parcing``.  When a script containing
        ``videoVars`` was collected, its ``"quality_<label>":"<url>"`` entries
        become media URLs (an entry whose label contains ``720p`` is preferred
        as default, otherwise the last one) and the result is typed
        ``'video'``.  Otherwise, when thumbnails were found, the result is
        typed ``'hrefs'``.

        :param fname: file handed to ``self.proceed_parcing``.
        :param base_url: forwarded to ``self.get_href`` for rewritten links.
        :return: the ``ParseResult``; nothing is added to it when no rule
            matched or when the script holds no quality entry.
        """

        def absolute_starred(link):
            # Every rewritten href gets a trailing '*' appended.
            return self.get_href(link, base_url) + '*'

        def with_https_scheme(link):
            # Thumbnails are protocol-relative ('//host/path').  Only a leading
            # '//' gets the scheme; a blanket replace('//', 'https://') would
            # corrupt already-absolute URLs and any '//' inside the path.
            if link.startswith('//'):
                return 'https:' + link
            return link

        parser = SiteParser()

        startpage_rule = ParserRule(debug=False)
        startpage_rule.add_activate_rule_level([('ul', 'class', 'responsiveListing')])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level('img', {'data-original'})
        startpage_rule.set_attribute_modifier_function('href', absolute_starred)
        startpage_rule.set_attribute_modifier_function('data-original', with_https_scheme)
        parser.add_rule(startpage_rule)

        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([('section', 'class', 'pagination')])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function('href', absolute_starred)
        parser.add_rule(startpage_pages_rule)

        startpage_hrefs_rule = ParserRule()
        startpage_hrefs_rule.add_activate_rule_level([('ul', 'class', 'categoryList')])
        startpage_hrefs_rule.add_process_rule_level('a', {'href'})
        startpage_hrefs_rule.set_attribute_modifier_function('href', absolute_starred)
        parser.add_rule(startpage_hrefs_rule)

        # Any <script> under <body> whose text mentions videoVars.
        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('body', '', '')])
        video_rule.add_process_rule_level('script', {})
        video_rule.set_attribute_filter_function('data', lambda text: 'videoVars' in text)
        parser.add_rule(video_rule)

        gallery_href_rule = ParserRule()
        gallery_href_rule.add_activate_rule_level([('div', 'class', 'videoInfoTop')])
        gallery_href_rule.add_process_rule_level('a', {'href'})
        gallery_href_rule.set_attribute_modifier_function('href', absolute_starred)
        parser.add_rule(gallery_href_rule)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        if len(video_rule.get_result()) > 0:
            script = video_rule.get_result()[0]['data'].replace(' ', '')

            urls = list()

            # Consume one "quality_<label>":"<url>" entry per iteration.
            while '"quality_' in script:
                nxt = script.partition('"quality_')[2]

                t = nxt.partition('":"')
                label = t[0]
                # The URL is percent-encoded in the page; only these four
                # escapes are decoded.
                file = t[2].partition('",')[0].replace('%2F', '/').replace('%3F', '?').replace('%26', '&').replace(
                    '%3D', '=')
                urls.append(dict(text=label, url=URL('https:' + file + '*')))
                script = nxt

            if len(urls) == 1:
                video = MediaData(urls[0]['url'])
            elif len(urls) > 1:
                # Default to the last entry unless a 720p entry exists.
                default = urls[-1]['url']
                for t in urls:
                    if '720p' in t['text']:
                        default = t['url']
                video = MediaData(default)
                for item in urls:
                    video.add_alternate(item)
            else:
                return result

            result.set_type('video')
            result.set_video(video)

            # Add each distinct href once; an empty caption falls back to the
            # last path segment of the href.
            links = set()
            for f in gallery_href_rule.get_result(['data', 'href']):
                if f['href'] not in links:
                    label = f['data'].replace('\t', '')
                    if label == '':
                        label = f['href'].rpartition('/')[2]
                    result.add_control(ControlInfo(label, URL(f['href'])))
                    links.add(f['href'])
            return result

        if len(startpage_rule.get_result()) > 0:
            result.set_type('hrefs')

            for item in startpage_rule.get_result(['href']):
                result.add_thumb(ThumbInfo(thumb_url=URL(item['data-original']), href=URL(item['href']),
                                           popup=item.get('title', '')))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

            if len(startpage_hrefs_rule.get_result(['href'])) > 0:
                for item in startpage_hrefs_rule.get_result(['href', 'data']):
                    href = item['href']
                    # Caption is the last path segment, without the '*' suffix.
                    txt = href.rstrip('*').rpartition('/')[2]
                    result.add_control(ControlInfo(txt, URL(href)))

        return result
Пример #6
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse the saved page ``fname`` into a ``ParseResult``.

        Eight rules are registered on a ``SiteParser``; the file is fed to the
        parser line by line while a space-stripped copy of its text is kept.
        The first matching branch decides the result type:

        * ``'video'`` when the stripped text contains ``urls.push({``;
        * ``'pictures'`` when images were found inside ``div#galleryImages``;
        * ``'hrefs'`` when thumbnails were found.

        :param fname: path of the page; opened as UTF-8 with undecodable bytes
            ignored.
        :param base_url: forwarded to ``self.get_href`` for rewritten links.
        :return: the ``ParseResult``; nothing is added to it when no branch
            matched.
        """
        parser = SiteParser()

        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([('div', 'class', 'clearfix'),
                                                ('div', 'class', 'row clearfix  video-container')])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level('img', {'src', 'alt'})
        startpage_rule.set_attribute_modifier_function('href', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_rule)

        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level(
            [('div', 'class', 'btn-group clearfix full-width pagination-block')])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function('href', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_pages_rule)

        startpage_categories_rule = ParserRule()
        startpage_categories_rule.add_activate_rule_level([('ul', 'class', 'main-nav unstyled-list subCategories')])
        startpage_categories_rule.add_process_rule_level('a', {'href'})
        startpage_categories_rule.set_attribute_modifier_function('href', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_categories_rule)

        startpage_hrefs_rule = ParserRule()
        startpage_hrefs_rule.add_activate_rule_level([('div', 'class', 'cat-menu hidden-xs')])
        startpage_hrefs_rule.add_process_rule_level('a', {'href'})
        startpage_hrefs_rule.set_attribute_filter_function('href', lambda x: '/free_porn/' in x)
        startpage_hrefs_rule.set_attribute_modifier_function('href', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_hrefs_rule)

        # Registered with the parser, but its result is not read below: the
        # video URL is taken from the raw page text instead.
        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('body', '', '')])
        video_rule.add_process_rule_level('script', {''})
        video_rule.set_attribute_filter_function('data', lambda text: 'var urls' in text)
        parser.add_rule(video_rule)

        # Gallery images; '/thumbs/' is dropped from the src path.
        gallery_rule = ParserRule()
        gallery_rule.add_activate_rule_level([('div', 'id', 'galleryImages')])
        gallery_rule.add_process_rule_level('a', {})
        gallery_rule.add_process_rule_level('img', {'src'})
        gallery_rule.set_attribute_modifier_function('src', lambda txt: txt.replace('/thumbs/', '/'))
        parser.add_rule(gallery_rule)

        gallery_href_rule = ParserRule()
        gallery_href_rule.add_activate_rule_level([('div', 'class', 'video-player-list tag-list-block')])
        gallery_href_rule.add_process_rule_level('a', {'href', 'title'})
        gallery_href_rule.set_attribute_modifier_function('href', lambda x: (self.get_href(x, base_url)))
        parser.add_rule(gallery_href_rule)

        gallery_user_rule = ParserRule()
        gallery_user_rule.add_activate_rule_level([('div', 'class', 'video-player-info row')])
        gallery_user_rule.add_process_rule_level('a', {'href'})
        parser.add_rule(gallery_user_rule)

        # Feed the parser line by line and keep a space-stripped copy of the
        # page for the plain-text player lookup below.  The context manager
        # closes the file even if parser.feed raises.
        compact_lines = []
        with open(fname, encoding='utf-8', errors='ignore') as page:
            for line in page:
                parser.feed(line)
                compact_lines.append(line.replace(' ', ''))
        compact_text = ''.join(compact_lines)

        result = ParseResult()

        if 'urls.push({' in compact_text:
            # Take the file:"..." value of the first urls.push({...}) call.
            video_url = compact_text.partition('urls.push({')[2].partition('"});')[0].partition('file:"')[2]
            video = MediaData(URL(video_url + '*'))

            result.set_type('video')
            result.set_video(video)

            if gallery_user_rule.is_result():
                user_name = gallery_user_rule.get_result()[0]['data'].strip()
                # The user number is whatever follows the last '-' of the href.
                user_number = gallery_user_rule.get_result()[0]['href'].rpartition('-')[2].rstrip('/')

                result.add_control(ControlInfo('"' + user_name + '"',
                                               URL('http://shockingmovies.com/uploads-by-user/' + user_number + '/')))

            for f in gallery_href_rule.get_result(['href']):
                label = f['data'].strip().strip(',')
                if label == '':
                    label = f['title']

                result.add_control(ControlInfo(label, URL(f['href'])))

            return result

        if gallery_rule.is_result():
            result.set_type('pictures')
            # The first picture's URL determines the gallery directory.
            url = URL(gallery_rule.get_result()[0]['src'] + '*')
            base_dir = url.get_path(base=Setting.base_dir)
            result.set_gallery_path(base_dir)
            for f in gallery_rule.get_result():
                picture = FullPictureInfo(abs_href=URL(f['src'] + '*'), rel_name=f['src'].rpartition('/')[2])
                picture.set_base(base_dir)
                result.add_full(picture)

            for f in gallery_href_rule.get_result(['href']):
                label = f['data'].strip()
                if label == '':
                    label = f['title']
                if '/user/' in f['href']:
                    # A user link yields two controls: uploads and photo uploads.
                    split = f['href'].rpartition('-')
                    base = split[0].partition('/user/')[0]
                    result.add_control(ControlInfo(label + ' videos', URL(base + '/uploads-by-user/' + split[2])))
                    result.add_control(
                        ControlInfo(label + ' gals', URL(base + '/uploads-by-user/' + split[2] + '?photos=1')))
                else:
                    result.add_control(ControlInfo(label, URL(f['href'])))

            return result

        if startpage_rule.is_result():
            result.set_type('hrefs')

            for item in startpage_rule.get_result(['href']):
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']), href=URL(item['href']), popup=item.get('alt', '')))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                href = item['href']
                # Page caption is the text between '/page' and the last '.'.
                page_number = href.rpartition('/page')[2].rpartition('.')[0]
                result.add_page(ControlInfo(page_number, URL(href)))

            if len(startpage_categories_rule.get_result(['href'])) > 0:
                for item in startpage_categories_rule.get_result(['href', 'data']):
                    result.add_control(ControlInfo(item.get('data', ''), URL(item['href'])))

            if len(startpage_hrefs_rule.get_result(['href'])) > 0:
                for item in startpage_hrefs_rule.get_result(['href', 'data']):
                    result.add_control(ControlInfo(item.get('data', ''), URL(item['href'])))

        return result
Пример #7
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse the page saved in ``fname`` and return a ``ParseResult``.

        Two page kinds are recognised, checked in this order:

        * If ``picture_trigger_rule`` collected anything, the result type is
          ``'pictures'``: every ``picture_rule`` item whose ``class`` is
          ``'fancybox'`` becomes a full picture named ``001.jpg``,
          ``002.jpg``, ... and ``picture_href_rule`` items become controls.
        * Otherwise, if ``startpage_rule`` collected anything, the result
          type is ``'hrefs'``: items whose ``class`` is ``'thumb'`` become
          thumbnails, ``site_rule`` items become site links and
          ``startpage_pages_rule`` items become page links.

        :param fname: path of the file that is fed to the parser line by line.
        :param base_url: URL of the page; only its network location is used,
            as the prefix for links found on the page.
        :return: the ``ParseResult``; it is returned untouched when neither
            rule produced results.
        """
        # Element 1 of the urlparse() tuple is the network location.
        site_url = 'http://' + urlparse(base_url.get())[1].strip('/')
        print('site url=', site_url)

        parser = SiteParser()

        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([('div', 'class', 'bodycontainer')])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level('img', {'src', 'alt', 'class'})
        startpage_rule.set_attribute_modifier_function('href', lambda x: site_url + get_href(x))
        parser.add_rule(startpage_rule)

        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([('td', 'align', 'right')])
        startpage_pages_rule.add_process_rule_level('a', {'href', 'title'})
        startpage_pages_rule.set_attribute_modifier_function('href', lambda x: site_url + x)
        parser.add_rule(startpage_pages_rule)

        site_rule = ParserRule()
        site_rule.add_activate_rule_level([('div', 'class', 'headerlinetext')])
        site_rule.add_process_rule_level('a', {'href'})
        parser.add_rule(site_rule)

        # Used only as a marker: any result means "this is a gallery page".
        picture_trigger_rule = ParserRule()
        picture_trigger_rule.add_activate_rule_level([('a', 'class', 'fancybox')])
        picture_trigger_rule.add_process_rule_level('img', {'src'})
        parser.add_rule(picture_trigger_rule)

        picture_rule = ParserRule()
        picture_rule.add_activate_rule_level([('div', 'class', 'bodycontainer')])
        picture_rule.add_process_rule_level('a', {'href', 'class'})
        picture_rule.add_process_rule_level('img', {'alt'})
        parser.add_rule(picture_rule)

        picture_href_rule = ParserRule()
        picture_href_rule.add_activate_rule_level([('div', 'class', 'bodycontainer')])
        picture_href_rule.add_activate_rule_level([('h2', 'style', 'font-size:18px')])
        picture_href_rule.add_process_rule_level('a', {'href'})
        picture_href_rule.set_attribute_modifier_function('href', lambda x: site_url + x)
        parser.add_rule(picture_href_rule)

        # Close the file deterministically, even if parser.feed() raises.
        with open(fname) as page:
            for line in page:
                parser.feed(line)

        result = ParseResult()

        if len(picture_trigger_rule.get_result()) > 0:
            result.set_type('pictures')
            fancybox_links = (f for f in picture_rule.get_result(['href', 'alt', 'class'])
                              if f['class'] == 'fancybox')
            # Pictures are numbered from 1 in the order they were collected.
            for number, f in enumerate(fancybox_links, start=1):
                result.add_full(
                    FullPictureInfo(abs_href=URL(f['href']),
                                    rel_name='%03d.jpg' % number))

            for item in picture_href_rule.get_result(['href', 'data']):
                result.add_control(
                    ControlInfo(text=item['data'], url=URL(item['href'])))

            return result

        if len(startpage_rule.get_result()) > 0:
            result.set_type('hrefs')
            for item in startpage_rule.get_result(['href', 'src', 'class']):
                if item['class'] == 'thumb':
                    result.add_thumb(
                        ThumbInfo(thumb_url=URL(item['src']),
                                  href=URL(item['href']),
                                  popup=item.get('alt', '')))

            for item in site_rule.get_result(['href', 'data']):
                result.add_site(ControlInfo(item['data'], URL(item['href'])))

            for item in startpage_pages_rule.get_result(['href', 'data', 'title']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

        return result
Пример #8
0
    def parse_index_file(self, fname, base_url=URL()):
        """Build a ``ParseResult`` from the page stored in ``fname``.

        When the scene rule has results, each scene becomes a control
        labelled ``Scene N`` (with ``(this)`` appended for the scene whose
        link ``base_url`` contains), the ``data-src`` of that scene is used
        as the video source, and the links of the ``info`` list are added as
        controls.  Otherwise, when the thumbnail rule has results, thumbnails,
        page links and menu links are added.

        :param fname: file handed to ``self.proceed_parcing`` with the parser.
        :param base_url: URL of the page, used to resolve links.
        :return: the ``ParseResult`` that was filled in.
        """
        def absolute(href):
            return self.get_href(href, base_url)

        parser = SiteParser()

        thumbs = ParserRule()
        thumbs.add_activate_rule_level([
            ('div', 'class', 'fancy-thumbnails-container'),
            ('div', 'class', 'fancy-thumbnails-container inner-content'),
            ('div', 'class', 'dvd-cover-inner'),
        ])
        thumbs.add_process_rule_level('a', {'href'})
        thumbs.add_process_rule_level('img', {'src'})
        thumbs.set_attribute_filter_function('src', lambda src: '.jpg' in src)
        thumbs.set_attribute_modifier_function('href', absolute)
        parser.add_rule(thumbs)

        pages = ParserRule()
        pages.add_activate_rule_level([('ul', 'class', 'pagination')])
        pages.add_process_rule_level('a', {'href'})
        pages.set_attribute_modifier_function('href', absolute)
        parser.add_rule(pages)

        menu_links = ParserRule()
        menu_links.add_activate_rule_level([('ul', 'class', 'dropdown-menu columns')])
        menu_links.add_process_rule_level('a', {'href'})
        menu_links.set_attribute_modifier_function('href', absolute)
        parser.add_rule(menu_links)

        scenes = ParserRule()
        scenes.add_activate_rule_level([('div', 'class', 'scene')])
        scenes.add_process_rule_level('a', {'href'})
        scenes.add_process_rule_level('video', {'data-src'})
        parser.add_rule(scenes)

        info_links = ParserRule()
        info_links.add_activate_rule_level([('ul', 'class', 'info')])
        info_links.add_process_rule_level('a', {'href'})
        info_links.set_attribute_filter_function('href', lambda href: '#' not in href)
        info_links.set_attribute_modifier_function('href', absolute)
        parser.add_rule(info_links)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        if scenes.is_result():
            # Stays empty unless one of the scenes matches base_url.
            current_source = ''
            for number, scene in enumerate(scenes.get_result(), start=1):
                print(scene)
                caption = 'Scene {0}'.format(number)
                if base_url.contain(scene['href']):
                    current_source = scene['data-src']
                    caption += '(this)'
                result.add_control(ControlInfo(caption, URL(absolute(scene['href']))))

            result.set_video(MediaData(URL(current_source)))

            for link in info_links.get_result(['data', 'href']):
                result.add_control(ControlInfo(link['data'].strip(), URL(link['href'])))
            return result

        if not thumbs.is_result():
            return result

        for thumb in thumbs.get_result(['href']):
            preview = ThumbInfo(thumb_url=URL(thumb['src']),
                                href=URL(thumb['href']),
                                popup=thumb.get('alt', ''))
            result.add_thumb(preview)

        for page in pages.get_result(['href', 'data']):
            result.add_page(ControlInfo(page['data'], URL(page['href'])))

        for link in menu_links.get_result(['href', 'data']):
            result.add_control(ControlInfo(link['data'], URL(link['href'])))

        return result
Пример #9
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse the page saved in ``fname`` and return a ``ParseResult``.

        If ``startpage_rule`` collected anything the result type is set to
        ``'hrefs'`` and thumbnails, side-bar controls (only links starting
        with ``/``) and page links are added.  Independently of that, if
        ``picture_rule`` collected anything the type is set to ``'pictures'``
        and every collected image is added as a full picture named
        ``001.jpg``, ``002.jpg``, ...  There is no early return, so when both
        rules match, the final type is ``'pictures'``.

        :param fname: path of the file that is fed to the parser line by line.
        :param base_url: URL of the page; its ``domain()`` is used as the
            prefix for links found on the page.
        :return: the ``ParseResult``; it is returned untouched when neither
            rule produced results.
        """
        print(base_url.get(), base_url.domain())
        parser = SiteParser()

        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([('div', 'class', 'ogpost'),
                                                ('div', 'class', 'post300'),
                                                ('div', 'class', 'galelement')])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level('img', {'src'})
        startpage_rule.set_attribute_modifier_function(
            'href', lambda x: get_href(x, base_url.domain()))
        parser.add_rule(startpage_rule)

        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([('span', 'class', 'pager'),
                                                      ('div', 'class', 'pager')])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function(
            'href', lambda x: base_url.domain() + x)
        parser.add_rule(startpage_pages_rule)

        startpage_href_rule = ParserRule()
        startpage_href_rule.add_activate_rule_level([('div', 'id', 'right')])
        startpage_href_rule.add_activate_rule_level([('div', 'class', 'rightbox')])
        startpage_href_rule.add_process_rule_level('a', {'href'})
        parser.add_rule(startpage_href_rule)

        picture_rule = ParserRule()
        picture_rule.add_activate_rule_level([('div', 'class', 'galcontentpics')])
        picture_rule.add_process_rule_level('a', set())
        picture_rule.add_process_rule_level('img', {'src'})
        # NOTE(review): _del_thumb presumably maps a thumbnail URL to the
        # full-size picture URL -- confirm against its definition.
        picture_rule.set_attribute_modifier_function('src', _del_thumb)
        parser.add_rule(picture_rule)

        # Close the file deterministically, even if parser.feed() raises.
        with open(fname) as page:
            for line in page:
                parser.feed(line)

        result = ParseResult()

        if len(startpage_rule.get_result()) > 0:
            result.set_type('hrefs')
            for item in startpage_rule.get_result(['href', 'src']):
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']),
                              href=URL(item['href'] + '*'),
                              popup=item.get('alt', '')))

            for item in startpage_href_rule.get_result(['href', 'data']):
                # This rule has no href modifier, so the domain is prefixed
                # here, and only for links that start with '/'.
                if item['href'].startswith('/'):
                    result.add_control(
                        ControlInfo(item['data'],
                                    URL(base_url.domain() + item['href'] + '*')))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(
                    ControlInfo(item['data'], URL(item['href'] + '*')))

        if len(picture_rule.get_result()) > 0:
            result.set_type('pictures')
            # Pictures are numbered from 1 in the order they were collected.
            for number, f in enumerate(picture_rule.get_result(['src']), start=1):
                result.add_full(
                    FullPictureInfo(abs_href=URL(f['src']),
                                    rel_name='%03d.jpg' % number))

        return result
Пример #10
0
    def parse_index_file(self, fname, base_url=URL()):
        """Build a ``ParseResult`` from the page stored in ``fname``.

        The first matching case wins:

        1. The player rule has results: the text between ``video_url:'`` and
           the next quote of the first collected script (spaces removed) is
           used as the video URL and the type is set to ``'video'``.
        2. The photo rule has results: the type is set to ``'pictures'`` and
           every ``data-image`` becomes a full picture inside a gallery
           directory derived from ``base_url``.
        3. The thumbnail rule has results: the type is set to ``'hrefs'`` and
           thumbnails, page links and category links are added.

        Cases 1 and 2 also add uploader and tag controls.

        :param fname: file handed to ``self.proceed_parcing`` with the parser.
        :param base_url: URL of the page, used to resolve links.
        :return: the ``ParseResult`` that was filled in.
        """
        def absolute(href):
            return self.get_href(href, base_url)

        parser = SiteParser()

        thumbs = ParserRule()
        thumbs.add_activate_rule_level([('ul', 'class', 'thumbs-items'),
                                        ('ul', 'class', 'thumbs-albums'),
                                        ('ul', 'class', 'thumbs-categories')])
        thumbs.add_process_rule_level('a', {'href'})
        thumbs.add_process_rule_level('img', {'data-original', 'alt'})
        thumbs.set_attribute_modifier_function('href', absolute)
        parser.add_rule(thumbs)

        pages = ParserRule()
        pages.add_activate_rule_level([('div', 'class', 'pagination')])
        pages.add_process_rule_level('a', {'href'})
        pages.set_attribute_modifier_function('href', absolute)
        parser.add_rule(pages)

        categories = ParserRule()
        categories.add_activate_rule_level([('div', 'class', 'list-categories')])
        categories.add_process_rule_level('a', {'href', 'title'})
        categories.set_attribute_modifier_function('href', absolute)
        parser.add_rule(categories)

        player = ParserRule()
        player.add_activate_rule_level([('div', 'class', 'player-holder')])
        player.add_process_rule_level('script', {})
        player.set_attribute_filter_function('data', lambda text: 'video_url:' in text)
        parser.add_rule(player)

        tags = ParserRule()
        tags.add_activate_rule_level([('div', 'class', 'specification')])
        tags.add_process_rule_level('a', {'href'})
        tags.set_attribute_modifier_function('href', absolute)
        parser.add_rule(tags)

        uploader = ParserRule(collect_data=True)
        uploader.add_activate_rule_level([('div', 'class', 'user-info')])
        uploader.add_process_rule_level('a', {'href', 'title'})
        parser.add_rule(uploader)

        photos = ParserRule()
        photos.add_activate_rule_level([('div', 'class', 'ad-thumbs')])
        photos.add_process_rule_level('a', {'data-image'})
        photos.set_attribute_modifier_function('data-image', absolute)
        parser.add_rule(photos)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        def add_uploader_and_tags():
            # Uploader links with a non-empty title yield two controls each.
            if uploader.is_result(['href']):
                for entry in uploader.get_result(['href']):
                    username = entry['title']
                    if username == '':
                        continue
                    result.add_control(
                        ControlInfo('"' + username + ' videos"',
                                    URL(entry['href'] + 'public_videos/')))
                    result.add_control(
                        ControlInfo('"' + username + ' photos"',
                                    URL(entry['href'] + 'albums/')))

            for link in tags.get_result(['data', 'href']):
                result.add_control(ControlInfo(link['data'], URL(link['href'])))

        if player.is_result():
            script = player.get_result()[0]['data'].replace(' ', '')
            after_key = script.partition("video_url:'")[2]
            url = after_key.partition("'")[0].rstrip('/')
            print(url)

            video = MediaData(URL(url))
            result.set_type('video')
            result.set_video(video)

            add_uploader_and_tags()
            return result

        if photos.is_result():
            result.set_type('pictures')
            root = base_url.get_path(base=Setting.base_dir)
            page_name = base_url.get().rpartition('/')[2]
            base_dir = root + page_name + '/'
            result.set_gallery_path(base_dir)

            for photo in photos.get_result():
                image = photo['data-image']
                picture = FullPictureInfo(abs_href=URL(image),
                                          rel_name=image.rpartition('/')[2].strip('*'))
                picture.set_base(base_dir)
                result.add_full(picture)

            add_uploader_and_tags()
            return result

        if not thumbs.is_result():
            return result

        result.set_type('hrefs')

        for thumb in thumbs.get_result(['href', 'data-original']):
            href = thumb['href']
            # Popup text comes from the second-to-last '/'-separated piece.
            label = href.split('/')[-2].upper().replace('-', ' ')
            result.add_thumb(ThumbInfo(thumb_url=URL(thumb['data-original']),
                                       href=URL(href),
                                       popup=label))

        for page in pages.get_result(['href', 'data']):
            result.add_page(ControlInfo(page['data'], URL(page['href'])))

        if len(categories.get_result(['href'])) > 0:
            for category in categories.get_result(['href', 'title']):
                text = category.get('title', category.get('data', ''))
                result.add_control(ControlInfo(text, URL(category['href'])))

        return result
Пример #11
0
    def parse_index_file(self, fname, base_url=URL()):
        """Build a ``ParseResult`` from the page stored in ``fname``.

        The first matching case wins:

        1. The video rule has results: every ``source`` ``src`` is added to a
           ``UrlList`` under the label ``'default'`` and the list's media data
           becomes the video; uploader and tag controls are added.
        2. The gallery rule has results: the type is set to ``'pictures'``,
           every image becomes a full picture, and tag links become controls
           (links containing ``/user/`` yield a "videos" and a "gals"
           control).
        3. The thumbnail rule has results: thumbnails, page links, category
           links and menu links are added.

        :param fname: file handed to ``self.proceed_parcing`` with the parser.
        :param base_url: URL of the page, used to resolve links.
        :return: the ``ParseResult`` that was filled in.
        """
        def absolute(href):
            return self.get_href(href, base_url)

        def label_of(link):
            # Fall back to the title attribute when the link text is blank.
            return link['data'].strip() or link['title']

        parser = SiteParser()

        thumbs = ParserRule()
        thumbs.add_activate_rule_level([('div', 'class', 'video-item compact')])
        thumbs.add_process_rule_level('a', {'href'})
        thumbs.add_process_rule_level('img', {'src', 'alt'})
        thumbs.set_attribute_modifier_function('href', absolute)
        parser.add_rule(thumbs)

        pages = ParserRule()
        pages.add_activate_rule_level([('ul', 'class', 'pagination')])
        pages.add_process_rule_level('a', {'href'})
        pages.set_attribute_modifier_function('href', absolute)
        parser.add_rule(pages)

        categories = ParserRule()
        categories.add_activate_rule_level([('nav', 'class', 'video-categories')])
        categories.add_process_rule_level('a', {'href'})
        categories.set_attribute_modifier_function('href', absolute)
        parser.add_rule(categories)

        menu_links = ParserRule()
        menu_links.add_activate_rule_level([('div', 'class', 'cat-menu hidden-xs')])
        menu_links.add_process_rule_level('a', {'href'})
        menu_links.set_attribute_filter_function('href', lambda href: '/free_porn/' in href)
        menu_links.set_attribute_modifier_function('href', absolute)
        parser.add_rule(menu_links)

        video_sources = ParserRule()
        video_sources.add_activate_rule_level([('video', '', '')])
        video_sources.add_process_rule_level('source', {'src'})
        video_sources.set_attribute_modifier_function('src', lambda src: src + '*')
        parser.add_rule(video_sources)

        gallery = ParserRule()
        gallery.add_activate_rule_level([('div', 'id', 'galleryImages')])
        gallery.add_process_rule_level('a', {})
        gallery.add_process_rule_level('img', {'src'})
        gallery.set_attribute_modifier_function('src', lambda src: src.replace('/thumbs/', '/'))
        parser.add_rule(gallery)

        tags = ParserRule()
        tags.add_activate_rule_level([('div', 'class', 'tags')])
        tags.add_process_rule_level('a', {'href', 'title'})
        tags.set_attribute_modifier_function('href', absolute)
        parser.add_rule(tags)

        uploader = ParserRule()
        uploader.add_activate_rule_level([('div', 'class', 'uploaded')])
        uploader.add_process_rule_level('a', {'href'})
        parser.add_rule(uploader)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        if video_sources.is_result():
            urls = UrlList()
            for source in video_sources.get_result():
                urls.add('default', URL(source['src']))
            result.set_video(urls.get_media_data(-1))

            if uploader.is_result():
                # The user name is whatever follows the last '/' of the link.
                user = uploader.get_result()[0]['href'].rpartition('/')[2]
                user_videos = URL('http://www.heavy-r.com/user/' + user + '?pro=videos*')
                result.add_control(ControlInfo('"' + user + '"', user_videos))

            for link in tags.get_result(['href']):
                result.add_control(ControlInfo(label_of(link), URL(link['href'])))

            return result

        if gallery.is_result():
            result.set_type('pictures')
            first_src = gallery.get_result()[0]['src']
            base_dir = URL(first_src + '*').get_path(base=Setting.base_dir)
            result.set_gallery_path(base_dir)

            for image in gallery.get_result():
                src = image['src']
                picture = FullPictureInfo(abs_href=URL(src + '*'),
                                          rel_name=src.rpartition('/')[2])
                picture.set_base(base_dir)
                result.add_full(picture)

            for link in tags.get_result(['href']):
                label = label_of(link)
                href = link['href']
                if '/user/' not in href:
                    result.add_control(ControlInfo(label, URL(href)))
                    continue

                # Split at the last '-': head holds the site part, tail the id.
                head, _, user_id = href.rpartition('-')
                uploads = head.partition('/user/')[0] + '/uploads-by-user/' + user_id
                result.add_control(ControlInfo(label + ' videos', URL(uploads)))
                result.add_control(ControlInfo(label + ' gals', URL(uploads + '?photos=1')))

            return result

        if not thumbs.is_result():
            return result

        for thumb in thumbs.get_result(['href']):
            result.add_thumb(ThumbInfo(thumb_url=URL(thumb['src']),
                                       href=URL(thumb['href']),
                                       popup=thumb.get('alt', '')))

        for page in pages.get_result(['href', 'data']):
            result.add_page(ControlInfo(page['data'], URL(page['href'])))

        if len(categories.get_result(['href'])) > 0:
            for category in categories.get_result(['href', 'data']):
                result.add_control(ControlInfo(category.get('data', ''), URL(category['href'])))

        if len(menu_links.get_result(['href'])) > 0:
            for link in menu_links.get_result(['href', 'data']):
                result.add_control(ControlInfo(link.get('data', ''), URL(link['href'])))

        return result
Пример #12
0
    def parse_index_file(self, fname, base_url=URL()):
        """Build a ``ParseResult`` from the page stored in ``fname``.

        When the video rule has results, each collected script (spaces
        removed) is cut between ``streams:[{`` and ``}]``, split on ``},{``,
        and every piece contributes its ``id`` and ``url`` to a ``UrlList``
        whose media data becomes the video; uploader, actor and tag controls
        follow.  Otherwise, when the thumbnail rule has results, thumbnails,
        page links and filter links are added; thumbnails that link to
        ``/channels/`` or ``/pornstars/`` get a caption.

        :param fname: file handed to ``self.proceed_parcing`` with the parser.
        :param base_url: URL of the page, used to resolve links.
        :return: the ``ParseResult`` that was filled in.
        """
        def star_get_url(txt=''):
            # Return the text between the first '(' and the following ')'.
            inside = txt.partition('(')[2]
            return inside.partition(')')[0]

        def absolute(href):
            return self.get_href(href, base_url)

        def with_domain(path):
            return base_url.domain() + path + '*'

        def absolute_videos(href):
            return self.get_href(href + '/videos', base_url)

        parser = SiteParser()

        thumbs = ParserRule(debug=False)
        thumbs.add_activate_rule_level([
            ('div', 'class', 'main l170'),
            ('div', 'class', 'main l200'),
            ('div', 'class', 'main'),
            ('div', 'class', 'profileRight'),
            ('div', 'class', 'main l200 r300'),
        ])
        thumbs.add_activate_rule_level([
            ('ul', 'class', 'listThumbs'),
            ('ul', 'class', 'listProfiles'),
            ('ul', 'class', 'listChannels'),
            ('ul', 'class', 'listGalleries'),
        ])
        thumbs.add_process_rule_level('a', {'href', 'class', 'style'})
        thumbs.add_process_rule_level('img', {'src', 'alt'})
        thumbs.set_attribute_modifier_function('href', with_domain)
        thumbs.set_attribute_modifier_function('style', star_get_url)
        thumbs.set_attribute_filter_function('href', lambda href: '/pictures/' not in href)
        parser.add_rule(thumbs)

        pages = ParserRule()
        pages.add_activate_rule_level([('div', 'class', 'pager')])
        pages.add_process_rule_level('a', {'href'})
        pages.set_attribute_modifier_function('href', with_domain)
        parser.add_rule(pages)

        filters = ParserRule()
        filters.add_activate_rule_level([
            ('ul', 'class', 'sFilters initial'),
            ('ul', 'class', 'sFilters'),
            ('div', 'class', 'listSearches searchOption'),
            ('div', 'class', 'alpha'),
        ])
        filters.add_process_rule_level('a', {'href', 'title'})
        filters.set_attribute_modifier_function('href', absolute)
        filters.set_attribute_filter_function('title', lambda title: 'Combine Category' not in title)
        parser.add_rule(filters)

        streams = ParserRule()
        streams.add_activate_rule_level([('head', '', '')])
        streams.add_process_rule_level('script', {})
        streams.set_attribute_filter_function('data', lambda text: 'streams:[' in text)
        parser.add_rule(streams)

        tags = ParserRule()
        tags.add_activate_rule_level([('p', 'class', 'source tags'),
                                      ('p', 'class', 'source categories')])
        tags.add_process_rule_level('a', {'href'})
        tags.set_attribute_modifier_function('href', absolute)
        parser.add_rule(tags)

        uploader = ParserRule()
        uploader.add_activate_rule_level([('p', 'class', 'source')])
        uploader.add_process_rule_level('a', {'href'})
        uploader.set_attribute_filter_function('href', lambda href: '/profile/' in href)
        uploader.set_attribute_modifier_function('href', absolute_videos)
        parser.add_rule(uploader)

        actors = ParserRule()
        actors.add_activate_rule_level([('p', 'class', 'source')])
        actors.add_process_rule_level('a', {'href'})
        actors.set_attribute_filter_function('href', lambda href: '/pornstars/' in href)
        actors.set_attribute_modifier_function('href', absolute_videos)
        parser.add_rule(actors)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        if streams.is_result():
            urls = UrlList()
            for found in streams.get_result():
                script = found['data'].replace(' ', '')
                stream_block = self.quotes(script, 'streams:[{', '}]')
                for piece in stream_block.split('},{'):
                    quality = self.quotes(piece, 'id:"', '"')
                    location = self.quotes(piece, 'url:"', '"')
                    urls.add(quality, URL(location + '*'))

            result.set_video(urls.get_media_data(-1))

            for link in uploader.get_result(['href']):
                result.add_control(ControlInfo('"' + link['data'] + '"', URL(link['href'])))

            for link in actors.get_result(['href']):
                result.add_control(ControlInfo(link['data'], URL(link['href'])))

            for link in tags.get_result(['data', 'href']):
                result.add_control(ControlInfo(link['data'], URL(link['href'])))
            return result

        if not thumbs.is_result():
            return result

        for thumb in thumbs.get_result(['href', 'src']):
            href = thumb['href']
            caption = ''
            if '/channels/' in href or '/pornstars/' in href:
                result.set_caption_visible(True)
                # Without an alt text, derive a caption from the last URL part.
                from_url = href.rpartition('/')[2].strip('*').replace('-', ' ').title()
                caption = thumb.get('alt', from_url)
            result.add_thumb(ThumbInfo(thumb_url=URL(thumb['src']), href=URL(href), popup=caption))

        for page in pages.get_result(['href', 'data']):
            result.add_page(ControlInfo(page['data'], URL(page['href'])))

        for link in filters.get_result(['href']):
            text = link.get('title', link.get('data', ''))
            result.add_control(ControlInfo(text, URL(link['href'])))

        return result
Пример #13
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse a saved page into a ParseResult (video page or thumb list).

        Registers rules for thumbnails, pagination, left-menu links, the
        jwplayer script, content tags and the uploader card, runs the parser
        over ``fname`` and then fills the result from whichever rule matched.

        :param fname: path of the downloaded page, passed to
            ``self.proceed_parcing``.
        :param base_url: URL of the page; used to turn hrefs into full links.
        :return: a ParseResult. It is returned unfilled when neither the video
            rule nor the thumbnail rule matched, or when the player script
            contains neither a ``sources:`` nor a ``filefallback':`` entry.
        """
        parser = SiteParser()

        # Thumbnails of the listing page.
        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([('div', 'class',
                                                 'thumb vidItem')])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level('img', {'src', 'alt'})
        parser.add_rule(startpage_rule)

        # Pagination links; the '*' suffix is appended to every page href.
        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([('div', 'class',
                                                       'pagination')])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function(
            'href', lambda x: base_url.domain() + x + '*')
        parser.add_rule(startpage_pages_rule)

        # Left-menu links, restricted to hrefs containing '/videos/'.
        startpage_hrefs_rule = ParserRule()
        startpage_hrefs_rule.add_activate_rule_level([('ul', 'class',
                                                       'left-menu-box')])
        startpage_hrefs_rule.add_process_rule_level('a', {'href'})
        startpage_hrefs_rule.set_attribute_filter_function(
            'href', lambda x: '/videos/' in x)
        startpage_hrefs_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_hrefs_rule)

        # Player <script> blocks; only those mentioning 'jwplayer' are kept.
        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('div', 'class',
                                             'block videoDetail vidItem')])
        video_rule.add_process_rule_level('script', {})
        video_rule.set_attribute_filter_function(
            'data', lambda text: 'jwplayer' in text)
        parser.add_rule(video_rule)

        # Tag links shown on a video page.
        gallery_href_rule = ParserRule()
        gallery_href_rule.add_activate_rule_level([('div', 'class',
                                                    'content-tags')])
        gallery_href_rule.add_process_rule_level('a', {'href'})
        gallery_href_rule.set_attribute_modifier_function(
            'href', lambda x: base_url.domain() + x)
        parser.add_rule(gallery_href_rule)

        # Uploader link (href comes from 'user-card') ...
        gallery_user_rule = ParserRule()
        gallery_user_rule.add_activate_rule_level([('div', 'class',
                                                    'user-card')])
        gallery_user_rule.add_process_rule_level('a', {'href'})
        gallery_user_rule.add_process_rule_level('span', {'class'})
        gallery_user_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        parser.add_rule(gallery_user_rule)

        # ... and uploader name (text comes from 'user-data').
        gallery_user_name_rule = ParserRule()
        gallery_user_name_rule.add_activate_rule_level([('div', 'class',
                                                         'user-data')])
        gallery_user_name_rule.add_process_rule_level('span', {'class'})
        parser.add_rule(gallery_user_name_rule)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        player_scripts = video_rule.get_result()
        if player_scripts:
            # Only the first matching script is inspected.
            script = player_scripts[0]['data'].replace('\t', '').replace(
                '\n', '')

            if 'sources:' in script:
                # Take the first file: "..." entry of the sources list.
                sources = script.partition('sources:')[2].partition(']')[0]
                video_href = sources.partition('file: "')[2].partition(
                    '",')[0].strip('"').replace(' ', '%20')
            elif "filefallback':" in script:
                video_href = script.replace(
                    ' ', '').partition("filefallback':\"")[2].partition(
                        '",')[0]
            else:
                return result

            result.set_type('video')
            result.set_video(MediaData(URL(video_href)))

            # The uploader control needs both the link and the name; a page
            # without a user card simply gets no uploader control.
            user_links = gallery_user_rule.get_result(['href'])
            user_names = gallery_user_name_rule.get_result(['data'])
            if user_links and user_names:
                result.add_control(
                    ControlInfo('"' + user_names[0]['data'] + '"',
                                URL(user_links[0]['href'])))

            for tag in gallery_href_rule.get_result(['data', 'href']):
                result.add_control(ControlInfo(tag['data'], URL(tag['href'])))
            return result

        if startpage_rule.get_result():
            result.set_type('hrefs')

            for item in startpage_rule.get_result(['href', 'src']):
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src'].replace(' ', '%20')),
                              href=URL(item['href']),
                              popup=item.get('alt', '')))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

            for item in startpage_hrefs_rule.get_result(['href', 'data']):
                result.add_control(
                    ControlInfo(item['data'], URL(item['href'])))

        return result
Пример #14
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse a saved page into a ParseResult.

        Three page shapes are recognised, checked in this order:

        1. a single-video page (``<video src=...>`` inside the
           ``itemprop="video"`` div);
        2. a multi-part video page, where the part is selected by the
           ``?s=<n>`` suffix of ``base_url`` and the real media link is
           fetched with a POST to ``/php/get_vlink.php``;
        3. a listing page with thumbnails, pagination and menu links.

        :param fname: path of the downloaded page, passed to
            ``self.proceed_parcing``.
        :param base_url: URL of the page; used to resolve hrefs and to read
            the ``?s=`` part selector.
        :return: a ParseResult; unfilled when no rule matched.
        """
        parser = SiteParser()

        # Plain thumbnails of the listing page.
        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([('div', 'class',
                                                 'vid_container')])
        startpage_rule.add_process_rule_level('img', {'src'})
        startpage_rule.add_process_rule_level('a', {'href', 'title'})
        startpage_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_rule)

        # "Combo" thumbnails of the listing page.
        startpage_combo_rule = ParserRule()
        startpage_combo_rule.add_activate_rule_level([('div', 'class',
                                                       'combo_post_wrap')])
        startpage_combo_rule.add_process_rule_level('a', {'href', 'title'})
        startpage_combo_rule.add_process_rule_level('img', {'src'})
        startpage_combo_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_combo_rule)

        # Pagination links.
        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([('div', 'id',
                                                       'center_control')])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_pages_rule)

        # Drop-down menu links.
        startpage_hrefs_rule = ParserRule()
        startpage_hrefs_rule.add_activate_rule_level([
            ('ul', 'class', 'dropdown-menu columns')
        ])
        startpage_hrefs_rule.add_process_rule_level('a', {'href'})
        startpage_hrefs_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_hrefs_rule)

        # Single-video page: direct <video src>.
        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('div', 'itemprop', 'video')])
        video_rule.add_process_rule_level('video', {'src'})
        parser.add_rule(video_rule)

        # Multi-part page: one <div data-...> per part.
        video_multipart_rule = ParserRule()
        video_multipart_rule.add_activate_rule_level([('div', 'id',
                                                       'videos_container')])
        video_multipart_rule.add_process_rule_level(
            'div',
            {'data-source', 'data-hash', 'data-x', 'data-oid', 'data-pid'})
        parser.add_rule(video_multipart_rule)

        # Script that carries the usss[0]="..." value sent as 'uid'.
        video_usss_rule = ParserRule()
        video_usss_rule.add_activate_rule_level([('body', '', '')])
        video_usss_rule.add_process_rule_level('script', {})
        video_usss_rule.set_attribute_filter_function('data',
                                                      lambda x: 'usss' in x)
        parser.add_rule(video_usss_rule)

        # Category links of a video page.
        gallery_href_rule = ParserRule()
        gallery_href_rule.add_activate_rule_level([
            ('div', 'class', 'popular_block_header_rl')
        ])
        gallery_href_rule.add_process_rule_level('a', {'href'})
        gallery_href_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        parser.add_rule(gallery_href_rule)

        # Author links of a video page.
        gallery_author_rule = ParserRule()
        gallery_author_rule.add_activate_rule_level([
            ('div', 'id', 'posts_container')
        ])
        gallery_author_rule.add_activate_rule_level([
            ('div', 'class', 'post_author_name')
        ])
        gallery_author_rule.add_process_rule_level('a', {'href'})
        gallery_author_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        parser.add_rule(gallery_author_rule)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        def add_gallery_controls():
            # Shared by both video branches: quoted author links first,
            # then the plain category links.
            for f in gallery_author_rule.get_result(['data', 'href']):
                result.add_control(
                    ControlInfo('"' + f['data'].strip() + '"', URL(f['href'])))
            for f in gallery_href_rule.get_result(['data', 'href']):
                result.add_control(
                    ControlInfo(f['data'].strip(), URL(f['href'])))

        if video_rule.is_result():
            result.set_video(MediaData(URL(video_rule.get_result()[0]['src'])))
            add_gallery_controls()
            return result

        if video_multipart_rule.is_result():
            parts = video_multipart_rule.get_result()
            series = len(parts)

            # The selected part comes from the '?s=<n>' suffix of the URL.
            # A missing, malformed or out-of-range value selects part 1, so a
            # bad suffix can neither raise nor pick a part through negative
            # indexing.
            selector = base_url.get().partition('?s=')[2]
            try:
                serie = int(selector) if selector else 1
            except ValueError:
                serie = 1
            if not 1 <= serie <= series:
                serie = 1

            uid = self.quotes(
                video_usss_rule.get_result()[0]['data'].replace(' ', ''),
                'usss[0]="', '"')
            current = parts[serie - 1]

            post_data = {
                'uid': uid,
                'source': current['data-source'],
                'hash': current['data-hash'],
                'x': current['data-x'],
                'oid': current['data-oid'],
                'pid': current['data-pid']
            }

            link_request = URL(self.get_href('/php/get_vlink.php', base_url),
                               'POST',
                               post_data=post_data)

            # The response body is used as the media URL.
            response = load(link_request)

            result.set_type('video')
            result.set_video(MediaData(URL(response.text)))

            # One control per part, with the current one marked '(this)'.
            page_href = base_url.get().partition('?')[0]
            for i in range(1, series + 1):
                label = 'S{0}'.format(i)
                if i == serie:
                    label += '(this)'
                part_href = page_href + '?s={0}'.format(i)
                result.add_control(ControlInfo(label, URL(part_href + '*')))

            add_gallery_controls()
            return result

        if startpage_rule.is_result() or startpage_combo_rule.is_result():
            for item in startpage_combo_rule.get_result():
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']),
                              href=URL(item['href']),
                              popup=item.get('title', '')))

            for item in startpage_rule.get_result(['href']):
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']),
                              href=URL(item['href']),
                              popup=item.get('title', '')))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                href = item['href']
                # The page label is the last path component of the href,
                # up to its first dot.
                page_label = href.rpartition('/')[2].partition('.')[0]
                result.add_page(ControlInfo(page_label, URL(href)))

            for item in startpage_hrefs_rule.get_result(['href', 'data']):
                result.add_control(ControlInfo(item['data'],
                                               URL(item['href'])))

        return result
Пример #15
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse a saved page into a ParseResult (thumb list and/or pictures).

        Registers rules for listing thumbnails, pagination, tags, gallery
        pictures, the attached model and gallery tags, feeds ``fname`` to the
        parser line by line and fills the result from the rules that matched.
        The listing and picture sections are checked independently, so both
        may contribute to one result; the type set last wins.

        :param fname: path of the downloaded page; read as UTF-8.
        :param base_url: accepted for interface compatibility; not used here.
        :return: a ParseResult; unfilled when no rule matched.
        """
        parser = SiteParser()

        # Thumbnails of the listing page.
        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([('div', 'class',
                                                 'item photo-item')])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level('img', {'src', 'alt'})
        parser.add_rule(startpage_rule)

        # Pagination links.
        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([
            ('ul', 'class', 'justified-pagination')
        ])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        parser.add_rule(startpage_pages_rule)

        # Tag links of the listing page.
        startpage_tags_rule = ParserRule()
        startpage_tags_rule.add_activate_rule_level([('ul', 'class', 'tags')])
        startpage_tags_rule.add_process_rule_level('a', {'href'})
        parser.add_rule(startpage_tags_rule)

        # Gallery pictures; 'thumb' in the src is rewritten to 'origin'.
        picture_rule = ParserRule()
        picture_rule.add_activate_rule_level([('div', 'class', 'photo-item')])
        picture_rule.add_process_rule_level('a', set())
        picture_rule.add_process_rule_level('img', {'src'})
        picture_rule.set_attribute_modifier_function(
            'src', lambda text: text.replace('thumb', 'origin'))
        parser.add_rule(picture_rule)

        # Model attached to the gallery.
        picture_model_rule = ParserRule()
        picture_model_rule.add_activate_rule_level([('div', 'class',
                                                     'block attached-model')])
        picture_model_rule.add_process_rule_level('a', {'href'})
        picture_model_rule.add_process_rule_level('img', {'alt'})
        parser.add_rule(picture_model_rule)

        # Tag links of the gallery page.
        picture_tags_rule = ParserRule()
        picture_tags_rule.add_activate_rule_level([('div', 'class',
                                                    'block gallery-tags')])
        picture_tags_rule.add_activate_rule_level([('ul', 'class', 'tags')])
        picture_tags_rule.add_process_rule_level('a', {'href'})
        parser.add_rule(picture_tags_rule)

        # The context manager closes the file even if feed() raises.
        with open(fname, encoding='utf-8') as page:
            for line in page:
                parser.feed(line)

        result = ParseResult()

        thumbs = startpage_rule.get_result()
        if thumbs:
            result.set_type('hrefs')
            for item in thumbs:
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']),
                              href=URL(item['href']),
                              popup=item.get('alt', '')))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

            for item in startpage_tags_rule.get_result(['href', 'data']):
                result.add_control(ControlInfo(item['data'],
                                               URL(item['href'])))

        pictures = picture_rule.get_result()
        if pictures:
            result.set_type('pictures')
            # Pictures are named 001.jpg, 002.jpg, ... in result order.
            for number, picture in enumerate(pictures, start=1):
                result.add_full(
                    FullPictureInfo(abs_href=URL(picture['src']),
                                    rel_name='%03d.jpg' % number))

            for item in picture_model_rule.get_result(['href', 'alt']):
                result.add_control(
                    ControlInfo(item['alt'], URL(item['href'] + '/galleries')))

            for item in picture_tags_rule.get_result(['href', 'data']):
                result.add_control(ControlInfo(item['data'],
                                               URL(item['href'])))

        return result
Пример #16
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse a saved page into a ParseResult (video page or thumb list).

        Registers rules for teaser thumbnails, pagination, the category
        selector, the player script, group links and the profile nav, runs
        the parser over ``fname`` and fills the result from whichever rule
        matched. A video page returns early; otherwise the listing is built.

        :param fname: path of the downloaded page, passed to
            ``self.proceed_parcing``.
        :param base_url: URL of the page; used to resolve hrefs and sources.
        :return: a ParseResult; unfilled when no rule matched.
        """
        parser = SiteParser()

        # Teaser thumbnails; either article class activates the rule.
        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([
            ('article', 'class', 'teaser singleLink hasButtonRow'),
            ('article', 'class', 'activity video hasButtonFooter')
        ])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level('img',
                                              {'src', 'data-lazysrc', 'alt'})
        startpage_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        startpage_rule.set_attribute_modifier_function(
            'src', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_rule)

        # Pagination links; a link may carry 'href' and/or 'data-href'.
        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([
            ('nav', 'class', 'clearfix pagination bottom'),
            ('nav', 'class', 'range rangeCount-2 clearfix')
        ])
        startpage_pages_rule.add_process_rule_level('a', {'href', 'data-href'})
        startpage_pages_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        startpage_pages_rule.set_attribute_modifier_function(
            'data-href', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_pages_rule)

        # Category <option> entries; the option value is the link.
        startpage_categories_rule = ParserRule()
        startpage_categories_rule.add_activate_rule_level([
            ('select', 'id', 'input_selectCategories')
        ])
        startpage_categories_rule.add_process_rule_level('option', {'value'})
        startpage_categories_rule.set_attribute_modifier_function(
            'value', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_categories_rule)

        # Player <script> blocks; only those containing 'sources:' are kept.
        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('div', 'id', 'playerWrapper')])
        video_rule.add_process_rule_level('script', {})
        video_rule.set_attribute_filter_function(
            'data', lambda text: 'sources:' in text)
        parser.add_rule(video_rule)

        # Group links of a video page.
        gallery_href_rule = ParserRule()
        gallery_href_rule.add_activate_rule_level([('dl', 'class', 'group')])
        gallery_href_rule.add_process_rule_level('a', {'href'})
        gallery_href_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        parser.add_rule(gallery_href_rule)

        # Profile nav links containing '#videos'. The rule is registered but
        # its results are not turned into controls below.
        gallery_user_rule = ParserRule()
        gallery_user_rule.add_activate_rule_level([
            ('nav', 'class', 'profileNav clearfix buttonRow')
        ])
        gallery_user_rule.add_process_rule_level('a', {'href'})
        gallery_user_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        gallery_user_rule.set_attribute_filter_function(
            'href', lambda x: '#videos' in x)
        parser.add_rule(gallery_user_rule)

        self.proceed_parcing(parser, fname)

        result = ParseResult()

        if video_rule.is_result():
            urls = UrlList()
            for item in video_rule.get_result():
                # Strip spaces and backslashes, then split the
                # sources:{"label":"url","label":"url"} map into pairs.
                script = item['data'].replace(' ', '').replace('\\', '')
                pairs = self.quotes(script, 'sources:{"', '"},').split('","')
                for pair in pairs:
                    label, _, source = pair.partition('":"')
                    urls.add(label, URL(self.get_href(source, base_url)))

            result.set_video(urls.get_media_data())

            for f in gallery_href_rule.get_result(['data', 'href']):
                result.add_control(ControlInfo(f['data'], URL(f['href'])))

            return result

        if startpage_rule.is_result():
            for item in startpage_rule.get_result(['href']):
                # Prefer the lazy-load source. 'src' is only looked up when
                # it is actually needed, so lazy-only images do not raise.
                if 'data-lazysrc' in item:
                    thumb_src = item['data-lazysrc']
                else:
                    thumb_src = item['src']
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(thumb_src),
                              href=URL(item['href']),
                              popup=item.get('alt', '')))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                href = item.get('data-href', item['href'])
                # The page label is the last path component, without '*'.
                result.add_page(
                    ControlInfo(href.rpartition('/')[2].strip('*'), URL(href)))

            for item in startpage_categories_rule.get_result():
                result.add_control(
                    ControlInfo(item['data'], URL(item['value'])))

        return result
Пример #17
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse a saved page into a ParseResult (thumb list and/or pictures).

        Registers rules for listing thumbnails, pagination, gallery pictures
        and gallery tags, feeds ``fname`` to the parser line by line and fills
        the result from the rules that matched. The listing and picture
        sections are checked independently, so both may contribute to one
        result; the type set last wins.

        :param fname: path of the downloaded page; opened with the platform
            default text encoding.
        :param base_url: URL of the page; used to resolve hrefs.
        :return: a ParseResult; unfilled when no rule matched.
        """
        # Trace which site is being parsed.
        print(base_url.domain())

        parser = SiteParser()

        # Thumbnails of the listing page.
        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([('div', 'class', 'image')])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level('img', {'src', 'alt'})
        startpage_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_rule)

        # Pagination links.
        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([('div', 'class',
                                                       'pager')])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_pages_rule)

        # Gallery pictures; the '/tn_' prefix is dropped from the src.
        picture_rule = ParserRule()
        picture_rule.add_activate_rule_level([('div', 'class',
                                               'block center')])
        picture_rule.add_process_rule_level('a', set())
        picture_rule.add_process_rule_level('img', {'src'})
        picture_rule.set_attribute_modifier_function(
            'src', lambda text: text.replace('/tn_', '/'))
        parser.add_rule(picture_rule)

        # Tag links of the gallery page.
        picture_href_rule = ParserRule()
        picture_href_rule.add_activate_rule_level([('div', 'class',
                                                    'list tags')])
        picture_href_rule.add_process_rule_level('a', {'href'})
        picture_href_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        parser.add_rule(picture_href_rule)

        # The context manager closes the file even if feed() raises.
        with open(fname) as page:
            for line in page:
                parser.feed(line)

        result = ParseResult()

        thumbs = startpage_rule.get_result()
        if thumbs:
            result.set_type('hrefs')
            for item in thumbs:
                # Thumbnail hrefs get a '*' suffix.
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']),
                              href=URL(item['href'] + '*'),
                              popup=item.get('alt', '')))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

        if picture_rule.get_result():
            result.set_type('pictures')
            # Pictures are named 001.jpg, 002.jpg, ... in result order.
            for number, picture in enumerate(picture_rule.get_result(['src']),
                                             start=1):
                result.add_full(
                    FullPictureInfo(abs_href=URL(picture['src']),
                                    rel_name='%03d.jpg' % number))

            for tag in picture_href_rule.get_result():
                result.add_control(ControlInfo(tag['data'], URL(tag['href'])))

        return result