示例#1
0
    def parse_picture_page(self, fname, base_url=URL()):
        """Parse a saved picture page and build a ParseResult from it.

        Args:
            fname: Path of the saved HTML file; it is fed to the parser line
                by line.
            base_url: URL of the page. ``base_url.get() + '/'`` is prefixed to
                every ``href`` collected by the picture rule.

        Returns:
            A ParseResult. If the redirect rule collected anything, the type
            is ``'redirect'`` and the redirect target is the first collected
            ``href``. Otherwise, if the picture rule collected anything, the
            type is ``'pictures'`` and one FullPictureInfo is added per
            result, named ``001.jpg``, ``002.jpg``, ...
        """
        print('Parsing:', base_url.get(), base_url.domain())

        parser = SiteParser()

        # <base href=...> under <head> is reported as a redirect.
        redirect_rule = ParserRule()
        redirect_rule.add_activate_rule_level([('head', '', '')])
        redirect_rule.add_process_rule_level('base', {'href'})
        parser.add_rule(redirect_rule)

        picture_rule = ParserRule()
        picture_rule.add_activate_rule_level([('div', 'class', 'girls')])
        picture_rule.add_process_rule_level('a', {'href'})
        picture_rule.set_attribute_modifier_function('href', lambda text: base_url.get() + '/' + text)
        parser.add_rule(picture_rule)

        # Context manager guarantees the file is closed even if feed() raises.
        with open(fname) as page:
            for line in page:
                parser.feed(line)

        result = ParseResult()

        redirects = redirect_rule.get_result()
        if len(redirects) > 0:
            target = redirects[0]['href']
            print('Redirecting', target)
            result.set_type('redirect')
            result.set_redirect(URL(target))
            return result

        if len(picture_rule.get_result()) > 0:
            result.set_type('pictures')
            for index, f in enumerate(picture_rule.get_result(['href']), start=1):
                result.add_full(FullPictureInfo(abs_href=URL(f['href']), rel_name='%03d.jpg' % index))

        return result
示例#2
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse a saved index page and collect its thumbnails.

        Args:
            fname: Path of the saved HTML file; it is fed to the parser line
                by line.
            base_url: URL of the page; only used in the diagnostic message
                printed when nothing was collected.

        Returns:
            A ParseResult. When the start-page rule collected anything, the
            type is ``'hrefs'`` and one ThumbInfo is added per result;
            otherwise the result is returned untouched after printing a
            message.
        """
        parser = SiteParser()
        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([('div', 'class', 'thumbs'),
                                                ('div', 'class', 'movie_thumbs')])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level('img', {'src', 'alt'})
        startpage_rule.set_attribute_modifier_function('href', self.get_href)
        parser.add_rule(startpage_rule)

        # Context manager guarantees the file is closed even if feed() raises.
        with open(fname) as page:
            for line in page:
                parser.feed(line)

        result = ParseResult()

        if len(startpage_rule.get_result()) > 0:
            result.set_type('hrefs')
            for item in startpage_rule.get_result():
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']), href=URL(item['href']), popup=item.get('alt', '')))
        else:
            print(base_url.get(), ' not parsed by BEmultiThumbSite. Add rule.')

        return result
示例#3
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse a saved page as either a gallery or a thumbnail index.

        Args:
            fname: Path of the saved HTML file; it is fed to the parser line
                by line.
            base_url: URL of the page; ``base_url.domain()`` is prefixed to
                the pagination ``href`` values.

        Returns:
            A ParseResult. If the picture rule collected anything, the type is
            ``'pictures'`` and the result is returned immediately with one
            FullPictureInfo per image (named after the last path segment of
            its ``src``). Otherwise, if the start-page rule collected
            anything, the type is ``'hrefs'`` with thumbnails and page links.
        """
        parser = SiteParser()

        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([('div', 'class', 'thumbs200'),
                                                ('div', 'class', 'thumbs300')])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level('img', {'src', 'alt'})
        startpage_rule.set_attribute_modifier_function('href', self.get_href)
        parser.add_rule(startpage_rule)

        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([('div', 'class', 'menu')])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function(
            'href', lambda x: base_url.domain() + x)
        # The filter callback returns True only for hrefs containing '/st/'.
        startpage_pages_rule.set_attribute_filter_function(
            'href', lambda txt: '/st/' in txt)
        parser.add_rule(startpage_pages_rule)

        picture_rule = ParserRule()
        picture_rule.add_activate_rule_level([('div', 'class',
                                               'gallery-thumbs')])
        picture_rule.add_process_rule_level('a', set())
        picture_rule.add_process_rule_level('img', {'src'})
        picture_rule.set_attribute_modifier_function(
            'src', self.process_picture_address)
        parser.add_rule(picture_rule)

        # Context manager guarantees the file is closed even if feed() raises.
        with open(fname) as page:
            for line in page:
                parser.feed(line)

        result = ParseResult()

        # A gallery page takes precedence over the index interpretation.
        if len(picture_rule.get_result()) > 0:
            result.set_type('pictures')
            for f in picture_rule.get_result():
                result.add_full(
                    FullPictureInfo(abs_href=URL(f['src']),
                                    rel_name=f['src'].rpartition('/')[2]))
            return result

        if len(startpage_rule.get_result()) > 0:
            result.set_type('hrefs')
            for item in startpage_rule.get_result():
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']),
                              href=URL(item['href']),
                              popup=item.get('alt', '')))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

        return result
示例#4
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse a saved page into thumbnails, page links and/or pictures.

        When ``self.is_pictures_page(base_url)`` is true the work is delegated
        to ``self.parse_picture_page`` and its result is returned as is.

        Args:
            fname: Path of the saved HTML file; it is fed to the parser line
                by line.
            base_url: URL of the page; ``base_url.domain()`` is prefixed to
                pagination links and to site-relative control links.

        Returns:
            A ParseResult. The type is set to ``'hrefs'`` when the start-page
            rule collected anything and then to ``'pictures'`` when the
            picture rule collected anything; the two branches are not
            exclusive, so the later one determines the final type.
        """
        if self.is_pictures_page(base_url):
            return self.parse_picture_page(fname, base_url)

        parser = SiteParser()
        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([('div', 'class', 'one')])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level('img', {'src'})
        startpage_rule.set_attribute_modifier_function('href', lambda x: get_href(x))
        parser.add_rule(startpage_rule)

        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([('div', 'id', 'pager')])
        startpage_pages_rule.add_activate_rule_level([('div', 'id', 'pc')])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function('href', lambda x: base_url.domain() + x)
        parser.add_rule(startpage_pages_rule)

        picture_rule = ParserRule()
        picture_rule.add_activate_rule_level([('div', 'id', 'cc')])
        picture_rule.add_process_rule_level('a', set())
        picture_rule.add_process_rule_level('img', {'src', 'title'})
        # Every 't_' occurrence is dropped from the collected src value.
        picture_rule.set_attribute_modifier_function('src', lambda text: text.replace('t_', ''))
        parser.add_rule(picture_rule)

        picture_href_rule = ParserRule()
        picture_href_rule.add_activate_rule_level([('div', 'id', 'cc')])
        picture_href_rule.add_activate_rule_level([('div', 'class', 'shorttext')])
        picture_href_rule.add_process_rule_level('a', {'href', 'alt'})
        parser.add_rule(picture_href_rule)

        # Context manager guarantees the file is closed even if feed() raises.
        with open(fname) as page:
            for line in page:
                parser.feed(line)

        result = ParseResult()

        if len(startpage_rule.get_result()) > 0:
            result.set_type('hrefs')
            for item in startpage_rule.get_result(['href', 'src']):
                result.add_thumb(ThumbInfo(thumb_url=URL(item['src']), href=URL(item['href'] + '*')))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'] + '*')))

        if len(picture_rule.get_result()) > 0:
            result.set_type('pictures')
            for index, f in enumerate(picture_rule.get_result(['src', 'title']), start=1):
                result.add_full(FullPictureInfo(abs_href=URL(f['src']), rel_name='%03d.jpg' % index))

            # Only site-relative links (starting with '/') become controls.
            for f in picture_href_rule.get_result():
                if f['href'].startswith('/'):
                    result.add_control(ControlInfo(text=f['alt'], url=URL(base_url.domain() + f['href'])))

        return result
示例#5
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse a saved page into thumbnails, page links and/or pictures.

        Args:
            fname: Path of the saved HTML file; it is read as UTF-8 and fed to
                the parser line by line.
            base_url: URL of the page; passed to ``self.get_href`` together
                with each collected ``href``.

        Returns:
            A ParseResult. The type is set to ``'hrefs'`` when the start-page
            rule collected anything, and then to ``'pictures'`` when the
            picture trigger rule collected anything. In the picture case only
            picture-rule results whose ``class`` is ``'thumbsmall'`` are
            added, numbered ``001.jpg``, ``002.jpg``, ...
        """
        parser = SiteParser()

        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([('div', 'class', 'flower')])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level('img', {'src', 'alt'})
        startpage_rule.set_attribute_modifier_function('href', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_rule)

        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([('div', 'class', 'morepartners')])
        startpage_pages_rule.add_process_rule_level('a', {'href', 'alt'})
        startpage_pages_rule.set_attribute_modifier_function('href', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_pages_rule)

        # Used only to detect that the page is a gallery; its results are
        # never read beyond the emptiness check below.
        picture_trigger_rule = ParserRule()
        picture_trigger_rule.add_activate_rule_level([('a', 'class', 'thumbsmall')])
        picture_trigger_rule.add_process_rule_level('img', {'src'})
        parser.add_rule(picture_trigger_rule)

        picture_rule = ParserRule()
        picture_rule.add_activate_rule_level([('div', 'class', 'flower')])
        picture_rule.add_process_rule_level('a', {'class'})
        picture_rule.add_process_rule_level('img', {'src'})
        picture_rule.set_attribute_modifier_function('src', lambda text: text.replace('/tn_', '/'))
        parser.add_rule(picture_rule)

        # Context manager guarantees the file is closed even if feed() raises.
        with open(fname, encoding='utf-8') as page:
            for line in page:
                parser.feed(line)

        result = ParseResult()

        if len(startpage_rule.get_result()) > 0:
            result.set_type('hrefs')
            for item in startpage_rule.get_result():
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']), href=URL(item['href']), popup=item.get('alt', '')))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

        if len(picture_trigger_rule.get_result()) > 0:
            result.set_type('pictures')
            # Number only the accepted entries so file names stay contiguous.
            accepted = (f for f in picture_rule.get_result(['src', 'class'])
                        if f['class'] == 'thumbsmall')
            for index, f in enumerate(accepted, start=1):
                result.add_full(FullPictureInfo(abs_href=URL(f['src']), rel_name='%03d.jpg' % index))

        return result
示例#6
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse a saved page as either a gallery or a post index.

        Args:
            fname: Path of the saved HTML file; it is fed to the parser line
                by line.
            base_url: Accepted for interface compatibility; not used here.

        Returns:
            A ParseResult. If the picture rule collected anything, the type is
            ``'pictures'`` and the result is returned immediately with images
            numbered ``001.jpg``, ``002.jpg``, ... Otherwise, if the
            start-page rule collected anything, the type is ``'hrefs'`` with
            thumbnails, page links and sidebar controls.
        """
        parser = SiteParser()
        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([('div', 'class', 'post')])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level('img', {'src', 'alt'})
        parser.add_rule(startpage_rule)

        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([('div', 'class', 'paginator')])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        parser.add_rule(startpage_pages_rule)

        href_rule = ParserRule()
        href_rule.add_activate_rule_level([('div', 'class', 'sidebar')])
        href_rule.add_process_rule_level('li', {'class'})
        href_rule.add_process_rule_level('a', {'href', 'title'})
        # Strip the fixed prefix so only the remainder of the title is kept.
        href_rule.set_attribute_modifier_function('title', lambda text: text.replace('View all posts filed under ', ''))
        parser.add_rule(href_rule)

        picture_rule = ParserRule()
        picture_rule.add_activate_rule_level([('dl', 'class', 'gallery-item')])
        picture_rule.add_process_rule_level('a', set())
        picture_rule.add_process_rule_level('img', {'src'})
        picture_rule.set_attribute_modifier_function('src', lambda text: text.replace('-180x240', ''))
        parser.add_rule(picture_rule)

        # Context manager guarantees the file is closed even if feed() raises.
        with open(fname) as page:
            for line in page:
                parser.feed(line)

        result = ParseResult()

        # A gallery page takes precedence over the index interpretation.
        if len(picture_rule.get_result()) > 0:
            result.set_type('pictures')
            for index, f in enumerate(picture_rule.get_result(['src']), start=1):
                result.add_full(FullPictureInfo(abs_href=URL(f['src']), rel_name='%03d.jpg' % index))
            return result

        if len(startpage_rule.get_result()) > 0:
            result.set_type('hrefs')
            for item in startpage_rule.get_result():
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']), href=URL(item['href']), popup=item.get('alt', '')))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

            for item in href_rule.get_result(['class', 'href', 'title']):
                result.add_control(ControlInfo(item['title'], URL(item['href'])))

        return result
示例#7
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse a saved page as either a gallery or a thumbnail index.

        Args:
            fname: Path of the saved HTML file; it is fed to the parser line
                by line.
            base_url: Accepted for interface compatibility; not used here
                (links are prefixed with the literal 'http://hotxpix.net').

        Returns:
            A ParseResult. If the picture trigger rule collected anything, the
            type is ``'pictures'`` and the result is returned immediately;
            only picture-rule results whose ``href`` ends with ``'.jpg'`` are
            added, numbered ``001.jpg``, ``002.jpg``, ... Otherwise, if the
            start-page rule collected anything, the type is ``'hrefs'`` with
            thumbnails and page links.
        """
        parser = SiteParser()
        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([('div', 'class', 'grid_1')])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level('img', {'src', 'alt'})
        startpage_rule.set_attribute_modifier_function('href', lambda x: 'http://hotxpix.net' + get_href(x))
        parser.add_rule(startpage_rule)

        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([('div', 'class', 'pagination')])
        startpage_pages_rule.add_process_rule_level('a', {'href', 'title'})
        startpage_pages_rule.set_attribute_modifier_function('href', lambda x: 'http://hotxpix.net' + x)
        parser.add_rule(startpage_pages_rule)

        # Used only to detect that the page is a gallery.
        picture_trigger_rule = ParserRule()
        picture_trigger_rule.add_activate_rule_level([('div', 'class', 'grid_4 gal dbg')])
        picture_trigger_rule.add_process_rule_level('a', {'href'})
        parser.add_rule(picture_trigger_rule)

        picture_rule = ParserRule()
        picture_rule.add_activate_rule_level([('div', 'class', 'grid_4 gal dbg')])
        picture_rule.add_process_rule_level('a', {'href'})
        picture_rule.add_process_rule_level('img', {'src'})
        parser.add_rule(picture_rule)

        # Context manager guarantees the file is closed even if feed() raises.
        with open(fname) as page:
            for line in page:
                parser.feed(line)

        result = ParseResult()

        if len(picture_trigger_rule.get_result()) > 0:
            result.set_type('pictures')
            # Number only the accepted entries so file names stay contiguous.
            jpeg_links = (f for f in picture_rule.get_result(['href', 'src'])
                          if f['href'].endswith('.jpg'))
            for index, f in enumerate(jpeg_links, start=1):
                result.add_full(FullPictureInfo(abs_href=URL(f['href']), rel_name='%03d.jpg' % index))

            return result

        if len(startpage_rule.get_result()) > 0:
            result.set_type('hrefs')
            for item in startpage_rule.get_result():
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']), href=URL(item['href']), popup=item.get('alt', '')))

            for item in startpage_pages_rule.get_result(['href', 'data', 'title']):
                result.add_page(ControlInfo(item['title'], URL(item['href'])))

        return result
示例#8
0
文件: xxp_model.py 项目: vit-001/fget
    def parse_index(self, request, url=URL()):
        """Return the ``src`` of the first picture found in a streamed page.

        Each chunk yielded by ``request`` is decoded as UTF-8 (undecodable
        bytes are ignored) and fed to the parser. Collected ``src`` values
        are prefixed with ``url.domain() + '/'``.

        No emptiness check is made before taking the first result.
        """
        def with_domain(relative):
            # Prefix the collected src with the domain of the requested URL.
            return url.domain() + '/' + relative

        page_parser = SiteParser()

        image_rule = ParserRule()
        image_rule.add_activate_rule_level([('td', 'height', '500')])
        image_rule.add_process_rule_level('a', set())
        image_rule.add_process_rule_level('img', {'src'})
        image_rule.set_attribute_modifier_function('src', with_domain)
        page_parser.add_rule(image_rule)

        for chunk in request:
            text = chunk.decode(encoding="utf-8", errors="ignore")
            page_parser.feed(text)

        first_match = image_rule.get_result()[0]
        return first_match['src']
示例#9
0
文件: el_model.py 项目: vit-001/fget
    def parse_index(self, request, url):
        """Return the ``src`` of the first picture found in a streamed page.

        Each chunk yielded by ``request`` is decoded as UTF-8 (strictly) and
        fed to the parser. ``url`` is accepted but not used.

        No emptiness check is made before taking the first result.
        """
        page_parser = SiteParser()

        image_rule = ParserRule()
        image_rule.add_activate_rule_level([('center', '', '')])
        image_rule.add_process_rule_level('a', set())
        image_rule.add_process_rule_level('img', {'src'})
        page_parser.add_rule(image_rule)

        for chunk in request:
            text = chunk.decode('utf-8')
            page_parser.feed(text)

        first_match = image_rule.get_result()[0]
        return first_match['src']
示例#10
0
文件: fp_model.py 项目: vit-001/fget
    def parse_index_file(self, fname, base_url=URL()):
        """Parse a saved page into thumbnails, controls, pages and/or pictures.

        Args:
            fname: Path of the saved HTML file; it is fed to the parser line
                by line.
            base_url: URL of the page; ``base_url.domain()`` is used to build
                absolute links.

        Returns:
            A ParseResult. The type is set to ``'hrefs'`` when the start-page
            rule collected anything and then to ``'pictures'`` when the
            picture rule collected anything; the two branches are not
            exclusive, so the later one determines the final type.
        """
        print(base_url.get(), base_url.domain())
        parser = SiteParser()

        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([('div', 'class', 'ogpost'),
                                                ('div', 'class', 'post300'),
                                                ('div', 'class', 'galelement')
                                                ])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level('img', {'src'})
        startpage_rule.set_attribute_modifier_function(
            'href', lambda x: get_href(x, base_url.domain()))
        parser.add_rule(startpage_rule)

        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([
            ('span', 'class', 'pager'), ('div', 'class', 'pager')
        ])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function(
            'href', lambda x: base_url.domain() + x)
        parser.add_rule(startpage_pages_rule)

        startpage_href_rule = ParserRule()
        startpage_href_rule.add_activate_rule_level([('div', 'id', 'right')])
        startpage_href_rule.add_activate_rule_level([('div', 'class',
                                                      'rightbox')])
        startpage_href_rule.add_process_rule_level('a', {'href'})
        parser.add_rule(startpage_href_rule)

        picture_rule = ParserRule()
        picture_rule.add_activate_rule_level([('div', 'class',
                                               'galcontentpics')])
        picture_rule.add_process_rule_level('a', set())
        picture_rule.add_process_rule_level('img', {'src'})
        picture_rule.set_attribute_modifier_function(
            'src', lambda text: _del_thumb(text))
        parser.add_rule(picture_rule)

        # Context manager guarantees the file is closed even if feed() raises.
        with open(fname) as page:
            for line in page:
                parser.feed(line)

        result = ParseResult()

        if len(startpage_rule.get_result()) > 0:
            result.set_type('hrefs')
            for item in startpage_rule.get_result(['href', 'src']):
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']),
                              href=URL(item['href'] + '*'),
                              popup=item.get('alt', '')))

            # Only site-relative links (starting with '/') become controls.
            for item in startpage_href_rule.get_result(['href', 'data']):
                if item['href'].startswith('/'):
                    result.add_control(
                        ControlInfo(
                            item['data'],
                            URL(base_url.domain() + item['href'] + '*')))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(
                    ControlInfo(item['data'], URL(item['href'] + '*')))

        if len(picture_rule.get_result()) > 0:
            result.set_type('pictures')
            for index, f in enumerate(picture_rule.get_result(['src']),
                                      start=1):
                result.add_full(
                    FullPictureInfo(abs_href=URL(f['src']),
                                    rel_name='%03d.jpg' % index))

        return result
示例#11
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse a saved page as either a gallery or a thumbnail index.

        Args:
            fname: Path of the saved HTML file; it is fed to the parser line
                by line.
            base_url: URL of the page; the network-location part of
                ``base_url.get()`` is used to build ``site_url``, which is
                prefixed to several collected ``href`` values.

        Returns:
            A ParseResult. If the picture trigger rule collected anything, the
            type is ``'pictures'`` and the result is returned immediately;
            only picture-rule results whose ``class`` is ``'fancybox'`` are
            added, numbered ``001.jpg``, ``002.jpg``, ... Otherwise, if the
            start-page rule collected anything, the type is ``'hrefs'`` with
            thumbnails (``class == 'thumb'`` only), site links and page links.
        """
        site_url = 'http://' + urlparse(base_url.get())[1].strip('/')
        print('site url=', site_url)

        parser = SiteParser()

        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([('div', 'class',
                                                 'bodycontainer')])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level('img', {'src', 'alt', 'class'})
        startpage_rule.set_attribute_modifier_function(
            'href', lambda x: site_url + get_href(x))
        parser.add_rule(startpage_rule)

        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([('td', 'align', 'right')
                                                      ])
        startpage_pages_rule.add_process_rule_level('a', {'href', 'title'})
        startpage_pages_rule.set_attribute_modifier_function(
            'href', lambda x: site_url + x)
        parser.add_rule(startpage_pages_rule)

        site_rule = ParserRule()
        site_rule.add_activate_rule_level([('div', 'class', 'headerlinetext')])
        site_rule.add_process_rule_level('a', {'href'})
        parser.add_rule(site_rule)

        # Used only to detect that the page is a gallery.
        picture_trigger_rule = ParserRule()
        picture_trigger_rule.add_activate_rule_level([('a', 'class',
                                                       'fancybox')])
        picture_trigger_rule.add_process_rule_level('img', {'src'})
        parser.add_rule(picture_trigger_rule)

        picture_rule = ParserRule()
        picture_rule.add_activate_rule_level([('div', 'class', 'bodycontainer')
                                              ])
        picture_rule.add_process_rule_level('a', {'href', 'class'})
        picture_rule.add_process_rule_level('img', {'alt'})
        parser.add_rule(picture_rule)

        picture_href_rule = ParserRule()
        picture_href_rule.add_activate_rule_level([('div', 'class',
                                                    'bodycontainer')])
        picture_href_rule.add_activate_rule_level([('h2', 'style',
                                                    'font-size:18px')])
        picture_href_rule.add_process_rule_level('a', {'href'})
        picture_href_rule.set_attribute_modifier_function(
            'href', lambda x: site_url + x)
        parser.add_rule(picture_href_rule)

        # Context manager guarantees the file is closed even if feed() raises.
        with open(fname) as page:
            for line in page:
                parser.feed(line)

        result = ParseResult()

        if len(picture_trigger_rule.get_result()) > 0:
            result.set_type('pictures')
            # Number only the accepted entries so file names stay contiguous.
            full_links = (
                f for f in picture_rule.get_result(['href', 'alt', 'class'])
                if f['class'] == 'fancybox')
            for index, f in enumerate(full_links, start=1):
                result.add_full(
                    FullPictureInfo(abs_href=URL(f['href']),
                                    rel_name='%03d.jpg' % index))

            for item in picture_href_rule.get_result(['href', 'data']):
                result.add_control(
                    ControlInfo(text=item['data'], url=URL(item['href'])))

            return result

        if len(startpage_rule.get_result()) > 0:
            result.set_type('hrefs')
            for item in startpage_rule.get_result(['href', 'src', 'class']):
                if item['class'] == 'thumb':
                    result.add_thumb(
                        ThumbInfo(thumb_url=URL(item['src']),
                                  href=URL(item['href']),
                                  popup=item.get('alt', '')))

            for item in site_rule.get_result(['href', 'data']):
                result.add_site(ControlInfo(item['data'], URL(item['href'])))

            for item in startpage_pages_rule.get_result(
                ['href', 'data', 'title']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

        return result
示例#12
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse a saved page into thumbnails, page links and/or pictures.

        Args:
            fname: Path of the saved HTML file; it is fed to the parser line
                by line.
            base_url: URL of the page; passed to ``self.get_href`` and used
                (via ``domain()``) to build pagination links.

        Returns:
            A ParseResult. The type is set to ``'hrefs'`` when the start-page
            rule collected anything and then to ``'pictures'`` when the
            picture rule collected anything; the two branches are not
            exclusive, so the later one determines the final type.
        """
        print(base_url.domain())
        parser = SiteParser()
        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([('div', 'class', 'ownpost')])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level('img', {'src', 'alt'})
        startpage_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        parser.add_rule(startpage_rule)

        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([('div', 'class', 'nav')])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        # The trailing '*' is appended here, so it is not added again below.
        startpage_pages_rule.set_attribute_modifier_function(
            'href', lambda x: base_url.domain() + '/' + x + '*')
        parser.add_rule(startpage_pages_rule)

        picture_rule = ParserRule()
        picture_rule.add_activate_rule_level([('div', 'class', 'gallery')])
        picture_rule.add_process_rule_level('a', set())
        picture_rule.add_process_rule_level('img', {'src', 'class'})
        picture_rule.set_attribute_modifier_function(
            'src', lambda text: text.replace('/tn_', '/'))
        parser.add_rule(picture_rule)

        picture_href_rule = ParserRule()
        picture_href_rule.add_activate_rule_level([('div', 'class', 'gallery')
                                                   ])
        picture_href_rule.add_process_rule_level('a', {'href'})
        picture_href_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x, base_url))
        # The filter callback returns True only for hrefs containing
        # '/?category='.
        picture_href_rule.set_attribute_filter_function(
            'href', lambda x: x.find('/?category=') != -1)
        parser.add_rule(picture_href_rule)

        # Context manager guarantees the file is closed even if feed() raises.
        with open(fname) as page:
            for line in page:
                parser.feed(line)

        result = ParseResult()

        if len(startpage_rule.get_result()) > 0:
            result.set_type('hrefs')
            for item in startpage_rule.get_result():
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']),
                              href=URL(item['href'] + '*'),
                              popup=item.get('alt', '')))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

        if len(picture_rule.get_result()) > 0:
            result.set_type('pictures')
            for index, f in enumerate(
                    picture_rule.get_result(['src', 'class']), start=1):
                result.add_full(
                    FullPictureInfo(abs_href=URL(f['src']),
                                    rel_name='%03d.jpg' % index))

            for f in picture_href_rule.get_result():
                # Commas are removed from the link text used as the label.
                result.add_control(
                    ControlInfo(f['data'].replace(',', ''), URL(f['href'])))

        return result
示例#13
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse a saved page into thumbnails, links and/or gallery pictures.

        Args:
            fname: Path of the saved HTML file; it is fed to the parser line
                by line.
            base_url: URL of the page; ``base_url.domain()`` is used to build
                absolute links, and ``base_url.contain('?go=gal&id=')``
                decides whether the page is handled as a gallery.

        Returns:
            A ParseResult. The type is set to ``'hrefs'`` when the start-page
            rule collected anything. Independently, when ``base_url`` contains
            ``'?go=gal&id='`` the type is set to ``'pictures'``, the gallery
            path is set, and matching images are added with absolute names
            ``<dirname>001.jpg``, ``<dirname>002.jpg``, ...
        """
        print(base_url.get(), base_url.domain())
        parser = SiteParser()

        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([('div', 'class', 'lady')])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level('img', {'src', 'alt'})
        startpage_rule.set_attribute_modifier_function(
            'href', lambda x: base_url.domain() + '/' + x)
        parser.add_rule(startpage_rule)

        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([('div', 'class',
                                                       'nav_link')])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function(
            'href', lambda x: base_url.domain() + '/' + x)
        parser.add_rule(startpage_pages_rule)

        startpage_nav_rule = ParserRule()
        startpage_nav_rule.add_activate_rule_level([('td', 'class', 'nav')])
        startpage_nav_rule.add_process_rule_level('a', {'href'})
        parser.add_rule(startpage_nav_rule)

        picture_rule = ParserRule()
        picture_rule.add_activate_rule_level([('td', 'align', 'center')])
        picture_rule.add_process_rule_level('a', {'href'})
        picture_rule.add_process_rule_level('img', {'src'})
        picture_rule.set_attribute_modifier_function(
            'src', lambda text: _del_thumb(text))
        parser.add_rule(picture_rule)

        # Context manager guarantees the file is closed even if feed() raises.
        with open(fname) as page:
            for line in page:
                parser.feed(line)

        result = ParseResult()

        if len(startpage_rule.get_result()) > 0:
            result.set_type('hrefs')
            for item in startpage_rule.get_result(['href', 'src']):
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']),
                              href=URL(item['href'] + '*'),
                              popup=item.get('alt', '')))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(
                    ControlInfo(item['data'], URL(item['href'] + '*')))

            # Only navigation links under this absolute prefix become controls.
            for item in startpage_nav_rule.get_result(['href', 'data']):
                if item['href'].startswith("http://www.deffki.su/"):
                    result.add_control(
                        ControlInfo(item['data'], URL(item['href'] + '*')))

        if base_url.contain('?go=gal&id='):
            result.set_type('pictures')
            # The gallery directory ends with whatever follows the last '='
            # in the URL.
            dirname = self.base_addr + base_url.get_path(
            ) + '/' + base_url.get().rpartition('=')[2] + '/'
            result.set_gallery_path(dirname)
            # Keep only preview links whose image address is absolute, and
            # number just those so file names stay contiguous.
            accepted = (f for f in picture_rule.get_result(['src', 'href'])
                        if f['href'].startswith('prv.php?id=')
                        and f['src'].startswith('http://'))
            for index, f in enumerate(accepted, start=1):
                result.add_full(
                    FullPictureInfo(abs_href=URL(f['src']),
                                    abs_name=dirname + '%03d.jpg' % index))

        return result
示例#14
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse a saved page into thumbnails, controls, pages and/or pictures.

        Args:
            fname: Path of the saved HTML file; it is fed to the parser line
                by line.
            base_url: Accepted for interface compatibility; not used here
                (archive links are prefixed with the literal
                'http://www.teenport.com').

        Returns:
            A ParseResult. The type is set to ``'hrefs'`` when the start-page
            rule collected anything and then to ``'pictures'`` when the
            picture rule collected anything; the two branches are not
            exclusive, so the later one determines the final type.
        """
        parser = SiteParser()
        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([
            ('div', 'class', 'thumbs_main'),
            ('div', 'class', 'content_box model_sub'),
            ('div', 'class', 'teen_girls_list'),
            ('div', 'class', 'gallery_box')
        ])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level('img', {'src', 'alt'})
        startpage_rule.set_attribute_modifier_function('href', get_href)
        parser.add_rule(startpage_rule)

        startpage_menu_rule = ParserRule()
        startpage_menu_rule.add_activate_rule_level([('div', 'class', 'head')])
        startpage_menu_rule.add_activate_rule_level([('ul', 'class', 'menu')])
        startpage_menu_rule.add_process_rule_level('a', {'href'})
        startpage_menu_rule.set_attribute_modifier_function(
            'href', lambda x: x)
        parser.add_rule(startpage_menu_rule)

        archive_pages_rule = ParserRule()
        archive_pages_rule.add_activate_rule_level([('div', 'class', 'head')])
        archive_pages_rule.add_activate_rule_level([('span', '', '')])
        archive_pages_rule.add_process_rule_level('a', {'href'})
        archive_pages_rule.set_attribute_modifier_function(
            'href', lambda x: 'http://www.teenport.com' + x)
        parser.add_rule(archive_pages_rule)

        model_href_rule = ParserRule()
        model_href_rule.add_activate_rule_level([
            ('div', 'class', 'model_desc model_niche_desc')
        ])
        model_href_rule.add_process_rule_level('a', {'href'})
        parser.add_rule(model_href_rule)

        picture_rule = ParserRule()
        picture_rule.add_activate_rule_level([
            ('div', 'class', 'thumb_box top_corners'),
            ('div', 'class', 'thumb_box bottom_corners')
        ])
        picture_rule.add_process_rule_level('a', set())
        picture_rule.add_process_rule_level('img', {'src'})
        # NOTE(review): this removes EVERY 't' from the src value, not just a
        # thumbnail marker -- confirm this is intended before relying on it.
        picture_rule.set_attribute_modifier_function(
            'src', lambda text: text.replace('t', ''))
        parser.add_rule(picture_rule)

        picture_href_rule = ParserRule()
        picture_href_rule.add_activate_rule_level([('div', 'class', 'title')])
        picture_href_rule.add_process_rule_level('a', {'href', 'title'})
        parser.add_rule(picture_href_rule)

        # Context manager guarantees the file is closed even if feed() raises.
        with open(fname) as page:
            for line in page:
                parser.feed(line)

        result = ParseResult()

        if len(startpage_rule.get_result()) > 0:
            result.set_type('hrefs')
            for item in startpage_rule.get_result():
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']),
                              href=URL(item['href']),
                              popup=item.get('alt', '')))

            for item in startpage_menu_rule.get_result(['href', 'data']):
                result.add_control(ControlInfo(item['data'],
                                               URL(item['href'])))

            for item in model_href_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

            for item in archive_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

        if len(picture_rule.get_result()) > 0:
            result.set_type('pictures')
            # NOTE(review): only rel_name is passed here (no abs_href), unlike
            # the sibling parsers -- verify against FullPictureInfo's contract.
            for f in picture_rule.get_result():
                result.add_full(FullPictureInfo(rel_name=f['src']))

            for f in picture_href_rule.get_result():
                result.add_control(ControlInfo(f['title'], URL(f['href'])))

        return result
示例#15
0
文件: dsb_model.py 项目: vit-001/fget
    def parse_index_file(self, fname, base_url=URL()):
        """Parse a saved HTML page into a ``ParseResult``.

        Args:
            fname: Path of the HTML file to read.
            base_url: URL of the page; handed to ``self.get_href`` together
                with every extracted ``href``/``src`` value.

        Returns:
            ParseResult: typed ``'pictures'`` when the gallery rule matched
            (full pictures plus gallery controls); otherwise typed
            ``'hrefs'`` when the thumbnail rule matched (thumbs, controls
            and pages).  If neither rule matched, the result is returned
            without a type being set.
        """
        def absolute(link):
            # Shared attribute modifier: every rule rewrites links the same way.
            return self.get_href(link, base_url)

        parser = SiteParser()

        href_rule = ParserRule()  # startpage & model's page
        href_rule.add_activate_rule_level([
            ('div', 'id', 'lst-galleries'),
            ('div', 'class', 'lblock'),
            ('div', 'class', 'modal_info_full'),
        ])
        href_rule.add_process_rule_level('a', {'href'})
        href_rule.add_process_rule_level('img', {'src', 'alt'})
        href_rule.set_attribute_modifier_function('href', absolute)
        href_rule.set_attribute_modifier_function('src', absolute)
        href_rule.set_attribute_filter_function('href', self.thumb_href_filter)
        href_rule.set_attribute_filter_function('src', self.thumb_src_filter)
        parser.add_rule(href_rule)

        href_page_rule = ParserRule()  # page number in model's page
        href_page_rule.add_activate_rule_level([
            ('div', 'class', 'pages'),
            ('div', 'class', 'cat'),
        ])
        href_page_rule.add_process_rule_level('a', {'href'})
        href_page_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(href_page_rule)

        model_litera_rule = ParserRule()
        model_litera_rule.add_activate_rule_level([
            ('div', 'class', 'babe_index'),
        ])
        model_litera_rule.add_process_rule_level('a', {'href'})
        model_litera_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(model_litera_rule)

        model_more_rule = ParserRule()
        model_more_rule.add_activate_rule_level([
            ('div', 'class', 'more'),
            ('div', 'id', 'MoreCont'),
        ])
        model_more_rule.add_process_rule_level('a', {'href'})
        model_more_rule.set_attribute_modifier_function('href', absolute)
        model_more_rule.set_attribute_filter_function(
            'href', self.thumb_href_filter)
        parser.add_rule(model_more_rule)

        picture_rule = ParserRule()  # gallery rule
        picture_rule.add_activate_rule_level([('div', 'class', 'lblock')])
        picture_rule.add_process_rule_level('a', {'href'})
        picture_rule.add_process_rule_level('img', {'alt'})
        picture_rule.set_attribute_modifier_function('href', absolute)
        # Only anchors that point straight at a .jpg count as gallery pictures.
        picture_rule.set_attribute_filter_function(
            'href', lambda x: x.endswith('.jpg'))
        parser.add_rule(picture_rule)

        picture_href_rule = ParserRule()  # gallery href's rule
        picture_href_rule.add_activate_rule_level([
            ('div', 'id', 'ModelMenu'),
            ('div', 'class', 'lblock'),
        ])
        picture_href_rule.add_process_rule_level('a', {'href', 'title'})
        picture_href_rule.set_attribute_modifier_function('href', absolute)
        picture_href_rule.set_attribute_filter_function(
            'href', self.gal_href_filter)
        parser.add_rule(picture_href_rule)

        # Use a context manager so the file handle is closed deterministically.
        with open(fname) as page:
            for line in page:
                parser.feed(line)

        result = ParseResult()

        # A gallery page takes precedence over an index page.
        pictures = picture_rule.get_result()
        if pictures:
            result.set_type('pictures')
            for number, picture in enumerate(pictures, start=1):
                result.add_full(
                    FullPictureInfo(abs_href=URL(picture['href']),
                                    rel_name='%03d.jpg' % number))

            for link in picture_href_rule.get_result(['href', 'title']):
                result.add_control(
                    ControlInfo(text=link['title'], url=URL(link['href'])))
            return result

        thumbs = href_rule.get_result()
        if thumbs:
            result.set_type('hrefs')
            for item in thumbs:
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']),
                              href=URL(item['href']),
                              popup=item.get('alt', '')))

            for item in model_more_rule.get_result(['href', 'data']):
                result.add_control(ControlInfo(item['data'],
                                               URL(item['href'])))
            for item in model_litera_rule.get_result(['href', 'data']):
                result.add_control(ControlInfo(item['data'],
                                               URL(item['href'])))
            for item in href_page_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

        return result
示例#16
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse a saved HTML page into a ``ParseResult``.

        Args:
            fname: Path of the HTML file to read (decoded as UTF-8, with
                undecodable bytes ignored).
            base_url: URL of the page; its ``domain()`` is prepended to tag
                links and it is passed to ``self.get_href`` for user links.

        Returns:
            ParseResult: typed ``'video'`` when a ``var flashvars`` script
            with a usable https URL was found (video plus controls);
            otherwise typed ``'hrefs'`` when thumbnails were found (thumbs
            and pages).  Returned without a type when nothing matched or when
            the video script holds no usable URL.
        """
        print('VP parsing')

        parser = SiteParser()
        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([
            ('div', 'class', 'bx'),
            ('div', 'class', 'bx lastrow'),
        ])
        startpage_rule.add_process_rule_level('a', {'href', 'class'})
        startpage_rule.add_process_rule_level('img', {'src', 'alt'})
        parser.add_rule(startpage_rule)

        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([
            ('div', 'class', 'pagerwrap'),
        ])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        parser.add_rule(startpage_pages_rule)

        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('div', 'class', 'video_panel')])
        video_rule.add_process_rule_level('script', {})
        video_rule.set_attribute_filter_function(
            'data', lambda text: 'var flashvars' in text)
        parser.add_rule(video_rule)

        gallery_href_rule = ParserRule()
        gallery_href_rule.add_activate_rule_level([
            ('div', 'class', 'tagas-secondrow'),
        ])
        gallery_href_rule.add_process_rule_level('a', {'href'})
        gallery_href_rule.set_attribute_modifier_function(
            'href', lambda x: base_url.domain() + x + '*')
        parser.add_rule(gallery_href_rule)

        gallery_user_rule = ParserRule()
        gallery_user_rule.add_activate_rule_level([('div', 'class', 'info')])
        gallery_user_rule.add_process_rule_level('a', {'href'})
        gallery_user_rule.set_attribute_modifier_function(
            'href', lambda x: self.get_href(x.replace('/user/', '/submitted/'),
                                            base_url))
        parser.add_rule(gallery_user_rule)

        # Use a context manager so the file handle is closed deterministically.
        with open(fname, encoding='utf-8', errors='ignore') as page:
            for line in page:
                # NOTE(review): presumably works around anchors that the page
                # closes with </b> instead of </a> -- confirm.
                parser.feed(line.replace('</b>', '</a>'))

        result = ParseResult()

        scripts = video_rule.get_result()
        if scripts:
            # Spaces are stripped so the 'flashvars.X="' marker matches
            # regardless of how the assignment is spaced in the page.
            script = scripts[0]['data'].replace(' ', '')

            def get_url_from_script(var):
                """Return the https URL assigned to flashvars.<var>, or None."""
                marker = 'flashvars.' + var + '="'
                data = script.partition(marker)[2].partition('"')[0]
                if data.startswith('https://'):
                    return URL(data)
                return None

            # (label shown to the user, flashvars variable name)
            variants = (
                ('Low', 'videoUrlLow'),
                ('Low2', 'videoUrlLow2'),
                ('Medium', 'videoUrlMedium'),
                ('Medium', 'videoUrlMedium2'),
                ('HD', 'videoUrlHD'),
                ('HD', 'videoUrlHD2'),
            )
            urls = {var: get_url_from_script(var) for _, var in variants}

            # Prefer the medium stream, fall back to the low one.
            default_url = urls['videoUrlMedium']
            if default_url is None:
                default_url = urls['videoUrlLow']
            if default_url is None:
                print('No url found')
                return result

            video = MediaData(default_url)
            for label, var in variants:
                if urls[var] is not None:
                    video.add_alternate(dict(text=label, url=urls[var]))

            result.set_type('video')
            result.set_video(video)

            for f in gallery_user_rule.get_result():
                result.add_control(
                    ControlInfo('"' + f['data'] + '"', URL(f['href'])))

            for f in gallery_href_rule.get_result(['data', 'href']):
                result.add_control(ControlInfo(f['data'], URL(f['href'])))
            return result

        thumbs = startpage_rule.get_result()
        if thumbs:
            result.set_type('hrefs')
            for item in thumbs:
                # .get() keeps a thumbnail without an alt attribute from
                # aborting the whole parse with a KeyError.
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']),
                              href=URL(item['href']),
                              popup=item.get('alt', '')))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

        return result
示例#17
0
    def parse_index_file(self, fname, base_url=URL()):
        """Collect the link targets found inside ``div.single_plug`` blocks.

        Args:
            fname: Path of the HTML file to read.
            base_url: URL of the page; only its ``domain()`` is printed.

        Returns:
            list: the ``'href'`` value of every item matched by the
            start-page rule, in the order the rule reports them.
        """
        print(base_url.domain())
        parser = SiteParser()
        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([
            ('div', 'class', 'single_plug'),
        ])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level('img', {'src', 'alt'})
        parser.add_rule(startpage_rule)

        # Use a context manager so the file handle is closed deterministically.
        with open(fname) as page:
            for line in page:
                parser.feed(line)

        return [item['href'] for item in startpage_rule.get_result()]
示例#18
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse a saved HTML page into a ``ParseResult``.

        Args:
            fname: Path of the HTML file to read.
            base_url: URL of the page; its ``domain()`` is prepended to the
                extracted links.

        Returns:
            ParseResult: typed ``'pictures'`` when the page head contains a
            ``<meta name="gallery-id">`` tag (the thumbnail links are then
            treated as full pictures); otherwise typed ``'hrefs'`` when
            thumbnails were found.  Returned without a type when nothing
            matched.
        """
        def with_domain(path):
            return base_url.domain() + path

        parser = SiteParser()
        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([
            ('ul', 'class', 'thumbs'),
            ('ul', 'class', 'thumbs break'),
        ])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level('img', {'src', 'alt'})
        startpage_rule.set_attribute_modifier_function('href', with_domain)
        parser.add_rule(startpage_rule)

        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([
            ('p', 'class', 'pagination'),
        ])
        startpage_pages_rule.add_process_rule_level('a', {'class', 'href'})
        startpage_pages_rule.set_attribute_modifier_function(
            'href', with_domain)
        parser.add_rule(startpage_pages_rule)

        # The presence of <meta name="gallery-id"> marks a gallery page.
        picture_trigger_rule = ParserRule()
        picture_trigger_rule.add_activate_rule_level([('head', '', '')])
        picture_trigger_rule.add_process_rule_level('meta', {'name'})
        parser.add_rule(picture_trigger_rule)

        picture_href_rule = ParserRule()
        picture_href_rule.add_activate_rule_level([('ul', 'class', 'options')])
        picture_href_rule.add_process_rule_level('a', {'href'})
        parser.add_rule(picture_href_rule)

        # Use a context manager so the file handle is closed deterministically.
        with open(fname) as page:
            for line in page:
                parser.feed(line)

        result = ParseResult()

        is_gallery = any(
            item['name'] == 'gallery-id'
            for item in picture_trigger_rule.get_result(['name']))

        if is_gallery:
            result.set_type('pictures')
            for number, picture in enumerate(startpage_rule.get_result(),
                                             start=1):
                result.add_full(
                    FullPictureInfo(abs_href=URL(picture['href']),
                                    rel_name='%03d.jpg' % number))

            for link in picture_href_rule.get_result():
                # Links that already start with http:// are skipped; the
                # rest get the domain prepended.
                if not link['href'].startswith('http://'):
                    result.add_control(
                        ControlInfo(link['data'],
                                    URL(base_url.domain() + link['href'])))
            return result

        thumbs = startpage_rule.get_result()
        if thumbs:
            result.set_type('hrefs')
            for item in thumbs:
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']),
                              href=URL(item['href']),
                              popup=item.get('alt', '')))

            for item in startpage_pages_rule.get_result(
                    ['href', 'data', 'class']):
                if item['class'] == 'page':
                    result.add_page(
                        ControlInfo(item['data'], URL(item['href'])))

        return result
示例#19
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse a saved HTML page into a ``ParseResult``.

        Args:
            fname: Path of the HTML file to read.
            base_url: URL of the page; its ``domain()`` is prepended to
                pagination links.

        Returns:
            ParseResult: typed ``'video'`` when a player script containing
            ``video_url:`` was found (returned immediately with the video and
            its controls).  Otherwise the thumbnail and picture rules are
            both evaluated: thumbnails set the type to ``'hrefs'`` and
            pictures, evaluated last, set it to ``'pictures'``.
        """
        parser = SiteParser()

        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([
            ('div', 'class', 'thumbs'),
            ('div', 'class', 'content_box domain2'),
            ('div', 'class', 'video_list'),
            ('div', 'class', 'video_list_models'),
            ('div', 'class', 'pics_list'),
            ('div', 'class', 'movie_thumbs'),
        ])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level('img', {'src', 'alt'})
        startpage_rule.set_attribute_modifier_function('href', self.get_href)
        parser.add_rule(startpage_rule)

        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([
            ('div', 'class', 'pages'),
            ('div', 'class', 'pg'),
        ])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function(
            'href', lambda x: base_url.domain() + x)
        parser.add_rule(startpage_pages_rule)

        startpage_hrefs_rule = ParserRule()
        startpage_hrefs_rule.add_activate_rule_level([
            ('li', 'class', 'orange dropdown'),
        ])
        startpage_hrefs_rule.add_process_rule_level('a', {'href'})
        startpage_hrefs_rule.set_attribute_filter_function(
            'href', lambda txt: '/st/' in txt)
        parser.add_rule(startpage_hrefs_rule)

        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('div', 'class', 'player')])
        video_rule.add_process_rule_level('script', {})
        video_rule.set_attribute_filter_function(
            'data', lambda text: 'video_url:' in text)
        parser.add_rule(video_rule)

        picture_rule = ParserRule()
        picture_rule.add_activate_rule_level([('div', 'class', 'thumb_box')])
        picture_rule.add_process_rule_level('a', set())
        picture_rule.add_process_rule_level('img', {'src'})
        # Rewrites every 't.jpg' in the thumbnail src to '.jpg'.
        picture_rule.set_attribute_modifier_function(
            'src', lambda text: text.replace('t.jpg', '.jpg'))
        parser.add_rule(picture_rule)

        picture_href_rule = ParserRule()
        picture_href_rule.add_activate_rule_level([
            ('div', 'class', 'crumbles'),
            ('div', 'class', 'tags'),
        ])
        picture_href_rule.add_process_rule_level('a', {'href', 'title'})
        parser.add_rule(picture_href_rule)

        # Use a context manager so the file handle is closed deterministically.
        with open(fname) as page:
            for line in page:
                parser.feed(line)

        result = ParseResult()

        if video_rule.is_result():
            script = video_rule.get_result()[0]['data']
            video = MediaData(URL(self.get_attr_from_script(script)))
            result.set_video(video)
            result.set_type('video')

            # On video pages the control label is the anchor text ('data').
            for link in picture_href_rule.get_result():
                result.add_control(ControlInfo(link['data'], URL(link['href'])))
            return result

        thumbs = startpage_rule.get_result()
        if thumbs:
            result.set_type('hrefs')
            for item in thumbs:
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']),
                              href=URL(item['href']),
                              popup=item.get('alt', '')))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

            for item in startpage_hrefs_rule.get_result(['href', 'data']):
                result.add_control(ControlInfo(item['data'], URL(item['href'])))

        pictures = picture_rule.get_result()
        if pictures:
            result.set_type('pictures')
            for picture in pictures:
                result.add_full(FullPictureInfo(rel_name=picture['src']))

            # On picture pages the control label is the anchor's title.
            for link in picture_href_rule.get_result():
                result.add_control(ControlInfo(link['title'], URL(link['href'])))

        return result
示例#20
0
文件: fk_model.py 项目: vit-001/fget
    def parse_index_file(self, fname, base_url=URL()):
        """Parse a saved HTML page into a ``ParseResult``.

        Args:
            fname: Path of the HTML file to read.
            base_url: URL of the page; passed to ``self.get_href`` for
                thumbnail links and its ``domain()`` is prepended to tag and
                pagination links.

        Returns:
            ParseResult: typed ``'pictures'`` when an ``unescape`` script was
            found inside ``div.imagelinks`` (full pictures plus tag
            controls); otherwise typed ``'hrefs'`` when thumbnails were found
            (thumbs, tag controls and pages).  Returned without a type when
            nothing matched.
        """
        def starred(link):
            # Thumbnail href/src values get a trailing '*' appended.
            return self.get_href(link, base_url) + '*'

        def with_domain(path):
            return base_url.domain() + path

        parser = SiteParser()
        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([
            ('div', 'class', 'thumblinks'),
        ])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level('img', {'src', 'alt'})
        startpage_rule.set_attribute_modifier_function('href', starred)
        startpage_rule.set_attribute_modifier_function('src', starred)
        parser.add_rule(startpage_rule)

        tags_rule = ParserRule()
        tags_rule.add_activate_rule_level([('div', 'id', 'divTags')])
        tags_rule.add_process_rule_level('a', {'href'})
        tags_rule.set_attribute_modifier_function('href', with_domain)
        parser.add_rule(tags_rule)

        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([
            ('td', 'class', 'pages'),
        ])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function(
            'href', with_domain)
        parser.add_rule(startpage_pages_rule)

        # Script that holds the (escaped) base address of the pictures.
        picture_base_addr_rule = ParserRule()
        picture_base_addr_rule.add_activate_rule_level([
            ('div', 'class', 'imagelinks'),
        ])
        picture_base_addr_rule.add_process_rule_level('script', {})
        picture_base_addr_rule.set_attribute_filter_function(
            'data', lambda x: 'unescape' in x)
        parser.add_rule(picture_base_addr_rule)

        # Scripts that each hold one picture file name.
        picture_rule = ParserRule()
        picture_rule.add_activate_rule_level([('div', 'class', 'imagelinks')])
        picture_rule.add_process_rule_level('script', {})
        picture_rule.set_attribute_filter_function(
            'data', lambda x: "'src'" in x)
        parser.add_rule(picture_rule)

        # Use a context manager so the file handle is closed deterministically.
        with open(fname) as page:
            for line in page:
                parser.feed(line)

        result = ParseResult()

        base_scripts = picture_base_addr_rule.get_result()
        if base_scripts:
            result.set_type('pictures')
            # Base address: the text between "unescape('//" and the next
            # quote, with '%2f' replaced by '/'.
            unescaped = base_scripts[0]['data'].replace('%2f', '/')
            base = unescaped.partition("unescape('//")[2].partition("'")[0]

            for number, script in enumerate(picture_rule.get_result(),
                                            start=1):
                # File name: the text between "+'" and the next quote.
                picname = script['data'].partition("+'")[2].partition("'")[0]
                result.add_full(
                    FullPictureInfo(abs_href=URL(base + picname + '*'),
                                    rel_name='%03d.jpg' % number))

            for item in tags_rule.get_result(['href', 'data']):
                result.add_control(ControlInfo(item['data'],
                                               URL(item['href'])))

            return result

        thumbs = startpage_rule.get_result()
        if thumbs:
            result.set_type('hrefs')
            for item in thumbs:
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']),
                              href=URL(item['href']),
                              popup=item.get('alt', '')))

            for item in tags_rule.get_result(['href', 'data']):
                result.add_control(ControlInfo(item['data'],
                                               URL(item['href'])))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

        return result
示例#21
0
文件: el_model.py 项目: vit-001/fget
    def parse_index_file(self, fname, base_url=URL()):
        """Parse a saved HTML page into a ``ParseResult``.

        Args:
            fname: Path of the HTML file to read (decoded as UTF-8).
            base_url: URL of the page; its ``domain()`` is prepended to
                pagination links.

        Returns:
            ParseResult: typed ``'video'`` when a script containing
            ``function getEmbed()`` was found (returned immediately with the
            video and tag controls).  Otherwise the thumbnail and picture
            rules are both evaluated: thumbnails set the type to ``'hrefs'``
            and pictures, evaluated last, set it to ``'pictures'`` and
            install an ``ELSitePictureCollector``.
        """
        parser = SiteParser()
        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([
            ('div', 'class', 'image-delete'),
            ('div', 'class', 'thumbs'),
            ('div', 'class', 'image'),
        ])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level('img', {'src', 'alt'})
        parser.add_rule(startpage_rule)

        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([
            ('div', 'class', 'pager'),
        ])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function(
            'href', lambda x: base_url.domain() + x)
        parser.add_rule(startpage_pages_rule)

        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('div', 'class', 'main')])
        video_rule.add_process_rule_level('script', {})
        video_rule.set_attribute_filter_function(
            'data', lambda text: 'function getEmbed()' in text)
        parser.add_rule(video_rule)

        picture_rule = ParserRule()
        picture_rule.add_activate_rule_level([('div', 'class', 'thumbs2')])
        picture_rule.add_process_rule_level('a', {'href'})
        picture_rule.add_process_rule_level('img', {'src'})
        parser.add_rule(picture_rule)

        picture_tags_rule = ParserRule()
        picture_tags_rule.add_activate_rule_level([('div', 'class', 'main')])
        picture_tags_rule.add_process_rule_level('a', {'href'})
        picture_tags_rule.set_attribute_filter_function(
            'href', lambda txt: '/categories/' in txt or '/model/' in txt)
        parser.add_rule(picture_tags_rule)

        # Use a context manager so the file handle is closed deterministically.
        with open(fname, encoding='utf-8') as page:
            for line in page:
                parser.feed(line)

        result = ParseResult()

        scripts = video_rule.get_result()
        if scripts:
            video_url = URL(self.get_attr_from_script(scripts[0]['data']))
            result.set_video(MediaData(video_url))
            result.set_type('video')

            for item in picture_tags_rule.get_result(['href', 'data']):
                result.add_control(ControlInfo(item['data'],
                                               URL(item['href'])))
            return result

        thumbs = startpage_rule.get_result()
        if thumbs:
            result.set_type('hrefs')
            for item in thumbs:
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']),
                              href=URL(item['href']),
                              popup=item.get('alt', '')))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

        pictures = picture_rule.get_result()
        if pictures:
            result.set_type('pictures')
            result.set_picture_collector(ELSitePictureCollector())

            for number, picture in enumerate(pictures, start=1):
                result.add_full(
                    FullPictureInfo(abs_href=URL(picture['href']),
                                    rel_name='%03d.jpg' % number))

            for item in picture_tags_rule.get_result(['href', 'data']):
                result.add_control(ControlInfo(item['data'],
                                               URL(item['href'])))

        return result
示例#22
0
文件: top_model.py 项目: vit-001/fget
    def parse_index_file(self, fname, base_url=URL()):
        """Parse a saved HTML page into a ``ParseResult``.

        Args:
            fname: Path of the HTML file to read (decoded as UTF-8).
            base_url: Not used by this parser; kept for interface
                compatibility.

        Returns:
            ParseResult: filled with whatever the rules matched: thumbs and
            pages from the start-page rules, thumbs from the sub-thumb list,
            and full pictures plus controls from the picture rules.
        """
        parser = SiteParser()
        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([
            ('div', 'class', 'thumbs'),
            ('div', 'class', 'movie_thumbs'),
        ])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level('img', {'src', 'alt'})
        startpage_rule.set_attribute_modifier_function('href', get_href)
        parser.add_rule(startpage_rule)

        # Two activation levels: div.pages nested inside div.head.
        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([
            ('div', 'class', 'head'),
        ])
        startpage_pages_rule.add_activate_rule_level([
            ('div', 'class', 'pages'),
        ])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function(
            'href', lambda x: 'http://www.tomorrowporn.com' + x)
        parser.add_rule(startpage_pages_rule)

        href_rule = ParserRule()
        href_rule.add_activate_rule_level([('ul', 'class', 'sub_thumb_list')])
        href_rule.add_process_rule_level('a', {'href'})
        href_rule.add_process_rule_level('img', {'src', 'alt'})
        href_rule.set_attribute_modifier_function('href', get_href)
        parser.add_rule(href_rule)

        picture_rule = ParserRule()
        picture_rule.add_activate_rule_level([
            ('div', 'class', 'thumb_box'),
            ('div', 'class', 'thumb_box bottom_corners'),
            ('div', 'class', 'thumb_box top_corners'),
        ])
        picture_rule.add_process_rule_level('a', set())
        picture_rule.add_process_rule_level('img', {'src'})
        # NOTE(review): this strips EVERY 't' from the src, not just a
        # thumbnail suffix -- confirm this is intended.
        picture_rule.set_attribute_modifier_function(
            'src', lambda text: text.replace('t', ''))
        parser.add_rule(picture_rule)

        picture_href_rule = ParserRule()
        picture_href_rule.add_activate_rule_level([('div', 'class', 'menus')])
        picture_href_rule.add_process_rule_level('h2', set())
        picture_href_rule.add_process_rule_level('a', {'href', 'title'})
        parser.add_rule(picture_href_rule)

        # Use a context manager so the file handle is closed deterministically.
        with open(fname, encoding='utf-8') as page:
            for line in page:
                parser.feed(line)

        result = ParseResult()

        # NOTE(review): result.set_type() is never called in this parser
        # (the calls were commented out in the original) -- confirm the
        # caller does not rely on the type.
        thumbs = startpage_rule.get_result()
        if thumbs:
            for item in thumbs:
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']),
                              href=URL(item['href']),
                              popup=item.get('alt', '')))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

        for item in href_rule.get_result():
            # Entries without an image cannot be shown as thumbnails.
            if 'src' in item:
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']),
                              href=URL(item['href']),
                              popup=item.get('alt', '')))

        pictures = picture_rule.get_result()
        if pictures:
            for picture in pictures:
                result.add_full(FullPictureInfo(rel_name=picture['src']))

            for link in picture_href_rule.get_result():
                result.add_control(
                    ControlInfo(link['title'], URL(link['href'])))

        return result
示例#23
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse a saved HTML page into a video, picture-gallery or index result.

        The file is fed line by line to a ``SiteParser`` carrying one
        ``ParserRule`` per page region.  The first matching case decides the
        result and returns:

        1. the page text (with spaces removed) contains ``urls.push({``:
           type ``'video'`` with the extracted file URL, an uploader control
           and tag controls;
        2. the gallery rule produced results: type ``'pictures'`` with one
           ``FullPictureInfo`` per image plus tag/user controls;
        3. the start-page rule produced results: type ``'hrefs'`` with
           thumbnails, page links and category controls.

        :param fname: path of the downloaded HTML file.
        :param base_url: URL the page was loaded from; handed to
            ``self.get_href`` when rewriting collected hrefs.
        :return: the populated ``ParseResult``; no type is set when none of
            the cases above applies.
        """
        def absolute(link):
            # Shared href modifier: every collected link goes through get_href.
            return self.get_href(link, base_url)

        parser = SiteParser()

        # Thumbnails (anchor + image) on listing pages.
        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([('div', 'class', 'clearfix'),
                                                ('div', 'class', 'row clearfix  video-container')])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level('img', {'src', 'alt'})
        startpage_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(startpage_rule)

        # Pagination block on listing pages.
        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level(
            [('div', 'class', 'btn-group clearfix full-width pagination-block')])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(startpage_pages_rule)

        # Sub-category navigation list.
        startpage_categories_rule = ParserRule()
        startpage_categories_rule.add_activate_rule_level([('ul', 'class', 'main-nav unstyled-list subCategories')])
        startpage_categories_rule.add_process_rule_level('a', {'href'})
        startpage_categories_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(startpage_categories_rule)

        # Category menu, filtered on hrefs containing '/free_porn/'.
        startpage_hrefs_rule = ParserRule()
        startpage_hrefs_rule.add_activate_rule_level([('div', 'class', 'cat-menu hidden-xs')])
        startpage_hrefs_rule.add_process_rule_level('a', {'href'})
        startpage_hrefs_rule.set_attribute_filter_function('href', lambda x: '/free_porn/' in x)
        startpage_hrefs_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(startpage_hrefs_rule)

        # Registered with the parser, but its results are not read below: the
        # video URL is taken from the raw page text instead.
        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('body', '', '')])
        video_rule.add_process_rule_level('script', {''})
        video_rule.set_attribute_filter_function('data', lambda text: 'var urls' in text)
        parser.add_rule(video_rule)

        # Gallery images; the thumbnail path is mapped to the full-size one.
        gallery_rule = ParserRule()
        gallery_rule.add_activate_rule_level([('div', 'id', 'galleryImages')])
        # NOTE(review): {} is an empty dict, not a set - sibling parsers pass
        # set() here; left unchanged because ParserRule is not visible.
        gallery_rule.add_process_rule_level('a', {})
        gallery_rule.add_process_rule_level('img', {'src'})
        gallery_rule.set_attribute_modifier_function('src', lambda txt: txt.replace('/thumbs/', '/'))
        parser.add_rule(gallery_rule)

        # Tag / user links shown next to a video or gallery.
        gallery_href_rule = ParserRule()
        gallery_href_rule.add_activate_rule_level([('div', 'class', 'video-player-list tag-list-block')])
        gallery_href_rule.add_process_rule_level('a', {'href', 'title'})
        gallery_href_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(gallery_href_rule)

        # Uploader link in the player info row (href is kept unmodified).
        gallery_user_rule = ParserRule()
        gallery_user_rule.add_activate_rule_level([('div', 'class', 'video-player-info row')])
        gallery_user_rule.add_process_rule_level('a', {'href'})
        parser.add_rule(gallery_user_rule)

        # Feed the parser and, in the same pass, collect the page text with
        # spaces stripped so the 'urls.push({' probe ignores formatting.
        # The context manager closes the file even if feed() raises.
        stripped_lines = []
        with open(fname, encoding='utf-8', errors='ignore') as page:
            for line in page:
                parser.feed(line)
                stripped_lines.append(line.replace(' ', ''))
        page_text = ''.join(stripped_lines)

        result = ParseResult()

        if 'urls.push({' in page_text:
            # Take the text between 'file:"' and the closing '"});' of the
            # first urls.push({...}) call.
            video_url = page_text.partition('urls.push({')[2].partition('"});')[0].partition('file:"')[2]
            video = MediaData(URL(video_url + '*'))

            result.set_type('video')
            result.set_video(video)

            if gallery_user_rule.is_result():
                uploader = gallery_user_rule.get_result()[0]
                user_name = uploader['data'].strip()
                # The user number is whatever follows the last '-' in the href.
                user_number = uploader['href'].rpartition('-')[2].rstrip('/')
                result.add_control(ControlInfo('"' + user_name + '"',
                                               URL('http://shockingmovies.com/uploads-by-user/' + user_number + '/')))

            for f in gallery_href_rule.get_result(['href']):
                label = f['data'].strip().strip(',')
                if label == '':
                    label = f['title']

                result.add_control(ControlInfo(label, URL(f['href'])))

            return result

        if gallery_rule.is_result():
            result.set_type('pictures')
            # The gallery directory is derived from the first image URL.
            url = URL(gallery_rule.get_result()[0]['src'] + '*')
            base_dir = url.get_path(base=Setting.base_dir)
            result.set_gallery_path(base_dir)
            for f in gallery_rule.get_result():
                picture = FullPictureInfo(abs_href=URL(f['src'] + '*'), rel_name=f['src'].rpartition('/')[2])
                picture.set_base(base_dir)
                result.add_full(picture)

            for f in gallery_href_rule.get_result(['href']):
                label = f['data'].strip()
                if label == '':
                    label = f['title']
                if '/user/' in f['href']:
                    # A user link yields two controls: that user's videos and
                    # that user's photo galleries.
                    split = f['href'].rpartition('-')
                    base = split[0].partition('/user/')[0]
                    result.add_control(ControlInfo(label + ' videos', URL(base + '/uploads-by-user/' + split[2])))
                    result.add_control(
                        ControlInfo(label + ' gals', URL(base + '/uploads-by-user/' + split[2] + '?photos=1')))
                else:
                    result.add_control(ControlInfo(label, URL(f['href'])))

            return result

        if startpage_rule.is_result():
            result.set_type('hrefs')

            for item in startpage_rule.get_result(['href']):
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']), href=URL(item['href']), popup=item.get('alt', '')))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                href = item['href']
                # Page label: text between the last '/page' and the last '.'.
                page_number = href.rpartition('/page')[2].rpartition('.')[0]
                result.add_page(ControlInfo(page_number, URL(href)))

            for item in startpage_categories_rule.get_result(['href', 'data']):
                result.add_control(ControlInfo(item.get('data', ''), URL(item['href'])))

            for item in startpage_hrefs_rule.get_result(['href', 'data']):
                result.add_control(ControlInfo(item.get('data', ''), URL(item['href'])))

        return result
示例#24
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse a saved HTML page into a video, picture or thumbnail-index result.

        Cases are tried in this order and the first match returns:

        1. the video rule matched: type ``'video'`` using the first
           ``<source src>`` found, plus model/link controls;
        2. the picture rule matched: type ``'pictures'`` with one
           ``FullPictureInfo`` per link, plus model/link controls;
        3. the href rule matched: type ``'hrefs'`` with thumbnails, letter
           controls and page links.

        :param fname: path of the downloaded HTML file.
        :param base_url: URL the page was loaded from; handed to
            ``self.get_href`` when rewriting collected ``href``/``src`` values.
        :return: the populated ``ParseResult``; no type is set when nothing
            matched.
        """
        def absolute(link):
            # Shared modifier for every href/src collected below.
            return self.get_href(link, base_url)

        parser = SiteParser()

        # Start page and model page thumbnails.
        href_rule = ParserRule()
        href_rule.add_activate_rule_level([('div', 'class', 'galleries'),
                                           ('div', 'class', 'models'),
                                           ('div', 'class', 'videos')])
        href_rule.add_activate_rule_level([('div', 'class', 'items')])
        href_rule.add_process_rule_level('a', {'href'})
        href_rule.add_process_rule_level('img', {'src', 'alt'})
        href_rule.set_attribute_modifier_function('href', absolute)
        href_rule.set_attribute_modifier_function('src', absolute)
        parser.add_rule(href_rule)

        # Pagination links.
        href_page_rule = ParserRule()
        href_page_rule.add_activate_rule_level([('ul', 'class', 'pagination')])
        href_page_rule.add_process_rule_level('a', {'href'})
        href_page_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(href_page_rule)

        # Links inside <span class="chars">.
        model_litera_rule = ParserRule()
        model_litera_rule.add_activate_rule_level([('span', 'class', 'chars')])
        model_litera_rule.add_process_rule_level('a', {'href'})
        model_litera_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(model_litera_rule)

        # Gallery page: links to the full-size pictures.
        picture_rule = ParserRule()
        picture_rule.add_activate_rule_level([('div', 'class', 'picture')])
        picture_rule.add_process_rule_level('a', {'href'})
        picture_rule.add_process_rule_level('img', {'alt'})
        picture_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(picture_rule)

        # Video page: <source> tags inside the player block (src unmodified).
        video_rule = ParserRule()
        video_rule.add_activate_rule_level([('div', 'class', 'video')])
        video_rule.add_process_rule_level('source', {'src'})
        parser.add_rule(video_rule)

        # Links shown alongside a gallery or video.
        picture_href_rule = ParserRule()
        picture_href_rule.add_activate_rule_level([('div', 'class', 'model')])
        picture_href_rule.add_activate_rule_level([('div', 'class', 'links')])
        picture_href_rule.add_process_rule_level('a', {'href'})
        picture_href_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(picture_href_rule)

        # Context manager closes the file even if feed() raises.
        # NOTE(review): no explicit encoding, so the locale default is used;
        # several sibling parsers pass encoding='utf-8' - confirm intent.
        with open(fname) as page:
            for line in page:
                parser.feed(line)

        result = ParseResult()

        if len(video_rule.get_result()) > 0:
            result.set_video(
                MediaData(URL(video_rule.get_result()[0]['src'] + '*')))
            result.set_type('video')

            for f in picture_href_rule.get_result(['href', 'data']):
                result.add_control(
                    ControlInfo(text=f['data'], url=URL(f['href'])))
            return result

        if len(picture_rule.get_result()) > 0:
            result.set_type('pictures')
            for f in picture_rule.get_result():
                # The file name is the last path component of the link.
                x = FullPictureInfo(abs_href=URL(f['href']),
                                    rel_name=f['href'].rpartition('/')[2])
                result.add_full(x)

            for f in picture_href_rule.get_result(['href', 'data']):
                result.add_control(
                    ControlInfo(text=f['data'], url=URL(f['href'])))
            return result

        if len(href_rule.get_result()) > 0:
            result.set_type('hrefs')
            for item in href_rule.get_result():
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']),
                              href=URL(item['href']),
                              popup=item.get('alt', '')))

            for item in model_litera_rule.get_result(['href', 'data']):
                result.add_control(ControlInfo(item['data'],
                                               URL(item['href'])))
            for item in href_page_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

        return result
示例#25
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse a saved HTML page into thumbnails and/or full pictures.

        The two result sections are independent and neither returns early:
        if both the start-page rule and the picture rule matched, the result
        carries thumbnails *and* pictures, and its type ends up as
        ``'pictures'`` because that is set last.

        :param fname: path of the downloaded HTML file (read as UTF-8).
        :param base_url: accepted for interface compatibility; not used here.
        :return: the populated ``ParseResult``; no type is set when nothing
            matched.
        """
        parser = SiteParser()

        # Index page thumbnails.
        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([('div', 'class',
                                                 'item photo-item')])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level('img', {'src', 'alt'})
        parser.add_rule(startpage_rule)

        # Index page pagination.
        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([
            ('ul', 'class', 'justified-pagination')
        ])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        parser.add_rule(startpage_pages_rule)

        # Index page tag list.
        startpage_tags_rule = ParserRule()
        startpage_tags_rule.add_activate_rule_level([('ul', 'class', 'tags')])
        startpage_tags_rule.add_process_rule_level('a', {'href'})
        parser.add_rule(startpage_tags_rule)

        # Gallery images: 'thumb' in the src is replaced by 'origin'.
        picture_rule = ParserRule()
        picture_rule.add_activate_rule_level([('div', 'class', 'photo-item')])
        picture_rule.add_process_rule_level('a', set())
        picture_rule.add_process_rule_level('img', {'src'})
        picture_rule.set_attribute_modifier_function(
            'src', lambda text: text.replace('thumb', 'origin'))
        parser.add_rule(picture_rule)

        # Model attached to the gallery (link + image alt text).
        picture_model_rule = ParserRule()
        picture_model_rule.add_activate_rule_level([('div', 'class',
                                                     'block attached-model')])
        picture_model_rule.add_process_rule_level('a', {'href'})
        picture_model_rule.add_process_rule_level('img', {'alt'})
        parser.add_rule(picture_model_rule)

        # Tags of the gallery.
        picture_tags_rule = ParserRule()
        picture_tags_rule.add_activate_rule_level([('div', 'class',
                                                    'block gallery-tags')])
        picture_tags_rule.add_activate_rule_level([('ul', 'class', 'tags')])
        picture_tags_rule.add_process_rule_level('a', {'href'})
        parser.add_rule(picture_tags_rule)

        # Context manager closes the file even if feed() raises.
        with open(fname, encoding='utf-8') as page:
            for line in page:
                parser.feed(line)

        result = ParseResult()

        if len(startpage_rule.get_result()) > 0:
            result.set_type('hrefs')
            for item in startpage_rule.get_result():
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']),
                              href=URL(item['href']),
                              popup=item.get('alt', '')))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

            for item in startpage_tags_rule.get_result(['href', 'data']):
                result.add_control(ControlInfo(item['data'],
                                               URL(item['href'])))

        if len(picture_rule.get_result()) > 0:
            result.set_type('pictures')
            # Pictures are saved under sequential names 001.jpg, 002.jpg, ...
            for number, f in enumerate(picture_rule.get_result(), start=1):
                result.add_full(
                    FullPictureInfo(abs_href=URL(f['src']),
                                    rel_name='%03d.jpg' % number))

            for item in picture_model_rule.get_result(['href', 'alt']):
                result.add_control(
                    ControlInfo(item['alt'], URL(item['href'] + '/galleries')))

            for item in picture_tags_rule.get_result(['href', 'data']):
                result.add_control(ControlInfo(item['data'],
                                               URL(item['href'])))

        return result
示例#26
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse a saved HTML page for one of the sites in ``self.accepted_sites``.

        The site entry whose ``'test'`` value is contained in ``base_url``
        supplies the activation triple (``'rule'``) for the thumbnail rule.
        Both result sections are evaluated; when thumbnails and pictures are
        both found, the type ends up as ``'pictures'`` because it is set last.

        :param fname: path of the downloaded HTML file (read as UTF-8).
        :param base_url: URL the page was loaded from; used to select the
            site entry and handed to ``self.get_href`` for link rewriting.
        :return: the populated ``ParseResult``.  An empty, untyped result is
            returned when no accepted site matches ``base_url`` (previously
            this raised ``TypeError`` on ``None['rule']``).
        """
        site = None
        for candidate in self.accepted_sites:
            if base_url.contain(candidate['test']):
                site = candidate  # no break: the last matching entry wins

        if site is None:
            # Unknown site: nothing can be activated, so report "no content"
            # the same way an unmatched page does.
            return ParseResult()

        def absolute(link):
            return self.get_href(link, base_url)

        parser = SiteParser()

        # Thumbnails; the activation element is site specific.
        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([site['rule']])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level('img', {'src', 'alt'})
        startpage_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(startpage_rule)

        # Pagination containers used across the supported sites.
        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([('ul', 'class', 'pager'),
                                                      ('div', 'class', 'navbartext'),
                                                      ('div', 'class', 'navigation'),
                                                      ('div', 'class', 'pager'),
                                                      ('div', 'class', 'col-md-12 pager'),
                                                      ('ul', 'id', 'pager')])
        startpage_pages_rule.add_process_rule_level('a', {'href', 'alt'})
        startpage_pages_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(startpage_pages_rule)

        # Gallery containers used across the supported sites; the thumbnail
        # markers '/tn_' and '_tn_' are stripped from the image src.
        picture_rule = ParserRule()
        picture_rule.add_activate_rule_level([('article', 'class', 'b-margin-40 g148 gallery'),
                                              ('div', 'class', 'wrapper_g'),
                                              ('td', 'style', 'background:#ededed;'),
                                              ('div', 'id', 'gallerycont'),
                                              ('div', 'class', 'galleryblock'),
                                              ('div', 'class', 'list gallery'),
                                              ('div', 'class', 'picturecontainer mainpics'),
                                              ('div', 'class', 'single_thumb'),
                                              ('div', 'class', 'minithumbs')])
        picture_rule.add_process_rule_level('a', set())
        picture_rule.add_process_rule_level('img', {'src'})
        picture_rule.set_attribute_modifier_function('src', lambda text: text.replace('/tn_', '/').replace('_tn_', '_'))
        parser.add_rule(picture_rule)

        # Context manager closes the file even if feed() raises.
        with open(fname, encoding='utf-8') as page:
            for line in page:
                parser.feed(line)

        result = ParseResult()

        if len(startpage_rule.get_result()) > 0:
            result.set_type('hrefs')
            for item in startpage_rule.get_result():
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']), href=URL(item['href']), popup=item.get('alt', '')))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

        if len(picture_rule.get_result()) > 0:
            result.set_type('pictures')
            # Pictures are saved under sequential names 001.jpg, 002.jpg, ...
            for number, f in enumerate(picture_rule.get_result(['src']), start=1):
                result.add_full(FullPictureInfo(abs_href=URL(f['src']), rel_name='%03d.jpg' % number))

        return result
示例#27
0
    def parse_index_file(self, fname, base_url=URL()):
        """Parse a saved HTML page into a picture gallery or a thumbnail index.

        A page counts as a gallery as soon as the ``left-main`` trigger rule
        matched; its pictures then come from the ``gall`` rule.  Otherwise,
        if the ``galleries`` rule matched, the page is treated as an index.

        :param fname: path of the downloaded HTML file.
        :param base_url: accepted for interface compatibility; not used here.
        :return: the populated ``ParseResult``; no type is set when nothing
            matched.
        """
        # Menu, page and thumbnail links are prefixed with this root.
        site_root = "http://lustimages.com"

        def prefixed(link):
            return site_root + link

        parser = SiteParser()

        # Index page thumbnails.
        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([('div', 'class', 'galleries')])
        # NOTE(review): {} is an empty dict, not a set - other parsers pass
        # set() here; left unchanged because ParserRule is not visible.
        startpage_rule.add_process_rule_level('div', {})
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level('img', {'src', 'alt'})
        startpage_rule.set_attribute_modifier_function('href', get_href)
        parser.add_rule(startpage_rule)

        # Site menu.
        menu_rule = ParserRule()
        menu_rule.add_activate_rule_level([('div', 'class', 'menu-list')])
        menu_rule.add_process_rule_level('a', {'href'})
        menu_rule.set_attribute_modifier_function('href', prefixed)
        parser.add_rule(menu_rule)

        # Pagination.
        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([('div', 'class', 'pagi')
                                                      ])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function('href', prefixed)
        parser.add_rule(startpage_pages_rule)

        # Only the presence of a match is used, to detect a gallery page.
        picture_trigger_rule = ParserRule()
        picture_trigger_rule.add_activate_rule_level([('div', 'class',
                                                       'left-main')])
        picture_trigger_rule.add_process_rule_level('a', {'href'})
        parser.add_rule(picture_trigger_rule)

        # Links to the full-size pictures.
        picture_rule = ParserRule()
        picture_rule.add_activate_rule_level([('div', 'class', 'gall')])
        picture_rule.add_process_rule_level('a', {'href'})
        parser.add_rule(picture_rule)

        # Context manager closes the file even if feed() raises.
        # NOTE(review): no explicit encoding, so the locale default is used.
        with open(fname) as page:
            for line in page:
                parser.feed(line)

        result = ParseResult()

        if len(picture_trigger_rule.get_result()) > 0:
            result.set_type('pictures')
            # Pictures are saved under sequential names 001.jpg, 002.jpg, ...
            for number, f in enumerate(picture_rule.get_result(), start=1):
                result.add_full(
                    FullPictureInfo(abs_href=URL(f['href']),
                                    rel_name='%03d.jpg' % number))

            for item in menu_rule.get_result(['href', 'data']):
                result.add_control(ControlInfo(item['data'],
                                               URL(item['href'])))

            return result

        if len(startpage_rule.get_result()) > 0:
            result.set_type('hrefs')
            for item in startpage_rule.get_result():
                # The href has already passed through get_href; the site
                # root is prepended on top of that result.
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']),
                              href=URL(site_root + item['href']),
                              popup=item.get('alt', '')))

            for item in menu_rule.get_result(['href', 'data']):
                result.add_control(ControlInfo(item['data'],
                                               URL(item['href'])))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

        return result
示例#28
0
文件: tma_model.py 项目: vit-001/fget
    def parse_index_file(self, fname, base_url=URL()):
        """Parse a saved HTML page into thumbnails and/or full pictures.

        The two result sections are independent and neither returns early:
        if both the href rule and the picture rule matched, the result
        carries thumbnails *and* pictures, and its type ends up as
        ``'pictures'`` because that is set last.

        :param fname: path of the downloaded HTML file.
        :param base_url: URL the page was loaded from; handed to
            ``self.get_href`` when rewriting collected ``href``/``src`` values.
        :return: the populated ``ParseResult``; no type is set when nothing
            matched.
        """
        def absolute(link):
            # Shared modifier for every href/src collected below.
            return self.get_href(link, base_url)

        parser = SiteParser()

        # Start page and model page thumbnails.
        href_rule = ParserRule()
        href_rule.add_activate_rule_level([('div', 'class', 'block galleries first'),
                                           ('div', 'class', 'block models'),
                                           ('div', 'class', 'block galleries')])
        href_rule.add_activate_rule_level([('div', 'class', 'thumbs')])
        href_rule.add_process_rule_level('a', {'href'})
        href_rule.add_process_rule_level('img', {'src', 'alt'})
        href_rule.set_attribute_modifier_function('href', absolute)
        href_rule.set_attribute_modifier_function('src', absolute)
        parser.add_rule(href_rule)

        # Pagination inside the galleries/models block.
        href_model_page_rule = ParserRule()
        href_model_page_rule.add_activate_rule_level([('div', 'class', 'block galleries'),
                                                      ('div', 'class', 'block models')])
        href_model_page_rule.add_activate_rule_level([('ul', 'class', 'pagination')])
        href_model_page_rule.add_process_rule_level('a', {'href'})
        href_model_page_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(href_model_page_rule)

        # Header links; filtered by their title when building controls below.
        model_litera_rule = ParserRule()
        model_litera_rule.add_activate_rule_level([('div', 'id', 'header')])
        model_litera_rule.add_process_rule_level('a', {'href', 'title'})
        model_litera_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(model_litera_rule)

        # Gallery page: links to the full-size pictures.
        picture_rule = ParserRule()
        picture_rule.add_activate_rule_level([('div', 'class', 'block gallery')])
        picture_rule.add_process_rule_level('a', {'href'})
        picture_rule.add_process_rule_level('img', {'alt'})
        picture_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(picture_rule)

        # Gallery page: profile cover link and its image alt text.
        picture_href_rule = ParserRule()
        picture_href_rule.add_activate_rule_level([('div', 'class', 'profile')])
        picture_href_rule.add_activate_rule_level([('div', 'class', 'cover')])
        picture_href_rule.add_process_rule_level('a', {'href'})
        picture_href_rule.add_process_rule_level('img', {'alt'})
        picture_href_rule.set_attribute_modifier_function('href', absolute)
        parser.add_rule(picture_href_rule)

        # Context manager closes the file even if feed() raises.
        # NOTE(review): no explicit encoding, so the locale default is used.
        with open(fname) as page:
            for line in page:
                parser.feed(line)

        result = ParseResult()

        if len(href_rule.get_result()) > 0:
            result.set_type('hrefs')
            for item in href_rule.get_result():
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']), href=URL(item['href']), popup=item.get('alt', '')))

            for item in model_litera_rule.get_result(['href', 'title', 'data']):
                # Only header links titled 'Met Art Models...' become controls.
                if item['title'].startswith('Met Art Models'):
                    result.add_control(ControlInfo(item['data'], URL(item['href'])))
            for item in href_model_page_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

        if len(picture_rule.get_result()) > 0:
            result.set_type('pictures')
            for f in picture_rule.get_result():
                # The file name is the last path component of the link.
                x = FullPictureInfo(abs_href=URL(f['href']), rel_name=f['href'].rpartition('/')[2])
                result.add_full(x)

            for f in picture_href_rule.get_result(['alt', 'href']):
                result.add_control(ControlInfo(text=f['alt'], url=URL(f['href'])))

        return result
示例#29
0
文件: xxp_model.py 项目: vit-001/fget
    def parse_index_file(self, fname, base_url=URL()):
        """Parse a saved HTML page into a thumbnail index or a picture gallery.

        If the ``blokth`` thumbnail rule matched, the page is an index
        (type ``'hrefs'``) and the method returns.  Otherwise, if the
        ``archives`` link rule matched, the page is a gallery (type
        ``'pictures'``): ``base_url`` itself is registered as picture 001
        and each collected link follows as 002, 003, ...  An
        ``XXPSitePictureCollector`` is attached to the result.

        :param fname: path of the downloaded HTML file.
        :param base_url: URL the page was loaded from; its ``domain()`` is
            prepended to collected links.
        :return: the populated ``ParseResult``; no type is set when nothing
            matched.
        """
        def with_domain(link):
            # domain + '/' + link + '*', shared by the index-page rules.
            return base_url.domain() + '/' + link + '*'

        parser = SiteParser()

        # Index page thumbnails.
        startpage_rule = ParserRule()
        startpage_rule.add_activate_rule_level([('td', 'class', 'blokth')])
        startpage_rule.add_process_rule_level('a', {'href'})
        startpage_rule.add_process_rule_level('img', {'src', 'alt'})
        startpage_rule.set_attribute_modifier_function('href', with_domain)
        startpage_rule.set_attribute_modifier_function('src', with_domain)
        parser.add_rule(startpage_rule)

        # Index page links; only those containing '?cat=' are used below.
        startpage_hrefs_rule = ParserRule()
        startpage_hrefs_rule.add_activate_rule_level([('td', 'class', 'text2')
                                                      ])
        startpage_hrefs_rule.add_activate_rule_level([('div', 'id', 'div3')])
        startpage_hrefs_rule.add_process_rule_level('a', {'href'})
        startpage_hrefs_rule.set_attribute_modifier_function('href', with_domain)
        parser.add_rule(startpage_hrefs_rule)

        # Index page pagination.
        startpage_pages_rule = ParserRule()
        startpage_pages_rule.add_activate_rule_level([('td', 'class',
                                                       'archives')])
        startpage_pages_rule.add_process_rule_level('a', {'href'})
        startpage_pages_rule.set_attribute_modifier_function('href', with_domain)
        parser.add_rule(startpage_pages_rule)

        # Image shown on a gallery page; used to derive the target directory.
        picture_base_rule = ParserRule()
        picture_base_rule.add_activate_rule_level([('td', 'height', '500')])
        picture_base_rule.add_process_rule_level('a', set())
        picture_base_rule.add_process_rule_level('img', {'src'})
        picture_base_rule.set_attribute_modifier_function(
            'src', lambda x: base_url.domain() + '/' + x)
        parser.add_rule(picture_base_rule)

        # Same container as the pagination rule, but here the domain is
        # joined to the link without a '/' separator.
        picture_rule = ParserRule()
        picture_rule.add_activate_rule_level([('td', 'class', 'archives')])
        picture_rule.add_process_rule_level('a', {'href'})
        picture_rule.set_attribute_modifier_function(
            'href', lambda x: base_url.domain() + x + '*')
        parser.add_rule(picture_rule)

        # Gallery page links; only those containing '?cat=' are used below.
        picture_tags_rule = ParserRule()
        picture_tags_rule.add_activate_rule_level([('span', 'class', 'text2')])
        picture_tags_rule.add_process_rule_level('a', {'href'})
        picture_tags_rule.set_attribute_modifier_function(
            'href', lambda x: x + '*')
        parser.add_rule(picture_tags_rule)

        # Context manager closes the file even if feed() raises.
        # NOTE(review): no explicit encoding, so the locale default is used.
        with open(fname) as page:
            for line in page:
                parser.feed(line)

        result = ParseResult()

        if len(startpage_rule.get_result()) > 0:
            result.set_type('hrefs')
            for item in startpage_rule.get_result():
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']),
                              href=URL(item['href']),
                              popup=item.get('alt', '')))

            for item in startpage_pages_rule.get_result(['href', 'data']):
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

            for item in startpage_hrefs_rule.get_result(['href', 'data']):
                if '?cat=' in item['href']:
                    result.add_control(
                        ControlInfo(item['data'], URL(item['href'])))

            return result

        if len(picture_rule.get_result()) > 0:
            result.set_type('pictures')

            result.set_picture_collector(XXPSitePictureCollector())

            # NOTE(review): indexing [0] presumably fails if the base image
            # rule found nothing on a page that has 'archives' links - verify.
            dirname = self.base_addr.rstrip('/') + URL(
                picture_base_rule.get_result()[0]['src']).get_path()
            result.set_gallery_path(dirname)

            # The loaded page itself is registered as picture 001 ...
            result.add_full(
                FullPictureInfo(abs_href=base_url,
                                abs_name=dirname + '%03d.jpg' % 1))

            # ... and the collected links follow as 002, 003, ...
            for number, f in enumerate(picture_rule.get_result(), start=2):
                result.add_full(
                    FullPictureInfo(abs_href=URL(f['href']),
                                    abs_name=dirname + '%03d.jpg' % number))

            for item in picture_tags_rule.get_result(['href', 'data']):
                if '?cat=' in item['href']:
                    result.add_control(
                        ControlInfo(item['data'], URL(item['href'])))

        return result