def parse_picture_page(self, fname, base_url=URL()):
    """Parse a saved gallery page and collect its picture links.

    Two rules are fed with the content of ``fname``:

    * a rule activated by ``<head>`` that processes ``base[href]`` -- when
      it yields anything the page is reported as a redirect;
    * a rule activated by ``<div class="girls">`` that processes
      ``a[href]``; every href is prefixed with ``base_url.get() + '/'`` and
      becomes a picture named ``001.jpg``, ``002.jpg``, ...

    :param fname: path of the downloaded HTML file.
    :param base_url: URL the file was downloaded from.
    :return: a ``ParseResult`` of type ``'redirect'`` or ``'pictures'``; the
        type is left unset when neither rule produced a result.
    """
    print('Parsing:', base_url.get(), base_url.domain())
    parser = SiteParser()

    redirect_rule = ParserRule()
    redirect_rule.add_activate_rule_level([('head', '', '')])
    redirect_rule.add_process_rule_level('base', {'href'})
    parser.add_rule(redirect_rule)

    picture_rule = ParserRule()
    picture_rule.add_activate_rule_level([('div', 'class', 'girls')])
    picture_rule.add_process_rule_level('a', {'href'})
    picture_rule.set_attribute_modifier_function(
        'href', lambda text: base_url.get() + '/' + text)
    parser.add_rule(picture_rule)

    # Context manager: the file handle is closed even if feed() raises.
    with open(fname) as page:
        for line in page:
            parser.feed(line)

    result = ParseResult()

    redirects = redirect_rule.get_result()
    if len(redirects) > 0:
        target = redirects[0]['href']
        print('Redirecting', target)
        result.set_type('redirect')
        result.set_redirect(URL(target))
        return result

    if len(picture_rule.get_result()) > 0:
        result.set_type('pictures')
        for i, f in enumerate(picture_rule.get_result(['href']), start=1):
            result.add_full(
                FullPictureInfo(abs_href=URL(f['href']),
                                rel_name='%03d.jpg' % i))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse a saved thumbnail index page.

    A single rule, activated by ``<div class="thumbs">`` or
    ``<div class="movie_thumbs">``, processes ``a[href]`` (passed through
    ``self.get_href``) and ``img[src, alt]``.

    :param fname: path of the downloaded HTML file.
    :param base_url: URL the file was downloaded from; only used in the
        diagnostic message printed when nothing was found.
    :return: a ``ParseResult``; typed ``'hrefs'`` with one thumb per rule
        result, or left untyped when the rule matched nothing.
    """
    parser = SiteParser()

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([('div', 'class', 'thumbs'),
                                            ('div', 'class', 'movie_thumbs')])
    startpage_rule.add_process_rule_level('a', {'href'})
    startpage_rule.add_process_rule_level('img', {'src', 'alt'})
    startpage_rule.set_attribute_modifier_function('href', self.get_href)
    parser.add_rule(startpage_rule)

    # Context manager: the file handle is closed even if feed() raises.
    with open(fname) as page:
        for line in page:
            parser.feed(line)

    result = ParseResult()

    thumbs = startpage_rule.get_result()
    if len(thumbs) > 0:
        result.set_type('hrefs')
        for item in thumbs:
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['src']),
                          href=URL(item['href']),
                          popup=item.get('alt', '')))
    else:
        print(base_url.get(), ' not parsed by BEmultiThumbSite. Add rule.')

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse a saved page that is either a gallery or a thumbnail index.

    Rules fed with the content of ``fname``:

    * thumbs: activated by ``div.thumbs200`` / ``div.thumbs300``; processes
      ``a[href]`` (via ``self.get_href``) and ``img[src, alt]``;
    * pages: activated by ``div.menu``; processes ``a[href]``, keeps only
      hrefs containing ``'/st/'`` and prefixes them with
      ``base_url.domain()``;
    * pictures: activated by ``div.gallery-thumbs``; processes ``img[src]``
      (via ``self.process_picture_address``).

    The picture rule has priority: if it matched, a ``'pictures'`` result is
    returned immediately.

    :param fname: path of the downloaded HTML file.
    :param base_url: URL the file was downloaded from.
    :return: a ``ParseResult`` typed ``'pictures'`` or ``'hrefs'``, or left
        untyped when nothing matched.
    """
    parser = SiteParser()

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([('div', 'class', 'thumbs200'),
                                            ('div', 'class', 'thumbs300')])
    startpage_rule.add_process_rule_level('a', {'href'})
    startpage_rule.add_process_rule_level('img', {'src', 'alt'})
    startpage_rule.set_attribute_modifier_function('href', self.get_href)
    parser.add_rule(startpage_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level([('div', 'class', 'menu')])
    startpage_pages_rule.add_process_rule_level('a', {'href'})
    startpage_pages_rule.set_attribute_modifier_function(
        'href', lambda x: base_url.domain() + x)
    startpage_pages_rule.set_attribute_filter_function(
        'href', lambda txt: '/st/' in txt)
    parser.add_rule(startpage_pages_rule)

    picture_rule = ParserRule()
    picture_rule.add_activate_rule_level([('div', 'class', 'gallery-thumbs')])
    picture_rule.add_process_rule_level('a', set())
    picture_rule.add_process_rule_level('img', {'src'})
    picture_rule.set_attribute_modifier_function(
        'src', self.process_picture_address)
    parser.add_rule(picture_rule)

    # Context manager: the file handle is closed even if feed() raises.
    with open(fname) as page:
        for line in page:
            parser.feed(line)

    result = ParseResult()

    pictures = picture_rule.get_result()
    if len(pictures) > 0:
        result.set_type('pictures')
        for f in pictures:
            # The file name is the last path component of the picture URL.
            result.add_full(
                FullPictureInfo(abs_href=URL(f['src']),
                                rel_name=f['src'].rpartition('/')[2]))
        return result

    thumbs = startpage_rule.get_result()
    if len(thumbs) > 0:
        result.set_type('hrefs')
        for item in thumbs:
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['src']),
                          href=URL(item['href']),
                          popup=item.get('alt', '')))

        for item in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(item['data'], URL(item['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse a saved index or gallery page.

    When ``self.is_pictures_page(base_url)`` is true the work is delegated
    to ``self.parse_picture_page``.  Otherwise four rules are fed with the
    content of ``fname``:

    * thumbs: activated by ``div.one``; ``a[href]`` (via the module-level
      ``get_href``) and ``img[src]``;
    * pages: activated by ``div#pager`` then ``div#pc``; ``a[href]``
      prefixed with ``base_url.domain()``;
    * pictures: activated by ``div#cc``; ``img[src, title]`` with ``'t_'``
      removed from ``src``;
    * picture links: activated by ``div#cc`` then ``div.shorttext``;
      ``a[href, alt]``.

    :param fname: path of the downloaded HTML file.
    :param base_url: URL the file was downloaded from.
    :return: a ``ParseResult``; the ``'pictures'`` type overrides ``'hrefs'``
        when both the thumb rule and the picture rule matched.
    """
    if self.is_pictures_page(base_url):
        return self.parse_picture_page(fname, base_url)

    parser = SiteParser()

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([('div', 'class', 'one')])
    startpage_rule.add_process_rule_level('a', {'href'})
    startpage_rule.add_process_rule_level('img', {'src'})
    startpage_rule.set_attribute_modifier_function(
        'href', lambda x: get_href(x))
    parser.add_rule(startpage_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level([('div', 'id', 'pager')])
    startpage_pages_rule.add_activate_rule_level([('div', 'id', 'pc')])
    startpage_pages_rule.add_process_rule_level('a', {'href'})
    startpage_pages_rule.set_attribute_modifier_function(
        'href', lambda x: base_url.domain() + x)
    parser.add_rule(startpage_pages_rule)

    picture_rule = ParserRule()
    picture_rule.add_activate_rule_level([('div', 'id', 'cc')])
    picture_rule.add_process_rule_level('a', set())
    picture_rule.add_process_rule_level('img', {'src', 'title'})
    picture_rule.set_attribute_modifier_function(
        'src', lambda text: text.replace('t_', ''))
    parser.add_rule(picture_rule)

    picture_href_rule = ParserRule()
    picture_href_rule.add_activate_rule_level([('div', 'id', 'cc')])
    picture_href_rule.add_activate_rule_level([('div', 'class', 'shorttext')])
    picture_href_rule.add_process_rule_level('a', {'href', 'alt'})
    parser.add_rule(picture_href_rule)

    # Context manager: the file handle is closed even if feed() raises.
    with open(fname) as page:
        for line in page:
            parser.feed(line)

    result = ParseResult()

    if len(startpage_rule.get_result()) > 0:
        result.set_type('hrefs')
        for item in startpage_rule.get_result(['href', 'src']):
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['src']),
                          href=URL(item['href'] + '*')))

        for item in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(
                ControlInfo(item['data'], URL(item['href'] + '*')))

    if len(picture_rule.get_result()) > 0:
        result.set_type('pictures')
        numbered = enumerate(picture_rule.get_result(['src', 'title']),
                             start=1)
        for i, f in numbered:
            result.add_full(
                FullPictureInfo(abs_href=URL(f['src']),
                                rel_name='%03d.jpg' % i))

        for f in picture_href_rule.get_result():
            # Only site-relative links are turned into controls.
            if f['href'].startswith('/'):
                result.add_control(
                    ControlInfo(text=f['alt'],
                                url=URL(base_url.domain() + f['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse a saved index or gallery page (UTF-8 encoded).

    Rules fed with the content of ``fname``:

    * thumbs: activated by ``div.flower``; ``a[href]`` (via
      ``self.get_href(x, base_url)``) and ``img[src, alt]``;
    * pages: activated by ``div.morepartners``; ``a[href, alt]`` with the
      same href modifier;
    * picture trigger: activated by ``a.thumbsmall``; ``img[src]`` -- only
      used to detect that the page is a gallery;
    * pictures: activated by ``div.flower``; ``a[class]`` and ``img[src]``
      with ``'/tn_'`` replaced by ``'/'`` in ``src``.

    :param fname: path of the downloaded HTML file.
    :param base_url: URL the file was downloaded from.
    :return: a ``ParseResult``; the ``'pictures'`` type overrides ``'hrefs'``
        when the picture trigger matched.
    """
    parser = SiteParser()

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([('div', 'class', 'flower')])
    startpage_rule.add_process_rule_level('a', {'href'})
    startpage_rule.add_process_rule_level('img', {'src', 'alt'})
    startpage_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url))
    parser.add_rule(startpage_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level(
        [('div', 'class', 'morepartners')])
    startpage_pages_rule.add_process_rule_level('a', {'href', 'alt'})
    startpage_pages_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url))
    parser.add_rule(startpage_pages_rule)

    picture_trigger_rule = ParserRule()
    picture_trigger_rule.add_activate_rule_level(
        [('a', 'class', 'thumbsmall')])
    picture_trigger_rule.add_process_rule_level('img', {'src'})
    parser.add_rule(picture_trigger_rule)

    picture_rule = ParserRule()
    picture_rule.add_activate_rule_level([('div', 'class', 'flower')])
    picture_rule.add_process_rule_level('a', {'class'})
    picture_rule.add_process_rule_level('img', {'src'})
    picture_rule.set_attribute_modifier_function(
        'src', lambda text: text.replace('/tn_', '/'))
    parser.add_rule(picture_rule)

    # Context manager: the file handle is closed even if feed() raises.
    with open(fname, encoding='utf-8') as page:
        for line in page:
            parser.feed(line)

    result = ParseResult()

    thumbs = startpage_rule.get_result()
    if len(thumbs) > 0:
        result.set_type('hrefs')
        for item in thumbs:
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['src']),
                          href=URL(item['href']),
                          popup=item.get('alt', '')))

        for item in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(item['data'], URL(item['href'])))

    if len(picture_trigger_rule.get_result()) > 0:
        result.set_type('pictures')
        i = 1
        for f in picture_rule.get_result(['src', 'class']):
            # Number only the accepted pictures so file names stay
            # consecutive.
            if f['class'] == 'thumbsmall':
                result.add_full(
                    FullPictureInfo(abs_href=URL(f['src']),
                                    rel_name='%03d.jpg' % i))
                i += 1

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse a saved blog-style index or gallery page.

    Rules fed with the content of ``fname``:

    * thumbs: activated by ``div.post``; ``a[href]`` and ``img[src, alt]``;
    * pages: activated by ``div.paginator``; ``a[href]``;
    * sidebar links: activated by ``div.sidebar``; ``li[class]`` and
      ``a[href, title]`` with ``'View all posts filed under '`` stripped
      from the title;
    * pictures: activated by ``dl.gallery-item``; ``img[src]`` with
      ``'-180x240'`` removed.

    The picture rule has priority: if it matched, a ``'pictures'`` result is
    returned immediately.

    :param fname: path of the downloaded HTML file.
    :param base_url: URL the file was downloaded from (not used here).
    :return: a ``ParseResult`` typed ``'pictures'`` or ``'hrefs'``, or left
        untyped when nothing matched.
    """
    parser = SiteParser()

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([('div', 'class', 'post')])
    startpage_rule.add_process_rule_level('a', {'href'})
    startpage_rule.add_process_rule_level('img', {'src', 'alt'})
    parser.add_rule(startpage_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level(
        [('div', 'class', 'paginator')])
    startpage_pages_rule.add_process_rule_level('a', {'href'})
    parser.add_rule(startpage_pages_rule)

    href_rule = ParserRule()
    href_rule.add_activate_rule_level([('div', 'class', 'sidebar')])
    href_rule.add_process_rule_level('li', {'class'})
    href_rule.add_process_rule_level('a', {'href', 'title'})
    href_rule.set_attribute_modifier_function(
        'title',
        lambda text: text.replace('View all posts filed under ', ''))
    parser.add_rule(href_rule)

    picture_rule = ParserRule()
    picture_rule.add_activate_rule_level([('dl', 'class', 'gallery-item')])
    picture_rule.add_process_rule_level('a', set())
    picture_rule.add_process_rule_level('img', {'src'})
    picture_rule.set_attribute_modifier_function(
        'src', lambda text: text.replace('-180x240', ''))
    parser.add_rule(picture_rule)

    # Context manager: the file handle is closed even if feed() raises.
    with open(fname) as page:
        for line in page:
            parser.feed(line)

    result = ParseResult()

    if len(picture_rule.get_result()) > 0:
        result.set_type('pictures')
        for i, f in enumerate(picture_rule.get_result(['src']), start=1):
            result.add_full(
                FullPictureInfo(abs_href=URL(f['src']),
                                rel_name='%03d.jpg' % i))
        return result

    thumbs = startpage_rule.get_result()
    if len(thumbs) > 0:
        result.set_type('hrefs')
        for item in thumbs:
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['src']),
                          href=URL(item['href']),
                          popup=item.get('alt', '')))

        for item in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(item['data'], URL(item['href'])))

        for item in href_rule.get_result(['class', 'href', 'title']):
            result.add_control(ControlInfo(item['title'], URL(item['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse a saved index or gallery page of hotxpix.net.

    Rules fed with the content of ``fname``:

    * thumbs: activated by ``div.grid_1``; ``a[href]`` (module-level
      ``get_href`` result prefixed with the site root) and
      ``img[src, alt]``;
    * pages: activated by ``div.pagination``; ``a[href, title]`` with the
      href prefixed with the site root;
    * picture trigger and pictures: both activated by
      ``div`` with class ``'grid_4 gal dbg'``; the trigger processes
      ``a[href]``, the picture rule ``a[href]`` and ``img[src]``.

    The picture trigger has priority: if it matched, a ``'pictures'`` result
    built from hrefs ending in ``'.jpg'`` is returned immediately.

    :param fname: path of the downloaded HTML file.
    :param base_url: URL the file was downloaded from (not used here).
    :return: a ``ParseResult`` typed ``'pictures'`` or ``'hrefs'``, or left
        untyped when nothing matched.
    """
    site_root = 'http://hotxpix.net'
    parser = SiteParser()

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([('div', 'class', 'grid_1')])
    startpage_rule.add_process_rule_level('a', {'href'})
    startpage_rule.add_process_rule_level('img', {'src', 'alt'})
    startpage_rule.set_attribute_modifier_function(
        'href', lambda x: site_root + get_href(x))
    parser.add_rule(startpage_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level(
        [('div', 'class', 'pagination')])
    startpage_pages_rule.add_process_rule_level('a', {'href', 'title'})
    startpage_pages_rule.set_attribute_modifier_function(
        'href', lambda x: site_root + x)
    parser.add_rule(startpage_pages_rule)

    picture_trigger_rule = ParserRule()
    picture_trigger_rule.add_activate_rule_level(
        [('div', 'class', 'grid_4 gal dbg')])
    picture_trigger_rule.add_process_rule_level('a', {'href'})
    parser.add_rule(picture_trigger_rule)

    picture_rule = ParserRule()
    picture_rule.add_activate_rule_level([('div', 'class', 'grid_4 gal dbg')])
    picture_rule.add_process_rule_level('a', {'href'})
    picture_rule.add_process_rule_level('img', {'src'})
    parser.add_rule(picture_rule)

    # Context manager: the file handle is closed even if feed() raises.
    with open(fname) as page:
        for line in page:
            parser.feed(line)

    result = ParseResult()

    if len(picture_trigger_rule.get_result()) > 0:
        result.set_type('pictures')
        i = 1
        for f in picture_rule.get_result(['href', 'src']):
            # Number only the accepted pictures so file names stay
            # consecutive.
            if f['href'].endswith('.jpg'):
                result.add_full(
                    FullPictureInfo(abs_href=URL(f['href']),
                                    rel_name='%03d.jpg' % i))
                i += 1
        return result

    thumbs = startpage_rule.get_result()
    if len(thumbs) > 0:
        result.set_type('hrefs')
        for item in thumbs:
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['src']),
                          href=URL(item['href']),
                          popup=item.get('alt', '')))

        for item in startpage_pages_rule.get_result(
                ['href', 'data', 'title']):
            result.add_page(ControlInfo(item['title'], URL(item['href'])))

    return result
def parse_index(self, request, url=URL()):
    """Return the address of the first picture found in ``request``.

    ``request`` is iterated as chunks of bytes; each chunk is decoded as
    UTF-8 (undecodable bytes are ignored) and fed to the parser.  The rule
    is activated by ``<td height="500">`` and processes ``img[src]``, with
    ``url.domain() + '/'`` prepended to every ``src``.

    Raises ``IndexError`` when the rule produced no result.
    """
    def absolute_src(src):
        return url.domain() + '/' + src

    rule = ParserRule()
    html_parser = SiteParser()

    rule.add_activate_rule_level([('td', 'height', '500')])
    rule.add_process_rule_level('a', set())
    rule.add_process_rule_level('img', {'src'})
    rule.set_attribute_modifier_function('src', absolute_src)
    html_parser.add_rule(rule)

    for chunk in request:
        text = chunk.decode(encoding="utf-8", errors="ignore")
        html_parser.feed(text)

    first_match = rule.get_result()[0]
    return first_match['src']
def parse_index(self, request, url):
    """Return the ``src`` of the first picture found in ``request``.

    ``request`` is iterated as chunks of bytes; each chunk is decoded as
    strict UTF-8 and fed to the parser.  The rule is activated by
    ``<center>`` and processes ``img[src]``.  ``url`` is not used.

    Raises ``IndexError`` when the rule produced no result.
    """
    rule = ParserRule()
    html_parser = SiteParser()

    rule.add_activate_rule_level([('center', '', '')])
    rule.add_process_rule_level('a', set())
    rule.add_process_rule_level('img', {'src'})
    html_parser.add_rule(rule)

    for chunk in request:
        text = chunk.decode('utf-8')
        html_parser.feed(text)

    first_match = rule.get_result()[0]
    return first_match['src']
def parse_index_file(self, fname, base_url=URL()):
    """Parse a saved index or gallery page.

    Rules fed with the content of ``fname``:

    * thumbs: activated by ``div.ogpost`` / ``div.post300`` /
      ``div.galelement``; ``a[href]`` (via the module-level
      ``get_href(x, base_url.domain())``) and ``img[src]``;
    * pages: activated by ``span.pager`` / ``div.pager``; ``a[href]``
      prefixed with ``base_url.domain()``;
    * side links: activated by ``div#right`` then ``div.rightbox``;
      ``a[href]``;
    * pictures: activated by ``div.galcontentpics``; ``img[src]`` passed
      through the module-level ``_del_thumb``.

    :param fname: path of the downloaded HTML file.
    :param base_url: URL the file was downloaded from.
    :return: a ``ParseResult``; the ``'pictures'`` type overrides ``'hrefs'``
        when both the thumb rule and the picture rule matched.
    """
    print(base_url.get(), base_url.domain())
    parser = SiteParser()

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([('div', 'class', 'ogpost'),
                                            ('div', 'class', 'post300'),
                                            ('div', 'class', 'galelement')])
    startpage_rule.add_process_rule_level('a', {'href'})
    startpage_rule.add_process_rule_level('img', {'src'})
    startpage_rule.set_attribute_modifier_function(
        'href', lambda x: get_href(x, base_url.domain()))
    parser.add_rule(startpage_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level([('span', 'class', 'pager'),
                                                  ('div', 'class', 'pager')])
    startpage_pages_rule.add_process_rule_level('a', {'href'})
    startpage_pages_rule.set_attribute_modifier_function(
        'href', lambda x: base_url.domain() + x)
    parser.add_rule(startpage_pages_rule)

    startpage_href_rule = ParserRule()
    startpage_href_rule.add_activate_rule_level([('div', 'id', 'right')])
    startpage_href_rule.add_activate_rule_level(
        [('div', 'class', 'rightbox')])
    startpage_href_rule.add_process_rule_level('a', {'href'})
    parser.add_rule(startpage_href_rule)

    picture_rule = ParserRule()
    picture_rule.add_activate_rule_level([('div', 'class', 'galcontentpics')])
    picture_rule.add_process_rule_level('a', set())
    picture_rule.add_process_rule_level('img', {'src'})
    picture_rule.set_attribute_modifier_function(
        'src', lambda text: _del_thumb(text))
    parser.add_rule(picture_rule)

    # Context manager: the file handle is closed even if feed() raises.
    with open(fname) as page:
        for line in page:
            parser.feed(line)

    result = ParseResult()

    if len(startpage_rule.get_result()) > 0:
        result.set_type('hrefs')
        for item in startpage_rule.get_result(['href', 'src']):
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['src']),
                          href=URL(item['href'] + '*'),
                          popup=item.get('alt', '')))

        for item in startpage_href_rule.get_result(['href', 'data']):
            # Only site-relative links are turned into controls.
            if item['href'].startswith('/'):
                result.add_control(
                    ControlInfo(
                        item['data'],
                        URL(base_url.domain() + item['href'] + '*')))

        for item in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(
                ControlInfo(item['data'], URL(item['href'] + '*')))

    if len(picture_rule.get_result()) > 0:
        result.set_type('pictures')
        for i, f in enumerate(picture_rule.get_result(['src']), start=1):
            result.add_full(
                FullPictureInfo(abs_href=URL(f['src']),
                                rel_name='%03d.jpg' % i))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse a saved index or gallery page.

    The site root is rebuilt as ``'http://'`` plus the network location of
    ``base_url.get()`` and is prepended to relative links.

    Rules fed with the content of ``fname``:

    * thumbs: activated by ``div.bodycontainer``; ``a[href]`` (site root +
      module-level ``get_href``) and ``img[src, alt, class]``;
    * pages: activated by ``td[align=right]``; ``a[href, title]``;
    * sites: activated by ``div.headerlinetext``; ``a[href]``;
    * picture trigger: activated by ``a.fancybox``; ``img[src]`` -- only
      used to detect that the page is a gallery;
    * pictures: activated by ``div.bodycontainer``; ``a[href, class]`` and
      ``img[alt]``;
    * picture links: activated by ``div.bodycontainer`` then
      ``h2[style='font-size:18px']``; ``a[href]``.

    The picture trigger has priority: if it matched, a ``'pictures'`` result
    is returned immediately.

    :param fname: path of the downloaded HTML file.
    :param base_url: URL the file was downloaded from.
    :return: a ``ParseResult`` typed ``'pictures'`` or ``'hrefs'``, or left
        untyped when nothing matched.
    """
    site_url = 'http://' + urlparse(base_url.get())[1].strip('/')
    print('site url=', site_url)
    parser = SiteParser()

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([('div', 'class', 'bodycontainer')])
    startpage_rule.add_process_rule_level('a', {'href'})
    startpage_rule.add_process_rule_level('img', {'src', 'alt', 'class'})
    startpage_rule.set_attribute_modifier_function(
        'href', lambda x: site_url + get_href(x))
    parser.add_rule(startpage_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level([('td', 'align', 'right')])
    startpage_pages_rule.add_process_rule_level('a', {'href', 'title'})
    startpage_pages_rule.set_attribute_modifier_function(
        'href', lambda x: site_url + x)
    parser.add_rule(startpage_pages_rule)

    site_rule = ParserRule()
    site_rule.add_activate_rule_level([('div', 'class', 'headerlinetext')])
    site_rule.add_process_rule_level('a', {'href'})
    parser.add_rule(site_rule)

    picture_trigger_rule = ParserRule()
    picture_trigger_rule.add_activate_rule_level([('a', 'class', 'fancybox')])
    picture_trigger_rule.add_process_rule_level('img', {'src'})
    parser.add_rule(picture_trigger_rule)

    picture_rule = ParserRule()
    picture_rule.add_activate_rule_level([('div', 'class', 'bodycontainer')])
    picture_rule.add_process_rule_level('a', {'href', 'class'})
    picture_rule.add_process_rule_level('img', {'alt'})
    parser.add_rule(picture_rule)

    picture_href_rule = ParserRule()
    picture_href_rule.add_activate_rule_level(
        [('div', 'class', 'bodycontainer')])
    picture_href_rule.add_activate_rule_level(
        [('h2', 'style', 'font-size:18px')])
    picture_href_rule.add_process_rule_level('a', {'href'})
    picture_href_rule.set_attribute_modifier_function(
        'href', lambda x: site_url + x)
    parser.add_rule(picture_href_rule)

    # Context manager: the file handle is closed even if feed() raises.
    with open(fname) as page:
        for line in page:
            parser.feed(line)

    result = ParseResult()

    if len(picture_trigger_rule.get_result()) > 0:
        result.set_type('pictures')
        i = 1
        for f in picture_rule.get_result(['href', 'alt', 'class']):
            # Number only the accepted pictures so file names stay
            # consecutive.
            if f['class'] == 'fancybox':
                result.add_full(
                    FullPictureInfo(abs_href=URL(f['href']),
                                    rel_name='%03d.jpg' % i))
                i += 1

        for item in picture_href_rule.get_result(['href', 'data']):
            result.add_control(
                ControlInfo(text=item['data'], url=URL(item['href'])))
        return result

    if len(startpage_rule.get_result()) > 0:
        result.set_type('hrefs')
        for item in startpage_rule.get_result(['href', 'src', 'class']):
            if item['class'] == 'thumb':
                result.add_thumb(
                    ThumbInfo(thumb_url=URL(item['src']),
                              href=URL(item['href']),
                              popup=item.get('alt', '')))

        for item in site_rule.get_result(['href', 'data']):
            result.add_site(ControlInfo(item['data'], URL(item['href'])))

        for item in startpage_pages_rule.get_result(
                ['href', 'data', 'title']):
            result.add_page(ControlInfo(item['data'], URL(item['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse a saved index or gallery page.

    Rules fed with the content of ``fname``:

    * thumbs: activated by ``div.ownpost``; ``a[href]`` (via
      ``self.get_href(x, base_url)``) and ``img[src, alt]``;
    * pages: activated by ``div.nav``; ``a[href]`` rewritten to
      ``base_url.domain() + '/' + href + '*'``;
    * pictures: activated by ``div.gallery``; ``img[src, class]`` with
      ``'/tn_'`` replaced by ``'/'`` in ``src``;
    * picture links: activated by ``div.gallery``; ``a[href]`` (via
      ``self.get_href``), kept only when the href contains
      ``'/?category='``.

    :param fname: path of the downloaded HTML file.
    :param base_url: URL the file was downloaded from.
    :return: a ``ParseResult``; the ``'pictures'`` type overrides ``'hrefs'``
        when both the thumb rule and the picture rule matched.
    """
    print(base_url.domain())
    parser = SiteParser()

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([('div', 'class', 'ownpost')])
    startpage_rule.add_process_rule_level('a', {'href'})
    startpage_rule.add_process_rule_level('img', {'src', 'alt'})
    startpage_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url))
    parser.add_rule(startpage_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level([('div', 'class', 'nav')])
    startpage_pages_rule.add_process_rule_level('a', {'href'})
    startpage_pages_rule.set_attribute_modifier_function(
        'href', lambda x: base_url.domain() + '/' + x + '*')
    parser.add_rule(startpage_pages_rule)

    picture_rule = ParserRule()
    picture_rule.add_activate_rule_level([('div', 'class', 'gallery')])
    picture_rule.add_process_rule_level('a', set())
    picture_rule.add_process_rule_level('img', {'src', 'class'})
    picture_rule.set_attribute_modifier_function(
        'src', lambda text: text.replace('/tn_', '/'))
    parser.add_rule(picture_rule)

    picture_href_rule = ParserRule()
    picture_href_rule.add_activate_rule_level([('div', 'class', 'gallery')])
    picture_href_rule.add_process_rule_level('a', {'href'})
    picture_href_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url))
    picture_href_rule.set_attribute_filter_function(
        'href', lambda x: x.find('/?category=') != -1)
    parser.add_rule(picture_href_rule)

    # Context manager: the file handle is closed even if feed() raises.
    with open(fname) as page:
        for line in page:
            parser.feed(line)

    result = ParseResult()

    thumbs = startpage_rule.get_result()
    if len(thumbs) > 0:
        result.set_type('hrefs')
        for item in thumbs:
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['src']),
                          href=URL(item['href'] + '*'),
                          popup=item.get('alt', '')))

        for item in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(item['data'], URL(item['href'])))

    if len(picture_rule.get_result()) > 0:
        result.set_type('pictures')
        numbered = enumerate(picture_rule.get_result(['src', 'class']),
                             start=1)
        for i, f in numbered:
            result.add_full(
                FullPictureInfo(abs_href=URL(f['src']),
                                rel_name='%03d.jpg' % i))

        for f in picture_href_rule.get_result():
            # Commas are stripped from the link text before it is used as
            # the control label.
            result.add_control(
                ControlInfo(f['data'].replace(',', ''), URL(f['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse a saved index or gallery page.

    Rules fed with the content of ``fname``:

    * thumbs: activated by ``div.lady``; ``a[href]`` prefixed with
      ``base_url.domain() + '/'`` and ``img[src, alt]``;
    * pages: activated by ``div.nav_link``; ``a[href]`` with the same
      prefix;
    * navigation: activated by ``td.nav``; ``a[href]``;
    * pictures: activated by ``td[align=center]``; ``a[href]`` and
      ``img[src]`` passed through the module-level ``_del_thumb``.

    A page is treated as a gallery when ``base_url`` contains
    ``'?go=gal&id='``; pictures are then stored under a directory built from
    ``self.base_addr``, the URL path and the text after the last ``'='`` of
    the URL.

    :param fname: path of the downloaded HTML file.
    :param base_url: URL the file was downloaded from.
    :return: a ``ParseResult``; the ``'pictures'`` type overrides ``'hrefs'``
        for gallery URLs.
    """
    print(base_url.get(), base_url.domain())
    parser = SiteParser()

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([('div', 'class', 'lady')])
    startpage_rule.add_process_rule_level('a', {'href'})
    startpage_rule.add_process_rule_level('img', {'src', 'alt'})
    startpage_rule.set_attribute_modifier_function(
        'href', lambda x: base_url.domain() + '/' + x)
    parser.add_rule(startpage_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level(
        [('div', 'class', 'nav_link')])
    startpage_pages_rule.add_process_rule_level('a', {'href'})
    startpage_pages_rule.set_attribute_modifier_function(
        'href', lambda x: base_url.domain() + '/' + x)
    parser.add_rule(startpage_pages_rule)

    startpage_nav_rule = ParserRule()
    startpage_nav_rule.add_activate_rule_level([('td', 'class', 'nav')])
    startpage_nav_rule.add_process_rule_level('a', {'href'})
    parser.add_rule(startpage_nav_rule)

    picture_rule = ParserRule()
    picture_rule.add_activate_rule_level([('td', 'align', 'center')])
    picture_rule.add_process_rule_level('a', {'href'})
    picture_rule.add_process_rule_level('img', {'src'})
    picture_rule.set_attribute_modifier_function(
        'src', lambda text: _del_thumb(text))
    parser.add_rule(picture_rule)

    # Context manager: the file handle is closed even if feed() raises.
    with open(fname) as page:
        for line in page:
            parser.feed(line)

    result = ParseResult()

    if len(startpage_rule.get_result()) > 0:
        result.set_type('hrefs')
        for item in startpage_rule.get_result(['href', 'src']):
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['src']),
                          href=URL(item['href'] + '*'),
                          popup=item.get('alt', '')))

        for item in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(
                ControlInfo(item['data'], URL(item['href'] + '*')))

        for item in startpage_nav_rule.get_result(['href', 'data']):
            # Only links that stay on the site become controls.
            if item['href'].startswith("http://www.deffki.su/"):
                result.add_control(
                    ControlInfo(item['data'], URL(item['href'] + '*')))

    if base_url.contain('?go=gal&id='):
        result.set_type('pictures')
        gallery_id = base_url.get().rpartition('=')[2]
        dirname = (self.base_addr + base_url.get_path() + '/'
                   + gallery_id + '/')
        result.set_gallery_path(dirname)

        i = 1
        for f in picture_rule.get_result(['src', 'href']):
            # Skip anything that is not a preview link with an absolute
            # picture address; the counter advances only for kept items.
            if not f['href'].startswith('prv.php?id='):
                continue
            if not f['src'].startswith('http://'):
                continue
            result.add_full(
                FullPictureInfo(abs_href=URL(f['src']),
                                abs_name=dirname + '%03d.jpg' % i))
            i += 1

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse a saved index or gallery page of teenport.com.

    Rules fed with the content of ``fname``:

    * thumbs: activated by ``div.thumbs_main`` /
      ``div`` class ``'content_box model_sub'`` / ``div.teen_girls_list`` /
      ``div.gallery_box``; ``a[href]`` (via the module-level ``get_href``)
      and ``img[src, alt]``;
    * menu: activated by ``div.head`` then ``ul.menu``; ``a[href]``;
    * archive pages: activated by ``div.head`` then ``span``; ``a[href]``
      prefixed with ``'http://www.teenport.com'``;
    * model links: activated by ``div`` class
      ``'model_desc model_niche_desc'``; ``a[href]``;
    * pictures: activated by ``div`` class ``'thumb_box top_corners'`` /
      ``'thumb_box bottom_corners'``; ``img[src]``;
    * picture links: activated by ``div.title``; ``a[href, title]``.

    :param fname: path of the downloaded HTML file.
    :param base_url: URL the file was downloaded from (not used here).
    :return: a ``ParseResult``; the ``'pictures'`` type overrides ``'hrefs'``
        when both the thumb rule and the picture rule matched.
    """
    parser = SiteParser()

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([
        ('div', 'class', 'thumbs_main'),
        ('div', 'class', 'content_box model_sub'),
        ('div', 'class', 'teen_girls_list'),
        ('div', 'class', 'gallery_box'),
    ])
    startpage_rule.add_process_rule_level('a', {'href'})
    startpage_rule.add_process_rule_level('img', {'src', 'alt'})
    startpage_rule.set_attribute_modifier_function('href', get_href)
    parser.add_rule(startpage_rule)

    startpage_menu_rule = ParserRule()
    startpage_menu_rule.add_activate_rule_level([('div', 'class', 'head')])
    startpage_menu_rule.add_activate_rule_level([('ul', 'class', 'menu')])
    startpage_menu_rule.add_process_rule_level('a', {'href'})
    startpage_menu_rule.set_attribute_modifier_function('href', lambda x: x)
    parser.add_rule(startpage_menu_rule)

    archive_pages_rule = ParserRule()
    archive_pages_rule.add_activate_rule_level([('div', 'class', 'head')])
    archive_pages_rule.add_activate_rule_level([('span', '', '')])
    archive_pages_rule.add_process_rule_level('a', {'href'})
    archive_pages_rule.set_attribute_modifier_function(
        'href', lambda x: 'http://www.teenport.com' + x)
    parser.add_rule(archive_pages_rule)

    model_href_rule = ParserRule()
    model_href_rule.add_activate_rule_level([
        ('div', 'class', 'model_desc model_niche_desc'),
    ])
    model_href_rule.add_process_rule_level('a', {'href'})
    parser.add_rule(model_href_rule)

    picture_rule = ParserRule()
    picture_rule.add_activate_rule_level([
        ('div', 'class', 'thumb_box top_corners'),
        ('div', 'class', 'thumb_box bottom_corners'),
    ])
    picture_rule.add_process_rule_level('a', set())
    picture_rule.add_process_rule_level('img', {'src'})
    # NOTE(review): this strips EVERY 't' from the address, not just a
    # thumbnail marker -- kept as in the original; confirm it is intended.
    picture_rule.set_attribute_modifier_function(
        'src', lambda text: text.replace('t', ''))
    parser.add_rule(picture_rule)

    picture_href_rule = ParserRule()
    picture_href_rule.add_activate_rule_level([('div', 'class', 'title')])
    picture_href_rule.add_process_rule_level('a', {'href', 'title'})
    parser.add_rule(picture_href_rule)

    # Context manager: the file handle is closed even if feed() raises.
    with open(fname) as page:
        for line in page:
            parser.feed(line)

    result = ParseResult()

    thumbs = startpage_rule.get_result()
    if len(thumbs) > 0:
        result.set_type('hrefs')
        for item in thumbs:
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['src']),
                          href=URL(item['href']),
                          popup=item.get('alt', '')))

        for item in startpage_menu_rule.get_result(['href', 'data']):
            result.add_control(ControlInfo(item['data'], URL(item['href'])))

        for item in model_href_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(item['data'], URL(item['href'])))

        for item in archive_pages_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(item['data'], URL(item['href'])))

    pictures = picture_rule.get_result()
    if len(pictures) > 0:
        result.set_type('pictures')
        for f in pictures:
            result.add_full(FullPictureInfo(rel_name=f['src']))

        for f in picture_href_rule.get_result():
            result.add_control(ControlInfo(f['title'], URL(f['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse a saved start page, model page or gallery page.

    Every ``href`` (and the thumb ``src``) is passed through
    ``self.get_href(x, base_url)``.  Rules fed with the content of
    ``fname``:

    * thumbs (start page and model page): activated by
      ``div#lst-galleries`` / ``div.lblock`` / ``div.modal_info_full``;
      ``a[href]`` and ``img[src, alt]``, filtered by
      ``self.thumb_href_filter`` and ``self.thumb_src_filter``;
    * page numbers: activated by ``div.pages`` / ``div.cat``; ``a[href]``;
    * model index letters: activated by ``div.babe_index``; ``a[href]``;
    * "more" links: activated by ``div.more`` / ``div#MoreCont``;
      ``a[href]`` filtered by ``self.thumb_href_filter``;
    * pictures: activated by ``div.lblock``; ``a[href]`` ending in
      ``'.jpg'`` and ``img[alt]``;
    * gallery links: activated by ``div#ModelMenu`` / ``div.lblock``;
      ``a[href, title]`` filtered by ``self.gal_href_filter``.

    The picture rule has priority: if it matched, a ``'pictures'`` result is
    returned immediately.

    :param fname: path of the downloaded HTML file.
    :param base_url: URL the file was downloaded from.
    :return: a ``ParseResult`` typed ``'pictures'`` or ``'hrefs'``, or left
        untyped when nothing matched.
    """
    def absolute(x):
        # Shared modifier: resolve a link against the page URL.
        return self.get_href(x, base_url)

    parser = SiteParser()

    href_rule = ParserRule()  # start page & model's page
    href_rule.add_activate_rule_level([('div', 'id', 'lst-galleries'),
                                       ('div', 'class', 'lblock'),
                                       ('div', 'class', 'modal_info_full')])
    href_rule.add_process_rule_level('a', {'href'})
    href_rule.add_process_rule_level('img', {'src', 'alt'})
    href_rule.set_attribute_modifier_function('href', absolute)
    href_rule.set_attribute_modifier_function('src', absolute)
    href_rule.set_attribute_filter_function('href', self.thumb_href_filter)
    href_rule.set_attribute_filter_function('src', self.thumb_src_filter)
    parser.add_rule(href_rule)

    href_page_rule = ParserRule()  # page number in model's page
    href_page_rule.add_activate_rule_level([('div', 'class', 'pages'),
                                            ('div', 'class', 'cat')])
    href_page_rule.add_process_rule_level('a', {'href'})
    href_page_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(href_page_rule)

    model_litera_rule = ParserRule()
    model_litera_rule.add_activate_rule_level(
        [('div', 'class', 'babe_index')])
    model_litera_rule.add_process_rule_level('a', {'href'})
    model_litera_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(model_litera_rule)

    model_more_rule = ParserRule()
    model_more_rule.add_activate_rule_level([('div', 'class', 'more'),
                                             ('div', 'id', 'MoreCont')])
    model_more_rule.add_process_rule_level('a', {'href'})
    model_more_rule.set_attribute_modifier_function('href', absolute)
    model_more_rule.set_attribute_filter_function(
        'href', self.thumb_href_filter)
    parser.add_rule(model_more_rule)

    picture_rule = ParserRule()  # gallery rule
    picture_rule.add_activate_rule_level([('div', 'class', 'lblock')])
    picture_rule.add_process_rule_level('a', {'href'})
    picture_rule.add_process_rule_level('img', {'alt'})
    picture_rule.set_attribute_modifier_function('href', absolute)
    picture_rule.set_attribute_filter_function(
        'href', lambda x: x.endswith('.jpg'))
    parser.add_rule(picture_rule)

    picture_href_rule = ParserRule()  # gallery links rule
    picture_href_rule.add_activate_rule_level([('div', 'id', 'ModelMenu'),
                                               ('div', 'class', 'lblock')])
    picture_href_rule.add_process_rule_level('a', {'href', 'title'})
    picture_href_rule.set_attribute_modifier_function('href', absolute)
    picture_href_rule.set_attribute_filter_function(
        'href', self.gal_href_filter)
    parser.add_rule(picture_href_rule)

    # Context manager: the file handle is closed even if feed() raises.
    with open(fname) as page:
        for line in page:
            parser.feed(line)

    result = ParseResult()

    pictures = picture_rule.get_result()
    if len(pictures) > 0:
        result.set_type('pictures')
        for i, f in enumerate(pictures, start=1):
            result.add_full(
                FullPictureInfo(abs_href=URL(f['href']),
                                rel_name='%03d.jpg' % i))

        for f in picture_href_rule.get_result(['href', 'title']):
            result.add_control(
                ControlInfo(text=f['title'], url=URL(f['href'])))
        return result

    thumbs = href_rule.get_result()
    if len(thumbs) > 0:
        result.set_type('hrefs')
        for item in thumbs:
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['src']),
                          href=URL(item['href']),
                          popup=item.get('alt', '')))

        for item in model_more_rule.get_result(['href', 'data']):
            result.add_control(ControlInfo(item['data'], URL(item['href'])))

        for item in model_litera_rule.get_result(['href', 'data']):
            result.add_control(ControlInfo(item['data'], URL(item['href'])))

        for item in href_page_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(item['data'], URL(item['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse a saved video page or thumbnail index page.

    The file is read as UTF-8 (undecodable bytes ignored) and every
    ``'</b>'`` is rewritten to ``'</a>'`` before the line is fed to the
    parser.  Rules:

    * thumbs: activated by ``div.bx`` / ``div`` class ``'bx lastrow'``;
      ``a[href, class]`` and ``img[src, alt]``;
    * pages: activated by ``div.pagerwrap``; ``a[href]``;
    * video: activated by ``div.video_panel``; ``script`` elements whose
      data contains ``'var flashvars'``;
    * gallery links: activated by ``div.tagas-secondrow``; ``a[href]``
      rewritten to ``base_url.domain() + href + '*'``;
    * uploader links: activated by ``div.info``; ``a[href]`` with
      ``'/user/'`` replaced by ``'/submitted/'`` and resolved through
      ``self.get_href``.

    If a video script was found, the ``flashvars.videoUrl*`` assignments are
    extracted from it; the Medium URL (or, failing that, the Low URL) becomes
    the main video and every URL found is added as an alternate.

    :param fname: path of the downloaded HTML file.
    :param base_url: URL the file was downloaded from.
    :return: a ``ParseResult`` typed ``'video'`` or ``'hrefs'``; left untyped
        when nothing matched or when the script holds no usable URL.
    """
    print('VP parsing')
    parser = SiteParser()

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([('div', 'class', 'bx'),
                                            ('div', 'class', 'bx lastrow')])
    startpage_rule.add_process_rule_level('a', {'href', 'class'})
    startpage_rule.add_process_rule_level('img', {'src', 'alt'})
    parser.add_rule(startpage_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level(
        [('div', 'class', 'pagerwrap')])
    startpage_pages_rule.add_process_rule_level('a', {'href'})
    parser.add_rule(startpage_pages_rule)

    video_rule = ParserRule()
    video_rule.add_activate_rule_level([('div', 'class', 'video_panel')])
    video_rule.add_process_rule_level('script', {})
    video_rule.set_attribute_filter_function(
        'data', lambda text: 'var flashvars' in text)
    parser.add_rule(video_rule)

    gallery_href_rule = ParserRule()
    gallery_href_rule.add_activate_rule_level(
        [('div', 'class', 'tagas-secondrow')])
    gallery_href_rule.add_process_rule_level('a', {'href'})
    gallery_href_rule.set_attribute_modifier_function(
        'href', lambda x: base_url.domain() + x + '*')
    parser.add_rule(gallery_href_rule)

    gallery_user_rule = ParserRule()
    gallery_user_rule.add_activate_rule_level([('div', 'class', 'info')])
    gallery_user_rule.add_process_rule_level('a', {'href'})
    gallery_user_rule.set_attribute_modifier_function(
        'href',
        lambda x: self.get_href(x.replace('/user/', '/submitted/'),
                                base_url))
    parser.add_rule(gallery_user_rule)

    # Context manager: the file handle is closed even if feed() raises.
    with open(fname, encoding='utf-8', errors='ignore') as page:
        for line in page:
            parser.feed(line.replace('</b>', '</a>'))

    result = ParseResult()

    video_scripts = video_rule.get_result()
    if len(video_scripts) > 0:
        # Spaces are removed so that `flashvars.x = "..."` and
        # `flashvars.x="..."` are matched by the same pattern.
        script = video_scripts[0]['data'].replace(' ', '')

        def get_url_from_script(script='', var=''):
            """Return URL of ``flashvars.<var>`` if it is https, else None."""
            marker = 'flashvars.' + var + '="'
            data = script.partition(marker)[2].partition('"')[0]
            if data.startswith('https://'):
                return URL(data)
            return None

        # (label shown to the user, flashvars variable), in display order.
        variants = (
            ('Low', 'videoUrlLow'),
            ('Low2', 'videoUrlLow2'),
            ('Medium', 'videoUrlMedium'),
            ('Medium', 'videoUrlMedium2'),
            ('HD', 'videoUrlHD'),
            ('HD', 'videoUrlHD2'),
        )
        urls = {var: get_url_from_script(script, var)
                for _, var in variants}

        # Prefer the medium quality, fall back to low.
        if urls['videoUrlMedium'] is not None:
            video = MediaData(urls['videoUrlMedium'])
        elif urls['videoUrlLow'] is not None:
            video = MediaData(urls['videoUrlLow'])
        else:
            print('No url found')
            return result

        for label, var in variants:
            if urls[var] is not None:
                video.add_alternate(dict(text=label, url=urls[var]))

        result.set_type('video')
        result.set_video(video)

        for f in gallery_user_rule.get_result():
            result.add_control(
                ControlInfo('"' + f['data'] + '"', URL(f['href'])))

        for f in gallery_href_rule.get_result(['data', 'href']):
            result.add_control(ControlInfo(f['data'], URL(f['href'])))
        return result

    thumbs = startpage_rule.get_result()
    if len(thumbs) > 0:
        result.set_type('hrefs')
        for item in thumbs:
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['src']),
                          href=URL(item['href']),
                          popup=item['alt']))

        for item in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(item['data'], URL(item['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Collect the link targets found inside ``div.single_plug`` blocks.

    This variant returns a plain list of ``href`` strings, not a
    ``ParseResult``.

    :param fname: path of the saved HTML page to parse.
    :param base_url: URL the page came from; only its domain is printed.
    :return: list holding the ``href`` of every match of the startpage rule.
    """
    print(base_url.domain())
    parser = SiteParser()

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([('div', 'class', 'single_plug')])
    startpage_rule.add_process_rule_level('a', {'href'})
    startpage_rule.add_process_rule_level('img', {'src', 'alt'})
    parser.add_rule(startpage_rule)

    # Context manager so the file handle is released deterministically.
    with open(fname) as page:
        for line in page:
            parser.feed(line)

    return [item['href'] for item in startpage_rule.get_result()]
def parse_index_file(self, fname, base_url=URL()):
    """Parse a saved page into either a picture gallery or a thumbnail index.

    The page is treated as a gallery when its ``<head>`` carries a ``meta``
    element whose ``name`` is ``gallery-id``; otherwise it is treated as an
    index of thumbnails with pagination links.

    :param fname: path of the saved HTML page to parse.
    :param base_url: URL the page came from; its domain is prepended to
        the relative links found in the page.
    :return: ``ParseResult`` typed ``'pictures'`` or ``'hrefs'``; left
        untyped when nothing matched.
    """
    parser = SiteParser()

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([('ul', 'class', 'thumbs'),
                                            ('ul', 'class', 'thumbs break')])
    startpage_rule.add_process_rule_level('a', {'href'})
    startpage_rule.add_process_rule_level('img', {'src', 'alt'})
    startpage_rule.set_attribute_modifier_function(
        'href', lambda x: base_url.domain() + x)
    parser.add_rule(startpage_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level([('p', 'class', 'pagination')])
    startpage_pages_rule.add_process_rule_level('a', {'class', 'href'})
    startpage_pages_rule.set_attribute_modifier_function(
        'href', lambda x: base_url.domain() + x)
    parser.add_rule(startpage_pages_rule)

    picture_trigger_rule = ParserRule()
    picture_trigger_rule.add_activate_rule_level([('head', '', '')])
    picture_trigger_rule.add_process_rule_level('meta', {'name'})
    parser.add_rule(picture_trigger_rule)

    picture_href_rule = ParserRule()
    picture_href_rule.add_activate_rule_level([('ul', 'class', 'options')])
    picture_href_rule.add_process_rule_level('a', {'href'})
    parser.add_rule(picture_href_rule)

    # Context manager so the file handle is released deterministically.
    with open(fname) as page:
        for line in page:
            parser.feed(line)

    result = ParseResult()

    is_gallery = any(item['name'] == 'gallery-id'
                     for item in picture_trigger_rule.get_result(['name']))

    if is_gallery:
        result.set_type('pictures')
        # On gallery pages the thumbnail links point at the full pictures.
        for number, f in enumerate(startpage_rule.get_result(), start=1):
            result.add_full(FullPictureInfo(abs_href=URL(f['href']),
                                            rel_name='%03d.jpg' % number))
        for f in picture_href_rule.get_result():
            # Only site-relative links become controls.
            if not f['href'].startswith('http://'):
                result.add_control(
                    ControlInfo(f['data'], URL(base_url.domain() + f['href'])))
        return result

    thumbs = startpage_rule.get_result()
    if len(thumbs) > 0:
        result.set_type('hrefs')
        for item in thumbs:
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['src']), href=URL(item['href']),
                          popup=item.get('alt', '')))
        for item in startpage_pages_rule.get_result(['href', 'data', 'class']):
            if item['class'] == 'page':
                result.add_page(ControlInfo(item['data'], URL(item['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse a saved page into a video, a thumbnail index and/or a gallery.

    A page with a player script containing ``video_url:`` is returned
    immediately as a video. Otherwise thumbnail matches produce an
    ``'hrefs'`` result and ``thumb_box`` matches produce a ``'pictures'``
    result; if both match, the picture type is set last.

    :param fname: path of the saved HTML page to parse.
    :param base_url: URL the page came from; its domain is prepended to
        pagination links.
    :return: the populated ``ParseResult``.
    """
    parser = SiteParser()

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([
        ('div', 'class', 'thumbs'),
        ('div', 'class', 'content_box domain2'),
        ('div', 'class', 'video_list'),
        ('div', 'class', 'video_list_models'),
        ('div', 'class', 'pics_list'),
        ('div', 'class', 'movie_thumbs'),
    ])
    startpage_rule.add_process_rule_level('a', {'href'})
    startpage_rule.add_process_rule_level('img', {'src', 'alt'})
    startpage_rule.set_attribute_modifier_function('href', self.get_href)
    parser.add_rule(startpage_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level([('div', 'class', 'pages'),
                                                  ('div', 'class', 'pg')])
    startpage_pages_rule.add_process_rule_level('a', {'href'})
    startpage_pages_rule.set_attribute_modifier_function(
        'href', lambda x: base_url.domain() + x)
    parser.add_rule(startpage_pages_rule)

    startpage_hrefs_rule = ParserRule()
    startpage_hrefs_rule.add_activate_rule_level(
        [('li', 'class', 'orange dropdown')])
    startpage_hrefs_rule.add_process_rule_level('a', {'href'})
    startpage_hrefs_rule.set_attribute_filter_function(
        'href', lambda txt: '/st/' in txt)
    parser.add_rule(startpage_hrefs_rule)

    video_rule = ParserRule()
    video_rule.add_activate_rule_level([('div', 'class', 'player')])
    video_rule.add_process_rule_level('script', {})
    video_rule.set_attribute_filter_function(
        'data', lambda text: 'video_url:' in text)
    parser.add_rule(video_rule)

    picture_rule = ParserRule()
    picture_rule.add_activate_rule_level([('div', 'class', 'thumb_box')])
    picture_rule.add_process_rule_level('a', set())
    picture_rule.add_process_rule_level('img', {'src'})
    picture_rule.set_attribute_modifier_function(
        'src', lambda text: text.replace('t.jpg', '.jpg'))
    parser.add_rule(picture_rule)

    picture_href_rule = ParserRule()
    picture_href_rule.add_activate_rule_level([('div', 'class', 'crumbles'),
                                               ('div', 'class', 'tags')])
    picture_href_rule.add_process_rule_level('a', {'href', 'title'})
    parser.add_rule(picture_href_rule)

    # Context manager so the file handle is released deterministically.
    with open(fname) as page:
        for line in page:
            parser.feed(line)

    result = ParseResult()

    if video_rule.is_result():
        script = video_rule.get_result()[0]['data']
        result.set_video(MediaData(URL(self.get_attr_from_script(script))))
        result.set_type('video')
        for f in picture_href_rule.get_result():
            result.add_control(ControlInfo(f['data'], URL(f['href'])))
        return result

    thumbs = startpage_rule.get_result()
    if len(thumbs) > 0:
        result.set_type('hrefs')
        for item in thumbs:
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['src']), href=URL(item['href']),
                          popup=item.get('alt', '')))
        for item in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(item['data'], URL(item['href'])))
        for item in startpage_hrefs_rule.get_result(['href', 'data']):
            result.add_control(ControlInfo(item['data'], URL(item['href'])))

    pictures = picture_rule.get_result()
    if len(pictures) > 0:
        result.set_type('pictures')
        for f in pictures:
            result.add_full(FullPictureInfo(rel_name=f['src']))
        for f in picture_href_rule.get_result():
            result.add_control(ControlInfo(f['title'], URL(f['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse a saved page into a script-driven gallery or a thumbnail index.

    Gallery pages are recognised by a script inside ``div.imagelinks`` that
    contains ``unescape``; the picture base address is cut out of that
    script and each picture name out of the scripts containing ``'src'``.
    Other pages are read as a thumbnail index with tags and page links.

    :param fname: path of the saved HTML page to parse.
    :param base_url: URL the page came from; used to absolutise links.
    :return: ``ParseResult`` typed ``'pictures'`` or ``'hrefs'``; left
        untyped when nothing matched.
    """
    parser = SiteParser()

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([('div', 'class', 'thumblinks')])
    startpage_rule.add_process_rule_level('a', {'href'})
    startpage_rule.add_process_rule_level('img', {'src', 'alt'})
    startpage_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url) + '*')
    startpage_rule.set_attribute_modifier_function(
        'src', lambda x: self.get_href(x, base_url) + '*')
    parser.add_rule(startpage_rule)

    tags_rule = ParserRule()
    tags_rule.add_activate_rule_level([('div', 'id', 'divTags')])
    tags_rule.add_process_rule_level('a', {'href'})
    tags_rule.set_attribute_modifier_function(
        'href', lambda x: base_url.domain() + x)
    parser.add_rule(tags_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level([('td', 'class', 'pages')])
    startpage_pages_rule.add_process_rule_level('a', {'href'})
    startpage_pages_rule.set_attribute_modifier_function(
        'href', lambda x: base_url.domain() + x)
    parser.add_rule(startpage_pages_rule)

    picture_base_addr_rule = ParserRule()
    picture_base_addr_rule.add_activate_rule_level(
        [('div', 'class', 'imagelinks')])
    picture_base_addr_rule.add_process_rule_level('script', {})
    picture_base_addr_rule.set_attribute_filter_function(
        'data', lambda x: 'unescape' in x)
    parser.add_rule(picture_base_addr_rule)

    picture_rule = ParserRule()
    picture_rule.add_activate_rule_level([('div', 'class', 'imagelinks')])
    picture_rule.add_process_rule_level('script', {})
    picture_rule.set_attribute_filter_function('data', lambda x: "'src'" in x)
    parser.add_rule(picture_rule)

    # Context manager so the file handle is released deterministically.
    with open(fname) as page:
        for line in page:
            parser.feed(line)

    result = ParseResult()

    base_scripts = picture_base_addr_rule.get_result()
    if len(base_scripts) > 0:
        result.set_type('pictures')
        # Base address is the text between "unescape('//" and the next quote,
        # with URL-encoded slashes restored first.
        unescaped = base_scripts[0]['data'].replace('%2f', '/')
        base = unescaped.partition("unescape('//")[2].partition("'")[0]
        for number, f in enumerate(picture_rule.get_result(), start=1):
            picname = f['data'].partition("+'")[2].partition("'")[0]
            result.add_full(
                FullPictureInfo(abs_href=URL(base + picname + '*'),
                                rel_name='%03d.jpg' % number))
        for item in tags_rule.get_result(['href', 'data']):
            result.add_control(ControlInfo(item['data'], URL(item['href'])))
        return result

    thumbs = startpage_rule.get_result()
    if len(thumbs) > 0:
        result.set_type('hrefs')
        for item in thumbs:
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['src']), href=URL(item['href']),
                          popup=item.get('alt', '')))
        for item in tags_rule.get_result(['href', 'data']):
            result.add_control(ControlInfo(item['data'], URL(item['href'])))
        for item in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(item['data'], URL(item['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse a saved page into a video, a thumbnail index and/or a gallery.

    A page whose ``div.main`` holds a script containing
    ``function getEmbed()`` is returned immediately as a video. Otherwise
    thumbnail matches yield ``'hrefs'`` and ``div.thumbs2`` matches yield
    ``'pictures'`` with an ``ELSitePictureCollector`` attached.

    :param fname: path of the saved HTML page (read as UTF-8).
    :param base_url: URL the page came from; its domain is prepended to
        pagination links.
    :return: the populated ``ParseResult``.
    """
    parser = SiteParser()

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([('div', 'class', 'image-delete'),
                                            ('div', 'class', 'thumbs'),
                                            ('div', 'class', 'image')])
    startpage_rule.add_process_rule_level('a', {'href'})
    startpage_rule.add_process_rule_level('img', {'src', 'alt'})
    parser.add_rule(startpage_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level([('div', 'class', 'pager')])
    startpage_pages_rule.add_process_rule_level('a', {'href'})
    startpage_pages_rule.set_attribute_modifier_function(
        'href', lambda x: base_url.domain() + x)
    parser.add_rule(startpage_pages_rule)

    video_rule = ParserRule()
    video_rule.add_activate_rule_level([('div', 'class', 'main')])
    video_rule.add_process_rule_level('script', {})
    video_rule.set_attribute_filter_function(
        'data', lambda text: 'function getEmbed()' in text)
    parser.add_rule(video_rule)

    picture_rule = ParserRule()
    picture_rule.add_activate_rule_level([('div', 'class', 'thumbs2')])
    picture_rule.add_process_rule_level('a', {'href'})
    picture_rule.add_process_rule_level('img', {'src'})
    parser.add_rule(picture_rule)

    picture_tags_rule = ParserRule()
    picture_tags_rule.add_activate_rule_level([('div', 'class', 'main')])
    picture_tags_rule.add_process_rule_level('a', {'href'})
    picture_tags_rule.set_attribute_filter_function(
        'href', lambda txt: '/categories/' in txt or '/model/' in txt)
    parser.add_rule(picture_tags_rule)

    # Context manager so the file handle is released deterministically.
    with open(fname, encoding='utf-8') as page:
        for line in page:
            parser.feed(line)

    result = ParseResult()

    scripts = video_rule.get_result()
    if len(scripts) > 0:
        video_address = self.get_attr_from_script(scripts[0]['data'])
        result.set_video(MediaData(URL(video_address)))
        result.set_type('video')
        for item in picture_tags_rule.get_result(['href', 'data']):
            result.add_control(ControlInfo(item['data'], URL(item['href'])))
        return result

    thumbs = startpage_rule.get_result()
    if len(thumbs) > 0:
        result.set_type('hrefs')
        for item in thumbs:
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['src']), href=URL(item['href']),
                          popup=item.get('alt', '')))
        for item in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(item['data'], URL(item['href'])))

    pictures = picture_rule.get_result()
    if len(pictures) > 0:
        result.set_type('pictures')
        result.set_picture_collector(ELSitePictureCollector())
        for number, f in enumerate(pictures, start=1):
            result.add_full(
                FullPictureInfo(abs_href=URL(f['href']),
                                rel_name='%03d.jpg' % number))
        for item in picture_tags_rule.get_result(['href', 'data']):
            result.add_control(ControlInfo(item['data'], URL(item['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Collect thumbnails, page links, pictures and controls from a saved page.

    This variant never calls ``set_type`` on the result; it only fills the
    thumb, page, full-picture and control lists from whichever rules match.

    :param fname: path of the saved HTML page (read as UTF-8).
    :param base_url: accepted for interface compatibility; not used here.
    :return: the populated ``ParseResult``.
    """
    parser = SiteParser()

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([('div', 'class', 'thumbs'),
                                            ('div', 'class', 'movie_thumbs')])
    startpage_rule.add_process_rule_level('a', {'href'})
    startpage_rule.add_process_rule_level('img', {'src', 'alt'})
    startpage_rule.set_attribute_modifier_function('href', get_href)
    parser.add_rule(startpage_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level([('div', 'class', 'head')])
    startpage_pages_rule.add_activate_rule_level([('div', 'class', 'pages')])
    startpage_pages_rule.add_process_rule_level('a', {'href'})
    startpage_pages_rule.set_attribute_modifier_function(
        'href', lambda x: 'http://www.tomorrowporn.com' + x)
    parser.add_rule(startpage_pages_rule)

    href_rule = ParserRule()
    href_rule.add_activate_rule_level([('ul', 'class', 'sub_thumb_list')])
    href_rule.add_process_rule_level('a', {'href'})
    href_rule.add_process_rule_level('img', {'src', 'alt'})
    href_rule.set_attribute_modifier_function('href', get_href)
    parser.add_rule(href_rule)

    picture_rule = ParserRule()
    picture_rule.add_activate_rule_level([
        ('div', 'class', 'thumb_box'),
        ('div', 'class', 'thumb_box bottom_corners'),
        ('div', 'class', 'thumb_box top_corners'),
    ])
    picture_rule.add_process_rule_level('a', set())
    picture_rule.add_process_rule_level('img', {'src'})
    # NOTE(review): this strips EVERY 't' from the address, not only a
    # thumbnail suffix such as 't.jpg' -- confirm this is intended.
    picture_rule.set_attribute_modifier_function(
        'src', lambda text: text.replace('t', ''))
    parser.add_rule(picture_rule)

    picture_href_rule = ParserRule()
    picture_href_rule.add_activate_rule_level([('div', 'class', 'menus')])
    picture_href_rule.add_process_rule_level('h2', set())
    picture_href_rule.add_process_rule_level('a', {'href', 'title'})
    parser.add_rule(picture_href_rule)

    # Context manager so the file handle is released deterministically.
    with open(fname, encoding='utf-8') as page:
        for line in page:
            parser.feed(line)

    result = ParseResult()

    thumbs = startpage_rule.get_result()
    if len(thumbs) > 0:
        for item in thumbs:
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['src']), href=URL(item['href']),
                          popup=item.get('alt', '')))
        for item in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(item['data'], URL(item['href'])))

    for item in href_rule.get_result():
        # Entries without an image cannot be shown as thumbnails.
        if 'src' in item:
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['src']), href=URL(item['href']),
                          popup=item.get('alt', '')))

    pictures = picture_rule.get_result()
    if len(pictures) > 0:
        for f in pictures:
            result.add_full(FullPictureInfo(rel_name=f['src']))
        for f in picture_href_rule.get_result():
            result.add_control(ControlInfo(f['title'], URL(f['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse a saved page into a video, a picture gallery or a thumbnail index.

    The first of these that applies decides the result:

    * the page text (spaces removed) contains ``urls.push({`` -> ``'video'``;
    * ``div#galleryImages`` matched -> ``'pictures'``;
    * the thumbnail containers matched -> ``'hrefs'``.

    :param fname: path of the saved HTML page (read as UTF-8, undecodable
        bytes ignored).
    :param base_url: URL the page came from; passed to ``self.get_href``
        to resolve links.
    :return: the populated ``ParseResult`` (untyped when nothing matched).
    """
    parser = SiteParser()

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([
        ('div', 'class', 'clearfix'),
        ('div', 'class', 'row clearfix video-container'),
    ])
    startpage_rule.add_process_rule_level('a', {'href'})
    startpage_rule.add_process_rule_level('img', {'src', 'alt'})
    startpage_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url))
    parser.add_rule(startpage_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level(
        [('div', 'class', 'btn-group clearfix full-width pagination-block')])
    startpage_pages_rule.add_process_rule_level('a', {'href'})
    startpage_pages_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url))
    parser.add_rule(startpage_pages_rule)

    startpage_categories_rule = ParserRule()
    startpage_categories_rule.add_activate_rule_level(
        [('ul', 'class', 'main-nav unstyled-list subCategories')])
    startpage_categories_rule.add_process_rule_level('a', {'href'})
    startpage_categories_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url))
    parser.add_rule(startpage_categories_rule)

    startpage_hrefs_rule = ParserRule()
    startpage_hrefs_rule.add_activate_rule_level(
        [('div', 'class', 'cat-menu hidden-xs')])
    startpage_hrefs_rule.add_process_rule_level('a', {'href'})
    startpage_hrefs_rule.set_attribute_filter_function(
        'href', lambda x: '/free_porn/' in x)
    startpage_hrefs_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url))
    parser.add_rule(startpage_hrefs_rule)

    # Registered with the parser, but the video address below is taken from
    # the raw page text rather than from this rule's result.
    video_rule = ParserRule()
    video_rule.add_activate_rule_level([('body', '', '')])
    video_rule.add_process_rule_level('script', {''})
    video_rule.set_attribute_filter_function(
        'data', lambda text: 'var urls' in text)
    parser.add_rule(video_rule)

    gallery_rule = ParserRule()
    gallery_rule.add_activate_rule_level([('div', 'id', 'galleryImages')])
    gallery_rule.add_process_rule_level('a', {})
    gallery_rule.add_process_rule_level('img', {'src'})
    gallery_rule.set_attribute_modifier_function(
        'src', lambda txt: txt.replace('/thumbs/', '/'))
    parser.add_rule(gallery_rule)

    gallery_href_rule = ParserRule()
    gallery_href_rule.add_activate_rule_level(
        [('div', 'class', 'video-player-list tag-list-block')])
    gallery_href_rule.add_process_rule_level('a', {'href', 'title'})
    gallery_href_rule.set_attribute_modifier_function(
        'href', lambda x: (self.get_href(x, base_url)))
    parser.add_rule(gallery_href_rule)

    gallery_user_rule = ParserRule()
    gallery_user_rule.add_activate_rule_level(
        [('div', 'class', 'video-player-info row')])
    gallery_user_rule.add_process_rule_level('a', {'href'})
    parser.add_rule(gallery_user_rule)

    # Feed the parser and keep a space-free copy of the page for the
    # textual video lookup. Pieces are joined once instead of growing a
    # string with ``+=``, and the name no longer shadows builtin ``all``.
    stripped_lines = []
    with open(fname, encoding='utf-8', errors='ignore') as page:
        for line in page:
            parser.feed(line)
            stripped_lines.append(line.replace(' ', ''))
    page_text = ''.join(stripped_lines)

    result = ParseResult()

    if 'urls.push({' in page_text:
        # Address sits between 'file:"' and the closing '"});' of the
        # first urls.push({...}) call.
        pushed = page_text.partition('urls.push({')[2].partition('"});')[0]
        video_url = pushed.partition('file:"')[2]
        video = MediaData(URL(video_url + '*'))
        result.set_type('video')
        result.set_video(video)

        if gallery_user_rule.is_result():
            uploader = gallery_user_rule.get_result()[0]
            user_name = uploader['data'].strip()
            user_number = uploader['href'].rpartition('-')[2].rstrip('/')
            result.add_control(
                ControlInfo('"' + user_name + '"',
                            URL('http://shockingmovies.com/uploads-by-user/'
                                + user_number + '/')))

        for f in gallery_href_rule.get_result(['href']):
            label = f['data'].strip().strip(',')
            if label == '':
                label = f['title']
            result.add_control(ControlInfo(label, URL(f['href'])))
        return result

    if gallery_rule.is_result():
        result.set_type('pictures')
        url = URL(gallery_rule.get_result()[0]['src'] + '*')
        base_dir = url.get_path(base=Setting.base_dir)
        result.set_gallery_path(base_dir)
        for f in gallery_rule.get_result():
            picture = FullPictureInfo(abs_href=URL(f['src'] + '*'),
                                      rel_name=f['src'].rpartition('/')[2])
            picture.set_base(base_dir)
            result.add_full(picture)

        for f in gallery_href_rule.get_result(['href']):
            label = f['data'].strip()
            if label == '':
                label = f['title']
            if '/user/' in f['href']:
                # A user link expands into two controls: uploaded videos
                # and uploaded photo galleries.
                parts = f['href'].rpartition('-')
                base = parts[0].partition('/user/')[0]
                uploads = base + '/uploads-by-user/' + parts[2]
                result.add_control(ControlInfo(label + ' videos', URL(uploads)))
                result.add_control(
                    ControlInfo(label + ' gals', URL(uploads + '?photos=1')))
            else:
                result.add_control(ControlInfo(label, URL(f['href'])))
        return result

    if startpage_rule.is_result():
        result.set_type('hrefs')
        for item in startpage_rule.get_result(['href']):
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['src']), href=URL(item['href']),
                          popup=item.get('alt', '')))

        for item in startpage_pages_rule.get_result(['href', 'data']):
            href = item['href']
            # Page label: the text between the last '/page' and the last '.'.
            page_number = href.rpartition('/page')[2].rpartition('.')[0]
            result.add_page(ControlInfo(page_number, URL(href)))

        if len(startpage_categories_rule.get_result(['href'])) > 0:
            for item in startpage_categories_rule.get_result(['href', 'data']):
                result.add_control(
                    ControlInfo(item.get('data', ''), URL(item['href'])))

        if len(startpage_hrefs_rule.get_result(['href'])) > 0:
            for item in startpage_hrefs_rule.get_result(['href', 'data']):
                result.add_control(
                    ControlInfo(item.get('data', ''), URL(item['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse a saved page into a video, a picture gallery or a thumbnail index.

    The first rule with matches decides the result: ``div.video`` sources
    give ``'video'``, ``div.picture`` links give ``'pictures'``, and the
    gallery/model/video item lists give ``'hrefs'``.

    :param fname: path of the saved HTML page to parse.
    :param base_url: URL the page came from; passed to ``self.get_href``
        to resolve links and image addresses.
    :return: the populated ``ParseResult`` (untyped when nothing matched).
    """
    parser = SiteParser()

    # Start page and model page thumbnails.
    href_rule = ParserRule()
    href_rule.add_activate_rule_level([('div', 'class', 'galleries'),
                                       ('div', 'class', 'models'),
                                       ('div', 'class', 'videos')])
    href_rule.add_activate_rule_level([('div', 'class', 'items')])
    href_rule.add_process_rule_level('a', {'href'})
    href_rule.add_process_rule_level('img', {'src', 'alt'})
    href_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url))
    href_rule.set_attribute_modifier_function(
        'src', lambda x: self.get_href(x, base_url))
    parser.add_rule(href_rule)

    # Page numbers on a model page.
    href_page_rule = ParserRule()
    href_page_rule.add_activate_rule_level([('ul', 'class', 'pagination')])
    href_page_rule.add_process_rule_level('a', {'href'})
    href_page_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url))
    parser.add_rule(href_page_rule)

    model_litera_rule = ParserRule()
    model_litera_rule.add_activate_rule_level([('span', 'class', 'chars')])
    model_litera_rule.add_process_rule_level('a', {'href'})
    model_litera_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url))
    parser.add_rule(model_litera_rule)

    # Gallery pictures.
    picture_rule = ParserRule()
    picture_rule.add_activate_rule_level([('div', 'class', 'picture')])
    picture_rule.add_process_rule_level('a', {'href'})
    picture_rule.add_process_rule_level('img', {'alt'})
    picture_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url))
    parser.add_rule(picture_rule)

    # Video source.
    video_rule = ParserRule()
    video_rule.add_activate_rule_level([('div', 'class', 'video')])
    video_rule.add_process_rule_level('source', {'src'})
    parser.add_rule(video_rule)

    # Links shown next to a gallery or video.
    picture_href_rule = ParserRule()
    picture_href_rule.add_activate_rule_level([('div', 'class', 'model')])
    picture_href_rule.add_activate_rule_level([('div', 'class', 'links')])
    picture_href_rule.add_process_rule_level('a', {'href'})
    picture_href_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url))
    parser.add_rule(picture_href_rule)

    # Context manager so the file handle is released deterministically.
    with open(fname) as page:
        for line in page:
            parser.feed(line)

    result = ParseResult()

    sources = video_rule.get_result()
    if len(sources) > 0:
        result.set_video(MediaData(URL(sources[0]['src'] + '*')))
        result.set_type('video')
        for f in picture_href_rule.get_result(['href', 'data']):
            result.add_control(ControlInfo(text=f['data'], url=URL(f['href'])))
        return result

    pictures = picture_rule.get_result()
    if len(pictures) > 0:
        result.set_type('pictures')
        for f in pictures:
            # File name is the last path component of the picture link.
            result.add_full(
                FullPictureInfo(abs_href=URL(f['href']),
                                rel_name=f['href'].rpartition('/')[2]))
        for f in picture_href_rule.get_result(['href', 'data']):
            result.add_control(ControlInfo(text=f['data'], url=URL(f['href'])))
        return result

    thumbs = href_rule.get_result()
    if len(thumbs) > 0:
        result.set_type('hrefs')
        for item in thumbs:
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['src']), href=URL(item['href']),
                          popup=item.get('alt', '')))
        for item in model_litera_rule.get_result(['href', 'data']):
            result.add_control(ControlInfo(item['data'], URL(item['href'])))
        for item in href_page_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(item['data'], URL(item['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse a saved page into a thumbnail index and/or a picture gallery.

    Matches of ``div.item photo-item`` yield an ``'hrefs'`` result with
    pages and tags; matches of ``div.photo-item`` yield a ``'pictures'``
    result with model and tag controls. There is no early return, so when
    both match the picture type is set last.

    :param fname: path of the saved HTML page (read as UTF-8).
    :param base_url: accepted for interface compatibility; not used here.
    :return: the populated ``ParseResult``.
    """
    parser = SiteParser()

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level(
        [('div', 'class', 'item photo-item')])
    startpage_rule.add_process_rule_level('a', {'href'})
    startpage_rule.add_process_rule_level('img', {'src', 'alt'})
    parser.add_rule(startpage_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level(
        [('ul', 'class', 'justified-pagination')])
    startpage_pages_rule.add_process_rule_level('a', {'href'})
    parser.add_rule(startpage_pages_rule)

    startpage_tags_rule = ParserRule()
    startpage_tags_rule.add_activate_rule_level([('ul', 'class', 'tags')])
    startpage_tags_rule.add_process_rule_level('a', {'href'})
    parser.add_rule(startpage_tags_rule)

    picture_rule = ParserRule()
    picture_rule.add_activate_rule_level([('div', 'class', 'photo-item')])
    picture_rule.add_process_rule_level('a', set())
    picture_rule.add_process_rule_level('img', {'src'})
    picture_rule.set_attribute_modifier_function(
        'src', lambda text: text.replace('thumb', 'origin'))
    parser.add_rule(picture_rule)

    picture_model_rule = ParserRule()
    picture_model_rule.add_activate_rule_level(
        [('div', 'class', 'block attached-model')])
    picture_model_rule.add_process_rule_level('a', {'href'})
    picture_model_rule.add_process_rule_level('img', {'alt'})
    parser.add_rule(picture_model_rule)

    picture_tags_rule = ParserRule()
    picture_tags_rule.add_activate_rule_level(
        [('div', 'class', 'block gallery-tags')])
    picture_tags_rule.add_activate_rule_level([('ul', 'class', 'tags')])
    picture_tags_rule.add_process_rule_level('a', {'href'})
    parser.add_rule(picture_tags_rule)

    # Context manager so the file handle is released deterministically.
    with open(fname, encoding='utf-8') as page:
        for line in page:
            parser.feed(line)

    result = ParseResult()

    thumbs = startpage_rule.get_result()
    if len(thumbs) > 0:
        result.set_type('hrefs')
        for item in thumbs:
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['src']), href=URL(item['href']),
                          popup=item.get('alt', '')))
        for item in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(item['data'], URL(item['href'])))
        for item in startpage_tags_rule.get_result(['href', 'data']):
            result.add_control(ControlInfo(item['data'], URL(item['href'])))

    pictures = picture_rule.get_result()
    if len(pictures) > 0:
        result.set_type('pictures')
        for number, f in enumerate(pictures, start=1):
            result.add_full(
                FullPictureInfo(abs_href=URL(f['src']),
                                rel_name='%03d.jpg' % number))
        for item in picture_model_rule.get_result(['href', 'alt']):
            result.add_control(
                ControlInfo(item['alt'], URL(item['href'] + '/galleries')))
        for item in picture_tags_rule.get_result(['href', 'data']):
            result.add_control(ControlInfo(item['data'], URL(item['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse a saved page using the thumbnail rule of the matching site.

    The start-page activation rule is taken from the entry of
    ``self.accepted_sites`` whose ``'test'`` value is contained in
    ``base_url``; when several entries match, the last one wins.

    :param fname: path of the saved HTML page (read as UTF-8).
    :param base_url: URL the page came from; selects the site entry and is
        passed to ``self.get_href`` to resolve links.
    :return: ``ParseResult`` typed ``'hrefs'`` and/or ``'pictures'`` (the
        picture type is set last). An empty, untyped result is returned
        when no accepted site matches ``base_url``.
    """
    site = None
    for candidate in self.accepted_sites:
        if base_url.contain(candidate['test']):
            site = candidate

    if site is None:
        # Without a site entry there is no activation rule; report it and
        # bail out instead of failing on ``None['rule']`` with a TypeError.
        print(base_url.get(), ' not matched by any accepted site. Add rule.')
        return ParseResult()

    parser = SiteParser()

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([site['rule']])
    startpage_rule.add_process_rule_level('a', {'href'})
    startpage_rule.add_process_rule_level('img', {'src', 'alt'})
    startpage_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url))
    parser.add_rule(startpage_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level([
        ('ul', 'class', 'pager'),
        ('div', 'class', 'navbartext'),
        ('div', 'class', 'navigation'),
        ('div', 'class', 'pager'),
        ('div', 'class', 'col-md-12 pager'),
        ('ul', 'id', 'pager'),
    ])
    startpage_pages_rule.add_process_rule_level('a', {'href', 'alt'})
    startpage_pages_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url))
    parser.add_rule(startpage_pages_rule)

    picture_rule = ParserRule()
    picture_rule.add_activate_rule_level([
        ('article', 'class', 'b-margin-40 g148 gallery'),
        ('div', 'class', 'wrapper_g'),
        ('td', 'style', 'background:#ededed;'),
        ('div', 'id', 'gallerycont'),
        ('div', 'class', 'galleryblock'),
        ('div', 'class', 'list gallery'),
        ('div', 'class', 'picturecontainer mainpics'),
        ('div', 'class', 'single_thumb'),
        ('div', 'class', 'minithumbs'),
    ])
    picture_rule.add_process_rule_level('a', set())
    picture_rule.add_process_rule_level('img', {'src'})
    picture_rule.set_attribute_modifier_function(
        'src', lambda text: text.replace('/tn_', '/').replace('_tn_', '_'))
    parser.add_rule(picture_rule)

    # Context manager so the file handle is released deterministically.
    with open(fname, encoding='utf-8') as page:
        for line in page:
            parser.feed(line)

    result = ParseResult()

    thumbs = startpage_rule.get_result()
    if len(thumbs) > 0:
        result.set_type('hrefs')
        for item in thumbs:
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['src']), href=URL(item['href']),
                          popup=item.get('alt', '')))
        for item in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(item['data'], URL(item['href'])))

    if len(picture_rule.get_result()) > 0:
        result.set_type('pictures')
        for number, f in enumerate(picture_rule.get_result(['src']), start=1):
            result.add_full(FullPictureInfo(abs_href=URL(f['src']),
                                            rel_name='%03d.jpg' % number))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse a saved lustimages.com page into a ParseResult.

    The file is fed line by line into a SiteParser with rules for gallery
    thumbnails, the menu, the pager, a picture-page trigger and the
    picture links themselves.

    :param fname: path of the saved page.
    :param base_url: accepted for interface compatibility; this
        implementation does not read it.
    :return: a ParseResult of type 'pictures' (full pictures plus menu
        controls) when the trigger rule matched, of type 'hrefs'
        (thumbnails, menu controls and pages) when the thumbnail rule
        matched, or without a type when neither did.
    """
    site_root = "http://lustimages.com"

    def absolute(x):
        return site_root + x

    parser = SiteParser()

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([('div', 'class', 'galleries')])
    # NOTE(review): an empty dict is passed here, while sibling calls pass
    # sets of attribute names; kept as-is, confirm whether set() was meant.
    startpage_rule.add_process_rule_level('div', {})
    startpage_rule.add_process_rule_level('a', {'href'})
    startpage_rule.add_process_rule_level('img', {'src', 'alt'})
    startpage_rule.set_attribute_modifier_function('href', get_href)
    parser.add_rule(startpage_rule)

    menu_rule = ParserRule()
    menu_rule.add_activate_rule_level([('div', 'class', 'menu-list')])
    menu_rule.add_process_rule_level('a', {'href'})
    menu_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(menu_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level([('div', 'class', 'pagi')])
    startpage_pages_rule.add_process_rule_level('a', {'href'})
    startpage_pages_rule.set_attribute_modifier_function('href', absolute)
    parser.add_rule(startpage_pages_rule)

    # Only used as a marker: any result means the page is a picture page.
    picture_trigger_rule = ParserRule()
    picture_trigger_rule.add_activate_rule_level([('div', 'class', 'left-main')])
    picture_trigger_rule.add_process_rule_level('a', {'href'})
    parser.add_rule(picture_trigger_rule)

    picture_rule = ParserRule()
    picture_rule.add_activate_rule_level([('div', 'class', 'gall')])
    picture_rule.add_process_rule_level('a', {'href'})
    parser.add_rule(picture_rule)

    # Context manager so the file handle is closed even if feed() raises.
    with open(fname) as page:
        for line in page:
            parser.feed(line)

    result = ParseResult()

    if picture_trigger_rule.get_result():
        result.set_type('pictures')
        # Pictures are named by their 1-based position: 001.jpg, 002.jpg, ...
        for number, picture in enumerate(picture_rule.get_result(), start=1):
            result.add_full(
                FullPictureInfo(abs_href=URL(picture['href']),
                                rel_name='%03d.jpg' % number))
        for item in menu_rule.get_result(['href', 'data']):
            result.add_control(ControlInfo(item['data'], URL(item['href'])))
        return result

    thumbs = startpage_rule.get_result()
    if thumbs:
        result.set_type('hrefs')
        for item in thumbs:
            # The site root is prepended on top of whatever get_href returned.
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['src']),
                          href=URL(site_root + item['href']),
                          popup=item.get('alt', '')))
        for item in menu_rule.get_result(['href', 'data']):
            result.add_control(ControlInfo(item['data'], URL(item['href'])))
        for item in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(item['data'], URL(item['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse a saved start, model or gallery page into a ParseResult.

    The file is fed line by line into a SiteParser with rules for
    thumbnails, model-page pagination, header links, gallery pictures and
    gallery-related links. Every extracted href (and the thumbnail src) is
    passed through ``self.get_href(x, base_url)``.

    :param fname: path of the saved page.
    :param base_url: URL the page was loaded from.
    :return: a ParseResult. The 'hrefs' branch adds thumbnails, header
        controls whose title starts with 'Met Art Models', and pages; the
        'pictures' branch adds full pictures and controls. Both branches
        may run, in which case the type ends up as 'pictures'.
    """
    def absolute_href(x):
        return self.get_href(x, base_url)

    parser = SiteParser()

    # Start page and model's page thumbnails.
    href_rule = ParserRule()
    href_rule.add_activate_rule_level([
        ('div', 'class', 'block galleries first'),
        ('div', 'class', 'block models'),
        ('div', 'class', 'block galleries'),
    ])
    href_rule.add_activate_rule_level([('div', 'class', 'thumbs')])
    href_rule.add_process_rule_level('a', {'href'})
    href_rule.add_process_rule_level('img', {'src', 'alt'})
    href_rule.set_attribute_modifier_function('href', absolute_href)
    href_rule.set_attribute_modifier_function('src', absolute_href)
    parser.add_rule(href_rule)

    # Page numbers on a model's page.
    href_model_page_rule = ParserRule()
    href_model_page_rule.add_activate_rule_level([
        ('div', 'class', 'block galleries'),
        ('div', 'class', 'block models'),
    ])
    href_model_page_rule.add_activate_rule_level([('ul', 'class', 'pagination')])
    href_model_page_rule.add_process_rule_level('a', {'href'})
    href_model_page_rule.set_attribute_modifier_function('href', absolute_href)
    parser.add_rule(href_model_page_rule)

    # Links in the page header; filtered by title further below.
    model_litera_rule = ParserRule()
    model_litera_rule.add_activate_rule_level([('div', 'id', 'header')])
    model_litera_rule.add_process_rule_level('a', {'href', 'title'})
    model_litera_rule.set_attribute_modifier_function('href', absolute_href)
    parser.add_rule(model_litera_rule)

    # Gallery pictures.
    picture_rule = ParserRule()
    picture_rule.add_activate_rule_level([('div', 'class', 'block gallery')])
    picture_rule.add_process_rule_level('a', {'href'})
    picture_rule.add_process_rule_level('img', {'alt'})
    picture_rule.set_attribute_modifier_function('href', absolute_href)
    parser.add_rule(picture_rule)

    # Links shown alongside a gallery (profile / cover blocks).
    picture_href_rule = ParserRule()
    picture_href_rule.add_activate_rule_level([('div', 'class', 'profile')])
    picture_href_rule.add_activate_rule_level([('div', 'class', 'cover')])
    picture_href_rule.add_process_rule_level('a', {'href'})
    picture_href_rule.add_process_rule_level('img', {'alt'})
    picture_href_rule.set_attribute_modifier_function('href', absolute_href)
    parser.add_rule(picture_href_rule)

    # Context manager so the file handle is closed even if feed() raises.
    with open(fname) as page:
        for line in page:
            parser.feed(line)

    result = ParseResult()

    thumbs = href_rule.get_result()
    if thumbs:
        result.set_type('hrefs')
        for item in thumbs:
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['src']),
                          href=URL(item['href']),
                          popup=item.get('alt', '')))
        for item in model_litera_rule.get_result(['href', 'title', 'data']):
            if item['title'].startswith('Met Art Models'):
                result.add_control(ControlInfo(item['data'], URL(item['href'])))
        for item in href_model_page_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(item['data'], URL(item['href'])))

    pictures = picture_rule.get_result()
    if pictures:
        result.set_type('pictures')
        for picture in pictures:
            # The file name is the last path segment of the picture address.
            result.add_full(
                FullPictureInfo(abs_href=URL(picture['href']),
                                rel_name=picture['href'].rpartition('/')[2]))
        for link in picture_href_rule.get_result(['alt', 'href']):
            result.add_control(ControlInfo(text=link['alt'], url=URL(link['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse a saved index or picture page into a ParseResult.

    The file is fed line by line into a SiteParser with rules for
    thumbnails, category links, the page archive, the main picture, the
    links to the other pictures and the picture tags.

    :param fname: path of the saved page.
    :param base_url: URL the page was loaded from; its domain is prepended
        to extracted addresses and it is itself used as the address of the
        first picture.
    :return: a ParseResult of type 'hrefs' (thumbnails, pages and '?cat='
        controls) when the thumbnail rule matched; otherwise of type
        'pictures' (with an XXPSitePictureCollector, a gallery path, full
        pictures and '?cat=' controls) when the picture rule matched;
        otherwise without a type.
    """
    def starred(x):
        # Domain-qualified address with the trailing '*' marker appended.
        return base_url.domain() + '/' + x + '*'

    parser = SiteParser()

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([('td', 'class', 'blokth')])
    startpage_rule.add_process_rule_level('a', {'href'})
    startpage_rule.add_process_rule_level('img', {'src', 'alt'})
    startpage_rule.set_attribute_modifier_function('href', starred)
    startpage_rule.set_attribute_modifier_function('src', starred)
    parser.add_rule(startpage_rule)

    startpage_hrefs_rule = ParserRule()
    startpage_hrefs_rule.add_activate_rule_level([('td', 'class', 'text2')])
    startpage_hrefs_rule.add_activate_rule_level([('div', 'id', 'div3')])
    startpage_hrefs_rule.add_process_rule_level('a', {'href'})
    startpage_hrefs_rule.set_attribute_modifier_function('href', starred)
    parser.add_rule(startpage_hrefs_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level([('td', 'class', 'archives')])
    startpage_pages_rule.add_process_rule_level('a', {'href'})
    startpage_pages_rule.set_attribute_modifier_function('href', starred)
    parser.add_rule(startpage_pages_rule)

    # Main picture of a picture page; its path becomes the gallery path.
    picture_base_rule = ParserRule()
    picture_base_rule.add_activate_rule_level([('td', 'height', '500')])
    picture_base_rule.add_process_rule_level('a', set())
    picture_base_rule.add_process_rule_level('img', {'src'})
    picture_base_rule.set_attribute_modifier_function(
        'src', lambda x: base_url.domain() + '/' + x)
    parser.add_rule(picture_base_rule)

    # Same activation as startpage_pages_rule, but the href is joined to
    # the domain without an extra '/'.
    picture_rule = ParserRule()
    picture_rule.add_activate_rule_level([('td', 'class', 'archives')])
    picture_rule.add_process_rule_level('a', {'href'})
    picture_rule.set_attribute_modifier_function(
        'href', lambda x: base_url.domain() + x + '*')
    parser.add_rule(picture_rule)

    picture_tags_rule = ParserRule()
    picture_tags_rule.add_activate_rule_level([('span', 'class', 'text2')])
    picture_tags_rule.add_process_rule_level('a', {'href'})
    picture_tags_rule.set_attribute_modifier_function('href', lambda x: x + '*')
    parser.add_rule(picture_tags_rule)

    # Context manager so the file handle is closed even if feed() raises.
    with open(fname) as page:
        for line in page:
            parser.feed(line)

    result = ParseResult()

    thumbs = startpage_rule.get_result()
    if thumbs:
        result.set_type('hrefs')
        for item in thumbs:
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['src']),
                          href=URL(item['href']),
                          popup=item.get('alt', '')))
        for item in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(item['data'], URL(item['href'])))
        for item in startpage_hrefs_rule.get_result(['href', 'data']):
            # Only category links become controls.
            if '?cat=' in item['href']:
                result.add_control(ControlInfo(item['data'], URL(item['href'])))
        return result

    if picture_rule.get_result():
        result.set_type('pictures')
        result.set_picture_collector(XXPSitePictureCollector())

        # NOTE(review): the [0] below raises IndexError when picture_rule
        # matched but picture_base_rule found nothing; unchanged from the
        # original, confirm whether that combination can occur.
        dirname = self.base_addr.rstrip('/') + URL(
            picture_base_rule.get_result()[0]['src']).get_path()
        result.set_gallery_path(dirname)

        # The page itself is picture 001; the linked ones follow from 002.
        result.add_full(
            FullPictureInfo(abs_href=base_url, abs_name=dirname + '%03d.jpg' % 1))
        for number, picture in enumerate(picture_rule.get_result(), start=2):
            result.add_full(
                FullPictureInfo(abs_href=URL(picture['href']),
                                abs_name=dirname + '%03d.jpg' % number))

        for item in picture_tags_rule.get_result(['href', 'data']):
            if '?cat=' in item['href']:
                result.add_control(ControlInfo(item['data'], URL(item['href'])))

    return result