def parse_index_file(self, fname, base_url=URL()):
    """Parse a downloaded page and describe it as a ParseResult.

    Rules for thumbnails, pagination, link lists, the video iframe and the
    gallery links are registered on a SiteParser, which is then run over
    ``fname`` through ``self.proceed_parcing``.  The first matching case
    wins: video page, broken video page, then thumbnail index.

    :param fname: file passed to ``self.proceed_parcing``.
    :param base_url: URL handed to ``self.get_href`` for every href.
    :return: the ParseResult; left untouched when no rule matched.
    """

    def resolve_href(link):
        return self.get_href(link, base_url)

    parser = SiteParser()

    # Thumbnails inside <div class="thumb">.
    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([('div', 'class', 'thumb')])
    startpage_rule.add_process_rule_level('a', {'title', 'href'})
    startpage_rule.add_process_rule_level('img', {'src', 'alt'})
    startpage_rule.set_attribute_modifier_function('href', resolve_href)
    parser.add_rule(startpage_rule)

    # Pagination links inside <div class="wp-pagenavi">.
    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level(
        [('div', 'class', 'wp-pagenavi')])
    startpage_pages_rule.add_process_rule_level('a', {'href'})
    startpage_pages_rule.set_attribute_modifier_function('href', resolve_href)
    parser.add_rule(startpage_pages_rule)

    # Extra links inside <ul class="list">.
    startpage_hrefs_rule = ParserRule()
    startpage_hrefs_rule.add_activate_rule_level([('ul', 'class', 'list')])
    startpage_hrefs_rule.add_process_rule_level('a', {'href'})
    startpage_hrefs_rule.set_attribute_modifier_function('href', resolve_href)
    parser.add_rule(startpage_hrefs_rule)

    # Player iframe inside <div id="video">.
    video_rule = ParserRule()
    video_rule.add_activate_rule_level([('div', 'id', 'video')])
    video_rule.add_process_rule_level('iframe', {'src'})
    parser.add_rule(video_rule)

    # Same container but matching a plain <div>; used below to report a
    # video page whose iframe is missing.
    fake_video_rule = ParserRule()
    fake_video_rule.add_activate_rule_level([('div', 'id', 'video')])
    fake_video_rule.add_process_rule_level('div', {})
    parser.add_rule(fake_video_rule)

    # Links inside <div id="extras">.
    gallery_href_rule = ParserRule()
    gallery_href_rule.add_activate_rule_level([('div', 'id', 'extras')])
    gallery_href_rule.add_process_rule_level('a', {'href'})
    gallery_href_rule.set_attribute_modifier_function('href', resolve_href)
    parser.add_rule(gallery_href_rule)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    if video_rule.is_result():
        src = video_rule.get_result()[0]['src']
        # Index 4 of the urlparse() tuple is the query string.
        query = parse_qs(urlparse(src)[4])
        alternates = []

        if 'f' in query:
            # The iframe carries an 'f' token that is POSTed to the loader
            # script; the reply JSON lists the media links under 'l'.
            post_data = {'data': query['f'][0]}
            php_url = 'http://donfreeporn.com/wp-content/themes/detube/Htplugins/Loader.php*'
            response = load(URL(php_url, 'POST', post_data=post_data))
            video_url = URL(response.json()['l'][0])
        else:
            # Otherwise scrape the jwplayer setup block of the iframe page.
            response = load(URL(src))
            setup = self.quotes(response.text, 'jwplayer("vplayer").setup(',
                                ');').replace(' ', '')
            sources = self.quotes(setup, 'sources:[{', '}],').split('},{')
            for source in sources:
                if '.mp4' not in source:
                    continue
                media_file = self.quotes(source, 'file:"', '"')
                label = self.quotes(source, 'label:"', '"')
                alternates.append(dict(text=label, url=URL(media_file + '*')))
            if not alternates:
                return result
            video_url = alternates[0]['url']

        video = MediaData(video_url)
        for alternate in alternates:
            video.add_alternate(alternate)

        result.set_type('video')
        result.set_video(video)

        for link in gallery_href_rule.get_result(['data', 'href']):
            result.add_control(ControlInfo(link['data'], URL(link['href'])))
        return result

    if fake_video_rule.is_result():
        print('Broken video on this url')
        return result

    if startpage_rule.is_result():
        result.set_type('hrefs')

        for thumb in startpage_rule.get_result():
            result.add_thumb(
                ThumbInfo(thumb_url=URL(thumb['src']),
                          href=URL(thumb['href']),
                          popup=thumb.get('title', '')))

        for page in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(page['data'], URL(page['href'])))

        for link in startpage_hrefs_rule.get_result(['href']):
            href = link['href']
            # The label is the second-to-last '/'-separated piece of href.
            result.add_control(ControlInfo(href.split('/')[-2], URL(href)))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse a downloaded page and describe it as a ParseResult.

    A page whose <body> holds a <script> containing 'angular.' is handled
    as a video page: the script's ``host:'...'`` value is loaded as JSON
    and its 'mediaSources' become the video and its alternates.  Otherwise
    a page with thumbnails is handled as an index with pagination,
    category and tag controls.

    :param fname: file passed to ``self.proceed_parcing``.
    :param base_url: URL handed to ``self.get_href`` for every href.
    :return: the ParseResult; left untouched when no rule matched.
    """

    def resolve_href(link):
        return self.get_href(link, base_url)

    parser = SiteParser()

    # Thumbnails: <a class="thumbnail" href> plus a <div style> whose
    # background url('...') is extracted as the image address.
    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([('div', 'class', 'fixed-content')])
    startpage_rule.add_process_rule_level('a', {'href', 'class'})
    startpage_rule.add_process_rule_level('div', {'style'})
    startpage_rule.set_attribute_filter_function(
        'class', lambda value: value == 'thumbnail')
    startpage_rule.set_attribute_modifier_function('href', resolve_href)
    startpage_rule.set_attribute_modifier_function(
        'style', lambda value: value.partition("url('")[2].partition("')")[0])
    parser.add_rule(startpage_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level(
        [('div', 'class', 'col-xs-12 content-pagination')])
    startpage_pages_rule.add_process_rule_level('a', {'href'})
    startpage_pages_rule.set_attribute_modifier_function('href', resolve_href)
    parser.add_rule(startpage_pages_rule)

    tags_rule = ParserRule()
    tags_rule.add_activate_rule_level([('section', 'id', 'footer-tag')])
    tags_rule.add_process_rule_level('a', {'href'})
    tags_rule.set_attribute_modifier_function('href', resolve_href)
    parser.add_rule(tags_rule)

    # Navbar links, restricted to category hrefs without a fragment.
    categories_rule = ParserRule()
    categories_rule.add_activate_rule_level([('ul', 'class', 'nav navbar-nav')])
    categories_rule.add_process_rule_level('a', {'href'})
    categories_rule.set_attribute_filter_function(
        'href', lambda value: '/Category/' in value and "#" not in value)
    categories_rule.set_attribute_modifier_function('href', resolve_href)
    parser.add_rule(categories_rule)

    video_rule = ParserRule()
    video_rule.add_activate_rule_level([('body', '', '')])
    video_rule.add_process_rule_level('script', {})
    video_rule.set_attribute_filter_function(
        'data', lambda text: 'angular.' in text)
    parser.add_rule(video_rule)

    gallery_href_rule = ParserRule()
    gallery_href_rule.add_activate_rule_level([('div', 'class', 'row tag-area')])
    gallery_href_rule.add_process_rule_level('a', {'href'})
    gallery_href_rule.set_attribute_modifier_function('href', resolve_href)
    parser.add_rule(gallery_href_rule)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    if video_rule.is_result():
        script = video_rule.get_result()[0]['data'].replace(' ', '')
        json_file_url = self.get_href(self.quotes(script, "host:'", "'"),
                                      base_url)

        from requests_loader import load, LoaderError

        json_file = Setting.base_dir + 'tsp_video.json'
        urls = []
        result.set_type('video')
        try:
            response = load(URL(json_file_url), json_file)
            seen_sources = set()
            for media in response.json()['mediaSources']:
                source = media['source']
                # Each distinct source is listed once.
                if source in seen_sources:
                    continue
                urls.append(dict(text=media['quality'], url=URL(source + '*')))
                seen_sources.add(source)

            if not urls:
                # Nothing playable: return without gallery controls.
                return result

            video = MediaData(urls[0]['url'])
            if len(urls) > 1:
                for alternate in urls:
                    video.add_alternate(alternate)
            result.set_video(video)
        except LoaderError as err:
            print(err)

        for link in gallery_href_rule.get_result(['data', 'href']):
            result.add_control(
                ControlInfo(link['data'].strip(), URL(link['href'])))
        return result

    if startpage_rule.is_result():
        result.set_type('hrefs')

        for thumb in startpage_rule.get_result(['href']):
            result.add_thumb(
                ThumbInfo(thumb_url=URL(thumb['style']),
                          href=URL(thumb['href']),
                          popup=thumb.get('alt', '')))

        for page in startpage_pages_rule.get_result(['href', 'data']):
            label = page['data'].replace(' ', '')
            # Page links whose text is only spaces are skipped.
            if len(label) > 0:
                result.add_page(ControlInfo(label, URL(page['href'])))

        if categories_rule.is_result(['href']):
            for link in categories_rule.get_result(['href', 'data']):
                result.add_control(
                    ControlInfo(link['data'], URL(link['href'])))

        if tags_rule.is_result(['href']):
            for link in tags_rule.get_result(['href', 'data']):
                result.add_control(
                    ControlInfo(link['data'], URL(link['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse a downloaded page and describe it as a ParseResult.

    A page with a 'fileone.tv' iframe inside <div class="player"> is
    handled as a video page: every iframe is loaded and the ``file:'...'``
    value of its jwplayer setup is collected.  Otherwise a page with
    thumbnails is handled as an index with pagination and tag controls.

    :param fname: file passed to ``self.proceed_parcing``.
    :param base_url: URL handed to ``self.get_href`` for hrefs and srcs.
    :return: the ParseResult; left untouched when no rule matched.
    """

    def resolve_href(link):
        return self.get_href(link, base_url)

    parser = SiteParser()

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([('div', 'class', 'link-3col')])
    startpage_rule.add_process_rule_level('a', {'href'})
    startpage_rule.add_process_rule_level('img', {'src', 'alt'})
    startpage_rule.set_attribute_modifier_function('href', resolve_href)
    startpage_rule.set_attribute_modifier_function('src', resolve_href)
    parser.add_rule(startpage_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level(
        [('div', 'class', 'wp-pagenavi')])
    startpage_pages_rule.add_process_rule_level('a', {'href'})
    startpage_pages_rule.set_attribute_modifier_function('href', resolve_href)
    parser.add_rule(startpage_pages_rule)

    tags_rule = ParserRule()
    tags_rule.add_activate_rule_level([('div', 'class', 'tagcloud')])
    tags_rule.add_process_rule_level('a', {'href'})
    tags_rule.set_attribute_modifier_function('href', resolve_href)
    parser.add_rule(tags_rule)

    video_rule = ParserRule()
    video_rule.add_activate_rule_level([('div', 'class', 'player')])
    video_rule.add_process_rule_level('iframe', {'src'})
    video_rule.set_attribute_filter_function(
        'src', lambda value: 'fileone.tv' in value)
    video_rule.set_attribute_modifier_function('src', resolve_href)
    parser.add_rule(video_rule)

    gallery_href_rule = ParserRule()
    gallery_href_rule.add_activate_rule_level([('div', 'id', 'extras')])
    gallery_href_rule.add_process_rule_level('a', {'href'})
    gallery_href_rule.set_attribute_modifier_function('href', resolve_href)
    parser.add_rule(gallery_href_rule)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    if video_rule.is_result():
        urls = UrlList()
        for frame in video_rule.get_result():
            print(frame)
            try:
                response = load(URL(frame['src']))
                setup = self.quotes(response.text,
                                    "jwplayer('player').setup(",
                                    ")").replace(' ', '')
                media_file = self.quotes(setup, "file:'", "'")
                urls.add("default", URL(media_file + '*'))
            except LoaderError as err:
                # A frame that fails to load is reported and skipped.
                print(err)

        result.set_video(urls.get_media_data())

        for link in gallery_href_rule.get_result(['data', 'href']):
            result.add_control(
                ControlInfo(link['data'].strip(), URL(link['href'])))
        return result

    if startpage_rule.is_result():
        for thumb in startpage_rule.get_result(['href']):
            result.add_thumb(
                ThumbInfo(thumb_url=URL(thumb['src']),
                          href=URL(thumb['href']),
                          popup=thumb.get('alt', '')))

        for page in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(page['data'], URL(page['href'])))

        for tag in tags_rule.get_result(['href', 'data']):
            result.add_control(ControlInfo(tag['data'], URL(tag['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse a downloaded page and describe it as a ParseResult.

    A page with an iframe inside <div class="videoContainer"> is handled
    as a video page: the iframe is loaded, the address given to
    ``jwplayer().load('...')`` is loaded in turn, and every
    ``<jwplayer:source file="..." label="...">`` in its first <item> is
    added to the URL list.  Otherwise a page with thumbnails is handled as
    an index with pagination and tag controls.

    :param fname: file passed to ``self.proceed_parcing``.
    :param base_url: URL handed to ``self.get_href`` for hrefs and srcs.
    :return: the ParseResult; left untouched when no rule matched.
    """

    def resolve_href(link):
        return self.get_href(link, base_url)

    parser = SiteParser()

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([('div', 'class', 'thumb')])
    startpage_rule.add_process_rule_level('a', {'href', 'title'})
    startpage_rule.add_process_rule_level('img', {'src', 'alt'})
    startpage_rule.set_attribute_modifier_function('href', resolve_href)
    parser.add_rule(startpage_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level(
        [('div', 'class', 'wp-pagenavi')])
    startpage_pages_rule.add_process_rule_level('a', {'href'})
    startpage_pages_rule.set_attribute_modifier_function('href', resolve_href)
    parser.add_rule(startpage_pages_rule)

    tags_rule = ParserRule()
    tags_rule.add_activate_rule_level([('div', 'class', 'tagcloud')])
    tags_rule.add_process_rule_level('a', {'href'})
    tags_rule.set_attribute_modifier_function('href', resolve_href)
    parser.add_rule(tags_rule)

    video_rule = ParserRule()
    video_rule.add_activate_rule_level([('div', 'class', 'videoContainer')])
    video_rule.add_process_rule_level('iframe', {'src'})
    video_rule.set_attribute_modifier_function('src', resolve_href)
    parser.add_rule(video_rule)

    gallery_href_rule = ParserRule()
    gallery_href_rule.add_activate_rule_level([('div', 'id', 'extras')])
    gallery_href_rule.add_process_rule_level('a', {'href'})
    gallery_href_rule.set_attribute_modifier_function('href', resolve_href)
    parser.add_rule(gallery_href_rule)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    if video_rule.is_result():
        urls = UrlList()
        for frame in video_rule.get_result():
            try:
                # Two hops: the iframe page names the playlist document.
                frame_page = load(URL(frame['src']))
                playlist_url = self.quotes(frame_page.text,
                                           "jwplayer().load('", "'")
                playlist = load(URL(playlist_url + '*'))

                source = self.quotes(playlist.text, '<item>',
                                     '</item>').strip()
                for chunk in source.split('<jwplayer:source file="'):
                    # Compare by value: the original `is ''` identity test
                    # only worked by accident of string interning.
                    if chunk == '':
                        continue
                    url = chunk.partition('"')[0]
                    label = self.quotes(chunk, 'label="', '"')
                    urls.add(label, URL(url + '*'))
            except LoaderError as err:
                # A frame that fails to load is reported and skipped.
                print(err)

        result.set_video(urls.get_media_data())

        for link in gallery_href_rule.get_result(['data', 'href']):
            result.add_control(
                ControlInfo(link['data'].strip(), URL(link['href'])))
        return result

    if startpage_rule.is_result():
        for thumb in startpage_rule.get_result(['href']):
            result.add_thumb(
                ThumbInfo(thumb_url=URL(thumb['src']),
                          href=URL(thumb['href']),
                          popup=thumb.get('alt', '')))

        for page in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(page['data'], URL(page['href'])))

        for tag in tags_rule.get_result(['href', 'data']):
            result.add_control(ControlInfo(tag['data'], URL(tag['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse a downloaded page or XHR reply and describe it as a ParseResult.

    When ``base_url.method`` is 'POST', ``fname`` is read as JSON and, if
    it reports success with non-empty content, the file is overwritten
    with that content before parsing.  A video page yields the player
    sources and studio/pornstar/channel controls; an index page yields
    thumbnails plus 'Main', sort, 'Prev' and 'Next' page links built as
    POST URLs to the load-more endpoint.

    :param fname: file to parse; rewritten in place for POST replies.
    :param base_url: URL of the page; its ``method``, ``post_data`` and
        ``xhr_data`` are read for POST replies.
    :return: the ParseResult; a fresh empty one when the JSON is invalid.
    """

    def resolve_href(link):
        return self.get_href(link, base_url)

    parser = SiteParser()

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level(
        [('div', 'class', 'video_item_wrapper video_item_medium')])
    startpage_rule.add_process_rule_level('a', {'href', 'class', 'title'})
    startpage_rule.add_process_rule_level('img', {'src', 'alt'})
    startpage_rule.set_attribute_modifier_function('href', resolve_href)
    startpage_rule.set_attribute_modifier_function('src', resolve_href)
    parser.add_rule(startpage_rule)

    video_rule = ParserRule()
    video_rule.add_activate_rule_level([('div', 'class', 'video_wrapper')])
    video_rule.add_process_rule_level('iframe', {'src'})
    video_rule.set_attribute_modifier_function('src', resolve_href)
    parser.add_rule(video_rule)

    video_href_rule = ParserRule()
    video_href_rule.add_activate_rule_level(
        [('div', 'class', 'single_description_item_info')])
    video_href_rule.add_process_rule_level('a', {'href'})
    video_href_rule.set_attribute_modifier_function('href', resolve_href)
    parser.add_rule(video_href_rule)

    try:
        if base_url.method == 'POST':
            has_more = False
            first_page = False
            with open(fname, encoding='utf-8', errors='ignore') as fd:
                reply = json.load(fd)

            if reply.get('success', False):
                next_data = reply['data']
                content = next_data['content']
                if len(content) > 0:
                    has_more = next_data['has_more']
                    print('has_more:', has_more)
                    # Replace the JSON envelope with the markup it carries
                    # so the parser below sees plain HTML.
                    with open(fname, 'w', encoding='utf-8') as fd:
                        fd.write(content)
        else:
            first_page = True
            has_more = True

        self.proceed_parcing(parser, fname)
    except ValueError:
        return ParseResult()

    result = ParseResult()

    if video_rule.is_result():
        frame = URL(video_rule.get_result()[0]['src'])
        print(frame)

        from requests_loader import load, LoaderError, get_last_index_cookie

        frame_file = Setting.base_dir + 'frame.html'
        cookie = get_last_index_cookie()
        try:
            response = load(frame, frame_file, cookie=cookie)
            print(response.text)
            urls = []

            # Cut the player setup assignment out of the de-spaced page.
            compact = response.text.replace(' ', '').replace('\\/', '/')
            setup = compact.partition('vc.player_setup=')[2].partition(';')[0]
            playlist = setup.partition('"playlist":')[2]

            for chunk in playlist.split('"file":"'):
                if '"label":' not in chunk:
                    continue
                part = chunk.partition('"')
                url = part[0]
                label = part[2].partition('"label":"')[2].partition('"')[0]
                print(label, url)
                urls.append(dict(text=label, url=URL(url + '*')))

            if not urls:
                # Nothing playable: return without category controls.
                return result

            video = MediaData(urls[0]['url'])
            if len(urls) > 1:
                for alternate in urls:
                    video.add_alternate(alternate)
            result.set_video(video)
        except LoaderError as err:
            print(err)

        def add_categories(parcer_result, text):
            # Add a control for every link whose href contains `text`.
            for link in parcer_result:
                if text in link['href']:
                    result.add_control(
                        ControlInfo(link['data'].strip(), URL(link['href'])))

        parcer_result = video_href_rule.get_result(['data', 'href'])
        for section in ('/studios/', '/pornstars/', '/channels/'):
            add_categories(parcer_result, section)
        return result

    if startpage_rule.is_result():
        for thumb in startpage_rule.get_result(['href']):
            result.add_thumb(
                ThumbInfo(thumb_url=URL(thumb['src']),
                          href=URL(thumb['href']),
                          popup=thumb.get('alt', '')))

        prev_data = None

        if first_page:
            print(base_url.get())
            xhr_data = {'base_url': base_url, 'step': 100}
            next_data = {
                'main_category_id': '1',
                'type': 'post',
                'filters[filter_type]': 'date',
                'filters[filter_period]': ''
            }

            # Checks run in this order on purpose: later matches overwrite
            # fields set by earlier ones.
            if base_url.contain('/video/'):
                next_data['name'] = 'all_videos'
            if base_url.contain('/amateur/videos/'):
                next_data['main_category_id'] = '4'
                next_data['name'] = 'all_videos'
            if base_url.contain('-amateur'):
                next_data['main_category_id'] = '4'
            if base_url.contain('/channels/'):
                next_data['name'] = 'category_videos'
                next_data['category_id[]'] = self.quotes(
                    base_url.get(), '/channels/', '/')
            if base_url.contain('/pornstars/'):
                next_data['name'] = 'pornstar_related_videos'
                next_data['content_id'] = self.quotes(
                    base_url.get(), '/pornstars/', '/')
                xhr_data['step'] = 65
            if base_url.contain('/studios/'):
                next_data['name'] = 'studio_related_videos'
                next_data['content_id'] = self.quotes(
                    base_url.get(), '/studios/', '/')
                xhr_data['step'] = 65

            next_data['offset'] = str(xhr_data['step'])
        else:
            next_data = base_url.post_data.copy()
            xhr_data = base_url.xhr_data.copy()
            curr = int(base_url.post_data['offset'])
            next_data['offset'] = str(curr + xhr_data['step'])
            if curr > 100:
                prev_data = base_url.post_data.copy()
                prev_data['offset'] = str(curr - xhr_data['step'])

        xhr_href = 'https://www.porndig.com/posts/load_more_posts/'

        def xhr_url(post_data):
            return URL(xhr_href, method='POST', post_data=post_data,
                       xhr_data=xhr_data)

        result.add_page(ControlInfo('Main', xhr_data['base_url']))

        sorted_data = next_data.copy()
        sorted_data['offset'] = '0'

        for method in ['date', 'views', 'rating', 'duration', 'ctr']:
            data = sorted_data.copy()
            data['filters[filter_type]'] = method
            result.add_page(
                ControlInfo('Sorted by {0}(0)'.format(method), xhr_url(data)))

            if prev_data is not None:
                data = prev_data.copy()
                data['filters[filter_type]'] = method
                result.add_page(
                    ControlInfo('Prev {0}({1})'.format(method, data['offset']),
                                xhr_url(data)))

            if has_more:
                data = next_data.copy()
                data['filters[filter_type]'] = method
                result.add_page(
                    ControlInfo('Next {0}({1})'.format(method, data['offset']),
                                xhr_url(data)))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse a downloaded page and describe it as a ParseResult.

    A page with a jwplayer <script> inside <div id="player"> is handled as
    a video page.  Sources are read from the inline ``sources:[{...}]``
    JSON or, when only a bare ``sources:`` is present, from the
    ``'bitrates'`` list of the external script matched by the second
    video rule.  Otherwise a page with video or channel thumbnails is
    handled as an index with pagination and navigation controls.

    :param fname: file passed to ``self.proceed_parcing``.
    :param base_url: URL handed to ``self.get_href`` for hrefs and srcs.
    :return: the ParseResult; left untouched when no rule matched or no
        video source was found.
    """

    def resolve_href(link):
        return self.get_href(link, base_url)

    parser = SiteParser()

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([('div', 'class', 'video')])
    startpage_rule.add_process_rule_level('a', {'href', 'title'})
    startpage_rule.add_process_rule_level('img', {'src', 'alt'})
    startpage_rule.set_attribute_modifier_function('href', resolve_href)
    startpage_rule.set_attribute_modifier_function('src', resolve_href)
    parser.add_rule(startpage_rule)

    channels_rule = ParserRule()
    channels_rule.add_activate_rule_level([('ul', 'class', 'channels')])
    channels_rule.add_process_rule_level('a', {'href', 'title'})
    channels_rule.add_process_rule_level('div', {})
    channels_rule.add_process_rule_level('img', {'src', 'alt'})
    channels_rule.set_attribute_modifier_function(
        'href', lambda x: self.get_href(x, base_url).replace('*', '/'))
    channels_rule.set_attribute_modifier_function('src', resolve_href)
    parser.add_rule(channels_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level(
        [('ul', 'class', 'pagination pagination-lg')])
    startpage_pages_rule.add_process_rule_level('a', {'href'})
    startpage_pages_rule.set_attribute_modifier_function('href', resolve_href)
    parser.add_rule(startpage_pages_rule)

    startpage_hrefs_rule = ParserRule()
    startpage_hrefs_rule.add_activate_rule_level(
        [('ul', 'class', 'nav nav-stacked navigation')])
    startpage_hrefs_rule.add_process_rule_level('a', {'href'})
    startpage_hrefs_rule.set_attribute_modifier_function('href', resolve_href)
    parser.add_rule(startpage_hrefs_rule)

    # Inline player script.
    video_rule = ParserRule()
    video_rule.add_activate_rule_level([('div', 'id', 'player')])
    video_rule.add_process_rule_level('script', {})
    video_rule.set_attribute_filter_function(
        'data', lambda text: 'jwplayer' in text)
    parser.add_rule(video_rule)

    # External script that carries the source list when the inline one
    # does not.
    video2_rule = ParserRule()
    video2_rule.add_activate_rule_level([('div', 'id', 'video')])
    video2_rule.add_process_rule_level('script', {'src'})
    video2_rule.set_attribute_filter_function(
        'src', lambda text: 'pornbraze.com/' in text)
    video2_rule.set_attribute_modifier_function('src', resolve_href)
    parser.add_rule(video2_rule)

    gallery_href_rule = ParserRule()
    gallery_href_rule.add_activate_rule_level(
        [('div', 'class', 'col-xs-12 col-sm-12 col-md-12')])
    gallery_href_rule.add_process_rule_level('a', {'href'})
    gallery_href_rule.set_attribute_modifier_function('href', resolve_href)
    parser.add_rule(gallery_href_rule)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    if video_rule.is_result():
        urls = []
        for item in video_rule.get_result():
            script = item['data'].replace(' ', '')
            if 'sources:[{' in script:
                txt = '[{' + self.quotes(script, 'sources:[{', '}]') + '}]'
                for j_data in json.loads(txt):
                    # Compare by value: the original `is not ''` identity
                    # test depended on string interning.
                    if j_data['file'] != '':
                        urls.append(dict(text=j_data['label'],
                                         url=URL(j_data['file'] + '*')))
            elif 'sources:' in script:
                if video2_rule.is_result(['src']):
                    php_url = URL(video2_rule.get_result(['src'])[0]['src'])
                    res = load(php_url)
                    bitrates = self.quotes(res.text, "'bitrates':[{",
                                           "}]").split('},{')
                    for line in bitrates:
                        print(line)
                        video_url = self.quotes(line, "'file':'", "'")
                        label = self.quotes(line, 'label:"', '"')
                        urls.append(dict(text=label,
                                         url=URL(video_url + '*')))

        if not urls:
            return result

        video = MediaData(urls[0]['url'])
        if len(urls) > 1:
            for alternate in urls:
                video.add_alternate(alternate)

        result.set_type('video')
        result.set_video(video)

        for f in gallery_href_rule.get_result(['data', 'href']):
            href = f['href'].replace('*', '/')
            label = f['data']
            if '/users/' in href:
                # User links go to the user's public videos and are shown
                # in double quotes.
                href = href + '/videos/public/'
                label = '"' + label + '"'
            result.add_control(ControlInfo(label, URL(href)))
        return result

    if startpage_rule.is_result() or channels_rule.is_result():
        result.set_type('hrefs')

        for item in startpage_rule.get_result(['href']):
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['src']),
                          href=URL(item['href']),
                          popup=item.get('alt', item.get('title', ''))))

        for item in channels_rule.get_result(['href']):
            result.add_thumb(
                ThumbInfo(thumb_url=URL(item['src']),
                          href=URL(item['href']),
                          popup=item.get('alt', item.get('title', ''))))

        for item in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(item['data'], URL(item['href'])))

        for item in startpage_hrefs_rule.get_result():
            # The label is the last path piece once '*' and '/' are trimmed
            # from both ends of the href.
            label = item['href'].strip('*/').rpartition('/')[2]
            result.add_control(ControlInfo(label, URL(item['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse a downloaded page and describe it as a ParseResult.

    Three page kinds are handled, first match wins: a single video
    (<video src> inside <div itemprop="video">), a multi-part video
    (parts in <div id="videos_container">, whose link is obtained by
    POSTing the selected part's data attributes to
    '/php/get_vlink.php'), and an index page with thumbnails.

    :param fname: file passed to ``self.proceed_parcing``.
    :param base_url: URL handed to ``self.get_href`` for every href; its
        '?s=' suffix selects the part of a multi-part video.
    :return: the ParseResult; left untouched when no rule matched.
    """

    def resolve_href(link):
        return self.get_href(link, base_url)

    parser = SiteParser()

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([('div', 'class', 'vid_container')])
    startpage_rule.add_process_rule_level('a', {'href', 'title'})
    startpage_rule.set_attribute_modifier_function('href', resolve_href)
    parser.add_rule(startpage_rule)

    startpage_combo_rule = ParserRule()
    startpage_combo_rule.add_activate_rule_level(
        [('div', 'class', 'combo_post_wrap')])
    startpage_combo_rule.add_process_rule_level('a', {'href', 'title'})
    startpage_combo_rule.add_process_rule_level('img', {'src'})
    startpage_combo_rule.set_attribute_modifier_function('href', resolve_href)
    parser.add_rule(startpage_combo_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level(
        [('div', 'id', 'center_control')])
    startpage_pages_rule.add_process_rule_level('a', {'href'})
    startpage_pages_rule.set_attribute_modifier_function('href', resolve_href)
    parser.add_rule(startpage_pages_rule)

    startpage_hrefs_rule = ParserRule()
    startpage_hrefs_rule.add_activate_rule_level(
        [('ul', 'class', 'dropdown-menu columns')])
    startpage_hrefs_rule.add_process_rule_level('a', {'href'})
    startpage_hrefs_rule.set_attribute_modifier_function('href', resolve_href)
    parser.add_rule(startpage_hrefs_rule)

    video_rule = ParserRule()
    video_rule.add_activate_rule_level([('div', 'itemprop', 'video')])
    video_rule.add_process_rule_level('video', {'src'})
    parser.add_rule(video_rule)

    video_multipart_rule = ParserRule()
    video_multipart_rule.add_activate_rule_level(
        [('div', 'id', 'videos_container')])
    video_multipart_rule.add_process_rule_level(
        'div', {'data-source', 'data-hash', 'data-x', 'data-oid', 'data-pid'})
    parser.add_rule(video_multipart_rule)

    # Script containing 'usss'; the multi-part branch reads usss[0] from it.
    video_usss_rule = ParserRule()
    video_usss_rule.add_activate_rule_level([('body', '', '')])
    video_usss_rule.add_process_rule_level('script', {})
    video_usss_rule.set_attribute_filter_function(
        'data', lambda text: 'usss' in text)
    parser.add_rule(video_usss_rule)

    gallery_href_rule = ParserRule()
    gallery_href_rule.add_activate_rule_level(
        [('div', 'class', 'popular_block_header_rl')])
    gallery_href_rule.add_process_rule_level('a', {'href'})
    gallery_href_rule.set_attribute_modifier_function('href', resolve_href)
    parser.add_rule(gallery_href_rule)

    gallery_author_rule = ParserRule()
    gallery_author_rule.add_activate_rule_level(
        [('div', 'id', 'posts_container')])
    gallery_author_rule.add_activate_rule_level(
        [('div', 'class', 'post_author_name')])
    gallery_author_rule.add_process_rule_level('a', {'href'})
    gallery_author_rule.set_attribute_modifier_function('href', resolve_href)
    parser.add_rule(gallery_author_rule)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    def add_gallery_controls():
        # Author links first (shown in double quotes), then gallery links.
        for link in gallery_author_rule.get_result(['data', 'href']):
            result.add_control(
                ControlInfo('"' + link['data'].strip() + '"',
                            URL(link['href'])))
        for link in gallery_href_rule.get_result(['data', 'href']):
            result.add_control(
                ControlInfo(link['data'].strip(), URL(link['href'])))

    if video_rule.is_result():
        video = MediaData(URL(video_rule.get_result()[0]['src']))
        result.set_video(video)
        add_gallery_controls()
        return result

    if video_multipart_rule.is_result():
        parts = video_multipart_rule.get_result()
        series = len(parts)

        # The part number comes from the '?s=' suffix and defaults to 1.
        selected = base_url.get().partition('?s=')[2]
        serie = 1 if selected == '' else int(selected)

        usss_script = video_usss_rule.get_result()[0]['data'].replace(' ', '')
        uid = self.quotes(usss_script, 'usss[0]="', '"')

        curr_result = parts[serie - 1]
        post_data = {
            'uid': uid,
            'source': curr_result['data-source'],
            'hash': curr_result['data-hash'],
            'x': curr_result['data-x'],
            'oid': curr_result['data-oid'],
            'pid': curr_result['data-pid']
        }
        url = URL(self.get_href('/php/get_vlink.php', base_url), 'POST',
                  post_data=post_data)

        response = load(url)
        video = MediaData(URL(response.text))
        result.set_type('video')
        result.set_video(video)

        # One control per part; the selected part is marked.
        for number in range(1, series + 1):
            label = 'S{0}'.format(number)
            if number == serie:
                label += '(this)'
            part_href = (base_url.get().partition('?')[0]
                         + '?s={0}'.format(number))
            result.add_control(ControlInfo(label, URL(part_href + '*')))

        add_gallery_controls()
        return result

    if startpage_rule.is_result() or startpage_combo_rule.is_result():
        for thumb in startpage_combo_rule.get_result():
            result.add_thumb(
                ThumbInfo(thumb_url=URL(thumb['src']),
                          href=URL(thumb['href']),
                          popup=thumb.get('title', '')))

        # NOTE(review): startpage_rule appears to process only <a> (its
        # 'img' level looks commented out in the source), so thumb['src']
        # may be missing here; confirm against the ParserRule behaviour.
        for thumb in startpage_rule.get_result(['href']):
            result.add_thumb(
                ThumbInfo(thumb_url=URL(thumb['src']),
                          href=URL(thumb['href']),
                          popup=thumb.get('title', '')))

        for page in startpage_pages_rule.get_result(['href', 'data']):
            href = page['href']
            data = page['data']
            # Label is the href's last path piece up to its first '.'.
            n = href.rpartition('/')[2].partition('.')[0]
            result.add_page(ControlInfo('{1}'.format(data, n), URL(href)))

        for link in startpage_hrefs_rule.get_result(['href', 'data']):
            result.add_control(ControlInfo(link['data'], URL(link['href'])))

    return result
def parse_index_file(self, fname, base_url=URL()):
    """Parse a downloaded page and describe it as a ParseResult.

    A page with an iframe inside <div class="section-content"> /
    <div id="video"> is handled as a video page.  '.video/embed' frames
    are scraped for the jwplayer "vplayer" sources, 'javfinder.com/'
    frames for their ``<source src=... res=...>`` tags; only entries
    containing '.mp4' are kept.  Otherwise a page with thumbnails is
    handled as an index with pagination and menu controls.

    :param fname: file passed to ``self.proceed_parcing``.
    :param base_url: URL handed to ``self.get_href`` for hrefs and srcs.
    :return: the ParseResult; left untouched when no rule matched.
    """

    def resolve_href(link):
        return self.get_href(link, base_url)

    parser = SiteParser()

    startpage_rule = ParserRule()
    startpage_rule.add_activate_rule_level([('div', 'class', 'thumb')])
    startpage_rule.add_process_rule_level('a', {'href'})
    startpage_rule.add_process_rule_level('img', {'src', 'alt'})
    startpage_rule.set_attribute_modifier_function('href', resolve_href)
    startpage_rule.set_attribute_modifier_function('src', resolve_href)
    parser.add_rule(startpage_rule)

    startpage_pages_rule = ParserRule()
    startpage_pages_rule.add_activate_rule_level(
        [('div', 'class', 'loop-nav-inner')])
    startpage_pages_rule.add_process_rule_level('a', {'href'})
    startpage_pages_rule.set_attribute_modifier_function('href', resolve_href)
    parser.add_rule(startpage_pages_rule)

    tags_rule = ParserRule()
    tags_rule.add_activate_rule_level([('ul', 'class', 'menu')])
    tags_rule.add_process_rule_level('a', {'href'})
    tags_rule.set_attribute_modifier_function('href', resolve_href)
    parser.add_rule(tags_rule)

    video_rule = ParserRule()
    video_rule.add_activate_rule_level([('div', 'class', 'section-content'),
                                        ('div', 'id', 'video')])
    video_rule.add_process_rule_level('iframe', {'src'})
    video_rule.set_attribute_modifier_function('src', resolve_href)
    parser.add_rule(video_rule)

    gallery_href_rule = ParserRule()
    gallery_href_rule.add_activate_rule_level([('div', 'id', 'extras')])
    gallery_href_rule.add_process_rule_level('a', {'href'})
    gallery_href_rule.set_attribute_modifier_function('href', resolve_href)
    parser.add_rule(gallery_href_rule)

    self.proceed_parcing(parser, fname)

    result = ParseResult()

    if video_rule.is_result():
        urls = UrlList()
        for frame in video_rule.get_result():
            print(frame)
            src = frame['src']
            try:
                if '.video/embed' in src:
                    response = load(URL(src))
                    setup = self.quotes(response.text,
                                        'jwplayer("vplayer").setup(',
                                        ")").replace(' ', '')
                    sources = self.quotes(setup, 'sources:[{',
                                          '}],').split('},{')
                    for source in sources:
                        if '.mp4' in source:
                            media_file = self.quotes(source, 'file:"', '"')
                            label = self.quotes(source, 'label:"', '"')
                            urls.add(label, URL(media_file + '*'))
                elif 'javfinder.com/' in src:
                    response = load(URL(src))
                    # Everything after each '<source src="' up to the
                    # closing '>' of that tag.
                    for tail in response.text.split('<source src="')[1:]:
                        tag = tail.partition('>')[0]
                        if '.mp4' in tag:
                            media_file = tag.partition('"')[0]
                            label = self.quotes(tag, 'res="', '"')
                            urls.add(label, URL(media_file + '*'))
            except LoaderError as err:
                # A frame that fails to load is reported and skipped.
                print(err)

        result.set_video(urls.get_media_data())

        for link in gallery_href_rule.get_result(['data', 'href']):
            result.add_control(
                ControlInfo(link['data'].strip(), URL(link['href'])))
        return result

    if startpage_rule.is_result():
        for thumb in startpage_rule.get_result(['href']):
            result.add_thumb(
                ThumbInfo(thumb_url=URL(thumb['src']),
                          href=URL(thumb['href']),
                          popup=thumb.get('alt', '')))

        for page in startpage_pages_rule.get_result(['href', 'data']):
            result.add_page(ControlInfo(page['data'], URL(page['href'])))

        for tag in tags_rule.get_result(['href', 'data']):
            result.add_control(ControlInfo(tag['data'], URL(tag['href'])))

    return result