class CWebParserSite(CWebParserMultiUrl):
    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        # self.utils.verify = False
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    '''
    parse_page
    @author: chenzf
    '''

    def parse_page(self, url):
        try:
            if url is None:
                yield None
                return
            if self.dbUtils.get_db_url(url):
                # Already parsed; skip.
                yield None
                return
            html = self.utils.get_page_raw(url)
            if html:
                a = pq(html)
                # items
                items = a('li.i_list.list_n2 > a')
                parse_succeed = True
                icount = 0
                for item in items.items():
                    try:
                        data_p = self.common.parse_item(item)
                        data_t = {
                            'name': data_p.get('brief').get('name'),
                            'url': data_p.get('brief').get('url'),
                            'refurl': url
                        }
                        data = dict(data_t, **data_p)
                        yield data
                        icount += 1
                    except Exception:
                        parse_succeed = False
                        continue
                # Mark the page done only if every item parsed.
                if parse_succeed and icount > 0:
                    self.log('parsed url %s' % url)
                    self.dbUtils.put_db_url(url)
            else:
                self.log('request %s error' % url)
        except Exception:
            self.log('error in parse url %s' % url)
            yield None
        yield None
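# All of the site parsers in this file share one generator contract: parse_page
# yields one data dict per scraped item and None as an end-of-stream sentinel.
# A minimal consumer sketch, kept as a comment so the module stays import-safe;
# the constructor arguments and process_data call are illustrative assumptions,
# not the framework's exact API:
#
#     parser = CWebParserSite(savePath='./out/{filePath}', database='spider.db')
#     for data in parser.parse_page('http://example.com/models/1'):
#         if data is None:            # sentinel: page finished or failed
#             break
#         parser.process_data(data)   # assumed downstream handler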
class CWebParserSite(CWebParserMultiUrl):
    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.utils.verify = False
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    '''
    parse_page
    @author: chenzf
    '''

    def parse_page(self, url):
        try:
            if url is None:
                yield None
                return
            if self.dbUtils.get_db_url(url):
                yield None
                return
            html = self.utils.get_page(url)
            if html:
                a = pq(html)
                # items
                items = a('#posts_cont div.home_box')
                parse_succeed = True
                for item in items.items():
                    try:
                        data_p = self.common.parse_item(item('h3 a'))
                        data_p['brief']['board'] = item('a img').attr('src')
                        data_t = {
                            'name': data_p.get('brief').get('name'),
                            'url': data_p.get('brief').get('url'),
                            'refurl': url,
                        }
                        data = dict(data_t, **data_p)
                        yield data
                    except Exception:
                        parse_succeed = False
                        continue
                if parse_succeed:
                    self.log('parsed url %s' % url)
                    self.dbUtils.put_db_url(url)
            else:
                self.log('request %s error' % url)
        except Exception:
            self.log('error in parse url %s' % url)
            yield None
        yield None
class CWebParserSite(CWebParserMultiUrl):
    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    '''
    parse_page
    @author: chenzf
    '''

    def parse_page(self, url):
        try:
            if url is None:
                yield None
                return
            if self.dbUtils.get_db_url(url):
                yield None
                return
            html = self.utils.get_page(url)
            if html:
                a = pq(html)
                items = a('div.gallery-section div.thumb-list.gallery.thumb-list--sidebar div.thumb-list__item.gallery-thumb a')
                parse_succeed = True
                for item in items.items():
                    try:
                        data_p = self.common.parse_item(item)
                        data_t = {
                            'name': "Galleries",
                            'url': data_p.get('brief').get('url'),
                            'refurl': url
                        }
                        data = dict(data_t, **data_p)
                        yield data
                    except Exception:
                        parse_succeed = False
                        continue
                if parse_succeed:
                    self.log('parsed url %s' % url)
                    self.dbUtils.put_db_url(url)
            else:
                self.log('request %s error' % url)
        except Exception:
            self.log('error in parse url %s' % url)
            yield None
        yield None
class CWebParserSite(CWebParserMultiUrl):
    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    '''
    parse_page
    @author: chenzf
    '''

    def parse_page(self, url):
        try:
            if url is None:
                yield None
                return
            if self.dbUtils.get_db_url(url):
                yield None
                return
            html = self.utils.get_page(url)
            if html:
                a = pq(html)
                # items
                items = a('#box_152 > table > tbody > tr td table tr:nth-child(3) td a')
                parse_succeed = True
                for item in items.items():
                    try:
                        data_p = self.common.parse_item(item)
                        data_t = {
                            'name': data_p.get('brief').get('name'),
                            'url': data_p.get('brief').get('url'),
                            'refurl': url
                        }
                        data = dict(data_t, **data_p)
                        yield data
                    except Exception:
                        parse_succeed = False
                        continue
                if parse_succeed:
                    self.log('parsed url %s' % url)
                    self.dbUtils.put_db_url(url)
            else:
                self.log('request %s error' % url)
        except Exception:
            self.log('error in parse url %s' % url)
            yield None
        yield None
class CWebParserHunterSingleUrl(CWebParserSingleUrl):
    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    '''
    parse_page
    @author: chenzf
    '''

    def parse_page(self, url):
        try:
            if not url:
                yield None
                return
            if self.dbUtils.get_db_url(url):
                yield None
                return
            html = self.utils.get_page(url)
            if html:
                a = pq(html)
                # items
                items = a('#content h2').prev_all('ul.gallery-a li')
                parse_succeed = True
                for item in items.items():
                    try:
                        data_p = self.common.parse_item(item)
                        data_t = {
                            'url': data_p.get('brief').get('url'),
                            'refurl': url
                        }
                        data = dict(data_t, **data_p)
                        yield data
                    except Exception:
                        parse_succeed = False
                        continue
                if parse_succeed:
                    self.log('parsed url %s' % url)
                    self.dbUtils.put_db_url(url)
            else:
                self.log('request %s error' % url)
        except Exception:
            self.log('error in parse url %s' % url)
            yield None
        yield None
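# The parsers above and below dedupe work across runs through
# CWebDataDbUtis.get_db_url / put_db_url. The real class is not shown in this
# section; the sketch below is an assumed, minimal stand-in that satisfies the
# call sites (the sqlite3 backing store is a guess — any key-value store with
# "seen?" and "mark seen" semantics would do):

import sqlite3

class CWebDataDbUtis:
    """Assumed URL-dedup store: get_db_url -> already seen?, put_db_url -> mark seen."""

    def __init__(self, database):
        self.conn = sqlite3.connect(database or ':memory:')
        self.conn.execute('CREATE TABLE IF NOT EXISTS urls (url TEXT PRIMARY KEY)')

    def get_db_url(self, url):
        cur = self.conn.execute('SELECT 1 FROM urls WHERE url = ?', (url,))
        return cur.fetchone() is not None

    def put_db_url(self, url):
        self.conn.execute('INSERT OR IGNORE INTO urls (url) VALUES (?)', (url,))
        self.conn.commit()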
class CWebParserHunterSingleUrl(CWebParserSingleUrl):
    def __init__(self, url, savePath, parseOnly):
        # Note: deliberately skips CWebParserSingleUrl.__init__ and calls its base.
        super(CWebParserSingleUrl, self).__init__(url)
        self.savePath = savePath
        self.utils = CWebSpiderUtils(savePath)
        self.common = CWebParserHunterCommon(savePath, parseOnly)
        self.parseOnly = parseOnly

    '''
    parse_page
    @author: chenzf
    '''

    def parse_page(self):
        urlsGen = self.urls_genarator()
        while True:
            try:
                url = next(urlsGen)
                if not url:
                    yield None
                    return
                html = self.utils.get_page(url)
                if html:
                    a = pq(html)
                    # items
                    items = a('ul.gallery-a li')
                    for item in items.items():
                        try:
                            yield self.common.parse_item(item)
                        except Exception:
                            self.log('error in parse item %s' % url)
                            continue
                    self.log('parsed url %s' % url)
                else:
                    self.log('request %s error' % url)
            except Exception:
                self.log('error in parse url %s' % url)
                yield None
        yield None

    '''
    process_data
    @author: chenzf
    '''

    def process_data(self, data):
        if self.parseOnly == 1:
            return
        self.common.process_data(data)
class CWebParserSite(CWebParserSingleUrl):
    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    '''
    parse_page
    @author: chenzf
    '''

    def parse_page(self, url):
        try:
            if not url:
                yield None
                return
            if self.dbUtils.get_db_url(url):
                yield None
                return
            html = self.utils.get_page(url)
            if html:
                a = pq(html)
                # items
                items = a('.products .contain .grid .col-sm-12')
                parse_succeed = True
                for item in items.items():
                    try:
                        data_p = self.common.parse_item(item)
                        data_t = {
                            'name': data_p.get('brief').get('name'),
                            'url': data_p.get('brief').get('url'),
                            # 'board': data_p.get('brief').get('board'),
                            'refurl': url
                        }
                        data = dict(data_t, **data_p)
                        yield data
                    except Exception:
                        parse_succeed = False
                        continue
                if parse_succeed:
                    self.dbUtils.put_db_url(url)
            else:
                self.log('html none in parse url %s' % url)
        except Exception:
            self.log('error in parse url %s' % url)
            yield None
        yield None
class CWebParserSite(CWebParserMultiUrl):
    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    '''
    parse_page
    @author: chenzf
    '''

    def parse_page(self, url):
        try:
            if not url:
                yield None
                return
            if self.dbUtils.get_db_url(url):
                yield None
                return
            html = self.utils.get_page(url)
            if html:
                soup = pq(html)
                items = soup('.listProfiles li')
                for item in items.items():
                    try:
                        # parse_item is itself a generator here; drain it
                        # until it signals completion with a falsy value.
                        data_gen = self.common.parse_item(item)
                        while True:
                            data = next(data_gen)
                            if not data:
                                break
                            yield data
                    except Exception:
                        self.log('error in item in url %s' % url)
                        continue
                self.log('parsed url %s' % url)
                self.dbUtils.put_db_url(url)
            else:
                self.log('request %s error' % url)
        except Exception:
            self.log('error in parse url %s' % url)
            yield None
        yield None
class CWebParserSite(CWebParserMultiUrl):
    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    '''
    parse_page
    @author: chenzf
    '''

    def parse_page(self, url):
        try:
            if not url:
                yield None
                return
            if self.dbUtils.get_db_url(url):
                yield None
                return
            html = self.utils.get_page(url)
            if html:
                a = pq(html)
                # items
                items = a('ul.gallery-d li')
                for item in items.items():
                    data_p = self.common.parse_item(item)
                    data_t = {
                        'name': data_p.get('brief').get('name'),
                        'url': data_p.get('brief').get('url'),
                        'board': data_p.get('brief').get('board'),
                        'refurl': url
                    }
                    data = dict(data_t, **data_p)
                    yield data
                self.log('parsed url %s' % url)
                self.dbUtils.put_db_url(url)
            else:
                self.log('request %s error' % url)
        except Exception:
            self.log('error in parse url %s' % url)
            yield None
        yield None
class CWebParserSite(CWebParserMultiUrl):
    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    '''
    parse_page
    @author: chenzf
    '''

    def parse_page(self, url):
        try:
            if not url:
                yield None
                return
            if self.dbUtils.get_db_url(url):
                yield None
                return
            html = self.utils.get_page(url)
            if html:
                a = pq(html)
                # items
                items = a('li.g1-collection-item')
                for item in items.items():
                    data = self.common.parse_item(item)
                    yield data
                self.log('parsed url %s' % url)
                self.dbUtils.put_db_url(url)
            else:
                self.log('request %s error' % url)
        except Exception:
            self.log('error in parse url %s' % url)
            yield None
        yield None
    def parse_video(self, url):
        videos_dict = []
        browser = CWebSpiderUtils(None)
        browser.init_chrome()
        while True:
            html = self.utils.get_page(url)
            page = pq(html)
            items = page('.listThumbs li')
            for item in items.items():
                video_url = urljoin('http://www.pornvidhub.com/',
                                    item('a.title').attr('href'))
                if video_url:
                    video, still = self.parse_video_detail(video_url, browser)
                else:
                    video = None
                    still = []
                video_item = {
                    'name': self.utils.format_name(item('a.title').attr('title')),
                    'url': video_url,
                    'video': video,
                    'stills': still
                }
                # Stream each item immediately, then keep it for the summary.
                yield [video_item], False
                videos_dict.append(video_item)
            next_btn = page('span.numbers').nextAll('a.nav')
            if next_btn:
                url = urljoin('http://www.pornvidhub.com/', next_btn.attr('href'))
            else:
                break
        browser.close_chrome()
        yield videos_dict, True
        yield None, False
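# parse_video streams ([item], False) per video while paginating, then
# (all_items, True) once, then (None, False). A caller that wants both the
# incremental stream and the final aggregate would consume it like this
# (a sketch; parser, data and handle are illustrative names):
#
#     videos_gen = parser.parse_video(model_url)
#     while True:
#         videos, is_last = next(videos_gen)
#         if not videos:
#             break                       # (None, False): stream exhausted
#         if is_last:
#             data['videos'] = videos     # complete list, e.g. for archival
#             break
#         for v in videos:                # one-element batches while crawling
#             handle(v)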
    def __init__(self, url, savePath):
        super(CWebParserSingleUrl, self).__init__(url)
        self.savePath = savePath
        self.utils = CWebSpiderUtils(savePath)
class CWebParserSite(CWebParserSingleUrl):
    def __init__(self, url, savePath):
        super(CWebParserSingleUrl, self).__init__(url)
        self.savePath = savePath
        self.utils = CWebSpiderUtils(savePath)

    '''
    parse_page
    @author: chenzf
    '''

    def parse_page(self):
        try:
            urlsGen = self.urls_genarator()
            while True:
                try:
                    url = next(urlsGen)
                    if not url:
                        yield None
                        return
                    html = self.utils.get_page(url)
                    if html:
                        data = {}
                        soup = BeautifulSoup(html, 'lxml')
                        step = 1
                        # board_image
                        board_image = soup.find('div', class_="board_image")
                        if board_image:
                            name = board_image.find('img').attrs['alt'].strip()
                            board_image = board_image.find('img').attrs['src']
                        else:
                            name = None
                            board_image = None
                        step = 2
                        data['name'] = name
                        data['url'] = url
                        data['board'] = board_image
                        step = 3
                        # poster_image
                        poster_image = soup.find('div', class_="poster_image")
                        if poster_image:
                            poster_image = poster_image.find('img').attrs['src']
                        else:
                            poster_image = None
                        data['poster'] = poster_image
                        step = 4
                        # profile
                        labels = soup.find('div', class_="labels")
                        rows = labels.find_all('div', class_="row")
                        profile = []
                        for row in rows:
                            profile.append(row.get_text().strip().replace('\n', ''))
                        data['profile'] = profile
                        step = 5
                        # products
                        details = soup.find('div', class_="details")
                        counts = details.find('div', class_="counts")
                        items = counts.find_all('a')
                        products = []
                        for item in items:
                            products.append(item.get_text().strip())
                        data['products'] = products
                        step = 6
                        # detail product
                        data['galleries'] = self.parse_galleries(soup)
                        step = 7
                        data['films'] = self.parse_films(soup)
                        step = 8
                        data['massages'] = self.parse_massages(soup)
                        step = 9
                        yield data
                except Exception:
                    errMsg = 'error in parse %s , step %s' % (url, step)
                    self.log(errMsg)
                    print(errMsg)
        except Exception:
            print('error in parse %s' % url)
            yield None
        yield None

    def parse_galleries(self, soup):
        galleries_dict = []
        galleries = soup.find(id='galleries-wrapper')
        if galleries:
            items = galleries.find_all('div', class_='item')
            for item in items:
                # Normalize "Mon DD, YYYY" into YYYY-MM-DD.
                date_item = item.find('small').string.replace(' ', '-').replace(
                    ',', '-').replace('--', '-').split('-')
                date = date_item[2] + '-' + date_item[0] + '-' + date_item[1]
                mid = item.find('a', attrs={'data-lightbox': "lightbox--poster_image"})
                mid_url = mid.attrs['href'] if mid else None
                large = item.find('a', attrs={'data-lightbox': "lightbox--board_image"})
                large_url = large.attrs['href'] if large else None
                url = urljoin('http://www.hegre.com/', item.find('a').attrs['href'])
                html = self.utils.get_page(url)
                detail_soup = BeautifulSoup(html, 'lxml')
                cover = detail_soup.find(
                    'div', class_='non-members',
                    attrs={"style": re.compile('background-image.*?')})
                if cover:
                    cover = re.search(r'.*?url\((.*?)\)', cover.attrs['style']).group(1)
                else:
                    cover = None
                galleries_dict.append({
                    'name': self.utils.format_name(item.find('img').attrs['alt']),
                    'url': url,
                    'board': [cover, large_url],
                    'poster': [mid_url, item.find('img').attrs['src']],
                    'date': date
                })
        return galleries_dict

    def parse_massages(self, soup):
        massages_dict = []
        massages = soup.find(id='massages-wrapper')
        if massages:
            items = massages.find_all('div', class_='item')
            for item in items:
                mid = item.find('a', attrs={'data-lightbox': "lightbox--poster_image"})
                mid_url = mid.attrs['href'] if mid else None
                large = item.find('a', attrs={'data-lightbox': "lightbox--board_image"})
                large_url = large.attrs['href'] if large else None
                url = urljoin('http://www.hegre.com/', item.find('a').attrs['href'])
                video, date, cover = self.parse_massages_detail(url)
                massages_dict.append({
                    'name': self.utils.format_name(item.find('img').attrs['alt']),
                    'url': url,
                    'board': [cover, large_url, item.find('img').attrs['src']],
                    'poster': [],
                    'video': video,
                    'date': date
                })
        return massages_dict

    def parse_massages_detail(self, url):
        data = {}
        html = self.utils.get_page(url)
        soup = BeautifulSoup(html, 'lxml')
        item = soup.find('div', class_="video-player-wrapper")
        board = None
        if item:
            style_text = item.attrs['style']
            board = re.search(r"url\('(.*?)'\)", style_text, re.S).group(1)
        Full = []
        items = soup.find_all('div', class_="resolution content ")
        for item in items:
            Full.append(item.find('a').attrs['href'])
        data['full'] = Full
        Trailer = []
        items = soup.find_all('div', class_="resolution trailer top-resolution")
        for item in items:
            Trailer.append(item.find('a').attrs['href'])
        data['trailer'] = Trailer
        item = soup.find('div', class_='video-stills')
        Stills = []
        if item:
            stills = item.find_all('div', class_="img-holder")
            for still in stills:
                small = still.find('img')
                small_url = small.attrs['src'] if small else None
                large = still.find('a')
                large_url = large.attrs['href'] if large else None
                Stills.append([large_url, small_url])
        data['stills'] = Stills
        date = soup.find('span', class_="date").string
        return data, date, board

    def parse_films(self, soup):
        films_dict = []
        films = soup.find(id='films-wrapper')
        if films:
            items = films.find_all('div', class_='item')
            for item in items:
                mid = item.find('a', attrs={'data-lightbox': "lightbox--poster_image"})
                mid_url = mid.attrs['href'] if mid else None
                large = item.find('a', attrs={'data-lightbox': "lightbox--board_image"})
                large_url = large.attrs['href'] if large else None
                url = urljoin('http://www.hegre.com/', item.find('a').attrs['href'])
                video, date, cover = self.parse_films_detail(url)
                films_dict.append({
                    'name': self.utils.format_name(item.find('img').attrs['alt']),
                    'url': url,
                    'board': [cover, large_url],
                    'poster': [mid_url, item.find('img').attrs['src']],
                    'video': video,
                    'date': date
                })
        return films_dict

    def parse_films_detail(self, url):
        data = {}
        html = self.utils.get_page(url)
        soup = BeautifulSoup(html, 'lxml')
        board = None
        item = soup.find('div', class_="video-player-wrapper")
        if item:
            style_text = item.attrs['style']
            board = re.search(r"url\('(.*?)'\)", style_text, re.S).group(1)
        if not board:
            # Fall back to the non-member overlay image.
            item = soup.find('div', class_="content-overlay-wrapper")
            if item:
                style_text = item.select_one('div[class="non-members"]').attrs['style']
                board = re.search(r"url\((.*?)\)", style_text, re.S).group(1)
        Full = []
        items = soup.find_all('div', class_="resolution content ")
        for item in items:
            Full.append(item.find('a').attrs['href'])
        data['full'] = Full
        Trailer = []
        items = soup.find_all('div', class_="resolution trailer top-resolution")
        for item in items:
            Trailer.append(item.find('a').attrs['href'])
        data['trailer'] = Trailer
        item = soup.find('div', class_='video-stills')
        Stills = []
        if item:
            stills = item.find_all('div', class_="img-holder")
            for still in stills:
                small = still.find('img')
                small_url = small.attrs['src'] if small else None
                large = still.find('a')
                large_url = large.attrs['href'] if large else None
                Stills.append([large_url, small_url])
        data['stills'] = Stills
        date = soup.find('span', class_="date").string
        return data, date, board

    '''
    process_data
    @author: chenzf
    '''

    # @vthread.pool(8)
    def process_data(self, data):
        # print(data)
        dir_name = self.savePath.format(filePath=data.get('name'))
        if not os.path.exists(dir_name):
            os.makedirs(dir_name)
        with open(dir_name + '\\info.json', 'w') as f:
            json.dump(data, f)
        board = data.get('board')
        if board:
            self.utils.download_file(board, '%s\\%s' % (data.get('name'), 'board'))
        poster = data.get('poster')
        if poster:
            self.utils.download_file(
                poster, '%s\\%s' % (data.get('name'), data.get('name')))
        self.process_galleries(data)
        self.process_massages(data)
        self.process_films(data)

    def process_galleries(self, data):
        galleries = data.get('galleries')
        modelName = data.get('name')
        for item in galleries:
            # Download the first non-empty board and poster candidate.
            boards = item.get('board')
            for board in boards:
                if board:
                    self.utils.download_file(
                        board, '%s\\%s\\%s\\%s' % (modelName, 'galleries', item.get('name'), 'board'))
                    break
            posters = item.get('poster')
            for poster in posters:
                if poster:
                    self.utils.download_file(
                        poster, '%s\\%s\\%s\\%s' % (modelName, 'galleries', item.get('name'), 'poster'))
                    break

    def process_massages(self, data):
        massages = data.get('massages')
        modelName = data.get('name')
        for item in massages:
            boards = item.get('board')
            for board in boards:
                if board:
                    self.utils.download_file(
                        board, '%s\\%s\\%s\\%s' % (modelName, 'massages', item.get('name'), 'board'))
                    break
            posters = item.get('poster')
            for poster in posters:
                if poster:
                    self.utils.download_file(
                        poster, '%s\\%s\\%s\\%s' % (modelName, 'massages', item.get('name'), 'poster'))
                    break
            video = item.get('video')
            stills = video.get('stills')
            for i, val in enumerate(stills, start=1):
                for subVal in val:
                    if subVal:
                        self.utils.download_file(
                            subVal, '%s\\%s\\%s\\%s' % (modelName, 'massages', item.get('name'), str(i)))
                        break
            trailers = video.get('trailer')
            for trailer in trailers:
                if trailer:
                    self.utils.download_file(
                        trailer, '%s\\%s\\%s\\%s' % (modelName, 'massages', item.get('name'), item.get('name')))
                    break

    def process_films(self, data):
        films = data.get('films')
        modelName = data.get('name')
        for item in films:
            boards = item.get('board')
            for board in boards:
                if board:
                    self.utils.download_file(
                        board, '%s\\%s\\%s\\%s' % (modelName, 'films', item.get('name'), 'board'))
                    break
            posters = item.get('poster')
            for poster in posters:
                if poster:
                    self.utils.download_file(
                        poster, '%s\\%s\\%s\\%s' % (modelName, 'films', item.get('name'), 'poster'))
                    break
            video = item.get('video')
            stills = video.get('stills')
            for i, val in enumerate(stills, start=1):
                for subVal in val:
                    if subVal:
                        self.utils.download_file(
                            subVal, '%s\\%s\\%s\\%s' % (modelName, 'films', item.get('name'), str(i)))
                        break
            trailers = video.get('trailer')
            for trailer in trailers:
                if trailer:
                    self.utils.download_file(
                        trailer, '%s\\%s\\%s\\%s' % (modelName, 'films', item.get('name'), item.get('name')))
                    break

    '''
    urls_genarator
    @author: chenzf
    '''

    def urls_genarator(self):
        html = self.utils.get_page(self.url)
        soup = BeautifulSoup(html, 'lxml')
        item_div = soup.find_all('div', class_="item")
        for item in item_div:
            url = urljoin('http://www.hegre.com/',
                          item.find('a', class_='artwork').attrs['href'].strip())
            yield url
        yield None
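# The download paths above are joined with hard-coded backslashes, so they only
# come out right on Windows. A portable equivalent for one of those format
# strings would be (a sketch, assuming download_file accepts any relative path):
#
#     import os
#     rel_path = os.path.join(modelName, 'films', item.get('name'), 'board')
#     self.utils.download_file(board, rel_path)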
class CWebParserSite(CWebParserSingleUrl):
    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    '''
    parse_page
    @author: chenzf
    '''

    def parse_page(self, url):
        try:
            if not url:
                yield None
                return
            html = self.utils.get_page(url)
            if html:
                a = pq(html)
                # items
                items = a('div.ts-responsive-wrap div.tshowcase-inner-box div.tshowcase-box-photo > a')
                for item in items.items():
                    modelurl = item.attr('href')
                    modelsearch = modelurl
                    name = item('img').attr('title')
                    board = item('img').attr('src')
                    try:
                        # Walk the model's pages via the rel=next link; pages
                        # already in the database are skipped but pagination
                        # still advances.
                        while modelsearch is not None:
                            html = self.utils.get_page(modelsearch)
                            if html:
                                b = pq(html)
                                if not self.dbUtils.get_db_url(modelsearch):
                                    products = b('div.home_box > a')
                                    for product in products.items():
                                        data_p = self.common.parse_item(product)
                                        data_t = {
                                            'name': self.utils.format_name(name),
                                            'url': modelurl,
                                            'board': board,
                                            'refurl': modelurl
                                        }
                                        data = dict(data_t, **data_p)
                                        yield data
                                    self.dbUtils.put_db_url(modelsearch)
                                nexturl = b('link[rel=next]')
                                if nexturl:
                                    modelsearch = nexturl.attr('href')
                                else:
                                    modelsearch = None
                            else:
                                modelsearch = None
                    except Exception:
                        continue
                self.log('parsed url %s' % url)
            else:
                self.log('request %s error' % url)
        except Exception:
            self.log('error in parse url %s' % url)
            yield None
        yield None
class CWebParserSite(CWebParserMultiUrl):
    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    '''
    parse_page
    @author: chenzf
    '''

    def parse_page(self, url):
        try:
            if url is None:
                yield None
                return
            html = self.utils.get_page(url)
            if html:
                if not self.dbUtils.get_db_url(url):
                    a = pq(html)
                    items = a('body > div.main-wrap > div.best-list-block.hide-on-search > div.width-wrap > div.thumb-container div.pornstar-thumb-container div.pornstar-thumb-container__info div.pornstar-thumb-container__info-title a')
                    for item in items.items():
                        model_url_origin = item.attr('href')
                        name = item.text()
                        # Fast-forward past model pages already in the database,
                        # then back up one page so its "next" link can be followed.
                        index = 1
                        while True:
                            model_url = "%s/%s" % (model_url_origin, index)
                            if index == 1:
                                if self.dbUtils.get_db_url(model_url_origin):
                                    index += 1
                                    continue
                            elif self.dbUtils.get_db_url(model_url):
                                index += 1
                                continue
                            break
                        if index > 2:
                            index -= 1
                            model_url = "%s/%s" % (model_url_origin, index)
                        else:
                            model_url = model_url_origin
                        while True:
                            self.log('request %s' % model_url)
                            html2 = self.utils.get_page(model_url)
                            if html2:
                                if not self.dbUtils.get_db_url(model_url):
                                    board = pq(html2)('div.pornstar-logo img').attr('src')
                                    data_ps, parse_res = self.parse_sub_page(html2)
                                    for data_p in data_ps:
                                        data_t = {
                                            'name': name,
                                            'url': model_url,
                                            'board': board,
                                            'refurl': url
                                        }
                                        data = dict(data_t, **data_p)
                                        yield data
                                    if parse_res:
                                        self.log('parsed url %s' % model_url)
                                        self.dbUtils.put_db_url(model_url)
                                next_url = pq(html2)('li.next a').attr("href")
                                if next_url:
                                    model_url = next_url
                                else:
                                    break
                            else:
                                break
            else:
                self.log('request %s error' % url)
        except Exception:
            self.log('error in parse url %s' % url)
            yield None
        yield None

    def parse_sub_page(self, html):
        b = pq(html)
        items = b('body > div.main-wrap > main > div > article > div.index-videos.mixed-section > div.thumb-list.thumb-list--sidebar.thumb-list--recent > div.thumb-list__item.video-thumb a.video-thumb-info__name')
        sub_datas = []
        parse_succeeded = None
        for item in items.items():
            try:
                data_p = self.common.parse_item(item)
                sub_datas.append(data_p)
                if parse_succeeded is None:
                    # First success initializes the flag; a failure stays sticky.
                    parse_succeeded = True
            except Exception:
                parse_succeeded = False
        return sub_datas, parse_succeeded
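# The index loop in parse_page above fast-forwards past model pages already
# recorded in the database, then re-enters at the last seen page, whose "next"
# link points at the first unvisited one. Distilled into a standalone helper
# (a sketch; the helper name and page_fmt default are illustrative, matching
# this site's /2, /3, ... scheme):

def resume_url(dbUtils, origin, page_fmt='%s/%s'):
    index = 1
    while True:
        candidate = origin if index == 1 else page_fmt % (origin, index)
        if dbUtils.get_db_url(candidate):
            index += 1
            continue
        break
    if index > 2:
        # Back up one page so its "next" link can be followed.
        return page_fmt % (origin, index - 1)
    return origin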
class CWebParserSite(CWebParserMultiUrl):
    def __init__(self, url, start, end, savePath):
        super(CWebParserMultiUrl, self).__init__(url, start, end)
        self.savePath = savePath
        self.utils = CWebSpiderUtils(savePath)

    '''
    parse_page
    @author: chenzf
    '''

    def parse_page(self):
        urlsGen = self.urls_genarator()
        while True:
            try:
                url = next(urlsGen)
                if not url:
                    return None
                html = self.utils.get_page(url)
                if html:
                    soup = pq(html)
                    contents = soup('.listProfiles li')
                    for content in contents.items():
                        data = {}
                        try:
                            step = 1
                            name = None
                            modelurl = None
                            board = None
                            if content('a.thumb'):
                                modelurl = urljoin('http://www.pornvidhub.com/',
                                                   content('a.thumb').attr('href'))
                                name = content('a.title').text()
                                board = content('img').attr('src')
                            data['name'] = name
                            data['url'] = modelurl
                            data['board'] = board
                            step = 2
                            if modelurl:
                                videos_all = self.parse_video(modelurl)
                                step = 3
                                while True:
                                    video_one, isLast = next(videos_all)
                                    if not video_one:
                                        step = 4
                                        break
                                    if isLast:
                                        # Final aggregate: persist, don't re-yield.
                                        data['videos'] = video_one
                                        self.save_info(data)
                                        step = 5
                                    else:
                                        data_temp = deepcopy(data)
                                        data_temp['videos'] = video_one
                                        yield data_temp
                                        step = 6
                            else:
                                data['videos'] = []
                                yield data
                                step = 7
                        except Exception:
                            errMsg = 'error in parse %s , step %s' % (modelurl, step)
                            print(errMsg)
                            self.log(errMsg)
            except Exception:
                print('error in parse %s' % url)
                yield None
        yield None

    def parse_video(self, url):
        videos_dict = []
        browser = CWebSpiderUtils(None)
        browser.init_chrome()
        while True:
            html = self.utils.get_page(url)
            page = pq(html)
            items = page('.listThumbs li')
            for item in items.items():
                video_url = urljoin('http://www.pornvidhub.com/',
                                    item('a.title').attr('href'))
                if video_url:
                    video, still = self.parse_video_detail(video_url, browser)
                else:
                    video = None
                    still = []
                video_item = {
                    'name': self.utils.format_name(item('a.title').attr('title')),
                    'url': video_url,
                    'video': video,
                    'stills': still
                }
                yield [video_item], False
                videos_dict.append(video_item)
            next_btn = page('span.numbers').nextAll('a.nav')
            if next_btn:
                url = urljoin('http://www.pornvidhub.com/', next_btn.attr('href'))
            else:
                break
        browser.close_chrome()
        yield videos_dict, True
        yield None, False

    def parse_video_detail(self, url, browser):
        html = browser.get_chrome(url, '#player #html5')
        if not html:
            return None, []
        soup = BeautifulSoup(html, 'lxml')
        Stills = []
        stills = soup.find(id='tabPhotos').find_all('img')
        for still in stills:
            small_url = still.attrs['src']
            # Strip the WxH thumbnail segment to recover the full-size URL.
            thumb = re.search(r'.*?/(\d+x\d+/).*?', small_url, re.S)
            if thumb:
                large_url = small_url.replace(thumb.group(1), "")
            else:
                large_url = None
            Stills.append([large_url, small_url])
        soup = pq(html)
        video = None
        video_text = soup('#player #html5')
        if video_text:
            video = video_text.attr('src')
        return video, Stills

    '''
    process_data
    @author: chenzf
    '''

    # @vthread.pool(8)
    def process_data(self, data):
        # self.save_info(data)
        # print(data.get('videos')[0].get('name'))
        board = data.get('board')
        if board:
            self.utils.download_file(
                board, '%s\\%s' % (data.get('name'), data.get('name')))
        self.process_videos(data)

    def process_videos(self, data):
        videos = data.get('videos')
        modelName = data.get('name')
        for item in videos:
            stills = item.get('stills')
            for i, val in enumerate(stills, start=1):
                for subVal in val:
                    if subVal:
                        self.utils.download_file(
                            subVal, '%s\\%s\\%s' % (modelName, item.get('name'), str(i)))
                        break
            video = item.get('video')
            if video:
                self.utils.download_file(
                    video,
                    '%s\\%s\\%s' % (modelName, item.get('name'), item.get('name')),
                    headers={'Referer': data.get('url')})
                break  # stops after the first downloadable video
class CWebParserSite(CWebParserSingleUrl):
    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    '''
    parse_page
    @author: chenzf
    '''

    def parse_page(self, url):
        try:
            if url is None:
                yield None
                return
            html = self.utils.get_page(url)
            if html:
                a = pq(html, parser='html')
                # items
                items = a('a.artwork')
                for item in items.items():
                    modelurl = urljoin('http://www.hegre.com/', item.attr('href').strip())
                    board = item('img').attr('src')
                    name = item.attr('title')
                    if self.dbUtils.get_db_url(modelurl):
                        continue
                    parse_succeed = True
                    html2 = self.utils.get_page(modelurl)
                    if html2:
                        b = pq(html2, parser='html')
                        # Films and massages share the same item layout.
                        for selector in ('#films-wrapper div.item',
                                         '#massages-wrapper div.item'):
                            for item_model in b(selector).items():
                                try:
                                    data_p = self.common.parse_item(item_model)
                                    data_t = {
                                        'name': self.utils.format_name(name),
                                        'url': modelurl,
                                        'board': board,
                                        'refurl': modelurl
                                    }
                                    data = dict(data_t, **data_p)
                                    yield data
                                except Exception:
                                    parse_succeed = False
                                    continue
                        self.log('parsed url %s' % modelurl)
                        if parse_succeed:
                            self.dbUtils.put_db_url(modelurl)
                self.log('parsed url %s' % url)
                self.dbUtils.put_db_url(url)
            else:
                self.log('request %s error' % url)
        except Exception:
            self.log('error in parse url %s' % url)
            yield None
        yield None
class CWebParserSite(CWebParserMultiUrl):
    def __init__(self, url, start, end, savePath):
        super(CWebParserMultiUrl, self).__init__(url, start, end)
        self.savePath = savePath
        self.utils = CWebSpiderUtils(savePath)

    '''
    parse_page
    @author: chenzf
    '''

    def parse_page(self):
        try:
            urlsGen = self.urls_genarator()
            while True:
                url = next(urlsGen)
                if not url:
                    yield None
                    return
                html = self.utils.get_page(url)
                if html:
                    soup = BeautifulSoup(html, 'lxml')
                    items = soup.find_all('div', class_="item")
                    for item in items:
                        posterImg = item.find('div', class_='img-holder').find('img').attrs['src']
                        mid = item.find('div', class_='cover-links').find(
                            'a', attrs={'data-lightbox': "lightbox--posterImg"})
                        midUrl = mid.attrs['href'] if mid else None
                        large = item.find('div', class_='cover-links').find(
                            'a', attrs={'data-lightbox': "lightbox--board_image"})
                        largeUrl = large.attrs['href'] if large else None
                        name = item.find('a', class_='open-in-content-overlay').attrs['title'].strip()
                        url = urljoin('http://www.hegre.com/',
                                      item.find('a', class_='open-in-content-overlay').attrs['href'])
                        data = {
                            'name': self.utils.format_name(name),
                            'small': posterImg,
                            'mid': midUrl,
                            'large': largeUrl,
                            'url': url,
                            'detail': self.process_data_detail(url)
                        }
                        yield data
        except Exception:
            print('error in parse %s' % url)
            yield None
        yield None

    '''
    parse_page_detail
    @author: chenzf
    '''

    def parse_page_detail(self, html):
        data = {}
        soup = BeautifulSoup(html, 'lxml')
        board = None
        item = soup.find('div', class_="content-overlay-wrapper")
        if item:
            style_text = item.select_one('div[class="non-members"]').attrs['style']
            board = re.search(r"url\((.*?)\)", style_text, re.S).group(1)
        data['board'] = board
        DownLoad = []
        items = soup.find_all('div', class_="gallery-zips")
        for item in items:
            DownLoad.append(item.find('a').attrs['href'])
        data['download'] = DownLoad
        data['date'] = soup.find('span', class_="date").string
        return data

    '''
    process_data_detail
    @author: chenzf
    '''

    def process_data_detail(self, url):
        detail = None
        html = self.utils.get_page(url)
        if html:
            detail = self.parse_page_detail(html)
        return detail

    '''
    process_data
    @author: chenzf
    '''

    # @vthread.pool(8)
    def process_data(self, data):
        dir_name = self.savePath.format(filePath=data.get('name'))
        if not os.path.exists(dir_name):
            os.makedirs(dir_name)
        with open(dir_name + '\\info.json', 'w') as f:
            json.dump(data, f)
        # Prefer the largest available image.
        for subkeys in ['large', 'mid', 'small']:
            url = data.get(subkeys)
            if url:
                self.utils.download_file(
                    url, '%s\\%s' % (data.get('name'), data.get('name')))
                break
        detail = data.get('detail')
        board = detail.get('board')
        if board:
            self.utils.download_file(board, '%s\\%s' % (data.get('name'), 'cover'))
        elif data.get('mid'):
            board = data.get('mid')
            self.utils.download_file(board, '%s\\%s' % (data.get('name'), 'cover'))
class CWebParserSite(CWebParserSingleUrl):
    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    '''
    parse_page
    @author: chenzf
    '''

    def parse_page(self, url):
        try:
            if url is None:
                yield None
                return
            html = self.utils.get_page(url, headers={
                "Host": "godsartnudes.com",
                "Upgrade-Insecure-Requests": "1"
            })
            if html:
                a = pq(html)
                # items
                items = a('div.row.gan-central div.col-xxs-12.col-xs-6.col-sm-4.col-md-3 div.Thumb a:last-of-type')
                for item in items.items():
                    name = item.text()
                    # board = item('a img').attr('lsrc') + '.jpg'
                    model_url = urljoin('http://godsartnudes.com', item.attr('href'))
                    if self.dbUtils.get_db_url(model_url):
                        continue
                    html2 = self.utils.get_page(model_url)
                    if html2:
                        b = pq(html2)
                        modelitems = b('div.row.spacetop div.col-xxs-12.col-xs-6.col-sm-4.col-md-3 div.thumbImage > a:first-child')
                        parse_succeed = True
                        processNum = 0
                        for modelitem in modelitems.items():
                            try:
                                data_p = self.common.parse_item(modelitem)
                                data_t = {
                                    'name': name,
                                    'url': model_url,
                                    'refurl': url
                                }
                                data = dict(data_t, **data_p)
                                yield data
                                processNum += 1
                            except Exception:
                                parse_succeed = False
                                continue
                        if parse_succeed and processNum > 0:
                            self.log('parsed url %s' % model_url)
                            self.dbUtils.put_db_url(model_url)
            else:
                self.log('request %s error' % url)
        except Exception:
            self.log('error in parse url %s' % url)
            yield None
        yield None

    def urls_genarator(self):
        # One listing page per letter, A through Z.
        for code in range(ord("A"), ord("Z") + 1):
            yield self.url.format(page=chr(code))
        yield None
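# urls_genarator above expects self.url to carry a {page} placeholder and walks
# it A through Z, yielding one listing URL per letter and then the None
# sentinel. For example (the template URL is illustrative):
#
#     url_template = 'http://godsartnudes.com/models-{page}.html'
#     # yields ...-A.html, ...-B.html, ..., ...-Z.html, then None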
class CWebParserSite(CWebParserMultiUrl):
    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    '''
    parse_page
    @author: chenzf
    '''

    def parse_page(self, url):
        try:
            if url is None:
                yield None
                return
            if self.dbUtils.get_db_url(url):
                yield None
                return
            html = self.utils.get_page(url)
            if html:
                a = pq(html, parser='html')
                # items
                items = a('#block-system-main .node-grid')
                for item in items.items():
                    board = item('div.field-type-image img').attr('src')
                    name = item('.grid-meta a').text()
                    modelurl = urljoin('http://www.hegregirls.com/',
                                       item('.grid-meta a').attr('href'))
                    html2 = self.utils.get_page(modelurl)
                    if html2:
                        b = pq(html2, parser='html')
                        items_model = b('#main-content .content .content .grid-4')
                        for item_model in items_model.items():
                            try:
                                # Only gallery nodes are wanted here.
                                if not re.search('galleries', item_model.attr('about')):
                                    continue
                                data_p = self.common.parse_item(item_model)
                                data_t = {
                                    'name': name,
                                    'url': modelurl,
                                    'board': board,
                                    'refurl': url
                                }
                                data = dict(data_t, **data_p)
                                yield data
                            except Exception:
                                continue
                self.log('parsed url %s' % url)
                self.dbUtils.put_db_url(url)
            else:
                self.log('request %s error' % url)
        except Exception:
            self.log('error in parse url %s' % url)
            yield None
        yield None
class CWebParserSite(CWebParserSingleUrl):
    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    '''
    parse_page
    @author: chenzf
    '''

    def parse_page(self, url):
        try:
            if not url:
                yield None
                return
            if self.dbUtils.get_db_url(url):
                yield None
                return
            html = self.utils.get_page(url)
            if html:
                # The model name is the second-to-last path segment.
                name = url.split('/')[-2]
                a = pq(html)
                # items
                items = a('#content div.wrap.wrap2 div.thumbs a')
                parse_succeed = True
                for item in items.items():
                    try:
                        data_p = self.common.parse_item(item)
                        data_t = {
                            'name': name,
                            'url': data_p.get('brief').get('url'),
                            'refurl': url
                        }
                        data = dict(data_t, **data_p)
                        yield data
                    except Exception:
                        parse_succeed = False
                        continue
                if parse_succeed:
                    self.log('parsed url %s' % url)
                    self.dbUtils.put_db_url(url)
            else:
                self.log('request %s error' % url)
        except Exception:
            self.log('error in parse url %s' % url)
            yield None
        yield None

    def urls_genarator(self):
        html = self.utils.get_page(self.url)
        if html:
            a = pq(html)
            models = a('ul.bottomLists2 ul li a')
            for model in models.items():
                yield urljoin('http://xnudegirls.com/', model.attr('href'))
        yield None
class CWebParserSite(CWebParserSingleUrl):
    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))
        self.utils.verify = False

    '''
    parse_page
    @author: chenzf
    '''

    def parse_page(self, url):
        try:
            if url is None:
                yield None
                return
            html = self.utils.get_page(url)
            if html:
                a = pq(html)
                # items
                items = a('#models tr td a')
                for item in items.items():
                    name = item.attr('title')
                    model_url = urljoin('https://www.babesmachine.com', item.attr('href'))
                    if self.dbUtils.get_db_url(model_url):
                        # Already parsed; skip this model rather than ending the page.
                        continue
                    html2 = self.utils.get_page(model_url)
                    if html2:
                        b = pq(html2)
                        modelitems = b('#posts tr td a')
                        parse_succeed = True
                        for modelitem in modelitems.items():
                            try:
                                data_p = self.common.parse_item(modelitem)
                                data_t = {
                                    'name': name,
                                    'url': model_url,
                                    'refurl': url
                                }
                                data = dict(data_t, **data_p)
                                yield data
                            except Exception:
                                parse_succeed = False
                                continue
                        if parse_succeed:
                            self.log('parsed url %s' % model_url)
                            self.dbUtils.put_db_url(model_url)
                    else:
                        self.log('request %s error' % model_url)
                        continue
            else:
                self.log('request %s error' % url)
        except Exception:
            self.log('error in parse url %s' % url)
            yield None
        yield None
class CWebParserHunterDb(CWebParser):
    def __init__(self, url, savePath, start, len, parseOnly):
        # super(CWebParserSingleUrl, self).__init__(url)
        self.savePath = savePath
        self.utils = CWebSpiderUtils(savePath)
        self.common = CWebParserHunterCommon(savePath, parseOnly)
        self.parseOnly = parseOnly
        self.start = start
        self.len = len

    '''
    parse_page
    @author: chenzf
    '''

    def parse_page(self):
        db_count = 0
        try:
            for item in self.common.get_db_item(self.start, self.len):
                try:
                    url = item.get('url')
                    board = item.get('board')
                    discrib = item.get('discrib')
                    # pq(url) fetches and parses the page directly.
                    b = pq(url)
                    art_site_info = b('#breadcrumbs li')
                    info_string = []
                    for it in art_site_info.items():
                        info_string.append(it.text())
                    if len(info_string) >= 3:
                        site, model, name = info_string[0], info_string[1], info_string[2]
                    video = None
                    stills = []
                    video_item = b('video')
                    if video_item:
                        src = []
                        for src_item in video_item('source').items():
                            src.append(src_item.attr('src'))
                        video = {
                            'src': src,
                            'poster': video_item.attr('poster')
                        }
                    else:
                        previews = b('ul.gallery-b li')
                        for preview in previews.items():
                            stills.append([preview('a').attr('href'),
                                           preview('img').attr('src')])
                    data = {
                        'site': site,
                        'name': self.utils.format_name(name),
                        'model': self.utils.format_name(model),
                        'discrib': self.utils.format_name(discrib),
                        'board': board,
                        'url': url,
                        'stills': stills,
                        'video': video
                    }
                    db_count += 1
                    print('current db index %s' % db_count)
                    yield data
                except Exception:
                    self.log('error in parse item %s' % url)
                    continue
        except Exception:
            self.log('error in parse url %s' % url)
            yield None
        yield None

    '''
    process_data
    @author: chenzf
    '''

    def process_data(self, data):
        self.common.process_data(data)
    def __init__(self, url, savePath, parseOnly):
        super(CWebParserSingleUrl, self).__init__(url)
        self.savePath = savePath
        self.utils = CWebSpiderUtils(savePath)
        self.common = CWebParserHunterCommon(savePath, parseOnly)
        self.parseOnly = parseOnly
    def __init__(self, url, start, end, savePath):
        super(CWebParserMultiUrl, self).__init__(url, start, end)
        self.savePath = savePath
        self.utils = CWebSpiderUtils(savePath)
class CWebParserSite(CWebParserSingleUrl):
    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    '''
    parse_page
    @author: chenzf
    '''

    def parse_page(self, url):
        try:
            if not url:
                yield None
                return
            html = self.utils.get_page(url, headers={
                "Accept-Encoding": "",
            })
            if html:
                a = pq(html)
                # items
                items = a('ul.links li.hideli')
                for item in items.items():
                    modelurl = item('a').attr('href')
                    name = item('a').attr('title')
                    if self.dbUtils.get_db_url(modelurl):
                        continue
                    html = self.utils.get_page(modelurl, headers={
                        "Accept-Encoding": "",
                    })
                    if html:
                        b = pq(html)
                        products = b('li.box-shadow')
                        try:
                            for product in products.items():
                                data_p = self.common.parse_item(product)
                                data_t = {
                                    'name': name,
                                    'url': modelurl,
                                    'refurl': modelurl
                                }
                                data = dict(data_t, **data_p)
                                yield data
                        except Exception:
                            continue
                        self.dbUtils.put_db_url(modelurl)
                self.log('parsed url %s' % url)
            else:
                self.log('request %s error' % url)
        except Exception:
            self.log('error in parse url %s' % url)
            yield None
class CWebParserSite(CWebParserMultiUrl):
    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    '''
    parse_page
    @author: chenzf
    '''

    def parse_page(self, url):
        try:
            if url is None:
                yield None
                return
            html = self.utils.get_page(url)
            if html:
                if not self.dbUtils.get_db_url(url):
                    a = pq(html)
                    # items
                    items = a('#content > ul li.pornstars a')
                    for item in items.items():
                        name = item('a img').attr('alt')
                        board = item('a img').attr('src')
                        model_url_origin = urljoin('https://www.thumbzilla.com/',
                                                   item.attr('href'))
                        # Fast-forward past pages already in the database,
                        # then back up one page so its "next" link can be followed.
                        index = 1
                        while True:
                            model_url = "%s?page=%s" % (model_url_origin, index)
                            if index == 1:
                                if self.dbUtils.get_db_url(model_url_origin):
                                    index += 1
                                    continue
                            elif self.dbUtils.get_db_url(model_url):
                                index += 1
                                continue
                            break
                        if index > 2:
                            index -= 1
                            model_url = "%s?page=%s" % (model_url_origin, index)
                        else:
                            model_url = model_url_origin
                        while True:
                            self.log('request %s' % model_url)
                            html2 = self.utils.get_page(model_url)
                            if html2:
                                if not self.dbUtils.get_db_url(model_url):
                                    data_ps, parse_res = self.parse_sub_page(html2)
                                    for data_p in data_ps:
                                        data_t = {
                                            'name': name,
                                            'url': model_url,
                                            'board': board,
                                            'refurl': url
                                        }
                                        data = dict(data_t, **data_p)
                                        yield data
                                    if parse_res:
                                        self.log('parsed url %s' % model_url)
                                        self.dbUtils.put_db_url(model_url)
                                next_url = pq(html2)('li.page_next a')
                                if next_url:
                                    model_url = urljoin('https://www.thumbzilla.com',
                                                        next_url.attr('href'))
                                else:
                                    break
                            else:
                                break
            else:
                self.log('request %s error' % url)
        except Exception:
            self.log('error in parse url %s' % url)
            yield None
        yield None

    def parse_sub_page(self, html):
        b = pq(html)
        items = b('#content ul li:gt(4) a')
        sub_datas = []
        parse_succeeded = None
        for item in items.items():
            try:
                data_p = self.common.parse_item(item)
                sub_datas.append(data_p)
                if parse_succeeded is None:
                    # First success initializes the flag; a failure stays sticky.
                    parse_succeeded = True
            except Exception:
                parse_succeeded = False
        return sub_datas, parse_succeeded
class CWebParserSite(CWebParserSingleUrl):
    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    '''
    parse_page
    @author: chenzf
    '''

    def parse_page(self, url):
        try:
            if url is None:
                yield None
                return
            while True:
                html = self.utils.get_page(url)
                if html:
                    a = pq(html)
                    if not self.dbUtils.get_db_url(url):
                        # items
                        items = a('ul.set.sluts_main li')
                        parse_succeed = True
                        for item in items.items():
                            try:
                                name = item('b a').text()
                                board = item('a img').attr('lsrc') + '.jpg'
                                model_url = urljoin('https://www.hqsluts.com/',
                                                    item('b a').attr('href'))
                                html2 = self.utils.get_page(model_url)
                                if html2:
                                    b = pq(html2)
                                    modelitems = b('ul.set.s**t li')
                                    for modelitem in modelitems.items():
                                        try:
                                            data_p = self.common.parse_item(modelitem)
                                            data_t = {
                                                'name': self.utils.format_name(name),
                                                'url': model_url,
                                                'board': board,
                                                'refurl': url
                                            }
                                            data = dict(data_t, **data_p)
                                            yield data
                                        except Exception:
                                            parse_succeed = False
                                            continue
                            except Exception:
                                parse_succeed = False
                                continue
                        if parse_succeed:
                            self.log('parsed url %s' % url)
                            self.dbUtils.put_db_url(url)
                    # Follow the "Next Page" link until it disappears.
                    next_url = a('#pages li a[count="Next Page"]')
                    if next_url:
                        url = urljoin('https://www.hqsluts.com/', next_url.attr('href'))
                        self.log('request %s' % url)
                    else:
                        break
                else:
                    self.log('request %s error' % url)
                    continue  # retry the same page
        except Exception:
            self.log('error in parse url %s' % url)
            yield None
        yield None
    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))
class CWebParserSite(CWebParserSingleUrl):
    def __init__(self, **kwArgs):
        super().__init__(**kwArgs)
        self.utils = CWebSpiderUtils(self.savePath)
        self.common = CWebParserSiteCommon(self)
        self.dbUtils = CWebDataDbUtis(kwArgs.get('database'))

    '''
    parse_page
    @author: chenzf
    '''

    def parse_page(self, url):
        try:
            if not url:
                yield None
                return
            if self.dbUtils.get_db_url(url):
                yield None
                return
            # Take the segment after the second-to-last '/' as the category name.
            end_pos = url.rfind('/') - 1
            start_pos = url.rfind('/', 0, end_pos)
            name = url[start_pos + 1:]
            data_total = 1
            html = self.utils.get_page(url)
            if html:
                a = pq(html)
                # The number of async blocks comes from the "load more" button.
                data_total = a('button.js-load-more').attr('data-total')
                if not data_total:
                    data_total = 1
                if int(data_total) > 0:
                    for page in range(1, int(data_total) + 1):
                        try:
                            cate_url = '%s?mode=async&action=get_block&block_id=list_albums_common_albums_list&from=%s' % (url, page)
                            if self.dbUtils.get_db_url(cate_url):
                                continue
                            html = self.utils.get_page(cate_url)
                            if html:
                                b = pq(html)
                                items = b('div.masonry_item >a')
                                for item in items.items():
                                    data_p = self.common.parse_item(item)
                                    data_t = {
                                        'name': name,
                                        'url': data_p.get('brief').get('url'),
                                        'refurl': cate_url
                                    }
                                    data = dict(data_t, **data_p)
                                    yield data
                                self.dbUtils.put_db_url(cate_url)
                        except Exception:
                            continue
                self.log('parsed url %s' % url)
                self.dbUtils.put_db_url(url)
            else:
                self.log('request %s error' % url)
        except Exception:
            self.log('error in parse url %s' % url)
            yield None
        yield None

    '''
    urls_genarator
    @author: chenzf
    '''

    def urls_genarator(self):
        html = self.utils.get_page(self.url)
        if html:
            a = pq(html)
            categorys = a('div.masonry_item a')
            for category in categorys.items():
                yield category.attr('href'), category.attr('title')
        yield None
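# The block count above comes from the "load more" button's data-total
# attribute, and each block is fetched through the site's async endpoint. With
# an illustrative category URL and data-total == 3, the generated block URLs
# would be:
#
#     http://example.com/albums/cat/?mode=async&action=get_block&block_id=list_albums_common_albums_list&from=1
#     ...&from=2
#     ...&from=3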