def parse_page(urls):
    """Yield one metadata dict per video item ('li.vid') on each listing page.

    For every URL in *urls* the listing is fetched and each gallery item is
    followed to collect breadcrumb-derived site/model/name, srcset image URLs
    and (when a <video> tag is present) its sources and poster.  Yields None
    after a parsing error and once when iteration finishes.
    """
    url = None
    try:
        for url in urls:
            html = utils.get_page(url)
            if not html:
                continue
            a = pq(html)
            # video items on the listing page
            for item in a('ul.gallery-a li.vid').items():
                url = item('a').attr('href')
                discrib = item('img').attr('alt')
                # absolute URLs embedded in the srcset attribute
                # (fixed: '[a-zA-z]' also matched '[\\]^_`'; use a proper class)
                result = re.findall(r'[a-zA-Z]+://[^\s]*',
                                    str(item('img').attr('srcset')))
                b = pq(url)
                # breadcrumbs give: site / model / gallery name
                # (fixed: previously unbound/stale when fewer than 3 crumbs)
                site = model = name = None
                info_string = [it.text() for it in b('#breadcrumbs li').items()]
                if len(info_string) >= 3:
                    site, model, name = info_string[:3]
                video = None
                video_item = b('video')
                if video_item:
                    src = [s.attr('src') for s in video_item('source').items()]
                    video = {'src': src, 'poster': video_item.attr('poster')}
                yield {
                    'site': site,
                    'name': utils.format_name(name),
                    'model': utils.format_name(model),
                    'discrib': utils.format_name(discrib),
                    'small': result[1] if result and len(result) >= 2 else None,
                    'mid': item('img').attr('src'),
                    'large': result[0] if result and len(result) >= 2 else None,
                    'url': url,
                    'image_set': result,
                    'video': video,
                }
    except Exception as e:
        # report which page failed and why, then signal the error downstream
        print('error in parse %s: %s' % (url, e))
        yield None
    yield None
def parse_page(html):
    """Parse a hegregirls listing page into a list of gallery summary dicts.

    Each dict holds the formatted name, absolute gallery URL, board image,
    optional model nick and the stats string.  Entries that fail to parse are
    skipped with a diagnostic print instead of aborting the whole page.
    """
    soup = BeautifulSoup(html, 'lxml')
    images = []
    main_div = soup.find('div', id="block-system-main")
    contents = main_div.find_all('div', class_='node-grid')
    for content in contents:
        try:
            nick_tag = content.select_one('.grid-meta .nick')
            nick = nick_tag.string if nick_tag else None
            # hoist the meta link: it is needed for both name and url
            link = content.find('div', class_='grid-meta').find('a')
            images.append({
                'name': utils.format_name(link.string),
                'url': urljoin('http://hegregirls.com/', link.attrs['href']),
                'board': content.find('img').attrs['src'],
                'nick': nick,
                'stats': content.select_one('.grid-meta .stats').string,
            })
        except Exception as e:
            # fixed: the old handler re-ran the failing lookup chain and
            # could itself raise; just report the actual exception
            print('error in parse: %s' % e)
    return images
def parse_page(html):
    """Parse a hegre listing page into a list of gallery dicts.

    Each dict carries the formatted name, the small (poster) image, the
    mid/large lightbox image URLs (None when absent) and the absolute
    gallery URL.
    """

    def _lightbox_href(tag, lightbox):
        # href of the <a data-lightbox="..."> link inside *tag*, or None
        link = tag.find('a', attrs={'data-lightbox': lightbox})
        return link.attrs['href'] if link else None

    image = []
    soup = BeautifulSoup(html, 'lxml')
    for item in soup.find_all('div', class_="item"):
        poster_image = item.find('div', class_="poster_image")
        if not poster_image:
            # items without a poster carry no usable data
            continue
        name = poster_image.find('img').attrs['alt'].strip()
        small = poster_image.find('img').attrs['src']
        image.append({
            'name': utils.format_name(name),
            'small': small,
            'mid': _lightbox_href(item, "lightbox--poster_image"),
            'large': _lightbox_href(item, "lightbox--board_image"),
            'url': urljoin('http://www.hegre.com/', item.find('a').attrs['href']),
        })
    return image
def parse_page(urls_gen):
    """Consume listing URLs from *urls_gen* and yield one dict per
    collection item: formatted name, item URL, optional flowplayer video
    (src + board image), tiled-gallery preview details and srcset URLs.

    Stops when the generator produces a falsy URL.  Yields None after a
    parsing error and once at exhaustion.
    """
    url = None
    try:
        while True:
            url = next(urls_gen)
            if not url:
                return None
            html = utils.get_page(url)
            a = pq(html)
            for item in a('li.g1-collection-item').items():
                url = item('a[rel=bookmark]').attr('href')
                name = item('a[rel=bookmark]').text()
                # (fixed: '[a-zA-z]' character-class typo; raw string for \s)
                result = re.findall(
                    r'[a-zA-Z]+://[^\s]*',
                    str(item('img.attachment-bimber-grid-standard').attr('srcset')))
                b = pq(url)
                video = None
                player = b('div.flowplayer')
                if player:
                    # flowplayer keeps its sources in a JSON data attribute,
                    # the board image in an inline background-image style
                    src = json.loads(player.attr('data-item')).get('sources')[0].get('src')
                    board = re.search(r'background-image: url\((.*?)\)',
                                      player.attr('style')).group(1)
                    video = {'src': src, 'board': board}
                details = []
                for preview in b('div.tiled-gallery-item a').items():
                    details.append({
                        'large': preview('img').attr('data-large-file'),
                        'mid': preview('img').attr('data-medium-file'),
                        'small': preview('img').attr('src'),
                    })
                yield {
                    'name': utils.format_name(name),
                    'url': url,
                    'video': video,
                    'detail': details,
                    'image_set': result,
                }
    except StopIteration:
        # normal end of the URL generator is not an error
        pass
    except Exception as e:
        print('error in parse %s: %s' % (url, e))
        yield None
    yield None
def parse_page(html):
    """Parse a listing page (pyquery) into a list of gallery dicts with
    name, small/mid/large image URLs, gallery url, video URL and per-image
    preview details.  Returns None when parsing fails.
    """
    image = []
    try:
        a = pq(html)
        for item in a('#content li.box-shadow').items():
            url = item('a').attr('href')
            name = item('a').attr('title')
            mid = item('img').attr('src')
            # (fixed: '[a-zA-z]' character-class typo; raw string for \s)
            result = re.findall(r'[a-zA-Z]+://[^\s]*',
                                str(item('img').attr('srcset')))
            b = pq(url)
            # the download anchor (not <video source>) holds the video URL here
            video = b('video a').attr('href')
            details = []
            for preview in b('.ngg-gallery-thumbnail a').items():
                details.append({
                    'large': preview('a').attr('data-src'),
                    'small': preview('a').attr('data-thumbnail'),
                })
            image.append({
                'name': utils.format_name(name),
                'small': result[1] if result and len(result) >= 2 else None,
                'mid': mid,
                'large': result[2] if result and len(result) >= 3 else None,
                'url': url,
                'video': video,
                'detail': details,
            })
    except Exception as e:
        print('error in parse: %s' % e)
        return None
    return image
def parse_page(html):
    """Collect gallery entries from a hegre listing page.

    Returns a list of dicts: formatted name, small image (img-holder),
    mid/large lightbox URLs (None when the link is missing) and the
    absolute overlay URL.
    """
    soup = BeautifulSoup(html, 'lxml')
    results = []
    for entry in soup.find_all('div', class_="item"):
        cover = entry.find('div', class_='cover-links')
        mid_link = cover.find('a', attrs={'data-lightbox': "lightbox--poster_image"})
        large_link = cover.find('a', attrs={'data-lightbox': "lightbox--board_image"})
        overlay = entry.find('a', class_='open-in-content-overlay')
        results.append({
            'name': utils.format_name(overlay.attrs['title'].strip()),
            'small': entry.find('div', class_='img-holder').find('img').attrs['src'],
            'mid': mid_link.attrs['href'] if mid_link else None,
            'large': large_link.attrs['href'] if large_link else None,
            'url': urljoin('http://www.hegre.com/', overlay.attrs['href']),
        })
    return results
def parse_page(html):
    """Return a list of {name, board, url} dicts — one for every node-grid
    entry on a hegregirls listing page."""
    soup = BeautifulSoup(html, 'lxml')
    entries = []
    for node in soup.find_all('div', class_="node-grid"):
        cover_link = node.select_one('.content .field-type-image a')
        entries.append({
            'name': utils.format_name(node.select_one('.grid-meta h4 a').string),
            'board': node.select_one('.content .field-type-image a img').get('src'),
            'url': urljoin('http://www.hegregirls.com/', cover_link.get('href')),
        })
    return entries
def parse_page(html):
    """Parse a listing page (pyquery) into a list of gallery dicts with
    name, mid image, gallery URL and per-image preview details.
    Returns None when parsing fails.
    """
    image = []
    try:
        a = pq(html)
        for item in a('#content li.box-shadow').items():
            url = item('a').attr('href')
            name = item('a').attr('title')
            mid = item('img').attr('src')
            b = pq(url)
            details = []
            for preview in b('.ngg-gallery-thumbnail a').items():
                details.append({
                    'large': preview('a').attr('data-src'),
                    'small': preview('a').attr('data-thumbnail'),
                })
            image.append({
                'name': utils.format_name(name),
                'mid': mid,
                'url': url,
                'detail': details,
            })
    except Exception as e:
        # fixed: bare except silently hid the actual failure
        print('error in parse: %s' % e)
        return None
    return image
def parse_page(urls_gen):
    """Consume model-listing URLs from *urls_gen* and yield one dict per
    model: a 'brief' section (name, board, url, profile rows, referer) and
    a 'video' section (or None when the model has no videos).

    Stops when the generator produces a falsy URL.  Yields None after a
    parsing error and once at exhaustion.
    """
    url = None
    try:
        while True:
            url = next(urls_gen)
            if not url:
                return None
            html = utils.get_page(url)
            referer = url
            if not html:
                continue
            a = pq(html)
            for item in a('div.modelItem').items():
                url = item('a.thumb').attr('href')
                board = item('img').attr('src')
                name = item('a.title').text()
                b = pq(url, headers=utils.default_headers, timeout=30)
                profile = [p.text() for p in b('div.right li').items()]
                details = None
                videos_info = b('div.content div.item')
                if videos_info:
                    # NOTE: mirrors the original last-wins behaviour — only the
                    # final video of the model ends up in `details`
                    for video_item in videos_info.items():
                        video_url = video_item('a.thumb').attr('href')
                        video_name = video_item('a.thumb').attr('title')
                        # strip the trailing "-<n>.jpg" to rebuild the image set
                        # (fixed: dot was unescaped in the original pattern)
                        video_img = re.search(r'(.*?)-\d\.jpg$',
                                              video_item('img').attr('src'), re.S)
                        c = pq(video_url, headers=utils.default_headers, timeout=30)
                        video_poster = c('video').attr('poster')
                        src = [s.attr('src') for s in c('video source').items()]
                        details = {
                            'url': video_url,
                            'name': utils.format_name(video_name),
                            'image_set': ['%s-%s.jpg' % (video_img.group(1), i)
                                          for i in range(1, 10)],
                            'poster': video_poster,
                            'src': src,
                        }
                yield {
                    'brief': {
                        'name': utils.format_name(name),
                        'board': board,
                        'url': url,
                        'profile': profile,
                        'referer': referer,
                    },
                    'video': details,
                }
    except StopIteration:
        # normal end of the URL generator is not an error
        pass
    except Exception as e:
        print('error in parse %s: %s' % (url, e))
        yield None
    yield None
def parse_page(urls):
    """Yield gallery metadata dicts for every non-video item on each
    listing page in *urls* (items with the 'vid' class are skipped — they
    are handled by the video parser).

    Each dict holds breadcrumb-derived site/model/name, srcset image URLs
    and the per-image preview list.  Yields None after a parsing error and
    once at the end.
    """
    url = None
    try:
        for url in urls:
            html = utils.get_page(url)
            if not html:
                continue
            a = pq(html)
            for item in a('ul.gallery-a li').items():
                if item.hasClass('vid'):
                    continue
                url = item('a').attr('href')
                discrib = item('img').attr('alt')
                # (fixed: '[a-zA-z]' character-class typo; raw string for \s)
                result = re.findall(r'[a-zA-Z]+://[^\s]*',
                                    str(item('img').attr('srcset')))
                b = pq(url)
                # breadcrumbs give: site / model / gallery name
                # (fixed: previously unbound/stale when fewer than 3 crumbs)
                site = model = name = None
                info_string = [it.text() for it in b('#breadcrumbs li').items()]
                if len(info_string) >= 3:
                    site, model, name = info_string[:3]
                details = []
                for preview in b('ul.gallery-b li').items():
                    details.append({
                        'large': preview('a').attr('href'),
                        'small': preview('img').attr('src'),
                    })
                yield {
                    'site': site,
                    'name': utils.format_name(name),
                    'model': utils.format_name(model),
                    'discrib': utils.format_name(discrib),
                    'small': result[1] if result and len(result) >= 2 else None,
                    'mid': item('img').attr('src'),
                    'large': result[0] if result and len(result) >= 2 else None,
                    'url': url,
                    'detail': details,
                    'image_set': result,
                }
    except Exception as e:
        print('error in parse %s: %s' % (url, e))
        yield None
    yield None
def parse_page(url, html):
    """Parse a hegre model page into a single dict.

    Collects the board/poster images, product counts, profile rows and the
    model's galleries/films/massages item lists (each with lightbox image
    URLs).  *url* is stored under the 'url' key.
    """

    def _lightbox_href(tag, lightbox):
        # href of the <a data-lightbox="..."> link inside *tag*, or None
        link = tag.find('a', attrs={'data-lightbox': lightbox})
        return link.attrs['href'] if link else None

    image = {'url': url}
    soup = BeautifulSoup(html, 'lxml')

    # board image + model name
    board_image = soup.find('div', class_="board_image")
    if board_image:
        image['name'] = board_image.find('img').attrs['alt'].strip()
        image['board_image'] = board_image.find('img').attrs['src']

    # poster image
    poster_image = soup.find('div', class_="poster_image")
    if poster_image:
        image['poster_image'] = poster_image.find('img').attrs['src']

    # product counts (links inside the details/counts box)
    details = soup.find('div', class_="details")
    counts = details.find('div', class_="counts")
    image['products'] = [a.get_text().strip() for a in counts.find_all('a')]

    # profile rows
    labels = soup.find('div', class_="labels")
    image['profile'] = [row.get_text().strip().replace('\n', '')
                        for row in labels.find_all('div', class_="row")]

    # galleries
    galleries_dict = []
    galleries = soup.find(id='galleries-wrapper')
    if galleries:
        for item in galleries.find_all('div', class_='item'):
            # "Month Day, Year" -> "Year-Month-Day"
            date_item = (item.find('small').string.replace(' ', '-')
                         .replace(',', '-').replace('--', '-').split('-'))
            date = date_item[2] + '-' + date_item[0] + '-' + date_item[1]
            galleries_dict.append({
                'name': utils.format_name(item.find('img').attrs['alt']),
                'url': urljoin('http://www.hegre.com/', item.find('a').attrs['href']),
                'small': item.find('img').attrs['src'],
                'mid': _lightbox_href(item, "lightbox--poster_image"),
                'large': _lightbox_href(item, "lightbox--board_image"),
                'date': date,
            })
    image['galleries'] = galleries_dict

    # films
    films_dict = []
    films = soup.find(id='films-wrapper')
    if films:
        for item in films.find_all('div', class_='item'):
            films_dict.append({
                'name': utils.format_name(item.find('img').attrs['alt']),
                'url': urljoin('http://www.hegre.com/', item.find('a').attrs['href']),
                'small': item.find('img').attrs['src'],
                'mid': _lightbox_href(item, "lightbox--poster_image"),
                'large': _lightbox_href(item, "lightbox--board_image"),
            })
    image['films'] = films_dict

    # massages — NOTE: intentionally no 'small' key, matching the original
    massages_dict = []
    massages = soup.find(id='massages-wrapper')
    if massages:
        for item in massages.find_all('div', class_='item'):
            massages_dict.append({
                'name': utils.format_name(item.find('img').attrs['alt']),
                'url': urljoin('http://www.hegre.com/', item.find('a').attrs['href']),
                'mid': _lightbox_href(item, "lightbox--poster_image"),
                'large': _lightbox_href(item, "lightbox--board_image"),
            })
    image['massages'] = massages_dict

    return image
def parse_page_detail(html):
    """Parse a hegregirls model detail page into a dict with board/poster
    images, profile rows and per-item galleries/films lists.

    Gallery entries additionally carry a release date and the nested result
    of parse_galleries_detail(); entries that fail to parse are reported and
    skipped.
    """
    image = {}
    soup = BeautifulSoup(html, 'lxml')

    # board image
    board_image = soup.find('div', class_="field-name-model-board")
    if board_image:
        image['board_image'] = board_image.find('img').attrs['src']

    # poster image
    poster_image = soup.find('div', class_="box border")
    if poster_image:
        image['poster_image'] = poster_image.find('img').attrs['src']

    # profile rows live in the same "box border" container
    labels = soup.find('div', class_="box border")
    image['profile'] = [row.get_text().strip().replace('\n', '')
                        for row in labels.find_all('li')]

    galleries_dict = []
    films_dict = []
    for item in soup.select('#main-content .content .content .grid-4'):
        # the 'about' attribute distinguishes galleries from films
        if re.search('galleries', item.attrs['about']):
            date_release = ''.join(item.find(class_="release-date").strings)
            try:
                detail_url = item.select_one('.preview-link a')
                if detail_url:
                    detail_url = detail_url.attrs['href']
                if detail_url:
                    detail_url = urljoin('http://hegregirls.com/', detail_url)
                galleries_dict.append({
                    'name': utils.format_name(item.select_one('.grid-meta a').string),
                    'date': date_release,
                    'url': urljoin(
                        'http://hegregirls.com/',
                        item.select_one('.field-name-coverl a').attrs['href']),
                    'img': item.find('img').attrs['src'],
                    # rel is a list; its first element is the board image URL
                    'board': item.select_one('.field-name-coverl a').get('rel')[0],
                    'detail': parse_galleries_detail(detail_url),
                })
            except Exception as e:
                print('galleries_dict dict error: %s' % e)
        else:
            try:
                films_dict.append({
                    'name': utils.format_name(item.select_one('.grid-meta a').string),
                    'img': item.select_one(
                        '.field-name-movie-cover a img').attrs['src'],
                    # fixed: href is a string, so [0] returned only its first
                    # character; use the whole URL
                    'board': item.select_one('a.hegre-poster-zoom').get('href'),
                    'url': urljoin(
                        'http://hegregirls.com/',
                        item.select_one('.field-name-movie-cover a').attrs['href']),
                })
            except Exception as e:
                print('films_dict dict error: %s' % e)
    image['galleries'] = galleries_dict
    image['films'] = films_dict
    return image
def parse_page(urls_gen):
    """Consume listing URLs from *urls_gen* and yield one gallery metadata
    dict per non-video item preceding the pagination nav.

    Stops when the generator produces a falsy URL.  Yields None after a
    parsing error and once at exhaustion.
    """
    url = None
    try:
        while True:
            url = next(urls_gen)
            if not url:
                return None
            html = utils.get_page(url)
            if not html:
                continue
            a = pq(html)
            # the item list immediately precedes the pagination nav
            for item in a('nav.pagination-a').prev_all('ul li').items():
                if item.hasClass('vid'):
                    continue
                url = item('a').attr('href')
                discrib = item('a').attr('title')
                # (fixed: '[a-zA-z]' character-class typo; raw string for \s)
                result = re.findall(r'[a-zA-Z]+://[^\s]*',
                                    str(item('img').attr('srcset')))
                b = pq(url)
                # breadcrumbs give: site / model / gallery name
                # (fixed: previously unbound/stale when fewer than 3 crumbs)
                site = model = name = None
                info_string = [it.text() for it in b('#breadcrumbs li').items()]
                if len(info_string) >= 3:
                    site, model, name = info_string[:3]
                details = []
                for preview in b('ul.gallery-b li').items():
                    details.append({
                        'large': preview('a').attr('href'),
                        'small': preview('img').attr('src'),
                    })
                yield {
                    'site': site,
                    'name': utils.format_name(name),
                    'model': utils.format_name(model),
                    'discrib': utils.format_name(discrib),
                    'small': result[1] if result and len(result) >= 2 else None,
                    # NOTE(review): anchors carry no 'src' attribute, so this is
                    # always None — original behaviour kept; confirm whether
                    # item('img').attr('src') was intended
                    'mid': item('a').attr('src'),
                    'large': result[0] if result and len(result) >= 2 else None,
                    'url': url,
                    'detail': details,
                    'image_set': result,
                }
    except StopIteration:
        # normal end of the URL generator is not an error
        pass
    except Exception as e:
        print('error in parse %s: %s' % (url, e))
        yield None
    yield None