Exemplo n.º 1
0
def parse_page(urls):
    """Yield one dict per video-gallery item found on each listing page.

    urls: iterable of listing-page URLs (fetched via utils.get_page).
    Yields dicts with site/name/model/discrib/small/mid/large/url/
    image_set/video keys, then a final ``None`` sentinel.  On any error,
    prints the failing URL and yields ``None`` (best-effort contract).
    """
    try:
        for url in urls:
            html = utils.get_page(url)
            if not html:
                continue
            doc = pq(html)
            # Only the video items ('li.vid') on this listing page.
            for item in doc('ul.gallery-a li.vid').items():
                url = item('a').attr('href')
                discrib = item('img').attr('alt')
                # Absolute URLs inside srcset.  FIX: the original class
                # [a-zA-z] also matched the characters [\]^_` .
                result = re.findall(r'[a-zA-Z]+://[^\s]*',
                                    str(item('img').attr('srcset')))

                detail_doc = pq(url)

                # Breadcrumb trail is expected as: site / model / name.
                info_string = [it.text()
                               for it in detail_doc('#breadcrumbs li').items()]
                # FIX: default to None so a short breadcrumb list cannot
                # leave these unbound (NameError) or reuse values from a
                # previous iteration.
                site = model = name = None
                if len(info_string) >= 3:
                    site, model, name = info_string[:3]

                video = None
                video_item = detail_doc('video')
                if video_item:
                    src = [src_item.attr('src')
                           for src_item in video_item('source').items()]
                    video = {
                        'src': src,
                        'poster': video_item.attr('poster')
                    }

                yield {
                    'site': site,
                    'name': utils.format_name(name),
                    'model': utils.format_name(model),
                    'discrib': utils.format_name(discrib),
                    'small':
                    result[1] if result and len(result) >= 2 else None,
                    'mid': item('img').attr('src'),
                    'large':
                    result[0] if result and len(result) >= 2 else None,
                    'url': url,
                    'image_set': result,
                    'video': video
                }
    except Exception:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit
        # are no longer swallowed.
        print('error in parse %s' % url)
        yield None

    yield None
Exemplo n.º 2
0
def parse_page(html):
    """Parse a model-listing page (BeautifulSoup) into a list of dicts.

    html: raw HTML of the listing page.
    Returns a list of dicts with name/url/board/nick/stats keys, one per
    '.node-grid' entry; malformed entries are skipped with a message.
    """
    soup = BeautifulSoup(html, 'lxml')

    images = []

    main_div = soup.find('div', id="block-system-main")
    contents = main_div.find_all('div', class_='node-grid')

    for content in contents:
        try:
            # Hoist the shared lookup used for both name and url.
            meta_link = content.find('div', class_='grid-meta').find('a')
            nick_tag = content.select_one('.grid-meta .nick')
            nick = nick_tag.string if nick_tag else None
            images.append({
                'name': utils.format_name(meta_link.string),
                'url': urljoin('http://hegregirls.com/',
                               meta_link.attrs['href']),
                'board': content.find('img').attrs['src'],
                'nick': nick,
                'stats': content.select_one('.grid-meta .stats').string,
            })
        except Exception as e:
            # FIX: the original handler re-derefed .find('a').string,
            # which can itself raise (the likely reason we got here) and
            # abort the whole page.  Report the exception instead.
            print("error")
            print(e)
    return images
Exemplo n.º 3
0
def parse_page(html):
    """Parse a gallery-listing page (BeautifulSoup) into a list of dicts.

    html: raw HTML of the listing page.
    Returns a list of dicts with name/small/mid/large/url keys, one per
    '.item' block.
    """
    image = []

    soup = BeautifulSoup(html, 'lxml')

    for item in soup.find_all('div', class_="item"):
        # FIX: default name/poster to None — the original left `name`
        # unbound (NameError, or a stale value from the previous
        # iteration) when the poster_image div was missing.
        name = None
        poster_url = None
        poster_image = item.find('div', class_="poster_image")
        if poster_image:
            img = poster_image.find('img')
            name = img.attrs['alt'].strip()
            poster_url = img.attrs['src']

        # Lightbox links give the mid/large image URLs when present.
        mid = item.find('a', attrs={'data-lightbox': "lightbox--poster_image"})
        mid_url = mid.attrs['href'] if mid else None

        large = item.find('a', attrs={'data-lightbox': "lightbox--board_image"})
        large_url = large.attrs['href'] if large else None

        image.append({
            'name': utils.format_name(name),
            'small': poster_url,
            'mid': mid_url,
            'large': large_url,
            'url': urljoin('http://www.hegre.com/',
                           item.find('a').attrs['href']),
        })

    return image
Exemplo n.º 4
0
def parse_page(urls_gen):
    """Yield one dict per collection item on each page produced by *urls_gen*.

    urls_gen: generator of listing-page URLs; a falsy value ends iteration.
    Yields dicts with name/url/video/detail/image_set keys, then a final
    ``None`` sentinel.  On any error, prints the failing URL and yields
    ``None``.
    """
    try:
        while True:
            url = next(urls_gen)
            if not url:
                return
            html = utils.get_page(url)

            doc = pq(html)
            for item in doc('li.g1-collection-item').items():
                url = item('a[rel=bookmark]').attr('href')
                name = item('a[rel=bookmark]').text()
                # FIX: [a-zA-Z] (the original [a-zA-z] also matched
                # [\]^_` ); pattern made a raw string.
                result = re.findall(
                    r'[a-zA-Z]+://[^\s]*',
                    str(item('img.attachment-bimber-grid-standard')
                        .attr('srcset')))

                detail_doc = pq(url)

                video = None
                player = detail_doc('div.flowplayer')
                if player:
                    # Flowplayer embeds its sources as JSON in data-item.
                    src = (json.loads(player.attr('data-item'))
                           .get('sources')[0].get('src'))
                    board = re.search(r'background-image: url\((.*?)\)',
                                      player.attr('style')).group(1)
                    video = {
                        'src': src,
                        'board': board
                    }

                details = [{
                    'large': preview('img').attr('data-large-file'),
                    'mid': preview('img').attr('data-medium-file'),
                    'small': preview('img').attr('src'),
                } for preview in detail_doc('div.tiled-gallery-item a').items()]

                yield {
                    'name': utils.format_name(name),
                    'url': url,
                    'video': video,
                    'detail': details,
                    'image_set': result
                }
    except Exception:
        # Narrowed from a bare except; also catches the RuntimeError a
        # generator raises (PEP 479) when urls_gen is exhausted, matching
        # the original bare-except behavior.
        print('error in parse %s' % url)
        yield None

    yield None
Exemplo n.º 5
0
def parse_page(html):
    """Parse a listing page (pyquery) into a list of gallery dicts.

    html: raw HTML of the listing page.
    Returns a list of dicts with name/small/mid/large/url/video/detail
    keys, or ``None`` if parsing fails anywhere (best-effort contract).
    """
    image = []

    try:
        doc = pq(html)
        for item in doc('#content li.box-shadow').items():
            url = item('a').attr('href')
            name = item('a').attr('title')
            mid = item('img').attr('src')
            # Absolute URLs inside srcset.  FIX: [a-zA-Z] — the original
            # class [a-zA-z] also matched [\]^_` .
            result = re.findall(r'[a-zA-Z]+://[^\s]*',
                                str(item('img').attr('srcset')))

            # Fetch the gallery's own page for video link and previews.
            detail_doc = pq(url)
            video = detail_doc('video a').attr('href')

            details = [{
                'large': preview('a').attr('data-src'),
                'small': preview('a').attr('data-thumbnail'),
            } for preview in detail_doc('.ngg-gallery-thumbnail a').items()]

            image.append({
                'name': utils.format_name(name),
                'small': result[1] if result and len(result) >= 2 else None,
                'mid': mid,
                'large': result[2] if result and len(result) >= 3 else None,
                'url': url,
                'video': video,
                'detail': details
            })
    except Exception:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit
        # are not swallowed.
        print('error in parse')
        return None

    return image
Exemplo n.º 6
0
def parse_page(html):
    """Collect gallery entries from a listing page into a list of dicts.

    html: raw HTML of the listing page.
    Returns a list of dicts with name/small/mid/large/url keys, one per
    '.item' block.
    """
    soup = BeautifulSoup(html, 'lxml')
    results = []

    for node in soup.find_all('div', class_="item"):
        # Thumbnail shown in the grid.
        small = node.find('div', class_='img-holder').find('img').attrs['src']

        # Lightbox links hold the mid/large image URLs when present.
        cover = node.find('div', class_='cover-links')
        poster_link = cover.find(
            'a', attrs={'data-lightbox': "lightbox--poster_image"})
        board_link = node.find('div', class_='cover-links').find(
            'a', attrs={'data-lightbox': "lightbox--board_image"})

        overlay = node.find('a', class_='open-in-content-overlay')
        title = overlay.attrs['title'].strip()

        results.append({
            'name': utils.format_name(title),
            'small': small,
            'mid': poster_link.attrs['href'] if poster_link else None,
            'large': board_link.attrs['href'] if board_link else None,
            'url': urljoin('http://www.hegre.com/', overlay.attrs['href']),
        })

    return results
Exemplo n.º 7
0
def parse_page(html):
    """Extract name/board/url entries from every '.node-grid' block.

    html: raw HTML of the listing page.
    Returns a list of dicts, one per grid entry.
    """
    soup = BeautifulSoup(html, 'lxml')

    entries = []
    for node in soup.find_all('div', class_="node-grid"):
        title_link = node.select_one('.grid-meta h4 a')
        cover_link = node.select_one('.content .field-type-image a')
        entries.append({
            'name': utils.format_name(title_link.string),
            'board':
            node.select_one('.content .field-type-image a img').get('src'),
            'url': urljoin('http://www.hegregirls.com/',
                           cover_link.get('href')),
        })

    return entries
Exemplo n.º 8
0
def parse_page(html):
    """Parse a listing page (pyquery) into gallery dicts with previews.

    html: raw HTML of the listing page.
    Returns a list of dicts with name/mid/url/detail keys, or ``None`` if
    parsing fails anywhere (best-effort contract).
    """
    image = []

    try:
        doc = pq(html)
        for item in doc('#content li.box-shadow').items():
            url = item('a').attr('href')
            name = item('a').attr('title')
            mid = item('img').attr('src')

            # Fetch the gallery's own page for per-preview URLs.
            detail_doc = pq(url)
            details = [{
                'large': preview('a').attr('data-src'),
                'small': preview('a').attr('data-thumbnail'),
            } for preview in detail_doc('.ngg-gallery-thumbnail a').items()]

            image.append({
                'name': utils.format_name(name),
                'mid': mid,
                'url': url,
                'detail': details
            })
    except Exception:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit
        # are not swallowed.
        print('error in parse')
        return None

    return image
Exemplo n.º 9
0
def parse_page(urls_gen):
    """Yield one dict per video found under each model-index page.

    urls_gen: generator of index-page URLs; a falsy value ends iteration.
    For every model thumbnail, fetches the model page and then each of its
    video pages, yielding dicts of the form {'brief': ..., 'video': ...}.
    A final ``None`` sentinel is yielded; on error, prints the failing URL
    and yields ``None``.
    """
    try:
        while True:
            url = next(urls_gen)
            if not url:
                return
            html = utils.get_page(url)
            referer = url
            if not html:
                continue
            doc = pq(html)
            for item in doc('div.modelItem').items():
                url = item('a.thumb').attr('href')
                board = item('img').attr('src')
                name = item('a.title').text()

                model_doc = pq(url, headers=utils.default_headers,
                               timeout=30)
                profile = [p.text()
                           for p in model_doc('div.right li').items()]

                for video_item in model_doc('div.content div.item').items():
                    video_url = video_item('a.thumb').attr('href')
                    video_name = video_item('a.thumb').attr('title')
                    # Thumbnails end in "-<n>.jpg"; capture the stem so the
                    # full preview set can be reconstructed below.
                    video_img = re.search(r'(.*?)-\d.jpg$',
                                          video_item('img').attr('src'),
                                          re.S)

                    video_doc = pq(video_url,
                                   headers=utils.default_headers,
                                   timeout=30)
                    video_poster = video_doc('video').attr('poster')
                    src = [s.attr('src')
                           for s in video_doc('video source').items()]

                    details = {
                        'url': video_url,
                        'name': utils.format_name(video_name),
                        # Previews are numbered 1..9 on the same stem.
                        'image_set': [
                            '%s-%s.jpg' % (video_img.group(1), i)
                            for i in range(1, 10)
                        ],
                        'poster': video_poster,
                        'src': src
                    }

                    yield {
                        'brief': {
                            'name': utils.format_name(name),
                            'board': board,
                            'url': url,
                            'profile': profile,
                            'referer': referer
                        },
                        'video': details,
                    }
    except Exception:
        # Narrowed from a bare except; still catches the RuntimeError a
        # generator raises (PEP 479) when urls_gen is exhausted, matching
        # the original bare-except behavior.
        print('error in parse %s' % url)
        yield None

    yield None
Exemplo n.º 10
0
def parse_page(urls):
    """Yield one dict per non-video gallery on each listing page in *urls*.

    urls: iterable of listing-page URLs (fetched via utils.get_page).
    Items carrying the 'vid' class are skipped (handled by another
    parser).  Yields dicts with site/name/model/discrib/small/mid/large/
    url/detail/image_set keys, then a final ``None`` sentinel.  On error,
    prints the failing URL and yields ``None``.
    """
    try:
        for url in urls:
            html = utils.get_page(url)
            if not html:
                continue
            doc = pq(html)
            for item in doc('ul.gallery-a li').items():
                if item.hasClass('vid'):
                    continue  # video items are handled elsewhere

                url = item('a').attr('href')
                discrib = item('img').attr('alt')
                # FIX: [a-zA-Z] — the original class [a-zA-z] also
                # matched [\]^_` .
                result = re.findall(r'[a-zA-Z]+://[^\s]*',
                                    str(item('img').attr('srcset')))

                detail_doc = pq(url)

                # Breadcrumb trail is expected as: site / model / name.
                info_string = [it.text()
                               for it in detail_doc('#breadcrumbs li').items()]
                # FIX: default to None so short breadcrumbs cannot leave
                # these unbound or reuse a previous iteration's values.
                site = model = name = None
                if len(info_string) >= 3:
                    site, model, name = info_string[:3]

                details = [{
                    'large': preview('a').attr('href'),
                    'small': preview('img').attr('src'),
                } for preview in detail_doc('ul.gallery-b  li').items()]

                yield {
                    'site': site,
                    'name': utils.format_name(name),
                    'model': utils.format_name(model),
                    'discrib': utils.format_name(discrib),
                    'small':
                    result[1] if result and len(result) >= 2 else None,
                    'mid': item('img').attr('src'),
                    'large':
                    result[0] if result and len(result) >= 2 else None,
                    'url': url,
                    'detail': details,
                    'image_set': result,
                }
    except Exception:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit
        # are not swallowed.
        print('error in parse %s' % url)
        yield None

    yield None
Exemplo n.º 11
0
def _lightbox_url(item, lightbox):
    """Return the href of the item's <a data-lightbox="..."> link, or None."""
    link = item.find('a', attrs={'data-lightbox': lightbox})
    return link.attrs['href'] if link else None


def parse_page(url, html):
    """Parse a model detail page into one dict.

    url: the page's own URL (stored under 'url').
    html: raw HTML of the model page.
    Returns a dict with board_image/name/poster_image/products/profile/
    galleries/films/massages keys.
    """
    image = {'url': url}

    soup = BeautifulSoup(html, 'lxml')

    # Board image + model name come from the board_image block.
    # FIX: default name to None — the original raised NameError on
    # image['name'] when the board_image div was missing.
    name = None
    board_image = soup.find('div', class_="board_image")
    if board_image:
        img = board_image.find('img')
        name = img.attrs['alt'].strip()
        board_image = img.attrs['src']

    image['board_image'] = board_image
    image['name'] = name

    # Poster image.
    poster_image = soup.find('div', class_="poster_image")
    if poster_image:
        poster_image = poster_image.find('img').attrs['src']

    image['poster_image'] = poster_image

    # Product counts (e.g. number of galleries/films).
    details = soup.find('div', class_="details")
    counts = details.find('div', class_="counts")
    image['products'] = [a.get_text().strip() for a in counts.find_all('a')]

    # Profile rows (label/value pairs flattened to single strings).
    labels = soup.find('div', class_="labels")
    image['profile'] = [row.get_text().strip().replace('\n', '')
                        for row in labels.find_all('div', class_="row")]

    # Galleries.
    galleries_dict = []
    galleries = soup.find(id='galleries-wrapper')
    if galleries:
        for item in galleries.find_all('div', class_='item'):
            # Normalize "Month Day, Year" into "Year-Month-Day".
            date_item = (item.find('small').string.replace(' ', '-')
                         .replace(',', '-').replace('--', '-').split('-'))
            date = date_item[2] + '-' + date_item[0] + '-' + date_item[1]

            galleries_dict.append({
                'name': utils.format_name(item.find('img').attrs['alt']),
                'url': urljoin('http://www.hegre.com/',
                               item.find('a').attrs['href']),
                'small': item.find('img').attrs['src'],
                'mid': _lightbox_url(item, "lightbox--poster_image"),
                'large': _lightbox_url(item, "lightbox--board_image"),
                'date': date
            })

    image['galleries'] = galleries_dict

    # Films.
    films_dict = []
    films = soup.find(id='films-wrapper')
    if films:
        for item in films.find_all('div', class_='item'):
            films_dict.append({
                'name': utils.format_name(item.find('img').attrs['alt']),
                'url': urljoin('http://www.hegre.com/',
                               item.find('a').attrs['href']),
                'small': item.find('img').attrs['src'],
                'mid': _lightbox_url(item, "lightbox--poster_image"),
                'large': _lightbox_url(item, "lightbox--board_image"),
            })
    image['films'] = films_dict

    # Massages ('small' intentionally omitted, as in the original).
    massages_dict = []
    massages = soup.find(id='massages-wrapper')
    if massages:
        for item in massages.find_all('div', class_='item'):
            massages_dict.append({
                'name': utils.format_name(item.find('img').attrs['alt']),
                'url': urljoin('http://www.hegre.com/',
                               item.find('a').attrs['href']),
                'mid': _lightbox_url(item, "lightbox--poster_image"),
                'large': _lightbox_url(item, "lightbox--board_image"),
            })
    image['massages'] = massages_dict

    return image
Exemplo n.º 12
0
def parse_page_detail(html):
    """Parse a model detail page into board/poster/profile plus gallery
    and film lists.

    html: raw HTML of the model page.
    Returns a dict with board_image/poster_image/profile/galleries/films
    keys.  Relies on module-level parse_galleries_detail().
    """
    image = {}
    soup = BeautifulSoup(html, 'lxml')

    # Board image.
    board_image = soup.find('div', class_="field-name-model-board")
    if board_image:
        board_image = board_image.find('img').attrs['src']

    image['board_image'] = board_image

    # Poster image.
    poster_image = soup.find('div', class_="box border")
    if poster_image:
        poster_image = poster_image.find('img').attrs['src']

    image['poster_image'] = poster_image

    # Profile rows come from the same "box border" block.
    labels = soup.find('div', class_="box border")
    image['profile'] = [row.get_text().strip().replace('\n', '')
                        for row in labels.find_all('li')]

    # Grid entries split into galleries vs films by their 'about' URL.
    galleries_dict = []
    films_dict = []

    for item in soup.select('#main-content .content .content .grid-4'):
        if re.search('galleries', item.attrs['about']):
            date_release = "".join(item.find(class_="release-date").strings)

            try:
                detail_url = item.select_one('.preview-link a')
                if detail_url:
                    detail_url = detail_url.attrs['href']

                if detail_url:
                    detail_url = urljoin('http://hegregirls.com/', detail_url)

                galleries_dict.append({
                    'name':
                    utils.format_name(item.select_one('.grid-meta a').string),
                    'date':
                    date_release,
                    'url':
                    urljoin(
                        'http://hegregirls.com/',
                        item.select_one('.field-name-coverl a').attrs['href']),
                    'img':
                    item.find('img').attrs['src'],
                    # 'rel' is a list-valued attribute; its first entry is
                    # the board-image URL.
                    'board':
                    item.select_one('.field-name-coverl a').get('rel')[0],
                    'detail':
                    parse_galleries_detail(detail_url)
                })
            except Exception:
                # Narrowed from a bare except; skip malformed entries.
                print('galleries_dict dict error')

        else:
            try:
                films_dict.append({
                    'name':
                    utils.format_name(item.select_one('.grid-meta a').string),
                    'img':
                    item.select_one(
                        '.field-name-movie-cover a img').attrs['src'],
                    # FIX: the original took get('href')[0] — the first
                    # *character* of the href string.  'rel' is the
                    # list-valued attribute (as used for galleries above),
                    # so rel[0] is the board-image URL.
                    'board':
                    item.select_one('a.hegre-poster-zoom').get('rel')[0],
                    'url':
                    urljoin(
                        'http://hegregirls.com/',
                        item.select_one(
                            '.field-name-movie-cover a').attrs['href']),
                })

            except Exception:
                print('films_dict dict error')

    image['galleries'] = galleries_dict
    image['films'] = films_dict

    return image
Exemplo n.º 13
0
def parse_page(urls_gen):
    """Yield one dict per non-video gallery on each page from *urls_gen*.

    urls_gen: generator of listing-page URLs; a falsy value ends
    iteration.  Yields dicts with site/name/model/discrib/small/mid/
    large/url/detail/image_set keys, then a final ``None`` sentinel.
    On error, prints the failing URL and yields ``None``.
    """
    try:
        while True:
            url = next(urls_gen)
            if not url:
                return
            html = utils.get_page(url)
            if not html:
                continue
            doc = pq(html)
            # All list items that precede the pagination nav.
            for item in doc('nav.pagination-a').prev_all('ul li').items():
                if item.hasClass('vid'):
                    continue  # video items handled by another parser

                url = item('a').attr('href')
                discrib = item('a').attr('title')
                # FIX: [a-zA-Z] — the original class [a-zA-z] also
                # matched [\]^_` .
                result = re.findall(r'[a-zA-Z]+://[^\s]*',
                                    str(item('img').attr('srcset')))

                detail_doc = pq(url)

                # Breadcrumb trail is expected as: site / model / name.
                info_string = [it.text()
                               for it in detail_doc('#breadcrumbs li').items()]
                # FIX: default to None so short breadcrumbs cannot leave
                # these unbound or reuse a previous iteration's values.
                site = model = name = None
                if len(info_string) >= 3:
                    site, model, name = info_string[:3]

                details = [{
                    'large': preview('a').attr('href'),
                    'small': preview('img').attr('src'),
                } for preview in detail_doc('ul.gallery-b  li').items()]

                yield {
                    'site': site,
                    'name': utils.format_name(name),
                    'model': utils.format_name(model),
                    'discrib': utils.format_name(discrib),
                    'small':
                    result[1] if result and len(result) >= 2 else None,
                    # FIX: the original queried item('a').attr('src');
                    # anchors carry no src (sibling parsers read the <img>).
                    'mid': item('img').attr('src'),
                    'large':
                    result[0] if result and len(result) >= 2 else None,
                    'url': url,
                    'detail': details,
                    'image_set': result,
                }
    except Exception:
        # Narrowed from a bare except; still catches the RuntimeError a
        # generator raises (PEP 479) when urls_gen is exhausted.
        print('error in parse %s' % url)
        yield None

    yield None