def fetchAlbum(url):
    pq = helper.get(url)
    dirName = os.path.join('.', 'aiss', pq('title').text())
    helper.mkDir(dirName)
    index = 1
    for img in pq('.message > img'):
        helper.downloadImg(img.get('src'), os.path.join(dirName, '%03d.jpg' % index))
        index += 1
def fetchAlbum(url, dirName):
    if 'rosi' in url:
        pq = helper.get(url)
        dirName = os.path.join(dirName, pq('#post-title').text().split('No.')[1])
        helper.mkDir(dirName)
        for a in pq('.gallery-icon > a'):
            imgUrl = a.get('href')
            helper.downloadImg(imgUrl, os.path.join(dirName, imgUrl.split('/')[-1]))
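# Hypothetical usage of the two fetchAlbum variants above (they come from two
# separate scraper scripts; the URLs below are placeholders, not real album pages):
#
#   fetchAlbum('http://example.com/aiss-album-123')           # first variant: saves under ./aiss/<title>
#   fetchAlbum('http://example.com/rosi-no.456/', 'rosi')     # second variant: saves under <dirName>/<number>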
def fetchPage(page):
    url = '%s/page/%d/' % (BASE_URL, page)
    pq = helper.get(url)
    for a in pq('a.disp_a'):
        title = a.get('title').replace('Permalink to ', '')
        url = a.get('href')
        dirName = os.path.join('cartoon', title)
        # only fetch galleries that have not yet produced a url.txt marker
        if not os.path.exists(os.path.join(dirName, 'url.txt')):
            helper.mkDir(dirName)
            if not fetchGallery(url, title, page):
                return False
    return True
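# Hypothetical driver: walk listing pages until fetchGallery reports a failure
# (assumes the usual `if __name__` entry point for this script):
#
#   page = 1
#   while fetchPage(page):
#       page += 1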
def fetchComic(webPage, comicIndex, url, page=1, comicDir=None):
    pq = helper.get('%s/%d' % (url, page))
    if page == 1:
        title = pq('title').text().replace('/', '&').split(' - ')[0]
        comicDir = os.path.join('animezilla', title)
        if os.path.exists(os.path.join('animezilla', '0uploaded', title, 'done.txt')):
            return True
        if os.path.exists(os.path.join(comicDir, 'done.txt')):
            return True
        helper.mkDir(comicDir)
    # resume hack for one specific comic: jump straight to page 90
    if webPage == 1 and comicIndex == 16:
        if page < 90:
            return fetchComic(webPage, comicIndex, url, 90, comicDir)
    img = pq('img#comic')
    print('[%s] downloading webPage page => %d, comic index => %d, comic page => %d' % (helper.now(), webPage, comicIndex, page))
    downloadImg(img.attr('src'), os.path.join(comicDir, '%03d.jpg' % page), url)
    time.sleep(3)
    # when no <a> wraps the comic image, this is the last page
    if len(img.parents('a')) == 0:
        helper.writeFile('done', os.path.join(comicDir, 'done.txt'))
        return True
    return fetchComic(webPage, comicIndex, url, page + 1, comicDir)
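# fetchComic recurses once per comic page, so a very long comic can run into
# CPython's default recursion limit (about 1000 frames). A minimal iterative
# sketch of the same page walk, assuming the same helper/downloadImg functions:
def fetchComicPages(url, comicDir, startPage=1):
    page = startPage
    while True:
        pq = helper.get('%s/%d' % (url, page))
        img = pq('img#comic')
        downloadImg(img.attr('src'), os.path.join(comicDir, '%03d.jpg' % page), url)
        time.sleep(3)
        if len(img.parents('a')) == 0:  # last page: no next-page link wraps the image
            helper.writeFile('done', os.path.join(comicDir, 'done.txt'))
            return
        page += 1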
def fetch_model(url, name, head_img):
    '''fetch model'''
    model_dir = os.path.join('vivthomas', 'model')
    helper.mkDir(model_dir)
    helper.mkDir(os.path.join('vivthomas', 'photo'))
    # download the headshot first
    helper.downloadImg(head_img, os.path.join(model_dir, '%s.jpg' % name))
    if os.path.exists(os.path.join('vivthomas', 'model', '%s.json' % name)):
        return
    # then scrape the detailed data
    model_info = {
        'name': name,
        'photos': []
    }
    pyquery = helper.get(url)
    country_span = pyquery('.custom-country')
    model_info['country'] = country_span.text() if country_span else 'unknown'
    # collect the photo data
    custom_content_list = pyquery('.custom-content-list')
    custom_content = None
    for item in custom_content_list:
        if item.getchildren()[0].getchildren()[0].text.startswith('Photos with'):
            custom_content = item
            break
    if custom_content is None:
        helper.writeFile(json.dumps(model_info),
                         os.path.join('vivthomas', 'model', '%s.json' % name))
        return
    list_group_item_list = custom_content.getchildren()[2].findall('li')
    for list_group_item in list_group_item_list:
        custom_list_item_detailed = list_group_item.getchildren()[1]
        img = custom_list_item_detailed.getchildren()[0].getchildren()[0].getchildren()[0]
        photo_name = img.get('alt')
        safe_name = photo_name.replace('/', ' ')
        # e.g. "Released: Feb 26, 2016" -> "2016-2-26"
        date_str = custom_list_item_detailed.getchildren()[1].getchildren()[1].text_content().split(': ')[1]
        date_str = '%s-%d-%s' % (date_str.split(', ')[1],
                                 helper.getMonth(date_str.split(' ')[0]),
                                 date_str.split(' ')[1].replace(',', ''))
        # model names (skip the first child, which appears to be a label)
        arr = custom_list_item_detailed.getchildren()[1].getchildren()[2].getchildren()
        model_name_arr = []
        for i in xrange(1, len(arr)):
            model_name_arr.append(arr[i].text)
        # download the photo-set cover
        photo_path = os.path.join('vivthomas', 'photo',
                                  '%s_%s' % (date_str, safe_name),
                                  '%s_%s.jpg' % (date_str, safe_name))
        helper.downloadImg(img.get('src'), photo_path)
        photo_json = {
            'date': date_str,
            'name': photo_name,
            'model': model_name_arr
        }
        photo_json_str = json.dumps(photo_json)
        model_info['photos'].append(photo_json)
        # use the sanitized name here too, so the json lands next to the cover
        helper.writeFile(photo_json_str,
                         os.path.join('vivthomas', 'photo',
                                      '%s_%s' % (date_str, safe_name),
                                      '%s_%s.json' % (date_str, safe_name)))
    helper.writeFile(json.dumps(model_info),
                     os.path.join('vivthomas', 'model', '%s.json' % name))
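# The date munging above turns a string like "Feb 26, 2016" into "2016-2-26".
# A standalone sketch of the same conversion (assumes helper.getMonth maps
# "Feb" -> 2):
def released_to_date_str(released):
    month_name, day, year = released.replace(',', '').split(' ')
    return '%s-%d-%s' % (year, helper.getMonth(month_name), day)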
    except Exception as e:
        print(e)
        return
    # stream the response to disk in 1 KB chunks; `r` is the requests response
    # opened in the (truncated) try block above this excerpt
    with open(imgPath, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)
                f.flush()


if __name__ == '__main__':
    baseUrl = 'http://www.hhssee.com/manhua31358.html'
    pq = helper.get(baseUrl)
    comicName = pq('h1').text()
    comicDir = os.path.join('hhssee', comicName)
    helper.mkDir(comicDir)
    bookUrlArr = []
    for a in pq('a.l_s'):
        bookUrlArr.append({
            'url': 'http://www.hhssee.com%s' % a.get('href'),
            'name': a.text
        })
    bookIndex = 0
    for bookData in bookUrlArr:
        bookIndex += 1
        # if bookIndex < 5:
        #     continue
        bookName = bookData.get('name')
        if '卷' not in bookName:  # '卷' means "volume"
import helper
import re, os, calendar, datetime

# http://madamevoyeur.com/index.php?p=1&a=2010
BASE_URL = 'http://madamevoyeur.com'

if __name__ == '__main__':
    dirName = 'madameVoyeur'
    helper.mkDir(dirName)
    t = datetime.datetime.now()
    nowYear = int(t.year)
    year = 2017
    while year <= nowYear:
        # one page per day of the year
        for pageIndex in range(1, 367 if calendar.isleap(year) else 366):
            print('%s => year: %d page: %d' % (helper.now(), year, pageIndex))
            # e.g. http://madamevoyeur.com/images/2016/16035.jpg
            for idx, suffix in enumerate(('', '-2', '-3'), start=1):
                helper.downloadImg(
                    '%s/images/%d%s/%02d%03d.jpg' % (BASE_URL, year, suffix,
                                                     0 if year == 2010 else year - 2000, pageIndex),
                    os.path.join(dirName, '%d_%03d_%d.jpg' % (year, pageIndex, idx)))
        year += 1
    # year 2017 page 24
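# The remote naming scheme the loop above targets looks like
# images/<year>[-N]/<yy><ddd>.jpg, where <yy> is the two-digit year ('00' for
# 2010) and <ddd> the day-of-year page. A hypothetical helper that builds one
# such URL:
def image_url(year, page_index, variant=''):
    yy = 0 if year == 2010 else year - 2000
    return '%s/images/%d%s/%02d%03d.jpg' % (BASE_URL, year, variant, yy, page_index)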
def fetchGallery(url, page):
    print('now page %d' % page)
    pq = helper.get(url)
    # e.g. "SexArt – Alexis Crystal & Michael Fly – Call | AdultPhotoSets.ru"
    title = pq('title').text()
    title = title.split(' | ')[0]
    # skip albums already sorted into any of the processed folders
    for parts in (('0uploaded', title), ('0uploaded', '0baidu', title),
                  ('0uploaded', 'MetArt', title), ('0nas', 'MetArt', title),
                  ('0nas', 'MetArtX', title), ('0error', title)):
        dirName = filterDirName(os.path.join('imgs', *parts))
        if os.path.exists(dirName):
            print('exists!!! skip!')
            return True
    # create the local directory
    dirName = os.path.join('imgs', title)
    dirName = filterDirName(dirName)
    # if url.txt exists, this album has already been fetched; just return
    if os.path.exists('%s/url.txt' % dirName):
        print('exists!!! skip!')
        return True
    helper.mkDir(dirName)
    i = 0
    tag = None
    imgUrl = []
    aArr = pq('a.externalLink')
    if not aArr or len(aArr) < 1:
        aArr = pq('div.content>p>a')
    if not aArr or len(aArr) < 1:
        aArr = pq('div.content>a')
    if not aArr or len(aArr) < 1:
        # e.g. http://imgtrex.com/8kbfdzphqsr1/daniela-dressed-for-sex-02-10000px
        arr = re.compile(r'http://imgtrex\.com/\w+/[a-z0-9-]+\.jpg').findall(pq.html())
        if len(arr) == 0:
            print('can\'t find any <a>')
            # known-bad albums that will never resolve; treat them as done
            if url == 'http://adultphotosets.ru/met-art-lupita-gifera/':
                return True
            if url == 'http://adultphotosets.ru/rylskyart-oretha-mars-second-2-mars/':
                return True
            if url == 'http://adultphotosets.ru/met-art-nikolina-deirth/':
                return True
            return False
        aArr = [{'href': a} for a in arr]
    if aArr and len(aArr) > 0:
        if 'imgchili.net' in aArr[0].get('href'):
            imgArr = pq('div.content>p>a>img')
            if not imgArr or len(imgArr) < 1:
                imgArr = pq('div.content>a>img')
            # thumbnail host looks like http://t10.imgchili...; extract the server tag
            if imgArr and len(imgArr) > 0:
                tag = imgArr[0].get('src').replace('http://', '').split('.imgchili')[0].replace('t', '')
        for a in aArr:
            print('%s image index => %d' % (helper.now(), i))
            url = fetchLargeImageUrl(a.get('href'), tag)
            if url is None:
                if i == 0:
                    print('fetchLargeImageUrl failed')
                    return True
            elif url != '':
                imgUrl.append(url)
            i += 1
    if len(imgUrl) > 0:
        helper.writeFile('\n'.join(imgUrl), '%s/url.txt' % dirName)
    return True
def fetchModel(url=None, headUrl=None, name='Abril C', score=8.97):
    if url is None:
        url = 'https://www.eternaldesire.com/model/abril-c/'
    if headUrl is None:
        headUrl = 'https://static.eternaldesire.com/media/headshots/abril-c.jpg?fv=e6f189022422389d377149f795d1da13'
    modelPath = os.path.join('eternaldesire', 'models', name)
    helper.mkDir(modelPath)
    helper.downloadImg(headUrl, os.path.join(modelPath, '%s_EternalDesire.jpg' % name))
    modelInfo = {
        'name': name,
        'score': score,
        'Age first shot': 0,
        'Eye color': '',
        'Hair color': '',
        'Breasts': '',
        'Shaved': '',
        'Measurements': '',
        'Height': '',
        'Weight': '',
        'Country': '',
        'Ethnicity': '',
        'photos': []
    }
    pq = helper.get(url, None, None, 1)
    infoLiArr = pq('.model_info > ul > li')
    for li in infoLiArr:
        # each <li> reads like "Eye color: blue"; match it against the keys above
        arr = li.text_content().split(': ')
        for key in modelInfo:
            if key == arr[0]:
                modelInfo[key] = arr[1]
                break
    photoIndex = 1
    while photoIndex < 100:
        photoDivArr = pq('#latest_photo_update_%d .update_cell' % photoIndex)
        liArr = pq('#latest_photo_update_%d .hover_container_stats li' % photoIndex)
        if not liArr or len(liArr) == 0:
            break
        for i in xrange(0, len(photoDivArr)):
            photoInfo = {'name': '', 'date': '0.0.1970', 'model': ''}
            img = photoDivArr[i].find('a').find('img')
            coverUrl = img.get('src')
            photoInfo['name'] = img.get('alt')
            # the stats list appears to hold three <li> per cell; the first two
            # are the publish date and the featured models
            photoInfo['date'] = liArr[i * 3].text_content().replace('Date published:', '')
            photoInfo['model'] = liArr[i * 3 + 1].text_content().replace('Featuring: ', '')
            jsonStr = json.dumps(photoInfo)
            photoPath = os.path.join('eternaldesire', 'photos',
                                     photoInfo.get('name') + '-' + photoInfo['model'])
            helper.mkDir(photoPath)
            helper.writeFile(jsonStr, os.path.join(photoPath, 'info.json'))
            helper.downloadImg(coverUrl,
                               os.path.join(photoPath, '%s_cover_EternalDesire.jpg' % photoInfo.get('name')))
            modelInfo['photos'].append(photoInfo)
        photoIndex += 1
    helper.writeFile(json.dumps(modelInfo), os.path.join(modelPath, 'info.json'))
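# Hypothetical driver for fetchModel; these arguments simply repeat the
# defaults baked into the signature above, for illustration only:
if __name__ == '__main__':
    fetchModel(url='https://www.eternaldesire.com/model/abril-c/',
               headUrl='https://static.eternaldesire.com/media/headshots/abril-c.jpg?fv=e6f189022422389d377149f795d1da13',
               name='Abril C', score=8.97)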
import os
import argparse
import helper
import eastbay
import footlocker
import jimmyjazz
import flightclub
import stockx
import goat
import nike
import kickz
import sys

# WEB_DOMAIN = ['footlocker', 'jimmyjazz', 'sneakersnstuff', 'footaction']
WEB_DOMAIN = ['finishline', 'champssports', 'stadiumgoods', 'flightclub', 'eastbay',
              'stockx', 'goat', 'kickz', 'footlocker', 'nike', 'jimmyjazz']

if __name__ == '__main__':
    for dir_name in WEB_DOMAIN:
        helper.mkDir(os.path.join('.', 'imgs', dir_name))
    parser = argparse.ArgumentParser()
    parser.add_argument("target", help=','.join(WEB_DOMAIN))
    parser.add_argument("action", help='common or hot')
    options = parser.parse_args()
    target = options.target
    action = options.action
    if target not in WEB_DOMAIN:
        print('legal target: [%s] ' % ', '.join(WEB_DOMAIN))
    else:
        if target == 'eastbay':
            eastbay.start(action)
        elif target == 'footlocker':
            footlocker.start(action)
        elif target == 'jimmyjazz':
            jimmyjazz.start(action)
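# The if/elif chain only covers a few targets; a sketch of the same dispatch
# via a module table (assumes every listed scraper module exposes
# start(action), as eastbay and footlocker do above):
SCRAPERS = {
    'eastbay': eastbay,
    'footlocker': footlocker,
    'jimmyjazz': jimmyjazz,
}

def dispatch(target, action):
    module = SCRAPERS.get(target)
    if module is None:
        print('legal target: [%s]' % ', '.join(WEB_DOMAIN))
    else:
        module.start(action)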
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# generate thumbnails for upload to the QQ group files
import Image, os, helper

dir_name = r'G:\adultphotosets\imgs\EternalDesire – Abril C – Turgues'.decode('utf-8')


def scale(imgPath):
    print('try to open file: %s' % imgPath)
    im = Image.open(imgPath)
    # read the image size
    w, h = im.size
    im.thumbnail((200, int(h * 200 / w)))
    # save the scaled image as jpeg, into an "s" subfolder next to the original
    arr = imgPath.split('\\')
    arr.insert(-1, 's')
    im.save('\\'.join(arr), 'jpeg')
    print('saved => %s' % '\\'.join(arr))


if __name__ == '__main__':
    helper.mkDir(os.path.join(dir_name, 's'))
    for parent, dirnames, filenames in os.walk(dir_name):
        for filename in filenames:
            if not parent.endswith('\\s'):
                if filename.endswith('jpg'):
                    scale(os.path.join(parent, filename))
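# The backslash surgery above ties the script to Windows paths. A portable
# sketch of the same "save into an s/ sibling folder" logic using os.path:
def thumb_path(img_path):
    parent, filename = os.path.split(img_path)
    return os.path.join(parent, 's', filename)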
import json
import datetime
import helper

modelArr = []
albumArr = []

# def findModel(id):
#     global modelArr
#     for model in modelArr:
#         if model.get('id') == id:
#             return model

if __name__ == '__main__':
    model_dir = '/Users/eddie104/Documents/hongjie/photosets/femjoy/model'
    helper.mkDir(model_dir)
    page = 1
    while True:
        url = 'https://www.femjoy.com/api/v2/actors?sorting=date&thumb_size=355x475&limit=48&page=%d' % page
        txt = helper.get(url, returnText=True)
        jsonData = json.loads(txt)
        total_pages = jsonData.get('pagination').get('total_pages')
        for modelData in jsonData.get('results'):
            model = {
                'id': modelData.get('id'),
                'slug': modelData.get('slug'),
                'name': modelData.get('name'),