def fetchLargeImageUrl(imgUrl, tag):
    if not imgUrl.endswith('zip'):
        if 'imagehosting.pro' in imgUrl or 'gif-jpg.com' in imgUrl or 'ipics.info' in imgUrl:
            pq = helper.get(imgUrl)
            img = pq('img.centred')
            url = img.attr('src')
            if not url:
                img = pq('img.centred_resized')
                url = img.attr('src')
            if url is None:
                return ''
            return url
        elif 'img.yt' in imgUrl or 'imgcandy.net' in imgUrl:
            if 'img.yt' in imgUrl:
                imgUrl = imgUrl.replace('http:', 'https:')
            pq = helper.post(imgUrl, 2)
            img = pq('img.centred')
            return img.attr('src')
        elif 'imgchili.net' in imgUrl:
            # global imgchili_cookies
            # pq = helper.get(imgUrl, imgchili_cookies)
            # img = pq('img#show_image')
            # url = img.attr('src')
            # return url
            # http://imgchili.net/show/102747/102747596__sexart_raddia_cover.jpg
            # http://i11.imgchili.net/102747/102747596__sexart_raddia_cover.jpg
            url = imgUrl.replace('//imgchili', '//i%s.imgchili' % tag).replace('show/', '')
            return url
        elif 'imagetwist.com' in imgUrl:
            pq = helper.get(imgUrl)
            if not pq:
                return ''
            img = pq('img.img-responsive')
            url = img.attr('src')
            return url or ''
        elif 'dfiles.ru' in imgUrl:
            # this page serves a zip download; skip it
            return ''
        elif 'imgtrex.com' in imgUrl:
            pq = helper.get(imgUrl)
            img = pq('img.pic')
            return img.attr('src')
        elif 'addimage.info' in imgUrl:
            print('addimage.info is over!!!')
            return ''
        elif 'dragimage.org' in imgUrl:
            print('dragimage.org is over!!!')
            return ''
        else:
            print('unknown image url => %s' % imgUrl)
            return None
    return ''
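# A minimal, isolated sketch of the imgchili URL rewrite used in
# fetchLargeImageUrl above, so it can be sanity-checked without the network.
# The sample URL follows the pattern shown in the comments above.
def _imgchili_direct_url(img_url, tag):
    # http://imgchili.net/show/102747/x.jpg -> http://i11.imgchili.net/102747/x.jpg (tag='11')
    return img_url.replace('//imgchili', '//i%s.imgchili' % tag).replace('show/', '')

# _imgchili_direct_url('http://imgchili.net/show/102747/x.jpg', '11')
#   -> 'http://i11.imgchili.net/102747/x.jpg'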
def fetch_detail(url):
    url = 'https://www.footaction.com' + url
    pq = helper.get(url, cookies)
    name = pq('span.c-product-name').text()
    print('name = %s' % name)
    number = pq('div.c-tab-panel').text().split(' ')[2]
    print('number = %s' % number)
    size_price_arr = []
    price = 0.0
    try:
        price = float(pq('span.sr-only').text().replace('$', ''))
    except:
        price = float(pq('span.final').text().replace('$', ''))
    size_arr = pq('div.c-size p > label').text().split(' ')
    for size in size_arr:
        size_price_arr.append({
            'size': float(size),
            'price': price,
            'isInStock': True
        })
    print('size_price_arr = ', size_price_arr)
    img_json_str = helper.get('https://images.footaction.com/is/image/EBFL2/%sMM?req=set,json' % number, returnText=True)
    img_json = None
    img_url = None
    try:
        img_json = json.loads(img_json_str.replace('/*jsonp*/s7jsonResponse(', '').replace(',"");', ''))
        img_item_arr = img_json.get('set').get('item')
        for img_item in img_item_arr:
            if img_item.get('type') == 'img_set':
                img_url = img_item.get('set').get('item')[0].get('s').get('n')
                break
    except:
        img_json_str = helper.get('https://images.footaction.com/is/image/EBFL2/%s?req=set,json' % number, returnText=True)
        img_json = json.loads(img_json_str.replace('/*jsonp*/s7jsonResponse(', '').replace(',"");', ''))
        img_item_arr = img_json.get('set').get('item')
        try:
            img_url = img_item_arr[0].get('s').get('n')
        except:
            img_url = img_item_arr.get('s').get('n')
    img_url = 'https://images.footaction.com/is/image/%s?wid=600&hei=600&fmt=jpg' % img_url
    print(img_url)
    global platform
    helper.downloadImg(img_url, os.path.join('.', 'imgs', platform, '%s.jpg' % number))
    mongo.insert_pending_goods(name, number, url, size_price_arr, ['%s.jpg' % number], platform)
    # upload to Qiniu
    qiniuUploader.upload_2_qiniu(platform, '%s.jpg' % number, './imgs/%s/%s.jpg' % (platform, number))
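# A small sketch of the JSONP unwrapping fetch_detail relies on: Scene7
# "req=set,json" responses are wrapped as /*jsonp*/s7jsonResponse(<json>,"");
# and the payload is recovered by stripping that wrapper before json.loads.
# The sample string here is made up for illustration.
import json

def _strip_s7_jsonp(text):
    return text.replace('/*jsonp*/s7jsonResponse(', '').replace(',"");', '')

# json.loads(_strip_s7_jsonp('/*jsonp*/s7jsonResponse({"set": {"item": []}},"");'))
#   -> {'set': {'item': []}}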
def start():
    crawl_counter = mongo.get_crawl_counter(platform)
    # queue holding the data the worker threads collect
    q = Queue()
    # page URLs that failed
    error_page_url_queue = Queue()
    # fetch the cookie first
    _, tmpCookie = helper.get('https://www.kickz.com/us/men/shoes/c', myHeaders={
        'User-Agent': 'Mozilla/5.0'
    }, withCookie=True)
    global cookie
    cookie['JSESSIONID'] = tmpCookie.get('JSESSIONID', '')
    total_page = 20
    fetch_page(['https://www.kickz.com/us/men/shoes/c?selectedPage=%d' % page for page in range(1, total_page + 1)],
               1, q, error_page_url_queue, crawl_counter)
    total_page = 17
    fetch_page(['https://www.kickz.com/us/kids,women/shoes/shoe-sizes/38+,36-2:3,40+,37+,41+,39-1:3,35+,36,36+,39+,39,37,38,41-1:3,42,41,40,39:40,38-2:3,40-2:3,35:36,37:38,37-1:3,41:42/c?selectedPage=%d' % page for page in range(1, total_page + 1)],
               2, q, error_page_url_queue, crawl_counter)
    # # retry the failed page URLs
    # while not error_page_url_queue.empty():
    #     error_page_url_list = []
    #     while not error_page_url_queue.empty():
    #         error_page_url_list.append(error_page_url_queue.get())
    #     error_page_men_url_list = [url_data.get('url') for url_data in error_page_url_list if url_data.get('gender') == 1]
    #     fetch_page(error_page_men_url_list, 1, q, error_page_url_queue, crawl_counter)
    #     error_page_women_url_list = [url_data.get('url') for url_data in error_page_url_list if url_data.get('gender') == 2]
    #     fetch_page(error_page_women_url_list, 2, q, error_page_url_queue, crawl_counter)
    helper.log('done', platform)
def main(chat='A', enabled=False, chat_index=-1):
    '''main'''
    chat_index = CHAT_ARR.index(chat) if chat_index == -1 else chat_index
    b = True
    is_enabled = enabled
    if chat_index < len(CHAT_ARR):
        url = 'https://www.metartx.com/models/all/%s' % CHAT_ARR[chat_index]
        pyquery = helper.get(url)
        a_arr = pyquery('.list-group-item > a')
        for item in a_arr:
            if b:
                url = item.get('href')
                if url == "https://www.metart.com/model/uliya-a/":
                    is_enabled = True
                if is_enabled:
                    head_img = item.find('img').get('src')
                    name = item.find('img').get('alt')
                    json_path = os.path.join('metartx', 'model', '%s.json' % name)
                    img_path = os.path.join('metartx', 'model', '%s_MetArtX.jpg' % name)
                    if not os.path.exists(json_path) or not os.path.exists(img_path):
                        fetch_model(url, name, head_img)
                b = False
            else:
                b = True
        main(chat_index=chat_index + 1, enabled=is_enabled)
def main(chat, enabled=False):
    '''main'''
    chat_index = CHAT_ARR.index(chat)
    b = True
    is_enabled = enabled
    if chat_index < 26:
        url = 'https://www.vivthomas.com/models/all/%s' % CHAT_ARR[chat_index]
        pyquery = helper.get(url)
        a_arr = pyquery('.list-group-item > a')
        for item in a_arr:
            if b:
                url = item.get('href')
                if url == "https://www.metart.com/model/uliya-a/":
                    is_enabled = True
                if is_enabled:
                    head_img = item.find('img').get('src')
                    name = item.find('img').get('alt')
                    json_path = os.path.join('vivthomas', 'model', '%s.json' % name)
                    img_path = os.path.join('vivthomas', 'model', '%s.jpg' % name)
                    print(url, name, head_img)
                    if not os.path.exists(json_path) or not os.path.exists(img_path):
                        fetch_model(url, name, head_img)
                b = False
            else:
                b = True
    if chat_index < len(CHAT_ARR) - 1:
        main(CHAT_ARR[chat_index + 1], is_enabled)
def main(chat_index=0, enabled=False):
    '''main'''
    b = True
    is_enabled = enabled
    if chat_index < 26:
        url = 'https://www.eroticbeauty.com/models/all//%s' % CHAT_ARR[chat_index]
        pyquery = helper.get(url)
        a_arr = pyquery('.list-group-item > a')
        for item in a_arr:
            if b:
                url = item.get('href')
                print('now => %s' % url)
                # if url == "https://www.metart.com/model/uliya-a/":
                #     is_enabled = True
                if is_enabled:
                    head_img = item.find('img').get('src')
                    name = item.find('img').get('alt')
                    json_path = os.path.join('eroticbeauty', 'model', '%s.json' % name)
                    img_path = os.path.join('eroticbeauty', 'model', '%s.jpg' % name)
                    if not os.path.exists(json_path) or not os.path.exists(img_path):
                        fetch_model(url, name, head_img)
                b = False
            else:
                b = True
        main(chat_index + 1, is_enabled)
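# The b-toggle in the three main() variants above visits every other <a>;
# each model card appears to contribute two anchors, so only the first of each
# pair is processed. A stand-alone sketch of that skip-alternate pattern
# (the input data here is illustrative):
def _every_other(items):
    take = True
    kept = []
    for item in items:
        if take:
            kept.append(item)
        take = not take
    return kept

# _every_other(['img-a', 'name-a', 'img-b', 'name-b']) -> ['img-a', 'img-b']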
def run(self):
    try:
        pq = helper.get(self.url, myHeaders=self.headers)
        a_list = pq('a.product-image')
        for a in a_list:
            self.q.put(a.get('href'))
    except:
        self.error_page_url_queue.put(self.url)
def fetchAlbum(url):
    pq = helper.get(url)
    dirName = os.path.join('.', 'aiss', pq('title').text())
    helper.mkDir(dirName)
    index = 1
    for img in pq('.message > img'):
        helper.downloadImg(img.get('src'), os.path.join(dirName, '%03d.jpg' % index))
        index += 1
def run(self):
    try:
        pq = helper.get(self.url, myHeaders=self.headers)
        a_list = pq('div.mainsite_record_listing li > a')
        for a in a_list:
            self.q.put(a.get('href'))
    except:
        self.error_page_url_queue.put({'url': self.url, 'gender': self.gender})
def run(self):
    # collect the product detail URLs
    try:
        pq = helper.get(self.url, cookies=cookie, myHeaders=self.headers)
        for a in pq('a.no-h-over'):
            self.q.put(a.get('link'))
            # helper.log('[DEBUG] => ' + a.get('link'), platform)
    except:
        helper.log('[ERROR] => ' + self.url, platform)
        self.error_page_url_queue.put({'url': self.url, 'gender': self.gender})
def fetchAlbum(url, dirName):
    if 'rosi' in url:
        pq = helper.get(url)
        dirName = os.path.join(dirName, pq('#post-title').text().split('No.')[1])
        helper.mkDir(dirName)
        for a in pq('.gallery-icon > a'):
            imgUrl = a.get('href')
            helper.downloadImg(imgUrl, os.path.join(dirName, imgUrl.split('/')[-1]))
def run(self):
    '''Parse the page source.'''
    time.sleep(random.randint(2, 5))
    try:
        pq = helper.get(self.url, myHeaders=self.headers, cookies=self.cookies)
        # product name
        name = pq('h1#pdp_product_title')
        if name and len(name) > 0:
            name = name[0].text
            # style/color number
            number = pq('li.description-preview__style-color').text().split(':')[1].strip()
            # color value
            color_value = pq('li.description-preview__color-description').text().split(':')[1].strip()
            price = 0
            for div in pq('div.text-color-black'):
                if div.get('data-test') == 'product-price':
                    price = float(div.text.replace('$', ''))
                    break
            size_price_arr = []
            for input in pq('div.availableSizeContainer input'):
                # e.g. "M 3.5 / W 5"
                size = input.get('aria-label').replace('W', '').replace('M', '').replace('C', '').strip()
                if '/' in size:
                    size = size.split('/')[0].strip()
                size_price_arr.append({
                    'size': float(size),
                    'price': price,
                    'isInStock': input.get('disabled') is None
                })
            img_url = None
            for source in pq('noscript > picture > source'):
                img_url = source.get('srcset')
                break
            if img_url:
                result = helper.downloadImg(img_url, os.path.join('.', 'imgs', platform, '%s.jpg' % number))
                if result == 1:
                    # upload to Qiniu
                    qiniuUploader.upload_2_qiniu(platform, '%s.jpg' % number, './imgs/%s/%s.jpg' % (platform, number))
            mongo.insert_pending_goods(name, number, self.url, size_price_arr, ['%s.jpg' % number],
                                       self.gender, color_value, platform, '5be444e3c7e854cab4b252a0',
                                       self.crawl_counter, '', True if img_url else False)
        else:
            helper.log('%s has no name' % self.url, platform)
            # name = pq('h1.exp-pdp-title__main-title')
            # name = name[0].text
    except Exception as e:
        global error_detail_url
        error_counter = error_detail_url.get(self.url, 1)
        error_detail_url[self.url] = error_counter + 1
        helper.log('[ERROR] error timer = %s, url = %s' % (error_counter, self.url), platform)
        helper.log(e, platform)
        if error_counter < 3:
            self.q.put(self.url)
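# An isolated sketch of the aria-label size parsing in run() above. Labels
# like "M 3.5 / W 5" are assumed based on the comment in the original code.
def _parse_size(label):
    size = label.replace('W', '').replace('M', '').replace('C', '').strip()
    if '/' in size:
        size = size.split('/')[0].strip()
    return float(size)

# _parse_size('M 3.5 / W 5') -> 3.5
# _parse_size('W 7.5') -> 7.5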
def run(self):
    try:
        pq = helper.get(self.url)
        for a in pq('li.item > a'):
            self.q.put(a.get('href'))
    except:
        helper.log('[ERROR] => ' + self.url, platform)
        self.error_page_url_queue.put({
            'url': self.url,
            'gender': self.gender
        })
def fetch_page(url, page=1, total_page=-1):
    page_url = '%s?currentPage=%d&sort=name-asc' % (url, page - 1)
    pq = helper.get(page_url, cookies, headers)
    # collect the product detail URLs
    a_arr = pq('div.c-product-card > a')
    for a in a_arr:
        fetch_detail(a.get('href'))
    if total_page < 0:
        total_str = pq('div.sub strong').text()
        # 60 products per page
        total_page = int(math.ceil(int(total_str) / 60.0))
    if page + 1 <= total_page:
        fetch_page(url, page + 1, total_page)
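# A tiny sketch of the page-count math in fetch_page above: with 60 products
# per page, the page count is ceil(total / 60). The example value is made up.
import math

def _page_count(total_items, per_page=60):
    return int(math.ceil(total_items / float(per_page)))

# _page_count(121) -> 3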
def fetchPage(page):
    url = '%s/page/%d/' % (BASE_URL, page)
    pq = helper.get(url)
    for a in pq('a.disp_a'):
        title = a.get('title').replace('Permalink to ', '')
        url = a.get('href')
        dirName = os.path.join('cartoon', title)
        if not os.path.exists(os.path.join(dirName, 'url.txt')):
            helper.mkDir(dirName)
            if not fetchGallery(url, title, page):
                return False
    return True
def fetchPage(page):
    global enabled
    url = '%s/page/%d/' % (BASE_URL, page)
    pq = helper.get(url)
    for a in pq('h2 > a'):
        url = a.get('href')
        if not enabled:
            if url == 'http://adultphotosets.ru/metart-andrea-sixth-presenting/':
                enabled = True
        if enabled:
            if not fetchGallery(url, page):
                return False
    return True
def run(self):
    # collect the product detail URLs
    try:
        pq = helper.get(self.url, myHeaders=self.headers)
        for span in pq('span.product_title'):
            a = PyQuery(span).parents('a')
            self.q.put(a.attr('href'))
    except:
        helper.log('[ERROR] => ' + self.url, 'eastbay')
        self.error_page_url_queue.put({
            'url': self.url,
            'gender': self.gender
        })
def run(self):
    # collect the product detail URLs
    try:
        pq = helper.get(self.url)
        for a in pq('div.product_grid_image > a'):
            self.q.put('http://www.jimmyjazz.com%s' % a.get('href'))
    except:
        global platform
        helper.log('[ERROR] => ' + self.url, platform)
        self.error_page_url_queue.put({
            'url': self.url,
            'gender': self.gender
        })
def fetch_page(url, page=1):
    total_page = -1
    while True:
        page_url = '%s/%d?orderBy=Published' % (url, page)
        pq = helper.get(page_url)
        if total_page < 0:
            span = pq('span.current-page')[0]
            total_page = int(span.text.strip().split('(')[1].replace(')', ''))
        # collect the product detail URLs
        for a in pq('li.product > a'):
            fetch_detail('https://www.sneakersnstuff.com%s' % a.get('href'), page)
        page += 1
        if page > total_page:
            # past the last page; stop
            break
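# A minimal sketch of the total-page discovery in fetch_page above: the count
# is read from the "current page" label on the first page, then the loop runs
# until it is exceeded. The label format 'Page 1 (12)' is an assumption
# inferred from the split('(') parsing.
def _total_pages(label):
    return int(label.strip().split('(')[1].replace(')', ''))

# _total_pages('Page 1 (12)') -> 12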
def fetchGallery(url, title, cartoonPage, page=1, urlArr=None):
    print('%s => now cartoonPage => %d' % (helper.now(), cartoonPage))
    # print('now cartoon => %s' % title)
    if not urlArr:
        urlArr = []
    pq = helper.get('%s/%d' % (url, page))
    if not pq:
        return False
    for img in pq('p>img'):
        src = img.get('src')
        if src in urlArr:
            # a repeated src suggests the gallery has wrapped around; save and stop
            dirName = os.path.join('cartoon', title)
            helper.writeFile('\n'.join(urlArr), u'%s/url.txt' % dirName)
            return True
        urlArr.append(src)
    return fetchGallery(url, title, cartoonPage, page + 1, urlArr)
def run(self):
    try:
        txt = helper.get(self.url, myHeaders=self.headers, returnText=True)
        json_data = json.loads(txt)
        item_list = json_data.get('sections')[0].get('items')
        for item_data in item_list:
            self.q.put(item_data.get('pdpUrl'))
    except Exception as e:
        error_counter = error_detail_url.get(self.url, 1)
        error_detail_url[self.url] = error_counter + 1
        helper.log('[ERROR] error timer = %s, url = %s' % (error_counter, self.url), platform)
        helper.log(e, platform)
        if error_counter < 3:
            self.q.put(self.url)
    finally:
        helper.log('[INFO] %s is done' % self.url, platform)
def run(self):
    # collect the product detail URLs
    try:
        html = helper.get(self.url, myHeaders=self.headers, returnText=True)
        json_data = json.loads(html)
        print(json_data.get('products', [])[0])
        # for span in pq('span.product_title'):
        #     a = PyQuery(span).parents('a')
        #     self.q.put(a.attr('href'))
    except:
        helper.log('[ERROR] => ' + self.url, platform)
        self.error_page_url_queue.put({
            'url': self.url,
            'gender': self.gender
        })
def fetchGallery(url):
    pq = helper.get(url)
    # SexArt – Alexis Crystal & Michael Fly – Call | AdultPhotoSets.ru
    title = pq('title').text()
    title = title.split(' | ')[0]
    dirName = os.path.join('imgs', '0error', title)
    i = 0
    tag = None
    imgUrl = []
    aArr = pq('a.externalLink')
    if not aArr or len(aArr) < 1:
        aArr = pq('div.content>p>a')
    if not aArr or len(aArr) < 1:
        # http://imgtrex.com/8kbfdzphqsr1/daniela-dressed-for-sex-02-10000px
        arr = re.compile(r'http://imgtrex\.com/\w+/[a-z0-9-]+\.jpg').findall(pq.html())
        if len(arr) == 0:
            print('can\'t find any <a>')
            return None
        aArr = [{'href': a} for a in arr]
    if aArr and len(aArr) > 0:
        if 'imgchili.net' in aArr[0].get('href'):
            imgArr = pq('div.content>p>a>img')
            # http://t10.imgchili
            tag = imgArr[0].get('src').replace('http://', '').split('.imgchili')[0].replace('t', '')
        for a in aArr:
            print('%s image index => %d' % (helper.now(), i))
            url = fetchLargeImageUrl(a.get('href'), tag)
            if url is None:
                print('fetchLargeImageUrl failed')
                return None
            if url != '':
                imgUrl.append(url)
            i += 1
    if len(imgUrl) > 0:
        helper.writeFile('\n'.join(imgUrl), '%s/url.txt' % dirName)
    return title
def main():
    # https://www.eternaldesire.com/models/all/A/
    global chatIndex
    chatIndex = 12
    b = False
    while chatIndex < 23:
        url = 'https://www.eternaldesire.com/models/all/%s/' % CHAT_ARR[chatIndex]
        pq = helper.get(url)
        scoreSpanArr = pq('.update_information_gallery_rating_number')
        aArr = pq('.update_information_model_name')
        imgArr = pq('td > a > img')
        for i in range(0, len(imgArr)):
            if imgArr[i].get('alt') == 'Mila':
                b = True
            if b:
                fetchModel(aArr[i].get('href'), imgArr[i].get('src'),
                           imgArr[i].get('alt'), scoreSpanArr[i].text)
                # break
        chatIndex += 1
def run(self):
    '''Parse the page source.'''
    time.sleep(2)
    try:
        pq = helper.get(self.url, cookie)
        name = pq('h1#prodNameId').text()
        number = pq('span#supplierArtNumSpan').text()
        color_value = pq('span#variantColorId').text()
        size_price_arr = []
        for a in pq('div#2SizeContainer > div > a'):
            arr = [item.strip() for item in a.get('onclick').replace('ProductDetails.changeSizeAffectedLinks(', '').replace(');', '').split('\n')]
            # print(arr)
            # "'8+'," => "8+," => "8+"
            arr[6] = arr[6].replace('\'', '').replace(',', '').replace('Y', '')
            size_price_arr.append({
                'size': float(arr[6]) if '+' not in arr[6] else float(arr[6].replace('+', '')) + 0.5,
                # "'115,76 USD'," => "'115.76 USD'." => "'115.76" => 115.76
                'price': float(arr[2].replace(',', '.').replace(' USD\'.', '').replace('\'', '')),
                'isInStock': True
            })
        # print(size_price_arr)
        img_downloaded = mongo.is_pending_goods_img_downloaded(self.url)
        if not img_downloaded:
            img_url = pq('img.productDetailPic').attr('src')
            result = helper.downloadImg(img_url, os.path.join('.', 'imgs', platform, '%s.jpg' % number))
            if result == 1:
                # upload to Qiniu
                qiniuUploader.upload_2_qiniu(platform, '%s.jpg' % number, './imgs/%s/%s.jpg' % (platform, number))
                img_downloaded = True
        mongo.insert_pending_goods(name, number, self.url, size_price_arr, ['%s.jpg' % number],
                                   self.gender, color_value, platform, '5bc87d6dc7e854cab4875368',
                                   self.crawl_counter, img_downloaded=img_downloaded)
    except Exception as e:
        global error_detail_url
        error_counter = error_detail_url.get(self.url, 1)
        error_detail_url[self.url] = error_counter + 1
        helper.log('[ERROR] error timer = %s, url = %s' % (error_counter, self.url), platform)
        helper.log(e, platform)
        if error_counter < 3:
            self.q.put(self.url)
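# An isolated sketch of the onclick parsing in run() above. The argument layout
# of ProductDetails.changeSizeAffectedLinks(...) is assumed from the indexes the
# original code reads (arr[2] = a price like "'115,76 USD',", arr[6] = a size
# like "'8+',", one argument per line).
def _parse_size_price(onclick):
    arr = [item.strip() for item in onclick
           .replace('ProductDetails.changeSizeAffectedLinks(', '')
           .replace(');', '')
           .split('\n')]
    size_str = arr[6].replace('\'', '').replace(',', '').replace('Y', '')
    size = float(size_str) if '+' not in size_str else float(size_str.replace('+', '')) + 0.5
    price = float(arr[2].replace(',', '.').replace(' USD\'.', '').replace('\'', ''))
    return size, price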
def fetchComic(webPage, comicIndex, url, page=1, comicDir=None):
    pq = helper.get('%s/%d' % (url, page))
    if page == 1:
        title = pq('title').text().replace('/', '&').split(' - ')[0]
        comicDir = os.path.join('animezilla', title)
        if os.path.exists(os.path.join('animezilla', '0uploaded', title, 'done.txt')):
            return True
        if os.path.exists(os.path.join(comicDir, 'done.txt')):
            return True
        helper.mkDir(comicDir)
        if webPage == 1 and comicIndex == 16:
            if page < 90:
                return fetchComic(webPage, comicIndex, url, 90, comicDir)
    img = pq('img#comic')
    print('[%s] downloading webPage page => %d, comic index => %d, comic page => %d' % (helper.now(), webPage, comicIndex, page))
    downloadImg(img.attr('src'), os.path.join(comicDir, '%03d.jpg' % page), url)
    time.sleep(3)
    if len(img.parents('a')) == 0:
        # the last page's image has no wrapping link; mark the comic done
        helper.writeFile('done', os.path.join(comicDir, 'done.txt'))
        return True
    return fetchComic(webPage, comicIndex, url, page + 1, comicDir)
def fetch_page(url, q, crawl_counter, gender, error_page_url_queue):
    total_page = -1
    page = 1
    page_thread_list = []
    while True:
        page_url = '%s?ppg=104&page=%d' % (url, page)
        # create and start a page thread
        time.sleep(1.2)
        page_spider = PageSpider(page_url, q, error_page_url_queue, gender)
        page_spider.start()
        page_thread_list.append(page_spider)
        if total_page < 0:
            pq = helper.get(page_url)
            div = pq('div.pagination_info')[0]
            total_page = int(div.text.strip().split('of ')[1])
        page += 1
        if page > total_page:
            # past the last page; stop
            break
    for t in page_thread_list:
        t.join()
    goods_thread_list = []
    while True:
        queue_size = q.qsize()
        if queue_size > 0:
            # start up to 5 goods-fetching threads per batch
            for i in range(5 if queue_size > 5 else queue_size):
                goods_spider = GoodsSpider(q.get(), gender, q, crawl_counter)
                goods_spider.start()
                goods_thread_list.append(goods_spider)
            for t in goods_thread_list:
                t.join()
            goods_thread_list = []
        else:
            break
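# A generic sketch of the batch pattern used above: drain a queue in batches of
# at most 5 worker threads, joining each batch before starting the next.
# threading.Thread stands in for the PageSpider/GoodsSpider classes; the
# Python 3 queue module name is assumed.
import threading
from queue import Queue

def _drain_in_batches(q, handle, batch_size=5):
    while not q.empty():
        batch = []
        for _ in range(min(batch_size, q.qsize())):
            t = threading.Thread(target=handle, args=(q.get(),))
            t.start()
            batch.append(t)
        for t in batch:
            t.join()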
import helper
import constants

# https://api.enterprise.apigee.com/v1/organizations/yemhuynh-eval/environments/prod/caches
for e in constants.ENVS:
    caches = []
    cacheNames = helper.get("environments/" + e + "/caches")
    for name in cacheNames:
        cacheInfo = helper.get("environments/" + e + "/caches/" + name)
        caches.append(cacheInfo)
    helper.dumpConfig("env/" + e + "/caches.json", caches)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Author: [email protected]
# Date: 2017-07-16 15:12:36
import helper
import argparse

if __name__ == '__main__':
    # parser = argparse.ArgumentParser()
    # parser.add_argument("--url", help="the url of web")
    # options = parser.parse_args()
    # http://www.soku.com/search_video/q_王者荣耀_orderby_1_limitdate_0?site=14&page=2
    for page in range(1, 99):
        pq = helper.get('http://www.soku.com/search_video/q_王者荣耀_orderby_1_limitdate_0?site=14&page=%d' % page)
        for a in pq('.v-link > a'):
            helper.runCmd('you-get -o ./videos --format=mp4 %s' % a.get('href'))
def fetch_model(url, name, head_img):
    '''fetch model'''
    model_dir = os.path.join('vivthomas', 'model')
    helper.mkDir(model_dir)
    helper.mkDir(os.path.join('vivthomas', 'photo'))
    # download the avatar first
    helper.downloadImg(head_img, os.path.join(model_dir, '%s.jpg' % name))
    if os.path.exists(os.path.join('vivthomas', 'model', '%s.json' % name)):
        return
    # then fetch the detailed data
    model_info = {
        'name': name,
        'photos': []
    }
    pyquery = helper.get(url)
    country_span = pyquery('.custom-country')
    model_info['country'] = country_span.text() if country_span else 'unknown'
    # collect the photo data
    custom_content_list = pyquery('.custom-content-list')
    custom_content = None
    for item in custom_content_list:
        if item.getchildren()[0].getchildren()[0].text.startswith('Photos with'):
            custom_content = item
            break
    if custom_content is None:
        helper.writeFile(json.dumps(model_info), os.path.join('vivthomas', 'model', '%s.json' % name))
        return
    # if len(custom_content_list) == 3:
    #     custom_content = custom_content_list[1]
    # else:
    #     custom_content = custom_content_list[0]
    list_group_item_list = custom_content.getchildren()[2].findall('li')
    for list_group_item in list_group_item_list:
        custom_list_item_detailed = list_group_item.getchildren()[1]
        img = custom_list_item_detailed.getchildren()[0].getchildren()[0].getchildren()[0]
        photo_name = img.get('alt')
        # Released: Feb 26, 2016
        date_str = custom_list_item_detailed.getchildren()[1].getchildren()[1].text_content().split(': ')[1]
        date_str = '%s-%d-%s' % (date_str.split(', ')[1],
                                 helper.getMonth(date_str.split(' ')[0]),
                                 date_str.split(' ')[1].replace(',', ''))
        # model names
        arr = custom_list_item_detailed.getchildren()[1].getchildren()[2].getchildren()
        model_name_arr = []
        for i in range(1, len(arr)):
            model_name_arr.append(arr[i].text)
        # print(model_name_arr)
        # date = datetime.datetime(int(date_str.split(', ')[1]), helper.getMonth(date_str.split(' ')[0]), int(date_str.split(' ')[1].replace(',', '')))
        # download the photo cover
        photo_path = os.path.join('vivthomas', 'photo',
                                  '%s_%s' % (date_str, photo_name.replace('/', ' ')),
                                  '%s_%s.jpg' % (date_str, photo_name.replace('/', ' ')))
        helper.downloadImg(img.get('src'), photo_path)
        # save to the database
        # mongo.newAlbun(photo_name, date)
        photo_json = {
            'date': date_str,
            'name': photo_name,
            'model': model_name_arr
        }
        photo_json_str = json.dumps(photo_json)
        model_info['photos'].append(photo_json)
        helper.writeFile(photo_json_str, os.path.join('vivthomas', 'photo',
                                                      '%s_%s' % (date_str, photo_name),
                                                      '%s_%s.json' % (date_str, photo_name)))
    helper.writeFile(json.dumps(model_info), os.path.join('vivthomas', 'model', '%s.json' % name))
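# An isolated sketch of the release-date conversion in fetch_model above:
# 'Feb 26, 2016' -> '2016-2-26', assuming helper.getMonth maps month
# abbreviations to 1..12 as the original code does.
def _to_date_str(released, get_month):
    return '%s-%d-%s' % (released.split(', ')[1],
                         get_month(released.split(' ')[0]),
                         released.split(' ')[1].replace(',', ''))

# _to_date_str('Feb 26, 2016', helper.getMonth) -> '2016-2-26'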