Пример #1
0
def fetchLargeImageUrl(imgUrl, tag):
	"""Resolve an image-hosting page URL to the direct large-image URL.

	Args:
		imgUrl: URL of the hosting page (or a direct archive link).
		tag: numeric imgchili server tag, e.g. '11' -> i11.imgchili.net;
			only used for imgchili URLs.

	Returns:
		The direct image URL, '' when the host is known but yields no image
		(or the link is a zip/download page), or None for an unknown host.
	"""
	if imgUrl.endswith('zip'):
		# Archive download link, nothing to resolve.
		return ''
	if 'imagehosting.pro' in imgUrl or 'gif-jpg.com' in imgUrl or 'ipics.info' in imgUrl:
		pq = helper.get(imgUrl)
		img = pq('img.centred')
		url = img.attr('src')
		if not url:
			# Some pages use the resized variant instead.
			img = pq('img.centred_resized')
			url = img.attr('src')
		# Fix: use `is None` instead of `== None`.
		if url is None:
			return ''
		return url
	elif 'img.yt' in imgUrl or 'imgcandy.net' in imgUrl:
		if 'img.yt' in imgUrl:
			imgUrl = imgUrl.replace('http:', 'https:')
		# These hosts require a POST to reveal the full-size image.
		pq = helper.post(imgUrl, 2)
		img = pq('img.centred')
		return img.attr('src')
	elif 'imgchili.net' in imgUrl:
		# Rewrite the viewer URL into the direct image URL, e.g.
		# http://imgchili.net/show/102747/102747596__x.jpg
		# -> http://i11.imgchili.net/102747/102747596__x.jpg
		return imgUrl.replace('//imgchili', '//i%s.imgchili' % tag).replace('show/', '')
	elif 'imagetwist.com' in imgUrl:
		pq = helper.get(imgUrl)
		if not pq:
			return ''
		img = pq('img.img-responsive')
		url = img.attr('src')
		return url or ''
	elif 'dfiles.ru' in imgUrl:
		# dfiles.ru is a zip-download page; skip it.
		return ''
	elif 'imgtrex.com' in imgUrl:
		pq = helper.get(imgUrl)
		img = pq('img.pic')
		return img.attr('src')
	elif 'addimage.info' in imgUrl:
		print('addimage.info is over!!!')
		return ''
	elif 'dragimage.org' in imgUrl:
		print('dragimage.org is over!!!')
		return ''
	else:
		print('unknow image url => %s' % imgUrl)
		return None
Пример #2
0
def fetch_detail(url):
    """Fetch one Footaction product page and persist its data.

    Scrapes name, style number, sizes and price, downloads the primary
    product image, stores the goods via mongo and uploads the image to Qiniu.

    Args:
        url: product path relative to the site root.
    """
    url = 'https://www.footaction.com' + url
    pq = helper.get(url, cookies)
    name = pq('span.c-product-name').text()
    print('name = %s' % name)
    number = pq('div.c-tab-panel').text().split(' ')[2]
    print('number = %s' % number)

    size_price_arr = []
    price = '0.00'
    try:
        price = float(pq('span.sr-only').text().replace('$', ''))
    # Fix: narrow bare `except:` so SystemExit/KeyboardInterrupt pass through.
    except Exception:
        # Fallback selector when the accessible price span is missing.
        price = float(pq('span.final').text().replace('$', ''))
    size_arr = pq('div.c-size p > label').text().split(' ')
    for size in size_arr:
        size_price_arr.append({
            'size': float(size),
            'price': price,
            'isInStock': True
        })
    print('size_price_arr = ', size_price_arr)

    # The image service answers with a JSONP payload describing the image set.
    img_json_str = helper.get('https://images.footaction.com/is/image/EBFL2/%sMM?req=set,json' % number, returnText=True)
    img_json = None
    img_url = None
    try:
        img_json = json.loads(img_json_str.replace('/*jsonp*/s7jsonResponse(', '').replace(',"");', ''))
        img_item_arr = img_json.get('set').get('item')
        for img_item in img_item_arr:
            if img_item.get('type') == 'img_set':
                img_url = img_item.get('set').get('item')[0].get('s').get('n')
                break
    except Exception:
        # Some products have no "MM" set; retry with the bare article number.
        img_json_str = helper.get('https://images.footaction.com/is/image/EBFL2/%s?req=set,json' % number, returnText=True)
        img_json = json.loads(img_json_str.replace('/*jsonp*/s7jsonResponse(', '').replace(',"");', ''))
        img_item_arr = img_json.get('set').get('item')
        try:
            # 'item' may be a list or a single dict depending on the product.
            img_url = img_item_arr[0].get('s').get('n')
        except Exception:
            img_url = img_item_arr.get('s').get('n')

    img_url = 'https://images.footaction.com/is/image/%s?wid=600&hei=600&fmt=jpg' % img_url
    print(img_url)
    global platform
    helper.downloadImg(img_url, os.path.join('.', 'imgs', platform, '%s.jpg' % number))
    mongo.insert_pending_goods(name, number, url, size_price_arr, ['%s.jpg' % number], platform)
    # Upload the image to Qiniu CDN.
    qiniuUploader.upload_2_qiniu(platform, '%s.jpg' % number, './imgs/%s/%s.jpg' % (platform, number))
Пример #3
0
def start():
    """Entry point: crawl the men's and the kids/women shoe listings on kickz.com."""
    crawl_counter = mongo.get_crawl_counter(platform)
    # Queue that collects the data produced by the crawler workers.
    q = Queue()
    # Queue of page links that raised an error while being fetched.
    error_page_url_queue = Queue()
    # Obtain the session cookie first.
    _, tmpCookie = helper.get('https://www.kickz.com/us/men/shoes/c', myHeaders={
        'User-Agent': 'Mozilla/5.0'
    }, withCookie=True)
    global cookie
    cookie['JSESSIONID'] = tmpCookie.get('JSESSIONID', '')
    # NOTE(review): page counts are hard-coded — confirm against the site.
    total_page = 20
    # gender=1: men's pages.
    fetch_page(['https://www.kickz.com/us/men/shoes/c?selectedPage=%d' % page
                for page in range(1, total_page + 1)], 1, q, error_page_url_queue, crawl_counter)

    total_page = 17
    # gender=2: kids/women pages (the size filter is baked into the URL).
    fetch_page(['https://www.kickz.com/us/kids,women/shoes/shoe-sizes/38+,36-2:3,40+,37+,41+,39-1:3,35+,36,36+,39+,39,37,38,41-1:3,42,41,40,39:40,38-2:3,40-2:3,35:36,37:38,37-1:3,41:42/c?selectedPage=%d' % page
                for page in range(1, total_page + 1)], 2, q, error_page_url_queue, crawl_counter)

    # # Retry the links that failed:
    # while not error_page_url_queue.empty():
    #     error_page_url_list = []
    #     while not error_page_url_queue.empty():
    #         error_page_url_list.append(error_page_url_queue.get())

    #     error_page_men_url_list = [url_data.get('url') for url_data in error_page_url_list if url_data.get('gender') == 1]
    #     fetch_page(error_page_men_url_list, 1, q, error_page_url_queue, crawl_counter)
    #     error_page_women_url_list = [url_data.get('url') for url_data in error_page_url_list if url_data.get('gender') == 2]
    #     fetch_page(error_page_women_url_list, 2, q, error_page_url_queue, crawl_counter)
    helper.log('done', platform)
Пример #4
0
def main(chat='A', enabled=False, chat_index=-1):
    '''Crawl the metartx model index pages letter by letter (recursive).

    Args:
        chat: letter tab to start from when chat_index is not supplied.
        enabled: when False, models are skipped until the resume-marker URL
            below is encountered.
        chat_index: explicit index into CHAT_ARR; -1 derives it from `chat`.
    '''
    chat_index = CHAT_ARR.index(chat) if chat_index == -1 else chat_index
    b = True
    is_enabled = enabled
    if chat_index < len(CHAT_ARR):
        url = 'https://www.metartx.com/models/all/%s' % CHAT_ARR[chat_index]
        pyquery = helper.get(url)
        a_arr = pyquery('.list-group-item > a')
        for item in a_arr:
            # NOTE(review): `b` toggles each iteration so only every other
            # anchor is processed — presumably the list markup contains two
            # anchors per model; confirm against the page.
            if b:
                url = item.get('href')
                # Resume marker: start fetching once this URL is seen.
                if url == "https://www.metart.com/model/uliya-a/":
                    is_enabled = True
                if is_enabled:
                    head_img = item.find('img').get('src')
                    name = item.find('img').get('alt')
                    json_path = os.path.join('metartx', 'model',
                                             '%s.json' % name)
                    img_pathh = os.path.join('metartx', 'model',
                                             '%s_MetArtX.jpg' % name)
                    # Only fetch models not yet saved to disk.
                    if not os.path.exists(json_path) or not os.path.exists(
                            img_pathh):
                        fetch_model(url, name, head_img)
                b = False
            else:
                b = True
        main(chat_index=chat_index + 1, enabled=is_enabled)
Пример #5
0
def main(chat, enabled=False):
    # Crawl the vivthomas model index pages recursively, one letter per call.
    chat_index = CHAT_ARR.index(chat)
    '''main'''
    # NOTE(review): the string above is a no-op expression statement, not a
    # docstring (it is not the first statement in the body).
    b = True
    is_enabled = enabled
    # NOTE(review): 26 is presumably len(CHAT_ARR) — confirm.
    if chat_index < 26:
        url = 'https://www.vivthomas.com/models/all/%s' % CHAT_ARR[chat_index]
        pyquery = helper.get(url)
        a_arr = pyquery('.list-group-item > a')
        for item in a_arr:
            # `b` toggles each iteration: only every other anchor is handled
            # (presumably two anchors per model in the markup — confirm).
            if b:
                url = item.get('href')
                # Resume marker: start fetching once this URL is seen.
                if url == "https://www.metart.com/model/uliya-a/":
                    is_enabled = True
                if is_enabled:
                    head_img = item.find('img').get('src')
                    name = item.find('img').get('alt')
                    json_path = os.path.join(
                        'vivthomas', 'model', '%s.json' % name)
                    img_pathh = os.path.join(
                        'vivthomas', 'model', '%s.jpg' % name)
                    print(url, name, head_img)
                    # Only fetch models not yet saved to disk.
                    if not os.path.exists(json_path) or not os.path.exists(img_pathh):
                        fetch_model(url, name, head_img)
                b = False
            else:
                b = True
        if chat_index < len(CHAT_ARR) - 1:
            main(CHAT_ARR[chat_index + 1], is_enabled)
Пример #6
0
def main(chat_index=0, enabled=False):
    '''Crawl the eroticbeauty model index pages recursively, one letter per call.

    Note: the resume-marker check is commented out below, so nothing is
    fetched unless the caller passes enabled=True.
    '''
    b = True
    is_enabled = enabled
    # NOTE(review): 26 is presumably len(CHAT_ARR) — confirm.
    if chat_index < 26:
        url = 'https://www.eroticbeauty.com/models/all//%s' % CHAT_ARR[
            chat_index]
        pyquery = helper.get(url)
        a_arr = pyquery('.list-group-item > a')
        for item in a_arr:
            # `b` toggles each iteration: only every other anchor is handled
            # (presumably two anchors per model in the markup — confirm).
            if b:
                url = item.get('href')
                print('now => %s' % url)
                # if url == "https://www.metart.com/model/uliya-a/":
                #     is_enabled = True
                if is_enabled:
                    head_img = item.find('img').get('src')
                    name = item.find('img').get('alt')
                    json_path = os.path.join('eroticbeauty', 'model',
                                             '%s.json' % name)
                    img_pathh = os.path.join('eroticbeauty', 'model',
                                             '%s.jpg' % name)
                    # Only fetch models not yet saved to disk.
                    if not os.path.exists(json_path) or not os.path.exists(
                            img_pathh):
                        fetch_model(url, name, head_img)
                b = False
            else:
                b = True
        main(chat_index + 1, is_enabled)
Пример #7
0
 def run(self):
     """Collect product-detail links from the listing page into the queue."""
     try:
         page = helper.get(self.url, myHeaders=self.headers)
         for anchor in page('a.product-image'):
             self.q.put(anchor.get('href'))
     except:
         # On any failure, remember the page URL so it can be retried later.
         self.error_page_url_queue.put(self.url)
Пример #8
0
def fetchAlbum(url):
    """Download every image of the album page into ./aiss/<page title>/."""
    page = helper.get(url)
    target_dir = os.path.join('.', 'aiss', page('title').text())
    helper.mkDir(target_dir)
    # Images are saved as 001.jpg, 002.jpg, ... in page order.
    for idx, img in enumerate(page('.message > img'), start=1):
        helper.downloadImg(img.get('src'),
                           os.path.join(target_dir, '%03d.jpg' % idx))
Пример #9
0
 def run(self):
     """Push every record link found on the listing page onto the work queue."""
     try:
         page = helper.get(self.url, myHeaders=self.headers)
         for anchor in page('div.mainsite_record_listing li > a'):
             self.q.put(anchor.get('href'))
     except:
         # Record the failed page together with its gender tag for retrying.
         self.error_page_url_queue.put({'url': self.url, 'gender': self.gender})
Пример #10
0
 def run(self):
     # Collect the product-detail URLs from this listing page.
     try:
         page = helper.get(self.url, cookies=cookie, myHeaders=self.headers)
         for anchor in page('a.no-h-over'):
             # The detail URL lives in the 'link' attribute, not 'href'.
             self.q.put(anchor.get('link'))
     except:
         helper.log('[ERROR] => ' + self.url, platform)
         self.error_page_url_queue.put({'url': self.url, 'gender': self.gender})
Пример #11
0
def fetchAlbum(url, dirName):
    """Download a ROSI album into a sub-directory named after its 'No.' number."""
    if 'rosi' not in url:
        return
    page = helper.get(url)
    # Directory name is the album number taken from the post title.
    album_dir = os.path.join(dirName,
                             page('#post-title').text().split('No.')[1])
    helper.mkDir(album_dir)
    for anchor in page('.gallery-icon > a'):
        img_url = anchor.get('href')
        helper.downloadImg(img_url,
                           os.path.join(album_dir, img_url.split('/')[-1]))
Пример #12
0
 def run(self):
     '''
     Parse the product page and persist the goods data.

     Scrapes name, style number, color, per-size prices and the primary
     image; stores everything via mongo/qiniu. On failure the URL is
     re-queued for up to two retries.
     '''
     time.sleep(random.randint(2, 5))
     try:
         pq = helper.get(self.url, myHeaders=self.headers, cookies=self.cookies)
         # Product (style) name.
         name = pq('h1#pdp_product_title')
         if name and len(name) > 0:
             name = name[0].text
             # Colorway style number (text after the colon).
             number = pq('li.description-preview__style-color').text().split(':')[1].strip()
             # Color description (text after the colon).
             color_value = pq('li.description-preview__color-description').text().split(':')[1].strip()
             price = 0
             for div in pq('div.text-color-black'):
                 if div.get('data-test') == 'product-price':
                     price = float(div.text.replace('$', ''))
                     break
             size_price_arr = []
             for input in pq('div.availableSizeContainer input'):
                 # M 3.5 / W 5
                 size = input.get('aria-label').replace('W', '').replace('M', '').replace('C', '').strip()
                 if '/' in size:
                     size = size.split('/')[0].strip()
                 size_price_arr.append({
                     'size': float(size),
                     'price': price,
                     # NOTE(review): `.get('disabled', False) == False` is True
                     # only when the attribute is absent — i.e. the size is
                     # presumably selectable; confirm against the markup.
                     'isInStock': input.get('disabled', False) == False
                 })
             img_url = None
             for source in pq('noscript > picture > source'):
                 img_url = source.get('srcset')
                 break
             # NOTE(review): dead branch — `pass` on both paths.
             if img_url:
                 pass
             result = helper.downloadImg(img_url, os.path.join('.', 'imgs', platform, '%s.jpg' % number))
             if result == 1:
                 # Upload to Qiniu.
                 qiniuUploader.upload_2_qiniu(platform, '%s.jpg' % number, './imgs/%s/%s.jpg' % (platform, number))
             mongo.insert_pending_goods(name, number, self.url, size_price_arr, ['%s.jpg' % number], self.gender, color_value, platform, '5be444e3c7e854cab4b252a0', self.crawl_counter, '', True if img_url else False)
         else:
             helper.log('%s has no name' % self.url, platform)
             # name = pq('h1.exp-pdp-title__main-title')
             # name = name[0].text
     except Exception as e:
         global error_detail_url
         error_counter = error_detail_url.get(self.url, 1)
         error_detail_url[self.url] = error_counter + 1
         helper.log('[ERROR] error timer = %s, url = %s' % (error_counter, self.url), platform)
         helper.log(e, platform)
         # Re-queue the URL for at most two retries.
         if error_counter < 3:
             self.q.put(self.url)
Пример #13
0
 def run(self):
     """Queue every item link on the page; on failure record it for retrying."""
     try:
         page = helper.get(self.url)
         for anchor in page('li.item > a'):
             self.q.put(anchor.get('href'))
     except:
         helper.log('[ERROR] => ' + self.url, platform)
         self.error_page_url_queue.put({'url': self.url, 'gender': self.gender})
Пример #14
0
def fetch_page(url, page = 1, total_page = -1):
    """Crawl one listing page and recurse through the remaining pages.

    Args:
        url: base listing URL (without paging query parameters).
        page: 1-based page number; the site is 0-based, hence
            ``currentPage=page - 1`` below.
        total_page: total number of pages; computed from the result count on
            the first call (-1 means "not known yet").
    """
    page_url = '%s?currentPage=%d&sort=name-asc' % (url, page - 1)
    pq = helper.get(page_url, cookies, headers)
    # Visit every product-detail URL on this page.
    a_arr = pq('div.c-product-card > a')
    for a in a_arr:
        fetch_detail(a.get('href'))
    if total_page < 0:
        # 60 items per page; round up to get the page count.
        total_str = pq('div.sub strong').text()
        total_page = int(math.ceil(int(total_str) / 60))
    # Fix: the previous condition (page + 1 < total_page) never fetched the
    # last page — valid pages run 1..total_page inclusive.
    if page < total_page:
        fetch_page(url, page + 1, total_page)
Пример #15
0
def fetchPage(page):
    """Walk one index page; return False as soon as a gallery fails."""
    pq = helper.get('%s/page/%d/' % (BASE_URL, page))
    for anchor in pq('a.disp_a'):
        gallery_title = anchor.get('title').replace('Permalink to ', '')
        gallery_url = anchor.get('href')
        gallery_dir = os.path.join('cartoon', gallery_title)
        if os.path.exists(os.path.join(gallery_dir, 'url.txt')):
            # url.txt marks a fully downloaded gallery — skip it.
            continue
        helper.mkDir(gallery_dir)
        if not fetchGallery(gallery_url, gallery_title, page):
            return False
    return True
Пример #16
0
def fetchPage(page):
	"""Scan one index page, skipping entries until the resume marker is seen."""
	global enalbed
	pq = helper.get('%s/page/%d/' % (BASE_URL, page))
	for a in pq('h2 > a'):
		url = a.get('href')
		# Flip the (misspelled, module-level) resume flag at the marker post.
		if not enalbed and url == 'http://adultphotosets.ru/metart-andrea-sixth-presenting/':
			enalbed = True
		if enalbed and not fetchGallery(url, page):
			return False
	return True
Пример #17
0
 def run(self):
     # Collect product-detail URLs: each title span's enclosing <a> holds the link.
     try:
         page = helper.get(self.url, myHeaders=self.headers)
         for span in page('span.product_title'):
             link = PyQuery(span).parents('a')
             self.q.put(link.attr('href'))
     except:
         helper.log('[ERROR] => ' + self.url, 'eastbay')
         self.error_page_url_queue.put({'url': self.url, 'gender': self.gender})
Пример #18
0
 def run(self):
     # Collect product-detail URLs from the grid and make them absolute.
     try:
         page = helper.get(self.url)
         for anchor in page('div.product_grid_image > a'):
             self.q.put('http://www.jimmyjazz.com%s' % anchor.get('href'))
     except:
         global platform
         helper.log('[ERROR] => ' + self.url, platform)
         self.error_page_url_queue.put({'url': self.url, 'gender': self.gender})
Пример #19
0
def fetch_page(url, page=1):
    """Iterate every listing page of `url`, dispatching each product link."""
    total_page = -1
    while True:
        pq = helper.get('%s/%d?orderBy=Published' % (url, page))
        if total_page < 0:
            # Page count is inside the parentheses of the pagination label.
            counter = pq('span.current-page')[0]
            total_page = int(counter.text.strip().split('(')[1].replace(')', ''))
        for anchor in pq('li.product > a'):
            fetch_detail('https://www.sneakersnstuff.com%s' % anchor.get('href'),
                         page)
        page += 1
        if page > total_page:
            # Next page would exceed the last one — stop.
            break
Пример #20
0
def fetchGallery(url, title, cartoonPage, page=1, urlArr=None):
    """Collect image URLs page by page until a repeat is seen, then persist them."""
    print('%s => now cartoonPage => %d' % (helper.now(), cartoonPage))
    # print('now cartoon => %s' % title)
    urlArr = urlArr or []
    pq = helper.get('%s/%d' % (url, page))
    if not pq:
        return False
    for img in pq('p>img'):
        src = img.get('src')
        if src in urlArr:
            # First repeated image means we wrapped around: dump the list, done.
            helper.writeFile('\n'.join(urlArr),
                             u'%s/url.txt' % os.path.join('cartoon', title))
            return True
        urlArr.append(src)
    return fetchGallery(url, title, cartoonPage, page + 1, urlArr)
Пример #21
0
 def run(self):
     """Parse the JSON listing feed and queue every product's pdpUrl."""
     try:
         body = helper.get(self.url, myHeaders=self.headers, returnText=True)
         data = json.loads(body)
         for item in data.get('sections')[0].get('items'):
             self.q.put(item.get('pdpUrl'))
     except Exception as e:
         error_counter = error_detail_url.get(self.url, 1)
         error_detail_url[self.url] = error_counter + 1
         helper.log('[ERROR] error timer = %s, url = %s' % (error_counter, self.url), platform)
         helper.log(e, platform)
         # Re-queue for at most two retries.
         if error_counter < 3:
             self.q.put(self.url)
     finally:
         helper.log('[INFO] %s is done' % self.url, platform)
Пример #22
0
 def run(self):
     # Fetch the product feed as JSON and (currently) just print the first product.
     try:
         raw = helper.get(self.url,
                          myHeaders=self.headers,
                          returnText=True)
         print(json.loads(raw).get('products', [])[0])
     except:
         helper.log('[ERROR] => ' + self.url, platform)
         self.error_page_url_queue.put({'url': self.url, 'gender': self.gender})
Пример #23
0
def fetchGallery(url):
    """Fetch a gallery page, resolve every large-image URL and write the list
    to imgs/0error/<title>/url.txt.

    Returns the gallery title on success, or None when no links were found or
    any image failed to resolve.
    """
    pq = helper.get(url)
    # SexArt – Alexis Crystal & Michael Fly – Call | AdultPhotoSets.ru
    title = pq('title').text()
    title = title.split(' | ')[0]
    dirName = os.path.join('imgs', '0error', title)

    i = 0
    tag = None
    imgUrl = []
    # Try the usual link containers, most specific first.
    aArr = pq('a.externalLink')
    if not aArr or len(aArr) < 1:
        aArr = pq('div.content>p>a')
        if not aArr or len(aArr) < 1:
            # http://imgtrex.com/8kbfdzphqsr1/daniela-dressed-for-sex-02-10000px
            arr = re.compile(
                r'http://imgtrex\.com/\w+/[a-z0-9-]+\.jpg').findall(pq.html())
            if len(arr) == 0:
                print('can\'t find any <a>')
                return None
            # Wrap raw matches in dicts so they answer .get('href') like elements.
            aArr = [{'href': a} for a in arr]
            # for a in arr:
            # 	aArr.append({'href': a})

        if aArr and len(aArr) > 0:
            if 'imgchili.net' in aArr[0].get('href'):
                imgArr = pq('div.content>p>a>img')
                # e.g. http://t10.imgchili -> server tag '10',
                # needed by fetchLargeImageUrl to build the direct URL.
                tag = imgArr[0].get('src').replace(
                    'http://', '').split('.imgchili')[0].replace('t', '')
    for a in aArr:
        print('%s image index => %d' % (helper.now(), i))
        url = fetchLargeImageUrl(a.get('href'), tag)
        # None = unknown host / hard failure; '' = known host with no image.
        if url == None:
            print('fetchLargeImageUrl failed')
            return None
        else:
            if url != '':
                imgUrl.append(url)
            i += 1
    if len(imgUrl) > 0:
        helper.writeFile('\n'.join(imgUrl), '%s/url.txt' % dirName)
    return title
Пример #24
0
def main():
    """Crawl the eternaldesire model listings for letters CHAT_ARR[12..22]."""
    # https://www.eternaldesire.com/models/all/A/
    global chatIndex
    chatIndex = 12
    b = False
    while chatIndex < 23:
        url = 'https://www.eternaldesire.com/models/all/%s/' % CHAT_ARR[
            chatIndex]
        pq = helper.get(url)
        scoreSpanArr = pq('.update_information_gallery_rating_number')
        aArr = pq('.update_information_model_name')
        imgArr = pq('td > a> img')
        # NOTE(review): `xrange` implies this module targets Python 2.
        for i in xrange(0, len(imgArr)):
            # Resume marker: start fetching once the model 'Mila' is reached.
            if imgArr[i].get('alt') == 'Mila':
                b = True
            if b:
                fetchModel(aArr[i].get('href'), imgArr[i].get('src'),
                           imgArr[i].get('alt'), scoreSpanArr[i].text)
            # break
        chatIndex += 1
Пример #25
0
 def run(self):
     '''
     Parse the product page source and persist the goods data.

     The per-size data is extracted from each size link's onclick handler
     arguments; the image is downloaded and uploaded to Qiniu once.
     '''
     time.sleep(2)
     try:
         pq = helper.get(self.url, cookie)
         name = pq('h1#prodNameId').text()
         number = pq('span#supplierArtNumSpan').text()
         color_value = pq('span#variantColorId').text()
         size_price_arr = []
         for a in pq('div#2SizeContainer > div > a'):
             # The size/price data lives in the onclick call's argument list.
             arr = [item.strip() for item in a.get('onclick').replace('ProductDetails.changeSizeAffectedLinks(', '').replace(');', '').split('\n')]
             # print(arr)
             # '8+', => 8+, => 8+
             arr[6] = arr[6].replace('\'', '').replace(',', '').replace('Y', '')
             size_price_arr.append({
                 # A trailing '+' denotes a half size, e.g. 8+ -> 8.5.
                 'size': float(arr[6]) if '+' not in arr[6] else float(arr[6].replace('+', '')) + 0.5,
                 # '115,76 USD', => '115.76 USD'. => '115.76 USD'. => '115.76 => 115.76
                 'price': float(arr[2].replace(',', '.').replace(' USD\'.', '').replace('\'', '')),
                 'isInStock': True
             })
         # print(size_price_arr)
         img_downloaded = mongo.is_pending_goods_img_downloaded(self.url)
         if not img_downloaded:
             img_url = pq('img.productDetailPic').attr('src')
             result = helper.downloadImg(img_url, os.path.join('.', 'imgs', platform, '%s.jpg' % number))
             if result == 1:
                 # Upload to Qiniu.
                 qiniuUploader.upload_2_qiniu(platform, '%s.jpg' % number, './imgs/%s/%s.jpg' % (platform, number))
                 img_downloaded = True
         mongo.insert_pending_goods(name, number, self.url, size_price_arr, ['%s.jpg' % number], self.gender, color_value, platform, '5bc87d6dc7e854cab4875368', self.crawl_counter, img_downloaded=img_downloaded)
     except Exception as e:
         global error_detail_url
         error_counter = error_detail_url.get(self.url, 1)
         error_detail_url[self.url] = error_counter + 1
         helper.log('[ERROR] error timer = %s, url = %s' % (error_counter, self.url), platform)
         helper.log(e, platform)
         # Re-queue the URL for at most two retries.
         if error_counter < 3:
             self.q.put(self.url)
Пример #26
0
def fetchComic(webPage, comicIndex, url, page = 1, comicDir = None):
	"""Recursively download one comic, page by page.

	Args:
		webPage: index page number (used for logging and one special case).
		comicIndex: the comic's position on the index page.
		url: base URL of the comic; page N lives at '<url>/<N>'.
		page: current comic page, starting at 1.
		comicDir: target directory; derived from the title on the first page.

	Returns True when the comic is finished or was already downloaded.
	"""
	pq = helper.get('%s/%d' % (url, page))
	if page == 1:
		title = pq('title').text().replace('/', '&').split(' - ')[0]
		comicDir = os.path.join('animezilla', title)
		# Skip comics already uploaded or already fully downloaded.
		if os.path.exists(os.path.join('animezilla', '0uploaded', title, 'done.txt')):
			return True
		if os.path.exists(os.path.join(comicDir, 'done.txt')):
			return True
		helper.mkDir(comicDir)
	# NOTE(review): hard-coded special case — this one comic resumes at
	# page 90; reason not visible here, confirm before changing.
	if webPage == 1 and comicIndex == 16:
		if page < 90:
			return fetchComic(webPage, comicIndex, url, 90, comicDir)

	img = pq('img#comic')
	print('[%s] downloading webPage page => %d, comic index => %d, comic page => %d' % (helper.now(), webPage, comicIndex, page))
	downloadImg(img.attr('src'), os.path.join(comicDir, '%03d.jpg' % page), url)
	time.sleep(3)
	# The final page's image is not wrapped in a link — that marks the end.
	if(len(img.parents('a')) == 0):
		helper.writeFile('done', os.path.join(comicDir, 'done.txt'))
		return True
	return fetchComic(webPage, comicIndex, url, page + 1, comicDir)
Пример #27
0
def fetch_page(url, q, crawl_counter, gender, error_page_url_queue):
    """Spawn one PageSpider thread per listing page of `url`, then drain the
    resulting detail-URL queue with batches of GoodsSpider threads.

    Args:
        url: listing base URL; paging is appended as query parameters.
        q: queue the page spiders fill with product-detail URLs.
        crawl_counter: crawl-session counter passed through to GoodsSpider.
        gender: gender tag forwarded to both spider types.
        error_page_url_queue: queue that collects failed page URLs.
    """
    total_page = -1
    page = 1
    page_thread_list = []
    while True:
        page_url = '%s?ppg=104&page=%d' % (url, page)
        # Create and start one thread per listing page (throttled).
        time.sleep(1.2)
        page_spider = PageSpider(page_url, q, error_page_url_queue, gender)
        page_spider.start()
        page_thread_list.append(page_spider)

        if total_page < 0:
            # Read the page count once, from the first page's pagination info.
            pq = helper.get(page_url)
            div = pq('div.pagination_info')[0]
            total_page = int(div.text.strip().split('of ')[1])
        page += 1
        if page > total_page:
            # Next page would exceed the last one — stop.
            break
    for t in page_thread_list:
        t.join()

    goods_thread_list = []
    while True:
        queue_size = q.qsize()
        if queue_size > 0:
            # Start up to 5 goods-fetching threads per batch, then wait for
            # the whole batch before starting the next.
            for i in range(5 if queue_size > 5 else queue_size):
                goods_spider = GoodsSpider(q.get(), gender, q, crawl_counter)
                goods_spider.start()
                goods_thread_list.append(goods_spider)
            for t in goods_thread_list:
                t.join()
            goods_thread_list = []
        else:
            break
Пример #28
0
import helper
import constants
# https://api.enterprise.apigee.com/v1/organizations/yemhuynh-eval/environments/prod/caches

# Export every cache definition of each environment to env/<env>/caches.json.
for env in constants.ENVS:
    cache_names = helper.get("environments/" + env + "/caches")
    cache_list = [helper.get("environments/" + env + "/caches/" + name)
                  for name in cache_names]

    helper.dumpConfig("env/" + env + "/caches.json", cache_list)
Пример #29
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Author: [email protected]
# Date: 2017-07-16 15:12:36

import helper
import argparse

if __name__ == '__main__':
    # Crawl the first 98 search-result pages and hand every video link to you-get.
    search_url = 'http://www.soku.com/search_video/q_王者荣耀_orderby_1_limitdate_0?site=14&page=%d'
    for page in range(1, 99):
        pq = helper.get(search_url % page)
        for a in pq('.v-link > a'):
            helper.runCmd('you-get -o ./videos --format=mp4 %s' %
                          a.get('href'))
Пример #30
0
def fetch_model(url, name, head_img):
    '''Fetch one model's page: save the avatar, each photo-set cover and a
    JSON metadata file under vivthomas/.

    Args:
        url: model page URL.
        name: model display name (used in file names).
        head_img: avatar image URL.
    '''
    model_dir = os.path.join('vivthomas', 'model')
    helper.mkDir(model_dir)
    helper.mkDir(os.path.join('vivthomas', 'photo'))
    # Download the avatar first.
    helper.downloadImg(head_img, os.path.join(
        model_dir, '%s.jpg' % name))
    # Already scraped: the metadata JSON exists.
    if os.path.exists(os.path.join('vivthomas', 'model', '%s.json' % (name))):
        return
    # Then scrape the detailed data.
    model_info = {
        'name': name,
        'photos': []
    }
    pyquery = helper.get(url)
    country_span = pyquery('.custom-country')
    model_info['country'] = country_span.text() if country_span else 'unknow'

    # Collect the photo-set data: find the list whose heading starts
    # with 'Photos with'.
    custom_content_list = pyquery('.custom-content-list')
    custom_content = None
    for item in custom_content_list:
        if item.getchildren()[0].getchildren()[0].text.startswith('Photos with'):
            custom_content = item
            break
        # if item.getchildren()[0].getchildren()[0].text:
        #     pass
    if custom_content is None:
        # No photo sets: persist what we have and stop.
        helper.writeFile(json.dumps(model_info), os.path.join(
            'vivthomas', 'model', '%s.json' % (name)))
        return
    # if len(custom_content_list) == 3:
    #     custom_content = custom_content_list[1]
    # else:
    #     custom_content = custom_content_list[0]
    list_group_item_list = custom_content.getchildren()[2].findall('li')
    for list_group_item in list_group_item_list:
        custom_list_item_detailed = list_group_item.getchildren()[1]
        img = custom_list_item_detailed.getchildren()[0].getchildren()[
            0].getchildren()[0]
        # custom_list_item_detailed.getchildren()[1].getchildren()[0].getchildren()[0].text
        photo_name = img.get('alt')
        # Released: Feb 26, 2016
        date_str = custom_list_item_detailed.getchildren()[1].getchildren()[
            1].text_content().split(': ')[1]
        # Rewrite 'Feb 26, 2016' as '2016-2-26'.
        date_str = '%s-%d-%s' % (date_str.split(', ')[1], helper.getMonth(
            date_str.split(' ')[0]), date_str.split(' ')[1].replace(',', ''))
        # Model names — index 0 is skipped (presumably a label; confirm).
        arr = custom_list_item_detailed.getchildren()[1].getchildren()[
            2].getchildren()
        model_name_arr = []
        for i in xrange(1, len(arr)):
            model_name_arr.append(arr[i].text)
        # model_name = custom_list_item_detailed.getchildren()[1].getchildren()[2].getchildren()[1].text
        # print(model_name_arr)
        # date = datetime.datetime(int(date_str.split(', ')[1]), helper.getMonth(date_str.split(' ')[0]), int(date_str.split(' ')[1].replace(',', '')))
        # print date
        # Download the photo-set cover.
        photo_path = os.path.join('vivthomas', 'photo', '%s_%s' % (date_str, photo_name.replace(
            '/', ' ')), '%s_%s.jpg' % (date_str, photo_name.replace('/', ' ')))
        helper.downloadImg(img.get('src'), photo_path)
        # Save to the database.
        # mongo.newAlbun(photo_name, date)
        photo_json = {
            'date': date_str,
            'name': photo_name,
            'model': model_name_arr
        }
        photo_json_str = json.dumps(photo_json)
        model_info.get('photos').append(photo_json)
        # NOTE(review): unlike photo_path above, this path does NOT apply
        # .replace('/', ' ') to photo_name — confirm whether that is intended.
        helper.writeFile(photo_json_str, os.path.join(
            'vivthomas', 'photo', '%s_%s' % (date_str, photo_name), '%s_%s.json' % (date_str, photo_name)))
    helper.writeFile(json.dumps(model_info), os.path.join(
        'vivthomas', 'model', '%s.json' % (name)))