def get_image_path(brand_id):
    """
    Build the image directories used for a brand.

    The brand's English name is looked up through ``cm`` and normalised; both
    directories live under ``<base_path>/images/<brand_id>_<brand_name>``.

    :param brand_id: identifier passed to ``cm.fetch_brand_by_id``.
    :return: dict with the keys ``'full'`` and ``'thumb'``, each mapped to the
        corresponding sub-directory path.
    """
    brand_info = cm.fetch_brand_by_id(brand_id)
    brand_name = cm.norm_brand_name(brand_info['brandname_e'])
    # Both variants share the same per-brand root; only the leaf differs.
    brand_root = os.path.join(
        base_path, 'images', '{0}_{1}'.format(brand_id, brand_name))
    return {
        'full': os.path.join(brand_root, 'full'),
        'thumb': os.path.join(brand_root, 'thumb'),
    }
def update_tags_mapping(brand_id, region, tag_raw, tag_name, serialize=True):
    """
    Update the tags_mapping table.

    For a given region, records the mapping between a tag's raw form (as it
    appears in the page source) and the form used for display. The mapping is
    kept in the module-level ``tags_mapping`` dict and, optionally, written to
    ``<data_dir>/<brand_id>_<brand_name>_<region>_tags_mapping.json``.

    :param brand_id: brand identifier; also the key into ``tags_mapping``.
    :param region: region code; lower-cased before it is used in the file name.
    :param tag_raw: raw tag text (unicode values are encoded to UTF-8).
    :param tag_name: display tag text (unicode values are encoded to UTF-8).
    :param serialize: whether to write the updated mapping to the data file.
    """

    def as_utf8(value):
        # Store byte strings so the dump below can use ensure_ascii=False.
        return value.encode('utf-8') if isinstance(value, unicode) else value

    brand_name = cm.norm_brand_name(
        cm.fetch_brand_by_id(brand_id)['brandname_e'])
    data_dir = get_data_path(brand_id)
    region = region.lower()
    mapping_file = os.path.normpath(
        os.path.join(
            data_dir,
            '{0}_{1}_{2}_tags_mapping.json'.format(brand_id, brand_name,
                                                   region)))

    # NOTE(review): the in-memory cache is keyed by brand_id only, while the
    # file on disk is per region. Once one region is cached, other regions of
    # the same brand reuse (and serialize) that same dict - confirm intended.
    if brand_id in tags_mapping:
        data = tags_mapping[brand_id]
    else:
        try:
            with open(mapping_file, 'r') as f:
                data = json.load(f, encoding='utf-8')
        except (ValueError, IOError):
            # Missing, unreadable or malformed file: start from scratch.
            data = {}
        tags_mapping[brand_id] = data

    tag_raw = as_utf8(tag_raw)
    tag_name = as_utf8(tag_name)
    data[tag_raw] = tag_name

    if not serialize:
        return
    cm.make_sure_path_exists(data_dir)
    with open(mapping_file, 'w') as f:
        json.dump(data, f, ensure_ascii=False, encoding='utf-8')
def fetch_products(region, category, gender, refresh_post_data=False):
    """
    Crawl the product listings for one region / category / gender.

    The list of filter combinations (one POST payload each) is cached in a
    JSON file inside the brand's data directory. It is rebuilt through
    ``fetch_filter`` when the file is missing or ``refresh_post_data`` is set.
    Every combination not yet marked as processed is paged through until a
    page yields no products; each product URL not already seen during this
    call is passed to ``fetch_product_details``. Once a combination is done it
    is flagged as processed and the cache file is rewritten.

    :param region: region code; selects the data host and is stored in the tags.
    :param category: category id, sent as the form handler's ``pageId``.
    :param gender: gender id; sent in the form and appended to the host URL.
    :param refresh_post_data: rebuild the cached filter combinations even when
        the cache file already exists.
    """
    page_id_key = "/vuitton/ecommerce/commerce/catalog/FindProductsFormHandler.pageId"
    gender_key = "/vuitton/ecommerce/commerce/catalog/FindProductsFormHandler.gender"
    page_number_key = "/vuitton/ecommerce/commerce/catalog/FindProductsFormHandler.pageNumber"

    # Locate the cache file holding the filter combinations.
    brand_name = cm.norm_brand_name(
        cm.fetch_brand_by_id(brand_id)['brandname_e'])
    data_dir = get_data_path(brand_id)
    cm.make_sure_path_exists(data_dir)
    cache_name = '{0}_{1}_{2}_{3}_{4}.json'.format(
        brand_id, brand_name, category.replace('/', '_'), gender, region)
    fname = os.path.normpath(os.path.join(data_dir, cache_name))

    if os.path.isfile(fname) and not refresh_post_data:
        with open(fname, 'r') as f:
            filter_combinations = json.load(f, encoding='utf-8')
    else:
        logger.info(
            'Fetch filter set for {0}, {1}'.format(category,
                                                   gender).decode('utf-8'))
        seed = {
            'post_data': basic_query.copy(),
            'tags': {},
            'processed': False
        }
        filter_combinations = fetch_filter(region, category, gender, 0, seed)

        # Besides the filtered queries, also run the plain, unfiltered one.
        unfiltered = basic_query.copy()
        unfiltered[page_id_key] = category
        unfiltered[gender_key] = gender
        filter_combinations.append({
            'post_data': unfiltered,
            'processed': False,
            'tags': {}
        })
        with open(fname, 'w') as f:
            json.dump(filter_combinations, f)

    processed_urls = set()
    for filter_data in filter_combinations:
        # Skip POST payloads that were already handled.
        if filter_data['processed']:
            continue

        tags = filter_data['tags']
        for tag_key, tag_value in (('brand_id', 10226),
                                   ('brandname_e', 'Louis Vuitton'),
                                   ('brandname_c', '路易威登'),
                                   ('category', category),
                                   ('gender', gender),
                                   ('region', region)):
            tags[tag_key] = tag_value

        def post_listing(target):
            # Reads filter_data['post_data'] at call time, so it always sends
            # the page number set just before the request.
            return cm.post_data(
                url=target, data=filter_data['post_data'], client="iPad")

        page = 1
        while True:
            filter_data['post_data'][page_number_key] = page
            response = cm.retry_helper(
                post_listing,
                param=hosts["data_host"][region] + gender,
                logger=logger,
                except_class=(URLError, socket.timeout),
                retry_delay=10)
            if response is None:
                # NOTE(review): the same page is requested again with no
                # upper bound on attempts - confirm this is intended.
                continue

            # Extract the list of products on this page.
            product_list = pq(response['body'])('li[data-url]')
            if len(product_list) == 0:
                break
            logger.info(
                '{0} products found at page {1}'.format(
                    len(product_list), page).decode('utf-8'))
            page += 1

            for item in product_list:
                url = item.attrib['data-url']
                # Ignore entries whose URL has no trailing dash-free segment.
                if re.search(r'[^-]+$', url) is None:
                    continue
                if url in processed_urls:
                    continue
                processed_urls.add(url)
                fetch_product_details(region, url, filter_data)

        # Persist progress so this combination is skipped on later runs.
        filter_data['processed'] = True
        with open(fname, 'w') as f:
            json.dump(filter_combinations, f)
def fetch_image(body, model, refetch=False):
    """
    Download the images referenced by a product-detail page.

    Every ``<img>`` in the page's slideshow is mapped to its "tablet /
    productMain" rendition URL, downloaded, and stored in the brand's ``full``
    image directory under a name derived from the SHA-1 of that URL.

    Nothing is written to the database here; the returned records carry what
    a caller needs to persist the images.

    :param body: HTML source of the product-detail page.
    :param model: product model code (currently unused).
    :param refetch: whether to force re-downloading (currently unused; every
        image is downloaded on each call).
    :return: list of ``['True', {'checksum', 'url', 'path'}]`` entries, one per
        stored image; ``checksum`` is the MD5 hex digest of the image bytes and
        ``path`` is relative (``full/<file name>``).
    """
    image_dirs = get_image_path(brand_id)
    image_dir = image_dirs['full']
    cm.make_sure_path_exists(image_dir)
    cm.make_sure_path_exists(image_dirs['thumb'])

    results = []

    # The rendition name is defined once per page, so look it up a single
    # time instead of re-scanning the whole page for every image.
    rendition = re.search(
        r'RENDITIONS\["tablet"\]\["productMain"\]\s*=\s*\'([^\']+)\'', body)
    if rendition is None:
        return results
    jcr = rendition.group(1)

    for img_node in pq(body)('#productSheetSlideshow ul.bigs li img'):
        attrib = img_node.attrib
        src = attrib['data-src'] if 'data-src' in attrib else attrib.get(
            'src', '')
        if not src:
            # Without a source there is nothing to build a rendition URL from.
            continue

        base_name = os.path.splitext(os.path.split(src)[1])[0]
        # Relative sources are resolved against the image host.
        if re.match(r'https?://', src):
            url = src
        else:
            url = hosts['image_host'] + src
        if re.search(r'[^/]+$', url) is None:
            continue
        url_thumb = u'{0}/jcr:content/renditions/{1}_{2}.jpg'.format(
            url, base_name, jcr)

        # Use the shared downloader. The bare name ``fetch_image`` resolves to
        # this very function, which returns a list, not a response dict.
        # NOTE(review): assumes cm.fetch_image(url, logger) returns a dict
        # with 'body' and 'image_ext' - confirm against the cm module.
        response = cm.fetch_image(url_thumb, logger)
        if response is None or len(response['body']) == 0:
            continue

        # Write the image file.
        buf = response['body']
        fname = '{0}.{1}'.format(
            hashlib.sha1(url_thumb.encode('utf-8')).hexdigest(),
            response['image_ext'])
        full_name = os.path.normpath(os.path.join(image_dir, fname))
        with open(full_name, 'wb') as f:
            f.write(buf)

        results.append([
            'True', {
                'checksum': hashlib.md5(buf).hexdigest(),
                'url': url_thumb,
                'path': 'full/{0}'.format(fname)
            }
        ])

    return results
def get_data_path(brand_id):
    """
    Return the data directory used for a brand.

    The brand's English name is looked up through ``cm`` and normalised; the
    result is the normalised path ``<base_path>/data/<brand_id>_<brand_name>``.

    :param brand_id: identifier passed to ``cm.fetch_brand_by_id``.
    :return: normalised directory path.
    """
    brand_info = cm.fetch_brand_by_id(brand_id)
    brand_name = cm.norm_brand_name(brand_info['brandname_e'])
    folder = '{0}_{1}'.format(brand_id, brand_name)
    data_dir = os.path.join(base_path, 'data', folder)
    return os.path.normpath(data_dir)