def parse_content(self, response):
    """Parse a design-strategy JSON response and yield a single item.

    The response body is expected to be JSON with a ``strategyModel`` object
    carrying ``categoryName``, ``title``, ``description`` and ``context``.

    :param response: object exposing ``text``, ``url`` and ``meta``
        (presumably a Scrapy response -- confirm against the caller).
    :return: generator yielding at most one ``DesignStrategyItem``.

    If the body is not valid JSON it is printed and nothing is yielded.
    Any error while building the item is printed and logged; when
    ``config.USE_PROXY`` is set and the request carried a proxy, that proxy
    is reported to ``proxy_pool`` as failed.
    """
    try:
        data = json.loads(response.text)
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass; catching only this
        # keeps KeyboardInterrupt/SystemExit from being silently swallowed.
        print("-----------------------获取到json:" + response.text + "------------------------------")
        return

    try:
        model = data['strategyModel']
        category = model['categoryName']
        title = model['title']
        description = model['description']
        content = model['context']

        item = DesignStrategyItem()  # type: DesignStrategyItem
        item['category'] = category
        item['title'] = title
        item['description'] = description
        item['content'] = content
        item['html_url'] = response.url
        yield item
    except Exception as e:
        print("-----------------------获取到json:" + response.text + "------------------------------")
        log.warn("%s ( refer: %s )" % (e, response.url))
        if config.USE_PROXY:
            # The request may have been sent without a proxy; a missing key
            # must not raise a second exception from inside this handler.
            proxy = response.meta.get('proxy')
            if proxy:
                proxy_pool.add_failed_time(proxy.replace('http://', ''))
def process_exception(self, request, exception, spider):
    """Record a failure for the proxy used by ``request``.

    The proxy address is read from ``request.meta['proxy']`` with the
    ``http://`` prefix removed and passed to ``proxy_pool.add_failed_time``.
    Any error raised while doing so (for example a request without a
    ``proxy`` entry) is logged through ``log.error`` and otherwise ignored.

    ``exception`` and ``spider`` are accepted but not used. Returns ``None``.
    """
    try:
        proxy_url = request.meta['proxy']
        proxy_address = proxy_url.replace('http://', '')
        proxy_pool.add_failed_time(proxy_address)
    except Exception as err:
        log.error(err)
def process_response(self, request, response, spider):
    """Pass the response through, penalising the proxy on a bad status.

    A status outside ``[200, 400)`` counts as a failure: the address in
    ``request.meta['proxy']`` (without the ``http://`` prefix) is reported
    to ``proxy_pool.add_failed_time``. A ``KeyError`` (for example no
    ``proxy`` entry on the request) is ignored.

    :return: the ``response`` object, unchanged, in every case.
    """
    if 200 <= response.status < 400:
        return response

    try:
        proxy_url = request.meta['proxy']
        proxy_pool.add_failed_time(proxy_url.replace('http://', ''))
    except KeyError:
        # Request was presumably sent without a proxy; nothing to report.
        pass
    return response
def process_response(self, request, response, spider):
    """Print every response and penalise the proxy on a bad status.

    A line with the request URL and response status is printed for every
    response. When the status is outside ``[200, 400)`` the proxy address
    is reported to ``proxy_pool.add_failed_time``. The address comes from
    ``request.meta['splash']['args']['proxy']`` when the request has a
    ``splash`` meta entry, otherwise from ``request.meta['proxy']``; the
    ``http://`` prefix is removed in both cases.

    A ``KeyError`` is ignored; any other exception is logged through
    ``log.error``.

    :return: the ``response`` object, unchanged, in every case.
    """
    message = "CatchException:" + request.url + " " + str(response.status)
    print(message)

    if 200 <= response.status < 400:
        return response

    try:
        meta = request.meta
        if 'splash' in meta:
            proxy_url = meta['splash']['args']['proxy']
        else:
            proxy_url = meta['proxy']
        proxy_pool.add_failed_time(proxy_url.replace('http://', ''))
    except KeyError:
        # No proxy recorded on this request; nothing to report.
        pass
    except Exception as err:
        log.error(err)
    return response
def download_img(img_url, file_path, timeout=30):
    """Download ``img_url`` to ``file_path``, optionally through a pool proxy.

    When ``config.USE_PROXY`` is set, a proxy is taken from
    ``proxy_pool.random_choice_proxy()`` and used for ``http`` requests.
    The proxy is reported to ``proxy_pool.add_failed_time`` if the download
    does not complete with status 200.

    :param img_url: URL of the image to fetch.
    :param file_path: destination path; written in binary mode.
    :param timeout: value passed to ``requests.get(timeout=...)`` so a
        stalled peer cannot block the caller forever. Pass ``None`` to
        wait indefinitely, which was the previous behaviour.
    :return: ``None``. The download is best-effort: errors are swallowed.

    NOTE(review): a failure midway through the body leaves a partially
    written file at ``file_path`` -- confirm whether callers rely on that.
    """
    proxy = ''
    proxies = None
    if config.USE_PROXY:
        proxy = proxy_pool.random_choice_proxy()
        proxies = {
            'http': "http://%s" % proxy,
        }

    downloaded = False
    try:
        response = requests.get(
            img_url, stream=True, proxies=proxies, timeout=timeout)
        try:
            if response.status_code == 200:
                with open(file_path, 'wb') as f:
                    for chunk in response.iter_content(1024):
                        f.write(chunk)
                downloaded = True
        finally:
            # With stream=True the connection is only released once the
            # body is consumed or the response is explicitly closed.
            response.close()
    except Exception:
        # Deliberately best-effort. ``Exception`` rather than a bare except
        # so KeyboardInterrupt/SystemExit still propagate.
        downloaded = False

    if not downloaded and config.USE_PROXY:
        proxy_pool.add_failed_time(proxy)
def parse_content(self, response):
    """Parse an album JSON response and yield one item per picture.

    The body is expected to be JSON with a ``dataImg`` list. Each entry has
    an ``album`` list whose elements hold the picture record under ``l``.
    A picture record supplies ``s`` (short file name), ``t`` (sub title),
    ``w``/``h`` (dimensions) and optional ids (``zid``, ``sid``, ``a``,
    ``coid``, ``hxid``, ``pid``) that are mapped to tag strings through the
    module-level lookup tables.

    :param response: object exposing ``text``, ``url`` and ``meta``;
        ``meta['title']`` is required.
    :return: generator of ``DesignPictureItem``. All items from one
        response share the same ``fid``.

    If the body is not valid JSON it is printed and nothing is yielded.
    Errors while building an item are printed and logged; when
    ``config.USE_PROXY`` is set and the request carried a proxy, that proxy
    is reported to ``proxy_pool`` as failed.
    """
    fid = utils.get_uuid()
    title = response.meta['title']
    try:
        data = json.loads(response.text)
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass; catching only this
        # keeps KeyboardInterrupt/SystemExit from being silently swallowed.
        print("-----------------------获取到json:" + response.text + "------------------------------")
        return

    # (lookup table, key in the picture record) pairs, in the order their
    # tags are appended.
    tag_sources = (
        (ZONE_TYPE, 'zid'),
        (STYLE_ID, 'sid'),
        (AREA, 'a'),
        (COLOR_ID, 'coid'),
        (HX_ID, 'hxid'),
        (PART_ID, 'pid'),
    )

    for _data_img in data['dataImg']:
        for data_album in _data_img['album']:
            data_img = data_album['l']
            # e.g. http://pic.to8to.com/case/1605/05/20160505_f0af86a239d0b02e9635a47ih5l1riuq_sp.jpg
            img_url = 'http://pic.to8to.com/case/{short_name}'.format(
                short_name=data_img['s'])
            if self.design_picture_service.is_duplicate_url(img_url):
                # NOTE(review): this skips the rest of the album, not just
                # this picture -- confirm that is intended.
                break
            sub_title = data_img['t']
            original_width = data_img['w']
            original_height = data_img['h']

            tags = []
            for table, key in tag_sources:
                try:
                    tag = table[data_img[key]]
                except KeyError:
                    # Id missing from the record or unknown to the table.
                    continue
                # Both conditions must hold. The previous ``or`` appended
                # empty strings and raised AttributeError on ``None``.
                if tag is not None and tag.strip() != '':
                    tags.append(tag)

            try:
                item = DesignPictureItem()  # type: DesignPictureItem
                item['fid'] = fid
                item['html_url'] = response.url
                item['img_url'] = img_url
                item['tags'] = tags
                item['title'] = title
                item['sub_title'] = sub_title
                item['img_width'] = str(original_width)
                item['img_height'] = str(original_height)
                item['description'] = item['title']
                yield item
            except Exception as e:
                print("-----------------------获取到json:" + response.text + "------------------------------")
                log.warn("%s ( refer: %s )" % (e, response.url))
                if config.USE_PROXY:
                    # The request may have been sent without a proxy; a
                    # missing key must not raise from inside this handler.
                    proxy = response.meta.get('proxy')
                    if proxy:
                        proxy_pool.add_failed_time(
                            proxy.replace('http://', ''))