예제 #1
0
 def upload(self, path):
     """
         cos_path:/news/jiemian/image/
     :param path
     :return:
     """
     counter = 0
     url = ''
     while counter != 10:
         try:
             # 得到hash
             uploadName = path.replace('image\\', '')
             request = UploadFileRequest(u"crawler", self.cos_path + uploadName,
                                         self.local_path + path,
                                         insert_only=0)
             upload_file_ret = self.cos_client.upload_file(request)
             if upload_file_ret['code'] == 0:
                 data = upload_file_ret['data'] or {}
                 url = data['source_url']
                 print u'上传成功 ' + url
             else:
                 print u'上传图片失败', upload_file_ret
             break
         except Exception as e:
             counter += 1
             TimerUtil.sleep(10)
     return url
예제 #2
0
 def parseResult(self, response):
     """
     Route a crawled response according to its HTTP status and final URL.

     * 404: record ``Status_no_source`` and stop.
     * 403: record ``Status_be_forbid``, request a new IP, pause 60 seconds
       and stop.
     * otherwise the final (post-redirect) URL selects the handling:
       product pages go to ``self.parseDetail1``; ``goods.php`` pages are
       recorded as ``Status_dont_need_parse``; anything else is recorded as
       ``Status_no_parse_method``.

     :param response: response object exposing ``status``, ``url`` and
         ``meta['haoYaoShiId']``
     :return: the truthy result of ``parseDetail1`` for product pages,
         otherwise ``None``
     """
     http_status = response.status
     record_id = response.meta['haoYaoShiId']
     final_url = response.url
     self.logWarn(u'haoyaoshi_id: %d 请求状态%d %s' %
                  (record_id, http_status, final_url))
     dao = self.statusDao

     # Not found: nothing to parse for this record.
     if http_status == 404:
         dao.updateStatus(record_id, dao.Status_no_source)
         return None

     # Forbidden: record it, switch IP and back off before continuing.
     if http_status == 403:
         dao.updateStatus(record_id, dao.Status_be_forbid)
         NetworkUtil.getNewIp()
         TimerUtil.sleep(60)
         return None

     # Choose the handling from the final URL (after any redirect).
     if 'http://www.ehaoyao.com/product' in final_url:
         parsed = self.parseDetail1(response)
         return parsed if parsed else None

     if 'http://www.ehaoyao.us/goods.php' in final_url:
         # Status change: this page does not need to be processed.
         outcome = dao.Status_dont_need_parse
     else:
         # Status change: no parser exists for this kind of page.
         outcome = dao.Status_no_parse_method
     dao.updateStatus(record_id, outcome)
예제 #3
0
    def process_item(self, item, spider):
        """
        Download every image listed in ``item['image_urls']`` to local disk.

        Each entry must expose ``.get('url')``.  An image is stored as
        ``<self.savePath>/full/<md5 of url>.jpg``; a file that already exists
        is not downloaded again.  For every URL a result dict is collected:
        ``{'ok': True, 'x': {'url', 'path'}}`` when the file is on disk, or
        ``{'ok': False, 'x': {'url'}}`` when the download failed.  The method
        pauses 2 seconds after each image.

        NOTE(review): the visible code neither returns ``item`` nor stores
        the collected results - confirm whether the method continues beyond
        this excerpt.

        :param item: mapping with an ``image_urls`` iterable
        :param spider: object providing ``logDao.info`` / ``logDao.warn``
        """
        image_urls = []
        # The target directory is the same for every image.
        saveDir = self.savePath + '/full'
        for image_url in item['image_urls']:
            url = image_url.get('url')
            urlHash = EncryptUtil.md5(url)
            path = 'full/' + str(urlHash) + '.jpg'
            detailPath = self.savePath + '/' + path
            # Create the target directory on demand.
            if not FileUtil.dirIsExist(saveDir):
                FileUtil.createDir(saveDir)

            # The two possible outcomes for this URL.
            success = {'ok': True, 'x': {'url': url, 'path': path}}
            failure = {'ok': False, 'x': {'url': url}}

            if FileUtil.fileIsExist(detailPath):
                spider.logDao.info(u'图片已经存在本地:' + url)
                image_url_new = success
            else:
                try:
                    fileResponse = requests.get(url, timeout=10)
                    if fileResponse.status_code == 200:
                        # "with" guarantees the handle is flushed and closed
                        # even if the write fails part-way.
                        with open(detailPath, 'wb') as imageFile:
                            imageFile.write(fileResponse.content)
                        image_url_new = success
                        spider.logDao.info(u'图片成功下载:' + url)
                    else:
                        spider.logDao.info(u'下载图片失败:' + url)
                        image_url_new = failure
                except Exception as e:
                    # Best effort: report the error and mark this image as
                    # failed so the remaining images are still processed.
                    print(e)
                    spider.logDao.warn(u'下载图片失败:' + url)
                    image_url_new = failure
            image_urls.append(image_url_new)
            # Idle for 2s between images.
            TimerUtil.sleep(2)
예제 #4
0
    def wait_utils_env_ok(self):
        """
        Block until the network check and then the service check both pass.

        Each check is polled in turn; after a failed check the method sleeps
        20 seconds and logs a warning before polling again.

        :return: always ``True`` once both checks have succeeded
        """
        # Phase 1: wait for the network check to pass.
        while True:
            if NetworkUtil.checkNetWork():
                break
            # Re-check every 20s.
            TimerUtil.sleep(20)
            self.logWarn(u'检测网络不可行')

        # Phase 2: wait for the service check to pass.
        while True:
            if NetworkUtil.checkService():
                break
            # Re-check every 20s.
            TimerUtil.sleep(20)
            self.logWarn(u'检测服务器不可行')

        return True
예제 #5
0
    def downloadSpecific(self, haoYaoShiId):
        """
        Fetch the instruction leaflet ("specific") data of a product.

        The endpoint is requested up to 11 times, sleeping 15 seconds after
        every non-200 response.  A 200 response ends the polling: when the
        JSON payload's ``code`` equals 1, ``data.specificInfo`` is returned
        re-serialised as a JSON string (non-ASCII characters kept as-is);
        otherwise ``''`` is returned.  Any exception is logged through
        ``self.logWarn`` and also yields ``''``.

        :param haoYaoShiId: product id interpolated into the request URL
        :return: JSON string of ``specificInfo``, or ``''`` on any failure,
            including running out of attempts
        """
        # The URL does not change between attempts, so build it once.
        specificUrl = ('http://www.ehaoyao.com/meal/%s/specific?_=1508406771571'
                       % (haoYaoShiId,))
        for _ in range(11):
            # Fetch the instruction leaflet.
            try:
                # Without a timeout a stalled connection would block forever.
                result = requests.get(specificUrl, timeout=10)
                if result.status_code == 200:
                    content = json.loads(result.content)
                    if content.get('code') == 1:
                        specificInfo = content.get('data', {}).get(
                            'specificInfo', '')
                        return json.dumps(specificInfo, ensure_ascii=False)
                    return ''
            except Exception as e:
                self.logWarn('downloadSpecific:' + str(e))
                return ''

            TimerUtil.sleep(15)
        # All attempts returned a non-200 status: keep the return type
        # consistent with the other failure paths instead of returning None.
        return ''
예제 #6
0
def downLoadImage(image_url_sources):
    """
    Download images into an ``image`` directory next to this source file.

    Each entry of *image_url_sources* must expose ``.get('url')``.  A file
    is named ``<md5 of url>.jpg``; one that already exists locally is not
    downloaded again.  A downloaded image larger than 100 KB is rewritten
    in place through ``ImageCompressUtil().resizeImg`` with a 600x600
    target and quality 80.  For every URL a result dict is collected:
    ``{'ok': True, 'x': {'url', 'path', 'fileName'}}`` when the file is on
    disk, or ``{'ok': False, 'x': {'url'}}`` when the download failed.  The
    function pauses 2 seconds after each image.

    :param image_url_sources: iterable of mappings with a ``url`` key
    """
    image_urls = []
    # The target directory is the same for every image, so resolve it once.
    file_path = os.path.dirname(os.path.realpath(__file__)) + u'/image'
    for image_url in image_url_sources:
        if not os.path.isdir(file_path):
            os.mkdir(file_path)
        url = image_url.get('url')
        print(url)
        urlHash = EncryptUtil.md5(url)
        fileName = str(urlHash) + '.jpg'
        detailPath = file_path + '\\' + fileName

        # The two possible outcomes for this URL.
        success = {
            'ok': True,
            'x': {
                'url': url,
                'path': detailPath,
                'fileName': fileName
            }
        }
        failure = {'ok': False, 'x': {'url': url}}

        if FileUtil.fileIsExist(detailPath):
            print(u'图片已经存在本地:' + url)
            image_url_new = success
        else:
            try:
                fileResponse = requests.get(url, timeout=10)
                if fileResponse.status_code == 200:
                    # Close (and flush) the file before the compressor
                    # re-opens the same path below.
                    with open(detailPath, 'wb') as imageFile:
                        imageFile.write(fileResponse.content)
                    # Larger than 100kb: shrink to 600x600 at quality 80.
                    if len(fileResponse.content) > 100 * 1024:
                        # Target image size.
                        dst_w = 600
                        dst_h = 600
                        # Quality of the saved image.
                        save_q = 80
                        ImageCompressUtil().resizeImg(ori_img=detailPath,
                                                      dst_img=detailPath,
                                                      dst_w=dst_w,
                                                      dst_h=dst_h,
                                                      save_q=save_q)
                    image_url_new = success
                    # Sample URL:
                    # http://p0.ifengimg.com/pmop/2017/1010/E66C2599CE9403A670AD405F4CCAB271B366D7DC_size415_w1290_h692.png
                    print(u'图片成功下载,大小:' + str(
                        len(fileResponse.content) // 1024) + 'kb ' + url)
                    print(u'最终存储图片,大小:' + str(
                        os.path.getsize(detailPath) // 1024) + 'kb ' + url)
                else:
                    print(u'下载图片失败:' + url)
                    image_url_new = failure
            except Exception:
                # Best effort: mark this image as failed and carry on with
                # the remaining ones.
                print(u'下载图片失败:' + url)
                image_url_new = failure
        image_urls.append(image_url_new)
        # Idle for 2s between images.
        TimerUtil.sleep(2)