def process_object_image(self, task_name, collection, obj, image_field,
                         image_url, base_dir, ext='jpg', skip_existing=True):
    """Register an already-stored image on ``obj`` or schedule its download.

    The local file location is ``base_dir`` joined with whatever
    ``hashed_path(image_url, ext=ext)`` returns.

    :param task_name: name given to the download ``Task`` when one is queued.
    :param collection: object exposing ``update(filter, changes)``
        (presumably a MongoDB collection -- confirm against the caller).
    :param obj: document being processed; only ``obj['_id']`` is read here,
        the whole object is forwarded to the task.
    :param image_field: prefix of the ``<image_field>_path`` and
        ``<image_field>_url`` fields written on the document.
    :param image_url: URL of the image.
    :param base_dir: directory under which image files are stored.
    :param ext: file extension handed to ``hashed_path``.
    :param skip_existing: when false, a download task is queued even if the
        file is already on disk.
    """
    path = os.path.join(base_dir, hashed_path(image_url, ext=ext))
    already_stored = os.path.exists(path) and skip_existing

    if not already_stored:
        # File missing (or a re-download was requested): hand over to the spider.
        download = Task(
            task_name,
            url=image_url,
            obj=obj,
            disable_cache=True,
            image_field=image_field,
            collection=collection,
            base_dir=base_dir,
            ext=ext,
        )
        self.add_task(download)
        return

    # File is on disk: just store its location and source URL on the document.
    changes = {
        '%s_path' % image_field: path,
        '%s_url' % image_field: image_url,
    }
    collection.update({'_id': obj['_id']}, {'$set': changes})
def handler(self, collection, obj, set_field, base_dir, task_args=None, grab_args=None, callback=None):
    """Yield download tasks for every image in ``obj[set_field]`` not yet on disk.

    For each image dict in the set, the target file path is computed with
    ``hashed_path(image["url"], base_dir=base_dir)``:

    * if the file exists and the stored ``path`` differs (or is absent), the
      matching array element in ``db[collection]`` is updated with the path;
    * otherwise a ``Task`` carrying a configured ``Grab`` instance is yielded.

    :param collection: key used to look up the collection in ``db``.
    :param obj: document that owns the image set; ``obj["_id"]`` identifies it.
    :param set_field: name of the list field holding image dicts (each needs
        a ``"url"`` key; ``"path"`` is optional).
    :param base_dir: base directory passed to ``hashed_path``.
    :param task_args: optional dict of extra ``Task`` keyword arguments; it is
        deep-copied so tasks do not share mutable state.
    :param grab_args: optional dict of extra ``Grab.setup`` options.
    :param callback: task callback; defaults to ``image_set_handler``.
    """
    for image in obj.get(set_field, []):
        url = image["url"]
        path = hashed_path(url, base_dir=base_dir)
        if os.path.exists(path):
            # ``.get`` so that an image downloaded earlier but never recorded
            # (no "path" key yet) is repaired instead of raising KeyError.
            if path != image.get("path"):
                db[collection].update(
                    # Matching on "<set_field>.url" lets the positional ``$``
                    # below resolve to the array element for this image.
                    {"_id": obj["_id"], ("%s.url" % set_field): url},
                    # The key must be interpolated with ``set_field``; a bare
                    # "%s.$.path" would address a field literally named "%s".
                    {"$set": {("%s.$.path" % set_field): path}},
                )
        else:
            kwargs = deepcopy(task_args) if task_args else {}
            g = Grab()
            g.setup(url=url)
            if grab_args:
                g.setup(**grab_args)
            # Referer is applied last, so it overrides any referer in grab_args.
            g.setup(referer=build_image_hosting_referer(url))
            yield Task(
                callback=callback or image_set_handler,
                grab=g,
                collection=collection,
                path=path,
                obj=obj,
                image=image,
                set_field=set_field,
                disable_cache=True,
                backup=g.dump_config(),
                **kwargs
            )
def handler(self, url, collection, obj, path_field, base_dir, task_args=None, grab_args=None, callback=None):
    """Sync the stored path of a single image, or yield a task to download it.

    The target file path comes from ``hashed_path(url, base_dir=base_dir)``.
    When that file exists, ``db[collection]`` is updated only if
    ``obj[path_field]`` is missing or different. When it does not exist, one
    ``Task`` with a prepared ``Grab`` instance is yielded.

    :param url: image URL.
    :param collection: key used to look up the collection in ``db``.
    :param obj: document owning the image; ``obj["_id"]`` identifies it.
    :param path_field: document field that stores the local file path.
    :param base_dir: base directory passed to ``hashed_path``.
    :param task_args: optional dict of extra ``Task`` keyword arguments
        (deep-copied before use).
    :param grab_args: optional dict of extra ``Grab.setup`` options.
    :param callback: task callback; defaults to ``image_handler``.
    """
    path = hashed_path(url, base_dir=base_dir)

    if not os.path.exists(path):
        extra = deepcopy(task_args) if task_args else {}

        grab = Grab()
        grab.setup(url=url)
        if grab_args:
            grab.setup(**grab_args)
        # Applied after grab_args, so this referer always wins.
        grab.setup(referer=build_image_hosting_referer(url))

        yield Task(
            callback=callback or image_handler,
            grab=grab,
            collection=collection,
            path=path,
            obj=obj,
            path_field=path_field,
            disable_cache=True,
            backup=grab.dump_config(),
            **extra
        )
    elif path != obj.get(path_field, None):
        # File already on disk but the document does not point at it yet.
        db[collection].update(
            {"_id": obj["_id"]},
            {"$set": {path_field: path}},
        )
def handler(self, collection, obj, set_field, base_dir, task_args=None, grab_args=None, callback=None):
    """Yield download tasks for every image in ``obj[set_field]`` not yet on disk.

    For each image dict in the set, the target file path is computed with
    ``hashed_path(image['url'], base_dir=base_dir)``:

    * if the file exists and the stored ``path`` differs (or is absent), the
      matching array element in ``db[collection]`` is updated with the path;
    * otherwise a ``Task`` carrying a configured ``Grab`` instance is yielded.

    :param collection: key used to look up the collection in ``db``.
    :param obj: document that owns the image set; ``obj['_id']`` identifies it.
    :param set_field: name of the list field holding image dicts (each needs
        a ``'url'`` key; ``'path'`` is optional).
    :param base_dir: base directory passed to ``hashed_path``.
    :param task_args: optional dict of extra ``Task`` keyword arguments; it is
        deep-copied so tasks do not share mutable state.
    :param grab_args: optional dict of extra ``Grab.setup`` options.
    :param callback: task callback; defaults to ``image_set_handler``.
    """
    from database import db

    for image in obj.get(set_field, []):
        url = image['url']
        path = hashed_path(url, base_dir=base_dir)
        if os.path.exists(path):
            # ``.get`` so that an image whose file is present but whose entry
            # has no 'path' key yet is repaired instead of raising KeyError
            # (mirrors the ``obj.get(path_field)`` check used for single images).
            if path != image.get('path'):
                db[collection].update(
                    # Matching on '<set_field>.url' lets the positional ``$``
                    # below resolve to the array element for this image.
                    {'_id': obj['_id'], ('%s.url' % set_field): url},
                    {'$set': {('%s.$.path' % set_field): path}})
        else:
            kwargs = deepcopy(task_args) if task_args else {}
            g = Grab()
            g.setup(url=url)
            if grab_args:
                g.setup(**grab_args)
            # Referer is applied last, so it overrides any referer in grab_args.
            g.setup(referer=build_image_hosting_referer(url))
            yield Task(
                callback=callback or image_set_handler,
                grab=g,
                collection=collection,
                path=path,
                obj=obj,
                image=image,
                set_field=set_field,
                disable_cache=True,
                backup=g.dump_config(),
                **kwargs
            )
def process_object_image(self, task_name, collection, obj, image_field,
                         image_url, base_dir, ext='jpg', skip_existing=True):
    """Store the image location on ``obj`` if the file exists, else queue a task.

    The file location is ``os.path.join(base_dir, hashed_path(image_url,
    ext=ext))``. If that file exists and ``skip_existing`` is true, the
    document matched by ``obj['_id']`` gets ``<image_field>_path`` and
    ``<image_field>_url`` set through ``collection.update``. In every other
    case a ``Task`` named ``task_name`` is added via ``self.add_task``.

    :param task_name: name of the ``Task`` to queue.
    :param collection: object exposing ``update(filter, changes)``
        (presumably a MongoDB collection -- confirm against the caller).
    :param obj: document being processed.
    :param image_field: prefix for the two fields written on the document.
    :param image_url: URL of the image.
    :param base_dir: directory under which image files are stored.
    :param ext: file extension handed to ``hashed_path``.
    :param skip_existing: set to false to force a download task.
    """
    relative = hashed_path(image_url, ext=ext)
    path = os.path.join(base_dir, relative)

    if os.path.exists(path) and skip_existing:
        selector = {'_id': obj['_id']}
        path_key = '%s_path' % image_field
        url_key = '%s_url' % image_field
        collection.update(
            selector,
            {'$set': {path_key: path, url_key: image_url}},
        )
    else:
        task_options = dict(
            url=image_url,
            obj=obj,
            disable_cache=True,
            image_field=image_field,
            collection=collection,
            base_dir=base_dir,
            ext=ext,
        )
        self.add_task(Task(task_name, **task_options))
def save_hash(self, location, basedir, ext=None):
    """
    Save the response body to a file whose path is built from a hash.

    Spreading files over hash-named sub-directories keeps the number of
    files per directory low.

    :param location: URL of the file or any other string; it is passed to
        ``hashed_path`` to build the file path. Unicode values are encoded
        to UTF-8 first.
    :param basedir: base directory to save the file under. The file is not
        written directly into this directory but into a sub-directory of it.
    :param ext: extension to append to the file name (passed to
        ``hashed_path``).
    :returns: path of the saved file relative to `basedir`. If the file
        already exists it is left untouched and the same path is returned.

    Example::

        >>> url = 'http://yandex.ru/logo.png'
        >>> g.go(url)
        >>> g.response.save_hash(url, 'some_dir', ext='png')
        'e8/dc/f2918108788296df1facadc975d32b361a6a.png'
        # the file was saved to $PWD/some_dir/e8/dc/...

    TODO: replace `basedir` with two options: root and save_to. And
    returns save_to + path
    """
    if isinstance(location, unicode):
        location = location.encode('utf-8')

    rel_path = hashed_path(location, ext=ext)
    path = os.path.join(basedir, rel_path)

    # Nothing to do when the file was saved by an earlier call.
    if os.path.exists(path):
        return rel_path

    try:
        os.makedirs(os.path.dirname(path))
    except OSError:
        # Best effort (e.g. the directory already exists); a real problem
        # surfaces when the file is opened below.
        pass

    with open(path, 'wb') as out:
        body = self._cached_body
        if isinstance(body, unicode):
            body = body.encode('utf-8')
        out.write(body)

    return rel_path
def handler(self, collection, obj, set_field, base_dir, task_args=None, grab_args=None, callback=None):
    """Yield download tasks for every image in ``obj[set_field]`` not yet on disk.

    For each image dict in the set, the target file path is computed with
    ``hashed_path(image['url'], base_dir=base_dir)``:

    * if the file exists and the stored ``path`` differs (or is absent), the
      matching array element in ``db[collection]`` is updated with the path;
    * otherwise a ``Task`` carrying a configured ``Grab`` instance is yielded.

    :param collection: key used to look up the collection in ``db``.
    :param obj: document that owns the image set; ``obj['_id']`` identifies it.
    :param set_field: name of the list field holding image dicts (each needs
        a ``'url'`` key; ``'path'`` is optional).
    :param base_dir: base directory passed to ``hashed_path``.
    :param task_args: optional dict of extra ``Task`` keyword arguments; it is
        deep-copied so tasks do not share mutable state.
    :param grab_args: optional dict of extra ``Grab.setup`` options.
    :param callback: task callback; defaults to ``image_set_handler``.
    """
    from database import db

    for image in obj.get(set_field, []):
        url = image['url']
        path = hashed_path(url, base_dir=base_dir)
        if os.path.exists(path):
            # ``.get`` so that an image downloaded earlier but never recorded
            # (no 'path' key yet) is repaired instead of raising KeyError.
            if path != image.get('path'):
                db[collection].update(
                    # Matching on '<set_field>.url' lets the positional ``$``
                    # below resolve to the array element for this image.
                    {'_id': obj['_id'], ('%s.url' % set_field): url},
                    # The key must be interpolated with ``set_field``; a bare
                    # '%s.$.path' would address a field literally named '%s'.
                    {'$set': {('%s.$.path' % set_field): path}})
        else:
            kwargs = deepcopy(task_args) if task_args else {}
            g = Grab()
            g.setup(url=url)
            if grab_args:
                g.setup(**grab_args)
            # Referer is applied last, so it overrides any referer in grab_args.
            g.setup(referer=build_image_hosting_referer(url))
            yield Task(callback=callback or image_set_handler,
                       grab=g,
                       collection=collection,
                       path=path,
                       obj=obj,
                       image=image,
                       set_field=set_field,
                       disable_cache=True,
                       backup=g.dump_config(),
                       **kwargs)
def handler(self, url, collection, obj, path_field, base_dir, task_args=None, grab_args=None, callback=None):
    """Sync the stored path of a single image, or yield a task to download it.

    The target file path comes from ``hashed_path(url, base_dir=base_dir)``.
    When that file exists, ``db[collection]`` is updated only if
    ``obj[path_field]`` is missing or different. When it does not exist, one
    ``Task`` with a prepared ``Grab`` instance is yielded.

    :param url: image URL.
    :param collection: key used to look up the collection in ``db``.
    :param obj: document owning the image; ``obj['_id']`` identifies it.
    :param path_field: document field that stores the local file path.
    :param base_dir: base directory passed to ``hashed_path``.
    :param task_args: optional dict of extra ``Task`` keyword arguments
        (deep-copied before use).
    :param grab_args: optional dict of extra ``Grab.setup`` options.
    :param callback: task callback; defaults to ``image_handler``.
    """
    # Imported lazily inside the function, as in the original code.
    from database import db

    path = hashed_path(url, base_dir=base_dir)

    if not os.path.exists(path):
        extra = deepcopy(task_args) if task_args else {}

        grab = Grab()
        grab.setup(url=url)
        if grab_args:
            grab.setup(**grab_args)
        # Applied after grab_args, so this referer always wins.
        grab.setup(referer=build_image_hosting_referer(url))

        yield Task(
            callback=callback or image_handler,
            grab=grab,
            collection=collection,
            path=path,
            obj=obj,
            path_field=path_field,
            disable_cache=True,
            backup=grab.dump_config(),
            **extra
        )
    elif path != obj.get(path_field, None):
        # File already on disk but the document does not point at it yet.
        db[collection].update(
            {'_id': obj['_id']},
            {'$set': {path_field: path}},
        )