def test_resolve_remote_files(tmpdir):
    """resolve_remote_files must download storage-prefixed entries and leave other values untouched."""
    # Remote fixture: one file reachable through two local storages.
    tmpdir.join("remote").join("dir").join("a.txt").write("toto", ensure=True)
    tmpdir.join("local").ensure_dir()
    storage_config = {
        "tmp": {"type": "local", "basedir": str(tmpdir)},
        "tmp2": {"type": "local", "basedir": str(tmpdir.join("remote"))},
    }
    client = StorageClient(config=storage_config)

    # Mix of plain paths, unknown prefixes, storage references, and non-strings.
    config = {
        "a": "/home/ubuntu/a.txt",
        "b": "non_storage:b.txt",
        "c": "tmp:remote/dir/a.txt",
        "d": "tmp2:/dir/a.txt",
        "e": True,
        "f": "tmp:",
    }
    config = utility.resolve_remote_files(config, str(tmpdir.join("local")), client)

    expected_c = tmpdir.join("local").join("tmp/remote/dir/a.txt")
    expected_d = tmpdir.join("local").join("tmp2/dir/a.txt")
    expected_f = tmpdir.join("local").join("tmp")

    # Non-storage values pass through unchanged.
    assert config["a"] == "/home/ubuntu/a.txt"
    assert config["b"] == "non_storage:b.txt"
    # Storage references are rewritten to the downloaded local copies.
    assert config["c"] == str(expected_c)
    assert config["d"] == str(expected_d)
    assert expected_c.check(file=1)
    assert expected_d.check(file=1)
    # A bare "storage:" reference fetches the whole storage as a directory.
    assert expected_f.check(dir=1)
class Utility(object):
    """Base class for utilities.

    Subclasses provide `name`, `declare_arguments` and `exec_function`;
    `run` parses the shared command line, configures storages/GPUs, and
    drives the execution.

    NOTE(review): the class uses `abc.abstractmethod` but does not declare
    `abc.ABCMeta` as metaclass, so abstractness is not actually enforced at
    instantiation time — confirm whether this is intended.
    """

    def __init__(self):
        # Optional corpus location; None when CORPUS_DIR is not set.
        self._corpus_dir = os.getenv('CORPUS_DIR')
        # Workspace layout: output/, data/ and shared/ under WORKSPACE_DIR.
        workspace_dir = os.getenv('WORKSPACE_DIR', '/root/workspace')
        self._output_dir = os.path.join(workspace_dir, 'output')
        self._data_dir = os.path.join(workspace_dir, 'data')
        self._shared_dir = os.path.join(workspace_dir, 'shared')
        # Private scratch directory, unique to this process.
        self._tmp_dir = tempfile.mkdtemp()
        try:
            # Best-effort creation of the workspace directories; OSError
            # (e.g. permission denied or concurrent creation) is deliberately
            # swallowed so startup does not fail on a pre-provisioned layout.
            if not os.path.exists(self._output_dir):
                os.makedirs(self._output_dir)
            if not os.path.exists(self._data_dir):
                os.makedirs(self._data_dir)
            if not os.path.exists(self._shared_dir):
                os.makedirs(self._shared_dir)
        except OSError:
            pass

    @property
    @abc.abstractmethod
    def name(self):
        """Name of the utility; used in the start-up log line (abstract)."""
        raise NotImplementedError()

    @abc.abstractmethod
    def declare_arguments(self, parser):
        """Add utility-specific arguments to the argparse parser (abstract)."""
        raise NotImplementedError()

    @abc.abstractmethod
    def exec_function(self, args):
        """Run the utility with the parsed arguments (abstract).

        May return a statistics dict that is posted to `statistics_url`.
        """
        raise NotImplementedError()

    def run(self, args=None):
        """Main entrypoint."""
        parser = argparse.ArgumentParser()
        parser.add_argument('-s', '--storage_config', default=None,
                            help=('Configuration of available storages as a file or a JSON string. '
                                  'Setting "-" will read from the standard input.'))
        parser.add_argument('-t', '--task_id', default=None,
                            help="Identifier of this run.")
        parser.add_argument('-i', '--image', default="?",
                            help="Full URL (registry/image:tag) of the image used for this run.")
        parser.add_argument('-b', '--beat_url', default=None,
                            help=("Endpoint that listens to beat requests "
                                  "(push notifications of activity)."))
        parser.add_argument('-bi', '--beat_interval', default=30, type=int,
                            help="Interval of beat requests in seconds.")
        parser.add_argument('--statistics_url', default=None,
                            help=('Endpoint that listens to statistics summaries generated '
                                  'at the end of the execution'))
        # NOTE(review): os.environ["MODELS_DIR"] is evaluated eagerly and
        # raises KeyError at parser construction when MODELS_DIR is unset —
        # confirm the variable is guaranteed by the runtime environment.
        parser.add_argument('-ms', '--model_storage', default=os.environ["MODELS_DIR"],
                            help='Model storage in the form <storage_id>:[<path>].')
        parser.add_argument('-msr', '--model_storage_read', default=None,
                            help=('Model storage to read from, in the form <storage_id>:[<path>] '
                                  '(defaults to model_storage).'))
        parser.add_argument('-msw', '--model_storage_write', default=None,
                            help=('Model storage to write to, in the form <storage_id>:[<path>] '
                                  '(defaults to model_storage).'))
        parser.add_argument('-c', '--config', default=None,
                            help=('Configuration as a file or a JSON string. '
                                  'Setting "-" will read from the standard input.'))
        parser.add_argument('--config_update_mode',
                            choices=['default', 'merge', 'replace'], default='default',
                            help=('How to update the parent task configuration with the given '
                                  'configuration. '
                                  '"default": automatic mode based on the configuration, '
                                  '"merge": recursively update configuration fields, '
                                  '"replace": replace the top-most fields.'))
        parser.add_argument('-m', '--model', default=None,
                            help='Model to load.')
        parser.add_argument('-g', '--gpuid', default="0",
                            help="Comma-separated list of 0-indexed GPU identifiers.")
        parser.add_argument('--no_push', default=False, action='store_true',
                            help='Do not push model.')
        # Let the concrete utility register its own options before parsing.
        self.declare_arguments(parser)
        args = parser.parse_args(args=args)
        if args.task_id is None:
            # No identifier supplied: generate a unique one for this run.
            args.task_id = str(uuid.uuid4())
        self._task_id = args.task_id
        self._image = args.image
        # Presumably starts periodic activity notifications — TODO confirm
        # against start_beat_service's definition. os.uname()[1] is the host name.
        start_beat_service(
            os.uname()[1],
            args.beat_url,
            args.task_id,
            interval=args.beat_interval)
        self._storage = StorageClient(
            config=load_config(args.storage_config) if args.storage_config else None)
        # Read/write storages both fall back to the generic model storage.
        if args.model_storage_read is None:
            args.model_storage_read = args.model_storage
        if args.model_storage_write is None:
            args.model_storage_write = args.model_storage
        self._model_storage_read = args.model_storage_read
        self._model_storage_write = args.model_storage_write
        # for backward compatibility - convert singleton in int
        args.gpuid = args.gpuid.split(',')
        args.gpuid = [int(g) for g in args.gpuid]
        if len(args.gpuid) == 1:
            args.gpuid = args.gpuid[0]
        self._gpuid = args.gpuid
        self._config = load_config(args.config) if args.config is not None else None
        self._model = args.model
        self._no_push = args.no_push
        logger.info('Starting executing utility %s=%s', self.name, args.image)
        start_time = time.time()
        stats = self.exec_function(args)
        end_time = time.time()
        logger.info('Finished executing utility in %s seconds', str(end_time-start_time))
        if args.statistics_url is not None:
            # Report the execution summary to the statistics endpoint.
            requests.post(args.statistics_url, json={
                'task_id': self._task_id,
                'start_time': start_time,
                'end_time': end_time,
                'statistics': stats or {}
            })

    def convert_to_local_file(self, nextval, is_dir=False):
        """Download remote inputs and return their local paths.

        Each element of `nextval` is a comma-separated list of remote paths;
        the result mirrors that structure with every remote path replaced by
        its downloaded copy under the data directory. Set `is_dir` to fetch
        directories instead of single files.
        """
        new_val = []
        for val in nextval:
            inputs = val.split(',')
            local_inputs = []
            for remote_input in inputs:
                # Local destination keeps the storage-relative path component.
                local_input = os.path.join(
                    self._data_dir, self._storage.split(remote_input)[-1])
                if is_dir:
                    self._storage.get_directory(remote_input, local_input)
                else:
                    self._storage.get_file(remote_input, local_input)
                local_inputs.append(local_input)
            new_val.append(','.join(local_inputs))
        return new_val
def run(self, args=None):
    """Main entrypoint.

    Parses the shared command line, configures task id, beat service,
    storages and GPUs, then delegates to `exec_function` and optionally
    posts an execution summary to `statistics_url`.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-s', '--storage_config', default=None,
                        help=('Configuration of available storages as a file or a JSON string. '
                              'Setting "-" will read from the standard input.'))
    parser.add_argument('-t', '--task_id', default=None,
                        help="Identifier of this run.")
    parser.add_argument('-i', '--image', default="?",
                        help="Full URL (registry/image:tag) of the image used for this run.")
    parser.add_argument('-b', '--beat_url', default=None,
                        help=("Endpoint that listens to beat requests "
                              "(push notifications of activity)."))
    parser.add_argument('-bi', '--beat_interval', default=30, type=int,
                        help="Interval of beat requests in seconds.")
    parser.add_argument('--statistics_url', default=None,
                        help=('Endpoint that listens to statistics summaries generated '
                              'at the end of the execution'))
    # NOTE(review): os.environ["MODELS_DIR"] is evaluated eagerly and raises
    # KeyError at parser construction when MODELS_DIR is unset — confirm the
    # variable is guaranteed by the runtime environment.
    parser.add_argument('-ms', '--model_storage', default=os.environ["MODELS_DIR"],
                        help='Model storage in the form <storage_id>:[<path>].')
    parser.add_argument('-msr', '--model_storage_read', default=None,
                        help=('Model storage to read from, in the form <storage_id>:[<path>] '
                              '(defaults to model_storage).'))
    parser.add_argument('-msw', '--model_storage_write', default=None,
                        help=('Model storage to write to, in the form <storage_id>:[<path>] '
                              '(defaults to model_storage).'))
    parser.add_argument('-c', '--config', default=None,
                        help=('Configuration as a file or a JSON string. '
                              'Setting "-" will read from the standard input.'))
    parser.add_argument('--config_update_mode',
                        choices=['default', 'merge', 'replace'], default='default',
                        help=('How to update the parent task configuration with the given '
                              'configuration. '
                              '"default": automatic mode based on the configuration, '
                              '"merge": recursively update configuration fields, '
                              '"replace": replace the top-most fields.'))
    parser.add_argument('-m', '--model', default=None,
                        help='Model to load.')
    parser.add_argument('-g', '--gpuid', default="0",
                        help="Comma-separated list of 0-indexed GPU identifiers.")
    parser.add_argument('--no_push', default=False, action='store_true',
                        help='Do not push model.')
    # Let the concrete utility register its own options before parsing.
    self.declare_arguments(parser)
    args = parser.parse_args(args=args)
    if args.task_id is None:
        # No identifier supplied: generate a unique one for this run.
        args.task_id = str(uuid.uuid4())
    self._task_id = args.task_id
    self._image = args.image
    # Presumably starts periodic activity notifications — TODO confirm
    # against start_beat_service's definition. os.uname()[1] is the host name.
    start_beat_service(
        os.uname()[1],
        args.beat_url,
        args.task_id,
        interval=args.beat_interval)
    self._storage = StorageClient(
        config=load_config(args.storage_config) if args.storage_config else None)
    # Read/write storages both fall back to the generic model storage.
    if args.model_storage_read is None:
        args.model_storage_read = args.model_storage
    if args.model_storage_write is None:
        args.model_storage_write = args.model_storage
    self._model_storage_read = args.model_storage_read
    self._model_storage_write = args.model_storage_write
    # for backward compatibility - convert singleton in int
    args.gpuid = args.gpuid.split(',')
    args.gpuid = [int(g) for g in args.gpuid]
    if len(args.gpuid) == 1:
        args.gpuid = args.gpuid[0]
    self._gpuid = args.gpuid
    self._config = load_config(args.config) if args.config is not None else None
    self._model = args.model
    self._no_push = args.no_push
    logger.info('Starting executing utility %s=%s', self.name, args.image)
    start_time = time.time()
    stats = self.exec_function(args)
    end_time = time.time()
    logger.info('Finished executing utility in %s seconds', str(end_time-start_time))
    if args.statistics_url is not None:
        # Report the execution summary to the statistics endpoint.
        requests.post(args.statistics_url, json={
            'task_id': self._task_id,
            'start_time': start_time,
            'end_time': end_time,
            'statistics': stats or {}
        })
def get_storage_client(accessible_storages):
    """Return a StorageClient built from the given storages, with private entries stripped by rmprivate."""
    return StorageClient(rmprivate(accessible_storages))
def main():
    """Command-line entry point for the storage client.

    Commands: `list` (optionally recursive), `get` (file or directory —
    directory paths end with '/'), `push`, and `stat`.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', default=None, required=True,
                        help='Storages configuration file.')
    parser.add_argument('--info', '-v', action='store_true', help='info mode')
    parser.add_argument('--verbose', '-vv', action='store_true', help='verbose mode')
    subparsers = parser.add_subparsers(help='command help', dest='cmd')
    subparsers.required = True
    parser_list = subparsers.add_parser('list', help='list file on a storage')
    parser_list.add_argument('--recursive', '-r', action='store_true',
                             help='recursive listing')
    parser_list.add_argument('storage', type=resolvedpath, help='path to list')
    parser_get = subparsers.add_parser('get', help='download a file or directory')
    parser_get.add_argument('storage', type=resolvedpath,
                            help='path to file or directory to download, directory must ends with /')
    parser_get.add_argument('local', type=str, help='local path')
    parser_get = subparsers.add_parser('push', help='upload a file or directory')
    parser_get.add_argument('local', type=str,
                            help='local path to file or directory to upload')
    parser_get.add_argument('storage', type=resolvedpath, help='remote path')
    parser_stat = subparsers.add_parser('stat',
                                        help='returns stat on a remote file/directory')
    parser_stat.add_argument('storage', type=resolvedpath, help='remote path')
    args = parser.parse_args()

    if args.info:
        logging.basicConfig(level=logging.INFO)
    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)

    with open(args.config) as jsonf:
        config = json.load(jsonf)
    # support configuration from automatic tests
    if 'storages' in config:
        config = config['storages']
    client = StorageClient(config=config)

    if args.cmd == "list":
        listdir = client.listdir(args.storage, args.recursive)
        for k in sorted(listdir.keys()):
            if listdir[k].get("is_dir"):
                print("dir", k)
            else:
                date = datetime.fromtimestamp(listdir[k]["last_modified"])
                print(" ", "%10d" % listdir[k]["size"],
                      date.strftime("%Y-%m-%dT%H:%M:%S"), k)
    elif args.cmd == "get":
        # Trailing '/' distinguishes a directory download from a file download.
        directory = args.storage.endswith('/')
        if directory:
            if os.path.isfile(args.local):
                # BUG FIX: ValueError takes one formatted message — the
                # original passed logging-style (fmt, arg) as two arguments,
                # producing an unformatted tuple message.
                raise ValueError("%s should be a directory" % args.local)
            client.get_directory(args.storage, args.local)
        else:
            client.get_file(args.storage, args.local)
    elif args.cmd == "push":
        client.push(args.local, args.storage)
    elif args.cmd == "stat":
        print(client.stat(args.storage))
def run(self, args=None):
    """Main entrypoint.

    Parses the shared command line, configures task id, beat service,
    storages and GPUs, then delegates to `exec_function` and optionally
    posts an execution summary to `statistics_url`.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-s",
        "--storage_config",
        default=None,
        help=("Configuration of available storages as a file or a JSON string. "
              'Setting "-" will read from the standard input.'),
    )
    parser.add_argument("-t", "--task_id", default=None,
                        help="Identifier of this run.")
    parser.add_argument(
        "-i",
        "--image",
        default="?",
        help="Full URL (registry/image:tag) of the image used for this run.",
    )
    parser.add_argument(
        "-b",
        "--beat_url",
        default=None,
        help=("Endpoint that listens to beat requests "
              "(push notifications of activity)."),
    )
    parser.add_argument(
        "-bi",
        "--beat_interval",
        default=30,
        type=int,
        help="Interval of beat requests in seconds.",
    )
    parser.add_argument(
        "--statistics_url",
        default=None,
        help=("Endpoint that listens to statistics summaries generated "
              "at the end of the execution"),
    )
    # NOTE(review): os.environ["MODELS_DIR"] is evaluated eagerly and raises
    # KeyError at parser construction when MODELS_DIR is unset — confirm the
    # variable is guaranteed by the runtime environment.
    parser.add_argument(
        "-ms",
        "--model_storage",
        default=os.environ["MODELS_DIR"],
        help="Model storage in the form <storage_id>:[<path>].",
    )
    parser.add_argument(
        "-msr",
        "--model_storage_read",
        default=None,
        help=("Model storage to read from, in the form <storage_id>:[<path>] "
              "(defaults to model_storage)."),
    )
    parser.add_argument(
        "-msw",
        "--model_storage_write",
        default=None,
        help=("Model storage to write to, in the form <storage_id>:[<path>] "
              "(defaults to model_storage)."),
    )
    parser.add_argument(
        "-c",
        "--config",
        default=None,
        help=("Configuration as a file or a JSON string. "
              'Setting "-" will read from the standard input.'),
    )
    parser.add_argument(
        "--config_update_mode",
        choices=["default", "merge", "replace"],
        default="default",
        help=("How to update the parent task configuration with the given "
              "configuration. "
              '"default": automatic mode based on the configuration, '
              '"merge": recursively update configuration fields, '
              '"replace": replace the top-most fields.'),
    )
    parser.add_argument("-m", "--model", default=None, help="Model to load.")
    parser.add_argument(
        "-g",
        "--gpuid",
        default="0",
        help="Comma-separated list of 0-indexed GPU identifiers.",
    )
    parser.add_argument("--no_push",
                        default=False,
                        action="store_true",
                        help="Do not push model.")
    # Let the concrete utility register its own options before parsing.
    self.declare_arguments(parser)
    args = parser.parse_args(args=args)
    if args.task_id is None:
        # No identifier supplied: generate a unique one for this run.
        args.task_id = str(uuid.uuid4())
    self._task_id = args.task_id
    self._image = args.image
    # Presumably starts periodic activity notifications — TODO confirm
    # against start_beat_service's definition. os.uname()[1] is the host name.
    start_beat_service(os.uname()[1],
                       args.beat_url,
                       args.task_id,
                       interval=args.beat_interval)
    self._storage = StorageClient(config=load_config(args.storage_config)
                                  if args.storage_config else None)
    # Read/write storages both fall back to the generic model storage.
    if args.model_storage_read is None:
        args.model_storage_read = args.model_storage
    if args.model_storage_write is None:
        args.model_storage_write = args.model_storage
    self._model_storage_read = args.model_storage_read
    self._model_storage_write = args.model_storage_write
    # for backward compatibility - convert singleton in int
    args.gpuid = args.gpuid.split(",")
    args.gpuid = [int(g) for g in args.gpuid]
    if len(args.gpuid) == 1:
        args.gpuid = args.gpuid[0]
    self._gpuid = args.gpuid
    self._config = load_config(
        args.config) if args.config is not None else None
    self._model = args.model
    self._no_push = args.no_push
    logger.info("Starting executing utility %s=%s", self.name, args.image)
    start_time = time.time()
    stats = self.exec_function(args)
    end_time = time.time()
    logger.info("Finished executing utility in %.1f seconds",
                end_time - start_time)
    if args.statistics_url is not None:
        # Report the execution summary to the statistics endpoint.
        requests.post(
            args.statistics_url,
            json={
                "task_id": self._task_id,
                "start_time": start_time,
                "end_time": end_time,
                "statistics": stats or {},
            },
        )
def main():
    """Command-line entry point for the extended storage client.

    Commands: `list`, `get`, `push`, `delete`, `stat`,
    `stream_corpus_manager`, `search`, `seg_delete`, `seg_add` and `stream`.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', default=None, required=True,
                        help='Storages configuration file.')
    parser.add_argument('--info', '-v', action='store_true', help='info mode')
    parser.add_argument('--verbose', '-vv', action='store_true', help='verbose mode')
    subparsers = parser.add_subparsers(help='command help', dest='cmd')
    subparsers.required = True
    parser_list = subparsers.add_parser('list', help='list file on a storage')
    parser_list.add_argument('--recursive', '-r', action='store_true',
                             help='recursive listing')
    parser_list.add_argument('storage', type=resolvedpath, help='path to list')
    parser_get = subparsers.add_parser('get',
                                       help='download a file or directory')
    parser_get.add_argument(
        'storage', type=resolvedpath,
        help='path to file or directory to download, directory must ends with /')
    parser_get.add_argument('local', type=str, help='local path')
    parser_get = subparsers.add_parser('push',
                                       help='upload a file or directory')
    parser_get.add_argument('local', type=str,
                            help='local path to file or directory to upload')
    # BUG FIX: the storage arguments below reused the copy-pasted 'download'
    # help text from the `get` command; they describe remote paths.
    parser_get.add_argument('storage', type=resolvedpath, help='remote path')
    parser_get = subparsers.add_parser('delete', help='delete a corpus')
    parser_get.add_argument('storage', type=resolvedpath, help='remote path')
    parser_get.add_argument('corpusId', type=str, help='corpus id')
    parser_stat = subparsers.add_parser(
        'stat', help='returns stat on a remote file/directory')
    parser_stat.add_argument('storage', type=resolvedpath, help='remote path')
    parser_get = subparsers.add_parser(
        'stream_corpus_manager',
        help='Export a corpus in TMX(default) or biText')
    parser_get.add_argument('storage', type=resolvedpath, help='remote path')
    parser_get.add_argument('corpusId', type=str, help='corpus id')
    parser_get.add_argument(
        'format', type=check_format,
        help='Format of the corpus (application/x-tmx+xml, text/bitext)')
    parser_search = subparsers.add_parser(
        'search', help='list corpus segments identified by corpus id')
    parser_search.add_argument('storage', type=resolvedpath, help='remote path')
    parser_search.add_argument('id', help='remote id')
    parser_search.add_argument('search_query', type=resolvedjson,
                               help='query text for search')
    parser_search.add_argument('skip', default=0,
                               help='number of segments skip (default 0)')
    parser_search.add_argument(
        'limit', default=0,
        help='number of segments returned (default 0 meaning all)')
    parser_search = subparsers.add_parser(
        'seg_delete', help='Delete segments identified by id')
    parser_search.add_argument('storage', type=resolvedpath, help='remote path')
    parser_search.add_argument('corpus_id', help='corpus id')
    parser_search.add_argument('ids', help='list segment id')
    # BUG FIX: the "seg_add" dispatch branch below was unreachable — no
    # subparser declared the command while subparsers.required is True.
    parser_seg_add = subparsers.add_parser(
        'seg_add', help='Add segments identified by id')
    parser_seg_add.add_argument('storage', type=resolvedpath, help='remote path')
    parser_seg_add.add_argument('corpus_id', help='corpus id')
    parser_seg_add.add_argument('ids', help='list segment id')
    parser_stream = subparsers.add_parser(
        'stream', help='print out specific corpus by name')
    parser_stream.add_argument('storage', type=resolvedpath, help='remote path')
    args = parser.parse_args()

    if args.info:
        logging.basicConfig(level=logging.INFO)
    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)

    with open(args.config) as jsonf:
        config = json.load(jsonf)
    # support configuration from automatic tests
    if 'storages' in config:
        config = config['storages']
    client = StorageClient(config=config)

    if args.cmd == "list":
        listdir = client.listdir(args.storage, args.recursive)
        for k in sorted(listdir.keys()):
            if listdir[k].get("is_dir"):
                print("dir", k)
            else:
                date = datetime.fromtimestamp(listdir[k]["last_modified"])
                # Corpus entries expose a segment count instead of a byte size.
                if "entries" in listdir[k]:
                    size = listdir[k]["entries"]
                else:
                    size = listdir[k]["size"]
                print(" ", "%10d" % size,
                      date.strftime("%Y-%m-%dT%H:%M:%S"), k)
    elif args.cmd == "get":
        # Trailing '/' distinguishes a directory download from a file download.
        directory = args.storage.endswith('/')
        if directory:
            if os.path.isfile(args.local):
                raise ValueError("%s should be a directory" % args.local)
            client.get_directory(args.storage, args.local)
        else:
            client.get_file(args.storage, args.local)
    elif args.cmd == "push":
        client.push(args.local, args.storage)
    elif args.cmd == "delete":
        client.delete_corpus_manager(args.storage, args.corpusId)
    elif args.cmd == "stat":
        print(client.stat(args.storage))
    elif args.cmd == "stream_corpus_manager":
        # Accumulate the streamed chunks, then decode once for output.
        byte_result = b''
        for chunk in client.stream_corpus_manager(args.storage, args.corpusId,
                                                  args.format):
            if chunk:
                byte_result += chunk
        sys.stdout.write(byte_result.decode("utf-8"))
    elif args.cmd == "stream":
        byte_result = b''
        for chunk in client.stream(args.storage):
            if chunk:
                byte_result += chunk
        sys.stdout.write(byte_result.decode("utf-8"))
    elif args.cmd == "search":
        print(
            client.search(args.storage, args.id, args.search_query, args.skip,
                          args.limit))
    elif args.cmd == "seg_delete":
        print(client.seg_delete(args.storage, args.corpus_id, args.ids))
    elif args.cmd == "seg_add":
        print(client.seg_add(args.storage, args.corpus_id, args.ids))