ToDo: - configure server - support TLS - support token auth method """ from json import loads from typing import Dict from jinja2 import Template from ceph.ceph import Ceph, CephNode from utility.log import Log from utility.utils import get_cephci_config LOG = Log(__name__) AGENT_HCL = """pid_file = "/run/vault-agent-pid" auto_auth { method "AppRole" { mount_path = "auth/approle" config = { role_id_file_path = "/usr/local/etc/vault/.app-role-id" secret_id_file_path = "/usr/local/etc/vault/.app-secret-id" remove_secret_id_file_after_reading = "false" } } } {%- if data.auth == "token" %} sink "file" {
2. Increase decrease PG counts, enable - disable - configure modules that do this 3. Enable logging to file, set and reset config params and cluster checks 4. Set-up email alerts and other cluster operations More operations to be added as needed """ import datetime import json import re import time from ceph.ceph_admin import CephAdmin from utility.log import Log log = Log(__name__) class RadosOrchestrator: """ RadosOrchestrator class contains various methods that perform various day1 and day2 operations on the cluster Usage: The class is initialized with the CephAdmin object for various operations """ def __init__(self, node: CephAdmin): """ initializes the env to run rados commands Args: node: CephAdmin object """ self.node = node self.ceph_cluster = node.cluster
--hotfix-repo <repo> To run sanity on hotfix build --ignore-latest-container Skip getting latest nightly container --skip-version-compare Skip verification that ceph versions change post upgrade -c --custom-config <name>=<value> Add a custom config key/value to ceph_conf_overrides --custom-config-file <file> Add custom config yaml to ceph_conf_overrides --xunit-results Create xUnit result file for test suite run [default: false] --enable-eus Enables EUS rpms on EUS suppored distro [default: false] --skip-enabling-rhel-rpms skip adding rpms from subscription if using beta rhel images for Interop runs --skip-sos-report Enables to collect sos-report on test suite failures [default: false] """ log = Log(__name__) root = logging.getLogger() root.setLevel(logging.INFO) formatter = logging.Formatter( "%(asctime)s - %(name)s - %(levelname)s - %(message)s") ch = logging.StreamHandler(sys.stdout) ch.setLevel(logging.ERROR) ch.setFormatter(formatter) root.addHandler(ch) test_names = [] @retry(LibcloudError, tries=5, delay=15)
User(Role.get(5), "*****@*****.**", "Administrator", "pwd", ""), User(Role.get(3), "*****@*****.**", "Benutzer", "pwd", ""), User(Role.get(3), "*****@*****.**", "Test Benutzer", "pwd", ""), ] for item in items: item.create() # ---------------------------------------------------------------------------- # @staticmethod def installSessions(): Session.get(1) # Initialise application # -------------------------------------------------------------------------------- # Log.level(Log.DEBUG) Log.information(__name__, "Initialising Flask...") app = Flask(__name__, static_folder="../../static", template_folder="../../template") Bootstrap(app) app.secret_key = Configuration["secret_key"] cache.init_app(app, config={ "CACHE_TYPE": Configuration["cache_type"], "CACHE_DIR": Configuration["cache_path"] }) Log.information(__name__, "Connecting to database...") app.config["SQLALCHEMY_DATABASE_URI"] = Configuration["sql_db_uri"] db.app = app db.init_app(app)
@app.route('/node/<node_url>', methods=['DELETE'])
def remove_node(node_url):
    """Remove a peer node from the blockchain's peer list.

    Returns HTTP 400 when no node URL is supplied in the path, otherwise
    removes the peer and returns the updated peer list with status 201.
    """
    # Guard against an empty/missing URL segment.
    # (was: `node_url == '' or node_url == None` -- comparing to None with
    # `==` violates PEP 8 (E711); `not node_url` covers both '' and None.)
    if not node_url:
        response = {'message': 'No node attached.'}
        return jsonify(response), 400
    blockchain.remove_peer_node(node_url)
    # NOTE(review): 'succesfully' typo kept byte-for-byte -- clients may
    # already match on this message text; fix only with a coordinated change.
    response = {
        'message': 'Node removed succesfully.',
        'all_nodes': blockchain.get_peer_nodes()
    }
    return jsonify(response), 201


@app.route('/node', methods=['GET'])
def get_node():
    """Return the list of known peer nodes (status 201)."""
    response = {'all_nodes': blockchain.get_peer_nodes()}
    return jsonify(response), 201


if __name__ == '__main__':
    from argparse import ArgumentParser

    # The port doubles as this node's identity: it seeds both the wallet
    # and the blockchain instance as well as the HTTP listener.
    parser = ArgumentParser()
    parser.add_argument('-p', '--port', type=int, default=3200)
    args = parser.parse_args()
    port = args.port
    Log.log_message('Server started at port:{}'.format(port), port)
    wallet = Wallet(port)
    blockchain = Blockchain(wallet.public_key, port)
    app.run(host='0.0.0.0', port=port)
parser.add_argument("--loss", default=0, type=int, help="= 0, smooth CE; = 1, focal loss.") parser.add_argument("--data_bal", default='equal', type=str, help="Set to 'equal' (default) or 'unequal'.") args = parser.parse_args() print(args) initialize(args, seed=42) dataset = Cifar(args) log = Log(log_each=10) if args.multigpu == 1: model = WideResNet(args.depth, args.width_factor, args.dropout, in_channels=3, labels=10) model = nn.DataParallel(model).cuda() else: model = WideResNet(args.depth, args.width_factor, args.dropout, in_channels=3, labels=10).cuda() base_optimizer = torch.optim.SGD
"""Cephadm orchestration host operations.""" import json from copy import deepcopy from ceph.ceph import CephNode from ceph.utils import get_node_by_id from utility.log import Log from .common import config_dict_to_string from .helper import monitoring_file_existence from .maintenance import MaintenanceMixin from .orch import Orch, ResourceNotFoundError logger = Log(__name__) DEFAULT_KEYRING_PATH = "/etc/ceph/ceph.client.admin.keyring" DEFAULT_CEPH_CONF_PATH = "/etc/ceph/ceph.conf" class HostOpFailure(Exception): pass class Host(MaintenanceMixin, Orch): """Interface for executing ceph host <options> operations.""" SERVICE_NAME = "host" def list(self): """ List the cluster hosts
def main_worker(gpu, n_gpus_per_node, args):
    """Per-process training worker: build dataset, model and optimizer, then
    run the train / validate / predict loop for ``args.epochs`` epochs.

    Intended to be spawned once per GPU; rank 0 ("master") is the only
    process that logs, runs validation, and writes predictions.

    Args:
        gpu: Rank of this worker and index of its CUDA device.  (Rebound to a
            CPU ``torch.device`` when the ``force_cpu_dev`` debug switch is on.)
        n_gpus_per_node: World size passed to ``torch.distributed``.
        args: Experiment configuration namespace (project-specific; the
            fields read are visible inline below).
    """
    is_master = gpu == 0
    # initialize() is a project helper; presumably creates the experiment
    # directory and wandb run (master only) -- TODO confirm its contract.
    directory = initialize(args, create_directory=is_master, init_wandb=args.log_wandb and is_master)

    # Rendezvous endpoint for torch.distributed; MASTER_PORT may be
    # overridden from the environment.
    os.environ["MASTER_ADDR"] = "localhost"
    if "MASTER_PORT" not in os.environ:
        os.environ["MASTER_PORT"] = "12345"
    if args.distributed:
        dist.init_process_group(backend=args.dist_backend, init_method="env://", world_size=n_gpus_per_node, rank=gpu)

    dataset = SharedDataset(args)
    dataset.load_datasets(args, gpu, n_gpus_per_node)

    model = Model(dataset, args)
    # Separate weight-decay groups for encoder vs. decoder parameters.
    parameters = [{
        "params": p,
        "weight_decay": args.encoder_weight_decay
    } for p in model.get_encoder_parameters(args.n_encoder_layers)
    ] + [{
        "params": model.get_decoder_parameters(),
        "weight_decay": args.decoder_weight_decay
    }]
    optimizer = AdamW(parameters, betas=(0.9, args.beta_2))
    scheduler = multi_scheduler_wrapper(optimizer, args)
    # Gradient clipping over all parameters except the loss-weight params.
    autoclip = AutoClip([
        p for name, p in model.named_parameters() if "loss_weights" not in name
    ])
    if args.balance_loss_weights:
        loss_weight_learner = LossWeightLearner(args, model, n_gpus_per_node)

    if is_master:
        if args.log_wandb:
            import wandb
            wandb.watch(model, log=args.wandb_log_mode)
        print(f"\nmodel: {model}\n")
        log = Log(dataset, model, optimizer, args, directory, log_each=10, log_wandb=args.log_wandb)

    torch.cuda.set_device(gpu)
    model = model.cuda(gpu)
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu])
        raw_model = model.module  # unwrap DDP for direct attribute access
    else:
        raw_model = model

    # Debug switch: run everything on CPU instead of the assigned GPU.
    force_cpu_dev = False  # changed - along below
    if force_cpu_dev:
        dev0 = torch.device("cpu")
        model.to(dev0)
        gpu = dev0  # downstream Batch.to()/predict() then receive the CPU device

    for epoch in range(args.epochs):
        #
        # TRAINING
        #
        model.train()
        if is_master:
            log.train(len_dataset=dataset.train_size)

        i = 0
        model.zero_grad()
        losses_over_bs = []  # changed - accumulates per-batch loss for the epoch summary below
        for batch in dataset.train:
            if not force_cpu_dev:  # changed - skip the device move when forced onto CPU
                batch = Batch.to(batch, gpu)
            total_loss, losses, stats = model(batch)

            for head in raw_model.heads:
                stats.update(head.loss_weights_dict())

            if args.balance_loss_weights:
                loss_weight_learner.compute_grad(losses, epoch)
            losses_over_bs.append(
                total_loss.item())  # changed - record loss for analysis
            total_loss.backward()

            # Gradient accumulation: only step every accumulation_steps batches.
            if (i + 1) % args.accumulation_steps == 0:
                grad_norm = autoclip()
                if args.balance_loss_weights:
                    loss_weight_learner.step(epoch)
                scheduler(epoch)
                optimizer.step()
                model.zero_grad()

                if is_master:
                    with torch.no_grad():
                        # Effective batch size = per-step batch * accumulation steps.
                        batch_size = batch["every_input"][0].size(
                            0) * args.accumulation_steps
                        log(batch_size, stats, args.frameworks,
                            grad_norm=grad_norm,
                            learning_rates=scheduler.lr() + [loss_weight_learner.scheduler.lr()])

            del total_loss, losses  # drop graph references before the next batch
            i += 1

        # Only the master validates/predicts; other ranks start the next epoch.
        if not is_master:
            continue

        #
        # VALIDATION CROSS-ENTROPIES
        #
        model.eval()
        log.eval(len_dataset=dataset.val_size)
        with torch.no_grad():
            for batch in dataset.val:
                try:
                    _, _, stats = model(Batch.to(batch, gpu))
                    batch_size = batch["every_input"][0].size(0)
                    log(batch_size, stats, args.frameworks)
                except RuntimeError as e:
                    # Best-effort: skip batches that OOM instead of aborting the run.
                    if 'out of memory' in str(e):
                        print('| WARNING: ran out of memory, skipping batch')
                        if hasattr(torch.cuda, 'empty_cache'):
                            torch.cuda.empty_cache()
                    else:
                        raise e

        lobs = np.array(losses_over_bs)  # changed - epoch-level loss summary
        print(
            str(lobs.mean()) + "; " + str(lobs.max()) + "; " +
            str(lobs.min()))  # changed - prints "mean; max; min" of this epoch's losses
        log.flush()

        #
        # VALIDATION MRP-SCORES
        #
        predict(raw_model, dataset.val, args.validation_data, args, directory, gpu, run_evaluation=True, epoch=epoch)

    #
    # TEST PREDICTION
    #
    # NOTE(review): placed after the epoch loop here; the exists-guard below
    # hints the author may have re-run (or looped) this step in the same
    # directory -- confirm the intended placement against the original run.
    test_fpath = f"{directory}/test_predictions/"  # changed - guard against FileExistsError
    if not os.path.exists(test_fpath):
        os.mkdir(test_fpath)
    predict(raw_model, dataset.test, args.test_data, args, f"{directory}/test_predictions/", gpu)
--hotfix-repo <repo> To run sanity on hotfix build --ignore-latest-container Skip getting latest nightly container --skip-version-compare Skip verification that ceph versions change post upgrade -c --custom-config <name>=<value> Add a custom config key/value to ceph_conf_overrides --custom-config-file <file> Add custom config yaml to ceph_conf_overrides --xunit-results Create xUnit result file for test suite run [default: false] --enable-eus Enables EUS rpms on EUS suppored distro [default: false] --skip-enabling-rhel-rpms skip adding rpms from subscription if using beta rhel images for Interop runs --skip-sos-report Enables to collect sos-report on test suite failures [default: false] """ log = Log() test_names = [] @retry(LibcloudError, tries=5, delay=15) def create_nodes( conf, inventory, osp_cred, run_id, cloud_type="openstack", report_portal_session=None, instances_name=None, enable_eus=False, rp_logger: Optional[ReportPortal] = None, ):