def local_tail(self, logfile, webuser):
    # Create a cross-module global flag so the endless read loop below can be controlled;
    # the loop exits once another request sets 'deploy_<webuser>' to True.
    gl._init()
    gl.set_value('deploy_' + str(webuser), False)
    try:
        with open(logfile, 'rt') as f:
            f.seek(0, 0)
            while True:
                is_stop = gl.get_value('deploy_' + str(webuser))
                line = f.readline()
                if line:
                    self.send_message(webuser, line)
                elif is_stop:
                    self.send_message(webuser, '[INFO] File monitoring stopped.')
                    break
    except Exception as e:
        self.send_message(webuser, str(e))
def remote_tail(self, host, port, user, passwd, logfile, webuser, filter_text=None):
    # Store the SSH client in a cross-module global variable so the tail can be stopped later.
    try:
        self.client = paramiko.SSHClient()
        self.client.load_system_host_keys()
        self.client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        self.client.connect(hostname=host, port=port, username=user, password=passwd)
        interact = SSHClientInteraction(self.client, timeout=10, display=False)
        interact.expect('.*#.*')
        # Strip shell control operators to avoid command injection.
        logfile = logfile.strip().replace('&&', '').replace('||', '').replace('|', '')
        self.send_message(webuser, '[INFO][%s@%s] Log monitoring started' % (user, host))
        gl._init()
        gl.set_value('tail_' + str(webuser), self.client)
        if filter_text:
            filter_text_re = filter_text.strip().replace('&&', '').replace('||', '').replace('|', '')
            interact.send('tail -f %s|grep --color=never %s' % (logfile, filter_text_re))
        else:
            interact.send('tail -f %s' % logfile)
        interact.tail(output_callback=lambda m: self.send_message(webuser, m))
    except Exception as e:
        self.send_message(webuser, str(e))
    finally:
        try:
            self.client.close()
        except Exception as e:
            self.send_message(webuser, str(e))
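# The `gl` helper used above is a small cross-module global store; its source is not
# part of this excerpt. The sketch below is an assumption reconstructed from the calls
# made here and in the views (_init(), set_value(), get_value(), and the module-level
# _global_dict that is checked with hasattr). To stop a running local_tail, another
# request simply flips its flag:
#     gl.set_value('deploy_' + str(webuser), True)
#
# utils/globalvar.py -- hedged sketch, not the original implementation:

def _init():
    # Create the shared dict once per process.
    global _global_dict
    _global_dict = {}


def set_value(key, value):
    _global_dict[key] = value


def get_value(key, default=None):
    # Return the stored value, or a default if the key was never set.
    return _global_dict.get(key, default)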
from model import Discriminator, resnet_56, resnet_56_sparse
from data import cifar10
from resnet import ResNet18, ResNet50
from resnet_sprase import ResNet18_sprase, ResNet50_sprase
from collections import OrderedDict
import numpy as np
from torch.autograd import Variable
from resnet_imagenet import resnet101
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import utils.globalvar as gl

gl._init()

import time
import logging
import sys

try:
    import nvidia.dali.plugin.pytorch as plugin_pytorch
    from nvidia.dali.pipeline import Pipeline
    import nvidia.dali.ops as ops
    import nvidia.dali.types as types
except ImportError:
    raise ImportError(
        "Please install DALI from https://www.github.com/NVIDIA/DALI to run this example."
    )

num_gpu = 4
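# The DALI imports above are only useful together with a Pipeline subclass and a PyTorch
# iterator. The hedged sketch below shows a typical ImageNet training pipeline built with
# the legacy `nvidia.dali.ops` API; the class name and operator arguments are assumptions
# based on NVIDIA's published ResNet example, not code from this repository.


class HybridTrainPipe(Pipeline):
    def __init__(self, batch_size, num_threads, device_id, data_dir, crop=224):
        super(HybridTrainPipe, self).__init__(batch_size, num_threads, device_id)
        self.input = ops.FileReader(file_root=data_dir, random_shuffle=True)
        self.decode = ops.ImageDecoder(device="mixed", output_type=types.RGB)
        self.res = ops.RandomResizedCrop(device="gpu", size=crop)
        self.cmnp = ops.CropMirrorNormalize(device="gpu",
                                            output_layout=types.NCHW,
                                            mean=[0.485 * 255, 0.456 * 255, 0.406 * 255],
                                            std=[0.229 * 255, 0.224 * 255, 0.225 * 255])
        self.coin = ops.CoinFlip(probability=0.5)

    def define_graph(self):
        rng = self.coin()
        jpegs, labels = self.input(name="Reader")
        images = self.decode(jpegs)
        images = self.res(images)
        output = self.cmnp(images, mirror=rng)
        return [output, labels]


# Wrapped for PyTorch consumption (usage sketch, paths are placeholders):
#     pipe = HybridTrainPipe(batch_size=256, num_threads=4, device_id=0, data_dir='/data/train')
#     pipe.build()
#     train_loader = plugin_pytorch.DALIClassificationIterator(pipe, size=pipe.epoch_size("Reader"))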
                    help='Use multi-processing distributed training to launch '
                         'N processes per node, which has N GPUs. This is the '
                         'fastest way to use PyTorch for either single node or '
                         'multi node data parallel training')
parser.add_argument('--channel_removed_ratio', default=0.2, type=float, help='removed ratio.')
parser.add_argument('--spatial_removed_ratio', default=0.2, type=float, help='removed ratio.')
parser.add_argument('--Is_spatial', action='store_true',
                    help='use spatial module or not, default is channel with conv.')
parser.add_argument('--lasso', action='store_true', help='add l1 regularization to channel module.')
parser.add_argument('--l1_coe', default=1e-8, type=float, help='coe of l1 regularization.')
parser.add_argument('--show', action='store_true', help='show model architecture.')
parser.add_argument('--flops', action='store_true', help='calc flops given a pretrained model.')
parser.add_argument('--debug', action='store_true', help='debug.')
args = parser.parse_args()

os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu)
best_acc1 = 0

gvar._init()
gvar.set_value('removed_ratio_c', args.channel_removed_ratio)
gvar.set_value('removed_ratio_s', args.spatial_removed_ratio)
gvar.set_value('is_spatial', args.Is_spatial)


def main():
    if not os.path.isdir(args.save_dir):
        os.makedirs(args.save_dir)
    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
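# The three values stored in `gvar` above are read back by the pruning modules elsewhere
# in the repository. As a hedged illustration only (separate from the training script
# above; the real consumer module is not shown in this excerpt), a channel-removal layer
# could pick the ratios up at construction time roughly like this:

import torch.nn as nn
import utils.globalvar as gvar  # assumed module path, matching the other scripts


class ChannelRemoval(nn.Module):
    """Hypothetical example of a module configured from the shared globals."""

    def __init__(self):
        super(ChannelRemoval, self).__init__()
        self.removed_ratio = gvar.get_value('removed_ratio_c')
        self.spatial_ratio = gvar.get_value('removed_ratio_s')
        self.is_spatial = gvar.get_value('is_spatial')

    def forward(self, x):
        # Placeholder: the actual removal logic lives in the original repository.
        return x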
def post(self, request, format=None):
    if request.data['excu'] == 'init':
        # Project initialization
        id = request.data['id']
        result = self.repo_init(id)
        if result.exited == 0:
            Project.objects.filter(id=id).update(status='Succeed')
            info_logger.info('Initialize project: ' + str(id) + ', executed successfully!')
            http_status = OK
            msg = 'Initialization succeeded!'
        else:
            error_logger.error('Initialize project: %s failed! Error: %s' % (str(id), result.stderr))
            http_status = BAD
            msg = 'Initialize project: %s failed! Error: %s' % (str(id), result.stderr)
        return XopsResponse(msg, status=http_status)

    elif request.data['excu'] == 'deploy':
        # Deployment
        id = request.data['id']
        webuser = request.user.username
        alias = request.data['alias']
        self.start_time = time.strftime("%Y%m%d%H%M%S", time.localtime())
        record_id = str(alias) + '_' + str(self.start_time)
        name = 'Deploy_' + record_id
        DeployRecord.objects.create(name=name, alias=alias, status='Failed', project_id=int(id))
        Project.objects.filter(id=id).update(last_task_status='Failed')
        local_log_path = self._path.rstrip('/') + '/' + str(id) + '_' + str(request.data['alias']) + '/logs'
        log = local_log_path + '/' + record_id + '.log'
        version = request.data['version'].strip()
        serverid = request.data['server_ids']
        deploy = DeployExcu(webuser, record_id, id)
        deploy.start(log, version, serverid, record_id, webuser, self.start_time)
        return XopsResponse(record_id)

    elif request.data['excu'] == 'rollback':
        # Rollback
        id = request.data['id']
        project_id = request.data['project_id']
        alias = request.data['alias']
        self.start_time = time.strftime("%Y%m%d%H%M%S", time.localtime())
        record_id = str(alias) + '_' + str(self.start_time)
        log = self._path.rstrip('/') + '/' + str(project_id) + '_' + str(alias) + '/logs/' + record_id + '.log'
        self.do_rollback(id, log, record_id)
        return XopsResponse(record_id)

    elif request.data['excu'] == 'deploymsg':
        # Read deployment console messages
        try:
            id = request.data['id']
            alias = request.data['alias']
            record = request.data['record']
            scenario = int(request.data['scenario'])
            logfile = self._path.rstrip('/') + '/' + str(id) + '_' + str(alias) + '/logs/' + record + '.log'
            webuser = request.user.username
            print(webuser)
            msg = Tailf()
            if scenario == 0:
                gl._init()
                gl.set_value('deploy_' + str(webuser), False)
                msg.local_tail(logfile, webuser)
            http_status = OK
            request_status = 'Executed successfully!'
        except Exception as e:
            http_status = BAD
            request_status = 'Execution error: the log file may not exist!'
            print(e)
        return XopsResponse(request_status, status=http_status)

    elif request.data['excu'] == 'readlog' and request.data['scenario'] == 1:
        # Read a deployment log file
        try:
            id = request.data['id']
            alias = request.data['alias']
            record = request.data['record']
            logfile = self._path.rstrip('/') + '/' + str(id) + '_' + str(alias) + '/logs/' + record + '.log'
            response = FileResponse(open(logfile, 'rb'))
            response['Content-Type'] = 'text/plain'
            return response
        except Exception:
            http_status = BAD
            request_status = 'Execution error: the file does not exist!'
            return XopsResponse(request_status, status=http_status)

    elif request.data['excu'] == 'app_start':
        # Application start
        try:
            app_start = request.data['app_start']
            host = request.data['host']
            webuser = request.user.username
            auth_info, auth_key = auth_init(host)
            connect = Shell(auth_info, connect_timeout=5, connect_kwargs=auth_key)
            # Strip shell control operators to avoid command injection.
            app_start = app_start.strip().replace('&&', '').replace('||', '')
            connect.run(app_start, ws=True, webuser=webuser)
            connect.close()
            http_status = OK
            request_status = 'Executed successfully!'
        except Exception as e:
            http_status = BAD
            request_status = 'Execution error: ' + str(e)
        return XopsResponse(request_status, status=http_status)

    elif request.data['excu'] == 'app_stop':
        # Application stop
        try:
            app_stop = request.data['app_stop']
            host = request.data['host']
            webuser = request.user.username
            auth_info, auth_key = auth_init(host)
            connect = Shell(auth_info, connect_timeout=5, connect_kwargs=auth_key)
            # Strip shell control operators to avoid command injection.
            app_stop = app_stop.strip().replace('&&', '').replace('||', '')
            connect.run(app_stop, ws=True, webuser=webuser)
            connect.close()
            http_status = OK
            request_status = 'Executed successfully!'
        except Exception as e:
            http_status = BAD
            request_status = 'Execution error: ' + str(e)
        return XopsResponse(request_status, status=http_status)

    elif request.data['excu'] == 'tail_start':
        # Start log monitoring
        try:
            filter_text = str(request.data['filter'])
            app_log_file = request.data['app_log_file']
            host = request.data['host']
            webuser = request.user.username
            device_info = DeviceInfo.objects.filter(id=int(host)).values()
            host = device_info[0]['hostname']
            auth_type = device_info[0]['auth_type']
            connect_info = ConnectionInfo.objects.filter(hostname=host, auth_type=auth_type).values()
            user = connect_info[0]['username']
            passwd = connect_info[0]['password']
            port = connect_info[0]['port']
            tail = Tailf()
            tail.remote_tail(host, port, user, passwd, app_log_file, webuser, filter_text=filter_text)
            http_status = OK
            request_status = 'Executed successfully!'
        except Exception as e:
            http_status = BAD
            request_status = str(e)
        return XopsResponse(request_status, status=http_status)

    elif request.data['excu'] == 'tail_stop':
        # Stop log monitoring
        try:
            webuser = request.user.username
            if hasattr(gl, '_global_dict'):
                tail_key = 'tail_' + str(webuser)
                if tail_key in gl._global_dict.keys():
                    client = gl.get_value(tail_key)
                    client.close()
            http_status = OK
            request_status = 'Executed successfully!'
        except Exception as e:
            http_status = BAD
            request_status = str(e)
        return XopsResponse(request_status, status=http_status)
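# End-to-end, these branches are driven by POST requests whose 'excu' field selects the
# action. A hedged client-side sketch follows; the endpoint URL and auth header are
# placeholders, since they are defined elsewhere in this project, not in this excerpt.

import requests

API = 'http://xops.example.com/api/deploy/'    # hypothetical endpoint
HEADERS = {'Authorization': 'JWT <token>'}     # hypothetical auth header

# Start tailing a remote application log for the current user; the view resolves the
# host's credentials from DeviceInfo/ConnectionInfo and calls Tailf.remote_tail().
requests.post(API, headers=HEADERS, json={
    'excu': 'tail_start',
    'host': 3,                                 # DeviceInfo primary key
    'app_log_file': '/var/log/app/app.log',
    'filter': 'ERROR',
})

# Stop it again: the view looks up 'tail_<username>' in the shared global dict and
# closes the stored paramiko client, which terminates interact.tail() on the server.
requests.post(API, headers=HEADERS, json={'excu': 'tail_stop'})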