def check_resources_usage():
    """Send a Slack warning when CPU, memory, or disk usage exceeds its limit."""
    mapping = {
        'CPU': {'used': psutil.cpu_percent(), 'limit': CPU_LIMIT},
        'Memory': {'used': psutil.virtual_memory().percent, 'limit': MEMORY_LIMIT},
        'Disc space': {'used': psutil.disk_usage('/').percent, 'limit': DISC_LIMIT},
    }
    warning_messages = []
    info_messages = []
    for resource_name, params in mapping.items():
        used = params['used']
        limit = params['limit']
        if used >= limit:
            warning_messages.append(
                '*Warning:* {} usage: *{}%* (limit: {})'.format(
                    resource_name, used, limit))
        else:
            info_messages.append('{} usage: *{}%*'.format(resource_name, used))

    if warning_messages:
        warning_messages_str = '\n'.join(warning_messages)
        info_messages_str = '\n'.join(info_messages)
        message = '{}\n{}'.format(warning_messages_str, info_messages_str)
        send_slack_message(text=message)
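# NOTE: `send_slack_message` is a project helper, not a library call. A
# minimal sketch using a Slack incoming webhook; the webhook URL placeholder
# and the `requests` dependency are assumptions, not part of the original:
import requests

SLACK_WEBHOOK_URL = 'https://hooks.slack.com/services/T000/B000/XXXX'  # hypothetical

def send_slack_message(text):
    """Post `text` to Slack via an incoming webhook; raise on HTTP errors."""
    response = requests.post(SLACK_WEBHOOK_URL, json={'text': text}, timeout=10)
    response.raise_for_status()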
def check_celery_workers_heartbeat():
    """Notifies through Slack when there are no active Celery workers."""
    if not is_process_running('celery'):
        memory_info = psutil.virtual_memory()
        disk_info = psutil.disk_usage('/')
        error_message = (
            '*Error:* Celery workers are not running.\n'
            'Memory used: *{}%*\nDisk used: *{}%*'
        ).format(memory_info.percent, disk_info.percent)
        send_slack_message(text=error_message)
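# NOTE: `is_process_running` is another assumed helper. A minimal sketch with
# psutil, matching the call above; matching on a substring of the process
# name is an assumption (the real helper might inspect cmdline instead):
import psutil

def is_process_running(name):
    """Return True if any running process has `name` in its process name."""
    for proc in psutil.process_iter(['name']):
        try:
            if name in (proc.info['name'] or ''):
                return True
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            continue
    return False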
metadata = dict(zip(metadata_list, metadata_values))

# Calculate the result of the rule's left-hand query.
start_time = time.process_time()
cur.execute(metadata['query_left'])
result_left = cur.fetchone()[0]

# Evaluate the result and set the status. A comparison lookup table replaces
# the original eval() call, which would have executed an arbitrary string
# assembled from database values. Requires `import operator`.
comparisons = {
    '>': operator.gt, '>=': operator.ge,
    '<': operator.lt, '<=': operator.le,
    '==': operator.eq, '!=': operator.ne,
}
if comparisons[metadata['operand']](float(result_left),
                                    float(metadata['value_right'])):
    status = 'ok'
else:
    status = 'alerting'

# Look up the Slack webhook(s) configured for this rule and notify. The
# rule_id is passed as a bound parameter instead of being concatenated into
# the SQL string.
cur.execute(
    "select ch.webhook from alerts.rules_channels rch "
    "join alerts.channels ch on rch.channel_id = ch.id "
    "where rch.rule_id = %s", (rule_id[0],))
webhook_urls = cur.fetchone()
utils.send_slack_message(rule_id[0], metadata['name'], webhook_urls)

if metadata['debug']:
    final_query = (metadata['query_left'] + metadata['operand']
                   + str(metadata['value_right']))
else:
    final_query = None
end_time = time.process_time()

# Insert the result into the results table.
cur.execute(
    "insert into alerts.results "
    "(rule_id, calculated_at, results, status, duration, query) "
    "VALUES (%s, %s, %s, %s, %s, %s)",
    (rule_id, datetime.datetime.now(), result_left, status,
     end_time - start_time, final_query))
conn.commit()
cur.close()
conn.close()
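# NOTE: `utils.send_slack_message(rule_id, name, webhook_urls)` is project
# code that is not shown. A minimal sketch consistent with the call above;
# the message wording and the webhook row shape are assumptions:
import requests

def send_slack_message(rule_id, rule_name, webhook_urls):
    """Post an alert for `rule_name` to each configured webhook URL."""
    text = 'Rule "{}" (id={}) changed status'.format(rule_name, rule_id)
    for url in webhook_urls or ():
        requests.post(url, json={'text': text}, timeout=10)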
create_pull_result = functions.create_pull(args.head, args.base)
logger.info(f'Successfully created PR: {create_pull_result["html_url"]}')

# If no Slack notification was requested, we are done.
if not args.slack_notification:
    sys.exit()

# NOTE: Everything below serves a single need: posting the PR's commit list
# to Slack as release notes.
# Originally this took a detour: the release notes were posted as a PR
# comment, and Slack was then notified about that comment. That required
# labels and a fair amount of extra machinery; realizing we could just
# message Slack directly simplified the whole flow.

# NOTE: The label-based Slack notification is kept here for reference.
# It can be enabled with:
#   /github subscribe OWNER/REPO pulls,comments,+label:"CONTINUOUS-PR"

# Fetch the PR's commit list via the API.
# NOTE: This list is what becomes the "release notes".
list_commits_on_pull_result = functions.list_commits_on_pull(
    create_pull_result['number'])
logger.info(f'Successfully listed commits, count: {len(list_commits_on_pull_result)}')  # noqa: E501

comment_body = functions.create_comment_body(
    list_commits_on_pull_result,
    args.base,
)

# comment_body is treated as the release notes and sent to Slack.
utils.send_slack_message(comment_body)
logger.info('Successfully sent message to Slack')
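# NOTE: `functions.list_commits_on_pull` is project code. A minimal sketch
# against the GitHub REST API; OWNER/REPO and the GITHUB_TOKEN environment
# variable are placeholders, and pagination is ignored for brevity:
import os
import requests

def list_commits_on_pull(pull_number):
    """Return the list of commit objects on the given pull request."""
    response = requests.get(
        f'https://api.github.com/repos/OWNER/REPO/pulls/{pull_number}/commits',
        headers={'Authorization': f"token {os.environ['GITHUB_TOKEN']}"},
        timeout=10,
    )
    response.raise_for_status()
    return response.json()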
def main():
    # save input stats for later use
    print(args.work_dir, args.exp)
    work_dir = os.path.join(args.work_dir, args.exp)
    if not os.path.exists(work_dir):
        os.makedirs(work_dir)

    # copy this file to work dir to keep training configuration
    shutil.copy(__file__, os.path.join(work_dir, 'main.py'))
    with open(os.path.join(work_dir, 'args.pkl'), 'wb') as f:
        pickle.dump(args, f)

    # transform
    transform1 = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize([0.5], [0.5])])

    # 1. train dataset
    train_path, test_path = loader.make_dataset(args.train_site,
                                                train_size=args.train_size,
                                                mode='train')
    np.save(os.path.join(work_dir, '{}_test_path.npy'.format(args.train_site)),
            test_path)

    train_image_path = train_path[0]
    train_label_path = train_path[1]
    test_image_path = test_path[0]
    test_label_path = test_path[1]

    train_dataset = loader.CustomDataset(train_image_path, train_label_path,
                                         args.train_site, args.input_size,
                                         transform1,
                                         arg_mode=args.arg_mode,
                                         arg_thres=args.arg_thres)
    train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size,
                                   shuffle=True, num_workers=4)

    val_dataset = loader.CustomDataset(test_image_path, test_label_path,
                                       args.train_site, args.input_size,
                                       transform1, arg_mode=False)
    val_loader = data.DataLoader(val_dataset, batch_size=1,
                                 shuffle=False, num_workers=4)

    Train_test_dataset = loader.CustomDataset(test_image_path, test_label_path,
                                              args.train_site, args.input_size,
                                              transform1)
    Train_test_loader = data.DataLoader(Train_test_dataset, batch_size=1,
                                        shuffle=True, num_workers=4)

    trn_logger = Logger(os.path.join(work_dir, 'train.log'))
    trn_raw_logger = Logger(os.path.join(work_dir, 'train_raw.log'))
    val_logger = Logger(os.path.join(work_dir, 'validation.log'))

    # 3. model select
    my_net, model_name = model_select(args.arch, args.input_size)

    # 4. gpu select
    my_net = nn.DataParallel(my_net).cuda()
    cudnn.benchmark = True

    # 5. optim
    if args.optim == 'adam':
        gen_optimizer = torch.optim.Adam(my_net.parameters(),
                                         lr=args.initial_lr, eps=args.eps)
    elif args.optim == 'sgd':
        gen_optimizer = torch.optim.SGD(my_net.parameters(),
                                        lr=args.initial_lr, momentum=0.9,
                                        weight_decay=args.weight_decay)

    # lr decay
    lr_schedule = args.lr_schedule
    lr_scheduler = optim.lr_scheduler.MultiStepLR(gen_optimizer,
                                                  milestones=lr_schedule[:-1],
                                                  gamma=args.gamma)

    # 6. loss
    if args.loss_function == 'bce':
        criterion = nn.BCEWithLogitsLoss(
            pos_weight=torch.Tensor([args.bce_weight])).cuda()
    elif args.loss_function == 'mse':
        criterion = nn.MSELoss().cuda()

    # train
    send_slack_message(args.token, '#jm_private',
                       '{} : starting_training'.format(args.exp))
    best_iou = 0
    try:
        if args.train_mode:
            for epoch in range(lr_schedule[-1]):
                train(my_net, train_loader, gen_optimizer, epoch, criterion,
                      trn_logger, trn_raw_logger)
                iou = validate(val_loader, my_net, criterion, epoch,
                               val_logger, save_fig=False,
                               work_dir_name='jsrt_visualize_per_epoch')
                print('validation_iou ***********************************')
                lr_scheduler.step()

                if args.val_size == 0:
                    is_best = 1
                else:
                    is_best = iou > best_iou
                best_iou = max(iou, best_iou)
                checkpoint_filename = 'model_checkpoint_{:0>3}.pth'.format(
                    epoch + 1)  # per-epoch name (currently unused)
                save_checkpoint({'epoch': epoch + 1,
                                 'state_dict': my_net.state_dict(),
                                 'optimizer': gen_optimizer.state_dict()},
                                is_best, work_dir, filename='checkpoint.pth')
        print("train end")
    except RuntimeError as e:
        send_slack_message(
            args.token, '#jm_private',
            'error train: please message JM directly (KakaoTalk)\n'
            'error message : {}'.format(e))
        import ipdb
        ipdb.set_trace()

    draw_curve(work_dir, trn_logger, val_logger)
    send_slack_message(args.token, '#jm_private',
                       '{} : end_training'.format(args.exp))

    if args.test_mode:
        print('Test mode ...')
        # test_data_list is built elsewhere in the full script.
        main_test(model=my_net, test_loader=test_data_list, args=args)
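# NOTE: `send_slack_message(token, channel, message)` above is a project
# helper that is not shown. A minimal sketch using slack_sdk; the library
# choice is an assumption, and any chat.postMessage client would work:
from slack_sdk import WebClient
from slack_sdk.errors import SlackApiError

def send_slack_message(token, channel, message):
    """Post `message` to `channel` using the given Slack bot token."""
    try:
        WebClient(token=token).chat_postMessage(channel=channel, text=message)
    except SlackApiError as e:
        # Training should not crash because a notification failed.
        print('Slack notification failed: {}'.format(e.response['error']))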
def main():
    if args.server == 'server_A':
        work_dir = os.path.join('/data1/JM/lung_segmentation', args.exp)
        print(work_dir)
    elif args.server == 'server_B':
        work_dir = os.path.join('/data1/workspace/JM_gen/lung_seg', args.exp)
        print(work_dir)
    if not os.path.exists(work_dir):
        os.makedirs(work_dir)

    # copy this file to work dir to keep training configuration
    shutil.copy(__file__, os.path.join(work_dir, 'main.py'))
    with open(os.path.join(work_dir, 'args.pkl'), 'wb') as f:
        pickle.dump(args, f)

    # transform
    transform1 = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize([0.5], [0.5])])

    # 1. train dataset
    if args.val_size == 0:
        train_path, test_path = loader.make_dataset(
            args.server, args.train_dataset + '_dataset',
            train_size=args.train_size)
    np.save(
        os.path.join(work_dir,
                     '{}_test_path.npy'.format(args.train_dataset)),
        test_path)

    train_image_path = train_path[0]
    train_label_path = train_path[1]
    test_image_path = test_path[0]
    test_label_path = test_path[1]

    train_dataset = loader.CustomDataset(train_image_path, train_label_path,
                                         transform1,
                                         arg_mode=args.arg_mode,
                                         arg_thres=args.arg_thres,
                                         arg_range=args.arg_range,
                                         dataset=args.train_dataset)
    train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size,
                                   shuffle=True, num_workers=4)

    # Organize images and labels differently.
    train_dataset_random = loader.CustomDataset(train_image_path,
                                                train_label_path,
                                                transform1,
                                                arg_mode=args.arg_mode,
                                                arg_thres=args.arg_thres,
                                                arg_range=args.arg_range,
                                                dataset=args.train_dataset)
    train_loader_random = data.DataLoader(train_dataset_random,
                                          batch_size=args.batch_size,
                                          shuffle=True, num_workers=4)

    val_dataset = loader.CustomDataset(test_image_path, test_label_path,
                                       transform1, arg_mode=False,
                                       dataset=args.train_dataset)
    val_loader = data.DataLoader(val_dataset, batch_size=1,
                                 shuffle=False, num_workers=4)

    # 'JSRT' test dataset
    Train_test_dataset = loader.CustomDataset(test_image_path, test_label_path,
                                              transform1,
                                              dataset=args.train_dataset)
    Train_test_loader = data.DataLoader(Train_test_dataset, batch_size=1,
                                        shuffle=True, num_workers=4)

    # 2. test dataset paths
    # 'MC' test dataset
    test_data1_path, _ = loader.make_dataset(args.server,
                                             args.test_dataset1 + '_dataset',
                                             train_size=1)
    test_data1_dataset = loader.CustomDataset(test_data1_path[0],
                                              test_data1_path[1],
                                              transform1,
                                              dataset=args.test_dataset1)
    test_data1_loader = data.DataLoader(test_data1_dataset, batch_size=1,
                                        shuffle=True, num_workers=4)

    # 'sh' test dataset
    test_data2_path, _ = loader.make_dataset(args.server,
                                             args.test_dataset2 + '_dataset',
                                             train_size=1)
    test_data2_dataset = loader.CustomDataset(test_data2_path[0],
                                              test_data2_path[1],
                                              transform1,
                                              dataset=args.test_dataset2)
    test_data2_loader = data.DataLoader(test_data2_dataset, batch_size=1,
                                        shuffle=True, num_workers=0)

    test_data_list = [Train_test_loader, test_data1_loader, test_data2_loader]

    # np.save(os.path.join(work_dir, 'input_stats.npy'), train_dataset.input_stats)

    trn_logger = Logger(os.path.join(work_dir, 'train.log'))
    trn_raw_logger = Logger(os.path.join(work_dir, 'train_raw.log'))
    val_logger = Logger(os.path.join(work_dir, 'validation.log'))

    # 3. model select
    model_seg, model_name = model_select(args.arch_seg)
    model_ae, _ = model_select(args.arch_ae)

    # 4. gpu select
    model_seg = nn.DataParallel(model_seg).cuda()
    model_ae = nn.DataParallel(model_ae).cuda()
    cudnn.benchmark = True

    # 5. optim
    if args.optim == 'adam':
        optimizer_seg = torch.optim.Adam(model_seg.parameters(),
                                         lr=args.initial_lr)
        optimizer_ae = torch.optim.Adam(model_ae.parameters(),
                                        lr=args.initial_lr)
    elif args.optim == 'sgd':
        optimizer_seg = torch.optim.SGD(model_seg.parameters(),
                                        lr=args.initial_lr,
                                        weight_decay=args.weight_decay)
        optimizer_ae = torch.optim.SGD(model_ae.parameters(),
                                       lr=args.initial_lr,
                                       weight_decay=args.weight_decay)

    # if args.clip_grad:
    #     import torch.nn.utils as torch_utils
    #     max_grad_norm = 1.
    #     torch_utils.clip_grad_norm_(model_seg.parameters(), max_grad_norm)
    #     torch_utils.clip_grad_norm_(model_ae.parameters(), max_grad_norm)

    # lr decay
    lr_schedule = args.lr_schedule
    lr_scheduler_seg = optim.lr_scheduler.MultiStepLR(
        optimizer_seg, milestones=lr_schedule[:-1], gamma=args.gamma)
    lr_scheduler_ae = optim.lr_scheduler.MultiStepLR(
        optimizer_ae, milestones=lr_schedule[:-1], gamma=args.gamma)

    # 6. loss
    criterion_seg = loss_function_select(args.seg_loss_function)
    criterion_ae = loss_function_select(args.ae_loss_function)
    criterion_embedding = loss_function_select(args.embedding_loss_function)

    # train
    send_slack_message('#jm_private',
                       '{} : starting_training'.format(args.exp))
    best_iou = 0
    try:
        if args.train_mode:
            for epoch in range(lr_schedule[-1]):
                train(model_seg=model_seg,
                      model_ae=model_ae,
                      train_loader=train_loader,
                      train_loder_random=train_loader_random,  # sic: matches train()'s parameter name
                      optimizer_seg=optimizer_seg,
                      optimizer_ae=optimizer_ae,
                      criterion_seg=criterion_seg,
                      criterion_ae=criterion_ae,
                      criterion_embedding=criterion_embedding,
                      epoch=epoch,
                      logger=trn_logger,
                      sublogger=trn_raw_logger)
                iou = validate(model=model_seg,
                               val_loader=val_loader,
                               criterion=criterion_seg,
                               epoch=epoch,
                               logger=val_logger,
                               work_dir=work_dir,
                               save_fig=False,
                               work_dir_name='{}_visualize_per_epoch'.format(
                                   args.train_dataset))
                print('validation result ********************************')
                lr_scheduler_seg.step()
                lr_scheduler_ae.step()

                if args.val_size == 0:
                    is_best = 1
                else:
                    is_best = iou > best_iou
                best_iou = max(iou, best_iou)
                checkpoint_filename = 'model_checkpoint_{:0>3}.pth'.format(
                    epoch + 1)  # per-epoch name (currently unused)
                save_checkpoint({'epoch': epoch + 1,
                                 'state_dict': model_seg.state_dict(),
                                 'optimizer': optimizer_seg.state_dict()},
                                is_best, work_dir, filename='checkpoint.pth')
        print("train end")
    except RuntimeError as e:
        send_slack_message(
            '#jm_private',
            'error train: please message JM directly (KakaoTalk)\n'
            'error message : {}'.format(e))
        import ipdb
        ipdb.set_trace()

    draw_curve(work_dir, trn_logger, val_logger)
    send_slack_message('#jm_private', '{} : end_training'.format(args.exp))

    # Load the best checkpoint and record its epoch in a marker file.
    load_filename = os.path.join(work_dir, 'model_best.pth')
    checkpoint = torch.load(load_filename)
    ch_epoch = checkpoint['epoch']
    save_check_txt = os.path.join(work_dir, str(ch_epoch))
    f = open('{}_best_checkpoint.txt'.format(save_check_txt), 'w')
    f.close()

    # validation
    if args.test_mode:
        print('Test mode ...')
        main_test(model=model_seg, test_loader=test_data_list, args=args)
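# NOTE: `save_checkpoint` is not shown, but the later torch.load of
# 'model_best.pth' implies it copies the checkpoint aside whenever `is_best`
# is truthy. A minimal sketch under that assumption:
import os
import shutil
import torch

def save_checkpoint(state, is_best, work_dir, filename='checkpoint.pth'):
    """Save `state` to `filename`; copy it to 'model_best.pth' when best."""
    checkpoint_path = os.path.join(work_dir, filename)
    torch.save(state, checkpoint_path)
    if is_best:
        shutil.copyfile(checkpoint_path,
                        os.path.join(work_dir, 'model_best.pth'))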