Example #1
0
 def test_logger(self):
     """Tests the logger instantiation"""
     logger = log.get_logger(__file__)
     assert len(logger.handlers) == 2
     logger2 = log.get_logger(__file__)
     assert logger == logger2
     logger.handlers = list()
     logger = log.get_logger(__file__, "simple")
     assert len(logger.handlers) == 2
     logger.handlers = list()
     with pytest.raises(ValueError):
         assert log.get_logger(__file__, "wrong_formatter")
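
# A minimal sketch of the `log.get_logger` helper exercised by the test above,
# inferred only from its assertions (two handlers per logger, per-name caching,
# named formatters, ValueError on an unknown formatter name). This is an
# assumption for illustration, not the project's actual implementation.
import logging
import sys

_FORMATTERS = {
    "detailed": logging.Formatter("%(asctime)s %(name)s %(levelname)s: %(message)s"),
    "simple": logging.Formatter("%(levelname)s: %(message)s"),
}


def get_logger(name, formatter="detailed"):
    if formatter not in _FORMATTERS:
        raise ValueError("Unknown formatter: {!r}".format(formatter))
    log_obj = logging.getLogger(name)        # same name -> same logger instance
    if not log_obj.handlers:                 # attach the two handlers only once
        for handler in (logging.StreamHandler(sys.stdout),
                        logging.FileHandler("{}.log".format(name))):
            handler.setFormatter(_FORMATTERS[formatter])
            log_obj.addHandler(handler)
    return log_obj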
Example #2
0
    def __init__(self, config=None) -> None:
        self.__dict__.update(config)
        self.config = config
        self.logger = logger.get_logger('proto_{}'.format(self.global_id))

        super().__init__(self.epochs, self.eval_epoch, self.patience,
                         self.eval_tasks, self.batch_size, self.first_eval,
                         self.logger)
        self.device = torch.device(self.device)
        self.logger.info('current hp', config)
        self.logger_performance = logger.get_logger(
            'proto_{}'.format(self.global_id), 'valid.txt')
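
# Hypothetical usage of the constructor above: `self.__dict__.update(config)`
# copies every key of the config dict onto the instance, so attributes such as
# `global_id`, `epochs`, `eval_epoch`, `patience`, `eval_tasks`, `batch_size`,
# `first_eval`, and `device` must all be supplied by `config`. The class name
# below is an assumption; only the config shape is inferred from the code.
config = {
    'global_id': 0,
    'epochs': 100,
    'eval_epoch': 5,
    'patience': 10,
    'eval_tasks': 600,
    'batch_size': 4,
    'first_eval': 1,
    'device': 'cuda:0',
}
# trainer = ProtoTrainer(config)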
Example #3
0
    def check_config():
        """Checks if the default values are changed in the config and if some
        important requirements are satisfied"""
        logger = log.get_logger(__file__)

        correct_config = True

        if (
            "INSERT" in conf.JIRA_URL
            or "INSERT" in conf.JIRA_USER
            or "INSERT" in conf.JIRA_PASSWORD
        ):
            correct_config = False
            logger.critical(
                "Some of your Jira information is not yet configured, "
                "please change it."
            )

        if not os.path.ismount(conf.REPO_PATH):
            correct_config = False
            logger.critical(
                "Your munki repository is not mounted, please mount."
            )

        if not os.path.exists(conf.MAKECATALOGS):
            correct_config = False
            logger.critical("Your make catalogs path is wrong, please correct.")

        config_file_path = os.path.join(conf.LOG_DIR, conf.LOG_FILENAME)
        if not os.path.exists(config_file_path):
            correct_config = False
            logger.critical(
                f"The config file {config_file_path} does not exist, please "
                f"create it."
            )

        if conf.DRY_RUN:
            logger.warning(
                "The program is executed in dry run mode, no changes will be "
                "committed."
            )

        if not correct_config and not isinstance(conf, MunkiPromoterTestConfig):
            # If we are testing, we do not want to supply a complete
            # configuration; therefore we only raise the exception when
            # running in non-testing mode.
            raise ImproperlyConfigured()
Example #4
0
 def __init__(self,
              meta_epoch,
              valid_check_epoch,
              patience,
              valid_tasks,
              batch_size,
              first_eval=1,
              logger=logger.get_logger('base')) -> None:
     super().__init__()
     self.logger = logger
     self.timer = timer()
     self.timer.initialize(time.time(), 60 * 1000)
     self.meta_epoch = meta_epoch
     self.valid_check_epoch = valid_check_epoch
     self.patience = patience
     self.valid_tasks = valid_tasks
     self.batch_size = batch_size
     self.first_eval = first_eval
     self.data_augmentor = DataArgumentor(
     ) if self.use_data_augmentation else None
     self.turn_on_data_augmentor = False
Example #5
0
 def __init__(self, meta_epoch, valid_check_epoch, patience, valid_tasks,
              batch_size, first_eval=1,
              logger=logger.get_logger('base')) -> None:
     super().__init__()
     self.logger = logger
     self.timer = timer()
     self.timer.initialize(time.time(), 60 * 100)
     self.meta_epoch = meta_epoch
     self.valid_check_epoch = valid_check_epoch
     self.patience = patience
     self.valid_tasks = valid_tasks
     self.batch_size = batch_size
     self.first_eval = first_eval
     self.training_mode = 0
     self.training_stage = 0
     self.saving = False
Example #6
0
                    type=str,
                    default='info',
                    help='set logging level to store and print statements')
parser.add_argument('--seed',
                    type=int,
                    default=1234,
                    help='random seed to set for reproducibility')

args = parser.parse_args()

# setting up
set_seeds(args.seed)
set_warnings()
log_save_path = f'{args.out_folder}/{args.model_name}/{args.log_file}'.lower()
make_dir(log_save_path)
logger = get_logger(log_save_path, no_stdout=False, set_level=args.log_level)
device = get_device(args.use_cpu, args.cuda_device)

# suppress warnings
os.environ["TOKENIZERS_PARALLELISM"] = "false"


# functions
def plot(results, plot_save_path):
    if 'train_acc' in results.keys() and 'val_acc' in results.keys():
        plt.plot(results['train_acc'], label='train accuracy')
        plt.plot(results['val_acc'], label='validation accuracy')

        plt.title('Training results')
        plt.ylabel('Accuracy')
        plt.xlabel('Epoch')
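        # Plausible continuation (assumption: the snippet is cut off here and the
        # function is meant to finish the figure and write it to `plot_save_path`).
        plt.legend()
        plt.savefig(plot_save_path)
        plt.close()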
Example #7
0
def main():
    '''Parse Arguments'''
    parser = build_parser()
    args = parser.parse_args()
    '''Specify Seeds for reproducibility'''
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    '''Configs'''
    device = gpu_init_pytorch(args.gpu)

    mode = args.mode
    if mode == 'train':
        is_train = True
    else:
        is_train = False

    # ckpt= args.ckpt

    run_name = args.run_name
    args.log_path = os.path.join(log_folder, run_name)
    args.model_path = os.path.join(model_folder, run_name)
    args.board_path = os.path.join(board_path, run_name)
    args.outputs_path = os.path.join(outputs_folder, run_name)

    args_file = os.path.join(args.model_path, 'args.p')

    log_file = os.path.join(args.log_path, 'log.txt')

    if args.results:
        args.result_path = os.path.join(
            result_folder, 'val_results_{}.json'.format(args.dataset))

    logging_var = bool(args.logging)

    if is_train:
        create_save_directories(args.log_path)
        create_save_directories(args.model_path)
        create_save_directories(args.outputs_path)
    else:
        create_save_directories(args.log_path)
        create_save_directories(args.result_path)

    logger = get_logger(run_name, log_file, logging.DEBUG)

    logger.debug('Created Relevant Directories')
    logger.info('Experiment Name: {}'.format(args.run_name))

    if args.mt:

        vocab1_path = os.path.join(args.model_path, 'vocab1.p')
        vocab2_path = os.path.join(args.model_path, 'vocab2.p')

        if is_train:
            #pdb.set_trace()
            train_dataloader, val_dataloader = load_data(args, logger)

            logger.debug('Creating Vocab...')

            voc1 = Voc()
            voc1.create_vocab_dict(args, 'src', train_dataloader)

            # To Do : Remove Later
            voc1.add_to_vocab_dict(args, 'src', val_dataloader)

            voc2 = Voc()
            voc2.create_vocab_dict(args, 'trg', train_dataloader)

            # To Do : Remove Later
            voc2.add_to_vocab_dict(args, 'trg', val_dataloader)
            logger.info('Vocab Created with number of words : {}'.format(
                voc1.nwords))

            with open(vocab1_path, 'wb') as f:
                pickle.dump(voc1, f, protocol=pickle.HIGHEST_PROTOCOL)
            with open(vocab2_path, 'wb') as f:
                pickle.dump(voc2, f, protocol=pickle.HIGHEST_PROTOCOL)
            logger.info('Vocab saved at {}'.format(vocab1_path))

        else:
            test_dataloader = load_data(args, logger)
            logger.info('Loading Vocab File...')

            with open(vocab1_path, 'rb') as f:
                voc1 = pickle.load(f)
            with open(vocab2_path, 'rb') as f:
                voc2 = pickle.load(f)
            logger.info(
                'Vocab Files loaded from {}\nNumber of Words: {}'.format(
                    vocab1_path, voc1.nwords))

            # print('Done')

            # TO DO : Load Existing Checkpoints here
        checkpoint = get_latest_checkpoint(args.model_path, logger)
        '''Param Specs'''
        layers = args.layers
        heads = args.heads
        d_model = args.d_model
        d_ff = args.d_ff
        max_len = args.max_length
        dropout = args.dropout
        BATCH_SIZE = args.batch_size
        epochs = args.epochs

        if logging_var:
            meta_fname = os.path.join(args.log_path, 'meta.txt')
            loss_fname = os.path.join(args.log_path, 'loss.txt')

            meta_fh = open(meta_fname, 'w')
            loss_fh = open(loss_fname, 'w')

            print('Log Files created at: {}'.format(args.log_path))

            write_meta(args, meta_fh)
        """stime= time.time()
					print('Loading Data...')
					train, val, test, SRC, TGT = build_data()
					etime= (time.time()-stime)/60
					print('Data Loaded\nTime Taken:{}'.format(etime ))"""

        pad_idx = voc1.w2id['PAD']

        model = make_model(voc1.nwords,
                           voc2.nwords,
                           N=layers,
                           h=heads,
                           d_model=d_model,
                           d_ff=d_ff,
                           dropout=dropout)
        model.to(device)

        criterion = LabelSmoothing(size=voc2.nwords,
                                   padding_idx=pad_idx,
                                   smoothing=0.1)
        criterion.to(device)

        # train_iter = MyIterator(train, batch_size=BATCH_SIZE, device=device,
        # 						repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)),
        # 						batch_size_fn=batch_size_fn, train=True)

        # valid_iter = MyIterator(val, batch_size=BATCH_SIZE, device=device,
        # 						repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)),
        # 						batch_size_fn=batch_size_fn, train=False)

        if mode == 'train':
            model_opt = NoamOpt(
                model.src_embed[0].d_model, 1, 2000,
                torch.optim.Adam(model.parameters(),
                                 lr=0,
                                 betas=(0.9, 0.98),
                                 eps=1e-9))
            max_val_score = 0.0
            min_error_score = 100.0
            epoch_offset = 0
            for epoch in range(epochs):
                # pdb.set_trace()
                #if epoch%3==0:

                print('Training Epoch: ', epoch)
                model.train()
                run_epoch((rebatch(args, device, voc1, voc2, pad_idx, b)
                           for b in train_dataloader), model,
                          LossCompute(model.generator,
                                      criterion,
                                      device=device,
                                      opt=model_opt))
                model.eval()
                # loss = run_epoch((rebatch(args, device, voc1, voc2, pad_idx, b) for b in val_dataloader),
                #  				  model,
                #  				  LossCompute(model.generator, criterion, device=device, opt=None))
                # loss_str= "Epoch: {} \t Val Loss: {}\n".format(epoch,loss)
                # print(loss_str)

                refs = []
                hyps = []
                error_score = 0

                for i, batch in enumerate(val_dataloader):
                    sent1s = sents_to_idx(voc1, batch['src'], args.max_length)
                    sent2s = sents_to_idx(voc2, batch['trg'], args.max_length)
                    sent1_var, sent2_var, input_len1, input_len2 = process_batch(
                        sent1s, sent2s, voc1, voc2, device, voc1.id2w[pad_idx])

                    sent1s = idx_to_sents(voc1, sent1_var, no_eos=True)
                    sent2s = idx_to_sents(voc2, sent2_var, no_eos=True)

                    #pdb.set_trace()
                    # for l in range(len(batch['src'])):
                    # 	if len(batch['src'][l].split())!=9:
                    # 		print(l)

                    #for eg in range(sent1_var.size(0)):
                    src = sent1_var.transpose(0, 1)
                    src_mask = (src != voc1.w2id['PAD']).unsqueeze(-2)

                    #refs.append([' '.join(sent2s[eg])])
                    refs += [[' '.join(sent2s[i])]
                             for i in range(sent2_var.size(1))]

                    # pdb.set_trace()
                    out = greedy_decode(model,
                                        src,
                                        src_mask,
                                        max_len=60,
                                        start_symbol=voc2.w2id['<s>'],
                                        pad=pad_idx)

                    words = []

                    decoded_words = [[] for i in range(out.size(0))]
                    ends = []

                    #pdb.set_trace()

                    #print("Translation:", end="\t")
                    for z in range(1, out.size(1)):
                        for b in range(len(decoded_words)):
                            sym = voc2.id2w[out[b, z].item()]
                            if b not in ends:
                                if sym == "</s>":
                                    ends.append(b)
                                    continue
                                #print(sym, end =" ")
                                decoded_words[b].append(sym)

                    with open(args.outputs_path + '/outputs.txt',
                              'a') as f_out:
                        f_out.write('Batch: ' + str(i) + '\n')
                        f_out.write(
                            '---------------------------------------\n')
                        for z in range(len(decoded_words)):
                            try:
                                f_out.write('Example: ' + str(z) + '\n')
                                f_out.write('Source: ' + batch['src'][z] +
                                            '\n')
                                f_out.write('Target: ' + batch['trg'][z] +
                                            '\n')
                                f_out.write('Generated: ' +
                                            stack_to_string(decoded_words[z]) +
                                            '\n' + '\n')
                            except Exception:
                                logger.warning('Exception: Failed to generate')
                                pdb.set_trace()
                                break
                        f_out.write(
                            '---------------------------------------\n')
                        f_out.close()

                    hyps += [
                        ' '.join(decoded_words[z])
                        for z in range(len(decoded_words))
                    ]
                    #hyps.append(stack_to_string(words))

                    error_score += cal_score(decoded_words, batch['trg'])

                    #print()
                    #print("Target:", end="\t")
                    for z in range(1, sent2_var.size(0)):
                        sym = voc2.id2w[sent2_var[z, 0].item()]
                        if sym == "</s>": break
                        #print(sym, end =" ")
                    #print()
                    #break

                val_bleu_epoch = bleu_scorer(refs, hyps)
                print('Epoch: {}  Val bleu: {}'.format(epoch,
                                                       val_bleu_epoch[0]))
                print('Epoch: {}  Val Error: {}'.format(
                    epoch, error_score / len(val_dataloader)))

                # if logging_var:
                # 	loss_fh.write(loss_str)
                if epoch % 10 == 0:
                    ckpt_path = os.path.join(args.model_path, 'model.pt')
                    logger.info('Saving Checkpoint at : {}'.format(ckpt_path))
                    torch.save(model.state_dict(), ckpt_path)
                    print('Model saved at: {}'.format(ckpt_path))

        else:
            model.load_state_dict(torch.load(args.model_path))
            model.eval()

        # pdb.set_trace()
        # for i, batch in enumerate(val_dataloader):
        # 	sent1s = sents_to_idx(voc1, batch['src'], args.max_length)
        # 	sent2s = sents_to_idx(voc2, batch['trg'], args.max_length)
        # 	sent1_var, sent2_var, input_len1, input_len2  = process_batch(sent1s, sent2s, voc1, voc2, device)
        # 	src = sent1_var.transpose(0, 1)[:1]
        # 	src_mask = (src != voc1.w2id['PAD']).unsqueeze(-2)
        # 	out = greedy_decode(model, src, src_mask, max_len=max_len, start_symbol=voc2.w2id['<s>'])
        # 	print("Translation:", end="\t")
        # 	for i in range(1, out.size(1)):
        # 		sym = voc2.id2w[out[0, i].item()]
        # 		if sym == "</s>": break
        # 		print(sym, end =" ")
        # 	print()
        # 	print("Target:", end="\t")
        # 	for i in range(1, sent2_var.size(0)):
        # 		sym = voc2.id2w[sent2_var[i, 0].item()]
        # 		if sym == "</s>": break
        # 		print(sym, end =" ")
        # 	print()
        # 	break

    else:
        '''
        Code for Synthetic Data
        '''
        vocab_path = os.path.join(args.model_path, 'vocab.p')

        if is_train:
            #pdb.set_trace()
            train_dataloader, val_dataloader = load_data(args, logger)

            logger.debug('Creating Vocab...')

            voc = Syn_Voc()
            voc.create_vocab_dict(args, train_dataloader)

            # To Do : Remove Later
            voc.add_to_vocab_dict(args, val_dataloader)

            logger.info('Vocab Created with number of words : {}'.format(
                voc.nwords))

            with open(vocab_path, 'wb') as f:
                pickle.dump(voc, f, protocol=pickle.HIGHEST_PROTOCOL)

            logger.info('Vocab saved at {}'.format(vocab_path))

        else:
            test_dataloader = load_data(args, logger)
            logger.info('Loading Vocab File...')

            with open(vocab_path, 'rb') as f:
                voc = pickle.load(f)

            logger.info(
                'Vocab Files loaded from {}\nNumber of Words: {}'.format(
                    vocab_path, voc.nwords))

            # print('Done')

            # TO DO : Load Existing Checkpoints here
        # checkpoint = get_latest_checkpoint(args.model_path, logger)
        '''Param Specs'''
        layers = args.layers
        heads = args.heads
        d_model = args.d_model
        d_ff = args.d_ff
        max_len = args.max_length
        dropout = args.dropout
        BATCH_SIZE = args.batch_size
        epochs = args.epochs

        if logging_var:
            meta_fname = os.path.join(args.log_path, 'meta.txt')
            loss_fname = os.path.join(args.log_path, 'loss.txt')

            meta_fh = open(meta_fname, 'w')
            loss_fh = open(loss_fname, 'w')

            print('Log Files created at: {}'.format(args.log_path))

            write_meta(args, meta_fh)
        """stime= time.time()
					print('Loading Data...')
					train, val, test, SRC, TGT = build_data()
					etime= (time.time()-stime)/60
					print('Data Loaded\nTime Taken:{}'.format(etime ))"""

        pad_idx = voc.w2id['PAD']

        model = make_model(voc.nwords,
                           voc.nwords,
                           N=layers,
                           h=heads,
                           d_model=d_model,
                           d_ff=d_ff,
                           dropout=dropout)
        model.to(device)

        logger.info('Initialized Model')

        criterion = LabelSmoothing(size=voc.nwords,
                                   padding_idx=pad_idx,
                                   smoothing=0.1)
        criterion.to(device)

        # train_iter = MyIterator(train, batch_size=BATCH_SIZE, device=device,
        # 						repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)),
        # 						batch_size_fn=batch_size_fn, train=True)

        # valid_iter = MyIterator(val, batch_size=BATCH_SIZE, device=device,
        # 						repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)),
        # 						batch_size_fn=batch_size_fn, train=False)

        if mode == 'train':
            model_opt = NoamOpt(
                model.src_embed[0].d_model, 1, 3000,
                torch.optim.Adam(model.parameters(),
                                 lr=0,
                                 betas=(0.9, 0.98),
                                 eps=1e-9))
            max_bleu_score = 0.0
            min_error_score = 100.0
            epoch_offset = 0
            logger.info('Starting Training Procedure')
            for epoch in range(epochs):
                # pdb.set_trace()
                #if epoch%3==0:

                print('Training Epoch: ', epoch)
                model.train()
                start_time = time.time()
                run_epoch((rebatch(args, device, voc, voc, pad_idx, b)
                           for b in train_dataloader), model,
                          LossCompute(model.generator,
                                      criterion,
                                      device=device,
                                      opt=model_opt))

                time_taken = (time.time() - start_time) / 60.0
                logger.debug(
                    'Training for epoch {} completed...\nTime Taken: {}'.
                    format(epoch, time_taken))
                logger.debug('Starting Validation')

                model.eval()
                # loss = run_epoch((rebatch(args, device, voc1, voc2, pad_idx, b) for b in val_dataloader),
                #  				  model,
                #  				  LossCompute(model.generator, criterion, device=device, opt=None))
                # loss_str= "Epoch: {} \t Val Loss: {}\n".format(epoch,loss)
                # print(loss_str)

                refs = []
                hyps = []
                error_score = 0

                for i, batch in enumerate(val_dataloader):
                    sent1s = sents_to_idx(voc, batch['src'], args.max_length)
                    sent2s = sents_to_idx(voc, batch['trg'], args.max_length)
                    sent1_var, sent2_var, input_len1, input_len2 = process_batch(
                        sent1s, sent2s, voc, voc, device, voc.id2w[pad_idx])

                    sent1s = idx_to_sents(voc, sent1_var, no_eos=True)
                    sent2s = idx_to_sents(voc, sent2_var, no_eos=True)

                    # pdb.set_trace()
                    # for l in range(len(batch['src'])):
                    # 	if len(batch['src'][l].split())!=9:
                    # 		print(l)

                    #for eg in range(sent1_var.size(0)):
                    src = sent1_var.transpose(0, 1)

                    ### FOR NON-DIRECTIONAL ###
                    # src_mask = (src != voc.w2id['PAD']).unsqueeze(-2)

                    ### FOR DIRECTIONAL ###
                    src_mask = make_std_mask(src, pad_idx)
                    src_mask_bi = make_bi_std_mask(src, pad_idx)
                    src_mask_dec = (src != voc.w2id['PAD']).unsqueeze(-2)
                    #refs.append([' '.join(sent2s[eg])])
                    # refs += [[' '.join(sent2s[i])] for i in range(sent2_var.size(1))]
                    refs += [[x] for x in batch['trg']]

                    out = greedy_decode(model,
                                        src,
                                        src_mask,
                                        max_len=max_len,
                                        start_symbol=voc.w2id['<s>'],
                                        pad=pad_idx,
                                        src_mask_dec=src_mask_dec,
                                        src_mask_bi=src_mask_bi)

                    words = []

                    decoded_words = [[] for i in range(out.size(0))]
                    ends = []

                    # pdb.set_trace()

                    #print("Translation:", end="\t")
                    for z in range(1, out.size(1)):
                        for b in range(len(decoded_words)):
                            sym = voc.id2w[out[b, z].item()]
                            if b not in ends:
                                if sym == "</s>":
                                    ends.append(b)
                                    continue
                                #print(sym, end =" ")
                                decoded_words[b].append(sym)

                    with open(args.outputs_path + '/outputs.txt',
                              'a') as f_out:
                        f_out.write('Batch: ' + str(i) + '\n')
                        f_out.write(
                            '---------------------------------------\n')
                        for z in range(len(decoded_words)):
                            try:
                                f_out.write('Example: ' + str(z) + '\n')
                                f_out.write('Source: ' + batch['src'][z] +
                                            '\n')
                                f_out.write('Target: ' + batch['trg'][z] +
                                            '\n')
                                f_out.write('Generated: ' +
                                            stack_to_string(decoded_words[z]) +
                                            '\n' + '\n')
                            except Exception:
                                logger.warning('Exception: Failed to generate')
                                pdb.set_trace()
                                break
                        f_out.write(
                            '---------------------------------------\n')
                        f_out.close()

                    hyps += [
                        ' '.join(decoded_words[z])
                        for z in range(len(decoded_words))
                    ]
                    #hyps.append(stack_to_string(words))

                    if args.ap:
                        error_score += cal_score_AP(decoded_words,
                                                    batch['trg'])
                    else:
                        error_score += cal_score(decoded_words, batch['trg'])

                    #print()
                    #print("Target:", end="\t")
                    for z in range(1, sent2_var.size(0)):
                        sym = voc.id2w[sent2_var[z, 0].item()]
                        if sym == "</s>": break
                        #print(sym, end =" ")
                    #print()
                    #break

                if (error_score / len(val_dataloader)) < min_error_score:
                    min_error_score = error_score / len(val_dataloader)

                val_bleu_epoch = bleu_scorer(refs, hyps)

                if max_bleu_score < val_bleu_epoch[0]:
                    max_bleu_score = val_bleu_epoch[0]

                logger.info('Epoch: {}  Val bleu: {}'.format(
                    epoch, val_bleu_epoch[0]))
                logger.info('Maximum Bleu: {}'.format(max_bleu_score))
                logger.info('Epoch: {}  Val Error: {}'.format(
                    epoch, error_score / len(val_dataloader)))
                logger.info('Minimum Error: {}'.format(min_error_score))

                # if logging_var:
                # 	loss_fh.write(loss_str)
                if epoch % 5 == 0:
                    ckpt_path = os.path.join(args.model_path, 'model.pt')
                    logger.info('Saving Checkpoint at : {}'.format(ckpt_path))
                    torch.save(model.state_dict(), ckpt_path)
                    print('Model saved at: {}'.format(ckpt_path))

            store_results(args, max_bleu_score, min_error_score)
            logger.info('Scores saved at {}'.format(args.result_path))

        else:
            model.load_state_dict(torch.load(args.model_path))
            model.eval()
Example #8
0
#  Made with ❤️ in Basel
#
#  Copyright (c) 2019 University of Basel
#  Last modified 16/07/2019, 12:55.
#
#  Developed by Tom Cinbis and Tim Königl on 16/07/2019, 13:03

from datetime import datetime
from typing import Dict

from src.core.base_classes import Package
from src.utils import logger as log
from src.utils.config import Catalog, PackageState, JiraLane, Present
from src.utils.config import conf

logger = log.get_logger(__file__)


class Promoter:
    """
    The main class for the promotion logic of the program.
    The munki and Jira packages are compared, and the munki packages are updated
    according to the state of the Jira packages. Additionally, automatic catalog
    transitions are performed when the right criteria are fulfilled.
    """

    def __init__(self, munki_packages: Dict, jira_packages: Dict):
        """
        Initializes a promoter object which contains the munki and the jira packages.

        :param munki_packages: `Dict` the munki packages
Example #9
0
#!/usr/bin/env python
import faust

from src.config import Config
from src.utils.logger import get_logger

logger = get_logger('app')

config = Config()

app = faust.App('compute',
                broker=config.KAFKA_BROKER_URL,
                debug=True,
                web_port=config.WEB_PORT,
                autodiscover=True,
                origin='src')
config.init_app(app)

if __name__ == '__main__':
    app.main()
Example #10
0
 def _get_logger(log_file_name, file_dir, config):
     if log_file_name:
         return get_logger(log_file_name, file_dir, config)
     return None
Example #11
0
def main():
    # =========================================
    # === Settings
    # =========================================
    # get logger
    logger = get_logger(out_file="ensemble.log")

    logger.info("=== file path ===")
    # set model
    oof_1_path = "./data/output/20190930_hmdhmd/20190930_hmdhmd_oof.csv"
    pred_1_path = "./data/output/20190930_hmdhmd/20190930_hmdhmd_pred.csv"
    logger.info(f"hmd model - oof: {oof_1_path}")
    logger.info(f"hmd model - pred: {pred_1_path}")

    oof_2_path = "./data/output/20191001_ML_Bear/OOF_20190930_ModelAvg_based_on_LB09578_20190930_03_full_model_01_oof09565_pub09886_pri09882.csv"
    pred_2_path = "./data/output/20191001_ML_Bear/PRED_20190930_ModelAvg_based_on_LB09578_20190930_03_full_model_01_oof09565_pub09886_pri09882.csv"
    logger.info(f"bear model - oof: {oof_2_path}")
    logger.info(f"bear model - pred: {pred_2_path}")

    oof_3_path = "./data/output/model_25/oof_preds.npy"
    pred_3_path = "./data/output/model_25/submission.csv"
    logger.info(f"hakubishin model - oof: {oof_3_path}")
    logger.info(f"hakubishin model - pred: {pred_3_path}")

    oof_4_path = "./data/output/20190930_holygo/20190930_2032__train_oof_holygo_CV0-9592479__LB0.9594.csv"
    pred_4_path = "./data/output/20190930_holygo/20190930_2032__test_pred_holygo_CV0-9592479__LB0.9594.csv"
    logger.info(f"holygo model - oof: {oof_4_path}")
    logger.info(f"holygo model - pred: {pred_4_path}")

    # load data
    oof_1 = pd.read_csv(oof_1_path).sort_values("TransactionID")["isFraud"].values
    oof_2 = pd.read_csv(oof_2_path).sort_values("TransactionID")["isFraud"].values
    oof_3 = np.load(oof_3_path)
    oof_4 = pd.read_csv(oof_4_path).sort_values("TransactionID").iloc[:len(oof_3)]["isFraud"].values

    pred_1 = pd.read_csv(pred_1_path).sort_values("TransactionID").reset_index(drop=True)
    pred_2 = pd.read_csv(pred_2_path).sort_values("TransactionID").reset_index(drop=True)
    pred_3 = pd.read_csv(pred_3_path).sort_values("TransactionID").reset_index(drop=True)
    pred_4 = pd.read_csv(pred_4_path).sort_values("TransactionID").reset_index(drop=True)

    # =========================================
    # === data loading
    # =========================================
    train = pd.read_csv('./data/input/train.csv')
    # test = pd.read_csv('./data/input/test.csv')
    y_train = train["isFraud"].values

    # =========================================
    # === check score
    # =========================================
    logger.info("=== check score ===")
    def calc_bear_score(df):
        df_probing = pd.read_csv('data/interim/probing_toolbox/old/probing.csv').loc[:, ['TransactionID', 'data_type', 'Probing_isFraud']]
        df = pd.merge(df_probing, df, on='TransactionID', how='left')
        # test public score
        public_score = roc_auc_score(
            df[df.data_type=="test_public"]['Probing_isFraud'],
            df[df.data_type=="test_public"]['isFraud']
        )
        # test private score
        private_score = roc_auc_score(
            df[df.data_type=="test_private"]['Probing_isFraud'],
            df[df.data_type=="test_private"]['isFraud']
        )
        return public_score, private_score

    cv = roc_auc_score(y_train, oof_1)
    pub, prv = calc_bear_score(pred_1)
    logger.info(f"hmd model: cv{cv}, pub{pub}, prv{prv}")

    cv = roc_auc_score(y_train, oof_2)
    pub, prv = calc_bear_score(pred_2)
    logger.info(f"bear model: cv{cv}, pub{pub}, prv{prv}")

    cv = roc_auc_score(y_train, oof_3)
    pub, prv = calc_bear_score(pred_3)
    logger.info(f"hakubishin model: cv{cv}, pub{pub}, prv{prv}")

    cv = roc_auc_score(y_train, oof_4)
    pub, prv = calc_bear_score(pred_4)
    logger.info(f"holygo model: cv{cv}, pub{pub}, prv{prv}")

    # =========================================
    # === user info
    # =========================================
    logger.info("=== user info ===")
    thres = 2
    logger.info(f"user count thres: {thres}")
    predicted_user = pd.read_csv('./data/interim/20190901_user_ids_share.csv').sort_values("TransactionID").reset_index(drop=True)
    user_count = predicted_user["predicted_user_id"].value_counts()
    target_user_id = user_count[user_count <= thres].index.tolist()
    train_predicted_user = predicted_user.iloc[:len(oof_3)]
    train_target_df = train_predicted_user.query("predicted_user_id in @target_user_id")
    train_target_index = train_target_df.index

    cv = roc_auc_score(y_train[train_target_index], oof_1[train_target_index])
    logger.info(f"hmd model: cv{cv}")
    cv = roc_auc_score(y_train[train_target_index], oof_2[train_target_index])
    logger.info(f"bear model: cv{cv}")
    cv = roc_auc_score(y_train[train_target_index], oof_3[train_target_index])
    logger.info(f"hakubishin model: cv{cv}")
    cv = roc_auc_score(y_train[train_target_index], oof_4[train_target_index])
    logger.info(f"holygo model: cv{cv}")

    # =========================================
    # === hand made
    # =========================================
    logger.info("=== hand made ===")

    sub = pred_3.copy()
    #x_opt = [0.10, 0.25, 0.55, 0.10]
    x_opt = [0.050, 0.226, 0.6725, 0.0515]
    logger.info(f"rate: {x_opt}") 
    oof = oof_1 * x_opt[0] + oof_2 * x_opt[1] + oof_3 * x_opt[2] + oof_4 * x_opt[3]
    cv = roc_auc_score(y_train[train_target_index], oof[train_target_index])
    logger.info(f"ensemble model: cv{cv}")

    sub["isFraud"] = pred_1["isFraud"] * x_opt[0] + pred_2["isFraud"] * x_opt[1] + pred_3["isFraud"] * x_opt[2] + pred_4["isFraud"] * x_opt[3]
    pub, prv = calc_bear_score(sub)
    logger.info(f"ensemble model: pub{pub}, prv{prv}")
    sub.to_csv("sub_avg.csv",header=True,index=False)
    import pdb; pdb.set_trace()

    # override probing value and save
    df_probing = pd.read_csv('data/interim/probing_toolbox/20190929_probing.csv').loc[:, ['TransactionID', 'data_type', 'Probing_isFraud']]
    sub = pd.merge(sub, df_probing, on="TransactionID", how="left")
    # override only probing_isfraud = 1
    sub.loc[sub.Probing_isFraud == 1, "isFraud"] = 1
    sub = sub[["TransactionID", "isFraud"]]
    pub, prv = calc_bear_score(sub)
    logger.info(f"ensemble model after override proving value: pub{pub}, prv{prv}")
    sub.to_csv("sub_avg.csv",header=True,index=False)

    # =========================================
    # === optimize
    # =========================================
    logger.info("=== optimize ===")
    sub = pred_3.copy()

    def f(x):
        x0 = x[:, 0]
        x1 = x[:, 1]
        x2 = x[:, 2]
        x3 = x[:, 3]

        sub["isFraud"] = pred_1["isFraud"] * x0 + pred_2["isFraud"] * x1 + pred_3["isFraud"] * x2 + pred_4["isFraud"] * x3
        public_score, private_score = calc_bear_score(sub)

        oof = oof_1 * x0 + oof_2 * x1 + oof_3 * x2 + oof_4 * x3
        cv = roc_auc_score(y_train[train_target_index], oof[train_target_index])

        opt_value = -1 * private_score
        # opt_value = -1 * (private_score + public_score + cv)

        return opt_value

    bounds = [
        {'name': 'x0', 'type': 'continuous', 'domain': (0.05, 1)},
        {'name': 'x1', 'type': 'continuous', 'domain': (0.05, 1)},
        {'name': 'x2', 'type': 'continuous', 'domain': (0.05, 1)},
        {'name': 'x3', 'type': 'continuous', 'domain': (0.05, 1)},
    ]

    constraints = [
        {
            'name': 'constr_1',
            'constraint': '(x[:,0] + x[:,1] + x[:,2] + x[:,3]) - 1 - 0.001'
        },
        {
            'name': 'constr_2',
            'constraint': '1 - (x[:,0] + x[:,1] + x[:,2] + x[:,3]) - 0.001'
        }
    ]

    myBopt = GPyOpt.methods.BayesianOptimization(f=f, domain=bounds, constraints=constraints)
    myBopt.run_optimization(max_iter=30)
    logger.info(f"rate: {myBopt.x_opt}") 
    logger.info(f"value: {myBopt.fx_opt}")

    # check oof
    oof = oof_1 * myBopt.x_opt[0] + oof_2 * myBopt.x_opt[1] + oof_3 * myBopt.x_opt[2] + oof_4 * myBopt.x_opt[3]
    cv = roc_auc_score(y_train[train_target_index], oof[train_target_index])
    logger.info(f"ensemble model: cv{cv}")

    # make submission file
    sub = pred_3.copy()
    sub["isFraud"] = pred_1["isFraud"] * myBopt.x_opt[0] + pred_2["isFraud"] * myBopt.x_opt[1] + pred_3["isFraud"] * myBopt.x_opt[2] + pred_4["isFraud"] * myBopt.x_opt[3]
    pub, prv = calc_bear_score(sub)
    logger.info(f"ensemble model: pub{pub}, prv{prv}")
    import pdb; pdb.set_trace()

    # override probing value and save
    df_probing = pd.read_csv('data/interim/probing_toolbox/20190929_probing.csv').loc[:, ['TransactionID', 'data_type', 'Probing_isFraud']]
    sub = pd.merge(sub, df_probing, on="TransactionID", how="left")
    # override only probing_isfraud = 1
    sub.loc[sub.Probing_isFraud == 1, "isFraud"] = 1
    sub = sub[["TransactionID", "isFraud"]]
    pub, prv = calc_bear_score(sub)
    logger.info(f"ensemble model after override proving value: pub{pub}, prv{prv}")
    sub.to_csv("sub_avg.csv",header=True,index=False)
Example #12
0
def parse_args():
    parser = configargparse.ArgumentParser(
        description='Training Wikinet 2',
        formatter_class=configargparse.ArgumentDefaultsHelpFormatter)
    # General
    general = parser.add_argument_group('General Settings.')
    general.add_argument('--my-config',
                         required=True,
                         is_config_file=True,
                         help='config file path')
    general.add_argument('--exp_name',
                         type=str,
                         default="debug",
                         help="Experiment name")
    general.add_argument("--debug",
                         type=str2bool,
                         default=True,
                         help="whether to debug")

    # Data
    data = parser.add_argument_group('Data Settings.')
    data.add_argument('--data_path',
                      required=True,
                      type=str,
                      help='location of data dir')
    data.add_argument(
        '--data_type',
        type=str,
        help='name of train dataset; a directory of this name should contain '
        'training data generated using gen_train_data.py')
    data.add_argument('--train_size',
                      type=int,
                      help='number of training abstracts')
    data.add_argument('--data_types',
                      type=str,
                      help='name of datasets separated by comma')

    # Max Padding
    padding = parser.add_argument_group('Max Padding for batch.')
    padding.add_argument('--max_context_size',
                         type=int,
                         help='max number of context')
    padding.add_argument('--max_ent_size',
                         type=int,
                         help='max number of entities considered in abstract')

    # Model Type
    model_selection = parser.add_argument_group('Type of model to train.')
    model_selection.add_argument(
        '--pre_train',
        type=str,
        help='if specified, model will load state dict, must be ckpt')

    # Model params
    model_params = parser.add_argument_group("Parameters for chosen model.")
    model_params.add_argument('--dp', type=float, help='drop out')
    model_params.add_argument('--hidden_size',
                              type=int,
                              help='size of hidden layer in yamada model')

    # Candidate Generation
    candidate = parser.add_argument_group('Candidate generation.')
    candidate.add_argument("--num_candidates",
                           type=int,
                           default=32,
                           help="Total number of candidates")
    candidate.add_argument("--prop_gen_candidates",
                           type=float,
                           default=0.5,
                           help="Proportion of candidates generated")

    # Training
    training = parser.add_argument_group("Training parameters.")
    training.add_argument("--num_epochs",
                          type=int,
                          default=5,
                          help="Number of epochs")
    training.add_argument("--save_every",
                          type=int,
                          default=5,
                          help="how often to checkpoint")
    training.add_argument("--patience",
                          type=int,
                          default=5,
                          help="Patience for early stopping")
    training.add_argument("--batch_size",
                          type=int,
                          default=32,
                          help="Batch size")
    training.add_argument("--num_workers",
                          type=int,
                          default=4,
                          help="number of workers for data loader")
    training.add_argument('--lr', type=float, help='learning rate')
    training.add_argument('--wd', type=float, help='weight decay')
    training.add_argument('--embs_optim',
                          type=str,
                          choices=['adagrad', 'adam', 'rmsprop', 'sparseadam'],
                          help='optimizer for embeddings')
    training.add_argument(
        '--other_optim',
        type=str,
        choices=['adagrad', 'adam', 'rmsprop'],
        help='optimizer for parameters that are not embeddings')
    training.add_argument('--sparse', type=str2bool, help='sparse gradients')

    # cuda
    parser.add_argument("--device", type=str, help="cuda device")
    parser.add_argument("--use_cuda", type=str2bool, help="use gpu or not")
    parser.add_argument("--profile",
                        type=str2bool,
                        help="if set will run profiler on dataloader and exit")

    args = parser.parse_args()
    logger = get_logger(args)

    if args.wd > 0:
        assert not args.sparse

    if args.use_cuda:
        devices = args.device.split(",")
        if len(devices) > 1:
            devices = tuple([int(device) for device in devices])
        else:
            devices = int(devices[0])
        args.__dict__['device'] = devices

    logger.info("Experiment Parameters:")
    print()
    for arg in sorted(vars(args)):
        logger.info('{:<15}\t{}'.format(arg, getattr(args, arg)))

    model_date_dir = join(args.data_path, 'models',
                          '{}'.format(datetime.now().strftime("%Y_%m_%d")))
    if not os.path.exists(model_date_dir):
        os.makedirs(model_date_dir)
    model_dir = join(model_date_dir, args.exp_name)
    args.__dict__['model_dir'] = model_dir
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    return args, logger, model_dir
Example #13
0
import json
import os

import aiohttp
import numpy as np

from src.app import app, config
from src.models.faust_dao import State, CalculationItem

from src.utils.logger import get_logger

logger = get_logger('zz-compute-agents')


@app.agent(config.topics['model-tasks-do'])
async def compute_agent(events):
    async for event in events:
        task = event.task
        print(f'MYAGENT RECEIVED -- {task!r}')
        task.state = State.IN_PROGRESS.value
        await config.topics['model-tasks-done'].send(value=task)
        try:
            async with aiohttp.ClientSession() as session:
                compute_closure = compute(session, task)
                if not event.runner_code:
                    outputs = await compute_closure(task.data)
                else:
                    _locals = {'klass': None}
                    exec(event.runner_code, {
                        '__builtins__': __builtins__,
                        'np': np
Example #14
0
import signal

SIGNALS_TO_NAMES_DICT = dict((getattr(signal, n), n) \
    for n in dir(signal) if n.startswith('SIG') and '_' not in n )


def receive_signal(signum, stack):
    if signum in [1, 2, 3, 15]:
        print('Caught signal %s (%s), exiting.' %
              (SIGNALS_TO_NAMES_DICT[signum], str(signum)))
    else:
        print('Caught signal %s (%s), ignoring.' %
              (SIGNALS_TO_NAMES_DICT[signum], str(signum)))
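
# Hypothetical wiring of the handler above (not shown in the original snippet):
# register `receive_signal` for the signals the first branch treats as exiting
# (1=SIGHUP, 2=SIGINT, 3=SIGQUIT, 15=SIGTERM).
for _sig in (signal.SIGHUP, signal.SIGINT, signal.SIGQUIT, signal.SIGTERM):
    signal.signal(_sig, receive_signal)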


LOGGER = logger.get_logger('main-thread')

GLOBAL_CONFIG = {}


def predict(learner, episode_queue, total_task=200, res='cpu', kargs=None):
    kargs = kargs or {}
    time1 = time.time()
    if 'time_fired' in kargs:
        LOGGER.debug(kargs['taskid'], 'fired time',
                     time1 - kargs['time_fired'])
    # LOGGER.info('task id', kargs['taskid'], 'device', res)
    device = torch.device(res)
    learner.to(device)
    result = []
    for i in range(total_task):
        if isinstance(episode_queue, list):
Example #15
0
def main():
	'''read arguments'''
	parser = build_parser()
	args = parser.parse_args()
	config = args
	mode = config.mode
	if mode == 'train':
		is_train = True
	else:
		is_train = False

	''' Set seed for reproducibility'''
	np.random.seed(config.seed)
	torch.manual_seed(config.seed)
	random.seed(config.seed)

	'''GPU initialization'''
	device = gpu_init_pytorch(config.gpu)
	#device = 'cpu'
	'''Run Config files/paths'''
	run_name = config.run_name
	config.log_path = os.path.join(log_folder, run_name)
	config.model_path = os.path.join(model_folder, run_name)
	config.board_path = os.path.join(board_path, run_name)

	vocab_path = os.path.join(config.model_path, 'vocab.p')
	config_file = os.path.join(config.model_path, 'config.p')
	log_file = os.path.join(config.log_path, 'log.txt')

	if config.results:
		config.result_path = os.path.join(result_folder, 'val_results_{}.json'.format(config.dataset))

	if is_train:
		create_save_directories(config.log_path, config.model_path)
	else:
		create_save_directories(config.log_path, config.result_path)

	logger = get_logger(run_name, log_file, logging.DEBUG)
	writer = SummaryWriter(config.board_path)

	logger.debug('Created Relevant Directories')
	logger.info('Experiment Name: {}'.format(config.run_name))

	'''Read Files and create/load Vocab'''
	if is_train:

		logger.debug('Creating Vocab and loading Data ...')
		train_loader, val_loader_bins, voc  = load_data(config, logger)

		logger.info(
			'Vocab Created with number of words : {}'.format(voc.nwords))		

		with open(vocab_path, 'wb') as f:
			pickle.dump(voc, f, protocol=pickle.HIGHEST_PROTOCOL)
		logger.info('Vocab saved at {}'.format(vocab_path))



	else:
		logger.info('Loading Vocab File...')

		with open(vocab_path, 'rb') as f:
			voc = pickle.load(f)

		logger.info('Vocab Files loaded from {}'.format(vocab_path))

		logger.info("Loading Test Dataloaders...")
		config.batch_size = 1
		test_loader_bins = load_data(config, logger, voc)
		logger.info("Done loading test dataloaders")

	# print('Done')

	# TO DO : Load Existing Checkpoints here


	if is_train:
		
		max_val_acc = 0.0
		epoch_offset= 0


		if config.load_model:
			checkpoint = get_latest_checkpoint(config.model_path, logger)
			if checkpoint:
				ckpt = torch.load(checkpoint, map_location=lambda storage, loc: storage)
				#config.lr = checkpoint['lr']
				model = build_model(config=config, voc=voc, device=device, logger=logger)
				model.load_state_dict(ckpt['model_state_dict'])
				model.optimizer.load_state_dict(ckpt['optimizer_state_dict'])
		else:
			model = build_model(config=config, voc=voc, device=device, logger=logger)
		# pdb.set_trace()

		logger.info('Initialized Model')

		with open(config_file, 'wb') as f:
			pickle.dump(vars(config), f, protocol=pickle.HIGHEST_PROTOCOL)

		logger.debug('Config File Saved')

		logger.info('Starting Training Procedure')
		train_model(model, train_loader, val_loader_bins, voc,
					device, config, logger, epoch_offset, max_val_acc, writer)

	else:

		gpu = config.gpu

		with open(config_file, 'rb') as f:
			bias = config.bias
			extraffn = config.extraffn
			config = AttrDict(pickle.load(f))
			config.gpu = gpu
			config.bins = len(test_loader_bins)
			config.batch_size = 1
			config.bias = bias
			config.extraffn = extraffn
			# To do: remove it later
			#config.num_labels =2  

		model = build_model(config=config, voc=voc, device=device, logger=logger)
		checkpoint = get_latest_checkpoint(config.model_path, logger)
		ep_offset, train_loss, score, voc = load_checkpoint(
			model, config.mode, checkpoint, logger, device, bins = config.bins)

		logger.info('Prediction from')
		od = OrderedDict()
		od['epoch'] = ep_offset
		od['train_loss'] = train_loss
		if config.bins != -1:
			for i in range(config.bins):
				od['max_val_acc_bin{}'.format(i)] = score[i]
		else:
			od['max_val_acc'] = score
		print_log(logger, od)
		pdb.set_trace()
		#test_acc_epoch, test_loss_epoch = run_validation(config, model, test_loader, voc, device, logger)
		#test_analysis_dfs = []
		for i in range(config.bins):
			test_acc_epoch, test_analysis_df = run_test(config, model, test_loader_bins[i], voc, device, logger)
			logger.info('Bin {} Accuracy: {}'.format(i, test_acc_epoch))
			#test_analysis_dfs.append(test_analysis_df)
			test_analysis_df.to_csv(os.path.join(result_folder, '{}_{}_test_analysis_bin{}.csv'.format(config.dataset, config.model_type, i)))
		logger.info("Analysis results written to {}...".format(result_folder))
Example #16
0
from sklearn import clone
from sklearn.model_selection import GridSearchCV, GroupKFold

import pandas as pd
import numpy as np

from os.path import join

from src.evaluation.classification import evaluate_fold
from src.models.base import ModelBase
from src.utils.logger import get_logger
from src.utils.misc import randomised_order

logger = get_logger(__name__)

__all__ = [
    'sklearn_model',
]


def select_fold(key, folds, fold_name):
    assert fold_name in folds.columns
    fold_def = folds[fold_name]
    fold_vals = set(np.unique(fold_def.values))
    assert fold_vals.issubset({'train', 'val', 'test'})
    return fold_def


def learn_sklearn_model(key, index, features, targets, fold_def, model, n_splits):
    assert index.shape[0] == features.shape[0]
    assert index.shape[0] == targets.shape[0]
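
# A minimal sketch (assumption, not the project's code) of how the 'train'/'val'/
# 'test' labels returned by `select_fold` could be turned into boolean masks that
# slice `features` and `targets` before fitting the model with GridSearchCV and
# GroupKFold, as the imports above suggest.
def split_by_fold(fold_def):
    return {name: fold_def.values == name for name in ('train', 'val', 'test')}

# masks = split_by_fold(fold_def)
# X_train, y_train = features[masks['train']], targets[masks['train']]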
Example #17
0
import os

from src.models.faust_dao import ModelTask, ModelTaskDoEvent
from src.utils.logger import get_logger

logger = get_logger('config')


class Config:
    KAFKA_BROKER_URL = None
    WEB_PORT = None

    def __init__(self):
        """
        Initiates microservice configuration
        :return: configuration object
        """
        self.KAFKA_BROKER_URL = self._set_kafka_url()
        self.WEB_PORT = self._set_web_port()
        self.topics = {
            'model-tasks-do': None,
            'model-tasks-done': None,
            'model-metadata-updates': None
        }
        self.debug_models = {
            'mod-dummy':
            'https://mod-dummy-501-zz-test.22ad.bi-x.openshiftapps.com'
            '/v1/models/mod-dummy:predict',
            'mod-text-class':
            'https://mod-text-class-501-zz-test.22ad.bi-x.'
            'openshiftapps.com/v1/models/mod-text-class:predict'
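
    # The excerpt is truncated above; the following are hypothetical implementations
    # of the two helpers referenced in __init__, illustrating the likely pattern of
    # reading the values from environment variables via the imported `os` module.
    @staticmethod
    def _set_kafka_url():
        return os.environ.get('KAFKA_BROKER_URL', 'kafka://localhost:9092')

    @staticmethod
    def _set_web_port():
        return int(os.environ.get('WEB_PORT', 6066))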