Example #1
class CreateClassifier(object):
    def __init__(self):
        self.pre = Preprocess()
        self.nlp = NLPHelper()
        self.fex = FeatureExtractor()
        self.ut = Utility()
        self.mt = ModelTrainer()

    def createClassifier(self):
        # get golden data
        # data = self.nlp.getGoldenDataset()
        # extract entity and save into pickle
        # self.nlp.extractNews(data)  #CHANGE MODULE WHEN SWITCHING BETWEEN ADDITIONAL IDN AND DEFAULT
        # self.nlp.core_nlp.close()

        # # find feature in one text and save it to excel
        # # scenario 1
        # # path = "scenario1_halfidn_pickle/"
        # # scenario 2
        # # path = "scenario2_fullidn_pickle/"
        # # scenario 3
        # path = "scenario3_stanford_pickle/"
        # path = "test/"
        # filelist = os.listdir(path)
        # data = pd.DataFrame()

        # for idx, file in enumerate(filelist):

        #     # open the pickle file containing the NER, coref, and POS data of a news article
        #     pkl_dict = self.ut.loadPickle(os.path.join(path, file))
        #     # extract features from the pickle file
        #     temp = self.fex.extractFeaturesFromPickle(pkl_dict)
        #     data = data.append(temp)

        # #scenario 1
        # self.ut.convertToExcel("scenario1_idnnerhalf_extracted_feature.xlsx",data,'Sheet1')
        # #scenario 2
        # self.ut.convertToExcel("scenario2_idnnerfull_extracted_feature.xlsx",data,'Sheet1')
        # #scenario 3
        # self.ut.convertToExcel("scenario3_stanford_extracted_feature.xlsx",data,'Sheet1')
        # #scenario testing
        # self.ut.convertToExcel("testing_rf.xlsx",data,'Sheet1')

        # for training use
        # read the Excel file that contains the features (the WHO and WHERE columns must be added first, and it must be decided which entity is WHO and which is WHERE)
        # scenario 1
        # df = pd.read_excel('scenario1_idnnerhalf_extracted_feature.xlsx', sheet_name='Sheet1')
        # scenario 2
        df = pd.read_excel('scenario2_idnnerfull_extracted_feature.xlsx',
                           sheet_name='Sheet1')
        # # scenario 3
        # df = pd.read_excel('scenario3_stanford_extracted_feature.xlsx', sheet_name='Sheet1')

        # train the models for detecting WHO and WHERE; the column named in the second argument is dropped (deleted) from the features
        who = self.mt.train(df, 'where')
        where = self.mt.train(df, 'who')
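        # note: per the comment above, the WHO model is trained with the 'where' column dropped
        # and vice versa, so the seemingly swapped arguments are intentional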
        self.nlp.core_nlp.close()
Example #2
def train_and_eval(embedding, layers, batch_size, layers_type):
    # Device
    device = get_device()

    # Training parameters
    epochs = 5

    # Train and dev data
    train_file = './data/snli_1.0_train.jsonl'
    train_data = Data(train_file, embedding)
    dev_file = './data/snli_1.0_dev.jsonl'
    dev_data = Data(dev_file, embedding)
    test_file = './data/snli_1.0_test.jsonl'
    test_data = Data(test_file, embedding)

    # Create the model
    model = ResidualLSTMEncoder(embedding_vectors=embedding.vectors,
                                padding_index=train_data.padding_index,
                                layers_def=layers,
                                output_size=len(train_data.c2i),
                                max_sentence_length=Data.MAX_SENTENCE_SIZE,
                                hidden_mlp=800,
                                device=device,
                                layers_type=layers_type)

    num_of_params = sum(p.numel() for p in model.parameters())

    print("Number of model parameters: %d" % num_of_params)
    model = model.to(device)

    # Create optimizer
    optimizer = optim.Adam(model.parameters(), lr=2e-4)
    # optimizer = optim.Adagrad(model.parameters())

    # Create a model trainer object
    model_trainer = ModelTrainer(net=model,
                                 device=device,
                                 optimizer=optimizer)

    # Train the model
    model_trainer.train(train_data, dev_data,
                        train_log_file='train_1.txt', dev_log_file='dev_1.txt',
                        epochs=epochs, batch_size=batch_size)

    # Save the model
    model_trainer.save_model('./models/model_1')

    # Test the model
    test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size,
                                              shuffle=False, num_workers=0)

    test_performencer = Performencer(name='Test',
                                     output_size=model.output_size)
    model_trainer.eval(test_loader, test_performencer)
    test_performencer.pinpoint()
    test_performencer.log_to_file('test_1.txt')
Example #3
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data',
                        type=str,
                        help='Data file path',
                        required=True)
    parser.add_argument('--output',
                        type=str,
                        help='Output file path',
                        required=True)
    parser.add_argument('--output_model',
                        type=str,
                        help='Model path',
                        default=None)
    parser.add_argument('--level', type=int, default=0)
    parser.add_argument('--fold', type=int, default=2)
    parser.add_argument('--iter', type=int, default=1)
    parser.add_argument('--batch_size', type=int, default=16)
    parser.add_argument('--epoch', type=int, default=30)
    parser.add_argument('--random_state', type=int, default=None)
    args = parser.parse_args()

    dataset = BrainDataset(args.data, expand_dim=True, level=args.level)
    model = CNN1D(len(np.unique(dataset.label))).to(DEVICE)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    epochs = args.epoch
    batch_size = args.batch_size
    trainer = ModelTrainer(model, dataset, DEVICE)
    result = trainer.train(optimizer,
                           criterion,
                           batch_size=batch_size,
                           epochs=epochs,
                           kfold=args.fold,
                           iteration=args.iter,
                           random_state=args.random_state)

    result = np.array(result)
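    # write the cross-validation results to CSV; optionally persist the trained weights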
    np.savetxt(args.output, result, delimiter=",")
    if args.output_model is not None:
        torch.save(model.state_dict(), args.output_model)
Example #4
def train_small_test_version(self, hyperparams_dict):
    """Perform training on the small test dataset."""
    trainer = ModelTrainer(self.dataloaders, hyperparams_dict,
                           self.wv_wrapper, self.path)
    model, losses, accuracies = trainer.train(epochs=3)
    return model, losses, accuracies
Example #5
    elif opt.model_name == 2:
        # MLP
        train_config['using_spectrogram'] = True
        train_config['criterion'] = 'MSE'

    elif opt.model_name == 3:
        # simple generator
        train_config['using_simple_g'] = True
        train_config['criterion'] = 'MSE'

    elif opt.model_name == 4:
        # 1D auto encoder
        train_config['criterion'] = 'MSE'

    elif opt.model_name == 5:
        # 2D auto encoder
        train_config['criterion'] = 'MSE'
        train_config['using_spectrogram'] = True

    elif opt.model_name == 6:
        # simple auto encoder
        train_config['criterion'] = 'MSE'

    elif opt.model_name == 7:
        # adversarial MLP
        train_config['using_spectrogram'] = True

    trainer = ModelTrainer(**train_config)
    trainer.train()
Example #6
def main():
    logging.warning("dummy warning!!!")
    logging.error("dummy error!!!")
    logging.info("dummy info!!!")
    logging.debug("dummy debug!!!")

    logging.warning(f"Inside {__file__}")

    parser = argparse.ArgumentParser()
    parser.add_argument("--subscription_id",
                        type=str,
                        dest="subscription_id",
                        help="The Azure subscription ID")
    parser.add_argument("--resource_group",
                        type=str,
                        dest="resource_group",
                        help="The resource group name")
    parser.add_argument("--workspace_name",
                        type=str,
                        dest="workspace_name",
                        help="The workspace name")
    parser.add_argument("--experiments_config_filepath",
                        type=str,
                        dest="experiments_config_filepath",
                        help="A path to the JSON config file")  # noqa: E501
    parser.add_argument("--model_name",
                        type=str,
                        dest="model_name",
                        help="Name of the Model")
    parser.add_argument("--should_register_model",
                        type=str2bool,
                        dest="should_register_model",
                        default=False,
                        help="Register trained model")  # noqa: E501
    args = parser.parse_args()

    logging.warning(f"Argument 1: {args.subscription_id}")
    logging.warning(f"Argument 2: {args.resource_group}")
    logging.warning(f"Argument 3: {args.workspace_name}")
    logging.warning(f"Argument 4: {args.experiments_config_filepath}")
    logging.warning(f"Argument 5: {args.model_name}")
    logging.warning(f"Argument 6: {args.should_register_model}")

    # Get current service context
    run = Run.get_context()
    workspace = run.experiment.workspace

    # Load training configuration
    experiment_configuration = ExperimentConfigurationWrapper()
    experiment_configuration.load(args.experiments_config_filepath)
    training_config = experiment_configuration.json["feature_extractor"][
        "training"]

    # initialize empty collections for data
    # train_set = []
    # test_set = []
    # dev_set = []

    download_root_dir = os.path.join('/mnt', 'tmp', 'datasets')
    data_splitter = HDF5TrainTestSplitter()
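    # each configured dataset is downloaded and registered with the splitter
    # before the global train/dev/test split below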
    for data_config in training_config["data"]:
        cropped_cells_dataset_name = data_config['input'][
            'cropped_cells_dataset_name']
        cropped_cells_dataset_version = data_config['input'][
            'cropped_cells_dataset_version']
        cropped_cells_dataset = Dataset.get_by_name(
            workspace=workspace,
            name=cropped_cells_dataset_name,
            version=cropped_cells_dataset_version)

        msg = (
            f"Dataset '{cropped_cells_dataset_name}', id: {cropped_cells_dataset.id}"
            f", version: {cropped_cells_dataset.version} will be used to prepare data for a feature extractor training."
        )
        logging.warning(msg)

        # Create a folder where datasets will be downloaded to
        dataset_target_path = os.path.join(download_root_dir,
                                           cropped_cells_dataset_name)
        os.makedirs(dataset_target_path, exist_ok=True)

        # Download 'cropped cells' dataset (consisting of HDF5 and CSV files)
        dataset_target_path = download_registered_file_dataset(
            workspace, cropped_cells_dataset, download_root_dir)
        list_all_files_in_location(dataset_target_path)

        # Split data (indices) into subsets
        df_metadata = pd.read_csv(
            os.path.join(dataset_target_path, 'cropped_nuclei.csv'))
        logging.warning(f"Metadata dataframe (shape): {df_metadata.shape}")

        logging.warning("Splitting data into subsets...")
        data_splitter.add_dataset(
            name=data_config['input']['cropped_cells_dataset_name'],
            fname=os.path.join(dataset_target_path,
                               'cropped_nuclei_images.h5'),
            metadata=df_metadata)

    data_splitter.train_dev_test_split()

    # --------
    # Training
    # --------

    # Init dataloaders
    #train_dataset = CellDataset(cell_list=train_set, target_cell_shape=INPUT_IMAGE_SIZE)
    train_dataset = CellDataset(splitter=data_splitter, dset_type='train')
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=DATA_LOADER_WORKERS,
    )
    #dev_dataset = CellDataset(cell_list=dev_set, target_cell_shape=INPUT_IMAGE_SIZE)
    dev_dataset = CellDataset(splitter=data_splitter, dset_type='dev')
    dev_data_loader = torch.utils.data.DataLoader(
        dev_dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        num_workers=DATA_LOADER_WORKERS,
    )
    #test_dataset = CellDataset(cell_list=test_set, target_cell_shape=INPUT_IMAGE_SIZE)
    test_dataset = CellDataset(splitter=data_splitter, dset_type='test')
    test_data_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        num_workers=DATA_LOADER_WORKERS,
    )

    # Define and Train model
    device = torch.device(DEVICE)
    model = AUTOENCODER(
        latent_dim_size=LATENT_DIM_SIZE,
        input_image_size=INPUT_IMAGE_SIZE,
        device=device,
    )

    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
    print(f"Using {torch.cuda.device_count()} GPUs for training")
    # model = torch.nn.DataParallel(model, device_ids=[0, 1, 2, 3])
    trainer = ModelTrainer(model, device)
    tr_losses, dev_losses = trainer.train(
        epochs=EPOCHS,
        optimizer=optimizer,
        train_data_loader=train_data_loader,
        dev_data_loader=dev_data_loader,
    )
    test_loss = trainer.test_model(test_data_loader)
    run.log("dev_loss", np.max(dev_losses))
    run.log("train_loss", np.max(tr_losses))
    run.log("test_loss", test_loss)
    # Plot training metrics and model sample reconstructions
    trainer.get_training_plot(tr_losses=tr_losses, dev_losses=dev_losses)
    run.log_image("model training metrics", plot=plt)

    dataiter = iter(test_data_loader)
    images = next(dataiter)
    trainer.get_pred_samples(images, figsize=(40, 40))
    run.log_image("sample reconstructions", plot=plt)

    # Training completed!  Let's save the model and upload it to AML
    os.makedirs("./models", exist_ok=True)
    model_file_name = "model.ext"
    model_output_loc = os.path.join(".", "models", model_file_name)
    torch.save(model, model_output_loc)

    run.upload_files(names=[model_output_loc], paths=[model_output_loc])

    # Register model (ideally, this should be a separate step)
    if args.should_register_model:
        logging.warning("List of the associated stored files:")
        logging.warning(run.get_file_names())

        logging.warning("Registering a new model...")
        # TODO: prepare a list of metrics that were logged using run.log()
        metric_names = []

        if os.path.exists(model_output_loc):
            register_model(
                run=run,
                model_name=args.model_name,
                model_description="Feature extraction model",
                model_path=model_output_loc,
                training_context="PythonScript",
                metric_names=metric_names,
            )
        else:
            logging.warning(
                f"Cannot register model as path {model_output_loc} does not exist."
            )
    else:
        logging.warning("A trained model will not be registered.")

    logging.warning("Done!")
    logging.info("Done Info Style!")
Example #7
def main(args):

    total_step = 100//args.EF

    # set random seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    random.seed(args.seed)

    # prepare checkpoints and log folders
    if not os.path.exists(args.checkpoints_dir):
        os.makedirs(args.checkpoints_dir)
    if not os.path.exists(args.logs_dir):
        os.makedirs(args.logs_dir)

    # initialize dataset
    if args.dataset == 'visda':
        args.data_dir = os.path.join(args.data_dir, 'visda')
        data = Visda_Dataset(root=args.data_dir, partition='train', label_flag=None)

    elif args.dataset == 'office':
        args.data_dir = os.path.join(args.data_dir, 'Office')
        data = Office_Dataset(root=args.data_dir, partition='train', label_flag=None, source=args.source_name,
                              target=args.target_name)

    elif args.dataset == 'home':
        args.data_dir = os.path.join(args.data_dir, 'OfficeHome')
        data = Home_Dataset(root=args.data_dir, partition='train', label_flag=None, source=args.source_name,
                            target=args.target_name)
    elif args.dataset == 'visda18':
        args.data_dir = os.path.join(args.data_dir, 'visda18')
        data = Visda18_Dataset(root=args.data_dir, partition='train', label_flag=None)
    else:
        print('Unknown dataset!')

    args.class_name = data.class_name
    args.num_class = data.num_class
    args.alpha = data.alpha
    # setting experiment name
    label_flag = None
    selected_idx = None
    args.experiment = set_exp_name(args)
    logger = Logger(args)

    if not args.visualization:

        for step in range(total_step):

            print("This is {}-th step with EF={}%".format(step, args.EF))

            trainer = ModelTrainer(args=args, data=data, step=step, label_flag=label_flag, v=selected_idx, logger=logger)

            # train the model
            args.log_epoch = 4 + step//2
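            # the number of training epochs grows with the self-training step (4, 6, 8, ...)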
            trainer.train(step, epochs=4 + step * 2, step_size=args.log_epoch)

            # pseudo_label
            pred_y, pred_score, pred_acc = trainer.estimate_label()

            # select data from target to source
            selected_idx = trainer.select_top_data(pred_score)

            # add new data
            label_flag, data = trainer.generate_new_train_data(selected_idx, pred_y, pred_acc)
    else:
        # load trained weights
        trainer = ModelTrainer(args=args, data=data)
        trainer.load_model_weight(args.checkpoint_path)
        vgg_feat, node_feat, target_labels, split = trainer.extract_feature()
        visualize_TSNE(node_feat, target_labels, args.num_class, args, split)

        plt.savefig('./node_tsne.png', dpi=300)
Example #8
def main(args):
    # Modified here
    total_step = 100 // args.EF

    # set random seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    random.seed(args.seed)

    # prepare checkpoints and log folders
    if not os.path.exists(args.checkpoints_dir):
        os.makedirs(args.checkpoints_dir)
    if not os.path.exists(args.logs_dir):
        os.makedirs(args.logs_dir)

    # initialize dataset
    if args.dataset == 'nusimg':
        args.data_dir = os.path.join(args.data_dir, 'visda')
        data = NUSIMG_Dataset(root=args.data_dir,
                              partition='train',
                              label_flag=None,
                              source=args.source_path,
                              target=args.target_path)

    elif args.dataset == 'office':
        args.data_dir = os.path.join(args.data_dir, 'Office')
        data = Office_Dataset(root=args.data_dir,
                              partition='train',
                              label_flag=None,
                              source=args.source_path,
                              target=args.target_path)
    elif args.dataset == 'mrc':
        data = MRC_Dataset(root=args.data_dir,
                           partition='train',
                           label_flag=None,
                           source=args.source_path,
                           target=args.target_path)
    else:
        print('Unknown dataset!')

    args.class_name = data.class_name
    args.num_class = data.num_class
    args.alpha = data.alpha
    # setting experiment name
    label_flag = None
    selected_idx = None
    args.experiment = set_exp_name(args)

    logger = Logger(args)
    trainer = ModelTrainer(args=args,
                           data=data,
                           label_flag=label_flag,
                           v=selected_idx,
                           logger=logger)
    for step in range(total_step):

        print("This is {}-th step with EF={}%".format(step, args.EF))
        # train the model
        args.log_epoch = 5
        trainer.train(epochs=24, step=step)  #24
        # pseudo-label
        pred_y, pred_score, pred_acc = trainer.estimate_label()

        # select data from target to source
        selected_idx = trainer.select_top_data(pred_score)

        # add new data
        trainer.generate_new_train_data(selected_idx, pred_y, pred_acc)
Example #9
def train():
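    # parsed_args is assumed to be an argparse.Namespace created at module level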
    model_trainer = ModelTrainer(parsed_args)
    model_trainer.train()
Example #10
    if torch.cuda.is_available():
        device = torch.device(args.device)
    else:
        device = torch.device('cpu')

    optimizer = torch.optim.Adam(network.parameters(),
                                 lr=args.learning_rate,
                                 weight_decay=1e-2)

    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=lr_schedule, gamma=lr_gamma)
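    # step decay: the learning rate is multiplied by lr_gamma every lr_schedule epochs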

    # ## Debug
    # torch.autograd.set_detect_anomaly(True)

    trainer = ModelTrainer(model=network,
                           train_loader=trainloader,
                           test_loader=testloader,
                           optimizer=optimizer,
                           scheduler=scheduler,
                           criterion=loss_fn,
                           epochs=args.epochs,
                           name=args.name,
                           test_freq=args.val_freq,
                           device=args.device)

    # train
    best_model = trainer.train()

    # #Test best model
    # trainer.test_best_model(best_model, fname_suffix='_posttraining')