Example #1
    def test_cpu(self):
        """Run full training for MNIST CPU training."""
        workdir = self.get_tmp_model_dir()
        config = default.get_config()

        start_time = time.time()
        train.train_and_evaluate(config=config, workdir=workdir)
        benchmark_time = time.time() - start_time

        summaries = self.read_summaries(workdir)

        # Summaries contain all the information necessary for the regression
        # metrics.
        wall_time, _, eval_accuracy = zip(*summaries['eval_accuracy'])
        wall_time = np.array(wall_time)
        sec_per_epoch = np.mean(wall_time[1:] - wall_time[:-1])
        end_eval_accuracy = eval_accuracy[-1]

        # Assertions are deferred until the test finishes, so the metrics are
        # always reported and benchmark success is determined based on *all*
        # assertions.
        self.assertBetween(end_eval_accuracy, 0.98, 1.0)

        # Use the reporting API to report single or multiple metrics/extras.
        self.report_wall_time(benchmark_time)
        self.report_metrics({
            'sec_per_epoch': sec_per_epoch,
            'accuracy': end_eval_accuracy,
        })
        self.report_extras({
            'model_name': 'MNIST',
            'description': 'CPU test for MNIST.',
            'implementation': 'linen',
        })
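The read_summaries helper above comes from the benchmark base class and is not shown in this example. A minimal sketch of what such a helper could look like, assuming scalar summaries written as TensorBoard event files under the work directory (the real implementation may differ):

import collections
import glob
import os

import tensorflow as tf


def read_summaries(logdir):
    """Returns {tag: [(wall_time, step, value), ...]} from TF event files."""
    summaries = collections.defaultdict(list)
    pattern = os.path.join(logdir, '**', 'events.out.tfevents.*')
    for path in sorted(glob.glob(pattern, recursive=True)):
        for event in tf.compat.v1.train.summary_iterator(path):
            for value in event.summary.value:
                if value.HasField('simple_value'):
                    scalar = value.simple_value
                else:
                    # TF2 summary writers store scalars as rank-0 tensors.
                    scalar = float(tf.make_ndarray(value.tensor))
                summaries[value.tag].append(
                    (event.wall_time, event.step, scalar))
    return summaries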
Example #2
    def test_train_and_evaluate(self):
        """Tests training and evaluation loop using mocked data."""
        # Create a temporary directory where tensorboard metrics are written.
        model_dir = tempfile.mkdtemp()

        # Go two directories up to the root of the flax directory.
        flax_root_dir = pathlib.Path(__file__).parents[2]
        data_dir = str(flax_root_dir) + '/.tfds/metadata'

        with tfds.testing.mock_data(num_examples=1, data_dir=data_dir):
            lm1b_train.train_and_evaluate(random_seed=0,
                                          batch_size=1,
                                          learning_rate=0.05,
                                          num_train_steps=1,
                                          num_eval_steps=1,
                                          eval_freq=1,
                                          max_target_length=10,
                                          max_eval_target_length=32,
                                          weight_decay=1e-1,
                                          data_dir=None,
                                          model_dir=model_dir,
                                          restore_checkpoints=False,
                                          save_checkpoints=False,
                                          checkpoint_freq=2,
                                          max_predict_token_length=2,
                                          sampling_temperature=0.6,
                                          sampling_top_k=4,
                                          prompt_str='unittest ')
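tfds.testing.mock_data, used throughout these tests, patches TFDS so that no real data is read: inside the context, tfds.load yields randomly generated examples that match the dataset metadata. A minimal, self-contained sketch of the pattern, using MNIST as a stand-in dataset:

import tensorflow_datasets as tfds

with tfds.testing.mock_data(num_examples=2):
    ds = tfds.load('mnist', split='train')
    for example in ds.take(2):
        # Random image/label pairs with the real dataset's shapes and dtypes.
        print(example['image'].shape, example['label'])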
Example #3
    def _test_8x_v100_half_precision(self, num_epochs: int, min_accuracy,
                                     max_accuracy):
        """Utility to benchmark ImageNet on 8xV100 GPUs. Use in your test func."""
        # Make sure tf does not allocate gpu memory.
        tf.config.experimental.set_visible_devices([], 'GPU')

        workdir = self.get_tmp_model_dir()
        config = config_lib.get_config()
        config.num_epochs = num_epochs
        start_time = time.time()
        train.train_and_evaluate(config=config, workdir=workdir)
        benchmark_time = time.time() - start_time
        summaries = self.read_summaries(workdir)

        # Summaries contain all the information necessary for the regression
        # metrics.
        wall_time, _, eval_accuracy = zip(*summaries['eval_accuracy'])
        wall_time = np.array(wall_time)
        sec_per_epoch = np.mean(wall_time[1:] - wall_time[:-1])
        end_accuracy = eval_accuracy[-1]

        # Assertions are deferred until the test finishes, so the metrics are
        # always reported and benchmark success is determined based on *all*
        # assertions.
        self.assertBetween(end_accuracy, min_accuracy, max_accuracy)

        # Use the reporting API to report single or multiple metrics/extras.
        self.report_wall_time(benchmark_time)
        self.report_metrics({
            'sec_per_epoch': sec_per_epoch,
            'accuracy': end_accuracy
        })
Example #4
    def test_train_and_evaluate(self):
        config = default.get_config()
        config.max_corpus_chars = 1000
        config.vocab_size = 32
        config.batch_size = 8
        config.num_train_steps = 1
        config.num_eval_steps = 1
        config.num_predict_steps = 1

        config.num_layers = 1
        config.qkv_dim = 128
        config.emb_dim = 128
        config.mlp_dim = 512
        config.num_heads = 2

        config.max_target_length = 32
        config.max_eval_target_length = 32
        config.max_predict_length = 32

        workdir = tempfile.mkdtemp()

        # Go two directories up to the root of the flax directory.
        flax_root_dir = pathlib.Path(__file__).parents[2]
        data_dir = str(flax_root_dir) + '/.tfds/metadata'

        with tfds.testing.mock_data(num_examples=128, data_dir=data_dir):
            train.train_and_evaluate(config, workdir)
        logging.info('workdir content: %s', tf.io.gfile.listdir(workdir))
Example #5
File: main.py Project: voicedm/flax
def main(argv):
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    # Hide any GPUs from TensorFlow. Otherwise TF might reserve memory and make
    # it unavailable to JAX.
    tf.config.experimental.set_visible_devices([], 'GPU')

    logging.info('JAX process: %d / %d', jax.process_index(),
                 jax.process_count())
    logging.info('JAX local devices: %r', jax.local_devices())

    # Add a note so that we can tell which task is which JAX host.
    # (Depending on the platform, task 0 is not guaranteed to be host 0.)
    platform.work_unit().set_task_status(
        f'process_index: {jax.process_index()}, '
        f'process_count: {jax.process_count()}')
    platform.work_unit().create_artifact(platform.ArtifactType.DIRECTORY,
                                         FLAGS.workdir, 'workdir')

    if FLAGS.sample:
        sample.save_images(sample.generate_sample(FLAGS.config, FLAGS.workdir),
                           'sample.png')
    else:
        train.train_and_evaluate(FLAGS.config, FLAGS.workdir)
Example #6
def main(argv):
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    # Make sure tf does not allocate gpu memory.
    tf.config.experimental.set_visible_devices([], 'GPU')
    # Require JAX omnistaging mode. (Omnistaging became the default in JAX
    # 0.2.0, so this call is only needed on older releases.)
    jax.config.enable_omnistaging()

    train.train_and_evaluate(workdir=FLAGS.workdir, config=FLAGS.config)
Example #7
    def test_train_and_evaluate(self):
        config = get_test_config()
        workdir = tempfile.mkdtemp()

        # Go two directories up to the root of the flax directory.
        flax_root_dir = pathlib.Path(__file__).parents[2]
        data_dir = str(flax_root_dir) + '/.tfds/metadata'

        with tfds.testing.mock_data(num_examples=8, data_dir=data_dir):
            train.train_and_evaluate(config, workdir)
        logging.info('workdir content: %s', tf.io.gfile.listdir(workdir))
Example #8
  def test_train_and_evaluate(self):
    """Tests training and evaluation code by running a single step."""
    # Create a temporary directory where tensorboard metrics are written.
    workdir = tempfile.mkdtemp()

    # Go two directories up to the root of the flax directory.
    flax_root_dir = pathlib.Path(__file__).parents[2]
    data_dir = str(flax_root_dir) + "/.tfds/metadata"

    # Define training configuration.
    config = default.get_config()
    config.num_epochs = 1
    config.batch_size = 8

    with tfds.testing.mock_data(num_examples=8, data_dir=data_dir):
      train.train_and_evaluate(config=config, workdir=workdir)
Example #9
def main():
    f = open("outdir.txt", 'r')
    outdir = f.read().rstrip('\n')
    f = open("experiment_folder.txt", 'r')
    experiment_folder = f.read().rstrip('\n')

    ##pass the size of the vocabulary to the model
    with open(os.path.join(outdir, mt_to_ix_file)) as f:
        rd = csv.reader(f)
        vocab_size = 0
        for r in rd:
            vocab_size += 1

    #set random seed for reproducible experiments
    torch.manual_seed(12)
    torch.cuda.manual_seed(12)

    ##Import data
    data = myData(outdir, ehr_file)
    data_generator = DataLoader(data,
                                model_pars['batch_size'],
                                shuffle=True,
                                collate_fn=my_collate,
                                drop_last=True)
    #define model and optimizer
    print("cohort numerosity:{0}".format(len(data)))
    model = net.LSTMehrEncoding(vocab_size, model_pars['embedding_dim'],
                                model_pars['batch_size'])
    #model = nn.DataParallel(model, device_ids=[1,2,3])
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=model_pars['learning_rate'],
                                 weight_decay=1e-5)

    #start the unsupervised training and evaluation
    model.cuda()
    loss_fn = net.criterion
    print("Starting training for {} epochs...".format(
        model_pars['num_epochs']))
    mrn, encoded, metrics_avg = train_and_evaluate(model, data_generator,
                                                   loss_fn, optimizer, metrics,
                                                   experiment_folder)
    svd = TruncatedSVD(n_components=100)
    encoded = svd.fit_transform(encoded)

    with open(experiment_folder + '/LSTMencoded_vect.csv', 'w') as f:
        wr = csv.writer(f, delimiter=',')
        for e in encoded:
            wr.writerow(e)

    with open(experiment_folder + '/LSTMmrns.csv', 'w') as f:
        wr = csv.writer(f, delimiter=',')
        for m in mrn:
            wr.writerow([m])

    with open(experiment_folder + '/LSTMmetrics.txt', 'w') as f:
        wr = csv.writer(f, delimiter='\t')
        wr.writerow(["Mean loss:", metrics_avg['loss']])
        wr.writerow(["Accuracy:", metrics_avg['accuracy']])
Example #10
def setup_and_train(params):
    model = Net(params).cuda() if params.cuda else Net(params)

    image_size = model.image_size()
    train_transform = transforms.Compose([
        transforms.RandomResizedCrop(image_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(), normalize
    ])

    valid_transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(image_size),
        transforms.ToTensor(), normalize
    ])

    loss_fn = FocalLoss()

    # Observe that all parameters are being optimized
    # optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    optimizer = optim.SGD([{
        'params': model.base_parameters
    }, {
        'params': model.last_parameters,
        'lr': 1e-2
    }],
                          lr=1e-3,
                          momentum=0.9)
    # optimizer = optim.Adam(model.parameters(), lr=params.learning_rate)

    # Decay LR by a factor of 0.1 every 7 epochs
    exp_lr_scheduler = lr_scheduler.StepLR(optimizer,
                                           step_size=params.step_size,
                                           gamma=params.gama)

    dataloaders = get_dateloaders(params,
                                  train_transform=train_transform,
                                  valid_transform=valid_transform)

    train_and_evaluate(model=model,
                       dataloaders=dataloaders,
                       optimizer=optimizer,
                       loss_fn=loss_fn,
                       scheduler=exp_lr_scheduler,
                       params=params)
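The snippet above uses a normalize transform that is defined elsewhere in the project. A plausible definition, assuming the conventional torchvision ImageNet statistics (the original project may use different values):

from torchvision import transforms

# Standard ImageNet channel means and standard deviations.
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])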
Example #11
  def test_train_and_evaluate(self):
    """Tests training and evaluation loop using mocked data."""
    # Create a temporary directory where tensorboard metrics are written.
    workdir = tempfile.mkdtemp()

    # Go two directories up to the root of the flax directory.
    flax_root_dir = pathlib.Path(__file__).parents[2]
    data_dir = str(flax_root_dir) + '/.tfds/metadata'

    # Define training configuration
    config = default_lib.get_config()
    config.batch_size = 1
    config.num_epochs = 1
    config.num_train_steps = 1
    config.steps_per_eval = 1

    with tfds.testing.mock_data(num_examples=1, data_dir=data_dir):
      train.train_and_evaluate(workdir=workdir, config=config)
Example #12
  def test_fake_data(self):
    workdir = self.get_tmp_model_dir()
    config = config_lib.get_config()
    # Go two directories up to the root of the flax directory.
    flax_root_dir = pathlib.Path(__file__).parents[2]
    data_dir = str(flax_root_dir) + '/.tfds/metadata'

    start_time = time.time()
    with tfds.testing.mock_data(num_examples=1024, data_dir=data_dir):
      train.train_and_evaluate(config, workdir)
    benchmark_time = time.time() - start_time

    self.report_wall_time(benchmark_time)
    self.report_extras({
        'description': 'ImageNet ResNet50 with fake data',
        'model_name': 'resnet50',
        'parameters': f'hp=true,bs={FLAGS.config.batch_size}',
    })
Example #13
def main():
    f = open("outdir.txt", 'r')
    outdir = f.read().rstrip('\n')
    #create an experiment folder tied to date and time where to save output from the model 
    experiment_folder = os.path.expanduser('~/data1/stratification_ILRM/experiments/') + disease_folder +\
                    '-'.join(map(str, list(datetime.now().timetuple()[:6])))
    os.makedirs(experiment_folder)
    f = open("experiment_folder.txt", 'w') ##path to the experiment folder is saved in a txt file
    f.write(experiment_folder)
    f.close()
    
    ##pass the size of the vocabulary to the model
    with open(os.path.join(outdir, mt_to_ix_file)) as f:
        rd = csv.reader(f)
        next(rd)
        vocab_size = 1
        for r in rd:
            vocab_size += 1

    #set random seed for reproducible experiments
    torch.manual_seed(123)
    torch.cuda.manual_seed(123)

    ##Import data
    data = myData(outdir, ehr_file)
    data_generator = DataLoader(data, model_pars['batch_size'], shuffle=True, collate_fn=my_collate)
    #define model and optimizer
    print("cohort numerosity:{0} -- max_seq_length:{1}".format(len(data), L))
    model = net.ehrEncoding(vocab_size, L, model_pars['embedding_dim'], model_pars['kernel_size'])
    optimizer = torch.optim.Adam(model.parameters(), lr=model_pars['learning_rate'], weight_decay=1e-5)

    #start the unsupervised training and evaluation
    model.cuda()
    loss_fn = net.criterion
    print("Starting training for {} epochs...".format(model_pars['num_epochs']))
    mrn, encoded, metrics_avg = train_and_evaluate(model, data_generator, loss_fn, optimizer, metrics, experiment_folder)
    
    ##save encoded vectors, medical record number list (to keep track of the order) and metric (loss and accuracy)
    with open(experiment_folder + '/encoded_vect.csv', 'w') as f:
        wr = csv.writer(f, delimiter=',')
        for e in encoded:
            wr.writerow(e)

    with open(experiment_folder + '/mrns.csv', 'w') as f:
        wr = csv.writer(f, delimiter=',')
        for m in mrn:
            wr.writerow([m])

    with open(experiment_folder + '/metrics.txt', 'w') as f:
        wr = csv.writer(f, delimiter='\t')
        #for m, v in metrics_average.items():
        #    wr.writerow([m, v])
        wr.writerow(["Mean loss:", metrics_avg['loss']])
        wr.writerow(["Accuracy:", metrics_avg['accuracy']])
Example #14
def main(argv):
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    FLAGS.log_dir = FLAGS.workdir
    FLAGS.stderrthreshold = 'info'
    logging.get_absl_handler().start_logging_to_file()

    # Hide any GPUs from TensorFlow. Otherwise TF might reserve memory and make
    # it unavailable to JAX.
    tf.config.experimental.set_visible_devices([], 'GPU')

    logging.info('JAX host: %d / %d', jax.host_id(), jax.host_count())
    logging.info('JAX local devices: %r', jax.local_devices())

    # Add a note so that we can tell which task is which JAX host.
    # (Depending on the platform, task 0 is not guaranteed to be host 0.)
    platform.work_unit().set_task_status(
        f'host_id: {jax.host_id()}, host_count: {jax.host_count()}')
    platform.work_unit().create_artifact(platform.ArtifactType.DIRECTORY,
                                         FLAGS.workdir, 'workdir')

    train.train_and_evaluate(FLAGS.config, FLAGS.workdir)
Example #15
def main(argv):
    if len(argv) > 1:
        raise app.UsageError("Too many command-line arguments.")

    # Hide any GPUs from TensorFlow. Otherwise TF might reserve memory and make
    # it unavailable to JAX.
    tf.config.experimental.set_visible_devices([], "GPU")

    if FLAGS.jax_backend_target:
        logging.info("Using JAX backend target %s", FLAGS.jax_backend_target)
        jax.config.update("jax_xla_backend", "tpu_driver")
        jax.config.update("jax_backend_target", FLAGS.jax_backend_target)

    logging.info("JAX host: %d / %d", jax.host_id(), jax.host_count())
    logging.info("JAX local devices: %r", jax.local_devices())

    # Add a note so that we can tell which task is which JAX host.
    # (Depending on the platform, task 0 is not guaranteed to be host 0.)
    platform.work_unit().set_task_status(
        f"host_id: {jax.host_id()}, host_count: {jax.host_count()}")
    platform.work_unit().create_artifact(platform.ArtifactType.DIRECTORY,
                                         FLAGS.workdir, "workdir")

    train.train_and_evaluate(FLAGS.config, FLAGS.workdir)
Example #16
    def test_train_and_evaluate(self):
        """Tests training and evaluation loop using TFDS mocked data."""
        # Create a temporary directory where tensorboard metrics are written.
        model_dir = tempfile.mkdtemp()

        # Go two directories up to the root of the flax directory.
        flax_root_dir = pathlib.Path(__file__).parents[2]
        data_dir = str(flax_root_dir) + '/.tfds/metadata'

        with tfds.testing.mock_data(num_examples=8, data_dir=data_dir):
            sst2_train.train_and_evaluate(seed=0,
                                          model_dir=model_dir,
                                          num_epochs=1,
                                          batch_size=8,
                                          embedding_size=256,
                                          hidden_size=256,
                                          min_freq=5,
                                          max_seq_len=55,
                                          dropout=0.5,
                                          emb_dropout=0.5,
                                          word_dropout_rate=0.1,
                                          learning_rate=0.0005,
                                          checkpoints_to_keep=0,
                                          l2_reg=1e-6)
Example #17
def one_search_experiment(dataset,
                          error_type,
                          train_file,
                          model,
                          seed,
                          n_jobs=1,
                          hyperparams=None,
                          skip_test_files=()):
    """Runs one experiment on the dataset for a given error type, train file,
    model, and random-search seed.

    Args:
        dataset (dict): dataset dict in config.py
        error_type (string): error type
        train_file (string): filename of training set (dirty or clean)
        model (dict): ml model dict in model.py
        seed (int): seed for this experiment
        n_jobs (int): number of parallel jobs, passed to train_and_evaluate
        hyperparams (dict): optional hyperparameters, passed to train_and_evaluate
        skip_test_files (iterable): test files to exclude from evaluation
    """
    np.random.seed(seed)
    # generate random seeds for down sample and training
    down_sample_seed, train_seed = np.random.randint(1000, size=2)

    # load and preprocess data
    X_train, y_train, X_test_list, y_test_list, test_files = \
        preprocess(dataset, error_type, train_file, normalize=True, down_sample_seed=down_sample_seed)

    test_files = list(set(test_files).difference(set(skip_test_files)))

    # train and evaluate
    result = train_and_evaluate(X_train,
                                y_train,
                                X_test_list,
                                y_test_list,
                                test_files,
                                model,
                                n_jobs=n_jobs,
                                seed=train_seed,
                                hyperparams=hyperparams)
    return result
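A hypothetical driver for one_search_experiment, shown only to illustrate the call shape; the dataset and model dicts, the error type, and the file name are placeholders, not values from the original project:

# Run the experiment for several random-search seeds (placeholder arguments).
results = [
    one_search_experiment(dataset, error_type='missing_values',
                          train_file='dirty_train.csv', model=model,
                          seed=s, n_jobs=4)
    for s in range(5)
]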
Example #18
def main():

    ##pass the size of the vocabulary to the model
    with open(os.path.join(data_folder, mt_to_ix_file)) as f:
        rd = csv.reader(f)
        vocab_size = 0
        for r in rd:
            vocab_size += 1

    #set random seed for reproducible experiments
    torch.manual_seed(123)
    torch.cuda.manual_seed(123)

    ##Import data
    data = myData(data_folder, ehr_file)
    data_generator = DataLoader(data,
                                model_pars['batch_size'],
                                shuffle=True,
                                collate_fn=my_collate)
    #define model and optimizer
    print("cohort numerosity:{0} -- max_seq_length:{1}".format(len(data), L))
    model = net.ehrEncoding(vocab_size, L, model_pars['embedding_dim'],
                            model_pars['kernel_size'])
    #model = nn.DataParallel(model, device_ids=[1,2,3])
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=model_pars['learning_rate'],
                                 weight_decay=1e-5)

    #start the unsupervised training and evaluation
    model.cuda()
    loss_fn = net.criterion
    print("Starting training for {} epochs...".format(
        model_pars['num_epochs']))
    mrn, encoded, metrics_avg = train_and_evaluate(model, data_generator,
                                                   loss_fn, optimizer,
                                                   experiment_folder, metrics)

    #with open(experiment_folder + '/TRencoded_vect.csv', 'w') as f:
    #    wr = csv.writer(f, delimiter=',')
    #    for e in encoded_tr:
    #        wr.writerow(e)

    #with open(experiment_folder + '/TRmrns.csv', 'w') as f:
    #    wr = csv.writer(f, delimiter=',')
    #    for m in mrn_tr:
    #        wr.writerow([m])

    #with open(experiment_folder + '/TRmetrics.txt', 'w') as f:
    #    wr = csv.writer(f, delimiter = '\t')
    #for m, v in metrics_average.items():
    #    wr.writerow([m, v])
    #    wr.writerow(["Mean loss:", loss_tr])

    ##load and evaluate best model
    #print("Evaluating best model...")
    #best_saved = torch.load(experiment_folder + '/best_model.pt')
    #model.load_state_dict(best_saved['state_dict'])
    #mrn, encoded, metrics_avg = evaluate(model, loss_fn, data_generator, metrics, best_eval=True)

    with open(experiment_folder + '/encoded_vect.csv', 'w') as f:
        wr = csv.writer(f, delimiter=',')
        for e in encoded:
            wr.writerow(e)

    with open(experiment_folder + '/mrns.csv', 'w') as f:
        wr = csv.writer(f, delimiter=',')
        for m in mrn:
            wr.writerow([m])

    with open(experiment_folder + '/metrics.txt', 'w') as f:
        wr = csv.writer(f, delimiter='\t')
        #for m, v in metrics_average.items():
        #    wr.writerow([m, v])
        wr.writerow(["Mean loss:", metrics_avg['loss']])
        wr.writerow(["Accuracy:", metrics_avg['accuracy']])
Example #19
def learn_patient_representations(
        indir,
        test_set=False,
        sampling=None,
        emb_filename=None
):
    # encodings folder to save the representations
    exp_dir = os.path.join(indir, 'encodings')
    if test_set:
        exp_dir = os.path.join(indir, 'encodings', 'test')

    os.makedirs(exp_dir, exist_ok=True)

    # get the vocabulary size
    vocab_size, vocab = vocabulary.get_vocab(indir)

    # load pre-computed embeddings
    if emb_filename is not None:
        model = Word2Vec.load(emb_filename)
        embs = model.wv
        del model
        print('Loaded pre-computed embeddings for {0} concepts'.format(
            len(embs.vocab)))
    else:
        embs = None

    # set random seed for experiment reproducibility
    torch.manual_seed(123)
    torch.cuda.manual_seed(123)

    # load data
    data_tr = EHRdata(os.path.join(indir), ut.dt_files['ehr-file'], sampling)
    data_generator_tr = DataLoader(
        data_tr,
        ut.model_param['batch_size'],
        shuffle=True,
        collate_fn=ehr_collate
    )

    if test_set:
        data_ts = EHRdata(os.path.join(indir), ut.dt_files['ehr-file-test'], sampling)

        data_generator_ts = DataLoader(
            data_ts,
            ut.model_param['batch_size'],
            shuffle=True,
            collate_fn=ehr_collate
        )
        print("Test cohort size: {0}".format(len(data_ts)))
    else:
        data_generator_ts = data_generator_tr

    print('Training cohort size: {0}\n'.format(len(data_tr)))
    print('Max Sequence Length: {0}\n'.format(ut.len_padded))
    print('Learning rate: {0}'.format(ut.model_param['learning_rate']))
    print('Batch size: {0}'.format(ut.model_param['batch_size']))
    print('Kernel size: {0}\n'.format(ut.model_param['kernel_size']))

    # define model and optimizer
    model = net.ehrEncoding(
        vocab_size=vocab_size,
        max_seq_len=ut.len_padded,  # 32
        emb_size=ut.model_param['embedding_size'],  # 100
        kernel_size=ut.model_param['kernel_size'],  # 5
        pre_embs=embs,
        vocab=vocab
    )

    optimizer = torch.optim.Adam(
        model.parameters(),
        lr=ut.model_param['learning_rate'],
        weight_decay=ut.model_param['weight_decay']
    )

    # model.cuda()
    if torch.cuda.device_count() > 1:
        print('No. of GPUs: {0}\n'.format(torch.cuda.device_count()))
        model = nn.DataParallel(model)
    else:
        model.cuda()
        print('No. of GPUs: 1\n')

    loss_fn = net.criterion
    print('Training for {} epochs\n'.format(ut.model_param['num_epochs']))

    #only train
    train_and_evaluate(
        model,
        data_generator_tr,
        data_generator_ts,
        loss_fn,
        optimizer,
        net.metrics,
        exp_dir
    )

    # Uncomment the block below to both train AND evaluate; this will take a
    # really, really long time. Results of the best model are saved to
    # outdir/best_model.pt inside train_and_evaluate.
    '''
    mrn, encoded, encoded_avg, metrics_avg = train_and_evaluate(
        model,
        data_generator_tr,
        data_generator_ts,
        loss_fn,
        optimizer,
        net.metrics,
        exp_dir
    )

    # save encodings
    # encoded vectors (representations)
    outfile = os.path.join(exp_dir, 'convae_avg_vect.csv')
    with open(outfile, 'w') as f:
        wr = csv.writer(f)
        wr.writerow(["MRN", "ENCODED-AVG"])
        for m, e in zip(mrn, encoded_avg):
            wr.writerow([m] + list(e))

    outfile = os.path.join(exp_dir, 'convae_vect.csv')
    with open(outfile, 'w') as f:
        wr = csv.writer(f)
        wr.writerow(["MRN", "ENCODED-SUBSEQ"])
        for m, evs in zip(mrn, encoded):
            for e in evs:
                wr.writerow([m] + e)

    # metrics (loss and accuracy)
    outfile = os.path.join(exp_dir, 'metrics.txt')
    with open(outfile, 'w') as f:
        f.write('Mean loss: %.3f\n' % metrics_avg['loss'])
        f.write('Accuracy: %.3f\n' % metrics_avg['accuracy'])

    # chop patient sequences into fixed subsequences of length L
    # L = ut.len_padded = 32
    # I think that this is here for the human-readable version of how the patient records are subset
    outfile = os.path.join(exp_dir, 'cohort_ehr_subseq{0}.csv'.format(ut.len_padded))
    write_ehr_subseq(data_generator_tr, outfile)

    if test_set:
        outfile = os.path.join(exp_dir, 'test_cohort_ehr_subseq{0}.csv'.format(ut.len_padded))
        write_ehr_subseq(data_generator_ts, outfile)
    '''
    return
Example #20
            test_set = TestDataset(test_x_input, test_v_input, test_label)
            val_set = TestDataset(val_x_input, val_v_input, val_label)

            # sampler
            train_sampler = WeightedSampler(
                train_v_input
            )  # Use weighted sampler instead of random sampler

            # loader
            train_loader = DataLoader(train_set,
                                      batch_size=params.batch_size,
                                      sampler=train_sampler,
                                      num_workers=4)
            test_loader = DataLoader(test_set,
                                     batch_size=params.predict_batch,
                                     sampler=RandomSampler(test_set),
                                     num_workers=4)
            val_loader = DataLoader(val_set,
                                    batch_size=params.predict_batch,
                                    sampler=RandomSampler(val_set),
                                    num_workers=4)

            optimizer = optim.Adam(model.parameters(), lr=params.learning_rate)
            loss_fn = net.loss_fn

            restore_file = None
            train_and_evaluate(model, train_loader, test_loader, val_loader,
                               optimizer, loss_fn, params, restore_file)
            # break
        # break
Example #21
def learn_patient_representations(indir,
                                  outdir,
                                  disease_dt,
                                  eval_baseline=False,
                                  sampling=None,
                                  emb_filename=None):

    # experiment folder with date and time to save the representations
    exp_dir = os.path.join(
        outdir, '-'.join([
            disease_dt,
            datetime.now().strftime('%Y-%m-%d-%H-%M-%S'), 'w2v-nobn-softplus'
        ]))
    os.makedirs(exp_dir)

    # get the vocabulary size
    fvocab = os.path.join(indir, ut.dt_files['vocab'])
    with open(fvocab) as f:
        rd = csv.reader(f)
        next(rd)
        vocab = {}
        for r in rd:
            tkn = r[0].split('::')
            tkn[1] = tkn[1].capitalize()
            vocab[int(r[1])] = '::'.join(tkn)
        vocab_size = len(vocab) + 1
    print('Vocabulary size: {0}'.format(vocab_size))

    # load pre-computed embeddings
    if emb_filename is not None:
        model = Word2Vec.load(emb_filename)
        embs = model.wv
        del model
        print('Loaded pre-computed embeddings for {0} concepts'.format(
            len(embs.vocab)))
    else:
        embs = None

    # set random seed for experiment reproducibility
    torch.manual_seed(123)
    torch.cuda.manual_seed(123)

    # load data
    data = EHRdata(indir, ut.dt_files['ehr'], sampling)
    data_generator = DataLoader(data,
                                ut.model_param['batch_size'],
                                shuffle=True,
                                collate_fn=ehr_collate)

    print('Cohort Size: {0} -- Max Sequence Length: {1}\n'.format(
        len(data), ut.len_padded))

    # define model and optimizer
    print('Learning rate: {0}'.format(ut.model_param['learning_rate']))
    print('Batch size: {0}'.format(ut.model_param['batch_size']))
    print('Kernel size: {0}\n'.format(ut.model_param['kernel_size']))

    model = net.ehrEncoding(vocab_size=vocab_size,
                            max_seq_len=ut.len_padded,
                            emb_size=ut.model_param['embedding_size'],
                            kernel_size=ut.model_param['kernel_size'],
                            pre_embs=embs,
                            vocab=vocab)

    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=ut.model_param['learning_rate'],
                                 weight_decay=ut.model_param['weight_decay'])

    # training and evaluation
    if torch.cuda.device_count() > 1:
        print('No. of GPUs: {0}\n'.format(torch.cuda.device_count()))
        model = nn.DataParallel(model)
    else:
        model.cuda()
        print('No. of GPUs: 1\n')

    # model.cuda()
    loss_fn = net.criterion
    print('Training for {} epochs\n'.format(ut.model_param['num_epochs']))
    mrn, encoded, metrics_avg = train_and_evaluate(model, data_generator,
                                                   loss_fn, optimizer,
                                                   net.metrics, exp_dir)

    # save results

    # encoded vectors (representations)
    outfile = os.path.join(exp_dir, 'encoded_vect.csv')
    with open(outfile, 'w') as f:
        wr = csv.writer(f)
        wr.writerows(encoded)

    # MRNs to keep track of the order
    outfile = os.path.join(exp_dir, 'mrns.csv')
    with open(outfile, 'w') as f:
        wr = csv.writer(f)
        for m in mrn:
            wr.writerow([m])

    # metrics (loss and accuracy)
    outfile = os.path.join(exp_dir, 'metrics.txt')
    with open(outfile, 'w') as f:
        f.write('Mean loss: %.3f\n' % metrics_avg['loss'])
        f.write('Accuracy: %.3f\n' % metrics_avg['accuracy'])

    # evaluate clustering
    gt_file = os.path.join(indir, ut.dt_files['diseases'])
    gt_disease = clu.load_mrn_disease(gt_file)
    min_clu = 2
    max_clu = 10

    if eval_baseline:
        print('\nRunning clustering on the TF-IDF vectors')
        datafile = os.path.join(indir, ut.dt_files['ehr'])
        mrn_idx, svd_mtx = clu.svd_tfidf(datafile, vocab_size)
        gt_disease_raw = [gt_disease[m][0] for m in mrn_idx]
        clu.eval_hierarchical_clustering(svd_mtx, gt_disease_raw, min_clu,
                                         max_clu)

    print('\nRunning clustering on the encoded vectors')
    gt_disease_enc = [gt_disease[m][0] for m in mrn]
    clu.eval_hierarchical_clustering(encoded,
                                     gt_disease_enc,
                                     min_clu,
                                     max_clu,
                                     preproc=True)

    return
Example #22
def learn_patient_representations(indir,
                                  test_set=False,
                                  sampling=None,
                                  emb_filename=None):
    # experiment folder with date and time to save the representations
    exp_dir = os.path.join(indir, 'encodings')
    os.makedirs(exp_dir, exist_ok=True)

    # get the vocabulary size
    fvocab = os.path.join(indir, ut.dt_files['vocab'])
    with open(fvocab) as f:
        rd = csv.reader(f)
        next(rd)
        vocab = {}
        for r in rd:
            vocab[int(r[1])] = r[0]
        vocab_size = len(vocab) + 1
    print('Vocabulary size: {0}'.format(vocab_size))

    # load pre-computed embeddings
    if emb_filename is not None:
        model = Word2Vec.load(emb_filename)
        embs = model.wv
        del model
        print('Loaded pre-computed embeddings for {0} concepts'.format(
            len(embs.vocab)))
    else:
        embs = None

    # set random seed for experiment reproducibility
    torch.manual_seed(123)
    torch.cuda.manual_seed(123)

    # load data
    data_tr = EHRdata(os.path.join(indir), ut.dt_files['ehr-file'], sampling)
    data_generator_tr = DataLoader(data_tr,
                                   ut.model_param['batch_size'],
                                   shuffle=True,
                                   collate_fn=ehr_collate)
    if test_set:
        data_ts = EHRdata(os.path.join(indir), ut.dt_files['ehr-file-test'],
                          sampling)

        data_generator_ts = DataLoader(data_ts,
                                       ut.model_param['batch_size'],
                                       shuffle=True,
                                       collate_fn=ehr_collate)
        print("Test cohort size: {0}".format(len(data_ts)))
    else:
        data_generator_ts = data_generator_tr

    print('Training cohort size: {0}\n'.format(len(data_tr)))
    print('Max Sequence Length: {0}\n'.format(ut.len_padded))
    # define model and optimizer
    print('Learning rate: {0}'.format(ut.model_param['learning_rate']))
    print('Batch size: {0}'.format(ut.model_param['batch_size']))
    print('Kernel size: {0}\n'.format(ut.model_param['kernel_size']))

    model = net.ehrEncoding(vocab_size=vocab_size,
                            max_seq_len=ut.len_padded,
                            emb_size=ut.model_param['embedding_size'],
                            kernel_size=ut.model_param['kernel_size'],
                            pre_embs=embs,
                            vocab=vocab)

    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=ut.model_param['learning_rate'],
                                 weight_decay=ut.model_param['weight_decay'])

    # training and evaluation
    if torch.cuda.device_count() > 1:
        print('No. of GPUs: {0}\n'.format(torch.cuda.device_count()))
        model = nn.DataParallel(model)
    else:
        model.cuda()
        print('No. of GPUs: 1\n')

    # model.cuda()
    loss_fn = net.criterion
    print('Training for {} epochs\n'.format(ut.model_param['num_epochs']))

    mrn, encoded, encoded_avg, metrics_avg = train_and_evaluate(
        model, data_generator_tr, data_generator_ts, loss_fn, optimizer,
        net.metrics, exp_dir)

    # save results

    # encoded vectors (representations)
    outfile = os.path.join(exp_dir, 'convae-avg_vect.csv')
    with open(outfile, 'w') as f:
        wr = csv.writer(f)
        wr.writerow(["MRN", "ENCODED-AVG"])
        for m, e in zip(mrn, encoded_avg):
            wr.writerow([m] + list(e))

    outfile = os.path.join(exp_dir, 'convae_vect.csv')
    with open(outfile, 'w') as f:
        wr = csv.writer(f)
        wr.writerow(["MRN", "ENCODED-SUBSEQ"])
        for m, evs in zip(mrn, encoded):
            for e in evs:
                wr.writerow([m] + e)

    # metrics (loss and accuracy)
    outfile = os.path.join(exp_dir, 'metrics.txt')
    with open(outfile, 'w') as f:
        f.write('Mean loss: %.3f\n' % metrics_avg['loss'])
        f.write('Accuracy: %.3f\n' % metrics_avg['accuracy'])

    # ehr subseq with age in days
    outfile = os.path.join(exp_dir,
                           'cohort-ehr-subseq{0}.csv'.format(ut.len_padded))
    with open(os.path.join(indir, 'cohort-ehrseq.csv')) as f:
        rd = csv.reader(f)
        next(rd)
        ehr = {}
        for r in rd:
            ehr.setdefault(r[0], list()).extend(r[1:])
    ehr_subseq = {}
    for list_m, batch in data_generator_tr:
        for b, m in zip(batch, list_m):
            if len(b) == 1:
                ehr_subseq[m] = b.tolist()
            else:
                seq = []
                for vec in b.tolist():
                    seq.extend(vec)
                nseq, nleft = divmod(len(seq), ut.len_padded)
                if nleft > 0:
                    seq = seq + [0] * (ut.len_padded - nleft)
                for i in range(0, len(seq) - ut.len_padded + 1, ut.len_padded):
                    ehr_subseq.setdefault(m, list()).append(seq[i:i +
                                                                ut.len_padded])
    with open(outfile, 'w') as f:
        wr = csv.writer(f)
        wr.writerow(["MRN", "EHRsubseq"])
        for m, subseq in ehr_subseq.items():
            for seq in subseq:
                wr.writerow([m] + list(filter(lambda x: x != 0, seq)))

    if test_set:
        outfile = os.path.join(
            exp_dir, 'cohort_test-ehr-subseq{0}.csv'.format(ut.len_padded))
        ehr_subseq = {}
        for list_m, batch in data_generator_ts:
            for b, m in zip(batch, list_m):
                if len(b) == 1:
                    ehr_subseq[m] = b.tolist()
                else:
                    seq = []
                    for vec in b.tolist():
                        seq.extend(vec)
                    nseq, nleft = divmod(len(seq), ut.len_padded)
                    if nleft > 0:
                        seq = seq + [0] * (ut.len_padded - nleft)
                    for i in range(0,
                                   len(seq) - ut.len_padded + 1,
                                   ut.len_padded):
                        ehr_subseq.setdefault(m, list()).append(
                            seq[i:i + ut.len_padded])
        with open(outfile, 'w') as f:
            wr = csv.writer(f)
            wr.writerow(["MRN", "EHRsubseq"])
            for m, subseq in ehr_subseq.items():
                for seq in subseq:
                    wr.writerow([m] + list(filter(lambda x: x != 0, seq)))

    return
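A tiny, self-contained illustration of the padding and chopping arithmetic used above (with ut.len_padded assumed to be 32):

seq = list(range(1, 71))             # a flattened record of 70 codes
nseq, nleft = divmod(len(seq), 32)   # (2, 6): two full subsequences, 6 left over
if nleft > 0:
    seq = seq + [0] * (32 - nleft)   # pad with zeros to 96 codes
subseqs = [seq[i:i + 32] for i in range(0, len(seq), 32)]
assert len(subseqs) == 3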
Example #23
                                            len(args.gpus),
                                            cpu_merge=False)

    model.compile(
        loss=args.loss,
        optimizer=keras.optimizers.Adadelta(),
        metrics=['accuracy'],
    )

    return model


# Model
try:
    model = load_model(SINGLE_MODEL_NAME)
    print("Model loaded from disk")
    create_model = False
except Exception:
    create_model = True

if create_model:
    print("Creating new single vgg model")
    model = compiled_single_model(input_shape)

train_and_evaluate(model,
                   args.epochs,
                   args.batches,
                   gpus=args.gpus,
                   plot_history=args.plot_history,
                   plot_model=args.plot_model)
Example #24
    model.embedding.weight.data.copy_(pretrained_embeddings)

    UNK_IDX = dataset.TEXT.vocab.stoi[dataset.TEXT.unk_token]

    model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
    model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

    optimizer = optim.Adam(model.parameters())

    criterion = nn.CrossEntropyLoss()

    model = model.to(device)
    criterion = criterion.to(device)

    if answers.get('choice') == 'Train model':
        train.train_and_evaluate(model, train_iterator, valid_iterator, optimizer, criterion)
        test_loss, test_acc = train.evaluate(model, test_iterator, criterion)
        print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc * 100:.3f}%')

    if answers.get('choice') == 'Evaluate model':
        model.load_state_dict(torch.load('ezmath-model_83.pt'))
        test_loss, test_acc = train.evaluate(model, test_iterator, criterion)
        print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc * 100:.3f}%')

    if answers.get('choice') == 'Make Prediction':
        model.load_state_dict(torch.load('ezmath-model_83.pt'))
        nlp = spacy.load('it_core_news_sm')
        string = input("Please insert the exercise text: ")
        print('Making prediction for: ')
        print(string)
        pred_class = model.predict_class(string, nlp, dataset, device)
Example #25
def generate_entry(model_name,
                   hyperparameters,
                   datasets=('low', 'medium', 'high'),
                   use_hierarchical_attention=False,
                   use_ptr_gen=True,
                   test_data='test',
                   write_hyperparameter=False,
                   output_folder=None,
                   resume=False):

    languages = get_languages()

    if output_folder is None:
        output_folder = os.path.join('output', model_name)
    if not resume:
        os.makedirs(output_folder)

    if write_hyperparameter:
        with open(os.path.join(output_folder, 'hyperparameters'),
                  'w',
                  encoding='utf8') as file:
            file.write(hyperparameters)

    for language in tqdm(sorted(languages)):
        for dataset in datasets:
            if resume and os.path.exists(
                    os.path.join(output_folder, '{}-{}-out'.format(
                        language, dataset))):
                continue
            lr = hyperparameters['lr'][dataset]
            embedding_size = hyperparameters['embedding_size'][dataset]
            hidden_size = hyperparameters['hidden_size'][dataset]
            clip = hyperparameters['clip'][dataset]
            dropout_p = hyperparameters['dropout_p'][dataset]
            alpha = hyperparameters['alpha'][dataset]
            beta = hyperparameters['beta'][dataset]
            patience = hyperparameters['patience'][dataset]
            epochs_extension = hyperparameters['epochs_extension'][dataset]

            experiment_name = "{}_{}_{}_lr{}_em{}_hd_{}_clip{}_p{}_a{}_b_{}_{}".format(
                model_name, language, dataset, lr, embedding_size, hidden_size,
                str(clip), dropout_p, alpha, beta, int(time.time()))

            try:
                model_inputs_train, model_inputs_val, labels_train, labels_val, \
                vocab = package.data.load_data(language, dataset, test_data=test_data, use_external_val_data=True,
                                               val_ratio=0.2, random_state=42)
            except FileNotFoundError:
                continue

            model = package.net.Model(
                vocab,
                embedding_size=embedding_size,
                hidden_size=hidden_size,
                use_hierarchical_attention=use_hierarchical_attention,
                use_ptr_gen=use_ptr_gen,
                dropout_p=dropout_p).to(device)
            optimizer = optim.Adam(lr=lr, params=model.parameters())
            loss_fn = package.loss.Criterion(vocab, alpha, beta)

            writer = SummaryWriter('runs/' + experiment_name)
            model_save_dir = os.path.join('./saved_models', experiment_name)
            os.makedirs(model_save_dir)

            epochs = hyperparameters['epochs'][dataset]
            train_and_evaluate(model_inputs_train,
                               labels_train,
                               model_inputs_val,
                               labels_val,
                               model,
                               optimizer,
                               loss_fn,
                               epochs=epochs,
                               batch_size=32,
                               model_save_dir=model_save_dir,
                               show_progress=False,
                               writer=writer,
                               clip=clip)
            epochs_trained = epochs

            # Load best performing model on validation set
            best_state = torch.load(os.path.join(model_save_dir, 'best.model'))
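            # Keep extending training in chunks of epochs_extension until the
            # best validation epoch is at least patience epochs in the past.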
            while epochs_trained - best_state['epoch_num'] < patience:
                train_and_evaluate(model_inputs_train,
                                   labels_train,
                                   model_inputs_val,
                                   labels_val,
                                   model,
                                   optimizer,
                                   loss_fn,
                                   epochs=epochs_extension,
                                   batch_size=32,
                                   model_save_dir=model_save_dir,
                                   show_progress=False,
                                   writer=writer,
                                   clip=clip,
                                   starting_epoch=epochs_trained + 1,
                                   initial_best_val_acc=best_state['val_acc'])
                epochs_trained += epochs_extension
                best_state = torch.load(
                    os.path.join(model_save_dir, 'best.model'))
            model.load_state_dict(best_state['model_state'])

            if test_data == 'dev':
                dev_file = os.path.join(TASK1_DATA_PATH,
                                        '{}-dev'.format(language))
                lemmas_test, tags_test, _ = read_dataset(dev_file)
            elif test_data == 'test':
                test_file = os.path.join(TASK1_DATA_PATH,
                                         '{}-covered-test'.format(language))
                lemmas_test, tags_test = read_covered_dataset(test_file)
            else:
                raise ValueError

            file_path = os.path.join(output_folder,
                                     '{}-{}-out'.format(language, dataset))
            generate_output(model, lemmas_test, tags_test, file_path)
Example #26
def main(args):
    # Load the parameters from json file
    params_dir = args.params_dir
    json_path = os.path.join(params_dir, 'params.json')
    assert os.path.isfile(
        json_path), "No json configuration file found at {}".format(json_path)
    params = utils.Params(json_path)

    # use GPU if available
    params.cuda = torch.cuda.is_available()

    # Set the random seed for reproducible experiments
    torch.manual_seed(params.seed)
    if params.cuda:
        torch.cuda.manual_seed(params.seed)

    # Set the logger
    model_dir = args.output_dir
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    utils.set_logger(os.path.join(model_dir, 'train.log'))

    logging.info("************ Validation fold: {} ************".format(
        args.fold))

    # Create the input data pipeline
    logging.info("Loading the datasets...")

    config_dict = {
        'image_dir': os.path.join(args.input_dir, 'train'),
        'csv_path': os.path.join(args.input_dir, 'train.csv')
    }

    train_data = DataPreprocess(config_dict)
    df, target_cols, num_targets = train_data.df, train_data.target_cols, train_data.num_targets

    # check for debug mode
    if args.debug:
        params.num_epochs = 1
        df = df.sample(n=100, random_state=params.seed).reset_index(drop=True)

    # update params
    params.mode = args.mode
    params.num_targets = num_targets
    params.target_cols = target_cols

    # split data into folds and pass to the model
    Fold = GroupKFold(n_splits=params.num_folds)
    groups = df['PatientID'].values
    for n, (train_index, valid_index) in enumerate(
            Fold.split(df, df[params.target_cols], groups)):
        df.loc[valid_index, 'fold'] = int(n)
    df['fold'] = df['fold'].astype(int)

    # get training and validation data using folds
    train_df = df[df.fold != args.fold].reset_index(drop=True)
    valid_df = df[df.fold == args.fold].reset_index(drop=True)

    # get dataloaders
    train_dataloader = dataloader.fetch_dataloader(train_df,
                                                   params,
                                                   data='train')
    valid_dataloader = dataloader.fetch_dataloader(valid_df,
                                                   params,
                                                   data='valid')

    logging.info("- done.")

    # Define the model and optimizer
    model = RANZCRModel(params, pretrained=True).model
    if params.cuda:
        model = model.to(torch.device('cuda'))

    optimizer = optim.Adam(model.parameters(),
                           lr=params.learning_rate,
                           amsgrad=False)
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer,
                                               mode='min',
                                               factor=0.1,
                                               patience=2,
                                               verbose=True)

    # fetch loss function and metrics
    loss_fn = nn.BCEWithLogitsLoss()
    metrics = models.metrics

    # Train the model
    logging.info("Starting training for {} epoch(s)".format(params.num_epochs))
    train_and_evaluate(model, train_dataloader, valid_dataloader,
                       valid_df[params.target_cols].values, optimizer,
                       scheduler, loss_fn, metrics, params, model_dir)
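A tiny, self-contained illustration of why GroupKFold is used for the split above: rows sharing a group (here a stand-in for PatientID) never appear in both the training and validation folds.

import numpy as np
from sklearn.model_selection import GroupKFold

X = np.arange(8).reshape(-1, 1)
y = np.zeros(8)
groups = np.array([0, 0, 1, 1, 2, 2, 3, 3])   # stand-in for PatientID values
for train_idx, valid_idx in GroupKFold(n_splits=2).split(X, y, groups):
    # No group ever straddles the train/validation boundary.
    assert not set(groups[train_idx]) & set(groups[valid_idx])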