Example #1
# initialize experiment / load dataset
logger = initialize_exp(params)
data, attributes, data2, attributes2 = load_images(params)
train_data = DataSampler(data[0], attributes[0], data2, attributes2, params)
valid_data = DataSampler(data[1], attributes[1], None, None, params)

# build the model
ae = AutoEncoder(params).cuda()
lat_dis = LatentDiscriminator(params).cuda() if params.n_lat_dis else None
ptc_dis = PatchDiscriminator(params).cuda() if params.n_ptc_dis else None
clf_dis = Classifier(params).cuda() if params.n_clf_dis else None
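# NOTE: each discriminator is optional; a n_*_dis value of 0 leaves it as None here
# and skips its training steps in the loop below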
eval_clf = torch.load(params.eval_clf).cuda().eval()

# trainer / evaluator
trainer = Trainer(ae, lat_dis, ptc_dis, clf_dis, train_data, params)
evaluator = Evaluator(ae, lat_dis, ptc_dis, clf_dis, eval_clf, valid_data, params)


for n_epoch in range(params.n_epochs):

    logger.info('Starting epoch %i...' % n_epoch)

    for n_iter in range(0, params.epoch_size, params.batch_size):
        
        # latent discriminator training
        for _ in range(params.n_lat_dis):
            trainer.lat_dis_step()

        # patch discriminator training
        for _ in range(params.n_ptc_dis):
            trainer.ptc_dis_step()
Example #2
    for n in range(params.num_samples):
        mappings.append(
            sample_from_multivariate_gaussian(interpolated_mapping,
                                              var=params.var))

    for m, sampled_mapping in enumerate(mappings):
        trainer.set_mapping_weights(weights=sampled_mapping)

        # compute the discriminator loss
        logger.info('----> COMPUTING DISCRIMINATOR LOSS <----\n\n')
        logger.info('alpha={}, m={}'.format(alpha, m))
        losses = []
        for n_iter in range(0, params.epoch_size, params.batch_size):
            loss = trainer.compute_loss()
            losses.append(loss.cpu().data.numpy())
            #print(losses)
        logger.info('Discriminator loss {}: {}\n'.format(m, np.mean(losses)))

        evaluator = Evaluator(trainer)

        # run evaluations
        to_log = OrderedDict({'n_iter': 0})
        evaluator.monolingual_wordsim(to_log)
        # evaluator.monolingual_wordanalogy(to_log)
        if params.tgt_lang:
            evaluator.crosslingual_wordsim(to_log)
            evaluator.word_translation(to_log)
            #evaluator.sent_translation(to_log)
            evaluator.dist_mean_cosine(to_log)
Example #3
def main():
    VALIDATION_METRIC_SUP = 'precision_at_1-csls_knn_10'
    VALIDATION_METRIC_UNSUP = 'mean_cosine-csls_knn_10-S2T-10000'


    # main
    parser = argparse.ArgumentParser(description='Supervised training')
    parser.add_argument("--seed", type=int, default=-1, help="Initialization seed")
    parser.add_argument("--verbose", type=int, default=2, help="Verbose level (2:debug, 1:info, 0:warning)")
    parser.add_argument("--exp_path", type=str, default="", help="Where to store experiment logs and models")
    parser.add_argument("--exp_name", type=str, default="debug", help="Experiment name")
    parser.add_argument("--exp_id", type=str, default="", help="Experiment ID")
    parser.add_argument("--cuda", type=bool_flag, default=True, help="Run on GPU")
    parser.add_argument("--export", type=str, default="txt", help="Export embeddings after training (txt / pth)")

    # data
    parser.add_argument("--src_lang", type=str, default='en', help="Source language")
    parser.add_argument("--tgt_lang", type=str, default='es', help="Target language")
    parser.add_argument("--aux_lang", type=str, default='', help="Auxiliary language")
    parser.add_argument("--emb_dim", type=int, default=300, help="Embedding dimension")
    parser.add_argument("--max_vocab", type=int, default=200000, help="Maximum vocabulary size (-1 to disable)")
    # training refinement
    parser.add_argument("--n_refinement", type=int, default=5, help="Number of refinement iterations (0 to disable the refinement procedure)")
    # dictionary creation parameters (for refinement)
    parser.add_argument("--dico_train", type=str, default="default", help="Path to training dictionary (default: use identical character strings)")
    parser.add_argument("--dico_eval", type=str, default="default", help="Path to evaluation dictionary")
    parser.add_argument("--dico_method", type=str, default='csls_knn_10', help="Method used for dictionary generation (nn/invsm_beta_30/csls_knn_10)")
    parser.add_argument("--dico_build", type=str, default='S2T&T2S', help="S2T,T2S,S2T|T2S,S2T&T2S")
    parser.add_argument("--dico_threshold", type=float, default=0, help="Threshold confidence for dictionary generation")
    parser.add_argument("--dico_max_rank", type=int, default=10000, help="Maximum dictionary words rank (0 to disable)")
    parser.add_argument("--dico_min_size", type=int, default=0, help="Minimum generated dictionary size (0 to disable)")
    parser.add_argument("--dico_max_size", type=int, default=0, help="Maximum generated dictionary size (0 to disable)")
    # reload pre-trained embeddings
    parser.add_argument("--src_emb", type=str, default='', help="Reload source embeddings")
    parser.add_argument("--tgt_emb", type=str, default='', help="Reload target embeddings")
    parser.add_argument("--aux_emb", type=str, default='', help="Reload auxiliary embeddings")
    parser.add_argument("--normalize_embeddings", type=str, default="", help="Normalize embeddings before training")
    parser.add_argument("--fitting_method", type=str, default="non_iterative", help="Method of fitting, one of [non_iterative, em, gauss_seidel, gradient_based]")

    # parse parameters
    params = parser.parse_args()

    # check parameters
    assert not params.cuda or torch.cuda.is_available()
    assert params.dico_train in ["identical_char", "default"] or os.path.isfile(params.dico_train)
    assert params.dico_build in ["S2T", "T2S", "S2T|T2S", "S2T&T2S"]
    assert params.dico_max_size == 0 or params.dico_max_size < params.dico_max_rank
    assert params.dico_max_size == 0 or params.dico_max_size > params.dico_min_size
    print(params.src_emb, params.tgt_emb, params.aux_emb)
    assert os.path.isfile(params.src_emb)
    assert os.path.isfile(params.tgt_emb)
    assert params.dico_eval == 'default' or os.path.isfile(params.dico_eval)
    assert params.export in ["", "txt", "pth"]

    # build logger / model / trainer / evaluator
    logger = initialize_exp(params)
    src_emb, tgt_emb, aux_emb, mapping, _ = build_model(params, False)

    trainer = Trainer(src_emb, tgt_emb, aux_emb, mapping, None, params)

    # load a training dictionary. if a dictionary path is not provided, use a default
    # one ("default") or create one based on identical character strings ("identical_char")
    trainer.load_training_dico(params.dico_train)

    # define the validation metric
    VALIDATION_METRIC = VALIDATION_METRIC_UNSUP if params.dico_train == 'identical_char' else VALIDATION_METRIC_SUP
    logger.info("Validation metric: %s" % VALIDATION_METRIC)

    # apply the PCCA solution
    trainer.fit(fitting_method=params.fitting_method)

    # IMPORTANT: EVALUATOR SHOULD BE CREATED AFTER TRAINER HAS BEEN FITTED
    evaluator = Evaluator(trainer)

    # embeddings evaluation
    to_log = OrderedDict({})
    evaluator.all_eval(to_log)

    logger.info("__log__:%s" % json.dumps(to_log))
Example #4
assert not params.cuda or torch.cuda.is_available()
assert 0 <= params.dis_dropout < 1
assert 0 <= params.dis_input_dropout < 1
assert 0 <= params.dis_smooth < 0.5
assert params.dis_lambda > 0 and params.dis_steps > 0
assert 0 < params.lr_shrink <= 1
assert os.path.isfile(params.src_emb)
assert os.path.isfile(params.tgt_emb)
assert params.dico_eval == 'default' or os.path.isfile(params.dico_eval)
assert params.export in ["", "txt", "pth"]

# build model / trainer / evaluator
logger = initialize_exp(params)
src_emb, tgt_emb, mapping, discriminator = build_model(params, True)
trainer = Trainer(src_emb, tgt_emb, mapping, discriminator, params)
evaluator = Evaluator(trainer)

writer = SummaryWriter()
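# TensorBoard writer (presumably torch.utils.tensorboard or tensorboardX; both
# write event files under ./runs/ when no log_dir is given)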


"""
Learning loop for Adversarial Training
"""
if params.adversarial:
    logger.info('----> ADVERSARIAL TRAINING <----\n\n')

    # training loop
    for n_epoch in range(params.n_epochs):

        logger.info('Starting adversarial training epoch %i...' % n_epoch)
        tic = time.time()
Example #5
assert 0 <= params.dis_dropout < 1
assert 0 <= params.dis_input_dropout < 1
assert 0 <= params.dis_smooth < 0.5
assert params.dis_lambda > 0 and params.dis_steps > 0
assert 0 < params.lr_shrink <= 1
assert os.path.isfile(params.src_emb)
assert os.path.isfile(params.tgt_emb)
assert params.dico_eval == 'default' or os.path.isfile(params.dico_eval)
assert params.export in ["", "txt", "pth"]

# build model / trainer / evaluator
logger = initialize_exp(params)
src_emb, tgt_emb, mapping, discriminator = build_model(params, True)

trainer = Trainer(src_emb, tgt_emb, mapping, discriminator, params)
evaluator = Evaluator(trainer)

writer = SummaryWriter()

with open("data/results/wordresult_" + params.exp_id + ".txt", "wr") as w:
    """
    Learning loop for Adversarial Training
    """
    if params.adversarial:
        logger.info('----> ADVERSARIAL TRAINING <----\n\n')

        # training loop
        for n_epoch in range(params.n_epochs):

            logger.info('Starting adversarial training epoch %i...' % n_epoch)
            tic = time.time()
Example #6
def evaluate_real_datasets():
    REAL_DATASET_GROUP_PATH = 'data/raw/'
    real_dataset_groups = glob.glob(REAL_DATASET_GROUP_PATH + '*')
    seeds = np.random.randint(np.iinfo(np.uint32).max,
                              size=RUNS,
                              dtype=np.uint32)
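    # one independent uint32 seed per run; each seed re-instantiates KDDCup below,
    # presumably so every run re-samples the data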
    results = pd.DataFrame()
    datasets = [KDDCup(seed=1)]
    for real_dataset_group in real_dataset_groups:
        for data_set_path in glob.glob(real_dataset_group +
                                       '/labeled/train/*'):
            data_set_name = data_set_path.split('/')[-1].replace('.pkl', '')
            dataset = RealPickledDataset(data_set_name, data_set_path)
            datasets.append(dataset)

    for seed in seeds:
        datasets[0] = KDDCup(seed)
        evaluator = Evaluator(datasets, detectors, seed=seed)
        evaluator.evaluate()
        result = evaluator.benchmarks()
        evaluator.plot_roc_curves()
        evaluator.plot_threshold_comparison()
        evaluator.plot_scores()
        results = pd.concat([results, result], ignore_index=True)

    avg_results = results.groupby(['dataset', 'algorithm'],
                                  as_index=False).mean()
    evaluator.set_benchmark_results(avg_results)
    evaluator.export_results('run_real_datasets')
    evaluator.create_boxplots(runs=RUNS, data=results, detectorwise=False)
    evaluator.create_boxplots(runs=RUNS, data=results, detectorwise=True)
Example #7
assert 0 < params.lr_shrink <= 1
assert os.path.isfile(params.src_emb)
assert os.path.isfile(params.tgt_emb)
assert params.dico_eval == 'default' or os.path.isfile(params.dico_eval)
assert params.export in ["", "txt", "pth"]

# build model / trainer / evaluator
logger = initialize_exp(params)
src_emb, tgt_emb, _, mapping, discriminator = build_model(params, True)
if params.glo_emb:
    glo_emb = KeyedVectors.load_word2vec_format(params.glo_emb)
else:
    glo_emb = None
trainer = Trainer(src_emb, tgt_emb, glo_emb, params.eval_emb, mapping,
                  discriminator, params)
evaluator = Evaluator(trainer)
"""
Learning loop for Adversarial Training
"""
if params.adversarial:
    logger.info('----> ADVERSARIAL TRAINING <----\n\n')

    # training loop
    for n_epoch in range(params.n_epochs):

        logger.info('Starting adversarial training epoch %i...' % n_epoch)
        tic = time.time()
        n_words_proc = 0
        stats = {'DIS_COSTS': []}

        for n_iter in range(0, params.epoch_size, params.batch_size):
Example #8
assert os.path.isfile(params.tgt_emb)
assert params.dico_eval == 'default' or os.path.isfile(params.dico_eval)
assert params.export in ["", "txt", "pth"]
params.verbose = 0
params.leave_one_out = True

# build logger / model / trainer / evaluator
# do all of this once to get the word2id mappings
logger = initialize_exp(params)
src_emb, tgt_emb, eval_emb, mapping, _ = build_model(params, False)
if params.glo_emb:
    glo_emb = KeyedVectors.load_word2vec_format(params.glo_emb)
else:
    glo_emb = None
trainer = Trainer(src_emb, tgt_emb, glo_emb, eval_emb, mapping, None, params)
evaluator = Evaluator(trainer)

# load a training dictionary. if a dictionary path is not provided, use a default
# one ("default") or create one based on identical character strings ("identical_char")
trainer.load_training_dico(params.dico_train, params.subsample)
eval_dico = load_dictionary(params.dico_eval, trainer.src_dico.word2id, trainer.tgt_dico.word2id)
if params.cuda:
    eval_dico = eval_dico.cuda()
dico = trainer.dico.clone()

# define the validation metric
logger.info("Validation metric: %s" % params.val_metric)

cos = torch.nn.CosineSimilarity(dim=1)

ranks = []
Example #9
assert not params.cuda or torch.cuda.is_available()
assert 0 <= params.dis_dropout < 1
assert 0 <= params.dis_input_dropout < 1
assert 0 <= params.dis_smooth < 0.5
assert params.dis_lambda > 0 and params.dis_steps > 0
assert 0 < params.lr_shrink <= 1
assert os.path.isfile(params.src_emb)
assert all(os.path.isfile(emb) for emb in params.tgt_emb)
assert params.dico_eval == 'default' or os.path.isfile(params.dico_eval)
assert params.export in ["", "txt", "pth"]

# build model / trainer / evaluator
logger = initialize_exp(params)
src_emb, tgt_emb, mapping, discriminator = build_model(params, True)
trainer = Trainer(src_emb, tgt_emb, mapping, discriminator, params)
evaluator = Evaluator(trainer)
"""
Learning loop for Adversarial Training
"""
if params.adversarial:
    logger.info('----> ADVERSARIAL TRAINING <----\n\n')

    # training loop
    for n_epoch in range(params.n_epochs):

        logger.info('Starting adversarial training epoch %i...' % n_epoch)
        tic = time.time()
        n_words_proc = 0
        stats = {'DIS_COSTS': []}

        for n_iter in range(0, params.epoch_size, params.batch_size):
Example #10
assert not params.cuda or torch.cuda.is_available()
assert 0 <= params.dis_dropout < 1
assert 0 <= params.dis_input_dropout < 1
assert 0 <= params.dis_smooth < 0.5
assert params.dis_lambda > 0 and params.dis_steps > 0
assert 0 < params.lr_shrink <= 1
assert os.path.isfile(params.src_emb)
assert os.path.isfile(params.tgt_emb)
assert params.dico_eval == 'default' or os.path.isfile(params.dico_eval)
assert params.export in ["", "txt", "pth"]

# build model / trainer / evaluator
logger = initialize_exp(params)
src_emb, tgt_emb, mapping, discriminator = build_model(params, True)
trainer = Trainer(src_emb, tgt_emb, mapping, discriminator, params)
evaluator = Evaluator(trainer)
"""
Learning loop for Adversarial Training
"""
if params.adversarial:
    logger.info('----> ADVERSARIAL TRAINING <----\n\n')

    # training loop
    for n_epoch in range(params.n_epochs):

        logger.info('Starting adversarial training epoch %i...' % n_epoch)
        tic = time.time()
        n_words_proc = 0
        stats = {'DIS_COSTS': []}

        for n_iter in range(0, params.epoch_size, params.batch_size):
Example #11
assert not params.cuda or torch.cuda.is_available()
assert 0 <= params.dis_dropout < 1
assert 0 <= params.dis_input_dropout < 1
assert 0 <= params.dis_smooth < 0.5
assert params.dis_lambda > 0 and params.dis_steps > 0
assert 0 < params.lr_shrink <= 1
assert os.path.isfile(params.src_emb)
assert os.path.isfile(params.tgt_emb)
assert params.dico_eval == 'default' or os.path.isfile(params.dico_eval)
assert params.export in ["", "txt", "pth"]

# build model / trainer / evaluator
logger = initialize_exp(params)
src_emb, tgt_emb, mapping, discriminator = build_model(params, True)
trainer = Trainer(src_emb, tgt_emb, mapping, discriminator, params)
evaluator = Evaluator(trainer)

# load a training dictionary (supervision)
trainer.load_training_dico(params.dico_train)
"""
Learning loop for Adversarial Training
"""
if params.adversarial:
    logger.info('----> ADVERSARIAL TRAINING <----\n\n')

    # training loop
    for n_epoch in range(params.n_epochs):

        logger.info('Starting adversarial training epoch %i...' % n_epoch)
        tic = time.time()
        n_words_proc = 0
Example #12
def learning(params, src_data, tgt_data, options):
    VALIDATION_METRIC = 'mean_cosine-csls_knn_10-S2T-10000'
    logger = logging.getLogger('{}Log'.format(src_data.dataname))
    for i in range(10):
        # tic = time.time()
        if i == 0:
            options.initialize = True
        else:
            options.initialize = False

        logger.info("src_learning {}回目".format(i + 1))
        src_data = RVSML_OT_Learning(src_data, options, params)

        logger.info("tgt_learning {}回目".format(i + 1))
        tgt_data = RVSML_OT_Learning(tgt_data, options, params)

        # build model / trainer / evaluator
        src_emb, tgt_emb, mapping, discriminator = build_model(
            params, src_data, tgt_data, True)

        trainer = Trainer(src_emb, tgt_emb, mapping, discriminator, params)
        evaluator = Evaluator(trainer)
        """
        Learning loop for Adversarial Training
        """
        logger.info('----> ADVERSARIAL TRAINING <----\n\n')

        # training loop
        for n_epoch in range(params.n_epochs):

            logger.info('Starting adversarial training epoch %i...' % n_epoch)
            tic = time.time()
            n_words_proc = 0
            stats = {'DIS_COSTS': []}

            for n_iter in range(0, params.epoch_size, params.batch_size):

                # discriminator training
                for _ in range(params.dis_steps):
                    trainer.dis_step(stats)

                # mapping training (discriminator fooling)
                n_words_proc += trainer.mapping_step(stats)

                # log stats
                if n_iter % 500 == 0:
                    stats_str = [('DIS_COSTS', 'Discriminator loss')]
                    stats_log = [
                        '%s: %.4f' % (v, np.mean(stats[k]))
                        for k, v in stats_str if len(stats[k]) > 0
                    ]
                    stats_log.append('%i samples/s' % int(n_words_proc /
                                                          (time.time() - tic)))
                    logger.info(('%06i - ' % n_iter) + ' - '.join(stats_log))

                    # reset
                    tic = time.time()
                    n_words_proc = 0
                    for k, _ in stats_str:
                        del stats[k][:]

            # embeddings / discriminator evaluation
            to_log = OrderedDict({'n_epoch': n_epoch})
            evaluator.all_eval(to_log)
            evaluator.eval_dis(to_log)

            # JSON log / save best model / end of epoch
            logger.info("__log__:%s" % json.dumps(to_log))
            trainer.save_best(to_log, VALIDATION_METRIC)
            logger.info('End of epoch %i.\n\n' % n_epoch)

            # update the learning rate (stop if too small)
            trainer.update_lr(to_log, VALIDATION_METRIC)
            if trainer.map_optimizer.param_groups[0]['lr'] < params.min_lr:
                logger.info('Learning rate < min_lr (%g). BREAK.' % params.min_lr)
                break
        """
        Learning loop for Procrustes Iterative Refinement
        """
        # if params.n_refinement > 0:
        # Get the best mapping according to VALIDATION_METRIC
        logger.info('----> ITERATIVE PROCRUSTES REFINEMENT <----\n\n')
        trainer.reload_best()

        # training loop
        for n_iter in range(params.n_refinement):

            logger.info('Starting refinement iteration %i...' % n_iter)

            # build a dictionary from aligned embeddings
            trainer.build_dictionary()

            # apply the Procrustes solution
            trainer.procrustes()

            # embeddings evaluation
            to_log = OrderedDict({'n_iter': n_iter})
            evaluator.all_eval(to_log)

            # JSON log / save best model / end of epoch
            logger.info("__log__:%s" % json.dumps(to_log))
            trainer.save_best(to_log, VALIDATION_METRIC)
            logger.info('End of refinement iteration %i.\n\n' % n_iter)

        src_data.trans_mat = torch.mm(src_data.trans_mat,
                                      trainer.mapping.weight.data)

    return src_data, tgt_data
Example #13
assert 0 <= params.dis_dropout < 1
assert 0 <= params.dis_input_dropout < 1
assert 0 <= params.dis_smooth < 0.5
assert params.dis_lambda > 0 and params.dis_steps > 0
assert 0 < params.lr_shrink <= 1
assert os.path.isfile(params.src_emb)
assert os.path.isfile(params.tgt_emb)
# assert params.dico_eval == 'default' or os.path.isfile(params.dico_eval)
assert params.dico_eval == '' or os.path.isfile(params.dico_eval)
assert params.export in ["", "txt", "pth"]

# build model / trainer / evaluator
logger = initialize_exp(params)
src_emb, tgt_emb, mapping, discriminator = build_model(params, True)
trainer = Trainer(src_emb, tgt_emb, mapping, discriminator, params)
evaluator = Evaluator(trainer)
"""
Learning loop for Adversarial Training
"""
if params.adversarial:
    logger.info('----> ADVERSARIAL TRAINING <----\n\n')

    # training loop
    for n_epoch in range(params.n_epochs):

        logger.info('Starting adversarial training epoch %i...' % n_epoch)
        tic = time.time()
        n_words_proc = 0
        stats = {'DIS_COSTS': []}

        for n_iter in range(0, params.epoch_size, params.batch_size):
Example #14
                    help="Normalize embeddings before training")
# inference parameters
parser.add_argument("--multilingual_inference_method",
                    nargs='+',
                    help="which inference methods to use",
                    default=['BI', 'NT', 'CNT', 'CAT'])

# parse parameters
params = parser.parse_args()
# check parameters
assert not params.cuda or torch.cuda.is_available()
assert all(os.path.isfile(emb) for emb in params.embs)
assert len(params.langs) == len(params.embs)
assert all([
    inf_met in ['BI', 'NT', 'CNT', 'CAT'] or inf_met.startswith('CAT')
    for inf_met in params.multilingual_inference_method
])

# build logger / model / trainer / evaluator
logger = initialize_exp(params)
embs, mapping = build_model(params)
trainer = Trainer(embs, mapping, params)
evaluator = Evaluator(trainer)
"""
Inference with MWT (Multilingual Word Translation)
"""
logger.info('Starting inference...')

# embeddings evaluation
evaluator.word_translation()
logger.info('End of inference.\n\n')
Example #15
params = parser.parse_args()

# check parameters
assert not params.cuda or torch.cuda.is_available()
assert params.dico_train in ["identical_char", "default"] or os.path.isfile(params.dico_train)
assert params.dico_build in ["S2T", "T2S", "S2T|T2S", "S2T&T2S"]
assert params.dico_max_size == 0 or params.dico_max_size < params.dico_max_rank
assert params.dico_max_size == 0 or params.dico_max_size > params.dico_min_size
assert os.path.isfile(params.src_emb)
assert os.path.isfile(params.tgt_emb)

# build logger / model / trainer / evaluator
logger = initialize_exp(params)
src_emb, tgt_emb, mapping, _ = build_model(params, False)
trainer = Trainer(src_emb, tgt_emb, mapping, None, params)
evaluator = Evaluator(trainer)

# load a training dictionary. if a dictionary path is not provided, use a default
# one ("default") or create one based on identical character strings ("identical_char")
trainer.load_training_dico(params.dico_train)

"""
Learning loop for Procrustes Iterative Refinement
"""
for n_iter in range(params.n_iters):

    logger.info('Starting refinement iteration %i...' % n_iter)

    # build a dictionary from aligned embeddings (unless
    # it is the first iteration and we use the init one)
    if n_iter > 0 or not hasattr(trainer, 'dico'):
Example #16

# parse parameters
params = parser.parse_args()

# check parameters
assert params.src_lang, "source language undefined"
assert os.path.isfile(params.src_emb)
assert not params.tgt_lang or os.path.isfile(params.tgt_emb)
assert params.dico_eval == 'default' or os.path.isfile(params.dico_eval)

# build logger / model / trainer / evaluator
logger = initialize_exp(params)
src_emb, tgt_emb, mapping, _ = build_model(params, False)
trainer = Trainer(src_emb, tgt_emb, mapping, None, params)
evaluator = Evaluator(trainer)

# run evaluations
to_log = OrderedDict({'n_iter': 0})
evaluator.cluster_accuracy(to_log)
# evaluator.monolingual_wordsim(to_log)
# evaluator.monolingual_wordanalogy(to_log)
if params.tgt_lang:
    evaluator.cluster_accuracy(to_log, cl="separately")
    evaluator.cluster_accuracy(to_log, cl="multilingual")
    # evaluator.crosslingual_wordsim(to_log)
    # evaluator.word_translation(to_log)
    # evaluator.sent_translation(to_log)
    # evaluator.dist_mean_cosine(to_log)

logger.info(" === EVALUTATION RESULTS === ")
Example #17
# build the trainable model
ae = AutoEncoder(params).cuda()

# build the Teacher model (if required)
if params.lambda_jacobian > 0:
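    # NOTE: this assignment aliases params rather than copying it, so the max_fm
    # change leaks into params and has to be restored from max_fm_orig below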
    params2 = params
    params2.max_fm = 512 + params.n_attr
    ae_teacher = AutoEncoder(params2).cuda()
    params.max_fm = params.max_fm_orig
else:
    ae_teacher = None

# trainer / evaluator
trainer = Trainer(ae, ae_teacher, train_data, params)

evaluator = Evaluator(ae, ae_teacher, valid_data, params)

for n_epoch in range(params.n_epochs):

    logger.info('Starting epoch %i...' % n_epoch)

    for eee in range(25):
        evaluator.autoencoder_step(iterno=eee, epoch=n_epoch)
    evaluator.step(n_epoch)

    for n_iter in range(0, params.epoch_size, params.batch_size):

        # autoencoder training
        trainer.autoencoder_step()

        # print training statistics
Example #18
    path = os.path.join(trainer.params.exp_path, 'translation.tsv')
    logger.info('Saving translations to %s', path)
    with open(path, 'w') as out, open('corpora/voynich.txt') as inp:
        for idx, line in enumerate(inp):
            line = line.rstrip()
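            # assumes every token on the line exists in txt_dict; an
            # out-of-vocabulary word would raise a KeyError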
            translation = ' '.join([txt_dict[w] for w in line.split()])
            if idx < 10:
                logger.info(translation)
            out.write(translation + '\n')


# build model / trainer / evaluator
logger = initialize_exp(params)
src_emb, tgt_emb, mapping, discriminator = build_model(params, True)
trainer = Trainer(src_emb, tgt_emb, mapping, discriminator, params)
evaluator = Evaluator(trainer)
"""
Learning loop for Adversarial Training
"""
if params.adversarial:
    logger.info('----> ADVERSARIAL TRAINING <----\n\n')

    # training loop
    for n_epoch in range(params.n_epochs):

        logger.info('Starting adversarial training epoch %i...' % n_epoch)
        tic = time.time()
        n_words_proc = 0
        stats = {'DIS_COSTS': []}

        for n_iter in range(0, params.epoch_size, params.batch_size):
Example #19
assert 0 <= params.dis_input_dropout < 1
assert 0 <= params.dis_smooth < 0.5
assert params.dis_lambda > 0 and params.dis_steps > 0
assert 0 < params.lr_shrink <= 1
assert os.path.isfile(params.src_emb)
assert os.path.isfile(params.tgt_emb)
assert params.dico_eval == 'default' or os.path.isfile(params.dico_eval)
assert params.export in ["", "txt", "pth"]

# build model / trainer / evaluator
logger = initialize_exp(params)
src_emb, tgt_emb, mapping, discriminator = build_model(params, True)

trainer = Trainer(src_emb, tgt_emb, mapping, discriminator, params)

evaluator = Evaluator(trainer)
np.random.seed(params.seed)

# init generator parameters with artetxe's methods
if params.map_init == "second_order":
    m_init = extract_initial_mapping(params, src_emb, tgt_emb)
    trainer.set_mapping_weights(torch.from_numpy(m_init))

# if we initialize the generator from a supervised mapping, evaluate before training for sanity check
if not params.map_id_init:
    # embeddings / discriminator evaluation
    to_log = OrderedDict({'n_epoch': -1})
    evaluator.all_eval(to_log)
    evaluator.eval_dis(to_log)
"""
Learning loop for Adversarial Training
"""
Example #20
assert params.dico_build in ["S2T", "T2S", "S2T|T2S", "S2T&T2S"]
assert params.dico_max_size == 0 or params.dico_max_size < params.dico_max_rank
assert params.dico_max_size == 0 or params.dico_max_size > params.dico_min_size
assert os.path.isfile(params.src_emb)
assert all(os.path.isfile(emb) for emb in params.tgt_emb)
#assert params.dico_eval == 'default' or os.path.isfile(params.dico_eval)
assert params.export in ["", "txt", "pth"]
assert len(params.tgt_lang) == len(params.tgt_emb)
assert len(params.tgt_lang) == 1 or params.generalized
assert params.fine_tuning <= params.n_refinement

# build logger / model / trainer / evaluator
logger = initialize_exp(params)
src_emb, tgt_emb, mapping, _ = build_model(params, False)
trainer = Trainer(src_emb, tgt_emb, mapping, None, params)
evaluator = Evaluator(trainer)

# start generalized training with support
support = bool(params.generalized)
# load a training dictionary. if a dictionary path is not provided, use a default
# one ("default") or create one based on identical character strings ("identical_char")
trainer.load_training_dico(params.dico_train, support)
"""
Learning loop for Procrustes Iterative Learning
"""
for n_iter in range(params.n_refinement + 1):

    if n_iter > params.n_refinement - params.fine_tuning:
        support = False

    logger.info('Starting iteration %i...' % n_iter)
Example #21
parser.add_argument("--normalize_embeddings", type=str, default="", help="Normalize embeddings before training")

parser.add_argument("--validate_path", type=str, default="", help="Validation words")


# parse parameters
params = parser.parse_args()

# build model / trainer / evaluator
logger = initialize_exp(params)
src_emb, tgt_emb, mapping, discriminator = build_model(params, True)


trainer = Trainer(src_emb, tgt_emb, mapping, discriminator, params)
trainer.reload_best()
evaluator = Evaluator(trainer)


if params.validate_path == "":

    while True:
        word = input("Query word: ")
        print(word)
        to_log2 = OrderedDict({'n_epoch': 0})
        evaluator.validate_words(to_log2, [word])
        print(list(to_log2["Word_dict"].keys()))
        for wordKey in to_log2["Word_dict"].keys():
            print(to_log2["Word_dict"][wordKey])
else:

    en_words = []
Example #22
assert params.dis_lambda > 0 and params.dis_steps > 0
assert 0 < params.lr_shrink <= 1
assert os.path.isfile(params.src_emb)
assert os.path.isfile(params.tgt_emb)
assert params.dico_eval == 'default' or params.dico_eval == 'vecmap' or os.path.isfile(
    params.dico_eval)
assert params.export in ["", "txt", "pth"]

# build model / trainer / evaluator
logger = initialize_exp(params)
src_emb, tgt_emb, mapping_G, mapping_F, discriminator_A, discriminator_B, encoder_A, decoder_A, encoder_B, decoder_B = build_model(
    params, True)
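# bidirectional setup in the spirit of CycleGAN: two mappings (G: src->tgt,
# F: tgt->src) plus a discriminator and an autoencoder per side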
trainer = Trainer(src_emb, tgt_emb, mapping_G, mapping_F, discriminator_A,
                  discriminator_B, encoder_A, decoder_A, encoder_B, decoder_B,
                  params)
evaluator = Evaluator(trainer)
"""
Learning loop for Adversarial Training
"""
if params.adversarial:

    # first train the autoencoder to become mature
    trainer.train_autoencoder_A()
    trainer.train_autoencoder_B()

    logger.info('----> ADVERSARIAL TRAINING <----\n\n')

    # adversarial training loop
    for n_epoch in range(params.n_epochs):

        logger.info('Starting adversarial training epoch %i...' % n_epoch)
Example #23
# check parameters
assert not params.cuda or torch.cuda.is_available()
assert 0 <= params.dis_dropout < 1
assert 0 <= params.dis_input_dropout < 1
assert 0 <= params.dis_smooth < 0.5
assert params.dis_lambda > 0 and params.dis_steps > 0
assert 0 < params.lr_shrink <= 1
assert os.path.isfile(params.src_emb)
assert os.path.isfile(params.tgt_emb)

# build model / trainer / evaluator
logger = initialize_exp(params)
src_emb, tgt_emb, mapping, discriminator = build_model(params, True)
trainer = Trainer(src_emb, tgt_emb, mapping, discriminator, params)
evaluator = Evaluator(trainer)
"""
Learning loop for Adversarial Training
"""
if params.adversarial:
    logger.info('----> ADVERSARIAL TRAINING <----\n\n')

    # training loop
    for n_epoch in range(params.n_epochs):

        logger.info('Starting adversarial training epoch %i...' % n_epoch)
        tic = time.time()
        n_words_proc = 0
        stats = {'DIS_COSTS': []}

        for n_iter in range(0, params.epoch_size, params.batch_size):
Example #24
# reload pre-trained embeddings
parser.add_argument("--src_emb", type=str, default="", help="Reload source embeddings")
parser.add_argument("--tgt_emb", type=str, default="", help="Reload target embeddings")
parser.add_argument("--max_vocab", type=int, default=200000, help="Maximum vocabulary size")
parser.add_argument("--emb_dim", type=int, default=300, help="Embedding dimension")
parser.add_argument("--normalize_embeddings", type=str, default="", help="Normalize embeddings before training")


# parse parameters
params = parser.parse_args()

# check parameters
assert params.src_lang, "source language undefined"
assert os.path.isfile(params.src_emb)
assert not params.tgt_lang or os.path.isfile(params.tgt_emb)

# build logger / model / trainer / evaluator
logger = initialize_exp(params)
src_emb, tgt_emb, mapping, _ = build_model(params, False)
trainer = Trainer(src_emb, tgt_emb, mapping, None, params)
evaluator = Evaluator(trainer)

# run evaluations
to_log = OrderedDict({'n_iter': 0})
evaluator.monolingual_wordsim(to_log)
if params.tgt_lang:
    evaluator.crosslingual_wordsim(to_log)
    evaluator.word_translation(to_log)
    evaluator.sent_translation(to_log)
    # evaluator.dist_mean_cosine(to_log)
Example #25
parser.add_argument("--model_name",type=str, default="", help="Model name")

# parse parameters
params = parser.parse_args()

# check parameters
assert params.src_lang, "source language undefined"
assert os.path.isfile(params.src_emb)
assert not params.tgt_lang or os.path.isfile(params.tgt_emb)
assert params.dico_eval == 'default' or os.path.isfile(params.dico_eval)

# build logger / model / trainer / evaluator
logger = initialize_exp(params)
src_emb, tgt_emb, mapping, _ = build_model(params, False)
trainer = Trainer(src_emb, tgt_emb, mapping, None, params)
evaluator = Evaluator(trainer)

# run evaluations
to_log = OrderedDict({'n_iter': 0})
log_export = {
    "name": params.model_name
}
evaluator.monolingual_wordsim(to_log, log_export)
evaluator.monolingual_wordanalogy(to_log, log_export)
if params.tgt_lang:
    # evaluator.crosslingual_wordsim(to_log)
    evaluator.word_translation(to_log)
    # evaluator.sent_translation(to_log)
    evaluator.dist_mean_cosine(to_log)

log_export['nn-1'] = to_log['precision_at_1-nn']
Example #26
def main():
    params = load_args()

    logger = create_logger(
        os.path.join(params.exp_path, "lnmap-experiment.log"))
    logger.info("{}".format(
        jsbeautifier.beautify(json.dumps(params.__dict__), opts)))
    set_seed(params)

    src_emb, tgt_emb, mapping_G, mapping_F, encoder_A, decoder_A, encoder_B, decoder_B = build_model(
        params)
    trainer = Trainer(src_emb, tgt_emb, mapping_G, mapping_F, encoder_A,
                      decoder_A, encoder_B, decoder_B, params)
    evaluator = Evaluator(trainer)

    trainer.load_training_dico(logger)
    trainer.load_training_dico(logger, src2tgt=False)
    logger.info("Seed dictionary size: {}".format(trainer.dico_AB.shape[0]))
    trainer.dico_AB_original = trainer.dico_AB.clone()
    trainer.dico_BA_original = trainer.dico_BA.clone()

    if params.load_autoenc_weights:
        load_autoenc_weights(params, trainer, logger)
    else:
        trainer.train_autoencoder_A(logger)
        trainer.train_autoencoder_B(logger)
        if params.save_autoenc_weights:
            save_autoenc_weights(params, trainer, logger)

    # Source to Target Training
    logger.info("\n \n Training for {} to {}".format(params.src_lang,
                                                     params.tgt_lang))
    for i in range(params.iteration):
        trainer.train_A2B()

        emb1 = (trainer.mapping_G(
            trainer.encoder_A(
                trainer.src_emb.weight.data)).data)[0:params.dico_max_rank]
        emb2 = (trainer.encoder_B(
            trainer.tgt_emb.weight.data).data)[0:params.dico_max_rank]
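        # L2-normalize each embedding row to unit length before inducing the dictionary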
        emb1 = emb1 / emb1.norm(2, 1, keepdim=True).expand_as(emb1)
        emb2 = emb2 / emb2.norm(2, 1, keepdim=True).expand_as(emb2)

        all_pairs, all_scores = generate_new_dictionary_bidirectional(
            emb1, emb2)

        add_size = params.induced_dico_c * (i + 1)
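        # the slice below keeps the top-scoring induced pairs and appends them
        # to the original seed dictionary, growing it each iteration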
        trainer.dico_AB = torch.cat(
            (trainer.dico_AB_original, all_pairs[:add_size].cuda()), 0)
        if i == 0:
            logger.info(
                "After first iteration train dictionary size: {}".format(
                    trainer.dico_AB.shape[0]))

    logger.info("Final iteration train dictionary size: {}".format(
        trainer.dico_AB.shape[0]))
    trainer.set_eval()
    precision_at_1 = get_word_translation_accuracy(
        params,
        trainer.mapping_G(trainer.encoder_A(
            trainer.src_emb.weight.data).data).data,
        trainer.encoder_B(trainer.tgt_emb.weight.data).data,
        src2tgt=True)

    if params.save_model_weights:
        save_model_weights(params, trainer, src2tgt=True)

    # Target to Source Training
    logger.info("\n \n Training for {} to {}".format(params.tgt_lang,
                                                     params.src_lang))
    n_iter = 0
    for i in range(params.iteration):
        trainer.train_B2A()

        emb1 = ((trainer.encoder_A(
            trainer.src_emb.weight.data)).data)[0:params.dico_max_rank]
        emb2 = (trainer.mapping_F(
            trainer.encoder_B(
                trainer.tgt_emb.weight.data)).data)[0:params.dico_max_rank]
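        # L2-normalize rows to unit length, mirroring the source-to-target direction above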
        emb1 = emb1 / emb1.norm(2, 1, keepdim=True).expand_as(emb1)
        emb2 = emb2 / emb2.norm(2, 1, keepdim=True).expand_as(emb2)

        all_pairs, all_scores = generate_new_dictionary_bidirectional(
            emb2, emb1)

        add_size = params.induced_dico_c * (i + 1)
        trainer.dico_BA = torch.cat(
            (trainer.dico_BA_original, all_pairs[:add_size].cuda()), 0)
        if i == 0:
            logger.info(
                "After first iteration train dictionary size: {}".format(
                    trainer.dico_BA.shape[0]))

    logger.info("Final iteration train dictionary size: {}".format(
        trainer.dico_BA.shape[0]))

    trainer.set_eval()
    precision_at_1 = get_word_translation_accuracy(
        params,
        trainer.mapping_F(trainer.encoder_B(
            trainer.tgt_emb.weight.data).data).data,
        trainer.encoder_A(trainer.src_emb.weight.data).data,
        src2tgt=False)

    if params.save_model_weights:
        save_model_weights(params, trainer, src2tgt=False)
Example #27
                    type=bool_flag,
                    default=False,
                    help="print predictions")

# parse parameters
params = parser.parse_args()

# check parameters
assert params.src_lang, "source language undefined"
assert os.path.isfile(params.src_emb)
assert not params.tgt_lang or os.path.isfile(params.tgt_emb)
assert params.dico_eval == 'default' or os.path.isfile(params.dico_eval)

# build logger / model / trainer / evaluator
logger = initialize_exp(params)
src_emb, tgt_emb, mapping, _ = build_model(params, False)
trainer = Trainer(src_emb, tgt_emb, mapping, None, params)
evaluator = Evaluator(trainer)

# run evaluations
to_log = OrderedDict({'n_iter': 0})
evaluator.monolingual_wordsim(to_log)
# evaluator.monolingual_wordanalogy(to_log)
if params.tgt_lang:
    evaluator.crosslingual_wordsim(to_log)
    evaluator.word_translation(to_log,
                               print_trans=params.print,
                               result_path=trainer.params.exp_path)
    evaluator.sent_translation(to_log)
    # evaluator.dist_mean_cosine(to_log)
Example #28
assert params.dico_train in [
    "identical_char", "default", "identical_num", "MAT"
] or os.path.isfile(params.dico_train)
assert params.dico_build in ["S2T", "T2S", "S2T|T2S", "S2T&T2S"]
assert params.dico_max_size == 0 or params.dico_max_size < params.dico_max_rank
assert params.dico_max_size == 0 or params.dico_max_size > params.dico_min_size
assert all(os.path.isfile(emb) for emb in params.embs)
assert params.dico_eval == 'default' or os.path.isfile(params.dico_eval)
assert params.export in ["", "txt", "pth"]
assert len(params.langs) == len(params.embs)

# build logger / model / trainer / evaluator
logger = initialize_exp(params)
embs, mapping = build_model(params)
trainer = Trainer(embs, mapping, params)
evaluator = Evaluator(trainer)
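# one result slot per ordered (src, tgt) language pair; permutations(..., 2) is order-sensitive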
final_results = {
    '{}-{}'.format(src, tgt): 0
    for src, tgt in permutations(params.langs, 2)
}

# load a training dictionary. if a dictionary path is not provided, use a default
# one ("default") or create one based on identical character strings ("identical_char")
trainer.load_multi_pairwise_training_dico(params.dico_train,
                                          params.dicts_train_path)
"""
Learning loop for Multilingual Pairwise Procrustes Analysis (MPPA)
"""
for n_iter in range(params.n_refinement):

    logger.info('Starting iteration %i...' % n_iter)
Example #29
assert not params.cuda or torch.cuda.is_available()
assert 0 <= params.dis_dropout < 1
assert 0 <= params.dis_input_dropout < 1
assert 0 <= params.dis_smooth < 0.5
assert params.dis_lambda > 0 and params.dis_steps > 0
assert 0 < params.lr_shrink <= 1
assert os.path.isfile(params.src_emb)
assert os.path.isfile(params.tgt_emb)
assert params.dico_eval == 'default' or os.path.isfile(params.dico_eval)
assert params.export in ["", "txt", "pth"]

# build model / trainer / evaluator
logger = initialize_exp(params)
src_emb, tgt_emb, mapping, discriminator = build_model(params, True)
trainer = Trainer(src_emb, tgt_emb, mapping, discriminator, params)
evaluator = Evaluator(trainer)

compute_candidates_for_method_similarity(params.src_dico.word2id,
                                         params.tgt_dico.word2id, params,
                                         "dict/candidates_dict.txt")
logger.info("Num of synthetic class-method dictionary : " +
            str(len(params.src_candidate_indices)))
"""
Learning loop for Adversarial Training
"""
if params.adversarial:
    logger.info('----> ADVERSARIAL TRAINING <----\n\n')

    # training loop
    for n_epoch in range(params.n_epochs):
Example #30
# check parameters
assert not params.cuda or torch.cuda.is_available()
assert 0 <= params.dis_dropout < 1
assert 0 <= params.dis_input_dropout < 1
assert 0 <= params.dis_smooth < 0.5
assert params.dis_lambda > 0 and params.dis_steps > 0
assert 0 < params.lr_shrink <= 1
assert os.path.isfile(params.src_emb)
assert os.path.isfile(params.tgt_emb)
assert params.export in ["", "txt", "pth"]

# build model / trainer / evaluator
logger = initialize_exp(params)
src_emb, tgt_emb, mapping, discriminator = build_model(params, True)
trainer = Trainer(src_emb, tgt_emb, mapping, discriminator, params)
evaluator = Evaluator(trainer)


"""
Learning loop for Adversarial Training
"""
if params.adversarial:
    logger.info('----> ADVERSARIAL TRAINING <----\n\n')

    # training loop
    for n_epoch in range(params.n_epochs):

        logger.info('Starting adversarial training epoch %i...' % n_epoch)
        tic = time.time()
        n_words_proc = 0
        stats = {'DIS_COSTS': []}
Example #31
parser.add_argument("--use_earlystop", action="store_true", help="Early stop")

# parse parameters
params = parser.parse_args()

# check parameters
assert params.src_lang, "source language undefined"
assert os.path.isfile(params.src_emb)
assert not params.tgt_lang or os.path.isfile(params.tgt_emb)
assert params.dico_eval == 'default' or os.path.isfile(params.dico_eval)

# build logger / model / trainer / evaluator
logger = initialize_exp(params)
src_emb, tgt_emb, mapping, _ = build_model(params, False)
trainer = Trainer(src_emb, tgt_emb, mapping, None, params)
evaluator = Evaluator(trainer)

# run evaluations
to_log = OrderedDict({'n_iter': 0})
evaluator.monolingual_wordsim(to_log)
# evaluator.monolingual_wordanalogy(to_log)
if params.tgt_lang:
    evaluator.crosslingual_wordsim(to_log)
    examples = evaluator.word_translation(to_log,
                                          use_csls=False,
                                          output_results=True)
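    # examples maps each method to per-k example lists (k = 1, 5, 10),
    # written out below as one TSV file per (method, k)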
    for method, lists in examples.items():
        for k, values in zip([1, 5, 10], lists):
            with open(
                    os.path.join(params.exp_path,
                                 '{}_at_{}.tsv'.format(method, k)), 'w') as f:
Example #32
                    help="Embedding dimension")
parser.add_argument("--normalize_embeddings",
                    type=str,
                    default="",
                    help="Normalize embeddings before training")

# parse parameters
params = parser.parse_args()

# check parameters
assert params.src_lang, "source language undefined"
assert os.path.isfile(params.src_emb)
assert not params.tgt_lang or os.path.isfile(params.tgt_emb)
assert params.dico_eval == 'default' or os.path.isfile(params.dico_eval)

# build logger / model / trainer / evaluator
logger = initialize_exp(params)
src_emb, tgt_emb, mapping, _ = build_model(params, False)
trainer = Trainer(src_emb, tgt_emb, mapping, None, params)
evaluator = Evaluator(trainer)

# run evaluations
to_log = OrderedDict({'n_iter': 0})
evaluator.monolingual_wordsim(to_log)
# evaluator.monolingual_wordanalogy(to_log)
if params.tgt_lang:
    evaluator.crosslingual_wordsim(to_log)
    evaluator.word_translation(to_log)
    evaluator.sent_translation(to_log)
    # evaluator.dist_mean_cosine(to_log)
Example #33
assert not params.cuda or torch.cuda.is_available()
assert params.dico_train in ["identical_char", "default"] or os.path.isfile(
    params.dico_train)
assert params.dico_build in ["S2T", "T2S", "S2T|T2S", "S2T&T2S"]
assert params.dico_max_size == 0 or params.dico_max_size < params.dico_max_rank
assert params.dico_max_size == 0 or params.dico_max_size > params.dico_min_size
assert os.path.isfile(params.src_emb)
assert os.path.isfile(params.tgt_emb)
assert params.dico_eval == 'default' or os.path.isfile(params.dico_eval)
assert params.export in ["", "txt", "pth"]

# build logger / model / trainer / evaluator
logger = initialize_exp(params)
src_emb, tgt_emb, mapping, _ = build_model(params, False)
trainer = Trainer(src_emb, tgt_emb, mapping, None, params)
evaluator = Evaluator(trainer)

# load a training dictionary. if a dictionary path is not provided, use a default
# one ("default") or create one based on identical character strings ("identical_char")
trainer.load_training_dico(params.dico_train)

# define the validation metric
VALIDATION_METRIC = VALIDATION_METRIC_UNSUP if params.dico_train == 'identical_char' else VALIDATION_METRIC_SUP
logger.info("Validation metric: %s" % VALIDATION_METRIC)
"""
Learning loop for Procrustes Iterative Learning
"""
for n_iter in range(params.n_refinement + 1):

    logger.info('Starting iteration %i...' % n_iter)