Example #1
if not os.path.exists(args.save_path):
    os.makedirs(args.save_path)

if args.mode == 'eval':
    ckpt = args.load_from
else:
    ckpt = os.path.join(
        args.save_path,
        'etm_{}_K_{}_Htheta_{}_Optim_{}_Clip_{}_ThetaAct_{}_Lr_{}_Bsz_{}_RhoSize_{}_trainEmbeddings_{}'
        .format(args.dataset, args.num_topics, args.t_hidden_size,
                args.optimizer, args.clip, args.theta_act, args.lr,
                args.batch_size, args.rho_size, args.train_embeddings))

## define model and optimizer
model = ETM(args.num_topics, vocab_size, args.t_hidden_size, args.rho_size,
            args.emb_size, args.theta_act, embeddings, args.train_embeddings,
            args.enc_drop).to(device)

print('model: {}'.format(model))

if args.optimizer == 'adam':
    optimizer = optim.Adam(model.parameters(),
                           lr=args.lr,
                           weight_decay=args.wdecay)
elif args.optimizer == 'adagrad':
    optimizer = optim.Adagrad(model.parameters(),
                              lr=args.lr,
                              weight_decay=args.wdecay)
elif args.optimizer == 'adadelta':
    optimizer = optim.Adadelta(model.parameters(),
                               lr=args.lr,
                               weight_decay=args.wdecay)
Example #2
    if args.mode == 'eval':
        ckpt = args.load_from
    else:
        ckpt = os.path.join(
            args.save_path,
            'Dec17_etm_{}_K_{}_Htheta_{}_Optim_{}_Clip_{}_ThetaAct_{}_Lr_{}_Bsz_{}_RhoSize_{}_trainEmbeddings_{}'
            .format(args.dataset, args.num_topics, args.t_hidden_size,
                    args.optimizer, args.clip, args.theta_act, args.lr,
                    args.batch_size, args.rho_size, args.train_embeddings))

    for num_topics in [10, 15, 20, 25, 30, 35, 40, 45, 50]:
        args.num_topics = num_topics
        ## define model and optimizer
        model = ETM(args.num_topics, vocab_size, args.t_hidden_size,
                    args.rho_size, args.emb_size, args.theta_act, embeddings,
                    args.train_embeddings, args.enc_drop).to(device)
        # print('model: {}'.format(model))
        if args.optimizer == 'adam':
            optimizer = optim.Adam(model.parameters(),
                                   lr=args.lr,
                                   weight_decay=args.wdecay)

        def train(epoch):
            model.train()
            acc_loss = 0
            acc_kl_theta_loss = 0
            cnt = 0
            indices = torch.randperm(args.num_docs_train)
            indices = torch.split(indices, args.batch_size)
            for idx, ind in enumerate(indices):
Example #3

## -------------------------------------
## Finally training
## -------------------------------------

print("## -------------------------------------")
print("##\t TRAINING THE MODEL ")
print("## -------------------------------------")

# define model
etm_model = ETM(num_topics=num_topics,
                vocab_size=vocab_size,
                t_hidden_size=t_hidden_size,
                rho_size=rho_size,
                emsize=emb_size,
                theta_act=theta_act,
                embeddings=embedding,
                train_embeddings=train_embeddings,
                enc_drop=enc_drop).to(device)

print('model: {}'.format(etm_model))

optimizer = get_optimizer(name=_optimizer, model=etm_model)

# Initialising the data structures
best_epoch = 0
best_val_ppl = 1e9
all_val_ppls = []

# Let's get a sense of how bad the model is before training
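The `get_optimizer` helper called above is not shown in this excerpt. A minimal sketch of what it might look like, assuming it mirrors the optimizer branching of Examples #1 and #8 (the parameter names and default values below are assumptions, not values from the project):

from torch import optim

def get_optimizer(name, model, lr=0.005, wdecay=1.2e-6):
    # Hypothetical helper: picks a torch optimizer by name.
    # Defaults are placeholders, not taken from the original code.
    if name == 'adam':
        return optim.Adam(model.parameters(), lr=lr, weight_decay=wdecay)
    if name == 'adagrad':
        return optim.Adagrad(model.parameters(), lr=lr, weight_decay=wdecay)
    if name == 'adadelta':
        return optim.Adadelta(model.parameters(), lr=lr, weight_decay=wdecay)
    print('Unknown optimizer, defaulting to SGD')
    return optim.SGD(model.parameters(), lr=lr)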
Example #4
    dataset = dataset.map(parse,
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset = dataset.padded_batch(batch_size, (None, ))
    return dataset


if __name__ == '__main__':
    vocab = [x.strip() for x in open(args.vocab_path, 'r').readlines()]
    vocab_dic = {x: i for i, x in enumerate(vocab)}

    # build model
    etm = ETM(num_topics=args.num_topics,
              rho_size=args.rho_size,
              theta_act=args.theta_act,
              train_embeddings=1,
              embeddings=None,
              topic_embeddings=None,
              enc_drop=0,
              vocab_size=len(vocab),
              t_hidden_size=args.t_hidden_size)
    input_layer = tf.keras.layers.Input(batch_shape=(None, None),
                                        dtype=tf.int32)
    model = tf.keras.Model(input_layer, etm(input_layer))
    model.load_weights(args.weight_path)
    print(model.summary())

    # loading data
    corpus = open(args.corpus, 'r').readlines()
    data = [[
        vocab_dic[word] for word in x.strip().split() if word in vocab_dic
    ] for x in corpus]
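To run the loaded Keras model over `data`, one option is to reuse the same padded batching as the `parse`/`padded_batch` pipeline above. A minimal sketch, assuming `from_generator` and a batch size of 32 (both are assumptions):

import tensorflow as tf

batch_size = 32  # assumed value
dataset = tf.data.Dataset.from_generator(
    lambda: data,
    output_types=tf.int32,
    output_shapes=tf.TensorShape([None]))
dataset = dataset.padded_batch(batch_size, (None, ))
for batch in dataset:
    outputs = model(batch)  # forward pass through the loaded ETM model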
Example #5
            os.path.join(config_dict['saving_models_path'][machine],
                         'etm_{}_K_{}_Htheta_{}_Optim_{}_Clip_{}_ThetaAct_{}_Lr_{}_Bsz_{}_RhoSize_{}_'
                         'trainEmbeddings_{}'.format(config_dict['dataset'], config_dict['model_params']['num_topics'],
                                                     config_dict['model_params']['t_hidden_size'],
                                                     config_dict['optimization_params']['optimizer'],
                                                     config_dict['optimization_params']['clip'],
                                                     config_dict['model_params']['theta_act'],
                                                     config_dict['optimization_params']['lr'],
                                                     config_dict['batch_size'],
                                                     config_dict['model_params']['rho_size'],
                                                     config_dict['model_params']['train_embeddings']))

    if config_dict['optimization_params']['mode'] == 'train':
        # define model and optimizer
        etm_model = ETM(config_dict=config_dict,
                        machine=machine,
                        embeddings=embeddings)
        print('model: {}'.format(etm_model))
        optimizer = _set_optimizer()
        etm_model.fit(optimizer=optimizer,
                      train_tokens=train_tokens,
                      train_counts=train_counts,
                      test_1_tokens=test_1_tokens,
                      test_1_counts=test_1_counts,
                      test_2_tokens=test_2_tokens,
                      test_2_counts=test_2_counts,
                      vocab=vocab,
                      ckpt=ckpt)
        print('Visualizing model quality after training...')
        with open(ckpt, 'rb') as f:
            etm_model = torch.load(f)
Example #6
            tmp_emb = np.zeros(args.rho_size)
            for word in topic_words:
                tmp_emb += vectors[word]
            topic_embeddings[i] = tmp_emb / len(topic_words)
        topic_embeddings = np.float32(topic_embeddings)

    else:
        embeddings = None
        topic_embeddings = None

    # build model
    etm = ETM(num_topics=args.num_topics,
              rho_size=args.rho_size,
              theta_act=args.theta_act,
              train_embeddings=args.train_embeddings,
              embeddings=embeddings,
              topic_embeddings=topic_embeddings,
              enc_drop=args.enc_drop,
              vocab_size=du.vocab_size,
              t_hidden_size=args.t_hidden_size)
    input_layer = tf.keras.layers.Input(batch_shape=(None, None),
                                        dtype=tf.int32)
    model = tf.keras.Model(input_layer, etm(input_layer))
    print(model.summary())

    # loading data
    data = du.load_dataset(args.data_path, args.batch_size)

    # start training
    if not os.path.exists(args.save_path):
        os.makedirs(args.save_path)
Example #7
    os.makedirs(args.save_path)

if args.mode == 'eval':
    ckpt = args.load_from
else:
    ckpt = Path.cwd().joinpath(
        args.save_path,
        'etm_{}_K_{}_Htheta_{}_Optim_{}_Clip_{}_ThetaAct_{}_Lr_{}_Bsz_{}_RhoSize_{}_trainEmbeddings_{}'
        .format(args.dataset, args.num_topics, args.t_hidden_size, args.optimizer,
                args.clip, args.theta_act, args.lr, args.batch_size,
                args.rho_size, args.train_embeddings))

## define model and optimizer
model = ETM(args.num_topics, 
            vocab_size, 
            args.t_hidden_size, 
            args.rho_size, 
            args.emb_size, 
            args.theta_act, 
            embeddings, 
            args.train_embeddings, 
            args.enc_drop).to(device)

print('model: {}'.format(model))

optimizer = model.get_optimizer(args)


tracemalloc.start()
if args.mode == 'train':
    ## train model on data 
    best_epoch = 0
    best_val_ppl = 1e9
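The excerpt stops before the training loop itself. A minimal sketch of the epoch loop that typically follows, assuming a `train(epoch)` helper like the one started in Example #2 and an `evaluate(model, source)` function returning validation perplexity (both helpers and the checkpointing logic are assumptions, not shown in the excerpt):

for epoch in range(1, args.epochs + 1):
    train(epoch)
    val_ppl = evaluate(model, 'val')  # assumed helper returning perplexity
    if val_ppl < best_val_ppl:
        best_val_ppl = val_ppl
        best_epoch = epoch
        with open(ckpt, 'wb') as f:
            torch.save(model, f)  # keep the best model seen so far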
Example #8
File: main.py Project: abcp4/ETM
if not os.path.exists(args.save_path):
    os.makedirs(args.save_path)

if args.mode == 'eval':
    ckpt = args.load_from
else:
    ckpt = os.path.join(
        args.save_path,
        'etm_{}_K_{}_Htheta_{}_Optim_{}_Clip_{}_ThetaAct_{}_Lr_{}_Bsz_{}_RhoSize_{}_trainEmbeddings_{}'
        .format(args.dataset, args.num_topics, args.t_hidden_size,
                args.optimizer, args.clip, args.theta_act, args.lr,
                args.batch_size, args.rho_size, args.train_embeddings))

## define model and optimizer
model = ETM(args.num_topics, vocab_size, args.t_hidden_size, args.rho_size,
            args.emb_size, args.theta_act, embeddings, args.train_embeddings,
            args.enc_drop).to(device)

print('model: {}'.format(model))

if args.optimizer == 'adam':
    optimizer = optim.Adam(model.parameters(),
                           lr=args.lr,
                           weight_decay=args.wdecay)
elif args.optimizer == 'adagrad':
    optimizer = optim.Adagrad(model.parameters(),
                              lr=args.lr,
                              weight_decay=args.wdecay)
elif args.optimizer == 'adadelta':
    optimizer = optim.Adadelta(model.parameters(),
                               lr=args.lr,
                               weight_decay=args.wdecay)
Example #9
        ckpt = \
            os.path.join(config_dict['saving_models_path'][machine],
                         'etm_{}_K_{}_Htheta_{}_Optim_{}_Clip_{}_ThetaAct_{}_Lr_{}_Bsz_{}_RhoSize_{}_'
                         'trainEmbeddings_{}'.format(config_dict['dataset'], config_dict['model_params']['num_topics'],
                                                     config_dict['model_params']['t_hidden_size'],
                                                     config_dict['optimization_params']['optimizer'],
                                                     config_dict['optimization_params']['clip'],
                                                     config_dict['model_params']['theta_act'],
                                                     config_dict['optimization_params']['lr'],
                                                     config_dict['batch_size'],
                                                     config_dict['model_params']['rho_size'],
                                                     config_dict['model_params']['train_embeddings']))

    # define model and optimizer
    etm_model = ETM(config_dict=config_dict,
                    machine=machine,
                    embeddings=embeddings)
    print('model: {}'.format(etm_model))
    optimizer = _set_optimizer()

    if config_dict['optimization_params']['mode'] == 'train':
        etm_model.fit(optimizer=optimizer,
                      train_tokens=train_tokens,
                      train_counts=train_counts,
                      test_1_tokens=test_1_tokens,
                      test_1_counts=test_1_counts,
                      test_2_tokens=test_2_tokens,
                      test_2_counts=test_2_counts,
                      vocab=vocab,
                      ckpt=ckpt)