Example #1
import getopt
import sys

from team2vec import Team2Vec  # assumed import path for the Team2Vec class


def main_train_team2vec():
    import dal.load_dblp_data as dblp
    if dblp.preprocessed_dataset_exist(
            file_path='./dataset/dblp_preprocessed_dataset.pkl'):
        team_matrix = dblp.load_preprocessed_dataset()
    else:
        dblp.extract_data(filter_journals=True)
        team_matrix = dblp.load_preprocessed_dataset()

    t2v = Team2Vec()

    help_str = ('team2vec.py [-m] [-s] [-d <dimension=100>] '
                '[-e <epochs=100>] [-w <window=2>]\n'
                '-m: distributed memory mode; default=distributed bag of members\n'
                '-s: member type = skill; default = user')
    try:
        opts, args = getopt.getopt(sys.argv[1:], "hmsd:e:w:",
                                   ["dimension=", "epochs=", "window="])
    except getopt.GetoptError:
        print(help_str)
        sys.exit(2)
    # Defaults, matching the values advertised in help_str.
    dimension = 100
    epochs = 100
    window = 2
    dm = 0
    member_type = 'user'
    for opt, arg in opts:
        if opt == '-h':
            print(help_str)
            sys.exit()
        elif opt == '-s':
            member_type = 'skill'
        elif opt == '-m':
            dm = 1
        elif opt in ("-d", "--dimension"):
            dimension = int(arg)
        elif opt in ("-e", "--epochs"):
            epochs = int(arg)
        elif opt in ("-w", "--window"):
            window = int(arg)

    t2v.init(team_matrix, member_type=member_type)
    t2v.train(dimension=dimension,
              window=window,
              dist_mode=dm,
              output='./output/Models/T2V/',
              epochs=epochs)
    # Visualize the learned embeddings with PCA and t-SNE projections.
    t2v.plot_model('pca', output='./output/Figures/')
    t2v.plot_model('tsne', output='./output/Figures/')
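A hedged usage sketch for the trainer above (the script name team2vec.py comes from its own help string; the flag values are illustrative):

import sys

# Equivalent to running: python team2vec.py -m -d 64 -e 50 -w 3
sys.argv = ['team2vec.py', '-m', '-d', '64', '-e', '50', '-w', '3']
main_train_team2vec()  # distributed-memory mode, 64-dim vectors, 50 epochs, window 3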
Example #2
from keras import backend as K  # Keras 2.x with the TensorFlow 1.x backend

import dal.load_dblp_data as dblp
import eval.evaluator as dblp_eval  # assumed import path for the dblp_eval helpers

# Hyperparameters
epochs_overall = 10
back_propagation_batch_size = 64
training_batch_size = 6000
min_skill_size = 0
min_member_size = 0
latent_dim = 50

# List the GPUs visible to the Keras/TensorFlow backend.
print(K.tensorflow_backend._get_available_gpus())

# Load the preprocessed dataset and the k-fold train/test indices,
# building them from the raw data first if they do not exist yet.
if dblp.preprocessed_dataset_exist() and dblp.train_test_indices_exist():
    dataset = dblp.load_preprocessed_dataset()
    train_test_indices = dblp.load_train_test_indices()
else:
    if not dblp.ae_data_exist(file_path='../dataset/ae_dataset.pkl'):
        dblp.extract_data(filter_journals=True,
                          skill_size_filter=min_skill_size,
                          member_size_filter=min_member_size)
    if not dblp.preprocessed_dataset_exist(
    ) or not dblp.train_test_indices_exist():
        dblp.dataset_preprocessing(
            dblp.load_ae_dataset(file_path='../dataset/ae_dataset.pkl'),
            seed=seed,
            kfolds=k_fold)
    dataset = dblp.load_preprocessed_dataset()
    train_test_indices = dblp.load_train_test_indices()

# k-fold cross-validation
cvscores = []

# Defining evaluation scores holders for train data
r_at_k_all_train = dblp_eval.init_eval_holder(
Example #3

import tensorflow as tf

import dal.load_dblp_data as dblp

back_propagation_batch_size = 32
training_batch_size = 6000
min_skill_size = 0
min_member_size = 0
latent_dim = 2
beta = 30

# Check whether a GPU is available (TF 1.x API).
print(tf.test.is_gpu_available())
m2v_path = '../dataset/embedding_dict.pkl'


if dblp.ae_data_exist(file_path='../dataset/ae_e_m2v_tSkill_dataset.pkl'):
    dataset = dblp.load_ae_dataset(
        file_path='../dataset/ae_e_m2v_tSkill_dataset.pkl')
else:
    if not dblp.ae_data_exist(file_path='../dataset/ae_dataset.pkl'):
        dblp.extract_data(filter_journals=True,
                          skill_size_filter=min_skill_size,
                          member_size_filter=min_member_size,
                          output_dir='../dataset/ae_dataset.pkl')
    if not dblp.preprocessed_dataset_exist(
            file_path='../dataset/dblp_preprocessed_dataset.pkl'
    ) or not dblp.train_test_indices_exist(
            file_path='../dataset/Train_Test_indices.pkl'):
        dblp.dataset_preprocessing(
            dblp.load_ae_dataset(file_path='../dataset/ae_dataset.pkl'),
            indices_dict_file_path='../dataset/Train_Test_indices.pkl',
            preprocessed_dataset_file_path='../dataset/dblp_preprocessed_dataset.pkl',
            seed=seed,
            kfolds=k_fold)
    preprocessed_dataset = dblp.load_preprocessed_dataset(
        file_path='../dataset/dblp_preprocessed_dataset.pkl')

    # Embed each team's skills with the m2v embedding model and cache the result.
    dblp.nn_m2v_embedding_dataset_generator(
        model_path=m2v_path,
        dataset=preprocessed_dataset,
        output_file_path='../dataset/ae_e_m2v_tSkill_dataset.pkl',
        mode='skill',
        max_length=22)
    del preprocessed_dataset
    dataset = dblp.load_ae_dataset(
        file_path='../dataset/ae_e_m2v_tSkill_dataset.pkl')



# reparameterization trick
# instead of sampling from Q(z|X), sample epsilon = N(0,I)
# z = z_mean + sqrt(var) * epsilon
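# Since z_log_var = log(var), we have sqrt(var) = exp(0.5 * z_log_var), so a
# typical Keras backend implementation of the line above is (hedged sketch,
# not necessarily this file's exact code):
#     epsilon = K.random_normal(shape=K.shape(z_mean))
#     z = z_mean + K.exp(0.5 * z_log_var) * epsilon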
def sampling(args):
    """Reparameterization trick by sampling from an isotropic unit Gaussian.