def main_train_team2vec():
    import sys
    import getopt
    import dal.load_dblp_data as dblp

    # Team2Vec is assumed to be defined/imported elsewhere in this module.
    if dblp.preprocessed_dataset_exist(file_path='./dataset/dblp_preprocessed_dataset.pkl'):
        team_matrix = dblp.load_preprocessed_dataset()
    else:
        dblp.extract_data(filter_journals=True)
        team_matrix = dblp.load_preprocessed_dataset()
    t2v = Team2Vec()

    help_str = ('team2vec.py [-m] [-s] [-d <dimension=100>] [-e <epochs=100>] [-w <window=2>]\n'
                '-m: distributed memory mode; default=distributed bag of members\n'
                '-s: member type = skill; default = user')
    try:
        opts, args = getopt.getopt(sys.argv[1:], "hmsd:e:w:",
                                   ["dimension=", "epochs=", "window="])
    except getopt.GetoptError:
        print(help_str)
        sys.exit(2)

    # defaults
    dimension = 100
    epochs = 100
    window = 2
    dm = 0  # 0 = distributed bag of members, 1 = distributed memory
    member_type = 'user'
    for opt, arg in opts:
        if opt == '-h':
            print(help_str)
            sys.exit()
        elif opt == '-s':
            member_type = 'skill'
        elif opt == '-m':
            dm = 1
        elif opt in ("-d", "--dimension"):
            dimension = int(arg)
        elif opt in ("-e", "--epochs"):
            epochs = int(arg)
        elif opt in ("-w", "--window"):
            window = int(arg)

    t2v.init(team_matrix, member_type=member_type)
    t2v.train(dimension=dimension, window=window, dist_mode=dm,
              output='./output/Models/T2V/', epochs=epochs)
    t2v.plot_model('pca', output='./output/Figures/')
    t2v.plot_model('tsne', output='./output/Figures/')
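# Example invocations (a sketch; the script name team2vec.py comes from the
# help string above, and the flag semantics follow the option loop):
#
#   python team2vec.py                      # defaults: distributed bag of members
#                                           # over users, dimension=100,
#                                           # epochs=100, window=2
#   python team2vec.py -m -s -d 300 -e 200  # distributed-memory mode over skills,
#                                           # 300-dim vectors, 200 epochs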
import numpy as np
import dal.load_dblp_data as dblp
from keras import backend as K

# evaluation settings
k_fold = 10
k_max = 100
evaluation_k_set = np.arange(1, k_max + 1, 1)

# nn settings
epochs_in_batch = 25
epochs_overall = 10
back_propagation_batch_size = 64
training_batch_size = 6000
min_skill_size = 0
min_member_size = 0
latent_dim = 50

# lists visible GPUs (Keras 2.x with the TensorFlow backend)
print(K.tensorflow_backend._get_available_gpus())

if dblp.preprocessed_dataset_exist() and dblp.train_test_indices_exist():
    dataset = dblp.load_preprocessed_dataset()
    train_test_indices = dblp.load_train_test_indices()
else:
    if not dblp.ae_data_exist(file_path='../dataset/ae_dataset.pkl'):
        dblp.extract_data(filter_journals=True,
                          skill_size_filter=min_skill_size,
                          member_size_filter=min_member_size)
    if not dblp.preprocessed_dataset_exist() or not dblp.train_test_indices_exist():
        dblp.dataset_preprocessing(
            dblp.load_ae_dataset(file_path='../dataset/ae_dataset.pkl'),
            seed=seed,  # `seed` is assumed to be defined earlier in this script
            kfolds=k_fold)
    dataset = dblp.load_preprocessed_dataset()
    train_test_indices = dblp.load_train_test_indices()
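# A minimal sketch of how the k-fold split loaded above might be consumed.
# The 1-based fold numbering and the 'Train'/'Test' key names are assumptions
# about the pickled indices dict, not guaranteed by this script.
for fold in range(1, k_fold + 1):
    train_idx = train_test_indices[fold]['Train']  # assumed layout
    test_idx = train_test_indices[fold]['Test']    # assumed layout
    fold_train = [dataset[i] for i in train_idx]
    fold_test = [dataset[i] for i in test_idx]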
import tensorflow as tf
import dal.load_dblp_data as dblp
from keras import backend as K

# vae settings
training_batch_size = 6000
min_skill_size = 0
min_member_size = 0
latent_dim = 2
beta = 30

print(tf.test.is_gpu_available())

m2v_path = '../dataset/embedding_dict.pkl'
if dblp.ae_data_exist(file_path='../dataset/ae_e_m2v_tSkill_dataset.pkl'):
    dataset = dblp.load_ae_dataset(file_path='../dataset/ae_e_m2v_tSkill_dataset.pkl')
else:
    if not dblp.ae_data_exist(file_path='../dataset/ae_dataset.pkl'):
        dblp.extract_data(filter_journals=True,
                          skill_size_filter=min_skill_size,
                          member_size_filter=min_member_size,
                          output_dir='../dataset/ae_dataset.pkl')
    if (not dblp.preprocessed_dataset_exist(file_path='../dataset/dblp_preprocessed_dataset.pkl')
            or not dblp.train_test_indices_exist(file_path='../dataset/Train_Test_indices.pkl')):
        dblp.dataset_preprocessing(
            dblp.load_ae_dataset(file_path='../dataset/ae_dataset.pkl'),
            indices_dict_file_path='../dataset/Train_Test_indices.pkl',
            preprocessed_dataset_file_path='../dataset/dblp_preprocessed_dataset.pkl',
            seed=seed,  # `seed` and `k_fold` are assumed to be defined earlier in this script
            kfolds=k_fold)
    preprocessed_dataset = dblp.load_preprocessed_dataset(
        file_path='../dataset/dblp_preprocessed_dataset.pkl')
    dblp.nn_m2v_embedding_dataset_generator(
        model_path=m2v_path,
        dataset=preprocessed_dataset,
        output_file_path='../dataset/ae_e_m2v_tSkill_dataset.pkl',
        mode='skill',
        max_length=22)
    del preprocessed_dataset
    dataset = dblp.load_ae_dataset(file_path='../dataset/ae_e_m2v_tSkill_dataset.pkl')


# reparameterization trick
# instead of sampling from Q(z|X), sample epsilon = N(0, I)
# z = z_mean + sqrt(var) * epsilon
def sampling(args):
    """Reparameterization trick by sampling from an isotropic unit Gaussian.

    # Arguments
        args (tensor): mean and log of variance of Q(z|X)

    # Returns
        z (tensor): sampled latent vector
    """
    # body completed from the comments above: z = z_mean + sqrt(var) * epsilon
    z_mean, z_log_var = args
    batch = K.shape(z_mean)[0]
    dim = K.int_shape(z_mean)[1]
    # by default, random_normal has mean = 0 and std = 1.0
    epsilon = K.random_normal(shape=(batch, dim))
    return z_mean + K.exp(0.5 * z_log_var) * epsilon
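# A minimal sketch of how `sampling` is typically wired into a Keras VAE
# encoder via a Lambda layer, so the reparameterization runs inside the
# graph and gradients flow through z_mean/z_log_var. The layer sizes and
# `input_dim` are illustrative assumptions, not taken from this script.
from keras.layers import Input, Dense, Lambda
from keras.models import Model

input_dim = 100  # assumed size of the m2v embedding input
inputs = Input(shape=(input_dim,), name='encoder_input')
h = Dense(64, activation='relu')(inputs)
z_mean = Dense(latent_dim, name='z_mean')(h)
z_log_var = Dense(latent_dim, name='z_log_var')(h)
z = Lambda(sampling, output_shape=(latent_dim,), name='z')([z_mean, z_log_var])
encoder = Model(inputs, [z_mean, z_log_var, z], name='encoder')

# In a beta-VAE setup, `beta` (set to 30 above) would weight the KL term:
#   loss = reconstruction_loss + beta * kl_loss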