def clustering(student_list, data, method):
    """Group students with the clustering method named by *method*, then persist
    the resulting groups to a pickle file under Data/student_groups/.

    @param student_list: list of student ids, each a (string) student_id
    @param data: the actual data of the students, mapping data_key -> data
    @param method: string from command line argument(s) selecting the method.
        Parameterized methods append eps and min_samples as the last two
        underscore-separated fields, e.g. 'avg_stress_0.5_3', 'surveys_0.5_3',
        'dtw_0.5_3'. Anything unrecognized falls back to one_for_each.
    """
    # TODO(yunfeiluo): do the actual clustering work, write to pkl file
    if method == 'one_for_each':
        groups = one_for_each(student_list)
    elif method.startswith('avg_stress'):
        # format: avg_stress_<eps>_<min-samples>
        eps, min_samples = _parse_eps_min_samples(method)
        groups = avg_stress_cluster(student_list=student_list,
                                    data=data,
                                    eps=eps,
                                    min_samples=min_samples)
    elif method.startswith('surveys'):
        # format: surveys_<eps>_<min-samples>
        features = ['avg_hours_slept', 'mode_sleep_rating', 'avg_dead_line_per_week']
        eps, min_samples = _parse_eps_min_samples(method)
        groups = kmeans_features(student_list, features, eps, min_samples)
    elif method.startswith('dtw'):
        # format: dtw_<eps>_<min-samples>
        eps, min_samples = _parse_eps_min_samples(method)
        feature = -1  # stress label
        groups = time_warping(student_list, data, feature, eps, min_samples)
    else:
        # Unknown method: degrade gracefully to the trivial grouping.
        groups = one_for_each(student_list)

    # write to pkl file
    filepath = 'Data/student_groups/' + method + '.pkl'
    print('write to the file: ' + filepath)
    write_utils.data_structure_to_pickle(groups, filepath)


def _parse_eps_min_samples(method):
    """Return (eps, min_samples) parsed from the LAST two '_'-separated fields.

    Using the last two fields (rather than fixed positions 1 and 2) works for
    every method prefix, including multi-token ones like 'avg_stress'.
    """
    parts = method.split('_')
    return float(parts[-2]), int(parts[-1])
# NOTE(review): fragment — this chunk starts mid-expression (the leading call
# closes a paren opened before this view) and the enclosing function's
# signature and loop headers are not visible. Indentation below is
# reconstructed and must be confirmed against the full file.
            statistics.generate_training_statistics_for_user(
                val_labels, val_preds, val_users))
            # Track the best validation score of this split; val_scores[2] is
            # presumably the F-score component — TODO confirm against the
            # metric tuple produced upstream. Checkpoint the model at the best.
            if val_scores[2] > best_split_score:
                best_split_score = val_scores[2]
                epoch_at_best_score = epoch
                best_model = deepcopy(model)
            print("Split: {} Score This Epoch: {} Best Score: {}".format(
                split_no, val_scores[2], best_split_score))
        # Per-split bookkeeping once the epoch loop for this split finishes.
        split_val_scores.append(best_split_score)
        best_score_epoch_log.append(epoch_at_best_score)
        best_models.append(deepcopy(best_model))
    print("Avg Cross Val Score: {}".format(list_mean(split_val_scores)))
    print("alpha: {} Beta: {}".format(alpha, beta))
    print("Data File Path:", data_file_path)
    # Index of the split whose best validation score is highest overall.
    max_idx = split_val_scores.index(max(split_val_scores))
    # NOTE(review): this pickles the scalar epoch_at_best_score (left over from
    # the LAST split) rather than the per-split list best_score_epoch_log that
    # was accumulated above — looks like a bug; confirm intent.
    scores_and_epochs = (split_val_scores, epoch_at_best_score)
    scores_and_epochs_file_name = os.path.join(
        definitions.DATA_DIR, "cross_val_scores/lstm_classifier.pkl")
    write_utils.data_structure_to_pickle(scores_and_epochs,
                                         scores_and_epochs_file_name)
    # Persist the state_dict of the best split's best model.
    model_file_name = "saved_models/lstm.model"
    model_file_name = os.path.join(definitions.DATA_DIR, model_file_name)
    checkpointing.save_checkpoint(best_models[max_idx].state_dict(),
                                  model_file_name)
from src.data_manager import student_life_var_binned_data_manager as data_manager
from src.bin import statistics
from src.utils import write_utils

# Student ids to include in the exported training data.
student_list = [
    4, 7, 8, 10, 14, 16, 17, 19, 22, 23, 24, 32,
    33, 35, 36, 43, 44, 49, 51, 52, 53, 57, 58
]

# Build the binned training data for the selected students: normalized,
# NaNs filled, sequences kept unflattened, split by percentage.
data = data_manager.get_data_for_training_in_dict_format(
    *student_list,
    normalize=True,
    fill_na=True,
    flatten_sequence=False,
    split_type='percentage')

# Log the label distribution across train/test/val before persisting.
print(statistics.get_train_test_val_label_counts_from_raw_data(data))

# Persist the prepared splits for downstream training runs.
write_utils.data_structure_to_pickle(
    data,
    'Data/training_data/shuffled_splits/training_date_normalized_shuffled_splits_select_features_no_prev_stress_all_students.pkl'
)
def search_multitask_auto_encoder(hyper_parameters_list, data: dict):
    """Grid-search the multitask autoencoder over hyper-parameter configs.

    For each config in *hyper_parameters_list*, trains on every stratified
    cross-validation split, records the best weighted-F validation score per
    split, averages those scores, and pickles
    [(avg_val_score, model_params), ...] to grid_search_details.pkl.

    @param hyper_parameters_list: iterable of model-parameter configurations
        understood by helper.get_params_from_model.
    @param data: raw data dict; assumes keys of data['data'] encode student
        ids — TODO confirm against the data manager.
    """
    splits = cross_val.get_k_fod_cross_val_splits_stratified_by_students(data)
    student_list = conversions.extract_distinct_student_idsfrom_keys(
        data['data'].keys())
    # Tensorify a deep copy so the caller's dict is not mutated; use CUDA if
    # available.
    tensorified_data = tensorify.tensorify_data_gru_d(
        copy.deepcopy(data), torch.cuda.is_available())
    final_scores_for_each_config = []

    print("Label Distribution")
    print(statistics.get_train_test_val_label_counts_from_raw_data(data))

    for model_params_no, model_params in enumerate(hyper_parameters_list):
        print(
            "###################### Param Config No: {} ########################"
            .format(model_params_no))
        print("Params: ", model_params)

        # Unpack the full hyper-parameter tuple for this configuration.
        (use_histogram, autoencoder_bottle_neck_feature_size,
         autoencoder_num_layers, alpha, beta, decay, num_features,
         num_covariates, shared_hidden_layer_size,
         user_dense_layer_hidden_size, num_classes, learning_rate, n_epochs,
         shared_layer_dropout_prob, user_head_dropout_prob, class_weights,
         device) = helper.get_params_from_model(model_params, data)

        best_val_scores = []

        for split_no, split in enumerate(splits):
            print("Split {}".format(split_no))
            best_split_score = -1
            # Point this run at the current split; test ids intentionally empty
            # during the search.
            tensorified_data['train_ids'] = split["train_ids"]
            tensorified_data['val_ids'] = split["val_ids"]
            tensorified_data['test_ids'] = []

            # Fresh model/criteria/optimizer per split so splits don't leak
            # state into each other.
            model, reconstruction_criterion, classification_criterion, optimizer = helper.init_multitask_autoencoder_learner(
                num_features, autoencoder_bottle_neck_feature_size,
                autoencoder_num_layers, shared_hidden_layer_size,
                user_dense_layer_hidden_size, num_classes, num_covariates,
                shared_layer_dropout_prob, user_head_dropout_prob,
                learning_rate, decay, class_weights, student_list)

            # Per-epoch stat dictionaries (train/val series).
            total_loss_over_epochs, scores_over_epochs = plotting.get_empty_stat_over_n_epoch_dictionaries(
            )
            reconstruction_loss_over_epochs = copy.deepcopy(
                total_loss_over_epochs)
            classification_loss_over_epochs = copy.deepcopy(
                total_loss_over_epochs)

            for epoch in tqdm.tqdm(range(n_epochs)):
                (train_total_loss, train_total_reconstruction_loss,
                 train_total_classification_loss, train_labels, train_preds,
                 train_users), (val_total_loss,
                                val_total_reconstruction_loss,
                                val_total_classification_loss, val_labels,
                                val_preds,
                                val_users) = helper.train_for_one_epoch(
                                    tensorified_data, num_classes, model,
                                    reconstruction_criterion,
                                    classification_criterion, device,
                                    optimizer, alpha, beta, use_histogram)

                ######## Appending losses ########
                total_loss_over_epochs['train_loss'].append(train_total_loss)
                total_loss_over_epochs['val_loss'].append(val_total_loss)
                reconstruction_loss_over_epochs['train_loss'].append(
                    train_total_reconstruction_loss)
                reconstruction_loss_over_epochs['val_loss'].append(
                    val_total_reconstruction_loss)
                classification_loss_over_epochs['train_loss'].append(
                    train_total_classification_loss)
                classification_loss_over_epochs['val_loss'].append(
                    val_total_classification_loss)

                ######## Appending Metrics ########
                train_label_list = conversions.tensor_list_to_int_list(
                    train_labels)
                train_pred_list = conversions.tensor_list_to_int_list(
                    train_preds)
                val_label_list = conversions.tensor_list_to_int_list(
                    val_labels)
                val_pred_list = conversions.tensor_list_to_int_list(val_preds)

                # Weighted-average precision/recall/F; only the F-score
                # component (F_SCORE_INDEX) is kept as the selection metric.
                train_scores = metrics.precision_recall_fscore_support(
                    train_label_list, train_pred_list,
                    average='weighted')[F_SCORE_INDEX]
                val_scores = metrics.precision_recall_fscore_support(
                    val_label_list, val_pred_list,
                    average='weighted')[F_SCORE_INDEX]

                scores_over_epochs['train_scores'].append(train_scores)
                scores_over_epochs['val_scores'].append(val_scores)

                # Keep the best validation F-score seen in this split.
                if val_scores > best_split_score:
                    best_split_score = val_scores

            best_val_scores.append(best_split_score)

        # Score for this configuration = mean of per-split best scores.
        avg_val_score = list_mean(best_val_scores)
        final_scores_for_each_config.append((avg_val_score, model_params))
        print("Average score for current configuration: {}".format(
            avg_val_score))

    grid_search_details_file_path = os.path.join(definitions.DATA_DIR,
                                                 "grid_search_details.pkl")
    write_utils.data_structure_to_pickle(final_scores_for_each_config,
                                         grid_search_details_file_path)