Example #1
def clustering(student_list, data, method):
    '''
    @param student_list: list of student ids, each in the form (string) student_id
    @param data: the actual data of the students, keyed as data_key -> data
    @param method: string from the command line argument(s); decides which
        clustering method to use, e.g. 'avg_stress_<eps>_<min-samples>'
    '''
    # TODO (yunfeiluo): do the actual clustering work, write to pkl file
    groups = dict()
    if method == 'one_for_each':
        groups = one_for_each(student_list)
    elif method[:10] == 'avg_stress':
        # expected format: avg_stress_<eps>_<min-samples>
        eps = float(method.split('_')[-2])
        min_samples = int(method.split('_')[-1])
        groups = avg_stress_cluster(student_list=student_list, data=data, eps=eps, min_samples=min_samples)
    elif method[:7] == 'surveys':
        # expected format: surveys_<eps>_<min-samples>
        features = ['avg_hours_slept', 'mode_sleep_rating', 'avg_dead_line_per_week']
        eps = float(method.split('_')[1])
        min_samples = int(method.split('_')[2])
        groups = kmeans_features(student_list, features, eps, min_samples)
    elif method[:3] == 'dtw':
        # expected format: dtw_<eps>_<min-samples>
        eps = float(method.split('_')[1])
        min_samples = int(method.split('_')[2])
        feature = -1 # stress label
        groups = time_warping(student_list, data, feature, eps, min_samples)
    else:
        groups = one_for_each(student_list)

    # write to pkl file
    filepath = 'Data/student_groups/' + method + '.pkl'
    print('write to the file: ' + filepath)
    write_utils.data_structure_to_pickle(groups, filepath)
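
All four examples on this page persist their results through write_utils.data_structure_to_pickle. Its implementation is not shown here; a minimal sketch, assuming the helper only wraps the standard pickle module and creates the target directory (hypothetical, the project's real helper may differ):

import os
import pickle

def data_structure_to_pickle(data, filepath):
    # Make sure the parent directory (e.g. Data/student_groups/) exists.
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    # Serialize the object; the matching reader is pickle.load.
    with open(filepath, 'wb') as f:
        pickle.dump(data, f)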
Example #2
            statistics.generate_training_statistics_for_user(
                val_labels, val_preds, val_users))

        if val_scores[2] > best_split_score:
            best_split_score = val_scores[2]
            epoch_at_best_score = epoch
            best_model = deepcopy(model)

        print("Split: {} Score This Epoch: {} Best Score: {}".format(
            split_no, val_scores[2], best_split_score))

    split_val_scores.append(best_split_score)
    best_score_epoch_log.append(epoch_at_best_score)
    best_models.append(deepcopy(best_model))

print("Avg Cross Val Score: {}".format(list_mean(split_val_scores)))
print("alpha: {} Beta: {}".format(alpha, beta))
print("Data File Path:", data_file_path)
max_idx = split_val_scores.index(max(split_val_scores))

# Pair the per-split best scores with the per-split best-epoch log;
# epoch_at_best_score only holds the value from the last split.
scores_and_epochs = (split_val_scores, best_score_epoch_log)
scores_and_epochs_file_name = os.path.join(
    definitions.DATA_DIR, "cross_val_scores/lstm_classifier.pkl")
write_utils.data_structure_to_pickle(scores_and_epochs,
                                     scores_and_epochs_file_name)

model_file_name = "saved_models/lstm.model"
model_file_name = os.path.join(definitions.DATA_DIR, model_file_name)
checkpointing.save_checkpoint(best_models[max_idx].state_dict(),
                              model_file_name)
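
checkpointing.save_checkpoint and definitions.DATA_DIR come from the surrounding project and are not defined in this snippet. Assuming the checkpoint helper is a thin wrapper over torch.save, a minimal hypothetical equivalent looks like:

import os
import torch

def save_checkpoint(state_dict, file_name):
    # Hypothetical sketch; the project's real helper may add bookkeeping.
    os.makedirs(os.path.dirname(file_name), exist_ok=True)
    # state_dict is the model's parameter dict, e.g. model.state_dict().
    torch.save(state_dict, file_name)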
Example #3
from src.data_manager import student_life_var_binned_data_manager as data_manager
from src.bin import statistics
from src.utils import write_utils

student_list = [
    4, 7, 8, 10, 14, 16, 17, 19, 22, 23, 24, 32, 33, 35, 36, 43, 44, 49, 51,
    52, 53, 57, 58
]
data = data_manager.get_data_for_training_in_dict_format(
    *student_list,
    normalize=True,
    fill_na=True,
    flatten_sequence=False,
    split_type='percentage')

print(statistics.get_train_test_val_label_counts_from_raw_data(data))

write_utils.data_structure_to_pickle(
    data,
    'Data/training_data/shuffled_splits/training_date_normalized_shuffled_splits_select_features_no_prev_stress_all_students.pkl'
)
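
To read the written splits back, assuming data_structure_to_pickle uses the standard pickle module as sketched under Example #1:

import pickle

path = ('Data/training_data/shuffled_splits/'
        'training_date_normalized_shuffled_splits_select_features_no_prev_stress_all_students.pkl')
with open(path, 'rb') as f:
    data = pickle.load(f)
# The loaded object mirrors what was written: the dict returned by the data manager.
print(type(data))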
Example #4
# Standard-library and third-party imports used below. Project-internal helpers
# (cross_val, conversions, tensorify, statistics, helper, plotting, write_utils,
# definitions, F_SCORE_INDEX, list_mean) are assumed to be imported from the
# surrounding repository, as in Example #3.
import copy
import os

import torch
import tqdm
from sklearn import metrics  # assumed: the snippet matches sklearn's metrics API


def search_multitask_auto_encoder(hyper_parameters_list, data: dict):
    splits = cross_val.get_k_fod_cross_val_splits_stratified_by_students(data)
    student_list = conversions.extract_distinct_student_idsfrom_keys(
        data['data'].keys())
    tensorified_data = tensorify.tensorify_data_gru_d(
        copy.deepcopy(data), torch.cuda.is_available())

    final_scores_for_each_config = []

    print("Label Distribution")
    print(statistics.get_train_test_val_label_counts_from_raw_data(data))

    for model_params_no, model_params in enumerate(hyper_parameters_list):
        print(
            "###################### Param Config No: {} ########################"
            .format(model_params_no))
        print("Params: ", model_params)

        (use_histogram, autoencoder_bottle_neck_feature_size,
         autoencoder_num_layers, alpha, beta, decay, num_features,
         num_covariates, shared_hidden_layer_size,
         user_dense_layer_hidden_size, num_classes, learning_rate, n_epochs,
         shared_layer_dropout_prob, user_head_dropout_prob, class_weights,
         device) = helper.get_params_from_model(model_params, data)

        best_val_scores = []

        for split_no, split in enumerate(splits):

            print("Split {}".format(split_no))

            best_split_score = -1

            tensorified_data['train_ids'] = split["train_ids"]
            tensorified_data['val_ids'] = split["val_ids"]
            tensorified_data['test_ids'] = []

            model, reconstruction_criterion, classification_criterion, optimizer = helper.init_multitask_autoencoder_learner(
                num_features, autoencoder_bottle_neck_feature_size,
                autoencoder_num_layers, shared_hidden_layer_size,
                user_dense_layer_hidden_size, num_classes, num_covariates,
                shared_layer_dropout_prob, user_head_dropout_prob,
                learning_rate, decay, class_weights, student_list)

            total_loss_over_epochs, scores_over_epochs = plotting.get_empty_stat_over_n_epoch_dictionaries(
            )
            reconstruction_loss_over_epochs = copy.deepcopy(
                total_loss_over_epochs)
            classification_loss_over_epochs = copy.deepcopy(
                total_loss_over_epochs)

            for epoch in tqdm.tqdm(range(n_epochs)):

                (train_total_loss, train_total_reconstruction_loss,
                 train_total_classification_loss, train_labels, train_preds,
                 train_users), (val_total_loss, val_total_reconstruction_loss,
                                val_total_classification_loss, val_labels,
                                val_preds,
                                val_users) = helper.train_for_one_epoch(
                                    tensorified_data, num_classes, model,
                                    reconstruction_criterion,
                                    classification_criterion, device,
                                    optimizer, alpha, beta, use_histogram)

                ######## Appending losses ########
                total_loss_over_epochs['train_loss'].append(train_total_loss)
                total_loss_over_epochs['val_loss'].append(val_total_loss)

                reconstruction_loss_over_epochs['train_loss'].append(
                    train_total_reconstruction_loss)
                reconstruction_loss_over_epochs['val_loss'].append(
                    val_total_reconstruction_loss)

                classification_loss_over_epochs['train_loss'].append(
                    train_total_classification_loss)
                classification_loss_over_epochs['val_loss'].append(
                    val_total_classification_loss)

                ######## Appending Metrics ########
                train_label_list = conversions.tensor_list_to_int_list(
                    train_labels)
                train_pred_list = conversions.tensor_list_to_int_list(
                    train_preds)
                val_label_list = conversions.tensor_list_to_int_list(
                    val_labels)
                val_pred_list = conversions.tensor_list_to_int_list(val_preds)

                train_scores = metrics.precision_recall_fscore_support(
                    train_label_list, train_pred_list,
                    average='weighted')[F_SCORE_INDEX]
                val_scores = metrics.precision_recall_fscore_support(
                    val_label_list, val_pred_list,
                    average='weighted')[F_SCORE_INDEX]

                scores_over_epochs['train_scores'].append(train_scores)
                scores_over_epochs['val_scores'].append(val_scores)

                if val_scores > best_split_score:
                    best_split_score = val_scores

            best_val_scores.append(best_split_score)

        avg_val_score = list_mean(best_val_scores)
        final_scores_for_each_config.append((avg_val_score, model_params))

        print("Average score for current configuration: {}".format(
            avg_val_score))

    grid_search_details_file_path = os.path.join(definitions.DATA_DIR,
                                                 "grid_search_details.pkl")
    write_utils.data_structure_to_pickle(final_scores_for_each_config,
                                         grid_search_details_file_path)
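
After the grid search finishes, the pickled list of (avg_val_score, model_params) tuples can be ranked offline. A small sketch, assuming standard pickle serialization and that definitions.DATA_DIR points at the same Data/ directory used in Examples #1 and #3:

import os
import pickle

with open(os.path.join('Data', 'grid_search_details.pkl'), 'rb') as f:
    results = pickle.load(f)
# Each entry is (avg_val_score, model_params); pick the best-scoring config.
best_score, best_params = max(results, key=lambda entry: entry[0])
print('Best avg val score: {}'.format(best_score))
print('Best params: {}'.format(best_params))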