Example #1
def get_transposed_data(data: dict):
    validations.validate_data_dict_keys(data)
    for key in data['data']:
        transposed_data = [
            transpose_data(datum) for datum in data['data'][key]
        ]
        data['data'][key] = tuple(transposed_data)

    return data
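
All of the examples on this page operate on the same data dictionary. As a point of reference, here is a minimal hypothetical sketch of that layout, with the split-id keys taken from Example #6 and the per-key tuple positions (sequences at the front, missing flags at index 1, time deltas at index 2, label at index 3) inferred from Examples #4, #6 and #8:

# Hypothetical illustration only; the exact tuple contents vary by
# pipeline stage (compare Example #5, where the tuple holds actual,
# covariate and histogram data instead).
data = {
    'train_ids': ['student_1_0'],
    'val_ids': [],
    'test_ids': [],
    'data': {
        'student_1_0': (
            [[0.1, 0.4], [0.2, -1]],  # feature sequences
            [[0, 0], [0, 1]],         # missing flags (1 = missing)
            [[0, 1], [0, 2]],         # time deltas
            2,                        # label
        ),
    },
}
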
Example #2
def get_sub_sampled_sequences(data: dict):
    validations.validate_all_data_present_in_data_dict(data)
    validations.validate_data_dict_keys(data)
    new_data = object_generator_utils.get_empty_data_dict()

    for key in data['data']:
        key_set = find_set_for_key(data, key)
        if key_set:
            sub_sample_sequences(data['data'][key], key, key_set, new_data)

    return new_data
Example #3
def tensorify_data_gru_d(data: dict, cuda_enabled=False):
    """

    @param data: Data dictionary that needs to be converted to tensors in GRUD style of data.
    @param cuda_enabled: If true, will convert data into cuda tensors.
    @return: Return Data dictionary with tensors which can be used to train.
    """
    validations.validate_data_dict_keys(data)
    validations.validate_all_data_present_in_data_dict(data)
    for key in data['data'].keys():
        data['data'][key] = get_data_and_label_tensor(data, key, cuda_enabled)

    return data
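
get_data_and_label_tensor is not shown in this listing. Purely as an assumption-labeled sketch, it might convert each sequence in the per-key tuple to a float tensor and the trailing label to a long tensor, moving everything to the GPU when cuda_enabled is set:

import torch

# Hypothetical sketch of get_data_and_label_tensor; the real helper is
# not part of this listing.
def get_data_and_label_tensor(data, key, cuda_enabled):
    *sequences, label = data['data'][key]
    device = torch.device('cuda' if cuda_enabled else 'cpu')
    tensors = tuple(torch.tensor(seq, dtype=torch.float, device=device)
                    for seq in sequences)
    return tensors + (torch.tensor(label, dtype=torch.long, device=device),)
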
Example #4
def data_debug_string(data: dict, seq_limit):
    """

    @param data:
    @param seq_limit: Integer value to limit the seq.
    @return: Returns samples example of small slices of data.
    """
    validations.validate_data_dict_keys(data)
    first_key = next(iter(data['data'].keys()))
    print('first_key: ', first_key)

    for idx, datum in enumerate(data['data'][first_key]):
        # Index 3 of the per-key tuple holds the label; every other
        # entry is a sequence, printed truncated to seq_limit elements.
        if idx == 3:
            print('Label: ', datum)
        else:
            print(datum[:seq_limit])
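
A quick usage sketch, assuming a data dict in the hypothetical layout sketched after Example #1:

# Print the first five elements of each sequence for the first key,
# plus the label stored at index 3.
data_debug_string(data, seq_limit=5)
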
Example #5
def evaluate_multitask_lstm_learner(data,
                                    key_set: str,
                                    multitask_lerner_model,
                                    classification_criterion,
                                    optimizer=None,
                                    use_histogram=False):
    validations.validate_data_dict_keys(data)
    validate_key_set_str(key_set)

    total_classification_loss = 0

    labels = []
    predictions = []
    users = []

    if not optimizer:
        multitask_lerner_model.eval()
    else:
        multitask_lerner_model.train()

    for key in data[key_set]:
        student_id = conversions.extract_student_id_from_key(key)
        student_key = 'student_' + str(student_id)
        actual_data, covariate_data, histogram_data, train_label = data[
            'data'][key]
        actual_data = actual_data[0].unsqueeze(0)
        if use_histogram:
            actual_data = histogram_data.unsqueeze(0)
        y_pred = multitask_lerner_model(student_key, actual_data,
                                        covariate_data)

        classification_loss = classification_criterion(y_pred, train_label)
        total_classification_loss += classification_loss.item()

        # Check if training
        if optimizer:
            multitask_lerner_model.zero_grad()
            classification_loss.backward()
            optimizer.step()

        labels.append(train_label)
        y_pred_squeezed = y_pred.squeeze(0)
        _, max_idx = y_pred_squeezed.max(0)
        predictions.append(max_idx)
        users.append(student_id)

    return total_classification_loss, labels, predictions, users
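
The optimizer argument doubles as the train/eval switch: passing one puts the model in train() mode and runs backprop, while omitting it only evaluates. A hedged usage sketch (model, criterion and optimizer construction elided):

# One training pass over the train split, then an evaluation pass over
# the validation split of the same data dict.
train_loss, labels, predictions, users = evaluate_multitask_lstm_learner(
    data, 'train_ids', multitask_lerner_model,
    classification_criterion, optimizer=optimizer)
val_loss, val_labels, val_predictions, val_users = evaluate_multitask_lstm_learner(
    data, 'val_ids', multitask_lerner_model, classification_criterion)
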
Example #6
def convert_logical_not_missing_flags(data):
    validations.validate_data_dict_keys(data)

    new_dict = {}
    data_dict = {}

    new_dict['train_ids'] = data['train_ids']
    new_dict['val_ids'] = data['val_ids']
    new_dict['test_ids'] = data['test_ids']

    for key in data['data']:
        # Index 1 of the per-key tuple holds the flags; flip 0s and 1s.
        mutable_data = list(data['data'][key])
        mutable_data[1] = np.logical_not(np.array(
            data['data'][key][1])).astype(int).tolist()
        data_dict[key] = tuple(mutable_data)

    new_dict['data'] = data_dict

    return new_dict
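
The index-1 transformation just flips the flags between "missing" and "present" semantics. For example:

import numpy as np

flags = [[0, 1, 0], [1, 1, 0]]
inverted = np.logical_not(np.array(flags)).astype(int).tolist()
print(inverted)  # [[1, 0, 1], [0, 0, 1]]
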
Example #7
def get_statistics_on_data_dict(data: dict, feature_list: list):
    """
    @attention Statistics returned are ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'].
    @param data: Data in the standard dictionary format.
    @param feature_list: List of feature names for the data.
    @return: Statistics over the whole data, plus the raw concatenated data.
    """
    validations.validate_data_dict_keys(data)
    validations.validate_all_data_present_in_data_dict(data)
    frames = []

    for key in data['data']:
        unit_sequence = data['data'][key][0]
        frames.append(pd.DataFrame(unit_sequence))

    # DataFrame.append was removed in pandas 2.0, so build the frames
    # first and concatenate them in one call.
    df_for_statistics = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    if not data_manager.FLATTEN_SEQUENCE_TO_COLS:
        df_for_statistics.columns = feature_list
    df_for_statistics.replace(to_replace=-1, value=np.nan, inplace=True)
    return df_for_statistics.describe(percentiles=[0.25, 0.5, 0.75]), df_for_statistics
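
A usage sketch, assuming two named features per time step (the feature names here are illustrative) and a data dict in the usual layout:

# -1 entries are treated as missing (replaced with NaN), so they do not
# distort the count/mean/std that describe() reports.
stats, raw_df = get_statistics_on_data_dict(data, feature_list=['heart_rate', 'steps'])
print(stats)
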
Example #8
def add_mean_vector_to_data(data: dict):
    validations.validate_data_dict_keys(data)
    validations.validate_data_dict_data_len(data)

    for key in data['data']:
        data_list = list(data['data'][key])
        feature_data = data_list[0]
        missing_flags = data_list[1]
        time_delta = data_list[2]
        label = data_list[3]
        # Per-feature mean of each series, computed given its missing
        # flags.
        mean_vector = [
            get_mean_for_series(feature_data[i], missing_flags[i])
            for i in range(len(feature_data))
        ]

        data_tuple = (feature_data, missing_flags, time_delta, mean_vector,
                      label)

        data['data'][key] = data_tuple

    return data
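
get_mean_for_series is not defined in this listing. A minimal sketch of the behaviour its call site suggests, assuming flag == 0 marks an observed value:

import numpy as np

# Hypothetical sketch: mean over the observed entries only, with 0 as a
# fallback for a fully missing series.
def get_mean_for_series(series, flags):
    observed = [value for value, flag in zip(series, flags) if flag == 0]
    return float(np.mean(observed)) if observed else 0.0
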
Example #9
def evaluate_set(data,
                 key_set: str,
                 model,
                 criterion,
                 optimizer=None,
                 train_covariates=False):
    validations.validate_data_dict_keys(data)
    validate_key_set_str(key_set)
    total_loss = 0
    labels = []
    predictions = []

    if not optimizer:
        model.eval()
    else:
        model.train()

    for key in data[key_set]:
        actual_data, covariate_data, train_label = data['data'][key]
        y_pred = model(
            actual_data,
            covariate_data) if train_covariates else model(actual_data)
        y_pred_unsqueezed = y_pred.unsqueeze(0)
        loss = criterion(y_pred_unsqueezed, train_label)
        total_loss += loss.item()

        # Check if training
        if optimizer:
            model.zero_grad()
            loss.backward()
            optimizer.step()

        labels.append(train_label)
        _, max_idx = y_pred.max(0)
        predictions.append(max_idx)

    return total_loss, labels, predictions
Example #10
def evaluate_multitask_learner(data,
                               key_set: str,
                               num_classes,
                               multitask_lerner_model,
                               reconstruction_criterion,
                               classification_criterion,
                               device,
                               optimizer=None,
                               alpha=1,
                               beta=1,
                               use_histogram=False,
                               histogram_seq_len=None,
                               ordinal_regression=False,
                               use_covariates=True):
    validations.validate_data_dict_keys(data)
    validate_key_set_str(key_set)

    total_reconstruction_loss = 0
    total_classification_loss = 0
    total_joint_loss = 0

    labels = []
    predictions = []
    users = []

    if not optimizer:
        multitask_lerner_model.eval()
    else:
        multitask_lerner_model.train()

    for key in data[key_set]:
        student_id = conversions.extract_student_id_from_key(key)
        student_key = 'student_' + str(student_id)
        actual_data, covariate_data, histogram_data, train_label = data[
            'data'][key]

        if ordinal_regression:
            train_label_vector = get_target_vector_for_ordinal_regression(
                train_label, num_classes, device)

        actual_data = actual_data[0].unsqueeze(0)
        if use_histogram:
            if histogram_seq_len:
                # Truncate the histogram to at most histogram_seq_len
                # time steps.
                histogram_data = histogram_data[:min(histogram_seq_len,
                                                     len(histogram_data))]
            actual_data = histogram_data.unsqueeze(0)

        decoded_output, y_pred = multitask_lerner_model(
            student_key, actual_data,
            covariate_data if use_covariates else None)

        # decoded_output is None when training on covariates only.
        reconstruction_loss = reconstruction_criterion(
            actual_data, decoded_output
        ) if decoded_output is not None else object_generator.get_tensor_on_correct_device(
            [0])
        total_reconstruction_loss += reconstruction_loss.item()

        if ordinal_regression:
            classification_loss = classification_criterion(
                y_pred, train_label_vector)
        else:
            classification_loss = classification_criterion(y_pred, train_label)

        total_classification_loss += classification_loss.item()

        joint_loss = alpha * reconstruction_loss + beta * classification_loss
        total_joint_loss += joint_loss.item()

        # Check if training
        if optimizer:
            multitask_lerner_model.zero_grad()
            joint_loss.backward()
            optimizer.step()

        labels.append(train_label)
        predicted_class = get_predicted_class(
            y_pred, ordinal_regression=ordinal_regression)
        predictions.append(predicted_class)
        users.append(student_id)

    return total_joint_loss, total_reconstruction_loss, total_classification_loss, labels, predictions, users
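
A hedged sketch of how this might sit inside an epoch loop (model, criteria, optimizer, device and num_epochs construction elided):

# Joint autoencoder + classifier training: alpha and beta weight the
# reconstruction loss against the classification loss.
for epoch in range(num_epochs):
    joint_loss, recon_loss, clf_loss, labels, predictions, users = \
        evaluate_multitask_learner(data, 'train_ids', num_classes,
                                   multitask_lerner_model,
                                   reconstruction_criterion,
                                   classification_criterion, device,
                                   optimizer=optimizer, alpha=1, beta=1)
    print('epoch', epoch, 'joint loss', joint_loss)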