def test_pfa(self):
     """Compare the PFA encoding with the ``pfa.csv`` fixture.

     Encodes ``self.data`` / ``self.q_mat`` with the skills, wins and
     fails features and checks the result row by row.
     """
     encoded = df_to_sparse(self.data, self.q_mat,
                            ["skills", "wins", "fails"])
     dense = encoded.toarray()
     # Order the rows by the values in column 4, then keep columns 5
     # onward so that only the sparse feature columns remain.
     row_order = dense[:, 4].argsort()
     actual = dense[row_order, 5:]
     # Reference features are stored as a semicolon-separated file.
     reference = pd.read_csv("data/dummy/pfa.csv", sep=';')
     expected = np.array(reference)
     self.assertSequenceEqual(actual.tolist(), expected.tolist(),
                              "Inconsistent PFA features")
 def test_ui(self):
     """Compare the IRT/MIRT encoding with the ``irt.csv`` fixture.

     Encodes ``self.data`` / ``self.q_mat`` with the users and items
     features and checks the result row by row.
     """
     encoded = df_to_sparse(self.data, self.q_mat, ["users", "items"])
     dense = encoded.toarray()
     # Order the rows by the values in column 4, then keep columns 5
     # onward so that only the sparse feature columns remain.
     row_order = dense[:, 4].argsort()
     actual = dense[row_order, 5:]
     # Reference features are stored as a semicolon-separated file.
     reference = pd.read_csv("data/dummy/irt.csv", sep=';')
     expected = np.array(reference)
     self.assertSequenceEqual(actual.tolist(), expected.tolist(),
                              "Inconsistent IRT features")
 def test_afm(self):
     """Compare the AFM encoding with the ``afm.csv`` fixture.

     Encodes ``self.data`` / ``self.q_mat`` with the skills and attempts
     features and checks the result row by row.
     """
     encoded = df_to_sparse(self.data, self.q_mat, ["skills", "attempts"])
     dense = encoded.toarray()
     # Order the rows by the values in column 4, then keep columns 5
     # onward so that only the sparse feature columns remain.
     row_order = dense[:, 4].argsort()
     actual = dense[row_order, 5:]
     # Reference features are stored as a semicolon-separated file.
     reference = pd.read_csv("data/dummy/afm.csv", sep=';')
     expected = np.array(reference)
     self.assertSequenceEqual(actual.tolist(), expected.tolist(),
                              "Inconsistent AFM features")
    def test_dash(self):
        """Compare the DASH encoding with the ``dash.csv`` fixture.

        Encodes ``self.data`` / ``self.q_mat`` with the users, items, wins
        and attempts features and ``tw="tw_items"``, orders the rows by
        column 4, keeps columns 5 onward, and maps the last ten columns
        through ``exp(x) - 1`` before the row-by-row comparison.
        """
        X_uiwat2 = df_to_sparse(self.data,
                                self.q_mat,
                                ["users", "items", "wins", "attempts"],
                                tw="tw_items").toarray()
        # Sort rows by column 4 and collect only the sparse columns.
        X_uiwat2 = X_uiwat2[X_uiwat2[:, 4].argsort(), 5:]
        # Convert to simple counters so the comparison can stay exact.
        # Presumably the last ten columns hold log(1 + count) values --
        # confirm against df_to_sparse. The exp step is not guaranteed to
        # land on an exact integer in floating point, so round away the
        # representation noise; 6 decimals is far coarser than that noise
        # but fine enough that a genuinely wrong value still fails.
        X_uiwat2[:, -10:] = np.round(np.exp(X_uiwat2[:, -10:]) - 1, 6)

        dash_features = np.array(pd.read_csv("data/dummy/dash.csv", sep=';'))
        self.assertSequenceEqual(X_uiwat2.tolist(), dash_features.tolist(),
                                 "Inconsistent DASH features")
# Example #5
0
from encode import df_to_sparse
from train_lr import compute_metrics

# Directory that the dataset files used by this script are read from.
data_path = 'data/new_sqai/'
# Contents of ElemMATHdata.csv under data_path, read with pandas defaults.
original_df = pd.read_csv(os.path.join(data_path, "ElemMATHdata.csv"))
# Starts empty; presumably filled by the loop below -- TODO confirm.
stats = []
for i in range(5, 18):
    print('using user: '******'preprocessed_data.csv'),
                     sep="\t")
    df = df[["user_id", "item_id", "timestamp", "correct", "skill_id"]]
    Q_mat = sparse.load_npz(os.path.join(data_path, 'q_mat.npz')).toarray()
    active_features = ['i', 's', 'ic', 'sc', 'tc', 'w', 'a']
    X = df_to_sparse(df, Q_mat, active_features)
    # sparse.save_npz(os.path.join(data_path, 'X-features'), X)
    #
    # parser = argparse.ArgumentParser(description='Train logistic regression on sparse feature matrix.')
    # parser.add_argument('--X_file', type=str)
    # parser.add_argument('--dataset', type=str)
    # parser.add_argument('--iter', type=int, default=1000)
    # args = parser.parse_args()
    #
    # features_suffix = (args.X_file.split("-")[-1]).split(".")[0]
    #
    # # Load sparse dataset
    # X = csr_matrix(load_npz(args.X_file))
    # print('encoded')

    train_df = pd.read_csv(os.path.join(data_path,