def _pearson_scores(features, target):
    """Score every feature column by its Pearson correlation with *target*.

    Returns a ``(scores, p_values)`` pair of 1-D arrays, one entry per column
    of *features*, which is the shape of result ``SelectKBest`` accepts from
    a score function.
    """
    r_values = []
    p_values = []
    for column in features.T:
        r, p = pearsonr(column, target)
        r_values.append(r)
        p_values.append(p)
    return np.array(r_values), np.array(p_values)


def preprocessing(label_path='P_po/panas_positive_post.csv',
                  input_path='P_po/input_top5.csv',
                  k=10):
    """Load labels and features from CSV, binarize labels, select features.

    The first column of both CSV files is dropped before use.  Labels are
    split into two classes at their median (values strictly above the median
    become 1, the rest 0) and flattened to 1-D.  Features are cast to
    float64 and reduced to the ``k`` columns with the highest Pearson
    correlation against the binarized labels.

    Other dataset pairs previously used with this function:
    ``'F/FS_post.csv'`` with ``'F/input_top5.csv'`` and
    ``'P_neg/panas_negative_post.csv'`` with ``'P_neg/input_top5.csv'``.

    Args:
        label_path: CSV file holding the label values (header on row 0).
        input_path: CSV file holding the input features (header on row 0).
        k: Number of features to keep.

    Returns:
        Tuple ``(X, y)``: the selected feature matrix and the 1-D array of
        binarized labels.
    """
    # get label y (drop the first column)
    labels = np.array(pd.read_csv(label_path, header=0))[:, 1:]
    # use median value as the threshold and divide y into two groups
    thr = np.median(labels)
    y = Binarizer(threshold=thr).fit_transform(labels).reshape(-1)

    # get input X (drop the first column)
    X = np.array(pd.read_csv(input_path, header=0))[:, 1:].astype(np.float64)

    # use Pearson's correlation to select useful features
    # NOTE(review): the signed r is used as the score, so strongly negative
    # correlations rank lowest -- confirm whether abs(r) was intended.
    X = SelectKBest(_pearson_scores, k=k).fit_transform(X, y)
    return X, y
Пример #2
0
    y = y.flatten()
    return X, y


def get_column_names(path):
    """Return the column names from the first line of a comma-separated file.

    The header line is split on plain commas (no quoting is interpreted).
    The trailing line terminator is stripped first so the last column name
    does not carry a newline.

    Args:
        path: Path of the file to read.

    Returns:
        List of column-name strings in file order.
    """
    with open(path) as fp:
        # readline() keeps the line ending; drop it before splitting.
        header_line = fp.readline().rstrip('\r\n')
    return header_line.split(',')


# Load the training data and keep the first two feature columns, each
# reshaped to an (n_samples, 1) column vector.
X, y = get_training_data(data_path)
letter_names = X[:, 0].reshape(-1, 1)
letter_sounds = X[:, 1].reshape(-1, 1)

# Binarize labels around fail_threshold. y is passed in as a single column,
# so the result stays 2-D with one column.
# NOTE(review): presumably sklearn's Binarizer (values strictly greater than
# the threshold map to 1, the rest to 0) -- confirm against the imports.
y = Binarizer(threshold=fail_threshold).transform(y.reshape(-1, 1))

# Each entry unpacks below as (independent variable name, feature column,
# tick positions). The 'letter sounds' dataset is currently disabled.
datasets = [
    ('letter names', letter_names, (0, 13, 26, 39, 52)),
    # ('letter sounds', letter_sounds, (0, 13, 26))
]

for independent_variable_name, X_data, X_ticks in datasets:

    # Create linear regression object
    # NOTE(review): if this is scikit-learn's LinearRegression, `normalize`
    # was deprecated in 1.0 and removed in 1.2 -- confirm the pinned version.
    regr = linear_model.LinearRegression(normalize=True)

    # Split into training and testing sets (20% of the samples held out
    # for testing)
    X_train, X_test, y_train, y_test = train_test_split(X_data,
                                                        y,
                                                        test_size=0.2)
Пример #3
0
    y = y.flatten()
    return X, y


def get_column_names(path):
    """Read the header row of a comma-separated file and return its fields.

    Only the first line of the file is read.  Its line terminator is removed
    before splitting on commas, so the final field is returned without a
    trailing newline.  Quoted fields are not interpreted.

    Args:
        path: Path of the file whose header should be read.

    Returns:
        List of header field strings in file order.
    """
    with open(path) as fp:
        first_line = fp.readline()
    # Strip the line ending that readline() leaves on the last field.
    return first_line.rstrip('\r\n').split(',')


# Load the training data and keep the first two feature columns, each
# reshaped to an (n_samples, 1) column vector.
X, y = get_training_data(data_path)
letter_names = X[:, 0].reshape(-1, 1)
letter_sounds = X[:, 1].reshape(-1, 1)

# Binarize labels around fail_threshold. y is reshaped to a single row for
# the transformer and the [0] takes that row back out, leaving y 1-D.
# NOTE(review): presumably sklearn's Binarizer (values strictly greater than
# the threshold map to 1, the rest to 0) -- confirm against the imports.
y = Binarizer(threshold=fail_threshold).transform(y.reshape(1, -1))[0]

# Bundle the full feature matrix with the binarized labels.
reading_data = (X, y)

# Only one dataset is processed here.
datasets = [reading_data]

# points where we want ticks, as well as the label for that tick
# (each pair is [tick position, tick label])
ticks = [[0, 0], [13, 7], [26, 13], [39, 20], [52, 26]]
ticks = np.array(ticks)

# NOTE(review): plt is presumably matplotlib.pyplot; figsize is then
# (width, height) in inches -- confirm against the imports.
figure = plt.figure(figsize=(27, 9))
# Counter starting at 1; its use is not visible in this section
# (presumably a subplot index -- TODO confirm).
i = 1
# iterate over datasets
# LN is X, LS is Y
for ds_cnt, ds in enumerate(datasets):
    # preprocess dataset, split into training and test part