Example #1
0
import numpy as np

# Parse the CSV files into NumPy arrays
# (X_train_fpath, Y_train_fpath and X_test_fpath are assumed to be defined earlier)
with open(X_train_fpath) as f:
    next(f)  # skip the header row
    X_train = np.array([line.strip('\n').split(',')[1:] for line in f], dtype=float)
with open(Y_train_fpath) as f:
    next(f)  # skip the header row
    Y_train = np.array([line.strip('\n').split(',')[1] for line in f], dtype=float)
with open(X_test_fpath) as f:
    next(f)  # skip the header row
    X_test = np.array([line.strip('\n').split(',')[1:] for line in f], dtype=float)

print('Normalizing and splitting data...\n')

# Normalize training and testing data (the test data reuses the training-set mean and std)
X_train, X_mean, X_std = u._normalize(X_train, train=True)
X_test, _, _ = u._normalize(X_test, train=False, specified_column=None, X_mean=X_mean, X_std=X_std)

# Split data into training set and development set
dev_ratio = 0.1
X_train, Y_train, X_dev, Y_dev = u._train_dev_split(X_train, Y_train, dev_ratio=dev_ratio)

# Optional feature selection (currently disabled): keep only the columns whose
# absolute correlation with the label exceeds 0.05.
# corr = abs(np.corrcoef(X_train.T, Y_train)[-1, :-1]) > 0.05
# X_train = X_train[:, corr]
# X_test = X_test[:, corr]
# X_dev = X_dev[:, corr]

train_size = X_train.shape[0]
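
The helpers u._normalize and u._train_dev_split come from the script's utility module (imported here as u) and are not shown in this example. A minimal sketch consistent with the call signatures above, offered as an assumption rather than the actual implementation, could look like this:

import numpy as np

def _normalize(X, train=True, specified_column=None, X_mean=None, X_std=None):
    # Hypothetical sketch: z-score normalization of the selected columns.
    # With train=True the mean/std are computed from X itself; otherwise the
    # statistics computed on the training set are reused.
    if specified_column is None:
        specified_column = np.arange(X.shape[1])
    if train:
        X_mean = np.mean(X[:, specified_column], axis=0).reshape(1, -1)
        X_std = np.std(X[:, specified_column], axis=0).reshape(1, -1)
    X[:, specified_column] = (X[:, specified_column] - X_mean) / (X_std + 1e-8)
    return X, X_mean, X_std

def _train_dev_split(X, Y, dev_ratio=0.25):
    # Hypothetical sketch: hold out the last dev_ratio fraction of rows
    # as the development set.
    train_size = int(len(X) * (1 - dev_ratio))
    return X[:train_size], Y[:train_size], X[train_size:], Y[train_size:]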
Example #2
0
    def test_normalize(self):
        word = "?WoRD!"
        self.assertEqual(util._normalize(word), "word")
Example #3
0
    def test_normalize_whitespace(self):
        word = "   word\n"
        self.assertEqual(util._normalize(word), "word")
Example #4
0
    def test_normalize_punctuation(self):
        word = "!WORD! "
        self.assertEqual(util._normalize(word), "word")
Example #5
0
    def test_normalize_internal_punctuation(self):
        word = "shouldn't."
        self.assertEqual(util._normalize(word), "shouldn't")
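
Examples #2 through #5 all exercise a util._normalize helper whose implementation is not shown. A minimal sketch that would satisfy these four assertions, offered purely as an assumption based on the expected outputs, lowercases the word and strips surrounding whitespace and punctuation while leaving internal punctuation intact:

import string

def _normalize(word):
    # Hypothetical sketch: trim whitespace, strip leading/trailing punctuation,
    # then lowercase. Internal punctuation (e.g. the apostrophe in "shouldn't")
    # is preserved because strip() only removes characters at the ends.
    return word.strip().strip(string.punctuation).lower()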