forked from astro1860/data_proj2
-
Notifications
You must be signed in to change notification settings - Fork 0
/
mapper.py
72 lines (63 loc) · 2.95 KB
/
mapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#!/local/anaconda/bin/python
# IMPORTANT: leave the above line as is.
import sys
import numpy as np
from sklearn.utils import check_array, check_random_state, as_float_array
from sklearn.utils.extmath import safe_sparse_dot
from sklearn.linear_model import SGDClassifier
from sklearn.kernel_approximation import (RBFSampler,Nystroem,AdditiveChi2Sampler,SkewedChi2Sampler)
# Module-level setup for the random cosine feature map used by transform():
# a projection matrix and a vector of phase offsets are drawn once, at import
# time, and then reused unchanged for every input vector.
#DIMENSION = 400 # Dimension of the original data.
CLASSES = (-1, +1) # The classes that we are trying to predict.
# Seed NumPy's global generator before any random numbers are drawn.
np.random.seed(seed=42)
ran_stat = None
# NOTE(review): per scikit-learn's documentation, check_random_state(None)
# returns NumPy's global RandomState singleton, so the seed set above should
# govern the draws below -- confirm for the installed scikit-learn version.
random_state = check_random_state(ran_stat)
# Number of random features produced per input vector (columns of
# random_weights, length of random_offset).
n_components = 12000
# Number of rows of random_weights; transform() multiplies its input by
# random_weights, so inputs must have this many features.
n_features = 400
# Both arrays are drawn from the same generator, weights first; reordering
# these two lines would change the values obtained for a given seed.
# Weights are standard-Cauchy distributed (presumably random Fourier features
# for a Laplacian-type kernel -- TODO confirm intent with the author).
random_weights = random_state.standard_cauchy(size = (n_features, n_components))
# Phase offsets drawn uniformly from [0, 2*pi).
random_offset = random_state.uniform(0, 2 * np.pi, size = n_components)
def transform(x_original):
    """Map input data onto the module's random cosine features.

    The result is ``cos(x_original . random_weights + random_offset)``
    multiplied by ``sqrt(2) / sqrt(n_components)``, using the module-level
    ``random_weights``, ``random_offset`` and ``n_components``.

    Parameters
    ----------
    x_original : array-like
        Input whose feature dimension matches the rows of ``random_weights``.

    Returns
    -------
    The freshly allocated product array, transformed in place; it has one
    entry per random component for each input sample.

    Notes
    -----
    Earlier experiments in this function (input scaling / normalisation and
    the scikit-learn RBFSampler, Nystroem and AdditiveChi2Sampler
    approximations) were disabled in favour of this hand-written map.
    """
    scale = np.sqrt(2.) / np.sqrt(n_components)

    features = safe_sparse_dot(x_original, random_weights)
    features += random_offset
    # Overwrite the buffer with its cosine instead of allocating a new array.
    np.cos(features, out=features)
    features *= scale
    return features
if __name__ == "__main__":
    # Streaming mapper: read "label feature feature ..." lines from stdin,
    # train an averaged linear SVM (hinge-loss SGD) one sample at a time on
    # the random cosine features from transform(), then write the learned
    # weight vector to stdout as a single space-separated line.
    #
    # n_iter is intentionally not passed: partial_fit always performs a single
    # pass over the data it is given, and the argument no longer exists in
    # scikit-learn >= 0.21.
    clf = SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=0.0001 / 13.775,
        l1_ratio=0.15,
        fit_intercept=False,
        shuffle=True,
        verbose=0,
        epsilon=0.1,
        n_jobs=1,
        random_state=1,
        learning_rate='optimal',
        eta0=0.0,
        power_t=0.5,
        class_weight=None,
        warm_start=False,
        average=True,
    )

    # Tuning log kept from the original experiments (setting and score as
    # recorded by the author); baseline: alpha = 0.0001 = a, l1_ratio = 0.15
    #   0.000001  --> 0.779509   a/100
    #   0.00001   --> 0.813194   a/10
    #   0.0001/20 --> 0.811584   a/20
    #   0.0001/15 --> 0.813287   a/15 !!
    #   0.0001/12 --> 0.807602   a/12
    #   0.0001/17 --> 0.811538
    #   0.814882 <-- l1 ratio 0.5 L1
    #   0.819020 <-- l1 ratio 0.5 L1, average = true, alpha = a/15 !!
    #   0.817806 <-- l2 ratio 0.15, average = true, alpha = a/15
    #   0.818444 <-- l2 ratio 0.15, average = true, alpha = a/100
    #   0.818413 <-- l2 ratio 0.15, average = true, alpha = a/50   0.819020

    count = 0  # number of training samples consumed
    for line in sys.stdin:
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline) carry no sample; the
            # two-way split below would otherwise fail to unpack.
            continue
        label, x_string = line.split(" ", 1)
        x_original = np.fromstring(x_string, sep=' ')
        # transform() returns a 1-D vector for a single 1-D sample, while
        # partial_fit expects a 2-D (n_samples, n_features) array, so present
        # the sample as a one-row matrix.
        x = transform(x_original).reshape(1, -1)
        clf.partial_fit(x, [int(label)], CLASSES)
        count += 1

    if count == 0:
        # clf.coef_ only exists after at least one partial_fit call.
        sys.stderr.write(
            "mapper.py: no training samples on stdin; no weights emitted\n")
    else:
        # One line of space-separated weights; written through sys.stdout so
        # the script runs unchanged on Python 2 and Python 3.
        sys.stdout.write(" ".join(str(w) for w in clf.coef_[0]))
        sys.stdout.write("\n")
# Example pipeline (train, combine the mapper output, then evaluate):
#   cat training_set.txt | python mapper.py | python reducer.py > r_weights.txt
#   python evaluate.py r_weights.txt test_data.txt test_label.txt /Users/Charles/Desktop/