Exemplo n.º 1
0
# Vectorizers to evaluate; `tfidf1` is defined outside this excerpt.
vectorizers = [tfidf1]
tfidf = vectorizers[0]
# Free-text label for the current experiment (previous label kept for reference).
#comment = 'lsa = 1, tfidf2, 175000 -> 1000'
comment = 'tfidf1, transition 75'

# Target matrix: every column of `t` from position 4 onwards.
# `.iloc` replaces the `.ix` indexer, which was removed in pandas 1.0.  `t` has
# string column labels ('tweet', 'state'), so the integer slice was already
# positional under `.ix` and the selection is unchanged.
# (A further `[:,9:]` restriction was left commented out in the original.)
y = np.array(t.iloc[:, 4:])
# Independent copy of the full target matrix; `y` itself is truncated to the
# training rows further down.
y_original = np.array(y)

# Fraction of rows held out for cross-validation.
cv_split = 0.2

# Build the tweet list once instead of re-materialising it for every slice.
tweets = t['tweet'].tolist()
n = len(tweets)
train_end = int(np.round(n*(1-cv_split)))
# With cv_split == 0 the CV set falls back to the last 20% of the rows, so a
# CV set still exists even though training then uses every row.
cv_beginning = int(np.round(n*(1-cv_split if cv_split > 0 else 0.8)))

train = tweets[0:train_end]
cv_X_original = np.array(tweets[cv_beginning:])
cv_y = np.array(y[cv_beginning:])
# NOTE(review): presumably encodes the 'state' strings as class ids -- confirm
# against `u.strings_to_classes`.
c = u.strings_to_classes(t['state'])

if cv_split == 0:
    # No hold-out: train on every tweet (fresh list, not an alias of `tweets`).
    train = list(tweets)
else:
    # Keep only the targets that belong to the training rows.
    y = y[0:train_end]

# Accumulators for the predictions gathered across vectorizers; they are only
# initialised in this excerpt.
prediction_grand_all = 0
predict_cv_grand_all = 0
list_predictions = []
list_predictions_test = []
for tfidf in vectorizers:
    # Parenthesised single-argument print emits the same text on Python 2 and
    # is also valid on Python 3, where the bare print statement is a SyntaxError.
    print('fitting vectorizer...')
    # The vectorizer is fitted on the tweets of both frames, `t` and `t2`.
    tfidf.fit(t['tweet'].tolist() + t2['tweet'].tolist())
    print('transforming train set...')
    #train = tfidf.transform(train)
Exemplo n.º 2
0
        sales = dict_sales[key][0]
        if repair_key not in dict_repair:
            dict_repair[repair_key] = [entry[-1],timespan.days,entry[0],entry[1],entry[2],entry[3],sales]
        else:
            dict_repair[repair_key][0] += entry[-1]
    else:
       error_count += 1

# One row per record collected in `dict_repair`; each record is copied so the
# dictionary's own lists are not shared with `data`.
data = [list(value) for value in dict_repair.values()]

X = np.array(data)
# Keep record fields 0, 1, 2, 3 and 6; fields 4 and 5 are dropped.
X = X[:,[0,1,2,3,6]]

# Columns 2 and 3 are passed through the categorical helpers in `u`.
# NOTE(review): presumably `strings_to_classes` maps strings to class ids and
# `create_t_matrix` expands them into indicator columns -- confirm against `u`.
fac1 = u.strings_to_classes(X[:,2])
fac2 = u.strings_to_classes(X[:,3])

t1 = u.create_t_matrix(fac1)
t2 = u.create_t_matrix(fac2)

# Final matrix: columns 0, 1 and 4 cast to float32, followed by the two
# matrices built from the categorical columns.
X = np.hstack([np.float32(X[:,[0,1,4]]),t1,t2])
# Parenthesised print produces the same output on Python 2 and is also valid
# on Python 3, where the bare print statement is a SyntaxError.
print(X.shape)

np.save('/home/tim/Downloads/repair/train.npy',X)
print('Saved!')

#TODO: use util to create categories
#print(t1.ix[0:5,:])
#print(t2.ix[0:5,:])