def train_vectorized(feats, Y, model_path=None, grid=False): # Vectorize labels labels = [ labels_map[y] for y in Y ] Y = np.array( labels ) # Vectorize feature dictionary vec = DictVectorizer() X = vec.fit_transform(feats) norm_mat( X , axis=0 , copy=False) # Grid Search if grid: print 'Performing Grid Search' clf = do_grid_search(X, Y) else: #clf = LinearSVC(C=0.1, class_weight='auto') #clf = LogisticRegression(C=0.1, class_weight='auto') clf = SGDClassifier(penalty='elasticnet',alpha=0.001, l1_ratio=0.85, n_iter=1000,class_weight='auto') clf.fit(X, Y) # Save model if model_path: with open(model_path+'.dict' , 'wb') as f: pickle.dump(vec, f) with open(model_path+'.model', 'wb') as f: pickle.dump(clf, f) # return model return vec, clf
def predict_vectorized(feats, clf, vec):
    """Predict string labels for dict-vectorized features.

    Args:
        feats: list of feature dictionaries, one per sample.
        clf: trained classifier (must support ``predict``).
        vec: the DictVectorizer fitted at training time.

    Returns:
        list of string labels, mapped back through ``reverse_labels_map``.
    """
    # Vectorize feature dictionaries with the training-time vectorizer.
    # NOTE: transform(), never fit(), during prediction — the feature
    # space must match the one the model was trained on.
    vectorized = vec.transform(feats)
    norm_mat(vectorized, axis=0, copy=False)

    # Removed debugging leftovers from the original: pickling
    # confidences/labels to hardcoded files 'stuff-a'/'stuff-b' on every
    # call, an unused decision_function() computation, and a dead
    # `if False:` "negative -> neutral" adjustment loop that copied the
    # label list verbatim. Returned labels are unchanged.
    predictions = clf.predict(vectorized)
    labels = [reverse_labels_map[y] for y in predictions]

    return labels