# Sweep the chi2 feature-selection percentile (1, 6, ..., 96) and record the
# mean cross-validated precision obtained for each setting in `results`.
results = []
for perc in range(1, 100, 5):
    # One precision score per fold for the current percentile.
    p = np.empty([numFolds])
    ch2 = SelectPercentile(chi2, percentile=perc)
    # Perform numFolds-fold cross-validation.
    for i in range(0, numFolds):
        # Preprocessing: tokenization + TF-IDF weighting, then keep the top
        # `perc` percent of the features as ranked by chi2.
        # Pipeline.fit() fits every step on the training fold itself, so the
        # vectorizer and the selector are not fitted separately beforehand.
        vectorizer = TfidfVectorizer(tokenizer=uux_preprocessing.tokenize)
        classifier = Pipeline([
            ('tfidf', vectorizer),
            ('chi2', ch2),
            ('clf', OneVsRestClassifier(LinearSVC()))])
        classifier.fit(x_train_folds[i], y_train_folds[i])
        predicted = classifier.predict(x_test_folds[i])
        # Score the fold once and reuse the value for printing and storing.
        fold_precision = metrics.precision_score(y_test_folds[i], predicted)
        print(fold_precision)
        p[i] = fold_precision
    print(p)
    results = np.append(results, p.mean())
print("Results")
from sklearn.externals import joblib

# Get the data_txt from DB.
# NOTE(review): the uux_data helpers are defined elsewhere; presumably they
# return the sentences, their dimension labels and the dimension names for
# the first `numDimensions` dimensions - confirm against uux_data.
numDimensions = 22
numFolds = 5
X_train = uux_data.getUUXSentences(numDimensions)
y_train = uux_data.getUUXSentenceDimension(numDimensions)
# Turn the per-sentence label collections into a binary indicator matrix.
y_train_binary = MultiLabelBinarizer().fit_transform(y_train)
target_names = uux_data.getUUXDimensions(numDimensions)

# Preprocessing: tokenization + TF-IDF weighting, then keep the top 16% of
# the features as ranked by chi2.
vectorizer = TfidfVectorizer(tokenizer=uux_preprocessing.tokenize)
X_train_features = vectorizer.fit_transform(X_train)
# fit_transform() has already learned the vocabulary; read it directly
# instead of fitting the vectorizer on the whole corpus a second time.
X_train_features_names = vectorizer.vocabulary_
ch2 = SelectPercentile(chi2, percentile=16)
X_train_features = ch2.fit_transform(X_train_features, y_train_binary)
selected_features_names = np.asarray(
    vectorizer.get_feature_names())[ch2.get_support()]
# Report how many features survived the selection.
print(str(len(selected_features_names)))

# Train the full pipeline on the raw sentences and persist it to disk.
classifier = Pipeline([
    ('tfidf', vectorizer),
    ('chi2', ch2),
    ('clf', OneVsRestClassifier(LinearSVC()))])
classifier.fit(X_train, y_train_binary)
joblib.dump(classifier, 'classifier/uux_classifier.pkl')
#Conctatenate the 4 folds for training for j in range(0, numFolds): if (i != j): fold_x_train = np.concatenate((fold_x_train, folds_X_train[j])) fold_y_train = np.concatenate((fold_y_train, folds_y_train[j])) fold_x_test = folds_X_train[i] fold_y_test = folds_y_train[i] #data_txt preproccessing - tokenization, selecting 90% of the best features vectorizer = TfidfVectorizer(tokenizer=uux_preprocessing.tokenize) X_train_features = vectorizer.fit_transform(fold_x_train) X_train_features_names = vectorizer.fit(fold_x_train).vocabulary_ ch2 = SelectPercentile(chi2, percentile=16) X_train_features = ch2.fit_transform(X_train_features, fold_y_train) selected_features_names = np.asarray( vectorizer.get_feature_names())[ch2.get_support()] print str(len(selected_features_names)) classifier = Pipeline([('tfidf', vectorizer), ('chi2', ch2), ('clf', OneVsRestClassifier(LinearSVC()))]) classifier.fit(fold_x_train, fold_y_train) predicted = classifier.predict(fold_x_test) print classification_report(fold_y_test, predicted, target_names=target_names) p[i], r[i], f1[i], s[i] = metrics.precision_recall_fscore_support(