test_labels = []
# Split every row of the test corpus into the two parallel lists:
# element 0 goes to test_data, element 1 goes to test_labels.
# NOTE(review): test_data, test_corpus, train_data and train_labels are
# defined before this chunk; rows are presumably (text, label) pairs - confirm.
for row in test_corpus:
    test_data.append(row[0])
    test_labels.append(row[1])

# Create the TF-IDF feature extractor.
# NOTE(review): assuming this is scikit-learn's TfidfVectorizer, min_df = 4
# drops terms found in fewer than 4 documents and max_df = 0.9 drops terms
# found in more than 90% of documents - confirm against the file's imports.
vectorizer = TfidfVectorizer(min_df = 4, max_df = 0.9)
# Fit the vectorizer on the training texts and convert them to feature vectors
train_vectors = vectorizer.fit_transform(train_data)
# Convert the test texts with the already-fitted vectorizer (transform only,
# no second fit)
test_vectors = vectorizer.transform(test_data)
# Perform classification with SVM, kernel = linear
model = svm.SVC(kernel = 'linear')
model.fit(train_vectors, train_labels)
prediction = model.predict(test_vectors)
# Print the report comparing the true test labels with the predictions
print(classification_report(test_labels, prediction))


# Text Matching / Similarity
# Levenshtein Distance
# NOTE(review): this definition is cut off in the visible source - the branch
# for non-matching characters, the row hand-over and the return statement are
# not shown here. Only the visible part is reproduced; do not rely on it as-is.
def levenshtein(s1, s2):
    # Swap the arguments when s1 is the longer one, so s1 is never longer
    # than s2 in the loops below.
    if len(s1) > len(s2):
        s1, s2 = s2, s1
    # Starting row: 0, 1, ..., len(s1)
    distances = range(len(s1) + 1)
    for index2, char2 in enumerate(s2):
        # Each new row starts with index2 + 1
        newDistances = [index2 + 1]
        for index1, char1 in enumerate(s1):
            if char1 == char2:
                # Matching characters: carry over the previous row's value
                # at the same index unchanged.
                newDistances.append(distances[index1])
# Random Forest:
# Instead of doing the vectorising, feature-selection and classification steps
# one at a time, chain them in a pipeline so they run together on fit/predict.
randomforest = Pipeline([
    ('vect', vectorizer),
    ('chi', SelectKBest(chi2, k=1200)),
    ('clf', RandomForestClassifier(random_state=42)),
])

# Fit the pipeline and save the fitted model in a pickle for later use.
# NOTE: pickle files execute code on load - only unpickle 'RandomForest.pickle'
# when it comes from a trusted location.
model = randomforest.fit(X_train, y_train)
with open('RandomForest.pickle', 'wb') as f:
    pickle.dump(model, f)

ytest = np.array(y_test)
# Predict once and reuse the result: the original ran model.predict(X_test)
# twice (once per confusion-matrix object), doubling the inference cost.
y_pred = model.predict(X_test)

# Evaluating Results:
# Confusion Matrix:
# con_mat feeds the metric helpers below; confusion_matrix is the object that
# gets printed and plotted.
con_mat = cm(ytest, y_pred)
confusion_matrix = ConfusionMatrix(ytest, y_pred)
print("Confusion matrix for Random Forest:\n%s" % confusion_matrix)
print(
    'For Random Forest model: \nPrecision: {0:.3f} \nRecall: {1:.3f} \nf1score: {2:.3f} \nAccuracy: {3:.3f}'
    .format(precision_average(con_mat), recall_average(con_mat),
            f1score(con_mat), accuracy(con_mat)))

# Confusion matrix plot:
acc = accuracy(con_mat)
confusion_matrix.plot(normalized=True)
plt.title('Random Forest \nAccuracy:{0:.3f}'.format(acc))
plt.show()