# Imports used below. The source of kappa is an assumption: the print string
# names the quadratic weighted kappa, so ml_metrics.quadratic_weighted_kappa
# is a plausible origin; the joblib path matches scikit-learn versions
# contemporaneous with this Python 2 code.
import sys
import pickle

import numpy as np
from sklearn import svm
from sklearn.externals import joblib

import fileparse as docR
from ml_metrics import quadratic_weighted_kappa as kappa  # assumed metric source


def begin_testing(filename, classifier):
    print "\nLoading the test data..."
    test_docs = docR.get_list(filename)
    data = []
    target = []
    for doc in test_docs:
        # The last element of each document's feature vector is the target score.
        data.append(doc.vector[:-1])
        target.append(doc.vector[-1])
    np_data = np.array(data)
    np_target = np.array(target)
    results = classifier.predict(np_data)
    kp = kappa(np_target, results)
    print "\nThe Average Quadratic Weighted Kappa obtained is: ", kp, "\n"
    print "=" * 50
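# A minimal usage sketch (illustrative, not from the original source): assuming
# a model persisted with joblib by the training branch below, testing would be
# driven like this; "model_file" and "test_file" are placeholder names.
#
#     classifier = joblib.load("model_file")
#     begin_testing("test_file", classifier)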
if __name__ == "__main__":
    if len(sys.argv) < 5:
        print "USAGE: $ python run.py [-n | -o] input_file model_file data_file -t test_file"
        sys.exit(0)
    print "\n"
    print "=" * 50
    if sys.argv[1] == "-n":
        print "\nTraining the model..."
        docs_list = docR.get_list(sys.argv[2])
        classifier = svm.SVR()
        data = []
        target = []
        for doc in docs_list:
            data.append(doc.vector[:-1])
            target.append(doc.vector[-1])
        np_data = np.array(data)
        np_target = np.array(target)
        classifier.fit(np_data, np_target)
        # Persist the trained model and the raw training data.
        joblib.dump(classifier, sys.argv[3])
        save_data = data  # note: this aliases data, so the append mutates both
        save_data.append(target)
        string = pickle.dumps(save_data)
        ofp = open(sys.argv[4], 'w')
        ofp.write(string)
        ofp.close()
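# The "-o" and "-t" branches from the USAGE string are not shown above. A
# hypothetical sketch of the test hook, assuming the flag layout given in
# USAGE (an illustration, not the project's actual code):
#
#     if "-t" in sys.argv:
#         test_file = sys.argv[sys.argv.index("-t") + 1]
#         begin_testing(test_file, classifier)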
'''
Module to extract bag of words and generate term-document matrix.
'''
import textmining as txtm

import fileparse as docR


def make_tdm(docs_list):
    # One TermDocumentMatrix per essay set; sets are numbered 1-8,
    # so index 0 is left unused.
    textMatrices = []
    for x in xrange(0, 9):
        textMatrix = txtm.TermDocumentMatrix()
        textMatrices.append(textMatrix)
    for doc in docs_list:
        doc_set = int(doc.essay_set)
        textMatrices[doc_set].add_doc(doc.essay)
    return textMatrices


if __name__ == "__main__":
    docs_list = docR.get_list()
    tdMatrices = make_tdm(docs_list)
    for tdm in tdMatrices:
        # cutoff is the number of documents a word must occur in for it
        # to be placed in the bag of words.
        for row in tdm.rows(cutoff=1):
            print row
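# A hedged sketch of turning one matrix into numeric features: with the
# textmining package, rows() yields the vocabulary as its first row and then
# one count row per document, so the counts can be stacked into a numpy
# array (the numpy usage is illustrative; the module above only prints rows).
#
#     import numpy as np
#     rows = list(tdMatrices[1].rows(cutoff=1))
#     vocab, counts = rows[0], np.array(rows[1:])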