def train_test_mode(postrain, negtrain, postest, negtest, ngram, params_str, pout_f):
    """Train a liblinear model on the training files, then score the test files.

    Steps, in order:
      1. count n-grams in both training files (``nbt.counting``) and derive
         the feature dictionary (``nbt.abstract_features``);
      2. rewind the training files and vectorize them with
         ``vectorize_docs_using_bool_feature``;
      3. train with ``linearutil.train`` using ``params_str``;
      4. vectorize the test files, predict, print the three summary values
         and write per-document details to ``pout_f``.

    Args:
        postrain: open, seekable file of positive training documents.
        negtrain: open, seekable file of negative training documents.
        postest: open file of positive test documents.
        negtest: open file of negative test documents.
        ngram: n-gram setting forwarded to the counting/vectorizing helpers.
        params_str: option string handed unchanged to ``linearutil.train``
            (e.g. ``-s 1`` for L2-regularized L2-loss SVM).
        pout_f: open, writable file receiving the prediction details.

    All five file objects are closed before returning, including when an
    exception interrupts the run.
    """
    labels = [fileprocessing.POSITIVE_LABEL, fileprocessing.NEGATIVE_LABEL]
    try:
        logging.info("counting")
        pos_con = nbt.counting(postrain, ngram)
        neg_con = nbt.counting(negtrain, ngram)

        logging.info("abstract features")
        dic = nbt.abstract_features(pos_con, neg_con)

        logging.info("generate training data in libSVM format")
        # The counting pass read both training files to the end; rewind them
        # so the vectorizing pass starts from the first document again.
        postrain.seek(0, os.SEEK_SET)
        negtrain.seek(0, os.SEEK_SET)
        pos_f_vecs = vectorize_docs_using_bool_feature(postrain, dic, ngram)
        neg_f_vecs = vectorize_docs_using_bool_feature(negtrain, dic, ngram)
        train_y, train_x = fileprocessing.ready_SVM_data(
            labels, [pos_f_vecs, neg_f_vecs])

        logging.info("training using liblinear ")
        model = linearutil.train(train_y, train_x, params_str)

        logging.info("generate testing data")
        pos_t = vectorize_docs_using_bool_feature(postest, dic, ngram)
        neg_t = vectorize_docs_using_bool_feature(negtest, dic, ngram)
        test_y, test_x = fileprocessing.ready_SVM_data(labels, [pos_t, neg_t])

        p_labels, p_acc, _ = linearutil.predict(test_y, test_x, model)
        # Single parenthesized argument: prints the same text whether or not
        # print is a statement in the running interpreter.
        print("ACC:%.2f%% MSE:%.2f SCC:%.2f" % (p_acc[0], p_acc[1], p_acc[2]))

        # Document ids are "<label>_<index>" with a 0-based index per class,
        # positives first. NOTE(review): the previous code carried unused
        # "idx = 1" / "idx += 1" statements, which may mean 1-based ids were
        # intended; the effective 0-based numbering is kept here.
        doc_ids = [
            "_".join([str(fileprocessing.POSITIVE_LABEL), str(idx)])
            for idx in range(len(pos_t))
        ]
        doc_ids.extend(
            "_".join([str(fileprocessing.NEGATIVE_LABEL), str(idx)])
            for idx in range(len(neg_t))
        )
        fileprocessing.output_predict_detail(doc_ids, test_y, p_labels, pout_f)
    finally:
        # Release every handle even if a step above raised.
        pout_f.close()
        postrain.close()
        negtrain.close()
        postest.close()
        negtest.close()
def cv_mode(pos, neg, ngram, cv_num, params_str):
    """Run ``cv_num``-fold cross validation with liblinear on two document files.

    The n-gram counts of both files (``nbt.counting``) are turned into a
    feature dictionary (``nbt.abstract_features``); the files are then
    rewound, vectorized with ``vectorize_docs_using_bool_feature`` and passed
    to ``linearutil.train`` with ``" -v <cv_num>"`` appended to ``params_str``.

    Args:
        pos: open, seekable file of positive documents.
        neg: open, seekable file of negative documents.
        ngram: n-gram setting forwarded to the counting/vectorizing helpers.
        cv_num: number of folds; must be usable with ``%d`` formatting.
        params_str: liblinear option string (e.g. ``-s 1 -c 0.1``).

    Returns nothing. Both files are closed before returning, including when
    an exception interrupts the run.
    """
    try:
        logging.info("counting")
        pos_con = nbt.counting(pos, ngram)
        neg_con = nbt.counting(neg, ngram)

        logging.info("abstract features")
        dic = nbt.abstract_features(pos_con, neg_con)

        logging.info("generate data in libSVM format")
        # The counting pass read both files to the end; rewind them so the
        # vectorizing pass starts from the first document again.
        pos.seek(0, os.SEEK_SET)
        neg.seek(0, os.SEEK_SET)
        pos_f_vecs = vectorize_docs_using_bool_feature(pos, dic, ngram)
        neg_f_vecs = vectorize_docs_using_bool_feature(neg, dic, ngram)
        Y, X = fileprocessing.ready_SVM_data(
            [fileprocessing.POSITIVE_LABEL, fileprocessing.NEGATIVE_LABEL],
            [pos_f_vecs, neg_f_vecs])

        logging.info("%d-fold cross validing using liblinear " % (cv_num))
        # The value returned by train() is not needed by this function, so it
        # is intentionally not bound to a name.
        linearutil.train(Y, X, params_str + " -v " + str(cv_num))
    finally:
        # Release both handles even if a step above raised.
        pos.close()
        neg.close()
def train_test_mode(postrain, negtrain, postest, negtest, ngram, params_str, pout_f):
    """Fit a liblinear classifier on the training files and evaluate on the test files.

    The feature dictionary is built from n-gram counts of the two training
    files (``nbt.counting`` followed by ``nbt.abstract_features``). Training
    and test documents are vectorized with
    ``vectorize_docs_using_bool_feature``, the model is trained through
    ``linearutil.train`` and applied through ``linearutil.predict``. The three
    summary values from the prediction are printed as ACC / MSE / SCC and the
    per-document results are written to ``pout_f`` via
    ``fileprocessing.output_predict_detail``.

    Args:
        postrain: open, seekable file of positive training documents.
        negtrain: open, seekable file of negative training documents.
        postest: open file of positive test documents.
        negtest: open file of negative test documents.
        ngram: n-gram setting forwarded to the counting/vectorizing helpers.
        params_str: option string passed as-is to ``linearutil.train``
            (e.g. ``-s 1`` for L2-regularized L2-loss SVM).
        pout_f: open, writable file for the prediction details.

    Every file object passed in is closed on exit, whether the run succeeds
    or raises.
    """
    class_labels = [fileprocessing.POSITIVE_LABEL, fileprocessing.NEGATIVE_LABEL]
    handles = (pout_f, postrain, negtrain, postest, negtest)
    try:
        logging.info("counting")
        pos_con = nbt.counting(postrain, ngram)
        neg_con = nbt.counting(negtrain, ngram)

        logging.info("abstract features")
        dic = nbt.abstract_features(pos_con, neg_con)

        logging.info("generate training data in libSVM format")
        # Counting consumed the training files; move back to the head before
        # reading them a second time.
        postrain.seek(0, os.SEEK_SET)
        negtrain.seek(0, os.SEEK_SET)
        train_vecs = [
            vectorize_docs_using_bool_feature(postrain, dic, ngram),
            vectorize_docs_using_bool_feature(negtrain, dic, ngram),
        ]
        train_y, train_x = fileprocessing.ready_SVM_data(class_labels, train_vecs)

        logging.info("training using liblinear ")
        model = linearutil.train(train_y, train_x, params_str)

        logging.info("generate testing data")
        pos_t = vectorize_docs_using_bool_feature(postest, dic, ngram)
        neg_t = vectorize_docs_using_bool_feature(negtest, dic, ngram)
        test_y, test_x = fileprocessing.ready_SVM_data(class_labels, [pos_t, neg_t])

        p_labels, p_acc, _ = linearutil.predict(test_y, test_x, model)
        # One parenthesized argument keeps the output identical whether print
        # is a statement or a function in the running interpreter.
        print("ACC:%.2f%% MSE:%.2f SCC:%.2f" % (p_acc[0], p_acc[1], p_acc[2]))

        # Ids have the form "<label>_<index>", index 0-based within each
        # class, positives listed before negatives.
        # NOTE(review): the earlier code had no-op "idx = 1" / "idx += 1"
        # statements around these loops; if 1-based ids were the intent, that
        # never took effect. The effective numbering is preserved.
        doc_ids = []
        for label, vecs in zip(class_labels, (pos_t, neg_t)):
            prefix = str(label)
            for position in range(len(vecs)):
                doc_ids.append("_".join([prefix, str(position)]))

        fileprocessing.output_predict_detail(doc_ids, test_y, p_labels, pout_f)
    finally:
        # Close in the same order as before, now also on the error path.
        for handle in handles:
            handle.close()
def cv_mode(pos, neg, ngram, cv_num, params_str):
    """Cross-validate a liblinear model over the positive and negative files.

    Builds the feature dictionary from n-gram counts of both files
    (``nbt.counting`` then ``nbt.abstract_features``), rewinds and vectorizes
    the files with ``vectorize_docs_using_bool_feature``, and calls
    ``linearutil.train`` with ``params_str`` extended by ``" -v <cv_num>"``.

    Args:
        pos: open, seekable file of positive documents.
        neg: open, seekable file of negative documents.
        ngram: n-gram setting forwarded to the counting/vectorizing helpers.
        cv_num: number of folds; formatted with ``%d`` for the log message.
        params_str: liblinear option string (e.g. ``-s 1 -c 0.1``).

    Returns nothing. ``pos`` and ``neg`` are closed on exit, whether the run
    succeeds or raises.
    """
    class_labels = [fileprocessing.POSITIVE_LABEL, fileprocessing.NEGATIVE_LABEL]
    try:
        logging.info("counting")
        pos_con = nbt.counting(pos, ngram)
        neg_con = nbt.counting(neg, ngram)

        logging.info("abstract features")
        dic = nbt.abstract_features(pos_con, neg_con)

        logging.info("generate data in libSVM format")
        # Counting consumed both files; move back to the head before reading
        # them a second time.
        pos.seek(0, os.SEEK_SET)
        neg.seek(0, os.SEEK_SET)
        doc_vecs = [
            vectorize_docs_using_bool_feature(pos, dic, ngram),
            vectorize_docs_using_bool_feature(neg, dic, ngram),
        ]
        Y, X = fileprocessing.ready_SVM_data(class_labels, doc_vecs)

        logging.info("%d-fold cross validing using liblinear " % (cv_num))
        cv_params = params_str + " -v " + str(cv_num)
        # The return value of train() is unused here, so it is not stored.
        linearutil.train(Y, X, cv_params)
    finally:
        # Close the inputs on the error path as well as on success.
        pos.close()
        neg.close()