def train_test_mode(postrain, negtrain, postest, negtest, ngram, params_str,
                    pout_f):
    """Train a liblinear model on the training files and score the test files.

    Features are derived from the n-gram counts of both training files, every
    document is vectorized with ``vectorize_docs_using_bool_feature``, a model
    is trained with ``linearutil.train`` and then applied to the test
    documents.  The first three entries of ``p_acc`` are printed as
    ACC/MSE/SCC and per-document details are written through
    ``fileprocessing.output_predict_detail``.

    NOTE(review): a function with this same name is defined again further
    down in this file; the later definition replaces this one at import time.

    Args:
        postrain: Seekable, closable stream of positive training documents.
        negtrain: Seekable, closable stream of negative training documents.
        postest: Closable stream of positive test documents.
        negtest: Closable stream of negative test documents.
        ngram: N-gram setting forwarded to counting and vectorization.
        params_str: Option string handed to ``linearutil.train``
            (e.g. "-s 1" for the L2-regularized L2-loss SVM).
        pout_f: Closable output stream that receives the prediction details.

    All five streams are closed before returning, including when a step
    raises.
    """
    try:
        logging.info("counting")
        pos_con = nbt.counting(postrain, ngram)
        neg_con = nbt.counting(negtrain, ngram)
        logging.info("abstract features")
        dic = nbt.abstract_features(pos_con, neg_con)

        logging.info("generate training data in libSVM format")
        # Counting already consumed the training streams, so rewind them
        # before reading the documents a second time.
        postrain.seek(0, os.SEEK_SET)
        negtrain.seek(0, os.SEEK_SET)
        pos_f_vecs = vectorize_docs_using_bool_feature(postrain, dic, ngram)
        neg_f_vecs = vectorize_docs_using_bool_feature(negtrain, dic, ngram)
        Y, X = fileprocessing.ready_SVM_data(
            [fileprocessing.POSITIVE_LABEL, fileprocessing.NEGATIVE_LABEL],
            [pos_f_vecs, neg_f_vecs])

        logging.info("training using liblinear ")
        m = linearutil.train(Y, X, params_str)

        logging.info("generate testing data")
        pos_t = vectorize_docs_using_bool_feature(postest, dic, ngram)
        neg_t = vectorize_docs_using_bool_feature(negtest, dic, ngram)

        Y, X = fileprocessing.ready_SVM_data(
            [fileprocessing.POSITIVE_LABEL, fileprocessing.NEGATIVE_LABEL],
            [pos_t, neg_t])

        p_labels, p_acc, p_vals = linearutil.predict(Y, X, m)

        # Parenthesized single-argument form: valid both as a Python 2 print
        # statement and as a Python 3 print() call.
        print("ACC:%.2f%% MSE:%.2f SCC:%.2f" % (p_acc[0], p_acc[1], p_acc[2]))

        # Ids are "<label>_<index>" with a 0-based index per class.  The
        # previous `idx = 1` / `idx += 1` statements had no effect because
        # the for loop rebinds idx on every iteration, so they were dropped.
        pos_label = str(fileprocessing.POSITIVE_LABEL)
        neg_label = str(fileprocessing.NEGATIVE_LABEL)
        doc_ids = ["_".join([pos_label, str(i)]) for i in range(len(pos_t))]
        doc_ids += ["_".join([neg_label, str(i)]) for i in range(len(neg_t))]
        fileprocessing.output_predict_detail(doc_ids, Y, p_labels, pout_f)
    finally:
        # Release every handle even when a step above fails.
        for handle in (pout_f, postrain, negtrain, postest, negtest):
            handle.close()
def cv_mode(pos, neg, ngram, cv_num, params_str):
    """Run ``cv_num``-fold cross validation with liblinear on two corpora.

    Features are derived from the n-gram counts of both files, every document
    is vectorized with ``vectorize_docs_using_bool_feature`` and the data is
    handed to ``linearutil.train`` with " -v <cv_num>" appended to
    ``params_str``.

    NOTE(review): a function with this same name is defined again further
    down in this file; the later definition replaces this one at import time.

    Args:
        pos: Seekable, closable stream of positive documents.
        neg: Seekable, closable stream of negative documents.
        ngram: N-gram setting forwarded to counting and vectorization.
        cv_num: Number of cross-validation folds.
        params_str: Base liblinear option string (e.g. "-s 1 -c 0.1").

    Returns nothing.  Both streams are closed before returning, including
    when a step raises.
    """
    try:
        logging.info("counting")
        pos_con = nbt.counting(pos, ngram)
        neg_con = nbt.counting(neg, ngram)
        logging.info("abstract features")
        dic = nbt.abstract_features(pos_con, neg_con)

        logging.info("generate data in libSVM format")
        # Counting already consumed the streams, so rewind them before
        # reading the documents a second time.
        pos.seek(0, os.SEEK_SET)
        neg.seek(0, os.SEEK_SET)
        pos_f_vecs = vectorize_docs_using_bool_feature(pos, dic, ngram)
        neg_f_vecs = vectorize_docs_using_bool_feature(neg, dic, ngram)
        Y, X = fileprocessing.ready_SVM_data(
            [fileprocessing.POSITIVE_LABEL, fileprocessing.NEGATIVE_LABEL],
            [pos_f_vecs, neg_f_vecs])

        logging.info("%d-fold cross validing using liblinear ", cv_num)
        # The return value was previously bound to an unused local; it is
        # still discarded.  NOTE(review): with "-v" liblinear presumably
        # returns the cross-validation score rather than a model - confirm
        # before using it.
        linearutil.train(Y, X, params_str + " -v " + str(cv_num))
    finally:
        # Release both handles even when a step above fails.
        for handle in (pos, neg):
            handle.close()
def train_test_mode(postrain, negtrain, postest, negtest, ngram, params_str, pout_f):
    """Train a liblinear model on the training files and score the test files.

    Steps: count n-grams in both training files, derive the feature
    dictionary, vectorize the training documents, train with
    ``linearutil.train``, vectorize the test documents, predict, print the
    first three entries of ``p_acc`` as ACC/MSE/SCC, and write per-document
    details through ``fileprocessing.output_predict_detail``.

    NOTE(review): this redefines a function of the same name that appears
    earlier in this file; this later definition is the one in effect.

    Args:
        postrain: Seekable, closable stream of positive training documents.
        negtrain: Seekable, closable stream of negative training documents.
        postest: Closable stream of positive test documents.
        negtest: Closable stream of negative test documents.
        ngram: N-gram setting forwarded to counting and vectorization.
        params_str: Option string handed to ``linearutil.train``
            (e.g. "-s 1" for the L2-regularized L2-loss SVM).
        pout_f: Closable output stream that receives the prediction details.

    All five streams are closed before returning, including when a step
    raises.
    """
    labels = [fileprocessing.POSITIVE_LABEL, fileprocessing.NEGATIVE_LABEL]
    try:
        logging.info("counting")
        pos_con = nbt.counting(postrain, ngram)
        neg_con = nbt.counting(negtrain, ngram)
        logging.info("abstract features")
        dic = nbt.abstract_features(pos_con, neg_con)

        logging.info("generate training data in libSVM format")
        # The counting pass left both training streams at their end; move
        # back to the start so the documents can be read again.
        postrain.seek(0, os.SEEK_SET)
        negtrain.seek(0, os.SEEK_SET)
        pos_f_vecs = vectorize_docs_using_bool_feature(postrain, dic, ngram)
        neg_f_vecs = vectorize_docs_using_bool_feature(negtrain, dic, ngram)
        Y, X = fileprocessing.ready_SVM_data(labels, [pos_f_vecs, neg_f_vecs])

        logging.info("training using liblinear ")
        m = linearutil.train(Y, X, params_str)

        logging.info("generate testing data")
        pos_t = vectorize_docs_using_bool_feature(postest, dic, ngram)
        neg_t = vectorize_docs_using_bool_feature(negtest, dic, ngram)
        Y, X = fileprocessing.ready_SVM_data(labels, [pos_t, neg_t])

        p_labels, p_acc, p_vals = linearutil.predict(Y, X, m)

        # A single parenthesized argument prints identically under the
        # Python 2 print statement and the Python 3 print() function.
        print("ACC:%.2f%% MSE:%.2f SCC:%.2f" % (p_acc[0], p_acc[1], p_acc[2]))

        # Build "<label>_<index>" ids, 0-based within each class.  The old
        # `idx = 1` / `idx += 1` lines were dead code (the for loop rebinds
        # idx each iteration) and have been removed; the ids are unchanged.
        doc_ids = []
        for label, docs in ((labels[0], pos_t), (labels[1], neg_t)):
            prefix = str(label)
            for position in range(len(docs)):
                doc_ids.append("_".join([prefix, str(position)]))
        fileprocessing.output_predict_detail(doc_ids, Y, p_labels, pout_f)
    finally:
        # Close every handle whether or not the pipeline succeeded.
        for handle in (pout_f, postrain, negtrain, postest, negtest):
            handle.close()
def cv_mode(pos, neg, ngram, cv_num, params_str):
    """Run ``cv_num``-fold cross validation with liblinear on two corpora.

    Steps: count n-grams in both files, derive the feature dictionary,
    vectorize all documents, and call ``linearutil.train`` with
    " -v <cv_num>" appended to ``params_str``.

    NOTE(review): this redefines a function of the same name that appears
    earlier in this file; this later definition is the one in effect.

    Args:
        pos: Seekable, closable stream of positive documents.
        neg: Seekable, closable stream of negative documents.
        ngram: N-gram setting forwarded to counting and vectorization.
        cv_num: Number of cross-validation folds.
        params_str: Base liblinear option string (e.g. "-s 1 -c 0.1").

    Returns nothing.  Both streams are closed before returning, including
    when a step raises.
    """
    labels = [fileprocessing.POSITIVE_LABEL, fileprocessing.NEGATIVE_LABEL]
    try:
        logging.info("counting")
        pos_con = nbt.counting(pos, ngram)
        neg_con = nbt.counting(neg, ngram)
        logging.info("abstract features")
        dic = nbt.abstract_features(pos_con, neg_con)

        logging.info("generate data in libSVM format")
        # The counting pass left both streams at their end; move back to the
        # start so the documents can be read again.
        pos.seek(0, os.SEEK_SET)
        neg.seek(0, os.SEEK_SET)
        pos_f_vecs = vectorize_docs_using_bool_feature(pos, dic, ngram)
        neg_f_vecs = vectorize_docs_using_bool_feature(neg, dic, ngram)
        Y, X = fileprocessing.ready_SVM_data(labels, [pos_f_vecs, neg_f_vecs])

        logging.info("%d-fold cross validing using liblinear ", cv_num)
        # The result used to be stored in an unused local and is still
        # discarded.  NOTE(review): with "-v" liblinear presumably returns
        # the cross-validation score instead of a model - confirm before
        # relying on it.
        linearutil.train(Y, X, params_str + " -v " + str(cv_num))
    finally:
        # Close both handles whether or not the run succeeded.
        for handle in (pos, neg):
            handle.close()