Exemplo n.º 1
0
        if "classifier__kernel" in model_params:
            model_params["classifier__kernel"] = str(model_params["classifier__kernel"])

    if test_type == "diagnostic":
        tag_attr = "diag_tag"
        TARGET_POSITIVE = "p"
    elif test_type == "sentiment":
        tag_attr = "sent_tag"
        TARGET_POSITIVE = "p"
    elif test_type == "class":
        tag_attr = "report_class"
        TARGET_POSITIVE = TARGET_CLASS
    else:
        raise ValueError("Unknown tag: " + test_type)

    data = data_utils.read_from_csv(data_file)
    filtered_data = [x for x in data if getattr(x, tag_attr) != "" and getattr(x, tag_attr) != "u"]
    filtered_data = filtered_data[:2500]  # put a limit on the size for performance

    labels = [np.float32(getattr(x, tag_attr) == TARGET_POSITIVE) for x in filtered_data]
    report_ids = [x.report_id for x in filtered_data]
    sentences = [x.processed_sentence for x in filtered_data]

    train_data, train_labels, test_data, test_labels = data_utils.split_data(sentences, labels, report_ids, split_value)

    # Create transformation pipeline
    if USE_RF:
        pipe = pipelines.get_count_lsi_randomforest()
    else:
        pipe = pipelines.get_count_lsi_SVM()
Exemplo n.º 2
0
from sklearn.metrics import classification_report, roc_curve, auc
import json
import data_utils
from sklearn.grid_search import GridSearchCV
import numpy as np
import pipelines

if __name__ == "__main__":
    # The script takes two positional arguments, so argv must hold exactly
    # three entries: the script name, the input file and the output model
    # file.  Checking against 3 keeps the sys.argv[2] read below in range.
    if len(sys.argv) != 3:
        # Single-argument parenthesised form prints the same text under both
        # the Python 2 print statement and the Python 3 print function.
        print("USAGE: " + sys.argv[0] + " input_file output_model_file")
        sys.exit(1)

    input_file = sys.argv[1]
    output_model_file = sys.argv[2]

    data = data_utils.read_from_csv(input_file)

    # Keep only records whose diagnostic tag is neither empty nor "u"
    # ("u" presumably marks an unknown/unlabelled record -- TODO confirm).
    filtered_data = [x for x in data if x.diag_tag != "" and x.diag_tag != "u"]
    # Binary target: 1.0 where the tag equals "p", 0.0 for every other tag.
    labels = [np.float32(x.diag_tag == "p") for x in filtered_data]
    # NOTE: `data` is rebound here from the raw records to their sentences.
    data = [x.processed_sentence for x in filtered_data]
    report_ids = [x.report_id for x in filtered_data]

    # split=0.7 is presumably the training fraction; the exact semantics live
    # in data_utils.split_data -- verify there.
    train_data, train_labels, test_data, test_labels = data_utils.split_data(data, labels, report_ids, split=0.7)

    # change these parameters for the grid search
    # parameters = {'lsi__n_components': [100],
    #               'classifier__C': [3, 4, 5, 6, 7, 8, 9, 10],
    #               'classifier__kernel': ["rbf"]
    #               }

    parameters = {'lsi__n_components': [100],