Example No. 1
# Assumed context: `log`, `TESTING_FILE`, `Preprocess`, and `Features` are
# module-level helpers defined elsewhere in this project.
import csv


def prepare_test_data():
    test_data = []
    log.write("Open test set")
    with open(TESTING_FILE, "r") as csv_file:
        csv_data = csv.DictReader(csv_file)
        for row in csv_data:
            test_data.append(row)
    log.write("Preprocessed test set")
    for data in test_data:
        data["preprocessed_kalimat"] = Preprocess(data).preprocess()
    log.write("Extract feature test set")
    feature = Features(test_data)
    feature.extract_feature()
    return test_data
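
A minimal usage sketch, assuming `TESTING_FILE` points at an existing CSV file
with a header row; each returned record is the original CSV row plus the
`preprocessed_kalimat` key added above (the slice is only for illustration):

test_data = prepare_test_data()
for row in test_data[:3]:
    print(row["preprocessed_kalimat"])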
Example No. 2
# Assumed context: `log`, `open_dataset`, `disagreement_handling`, `analyze_data`,
# `build_dataset`, `actual_test`, and the TRAINING_*/SENSE_FILES constants are
# defined elsewhere in this project.
import csv
import time
import xml.etree.ElementTree as ET

from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC


def main():
    log.write("Open dataset")
    dataset = open_dataset([TRAINING_DIR + fname for fname in TRAINING_FILES])
    merged_dataset = []
    log.write("Resolve disagreement data")
    for k, v in dataset.items():
        if k in (TRAINING_DIR + TRAINING_FILES[2],
                 TRAINING_DIR + TRAINING_FILES[4]):
            dataset[k] = disagreement_handling(v)
        merged_dataset += dataset[k]
    analyze_data(merged_dataset)
    log.write("Analyzing sense")
    sense_id = set()
    for datum in merged_dataset:
        sense_id.add(datum["sense"])
    xml_root = ET.parse(SENSE_FILES).getroot()
    for word in xml_root:
        for sense in word.findall("senses/sense"):
            if word.attrib["wid"].zfill(2) + sense.attrib["sid"].zfill(
                    2) not in sense_id:
                log.write(
                    "Kata `{}` dengan sense `{}` tidak ditemukan di data training"
                    .format(word[0].text, sense.attrib))
    log.write("Preprocessing")
    for data in merged_dataset:
        data["preprocessed_kalimat"] = Preprocess(data).preprocess()

    # Sanity check: inspect the first merged record
    print(merged_dataset[0])
    log.write("Feature extraction")
    feature = Features(merged_dataset)
    feature.extract_feature()
    with open("feature.csv", "w") as csv_file:
        csv_writer = csv.writer(csv_file)
        # csv_writer.writerow(["kalimat_id", "sense", "features"])
        for data in merged_dataset:
            if "data_embedding" in data:
                csv_writer.writerow(
                    [data["\ufeffkalimat_id"], data["kata"], data["sense"]] +
                    list(data["data_embedding"]))
    log.write("Build Dataset")
    word_feature_mat, dummy_train, dummy_test = build_dataset(merged_dataset)
    # Candidate models; each is refit per word in the evaluation loop below
    classifier = {
        "Random Forest": RandomForestClassifier(n_estimators=1000),
        "SVM": SVC(C=10000, gamma=0.1, tol=1e-6,
                   decision_function_shape='ovo'),
        "Neural Net": MLPClassifier(hidden_layer_sizes=(2000,),
                                    activation='tanh',
                                    solver='adam',
                                    tol=1e-6,
                                    learning_rate_init=0.001,
                                    max_iter=1000,
                                    early_stopping=True),
    }
    best_model = None
    best_acc = 0.0
    test_data = prepare_test_data()
    for model_name, model_class in classifier.items():
        log.write("Try {} :".format(model_name))
        true_count = 0
        n_data = 0
        model = model_class
        ansfile = "answers/{}_{}.csv".format(model_name, int(time.time()))
        for word in sorted(word_feature_mat):
            print("predicting {}".format(word))
            model.fit(dummy_train[word][0], dummy_train[word][1])
            prediction = model.predict(dummy_test[word][0])
            n_data += len(prediction)
            for pred, true in zip(prediction, dummy_test[word][1]):
                if pred == true:
                    true_count += 1
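            # Refit on the full per-word dataset before labeling the real test set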
            model = model.fit(word_feature_mat[word][0],
                              word_feature_mat[word][1])
            actual_test(test_data, model, word, ansfile)
        accuracy = 100 * true_count / n_data
        if accuracy > best_acc:
            best_acc = accuracy
            best_model = model_name
        log.write("Accuracy of {}: {} %".format(model_name, accuracy))
    log.write("Best model: {} ({} %)".format(best_model, best_acc))