def prepare_test_data():
    """Load the test CSV, preprocess each row, and extract its features.

    Returns:
        list[dict]: one dict per CSV row, augmented in place with a
        ``"preprocessed_kalimat"`` key plus whatever keys ``Features``
        adds during extraction.
    """
    log.write("Open test set")
    with open(TESTING_FILE, "r") as csv_file:
        # Materialize inside the `with` block: DictReader is lazy and the
        # underlying file is closed as soon as we leave this scope.
        test_data = list(csv.DictReader(csv_file))

    log.write("Preprocessed test set")
    for data in test_data:
        data["preprocessed_kalimat"] = Preprocess(data).preprocess()

    log.write("Extract feature test set")
    # Features mutates the row dicts in place — TODO confirm against the
    # Features implementation; no return value is used here.
    Features(test_data).extract_feature()
    return test_data
def main():
    """End-to-end WSD pipeline.

    Loads the training files, reconciles annotator disagreement on the
    doubly-annotated files, reports inventory senses absent from the
    training data, preprocesses and feature-extracts every sentence, dumps
    the embeddings to ``feature.csv``, then trains and cross-evaluates
    several classifiers per ambiguous word, writing each model's test-set
    answers to a timestamped file under ``answers/``.
    """
    log.write("Open dataset")
    dataset = open_dataset([TRAINING_DIR + name for name in TRAINING_FILES])

    log.write("Resolve disagreement data")
    merged_dataset = []
    # Files at indices 2 and 4 carry conflicting annotations; resolve them
    # before merging everything into one flat list.
    disagreement_keys = {TRAINING_DIR + TRAINING_FILES[2],
                         TRAINING_DIR + TRAINING_FILES[4]}
    for key, rows in dataset.items():
        if key in disagreement_keys:
            dataset[key] = disagreement_handling(rows)
        merged_dataset += dataset[key]
    analyze_data(merged_dataset)

    log.write("Analyzing sense")
    # Every sense id observed in training, as "<wid:2digits><sid:2digits>".
    sense_id = {datum["sense"] for datum in merged_dataset}
    # Warn about inventory senses that never occur in the training data.
    xml_root = ET.parse(SENSE_FILES).getroot()
    for word in xml_root:
        for sense in word.findall("senses/sense"):
            sid = word.attrib["wid"].zfill(2) + sense.attrib["sid"].zfill(2)
            if sid not in sense_id:
                log.write(
                    "Kata `{}` dengan sense `{}` tidak ditemukan di data training"
                    .format(word[0].text, sense.attrib))

    log.write("Preprocessing")
    for data in merged_dataset:
        data["preprocessed_kalimat"] = Preprocess(data).preprocess()
    print(merged_dataset[0])

    log.write("Feature extraction")
    # Features mutates the row dicts in place (adds "data_embedding" etc.).
    Features(merged_dataset).extract_feature()

    # Dump embeddings for inspection. newline="" is required by the csv
    # module to avoid spurious blank rows on Windows.
    with open("feature.csv", "w", newline="") as csv_file:
        csv_writer = csv.writer(csv_file)
        for data in merged_dataset:
            if "data_embedding" in data:
                # "\ufeffkalimat_id": the source CSV was saved with a UTF-8
                # BOM, which DictReader keeps glued to the first field name.
                csv_writer.writerow(
                    [data["\ufeffkalimat_id"], data["kata"], data["sense"]]
                    + list(data["data_embedding"]))

    log.write("Build Dataset")
    word_feature_mat, dummy_train, dummy_test = build_dataset(merged_dataset)

    classifier = {
        "Random Forest": RandomForestClassifier(n_estimators=1000),
        "SVM": SVC(C=10000, gamma=0.1, tol=1e-6,
                   decision_function_shape='ovo'),
        "Neural Net": MLPClassifier(hidden_layer_sizes=2000,
                                    activation='tanh',
                                    solver='adam',
                                    tol=1e-6,
                                    learning_rate_init=0.001,
                                    max_iter=1000,
                                    early_stopping=True),
    }

    test_data = prepare_test_data()
    for model_name, model in classifier.items():
        log.write("Try {} :".format(model_name))
        true_count = 0
        n_data = 0
        ansfile = "answers/{}_{}.csv".format(model_name, int(time.time()))
        # One independent fit/evaluation per ambiguous word; sklearn
        # estimators discard previous state on each fit() call.
        for word in sorted(word_feature_mat.keys()):
            print("predicting {}".format(word))
            # Accuracy estimate on the held-out dummy split.
            model.fit(dummy_train[word][0], dummy_train[word][1])
            prediction = model.predict(dummy_test[word][0])
            n_data += len(prediction)
            true_count += sum(
                1 for pred, true in zip(prediction, dummy_test[word][1])
                if pred == true)
            # Refit on the full word data before answering the real test set.
            model.fit(word_feature_mat[word][0], word_feature_mat[word][1])
            actual_test(test_data, model, word, ansfile)
        # Guard against an empty evaluation split instead of dividing by 0.
        accuracy = 100 * true_count / n_data if n_data else 0.0
        log.write("Akurasi dari {} : {} %".format(model_name, accuracy))