def test(): # load tf-idf with open(os.path.join(root_path, "models/pretrained/tfidf.pkl"), "rb") as f: vectorizer = pickle.load(f) # load validation data val_ques, val_lab = VQADataProvider.label_ques("val") val_ques_matrix = vectorizer.transform(val_ques) # load model with open(os.path.join(root_path, "models/pretrained/pretrained_svm.pkl"), "rb") as f: clf = pickle.load(f) # compute validation accuracy val_acc = clf.score(val_ques_matrix, val_lab) print("validation accuracy") print(val_acc) # compute validation confusion matrix preds = clf.predict(val_ques_matrix) val_c_m = confusion_matrix(val_lab, preds) print("validation confusion matrix") print(val_c_m)
def train(): # prepare the data # utils folder is in the root path, which is /home/leishi/vqa2019 questions, labels = VQADataProvider.label_ques("train_val") vectorizer = TfidfVectorizer() ques_matrix = vectorizer.fit_transform(questions) # build classifier # svc classifier = SVC(gamma="scale", verbose=False) # # rf # classifier = RandomForestClassifier() # # gb # classifier = GradientBoostingClassifier() # train classifier.fit(ques_matrix, labels) # # training accuracy train_acc = classifier.score(ques_matrix, labels) print("training accuracy: ", train_acc) # save tf-idf with open(os.path.join(root_path, "models/pretrained/tfidf_train_val.pkl"), "wb") as f: pickle.dump(vectorizer, f) # save classifier with open( os.path.join(root_path, "models/pretrained/pretrained_svm_train_val.pkl"), "wb") as f: pickle.dump(classifier, f) # # debug # print("\nraw questions") # print(questions[:5]) # print("question matrix") # print(vectorizer.transform(questions[:5]).todense()) # # print("corresponding words of transformed questions") # # print(vectorizer.inverse_transform(vectorizer.transform(questions[:5]))) # print("terms used") # print(vectorizer.get_feature_names()) return vectorizer, classifier