def getTestingTrainingData():
    """Return feature/label data for the train+dev questions and the test questions.

    Sets the module-level global ``save`` to True, builds the n-gram models
    with ``getGrams`` and loads the train, dev and test questions. Features
    then come from one of three places, checked in this order:

    1. The combined train/dev/test pickles, if ``distance_test`` contains any
       ``.pickle`` file.
    2. The train/dev pickles in ``distance_train`` plus freshly extracted
       test features; the combination is saved to ``distance_test``.
    3. A full extraction over all questions, which is saved to
       ``distance_test``.

    Returns:
        A pair ``((train_features, train_labels), (test_features, test_labels))``
        where the first element covers the training and dev questions.
    """
    global save
    save = True

    ngram_path = "../data/Holmes_Training_Data/norvig.txt"
    # NOTE(review): getEvaluatingTrainingData writes the train/dev cache read
    # below using the 300d GloVe file, while this function uses the 50d file.
    # Confirm that mixing the two is intended.
    glove_file = "../data/glove_vectors/glove.6B.50d.txt"

    train_cache_dir = "../data/ml_data/distance_train"
    test_cache_dir = "../data/ml_data/distance_test"
    traindev_features_pickle = "../data/ml_data/distance_train/com_traindev_features.pickle"
    traindev_labels_pickle = "../data/ml_data/distance_train/com_traindev_labels.pickle"
    all_features_pickle = "../data/ml_data/distance_test/com_traindevtest_features.pickle"
    all_labels_pickle = "../data/ml_data/distance_test/com_traindevtest_labels.pickle"

    def _has_pickles(directory):
        # True when at least one path under `directory` contains ".pickle".
        # NOTE(review): any pickle in the directory counts as a cache hit, even
        # if the specific files loaded afterwards are missing.
        found = getRecursiveFiles(directory, filter_fn=lambda path: ".pickle" in path)
        return len(found) > 0

    # Single-argument print(...) produces the same output as the Python 2
    # print statement.
    print("Training N-Gram Models...")
    unigrams, bigrams, _backoff = getGrams(path=ngram_path)

    print("Loading Training Questions...")
    training_questions = loadQuestions(directory="../data/train/")
    print("Loading Dev Questions....")
    dev_qs = loadQuestions(directory="../data/dev_set/")
    print("Loading Test Questions")
    test_qs = loadQuestions(directory="../data/test/")

    every_question = training_questions + dev_qs + test_qs

    if _has_pickles(test_cache_dir):
        # Everything was already extracted and stored on an earlier run.
        print("Found Saved Test Features")
        all_features = loadPickle(all_features_pickle)
        all_labels = loadPickle(all_labels_pickle)
    elif _has_pickles(train_cache_dir):
        # Only train/dev features are cached: extract the test portion,
        # store the combination, and return the two halves directly.
        print("Found Saved Training/Dev Features")
        cached_features = loadPickle(traindev_features_pickle)
        cached_labels = loadPickle(traindev_labels_pickle)

        print("Calculating All Test Features")
        new_features, new_labels = feature_extractor.createFeatureExtractorForAll(
            test_qs, unigrams, bigrams, glove_file)

        print("Saving Combined Features for later use")
        savePickle(cached_features + new_features, all_features_pickle)
        savePickle(cached_labels + new_labels, all_labels_pickle)

        return ((cached_features, cached_labels), (new_features, new_labels))
    else:
        print("No saved features found. Need to Calculate All")
        all_features, all_labels = feature_extractor.createFeatureExtractorForAll(
            every_question, unigrams, bigrams, glove_file)

        print("Saving Combined Features for later use")
        savePickle(all_features, all_features_pickle)
        savePickle(all_labels, all_labels_pickle)

    # Split the combined rows back into train+dev and test.
    # Assumes every question contributes exactly 5 rows (presumably one per
    # candidate answer) -- TODO confirm against the feature extractor.
    rows_before_test = (len(training_questions) + len(dev_qs)) * 5
    training_data = (all_features[:rows_before_test], all_labels[:rows_before_test])
    test_data = (all_features[rows_before_test:], all_labels[rows_before_test:])
    return (training_data, test_data)
def getEvaluatingTrainingData():
    """Return feature/label data for the training questions and the dev questions.

    Sets the module-level global ``save`` to True, builds the n-gram models
    with ``getGrams`` and loads the train and dev questions. If
    ``distance_train`` contains any ``.pickle`` file, the combined train/dev
    features and labels are loaded from there. Otherwise they are extracted
    for all questions and saved to that directory.

    Returns:
        A pair ``((train_features, train_labels), (dev_features, dev_labels))``.
    """
    global save
    save = True

    ngram_path = "../data/Holmes_Training_Data/norvig.txt"
    glove_file = "../data/glove_vectors/glove.6B.300d.txt"
    features_pickle = "../data/ml_data/distance_train/com_traindev_features.pickle"
    labels_pickle = "../data/ml_data/distance_train/com_traindev_labels.pickle"

    # Single-argument print(...) produces the same output as the Python 2
    # print statement.
    print("Training N-Gram Models")
    unigrams, bigrams, _backoff = getGrams(path=ngram_path)

    print("Loading Training Questions")
    training_questions = loadQuestions(directory="../data/train/")
    print("Loading Dev Questions")
    dev_qs = loadQuestions(directory="../data/dev_set/")
    train_and_dev = training_questions + dev_qs

    # NOTE(review): any pickle in the directory counts as a cache hit, even if
    # the two specific files loaded below are missing.
    cached = getRecursiveFiles("../data/ml_data/distance_train",
                               filter_fn=lambda path: ".pickle" in path)
    if len(cached) == 0:
        print("Getting AlL Features")
        features, labels = feature_extractor.createFeatureExtractorForAll(
            train_and_dev, unigrams, bigrams, glove_file)
        savePickle(features, features_pickle)
        savePickle(labels, labels_pickle)
    else:
        print("Found Saved Features")
        features = loadPickle(features_pickle)
        labels = loadPickle(labels_pickle)

    # Training rows come first, dev rows after.
    # Assumes every question contributes exactly 5 rows (presumably one per
    # candidate answer) -- TODO confirm against the feature extractor.
    first_dev_row = len(training_questions) * 5
    training_data = (features[:first_dev_row], labels[:first_dev_row])
    dev_data = (features[first_dev_row:], labels[first_dev_row:])
    return (training_data, dev_data)