def createFeatureExtractorForAll(examples, unigrams, bigrams, glove_file):
    """Build a feature vector and a label for every (example, answer) pair.

    Per candidate answer the features are: VSM similarity features under
    several GloVe weightings (None, pmi, ppmi, tfidf), a unigram score, a
    bigram score, a bias term, and pairwise interaction terms over all
    non-bias features.  Returns (all_features, all_ys) as parallel lists.
    """
    print("Loading Glove None")
    # Unweighted GloVe model; reused for the lr=None pass of the loop below.
    glove = Glove(glove_file, delimiter=" ", header=False, quoting=csv.QUOTE_NONE, v=False)
    # One feature list per (example, answer) pair.  Sized from the data
    # rather than assuming a fixed 5 answers per example.
    num_pairs = sum(len(example.answers) for example in examples)
    all_features = [[] for _ in range(num_pairs)]
    all_ys = []
    low_ranks = [None, "pmi", "ppmi", "tfidf"]

    print("Calculating VSM Methods")
    # GloVe-based models: one full pass over the data per weighting scheme.
    for lr in low_ranks:
        if lr is not None:
            print("Loading Glove %s" % lr)
            glove = Glove(glove_file, delimiter=" ", header=False, quoting=csv.QUOTE_NONE, weighting=lr, v=False)
            glove.lsa(250)
        count = 0
        for example in examples:
            for answer in example.answers:
                features = createSingleExtractorVSM((example, answer), glove, unigrams)
                all_features[count] += features
                count += 1

    print("Calculating N-Grams + Interactions")
    # Labels, n-gram scores, bias term, then pairwise interaction terms.
    index = 0
    for example in examples:
        for pos, word in enumerate(example.answers):
            all_ys.append(1 if pos == example.correctAnswer else 0)
            all_features[index].append(unigramModel(unigrams, example, word))
            all_features[index].append(bigramModel(bigrams, example, word))
            # Bias term; kept last so the ranges below exclude it from
            # the interaction products.
            all_features[index].append(1)
            # Interaction terms: product of every pair of non-bias features.
            num_feats = len(all_features[index])
            for a in range(num_feats - 1):
                for b in range(a + 1, num_feats - 1):
                    all_features[index].append(all_features[index][a] * all_features[index][b])
            index += 1
    print("Done")
    return (all_features, all_ys)
def getQuestionClassifications(questions, unigrams, bigrams, glove_file):
    """Assign each question the class id of the model that answers it best.

    Loads a cached preliminary mapping from disk when one exists; otherwise
    runs every model (unigram, bigram, and the GloVe-based VSM variants) on
    every question, records which models predicted the correct answer along
    with their distance, and caches the result as a pickle.  For each
    question the correct model with the smallest distance wins; questions
    no model answered fall back to the "None" class.  Returns a list of
    integer class ids parallel to `questions`.
    """
    model_classes = getModelClassifications()  # model/parameter name -> integer class id
    # Per-question list of (model_name, distance) tuples for models that
    # predicted the correct answer; None until some model gets it right.
    prelim_mapping_array = [None] * len(questions)

    def _record(idx, name, distance):
        # Append a correct prediction for question idx, creating the list
        # on the first hit (deduplicates the original copy-pasted pattern).
        if prelim_mapping_array[idx] is None:
            prelim_mapping_array[idx] = [(name, distance)]
        else:
            prelim_mapping_array[idx].append((name, distance))

    pickle_path = "../data/ml_data/sentence_train_prelim/com_triandev_prelimmaparray.pickle"
    # Reuse the cached mapping when any pickle is present in the directory.
    if getRecursiveFiles("../data/ml_data/sentence_train_prelim", filter_fn=lambda a: ".pickle" in a):
        print("found Saved Preliminary Mappings")
        prelim_mapping_array = loadPickle(pickle_path)
    else:
        print("Finding Preliminary Mapping")
        # N-gram models first; the distance of 2 is a placeholder.
        for i, question in enumerate(questions):
            if unigramModel(unigrams, question)[0] == question.getCorrectWord():
                _record(i, "Unigram", 2)  # TODO: change placeholder distance
            if bigramModel(bigrams, question)[0] == question.getCorrectWord():
                _record(i, "Bigram", 2)  # TODO: change placeholder distance
        # GloVe-based models.
        # NOTE(review): low_ranks, param_models and distances are not defined
        # in this function -- assumed to be module-level globals; confirm.
        for lr in low_ranks:
            print("Loading Glove %s" % lr)
            glove = Glove(glove_file, delimiter=" ", header=False, quoting=csv.QUOTE_NONE, weighting=lr, v=False)
            glove.lsa(250)
            for model_name, model_form in param_models:
                # NOTE(review): d_form is unpacked but never applied, so every
                # distance iteration scores identically except for the name
                # recorded in whole_name -- verify this is intended.
                for d_form, d_name in distances:
                    whole_name = lr + model_name + d_name
                    for i, q in enumerate(questions):
                        if model_name == "Weighted VSM":
                            answer = model_form(glove, unigrams, q, threshold=.95)
                        else:
                            answer = model_form(glove, q, threshold=.95)
                        # answer is (predicted_word, distance); None / -1
                        # indicate the model made no prediction.
                        if answer[0] is not None and answer[0] != -1 and answer[0] == q.getCorrectWord():
                            _record(i, whole_name, answer[1])
        print("saving preliminary mapping")
        savePickle(prelim_mapping_array, pickle_path)

    # Classify each question: the correct model with the lowest distance
    # wins; otherwise fall back to the "None" class.
    real_mapping = [None] * len(questions)
    for i, q in enumerate(questions):
        if prelim_mapping_array[i] is not None:
            best_model = min(prelim_mapping_array[i], key=lambda x: x[1])[0]
            real_mapping[i] = model_classes[best_model]
        else:
            real_mapping[i] = model_classes["None"]
    return real_mapping