def predict_segmented_tf_idf(data, docs_per_q, ids_and_categories):
    """Predict the best answer letter per question using a tf-idf index
    built separately for each question category.

    Args:
        data: DataFrame with columns 'id', 'question', 'answerA'..'answerD'.
        docs_per_q: number of top-ranked documents to score each question against.
        ids_and_categories: mapping from question id (str) to category name;
            each category has its own wiki document directory.

    Returns:
        List of predicted letters ('A'..'D'), one per row of ``data``.
    """
    res = []
    # Cache of category -> (docs_tf, words_idf) so each category's wiki
    # corpus is only indexed once, however many questions share it.
    category_tf_idfs = {}
    for index, row in data.iterrows():
        current_id = str(row['id'])
        print(current_id)  # progress trace (was a Py2 print statement)
        current_category = ids_and_categories[current_id]
        if category_tf_idfs.get(current_category) is None:
            category_tf_idfs[current_category] = utils.get_docstf_idf(
                wiki_docs_dir + '/%s' % current_category)
        docs_tf, words_idf = category_tf_idfs[current_category]

        # Token sets for each answer option, in A..D order.
        answer_words = [set(utils.tokenize(row['answer' + c])) for c in 'ABCD']
        scores = [0.0] * 4
        q = row['question']

        # First element of each importance tuple is the document id.
        # list(...) keeps this working on Py3, where zip() is lazy.
        top_docs = list(zip(*utils.get_docs_importance_for_question(
            q, docs_tf, words_idf, docs_per_q)))[0]
        for d in top_docs:
            for i, words in enumerate(answer_words):
                for w in words:
                    if w in docs_tf[d]:
                        # term frequency in the doc times idf of the word
                        scores[i] += docs_tf[d][w] * words_idf[w]
        res.append('ABCD'[np.argmax(scores)])
    return res
def predict(data, docs_per_q):
    """Predict the best answer letter per question from tf-idf scores and
    dump the raw per-option scores to ``features_ck12.csv``.

    Args:
        data: DataFrame with columns 'id', 'question', 'answerA'..'answerD'.
        docs_per_q: number of top-ranked documents to score each question against.

    Returns:
        List of predicted letters ('A'..'D'), one per row of ``data``.

    Side effects:
        Writes ``features_ck12.csv`` with columns id, fA, fB, fC, fD.
    """
    # Index the whole wiki corpus once up front.
    docs_tf, words_idf = utils.get_docstf_idf(wiki_docs_dir)
    res = []
    feature_rows = []
    for index, row in data.iterrows():
        # Token sets for each answer option, in A..D order.
        answer_words = [set(utils.tokenize(row['answer' + c])) for c in 'ABCD']
        scores = [0.0] * 4
        q = row['question']

        # First element of each importance tuple is the document id.
        # list(...) keeps this working on Py3, where zip() is lazy.
        top_docs = list(zip(*utils.get_docs_importance_for_question(
            q, docs_tf, words_idf, docs_per_q)))[0]
        for d in top_docs:
            for i, words in enumerate(answer_words):
                for w in words:
                    if w in docs_tf[d]:
                        # term frequency in the doc times idf of the word
                        scores[i] += docs_tf[d][w] * words_idf[w]
        res.append('ABCD'[np.argmax(scores)])
        feature_rows.append(scores)

    features = np.array(feature_rows)
    pd.DataFrame({
        'id': list(data['id']),
        'fA': features[:, 0],
        'fB': features[:, 1],
        'fC': features[:, 2],
        'fD': features[:, 3],
    })[['id', 'fA', 'fB', 'fC', 'fD']].to_csv('features_ck12.csv', index=False)
    return res
def predict(data, docs_per_q):
    """Predict an answer letter for every question from tf-idf document scores.

    Returns:
        Tuple ``(predictions, doc_score)``: ``predictions`` is a list of
        letters ('A'..'D'); ``doc_score`` is a header row ["A","B","C","D"]
        followed by the four raw scores for each question.
    """
    # Build the tf-idf index over the wiki corpus once up front.
    docs_tf, words_idf = utils.get_docstf_idf(wiki_docs_dir)
    letters = ['A', 'B', 'C', 'D']
    predictions = []
    doc_score = [["A", "B", "C", "D"]]
    for _, row in data.iterrows():
        # Token sets for the four answer options, in letter order.
        option_words = [set(utils.tokenize(row['answer' + letter]))
                        for letter in letters]
        question = row['question']
        ranked = utils.get_docs_importance_for_question(
            question, docs_tf, words_idf, docs_per_q)
        top_docs = list(zip(*ranked))[0]

        scores = [0, 0, 0, 0]
        for doc in top_docs:
            term_freqs = docs_tf[doc]
            for pos, words in enumerate(option_words):
                for word in words:
                    if word in term_freqs:
                        # tf in this doc weighted by the word's idf
                        scores[pos] += 1. * term_freqs[word] * words_idf[word]

        predictions.append(letters[np.argmax(scores)])
        doc_score.append(scores)
    return predictions, doc_score
def predict(data, docs_per_q):
    """Score each answer option against the question's top tf-idf documents.

    Returns:
        Tuple ``(res, doc_score)``: ``res`` holds the predicted letters
        ('A'..'D'); ``doc_score`` starts with the header ["A","B","C","D"]
        and then carries the four raw scores for every question.
    """
    docs_tf, words_idf = utils.get_docstf_idf(wiki_docs_dir)

    def option_score(words, top_docs):
        # Total tf-idf mass this option's words collect over the top docs.
        total = 0
        for d in top_docs:
            for w in words:
                if w in docs_tf[d]:
                    total += 1. * docs_tf[d][w] * words_idf[w]
        return total

    res = []
    doc_score = [["A", "B", "C", "D"]]
    for _, row in data.iterrows():
        importance = utils.get_docs_importance_for_question(
            row['question'], docs_tf, words_idf, docs_per_q)
        top_docs = list(zip(*importance))[0]
        scores = [
            option_score(set(utils.tokenize(row['answer' + letter])), top_docs)
            for letter in ('A', 'B', 'C', 'D')
        ]
        res.append(['A', 'B', 'C', 'D'][np.argmax(scores)])
        doc_score.append(scores)
    return res, doc_score
def predict(data, docs_per_q):
    """Predict answers by combining tf-idf scores with word2vec cosine
    similarity between the question and each answer option.

    Args:
        data: DataFrame with columns 'question' and 'answerA'..'answerD'.
        docs_per_q: number of top-ranked documents to score each question against.

    Returns:
        List of predicted letters ('A'..'D'), one per row of ``data``.

    Side effects:
        Pickles the tf-idf index to docs_tf_data.p / words_idf_data.p.
    """
    # Index docs, then persist the index for later reuse.
    # 'with' ensures the dump files are closed (original leaked the handles).
    docs_tf, words_idf = utils.get_docstf_idf(wiki_docs_dir)
    with open("docs_tf_data.p", 'wb') as fh:
        pickle.dump(docs_tf, fh)
    with open("words_idf_data.p", 'wb') as fh:
        pickle.dump(words_idf, fh)

    res = []
    print('predict')
    for index, row in data.iterrows():
        print(index)  # progress trace
        # Token sets and embedding accumulators for options A..D.
        # N is the embedding dimension (module-level constant).
        option_words = [set(utils.tokenize(row['answer' + c])) for c in 'ABCD']
        option_vecs = [np.zeros(N) for _ in range(4)]
        scores = [0.0] * 4

        # Normalized question embedding: sum of in-vocab, non-stopword vectors.
        q = row['question']
        q_vec = np.zeros(N)
        for w in utils.tokenize(q):
            if w in model.vocab and w not in stop:
                q_vec += model[w]
        q_vec = q_vec / linalg.norm(q_vec)

        # First element of each importance tuple is the document id.
        # list(...) keeps this working on Py3, where zip() is lazy.
        top_docs = list(zip(*utils.get_docs_importance_for_question(
            q, docs_tf, words_idf, docs_per_q)))[0]
        for d in top_docs:
            for i, words in enumerate(option_words):
                for w in words:
                    if w in docs_tf[d]:
                        # tf in the doc times idf of the word
                        scores[i] += docs_tf[d][w] * words_idf[w]
                    if w in model.vocab:
                        # BUG FIX: original code had `D_vec = model[w]`
                        # (assignment, not accumulation) for option D only;
                        # all four options now accumulate consistently.
                        option_vecs[i] += model[w]

        # Cosine similarity of each (normalized) option vector with the
        # question vector; options with no in-vocab words normalize to
        # nan, which is zeroed below.
        normed = [v / linalg.norm(v) for v in option_vecs]
        semantic_scores = np.array(normed).dot(q_vec)
        semantic_scores[np.isnan(semantic_scores)] = 0

        # Elementwise sum via numpy broadcasting: tf-idf score + cosine score.
        combined_scores = scores + semantic_scores
        res.append('ABCD'[np.argmax(combined_scores)])
    return res