def createW2VPredictionFileSubTaskA(filePath, model, withStops=True):
    """Write a tab-separated prediction file of question/comment similarities.

    The questions are obtained from ``elementParser(filePath)``.  For every
    question, the question text and each of its comments are run through
    ``filterPunctuation`` and turned into vectors with
    ``generateQuestionVector(model, text, DIM)``; ``cosineSimilarity`` is then
    called with the question vector and the list of comment vectors.

    One row is written per comment:
    ``threadId, comment_id, 0, simVal, comment_rel``.

    The output file name is the part of the input file name before its first
    ``.`` plus ``-w2v-with-stops.pred`` or ``-w2v.pred``, prefixed with
    whatever ``prepModelFolder()`` returns.

    Side effects: the parsed question and comment dicts are updated in place
    (``question``/``comment`` are overwritten with the filtered text, and
    ``W2V_qVec1``, ``W2V_cVec1``, ``simVal`` are added; ``relQNoStops`` and
    ``relCNoStops`` are added only when ``withStops`` is false).

    Args:
        filePath: Path of the input file handed to ``elementParser``.
        model: Passed through unchanged to ``generateQuestionVector``.
        withStops: When true the filtered text is vectorised as is; otherwise
            it is lower-cased and words found in ``stops`` are dropped first.
    """
    testQuestions = elementParser(filePath)

    # Everything before the first '.' of the file name becomes the stem.
    stem = os.path.basename(filePath).split('.')[0]
    suffix = '-w2v-with-stops.pred' if withStops else '-w2v.pred'
    outputName = stem + suffix
    predFile = prepModelFolder() + outputName

    with open(predFile, "w") as tsvfile:
        writer = csv.writer(tsvfile, delimiter="\t")

        for t_question in testQuestions:
            questionText = filterPunctuation(t_question['question'])
            t_question['question'] = questionText
            if withStops:
                questionInput = questionText
            else:
                questionInput = " ".join(
                    word for word in questionText.lower().split()
                    if word not in stops
                )
                t_question['relQNoStops'] = questionInput
            t_question['W2V_qVec1'] = generateQuestionVector(
                model, questionInput, DIM)

            comments = t_question['comments']
            commentVectors = []
            for t_comment in comments:
                commentText = filterPunctuation(t_comment['comment'])
                t_comment['comment'] = commentText
                if withStops:
                    commentInput = commentText
                else:
                    commentInput = " ".join(
                        word for word in commentText.lower().split()
                        if word not in stops
                    )
                    t_comment['relCNoStops'] = commentInput
                t_comment['W2V_cVec1'] = generateQuestionVector(
                    model, commentInput, DIM)
                commentVectors.append(t_comment['W2V_cVec1'])

            similarities = cosineSimilarity(
                t_question['W2V_qVec1'], commentVectors)

            # Similarities are looked up by position, so they are expected in
            # the same order as the comments.
            for position, t_comment in enumerate(comments):
                t_comment['simVal'] = similarities[position]
                writer.writerow([
                    t_question['threadId'],
                    t_comment['comment_id'],
                    0,
                    t_comment['simVal'],
                    t_comment['comment_rel'],
                ])
def _tokensForInference(text, withStops):
    """Split text into the word tokens handed to ``model.infer_vector``.

    With ``withStops`` true the text is split on whitespace unchanged;
    otherwise it is lower-cased first and tokens found in ``stops`` are
    removed.
    """
    if withStops:
        return text.split()
    return [word for word in text.lower().split() if word not in stops]


def createPredictionFile(filePath, model, withStops=True, fileTag=''):
    """Write a tab-separated prediction file of question/question similarities.

    The original questions are obtained from ``originalQuestionParser(filePath)``.
    For every original question, its text and the text of each related
    question are run through ``filterPunctuation``, tokenised, and passed to
    ``model.infer_vector``; ``cosineSimilarity`` is then called with the
    original question's vector and the list of related-question vectors.

    One row is written per related question:
    ``quest_ID, rel_quest_ID, 0, simVal, relevant``.

    The output file name is the part of the input file name before its first
    ``.`` plus ``-d2v-with-stops`` or ``-d2v``, an optional ``-<fileTag>``,
    and ``.pred``, prefixed with whatever ``prepModelFolder()`` returns.

    Side effects: the parsed dicts are updated in place (``origQuestion`` /
    ``question`` are overwritten with the filtered text, and ``D2V_OVec1``,
    ``D2V_qVec1``, ``simVal`` are added; ``origQNoStops`` and ``relQNoStops``
    are added only when ``withStops`` is false).

    Args:
        filePath: Path of the input file handed to ``originalQuestionParser``.
        model: Object exposing ``infer_vector``.
            NOTE(review): presumably a gensim Doc2Vec model trained on word
            tokens -- confirm against the training code.
        withStops: When true the filtered text is tokenised as is; otherwise
            it is lower-cased and words found in ``stops`` are dropped first.
        fileTag: Optional tag appended to the output file name; an empty (or
            otherwise falsy) value adds nothing.
    """
    testQuestions = originalQuestionParser(filePath)

    # Everything before the first '.' of the file name becomes the stem.
    stem = os.path.basename(filePath).split('.')[0]
    tagPart = '-' + fileTag if fileTag else ''
    variant = '-d2v-with-stops' if withStops else '-d2v'
    predFile = prepModelFolder() + stem + variant + tagPart + '.pred'

    with open(predFile, "w") as tsvfile:
        writer = csv.writer(tsvfile, delimiter="\t")

        for t_question in testQuestions:
            t_question['origQuestion'] = filterPunctuation(
                t_question['origQuestion'])
            origTokens = _tokensForInference(
                t_question['origQuestion'], withStops)
            if not withStops:
                t_question['origQNoStops'] = " ".join(origTokens)
            # infer_vector is given a list of tokens rather than a raw string:
            # a string would be consumed one character at a time (and is
            # rejected with TypeError by recent gensim releases).
            t_question['D2V_OVec1'] = model.infer_vector(origTokens)

            relatedQuestions = t_question['rel_questions']
            vecList = []
            for rel_quest in relatedQuestions:
                rel_quest['question'] = filterPunctuation(rel_quest['question'])
                relTokens = _tokensForInference(
                    rel_quest['question'], withStops)
                if not withStops:
                    rel_quest['relQNoStops'] = " ".join(relTokens)
                rel_quest['D2V_qVec1'] = model.infer_vector(relTokens)
                vecList.append(rel_quest['D2V_qVec1'])

            simMatrix = cosineSimilarity(t_question['D2V_OVec1'], vecList)

            # Similarities are looked up by position, so they are expected in
            # the same order as the related questions.
            for idx, row in enumerate(relatedQuestions):
                row['simVal'] = simMatrix[idx]
                writer.writerow([
                    t_question['quest_ID'],
                    row['rel_quest_ID'],
                    0,
                    row['simVal'],
                    row['relevant'],
                ])
def generateCosineSimilarities(testQuestion, questionList):
    """Attach a ``W2V_sim`` score to every entry of ``questionList``.

    The ``question_vector`` of each entry is collected into a list, and
    ``cosineSimilarity`` is called with ``testQuestion['question_vector']``
    and that list.  The i-th value it yields is stored under ``W2V_sim`` on
    the i-th entry.

    Args:
        testQuestion: Mapping with a ``question_vector`` entry.
        questionList: Indexable sequence of mappings, each with a
            ``question_vector`` entry; updated in place.

    Returns:
        The same ``questionList`` object that was passed in.
    """
    candidateVectors = [entry['question_vector'] for entry in questionList]
    scores = cosineSimilarity(testQuestion['question_vector'], candidateVectors)
    for position, score in enumerate(scores):
        questionList[position]['W2V_sim'] = score
    return questionList