def createLSIPredictionFileSubTaskA(filePath, dictionary, numFeatures=200, withStops=True, fileTag=''):
    """Write a SemEval subtask-A prediction file ranking each question's
    comments by similarity in a per-question LDA topic space.

    Parameters
    ----------
    filePath : str
        Test file consumed by elementParser().
    dictionary : mapping with a doc2bow() method (gensim-style Dictionary).
    numFeatures : int
        Topic count forwarded to generateLDAModel().
    withStops : bool
        When False, stop-words are stripped from the question before scoring.
    fileTag : str
        Optional tag spliced into the output file name.

    Side effects: writes a tab-separated .pred file into the model folder.
    """
    testQuestions = elementParser(filePath)
    head, tail = os.path.split(filePath)
    tail = tail.split('.')[0]
    if len(fileTag) > 0:
        fileTag = '-' + fileTag + '-'
    if withStops:
        predFile = tail + '-lda' + str(numFeatures) + '-with-stops' + fileTag + '.pred'
    else:
        predFile = tail + '-lda' + str(numFeatures) + fileTag + '.pred'
    modelPath = prepModelFolder()
    # BUGFIX: write into the model folder, consistent with the W2V variant;
    # the original computed modelPath but never used it.
    predFile = modelPath + predFile
    with open(predFile, 'w') as tsvfile:
        writer = csv.writer(tsvfile, delimiter='\t')
        for t_question in testQuestions:
            t_question['question'] = filterPunctuation(t_question['question'])
            corpus = []
            for rel_comment in t_question['comments']:
                rel_comment['comment'] = filterPunctuation(rel_comment['comment'])
                # BUGFIX: the original called a non-existent str.word_tokenize()
                # on an undefined name `doc`; tokenize the comment itself with
                # the same lower/split scheme the W2V variant uses.
                corpus.append(dictionary.doc2bow(rel_comment['comment'].lower().split()))
            # One LDA model + similarity index per question thread.
            lda, index = generateLDAModel(corpus, dictionary, numFeatures)
            if withStops:
                doc = t_question['question']
            else:
                # BUGFIX: split into words before stop-word filtering; the
                # original iterated the characters of the string.
                t_question['question'] = ' '.join(
                    [w for w in t_question['question'].lower().split() if w not in stops])
                doc = t_question['question']
            vec_bow = dictionary.doc2bow(doc.lower().split())
            vec_lda = lda[vec_bow]
            sims = index[vec_lda]
            for idx, row in enumerate(t_question['comments']):
                row['simVal'] = sims[idx]
                # BUGFIX: the original wrote a single row referencing `row`,
                # which was never bound; emit one prediction line per comment.
                writer.writerow([t_question['threadId'], row['comment_id'], 0,
                                 row['simVal'], row['comment_rel']])
def createW2VPredictionFileSubTaskA(filePath, model, withStops=True):
    """Write a subtask-A prediction file scoring each comment against its
    question via cosine similarity of word2vec sentence vectors.

    filePath  -- test file consumed by elementParser()
    model     -- word-embedding model passed to generateQuestionVector()
    withStops -- when False, stop-words are removed before vectorizing

    Side effects: writes a tab-separated .pred file into the model folder.
    """
    testQuestions = elementParser(filePath)
    head, tail = os.path.split(filePath)
    tail = tail.split('.')[0]
    suffix = '-w2v-with-stops.pred' if withStops else '-w2v.pred'
    predFile = tail + suffix
    modelPath = prepModelFolder()
    predFile = modelPath + predFile
    with open(predFile, "w") as tsvfile:
        writer = csv.writer(tsvfile, delimiter="\t")
        for t_question in testQuestions:
            t_question['question'] = filterPunctuation(t_question['question'])
            if withStops:
                t_question['W2V_qVec1'] = generateQuestionVector(model, t_question['question'], DIM)
            else:
                # Cache the stop-word-free question text on the record itself.
                t_question['relQNoStops'] = " ".join(
                    [tok for tok in t_question['question'].lower().split() if tok not in stops])
                t_question['W2V_qVec1'] = generateQuestionVector(model, t_question['relQNoStops'], DIM)
            vecList = []
            for t_comment in t_question['comments']:
                t_comment['comment'] = filterPunctuation(t_comment['comment'])
                if withStops:
                    t_comment['W2V_cVec1'] = generateQuestionVector(model, t_comment['comment'], DIM)
                else:
                    t_comment['relCNoStops'] = " ".join(
                        [tok for tok in t_comment['comment'].lower().split() if tok not in stops])
                    t_comment['W2V_cVec1'] = generateQuestionVector(model, t_comment['relCNoStops'], DIM)
                vecList.append(t_comment['W2V_cVec1'])
            # One similarity score per comment, aligned by position.
            simMatrix = cosineSimilarity(t_question['W2V_qVec1'], vecList)
            for position, entry in enumerate(t_question['comments']):
                entry['simVal'] = simMatrix[position]
                writer.writerow([t_question['threadId'], entry['comment_id'], 0,
                                 entry['simVal'], entry['comment_rel']])
def QuestionCreator(filePaths=()):
    """Return the 'question' text of every element parsed from *filePaths*.

    Parameters
    ----------
    filePaths : iterable of str
        Paths handed one by one to elementParser().

    Returns
    -------
    list of str
        Question strings in file order.
    """
    # Immutable default replaces the original mutable `filePaths=[]`
    # (shared-default pitfall); callers passing a list are unaffected.
    rows = []
    for filePath in filePaths:
        rows += elementParser(filePath)
    return [row['question'] for row in rows]
def _loadOrBuildCache(cachePath, buildFunc, sourcePaths):
    """Return the pickled list at *cachePath* if it exists; otherwise build it
    by concatenating buildFunc(path) over *sourcePaths*, pickle it, return it.
    """
    if Path(cachePath).is_file():
        # Context manager closes the handle (the original leaked it).
        with open(cachePath, "rb") as fh:
            return pickle.load(fh)
    result = []
    for sourcePath in sourcePaths:
        result += buildFunc(sourcePath)
    if not os.path.isdir('../tmp'):
        os.makedirs('../tmp')
    with open(cachePath, "wb") as fh:
        pickle.dump(result, fh)
    return result


# BUGFIX: the original tested for "../tmp/QTL_list.p" (lower-case 'l') but
# loaded/saved "../tmp/QTL_List.p", so on case-sensitive filesystems the
# cache existence check never matched the file actually written. One
# canonical path is now used for both the check and the load.
QTL_List = _loadOrBuildCache("../tmp/QTL_List.p", createObjectListFromJson, QTLfilePaths)
thisList = _loadOrBuildCache("../tmp/thisList.p", elementParser, filePaths)
subTaskAList = _loadOrBuildCache("../tmp/subTaskAList.p", elementParser, filePathsSubTaskA)