Exemplo n.º 1
0
def createW2VPredictionFile(filePath, model, withStops=True):
	"""Write a tab-separated word2vec similarity prediction file.

	The questions are parsed from ``filePath`` with originalQuestionParser.
	For every parsed question a vector is built for the original question and
	for each of its related questions, the similarities between them are
	obtained from cosineSimilarity, and one row per related question is
	written: quest_ID, rel_quest_ID, the constant 0, the similarity value and
	the 'relevant' field.

	The parsed question dicts are updated in place (cleaned text, optional
	stop-word-free text, vectors and 'simVal').

	Args:
		filePath: Path of the question file; its base name up to the first
			'.' becomes the stem of the output file name.
		model: Passed through unchanged to generateQuestionVector.
		withStops: When True the punctuation-filtered text is vectorised
			as is; when False, words found in the module-level ``stops``
			collection are dropped first (after lower-casing).

	Returns:
		None. The result is the file written inside the folder returned by
		prepModelFolder().
	"""
	testQuestions = originalQuestionParser(filePath)
	# Only the file name is needed; the directory part of filePath is ignored.
	stem = os.path.basename(filePath).split('.')[0]
	suffix = '-w2v-with-stops.pred' if withStops else '-w2v.pred'
	# NOTE(review): the folder is concatenated directly, so prepModelFolder()
	# presumably returns a path ending with a separator -- confirm.
	predFile = prepModelFolder() + stem + suffix
	with open(predFile, "w") as tsvfile:
		writer = csv.writer(tsvfile, delimiter="\t")
		for t_question in testQuestions:
			# Punctuation is stripped exactly once (the original repeated this
			# call in the withStops branch).
			t_question['origQuestion'] = filterPunctuation(t_question['origQuestion'])
			if withStops:
				origText = t_question['origQuestion']
			else:
				t_question['origQNoStops'] = " ".join(
					[w for w in t_question['origQuestion'].lower().split() if w not in stops])
				origText = t_question['origQNoStops']
			t_question['W2V_OVec1'] = generateQuestionVector(model, origText, DIM)

			vecList = []
			for rel_quest in t_question['rel_questions']:
				rel_quest['question'] = filterPunctuation(rel_quest['question'])
				if withStops:
					relText = rel_quest['question']
				else:
					rel_quest['relQNoStops'] = " ".join(
						[w for w in rel_quest['question'].lower().split() if w not in stops])
					relText = rel_quest['relQNoStops']
				rel_quest['W2V_qVec1'] = generateQuestionVector(model, relText, DIM)
				vecList.append(rel_quest['W2V_qVec1'])

			# Nothing to compare against: no rows would be written for this
			# question anyway, so avoid calling cosineSimilarity with an empty list.
			if not vecList:
				continue

			simMatrix = cosineSimilarity(t_question['W2V_OVec1'], vecList)
			for idx, row in enumerate(t_question['rel_questions']):
				row['simVal'] = simMatrix[idx]
				writer.writerow([t_question['quest_ID'], row['rel_quest_ID'], 0, row['simVal'], row['relevant']])
Exemplo n.º 2
0
def createLDAPredictionFile(filePath, dictionary, numFeatures=200, withStops=True, fileTag=''):
	"""Write a tab-separated LDA similarity prediction file.

	The questions are parsed from ``filePath`` with originalQuestionParser.
	For every parsed question a bag-of-words corpus is built from its related
	questions, generateLDAModel is called on that corpus, and the original
	question is compared against the returned index. One row per related
	question is written: quest_ID, rel_quest_ID, the position of the related
	question, the similarity value and the 'relevant' field.

	The parsed question dicts are updated in place (cleaned text, optional
	stop-word-free text and 'simVal').

	Args:
		filePath: Path of the question file; its base name up to the first
			'.' becomes the stem of the output file name.
		dictionary: Object providing ``doc2bow``; also passed to
			generateLDAModel.
		numFeatures: Passed to generateLDAModel and embedded in the output
			file name.
		withStops: When False, words found in the module-level ``stops``
			collection are removed from the original question before it is
			converted to a bag of words.
		fileTag: Optional label inserted into the output file name as
			``<stem>-<fileTag>-lda<numFeatures>...``. Empty means no label.

	Returns:
		None. The result is the file written inside the folder returned by
		prepModelFolder().
	"""
	testQuestions = originalQuestionParser(filePath)
	# Only the file name is needed; the directory part of filePath is ignored.
	stem = os.path.basename(filePath).split('.')[0]
	# The tag was previously normalised but never used in the file name.
	tagPart = '-' + fileTag if fileTag else ''
	suffix = '-with-stops.pred' if withStops else '.pred'
	# NOTE(review): the folder is concatenated directly, so prepModelFolder()
	# presumably returns a path ending with a separator -- confirm.
	predFile = prepModelFolder() + stem + tagPart + '-lda' + str(numFeatures) + suffix
	with open(predFile, 'w') as tsvfile:
		writer = csv.writer(tsvfile, delimiter='\t')
		for t_question in testQuestions:
			t_question['origQuestion'] = filterPunctuation(t_question['origQuestion'])
			corpus = []
			for rel_quest in t_question['rel_questions']:
				rel_quest['question'] = filterPunctuation(rel_quest['question'])
				# str has no word_tokenize() method, so tokens are obtained by
				# whitespace splitting, as the word2vec prediction code does.
				corpus.append(dictionary.doc2bow(rel_quest['question'].lower().split()))

			# Without related questions there is nothing to model and no row
			# to write for this question.
			if not corpus:
				continue

			lda, index = generateLDAModel(corpus, dictionary, numFeatures)
			if withStops:
				doc = t_question['origQuestion']
			else:
				t_question['origQNoStops'] = " ".join(
					[w for w in t_question['origQuestion'].lower().split() if w not in stops])
				doc = t_question['origQNoStops']
			vec_bow = dictionary.doc2bow(doc.lower().split())
			vec_lda = lda[vec_bow]
			sims = index[vec_lda]
			for idx, quest in enumerate(t_question['rel_questions']):
				quest['simVal'] = sims[idx]
				writer.writerow([t_question['quest_ID'], quest['rel_quest_ID'], idx, quest['simVal'], quest['relevant']])

# def getAvgFeatureVecs(questions, model, num_features):
# 	counter = 0
# 	#preallocation of numpy array for speed purposes
# 	reviewFeatureVecs = np.zeros((len(questions), num_features), dtype="float32")
# 	for question in questions:
# 		reviewFeatureVecs[counter] = makeFeatureVec(question, model, num_features)
# 		counter += 1
# 	return questionFeatureVecs


# TODO: Figure out how to apply TF-IDF weighting to
# the word2vec vectors

# Parse the question file, then give every original question and each of its
# related questions a lower-cased, whitespace-split word list ('wordList') and
# the vector returned by makeFeatureVec for that list ('w2vectors').
testQuestions = originalQuestionParser(origQfilePath)

for t_quest in testQuestions:
	origWords = t_quest['origQuestion'].lower().split()
	t_quest['wordList'] = origWords
	t_quest['w2vectors'] = makeFeatureVec(origWords, model, num_features)

	for rel_quest in t_quest['rel_questions']:
		relWords = rel_quest['question'].lower().split()
		rel_quest['wordList'] = relWords
		rel_quest['w2vectors'] = makeFeatureVec(relWords, model, num_features)



# def createW2VPredictionFile(filePath, model, withStops=True):
# 	testQuestions = originalQuestionParser(filePath)
# 	head, tail = os.path.split(filePath)
# 	tail = tail.split('.')[0]
# 	if(withStops):