Пример #1
0
def createW2VPredictionFileSubTaskA(filePath, model, withStops=True):
	"""Write a tab-separated word2vec prediction file for question/comment threads.

	The threads are read with ``elementParser(filePath)``. The output name is
	the input file's base name (text before the first '.') followed by
	'-w2v-with-stops.pred' or '-w2v.pred', prefixed with the value returned
	by ``prepModelFolder()``.

	For every thread the question and each comment are passed through
	``filterPunctuation`` and turned into vectors with
	``generateQuestionVector(model, text, DIM)``. When ``withStops`` is
	false, the text is lower-cased and words found in ``stops`` are dropped
	first. One row is written per comment:
	threadId, comment_id, the constant 0, similarity, comment_rel.

	The parsed thread/comment dicts are updated in place ('question',
	'relQNoStops', 'W2V_qVec1', 'comment', 'relCNoStops', 'W2V_cVec1',
	'simVal').
	"""
	threads = elementParser(filePath)
	baseName = os.path.basename(filePath).split('.')[0]
	suffix = '-w2v-with-stops.pred' if withStops else '-w2v.pred'
	outPath = prepModelFolder() + baseName + suffix
	with open(outPath, "w") as outFile:
		tsvWriter = csv.writer(outFile, delimiter="\t")
		for thread in threads:
			thread['question'] = filterPunctuation(thread['question'])
			if withStops:
				questionText = thread['question']
			else:
				keptWords = [w for w in thread['question'].lower().split() if w not in stops]
				thread['relQNoStops'] = " ".join(keptWords)
				questionText = thread['relQNoStops']
			thread['W2V_qVec1'] = generateQuestionVector(model, questionText, DIM)

			commentVectors = []
			for comment in thread['comments']:
				comment['comment'] = filterPunctuation(comment['comment'])
				if withStops:
					commentText = comment['comment']
				else:
					keptWords = [w for w in comment['comment'].lower().split() if w not in stops]
					comment['relCNoStops'] = " ".join(keptWords)
					commentText = comment['relCNoStops']
				comment['W2V_cVec1'] = generateQuestionVector(model, commentText, DIM)
				commentVectors.append(comment['W2V_cVec1'])

			similarities = cosineSimilarity(thread['W2V_qVec1'], commentVectors)
			# Indexed access (rather than zip) so a too-short result still raises.
			for position, comment in enumerate(thread['comments']):
				comment['simVal'] = similarities[position]
				tsvWriter.writerow([
					thread['threadId'],
					comment['comment_id'],
					0,
					comment['simVal'],
					comment['comment_rel'],
				])
	def createPredictionFile(filePath, model, withStops=True, fileTag=''):
		"""Write a tab-separated doc2vec prediction file for original/related questions.

		The questions are read with ``originalQuestionParser(filePath)``. The
		output name is the input file's base name (text before the first '.')
		plus '-d2v-with-stops' or '-d2v', an optional '-<fileTag>', and
		'.pred', prefixed with the value returned by ``prepModelFolder()``.

		Args:
			filePath: Path of the file handed to ``originalQuestionParser``.
			model: Object exposing ``infer_vector(list_of_tokens)``.
				NOTE(review): assumed to be a gensim Doc2Vec model -- confirm.
			withStops: When false, text is lower-cased and words found in
				``stops`` are removed before inference.
			fileTag: Optional tag appended to the output file name.

		One row is written per related question:
		quest_ID, rel_quest_ID, the constant 0, similarity, relevant.

		The parsed dicts are updated in place ('origQuestion', 'origQNoStops',
		'D2V_OVec1', 'question', 'relQNoStops', 'D2V_qVec1', 'simVal').
		"""
		testQuestions = originalQuestionParser(filePath)
		tail = os.path.basename(filePath).split('.')[0]
		fileTag = ('-' + fileTag) if fileTag else ''
		if withStops:
			predFile = tail + '-d2v-with-stops' + fileTag + '.pred'
		else:
			predFile = tail + '-d2v' + fileTag + '.pred'
		predFile = prepModelFolder() + predFile
		with open(predFile, "w") as tsvfile:
			writer = csv.writer(tsvfile, delimiter="\t")
			for t_question in testQuestions:
				t_question['origQuestion'] = filterPunctuation(t_question['origQuestion'])
				if withStops:
					origTokens = t_question['origQuestion'].split()
				else:
					origTokens = [i for i in t_question['origQuestion'].lower().split() if i not in stops]
					t_question['origQNoStops'] = " ".join(origTokens)
				# infer_vector takes a list of word tokens; a raw string would be
				# consumed one character at a time instead of one word at a time.
				t_question['D2V_OVec1'] = model.infer_vector(origTokens)

				vecList = []
				for rel_quest in t_question['rel_questions']:
					rel_quest['question'] = filterPunctuation(rel_quest['question'])
					if withStops:
						relTokens = rel_quest['question'].split()
					else:
						relTokens = [i for i in rel_quest['question'].lower().split() if i not in stops]
						rel_quest['relQNoStops'] = " ".join(relTokens)
					rel_quest['D2V_qVec1'] = model.infer_vector(relTokens)
					vecList.append(rel_quest['D2V_qVec1'])
				simMatrix = cosineSimilarity(t_question['D2V_OVec1'], vecList)
				for idx, row in enumerate(t_question['rel_questions']):
					row['simVal'] = simMatrix[idx]
					writer.writerow([t_question['quest_ID'], row['rel_quest_ID'], 0, row['simVal'], row['relevant']])
Пример #3
0
def createLSIPredictionFile(filePath,
                            dictionary,
                            numFeatures=200,
                            withStops=True,
                            fileTag=''):
    """Write a tab-separated LSI prediction file for original/related questions.

    The questions are read with ``originalQuestionParser(filePath)``. The
    output name is the input file's base name (text before the first '.'),
    '-lsi<numFeatures>', '-with-stops' when ``withStops`` is true, an
    optional '-<fileTag>-', and '.pred', prefixed with the value returned by
    ``prepModelFolder()``.

    For each original question a bag-of-words corpus is built from its
    related questions (after ``filterPunctuation``, lower-casing and
    ``word_tokenize``) and passed to ``generateLSIModel``. The original
    question, with ``stops`` removed when ``withStops`` is false, is then
    used as the query against the returned index. One row is written per
    related question:
    quest_ID, rel_quest_ID, position in the list, similarity, relevant.

    The parsed dicts are updated in place ('origQuestion', 'origQNoStops',
    'question', 'simVal').
    """
    parsedQuestions = originalQuestionParser(filePath)
    baseName = os.path.basename(filePath).split('.')[0]
    if len(fileTag) > 0:
        fileTag = '-' + fileTag + '-'
    stopsPart = '-with-stops' if withStops else ''
    outName = baseName + '-lsi' + str(numFeatures) + stopsPart + fileTag + '.pred'
    outPath = prepModelFolder() + outName

    with open(outPath, 'w') as outFile:
        tsvWriter = csv.writer(outFile, delimiter='\t')
        for entry in parsedQuestions:
            entry['origQuestion'] = filterPunctuation(entry['origQuestion'])

            bowCorpus = []
            for related in entry['rel_questions']:
                related['question'] = filterPunctuation(related['question'])
                relatedTokens = word_tokenize(related['question'].lower())
                bowCorpus.append(dictionary.doc2bow(relatedTokens))

            lsiModel, simIndex = generateLSIModel(bowCorpus, dictionary,
                                                  numFeatures)

            if withStops:
                queryText = entry['origQuestion']
            else:
                loweredTokens = word_tokenize(entry['origQuestion'].lower())
                entry['origQNoStops'] = " ".join(
                    [tok for tok in loweredTokens if tok not in stops])
                queryText = entry['origQNoStops']

            queryBow = dictionary.doc2bow(word_tokenize(queryText.lower()))
            scores = simIndex[lsiModel[queryBow]]

            for position, related in enumerate(entry['rel_questions']):
                related['simVal'] = scores[position]
                tsvWriter.writerow([
                    entry['quest_ID'],
                    related['rel_quest_ID'],
                    position,
                    related['simVal'],
                    related['relevant'],
                ])
Пример #4
0
def createLSIPredictionFileSubTaskA(filePath,
                                    dictionary,
                                    numFeatures=200,
                                    withStops=True,
                                    fileTag=''):
    """Write a tab-separated LSI prediction file for question/comment threads.

    The threads are read with ``elementParser(filePath)``. The output name is
    the input file's base name (text before the first '.'),
    '-lsi<numFeatures>', '-with-stops' when ``withStops`` is true, an
    optional '-<fileTag>-', and '.pred', prefixed with the value returned by
    ``prepModelFolder()``.

    Args:
        filePath: Path of the file handed to ``elementParser``.
        dictionary: Object exposing ``doc2bow(tokens)``.
        numFeatures: Passed through to ``generateLSIModel`` and used in the
            output file name.
        withStops: When false, words found in ``stops`` are removed from the
            question before it is used as the query.
        fileTag: Optional tag appended to the output file name.

    For each thread a bag-of-words corpus is built from its comments and,
    when it holds more than one document, passed to ``generateLSIModel``;
    the question is then queried against the returned index. One row is
    written per comment:
    threadId, comment_id, position in the list, similarity, comment_rel.

    Threads with fewer than two comments get no model; their comments are
    written with a similarity of 0.0.

    The parsed dicts are updated in place ('question', 'comment', 'simVal').
    """
    testQuestions = elementParser(filePath)
    tail = os.path.basename(filePath).split('.')[0]
    if len(fileTag) > 0:
        fileTag = '-' + fileTag + '-'
    if withStops:
        predFile = tail + '-lsi' + str(
            numFeatures) + '-with-stops' + fileTag + '.pred'
    else:
        predFile = tail + '-lsi' + str(numFeatures) + fileTag + '.pred'
    # The prefix from prepModelFolder() was computed but never applied, so
    # the file ended up in the current working directory.
    predFile = prepModelFolder() + predFile
    with open(predFile, 'w') as tsvfile:
        writer = csv.writer(tsvfile, delimiter='\t')
        for t_question in testQuestions:
            t_question['question'] = filterPunctuation(t_question['question'])
            corpus = []
            for rel_comment in t_question['comments']:
                rel_comment['comment'] = filterPunctuation(
                    rel_comment['comment'])
                corpus.append(
                    dictionary.doc2bow(
                        word_tokenize(rel_comment['comment'].lower())))

            if not withStops:
                # Filter word tokens; iterating the string itself would
                # filter (and space-separate) individual characters.
                t_question['question'] = ' '.join([
                    i for i in word_tokenize(t_question['question'].lower())
                    if i not in stops
                ])
            doc = word_tokenize(t_question['question'].lower())

            if len(corpus) > 1:
                lsi, index = generateLSIModel(corpus, dictionary, numFeatures)
                sims = index[lsi[dictionary.doc2bow(doc)]]
            else:
                # NOTE(review): no model is built for fewer than two comments
                # (guard kept from the original). Previously this path reused
                # the previous thread's model or raised NameError on the first
                # thread. 0.0 is a placeholder score -- confirm the desired
                # value.
                sims = [0.0] * len(corpus)

            for idx, quest in enumerate(t_question['comments']):
                quest['simVal'] = sims[idx]
                writer.writerow([
                    t_question['threadId'], quest['comment_id'], idx,
                    quest['simVal'], quest['comment_rel']
                ])