Example #1
def getWNRelations(w):
	# print('Getting WordNet relations for word: ', w)
	if w in wnRelations:
		# print('Accessed from local store!')
		return wnRelations[w]

	neighbors = []
	for synset in wn.synsets(w):
		hypernyms = synset.hypernyms()
		for h in hypernyms:
			for l in h.lemmas():
				full_name = getWords(l.name())
				for word in full_name:
					# not the same word, not empty string
					if word != w.lower() and len(word) > 0: 
						neighbors.append(word)

	# get rid of duplicates
	neighbors = list(set(neighbors))

	# save locally
	wnRelations[w] = neighbors
	utils.saveData(wnRelations, cache + 'WN_relations.p')	
	print('WN relations saved:', len(neighbors))

	return neighbors
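A minimal usage sketch, assuming the module-level setup from Example #9 (the `wnRelations` cache, `cache` path, `utils`, and `getWords`) plus `wn` bound to NLTK's WordNet corpus reader; 'mitosis' is just an arbitrary test word:

# Assumed import in the source module (not shown in this excerpt):
# from nltk.corpus import wordnet as wn
neighbors = getWNRelations('mitosis')
print(neighbors[:10])  # sample of hypernym-lemma neighbors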
Example #2
def getFBRelations(w):
	# check for memory, else get new relations
	if w in freebaseRelations:
		print('Accessed from local store!')
		return freebaseRelations[w]
	print('Getting FB relations for word: ', w)
	neighbors = []
	mids = searchQuery(w)
	for mid in mids:
		triples = topicQuery(mid)
		for triple in triples:
			t, _ = triple
			# print(triple)
			# skip words identical to the query itself
			words = getWords(t[0]) + getWords(t[2])
			for word in words:
				# exclude blank strings
				if word != w.lower() and len(word) > 0:
					neighbors.append(word)

	# get rid of duplicates
	neighbors = list(set(neighbors))

	# Save relations to memory
	freebaseRelations[w] = neighbors
	utils.saveData(freebaseRelations, cache + 'FB_relations.p')
	print('FB relations saved:', len(neighbors))

	return neighbors
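Usage mirrors the WordNet variant; a hedged sketch, since `searchQuery` and `topicQuery` are Freebase lookup helpers defined elsewhere in the module (and the public Freebase API has since been retired, so running this today would need a dump or mirror):

fbNeighbors = getFBRelations('mitosis')  # arbitrary test word
print(len(fbNeighbors), 'Freebase-related words')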
Example #3
	def getSecondOrderKeywords(self):
		keywords = []
		for num, question in enumerate(self.test):
			print('\nQuestion {} ---------------------------'.format(num+1))
			if self.dataType == 'val':
				questionText, answers = question
			else:
				questionText, answers, correctAnswer = question

			# If available locally, use the cached keywords
			if questionText in localKeywords:
				print('keywords accessed locally')
				keywords += localKeywords[questionText]
			else:
				wordGraph = WordGraph(questionText, self.N)
				newKeywords = wordGraph.getSecondOrderKeywords() 
				keywords += newKeywords

				localKeywords[questionText] = newKeywords
				utils.saveData(localKeywords, cache + 'keywords.p')
				print('keywords saved.')

		keywords = list(set(keywords))
		print('{} second order keywords found from {} questions'.format(len(keywords), len(self.test)))
		return keywords
Example #4
	def __init__(self, start, end, dataType, N):
		self.LETTERS = ['A', 'B', 'C', 'D']
		self.fullTest = self.validationSet()
		self.dataType = dataType
		self.test = self.fullTest[start:end]
		self.correct = 0
		self.incorrect = 0
		self.answerReport = []
		self.searchAnswerReport = []
		self.timeReport = []
		self.N = N

		# instantiate mindmaps
		if os.path.isfile(cache + 'mindmaps.p'):
			self.mindmaps = utils.loadData(cache + 'mindmaps.p')
		else:
			self.mindmaps = {}
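A hedged instantiation sketch; `validationSet()` is defined elsewhere on the class, and `dataType` decides whether each question tuple carries a correct answer (compare Example #3):

exam = Test(start=0, end=100, dataType='val', N=6)  # first 100 validation questions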
Example #5
def getSearchFromFile():
	'''Opens local copy of search results'''
	searchResults = utils.loadData(cache + 'searchResults.p')
	searchObject = json.loads(searchResults)
	snippetDoc = ''
	items = searchObject['items']
	for i in items:
		snippetDoc += i['snippet']
	return snippetDoc
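The function assumes the pickled `searchResults` string is a Google Custom Search JSON response; a minimal illustration of the shape it relies on (illustrative only, real responses carry many more fields):

searchResults = '{"items": [{"snippet": "Photosynthesis converts light..."}, {"snippet": "Plants absorb carbon dioxide..."}]}'
# getSearchFromFile() concatenates every items[i]['snippet'] into one document.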
Example #6
import searchText as scraper
import util
import QAUtils as utils
from Models import Test
import pickle, os, time

cache = '../Dropbox/ScienceQASharedCache/'

# Get local copy of freebase
if os.path.isfile(cache + 'FB_relations.p'): freebaseRelations = utils.loadData(cache + 'FB_relations.p')
else:
	freebaseRelations = {}

# Setup for worker pool
poolWorkerNum = 200
poolIterations = 2
poolRedundancies = False

# Get all keywords
eightGradeExam = Test(start=0, end=8132, dataType='val', N=6)

keywords = eightGradeExam.getSecondOrderKeywords()

# save second order keywords
utils.saveData(keywords, cache + 'SecondOrderKeywords.p')
print('Keywords saved.')

# Filter out keywords already present in the local freebaseRelations store
keywords = [kw for kw in keywords if kw not in freebaseRelations]
print('Number of second order keywords left: {}'.format(len(keywords)))
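The excerpt ends before the pool parameters above are used; presumably the remaining keywords are then pushed through `getFBRelations` (Example #2). A sequential sketch, ignoring the worker pool since its helper is not shown here:

for kw in keywords:
	getFBRelations(kw)  # each call caches its result in FB_relations.p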
Example #7
#  - spacy word2vec cosine distance between question and answer (own and average of four)
#  - spacy word2vec cosine distance between answer option and other options (own and average of four)
print('- Basic formatting')
trainX = extractor.basicFormatFeatures(trainPairedQA)
valX = extractor.basicFormatFeatures(valPairedQA)
print(trainX.shape)


# Feature measuring proximity of a given Q-A pair to authoritative texts
#  - Q-A combined into a single statement then search carried out to see distance to closest sentence in text
#  - Authoritative text from wikipedia and CK12 free online textbooks for elementary school children
#  - Two measures given--one requiring relatively strict matches, one allowing loose matches
#  - return both absolute value as well as average of other 3 answers
print('- Text match features')
if os.path.isfile(cache + 'trainX'):
    train_textMatch = utils.loadData(cache + 'trainX')
else:
    train_textMatch = extractor.getTextMatchFeatures(trainPairedQA, kList=[100, 10, 100, 1000, 3])
trainX = extractor.concat(trainX, train_textMatch)
print(trainX.shape)

# if os.path.isfile(cache + 'valX'):
#     val_textMatch = utils.loadData(cache + 'valX')
# else:
#     val_textMatch = extractor.getTextMatchFeatures(valPairedQA, kList=[100, 10, 100, 1000, 3])
# valX = extractor.concat(valX, val_textMatch)
# print(valX.shape)

# Features from the keyword graph from the Aristo paper
#  - size of question graph, size of answer graph, coherence score of answers, coherence score
#    of question keywords, number of pruned words for each Q-A pair
print('- Keyword graph features')
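The extractor call for these graph features is not in this excerpt; as a rough, hypothetical illustration of the quantities described above, the `WordGraph` class from Example #11 exposes the underlying pieces:

# Hypothetical sketch only -- not the actual extractor code.
# questionText and answers come from one paired Q-A record; requires `import copy`.
wordGraph = WordGraph(questionText, 6)
questionGraphSize = len(wordGraph.questionKeywords)
answerScores = [copy.deepcopy(wordGraph).getAnswerScore(ans) for ans in answers]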
Example #8
import os, sys
import QAUtils as utils
from whoosh.fields import Schema, TEXT
from whoosh.index import create_in, open_dir
from whoosh.qparser import QueryParser

# 0. Set global parameters
cache = '../Dropbox/ScienceQASharedCache/'

# 1. Get corpus
corpus = utils.loadData(cache + 'allTextLines')[:100]

# 2. Index using whoosh
schema = Schema(content=TEXT, stored_content=TEXT(stored=True))
if not os.path.exists(cache + 'IRindex'):
	os.mkdir(cache + 'IRindex')
ix = create_in(cache + 'IRindex', schema)

writer = ix.writer()
for i, line in enumerate(corpus):
	sys.stdout.write('\rAdding line {} of {} to index'.format(i+1, len(corpus)))
	sys.stdout.flush()
	writer.add_document(content = line, stored_content = line)
writer.commit()

# Try out a search
with ix.searcher() as searcher:
	query = QueryParser('content', ix.schema).parse('Turkey')
	results = searcher.search(query)
	for hit in results:
		print(hit['stored_content'])
Example #9
###############################################################
# Setup

print('Initializing spacy...')
nlp = spacy.en.English()
print('Done!')

cache = '../Dropbox/ScienceQASharedCache/'

search_engine_id = '017856859473145577022:dswlvnrydbq'
api_key = 'AIzaSyDhCTJt6qh5UkH-t_p8_M2wZAI07NFNV_Y'
queryResultLimit = 5

# load FB from local store
if os.path.isfile(cache + 'FB_relations.p'): freebaseRelations = utils.loadData(cache + 'FB_relations.p')
else: freebaseRelations = {}

# load WN from local store
if os.path.isfile(cache + 'WN_relations.p'): wnRelations = utils.loadData(cache + 'WN_relations.p')
else: wnRelations = {}
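The two load-from-cache stanzas above repeat one pattern; a small helper could factor it out (a sketch; `loadOrInit` is a hypothetical name, not in the original module):

def loadOrInit(path):
	'''Load a pickled cache if present, else start empty (hypothetical helper).'''
	return utils.loadData(path) if os.path.isfile(path) else {}

# freebaseRelations = loadOrInit(cache + 'FB_relations.p')
# wnRelations = loadOrInit(cache + 'WN_relations.p')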

###############################################################
# Utility Functions

# def getGoogleSnippets(q):
# 	'''Returns top 20 google snippets for search term q'''
# 	print('Searching for google snippets for query:', q)
# 	search_term = q
# 	service = build('customsearch', 'v1', developerKey=api_key)
# 	collection = service.cse()
Example #10
	def takeTest(self):
		self.reset()
		densityCorrect = 0
		searchCorrect = 0
		w2vCorrect = 0
		# Take test
		for num, question in enumerate(self.test):
			print('\nQuestion {} ---------------------------'.format(num+1))
			# Think about question -> Generate scene
			start = time.time()
			questionText, answers, correctAnswer = question

			print('Question: {}'.format(questionText))

			# save mindmap for question
			if questionText in self.mindmaps:
				print('Mindmap accessed from local store!')
				wordGraph = self.mindmaps[questionText]
			else:
				wordGraph = WordGraph(questionText, self.N)
				self.mindmaps[questionText] = wordGraph
				utils.saveData(self.mindmaps, cache + 'mindmaps.p')
				print('Mindmap saved.')

			keywords = wordGraph.questionKeywords

			# Get density & search scores
			densityScores = []
			# searchScores = []
			# word2vecScores = []
			for ans in answers:
				questionGraph = copy.deepcopy(wordGraph)
				densityScore = questionGraph.getAnswerScore(ans)
				densityScores.append(densityScore)
				# searchScores.append(searchScore)
				# word2vecScores.append(util.averageSimilarity(keywords, ans))

			# Mark using density score
			density_index = densityScores.index(max(densityScores))
			if self.LETTERS[density_index] == correctAnswer:
				self.correct += 1
				densityCorrect += 1
			else:
				self.incorrect += 1

			# Mark question using search scores
			# search_index = searchScores.index(min(searchScores))
			# if self.LETTERS[search_index] == correctAnswer:
			# 	searchCorrect += 1

			# Mark question using word2vec
			# w2v_index = word2vecScores.index(max(word2vecScores))
			# if self.LETTERS[w2v_index] == correctAnswer:
			# 	w2vCorrect += 1

			end = time.time()

			self.answerReport.append((densityScores, density_index, correctAnswer))
			self.timeReport.append(end - start)

		print('Out of {} questions'.format(len(self.test)))
		print('Density: {}'.format(densityCorrect))
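Hedged usage sketch (assumes the `reset()` method and the `__init__` from Example #4; a non-'val' dataType is needed so each question tuple carries its correct answer):

exam = Test(start=0, end=100, dataType='train', N=6)
exam.takeTest()
print('Accuracy: {:.1%}'.format(exam.correct / (exam.correct + exam.incorrect)))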
Example #11
import util
import pickle
import copy
import time
import os
import QAUtils as utils

cache = '../Dropbox/ScienceQASharedCache/'

regentsDataPath = cache + 'Regents_Train.tsv'
trainData = cache + 'training_set.tsv'
validationData = cache + 'validation_set.tsv'

# second order keywords
if os.path.isfile(cache + 'keywords.p'): localKeywords = utils.loadData(cache + 'keywords.p')
else: localKeywords = {}

class WordGraph:
	def __init__(self, question, N):
		# print('Question:', question)
		self.graph = {}
		self.N = N
		self.questionKeywords = util.getKeywords(question)
		# print('Question keywords extracted:', self.questionKeywords)

		self.importance = {kw: 1/len(self.questionKeywords) for kw in self.questionKeywords}
		# self.importance = util.getImportanceDict(question)
		# print('Keyword importance:', self.importance)

		self.secondOrderKeywords = localKeywords[question] if question in localKeywords else self.bestWords()
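A minimal construction sketch (hedged: `util.getKeywords` and the `bestWords` method live elsewhere in the module, so output depends on them):

wordGraph = WordGraph('Which gas do green plants absorb from the air?', 6)
print(wordGraph.questionKeywords)        # keywords extracted from the question text
print(wordGraph.secondOrderKeywords[:10])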