def getWNRelations(w):
    # print('Getting WordNet relations for word: ', w)
    if w in wnRelations:
        # print('Accessed from local store!')
        return wnRelations[w]
    neighbors = []
    for synset in wn.synsets(w):
        hypernyms = synset.hypernyms()
        for h in hypernyms:
            for l in h.lemmas():
                full_name = getWords(l.name())
                for word in full_name:
                    # not the same word, not an empty string
                    if word != w.lower() and len(word) > 0:
                        neighbors.append(word)
    # get rid of duplicates
    neighbors = list(set(neighbors))
    # save locally
    wnRelations[w] = neighbors
    utils.saveData(wnRelations, cache + 'WN_relations.p')
    print('WN relations saved:', len(neighbors))
    return neighbors
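# `wn` is assumed to be nltk's WordNet interface (`from nltk.corpus import
# wordnet as wn`), and getWords (defined elsewhere in this module) is assumed
# to split a lemma name such as 'body_of_water' into its lowercase tokens.
# A minimal sketch of that assumed behaviour:
#
#   def getWords(name):
#       return [t for t in name.lower().replace('_', ' ').split() if t]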
def getFBRelations(w):
    # check local store first, else fetch new relations
    if w in freebaseRelations:
        print('Accessed from local store!')
        return freebaseRelations[w]
    print('Getting FB relations for word: ', w)
    neighbors = []
    mids = searchQuery(w)
    for mid in mids:
        triples = topicQuery(mid)
        for triple in triples:
            t, _ = triple
            # print(triple)
            # Check if words are the same as |query|
            words = getWords(t[0]) + getWords(t[2])
            for word in words:
                # exclude blank strings
                if word != w.lower() and len(word) > 0:
                    neighbors.append(word)
    # get rid of duplicates
    neighbors = list(set(neighbors))
    # Save relations to the local store
    freebaseRelations[w] = neighbors
    utils.saveData(freebaseRelations, cache + 'FB_relations.p')
    print('FB relations saved:', len(neighbors))
    return neighbors
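# searchQuery and topicQuery (defined elsewhere in this module) are assumed to
# wrap the Freebase Search and Topic HTTP APIs (since retired by Google). A
# minimal sketch of searchQuery under that assumption; api_key and
# queryResultLimit come from this module's setup block, and the response shape
# mirrors the historical API:
import requests

def searchQuerySketch(w):
    url = 'https://www.googleapis.com/freebase/v1/search'
    params = {'query': w, 'key': api_key, 'limit': queryResultLimit}
    response = requests.get(url, params=params).json()
    # each result carries a machine id ('mid') that topicQuery consumes
    return [r['mid'] for r in response.get('result', [])]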
def getSecondOrderKeywords(self):
    keywords = []
    for num, question in enumerate(self.test):
        print('\nQuestion {} ---------------------------'.format(num+1))
        if self.dataType == 'val':
            questionText, answers = question
        else:
            questionText, answers, correctAnswer = question
        # If available locally, use it
        if questionText in localKeywords:
            print('keywords accessed locally')
            keywords += localKeywords[questionText]
        else:
            wordGraph = WordGraph(questionText, self.N)
            newKeywords = wordGraph.getSecondOrderKeywords()
            keywords += newKeywords
            localKeywords[questionText] = newKeywords
            utils.saveData(localKeywords, cache + 'keywords.p')
            print('keywords saved.')
    keywords = list(set(keywords))
    print('{} second order keywords found from {} questions'.format(len(keywords), num + 1))
    return keywords
def __init__(self, start, end, dataType, N):
    self.LETTERS = ['A', 'B', 'C', 'D']
    self.fullTest = self.validationSet()
    self.dataType = dataType
    self.test = [q for i, q in enumerate(self.fullTest) if start <= i < end]
    self.correct = 0
    self.incorrect = 0
    self.answerReport = []
    self.searchAnswerReport = []
    self.timeReport = []
    self.N = N
    # instantiate mindmaps from the local cache if one exists
    if os.path.isfile(cache + 'mindmaps.p'):
        self.mindmaps = utils.loadData(cache + 'mindmaps.p')
    else:
        self.mindmaps = {}
def getSearchFromFile():
    '''Opens local copy of search results'''
    searchResults = utils.loadData(cache + 'searchResults.p')
    searchObject = json.loads(searchResults)
    snippetDoc = ''
    items = searchObject['items']
    for i in items:
        snippetDoc += i['snippet']
    return snippetDoc
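# The cached payload is assumed to follow the Google Custom Search response
# shape, of which only 'items' and each item's 'snippet' field are consumed,
# e.g.:
#
#   {"items": [{"snippet": "Photosynthesis is the process ..."},
#              {"snippet": "Plants convert light energy ..."}]}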
import searchText as scraper
import util
import QAUtils as utils
from Models import Test
import pickle, os, time

cache = '../Dropbox/ScienceQASharedCache/'

# Get local copy of Freebase relations
if os.path.isfile(cache + 'FB_relations.p'):
    freebaseRelations = utils.loadData(cache + 'FB_relations.p')
else:
    freebaseRelations = {}

# Setup for worker pool
poolWorkerNum = 200
poolIterations = 2
poolRedundancies = False

# Get all keywords
eighthGradeExam = Test(start=0, end=8132, dataType='val', N=6)
keywords = eighthGradeExam.getSecondOrderKeywords()

# Save second order keywords
utils.saveData(keywords, cache + 'SecondOrderKeywords.p')
print('Keywords saved.')

# Filter out keywords already in the local freebaseRelations cache
keywords = [kw for kw in keywords if kw not in freebaseRelations]
print('Number of keywords left to fetch: {}'.format(len(keywords)))
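# The pool parameters above are consumed elsewhere; a sketch of how they might
# drive parallel fetching of relations for the remaining keywords, assuming a
# thread pool suits the network-bound Freebase calls (fetchAllRelations and the
# use of scraper.getFBRelations here are illustrative, not from the original):
from multiprocessing.dummy import Pool

def fetchAllRelations(words):
    with Pool(poolWorkerNum) as pool:
        return pool.map(scraper.getFBRelations, words)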
# - spacy word2vec cosine distance between question and answer (own and average of four)
# - spacy word2vec cosine distance between answer option and other options (own and average of four)
print('- Basic formatting')
trainX = extractor.basicFormatFeatures(trainPairedQA)
valX = extractor.basicFormatFeatures(valPairedQA)
print(trainX.shape)

# Feature measuring proximity of a given Q-A pair to authoritative texts
# - Q-A pair combined into a single statement, then a search measures the distance
#   to the closest sentence in the text
# - Authoritative texts are Wikipedia and the CK12 free online textbooks for
#   elementary school children
# - Two measures given: one requiring relatively strict matches, one allowing loose matches
# - Returns both the absolute value and the average of the other 3 answers
print('- Text match features')
if os.path.isfile(cache + 'trainX'):
    train_textMatch = utils.loadData(cache + 'trainX')
else:
    train_textMatch = extractor.getTextMatchFeatures(trainPairedQA, kList=[100, 10, 100, 1000, 3])
    utils.saveData(train_textMatch, cache + 'trainX')
print(train_textMatch.shape)
trainX = extractor.concat(trainX, train_textMatch)
# if os.path.isfile(cache + 'valX'):
#     valX = extractor.concat(valX, utils.loadData(cache + 'valX'))
# else:
#     valX = extractor.getTextMatchFeatures(valPairedQA, kList=[100, 10, 100, 1000, 3])
# print(trainX.shape)

# Features from the keyword graph from the Aristo paper
# - size of question graph, size of answer graph, coherence score of answers,
#   coherence score of question keywords, number of pruned words for each Q-A pair
print('- Keyword graph features')
import os, sys
import QAUtils as utils
from whoosh.fields import *
from whoosh.index import *
from whoosh.query import *
from whoosh.qparser import QueryParser

# 0. Set global parameters
cache = '../Dropbox/ScienceQASharedCache/'

# 1. Get corpus
corpus = utils.loadData(cache + 'allTextLines')[:100]

# 2. Index using whoosh
schema = Schema(content=TEXT, stored_content=TEXT(stored=True))
if not os.path.exists(cache + 'IRindex'):
    os.mkdir(cache + 'IRindex')
    create_in(cache + 'IRindex', schema)
ix = open_dir(cache + 'IRindex')
writer = ix.writer()
for i, line in enumerate(corpus):
    sys.stdout.write('\rAdding line {} of {} to index'.format(i+1, len(corpus)))
    sys.stdout.flush()
    writer.add_document(content=line, stored_content=line)
writer.commit()

# Try out a search
with ix.searcher() as searcher:
    query = QueryParser('content', ix.schema).parse('Turkey')
    results = searcher.search(query)
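    # Print the stored content of each hit; `results` holds whoosh Hit objects
    # exposing the stored fields declared in the schema above ('Turkey' is just
    # a sample query).
    for hit in results:
        print(hit['stored_content'])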
###############################################################
# Setup

print('Initializing spacy...')
nlp = spacy.en.English()
print('Done!')

cache = '../Dropbox/ScienceQASharedCache/'
search_engine_id = '017856859473145577022:dswlvnrydbq'
api_key = 'AIzaSyDhCTJt6qh5UkH-t_p8_M2wZAI07NFNV_Y'
queryResultLimit = 5

# load FB relations from local store
if os.path.isfile(cache + 'FB_relations.p'):
    freebaseRelations = utils.loadData(cache + 'FB_relations.p')
else:
    freebaseRelations = {}

# load WN relations from local store
if os.path.isfile(cache + 'WN_relations.p'):
    wnRelations = utils.loadData(cache + 'WN_relations.p')
else:
    wnRelations = {}

###############################################################
# Utility Functions

# def getGoogleSnippets(q):
#     '''Returns top 20 google snippets for search term q'''
#     print('Searching for google snippets for query:', q)
#     search_term = q
#     service = build('customsearch', 'v1', developerKey=api_key)
#     collection = service.cse()
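# The commented-out helper above is truncated; a minimal working sketch of the
# same idea using google-api-python-client. The num=10 cap and the
# .get('items', []) fallback are assumptions, not taken from the original:
from googleapiclient.discovery import build

def getGoogleSnippetsSketch(q):
    '''Returns snippets for search term q via the Custom Search API'''
    service = build('customsearch', 'v1', developerKey=api_key)
    response = service.cse().list(q=q, cx=search_engine_id, num=10).execute()
    return [item['snippet'] for item in response.get('items', [])]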
def takeTest(self):
    self.reset()
    densityCorrect = 0
    searchCorrect = 0
    w2vCorrect = 0
    # Take test
    for num, question in enumerate(self.test):
        print('\nQuestion {} ---------------------------'.format(num+1))
        # Think about question -> Generate scene
        start = time.time()
        questionText, answers, correctAnswer = question
        print('Question: {}'.format(questionText))
        # save mindmap for question
        if questionText in self.mindmaps:
            print('Mindmap accessed from local store!')
            wordGraph = self.mindmaps[questionText]
        else:
            wordGraph = WordGraph(questionText, self.N)
            self.mindmaps[questionText] = wordGraph
            utils.saveData(self.mindmaps, cache + 'mindmaps.p')
            print('Mindmap saved.')
        keywords = wordGraph.questionKeywords
        # Get density & search scores
        densityScores = []
        # searchScores = []
        # word2vecScores = []
        for ans in answers:
            questionGraph = copy.deepcopy(wordGraph)
            densityScore = questionGraph.getAnswerScore(ans)
            densityScores.append(densityScore)
            # searchScores.append(searchScore)
            # word2vecScores.append(util.averageSimilarity(keywords, ans))
        # Mark using density score
        density_index = densityScores.index(max(densityScores))
        if self.LETTERS[density_index] == correctAnswer:
            self.correct += 1
            densityCorrect += 1
        else:
            self.incorrect += 1
        # Mark question using search scores
        # search_index = searchScores.index(min(searchScores))
        # if self.LETTERS[search_index] == correctAnswer:
        #     searchCorrect += 1
        # Mark question using word2vec
        # w2v_index = word2vecScores.index(max(word2vecScores))
        # if self.LETTERS[w2v_index] == correctAnswer:
        #     w2vCorrect += 1
        end = time.time()
        self.answerReport.append((densityScores, density_index, correctAnswer))
        self.timeReport.append(end - start)
    print('Out of {} questions'.format(len(self.test)))
    print('Density: {}'.format(densityCorrect))
import util
import pickle
import copy
import time
import os
import QAUtils as utils

cache = '../Dropbox/ScienceQASharedCache/'
regentsDataPath = cache + 'Regents_Train.tsv'
trainData = cache + 'training_set.tsv'
validationData = cache + 'validation_set.tsv'

# second order keywords
if os.path.isfile(cache + 'keywords.p'):
    localKeywords = utils.loadData(cache + 'keywords.p')
else:
    localKeywords = {}

class WordGraph:

    def __init__(self, question, N):
        # print('Question:', question)
        self.graph = {}
        self.N = N
        self.questionKeywords = util.getKeywords(question)
        # print('Question keywords extracted:', self.questionKeywords)
        self.importance = {kw: 1/len(self.questionKeywords) for kw in self.questionKeywords}
        # self.importance = util.getImportanceDict(question)
        # print('Keyword importance:', self.importance)
        self.secondOrderKeywords = localKeywords[question] if question in localKeywords else self.bestWords()
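# Usage sketch (the question string and N are illustrative; N is the
# neighbour-count parameter the Test class passes through):
#
#   wg = WordGraph('Which organ pumps blood through the body?', 6)
#   wg.questionKeywords      # keywords extracted by util.getKeywords
#   wg.secondOrderKeywords   # cached in keywords.p after the first run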