import json
import os
from collections import deque, OrderedDict

# Project-level names used below (WordExplorer, SentencePerformance, STOPWORDS,
# WORD_TREE_BREADTH, WORD_TREE_DEPTH) are assumed to be imported or defined
# elsewhere in this module.
class Evaluator(object):

    def __init__(self, dbPath):
        self.wordExplorer = WordExplorer(dbPath)
        self.initWordCaptureTally()

    def getMaxDepthAllSentences(self):
        '''
        Runs through all sentences this Evaluator instance has measured,
        and returns the deepest depth of all sentences:
        '''
        maxDepth = 0
        for sentencePerf in self.performanceTally:
            maxDepth = max(sentencePerf.getDeepestDepth(), maxDepth)
        return maxDepth

    def toCSV(self, outFileFD=None):
        csv = self.getCSVHeader() + '\n'
        for sentencePerf in self.performanceTally:
            csv += sentencePerf.toCSV() + '\n'
        if outFileFD is not None:
            try:
                outFileFD.write(csv)
                outFileFD.flush()
            except IOError:
                print "Warning: could not write to outfile FD: %s" % str(outFileFD)
        return csv

    def getCSVHeader(self):
        header = 'EmailID,SentenceID,SentenceLen,Failures,OutofSeq,NetFailure,NetSuccess'
        for depthIndex in range(1, self.getMaxDepthAllSentences() + 1):
            header += ',Depth_' + str(depthIndex)
        return header

    def extractWordSet(self, jsonEchoTreeStr):
        '''
        Given a JSON EchoTree, return the root word and a flat set
        of all follow-on words.
        @param jsonEchoTreeStr: JSON EchoTree structure of any depth/breadth
        @type jsonEchoTreeStr: string
        '''
        pythonEchoTree = json.loads(jsonEchoTreeStr)
        flatTree = self.extractWordSeqsHelper(pythonEchoTree)
        flatList = flatTree.split()
        rootWord = flatList[0]
        flatSet = set(flatList[1:])
        return (rootWord, flatSet)

    def getDepthFromWord(self, pythonEchoTree, word):
        '''
        Given a word, return its depth in the tree. Root position is 0.
        @param pythonEchoTree: Python encoded EchoTree
        @type pythonEchoTree: Dict
        @param word: word to find in the EchoTree
        @type word: string
        @return: the depth at which the word occurs in the tree, or None if not present.
        @rtype: {int | None}
        '''
        return self.getDepthFromWordHelper(pythonEchoTree, word, depth=0)

    def getDepthFromWordHelper(self, pythonEchoTree, wordToFind, depth=0):
        if pythonEchoTree is None:
            return None
        if pythonEchoTree['word'] == wordToFind:
            return depth
        for subtree in pythonEchoTree['followWordObjs']:
            newDepth = self.getDepthFromWordHelper(subtree, wordToFind, depth=depth+1)
            if newDepth is not None:
                return newDepth
        return None

    def extractSentences(self, jsonEchoTreeStr):
        '''
        Print all sentences that can be made from the EchoTree.
        @param jsonEchoTreeStr: JSON encoded EchoTree
        @type jsonEchoTreeStr: string
        '''
        #sentenceStructs = self.extractWordSeqs(jsonEchoTreeStr)
        pass
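
    # Minimal usage sketch for extractWordSet() and getDepthFromWord(),
    # assuming a hand-built two-level EchoTree (hypothetical sample data):
    #
    #   jsonTree = ('{"word": "pig", "followWordObjs": ['
    #               '{"word": "truffle", "followWordObjs": []},'
    #               '{"word": "mud", "followWordObjs": []}]}')
    #   root, words = evaluator.extractWordSet(jsonTree)
    #   # root == u'pig', words == set([u'truffle', u'mud'])
    #   evaluator.getDepthFromWord(json.loads(jsonTree), 'pig')   # --> 0
    #   evaluator.getDepthFromWord(json.loads(jsonTree), 'mud')   # --> 1
    #   evaluator.getDepthFromWord(json.loads(jsonTree), 'corn')  # --> None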
    def extractWordSeqs(self, jsonEchoTreeStr):
        '''
        Given a JSON EchoTree structure, return a structure representing all
        'sentences' generated by the tree via a depth-first walk.
        Example: root
                     pig
                         truffle
                         mud
                     tree
                         deep
                         broad
        generates:
            deque([root,
                   OrderedDict([(tree, deque([broad, deep]))]),
                   OrderedDict([(pig, deque([mud, truffle]))])])
        from which one can generate:
            - root tree broad
            - root tree deep
            - root pig mud
            - root pig truffle
        @param jsonEchoTreeStr: JSON encoded EchoTree
        @type jsonEchoTreeStr: string
        '''
        pythonEchoTree = json.loads(jsonEchoTreeStr)
        flatTree = self.extractWordSeqsHelper(pythonEchoTree)
        flatQueue = deque(flatTree.split())
        # Number of words: breadth ** (depth-1) + 1
        numSibPops = WORD_TREE_BREADTH ** (WORD_TREE_DEPTH - 2)
        # Root word first:
        resDictQueue = deque([flatQueue[0]])
        for dummy in range(numSibPops):
            sibs = deque([])
            parentDict = OrderedDict()
            resDictQueue.append(parentDict)
            for dummy in range(WORD_TREE_BREADTH):
                sibs.append(flatQueue.pop())
            parentDict[flatQueue.pop()] = sibs
        return resDictQueue

    def extractWordSeqsHelper(self, pythonEchoTreeDict):
        '''
        Flattens a Python-encoded EchoTree into one space-separated string of
        words via a depth-first walk.
        Too-long example (it's what I had on hand):
        {u'word': u'reliability',
         u'followWordObjs': [
             {u'word': u'new',
              u'followWordObjs': [
                  {u'word': u'power', u'followWordObjs': []},
                  {u'word': u'generation', u'followWordObjs': []},
                  {u'word': u'business', u'followWordObjs': []},
                  {u'word': u'product', u'followWordObjs': []},
                  {u'word': u'company', u'followWordObjs': []}]},
             {u'word': u'issues',
              u'followWordObjs': [
                  {u'word': u'related', u'followWordObjs': []},
                  {u'word': u'need', u'followWordObjs': []},
                  {u'word': u'raised', u'followWordObjs': []},
                  {u'word': u'such', u'followWordObjs': []},
                  {u'word': u'addressed', u'followWordObjs': []}]},
             {u'word': u'legislation',
              u'followWordObjs': [
                  {u'word': u'passed', u'followWordObjs': []},
                  {u'word': u'allow', u'followWordObjs': []},
                  {u'word': u'introduced', u'followWordObjs': []},
                  {u'word': u'require', u'followWordObjs': []},
                  {u'word': u'provide', u'followWordObjs': []}]},
             {u'word': u'standards',
              u'followWordObjs': [
                  {u'word': u'conduct', u'followWordObjs': []},
                  {u'word': u'set', u'followWordObjs': []},
                  {u'word': u'needed', u'followWordObjs': []},
                  {u'word': u'facilitate', u'followWordObjs': []},
                  {u'word': u'required', u'followWordObjs': []}]},
             {u'word': u'problems',
              u'followWordObjs': [
                  {u'word': u'please', u'followWordObjs': []},
                  {u'word': u'California', u'followWordObjs': []},
                  {u'word': u'accessing', u'followWordObjs': []},
                  {u'word': u'arise', u'followWordObjs': []},
                  {u'word': u'occur', u'followWordObjs': []}]}]}
        @param pythonEchoTreeDict: Python encoded EchoTree
        @type pythonEchoTreeDict: dict
        '''
        res = ''
        word = pythonEchoTreeDict['word']
        res += ' ' + word
        if len(pythonEchoTreeDict['followWordObjs']) == 0:
            return res
        for subtree in pythonEchoTreeDict['followWordObjs']:
            res += self.extractWordSeqsHelper(subtree)
        return res

    def initWordCaptureTally(self):
        self.performanceTally = []
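
    # Minimal usage sketch for the tally workflow below, assuming hypothetical
    # token and ID values:
    #
    #   evaluator.initWordCaptureTally()
    #   evaluator.tallyWordCapture(['reliability', 'issues', 'arise'],
    #                              emailID=17, sentenceID=0)
    #   len(evaluator.performanceTally)   # --> 1
    #   print evaluator.toCSV()           # header plus one row for the sentence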
    def tallyWordCapture(self, sentenceTokens, emailID=-1, sentenceID=None):
        '''
        Measures overlap of each sentence token with trees created
        by this evaluator's database. Stopwords are removed here.
        Measures:
          - sentenceLen: number of words that are not stopwords.
          - failures: number of times a tree did not contain one of the
                words, and a new tree needed to be constructed by typing
                in the word.
          - outOfSeqs: number of times a future word in the sentence was
                in an earlier tree.
          - depths: for each tree depth, how many of the sentence's words
                appeared at that depth.
        Creates a SentencePerformance instance that stores the result measures.
        Adds that instance to this evaluator's performanceTally array.
        @param sentenceTokens: tokens that make up the sentence.
        @type sentenceTokens: [string]
        @param emailID: optional ID to identify from which email the given sentence was taken.
        @type emailID: <any>
        @param sentenceID: optional ID to identify the given sentence within its email.
        @type sentenceID: <any>
        '''
        # Remove stopwords and punctuation tokens:
        sentenceTokens = [word for word in sentenceTokens
                          if word.lower() not in STOPWORDS
                          and word not in [';', ',', ':', '!', '%']]
        # Make a new SentencePerformance instance, passing this evaluator,
        # the array of stopword-free tokens, and the index in the self.performanceTally
        # array at which this new SentencePerformance instance will reside:
        if sentenceID is None:
            sentenceID = len(self.performanceTally)
        sentencePerf = SentencePerformance(self, sentenceTokens, emailID=emailID, sentenceID=sentenceID)
        # Start for real:
        tree = self.wordExplorer.makeWordTree(sentenceTokens[0])
        treeWords = self.extractWordSet(self.wordExplorer.makeJSONTree(tree))[1]
        for wordPos, word in enumerate(sentenceTokens[1:]):
            word = word.lower()
            wordDepth = self.getDepthFromWord(tree, word)
            if wordDepth is None:
                # Wanted word is not in tree anywhere:
                sentencePerf.addFailure()
                # Is any of the future sentence words in the tree's word set?
                # (word is sentenceTokens[wordPos+1], so the words after it
                # start at wordPos+2.)
                if wordPos < len(sentenceTokens) - 2:
                    for futureWord in sentenceTokens[wordPos+2:]:
                        if futureWord in treeWords:
                            sentencePerf.addOutOfSeq()
                # Build a new tree by (virtually) typing in the word:
                tree = self.wordExplorer.makeWordTree(word)
                treeWords = self.extractWordSet(self.wordExplorer.makeJSONTree(tree))[1]
                continue
            # Found word in tree:
            sentencePerf.addWordDepth(wordDepth)
        # Finished looking at every token in the sentence.
        self.performanceTally.append(sentencePerf)

    def readSentence(self, fd):
        sentenceOpener = '['
        sentenceCloser = ']'
        res = ''
        # Find start of next sentence:
        while 1:
            try:
                letter = fd.read(1)
                if letter == sentenceOpener:
                    # Found start of sentence:
                    res = letter
                    break
                if len(letter) == 0:
                    # Gone through the whole file:
                    return None
            except IOError:
                return None
        while 1:
            try:
                letter = fd.read(1)
                # Reached end of file before closing bracket:
                if len(letter) == 0:
                    raise IOError
            except IOError:
                print "Warning: ignoring unfinished sentence: %s." % res
                return None
            res += letter
            if letter == sentenceCloser:
                return res

    def checksum(self, theStr):
        '''
        Returns the sum of all the given string's ASCII values.
        @param theStr: string to be checksummed.
        @type theStr: string
        @return: sum of ASCII values as checksum
        @rtype: int
        '''
        return sum(map(ord, theStr))
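
    # Worked example: checksum('ab') == ord('a') + ord('b') == 97 + 98 == 195.
    # measurePerformance() below uses this to derive a numeric email ID from
    # each token file path.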
    def measurePerformance(self, csvFilePath, dbFilePath, tokenFilePaths, verbose=False):
        '''
        Token files must hold a string as produced by the Stanford NLP core
        tokenizer/sentence segmenter. Ex: "[foo, bar, fum]". Notice the
        ',<space>' after each token. That is the token separator. Assumed
        that the db file is accessible for reading, that the csv file can
        be opened/created for output, and that the token file paths are
        accessible for reading.
        @param csvFilePath: path to the CSV output file.
        @type csvFilePath: string
        @param dbFilePath: path to the database file.
        @type dbFilePath: string
        @param tokenFilePaths: fully qualified paths to each token file.
        @type tokenFilePaths: [string]
        @param verbose: print progress reports if True.
        @type verbose: boolean
        @return: CSV formatted table.
        @rtype: string
        '''
        if verbose:
            numSentencesDone = 0
            reportEvery = 10  # progress every 10 sentences
        self.initWordCaptureTally()
        for tokenFilePath in tokenFilePaths:
            msgID = self.checksum(tokenFilePath)
            sentenceID = 0
            with open(tokenFilePath, 'r') as tokenFD:
                while 1:
                    pythonSentenceTokens = self.readSentence(tokenFD)
                    if pythonSentenceTokens is None:
                        # Done with one file.
                        break
                    self.tallyWordCapture(pythonSentenceTokens.split(', '), emailID=msgID, sentenceID=sentenceID)
                    sentenceID += 1
                    if verbose:
                        numSentencesDone += 1
                        if numSentencesDone % reportEvery == 0:
                            print "At file %s. Done %d sentences." % (os.path.basename(tokenFilePath), numSentencesDone)
        with open(csvFilePath, 'w') as csvFd:
            csvAll = self.toCSV(outFileFD=csvFd)
        if verbose:
            print csvAll
        return csvAll
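
# Minimal driver sketch for a full measurement run; all file paths below are
# hypothetical placeholders:
if __name__ == '__main__':
    evaluator = Evaluator('/path/to/ngrams.db')
    evaluator.measurePerformance('/tmp/treePerformance.csv',
                                 '/path/to/ngrams.db',
                                 ['/path/to/email1.tokens',
                                  '/path/to/email2.tokens'],
                                 verbose=True)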