def getFeatureVector(poem, authorVector=None):
    """Extract the scalar feature vector phi for a poem string.

    Runs poemFeatures.poemCharacter on the poem (author name and word
    pairs are not needed, so an empty author name is passed) and pulls
    out the five scalar style statistics.

    Args:
        poem: the raw poem text.
        authorVector: optional dict of per-author average statistics
            (as produced by styleTrainer).  When given, each feature is
            the L1 distance |poem_stat - author_stat| between the poem
            and the author characteristics, as the original comment
            intended.  When omitted (the default, and the original
            behavior), each feature is simply abs(poem_stat).

    Returns:
        dict mapping feature name -> non-negative float.
    """
    scalarKeys = ('numLines', 'avgWordLength', 'avgLineLength',
                  'rhymePercentAA', 'rhymePercentABA')
    poemVector = poemFeatures.poemCharacter(("", poem))
    phi = {}
    for key in scalarKeys:
        if authorVector is None:
            # Original behavior: abs of the raw poem statistic.
            phi[key] = abs(poemVector[key])
        else:
            # Per-feature L1 distance between poem and author stats.
            phi[key] = abs(poemVector[key] - authorVector[key])
    return phi
def styleTrainer():
    """Split each author's poems into training/testing halves and build
    per-author average style vectors from the training half.

    For every author returned by readUtil.getTrainingData(), the first
    half of their poems (floor division) is used to accumulate and then
    average the scalar statistics, and to merge the Counter-valued
    statistics; the second half is reserved for testing.

    Returns:
        authorVectors: dict author -> feature dict (scalars averaged
            over the training poems, Counters merged across them).
        testVectors: list of raw poemCharacter dicts for testing poems.
        trainingSet: list of the training (author, poem) entries.
        testingSet: list of the testing (author, poem) entries.
    """
    # Scalar stats are summed then averaged; Counter stats are merged.
    scalarKeys = ('numLines', 'avgWordLength', 'avgLineLength',
                  'rhymePercentAA', 'rhymePercentABA')
    counterKeys = ('wordPairs', 'wordsPerLine', 'linesPerPoem',
                   'typeTokenCount', 'wordDomain', 'poemStart')

    testVectors = []
    authorVectors = {}
    trainingSet = []
    testingSet = []
    data = readUtil.getTrainingData()

    for author, poems in data.items():
        # Floor division: under Python 3, `/ 2` would yield a float
        # (e.g. 2.5), skewing both the split and the averages below.
        halfPoems = len(poems) // 2

        vec = {'author': author}
        for key in scalarKeys:
            vec[key] = 0.0
        for key in counterKeys:
            vec[key] = collections.Counter()
        authorVectors[author] = vec

        for index, poem in enumerate(poems):
            poemVector = poemFeatures.poemCharacter(poem)
            if index < halfPoems:
                # First half: accumulate into the author's vector.
                trainingSet.append(poem)
                for key in scalarKeys:
                    vec[key] += poemVector[key]
                vec['wordPairs'].update(poemVector['wordPairs'])
                vec['wordsPerLine'].update(poemVector['wordsPerLine'])
                # Histogram counters: one bump per training poem.
                vec['linesPerPoem'][poemVector['numLines']] += 1
                vec['typeTokenCount'][poemVector['typeTokenCount']] += 1
                vec['wordDomain'].update(poemVector['wordDomain'])
                vec['poemStart'].update(poemVector['poemStart'])
            else:
                # Second half: held out for testing.
                testVectors.append(poemVector)
                testingSet.append(poem)

        # Average the scalar stats.  Guard against authors with fewer
        # than two poems, where halfPoems == 0 and the original code
        # would divide by zero.
        if halfPoems:
            for key in scalarKeys:
                vec[key] /= halfPoems

    return authorVectors, testVectors, trainingSet, testingSet