def buildTransitionMatrix(self, tagged_corpus: list, train_size):
    """Build a first-order tag-transition probability matrix.

    Counts (tag1 -> tag2) transition pairs over the first ``train_size``
    fraction of ``tagged_corpus`` and normalizes each row so that the
    successor probabilities of every tag sum to 1, rounded to 6 decimal
    places.

    Args:
        tagged_corpus: list of (tag1, tag2) transition pairs.
        train_size: fraction in [0, 1] of the corpus used for training.

    Returns:
        A ``ConditionalFreqDist`` mapping tag -> FreqDist of
        successor-tag probabilities; also stored on
        ``self.TRANSITION_MATRIX``.
    """
    train = tagged_corpus[:int(train_size * len(tagged_corpus))]
    # NOTE(review): shuffling has no effect on pair counts; kept only so
    # the global random state is consumed exactly as before.
    random.shuffle(train)

    # Count raw transition frequencies.  ConditionalFreqDist behaves like
    # defaultdict(FreqDist) and FreqDist like Counter, so the original's
    # explicit membership checks and zero-initialization are unnecessary.
    transition = ConditionalFreqDist()
    for (tag1, tag2) in train:
        transition[tag1][tag2] += 1

    # Normalize each row to probabilities.  The original rounded via a
    # string round-trip (float("{0:.6f}".format(x))) followed by a second
    # round(..., 6); a single round(x, 6) yields the same value.
    for tag in transition.keys():
        total = float(sum(transition[tag].values()))
        for successor in transition[tag].keys():
            transition[tag][successor] = round(transition[tag][successor] / total, 6)

    self.TRANSITION_MATRIX = transition
    return transition
# pickle.dump(docfreqs, open('docfreqs.p', 'w'))
# apparently this doesn't work because docfreqs is honkin' big


def idf(w):
    """Inverse document frequency of word *w* over the BNC.

    ``docfreqs[w].B()`` is the number of documents the word occurs in;
    the +1 on the corpus size smooths the numerator term.
    """
    return log(len(bnc.fileids()) + 1) - log(docfreqs[w].B())


def tf_idf(w):
    """tf-idf score of *w*: total corpus frequency times its idf."""
    # docfreqs[w].N() is how often the word occurs throughout the entire BNC
    return docfreqs[w].N() * idf(w)


# Vocabulary sorted by total corpus frequency, most frequent first,
# keeping only words seen more than twice.
wordlist = [
    w
    for w in sorted(docfreqs.keys(), key=lambda x: docfreqs[x].N(), reverse=True)
    if w not in stopset  # comment this out if want to include stops
    if docfreqs[w].N() > 2
]

# Row/column word -> index maps for the co-occurrence matrix.
r2i = dict((w, i) for (i, w) in enumerate(wordlist[:ROWS]))
# leave out the 50 most frequent words from the context columns
c2i = dict((w, i) for (i, w) in enumerate(wordlist[50:COLS + 50]))

# Alternative dumps for other preprocessing configurations:
# pickle.dump(r2i, open('r2iWithoutStops.p', 'w'))
# pickle.dump(c2i, open('c2iWithoutstops.p', 'w'))
# pickle.dump(r2i, open('r2iWithStops.p', 'w'))
# pickle.dump(c2i, open('c2iWithStops.p', 'w'))
# pickle.dump(r2i, open('r2iWithStopsNotLemmatized.p', 'w'))
# pickle.dump(c2i, open('c2iWithStopsNotLemmatized.p', 'w'))
# pickle.dump(r2i, open('r2iWithoutStopsNotLemmatized.p', 'w'))
# pickle.dump(c2i, open('c2iWithoutStopsNotLemmatized.p', 'w'))

# BUG FIX: pickle requires a binary-mode file on Python 3; the original
# opened in text mode ('w') and also leaked the file handle.
with open('r2iWithoutStopsLemmatized.p', 'wb') as fh:
    pickle.dump(r2i, fh)