from collections import defaultdict

from nltk.classify.naivebayes import NaiveBayesClassifier
from nltk.probability import ELEProbDist, FreqDist


# NLTK 2.x-era code: FreqDist.inc() was removed in NLTK 3.
def train(labeled_featuresets, estimator=ELEProbDist):  # ELEProbDist is a class; the class itself is passed as the estimator
    """
    :param labeled_featuresets: A list of classified featuresets,
        i.e., a list of tuples ``(featureset, label)``.
    """
    label_freqdist = FreqDist()
    feature_freqdist = defaultdict(FreqDist)  # dict whose values are FreqDists
    feature_values = defaultdict(set)  # dict whose values are sets
    fnames = set()

    # Count up how many times each feature value occurred, given
    # the label and featurename.
    for featureset, label in labeled_featuresets:  # input shape: [({feature dict}, label), ...]
        label_freqdist.inc(label)
        for fname, fval in featureset.items():
            # Increment freq(fval|label, fname).  featureset is a dict;
            # feature_freqdist[label, fname] is a FreqDist counting how often
            # each value of this feature co-occurs with this label.
            feature_freqdist[label, fname].inc(fval)
            # Record that fname can take the value fval (collected across all labels).
            feature_values[fname].add(fval)
            # Keep a list of all feature names.
            fnames.add(fname)

    # If a feature didn't have a value given for an instance, then
    # we assume that it gets the implicit value 'None.'  This loop
    # counts up the number of 'missing' feature values for each
    # (label, fname) pair, and increments the count of the fval
    # 'None' by that amount.
    for label in label_freqdist:
        num_samples = label_freqdist[label]  # total number of samples with this label
        for fname in fnames:
            # FreqDist.N() is the sum of all counts in the distribution, i.e. the
            # number of samples with this label for which fname had an explicit value.
            count = feature_freqdist[label, fname].N()
            # FreqDist.inc(key, n) adds n to key's count (n defaults to 1).
            # Each feature value's probability given a label is thus computed
            # over that label's total sample count.
            feature_freqdist[label, fname].inc(None, num_samples - count)
            feature_values[fname].add(None)  # None becomes a possible value of every feature

    # Create the P(label) distribution.
    label_probdist = estimator(label_freqdist)  # default ELE smoothing: gamma=0.5, bins=len(label_freqdist)

    # Create the P(fval|label, fname) distribution.  estimator is a smoothing
    # class such as LidstoneProbDist.
    feature_probdist = {}
    for ((label, fname), freqdist) in feature_freqdist.items():
        probdist = estimator(freqdist, bins=len(feature_values[fname]))
        feature_probdist[label, fname] = probdist

    return NaiveBayesClassifier(label_probdist, feature_probdist)
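A minimal usage sketch, assuming NLTK 2.x is available; the toy featuresets below are hypothetical and only illustrate the input shape train() expects. ELEProbDist is Lidstone smoothing with gamma = 0.5, i.e. P(x) = (c(x) + 0.5) / (N + 0.5 * bins).

# Usage sketch (illustration only; toy data is hypothetical, assumes NLTK 2.x).
train_set = [
    ({'outlook': 'sunny', 'windy': False}, 'no'),
    ({'outlook': 'sunny', 'windy': True}, 'no'),
    ({'outlook': 'overcast'}, 'yes'),  # 'windy' missing -> counted as None above
    ({'outlook': 'rainy', 'windy': False}, 'yes'),
]
classifier = train(train_set)
print(classifier.classify({'outlook': 'sunny', 'windy': False}))  # -> 'no'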
from fractions import Fraction

# `Cue` and the Fraction-counting `FreqDist` subclass are defined elsewhere
# in the segmenter codebase.


class FamiliarWordCue(Cue):
    '''
    Feature that scores words based on their lexical frequency.
    '''

    def __init__(self, mbdp=False, subseq_counts=None, witten_bell=False, bad_score=0):
        '''
        Initializes any counts to their default values, if necessary.

        @param mbdp: Use MBDP-1 score adjustments when calculating word scores.
        @type mbdp: L{bool}
        @param subseq_counts: A frequency distribution for storing subsequence
                              counts. Should use the same one for all L{Cues}
                              of the current L{Segmenter}.
        @type subseq_counts: L{FreqDist}
        @param witten_bell: Use Witten-Bell smoothing (like Venkataraman's
                            model) for familiar word scores. This also
                            multiplies sub-word scores by the Witten-Bell
                            normalizing factor. This is ignored in no-lexicon
                            mode.
        @type witten_bell: L{bool}
        @param bad_score: Score assigned to words that are not in the lexicon
                          when Witten-Bell smoothing is off.
        @type bad_score: L{int}
        '''
        super(FamiliarWordCue, self).__init__(Fraction(0), subseq_counts=subseq_counts)
        self._phonotactic = False
        self._lexicon = FreqDist(counttype=Fraction)
        self._mbdp = mbdp
        self._witten_bell = witten_bell
        self._bad_score = bad_score

    def in_lexicon(self, word):
        '''
        @return: whether or not the given word is in the lexicon.
        '''
        return word in self._lexicon

    @property
    def total_words(self):
        '''
        Total number of word tokens in the lexicon.
        '''
        return self._lexicon.N()

    def eval_word(self, word):
        '''
        @return: probability that the proposed word is a word.
        @todo: Implement lexical decay.
        '''
        if word in self._lexicon:
            word_count = Fraction(self._lexicon[word])
            if not self._mbdp:
                # Unlike the OCaml version, we're not adding the utterance
                # delimiter to the lexicon, so no subtraction.
                word_types = self._lexicon.B()
                raw_score = word_count / (Fraction(self.subseq_counts[word])
                                          if self.subseq_counts
                                          else (Fraction(self.total_words + word_types)
                                                if self._witten_bell
                                                else Fraction(self.total_words)))
            else:
                # MBDP-1 adjustment: ((n + 1) / (N + 1)) * (n / (n + 1))^2
                raw_score = (((word_count + Fraction(1)) / (self.total_words + Fraction(1))) *
                             ((word_count / (word_count + Fraction(1))) ** Fraction(2)))
        elif self._witten_bell:
            # Subtract one for the initial utterance-delimiter addition.
            word_types = Fraction(self._lexicon.B() - 1)
            raw_score = word_types / Fraction(self.total_words + word_types)
        else:
            raw_score = self._bad_score
        return raw_score  # lexical decay stuff would need to be added here

    def dump(self, dump_file):
        for word in self._lexicon.iterkeys():
            dump_file.write(word + str(self._lexicon[word]) + '\n')
        if self.subseq_counts:
            for seq in self.subseq_counts.iterkeys():
                dump_file.write(seq + str(self.subseq_counts[seq]) + '\n')
        dump_file.close()

    def use_score(self, word):
        return self.in_lexicon(word)

    def update_evidence(self, word, increase_amount):
        self._lexicon.inc(word, increase_amount)
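A minimal usage sketch, assuming the surrounding segmenter codebase supplies the Cue base class and the Fraction-counting FreqDist (neither is shown in this excerpt); the tokens below are hypothetical.

# Usage sketch (illustration only; depends on the rest of the segmenter codebase).
cue = FamiliarWordCue()
for token in ['the', 'dog', 'the']:
    cue.update_evidence(token, Fraction(1))  # one token of evidence per call
print(cue.in_lexicon('dog'))  # True
print(cue.total_words)        # 3 word tokens seen so far
print(cue.eval_word('the'))   # 2/3: plain relative frequency (no smoothing flags set)
print(cue.eval_word('cat'))   # 0: unseen words fall back to bad_score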