Example #1
    def train(labeled_featuresets, estimator=ELEProbDist):	# ELEProbDist is a class name; the class itself is passed in as the estimator
        """
        :param labeled_featuresets: A list of classified featuresets,
            i.e., a list of tuples ``(featureset, label)``.
        """
        label_freqdist = FreqDist()
        feature_freqdist = defaultdict(FreqDist)	# dict whose values are FreqDist objects
        feature_values = defaultdict(set)		# dict whose values are sets
        fnames = set()

        # Count up how many times each feature value occurred, given
        # the label and featurename.
        for featureset, label in labeled_featuresets:		# raw input: a list of ({feature dict}, label) tuples
            label_freqdist.inc(label)
            for fname, fval in featureset.items():
                # Increment freq(fval|label, fname)
                feature_freqdist[label, fname].inc(fval)	# featureset is a dict; feature_freqdist[label, fname] is a FreqDist counting how often the feature takes each value
                # Record that fname can take the value fval.
                feature_values[fname].add(fval)			# dict whose values are sets
                # Keep a list of all feature names.
                fnames.add(fname)

        # If a feature didn't have a value given for an instance, then
        # we assume that it gets the implicit value 'None.'  This loop
        # counts up the number of 'missing' feature values for each
        # (label,fname) pair, and increments the count of the fval
        # 'None' by that amount.
        for label in label_freqdist:
            num_samples = label_freqdist[label]			# total number of training samples with this label
            for fname in fnames:
                count = feature_freqdist[label, fname].N()	# FreqDist.N() sums all counts, i.e. how many samples with this label supplied a value for this feature
                feature_freqdist[label, fname].inc(None, num_samples-count)	# FreqDist.inc(key, count) adds count to key's tally (count defaults to 1)
										# each feature value's probability for a label is therefore computed over that label's total sample count

                feature_values[fname].add(None)			# add None to the set of possible values of every feature

        # Create the P(label) distribution
        label_probdist = estimator(label_freqdist)		# default smoothing: gamma=0.5, bins=len(label_freqdist)

        # Create the P(fval|label, fname) distribution
        feature_probdist = {}
        for ((label, fname), freqdist) in feature_freqdist.items():
            probdist = estimator(freqdist, bins=len(feature_values[fname]))	# estimator is a class used for probability smoothing, e.g. LidstoneProbDist
            feature_probdist[label,fname] = probdist

        return NaiveBayesClassifier(label_probdist, feature_probdist)
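
A minimal usage sketch for the train() method above. It assumes an NLTK 2.x environment, where train is the static constructor NaiveBayesClassifier.train and FreqDist still has the inc() method used in the listing; the toy featuresets are hypothetical:

from nltk.classify import NaiveBayesClassifier

# Hypothetical toy data: each instance is ({feature dict}, label).
labeled_featuresets = [
    ({'last_letter': 'a', 'length': 5}, 'female'),
    ({'last_letter': 'k', 'length': 4}, 'male'),
    ({'last_letter': 'a', 'length': 4}, 'female'),
    ({'last_letter': 'n', 'length': 6}, 'male'),
]

classifier = NaiveBayesClassifier.train(labeled_featuresets)

# A feature left out of a featureset is treated as the implicit value None
# counted in the loop above, and the default ELEProbDist estimator
# (expected likelihood estimation, i.e. Lidstone smoothing with gamma=0.5)
# keeps unseen values from getting zero probability.
print(classifier.classify({'last_letter': 'a', 'length': 6}))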
Example #2
class FamiliarWordCue(Cue):
    '''
        Feature that scores words based on their lexical frequency.
    '''
    def __init__(self,
                 mbdp=False,
                 subseq_counts=None,
                 witten_bell=False,
                 bad_score=0):
        '''
            Initializes any counts to their default values, if necessary
            @param mbdp: Use MBDP-1 score adjustments when calculating word scores.
            @type mbdp: L{bool}
            @param subseq_counts: A frequency distribution for storing subsequence counts. Should use the same one for all L{Cues}
                                  of the current L{Segmenter}.
            @type subseq_counts: L{FreqDist}
            @param witten_bell: Use Witten-Bell smoothing (like Venkataraman's model) for familiar word scores. This also multiplies
                                sub-word scores by Witten-Bell normalizing factor. This is ignored in no lexicon mode.
            @type witten_bell: L{bool}
            @param bad_score: Score returned by L{eval_word} for words that are not in the lexicon (when Witten-Bell smoothing is off).
            @type bad_score: L{int}
        '''
        super(FamiliarWordCue, self).__init__(Fraction(0),
                                              subseq_counts=subseq_counts)
        self._phonotactic = False
        self._lexicon = FreqDist(counttype=Fraction)
        self._mbdp = mbdp
        self._witten_bell = witten_bell
        self._bad_score = bad_score

    def in_lexicon(self, word):
        '''
            @return: whether or not the given word is in the lexicon.
        '''
        return word in self._lexicon

    @property
    def total_words(self):
        ''' Total number of word tokens in lexicon. '''
        return self._lexicon.N()

    def eval_word(self, word):
        '''
            @return: probability that proposed word is a word.
            @todo: Implement lexical decay.
        '''
        if word in self._lexicon:
            word_count = Fraction(self._lexicon[word])
            if not self._mbdp:
                # Unlike OCaml version we're not adding utterance delimiter to lexicon, so no subtraction.
                word_types = self._lexicon.B()
                if self.subseq_counts:
                    denominator = Fraction(self.subseq_counts[word])
                elif self._witten_bell:
                    denominator = Fraction(self.total_words + word_types)
                else:
                    denominator = Fraction(self.total_words)
                raw_score = word_count / denominator
            else:
                # MBDP-1 adjustment: ((c + 1) / (N + 1)) * (c / (c + 1))^2
                raw_score = (((word_count + Fraction(1)) / (self.total_words + Fraction(1)))
                             * ((word_count / (word_count + Fraction(1))) ** Fraction(2)))
        elif self._witten_bell:
            word_types = Fraction(self._lexicon.B() - 1)  # Subtract one for initial utterance delimiter addition
            raw_score = word_types / Fraction(self.total_words + word_types)
        else:
            raw_score = self._bad_score
        return raw_score  # lexical decay stuff would need to be added here

    def dump(self, dump_file):
        for word in self._lexicon.iterkeys():
            dump_file.write(word + '\t' + str(self._lexicon[word]) + '\n')  # tab-separate word and count so the dump stays parseable
        if self._subseq_counts:
            for seq in self.subseq_counts.iterkeys():
                dump_file.write(seq + '\t' + str(self.subseq_counts[seq]) + '\n')
        dump_file.close()

    def use_score(self, word):
        return self.in_lexicon(word)

    def update_evidence(self, word, increase_amount):
        self._lexicon.inc(word, increase_amount)
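
A short, hypothetical usage sketch of FamiliarWordCue. It assumes the surrounding segmenter codebase supplies the Cue base class and the custom FreqDist used above (stock NLTK's FreqDist accepts no counttype argument), with default settings (no MBDP-1, no Witten-Bell smoothing):

from fractions import Fraction

cue = FamiliarWordCue()
cue.update_evidence('doggy', Fraction(1))   # first sighting of 'doggy'
cue.update_evidence('doggy', Fraction(1))   # second sighting
cue.update_evidence('ball', Fraction(1))

print(cue.in_lexicon('doggy'))   # True
print(cue.total_words)           # 3 word tokens in the lexicon
print(cue.eval_word('doggy'))    # 2/3: relative frequency under default settings
print(cue.eval_word('kitty'))    # 0: unknown word gets bad_score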