示例#1
0
    def init_GT_smoothing(self):
        """Build the frequency-of-frequency tables and fits kept in ``self.nr``.

        ("GT" presumably stands for Good-Turing; the code itself only builds
        count-of-count tables and fits a line through them.)

        For every ``(i, state)`` pair produced by iterating
        ``self.state_counts`` two entries are written:

        ``self.nr[(i, state), 'coincedence']``
            ``(a, b, table)`` where ``table`` maps a count ``r`` read from
            ``self.coincedences[i, obs, state]`` to the number of
            observations having that count.  Key ``0`` holds
            ``self.unseen_coincedences * len(self.states)``.
        ``self.nr[(i, state), 'bigram']``
            ``(a, b, table)`` built the same way from
            ``self.bigrams[i, state, state1]`` over the pairs of
            ``self.state_counts`` sharing the same ``i``.  Key ``0`` holds
            the number of such pairs that are missing from ``self.bigrams``.

        ``a`` and ``b`` are the two values returned by ``tools.linear_fit``
        for the sampled points (the model is ``nr = a + b*log(x)``).
        """
        self.nr = {}
        for pos, state in self.state_counts:
            # Count-of-counts for (observation, state) coincidences.
            coincidence_table = {}
            for obs in self.observations:
                key = (pos, obs, state)
                if key not in self.coincedences:
                    continue
                freq = self.coincedences[key]
                coincidence_table[freq] = coincidence_table.get(freq, 0) + 1

            # Count-of-counts for transitions out of `state`, restricted to
            # the entries of state_counts that share this position.
            bigram_table = {}
            unseen_bigrams = 0
            for other_pos, next_state in self.state_counts:
                if other_pos != pos:
                    continue
                key = (pos, state, next_state)
                if key in self.bigrams:
                    freq = self.bigrams[key]
                    bigram_table[freq] = bigram_table.get(freq, 0) + 1
                else:
                    unseen_bigrams += 1

            # --- coincidence fit -------------------------------------------
            unseen_mass = self.unseen_coincedences * len(self.states)
            # NOTE(review): the unseen point sits at x=1 whereas every other
            # point uses log(r + 1), which would be 0 for r=0 -- confirm
            # this is intended.
            xs = [1]
            xs.extend(math.log(freq + 1) for freq in coincidence_table)
            ys = [unseen_mass]
            ys.extend(coincidence_table.values())
            # The fitted line can dip below zero; a far-away point with zero
            # frequency is appended to pull it back up.
            xs.append(math.log(100000))
            ys.append(0)
            coincidence_table[0] = unseen_mass
            fit_a, fit_b = tools.linear_fit(xs, ys)
            self.nr[(pos, state), 'coincedence'] = (fit_a, fit_b, coincidence_table)

            # --- bigram fit ------------------------------------------------
            # NOTE(review): unlike the coincidence fit this one uses log(r)
            # and count + 1 -- verify the asymmetry is deliberate.
            ps = [0]
            ps.extend(math.log(freq) for freq in bigram_table)
            qs = [unseen_bigrams]
            qs.extend(times + 1 for times in bigram_table.values())
            # Same zero-frequency anchor as above, at a smaller x.
            ps.append(math.log(100))
            qs.append(0)
            bigram_table[0] = unseen_bigrams
            fit_a, fit_b = tools.linear_fit(ps, qs)
            self.nr[(pos, state), 'bigram'] = (fit_a, fit_b, bigram_table)
示例#2
0
    def init_GT_smoothing(self):
        """Fit a log-log frequency-of-frequency line for every state.

        ("GT" presumably stands for Good-Turing; the code itself only builds
        count-of-count tables and fits a line through them.)

        For each ``state`` in ``self.states`` a table is built that maps a
        count ``r`` (read from ``self.coincedences[obs, state]``) to the
        number of observations in ``self.observations`` having that count.
        Observations with no entry for the state are added on top of
        ``self.unseen_coincedences`` to form the state's ``unseen`` total.

        Results are stored in ``self.nr``:

        * ``self.nr[state] = (a, b, unseen)`` where ``a`` and ``b`` are the
          two values returned by ``tools.linear_fit`` on the points
          ``(log r, log N_r)``;
        * ``self.nr[state] = 'undefined'`` when the table has fewer than two
          distinct counts, because no line can be fitted through zero or
          one point.

        A summary of how many states ended up ``'undefined'`` is printed.
        """
        self.nr = {}
        undef = 0
        for state in self.states:
            nr = {}
            unseen = self.unseen_coincedences
            for obs in self.observations:
                key = (obs, state)
                if key in self.coincedences:
                    freq = self.coincedences[key]
                    nr[freq] = nr.get(freq, 0) + 1
                else:
                    unseen += 1

            # We can't use these: a fit needs at least two points.  This also
            # covers a state with no observed coincidences at all, which
            # previously fell through to a fallback that read an unbound (or
            # stale) loop variable.
            if len(nr) < 2:
                self.nr[state] = 'undefined'
                undef += 1
                continue

            # Find a least squares fit of the frequency counts in log-log
            # space.
            x = [math.log(freq) for freq in nr]
            y = [math.log(count) for count in nr.values()]
            (a, b) = tools.linear_fit(x, y)

            # Debug output for the richer tables: raw counts followed by the
            # fitted values.
            if len(nr) > 5:
                print(list(nr.values()))
                print([math.exp(a + b * math.log(freq + 1)) for freq in nr])
            self.nr[state] = (a, b, unseen)
        print("Smoothing disabled on {0} out of {1} states".format(undef, len(self.states)))