示例#1
0
    def m_step(self, all_word_paths):
        """Re-estimate alignment scores from the paths collected per word.

        ``all_word_paths`` is iterated as a sequence of words, each word being
        a sequence of ``(elements, score)`` pairs, where ``elements`` yields
        ``(a_element, b_element)`` pairs. Scores appear to be log-domain
        values: they are combined with ``logSum``/``logAdd`` and normalised
        by subtraction.

        The raw pseudocount table is appended to ``self.pseudocounts``. The
        returned table has the same ``{a_element: {b_element: value}}`` shape,
        with every row normalised by the ``logSum`` of its finite entries.
        """
        counts = {}

        for word_paths in all_word_paths:
            if len(word_paths) == 0:
                continue

            # normaliser for this word: only finite path scores contribute
            finite_scores = [entry[1] for entry in word_paths
                             if not isinf(entry[1])]
            word_norm = logSum(finite_scores)

            for elements, score in word_paths:
                weight = score - word_norm

                for a_key, b_key in elements:
                    # a-side elements missing from the latest score table
                    # are pooled under the None key
                    if a_key not in self.alignment_scores[-1]:
                        a_key = None
                    row = counts.setdefault(a_key, {})

                    if b_key in row and not isinf(row[b_key]):
                        row[b_key] = logAdd(weight, row[b_key])
                    else:
                        # first sighting, or the stored value is infinite
                        row[b_key] = weight

        self.pseudocounts.append(counts)

        # normalise every row by the logSum of its finite pseudocounts
        probs = {}
        for a_key, row in counts.items():
            row_norm = logSum([c for c in row.values() if not isinf(c)])
            if isinf(row_norm):
                # no usable normaliser: every entry of the row gets log(0)
                probs[a_key] = {b_key: log(0) for b_key in row}
            else:
                probs[a_key] = {b_key: c - row_norm
                                for b_key, c in row.items()}

        return probs
示例#2
0
    def m_step(self, all_word_paths):
        """Re-estimate alignment scores from the paths collected per word.

        ``all_word_paths`` is iterated as a sequence of words, each word being
        a sequence of ``(elements, score)`` pairs, where ``elements`` yields
        ``(a_element, b_element)`` pairs. Scores appear to be log-domain
        values: they are combined with ``logSum``/``logAdd`` and normalised
        by subtraction.

        The raw pseudocount table is appended to ``self.pseudocounts``. The
        returned table has the same ``{a_element: {b_element: value}}`` shape,
        with every row normalised by the ``logSum`` of its finite entries.
        """
        counts = {}

        for word_paths in all_word_paths:
            if len(word_paths) == 0:
                continue

            # normaliser for this word: only finite path scores contribute
            finite_scores = [entry[1] for entry in word_paths
                             if not isinf(entry[1])]
            word_norm = logSum(finite_scores)

            for elements, score in word_paths:
                weight = score - word_norm

                for a_key, b_key in elements:
                    # a-side elements missing from the latest score table
                    # are pooled under the None key
                    if a_key not in self.alignment_scores[-1]:
                        a_key = None
                    row = counts.setdefault(a_key, {})

                    if b_key in row and not isinf(row[b_key]):
                        row[b_key] = logAdd(weight, row[b_key])
                    else:
                        # first sighting, or the stored value is infinite
                        row[b_key] = weight

        self.pseudocounts.append(counts)

        # normalise every row by the logSum of its finite pseudocounts
        probs = {}
        for a_key, row in counts.items():
            row_norm = logSum([c for c in row.values() if not isinf(c)])
            if isinf(row_norm):
                # no usable normaliser: every entry of the row gets log(0)
                probs[a_key] = {b_key: log(0) for b_key in row}
            else:
                probs[a_key] = {b_key: c - row_norm
                                for b_key, c in row.items()}

        return probs
示例#3
0
    def e_step(self, alignment_scores):
        """Align every pair in ``self.ab_pairs`` and collect all paths.

        For each ``(a, b)`` pair a ``ViterbiAligner`` is built with
        ``alignment_scores`` and each path it returns is stored as an
        ``(elements, score)`` tuple. Pairs for which the aligner returns no
        paths are skipped entirely. The scores of all collected paths are
        accumulated with ``logAdd`` and the total is appended to
        ``self.likelihood``.

        When ``self.verbose`` is true, a tab-separated progress line
        (iteration number, ``time()``, pair index, pair) is appended to the
        file named by ``self.logfilename`` about once per 1% of the pairs.

        Returns:
            A list with one entry per aligned pair; each entry is a list of
            ``(elements, score)`` tuples.
        """
        all_word_paths = []
        likelihood = log(0)

        # Floor division with a floor of 1: with fewer than 100 pairs the
        # interval would otherwise be 0 and the modulo below would raise
        # ZeroDivisionError.
        positions_to_log = max(1, len(self.ab_pairs) // 100)

        for ab_idx, ab_pair in enumerate(self.ab_pairs):
            if self.verbose and not ab_idx % positions_to_log:
                # The tuple is parenthesised because the bare form
                # ``for s in x, y`` inside a comprehension is Python 2 only.
                fields = [str(s) for s in (self.iteration_number, time(),
                                           ab_idx, ab_pair)]
                # ``with`` closes the file even if the write fails;
                # self.logfile is still set, as before, for any code that
                # inspects it afterwards.
                with open(self.logfilename, 'a') as logfile:
                    self.logfile = logfile
                    logfile.write('\t'.join(fields) + '\n')

            a, b = ab_pair

            v = ViterbiAligner(a, b, alignment_scores)

            aligned_paths = v.get_all_paths()
            if not len(aligned_paths):
                # nothing to learn from a pair that produced no paths
                continue

            word_paths = []
            for path in aligned_paths:
                # only keep the elements and score, otherwise memory use
                # skyrockets
                elements, score = path.get_elements(), path.get_score()
                word_paths.append((elements, score))

                if isinf(likelihood):
                    likelihood = score
                else:
                    likelihood = logAdd(likelihood, score)

            all_word_paths.append(word_paths)

            # drop the aligner before the next one is constructed
            del v

        self.likelihood.append(likelihood)

        return all_word_paths
示例#4
0
    def e_step(self, alignment_scores):
        """Align every pair in ``self.ab_pairs`` and collect all paths.

        For each ``(a, b)`` pair a ``ViterbiAligner`` is built with
        ``alignment_scores`` and each path it returns is stored as an
        ``(elements, score)`` tuple. Pairs for which the aligner returns no
        paths are skipped entirely. The scores of all collected paths are
        accumulated with ``logAdd`` and the total is appended to
        ``self.likelihood``.

        When ``self.verbose`` is true, a tab-separated progress line
        (iteration number, ``time()``, pair index, pair) is appended to the
        file named by ``self.logfilename`` about once per 1% of the pairs.

        Returns:
            A list with one entry per aligned pair; each entry is a list of
            ``(elements, score)`` tuples.
        """
        all_word_paths = []
        likelihood = log(0)

        # Floor division with a floor of 1: with fewer than 100 pairs the
        # interval would otherwise be 0 and the modulo below would raise
        # ZeroDivisionError.
        positions_to_log = max(1, len(self.ab_pairs) // 100)

        for ab_idx, ab_pair in enumerate(self.ab_pairs):
            if self.verbose and not ab_idx % positions_to_log:
                # The tuple is parenthesised because the bare form
                # ``for s in x, y`` inside a comprehension is Python 2 only.
                fields = [str(s) for s in (self.iteration_number, time(),
                                           ab_idx, ab_pair)]
                # ``with`` closes the file even if the write fails;
                # self.logfile is still set, as before, for any code that
                # inspects it afterwards.
                with open(self.logfilename, 'a') as logfile:
                    self.logfile = logfile
                    logfile.write('\t'.join(fields) + '\n')

            a, b = ab_pair

            v = ViterbiAligner(a, b, alignment_scores)

            aligned_paths = v.get_all_paths()
            if not len(aligned_paths):
                # nothing to learn from a pair that produced no paths
                continue

            word_paths = []
            for path in aligned_paths:
                # only keep the elements and score, otherwise memory use
                # skyrockets
                elements, score = path.get_elements(), path.get_score()
                word_paths.append((elements, score))

                if isinf(likelihood):
                    likelihood = score
                else:
                    likelihood = logAdd(likelihood, score)

            all_word_paths.append(word_paths)

            # drop the aligner before the next one is constructed
            del v

        self.likelihood.append(likelihood)

        return all_word_paths