Exemplo n.º 1
0
def perform_conf_vote(voters):
    """Combine the per-position alternatives of several voters.

    Parameters
    ----------
    voters : sequence of dict
        Each entry is indexed with the keys ``"sequence"``,
        ``"alternatives"`` and ``"positions"``.  ``"alternatives"`` holds,
        per index, a mapping of candidate -> probability; ``"positions"``
        holds, per index, an object exposing ``global_start`` and
        ``global_end``.

    Returns
    -------
    list of list
        One inner list per emitted position, containing the merged
        ``MergeableCharacter`` objects ordered by descending ``p``.
    """
    merged = []
    texts = [entry["sequence"] for entry in voters]

    def emit(sync, participants, offset):
        # Merge the alternatives that every participant proposes at
        # ``offset`` inside ``sync`` and append them, best first, to the
        # overall result.
        share = len(participants)
        candidates = {}
        for vid in participants:
            index = offset + sync.start(vid)
            alternatives = voters[vid]["alternatives"][index]
            position = voters[vid]["positions"][index]
            for char, prob in alternatives.items():
                # Each participant contributes an equal share of its
                # probability to the merged candidate.
                if char not in candidates:
                    candidates[char] = MergeableCharacter(
                        char, prob / share,
                        position.global_start, position.global_end)
                else:
                    candidates[char].merge(
                        char, prob / share,
                        position.global_start, position.global_end)

        merged.append(sorted(candidates.values(), key=lambda v: -v.p))

    for sync in synchronize(texts):
        actual_voters, most_freq_length = \
            find_voters_with_most_frequent_length(sync, voters)

        # Distinct sub-sequences proposed for this segment; a single entry
        # means all participating voters agree on the text.
        distinct = []
        for vid in actual_voters:
            fragment = voters[vid]["sequence"][
                sync.start(vid):sync.stop(vid) + 1]
            if fragment not in distinct:
                distinct.append(fragment)

        if len(distinct) == 1:
            participants = actual_voters
            steps = len(distinct[0])
        elif len(actual_voters) == 1:
            only = actual_voters[0]
            participants = [only]
            steps = sync.length(only)
        else:
            participants = actual_voters
            steps = most_freq_length

        for offset in range(steps):
            emit(sync, participants, offset)

    return merged
Exemplo n.º 2
0
    def evaluate_single(_sentinel=None, gt='', pred='', skip_empty_gt=False):
        """Score one prediction against its ground truth.

        Parameters
        ----------
        _sentinel : None
            Must stay ``None``; it only exists so that ``gt`` and ``pred``
            have to be passed by keyword.
        gt : str
            Ground truth text.
        pred : str
            Predicted text.
        skip_empty_gt : bool
            When true, an empty ground truth is not evaluated and all
            counters are reported as zero.

        Returns
        -------
        int
            Length of the ground truth.
        int
            Error count, taken from the first value returned by
            ``edit_distance(gt, pred)``.
        int
            Synchronisation errors: for every differing synchronised
            segment, the length of the longer of the two texts, summed up.
        dict
            Maps ``(gt_segment, pred_segment)`` to how often that differing
            pair occurred.
        tuple(str, str)
            ``(gt, pred)`` exactly as passed in.

        Raises
        ------
        Exception
            If a positional argument was supplied for ``_sentinel``.
        """
        if _sentinel is not None:
            raise Exception('Call this function by specifying gt and pred explicitly')

        pair = (gt, pred)
        confusion = {}

        if len(gt) == 0 and skip_empty_gt:
            return 0, 0, 0, confusion, pair

        # Only the error count of the edit distance is reported.
        errs, _ = edit_distance(gt, pred)

        sync_errs = 0
        for sync in synchronize([gt, pred]):
            gt_str, pred_str = sync.get_text()
            if gt_str == pred_str:
                continue

            key = (gt_str, pred_str)
            sync_errs += max(len(gt_str), len(pred_str))
            confusion[key] = confusion.get(key, 0) + 1

        return len(gt), errs, sync_errs, confusion, pair
Exemplo n.º 3
0
    def process_text(self, texts):
        """Run the sequence vote over ``texts`` and return its result.

        The texts are converted to voters via
        ``SequenceVoter.text_to_voters``.  When ``self.optimize`` is set,
        ``SequenceVoter.select_voters`` is applied to them and, if
        ``self.n_best`` is positive, only the first ``self.n_best`` voters
        take part.  The texts of the participating voters are then
        synchronised and passed to ``SequenceVoter.perform_vote``.
        """
        voters = SequenceVoter.text_to_voters(texts)

        if self.optimize:
            SequenceVoter.select_voters(voters)
            # A non-positive n_best means "keep every voter".
            if self.n_best > 0:
                voters = voters[:self.n_best]

        inputs = [voter.text for voter in voters]
        return SequenceVoter.perform_vote(inputs, synchronize(inputs), voters)