예제 #1
0
        def decode_helper(probs, uttid, uxxxx):
            """Decode one utterance with the lattice decoder and format the result.

            The decoder output is treated as space-separated words, each word
            being '_'-joined uxxxx character tokens.

            Args:
                probs: passed straight through to ``lattice_decoder.Decode``.
                uttid: passed straight through to ``lattice_decoder.Decode``.
                uxxxx: when it compares equal to ``False`` every token is run
                    through ``uxxxx_to_utf8`` and the pieces are concatenated
                    with no separator; otherwise the raw tokens are returned
                    space-separated with surrounding whitespace stripped.

            Returns:
                The formatted hypothesis string.
            """
            decoded = self.lattice_decoder.Decode(probs, uttid)

            # Flatten "word word" / "tok_tok" structure into one token stream.
            tokens = [
                token for word in decoded.split(' ')
                for token in word.split('_')
            ]

            if uxxxx == False:
                return ''.join([uxxxx_to_utf8(token) for token in tokens])

            return ' '.join(tokens).strip()
예제 #2
0
    def decode_without_lm(self,
                          model_output,
                          batch_actual_timesteps,
                          uxxxx=False):
        """Greedy (best-path) CTC decoding without a language model.

        For every batch entry the highest-scoring class of each valid timestep
        is taken, consecutive repeats are collapsed and blanks are dropped.

        Args:
            model_output: 3-D tensor indexed as [timestep, batch, class];
                class index 0 is the CTC blank.
            batch_actual_timesteps: per batch entry, the number of leading
                timesteps that hold valid output; later timesteps are ignored.
            uxxxx: when it compares equal to ``False`` each hypothesis is
                passed through ``uxxxx_to_utf8``; otherwise the space-separated
                alphabet tokens are returned as-is.

        Returns:
            A list with one hypothesis string per batch entry.
        """
        # Heuristic floor: a frame whose best score is below this is treated
        # the same as a CTC blank.
        min_prob_thresh = 3 * 1 / len(self.alphabet)

        T = model_output.size()[0]
        B = model_output.size()[1]

        # Single device-to-host copy for the whole batch instead of one copy
        # per timestep. The argmax is deliberately computed on the CPU.
        output_cpu = model_output.data.cpu().numpy()
        best_scores = output_cpu.max(2)
        best_idxs = output_cpu.argmax(2)

        prev_char = ['' for _ in range(B)]
        tokens = [[] for _ in range(B)]

        for t in range(T):
            for b in range(B):
                # Only look at valid model output for this batch entry
                if t >= batch_actual_timesteps[b]:
                    continue

                idx = best_idxs[t, b]

                # CTC blank, or the low-confidence heuristic: both reset the
                # repeat tracker so the same character may be emitted again.
                if idx == 0 or best_scores[t, b] < min_prob_thresh:
                    prev_char[b] = ''
                    continue

                char = self.alphabet.idx_to_char[idx]

                # Collapse consecutive repeats of the same character
                if prev_char[b] == char:
                    continue

                tokens[b].append(char)
                prev_char[b] = char

        # Tokens are separated by single spaces, with no trailing space
        result = [' '.join(seq) for seq in tokens]

        # Check if we should return utf8 output
        if uxxxx == False:
            result = [uxxxx_to_utf8(r) for r in result]

        return result
예제 #3
0
    def decode_with_lm(self,
                       model_output,
                       batch_actual_timesteps,
                       uxxxx=False,
                       pmod=False):
        """Decode each batch entry with the lattice (language-model) decoder.

        The model output is turned into log-probabilities, scaled, remapped
        from the model alphabet to the LM alphabet and handed to
        ``self.lattice_decoder.Decode`` one batch entry at a time.

        Args:
            model_output: 3-D tensor indexed as [timestep, batch, class]
                holding un-normalised scores (log-softmax is applied here).
            batch_actual_timesteps: per batch entry, the number of leading
                timesteps passed to the decoder.
            uxxxx: when it compares equal to ``False`` the decoder output is
                converted with ``uxxxx_to_utf8``; otherwise the uxxxx tokens
                are returned space-separated.
            pmod: when true, frames are scaled by confidence (x1.1 when the
                best class has probability >= 0.9, x0.6 otherwise) instead of
                by ``self.acoustic_weight``.

        Returns:
            A list with one hypothesis string per batch entry.

        Raises:
            Exception: if ``self.lattice_decoder`` is ``None``.
        """
        if self.lattice_decoder is None:
            raise Exception(
                "Must initialize lattice decoder prior to LM decoding")

        B = model_output.size()[1]

        # Actual model output is not set to probability vector yet, need to run softmax
        probs = torch.nn.functional.log_softmax(
            model_output.view(-1, model_output.size(2)),
            dim=1).view(model_output.size(0), model_output.size(1), -1)

        # Make sure we're on CPU
        probs = probs.data.cpu()

        n_lm_chars = len(self.lmidx_to_char)
        log_floor = np.log(1e-10)
        # The scores are log-probabilities (<= 0), so the 0.9 confidence
        # threshold has to be applied in log space. Comparing against the
        # linear value 0.9 made the high-confidence branch unreachable.
        high_conf_log = np.log(0.9)

        hyp_results = []
        for b in range(B):
            n_frames = batch_actual_timesteps[b]

            # Only the valid leading frames are ever passed to the decoder
            activations = probs[:, b, :].numpy()[:n_frames]

            if pmod:
                frame_max = activations.max(axis=1, keepdims=True)
                scale = np.where(frame_max < high_conf_log, 0.6, 1.1)
                activations = activations * scale.astype(activations.dtype)
            else:
                activations = activations * self.acoustic_weight

            # Remap model-alphabet columns onto LM-alphabet columns; LM chars
            # the model does not know keep the floor log-probability.
            activations_remapped = np.full((n_frames, n_lm_chars), log_floor)
            for c in range(n_lm_chars):
                char = self.lmidx_to_char[c]
                if char in self.alphabet.char_to_idx:
                    mapped_c = self.alphabet.char_to_idx[char]
                    activations_remapped[:, c] = activations[:, mapped_c]

            # Now check that anything turned to NULL gets mapped to ctc-blank:
            # frames whose total remapped mass is below 1e-2 get log-prob 0
            # in column 0.
            frame_mass = np.logaddexp(
                log_floor, np.logaddexp.reduce(activations_remapped, axis=1))
            activations_remapped[frame_mass < np.log(1e-2), 0] = 0

            res = self.lattice_decoder.Decode(activations_remapped)

            # Decoder output: space-separated words of '_'-joined uxxxx tokens
            tokens = [
                token for word in res.split(' ') for token in word.split('_')
            ]
            if uxxxx == False:
                res = ''.join([uxxxx_to_utf8(token) for token in tokens])
            else:
                res = ' '.join(tokens).strip()

            hyp_results.append(res)

        return hyp_results
예제 #4
0
    def decode_with_lm_mt(self,
                          model_output,
                          batch_actual_timesteps,
                          uxxxx=False,
                          n_workers=10):
        """Decode a batch with the lattice decoder using a pool of worker threads.

        Each batch entry's log-probabilities are remapped to the LM alphabet
        and submitted to ``self.lattice_decoder.Decode`` on a thread pool; the
        results are collected in batch order.

        Args:
            model_output: 3-D tensor indexed as [timestep, batch, class]
                holding un-normalised scores (log-softmax is applied here).
            batch_actual_timesteps: per batch entry, the number of leading
                timesteps passed to the decoder.
            uxxxx: when it compares equal to ``False`` the decoder output is
                converted with ``uxxxx_to_utf8``; otherwise the uxxxx tokens
                are returned space-separated.
            n_workers: maximum number of worker threads.

        Returns:
            A list with one hypothesis string per batch entry.

        Raises:
            Exception: if ``self.lattice_decoder`` is ``None``.
        """
        # Validate before spinning up any worker threads
        if self.lattice_decoder is None:
            raise Exception(
                "Must initialize lattice decoder prior to LM decoding")

        B = model_output.size()[1]

        # Actual model output is not set to probability vector yet, need to run softmax
        probs = torch.nn.functional.log_softmax(
            model_output.view(-1, model_output.size(2)),
            dim=1).view(model_output.size(0), model_output.size(1), -1)

        # Make sure we're on CPU
        probs = probs.data.cpu()

        n_lm_chars = len(self.lmidx_to_char)
        log_floor = np.log(1e-10)

        # NOTE: LM-alphabet columns with no counterpart in the swap index lists
        # keep the floor log-probability, and probability mass the model puts
        # on characters missing from the LM alphabet is not redistributed.
        # The acoustic weight is not applied in this path.

        # We process decoder parallely in worker threads; store those async futures here
        decoder_futures = []
        with ThreadPoolExecutor(max_workers=n_workers) as executor:
            for b in range(B):
                n_frames = batch_actual_timesteps[b]
                probs_remapped = np.full((n_frames, n_lm_chars), log_floor)
                probs_remapped[:, self.lm_swap_idxs_lmidx] = probs[
                    :n_frames, b, self.lm_swap_idxs_modelidx]

                decoder_futures.append(
                    executor.submit(self.lattice_decoder.Decode,
                                    probs_remapped))
        # Leaving the `with` block waits for all submitted decodes to finish

        hyp_results = []
        for future in decoder_futures:
            res = future.result()

            # Decoder output: space-separated words of '_'-joined uxxxx tokens
            tokens = [
                token for word in res.split(' ') for token in word.split('_')
            ]
            if uxxxx == False:
                res = ''.join([uxxxx_to_utf8(token) for token in tokens])
            else:
                res = ' '.join(tokens).strip()

            hyp_results.append(res)

        return hyp_results
예제 #5
0
import sys
import textutils

# Path of the hypothesis file to convert; each line is expected to look like
# "<uxxxx words> (<utterance id>)".
input_file = sys.argv[1]

with open(input_file, 'r') as fh:

    for line in fh:
        # The utterance id sits inside the last pair of parentheses
        open_idx = line.rfind('(')
        close_idx = line.rfind(')')

        # Convert the uxxxx words in front of the parenthesis to utf8 text
        pieces = []
        for word in line[:open_idx].split(" "):
            if word in ("u0020", "u0009"):
                # Space and tab tokens are both written out as one space
                pieces.append(" ")
            else:
                pieces.append(''.join(
                    textutils.uxxxx_to_utf8(token)
                    for token in word.split("_")))
        utt_utf8 = ''.join(pieces)

        # Drop everything from the last underscore of the id onwards
        uttid = line[open_idx + 1:close_idx]
        uttid = uttid[:uttid.rfind('_')]

        print("%s (%s)" % (utt_utf8, uttid))
예제 #6
0
def undo_bidi(uxxxx_str, base_level=1):
    """Reorder a string of uxxxx tokens following the Unicode Bidi Algorithm steps.

    The input is split on whitespace into character tokens. Each token is
    tagged with the bidi class returned by ``bidirectional(uxxxx_to_utf8(tok))``
    and then run through the stages laid out below (explicit levels X1-X9,
    level runs X10, weak types W1-W7, neutral types N1-N2, implicit levels
    I1-I2, and reordering L1-L2 as described in UAX #9).

    Args:
        uxxxx_str: whitespace-separated character tokens.
        base_level: paragraph embedding level used throughout; defaults to 1,
            which the inline comment below describes as RTL.

    Returns:
        The tokens that survive the X9 removal, joined with single spaces, in
        the reverse of the order left by the L2 step. Returns ``''`` when no
        tokens remain after the X9 removal.

    Raises:
        AssertionError: if a character still has a bidi type other than
            L, R, EN or AN when the implicit levels are resolved.
    """
    # Step 0: attach the unicode bidi type to each char
    augmented_char_array = []
    for char in uxxxx_str.split():
        bidi_type = bidirectional(uxxxx_to_utf8(char))
        # For now, hard-coding base level to always be RTL, because this is for Arabic corpus. Revisit this later!
        augmented_char_array.append({
            'char': char,
            'bidi-type': bidi_type,
            'bidi-orig-type': bidi_type,
            'level': base_level
        })

    # Step 1: Resolve Explicit embed and overrides
    #   See:  http://unicode.org/reports/tr9/#Explicit_Levels_and_Directions
    overflow_counter = almost_overflow_counter = 0
    directional_override = 'N'
    # Stack of (embedding_level, directional_override) pairs saved on each
    # valid embed/override and restored on PDF.
    levels = deque()

    # X1
    embedding_level = base_level

    for _ch in augmented_char_array:
        bidi_type = _ch['bidi-type']

        # Types without an entry in X2_X5_MAPPINGS yield (None, None) and are
        # handled by the X6-X8 branch below.
        level_func, override = X2_X5_MAPPINGS.get(bidi_type, (None, None))

        if level_func:
            # So this is X2 to X5
            # if we've past EXPLICIT_LEVEL_LIMIT, note it and do nothing

            if overflow_counter != 0:
                overflow_counter += 1
                continue

            new_level = level_func(embedding_level)
            if new_level < EXPLICIT_LEVEL_LIMIT:
                levels.append((embedding_level, directional_override))
                embedding_level, directional_override = new_level, override

            elif embedding_level == EXPLICIT_LEVEL_LIMIT - 2:
                # The new level is invalid, but a valid level can still be
                # achieved if this level is 60 and we encounter an RLE or
                # RLO further on.  So record that we 'almost' overflowed.
                almost_overflow_counter += 1

            else:
                overflow_counter += 1
        else:
            # X6
            if bidi_type not in X6_IGNORED:
                _ch['level'] = embedding_level
                if directional_override != 'N':
                    _ch['bidi-type'] = directional_override

            # X7
            elif bidi_type == 'PDF':
                if overflow_counter:
                    overflow_counter -= 1
                elif almost_overflow_counter and \
                                embedding_level != EXPLICIT_LEVEL_LIMIT - 1:
                    almost_overflow_counter -= 1
                elif levels:
                    embedding_level, directional_override = levels.pop()

            # X8
            elif bidi_type == 'B':
                levels.clear()
                overflow_counter = almost_overflow_counter = 0
                embedding_level = _ch['level'] = base_level
                directional_override = 'N'

    # Removes the explicit embeds and overrides of types
    # RLE, LRE, RLO, LRO, PDF, and BN. Adjusts extended chars
    # next and prev as well

    # Applies X9. See http://unicode.org/reports/tr9/#X9
    augmented_char_array = [
        _ch for _ch in augmented_char_array
        if _ch['bidi-type'] not in X9_REMOVED
    ]

    # Step 2: determine LTR / RTL runs
    #  See: http://unicode.org/reports/tr9/#X10

    # First define utility function: Basically, RTL takes preference over LTR ... if either left/right boundary is RTL then all is RTL
    def calc_level_run(b_l, b_r):
        # Direction of the higher of the two levels: even -> 'L', odd -> 'R'
        return ['L', 'R'][max(b_l, b_r) % 2]

    runs = []

    # After removing RLO/LRO/etc, check length again
    if len(augmented_char_array) == 0:
        return ''

    first_char = augmented_char_array[0]
    run_start_level = calc_level_run(first_char['level'], base_level)
    run_end_level = None
    run_start = run_length = 0
    prev_level, prev_type = first_char['level'], first_char['bidi-type']

    # A new run is closed off every time the embedding level changes
    for char in augmented_char_array:
        curr_level, curr_type = char['level'], char['bidi-type']

        if curr_level == prev_level:
            run_length += 1
        else:
            run_end_level = calc_level_run(prev_level, curr_level)
            runs.append({
                'sor': run_start_level,
                'eor': run_end_level,
                'start': run_start,
                'type': prev_type,
                'length': run_length
            })
            run_start_level = run_end_level
            run_start += run_length
            run_length = 1

        prev_level, prev_type = curr_level, curr_type

    # for the last char/runlevel
    run_end_level = calc_level_run(curr_level, base_level)
    runs.append({
        'sor': run_start_level,
        'eor': run_end_level,
        'start': run_start,
        'type': curr_type,
        'length': run_length
    })

    # Step 3: Resolve weak LTR/RTL types
    #   See: http://unicode.org/reports/tr9/#Resolving_Weak_Types
    for run in runs:
        prev_strong = prev_type = run['sor']
        start, length = run['start'], run['length']
        # The slice is a new list, but it holds the same dict objects, so the
        # type changes below are visible through augmented_char_array.
        chars = augmented_char_array[start:start + length]

        for char in chars:
            # W1. Examine each nonspacing mark (NSM) in the level run, and
            # change the type of the NSM to the type of the previous character.
            # If the NSM is at the start of the level run, it will get the type
            # of sor.
            bidi_type = char['bidi-type']

            if bidi_type == 'NSM':
                char['bidi-type'] = bidi_type = prev_type

            # W2. Search backward from each instance of a European number until
            # the first strong type (R, L, AL, or sor) is found. If an AL is
            # found, change the type of the European number to Arabic number.
            if bidi_type == 'EN' and prev_strong == 'AL':
                char['bidi-type'] = 'AN'

            # update prev_strong if needed
            if bidi_type in ('R', 'L', 'AL'):
                prev_strong = bidi_type

            prev_type = char['bidi-type']

        # W3. Change all ALs to R
        for char in chars:
            if char['bidi-type'] == 'AL':
                char['bidi-type'] = 'R'

        # W4. A single European separator between two European numbers changes
        # to a European number. A single common separator between two numbers of
        # the same type changes to that type.
        for idx in range(1, len(chars) - 1):
            bidi_type = chars[idx]['bidi-type']
            prev_type = chars[idx - 1]['bidi-type']
            next_type = chars[idx + 1]['bidi-type']

            if bidi_type == 'ES' and (prev_type == next_type == 'EN'):
                chars[idx]['bidi-type'] = 'EN'

            if bidi_type == 'CS' and prev_type == next_type and \
                            prev_type in ('AN', 'EN'):
                chars[idx]['bidi-type'] = prev_type

        # W5. A sequence of European terminators adjacent to European numbers
        # changes to all European numbers.
        for idx in range(len(chars)):
            if chars[idx]['bidi-type'] == 'EN':
                # Walk left from the number, converting terminators
                for et_idx in range(idx - 1, -1, -1):
                    if chars[et_idx]['bidi-type'] == 'ET':
                        chars[et_idx]['bidi-type'] = 'EN'
                    else:
                        break
                # Walk right from the number, converting terminators
                for et_idx in range(idx + 1, len(chars)):
                    if chars[et_idx]['bidi-type'] == 'ET':
                        chars[et_idx]['bidi-type'] = 'EN'
                    else:
                        break

        # W6. Otherwise, separators and terminators change to Other Neutral.
        for char in chars:
            if char['bidi-type'] in ('ET', 'ES', 'CS'):
                char['bidi-type'] = 'ON'

        # W7. Search backward from each instance of a European number until the
        # first strong type (R, L, or sor) is found. If an L is found, then
        # change the type of the European number to L.
        prev_strong = run['sor']
        for char in chars:
            if char['bidi-type'] == 'EN' and prev_strong == 'L':
                char['bidi-type'] = 'L'

            if char['bidi-type'] in ('L', 'R'):
                prev_strong = char['bidi-type']

    # Step 4: Resolve Neutral Types
    #   See: http://unicode.org/reports/tr9/#Resolving_Neutral_Types
    for run in runs:
        start, length = run['start'], run['length']
        # use sor and eor
        # (sentinel entries carrying only a 'bidi-type' are placed on both
        # sides of the run so that neutrals at the edges have neighbours)
        chars = [{
            'bidi-type': run['sor']
        }] + augmented_char_array[start:start + length] + [{
            'bidi-type':
            run['eor']
        }]
        total_chars = len(chars)

        # Index where the current sequence of neutrals began, or None
        seq_start = None
        for idx in range(total_chars):
            _ch = chars[idx]
            if _ch['bidi-type'] in ('B', 'S', 'WS', 'ON'):
                # N1. A sequence of neutrals takes the direction of the
                # surrounding strong text if the text on both sides has the same
                # direction. European and Arabic numbers act as if they were R
                # in terms of their influence on neutrals. Start-of-level-run
                # (sor) and end-of-level-run (eor) are used at level run
                # boundaries.
                if seq_start is None:
                    seq_start = idx
                    prev_bidi_type = chars[idx - 1]['bidi-type']
            else:
                if seq_start is not None:
                    next_bidi_type = chars[idx]['bidi-type']

                    if prev_bidi_type in ('AN', 'EN'):
                        prev_bidi_type = 'R'

                    if next_bidi_type in ('AN', 'EN'):
                        next_bidi_type = 'R'

                    for seq_idx in range(seq_start, idx):
                        if prev_bidi_type == next_bidi_type:
                            chars[seq_idx]['bidi-type'] = prev_bidi_type
                        else:
                            # N2. Any remaining neutrals take the embedding
                            # direction. The embedding direction for the given
                            # neutral character is derived from its embedding
                            # level: L if the character is set to an even level,
                            # and R if the level is odd.
                            if chars[seq_idx]['level'] % 2 == 0:
                                chars[seq_idx]['bidi-type'] = 'L'
                            else:
                                chars[seq_idx]['bidi-type'] = 'R'

                    seq_start = None

    # Step 5: Resolve Implicit Levels
    #   See: http://unicode.org/reports/tr9/#Resolving_Implicit_Levels
    def _embedding_direction(x):
        # Even level -> 'L', odd level -> 'R'
        return ('L', 'R')[x % 2]

    for run in runs:
        start, length = run['start'], run['length']
        chars = augmented_char_array[start:start + length]

        for _ch in chars:
            # only those types are allowed at this stage
            assert _ch['bidi-type'] in ('L', 'R', 'EN', 'AN'), \
                '[%s] not allowed here. Original string was: [%s]; Cur run = [%s] and cur char = [%s].' % (_ch['bidi-type'], uxxxx_str, str(chars), _ch)

            if _embedding_direction(_ch['level']) == 'L':
                # I1. For all characters with an even (left-to-right) embedding
                # direction, those of type R go up one level and those of type
                # AN or EN go up two levels.
                if _ch['bidi-type'] == 'R':
                    _ch['level'] += 1
                elif _ch['bidi-type'] != 'L':
                    _ch['level'] += 2
            else:
                # I2. For all characters with an odd (right-to-left) embedding
                # direction, those of type L, EN or AN  go up one level.
                if _ch['bidi-type'] != 'R':
                    _ch['level'] += 1

    # Step 6: Reorder Resolved Levels
    #   See: http://unicode.org/reports/tr9/#I2

    # Applies L1.

    should_reset = True
    chars = augmented_char_array

    # Walk backwards so that trailing whitespace is seen before the text it follows
    for _ch in chars[::-1]:
        # L1. On each line, reset the embedding level of the following
        # characters to the paragraph embedding level:
        if _ch['bidi-orig-type'] in ('B', 'S'):
            # 1. Segment separators,
            # 2. Paragraph separators,
            _ch['level'] = base_level
            should_reset = True
        elif should_reset and _ch['bidi-orig-type'] in ('BN', 'WS'):
            # 3. Any sequence of whitespace characters preceding a segment
            # separator or paragraph separator
            # 4. Any sequence of white space characters at the end of the
            # line.
            _ch['level'] = base_level
        else:
            should_reset = False

    max_len = len(chars)

    # L2 should be per line
    # Calculates highest level and lowest odd level on the fly.
    line_start = line_end = 0
    highest_level = 0
    lowest_odd_level = EXPLICIT_LEVEL_LIMIT

    for idx in range(max_len):
        _ch = chars[idx]

        # calc the levels
        char_level = _ch['level']
        if char_level > highest_level:
            highest_level = char_level

        if char_level % 2 and char_level < lowest_odd_level:
            lowest_odd_level = char_level

        # A line ends at a paragraph separator or at the last character
        if _ch['bidi-orig-type'] == 'B' or idx == max_len - 1:
            line_end = idx
            # omit line breaks
            if _ch['bidi-orig-type'] == 'B':
                line_end -= 1

            # NOTE(review): presumably reorders chars[line_start..line_end] in
            # place per rule L2; the helper is defined outside this view, so
            # confirm against its implementation.
            _reverse_contiguous_sequence(chars, line_start, line_end,
                                         highest_level, lowest_odd_level)

            # reset for next line run
            line_start = idx + 1
            highest_level = 0
            lowest_odd_level = EXPLICIT_LEVEL_LIMIT

    # Finally, reverse entire string
    return ' '.join([char['char'] for char in reversed(chars)])