Example #1
    def test_acceptor_wo_arcs_from_openfst(self):
        s1 = '''
        '''

        s2 = '''
            0 Inf
            1 0.1
        '''

        s3 = '''
            0 Inf
            1 0.1
            2 0.2
        '''

        for device in self.devices:
            fsa1 = k2.Fsa.from_openfst(s1)
            print("fsa1 = ", k2.to_str(fsa1))
            self.assertEqual('', k2.to_str(fsa1))

            fsa2 = k2.Fsa.from_openfst(s2)
            self.assertEqual(_remove_leading_spaces(k2.to_str(fsa2)),
                             "1 2 -1 -0.1\n2")
            arcs2 = fsa2.arcs.values()[:, :-1]
            assert torch.all(
                torch.eq(arcs2, torch.tensor([[1, 2, -1]], dtype=torch.int32)))

            fsa3 = k2.Fsa.from_openfst(s3)
            self.assertEqual(fsa3.arcs.dim0(), 4)
            self.assertEqual(_remove_leading_spaces(k2.to_str(fsa3)),
                             "1 3 -1 -0.1\n2 3 -1 -0.2\n3")
Example #2
    def test_transducer2_ragged2_from_str(self):
        s = '''
            0 1  2  22  101 [] [] -1.2
            0 2  10 100 102 [] [] -2.2
            1 6 -1  16  103 [20 30] [40] -4.2
            1 3  3  33  104 [] [] -3.2
            2 6 -1  26  105 [] [] -5.2
            2 4  2  22  106 [] [] -6.2
            3 6 -1  36  107 [] [] -7.2
            5 0  1  50  108 [] [] -8.2
            6
        '''
        fsa = k2.Fsa.from_str(s,
                              aux_label_names=['aux_labels', 'aux_labels2'],
                              ragged_label_names=['ragged1', 'ragged2'])

        assert fsa.aux_labels.dtype == torch.int32
        assert fsa.aux_labels.device.type == 'cpu'
        assert isinstance(fsa.ragged1, k2.RaggedTensor)
        assert isinstance(fsa.ragged2, k2.RaggedTensor)

        assert torch.all(
            torch.eq(
                fsa.aux_labels,
                torch.tensor([22, 100, 16, 33, 26, 22, 36, 50],
                             dtype=torch.int32)))

        assert torch.all(
            torch.eq(
                fsa.aux_labels2,
                torch.tensor([101, 102, 103, 104, 105, 106, 107, 108],
                             dtype=torch.int32)))

        assert torch.allclose(
            fsa.scores,
            torch.tensor([-1.2, -2.2, -4.2, -3.2, -5.2, -6.2, -7.2, -8.2],
                         dtype=torch.float32))

        print("fsa.ragged1 = ", fsa.ragged1)
        print("fsa.ragged2 = ", fsa.ragged2)
        assert fsa.ragged1 == k2.RaggedTensor(
            '[ [] [] [20 30] [] [] [] [] [] ]')
        assert fsa.ragged2 == k2.RaggedTensor('[ [] [] [40] [] [] [] [] [] ]')

        # all label types (aux_labels, aux_labels2, ragged1, ragged2)
        # appear in the printed form below.
        expected_str = '''
        0 1 2 22 101 [ ] [ ] -1.2
        0 2 10 100 102 [ ] [ ] -2.2
        1 6 -1 16 103 [ 20 30 ] [ 40 ] -4.2
        1 3 3 33 104 [ ] [ ] -3.2
        2 6 -1 26 105 [ ] [ ] -5.2
        2 4 2 22 106 [ ] [ ] -6.2
        3 6 -1 36 107 [ ] [ ] -7.2
        5 0 1 50 108 [ ] [ ] -8.2
        6
        '''
        print("fsa = ", _remove_leading_spaces(k2.to_str(fsa)))
        assert _remove_leading_spaces(expected_str) == \
              _remove_leading_spaces(k2.to_str(fsa))
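
A side note on the ragged comparisons above: k2.RaggedTensor can be built
either from the bracketed string form shown or, equivalently, from a Python
list of lists (a hedged sketch):

ragged = k2.RaggedTensor([[], [], [20, 30], [], [], [], [], []])
assert ragged == k2.RaggedTensor('[ [] [] [20 30] [] [] [] [] [] ]')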
Example #3
    def test_acceptor_from_openfst(self):
        s = '''
            0 1  2 -1.2
            0 2  10 -2.2
            1 6  1  -3.2
            1 3  3  -4.2
            2 6  2  -5.2
            2 4  2  -6.2
            3 6  3  -7.2
            5 0  1  -8.2
            7
            6 -9.2
        '''

        for i in range(4):
            if i == 0:
                fsa = k2.Fsa.from_openfst(s)
            elif i == 1:
                fsa = k2.Fsa.from_openfst(s, acceptor=True)
            elif i == 2:
                fsa = k2.Fsa.from_openfst(s, num_aux_labels=0)
            else:
                fsa = k2.Fsa.from_openfst(s, aux_label_names=[])

            expected_str = '''
            0 1 2 -1.2
            0 2 10 -2.2
            1 6 1 -3.2
            1 3 3 -4.2
            2 6 2 -5.2
            2 4 2 -6.2
            3 6 3 -7.2
            5 0 1 -8.2
            6 8 -1 -9.2
            7 8 -1 0
            8
            '''
            assert _remove_leading_spaces(expected_str) == \
                    _remove_leading_spaces(k2.to_str(fsa, openfst=True))

            arcs = fsa.arcs.values()[:, :-1]
            assert isinstance(arcs, torch.Tensor)
            assert arcs.dtype == torch.int32
            assert arcs.device.type == 'cpu'
            assert arcs.shape == (10, 3), 'there should be 10 arcs'
            assert torch.all(
                torch.eq(arcs[0], torch.tensor([0, 1, 2], dtype=torch.int32)))

            assert torch.allclose(
                fsa.scores,
                torch.tensor([1.2, 2.2, 3.2, 4.2, 5.2, 6.2, 7.2, 8.2, 9.2, 0],
                             dtype=torch.float32))

            fsa.scores *= -1

            assert torch.allclose(
                fsa.scores,
                torch.tensor(
                    [-1.2, -2.2, -3.2, -4.2, -5.2, -6.2, -7.2, -8.2, -9.2, 0],
                    dtype=torch.float32))
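
The sign flips in the assertions above follow from the OpenFST convention:
the text format stores tropical costs, and k2.Fsa.from_openfst() negates them
into scores, turning final weights into arcs (labeled -1) to an added
super-final state. A hedged two-arc sketch:

import k2

s = '''
0 1 2 -1.2
1 0.5
'''
fsa = k2.Fsa.from_openfst(s)
# Cost -1.2 becomes score 1.2; the final weight 0.5 becomes score -0.5 on
# the new arc to the super-final state.
print(fsa.scores)  # tensor([ 1.2000, -0.5000])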
Example #4
    def test_acceptor_from_openfst_ragged1(self):
        s = '''
            0 1  2 [] -1.2
            0 2  10 [10] -2.2
            1 6  1 [] -3.2
            1 3  3 [11 12] -4.2
            2 6  2 [] -5.2
            2 4  2 [] -6.2
            3 6  3 [] -7.2
            5 0  1 [13]  -8.2
            7
            6 -9.2
        '''
        fsa = k2.Fsa.from_openfst(s, num_aux_labels=0,
                                  ragged_label_names=['ragged'])

        expected_str = '''
        0 1 2 [ ] -1.2
        0 2 10 [ 10 ] -2.2
        1 6 1 [ ] -3.2
        1 3 3 [ 11 12 ] -4.2
        2 6 2 [ ] -5.2
        2 4 2 [ ] -6.2
        3 6 3 [ ] -7.2
        5 0 1 [ 13 ] -8.2
        6 8 -1 [ ] -9.2
        7 8 -1 [ ] 0
        8
        '''
        string = _remove_leading_spaces(k2.to_str(fsa, openfst=True))
        print("fsa=", string)
        assert _remove_leading_spaces(expected_str) == string
Example #5
def transform(args):
    if args.normalise:
        normalise(args)
    symbols = symboletable(args)
    with open(os.path.join(args.data_directory, 'emission_symbols'), 'w') as f:
        f.write(symbols)
    lpzscp = os.path.join(args.data_directory, 'lpz.scp') if not args.normalise else os.path.join(
        args.data_directory, 'lpz_norm.scp')
    fstpath = os.path.join(args.data_directory, 'efst') if not args.normalise else os.path.join(
        args.data_directory, 'efst_norm')
    os.makedirs(fstpath, exist_ok=True)

    uttid2datafile = dict()
    with open(lpzscp) as f:
        for line in f:
            lc = line.strip().split()
            uttid2datafile[lc[0]] = lc[1]

    fst_scp = os.path.join(args.data_directory, 'efst.scp') if not args.normalise else os.path.join(
        args.data_directory, 'efst_norm.scp')
    with open(fst_scp, 'w') as f:
        fc = ''
        for uttid in uttid2datafile.keys():
            datafile = uttid2datafile[uttid]
            data = np.load(datafile)
            efst = transform_utt(data)
            efst_str = k2.to_str(efst)
            efst_file = os.path.join(fstpath, uttid)
            with open(efst_file, 'w') as g:
                g.write(efst_str)
            fc += '%s %s\n' % (uttid, efst_file)
        f.write(fc)
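
transform() assumes Kaldi-style .scp files: one "<uttid> <path>" pair per
line. A hedged sketch of producing the lpz.scp it reads (the utterance id,
path, and array shape are made up for illustration):

import numpy as np

np.save('utt1.npy', np.random.rand(100, 6).astype(np.float32))
with open('lpz.scp', 'w') as f:
    f.write('utt1 utt1.npy\n')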
Example #6
def lexicon_fst(args):
    '''
    This program creates lexicon.fst.pdf and lexicon.fst.txt based on
    args.word_file.
    input:
    args: argparse namespace
    return:
    lexicon: k2.Fsa, the lexicon FST
    output:
    lexicon.fst.txt and lexicon.fst.pdf in args.data_directory

    The lexicon FST compresses repeated characters in the emission FST.
    '''
    symbols_str = symboletable(args)
    symbol_pairs = symbols_str.split('\n')
    num_noneps = len(symbol_pairs) - 1
    symbol2fst = [None]  # <eps> has no fst
    for i in range(1, num_noneps + 1):
        s = '''
        0 1 %d %d 0.0
        1 1 %d 0 0.0
        1 2 -1 -1 0.0
        2
        ''' % (i, i, i)
        g = k2.Fsa.from_str(s, acceptor=False)

        symbol2fst.append(g)
    fst_vec = k2.create_fsa_vec(symbol2fst[1:])
    fst_union = k2.union(fst_vec)
    lexicon = k2.closure(fst_union)
    lexicon.draw(os.path.join(args.data_directory, 'lexicon.fst.pdf'), title='lexicon')
    # lexicon.symbols = k2.SymbolTable.from_str(symbols_str)
    # lexicon.aux_symbols = k2.SymbolTable.from_str(symbols_str)
    with open(os.path.join(args.data_directory, 'lexicon.fst.txt'), 'w') as f:
        f.write(k2.to_str(lexicon))
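
To see the compression the docstring mentions, one can compose a linear FSA
for a repeated symbol with the lexicon. A hedged sketch (k2.linear_fsa and
k2.compose are standard k2 calls; the symbol id 2 is arbitrary):

linear = k2.linear_fsa([2, 2, 2])
out = k2.connect(k2.compose(k2.arc_sort(linear), k2.arc_sort(lexicon)))
# The output side of `out` carries a single 2 plus epsilons, i.e. the
# repeated input symbol has been collapsed.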
Example #7
    def test_transducer_from_str(self):
        s = '''
            0 1 2 22  -1.2
            0 2  10 100 -2.2
            1 6 -1  16  -4.2
            1 3  3  33  -3.2
            2 6 -1  26  -5.2
            2 4  2  22  -6.2
            3 6 -1  36  -7.2
            5 0  1  50  -8.2
            6
        '''
        fsa = k2.Fsa.from_str(_remove_leading_spaces(s))
        assert fsa.aux_labels.dtype == torch.int32
        assert fsa.aux_labels.device.type == 'cpu'
        assert torch.allclose(
            fsa.aux_labels,
            torch.tensor([22, 100, 16, 33, 26, 22, 36, 50], dtype=torch.int32))

        expected_str = '''
            0 1 2 22 -1.2
            0 2 10 100 -2.2
            1 6 -1 16 -4.2
            1 3 3 33 -3.2
            2 6 -1 26 -5.2
            2 4 2 22 -6.2
            3 6 -1 36 -7.2
            5 0 1 50 -8.2
            6
        '''
        assert _remove_leading_spaces(expected_str) == _remove_leading_spaces(
            k2.to_str(fsa))
Example #8
    def test(self):
        # for the symbol table
        # <eps> 0
        # a 1
        # b 2
        # c 3

        # an FSA that recognizes a+(b|c)
        s = '''
            0 1 1 0.1
            1 1 1 0.2
            1 2 2 0.3
            1 3 3 0.4
            2 4 -1 0.5
            3 4 -1 0.6
            5
        '''
        a_fsa = k2.Fsa.from_str(s)
        a_fsa.requires_grad_(True)

        # an FSA that recognizes ab
        s = '''
            0 1 1 10
            1 2 2 20
            2 3 -1 30
            3
        '''
        b_fsa = k2.Fsa.from_str(s)
        b_fsa.requires_grad_(True)

        fsa = k2.intersect(a_fsa, b_fsa)
        assert len(fsa.shape) == 2
        actual_str = k2.to_str(fsa)
        expected_str = '\n'.join(
            ['0 1 1 10.1', '1 2 2 20.3', '2 3 -1 30.5', '3'])
        assert actual_str.strip() == expected_str

        loss = fsa.scores.sum()
        loss.backward()
        # arc 0, 2, and 4 of a_fsa are kept in the final intersected FSA
        assert torch.allclose(
            a_fsa.scores.grad,
            torch.tensor([1, 0, 1, 0, 1, 0], dtype=torch.float32))

        assert torch.allclose(b_fsa.scores.grad,
                              torch.tensor([1, 1, 1], dtype=torch.float32))

        # if any of the input FSAs is an FsaVec,
        # the output FSA is also an FsaVec.
        a_fsa.scores.grad = None
        b_fsa.scores.grad = None
        a_fsa = k2.create_fsa_vec([a_fsa])
        fsa = k2.intersect(a_fsa, b_fsa)
        assert len(fsa.shape) == 3
Example #9
    def test_treat_epsilon_specially_true(self):
        # this version works only on CPU and requires
        # arc-sorted inputs
        # a_fsa recognizes `(1|3)?2*`
        s1 = '''
            0 1 3 0.0
            0 1 1 0.2
            0 1 0 0.1
            1 1 2 0.3
            1 2 -1 0.4
            2
        '''
        a_fsa = k2.Fsa.from_str(s1)
        a_fsa.requires_grad_(True)

        # b_fsa recognizes `1|2|5`
        s2 = '''
            0 1 5 0
            0 1 1 1
            0 1 2 2
            1 2 -1 3
            2
        '''
        b_fsa = k2.Fsa.from_str(s2)
        b_fsa.requires_grad_(True)

        # fsa recognizes 1|2
        fsa = k2.intersect(k2.arc_sort(a_fsa), k2.arc_sort(b_fsa))
        assert len(fsa.shape) == 2
        actual_str = k2.to_str(fsa)
        expected_str = '\n'.join(
            ['0 1 0 0.1', '0 2 1 1.2', '1 2 2 2.3', '2 3 -1 3.4', '3'])
        assert actual_str.strip() == expected_str

        loss = fsa.scores.sum()
        (-loss).backward()
        # arc 1, 2, 3, and 4 of a_fsa are kept in the final intersected FSA
        assert torch.allclose(a_fsa.grad,
                              torch.tensor([0, -1, -1, -1, -1]).to(a_fsa.grad))

        # arc 1, 2, and 3 of b_fsa are kept in the final intersected FSA
        assert torch.allclose(b_fsa.grad,
                              torch.tensor([0, -1, -1, -1]).to(b_fsa.grad))

        # if any of the input FSAs is an FsaVec,
        # the output FSA is also an FsaVec.
        a_fsa.scores.grad = None
        b_fsa.scores.grad = None
        a_fsa = k2.create_fsa_vec([a_fsa])
        fsa = k2.intersect(k2.arc_sort(a_fsa), k2.arc_sort(b_fsa))
        assert len(fsa.shape) == 3
Example #10
    def test_treat_epsilon_specially_false(self):
        devices = [torch.device('cpu')]
        if torch.cuda.is_available():
            devices.append(torch.device('cuda'))

        for device in devices:
            # a_fsa recognizes `(0|1)2*`
            s1 = '''
                0 1 0 0.1
                0 1 1 0.2
                1 1 2 0.3
                1 2 -1 0.4
                2
            '''
            a_fsa = k2.Fsa.from_str(s1).to(device)
            a_fsa.requires_grad_(True)

            # b_fsa recognizes `1|2`
            s2 = '''
                0 1 1 1
                0 1 2 2
                1 2 -1 3
                2
            '''
            b_fsa = k2.Fsa.from_str(s2).to(device)
            b_fsa.requires_grad_(True)

            # fsa recognizes `1`
            fsa = k2.intersect(a_fsa, b_fsa, treat_epsilons_specially=False)
            assert len(fsa.shape) == 2
            actual_str = k2.to_str(fsa)
            expected_str = '\n'.join(['0 1 1 1.2', '1 2 -1 3.4', '2'])
            assert actual_str.strip() == expected_str

            loss = fsa.scores.sum()
            (-loss).backward()
            # arc 1 and 3 of a_fsa are kept in the final intersected FSA
            assert torch.allclose(a_fsa.grad,
                                  torch.tensor([0, -1, 0, -1]).to(a_fsa.grad))

            # arc 0 and 2 of b_fsa are kept in the final intersected FSA
            assert torch.allclose(b_fsa.grad,
                                  torch.tensor([-1, 0, -1]).to(b_fsa.grad))

            # if any of the input FSAs is an FsaVec,
            # the output FSA is also an FsaVec.
            a_fsa.scores.grad = None
            b_fsa.scores.grad = None
            a_fsa = k2.create_fsa_vec([a_fsa])
            fsa = k2.intersect(a_fsa, b_fsa, treat_epsilons_specially=False)
            assert len(fsa.shape) == 3
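
The contrast with the previous test: when treat_epsilons_specially=False,
label 0 is matched like any ordinary symbol, so an epsilon path survives only
if the other FSA spells out a 0-labeled arc explicitly. A hedged sketch,
reusing a_fsa and device from the top of the loop above:

s2_eps = '''
    0 1 0 0
    0 1 1 1
    1 2 -1 3
    2
'''
b_eps = k2.Fsa.from_str(s2_eps).to(device)
# Now the 0-labeled arc of a_fsa survives the intersection.
fsa_eps = k2.intersect(a_fsa, b_eps, treat_epsilons_specially=False)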
Example #11
    def test_transducer2_from_str(self):
        s = '''
            0 1  2  22  101 -1.2
            0 2  10 100 102 -2.2
            1 6 -1  16  103 -4.2
            1 3  3  33  104 -3.2
            2 6 -1  26  105 -5.2
            2 4  2  22  106 -6.2
            3 6 -1  36  107 -7.2
            5 0  1  50  108 -8.2
            6
        '''
        for i in range(2):
            if i == 0:
                fsa = k2.Fsa.from_str(s, num_aux_labels=2)
            else:
                fsa = k2.Fsa.from_str(
                    s, aux_label_names=['aux_labels', 'aux_labels2'])
            assert fsa.aux_labels.dtype == torch.int32
            assert fsa.aux_labels.device.type == 'cpu'
            assert torch.all(
                torch.eq(
                    fsa.aux_labels,
                    torch.tensor([22, 100, 16, 33, 26, 22, 36, 50],
                                 dtype=torch.int32)))
            assert torch.all(
                torch.eq(
                    fsa.aux_labels2,
                    torch.tensor([101, 102, 103, 104, 105, 106, 107, 108],
                                 dtype=torch.int32)))

            assert torch.allclose(
                fsa.scores,
                torch.tensor([-1.2, -2.2, -4.2, -3.2, -5.2, -6.2, -7.2, -8.2],
                             dtype=torch.float32))

            # only aux_labels is printed right now; aux_labels2 is omitted.
            expected_str = '''
            0 1 2 22 -1.2
            0 2 10 100 -2.2
            1 6 -1 16 -4.2
            1 3 3 33 -3.2
            2 6 -1 26 -5.2
            2 4 2 22 -6.2
            3 6 -1 36 -7.2
            5 0 1 50 -8.2
            6
            '''
            assert _remove_leading_spaces(expected_str) == \
                    _remove_leading_spaces(k2.to_str(fsa))
Example #12
    def test_transducer_from_openfst(self):
        s = '''
            0 1 2 22  -1.2
            0 2  10 100 -2.2
            1 6  1  16  -4.2
            1 3  3  33  -3.2
            2 6  2  26  -5.2
            2 4  2  22  -6.2
            3 6  3  36  -7.2
            5 0  1  50  -8.2
            7 -9.2
            6
        '''
        for i in range(3):
            if i == 0:
                fsa = k2.Fsa.from_openfst(s, acceptor=False)
            elif i == 1:
                fsa = k2.Fsa.from_openfst(s, num_aux_labels=1)
            else:
                fsa = k2.Fsa.from_openfst(s, aux_label_names=['aux_labels'])

            assert fsa.aux_labels.dtype == torch.int32
            assert fsa.aux_labels.device.type == 'cpu'
            assert torch.all(
                torch.eq(
                    fsa.aux_labels,
                    torch.tensor([22, 100, 16, 33, 26, 22, 36, 50, -1, -1],
                                 dtype=torch.int32)))

            assert torch.allclose(
                fsa.scores,
                torch.tensor([1.2, 2.2, 4.2, 3.2, 5.2, 6.2, 7.2, 8.2, 0, 9.2],
                             dtype=torch.float32))

            expected_str = '''
                0 1 2 22 -1.2
                0 2 10 100 -2.2
                1 6 1 16 -4.2
                1 3 3 33 -3.2
                2 6 2 26 -5.2
                2 4 2 22 -6.2
                3 6 3 36 -7.2
                5 0 1 50 -8.2
                6 8 -1 -1 0
                7 8 -1 -1 -9.2
                8
            '''
            assert _remove_leading_spaces(expected_str) == \
                    _remove_leading_spaces(k2.to_str(fsa, openfst=True))
Example #13
    def test_acceptor_from_str(self):
        s = '''
            0 1 2 -1.2
            0 2  10 -2.2
            1 6 -1  -3.2
            1 3  3  -4.2
            2 6 -1  -5.2
            2 4  2  -6.2
            3 6 -1  -7.2
            5 0  1  -8.2
            6
        '''

        fsa = k2.Fsa.from_str(_remove_leading_spaces(s))

        expected_str = '''
            0 1 2 -1.2
            0 2 10 -2.2
            1 6 -1 -3.2
            1 3 3 -4.2
            2 6 -1 -5.2
            2 4 2 -6.2
            3 6 -1 -7.2
            5 0 1 -8.2
            6
        '''
        assert _remove_leading_spaces(expected_str) == _remove_leading_spaces(
            k2.to_str(fsa))

        arcs = fsa.arcs.values()[:, :-1]
        assert isinstance(arcs, torch.Tensor)
        assert arcs.dtype == torch.int32
        assert arcs.device.type == 'cpu'
        assert arcs.shape == (8, 3), 'there should be 8 arcs'
        assert torch.allclose(arcs[0],
                              torch.tensor([0, 1, 2], dtype=torch.int32))

        assert torch.allclose(
            fsa.scores,
            torch.tensor([-1.2, -2.2, -3.2, -4.2, -5.2, -6.2, -7.2, -8.2],
                         dtype=torch.float32))

        fsa.scores *= -1

        assert torch.allclose(
            fsa.scores,
            torch.tensor([1.2, 2.2, 3.2, 4.2, 5.2, 6.2, 7.2, 8.2],
                         dtype=torch.float32))
Example #14
def gfst(args):
    '''
    This program is for debugging only.
    Usually, a different task needs a different G FST.
    Imagine that we have only these 6 symbols:
    '
    <eps> 0
    <blank> 1
    <unk> 2
    n 3
    y 4
    <eos> 5
    '
    and try to generate the yyn, ynn, and 3-gram G FSTs.
    '''
    symbols = symboletable(args)
    yyn = '''
    0 1 0 0 0.0
    0 2 0 0 0.0
    0 3 0 0 0.0
    0 4 0 0 0.0
    0 5 0 0 0.0
    0 6 0 0 0.0
    1 2 4 4 0.0
    1 6 1 0 0.0
    1 7 5 5 0.0
    2 3 1 0 0.0
    2 7 5 5 0.0
    3 4 4 4 0.0
    3 7 5 5 0.0
    4 5 1 0 0.0
    4 1 3 3 0.0
    4 7 5 5 0.0
    5 1 3 3 0.0
    5 7 5 5 0.0
    6 2 4 4 0.0
    6 7 5 5 0.0
    7 8 -1 -1 0.0
    8
    '''
    yyn_fst = k2.Fsa.from_str(yyn, acceptor=False)
    # yyn_fst.symbols = k2.SymbolTable.from_str(symbols)
    # yyn_fst.aux_symbols = k2.SymbolTable.from_str(symbols)
    gfst_dir = os.path.join(args.data_directory, 'G')
    os.makedirs(gfst_dir, exist_ok=True)
    yyn_fst.draw(os.path.join(gfst_dir, 'yyn.pdf'), 'yyn')
    with open(os.path.join(gfst_dir, 'yyn.fst.txt'), 'w') as f:
        f.write(k2.to_str(yyn_fst))
Example #15
    def test_acceptor_from_tensor(self):
        fsa_tensor = torch.tensor(
            [[0, 1, 2, _k2.float_as_int(-1.2)],
             [0, 2, 10, _k2.float_as_int(-2.2)],
             [1, 6, -1, _k2.float_as_int(-3.2)],
             [1, 3, 3, _k2.float_as_int(-4.2)],
             [2, 6, -1, _k2.float_as_int(-5.2)],
             [2, 4, 2, _k2.float_as_int(-6.2)],
             [3, 6, -1, _k2.float_as_int(-7.2)],
             [5, 0, 1, _k2.float_as_int(-8.2)]],
            dtype=torch.int32)

        fsa = k2.Fsa(fsa_tensor)

        expected_str = '''
            0 1 2 -1.2
            0 2 10 -2.2
            1 6 -1 -3.2
            1 3 3 -4.2
            2 6 -1 -5.2
            2 4 2 -6.2
            3 6 -1 -7.2
            5 0 1 -8.2
            6
        '''
        assert _remove_leading_spaces(expected_str) == _remove_leading_spaces(
            k2.to_str(fsa))

        arcs = fsa.arcs.values()[:, :-1]
        assert isinstance(arcs, torch.Tensor)
        assert arcs.dtype == torch.int32
        assert arcs.device.type == 'cpu'
        assert arcs.shape == (8, 3), 'there should be 8 arcs'
        assert torch.allclose(arcs[0],
                              torch.tensor([0, 1, 2], dtype=torch.int32))

        assert torch.allclose(
            fsa.scores,
            torch.tensor([-1.2, -2.2, -3.2, -4.2, -5.2, -6.2, -7.2, -8.2],
                         dtype=torch.float32))

        fsa.scores *= -1

        assert torch.allclose(
            fsa.scores,
            torch.tensor([1.2, 2.2, 3.2, 4.2, 5.2, 6.2, 7.2, 8.2],
                         dtype=torch.float32))
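
_k2.float_as_int reinterprets the IEEE-754 bits of a float32 as an int32 so
that scores can ride inside the integer arcs tensor. A hedged pure-Python
equivalent for illustration:

import struct

def float_as_int(x: float) -> int:
    # Pack as little-endian float32, unpack the same 4 bytes as int32.
    return struct.unpack('<i', struct.pack('<f', x))[0]

assert float_as_int(0.0) == 0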
Example #16
    def test_transducer_from_str(self):
        s = '''
            0 1  2  22  -1.2
            0 2  10 100 -2.2
            1 6 -1  16  -4.2
            1 3  3  33  -3.2
            2 6 -1  26  -5.2
            2 4  2  22  -6.2
            3 6 -1  36  -7.2
            5 0  1  50  -8.2
            6
        '''
        for i in range(3):
            if i == 0:
                fsa = k2.Fsa.from_str(s, num_aux_labels=1)
            elif i == 1:
                fsa = k2.Fsa.from_str(s, acceptor=False)
            else:
                fsa = k2.Fsa.from_str(s, aux_label_names=['aux_labels'])
            assert fsa.aux_labels.dtype == torch.int32
            assert fsa.aux_labels.device.type == 'cpu'
            assert torch.all(
                torch.eq(
                    fsa.aux_labels,
                    torch.tensor([22, 100, 16, 33, 26, 22, 36, 50],
                                 dtype=torch.int32)))

            assert torch.allclose(
                fsa.scores,
                torch.tensor([-1.2, -2.2, -4.2, -3.2, -5.2, -6.2, -7.2, -8.2],
                             dtype=torch.float32))

            expected_str = '''
            0 1 2 22 -1.2
            0 2 10 100 -2.2
            1 6 -1 16 -4.2
            1 3 3 33 -3.2
            2 6 -1 26 -5.2
            2 4 2 22 -6.2
            3 6 -1 36 -7.2
            5 0 1 50 -8.2
            6
            '''
            assert _remove_leading_spaces(expected_str) == \
                    _remove_leading_spaces(k2.to_str(fsa))
Example #17
    def test(self):
        s = '''
            0 1 2 0.1
            0 1 1 0.2
            1 2 -1 0.3
            2
        '''
        fsa = k2.Fsa.from_str(s)
        fsa.requires_grad_(True)
        sorted_fsa = k2.arc_sort(fsa)

        actual_str = k2.to_str(sorted_fsa)
        expected_str = '\n'.join(['0 1 1 0.2', '0 1 2 0.1', '1 2 -1 0.3', '2'])
        assert actual_str.strip() == expected_str

        loss = (sorted_fsa.scores[1] + sorted_fsa.scores[2]) / 2
        loss.backward()
        assert torch.allclose(fsa.scores.grad,
                              torch.tensor([0.5, 0, 0.5], dtype=torch.float32))
Example #18
class TestConnect(unittest.TestCase):

    def test(self):
        s = '''
            0 1 1 0.1
            0 2 2 0.2
            1 4 -1 0.3
            3 4 -1 0.4
            4
        '''
        fsa = k2.Fsa.from_str(s)
        fsa.requires_grad_(True)
        expected_str = '\n'.join(['0 1 1 0.1', '1 2 -1 0.3', '2'])
        connected_fsa = k2.connect(fsa)
        actual_str = k2.to_str(connected_fsa)
        assert actual_str.strip() == expected_str

        loss = connected_fsa.scores.sum()
        loss.backward()
        assert torch.allclose(fsa.scores.grad,
                              torch.tensor([1, 0, 1, 0], dtype=torch.float32))
Example #19
    def test_transducer_from_tensor(self):
        for device in self.devices:
            fsa_tensor = torch.tensor(
                [[0, 1, 2, _k2.float_as_int(-1.2)],
                 [0, 2, 10, _k2.float_as_int(-2.2)],
                 [1, 6, -1, _k2.float_as_int(-4.2)],
                 [1, 3, 3, _k2.float_as_int(-3.2)],
                 [2, 6, -1, _k2.float_as_int(-5.2)],
                 [2, 4, 2, _k2.float_as_int(-6.2)],
                 [3, 6, -1, _k2.float_as_int(-7.2)],
                 [5, 0, 1, _k2.float_as_int(-8.2)]],
                dtype=torch.int32).to(device)
            aux_labels_tensor = torch.tensor([22, 100, 16, 33, 26, 22, 36, 50],
                                             dtype=torch.int32).to(device)
            fsa = k2.Fsa(fsa_tensor, aux_labels_tensor)
            assert fsa.aux_labels.dtype == torch.int32
            assert fsa.aux_labels.device.type == device.type
            assert torch.all(
                torch.eq(
                    fsa.aux_labels,
                    torch.tensor([22, 100, 16, 33, 26, 22, 36, 50],
                                 dtype=torch.int32).to(device)))

            assert torch.allclose(
                fsa.scores,
                torch.tensor([-1.2, -2.2, -4.2, -3.2, -5.2, -6.2, -7.2, -8.2],
                             dtype=torch.float32,
                             device=device))

            expected_str = '''
                0 1 2 22 -1.2
                0 2 10 100 -2.2
                1 6 -1 16 -4.2
                1 3 3 33 -3.2
                2 6 -1 26 -5.2
                2 4 2 22 -6.2
                3 6 -1 36 -7.2
                5 0 1 50 -8.2
                6
            '''
            assert _remove_leading_spaces(expected_str) == \
                    _remove_leading_spaces(k2.to_str(fsa))
Example #20
    def test_transducer_from_tensor(self):
        devices = [torch.device('cpu')]
        if torch.cuda.is_available():
            devices.append(torch.device('cuda', 0))

        for device in devices:
            fsa_tensor = torch.tensor(
                [[0, 1, 2, _k2.float_as_int(-1.2)],
                 [0, 2, 10, _k2.float_as_int(-2.2)],
                 [1, 6, -1, _k2.float_as_int(-4.2)],
                 [1, 3, 3, _k2.float_as_int(-3.2)],
                 [2, 6, -1, _k2.float_as_int(-5.2)],
                 [2, 4, 2, _k2.float_as_int(-6.2)],
                 [3, 6, -1, _k2.float_as_int(-7.2)],
                 [5, 0, 1, _k2.float_as_int(-8.2)]],
                dtype=torch.int32).to(device)
            aux_labels_tensor = torch.tensor([22, 100, 16, 33, 26, 22, 36, 50],
                                             dtype=torch.int32).to(device)
            fsa = k2.Fsa(fsa_tensor, aux_labels_tensor)
            assert fsa.aux_labels.dtype == torch.int32
            assert fsa.aux_labels.device.type == device.type
            assert torch.allclose(
                fsa.aux_labels,
                torch.tensor([22, 100, 16, 33, 26, 22, 36, 50],
                             dtype=torch.int32).to(device))

            expected_str = '''
                0 1 2 22 -1.2
                0 2 10 100 -2.2
                1 6 -1 16 -4.2
                1 3 3 33 -3.2
                2 6 -1 26 -5.2
                2 4 2 22 -6.2
                3 6 -1 36 -7.2
                5 0 1 50 -8.2
                6
            '''
            assert _remove_leading_spaces(
                expected_str) == _remove_leading_spaces(k2.to_str(fsa))
Example #21
    def test_acceptor_wo_arcs_from_str(self):
        s1 = '''
        '''

        s2 = '''
            0
            1
        '''

        s3 = '''
            1
        '''

        for device in self.devices:
            fsa1 = k2.Fsa.from_str(s1)
            self.assertEqual(k2.to_str(fsa1), '')

            with self.assertRaises(ValueError):
                _ = k2.Fsa.from_str(s2)

            fsa3 = k2.Fsa.from_str(s3)
            self.assertEqual(fsa3.arcs.dim0(), 0)
Example #22
    def test_transducer_from_openfst(self):
        s = '''
            0 1 2 22  -1.2
            0 2  10 100 -2.2
            1 6  1  16  -4.2
            1 3  3  33  -3.2
            2 6  2  26  -5.2
            2 4  2  22  -6.2
            3 6  3  36  -7.2
            5 0  1  50  -8.2
            7 -9.2
            6
        '''
        fsa = k2.Fsa.from_openfst(_remove_leading_spaces(s), acceptor=False)
        assert fsa.aux_labels.dtype == torch.int32
        assert fsa.aux_labels.device.type == 'cpu'
        assert torch.allclose(
            fsa.aux_labels,
            torch.tensor([22, 100, 16, 33, 26, 22, 36, 50, 0, 0],
                         dtype=torch.int32))

        expected_str = '''
            0 1 2 22 -1.2
            0 2 10 100 -2.2
            1 6 1 16 -4.2
            1 3 3 33 -3.2
            2 6 2 26 -5.2
            2 4 2 22 -6.2
            3 6 3 36 -7.2
            5 0 1 50 -8.2
            6 8 -1 0 -0
            7 8 -1 0 -9.2
            8
        '''
        assert _remove_leading_spaces(expected_str) == _remove_leading_spaces(
            k2.to_str(fsa, openfst=True))
Example #23
    def test(self):
        devices = [torch.device('cpu')]
        if torch.cuda.is_available():
            devices.append(torch.device('cuda'))

        for device in devices:
            for use_identity_map, sorted_match_a in [(True, True),
                                                     (False, True),
                                                     (True, False),
                                                     (False, False)]:
                # recognizes (0|1)(0|2)
                s1 = '''
                    0 1 0 0.1
                    0 1 1 0.2
                    1 2 0 0.4
                    1 2 2 0.3
                    2 3 -1 0.5
                    3
                '''

                # recognizes 02*
                s2 = '''
                    0 1 0 1
                    1 1 2 2
                    1 2 -1 3
                    2
                '''

                # recognizes 1*0
                s3 = '''
                    0 0 1 10
                    0 1 0 20
                    1 2 -1 30
                    2
                '''
                a_fsa = k2.Fsa.from_str(s1).to(device)
                b_fsa_1 = k2.Fsa.from_str(s2).to(device)
                b_fsa_2 = k2.Fsa.from_str(s3).to(device)

                a_fsa.requires_grad_(True)
                b_fsa_1.requires_grad_(True)
                b_fsa_2.requires_grad_(True)

                b_fsas = k2.create_fsa_vec([b_fsa_1, b_fsa_2])
                if use_identity_map:
                    a_fsas = k2.create_fsa_vec([a_fsa, a_fsa])
                    b_to_a_map = torch.tensor([0, 1],
                                              dtype=torch.int32).to(device)
                else:
                    a_fsas = k2.create_fsa_vec([a_fsa])
                    b_to_a_map = torch.tensor([0, 0],
                                              dtype=torch.int32).to(device)

                c_fsas = k2.intersect_device(a_fsas, b_fsas, b_to_a_map,
                                             sorted_match_a)
                assert c_fsas.shape == (2, None, None)
                c_fsas = k2.connect(c_fsas.to('cpu'))
                # c_fsas[0] recognizes: 02
                # c_fsas[1] recognizes: 10

                actual_str_0 = k2.to_str(c_fsas[0])
                expected_str_0 = '\n'.join(
                    ['0 1 0 1.1', '1 2 2 2.3', '2 3 -1 3.5', '3'])
                assert actual_str_0.strip() == expected_str_0

                actual_str_1 = k2.to_str(c_fsas[1])
                expected_str_1 = '\n'.join(
                    ['0 1 1 10.2', '1 2 0 20.4', '2 3 -1 30.5', '3'])
                assert actual_str_1.strip() == expected_str_1

                loss = c_fsas.scores.sum()
                (-loss).backward()
                assert torch.allclose(
                    a_fsa.grad,
                    torch.tensor([-1, -1, -1, -1, -2]).to(a_fsa.grad))
                assert torch.allclose(
                    b_fsa_1.grad,
                    torch.tensor([-1, -1, -1]).to(b_fsa_1.grad))
                assert torch.allclose(
                    b_fsa_2.grad,
                    torch.tensor([-1, -1, -1]).to(b_fsa_2.grad))
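
A hedged follow-up on what one typically does with c_fsas after such an
intersection: get_tot_scores() reduces each FSA in the vector to a single
score, which is the usual training loss; the flag values here are
illustrative assumptions:

tot_scores = c_fsas.get_tot_scores(log_semiring=True,
                                   use_double_scores=False)
loss = -tot_scores.sum()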
Example #24
def lexicon_fst_whole(args):
    '''
    This program creates lexicon.newfst.pdf and lexicon.newfst.txt based on
    args.word_file.
    input:
    args: argparse namespace
    return:
    lexicon: k2.Fsa, the lexicon FST
    output:
    lexicon.newfst.txt and lexicon.newfst.pdf in args.data_directory

    The lexicon FST compresses repeated characters in the emission FST.
    '''
    symbols_str = symboletable(args)
    symbol_pairs = symbols_str.split('\n')
    num_noneps = len(symbol_pairs) - 1
    s = ''
    count = 1
    for i in range(1, num_noneps + 1):
        s += '''
0 %d %d %d 0.0
%d %d %d 0 0.0
%d %d -1 -1 0.0
%d 0 0 0 0.0''' % (
            i, i, i, i, i, i, i, num_noneps + 1, i)
    slines = s.strip().split('\n')

    def extract_first_index(line):
        return int(line.strip().split()[0])

    slines = sorted(slines, key=extract_first_index)
    s = '\n'.join(slines)

    s += '\n%d\n' % (num_noneps + 1)
    # s1 = '''
    # 0 1 1 1 0.0
    # 0 2 2 2 0.0
    # 0 3 3 3 0.0
    # 0 4 4 4 0.0
    # 0 5 5 5 0.0
    # 1 1 1 0 0.0
    # 1 6 -1 -1 0.0
    # 1 0 0 0 0.0
    # 2 2 2 0 0.0
    # 2 6 -1 -1 0.0
    # 2 0 0 0 0.0
    # 3 3 3 0 0.0
    # 3 6 -1 -1 0.0
    # 3 0 0 0 0.0
    # 4 4 4 0 0.0
    # 4 6 -1 -1 0.0
    # 4 0 0 0 0.0
    # 5 5 5 0 0.0
    # 5 6 -1 -1 0.0
    # 5 0 0 0 0.0
    # 6
    # '''
    with open('lex.txt', 'w') as f:
        f.write(s)
    g = k2.Fsa.from_str(s, acceptor=False)
    # g.symbols = k2.SymbolTable.from_str(symbols_str)
    # g.aux_symbols = k2.SymbolTable.from_str(symbols_str)
    g.draw(os.path.join(args.data_directory, 'lexicon.newfst.pdf'), title='lexicon')
    with open(os.path.join(args.data_directory, 'lexicon.newfst.txt'), 'w') as f:
        f.write(k2.to_str(g))
Example #25
    def test_transducer3_from_openfst(self):
        s = '''
            0 1  2  22  33  44  -1.2
            0 2  10 100 101 102 -2.2
            1 6  1  16  17  18  -4.2
            1 3  3  33  34  35  -3.2
            2 6  2  26  27  28  -5.2
            2 4  2  22  23  24  -6.2
            3 6  3  36  37  38  -7.2
            5 0  1  50  51  52  -8.2
            7 -9.2
            6
        '''
        for i in range(2):
            if i == 0:
                fsa = k2.Fsa.from_openfst(s, num_aux_labels=3)
            else:
                fsa = k2.Fsa.from_openfst(s,
                                          aux_label_names=[
                                              'aux_labels', 'aux_labels2',
                                              'aux_labels3'
                                          ])

            assert fsa.aux_labels.dtype == torch.int32
            assert fsa.aux_labels.device.type == 'cpu'
            assert torch.all(
                torch.eq(
                    fsa.aux_labels,
                    torch.tensor([22, 100, 16, 33, 26, 22, 36, 50, -1, -1],
                                 dtype=torch.int32)))

            assert fsa.aux_labels2.dtype == torch.int32
            assert fsa.aux_labels2.device.type == 'cpu'
            assert torch.all(
                torch.eq(
                    fsa.aux_labels2,
                    torch.tensor([33, 101, 17, 34, 27, 23, 37, 51, -1, -1],
                                 dtype=torch.int32)))

            assert fsa.aux_labels3.dtype == torch.int32
            assert fsa.aux_labels3.device.type == 'cpu'
            assert torch.all(
                torch.eq(
                    fsa.aux_labels3,
                    torch.tensor([44, 102, 18, 35, 28, 24, 38, 52, -1, -1],
                                 dtype=torch.int32)))

            assert torch.allclose(
                fsa.scores,
                torch.tensor([1.2, 2.2, 4.2, 3.2, 5.2, 6.2, 7.2, 8.2, 0, 9.2],
                             dtype=torch.float32))

            expected_str = '''
                0 1 2 22 -1.2
                0 2 10 100 -2.2
                1 6 1 16 -4.2
                1 3 3 33 -3.2
                2 6 2 26 -5.2
                2 4 2 22 -6.2
                3 6 3 36 -7.2
                5 0 1 50 -8.2
                6 8 -1 -1 0
                7 8 -1 -1 -9.2
                8
            '''
            assert _remove_leading_spaces(expected_str) == \
                    _remove_leading_spaces(k2.to_str(fsa, openfst=True))