Пример #1
0
class TestRawDecoder(TestCase):
    """Regression tests for a ``Pocketsphinx`` decoder run with all defaults.

    The decoder is built with ``Pocketsphinx()`` and ``decode()`` is called
    without arguments, so the expected values below are tied to whatever
    model and audio those defaults select.
    """

    @classmethod
    def setUpClass(cls):
        """Build the decoder and decode once for the whole class.

        unittest creates one TestCase instance per test method while the
        suite is loaded. Doing this work in ``__init__`` therefore repeated
        the decode for every test and surfaced failures as loader errors
        instead of test errors. The tests only query results, so a single
        shared decoder is sufficient.
        """
        super(TestRawDecoder, cls).setUpClass()
        cls.ps = Pocketsphinx()
        cls.ps.decode()

    def test_raw_decoder_lookup_word(self):
        """A known word yields its phonemes; an unknown word yields None."""
        self.assertEqual(self.ps.lookup_word('hello'), 'HH AH L OW')
        self.assertIsNone(self.ps.lookup_word('abcdf'))

    def test_raw_decoder_hypothesis(self):
        """Check the hypothesis text, its score and its confidence."""
        self.assertEqual(self.ps.hypothesis(), 'go forward ten meters')
        self.assertEqual(self.ps.score(), -7066)
        # Confidence is a float: compare with a tolerance rather than bitwise.
        self.assertAlmostEqual(self.ps.confidence(), 0.04042641466841839)

    def test_raw_decoder_segments(self):
        """Check the word sequence reported by segments()."""
        self.assertEqual(
            self.ps.segments(),
            ['<s>', '<sil>', 'go', 'forward', 'ten', 'meters', '</s>'])

    def test_raw_decoder_best_hypothesis(self):
        """Check the (text, score) pairs returned by best()."""
        self.assertEqual(self.ps.best(), [('go forward ten meters', -28034),
                                          ('go for word ten meters', -28570),
                                          ('go forward and majors', -28670),
                                          ('go forward and meters', -28681),
                                          ('go forward and readers', -28685),
                                          ('go forward ten readers', -28688),
                                          ('go forward ten leaders', -28695),
                                          ('go forward can meters', -28695),
                                          ('go forward and leaders', -28706),
                                          ('go for work ten meters', -28722)])
Пример #2
0
class TestRawDecoder(TestCase):
    """Tests the results of a default ``Pocketsphinx().decode()`` run."""

    @classmethod
    def setUpClass(cls):
        """Create one decoder and decode once, shared by all tests.

        The original set this up in ``__init__``, which unittest invokes
        once per test method at collection time; a class-level fixture
        performs the decode a single time and only when the class is
        actually run.
        """
        super(TestRawDecoder, cls).setUpClass()
        cls.ps = Pocketsphinx()
        cls.ps.decode()

    def test_raw_decoder_lookup_word(self):
        """lookup_word() returns phonemes for 'hello' and None for 'abcdf'."""
        self.assertEqual(self.ps.lookup_word('hello'), 'HH AH L OW')
        self.assertIsNone(self.ps.lookup_word('abcdf'))

    def test_raw_decoder_hypothesis(self):
        """hypothesis(), score() and confidence() match the recorded values."""
        self.assertEqual(self.ps.hypothesis(), 'go forward ten meters')
        self.assertEqual(self.ps.score(), -7066)
        # Tolerant comparison: exact float equality is fragile.
        self.assertAlmostEqual(self.ps.confidence(), 0.04042641466841839)

    def test_raw_decoder_segments(self):
        """segments() returns the decoded words including the markers."""
        self.assertEqual(self.ps.segments(), [
            '<s>', '<sil>', 'go', 'forward', 'ten', 'meters', '</s>'
        ])

    def test_raw_decoder_best_hypothesis(self):
        """best() returns the recorded list of (text, score) pairs."""
        self.assertEqual(self.ps.best(), [
            ('go forward ten meters', -28034),
            ('go for word ten meters', -28570),
            ('go forward and majors', -28670),
            ('go forward and meters', -28681),
            ('go forward and readers', -28685),
            ('go forward ten readers', -28688),
            ('go forward ten leaders', -28695),
            ('go forward can meters', -28695),
            ('go forward and leaders', -28706),
            ('go for work ten meters', -28722)
        ])
Пример #3
0
class SpeechProcessor:
    """Speech recognition via Pocketsphinx and synthesis via Watson TTS.

    Recognition is switched to the JSGF grammar rule ``tp2.grammar``.
    Synthesis requests the ``es-LA_SofiaV3Voice`` voice as ``audio/wav``.
    """

    # NOTE(review): a credential committed to source control should be
    # rotated and removed. It is kept only as the last-resort fallback so
    # that existing callers which pass no key keep working.
    _LEGACY_TTS_API_KEY = 'cq9_4YcCXxClw2AfgUhbokFktZ-xSRT4kcHS2akcZ05J'

    def __init__(self, hmm='data/spanish/CIEMPIESS_Spanish_Models_581h/Models/modelo',
                       lm='data/spanish/CIEMPIESS_Spanish_Models_581h/Models/leng.lm.bin',
                       dict='data/spanish/CIEMPIESS_Spanish_Models_581h/Models/dicc.dic',
                       grammar='data/gramatica-tp2.gram', dataPath='tmp/',
                       tts_api_key=None):
        """Create the decoder, activate the grammar and set up the TTS client.

        Args:
            hmm: Path passed to Pocketsphinx as ``hmm``.
            lm: Path passed to Pocketsphinx as ``lm``.
            dict: Path passed to Pocketsphinx as ``dict``. The name shadows
                the builtin but is part of the public signature.
            grammar: Path of the JSGF grammar file to load.
            dataPath: Directory in which ``reconocer`` looks for audio files.
            tts_api_key: IAM API key for the TTS service. When omitted, the
                ``TEXT_TO_SPEECH_APIKEY`` environment variable is used, and
                only then the legacy built-in key.
        """
        self.data_path = dataPath
        config = {
            'hmm': hmm,
            'lm': lm,
            'dict': dict
        }

        self.ps = Pocketsphinx(**config)

        # Switch the decoder from the language model to the JSGF grammar.
        jsgf = Jsgf(grammar)
        rule = jsgf.get_rule('tp2.grammar')
        # 7.5 is the third argument of build_fsg (presumably the language
        # weight; confirm against the pocketsphinx API).
        fsg = jsgf.build_fsg(rule, self.ps.get_logmath(), 7.5)
        self.ps.set_fsg('tp2', fsg)
        self.ps.set_search('tp2')

        # Synthesis client.
        api_key = (tts_api_key
                   or os.environ.get('TEXT_TO_SPEECH_APIKEY')
                   or self._LEGACY_TTS_API_KEY)
        self.tts_authenticator = IAMAuthenticator(api_key)
        self.tts = TextToSpeechV1(authenticator=self.tts_authenticator)
        self.tts.set_service_url('https://stream.watsonplatform.net/text-to-speech/api')

    def sintetizar(self, outFileName, msg):
        """Synthesize ``msg`` and write the resulting audio to ``outFileName``.

        Nothing is requested or written when ``msg`` is empty. The audio is
        fetched before the file is opened so that a failed request does not
        leave behind an empty file or truncate an existing one.
        """
        if not msg:
            return
        audio = self.tts.synthesize(
            msg,
            voice='es-LA_SofiaV3Voice',
            accept='audio/wav'
        ).get_result().content
        with open(outFileName, 'wb') as audio_file:
            audio_file.write(audio)

    def reconocer(self, inFileName='audio.wav'):
        """Decode ``inFileName`` from the data directory.

        Returns:
            A tuple ``(segments, best)``: the result of ``segments()`` and
            the result of ``best(count=3)`` on the decoder.
        """
        print(self.data_path)
        self.ps.decode(
            audio_file=os.path.join(self.data_path, inFileName),
            buffer_size=2048,
            no_search=False,
            full_utt=False
        )
        return self.ps.segments(), self.ps.best(count=3)
Пример #4
0
class TestPhoneme(TestCase):
    """Tests phoneme-level decoding using the ``allphone`` model.

    The decoder is created with ``lm`` and ``dic`` set to False and an
    ``allphone`` language model, then ``decode()`` is called with no
    arguments; expected values depend on that default audio.
    """

    @classmethod
    def setUpClass(cls):
        """Build the phoneme decoder and decode once for the whole class.

        Setting this up in ``__init__`` made unittest repeat the decode for
        every test method at load time; a class fixture runs it once.
        """
        super(TestPhoneme, cls).setUpClass()
        cls.ps = Pocketsphinx(
            lm=False,
            dic=False,
            allphone='deps/pocketsphinx/model/en-us/en-us-phone.lm.bin',
            lw=2.0,
            pip=0.3,
            beam=1e-200,
            pbeam=1e-20,
            mmap=False
        )
        cls.ps.decode()

    def test_phoneme_hypothesis(self):
        """hypothesis() returns the phonemes as one space-separated string."""
        self.assertEqual(
            self.ps.hypothesis(),
            'SIL G OW F AO R W ER D T AE N M IY IH ZH ER Z S V SIL'
        )

    def test_phoneme_best_phonemes(self):
        """segments() returns the same phonemes as a list."""
        self.assertEqual(self.ps.segments(), [
            'SIL', 'G', 'OW', 'F', 'AO', 'R', 'W',
            'ER', 'D', 'T', 'AE', 'N', 'M', 'IY',
            'IH', 'ZH', 'ER', 'Z', 'S', 'V', 'SIL',
        ])
Пример #5
0
class TestPhoneme(TestCase):
    """Phoneme recognition tests for a decoder configured with ``allphone``."""

    @classmethod
    def setUpClass(cls):
        """Create the decoder and run the decode a single time.

        unittest instantiates the class once per test method, so the
        original ``__init__``-based setup decoded repeatedly during test
        collection. The tests only query results, so one shared decoder
        is enough.
        """
        super(TestPhoneme, cls).setUpClass()
        cls.ps = Pocketsphinx(
            lm=False,
            dic=False,
            allphone='deps/pocketsphinx/model/en-us/en-us-phone.lm.bin',
            lw=2.0,
            pip=0.3,
            beam=1e-200,
            pbeam=1e-20,
            mmap=False
        )
        cls.ps.decode()

    def test_phoneme_hypothesis(self):
        """The hypothesis is the expected space-separated phoneme string."""
        self.assertEqual(
            self.ps.hypothesis(),
            'SIL G OW F AO R D T AE N NG IY ZH ER S SIL'
        )

    def test_phoneme_best_phonemes(self):
        """The segment list holds the expected phonemes in order."""
        self.assertEqual(self.ps.segments(), [
            'SIL', 'G', 'OW', 'F', 'AO', 'R', 'D', 'T',
            'AE', 'N', 'NG', 'IY', 'ZH', 'ER', 'S', 'SIL',
        ])
# Directory returned by get_data_path(); the audio file is looked up in it.
data_path = get_data_path()

# Decoder settings, all resolved relative to model_path.
config = dict(
    hmm=os.path.join(model_path, 'en-us'),
    lm=os.path.join(model_path, 'en-us.lm.bin'),
    dict=os.path.join(model_path, 'cmudict-en-us.dict'),
)

ps = Pocketsphinx(**config)
ps.decode(
    audio_file=os.path.join(data_path, 'output.raw'),
    buffer_size=2048,
    no_search=False,
    full_utt=False,
)

# Plain word list:
# => ['<s>', '<sil>', 'go', 'forward', 'ten', 'meters', '</s>']
print(ps.segments())

# Heading followed by one detailed segment per line. Each entry is
# (word, prob, start_frame, end_frame):
#     ('<s>', 0, 0, 24)
#     ('<sil>', -3778, 25, 45)
#     ('go', -27, 46, 63)
#     ('forward', -38, 64, 116)
#     ('ten', -14105, 117, 152)
#     ('meters', -2152, 153, 211)
#     ('</s>', 0, 212, 260)
print('\n'.join(
    ['Detailed segments:']
    + [str(segment) for segment in ps.segments(detailed=True)]
))

# Summary values, printed one per line in this order:
#     hypothesis  => go forward ten meters
#     probability => -32079
#     score       => -7066
#     confidence  => 0.04042641466841839
for query in (ps.hypothesis, ps.probability, ps.score, ps.confidence):
    print(query())
Пример #7
0
model_path = get_model_path()
data_path = get_data_path()

# The acoustic model comes from model_path; the language model and
# dictionary are read from fixed local paths.
config = dict(
    hmm=os.path.join(model_path, 'en-us'),
    lm=r'C:\Users\BZT\Desktop\speech_segment\5446.lm',
    dict=r'C:\Users\BZT\Desktop\speech_segment\5446.dic',
)

ps = Pocketsphinx(**config)
ps.decode(
    audio_file=r'C:\Users\BZT\Desktop\speech_segment\speech_segment\Ses01F_impro01_M013.wav',
    buffer_size=2048,
    no_search=False,
    full_utt=False,
)

# One detailed segment per line, every field followed by a tab.
for sample in ps.segments(detailed=True):
    print(''.join(str(subsample) + '\t' for subsample in sample))

# Full recognized text.
print(ps.hypothesis())
Пример #8
0
class Sphinx(Thread):
    """Thread that loads a Pocketsphinx decoder and transcribes audio files.

    The decoder is created in ``run()``; ``to_text`` refuses to work until
    ``ready`` has been set.
    """

    def __init__(self):
        Thread.__init__(self)
        # Set to True at the end of run(), once the decoder exists.
        self.ready = False

    def run(self):
        """Create the decoder from the bundled model files, then mark ready."""
        print_important("Info! Thread sphinx started.")
        self.config = {
            'verbose': True,
            'hmm': os.path.join('s2m', 'core', 'sphinx', 'fr'),
            'lm': os.path.join('s2m', 'core', 'sphinx', 'fr.lm.dmp'),
            'dict': os.path.join('s2m', 'core', 'sphinx', 's2m.dict'),
            'jsgf': os.path.join('s2m', 'core', 'sphinx', 's2m.jsgf'),
        }
        self.pocketsphinx = Pocketsphinx(**self.config)
        self.ready = True

    def get_silence(self, duration):
        """Map a relative silence duration to a bracketed silence token.

        ``duration`` is compared against the thresholds 0.25, 0.5, 1.5 and
        3; the caller passes silence length divided by average word length.
        """
        if duration < 0.25:
            return '[veryshortsil]'
        if duration < 0.5:
            return '[shortsil]'
        if duration < 1.5:
            return '[sil]'
        if duration < 3.:
            return '[longsil]'
        return '[verylongsil]'

    def get_segment_string(self, segments):
        """Render segments as words interleaved with silence tokens.

        Args:
            segments: Iterable of objects exposing ``word``, ``start_frame``
                and ``end_frame``.

        Returns:
            A space-joined string. ``<s>`` and ``</s>`` are skipped.
            Consecutive ``<sil>`` segments are merged and emitted as one
            token before the next word; silence after the last word is
            dropped. Returns ``''`` when there are no words.
        """
        segment_list = []
        last_silence = 0
        spoken_duration = 0
        word_count = 0
        for segment in segments:
            if segment.word in ('<s>', '</s>'):
                continue
            length = segment.end_frame - segment.start_frame
            if segment.word == '<sil>':
                last_silence += length
                continue
            if last_silence > 0:
                # Pending silence is stored as an int so it can be told
                # apart from words when the string is rendered below.
                segment_list.append(last_silence)
                last_silence = 0
            spoken_duration += length
            segment_list.append(segment.word)
            word_count += 1
        if word_count == 0:
            return ''
        avg_word_duration = spoken_duration / word_count
        return ' '.join(
            self.get_silence(s / avg_word_duration)
            if isinstance(s, int) else nobrackets(s)
            for s in segment_list)

    def to_text(self, filename, erase=False):
        """Decode ``filename`` and return its segment string and n-best list.

        Args:
            filename: Path of the audio file to decode.
            erase: When true, delete ``filename`` after decoding.

        Returns:
            A tuple ``(segment_string, nbest)``. ``nbest`` holds the text of
            the results from ``best(count=10)`` with the first one skipped.

        Raises:
            EnvironmentError: If the decoder has not finished initializing.
        """
        if not self.ready:
            raise EnvironmentError('Initialization of sphinx not finished.')
        try:
            self.pocketsphinx.decode(filename)
        except Exception:
            # NOTE(review): despite the wording, execution continues and the
            # results below come from whatever state the decoder is left in.
            print("An error was raised by sphinx while decoding file '%r', parsing aborted" % filename)
        segment_string = self.get_segment_string(self.pocketsphinx.seg())
        nbest = [nobrackets(w[0])
                 for w in self.pocketsphinx.best(count=10)[1:]]
        if erase:
            # Previously referenced the undefined name ``loc``, which raised
            # NameError whenever erase=True.
            os.remove(filename)
        return segment_string, nbest
# The audio file to decode is taken from the first command-line argument;
# stop early with a usage message instead of failing later with IndexError.
if len(sys.argv) < 2:
    sys.exit('usage: %s AUDIO_FILE' % sys.argv[0])

data_path = get_data_path()
# print() calls with one argument behave the same on Python 2 and 3; the
# original print statements were a SyntaxError on Python 3.
print("config set")
config = {
    'hmm': os.path.join(model_path, 'en-us'),
    'lm': os.path.join(model_path, 'en-us.lm.bin'),
    'dict': os.path.join(model_path, 'cmudict-en-us.dict')
}

ps = Pocketsphinx(**config)
print("decoging")
ps.decode(audio_file=sys.argv[1],
          buffer_size=2048,
          no_search=False,
          full_utt=False)
print("decoded")

# Word list, e.g. ['<s>', '<sil>', 'go', 'forward', 'ten', 'meters', '</s>']
print(ps.segments())

print("DETAILED SHIT")
# Detailed segments are (word, prob, start_frame, end_frame) entries, e.g.
#     ('<s>', 0, 0, 24)
#     ('<sil>', -3778, 25, 45)
#     ('go', -27, 46, 63)
#     ('forward', -38, 64, 116)
#     ('ten', -14105, 117, 152)
#     ('meters', -2152, 153, 211)
#     ('</s>', 0, 212, 260)
print(ps.segments(detailed=True))
Пример #10
0
    buffer_size=2048,
    no_search=False,
    full_utt=False
)

# Append the detailed segments, one per line, to the segments file. Each
# entry holds the word, its probability, and its start and end position.
with open(filename_output_segments, 'a') as f:
    print(*ps.segments(detailed=True), sep='\n', file=f)

# Convert the audio to text and save the transcript. A context manager
# guarantees the file is closed even if the write fails.
text = ps.hypothesis()

with open(filename_sphinx, "w") as file1:
    file1.write(text)

# Load the segments into a dataframe.
# The file read here is a manually edited copy of the segments file saved
# above: remove the '(' , ')' and "'" characters and save it under
# filename_output_segments_mod before running this step.
df = pd.read_csv(filename_output_segments_mod, sep=",", header=None)
df.columns = ["word", "prob", "startTime", "endTime"]
# The preview is only displayed in an interactive session or notebook.
df.head()
# Instantiate PyAudio.
p = pyaudio.PyAudio()

# Open a mono 16 kHz output stream. pyaudio.paInt16 is the named constant
# for the sample format that was previously written as the bare number 8.
stream = p.open(format=pyaudio.paInt16,
                channels=1,
                rate=16000,
                output=True)

# Read the first block of data.
data = f.read(chunk*2)

frames = []
phones = {}
phoneme_pred = ps.segments(detailed=True)

# Divisor turning frame indices into times (presumably frames per second;
# confirm against the decoder's frame rate).
fps = 100
# Key each segment's label by the time at which it ends: the start frame of
# the following segment, or its own end frame for the last segment.
last_idx = len(phoneme_pred) - 1
for idx, s in enumerate(phoneme_pred):
    if idx < last_idx:
        phones[phoneme_pred[idx + 1][2] / fps] = s[0]
    else:
        phones[s[3] / fps] = s[0]

mouth_list = []
pre_mouth_shape = 'M'

#lip sync style intensity variation
if (opt.level == 0):
    intensity = 0.2
elif (opt.level == 1):