def __init__(self, conf):
        '''TextProcessor constructor

        Args:
            conf: processor configuration as a configparser
        '''

        #create the normalizer from the name configured in the processor section
        self.normalizer = normalizer_factory.factory(
            conf.get('processor', 'normalizer'))

        #the alphabet is configured as a space separated list of characters;
        #'\;' is an escape for a literal ';' (';' would otherwise be eaten by
        #configparser inline-comment handling)
        self.alphabet = conf.get('processor', 'alphabet').strip().split(' ')
        self.alphabet = [c if c != '\\;' else ';' for c in self.alphabet]

        #initialize the metadata gathered while processing
        self.max_length = 0
        self.sequence_length_histogram = np.zeros(0, dtype=np.int32)

        #nonesymbol is optional: guard with has_option so a missing option does
        #not raise NoOptionError (the literal string 'None' also means "unset",
        #matching the original behavior)
        if (conf.has_option('processor', 'nonesymbol')
                and conf.get('processor', 'nonesymbol') != 'None'):
            self.nonesymbol = conf.get('processor', 'nonesymbol')
        else:
            self.nonesymbol = ''

        super(TextProcessor, self).__init__(conf)
# --- Exemplo n.º 2: a second variant of the same constructor ---
    def __init__(self, conf):
        '''TextProcessor constructor

        Args:
            conf: processor configuration as a configparser
        '''

        #create the normalizer
        self.normalizer = normalizer_factory.factory(
            conf.get('processor', 'normalizer'))

        self.alphabet = conf.get('processor', 'alphabet').split(' ')

        #initialize the metadata
        self.max_length = 0
        self.sequence_length_histogram = np.zeros(0, dtype=np.int32)
        if conf.has_option('processor', 'nonesymbol'):
            self.nonesymbol = conf.get('processor', 'nonesymbol')
        else:
            self.nonesymbol = ''

        super(TextProcessor, self).__init__(conf)
# --- Exemplo n.º 3: data-prep script for LM training ---
this file will do the dataprep for lm training'''

import os
from six.moves import configparser
from nabu.processing.target_normalizers import normalizer_factory

#pointer to the config file describing the database
database_cfg_file = 'config/lm_databases/TIMIT.conf'

#read the database config file and flatten its [database] section to a dict
database_cfg = configparser.ConfigParser()
database_cfg.read(database_cfg_file)
database_cfg = dict(database_cfg.items('database'))

#create the text normalizer from the configured normalizer name
normalizer = normalizer_factory.factory(database_cfg['normalizer'])

print '------- normalizing training text -----------'
#train_data is a space separated list of source text file paths
sourcefiles = database_cfg['train_data'].split(' ')
#make sure the output directory exists before opening the target file
if not os.path.isdir(database_cfg['train_dir']):
    os.makedirs(database_cfg['train_dir'])
#all normalized text is collected in a single 'text' file in train_dir
#NOTE(review): handle is presumably closed past the end of this chunk — confirm
target_fid = open(os.path.join(database_cfg['train_dir'], 'text'), 'w')
max_num_chars = 0
numlines = 0

#read the textfiles line by line, normalize and write in target file
for sourcefile in sourcefiles:
    with open(sourcefile) as fid:
        for line in fid.readlines():
            normalized = normalizer(line.strip())
            #track the longest normalized line, measured in space separated
            #tokens (despite the 'chars' in the name)
            max_num_chars = max(max_num_chars, len(normalized.split(' ')))