def __init__(self, holder_type):
    """Create a random-access archive reader in the uninitialized state.

    Args:
        holder_type: The holder type; it is stored on the reader and handed
            to NewHolderByType() to build the holder.
    """
    # Description of the source; everything is unset at construction time.
    self.rspecifier = None
    self.archive_rxfilename = None
    self.opts = None

    # Current key (if state == kHaveObject).
    self.cur_key = None

    # Input object for the archive.
    self.input = Input()

    # Type of the holder, and the holder itself, which holds the object we
    # just read (if state == kHaveObject).
    self.type = holder_type
    self.holder = NewHolderByType(holder_type)

    self.state = RandomAccessTableReaderStateType.kUninitialized
def __init__(self, holder_type):
    """Create a random-access script reader in the uninitialized state.

    Args:
        holder_type: The holder type; it is stored on the reader and handed
            to NewHolderByType() to build the holder.
    """
    self.input = Input()
    self.opts = None
    self.rspecifier = None
    # Script entries and their keys; both unset at construction time.
    self.script = None
    self.keys = None
    self.script_rxfilename = None
    self.key = None
    self.type = holder_type
    self.holder = NewHolderByType(self.type)
    self.data_rxfilename = None
    # Index into the script of the most recently found entry.
    self.last_found = 0
    # Use the random-access state enum: the sequential enum was used here
    # by mistake, while this reader's state is otherwise compared against
    # RandomAccessTableReaderStateType values.
    self.state = RandomAccessTableReaderStateType.kUninitialized
def __init__(self, holder_type):
    """Create a sequential archive reader in the uninitialized state.

    Args:
        holder_type: The holder type; it is stored on the reader and handed
            to NewHolderByType() to build the holder.
    """
    # Description of the source; everything is unset at construction time.
    self.archive_rxfilename = None
    self.opts = None
    self.rspecifier = None

    self.input = Input()

    self.type = holder_type
    self.holder = NewHolderByType(holder_type)

    # Key of the current entry; no entry has been read yet.
    self.key = None
    self.state = SequentialTableReaderStateType.kUninitialized
def FindKeyInternal(self, key, need_value=False):
    """Find `key` in "self.map", reading ahead in the archive if needed.

    If the key is not already in the map, objects are read one by one (and
    stored in the map) until the key is found or the reader leaves the
    kNoObject state (end of file or error).

    If called with need_value == False, it is assumed to be called from
    HasKey(): only the found/not-found flag is reported and no deletion is
    scheduled. If called with need_value == True, it is assumed to be called
    from Value(): the value is returned and, if self.opts.once is set, the
    key is remembered in self.to_delete_key because the value will not be
    needed again.

    Args:
        key: The key to find.
        need_value: Whether to return the corresponding value or not.

    Returns:
        A tuple containing:
        1. A boolean variable indicating if the key was found.
        2. The value corresponding to the key if it was found and requested,
           None otherwise.
    """
    # Membership is tested on the dict itself; `in self.map.keys()` would
    # build a list of all keys on Python 2 for every lookup.
    if key in self.map:
        # Found in the map...
        if not need_value:
            # Called from HasKey().
            return (True, None)
        value = self.map[key].Value()
        # Value won't be needed again, so mark for deletion.
        if self.opts.once:
            self.to_delete_key = key
        return (True, value)

    while self.state == RandomAccessTableReaderStateType.kNoObject:
        self.ReadNextObject()
        # Successfully read object.
        if self.state == RandomAccessTableReaderStateType.kHaveObject:
            # We are about to transfer ownership of the object in
            # self.holder to self.map, so the reader itself goes back to
            # holding no object.
            self.state = RandomAccessTableReaderStateType.kNoObject
            if self.cur_key in self.map:
                self.holder.Clear()
                LogError('Duplicate key \"%s\" in archive \"%s\"' %
                         (self.cur_key, self.archive_rxfilename))
            self.map[self.cur_key] = self.holder
            # The stored holder now belongs to the map; start a fresh one
            # for the next read.
            self.holder = NewHolderByType(self.type)
            if self.cur_key == key:
                if not need_value:
                    # Called from HasKey().
                    return (True, None)
                # Called from Value().
                value = self.map[key].Value()
                if self.opts.once:
                    self.to_delete_key = key
                return (True, value)

    # We read the entire archive (or got to an error state) and did not
    # find the key.
    return (False, None)
class RandomAccessTableReaderScriptImpl(object):
    """RandomAccessTableReaderScriptImpl is for random-access reading of
    archives when a script file is specified.

    For simplicity we just read it in all in one go, as it's unlikely someone
    would generate this from a pipe. In principle we could read it on-demand
    as for the archives, but this would probably be overkill.

    The reader state is always one of the RandomAccessTableReaderStateType
    values: kUninitialized (not open), kNoObject (script loaded, nothing in
    the holder) or kHaveObject (the holder contains the object for self.key).
    """

    def __init__(self, holder_type):
        """Create an unopened reader for the given holder type.

        Args:
            holder_type: The holder type; it is stored on the reader and
                handed to NewHolderByType() to build the holder.
        """
        self.input = Input()
        self.opts = None
        self.rspecifier = None
        # Sorted list of (key, data_rxfilename) pairs and the parallel list
        # of keys used for binary search; both are filled in by Open().
        self.script = None
        self.keys = None
        self.script_rxfilename = None
        self.key = None
        self.type = holder_type
        self.holder = NewHolderByType(self.type)
        self.data_rxfilename = None
        # Index into self.script of the most recently found entry.
        self.last_found = 0
        self.state = RandomAccessTableReaderStateType.kUninitialized

    def Open(self, rspecifier):
        """Open a reader for the given rspecifier.

        The whole script file is read, sorted by key and kept in memory.

        Args:
            rspecifier: The given rspecifier.

        Returns:
            A boolean variable indicating if the operation is successful.
        """
        # You may call Open from states kUninitialized and kError.
        # It may leave the object in any of the states.
        if self.IsOpen():
            # call Close() yourself to suppress this exception.
            if not self.Close():
                LogError(
                    'Error closing previous input, rspecifier was \"%s\"' %
                    self.rspecifier)
        self.rspecifier = rspecifier
        (rspecifier_type, rxfilename, opts) = ClassifyRspecifier(rspecifier)
        self.script_rxfilename = rxfilename
        self.opts = opts
        if rspecifier_type != RspecifierType.kScriptRspecifier:
            LogError('Invalid rspecifier type \"%s\"' % rspecifier_type)
        script_input = Input()
        if not script_input.Open(self.script_rxfilename):
            LogError('Failed opening script file \"%s\"' %
                     self.script_rxfilename)
        script = list()
        try:
            if script_input.IsBinary():
                LogError('script file should not be in binary format.')
            while True:
                line = script_input.Stream().Readline()
                if not line:
                    break
                token = line.rstrip().split()
                if len(token) != 2:
                    LogError('Invalid line \"%s\"' % line)
                script.append((token[0], token[1]))
        finally:
            # The script is fully loaded into memory (or we are bailing
            # out), so the script file must not be left open.
            if script_input.IsOpen():
                script_input.Close()
        self.script = sorted(script, key=itemgetter(0))
        self.keys = [key for key, _ in self.script]
        self.state = RandomAccessTableReaderStateType.kNoObject
        self.key = None
        return True

    def IsOpen(self):
        """Return True if the reader is in state kNoObject or kHaveObject."""
        return (self.state == RandomAccessTableReaderStateType.kNoObject or
                self.state == RandomAccessTableReaderStateType.kHaveObject)

    def Close(self):
        """Close the reader and forget the loaded script.

        Calling this on a reader that is not open is reported via LogError.

        Returns:
            True.
        """
        if not self.IsOpen():
            LogError('Called on input that was not open.')
        self.input.Close()
        self.holder.Clear()
        self.last_found = 0
        self.script = None
        self.keys = None
        self.key = None
        self.data_rxfilename = None
        self.state = RandomAccessTableReaderStateType.kUninitialized
        return True

    def HasKey(self, key):
        """Return True if `key` is available.

        In permissive mode the object is actually loaded, so that an entry
        whose data cannot be read is not reported as present.

        Args:
            key: The key to look for.

        Returns:
            A boolean variable indicating if the key is available.
        """
        preload = self.opts.permissive
        return self.HasKeyInternal(key, preload)

    def Value(self, key):
        """Return the value for `key`, loading the object if necessary.

        Args:
            key: The key whose value is requested.

        Returns:
            The value held by the holder for `key`. Failure to obtain the
            object is reported via LogError.
        """
        if not self.HasKeyInternal(key, True):
            LogError('Could not get item for key = %s' % key)
        return self.holder.Value()

    def HasKeyInternal(self, key, preload):
        """Look up `key` and, if requested, load its object into the holder.

        Args:
            key: The key to look for.
            preload: If True, the object is read into self.holder; if False,
                only the presence of the key in the script is checked.

        Returns:
            True if the key was found (and, with preload, its object could
            be loaded), False otherwise.
        """
        if self.state == RandomAccessTableReaderStateType.kUninitialized or \
                self.state == RandomAccessTableReaderStateType.kError:
            LogError(
                'Called on RandomAccessTableReader object that is not open.')
        elif self.state == RandomAccessTableReaderStateType.kHaveObject:
            # The requested object is already sitting in the holder.
            if key == self.key:
                return True
        if not self.LookupKey(key):
            return False
        if not preload:
            return True
        data_rxfilename = self.script[self.last_found][1]
        if self.state == RandomAccessTableReaderStateType.kHaveObject and \
                data_rxfilename != self.data_rxfilename:
            # The holder contains an object from a different file; drop it.
            self.state = RandomAccessTableReaderStateType.kNoObject
            self.holder.Clear()
        self.key = key
        self.data_rxfilename = data_rxfilename
        if self.state == RandomAccessTableReaderStateType.kNoObject:
            success = self.input.Open(self.data_rxfilename)
            if not success:
                LogError('Failed to open file \"%s\"' % self.data_rxfilename)
                return False
            if self.holder.Read(self.input.Stream(), self.input.IsBinary()):
                self.state = RandomAccessTableReaderStateType.kHaveObject
            else:
                LogError('Failed to load object from \"%s\"' %
                         self.data_rxfilename)
                return False
        return True

    def LookupKey(self, key):
        """Locate `key` in the sorted script.

        On success self.last_found is set to the index of the entry; on
        failure self.last_found is left untouched.

        Args:
            key: The key to look for.

        Returns:
            True if the key is present in the script, False otherwise
            (including when the script is empty).
        """
        # First, an optimization for consecutive access: since HasKey() and
        # then Value() may both look up the same key, test the current and
        # the next position before searching.
        for offset in (0, 1):
            candidate = self.last_found + offset
            if candidate < len(self.script) and \
                    self.script[candidate][0] == key:
                self.last_found = candidate
                return True
        # Binary search. idx is -1 when the script is empty or every key is
        # greater than `key`; that must not wrap around to the last entry.
        idx = bisect.bisect(self.keys, key) - 1
        if idx >= 0 and self.keys[idx] == key:
            self.last_found = idx
            return True
        return False
class SequentialTableReaderArchiveImpl(object):
    """Reads the (key, object) entries of an archive one after another."""

    def __init__(self, holder_type):
        """Initialize the reader for the given holder type.

        Args:
            holder_type: The given holder type.
        """
        self.archive_rxfilename = None
        self.opts = None
        self.rspecifier = None
        self.input = Input()
        self.type = holder_type
        self.holder = NewHolderByType(holder_type)
        self.key = None
        self.state = SequentialTableReaderStateType.kUninitialized

    def Open(self, rspecifier):
        """Open a reader for the given rspecifier.

        After opening the stream, the first entry is read immediately, so
        that on return the reader is either at an object or at end of file.

        Args:
            rspecifier: The given rspecifier.

        Returns:
            A boolean variable indicating if the operation is successful.
        """
        states = SequentialTableReaderStateType

        # A reader that is still in use is closed first; call Close()
        # yourself to suppress the resulting exception.
        if self.state != states.kUninitialized and not self.Close():
            if self.opts.permissive:
                LogWarning('Error closing previous input (only warning, '
                           'since permissive mode).')
            else:
                LogError('Error closing previous input, rspecifier was '
                         '\"%s\"' % self.rspecifier)

        self.rspecifier = rspecifier
        rspecifier_type, rxfilename, opts = ClassifyRspecifier(rspecifier)
        self.archive_rxfilename = rxfilename
        self.opts = opts
        if rspecifier_type != RspecifierType.kArchiveRspecifier:
            LogError('Invalid rspecifier type \"%s\"' % rspecifier_type)

        self.input = Input()
        # The holder decides whether the stream is opened normally or in
        # text mode.
        if self.holder.IsReadInBinary():
            opened = self.input.Open(self.archive_rxfilename)
        else:
            opened = self.input.OpenTextMode(self.archive_rxfilename)
        if not opened:
            self.state = states.kUninitialized
            LogError('Failed to open stream \"%s\"' %
                     self.archive_rxfilename)

        self.state = states.kFileStart
        self.Next()
        if self.state == states.kError:
            self.input.Close()
            self.state = states.kUninitialized
            LogError('Error beginning to read archive file \"%s\" (wrong '
                     'filename?)' % self.archive_rxfilename)
        if self.state not in (states.kHaveObject, states.kEof):
            LogError('Invalid state \"%s\"' % self.state)
        return True

    def Next(self):
        """Advance to the next entry of the archive.

        Returns:
            True, both when an object was read (state kHaveObject) and when
            the end of the stream was reached (state kEof).
        """
        states = SequentialTableReaderStateType

        if self.state == states.kHaveObject:
            self.holder.Clear()
        elif self.state not in (states.kFileStart, states.kFreedObject):
            LogError('Invalid state \"%s\"' % self.state)

        if self.input.Stream().Eof():
            self.state = states.kEof
            return True

        self.key = ReadToken(self.input.Stream(), self.input.IsBinary(),
                             False)
        separator = self.input.Stream().Peek(1)
        # We expect a space ' ' after the key. We also allow tab, just so we
        # can read archives generated by scripts that may not be fully aware
        # of how this format works.
        if separator not in (' ', '\t', '\n'):
            LogError('Invalid archive file format: expected space after key '
                     '\"%s\", got character \"%s\" when reading archive '
                     '\"%s\".' % (self.key, separator,
                                  self.archive_rxfilename))
        if separator != '\n':
            # Consume the space or tab.
            self.input.Stream().Read(1)

        binary = InitKaldiInputStream(self.input.Stream())
        if not self.holder.Read(self.input.Stream(), binary):
            self.holder.Clear()
            LogError('Failed to read object from archive \"%s\"' %
                     self.archive_rxfilename)
        self.state = states.kHaveObject
        return True

    def IsOpen(self):
        """Tell whether the reader is open.

        Returns:
            True in states kEof, kHaveObject and kFreedObject, False in
            state kUninitialized. Any other state is reported via LogError.
        """
        states = SequentialTableReaderStateType
        if self.state in (states.kEof, states.kHaveObject,
                          states.kFreedObject):
            return True
        if self.state == states.kUninitialized:
            return False
        # note: kFileStart is not a valid state for the user to call a
        # member function (we never return from a public function in
        # this state).
        LogError('Invalid state \"%s\"' % self.state)

    def Done(self):
        """Tell whether there is nothing more to read.

        Returns:
            False while an object is available, True in states kEof and
            kError. Any other state is reported via LogError.
        """
        states = SequentialTableReaderStateType
        if self.state == states.kHaveObject:
            return False
        if self.state in (states.kEof, states.kError):
            # Error condition, like Eof, counts as Done(); the
            # destructor/Close() will inform the user of the error.
            return True
        LogError('Invalid state \"%s\"' % self.state)

    def Key(self):
        """Return the key of the current entry (state must be kHaveObject)."""
        if not self.state == SequentialTableReaderStateType.kHaveObject:
            LogError('Invalid state \"%s\"' % self.state)
        return self.key

    def Value(self):
        """Return the value of the current entry (state must be
        kHaveObject).
        """
        if not self.state == SequentialTableReaderStateType.kHaveObject:
            LogError('Invalid state \"%s\"' % self.state)
        return self.holder.Value()

    def Close(self):
        """Close the reader and return it to the uninitialized state.

        Returns:
            False if the reader was in state kError, or in state kEof with a
            nonzero status from closing the input, and permissive mode is
            off; True otherwise.
        """
        states = SequentialTableReaderStateType

        if not self.IsOpen():
            LogError('Called on input that was not open.')

        status = self.input.Close() if self.input.IsOpen() else 0
        if self.state == states.kHaveObject:
            self.holder.Clear()

        previous_state = self.state
        self.state = states.kUninitialized

        failed = previous_state == states.kError or \
            (previous_state == states.kEof and status != 0)
        if not failed:
            return True
        if not self.opts.permissive:
            return False
        LogWarning('Error state detected closing reader. Ignoring '
                   'it because you specified permissive mode.')
        return True
class RandomAccessTableReaderArchiveImplBase(object):
    """Base class for derived implementations such as unsorted/sorted/doubly
    sorted.

    It provides opening the archive, reading one object at a time and the
    shared part of closing. Open() calls self.Close(), which is not defined
    here and is expected to be supplied by the derived class.
    """

    def __init__(self, holder_type):
        """Create an unopened reader for the given holder type.

        Args:
            holder_type: The holder type; it is stored on the reader and
                handed to NewHolderByType() to build the holder.
        """
        self.input = Input()  # Input object for the archive.
        self.cur_key = None  # Current key (if state == kHaveObject).
        self.type = holder_type  # type of the holder
        # Holds the object we just read (if state == kHaveObject).
        self.holder = NewHolderByType(self.type)
        self.rspecifier = None
        self.archive_rxfilename = None
        self.opts = None
        self.state = RandomAccessTableReaderStateType.kUninitialized

    def Open(self, rspecifier):
        """Open a reader for the given archive rspecifier.

        Args:
            rspecifier: The given rspecifier.

        Returns:
            A boolean variable indicating if the operation is successful.
        """
        if self.state != RandomAccessTableReaderStateType.kUninitialized:
            if not self.Close():
                LogError('Failed to close previous input \"%s\".' %
                         self.rspecifier)
        (rspecifier_type, rxfilename, opts) = ClassifyRspecifier(rspecifier)
        if rspecifier_type != RspecifierType.kArchiveRspecifier:
            LogError('Invalid rspecifier type \"%s\"' % rspecifier_type)
        self.rspecifier = rspecifier
        self.archive_rxfilename = rxfilename
        self.opts = opts
        # The holder decides whether the stream is opened normally or in
        # text mode.
        if self.holder.IsReadInBinary():
            success = self.input.Open(self.archive_rxfilename)
        else:
            success = self.input.OpenTextMode(self.archive_rxfilename)
        if not success:
            self.state = RandomAccessTableReaderStateType.kUninitialized
            LogError('Failed to open stream \"%s\"' %
                     self.archive_rxfilename)
        else:
            self.state = RandomAccessTableReaderStateType.kNoObject
        return True

    def ReadNextObject(self):
        """Read the next (key, object) entry of the archive into the holder.

        Must be called in state kNoObject.

        Returns:
            True if an object was read (state becomes kHaveObject), False if
            the end of the stream was reached (state becomes kEof).
        """
        if self.state != RandomAccessTableReaderStateType.kNoObject:
            LogError('Called from the wrong state \"%s\"' % self.state)
        if self.input.Stream().Eof():
            self.state = RandomAccessTableReaderStateType.kEof
            return False
        self.cur_key = ReadToken(self.input.Stream(), self.input.IsBinary(),
                                 False)
        c = self.input.Stream().Peek(1)
        # We expect a space ' ' after the key. We also allow tab, just so we
        # can read archives generated by scripts that may not be fully aware
        # of how this format works.
        if c != ' ' and c != '\t' and c != '\n':
            LogError(
                'Invalid archive file format: expected space after key '
                '\"%s\", got character \"%s\" when reading archive \"%s\".' %
                (self.cur_key, c, self.archive_rxfilename))
        if c != '\n':
            # Consume the space or tab.
            self.input.Stream().Read(1)
        binary = InitKaldiInputStream(self.input.Stream())
        if not self.holder.Read(self.input.Stream(), binary):
            self.holder.Clear()
            LogError('Failed to read object from archive \"%s\"' %
                     self.archive_rxfilename)
        self.state = RandomAccessTableReaderStateType.kHaveObject
        return True

    def IsOpen(self):
        """Tell whether the reader is open.

        Returns:
            True in states kEof, kError, kHaveObject and kNoObject, False in
            state kUninitialized. Any other state is reported via LogError.
        """
        if self.state == RandomAccessTableReaderStateType.kEof or \
                self.state == RandomAccessTableReaderStateType.kError or \
                self.state == RandomAccessTableReaderStateType.kHaveObject or \
                self.state == RandomAccessTableReaderStateType.kNoObject:
            return True
        elif self.state == RandomAccessTableReaderStateType.kUninitialized:
            return False
        else:
            LogError('Invalid state \"%s\"' % self.state)

    def CloseInternal(self):
        """Called by the child-class virtual Close() functions, does the
        shared parts of the cleanup.

        Returns:
            True if the reader was not in the error state, or if it was but
            permissive mode is on (the error is then only warned about);
            False otherwise.
        """
        if not self.IsOpen():
            LogError('Called twice or otherwise wrongly.')
        if self.input.IsOpen():
            self.input.Close()
        if self.state == RandomAccessTableReaderStateType.kHaveObject:
            self.holder.Clear()
        ans = (self.state != RandomAccessTableReaderStateType.kError)
        self.state = RandomAccessTableReaderStateType.kUninitialized
        if not ans and self.opts.permissive:
            LogWarning('Error state detected closing reader. Ignoring it '
                       'because you specified permissive mode.')
            # "Ignoring" the error means reporting success, as
            # SequentialTableReaderArchiveImpl.Close() does.
            return True
        return ans
class SequentialTableReaderScriptImpl(object):
    """Reads the entries of a script file one after another.

    Each script line has two whitespace-separated fields: a key and the
    rxfilename its object is read from. The object is only loaded when it is
    needed (see EnsureObjectLoaded()).
    """

    def __init__(self, holder_type):
        """Initialize the reader for the given holder type.

        Args:
            holder_type: The given holder type.
        """
        self.rspecifier = None
        self.opts = None
        self.script_rxfilename = None
        self.script_input = Input()
        self.data_input = Input()
        self.type = holder_type
        self.holder = NewHolderByType(self.type)
        self.range_holder = NewHolderByType(self.type)
        self.key = None
        self.data_rxfilename = None
        self.range = None
        self.state = SequentialTableReaderStateType.kUninitialized

    def Open(self, rspecifier):
        """Open a reader for the given rspecifier.

        Args:
            rspecifier: The given rspecifier.

        Returns:
            A boolean variable indicating if the operation is successful.
        """
        # You may call Open from states kUninitialized and kError.
        # It may leave the object in any of the states.
        if self.state != SequentialTableReaderStateType.kUninitialized and \
                self.state != SequentialTableReaderStateType.kError:
            # call Close() yourself to suppress this exception.
            if not self.Close():
                LogError(
                    'Error closing previous input, rspecifier was \"%s\"' %
                    self.rspecifier)
        self.rspecifier = rspecifier
        (rspecifier_type, rxfilename, opts) = ClassifyRspecifier(rspecifier)
        self.script_rxfilename = rxfilename
        self.opts = opts
        if rspecifier_type != RspecifierType.kScriptRspecifier:
            LogError('Invalid rspecifier type \"%s\"' % rspecifier_type)
        self.script_input = Input()
        if not self.script_input.Open(self.script_rxfilename):
            LogError('Failed opening script file \"%s\"' %
                     self.script_rxfilename)
        if self.script_input.IsBinary():
            self.SetErrorState()
            LogError('script file should not be in binary format.')
        else:
            self.state = SequentialTableReaderStateType.kFileStart
            self.Next()
        if self.state == SequentialTableReaderStateType.kError:
            return False
        # any other status, including kEof, is OK from the point of view
        # of the 'open' function (empty scp file is not inherently an
        # error).
        return True

    def IsOpen(self):
        """Tell whether the reader is open.

        Returns:
            True in states kEof, kHaveScpLine, kHaveObject and kHaveRange,
            False in states kUninitialized and kError. Any other state is
            reported via LogError.
        """
        if self.state == SequentialTableReaderStateType.kEof or \
                self.state == SequentialTableReaderStateType.kHaveScpLine or \
                self.state == SequentialTableReaderStateType.kHaveObject or \
                self.state == SequentialTableReaderStateType.kHaveRange:
            return True
        elif self.state == SequentialTableReaderStateType.kUninitialized or \
                self.state == SequentialTableReaderStateType.kError:
            return False
        else:
            # note: kFileStart is not a valid state for the user to call a
            # member function (we never return from a public function in
            # this state).
            LogError('Invalid state \"%s\"' % self.state)

    def Done(self):
        """Tell whether there is nothing more to read.

        Returns:
            False while an entry is available, True in states kEof and
            kError. Any other state is reported via LogError.
        """
        if self.state == SequentialTableReaderStateType.kHaveScpLine or \
                self.state == SequentialTableReaderStateType.kHaveObject or \
                self.state == SequentialTableReaderStateType.kHaveRange:
            return False
        elif self.state == SequentialTableReaderStateType.kEof or \
                self.state == SequentialTableReaderStateType.kError:
            # Error condition, like Eof, counts as Done(); the
            # destructor/Close() will inform the user of the error.
            return True
        else:
            LogError('Invalid state \"%s\"' % self.state)

    def Key(self):
        """Return the key of the current script entry."""
        if self.state != SequentialTableReaderStateType.kHaveScpLine and \
                self.state != SequentialTableReaderStateType.kHaveObject and \
                self.state != SequentialTableReaderStateType.kHaveRange:
            LogError('Invalid state \"%s\"' % self.state)
        return self.key

    def Value(self):
        """Return the object of the current script entry, loading it first
        if that has not happened yet.
        """
        if not self.EnsureObjectLoaded():
            LogError('Failed to load object from \"%s\" to suppress this '
                     'error, add the permissive (p, ) option to the '
                     'rspecifier.' % self.data_rxfilename)
        if self.state == SequentialTableReaderStateType.kHaveRange:
            return self.range_holder.Value()
        elif self.state == SequentialTableReaderStateType.kHaveObject:
            return self.holder.Value()
        else:
            LogError('Invalid state \"%s\"' % self.state)

    def Next(self):
        """Advance to the next script entry.

        In permissive mode, entries whose object cannot be loaded are
        skipped.
        """
        while True:
            self.NextScpLine()
            if self.Done():
                return
            if self.opts.permissive:
                # Permissive mode means, when reading scp files, we treat keys
                # whose scp entry cannot be read as nonexistent. This means
                # trying to read.
                if self.EnsureObjectLoaded():
                    return  # Success.
                # else try the next scp line.
            else:
                # We go the next key; Value() will crash if we can't read the
                # object on the scp line.
                return

    def Close(self):
        """Close the reader and return it to the uninitialized state.

        Returns:
            False if the reader was in state kError, or in state kEof with a
            nonzero status from closing the script input, and permissive
            mode is off; True otherwise.
        """
        status = 0
        if self.script_input.IsOpen():
            status = self.script_input.Close()
        if self.data_input.IsOpen():
            self.data_input.Close()
        self.range_holder.Clear()
        self.holder.Clear()
        if not self.IsOpen():
            LogError('Called on input that was not open.')
        # Remember how we ended up before resetting the state, so that the
        # reader can be reopened and callers such as Open() get a real
        # success flag instead of None.
        old_state = self.state
        self.state = SequentialTableReaderStateType.kUninitialized
        if old_state == SequentialTableReaderStateType.kError or \
                (old_state == SequentialTableReaderStateType.kEof and
                 status != 0):
            if self.opts.permissive:
                LogWarning('Error state detected closing reader. Ignoring '
                           'it because you specified permissive mode.')
                return True
            return False
        return True

    def SetErrorState(self):
        """Put the reader into state kError and release inputs and holders.

        Returns:
            True.
        """
        self.state = SequentialTableReaderStateType.kError
        self.script_input.Close()
        self.data_input.Close()
        self.holder.Clear()
        self.range_holder.Clear()
        return True

    def NextScpLine(self):
        """Read the next line of the script file and update key, data
        rxfilename and state accordingly.

        At the end of the script the state becomes kEof and inputs and
        holders are released.
        """
        if self.state == SequentialTableReaderStateType.kHaveRange:
            self.range_holder.Clear()
            self.state = SequentialTableReaderStateType.kHaveObject
        if self.state != SequentialTableReaderStateType.kHaveScpLine and \
                self.state != SequentialTableReaderStateType.kHaveObject and \
                self.state != SequentialTableReaderStateType.kFileStart:
            LogError('Invalid state \"%s\"' % self.state)
        line = self.script_input.Stream().Readline()
        if line:
            token = line.rstrip().split()
            if len(token) != 2:
                LogError('Invalid line \"%s\"' % line)
            self.key = token[0]
            data_rxfilename = None
            if token[1].endswith(']'):
                LogError('Range specifier support not implemented yet.')
            else:
                data_rxfilename = token[1]
                self.range = None
            filenames_equal = (self.data_rxfilename == data_rxfilename)
            if not filenames_equal:
                self.data_rxfilename = data_rxfilename
            if self.state == SequentialTableReaderStateType.kHaveObject:
                if not filenames_equal:
                    self.holder.Clear()
                    self.state = SequentialTableReaderStateType.kHaveScpLine
                # else: same file as before, so the object already in the
                # holder is kept and the state stays kHaveObject.
            else:
                self.state = SequentialTableReaderStateType.kHaveScpLine
        else:
            self.state = SequentialTableReaderStateType.kEof
            # There is nothing more in the scp file. Might as well close input
            # streams as we don't need them.
            self.script_input.Close()
            if self.data_input.IsOpen():
                self.data_input.Close()
            self.holder.Clear()  # clear the holder if it was nonempty.
            # clear the range holder if it was nonempty.
            self.range_holder.Clear()

    def EnsureObjectLoaded(self):
        """Ensures that we have fully loaded any object associated with the
        current key.

        Returns:
            A boolean variable indicating if the operation is successful.
        """
        if self.state != SequentialTableReaderStateType.kHaveScpLine and \
                self.state != SequentialTableReaderStateType.kHaveObject and \
                self.state != SequentialTableReaderStateType.kHaveRange:
            LogError('Invalid state \"%s\"' % self.state)
        if self.state == SequentialTableReaderStateType.kHaveScpLine:
            success = self.data_input.Open(self.data_rxfilename)
            if not success:
                LogError('Failed to open file \"%s\"' % self.data_rxfilename)
                # Never try to read from an input that failed to open.
                return False
            if self.holder.Read(self.data_input.Stream(),
                                self.data_input.IsBinary()):
                self.state = SequentialTableReaderStateType.kHaveObject
            else:
                LogError('Failed to load object from \"%s\"' %
                         self.data_rxfilename)
                return False
        # At this point the state must be either kHaveObject or kHaveRange.
        if self.range:
            LogError('Range specifier support not implemented yet.')
        return True