def PackageFeatAndAli(all_package, input_lock, package_end, scp_file, ali_file, nstreams, skip_frame = 1, max_input_seq_length = 1500, criterion = 'ce'):
    """Batch feature-scp lines with their alignments and publish the batches.

    Alignments are loaded once with ``read_alignment(ali_file)``; then
    ``scp_file`` is read line by line and every line is decoded with
    ``read_next_utt`` into ``(utt_id, utt_mat)``. An utterance is skipped
    (and logged) when:

    * ``int(len(utt_mat) / skip_frame) + 1`` exceeds ``max_input_seq_length``;
    * the alignment lookup for ``utt_id`` raises ``KeyError``;
    * ``'ce' in criterion`` and the feature and alignment lengths differ,
      except when the alignment is exactly one label longer, in which case
      its last label is dropped and the utterance is kept;
    * otherwise, ``'ctc' in criterion`` and
      ``len(utt_mat) < len(alignment) * 2 - 1``.

    Accepted utterances are collected as (raw scp line, alignment). Every
    time ``nstreams`` of them are available, ``[scp_list, ali_list]`` is
    appended to ``all_package`` while holding ``input_lock``. A trailing
    partial batch is padded by repeating its first entry until it holds
    ``nstreams`` items. Finally ``True`` is appended to ``package_end``
    (also under the lock) to signal that no more batches will follow.

    Args:
        all_package: list-like that receives the batches.
        input_lock: object with ``acquire()`` / ``release()`` guarding
            ``all_package`` and ``package_end``.
        package_end: list-like that receives the end-of-data marker.
        scp_file: path of the feature scp file.
        ali_file: path handed to ``read_alignment``.
        nstreams: number of utterances per batch.
        skip_frame: frame skipping factor used for the length check only.
        max_input_seq_length: longest accepted (skipped) utterance length.
        criterion: string tested for the substrings ``'ce'`` and ``'ctc'``.

    Returns:
        Always ``True``.
    """
    def _locked_append(target, item):
        # Guarantee the lock is released even when append() raises, so a
        # failing producer cannot leave consumers blocked on the lock.
        input_lock.acquire()
        try:
            target.append(item)
        finally:
            input_lock.release()

    logging.info('------start PackageFeatAndAli------')
    start_package = time.time()
    # first read ali
    alignment_dict = read_alignment(ali_file)
    scp_list = []
    ali_list = []
    # second read feature scp file and package feature and ali
    # (the context manager closes the scp file, which the bare open() did not)
    with open(scp_file, 'r') as scp_fp:
        for line in scp_fp:
            utt_id, utt_mat = read_next_utt(line)
            logging.debug(utt_id + ' read ok.')
            # overlength
            skipped_length = int(len(utt_mat)/skip_frame) + 1
            if skipped_length > max_input_seq_length:
                logging.info(utt_id + ' length ' + str(skipped_length) + ' > ' + str(max_input_seq_length))
                continue
            try:
                ali_utt = alignment_dict[utt_id]
            except KeyError:
                logging.info('no '+ utt_id + ' align')
                continue
            if 'ce' in criterion:
                if len(utt_mat) != len(ali_utt):
                    if len(ali_utt) - len(utt_mat) == 1:
                        # alignment is one label too long: drop the last label
                        ali_utt = numpy.delete(ali_utt, -1)
                    else:
                        logging.info(utt_id + ' feat and ali isn\'t equal length')
                        continue
            elif 'ctc' in criterion:
                if len(utt_mat) < len(ali_utt) * 2 - 1:
                    logging.info(utt_id + ' feat < ali * 2 - 1 :%d < %d * 2 - 1' % (len(utt_mat), len(ali_utt)))
                    continue
            # The raw scp line is stored, not the decoded matrix.
            scp_list.append(line)
            ali_list.append(ali_utt)
            if len(scp_list) == nstreams:
                _locked_append(all_package, [scp_list, ali_list])
                scp_list = []
                ali_list = []
    if len(scp_list) != 0:
        # Pad the final partial batch with copies of its first utterance.
        while len(scp_list) < nstreams:
            scp_list.append(scp_list[0])
            ali_list.append(ali_list[0])
        _locked_append(all_package, [scp_list, ali_list])
    # Tell consumers that packaging is finished.
    _locked_append(package_end, True)
    end_package = time.time()
    logging.info('------PackageFeatAndAli end. Package time is : %f s, batch number : %d' % (end_package - start_package, len(all_package)))
    return True
def PackageFeatAndAliAndLat(all_package, input_lock, package_end, feat_scp_file, ali_file, lat_scp_file, nstreams, skip_frame = 1, max_input_seq_length = 1500, criterion = 'mmi'):
    """Batch feature-scp lines with alignments and lattice entries.

    Alignments come from ``read_alignment(ali_file)`` and lattice entries
    from ``ReadScp(lat_scp_file)``; both are looked up by utterance id.
    ``feat_scp_file`` is read line by line and every line is decoded with
    ``read_next_utt`` into ``(utt_id, utt_mat)``. An utterance is skipped
    (and logged) when ``int(len(utt_mat) / skip_frame) + 1`` exceeds
    ``max_input_seq_length`` or when either lookup raises ``KeyError``.

    Every ``nstreams`` accepted utterances,
    ``[feat_list, ali_list, lat_list]`` is appended to ``all_package`` while
    holding ``input_lock``. A trailing partial batch is padded by repeating
    its first entry until it holds ``nstreams`` items. Finally ``True`` is
    appended to ``package_end`` (also under the lock) to mark the end.

    Args:
        all_package: list-like that receives the batches.
        input_lock: object with ``acquire()`` / ``release()`` guarding
            ``all_package`` and ``package_end``.
        package_end: list-like that receives the end-of-data marker.
        feat_scp_file: path of the feature scp file.
        ali_file: path handed to ``read_alignment``.
        lat_scp_file: path handed to ``ReadScp``.
        nstreams: number of utterances per batch.
        skip_frame: frame skipping factor used for the length check only.
        max_input_seq_length: longest accepted (skipped) utterance length.
        criterion: accepted but not used by this function.

    Returns:
        None.
    """
    def _locked_append(target, item):
        # Guarantee the lock is released even when append() raises, so a
        # failing producer cannot leave consumers blocked on the lock.
        input_lock.acquire()
        try:
            target.append(item)
        finally:
            input_lock.release()

    logging.info('------start PackageFeatAndAliAndLat------')
    start_package = time.time()
    # first read ali
    alignment_dict = read_alignment(ali_file)
    lat_dict = ReadScp(lat_scp_file)
    feat_list = []
    ali_list = []
    lat_list = []
    with open(feat_scp_file, 'r') as feat_fp:
        for line in feat_fp:
            utt_id, utt_mat = read_next_utt(line)
            logging.debug(utt_id + ' read ok.')
            skipped_length = int(len(utt_mat)/skip_frame) + 1
            if skipped_length > max_input_seq_length:
                logging.info(utt_id + ' length ' + str(skipped_length) + ' > ' + str(max_input_seq_length))
                continue
            try:
                ali_utt = alignment_dict[utt_id]
            except KeyError:
                logging.info('no '+ utt_id + ' align')
                continue
            try:
                lat_scp_line = lat_dict[utt_id]
            except KeyError:
                logging.info('no '+ utt_id + ' lattice')
                continue
            # NOTE: feature, alignment and lattice lengths are not
            # cross-checked here; the original author left the check out
            # for time reasons.
            feat_list.append(line)
            ali_list.append(ali_utt)
            lat_list.append(lat_scp_line)
            if len(feat_list) == nstreams:
                _locked_append(all_package, [feat_list, ali_list, lat_list])
                feat_list = []
                ali_list = []
                lat_list = []
    if len(feat_list) != 0:
        # Pad the final partial batch with copies of its first utterance.
        while len(feat_list) < nstreams:
            feat_list.append(feat_list[0])
            ali_list.append(ali_list[0])
            lat_list.append(lat_list[0])
        _locked_append(all_package, [feat_list, ali_list, lat_list])
    # Tell consumers that packaging is finished.
    _locked_append(package_end, True)
    end_package = time.time()
    logging.info('------PackageFeatAndAliAndLat end. Package time is : %f s, batch number : %d' % (end_package - start_package, len(all_package)))
def LoadOnePackage(self):
    """Fetch the next batch and load its features (lattice-aware variant).

    Waits, polling every 0.05 s, until ``self.package_feat_ali`` holds an
    entry at index ``self.read_offset.value``. The entry is taken and the
    offset advanced while holding ``self.input_lock``. If no entry is
    available and the last element of ``self.package_end`` is ``True``,
    the data is exhausted.

    A package is ``[feat_scp, label]`` or ``[feat_scp, label, lat_scp]``.
    With three elements, ``PackageLattice`` is called on the lattice part.
    Every feature line is decoded with ``read_next_utt``, optionally passed
    through ``self.feature_transform.Propagate`` and then through
    ``skip_frame``. When ``self.do_skip_lab`` is set and
    ``self.skip_frame > 1`` the labels are subsampled with the same offset
    and step.

    Returns:
        ``(feat_mat, label, length, max_frame_num, lat_list)`` where
        ``length`` holds ``len()`` of each loaded feature, ``max_frame_num``
        is the largest of them (0 for an empty batch) and ``lat_list`` is
        ``None`` when the package has no lattice part. Returns five ``None``
        values once the data is exhausted.

    Raises:
        AssertionError: if an entry of ``lat_list[-1]`` differs from the
            corresponding feature length.
    """
    while True:
        self.input_lock.acquire()
        try:
            if self.read_offset.value < len(self.package_feat_ali):
                package = self.package_feat_ali[self.read_offset.value]
                self.read_offset.value += 1
                break
            if self.package_end[-1] is True:
                return None, None, None, None, None
        finally:
            # Always release, including when one of the lookups above
            # raises; a leaked lock would block every other user of it.
            self.input_lock.release()
        # Nothing ready yet and the producer is not done: poll again.
        time.sleep(0.05)

    feat_scp = package[0]
    label = package[1]
    lat_list = None
    if len(package) == 3:
        lat_scp = package[2]
        # indexs_info_list, pdf_values_list, lmweight_values_list, amweight_values_list, statesinfo_list, statenum_list, time_list
        lat_list = PackageLattice(lat_scp, map_pdf_phone = self.ali_to_pdf_phone)

    length = []
    feat_mat = []
    for feat_line in feat_scp:
        utt_id, utt_mat = read_next_utt(feat_line)
        # do feature transform
        if self.feature_transform is not None:
            utt_mat = self.feature_transform.Propagate(utt_mat)
        feat_mat.append(skip_frame(utt_mat, self.skip_frame, self.skip_offset))
        length.append(len(feat_mat[-1]))
    max_frame_num = max(length) if length else 0

    if self.do_skip_lab and self.skip_frame > 1:
        # Subsample labels with the same offset/step as the features.
        label = [lab[self.skip_offset::self.skip_frame] for lab in label]

    # check lattice and label and feat
    if lat_list is not None:
        # The last element of lat_list is compared entry by entry with the
        # feature lengths (presumably per-utterance frame counts - confirm
        # against PackageLattice).
        for i, lat_frames in enumerate(lat_list[-1]):
            assert lat_frames == length[i], \
                'lattice/feature length mismatch at index %d' % i
    return feat_mat, label, length, max_frame_num, lat_list
def LoadOnePackage(self):
    """Fetch the next batch and load its features.

    Waits, polling every 1.0 s, until ``self.package_feat_ali`` holds an
    entry at index ``self.read_offset``. The entry is taken and the offset
    advanced while holding ``self.input_lock``, so the availability check
    and the fetch form one critical section. If no entry is available and
    the last element of ``self.package_end`` is ``True``, the data is
    exhausted.

    A package is ``[feat_scp, label]``. Every feature line is decoded with
    ``read_next_utt``, optionally passed through
    ``self.feature_transform.Propagate`` and then through ``skip_frame``.
    When ``self.do_skip_lab`` is set and ``self.skip_frame > 1`` the labels
    are subsampled with the same offset and step.

    Returns:
        ``(feat_mat, label, length, max_frame_num)`` where ``length`` holds
        ``len()`` of each loaded feature and ``max_frame_num`` is the
        largest of them (0 for an empty batch). Returns four ``None``
        values once the data is exhausted.
    """
    while True:
        self.input_lock.acquire()
        try:
            if self.read_offset < len(self.package_feat_ali):
                # Fetch and advance under the lock so the offset cannot
                # change between the availability check and the read.
                package = self.package_feat_ali[self.read_offset]
                self.read_offset += 1
                break
            if self.package_end[-1] is True:
                return None, None, None, None
        finally:
            # Always release, including when one of the lookups above
            # raises; a leaked lock would block every other user of it.
            self.input_lock.release()
        # Nothing ready yet and the producer is not done: poll again.
        time.sleep(1.0)

    feat_scp = package[0]
    label = package[1]

    length = []
    feat_mat = []
    for feat_line in feat_scp:
        utt_id, utt_mat = read_next_utt(feat_line)
        # do feature transform
        if self.feature_transform is not None:
            utt_mat = self.feature_transform.Propagate(utt_mat)
        feat_mat.append(skip_frame(utt_mat, self.skip_frame, self.skip_offset))
        length.append(len(feat_mat[-1]))
    max_frame_num = max(length) if length else 0

    if self.do_skip_lab and self.skip_frame > 1:
        # Subsample labels with the same offset/step as the features.
        label = [lab[self.skip_offset::self.skip_frame] for lab in label]
    return feat_mat, label, length, max_frame_num