def PackageFeatAndAli(all_package, input_lock, package_end, scp_file, ali_file,
                      nstreams, skip_frame=1, max_input_seq_length=1500,
                      criterion='ce'):
    """Group feature scp lines and their alignments into packages.

    Reads all alignments through ``read_alignment(ali_file)``, then walks
    ``scp_file`` line by line.  Each accepted utterance contributes its raw
    scp line and its alignment to the current package; once a package holds
    ``nstreams`` utterances it is appended to ``all_package`` as
    ``[scp_list, ali_list]``.  A trailing partial package is padded by
    repeating its first utterance until it holds ``nstreams`` entries.

    An utterance is skipped (and logged) when:
      * ``int(len(utt_mat) / skip_frame) + 1`` exceeds
        ``max_input_seq_length``;
      * it has no entry in the alignment dictionary;
      * ``'ce'`` is in ``criterion`` and feature/alignment lengths differ
        (except when the alignment is exactly one longer, in which case its
        last label is dropped);
      * otherwise ``'ctc'`` is in ``criterion`` and
        ``len(utt_mat) < len(ali_utt) * 2 - 1``.

    Args:
        all_package: shared list-like container that receives the packages.
        input_lock: lock guarding ``all_package`` and ``package_end``; it is
            used as a context manager (as the standard ``threading`` and
            ``multiprocessing`` locks allow).
        package_end: shared list-like container; ``True`` is appended once
            all packages have been produced.
        scp_file: path of the feature scp file.
        ali_file: alignment file handed to ``read_alignment``.
        nstreams: number of utterances per package.
        skip_frame: frame-skipping factor, used only for the length limit.
        max_input_seq_length: upper bound on the skipped utterance length.
        criterion: training criterion string selecting the length check.

    Returns:
        ``True`` after the end marker has been appended.
    """
    logging.info('------start PackageFeatAndAli------')
    start_package = time.time()
    # first read ali
    alignment_dict = read_alignment(ali_file)

    scp_list = []
    ali_list = []
    # second read feature scp file and package feature and ali
    # (the ``with`` block guarantees the scp file is closed again)
    with open(scp_file, 'r') as scp_fp:
        for line in scp_fp:
            utt_id, utt_mat = read_next_utt(line)
            logging.debug(utt_id + ' read ok.')

            # overlength
            skipped_length = int(len(utt_mat)/skip_frame) + 1
            if skipped_length > max_input_seq_length:
                logging.info(utt_id + ' length '+ str(skipped_length) + ' > ' + str(max_input_seq_length))
                continue

            try:
                ali_utt = alignment_dict[utt_id]
            except KeyError:
                logging.info('no '+ utt_id + ' align')
                continue

            if 'ce' in criterion:
                if len(utt_mat) != len(ali_utt):
                    if len(ali_utt) - len(utt_mat) == 1:
                        # alignment is one label too long: drop the last one
                        ali_utt = numpy.delete(ali_utt, -1)
                    else:
                        logging.info(utt_id + ' feat and ali isn\'t equal length')
                        continue
            elif 'ctc' in criterion:
                if len(utt_mat) < len(ali_utt) * 2 - 1:
                    logging.info(utt_id + ' feat < ali * 2 - 1 :%d < %d * 2 - 1' % (len(utt_mat), len(ali_utt)))
                    continue

            scp_list.append(line)
            ali_list.append(ali_utt)
            if len(scp_list) == nstreams:
                # the context manager releases the lock even if append fails
                with input_lock:
                    all_package.append([scp_list, ali_list])
                scp_list = []
                ali_list = []

    if scp_list:
        # pad the last, partial package by repeating its first utterance
        missing = nstreams - len(scp_list)
        scp_list.extend([scp_list[0]] * missing)
        ali_list.extend([ali_list[0]] * missing)
        with input_lock:
            all_package.append([scp_list, ali_list])

    # tell consumers that no further packages will arrive
    with input_lock:
        package_end.append(True)

    end_package = time.time()
    logging.info('------PackageFeatAndAli end. Package time is : %f s, batch number : %d' % (end_package - start_package, len(all_package)))
    return True
def PackageFeatAndAliAndLat(all_package, input_lock, package_end, feat_scp_file, ali_file, lat_scp_file, nstreams, 
        skip_frame = 1,  max_input_seq_length = 1500, criterion = 'mmi'):
    """Group feature scp lines, alignments and lattice scp entries into packages.

    Reads all alignments through ``read_alignment(ali_file)`` and the lattice
    index through ``ReadScp(lat_scp_file)``, then walks ``feat_scp_file`` line
    by line.  Each accepted utterance contributes its raw feature scp line,
    its alignment and its lattice entry; once ``nstreams`` utterances are
    collected they are appended to ``all_package`` as
    ``[feat_list, ali_list, lat_list]``.  A trailing partial package is
    padded by repeating its first utterance until it holds ``nstreams``
    entries.

    An utterance is skipped (and logged) when
    ``int(len(utt_mat) / skip_frame) + 1`` exceeds ``max_input_seq_length``,
    or when it has no alignment or no lattice entry.  Feature/alignment
    lengths are not compared here.

    Args:
        all_package: shared list-like container that receives the packages.
        input_lock: lock guarding ``all_package`` and ``package_end``; it is
            used as a context manager (as the standard ``threading`` and
            ``multiprocessing`` locks allow).
        package_end: shared list-like container; ``True`` is appended once
            all packages have been produced.
        feat_scp_file: path of the feature scp file.
        ali_file: alignment file handed to ``read_alignment``.
        lat_scp_file: lattice scp file handed to ``ReadScp``.
        nstreams: number of utterances per package.
        skip_frame: frame-skipping factor, used only for the length limit.
        max_input_seq_length: upper bound on the skipped utterance length.
        criterion: accepted for interface compatibility; not used here.
    """
    logging.info('------start PackageFeatAndAliAndLat------')
    start_package = time.time()
    # first read ali
    alignment_dict = read_alignment(ali_file)
    lat_dict = ReadScp(lat_scp_file)

    feat_list = []
    ali_list = []
    lat_list = []

    with open(feat_scp_file, 'r') as feat_fp:
        for line in feat_fp:
            utt_id, utt_mat = read_next_utt(line)
            logging.debug(utt_id + ' read ok.')

            # overlength
            skipped_length = int(len(utt_mat)/skip_frame) + 1
            if skipped_length > max_input_seq_length:
                logging.info(utt_id + ' length '+ str(skipped_length) + ' > ' + str(max_input_seq_length))
                continue

            try:
                ali_utt = alignment_dict[utt_id]
            except KeyError:
                logging.info('no '+ utt_id + ' align')
                continue
            try:
                lat_scp_line = lat_dict[utt_id]
            except KeyError:
                logging.info('no '+ utt_id + ' lattice')
                continue

            # NOTE: lengths of feature, alignment and lattice are not
            # compared here (skipped for speed, per the original author).
            feat_list.append(line)
            ali_list.append(ali_utt)
            lat_list.append(lat_scp_line)

            if len(feat_list) == nstreams:
                # the context manager releases the lock even if append fails
                with input_lock:
                    all_package.append([feat_list, ali_list, lat_list])
                feat_list = []
                ali_list = []
                lat_list = []

    if feat_list:
        # pad the last, partial package by repeating its first utterance
        missing = nstreams - len(feat_list)
        feat_list.extend([feat_list[0]] * missing)
        ali_list.extend([ali_list[0]] * missing)
        lat_list.extend([lat_list[0]] * missing)
        with input_lock:
            all_package.append([feat_list, ali_list, lat_list])

    # tell consumers that no further packages will arrive
    with input_lock:
        package_end.append(True)

    end_package = time.time()
    logging.info('------PackageFeatAndAliAndLat end. Package time is : %f s, batch number : %d' % (end_package - start_package, len(all_package)))
    def LoadOnePackage(self):
        """Claim the next package and load its features, labels and lattices.

        Polls every 0.05 s until ``self.package_feat_ali`` holds an entry at
        index ``self.read_offset.value`` or the last element of
        ``self.package_end`` is ``True``.  A claimed package is
        ``[feat_scp, label]`` or ``[feat_scp, label, lat_scp]``.

        Every feature line is read with ``read_next_utt``, optionally passed
        through ``self.feature_transform.Propagate`` and then through
        ``skip_frame``.  Labels are sub-sampled with the same offset/step
        when ``self.do_skip_lab`` is set and ``self.skip_frame > 1``.

        Returns:
            ``(feat_mat, label, length, max_frame_num, lat_list)`` where
            ``feat_mat`` is the list of processed utterances, ``length``
            their lengths, ``max_frame_num`` the largest length and
            ``lat_list`` the result of ``PackageLattice`` (``None`` when the
            package carries no lattices).  Returns five ``None`` values once
            every package is consumed and packaging has ended.

        Raises:
            AssertionError: if a lattice time differs from the length of the
                corresponding processed utterance.
        """
        while True:
            # Lookup and offset bump happen under the lock so that each
            # package is claimed exactly once; the context manager releases
            # the lock on every exit path.
            with self.input_lock:
                if self.read_offset.value < len(self.package_feat_ali):
                    package = self.package_feat_ali[self.read_offset.value]
                    self.read_offset.value += 1
                    break
                if self.package_end[-1] is True:
                    return None, None, None, None, None
            # nothing available yet: wait without holding the lock
            time.sleep(0.05)

        feat_scp = package[0]
        label = package[1]
        lat_list = None
        if len(package) == 3:
            # Per the original note, PackageLattice returns:
            # indexs_info_list, pdf_values_list, lmweight_values_list,
            # amweight_values_list, statesinfo_list, statenum_list, time_list
            lat_list = PackageLattice(package[2], map_pdf_phone = self.ali_to_pdf_phone)

        max_frame_num = 0
        length = []
        feat_mat = []

        for feat_line in feat_scp:
            utt_id, utt_mat = read_next_utt(feat_line)
            # do feature transform (optional)
            if self.feature_transform is not None:
                utt_mat = self.feature_transform.Propagate(utt_mat)
            # Frame skipping must run for every utterance, with or without a
            # transform; otherwise feat_mat stays empty and the length
            # lookup below fails.
            feat_mat.append(
                    skip_frame(utt_mat, self.skip_frame, self.skip_offset))
            length.append(len(feat_mat[-1]))
            if max_frame_num < length[-1]:
                max_frame_num = length[-1]

        if self.do_skip_lab and self.skip_frame > 1:
            label = [lab[self.skip_offset : len(lab) : self.skip_frame]
                     for lab in label]

        # check lattice and label and feat
        if lat_list is not None:
            lat_times = lat_list[-1]
            for i, lat_time in enumerate(lat_times):
                assert lat_time == length[i], \
                    'lattice time %s != feature length %s (stream %d)' % (lat_time, length[i], i)

        return feat_mat, label, length, max_frame_num, lat_list
示例#4
0
    def LoadOnePackage(self):
        """Claim the next package and load its features and labels.

        Polls every 1.0 s until ``self.package_feat_ali`` holds an entry at
        index ``self.read_offset`` or the last element of
        ``self.package_end`` is ``True``.  A claimed package is indexed as
        ``package[0]`` (feature scp lines) and ``package[1]`` (labels).

        Every feature line is read with ``read_next_utt``, optionally passed
        through ``self.feature_transform.Propagate`` and then through
        ``skip_frame``.  Labels are sub-sampled with the same offset/step
        when ``self.do_skip_lab`` is set and ``self.skip_frame > 1``.

        Returns:
            ``(feat_mat, label, length, max_frame_num)`` where ``feat_mat``
            is the list of processed utterances, ``length`` their lengths
            and ``max_frame_num`` the largest length.  Returns four ``None``
            values once every package is consumed and packaging has ended.
        """
        while True:
            # Lookup and offset bump happen under the lock so that each
            # package is claimed exactly once; the context manager releases
            # the lock on every exit path.
            with self.input_lock:
                if self.read_offset < len(self.package_feat_ali):
                    package = self.package_feat_ali[self.read_offset]
                    self.read_offset += 1
                    break
                if self.package_end[-1] is True:
                    return None, None, None, None
            # nothing available yet: wait without holding the lock
            time.sleep(1.0)

        feat_scp = package[0]
        label = package[1]
        max_frame_num = 0
        length = []
        feat_mat = []

        for feat_line in feat_scp:
            utt_id, utt_mat = read_next_utt(feat_line)
            # do feature transform (optional)
            if self.feature_transform is not None:
                utt_mat = self.feature_transform.Propagate(utt_mat)
            # Frame skipping must run for every utterance, with or without a
            # transform; otherwise feat_mat stays empty and the length
            # lookup below fails.
            feat_mat.append(
                skip_frame(utt_mat, self.skip_frame, self.skip_offset))
            length.append(len(feat_mat[-1]))
            if max_frame_num < length[-1]:
                max_frame_num = length[-1]

        if self.do_skip_lab and self.skip_frame > 1:
            label = [lab[self.skip_offset:len(lab):self.skip_frame]
                     for lab in label]

        return feat_mat, label, length, max_frame_num