def run(args):
    """Build the 2x3 transform matrix from priors and write it to stdout.

    The three priors (silence, speech, garbage, in that column order) are
    normalized to sum to one.  Every matrix entry is a weight taken from
    ``args`` divided by the normalized prior of its column:

    * row 0: ``sil_scale``, ``speech_in_sil_weight``,
      ``garbage_in_sil_weight``
    * row 1: ``sil_in_speech_weight``, ``1.0``,
      ``garbage_in_speech_weight``

    Args:
        args: parsed options.  ``args.priors`` is handed to
            ``common_lib.read_matrix_ascii``; when it is None uniform
            priors are used.

    Raises:
        RuntimeError: if the priors matrix is empty or its first row does
            not hold exactly three values.
    """
    priors = [[1.0, 1.0, 1.0]]
    if args.priors is not None:
        priors = common_lib.read_matrix_ascii(args.priors)

    # An empty matrix must be rejected here as well; the previous
    # "non-empty and wrong width" test let it through to priors[0].
    if len(priors) == 0 or len(priors[0]) != 3:
        raise RuntimeError("Invalid dimension for priors {0}"
                           "".format(priors))

    priors_sum = sum(priors[0])
    sil_prior = old_div(priors[0][0], priors_sum)
    speech_prior = old_div(priors[0][1], priors_sum)
    garbage_prior = old_div(priors[0][2], priors_sum)

    transform_mat = [
        [
            old_div(args.sil_scale, sil_prior),
            old_div(args.speech_in_sil_weight, speech_prior),
            old_div(args.garbage_in_sil_weight, garbage_prior),
        ],
        [
            old_div(args.sil_in_speech_weight, sil_prior),
            old_div(1.0, speech_prior),
            old_div(args.garbage_in_speech_weight, garbage_prior),
        ],
    ]
    common_lib.write_matrix_ascii(sys.stdout, transform_mat)
def run(args):
    """Write the 2x3 prior-scaled weight matrix to stdout.

    Reads a single row of three priors (silence, speech, garbage) from
    ``args.priors`` -- or assumes uniform priors when it is None --
    normalizes them to sum to one and divides each weight by the
    normalized prior of its column.

    Row 0 holds ``sil_scale``, ``speech_in_sil_weight`` and
    ``garbage_in_sil_weight``; row 1 holds ``sil_in_speech_weight``,
    ``1.0`` and ``garbage_in_speech_weight``.

    Raises:
        RuntimeError: if the priors matrix is empty or its first row does
            not hold exactly three values.
    """
    priors = [[1.0, 1.0, 1.0]]
    if args.priors is not None:
        priors = common_lib.read_matrix_ascii(args.priors)

    # Reject an empty matrix too; otherwise priors[0] below fails with an
    # unhelpful IndexError.
    if len(priors) == 0 or len(priors[0]) != 3:
        raise RuntimeError("Invalid dimension for priors {0}"
                           "".format(priors))

    priors_sum = sum(priors[0])
    sil_prior = priors[0][0] / priors_sum
    speech_prior = priors[0][1] / priors_sum
    garbage_prior = priors[0][2] / priors_sum

    sil_row = [
        args.sil_scale / sil_prior,
        args.speech_in_sil_weight / speech_prior,
        args.garbage_in_sil_weight / garbage_prior,
    ]
    speech_row = [
        args.sil_in_speech_weight / sil_prior,
        1.0 / speech_prior,
        args.garbage_in_speech_weight / garbage_prior,
    ]
    common_lib.write_matrix_ascii(sys.stdout, [sil_row, speech_row])
def run(args):
    """Merge column-pasted target matrices into one matrix per utterance.

    Every input matrix is treated as ``num_sources`` side-by-side blocks
    of ``args.dim`` columns.  The blocks are summed, each scaled by the
    matching entry of ``args.weights`` (or by 1.0 when no weights are
    given).  With ``args.remove_mismatch_frames`` set, rows for which
    ``should_remove_frame`` returns true are left as all zeros.

    Raises:
        RuntimeError: if a matrix's column count is not a multiple of
            ``args.dim``, or if no matrix was merged at all.
    """
    # args.pasted_targets is handed to smart_open(), so it may be a plain
    # path; only use ".name" when the object actually has one.
    source_name = getattr(args.pasted_targets, "name", args.pasted_targets)

    num_done = 0
    with common_lib.smart_open(
            args.pasted_targets) as targets_reader, common_lib.smart_open(
                args.out_targets, "w") as targets_writer:
        for key, mat in common_lib.read_mat_ark(targets_reader):
            mat = np.matrix(mat)
            if mat.shape[1] % args.dim != 0:
                raise RuntimeError(
                    "For utterance {utt} in {f}, num-columns {nc} "
                    "is not a multiple of dim {dim}"
                    "".format(utt=key, f=source_name,
                              nc=mat.shape[1], dim=args.dim))
            num_sources = mat.shape[1] // args.dim

            out_mat = np.matrix(np.zeros([mat.shape[0], args.dim]))

            if args.remove_mismatch_frames:
                for n in range(mat.shape[0]):
                    if should_remove_frame(mat[n, :].getA()[0], args.dim):
                        out_mat[n, :] = np.zeros([1, args.dim])
                        continue
                    for i in range(num_sources):
                        weight = (1.0 if args.weights is None
                                  else args.weights[i])
                        out_mat[n, :] += (
                            mat[n, (i * args.dim):((i + 1) * args.dim)]
                            * weight)
            else:
                # Just interpolate the targets
                for i in range(num_sources):
                    weight = (1.0 if args.weights is None
                              else args.weights[i])
                    out_mat += (
                        mat[:, (i * args.dim):((i + 1) * args.dim)]
                        * weight)

            common_lib.write_matrix_ascii(targets_writer, out_mat.tolist(),
                                          key=key)
            num_done += 1

    logger.info("Merged {num_done} target matrices"
                "".format(num_done=num_done))

    if num_done == 0:
        raise RuntimeError("No target matrices were merged")
def run(args):
    """Write a 3x3 diagonal posterior-to-likelihood transform to stdout.

    The priors read from ``args.priors`` are interpreted as:

    - priors[0] -- prior probability of non-speech
    - priors[1] -- prior probability of speech
    - priors[2] -- prior probability of garbage; ignored

    Raises:
        RuntimeError: if the priors matrix is empty or its first row does
            not hold exactly three values.
    """
    priors = common_lib.read_matrix_ascii(args.priors)
    # An empty matrix is invalid as well; the old "non-empty and wrong
    # width" test accepted it.
    if len(priors) == 0 or len(priors[0]) != 3:
        raise RuntimeError(f'Invalid dimension for priors {priors}')
    priors = np.squeeze(np.array(priors, dtype=np.float64))

    # Create matrix that converts posteriors to likelihoods by dividing by
    # normalized priors.
    pmass = priors[0] + priors[1]  # Total mass devoted to speech/non-speech.
    priors /= pmass
    transform_mat = np.diag(1 / priors)
    transform_mat[2, 2] = 0.0  # Ignore garbage entirely
    transform_mat[1, 1] *= args.speech_likelihood_weight

    common_lib.write_matrix_ascii(sys.stdout, transform_mat)
def run(args):
    """Sum the pasted per-source target blocks of every utterance.

    Each matrix read from ``args.pasted_targets`` is split into
    ``mat.shape[1] // args.dim`` column blocks of width ``args.dim``; the
    blocks are added together, scaled by ``args.weights[i]`` when weights
    are supplied.  When ``args.remove_mismatch_frames`` is set, frames
    flagged by ``should_remove_frame`` stay all-zero.

    Raises:
        RuntimeError: if the number of columns is not a multiple of
            ``args.dim``, or if nothing was merged.
    """
    # The value is passed to smart_open(), so it can be a path string that
    # has no ".name" attribute; fall back to the value itself.
    input_name = getattr(args.pasted_targets, "name", args.pasted_targets)

    def source_weight(i):
        # Scale factor of the i-th source block.
        return 1.0 if args.weights is None else args.weights[i]

    num_done = 0
    with common_lib.smart_open(args.pasted_targets) as targets_reader, \
            common_lib.smart_open(args.out_targets, 'w') as targets_writer:
        for key, mat in common_lib.read_mat_ark(targets_reader):
            mat = np.matrix(mat)
            if mat.shape[1] % args.dim != 0:
                raise RuntimeError(
                    "For utterance {utt} in {f}, num-columns {nc} "
                    "is not a multiple of dim {dim}"
                    "".format(utt=key, f=input_name,
                              nc=mat.shape[1], dim=args.dim))
            num_sources = mat.shape[1] // args.dim

            out_mat = np.matrix(np.zeros([mat.shape[0], args.dim]))

            if args.remove_mismatch_frames:
                for n in range(mat.shape[0]):
                    if should_remove_frame(mat[n, :].getA()[0], args.dim):
                        out_mat[n, :] = np.zeros([1, args.dim])
                    else:
                        for i in range(num_sources):
                            out_mat[n, :] += (
                                mat[n, (i * args.dim):((i + 1) * args.dim)]
                                * source_weight(i))
            else:
                # Just interpolate the targets
                for i in range(num_sources):
                    out_mat += (
                        mat[:, (i * args.dim):((i + 1) * args.dim)]
                        * source_weight(i))

            common_lib.write_matrix_ascii(targets_writer, out_mat.tolist(),
                                          key=key)
            num_done += 1

    logger.info("Merged {num_done} target matrices"
                "".format(num_done=num_done))

    if num_done == 0:
        raise RuntimeError("No target matrices were merged")
def run(args):
    """Compute the prior-normalized 2x3 weight matrix and print it.

    Priors (silence, speech, garbage) come from ``args.priors`` or default
    to uniform.  After normalizing them to sum to one, each weight in
    ``args`` is divided by the prior of its column.  The first output row
    uses the ``*_in_sil_weight`` / ``sil_scale`` options, the second the
    ``*_in_speech_weight`` options with a fixed 1.0 for speech.

    Raises:
        RuntimeError: if the priors matrix is empty or its first row does
            not hold exactly three values.
    """
    priors = [[1.0, 1.0, 1.0]]
    if args.priors is not None:
        priors = common_lib.read_matrix_ascii(args.priors)

    # "not priors" also catches the empty matrix that the original
    # and-condition silently accepted.
    if not priors or len(priors[0]) != 3:
        raise RuntimeError("Invalid dimension for priors {0}"
                           "".format(priors))

    priors_sum = sum(priors[0])
    sil_prior = priors[0][0] / priors_sum
    speech_prior = priors[0][1] / priors_sum
    garbage_prior = priors[0][2] / priors_sum

    transform_mat = [[args.sil_scale / sil_prior,
                      args.speech_in_sil_weight / speech_prior,
                      args.garbage_in_sil_weight / garbage_prior],
                     [args.sil_in_speech_weight / sil_prior,
                      1.0 / speech_prior,
                      args.garbage_in_speech_weight / garbage_prior]]

    common_lib.write_matrix_ascii(sys.stdout, transform_mat)
def run(args):
    """Sub-sample every target matrix by averaging windows of frames.

    For each matrix read from ``args.targets_in_ark`` one output row is
    produced per window of ``args.subsampling_factor`` input rows (the
    window is clipped to the matrix bounds); the row is the mean of the
    window.  Results are written to ``args.targets_out_ark`` and both
    streams are closed at the end.
    """
    factor = args.subsampling_factor
    num_utts = 0

    for key, mat in common_lib.read_mat_ark(args.targets_in_ark):
        mat = np.matrix(mat)
        if factor > 0:
            total_rows = mat.shape[0]
            num_indexes = old_div(total_rows + factor - 1, factor)
            out_mat = np.zeros([num_indexes, mat.shape[1]])

            centers = range(int(old_div(factor, 2.0)), total_rows, factor)
            for i, k in enumerate(centers):
                half_width = old_div(float(factor), 2.0)
                # Window around centre k, clipped to [0, total_rows]
                st = max(int(k - half_width), 0)
                end = min(int(k + half_width), total_rows)

                try:
                    window_sum = np.sum(mat[st:end, :], axis=0)
                    out_mat[i, :] = old_div(window_sum, float(end - st))
                except IndexError:
                    logger.error("mat.shape = {0}, st = {1}, end = {2}"
                                 "".format(mat.shape, st, end))
                    raise
                assert i == old_div(k, factor)

        common_lib.write_matrix_ascii(args.targets_out_ark, out_mat, key=key)
        num_utts += 1

    args.targets_in_ark.close()
    args.targets_out_ark.close()

    logger.info("Sub-sampled {num_utts} target matrices"
                "".format(num_utts=num_utts))
def run(args):
    """Sub-sample every target matrix by averaging windows of frames.

    For each matrix read from ``args.targets_in_ark``, one output row is
    produced per ``args.subsampling_factor`` input rows; it is the mean of
    the (bounds-clipped) window of that many rows around the window
    centre.  Output goes to ``args.targets_out_ark``; both streams are
    closed when done.
    """
    num_utts = 0
    for key, mat in common_lib.read_mat_ark(args.targets_in_ark):
        mat = np.matrix(mat)
        # NOTE(review): when subsampling_factor <= 0 "out_mat" is never
        # assigned and the write below fails -- confirm intended handling.
        if args.subsampling_factor > 0:
            # Floor division: a row count must be an int.  Plain "/" yields
            # a float on Python 3, which np.zeros() rejects.
            num_indexes = ((mat.shape[0] + args.subsampling_factor - 1)
                           // args.subsampling_factor)

            out_mat = np.zeros([num_indexes, mat.shape[1]])
            i = 0
            for k in range(int(args.subsampling_factor / 2.0),
                           mat.shape[0], args.subsampling_factor):
                st = int(k - float(args.subsampling_factor) / 2.0)
                end = int(k + float(args.subsampling_factor) / 2.0)

                if st < 0:
                    st = 0
                if end > mat.shape[0]:
                    end = mat.shape[0]

                try:
                    out_mat[i, :] = (np.sum(mat[st:end, :], axis=0)
                                     / float(end - st))
                except IndexError:
                    logger.error("mat.shape = {0}, st = {1}, end = {2}"
                                 "".format(mat.shape, st, end))
                    raise
                # Integer comparison; "/" would compare against a float
                # quotient and fail whenever k is not a multiple.
                assert i == k // args.subsampling_factor
                i += 1

        common_lib.write_matrix_ascii(args.targets_out_ark, out_mat, key=key)
        num_utts += 1

    args.targets_in_ark.close()
    args.targets_out_ark.close()

    logger.info("Sub-sampled {num_utts} target matrices"
                "".format(num_utts=num_utts))
def run(args):
    """Convert per-arc phone posteriors into per-frame 3-column targets.

    Each line of ``args.arc_info`` is parsed as
    ``<utt> <start-frame> <num-frames> <posterior> <phone>``.  For every
    frame the arc covers, the posterior is added to column 0 if the phone
    is listed in ``args.silence_phones``, otherwise to column 2 if the arc
    is longer than ``args.max_phone_length`` or the phone is listed in
    ``args.garbage_phones``, and to column 1 in all other cases.  One
    matrix per utterance is written to ``args.targets_file``.

    Raises:
        RuntimeError: if a phone list is empty, a phone appears in both
            lists, no utterance was written, or too many utterances
            failed.
    """
    silence_phones = {}
    with common_lib.smart_open(args.silence_phones) as silence_phones_fh:
        for line in silence_phones_fh:
            silence_phones[line.strip().split()[0]] = 1

    if len(silence_phones) == 0:
        raise RuntimeError("Could not find any phones in {silence}"
                           "".format(silence=args.silence_phones))

    garbage_phones = {}
    with common_lib.smart_open(args.garbage_phones) as garbage_phones_fh:
        for line in garbage_phones_fh:
            word = line.strip().split()[0]
            if word in silence_phones:
                raise RuntimeError("Word '{word}' is in both {silence} "
                                   "and {garbage}".format(
                                       word=word,
                                       silence=args.silence_phones,
                                       garbage=args.garbage_phones))
            garbage_phones[word] = 1

    if len(garbage_phones) == 0:
        raise RuntimeError("Could not find any phones in {garbage}"
                           "".format(garbage=args.garbage_phones))

    num_utts = 0
    num_err = 0
    targets = []
    prev_utt = ""

    with common_lib.smart_open(args.arc_info) as arc_info_reader, \
            common_lib.smart_open(args.targets_file, 'w') as targets_writer:
        for line in arc_info_reader:
            try:
                parts = line.strip().split()
                utt = parts[0]

                if utt != prev_utt:
                    # A new utterance starts: flush the previous one.
                    if prev_utt != "":
                        if len(targets) > 0:
                            num_utts += 1
                            common_lib.write_matrix_ascii(
                                targets_writer, targets, key=prev_utt)
                        else:
                            num_err += 1
                    prev_utt = utt
                    targets = []

                start_frame = int(parts[1])
                num_frames = int(parts[2])
                post = float(parts[3])
                phone = parts[4]

                if start_frame + num_frames > len(targets):
                    # Grow the matrix so that it covers this arc.
                    for t in range(len(targets), start_frame + num_frames):
                        targets.append([0, 0, 0])
                    assert start_frame + num_frames == len(targets)

                for t in range(start_frame, start_frame + num_frames):
                    if phone in silence_phones:
                        targets[t][0] += post
                    elif num_frames > args.max_phone_length:
                        targets[t][2] += post
                    elif phone in garbage_phones:
                        targets[t][2] += post
                    else:
                        targets[t][1] += post
            except Exception:
                logger.error("Failed to process line {line} in {f}"
                             "".format(line=line.strip(), f=args.arc_info))
                logger.error("len(targets) = {l}".format(l=len(targets)))
                raise

        # Flush the last utterance through the handle that is already open.
        # Passing the path (args.targets_file) here would bypass that
        # handle and write to the destination a second time.
        if prev_utt != "":
            if len(targets) > 0:
                num_utts += 1
                common_lib.write_matrix_ascii(targets_writer, targets,
                                              key=prev_utt)
            else:
                num_err += 1

    logger.info("Wrote {num_utts} targets; failed with {num_err}"
                "".format(num_utts=num_utts, num_err=num_err))
    if num_utts == 0 or num_err >= num_utts / 2:
        raise RuntimeError(
            "Wrote {num_utts} targets; failed with {num_err}"
            "".format(num_utts=num_utts, num_err=num_err))
def run(args):
    """Create per-recording silence/speech target matrices from an RTTM.

    For every recording in ``args.reco2num_frames`` a ``num_frames x 2``
    matrix is built.  Frames default to the silence vector; frames covered
    by an RTTM segment of that recording get the speech vector.  With
    label smoothing ``s`` the target class gets ``1 - s`` and the other
    class ``s / 2``.  Matrices are written to ``args.out_targets_ark`` in
    sorted recording order.

    Raises:
        ValueError: if a line of the reco2num_frames file does not have
            exactly two fields.
    """
    # Get all reco to num_frames, which will be used to decide the number of
    # rows of matrix
    reco2num_frames = {}
    with common_lib.smart_open(args.reco2num_frames) as f:
        for line in f:
            fields = line.strip().split()
            if len(fields) != 2:
                raise ValueError("Could not parse line {0}".format(line))
            reco2num_frames[fields[0]] = int(fields[1])

    # We read all segments and store as a list of objects
    segments = []
    with common_lib.smart_open(args.rttm) as f:
        for line in f:
            segment_fields = line.strip().split()
            start = float(segment_fields[3])
            duration = float(segment_fields[4])
            end = start + duration
            segments.append(
                Segment(reco=segment_fields[1],
                        spk=segment_fields[7],
                        start=start,
                        dur=duration,
                        end=end))

    # groupby() only merges adjacent items, hence the sort on the same key.
    keyfunc = lambda x: x.reco
    segments_iterable = sorted(segments, key=keyfunc)
    reco2segs = defaultdict(list, {
        reco: list(g)
        for reco, g in itertools.groupby(segments_iterable, keyfunc)
    })

    # The label vectors do not depend on the recording.  The builtin float
    # replaces the np.float alias, which newer NumPy releases removed.
    target_val = 1 - args.label_smoothing
    other_val = args.label_smoothing / 2
    silence_vec = np.array([target_val, other_val], dtype=float)
    speech_vec = np.array([other_val, target_val], dtype=float)

    # Now, for each reco, create a matrix of shape num_frames x 2 and fill
    # in using the segments information for that reco
    reco2targets = {}
    for reco_id in reco2num_frames:
        segs = sorted(reco2segs[reco_id], key=lambda x: x.start)

        # The default target (if not speech) is silence
        targets_mat = np.tile(silence_vec, (reco2num_frames[reco_id], 1))

        # Now iterate over all segments of the recording and assign targets
        for seg in segs:
            start_frame = int(seg.start / args.frame_shift)
            end_frame = min(int(seg.end / args.frame_shift),
                            reco2num_frames[reco_id])
            num_frames = end_frame - start_frame
            if num_frames <= 0:
                continue
            targets_mat[start_frame:end_frame] = np.tile(
                speech_vec, (num_frames, 1))

        reco2targets[reco_id] = targets_mat

    with common_lib.smart_open(args.out_targets_ark, 'w') as f:
        for reco_id in sorted(reco2targets.keys()):
            common_lib.write_matrix_ascii(f,
                                          reco2targets[reco_id].tolist(),
                                          key=reco_id)
def run(args):
    """Stitch per-segment target matrices into per-recording matrices.

    For each recording a matrix with ``reco2num_frames[reco]`` rows is
    initialised from the default targets (``args.default_targets`` or
    zeros).  Each utterance's targets are read by running ``copy-feats``
    and copied into the rows spanned by its segment.  Where a segment
    overlaps rows that were already filled, old and new targets are
    cross-faded with a linear ramp.  Results are written to
    ``args.out_targets_ark``.

    Raises:
        RuntimeError: if ``copy-feats`` fails, no utterance or recording
            was processed, or too many utterances failed.
        ValueError: if a segment has a non-positive length in frames.
    """
    reco2utt = read_reco2utt_file(args.reco2utt)
    reco2num_frames = read_reco2num_frames_file(args.reco2num_frames)
    segments = read_segments_file(args.segments, reco2utt)
    targets = read_targets_scp(args.targets_scp, segments)

    if args.default_targets is not None:
        # Read the vector of default targets for out-of-segment regions
        default_targets = np.matrix(
            common_lib.read_matrix_ascii(args.default_targets))
    else:
        default_targets = np.zeros([1, 3])
    assert np.shape(default_targets)[0] == 1 and np.shape(
        default_targets)[1] == 3

    num_utt_err = 0
    num_utt = 0
    num_reco = 0

    with common_lib.smart_open(args.out_targets_ark, "w") as fh:
        for reco, utts in reco2utt.items():
            # Read a recording and the list of its utterances from the
            # reco2utt dictionary
            reco_mat = np.repeat(default_targets, reco2num_frames[reco],
                                 axis=0)
            # Sort on start time.  Utterances missing from "segments" go
            # last instead of raising KeyError, so the check inside the
            # loop can count them as errors.
            utts.sort(key=lambda x: (segments[x][1] if x in segments
                                     else float("inf")))

            end_frame_accounted = 0

            for utt in utts:
                if utt not in segments or utt not in targets:
                    num_utt_err += 1
                    continue
                segment = segments[utt]

                # Read the targets corresponding to the segments
                cmd = ("copy-feats --binary=false {mat_fn} -"
                       "".format(mat_fn=targets[utt]))
                p = subprocess.Popen(cmd, shell=True,
                                     stdout=subprocess.PIPE,
                                     stderr=subprocess.PIPE)

                try:
                    mat = np.matrix(common_lib.read_matrix_ascii(p.stdout),
                                    dtype="float32")
                except Exception:
                    logger.error("Command '{cmd}' failed".format(cmd=cmd))
                    raise
                finally:
                    [stdout, stderr] = p.communicate()
                    if p.returncode is not None and p.returncode != 0:
                        raise RuntimeError(
                            'Command "{cmd}" failed with status {status}; '
                            "stderr = {stderr}".format(
                                cmd=cmd, status=-p.returncode,
                                stderr=stderr))

                start_frame = int(segment[1] / args.frame_shift + 0.5)
                end_frame = int(segment[2] / args.frame_shift + 0.5)
                num_frames = end_frame - start_frame

                if num_frames <= 0:
                    raise ValueError("Invalid line in segments file {0}"
                                     "".format(segment))

                if abs(mat.shape[0] - num_frames) > args.length_tolerance:
                    logger.warning("For utterance {utt}, mismatch in segment "
                                   "length and targets matrix size; "
                                   "{s_len} vs {t_len}".format(
                                       utt=utt, s_len=num_frames,
                                       t_len=mat.shape[0]))
                    num_utt_err += 1
                    continue

                # Fix end_frame and num_frames if the segment goes beyond
                # the length of the recording.
                if end_frame > reco2num_frames[reco]:
                    end_frame = reco2num_frames[reco]
                    num_frames = end_frame - start_frame

                # Fix "num_frames" and "end_frame" if "num_frames" is lower
                # than the size of the targets matrix "mat"
                num_frames = min(num_frames, mat.shape[0])
                end_frame = start_frame + num_frames

                if num_frames <= 0:
                    logger.warning("For utterance {utt}, start-frame {start} "
                                   "is outside the recording"
                                   "".format(utt=utt, start=start_frame))
                    num_utt_err += 1
                    continue

                if end_frame < end_frame_accounted:
                    logger.warning("For utterance {utt}, end-frame {end} "
                                   "is before the end of a previous segment. "
                                   "i.e. this segment is completely within "
                                   "another segment. Ignoring this segment."
                                   "".format(utt=utt, end=end_frame))
                    num_utt_err += 1
                    continue

                if start_frame < end_frame_accounted:
                    # Segment overlaps with a previous utterance
                    # Combine targets using a weighted interpolation using a
                    # triangular window with a weight of 1 at the start/end
                    # of overlap and 0 at the end/start of the segment
                    overlap = end_frame_accounted - start_frame
                    for n in range(0, overlap):
                        w = float(n) / float(overlap)
                        reco_mat[n + start_frame, :] = (
                            reco_mat[n + start_frame, :] * (1.0 - w)
                            + mat[n, :] * w)

                    if end_frame > end_frame_accounted:
                        reco_mat[end_frame_accounted:end_frame, :] = mat[
                            (end_frame_accounted - start_frame):
                            (end_frame - start_frame), :]
                else:
                    # No overlap with the previous utterances.
                    # So just add it to the output.
                    reco_mat[start_frame:end_frame, :] = mat[0:num_frames, :]

                logger.debug(
                    "reco_mat shape = %s, mat shape = %s, "
                    "start_frame = %d, end_frame = %d",
                    reco_mat.shape, mat.shape, start_frame, end_frame,
                )

                end_frame_accounted = end_frame
                num_utt += 1

            if reco_mat.shape[0] > 0:
                common_lib.write_matrix_ascii(fh, reco_mat, key=reco)
                num_reco += 1

    logger.info("Merged {num_utt} segment targets from {num_reco} recordings; "
                "failed with {num_utt_err} utterances"
                "".format(num_utt=num_utt, num_reco=num_reco,
                          num_utt_err=num_utt_err))

    if num_utt == 0 or num_utt_err > num_utt // 2 or num_reco == 0:
        raise RuntimeError(
            "Merged {num_utt} segment targets from {num_reco} recordings; "
            "failed with {num_utt_err} utterances"
            "".format(num_utt=num_utt, num_reco=num_reco,
                      num_utt_err=num_utt_err))
def run(args):
    """Accumulate arc posteriors into per-frame [sil, speech, garbage] rows.

    Lines of ``args.arc_info`` are parsed as
    ``<utt> <start-frame> <num-frames> <posterior> <phone>``.  The
    posterior of each arc is added, for every frame it spans, to column 0
    (phone in ``args.silence_phones``), column 2 (arc longer than
    ``args.max_phone_length`` or phone in ``args.garbage_phones``) or
    column 1 (anything else).  One matrix per utterance is written to
    ``args.targets_file``.

    Raises:
        RuntimeError: if a phone list is empty, a phone appears in both
            lists, no utterance was written, or too many utterances
            failed.
    """
    silence_phones = {}
    with common_lib.smart_open(args.silence_phones) as silence_phones_fh:
        for line in silence_phones_fh:
            silence_phones[line.strip().split()[0]] = 1

    if len(silence_phones) == 0:
        raise RuntimeError("Could not find any phones in {silence}"
                           "".format(silence=args.silence_phones))

    garbage_phones = {}
    with common_lib.smart_open(args.garbage_phones) as garbage_phones_fh:
        for line in garbage_phones_fh:
            word = line.strip().split()[0]
            if word in silence_phones:
                raise RuntimeError("Word '{word}' is in both {silence} "
                                   "and {garbage}".format(
                                       word=word,
                                       silence=args.silence_phones,
                                       garbage=args.garbage_phones))
            garbage_phones[word] = 1

    if len(garbage_phones) == 0:
        raise RuntimeError("Could not find any phones in {garbage}"
                           "".format(garbage=args.garbage_phones))

    num_utts = 0
    num_err = 0
    targets = []
    prev_utt = ""

    def flush(writer):
        # Write the matrix collected for prev_utt; return (written, failed).
        if prev_utt == "":
            return 0, 0
        if len(targets) > 0:
            common_lib.write_matrix_ascii(writer, targets, key=prev_utt)
            return 1, 0
        return 0, 1

    with common_lib.smart_open(args.arc_info) as arc_info_reader, \
            common_lib.smart_open(args.targets_file, 'w') as targets_writer:
        for line in arc_info_reader:
            try:
                parts = line.strip().split()
                utt = parts[0]

                if utt != prev_utt:
                    written, failed = flush(targets_writer)
                    num_utts += written
                    num_err += failed
                    prev_utt = utt
                    targets = []

                start_frame = int(parts[1])
                num_frames = int(parts[2])
                post = float(parts[3])
                phone = parts[4]

                if start_frame + num_frames > len(targets):
                    # Extend the matrix up to the last frame of this arc.
                    for t in range(len(targets), start_frame + num_frames):
                        targets.append([0, 0, 0])
                    assert start_frame + num_frames == len(targets)

                for t in range(start_frame, start_frame + num_frames):
                    if phone in silence_phones:
                        targets[t][0] += post
                    elif num_frames > args.max_phone_length:
                        targets[t][2] += post
                    elif phone in garbage_phones:
                        targets[t][2] += post
                    else:
                        targets[t][1] += post
            except Exception:
                logger.error("Failed to process line {line} in {f}"
                             "".format(line=line.strip(), f=args.arc_info))
                logger.error("len(targets) = {l}".format(l=len(targets)))
                raise

        # The last utterance has to go through the open writer as well;
        # the original passed the path args.targets_file here instead.
        written, failed = flush(targets_writer)
        num_utts += written
        num_err += failed

    logger.info("Wrote {num_utts} targets; failed with {num_err}"
                "".format(num_utts=num_utts, num_err=num_err))
    if num_utts == 0 or num_err >= num_utts // 2:
        raise RuntimeError(
            "Wrote {num_utts} targets; failed with {num_err}"
            "".format(num_utts=num_utts, num_err=num_err))
num_frames = end_frame - start_frame if (num_frames <= 0): continue targets_mat[start_frame:end_frame] = np.tile(speech_vec, (num_frames,1)) num_targets[1] += end_frame - start_frame <<<<<<< HEAD num_targets[0] = reco2num_frames[reco_id] - sum(num_targets) # print ("{}: {}".format(reco_id, num_targets)) ======= num_targets[0] = reco2num_frames[reco_id] - sum(num_targets) >>>>>>> upstream/master reco2targets[reco_id] = targets_mat with common_lib.smart_open(args.out_targets_ark, 'w') as f: for reco_id in sorted(reco2targets.keys()): common_lib.write_matrix_ascii(f, reco2targets[reco_id].tolist(), key=reco_id) def main(): args = get_args() try: run(args) except Exception: raise if __name__ == "__main__": main()
def run(args):
    """Write default targets for the out-of-segment regions of recordings.

    For each recording in ``args.reco2utt`` a matrix with
    ``reco2num_frames[reco]`` rows is filled with the default target row
    (``args.default_targets`` or ``[1, 0, 0]``); the rows covered by the
    recording's segments are then set to zero.  Matrices are written to
    ``args.out_targets_ark``.

    Raises:
        ValueError: if a line of an input file cannot be parsed.
        RuntimeError: if no region or recording was processed, or too many
            utterances could not be accounted for.
    """
    reco2utt = {}
    with common_lib.smart_open(args.reco2utt) as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) < 2:
                raise ValueError("Could not parse line {0}".format(line))
            reco2utt[parts[0]] = parts[1:]

    reco2num_frames = {}
    with common_lib.smart_open(args.reco2num_frames) as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) != 2:
                raise ValueError("Could not parse line {0}".format(line))
            if parts[0] not in reco2utt:
                continue
            reco2num_frames[parts[0]] = int(parts[1])

    segments = {}
    with common_lib.smart_open(args.segments) as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) not in [4, 5]:
                raise ValueError("Could not parse line {0}".format(line))
            utt = parts[0]
            reco = parts[1]
            if reco not in reco2utt:
                continue
            start_time = float(parts[2])
            end_time = float(parts[3])
            segments[utt] = [reco, start_time, end_time]

    num_utt_err = 0
    num_utt = 0
    num_reco = 0

    if args.default_targets is not None:
        default_targets = np.matrix(
            common_lib.read_matrix_ascii(args.default_targets))
    else:
        default_targets = np.matrix([[1, 0, 0]])
    assert (np.shape(default_targets)[0] == 1
            and np.shape(default_targets)[1] == 3)

    with common_lib.smart_open(args.out_targets_ark, 'w') as f:
        # items() exists on Python 2 and 3; iteritems() is Python 2 only.
        for reco, utts in reco2utt.items():
            reco_mat = np.repeat(default_targets, reco2num_frames[reco],
                                 axis=0)
            # Sort on start time.  Unknown utterances sort last rather than
            # raising KeyError, so the check below can count them.
            utts.sort(key=lambda x: (segments[x][1] if x in segments
                                     else float("inf")))
            for utt in utts:
                if utt not in segments:
                    num_utt_err += 1
                    continue
                segment = segments[utt]

                start_frame = int(segment[1] / args.frame_shift)
                end_frame = int(segment[2] / args.frame_shift)
                num_frames = end_frame - start_frame

                if end_frame > reco2num_frames[reco]:
                    end_frame = reco2num_frames[reco]
                    num_frames = end_frame - start_frame

                if num_frames < 0:
                    # The segment starts after its (clipped) end;
                    # np.zeros() would raise on a negative row count.
                    num_utt_err += 1
                    continue

                reco_mat[start_frame:end_frame] = np.zeros([num_frames, 3])
                num_utt += 1

            if reco_mat.shape[0] > 0:
                common_lib.write_matrix_ascii(f, reco_mat.tolist(),
                                              key=reco)
                num_reco += 1

    logger.info("Got default out-of-segment targets for {num_reco} recordings "
                "containing {num_utt} in-segment regions; "
                "failed to account {num_utt_err} utterances"
                "".format(num_reco=num_reco, num_utt=num_utt,
                          num_utt_err=num_utt_err))

    if num_utt == 0 or num_utt_err > num_utt / 2 or num_reco == 0:
        raise RuntimeError(
            "Got default out-of-segment targets for {num_reco} recordings "
            "containing {num_utt} in-segment regions; "
            "failed to account {num_utt_err} utterances"
            "".format(num_reco=num_reco, num_utt=num_utt,
                      num_utt_err=num_utt_err))
def WriteDistMatrices(D, wark):
    """Write every matrix stored in D to wark, in sorted key order.

    Each value of D is converted with ``.tolist()`` and written under its
    key via ``common_lib.write_matrix_ascii``.
    """
    ordered_keys = sorted(D.keys())
    with common_lib.smart_open(wark, 'w') as ark_out:
        for mat_key in ordered_keys:
            rows = D[mat_key].tolist()
            common_lib.write_matrix_ascii(ark_out, rows, key=mat_key)
def run(args):
    """Produce default targets outside the segments of every recording.

    Every recording listed in ``args.reco2utt`` gets a matrix with
    ``reco2num_frames[reco]`` rows, each equal to the default target row
    (read from ``args.default_targets``, else ``[1, 0, 0]``).  Rows that
    fall inside one of the recording's segments are zeroed.  The matrices
    are written to ``args.out_targets_ark``.

    Raises:
        ValueError: if a line of an input file cannot be parsed.
        RuntimeError: if no region or recording was processed, or too many
            utterances could not be accounted for.
    """
    def read_fields(path):
        # Yield (raw line, whitespace-split fields) for each line of path.
        with common_lib.smart_open(path) as fh:
            for raw in fh:
                yield raw, raw.strip().split()

    reco2utt = {}
    for line, parts in read_fields(args.reco2utt):
        if len(parts) < 2:
            raise ValueError("Could not parse line {0}".format(line))
        reco2utt[parts[0]] = parts[1:]

    reco2num_frames = {}
    for line, parts in read_fields(args.reco2num_frames):
        if len(parts) != 2:
            raise ValueError("Could not parse line {0}".format(line))
        if parts[0] not in reco2utt:
            continue
        reco2num_frames[parts[0]] = int(parts[1])

    segments = {}
    for line, parts in read_fields(args.segments):
        if len(parts) not in [4, 5]:
            raise ValueError("Could not parse line {0}".format(line))
        utt = parts[0]
        reco = parts[1]
        if reco not in reco2utt:
            continue
        start_time = float(parts[2])
        end_time = float(parts[3])
        segments[utt] = [reco, start_time, end_time]

    num_utt_err = 0
    num_utt = 0
    num_reco = 0

    if args.default_targets is not None:
        default_targets = np.matrix(
            common_lib.read_matrix_ascii(args.default_targets))
    else:
        default_targets = np.matrix([[1, 0, 0]])
    assert (np.shape(default_targets)[0] == 1
            and np.shape(default_targets)[1] == 3)

    with common_lib.smart_open(args.out_targets_ark, 'w') as f:
        # dict.iteritems() was removed in Python 3; items() works on both.
        for reco, utts in reco2utt.items():
            reco_mat = np.repeat(default_targets, reco2num_frames[reco],
                                 axis=0)
            # Sort on start time; utterances without a segment entry are
            # placed last so the sort itself cannot raise KeyError.
            utts.sort(key=lambda x: (segments[x][1] if x in segments
                                     else float("inf")))
            for utt in utts:
                if utt not in segments:
                    num_utt_err += 1
                    continue
                segment = segments[utt]

                start_frame = int(segment[1] / args.frame_shift)
                end_frame = int(segment[2] / args.frame_shift)
                num_frames = end_frame - start_frame

                if end_frame > reco2num_frames[reco]:
                    end_frame = reco2num_frames[reco]
                    num_frames = end_frame - start_frame

                if num_frames < 0:
                    # Segment lies beyond the recording (or is reversed);
                    # a negative size would make np.zeros() raise.
                    num_utt_err += 1
                    continue

                reco_mat[start_frame:end_frame] = np.zeros([num_frames, 3])
                num_utt += 1

            if reco_mat.shape[0] > 0:
                common_lib.write_matrix_ascii(f, reco_mat.tolist(),
                                              key=reco)
                num_reco += 1

    logger.info("Got default out-of-segment targets for {num_reco} recordings "
                "containing {num_utt} in-segment regions; "
                "failed to account {num_utt_err} utterances"
                "".format(num_reco=num_reco, num_utt=num_utt,
                          num_utt_err=num_utt_err))

    if num_utt == 0 or num_utt_err > num_utt // 2 or num_reco == 0:
        raise RuntimeError(
            "Got default out-of-segment targets for {num_reco} recordings "
            "containing {num_utt} in-segment regions; "
            "failed to account {num_utt_err} utterances"
            "".format(num_reco=num_reco, num_utt=num_utt,
                      num_utt_err=num_utt_err))
def run(args):
    """Create silence/single/overlap target matrices from an overlap RTTM.

    For every recording in ``args.reco2num_frames`` a ``num_frames x 3``
    matrix is built.  Frames default to the silence vector; frames inside
    a segment labelled ``"overlap"`` get the overlap vector and frames
    inside any other segment the single-speaker vector.  With label
    smoothing ``s`` the target class gets ``1 - s`` and each other class
    ``s / 2``.  Matrices are written to ``args.out_targets_ark`` in sorted
    recording order.

    Raises:
        ValueError: if a line of the reco2num_frames file does not have
            exactly two fields.
    """
    # Get all reco to num_frames, which will be used to decide the number of
    # rows of matrix
    reco2num_frames = {}
    with common_lib.smart_open(args.reco2num_frames) as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) != 2:
                raise ValueError("Could not parse line {0}".format(line))
            reco2num_frames[parts[0]] = int(parts[1])

    # We read all segments and store as a list of objects
    segments = []
    with common_lib.smart_open(args.overlap_rttm) as f:
        for line in f:
            parts = line.strip().split()
            segments.append(
                Segment(parts[1], float(parts[3]), dur=float(parts[4]),
                        label=parts[7]))

    # We group the segment list into a dictionary indexed by reco_id.
    # Appending per key is order-independent; groupby() on the unsorted
    # list only merges adjacent items, so a later run of the same
    # recording would replace the earlier one in the dict.
    reco2segs = defaultdict(list)
    for seg in segments:
        reco2segs[seg.reco_id].append(seg)

    # Label vectors are the same for every recording.  The builtin float
    # replaces the np.float alias, which newer NumPy releases removed.
    target_val = 1 - args.label_smoothing
    other_val = args.label_smoothing / 2
    silence_vec = np.array([target_val, other_val, other_val], dtype=float)
    single_vec = np.array([other_val, target_val, other_val], dtype=float)
    overlap_vec = np.array([other_val, other_val, target_val], dtype=float)

    # Now, for each reco, create a matrix of shape num_frames x 3 and fill
    # in using the segments information for that reco
    reco2targets = {}
    for reco_id in reco2num_frames:
        segs = sorted(reco2segs[reco_id], key=lambda x: x.start_time)

        # The default target (if not single or overlap) is silence
        targets_mat = np.tile(silence_vec, (reco2num_frames[reco_id], 1))

        # Now iterate over all segments of the recording and assign targets
        for seg in segs:
            start_frame = int(seg.start_time / args.frame_shift)
            end_frame = min(int(seg.end_time / args.frame_shift),
                            reco2num_frames[reco_id])
            num_frames = end_frame - start_frame
            if num_frames <= 0:
                continue
            if seg.label == "overlap":
                label_vec = overlap_vec
            else:
                label_vec = single_vec
            targets_mat[start_frame:end_frame] = np.tile(
                label_vec, (num_frames, 1))

        reco2targets[reco_id] = targets_mat

    with common_lib.smart_open(args.out_targets_ark, 'w') as f:
        for reco_id in sorted(reco2targets.keys()):
            common_lib.write_matrix_ascii(f,
                                          reco2targets[reco_id].tolist(),
                                          key=reco_id)
def run(args):
    """Merge per-segment target matrices into per-recording matrices.

    A matrix with ``reco2num_frames[reco]`` rows is initialised from the
    default targets (``args.default_targets`` or zeros) for each
    recording.  Each utterance's targets, read by running ``copy-feats``,
    are copied into the rows of its segment.  If a segment starts before
    the nominal end of the preceding utterance in the sorted list, the
    overlapping rows are cross-faded with a linear ramp.  Results are
    written to ``args.out_targets_ark``.

    Raises:
        RuntimeError: if ``copy-feats`` fails, no utterance or recording
            was processed, or too many utterances failed.
        ValueError: if a segment has a non-positive length in frames.
    """
    reco2utt = read_reco2utt_file(args.reco2utt)
    reco2num_frames = read_reco2num_frames_file(args.reco2num_frames)
    segments = read_segments_file(args.segments, reco2utt)
    targets = read_targets_scp(args.targets_scp, segments)

    if args.default_targets is not None:
        # Read the vector of default targets for out-of-segment regions
        default_targets = np.matrix(
            common_lib.read_matrix_ascii(args.default_targets))
    else:
        default_targets = np.zeros([1, 3])
    assert (np.shape(default_targets)[0] == 1
            and np.shape(default_targets)[1] == 3)

    num_utt_err = 0
    num_utt = 0
    num_reco = 0

    with common_lib.smart_open(args.out_targets_ark, 'w') as fh:
        # dict.iteritems() was removed in Python 3; items() works on both.
        for reco, utts in reco2utt.items():
            # Read a recording and the list of its utterances from the
            # reco2utt dictionary
            reco_mat = np.repeat(default_targets, reco2num_frames[reco],
                                 axis=0)
            # Sort on start time.  Utterances missing from "segments" are
            # placed last: the sort cannot raise KeyError and utts[i-1] of
            # any processed utterance is always a known segment.
            utts.sort(key=lambda x: (segments[x][1] if x in segments
                                     else float("inf")))

            for i, utt in enumerate(utts):
                if utt not in segments or utt not in targets:
                    num_utt_err += 1
                    continue
                segment = segments[utt]

                # Read the targets corresponding to the segments
                cmd = ("copy-feats --binary=false {mat_fn} -"
                       "".format(mat_fn=targets[utt]))
                p = subprocess.Popen(cmd, shell=True,
                                     stdout=subprocess.PIPE,
                                     stderr=subprocess.PIPE)

                try:
                    mat = np.matrix(common_lib.read_matrix_ascii(p.stdout),
                                    dtype='float32')
                except Exception:
                    logger.error("Command '{cmd}' failed".format(cmd=cmd))
                    raise
                finally:
                    [stdout, stderr] = p.communicate()
                    if p.returncode is not None and p.returncode != 0:
                        raise RuntimeError(
                            'Command "{cmd}" failed with status {status}; '
                            'stderr = {stderr}'.format(
                                cmd=cmd, status=-p.returncode,
                                stderr=stderr))

                start_frame = int(segment[1] / args.frame_shift + 0.5)
                end_frame = int(segment[2] / args.frame_shift + 0.5)
                num_frames = end_frame - start_frame

                if num_frames <= 0:
                    raise ValueError("Invalid line in segments file {0}"
                                     "".format(segment))

                if abs(mat.shape[0] - num_frames) > args.length_tolerance:
                    logger.warning("For utterance {utt}, mismatch in segment "
                                   "length and targets matrix size; "
                                   "{s_len} vs {t_len}".format(
                                       utt=utt, s_len=num_frames,
                                       t_len=mat.shape[0]))
                    num_utt_err += 1
                    continue

                if end_frame > reco2num_frames[reco]:
                    end_frame = reco2num_frames[reco]
                    num_frames = end_frame - start_frame

                if num_frames < 0:
                    logger.warning("For utterance {utt}, start-frame {start} "
                                   "is outside the recording"
                                   "".format(utt=utt, start=start_frame))
                    num_utt_err += 1
                    continue

                prev_utt_end_frame = (
                    int(segments[utts[i - 1]][2] / args.frame_shift + 0.5)
                    if i > 0 else 0)
                if start_frame < prev_utt_end_frame:
                    # Segment overlaps with the previous utterance
                    # Combine targets using a weighted interpolation using a
                    # triangular window with a weight of 1 at the start/end
                    # of overlap and 0 at the end/start of the segment
                    for n in range(0, prev_utt_end_frame - start_frame):
                        w = float(n) / float(prev_utt_end_frame - start_frame)
                        reco_mat[n + start_frame, :] = (
                            reco_mat[n + start_frame, :] * (1.0 - w)
                            + mat[n, :] * w)

                    num_frames = min(num_frames, mat.shape[0])
                    end_frame = start_frame + num_frames
                    reco_mat[prev_utt_end_frame:end_frame, :] = (
                        mat[(prev_utt_end_frame - start_frame):
                            (end_frame - start_frame), :])
                else:
                    # No overlap with the previous utterances.
                    # So just add it to the output.
                    num_frames = min(num_frames, mat.shape[0])
                    reco_mat[start_frame:(start_frame + num_frames), :] = (
                        mat[0:num_frames, :])
                logger.debug("reco_mat shape = %s, mat shape = %s, "
                             "start_frame = %d, end_frame = %d",
                             reco_mat.shape, mat.shape,
                             start_frame, end_frame)
                num_utt += 1

            if reco_mat.shape[0] > 0:
                common_lib.write_matrix_ascii(fh, reco_mat, key=reco)
                num_reco += 1

    logger.info("Merged {num_utt} segment targets from {num_reco} recordings; "
                "failed with {num_utt_err} utterances"
                "".format(num_utt=num_utt, num_reco=num_reco,
                          num_utt_err=num_utt_err))

    if num_utt == 0 or num_utt_err > num_utt / 2 or num_reco == 0:
        raise RuntimeError(
            "Merged {num_utt} segment targets from {num_reco} recordings; "
            "failed with {num_utt_err} utterances"
            "".format(num_utt=num_utt, num_reco=num_reco,
                      num_utt_err=num_utt_err))