help="# of process for Multiprocessing", default=8, type=int) args = parser.parse_args() assert os.path.isdir(str( args.wsj0)), "WSJ0 directory not found - '{d}'".format(d=args.wsj0) assert os.path.isdir(str( args.wsj1)), "WSJ1 directory not found - '{d}'".format(d=args.wsj1) assert args.wsj0 != args.wsj1, "WSJ0 and WSJ1 directories can't be the same" assert os.path.exists( args.sph2pipe), "sph2pipe not found '{d}'".format(d=args.sph2pipe) transcripts = {} utils.find_transcripts(args.wsj0, transcripts) utils.find_transcripts(args.wsj1, transcripts) sets = {} sets["si84"] = utils.ndx2idlist( args.wsj0, "11-13.1/wsj0/doc/indices/train/tr_s_wv1.ndx", transcripts, lambda line: None if "11_2_1:wsj0/si_tr_s/401" in line else line, ) assert len(sets["si84"]) == 7138 sets["si284"] = utils.ndx2idlist( args.wsj0, "11-13.1/wsj0/doc/indices/train/tr_s_wv1.ndx", transcripts,
default=8, type=int) args = parser.parse_args() wsj1_sep = "-" if args.wsj1_type == "LDC94S13A" else "_" assert os.path.isdir(str( args.wsj0)), "WSJ0 directory is not found - '{d}'".format(d=args.wsj0) assert os.path.isdir(str( args.wsj1)), "WSJ1 directory is not found - '{d}'".format(d=args.wsj1) assert args.wsj0 != args.wsj1, "WSJ0 and WSJ1 directories can't be the same" assert os.path.exists( args.sph2pipe), "sph2pipe not found '{d}'".format(d=args.sph2pipe) # Prepare audio data transcripts = find_transcripts([args.wsj0, args.wsj1]) subsets = dict() subsets["si84"] = ndx_to_samples( args.wsj0, "11-13.1/wsj0/doc/indices/train/tr_s_wv1.ndx", transcripts, lambda line: None if "11_2_1:wsj0/si_tr_s/401" in line else line, ) assert len( subsets["si84"]) == 7138, "Incorrect number of samples in si84 part:" " should be 7138, but fould #{}.".format(len(subsets["si84"])) subsets["si284"] = ndx_to_samples( args.wsj0, "11-13.1/wsj0/doc/indices/train/tr_s_wv1.ndx",