def main(argv):
    """Quantize the audio files found under ``args.pathDB`` into one text file.

    Pipeline, in order:

    1. Parse ``argv`` with ``parseArgs`` and, when ``--split`` is given as
       ``idxSplit-numSplits``, work only on that contiguous slice of files.
    2. List the audio files with ``findAllSeqs`` (cached list first, a full
       rescan when the cache is empty or has the wrong extension), optionally
       keep only the names listed in ``args.pathSeq``.
    3. Load the clustering module and the CPC feature maker referenced by the
       clustering arguments.
    4. Quantize every remaining file with ``quantize_file`` and append one
       ``<file name>\\t<quantized units>`` line per file to the output file in
       ``args.pathOutputDir``.

    With ``--resume`` the files already present in the output file are
    skipped; without it an existing output file is an error.

    Args:
        argv: Command-line arguments forwarded to ``parseArgs``.

    Raises:
        AssertionError: on a malformed ``--split``, an existing output file
            without ``--resume``, no file left to quantize, a checkpoint path
            not ending in ``.pt``, a missing clustering args file or a missing
            CPC checkpoint.
    """
    # Args parser
    args = parseArgs(argv)

    print("=============================================================")
    print(f"Quantizing data from {args.pathDB}")
    print("=============================================================")

    # Get splits
    if args.split:
        assert len(args.split.split("-"))==2 and int(args.split.split("-")[1]) >= int(args.split.split("-")[0]) >= 1, \
            "SPLIT must be under the form idxSplit-numSplits (numSplits >= idxSplit >= 1), eg. --split 1-20"
        idx_split, num_splits = args.split.split("-")
        idx_split = int(idx_split)
        num_splits = int(num_splits)

    # Find all sequences
    print("")
    print(f"Looking for all {args.file_extension} files in {args.pathDB}")
    seqNames, _ = findAllSeqs(args.pathDB,
                              speaker_level=1,
                              extension=args.file_extension,
                              loadCache=True)
    # The cached list may have been built for another extension: rescan then.
    if len(seqNames) == 0 or not os.path.splitext(seqNames[0][1])[1].endswith(
            args.file_extension):
        print(
            f"Seems like the _seq_cache.txt does not contain the correct extension, reload the file list"
        )
        seqNames, _ = findAllSeqs(args.pathDB,
                                  speaker_level=1,
                                  extension=args.file_extension,
                                  loadCache=False)
    print(f"Done! Found {len(seqNames)} files!")

    # Filter specific sequences
    if args.pathSeq:
        print("")
        print(f"Filtering seqs in {args.pathSeq}")
        with open(args.pathSeq, 'r') as f:
            seqs = {x.strip() for x in f}
        # Entries are matched on the file name without directory or extension.
        seqNames = [
            s for s in seqNames
            if os.path.splitext(s[1].split('/')[-1])[0] in seqs
        ]
        print(f"Done! {len(seqNames)} files filtered!")

    # Check if directory exists
    if not os.path.exists(args.pathOutputDir):
        print("")
        print(f"Creating the output directory at {args.pathOutputDir}")
        Path(args.pathOutputDir).mkdir(parents=True, exist_ok=True)
        writeArgs(os.path.join(args.pathOutputDir, "_info_args.json"), args)

    # Check if output file exists
    if not args.split:
        nameOutput = "quantized_outputs.txt"
    else:
        nameOutput = f"quantized_outputs_split_{idx_split}-{num_splits}.txt"
    outputFile = os.path.join(args.pathOutputDir, nameOutput)

    # Get splits
    if args.split:
        startIdx = len(seqNames) // num_splits * (idx_split - 1)
        if idx_split == num_splits:
            # The last split also takes the remainder of the integer division.
            endIdx = len(seqNames)
        else:
            endIdx = min(
                len(seqNames) // num_splits * idx_split, len(seqNames))
        seqNames = seqNames[startIdx:endIdx]
        print("")
        print(
            f"Quantizing split {idx_split} out of {num_splits} splits, with {len(seqNames)} files (idx in range({startIdx}, {endIdx}))."
        )

    # Debug mode
    if args.debug:
        nsamples = 20
        print("")
        print(f"Debug mode activated, only load {nsamples} samples!")
        seqNames = seqNames[:nsamples]

    # Continue
    addEndLine = False  # to add end line (\n) to first line or not
    if args.resume:
        if os.path.exists(outputFile):
            with open(outputFile, 'r') as f:
                lines = f.readlines()
            # First whitespace-separated token of each line is the file name.
            existing_files = {x.split()[0] for x in lines if x.split()}
            seqNames = [
                s for s in seqNames
                if os.path.splitext(s[1].split('/')[-1])[0] not in
                existing_files
            ]
            print(
                f"Found existing output file, continue to quantize {len(seqNames)} audio files left!"
            )
            # Lines are written as "\n" + line, so the file normally has no
            # trailing newline and the first appended line must add one.
            if len(lines) > 0 and not lines[-1].endswith("\n"):
                addEndLine = True
    else:
        assert not os.path.exists(outputFile), \
            f"Output file {outputFile} already exists !!! If you want to continue quantizing audio files, please check the --resume option."

    assert len(seqNames) > 0, \
        "No file to be quantized!"

    # Load Clustering args
    assert args.pathClusteringCheckpoint[-3:] == ".pt"
    if os.path.exists(args.pathClusteringCheckpoint[:-3] + "_args.json"):
        pathConfig = args.pathClusteringCheckpoint[:-3] + "_args.json"
    elif os.path.exists(
            os.path.join(os.path.dirname(args.pathClusteringCheckpoint),
                         "checkpoint_args.json")):
        pathConfig = os.path.join(
            os.path.dirname(args.pathClusteringCheckpoint),
            "checkpoint_args.json")
    else:
        assert False, \
            f"Args file not found in the directory {os.path.dirname(args.pathClusteringCheckpoint)}"
    clustering_args = readArgs(pathConfig)
    print("")
    print(
        f"Clutering args:\n{json.dumps(vars(clustering_args), indent=4, sort_keys=True)}"
    )
    print('-' * 50)

    # Load ClusterModule
    print("")
    print(f"Loading ClusterModule at {args.pathClusteringCheckpoint}")
    clusterModule = loadClusterModule(args.pathClusteringCheckpoint)
    if not args.cpu:
        clusterModule.cuda()
    print("ClusterModule loaded!")

    # Get the CPC checkpoint path from clustering args
    if not os.path.isabs(
            clustering_args.pathCheckpoint):  # Maybe it's relative path
        clustering_args.pathCheckpoint = os.path.join(
            os.path.dirname(os.path.abspath(args.pathClusteringCheckpoint)),
            clustering_args.pathCheckpoint)
    assert os.path.exists(clustering_args.pathCheckpoint), \
        f"CPC path at {clustering_args.pathCheckpoint} does not exist!!"

    # Load FeatureMaker
    print("")
    print(f"Loading CPC FeatureMaker from {clustering_args.pathCheckpoint}")
    ## If we don't apply batch implementation, we can set LSTM model to keep hidden units
    ## making the quality of the quantized units better (that's why I set keep_hidden=args.nobatch)
    featureMaker = loadCPCFeatureMaker(
        clustering_args.pathCheckpoint,
        gru_level=vars(clustering_args).get('level_gru', None),
        get_encoded=clustering_args.encoder_layer,
        keep_hidden=args.nobatch)
    if clustering_args.dimReduction is not None:
        dimRed = loadDimReduction(clustering_args.dimReduction,
                                  clustering_args.centroidLimits)
        featureMaker = torch.nn.Sequential(featureMaker, dimRed)
    if not clustering_args.train_mode:
        featureMaker.eval()
    if not args.cpu:
        featureMaker.cuda()

    def cpc_feature_function(x):
        """Build the CPC features of ``x``, batched unless ``--nobatch``."""
        if args.nobatch is False:
            return buildFeature_batch(featureMaker,
                                      x,
                                      seqNorm=False,
                                      strict=args.strict,
                                      maxSizeSeq=args.max_size_seq,
                                      batch_size=args.batch_size)
        else:
            return buildFeature(featureMaker,
                                x,
                                seqNorm=False,
                                strict=args.strict)

    print("CPC FeatureMaker loaded!")

    # Quantization of files
    print("")
    print(f"Quantizing audio files and saving outputs to {outputFile}...")
    bar = progressbar.ProgressBar(maxval=len(seqNames))
    bar.start()
    start_time = time()
    # The context manager closes (and flushes) the output file even when
    # quantization of a file raises, so completed lines are never lost.
    with open(outputFile, "a") as f:
        for index, vals in enumerate(seqNames):
            bar.update(index)

            file_path = os.path.join(args.pathDB, vals[1])

            # Quantizing
            quantLine = quantize_file(file_path, cpc_feature_function,
                                      clusterModule)

            # Save the outputs
            file_name = os.path.splitext(os.path.basename(file_path))[0]
            outLine = "\t".join([file_name, quantLine])
            if addEndLine:
                f.write("\n" + outLine)
            else:
                f.write(outLine)
                addEndLine = True
    bar.finish()
    print(f"...done {len(seqNames)} files in {time()-start_time} seconds.")
def main(argv):
    """Build one-hot features for every quantized sequence and save them.

    Reads ``args.pathQuantizedUnits`` (one ``<name>\\t<units>`` line per
    sequence, units separated by commas or spaces), converts each sequence to
    a one-hot matrix and writes it with ``np.savetxt`` to
    ``<args.pathOutputDir>/<name>.txt``.

    Unit symbols are mapped to indexes either directly (``int(unit)``) or,
    when ``args.dict`` is given, through the position of the symbol in that
    dictionary file; in that case ``args.n_units`` is overwritten with the
    dictionary size.

    If the output directory already exists, sequences whose ``.txt`` output
    is already there are skipped.

    Args:
        argv: Command-line arguments forwarded to ``parseArgs``.

    Raises:
        AssertionError: if the units are not digits and no dictionary is given.
    """
    # Extension of the files written below; also used to detect finished work.
    outExt = ".txt"

    # Args parser
    args = parseArgs(argv)

    print("=============================================================")
    print(f"Building 1-hot features from {args.pathQuantizedUnits}")
    print("=============================================================")

    # Load input file
    print("")
    print(f"Reading input file from {args.pathQuantizedUnits}")
    seqNames = []
    seqInputs = []
    with open(args.pathQuantizedUnits, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline).
                continue
            file_name, file_seq = line.split("\t")
            # Convert sequence to the desired input form
            file_seq = file_seq.replace(",", " ")
            # Add to lists
            seqNames.append(file_name)
            seqInputs.append(file_seq)
    print(f"Found {len(seqNames)} sequences!")

    # Verify the output directory
    if os.path.exists(args.pathOutputDir):
        existing_files = {
            os.path.splitext(os.path.basename(x))[0]
            for x in os.listdir(args.pathOutputDir) if x.endswith(outExt)
        }
        # Names and inputs are filtered together so they stay aligned for
        # the zip() in the main loop.
        remaining = [(name, seq) for name, seq in zip(seqNames, seqInputs)
                     if os.path.splitext(name)[0] not in existing_files]
        seqNames = [name for name, _ in remaining]
        seqInputs = [seq for _, seq in remaining]
        print(
            f"Found existing output directory at {args.pathOutputDir}, continue to build features of {len(seqNames)} audio files left!"
        )
    else:
        print("")
        print(f"Creating the output directory at {args.pathOutputDir}")
        Path(args.pathOutputDir).mkdir(parents=True, exist_ok=True)
    writeArgs(os.path.join(args.pathOutputDir, "_info_args.json"), args)

    # Debug mode
    if args.debug:
        nsamples = 20
        print("")
        print(f"Debug mode activated, only load {nsamples} samples!")
        seqNames = seqNames[:nsamples]
        seqInputs = seqInputs[:nsamples]

    # Load 1hot dictionary in case we use it
    if seqInputs and not seqInputs[0].split()[0].isdigit(
    ):  # multi-group ie. 65-241
        assert args.dict is not None, \
            "A dictionary must be given when the quantized outputs is not digits (multi-group case)!"
    if args.dict:
        print("")
        print(f"Loading onehot dictionary from {args.dict}...")
        with open(args.dict, "r") as f:
            lines = f.read().split("\n")
        # Index = line position in the dictionary file; empty lines and
        # "madeupword" entries are left out.
        pair2idx = {
            word.split()[0]: i
            for i, word in enumerate(lines)
            if word.strip() and not word.startswith("madeupword")
        }
        args.n_units = len(pair2idx)

    # Define onehot_feature_function
    def onehot_feature_function(input_sequence):
        """Return the one-hot matrix (one row per unit) of a unit string."""
        if args.dict:
            indexes_sequence = np.array(
                [pair2idx[item] for item in input_sequence.split()])
        else:
            indexes_sequence = np.array(
                [int(item) for item in input_sequence.split()])

        # Row i of the identity matrix is the one-hot vector of index i.
        onehotFeatures = np.eye(args.n_units)[indexes_sequence]

        return onehotFeatures

    # Building features
    print("")
    print(
        f"Building 1-hot features and saving outputs to {args.pathOutputDir}..."
    )
    bar = progressbar.ProgressBar(maxval=len(seqNames))
    bar.start()
    start_time = time()
    for index, (name_seq, input_seq) in enumerate(zip(seqNames, seqInputs)):
        bar.update(index)

        # Computing features
        onehot_features = onehot_feature_function(input_seq)

        # Save the outputs
        file_name = os.path.splitext(name_seq)[0] + outExt
        file_out = os.path.join(args.pathOutputDir, file_name)
        np.savetxt(file_out, onehot_features)
    bar.finish()
    print(f"...done {len(seqNames)} files in {time()-start_time} seconds.")
def main(argv):
    """Build LSTM language-model features for every quantized sequence.

    Reads ``args.pathQuantizedUnits`` (one ``<name>\\t<units>`` line per
    sequence, units separated by commas or spaces), feeds each sequence to the
    LSTM checkpoint ``args.pathLSTMCheckpoint`` and writes the activations of
    the hidden level ``args.hidden_level`` with ``np.savetxt`` to
    ``<args.pathOutputDir>/<name>.txt``.

    The directory holding ``dict.txt`` is the directory of ``args.dict`` when
    given, otherwise the checkpoint directory. If the output directory already
    exists, sequences whose ``.txt`` output is already there are skipped.

    Args:
        argv: Command-line arguments forwarded to ``parseArgs``.

    Raises:
        AssertionError: if ``dict.txt`` is missing, or if the requested hidden
            level exceeds the number of decoder layers.
    """
    # Extension of the files written below; also used to detect finished work.
    outExt = ".txt"

    # Args parser
    args = parseArgs(argv)

    print("=============================================================")
    print(f"Building LSTM features from {args.pathQuantizedUnits}")
    print("=============================================================")

    # Load input file
    print("")
    print(f"Reading input file from {args.pathQuantizedUnits}")
    seqNames = []
    seqInputs = []
    with open(args.pathQuantizedUnits, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline).
                continue
            file_name, file_seq = line.split("\t")
            # Convert sequence to the desired input form
            file_seq = file_seq.replace(",", " ")
            # Add to lists
            seqNames.append(file_name)
            seqInputs.append(file_seq)
    print(f"Found {len(seqNames)} sequences!")

    # Verify the output directory
    if os.path.exists(args.pathOutputDir):
        existing_files = {
            os.path.splitext(os.path.basename(x))[0]
            for x in os.listdir(args.pathOutputDir) if x.endswith(outExt)
        }
        # Names and inputs are filtered together so they stay aligned for
        # the zip() in the main loop.
        remaining = [(name, seq) for name, seq in zip(seqNames, seqInputs)
                     if os.path.splitext(name)[0] not in existing_files]
        seqNames = [name for name, _ in remaining]
        seqInputs = [seq for _, seq in remaining]
        print(
            f"Found existing output directory at {args.pathOutputDir}, continue to build features of {len(seqNames)} audio files left!"
        )
    else:
        print("")
        print(f"Creating the output directory at {args.pathOutputDir}")
        Path(args.pathOutputDir).mkdir(parents=True, exist_ok=True)
    writeArgs(os.path.join(args.pathOutputDir, "_info_args.json"), args)

    # Debug mode
    if args.debug:
        nsamples = 20
        print("")
        print(f"Debug mode activated, only load {nsamples} samples!")
        seqNames = seqNames[:nsamples]
        seqInputs = seqInputs[:nsamples]

    # Load LSTM model
    if args.dict is None:
        pathData = os.path.dirname(args.pathLSTMCheckpoint)
    else:
        pathData = os.path.dirname(args.dict)
    assert os.path.exists(os.path.join(pathData, "dict.txt")), \
        f"Dictionary file (dict.txt) not found in {pathData}"
    print("")
    print(f"Loading LSTM model from {args.pathLSTMCheckpoint}...")
    print(f"Path data {pathData}")
    model, task = loadLSTMLMCheckpoint(args.pathLSTMCheckpoint, pathData)
    model.eval()  # disable dropout (or leave in train mode to finetune)
    if not args.cpu:
        model.cuda()
    print("Model loaded !")

    # Truncated decoder copies, keyed by number of kept layers, so the
    # deepcopy is paid once per hidden level instead of once per sequence.
    truncated_decoders = {}

    # Define LSTM_feature_function
    def LSTM_feature_function(input_sequence, n_hidden=-1):
        """Return the activations of hidden level ``n_hidden`` as an array.

        ``n_hidden`` 0 is the embedding layer, ``k > 0`` the output of the
        first ``k`` decoder layers; negative values count from the last layer
        (``-1`` is the last one).
        """
        # Get the number of layers
        num_layers = len(model.decoder.layers)
        assert abs(n_hidden) <= num_layers, \
            "absolute value of n_hidden must be less than or equal to the number of hidden layers = {}".format(num_layers)

        if n_hidden < 0:
            n_hidden = num_layers + 1 + n_hidden

        # Get input tensor
        input_tensor = task.source_dictionary.encode_line(
            "<s> " + input_sequence, append_eos=True,
            add_if_not_exist=False).type(torch.LongTensor).unsqueeze(0)
        if not args.cpu:
            input_tensor = input_tensor.cuda()

        # Get the output
        if n_hidden == 0:  # Take the embedding layer
            with torch.no_grad():
                output_tensor = model.decoder.embed_tokens(input_tensor)
        else:
            decoder_clone = truncated_decoders.get(n_hidden)
            if decoder_clone is None:
                decoder_clone = deepcopy(model.decoder)

                # We don't take the final fc features
                decoder_clone.fc_out = torch.nn.Identity()
                decoder_clone.additional_fc = torch.nn.Identity()

                # Restrict the number of hidden layers to n_hidden
                decoder_clone.layers = decoder_clone.layers[:n_hidden]
                truncated_decoders[n_hidden] = decoder_clone

            with torch.no_grad():
                output_tensor = decoder_clone(input_tensor)[0]

        return output_tensor[0].data.cpu().numpy()

    # Building features
    print("")
    print(
        f"Building LSTM features and saving outputs to {args.pathOutputDir}..."
    )
    bar = progressbar.ProgressBar(maxval=len(seqNames))
    bar.start()
    start_time = time()
    for index, (name_seq, input_seq) in enumerate(zip(seqNames, seqInputs)):
        bar.update(index)

        # Computing features
        LSTM_features = LSTM_feature_function(input_seq,
                                              n_hidden=args.hidden_level)

        # Save the outputs
        file_name = os.path.splitext(name_seq)[0] + outExt
        file_out = os.path.join(args.pathOutputDir, file_name)
        np.savetxt(file_out, LSTM_features)
    bar.finish()
    print(f"...done {len(seqNames)} files in {time()-start_time} seconds.")
def main(argv):
    """Build RoBERTa (BERT) features for every quantized sequence.

    Reads ``args.pathQuantizedUnits`` (one ``<name>\\t<units>`` line per
    sequence, units separated by commas or spaces), feeds each sequence to the
    checkpoint ``args.pathBERTCheckpoint`` and writes the hidden states of
    level ``args.hidden_level`` with ``np.savetxt`` to
    ``<args.pathOutputDir>/<name>.txt``.

    The directory holding ``dict.txt`` is the directory of ``args.dict`` when
    given, otherwise the checkpoint directory. If the output directory already
    exists, sequences whose ``.txt`` output is already there are skipped.

    Args:
        argv: Command-line arguments forwarded to ``parseArgs``.

    Raises:
        AssertionError: if ``dict.txt`` is missing from the data directory.
    """
    # Extension of the files written below; also used to detect finished work.
    outExt = ".txt"

    # Args parser
    args = parseArgs(argv)

    print("=============================================================")
    print(f"Building BERT features from {args.pathQuantizedUnits}")
    print("=============================================================")

    # Load input file
    print("")
    print(f"Reading input file from {args.pathQuantizedUnits}")
    seqNames = []
    seqInputs = []
    with open(args.pathQuantizedUnits, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline).
                continue
            file_name, file_seq = line.split("\t")
            # Convert sequence to the desired input form
            file_seq = file_seq.replace(",", " ")
            # Add to lists
            seqNames.append(file_name)
            seqInputs.append(file_seq)
    print(f"Found {len(seqNames)} sequences!")

    # Verify the output directory
    if os.path.exists(args.pathOutputDir):
        existing_files = {
            os.path.splitext(os.path.basename(x))[0]
            for x in os.listdir(args.pathOutputDir) if x.endswith(outExt)
        }
        # Names and inputs are filtered together so they stay aligned for
        # the zip() in the main loop.
        remaining = [(name, seq) for name, seq in zip(seqNames, seqInputs)
                     if os.path.splitext(name)[0] not in existing_files]
        seqNames = [name for name, _ in remaining]
        seqInputs = [seq for _, seq in remaining]
        print(
            f"Found existing output directory at {args.pathOutputDir}, continue to build features of {len(seqNames)} audio files left!"
        )
    else:
        print("")
        print(f"Creating the output directory at {args.pathOutputDir}")
        Path(args.pathOutputDir).mkdir(parents=True, exist_ok=True)
    writeArgs(os.path.join(args.pathOutputDir, "_info_args.json"), args)

    # Debug mode
    if args.debug:
        nsamples = 20
        print("")
        print(f"Debug mode activated, only load {nsamples} samples!")
        seqNames = seqNames[:nsamples]
        seqInputs = seqInputs[:nsamples]

    # Load BERT model
    if args.dict is None:
        pathData = os.path.dirname(args.pathBERTCheckpoint)
    else:
        pathData = os.path.dirname(args.dict)
    assert os.path.exists(os.path.join(pathData, "dict.txt")), \
        f"Dictionary file (dict.txt) not found in {pathData}"
    print("")
    print(f"Loading RoBERTa model from {args.pathBERTCheckpoint}...")
    print(f"Path data {pathData}")
    roberta = loadRobertaCheckpoint(args.pathBERTCheckpoint,
                                    pathData,
                                    from_pretrained=False)
    roberta.eval()  # disable dropout (or leave in train mode to finetune)
    if not args.cpu:
        roberta.cuda()
    print("Model loaded !")

    # Define BERT_feature_function
    def BERT_feature_function(input_sequence, n_hidden=-1):
        """Return entry ``n_hidden`` of the model's hidden states as an array."""
        sentence_tokens = roberta.task.source_dictionary.encode_line(
            "<s> " + input_sequence, append_eos=True,
            add_if_not_exist=False).type(torch.LongTensor)
        if not args.cpu:
            sentence_tokens = sentence_tokens.cuda()

        with torch.no_grad():
            outputs = roberta.extract_features(sentence_tokens,
                                               return_all_hiddens=True)

        return outputs[n_hidden].squeeze(0).float().cpu().numpy()

    # Building features
    print("")
    print(
        f"Building BERT features and saving outputs to {args.pathOutputDir}..."
    )
    bar = progressbar.ProgressBar(maxval=len(seqNames))
    bar.start()
    start_time = time()
    for index, (name_seq, input_seq) in enumerate(zip(seqNames, seqInputs)):
        bar.update(index)

        # Computing features
        BERT_features = BERT_feature_function(input_seq,
                                              n_hidden=args.hidden_level)

        # Save the outputs
        file_name = os.path.splitext(name_seq)[0] + outExt
        file_out = os.path.join(args.pathOutputDir, file_name)
        np.savetxt(file_out, BERT_features)
    bar.finish()
    print(f"...done {len(seqNames)} files in {time()-start_time} seconds.")
def main(argv):
    """Build CPC features for every audio file and save them as gzipped arks.

    Lists the audio files under ``args.pathDB`` with ``findAllSeqs`` (cached
    list first, a full rescan when the cache is empty or has the wrong
    extension), computes the features of each file with the CPC checkpoint
    ``args.pathCPCCheckpoint`` and writes them under the key ``arr_0`` to
    ``<args.pathOutputDir>/<file name>.ark.gz`` through ``WriteHelper``.

    If the output directory already exists, files whose ``.ark.gz`` output is
    already there are skipped.

    Args:
        argv: Command-line arguments forwarded to ``parseArgs``.
    """
    # Extension of the files written below; also used to detect finished work.
    outExt = ".ark.gz"

    # Args parser
    args = parseArgs(argv)

    print("=============================================================")
    print(f"Building CPC features from {args.pathDB}")
    print("=============================================================")

    # Find all sequences
    print("")
    print(f"Looking for all {args.file_extension} files in {args.pathDB}")
    seqNames, _ = findAllSeqs(args.pathDB,
                              speaker_level=1,
                              extension=args.file_extension,
                              loadCache=True)
    # The cached list may have been built for another extension: rescan then.
    if len(seqNames) == 0 or not os.path.splitext(seqNames[0][-1])[1].endswith(
            args.file_extension):
        print(
            f"Seems like the _seq_cache.txt does not contain the correct extension, reload the file list"
        )
        seqNames, _ = findAllSeqs(args.pathDB,
                                  speaker_level=1,
                                  extension=args.file_extension,
                                  loadCache=False)
    print(f"Done! Found {len(seqNames)} files!")

    # Verify the output directory
    if os.path.exists(args.pathOutputDir):
        # Strip the full double extension: splitext would only drop ".gz".
        existing_files = {
            x[:-len(outExt)]
            for x in os.listdir(args.pathOutputDir) if x.endswith(outExt)
        }
        seqNames = [
            s for s in seqNames
            if os.path.splitext(os.path.basename(s[1]))[0] not in
            existing_files
        ]
        print(
            f"Found existing output directory at {args.pathOutputDir}, continue to build features of {len(seqNames)} audio files left!"
        )
    else:
        print("")
        print(f"Creating the output directory at {args.pathOutputDir}")
        Path(args.pathOutputDir).mkdir(parents=True, exist_ok=True)
    writeArgs(os.path.join(args.pathOutputDir, "_info_args.json"), args)

    # Debug mode
    if args.debug:
        nsamples = 20
        print("")
        print(f"Debug mode activated, only load {nsamples} samples!")
        seqNames = seqNames[:nsamples]

    # Load CPC feature maker
    print("")
    print(f"Loading CPC featureMaker from {args.pathCPCCheckpoint}")
    featureMaker = loadCPCFeatureMaker(args.pathCPCCheckpoint,
                                       gru_level=args.gru_level,
                                       get_encoded=args.get_encoded,
                                       keep_hidden=True)
    featureMaker.eval()
    if not args.cpu:
        featureMaker.cuda()
    print("CPC FeatureMaker loaded!")

    # Define CPC_feature_function
    def CPC_feature_function(x):
        """Return the CPC features of the audio file ``x`` as a numpy array."""
        CPC_features = buildFeature(featureMaker,
                                    x,
                                    seqNorm=args.seq_norm,
                                    strict=args.strict,
                                    maxSizeSeq=args.max_size_seq)
        return CPC_features.squeeze(0).float().cpu().numpy()

    # Building features
    print("")
    print(
        f"Building CPC features and saving outputs to {args.pathOutputDir}...")
    bar = progressbar.ProgressBar(maxval=len(seqNames))
    bar.start()
    start_time = time()
    for index, vals in enumerate(seqNames):
        bar.update(index)

        file_path = os.path.join(args.pathDB, vals[1])

        # Computing features
        CPC_features = CPC_feature_function(file_path)

        # Save the outputs (inside the output directory, one archive per file)
        file_name = os.path.splitext(os.path.basename(file_path))[0] + outExt
        file_out = os.path.join(args.pathOutputDir, file_name)
        # NOTE(review): the path is interpolated into a piped write specifier;
        # presumably paths containing spaces or shell metacharacters would
        # break it -- confirm against the WriteHelper documentation.
        with WriteHelper(f"ark:| gzip -c > {file_out}") as writer:
            writer('arr_0', CPC_features)
    bar.finish()
    print(f"...done {len(seqNames)} files in {time()-start_time} seconds.")