def main():
    """Entry point: parse CLI args and YAML config, optionally extract
    audio/visual features, build datasets and model, then run training."""
    args = parse_args()
    config = parse_yaml(args.config_path)["avel"]
    train_config = config["train"]
    model_config = config["model"]

    # If audio and visual features have not been extracted yet,
    # extract them from the video files and save them.
    # fixed: `if x is True` is non-idiomatic; truthiness test suffices.
    if args.extract_features:
        extract_feature(args.ave_root)

    model = DMRFE(
        128,            # audio feature dim — TODO confirm against extractor
        512,            # visual feature channels — TODO confirm
        7 * 7,          # visual spatial locations (7x7 grid)
        model_config["att_embed_dim"],
        model_config["lstm_hidden_dim"],
        model_config["lstm_num_layers"],
        model_config["target_size"],
    )

    # AVE training dataset.
    train_ds = AVELDataset(
        args.ave_root,
        args.train_annot,
        args.features_path,
        train_config["batch_size"],
        model_config["target_size"],
    )
    # AVE validation dataset.
    valid_ds = AVELDataset(
        args.ave_root,
        args.valid_annot,
        args.features_path,
        train_config["batch_size"],
        model_config["target_size"],
    )

    training = Training(
        model,
        train_ds,
        valid_ds,
        train_config["batch_size"],
        train_config["epoch"],
        train_config["learning_rate"],
        train_config["valid_span"],
        train_config["save_span"],
        train_config["save_dir"],
    )
    training.train()
def concat_features(file_list):
    """
    Extract features from audio files and concatenate them into one array.

    Args:
        file_list: paths of audio files to extract features from.

    Returns:
        features: 2-D array of all frame-level feature vectors, stacked
            in file order.
        n_features: list with the number of frames contributed by each file.

    Raises:
        ValueError: if ``file_list`` is empty (the original code raised an
            opaque UnboundLocalError in that case).
    """
    if not file_list:
        raise ValueError("file_list must not be empty")

    chunks = []
    n_features = []
    for file_name in file_list:
        # Extract frame-level features, then subsample every n_hop_frames-th
        # frame (same striding as the original).
        feature = util.extract_feature(file_name, CONFIG["feature"])
        feature = feature[:: CONFIG["feature"]["n_hop_frames"], :]
        chunks.append(feature)
        n_features.append(feature.shape[0])

    # fixed: the original preallocated assuming every file yields the same
    # frame count as the first one and crashed otherwise; concatenating
    # handles varying per-file frame counts and is identical when equal.
    features = numpy.concatenate(chunks, axis=0).astype(float)
    return features, n_features
def __init__(self, files, transform=None):
    """Build an in-memory feature dataset from a list of audio files.

    Args:
        files: paths of audio files to extract features from.
        transform: optional sample transform, stored as-is on the instance.

    Raises:
        ValueError: if ``files`` is empty (the original code raised an
            opaque UnboundLocalError in that case).
    """
    self.transform = transform
    if not files:
        raise ValueError("files must not be empty")

    dataset = None
    for file_id, file_name in enumerate(files):
        # shape = (#frames, #dims)
        features = util.extract_feature(file_name, config=CONFIG["feature"])
        features = features[:: CONFIG["feature"]["n_hop_frames"], :]
        if dataset is None:
            # shape = (#total frames over all audio files, feature dim).
            # NOTE(review): this assumes every file yields the same number
            # of frames as the first one — TODO confirm for this dataset.
            dataset = numpy.zeros(
                (
                    features.shape[0] * len(files),
                    CONFIG["feature"]["n_mels"] * CONFIG["feature"]["n_frames"],
                ),
                numpy.float32,
            )
        dataset[
            features.shape[0] * file_id : features.shape[0] * (file_id + 1), :
        ] = features

    self.feat_data = dataset

    # Report the train/validation split sizes (informational only; the
    # split itself is not stored here).
    val_split = CONFIG["training"]["validation_split"]
    train_size = int(len(dataset) * (1.0 - val_split))
    print("train_size: %d, val_size: %d" % (
        train_size,
        int(len(dataset) * val_split),
    ))
def calc_anomaly_score(model, file_path):
    """
    Calculate the anomaly score of one audio file.

    The score is the reconstruction error of the model: the squared error
    between the input features and the model output, averaged over feature
    dimensions and then over frames.

    Args:
        model: torch module mapping a feature batch to its reconstruction.
        file_path: path of the audio file to score.

    Returns:
        float: mean reconstruction error (anomaly score).

    Raises:
        FileNotFoundError: if the file is missing or broken. fixed: the
            original only printed the error and fell through, after which
            the unbound ``data`` raised a confusing UnboundLocalError.
    """
    try:
        data = util.extract_feature(file_path, config=CONFIG["feature"])
    except FileNotFoundError:
        print("File broken!!: {}".format(file_path))
        raise

    feed_data = torch.from_numpy(data).clone()
    # fixed: Tensor.to() is NOT in-place — the original discarded its result,
    # so the tensor never actually moved to DEVICE.
    feed_data = feed_data.to(DEVICE).float()

    with torch.no_grad():
        pred = model(feed_data)
        pred = pred.to("cpu").detach().numpy().copy()

    errors = np.mean(np.square(data - pred), axis=1)  # average over dims
    return np.mean(errors)  # average over frames
def calc_anomaly_score(model, file_path, section_index):
    """
    Calculate the anomaly score of one audio file using a section classifier.

    The model outputs per-frame section logits; the score is the mean
    negative log-odds of the target section, log(1 - p) - log(p), averaged
    over frames (low probability of the correct section => high score).

    Args:
        model: torch module mapping (N, 1, n_frames, n_mels) input to
            per-frame unnormalized section logits.
        file_path: path of the audio file to score.
        section_index: index of the target section; -1 selects the last
            column of the softmax output.

    Returns:
        float: anomaly score.

    Raises:
        FileNotFoundError: if the file is missing or broken. fixed: the
            original only printed the error and fell through, after which
            the unbound ``data`` raised a confusing UnboundLocalError.
    """
    try:
        # extract features (log-mel spectrogram)
        data = util.extract_feature(file_name=file_path, config=CONFIG["feature"])
        data = data.reshape(
            (  # must be a tuple of ints
                data.shape[0],
                1,
                CONFIG["feature"]["n_frames"],
                CONFIG["feature"]["n_mels"],
            )
        )
    except FileNotFoundError:
        print("File broken!!: {}".format(file_path))
        raise

    feed_data = torch.from_numpy(data).clone()
    feed_data = feed_data.to(DEVICE).float()

    with torch.no_grad():
        output = model(feed_data)  # notice: unnormalized output

    output = output.to("cpu").detach().numpy().copy()  # tensor to numpy array
    output = softmax(output, axis=1)
    prob = output[:, section_index]

    # Mean log-odds of NOT being the target section, clamped away from zero
    # to avoid log(0) = -inf.
    # fixed: the original nested the second log INSIDE the first log's
    # argument — log((1-p) - log(p)) — instead of the intended
    # log(1-p) - log(p) difference.
    eps = sys.float_info.epsilon
    y_pred = numpy.mean(
        numpy.log(numpy.maximum(1.0 - prob, eps))
        - numpy.log(numpy.maximum(prob, eps))
    )
    return y_pred