Example #1
def main():
    """Execute a task based on the given command-line arguments.

    This function is the main entry-point of the program. It allows the
    user to extract features, train a model, generate predictions, or
    evaluate predictions using the command-line interface.
    """
    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers(dest='mode')

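    # Add sub-parser for preprocessing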
    parser_preprocess = subparsers.add_parser('preprocess')
    parser_preprocess.add_argument('dataset', choices=['training', 'test'])

    # Add sub-parser for feature extraction
    parser_extract = subparsers.add_parser('extract')
    parser_extract.add_argument('dataset', choices=['training', 'test'])
    parser_extract.add_argument('--recompute', action='store_true')

    # Add sub-parser for training
    parser_train = subparsers.add_parser('train')
    parser_train.add_argument(
        '--model',
        choices=[
            'vgg13',
            'gcnn',
            'crnn',
            'gcrnn',
        ],
        default='gcnn',
    )
    parser_train.add_argument('--fold', type=int, default=-1)
    parser_train.add_argument('--class_weight', action='store_true')
    parser_train.add_argument('--sample_weight', type=float)

    # Add sub-parser for inference
    parser_predict = subparsers.add_parser('predict')
    parser_predict.add_argument('dataset', choices=['training', 'test'])
    parser_predict.add_argument('--fold', type=int, default=-1)

    # Add sub-parser for evaluation
    parser_evaluate = subparsers.add_parser('evaluate')
    parser_evaluate.add_argument('dataset', choices=['training', 'test'])
    parser_evaluate.add_argument('--fold', type=int, default=-1)

    args = parser.parse_args()
    if args.mode == 'preprocess':
        preprocess(cfg.to_dataset(args.dataset, preprocessed=False))
    elif args.mode == 'extract':
        extract(cfg.to_dataset(args.dataset), args.recompute)
    elif args.mode == 'train':
        train(args.model, args.fold, args.class_weight, args.sample_weight)
    elif args.mode == 'predict':
        predict(cfg.to_dataset(args.dataset), args.fold)
    elif args.mode == 'evaluate':
        dataset = cfg.to_dataset(args.dataset, preprocessed=False)
        evaluate_audio_tagging(dataset, args.fold)
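For reference, assuming the entry script is main.py (as in Example #2), this CLI would be invoked along the lines of "python main.py extract training --recompute" or "python main.py train --model gcnn --fold 0".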
Example #2
File: main.py Project: tqbl/gccaps
def main():
    """Execute a task based on the given command-line arguments.

    This function is the main entry-point of the program. It allows the
    user to extract features, train a model, generate predictions, or
    evaluate predictions using the command-line interface.
    """
    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers(dest='mode')

    # Add sub-parser for feature extraction
    parser_extract = subparsers.add_parser('extract')
    parser_extract.add_argument('dataset',
                                choices=['training', 'validation', 'test'],
                                )

    # Add sub-parser for training
    subparsers.add_parser('train')

    # Add sub-parser for inference
    parser_predict = subparsers.add_parser('predict')
    parser_predict.add_argument('dataset',
                                nargs='?',
                                choices=['validation', 'test'],
                                default='test',
                                )

    # Add sub-parser for evaluation
    parser_evaluate = subparsers.add_parser('evaluate')
    parser_evaluate.add_argument('task',
                                 nargs='?',
                                 choices=['tagging', 'sed', 'all'],
                                 default='all',
                                 )
    parser_evaluate.add_argument('dataset',
                                 nargs='?',
                                 choices=['validation', 'test'],
                                 default='test',
                                 )
    parser_evaluate.add_argument('--thresholds', action='store_true')

    args = parser.parse_args()

    if args.mode == 'extract':
        extract(cfg.to_dataset(args.dataset))
    elif args.mode == 'train':
        train()
    elif args.mode == 'predict':
        predict(cfg.to_dataset(args.dataset))
    elif args.mode == 'evaluate':
        eval_all = args.task == 'all'
        dataset = cfg.to_dataset(args.dataset)
        if args.task == 'tagging' or eval_all:
            evaluate_audio_tagging(dataset, args.thresholds)
        if args.task == 'sed' or eval_all:
            evaluate_sed(dataset)
Example #3
def evaluate_audio_tagging(fold):
    """Evaluate the audio tagging predictions and write results.

    Args:
        fold (int): The fold (validation set) to evaluate.
    """
    import evaluation

    # Load ground truth data and predictions
    dataset = cfg.to_dataset('training', preprocessed=False)
    df = io.read_metadata(dataset.metadata_path)
    df = df[df.fold == fold]
    y_true = utils.to_categorical(df.label)
    fold_str = 'training' + str(fold)
    path = cfg.predictions_path.format('predictions', fold_str)
    y_pred = pd.read_csv(path, index_col=0).values

    # Mask out those that are not manually verified
    mask = df.manually_verified == 1
    y_pred = y_pred[mask]
    y_true = y_true[mask]

    # Evaluate audio tagging performance
    scores = evaluation.evaluate_audio_tagging(y_true,
                                               y_pred,
                                               threshold=cfg.threshold)

    # Ensure output directory exists and write results
    os.makedirs(os.path.dirname(cfg.results_path), exist_ok=True)
    output_path = cfg.results_path.format(fold_str)
    scores.to_csv(output_path)

    # Print scores to 3 decimal places
    pd.options.display.float_format = '{:,.3f}'.format
    print('\n' + str(scores))
Example #4
def compute_vgg13_features(waveform, hparams):
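    # Fit a scaler on the training-set features so the TF-computed
    # log-mel spectrograms below can be standardized the same way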
    x, _ = utils_tf._load_dataset(cfg.to_dataset('training'))
    generator = utils.fit_scaler(x)
    mel_filt = librosa.filters.mel(sr=32000, n_fft=1024, n_mels=64).T

    sample_rate = hparams.sample_rate
    mel_filt = tf.convert_to_tensor(mel_filt)
    stfts = tf.contrib.signal.stft(waveform,
                                   frame_length=1024,
                                   frame_step=512,
                                   fft_length=1024,
                                   pad_end=True)
    spectrograms = tf.abs(stfts)

    # Warp the linear scale spectrograms into the mel-scale.
    num_spectrogram_bins = stfts.shape[-1].value
    mel_spectrograms = tf.tensordot(tf.pow(spectrograms, 2), mel_filt, 1)
    mel_spectrograms.set_shape(spectrograms.shape[:-1].concatenate(
        mel_filt.shape[-1:]))

    max_val = tf.reduce_max(mel_spectrograms, axis=None)

    # Compute a stabilized log to get log-magnitude mel-scale spectrograms.
    log_mel_spectrograms = 10 * (
        (tf.log(mel_spectrograms + 1e-6) - tf.log(max_val + 1e-6)) /
        tf.log(tf.constant(10, dtype=tf.float32)))
    log_mel_spectrograms = tf.contrib.signal.frame(log_mel_spectrograms,
                                                   128,
                                                   128,
                                                   axis=0,
                                                   pad_end=True)
    features = generator.standardize(log_mel_spectrograms)
    features.set_shape(shape=[None, 128, 64])
    return features
Example #5
def experiment2(audio_path, audio_number, metadata_path, save_path):
    # Run the attacks to generate adversarial examples on manually verified
    # examples from the training and test data
    # Load dataset to normalize new data

    x, df = _load_dataset(cfg.to_dataset('training'))
    generator = utils.fit_scaler(x)
    file_names = df.index.to_list()
    audio_file_name = file_names[audio_number]
    mel_fb = librosa.filters.mel(sr=32000, n_fft=1024, n_mels=64).T
    sample_rate = 32000

    with tf.Graph().as_default() as graph:
        mel_filt = tf.convert_to_tensor(mel_fb, dtype=tf.float32)
        model = CleverHansModel(save_path + '.meta', sample_rate, generator,
                                mel_filt)
        pcm = tf.placeholder(tf.float32, shape=[None], name='input_audio')
        saver = model.build_graph(pcm)
        saliencymap = SM.SaliencyMapMethod(model, 41)
        saliencymap.build_attack(pcm)
    with tf.Session(graph=graph) as sess:
        saver.restore(sess, save_path)
        for i in range(111, 112):
            audio_file_name = file_names[i]
            try:
                data, q = _preprocess_data(audio_path, audio_file_name)
            except EOFError:
                print("EOF Error dammit")
                continue  # Skip files that cannot be decoded
            label_name = _get_label_from_audio(audio_path, audio_file_name,
                                               metadata_path)
            labels = _convert_label_name_to_label(label_name)
            s = sess.run([model.get_probs()],
                         feed_dict={'input_audio:0': data})

            s = np.squeeze(s)
            if (s.ndim != 1):
                s = np.mean(s, axis=0)

            if (np.argmax(s) == labels):

                print('Iteration number:', i)
                print('Original label number:', np.argmax(s))
                print('Original label confidence:', np.max(s))
                labels = np.repeat(20, int(q))
                adv = saliencymap.attack(data, labels, sess)

                preds = sess.run([model.get_probs()], feed_dict={pcm: adv})
                preds = np.squeeze(preds)
                if (preds.ndim == 1):
                    print(np.max(preds), np.argmax(preds))
                else:
                    print(np.max(preds, axis=1), np.argmax(preds, axis=1))

                if (preds.ndim != 1):
                    preds = np.mean(preds, axis=0)

                print(np.argmax(preds), np.max(preds))

                librosa.output.write_wav('adv-cw.wav', adv, sample_rate)
                librosa.output.write_wav('original-cw.wav', data, sample_rate)
Example #6
def predict(dataset, fold):
    """Generate predictions for audio tagging.

    This function uses an ensemble of trained models to generate the
    predictions, with the averaging function being an arithmetic mean.
    Computed predictions are then saved to disk.

    Args:
        dataset: Dataset to generate predictions for.
        fold (int): The specific fold to generate predictions for. Only
            applicable for the training dataset.
    """
    import inference

    # Load input data and associated metadata
    x, df = _load_data(dataset)
    dataset_name = dataset.name
    if dataset.name == 'training':
        if fold == -1:
            raise ValueError('Invalid fold: %d' % fold)

        dataset_name += str(fold)
        mask = df.fold == fold
        tr_x = x[~mask]
        x = x[mask]
        df = df[mask]
    else:
        tr_x, tr_df = _load_data(cfg.to_dataset('training'))
        if fold >= 0:
            dataset_name += str(fold)
            tr_x = tr_x[tr_df.fold != fold]

    generator = utils.fit_scaler(tr_x)
    x = generator.standardize(x)

    # Predict class probabilities for each model (epoch)
    preds = []
    for epoch in _determine_epochs(cfg.prediction_epochs, fold, n=4):
        pred = utils.timeit(lambda: _load_model(fold, epoch).predict(x),
                            '[Epoch %d] Predicted class probabilities' % epoch)

        preds.append(inference.merge_predictions(pred, df.index))

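    # Ensemble the per-epoch predictions with an arithmetic mean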
    pred_mean = pd.concat(preds).groupby(level=0).mean()

    # Ensure output directory exists and set file path format
    os.makedirs(os.path.dirname(cfg.predictions_path), exist_ok=True)
    predictions_path = cfg.predictions_path.format('%s', dataset_name)

    # Save free parameters to disk
    utils.log_parameters({'prediction_epochs': cfg.prediction_epochs},
                         os.path.join(os.path.dirname(cfg.predictions_path),
                                      'parameters.json'))

    # Write predictions to disk
    pred_mean.to_csv(predictions_path % 'predictions')
    io.write_predictions(pred_mean, predictions_path % 'submission')
Example #7
def dataset_iterator(train_csv_file, train_audio_dir, label_data_file,
                     hparams):
    """
    Create an iterator for the training process
    """
    label_index_table = load_data(train_csv_file)
    label_data = np.load(label_data_file)
    print(label_data.shape)
    num_classes = 41
    dataset = tf.data.TextLineDataset(train_csv_file).skip(1)

    dataset = dataset.shuffle(buffer_size=10000)

    if (hparams.vgg13_features):
        x, _ = utils_tf._load_dataset(cfg.to_dataset('training'))
        generator = utils.fit_scaler(x)
        mel_filt = librosa.filters.mel(sr=32000, n_fft=1024, n_mels=64).T
        dataset = dataset.map(map_func=functools.partial(
            get_vgg13_data,
            train_audio_dir=train_audio_dir,
            hparams=hparams,
            label_index_table=label_index_table,
            label_data=label_data,
            generator=generator,
            mel_filt=mel_filt),
                              num_parallel_calls=6)
    else:
        dataset = dataset.map(
            map_func=functools.partial(get_data,
                                       train_audio_dir=train_audio_dir,
                                       hparams=hparams,
                                       label_index_table=label_index_table,
                                       label_data=label_data))

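    # Unbatch to individual examples, reshuffle, repeat, then re-batch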
    dataset = dataset.apply(tf.contrib.data.unbatch())
    dataset = dataset.shuffle(buffer_size=10000)
    dataset = dataset.repeat(6)
    dataset = dataset.batch(hparams.batch_size)

    dataset = dataset.prefetch(10)
    iterator = dataset.make_initializable_iterator()
    features, label = iterator.get_next()

    return features, label, num_classes, iterator.initializer
Example #8
def label_data(model_path, train_csv_file, train_audio_dir):
    """
    Label the data using a particular model and save the softmax values.
    Generates one softmax vector per file.
    """

    sr = 32000
    df = pd.read_csv(train_csv_file)
    x, _ = utils_tf._load_dataset(cfg.to_dataset('training'))
    generator = utils.fit_scaler(x)
    file_names = df.iloc[:, 0].values
    print(file_names)
    with tf.Graph().as_default() as graph:
        mel_filt = librosa.filters.mel(sr=32000, n_fft=1024, n_mels=64).T
        mel_filt = tf.convert_to_tensor(mel_filt, dtype=tf.float32)
        pcm = tf.placeholder(tf.float32, shape=[None], name='input_audio')
        model = CleverHansModel(model_path + '.meta', sr, generator, mel_filt)
        saver = model.build_graph(pcm)

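    # temp holds one 41-class softmax vector per file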
    temp = np.zeros((len(file_names), 41))
    print(temp.shape)
    with tf.Session(graph=graph) as sess:
        saver.restore(sess, model_path)
        print(len(file_names))
        for i in range(len(file_names)):
            data, _ = utils_tf._preprocess_data(train_audio_dir, file_names[i])
            l = sess.run([model.get_probs()], feed_dict={pcm: data})
            l = np.squeeze(l)
            if (l.ndim != 1):
                l = np.mean(l, axis=0)

            temp[i, :] = l
            print(i)

    # Save the softmax values to disk
    np.save('labels.npy', temp)
Example #9
def target():
    """
    Check the model's predictions against the ground truth and print the
    accuracy over the first 100 files of the inference set.
    """
    flags = parse_flags()
    hparams = parse_hparams(flags.hparams)
    num_classes = 41
    df = pd.read_csv(flags.infer_csv_file)
    file_names = df.iloc[:, 0].values

    count = 0

    sr = 32000
    x, _ = utils_tf._load_dataset(cfg.to_dataset('training'))
    generator = utils.fit_scaler(x)
    with tf.Graph().as_default() as graph:
        mel_filt = librosa.filters.mel(sr=32000, n_fft=1024, n_mels=64).T
        mel_filt = tf.convert_to_tensor(mel_filt, dtype=tf.float32)
        pcm = tf.placeholder(tf.float32, shape=[None], name='input_audio')
        model = CleverHansModel(flags.save_model_dir + '.meta', sr, generator,
                                mel_filt)
        saver = model.build_graph(pcm)

    with tf.Session(graph=graph) as sess:
        saver.restore(sess, flags.save_model_dir)
        print(len(file_names))
        for i in range(100):
            data, _ = utils_tf._preprocess_data(flags.infer_audio_dir,
                                                file_names[i])
            l = sess.run([model.get_probs()], feed_dict={pcm: data})
            l = np.squeeze(l)
            if (l.ndim != 1):
                l = np.mean(l, axis=0)

            lab = utils_tf._convert_label_name_to_label(df.iloc[i, 1])
            if (lab == np.argmax(l)):
                count += 1
                print(lab, np.argmax(l))

        # Accuracy over the first 100 files
        print(count / 100)
Example #10
def deepfoolattack(audio_path,
                   metadata_path,
                   model_path,
                   exp_data_path,
                   adv_audio_path,
                   save_data=False):
    # Run the attacks to generate adversarial examples on manually verified
    # examples from the training and test data
    # Load dataset to normalize new data
    x, _ = utils_tf._load_dataset(cfg.to_dataset('training'))
    generator = utils.fit_scaler(x)
    df = pd.read_csv(metadata_path)
    label_names = df.iloc[:, 2].values
    file_names = df.iloc[:, 1].values
    mel_fb = librosa.filters.mel(sr=32000, n_fft=1024, n_mels=64).T
    sample_rate = 32000

    audio_name = []
    audio_length = []
    original_label = []
    original_confidence = []
    new_label = []
    new_confidence = []
    new_o_label_conf = []
    snr = []
    with tf.Graph().as_default() as graph:
        mel_filt = tf.convert_to_tensor(mel_fb, dtype=tf.float32)
        model = CleverHansModel(model_path + '.meta', sample_rate, generator,
                                mel_filt)
        pcm = tf.placeholder(tf.float32, shape=[None], name='input_audio')
        saver = model.build_graph(pcm)
        deepfool = DFM.DeepFool(model)
        deepfool.build_attack(pcm)
    with tf.Session(graph=graph) as sess:
        saver.restore(sess, model_path)
        for i in range(df.shape[0]):
            audio_file_name = file_names[i]
            try:
                data, q = utils_tf._preprocess_data(audio_path,
                                                    audio_file_name)
            except EOFError:
                print("EOF Error")
                continue  # Skip files that cannot be decoded

            labels = utils_tf._convert_label_name_to_label(label_names[i])
            s = sess.run([model.get_probs()],
                         feed_dict={'input_audio:0': data})

            s = np.squeeze(s)
            if (s.ndim != 1):
                s = np.mean(s, axis=0)

            print('Original label number:', np.argmax(s))
            print('Original label confidence:', np.max(s))

            tic = time.process_time()
            adv = deepfool.attack(sess, data, int(q))
            toc = time.process_time()

            print('Time for processing sample:', toc - tic, 'for iteration:',
                  i)
            preds = sess.run([model.get_probs()], feed_dict={pcm: adv})
            preds = np.squeeze(preds)

            if (preds.ndim != 1):
                preds = np.mean(preds, axis=0)
            print('New label number:', np.argmax(preds))
            print('New label confidence:', np.max(preds))

            if (save_data):
                librosa.output.write_wav(
                    adv_audio_path + 'adv-' + audio_file_name, adv,
                    sample_rate)

            audio_name.append(audio_file_name)
            audio_length.append(int(q))
            original_label.append(np.argmax(s))
            original_confidence.append(np.max(s))
            new_label.append(np.argmax(preds))
            new_confidence.append(np.max(preds))
            new_o_label_conf.append(preds[np.argmax(s)])
            snr.append(10 *
                       np.log10(np.mean(data**2) / np.mean((adv - data)**2)))
        if (save_data):
            df_deepfool = pd.DataFrame({
                'audio_name': audio_name,
                'audio_length': audio_length,
                'original_label': original_label,
                'original_confidence': original_confidence,
                'new_label': new_label,
                'new_confidence': new_confidence,
                'new_orig_conf': new_o_label_conf,
                'SNR': snr
            })

            with open(exp_data_path, 'a') as f:
                df_deepfool.to_csv(f, header=False)
Example #11
def inferenceiqbal(audio_path,
                   metadata_path,
                   model_path,
                   exp_data_path,
                   adv_audio_path,
                   save_data=False):
    # Run inference over the examples listed in the metadata file and
    # record the model's accuracy
    # Load dataset to normalize new data
    x, _ = utils_tf._load_dataset(cfg.to_dataset('training'))
    generator = utils.fit_scaler(x)
    df = pd.read_csv(metadata_path)
    label_names = df.iloc[:, 1].values
    file_names = df.iloc[:, 0].values

    mel_fb = librosa.filters.mel(sr=32000, n_fft=1024, n_mels=64).T
    sample_rate = 32000

    audio_name = []
    ground_truth = []
    inferred_label = []
    inferred_confidence = []
    with tf.Graph().as_default() as graph:
        mel_filt = tf.convert_to_tensor(mel_fb, dtype=tf.float32)
        model = CleverHansModel(model_path + '.meta', sample_rate, generator,
                                mel_filt)
        pcm = tf.placeholder(tf.float32, shape=[None], name='input_audio')
        saver = model.build_graph(pcm)
    with tf.Session(graph=graph) as sess:
        saver.restore(sess, model_path)
        count = 0
        count_tot = 0
        for i in range(df.shape[0]):
            audio_file_name = file_names[i]
            try:
                data, q = utils_tf._preprocess_data(audio_path,
                                                    audio_file_name)
            except EOFError:
                print("EOF Error")
                continue  # Skip files that cannot be decoded

            gt_label = utils_tf._convert_label_name_to_label(label_names[i])
            s = sess.run([model.get_probs()],
                         feed_dict={'input_audio:0': data})

            s = np.squeeze(s)
            if (s.ndim != 1):
                s = np.mean(s, axis=0)
            label = np.argmax(s)
            count_tot += 1
            if (label == gt_label):
                count += 1

            if (i % 1000 == 0):
                print('Iteration number:', i)
                print('Current accuracy:', float(count / count_tot))
            audio_name.append(audio_file_name)
            ground_truth.append(gt_label)
            inferred_label.append(label)
            inferred_confidence.append(np.max(s))
        if (save_data):
            df_results = pd.DataFrame({
                'audio_name': audio_name,
                'ground_truth': ground_truth,
                'inferred_label': inferred_label,
                'inferred_confidence': inferred_confidence
            })

            with open(exp_data_path, 'w') as f:
                df_results.to_csv(f, header=False)
Example #12
def train(model, fold, use_class_weight, noisy_sample_weight):
    """Train the neural network model.

    Args:
        model (str): The neural network architecture.
        fold (int): The fold to use for validation.
        use_class_weight (bool): Whether to use class-wise weights.
        noisy_sample_weight (float): Examples that are not verified are
            weighted according to this value.

    Note:
        For reproducibility, the random seed is set to a fixed value.
    """
    import training

    # Try to create reproducible results
    np.random.seed(cfg.initial_seed)

    # Load training data and associated metadata
    x, df = _load_data(cfg.to_dataset('training'))
    # Get one-hot representation of target values
    y = utils.to_categorical(df.label)

    # Split training data into training and validation
    if fold >= 0:
        mask = df.fold == fold
    else:
        mask = np.zeros(len(df), dtype=bool)
    val_mask = mask & (df.manually_verified == 1)

    tr_x = x[~mask]
    tr_y = y[~mask]
    val_x = x[val_mask]
    val_y = y[val_mask]
    val_index = df.index[val_mask]

    # Compute class weights based on number of class examples
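    # using the "balanced" heuristic: n_samples / (n_classes * class_count)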
    if use_class_weight:
        group = utils.group_by_name(df)
        n_examples = group.first().groupby('label').size().values
        class_weight = len(group) / (len(n_examples) * n_examples)
    else:
        class_weight = None

    # Assign a specific sample weight to unverified examples
    if noisy_sample_weight:
        sample_weight = df[~mask].manually_verified.values.astype(float)
        sample_weight[sample_weight == 0] = noisy_sample_weight
    else:
        sample_weight = None

    # Ensure output directories exist
    fold_dir = str(fold) if fold >= 0 else 'all'
    os.makedirs(os.path.join(cfg.model_path, fold_dir), exist_ok=True)
    os.makedirs(cfg.log_path.format(fold_dir), exist_ok=True)

    # Save free parameters to disk
    utils.log_parameters(cfg.training,
                         os.path.join(cfg.model_path, 'parameters.json'))

    training.train(tr_x,
                   tr_y,
                   val_x,
                   val_y,
                   val_index,
                   model,
                   fold,
                   class_weight=class_weight,
                   sample_weight=sample_weight)
Example #13
def carliniwagneruntargeted(audio_path,
                            metadata_path,
                            model_path,
                            exp_data_path,
                            adv_audio_path,
                            save_data=False):
    # Run the attacks to generate adversarial examples on manually verified
    # examples from the training and test data
    # Load dataset to normalize new data
    x, _ = utils_tf._load_dataset(cfg.to_dataset('training'))
    generator = utils.fit_scaler(x)
    df = pd.read_csv(metadata_path)
    label_names = df.iloc[:, 2].values
    file_names = df.iloc[:, 1].values
    mel_fb = librosa.filters.mel(sr=32000, n_fft=1024, n_mels=64).T
    sample_rate = 32000

    audio_name = []
    audio_length = []
    original_label = []
    original_confidence = []
    new_label = []
    new_confidence = []
    new_o_label_conf = []
    snr = []
    with tf.Graph().as_default() as graph:
        mel_filt = tf.convert_to_tensor(mel_fb, dtype=tf.float32)
        model = CleverHansModel(model_path + '.meta', sample_rate, generator,
                                mel_filt)
        pcm = tf.placeholder(tf.float32, shape=[None], name='input_audio')
        carliniwagner = CW.CarliniWagnerAttack(model,
                                               learning_rate=1e-5,
                                               initial_const=1e-2,
                                               max_iterations=1000,
                                               confidence=500,
                                               binary_search_steps=2)
        saver = carliniwagner.build_attack(pcm)
    with tf.Session(graph=graph) as sess:
        saver.restore(sess, model_path)
        for i in range(df.shape[0]):
            audio_file_name = file_names[i]
            try:
                data, q = utils_tf._preprocess_data(audio_path,
                                                    audio_file_name)
            except EOFError:
                print("EOF Error")
                continue  # Skip files that cannot be decoded

            label = utils_tf._convert_label_name_to_label(label_names[i])

            print('Ground truth label:', label_names[i])

            labels_batchwise = np.repeat(label, int(q))

            tic = time.process_time()
            adv, o_label, o_conf, n_label, n_conf, n_conf_gt = carliniwagner.attack(
                sess,
                data,
                label,
                labels_batchwise,
                int(q),
                prob_thresh=0.0244)
            toc = time.process_time()

            print('Time for iteration:', i, 'is', toc - tic)
            if (save_data):
                librosa.output.write_wav(
                    adv_audio_path + 'adv-' + audio_file_name, adv,
                    sample_rate)

            audio_name.append(audio_file_name)
            audio_length.append(int(q))
            original_label.append(o_label)
            original_confidence.append(o_conf)
            new_label.append(n_label)
            new_confidence.append(n_conf)
            new_o_label_conf.append(n_conf_gt)
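            # SNR (dB): ratio of signal power to perturbation power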
            snr.append(10 *
                       np.log10(np.mean(data**2) / (np.mean((adv - data)**2))))
        if (save_data):
            df_cw = pd.DataFrame({
                'audio_name': audio_name,
                'audio_length': audio_length,
                'original_label': original_label,
                'original_confidence': original_confidence,
                'new_label': new_label,
                'new_confidence': new_confidence,
                'new_orig_conf': new_o_label_conf,
                'SNR': snr
            })

            with open(exp_data_path, 'w') as f:
                df_cw.to_csv(f, header=False)
Example #14
def carliniwagnertargeted(audio_path,
                          metadata_path,
                          model_path,
                          exp_data_path,
                          adv_audio_path,
                          save_data=False):
    # Run the attacks to generate adversarial examples on manually verified
    # examples from the training and test data
    # Load dataset to normalize new data
    x, _ = utils_tf._load_dataset(cfg.to_dataset('training'))
    generator = utils.fit_scaler(x)
    df = pd.read_csv(metadata_path)
    gt_labels = df.iloc[:, 2].values
    file_names = df.iloc[:, 1].values
    mel_fb = librosa.filters.mel(sr=32000, n_fft=1024, n_mels=64).T
    sample_rate = 32000
    label_list = [
        "Bass_drum", "Cello", "Clarinet", "Oboe", "Snare_drum",
        "Violin_or_fiddle"
    ]

    audio_name = []
    audio_length = []
    original_label = []
    original_confidence = []
    new_label = []
    new_confidence = []
    new_o_label_conf = []
    snr = []
    with tf.Graph().as_default() as graph:
        mel_filt = tf.convert_to_tensor(mel_fb, dtype=tf.float32)
        model = CleverHansModel(model_path + '.meta', sample_rate, generator,
                                mel_filt)
        pcm = tf.placeholder(tf.float32, shape=[None], name='input_audio')
        carliniwagner = CW.CarliniWagnerAttack(model,
                                               learning_rate=1e-5,
                                               confidence=500,
                                               targeted=True,
                                               max_iterations=1000,
                                               binary_search_steps=2)
        saver = carliniwagner.build_attack(pcm)
    with tf.Session(graph=graph) as sess:
        saver.restore(sess, model_path)
        for i in range(df.shape[0]):
            audio_file_name = file_names[i]
            try:
                data, q = utils_tf._preprocess_data(audio_path,
                                                    audio_file_name)
            except EOFError:
                print("EOF Error")
                continue  # Skip files that cannot be decoded

            # gt_labels holds label names; convert to an index so the
            # comparison with the target label below is meaningful
            gt_label = utils_tf._convert_label_name_to_label(gt_labels[i])

            print('Ground truth label:', gt_labels[i], 'Audio_file:',
                  file_names[i])
            for l in range(len(label_list)):
                label = utils_tf._convert_label_name_to_label(label_list[l])
                if (label == gt_label):
                    continue

                adv, o_label, o_conf, n_label, n_conf, n_gt_conf = carliniwagner.attack(
                    sess,
                    data,
                    label,
                    np.repeat(label, int(q)),
                    int(q),
                    prob_thresh=0.975)

                if (save_data):
                    librosa.output.write_wav(
                        adv_audio_path + 'adv-' + label_list[l] + '-' +
                        audio_file_name, adv, sample_rate)

                audio_name.append(audio_file_name)
                audio_length.append(int(q))
                original_label.append(o_label)
                original_confidence.append(o_conf)
                new_label.append(n_label)
                new_confidence.append(n_conf)
                new_o_label_conf.append(n_gt_conf)

                snr.append(
                    10 * np.log10(np.mean(data**2) / np.mean((adv - data)**2)))
        if (save_data):
            df_cw = pd.DataFrame(
                {
                    'audio_name': audio_name,
                    'audio_length': audio_length,
                    'original_label': original_label,
                    'original_confidence': original_confidence,
                    'new_label': new_label,
                    'new_confidence': new_confidence,
                    'new_orig_conf': new_o_label_conf,
                    'SNR': snr
                },
                columns=[
                    'audio_name', 'audio_length', 'original_label',
                    'original_confidence', 'new_label', 'new_confidence',
                    'new_orig_conf', 'SNR'
                ])

            with open(exp_data_path, 'a') as f:
                df_cw.to_csv(f)
Example #15
def compare_features(audio_path, audio_number, metadata_path, save_path,
                     keras_model_path):

    m = keras.models.load_model(keras_model_path)
    # Load dataset
    x, df = _load_dataset(cfg.to_dataset('training'))
    generator = utils.fit_scaler(x)
    file_names = df.index.to_list()
    audio_file_name = file_names[audio_number]
    print(audio_file_name)
    path = os.path.join(audio_path, audio_file_name)
    x, fs = librosa.load(path, sr=None)
    print('Sample rate:', fs)
    print('length in sec:', float(x.shape[0] / fs))
    # Librosa features

    x = librosa.resample(x, fs, 32000)
    x_tf = x
    D = librosa.stft(x, n_fft=1024, hop_length=512)
    mel_fb = librosa.filters.mel(sr=32000, n_fft=1024, n_mels=64)
    S = np.dot(mel_fb, np.abs(D)**2).T
    feat = librosa.power_to_db(S, ref=np.max, top_db=None)
    spec_test = generator.standardize(_reshape_spec(feat))
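    # spec_test holds the librosa-derived features; spec_tf below is the
    # TF-derived version of the same features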
    graph = tf.Graph()
    with graph.as_default():
        pcm = tf.placeholder(tf.float32, shape=[None])
        filter_banks = tf.placeholder(tf.float32, shape=[None, None])
        stfts = tf.contrib.signal.stft(pcm,
                                       frame_length=1024,
                                       frame_step=512,
                                       fft_length=1024,
                                       pad_end=True)
        spectrograms = tf.abs(stfts)

        # Warp the linear scale spectrograms into the mel-scale.
        num_spectrogram_bins = stfts.shape[-1].value
        lower_edge_hertz, upper_edge_hertz, num_mel_bins = 0, 16000, 64
        linear_to_mel_weight_matrix = tf.contrib.signal.linear_to_mel_weight_matrix(
            num_mel_bins, num_spectrogram_bins, 32000, lower_edge_hertz,
            upper_edge_hertz)
        mel_spectrograms = tf.tensordot(tf.pow(spectrograms, 2), filter_banks,
                                        1)
        mel_spectrograms.set_shape(spectrograms.shape[:-1].concatenate(
            linear_to_mel_weight_matrix.shape[-1:]))

        max_val = tf.reduce_max(mel_spectrograms, axis=None)
        # Compute a stabilized log to get log-magnitude mel-scale spectrograms.
        log_mel_spectrograms = 10 * (
            (tf.log(mel_spectrograms + 1e-6) - tf.log(max_val + 1e-6)) /
            tf.log(tf.constant(10, dtype=tf.float32)))

    with tf.Session(graph=graph) as sess:
        lms = sess.run([log_mel_spectrograms],
                       feed_dict={
                           pcm: x_tf,
                           filter_banks: mel_fb.T
                       })
        lms = np.squeeze(lms)
        spec_tf = generator.standardize(_reshape_spec(lms, 32))

    p = m.predict(spec_tf[:, :, :, np.newaxis])
    print(np.argmax(p, axis=1), np.max(p, axis=1))
Example #16
def experiment1(audio_path, audio_number, metadata_path, save_path,
                save_data_path):
    # Run the attacks to generate adversarial examples on manually verified
    # examples from the training and test data
    # Load dataset to normalize new data

    x, df = _load_dataset(cfg.to_dataset('training'))
    generator = utils.fit_scaler(x)
    file_names = df.index.to_list()
    audio_file_name = file_names[audio_number]
    mel_fb = librosa.filters.mel(sr=32000, n_fft=1024, n_mels=64).T
    sample_rate = 32000

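    # Build a projected-gradient attack and a white-noise baseline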
    with tf.Graph().as_default() as graph:
        mel_filt = tf.convert_to_tensor(mel_fb, dtype=tf.float32)
        model = CleverHansModel(save_path + '.meta', sample_rate, generator,
                                mel_filt)
        pcm = tf.placeholder(tf.float32, shape=[None], name='input_audio')
        saver = model.build_graph(pcm)
        pgd = PGM.ProjectedGradientDescent(model, rms_ratio=6)
        pgd.build_attack(pcm)
        bwn = BWN.BaselineWhiteNoise(model, rms_ratio=6)
        bwn.build_attack(pcm)

    with tf.Session(graph=graph) as sess:
        saver.restore(sess, save_path)
        for i in range(len(file_names)):
            audio_file_name = file_names[i]
            try:
                data, q = _preprocess_data(audio_path, audio_file_name)
            except EOFError:
                print("EOF Error dammit")
                continue  # Skip files that cannot be decoded
            label_name = _get_label_from_audio(audio_path, audio_file_name,
                                               metadata_path)
            labels = _convert_label_name_to_label(label_name)
            s = sess.run([model.get_probs()],
                         feed_dict={'input_audio:0': data})

            s = np.squeeze(s)
            if (s.ndim != 1):
                s = np.max(s, axis=0)

            if (np.argmax(s) == labels):

                print('Iteration number:', i)
                print('Original label number:', np.argmax(s))
                print('Original label confidence:', np.max(s))
                labels = np.repeat(labels, int(q))
                adv, mae, label, confidence = pgd.attack(data, labels, 1, sess)
                librosa.output.write_wav(
                    save_data_path + 'fgsm/' + audio_file_name[:-4] +
                    '-adv.wav', adv, sample_rate)
                df_fgsm = pd.DataFrame(columns=[
                    'audio_name', 'audio_length', 'original_label',
                    'original_confidence', 'new_label', 'new_confidence',
                    'mean_absolute_error'
                ])

                df_fgsm = df_fgsm.append(
                    {
                        'audio_name': audio_file_name,
                        'audio_length': data.shape[0],
                        'original_label': np.argmax(s),
                        'original_confidence': np.max(s),
                        'new_label': label,
                        'new_confidence': confidence,
                        'mean_absolute_error': mae
                    },
                    ignore_index=True)
                with open(save_data_path + 'fgsm.csv', 'a') as f:
                    df_fgsm.to_csv(f, header=False)

                adv, mae, label, confidence = bwn.attack(data, sess)
                librosa.output.write_wav(
                    save_data_path + 'baseline/' + audio_file_name[:-4] +
                    '-adv.wav', adv, sample_rate)
                df_baseline = pd.DataFrame(columns=[
                    'audio_name', 'audio_length', 'original_label',
                    'original_confidence', 'new_label', 'new_confidence',
                    'mean_absolute_error'
                ])
                df_baseline = df_baseline.append(
                    {
                        'audio_name': audio_file_name,
                        'audio_length': data.shape[0],
                        'original_label': np.argmax(s),
                        'original_confidence': np.max(s),
                        'new_label': label,
                        'new_confidence': confidence,
                        'mean_absolute_error': mae
                    },
                    ignore_index=True)
                with open(save_data_path + 'baseline.csv', 'a') as f:
                    df_baseline.to_csv(f, header=False)