Пример #1
0
def sed_event_wise_tps_fns_fps(sed_pds, sed_gts, at_detected2d, sed_thres):
    """Count event-based true positives, false negatives and false positives.

    Frame-wise predictions are binarised against `sed_thres`, masked by the
    clip-level detections, converted to (onset, offset) pairs with
    `vad.activity_detection`, and matched against the ground-truth pairs by
    onset with `mir_eval.transcription.match_note_onsets`.

    Args:
      sed_pds: frame-wise predictions, shape (n_clips, n_time, n_classes).
      sed_gts: frame-wise ground truth, indexed the same way as sed_pds.
      at_detected2d: clip-level mask, indexed as (n_clips, n_classes); it
          is broadcast over the time axis and multiplied into the
          binarised predictions.
      sed_thres: threshold used both for binarising sed_pds and as the
          threshold argument of `vad.activity_detection`.

    Returns:
      (tps, fns, fps): three lists of length n_classes holding the counts
          summed over all clips for each class.
    """
    n_clips, _, n_classes = sed_pds.shape

    # 1 where the prediction exceeds the threshold, 0 where it is below
    # (0.5 exactly at the threshold), then zeroed for every class that the
    # clip-level mask marks as absent.
    detected3d = (np.sign(sed_pds - sed_thres) + 1.) / 2
    detected3d *= at_detected2d[:, None, :]

    # Event based SED.
    tps, fns, fps = [], [], []
    # `range` instead of the Python-2-only `xrange`, so the function also
    # runs on Python 3.
    for class_ix in range(n_classes):
        total_tp = 0
        total_fn = 0
        total_fp = 0
        for clip_ix in range(n_clips):
            pd_pairs = np.array(vad.activity_detection(
                detected3d[clip_ix, :, class_ix], sed_thres,
                n_smooth=24, n_salt=4))
            gt_pairs = np.array(vad.activity_detection(
                sed_gts[clip_ix, :, class_ix], sed_thres,
                n_smooth=24, n_salt=4))

            # Matching is only attempted when both sides have at least
            # one event; otherwise there are no true positives.
            if len(gt_pairs) > 0 and len(pd_pairs) > 0:
                # Original author's note: onset collar = 500 ms.
                matching = mir_eval.transcription.match_note_onsets(
                    gt_pairs, pd_pairs, onset_tolerance=12)
                tp = len(matching)
            else:
                tp = 0

            total_tp += tp
            total_fn += len(gt_pairs) - tp
            total_fp += len(pd_pairs) - tp

        tps.append(total_tp)
        fns.append(total_fn)
        fps.append(total_fp)

    return tps, fns, fps
Пример #2
0
def get_est_event_list(sed_outputs, audio_names, labels):
    """Turn frame-wise SED outputs into a flat list of event dicts.

    Args:
      sed_outputs: array indexed as [audio, frame, class].
      audio_names: iterable of audio names, aligned with the first axis of
          sed_outputs.
      labels: sized collection; only its length is used, as the number of
          classes to scan.

    Returns:
      list of dicts with keys 'filename', 'onset', 'offset' and
      'event_label'. Onset and offset are the frame indices returned by
      `activity_detection` multiplied by the hop duration in seconds.
    """
    # Duration of one hop in seconds, derived from the config values.
    frame_step = config.window_size - config.overlap
    frame_seconds = frame_step / float(config.sample_rate)
    index_to_label = config.ix_to_lb

    events = []

    for clip_ix, clip_name in enumerate(audio_names):
        for class_ix in range(len(labels)):
            segments = activity_detection(
                sed_outputs[clip_ix, :, class_ix],
                thres=0.2,
                low_thres=0.1,
                n_smooth=10,
                n_salt=10)

            # Nothing detected for this (clip, class) pair.
            if len(segments) == 0:
                continue

            events.extend(
                {
                    'filename': clip_name,
                    'onset': start_frame * frame_seconds,
                    'offset': stop_frame * frame_seconds,
                    'event_label': index_to_label[class_ix]
                }
                for [start_frame, stop_frame] in segments)

    return events
Пример #3
0
def frame_prediction_to_event_prediction(output_dict, sed_params_dict):
    """Convert frame-wise predictions into a list of sound events.

    A class is only searched for events in a clip when its clip-wise output
    exceeds the audio tagging threshold of that class.

    Args:
      output_dict: {
          'audio_name': (audios_num),
          'clipwise_output': (audios_num, classes_num),
          'framewise_output': (audios_num, frames_num, classes_num)}
      sed_params_dict: {
          'audio_tagging_threshold': float between 0 and 1,
          'sed_high_threshold': float between 0 and 1,
          'sed_low_threshold': float between 0 and 1,
          'n_smooth': int, silence between the same sound event shorter than
              this number will be filled with the sound event
          'n_salt': int, sound event shorter than this number will be removed}
          Every value may be a single scalar, which is applied to all
          classes, or a list / tuple / array holding one value per class.
          The dict passed in is not modified.

    Returns:
      event_list: list of dicts with keys 'filename', 'onset', 'offset' and
          'event_label'; onset and offset are frame indices divided by
          config.frames_per_second.
    """
    (audios_num, _, classes_num) = output_dict['framewise_output'].shape
    frames_per_second = float(config.frames_per_second)
    labels = config.labels

    def _per_class(value):
        """Return value unchanged if it is already per-class, else repeat it."""
        # Real type checks instead of matching 'list' in the type's name.
        # The `ndim` probe accepts array-like objects with one or more axes
        # without this block needing to reference numpy by name.
        if isinstance(value, (list, tuple)) or getattr(value, 'ndim', 0) > 0:
            return value
        return [value] * classes_num

    # Expand into a local dict so the caller's sed_params_dict keeps the
    # values it was given.
    params = {
        key: _per_class(sed_params_dict[key])
        for key in ('audio_tagging_threshold', 'sed_high_threshold',
                    'sed_low_threshold', 'n_smooth', 'n_salt')
    }

    event_list = []

    for n in range(audios_num):
        for k in range(classes_num):
            # Clip-level gate: skip classes not tagged in this clip.
            if not (output_dict['clipwise_output'][n, k]
                    > params['audio_tagging_threshold'][k]):
                continue

            bgn_fin_pairs = activity_detection(
                x=output_dict['framewise_output'][n, :, k],
                thres=params['sed_high_threshold'][k],
                low_thres=params['sed_low_threshold'][k],
                n_smooth=params['n_smooth'][k],
                n_salt=params['n_salt'][k])

            for pair in bgn_fin_pairs:
                event_list.append({
                    'filename': output_dict['audio_name'][n],
                    'onset': pair[0] / frames_per_second,
                    'offset': pair[1] / frames_per_second,
                    'event_label': labels[k]
                })

    return event_list
Пример #4
0
def write_submission(output_dict, sed_params_dict, submission_path):
    '''Write detected sound events to a tab-separated submission file.

    The file starts with a header row (filename, onset, offset,
    event_label) followed by one row per detected event.

    Args:
      output_dict: {
          'audio_name': (audios_num),
          'clipwise_output': (audios_num, classes_num),
          'framewise_output': (audios_num, frames_num, classes_num)}
      sed_params_dict: {
          'audio_tagging_threshold': float between 0 and 1, clip-level
              gate; when the key is absent 'sed_high_threshold' is used
              for the gate instead
          'sed_high_threshold': float between 0 and 1,
          'sed_low_threshold': float between 0 and 1,
          'n_smooth': int, silence between the same sound event shorter than
              this number will be filled with the sound event
          'n_salt': int, sound event shorter than this number will be removed}
      submission_path: string, path to write out submission
    '''
    (audios_num, _, classes_num) = output_dict['framewise_output'].shape
    frames_per_second = float(config.frames_per_second)
    labels = config.labels

    # The clip-wise output is an audio tagging score, so it is gated by the
    # documented audio tagging threshold. Falling back to the SED high
    # threshold keeps callers working that never supplied the tagging key.
    if 'audio_tagging_threshold' in sed_params_dict:
        tagging_threshold = sed_params_dict['audio_tagging_threshold']
    else:
        tagging_threshold = sed_params_dict['sed_high_threshold']

    # The context manager closes the file even if a lookup or the
    # detection call raises part-way through.
    with open(submission_path, 'w') as f:
        f.write('{}\t{}\t{}\t{}\n'.format('filename', 'onset', 'offset',
                                          'event_label'))

        for n in range(audios_num):
            for k in range(classes_num):
                if not output_dict['clipwise_output'][n, k] > tagging_threshold:
                    continue

                bgn_fin_pairs = activity_detection(
                    x=output_dict['framewise_output'][n, :, k],
                    thres=sed_params_dict['sed_high_threshold'],
                    low_thres=sed_params_dict['sed_low_threshold'],
                    n_smooth=sed_params_dict['n_smooth'],
                    n_salt=sed_params_dict['n_salt'])

                for pair in bgn_fin_pairs:
                    # Frame indices -> seconds.
                    bgn_time = pair[0] / frames_per_second
                    fin_time = pair[1] / frames_per_second
                    f.write('{}\t{}\t{}\t{}\n'.format(
                        output_dict['audio_name'][n], bgn_time, fin_time,
                        labels[k]))

    logging.info('    Write submission file to {}'.format(submission_path))
Пример #5
0
def calculate_estimated_event_list(audio_names, predictions, frame_wise_probs,
                                   seconds_per_frame, sed_thres,
                                   sed_low_thres):
    """Build the estimated event list from frame-wise probabilities.

    Args:
      audio_names: list of str.
      predictions: indexable by audio; every element of predictions[n] is
          used directly as a class index into frame_wise_probs and
          config.ix_to_lb.
          NOTE(review): this was previously documented as an
          (audios_num, classes_num) array of 0/1 values, which does not
          match how the loop consumes it - confirm against the caller.
      frame_wise_probs: (audios_num, time_steps, classes_num)
      seconds_per_frame: float, multiplier turning frame indices into
          seconds.
      sed_thres: passed to activity_detection as `thres`.
      sed_low_thres: passed to activity_detection as `low_thres`.

    Returns:
      list of event dicts with keys 'filename', 'onset', 'offset' and
      'event_label'.
    """
    index_to_label = config.ix_to_lb
    events = []

    for clip_ix, clip_name in enumerate(audio_names):
        for class_ix in predictions[clip_ix]:
            class_probs = frame_wise_probs[clip_ix, :, class_ix]
            segments = activity_detection(class_probs,
                                          thres=sed_thres,
                                          low_thres=sed_low_thres,
                                          n_smooth=1,
                                          n_salt=0)

            # One dict per detected (begin, end) frame pair.
            events.extend(
                {
                    'filename': clip_name,
                    'onset': start_frame * seconds_per_frame,
                    'offset': stop_frame * seconds_per_frame,
                    'event_label': index_to_label[class_ix]
                }
                for [start_frame, stop_frame] in segments)

    return events
def print_stats(at_pds, at_gts, sed_pds, sed_gts):
    """Log audio tagging (AT) and sound event detection (SED) statistics.

    Three sections are written through `logging.info`: audio tagging,
    frame-based SED and event-based SED. Each section reports global
    ('micro'), averaged ('macro') and per-class rows.

    Args:
      at_pds: clip-level predictions, passed to the AT metric helpers and
          to `metrics.roc_auc_score`.
      at_gts: clip-level ground truth, paired with at_pds.
      sed_pds: frame-level predictions, shape (n_clips, n_time, n_classes).
      sed_gts: frame-level ground truth; reshaped the same way as sed_pds.

    Note:
      `at_thres` and `sed_thres` are not parameters; they are read from
      the surrounding scope and must be defined there.
    """
    import sys

    # NaN is rejected as a print threshold by current NumPy releases;
    # sys.maxsize requests the same untruncated output.
    np.set_printoptions(threshold=sys.maxsize,
                        linewidth=1000,
                        precision=3,
                        suppress=True)
    events = cfg.events

    # AT evaluate.
    # TP, FN, FP, TN
    logging.info("====== Audio tagging (AT) ======")
    logging.info("%stp\tfn\tfp\ttn" % "".ljust(16))
    (tp, fn, fp, tn) = tp_fn_fp_tn(at_pds, at_gts, at_thres, average='micro')
    logging.info("%s%d\t%d\t%d\t%d\t" % ("Global".ljust(16), tp, fn, fp, tn))

    (tps, fns, fps, tns) = tp_fn_fp_tn(at_pds, at_gts, at_thres, average=None)
    # `range` instead of the Python-2-only `xrange` throughout.
    for i1 in range(len(tps)):
        logging.info(
            "%s%d\t%d\t%d\t%d\t" %
            (events[i1].ljust(16), tps[i1], fns[i1], fps[i1], tns[i1]))

    # Prec, recall, fvalue, AUC, eer
    logging.info("%sPrec\tRecall\tFvalue\tAUC\tEER" % "".ljust(16))
    (prec, recall, fvalue) = prec_recall_fvalue(at_pds,
                                                at_gts,
                                                at_thres,
                                                average='micro')
    auc = metrics.roc_auc_score(at_gts, at_pds, average='micro')
    eer = equal_error_rate(at_pds, at_gts, average='micro')
    logging.info("%s*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f" %
                 ("*Global".ljust(16), prec, recall, fvalue, auc, eer))
    (prec, recall, fvalue) = prec_recall_fvalue(at_pds,
                                                at_gts,
                                                at_thres,
                                                average='macro')
    auc = metrics.roc_auc_score(at_gts, at_pds, average='macro')
    eer = equal_error_rate(at_pds, at_gts, average='macro')
    logging.info("%s*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f" %
                 ("*Avg. of each".ljust(16), prec, recall, fvalue, auc, eer))

    (precs, recalls, fvalues) = prec_recall_fvalue(at_pds,
                                                   at_gts,
                                                   at_thres,
                                                   average=None)
    aucs = metrics.roc_auc_score(at_gts, at_pds, average=None)
    eers = equal_error_rate(at_pds, at_gts, average=None)
    for i1 in range(len(tps)):
        logging.info("%s%.3f\t%.3f\t%.3f\t%.3f\t%.3f" %
                     (events[i1].ljust(16), precs[i1], recalls[i1],
                      fvalues[i1], aucs[i1], eers[i1]))

    # SED evaluate
    logging.info("====== Frame based SED ======")

    (n_clips, n_time, n_classes) = sed_pds.shape

    logging.info("%stp\tfn\tfp\ttn" % "".ljust(16))
    # Flatten clips and time into one axis so every frame is one sample.
    sed_pds_2d = sed_pds.reshape((n_clips * n_time, n_classes))
    sed_gts_2d = sed_gts.reshape((n_clips * n_time, n_classes))
    (tp, fn, fp, tn) = tp_fn_fp_tn(sed_pds_2d,
                                   sed_gts_2d,
                                   sed_thres,
                                   average='micro')
    logging.info("%s*%d\t*%d\t*%d\t*%d\t" %
                 ("*Global".ljust(16), tp, fn, fp, tn))

    (tps, fns, fps, tns) = tp_fn_fp_tn(sed_pds_2d,
                                       sed_gts_2d,
                                       sed_thres,
                                       average=None)
    for i1 in range(len(tps)):
        logging.info(
            "%s%d\t%d\t%d\t%d\t" %
            (events[i1].ljust(16), tps[i1], fns[i1], fps[i1], tns[i1]))

    # Prec, recall, fvalue
    logging.info("%sPrec\tRecall\tFvalue\tAUC\tER\tn_sub\tn_del\tn_ins" %
                 "".ljust(16))
    (prec, recall, fvalue) = prec_recall_fvalue(sed_pds_2d,
                                                sed_gts_2d,
                                                sed_thres,
                                                average='micro')
    auc = metrics.roc_auc_score(sed_gts_2d, sed_pds_2d, average='micro')
    (er, n_sub, n_del, n_ins) = error_rate(sed_pds_2d,
                                           sed_gts_2d,
                                           sed_thres,
                                           average='micro')
    logging.info("%s*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f" %
                 ("*Global".ljust(16), prec, recall, fvalue, auc, er, n_sub,
                  n_del, n_ins))
    (prec, recall, fvalue) = prec_recall_fvalue(sed_pds_2d,
                                                sed_gts_2d,
                                                sed_thres,
                                                average='macro')
    auc = metrics.roc_auc_score(sed_gts_2d, sed_pds_2d, average='macro')
    (er, n_sub, n_del, n_ins) = error_rate(sed_pds_2d,
                                           sed_gts_2d,
                                           sed_thres,
                                           average='macro')
    logging.info("%s*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f" %
                 ("*Avg. of each".ljust(16), prec, recall, fvalue, auc, er,
                  n_sub, n_del, n_ins))

    (precs, recalls, fvalues) = prec_recall_fvalue(sed_pds_2d,
                                                   sed_gts_2d,
                                                   sed_thres,
                                                   average=None)
    aucs = metrics.roc_auc_score(sed_gts_2d, sed_pds_2d, average=None)
    (ers, n_subs, n_dels, n_inss) = error_rate(sed_pds_2d,
                                               sed_gts_2d,
                                               sed_thres,
                                               average=None)
    for i1 in range(len(tps)):
        logging.info(
            "%s%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f" %
            (events[i1].ljust(16), precs[i1], recalls[i1], fvalues[i1],
             aucs[i1], ers[i1], n_subs[i1], n_dels[i1], n_inss[i1]))

    logging.info("====== Event based SED ======")

    # Binarise the clip-level predictions: 1 above at_thres, 0 below.
    at_detected2d = (np.sign(at_pds - at_thres) + 1.) / 2
    (tps, fns, fps) = sed_event_wise_tps_fns_fps(sed_pds, sed_gts,
                                                 at_detected2d, sed_thres)

    logging.info("%stp\tfn\tfp" % "".ljust(16))
    logging.info("%s*%d\t*%d\t*%d" %
                 ("*Total:".ljust(16), np.sum(tps), np.sum(fns), np.sum(fps)))
    for i1 in range(n_classes):
        logging.info("%s%d\t%d\t%d" %
                     (events[i1].ljust(16), tps[i1], fns[i1], fps[i1]))

    logging.info("------ Prec, recall, fvalue ------")
    logging.info("%sPrec\tRecall\tFvalue\tAUC\tER\tn_sub\tn_del\tn_ins" %
                 "".ljust(16))
    # NOTE(review): no AUC is computed for the event-based section. The
    # `auc` / `aucs` values logged below are still the frame-based ones
    # from the section above (`auc` is the macro value in both summary
    # rows). Kept as-is to preserve the log layout - confirm the intent.
    (prec, recall, fvalue) = prec_recall_fvalue_from_tp_fn_fp(tps,
                                                              fns,
                                                              fps,
                                                              average='micro')
    (er, n_sub, n_del, n_ins) = error_rate_from_tp_fn_fp(tps,
                                                         fns,
                                                         fps,
                                                         average='micro')
    logging.info("%s*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f" %
                 ("*Global".ljust(16), prec, recall, fvalue, auc, er, n_sub,
                  n_del, n_ins))
    (prec, recall, fvalue) = prec_recall_fvalue_from_tp_fn_fp(tps,
                                                              fns,
                                                              fps,
                                                              average='macro')
    (er, n_sub, n_del, n_ins) = error_rate_from_tp_fn_fp(tps,
                                                         fns,
                                                         fps,
                                                         average='macro')
    logging.info("%s*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f" %
                 ("*Avg. of each".ljust(16), prec, recall, fvalue, auc, er,
                  n_sub, n_del, n_ins))

    (precs, recalls, fvalues) = prec_recall_fvalue_from_tp_fn_fp(tps,
                                                                 fns,
                                                                 fps,
                                                                 average=None)
    (ers, n_subs, n_dels, n_inss) = error_rate_from_tp_fn_fp(tps,
                                                             fns,
                                                             fps,
                                                             average=None)
    for i1 in range(n_classes):
        logging.info(
            "%s%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f" %
            (events[i1].ljust(16), precs[i1], recalls[i1], fvalues[i1],
             aucs[i1], ers[i1], n_subs[i1], n_dels[i1], n_inss[i1]))