def eval(md, x, y, out_dir, out_probs_dir, iter_):

    # Predict
    t1 = time.time()
    (n_clips, n_time, n_freq) = x.shape
    (x, y) = pp_data.transform_data(x, y)
    prob = md.predict(x)
    prob = prob.astype(np.float32)

    if out_dir:
        pp_data.create_folder(out_dir)
    # Dump predicted probabilities for future averaging
    if out_probs_dir:
        pp_data.create_folder(out_probs_dir)
        out_prob_path = os.path.join(out_probs_dir, "prob_%d_iters.p" % iter_)
        cPickle.dump(prob,
                     open(out_prob_path, 'wb'),
                     protocol=cPickle.HIGHEST_PROTOCOL)

    # Compute and dump stats
    n_out = y.shape[1]
    stats = []
    t1 = time.time()
    for k in range(n_out):
        (precisions, recalls,
         thresholds) = metrics.precision_recall_curve(y[:, k], prob[:, k])
        avg_precision = metrics.average_precision_score(y[:, k],
                                                        prob[:, k],
                                                        average=None)
        (fpr, tpr, thresholds) = metrics.roc_curve(y[:, k], prob[:, k])
        auc = metrics.roc_auc_score(y[:, k], prob[:, k], average=None)
        #eer = pp_data.eer(prob[:, k], y[:, k])
        skip = 1000
        stat = {
            'precisions': precisions[0::skip],
            'recalls': recalls[0::skip],
            'AP': avg_precision,
            'fpr': fpr[0::skip],
            'fnr': 1. - tpr[0::skip],
            'auc': auc
        }
        stats.append(stat)
    logging.info("Callback time: %s" % (time.time() - t1, ))

    dump_path = os.path.join(out_dir, "md%d_iters.p" % iter_)
    cPickle.dump(stats,
                 open(dump_path, 'wb'),
                 protocol=cPickle.HIGHEST_PROTOCOL)

    logging.info("mAP: %f" % np.mean([e['AP'] for e in stats]))
def eval(model, x, y, out_dir, out_probs_dir, md_iter):
    pp_data.create_folder(out_dir)

    # Predict
    t1 = time.time()
    (n_clips, n_time_, n_freq) = x.shape
    (x, y) = pp_data.transform_data(x, y)
    prob = model.predict(x)
    prob = prob.astype(np.float32)
    print("The %d time into evalution." % md_iter)
    if out_probs_dir:
        pp_data.create_folder(out_probs_dir)
        out_prob_path = os.path.join(out_probs_dir,
                                     "prob_%d_iters.p" % md_iter)
        # Dump predicted probabilities for future averaging (disabled here)
        #cPickle.dump(prob, open(out_prob_path, 'wb'))

    # Compute and dump stats
    n_out = y.shape[1]
    stats = []
    t1 = time.time()
    for k in range(n_out):
        (precisions, recalls,
         thresholds) = metrics.precision_recall_curve(y[:, k], prob[:, k])
        avg_precision = metrics.average_precision_score(y[:, k],
                                                        prob[:, k],
                                                        average=None)
        (fpr, tpr, thresholds) = metrics.roc_curve(y[:, k], prob[:, k])
        auc = metrics.roc_auc_score(y[:, k], prob[:, k], average=None)
        eer = pp_data.eer(prob[:, k], y[:, k])
        skip = 1000
        stat = {
            'precisions': precisions[0::skip],
            'recalls': recalls[0::skip],
            'AP': avg_precision,
            'fpr': fpr[0::skip],
            'fnr': 1. - tpr[0::skip],
            'auc': auc,
            'eer': eer
        }
        stats.append(stat)

    logging.info("Callback time: %s" % (time.time() - t1, ))
    dump_path = os.path.join(out_dir, "model_%d_iters.p" % (md_iter, ))
    cPickle.dump(stats, open(dump_path, 'wb'))
    mAP = np.mean([e['AP'] for e in stats])
    logging.info("mAP of %d iteration: %f" % (md_iter, mAP))
    return mAP
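
# Hypothetical helper (assumption, not in the original source): the probability
# files written as "prob_%d_iters.p" are dumped "for future average"; a sketch of
# that averaging over several checkpoints could look like this. Paths and
# iteration numbers in the example call are illustrative.
def average_dumped_probs(probs_dir, iters):
    import os
    import cPickle
    import numpy as np
    probs = []
    for it in iters:
        with open(os.path.join(probs_dir, "prob_%d_iters.p" % it), 'rb') as f:
            probs.append(cPickle.load(f))
    # Element-wise mean over the selected checkpoints.
    return np.mean(np.array(probs), axis=0)

# Example: avg_prob = average_dumped_probs("probs/eval", [1000, 2000, 3000])
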
def train(args):
    cpickle_dir = args.cpickle_dir
    workspace = args.workspace

    # Path of hdf5 data
    bal_train_hdf5_path = os.path.join(cpickle_dir, "bal_train.h5")
    unbal_train_hdf5_path = os.path.join(cpickle_dir, "unbal_train.h5")
    eval_hdf5_path = os.path.join(cpickle_dir, "eval.h5")

    # Load data
    t1 = time.time()
    (tr_x1, tr_y1, tr_id_list1) = pp_data.load_data(bal_train_hdf5_path)
    (tr_x2, tr_y2, tr_id_list2) = pp_data.load_data(unbal_train_hdf5_path)
    tr_x = np.concatenate((tr_x1, tr_x2))
    tr_y = np.concatenate((tr_y1, tr_y2))
    tr_id_list = tr_id_list1 + tr_id_list2

    (te_x, te_y, te_id_list) = pp_data.load_data(eval_hdf5_path)
    logging.info("Loading data time: %s s" % (time.time() - t1))

    logging.info("tr_x1.shape: %s, tr_x2.shape: %s" % (tr_x1.shape, tr_x2.shape))
    logging.info("tr_x.shape: %s" % (tr_x.shape, ))

    (_, n_time, n_freq) = tr_x.shape

    # Build model
    n_hid = 500
    n_out = tr_y.shape[1]

    lay_in = InputLayer(in_shape=(n_time, n_freq))
    a = Dense(n_out=n_hid, act='relu')(lay_in)
    a = Dropout(p_drop=0.2)(a)
    a = Dense(n_out=n_hid, act='relu')(a)
    a = Dropout(p_drop=0.2)(a)
    a = Dense(n_out=n_hid, act='relu')(a)
    a = Dropout(p_drop=0.2)(a)
    cla = Dense(n_out=n_out, act='sigmoid', name='cla')(a)
    att = Dense(n_out=n_out, act='softmax', name='att')(a)

    # Attention
    lay_out = Lambda(_attention)([cla, att])

    # Compile model
    md = Model(in_layers=[lay_in], out_layers=[lay_out])
    md.compile()
    md.summary(is_logging=True)

    # Save model every several iterations
    call_freq = 1000
    dump_fd = os.path.join(workspace, "models", pp_data.get_filename(__file__))
    pp_data.create_folder(dump_fd)
    save_model = SaveModel(dump_fd=dump_fd,
                           call_freq=call_freq,
                           type='iter',
                           is_logging=True)

    # Callbacks function
    callbacks = [save_model]

    batch_size = 500
    tr_gen = RatioDataGenerator(batch_size=batch_size, type='train')

    # Optimization method
    optimizer = Adam(lr=args.lr)

    # Train
    stat_dir = os.path.join(workspace, "stats", pp_data.get_filename(__file__))
    pp_data.create_folder(stat_dir)
    prob_dir = os.path.join(workspace, "probs", pp_data.get_filename(__file__))
    pp_data.create_folder(prob_dir)

    tr_time = time.time()
    for (tr_batch_x, tr_batch_y) in tr_gen.generate(xs=[tr_x], ys=[tr_y]):
        # Compute stats every several iterations
        if md.iter_ % call_freq == 0:
            # Stats of evaluation dataset
            t1 = time.time()
            te_err = eval(md=md,
                          x=te_x,
                          y=te_y,
                          out_dir=os.path.join(stat_dir, "test"),
                          out_probs_dir=os.path.join(prob_dir, "test"),
                          iter_=md.iter_)
            logging.info("Evaluate test time: %s" % (time.time() - t1, ))

            # Stats of training dataset
            t1 = time.time()
            tr_bal_err = eval(md=md,
                              x=tr_x1,
                              y=tr_y1,
                              out_dir=os.path.join(stat_dir, "train_bal"),
                              out_probs_dir=None,
                              iter_=md.iter_)
            logging.info("Evaluate tr_bal time: %s" % (time.time() - t1, ))

        # Update params
        (tr_batch_x,
         tr_batch_y) = pp_data.transform_data(tr_batch_x, tr_batch_y)
        md.train_on_batch(batch_x=tr_batch_x,
                          batch_y=tr_batch_y,
                          loss_func='binary_crossentropy',
                          optimizer=optimizer,
                          callbacks=callbacks)

        # Stop training when the maximum number of iterations is reached
        if md.iter_ == call_freq * 31:
            break
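
# Hypothetical CLI sketch (assumption, not the original front-end): the train()
# above only reads args.cpickle_dir, args.workspace and args.lr, so it could be
# driven by an argparse wrapper like the one below. Argument names and the
# default learning rate are illustrative.
def _main_sketch():
    import argparse
    parser = argparse.ArgumentParser(
        description="Train an attention model on AudioSet embeddings.")
    parser.add_argument('--cpickle_dir', type=str, required=True)
    parser.add_argument('--workspace', type=str, required=True)
    parser.add_argument('--lr', type=float, default=1e-3)
    args = parser.parse_args()
    train(args)
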
def train(args):
    EVAL_MAP = -1000.
    PATIENCE = 0
    data_dir = args.data_dir
    workspace = args.workspace
    tag = args.tag
    levels = args.levels
    # Paths for the hdf5 data
    bal_train_path = os.path.join(data_dir, "bal_train.h5")
    unbal_train_path = os.path.join(data_dir, "unbal_train.h5")
    eval_path = os.path.join(data_dir, "eval.h5")

    # Load data
    t1 = time.time()
    (tr_x1, tr_y1, tr_id_list1) = pp_data.load_data(bal_train_path)
    (tr_x2, tr_y2, tr_id_list2) = pp_data.load_data(unbal_train_path)
    (eval_x, eval_y, eval_id_list) = pp_data.load_data(eval_path)
    #tr_x = tr_x1
    #tr_y = tr_y1
    #tr_id_list = tr_id_list1
    tr_x = np.concatenate((tr_x1, tr_x2))
    tr_y = np.concatenate((tr_y1, tr_y2))
    tr_id_list = tr_id_list1 + tr_id_list2

    logging.info("Loading dat time: %s s" % (time.time() - t1))
    logging.info(tr_x1.shape, tr_x2.shape)
    logging.info("tr_x.shape: %s" % (tr_x.shape, ))

    (_, n_time, n_freq) = tr_x.shape

    # Build Model

    model = get_ml_attention(levels)
    logging.info(model.to_json())
    # Optimization method
    optimizer = Adam(lr=args.lr)
    model.compile(loss='binary_crossentropy',
                  optimizer=optimizer,
                  metrics=['binary_accuracy'])
    #logging.info(model.summary())
    #
    batch_size = 500
    tr_gen = RatioDataGenerator(batch_size=batch_size, type='train')
    # Save Model every call_freq iterations
    model_iter = 0
    call_freq = 1000
    dump_fd = os.path.join(workspace, "models", pp_data.get_filename(__file__))
    pp_data.create_folder(dump_fd)

    # Train
    stat_dir = os.path.join(workspace, "stats", pp_data.get_filename(__file__))
    pp_data.create_folder(stat_dir)
    prob_dir = os.path.join(workspace, "probs", pp_data.get_filename(__file__))
    pp_data.create_folder(prob_dir)

    tr_time = time.time()

    for (tr_batch_x, tr_batch_y) in tr_gen.generate(xs=[tr_x], ys=[tr_y]):
        # Computes stats every several iterations
        print(model_iter)
        if model_iter % call_freq == 0:  # every 1000 iterations
            # Stats of evaluation dataset
            t1 = time.time()
            eval_MAP = eval(model=model,
                            x=eval_x,
                            y=eval_y,
                            out_dir=os.path.join(stat_dir, "eval"),
                            out_probs_dir=os.path.join(prob_dir, "eval"),
                            md_iter=model_iter)

            logging.info("Evaluate evaluation-set time: %s" %
                         (time.time() - t1, ))
            if eval_MAP >= EVAL_MAP:
                #md_name = "/scratch/work/xuz2/model_" + tag + "_.h5"
                md_name = tag + "_.h5"
                model.save(md_name)
                EVAL_MAP = eval_MAP
                PATIENCE = 0
            else:
                PATIENCE += 1
                logging.info("Patience now: %d" % (PATIENCE, ))
                if PATIENCE >= 10:
                    break
            #	print("Training stop at %s iterations" % (model_iter,))
            #	break
            # Stats of training dataset
            #t1 =time.time()
            #tr_bal_err = eval(model=model, x=tr_x1, y=tr_y1,

            #				  out_dir=os.path.join(stat_dir, "train_bal"),
            ##				  out_probs_dir=None,
            #				  md_iter=model_iter)
            #logging.info("Evaluate tr_bal time: %s" % (time.time() - t1,))

            # Save Model
            #if eval_MAP > 0.342:
            #	md_name = "/scratch/work/xuz2/model_" + str(model_iter) + "_.h5"
            #	model.save(md_name)

        # Update params
        (tr_batch_x,
         tr_batch_y) = pp_data.transform_data(tr_batch_x, tr_batch_y)
        model.train_on_batch(tr_batch_x, tr_batch_y)

        model_iter += 1

        # Stop training when the maximum number of iterations is reached
        if model_iter == call_freq * 151:
            break
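
# Hypothetical follow-up (assumption, not in the original source): the best
# checkpoint saved above via model.save(md_name) can later be reloaded for
# inference. If the architecture from get_ml_attention() contains Lambda layers
# (e.g. _attention), the wrapped function may need to be passed through
# custom_objects when loading.
def load_best_model(tag):
    from keras.models import load_model
    return load_model(tag + "_.h5", custom_objects={'_attention': _attention})
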
def train(args):
    cpickle_dir = args.cpickle_dir
    workspace = args.workspace

    # Path of hdf5 data
    bal_train_hdf5_path = os.path.join(cpickle_dir, "bal_train.h5")
    unbal_train_hdf5_path = os.path.join(cpickle_dir, "unbal_train.h5")
    eval_hdf5_path = os.path.join(cpickle_dir, "eval.h5")

    # Load data
    t1 = time.time()
    (tr_x1, tr_y1, tr_id_list1) = pp_data.load_data(bal_train_hdf5_path)
    (tr_x2, tr_y2, tr_id_list2) = pp_data.load_data(unbal_train_hdf5_path)
    print(tr_x1.shape)
    print(tr_x2.shape)
    tr_x = np.concatenate((tr_x1, tr_x2))
    tr_y = np.concatenate((tr_y1, tr_y2))
    tr_id_list = tr_id_list1 + tr_id_list2

    (te_x, te_y, te_id_list) = pp_data.load_data(eval_hdf5_path)
    logging.info("Loading data time: %s s" % (time.time() - t1))

    logging.info("tr_x1.shape: %s, tr_x2.shape: %s" % (tr_x1.shape, tr_x2.shape))
    logging.info("tr_x.shape: %s" % (tr_x.shape, ))

    (_, n_time, n_freq) = tr_x.shape

    # Build model
    n_hid = 600
    n_out = tr_y.shape[1]

    lay_in = Input(shape=(n_time, n_freq))
    a_0 = BatchNormalization()(lay_in)
    a_1 = Dense(n_hid, kernel_regularizer=regularizers.l2(0.001))(a_0)
    a_1 = BatchNormalization()(a_1)
    a_1 = Activation('relu')(a_1)
    a_1 = Dropout(rate=0.4)(a_1)
    a_2 = Dense(n_hid, kernel_regularizer=regularizers.l2(0.001))(a_1)
    a_2 = BatchNormalization()(a_2)
    a_2 = Activation('relu')(a_2)
    a_2 = Dropout(rate=0.4)(a_2)
    a_3 = Dense(n_hid, kernel_regularizer=regularizers.l2(0.001))(a_2)
    a_3 = BatchNormalization()(a_3)
    a_3 = Activation('relu')(a_3)
    a_3 = Dropout(rate=0.4)(a_3)
    cla_1 = Dense(n_out, name='cla_1')(a_3)
    cla_1 = BatchNormalization()(cla_1)
    cla_1 = Activation('sigmoid')(cla_1)
    att_1 = Dense(n_out, name='att_1')(a_3)
    att_1 = BatchNormalization()(att_1)
    att_1 = Activation('softmax')(att_1)

    # Attention
    lay_out_a = Lambda(_attention,
                       output_shape=_att_output_shape)([cla_1, att_1])
    cla_2 = Dense(n_out, name='cla_2')(a_2)
    cla_2 = BatchNormalization()(cla_2)
    cla_2 = Activation('sigmoid')(cla_2)
    att_2 = Dense(n_out, name='att2')(a_2)
    att_2 = BatchNormalization()(att_2)
    att_2 = Activation('softmax')(att_2)

    lay_out_b = Lambda(_attention,
                       output_shape=_att_output_shape)([cla_2, att_2])
    lay_out_c = Concatenate(axis=1)([lay_out_a, lay_out_b])

    #lay_out = Dense(n_out, activation='sigmoid', name='output')(lay_out_c)
    lay_out = Dense(n_out, name='output')(lay_out_c)
    lay_out = BatchNormalization()(lay_out)
    lay_out = Activation('sigmoid')(lay_out)
    # Compile model
    md = Model(inputs=lay_in, outputs=lay_out)
    md.summary()

    # Save model every several iterations
    call_freq = 1000
    dump_fd = os.path.join(workspace, "models", pp_data.get_filename(__file__))
    pp_data.create_folder(dump_fd)
    # save_model = SaveModel(dump_fd=dump_fd, call_freq=call_freq, type='iter', is_logging=True)

    # Callbacks function
    #callbacks = []#save_model]

    batch_size = 500
    tr_gen = RatioDataGenerator(batch_size=batch_size, type='train')

    # Optimization method
    optimizer = Adam(lr=args.lr)
    md.compile(loss='binary_crossentropy', optimizer=optimizer)
    #callbacks=callbacks)

    # Train
    stat_dir = os.path.join(workspace, "stats", pp_data.get_filename(__file__))
    pp_data.create_folder(stat_dir)
    prob_dir = os.path.join(workspace, "probs", pp_data.get_filename(__file__))
    pp_data.create_folder(prob_dir)

    tr_time = time.time()
    iter_ = 1
    for (tr_batch_x, tr_batch_y) in tr_gen.generate(xs=[tr_x], ys=[tr_y]):
        # Compute stats every several iterations
        if iter_ % call_freq == 0:
            # Stats of evaluation dataset
            t1 = time.time()
            te_err = eval(md=md,
                          x=te_x,
                          y=te_y,
                          out_dir=os.path.join(stat_dir, "test"),
                          out_probs_dir=os.path.join(prob_dir, "test"),
                          iter_=iter_)
            logging.info("Evaluate test time: %s" % (time.time() - t1, ))

            # Stats of training dataset
            t1 = time.time()
            tr_bal_err = eval(md=md,
                              x=tr_x1,
                              y=tr_y1,
                              out_dir=os.path.join(stat_dir, "train_bal"),
                              out_probs_dir=None,
                              iter_=iter_)
            logging.info("Evaluate tr_bal time: %s" % (time.time() - t1, ))
        iter_ += 1
        # Update params
        (tr_batch_x,
         tr_batch_y) = pp_data.transform_data(tr_batch_x, tr_batch_y)
        md.train_on_batch(x=tr_batch_x, y=tr_batch_y)
        # Stop training when the maximum number of iterations is reached
        if iter_ == call_freq * 151:
            break
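
# Minimal sketch (assumption, not the original definitions): _attention and
# _att_output_shape are used in the Lambda layers above but are not defined in
# this snippet. A common formulation, following attention-pooling models for
# AudioSet, is a weighted average of the classifier output cla over the time
# axis, with weights from the attention branch att re-normalized to sum to one
# per clip.
from keras import backend as K

def _attention(inputs):
    cla, att = inputs  # each of shape (batch, n_time, n_out)
    # Re-normalize the attention weights over the time axis.
    att = K.clip(att, 1e-7, 1.)
    norm_att = att / K.sum(att, axis=1, keepdims=True)
    # Attention-weighted pooling over the time axis.
    return K.sum(cla * norm_att, axis=1)

def _att_output_shape(input_shapes):
    # Input is a list of two shapes: [(batch, n_time, n_out), (batch, n_time, n_out)].
    (shape_cla, shape_att) = input_shapes
    return (shape_cla[0], shape_cla[2])
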