Example #1
def get_magrove_generator(text_processing_func, data_dir: str, label_dict: dict):
    # The hard-coded Windows paths below are superseded by the relative paths
    # assigned immediately afterwards; they are kept here only for reference.
    # owasp_train_file = r'D:\Store\document\all_my_work\CZY\bishe\mangrove_old\lstm\data\extraction\owasp-slice-train-2.txt'
    # owasp_test_file = r'D:\Store\document\all_my_work\CZY\bishe\mangrove_old\lstm\data\extraction\owasp-slice-test-2.txt'

    owasp_train_file = settings.relative_path_from_root('data/mangrove/t-train.txt')
    owasp_test_file = settings.relative_path_from_root('data/mangrove/t-test.txt')
    train_x, train_y = parseDataFile(owasp_train_file)
    test_x, test_y = parseDataFile(owasp_test_file)

    def gen() -> (str, int):
        for idx in range(len(train_x)):
            yield train_x[idx], train_y[idx]
        for idx in range(len(test_x)):
            yield test_x[idx], test_y[idx]

    return gen
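
parseDataFile is not defined in any of these snippets. A minimal sketch, assuming the '<slice> :: <label>' line format that transform() in Example #3 writes and its truepositive/falsepositive label names:

# Hypothetical sketch of parseDataFile, matching the ' :: ' file format produced
# by transform() in Example #3; not the repository's actual implementation.
def parseDataFile(path):
    xs, ys = [], []
    with open(path) as f:
        for line in f:
            line = line.rstrip('\n')
            if not line:
                continue
            # Split on the last ' :: ' so tokens inside the slice are preserved.
            slice_text, _, label = line.rpartition(' :: ')
            xs.append(slice_text)
            ys.append(0 if label == 'truepositive' else 1)  # 0 = true positive, as in transform()
    return xs, ys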
Example #2
def server(model_npz=None, host='127.0.0.1', port=8888, debug=False):
    global MODEL
    if model_npz:
        MODEL = lstm.load_model(settings.relative_path_from_root(model_npz))
    else:
        logging.warning("Model not specify, can't predict")
    app.run(host=host, port=port, debug=debug)
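
The Flask routes themselves are not shown in these snippets, but label() (Example #4) and predict() (Example #6) read JSON bodies carrying project, flowHash, and (for prediction) slice fields, and answer under a "msg" key. A rough client-side sketch, assuming hypothetical /label and /predict route paths on the server started above:

# Hypothetical client sketch; the route paths ('/label', '/predict') and the extra
# 'label' field are assumptions, since the route decorators are not shown here.
import requests

base = 'http://127.0.0.1:8888'

# Submit one labeled flow (handled by label() in Example #4).
requests.post(base + '/label', json={
    'project': 'benchmark1.2',
    'flowHash': 'abc123',       # placeholder hash
    'label': 'truepositive',    # assumed field; label() stores the whole JSON as-is
})

# Ask the loaded LSTM model to classify a slice (handled by predict() in Example #6).
resp = requests.post(base + '/predict', json={
    'project': 'benchmark1.2',
    'flowHash': 'abc123',
    'slice': 'source() ; sanitize() ; sink()',   # placeholder slice text
})
print(resp.json()['msg'])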
Example #3
def transform(data_dir, label_dir):
    tokenizer = Tokenizer()
    train, valid, test = dataloader.load_data(data_dir, label_dir, tokenizer=tokenizer, valid_portion=0)
    with open(settings.relative_path_from_root('data/mangrove/t-train.txt'), 'w') as f:
        for idx in range(len(train[0])):
            label = 'truepositive' if train[1][idx] == 0 else 'falsepositive'
            _slice = tokenizer.decode(train[0][idx])
            f.write('{} :: {}\n'.format(_slice, label))

    with open(settings.relative_path_from_root('data/mangrove/t-test.txt'), 'w') as f:
        for idx in range(len(test[0])):
            label = 'truepositive' if test[1][idx] == 0 else 'falsepositive'
            _slice = tokenizer.decode(test[0][idx])
            f.write('{} :: {}\n'.format(_slice, label))

    with open(settings.relative_path_from_root('data/mangrove/dict.txt'), 'w') as f:
        for token, _int in tokenizer.get_token_dict().items():
            f.write('{} {}\n'.format(token, _int))
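
transform() also serializes the tokenizer vocabulary to data/mangrove/dict.txt as one 'token int' pair per line. A minimal illustrative sketch for reading that dictionary back (not part of the repository):

# Illustrative only: rebuild the token -> id mapping written to data/mangrove/dict.txt.
def load_token_dict(path):
    token_dict = {}
    with open(path) as f:
        for line in f:
            # Each line is '<token> <int>'; split on the last space in case the
            # token itself contains spaces.
            token, _, idx = line.rstrip('\n').rpartition(' ')
            if idx:
                token_dict[token] = int(idx)
    return token_dict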
Example #4
def label():
    """
    接受一条标记数据
    :return:
    """
    label_json = request.get_json()
    data_dir = settings.relative_path_from_root('data/label/' +
                                                label_json['project'])
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    with open(data_dir + "/label-" + label_json["flowHash"] + ".json",
              'w') as f:
        json.dump(label_json, f)

    return jsonify({"msg": "true"}), 200
Example #5
def train(self,
          slice_dir: str,
          label_dir: str,
          dim: int = 128,
          epochs: int = 20,
          timeout: float = 5):
    tokenizer = Tokenizer()
    current_time = time.strftime("%Y-%m-%d-%H-%M", time.localtime())
    model_file = settings.relative_path_from_root(
        'model/theano-lstm-{}.npz'.format(current_time))
    train_lstm(data_dir=slice_dir,
               label_dir=label_dir,
               tokenizer=tokenizer,
               dim_proj=dim,
               max_epochs=epochs,
               batch_size=8,
               saveto=model_file,
               time_out=timeout * 60.0)
Example #6
def predict():
    """
    预测一个slice, 同时将slice保存
    :return:
    """
    if not MODEL:
        logging.warning("Model not specify, can't predict")
        return jsonify({"msg": "Model not specify, can't predict"}), 500
    slice_json = request.get_json()
    data_dir = settings.relative_path_from_root('data/slice/' +
                                                slice_json['project'])
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)
    with open(data_dir + '/slice-' + slice_json["flowHash"] + ".json",
              'w') as f:
        json.dump(slice_json, f)
    isTP = lstm.predict(MODEL, slice_json["slice"])
    logging.info("Predict {0} as {1}".format(slice_json["flowHash"], isTP))
    return jsonify({"msg": str(isTP)}), 200
Example #7
"""
:copyright: (c) 2019 by Anemone Xu.
:license: Apache 2.0, see LICENSE for more details.
"""
import _theano.dataloader as dataloader
from _theano.tokenizer import *
import settings


def transform(data_dir, label_dir):
    tokenizer = Tokenizer()
    train, valid, test = dataloader.load_data(data_dir, label_dir, tokenizer=tokenizer, valid_portion=0)
    with open(settings.relative_path_from_root('data/mangrove/t-train.txt'), 'w') as f:
        for idx in range(len(train[0])):
            label = 'truepositive' if train[1][idx] == 0 else 'falsepositive'
            _slice = tokenizer.decode(train[0][idx])
            f.write('{} :: {}\n'.format(_slice, label))

    with open(settings.relative_path_from_root('data/mangrove/t-test.txt'), 'w') as f:
        for idx in range(len(test[0])):
            label = 'truepositive' if test[1][idx] == 0 else 'falsepositive'
            _slice = tokenizer.decode(test[0][idx])
            f.write('{} :: {}\n'.format(_slice, label))

    with open(settings.relative_path_from_root('data/mangrove/dict.txt'), 'w') as f:
        for token, _int in tokenizer.get_token_dict().items():
            f.write('{} {}\n'.format(token, _int))

if __name__ == '__main__':
    transform(settings.relative_path_from_root('data/slice/benchmark1.2'),
              settings.relative_path_from_root('data/label/benchmark1.2'))
Example #8
    n_train = int(numpy.round(n_samples * (1. - test_portion)))
    test_set_x = [all_set_x[s] for s in sidx[n_train:]]
    test_set_y = [all_set_y[s] for s in sidx[n_train:]]
    train_set_x = [all_set_x[s] for s in sidx[:n_train]]
    train_set_y = [all_set_y[s] for s in sidx[:n_train]]

    # split train set into valid set
    # TODO the training set should be reshuffled before every training run
    n_samples = len(train_set_x)
    sidx = numpy.random.permutation(n_samples)
    n_train = int(numpy.round(n_samples * (1. - valid_portion)))

    valid_set_x = [train_set_x[s] for s in sidx[n_train:]]
    valid_set_y = [train_set_y[s] for s in sidx[n_train:]]

    real_train_set_x = [train_set_x[s] for s in sidx[:n_train]]
    real_train_set_y = [train_set_y[s] for s in sidx[:n_train]]

    # TODO sort_by_len does not seem useful; will it affect the experiment results?

    train = (real_train_set_x, real_train_set_y)
    valid = (valid_set_x, valid_set_y)
    test = (test_set_x, test_set_y)

    return train, valid, test


if __name__ == '__main__':
    load_data(settings.relative_path_from_root('data/slice/benchmark'),
              settings.relative_path_from_root('data/label/benchmark'))
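
The snippet above splits the data twice with numpy.random.permutation: first into train/test according to test_portion, then the training part into train/valid according to valid_portion. A tiny self-contained sketch of the same idea on toy data (illustrative only):

# Illustrative sketch of the two-stage shuffle-and-split used by load_data.
import numpy

def split(set_x, set_y, portion):
    sidx = numpy.random.permutation(len(set_x))
    n_keep = int(numpy.round(len(set_x) * (1. - portion)))
    keep = ([set_x[s] for s in sidx[:n_keep]], [set_y[s] for s in sidx[:n_keep]])
    hold = ([set_x[s] for s in sidx[n_keep:]], [set_y[s] for s in sidx[n_keep:]])
    return keep, hold

all_x = list(range(10))
all_y = [i % 2 for i in all_x]
(train_x, train_y), (test_x, test_y) = split(all_x, all_y, portion=0.2)      # 80/20 train/test
(train_x, train_y), (valid_x, valid_y) = split(train_x, train_y, portion=0.1)  # 90/10 train/valid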
Example #9
def train_lstm(
        data_dir: str,
        label_dir: str,
        tokenizer: Tokenizer,
        dim_proj=16,  # word embedding dimension and number of LSTM hidden units.
        patience=10,  # Number of epochs to wait before early stopping if no progress
        max_epochs=5000,  # The maximum number of epochs to run
        disp_freq=10,  # Display to stdout the training progress every N updates
        decay_c=0.,  # Weight decay for the classifier applied to the U weights.
        lrate=0.001,  # Learning rate for sgd (not used for adadelta and rmsprop)
        optimizer=adam,
        # sgd, adadelta and rmsprop available; sgd is very hard to use and not recommended (probably needs momentum and a decaying learning rate).
        saveto='lstm_model.npz',  # The best model will be saved there
        validFreq=70,  # Compute the validation error after this number of updates.
        maxlen=None,  # Sequences longer than this get ignored
        batch_size=16,  # The batch size during training.
        valid_batch_size=16,  # The batch size used for validation/test set.
        # Parameters for extra options
        noise_std=0.,
        use_dropout=True,  # if False slightly faster, but worse test error
        # This frequently needs a bigger model.
        reload_model=False,  # Path to a saved model we want to start from.
        test_size=-1,  # If >0, we keep only this number of test examples.
        time_out=1000,  # timeout in minutes
):
    # Model options
    model_options = locals().copy()
    logging.info("model options: {}".format(model_options))

    logging.info('Loading data')
    train, valid, test = dataloader.load_data(data_dir,
                                              label_dir,
                                              tokenizer=tokenizer,
                                              valid_portion=0,
                                              test_portion=0.2)
    model_options['n_words'] = len(tokenizer.get_token_dict()) + 10
    if test_size > 0:
        # The test set is sorted by size, but we want to keep random
        # size example.  So we must select a random selection of the
        # examples.
        idx = numpy.arange(len(test[0]))
        numpy.random.shuffle(idx)
        idx = idx[:test_size]
        test = ([test[0][n] for n in idx], [test[1][n] for n in idx])
    if not train:
        raise Exception("Dataset must contain at least one record")

    ydim = numpy.max(train[1]) + 1

    model_options['ydim'] = ydim

    logging.info('Building model')
    # This create the initial parameters as numpy ndarrays.
    # Dict name (string) -> numpy ndarray
    params = init_params(model_options)

    if reload_model:
        load_params('lstm_model.npz', params)

    # This create Theano Shared Variable from the parameters.
    # Dict name (string) -> Theano Tensor Shared Variable
    # params and tparams have different copy of the weights.
    tparams = init_tparams(params)

    # use_noise is for dropout
    (use_noise, x, mask, y, f_pred_prob, f_pred,
     cost) = build_model(tparams, model_options, False)

    if decay_c > 0.:
        decay_c = theano.shared(numpy_floatX(decay_c), name='decay_c')
        weight_decay = 0.
        weight_decay += (tparams['U']**2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    f_cost = theano.function([x, mask, y], cost, name='f_cost')

    grads = tensor.grad(cost, wrt=list(tparams.values()))
    f_grad = theano.function([x, mask, y], grads, name='f_grad')

    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = optimizer(lr, tparams, grads, x, mask, y, cost)

    logging.info("%d training and %d test datapoints" %
                 (len(train[0]), len(test[0])))
    history_errs = []
    best_p = None
    bad_counter = 0

    if validFreq == -1:
        validFreq = len(train[0]) // batch_size

    uidx = 0  # the number of update done
    estop = False  # early stop
    start_time = time.time()
    last_best_time = 0.
    total_batches = None
    try:
        eidx = 0
        # one training run
        while eidx < max_epochs and time_out > (time.time() -
                                                start_time) / 60.0:
            eidx = eidx + 1
            n_samples = 0

            logging.info('Epoch {}'.format(eidx))
            # Get new shuffled index for the training set.
            # Unlike py2 (mangrove), a Python 3 iterator does not restart from the beginning once exhausted, so duplicate it here for the later evaluation.
            kf, train_batch = tee(
                get_minibatches_idx(len(train[0]), batch_size, shuffle=True),
                2)
            kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size)
            test_batch = get_minibatches_idx(len(test[0]), valid_batch_size)

            id_of_batches = 0
            for _, train_index in kf:
                id_of_batches += 1
                if id_of_batches % disp_freq == 0:
                    logging.info("Calculating {}/{} batch".format(
                        id_of_batches, total_batches))
                uidx += 1
                use_noise.set_value(1.)

                # Select the random examples for this minibatch
                y = [train[1][t] for t in train_index]
                x = [train[0][t] for t in train_index]

                x, mask, y = prepare_data(x, y)
                n_samples += x.shape[1]

                cost = f_grad_shared(x, mask, y)
                f_update(lrate)

                if numpy.isnan(cost) or numpy.isinf(cost):
                    logging.error('bad cost detected: {}'.format(cost))
                    return 1., 1., 1.
            if total_batches is None:
                total_batches = id_of_batches
            use_noise.set_value(0.)
            train_err, _ = pred_error(f_pred, prepare_data, train, train_batch)
            test_err, details = pred_error(f_pred, prepare_data, test,
                                           test_batch)
            valid_err = test_err if len(test[0]) > 0 else train_err
            history_errs.append([valid_err, test_err])
            if best_p is None or (valid_err <=
                                  numpy.array(history_errs)[:, 0].min()):
                best_p = unzip(tparams)
                bad_counter = 0
                last_best_time = (time.time() - start_time) / 60.0

            now_time = time.time()
            total_time = (now_time - start_time) / 60.0
            # print("{train_file}\t{dim}\t{batch_size}\t{depth}\t{train_acc}\t{test_acc}\t{tp}\t{tn}\t{fp}\t{fn}")
            logging.info(
                "Train Acc: {train_acc}%, Test Acc: {test_acc}%, Recall & Precision: {details}"
                .format(train_acc=(1 - train_err) * 100,
                        test_acc=(1 - test_err) * 100,
                        details=details))

            # print('%s\t%d\t%d\t%d\t%.2f\t%.2f\t%s\t%d\t%.2f\t%.2f\t%.2f' % (
            #     data_dir, dim_proj, batch_size, 0, , ,
            #     details, eidx, total_time / eidx, last_best_time, total_time))

            if (len(history_errs) > patience and valid_err >=
                    numpy.array(history_errs)[:-patience, 0].min()):
                bad_counter += 1
                if bad_counter > patience:
                    logging.warning('Early Stop!')
                    estop = True
                    break
            if estop:
                break
    except KeyboardInterrupt:
        logging.error("Training interupted")

    end_time = time.time()
    total_time = (end_time - start_time) / 60.0

    if best_p is not None:
        zipp(best_p, tparams)
    else:
        best_p = unzip(tparams)

    use_noise.set_value(0.)
    kf_train_sorted = get_minibatches_idx(len(train[0]), batch_size)
    train_err, _ = pred_error(f_pred, prepare_data, train, kf_train_sorted)

    test_size = len(test[0])
    valid_err, test_err = 0, 0
    if test_size > 0:
        kf_test = get_minibatches_idx(test_size, test_size)
        test_err, details = pred_error(f_pred, prepare_data, test, kf_test)
        valid_err = test_err

    if saveto:
        with open(settings.relative_path_from_root(saveto + '.tokenizer'),
                  'wb') as f:
            pickle.dump(tokenizer, f)
        with open(saveto + '.args', 'wb') as f:
            pickle.dump(model_options, f)
        numpy.savez(saveto,
                    train_err=train_err,
                    valid_err=valid_err,
                    test_err=test_err,
                    history_errs=history_errs,
                    **best_p)
    print('%s\t%d\t%d\t%d\t%.2f\t%.2f\t%s\t%d\t%.2f\t%.2f\t%.2f' %
          (data_dir, dim_proj, batch_size, 0, (1 - train_err) * 100,
           (1 - test_err) * 100, details, eidx, total_time / eidx,
           last_best_time, total_time))
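
train_lstm saves three artifacts at the end: the best parameters plus the error history in the .npz archive, the fitted Tokenizer pickled next to it, and the model options pickled as '.args'. A rough loading sketch that mirrors the saving code above (the repository's real loader, e.g. lstm.load_model in Example #2, is not shown in these snippets):

# Rough sketch only; mirrors the numpy.savez / pickle.dump calls above.
import pickle
import numpy
import settings   # the repository's own settings module

saveto = 'lstm_model.npz'   # same value that was passed as saveto= when training
archive = numpy.load(saveto)
history_errs = archive['history_errs']
# Everything that is not an error record is a model parameter from best_p.
params = {k: archive[k] for k in archive.files
          if k not in ('train_err', 'valid_err', 'test_err', 'history_errs')}

with open(saveto + '.args', 'rb') as f:
    model_options = pickle.load(f)
with open(settings.relative_path_from_root(saveto + '.tokenizer'), 'rb') as f:
    tokenizer = pickle.load(f)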
Example #10

if __name__ == '__main__':
    logging.basicConfig(
        format=
        '%(asctime)s : %(levelname)s : %(filename)s : %(funcName)s : %(message)s',
        level=logging.INFO)
    numpy.set_printoptions(threshold=10000000, precision=2, suppress=True)
    data_dir = sys.argv[1]
    label_dir = sys.argv[2]
    dim = int(sys.argv[3])
    max_epochs = int(sys.argv[4])
    time_out_h = int(sys.argv[5])
    vocab = {}

    model_file = settings.relative_path_from_root('model/theano-lstm.npz')
    if len(sys.argv) > 6 and sys.argv[6] == 'test':
        modelFile = sys.argv[7]
        # TODO
        # test_lstm(dim_proj=dim, n_words=130, dataFile=train_file, reload_model=modelFile, vocab=vocab)
    else:
        tokenizer = Tokenizer()
        train_lstm(data_dir=data_dir,
                   label_dir=label_dir,
                   tokenizer=tokenizer,
                   dim_proj=dim,
                   max_epochs=max_epochs,
                   batch_size=8,
                   saveto=model_file,
                   time_out=time_out_h * 60.0)
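
The __main__ block above expects five positional arguments: the slice directory, the label directory, the embedding/hidden dimension, the maximum number of epochs, and a timeout in hours; an optional sixth argument 'test' (with a model file as the seventh) selects the still-TODO test mode. The module's file name is not shown in the snippet, so the one below is only a placeholder:

# Hypothetical invocations; 'lstm.py' is a placeholder for the module's real file name.
#   python lstm.py <data_dir> <label_dir> <dim_proj> <max_epochs> <time_out_hours>
#   python lstm.py <data_dir> <label_dir> <dim_proj> <max_epochs> <time_out_hours> test <model_file>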