Exemplo n.º 1
0
def predict(model):
    """Run `model` over the test data and save the class-1 scores to ./result.csv.

    The test array comes from `load_test_data` (sized by the module-level
    `in_height`, `in_width` and `num_rows`) and is fed to the estimator
    under the feature key 'file', unshuffled and for a single epoch.

    Column 1 of the stacked predictions is written out under the header
    "malware", with the row number exported as "sample_id".
    """
    features = {'file': load_test_data(in_height, in_width, num_rows)}

    # One ordered pass over the data so row i of the CSV is sample i
    input_fn = tf.estimator.inputs.numpy_input_fn(
        x=features, batch_size=batch_size, num_epochs=1, shuffle=False)

    # Column 0: probability of class 0 (not malware)
    # Column 1: probability of class 1 (malware)
    scores = np.asarray(list(model.predict(input_fn=input_fn)))

    malware_scores = pd.DataFrame(scores[:, 1])
    malware_scores.to_csv(
        './result.csv', header=["malware"], index=True,
        index_label="sample_id")

    print('You can find the prediction results in ./result.csv.')
Exemplo n.º 2
0
def eval():
    """Evaluate the most recent checkpoint in ./checkpoints on the test set.

    The test set is processed in fixed-size batches; samples beyond the
    last full batch are not evaluated. The running mean accuracy is
    printed after every batch and the final mean once at the end.
    """
    # The placeholders are built for exactly this many samples per batch.
    # NOTE(review): create_batch() presumably slices batches of the same
    # size - confirm against its definition.
    batch_size = 100

    test_data, test_labels = load_test_data()

    with tf.Graph().as_default():
        test_inputs_placeholder = tf.placeholder(
            tf.float32, shape=[batch_size, 32, 32, 3], name='test_inputs')
        test_labels_placeholder = tf.placeholder(
            tf.int32, shape=[batch_size], name='test_labels')
        logits = vgg.inference_vgg(test_inputs_placeholder, train=False)
        # True where the label is the top-1 prediction
        test_correct_op = tf.nn.in_top_k(logits, test_labels_placeholder, 1)
        saver = tf.train.Saver()

        with tf.Session() as sess:
            # load most recent checkpoint
            ckpt = tf.train.get_checkpoint_state('./checkpoints')
            if ckpt:
                saver.restore(sess, ckpt.model_checkpoint_path)

            mean_acc = 0.0
            # Floor division: range() rejects the float that `/` yields on
            # Python 3.
            for step in range(len(test_labels) // batch_size):
                test_batch_data, test_batch_labels = create_batch(
                    step, test_data, test_labels)
                feed_dict = {
                    test_inputs_placeholder: test_batch_data,
                    test_labels_placeholder: test_batch_labels
                }
                curr_correct = sess.run([test_correct_op], feed_dict=feed_dict)

                curr_acc = np.sum(curr_correct) / float(batch_size)
                # Incremental mean over the batches seen so far
                mean_acc = (step * mean_acc + curr_acc) / (step + 1)
                print(mean_acc)
            print('Total mean accuracy is %f' % mean_acc)
Exemplo n.º 3
0
def predict():
    """Train one DCModel per cross-validation fold and keep each fold's best result.

    The training data is shuffled with a fixed seed and split into
    `config.KFOLD` folds. For every fold a fresh model is fitted on the
    training part, with the dev part and the (unlabelled) test data passed
    alongside. The result file of the best epoch is then copied to
    ./Data/result/best_<fold>.
    """
    # Local import: the file's import block is not visible from this function
    import subprocess

    word_weights, tag_weights = load_embedding()
    word_voc, tag_voc, label_voc = load_voc()

    # train data
    sentences, tags, labels = load_train_data(word_voc, tag_voc, label_voc)
    # Re-seeding before each shuffle replays the same random sequence, so
    # equally long arrays are permuted identically and stay aligned.
    seed = 137
    for array in (sentences, tags, labels):
        np.random.seed(seed)
        np.random.shuffle(array)

    # load data
    sentences_test, tags_test = load_test_data(word_voc, tag_voc, label_voc)
    labels_test = None

    # Clear old results. subprocess.call blocks until the command finishes;
    # os.popen returned immediately, so the delete could race with result
    # files written during training.
    subprocess.call('rm ./Data/result/*', shell=True)

    # Split into training and development folds
    kf = KFold(n_splits=config.KFOLD)
    for num, (train_index, dev_index) in enumerate(kf.split(labels)):
        sentences_train, sentences_dev = sentences[train_index], sentences[dev_index]
        tags_train, tags_dev = tags[train_index], tags[dev_index]
        labels_train, labels_dev = labels[train_index], labels[dev_index]

        # init model
        model = DCModel(
            config.MAX_LEN, word_weights, tag_weights, result_path='./Data/result/result.txt',
            label_voc=label_voc)

        # fit model
        model.fit(
            sentences_train, tags_train, labels_train,
            sentences_dev, tags_dev, labels_dev,
            sentences_test, tags_test, labels_test,
            config.BATCH_SIZE, config.NB_EPOCH, keep_prob=config.KEEP_PROB,
            word_keep_prob=config.WORD_KEEP_PROB, tag_keep_prob=config.TAG_KEEP_PROB)

        best_score = model.get_best_score()
        print(best_score)
        [p_test, r_test, f_test], nb_epoch = best_score

        # Keep the result file of the best epoch under a per-fold name.
        # The file name uses nb_epoch + 1.
        command = 'cp ./Data/result/epoch_%d.csv ./Data/result/best_%d' % (nb_epoch+1, num)
        print(command)
        subprocess.call(command, shell=True)
        print(p_test, r_test, f_test, '\n')

        # clear model
        model.clear_model()
        del model
Exemplo n.º 4
0
def main():
    """Fit a linear regression, predict the test rows and plot the outcome.

    Reads "data_train.txt" and "data_test.txt", trains for 100 epochs,
    prints the fitted theta, the last cost value and the test predictions,
    then shows two figures: a 3-D view of the data with the fitted line,
    and the cost per epoch.
    """
    train_file = "data_train.txt"
    test_file = "data_test.txt"
    epoches = 100
    alpha = 0.000000001

    data_array, label_array = load_train_data(train_file)
    test_array = load_test_data(test_file)
    data_matrix = np.mat(data_array)
    label_matrix = np.mat(label_array)
    test_matrix = np.mat(test_array)

    theta, cost_vector = train(data_matrix, label_matrix, epoches, alpha)
    test_result = test(theta, test_matrix)
    print(theta)
    print(cost_vector[np.size(cost_vector)-1])
    print(test_matrix, test_result)

    # ---- Result figure -------------------------------------------------
    # Training points: column 1 on x, the last column on y, the label on z
    n_rows, n_cols = np.shape(data_array)
    train_x = [data_matrix[row, 1] for row in range(n_rows)]
    train_y = [data_matrix[row, n_cols - 1] for row in range(n_rows)]
    train_z = [label_matrix[row, 0] for row in range(n_rows)]

    # Test points use the same columns, with the prediction on z
    n_test_rows, n_test_cols = np.shape(test_matrix)
    pred_x = [test_matrix[row, 1] for row in range(n_test_rows)]
    pred_y = [test_matrix[row, n_test_cols - 1] for row in range(n_test_rows)]
    pred_z = [test_result[row, 0] for row in range(n_test_rows)]

    result_fig = plt.figure("Result")
    axes3d = result_fig.add_subplot(111, projection='3d')
    axes3d.scatter(train_x, train_y, train_z, s=5, c='red', marker='s')
    axes3d.scatter(pred_x, pred_y, pred_z, s=30, c='green', marker='s')

    # Evaluate the fitted model at random (x, y) samples
    x = np.random.randint(1000, 5000, size=[10000])
    y = np.random.randint(2, 5, size=[10000])
    z = theta[0, 0] + theta[1, 0] * x + theta[2, 0] * y
    axes3d.plot(x, y, z)
    axes3d.set_title("The Result Linear Regression")
    axes3d.set_xlabel('Area')
    axes3d.set_ylabel('Rooms')
    axes3d.set_zlabel('Price')

    # ---- Cost figure ---------------------------------------------------
    cost_axes = plt.figure("Cost").add_subplot(111)
    cost_axes.plot(np.arange(0, epoches+1, 1), cost_vector)
    plt.title("The Cost")
    plt.xlabel('Epoch')
    plt.ylabel('Cost')

    plt.show()
    def generate_recons(self):
        """
        Run the saved FDK neural network over the training, validation and
        test sets and store the reconstructions, which are used later for
        training the U-Net.

        The latest checkpoint under 'fdk_nn_model/saved_session/' is
        restored; each split is then passed through `do_model_eval` with
        saving enabled, targeting <model_name>/eval_recon/generation_recons/.
        """

        # load the three splits
        train_sino, train_truth = load_data.load_training_data()
        valid_sino, valid_truth = load_data.load_validation_data()
        test_sino, test_truth = load_data.load_test_data()

        # normalize the inputs first, then the labels
        train_sino = self.normalize_sino(train_sino)
        valid_sino = self.normalize_sino(valid_sino)
        test_sino = self.normalize_sino(test_sino)

        train_truth = self.normalize_labels(train_truth)
        valid_truth = self.normalize_labels(valid_truth)
        test_truth = self.normalize_labels(test_truth)

        # session
        session_config = tf.ConfigProto()
        session_config.gpu_options.per_process_gpu_memory_fraction = 0.9
        session_config.gpu_options.allow_growth = True
        with tf.Session(config=session_config) as sess:
            # Build Graph
            self.build_inital_graph()
            self.build_model_proj_graph()
            self.build_model_recon_graph()
            self.build_train_op_graph()

            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())

            # generation on set
            print('\n############################### generating')
            checkpoint = tf.train.latest_checkpoint(
                'fdk_nn_model/saved_session/')
            self.saver.restore(sess, checkpoint)

            splits = (
                (train_sino, train_truth, NUM_TRAINING_SAMPLES, TRAIN_INDEX),
                (valid_sino, valid_truth, NUM_VALIDATION_SAMPLES, VALID_INDEX),
                (test_sino, test_truth, NUM_TEST_SAMPLES, TEST_INDEX),
            )
            for sino, truth, num_samples, index in splits:
                # A fresh [save flag, output dir] list is built for each call
                self.do_model_eval(
                    sess, sino, truth, num_samples, index,
                    [True, self.model_name + '/eval_recon/generation_recons/'])
Exemplo n.º 6
0
def eval():
    """Decode the test set with the latest checkpoint and report BLEU.

    Restores the newest checkpoint from `hp.logdir`, decodes the test data
    in full batches of `hp.batch_size` (a trailing partial batch is
    skipped) and writes source / expected / got triples to
    results/<model name>. The corpus BLEU score, scaled by 100, is
    appended at the end of that file. Only pairs where both reference and
    hypothesis have more than three tokens count towards BLEU.
    """
    transformer = Transformer(training=False)

    X, Sources, Targets = load_test_data()
    _, idx2en = load_vocab('./preprocessed/en.vocab.tsv')

    with transformer.graph.as_default():
        sv = tf.train.Supervisor()
        with sv.managed_session(config=tf.ConfigProto(
                allow_soft_placement=True)) as sess:
            sv.saver.restore(sess, tf.train.latest_checkpoint(hp.logdir))
            # Function-call form prints the same text on Python 2 and 3
            print('restored')

            # Model name: the first double-quoted string in the checkpoint
            # index file
            with open(hp.logdir + '/checkpoint', 'r') as ckpt_file:
                mname = ckpt_file.read().split('"')[1]

            if not os.path.exists('results'):
                os.makedirs('results')
            with codecs.open('results/' + mname, 'w', 'utf-8') as fout:
                list_of_refs, hypotheses = [], []
                for i in range(len(X) // hp.batch_size):
                    start = i * hp.batch_size
                    end = start + hp.batch_size
                    x = X[start:end]
                    sources = Sources[start:end]
                    targets = Targets[start:end]

                    ### Autoregressive inference
                    # Column j of each run is kept and fed back in for the
                    # next run.
                    preds = np.zeros((hp.batch_size, hp.max_len), np.int32)
                    for j in range(hp.max_len):
                        _preds = sess.run(transformer.preds, {
                            transformer.x: x,
                            transformer.y: preds
                        })
                        preds[:, j] = _preds[:, j]

                    for source, target, pred in zip(sources, targets, preds):
                        # Cut the decoded sentence at the end-of-sentence
                        # marker
                        got = " ".join(
                            idx2en[idx]
                            for idx in pred).split("</S>")[0].strip()
                        fout.write('- source: {}\n'.format(source))
                        fout.write('- expected: {}\n'.format(target))
                        fout.write('- got: {}\n\n'.format(got))

                        ref = target.split()
                        hypothesis = got.split()
                        if len(ref) > 3 and len(hypothesis) > 3:
                            # corpus_bleu expects a list of references per
                            # hypothesis. Appending the bare token list made
                            # every token count as its own reference.
                            list_of_refs.append([ref])
                            hypotheses.append(hypothesis)
                score = corpus_bleu(list_of_refs, hypotheses)
                fout.write("Bleu Score = " + str(100 * score))
Exemplo n.º 7
0
def test(net, criterion, device):
    """Print the top-1 accuracy of `net` on the test set.

    Args:
        net: model called on each input batch; the class is taken as the
            argmax over dimension 1 of its output.
        criterion: unused here; kept so existing callers keep working.
        device: device that inputs and labels are moved to.
    """
    testloader = load_test_data()

    correct, total = 0, 0
    with torch.no_grad():
        for data in tqdm(testloader):
            inputs, labels = data[0].to(device), data[1].to(device)
            outputs = net(inputs)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            # .item() converts the count to a Python int, so the running
            # total is a plain number rather than a tensor
            correct += (predicted == labels).sum().item()

    # An empty loader reports 0 % instead of dividing by zero
    accuracy = 100 * correct / total if total else 0
    # The sample count is reported from the data rather than hard-coded
    print('Accuracy of the network on the %d test images: %d %%' %
          (total, accuracy))
    torch.cuda.empty_cache()
Exemplo n.º 8
0
def main():
    """Tune, train and apply the LightGBM classifier, logging each stage.

    Stages: set up console (INFO) and file (DEBUG) logging, search the
    best parameters, optimise the iteration count, train the models, then
    predict on the train and test rows combined and sorted by 'Name'.
    """
    log_fmt = Formatter(
        '%(asctime)s %(name)s %(lineno)d [%(levelname)s][%(funcName)s] %(message)s '
    )

    def _attach(handler, level):
        # Give every handler the same format, then hook it onto the logger
        handler.setLevel(level)
        handler.setFormatter(log_fmt)
        logger.addHandler(handler)

    _attach(StreamHandler(), INFO)
    _attach(FileHandler(DIR + 'train_lgb_clf_hyperopt.py.log', 'a'), DEBUG)
    logger.setLevel(DEBUG)

    logger.info('start')

    logger.info("start exploring best params")

    logger.info("start exploring best params without iteration")
    train_frame = load_train_data()
    # Feature columns are the label slice from 'ABC' to '2047'
    features = train_frame.loc[:, 'ABC':'2047']
    target = train_frame['Active_Nonactive'].values
    best_params = lgb_opt_params(features, target)
    logger.info("end exploring best params without iteration")

    logger.info("start optimizing iteration")
    best_iter = opt_iter(features, target, best_params)
    logger.info("end optimizing iteration")

    logger.info("end exploring best params")

    logger.info("start best params train")
    best_model_No, cutoff = create_models(features, target, best_params,
                                          best_iter)
    logger.info("end best params train")

    logger.info("start predict unknown data(test data)")
    test_frame = load_test_data().sort_values('Name')
    feature_names = features.columns.values
    # Prediction runs on train and test rows together, ordered by 'Name'
    combined = pd.concat([train_frame, test_frame], axis=0,
                         sort=False).sort_values('Name')
    predict_test(combined[feature_names], best_model_No, cutoff)
    logger.info("end predict unknown data(test data)")

    logger.info("end")
def evaluate_on_metrics(model):
    """
    Evaluate a model's saved test reconstructions on MSE, SSIM, MS-SSIM
    and PSNR.

    The metrics are printed and appended, with a timestamp, to
    <model>/eval_result/metrics_result.txt.

    Parameters
    ----------
    model : str
        The model for evaluation. It is used as the directory prefix for
        both the reconstructions (<model>/eval_recon/recon_<index>.npy)
        and the result file.
    """

    # get the labels
    _, labels = load_data.load_test_data()
    labels = normalize(labels)

    # load the recons on the model, one file per entry of TEST_INDEX
    recon_phantoms = np.empty(labels.shape)
    for i in range(recon_phantoms.shape[0]):
        recon_file = model + '/eval_recon/recon_' + str(TEST_INDEX[i]) + '.npy'
        recon_phantoms[i, :, :, :] = np.load(recon_file)

    # MSE
    mse = np.mean(np.square(recon_phantoms - labels))

    # Maximum value handed to the metric helpers below
    max_val = 1.0

    # SSIM
    ssim = calculate_ssim(recon_phantoms, labels, max_val)

    # MS-SSIM
    ms_ssim = calculate_ms_ssim(recon_phantoms, labels, max_val)

    # Peak Signal-to-Noise Ratio
    psnr = calculate_psnr(recon_phantoms, labels, max_val)

    # print the results
    print('mse value: ', str(mse))
    print('ssim value: ', str(ssim))
    print('ms-ssim value: ', str(ms_ssim))
    print('psnr value: ', str(psnr))

    # Save the metrics results. The context manager closes the file even
    # if formatting or writing fails.
    with open(model + '/eval_result/metrics_result.txt', 'a+') as f:
        f.write(
            "Model: {0}, Date: {1:%Y-%m-%d_%H:%M:%S} \nMSE: {2:3.8f} \nSSIM: {3:3.8f} \nMS-SSIM: {4:3.8f} \nPSNR: {5:3.8f}\n\n"
            .format(model, datetime.datetime.now(), mse, ssim, ms_ssim, psnr))
def eval_pure_fdk():
    """
    Evaluate the conventional FDK reconstructions on MSE, SSIM, MS-SSIM
    and PSNR.

    Reconstructions are read from ../data_preprocessing/recon_145/ and
    normalized in the same way as the labels. The metrics are printed and
    appended, with a timestamp, to
    pure_fdk_model/eval_result/metrics_result.txt.
    """

    # get the labels
    _, labels = load_data.load_test_data()
    labels = normalize(labels)

    # load the recons, one file per entry of TEST_INDEX
    recon_phantoms = np.empty(labels.shape)
    for i in range(recon_phantoms.shape[0]):
        recon_file = '../data_preprocessing/recon_145/recon_' + str(
            TEST_INDEX[i]) + '.npy'
        recon_phantoms[i, :, :, :] = np.load(recon_file)
    recon_phantoms = normalize(recon_phantoms)

    # MSE
    mse = np.mean(np.square(recon_phantoms - labels))

    # Maximum value handed to the metric helpers below
    max_val = 1.0

    # SSIM
    ssim = calculate_ssim(recon_phantoms, labels, max_val)

    # MS-SSIM
    ms_ssim = calculate_ms_ssim(recon_phantoms, labels, max_val)

    # Peak Signal-to-Noise Ratio
    psnr = calculate_psnr(recon_phantoms, labels, max_val)

    # print the results
    print('mse value: ', str(mse))
    print('ssim value: ', str(ssim))
    print('ms-ssim value: ', str(ms_ssim))
    print('psnr value: ', str(psnr))

    # Save the metrics results. The context manager closes the file even
    # if formatting or writing fails.
    with open('pure_fdk_model/eval_result/metrics_result.txt', 'a+') as f:
        f.write(
            "Model: {0}, Date: {1:%Y-%m-%d_%H:%M:%S} \nMSE: {2:3.8f} \nSSIM: {3:3.8f} \nMS-SSIM: {4:3.8f} \nPSNR: {5:3.8f}\n\n"
            .format('pure_fdk_model', datetime.datetime.now(), mse, ssim,
                    ms_ssim, psnr))
Exemplo n.º 11
0
def predict():
    """Score the test data with the pickled model and write the submission.

    Side effects, all under `DIR`:
      * feature_importances.csv - importances sorted in descending order
      * test_tmp_pred.pkl       - the raw predictions
      * submit.csv              - 'click_id' and 'is_attributed' columns

    Raises:
        Exception: if the selected test columns do not match the number of
            model features.
    """
    with open(DIR + 'model.pkl', 'rb') as model_file:
        clf = pickle.load(model_file)

    with open(DIR + 'usecols.pkl', 'rb') as cols_file:
        usecols = pickle.load(cols_file)

    # Feature-importance report
    importance = pd.DataFrame(clf.feature_importance(), columns=['imp'])
    importance['col'] = usecols
    n_features = importance.shape[0]
    importance = importance.sort_values('imp', ascending=False)
    importance.to_csv(DIR + 'feature_importances.csv')
    used_shape = importance[importance.imp > 0].shape
    logger.info('imp use {} {}'.format(used_shape, n_features))

    df = load_test_data()
    logger.info('data size {}'.format(df.shape))

    # Columns the model expects but the test data lacks are filled with
    # zeros
    for col in usecols:
        if col in df.columns.values:
            continue
        df[col] = np.zeros(df.shape[0])
        logger.info('no col %s' % col)

    x_test = df[usecols]

    n_selected = x_test.shape[1]
    if n_selected != n_features:
        raise Exception('Not match feature num: %s %s' %
                        (n_selected, n_features))

    logger.info('test load end')

    p_test = clf.predict(x_test)
    with open(DIR + 'test_tmp_pred.pkl', 'wb') as pred_file:
        pickle.dump(p_test, pred_file, -1)

    logger.info('test save end')

    sub = pd.DataFrame()
    sub['click_id'] = df['click_id']
    sub['is_attributed'] = p_test
    sub.to_csv(DIR + 'submit.csv', index=False)
    logger.info('exit')
Exemplo n.º 12
0
def model_selection_and_evaluation():
    """
    Pick the best candidate model on a validation split and evaluate it on
    the test set.

    The training data is split into train and validation parts for model
    selection. The winner is then passed to `test_set_evaluation` together
    with the full training set and the test set.
    :return: tuple: best model, list of feature sets it uses
    """
    train_frame = load_data.load_train_data()
    test_frame = load_data.load_test_data()

    # Hold out part of the training data for choosing between candidates
    fit_part, validation_part = utils.split_train_validation(train_frame)
    winner = model_selection(fit_part, validation_part)
    best_model, best_model_feats = winner

    summary = 'Best scoring model is: {}, Using feature sets: {}'.format(
        best_model.name, best_model_feats)
    print(summary)

    # Evaluation receives the full (unsplit) training set
    test_set_evaluation(train_frame, test_frame,
                        {best_model: best_model_feats})

    return best_model, best_model_feats
Exemplo n.º 13
0
import numpy as np


###############################
# Untar data
def untar_data(name, outdir='./data'):
    """Extract ./Indoor-scene-recognition/<name> into `outdir`.

    Args:
        name: file name of the tar archive inside
            ./Indoor-scene-recognition/.
        outdir: directory the members are extracted into.
    """
    # The context manager closes the archive even if extraction fails
    with tarfile.open('./Indoor-scene-recognition/' + name) as my_tar:
        # NOTE: extractall() trusts the member paths stored in the archive;
        # only use this on archives from a trusted source.
        my_tar.extractall(outdir)


# Uncomment to untar data
# untar_data("indoorCVPR_09annotations.tar")
# untar_data("indoorCVPR_09.tar")
###############################

###############################
# Load data
# Load both splits; each is indexed along its first axis below
test_data = load_data.load_test_data()
train_data = load_data.load_train_data()

# Show the data
print(test_data.shape)
print(train_data.shape)
# Pick one random index per split and display that entry in its own window
train_i = np.random.choice(train_data.shape[0])
test_i = np.random.choice(test_data.shape[0])
cv2.imshow("example in train", train_data[train_i])
cv2.imshow("example in test", test_data[test_i])
# Wait for a key press (0 = no timeout)
cv2.waitKey(0)

###############################
Exemplo n.º 14
0
    ###
    with open(DIR + 'model.pkl', 'rb') as f:
        clf = pickle.load(f)
    with open(DIR + 'usecols.pkl', 'rb') as f:
        usecols = pickle.load(f)
    imp = pd.DataFrame(clf.feature_importance(), columns=['imp'])
    imp['col'] = usecols
    n_features = imp.shape[0]
    imp = imp.sort_values('imp', ascending=False)
    imp.to_csv(DIR + 'feature_importances.csv')
    logger.info('imp use {} {}'.format(imp[imp.imp > 0].shape, n_features))

    with open(DIR + 'fillna_mean.pkl', 'rb') as f:
        fillna_mean = pickle.load(f)
    x_test = load_test_data()

    id_cols = [
        col for col in x_test.columns.values
        if re.search('_id$', col) is not None and col not in set(
            ['o_user_id', 'o_product_id', 'p_aisle_id', 'p_department_id'])
    ]
    logger.debug('id_cols {}'.format(id_cols))
    x_test.drop(id_cols, axis=1, inplace=True)

    logger.info('usecols')
    x_test = x_test[usecols]
    gc.collect()
    logger.info('values {} {}'.format(len(usecols), x_test.shape))
    x_test.fillna(fillna_mean, inplace=True)
Exemplo n.º 15
0
                      xg_trn,
                      num_boost_round=5000,
                      evals=watchlist,
                      early_stopping_rounds=100,
                      verbose_eval=50)

    return model


# Script entry point: load a small sample of both data sets and derive the
# basic label/feature columns.
if __name__ == '__main__':
    logger.info('Start')

    # Only the first 100 rows of each set are requested
    train_df = load_train_data(nrows=100)
    logger.info('train load end {}'.format(train_df.shape))

    test_df = load_test_data(nrows=100)
    logger.info('test load end {}'.format(test_df.shape))

    # Labels
    train_y = train_df["deal_probability"].values
    test_id = test_df["item_id"].values

    # Feature Weekday
    # Uses the .dt accessor, so "activation_date" must hold datetime values
    train_df["activation_weekday"] = train_df["activation_date"].dt.weekday
    test_df["activation_weekday"] = test_df["activation_date"].dt.weekday

    # Label encode the categorical variables
    # (the encoding step itself is not part of the visible code)
    cat_vars = [
        "region", "city", "parent_category_name", "category_name", "user_type",
        "param_1", "param_2", "param_3"
    ]
from os import path

import random

#os.environ["CUDA_VISIBLE_DEVICES"]="3,4,5,6"

# Per-split progress records, filled by code outside this excerpt
training_progress = []
development_progress = []
test_progress = []

# Model compiled with MSE loss and MAE as the reported metric
model = load_model()
model.compile(optimizer='adagrad', loss='mse', metrics=['mae'])

X_train, Y_train = load_training_data()
X_dev, Y_dev = load_development_data()
X_test, Y_test = load_test_data()

# Best-so-far trackers, started at a large value
# NOTE(review): presumably lowered as training improves - the update code is
# not visible here
min_mse_dev = 10000
min_mae_dev = 10000

min_mse_test = 10000
min_mae_test = 10000

current_epoch_number = 1
total_epoch_count = 100

# m is the number of training samples; the list holds 1 .. m-1
m = X_train.shape[0]
batch_size_list = list(range(1, m))

print("\n\n")
Exemplo n.º 17
0
    log_fmt = Formatter('%(asctime)s %(name)s %(lineno)d [%(levelname)s][%(funcName)s] %(message)s ')
    handler = StreamHandler()
    handler.setLevel('INFO')
    handler.setFormatter(log_fmt)
    logger.addHandler(handler)

    handler = FileHandler(DIR + 'train.py.log', 'a')
    handler.setLevel(DEBUG)
    handler.setFormatter(log_fmt)
    logger.setLevel(DEBUG)
    logger.addHandler(handler)

    logger.info('start')

    df_train0 = load_train_data()
    df_test0 = load_test_data()

    logger.info('concat train and test datasets: {} {}'.format(df_train0.shape, df_test0.shape))

    df_train0['train'] = 1
    df_test0['train'] = 0
    df = pd.concat([df_train0, df_test0], axis=0, sort=False)

    logger.info('Data preprocessing')

    # Drop PoolQC, MiscFeature, Alley and Fence features
    # because they have more than 80% of missing values.
    df = df.drop(['Alley','PoolQC','Fence','MiscFeature'],axis=1)

    object_columns_df = df.select_dtypes(include=['object'])
    numerical_columns_df =df.select_dtypes(exclude=['object'])
Exemplo n.º 18
0
           epsilon=None,
           decay=0.0,
           amsgrad=True)
batch_size = 20

#Parameters
show_num = 1
map_index = 2
dataset_index = 3

#Load pre-trained weights
model.load_weights(pretrained_model_weights)
model.compile(optimizer=opt, loss='mse')

#Load data
test_input, expected_output, obs = load_test_data(dataset_index=dataset_index,
                                                  map_index=map_index)
print(test_input[2].shape)

# print(expected_ouput[1])

#sys.exit()
print('Predicting...')
predicted_output = model.predict(test_input, batch_size=batch_size, verbose=1)
print('Predicting Done!')

print('Calculating Predicting Error...')
mean_FDE = calculate_FDE(expected_output, predicted_output,
                         len(expected_output), show_num)
mean_ADE = calculate_ADE(expected_output, predicted_output,
                         len(expected_output), 12, show_num)
all_FDE = calculate_FDE(expected_output, predicted_output,
Exemplo n.º 19
0
import torch
from model import FNet
from load_data import load_result_data, load_test_data
import numpy as np
import pandas as pd

#device=torch.device('cuda:0' if torch.cuda.is_available else 'cpu')
#[1000,10]  [1000,10]  [1000,10,2]

# Fine-tuning samples (x, y and the paired xy array) plus the x values to
# produce results for
np_test_x, np_test_y, np_test_xy = load_test_data(
    x_path="data/task9_evaluate_finetune_x.csv",
    y_path="data/task9_evaluate_finetune_y.csv")
np_result_x = load_result_data(x_path="data/task9_evaluate_x.csv")

fnet = FNet(1, 50, 1)
# Keep every loaded tensor on the storage it was deserialized to, instead of
# moving it to the device recorded in the file
map_location = lambda storage, loc: storage
fnet.load_state_dict(torch.load("fnetmodel.pkl", map_location=map_location))

fnet.eval()

# Results
result = []
for i in range(np_test_xy.shape[0]):  # [100, 5, 2]
    fnet.eval()
    test_xy = np_test_xy[i]  # [5, 2]
    test_x = test_xy[:, 0]  # [5, ]
    test1_x = np_result_x[i]  # [100, ]

    # Add a trailing axis so the values form a column, then convert to float
    # tensors
    tensor_test1_x = torch.from_numpy(test1_x[:,
                                              np.newaxis]).float()  # [100, 1]
    tensor_test_xy = torch.from_numpy(test_xy).float()  # [5, 2]
Exemplo n.º 20
0
    def run_training(self):
        """
        Train the model with early stopping, then evaluate the best saved
        checkpoint on the test set.

        A checkpoint is written whenever the validation loss reaches a new
        minimum. Training stops early once five validation losses exist and
        the current one is not below any of the last five. Afterwards the
        latest checkpoint is restored and evaluated on the test set, and
        the average test loss is printed.
        """

        # load all the data
        train_data_numpy, train_labels_numpy = load_data.load_training_data()
        validation_data_numpy, validation_labels_numpy = (
            load_data.load_validation_data())
        test_data_numpy, test_labels_numpy = load_data.load_test_data()

        # normalize the input data
        train_data_numpy = self.normalize_sino(train_data_numpy)
        validation_data_numpy = self.normalize_sino(validation_data_numpy)
        test_data_numpy = self.normalize_sino(test_data_numpy)

        # normalize the labels
        train_labels_numpy = self.normalize_labels(train_labels_numpy)
        validation_labels_numpy = self.normalize_labels(
            validation_labels_numpy)
        test_labels_numpy = self.normalize_labels(test_labels_numpy)

        # Training session
        config = tf.ConfigProto()
        config.gpu_options.per_process_gpu_memory_fraction = 0.9
        config.gpu_options.allow_growth = True
        with tf.Session(config=config) as sess:
            # Build Graph
            self.build_inital_graph()
            self.build_model_proj_graph()
            self.build_model_recon_graph()
            self.build_train_op_graph()

            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())

            valid_losses = []
            # np.inf rather than np.Inf: the capitalised alias was removed
            # in NumPy 2.0
            best_valid_loss = np.inf
            for epoch in range(MAX_EPOCHS):
                # Initialise dataset iterator
                sess.run(self.iter.initializer,
                         feed_dict={
                             self.data_placeholder: train_data_numpy,
                             self.labels_placeholder: train_labels_numpy,
                             self.index_placeholder: TRAIN_INDEX
                         })

                training_losses = 0
                for _step in range(NUM_TRAINING_SAMPLES):
                    # run the training
                    training_loss, _ = sess.run([self.loss, self.train_op])
                    training_losses += np.mean(training_loss)

                valid_loss = self.do_model_eval(sess, validation_data_numpy,
                                                validation_labels_numpy,
                                                NUM_VALIDATION_SAMPLES,
                                                VALID_INDEX, [False, ''])
                valid_losses.append(valid_loss)

                print(
                    'Epoch: {0:3d}/{1:3d}, training loss: {2:3.8f}, validation loss: {3:3.8f}'
                    .format(epoch + 1, MAX_EPOCHS,
                            training_losses / NUM_TRAINING_SAMPLES,
                            valid_loss))

                # The window includes the loss of the current epoch
                last_five_valid_losses = valid_losses[-5:]
                if valid_loss < best_valid_loss:
                    best_valid_loss = valid_loss

                    # Save a checkpoint of the least validation loss model
                    # so far
                    self.saver.save(sess,
                                    self.model_name + '/saved_session/sess-' +
                                    '{date:%m_%d_%H:%M}'.format(
                                        date=datetime.datetime.now()) +
                                    '.ckpt',
                                    global_step=epoch)
                elif len(last_five_valid_losses) == 5 and all(
                        valid_loss >= x for x in last_five_valid_losses):
                    # Early stopping: the loss is not below any of the last
                    # five recorded losses
                    break

            # evaluate on test set
            print(
                '\n############################### testing evaluation on best trained model so far'
            )
            # Checkpoints are only written on improvement, so the latest one
            # saved in this run is the best of this run
            best_model_sess_file = tf.train.latest_checkpoint(
                self.model_name + '/saved_session/')
            self.saver.restore(sess, best_model_sess_file)

            test_loss = self.do_model_eval(
                sess, test_data_numpy, test_labels_numpy, NUM_TEST_SAMPLES,
                TEST_INDEX, [True, self.model_name + '/eval_recon/'])
            print("average test loss: ", test_loss)
Exemplo n.º 21
0
# Keyword arguments for the TF-IDF vectorizer, taken from the `parameters`
# mapping
vectorizer_param = {'preprocessor': preprocessor, 'ngram_range': parameters['ngram_range'], 'analyzer': 'word',
                    'min_df': parameters['min_df'], 'max_df': parameters['max_df'],
                    'binary': parameters['TF_binary'], 'norm': parameters['norm'],
                    'sublinear_tf': parameters['sublinear_tf'], 'max_features': parameters['max_features']}

# Script entry point: fit the combined feature extractor on the training
# texts, transform train and test, and pickle the results.
if __name__ == "__main__":
    unigram = StemmedTfidfVectorizer(**vectorizer_param)
    # Only `unigram` and `avg_strength` go into the active union below; the
    # other extractors belong to the commented-out alternatives.
    anew = anew_vectorizer()
    pct = punctuation_estimator()
    strength = strength_vectorizer()
    avg_strength = avg_affective_vectorizer()
    log_state('combine unigram and avg strength features')
    combined_features = FeatureUnion([('unigram', unigram), ('avg_strength', avg_strength)])
    # log_state('combine unigram and strength features')
    # combined_features =FeatureUnion([('unigram',unigram),('strength',strength)])
    # log_state('combine unigram and anew features')
    # combined_features =FeatureUnion([('unigram',unigram),('anew',anew)])
    # log_state('combine unigram and punctuation features')
    # combined_features =FeatureUnion([('unigram',unigram),('pct',pct)])
    # The second return value of the loaders is discarded
    texts, _ = load_train_data('Sentiment140')

    # Fit on the training texts only; the test data is transformed with the
    # fitted union
    transformed_train = combined_features.fit_transform(texts)

    testdata, _ = load_test_data()
    transformed_test = combined_features.transform(testdata)

    # Persist the feature names and both transformed data sets
    dump_picle(combined_features.get_feature_names(), './data/features/feature_names.p')
    dump_picle(transformed_train, "./data/transformed_data/transformed_train.p")
    dump_picle(transformed_test, "./data/transformed_data/transformed_test.p")
Exemplo n.º 22
0
def adjust_param():
    """Train the one-class SVM repeatedly on one iris class and report the best run.

    One class (selected by ``type_num``) supplies the training samples; the
    remaining rows of the data set are used for testing. The SMO solver is
    run ``num_runs`` times, the run with the highest test accuracy is kept,
    and its model, predicted labels and a plot are emitted.

    The function takes no arguments and returns nothing; results are printed
    and handed to ``draw_picture``.
    """
    # Index of the class whose samples form the training set.
    type_num = 0
    dim = 4
    C = 0.6
    toler = 0.0001
    maxIter = 40
    # Number of independent SMO runs to compare.
    num_runs = 50

    best_acc = 0
    best_a = 0
    best_r = 0
    best_label = []

    # Data preparation. The two integers passed to the loaders look like a
    # [start, end) row range in iris.data -- TODO confirm against load_data.
    if type_num == 0:
        train_data = load_data.load_train_data('../data/iris.data',
                                               0,
                                               30,
                                               dim=dim)
        test_data, correct_label = load_data.load_test_data('iris.data',
                                                            type_num,
                                                            30,
                                                            150,
                                                            dim=dim)
    elif type_num == 1:
        # dim is passed explicitly so the training features have the same
        # dimensionality as the test features loaded below.
        train_data = load_data.load_train_data('../data/iris.data',
                                               50,
                                               80,
                                               dim=dim)

        test_data1, correct_label1 = load_data.load_test_data('iris.data',
                                                              type_num,
                                                              80,
                                                              150,
                                                              dim=dim)
        test_data2, correct_label2 = load_data.load_test_data('iris.data',
                                                              type_num,
                                                              0,
                                                              50,
                                                              dim=dim)

        test_data = np.vstack((test_data1, test_data2))
        correct_label = np.hstack((correct_label1, correct_label2))
    elif type_num == 2:
        train_data = load_data.load_train_data('../data/iris.data',
                                               100,
                                               130,
                                               dim=dim)

        test_data1, correct_label1 = load_data.load_test_data('iris.data',
                                                              type_num,
                                                              130,
                                                              150,
                                                              dim=dim)
        test_data2, correct_label2 = load_data.load_test_data('iris.data',
                                                              type_num,
                                                              0,
                                                              100,
                                                              dim=dim)

        test_data = np.vstack((test_data1, test_data2))
        correct_label = np.hstack((correct_label1, correct_label2))

    # Accuracy statistics over all runs. min_acc starts above and max_acc
    # below any value calculate_acc can return.
    min_acc = 2
    avrg_acc = 0
    max_acc = -1
    for _ in range(num_runs):
        a, R = one_class_svm.smo(train_data, C, toler, maxIter)
        result_label = judge(test_data, a, R)
        acc = calculate_acc(result_label, correct_label)
        if acc > best_acc:
            # Remember the model and predictions of the best run so far.
            best_acc = acc
            best_a = a
            best_r = R
            best_label = result_label

        avrg_acc += acc
        min_acc = min(min_acc, acc)
        max_acc = max(max_acc, acc)

    # Average over the runs actually performed (was a hard-coded 100).
    avrg_acc /= num_runs
    print("train type:" + str(type_num) + ", dim=" + str(dim) +
          " => best acc = " + str(max_acc))
    print("model: a=" + str(best_a) + ", R=" + str(best_r) + ",C=" + str(C))
    print("label(0-20:positive sample):")
    print(best_label)
    draw_picture(train_data, test_data, correct_label, best_a, best_r, C,
                 toler, best_acc)
from TFNN.layers.EmbeddingLayer import Embedding
from sklearn.model_selection import KFold
from triggerType_to_trigger import get_trigger
'''
For Chinese word segmentation.
'''

#############################1.load data   ######################################
class_type = 3
training_count = 16796
test_count = 2570

# Embeddings come back in matrix form, vocabularies in dictionary form.
word_weights, tag_weights = load_embedding()
word_voc, tag_voc, label_voc = load_voc()

sentences, tags, labels = load_train_data(
    word_voc, tag_voc, label_voc, class_type, training_count)
Xend_sentence, Xend_tag_test, yend_test = load_test_data(
    word_voc, tag_voc, label_voc, class_type, test_count)

# Split the training data into train/dev folds (here y is the POS tag).
kf = KFold(n_splits=10)
fold_pairs = list(kf.split(labels))
train_indices = [pair[0] for pair in fold_pairs]
dev_indices = [pair[1] for pair in fold_pairs]

# Materialise the train/dev views of every input for each of the 10 folds.
for num, (train_index, dev_index) in enumerate(
        zip(train_indices, dev_indices)):
    sentences_train, tags_train, labels_train = (
        sentences[train_index], tags[train_index], labels[train_index])
    sentences_dev, tags_dev, labels_dev = (
        sentences[dev_index], tags[dev_index], labels[dev_index])
"""kf = KFold(n_splits=10)
Exemplo n.º 24
0
    if len(result_label) != len(correct_label):
        print("Number of label isn't equal!")
        return 0

    n = len(result_label)
    acc = 0
    for i in range(n):
        if result_label[i] == correct_label[i]:
            acc += 1

    return acc / n


# Script entry point: train on the iris data, classify the test set and
# print the predicted labels together with the resulting accuracy.
if __name__ == "__main__":

    training_data = load_data.load_training_data('data/iris.data')
    # Load the training set, grouped per class:
    # training_data = [ [type1_data], [type2_data], …… [typeN_data] ]

    w_and_b = test_iris(training_data)  # obtain the support vectors

    test_data, label = load_data.load_test_data(
        'data/iris.data')  # load the test set and its correct labels

    result_label = judge(test_data, w_and_b)  # run the test
    print(result_label)  # print the predicted labels

    acc = calculate_acc(result_label, label)  # compute the accuracy
    print("Accuracy: " + str(acc))
Exemplo n.º 25
0
def train(argv=None):
    """Build, train and periodically evaluate a multi-filter text CNN.

    The train and test sets returned by ``load_data`` are concatenated,
    shuffled with a fixed seed and re-split so that the first 1000 shuffled
    samples become the held-out set. A graph with one convolution + max-pool
    branch per filter size feeds a single fully connected layer. Training
    uses Adam; every ``EVAL_FREQUENCY`` batches the held-out set is scored
    and the learning rate may be decayed.

    Args:
        argv: Not referenced by the body (accepted so the function can be
            used as an app-style entry point).
    """
    # load data
    print("Loading data ... ")
    x_train, y_train = load_data.load_train_data()
    x_test, y_test = load_data.load_test_data()

    # concatenate  and shuffle .
    x_sum = numpy.concatenate((x_train, x_test))
    y_sum = numpy.concatenate((y_train, y_test))
    # Fixed seed so the re-split is reproducible across runs.
    numpy.random.seed(10)
    shuffle_indices = numpy.random.permutation(numpy.arange(len(y_sum)))
    x_shuffled = x_sum[shuffle_indices]
    y_shuffled = y_sum[shuffle_indices]

    # split to train and test .
    x_train = x_shuffled[1000:]
    y_train = y_shuffled[1000:]
    x_test = x_shuffled[:1000]
    y_test = y_shuffled[:1000]

    print(x_train.shape)
    print(x_test.shape)

    # expand (batch_size,MAX_SENTENCE_LENGTH,EMBEDDING_SIZE) to (batch_size,MAX_SENTENCE_LENGTH,EMBEDDING_SIZE,1)
    x_train = numpy.expand_dims(x_train, -1)
    x_test = numpy.expand_dims(x_test, -1)

    # One convolution branch per entry; filter_numbers[i] kernels of height
    # filter_sizes[i].
    filter_sizes = [2, 3, 4, 5]
    filter_numbers = [300, 200, 100, 50]

    # input
    # input is sentence
    train_data_node = tf.placeholder(tf.float32,
                                     shape=(None, max_document_length,
                                            EMBEDDING_SIZE, NUM_CHANNELS))

    train_labels_node = tf.placeholder(tf.float32, shape=(None, NUM_CLASSES))

    dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")

    # full connected - softmax layer,
    fc1_weights = tf.Variable(
        tf.truncated_normal([sum(filter_numbers), NUM_CLASSES],
                            stddev=0.1,
                            seed=SEED,
                            dtype=tf.float32))

    fc1_biases = tf.Variable(
        tf.constant(0.1, shape=[NUM_CLASSES], dtype=tf.float32))

    # model
    def model(data):
        """Return the fully connected layer output (logits) for ``data``."""
        pooled_outputs = []
        for idx, filter_size in enumerate(filter_sizes):
            # Convolve the tensor that was passed in (previously the
            # closed-over placeholder was used and ``data`` was ignored).
            conv = conv2d(data,
                          filter_numbers[idx],
                          filter_size,
                          EMBEDDING_SIZE,
                          name="kernel%d" % idx)
            # 1-max pooling,leave a tensor of shape[batch_size,1,1,num_filters]
            pool = tf.nn.max_pool(
                conv,
                ksize=[1, max_document_length - filter_size + 1, 1, 1],
                strides=[1, 1, 1, 1],
                padding='VALID')
            pooled_outputs.append(tf.squeeze(pool))

        if len(filter_sizes) > 1:
            # NOTE: axis-first argument order of the pre-1.0 tf.concat API.
            cnn_output = tf.concat(1, pooled_outputs)
        else:
            cnn_output = pooled_outputs[0]

        # add dropout
        reshape = tf.nn.dropout(cnn_output, dropout_keep_prob)
        # fc1 layer
        fc1_output = tf.matmul(reshape, fc1_weights) + fc1_biases
        return fc1_output

    # Training computation
    logits = model(train_data_node)
    # NOTE(review): the logits are clipped to [1e-10, 1.0] before the
    # softmax cross-entropy; that looks unintended for raw logits but is
    # left as-is because changing it alters training results -- confirm.
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(
            tf.clip_by_value(logits, 1e-10, 1.0), train_labels_node))
    # L2 regularization for the fully connected parameters.
    regularizers = (tf.nn.l2_loss(fc1_weights) + tf.nn.l2_loss(fc1_biases))
    loss += 0.05 * regularizers

    tf.scalar_summary('loss', loss)

    # optimizer
    global_step = tf.Variable(0, name="global_step", trainable=False)
    # Kept as a variable so dev_step can assign a decayed value to it.
    learning_rate = tf.Variable(start_learning_rate, name="learning_rate")
    # learning_rate=tf.train.exponential_decay(start_learning_rate,global_step*BATCH_SIZE,train_size,0.9,staircase=True)

    optimizer = tf.train.AdamOptimizer(learning_rate)
    grads_and_vars = optimizer.compute_gradients(loss)
    train_op = optimizer.apply_gradients(grads_and_vars,
                                         global_step=global_step)

    # Evaluate model
    train_predict = tf.argmax(logits, 1)
    train_label = tf.argmax(train_labels_node, 1)
    # train accuracy
    train_correct_pred = tf.equal(train_predict, train_label)
    train_accuracy = tf.reduce_mean(tf.cast(train_correct_pred, tf.float32))
    tf.scalar_summary('acc', train_accuracy)
    merged = tf.merge_all_summaries()

    def compute_index(y_label, y_predict):
        """Print accuracy, recall and F1 with macro, micro and weighted averaging."""
        # macro
        print("{}: acc {:g}, recall {:g}, f1 {:g} ".format(
            "macro", accuracy_score(y_label, y_predict),
            recall_score(y_label, y_predict, average='macro'),
            f1_score(y_label, y_predict, average='macro')))
        # micro
        print("{}: acc {:g}, recall {:g}, f1 {:g} ".format(
            "micro", accuracy_score(y_label, y_predict),
            recall_score(y_label, y_predict, average='micro'),
            f1_score(y_label, y_predict, average='micro')))

        # weighted
        print("{}: acc {:g}, recall {:g}, f1 {:g} ".format(
            "weighted", accuracy_score(y_label, y_predict),
            recall_score(y_label, y_predict, average='weighted'),
            f1_score(y_label, y_predict, average='weighted')))

    def dev_step(x_batch, y_batch, best_test_loss, sess):
        """Score one evaluation batch and decay the learning rate if needed.

        Args:
            x_batch: Inputs fed to ``train_data_node``.
            y_batch: Labels fed to ``train_labels_node``.
            best_test_loss: Reference loss from earlier checks, or ``None``
                if no reference has been recorded yet.
            sess: Session used to run the graph.

        Returns:
            The reference loss to pass to the next call.
        """
        feed_dict = {
            train_data_node: x_batch,
            train_labels_node: y_batch,
            dropout_keep_prob: 1.0
        }
        # Run the graph and fetch some of the nodes.
        # test dont apply train_op (train_op is update gradient).
        summary, step, losses, lr, acc, y_label, y_predict = sess.run(
            [
                merged, global_step, loss, learning_rate, train_accuracy,
                train_label, train_predict
            ],
            feed_dict=feed_dict)
        test_writer.add_summary(summary, step)
        time_str = datetime.datetime.now().isoformat()
        print("{}: step {}, loss {:g}, lr {:g} ,acc {:g}".format(
            time_str, step, losses, lr, acc))
        # print("{}: step {}, loss {:g} ,acc {:g}".format(time_str, step, losses,acc))
        # compute index
        compute_index(y_label, y_predict)

        new_best_test_loss = best_test_loss
        # decide if need to decay learning rate
        if (step % steps_each_check < 100) and (step > 100):
            loss_delta = (best_test_loss
                          if best_test_loss is not None else 0) - losses
            if best_test_loss is not None and loss_delta < decay_delta:
                print(
                    'validation loss did not improve enough, decay learning rate'
                )
                # Decay, but never go below min_learning_rate.
                current_learning_rate = (
                    min_learning_rate
                    if lr * learning_rate_decay < min_learning_rate
                    else lr * learning_rate_decay)
                if current_learning_rate == min_learning_rate:
                    print('It is already the smallest learning rate.')
                sess.run(learning_rate.assign(current_learning_rate))
                print('new learning rate is: ', current_learning_rate)
            else:
                # update
                new_best_test_loss = losses

        return new_best_test_loss

    # run the training
    with tf.Session() as sess:
        train_writer = tf.train.SummaryWriter(FLAGS.summaries_dir + '/train',
                                              sess.graph)
        test_writer = tf.train.SummaryWriter(FLAGS.summaries_dir + '/test')
        tf.initialize_all_variables().run()
        print('Initialized!')
        # Generate batches
        batches = data_helpers.batch_iter(list(zip(x_train, y_train)),
                                          BATCH_SIZE, NUM_EPOCHS)
        # batch count
        batch_count = 0
        best_test_loss = None
        # Training loop.For each batch...
        for batch in batches:
            batch_count += 1
            if batch_count % EVAL_FREQUENCY == 0:
                # Evaluation iteration: the current batch is not trained on.
                print("\nEvaluation:")
                best_test_loss = dev_step(x_test, y_test, best_test_loss, sess)
                print("")
            else:
                if batch_count % META_FREQUENCY == 99:
                    # Training step that additionally records a full
                    # execution trace as run metadata.
                    x_batch, y_batch = zip(*batch)
                    feed_dict = {
                        train_data_node: x_batch,
                        train_labels_node: y_batch,
                        dropout_keep_prob: 0.5
                    }
                    # Run the graph and fetch some of the nodes.
                    # option
                    run_options = tf.RunOptions(
                        trace_level=tf.RunOptions.FULL_TRACE)
                    run_metadata = tf.RunMetadata()
                    _, summary, step, losses, acc = sess.run(
                        [train_op, merged, global_step, loss, train_accuracy],
                        feed_dict=feed_dict,
                        options=run_options,
                        run_metadata=run_metadata)
                    train_writer.add_run_metadata(run_metadata,
                                                  'step%03d' % step)
                    train_writer.add_summary(summary, step)
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: step {}, loss {:g},acc {:g}".format(
                        time_str, step, losses, acc))
                else:
                    # Plain training step.
                    x_batch, y_batch = zip(*batch)
                    feed_dict = {
                        train_data_node: x_batch,
                        train_labels_node: y_batch,
                        dropout_keep_prob: 0.5
                    }
                    # Run the graph and fetch some of the nodes.
                    _, summary, step, losses, acc = sess.run(
                        [train_op, merged, global_step, loss, train_accuracy],
                        feed_dict=feed_dict)
                    train_writer.add_summary(summary, step)
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: step {}, loss {:g}, acc {:g}".format(
                        time_str, step, losses, acc))

        train_writer.close()
        test_writer.close()
Exemplo n.º 26
0
#!/usr/bin/env python3

import json

import numpy as np
import pandas as pd
from sklearn.metrics import classification_report

from load_data import load_test_data
from predict import predict

# Score the model on the held-out data and write a JSON classification report.
X_df, y_df = load_test_data()
y_pred = predict(X_df)
# Binary target: 1 when the PINCP column exceeds 84770, otherwise 0.
y_true = (y_df['PINCP'] > 84770).astype(int)

# drop nans
nans = np.isnan(y_pred).ravel()
failures = np.sum(nans)
y_true_clean, y_pred_clean = y_true[~nans], y_pred[~nans]
# target_names are matched to the labels in sorted order, so the name for
# label 0 (PINCP not above the threshold) must come first. The previous
# order attached 'High Income' to label 0.
report = classification_report(y_true_clean,
                               y_pred_clean,
                               target_names=['Low Income', 'High Income'],
                               output_dict=True)
# Fraction of all samples for which the prediction was NaN.
report['failures'] = failures / len(y_true)

with open('./report.json', 'w') as f:
    json.dump(report, f)

print('done')
Exemplo n.º 27
0
# --- training hyper-parameters ---
batch_size = 1024  # samples per mini batch during training
strides = 9        # CNN strides
kernel_size = 9    # size of the 1D convolutional kernels
filters = 4        # number of convolutional kernels
noise = 0.04       # standard deviation of the additive white Gaussian noise

# Path and file name under which models are saved.
model_path_name = r"specify_path_and_file_name"

# --- data loading ---
# Training set: the spectral model data set (NNii) is active. To train on the
# pure component spectra (NNi) instead, use
# load_data.load_pure_component_spectra_training_data().
X, Y = load_data.load_spectral_model_training_data()
X_val_meas, Y_val_meas = load_data.load_validation_data()
X_test_meas, Y_test_meas, Y_test_meas_nfNMR_IHM = load_data.load_test_data()

# Scaling factors are the maxima of the raw training inputs and labels.
X_scaling_factor = np.max(X)
label_factor = np.max(Y)

# --- scale and reshape spectra, labels and ground truth; add channel dim ---
(X, X_val_meas, X_test_meas,
 Y, Y_val_meas, Y_test_meas,
 Y_test_meas_nfNMR_IHM) = scale_reshape.scale_add_channels(
    X, X_val_meas, X_test_meas,
    Y, Y_val_meas, Y_test_meas, Y_test_meas_nfNMR_IHM,
    X_scaling_factor, label_factor)

# --- augment the training inputs with zero-mean Gaussian noise ---
X = X + np.random.normal(0, noise, np.shape(X))

# --- build and compile the model; input shape excludes the sample axis ---
input_shape = X.shape[1:]
model = model_def.CNN_model(filters, kernel_size, input_shape, strides)
Exemplo n.º 28
0
                        ])
    print(history.history.keys())
    print(history)

else:
    if len(saved_weights) == 0:
        print("network hasn't been trained!")
        sys.exit()
    else:
        test_sample_num = 0

        test_sentences = pickle.load(open('sentences_test', 'rb'))
        test_roots = pickle.load(open('rootwords_test', 'rb'))
        test_features = pickle.load(open('features_test', 'rb'))

        X_test, X_unique, y_unique = load_test_data(test_sentences, test_roots,
                                                    X_word_to_ix)

        X_test = pad_sequences(X_test,
                               maxlen=X_max_len,
                               dtype='int32',
                               padding='post')

        model.load_weights(saved_weights)

        plot_model(model, to_file="model2_arch.png", show_shapes=True)

        predictions = np.argmax(model.predict(X_test), axis=2)
        print(predictions)

        sequences = []
Exemplo n.º 29
0
        sc_logloss = np.mean(list_logloss_score)
        sc_gini = np.mean(list_gini_score)
        if min_score > sc_gini:
            min_score = sc_gini
            min_params = params
        logger.info('logloss: {}, gini: {}'.format(sc_logloss, sc_gini))
        logger.info('current min score: {}, params: {}'.format(
            min_score, min_params))

    logger.info('minimum params: {}'.format(min_params))
    logger.info('minimum gini: {}'.format(min_score))

    clf = LogisticRegression(**min_params)
    clf.fit(x_train, y_train)

    logger.info('train end')

    df = load_test_data()

    x_test = df[use_cols].sort_values('id')

    logger.info('test data load end {}'.format(x_test.shape))

    pred_test = clf.predict_proba(df)[:, 1]

    df_submit = pd.read_csv(SAMPLE_SUBMIT_FILE).sort_values('id')
    df_submit['target'] = pred_test

    df_submit.to_csv(DIR + 'submit.csv', index=False)
    logger.info('end')
Exemplo n.º 30
0
import numpy as np 
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from load_data import load_test_data, load_train_data
from data_cleaning import clean_data

# Load and clean both data sets; the passenger id is dropped from the
# training frame.
train_data = clean_data(load_train_data())
train_data.drop(columns=['PassengerId'], inplace=True)
test_data = clean_data(load_test_data())

# Stratified 70/30 hold-out split of the training data. The first column is
# used as the label, all remaining columns as features.
train, test = train_test_split(
    train_data,
    test_size=0.3,
    random_state=0,
    stratify=train_data['Survived'],
)
train_feature_cols, train_label_cols = train.columns[1:], train.columns[:1]
test_feature_cols, test_label_cols = test.columns[1:], test.columns[:1]
train_X = train[train_feature_cols]
train_Y = train[train_label_cols]
test_X = test[test_feature_cols]
test_Y = test[test_label_cols]

# Full training set.
X = train_data[train_data.columns[1:]]
Y = train_data['Survived']

# Hyper-parameter grid for AdaBoost: 100..1000 estimators in steps of 100,
# crossed with a list of learning rates.
n_estimators = list(range(100, 1100, 100))
learn_rate = [0.05, 0.1, 0.2, 0.3, 0.25, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
hyper = {'n_estimators': n_estimators, 'learning_rate': learn_rate}
gd = GridSearchCV(
    estimator=AdaBoostClassifier(),
    param_grid=hyper,
    verbose=True,
)
Exemplo n.º 31
0
    for i in range(15):
        logger.debug('\t{0:20s} : {1:>10.6f}'.format(
                            df_tmp.ix[i, 0], df_tmp.ix[i, 1]))
    return model

# Script entry point: load the train and test frames and log their shapes.
if __name__ == '__main__':
    logger.info('Start')

    # Disabled variant that left-joins the city population table onto the
    # training data on the 'city' column:
    # temp1_df = load_train_data(nrows=ROW)
    # temp2_df = pd.read_csv('../input/city_population_wiki_v3.csv')
    # train_df = pd.merge(temp1_df, temp2_df, on='city', how='left')
    # del temp1_df, temp2_df
    # ROW is forwarded as the loaders' `nrows` argument.
    train_df = load_train_data(nrows=ROW)
    logger.info('Train Data load end {}'.format(train_df.shape))

    test_df = load_test_data(nrows=ROW)
    logger.info('test load end {}'.format(test_df.shape))

    # test_df = load_period_train_data(nrows=ROW)
    # logger.info('period train load end {}'.format(test_df.shape))

    # pr_test_df = load_period_test_data(nrows=ROW)
    # logger.info('period test load end {}'.format(pr_test_df.shape))

    # test_df = load_train_act_data(nrows=ROW)
    # tmp_df = pd.read_csv(TRN_PRED_FILE, index_col=['item_id'])
    # trn_act_df = load_train_act_data(nrows=ROW)
    # trn_act_df = trn_act_df.join(tmp_df, how='left')
    # train_df = pd.concat([train_df, trn_act_df], axis=0)
    # del trn_act_df, tmp_df
    f1 = f1_score(true, predict, average="binary")
    precision_binary, recall_binary, fbeta_score_binary, _ = precision_recall_fscore_support(
        true, predict, average="binary"
    )
    accuracy = accuracy_score(true, predict)
    print("正确率(Accuracy):%.3f\nF值(Macro-F score):%.3f" % (accuracy, f1))
    print("精确度(Precision):%.3f\n召回率:%.3f\nF值: %.3f" % (precision_binary, recall_binary, fbeta_score_binary))
    log_performance(accuracy, f1, precision_binary, recall_binary, len(true))
    if figure == False:
        return
    # 画图
    n_groups = 5
    values = (accuracy, f1, precision_binary, recall_binary, fbeta_score_binary)
    fig, ax = plt.subplots()
    index = np.arange(n_groups)
    bar_width = 0.35
    rects1 = plt.bar(index + bar_width / 2, values, bar_width, alpha=0.6, color="b")
    plt.xlabel("Result")
    plt.ylabel("Scores")
    plt.title("Experiment analysis")
    plt.xticks(index + bar_width, ("Accuracy", "F", "Precision", "Recall", "F"))
    plt.ylim(0, 1)
    plt.tight_layout()
    plt.show()


# Script entry point: load the pickled predictions and the true test labels,
# then report the evaluation metrics.
if __name__ == "__main__":
    predict = load_pickle("./data/predict_labels/predict_labels.p")
    # Only the second value returned by load_test_data (the labels) is used.
    _, true_labels = load_test_data()
    # NOTE(review): the metric code above passes its inputs as
    # (true, predict); confirm that analysis_result's parameter order matches
    # this (predict, true_labels) call.
    analysis_result(predict, true_labels)