Example #1
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--save-dir', '-s', type=str, default='model')
    parser.add_argument('--val', '-v', dest='val', action='store_true')
    parser.add_argument('--test', '-t', dest='val', action='store_false')
    parser.add_argument('--resize', dest='resize', action='store_true')
    parser.set_defaults(val=False, resize=False)
    args = parser.parse_args()

    if args.val:
        images, labels = load_data(True)
        _, val_dset = prepare_dataset(images,
                                      labels,
                                      train=True,
                                      augment=False)
    else:
        images, labels = load_data(False)
        test_dset = prepare_dataset(images,
                                    labels,
                                    train=False,
                                    resize=args.resize)
    dataset = val_dset if args.val else test_dset

    graph = tf.Graph()
    with graph.as_default():
        with tf.Session(graph=graph) as sess:
            tf.saved_model.loader.load(sess,
                                       [tf.saved_model.tag_constants.SERVING],
                                       args.save_dir)
            X = graph.get_tensor_by_name('images_ph:0')
            logits_op = graph.get_tensor_by_name('dense_2/BiasAdd:0')
            acc, num_correct, num_samples = check_accuracy(
                sess, dataset, X, logits_op)
        print('{} acc: {:.2%} ({}/{})'.format('val' if args.val else 'test',
                                              acc, num_correct, num_samples))
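A hedged usage sketch for the evaluation script above; it assumes the usual imports (argparse, tensorflow) plus the project's load_data / prepare_dataset / check_accuracy helpers, and the script name evaluate.py is hypothetical:

# Hypothetical invocations (script name and paths are assumptions):
#   python evaluate.py --save-dir model --val            # report accuracy on the validation split
#   python evaluate.py --save-dir model --test --resize  # report accuracy on the test split, resizing inputs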
Example #2
def get_test_data(test_file=None, level='word'):
    if level == 'word':
        x_a_test, x_b_test, vocab = load_data(test_file,
                                              level=level,
                                              test=True)
        return [x_a_test, x_b_test], vocab
    else:
        x_a_test, x_b_test, x_a_char_test, x_b_char_test, vocab = load_data(
            test_file, level=level, test=True)
        return [x_a_test, x_b_test, x_a_char_test, x_b_char_test], vocab
Example #3
def get_data(train_file=None, test_file=None, level='word'):
    if level == 'word':
        x_a_train, x_b_train, x_c_train, vocab = load_data(train_file,
                                                           level=level)
        x_a_test, x_b_test, _ = load_data(test_file, level=level, test=True)
        return [x_a_train, x_b_train, x_c_train], [x_a_test, x_b_test], vocab
    else:
        x_a_train, x_b_train, x_c_train, x_a_char_train, x_b_char_train, x_c_char_train, vocab = \
            load_data(train_file, level=level)
        x_a_test, x_b_test, x_a_char_test, x_b_char_test, _ = load_data(
            test_file, level=level, test=True)
        return [x_a_train, x_b_train, x_c_train, x_a_char_train, x_b_char_train, x_c_char_train], \
               [x_a_test, x_b_test, x_a_char_test, x_b_char_test], vocab
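A minimal usage sketch for the two loaders above, assuming word-level data; the file paths are placeholders:

# Hypothetical call at word level: three train inputs, two test inputs, one shared vocab.
train_inputs, test_inputs, vocab = get_data(train_file='data/train.txt',
                                            test_file='data/test.txt',
                                            level='word')
x_a_train, x_b_train, x_c_train = train_inputs
x_a_test, x_b_test = test_inputs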
Example #4
def preprocess():
    # Load Data
    print("Data Preprocess Stage...")
    data_text, class_list = data_preprocess.load_data(FLAGS.data_file, FLAGS.class_file, FLAGS.char)

    # Build Vocabulary
    data_max_length = max([len(s.split(" ")) for s in data_text])
    print("Data Max Length: ", data_max_length)
    data_processor = learn.preprocessing.VocabularyProcessor(data_max_length)
    print("Data Processor Made")
    x = np.array(list(data_processor.fit_transform(data_text)))
    del data_text
    print("Data Transformed to NPArray")

    class_processor = learn.preprocessing.VocabularyProcessor(1)
    print("Class Processor Made")
    y_np = np.array(list(class_processor.fit_transform(class_list)))
    del class_list
    print("Class Transformed to NPArray")
    y_max = np.max(y_np)
    print("Number of Class: ", y_max)

    #y = np.zeros((y_np.shape[0], y_max), dtype=int)
    #print("Zero NPArray for Class Made")
    #y_np = y_np.ravel()
    #y[np.arange(y_np.size), y_np-1] = 1
    #y = tf.one_hot(y_np, y_max)
    #print("One-Hot Encoding for Class Finished")
    #del y_np

    # Randomly shuffle data
    np.random.seed(10)
    dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y_np)))
    shuffle_indices = np.random.permutation(np.arange(len(y_np)))
    shuffle_indices = shuffle_indices[dev_sample_index:]
    #x_shuffled = x[shuffle_indices]
    #del x
    #y_shuffled = y[shuffle_indices]
    #del y

    # Split train/test set
    # TODO: This is very crude, should use cross-validation
    # dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
    # x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
    # y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]

    x_dev = x[shuffle_indices]
    #np.delete(x, shuffle_indices)
    y_dev = y_np[shuffle_indices]
    #np.delete(y, shuffle_indices)

    #del x_shuffled, y_shuffled

    if FLAGS.char:
        print("Data Character Size: {:d}".format(len(data_processor.vocabulary_)))
        print("Class List Size: {:d}".format(len(class_processor.vocabulary_)))
        print("Train/Dev split: {:d}/{:d}".format(len(y_np), len(y_dev)))

    return x, y_np, data_processor, class_processor, x_dev, y_dev
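The commented-out block in preprocess() one-hot encodes the 1-based class ids produced by the VocabularyProcessor. A self-contained sketch of that conversion, with made-up data, in case it ever needs to be re-enabled:

import numpy as np

def to_one_hot(y_np, num_classes):
    # y_np holds 1-based class ids of shape (n, 1), as produced by fit_transform above.
    y_np = y_np.ravel()
    y = np.zeros((y_np.size, num_classes), dtype=int)
    y[np.arange(y_np.size), y_np - 1] = 1  # shift to 0-based column indices
    return y

print(to_one_hot(np.array([[1], [3], [2]]), 3))  # [[1 0 0] [0 0 1] [0 1 0]]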
Example #5
def test_env():
    path = '../master_capston/the-movies-dataset/'
    features_embedding_movies = pd.read_csv(
        os.path.join(path, 'movie_embedding_features.csv'))
    sampler = LTSDocumentSampler(dataset=features_embedding_movies)

    # the number of items in the recommendation slate returned by the agent
    slate_size = 3

    # assumed to be the number of candidate items made available to the agent for each slate
    num_candidates = 10

    format_data = data_preprocess.load_data(path)
    # print(format_data.head())
    # print(format_data.shape)

    features_embedding_movies = pd.read_csv(
        os.path.join(path, 'movie_embedding_features.csv'))
    positive_user_ids, positive_history_data = data_preprocess.get_user_positive(
        format_data)
    user_sampler = LTSStaticUserSampler(positive_user_ids,
                                        positive_history_data,
                                        features_embedding_movies)

    LTSUserModel = UserModel(user_sampler, slate_size, LTSResponse)

    ltsenv = environment.Environment(LTSUserModel,
                                     sampler,
                                     num_candidates,
                                     slate_size,
                                     resample_documents=True)
    lts_gym_env = recsim_gym.RecSimGymEnv(ltsenv, clicked_engagement_reward)

    observation_0 = lts_gym_env.reset()
    # print(observation_0['user'][:5])
    # print('Observation 0')
    # print('Available documents')
    # doc_strings = ['doc_id ' + key + " kaleness " + str(value) for key, value
    #                in observation_0['doc'].items()]
    # print('\n'.join(doc_strings))

    recommendation_slate_0 = [0, 1, 2]
    observation_1, reward, done, _ = lts_gym_env.step(recommendation_slate_0)

    print(observation_1['user'][:5])
    # print('Noisy user state observation')
    # print(observation_0['user'])

    print(lts_gym_env.observation_space)
    print(lts_gym_env.action_space)


# test_doc_model()

# test_user_model()
# test_env()
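A hedged sketch of how the environment built in test_env() could be driven for a whole session instead of a single step; it relies only on the Gym-style reset/step interface already used above and would sit inside test_env():

# Hypothetical session loop: keep recommending the same fixed slate until the
# simulated user's session ends.
observation = lts_gym_env.reset()
done = False
while not done:
    observation, reward, done, _ = lts_gym_env.step([0, 1, 2])
    print('reward:', reward)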
Example #6
def main(unused_argv):
    # Load training and eval data
    (train_data,
     train_labels) = dt.load_data("E:\\BUAA\\实验室\\阶段2:\\training_set")
    train_data = train_data.astype(np.float32)
    #train_labels = train_labels.astype(np.float32)
    print(train_labels.dtype)
    (eval_data, eval_labels) = dt.load_data("E:\\BUAA\\实验室\\阶段2:\\test_set")
    eval_data = eval_data.astype(np.float32)
    #eval_labels = eval_labels.astype(np.float32)

    # Create the Estimator
    road_classifier = tf.estimator.Estimator(
        model_fn=cnn_model_fn,
        model_dir="E:\\BUAA\\实验室\\阶段2:\\road_convnet_model")

    # Set up logging for predictions
    tensors_to_log = {"probabilities": "softmax_tensor"}
    logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log,
                                              every_n_iter=50)

    # Train the model

    train_input_fn = tf.estimator.inputs.numpy_input_fn(x={"x": train_data},
                                                        y=train_labels,
                                                        batch_size=100,
                                                        num_epochs=None,
                                                        shuffle=True)
    road_classifier.train(input_fn=train_input_fn,
                          steps=20000,
                          hooks=[logging_hook])

    # Evaluate the model and print results
    eval_input_fn = tf.estimator.inputs.numpy_input_fn(x={"x": eval_data},
                                                       y=eval_labels,
                                                       num_epochs=1,
                                                       shuffle=False)
    eval_results = road_classifier.evaluate(input_fn=eval_input_fn)
    print(eval_results)
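The snippet above assumes a cnn_model_fn defined elsewhere. Below is a minimal hedged sketch of such a TF 1.x model_fn; the flatten/dense layers and the two-class output are illustrative assumptions, and only the "softmax_tensor" name is taken from the logging hook above:

def cnn_model_fn(features, labels, mode):
    # Illustrative body only: a single dense layer over the flattened input.
    net = tf.layers.flatten(features["x"])
    logits = tf.layers.dense(net, units=2)
    probabilities = tf.nn.softmax(logits, name="softmax_tensor")  # logged by LoggingTensorHook
    predictions = {"classes": tf.argmax(logits, axis=1), "probabilities": probabilities}
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
    if mode == tf.estimator.ModeKeys.TRAIN:
        train_op = tf.train.GradientDescentOptimizer(0.001).minimize(
            loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
    eval_metric_ops = {
        "accuracy": tf.metrics.accuracy(labels=labels, predictions=predictions["classes"])
    }
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)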
Example #7
def test_user_model():
    path = '../master_capston/the-movies-dataset/'
    format_data = data_preprocess.load_data(path)
    # print(format_data.head())
    # print(format_data.shape)

    features_embedding_movies = pd.read_csv(
        os.path.join(path, 'movie_embedding_features.csv'))
    positive_user_ids, positive_history_data = data_preprocess.get_user_positive(
        format_data)
    user_sampler = LTSStaticUserSampler(positive_user_ids,
                                        positive_history_data,
                                        features_embedding_movies)
    current_user = user_sampler.sample_user()
    current_user.create_observation()
def main():
    path = 'dataset/'
    use_100k = True
    if use_100k:
        rating_file_name = 'process_small_rating.csv'
        embedding_file_name = 'movie_embedding_features.csv'
    else:
        rating_file_name = 'process_1m_rating.csv'
        embedding_file_name = 'movie_embedding_features_1m.csv'

    # train_mode = True
    np.random.seed(0)
    test_mode = True
    offline_mode = True
    sample_user_randomly = False
    save_model_path = "model_test/save_model/49"
    # build environment

    # the number of items in the recommendation slate returned by the agent
    slate_size = 5

    # assumed to be the number of candidate items made available to the agent for each slate
    num_candidates = 30

    rating_pivot = 4
    time_budget = -1
    time_budget_range = [2, 6]
    min_num_positive_rating = 40
    min_num_rating = 70
    resample_documents = True

    format_data = data_preprocess.load_data(path, file_name=rating_file_name)
    features_embedding_movies = pd.read_csv(
        os.path.join(path, embedding_file_name))
    positive_user_ids, positive_history_data = data_preprocess.get_user_positive(
        format_data, number_positive_pivot=min_num_positive_rating)
    print("unique user id : ", len(np.unique(positive_user_ids)))
    print(len(positive_history_data))

    # restrict the total number of ratings
    positive_user_ids, positive_history_data = data_preprocess.generate_new_dataset(
        positive_history_data, num_rating_pivot=min_num_rating)
    print("unique user id : ", len(np.unique(positive_user_ids)))
    print(len(positive_history_data))

    # generate train and test set
    # user_details = positive_history_data.groupby('userId').size().reset_index()
    # user_details.columns = ['userId', 'number of rating']
    # print(user_details.describe())

    train_set, test_set = data_preprocess.generate_train_test_data(
        positive_history_data)
    print("train set size : ", len(train_set))
    print("test set size : ", len(test_set))
    users_history_data, train_set = data_preprocess.create_recent_history(
        train_set)
    train_set = train_set.astype({'rating': 'float64'})
    test_set = test_set.astype({'rating': 'float64'})
    print("user history set size : ", len(users_history_data))
    print("new train set size : ", len(train_set))
    user_details = test_set.groupby('userId').size().reset_index()
    user_details.columns = ['userId', 'number of rating']
    print(user_details.describe())

    # check the train set and test set quality

    offline_dataset = train_set
    if test_mode:
        offline_dataset = test_set

    sampler = LTSDocumentSampler(dataset=features_embedding_movies,
                                 num_candidate=num_candidates)
    user_sampler = LTSStaticUserSampler(users_history_data,
                                        features_embedding_movies,
                                        offline_data=offline_dataset,
                                        offline_mode=offline_mode,
                                        time_budget=time_budget,
                                        random=sample_user_randomly,
                                        time_budget_range=time_budget_range)
    # TODO: handle updating the dataset when num_candidates and the number of available items differ

    func_select_train_set = select_dataset(features_embedding_movies,
                                           train_set)
    func_select_test_set = select_dataset(features_embedding_movies, test_set)

    # user_train_set = func(user_id=39)
    # print(len(user_train_set))

    LTSUserModel = UserModel(user_sampler,
                             offline_mode=offline_mode,
                             rating_pivot=rating_pivot,
                             slate_size=slate_size,
                             response_ctor=LTSResponse)

    ltsenv = CustomSingleUserEnviroment(
        LTSUserModel,
        sampler,
        num_candidates,
        slate_size,
        resample_documents=resample_documents,
        offline_mode=offline_mode,
        select_subset_func=func_select_train_set)

    if test_mode:
        ltsenv = CustomSingleUserEnviroment(
            LTSUserModel,
            sampler,
            num_candidates,
            slate_size,
            resample_documents=resample_documents,
            offline_mode=offline_mode,
            select_subset_func=func_select_test_set)

    lts_gym_env = recsim_gym.RecSimGymEnv(ltsenv, clicked_engagement_reward)
    max_num_user = len(np.unique(users_history_data['userId']))

    with tf.Session() as sess:
        popularAgent = PopularityRecommender(train_set, slate_size, "1")

        # RL_Agent = create_RL_Agent(sess,lts_gym_env,slate_size,save_model_path=save_model_path)
        # contentAgent = contentModel(slate_size,embedding_size=30)

        evaluate_agent_offline(popularAgent, slate_size, max_num_user,
                               lts_gym_env)
Example #9
File: my_NN.py  Project: Ianiusha/AutoLSF
def build_NN_classifier(filename, option, model_name=None):

    # LOAD DATA
    descriptors = qm_descriptors
    X, Y = data_preprocess.load_data(filename, descriptors)

    # IF DOWNSAMPLING:
    #print('>> Down sampling.')
    #smaller_x, smaller_y = data_preprocess.do_down_sampling(X,Y)

    if option == 'default':

        print('Training NN...')
        print('*-----------------------------*')
        print('Training on default parameters.')

        accuracies_default = []
        for i in range(10):
            x_train, x_valid, y_train, y_valid = data_preprocess.split_data(
                X, Y, partition=0.20)
            accuracies_default.append(
                train_NN(x_train, y_train, x_valid, y_valid))

        print('Average accuracy over 10 default runs: %.2f' %
              numpy.mean(accuracies_default))

    elif option == 'train':

        print('*-----------------------------*')
        print('Searching for best parameters.')

        params = []
        accuracies = []

        for i in range(10):
            x_train, x_valid, y_train, y_valid = data_preprocess.split_data(
                X, Y, partition=0.20)
            best_parameters = scan_parameters(x_train, y_train)
            params.append(best_parameters)
            accuracy = train_NN(x_train, y_train, x_valid, y_valid,
                                best_parameters)
            accuracies.append(accuracy)

        print('*-----------------------------*')
        print('Summary of Results.')
        print('*-----------------------------*')

        for i in range(len(accuracies)):
            print('Run ' + str(i + 1) + ' ', params[i], ' : ', accuracies[i])

    elif option == 'test':

        print('TESTING')
        print('*-----------------------------*')

        hidden_layer_sizes = (100, 100)
        solver = 'adam'
        alpha = 0.001

        params_dict = {
            'hidden_layer_sizes': hidden_layer_sizes,
            'solver': solver,
            'alpha': alpha,
            'max_iter': [400]
        }

        print(params_dict)

        acc_list = []
        for i in range(10):
            x_train, x_valid, y_train, y_valid = data_preprocess.split_data(
                X, Y, partition=0.20)
            acc_list.append(
                train_NN(x_train, y_train, x_valid, y_valid, params_dict))

        print('Summary of Results.')
        print('*-----------------------------*')
        print('Average accuracy over 10 runs: %.2f' % numpy.mean(acc_list))
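A hedged usage sketch of the three option branches above; the CSV path is a placeholder and qm_descriptors is assumed to be defined at module level:

# Hypothetical calls:
build_NN_classifier('descriptors.csv', 'default')  # 10 runs with default parameters
build_NN_classifier('descriptors.csv', 'train')    # per-run parameter scan via scan_parameters
build_NN_classifier('descriptors.csv', 'test')     # fixed hyper-parameters from params_dict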
Example #10
File: train.py  Project: wudapeng268/hcan
def main(options):
    args = get_default_args()
    set_args(args, options)
    mode, dataset_name = args['mode'], args['dataset']

    # default setting
    args['raw_data'] = "data/%s/" % args['dataset']
    args['qrels_file'] = "data/%s/qrels.all.txt" % args['dataset']
    print_args(args)

    # get train/val/test names for specific dataset
    train_name, val_name, test_name, train_set, val_set, test_set, num_classes, with_url = config_dataset(
        args)

    max_query_len, max_doc_len, max_url_len = defaultdict(int), defaultdict(
        int), defaultdict(int)
    vocab = {'word': {}, '3gram': {}}
    test_vocab = {'word': {}, '3gram': {}}
    train_vocab_emb, test_vocab_emb = None, None

    ############################# LOAD DATA ##################################
    data_name = ("data_m%s_%s_%s_%s" %
                 (mode, dataset_name, train_name, test_name)).lower()
    if args["load_data"]:
        train_dataset, vocab, train_vocab_emb, max_query_len, max_doc_len, max_url_len = load_data(
            "%s/%s/%s" % (args["experimental_data"], data_name, train_name),
            True)
        test_dataset, test_vocab, test_vocab_emb, _, _, _ = load_data(
            "%s/%s/%s" % (args["experimental_data"], data_name, test_name),
            False)
        if dataset_name != 'twitter' and dataset_name != 'TwitterURL':
            val_dataset, _, _, _, _, _ = load_data(
                "%s/%s/%s" % (args["experimental_data"], data_name, val_name),
                False)
        if args['embedding'] == 'glove':
            train_vocab_emb, test_vocab_emb = construct_vocab_emb(
                "%s/%s" % (args["experimental_data"], data_name),
                vocab['word'],
                test_vocab['word'],
                300,
                "word",
                base_embed_path=args["base_embed_path"],
                type=args["embedding"])
        print('load dataset successfully')
    else:
        train_dataset = gen_data(args["raw_data"], train_set, vocab,
                                 test_vocab, True, max_query_len, max_doc_len,
                                 max_url_len, num_classes, args)
        print("create training set successfully...")
        if dataset_name != 'twitter' and dataset_name != 'TwitterURL':
            val_dataset = gen_data(args["raw_data"], val_set, vocab,
                                   test_vocab, False, max_query_len,
                                   max_doc_len, max_url_len, num_classes, args)
            print("create validation set successfully...")

        test_dataset = gen_data(args["raw_data"], test_set, vocab, test_vocab,
                                False, max_query_len, max_doc_len, max_url_len,
                                num_classes, args)
        train_vocab_emb, test_vocab_emb = construct_vocab_emb(
            "%s/%s" % (args["experimental_data"], data_name),
            vocab['word'],
            test_vocab['word'],
            300,
            "word",
            base_embed_path=args["base_embed_path"])
        save_data(
            "%s/%s/%s" % (args["experimental_data"], data_name, train_name),
            True, train_dataset, max_query_len, max_doc_len, max_url_len,
            vocab, train_vocab_emb)
        print("save training set successfully...")
        if dataset_name != 'twitter' and dataset_name != 'TwitterURL':
            save_data("%s/%s/%s" %
                      (args["experimental_data"], data_name, val_name),
                      False,
                      val_dataset,
                      vocab=test_vocab,
                      vocab_emb=test_vocab_emb)
            print("save val set successfully...")
        save_data("%s/%s/%s" %
                  (args["experimental_data"], data_name, test_name),
                  False,
                  test_dataset,
                  vocab=test_vocab,
                  vocab_emb=test_vocab_emb)
        print("save test set successfully...")

    if dataset_name == 'twitter' or dataset_name == 'TwitterURL':
        val_split = args['val_split']
        num_samples, _ = train_dataset["query_word_input"].shape
        # randomly sample queries and all their documents if query_random is True
        # otherwise, query-doc pairs are randomly sampled
        query_random = True if dataset_name == 'twitter' else False
        if query_random:
            del train_dataset["overlap_feat"]
            val_indices = sample_aaai_val_set(args["raw_data"], train_set,
                                              val_split)
        else:
            val_split = 0.1
            val_indices, val_set = [], set()
            for i in range(int(num_samples * val_split)):
                val_index = np.random.randint(num_samples)
                while val_index in val_set:
                    val_index = np.random.randint(num_samples)
                val_indices.append(val_index)
                val_set.add(val_index)

        val_dataset = {}
        for key in train_dataset:
            #print(key, train_dataset[key].shape)
            val_dataset[key] = train_dataset[key][val_indices]
            train_dataset[key] = np.delete(train_dataset[key], val_indices, 0)

    # shuffle the train dataset explicitly to make results reproducible
    # whether the performance will be affected remains a question
    keys, values = [], []
    for key in train_dataset:
        if train_dataset[key].size == 0:
            continue
        keys.append(key)
        values.append(train_dataset[key])
    zipped_values = list(zip(*values))
    random.shuffle(zipped_values)
    shuffled_values = list(zip(*zipped_values))
    for i, key in enumerate(keys):
        train_dataset[key] = np.array(shuffled_values[i])
    print('after shuffle:', train_dataset['id'][:5], train_dataset['sim'][:5],
          train_dataset['query_word_input'][:5])

    # merge the vocabulary of the train and test sets
    merged_vocab = {}
    merged_vocab['word'] = merge_two_dicts(vocab['word'], test_vocab['word'])
    merged_vocab['3gram'] = merge_two_dicts(vocab['3gram'],
                                            test_vocab['3gram'])
    print("TRAIN vocab: word(%d) 3gram(%d)" %
          (len(vocab['word']), len(vocab['3gram'])))
    print("TEST vocab: word(%d) 3gram(%d)" %
          (len(test_vocab['word']), len(test_vocab['3gram'])))
    print("MERGED vocab: word(%d) 3gram(%d)" %
          (len(merged_vocab['word']), len(merged_vocab['3gram'])))

    vocab_inv, vocab_size = {}, {}
    for key in vocab:
        vocab_inv[key] = invert_dict(merged_vocab[key])
        vocab_size[key] = len(vocab[key])
    print(vocab_size)

    # Print data samples for debug purpose
    print_dataset(mode, train_dataset, vocab_inv)
    print_dataset(mode, test_dataset, vocab_inv)

    ############################ TRAIN MODEL #################################
    # create model
    model = create_attention_model(max_query_len,
                                   max_doc_len,
                                   max_url_len,
                                   vocab_size,
                                   train_vocab_emb,
                                   args["nb_filters"],
                                   args["nb_layers"],
                                   embed_size=300,
                                   dropout_rate=args['dropout'],
                                   trainable=args["trainable"],
                                   weighting=args['weighting'],
                                   mask=args["mask"],
                                   conv_option=args['conv_option'],
                                   model_option=args['model_option'],
                                   join=args['join'],
                                   num_classes=num_classes,
                                   with_url=with_url,
                                   highway=args['highway'],
                                   att=args['co_attention'],
                                   ext_feat=args["external_feat"],
                                   encoder_option=args['encoder_option'])
    model_name = (
        "model_N%s_data%s_mo%s_e%s_c%s_NumFilter%d_nblayer%d_T%s_D%.1f_W%s_M%s_B%d_Val%.2f_Join%s_H%s_Att%s"
        % (mode, train_name, args['model_option'], args["encoder_option"],
           args['conv_option'], args["nb_filters"], args["nb_layers"],
           args["trainable"], args['dropout'], args['weighting'], args['mask'],
           args['batch_size'], args['val_split'], args['join'],
           args['highway'], args['co_attention'])).lower()
    model_path = "%s/%s/%s" % (args['experimental_data'], data_name,
                               model_name)
    print(model_path)

    if args['optimizer'] == "adam":
        opt = optimizers.Adam(lr=args["learning_rate"],
                              beta_1=0.9,
                              beta_2=0.999,
                              epsilon=None,
                              decay=0.0,
                              amsgrad=True)
        print('use Adam optimizer')
    elif args['optimizer'] == "sgd":
        opt = optimizers.SGD(lr=args["learning_rate"],
                             decay=1e-6,
                             momentum=0.9,
                             nesterov=True)
        print('use SGD optimizer')
    elif args['optimizer'] == 'rmsprop':
        opt = optimizers.RMSprop(lr=args["learning_rate"],
                                 rho=0.9,
                                 epsilon=None,
                                 decay=0.0)
        print('use RMSprop optimizer')

    if num_classes <= 2:
        model.compile(loss='binary_crossentropy',
                      optimizer=opt,
                      metrics=['accuracy'])
    else:
        print('compile model with categorical cross-entropy')
        model.compile(loss='categorical_crossentropy',
                      optimizer=opt,
                      metrics=['accuracy'])
    class_weight = None
    if args['dataset'] == 'Quora':
        #class_weight = {0:1, 1:2}
        print('apply class weight:', class_weight)

    print(model.summary())
    print('model init weights sum: %.4f' % get_model_weights(model))
    if not args['load_model']:
        early_stopping = EarlyStopping(monitor='val_loss', patience=4)
        checkpoint = ModelCheckpoint(filepath=model_path + ".best.weights",
                                     monitor='val_loss',
                                     save_best_only=True,
                                     verbose=1)
        lr_reducer = ReduceLROnPlateau(monitor='val_loss',
                                       factor=0.5,
                                       patience=2,
                                       min_lr=0.0001,
                                       verbose=1)
        model.fit(
            train_dataset,
            train_dataset['sim'],  #validation_split=0.05,
            batch_size=args['batch_size'],
            validation_data=(val_dataset, val_dataset['sim']),
            epochs=args['epochs'],
            shuffle=False,
            callbacks=[checkpoint, lr_reducer, early_stopping],
            class_weight=class_weight,
            verbose=args['verbose'])

    ############################ TEST MODEL #################################
    print('load best model from %s.best.weights' % model_path)
    model.load_weights("%s.best.weights" % model_path)
    # load trained vocab embedding.
    trained_vocab_emb = model.get_layer('word-embedding').get_weights()[0]
    # merge trained vocab embedding with test OOV word embeddings
    merged_vocab_emb = np.zeros(shape=(len(merged_vocab['word']), 300))
    merged_vocab_emb[0:len(vocab['word']), :] = trained_vocab_emb
    merged_vocab_emb[
        len(vocab['word']):len(merged_vocab['word']), :] = test_vocab_emb
    for key in vocab:
        vocab_size[key] = len(merged_vocab[key])
    print(vocab_size)

    new_model = create_attention_model(max_query_len,
                                       max_doc_len,
                                       max_url_len,
                                       vocab_size,
                                       merged_vocab_emb,
                                       args["nb_filters"],
                                       args["nb_layers"],
                                       embed_size=300,
                                       dropout_rate=args['dropout'],
                                       trainable=args["trainable"],
                                       weighting=args['weighting'],
                                       mask=args["mask"],
                                       conv_option=args['conv_option'],
                                       model_option=args['model_option'],
                                       join=args['join'],
                                       num_classes=num_classes,
                                       with_url=with_url,
                                       highway=args['highway'],
                                       att=args['co_attention'],
                                       ext_feat=args["external_feat"],
                                       encoder_option=args['encoder_option'])
    new_model.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])
    #print(new_model.summary())
    for layer_id in range(len(model.layers)):
        layer = model.layers[layer_id]
        if layer.name != 'word-embedding':
            new_model.layers[layer_id].set_weights(layer.get_weights())
    print('copy weight done.')
    val_predictions = new_model.predict(val_dataset)
    predictions = new_model.predict(test_dataset)

    if dataset_name == 'twitter' or dataset_name == 'TrecQA':
        val_predictions = val_predictions[:, 1]
        predictions = predictions[:, 1]
        print(predictions[:10])
        predictions_file = "%s/%s/predictions_%s.txt" % (
            args["experimental_data"], data_name, model_name)
        with open(predictions_file, 'w') as f:
            for i in range(test_dataset['id'].shape[0]):
                f.write("%s %.4f %s\n" %
                        (test_dataset['id'][i], predictions[i], args['mode']))
        print('write predictions with trec format to %s' % predictions_file)
        val_predictions_file = "%s/%s/val_predictions_%s.txt" % (
            args["experimental_data"], data_name, model_name)
        with open(val_predictions_file, 'w') as f:
            for i in range(val_dataset['id'].shape[0]):
                f.write(
                    "%s %.4f %s\n" %
                    (val_dataset['id'][i], val_predictions[i], args['mode']))
        map, mrr, p30 = evaluate(val_predictions_file, args["qrels_file"])
        print('write val predictions with trec format to %s' %
              val_predictions_file)
        print('Validation MAP: %.4f P30: %.4f MRR: %.4f' % (map, p30, mrr))
        map, mrr, p30 = evaluate(predictions_file, args["qrels_file"])
        print('MAP: %.4f P30: %.4f MRR: %.4f' % (map, p30, mrr))
    else:
        preds = np.argmax(predictions, axis=-1)
        labels = np.argmax(test_dataset['sim'], axis=-1)
        corrects = preds == labels
        predictions_file = "%s/%s/predictions_%s.txt" % (
            args["experimental_data"], data_name, model_name)
        with open(predictions_file, 'w') as f:
            f.write("id label pred prob model\n")
            for i in range(len(preds)):
                f.write("%s %s %s %.4f %s\n" %
                        (test_dataset['id'][i], labels[i], preds[i],
                         predictions[i][preds[i]], args['mode']))
        print('write predictions with trec format to %s' % predictions_file)
        val_preds = np.argmax(val_predictions, axis=-1)
        val_labels = np.argmax(val_dataset['sim'], axis=-1)
        val_corrects = val_preds == val_labels
        val_predictions_file = "%s/%s/val_predictions_%s.txt" % (
            args["experimental_data"], data_name, model_name)
        with open(val_predictions_file, 'w') as f:
            for i in range(val_dataset['id'].shape[0]):
                f.write("%s %s %s %.4f %s\n" %
                        (val_dataset['id'][i], val_labels[i], val_preds[i],
                         val_predictions[i][val_preds[i]], args['mode']))
        print('write val predictions with trec format to %s' %
              val_predictions_file)

        print('val accuracy: %.4f' %
              (np.count_nonzero(val_corrects) * 1.0 / len(val_preds)))
        print('accuracy: %.4f' %
              (np.count_nonzero(corrects) * 1.0 / len(preds)))
        macro_prec = precision_score(labels, preds, average="macro")
        macro_recall = recall_score(labels, preds, average="macro")
        print('Macro Precision: %.3f, Recall: %.3f, F1: %.3f' %
              (macro_prec, macro_recall, 2 * macro_prec * macro_recall /
               (macro_prec + macro_recall)))
        print('Micro Precision: %.3f, Recall: %.3f, F1: %.3f' %
              (precision_score(labels, preds, average="micro"),
               recall_score(labels, preds, average="micro"),
               f1_score(labels, preds, average="micro")))
        print('Confusion matrix:', confusion_matrix(labels, preds))
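Note that the macro F1 printed above is the harmonic mean of macro precision and macro recall, which is not the same quantity as sklearn's f1_score(average="macro") (the mean of per-class F1 scores). A small sketch with made-up labels showing the two values side by side:

from sklearn.metrics import precision_score, recall_score, f1_score

labels = [0, 0, 1, 1, 2, 2]   # illustrative ground truth
preds  = [0, 1, 1, 1, 2, 0]   # illustrative predictions
p = precision_score(labels, preds, average="macro")
r = recall_score(labels, preds, average="macro")
print("harmonic mean of macro P/R:", 2 * p * r / (p + r))             # what the script prints
print("sklearn macro F1:", f1_score(labels, preds, average="macro"))  # mean of per-class F1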
Example #11
def main(options):
    args = get_default_args()
    set_args(args, options)
    print_args(args)
    mode = args['mode']
    train_name, test_name = args['split']['train'], args['split']['test']
    if train_name == 'train_all':
        train_set = ['train_2011', 'test_2011', 'train_2013', 'test_2013']
        train_set.remove(test_name)
    else:
        train_set = [train_name]
    test_set = [test_name]
    print("train_set", train_set)
    print("test_set", test_set)
    max_query_len, max_doc_len, max_url_len = defaultdict(int), defaultdict(
        int), defaultdict(int)
    vocab = {'word': {}, '3gram': {}, 'url': {}}
    test_vocab = {'word': {}, '3gram': {}, 'url': {}}
    train_vocab_emb, test_vocab_emb = None, None

    ############################# LOAD DATA ##################################
    data_name = ("data_m%s_%s_%s" % (mode, train_name, test_name)).lower()
    if args["load_data"]:
        train_dataset, vocab, train_vocab_emb, max_query_len, max_doc_len, max_url_len = load_data(
            "%s/%s/%s" % (args["experimental_data"], data_name, train_name),
            True)
        test_dataset, test_vocab, test_vocab_emb, _, _, _ = load_data(
            "%s/%s/%s" % (args["experimental_data"], data_name, test_name),
            False)
        print('load dataset successfully')
    else:
        #vocab = build_vocab(args["raw_data"], train_set, test_set, vocab)
        #print('build vocab done. %d' % len(vocab['word']))
        train_dataset = gen_data(args["raw_data"], train_set, vocab,
                                 test_vocab, True, max_query_len, max_doc_len,
                                 max_url_len, args)
        print("create training set successfully...")
        test_dataset = gen_data(args["raw_data"], test_set, vocab, test_vocab,
                                False, max_query_len, max_doc_len, max_url_len,
                                args)
        train_vocab_emb, test_vocab_emb = construct_vocab_emb(
            "%s/%s" % (args["experimental_data"], data_name),
            vocab['word'],
            test_vocab['word'],
            300,
            "word",
            base_embed_path=args["base_embed_path"])
        save_data(
            "%s/%s/%s" % (args["experimental_data"], data_name, train_name),
            True, train_dataset, max_query_len, max_doc_len, max_url_len,
            vocab, train_vocab_emb)
        print("save training set successfully...")
        save_data("%s/%s/%s" %
                  (args["experimental_data"], data_name, test_name),
                  False,
                  test_dataset,
                  vocab=test_vocab,
                  vocab_emb=test_vocab_emb)
        print("save test set successfully...")

    if mode == 'dssm':
        train_dataset = convert_data_to_dssm_format(train_dataset,
                                                    vocab,
                                                    is_train_or_val=True)
        test_dataset = convert_data_to_dssm_format(test_dataset,
                                                   vocab,
                                                   is_train_or_val=False)
        print('data convertion done!')

    val_split = args['val_split']
    num_samples, _ = train_dataset["query_word_input"].shape
    # randomly sample queries and all their documents if query_random is True
    # otherwise, query-doc pairs are randomly sampled
    query_random = True
    if query_random:
        val_indices = sample_val(train_set,
                                 num_samples=num_samples,
                                 val_split=val_split)
    else:
        val_indices, val_set = [], set()
        for i in range(int(num_samples * val_split)):
            val_index = np.random.randint(num_samples)
            while val_index in val_set:
                val_index = np.random.randint(num_samples)
            val_indices.append(val_index)
            val_set.add(val_index)

    print(val_indices[:5], np.sum(np.array(val_indices)))

    # sample validation set for debug purpose
    # val_indices = val_indices[:100]

    train_dataset["query_word_weight"] = train_dataset[
        "query_word_weight"][:, :args['deeplevel']]
    train_dataset["query_3gram_weight"] = train_dataset[
        "query_3gram_weight"][:, :args['deeplevel']]
    train_dataset["doc_word_weight"] = train_dataset[
        "doc_word_weight"][:, :args['deeplevel']]
    train_dataset["doc_3gram_weight"] = train_dataset[
        "doc_3gram_weight"][:, :args['deeplevel']]
    train_dataset["url_3gram_weight"] = train_dataset[
        "url_3gram_weight"][:, :args['deeplevel']]
    test_dataset["query_word_weight"] = test_dataset[
        "query_word_weight"][:, :args['deeplevel']]
    test_dataset["query_3gram_weight"] = test_dataset[
        "query_3gram_weight"][:, :args['deeplevel']]
    test_dataset["doc_word_weight"] = test_dataset[
        "doc_word_weight"][:, :args['deeplevel']]
    test_dataset["doc_3gram_weight"] = test_dataset[
        "doc_3gram_weight"][:, :args['deeplevel']]
    test_dataset["url_3gram_weight"] = test_dataset[
        "url_3gram_weight"][:, :args['deeplevel']]
    # print("SHAPEEEEEEEEEEEEEEEEEEEE: {}".format(len(train_dataset["query_word_weight"][100])))

    val_dataset = {}
    for key in train_dataset:
        val_dataset[key] = train_dataset[key][val_indices]
        train_dataset[key] = np.delete(train_dataset[key], val_indices, 0)

    # shuffle the train dataset explicitly to make results reproducible
    # whether the performance will be affected remains a question
    keys, values = [], []
    for key in train_dataset:
        keys.append(key)
        values.append(train_dataset[key])
    zipped_values = list(zip(*values))
    random.shuffle(zipped_values)
    shuffled_values = list(zip(*zipped_values))
    for i, key in enumerate(keys):
        train_dataset[key] = np.array(shuffled_values[i])
    print('after shuffle:', train_dataset['id'][:5], train_dataset['sim'][:5],
          train_dataset['query_word_input'][:5])

    # sample training dataset for debug purpose
    # sample_num = 1000
    # for key in train_dataset:
    #     train_dataset[key] = train_dataset[key][:sample_num]

    # merge the vocabulary of the train and test sets
    print("TRAIN vocab: word(%d) 3gram(%d) url(%d)" %
          (len(vocab['word']), len(vocab['3gram']), len(vocab['url'])))
    print("TEST vocab: word(%d) 3gram(%d) url(%d)" % (len(
        test_vocab['word']), len(test_vocab['3gram']), len(test_vocab['url'])))
    merged_vocab = {'url': vocab['url'], '3gram': vocab['3gram']}
    merged_vocab['word'] = merge_two_dicts(vocab['word'], test_vocab['word'])
    print("merged vocab: word(%d) 3gram(%d) url(%d)" %
          (len(merged_vocab['word']), len(
              merged_vocab['3gram']), len(merged_vocab['url'])))
    vocab_inv, vocab_size = {}, {}
    vocab['char'] = merge_two_dicts(vocab['3gram'], vocab['url'])
    test_vocab['char'] = merge_two_dicts(test_vocab['3gram'],
                                         test_vocab['url'])
    merged_vocab['char'] = merge_two_dicts(vocab['char'], test_vocab['char'])

    for key in vocab:
        vocab_inv[key] = invert_dict(merged_vocab[key])
        vocab_size[key] = len(vocab[key])

    print(vocab_size)
    # Print data samples for debug purpose
    # print_dataset(mode, train_dataset, vocab_inv)
    # print_dataset(mode, test_dataset, vocab_inv)

    ############################ TRAIN MODEL #################################
    model = None
    if mode == 'deep_twitter':
        model = create_attention_model(max_query_len,
                                       max_doc_len,
                                       max_url_len,
                                       vocab_size,
                                       train_vocab_emb,
                                       args["nb_filters"],
                                       embed_size=300,
                                       dropout_rate=args['dropout'],
                                       trainable=args["trainable"],
                                       weighting=args['weighting'],
                                       mask=args["mask"],
                                       conv_option=args['conv_option'],
                                       model_option=args['model_option'],
                                       external=args["external_feat"],
                                       norm_weight=args['norm_weight'],
                                       cos_norm=args['cos'],
                                       only_word=args['only_word'],
                                       only_char=args['only_char'],
                                       pooling=args['pooling'],
                                       deeplevel=args['deeplevel'])
    elif mode == 'dssm':
        model = create_dssm_model(max_query_len,
                                  max_doc_len,
                                  max_url_len,
                                  vocab_size,
                                  train_vocab_emb,
                                  args["nb_filters"],
                                  embed_size=300,
                                  dropout_rate=args['dropout'],
                                  trainable=args["trainable"])
    model_name = (
        "model_N%s_data%s_mo%s_c%s_NumFilter%d_T%s_D%.1f_W%s_M%s_B%d_Val%.2f" %
        (mode, train_name, args['model_option'], args['conv_option'],
         args["nb_filters"], args["trainable"], args['dropout'],
         args['weighting'], args['mask'], args['batch_size'],
         args['val_split'])).lower()
    model_path = "%s/%s/%s" % (args['experimental_data'], data_name,
                               model_name)
    print(model_path)

    if args['optimizer'] == "adam":
        opt = optimizers.Adam(lr=args["learning_rate"],
                              beta_1=0.9,
                              beta_2=0.999,
                              epsilon=None,
                              decay=0.0,
                              amsgrad=False)
    elif args['optimizer'] == "sgd":
        opt = optimizers.SGD(lr=args["learning_rate"],
                             decay=1e-6,
                             momentum=0.9,
                             nesterov=True)

    model.compile(loss='binary_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])
    print(model.summary())
    model_weights, parameter_num = get_model_weights(model)
    print('model init weights sum: {} of {} parameters'.format(
        model_weights, parameter_num))
    #

    if not args['load_model']:
        early_stopping = EarlyStopping(monitor='val_loss', patience=4)
        checkpoint = ModelCheckpoint(filepath=model_path + ".best.weights",
                                     monitor='val_loss',
                                     save_best_only=True,
                                     verbose=1)
        lr_reducer = ReduceLROnPlateau(monitor='val_loss',
                                       factor=0.3,
                                       patience=3,
                                       min_lr=0.0001)

        fit_mode = "fit"
        if fit_mode == "fit":
            model.fit(
                train_dataset,
                train_dataset['sim'],  # validation_split=0.05,
                batch_size=args['batch_size'],
                validation_data=(val_dataset, val_dataset['sim']),
                epochs=args['epochs'],
                shuffle=False,
                callbacks=[checkpoint, lr_reducer, early_stopping],
                verbose=2)
        else:
            train_steps, train_batches = batch_iter(
                train_dataset,
                train_dataset["sim"],
                batch_size=args['batch_size'])
            valid_steps, valid_batches = batch_iter(
                val_dataset, val_dataset["sim"], batch_size=args['batch_size'])
            model.fit_generator(
                train_batches,
                train_steps,
                epochs=args['epochs'],
                validation_data=valid_batches,
                validation_steps=valid_steps,
                callbacks=[checkpoint, lr_reducer, early_stopping],
                verbose=2)

    #plot_model(model, to_file='model.png')
    ############################ TEST MODEL #################################
    print('load best model from %s.best.weights' % model_path)
    model.load_weights("%s.best.weights" % model_path)
    if mode == 'deep_twitter':
        # load trained vocab embedding.
        if args["only_char"]:
            merged_vocab_emb = None
        else:
            embedding_layer_name = 'word_embedding'
            trained_vocab_emb = model.get_layer(
                embedding_layer_name).get_weights()[0]
            # merge trained vocab embedding with test OOV word embeddings
            merged_vocab_emb = np.zeros(shape=(len(merged_vocab['word']), 300))
            merged_vocab_emb[0:len(vocab['word']), :] = trained_vocab_emb
            merged_vocab_emb[len(vocab['word']):len(merged_vocab['word']
                                                    ), :] = test_vocab_emb
            for key in vocab:
                vocab_size[key] = len(merged_vocab[key])
            print(vocab_size)

        new_model = create_attention_model(max_query_len,
                                           max_doc_len,
                                           max_url_len,
                                           vocab_size,
                                           merged_vocab_emb,
                                           args["nb_filters"],
                                           embed_size=300,
                                           dropout_rate=args['dropout'],
                                           trainable=args["trainable"],
                                           weighting=args['weighting'],
                                           mask=args["mask"],
                                           conv_option=args['conv_option'],
                                           model_option=args['model_option'],
                                           external=args["external_feat"],
                                           norm_weight=args['norm_weight'],
                                           cos_norm=args['cos'],
                                           only_word=args['only_word'],
                                           only_char=args['only_char'],
                                           pooling=args['pooling'],
                                           deeplevel=args['deeplevel'])
        new_model.compile(loss='binary_crossentropy',
                          optimizer='adam',
                          metrics=['accuracy'])
        # print(new_model.summary())

        num_layers = 0
        for layer in model.layers:
            num_layers += 1
        for layer_id in range(num_layers):
            layer = model.layers[layer_id]
            if not args["only_char"] and layer.name != embedding_layer_name:
                new_model.layers[layer_id].set_weights(layer.get_weights())
        print('copy weight done.')
        predictions = new_model.predict(test_dataset)
    elif mode == 'dssm':
        getter = K.function([model.layers[0].input, model.layers[1].input],
                            model.layers[-2].output)
        print('create DSSM functional getter...')
        num_samples, _, _ = test_dataset['query_3gram_input'].shape
        batch_size = 128
        num_batch = int(math.ceil(num_samples * 1.0 / batch_size))
        predictions = np.zeros((num_samples, ))
        for i in range(num_batch):
            start_idx, end_idx = i * batch_size, min(num_samples,
                                                     (i + 1) * batch_size)
            predictions[start_idx:end_idx] = getter([
                test_dataset['query_3gram_input'][start_idx:end_idx],
                test_dataset['doc_3gram_input'][start_idx:end_idx]
            ])[:, 0]

    #predictions = getter([test_dataset['query_3gram_input'], test_dataset['doc_3gram_input']])
    print(predictions[:10])
    predictions_file = "%s/%s/predictions_%s.txt" % (args["experimental_data"],
                                                     data_name, model_name)
    with open(predictions_file, 'w') as f:
        for i in range(test_dataset['id'].shape[0]):
            f.write("%s %.4f %s\n" %
                    (test_dataset['id'][i], predictions[i], args['mode']))
    print('write predictions with trec format to %s' % predictions_file)
    map, mrr, p30 = evaluate(predictions_file, args["qrels_file"])
    print('MAP: %.4f P30: %.4f MRR: %.4f' % (map, p30, mrr))
def test_custom_env():
    path = '../master_capston/the-movies-dataset/'
    features_embedding_movies = pd.read_csv(
        os.path.join(path, 'movie_embedding_features.csv'))

    # the number of items in the recommendation slate returned by the agent
    slate_size = 3

    # assumed to be the number of candidate items made available to the agent for each slate
    num_candidates = 11

    format_data = data_preprocess.load_data(path)
    features_embedding_movies = pd.read_csv(
        os.path.join(path, 'movie_embedding_features.csv'))
    positive_user_ids, positive_history_data = data_preprocess.get_user_positive(
        format_data)
    # generate train and test set
    train_set, test_set = data_preprocess.generate_train_test_data(
        positive_history_data)
    users_history_data, train_set = data_preprocess.create_recent_history(
        train_set, embedding_features_data=features_embedding_movies)

    offline_mode = True
    rating_pivot = 4

    sampler = LTSDocumentSampler(dataset=features_embedding_movies,
                                 num_candidate=num_candidates)
    user_sampler = LTSStaticUserSampler(users_history_data,
                                        features_embedding_movies,
                                        offline_data=test_set,
                                        offline_mode=offline_mode)
    # TODO: handle updating the dataset when num_candidates and the number of available items differ
    func = select_dataset(features_embedding_movies, test_set)
    LTSUserModel = UserModel(user_sampler,
                             offline_mode=offline_mode,
                             rating_pivot=rating_pivot,
                             slate_size=slate_size,
                             response_ctor=LTSResponse)
    ltsenv = CustomSingleUserEnviroment(LTSUserModel,
                                        sampler,
                                        num_candidates,
                                        slate_size,
                                        resample_documents=False,
                                        offline_mode=True,
                                        select_subset_func=func)
    lts_gym_env = recsim_gym.RecSimGymEnv(ltsenv, clicked_engagement_reward)

    observation_0 = lts_gym_env.reset()
    print("current user : "******"current history of user items :",
          observation_0['user']['record_ids'])
    print("candidate recommend docs ids : ", observation_0['doc'].keys())
    done = False
    while not done:
        # for i in range(4):
        recommendation_slate_0 = [0, 1, 2]
        observation_1, reward, done, _ = lts_gym_env.step(
            recommendation_slate_0)
        print("response : ", observation_1['response'])
        print("reward : ", reward)
        print("next history of recommend items :",
              observation_1['user']['record_ids'])
        print("total remaind candidate items to recommend : ",
              len(observation_1['doc'].keys()))
        print("docs ids : ", observation_1['doc'].keys())


# test_custom_env()
from data_preprocess import load_data
from models.Fac_Model import Fac_Model
from tensorflow import keras
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

X_train, X_test, X_val, y_train, y_test, y_val = load_data()

model = Fac_Model(7)
model.build(input_shape=(None, 48, 48, 1))
model.summary()
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

reducePlateau = ReduceLROnPlateau(monitor='val_accuracy',
                                  factor=0.1,
                                  min_delta=0.0001,
                                  patience=1,
                                  verbose=1)
history = model.fit(X_train,
                    y_train,
                    epochs=14,
                    batch_size=128,
                    steps_per_epoch=250,
                    validation_data=(X_val, y_val),
                    verbose=1,
                    callbacks=[reducePlateau])
result = model.evaluate(X_test, y_test, verbose=1)
print('Final result: ', result)
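This script imports ModelCheckpoint and EarlyStopping but only wires ReduceLROnPlateau into fit(). If they were meant to be used, a hedged sketch of the same fit() call with all three callbacks (the weights path and patience value are assumptions):

checkpoint = ModelCheckpoint('fac_model_best.weights.h5', monitor='val_accuracy',
                             save_best_only=True, save_weights_only=True, verbose=1)
early_stopping = EarlyStopping(monitor='val_accuracy', patience=3, verbose=1)
history = model.fit(X_train, y_train,
                    epochs=14, batch_size=128, steps_per_epoch=250,
                    validation_data=(X_val, y_val), verbose=1,
                    callbacks=[reducePlateau, checkpoint, early_stopping])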
Example #14
def build_logist(filename, option, model_name=None):

    # LOAD DATA
    descriptors = qm_descriptors
    X, Y = data_preprocess.load_data(filename, descriptors)

    if option == 'default':

        print('Training Logist...')
        print('*-----------------------------*')
        print('Training on default parameters.')

        accuracies_default = []
        for i in range(10):
            x_train, x_valid, y_train, y_valid = data_preprocess.split_data(
                X, Y, partition=0.20)
            accuracies_default.append(
                train_logist(x_train, y_train, x_valid, y_valid))

        print('Average accuracy over 10 default runs: %.2f' %
              numpy.mean(accuracies_default))

    elif option == 'train':

        print('*-----------------------------*')
        print('Searching for best parameters.')

        params = []
        accuracies = []

        for i in range(10):
            x_train, x_valid, y_train, y_valid = data_preprocess.split_data(
                X, Y, partition=0.20)
            best_parameters = scan_parameters(x_train, y_train)
            params.append(best_parameters)
            accuracy = train_logist(x_train, y_train, x_valid, y_valid,
                                    best_parameters)
            accuracies.append(accuracy)

        print('*-----------------------------*')
        print('Summary of Results.')
        print('*-----------------------------*')

        for i in range(len(accuracies)):
            print('Run ' + str(i + 1) + ' ', params[i], ' : ', accuracies[i])

    elif option == 'RFE':

        print('*-----------------------------*')
        print('Recursive feature estimation.')
        #http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html#sklearn.feature_selection.RFE

        ranking = perform_RFE(X, Y)

        print('*-----------------------------*')
        print('Ranking of descriptors.')
        print('*-----------------------------*')
        for d in range(len(qm_descriptors)):
            print(qm_descriptors[d], ranking[d])

    elif option == 'test':

        print('TESTING')
        print('*-----------------------------*')
        #penalties = 'l2'
        #Cs = 0.001
        #weights = None

        penalties = 'l1'
        Cs = 10
        weights = None

        params_dict = {'C': Cs, 'class_weight': weights, 'penalty': penalties}
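        # note: in scikit-learn's LogisticRegression the 'l1' penalty requires a
        # compatible solver (e.g. 'liblinear' or 'saga'); assumed to be handled
        # inside train_logist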
        print(params_dict)

        acc_list = []
        for i in range(10):
            x_train, x_valid, y_train, y_valid = data_preprocess.split_data(
                X, Y, partition=0.20)
            acc_list.append(
                train_logist(x_train, y_train, x_valid, y_valid, params_dict))

        print('Summary of Results.')
        print('*-----------------------------*')
        print('Average accuracy over 10 runs: %.2f' % numpy.mean(acc_list))
示例#15
0
def train():
    # args is a global variable in this task
    BATCH_SIZE = args.batch_size
    EPOCH = args.epoch
    EMBED_DIM = args.embeddingDim
    MAXLEN = args.maxLen
    NUM_UNITS = args.units
    LEARNING_RATE = args.learning_rate
    DROPOUT = args.dropout
    METHOD = args.method
    GPUNUM = args.gpuNum
    CKPT = args.checkpoint
    LIMIT = args.limit
    start_word = "<s>"
    end_word = "</s>"
    # The tokenizer stores all the information needed to split the data;
    # it is not itself part of the data.
    train_source_tensor, train_source_tokenizer, train_target_tensor, train_target_tokenizer = \
        load_data(pad_length = MAXLEN, limit=LIMIT)
    buffer_size = len(train_source_tensor)
    train_source_tensor, val_source_tensor, train_target_tensor, val_target_tensor = \
        train_test_split(train_source_tensor, train_target_tensor, random_state=2019)

    #TODO: check if we need target tokenizer
    training_steps = len(train_source_tensor) // BATCH_SIZE
    vocab_source_size = len(train_source_tokenizer.word_index) + 1
    print("vocab_input_size: ", vocab_source_size)
    vocab_target_size = len(train_target_tokenizer.word_index) + 1
    print("vocab_target_size: ", vocab_target_size)

    step = tf.Variable(0, trainable=False)
    # boundaries = [100, 200]
    # values = [1.0, 0.5, 0.1]
    # boundaries = [30, 40]
    # values = [1.0, 0.5, 0.0]
    # learning_rate_fn = tf.compat.v1.train.piecewise_constant(step,
    #     boundaries, values)
    # optimizer = tf.optimizers.SGD(learning_rate=learning_rate_fn(step))
    optimizer = tf.compat.v1.train.GradientDescentOptimizer(
        learning_rate=0.001)
    # set up checkpoint
    if not os.path.exists(CKPT):
        os.makedirs(CKPT)
    else:
        print(
            "Warning: the checkpoint dir already exists!",
            "\nConsider choosing a new dir to save your checkpoints!")
    checkpoint = tf.train.Checkpoint(optimizer=optimizer)
    checkpoint_prefix = os.path.join(CKPT, "ckpt")

    dataset = train_input_fn(train_source_tensor, train_target_tensor,
                             buffer_size, EPOCH, BATCH_SIZE)
    apply_loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
    encoder = Encoder(vocab_source_size,
                      EMBED_DIM,
                      NUM_UNITS,
                      dropout_rate=DROPOUT,
                      batch_size=BATCH_SIZE)
    decoder = Decoder(vocab_target_size,
                      EMBED_DIM,
                      NUM_UNITS,
                      batch_size=BATCH_SIZE,
                      method=None,
                      dropout_rate=DROPOUT)

    def train_wrapper(source, target):
        # with tf.GradientTape(watch_accessed_variables=False) as tape:
        with tf.GradientTape() as tape:
            # source_out, source_state, source_trainable_var, tape = encoder(source, encoder_state, vocab_source_size,
            #                                                          EMBED_DIM, NUM_UNITS, activation="tanh",
            #                                                          dropout_rate = DROPOUT)
            source_out, source_state = encoder(source,
                                               encoder_state,
                                               activation="tanh")

            initial = tf.expand_dims(
                [train_target_tokenizer.word_index[start_word]] * BATCH_SIZE,
                1)
            attention_state = tf.zeros((BATCH_SIZE, 1, EMBED_DIM))
            # cur_total_loss is a sum of loss for current steps, namely batch loss
            cur_total_loss, cur_loss = 0, 0
            for i in range(1, target.shape[1]):
                output_state, source_state, attention_state = decoder(
                    initial, source_state, source_out, attention_state)
                # TODO: check for the case where target is 0
                cur_loss = apply_loss(target[:, i], output_state)
                # 0 is the padding value in target; real tokens should never be 0.
                # For safety we mask out padded positions so that they do not
                # contribute to the final loss.
                # The mask is a tensor of binary values (0 or 1).
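                # e.g. target[:, i] = [4, 7, 0] -> mask = [1., 1., 0.], so the
                # padded position contributes nothing to the batch loss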
                mask = tf.math.logical_not(tf.math.equal(target[:, i], 0))
                mask = tf.cast(mask, dtype=cur_loss.dtype)
                cur_loss *= mask
                cur_total_loss += tf.reduce_mean(cur_loss)
                initial = tf.expand_dims(target[:, i], 1)
                # print(cur_loss)
                # print(cur_total_loss)
        batch_loss = cur_total_loss / target.shape[1]
        ## debug
        variables = encoder.trainable_variables + decoder.trainable_variables
        # print("check variable: ", len(variables))
        #variables = encoder.trainable_variables
        # print("check var:", len(variables), variables[12:])
        gradients = tape.gradient(cur_total_loss, variables)
        # print("check gradient: ", len(gradients))
        # g_e = [type(ele) for ele in gradients if not isinstance(ele, tf.IndexedSlices)]
        # sum_g = [ele.numpy().sum() for ele in gradients if not isinstance(ele, tf.IndexedSlices)]

        # print(len(gradients), len(sum_g))
        optimizer.apply_gradients(zip(gradients, variables), global_step=step)
        return batch_loss

    # print(len(train_source_tensor),BATCH_SIZE,training_steps,LIMIT)
    for epoch in range(EPOCH):
        per_epoch_loss = 0
        start = time.time()
        encoder_hidden = encoder.initialize_hidden_state()
        encoder_cell = encoder.initialize_cell_state()
        encoder_state = [[encoder_hidden, encoder_cell],
                         [encoder_hidden, encoder_cell],
                         [encoder_hidden, encoder_cell],
                         [encoder_hidden, encoder_cell]]
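        # one [hidden, cell] pair per layer -- presumably a 4-layer LSTM encoder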
        # TODO : Double check to make sure all re-initialization is performed
        for idx, data in enumerate(dataset.take(training_steps)):

            source, target = data
            cur_total_loss = train_wrapper(source, target)
            per_epoch_loss += cur_total_loss
            if idx % 10 == 0:
                # print("current step is: "+str(tf.compat.v1.train.get_global_step()))
                # print(dir(optimizer))
                print("current learning rate is:" +
                      str(optimizer._learning_rate))
                print('Epoch {}/{} Batch {}/{} Loss {:.4f}'.format(
                    epoch + 1, EPOCH, idx + 10, training_steps,
                    cur_total_loss.numpy()))
                # tf.print(step)
                # print(dir(step))
                # print(int(step))
            if step >= 5:
                optimizer._learning_rate /= 2.0

        print('Epoch {}/{} Total Loss per epoch {:.4f} - {} sec'.format(
            epoch + 1, EPOCH, per_epoch_loss / training_steps,
            time.time() - start))
        # TODO: for evaluation add bleu score
        if epoch % 10 == 0:
            print('Saving checkpoint every 10 epochs')
            checkpoint.save(file_prefix=checkpoint_prefix)
示例#16
0
def test_autoencoder(finetune_lr=0.01,momentum=0.5,training_epochs=30,dataset='grayscale.pkl.gz',batch_size=10,pretrain='output/gray_pre.save',model_save='output/gray.save'):
	"""
	Take pre-trained models as input. Fold the network and fine-tune weights.

	:type finetune_lr: float
	:param finetune_lr: learning rate used in the finetune stage
	:type momentum: float
	:param momentum: momentum used in the finetune stage
	:type training_epochs: int
	:param training_epochs: maximal number of iterations to run the optimizer
	:type dataset: string
	:param dataset: path to the pickled dataset
	:type batch_size: int
	:param batch_size: the size of a minibatch
	:type pretrain: string
	:param pretrain: path to the pre-trained model file
	:type model_save: string
	:param model_save: path where the fine-tuned model will be saved
	"""
	
	print 'loading data'
	datasets = load_data(dataset)

	train_set_x, train_set_y = datasets[0]
	valid_set_x, valid_set_y = datasets[1]
	test_set_x, test_set_y = datasets[2]
	x_mean = datasets[3]

	# compute number of minibatches for training, validation and testing
	n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    # numpy random generator
	numpy_rng = numpy.random.RandomState(123)
	
	# load trained model
	print 'loading the model'
	f = file(pretrain,'rb')
	s_rbm = cPickle.load(f)
	f.close()
	s_rbm.rbm_layers
	n_layers_rbm = s_rbm.n_layers
	bb = AutoEncoder(None, numpy_rng,s_rbm.rbm_layers,n_layers_rbm)
	#return bb

	print 'getting the fine-tuning functions'
	train_fn, validate_model, test_model = bb.build_finetune_functions(
		datasets=datasets,
		batch_size=batch_size,
		learning_rate=finetune_lr,
		momentum=momentum
	)

	print '... fine-tuning the model'
	# early-stopping parameters
	patience = 10 * n_train_batches  # look at this many examples regardless
	patience_increase = 2.  # wait this much longer when a new best is found
	improvement_threshold = 0.995  # a relative improvement of this much is
	                               # considered significant
	validation_frequency = min(n_train_batches, patience / 2)
	# go through this many minibatches before checking the network on the
	# validation set; in this case we check every epoch

	best_validation_loss = numpy.inf
	test_score = 0.
	start_time = time.clock()

	done_looping = False
	epoch = 0
	print n_train_batches
	print patience, patience_increase, validation_frequency, best_validation_loss	
	while (epoch < training_epochs): # and (not done_looping):
		epoch = epoch + 1
		for minibatch_index in xrange(n_train_batches):

			minibatch_avg_cost = train_fn(minibatch_index)
			iter = (epoch - 1) * n_train_batches + minibatch_index

			if (iter + 1) % validation_frequency == 0:

				validation_losses = validate_model()
				this_validation_loss = numpy.mean(validation_losses)
				print(
						'epoch %i, minibatch %i/%i, validation error %f '
                    % (
						epoch,
						minibatch_index + 1,
						n_train_batches,
						this_validation_loss
					)
				)

				# if we got the best validation score until now
				if this_validation_loss < best_validation_loss:

					#improve patience if loss improvement is good enough
					if (
						this_validation_loss < best_validation_loss *
						improvement_threshold
					):
						patience = max(patience, iter * patience_increase)

					# save best validation score and iteration number
					best_validation_loss = this_validation_loss
					best_iter = iter

					# test it on the test set
					test_losses = test_model()
					test_score = numpy.mean(test_losses)
					print(('     epoch %i, minibatch %i/%i, test error of '
							'best model %f ') %
							(epoch, minibatch_index + 1, n_train_batches,
							test_score ))

			if patience <= iter:
				done_looping = True
				break
	f = file(model_save,'wb')
	cPickle.dump(bb,f,protocol=cPickle.HIGHEST_PROTOCOL)
	f.close()
	return bb
示例#17
0
def train(args):
    images, labels = load_data(True)
    train_dset, val_dset = prepare_dataset(images,
                                           labels,
                                           True,
                                           augment=args.augment)
    # images, labels = load_data(False)
    # test_dset = prepare_dataset(images, labels, False)

    # learning_rate = (1 + 11*np.random.random(10)) * 1e-5
    # learning_rate = [7.453742807604199e-05] # augment
    learning_rate = [0.00021837457574664458]  # no augment
    for lr in learning_rate:
        tf.reset_default_graph()
        graph = tf.Graph()

        with graph.as_default():
            global_step = tf.Variable(0, name='global_step', trainable=False)
            # global_step = tf.get_variable('global_step', shape=[None], dtype=tf.int32, initializer=tf.constant_initializer(0), trainable=False)

            # input placeholder
            input_shape = (32, 32) if not args.augment else (24, 24)
            X = tf.placeholder(tf.float32, [None, *input_shape, 3],
                               name='images_ph')
            Y = tf.placeholder(tf.int32, [None], name='labels_ph')
            logits_op, loss_op, acc_op, train_op = build_model(
                X,
                Y,
                lr,
                global_step,
                lrn=args.lrn,
                full_model=args.full_model)

            # add variables to summary
            log_dir = os.path.join(args.log_dir, 'lr-{:.8f}'.format(lr), '')
            if not args.eval:
                rm_dir(log_dir)

            tf.summary.scalar('loss', loss_op)
            tf.summary.scalar('train_acc', acc_op)
            tf.summary.histogram('loss', loss_op)
            summary_op = tf.summary.merge_all()
            summary_writer = tf.summary.FileWriter(log_dir, graph)

            # model saver
            save_dir = os.path.join(args.save_dir, 'lr-{:.8f}'.format(lr),
                                    '') if not args.eval else args.save_dir
            print('save at {}'.format(save_dir))
            saver = tf.train.Saver(filename=save_dir)

            # run training session
            config = tf.ConfigProto(log_device_placement=False)
            with tf.Session(graph=graph, config=config) as sess:
                sess.run(tf.global_variables_initializer())
                if args.eval:
                    print('Resume from {}'.format(args.save_dir))
                    saver.restore(sess, tf.train.latest_checkpoint(save_dir))
                step = sess.run(global_step)
                print('lr={}, normalization: {}, initializer:{}, step: {}'.
                      format(lr, args.lrn, args.init, step))

                if not args.eval:
                    for e in range(args.epochs):
                        print('Epoch {}'.format(e))
                        for x_batch, y_batch in train_dset:
                            _, acc, summary = sess.run(
                                [train_op, acc_op, summary_op],
                                feed_dict={
                                    X: x_batch,
                                    Y: y_batch
                                })
                            step = sess.run(global_step)
                            summary_writer.add_summary(summary, step)
                        saver.save(sess, save_dir, global_step=global_step)

                    acc, num_correct, num_samples = check_accuracy(
                        sess, val_dset, X, logits_op)
                    step = sess.run(global_step)
                    print('step:{}, val acc: {:.2%} ({}/{})'.format(
                        step, acc, num_correct, num_samples))
                else:
                    images, labels = load_data(False)
                    test_dset = prepare_dataset(images,
                                                labels,
                                                train=False,
                                                augment=args.augment)
                    acc, num_correct, num_samples = check_accuracy(
                        sess, test_dset, X, logits_op)
                    step = sess.run(global_step)
                    print('step:{}, test acc: {:.2%} ({}/{})'.format(
                        step, acc, num_correct, num_samples))
def test_train_agent_offline():
    path = 'dataset'
    use_100k = True
    if use_100k:
        rating_file_name = 'process_small_rating.csv'
        embedding_file_name = 'movie_embedding_features.csv'
    else:
        rating_file_name = 'process_1m_rating.csv'
        embedding_file_name = 'movie_embedding_features_1m.csv'

    np.random.seed(0)
    # list_slate = [3,5,7,9]
    # list_time_budget = [2,4,6,8]
    # list_num_candidate = [20,30]

    list_slate = [5]
    list_time_budget = [4]
    list_num_candidate = [20]
    slate_size = 5
    time_budget = -4
    time_budget_range = [2, 6]
    num_candidates = 30
    min_num_positive_rating = 40
    min_num_rating = 70
    test_mode = False
    offline_mode = True

    rating_pivot = 4
    resample_documents = True

    #agent params
    embedding_size = 30
    num_positive_history_items = 10
    num_action_vector = 1
    s_dim = num_positive_history_items * embedding_size
    a_dim = num_action_vector * embedding_size
    actor_lr = 0.001
    critic_lr = 0.001
    hidden_layer_1 = 32
    hidden_layer_2 = 16
    tau = 0.01
    batch_size = 32
    gamma = 0.75
    buffer_size = 20000
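    # DDPG-style hyperparameters (assumed): tau is the soft target-update rate,
    # gamma the discount factor, and buffer_size the replay-buffer capacity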

    # train agent
    max_eps = 50
    sample_user_randomly = False
    max_num_user = 100
    save_frequenly = 10
    model_folder = "model_test"
    # config_file =
    save_model_path = "save_model"
    log_path = "logs/scalars/"
    history_path = "history_log"
    model_count = 0
    # for slate_size in list_slate:
    #     for time_budget in list_time_budget:
    #         for num_candidates in list_num_candidate:
    #             curret_model_path = os.path.join(save_model_path, str(slate_size), str(time_budget), str(num_candidates))
    #             current_log_path = os.path.join(log_path, str(slate_size), str(time_budget), str(num_candidates))
    #             current_history_path = os.path.join(history_path, str(slate_size), str(time_budget), str(num_candidates))
    curret_model_path = os.path.join(model_folder, save_model_path)
    current_log_path = os.path.join(model_folder, log_path)
    current_history_path = os.path.join(model_folder, history_path)
    if not os.path.exists(curret_model_path):
        print("make model path")
        os.makedirs(curret_model_path)
    if not os.path.exists(current_log_path):
        os.makedirs(current_log_path)
    if not os.path.exists(current_history_path):
        os.makedirs(current_history_path)
    with tf.Session() as sess:
        # train_mode = True

        # build environment
        # slate_size is the number of items in each recommendation returned by
        # the agent; num_candidates is assumed to be the number of candidate
        # items made available to the agent for each slate
        # time_budget = 2

        format_data = data_preprocess.load_data(path,
                                                file_name=rating_file_name)
        features_embedding_movies = pd.read_csv(
            os.path.join(path, embedding_file_name))
        positive_user_ids, positive_history_data = data_preprocess.get_user_positive(
            format_data, min_num_positive_rating)
        print("unique user id : ", len(np.unique(positive_user_ids)))
        print(len(positive_history_data))

        # restrict number of total rating
        positive_user_ids, positive_history_data = data_preprocess.generate_new_dataset(
            positive_history_data, num_rating_pivot=min_num_rating)
        print("unique user id : ", len(np.unique(positive_user_ids)))
        print(len(positive_history_data))
        # generate train and test set

        train_set, test_set = data_preprocess.generate_train_test_data(
            positive_history_data)
        users_history_data, train_set = data_preprocess.create_recent_history(
            train_set)
        #check the train set and test set quality
        user_details = train_set.groupby('userId').size().reset_index()
        user_details.columns = ['userId', 'number of rating']
        print("train set quality : ", user_details.describe())
        user_details = test_set.groupby('userId').size().reset_index()
        user_details.columns = ['userId', 'number of rating']
        print("test set quality : ", user_details.describe())

        offline_dataset = train_set
        if test_mode:
            offline_dataset = test_set

        sampler = LTSDocumentSampler(dataset=features_embedding_movies,
                                     num_candidate=num_candidates)
        user_sampler = LTSStaticUserSampler(
            users_history_data,
            features_embedding_movies,
            offline_data=offline_dataset,
            offline_mode=offline_mode,
            time_budget=time_budget,
            random=sample_user_randomly,
            time_budget_range=time_budget_range)
        # TODO: handle the case where the dataset is updated with fewer
        # available items than num_candidates

        func_select_train_set = select_dataset(features_embedding_movies,
                                               train_set)
        func_select_test_set = select_dataset(features_embedding_movies,
                                              test_set)

        # user_train_set = func(user_id=39)
        # print(len(user_train_set))

        LTSUserModel = UserModel(user_sampler,
                                 offline_mode=offline_mode,
                                 rating_pivot=rating_pivot,
                                 slate_size=slate_size,
                                 response_ctor=LTSResponse)

        ltsenv = CustomSingleUserEnviroment(
            LTSUserModel,
            sampler,
            num_candidates,
            slate_size,
            resample_documents=resample_documents,
            offline_mode=offline_mode,
            select_subset_func=func_select_train_set)

        if test_mode:
            ltsenv = CustomSingleUserEnviroment(
                LTSUserModel,
                sampler,
                num_candidates,
                slate_size,
                resample_documents=resample_documents,
                offline_mode=offline_mode,
                select_subset_func=func_select_test_set)

        lts_gym_env = recsim_gym.RecSimGymEnv(ltsenv,
                                              clicked_engagement_reward)
        # simulated environment

        # build agent
        actor = Actor(sess, s_dim, a_dim, batch_size, slate_size,
                      embedding_size, tau, actor_lr, hidden_layer_1,
                      hidden_layer_2)
        critic = Critic(sess, s_dim, a_dim, slate_size, embedding_size, gamma,
                        tau, critic_lr, hidden_layer_1, hidden_layer_2)
        buffer = RelayBuffer(buffer_size, s_dim, a_dim)
        noise_model = Noise(a_dim)
        agent = Actor_Critic_Agent(sess, lts_gym_env.observation_space,
                                   lts_gym_env.action_space, actor, critic,
                                   buffer, noise_model, slate_size,
                                   embedding_size)
        #train section
        # max_num_user = len(np.unique(users_history_data['userId']))

        history = agent.train(max_eps, max_num_user, batch_size, lts_gym_env,
                              save_frequenly, curret_model_path,
                              current_log_path, current_history_path)
        #
        # # print(history.keys())
        # history_table = pd.DataFrame(history)
        # history_table.to_csv(os.path.join(current_history_path,"history_record.csv"),index=False)
        #
        # print("finish training for model : ",model_count )
        # model_count += 1
        #evaluate section
        config_info = {
            "use_teriminal_info": True,
            "use_100k": use_100k,
            "slate_size": slate_size,
            "num_candidates": num_candidates,
            "time_budget": time_budget,
            "time_budget_range": time_budget_range,
            "min_num_rating": min_num_rating,
            "min_num_positive_rating": min_num_positive_rating,
            "actor_lr": actor_lr,
            "critic_lr": critic_lr,
            "hidden_layer_1": hidden_layer_1,
            "hidden_layer_2": hidden_layer_2,
            "batch_size": batch_size,
            "tau": tau,
            "gamma": gamma,
            "buffer_size": buffer_size,
            "max_eps": max_eps,
            "max_num_user": max_num_user,
            "sample_user_randomly": sample_user_randomly,
            "save_frequenly": save_frequenly,
        }
        config_file_name = 'config.json'

        with open(os.path.join(model_folder, config_file_name), 'w') as fp:
            json.dump(config_info, fp, indent=4)
示例#19
0
)

# top_k
tf.flags.DEFINE_integer("top_k", 1, "Allow evaluate ranking")

FLAGS = tf.flags.FLAGS
# FLAGS._parse_flags()
# print("\nParameters:")
# for attr, value in sorted(FLAGS.__flags.items()):
#     print("{}={}".format(attr.upper(), value))
# print("")

# CHANGE THIS: Load data. Load your own data here
# TODO: Modify Eval_train
if FLAGS.eval_train:
    x_raw, y_raw = data_preprocess.load_data(FLAGS.data_file, FLAGS.class_file,
                                             FLAGS.char)
    class_vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "class_voca")
    class_processor = learn.preprocessing.VocabularyProcessor.restore(
        class_vocab_path)
    y_test = np.array(list(class_processor.transform(y_raw)), dtype="float32")
    y_test = y_test.ravel()
    # y_test = np.argmax(y_test, axis=1)
else:
    x_raw = [
        "a masterpiece four years in the making", "everything is off.",
        "what the f**k", "i love you", "hello, ma friend?", "go to hell",
        "do you want to be killed?"
    ]
    y_test = [1, 0, 0, 1, 1, 0, 0]

# Map data into vocabulary
示例#20
0
def test_DBN(finetune_lr=0.1,
             pretraining_epochs=50,
             pretrain_lr=0.001,
             k=1,
             training_epochs=1,
             dataset='grayscale.pkl.gz',
             batch_size=10,
             hidden_layers_sizes=[1000, 200, 50],
             pretrain_model='gray_pre1.save',
             logfile='newLog'):
    """
    Demonstrates how to train and test a Deep Belief Network.

    Adapted from the Theano tutorial originally demonstrated on MNIST; here it
    is applied to a 48x64 grayscale dataset with 4 output classes.

    :type finetune_lr: float
    :param finetune_lr: learning rate used in the finetune stage
    :type pretraining_epochs: int
    :param pretraining_epochs: number of epoch to do pretraining
    :type pretrain_lr: float
    :param pretrain_lr: learning rate to be used during pre-training
    :type k: int
    :param k: number of Gibbs steps in CD/PCD
    :type training_epochs: int
    :param training_epochs: maximal number of iterations to run the optimizer
    :type dataset: string
    :param dataset: path to the pickled dataset
    :type batch_size: int
    :param batch_size: the size of a minibatch
    """
    f = open(logfile, "w")
    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size

    # numpy random generator
    numpy_rng = numpy.random.RandomState(123)

    print >> f, '... building the model'
    # construct the Deep Belief Network
    dbn = DBN(
        numpy_rng=numpy_rng,
        n_ins=48 * 64,  # 26 * 56
        hidden_layers_sizes=hidden_layers_sizes,
        n_outs=4)

    # start-snippet-2
    #########################
    # PRETRAINING THE MODEL #
    #########################
    print >> f, '... getting the pretraining functions'
    pretraining_fns = dbn.pretraining_functions(train_set_x=train_set_x,
                                                batch_size=batch_size,
                                                k=k)

    print >> f, '... pre-training the model'
    best_obj = -99999999
    start_time = time.clock()
    ## Pre-train layer-wise
    for i in xrange(dbn.n_layers):
        # go through pretraining epochs
        for epoch in xrange(pretraining_epochs):
            # go through the training set
            c = []
            for batch_index in xrange(n_train_batches):
                c.append(pretraining_fns[i](index=batch_index, lr=pretrain_lr))
            cost_e = numpy.mean(c)
            print >> f, 'Pre-training layer %i, epoch %d, cost ' % (i, epoch),
            print >> f, cost_e
            if cost_e > best_obj:
                best_obj = cost_e

    end_time = time.clock()
    # end-snippet-2
    ff = file(pretrain_model, 'wb')
    cPickle.dump(dbn, ff, protocol=cPickle.HIGHEST_PROTOCOL)
    ff.close()
    print >> f, ('The pretraining code for file ' +
                 os.path.split(__file__)[1] + ' ran for %.2fm' %
                 ((end_time - start_time) / 60.))
    print >> f, ("final pretraining cost: %f" % best_obj)
    f.close()

    # sys.stdout.close()
    return best_obj
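    # NOTE: the early return above makes the fine-tuning section below
    # unreachable; it is kept here for reference only.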
    ########################
    # FINETUNING THE MODEL #
    ########################
    # get the training, validation and testing function for the model
    print '... getting the finetuning functions'
    train_fn, validate_model, test_model = dbn.build_finetune_functions(
        datasets=datasets, batch_size=batch_size, learning_rate=finetune_lr)

    print '... finetuning the model'
    # early-stopping parameters

    patience = 4 * n_train_batches  # look at this many examples regardless
    patience_increase = 2.  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many minibatches before checking the network on the
    # validation set; in this case we check every epoch

    best_validation_loss = numpy.inf
    test_score = 0.
    start_time = time.clock()

    done_looping = False
    epoch = 0

    while (epoch < training_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            minibatch_avg_cost = train_fn(minibatch_index)
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:

                validation_losses = validate_model()
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    #improve patience if loss improvement is good enough
                    if (this_validation_loss <
                            best_validation_loss * improvement_threshold):
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = test_model()
                    test_score = numpy.mean(test_losses)
                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print(('Optimization complete with best validation score of %f %%, '
           'obtained at iteration %i, '
           'with test performance %f %%') %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The fine tuning code for file ' +
                          os.path.split(__file__)[1] + ' ran for %.2fm' %
                          ((end_time - start_time) / 60.))
示例#21
0
def main(options):
    args = get_default_args()
    load_best_args(args, options, get_best_args())
    set_args(args, options)
    print_args(args)
    mode = args['mode']
    train_name, test_name = args['split']['train'], args['split']['test']
    if train_name == 'train_all':
        train_set = ['trec-2011', 'trec-2012', 'trec-2013', 'trec-2014']
        train_set.remove(test_name)
    else:
        train_set = [train_name]
    test_set = test_name
    print('train_set: {}, test_set: {}'.format(train_set, test_set))
    max_query_len, max_doc_len, max_url_len = defaultdict(int), defaultdict(
        int), defaultdict(int)
    vocab = {'word': {}, '3gram': {}, 'url': {}}
    test_vocab = {'word': {}, '3gram': {}, 'url': {}}

    ############################# LOAD DATA ##################################
    data_name = ("data_m%s_%s_%s" % (mode, train_name, test_name)).lower()
    if args["load_data"]:
        train_dataset, vocab, train_vocab_emb, max_query_len, max_doc_len, max_url_len = load_data(
            "%s/%s/%s" % (args["experimental_data"], data_name, train_name),
            True)
        test_dataset, test_vocab, test_vocab_emb, _, _, _ = load_data(
            "%s/%s/%s" % (args["experimental_data"], data_name, test_name),
            False)
        print('load dataset successfully')
    else:
        train_dataset = gen_data(args["raw_data"], train_set, vocab,
                                 test_vocab, True, max_query_len, max_doc_len,
                                 max_url_len, args)
        print("create training set successfully...")
        test_dataset = gen_data(args["raw_data"], [test_set], vocab,
                                test_vocab, False, max_query_len, max_doc_len,
                                max_url_len, args)
        train_vocab_emb, test_vocab_emb = construct_vocab_emb(
            "%s/%s" % (args["experimental_data"], data_name),
            vocab['word'],
            test_vocab['word'],
            300,
            "word",
            base_embed_path=args["base_embed_path"])
        save_data(
            "%s/%s/%s" % (args["experimental_data"], data_name, train_name),
            True, train_dataset, max_query_len, max_doc_len, max_url_len,
            vocab, train_vocab_emb)
        print("save training set successfully...")
        save_data("%s/%s/%s" %
                  (args["experimental_data"], data_name, test_name),
                  False,
                  test_dataset,
                  vocab=test_vocab,
                  vocab_emb=test_vocab_emb)
        print("save test set successfully...")

    val_split = args['val_split']
    num_samples, _ = train_dataset["query_word_input"].shape
    # randomly sample queries and all their documents if query_random is True
    # otherwise, query-doc pairs are randomly sampled
    query_random = True
    if query_random:
        val_indices = sample_val_set(args["raw_data"], train_set, val_split)
    else:
        val_indices, val_set = [], set()
        for i in range(int(num_samples * val_split)):
            val_index = np.random.randint(num_samples)
            while val_index in val_set:
                val_index = np.random.randint(num_samples)
            val_indices.append(val_index)
            val_set.add(val_index)

    val_dataset = {}
    for key in train_dataset:
        val_dataset[key] = train_dataset[key][val_indices]
        train_dataset[key] = np.delete(train_dataset[key], val_indices, 0)

    # shuffle the train dataset explicitly to make results reproducible
    # whether the performance will be affected remains a question
    keys, values = [], []
    for key in train_dataset:
        keys.append(key)
        values.append(train_dataset[key])
    zipped_values = list(zip(*values))
    random.shuffle(zipped_values)
    shuffled_values = list(zip(*zipped_values))
    for i, key in enumerate(keys):
        train_dataset[key] = np.array(shuffled_values[i])
    print('after shuffle: id {}, sim {}, query_word_input'.format(
        train_dataset['id'][:3], train_dataset['sim'][:3],
        train_dataset['query_word_input'][:3]))

    # merge the vocabulary of the train and test sets
    merged_vocab = {'url': vocab['url'], '3gram': vocab['3gram']}
    merged_vocab['word'] = merge_two_dicts(vocab['word'], test_vocab['word'])
    print("merged vocab: word(%d) 3gram(%d)" %
          (len(merged_vocab['word']), len(test_vocab['3gram'])))
    vocab_inv, vocab_size = {}, {}
    vocab['char'] = merge_two_dicts(vocab['3gram'], vocab['url'])
    test_vocab['char'] = merge_two_dicts(test_vocab['3gram'],
                                         test_vocab['url'])
    merged_vocab['char'] = merge_two_dicts(vocab['char'], test_vocab['char'])

    for key in vocab:
        vocab_inv[key] = invert_dict(merged_vocab[key])
        vocab_size[key] = len(vocab[key])
    print(vocab_size)

    # Print data samples for debug purpose
    print_dataset(mode, train_dataset, vocab_inv)
    print_dataset(mode, test_dataset, vocab_inv)

    ############################ TRAIN MODEL #################################
    model = None
    if mode == 'deep_twitter':
        model = create_attention_model(max_query_len,
                                       max_doc_len,
                                       max_url_len,
                                       vocab_size,
                                       train_vocab_emb,
                                       args["nb_filters"],
                                       embed_size=300,
                                       dropout_rate=args['dropout'],
                                       trainable=args["trainable"],
                                       weighting=args['weighting'],
                                       mask=args["mask"],
                                       conv_option=args['conv_option'],
                                       model_option=args['model_option'])
    model_name = (
        "model_N%s_data%s_mo%s_c%s_NumFilter%d_T%s_D%.1f_W%s_M%s_B%d_Val%.2f" %
        (mode, train_name, args['model_option'], args['conv_option'],
         args["nb_filters"], args["trainable"], args['dropout'],
         args['weighting'], args['mask'], args['batch_size'],
         args['val_split'])).lower()
    model_path = "%s/%s/%s" % (args['experimental_data'], data_name,
                               model_name)
    print(model_path)

    if args['optimizer'] == "adam":
        opt = optimizers.Adam(lr=args["learning_rate"],
                              beta_1=0.9,
                              beta_2=0.999,
                              epsilon=None,
                              decay=0.0,
                              amsgrad=True)
        print('use Adam optimizer')
    elif args['optimizer'] == "sgd":
        opt = optimizers.SGD(lr=args["learning_rate"],
                             decay=1e-6,
                             momentum=0.9,
                             nesterov=True)
        print('use SGD optimizer')
    elif args['optimizer'] == 'rmsprop':
        opt = optimizers.RMSprop(lr=args["learning_rate"],
                                 rho=0.9,
                                 epsilon=None,
                                 decay=0.0)
        print('use RMSprop optimizer')

    model.compile(loss='binary_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])
    print(model.summary())
    print('model init weights sum: %.4f' % get_model_weights(model))
    if not args['load_model']:
        early_stopping = EarlyStopping(monitor='val_loss', patience=4)
        checkpoint = ModelCheckpoint(filepath=model_path + ".best.weights",
                                     monitor='val_loss',
                                     save_best_only=True,
                                     verbose=1)
        lr_reducer = ReduceLROnPlateau(monitor='val_loss',
                                       factor=0.5,
                                       patience=2,
                                       min_lr=0.0001,
                                       verbose=1)
        #print(train_dataset['id'][:3], val_dataset['id'][:3], val_dataset['id'][-3:])
        model.fit(train_dataset,
                  train_dataset['sim'],
                  validation_data=(val_dataset, val_dataset['sim']),
                  batch_size=args['batch_size'],
                  epochs=args['epochs'],
                  shuffle=False,
                  callbacks=[checkpoint, lr_reducer, early_stopping],
                  verbose=args['verbose'])

    ############################ TEST MODEL #################################
    print('load best model from %s.best.weights' % model_path)
    model.load_weights("%s.best.weights" % model_path)
    if mode == 'deep_twitter':
        # load trained vocab embedding.
        trained_vocab_emb = model.get_layer('sequential_2').get_weights()[0]
        # merge trained vocab embedding with test OOV word embeddings
        merged_vocab_emb = np.zeros(shape=(len(merged_vocab['word']), 300))
        merged_vocab_emb[0:len(vocab['word']), :] = trained_vocab_emb
        merged_vocab_emb[
            len(vocab['word']):len(merged_vocab['word']), :] = test_vocab_emb
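        # rows [0, len(vocab['word'])) reuse the fine-tuned training embeddings;
        # the remaining rows hold embeddings for test-only (OOV) words so the
        # rebuilt model can score unseen test vocabulary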
        for key in vocab:
            vocab_size[key] = len(merged_vocab[key])
        print(vocab_size)

        new_model = create_attention_model(max_query_len,
                                           max_doc_len,
                                           max_url_len,
                                           vocab_size,
                                           merged_vocab_emb,
                                           args["nb_filters"],
                                           embed_size=300,
                                           dropout_rate=args['dropout'],
                                           trainable=args["trainable"],
                                           weighting=args['weighting'],
                                           mask=args["mask"],
                                           conv_option=args['conv_option'],
                                           model_option=args['model_option'])
        new_model.compile(loss='binary_crossentropy',
                          optimizer='adam',
                          metrics=['accuracy'])
        print(new_model.summary())
        num_layers = len(model.layers)
        for layer_id in range(num_layers):
            layer = model.layers[layer_id]
            if layer.name != 'sequential_2':
                new_model.layers[layer_id].set_weights(layer.get_weights())
        print('copy weight done.')
        predictions = new_model.predict(test_dataset)

    print(predictions[:10])
    predictions_file = "%s/%s/predictions_%s.txt" % (args["experimental_data"],
                                                     data_name, model_name)
    with open(predictions_file, 'w') as f:
        for i in range(test_dataset['id'].shape[0]):
            f.write("%s %.4f %s\n" %
                    (test_dataset['id'][i], predictions[i], args['mode']))
    print('write predictions with trec format to %s' % predictions_file)
    map, mrr, p30 = evaluate(predictions_file, args["qrels_file"])
    print('MAP: %.4f P30: %.4f MRR: %.4f' % (map, p30, mrr))
示例#22
0
def sgd_optimization_mnist(learning_rate=0.13,
                           momentum=0,
                           n_epochs=25,
                           dataset='grayscale_seg_binary_data.pkl.gz',
                           batch_size=50):
    """
    Demonstrate stochastic gradient descent optimization of a log-linear
    model

    Adapted from the Theano tutorial originally demonstrated on MNIST; here it
    is applied to a 48x64 grayscale dataset with two output classes.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: the path of the pickled dataset file (the original
                    tutorial used MNIST from
                    http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz)

    """
    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    # generate symbolic variables for input (x and y represent a
    # minibatch)
    x = T.matrix('x')  # data, presented as rasterized images
    y = T.ivector('y')  # labels, presented as 1D vector of [int] labels

    # construct the logistic regression class
    # Each MNIST image has size 28*28
    # numpy.array(train_set_y.eval()).max() + 1
    classifier = LogisticRegression(input=x, n_in=48 * 64, n_out=2)

    # the cost we minimize during training is the negative log likelihood of
    # the model in symbolic format
    cost = classifier.negative_log_likelihood(y)

    # compiling a Theano function that computes the mistakes that are made by
    # the model on a minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # compute the gradient of cost with respect to theta = (W,b)
    #    g_W = T.grad(cost=cost, wrt=classifier.W)
    #    g_b = T.grad(cost=cost, wrt=classifier.b)

    # start-snippet-3
    # specify how to update the parameters of the model as a list of
    # (variable, update expression) pairs.
    #    updates = [(classifier.W, classifier.W - learning_rate * g_W),
    #               (classifier.b, classifier.b - learning_rate * g_b)]

    # LK change: sgd with momentum. (tested. okay)
    #    updates = [(classifier.model_update_W, momentum * classifier.model_update_W - learning_rate * g_W),
    #               (classifier.model_update_b, momentum * classifier.model_update_b - learning_rate * g_b),
    #               (classifier.W, classifier.W + classifier.model_update_W),
    #               (classifier.b, classifier.b + classifier.model_update_b)]
    # LK change2: sgd with momentum 2.
    grads = T.grad(cost, classifier.params)

    #    updates1 = [
    #        (model_update_i, model_update_i * momentum - learning_rate * grad_i)
    #        for model_update_i, grad_i in zip(classifier.model_update, grads)
    #    ]
    #    updates2 = [
    #        (param_i, param_i + model_update_i)
    #        for param_i, model_update_i in zip(classifier.params, classifier.model_update)
    #    ]

    updates1 = [(param_i,
                 param_i + model_update_i * momentum - learning_rate * grad_i)
                for param_i, model_update_i, grad_i in zip(
                    classifier.params, classifier.model_update, grads)]
    updates2 = [
        (model_update_i, model_update_i * momentum - learning_rate * grad_i)
        for model_update_i, grad_i in zip(classifier.model_update, grads)
    ]
    updates = updates1 + updates2
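    # all updates in one Theano function read the *old* shared values, so this
    # is classical SGD with momentum: v_new = momentum * v_old - lr * grad and
    # param_new = param_old + v_new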

    # compiling a Theano function `train_model` that returns the cost, but in
    # the same time updates the parameter of the model based on the rules
    # defined in `updates`
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })
    # end-snippet-3

    ###############
    # TRAIN MODEL #
    ###############
    print '... training the model'
    # early-stopping parameters
    patience = 5000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many minibatches before checking the network on the
    # validation set; in this case we check every epoch

    best_validation_loss = numpy.inf
    test_score = 0.
    start_time = time.clock()

    done_looping = False
    epoch = 0
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            minibatch_avg_cost = train_model(minibatch_index)
            # iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [
                    validate_model(i) for i in xrange(n_valid_batches)
                ]
                this_validation_loss = numpy.mean(validation_losses)

                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss = this_validation_loss
                    # test it on the test set

                    test_losses = [
                        test_model(i) for i in xrange(n_test_batches)
                    ]
                    test_score = numpy.mean(test_losses)

                    print(('     epoch %i, minibatch %i/%i, test error of'
                           ' best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print(('Optimization complete with best validation score of %f %%, '
           'with test performance %f %%') %
          (best_validation_loss * 100., test_score * 100.))
    print 'The code run for %d epochs, with %f epochs/sec' % (
        epoch, 1. * epoch / (end_time - start_time))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.1fs' % ((end_time - start_time)))
    #
    #     # update target networks
    #     self.actor.update_target_network()
    #     self.critic.update_target_network()
    #
    #     return np.amax(q_value), critic_loss


path = '../master_capston/the-movies-dataset/'
features_embedding_movies = pd.read_csv(
    os.path.join(path, 'movie_embedding_features.csv'))
sampler = LTSDocumentSampler(dataset=features_embedding_movies)
slate_size = 3
num_candidates = 15

format_data = data_preprocess.load_data(path)
# print(format_data.head())
# print(format_data.shape)

features_embedding_movies = pd.read_csv(
    os.path.join(path, 'movie_embedding_features.csv'))
positive_user_ids, positive_history_data = data_preprocess.get_user_positive(
    format_data)
user_sampler = LTSStaticUserSampler(positive_user_ids, positive_history_data,
                                    features_embedding_movies)
LTSUserModel = UserModel(user_sampler, slate_size, LTSResponse)
ltsenv = environment.Environment(LTSUserModel,
                                 sampler,
                                 num_candidates,
                                 slate_size,
                                 resample_documents=True)
示例#24
0
def build_SVM(filename, option, svm_type=None, poly_degree=None):

    # LOAD DATA
    descriptors = qm_descriptors
    X, Y = data_preprocess.load_data(filename, descriptors)

    if svm_type is None:
        svm_type = 'linear'

    if poly_degree is None:
        poly_degree = 2
        #print('training polynomial SVM of degree', poly_degree)

    if option == 'default':
        
        print('Training SVM...')
        print('*-----------------------------*')
        print('Training on default parameters.')
        
        accuracies_default = []
        for i in range(10):
            x_train, x_valid, y_train, y_valid = data_preprocess.split_data(
                X, Y, partition=0.20)
            accuracies_default.append(
                train_SVM(x_train, y_train, x_valid, y_valid))

        print('Average accuracy over 10 default runs: %.2f' %
              numpy.mean(accuracies_default))
        
        
    elif option == 'train':

        print('*-----------------------------*')
        print('Searching for best parameters.')

        params = []
        accuracies = []

        for i in range(10):
            x_train, x_valid, y_train, y_valid = data_preprocess.split_data(
                X, Y, partition=0.20)
            best_parameters = scan_parameters(x_train, y_train)
            params.append(best_parameters)
            accuracy = train_SVM(x_train, y_train, x_valid, y_valid,
                                 best_parameters)
            accuracies.append(accuracy)

        print('*-----------------------------*')
        print('Summary of Results.')
        print('*-----------------------------*')

        for i in range(len(accuracies)):
            print('Run ' + str(i + 1) + ' ', params[i], ' : ', accuracies[i])
                

    elif option == 'RFE':
        
        print('*-----------------------------*')
        print('Recursive feature estimation.')
        #http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html#sklearn.feature_selection.RFE
        
        ranking = perform_RFE(X, Y)
            
        print('*-----------------------------*')
        print('Ranking of descriptors.')            
        print('*-----------------------------*')
        for d in range(len(qm_descriptors)):
            print(qm_descriptors[d], ranking[d])


    elif option == 'test':

        print('TESTING')
        print('*-----------------------------*')

        #kernels = 'rbf'
        #Cs = 1
        #gammas = 1
        #degrees = 3
        #weights = None

        kernels = 'rbf'
        Cs = 10
        gammas = 0.1
        degrees = 3
        weights = None

        params_dict = {
            'kernel': kernels,
            'C': Cs,
            'class_weight': weights,
            'degree': degrees,
            'gamma': gammas
        }

        acc_list = []
        for i in range(10):
            x_train, x_valid, y_train, y_valid = data_preprocess.split_data(
                X, Y, partition=0.20)
            acc_list.append(
                train_SVM(x_train, y_train, x_valid, y_valid, params_dict))

        print('Summary of Results.')
        print('*-----------------------------*')
        print('Average accuracy over 10 runs: %.2f' % numpy.mean(acc_list))
示例#25
0
def train(opts):
    """ training process starts here """
    
    print '==> Training a language model'  
    print '    [Word only]'
 
    
    #---------------------------------------------------------
    # prepare ingredients
    #---------------------------------------------------------   

    print '==> Loading dictionaries: ',
    
    # load word dictionary
    print 'word dict,',
    if opts['word_dictionary']:
        with open(opts['word_dictionary'], 'rb') as f:
            word_dict = pkl.load(f) # word -> index 
        word_idict = dict()
        for kk, vv in word_dict.iteritems():
            word_idict[vv] = kk     # index -> word 
    print 'Done'        
    
    # reload options 
    if opts['reload_'] and os.path.exists(opts['saveto']):
        with open('%s.pkl' % opts['saveto'], 'rb') as f:
            reloaded_options = pkl.load(f)
            opts.update(reloaded_options)
   
    # load training data
    train = load_data(path=opts['train_text'])
 
    # initialize params
    print '==> Building model:'
    params = init_params(opts)

    # reload parameters
    if opts['reload_'] and os.path.exists(opts['saveto']):
        params = load_params(opts['saveto'], params)

    # convert params to Theano shared variable
    tparams = init_tparams(params)
    
    # build computational graph 
    trng, is_train, x_word_input, x_mask, cost = build_model(tparams, opts)
    inps = [x_word_input, x_mask]

    print '==> Building f_cost...',
    f_cost = theano.function(inps, cost)
    print 'Done'

    # get gradients
    print '==> Computing gradient...',
    grads = tensor.grad(cost, wrt=itemlist(tparams))

    # gradient clipping
    print 'gradient clipping...',
    grad_norm = tensor.sqrt(tensor.sum([tensor.sum(g**2.) for g in grads]))
    tau = opts['gradclip']
    grad_clipped = []
    for g in grads:
        grad_clipped.append(tensor.switch(tensor.ge(grad_norm, tau), g * tau / grad_norm, g))
    print 'Done'

    # build optimizer
    lr = tensor.scalar(name='lr')
    print '==> Building optimizers...',
    f_grad_shared, f_update = eval(opts['optimizer'])(lr, tparams, grad_clipped, inps, cost)
    print 'Done'
 
    #---------------------------------------------------------
    # start optimization
    #---------------------------------------------------------   

    print '==> Optimization:'

    # reload history
    history_errs = []
    if opts['reload_'] and os.path.exists(opts['saveto']):
        history_errs = list(numpy.load(opts['saveto'])['history_errs'])
    best_p = None
    bad_counter = 0

    # load validation and test data
    if opts['valid_text']:
        valid_lines = []
        with open(opts['valid_text'], 'r') as f:
            for l in f:
                valid_lines.append(l)
        n_valid_lines = len(valid_lines)
    if opts['test_text']:
        test_lines = []
        with open(opts['test_text'], 'r') as f:
            for l in f:
                test_lines.append(l)
        n_test_lines = len(test_lines)
    
    # initialize some values
    uidx = 0                 # update counter
    estop = False            # early stopping flag
    lrate = opts['lrate'] 
    batch_size = opts['batch_size']

    # outer loop: epochs
    for eidx in xrange(opts['max_epochs']):
        
        n_samples = 0  # sample counter
              
        # shuffle training data every epoch
        print '==> Shuffling sentences...',
        shuffle(train)
        print 'Done'
      
        # learning rate decay
        if eidx >= opts['lr_decay_start']:
            lrate /= opts['lr_decay'] 

        print 'epoch = ', eidx, 'lr = ', lrate
 
        # training iterator 
        kf_train = KFold(len(train), n_folds=len(train)/(batch_size-1), shuffle=False)
  
        # inner loop: batches
        for _, index in kf_train:
            n_samples += len(index)
            uidx += 1

            # is_train=1 at training time
            is_train.set_value(1.)

            # get a batch
            x = [train[i] for i in index]
                
            # format input data
            x_word_input_, x_mask_ = txt_to_word_inps(x, word_dict, opts) 

            # compute cost 
            cost = f_grad_shared(x_word_input_, (1 - x_mask_))     

            # update parameters 
            f_update(lrate)

            # check cost  
            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return 1., 1., 1.
  
            # display cost
            if numpy.mod(uidx, opts['dispFreq']) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost

            # save params
            if numpy.mod(uidx, opts['saveFreq']) == 0:
                print 'Saving...',
                if best_p is not None:
                    params = best_p
                else:
                    params = unzip(tparams)
                numpy.savez(opts['saveto'], history_errs=history_errs, **params)
                pkl.dump(opts, open('%s.pkl' % opts['saveto'], 'wb'))
                print 'Done'

            # compute validation/test perplexity
            if numpy.mod(uidx, opts['validFreq']) == 0:
                print "Computing Dev/Test Perplexity"
                
                # is_train=0 at valid/test time
                is_train.set_value(0.)                  
                valid_err = perplexity(f_cost, valid_lines, word_dict, opts)               
                test_err = perplexity(f_cost, test_lines, word_dict, opts)
                history_errs.append([valid_err, test_err])
                
                # save the best params
                if len(history_errs) > 1:
                    if uidx == 0 or valid_err <= numpy.array(
                            history_errs)[:, 0].min():
                        best_p = unzip(tparams)
                        print 'Saving best params...',
                        numpy.savez(opts['savebestto'], history_errs=history_errs, **best_p)
                        pkl.dump(opts, open('%s.pkl' % opts['savebestto'], 'wb'))
                        print 'Done'
                        bad_counter = 0
                    if len(history_errs) > opts['patience'] and valid_err >= numpy.array(
                                history_errs)[:-opts['patience'], 0].min():
                        bad_counter += 1
                        if bad_counter > opts['patience']:
                            print 'Early Stop!'
                            estop = True
                            break

                print 'Valid ', valid_err, 'Test ', test_err 
   
        # inner loop: end
  
        print 'Seen %d samples' % n_samples

        # early stopping
        if estop:
            break
    
    # outer loop: end 
   
    if best_p is not None:
        zipp(best_p, tparams)
    
    # compute validation/test perplexity at the end of training
    is_train.set_value(0.)
    valid_err = perplexity(f_cost, valid_lines, word_dict, opts)
    test_err = perplexity(f_cost, test_lines, word_dict, opts)
    print 'Valid ', valid_err, 'Test ', test_err

    # save everything
    params = copy.copy(best_p)
    numpy.savez(opts['saveto'], zipped_params=best_p, valid_err=valid_err, 
                test_err=test_err, history_errs=history_errs, **params)

    return valid_err, test_err
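
A minimal NumPy sketch of the global-norm gradient-clipping rule applied in the training loop above; the function name clip_by_global_norm and the example values are illustrative, not part of the original code:

import numpy

def clip_by_global_norm(grads, tau):
    """Rescale every gradient by tau / ||grads||_2 once the global L2 norm reaches tau."""
    grad_norm = numpy.sqrt(sum(numpy.sum(g ** 2) for g in grads))
    if grad_norm >= tau:
        return [g * tau / grad_norm for g in grads]
    return list(grads)

# tiny check: a gradient of norm 5 is rescaled to norm 1 when tau = 1
grads = [numpy.array([3.0, 4.0])]
clipped = clip_by_global_norm(grads, tau=1.0)   # [array([0.6, 0.8])]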
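
A small standalone sketch of the patience-based early-stopping check performed at each validation step; the function name and the toy history are assumptions, and the real loop additionally tracks a bad_counter between checks:

def early_stop(history_errs, patience):
    """Stop once the last `patience` validation errors are all no better than
    the best error recorded before that window (lower is better)."""
    valid_errs = [v for v, _ in history_errs]
    if len(valid_errs) <= patience:
        return False
    best_before_window = min(valid_errs[:-patience])
    return all(v >= best_before_window for v in valid_errs[-patience:])

# example: validation error improves to 1.5 and then stalls at 2.0
hist = [[3.0, 3.1], [1.5, 1.6], [2.0, 2.1], [2.0, 2.1], [2.0, 2.1]]
print(early_stop(hist, patience=3))   # True -> trigger 'Early Stop!'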
示例#27
0
#                print(str(b+1)+"): Label "+label_list[b]+ " corresponds with topic "+str(max_indexes[i]))
#                print("Topic: "+tps[max_indexes[i]][1])
#        else:
#            print(str(b+1)+"): Label "+label_list[b]+ " has no clear topic match")
#        
##        for index, score in sorted(model[bow_corpus[i]], key=lambda tup: -1*tup[1]):
##            print(index, score, labels[i])
##            print(model.get_document_topics(bow_corpus[i]))
##            print("\nScore: {}\t \nTopic: {}".format(score, model.print_topic(index, 10)))
#    return scores, topics
        
    
if __name__ == "__main__":
    
    print("Loading and splitting data for training and testing.\n")
    data = dp.load_data()
    training_data, testing_data = dp.get_split_data(data)
    files = list(data.keys())
    training_files = []
    training_labels = []
    training_content = []
    testing_files = []
    testing_labels = []
    testing_content = []
    training_keys = list(training_data.keys())
    for i, f in enumerate(files):
        if f in training_keys:
            training_files.append(f)
            training_labels.append(training_data[f]['label'])
            training_content.append(training_data[f]['content'])
        else: