import multiprocessing
import os

import tensorflow as tf

# word2vec, DataGenerator, create_local_model_path, create_local_log_path,
# generate_tensorboard_script and skip_gram_batch_content are project-local
# helpers; import them from their modules in this repository.

# Module-level defaults referenced by several functions below.
NUM_THREADS = multiprocessing.cpu_count()
COMMON_PATH = os.path.join(os.path.expanduser("~"), 'local_tensorflow_content')


def build_word2vec_model(model_name):
    '''Retrieve a trained model from persistent files.

    Args:
        model_name (string): the model name, assuming models are
            located in the standard folder.
    Returns:
        model (word2vec object): the restored model
    '''
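    # Oversubscribe the CPU slightly (2x cores minus one) for TensorFlow's
    # intra-op thread pool.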
    NUM_THREADS = 2 * multiprocessing.cpu_count() - 1
    COMMON_PATH = os.path.join(os.path.expanduser("~"),
                               'local_tensorflow_content')

    model_config = {}
    model_config['model_name'] = model_name
    model_config['restore_model'] = True
    model_config['eval_mode'] = True

    use_gpu = False
    if use_gpu:
        model_config['sess_config'] = tf.ConfigProto(
            log_device_placement=False,
            gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=0.5))
    else:
        # Hiding all devices via CUDA_VISIBLE_DEVICES is the only way to keep
        # TensorFlow completely off the GPU.
        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
        model_config['sess_config'] = tf.ConfigProto(
            intra_op_parallelism_threads=NUM_THREADS)

    model_config['model_path'] = create_local_model_path(
        COMMON_PATH, model_config['model_name'])
    model_config['log_path'] = create_local_log_path(
        COMMON_PATH, model_config['model_name'])

    model = word2vec(**model_config)
    return model
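
# Usage sketch (assumes a trained 'word2vec' checkpoint already exists under
# ~/local_tensorflow_content; the word IDs are illustrative):
#
#     model = build_word2vec_model('word2vec')
#     max_X, min_X, mean_X = model.predict([2, 4, 5])
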
def predict_with_word2vec():
    '''Restore the trained 'word2vec' model and print a sample prediction.'''
    model_config = {}
    model_config['model_name'] = 'word2vec'
    model_config['restore_model'] = True
    model_config['eval_mode'] = True

    use_gpu = False
    if use_gpu:
        model_config['sess_config'] = tf.ConfigProto(
            log_device_placement=False,
            gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=0.5))
    else:
        # Hiding all devices via CUDA_VISIBLE_DEVICES is the only way to keep
        # TensorFlow completely off the GPU.
        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
        model_config['sess_config'] = tf.ConfigProto(
            intra_op_parallelism_threads=NUM_THREADS)

    model_config['model_path'] = create_local_model_path(
        COMMON_PATH, model_config['model_name'])
    model_config['log_path'] = create_local_log_path(
        COMMON_PATH, model_config['model_name'])

    model = word2vec(**model_config)
    max_X, min_X, mean_X = model.predict([2, 4, 5])
    print(max_X)
    print(min_X)


def main():
    NUM_THREADS = multiprocessing.cpu_count()
    COMMON_PATH = os.path.join(os.path.expanduser("~"),
                               'local_tensorflow_content')

    pickle_file = 'titles_CBOW_data.pkl'
    pickle_file_path = os.path.join(os.path.expanduser("~"), pickle_file)
    dataGen = DataGenerator(pickle_file_path)
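    # DataGenerator (project-local) loads the pickled CBOW pairs and exposes
    # vocab_size, data_size and a batch generator.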

    model_config, training_config = {}, {}
    model_config['vocab_size'] = dataGen.vocab_size
    model_config['batch_size'] = 32
    model_config['context_window'] = 2
    model_config['embedding_size'] = 128
    model_config['neg_sample_size'] = 2
    model_config['learning_rate'] = 0.0005
    model_config['model_name'] = 'word2vec'
    batches = dataGen.generate_sequence(model_config['batch_size'])
    model = word2vec(**model_config)

    use_gpu = False
    if use_gpu:
        training_config['sess_config'] = tf.ConfigProto(
            log_device_placement=False,
            gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=0.5))
    else:
        # Hiding all devices via CUDA_VISIBLE_DEVICES is the only way to keep
        # TensorFlow completely off the GPU.
        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
        training_config['sess_config'] = tf.ConfigProto(
            intra_op_parallelism_threads=NUM_THREADS)

    training_config['model_path'] = create_local_model_path(
        COMMON_PATH, model_config['model_name'])
    training_config['log_path'] = create_local_log_path(
        COMMON_PATH, model_config['model_name'])
    # Write a helper script that launches TensorBoard against the log dir.
    generate_tensorboard_script(training_config['log_path'])

    training_config['epoch_num'] = 20000
    training_config['display_steps'] = 1000
    training_config['saving_steps'] = 1 * training_config['display_steps']
    training_config['num_batches'] = int(dataGen.data_size *
                                         training_config['epoch_num'] /
                                         model_config['batch_size'])
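    # num_batches = data_size * epoch_num / batch_size, i.e. the total number
    # of optimizer steps across all epochs.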
    print('total #batches: {}, vocab_size: {}'.format(
        training_config['num_batches'], model_config['vocab_size']))

    model.train(batches, training_config, restore_model=False)


def model_train():
    pickle_file = 'lemmatized_only_skip_gram_window_2_skips_2.pkl'
    pickle_file_path = os.path.join(os.path.expanduser("~"), pickle_file)
    dataGen = DataGenerator(pickle_file_path)

    model_config, training_config = {}, {}
    model_config['vocab_size'] = dataGen.vocab_size
    model_config['model_name'] = 'word2vec_skip_gram_lemmatized_only_window_2_skips_2'
    model_config['model_type'] = "SKIP_GRAM"
    model_config['batch_size'] = 32
    model_config['context_window'] = 2
    model_config['embedding_size'] = 128
    model_config['neg_sample_size'] = 10
    model_config['learning_rate'] = 0.001
    model_config['saving_steps'] = 20000
    model_config['restore_model'] = False
    model_config['eval_mode'] = False
    batches = dataGen.generate_sequence(model_config['batch_size'])

    use_gpu = False
    if use_gpu:
        model_config['sess_config'] = tf.ConfigProto(
            log_device_placement=False,
            gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=0.5))
    else:
        # Hiding all devices via CUDA_VISIBLE_DEVICES is the only way to keep
        # TensorFlow completely off the GPU.
        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
        model_config['sess_config'] = tf.ConfigProto(
            intra_op_parallelism_threads=NUM_THREADS)

    model_config['model_path'] = create_local_model_path(
        COMMON_PATH, model_config['model_name'])
    model_config['log_path'] = create_local_log_path(
        COMMON_PATH, model_config['model_name'])
    # Write a helper script that launches TensorBoard against the log dir.
    generate_tensorboard_script(model_config['log_path'])
    model = word2vec(**model_config)

    epoch_num = 20000
    training_config['batches'] = batches
    training_config['display_steps'] = 10000
    training_config['saving_steps'] = model_config['saving_steps']
    training_config['num_batches'] = int(
        dataGen.data_size * epoch_num / model_config['batch_size'])
    print('total #batches: {}, vocab_size: {}'.format(
        training_config['num_batches'], model_config['vocab_size']))
    model.train(**training_config)
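
# Usage sketch (assumes the pickled skip-gram dataset
# 'lemmatized_only_skip_gram_window_2_skips_2.pkl' exists in the home
# directory):
#
#     model_train()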


# Example #5

def main():
    model_config = {}
    model_config['batch_size'] = 32
    model_config['context_window'] = 2
    model_config['embedding_size'] = 128
    model_config['neg_sample_size'] = 10
    model_config['learning_rate'] = 0.001
    model_config['model_name'] = 'word2vec_test'
    model_config['restore_model'] = False
    model_config['eval_mode'] = False
    model_config['model_type'] = 'SKIP_GRAM'
    model_config['model_path'] = create_local_model_path(
        COMMON_PATH, model_config['model_name'])
    model_config['log_path'] = create_local_log_path(
        COMMON_PATH, model_config['model_name'])

    use_gpu = False
    if use_gpu:
        model_config['sess_config'] = tf.ConfigProto(
            log_device_placement=False,
            gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=0.5))
    else:
        # Hiding all devices via CUDA_VISIBLE_DEVICES is the only way to keep
        # TensorFlow completely off the GPU.
        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
        model_config['sess_config'] = tf.ConfigProto(
            intra_op_parallelism_threads=NUM_THREADS)

    model = word2vec(**model_config)

    with model.graph.as_default():
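        # skip_gram_batch_content() is a project-local helper that returns one
        # (inputs, labels) batch to feed the placeholders below.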
        input_X_, input_y = skip_gram_batch_content()
        print "input_X_: ", input_X_
        print "input_y: ", input_y
        _, loss = model.sess.run([model.train_op, model.loss],
                                 feed_dict={
                                     model.X: input_X_,
                                     model.y: input_y
                                 })
        print "the model loss: {}".format(loss)