from __future__ import print_function

import multiprocessing
import os

import tensorflow as tf

# Project-local helpers; these module paths are assumptions -- adjust them
# to match the repository layout.
from word2vec import word2vec
from data_generator import DataGenerator
from utils import (create_local_model_path, create_local_log_path,
                   generate_tensorboard_script)

COMMON_PATH = os.path.join(os.path.expanduser("~"), 'local_tensorflow_content')
NUM_THREADS = multiprocessing.cpu_count()


def build_word2vec_model(model_name):
    '''Retrieve the model from persistent files.

    Args:
        model_name (string): the model name, assuming models are located
            in the standard folder under COMMON_PATH.

    Returns:
        model (word2vec object): the retrieved model.
    '''
    num_threads = 2 * multiprocessing.cpu_count() - 1
    model_config = {}
    model_config['model_name'] = model_name
    model_config['restore_model'] = True
    model_config['eval_mode'] = True

    use_gpu = False
    if use_gpu:
        model_config['sess_config'] = tf.ConfigProto(
            log_device_placement=False,
            gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=0.5))
    else:
        # Hiding all devices is the only way to completely bypass the GPU.
        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
        model_config['sess_config'] = tf.ConfigProto(
            intra_op_parallelism_threads=num_threads)

    model_config['model_path'] = create_local_model_path(
        COMMON_PATH, model_config['model_name'])
    model_config['log_path'] = create_local_log_path(
        COMMON_PATH, model_config['model_name'])
    model = word2vec(**model_config)
    return model
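
# A minimal usage sketch (not one of the original entry points): restore a
# saved model by name and query it the way predict_with_word2vec() does
# below. The model name and the token ids [2, 4, 5] are placeholders.
def example_restore_and_predict():
    model = build_word2vec_model('word2vec')
    max_X, min_X, mean_X = model.predict([2, 4, 5])
    print(max_X, min_X, mean_X)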
def predict_with_word2vec():
    model_config = {}
    model_config['model_name'] = 'word2vec'
    model_config['restore_model'] = True
    model_config['eval_mode'] = True

    use_gpu = False
    if use_gpu:
        model_config['sess_config'] = tf.ConfigProto(
            log_device_placement=False,
            gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=0.5))
    else:
        # Hiding all devices is the only way to completely bypass the GPU.
        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
        model_config['sess_config'] = tf.ConfigProto(
            intra_op_parallelism_threads=NUM_THREADS)

    model_config['model_path'] = create_local_model_path(
        COMMON_PATH, model_config['model_name'])
    model_config['log_path'] = create_local_log_path(
        COMMON_PATH, model_config['model_name'])
    model = word2vec(**model_config)

    max_X, min_X, mean_X = model.predict([2, 4, 5])
    print(max_X)
    print(min_X)
def main():
    '''Train a CBOW word2vec model on the pickled title data.'''
    pickle_file = 'titles_CBOW_data.pkl'
    pickle_file_path = os.path.join(os.path.expanduser("~"), pickle_file)
    dataGen = DataGenerator(pickle_file_path)

    model_config, training_config = {}, {}
    model_config['vocab_size'] = dataGen.vocab_size
    model_config['batch_size'] = 32
    model_config['context_window'] = 2
    model_config['embedding_size'] = 128
    model_config['neg_sample_size'] = 2
    model_config['learning_rate'] = 0.0005
    model_config['model_name'] = 'word2vec'
    batches = dataGen.generate_sequence(model_config['batch_size'])
    model = word2vec(**model_config)

    use_gpu = False
    if use_gpu:
        training_config['sess_config'] = tf.ConfigProto(
            log_device_placement=False,
            gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=0.5))
    else:
        # Hiding all devices is the only way to completely bypass the GPU.
        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
        training_config['sess_config'] = tf.ConfigProto(
            intra_op_parallelism_threads=NUM_THREADS)

    training_config['model_path'] = create_local_model_path(
        COMMON_PATH, model_config['model_name'])
    training_config['log_path'] = create_local_log_path(
        COMMON_PATH, model_config['model_name'])
    # Create the shell script that starts a tensorboard session on the logs.
    generate_tensorboard_script(training_config['log_path'])

    training_config['epoch_num'] = 20000
    training_config['display_steps'] = 1000
    training_config['saving_steps'] = 1 * training_config['display_steps']
    training_config['num_batches'] = int(
        dataGen.data_size * training_config['epoch_num'] /
        model_config['batch_size'])
    print('total #batches: {}, vocab_size: {}'.format(
        training_config['num_batches'], model_config['vocab_size']))

    model.train(batches, training_config, restore_model=False)
def model_train():
    pickle_file = 'lemmatized_only_skip_gram_window_2_skips_2.pkl'
    pickle_file_path = os.path.join(os.path.expanduser("~"), pickle_file)
    dataGen = DataGenerator(pickle_file_path)

    model_config, training_config = {}, {}
    model_config['vocab_size'] = dataGen.vocab_size
    model_config['model_name'] = 'word2vec_skip_gram_lemmatized_only_window_2_skips_2'
    model_config['model_type'] = "SKIP_GRAM"
    model_config['batch_size'] = 32
    model_config['context_window'] = 2
    model_config['embedding_size'] = 128
    model_config['neg_sample_size'] = 10
    model_config['learning_rate'] = 0.001
    model_config['saving_steps'] = 20000
    model_config['restore_model'] = False
    model_config['eval_mode'] = False
    batches = dataGen.generate_sequence(model_config['batch_size'])

    use_gpu = False
    if use_gpu:
        model_config['sess_config'] = tf.ConfigProto(
            log_device_placement=False,
            gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=0.5))
    else:
        # Hiding all devices is the only way to completely bypass the GPU.
        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
        model_config['sess_config'] = tf.ConfigProto(
            intra_op_parallelism_threads=NUM_THREADS)

    model_config['model_path'] = create_local_model_path(
        COMMON_PATH, model_config['model_name'])
    model_config['log_path'] = create_local_log_path(
        COMMON_PATH, model_config['model_name'])
    # Create the shell script that starts a tensorboard session on the logs.
    generate_tensorboard_script(model_config['log_path'])
    model = word2vec(**model_config)

    epoch_num = 20000
    training_config['batches'] = batches
    training_config['display_steps'] = 10000
    training_config['saving_steps'] = model_config['saving_steps']
    training_config['num_batches'] = int(
        dataGen.data_size * epoch_num / model_config['batch_size'])
    print('total #batches: {}, vocab_size: {}'.format(
        training_config['num_batches'], model_config['vocab_size']))

    model.train(**training_config)
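
# A minimal sketch (assumed workflow, not in the original file): once
# model_train() has written checkpoints, the skip-gram model can be
# restored for evaluation through build_word2vec_model defined above.
def example_restore_skip_gram():
    return build_word2vec_model(
        'word2vec_skip_gram_lemmatized_only_window_2_skips_2')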
def run_single_batch():
    '''Smoke-test the graph: run one training step on a hand-fed batch.'''
    model_config = {}
    model_config['batch_size'] = 32
    model_config['context_window'] = 2
    model_config['embedding_size'] = 128
    model_config['neg_sample_size'] = 10
    model_config['learning_rate'] = 0.001
    model_config['model_name'] = 'word2vec_test'
    model_config['restore_model'] = False
    model_config['eval_mode'] = False
    model_config['model_type'] = 'SKIP_GRAM'
    model_config['model_path'] = create_local_model_path(
        COMMON_PATH, model_config['model_name'])
    model_config['log_path'] = create_local_log_path(
        COMMON_PATH, model_config['model_name'])

    use_gpu = False
    if use_gpu:
        model_config['sess_config'] = tf.ConfigProto(
            log_device_placement=False,
            gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=0.5))
    else:
        # Hiding all devices is the only way to completely bypass the GPU.
        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
        model_config['sess_config'] = tf.ConfigProto(
            intra_op_parallelism_threads=NUM_THREADS)

    model = word2vec(**model_config)
    with model.graph.as_default():
        # skip_gram_batch_content() is assumed to be provided elsewhere in
        # the project; it returns one (inputs, labels) batch.
        input_X_, input_y = skip_gram_batch_content()
        print("input_X_: ", input_X_)
        print("input_y: ", input_y)
        _, loss = model.sess.run(
            [model.train_op, model.loss],
            feed_dict={model.X: input_X_, model.y: input_y})
        print("the model loss: {}".format(loss))
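
# Standard entry-point guard. Which function to run is an assumption:
# swap in model_train(), predict_with_word2vec(), or run_single_batch()
# as needed.
if __name__ == '__main__':
    main()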