def get_embeddings(hparams):
    """Create the word-embedding variable, optionally seeded from GloVe.

    Args:
        hparams: hyperparameter object providing `glove_path`, `vocab_path`,
            `vocab_size` and `embedding_dim`.

    Returns:
        A TF variable named "word_embeddings" of shape
        [vocab_size, embedding_dim], initialized either from pre-trained
        GloVe vectors (when both paths are given) or uniformly at random.
    """
    have_pretrained = hparams.glove_path and hparams.vocab_path
    if have_pretrained:
        tf.logging.info("Loading Glove embeddings...")
        vocab_array, vocab_dict = helpers.load_vocab(hparams.vocab_path)
        glove_vectors, glove_dict = helpers.load_glove_vectors(
            hparams.glove_path, vocab=set(vocab_array))
        # Rows for in-vocab GloVe words are copied; the rest are random.
        initializer = helpers.build_initial_embedding_matrix(
            vocab_dict, glove_dict, glove_vectors, hparams.embedding_dim)
    else:
        tf.logging.info(
            "No glove/vocab path specificed, starting with random embeddings.")
        initializer = tf.random_uniform_initializer(-0.25, 0.25)
    return tf.get_variable(
        "word_embeddings",
        shape=[hparams.vocab_size, hparams.embedding_dim],
        initializer=initializer)
# ----------------------------------------------------------------------------
# Read training data
# ----------------------------------------------------------------------------
print("Loading data..")

train_texts = []
train_tags = []
train_labels = []

# Load the pre-tokenized training data (word indices, labels, meta tags)
# from the pickle produced by the preprocessing step.  Use a context
# manager so the file handle is always closed.
with open(TRAIN_DATA_FILE, 'rb') as word_index_pickle:
    pickling = pickle.load(word_index_pickle)
x = pickling['word_indices']
y = pickling['y']
tags = pickling['meta_tag']

# Vocabulary size the preprocessing step capped the word index at.
# NOTE(review): this is a plain int, so the original
# `len(word_index)` below raised TypeError; compare the size directly.
word_index = 60000

# NOTE(review): helpers.build_initial_embedding_matrix is called elsewhere
# as (vocab_dict, glove_dict, glove_vectors, dim) — confirm this 2-argument
# form matches the helper's actual signature.
embedding_matrix = helpers.build_initial_embedding_matrix(word_index,
                                                          EMBEDDING_DIM)

# ----------------------------------------------------------------------------
# Prepare embeddings
# ----------------------------------------------------------------------------
print('Preparing embedding matrix')
# nb_words = min(MAX_NB_WORDS, len(word_index)) + 1
nb_words = min(MAX_NB_WORDS, word_index)

# ----------------------------------------------------------------------------
# Sample train/validation data
# ----------------------------------------------------------------------------
# Fixed seed so the train/validation split is reproducible across runs.
np.random.seed(1234)
# ----------------------------------------------------------------------------
# Embedding configuration
# ----------------------------------------------------------------------------
# Only the GloVe Twitter vectors are wired up here.  To switch to the
# word2vec GoogleNews vectors ('GoogleNews-vectors-negative300.bin',
# 300 dims), load them with gensim's KeyedVectors and fill
# embedding_matrix row-by-row from word2vec.word_vec(word) instead.
EMBEDDING_FILE = 'glove.twitter.27B.200d.txt'
EMBEDDING_DIM = 200
EMBEDDING_FILE = DATA_PATH + EMBEDDING_FILE

# +1 leaves room for the reserved index 0 (padding).
nb_words = min(MAX_NB_WORDS, len(word_index)) + 1

# NOTE(review): set(word_index) assumes word_index is a mapping/sequence of
# words at this point — confirm it is not a plain int vocabulary size, as
# set() over an int raises TypeError.
glove_vectors, glove_dict = helpers.load_glove_vectors(EMBEDDING_FILE,
                                                       vocab=set(word_index))
# Build the initial matrix: GloVe rows for known words, random otherwise.
# (The dead `embedding_matrix = ''` placeholder that was immediately
# overwritten has been removed.)
embedding_matrix = helpers.build_initial_embedding_matrix(
    word_index, glove_dict, glove_vectors, EMBEDDING_DIM)

########################################
## sample train/validation data
########################################