import os

import pkl


def load_vocab():
    vocab_file = os.path.join(os.path.dirname(__file__), "vocab.pkl")
    pklData = pkl.read(vocab_file)  # a list of tokens, e.g. ['我', '是']
    vocab = {}
    for idx, item in enumerate(pklData):
        vocab[item] = idx  # optionally item.strip(), as in the variant below
    return vocab
import pkl


def load_vocab(filename):
    pklData = pkl.read(filename)  # a list of tokens, e.g. ['我', '是']
    vocab = {}
    for idx, item in enumerate(pklData):
        vocab[item.strip()] = idx
    return vocab  # a dict mapping each token to its index
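# A minimal usage sketch, assuming pkl.read deserializes vocab.pkl into a
# list of tokens; the file name and sample tokens here are illustrative.
if __name__ == '__main__':
    vocab = load_vocab('vocab.pkl')
    print(vocab.get('我'))     # index of a known token, e.g. 0
    print(vocab.get('缺', 2))  # unknown token: fall back to a reserved UNK index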
import numpy as np
import tensorflow as tf

import config
import pkl

GO_TOKEN = 0
END_TOKEN = 1
UNK_TOKEN = 2

input_max_length, output_max_length = config.get_max()
batch_size = config.get_size()
data_list = pkl.read('./corpus.pkl')


def input_fn():
    # Create the placeholders; slices of them get logged/printed later.
    inp = tf.placeholder(tf.int64, shape=[None, None], name='input')
    # 'output' holds the target sequences (the labels).
    output = tf.placeholder(tf.int64, shape=[None, None], name='output')
    # tf.identity registers ops named 'input_0'/'output_0' in the graph, so
    # the first row of each batch can later be fetched by name.
    tf.identity(inp[0], 'input_0')
    tf.identity(output[0], 'output_0')
    # With tf.estimator, input_fn returns (features, labels); the targets
    # already travel inside features under 'output', hence labels is None.
    return {'input': inp, 'output': output}, None


def get_feed_fn(vocab):
    input_filename = 'input'
    output_filename = 'output'
    # Open questions on sampling: are longer or shorter samples better per
    # step? Longer ones may be harder to predict; shorter ones may miss the
    # semantics between sentences. Should an article be split token by token,
    # sliding by two tokens so one sentence yields up to ~10 samples
    # (text-generator style), or should each article be used only once?

    def str2idx(string):
        # string = string.split(' ')  # to fit the demo; comment out later
        # Map each token to its index, falling back to UNK_TOKEN for
        # out-of-vocabulary tokens.
        return [vocab.get(token, UNK_TOKEN) for token in string]
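# A minimal sketch (TensorFlow 1.x graph mode; the helper name is made up)
# of what the tf.identity calls in input_fn buy us: the first row of a batch
# can be fetched from the graph by name, without holding a Python reference.
def _demo_fetch_by_name():
    features, _ = input_fn()
    first_input = tf.get_default_graph().get_tensor_by_name('input_0:0')
    with tf.Session() as sess:
        row = sess.run(first_input, feed_dict={
            features['input']: np.array([[GO_TOKEN, 5, 7, END_TOKEN]],
                                        dtype=np.int64),
        })
        print(row)  # -> [0 5 7 1]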
import os

import numpy as np

import pkl


def load_vocab():
    vocab_file = os.path.join(os.getcwd(), "vocab.pkl")
    pklData = pkl.read(vocab_file)  # a list of tokens, e.g. ['我', '是']
    vocab = {}
    for idx, item in enumerate(pklData):
        vocab[item] = idx  # optionally item.strip()
    return vocab


vocab = load_vocab()
embedding_file = os.path.join(os.getcwd(), "embeddings.pkl")
# 4549 vectors in total: 3 added by hand, 3 reserved.
trainedEmbeddings = pkl.read(embedding_file)
# print(trainedEmbeddings[100])


def str2embed(string):
    embeddingList = []
    for word in string:
        # Fall back to index 2 (UNK_TOKEN in the training script above) so
        # that vocab.get never returns None for out-of-vocabulary characters.
        embedding = trainedEmbeddings[vocab.get(word, 2)]
        embeddingList.append(np.array(embedding))
    return np.array(embeddingList)

# str2embed('string')[1]
# def getSize():
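# A minimal usage sketch; the sample sentence is illustrative and the
# embedding width depends on the pretrained vectors in embeddings.pkl.
if __name__ == '__main__':
    matrix = str2embed('我是')
    print(matrix.shape)  # (2, dim): one embedding row per character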