Example #1
import os
import pkl  # project-local pickle helper; a sketch of it follows this example


def load_vocab():
    vocab_file = os.path.join(os.path.dirname(__file__), "vocab.pkl")
    pklData = pkl.read(vocab_file)  # list of tokens, e.g. ['我', '是']
    vocab = {}
    for idx, item in enumerate(pklData):
        vocab[item] = idx  # use item.strip() here if tokens carry stray whitespace
    return vocab
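The `pkl.read` helper used throughout these examples is a project-local module (it is imported explicitly in Example #3); its source is not shown. A minimal sketch of what it presumably contains, assuming it is a thin wrapper over the standard pickle module:

# pkl.py -- hypothetical sketch of the project-local helper assumed above.
import pickle


def read(path):
    # Load and return whatever object is pickled at `path`
    # (a token list for vocab.pkl, a list of vectors for embeddings.pkl).
    with open(path, 'rb') as f:
        return pickle.load(f)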
Example #2
import pkl  # project-local pickle helper (sketched after Example #1)


def load_vocab(filename):
    pklData = pkl.read(filename)  # list of tokens, e.g. ['我', '是']
    vocab = {}
    for idx, item in enumerate(pklData):
        vocab[item.strip()] = idx
    return vocab  # dict mapping each token to its index
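A quick usage sketch of `load_vocab` (the file name and tokens are illustrative):

# Hypothetical usage of load_vocab() from Example #2.
vocab = load_vocab('vocab.pkl')

# If vocab.pkl holds the list ['我', '是'], each token maps to its position:
assert vocab['我'] == 0
assert vocab['是'] == 1

# dict.get() gives a safe lookup for out-of-vocabulary tokens:
idx = vocab.get('狗', 2)  # 2 == UNK_TOKEN in Example #3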
Example #3
import numpy as np
import tensorflow as tf
import config
import pkl

GO_TOKEN = 0
END_TOKEN = 1
UNK_TOKEN = 2

input_max_length, output_max_length = config.get_max()
batch_size = config.get_size()
data_list = pkl.read('./corpus.pkl')


def input_fn():
    # Create the input/output placeholders, then expose slices of them so
    # their values can be printed/fetched by name (e.g. by a logging hook).
    inp = tf.placeholder(tf.int64, shape=[None, None], name='input')
    output = tf.placeholder(tf.int64, shape=[None, None], name='output')  # the target (label) sequence
    tf.identity(inp[0], 'input_0')  # registers an op named 'input_0' in the graph so it can be fetched by name later
    tf.identity(output[0], 'output_0')
    return {'input': inp, 'output': output}, None  # the (features, labels) pair an Estimator input_fn returns; labels is None because the targets travel inside the features dict

def get_feed_fn(vocab):
    input_filename = 'input'
    output_filename = 'output'
    # How long should each training sample be?
    #   - Longer samples may be harder to predict.
    #   - Shorter ones may miss semantic relations that span sentences.
    # Should an article be split up word by word?
    #   - Sliding a window along two words at a time makes every position a
    #     new sample, so one sentence can yield ~10 samples (text-generator style).
    #   - Or should each article be used only once?
    # (a fuller sketch of the feed function follows this example)
    def str2idx(string):
        # string = string.split(' ')  # only needed for the demo input; comment out later
        # Map each token to its vocab index, falling back to UNK for OOV tokens.
        return [vocab.get(token, UNK_TOKEN) for token in string]
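The example cuts off inside `get_feed_fn`. Below is a minimal sketch of how the remaining pieces plausibly fit together under the TF 1.x Estimator pattern this code follows: a `feed_fn` that slides a window over `data_list`, pads each side to the configured lengths, and feeds the placeholders from `input_fn` by tensor name through `tf.train.FeedFnHook`. The windowing scheme, the `pad` helper, and all names below are assumptions for illustration, not the author's code:

import random


def get_feed_fn_sketch(vocab):
    # Hypothetical continuation of get_feed_fn(); relies on the globals
    # defined above (data_list, batch_size, the length limits, END/UNK tokens).
    def str2idx(string):
        return [vocab.get(token, UNK_TOKEN) for token in string]

    def pad(ids, max_len):
        # Trim to max_len, terminate with END_TOKEN, right-pad to max_len.
        ids = ids[:max_len - 1] + [END_TOKEN]
        return ids + [END_TOKEN] * (max_len - len(ids))

    def feed_fn():
        inputs, outputs = [], []
        for _ in range(batch_size):
            text = random.choice(data_list)
            # Sliding window: the first part is the input, the continuation
            # is the sequence the model must predict.
            span = input_max_length + output_max_length
            start = random.randrange(max(1, len(text) - span))
            src = text[start:start + input_max_length]
            tgt = text[start + input_max_length:start + span]
            inputs.append(pad(str2idx(src), input_max_length))
            outputs.append(pad(str2idx(tgt), output_max_length))
        # Placeholders created in input_fn() are fed by tensor name.
        return {'input:0': inputs, 'output:0': outputs}

    return feed_fn


# The feed function plugs into training through a session hook:
#   estimator.train(input_fn=input_fn,
#                   hooks=[tf.train.FeedFnHook(get_feed_fn_sketch(vocab))])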
Example #4
import os
import pkl  # project-local pickle helper (sketched after Example #1)
import numpy as np


def load_vocab():
    vocab_file = os.path.join(os.getcwd(), "vocab.pkl")
    pklData = pkl.read(vocab_file)  # list of tokens, e.g. ['我', '是']
    vocab = {}
    for idx, item in enumerate(pklData):
        vocab[item] = idx  # use item.strip() here if tokens carry stray whitespace

    return vocab


vocab = load_vocab()
embedding_file = os.path.join(os.getcwd(), "embeddings.pkl")
trainedEmbeddings = pkl.read(embedding_file)  # length 4549: 3 entries added by hand, 3 reserved
# print(trainedEmbeddings[100])


def str2embed(string):
    embeddingList = []
    for word in string:
        # vocab.get(word) returns None for OOV words, which would break the
        # lookup below; fall back to index 2 (UNK_TOKEN in Example #3).
        embedding = trainedEmbeddings[vocab.get(word, 2)]
        embedding = np.array(embedding)
        embeddingList.append(embedding)
    return np.array(embeddingList)


# str2embed('string')[1]

# def getSize():
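A quick usage sketch of `str2embed` (the sentence and the embedding width are illustrative; the real width depends on how embeddings.pkl was trained):

# Hypothetical usage: embed a two-character sentence.
embedded = str2embed('我是')

# One row per input token, one column per embedding dimension, so the shape
# is (len(sentence), embedding_dim), e.g. (2, 300) for 300-d vectors.
print(embedded.shape)
print(embedded[1])  # the vector for '是'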