Пример #1
0
    def __init__(self, is_training, config):
        self.batch_size = batch_size = config.batch_size  # batch_size
        self.num_steps = num_steps = config.num_steps  #
        size = config.hidden_size  # 隐藏层
        vocab_size = config.vocab_size  # 词表size
        # 输入占位符
        self._input_data = tf.placeholder(tf.int32, [batch_size, num_steps])
        self._targets = tf.placeholder(tf.int32, [batch_size, num_steps])

        lstm_cell = rnn_cell.BasicLSTMCell(size, forget_bias=0.0)
        if is_training and config.keep_prob < 1:
            lstm_cell = rnn_cell.DropoutWrapper(
                lstm_cell, output_keep_prob=config.keep_prob)
        cell = rnn_cell.MultiRNNCell([lstm_cell] * config.num_layers)

        self._initial_state = cell.zero_state(batch_size, tf.float32)

        with tf.device("/cpu:0"):
            embedding = tf.get_variable("embedding", [vocab_size, size])
            inputs = tf.nn.embedding_lookup(embedding, self._input_data)

        if is_training and config.keep_prob < 1:
            inputs = tf.nn.dropout(inputs, config.keep_prob)

        outputs = []
        states = []
        state = self._initial_state
        with tf.variable_scope("RNN"):
            for time_step in range(num_steps):
                if time_step > 0:
                    tf.get_variable_scope().reuse_variables()
                (cell_output, state) = cell(inputs[:, time_step, :], state)
                outputs.append(cell_output)
                states.append(state)

        output = tf.reshape(tf.concat(outputs, 1), [-1, size])
        softmax_w = tf.get_variable("softmax_w", [size, vocab_size])
        softmax_b = tf.get_variable("softmax_b", [vocab_size])
        logits = tf.matmul(output, softmax_w) + softmax_b
        loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example(
            [logits], [tf.reshape(self._targets, [-1])],
            [tf.ones([batch_size * num_steps])], vocab_size)
        self._cost = cost = tf.reduce_sum(loss) / batch_size
        self._final_state = states[-1]

        if not is_training:
            return
        self._lr = tf.Variable(0.0, trainable=False)
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
                                          config.max_grad_norm)
        optimizer = tf.train.GradientDescentOptimizer(self.lr)
        self._train_op = optimizer.apply_gradients(zip(grads, tvars))
Пример #2
0

PS_OPS = ['Variable', 'VariableV2', 'AutoReloadVariable']


def assign_to_device(device, ps_device='/cpu:0'):
    def _assign(op):
        node_def = op if isinstance(op, tf.NodeDef) else op.node_def
        if node_def.op in PS_OPS:
            return "/" + ps_device
        else:
            return device


# 所有的Graph的操作默认放在CPU上
with tf.device('/cpu:0'):
    tower_grads = []
    reuse_vars = False
    # tf Graph input
    X = tf.placeholder(tf.float32, [None, num_input])
    Y = tf.placeholder(tf.float32, [None, num_classes])

    # 遍历所有GPU并构建自己的计算图
    for i in range(num_gpus):
        with tf.device(
                assign_to_device('/gpu:{}'.format(i), ps_device='/cpu:0')):
            # Split data between GPUs
            _x = X[i * batch_size:(i + 1) * batch_size]
            _y = Y[i * batch_size:(i + 1) * batch_size]
            # 由于dropout在训练和预测阶段是不同的行为,所以需要创建两个独立的图来共享相同参数
            logits_train = conv_net(_x,
A = np.random.rand(10000, 10000).astype('float32')
B = np.random.rand(10000, 10000).astype('float32')
# 创建图存储结果
c1 = []
c2 = []


def matpow(M, n):
    if n < 1:
        return M
    else:
        return tf.matmul(M, matpow(M, n - 1))


"""单GPU情况"""
with tf.device('/gpu:0'):
    a = tf.placeholder(tf.float32, [10000, 10000])
    b = tf.placeholder(tf.float32, [10000, 10000])

    c1.append(matpow(a, n))
    c2.append(matpow(b, n))
with tf.device("/cpu:0"):
    sum = tf.add_n(c1)

t1_1 = datetime.datetime.now()
with tf.Session(config=tf.ConfigProto(
        log_device_placement=log_device_placement)) as sess:
    sess.run(sum, {a: A, b: B})
t2_1 = datetime.datetime.now()
"""多GPU情况"""
with tf.device('/gpu:0'):
Пример #4
0
            labels[i * num_skips + j, 0] = buffer[context_words]
        if data_index == len(data):
            buffer.extend(data[0:span])
            data_index = span
        else:
            buffer.append(data[data_index])
            data_index += 1
    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels


# Input data
X = tf.placeholder(tf.int32, shape=[None])
Y = tf.placeholder(tf.int32, shape=[None, 1])

with tf.device("/cpu:0"):
    embedding = tf.Variable(tf.random_normal([vocab_size, embedding_size]))
    X_embed = tf.nn.embedding_lookup(embedding, X)

    # 为NCE loss 构造变量
    nce_weights = tf.Variable(tf.random_normal([vocab_size, embedding_size]))
    nce_biases = tf.Variable(tf.zeros([vocab_size]))

# 为每个batch计算平均的nce loss
loss_op = tf.reduce_mean(
    tf.nn.nce_loss(weights=nce_weights,
                   biases=nce_biases,
                   labels=Y,
                   inputs=X_embed,
                   num_sampled=num_sampled,
                   num_classes=vocab_size))
Пример #5
0
def word2vec_basic(log_dir):
    # 创建tensorboard的可视化目录
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    # 第一步,下载数据
    url = 'http://mattmahoney.net/dc/'

    def maybe_download(filename, expected_bytes, sha256=None):
        local_filename = os.path.join(gettempdir(), filename)
        if not os.path.exists(local_filename):
            local_filename, _ = urllib.request.urlretrieve(
                url + filename, local_filename)
        statinfo = os.stat(local_filename)

        if sha256 and _hash_file(local_filename) != sha256:
            raise Exception('Failed to verify ' + local_filename +
                            ' due to hash '
                            'mismatch. Can you get to it with a browser?')

        if statinfo.st_size == expected_bytes:
            print("found and verified", filename)
        else:
            print(statinfo.st_size)
            raise Exception('Failed to verify ' + local_filename +
                            '. Can you get to it with a browser?')
        return local_filename

    filename = maybe_download(
        'text8.zip',
        31344016,
        sha256=
        'a6640522afe85d1963ad56c05b0ede0a0c000dddc9671758a6cc09b7a38e5232')

    # 数据转为List<String>
    def read_data(filename):
        with zipfile.ZipFile(filename) as f:
            data = tf.compat.as_str(f.read(f.namelist()[0])).split()
        return data

    vocabulary = read_data(filename)
    print('data_size', len(vocabulary))

    # 第二步,建词典并且把罕见词替换成UNK
    vocabulary_size = 50000

    def build_dataset(words, n_words):

        count = [['UNK', -1]]
        count.extend(collections.Counter(words).most_common(n_words - 1))
        dictionary = {word: index for index, (word, _) in enumerate(count)}
        data = []
        unk_count = 0
        for word in words:
            index = dictionary.get(word, 0)
            if index == 0:  # dictionary['UNK']
                unk_count += 1
            data.append(index)
        count[0][1] = unk_count
        reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
        return data, count, dictionary, reversed_dictionary

    # data: 词表中的所有的词的id
    # count: 单词和出现次数的map
    # dictionary: 单词-->index 的映射
    # reverse_dictionary:index -->单词
    data, count, dictionary, reversed_dictionary = build_dataset(
        vocabulary, vocabulary_size)
    del vocabulary
    print('Most common words (+UNK)', count[:5])
    print('Sample data', data[:10],
          [reversed_dictionary[i] for i in data[:10]])

    # 针对skip-gram模型生成batch数据
    def generate_batch(batch_size, num_skips, skip_window):
        global data_index
        assert batch_size % num_skips == 0
        assert num_skips <= 2 * skip_window
        batch = np.ndarray(shape=(batch_size), dtype=np.int32)
        labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
        # skip的范围
        span = 2 * skip_window + 1
        buffer = collections.deque(maxlen=span)
        if data_index + span > len(data):
            data_index = 0
        buffer.extend(data[data_index:data_index + span])  # 向后取一个窗口内的结果
        data_index += span
        for i in range(batch_size // num_skips):
            context_words = [w for w in range(span) if w != skip_window]
            words_to_use = random.sample(context_words, num_skips)
            for j, context_words in enumerate(words_to_use):
                batch[i * num_skips + j] = buffer[skip_window]
                labels[i * num_skips + j, 0] = buffer[context_words]
            if data_index == len(data):
                buffer.extend(data[0:span])
                data_index = span
            else:
                buffer.append(data[data_index])
                data_index += 1
        # Backtrack a little bit to avoid skipping words in the end of a batch
        data_index = (data_index - span) % len(data)
        return batch, labels

    batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
    for i in range(8):
        print(batch[i], reversed_dictionary[batch[i]], '->', labels[i, 0],
              reversed_dictionary[labels[i, 0]])

    # 建立并且训练模型

    batch_size = 128
    embedding_size = 128  # 词向量维度
    skip_window = 1  # 考虑左右几个单词
    num_skips = 2  # 复用输入生成标签的次数
    num_sampled = 64  # 负样本数量

    # 采样一个样本的近邻作为随机验证机,将验证集样本限制为 较低id的单词,是比较高频的构造词汇
    # 这三个变量用作显示模型准确率,不影响计算。
    valid_size = 16  # 用于评估相似性的随机单词集合
    valid_window = 100  #
    valid_examples = np.random.choice(valid_window, valid_size, replace=False)

    graph = tf.Graph()

    with graph.as_default():

        # 输入数据
        with tf.name_scope('input'):
            train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
            train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
            valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

        # 操作op和变量variables 固定在CPU上。
        with tf.device('/cpu:0'):
            with tf.name_scope('embeddings'):
                embeddings = tf.Variable(
                    tf.random_uniform([vocabulary_size, embedding_size], -1.0,
                                      1.0))
                embed = tf.nn.embedding_lookup(embeddings, train_inputs)

            # 构造NCE损失的变量
            with tf.name_scope('weights'):
                nce_weights = tf.Variable(
                    tf.truncated_normal([vocabulary_size, embedding_size],
                                        stddev=1.0 /
                                        math.sqrt(embedding_size)))

            with tf.name_scope('biases'):
                nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

        # 计算该批次的平均nce损失,当评估损失的时候,自动绘制一个新的负样本。
        with tf.name_scope('loss'):
            loss = tf.reduce_mean(
                tf.nn.nce_loss(weights=nce_weights,
                               biases=nce_biases,
                               labels=train_labels,
                               inputs=embed,
                               num_sampled=num_sampled,
                               num_classes=vocabulary_size))
        # 汇总损失
        tf.summary.scalar('loss', loss)

        # 构造SGD
        with tf.name_scope('opytimizer'):
            optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
        # 计算小批次样本和所有样本之间的余弦相似度
        norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
        normalized_embeddings = embeddings / norm
        valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings,
                                                  valid_dataset)
        similarity = tf.matmul(valid_embeddings,
                               normalized_embeddings,
                               transpose_b=True)

        # merge all summary
        merged = tf.summary.merge_all()

        init = tf.global_variables_initializer()

        saver = tf.train.Saver()

    # 开始训练
    num_steps = 1000001

    with tf.compat.v1.Session(graph=graph) as session:
        # 写入摘要
        writer = tf.summary.FileWriter(log_dir, session.graph)

        init.run()
        print('inited..')
        average_loss = 0
        for step in range(num_steps):
            batch_inputs, batch_labels = generate_batch(
                batch_size, num_skips, skip_window)
            feed_dict = {
                train_inputs: batch_inputs,
                train_labels: batch_labels
            }
            # 定义元变量
            run_metadata = tf.RunMetadata()

            _, summary, loss_val = session.run([optimizer, merged, loss],
                                               feed_dict=feed_dict,
                                               run_metadata=run_metadata)
            average_loss += loss_val

            writer.add_summary(summary, step)

            if step == (num_steps - 1):
                writer.add_run_metadata(run_metadata, 'step%d' % step)

            if step % 2000 == 0:
                if step > 0:
                    average_loss /= 2000
                    # 平均损失是对最近的2000个批次样本的估计。
                print('Average loss at step ', step, ': ', average_loss)
                average_loss = 0

            if step % 10000 == 0:
                sim = similarity.eval()
                for i in range(valid_size):
                    valid_word = reversed_dictionary[valid_examples[i]]
                    top_k = 8
                    nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                    log_str = 'Nearest to %s:' % valid_word

                    print(
                        log_str, ', '.join([
                            reversed_dictionary[nearest[k]]
                            for k in range(top_k)
                        ]))
        final_embeddings = normalized_embeddings.eval()

        # 写下embedding的相应标签
        with open(log_dir + '/metadata.tsv', 'w') as f:
            for i in range(vocabulary_size):
                f.write(reversed_dictionary[i] + '\n')

        # 保存checkpoint
        saver.save(session, os.path.join(log_dir, 'model.ckpt'))

        # 配置Tensorboard
        config = projector.ProjectorConfig()
        embedding_conf = config.embeddings.add()
        embedding_conf.tensor_name = embeddings.name
        embedding_conf.metadata_path = os.path.join(log_dir, 'metadata.tsv')
        projector.visualize_embeddings(writer, config)
    writer.close()

    # Step 6: Visualize the embeddings.

    # pylint: disable=missing-docstring
    # Function to draw visualization of distance between embeddings.
    def plot_with_labels(low_dim_embs, labels, filename):
        assert low_dim_embs.shape[0] >= len(
            labels), 'More labels than embeddings'
        plt.figure(figsize=(18, 18))  # in inches
        for i, label in enumerate(labels):
            x, y = low_dim_embs[i, :]
            plt.scatter(x, y)
            plt.annotate(label,
                         xy=(x, y),
                         xytext=(5, 2),
                         textcoords='offset points',
                         ha='right',
                         va='bottom')

        plt.savefig(filename)

    try:
        # pylint: disable=g-import-not-at-top
        from sklearn.manifold import TSNE
        import matplotlib.pyplot as plt

        tsne = TSNE(perplexity=30,
                    n_components=2,
                    init='pca',
                    n_iter=5000,
                    method='exact')
        plot_only = 500
        low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
        labels = [reversed_dictionary[i] for i in xrange(plot_only)]
        plot_with_labels(low_dim_embs, labels,
                         os.path.join(gettempdir(), 'tsne.png'))

    except ImportError as ex:
        print(
            'Please install sklearn, matplotlib, and scipy to show embeddings.'
        )
        print(ex)
Пример #6
0
def _variable_on_cpu(name, shape, initializer):
    # 保存参数到cpu
    with tf.device('/cpu:0'):
        var = tf.get_variable(name, shape, initializer=initializer)
    return var