Example #1
    def __init__(self, embed_matrix, dictionary: dict, **kwargs):
        """
        Parameters
        ----------
        embed_matrix: numpy array
        dictionary: dictionary
        """

        self._embed_matrix = embed_matrix
        self._dictionary = dictionary
        self._reverse_dictionary = {v: k for k, v in dictionary.items()}
        self.words = list(dictionary.keys())
        self._jarowinkler = JaroWinkler()
        device = get_device(**kwargs)
        _graph = tf.Graph()
        with _graph.as_default():
            with tf.device(device):
                self._embedding = tf.compat.v1.placeholder(
                    tf.float32, self._embed_matrix.shape)
                self._x = tf.compat.v1.placeholder(
                    tf.float32, [None, self._embed_matrix.shape[1]])
                # dot product of L2-normalised rows == pairwise cosine similarity
                normed_embedding = tf.nn.l2_normalize(self._embedding, axis=1)
                normed_array = tf.nn.l2_normalize(self._x, axis=1)
                self._cosine_similarity = tf.matmul(
                    normed_array, tf.transpose(normed_embedding, [1, 0]))
                self._sess = generate_session(_graph, **kwargs)
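A minimal NumPy sketch (toy, hypothetical data) of what the graph above computes: L2-normalising both matrices and multiplying them gives the pairwise cosine similarities between each query vector and every row of the embedding matrix.

import numpy as np

# hypothetical toy data: 4 word vectors of dimension 3, and 2 query vectors
embed_matrix = np.random.rand(4, 3).astype(np.float32)
queries = np.random.rand(2, 3).astype(np.float32)

# L2-normalise rows, then a matrix product yields cosine similarities
normed_embedding = embed_matrix / np.linalg.norm(embed_matrix, axis=1, keepdims=True)
normed_queries = queries / np.linalg.norm(queries, axis=1, keepdims=True)
cosine_similarity = normed_queries @ normed_embedding.T  # shape (2, 4)

# each row can be argsorted (descending) to find the nearest words
print(cosine_similarity.argsort(axis=1)[:, ::-1])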
Example #2
    def __init__(self,
                 xlnet_config,
                 tokenizer,
                 checkpoint,
                 pool_mode='last',
                 **kwargs):

        kwargs_config = dict(
            is_training=True,
            use_tpu=False,
            use_bfloat16=False,
            dropout=0.0,
            dropatt=0.0,
            init='normal',
            init_range=0.1,
            init_std=0.05,
            clamp_len=-1,
        )

        xlnet_parameters = xlnet_lib.RunConfig(**kwargs_config)

        self._tokenizer = tokenizer
        device = get_device(**kwargs)
        _graph = tf.Graph()
        with _graph.as_default():
            with tf.device(device):
                self.X = tf.placeholder(tf.int32, [None, None])
                self.segment_ids = tf.placeholder(tf.int32, [None, None])
                self.input_masks = tf.placeholder(tf.float32, [None, None])

                # XLNet expects time-major inputs, hence the [1, 0] transposes
                xlnet_model = xlnet_lib.XLNetModel(
                    xlnet_config=xlnet_config,
                    run_config=xlnet_parameters,
                    input_ids=tf.transpose(self.X, [1, 0]),
                    seg_ids=tf.transpose(self.segment_ids, [1, 0]),
                    input_mask=tf.transpose(self.input_masks, [1, 0]),
                )

                self.logits = xlnet_model.get_pooled_out(pool_mode, True)
                self._sess = generate_session(_graph, **kwargs)
                self._sess.run(tf.global_variables_initializer())
                tvars = tf.trainable_variables()
                assignment_map, _ = get_assignment_map_from_checkpoint(
                    tvars, checkpoint)
                self._saver = tf.train.Saver(var_list=assignment_map)
                # collect every relative-attention softmax node so the attention
                # weights can be fetched from the graph later
                attentions = [
                    n.name for n in tf.get_default_graph().as_graph_def().node
                    if 'rel_attn/Softmax' in n.name
                ]
                g = tf.get_default_graph()
                self.attention_nodes = [
                    g.get_tensor_by_name('%s:0' % (a)) for a in attentions
                ]
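The attention-node lookup at the end of this example is a generic TF1 graph pattern: list matching op names in the graph definition, then fetch the corresponding tensors by name. A minimal sketch under the assumption of TF1.x compat mode (toy graph; the 'rel_attn' scope name only stands in for the relative-attention layers filtered above):

import tensorflow as tf

tf.compat.v1.disable_eager_execution()

g = tf.Graph()
with g.as_default():
    x = tf.compat.v1.placeholder(tf.float32, [None, 4])
    # a named scope standing in for the model's 'rel_attn' layers
    with tf.compat.v1.name_scope('rel_attn'):
        probs = tf.nn.softmax(x)  # op name: 'rel_attn/Softmax'

    names = [n.name for n in g.as_graph_def().node if 'rel_attn/Softmax' in n.name]
    tensors = [g.get_tensor_by_name('%s:0' % n) for n in names]

with tf.compat.v1.Session(graph=g) as sess:
    print(sess.run(tensors, feed_dict={x: [[1.0, 2.0, 3.0, 4.0]]}))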
Example #3
File: pagerank.py  Project: lantip/Malaya
def pagerank(array, fast_pagerank=True, retry=5, **kwargs):
    device = get_device(**kwargs)
    cpu = False
    fail = True
    if 'GPU' in device:
        try:
            # cugraph availability is only checked here; this snippet still
            # falls through to the CPU implementations below
            import cugraph

            cpu = True
        except Exception as e:
            msg = (
                'cugraph not installed. Please install it from https://github.com/rapidsai/cugraph. \n'
                'Will calculate pagerank using networkx CPU version.')
            logging.warning(msg)
            cpu = True

    else:
        cpu = True

    if cpu:
        if fast_pagerank:
            from scipy import sparse
            from malaya.graph import fast_pagerank

            G = sparse.csr_matrix(array)
            r = fast_pagerank.pagerank(G)
            scores = {i: r[i] for i in range(len(r))}
            fail = False

        else:
            nx_graph = nx.from_numpy_array(array)
            for _ in range(retry):
                try:
                    scores = nx.pagerank(nx_graph, max_iter=10000)
                    fail = False
                    break
                except Exception as e:
                    logging.warning(e)

    if fail:
        raise Exception(
            'pagerank not able to converge, a rerun may be able to solve it.')

    return scores
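A toy usage sketch of the networkx fallback branch above (hypothetical 4-node adjacency matrix):

import numpy as np
import networkx as nx

# symmetric weight matrix standing in for `array`
array = np.array([
    [0.0, 1.0, 1.0, 0.0],
    [1.0, 0.0, 0.0, 1.0],
    [1.0, 0.0, 0.0, 1.0],
    [0.0, 1.0, 1.0, 0.0],
])

nx_graph = nx.from_numpy_array(array)
scores = nx.pagerank(nx_graph, max_iter=10000)
print(scores)  # {node index: pagerank score}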
Example #4
    def __init__(self, hparams, encoder, generate_length, temperature, top_k,
                 **kwargs):
        self._encoder = encoder
        device = get_device(**kwargs)
        self._graph = tf.Graph()
        with self._graph.as_default():
            with tf.device(device):
                self._X = tf.placeholder(tf.int32, [1, None])
                self._model = sample_sequence(
                    hparams=hparams,
                    length=generate_length,
                    context=self._X,
                    batch_size=1,
                    temperature=temperature,
                    top_k=top_k,
                )
                self._sess = generate_session(self._graph, **kwargs)
                self._sess.run(tf.global_variables_initializer())
                self._saver = tf.train.Saver(tf.trainable_variables())
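sample_sequence and hparams are project helpers that are not shown here; the sketch below only illustrates, in plain NumPy, what the temperature and top_k arguments do to a single sampling step (hypothetical helper name and toy logits):

import numpy as np

def sample_next_token(logits, temperature=1.0, top_k=0, rng=np.random.default_rng()):
    # scale logits by temperature (lower temperature -> sharper distribution)
    logits = np.asarray(logits, dtype=np.float64) / temperature
    if top_k > 0:
        # mask everything outside the k highest logits
        cutoff = np.sort(logits)[-top_k]
        logits = np.where(logits < cutoff, -np.inf, logits)
    probs = np.exp(logits - logits.max())
    probs /= probs.sum()
    return rng.choice(len(probs), p=probs)

print(sample_next_token([2.0, 1.0, 0.5, -1.0], temperature=0.8, top_k=2))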
Example #5
    def __init__(self, bert_config, tokenizer, **kwargs):

        device = get_device(**kwargs)
        _graph = tf.Graph()

        with _graph.as_default():
            with tf.device(device):
                self.X = tf.placeholder(tf.int32, [None, None])
                self.segment_ids = tf.placeholder(tf.int32, [None, None])
                self.top_p = tf.placeholder(tf.float32, None)
                self.top_k = tf.placeholder(tf.int32, None)
                self.k = tf.placeholder(tf.int32, None)
                self.temperature = tf.placeholder(tf.float32, None)
                self.indices = tf.placeholder(tf.int32, [None, None])
                self.MASK = tf.placeholder(tf.int32, [None, None])
                self._tokenizer = tokenizer

                self.model = modeling.AlbertModel(
                    config=bert_config,
                    is_training=False,
                    input_ids=self.X,
                    input_mask=self.MASK,
                    use_one_hot_embeddings=False,
                )
                self.logits = self.model.get_pooled_output()
                input_tensor = self.model.get_sequence_output()
                output_weights = self.model.get_embedding_table()

                with tf.variable_scope('cls/predictions'):
                    with tf.variable_scope('transform'):
                        input_tensor = tf.layers.dense(
                            input_tensor,
                            units=bert_config.embedding_size,
                            activation=modeling.get_activation(
                                bert_config.hidden_act),
                            kernel_initializer=modeling.create_initializer(
                                bert_config.initializer_range),
                        )
                        input_tensor = modeling.layer_norm(input_tensor)

                    output_bias = tf.get_variable(
                        'output_bias',
                        shape=[bert_config.vocab_size],
                        initializer=tf.zeros_initializer(),
                    )
                    logits = tf.matmul(input_tensor,
                                       output_weights,
                                       transpose_b=True)
                    self._logits = tf.nn.bias_add(logits, output_bias)
                    self._log_softmax = tf.nn.log_softmax(self._logits,
                                                          axis=-1)

                logits = tf.gather_nd(self._logits, self.indices)
                logits = logits / self.temperature

                def nucleus():
                    return top_p_logits(logits, self.top_p)

                def select_k():
                    return top_k_logits(logits, self.top_k)

                # use nucleus (top-p) sampling when top_p > 0, otherwise top-k
                logits = tf.cond(self.top_p > 0, nucleus, select_k)
                self.samples = tf.multinomial(logits,
                                              num_samples=self.k,
                                              output_dtype=tf.int32)
                self._sess = generate_session(_graph, **kwargs)
                self._sess.run(tf.global_variables_initializer())
                var_lists = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                              scope='bert')
                var_lists.extend(
                    tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      scope='cls'))
                self._saver = tf.train.Saver(var_list=var_lists)
                attns = _extract_attention_weights(
                    bert_config.num_hidden_layers, tf.get_default_graph())
                self.attns = attns
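top_p_logits and top_k_logits are project helpers whose bodies are not shown here; the sketch below is a generic NumPy version of the nucleus (top-p) filtering that the tf.cond above switches to when top_p > 0 (illustrative only, not the project's implementation):

import numpy as np

def top_p_filter(logits, top_p):
    # keep the smallest set of tokens whose probability mass reaches top_p,
    # mask the rest with -inf
    logits = np.asarray(logits, dtype=np.float64)
    probs = np.exp(logits - logits.max())
    probs /= probs.sum()
    order = np.argsort(probs)[::-1]
    cumulative = np.cumsum(probs[order])
    keep = order[:np.searchsorted(cumulative, top_p) + 1]
    filtered = np.full_like(logits, -np.inf)
    filtered[keep] = logits[keep]
    return filtered

print(top_p_filter([3.0, 1.0, 0.2, -1.0], top_p=0.9))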
Example #6
    def __init__(self,
                 num_unique_documents,
                 vocab_size,
                 num_topics,
                 freqs,
                 embedding_size=128,
                 num_sampled=40,
                 learning_rate=1e-3,
                 lmbda=150.0,
                 alpha=None,
                 power=0.75,
                 batch_size=32,
                 clip_gradients=5.0,
                 **kwargs):
        device = get_device(**kwargs)
        _graph = tf.Graph()

        with _graph.as_default():
            with tf.device(device):
                moving_avgs = tf.train.ExponentialMovingAverage(0.9)
                self.batch_size = batch_size
                self.freqs = freqs

                self.X = tf.placeholder(tf.int32, shape=[None])
                self.Y = tf.placeholder(tf.int64, shape=[None])
                self.DOC = tf.placeholder(tf.int32, shape=[None])
                self.switch_loss = tf.Variable(0, trainable=False)
                train_labels = tf.reshape(self.Y, [-1, 1])
                # negative-sampling distribution follows the word frequencies
                # raised to `power`
                sampler = tf.nn.fixed_unigram_candidate_sampler(
                    train_labels,
                    num_true=1,
                    num_sampled=num_sampled,
                    unique=True,
                    range_max=vocab_size,
                    distortion=power,
                    unigrams=self.freqs,
                )

                self.word_embedding = tf.Variable(
                    tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0))
                self.nce_weights = tf.Variable(
                    tf.truncated_normal(
                        [vocab_size, embedding_size],
                        stddev=tf.sqrt(1 / embedding_size),
                    ))
                self.nce_biases = tf.Variable(tf.zeros([vocab_size]))
                scalar = 1 / np.sqrt(num_unique_documents + num_topics)
                self.doc_embedding = tf.Variable(
                    tf.random_normal(
                        [num_unique_documents, num_topics],
                        mean=0,
                        stddev=50 * scalar,
                    ))
                self.topic_embedding = tf.get_variable(
                    'topic_embedding',
                    shape=[num_topics, embedding_size],
                    dtype=tf.float32,
                    initializer=tf.orthogonal_initializer(gain=scalar),
                )
                pivot = tf.nn.embedding_lookup(self.word_embedding, self.X)
                proportions = tf.nn.embedding_lookup(self.doc_embedding,
                                                     self.DOC)
                doc = tf.matmul(proportions, self.topic_embedding)
                doc_context = doc
                word_context = pivot
                context = tf.add(word_context, doc_context)
                loss_word2vec = tf.reduce_mean(
                    tf.nn.nce_loss(
                        weights=self.nce_weights,
                        biases=self.nce_biases,
                        labels=self.Y,
                        inputs=context,
                        num_sampled=num_sampled,
                        num_classes=vocab_size,
                        num_true=1,
                        sampled_values=sampler,
                    ))
                self.fraction = tf.Variable(1,
                                            trainable=False,
                                            dtype=tf.float32)

                # Dirichlet-style prior over the document-topic proportions;
                # alpha defaults to 1 / num_topics
                n_topics = self.doc_embedding.get_shape()[1].value
                log_proportions = tf.nn.log_softmax(self.doc_embedding)
                if alpha is None:
                    alpha = 1.0 / n_topics
                loss = (alpha - 1) * log_proportions
                prior = tf.reduce_sum(loss)

                loss_lda = lmbda * self.fraction * prior
                global_step = tf.Variable(0,
                                          trainable=False,
                                          name='global_step')
                # use only the word2vec loss while global_step < switch_loss,
                # afterwards add the LDA prior term
                self.cost = tf.cond(
                    global_step < self.switch_loss,
                    lambda: loss_word2vec,
                    lambda: loss_word2vec + loss_lda,
                )
                loss_avgs_op = moving_avgs.apply(
                    [loss_lda, loss_word2vec, self.cost])
                with tf.control_dependencies([loss_avgs_op]):
                    optimizer = tf.train.AdamOptimizer(
                        learning_rate=learning_rate)
                    gvs = optimizer.compute_gradients(self.cost)
                    capped_gvs = [(
                        tf.clip_by_value(grad, -clip_gradients,
                                         clip_gradients),
                        var,
                    ) for grad, var in gvs]
                    self.optimizer = optimizer.apply_gradients(capped_gvs)
                self.sess = generate_session(_graph, **kwargs)
                self.sess.run(tf.global_variables_initializer())
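The loss_lda term above is a Dirichlet-style prior over the document-topic proportions. A minimal NumPy sketch of the same arithmetic (hypothetical small sizes, using the defaults alpha = 1 / num_topics and lmbda = 150.0):

import numpy as np

# hypothetical sizes: 3 documents, 4 topics
doc_embedding = np.random.randn(3, 4)
alpha = 1.0 / doc_embedding.shape[1]

# log-softmax over topics, then the prior (alpha - 1) * sum(log proportions)
log_proportions = doc_embedding - np.log(np.exp(doc_embedding).sum(axis=1, keepdims=True))
prior = ((alpha - 1) * log_proportions).sum()

lmbda, fraction = 150.0, 1.0
loss_lda = lmbda * fraction * prior
print(loss_lda)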