Example #1
# NOTE: imports reconstructed so the snippet is self-contained; `tree` and
# `Vocab` are assumed to come from the accompanying assignment files
# (tree.py, utils.py), and the code targets the TensorFlow 1.x API.
import itertools
import math
import os
import sys

import numpy as np
import tensorflow as tf

import tree as tr
from utils import Vocab

RESET_AFTER = 50  # assumption: rebuild the graph every 50 trees
class RNN_Model:
    def load_data(self):
        """Loads train/dev/test data and builds vocabulary."""
        self.train_data, self.dev_data, self.test_data = tr.simplified_data(
            300, 70, 100)

        # build vocab from training data
        self.vocab = Vocab()
        train_sents = [t.get_words() for t in self.train_data]
        self.vocab.construct(list(itertools.chain.from_iterable(train_sents)))

        self.w2v_vocab, w2v_embd, embedding_dict = self.load_w2v()
        self.embedding_dim = len(w2v_embd[0])
        self.w2v_vocab_size = len(self.w2v_vocab)

        self.vocab_size = len(self.vocab)
        embeddings_tmp = []
        for i in range(self.vocab_size):
            item = self.vocab.decode(i)
            if item in self.w2v_vocab:
                embeddings_tmp.append(embedding_dict[item])
                # print("Found word {}".format(item))
            else:
                # print("Couldn't find {}.".format(item))
                rand_num = np.random.uniform(low=-0.2,
                                             high=0.2,
                                             size=self.embedding_dim)
                embeddings_tmp.append(rand_num)

        self.embed = np.asarray(embeddings_tmp)

    def inference(self, tree, predict_only_root=True):
        """Builds the RNN model's computation graph for a given tree, up to the
        point where it can be used for inference.

        Args:
            tree: a Tree object on which to build the computation graph
        Returns:
            softmax_linear: output tensor with the computed logits.
        """
        node_tensors = self.add_model(tree.root)
        if predict_only_root:
            node_tensors = node_tensors[tree.root]
        else:
            node_tensors = [
                tensor for node, tensor in node_tensors.items()
                if node.label != 2
            ]
            node_tensors = tf.concat(node_tensors, 0)
        return self.add_projections(node_tensors)

    def add_model_vars(self):
        '''
        Your model contains the following parameters:
            embedding:  tensor(vocab_size, embed_size)
            W1:         tensor(embed_size, embed_size)
            b1:         tensor(1, embed_size)
            U:          tensor(embed_size, output_size)
            bs:         tensor(1, output_size)
        Note: this implementation sums child vectors instead of concatenating
              them, so W1 is (embed_size, embed_size) rather than the
              (2 * embed_size, embed_size) used with concatenation.
        Hint: Add the TensorFlow variables to the graph here and *reuse* them while
              building the computation graphs for composition and projection for
              each tree.
        Hint: Use a variable_scope "Composition" for the composition layer, and
              "Projection" for the linear transformation preceding the softmax.
        Hint: Look up tf.get_variable.
        '''
        with tf.variable_scope('Composition'):
            ### YOUR CODE HERE
            # pretrained GloVe vectors are loaded as a fixed (non-trainable)
            # embedding table; see load_data/load_w2v
            embedding = tf.get_variable(
                "embedding",
                shape=[self.vocab_size, self.embedding_dim],
                initializer=tf.constant_initializer(self.embed),
                trainable=False)
            W1 = tf.get_variable("W1",
                                 (self.embedding_dim, self.embedding_dim))
            b1 = tf.get_variable("b1", (1, self.embedding_dim))
            ### END YOUR CODE
        with tf.variable_scope('Projection'):
            ### YOUR CODE HERE
            U = tf.get_variable("U",
                                (self.embedding_dim, self.config.label_size))
            bs = tf.get_variable("bs", (1, self.config.label_size))
            ### END YOUR CODE

    def add_model(self, node):
        """Recursively builds the model to compute the phrase embeddings in the tree.

        Hint: Refer to tree.py and vocab.py before you start. Refer to
              the model's vocab with self.vocab
        Hint: Reuse the "Composition" variable_scope here
        Hint: Store a node's vector representation in node.tensor so it can be
              used by its parent
        Hint: If node is a leaf node, its vector representation is just the
              corresponding word vector (see tf.gather()).
        Args:
            node: a Node object
        Returns:
            node_tensors: Dict: key = Node, value = tensor(1, embed_size)
        """
        with tf.variable_scope('Composition', reuse=True):
            ### YOUR CODE HERE
            embedding = tf.get_variable("embedding")
            W1 = tf.get_variable("W1")
            b1 = tf.get_variable("b1")
            ### END YOUR CODE

        node_tensors = dict()
        curr_node_tensor = None
        if node.isLeaf:
            ### YOUR CODE HERE
            # leaf: look up the fixed word vector and reshape to (1, embed_size)
            word_id = self.vocab.encode(node.word)
            curr_node_tensor = tf.expand_dims(tf.gather(embedding, word_id), 0)
            ### END YOUR CODE
        else:
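            # internal node: sum the children's vectors, then apply one affine + ReLU layer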
            node_input = tf.zeros((1, self.embedding_dim))
            for child in node.children:
                node_tensors.update(self.add_model(child))
                node_input = tf.add(node_input, node_tensors[child])
            ### YOUR CODE HERE
            curr_node_tensor = tf.nn.relu(tf.matmul(node_input, W1) + b1)
            ### END YOUR CODE
        node_tensors[node] = curr_node_tensor
        return node_tensors

    def add_projections(self, node_tensors):
        """Add projections to the composition vectors to compute the raw sentiment scores

        Hint: Reuse the "Projection" variable_scope here
        Args:
            node_tensors: tensor(?, embed_size)
        Returns:
            output: tensor(?, label_size)
        """
        logits = None
        ### YOUR CODE HERE
        with tf.variable_scope('Projection', reuse=True):
            U = tf.get_variable("U")
            bs = tf.get_variable("bs")
            logits = tf.matmul(node_tensors, U) + bs
        ### END YOUR CODE
        return logits

    def loss(self, logits, labels):
        """Adds loss ops to the computational graph.

        Hint: Use sparse_softmax_cross_entropy_with_logits
        Hint: Remember to add l2_loss (see tf.nn.l2_loss)
        Args:
            logits: tensor(num_nodes, output_size)
            labels: python list, len = num_nodes
        Returns:
            loss: tensor 0-D
        """
        loss = None
        # YOUR CODE HERE
        with tf.variable_scope('Composition', reuse=True):
            W1 = tf.get_variable("W1")

        with tf.variable_scope('Projection', reuse=True):
            U = tf.get_variable("U")

        loss = tf.reduce_sum(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=labels,
                logits=logits)) + self.config.l2 * tf.nn.l2_loss(
                    W1) + self.config.l2 * tf.nn.l2_loss(U)

        # END YOUR CODE
        return loss

    def training(self, loss):
        """Sets up the training Ops.

        Creates an optimizer and applies the gradients to all trainable variables.
        The Op returned by this function is what must be passed to the
        `sess.run()` call to cause the model to train. See

        https://www.tensorflow.org/versions/r0.7/api_docs/python/train.html#Optimizer

        for more information.

        Hint: Use tf.train.GradientDescentOptimizer for this model.
                Calling optimizer.minimize() will return a train_op object.

        Args:
            loss: tensor 0-D
        Returns:
            train_op: tensorflow op for training.
        """
        train_op = None
        # YOUR CODE HERE
        train_op = tf.train.GradientDescentOptimizer(
            self.config.lr).minimize(loss)
        # END YOUR CODE
        return train_op

    def predictions(self, y):
        """Returns class predictions from the raw scores.

        Args:
            y: tensor(?, label_size)
        Returns:
            predictions: tensor(?,)
        """
        predictions = None
        # YOUR CODE HERE
        predictions = tf.argmax(y, axis=1)
        # END YOUR CODE
        return predictions

    def __init__(self, config):
        self.config = config
        self.load_data()

    def predict(self, trees, weights_path, get_loss=False):
        """Make predictions from the provided model."""
        results = []
        losses = []
        for i in range(int(math.ceil(len(trees) / float(RESET_AFTER)))):
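            # each tree builds its own ops, so a fresh graph/session is started
            # every RESET_AFTER trees to keep the graph from growing unboundedly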
            with tf.Graph().as_default(), tf.Session() as sess:
                self.add_model_vars()
                saver = tf.train.Saver()
                saver.restore(sess, weights_path)
                for tree in trees[i * RESET_AFTER:(i + 1) * RESET_AFTER]:
                    logits = self.inference(tree, True)
                    predictions = self.predictions(logits)
                    root_prediction = sess.run(predictions)[0]
                    if root_prediction == 1:
                        # map the collapsed binary positive class back to the
                        # original positive label 4
                        root_prediction = 4
                    if get_loss:
                        root_label = tree.root.label
                        loss = sess.run(self.loss(logits, [root_label]))
                        losses.append(loss)
                    results.append(root_prediction)
        return results, losses

    def run_epoch(self, new_model=False, verbose=True):
        step = 0
        loss_history = []
        while step < len(self.train_data):
            with tf.Graph().as_default(), tf.Session() as sess:
                self.add_model_vars()
                if new_model:
                    init = tf.global_variables_initializer()
                    sess.run(init)
                else:
                    saver = tf.train.Saver()
                    saver.restore(sess,
                                  './weights/%s.temp' % self.config.model_name)
                for _ in range(RESET_AFTER):
                    if step >= len(self.train_data):
                        break
                    tree = self.train_data[step]
                    logits = self.inference(tree)
                    # drop neutral nodes (label 2); a positive root label 4 is
                    # collapsed to the binary class 1
                    labels = [l for l in tree.labels if l != 2]
                    if labels[0] == 4:
                        labels = [1]
                    loss = self.loss(logits, labels)
                    train_op = self.training(loss)
                    loss, _ = sess.run([loss, train_op])
                    loss_history.append(loss)
                    if verbose:
                        sys.stdout.write('\r{} / {} :    loss = {}'.format(
                            step, len(self.train_data), np.mean(loss_history)))
                        sys.stdout.flush()
                    step += 1
                saver = tf.train.Saver()
                if not os.path.exists("./weights"):
                    os.makedirs("./weights")
                saver.save(sess, './weights/%s.temp' % self.config.model_name)
        train_preds, _ = self.predict(
            self.train_data, './weights/%s.temp' % self.config.model_name)
        val_preds, val_losses = self.predict(self.dev_data,
                                             './weights/%s.temp' %
                                             self.config.model_name,
                                             get_loss=True)
        train_labels = [t.root.label for t in self.train_data]
        val_labels = [t.root.label for t in self.dev_data]
        train_acc = np.equal(train_preds, train_labels).mean()
        val_acc = np.equal(val_preds, val_labels).mean()

        print()
        print('Training acc (only root node): {}'.format(train_acc))
        print('Validation acc (only root node): {}'.format(val_acc))
        print('Confusion matrix:')
        print(self.make_conf(train_labels, train_preds))
        print(self.make_conf(val_labels, val_preds))
        return train_acc, val_acc, loss_history, np.mean(val_losses)

    def train(self, verbose=True):
        complete_loss_history = []
        train_acc_history = []
        val_acc_history = []
        prev_epoch_loss = float('inf')
        best_val_loss = float('inf')
        best_val_epoch = 0
        stopped = -1
        for epoch in range(self.config.max_epochs):
            print('epoch %d' % epoch)
            if epoch == 0:
                train_acc, val_acc, loss_history, val_loss = self.run_epoch(
                    new_model=True)
            else:
                train_acc, val_acc, loss_history, val_loss = self.run_epoch()
            complete_loss_history.extend(loss_history)
            train_acc_history.append(train_acc)
            val_acc_history.append(val_acc)

            # lr annealing
            epoch_loss = np.mean(loss_history)
            if epoch_loss > prev_epoch_loss * self.config.anneal_threshold:
                self.config.lr /= self.config.anneal_by
                print('annealed lr to %f' % self.config.lr)
            prev_epoch_loss = epoch_loss

            # record the best epoch on the validation set
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_val_epoch = epoch

            # stop if the model has not improved on validation for a while
            if epoch - best_val_epoch > self.config.early_stopping:
                stopped = epoch
                break
        if verbose:
            sys.stdout.write('\r')
            sys.stdout.flush()

        print('\n\nstopped at %d\n' % stopped)
        return {
            'loss_history': complete_loss_history,
            'train_acc_history': train_acc_history,
            'val_acc_history': val_acc_history,
        }

    def make_conf(self, labels, predictions):
        confmat = np.zeros([2, 2])
        labels = [l if l != 4 else 1 for l in labels]
        predictions = [p if p != 4 else 1 for p in predictions]
        for l, p in zip(labels, predictions):
            confmat[l, p] += 1
        return confmat

    def load_w2v(self):
        """Loads pretrained GloVe vectors (word2vec-style interface)."""
        vocab = []
        embd = []
        e_dict = {}
        # change 100d to 50d for a smaller-dimension GloVe embedding
        with open("./glove.6B.100d.txt", 'r', encoding='UTF-8') as file:
            for line in file:
                row = line.strip().split(' ')
                vocab.append(row[0])
                embd.append(row[1:])
                e_dict[row[0]] = [float(i) for i in row[1:]]
        print("Loaded GloVe embeddings!")
        return vocab, embd, e_dict
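
# A minimal driver sketch (not from the original): it assumes a Config object
# exposing the hyperparameters the class reads (lr, l2, label_size, max_epochs,
# anneal_by, anneal_threshold, early_stopping, model_name); the values below
# are illustrative only.
class Config:
    lr = 0.01
    l2 = 0.02
    label_size = 2
    max_epochs = 30
    anneal_threshold = 0.99
    anneal_by = 1.5
    early_stopping = 2
    model_name = 'rnn_sum_glove'

if __name__ == '__main__':
    model = RNN_Model(Config())
    stats = model.train(verbose=True)
    print('best val acc: {:.3f}'.format(max(stats['val_acc_history'])))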
Example #2
# Fragment of a TF2/eager training loop; it runs inside
# `for epoch in range(epochs):` and assumes train_step/val_step/test_step,
# the dataloaders, the en_vocab/ja_vocab objects, and depth_t are defined
# elsewhere in the script (see the sketch after the fragment).
        print('-' * 20)
        print('epoch: {}'.format(epoch + 1))

        for (x, t) in train_dataloader:
            train_step(x, t, depth_t)

        for (x, t) in val_dataloader:
            val_step(x, t, depth_t)

        print('loss: {:.3f}, val_loss: {:.3f}'.format(train_loss.result(),
                                                      val_loss.result()))

        for idx, (x, t) in enumerate(test_dataloader):
            preds = test_step(x)

            source = x.numpy().reshape(-1)
            target = t.numpy().reshape(-1)
            out = tf.argmax(preds, axis=-1).numpy().reshape(-1)

            source = ' '.join(en_vocab.decode(source))
            target = ' '.join(ja_vocab.decode(target))
            out = ' '.join(ja_vocab.decode(out))

            print('>', source)
            print('=', target)
            print('<', out)
            print()

            if idx >= 9:
                break
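
# A minimal sketch (not from the original) of the step functions the loop
# assumes. `model` stands in for the seq2seq translation model defined in the
# surrounding script; its call signatures here are assumptions. The metric and
# loss objects mirror the names used in the loop above.
import tensorflow as tf

optimizer = tf.keras.optimizers.Adam()
criterion = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
train_loss = tf.keras.metrics.Mean(name='train_loss')
val_loss = tf.keras.metrics.Mean(name='val_loss')

@tf.function
def train_step(x, t, depth_t):
    with tf.GradientTape() as tape:
        preds = model(x, t, depth_t)   # assumed teacher-forcing signature
        loss = criterion(t, preds)
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    train_loss(loss)

@tf.function
def val_step(x, t, depth_t):
    preds = model(x, t, depth_t)
    val_loss(criterion(t, preds))

def test_step(x):
    return model(x)                    # assumed greedy-decoding signature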
Example #3
# NOTE: imports reconstructed so the snippet is self-contained; HyperParams,
# Splits, Vocab, and shuffle_multi_split are assumed to come from the
# surrounding project.
import csv
from pathlib import Path
from typing import List

import numpy as np
import torch
from torch.utils.data import Dataset
from torchvision.datasets.utils import download_url
from tqdm import tqdm
class StarTrekCharGenerationDataset(Dataset):
    def __init__(self, hparams: HyperParams, data_split: str, sep_line="\n"):
        assert Splits.check_split(data_split)
        self.hparams = hparams
        self.root = Path(self.hparams.root)
        self.data_split = data_split
        self.sep_line = sep_line

        self.path_data = self.download()
        self.lines = self.preprocess_data()
        self.vocab = Vocab(list(self.sep_line.join(self.lines)))
        self.text = self.train_val_test_split()
        self.tensor = self.get_sequences()
        if self.hparams.verbose:
            self.show_samples()
            print(dict(vocab_size=len(self.vocab)))

    def download(self) -> Path:
        url = "https://github.com/chiayewken/sutd-materials/releases/download/v0.1.0/star_trek_transcripts_all_episodes.csv"
        path = self.root / Path(url).name
        if not path.exists():
            download_url(url, str(self.root), filename=path.name)
        assert path.exists()
        return path

    def preprocess_data(self) -> List[str]:
        with open(str(self.path_data)) as f:
            return [
                line.strip().strip(",") for line in f
                if "NEXTEPISODE" not in line
            ]

    def train_val_test_split(self, fractions=(0.8, 0.1, 0.1)) -> str:
        indices_all = list(range(len(self.lines)))
        indices_split = shuffle_multi_split(indices_all, fractions)
        indices = indices_split[[Splits.train, Splits.val,
                                 Splits.test].index(self.data_split)]
        lines = [self.lines[i] for i in indices]
        text = self.sep_line.join(lines)
        if self.hparams.verbose:
            print(dict(lines=len(lines), text=len(text)))
        return text

    def get_sequences(self) -> torch.Tensor:
        path_cache = self.root / f"cache_tensor_{self.data_split}.pt"
        token_start = self.vocab.stoi[self.vocab.start]

        if not path_cache.exists():
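            # build overlapping character windows: each cached sample is the
            # start token followed by seq_len consecutive character ids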
            encoded = self.vocab.encode(list(self.text))
            sequences = []
            for i in tqdm(range(len(encoded) - self.hparams.seq_len)):
                sequences.append([token_start] +
                                 encoded[i:i + self.hparams.seq_len])
            tensor = torch.from_numpy(np.array(sequences)).type(torch.long)
            torch.save(tensor, str(path_cache))

        tensor = torch.load(str(path_cache))
        if self.hparams.verbose:
            print(dict(tensor=tensor.shape))
        return tensor

    def __len__(self):
        return self.tensor.shape[0]

    def __getitem__(self, i):
        sequence = self.tensor[i, :]
        return sequence[:-1], sequence[1:]

    def sequence_to_text(self, sequence: torch.Tensor):
        assert sequence.ndim == 1
        return "".join(self.vocab.decode(sequence.numpy()))

    def show_samples(self, num=3):
        print(self.__class__.__name__, dict(show_samples=num))
        indices = np.random.choice(len(self), size=num, replace=False)
        for i in indices:
            sequence = self.tensor[i, :]
            print(dict(text=self.sequence_to_text(sequence), raw=sequence))

    def extract_quotes(self, lines: List[str]) -> List[str]:
        def check_speaker_start(s: str) -> bool:
            for char in s:
                if char.isupper():
                    pass
                elif char == ":":
                    return True
                else:
                    return False
            return False

        def handle_newlines(_lines: List[str]) -> List[str]:
            out = []
            for line in _lines:
                out.extend(line.split(self.sep_line))
            return out

        def check_line_finish(s: str) -> bool:
            return s[-1] in "!.?"

        lines = handle_newlines(lines)
        quotes = []
        for row in csv.reader(lines, delimiter=",", quotechar='"'):
            for part in row:
                if not part:
                    continue
                if not check_line_finish(part):
                    continue
                if not check_speaker_start(part):
                    continue
                quotes.append(part)
        if self.hparams.verbose:
            print(dict(extract_quotes=len(quotes)))
        return quotes
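
# A minimal usage sketch (not from the original). The DemoHyperParams dataclass
# below is a hypothetical stand-in exposing the fields the dataset reads
# (root, seq_len, verbose), and "train" assumes Splits.train == "train".
from dataclasses import dataclass
from torch.utils.data import DataLoader

@dataclass
class DemoHyperParams:
    root: str = "data"
    seq_len: int = 128
    verbose: bool = True

dataset = StarTrekCharGenerationDataset(DemoHyperParams(), data_split="train")
loader = DataLoader(dataset, batch_size=32, shuffle=True)
inputs, targets = next(iter(loader))  # each (batch, seq_len); targets are inputs shifted by one
print(inputs.shape, targets.shape)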