Example #1
# NOTE: imports reconstructed so the snippet is self-contained; `tree` and
# `Vocab` are assumed to come from the accompanying assignment files
# (tree.py, utils.py), and the code targets the TensorFlow 1.x API.
import itertools
import math
import os
import sys

import numpy as np
import tensorflow as tf

import tree as tr
from utils import Vocab

RESET_AFTER = 50  # assumption: rebuild the graph every 50 trees
class RNN_Model:
    def load_data(self):
        """Loads train/dev/test data and builds vocabulary."""
        self.train_data, self.dev_data, self.test_data = tr.simplified_data(
            300, 70, 100)

        # build vocab from training data
        self.vocab = Vocab()
        train_sents = [t.get_words() for t in self.train_data]
        self.vocab.construct(list(itertools.chain.from_iterable(train_sents)))

        self.w2v_vocab, w2v_embd, embedding_dict = self.load_w2v()
        self.embedding_dim = len(w2v_embd[0])
        self.w2v_vocab_size = len(self.w2v_vocab)

        self.vocab_size = len(self.vocab)
        embeddings_tmp = []
        for i in range(self.vocab_size):
            item = self.vocab.decode(i)
            if item in self.w2v_vocab:
                embeddings_tmp.append(embedding_dict[item])
                # print("Found word {}".format(item))
            else:
                # print("Couldn't find {}.".format(item))
                rand_num = np.random.uniform(low=-0.2,
                                             high=0.2,
                                             size=self.embedding_dim)
                embeddings_tmp.append(rand_num)

        self.embed = np.asarray(embeddings_tmp)

    def inference(self, tree, predict_only_root=True):
        """Builds the RNN model's computation graph for a given tree, up to the
        point where it can be used for inference.

        Args:
            tree: a Tree object on which to build the computation graph
        Returns:
            softmax_linear: output tensor with the computed logits.
        """
        node_tensors = self.add_model(tree.root)
        if predict_only_root:
            node_tensors = node_tensors[tree.root]
        else:
            node_tensors = [
                tensor for node, tensor in node_tensors.items()
                if node.label != 2
            ]
            node_tensors = tf.concat(node_tensors, 0)
        return self.add_projections(node_tensors)

    def add_model_vars(self):
        '''
        Your model contains the following parameters:
            embedding:  tensor(vocab_size, embed_size)
            W1:         tensor(embed_size, embed_size)
            b1:         tensor(1, embed_size)
            U:          tensor(embed_size, output_size)
            bs:         tensor(1, output_size)
        Note: this implementation sums child vectors instead of concatenating
              them, so W1 is (embed_size, embed_size) rather than the
              (2 * embed_size, embed_size) used with concatenation.
        Hint: Add the TensorFlow variables to the graph here and *reuse* them while
              building the computation graphs for composition and projection for
              each tree.
        Hint: Use a variable_scope "Composition" for the composition layer, and
              "Projection" for the linear transformation preceding the softmax.
        Hint: Look up tf.get_variable.
        '''
        with tf.variable_scope('Composition'):
            ### YOUR CODE HERE
            # pretrained GloVe vectors are loaded as a fixed (non-trainable)
            # embedding table; see load_data/load_w2v
            embedding = tf.get_variable(
                "embedding",
                shape=[self.vocab_size, self.embedding_dim],
                initializer=tf.constant_initializer(self.embed),
                trainable=False)
            W1 = tf.get_variable("W1",
                                 (self.embedding_dim, self.embedding_dim))
            b1 = tf.get_variable("b1", (1, self.embedding_dim))
            ### END YOUR CODE
        with tf.variable_scope('Projection'):
            ### YOUR CODE HERE
            U = tf.get_variable("U",
                                (self.embedding_dim, self.config.label_size))
            bs = tf.get_variable("bs", (1, self.config.label_size))
            ### END YOUR CODE

    def add_model(self, node):
        """Recursively builds the model to compute the phrase embeddings in the tree.

        Hint: Refer to tree.py and vocab.py before you start. Refer to
              the model's vocab with self.vocab
        Hint: Reuse the "Composition" variable_scope here
        Hint: Store a node's vector representation in node.tensor so it can be
              used by its parent
        Hint: If node is a leaf node, its vector representation is just the
              corresponding word vector (see tf.gather()).
        Args:
            node: a Node object
        Returns:
            node_tensors: Dict: key = Node, value = tensor(1, embed_size)
        """
        with tf.variable_scope('Composition', reuse=True):
            ### YOUR CODE HERE
            embedding = tf.get_variable("embedding")
            W1 = tf.get_variable("W1")
            b1 = tf.get_variable("b1")
            ### END YOUR CODE

        node_tensors = dict()
        curr_node_tensor = None
        if node.isLeaf:
            ### YOUR CODE HERE
            # leaf: look up the fixed word vector and reshape to (1, embed_size)
            word_id = self.vocab.encode(node.word)
            curr_node_tensor = tf.expand_dims(tf.gather(embedding, word_id), 0)
            ### END YOUR CODE
        else:
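            # internal node: sum the children's vectors, then apply one affine + ReLU layer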
            node_input = tf.zeros((1, self.embedding_dim))
            for child in node.children:
                node_tensors.update(self.add_model(child))
                node_input = tf.add(node_input, node_tensors[child])
            ### YOUR CODE HERE
            curr_node_tensor = tf.nn.relu(tf.matmul(node_input, W1) + b1)
            ### END YOUR CODE
        node_tensors[node] = curr_node_tensor
        return node_tensors

    def add_projections(self, node_tensors):
        """Add projections to the composition vectors to compute the raw sentiment scores

        Hint: Reuse the "Projection" variable_scope here
        Args:
            node_tensors: tensor(?, embed_size)
        Returns:
            output: tensor(?, label_size)
        """
        logits = None
        ### YOUR CODE HERE
        with tf.variable_scope('Projection', reuse=True):
            U = tf.get_variable("U")
            bs = tf.get_variable("bs")
            logits = tf.matmul(node_tensors, U) + bs
        ### END YOUR CODE
        return logits

    def loss(self, logits, labels):
        """Adds loss ops to the computational graph.

        Hint: Use sparse_softmax_cross_entropy_with_logits
        Hint: Remember to add l2_loss (see tf.nn.l2_loss)
        Args:
            logits: tensor(num_nodes, output_size)
            labels: python list, len = num_nodes
        Returns:
            loss: tensor 0-D
        """
        loss = None
        # YOUR CODE HERE
        with tf.variable_scope('Composition', reuse=True):
            W1 = tf.get_variable("W1")

        with tf.variable_scope('Projection', reuse=True):
            U = tf.get_variable("U")

        loss = tf.reduce_sum(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=labels,
                logits=logits)) + self.config.l2 * tf.nn.l2_loss(
                    W1) + self.config.l2 * tf.nn.l2_loss(U)

        # END YOUR CODE
        return loss

    def training(self, loss):
        """Sets up the training Ops.

        Creates an optimizer and applies the gradients to all trainable variables.
        The Op returned by this function is what must be passed to the
        `sess.run()` call to cause the model to train. See

        https://www.tensorflow.org/versions/r0.7/api_docs/python/train.html#Optimizer

        for more information.

        Hint: Use tf.train.GradientDescentOptimizer for this model.
                Calling optimizer.minimize() will return a train_op object.

        Args:
            loss: tensor 0-D
        Returns:
            train_op: tensorflow op for training.
        """
        train_op = None
        # YOUR CODE HERE
        train_op = tf.train.GradientDescentOptimizer(
            self.config.lr).minimize(loss)
        # END YOUR CODE
        return train_op

    def predictions(self, y):
        """Returns class predictions from the raw scores.

        Args:
            y: tensor(?, label_size)
        Returns:
            predictions: tensor(?,)
        """
        predictions = None
        # YOUR CODE HERE
        predictions = tf.argmax(y, axis=1)
        # END YOUR CODE
        return predictions

    def __init__(self, config):
        self.config = config
        self.load_data()

    def predict(self, trees, weights_path, get_loss=False):
        """Make predictions from the provided model."""
        results = []
        losses = []
        for i in range(int(math.ceil(len(trees) / float(RESET_AFTER)))):
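            # each tree builds its own ops, so a fresh graph/session is started
            # every RESET_AFTER trees to keep the graph from growing unboundedly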
            with tf.Graph().as_default(), tf.Session() as sess:
                self.add_model_vars()
                saver = tf.train.Saver()
                saver.restore(sess, weights_path)
                for tree in trees[i * RESET_AFTER:(i + 1) * RESET_AFTER]:
                    logits = self.inference(tree, True)
                    predictions = self.predictions(logits)
                    root_prediction = sess.run(predictions)[0]
                    if root_prediction == 1:
                        # map the collapsed binary positive class back to the
                        # original positive label 4
                        root_prediction = 4
                    if get_loss:
                        root_label = tree.root.label
                        loss = sess.run(self.loss(logits, [root_label]))
                        losses.append(loss)
                    results.append(root_prediction)
        return results, losses

    def run_epoch(self, new_model=False, verbose=True):
        step = 0
        loss_history = []
        while step < len(self.train_data):
            with tf.Graph().as_default(), tf.Session() as sess:
                self.add_model_vars()
                if new_model:
                    init = tf.global_variables_initializer()
                    sess.run(init)
                else:
                    saver = tf.train.Saver()
                    saver.restore(sess,
                                  './weights/%s.temp' % self.config.model_name)
                for _ in range(RESET_AFTER):
                    if step >= len(self.train_data):
                        break
                    tree = self.train_data[step]
                    logits = self.inference(tree)
                    # drop neutral nodes (label 2); a positive root label 4 is
                    # collapsed to the binary class 1
                    labels = [l for l in tree.labels if l != 2]
                    if labels[0] == 4:
                        labels = [1]
                    loss = self.loss(logits, labels)
                    train_op = self.training(loss)
                    loss, _ = sess.run([loss, train_op])
                    loss_history.append(loss)
                    if verbose:
                        sys.stdout.write('\r{} / {} :    loss = {}'.format(
                            step, len(self.train_data), np.mean(loss_history)))
                        sys.stdout.flush()
                    step += 1
                saver = tf.train.Saver()
                if not os.path.exists("./weights"):
                    os.makedirs("./weights")
                saver.save(sess, './weights/%s.temp' % self.config.model_name)
        train_preds, _ = self.predict(
            self.train_data, './weights/%s.temp' % self.config.model_name)
        val_preds, val_losses = self.predict(self.dev_data,
                                             './weights/%s.temp' %
                                             self.config.model_name,
                                             get_loss=True)
        train_labels = [t.root.label for t in self.train_data]
        val_labels = [t.root.label for t in self.dev_data]
        train_acc = np.equal(train_preds, train_labels).mean()
        val_acc = np.equal(val_preds, val_labels).mean()

        print()
        print('Training acc (only root node): {}'.format(train_acc))
        print('Validation acc (only root node): {}'.format(val_acc))
        print('Confusion matrix:')
        print(self.make_conf(train_labels, train_preds))
        print(self.make_conf(val_labels, val_preds))
        return train_acc, val_acc, loss_history, np.mean(val_losses)

    def train(self, verbose=True):
        complete_loss_history = []
        train_acc_history = []
        val_acc_history = []
        prev_epoch_loss = float('inf')
        best_val_loss = float('inf')
        best_val_epoch = 0
        stopped = -1
        for epoch in range(self.config.max_epochs):
            print('epoch %d' % epoch)
            if epoch == 0:
                train_acc, val_acc, loss_history, val_loss = self.run_epoch(
                    new_model=True)
            else:
                train_acc, val_acc, loss_history, val_loss = self.run_epoch()
            complete_loss_history.extend(loss_history)
            train_acc_history.append(train_acc)
            val_acc_history.append(val_acc)

            # lr annealing
            epoch_loss = np.mean(loss_history)
            if epoch_loss > prev_epoch_loss * self.config.anneal_threshold:
                self.config.lr /= self.config.anneal_by
                print('annealed lr to %f' % self.config.lr)
            prev_epoch_loss = epoch_loss

            # record the best epoch on the validation set
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_val_epoch = epoch

            # stop if the model has not improved on validation for a while
            if epoch - best_val_epoch > self.config.early_stopping:
                stopped = epoch
                break
        if verbose:
            sys.stdout.write('\r')
            sys.stdout.flush()

        print('\n\nstopped at %d\n' % stopped)
        return {
            'loss_history': complete_loss_history,
            'train_acc_history': train_acc_history,
            'val_acc_history': val_acc_history,
        }

    def make_conf(self, labels, predictions):
        confmat = np.zeros([2, 2])
        labels = [l if l != 4 else 1 for l in labels]
        predictions = [p if p != 4 else 1 for p in predictions]
        for l, p in zip(labels, predictions):
            confmat[l, p] += 1
        return confmat

    def load_w2v(self):
        """Loads pretrained GloVe vectors (word2vec-style interface)."""
        vocab = []
        embd = []
        e_dict = {}
        # change 100d to 50d for a smaller-dimension GloVe embedding
        with open("./glove.6B.100d.txt", 'r', encoding='UTF-8') as file:
            for line in file:
                row = line.strip().split(' ')
                vocab.append(row[0])
                embd.append(row[1:])
                e_dict[row[0]] = [float(i) for i in row[1:]]
        print("Loaded GloVe embeddings!")
        return vocab, embd, e_dict
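
# A minimal driver sketch (not from the original): it assumes a Config object
# exposing the hyperparameters the class reads (lr, l2, label_size, max_epochs,
# anneal_by, anneal_threshold, early_stopping, model_name); the values below
# are illustrative only.
class Config:
    lr = 0.01
    l2 = 0.02
    label_size = 2
    max_epochs = 30
    anneal_threshold = 0.99
    anneal_by = 1.5
    early_stopping = 2
    model_name = 'rnn_sum_glove'

if __name__ == '__main__':
    model = RNN_Model(Config())
    stats = model.train(verbose=True)
    print('best val acc: {:.3f}'.format(max(stats['val_acc_history'])))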
Example #2
# Fragment of a TF2/eager training loop; it runs inside
# `for epoch in range(epochs):` and assumes train_step/val_step/test_step,
# the dataloaders, the en_vocab/ja_vocab objects, and depth_t are defined
# elsewhere in the script (see the sketch after the fragment).
        print('-' * 20)
        print('epoch: {}'.format(epoch + 1))

        for (x, t) in train_dataloader:
            train_step(x, t, depth_t)

        for (x, t) in val_dataloader:
            val_step(x, t, depth_t)

        print('loss: {:.3f}, val_loss: {:.3f}'.format(train_loss.result(),
                                                      val_loss.result()))

        for idx, (x, t) in enumerate(test_dataloader):
            preds = test_step(x)

            source = x.numpy().reshape(-1)
            target = t.numpy().reshape(-1)
            out = tf.argmax(preds, axis=-1).numpy().reshape(-1)

            source = ' '.join(en_vocab.decode(source))
            target = ' '.join(ja_vocab.decode(target))
            out = ' '.join(ja_vocab.decode(out))

            print('>', source)
            print('=', target)
            print('<', out)
            print()

            if idx >= 9:
                break
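
# A minimal sketch (not from the original) of the step functions the loop
# assumes. `model` stands in for the seq2seq translation model defined in the
# surrounding script; its call signatures here are assumptions. The metric and
# loss objects mirror the names used in the loop above.
import tensorflow as tf

optimizer = tf.keras.optimizers.Adam()
criterion = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
train_loss = tf.keras.metrics.Mean(name='train_loss')
val_loss = tf.keras.metrics.Mean(name='val_loss')

@tf.function
def train_step(x, t, depth_t):
    with tf.GradientTape() as tape:
        preds = model(x, t, depth_t)   # assumed teacher-forcing signature
        loss = criterion(t, preds)
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    train_loss(loss)

@tf.function
def val_step(x, t, depth_t):
    preds = model(x, t, depth_t)
    val_loss(criterion(t, preds))

def test_step(x):
    return model(x)                    # assumed greedy-decoding signature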
Example #3
# NOTE: imports reconstructed so the snippet is self-contained; HyperParams,
# Splits, Vocab, and shuffle_multi_split are assumed to come from the
# surrounding project.
import csv
from pathlib import Path
from typing import List

import numpy as np
import torch
from torch.utils.data import Dataset
from torchvision.datasets.utils import download_url
from tqdm import tqdm
class StarTrekCharGenerationDataset(Dataset):
    def __init__(self, hparams: HyperParams, data_split: str, sep_line="\n"):
        assert Splits.check_split(data_split)
        self.hparams = hparams
        self.root = Path(self.hparams.root)
        self.data_split = data_split
        self.sep_line = sep_line

        self.path_data = self.download()
        self.lines = self.preprocess_data()
        self.vocab = Vocab(list(self.sep_line.join(self.lines)))
        self.text = self.train_val_test_split()
        self.tensor = self.get_sequences()
        if self.hparams.verbose:
            self.show_samples()
            print(dict(vocab_size=len(self.vocab)))

    def download(self) -> Path:
        url = "https://github.com/chiayewken/sutd-materials/releases/download/v0.1.0/star_trek_transcripts_all_episodes.csv"
        path = self.root / Path(url).name
        if not path.exists():
            download_url(url, str(self.root), filename=path.name)
        assert path.exists()
        return path

    def preprocess_data(self) -> List[str]:
        with open(str(self.path_data)) as f:
            return [
                line.strip().strip(",") for line in f
                if "NEXTEPISODE" not in line
            ]

    def train_val_test_split(self, fractions=(0.8, 0.1, 0.1)) -> str:
        indices_all = list(range(len(self.lines)))
        indices_split = shuffle_multi_split(indices_all, fractions)
        indices = indices_split[[Splits.train, Splits.val,
                                 Splits.test].index(self.data_split)]
        lines = [self.lines[i] for i in indices]
        text = self.sep_line.join(lines)
        if self.hparams.verbose:
            print(dict(lines=len(lines), text=len(text)))
        return text

    def get_sequences(self) -> torch.Tensor:
        path_cache = self.root / f"cache_tensor_{self.data_split}.pt"
        token_start = self.vocab.stoi[self.vocab.start]

        if not path_cache.exists():
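            # build overlapping character windows: each cached sample is the
            # start token followed by seq_len consecutive character ids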
            encoded = self.vocab.encode(list(self.text))
            sequences = []
            for i in tqdm(range(len(encoded) - self.hparams.seq_len)):
                sequences.append([token_start] +
                                 encoded[i:i + self.hparams.seq_len])
            tensor = torch.from_numpy(np.array(sequences)).type(torch.long)
            torch.save(tensor, str(path_cache))

        tensor = torch.load(str(path_cache))
        if self.hparams.verbose:
            print(dict(tensor=tensor.shape))
        return tensor

    def __len__(self):
        return self.tensor.shape[0]

    def __getitem__(self, i):
        sequence = self.tensor[i, :]
        return sequence[:-1], sequence[1:]

    def sequence_to_text(self, sequence: torch.Tensor):
        assert sequence.ndim == 1
        return "".join(self.vocab.decode(sequence.numpy()))

    def show_samples(self, num=3):
        print(self.__class__.__name__, dict(show_samples=num))
        indices = np.random.choice(len(self), size=num, replace=False)
        for i in indices:
            sequence = self.tensor[i, :]
            print(dict(text=self.sequence_to_text(sequence), raw=sequence))

    def extract_quotes(self, lines: List[str]) -> List[str]:
        def check_speaker_start(s: str) -> bool:
            for char in s:
                if char.isupper():
                    pass
                elif char == ":":
                    return True
                else:
                    return False
            return False

        def handle_newlines(_lines: List[str]) -> List[str]:
            out = []
            for line in _lines:
                out.extend(line.split(self.sep_line))
            return out

        def check_line_finish(s: str) -> bool:
            return s[-1] in "!.?"

        lines = handle_newlines(lines)
        quotes = []
        for row in csv.reader(lines, delimiter=",", quotechar='"'):
            for part in row:
                if not part:
                    continue
                if not check_line_finish(part):
                    continue
                if not check_speaker_start(part):
                    continue
                quotes.append(part)
        if self.hparams.verbose:
            print(dict(extract_quotes=len(quotes)))
        return quotes
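
# A minimal usage sketch (not from the original). The DemoHyperParams dataclass
# below is a hypothetical stand-in exposing the fields the dataset reads
# (root, seq_len, verbose), and "train" assumes Splits.train == "train".
from dataclasses import dataclass
from torch.utils.data import DataLoader

@dataclass
class DemoHyperParams:
    root: str = "data"
    seq_len: int = 128
    verbose: bool = True

dataset = StarTrekCharGenerationDataset(DemoHyperParams(), data_split="train")
loader = DataLoader(dataset, batch_size=32, shuffle=True)
inputs, targets = next(iter(loader))  # each (batch, seq_len); targets are inputs shifted by one
print(inputs.shape, targets.shape)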