Example #1
    def __init__(self, V=10000):
        self.vocab = None
        self.zipped_filename = "data/sst/trainDevTestTrees_PTB.zip"
        self.target_names = None  # set by self.process()

        # Download datasets
        if not os.path.isfile(self.zipped_filename):
            data_dir = os.path.dirname(self.zipped_filename)
            print("Downloading treebank to {:s}".format(data_dir))
            self.zipped_filename = download_sst(data_dir)
        print("Loading SST from {:s}".format(self.zipped_filename))

        self.train_trees = self.get_trees("train")
        print("Training set:     {:,} trees".format(len(self.train_trees)))
        self.dev_trees = self.get_trees("dev")
        print("Development set:  {:,} trees".format(len(self.dev_trees)))
        self.test_trees = self.get_trees("test")
        print("Test set:         {:,} trees".format(len(self.test_trees)))

        # Verify that number of sentences matches the published size.
        assert (len(self.train_trees) == 8544)
        assert (len(self.dev_trees) == 1101)
        assert (len(self.test_trees) == 2210)

        # Build vocabulary over training set
        print("Building vocabulary - ", end="")
        train_words = utils.flatten(
            self.canonicalize(t.leaves()) for t in self.train_trees)
        self.vocab = vocabulary.Vocabulary(train_words, size=V)
        print("{:,} words".format(self.vocab.size))
Example #2
    def __init__(self, ndim=50):
        assert(ndim in self._AVAILABLE_DIMS)

        self.vocab = None
        self.W = None
        self.zipped_filename = "data/glove/glove.6B.zip"

        # Download datasets
        if not os.path.isfile(self.zipped_filename):
            data_dir = os.path.dirname(self.zipped_filename)
            print("Downloading GloVe vectors to {:s}".format(data_dir))
            self.zipped_filename = download_glove(data_dir)
        print("Loading vectors from {:s}".format(self.zipped_filename))

        words, W = parse_glove_file(self.zipped_filename, ndim)
        # Fill the three special-token rows with nonzero values: the overall
        # mean of the real word vectors plus two distinct half-vocabulary
        # means, so the special rows are not identical.
        half = W.shape[0]//2
        mean_vec = np.mean(W[3:], axis=0)
        mean_first_half = np.mean(W[3:half], axis=0)
        mean_second_half = np.mean(W[half:], axis=0)
        W[0] = mean_vec
        W[1] = mean_first_half
        W[2] = mean_second_half
        self.W = W
        self.vocab = vocabulary.Vocabulary(words[3:])
        assert(self.vocab.size == self.W.shape[0])
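The block that overwrites W[0], W[1] and W[2] is easier to see on a toy matrix: the three reserved rows start at zero and are filled with the mean of all real word vectors and the means of each half of the vocabulary, so the special tokens get distinct nonzero embeddings. A small numpy sketch with toy shapes, independent of the GloVe files:

import numpy as np

np.random.seed(0)
W = np.vstack([np.zeros((3, 4)),        # rows 0-2 reserved for special tokens
               np.random.randn(8, 4)])  # eight "real" word vectors of dim 4

half = W.shape[0] // 2
W[0] = np.mean(W[3:], axis=0)        # overall mean of the real vectors
W[1] = np.mean(W[3:half], axis=0)    # mean of the first half
W[2] = np.mean(W[half:], axis=0)     # mean of the second half
print(W[:3])                         # special rows are now nonzero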
Example #3
    def __init__(self, ndim=50, quiet=False):
        assert (ndim in self._AVAILABLE_DIMS)

        self.vocab = None
        self.W = None
        self.zipped_filename = "data/glove/glove.6B.zip"

        # Download datasets
        if not os.path.isfile(self.zipped_filename):
            data_dir = os.path.dirname(self.zipped_filename)

            if not quiet:
                print("Downloading GloVe vectors to {:s}".format(data_dir))
            self.zipped_filename = download_glove(data_dir)

        if not quiet:
            print("Loading vectors from {:s}".format(self.zipped_filename))

        words, W = parse_glove_file(self.zipped_filename, ndim, quiet)
        # Give the three special-token rows nonzero values: the mean of the
        # real word vectors.
        mean_vec = np.mean(W[3:], axis=0)
        for i in range(3):
            W[i] = mean_vec
        self.W = W
        self.vocab = vocabulary.Vocabulary(words[3:])
        assert (self.vocab.size == self.W.shape[0])
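Both GloVe loaders depend on parse_glove_file returning a word list and an embedding matrix whose first three rows are reserved for special tokens. That helper is not shown here; the sketch below is one plausible reading of its contract, parsing lines in the standard "word v1 ... vd" GloVe text format (the function name, signature, and reserved tokens are assumptions, not the original code):

import numpy as np

def parse_glove_lines(lines, ndim, specials=("<s>", "</s>", "<unk>")):
    # Reserve the first len(specials) rows; they stay zero until the
    # caller fills them in, as the loaders above do.
    words = list(specials)
    vectors = [np.zeros(ndim) for _ in specials]
    for line in lines:
        parts = line.rstrip().split(" ")
        words.append(parts[0])
        vectors.append(np.array(parts[1:1 + ndim], dtype=np.float32))
    return words, np.vstack(vectors)

toy_lines = ["the 0.1 0.2 0.3", "movie 0.0 -0.4 0.9"]
words, W = parse_glove_lines(toy_lines, ndim=3)
print(words)      # ['<s>', '</s>', '<unk>', 'the', 'movie']
print(W.shape)    # (5, 3)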
Example #4
    def setUp(self):
        sequence = ["a", "b", "c", "d"]
        self.vocab = vocabulary.Vocabulary(sequence)
        ids = self.vocab.words_to_ids(sequence)
        self.train_ids = np.array(ids * 50000, dtype=int)
        self.test_ids = np.array(ids * 100, dtype=int)

        model_params = dict(V=self.vocab.size,
                            H=10,
                            softmax_ns=2,
                            num_layers=1)
        self.lm = rnnlm.RNNLM(**model_params)
        self.lm.BuildCoreGraph()
        self.lm.BuildTrainGraph()
        self.lm.BuildSamplerGraph()
        # For toy model, ignore sampled softmax.
        self.lm.train_loss_ = self.lm.loss_
Example #5

# In[39]:

reload(ds)
post, mbti_type, user = ds.splitPosts(df)

# Split data: 80% train, 20% test
post_train, post_test, label_train, label_test = train_test_split(post, mbti_type, test_size=0.2, random_state=88)

print("MBIT posts", post_train[:5])
print('')
print("MBTI Labels: ",label_train[:5])

# Build a vocabulary over the canonicalized training tokens (V defaults to the full vocabulary)
vocab_mbti = vocabulary.Vocabulary(utils.canonicalize_word(w) for p in post_train for w in p.split())
print("Vocab Size: ",vocab_mbti.size)

# Tokenize and canonicalize the train and test sets, then map words to ids
x_train = []
for p in post_train:
    x_train.append(vocab_mbti.words_to_ids([utils.canonicalize_word(w) for w in p.split()]))

x_test = []
for p in post_test:
    x_test.append(vocab_mbti.words_to_ids([utils.canonicalize_word(w) for w in p.split()]))

reload(ds)
y_train, y_test = ds.one_hot_label(mbti_type, label_train, label_test)
y_train_id, y_test_id, label_map = ds.label_to_id(mbti_type, label_train, label_test)
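one_hot_label and label_to_id are project-specific helpers in ds whose implementation is not shown; presumably they map the sixteen MBTI type strings to integer ids and one-hot rows. A self-contained sketch of that kind of label encoding (the function names and return shapes here are assumptions, not the ds API):

import numpy as np

def labels_to_ids(all_labels, labels):
    # Stable mapping from label string to integer id.
    label_map = {lab: i for i, lab in enumerate(sorted(set(all_labels)))}
    return np.array([label_map[lab] for lab in labels]), label_map

def ids_to_one_hot(ids, num_classes):
    one_hot = np.zeros((len(ids), num_classes), dtype=np.float32)
    one_hot[np.arange(len(ids)), ids] = 1.0
    return one_hot

toy_labels = ["INTJ", "ENFP", "INTJ"]
ids, label_map = labels_to_ids(toy_labels, toy_labels)
print(ids)                                   # [1 0 1]
print(ids_to_one_hot(ids, len(label_map)))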
Example #6
def full_vocab_canon(x):
    # Build a vocabulary over the full canonicalized corpus (no size cap)
    vocab_mbti = vocabulary.Vocabulary(utils.canonicalize_word(w) for w in x)
    print("Full vocab built, size: ", vocab_mbti.size)
    return vocab_mbti.size, vocab_mbti
    def generate_vocab(self, max_size=None):
        self._generate_word_list()
        self.vocab = vocabulary.Vocabulary(self.word_list, size=max_size)