Example #1
    def __init__(self, V=10000):
        self.vocab = None
        self.zipped_filename = "data/sst/trainDevTestTrees_PTB.zip"
        self.target_names = None  # set by self.process()

        # Download datasets
        if not os.path.isfile(self.zipped_filename):
            data_dir = os.path.dirname(self.zipped_filename)
            print("Downloading treebank to {:s}".format(data_dir))
            self.zipped_filename = download_sst(data_dir)
        print("Loading SST from {:s}".format(self.zipped_filename))

        self.train_trees = self.get_trees("train")
        print("Training set:     {:,} trees".format(len(self.train_trees)))
        self.dev_trees = self.get_trees("dev")
        print("Development set:  {:,} trees".format(len(self.dev_trees)))
        self.test_trees = self.get_trees("test")
        print("Test set:         {:,} trees".format(len(self.test_trees)))

        # Verify that number of sentences matches the published size.
        assert (len(self.train_trees) == 8544)
        assert (len(self.dev_trees) == 1101)
        assert (len(self.test_trees) == 2210)

        # Build vocabulary over training set
        print("Building vocabulary - ", end="")
        train_words = utils.flatten(
            self.canonicalize(t.leaves()) for t in self.train_trees)
        self.vocab = vocabulary.Vocabulary(train_words, size=V)
        print("{:,} words".format(self.vocab.size))
Example #2

    def buildVocab(self,
                   vocabSize=None,
                   verbose=False,
                   return_vocab_objects=False):
        """
        Builds the vocabulary based on the initial data file
        
        vocabSize(int, default: None-all words) - max number of words to use for vocabulary
                                                  (only used for training)
        verbose(boolean, default: False)        - print extra info
        """
        print("----------------------------------------------------")
        print("building vocabulary from TRAINING data...")

        flatData = [w for w in zip(*utils.flatten(self.train_sentences))]

        # Note: these vocabularies also contain the <s>, </s>, and <unk> tags,
        # so their reported sizes are three larger than the raw word/tag counts.
        self.vocab = vocabulary.Vocabulary(flatData[0], size=vocabSize)
        self.posTags = vocabulary.Vocabulary(flatData[1])
        self.nerTags = vocabulary.Vocabulary(flatData[2])
        self.capitalTags = vocabulary.Vocabulary(flatData[3])

        if verbose:
            print(
                "vocabulary for words, posTags, nerTags built and stored in object"
            )
            print("vocab size =", vocabSize)
            print("10 sampled words from vocabulary\n",
                  list(self.vocab.wordset)[:10], "\n")
            print("number of unique pos Tags in training =", self.posTags.size)
            print("all posTags used\n", list(self.posTags.wordset), "\n")
            print("number of unique NER tags in training =", self.nerTags.size)
            print("all nerTags for prediction", list(self.nerTags.wordset),
                  "\n")
            print("number of unique capitalization tags in training =",
                  self.capitalTags.size)
            print('all capitalTags for prediction',
                  list(self.capitalTags.wordset), "\n")

        if return_vocab_objects:
            return self.vocab, self.posTags, self.nerTags, self.capitalTags
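
The zip(*utils.flatten(...)) expression transposes a flat list of (word, posTag, nerTag, capitalTag) tuples into four parallel sequences, one per vocabulary. A standalone sketch of that transpose with invented tokens (the toy sentences and the inline flatten are illustrations, not the real utils.flatten):

# Each sentence is assumed to be a list of (word, pos, ner, capital) tuples.
sentences = [[("John", "NNP", "PER", "CAP"), ("runs", "VBZ", "O", "LOW")],
             [("Paris", "NNP", "LOC", "CAP")]]
flat = [tok for sent in sentences for tok in sent]  # stand-in for utils.flatten
words, pos_tags, ner_tags, capital_tags = zip(*flat)
print(words)     # ('John', 'runs', 'Paris')
print(ner_tags)  # ('PER', 'O', 'LOC')
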
Example #3

    def setUp(self):
        sequence = ["a", "b", "c", "d"]
        self.vocab = vocabulary.Vocabulary(sequence)
        ids = self.vocab.words_to_ids(sequence)
        self.train_ids = np.array(ids * 50000, dtype=int)
        self.test_ids = np.array(ids * 100, dtype=int)

        model_params = dict(V=self.vocab.size,
                            H=10,
                            softmax_ns=2,
                            num_layers=1)
        self.lm = rnnlm.RNNLM(**model_params)
        self.lm.BuildCoreGraph()
        self.lm.BuildTrainGraph()
        self.lm.BuildSamplerGraph()
        # For toy model, ignore sampled softmax.
        self.lm.train_loss_ = self.lm.loss_
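
The fixture builds its id streams by repeating a four-element Python list, so the toy corpus is just "a b c d" cycled many times. A standalone sketch of that step (the concrete id values are illustrative; the real Vocabulary may reserve extra ids for special tokens):

import numpy as np

ids = [0, 1, 2, 3]                             # illustrative ids for "a b c d"
train_ids = np.array(ids * 50000, dtype=int)   # list repetition -> 200,000 ids
test_ids = np.array(ids * 100, dtype=int)      # 400 ids
print(train_ids.shape, test_ids.shape)         # (200000,) (400,)
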
Example #4
    def __init__(self, ndim=50):
        assert (ndim in self._AVAILABLE_DIMS)

        self.vocab = None
        self.W = None
        self.zipped_filename = "data/glove/glove.6B.zip"

        # Download datasets
        if not os.path.isfile(self.zipped_filename):
            data_dir = os.path.dirname(self.zipped_filename)
            print("Downloading GloVe vectors to {:s}".format(data_dir))
            self.zipped_filename = download_glove(data_dir)
        print("Loading vectors from {:s}".format(self.zipped_filename))

        words, W = parse_glove_file(self.zipped_filename, ndim)
        # Give the special tokens a nonzero embedding: the mean of all real word vectors
        mean_vec = np.mean(W[3:], axis=0)
        for i in range(3):
            W[i] = mean_vec
        self.W = W
        self.vocab = vocabulary.Vocabulary(words[3:])
        assert (self.vocab.size == self.W.shape[0])
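
The loop over the first three rows gives each special token the mean of all real word vectors, so they are nonzero but otherwise neutral. The same step as a standalone sketch with a random stand-in matrix (W here is fabricated; only the indexing mirrors the snippet):

import numpy as np

W = np.random.randn(1000, 50)      # stand-in for the parsed GloVe matrix
mean_vec = np.mean(W[3:], axis=0)  # mean over the real word vectors
W[:3] = mean_vec                   # same effect as the per-row loop above
print(np.allclose(W[0], W[2]))     # True: all three special rows share mean_vec
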
Example #5
    def process(self,
                train_test_split=0.9,
                train_validate_split=0.9,
                shuffle=True,
                input_length=500,
                sample=False):

        if sample:
            self.filepath = ['../reviews/Chicago_Illinois_3.csv']
        else:
            self.filepath = ['../reviews/*.csv']
        self.master_df = read_file(self.filepath)

        self.input_length = input_length

        # shuffle
        if shuffle:
            self.master_df = self.master_df.sample(frac=1)
            self.master_df = self.master_df.reset_index(drop=True)

        # split
        train_test_split_count = int(self.master_df.shape[0] *
                                     train_test_split)
        train_validate_split_count = int(train_test_split_count *
                                         train_validate_split)

        # train, validate and test dataframes (iloc slices are end-exclusive,
        # so the three splits do not overlap at the boundaries)
        self.train_df = self.master_df.iloc[:train_validate_split_count]
        self.validate_df = self.master_df.iloc[
            train_validate_split_count:train_test_split_count]
        self.test_df = self.master_df.iloc[train_test_split_count:]

        # build vocab over training set (join the review texts directly;
        # Series.to_string() would also emit the index and may truncate values)
        tokens = word_tokenize(
            " ".join(self.train_df.review.dropna().astype(str)).lower())
        print("Tokens: {}".format(len(set(tokens))))
        self.vocab = vocabulary.Vocabulary(
            set(utils.canonicalize_word(w) for w in tokens))
        print("Vocabulary: {:,} types".format(self.vocab.size))

        # build training, validation and test set
        self.train_features = []
        self.train_labels = []
        self.validate_features = []
        self.validate_labels = []
        self.test_features = []
        self.test_labels = []

        for i in range(self.master_df.shape[0]):

            if not isNaN(self.master_df.loc[i].review):

                tokens = word_tokenize(self.master_df.loc[i].review.lower())
                feature = self.vocab.words_to_ids(
                    utils.canonicalize_word(w, self.vocab) for w in tokens)

                # Collapse the 1-5 star rating into four classes:
                # {1, 2} -> 0, 3 -> 1, 4 -> 2, anything else (5) -> 3
                if self.master_df.loc[i].rating in (1, 2):
                    rating = 0
                elif self.master_df.loc[i].rating == 3:
                    rating = 1
                elif self.master_df.loc[i].rating == 4:
                    rating = 2
                else:
                    rating = 3

                if i < train_validate_split_count:
                    self.train_features.append(feature)
                    self.train_labels.append(rating)
                elif i < train_test_split_count:
                    self.validate_features.append(feature)
                    self.validate_labels.append(rating)
                else:
                    self.test_features.append(feature)
                    self.test_labels.append(rating)
            printProgressBar(i, self.master_df.shape[0] - 1)

        self.train_features = np.asarray(self.train_features)
        self.train_labels = np.asarray(self.train_labels)
        self.validate_features = np.asarray(self.validate_features)
        self.validate_labels = np.asarray(self.validate_labels)
        self.test_features = np.asarray(self.test_features)
        self.test_labels = np.asarray(self.test_labels)

        print("number of train_features = {}, train_labels = {}".format(
            len(self.train_features), len(self.train_labels)))
        print("number of validate_features = {}, validate_labels = {}".format(
            len(self.validate_features), len(self.validate_labels)))
        print("number of test_features = {}, test_labels = {}".format(
            len(self.test_features), len(self.test_labels)))

        self.padded_train_features, self.train_ns = utils.pad_np_array(
            self.train_features, self.input_length)
        self.padded_validate_features, self.validate_ns = utils.pad_np_array(
            self.validate_features, self.input_length)
        self.padded_test_features, self.test_ns = utils.pad_np_array(
            self.test_features, self.input_length)

        self.target_labels = [0, 1, 2, 3]

        return self
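
The if/elif ladder above collapses the 1-5 star scale into four classes ({1, 2} -> 0, 3 -> 1, 4 -> 2, 5 -> 3). The same mapping as a small standalone function (a sketch, not part of the original class):

def bin_rating(stars):
    # Mirror the rating ladder in process(): pair the two lowest star levels,
    # keep 3 and 4 as their own classes, and map everything else (5) to 3.
    if stars in (1, 2):
        return 0
    elif stars == 3:
        return 1
    elif stars == 4:
        return 2
    return 3

print([bin_rating(s) for s in (1, 2, 3, 4, 5)])   # [0, 0, 1, 2, 3]
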