def __init__(self, V=10000):
    self.vocab = None
    self.zipped_filename = "data/sst/trainDevTestTrees_PTB.zip"
    self.target_names = None  # set by self.process()

    # Download datasets
    if not os.path.isfile(self.zipped_filename):
        data_dir = os.path.dirname(self.zipped_filename)
        print("Downloading treebank to {:s}".format(data_dir))
        self.zipped_filename = download_sst(data_dir)
    print("Loading SST from {:s}".format(self.zipped_filename))

    self.train_trees = self.get_trees("train")
    print("Training set: {:,} trees".format(len(self.train_trees)))
    self.dev_trees = self.get_trees("dev")
    print("Development set: {:,} trees".format(len(self.dev_trees)))
    self.test_trees = self.get_trees("test")
    print("Test set: {:,} trees".format(len(self.test_trees)))

    # Verify that the number of sentences matches the published split sizes.
    assert len(self.train_trees) == 8544
    assert len(self.dev_trees) == 1101
    assert len(self.test_trees) == 2210

    # Build vocabulary over the training set
    print("Building vocabulary - ", end="")
    train_words = utils.flatten(
        self.canonicalize(t.leaves()) for t in self.train_trees)
    self.vocab = vocabulary.Vocabulary(train_words, size=V)
    print("{:,} words".format(self.vocab.size))
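# A minimal sketch of the vocabulary-building step above in isolation,
# assuming the project-local utils.flatten and vocabulary.Vocabulary behave
# as they are used in this repo (flatten concatenates the per-tree token
# lists; Vocabulary keeps at most `size` frequent types plus its special
# tokens). Real parse trees and canonicalization are replaced with plain
# token lists here; this is only an illustration, not part of the class.
toy_trees = [["the", "movie", "was", "great"],
             ["the", "plot", "was", "thin"]]
toy_words = utils.flatten(leaves for leaves in toy_trees)
toy_vocab = vocabulary.Vocabulary(toy_words, size=10000)
print("{:,} words".format(toy_vocab.size))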
def buildVocab(self, vocabSize=None, verbose=False, return_vocab_objects=False):
    """Builds the vocabularies from the initial (training) data file.

    vocabSize (int, default None = all words): max number of words to keep
        in the word vocabulary (only used for training).
    verbose (bool, default False): print extra info.
    return_vocab_objects (bool, default False): also return the four
        Vocabulary objects.
    """
    print("----------------------------------------------------")
    print("building vocabulary from TRAINING data...")
    flatData = [w for w in zip(*utils.flatten(self.train_sentences))]

    # Note: these vocabularies include the <s>, </s>, and <unk> tags, so the
    # reported sizes are 3 larger than the number of observed types.
    self.vocab = vocabulary.Vocabulary(flatData[0], size=vocabSize)
    self.posTags = vocabulary.Vocabulary(flatData[1])
    self.nerTags = vocabulary.Vocabulary(flatData[2])
    self.capitalTags = vocabulary.Vocabulary(flatData[3])

    if verbose:
        print("vocabulary for words, posTags, nerTags built and stored in object")
        print("vocab size =", self.vocab.size)
        print("10 sampled words from vocabulary\n",
              list(self.vocab.wordset)[:10], "\n")
        print("number of unique pos Tags in training =", self.posTags.size)
        print("all posTags used\n", list(self.posTags.wordset), "\n")
        print("number of unique NER tags in training =", self.nerTags.size)
        print("all nerTags for prediction", list(self.nerTags.wordset), "\n")
        print("number of unique capitalization tags in training =",
              self.capitalTags.size)
        print("all capitalTags for prediction",
              list(self.capitalTags.wordset), "\n")

    if return_vocab_objects:
        return self.vocab, self.posTags, self.nerTags, self.capitalTags
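# Sketch of what the zip(*utils.flatten(...)) line above does, assuming each
# sentence in train_sentences is a list of (word, posTag, nerTag, capitalTag)
# tuples: flattening and transposing yields four parallel columns, one per
# vocabulary to build. Pure-Python illustration with made-up tags.
toy_sentences = [
    [("John", "NNP", "PER", "Cap"), ("runs", "VBZ", "O", "lower")],
    [("Paris", "NNP", "LOC", "Cap")],
]
flat = [tok for sent in toy_sentences for tok in sent]   # stand-in for utils.flatten
columns = list(zip(*flat))
print(columns[0])   # words:   ('John', 'runs', 'Paris')
print(columns[2])   # nerTags: ('PER', 'O', 'LOC')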
def setUp(self):
    sequence = ["a", "b", "c", "d"]
    self.vocab = vocabulary.Vocabulary(sequence)
    ids = self.vocab.words_to_ids(sequence)
    self.train_ids = np.array(ids * 50000, dtype=int)
    self.test_ids = np.array(ids * 100, dtype=int)

    model_params = dict(V=self.vocab.size, H=10, softmax_ns=2, num_layers=1)
    self.lm = rnnlm.RNNLM(**model_params)
    self.lm.BuildCoreGraph()
    self.lm.BuildTrainGraph()
    self.lm.BuildSamplerGraph()
    # For the toy model, ignore sampled softmax and train on the full loss.
    self.lm.train_loss_ = self.lm.loss_
def __init__(self, ndim=50):
    assert ndim in self._AVAILABLE_DIMS
    self.vocab = None
    self.W = None
    self.zipped_filename = "data/glove/glove.6B.zip"

    # Download datasets
    if not os.path.isfile(self.zipped_filename):
        data_dir = os.path.dirname(self.zipped_filename)
        print("Downloading GloVe vectors to {:s}".format(data_dir))
        self.zipped_filename = download_glove(data_dir)
    print("Loading vectors from {:s}".format(self.zipped_filename))

    words, W = parse_glove_file(self.zipped_filename, ndim)
    # Set a nonzero value (the mean vector) for the special tokens.
    mean_vec = np.mean(W[3:], axis=0)
    for i in range(3):
        W[i] = mean_vec
    self.W = W
    self.vocab = vocabulary.Vocabulary(words[3:])
    assert self.vocab.size == self.W.shape[0]
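# Minimal numpy sketch of the special-token fill above: the first three rows
# of W are reserved (e.g. for <s>, </s>, <unk>, per the vocabulary comment
# earlier in this repo) and get the mean of the real word vectors, so they
# are nonzero but uninformative. A toy 5x3 matrix stands in for the real
# GloVe data; the "3 special rows" layout is taken from the loop above.
import numpy as np

W_toy = np.vstack([np.zeros((3, 3)),            # placeholder special-token rows
                   np.array([[1., 2., 3.],
                             [3., 2., 1.]])])   # "real" word vectors
mean_vec = np.mean(W_toy[3:], axis=0)
W_toy[:3] = mean_vec
print(W_toy[0])   # [2. 2. 2.]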
def process(self, train_test_split=0.9, train_validate_split=0.9,
            shuffle=True, input_length=500, sample=False):
    if sample:
        self.filepath = ['../reviews/Chicago_Illinois_3.csv']
    else:
        self.filepath = ['../reviews/*.csv']
    self.master_df = read_file(self.filepath)
    self.input_length = input_length

    # Shuffle
    if shuffle:
        self.master_df = self.master_df.sample(frac=1)
        self.master_df = self.master_df.reset_index(drop=True)

    # Split
    train_test_split_count = int(self.master_df.shape[0] * train_test_split)
    train_validate_split_count = int(train_test_split_count * train_validate_split)

    # Train, validation, and test dataframes
    self.train_df = self.master_df.loc[:train_validate_split_count]
    self.validate_df = self.master_df.loc[
        train_validate_split_count:train_test_split_count]
    self.test_df = self.master_df.loc[train_test_split_count:]

    # Build vocabulary over the training set
    tokens = word_tokenize(self.train_df.review.to_string().lower())
    print("Tokens: {}".format(len(set(tokens))))
    self.vocab = vocabulary.Vocabulary(
        set(utils.canonicalize_word(w) for w in tokens))
    print("Vocabulary: {:,} types".format(self.vocab.size))

    # Build training, validation, and test sets
    self.train_features = []
    self.train_labels = []
    self.validate_features = []
    self.validate_labels = []
    self.test_features = []
    self.test_labels = []
    for i in range(self.master_df.shape[0]):
        if not isNaN(self.master_df.loc[i].review):
            tokens = word_tokenize(self.master_df.loc[i].review.lower())
            feature = self.vocab.words_to_ids(
                utils.canonicalize_word(w, self.vocab) for w in tokens)
            # Collapse 5-star ratings into 4 classes:
            # {1, 2} -> 0, 3 -> 1, 4 -> 2, 5 -> 3
            if self.master_df.loc[i].rating in (1, 2):
                rating = 0
            elif self.master_df.loc[i].rating == 3:
                rating = 1
            elif self.master_df.loc[i].rating == 4:
                rating = 2
            else:
                rating = 3

            if i < train_validate_split_count:
                self.train_features.append(feature)
                self.train_labels.append(rating)
            elif i < train_test_split_count:
                self.validate_features.append(feature)
                self.validate_labels.append(rating)
            else:
                self.test_features.append(feature)
                self.test_labels.append(rating)
        printProgressBar(i, self.master_df.shape[0] - 1)

    self.train_features = np.asarray(self.train_features)
    self.train_labels = np.asarray(self.train_labels)
    self.validate_features = np.asarray(self.validate_features)
    self.validate_labels = np.asarray(self.validate_labels)
    self.test_features = np.asarray(self.test_features)
    self.test_labels = np.asarray(self.test_labels)

    print("number of train_features = {}, train_labels = {}".format(
        len(self.train_features), len(self.train_labels)))
    print("number of validate_features = {}, validate_labels = {}".format(
        len(self.validate_features), len(self.validate_labels)))
    print("number of test_features = {}, test_labels = {}".format(
        len(self.test_features), len(self.test_labels)))

    self.padded_train_features, self.train_ns = utils.pad_np_array(
        self.train_features, self.input_length)
    self.padded_validate_features, self.validate_ns = utils.pad_np_array(
        self.validate_features, self.input_length)
    self.padded_test_features, self.test_ns = utils.pad_np_array(
        self.test_features, self.input_length)

    self.target_labels = [0, 1, 2, 3]
    return self
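# Worked example of the nested split counts above with the default fractions:
# the test cut is taken off the full set first, then validation is carved out
# of the remaining training portion, so 0.9 * 0.9 = 81% of rows land in
# training (via the `i < ...` comparisons in the loop). Plain arithmetic
# only; 10,000 rows is an arbitrary example size, not a property of the data.
n_rows = 10000
train_test_split_count = int(n_rows * 0.9)                        # 9000
train_validate_split_count = int(train_test_split_count * 0.9)    # 8100
print(train_validate_split_count,                                 # 8100 train rows
      train_test_split_count - train_validate_split_count,        # 900 validation rows
      n_rows - train_test_split_count)                            # 1000 test rows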