def load(self):
    """Load the Stanford Sentiment Treebank dataset.

    Reads sentences from "sst_cleaned.txt" (tab-separated: sentence, score)
    and the train/val/test assignments from "sst_splits.txt", whose first
    line is a header. Populates ``self.splits = ([(train, val)], test)``
    where each element is a list of indices into the returned X/Y.

    :return: Tuple (X, Y) where
        X = ["The cat sat on the mat", ...]
        Y = labels: 0..4 when self.fine_grained_classes is True, otherwise
            0 (score <= 0.4) or 1 (score > 0.6); neutral sentences
            (0.4 < score <= 0.6) are dropped entirely in binary mode.
    """
    train, val, test = [], [], []
    X, Y = [], []
    idx = 0
    # 'with' guarantees both files are closed even if parsing raises
    # (the original left both handles open).
    with open(self.folder_path + "/sst_cleaned.txt") as f, \
         open(self.folder_path + "/sst_splits.txt") as f_splits:
        f_splits.readline()  # skip header line of the splits file
        for line in f:
            splitted = line.split("\t")
            split_split = f_splits.readline().strip().split(",")
            if self.clean_string:
                splitted[0] = dataset.clean_str(splitted[0].strip())
            score = float(splitted[1])  # hoisted: was converted up to 3x
            if self.fine_grained_classes:
                X.append(splitted[0])
                Y.append(int(math.floor(score * 5)))
            else:
                if score <= 0.4:
                    X.append(splitted[0])
                    Y.append(0)
                elif score > 0.6:
                    X.append(splitted[0])
                    Y.append(1)
                else:
                    # Neutral sentence: skip entirely — no split entry and
                    # no idx increment, so indices stay aligned with X/Y.
                    continue
            # NOTE(review): column 1 of the splits file appears to follow
            # the SST convention 1=train, 2=test, 3=val — confirm upstream.
            if split_split[1] == "1":
                train.append(idx)
            elif split_split[1] == "2":
                test.append(idx)
            elif split_split[1] == "3":
                val.append(idx)
            idx += 1
    random.shuffle(train)
    random.shuffle(val)
    self.splits = ([(train, val)], test)
    return (X, Y)
def predict(text, model, word2idx, max_len=62):
    """Predict and print the probability that a review is positive.

    :param text: raw review string
    :param model: PyTorch module producing 2-class logits for a (1, max_len)
        tensor of token ids
    :param word2idx: token -> id mapping; must contain '<pad>' and '<unk>'
    :param max_len: fixed sequence length fed to the model
    """
    # Tokenize, then truncate/pad to exactly max_len and encode.
    text = dataset.clean_str(text)
    tokens = word_tokenize(text.lower())
    # BUG FIX: inputs longer than max_len were previously passed through
    # unpadded and over-length; truncate before padding.
    tokens = tokens[:max_len]
    padded_tokens = tokens + ['<pad>'] * (max_len - len(tokens))
    input_id = [
        word2idx.get(token, word2idx['<unk>']) for token in padded_tokens
    ]

    # Convert to a PyTorch tensor with a batch dimension: shape (1, max_len).
    input_id = torch.tensor(input_id).unsqueeze(dim=0)

    # Inference only: disable gradient tracking, and call the module itself
    # (not .forward) so registered hooks run.
    with torch.no_grad():
        logits = model(input_id)

    # Softmax over the class dimension; probs[1] is P(positive).
    probs = F.softmax(logits, dim=1).squeeze(dim=0)

    print(f"This review is {probs[1] * 100:.2f}% positive.")
def load_file(f, y):
    # Append every line of the open file `f` to the enclosing X, pairing
    # each with the single label `y` in Y. Lines are optionally cleaned
    # (stripped + normalized) when self.clean_string is set; otherwise the
    # raw line (including its trailing newline) is kept, as before.
    for raw in f:
        text = dataset.clean_str(raw.strip()) if self.clean_string else raw
        X.append(text)
        Y.append(y)