Example #1
    def load(self):
        """
        Loads the Stanford Sentiment Treebank dataset, using the sentences
        from "sst_cleaned.txt" and the official split assignments from
        "sst_splits.txt".

        :return: Tuple: (X, Y) where
            X = [ "The cat sat on the mat",
                  "Another cat was also sitting on the mat",
                  ... ]

            Y = [ 0, 1, 1, 1, 0, 1, 0, ... ]
        """
        # Needs module-level imports: math, random, and the project-local
        # `dataset` module (which provides clean_str).
        train = []
        val = []
        test = []

        X = []
        Y = []
        idx = 0
        with open(self.folder_path + "/sst_cleaned.txt") as f, \
                open(self.folder_path + "/sst_splits.txt") as f_splits:
            f_splits.readline()  # skip the header row of the splits file

            for line in f:
                splitted = line.split("\t")  # [sentence, sentiment score]
                split_split = f_splits.readline().strip().split(",")

                if self.clean_string:
                    splitted[0] = dataset.clean_str(splitted[0].strip())

                if self.fine_grained_classes:
                    # Map the [0, 1] sentiment score onto five classes
                    # (0..4); min() clamps a score of exactly 1.0 into
                    # the top class.
                    X.append(splitted[0])
                    Y.append(min(4, int(math.floor(float(splitted[1]) * 5))))
                else:
                    # Binary setting: keep clearly negative (<= 0.4) and
                    # clearly positive (> 0.6) sentences, skip neutral ones.
                    if float(splitted[1]) <= 0.4:
                        X.append(splitted[0])
                        Y.append(0)
                    elif float(splitted[1]) > 0.6:
                        X.append(splitted[0])
                        Y.append(1)
                    else:
                        continue

                # Record which official split the kept sentence belongs to
                # (1 = train, 2 = test, 3 = val), in both class settings.
                if split_split[1] == "1":
                    train.append(idx)
                elif split_split[1] == "2":
                    test.append(idx)
                elif split_split[1] == "3":
                    val.append(idx)
                idx += 1

        random.shuffle(train)
        random.shuffle(val)

        self.splits = ([(train, val)], test)

        return (X, Y)
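
A minimal usage sketch for the method above, assuming it belongs to a loader class (hypothetically named SSTLoader here; the real class is not shown) whose constructor sets folder_path, clean_string and fine_grained_classes:

# SSTLoader is a hypothetical name for the class that owns load() above;
# its real name and constructor are not part of the example.
loader = SSTLoader(folder_path="data/sst",
                   clean_string=True,
                   fine_grained_classes=False)

X, Y = loader.load()

# loader.splits is ([(train_idx, val_idx)], test_idx): lists of indices
# into X and Y, as built by load() above.
(cv_folds, test_idx) = loader.splits
train_idx, val_idx = cv_folds[0]
X_train = [X[i] for i in train_idx]
Y_train = [Y[i] for i in train_idx]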
Example #2
import torch
import torch.nn.functional as F
from nltk.tokenize import word_tokenize

import dataset  # project-local module providing clean_str()


def predict(text, model, word2idx, max_len=62):
    """Predict the probability that a review is positive."""

    # Tokenize, truncate to max_len, pad, and encode the text
    text = dataset.clean_str(text)
    tokens = word_tokenize(text.lower())[:max_len]
    padded_tokens = tokens + ['<pad>'] * (max_len - len(tokens))
    input_id = [
        word2idx.get(token, word2idx['<unk>']) for token in padded_tokens
    ]

    # Convert to a PyTorch tensor with a batch dimension of 1
    input_id = torch.tensor(input_id).unsqueeze(dim=0)

    # Compute logits in evaluation mode, without tracking gradients
    model.eval()
    with torch.no_grad():
        logits = model(input_id)

    # Turn logits into class probabilities; probs[1] is P(positive)
    probs = F.softmax(logits, dim=1).squeeze(dim=0)

    print(f"This review is {probs[1] * 100:.2f}% positive.")
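
A short usage sketch, assuming NLTK's tokenizer data and the project-local dataset.clean_str helper are available; the tiny vocabulary and untrained stand-in model below are illustrative assumptions, not part of the example:

import torch.nn as nn

# Stand-in vocabulary; a real word2idx comes from the training pipeline.
word2idx = {'<pad>': 0, '<unk>': 1, 'great': 2, 'movie': 3}

# Stand-in two-class model: embedding -> mean pooling -> linear layer.
class TinyClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim=8):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.fc = nn.Linear(embed_dim, 2)

    def forward(self, x):
        return self.fc(self.embed(x).mean(dim=1))

model = TinyClassifier(len(word2idx))
predict("A great movie!", model, word2idx)  # prints an (untrained) estimate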
Example #3
    # Nested helper: appends each (optionally cleaned) line of file `f`
    # to X, with the same label `y` appended to Y; X, Y and self come
    # from the enclosing scope.
    def load_file(f, y):
        for line in f:
            if self.clean_string:
                line = dataset.clean_str(line.strip())
            X.append(line)
            Y.append(y)
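
For context, a sketch of how such a nested helper could sit inside an enclosing load() method; the class, file names and layout below are assumptions modeled on two-file positive/negative polarity datasets, not code from the example:

import dataset  # project-local module providing clean_str()

class PolarityLoader:
    def __init__(self, folder_path, clean_string=True):
        self.folder_path = folder_path
        self.clean_string = clean_string

    def load(self):
        X = []
        Y = []

        # The nested helper from the example: it closes over X, Y and self.
        def load_file(f, y):
            for line in f:
                if self.clean_string:
                    line = dataset.clean_str(line.strip())
                X.append(line)
                Y.append(y)  # same label y for every line in the file

        # Hypothetical file names: one file of negative lines, one of positive.
        with open(self.folder_path + "/negative.txt") as f:
            load_file(f, 0)
        with open(self.folder_path + "/positive.txt") as f:
            load_file(f, 1)

        return (X, Y)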