Example #1
        test_encodings = self.tokenizer(test_texts,
                                        truncation=True,
                                        padding=True)
        # Create IMDB dataset
        self.data_test = IMDBTransformerDataset(test_encodings, test_labels)
        train_ds = IMDBTransformerDataset(train_encodings, train_labels)
        self.data_train, self.data_val = random_split(train_ds, [20000, 5000])

    def __repr__(self) -> str:
        """Print infor about the dataset"""
        basic = f"IMDB Dataset\nNum classes: {len(self.mapping)}\nMapping: {self.mapping}\n"
        data = (
            f"Train/val/test sizes: {len(self.data_train), len(self.data_val), len(self.data_test)}"
        )
        return basic + data


def read_imdb_split(split_dir):
    split_dir = Path(split_dir)
    texts = []
    labels = []
    for label_dir in ["pos", "neg"]:
        for text_file in (split_dir / label_dir).iterdir():
            texts.append(text_file.read_text())
            labels.append(0 if label_dir == "neg" else 1)
    return texts, labels


if __name__ == "__main__":
    load_and_print_info(IMDBTransformer)
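
For context, IMDBTransformerDataset is used above but not defined in this excerpt. A minimal sketch of the standard pattern for wrapping Hugging Face tokenizer output in a torch Dataset, with the class name taken from the call sites (the real definition may differ):

import torch
from torch.utils.data import Dataset


class IMDBTransformerDataset(Dataset):
    """Pair tokenizer encodings with labels (sketch; name assumed from above)."""

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # Convert each encoding field (input_ids, attention_mask, ...) to a tensor.
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)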
Example #2
        tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
        train_encodings = tokenizer(train_texts, truncation=True, padding=True)
        test_encodings = tokenizer(test_texts, truncation=True, padding=True)
        # Create IMDB dataset
        self.data_test = IMDBDataset(test_encodings, test_labels)
        train_ds = IMDBDataset(train_encodings, train_labels)
        self.data_train, self.data_val = random_split(train_ds, [20000, 5000])

    def __repr__(self) -> str:
        """Print infor about the dataset"""
        basic = (
            f"IMDB Dataset\nNum classes: {len(self.mapping)}\nMapping: {self.mapping}\n"
        )
        data = f"Train/val/test sizes: {len(self.data_train), len(self.data_val), len(self.data_test)}"
        return basic + data


def read_imdb_split(split_dir):
    split_dir = Path(split_dir)
    texts = []
    labels = []
    for label_dir in ["pos", "neg"]:
        for text_file in (split_dir / label_dir).iterdir():
            texts.append(text_file.read_text())
            labels.append(0 if label_dir == "neg" else 1)
    return texts, labels


if __name__ == "__main__":
    load_and_print_info(IMDB)
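
read_imdb_split expects the directory layout of the aclImdb archive, where each split holds pos/ and neg/ folders with one review per file. A usage sketch (the paths below are hypothetical):

train_texts, train_labels = read_imdb_split("aclImdb/train")  # hypothetical path
test_texts, test_labels = read_imdb_split("aclImdb/test")
print(len(train_texts), len(test_texts))  # 25000 25000 for the full corpus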
Example #3
            val_df["labels"])
        test_texts, test_labels = list(test_df["sentences"]), list(
            test_df["labels"])

        train_encodings = self.tokenizer(train_texts,
                                         truncation=True,
                                         padding=True)
        val_encodings = self.tokenizer(val_texts,
                                       truncation=True,
                                       padding=True)
        test_encodings = self.tokenizer(test_texts,
                                        truncation=True,
                                        padding=True)
        # Create UIT-VSFC datasets
        self.data_train = UIT_VSFCTranformerDataset(train_encodings,
                                                    train_labels)
        self.data_val = UIT_VSFCTranformerDataset(val_encodings, val_labels)
        self.data_test = UIT_VSFCTranformerDataset(test_encodings, test_labels)

    def __repr__(self) -> str:
        """Print infor about the dataset"""
        basic = f"UIT-VSFC Dataset\nNum classes: {len(self.mapping)}\nMapping: {self.mapping}\n"
        data = (
            f"Train/val/test sizes: {len(self.data_train), len(self.data_val), len(self.data_test)}"
        )
        return basic + data


if __name__ == "__main__":
    load_and_print_info(UIT_VSFCTransformer)
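
None of the excerpts show how the resulting datasets are consumed downstream; a minimal sketch, assuming the usual torch DataLoader pattern (the batch size and helper name are illustrative, not from the source):

from torch.utils.data import DataLoader


def make_loader(dataset, batch_size=16, shuffle=True):
    # Each dataset item is a dict of tensors, so the default collate_fn
    # stacks them into batch["input_ids"], batch["attention_mask"], and
    # batch["labels"], ready for a Hugging Face sequence classifier.
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)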