Example #1
    def transform(self, X, y=None):

        logging.info("   DataVectorizer.transform running...")
        start_time = time.time()

        # Apply pre-processing function
        if self.preprocess_func:
            _X = parallelApply(X, self.preprocess_func, self.preprocess_ncore)
        else:
            _X = X
        if y is not None:
            if self.encode_label:
                _y = self.label_encoder.transform(y)
            else:
                _y = np.asarray(y)

        # transform sentences into lists of word indexes using the fitted vocabulary
        _X = self.texts_to_sequences(_X)

        logging.info("   DataVectorizer.transform completed - Time elapsed: " +
                     get_elapsed_time(start_time))

        if y is not None:
            return _X, _y
        else:
            return _X
Example #2
    def to_numpy(self, dest_folder: str):

        logging.info("Starting Data Export...")
        start_time = time.time()

        if self.tokenizer is not None:
            # Tokenizer.to_json() returns a JSON string, so write it to a file explicitly
            with open(os.path.join(dest_folder, "tokenizer.json"), "w",
                      encoding="utf8") as f:
                f.write(self.tokenizer.to_json())

        if self.X_train is not None:
            np.savez_compressed(
                os.path.join(dest_folder, "train_data_nn.npz"),
                X=self.X_train,
                y=self.y_train,
            )

        if self.X_test is not None:
            np.savez_compressed(
                os.path.join(dest_folder, "test_data_nn.npz"),
                X=self.X_test,
                y=self.y_test,
            )

        if self.X_val is not None:
            np.savez_compressed(os.path.join(dest_folder, "val_data_nn.npz"),
                                X=self.X_val,
                                y=self.y_val)

        logging.info("Data Export Completed - Time elapsed: " +
                     get_elapsed_time(start_time))
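Loading these archives back is symmetric; a minimal sketch (the file name matches the one written above, and the `.item()` unwrap is only needed when `X` was saved as a pickled scipy sparse matrix, as in the `from_numpy` example below):

import numpy as np

# Minimal loading sketch for the archive written by to_numpy() above.
train_npz = np.load("train_data_nn.npz", allow_pickle=True)
X_train = train_npz["X"]
y_train = train_npz["y"]

# A scipy sparse matrix comes back as a 0-d object array and must be unwrapped.
if X_train.dtype == object and X_train.shape == ():
    X_train = X_train.item()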
Example #3
    def fit_transform(self, X, y=None):
        logging.info("DataVectorizer.fit_transform running...")
        start_time = time.time()

        # Apply pre-processing function
        if self.preprocess_func:
            _X = parallelApply(X, self.preprocess_func, self.preprocess_ncore)
        else:
            _X = X
        if y is not None:
            if self.encode_label:
                self.label_encoder = LabelEncoder()
                _y = self.label_encoder.fit_transform(y)
                self.labels = self.label_encoder.classes_
            else:
                _y = np.asarray(y)

        if self.preprocess_func is None:
            re_tok = re.compile('([%s“”¨«»®´·º½¾¿¡§£₤‘’])' %
                                string.punctuation)
            tokenizer = lambda x: re_tok.sub(r' \1 ', x).split()
            self.vectorizer = CountVectorizer(tokenizer=tokenizer,
                                              ngram_range=self.ngram_range,
                                              min_df=self.min_df,
                                              max_df=self.max_df,
                                              max_features=self.max_features,
                                              stop_words=self.stop_words,
                                              lowercase=True)
        else:
            self.vectorizer = CountVectorizer(tokenizer=None,
                                              ngram_range=self.ngram_range,
                                              min_df=self.min_df,
                                              max_df=self.max_df,
                                              max_features=self.max_features,
                                              stop_words=self.stop_words)
        self.vectorizer.fit_transform(_X)
        kept_tokens = set(self.vectorizer.vocabulary_.keys())

        for token in kept_tokens:
            self.index2w.append(token)
            self.w2index[token] = len(self.index2w) - 1
        self.vocab_size = len(self.index2w)
        del kept_tokens

        # transform sentences into list of word indexes
        _X = self.texts_to_sequences(_X)

        logging.info(
            "   DataVectorizer.fit_transform completed - Time elapsed: " +
            get_elapsed_time(start_time))

        if y is not None:
            return _X, _y
        else:
            return _X
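The `texts_to_sequences` helper called above is not included in these examples; a plausible minimal sketch, assuming it maps each whitespace token to its index in the `w2index` vocabulary built by `fit_transform` (multi-word n-gram entries and anything pruned by the CountVectorizer would simply be skipped by this version):

    def texts_to_sequences(self, texts):
        # Hypothetical sketch: whitespace-tokenize each sentence and keep only
        # tokens that made it into the vocabulary.
        sequences = []
        for sentence in texts:
            tokens = sentence.lower().split()
            sequences.append([self.w2index[t] for t in tokens if t in self.w2index])
        return sequences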
Example #4
def get_pretrained_vecs(
    input_vec_file: str, target_vocab: dict, dim: int = 300, output_file=None
):

    logging.getLogger(__name__)
    start_time = time.time()
    found_words = 0
    missing_words = 0

    # import word vector text file into a pandas dataframe (quicker)
    df_wvecs = pd.read_csv(input_vec_file, sep=" ", quoting=3, header=None, index_col=0)

    # create word index dict from pretrained vectors
    w_index = df_wvecs.index.tolist()
    w_index = dict(zip(w_index, range(len(w_index))))
    np_w_vecs = df_wvecs.to_numpy()
    del df_wvecs

    # initialize embedding matrix weights
    emb_mean, emb_std = np_w_vecs.mean(), np_w_vecs.std()
    embedding_matrix = np.random.normal(emb_mean, emb_std, (len(target_vocab), dim))
    embedding_matrix[0] = np.zeros(dim)

    # recopy pretrained vect weights into embedding_matrix
    # TODO: vectorize the following code
    for k, v in tqdm(
        target_vocab.items(),
        desc="{} Processing pretrained vectors...".format(
            datetime.today().strftime("%Y-%m-%d %H:%M:%S")
        ),
        total=len(target_vocab),
    ):
        if k in w_index:
            found_words += 1
            embedding_matrix[v] = np_w_vecs[w_index[k]]
        else:
            missing_words += 1

    if output_file is not None:
        np.save(output_file, embedding_matrix)

    logging.info(
        "Matching words: {} - input vocab: {} - coverage: {}".format(
            found_words, len(target_vocab), found_words / len(target_vocab)
        )
    )
    logging.info(
        "Pretrained Vectors Preparation - Completed - Time elapsed: "
        + get_elapsed_time(start_time)
    )

    return embedding_matrix
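An illustrative call (the vector file path is an assumption; `target_vocab` would typically be the `w2index` mapping produced by `DataVectorizer.fit_transform`):

# Illustrative usage; the GloVe-style file path and variable names are assumptions.
embedding_matrix = get_pretrained_vecs(
    input_vec_file="glove.6B.300d.txt",     # one word + 300 floats per line
    target_vocab=vectorizer.w2index,        # token -> row index in the matrix
    dim=300,
    output_file="embedding_matrix.npy",
)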
Example #5
    def from_numpy(
        self,
        train_data_file: str,
        test_data_file: str = None,
        val_data_file: str = None,
        ds_type="TensorDataset",
    ):

        logging.info("Starting Data Preparation...")
        start_time = time.time()

        # tokenizer_from_json() expects a JSON string; assume the tokenizer.json
        # exported by to_numpy() sits next to the training data file
        tokenizer_path = os.path.join(os.path.dirname(train_data_file),
                                      "tokenizer.json")
        if os.path.exists(tokenizer_path):
            with open(tokenizer_path, "r", encoding="utf8") as f:
                self.tokenizer = text.tokenizer_from_json(f.read())

        train_npz = np.load(train_data_file, allow_pickle=True)
        self.X_train = train_npz["X"].item()
        self.y_train = train_npz["y"]

        self.num_classes = len(np.unique(self.y_train))
        self.vocab_size = np.shape(self.X_train)[1]

        train_ds = CSRDataset(self.X_train, self.y_train)

        if test_data_file is not None:
            test_npz = np.load(test_data_file, allow_pickle=True)
            self.X_test = test_npz["X"].item()
            self.y_test = test_npz["y"]

            test_ds = CSRDataset(self.X_test, self.y_test)

        if val_data_file is not None:
            val_npz = np.load(val_data_file, allow_pickle=True)
            self.X_val = val_npz["X"].item()
            self.y_val = val_npz["y"]

            val_ds = CSRDataset(self.X_val, self.y_val)

        logging.info("Data Import Completed - Time elapsed: " +
                     get_elapsed_time(start_time))

        if val_data_file is not None:
            if test_data_file is not None:
                return train_ds, val_ds, test_ds
            else:
                return train_ds, val_ds
        else:
            return train_ds
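`CSRDataset` is used here but not defined in these examples; a minimal sketch of what such a wrapper could look like (an assumption, not the original class), densifying one row per item so the full sparse matrix never has to be materialized:

import torch
from torch.utils.data import Dataset

class CSRDataset(Dataset):
    # Hypothetical sketch: Dataset over a scipy CSR matrix X and a label array y.
    def __init__(self, X_csr, y):
        self.X = X_csr
        self.y = y

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        # densify a single row on demand
        features = torch.from_numpy(self.X[idx].toarray().squeeze(0)).float()
        label = torch.tensor(self.y[idx]).long()
        return features, label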
Example #6
    def from_csv(
        self,
        train_file: str,
        test_file: str = None,
        val_file: str = None,
        val_size: float = 0.0,
        text_col_idx=0,
        label_col_idx=1,
        sep: str = ",",
        header=0,
        encoding: str = "utf8",
        preprocess_func=None,
        preprocess_ncore=2,
        ngram_range=(1, 3),
        min_df=1,
        max_df=1.0,
        stop_words="english",
        max_features=20000,
        ds_max_seq=1000,
        ds_type="TensorDataset",
    ):

        logging.info("Starting Data Preparation ...")
        logging.info("  Training Data ...")
        start_time = time.time()

        np.random.seed(self.seed)
        self.vectorizer = DataVectorizer(
            preprocess_func,
            preprocess_ncore,
            ngram_range=ngram_range,
            min_df=min_df,
            max_df=max_df,
            max_features=max_features,
            stop_words=stop_words,
        )

        self.train_file = train_file
        self.test_file = test_file
        self.val_file = val_file

        self.text_col_idx = text_col_idx
        self.label_col_idx = label_col_idx

        df = pd.read_csv(self.train_file,
                         sep=sep,
                         encoding=encoding,
                         header=header)
        X = df[df.columns[self.text_col_idx]].tolist()
        y = df[df.columns[self.label_col_idx]].to_numpy(dtype=int)
        del df

        if val_size > 0.0 and self.val_file is None:
            # create valid partition from train partition, keeping class distribution
            X_train, X_val, y_train, y_val = train_test_split(
                X, y, stratify=y, test_size=val_size, random_state=self.seed)
            X = X_train
            y = y_train
            del X_train, y_train
            gc.collect()

        # Input features
        X = self.vectorizer.fit_transform(X)
        self.X_train = sequence.pad_sequences(X,
                                              maxlen=ds_max_seq,
                                              padding="post")
        self.y_train = y
        del X, y
        gc.collect()

        self.vocab_size = self.vectorizer.vocab_size
        self.vocab = self.vectorizer.w2index
        self.num_classes = len(list(set(self.y_train)))

        if ds_type == "TensorDataset":
            train_ds = TensorDataset(
                torch.from_numpy(self.X_train).long(),
                torch.from_numpy(self.y_train).long(),
            )
        else:
            train_ds = NNDataset(self.X_train,
                                 self.y_train,
                                 max_seq=ds_max_seq)

        if self.test_file is not None:
            logging.info("  Test Data ...")
            df = pd.read_csv(self.test_file,
                             sep=sep,
                             encoding=encoding,
                             header=header)
            X = df[df.columns[self.text_col_idx]].tolist()
            y = df[df.columns[self.label_col_idx]].to_numpy(dtype=int)
            del df

            X = self.vectorizer.transform(X)
            # pad the same way as the training data
            self.X_test = sequence.pad_sequences(X,
                                                 maxlen=ds_max_seq,
                                                 padding="post")
            self.y_test = y
            del X, y
            gc.collect()

            if ds_type == "TensorDataset":
                test_ds = TensorDataset(
                    torch.from_numpy(self.X_test).long(),
                    torch.from_numpy(self.y_test).long(),
                )
            else:
                test_ds = NNDataset(self.X_test,
                                    self.y_test,
                                    max_seq=ds_max_seq)

        if (val_size > 0.0
                and self.val_file is None) or self.val_file is not None:
            logging.info("  Validation Data ...")
            if self.val_file is not None:
                df = pd.read_csv(self.val_file,
                                 sep=sep,
                                 encoding=encoding,
                                 header=header)
                X_val = df[df.columns[self.text_col_idx]].tolist()
                y_val = df[df.columns[self.label_col_idx]].to_numpy(dtype=int)
                del df
            X_val = self.vectorizer.transform(X_val)
            # pad the same way as the training data
            self.X_val = sequence.pad_sequences(X_val,
                                                maxlen=ds_max_seq,
                                                padding="post")
            self.y_val = y_val
            del X_val, y_val
            gc.collect()

            if ds_type == "TensorDataset":
                val_ds = TensorDataset(
                    torch.from_numpy(self.X_val).long(),
                    torch.from_numpy(self.y_val).long(),
                )
            else:
                val_ds = NNDataset(self.X_val, self.y_val, max_seq=ds_max_seq)

        logging.info("Data Preparation Completed - Time elapsed: " +
                     get_elapsed_time(start_time))

        self.params = {
            "seed": self.seed,
            "train_file": self.train_file,
            "test_file": self.test_file,
            "val_file": self.val_file,
            "vocabulary_size": self.vocab_size,
            "preprocess_ncore": preprocess_ncore,
            "stop_words": stop_words,
            "max_features": max_features,
            "ngram_range": ngram_range,
            "min_df": min_df,
            "max_df": max_df,
            "ds_max_seq": ds_max_seq,
            "num_classes": self.num_classes
        }
        if preprocess_func is not None:
            self.params.update({"preprocess_func": preprocess_func.__name__})
        else:
            self.params.update({"preprocess_func": None})

        if (val_size > 0.0
                and self.val_file is None) or self.val_file is not None:
            if self.test_file is not None:
                return train_ds, test_ds, val_ds
            else:
                return train_ds, val_ds
        else:
            return train_ds
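Illustrative usage (the builder class name and CSV paths are assumptions), wiring the returned datasets into PyTorch DataLoaders:

from torch.utils.data import DataLoader

# Hypothetical: class name and file paths are illustrative only.
builder = NNDataBuilder(seed=42)
train_ds, test_ds, val_ds = builder.from_csv(
    "train.csv",
    test_file="test.csv",
    val_file="val.csv",
    ngram_range=(1, 2),
    max_features=20000,
    ds_max_seq=400,
)
train_dl = DataLoader(train_ds, batch_size=64, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=64, shuffle=False)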
Example #7
    def from_csv(
        self,
        train_file: str,
        test_file: str = None,
        val_file: str = None,
        val_size: float = 0.0,
        text_col_idx=0,
        label_col_idx=1,
        sep: str = ",",
        header=0,
        encoding: str = "utf8",
        ngram_range=(1, 2),
        min_df=5,
        max_df=0.9,
        use_idf: bool = True,
        sublinear_tf: bool = False,
        norm="l2",
        binary=False,
        max_features=None,
        stop_words=None,
        preprocess_func=None,
        preprocess_ncore=2,
    ):

        logging.info("Starting Data Preparation...")
        start_time = time.time()

        self.train_file = train_file
        self.test_file = test_file
        self.val_file = val_file

        self.text_col_idx = text_col_idx
        self.label_col_idx = label_col_idx

        re_tok = re.compile("([%s“”¨«»®´·º½¾¿¡§£₤‘’])" % string.punctuation)
        tokenizer = lambda x: re_tok.sub(r" \1 ", x).split()

        self.vectorizer = TfidfVectorizer(
            use_idf=use_idf,
            tokenizer=tokenizer,
            ngram_range=ngram_range,
            min_df=min_df,
            max_df=max_df,
            sublinear_tf=sublinear_tf,
            norm=norm,
            binary=binary,
            max_features=max_features,
            stop_words=stop_words,
        )

        df = pd.read_csv(self.train_file,
                         sep=sep,
                         encoding=encoding,
                         header=header)
        if preprocess_func is not None:
            df[df.columns[self.text_col_idx]] = parallelApply(
                df[df.columns[self.text_col_idx]], preprocess_func,
                preprocess_ncore)
        X = df[df.columns[self.text_col_idx]].tolist()
        y = df[df.columns[self.label_col_idx]].to_numpy(float)
        del df

        self.X_train = self.vectorizer.fit_transform(X)
        self.y_train = y
        self.vocab_size = len(self.vectorizer.vocabulary_)
        self.num_classes = len(np.unique(self.y_train))
        del X, y

        train_ds = CSRDataset(self.X_train, self.y_train)
        gc.collect()

        if self.test_file is not None:
            df = pd.read_csv(self.test_file,
                             sep=sep,
                             encoding=encoding,
                             header=header)
            if preprocess_func is not None:
                df[df.columns[self.text_col_idx]] = parallelApply(
                    df[df.columns[self.text_col_idx]], preprocess_func,
                    preprocess_ncore)
            X = df[df.columns[self.text_col_idx]].tolist()
            y = df[df.columns[self.label_col_idx]].to_numpy(float)
            del df
            self.X_test = self.vectorizer.transform(X)
            self.y_test = y
            del X, y
            test_ds = CSRDataset(self.X_test, self.y_test)
            gc.collect()

        if self.val_file is not None:  # or val_size > 0.0:
            df = pd.read_csv(self.val_file,
                             sep=sep,
                             encoding=encoding,
                             header=header)
            if preprocess_func is not None:
                df[df.columns[self.text_col_idx]] = parallelApply(
                    df[df.columns[self.text_col_idx]], preprocess_func,
                    preprocess_ncore)
            X = df[df.columns[self.text_col_idx]].tolist()
            y = df[df.columns[self.label_col_idx]].to_numpy(float)
            del df
            self.X_val = self.vectorizer.transform(X)
            self.y_val = y
            del X, y
            val_ds = CSRDataset(self.X_val, self.y_val)

            gc.collect()

        logging.info("Data Preparation Completed - Time elapsed: " +
                     get_elapsed_time(start_time))

        if self.val_file is not None:
            if self.test_file is not None:
                return train_ds, test_ds, val_ds
            else:
                return train_ds, val_ds
        else:
            return train_ds
Example #8
    def from_csv(
        self,
        train_file: str,
        test_file: str = None,
        val_file: str = None,
        val_size: float = 0.1,
        text_col_idx=0,
        label_col_idx=1,
        sep: str = ",",
        header=0,
        encoding: str = "utf8",
        preprocess_func=None,
        preprocess_ncore=2,
        ngram_range=(1, 3),
        max_features=20000,
        ds_max_seq=1000,
        ds_type="TensorDataset",
    ):

        logging.info("Starting Data Preparation...")
        start_time = time.time()

        self.train_file = train_file
        self.test_file = test_file
        self.val_file = val_file

        self.text_col_idx = text_col_idx
        self.label_col_idx = label_col_idx

        df = pd.read_csv(self.train_file,
                         sep=sep,
                         encoding=encoding,
                         header=header)
        if preprocess_func is not None:
            df[df.columns[self.text_col_idx]] = parallelApply(
                df[df.columns[self.text_col_idx]], preprocess_func,
                preprocess_ncore)
        X = df[df.columns[self.text_col_idx]].tolist()
        y = df[df.columns[self.label_col_idx]].to_numpy(dtype=int)

        del df
        logging.info("Adding 1-gram features".format(ngram_range[1]))
        self.tokenizer = Tokenizer(num_words=max_features,
                                   lower=False,
                                   filters="")
        self.tokenizer.fit_on_texts(X)
        self.X_train = self.tokenizer.texts_to_sequences(X)

        if ngram_range[1] > 1:
            logging.info("Adding N-gram features".format(ngram_range[1]))
            # Create set of unique n-gram from the training set.
            ngram_set = set()
            for input_list in self.X_train:
                for i in range(2, ngram_range[1] + 1):
                    set_of_ngram = self.create_ngram_set(input_list,
                                                         ngram_value=i)
                    ngram_set.update(set_of_ngram)

            # Dictionary mapping n-gram token to a unique integer.
            # Integer values are greater than max_features in order
            # to avoid collision with existing features.
            start_index = max_features + 1
            token_indice = {
                v: k + start_index
                for k, v in enumerate(ngram_set)
            }
            indice_token = {token_indice[k]: k for k in token_indice}

            # max_features is the highest integer that could be found in the dataset.
            max_features = np.max(list(indice_token.keys())) + 1

            # Augmenting input tokens with n-grams features
            self.X_train = self.add_ngram(self.X_train, token_indice,
                                          ngram_range[1])

        self.X_train = sequence.pad_sequences(self.X_train, maxlen=ds_max_seq)
        self.y_train = y

        self.vocab_size = max_features
        logging.info("Building final vocab...")
        vocab_wrd_idx = {idx for sent in self.X_train for idx in sent}
        self.vocab = {
            self.tokenizer.index_word[i]: i
            for i in vocab_wrd_idx if i in self.tokenizer.index_word
        }
        self.num_classes = len(np.unique(self.y_train))
        del X, y
        gc.collect()

        if ds_type == "TensorDataset":
            train_ds = TensorDataset(
                torch.from_numpy(self.X_train).long(),
                torch.from_numpy(self.y_train).long(),
            )
        else:
            train_ds = NNDataset(self.X_train,
                                 self.y_train,
                                 max_seq=ds_max_seq)

        if self.test_file is not None:
            df = pd.read_csv(self.test_file,
                             sep=sep,
                             encoding=encoding,
                             header=header)
            if preprocess_func is not None:
                df[df.columns[self.text_col_idx]] = parallelApply(
                    df[df.columns[self.text_col_idx]], preprocess_func,
                    preprocess_ncore)
            X = df[df.columns[self.text_col_idx]].tolist()
            y = df[df.columns[self.label_col_idx]].to_numpy(dtype=int)
            del df
            self.X_test = self.tokenizer.texts_to_sequences(X)
            if ngram_range[1] > 1:
                self.X_test = self.add_ngram(self.X_test, token_indice,
                                             ngram_range[1])
            self.X_test = sequence.pad_sequences(self.X_test,
                                                 maxlen=ds_max_seq)
            self.y_test = y
            del X, y
            gc.collect()
            if ds_type == "TensorDataset":
                test_ds = TensorDataset(
                    torch.from_numpy(self.X_test).long(),
                    torch.from_numpy(self.y_test).long(),
                )
            else:
                test_ds = NNDataset(self.X_test,
                                    self.y_test,
                                    max_seq=ds_max_seq)

        if self.val_file is not None:
            df = pd.read_csv(self.val_file, sep=sep, encoding=encoding,
                             header=header)
            if preprocess_func is not None:
                df[df.columns[self.text_col_idx]] = parallelApply(
                    df[df.columns[self.text_col_idx]], preprocess_func,
                    preprocess_ncore)
            X = df[df.columns[self.text_col_idx]].tolist()
            y = df[df.columns[self.label_col_idx]].to_numpy(dtype=int)
            del df
            self.X_val = self.tokenizer.texts_to_sequences(X)
            if ngram_range[1] > 1:
                self.X_val = self.add_ngram(self.X_val, token_indice,
                                            ngram_range[1])
            self.X_val = sequence.pad_sequences(self.X_val, maxlen=ds_max_seq)
            self.y_val = y
            del X, y
            gc.collect()

            if ds_type == "TensorDataset":
                val_ds = TensorDataset(
                    torch.from_numpy(self.X_val).long(),
                    torch.from_numpy(self.y_val).long(),
                )
            else:
                val_ds = NNDataset(self.X_val, self.y_val, max_seq=ds_max_seq)

        logging.info("Data Preparation Completed - Time elapsed: " +
                     get_elapsed_time(start_time))

        if self.val_file is not None:
            if self.test_file is not None:
                return train_ds, test_ds, val_ds
            else:
                return train_ds, val_ds
        else:
            return train_ds
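`create_ngram_set` and `add_ngram` are referenced above but not shown; they follow the well-known Keras FastText n-gram recipe. A sketch of what they are commonly implemented as (an assumption here, written as methods to match the call sites):

    def create_ngram_set(self, input_list, ngram_value=2):
        # Set of n-grams (as tuples of word indexes) found in one sequence.
        return set(zip(*[input_list[i:] for i in range(ngram_value)]))

    def add_ngram(self, sequences, token_indice, ngram_range=2):
        # Append the integer id of every known n-gram to each sequence.
        new_sequences = []
        for input_list in sequences:
            new_list = list(input_list)
            for ngram_value in range(2, ngram_range + 1):
                for i in range(len(input_list) - ngram_value + 1):
                    ngram = tuple(input_list[i:i + ngram_value])
                    if ngram in token_indice:
                        new_list.append(token_indice[ngram])
            new_sequences.append(new_list)
        return new_sequences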
Example #9
    def from_csv(
        self,
        train_file: str,
        test_file: str = None,
        val_file: str = None,
        val_size: float = 0.0,
        text_col_idx=0,
        label_col_idx=1,
        sep: str = ",",
        header=0,
        encoding: str = "utf8",
        ngram_range=(1, 3),
        min_df=1,
        max_df=1.0,
        use_idf: bool = False,
        sublinear_tf: bool = False,
        norm="l2",
        binary=False,
        max_features=None,
        stop_words=None,
        preprocess_func=None,
        preprocess_ncore=2,
        ds_max_seq=1000,
        ds_type="Dataset",
    ):

        logging.info("Starting Data Preparation...")
        logging.info("  Training Data ...")
        start_time = time.time()

        self.train_file = train_file
        self.test_file = test_file
        self.val_file = val_file

        self.text_col_idx = text_col_idx
        self.label_col_idx = label_col_idx

        re_tok = re.compile("([%s“”¨«»®´·º½¾¿¡§£₤‘’])" % string.punctuation)
        tokenizer = lambda x: re_tok.sub(r" \1 ", x).split()
        self.vectorizer = TfidfVectorizer(
            use_idf=use_idf,
            tokenizer=tokenizer,
            ngram_range=ngram_range,
            min_df=min_df,
            max_df=max_df,
            sublinear_tf=sublinear_tf,
            norm=norm,
            binary=binary,
            max_features=max_features,
            stop_words=stop_words,
        )

        df = pd.read_csv(self.train_file,
                         sep=sep,
                         encoding=encoding,
                         header=header)
        if preprocess_func is not None:
            df[df.columns[self.text_col_idx]] = parallelApply(
                df[df.columns[self.text_col_idx]], preprocess_func,
                preprocess_ncore)
        X = df[df.columns[self.text_col_idx]].tolist()
        y = df[df.columns[self.label_col_idx]].to_numpy(int)
        del df
        self.X_train = self.vectorizer.fit_transform(X)
        self.y_train = y
        self.vocab_size = len(self.vectorizer.vocabulary_)
        self.num_classes = len(np.unique(self.y_train))
        self.X_train_words_seq, _ = self._bow2adjlist(self.X_train,
                                                      max_seq=ds_max_seq)
        self.r = np.column_stack([
            self.calc_r(i, self.X_train, self.y_train)
            for i in range(self.num_classes)
        ])
        del X, y

        train_ds = TensorDataset(
            torch.from_numpy(self.X_train_words_seq.toarray()).long(),
            torch.from_numpy(self.y_train).long(),
        )
        gc.collect()

        if self.test_file is not None:
            logging.info("  Test Data ...")
            df = pd.read_csv(self.test_file,
                             sep=sep,
                             encoding=encoding,
                             header=header)
            if preprocess_func is not None:
                df[df.columns[self.text_col_idx]] = parallelApply(
                    df[df.columns[self.text_col_idx]], preprocess_func,
                    preprocess_ncore)
            X = df[df.columns[self.text_col_idx]].tolist()
            y = df[df.columns[self.label_col_idx]].to_numpy(int)
            del df
            self.X_test = self.vectorizer.transform(X)
            self.y_test = y
            self.X_test_words_seq, _ = self._bow2adjlist(self.X_test,
                                                         max_seq=ds_max_seq)
            del X, y
            test_ds = TensorDataset(
                torch.from_numpy(self.X_test_words_seq.toarray()).long(),
                torch.from_numpy(self.y_test).long(),
            )
            gc.collect()

        if self.val_file is not None:
            logging.info("  Validation Data ...")
            df = pd.read_csv(self.val_file,
                             sep=sep,
                             encoding=encoding,
                             header=header)
            if preprocess_func is not None:
                df[df.columns[self.text_col_idx]] = parallelApply(
                    df[df.columns[self.text_col_idx]], preprocess_func,
                    preprocess_ncore)
            X = df[df.columns[self.text_col_idx]].tolist()
            y = df[df.columns[self.label_col_idx]].to_numpy(int)
            del df
            self.X_val = self.vectorizer.transform(X)
            self.y_val = y
            self.X_val_words_seq, _ = self._bow2adjlist(self.X_val,
                                                        max_seq=ds_max_seq)
            del X, y
            val_ds = TensorDataset(
                torch.from_numpy(self.X_val_words_seq.toarray()).long(),
                torch.from_numpy(self.y_val).long(),
            )
            gc.collect()

        logging.info("Data Preparation Completed - Time elapsed: " +
                     get_elapsed_time(start_time))
        if self.val_file is not None:
            if self.test_file is not None:
                return self.r, train_ds, test_ds, val_ds
            else:
                return self.r, train_ds, val_ds
        else:
            return self.r, train_ds
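`calc_r` is not shown in these examples; the per-class column stacking above suggests NB-SVM style log-count ratios. A plausible sketch under that assumption (hypothetical, including the `alpha` smoothing term):

    def calc_r(self, class_idx, X, y, alpha=1.0):
        # Hypothetical sketch: naive-Bayes log-count ratio of each feature for
        # one class versus the rest (NB-SVM style).
        p = np.asarray(X[y == class_idx].sum(axis=0)).ravel() + alpha
        q = np.asarray(X[y != class_idx].sum(axis=0)).ravel() + alpha
        return np.log((p / p.sum()) / (q / q.sum()))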
Example #10
    def train_evaluate(
        self,
        seed=42,
        check_dl=True,
        run_lr_finder=False,
        show_lr_plot: bool = False,
    ):

        set_seed(seed)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.train_dl = DataLoader(self.train_ds,
                                   batch_size=self.batch_size,
                                   shuffle=True)
        if self.test_ds is not None:
            self.test_dl = DataLoader(self.test_ds,
                                      batch_size=self.batch_size,
                                      shuffle=False)
        if self.val_ds is not None:
            self.val_dl = DataLoader(self.val_ds,
                                     batch_size=self.batch_size,
                                     shuffle=False)

        self.model = self.model.to(device)
        self.criterion = self.criterion.to(device)

        if run_lr_finder:
            logging.info("LR Finder Running....")
            lr_finder = LRFinder(self.model,
                                 self.optimizer,
                                 criterion=self.criterion,
                                 device=device)
            lr_finder.range_test(self.train_dl,
                                 start_lr=10e-6,
                                 end_lr=1,
                                 num_iter=100)
            lr_finder.plot(
                show=show_lr_plot,
                output_path="LR_finder_{}_{}.png".format(
                    self.model.__class__.__name__,
                    datetime.now().strftime("%Y%m%d_%H%M%S"),
                ),
            )
            logging.info("LR Finder Run Completed....")

        # Checking the dataloaders
        if check_dl:
            for data, labels in self.train_dl:
                logging.info("----------------------------------")
                logging.info("---       DATALOADER INFO      ---")
                logging.info("----------------------------------")
                logging.info("Train DataLoader Details:")
                logging.info("   batch dimensions: {}".format(data.shape))
                logging.info("   label dimensions: {}".format(labels.shape))
                break

            for data, labels in self.val_dl:
                logging.info("Val DataLoader Details:")
                logging.info("   batch dimensions: {}".format(data.shape))
                logging.info("   label dimensions: {}".format(labels.shape))
                break

        logging.info("----------------------------------")
        logging.info("---       MODEL TRAINING       ---")
        logging.info("----------------------------------")
        model_parameters_count = sum(p.numel()
                                     for p in self.model.parameters()
                                     if p.requires_grad)

        n_iters = round(len(self.train_ds) / self.batch_size)
        logging.info("Number of iterations/epoch : {}".format(n_iters))
        log_interval = self.log_interval

        # Loop over epochs
        start_time = time.time()
        for epoch in range(self.n_epochs):
            train_losses = []
            losses = []
            epoch_start_time = time.time()
            self.model.train()
            for batch_index, (batch_train_data,
                              batch_train_labels) in enumerate(self.train_dl):

                # transfer data to target device
                batch_train_data = batch_train_data.to(device)
                batch_train_labels = batch_train_labels.to(device)

                # zero the parameter gradients
                self.optimizer.zero_grad()

                # forward pass
                outputs = self.model(batch_train_data)
                loss = self.criterion(outputs, batch_train_labels)
                # Store loss values
                self.all_train_loss_hist.append(loss.item())
                losses.append(loss.item())

                # Computes gradient
                if self.apex:
                    with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

                # Gradient Clipping
                if self.max_grad_clip_norm is not None:
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                                   self.max_grad_clip_norm)

                # Update model parameters
                self.optimizer.step()

                # Adjust learning rate / scheduler if specified
                if self.lr_scheduler is not None:
                    self.lr_scheduler.step()

                # Report intermediate loss value after a certain amount of batches
                if batch_index % log_interval == 0:
                    avg_train_loss = np.mean(losses)
                    train_losses.append(avg_train_loss)
                    logging.info(
                        "   Info | Epoch: %03d/%03d | Batch %04d/%04d | Average Loss: %.6f"
                        % (epoch + 1, self.n_epochs, batch_index + 1, n_iters,
                           avg_train_loss))
                    losses = []

            logging.info("   Info | " + get_gpu_info(device))

            # End of epoch - Evaluate the model performance
            self.model.eval()
            with torch.set_grad_enabled(False):  # save memory during inference
                logging.info("Epoch: %03d/%03d | Train Accuracy: %.6f" % (
                    epoch + 1,
                    self.n_epochs,
                    compute_accuracy(self.model, self.train_dl, device=device),
                ))
                val_acc = compute_accuracy(self.model,
                                           self.val_dl,
                                           device=device)
                logging.info("Epoch: %03d/%03d | Val accuracy: %.6f" %
                             (epoch + 1, self.n_epochs, val_acc))
                logging.info("Epoch: %03d/%03d | Epoch duration: %s" %
                             (epoch + 1, self.n_epochs,
                              get_elapsed_time(epoch_start_time)))
                logging.info(
                    "Epoch: %03d/%03d | Total time elapsed: %s" %
                    (epoch + 1, self.n_epochs, get_elapsed_time(start_time)))

                # early stopping & checkpoint
                current_score = val_acc
                if self.best_score is None:
                    self.best_score = current_score.to(
                        torch.device("cpu")).numpy()
                    self.best_epoch = epoch + 1
                    self.save_checkpoint()
                elif (self.apply_early_stopping and current_score <
                      self.best_score + self.es_improvement_delta):
                    self.es_counter += 1
                    logging.info(
                        f"EarlyStopping patience counter: {self.es_counter} out of {self.es_patience}"
                    )
                    if self.es_counter >= self.es_patience:
                        self.early_stop = True
                        logging.warning(
                            "/!\\ Early stopping model training /!\\")
                        break
                else:
                    self.best_score = current_score
                    self.best_epoch = epoch + 1
                    self.save_checkpoint()
                    self.es_counter = 0

        # Final results
        logging.info("------------------------------------------")
        logging.info("---              SUMMARY               ---")
        logging.info("------------------------------------------")
        logging.info(
            "Number of model parameters : {}".format(model_parameters_count))
        logging.info("Total Training Time: {}".format(
            get_elapsed_time(start_time)))
        logging.info("Total Time: {}".format(get_elapsed_time(start_time)))
        logging.info("Best Epoch: {} - Accuracy Score: {:.6f}".format(
            self.best_epoch, self.best_score))
        logging.info("------------------------------------------")