Example #1
    def split_data(self, dataset, y_label, train_size = .8, random_state = 0, shuffle = True):
        """
        Split the data into train and test sets, using the appropriate library for each compute mode.
        CPU compute - uses sklearn; the y_label column is filtered out manually in the split call.
        GPU compute - single GPU uses cuml and multi GPU uses dask; both split off y_label internally.

        Parameters
        ----------
        dataset : dataframe
                  The dataframe on which we wish to perform the split
        y_label : string
                  The name of the column (not the series itself)
        train_size : float
                     The fraction of the data used for training. Takes values between 0 and 1.
        random_state : int
                       Seed used to make splits reproducible.
        shuffle : bool
                  Specifies whether the data should be shuffled before splitting.

        Returns
        ----------
        X_train : dataframe
                  The data to be used for training. Has same type as input dataset.
        X_test : dataframe
                  The data to be used for testing. Has same type as input dataset.
        y_train : dataframe
                  The labels to be used for training. Has same type as input dataset.
        y_test : dataframe
                  The labels to be used for testing. Has same type as input dataset.
        duration : float
                   The time it took to perform the split
        """
        self.log_to_file('\n> Splitting train and test data')
        start_time = time.perf_counter()

        with PerfTimer() as split_timer:
            if 'CPU' in self.compute_type:
                X_train, X_test, y_train, y_test = sklearn_train_test_split(dataset.loc[:, dataset.columns != y_label],
                                                                            dataset[y_label],
                                                                            train_size = train_size,
                                                                            shuffle = shuffle,
                                                                            random_state = random_state)

            elif 'GPU' in self.compute_type:
                if 'single' in self.compute_type:
                    X_train, X_test, y_train, y_test = cuml_train_test_split(X = dataset,
                                                                             y = y_label,
                                                                             train_size = train_size,
                                                                             shuffle = shuffle,
                                                                             random_state = random_state) 
                elif 'multi' in self.compute_type:
                    X_train, X_test, y_train, y_test = dask_train_test_split(dataset,
                                                                             y_label,
                                                                             train_size = train_size,
                                                                             shuffle = False, # shuffle not available for dask_cudf yet
                                                                             random_state = random_state)
        
        self.log_to_file(f'\n\tX_train shape and type: {X_train.shape} {type(X_train)}')
        self.log_to_file(f'\n\tSplit completed in {split_timer.duration}')
        return X_train, X_test, y_train, y_test, split_timer.duration
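
Below is a small standalone sketch of the CPU path only: sklearn's split with the label column filtered out manually, mirroring what split_data does when compute_type contains 'CPU'. The toy dataframe and column names are assumptions, not part of the original example.

import pandas as pd
from sklearn.model_selection import train_test_split as sklearn_train_test_split

df = pd.DataFrame({'f0': range(10), 'f1': range(10, 20), 'label': [0, 1] * 5})
X_train, X_test, y_train, y_test = sklearn_train_test_split(
    df.loc[:, df.columns != 'label'],   # every column except the label
    df['label'],                        # the label series
    train_size=0.8, shuffle=True, random_state=0)
print(X_train.shape, y_train.shape)     # (8, 2) (8,)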
Example #2
    def split_data(self,
                   dataset,
                   y_label,
                   train_size=.8,
                   random_state=0,
                   shuffle=True):
        """
        Split the dataset into train and test subsets.
        NOTE: y_label names the column holding the classification labels
            ! in the case of sklearn, we manually filter this column out in the split call
            ! in the case of cuml, the filtering happens internally
        """
        self.log_to_file('\tsplitting train and test data')
        start_time = time.perf_counter()

        with PerfTimer() as split_timer:
            if 'CPU' in self.compute_type:
                X_train, X_test, y_train, y_test = sklearn_train_test_split(
                    dataset.loc[:, dataset.columns != y_label],
                    dataset[y_label],
                    train_size=train_size,
                    shuffle=shuffle,
                    random_state=random_state)
            elif 'GPU' in self.compute_type:
                X_train, X_test, y_train, y_test = cuml_train_test_split(
                    X=dataset,
                    y=y_label,
                    train_size=train_size,
                    shuffle=shuffle,
                    random_state=random_state)

        self.log_to_file(f'\t> split completed in {split_timer.duration}')
        return X_train, X_test, y_train, y_test, split_timer.duration
Example #3
    def split_data(self,
                   dataset,
                   y_label,
                   train_size=.8,
                   random_state=0,
                   shuffle=True):

        self.log_to_file('\n\t splitting train and test data')
        start_time = time.perf_counter()

        with PerfTimer() as split_timer:
            if 'CPU' in self.compute_type:
                X_train, X_test, y_train, y_test = sklearn_train_test_split(
                    dataset.loc[:, dataset.columns != y_label],  # drop the label column from the features
                    dataset[y_label],
                    train_size=train_size,
                    shuffle=shuffle,
                    random_state=random_state)
            elif 'GPU' in self.compute_type:
                X_train, X_test, y_train, y_test = cuml_train_test_split(
                    X=dataset,
                    y=y_label,
                    train_size=train_size,
                    shuffle=shuffle,
                    random_state=random_state)
        self.log_to_file(f'\t split completed in {split_timer.duration}')
        return X_train, X_test, y_train, y_test, split_timer.duration
Example #4
    def validation_split(self, annotations_df):
        """Takes annotations DataFrame and return indexes that
        corresponds to validation and training data.

        Parameters
        ----------
        annotations_df : pandas.DataFrame
            DataFrame with data annotations.

        Returns
        -------
        train_idx : list of int
            Indexes of the dataframe that correspond to the training set.
        val_idx : list of int
            Indexes of the dataframe that correspond to the validation set.
            These indexes only point to ISIC2020 data.
        """

        isic_2020_df = annotations_df[annotations_df.source == "ISIC20"]
        external_data = annotations_df[annotations_df.source != "ISIC20"]
        indexes = isic_2020_df.index.tolist()
        train_idx, val_idx = sklearn_train_test_split(
            indexes,
            stratify=isic_2020_df.target,
            test_size=self.validation_fraction,
            shuffle=True,
            random_state=1234,
        )
        train_idx += external_data.index.tolist()
        return train_idx, val_idx
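
A standalone sketch of the same idea on a toy dataframe: stratify the index split on the ISIC20 rows only, then append every external-data index to the training side. The column values and the 0.25 validation fraction are assumptions for illustration.

import pandas as pd
from sklearn.model_selection import train_test_split as sklearn_train_test_split

annotations_df = pd.DataFrame({
    'source': ['ISIC20'] * 8 + ['external'] * 4,
    'target': [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1],
})
isic = annotations_df[annotations_df.source == 'ISIC20']
external = annotations_df[annotations_df.source != 'ISIC20']
train_idx, val_idx = sklearn_train_test_split(
    isic.index.tolist(), stratify=isic.target,
    test_size=0.25, shuffle=True, random_state=1234)
train_idx += external.index.tolist()    # external rows are only ever used for training
print(sorted(train_idx), sorted(val_idx))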
Example #5
def train_test_split(df, frac):
    """
    A train/test split function for a dataframe; returns both the training and test sets.
    df is the dataframe to split.
    frac refers to the fraction of data you would like to set aside for training.
    """
    train, val = sklearn_train_test_split(df, train_size=frac, random_state=42)
    return train, val
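
Hypothetical usage of the wrapper above, assuming it and sklearn's split (imported as sklearn_train_test_split) are in scope; the dataframe contents are made up.

import pandas as pd

df = pd.DataFrame({'a': range(100), 'b': range(100, 200)})
train, val = train_test_split(df, frac=0.8)
print(len(train), len(val))   # 80 20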
Example #6
def train_test_split(labeled_sections):
    # split people
    speakers = labeled_sections.original_spk.unique()
    train_speakers, val_speakers = sklearn_train_test_split(speakers,
                                                            test_size=0.1)

    # split samples
    train_samples, val_samples = sklearn_train_test_split(
        labeled_sections[labeled_sections.original_spk.isin(train_speakers)],
        test_size=0.15)

    def label_train_val(row):
        nonlocal val_samples
        nonlocal val_speakers

        # val_samples is a DataFrame, so check membership against its 'sample' column
        if row['original_spk'] in val_speakers or row['sample'] in val_samples['sample'].values:
            return "val"
        else:
            return "train"

    labeled_sections['set'] = labeled_sections.apply(label_train_val, axis=1)
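
Hypothetical usage on a toy dataframe; the column names original_spk and sample match what the function expects, but the data itself is an assumption, and it assumes the function above and sklearn_train_test_split are in scope.

import pandas as pd

labeled_sections = pd.DataFrame({
    'original_spk': ['spk%02d' % (i % 20) for i in range(200)],
    'sample': range(200),
})
train_test_split(labeled_sections)            # adds a 'set' column in place
print(labeled_sections['set'].value_counts())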
Example #7
def train_test_split(dataset, test_ratio, shuffle=False, transform=None, test_transform=None, random_state=None):
    if isinstance(transform, Compose):
        transform = InputTransform(transform)
    if isinstance(test_transform, Compose):
        test_transform = InputTransform(test_transform)

    n = len(dataset)
    train_indices, test_indices = sklearn_train_test_split(
        list(range(n)), test_size=test_ratio, shuffle=shuffle, random_state=random_state)
    ds_train = Subset(dataset, train_indices, transform)
    ds_test = Subset(dataset, test_indices, test_transform)
    return ds_train, ds_test
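
The Subset used above takes a transform argument, so it is a project-specific class rather than torch.utils.data.Subset. Below is a standalone sketch of the same index-split pattern with the standard PyTorch Subset (no transforms); the tensor shapes are assumptions.

import torch
from torch.utils.data import TensorDataset, Subset
from sklearn.model_selection import train_test_split as sklearn_train_test_split

dataset = TensorDataset(torch.randn(100, 4), torch.randint(0, 2, (100,)))
train_idx, test_idx = sklearn_train_test_split(
    list(range(len(dataset))), test_size=0.2, shuffle=True, random_state=0)
ds_train, ds_test = Subset(dataset, train_idx), Subset(dataset, test_idx)
print(len(ds_train), len(ds_test))   # 80 20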
Example #8
def split_rows(data_matrix,
               train_size=0.75,
               columnToSplitOn='pat_id',
               random_state=0):

    # from sklearn.model_selection import GroupShuffleSplit
    # print data_matrix.shape
    # train_inds, evalu_inds = next(
    #     GroupShuffleSplit(n_splits=2, test_size=1-train_size, random_state=random_state)
    #         .split(data_matrix, groups=data_matrix[columnToSplitOn])
    # )
    # print len(train_inds) + len(evalu_inds)
    #
    # data_matrix_train, data_matrix_evalu = data_matrix.iloc[train_inds], data_matrix.iloc[evalu_inds]
    #
    # pat_ids_train = data_matrix_train['pat_id'].values.tolist()
    # pat_ids_evalu = data_matrix_evalu['pat_id'].values.tolist()

    all_possible_ids = sorted(set(
        data_matrix[columnToSplitOn].values.tolist()))

    train_ids, test_ids = sklearn_train_test_split(all_possible_ids,
                                                   test_size=1. - train_size,
                                                   random_state=random_state)

    data_matrix_train = data_matrix[data_matrix[columnToSplitOn].isin(
        train_ids)].copy()
    # y_train = pd.DataFrame(train_matrix.pop(outcome_label))
    # X_train = train_matrix

    data_matrix_evalu = data_matrix[data_matrix[columnToSplitOn].isin(
        test_ids)].copy()
    # y_test = pd.DataFrame(test_matrix.pop(outcome_label))
    # X_test = test_matrix

    pat_ids_train = data_matrix_train['pat_id'].values.tolist()
    pat_ids_evalu = data_matrix_evalu['pat_id'].values.tolist()
    assert (set(pat_ids_train) & set(pat_ids_evalu)) == set([])

    assert data_matrix_train.shape[0] + data_matrix_evalu.shape[
        0] == data_matrix.shape[0]

    return data_matrix_train, data_matrix_evalu
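
A standalone sketch of the same id-level (grouped) split on a toy dataframe, so that no pat_id ever appears in both sets; the column names and values are assumptions.

import pandas as pd
from sklearn.model_selection import train_test_split as sklearn_train_test_split

data_matrix = pd.DataFrame({'pat_id': [1, 1, 2, 2, 3, 3, 4, 4], 'value': range(8)})
ids = sorted(set(data_matrix['pat_id']))
train_ids, test_ids = sklearn_train_test_split(ids, test_size=0.25, random_state=0)
train_df = data_matrix[data_matrix['pat_id'].isin(train_ids)].copy()
test_df = data_matrix[data_matrix['pat_id'].isin(test_ids)].copy()
assert not set(train_df.pat_id) & set(test_df.pat_id)   # the groups never overlap
print(train_df.shape, test_df.shape)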
Example #9
    def validation_split(self, annotations_df):
        """Takes annotations DataFrame and return indexes that
        corresponds to validation and training data.

        Parameters
        ----------
        annotations_df : pandas.DataFrame
            DataFrame with data annotations.

        Returns
        -------
        train_idx : list of int
            Indexes of the dataframe that correspond to the training set.
        val_idx : list of int
            Indexes of the dataframe that correspond to the validation set.
        """
        indexes = list(range(len(annotations_df)))
        train_indexes, val_indexes = sklearn_train_test_split(
            indexes, test_size=self.validation_fraction, random_state=1234)
        return list(train_indexes), list(val_indexes)
Example #10
def train_test_split(df, x_cols, y_col, test_percent, isStandardize=False, isNormalize=False):
    # print(df.head(5))
    df_X = df[x_cols]
    df_y = df[y_col]

    X_train, X_test, y_train, y_test = sklearn_train_test_split(df_X,
                                                                df_y,
                                                                test_size=test_percent,
                                                                random_state=None)

    if isStandardize:
        X_train = standardize(X_train)
        X_test = standardize(X_test)
        return X_train, y_train, X_test, y_test

    if isNormalize:
        X_train = normalize(X_train)
        X_test = normalize(X_test)
        return X_train, y_train, X_test, y_test

    return X_train, y_train, X_test, y_test
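
The standardize and normalize helpers are not shown in this example; the following is a minimal sketch of what such column-wise helpers might look like. These are assumptions, not the original implementations.

import numpy as np

def standardize(X):
    # column-wise zero mean and unit variance
    return (X - X.mean(axis=0)) / X.std(axis=0)

def normalize(X):
    # column-wise min-max scaling to the [0, 1] range
    return (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))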
Example #11
def train_test_split(examples, labels, test_size=0.1, verbose=0):
    if verbose:
        print("Train/Test split ")
        print(100-test_size*100, "% of training data")
        print(test_size*100, "% of testing data")    

    # split data into train and test sets
    train_examples, test_examples, train_labels, test_labels = sklearn_train_test_split(
        examples, 
        labels, 
        test_size=test_size,  # use the test_size parameter rather than a hard-coded 0.1
        random_state=42, 
        shuffle=True
    )

    # extract the user and item id columns from the train and test examples
    train_users = train_examples[:, 0]
    test_users = test_examples[:, 0]

    train_items = train_examples[:, 1]
    test_items = test_examples[:, 1]

    # Final training and test set
    x_train = np.array(list(zip(train_users, train_items)))
    x_test = np.array(list(zip(test_users, test_items)))

    y_train = train_labels
    y_test = test_labels

    if verbose:
        print()
        print('number of training examples : ', x_train.shape)
        print('number of training labels : ', y_train.shape)
        print('number of test examples : ', x_test.shape)
        print('number of test labels : ', y_test.shape)

    return (x_train, x_test), (y_train, y_test)
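
Hypothetical usage with random (user, item) pairs and ratings; the shapes and value ranges are made up, and it assumes the function above, numpy, and sklearn_train_test_split are in scope.

import numpy as np

examples = np.column_stack([np.random.randint(0, 100, 1000),   # user ids
                            np.random.randint(0, 50, 1000)])   # item ids
labels = np.random.randint(1, 6, 1000)                         # ratings from 1 to 5
(x_train, x_test), (y_train, y_test) = train_test_split(examples, labels,
                                                        test_size=0.2, verbose=1)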
Example #12
    def _data_preprocess(self,
                         inference=False,
                         infer_dataset=None,
                         validate=False,
                         transform_input="standardize",
                         transform_output="standardize",
                         test_size=0.1,
                         shuffle=False,
                         kwargs={}):
        if not inference:
            np_dataset = self._dataset.data_to_numpy()
            data_column_names = self._dataset.data_columns
        else:
            np_dataset = infer_dataset.data_to_numpy()
            if not validate:
                data_column_names = [c[0] for c in infer_dataset.data_columns]
            else:
                data_column_names = infer_dataset.data_columns

        self.input_data_continuous = []
        self.input_data_categorical = []
        self.input_data_descriptors = []
        self.output_data = []
        if not inference:
            self.data_transformation_dict = {}

        # this loop makes sure that the inputs are always in the same order and only
        # data with the same column names as in the domain is considered
        for v in self._domain.variables:
            v_in_dataset = False
            for i, c_name in enumerate(data_column_names):
                if c_name == v.name:
                    v_in_dataset = True
                    if not v.is_objective:
                        if v.variable_type == "continuous":
                            # Standardize continuous inputs
                            tmp_cont_inp = np.asarray(np_dataset[:, i],
                                                      dtype=float)
                            if not inference:
                                tmp_cont_inp, _reduce, _divide = self._transform_data(
                                    data=tmp_cont_inp,
                                    transformation_type=transform_input)
                                self.data_transformation_dict[v.name] = [
                                    _reduce, _divide
                                ]
                            else:
                                tmp_cont_inp, _, _ = self._transform_data(
                                    data=tmp_cont_inp,
                                    reduce=self.data_transformation_dict[
                                        v.name][0],
                                    divide=self.data_transformation_dict[
                                        v.name][1])
                            self.input_data_continuous.append(tmp_cont_inp)
                        elif v.variable_type == "descriptors" or (
                                v.variable_type == "categorical"
                                and self._cat_to_descr):
                            tmp_descr_inp = []
                            for ent in np_dataset[:, i]:
                                tmp_descr_inp.append(
                                    v.ds.loc[[ent], :].values[0].tolist())
                            tmp_descr_inp = np.asarray(tmp_descr_inp)
                            # use a separate index j so the enumerate index i above is not shadowed
                            for j in range(len(tmp_descr_inp[0])):
                                if not inference:
                                    tmp_descr_inp[:, j], _reduce, _divide = self._transform_data(
                                        data=tmp_descr_inp[:, j],
                                        transformation_type=transform_input)
                                    self.data_transformation_dict[
                                        v.ds.data_columns[j]] = [_reduce, _divide]
                                else:
                                    tmp_descr_inp[:, j], _, _ = self._transform_data(
                                        data=tmp_descr_inp[:, j],
                                        reduce=self.data_transformation_dict[
                                            v.ds.data_columns[j]][0],
                                        divide=self.data_transformation_dict[
                                            v.ds.data_columns[j]][1])
                            self.input_data_descriptors.append(
                                np.asarray(tmp_descr_inp))
                        elif v.variable_type == "categorical":
                            # create one-hot tensor for categorical inputs
                            one_hot_enc = sklearn.preprocessing.OneHotEncoder(
                                categories=[v.levels])
                            tmp_disc_inp_one_hot = one_hot_enc.fit_transform(
                                np_dataset[:, i].reshape(-1, 1)).toarray()
                            self.input_data_categorical.append(
                                np.asarray(tmp_disc_inp_one_hot))
                        else:
                            raise TypeError(
                                "Unknown variable type: {}.".format(
                                    v.variable_type))
                    elif not inference:
                        if v.variable_type == "continuous":
                            tmp_cont_out = np.asarray(np_dataset[:, i],
                                                      dtype=float)
                            if not inference:
                                tmp_cont_out, _reduce, _divide = self._transform_data(
                                    data=tmp_cont_out,
                                    transformation_type=transform_output)
                                self.data_transformation_dict[v.name] = [
                                    _reduce, _divide
                                ]
                            self.output_data.append(tmp_cont_out)
                        elif v.variable_type == "categorical":
                            raise TypeError(
                                "{} is a categorical variable. Regressor not trainable for categorical outputs."
                                .format(v.name))
                        elif v.variable_type == "descriptors":
                            raise TypeError(
                                "{} is a descriptor variable. Regressor not trainable for descriptor outputs."
                                .format(v.name))
                        else:
                            raise TypeError(
                                "Unknown variable type: {}.".format(
                                    v.variable_type))
                elif inference and v.is_objective:
                    v_in_dataset = True
            if not v_in_dataset:
                raise ValueError(
                    "Variable {} defined in the domain is missing in the given dataset."
                    .format(v.name))

        self.input_data_continuous = np.asarray(
            self.input_data_continuous).transpose()
        if len(self.input_data_categorical) != 0:
            self.input_data_categorical = np.concatenate(
                [one_hot for one_hot in self.input_data_categorical], axis=1)
        if len(self.input_data_descriptors) != 0:
            self.input_data_descriptors = np.concatenate(
                [d for d in self.input_data_descriptors], axis=1)
        self.output_data = np.asarray(self.output_data).transpose()

        # Set up training and test data
        if not inference:
            final_np_dataset = np.concatenate(
                [inp for inp in [self.input_data_continuous, self.input_data_descriptors,
                                 self.input_data_categorical, self.output_data]
                 if len(inp) != 0],
                axis=1)
            X = final_np_dataset[:, :-self.output_dim]
            y = final_np_dataset[:, -self.output_dim:]
            X_train, X_test, y_train, y_test = sklearn_train_test_split(
                X, y, test_size=test_size, shuffle=shuffle)
            return ([X_train.astype(dtype=float), y_train.astype(dtype=float)],
                    [X_test.astype(dtype=float), y_test.astype(dtype=float)])
        else:
            X = np.concatenate(
                [inp for inp in [self.input_data_continuous, self.input_data_descriptors,
                                 self.input_data_categorical]
                 if len(inp) != 0],
                axis=1)
            return X.astype(dtype=float)
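
A standalone sketch of the final step only: concatenating the preprocessed input blocks with the outputs and splitting off a test fraction. The array shapes and contents are assumptions for illustration.

import numpy as np
from sklearn.model_selection import train_test_split as sklearn_train_test_split

continuous = np.random.randn(50, 3)                       # standardized continuous inputs
one_hot = np.eye(4)[np.random.randint(0, 4, 50)]          # one-hot categorical inputs
outputs = np.random.randn(50, 1)                          # standardized outputs
output_dim = outputs.shape[1]

final_np_dataset = np.concatenate([continuous, one_hot, outputs], axis=1)
X = final_np_dataset[:, :-output_dim]
y = final_np_dataset[:, -output_dim:]
X_train, X_test, y_train, y_test = sklearn_train_test_split(X, y, test_size=0.1, shuffle=False)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)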
Example #13
def preprocessing(data_df, is_training):
    ### Create embedding matrix
    def text_wordlist(text):
        """
        Pre process and convert texts to a list of words
        :param text: text(sentence)
        :return: text
        """

        text = str(text)
        text = text.lower()
        # Clean the text
        text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
        text = re.sub(r"what's", "what is ", text)
        text = re.sub(r"\'s", " ", text)
        text = re.sub(r"\'ve", " have ", text)
        text = re.sub(r"can't", "cannot ", text)
        text = re.sub(r"n't", " not ", text)
        text = re.sub(r"i'm", "i am ", text)
        text = re.sub(r"\'re", " are ", text)
        text = re.sub(r"\'d", " would ", text)
        text = re.sub(r"\'ll", " will ", text)
        text = re.sub(r",", " ", text)
        text = re.sub(r"\.", " ", text)
        text = re.sub(r"!", " ! ", text)
        text = re.sub(r"\/", " ", text)
        text = re.sub(r"\^", " ^ ", text)
        text = re.sub(r"\+", " + ", text)
        text = re.sub(r"\-", " - ", text)
        text = re.sub(r"\=", " = ", text)
        text = re.sub(r"'", " ", text)
        text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
        text = re.sub(r":", " : ", text)
        text = re.sub(r" e g ", " eg ", text)
        text = re.sub(r" b g ", " bg ", text)
        text = re.sub(r" u s ", " american ", text)
        text = re.sub(r"\0s", "0", text)
        text = re.sub(r" 9 11 ", "911", text)
        text = re.sub(r"e - mail", "email", text)
        text = re.sub(r"j k", "jk", text)
        text = re.sub(r"\s{2,}", " ", text)

        text = text.split()

        return text

    ### Prepare embedding
    vocab = dict()  # word vocabulary
    inverse_vocab = ["<unk>"]
    word_to_vec = utils.WORD_2_VECTOR

    que_cols = ["question1", "question2"]

    def iter_over_df(df, que_cols, text_wordlist, stops, word_to_vec, vocab,
                     inverse_vocab):
        """
        Iterate over the questions
        :param df: dataframe
        :return: dataframe
        """
        for index, row in df.iterrows():
            for que in que_cols:

                que_to_vec = []
                for word in text_wordlist(row[que]):

                    # skip unwanted words
                    if word in stops and word not in word_to_vec.vocab:
                        continue

                    if word not in vocab:
                        vocab[word] = len(inverse_vocab)
                        que_to_vec.append(len(inverse_vocab))
                        inverse_vocab.append(word)
                    else:
                        que_to_vec.append(vocab[word])

                df.at[index, que] = que_to_vec  # DataFrame.set_value was removed in pandas 1.0

        return df

    stops = set(utils.STOPWORDS)
    data_df = iter_over_df(data_df, que_cols, text_wordlist, stops,
                           word_to_vec, vocab, inverse_vocab)

    ### Build the embedding matrix
    def build_embed_matr(vocab, word_to_vec, embed_dim):
        embed_mat = 1 * np.random.randn(len(vocab) + 1, embed_dim)
        embed_mat[0] = 0  # to ignore padding
        for word, index in vocab.items():
            if word in word_to_vec.vocab:
                embed_mat[index] = word_to_vec.word_vec(word)

        return embed_mat

    ### Zero padding
    def zero_padding_train(*args, Y_train=None, max_seq_len):
        data = [*args]
        for dataset, side in itertools.product(data, ["left", "right"]):
            dataset[side] = pad_sequences(dataset[side], maxlen=max_seq_len)

        if is_training:
            assert X_train["left"].shape == X_train["right"].shape
            assert len(X_train["left"]) == len(Y_train)

        return data

    # Split to train validation
    if is_training:
        EMBEDDING_MATRIX = build_embed_matr(vocab, word_to_vec,
                                            utils.EMBEDDING_DIM)
        validation_size = int(0.1 * data_df.shape[0])
        X = data_df[que_cols]
        Y = data_df["is_duplicate"]
        X_train, X_val, Y_train, Y_val = sklearn_train_test_split(
            X, Y, test_size=validation_size)
        del X, Y, validation_size, data_df, que_cols
        X_train = {"left": X_train.question1, "right": X_train.question2}
        X_val = {"left": X_val.question1, "right": X_val.question2}
        Y_train = Y_train.values
        Y_val = Y_val.values
        X_TRAIN, X_VAL = zero_padding_train(X_train,
                                            X_val,
                                            Y_train=Y_train,
                                            max_seq_len=utils.MAX_SEQ_LEN)
        Y_TRAIN = Y_train
        Y_VAL = Y_val
        del X_train, X_val, Y_train, Y_val
        return X_TRAIN, X_VAL, Y_TRAIN, Y_VAL, EMBEDDING_MATRIX
    else:
        data_df = {"left": data_df.question1, "right": data_df.question2}
        return zero_padding_train(data_df,
                                  Y_train=None,
                                  max_seq_len=utils.MAX_SEQ_LEN)[0]
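
A standalone sketch of the zero-padding step alone, using Keras' pad_sequences; the example's import of pad_sequences is not shown, so the tensorflow.keras import path here is an assumption, and the toy index sequences and maximum length are made up.

from tensorflow.keras.preprocessing.sequence import pad_sequences

X_train = {"left": [[3, 7, 2], [5]], "right": [[1], [4, 4, 9, 6]]}
for side in ("left", "right"):
    X_train[side] = pad_sequences(X_train[side], maxlen=5)
print(X_train["left"].shape, X_train["right"].shape)   # (2, 5) (2, 5)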
Example #14
def train_test_split(X, y, test_size, random_state=None):
    # draw a fresh seed per call; a randint default in the signature would be fixed at import time
    random_state = random.randint(1, 1000) if random_state is None else random_state
    return sklearn_train_test_split(X, y, test_size=test_size, random_state=random_state)