Example #1
0
def preprocess_data(dat, col_names, scale) -> TrainData:
    """Transform *dat* with a pre-fitted scaler and split the result
    into feature and target columns.

    The columns named in *col_names* become the targets; every other
    column becomes a feature.

    :param dat: DataFrame of raw values (must expose ``.columns``).
    :param col_names: names of the target columns.
    :param scale: pre-fitted scaler with a ``.transform`` method.
    :return: ``TrainData(features, targets)`` as scaled ndarrays.
    """
    scaled = scale.transform(dat)

    # Boolean column selector: True for feature columns, False for targets.
    is_feature = np.ones(scaled.shape[1], dtype=bool)
    column_names = list(dat.columns)
    for name in col_names:
        is_feature[column_names.index(name)] = False

    return TrainData(scaled[:, is_feature], scaled[:, ~is_feature])
Example #2
0
def preprocess_data(dat, col_names) -> Tuple[TrainData, StandardScaler]:
    """Fit a StandardScaler on *dat*, standardize it, and split the
    columns into features and targets.

    Columns listed in *col_names* are the targets; the rest are
    features. The fitted scaler is returned alongside the data so the
    same transform can be reused on new samples later.
    """
    scaler = StandardScaler().fit(dat)
    standardized = scaler.transform(dat)

    # Flip the target positions to False in an all-True feature mask.
    target_positions = [list(dat.columns).index(name) for name in col_names]
    keep = np.ones(standardized.shape[1], dtype=bool)
    keep[target_positions] = False

    return TrainData(standardized[:, keep], standardized[:, ~keep]), scaler
Example #3
0
def preprocess_data(dat, col_names) -> Tuple[TrainData, StandardScaler]:
    """Standardize *dat* and partition its columns into features and
    targets, returning the fitted scaler as well.

    Parameters
    ----------
    dat : pandas.DataFrame
        Raw input data; every column is standardized.
    col_names : iterable of str
        Names of the target columns.

    Returns
    -------
    tuple
        ``(TrainData(features, targets), fitted StandardScaler)``.
    """
    fitted = StandardScaler().fit(dat)
    values = fitted.transform(dat)

    # Build the mask from the target side: True marks a target column.
    cols = list(dat.columns)
    target_mask = np.zeros(values.shape[1], dtype=bool)
    for name in col_names:
        target_mask[cols.index(name)] = True

    return TrainData(values[:, ~target_mask], values[:, target_mask]), fitted
def preprocess_data(
    passed_raw_data, label_col_names
) -> Tuple[TrainData, StandardScaler]:
    """Standard-scale the raw data and separate label columns from features.

    Parameters
    ----------
    passed_raw_data : pandas.DataFrame
        Raw data; a StandardScaler is fitted on (and applied to) all of
        its columns.
    label_col_names : iterable of str
        Column names routed into the target array; everything else
        becomes a feature.

    Returns
    -------
    tuple
        ``(TrainData(features, targets), fitted StandardScaler)``.
    """
    # Fit once, then transform: the scaler keeps per-column mean/std.
    scale = StandardScaler().fit(passed_raw_data)
    processed_data = scale.transform(passed_raw_data)

    # Start with every column marked as a feature (True), then flip the
    # label columns to False by their position in the original frame.
    dat_cols = list(passed_raw_data.columns)
    mask = np.ones(processed_data.shape[1], dtype=bool)
    mask[[dat_cols.index(name) for name in label_col_names]] = False

    features = processed_data[:, mask]
    targets = processed_data[:, ~mask]
    return TrainData(features, targets), scale
Example #5
0
def preprocess_data(dat, speed_data, col_names, mean_stand=True) -> tuple:
    """Scale *dat* and *speed_data*, then split *dat* into features/targets.

    Parameters
    ----------
    dat : pandas.DataFrame
        Main data; columns in *col_names* become targets, the rest features.
    speed_data : pandas.DataFrame
        Auxiliary speed data, scaled the same way as *dat* and carried
        into the returned ``TrainData``.
    col_names : iterable of str
        Target column names within *dat*.
    mean_stand : bool, default True
        If True, mean-center only and return the per-column means as
        ndarrays; if False, standard-scale and return the fitted
        ``StandardScaler`` objects.

    Returns
    -------
    tuple
        ``(TrainData(feats, targs, proc_dat_speed), scale, scale_speed)``.
        Note: a 3-tuple — the old ``Tuple[TrainData, StandardScaler]``
        annotation was wrong; ``scale``/``scale_speed`` are ndarrays of
        means when ``mean_stand`` is True, fitted scalers otherwise.

    Notes
    -----
    Unlike the previous implementation, the caller's DataFrames are NOT
    mutated: mean-centering is done on a fresh ndarray, not in place on
    the argument.
    """
    if mean_stand:
        # Per-column means in column order; broadcasting over the last
        # axis centers every column in one vectorized step (replaces the
        # old per-column loop, which also clobbered the input frames).
        scale = np.asarray(dat.mean())
        scale_speed = np.asarray(speed_data.mean())
        proc_dat = dat.to_numpy(dtype=float) - scale
        proc_dat_speed = speed_data.to_numpy(dtype=float) - scale_speed
    else:
        scale = StandardScaler().fit(dat)
        proc_dat = scale.transform(dat)
        scale_speed = StandardScaler().fit(speed_data)
        proc_dat_speed = scale_speed.transform(speed_data)

    # True = feature column, False = target column.
    mask = np.ones(proc_dat.shape[1], dtype=bool)
    dat_cols = list(dat.columns)
    for col_name in col_names:
        mask[dat_cols.index(col_name)] = False

    feats = proc_dat[:, mask]
    targs = proc_dat[:, ~mask]

    return TrainData(feats, targs, proc_dat_speed), scale, scale_speed