Example #1
def get_results_mice_imputation_includingy(X_incomplete, y):
    # Impute the incomplete data with IterativeImputer run in MICE fashion,
    # this time including the output variable y in the imputation loop.
    m = 5
    multiple_imputations = []
    for i in range(m):
        Xy = np.column_stack((X_incomplete, y))
        imputer = IterativeImputer(max_iter=100, sample_posterior=True,
                                   random_state=i)
        imputer.fit(Xy)
        data_imputed = imputer.transform(Xy)

        # We save only the X imputed data because we do not want to use y to
        # predict y later on.
        X_imputed = data_imputed[:, :-1]
        multiple_imputations.append(X_imputed)

    # Perform linear regression on mice multiple imputed data
    # Estimate beta estimates and their variances
    m_coefs = []
    m_vars = []
    for i in range(m):
        estimator = LinearRegression()
        estimator.fit(multiple_imputations[i], y)
        y_predict = estimator.predict(multiple_imputations[i])
        m_coefs.append(estimator.coef_)
        m_vars.append(calculate_variance_of_beta_estimates(
                y, y_predict, multiple_imputations[i]))

    # Calculate the end estimates by applying Rubin's rules.
    Qbar = calculate_Qbar(m_coefs)
    T = calculate_T(m_coefs, m_vars, Qbar)
    mice_errorbar = 1.96 * np.sqrt(T)

    return Qbar, T, mice_errorbar
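
The helpers used above are not shown in this snippet. A minimal sketch of what they would compute under Rubin's rules and ordinary least-squares assumptions (the originals may differ, e.g. in intercept handling):

import numpy as np

def calculate_variance_of_beta_estimates(y, y_predict, X):
    # Classical OLS coefficient variances: sigma^2 * diag((X'X)^-1).
    n, p = X.shape
    residual_var = np.sum((y - y_predict) ** 2) / (n - p)
    return residual_var * np.diag(np.linalg.pinv(X.T @ X))

def calculate_Qbar(m_coefs):
    # Pooled point estimate: the mean of the m coefficient vectors.
    return np.mean(m_coefs, axis=0)

def calculate_T(m_coefs, m_vars, Qbar):
    # Total variance: within-imputation variance plus (1 + 1/m) times
    # the between-imputation variance.
    m = len(m_coefs)
    Wbar = np.mean(m_vars, axis=0)
    B = np.sum([(q - Qbar) ** 2 for q in m_coefs], axis=0) / (m - 1)
    return Wbar + (1 + 1 / m) * B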
Example #2
def MultiIterBayesian(dataset):

    Dim = dataset['d']
    trainX = dataset['train_x']
    testX = dataset['test_x']
    trainM = dataset['train_m']
    testM = dataset['test_m']
    # Train_No = dataset['train_no']
    # Test_No = dataset['test_no']

    test_X = testX.copy()
    train_X = trainX.copy()

    train_X[trainM == 0] = np.nan
    test_X[testM == 0] = np.nan

    # Bayesian imputation
    br_estimator = BayesianRidge()

    by_imp = IterativeImputer(random_state=0, estimator=br_estimator)
    by_imp.fit(train_X)

    imputed_X = by_imp.transform(test_X)

    print('>>>BayesianRidge IterativeImputer result: \n')
    print(imputed_X)

    _all_rmse = compute_rmse(testX, imputed_X, testM)

    print('>>>all_rmse', _all_rmse)

    return _all_rmse
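
These snippets assume a shared preamble: in released scikit-learn, IterativeImputer is experimental and the enabling import must precede it. compute_rmse is project-specific and not shown; a plausible sketch, assuming the mask marks missing entries with 0 (as the assignments above do), scores only those entries:

import numpy as np
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge

def compute_rmse(true_X, imputed_X, mask):
    # RMSE over the originally missing entries only (hypothetical helper).
    missing = (mask == 0)
    diff = true_X[missing] - imputed_X[missing]
    return np.sqrt(np.mean(diff ** 2))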
Example #3
def test_iterative_imputer_early_stopping():
    rng = np.random.RandomState(0)
    n = 50
    d = 5
    A = rng.rand(n, 1)
    B = rng.rand(1, d)
    X = np.dot(A, B)
    nan_mask = rng.rand(n, d) < 0.5
    X_missing = X.copy()
    X_missing[nan_mask] = np.nan

    imputer = IterativeImputer(max_iter=100,
                               tol=1e-3,
                               sample_posterior=False,
                               verbose=1,
                               random_state=rng)
    X_filled_100 = imputer.fit_transform(X_missing)
    assert len(imputer.imputation_sequence_) == d * imputer.n_iter_

    imputer = IterativeImputer(max_iter=imputer.n_iter_,
                               sample_posterior=False,
                               verbose=1,
                               random_state=rng)
    X_filled_early = imputer.fit_transform(X_missing)
    assert_allclose(X_filled_100, X_filled_early, atol=1e-7)

    imputer = IterativeImputer(max_iter=100,
                               tol=0,
                               sample_posterior=False,
                               verbose=1,
                               random_state=rng)
    imputer.fit(X_missing)
    assert imputer.n_iter_ == imputer.max_iter
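
The first assertion holds because IterativeImputer appends one fitted estimator per feature per round to imputation_sequence_, so its length is d * n_iter_. The tol-based early stop follows, roughly, this criterion (a paraphrase, not the library's exact source):

import numpy as np

def has_converged(X_curr, X_prev, X_observed, tol):
    # Stop once the largest change between successive rounds falls below
    # tol scaled by the magnitude of the observed (non-missing) data.
    return np.max(np.abs(X_curr - X_prev)) < tol * np.max(np.abs(X_observed))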
Example #4
def prepare_data_letter():
    data_path = os.path.join("data_raw", "fully_labeled", "letter_recognition",
                             "letter_recognition.csv")
    df = pd.read_csv(data_path)
    df = df.drop(columns="id")
    df = df.replace(list("ABCDEFGHIJKLMNOPQRSTUVWXYZ"), list(range(26)))
    df = df.astype(float)

    test_mask = np.random.rand(len(df)) < config.TEST_RATIO
    train_df = df[~test_mask]
    test_df = df[test_mask]

    # Impute missing values
    imp = IterativeImputer(max_iter=10, random_state=0)
    imp.fit(train_df)

    train_np = imp.transform(train_df)
    test_np = imp.transform(test_df)

    x_train = train_np[:, :-1].astype(float)
    y_train = train_np[:, -1].astype(int)

    x_test = test_np[:, :-1].astype(float)
    y_test = test_np[:, -1].astype(int)

    return x_train, y_train, x_test, y_test
Example #5
def main():
    time_series = pd.read_csv(data_path + "TimeSeries.csv")
    print("original dim", time_series.shape)
    time_series = time_series[time_series.Hour < 25]

    time_series.iloc[:, 7:(len(time_series.columns) - 1)] = remove_alpha(
        time_series.iloc[:, 7:(len(time_series.columns) - 1)])

    time_series['PatientID2'] = time_series['PatientID']
    print(time_series.columns)
    aggregate_series = time_series.groupby('PatientID2').first()

    missingness = (aggregate_series.isnull().sum() * 100 /
                   len(aggregate_series))
    missingness = missingness.reindex(aggregate_series.columns)

    aggregate_series = aggregate_series.loc[:,
                                            pd.notnull(aggregate_series).sum(
                                            ) > len(aggregate_series) * .80]

    # Impute missing values, as SOM clustering does not accommodate missingness
    imp = IterativeImputer(max_iter=10, random_state=0)
    imp.fit(aggregate_series.iloc[:, 7:])
    aggregate_series.iloc[:, 7:] = imp.transform(aggregate_series.iloc[:, 7:])
    aggregate_series.to_csv(clustering_path + "BaselineObservations.csv",
                            index=False)
Example #6
    def fill_empty_fields(dataframe):

        imp = IterativeImputer(max_iter=100, random_state=0)
        imp.fit(dataframe)
        dataframe = imp.transform(dataframe)

        return dataframe
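
transform returns a NumPy array, so the caller loses the DataFrame's index and column labels. A variant that preserves them might look like this:

import pandas as pd
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer

def fill_empty_fields_df(dataframe):
    # Same imputation as above, but hand back a labeled DataFrame.
    imp = IterativeImputer(max_iter=100, random_state=0)
    filled = imp.fit_transform(dataframe)
    return pd.DataFrame(filled, index=dataframe.index, columns=dataframe.columns)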
Example #7
def process_y(data):
    df = data[['fustat', 'futime']]
    years = [1, 3, 5]
    values = []  # per-row labels: [1-year, 3-year, 5-year]
    for i in range(df.shape[0]):
        stat, time = df.iloc[i]
        row = []
        for year in years:
            if time >= year * 365:
                row.append(0)        # survived past this horizon
            elif stat == 1:
                row.append(1)        # event observed before this horizon
            else:
                row.append(np.nan)   # censored before this horizon; impute below
        values.append(row)
    column_name = ['year_'+str(i) for i in years]
    year_survival_rate = pd.DataFrame(values, index=data.index, columns=column_name)
    data = data.join(year_survival_rate)
    #print (data[['fustat', 'futime']+column_name])
    imp = IterativeImputer(max_iter=20, random_state=random_seed, min_value=0, max_value=1)
    imp.fit(data)
    arr_values = (imp.transform(data))
    arr_values = np.round(arr_values)
    data[column_name] = pd.DataFrame(arr_values, columns=data.columns)[column_name]
    del data['fustat']
    del data['futime']
    return data, column_name
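
To make the encoding concrete: a censored patient (fustat == 0) with a futime of 400 days survived past the 1-year horizon but has unknown 3- and 5-year status, so only the first label is fixed and the imputer fills the rest:

# fustat=0 (censored), futime=400 days
#   year_1 -> 0       survived past 365 days
#   year_3 -> NaN     censored before 3 * 365 days; imputed above
#   year_5 -> NaN     censored before 5 * 365 days; imputed above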
Example #8
    def replace_nan_values(self):
        # Almost 30 patients have missing or NaN values in their records.
        # Rather than dropping those rows, we fill them with sklearn's
        # IterativeImputer, which models each feature with missing values as a
        # function of the other features and uses that estimate for imputation.
        # More info: https://scikit-learn.org/stable/modules/impute.html#iterative-imputer
        imp = IterativeImputer(max_iter=50, random_state=0)
        imp.fit(self.training_data)

        transformed_train = imp.transform(self.training_data)
        transformed_test = imp.transform(self.testing_data)

        index_train = self.training_data.index
        index_test = self.testing_data.index

        columns = self.training_data.columns

        self.training_data = pd.DataFrame(transformed_train,
                                          index=index_train,
                                          columns=columns)

        self.testing_data = pd.DataFrame(transformed_test,
                                         index=index_test,
                                         columns=columns)

        # The imputer predicts floats; round categorical columns back to ints.
        for cat in self.categorical_col:
            self.training_data[cat] = self.training_data[cat].round()
            self.testing_data[cat] = self.testing_data[cat].round()

        return self.training_data, self.testing_data
Example #9
def test_iterative_imputer_catch_min_max_error(min_value, max_value, err_msg):
    # check that invalid scalar or array-like values for min_value and
    # max_value in IterativeImputer raise the expected ValueError
    X = np.random.random((10, 3))
    imputer = IterativeImputer(min_value=min_value, max_value=max_value)
    with pytest.raises(ValueError, match=err_msg):
        imputer.fit(X)
Example #11
class IterativeInterpolate(BaseEstimator, TransformerMixin):
    def __init__(self,
                 estimator=None,
                 is_estimate=False,
                 missing_values=np.nan,
                 max_iter=10,
                 random_state=None):
        self.estimator = estimator
        self.is_estimate = is_estimate
        self.missing_values = missing_values
        self.max_iter = max_iter
        self.random_state = random_state

    def fit(self, X, y=None):
        if self.is_estimate:
            self.imp = IterativeImputer(estimator=self.estimator,
                                        missing_values=self.missing_values,
                                        max_iter=self.max_iter,
                                        random_state=self.random_state)
        else:
            self.imp = IterativeImputer(missing_values=self.missing_values,
                                        max_iter=self.max_iter,
                                        random_state=self.random_state)
        if y is None:
            self.imp.fit(X)
        else:
            self.imp.fit(X, y)
        return self

    def transform(self, X):
        return self.imp.transform(X)
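
Because IterativeInterpolate implements the fit/transform contract, it drops straight into a scikit-learn Pipeline. A usage sketch (the step names, estimator choice, and X_train/y_train are illustrative):

from sklearn.linear_model import BayesianRidge, LogisticRegression
from sklearn.pipeline import Pipeline

pipe = Pipeline([
    ("impute", IterativeInterpolate(estimator=BayesianRidge(),
                                    is_estimate=True,
                                    random_state=0)),
    ("clf", LogisticRegression()),
])
# pipe.fit(X_train, y_train) imputes first, then fits the classifier.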
Example #12
def get_results_mice_imputation(X_incomplete, y):
    # Impute the incomplete data with IterativeImputer to perform multiple
    # imputation: run the chained iterations, keep only the final imputed
    # dataset, and repeat the procedure m times with different seeds.
    m = 5
    multiple_imputations = []
    for i in range(m):
        imputer = IterativeImputer(max_iter=100, sample_posterior=True,
                                   random_state=i)
        imputer.fit(X_incomplete)
        X_imputed = imputer.transform(X_incomplete)
        multiple_imputations.append(X_imputed)

    # Perform a model on each of the m imputed datasets
    # Estimate the estimates for each model/dataset
    m_coefs = []
    m_vars = []
    for i in range(m):
        estimator = LinearRegression()
        estimator.fit(multiple_imputations[i], y)
        y_predict = estimator.predict(multiple_imputations[i])
        m_coefs.append(estimator.coef_)
        m_vars.append(calculate_variance_of_beta_estimates(
                y, y_predict, multiple_imputations[i]))

    # Calculate the end estimates by applying Rubin's rules.
    Qbar = calculate_Qbar(m_coefs)
    T = calculate_T(m_coefs, m_vars, Qbar)
    mice_errorbar = 1.96 * np.sqrt(T)

    return Qbar, T, mice_errorbar
Example #13
def impute(df, impute_columns):

    imp = IterativeImputer(max_iter=10, random_state=0)
    imp.fit(df[impute_columns])
    df[impute_columns] = imp.transform(df[impute_columns])

    return df[impute_columns]
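
Note that the function both mutates df in place and returns the imputed columns. A usage sketch with hypothetical column names:

cols = ["age", "bmi", "glucose"]  # hypothetical column names
imputed_cols = impute(df, cols)   # df[cols] is also updated in place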
Example #14
def prepare_data_breast():
    data_path = os.path.join("data_raw", "fully_labeled", "breast_w",
                             "breast_w.csv")
    df = pd.read_csv(data_path)
    df = df.iloc[:, 1:]
    df = df.replace("?", np.nan)
    df = df.replace("benign", 0)
    df = df.replace("malignant", 1)
    df = df.astype(float)

    test_mask = np.random.rand(len(df)) < config.TEST_RATIO
    train_df = df[~test_mask]
    test_df = df[test_mask]

    # Impute missing values
    imp = IterativeImputer(max_iter=10, random_state=0)
    imp.fit(train_df)

    train_np = imp.transform(train_df)
    test_np = imp.transform(test_df)

    x_train = train_np[:, :-1].astype(float)
    y_train = train_np[:, -1].astype(int)

    x_test = test_np[:, :-1].astype(float)
    y_test = test_np[:, -1].astype(int)

    return x_train, y_train, x_test, y_test
Example #15
def iter_impute(data, subject=None, cols=None, rounding=3, max_iter=10):
    # Prepare input
    # if cols is none, perform for all columns (except first column)
    if cols is None:
        cols = data.columns[1:]
    # If subject is null, perform for all subjects
    if subject is None:
        inp = data[cols]
    else:
        # Create a dataframe with all selected subjects
        # DataFrame.append was removed in pandas 2.0; concatenate instead
        inp = pandas.concat(
            [get_subject(data, s, data.columns[0]).loc[:, cols]
             for s in subject])
    if len(inp.columns) < 2:
        raise Exception("Multiple variables must be given as input")

    # Create imputer
    imputer = IterativeImputer(max_iter=max_iter)
    imputer.fit(inp)

    # Impute missing values and round to the requested number of decimals
    res = pandas.DataFrame(np.round(imputer.transform(inp), decimals=rounding), index=inp.index, columns=inp.columns)

    data.loc[res.index, res.columns] = res
    return data
Example #16
def fill_missing_value_using_iterative(dataset):
    """
    将空缺值按该行其他未空缺值计算贝叶斯回归来估计空缺值
    :param dataset: 传入pandas数据
    :return:
    """
    # dataset[[1, 2, 3, 4, 5]] = dataset[[1, 2, 3, 4, 5]].replace(0, nan)
    data = dataset.values

    X, y = data[:, 1:], data[:, 0]
    y = np.expand_dims(y, 1)
    X = np.asarray(X, dtype=np.float64)
    # print total missing
    print('Missing: %d' % np.isnan(X).sum())
    # define imputer
    imputer = IterativeImputer()
    # fit on the dataset
    imputer.fit(X)
    # transform the dataset
    Xtrans = imputer.transform(X)
    # print total missing
    print('Missing: %d' % np.isnan(Xtrans).sum())
    print(X.shape)
    return_data = np.hstack((y, Xtrans))
    return return_data
Example #17
def MultiIterTrees(dataset):
    from sklearn.experimental import enable_iterative_imputer  # noqa: F401
    from sklearn.impute import IterativeImputer

    Dim = dataset['d']
    trainX = dataset['train_x']
    testX = dataset['test_x']
    trainM = dataset['train_m']
    testM = dataset['test_m']
    # Train_No = dataset['train_no']
    # Test_No = dataset['test_no']

    test_X = testX.copy()
    train_X = trainX.copy()

    train_X[trainM == 0] = np.nan
    test_X[testM == 0] = np.nan

    # Extra-trees imputation
    etr_estimator = ExtraTreesRegressor(n_estimators=10, random_state=0)

    etr_imp = IterativeImputer(random_state=0, estimator=etr_estimator)
    etr_imp.fit(train_X)

    imputed_X = etr_imp.transform(test_X)

    print('>>>ExtraTreesRegressor IterativeImputer result: \n')
    print(imputed_X)

    _all_rmse = compute_rmse(testX, imputed_X, testM)

    print('>>>all_rmse', _all_rmse)

    return _all_rmse
Example #18
def main():
    configs = json.load(
        open('MachineLearning/Models/LSTM/Configuration.json', 'r'))
    if not os.path.exists(configs['model']['save_dir']):
        os.makedirs(configs['model']['save_dir'])

    time_series = pd.read_csv(clustered_timeseries_path +
                              "TimeSeriesAggregatedClusteredDeltaTwoDays.csv")
    print(time_series.shape)
    # configs['data']['train_test_split']  # the split
    # configs['data']['columns_dynamic']   # the columns

    # Impute and scale the data

    dynamic_features = configs['data']['dynamic_columns']
    grouping = configs['data']['grouping']
    imp = IterativeImputer(max_iter=10, random_state=0)
    imp.fit(time_series[dynamic_features])
    time_series[dynamic_features] = imp.transform(
        time_series[dynamic_features])
    time_series = scale(time_series, dynamic_features)

    X = time_series[dynamic_features]
    groups = np.array(time_series[grouping])

    for outcome in configs['data']['classification_outcome']:
        y = time_series[outcome]
        y = y.astype(int)

        model = Model(configs['model']['name'] + outcome)

        print(grouping)
        print(len(set(time_series[grouping])))

        model.build_model(configs)

        i = 0
        for ffold_ind, (training_ind, testing_ind) in enumerate(
                stratified_group_k_fold(X, y, groups,
                                        k=10)):  # CROSS-VALIDATION
            training_groups, testing_groups = groups[training_ind], groups[
                testing_ind]
            this_y_train, this_y_val = y[training_ind], y[testing_ind]
            this_X_train, this_X_val = X.iloc[training_ind], X.iloc[
                testing_ind]

            assert len(set(training_groups) & set(testing_groups)) == 0

            print(" X SHAPE: ", this_X_train.shape)
            print(" Y shape: ", this_y_train.shape)

            input_timesteps = 24
            input_dim = 2

            if i == 0:
                #(NumberOfExamples, TimeSteps, FeaturesPerStep).
                model.train((this_X_train.values).reshape(-1, 24, 35),
                            (this_y_train.values).reshape(-1, 24, 1))
                i = i + 1
Example #19
def iterative_imputer(X, args={}):
    """
    缺失值插入:通过将该缺失属性与其他属性结合起来进行插值
    """
    from sklearn.impute import IterativeImputer
    iti = IterativeImputer(**args)
    iti.fit(X)
    return iti
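
The factory returns the fitted imputer rather than transformed data, so the caller applies transform separately:

import numpy as np

X = np.array([[1.0, 2.0], [np.nan, 4.0], [5.0, np.nan]])
iti = iterative_imputer(X, {"max_iter": 10, "random_state": 0})
X_filled = iti.transform(X)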
Example #20
def problem2_3_3(data):
    # treat zeros in column 3 as missing (avoid chained assignment)
    data.loc[data[3] == 0, 3] = np.nan
    imp = IterativeImputer(missing_values=np.nan)
    imp.fit(data)
    newdata = np.round(imp.transform(data))
    area = newdata[:, 2].tolist()
    print("Use Multivariate:", problem2_3_1(area))
    return "as shown in the plots"
Example #21
 def iterative_imputer_integer(self, df):
     df_copy = df.copy()
     imp = IterativeImputer(max_iter=10, random_state=0)
     imp.fit(df_copy)
     df_new = pd.DataFrame(np.round(imp.transform(df_copy)),
                           columns=df_copy.columns)
     df_new = df_new.astype('int32')
     return df_new
Example #22
def experiment_LinearRegression(df, df_full, score):
    start_time = time.time()
    imp = IterativeImputer(estimator=LinearRegression(),
                           random_state=0,
                           max_iter=10)
    imp.fit(df)
    df_filled = pd.DataFrame(imp.transform(df))
    score.loc['Linear Regression', 'r2_score'] = r2_score(df_full, df_filled)
    score.loc['Linear Regression', 'time'] = time.time() - start_time
Example #23
def test_iterative_imputer_one_feature(X):
    # check we exit early when there is a single feature
    imputer = IterativeImputer().fit(X)
    assert imputer.n_iter_ == 0
    imputer = IterativeImputer()
    imputer.fit([[1], [2]])
    assert imputer.n_iter_ == 0
    imputer.fit([[1], [np.nan]])
    assert imputer.n_iter_ == 0
Example #24
def use_imputation(df_list, train_x_columns):
    imputer = IterativeImputer(random_state=0, max_iter=30, verbose=2)
    imputer.fit(df_list[0][train_x_columns])

    for i in range(len(df_list)):
        df_list[i][train_x_columns] = imputer.transform(
            df_list[i][train_x_columns])

    return df_list
Example #25
def imput_data_with_sklearn_imputer(df_daily):

    df_daily_interp = df_daily.copy()
    df_daily_interp["MES"] = df_daily_interp.index.month
    imputer = IterativeImputer(estimator=BayesianRidge(), random_state=1)
    imputer.fit(df_daily_interp.values)
    imputed_vals = imputer.transform(df_daily_interp.values)
    df_daily_interp.loc[:, :] = imputed_vals
    return df_daily_interp
Example #26
def fill_chunk(fit_df, transform_df):
    estimator = RandomForestRegressor(n_estimators=10, n_jobs=8)
    imp = IterativeImputer(estimator=estimator, max_iter=5, random_state=0)
    imp.fit(fit_df)
    transformed = imp.transform(transform_df)
    imputed_df = pd.DataFrame(data=transformed,
                              index=transform_df.index,
                              columns=transform_df.columns)
    return imputed_df
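
One way to use fill_chunk on a frame too large to impute in one pass is to fit on a sample and transform chunk by chunk; a sketch, where big_df and the chunk count are assumptions (note fill_chunk refits the imputer on every call, so hoisting the fit out would be cheaper):

import numpy as np
import pandas as pd

fit_sample = big_df.sample(n=min(len(big_df), 50_000), random_state=0)
chunks = np.array_split(big_df, 10)
imputed = pd.concat(fill_chunk(fit_sample, chunk) for chunk in chunks)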
Example #27
def main():
    df = get_raw_data()
    data_dict = pd.read_csv("data/WiDS Datathon 2020 Dictionary.csv")

    identifier_features = data_dict[data_dict["Category"] == "identifier"][
        "Variable Name"].tolist() + ["icu_id"]
    type__features = [
        "hospital_admit_source",
        "icu_admit_source",
        "icu_stay_type",
        "icu_type",
    ]
    redundant_features = ['readmission_status', 'apache_2_bodysystem']
    features_to_drop = identifier_features + type__features + redundant_features

    # keep columns with at least 30% non-null values
    # (i.e., drop features that are more than 70% null)
    cut_off_percentage = 0.3
    min_non_null = int(cut_off_percentage * df.shape[0])
    df = df.dropna(axis=1, thresh=min_non_null)

    numeric_features = data_dict[
        data_dict["Data Type"] == "numeric"]["Variable Name"].tolist() + [
            "bmi", "apache_2_diagnosis", "apache_3j_diagnosis"
        ]

    skewed_numeric_features = df.columns[df.columns.isin(numeric_features)]
    numeric_df = df[skewed_numeric_features]

    imp = IterativeImputer(max_iter=3, verbose=0)
    imp.fit(numeric_df)
    imputed_df = imp.transform(numeric_df)
    imputed_df = pd.DataFrame(imputed_df, columns=numeric_df.columns)

    categorical_features = data_dict[
        data_dict["Data Type"] != "numeric"]["Variable Name"].tolist()

    # remove 'bmi', 'apache_2_diagnosis', 'apache_3j_diagnosis': not categorical
    categorical_features = [
        feature for feature in categorical_features
        if feature not in ["bmi", "apache_2_diagnosis", "apache_3j_diagnosis"]
    ]

    skewed_categorical_features = df.columns[df.columns.isin(
        categorical_features)]

    categorical_df = df[skewed_categorical_features].copy()

    # fill nulls with the most frequent value
    # (Series.mode() returns a Series, so [0] takes the first mode)
    for feature in skewed_categorical_features:
        categorical_df[feature].fillna(categorical_df[feature].mode()[0],
                                       inplace=True)

    complete_df = pd.concat([imputed_df, categorical_df], axis=1)

    return complete_df
Example #28
    def fit(self, X, y=None):
        """Perform co-clustering.

        Parameters
        ----------
        X : numpy array or scipy sparse matrix, shape=(n_samples, n_features)
            Matrix to be analyzed
        """
        random_state = check_random_state(self.random_state)

        check_array(X, accept_sparse=True, dtype="numeric", order=None,
                    copy=False, force_all_finite="allow-nan", ensure_2d=True,
                    allow_nd=False, ensure_min_samples=self.n_row_clusters,
                    ensure_min_features=self.n_col_clusters,
                    estimator=None)

        # Impute missing entries before clustering, remembering where they were.
        global indices
        indices = np.argwhere(np.isnan(X))
        if len(indices):
            imp = IterativeImputer(missing_values=np.nan, sample_posterior=False,
                                   max_iter=10, tol=0.001,
                                   n_nearest_features=4,
                                   initial_strategy='most_frequent')
            imp.fit(X)
            X = imp.transform(X)
        check_positive(X)

        X = X.astype(float)

        criterion = self.criterion
        criterions = self.criterions
        row_labels_ = self.row_labels_
        column_labels_ = self.column_labels_
        delta_kl_ = self.delta_kl_

        seeds = random_state.randint(np.iinfo(np.int32).max, size=self.n_init)
        for seed in seeds:
            self._fit_single(X, seed, y)
            if np.isnan(self.criterion):
                raise ValueError("matrix may contain negative or "
                                 "unexpected NaN values")
            # remember attributes corresponding to the best criterion
            if (self.criterion > criterion):
                criterion = self.criterion
                criterions = self.criterions
                row_labels_ = self.row_labels_
                column_labels_ = self.column_labels_
                delta_kl_ = self.delta_kl_

        # update attributes
        self.criterion = criterion
        self.criterions = criterions
        self.row_labels_ = row_labels_
        self.column_labels_ = column_labels_
        self.delta_kl_ = delta_kl_

        return self
Example #29
def train_model_iterative_fill(filename):
    pd.options.mode.chained_assignment = None

    df = pd.read_csv(filename, encoding = 'utf-16', sep = '\t')
    groups = list(set(df[PAGAL_KA_SUGRUPUOTI_SPEJIMUS].astype(int)))

    # DecisionTreeRegressor is reported to work best here
    estimators = [ExtraTreesRegressor(), BayesianRidge(), KNeighborsRegressor(),
                  DecisionTreeRegressor(), RandomForestRegressor()]
    # pick the estimator
    estimator = estimators[0]

    # whether small values should be dropped first
    new_filename = filename
    atmesti_mazas_reiksmes = True
    if atmesti_mazas_reiksmes:
        df = atmesti_mazas_tui(df)
        new_filename = filename.split('.')
        new_filename = new_filename[0] + '_be_mazu_tui.' + new_filename[1]
        df.to_csv(new_filename, sep = '\t', encoding = 'utf-16', index = False)

    for group in groups:

        print('Filling indicator %s' % group)
        maindf = pd.read_csv(new_filename, encoding = 'utf-16', sep = '\t')

        # select only the rows with this PAGAL_KA_SUGRUPUOTI_SPEJIMUS value
        df = maindf.loc[maindf[PAGAL_KA_SUGRUPUOTI_SPEJIMUS] == group]
        X = shuffle(df)

        # drop columns whose values cannot be converted to numbers
        for name in ATMESTI:
            X = X.drop(name, axis = 1)

        # drop empty columns (nothing can be predicted without any example)
        for col in X:
            if X[col].isnull().all():
                X = X.drop(col, axis = 1)
        
        # if there is at least one row that can be filled
        if len(X) > 0:
            index = list(X.index)
            columns = list(X.columns.values)
            # create and fit the imputation model
            imp = IterativeImputer(estimator = estimator, missing_values = np.nan)
            imp.fit(X)
            # fill the missing values of X; transform returns an np.ndarray,
            # so convert it back to a pandas.DataFrame
            X = imp.transform(X)
            X = pd.DataFrame(data = X, index = index, columns = columns)
            maindf.update(X)

            new_filename = new_filename.split('.')[0] + '_updated.' + new_filename.split('.')[1]
            # save the predictions
            maindf.to_csv(new_filename, sep = '\t', encoding = 'utf-16', index = False)

    # tidy up the file
    tidy_up_file(new_filename)
    return 0
Example #30
def imputation(df):
    ## Imputation replaces the NaNs with estimated values

    from sklearn.experimental import enable_iterative_imputer
    from sklearn.impute import IterativeImputer
    imputer = IterativeImputer(max_iter=10, random_state=0)
    imputer.fit(df)
    df_num_imp = imputer.transform(df)
    df = pd.DataFrame(df_num_imp, columns=df.columns)
    return df
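
This is the one snippet here that shows the required experimental enabling import explicitly. A quick demonstration on a toy frame:

import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1.0, 2.0, np.nan, 4.0],
                   "b": [2.0, np.nan, 6.0, 8.0]})
print(imputation(df))  # NaNs replaced by model-based estimates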
Example #31
 def internal(self, col_list):
     col_list1 = col_list.get('internal')
     data = self.data[self.data.columns.intersection(col_list1)]
     imp_mean = IterativeImputer(random_state=0)
     imp_mean.fit(data)
     data_iterative = pd.DataFrame(imp_mean.transform(data),
                                   columns=data.columns,
                                   index=data.index)
     data_iterative.to_csv('internal.csv', index=False)
     return data_iterative
Example #32
def test_iterative_imputer_verbose():
    rng = np.random.RandomState(0)

    n = 100
    d = 3
    X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray()
    imputer = IterativeImputer(missing_values=0, max_iter=1, verbose=1)
    imputer.fit(X)
    imputer.transform(X)
    imputer = IterativeImputer(missing_values=0, max_iter=1, verbose=2)
    imputer.fit(X)
    imputer.transform(X)
Example #33
def test_iterative_imputer_no_missing():
    rng = np.random.RandomState(0)
    X = rng.rand(100, 100)
    X[:, 0] = np.nan
    m1 = IterativeImputer(max_iter=10, random_state=rng)
    m2 = IterativeImputer(max_iter=10, random_state=rng)
    pred1 = m1.fit(X).transform(X)
    pred2 = m2.fit_transform(X)
    # should exclude the first column entirely
    assert_allclose(X[:, 1:], pred1)
    # fit(X).transform(X) and fit_transform(X) should give identical results
    assert_allclose(pred1, pred2)
Example #34
def test_iterative_imputer_transform_stochasticity():
    pytest.importorskip("scipy", minversion="0.17.0")
    rng1 = np.random.RandomState(0)
    rng2 = np.random.RandomState(1)
    n = 100
    d = 10
    X = sparse_random_matrix(n, d, density=0.10,
                             random_state=rng1).toarray()

    # when sample_posterior=True, two transforms shouldn't be equal
    imputer = IterativeImputer(missing_values=0,
                               max_iter=1,
                               sample_posterior=True,
                               random_state=rng1)
    imputer.fit(X)

    X_fitted_1 = imputer.transform(X)
    X_fitted_2 = imputer.transform(X)

    # sufficient to assert that the means are not the same
    assert np.mean(X_fitted_1) != pytest.approx(np.mean(X_fitted_2))

    # when sample_posterior=False, and n_nearest_features=None
    # and imputation_order is not random
    # the two transforms should be identical even if rng are different
    imputer1 = IterativeImputer(missing_values=0,
                                max_iter=1,
                                sample_posterior=False,
                                n_nearest_features=None,
                                imputation_order='ascending',
                                random_state=rng1)

    imputer2 = IterativeImputer(missing_values=0,
                                max_iter=1,
                                sample_posterior=False,
                                n_nearest_features=None,
                                imputation_order='ascending',
                                random_state=rng2)
    imputer1.fit(X)
    imputer2.fit(X)

    X_fitted_1a = imputer1.transform(X)
    X_fitted_1b = imputer1.transform(X)
    X_fitted_2 = imputer2.transform(X)

    assert_allclose(X_fitted_1a, X_fitted_1b)
    assert_allclose(X_fitted_1a, X_fitted_2)