Example #1
    def __init__(self,
                 images_dict,
                 percent_missing=0.25,
                 saved_image_stride=25,
                 dirname="face_images",
                 scale_rows=False,
                 center_rows=False):
        self.images_dict = images_dict
        self.labels = list(sorted(images_dict.keys()))
        self.images_array = np.array([images_dict[k]
                                      for k in self.labels]).astype("float32")
        self.image_shape = self.images_array[0].shape
        self.width, self.height = self.image_shape[:2]
        self.color = (len(self.image_shape) == 3) and (self.image_shape[2]
                                                       == 3)
        if self.color:
            self.images_array = color_balance(self.images_array)
        self.n_pixels = self.width * self.height
        self.n_features = self.n_pixels * (3 if self.color else 1)
        self.n_images = len(self.images_array)
        print(
            "[ResultsTable] # images = %d, color=%s # features = %d, shape = %s"
            % (self.n_images, self.color, self.n_features, self.image_shape))

        self.flattened_array_shape = (self.n_images, self.n_features)

        self.flattened_images = self.images_array.reshape(
            self.flattened_array_shape)

        n_missing_pixels = int(self.n_pixels * percent_missing)

        missing_square_size = int(np.sqrt(n_missing_pixels))
        print(
            "[ResultsTable] n_missing_pixels = %d, missing_square_size = %d" %
            (n_missing_pixels, missing_square_size))
        self.incomplete_images = remove_pixels(
            self.images_array, missing_square_size=missing_square_size)
        print("[ResultsTable] Incomplete images shape = %s" %
              (self.incomplete_images.shape, ))
        self.flattened_incomplete_images = self.incomplete_images.reshape(
            self.flattened_array_shape)
        self.missing_mask = np.isnan(self.flattened_incomplete_images)
        self.normalizer = BiScaler(scale_rows=scale_rows,
                                   center_rows=center_rows,
                                   min_value=self.images_array.min(),
                                   max_value=self.images_array.max())
        self.incomplete_normalized = self.normalizer.fit_transform(
            self.flattened_incomplete_images)

        self.saved_image_indices = list(
            range(0, self.n_images, saved_image_stride))
        self.saved_images = defaultdict(dict)
        self.dirname = dirname
        self.mse_dict = {}
        self.mae_dict = {}

        self.save_images(self.images_array, "original", flattened=False)
        self.save_images(self.incomplete_images, "incomplete", flattened=False)
Example #2
def complex_imputation(df, method='mice', neighbors=3):
    """
	Inputs:
	df -- dataframe of incomplete data
	method -- method of imputation
		- 'knn': Imputes using K Nearest Neighbors of completed rows
		- 'soft_impute': Imputes using iterative soft thresholding of SVD decompositions
		- 'mice': Imputes using Multiple Imputation by Chained Equations method
		- 'nuclear_nm': Imputation using Exact Matrix Completion via Convex Optimization method
		- 'matrix_factorization': Imputes by factorization of matrix in low-rank U and V
								  with L1 sparsity on U elements and L2 sparsity on V elements
		- 'iterative_svd': Imputes based on iterative low-rank SVD decomposition
	neighbors -- parameter for KNN imputation
	
	Output:
	Completed matrix
	"""
    # Create matrix of features
    X_incomplete = df.values
    # Normalize matrix by std and mean (0 mean, 1 variance); keep the fitted
    # scaler so the completed matrix can be mapped back to the original scale
    biscaler = BiScaler()
    X_incomplete_normalized = biscaler.fit_transform(X_incomplete)

    if method == 'knn':
        X_complete = KNN(neighbors).complete(X_incomplete)
        return fill_values(df, X_complete)

    if method == 'soft_impute':
        X_complete_normalized = SoftImpute().complete(X_incomplete_normalized)
        # invert with the *fitted* scaler, not a fresh BiScaler instance
        X_complete = biscaler.inverse_transform(X_complete_normalized)
        return fill_values(df, X_complete)

    if method == 'mice':
        X_complete = MICE().complete(X_incomplete)
        return fill_values(df, X_complete)

    if method == 'nuclear_nm':
        X_complete = NuclearNormMinimization().complete(X_incomplete)
        return fill_values(df, X_complete)

    if method == 'matrix_factorization':
        X_complete = MatrixFactorization().complete(X_incomplete)
        return fill_values(df, X_complete)

    if method == 'iterative_svd':
        X_complete = IterativeSVD().complete(X_incomplete)
        return fill_values(df, X_complete)

    raise ValueError("Unknown imputation method: %s" % method)
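A minimal driver for the function above, assuming the fill_values helper (defined elsewhere in the source) writes the completed matrix back into the DataFrame, and noting that the complete() calls belong to the pre-0.4 fancyimpute API:

import numpy as np
import pandas as pd

# toy frame with a few missing cells
df = pd.DataFrame({"a": [1.0, 2.0, np.nan, 4.0],
                   "b": [2.0, np.nan, 6.0, 8.0]})
# 'knn' fills each gap from the 2 nearest complete rows
df_completed = complex_imputation(df, method='knn', neighbors=2)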
Example #3
    def __init__(
            self,
            images_dict,
            percent_missing=0.25,
            saved_image_stride=25,
            dirname="face_images",
            scale_rows=False,
            center_rows=False):
        self.images_dict = images_dict
        self.labels = list(sorted(images_dict.keys()))
        self.images_array = np.array(
            [images_dict[k] for k in self.labels]).astype("float32")
        self.image_shape = self.images_array[0].shape
        self.width, self.height = self.image_shape[:2]
        self.color = (len(self.image_shape) == 3) and (self.image_shape[2] == 3)
        if self.color:
            self.images_array = color_balance(self.images_array)
        self.n_pixels = self.width * self.height
        self.n_features = self.n_pixels * (3 if self.color else 1)
        self.n_images = len(self.images_array)
        print("[ResultsTable] # images = %d, color=%s # features = %d, shape = %s" % (
            self.n_images, self.color, self.n_features, self.image_shape))

        self.flattened_array_shape = (self.n_images, self.n_features)

        self.flattened_images = self.images_array.reshape(self.flattened_array_shape)

        n_missing_pixels = int(self.n_pixels * percent_missing)

        missing_square_size = int(np.sqrt(n_missing_pixels))
        print("[ResultsTable] n_missing_pixels = %d, missing_square_size = %d" % (
            n_missing_pixels, missing_square_size))
        self.incomplete_images = remove_pixels(
            self.images_array,
            missing_square_size=missing_square_size)
        print("[ResultsTable] Incomplete images shape = %s" % (
            self.incomplete_images.shape,))
        self.flattened_incomplete_images = self.incomplete_images.reshape(
            self.flattened_array_shape)
        self.missing_mask = np.isnan(self.flattened_incomplete_images)
        self.normalizer = BiScaler(
            scale_rows=scale_rows,
            center_rows=center_rows,
            min_value=self.images_array.min(),
            max_value=self.images_array.max())
        self.incomplete_normalized = self.normalizer.fit_transform(
            self.flattened_incomplete_images)

        self.saved_image_indices = list(
            range(0, self.n_images, saved_image_stride))
        self.saved_images = defaultdict(dict)
        self.dirname = dirname
        self.mse_dict = {}
        self.mae_dict = {}

        self.save_images(self.images_array, "original", flattened=False)
        self.save_images(self.incomplete_images, "incomplete", flattened=False)
Example #4
    def impute(self, trained_model, input):
        """
        Loads the input table and gives the imputed table

        :param trained_model: trained model returned by train function - not used in our case
        :param input: input table which needs to be imputed
        :return:
            X_filled_softimpute: imputed table as a numpy array
        """
        X_incomplete = input
        softImpute = SoftImpute()
        biscaler = BiScaler()
        X_incomplete_normalized = biscaler.fit_transform(X_incomplete)
        X_filled_softimpute_normalized = softImpute.fit_transform(
            X_incomplete_normalized)
        X_filled_softimpute = biscaler.inverse_transform(
            X_filled_softimpute_normalized)
        return X_filled_softimpute
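A usage sketch for the method above; the enclosing class is not shown in this snippet, so SoftImputeModel is a hypothetical name, and input is any 2-D NumPy array with NaNs marking missing cells:

import numpy as np

X = np.array([[1.0, 2.0, np.nan],
              [3.0, np.nan, 6.0],
              [np.nan, 8.0, 9.0]])
# trained_model is unused by this implementation, so None is acceptable
X_filled = SoftImputeModel().impute(trained_model=None, input=X)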
Example #5
def get_imputer(imputer_name, **add_params):

    imputer_name = imputer_name.lower()

    if imputer_name == 'knn':
        return KNN(**add_params)
    elif imputer_name == 'nnm':
        return NuclearNormMinimization(**add_params)
    elif imputer_name == 'soft':
        return SoftImpute(**add_params)
    elif imputer_name == 'iterative':
        return IterativeImputer(**add_params)
    elif imputer_name == 'biscaler':
        return BiScaler(**add_params)
    else:
        print('Choose one of the predefined imputers')
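A usage sketch, assuming X_incomplete is a NumPy array with NaNs. One caveat: 'biscaler' returns a scaler rather than an imputer (its fit_transform normalizes rows and columns but leaves missing entries unfilled), so it is normally paired with one of the solvers above:

imputer = get_imputer('knn', k=3)
X_filled = imputer.fit_transform(X_incomplete)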
Example #6
def clean_input(data):
    cols = data.shape[1]
    for i in range(cols):
        curr = data[:, i]
        nans = np.isnan(curr)
        # column is entirely NaN: seed one cell with the stored column average
        if nans.all():
            data[0, i] = averages["avg_" + str(i)]
    if data.shape[0] == 1:
        norm = np.linalg.norm(data)
        if norm == 0:
            return data
        else:
            return data / norm
    data_normalized = BiScaler(verbose=False).fit_transform(data)
    data_filled = SoftImpute(verbose=False).fit_transform(data_normalized)
    return data_filled
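Note that clean_input returns values in the BiScaler-normalized space. If the caller needs the original scale, the fitted scaler has to be kept and inverted; a minimal sketch:

scaler = BiScaler(verbose=False)
data_normalized = scaler.fit_transform(data)
data_filled = scaler.inverse_transform(
    SoftImpute(verbose=False).fit_transform(data_normalized))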
Example #7
def fill_row(data):
    # replace NaNs in the single row with stored column averages
    for i in range(data.shape[1]):
        if np.isnan(data[0, i]):
            data[0, i] = averages["avg_" + str(i)]
    # BiScaler cannot normalize a single row, so append an all-NaN row and
    # move every odd-indexed value into it, leaving both rows half-observed
    tmp = np.zeros((1, data.shape[1]))
    tmp[:] = np.nan
    data = np.concatenate((data, tmp))
    for i in range(data.shape[1]):
        if i % 2 == 1:
            data[0, i], data[1, i] = data[1, i], data[0, i]
    data_normalized = BiScaler(verbose=False).fit_transform(data)
    data_filled = SoftImpute(verbose=False).fit_transform(data_normalized)
    # drop the helper row before returning
    data_filled = np.delete(data_filled, 1, 0)
    return data_filled
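A hedged usage sketch for fill_row, assuming an averages dict that maps "avg_<column>" to a column mean; as with clean_input above, the returned row is still in the BiScaler-normalized space:

row = np.array([[1.0, np.nan, 3.0, np.nan]])
completed_row = fill_row(row)  # shape (1, 4)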
Example #8
        100.0 * observed_mask.sum() / X.size))
    if args.save_incomplete_affinity_matrix:
        print("Saving incomplete data to %s" % args.save_incomplete_affinity_matrix)
        df = pd.DataFrame(X, columns=allele_list, index=peptide_list)
        df.to_csv(args.save_incomplete_affinity_matrix, index_label="peptide")

    scores = ScoreSet()
    kfold = stratified_cross_validation(
        X=X,
        observed_mask=observed_mask,
        n_folds=args.n_folds)
    for fold_idx, (X_fold, ok_mesh, test_coords, X_test_vector) in enumerate(kfold):
        X_fold_reduced = X_fold[ok_mesh]
        biscaler = BiScaler(
            scale_rows=args.normalize_rows,
            center_rows=args.normalize_rows,
            scale_columns=args.normalize_columns,
            center_columns=args.normalize_columns)
        X_fold_reduced_scaled = biscaler.fit_transform(X=X_fold_reduced)
        for (method_name, solver) in sorted(imputation_methods.items()):
            print("CV fold %d/%d, running %s" % (
                fold_idx + 1,
                args.n_folds,
                method_name))
            X_completed_reduced_scaled = solver.complete(X_fold_reduced_scaled)
            X_completed_reduced = biscaler.inverse_transform(
                X_completed_reduced_scaled)
            X_completed = np.zeros_like(X)
            X_completed[ok_mesh] = X_completed_reduced
            y_pred = X_completed[test_coords]
            mae, tau, auc, f1_score = evaluate_predictions(
Example #9
class ResultsTable(object):
    def __init__(self,
                 images_dict,
                 percent_missing=0.25,
                 saved_image_stride=25,
                 dirname="face_images",
                 scale_rows=False,
                 center_rows=False):
        self.images_dict = images_dict
        self.labels = list(sorted(images_dict.keys()))
        self.images_array = np.array([images_dict[k]
                                      for k in self.labels]).astype("float32")
        self.image_shape = self.images_array[0].shape
        self.width, self.height = self.image_shape[:2]
        self.color = (len(self.image_shape) == 3) and (self.image_shape[2]
                                                       == 3)
        if self.color:
            self.images_array = color_balance(self.images_array)
        self.n_pixels = self.width * self.height
        self.n_features = self.n_pixels * (3 if self.color else 1)
        self.n_images = len(self.images_array)
        print(
            "[ResultsTable] # images = %d, color=%s # features = %d, shape = %s"
            % (self.n_images, self.color, self.n_features, self.image_shape))

        self.flattened_array_shape = (self.n_images, self.n_features)

        self.flattened_images = self.images_array.reshape(
            self.flattened_array_shape)

        n_missing_pixels = int(self.n_pixels * percent_missing)

        missing_square_size = int(np.sqrt(n_missing_pixels))
        print(
            "[ResultsTable] n_missing_pixels = %d, missing_square_size = %d" %
            (n_missing_pixels, missing_square_size))
        self.incomplete_images = remove_pixels(
            self.images_array, missing_square_size=missing_square_size)
        print("[ResultsTable] Incomplete images shape = %s" %
              (self.incomplete_images.shape, ))
        self.flattened_incomplete_images = self.incomplete_images.reshape(
            self.flattened_array_shape)
        self.missing_mask = np.isnan(self.flattened_incomplete_images)
        self.normalizer = BiScaler(scale_rows=scale_rows,
                                   center_rows=center_rows,
                                   min_value=self.images_array.min(),
                                   max_value=self.images_array.max())
        self.incomplete_normalized = self.normalizer.fit_transform(
            self.flattened_incomplete_images)

        self.saved_image_indices = list(
            range(0, self.n_images, saved_image_stride))
        self.saved_images = defaultdict(dict)
        self.dirname = dirname
        self.mse_dict = {}
        self.mae_dict = {}

        self.save_images(self.images_array, "original", flattened=False)
        self.save_images(self.incomplete_images, "incomplete", flattened=False)

    def ensure_dir(self, dirname):
        if not exists(dirname):
            print("Creating directory: %s" % dirname)
            mkdir(dirname)

    def save_images(self, images, base_filename, flattened=True):
        self.ensure_dir(self.dirname)
        for i in self.saved_image_indices:
            label = self.labels[i].lower().replace(" ", "_")
            image = images[i, :].copy()
            if flattened:
                image = image.reshape(self.image_shape)
            image[np.isnan(image)] = 0
            figure = pylab.gcf()
            axes = pylab.gca()
            extra_kwargs = {}
            if not self.color:
                # grayscale arrays need an explicit colormap for imshow
                extra_kwargs["cmap"] = "gray"
            assert image.min() >= 0, "Image can't contain negative numbers"
            if image.max() <= 1:
                image *= 256
            image[image > 255] = 255
            axes.imshow(image.astype("uint8"), **extra_kwargs)
            axes.get_xaxis().set_visible(False)
            axes.get_yaxis().set_visible(False)
            filename = base_filename + ".png"
            subdir = join(self.dirname, label)
            self.ensure_dir(subdir)
            path = join(subdir, filename)
            figure.savefig(path, bbox_inches='tight')
            self.saved_images[i][base_filename] = path

    def add_entry(self, solver, name):
        print("Running %s" % name)
        completed_normalized = solver.fit_transform(self.incomplete_normalized)
        completed = self.normalizer.inverse_transform(completed_normalized)

        mae = masked_mae(X_true=self.flattened_images,
                         X_pred=completed,
                         mask=self.missing_mask)
        mse = masked_mse(X_true=self.flattened_images,
                         X_pred=completed,
                         mask=self.missing_mask)
        print("==> %s: MSE=%0.4f MAE=%0.4f" % (name, mse, mae))
        self.mse_dict[name] = mse
        self.mae_dict[name] = mae
        self.save_images(completed, base_filename=name)

    def sorted_errors(self):
        """
        Generator for (rank, name, MSE, MAE) sorted by increasing MAE
        """
        for i, (name, mae) in enumerate(
                sorted(self.mae_dict.items(), key=lambda x: x[1])):
            yield (
                i + 1,
                name,
                self.mse_dict[name],
                self.mae_dict[name],
            )

    def print_sorted_errors(self):
        for (rank, name, mse, mae) in self.sorted_errors():
            print("%d) %s: MSE=%0.4f MAE=%0.4f" % (rank, name, mse, mae))

    def save_html_table(self, filename="results_table.html"):
        html = """
            <table>
            <th>
                <td>Rank</td>
                <td>Name</td>
                <td>Mean Squared Error</td>
                <td>Mean Absolute Error</td>
            </th>
        """
        for (rank, name, mse, mae) in self.sorted_errors():
            html += """
            <tr>
                <td>%d</td>
                <td>%s</td>
                <td>%0.4f</td>
                <td>%0.4f</td>
            </tr>
            """ % (rank, name, mse, mae)
        html += "</table>"
        self.ensure_dir(self.dirname)
        path = join(self.dirname, filename)
        with open(path, "w") as f:
            f.write(html)
        return html
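A sketch of how this class might be driven end to end, assuming images_dict maps labels to float image arrays and that the helpers used above (color_balance, remove_pixels, masked_mae, masked_mse) are importable:

from fancyimpute import SoftImpute, IterativeSVD

table = ResultsTable(images_dict, percent_missing=0.25)
table.add_entry(SoftImpute(), "SoftImpute")
table.add_entry(IterativeSVD(rank=50), "IterativeSVD-50")
table.print_sorted_errors()
table.save_html_table()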
Example #10
    'random_state': 71,
    'silent': -1,
    'verbose': -1,
    'n_jobs': -1,
}
fit_params = {
    'eval_metric': 'auc',
    'early_stopping_rounds': 150,
    'verbose': 100
}

with timer('impute missing'):
    df = pd.concat([X_train, X_test], axis=0)
    df = df.loc[:, df.isnull().sum() != len(df)]
    cols = [f for f in df.columns if df[f].dtype != 'object']
    bi = BiScaler()
    df[cols] = bi.fit_transform(df[cols].values)
    df.fillna(-9999, inplace=True)
    X_train = df[:len(X_train)].copy()
    X_test = df[len(X_train):].copy()
    del bi, df, cols
    gc.collect()

with timer('training'):
    cv_results = []
    val_series = y_train.copy()
    test_df = pd.DataFrame()
    feat_df = None
    for i, (trn_idx, val_idx) in enumerate(cv.split(X_train, y_train)):
        X_trn = X_train.iloc[trn_idx].copy()
        y_trn = y_train[trn_idx]
Example #11
# Use 3 nearest rows which have a feature to fill in each row's missing features
knnImpute = KNN(k=3)
X_filled_knn = knnImpute.fit_transform(X_incomplete)

# matrix completion using convex optimization to find low-rank solution
# that still matches observed values. Slow!
X_filled_nnm = NuclearNormMinimization().fit_transform(X_incomplete)

# Instead of solving the nuclear norm objective directly,
# induce sparsity using singular value thresholding
softImpute = SoftImpute()

# simultaneously normalizes the rows and columns of your observed data,
# sometimes useful for low-rank imputation methods
biscaler = BiScaler()

# rescale both rows and columns to have zero mean and unit variance
X_incomplete_normalized = biscaler.fit_transform(X_incomplete)

X_filled_softimpute_normalized = softImpute.fit_transform(
    X_incomplete_normalized)
X_filled_softimpute = biscaler.inverse_transform(
    X_filled_softimpute_normalized)

X_filled_softimpute_no_biscale = softImpute.fit_transform(X_incomplete)

meanfill_mse = ((X_filled_mean[missing_mask] - X[missing_mask])**2).mean()
print("meanFill MSE: %f" % meanfill_mse)

# print mean squared error for the imputation methods above
Example #12
try:
    imputedData = mice.MICE().complete(missData)
    score = evaluate.RMSE(originData, imputedData)
    mice_rmse.append(score)
    logger.info("MICE missing rate:{},RMSE:{}".format(i, score))
except Exception:
    mice_rmse.append(np.nan)
try:
    imputedData = EM().complete(missData)
    score = evaluate.RMSE(originData, imputedData)
    em_rmse.append(score)
    logger.info("EM missing rate:{},RMSE:{}".format(i, score))
except Exception:
    em_rmse.append(np.nan)
try:
    imputedData = BiScaler().fit_transform(missData)
    imputedData = SoftImpute().fit_transform(imputedData)
    score = evaluate.RMSE(originData, imputedData)
    fi_bs_rmse.append(score)
    logger.info("fi BiScaler missing rate:{},RMSE:{}".format(i, score))
except Exception:
    fi_bs_rmse.append(np.nan)
try:
    imputedData = SoftImpute().fit_transform(missData)
    score = evaluate.RMSE(originData, imputedData)
    fi_si_rmse.append(score)
    logger.info("fi SoftImpute missing rate:{},RMSE:{}".format(i, score))
except Exception:
    fi_si_rmse.append(np.nan)
Example #13
# set random seed 2 ways cause I'm not sure what's appropriate, my suspicion
# is numpy
random.seed(123)
np.random.seed(123)

# read in data and transpose
data = pd.read_csv(input_file, sep='\t', header=0, index_col=0,
                   on_bad_lines='skip')
new_data = data.copy()
transposed = new_data.T

# we'll need a matrix specifically for the biscaler transform, for SoftImpute
print("SoftImpute...")
transposed_mat = transposed.to_numpy()
biscaler = BiScaler()

# perform the scaling appropriate for this imputation strategy
transposed_normalized = biscaler.fit_transform(transposed_mat)

# the imputation itself
imputed_softimpute = SoftImpute().fit_transform(transposed_normalized)

# we don't want the transformed values and we want samples to be columns
inverse_softimpute = biscaler.inverse_transform(imputed_softimpute)
untransposed_softimpute = inverse_softimpute.transpose()

# prepare to write to file, back to DataFrame, return indices
softimpute_df = pd.DataFrame(untransposed_softimpute)
softimpute_df.index = data.index
softimpute_df.columns = data.columns.values
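The DataFrame is now ready to be written back out; a minimal sketch, assuming an output_file path defined elsewhere in the script:

softimpute_df.to_csv(output_file, sep='\t')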
Example #14
# Use 3 nearest rows which have a feature to fill in each row's missing features
knnImpute = KNN(k=3)
X_filled_knn = knnImpute.complete(X_incomplete)

# matrix completion using convex optimization to find low-rank solution
# that still matches observed values. Slow!
X_filled_nnm = NuclearNormMinimization().complete(X_incomplete)

# Instead of solving the nuclear norm objective directly,
# induce sparsity using singular value thresholding
softImpute = SoftImpute()

# simultaneously normalizes the rows and columns of your observed data,
# sometimes useful for low-rank imputation methods
biscaler = BiScaler()

# rescale both rows and columns to have zero mean and unit variance
X_incomplete_normalized = biscaler.fit_transform(X_incomplete)

X_filled_softimpute_normalized = softImpute.complete(X_incomplete_normalized)
X_filled_softimpute = biscaler.inverse_transform(X_filled_softimpute_normalized)

X_filled_softimpute_no_biscale = softImpute.complete(X_incomplete)

meanfill_mse = ((X_filled_mean[missing_mask] - X[missing_mask]) ** 2).mean()
print("meanFill MSE: %f" % meanfill_mse)

# print mean squared error for the three imputation methods above
nnm_mse = ((X_filled_nnm[missing_mask] - X[missing_mask]) ** 2).mean()
print("Nuclear norm minimization MSE: %f" % nnm_mse)
Example #15
                                  min_improvement=1e-6).fit_transform(X_incomplete)

# matrix completion using Mean Fill
X_filled_meanfill = SimpleFill(fill_method='mean').fit_transform(X_incomplete)
# matrix completion using Median Fill
X_filled_medianfill = SimpleFill(fill_method='median').fit_transform(X_incomplete)
# matrix completion using Zero Fill
X_filled_zerofill = SimpleFill(fill_method='zero').fit_transform(X_incomplete)
# matrix completion using Min Fill
X_filled_minfill = SimpleFill(fill_method='min').fit_transform(X_incomplete)
# matrix completion using Sampled Fill
X_filled_randomfill = SimpleFill(fill_method='random').fit_transform(X_incomplete)

# Instead of solving the nuclear norm objective directly,
# induce sparsity using singular value thresholding
X_incomplete_normalized = BiScaler().fit_transform(X_incomplete)
X_filled_softimpute = SoftImpute().fit_transform(X_incomplete_normalized)

# print mean squared error for the imputation methods above
mice_mse = ((X_filled_mice[missing_mask] - X[missing_mask]) ** 2).mean()
print("MICE MSE: %f" % mice_mse)

svd_mse = ((X_filled_svd[missing_mask] - X[missing_mask]) ** 2).mean()
print("SVD MSE: %f" % svd_mse)

mf_mse = ((X_filled_mf[missing_mask] - X[missing_mask]) ** 2).mean()
print("Matrix Factorization MSE: %f" % mf_mse)

meanfill_mse = ((X_filled_meanfill[missing_mask] - X[missing_mask]) ** 2).mean()
print("MeanImpute MSE: %f" % meanfill_mse)
Example #16
def main(
        input_path: str = '/project/lindner/air-pollution/level3_data/',
        input_prefix: str = "Data_",
        input_suffix: str = "",
        output_path: str = '/project/lindner/air-pollution/current/2019/data-formatted/houston',
        year_begin: int = 2000,
        year_end: int = 2018,
        fillgps: bool = False,
        naninvalid: bool = False,
        dropnan: bool = False,
        masknan: float = None,
        fillnan: float = None,
        aqsnumerical: bool = False,
        houston: bool = False,
        chunksize: int = 200000):
    data1 = pd.read_csv(
        "/project/lindner/air-pollution/current/2019/data-formatted/concat_aqs/Transformed_Data_48_201_0695.csv"
    )
    data2 = pd.read_csv(
        "/project/lindner/air-pollution/current/2019/data-formatted/concat_aqs/Transformed_Data_48_201_0416.csv"
    )
    #Goal is to impute Park Place o3 from all other features
    y = data2['o3']
    data1 = data1.add_prefix('MoodyTowers_')
    data2 = data2.drop(['o3'], axis='columns').add_prefix('ParkPlace_')
    #Because of unneeded columns leftover from faulty script
    data1X = data1.replace('48_201_0695', 0)
    data2X = data2.replace('48_201_0416', 1)
    X = pd.concat([data1X, data2X], ignore_index=True)
    #X, y = X.dropna(), y.dropna()
    X = X.dropna(how='all', axis='columns')
    X, y = np.array(X), np.array(y)
    scaler = MinMaxScaler()
    X = BiScaler().fit_transform(X)
    X = SoftImpute().fit_transform(X)
    # BiScaler/SoftImpute expect a 2-D matrix, so impute y as a single column
    y = BiScaler().fit_transform(y.reshape(-1, 1))
    y = SoftImpute().fit_transform(y)
    #X = scaler.fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.25,
                                                        random_state=0)
    # Initialising the network (a stack of dense layers)
    regressor = Sequential()
    #Layers
    regressor.add(
        Dense(25,
              input_dim=21,
              activation='relu',
              kernel_initializer='he_uniform'))
    regressor.add(Dropout(0.2))
    regressor.add(
        Dense(25,
              input_dim=21,
              activation='relu',
              kernel_initializer='he_uniform'))
    regressor.add(Dropout(0.2))
    regressor.add(
        Dense(25,
              input_dim=21,
              activation='relu',
              kernel_initializer='he_uniform'))
    regressor.add(Dropout(0.2))
    # Adding the output layer
    regressor.add(Dense(units=1))

    # Compiling the network
    regressor.compile(optimizer='adam', loss='mean_squared_error')

    # Fitting the network to the training set
    history = regressor.fit(X_train,
                            y_train,
                            validation_data=(X_test, y_test),
                            epochs=100,
                            verbose=1)
    # evaluate the model
    train_mse = regressor.evaluate(X_train, y_train, verbose=0)
    test_mse = regressor.evaluate(X_test, y_test, verbose=0)
    print('Train: %.3f, Test: %.3f' % (train_mse, test_mse))
    # plot loss during training (save before show, which clears the figure)
    pyplot.title('Loss / Mean Squared Error')
    pyplot.plot(history.history['loss'], label='train')
    pyplot.plot(history.history['val_loss'], label='test')
    pyplot.legend()
    pyplot.savefig(output_path + "MSE_of_LSTM_model.png")
    pyplot.show()
    regressor.save(output_path + "model.h5")
Example #17
class ResultsTable(object):

    def __init__(
            self,
            images_dict,
            percent_missing=0.25,
            saved_image_stride=25,
            dirname="face_images",
            scale_rows=False,
            center_rows=False):
        self.images_dict = images_dict
        self.labels = list(sorted(images_dict.keys()))
        self.images_array = np.array(
            [images_dict[k] for k in self.labels]).astype("float32")
        self.image_shape = self.images_array[0].shape
        self.width, self.height = self.image_shape[:2]
        self.color = (len(self.image_shape) == 3) and (self.image_shape[2] == 3)
        if self.color:
            self.images_array = color_balance(self.images_array)
        self.n_pixels = self.width * self.height
        self.n_features = self.n_pixels * (3 if self.color else 1)
        self.n_images = len(self.images_array)
        print("[ResultsTable] # images = %d, color=%s # features = %d, shape = %s" % (
            self.n_images, self.color, self.n_features, self.image_shape))

        self.flattened_array_shape = (self.n_images, self.n_features)

        self.flattened_images = self.images_array.reshape(self.flattened_array_shape)

        n_missing_pixels = int(self.n_pixels * percent_missing)

        missing_square_size = int(np.sqrt(n_missing_pixels))
        print("[ResultsTable] n_missing_pixels = %d, missing_square_size = %d" % (
            n_missing_pixels, missing_square_size))
        self.incomplete_images = remove_pixels(
            self.images_array,
            missing_square_size=missing_square_size)
        print("[ResultsTable] Incomplete images shape = %s" % (
            self.incomplete_images.shape,))
        self.flattened_incomplete_images = self.incomplete_images.reshape(
            self.flattened_array_shape)
        self.missing_mask = np.isnan(self.flattened_incomplete_images)
        self.normalizer = BiScaler(
            scale_rows=scale_rows,
            center_rows=center_rows,
            min_value=self.images_array.min(),
            max_value=self.images_array.max())
        self.incomplete_normalized = self.normalizer.fit_transform(
            self.flattened_incomplete_images)

        self.saved_image_indices = list(
            range(0, self.n_images, saved_image_stride))
        self.saved_images = defaultdict(dict)
        self.dirname = dirname
        self.mse_dict = {}
        self.mae_dict = {}

        self.save_images(self.images_array, "original", flattened=False)
        self.save_images(self.incomplete_images, "incomplete", flattened=False)

    def ensure_dir(self, dirname):
        if not exists(dirname):
            print("Creating directory: %s" % dirname)
            mkdir(dirname)

    def save_images(self, images, base_filename, flattened=True):
        self.ensure_dir(self.dirname)
        for i in self.saved_image_indices:
            label = self.labels[i].lower().replace(" ", "_")
            image = images[i, :].copy()
            if flattened:
                image = image.reshape(self.image_shape)
            image[np.isnan(image)] = 0
            figure = pylab.gcf()
            axes = pylab.gca()
            extra_kwargs = {}
            if not self.color:
                # grayscale arrays need an explicit colormap for imshow
                extra_kwargs["cmap"] = "gray"
            assert image.min() >= 0, "Image can't contain negative numbers"
            if image.max() <= 1:
                image *= 256
            image[image > 255] = 255
            axes.imshow(image.astype("uint8"), **extra_kwargs)
            axes.get_xaxis().set_visible(False)
            axes.get_yaxis().set_visible(False)
            filename = base_filename + ".png"
            subdir = join(self.dirname, label)
            self.ensure_dir(subdir)
            path = join(subdir, filename)
            figure.savefig(
                path,
                bbox_inches='tight')
            self.saved_images[i][base_filename] = path

    def add_entry(self, solver, name):
        print("Running %s" % name)
        completed_normalized = solver.complete(self.incomplete_normalized)
        completed = self.normalizer.inverse_transform(completed_normalized)

        mae = masked_mae(
            X_true=self.flattened_images,
            X_pred=completed,
            mask=self.missing_mask)
        mse = masked_mse(
            X_true=self.flattened_images,
            X_pred=completed,
            mask=self.missing_mask)
        print("==> %s: MSE=%0.4f MAE=%0.4f" % (name, mse, mae))
        self.mse_dict[name] = mse
        self.mae_dict[name] = mae
        self.save_images(completed, base_filename=name)

    def sorted_errors(self):
        """
        Generator for (rank, name, MSE, MAE) sorted by increasing MAE
        """
        for i, (name, mae) in enumerate(
                sorted(self.mae_dict.items(), key=lambda x: x[1])):
            yield (i + 1, name, self.mse_dict[name], self.mae_dict[name])

    def print_sorted_errors(self):
        for (rank, name, mse, mae) in self.sorted_errors():
            print("%d) %s: MSE=%0.4f MAE=%0.4f" % (
                rank,
                name,
                mse,
                mae))

    def save_html_table(self, filename="results_table.html"):
        html = """
            <table>
            <th>
                <td>Rank</td>
                <td>Name</td>
                <td>Mean Squared Error</td>
                <td>Mean Absolute Error</td>
            </th>
        """
        for (rank, name, mse, mae) in self.sorted_errors():
            html += """
            <tr>
                <td>%d</td>
                <td>%s</td>
                <td>%0.4f</td>
                <td>%0.4f</td>
            </tr>
            """ % (rank, name, mse, mae)
        html += "</table>"
        self.ensure_dir(self.dirname)
        path = join(self.dirname, filename)
        with open(path, "w") as f:
            f.write(html)
        return html
Example #18
def impute(data, method='mean', value=None, nan_value=np.nan):
    """
    Impute missing values on a numpy ndarray in a column-wise manner.
    
    ANTsR function: `antsrimpute`

    Arguments
    ---------
    data : numpy.ndarray
        data to impute

    method : string or float
        type of imputation method to use
        Options:
            mean
            median
            constant
            KNN
            BiScaler
            NuclearNormMinimization
            SoftImpute
            IterativeSVD

    value : scalar (optional)
        optional arguments for different methods
        if method == 'constant'
            constant value
        if method == 'KNN'
            number of nearest neighbors to use

    nan_value : scalar
        value which is interpreted as a missing value

    Returns
    -------
    ndarray if ndarray was given
    OR
    pd.DataFrame if pd.DataFrame was given

    Example
    -------
    >>> import ants
    >>> import numpy as np
    >>> data = np.random.randn(4,10)
    >>> data[2,3] = np.nan
    >>> data[3,5] = np.nan
    >>> data_imputed = ants.impute(data, 'mean')

    Details
    -------
    KNN: Nearest neighbor imputations which weights samples using the mean squared 
            difference on features for which two rows both have observed data.

    SoftImpute: Matrix completion by iterative soft thresholding of SVD 
                decompositions. Inspired by the softImpute package for R, which 
                is based on Spectral Regularization Algorithms for Learning 
                Large Incomplete Matrices by Mazumder et al.

    IterativeSVD: Matrix completion by iterative low-rank SVD decomposition.
                    Should be similar to SVDimpute from Missing value estimation 
                    methods for DNA microarrays by Troyanskaya et al.

    MICE: Reimplementation of Multiple Imputation by Chained Equations.

    MatrixFactorization: Direct factorization of the incomplete matrix into 
                        low-rank U and V, with an L1 sparsity penalty on the elements 
                        of U and an L2 penalty on the elements of V. 
                        Solved by gradient descent.

    NuclearNormMinimization: Simple implementation of Exact Matrix Completion 
                            via Convex Optimization by Emmanuel Candes and Benjamin 
                            Recht using cvxpy. Too slow for large matrices.

    BiScaler: Iterative estimation of row/column means and standard deviations 
                to get doubly normalized matrix. Not guaranteed to converge but 
                works well in practice. Taken from Matrix Completion and 
                Low-Rank SVD via Fast Alternating Least Squares.
    """
    _fancyimpute_options = {
        'KNN', 'BiScaler', 'NuclearNormMinimization', 'SoftImpute',
        'IterativeSVD'
    }
    if (not has_fancyimpute) and (method in _fancyimpute_options):
        raise ValueError(
            'You must install `fancyimpute` (pip install fancyimpute) to use this method'
        )

    _base_options = {'mean', 'median', 'constant'}
    if (method not in _base_options) and (
            method not in _fancyimpute_options) and (not isinstance(
                method, (int, float))):
        raise ValueError(
            'method not understood. Use `mean`, `median`, a scalar, or an option from `fancyimpute`'
        )

    X_incomplete = data.copy()

    if method == 'KNN':
        if value is None:
            value = 3
        X_filled = KNN(k=value, verbose=False).complete(X_incomplete)

    elif method == 'BiScaler':
        X_filled = BiScaler(verbose=False).fit_transform(X_incomplete)

    elif method == 'SoftImpute':
        X_filled = SoftImpute(verbose=False).complete(X_incomplete)

    elif method == 'IterativeSVD':
        if value is None:
            rank = min(10, X_incomplete.shape[0] - 2)
        else:
            rank = value
        X_filled = IterativeSVD(rank=rank,
                                verbose=False).complete(X_incomplete)

    elif method == 'mean':
        col_means = np.nanmean(X_incomplete, axis=0)
        for i in range(X_incomplete.shape[1]):
            X_incomplete[:, i][np.isnan(X_incomplete[:, i])] = col_means[i]
        X_filled = X_incomplete

    elif method == 'median':
        col_medians = np.nanmedian(X_incomplete, axis=0)
        for i in range(X_incomplete.shape[1]):
            X_incomplete[:, i][np.isnan(X_incomplete[:, i])] = col_medians[i]
        X_filled = X_incomplete

    elif method == 'constant':
        if value is None:
            raise ValueError(
                'Must give `value` argument if method == constant')
        X_incomplete[np.isnan(X_incomplete)] = value
        X_filled = X_incomplete

    elif isinstance(method, (int, float)):
        # a scalar method is treated as a constant fill value
        X_incomplete[np.isnan(X_incomplete)] = method
        X_filled = X_incomplete

    return X_filled
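Extending the docstring example to the fancyimpute-backed paths (a sketch, assuming fancyimpute is installed; value doubles as the neighbor count for KNN):

import ants
import numpy as np

data = np.random.randn(6, 10)
data[2, 3] = np.nan
data[4, 7] = np.nan
data_knn = ants.impute(data, 'KNN', value=3)        # value = number of neighbors
data_const = ants.impute(data, 'constant', value=0.0)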
Example #19
    masked_ii = train_x_ii[mask, 2]

    # Use 3 nearest rows which have a feature to fill
    # in each row's missing features
    train_x_knn = KNN(k=3, verbose=False).fit_transform(train_x)
    masked_knn = train_x_knn[mask, 2]

    # matrix completion using convex optimization to find low-rank solution
    # that still matches observed values.
    # Slow!
    # train_x_nnm = NuclearNormMinimization().fit_transform(train_x)
    # imp_nnm = train_x_nnm[train_x.isnull().values]

    # Instead of solving the nuclear norm objective directly,
    # induce sparsity using singular value thresholding
    train_x_normalized = BiScaler(verbose=False).fit_transform(train_x)
    train_x_softimpute = SoftImpute(verbose=False).fit_transform(
        train_x_normalized)
    masked_soft = train_x_softimpute[mask, 2]

    # print mean squared error for the three imputation methods above
    ii_mse = ((masked_ii - masked_x) ** 2).mean()
    knn_mse = ((masked_knn - masked_x) ** 2).mean()
    soft_mse = ((masked_soft - masked_x) ** 2).mean()

    lrcv.fit(train_x_ii, train_y)
    print("Iterative Imputer\nImputed MSE : {:5f}".format(ii_mse))
    print('Ridge alpha : {}'.format(lrcv.alpha_))
    ridge = Ridge(alpha=lrcv.alpha_, random_state=SEED + i)
    cvs = cross_val_score(ridge,
                          train_x_ii, train_y,