Example #1
def em_algorithm(data, loops=50, dtype="cont"):
    if not checks(data):
        raise Exception("Checks failed")
    if dtype == "cont":
        null_xy = find_null(data)
        for x_i, y_i in null_xy:
            col = data[:, int(y_i)]
            mu = col[~np.isnan(col)].mean()
            std = col[~np.isnan(col)].std()
            col[x_i] = random.gauss(mu, std)
            previous = 1
            for i in range(loops):
                # Expectation: re-estimate mean and std from the column
                mu = col[~np.isnan(col)].mean()
                std = col[~np.isnan(col)].std()
                # Maximization: redraw the missing value from the new estimate
                col[x_i] = random.gauss(mu, std)
                # Stop once the estimate changes by less than 10% after at
                # least 5 iterations
                delta = abs((col[x_i] - previous) / previous)
                if i > 5 and delta < 0.1:
                    data[x_i][y_i] = col[x_i]
                    break
                data[x_i][y_i] = col[x_i]
                previous = col[x_i]
        return data
    else:
        raise Exception("Other dtypes not supported yet.")
Example #2
 def test_1d(self):
     """ Check 1d array, should return false"""
     _redirect_stdout(True)
     arr = np.array([np.nan, 2])
     output = checks(arr)
     _redirect_stdout(False)
     self.assertFalse(output)
Example #3
def from_before_observation(data, axis=0):
    if not checks(data):
        raise Exception("Checks failed")

    if axis == 0:
        data = np.transpose(data)
    elif axis == 1:
        pass

    null_xy = find_null(data)
    for x_i, y_i in null_xy:
        # Simplest scenario, look one row back
        if x_i - 1 > -1:
            data[x_i][y_i] = data[x_i - 1][y_i]
        # Otherwise look forward for the first non-NaN value
        else:
            x_residuals = np.shape(data)[0] - x_i - 1  # n data points left
            val_found = False
            for i in range(1, x_residuals + 1):
                if not np.isnan(data[x_i + i][y_i]):
                    val_found = True
                    break
            if val_found:
                # Backfill every row up to the first observed value
                for x_nan in range(i):
                    data[x_i + x_nan][y_i] = data[x_i + i][y_i]
            else:
                raise Exception("Entire column is NaN")
    return data
Example #4
def arima(data, p, d, q):
    """Autoregressive Integrated Moving Average Imputation

    Parameters
    ----------
    data: numpy.ndarray
        The matrix with missing values that you want to impute
    p: int
        Number of autoregressive terms
    d: int
        Number of nonseasonal differences needed for stationarity
    q: int
        Number of lagged forecast errors in the prediction equation

    Returns
    -------
    numpy.ndarray
    """
    # Verify inputs
    if not checks(data):
        raise Exception("Checks failed")
    if not isinstance(data, np.ndarray):
        raise Exception("data must be a numpy.ndarray")
    try:
        p = int(p)
        d = int(d)
        q = int(q)
    except (TypeError, ValueError):
        raise Exception("p, d and q must be integers")
    # Arima (the imputation itself is not implemented yet; this only reports
    # the locations of the missing values)
    null_xy = find_null(data)
    for x, y in null_xy:
        print(x, y)
    return data
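The body above only locates the missing values. Below is a hedged sketch of how the (p, d, q) order could be used to impute a single series with statsmodels, whose state-space ARIMA model can handle NaN observations, so its in-sample predictions can stand in for the missing entries; this is an illustration, not the library's implementation.

import numpy as np
from statsmodels.tsa.arima.model import ARIMA

series = np.array([1.0, 1.2, 1.1, 1.3, np.nan, np.nan, 1.4, 1.5])
fitted = ARIMA(series, order=(1, 0, 0)).fit()        # p=1, d=0, q=0
pred = fitted.predict(start=0, end=len(series) - 1)  # in-sample predictions
filled = np.where(np.isnan(series), pred, series)    # fill only the gaps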
Example #5
 def test_3d_false(self):
     """ Check 3d array without setting allow_3d=True, should return false"""
     _redirect_stdout(True)
     arr = np.ones((5, 5, 3))
     arr[0][0][0] = np.nan
     output = checks(arr)
     _redirect_stdout(False)
     self.assertFalse(output)
Example #6
 def test_correct_input(self):
     """ Test that an array that satisfies all checks returns True"""
     # Integer np.ndarray (checks: `_is_ndarray`, `_shape_2d`)
     arr = np.array([[1, 2], [3, 4]])
     # Cast the integer array to float (check: `_dtype_float`); assigning to
     # `arr.dtype` would reinterpret the buffer instead of converting values
     arr = arr.astype(float)
     # Add a nan value to the array (check: `_nan_exists`)
     arr[0][0] = np.nan
     output = checks(arr)
     self.assertTrue(output)
Example #7
def random_imputation(data):
    if not checks(data):
        raise Exception("Checks failed")
    null_xy = find_null(data)
    for x, y in null_xy:
        # Candidate values: the unique non-NaN entries of the same column
        uniques = np.unique(data[:, y])
        uniques = uniques[~np.isnan(uniques)]
        data[x][y] = np.random.choice(uniques)
    return data
Example #8
def locf(data, axis=0):
    """ Last Observation Carried Forward

    For each set of missing indices, use the value of one row before (same
    column). In the case that the missing value is the first row, look one
    row ahead instead. If this next row is also NaN, look to the next row.
    Repeat until you find a row in this column that's not NaN. All the rows
    before will be filled with this value.

    Parameters
    ----------
    data: numpy.ndarray
        Data to impute.
    axis: int (optional)
        0 if time series is in row format (Ex. data[0][:] is 1st data point).
        1 if time series is in col format (Ex. data[:][0] is 1st data point).

    Returns
    -------
    numpy.ndarray
        Imputed data.

    """
    if not checks(data):
        raise Exception("Checks failed")

    if axis == 0:
        data = np.transpose(data)
    elif axis == 1:
        pass

    null_xy = find_null(data)
    for x_i, y_i in null_xy:
        # Simplest scenario, look one row back
        if x_i - 1 > -1:
            data[x_i][y_i] = data[x_i - 1][y_i]
        # Look n rows forward
        else:
            x_residuals = np.shape(data)[0] - x_i - 1  # n datapoints left
            val_found = False
            for i in range(1, x_residuals + 1):
                if not np.isnan(data[x_i + i][y_i]):
                    val_found = True
                    break
            if val_found:
                # pylint: disable=undefined-loop-variable
                for x_nan in range(i):
                    data[x_i + x_nan][y_i] = data[x_i + i][y_i]
            else:
                raise Exception("Entire column is NaN")
    return data
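A brief usage sketch of `locf` (my own illustration, assuming numpy and the `checks`/`find_null` stubs sketched after Example #1 are in scope); with axis=1 no transpose is applied and time runs down the rows.

import numpy as np

data = np.array([[np.nan, 1.0],
                 [2.0, np.nan],
                 [3.0, 5.0]])
result = locf(data, axis=1)
# Column 0: the leading NaN is backfilled from the next observation (2.0).
# Column 1: the NaN in row 1 carries the previous observation (1.0) forward.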
Example #9
def em(data, loops=50, dtype="cont"):
    """ Imputes given data using expectation maximization.

    E-step: Calculates the expected complete data log likelihood ratio.
    M-step: Finds the parameters that maximize the log likelihood of the
    complete data.

    Parameters
    ----------
    data: numpy.ndarray
        Data to impute.
    loops: int
        Number of em iterations to run before breaking.
    dtype: ("cont","disc")
        Indicates whether the possible values will come from a continuous
        range or categorical range.

    Returns
    -------
    numpy.ndarray
        Imputed data.

    """
    if not checks(data):
        raise Exception("Checks failed")
    if dtype == "cont":
        null_xy = find_null(data)
        for x_i, y_i in null_xy:
            col = data[:, int(y_i)]
            mu = col[~np.isnan(col)].mean()
            std = col[~np.isnan(col)].std()
            col[x_i] = random.gauss(mu, std)
            previous = 1
            for i in range(loops):
                # Expectation
                mu = col[~np.isnan(col)].mean()
                std = col[~np.isnan(col)].std()
                # Maximization
                col[x_i] = random.gauss(mu, std)
                # Break out of the loop once the estimate changes by less than
                # 10% and the loop has run at least 5 times
                delta = abs((col[x_i] - previous) / previous)
                if i > 5 and delta < 0.1:
                    data[x_i][y_i] = col[x_i]
                    break
                data[x_i][y_i] = col[x_i]
                previous = col[x_i]
        return data
    else:
        raise Exception("Other dtypes not supported yet.")
Example #10
def mean_imputation(data):
    """ Substitute missing values with the mean of that column.

    Parameters
    ----------
    data: numpy.ndarray
        Data to impute.

    Returns
    -------
    numpy.ndarray
        Imputed data.

    """
    if not checks(data):
        raise Exception("Checks failed")
    null_xy = find_null(data)
    for x_i, y_i in null_xy:
        # Column containing the NaN, with the NaN entries removed
        col_wo_nan = data[:, [y_i]][~np.isnan(data[:, [y_i]])]
        new_value = np.mean(col_wo_nan)
        data[x_i][y_i] = new_value
    return data
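The same column-mean substitution can also be written without an explicit loop; a vectorized sketch, assuming a 2-D float array:

import numpy as np

data = np.array([[1.0, np.nan], [3.0, 4.0], [5.0, 8.0]])
col_means = np.nanmean(data, axis=0)                 # per-column means, ignoring NaNs
imputed = np.where(np.isnan(data), col_means, data)  # broadcast the means into the gaps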
Example #11
def random_imputation(data):
    """ Fill missing values in with a randomly selected value from the same
    column.

    Parameters
    ----------
    data: numpy.ndarray
        Data to impute.

    Returns
    -------
    numpy.ndarray
        Imputed data.

    """
    if not checks(data):
        raise Exception("Checks failed")
    null_xy = find_null(data)
    for x, y in null_xy:
        uniques = np.unique(data[:, y])
        uniques = uniques[~np.isnan(uniques)]
        data[x][y] = np.random.choice(uniques)
    return data
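A short usage sketch (illustration only; assumes numpy and the `checks`/`find_null` stubs sketched after Example #1):

import numpy as np

np.random.seed(0)  # make the random draw repeatable
data = np.array([[1.0, np.nan],
                 [2.0, 5.0],
                 [1.0, 7.0]])
result = random_imputation(data)
# The NaN at (0, 1) is replaced by one of the observed values in column 1 (5.0 or 7.0).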
Example #12
 def test_not_nparray(self):
     """ If not an np.array, should return false"""
     _redirect_stdout(True)
     output = checks([[np.nan, 2.], [3, 4]])
     _redirect_stdout(False)
     self.assertEqual(output, False)
Example #13
 def test_return_type(self):
     """ Check return type, should return a boolean"""
     _redirect_stdout(True)
     output = checks(np.array([[1., 2.], [3, 4]]))
     _redirect_stdout(False)
     self.assertIsInstance(output, bool)
Example #14
 def test_2d(self):
     """ Check 2d array, should return true"""
     arr = np.array([[1., 2.], [3, 4]])
     arr[0][0] = np.nan
     output = checks(arr)
     self.assertTrue(output)
Example #15
 def test_3d_true(self):
     """ Check 3d array setting allow_3d=True, should return true"""
     arr = np.ones((5, 5, 3))
     arr[0][0][0] = np.nan
     output = checks(arr, allow_3d=True)
     self.assertTrue(output)
Example #16
def mice(data):
    """Multivariate Imputation by Chained Equations

    Reference:
        Buuren, S. V., & Groothuis-Oudshoorn, K. (2011). Mice: Multivariate
        Imputation by Chained Equations in R. Journal of Statistical Software,
        45(3). doi:10.18637/jss.v045.i03

    Implementation follows the main idea from the paper above. Differs in
    decision of which variable to regress on (here, I choose it at random).
    Also differs in stopping criterion (here the model stops once the change
    from the previous prediction is less than 10%).

    Parameters
    ----------
    data: numpy.ndarray
        Data to impute.

    Returns
    -------
    numpy.ndarray
        Imputed data.

    """
    if not checks(data):
        raise Exception("Checks failed")
    null_xy = find_null(data)

    # Add a column of zeros to the index values
    null_xyv = np.append(null_xy, np.zeros((np.shape(null_xy)[0], 1)), axis=1)

    null_xyv = [[int(x), int(y), v] for x, y, v in null_xyv]
    temp = []
    cols_missing = set([y for _, y, _ in null_xyv])

    # Step 1: Simple Imputation, these are just placeholders
    for x_i, y_i, value in null_xyv:
        # Column containing nan value without the nan value
        col = data[:, [y_i]][~np.isnan(data[:, [y_i]])]

        new_value = np.mean(col)
        data[x_i][y_i] = new_value
        temp.append([x_i, y_i, new_value])
    null_xyv = temp

    # Step 5: Repeat steps 2-4 until every missing entry has converged
    converged = [False] * len(null_xyv)
    while not all(converged):
        # Step 2: Placeholders are set back to missing for one variable/column
        dependent_col = int(np.random.choice(list(cols_missing)))
        missing_xs = [int(x) for x, y, value in null_xyv if y == dependent_col]

        # Step 3: Perform linear regression using the other variables
        x_train, y_train = [], []
        for x_i in (x_i for x_i in range(len(data)) if x_i not in missing_xs):
            x_train.append(np.delete(data[x_i], dependent_col))
            y_train.append(data[x_i][dependent_col])
        model = LinearRegression()
        model.fit(x_train, y_train)

        # Step 4: Missing values for the missing variable/column are replaced
        # with predictions from our new linear regression model
        temp = []
        # Update the tracked null entries; only those in the chosen dependent
        # column get a new prediction this round
        for i, (x_i, y_i, value) in enumerate(null_xyv):
            if y_i == dependent_col:
                # Row 'x' without the entry in the dependent column
                row_wo_dep = np.delete(data[x_i], dependent_col).reshape(1, -1)
                new_value = model.predict(row_wo_dep)[0]
                data[x_i][y_i] = new_value
                temp.append([x_i, y_i, new_value])
                delta = abs((new_value - value) / value)
                if delta < 0.1:
                    converged[i] = True
            else:
                temp.append([x_i, y_i, value])
        null_xyv = temp
    return data
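A minimal usage sketch of `mice` (assumes numpy, the scikit-learn LinearRegression import used above, and the `checks`/`find_null` stubs sketched after Example #1); column 1 is roughly twice column 0, so the missing entry gets regressed from column 0.

import numpy as np

data = np.array([[1.0, 2.0],
                 [2.0, np.nan],
                 [3.0, 6.0],
                 [4.0, 8.0]])
imputed = mice(data)
# The NaN at (1, 1) starts at the column mean and is then refined by a linear
# regression on column 0 until the prediction stops changing.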
Example #17
 def test_nan_exists(self):
     """ If no NaN, should return false"""
     _redirect_stdout(True)
     output = checks(np.array([[1., 2.], [3, 4]]))
     _redirect_stdout(False)
     self.assertEqual(output, False)