Exemplo n.º 1
0
    def apply_as_distribution(self, model):
        """Combine this model with ``model``, using this model's y values as ``model``'s x values.

        Args:
            model: An object exposing ``kind()``, ``xx`` and ``get_xx()``.
                A ``'ddp_model'`` additionally needs ``lin_p()``,
                ``yy_is_categorical`` and ``get_yy()``; a ``'spline_model'``
                needs ``get_conditional()``.

        Returns:
            A ``DDPModel`` when ``model`` is a ``'ddp_model'``, or a
            ``SplineModel`` when it is a ``'spline_model'``.

        Raises:
            ValueError: If the number of x values in ``model`` differs from the
                number of y values here, if the values do not match, or if
                ``model.kind()`` is not one of the two supported kinds.
        """
        if model.kind() == 'ddp_model':
            if len(model.xx) != len(self.yy):
                raise ValueError("Wrong number of values.")

            # The model's x values must equal our y values position by position.
            if any(model.get_xx()[pos] != self.get_yy()[pos]
                   for pos in range(len(model.xx))):
                raise ValueError("Wrong values in distribution.")

            combined = self.lin_p().dot(model.lin_p())

            return DDPModel('ddp1', 'distribution',
                            self.xx_is_categorical, self.get_xx(),
                            model.yy_is_categorical, model.get_yy(),
                            combined, scaled=self.scaled)

        if model.kind() == 'spline_model':
            if len(model.xx) != len(self.yy):
                raise ValueError("Wrong number of values.")

            # Here order is free: each model x value only has to appear among
            # our y values.
            if any(model.get_xx()[pos] not in self.get_yy()
                   for pos in range(len(model.xx))):
                raise ValueError("Wrong values in distribution.")

            weights = self.lin_p()
            mixtures = []

            for row in range(len(self.xx)):
                scaled_parts = []

                for col in range(len(model.xx)):
                    source = model.get_conditional(model.get_xx()[col])
                    # Work on a fresh conditional built from the source's
                    # pieces rather than scaling the model's own object.
                    part = SplineModelConditional(
                        source.y0s, source.y1s, source.coeffs)
                    weight = weights[
                        row, self.get_yy().index(model.get_xx()[col])]
                    part.scale(weight)
                    scaled_parts.append(part)

                mixtures.append(
                    SplineModelConditional.approximate_sum(scaled_parts))

            return SplineModel(self.xx_is_categorical,
                               self.get_xx(),
                               mixtures,
                               scaled=self.scaled)

        raise ValueError("Unknown model type in apply_as_distribution")
Exemplo n.º 2
0
    def skew_gaussian_construct(ys, lps, low_segment, high_segment):
        """Join a low tail, an interpolated middle and a high tail into one conditional.

        The middle is a quadratic (``k=2``) interpolating spline through the
        points ``(ys, lps)``, converted to segments covering the gap between
        the end of ``low_segment`` and the start of ``high_segment``.

        Args:
            ys: Sample locations for the middle spline.
            lps: Values at ``ys`` (one per location).
            low_segment: Conditional whose first segment is the lower tail.
            high_segment: Conditional whose first segment is the upper tail.

        Returns:
            The assembled conditional after ``rescale()``, or ``None`` if
            rescaling raises.
        """
        mid_segment = SplineModelConditional.make_conditional_from_spline(
            InterpolatedUnivariateSpline(ys, lps, k=2),
            (low_segment.y1s[0], high_segment.y0s[0]))

        conditional = SplineModelConditional()
        # Coefficients of the tails are copied so that later changes to the
        # assembled conditional cannot alter the caller's segments.
        conditional.add_segment(low_segment.y0s[0], low_segment.y1s[0],
                                copy.copy(low_segment.coeffs[0]))
        for ii in range(mid_segment.size()):
            conditional.add_segment(mid_segment.y0s[ii], mid_segment.y1s[ii],
                                    mid_segment.coeffs[ii])
        conditional.add_segment(high_segment.y0s[0], high_segment.y1s[0],
                                copy.copy(high_segment.coeffs[0]))

        try:
            conditional.rescale()
        except Exception:
            # Best effort: a candidate that cannot be rescaled is reported as
            # "no result". KeyboardInterrupt/SystemExit are deliberately not
            # swallowed.
            return None

        return conditional
Exemplo n.º 3
0
    def evaluate_spline(header, row, spline, limits):
        """Score how far ``spline`` is from the summary features given in ``row``.

        The spline is exponentiated and normalised on a grid clipped to both
        ``limits`` and the spline's knot range; the resulting discrete
        distribution is compared with each feature named in ``header``.

        Args:
            header: Feature names; a float entry is looked up in the cumulative
                distribution via ``SplineModelConditional.find_nearest``, and
                the strings ``'mean'``, ``'mode'``, ``'var'`` and ``'skew'``
                are recognised. Entry 0 is skipped.
            row: Target values, aligned with ``header``.
            spline: Callable spline exposing ``get_knots()``.
            limits: Pair of (lower, upper) bounds; converted with ``float``.

        Returns:
            The accumulated error (0 when no feature is recognised).
        """
        knots = spline.get_knots()
        limits = (max(min(knots), float(limits[0])),
                  min(max(knots), float(limits[1])))

        count = len(header) * SplineModel.samples
        ys = np.linspace(limits[0], limits[1], count)
        ps = np.exp(spline(ys)) * (limits[1] - limits[0]) / count
        ps = ps / sum(ps)
        cfs = np.cumsum(ps)

        # Moments are computed once, and only when some feature needs them.
        if 'mean' in header or 'var' in header or 'skew' in header:
            mean = sum(ps * ys)
        if 'var' in header or 'skew' in header:
            var = sum(ps * np.square(ys - mean))

        error = 0
        for ii in range(1, len(header)):
            if isinstance(header[ii], float):
                error = error + np.abs(
                    SplineModelConditional.find_nearest(cfs, header[ii], ys) -
                    float(row[ii]))
            elif header[ii] == 'mean':
                error = error + np.abs(mean - float(row[ii]))
            elif header[ii] == 'mode':
                mode = ys[ps.argmax()]
                error = error + np.abs(mode - float(row[ii]))
            elif header[ii] == 'var':
                # Square root brings the variance error back to the units of y.
                error = error + np.sqrt(np.abs(var - float(row[ii])))
            elif header[ii] == 'skew':
                # np.power/np.sqrt: np.pow is missing before NumPy 2.0 and a
                # bare sqrt is not guaranteed to be imported.
                skew = sum(ps * np.power((ys - mean) / np.sqrt(var), 3))
                error = error + np.power(np.abs(skew - float(row[ii])), 1.0 / 3)

        return error
    def evaluate_spline(header, row, spline, limits):
        """Return the total mismatch between ``spline`` and the features in ``row``.

        ``np.exp(spline(y))`` is sampled on an evenly spaced grid, normalised
        to sum to one, and then compared feature by feature with the targets.
        The grid is restricted to the overlap of ``limits`` and the spline's
        knot range.

        Args:
            header: Feature names. Float entries are quantile levels resolved
                with ``SplineModelConditional.find_nearest``; ``'mean'``,
                ``'mode'``, ``'var'`` and ``'skew'`` are handled by name. The
                first entry is ignored.
            row: Target values in the same positions as ``header``.
            spline: Callable spline object with a ``get_knots()`` method.
            limits: (lower, upper) bounds, each passed through ``float``.

        Returns:
            Sum of the per-feature errors.
        """
        knots = spline.get_knots()
        lower = max(min(knots), float(limits[0]))
        upper = min(max(knots), float(limits[1]))

        samples = len(header) * SplineModel.samples
        ys = np.linspace(lower, upper, samples)
        ps = np.exp(spline(ys)) * (upper - lower) / samples
        ps = ps / sum(ps)
        cfs = np.cumsum(ps)

        # Only compute the moments that at least one requested feature uses.
        if 'mean' in header or 'var' in header or 'skew' in header:
            mean = sum(ps * ys)
        if 'var' in header or 'skew' in header:
            var = sum(ps * np.square(ys - mean))

        error = 0
        for ii in range(1, len(header)):
            if isinstance(header[ii], float):
                error = error + np.abs(
                    SplineModelConditional.find_nearest(cfs, header[ii], ys) -
                    float(row[ii]))
            elif header[ii] == 'mean':
                error = error + np.abs(mean - float(row[ii]))
            elif header[ii] == 'mode':
                mode = ys[ps.argmax()]
                error = error + np.abs(mode - float(row[ii]))
            elif header[ii] == 'var':
                error = error + np.sqrt(np.abs(var - float(row[ii])))
            elif header[ii] == 'skew':
                # np.pow only exists from NumPy 2.0 on, and sqrt was used
                # unqualified; np.power/np.sqrt work on every NumPy version.
                skew = sum(ps * np.power((ys - mean) / np.sqrt(var), 3))
                error = error + np.power(np.abs(skew - float(row[ii])), 1.0 / 3)

        return error
Exemplo n.º 5
0
    def features_to_exponential(header, row, limits):
        """Try to interpret a two-column row with a mean as a one-sided exponential.

        Args:
            header: Column names; must have exactly two entries, one of them
                ``'mean'``.
            row: Values aligned with ``header``.
            limits: Numeric (lower, upper) bounds of the support.

        Returns:
            A rescaled single-segment ``SplineModelConditional`` when the mean
            lies close enough to one limit (three times its distance from that
            limit still falls inside the other limit); otherwise ``None``.
        """
        if len(header) != 2:
            return None

        if 'mean' not in header:
            return None

        mean = float(row[header.index('mean')])
        lower, upper = limits[0], limits[1]

        # Is it one-sided?
        if mean > lower and lower + (mean - lower) * 3 < upper:
            # positive exponential
            return SplineModelConditional.make_single(
                lower, upper,
                [lower / (mean - lower), -1 / (mean - lower)]).rescale()

        # Both conditions are required, mirroring the test above. With `or`,
        # a mean equal to the upper limit divided by zero and a mid-range
        # mean was accepted as an exponential.
        if mean < upper and upper - (upper - mean) * 3 > lower:
            # negative exponential
            return SplineModelConditional.make_single(
                lower, upper,
                [-upper / (upper - mean), 1 / (upper - mean)]).rescale()

        return None
Exemplo n.º 6
0
def uniform_doseless(start, end, height=None):
    """Build a single-conditional ``SplineModel`` that is flat on ``[start, end]``.

    Args:
        start: Lower edge of the flat region.
        end: Upper edge of the flat region.
        height: Value of the flat region. When omitted it defaults to
            ``1 / (end - start)`` and the model is flagged as scaled.

    Returns:
        A categorical ``SplineModel`` with the single x value ``''`` whose
        conditional stores ``log(height)`` on ``[start, end]`` and
        ``SplineModel.neginf`` outside it.
    """
    scaled = height is None
    if scaled:
        # 1.0 forces true division: with integer bounds Python 2 would
        # otherwise produce 0 and math.log(0) below would raise.
        height = 1.0 / (end - start)

    conditional = SplineModelConditional()
    conditional.add_segment(SplineModel.neginf, start, [SplineModel.neginf])
    conditional.add_segment(start, end, [math.log(height)])
    conditional.add_segment(end, SplineModel.posinf, [SplineModel.neginf])

    return SplineModel(True, [''], [conditional], scaled)
Exemplo n.º 7
0
    def apply_as_distribution(self, model):
        """Treat this model as a distribution over ``model``'s x values and apply it.

        Args:
            model: Either a ``'ddp_model'`` or a ``'spline_model'`` (as
                reported by ``model.kind()``) whose x values correspond to
                this model's y values.

        Returns:
            A ``DDPModel`` built from the product of the two ``lin_p()``
            matrices for a ``'ddp_model'``; a ``SplineModel`` of weighted
            sums of ``model``'s conditionals for a ``'spline_model'``.

        Raises:
            ValueError: On a length mismatch, on mismatching values, or for
                any other model kind.
        """
        if model.kind() == 'ddp_model':
            if len(model.xx) != len(self.yy):
                raise ValueError("Wrong number of values.")

            # Values are compared index by index, so order matters here.
            mismatch = any(model.get_xx()[idx] != self.get_yy()[idx]
                           for idx in range(len(model.xx)))
            if mismatch:
                raise ValueError("Wrong values in distribution.")

            product = self.lin_p().dot(model.lin_p())

            return DDPModel('ddp1', 'distribution', self.xx_is_categorical, self.get_xx(), model.yy_is_categorical, model.get_yy(), product, scaled=self.scaled)

        elif model.kind() == 'spline_model':
            if len(model.xx) != len(self.yy):
                raise ValueError("Wrong number of values.")

            # Only membership is required here; positions are resolved below.
            unknown = any(model.get_xx()[idx] not in self.get_yy()
                          for idx in range(len(model.xx)))
            if unknown:
                raise ValueError("Wrong values in distribution.")

            probs = self.lin_p()
            summed = []

            for xi in range(len(self.xx)):
                pieces = []

                for mi in range(len(model.xx)):
                    base = model.get_conditional(model.get_xx()[mi])
                    piece = SplineModelConditional(base.y0s, base.y1s, base.coeffs)
                    # Weight is taken from the column whose y value equals
                    # this model x value.
                    piece.scale(probs[xi, self.get_yy().index(model.get_xx()[mi])])
                    pieces.append(piece)

                summed.append(SplineModelConditional.approximate_sum(pieces))

            return SplineModel(self.xx_is_categorical, self.get_xx(), summed, scaled=self.scaled)

        else:
            raise ValueError("Unknown model type in apply_as_distribution")
Exemplo n.º 8
0
def write_robberies(writer, row):
    """Write synthetic spline segments, one set per year, to ``writer``.

    For every year from 2010 through 2100 a randomly shifted sine-shaped curve
    over 20 income points is smoothed with a quadratic spline, converted to a
    conditional, and each of its segments is written as one row.

    Args:
        writer: Object with a ``writerow(list)`` method (e.g. a ``csv.writer``).
        row: Only printed; it does not influence what is written.
    """
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print(row)
    writer.writerow(["year", "income0", "income1", "coeff0", "coeff1", "coeff2"])
    incomes = np.linspace(0, 1e6, 20)
    for year in range(2010, 2100 + 1):
        # One random offset per year, growing with the distance from 2010.
        crime_fraction = np.sin(np.pi * incomes / 1e6) + random.uniform(.5, 1.5) * (1 + (year - 2010) / 200.0)
        crime = 200 * crime_fraction
        spline = UnivariateSpline(incomes, crime, s=10, k=2)
        conditional = SplineModelConditional.make_conditional_from_spline(spline, [0, 1e6])
        for jj in range(conditional.size()):
            # Separate name so the `row` argument is not rebound.
            segment_row = [year, conditional.y0s[jj], conditional.y1s[jj]]
            segment_row.extend(conditional.coeffs[jj])
            writer.writerow(segment_row)
Exemplo n.º 9
0
def uniform_doseless(start, end, height=None):
    """Return a one-conditional ``SplineModel`` that is constant between ``start`` and ``end``.

    Args:
        start: Left edge of the constant segment.
        end: Right edge of the constant segment.
        height: Value on the segment. If ``None``, ``1 / (end - start)`` is
            used and the returned model is marked as scaled.

    Returns:
        ``SplineModel(True, [''], [conditional], scaled)`` where the
        conditional holds ``math.log(height)`` on ``[start, end]`` and
        ``SplineModel.neginf`` on both sides.
    """
    if height is None:
        # Float numerator: integer bounds under Python 2 would floor the
        # quotient to 0, and math.log(0) raises ValueError.
        height = 1.0 / (end - start)
        scaled = True
    else:
        scaled = False

    conditional = SplineModelConditional()
    conditional.add_segment(SplineModel.neginf, start, [SplineModel.neginf])
    conditional.add_segment(start, end, [math.log(height)])
    conditional.add_segment(end, SplineModel.posinf, [SplineModel.neginf])

    return SplineModel(True, [''], [conditional], scaled)
Exemplo n.º 10
0
    def features_to_exponential(header, row, limits):
        """Return an exponential conditional when the row is a mean near one limit.

        Args:
            header: Exactly two column names, one of which is ``'mean'``.
            row: Values in the same order as ``header``.
            limits: Numeric (lower, upper) bounds.

        Returns:
            A rescaled single-segment ``SplineModelConditional`` anchored at
            the limit the mean is close to, or ``None`` when the header does
            not match or the mean is not one-sided.
        """
        if len(header) != 2:
            return None

        if 'mean' not in header:
            return None

        mean = float(row[header.index('mean')])

        # Is it one-sided?
        if mean > limits[0] and limits[0] + (mean - limits[0]) * 3 < limits[1]:
            # positive exponential
            return SplineModelConditional.make_single(
                limits[0], limits[1],
                [limits[0] / (mean - limits[0]), -1 /
                 (mean - limits[0])]).rescale()
        # `and`, not `or`: the mean must be strictly below the upper limit
        # (otherwise the divisions below hit zero) AND close enough to it.
        if mean < limits[1] and limits[1] - (limits[1] - mean) * 3 > limits[0]:
            # negative exponential
            return SplineModelConditional.make_single(
                limits[0], limits[1],
                [-limits[1] / (limits[1] - mean), 1 /
                 (limits[1] - mean)]).rescale()
        else:
            return None
Exemplo n.º 11
0
    def make_conditional(header, row, limits):
        """Build a conditional for one feature row, preferring the special-case shapes.

        The Gaussian, exponential and uniform interpreters are tried in that
        order; the first one that returns something other than ``None`` wins.
        If none applies, the result of ``best_spline`` is converted to a
        conditional and rescaled.

        Args:
            header: Feature names for the row.
            row: Feature values aligned with ``header``.
            limits: (lower, upper) bounds passed through to every interpreter.

        Returns:
            The conditional produced by the first matching strategy.
        """
        # Look for a special case; names are resolved one at a time, in order.
        special_cases = ('features_to_gaussian',
                         'features_to_exponential',
                         'features_to_uniform')
        for method_name in special_cases:
            candidate = getattr(FeaturesInterpreter, method_name)(header, row, limits)
            if candidate is not None:
                return candidate

        fitted = FeaturesInterpreter.best_spline(header, row, limits)
        return SplineModelConditional.make_conditional_from_spline(fitted, limits).rescale()
Exemplo n.º 12
0
    def make_conditional(header, row, limits):
        """Turn a feature row into a conditional distribution.

        Special-case interpreters run first (Gaussian, then exponential, then
        uniform). When all of them return ``None`` the row is fitted with
        ``best_spline`` and the spline is converted and rescaled.

        Args:
            header: Names of the features in ``row``.
            row: The feature values.
            limits: (lower, upper) bounds forwarded unchanged.

        Returns:
            The first non-``None`` special-case conditional, otherwise the
            rescaled spline-based conditional.
        """
        remaining = ['features_to_gaussian',
                     'features_to_exponential',
                     'features_to_uniform']

        # Look for a special case
        while remaining:
            interpret = getattr(FeaturesInterpreter, remaining.pop(0))
            result = interpret(header, row, limits)
            if result is not None:
                return result

        general = FeaturesInterpreter.best_spline(header, row, limits)
        result = SplineModelConditional.make_conditional_from_spline(
            general, limits)
        return result.rescale()
Exemplo n.º 13
0
    def features_to_gaussian(header, row, limits):
        """Try to interpret a feature row as a Gaussian or skewed-Gaussian conditional.

        Two layouts are recognised, selected by ``len(header)``:

        * 3 columns: a central value (``'mean'``, ``'mode'`` or ``.5``; when
          several are present the later one wins) and a spread (``'var'`` or
          ``'sdev'``).
        * 4 columns: the first column is dropped; the rest must contain a
          central value (``'mean'``, else ``'mode'``, else ``.5``) and two
          p-value columns whose headers are the probabilities.

        Args:
            header: Column names (strings and/or float probabilities).
            row: Values aligned with ``header``; converted with ``float``.
            limits: (lower, upper) bounds of the support.

        Returns:
            A ``SplineModelConditional``, or ``None`` when the row does not
            fit any recognised pattern or does not fit inside ``limits``.
        """
        # Does this look like a mean-variance feature file?
        if len(header) == 3:
            mean = None
            if 'mean' in header:
                mean = float(row[header.index('mean')])
            if 'mode' in header:
                mean = float(row[header.index('mode')])
            if .5 in header:
                mean = float(row[header.index(.5)])
            if mean is None:
                return None

            if 'var' in header:
                var = float(row[header.index('var')])
            elif 'sdev' in header:
                var = float(row[header.index('sdev')]) * float(row[header.index('sdev')])
            else:
                return None

            # No usable spread: represent the value as a single point.
            if np.isnan(var) or var == 0:
                return SplineModelConditional.make_single(mean, mean, [])

            # This might be uniform
            # NOTE(review): limits are compared without float() here, while the
            # 4-column branch converts them - confirm callers pass numbers.
            if mean - 2*var < limits[0] or mean + 2*var > limits[1]:
                return None

            return SplineModelConditional.make_gaussian(limits[0], limits[1], mean, var)
        elif len(header) == 4:
            # Does this look like a mean and evenly spaced p-values?
            header = header[1:] # Make a copy of the list
            row = row[1:]
            mean = None
            if 'mean' in header:
                mean = float(row.pop(header.index('mean')))
                header.remove('mean')
            elif 'mode' in header:
                mean = float(row.pop(header.index('mode')))
                header.remove('mode')
            elif .5 in header:
                mean = float(row.pop(header.index(.5)))
                header.remove(.5)
            else:
                return None

            # Check that the two other values are evenly spaced p-values.
            # list(...) because the values are indexed below; a bare map
            # object is not subscriptable on Python 3.
            row = list(map(float, row[0:2]))
            if np.all(np.isnan(row)):
                return SplineModelConditional.make_single(mean, mean, [])

            if header[1] == 1 - header[0] and abs(row[1] - mean - (mean - row[0])) < abs(row[1] - row[0]) / 1000.0:
                lowp = min(header)
                lowv = np.array(row)[np.array(header) == lowp][0]

                if lowv == mean:
                    return SplineModelConditional.make_single(mean, mean, [])

                # Bracket for the root search on the standard deviation.
                lowerbound = 1e-4 * (mean - lowv)
                upperbound = np.sqrt((mean - lowv) / lowp)

                sdev = brentq(lambda sdev: norm.cdf(lowv, mean, sdev) - lowp, lowerbound, upperbound)
                if float(limits[0]) < mean - 3*sdev and float(limits[1]) > mean + 3*sdev:
                    return SplineModelConditional.make_gaussian(limits[0], limits[1], mean, sdev*sdev)
                else:
                    return None
            else:
                # Heuristic best curve: known tails, fit to mean
                lowp = min(header)
                lowv = np.array(row)[np.array(header) == lowp][0]

                lowerbound = 1e-4 * (mean - lowv)
                # NOTE(review): np.log here vs np.sqrt in the symmetric branch
                # above - confirm which upper bracket is intended.
                upperbound = np.log((mean - lowv) / lowp)

                low_sdev = brentq(lambda sdev: norm.cdf(lowv, mean, sdev) - lowp, lowerbound, upperbound)
                if float(limits[0]) > mean - 3*low_sdev:
                    return None

                low_segment = SplineModelConditional.make_gaussian(float(limits[0]), lowv, mean, low_sdev*low_sdev)

                highp = max(header)
                highv = np.array(row)[np.array(header) == highp][0]

                lowerbound = 1e-4 * (highv - mean)
                upperbound = np.log((highv - mean) / (1 - highp))

                high_scale = brentq(lambda scale: .5 + expon.cdf(highv, mean, scale) / 2 - highp, lowerbound, upperbound)
                if float(limits[1]) < mean + 3*high_scale:
                    return None

                # Construct exponential, starting at mean, with full cdf of .5
                high_segment = SplineModelConditional.make_single(highv, float(limits[1]), [np.log(1/high_scale) + np.log(.5) + mean / high_scale, -1 / high_scale])

                # Five anchor points: two near each tail plus the mean itself.
                sevenys = np.linspace(lowv, highv, 7)
                ys = np.append(sevenys[0:2], [mean, sevenys[-2], sevenys[-1]])

                lps0 = norm.logpdf(ys[0:2], mean, low_sdev)
                lps1 = expon.logpdf([ys[-2], ys[-1]], mean, high_scale) + np.log(.5)

                # Only the value at the mean is free; it is tuned by the optimiser.
                result = minimize(lambda lpmean: FeaturesInterpreter.skew_gaussian_evaluate(ys, np.append(np.append(lps0, [lpmean]), lps1), low_segment, high_segment, mean, lowp, highp), .5, method='Nelder-Mead')
                lps = np.append(np.append(lps0, result.x), lps1)
                print(lps)
                return FeaturesInterpreter.skew_gaussian_construct(ys, lps, low_segment, high_segment)

        # Any other column count is not a Gaussian-style row.
        return None
Exemplo n.º 14
0
    def features_to_uniform(header, row, limits):
        """Return a single flat segment over ``limits`` for a one-column row.

        Args:
            header: Column names; must have exactly one entry.
            row: Unused.
            limits: Numeric (lower, upper) bounds of the segment.

        Returns:
            ``SplineModelConditional.make_single(lower, upper, [1 / width])``,
            or ``None`` when ``header`` does not have exactly one entry.
        """
        if len(header) != 1:
            return None

        # 1.0 forces true division; with integer limits Python 2 would floor
        # the coefficient to 0.
        # NOTE(review): other constructors in this code base pass log-scale
        # coefficients (e.g. math.log(height)); confirm whether this constant
        # should be a log as well.
        width = limits[1] - limits[0]
        return SplineModelConditional.make_single(limits[0], limits[1], [1.0 / width])
Exemplo n.º 15
0
    def features_to_gaussian(header, row, limits):
        """Interpret a feature row as a Gaussian-like conditional, if possible.

        The layout is chosen by the number of columns:

        * 3 columns: a central value (``'mean'``, ``'mode'`` or ``.5``, the
          last one present taking precedence) with ``'var'`` or ``'sdev'``.
        * 4 columns: the first column is discarded and the remainder must hold
          a central value (``'mean'``, else ``'mode'``, else ``.5``) and two
          columns whose headers are probabilities.

        Args:
            header: Column names (strings and/or float probabilities).
            row: Values aligned with ``header``; parsed with ``float``.
            limits: (lower, upper) bounds of the support.

        Returns:
            A ``SplineModelConditional``; ``None`` if the row is not
            recognised or the fitted shape does not fit within ``limits``.
        """
        # Does this look like a mean-variance feature file?
        if len(header) == 3:
            mean = None
            if 'mean' in header:
                mean = float(row[header.index('mean')])
            if 'mode' in header:
                mean = float(row[header.index('mode')])
            if .5 in header:
                mean = float(row[header.index(.5)])
            if mean is None:
                return None

            if 'var' in header:
                var = float(row[header.index('var')])
            elif 'sdev' in header:
                var = float(row[header.index('sdev')]) * float(
                    row[header.index('sdev')])
            else:
                return None

            # Missing or zero spread collapses to a point at the mean.
            if np.isnan(var) or var == 0:
                return SplineModelConditional.make_single(mean, mean, [])

            # This might be uniform
            # NOTE(review): unlike the 4-column branch, limits are not passed
            # through float() here - confirm they are always numeric.
            if mean - 2 * var < limits[0] or mean + 2 * var > limits[1]:
                return None

            return SplineModelConditional.make_gaussian(
                limits[0], limits[1], mean, var)
        elif len(header) == 4:
            # Does this look like a mean and evenly spaced p-values?
            header = header[1:]  # Make a copy of the list
            row = row[1:]
            mean = None
            if 'mean' in header:
                mean = float(row.pop(header.index('mean')))
                header.remove('mean')
            elif 'mode' in header:
                mean = float(row.pop(header.index('mode')))
                header.remove('mode')
            elif .5 in header:
                mean = float(row.pop(header.index(.5)))
                header.remove(.5)
            else:
                return None

            # Check that the two other values are evenly spaced p-values.
            # A real list is needed: the values are indexed below, which a
            # Python 3 map object does not support.
            row = list(map(float, row[0:2]))
            if np.all(np.isnan(row)):
                return SplineModelConditional.make_single(mean, mean, [])

            if header[1] == 1 - header[0] and abs(row[1] - mean - (
                    mean - row[0])) < abs(row[1] - row[0]) / 1000.0:
                lowp = min(header)
                lowv = np.array(row)[np.array(header) == lowp][0]

                if lowv == mean:
                    return SplineModelConditional.make_single(mean, mean, [])

                # Search bracket for the standard deviation.
                lowerbound = 1e-4 * (mean - lowv)
                upperbound = np.sqrt((mean - lowv) / lowp)

                sdev = brentq(lambda sdev: norm.cdf(lowv, mean, sdev) - lowp,
                              lowerbound, upperbound)
                if float(limits[0]) < mean - 3 * sdev and float(
                        limits[1]) > mean + 3 * sdev:
                    return SplineModelConditional.make_gaussian(
                        limits[0], limits[1], mean, sdev * sdev)
                else:
                    return None
            else:
                # Heuristic best curve: known tails, fit to mean
                lowp = min(header)
                lowv = np.array(row)[np.array(header) == lowp][0]

                lowerbound = 1e-4 * (mean - lowv)
                # NOTE(review): the symmetric branch uses np.sqrt for this
                # bracket; confirm np.log is intended here.
                upperbound = np.log((mean - lowv) / lowp)

                low_sdev = brentq(
                    lambda sdev: norm.cdf(lowv, mean, sdev) - lowp, lowerbound,
                    upperbound)
                if float(limits[0]) > mean - 3 * low_sdev:
                    return None

                low_segment = SplineModelConditional.make_gaussian(
                    float(limits[0]), lowv, mean, low_sdev * low_sdev)

                highp = max(header)
                highv = np.array(row)[np.array(header) == highp][0]

                lowerbound = 1e-4 * (highv - mean)
                upperbound = np.log((highv - mean) / (1 - highp))

                high_scale = brentq(
                    lambda scale: .5 + expon.cdf(highv, mean, scale) / 2 -
                    highp, lowerbound, upperbound)
                if float(limits[1]) < mean + 3 * high_scale:
                    return None

                # Construct exponential, starting at mean, with full cdf of .5
                high_segment = SplineModelConditional.make_single(
                    highv, float(limits[1]), [
                        np.log(1 / high_scale) + np.log(.5) +
                        mean / high_scale, -1 / high_scale
                    ])

                # Two anchors near each tail plus the mean in the middle.
                sevenys = np.linspace(lowv, highv, 7)
                ys = np.append(sevenys[0:2], [mean, sevenys[-2], sevenys[-1]])

                lps0 = norm.logpdf(ys[0:2], mean, low_sdev)
                lps1 = expon.logpdf([ys[-2], ys[-1]], mean,
                                    high_scale) + np.log(.5)

                # The value at the mean is the only free parameter.
                result = minimize(
                    lambda lpmean: FeaturesInterpreter.skew_gaussian_evaluate(
                        ys, np.append(np.append(lps0, [lpmean]), lps1),
                        low_segment, high_segment, mean, lowp, highp),
                    .5,
                    method='Nelder-Mead')
                fitted_lps = np.append(np.append(lps0, result.x), lps1)
                print(fitted_lps)
                return FeaturesInterpreter.skew_gaussian_construct(
                    ys, fitted_lps, low_segment, high_segment)

        # Neither 3 nor 4 columns: not handled here.
        return None
Exemplo n.º 16
0
    def skew_gaussian_construct(ys, lps, low_segment, high_segment):
        """Assemble low tail + interpolated middle + high tail into one conditional.

        Args:
            ys: Locations of the interpolation points for the middle part.
            lps: Values at ``ys``; a quadratic (``k=2``) interpolating spline
                is drawn through them.
            low_segment: Conditional providing the lower tail (segment 0).
            high_segment: Conditional providing the upper tail (segment 0).

        Returns:
            The combined conditional after ``rescale()``, or ``None`` when
            ``rescale()`` raises.
        """
        # The middle spans exactly the gap between the two tails.
        gap = (low_segment.y1s[0], high_segment.y0s[0])
        mid_segment = SplineModelConditional.make_conditional_from_spline(
            InterpolatedUnivariateSpline(ys, lps, k=2), gap)

        conditional = SplineModelConditional()
        # Tail coefficients are copied so the inputs are left untouched.
        conditional.add_segment(low_segment.y0s[0], low_segment.y1s[0],
                                copy.copy(low_segment.coeffs[0]))
        for ii in range(mid_segment.size()):
            conditional.add_segment(mid_segment.y0s[ii], mid_segment.y1s[ii],
                                    mid_segment.coeffs[ii])
        conditional.add_segment(high_segment.y0s[0], high_segment.y1s[0],
                                copy.copy(high_segment.coeffs[0]))

        try:
            conditional.rescale()
        except Exception:
            # A failed rescale means "no usable result"; narrowed from a bare
            # except so Ctrl-C and interpreter exit still propagate.
            return None

        return conditional
Exemplo n.º 17
0
    def features_to_uniform(header, row, limits):
        """Build a constant single-segment conditional when the row has one column.

        Args:
            header: Column names; anything other than exactly one entry is
                rejected.
            row: Not used.
            limits: Numeric (lower, upper) bounds.

        Returns:
            The result of ``SplineModelConditional.make_single`` over
            ``limits`` with the single coefficient ``1 / (upper - lower)``,
            or ``None`` if ``header`` does not have exactly one entry.
        """
        if len(header) != 1:
            return None

        # Float numerator avoids Python 2 floor division with integer limits.
        # NOTE(review): coefficients elsewhere look log-scale (for example
        # math.log(height)); verify that a plain reciprocal is intended.
        return SplineModelConditional.make_single(
            limits[0], limits[1], [1.0 / (limits[1] - limits[0])])