def apply_as_distribution(self, model):
    """Combine this model with `model`, matching this model's y-values to `model`'s x-values.

    For a 'ddp_model', the y-values of this model must equal the x-values
    of `model` position by position; the result is a DDPModel built from
    the matrix product of the two `lin_p()` tables.

    For a 'spline_model', every x-value of `model` must appear among this
    model's y-values (order is free); each resulting conditional is the
    `approximate_sum` of `model`'s conditionals, each scaled by the
    matching entry of this model's `lin_p()` table.

    Raises:
        ValueError: if the value counts differ, the values do not match,
            or `model.kind()` is neither 'ddp_model' nor 'spline_model'.
    """
    kind = model.kind()

    if kind == 'ddp_model':
        if len(model.xx) != len(self.yy):
            raise ValueError("Wrong number of values.")

        model_xx = model.get_xx()
        own_yy = self.get_yy()
        for ii in range(len(model.xx)):
            if model_xx[ii] != own_yy[ii]:
                raise ValueError("Wrong values in distribution.")

        vv = self.lin_p().dot(model.lin_p())

        return DDPModel('ddp1', 'distribution',
                        self.xx_is_categorical, self.get_xx(),
                        model.yy_is_categorical, model.get_yy(),
                        vv, scaled=self.scaled)

    if kind == 'spline_model':
        if len(model.xx) != len(self.yy):
            raise ValueError("Wrong number of values.")

        model_xx = model.get_xx()
        own_yy = self.get_yy()
        for ii in range(len(model.xx)):
            if model_xx[ii] not in own_yy:
                raise ValueError("Wrong values in distribution.")

        pp = self.lin_p()

        # The column of pp that belongs to each x-value of `model` does not
        # depend on the output row, so look it up once instead of once per
        # (row, x-value) pair.
        columns = [own_yy.index(model_xx[jj]) for jj in range(len(model.xx))]

        conditionals = []
        for ii in range(len(self.xx)):
            conds = []
            for jj in range(len(model.xx)):
                original = model.get_conditional(model_xx[jj])
                # Build a separate conditional object before scaling it.
                # NOTE(review): whether the constructor copies the underlying
                # lists is not visible here -- confirm `scale` cannot alter
                # `model`'s own conditionals.
                conditional = SplineModelConditional(
                    original.y0s, original.y1s, original.coeffs)
                conditional.scale(pp[ii, columns[jj]])
                conds.append(conditional)

            conditionals.append(SplineModelConditional.approximate_sum(conds))

        return SplineModel(self.xx_is_categorical, self.get_xx(),
                           conditionals, scaled=self.scaled)

    raise ValueError("Unknown model type in apply_as_distribution")
def skew_gaussian_construct(ys, lps, low_segment, high_segment):
    """Assemble a three-part conditional: low tail, fitted middle, high tail.

    The middle part is built from a quadratic (k=2) interpolating spline
    through the points (ys, lps), restricted to the gap between the end of
    `low_segment` and the start of `high_segment`.  Only the first segment
    of `low_segment` and of `high_segment` is used.

    Returns:
        The rescaled conditional, or None if `rescale()` raises.
    """
    mid_segment = SplineModelConditional.make_conditional_from_spline(
        InterpolatedUnivariateSpline(ys, lps, k=2),
        (low_segment.y1s[0], high_segment.y0s[0]))

    conditional = SplineModelConditional()
    # Copy the tail coefficients so the new conditional does not share
    # them with the segments passed in.
    conditional.add_segment(low_segment.y0s[0], low_segment.y1s[0],
                            copy.copy(low_segment.coeffs[0]))
    for ii in range(mid_segment.size()):
        conditional.add_segment(mid_segment.y0s[ii], mid_segment.y1s[ii],
                                mid_segment.coeffs[ii])
    conditional.add_segment(high_segment.y0s[0], high_segment.y1s[0],
                            copy.copy(high_segment.coeffs[0]))

    try:
        conditional.rescale()
    except Exception:
        # Best effort: a failed rescale means "no usable conditional".
        # Only Exception is caught, so KeyboardInterrupt and SystemExit
        # still propagate.
        return None

    return conditional
def evaluate_spline(header, row, spline, limits):
    """Score how far a log-density spline is from the features in `row`.

    The spline is sampled on an even grid between the limits (clipped to
    the spline's knot range), exponentiated and normalised to sum to one.
    Each feature named in `header[1:]` is then compared with the matching
    entry of `row`, and the discrepancies are accumulated:

    * float header entry: absolute difference between
      `SplineModelConditional.find_nearest(cfs, p, ys)` and the row value
      (presumably the p-quantile -- confirm against `find_nearest`);
    * 'mean' and 'mode': absolute difference;
    * 'var': square root of the absolute difference;
    * 'skew': cube root of the absolute difference.

    `header[0]` is skipped, and any other header entry is ignored.

    Returns:
        The accumulated error (0 if nothing was compared).
    """
    low = max(min(spline.get_knots()), float(limits[0]))
    high = min(max(spline.get_knots()), float(limits[1]))
    count = len(header) * SplineModel.samples

    ys = np.linspace(low, high, count)
    ps = np.exp(spline(ys)) * (high - low) / count
    ps = ps / sum(ps)
    cfs = np.cumsum(ps)

    # Moments are computed only when a feature that needs them is present.
    if 'mean' in header or 'var' in header or 'skew' in header:
        mean = sum(ps * ys)
    if 'var' in header or 'skew' in header:
        var = sum(ps * np.square(ys - mean))

    error = 0
    for ii in range(1, len(header)):
        feature = header[ii]
        if isinstance(feature, float):
            nearest = SplineModelConditional.find_nearest(cfs, feature, ys)
            error = error + np.abs(nearest - float(row[ii]))
        elif feature == 'mean':
            error = error + np.abs(mean - float(row[ii]))
        elif feature == 'mode':
            mode = ys[ps.argmax()]
            error = error + np.abs(mode - float(row[ii]))
        elif feature == 'var':
            error = error + np.sqrt(np.abs(var - float(row[ii])))
        elif feature == 'skew':
            # np.power / np.sqrt exist in every NumPy release; `np.pow`
            # only appeared as an alias in NumPy 2.0.
            skew = sum(ps * np.power((ys - mean) / np.sqrt(var), 3))
            error = error + np.power(np.abs(skew - float(row[ii])), 1.0 / 3)

    return error
def evaluate_spline(header, row, spline, limits):
    """Return the total mismatch between a log-density spline and a feature row.

    The density is sampled at `len(header) * SplineModel.samples` evenly
    spaced points between the limits (clipped to the spline's knots),
    exponentiated and normalised to sum to one.  For every entry of
    `header[1:]`:

    * a float is looked up through
      `SplineModelConditional.find_nearest(cfs, p, ys)` and compared with
      the row value by absolute difference (presumably a quantile lookup
      -- confirm against `find_nearest`);
    * 'mean' and 'mode' add the absolute difference;
    * 'var' adds the square root of the absolute difference;
    * 'skew' adds the cube root of the absolute difference.

    `header[0]` is never compared; unrecognised entries add nothing.
    """
    low = max(min(spline.get_knots()), float(limits[0]))
    high = min(max(spline.get_knots()), float(limits[1]))
    count = len(header) * SplineModel.samples

    ys = np.linspace(low, high, count)
    ps = np.exp(spline(ys)) * (high - low) / count
    ps = ps / sum(ps)
    cfs = np.cumsum(ps)

    # Only compute the moments that some requested feature depends on.
    if 'mean' in header or 'var' in header or 'skew' in header:
        mean = sum(ps * ys)
    if 'var' in header or 'skew' in header:
        var = sum(ps * np.square(ys - mean))

    error = 0
    for ii in range(1, len(header)):
        feature = header[ii]
        target = row[ii]
        if isinstance(feature, float):
            nearest = SplineModelConditional.find_nearest(cfs, feature, ys)
            error = error + np.abs(nearest - float(target))
        elif feature == 'mean':
            error = error + np.abs(mean - float(target))
        elif feature == 'mode':
            mode = ys[ps.argmax()]
            error = error + np.abs(mode - float(target))
        elif feature == 'var':
            error = error + np.sqrt(np.abs(var - float(target)))
        elif feature == 'skew':
            # `np.pow` is absent from NumPy releases before 2.0 and `sqrt`
            # was an unqualified name; use np.power / np.sqrt instead.
            skew = sum(ps * np.power((ys - mean) / np.sqrt(var), 3))
            error = error + np.power(np.abs(skew - float(target)), 1.0 / 3)

    return error
def features_to_exponential(header, row, limits):
    """Try to read a two-column feature row as a one-sided exponential.

    Applies only when `header` has exactly two entries, one of them
    'mean'.  A decaying exponential anchored at the lower limit is used
    when the mean sits above the lower limit and three mean-distances
    still fit below the upper limit; the mirrored, rising exponential
    anchored at the upper limit is used in the symmetric situation.

    Returns:
        The rescaled single-segment conditional, or None when the row does
        not look like either exponential.
    """
    if len(header) != 2:
        return None
    if 'mean' not in header:
        return None

    mean = float(row[header.index('mean')])
    # Compare numerically even if the limits arrive as strings.
    low = float(limits[0])
    high = float(limits[1])

    # Is it one-sided?
    if mean > low and low + (mean - low) * 3 < high:
        # positive exponential
        scale = mean - low
        return SplineModelConditional.make_single(
            limits[0], limits[1], [low / scale, -1 / scale]).rescale()

    # Both conditions must hold, mirroring the test above.  With `or`, a
    # mean equal to the upper limit divided by zero and a mid-range mean
    # was accepted as an exponential.
    if mean < high and high - (high - mean) * 3 > low:
        # negative exponential
        scale = high - mean
        return SplineModelConditional.make_single(
            limits[0], limits[1], [-high / scale, 1 / scale]).rescale()

    return None
def uniform_doseless(start, end, height=None):
    """Build a single-conditional SplineModel that is flat on [start, end].

    Outside the interval the log-density coefficient is
    `SplineModel.neginf`; inside it is `log(height)`.

    Args:
        start: lower edge of the interval.
        end: upper edge of the interval.
        height: density inside the interval.  When omitted it is set to
            1 / (end - start) and the model is flagged as scaled.

    Returns:
        A SplineModel with one categorical x-value, '', and the
        three-segment conditional described above.
    """
    scaled = False
    if height is None:
        # Force true division: with integer bounds, Python 2 floor
        # division would give 0 and make math.log fail below.
        height = 1.0 / (end - start)
        scaled = True

    conditional = SplineModelConditional()
    conditional.add_segment(SplineModel.neginf, start, [SplineModel.neginf])
    conditional.add_segment(start, end, [math.log(height)])
    conditional.add_segment(end, SplineModel.posinf, [SplineModel.neginf])

    return SplineModel(True, [''], [conditional], scaled)
def apply_as_distribution(self, model):
    """Apply this model as a distribution over the x-values of `model`.

    * 'ddp_model': this model's y-values must equal `model`'s x-values in
      the same order.  Returns a DDPModel whose table is
      `self.lin_p().dot(model.lin_p())`.
    * 'spline_model': each x-value of `model` must occur among this
      model's y-values.  For every x-value of this model, `model`'s
      conditionals are scaled by the matching `lin_p()` entry and merged
      with `SplineModelConditional.approximate_sum`.

    Raises:
        ValueError: on a length mismatch, on unmatched values, or when
            `model.kind()` is not one of the two supported kinds.
    """
    kind = model.kind()

    if kind == 'ddp_model':
        if len(model.xx) != len(self.yy):
            raise ValueError("Wrong number of values.")

        model_xx = model.get_xx()
        own_yy = self.get_yy()
        for ii in range(len(model.xx)):
            if model_xx[ii] != own_yy[ii]:
                raise ValueError("Wrong values in distribution.")

        vv = self.lin_p().dot(model.lin_p())

        return DDPModel('ddp1', 'distribution',
                        self.xx_is_categorical, self.get_xx(),
                        model.yy_is_categorical, model.get_yy(),
                        vv, scaled=self.scaled)

    if kind == 'spline_model':
        if len(model.xx) != len(self.yy):
            raise ValueError("Wrong number of values.")

        model_xx = model.get_xx()
        own_yy = self.get_yy()
        for ii in range(len(model.xx)):
            if model_xx[ii] not in own_yy:
                raise ValueError("Wrong values in distribution.")

        pp = self.lin_p()

        # Position of each of `model`'s x-values among our y-values.  This
        # is the same for every output row, so it is resolved once here
        # rather than inside the double loop.
        columns = [own_yy.index(model_xx[jj]) for jj in range(len(model.xx))]

        conditionals = []
        for ii in range(len(self.xx)):
            conds = []
            for jj in range(len(model.xx)):
                original = model.get_conditional(model_xx[jj])
                # Scale a separately constructed conditional.
                # NOTE(review): confirm the constructor does not share the
                # coefficient lists with `original`.
                conditional = SplineModelConditional(
                    original.y0s, original.y1s, original.coeffs)
                conditional.scale(pp[ii, columns[jj]])
                conds.append(conditional)

            conditionals.append(SplineModelConditional.approximate_sum(conds))

        return SplineModel(self.xx_is_categorical, self.get_xx(),
                           conditionals, scaled=self.scaled)

    raise ValueError("Unknown model type in apply_as_distribution")
def write_robberies(writer, row):
    """Write synthetic spline segments for each year 2010-2100 to `writer`.

    For every year a noisy sine-shaped curve over 20 income points between
    0 and 1e6 is generated, smoothed with a quadratic UnivariateSpline,
    converted to a conditional on [0, 1e6], and written out one segment
    per row as: year, segment start, segment end, then the coefficients.

    Args:
        writer: object with a `writerow(list)` method (e.g. a csv writer).
        row: only printed; it does not influence the rows written.
    """
    # The call form with a single argument prints the same on Python 2
    # and Python 3.
    print(row)
    writer.writerow(["year", "income0", "income1", "coeff0", "coeff1", "coeff2"])

    incomes = np.linspace(0, 1e6, 20)
    for year in range(2010, 2100 + 1):
        # One random offset per year, growing with the distance from 2010.
        crime_fraction = (np.sin(np.pi * incomes / 1e6)
                          + random.uniform(.5, 1.5) * (1 + (year - 2010) / 200.0))
        crime = 200 * crime_fraction

        spline = UnivariateSpline(incomes, crime, s=10, k=2)
        conditional = SplineModelConditional.make_conditional_from_spline(spline, [0, 1e6])

        for jj in range(conditional.size()):
            # A separate name keeps the `row` parameter intact.
            segment_row = [year, conditional.y0s[jj], conditional.y1s[jj]]
            segment_row.extend(conditional.coeffs[jj])
            writer.writerow(segment_row)
def features_to_exponential(header, row, limits):
    """Interpret a (condition, mean) feature row as a one-sided exponential.

    Only rows whose `header` has exactly two entries including 'mean' are
    considered.  If the mean lies above the lower limit and three times
    its distance from that limit still falls short of the upper limit, a
    decaying exponential starting at the lower limit is returned.  The
    mirrored case returns a rising exponential ending at the upper limit.

    Returns:
        A rescaled single-segment conditional, or None if neither case
        applies.
    """
    if len(header) != 2:
        return None
    if 'mean' not in header:
        return None

    mean = float(row[header.index('mean')])
    # Use float copies of the limits for the arithmetic so string limits
    # are compared numerically.
    low = float(limits[0])
    high = float(limits[1])

    # Is it one-sided?
    if mean > low and low + (mean - low) * 3 < high:
        # positive exponential
        scale = mean - low
        return SplineModelConditional.make_single(
            limits[0], limits[1], [low / scale, -1 / scale]).rescale()

    # `and`, not `or`: the mean must be strictly inside the upper limit
    # (otherwise the scale is zero) AND close enough to it.
    if mean < high and high - (high - mean) * 3 > low:
        # negative exponential
        scale = high - mean
        return SplineModelConditional.make_single(
            limits[0], limits[1], [-high / scale, 1 / scale]).rescale()

    return None
def make_conditional(header, row, limits):
    """Turn one feature row into a conditional distribution.

    The special-case interpreters (Gaussian, exponential, uniform) are
    tried in that order and the first non-None result is returned as is.
    Otherwise the best-fitting spline is converted to a conditional and
    rescaled.
    """
    # Look for a special case
    special_cases = (
        FeaturesInterpreter.features_to_gaussian,
        FeaturesInterpreter.features_to_exponential,
        FeaturesInterpreter.features_to_uniform,
    )
    for interpret in special_cases:
        found = interpret(header, row, limits)
        if found is not None:
            return found

    # No special case matched: fall back to a fitted spline.
    fitted = FeaturesInterpreter.best_spline(header, row, limits)
    return SplineModelConditional.make_conditional_from_spline(
        fitted, limits).rescale()
def make_conditional(header, row, limits):
    """Build the conditional described by a feature row.

    Tries, in order, the Gaussian, exponential and uniform interpreters
    and returns the first result that is not None.  When none of them
    applies, fits the best spline for the row and returns the rescaled
    conditional made from it.
    """
    # Look for a special case.  The generator is lazy, so a later
    # interpreter only runs if every earlier one returned None.
    interpreters = (
        FeaturesInterpreter.features_to_gaussian,
        FeaturesInterpreter.features_to_exponential,
        FeaturesInterpreter.features_to_uniform,
    )
    attempts = (interpret(header, row, limits) for interpret in interpreters)
    special = next((result for result in attempts if result is not None), None)
    if special is not None:
        return special

    best = FeaturesInterpreter.best_spline(header, row, limits)
    general = SplineModelConditional.make_conditional_from_spline(best, limits)
    return general.rescale()
def features_to_gaussian(header, row, limits):
    """Try to read a feature row as a Gaussian or skewed-Gaussian conditional.

    Three-column rows are treated as centre plus spread: the centre comes
    from 'mean', 'mode' or the .5 quantile (later ones override earlier
    ones) and the spread from 'var' or 'sdev'.

    Four-column rows are treated as centre plus two quantiles.  The first
    column is dropped.  If the two quantiles are complementary
    probabilities placed symmetrically about the centre, a Gaussian is
    fitted through them.  Otherwise a skewed curve is built from a
    Gaussian lower tail, an exponential upper tail and a fitted middle.

    Returns:
        A conditional, or None when the row does not match any of these
        shapes or the fitted curve does not fit inside `limits`.
    """
    # Does this look like a mean-variance feature file?
    if len(header) == 3:
        mean = None
        if 'mean' in header:
            mean = float(row[header.index('mean')])
        if 'mode' in header:
            mean = float(row[header.index('mode')])
        if .5 in header:
            mean = float(row[header.index(.5)])
        if mean is None:
            return None

        if 'var' in header:
            var = float(row[header.index('var')])
        elif 'sdev' in header:
            sdev = float(row[header.index('sdev')])
            var = sdev * sdev
        else:
            return None

        if np.isnan(var) or var == 0:
            # No spread: a single point at the centre.
            return SplineModelConditional.make_single(mean, mean, [])

        # This might be uniform
        # NOTE(review): the margin is 2 * var, not 2 standard deviations,
        # and the limits are not float-coerced here -- confirm intent.
        if mean - 2*var < limits[0] or mean + 2*var > limits[1]:
            return None

        return SplineModelConditional.make_gaussian(limits[0], limits[1], mean, var)

    if len(header) == 4:
        # Does this look like a mean and evenly spaced p-values?
        header = header[1:]  # Make a copy of the list
        row = row[1:]

        if 'mean' in header:
            mean = float(row.pop(header.index('mean')))
            header.remove('mean')
        elif 'mode' in header:
            mean = float(row.pop(header.index('mode')))
            header.remove('mode')
        elif .5 in header:
            mean = float(row.pop(header.index(.5)))
            header.remove(.5)
        else:
            return None

        # Check that the two other values are evenly spaced p-values.
        # A real list is needed: the values are indexed below, and map()
        # returns an iterator on Python 3.
        row = [float(value) for value in row[0:2]]
        if np.all(np.isnan(row)):
            return SplineModelConditional.make_single(mean, mean, [])

        # Complementary probabilities are compared with a tolerance:
        # 1 - .9 is not exactly .1 in floating point.
        if (abs(header[1] - (1 - header[0])) < 1e-9
                and abs(row[1] - mean - (mean - row[0])) < abs(row[1] - row[0]) / 1000.0):
            lowp = min(header)
            lowv = np.array(row)[np.array(header) == lowp][0]
            if lowv == mean:
                return SplineModelConditional.make_single(mean, mean, [])

            # Bracket for the standard deviation that puts lowv at lowp.
            lowerbound = 1e-4 * (mean - lowv)
            upperbound = np.sqrt((mean - lowv) / lowp)

            sdev = brentq(lambda sdev: norm.cdf(lowv, mean, sdev) - lowp,
                          lowerbound, upperbound)
            if float(limits[0]) < mean - 3*sdev and float(limits[1]) > mean + 3*sdev:
                return SplineModelConditional.make_gaussian(limits[0], limits[1], mean, sdev*sdev)
            return None

        # Heuristic best curve: known tails, fit to mean
        lowp = min(header)
        lowv = np.array(row)[np.array(header) == lowp][0]
        lowerbound = 1e-4 * (mean - lowv)
        upperbound = np.log((mean - lowv) / lowp)

        low_sdev = brentq(lambda sdev: norm.cdf(lowv, mean, sdev) - lowp,
                          lowerbound, upperbound)
        if float(limits[0]) > mean - 3*low_sdev:
            return None

        low_segment = SplineModelConditional.make_gaussian(
            float(limits[0]), lowv, mean, low_sdev*low_sdev)

        highp = max(header)
        highv = np.array(row)[np.array(header) == highp][0]
        lowerbound = 1e-4 * (highv - mean)
        upperbound = np.log((highv - mean) / (1 - highp))

        high_scale = brentq(
            lambda scale: .5 + expon.cdf(highv, mean, scale) / 2 - highp,
            lowerbound, upperbound)
        if float(limits[1]) < mean + 3*high_scale:
            return None

        # Construct exponential, starting at mean, with full cdf of .5
        high_segment = SplineModelConditional.make_single(
            highv, float(limits[1]),
            [np.log(1/high_scale) + np.log(.5) + mean / high_scale,
             -1 / high_scale])

        # Five interpolation points: two near the low quantile, the
        # centre, and two near the high quantile.
        sevenys = np.linspace(lowv, highv, 7)
        ys = np.append(sevenys[0:2], [mean, sevenys[-2], sevenys[-1]])

        lps0 = norm.logpdf(ys[0:2], mean, low_sdev)
        lps1 = expon.logpdf([ys[-2], ys[-1]], mean, high_scale) + np.log(.5)

        # Only the log-density at the centre is free; fit it.
        result = minimize(
            lambda lpmean: FeaturesInterpreter.skew_gaussian_evaluate(
                ys, np.append(np.append(lps0, [lpmean]), lps1),
                low_segment, high_segment, mean, lowp, highp),
            .5, method='Nelder-Mead')

        lps = np.append(np.append(lps0, result.x), lps1)
        print(lps)

        return FeaturesInterpreter.skew_gaussian_construct(
            ys, lps, low_segment, high_segment)

    return None
def features_to_uniform(header, row, limits):
    """Return a single-segment conditional spanning `limits` for a one-column row.

    Applies only when `header` has exactly one entry; otherwise returns
    None.  `row` is not consulted.
    """
    if len(header) != 1:
        return None

    # Float arithmetic: with integer limits, Python 2 floor division
    # would turn the coefficient into 0.
    width = float(limits[1]) - float(limits[0])

    # NOTE(review): other conditionals in this file are built from
    # log-density coefficients (e.g. math.log(height)); confirm that
    # 1 / width rather than log(1 / width) is intended here.
    return SplineModelConditional.make_single(limits[0], limits[1], [1.0 / width])
def features_to_gaussian(header, row, limits):
    """Interpret a feature row as a Gaussian-like conditional, if it looks like one.

    * Three columns: a centre ('mean', 'mode' or the .5 quantile, the
      last one present wins) plus 'var' or 'sdev'.
    * Four columns: after dropping the first column, a centre plus two
      quantiles.  Complementary, symmetric quantiles give a plain
      Gaussian.  Anything else gives a skewed curve with a Gaussian lower
      tail, an exponential upper tail and a fitted middle section.

    Returns:
        The conditional, or None when the row has a different shape or
        the fitted curve would not fit inside `limits`.
    """
    # Does this look like a mean-variance feature file?
    if len(header) == 3:
        mean = None
        if 'mean' in header:
            mean = float(row[header.index('mean')])
        if 'mode' in header:
            mean = float(row[header.index('mode')])
        if .5 in header:
            mean = float(row[header.index(.5)])
        if mean is None:
            return None

        if 'var' in header:
            var = float(row[header.index('var')])
        elif 'sdev' in header:
            sdev = float(row[header.index('sdev')])
            var = sdev * sdev
        else:
            return None

        if np.isnan(var) or var == 0:
            # Degenerate spread: collapse to a point at the centre.
            return SplineModelConditional.make_single(mean, mean, [])

        # This might be uniform
        # NOTE(review): the check uses 2 * var (variance, not standard
        # deviation) against un-coerced limits -- confirm intent.
        if mean - 2 * var < limits[0] or mean + 2 * var > limits[1]:
            return None

        return SplineModelConditional.make_gaussian(
            limits[0], limits[1], mean, var)

    if len(header) == 4:
        # Does this look like a mean and evenly spaced p-values?
        header = header[1:]  # Make a copy of the list
        row = row[1:]

        if 'mean' in header:
            mean = float(row.pop(header.index('mean')))
            header.remove('mean')
        elif 'mode' in header:
            mean = float(row.pop(header.index('mode')))
            header.remove('mode')
        elif .5 in header:
            mean = float(row.pop(header.index(.5)))
            header.remove(.5)
        else:
            return None

        # Check that the two other values are evenly spaced p-values.
        # Built as a list because it is indexed below; map() would be a
        # one-shot iterator on Python 3.
        row = [float(value) for value in row[0:2]]
        if np.all(np.isnan(row)):
            return SplineModelConditional.make_single(mean, mean, [])

        # Tolerant comparison: exact equality rejects pairs such as
        # (.9, .1) because 1 - .9 != .1 in floating point.
        complementary = abs(header[1] - (1 - header[0])) < 1e-9
        if complementary and abs(row[1] - mean - (mean - row[0])) < abs(row[1] - row[0]) / 1000.0:
            lowp = min(header)
            lowv = np.array(row)[np.array(header) == lowp][0]
            if lowv == mean:
                return SplineModelConditional.make_single(mean, mean, [])

            # Search bracket for the standard deviation matching lowp.
            lowerbound = 1e-4 * (mean - lowv)
            upperbound = np.sqrt((mean - lowv) / lowp)

            sdev = brentq(lambda sdev: norm.cdf(lowv, mean, sdev) - lowp,
                          lowerbound, upperbound)
            if float(limits[0]) < mean - 3 * sdev and float(limits[1]) > mean + 3 * sdev:
                return SplineModelConditional.make_gaussian(
                    limits[0], limits[1], mean, sdev * sdev)
            return None

        # Heuristic best curve: known tails, fit to mean
        lowp = min(header)
        lowv = np.array(row)[np.array(header) == lowp][0]
        lowerbound = 1e-4 * (mean - lowv)
        upperbound = np.log((mean - lowv) / lowp)

        low_sdev = brentq(lambda sdev: norm.cdf(lowv, mean, sdev) - lowp,
                          lowerbound, upperbound)
        if float(limits[0]) > mean - 3 * low_sdev:
            return None

        low_segment = SplineModelConditional.make_gaussian(
            float(limits[0]), lowv, mean, low_sdev * low_sdev)

        highp = max(header)
        highv = np.array(row)[np.array(header) == highp][0]
        lowerbound = 1e-4 * (highv - mean)
        upperbound = np.log((highv - mean) / (1 - highp))

        high_scale = brentq(
            lambda scale: .5 + expon.cdf(highv, mean, scale) / 2 - highp,
            lowerbound, upperbound)
        if float(limits[1]) < mean + 3 * high_scale:
            return None

        # Construct exponential, starting at mean, with full cdf of .5
        high_segment = SplineModelConditional.make_single(
            highv, float(limits[1]),
            [np.log(1 / high_scale) + np.log(.5) + mean / high_scale,
             -1 / high_scale])

        # Interpolation points: two at the low end, the centre, two at
        # the high end.
        sevenys = np.linspace(lowv, highv, 7)
        ys = np.append(sevenys[0:2], [mean, sevenys[-2], sevenys[-1]])

        lps0 = norm.logpdf(ys[0:2], mean, low_sdev)
        lps1 = expon.logpdf([ys[-2], ys[-1]], mean, high_scale) + np.log(.5)

        # The log-density at the centre is the single free parameter.
        result = minimize(
            lambda lpmean: FeaturesInterpreter.skew_gaussian_evaluate(
                ys, np.append(np.append(lps0, [lpmean]), lps1),
                low_segment, high_segment, mean, lowp, highp),
            .5, method='Nelder-Mead')

        lps = np.append(np.append(lps0, result.x), lps1)
        print(lps)

        return FeaturesInterpreter.skew_gaussian_construct(
            ys, lps, low_segment, high_segment)

    return None
def skew_gaussian_construct(ys, lps, low_segment, high_segment):
    """Join a low tail, an interpolated middle and a high tail into one conditional.

    A quadratic (k=2) interpolating spline through (ys, lps) supplies the
    middle, limited to the range between `low_segment.y1s[0]` and
    `high_segment.y0s[0]`.  The first segment of each tail is copied in
    around it, and the result is rescaled.

    Returns:
        The assembled conditional, or None if rescaling raises.
    """
    middle_range = (low_segment.y1s[0], high_segment.y0s[0])
    mid_segment = SplineModelConditional.make_conditional_from_spline(
        InterpolatedUnivariateSpline(ys, lps, k=2), middle_range)

    conditional = SplineModelConditional()

    # Tail coefficients are copied so the tails passed in are not shared
    # with the new conditional.
    conditional.add_segment(
        low_segment.y0s[0], low_segment.y1s[0],
        copy.copy(low_segment.coeffs[0]))

    for ii in range(mid_segment.size()):
        conditional.add_segment(
            mid_segment.y0s[ii], mid_segment.y1s[ii], mid_segment.coeffs[ii])

    conditional.add_segment(
        high_segment.y0s[0], high_segment.y1s[0],
        copy.copy(high_segment.coeffs[0]))

    try:
        conditional.rescale()
    except Exception:
        # Deliberate best effort, but do not trap KeyboardInterrupt or
        # SystemExit the way a bare `except:` would.
        return None

    return conditional
def features_to_uniform(header, row, limits):
    """Build a flat single-segment conditional over `limits` for a one-column row.

    Returns None unless `header` has exactly one entry.  The contents of
    `row` are not used.
    """
    if len(header) != 1:
        return None

    # Compute the width in floating point so integer limits do not hit
    # Python 2 floor division (which would give a coefficient of 0).
    width = float(limits[1]) - float(limits[0])

    # NOTE(review): the coefficient is 1 / width, whereas other
    # conditionals in this file pass log-densities -- confirm which one
    # make_single expects.
    return SplineModelConditional.make_single(
        limits[0], limits[1], [1.0 / width])