def analyze(self, extracts: Iterable[Tuple[str, Any]]) -> None:
    """Accumulate per-zip-code, per-period totals from company composites.

    For every ``(company_id, company)`` pair the zip, city and state
    properties are read from the company, and for each of the company's
    periods the company count, employee total and revenue total kept in
    ``self.city_data`` are increased.

    Args:
        extracts: Iterable of ``(company_id, company)`` pairs. The id is
            not used by this method.
    """
    read_property = composites.get_property
    read_observation = composites.get_observation

    for _company_id, firm in extracts:
        zip_code = read_property(firm, self.source_zip_var)
        city = read_property(firm, self.source_city_var)
        state = read_property(firm, self.source_state_var)

        # Transient composite for the city, keyed by zip code. It is
        # processed into its final form in emit(). Only the first company
        # seen for a zip code determines the stored city/state.
        city_entry = self.city_data.setdefault(
            zip_code,
            {
                'invariant': {
                    "zip": zip_code,
                    "city": city,
                    "state": state,
                },
            },
        )

        for period in composites.get_periods(firm):
            totals = city_entry.setdefault(
                period,
                {
                    "n_companies": 0,
                    "tot_employees": 0,
                    "tot_revenue": 0.0,
                },
            )
            totals["n_companies"] += 1
            totals["tot_employees"] += read_observation(
                firm, period, self.n_employee_var)
            totals["tot_revenue"] += read_observation(
                firm, period, self.revenue_var)
def __call__(self, composite: Dict):
    """Store the average of one observed variable as a composite property.

    Reads the observation named by ``self.annual_prod_var`` for every
    period of ``composite``, averages the values with ``numpy.average``
    and writes the result to the property named by ``self.mean_prod_var``.

    Args:
        composite: The composite to read observations from and to update.
    """
    # Materialise the period keys before reading any observations.
    period_keys: List[str] = list(composites.get_periods(composite))

    observed = []
    for period in period_keys:
        observed.append(
            composites.get_observation(
                composite, period, self.annual_prod_var))

    composites.put_property(
        composite, self.mean_prod_var, numpy.average(observed))
def __call__(self, composite: Dict):
    """Regress one observed variable on the period and store the fit.

    The composite's periods are converted to integers and sorted, the
    observation named by ``self.annual_weight_var`` is read for each one,
    and ``scipy.stats.linregress`` is run with the periods as x and the
    observations as y. The slope is written to the property named by
    ``self.weight_slope_var`` and the p-value to the property named by
    ``self.weight_pval_var``.

    Args:
        composite: The composite to read observations from and to update.
    """
    years = [int(label) for label in composites.get_periods(composite)]
    years.sort()

    # Observations are looked up with str(int(period)), so a period label
    # must round-trip through int() unchanged (e.g. "2001", not "02001").
    weights = []
    for year in years:
        weights.append(
            composites.get_observation(
                composite, str(year), self.annual_weight_var))

    fit = scipy.stats.linregress(np.asarray(years), np.asarray(weights))

    composites.put_property(composite, self.weight_slope_var, fit.slope)
    composites.put_property(composite, self.weight_pval_var, fit.pvalue)
def get_rows(self, composite_id, composite):
    """Generate output rows for one composite.

    When ``self.invariant`` is truthy, a single item is yielded: a
    one-element list holding a dict of the properties listed in
    ``self.column_vars`` plus ``'composite_id'``.

    Otherwise one dict is yielded for every key of ``composite`` that
    consists only of digits; it holds ``'composite_id'``, ``'period'`` and
    one observation per entry of ``self.column_vars``.

    Args:
        composite_id: Identifier copied into every row.
        composite: Mapping whose digit-only keys are treated as periods.

    Yields:
        A list containing one dict (invariant mode) or a dict per period.
    """
    if self.invariant:
        properties = {}
        for column, var in self.column_vars.items():
            properties[column] = get_property(composite, var)
        properties['composite_id'] = composite_id
        # NOTE(review): this yields a list wrapping the dict, whereas the
        # per-period branch yields bare dicts -- confirm callers expect
        # this difference.
        yield [properties]
        return

    for period in composite:
        # Keys that are not purely numeric are not periods; skip them.
        if not period.isdigit():
            continue
        record = {'composite_id': composite_id, 'period': period}
        record.update({
            column: get_observation(composite, period, var, True)
            for column, var in self.column_vars.items()
        })
        yield record