def __init__(self, path):
    """Record where the file lives and the metadata encoded in its name.

    The file name must fully match ``DMD.file_pattern``: group 1 is stored
    as ``version`` and group 2 is passed through ``parsedate`` to produce
    ``publish_date``. The working directory is ``DMD._local_directory``
    joined with the file's stem, and the new instance is appended to
    ``DMD._instances``.
    """
    source = Path(path)
    self.path = source
    self.dir = DMD._local_directory / source.stem

    name_match = DMD.file_pattern.fullmatch(source.name)
    assert name_match

    version, published = name_match.group(1, 2)
    self.version = version
    self.publish_date = parsedate(published)

    DMD._instances.append(self)
def analyze_records(reader, fiscal_year, datefield, fields):
    """Summarize numeric fields month by month for one fiscal year.

    The fiscal year is made of twelve month buckets keyed by the first day
    of the month: months 10-12 fall in calendar year ``fiscal_year - 1`` and
    months 1-9 in ``fiscal_year``.

    Args:
        reader: Iterable of records supporting ``record[key]`` lookup.
        fiscal_year: Integer fiscal year to analyze.
        datefield: Key of the record entry holding the date string, parsed
            with ``parsedate`` using ``settings.DATE_FORMATS``.
        fields: Iterable of keys whose values are passed through
            ``benford_filter`` and accumulated.

    Returns:
        A dict mapping each month's first-of-month ``date`` to a dict keyed
        by field name. Each per-field dict has the keys ``field_name``,
        ``value_count``, ``value_sum``, ``mean``, ``median``, ``skew`` and
        ``digits`` (the result of ``benford_difference`` over the collected
        digits). Months with no accepted records are still present; their
        statistics are computed over an empty array.

    Records with a blank date are skipped with a message on stderr; records
    dated outside the fiscal year are skipped with a message on stdout.
    """
    fy_months = [
        date(fiscal_year - (1 if month >= 10 else 0), month, 1)
        for month in range(1, 13)
    ]
    # Loop-invariant bounds, used only in the "out of range" message.
    fy_begin = min(fy_months)
    fy_end = max(fy_months)

    # Every (month, field) pair needs its OWN list. dict.fromkeys(fields, [])
    # would hand the same list object to every field, pooling their values.
    observations = {
        month: {field: [] for field in fields} for month in fy_months
    }
    digits = {
        month: {field: [] for field in fields} for month in fy_months
    }

    for record in reader:
        dtstr = record[datefield]
        if dtstr is None or dtstr.strip() == '':
            # sys.stderr.write behaves the same on Python 2 and Python 3.
            sys.stderr.write("Skipping record with blank date field.\n")
            continue

        dt = parsedate(dtstr, settings.DATE_FORMATS)
        dt1 = date(dt.year, dt.month, 1)
        if dt1 not in observations:
            # Single parenthesized argument: valid as both a Python 2 print
            # statement and a Python 3 print call.
            print("Skipping %s-%s because it's not in %s-%s - %s-%s" % (
                dt1.year, dt1.month,
                fy_begin.year, fy_begin.month,
                fy_end.year, fy_end.month))
            continue

        for field in fields:
            (value, digit) = benford_filter(record[field])
            if value is not None:
                observations[dt1][field].append(value)
            if digit is not None:
                digits[dt1][field].append(digit)

    results = {}
    for month in fy_months:
        month_results = {}
        for field in fields:
            obs = observations[month][field]
            obs_array = numpy.array(obs, dtype=float)
            # A fresh dict per field; a shared one would be overwritten by
            # whichever field happened to be processed last.
            month_results[field] = {
                'field_name': field,
                'value_count': len(obs),
                'value_sum': numpy.sum(obs_array),
                'mean': numpy.mean(obs_array),
                'median': numpy.median(obs_array),
                'skew': stats.skew(obs_array),
                'digits': benford_difference(digits[month][field]),
            }
        results[month] = month_results
    return results