# NOTE(review): this module-level function takes `self` and is a byte-for-byte
# duplicate of JsonPreparation.load_data below — it looks like paste/extraction
# residue rather than an intentional free function. Confirm whether it is ever
# imported/called before removing; left in place to avoid breaking callers.
def load_data(self, **kwargs):
    # Build the entry cache from caller-supplied options, then group entries
    # by year for the per-year output files.
    self.entry_cache = EntryCache(**kwargs)
    self.groups = self.entry_cache.group_by_year()
class JsonPreparation(object):

    """
    Prepare JSON data files to be read by the D3 app.

    Typical usage::

        prep = JsonPreparation()
        prep.load_data(...)        # kwargs passed through to EntryCache
        prep.write(out_dir=..., languages=..., running_totals=...,
                   increase_rate=..., words=..., examples=...,
                   examples_log=...)
    """

    def __init__(self):
        # Populated by load_data(); None until then.
        self.entry_cache = None
        # Sequence of (year, entry_list) pairs, as returned by
        # EntryCache.group_by_year(). Must be re-iterable: it is walked
        # several times by the writer methods below.
        self.groups = None

    def load_data(self, **kwargs):
        """Load entries into the cache and group them by year.

        All keyword arguments are passed straight through to EntryCache.
        """
        self.entry_cache = EntryCache(**kwargs)
        self.groups = self.entry_cache.group_by_year()

    def _group_index(self):
        """Return a dict mapping year -> entry list.

        If a year appears more than once in self.groups, the FIRST
        occurrence wins — this preserves the semantics of the original
        break-on-first-match linear scan, while turning the repeated
        per-year scans into a single O(n) pass.
        """
        index = {}
        for year, entry_list in self.groups:
            index.setdefault(year, entry_list)
        return index

    def write(self, **kwargs):
        """Write all JSON output files for the D3 app.

        Expected kwargs: 'out_dir' (target directory), 'examples_log'
        (path used as-is for the examples log), plus one filename per
        output file keyed 'languages', 'running_totals', 'increase_rate',
        'words' and 'examples'. Filenames are joined onto out_dir.
        """
        out_dir = kwargs.get('out_dir')
        examples_log_file = kwargs.get('examples_log')
        # NOTE: this also "joins" the out_dir/examples_log values onto
        # out_dir, but those keys are never looked up in `files`, so the
        # extra entries are harmless.
        files = {k: os.path.join(out_dir, v) for k, v in kwargs.items()}

        # Languages must be written first: it returns the language -> index
        # mapping that the words file encodes against.
        language_index = self._write_language_file(files['languages'])
        self._write_running_totals_file(files['running_totals'])
        self._write_increase_rate_file(files['increase_rate'])

        entries = defaultdict(list)
        examples = {}
        for year, entry_list in self.groups:
            if START_YEAR <= year <= END_YEAR:
                # Ensure coordinates are computed before winnowing/choosing.
                for entry in entry_list:
                    entry.coordinates()
                entry_list = _winnow(entry_list)
                examples[year] = list(_choose_examples(entry_list, year))
                for entry in entry_list:
                    # Round frequency to one significant figure to keep the
                    # JSON small; integers >= 1 are stored as ints, and
                    # anything that rounds to zero is floored at 0.0001.
                    freq = float('%.1g' % entry.frequency)
                    if freq >= 1:
                        freq = int(freq)
                    freq = max(freq, 0.0001)
                    entries[year].append((
                        entry.id,
                        entry.lemma,
                        entry.band,
                        freq,
                        language_index[entry.language],
                    ))
        _write_words_file(entries, files['words'])
        _write_examples_file(examples, files['examples'], examples_log_file)

    def _write_running_totals_file(self, out_file):
        """Write cumulative frequency sums and entry counts per language group.

        Totals are accumulated year by year from 500 to END_YEAR (seeded
        with zeros at 499), then emitted for START_YEAR onwards. Counts
        are rounded down to the nearest 100 in the output.
        """
        running_totals = {499: {group: 0 for group in LANGUAGE_GROUPS}}
        running_counts = {499: {group: 0 for group in LANGUAGE_GROUPS}}
        # Build the year lookup once instead of scanning self.groups for
        # every year (was O(years * groups)).
        year_to_entries = self._group_index()

        for year in range(500, END_YEAR + 1):
            entries = year_to_entries.get(year, [])
            # Initially, set the running totals to be the same as last year's
            running_totals[year] = {}
            running_counts[year] = {}
            for group in LANGUAGE_GROUPS:
                running_totals[year][group] = running_totals[year - 1][group]
                running_counts[year][group] = running_counts[year - 1][group]
            # ...then add on to the counts for this year
            for entry in entries:
                group = entry.language_group()
                if group in LANGUAGE_GROUPS:
                    running_totals[year][group] += entry.frequency
                    running_counts[year][group] += 1

        minified = {'summedfrequencies': {}, 'counts': {}}
        for year, vals in running_totals.items():
            if year >= START_YEAR:
                this_year_sums = [int(vals[group]) for group in LANGUAGE_GROUPS]
                this_year_counts = [running_counts[year][group]
                                    for group in LANGUAGE_GROUPS]
                minified['summedfrequencies'][year] = this_year_sums
                # Round counts down to the nearest hundred.
                minified['counts'][year] = [int(n / 100) * 100
                                            for n in this_year_counts]
        with open(out_file, 'w') as filehandle:
            json.dump(minified, filehandle)

    def _write_increase_rate_file(self, out_file):
        """Write per-year rate of frequency increase, interpolated.

        Total frequency is summed over 20-year spans (centred on
        xxx0+10), averaged per year, then linearly interpolated onto
        every year from START_YEAR to END_YEAR.
        """
        rates = defaultdict(int)
        # Single-pass index instead of a per-year scan of self.groups.
        year_to_entries = self._group_index()
        for year in range(500, END_YEAR + 1):
            # Midpoint of the 20-year span this year falls into.
            span = (int(year / 20) * 20) + 10
            entries = year_to_entries.get(year, [])
            rates[span] += sum(e.frequency for e in entries)

        # Average per year within each span, sorted by span midpoint.
        rates = [(k, v / 20) for k, v in rates.items()]
        rates.sort(key=lambda a: a[0])

        years = range(START_YEAR, END_YEAR + 1)
        freqs = numpy.interp(years,
                             [r[0] for r in rates],
                             [r[1] for r in rates])
        rates = {year2: int(f) for year2, f in zip(years, freqs)}
        with open(out_file, 'w') as filehandle:
            json.dump(rates, filehandle)

    def _write_language_file(self, out_file):
        """Write the languages file and return a language -> index mapping.

        Each language gets its entry count-derived set of randomized
        geo points. The returned index maps language name to its row
        position in the output, for compact encoding in the words file.
        """
        coords = Coordinates()
        langs = defaultdict(lambda: {'count': 0, 'group': None})
        for year, entry_list in self.groups:
            if year >= START_YEAR and year <= END_YEAR:
                entry_list = list(entry_list)
                for entry in entry_list:
                    langs[entry.language]['count'] += 1
                    langs[entry.language]['group'] = \
                        entry.language_group_initial()

        for language in langs.keys():
            # Number of possible points (clamped between 4 and 30,
            # depending on the frequency of the language).
            num_points = int(langs[language]['count'] / 5)
            num_points = max(4, min(num_points, 30))
            # Select a bunch of random points within the language's
            # geo region.
            langs[language]['coords'] = [
                coords.randomize(language, decimalPlaces=2)
                for i in range(num_points)]

        langs2 = [{'l': language, 'g': vals['group'], 'c': vals['coords']}
                  for language, vals in langs.items()]
        with open(out_file, 'w') as filehandle:
            json.dump(langs2, filehandle)

        language_index = {row['l']: i for i, row in enumerate(langs2)}
        return language_index