Exemplo n.º 1
0
 def load_data(self, **kwargs):
     """Build the entry cache and index its entries by year.

     All keyword arguments are passed through to EntryCache.
     """
     cache = EntryCache(**kwargs)
     self.entry_cache = cache
     self.groups = cache.group_by_year()
Exemplo n.º 2
0
class JsonPreparation(object):

    """
    Prepare JSON data files to be read by the D3 app.
    """

    def __init__(self):
        # Both populated by load_data().
        self.entry_cache = None
        self.groups = None  # iterable of (year, entry_list) pairs

    def load_data(self, **kwargs):
        """Build the entry cache and index its entries by year.

        All keyword arguments are passed through to EntryCache.
        """
        self.entry_cache = EntryCache(**kwargs)
        self.groups = self.entry_cache.group_by_year()

    def _entries_by_year(self):
        """Return a {year: entry_list} index built from self.groups.

        If a year somehow appears more than once, the first occurrence
        wins — matching the first-match semantics of a linear scan.
        """
        index = {}
        for year, entry_list in self.groups:
            index.setdefault(year, entry_list)
        return index

    def write(self, **kwargs):
        """Write all the JSON output files.

        Expected kwargs: 'out_dir' plus filenames keyed under
        'languages', 'running_totals', 'increase_rate', 'words',
        'examples', and 'examples_log'.
        """
        out_dir = kwargs.get('out_dir')
        examples_log_file = kwargs.get('examples_log')
        # NOTE(review): this joins *every* kwarg value onto out_dir
        # (including out_dir itself); only the five file keys below are
        # actually read, and examples_log_file is used unjoined.
        files = {k: os.path.join(out_dir, v) for k, v in kwargs.items()}
        language_index = self._write_language_file(files['languages'])
        self._write_running_totals_file(files['running_totals'])
        self._write_increase_rate_file(files['increase_rate'])

        entries = defaultdict(list)
        examples = {}
        for year, entry_list in self.groups:
            if START_YEAR <= year <= END_YEAR:
                # Pre-compute coordinates (side effect on each entry).
                for entry in entry_list:
                    entry.coordinates()
                entry_list = _winnow(entry_list)
                examples[year] = list(_choose_examples(entry_list, year))
                for entry in entry_list:
                    # Round frequency to one significant figure; values
                    # >= 1 become ints to keep the JSON compact.
                    freq = float('%.1g' % entry.frequency)
                    if freq >= 1:
                        freq = int(freq)
                    # Floor so negligible frequencies stay nonzero.
                    freq = max(freq, 0.0001)
                    entries[year].append((
                        entry.id,
                        entry.lemma,
                        entry.band,
                        freq,
                        language_index[entry.language],
                    ))

        _write_words_file(entries, files['words'])
        _write_examples_file(examples, files['examples'], examples_log_file)

    def _write_running_totals_file(self, out_file):
        """Write running per-language-group frequency totals and entry
        counts, year by year, as JSON.

        Each year from 500 to END_YEAR carries forward the previous
        year's figures and adds this year's entries on top.
        """
        # Index once instead of re-scanning self.groups for every year
        # (the original linear scan made this loop O(years * groups)).
        entries_for = self._entries_by_year()
        running_totals = {499: {group: 0 for group in LANGUAGE_GROUPS}}
        running_counts = {499: {group: 0 for group in LANGUAGE_GROUPS}}
        for year in range(500, END_YEAR + 1):
            # Start from last year's figures (the dicts only ever hold
            # LANGUAGE_GROUPS keys, so a shallow copy is equivalent to
            # copying group by group)...
            running_totals[year] = dict(running_totals[year - 1])
            running_counts[year] = dict(running_counts[year - 1])
            # ...then add on this year's entries.
            for entry in entries_for.get(year, []):
                group = entry.language_group()
                if group in LANGUAGE_GROUPS:
                    running_totals[year][group] += entry.frequency
                    running_counts[year][group] += 1

        minified = {'summedfrequencies': {}, 'counts': {}}
        for year, vals in running_totals.items():
            if year >= START_YEAR:
                this_year_sums = [int(vals[group])
                                  for group in LANGUAGE_GROUPS]
                this_year_counts = [running_counts[year][group]
                                    for group in LANGUAGE_GROUPS]
                minified['summedfrequencies'][year] = this_year_sums
                # Counts are rounded down to the nearest hundred.
                minified['counts'][year] = [n // 100 * 100
                                            for n in this_year_counts]

        with open(out_file, 'w') as filehandle:
            json.dump(minified, filehandle)

    def _write_increase_rate_file(self, out_file):
        """Write the estimated per-year rate of vocabulary increase.

        Frequencies are summed over 20-year spans (keyed on the span's
        midpoint), averaged per year, then linearly interpolated to a
        value for every year between START_YEAR and END_YEAR.
        """
        entries_for = self._entries_by_year()
        rates = defaultdict(int)
        for year in range(500, END_YEAR + 1):
            # Midpoint of the enclosing 20-year span, e.g. 1545 -> 1550.
            span = (year // 20) * 20 + 10
            rates[span] += sum(e.frequency
                               for e in entries_for.get(year, []))

        # Average per year within each span, in chronological order
        # (numpy.interp requires increasing x-coordinates).
        rates = sorted((span, total / 20)
                       for span, total in rates.items())

        years = range(START_YEAR, END_YEAR + 1)
        freqs = numpy.interp(years,
                             [r[0] for r in rates],
                             [r[1] for r in rates])
        rates = {year2: int(f) for year2, f in zip(years, freqs)}

        with open(out_file, 'w') as filehandle:
            json.dump(rates, filehandle)

    def _write_language_file(self, out_file):
        """Write per-language data (group initial + random map points).

        Returns a {language: index} mapping giving each language's
        position in the written JSON array, so other files can refer
        to languages by a compact integer index.
        """
        coords = Coordinates()
        langs = defaultdict(lambda: {'count': 0, 'group': None})
        for year, entry_list in self.groups:
            if START_YEAR <= year <= END_YEAR:
                entry_list = list(entry_list)
                for entry in entry_list:
                    langs[entry.language]['count'] += 1
                    langs[entry.language]['group'] = entry.language_group_initial()

        for language in langs:
            # Number of points scales with the frequency of the
            # language, clamped to the range 4..30.
            num_points = int(langs[language]['count'] / 5)
            num_points = max(4, min(num_points, 30))
            # Select random points within the language's geo region.
            langs[language]['coords'] = [coords.randomize(
                language, decimalPlaces=2) for i in range(num_points)]

        langs2 = [{'l': language,
                   'g': vals['group'],
                   'c': vals['coords']}
                  for language, vals in langs.items()]
        with open(out_file, 'w') as filehandle:
            json.dump(langs2, filehandle)

        language_index = {row['l']: i for i, row in enumerate(langs2)}
        return language_index