def index_frequency_files(in_dir, out_file):
    """Build an XML index of the frequency files: one <letterSet> per initial
    letter, one <file> per source file, recording its first and last entry."""
    # Map letter -> filename -> list of entry labels, in iteration order.
    entry_list = defaultdict(lambda: defaultdict(list))
    iterator = FrequencyIterator(in_dir=in_dir, message='Compiling index')
    for e in iterator.iterate():
        entry_list[e.letter][e.filename].append(e.label)

    doc = etree.Element('letters')
    doc.addprevious(XSLPI)
    for letter in sorted(entry_list.keys()):
        num_files = len(entry_list[letter])
        num_entries = sum(len(labels) for labels in entry_list[letter].values())
        letter_node = etree.SubElement(doc, 'letterSet',
                                       letter=letter,
                                       files=str(num_files),
                                       entries=str(num_entries),)
        for filename in sorted(entry_list[letter].keys()):
            fnode = etree.SubElement(letter_node, 'file',
                                     name=filename,
                                     letter=letter,
                                     entries=str(len(entry_list[letter][filename])))
            t1 = etree.SubElement(fnode, 'first')
            t1.text = entry_list[letter][filename][0]
            t2 = etree.SubElement(fnode, 'last')
            t2.text = entry_list[letter][filename][-1]

    with open(out_file, 'w') as filehandle:
        filehandle.write('<?xml version="1.0" encoding="UTF-8"?>\n')
        filehandle.write(etree.tounicode(doc.getroottree(), pretty_print=True,))
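# A minimal, self-contained sketch of the lxml pattern used above: attach a
# processing instruction ahead of the root element, then serialise the whole
# tree. XSLPI in the project is presumably built along these lines; the
# stylesheet name here is a made-up placeholder, not the project's.
def _sketch_xsl_processing_instruction():
    from lxml import etree
    xslpi = etree.ProcessingInstruction(
        'xml-stylesheet', 'type="text/xsl" href="index.xsl"')
    doc = etree.Element('letters')
    doc.addprevious(xslpi)
    etree.SubElement(doc, 'letterSet', letter='a', files='1', entries='2')
    # Serialising the *tree* (not just the element) keeps the PI in the output.
    return etree.tounicode(doc.getroottree(), pretty_print=True)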
def store_values(self):
    """Write a CSV of lemma, label, id, first date, modern frequency, band,
    and source language for every single-word, unhyphenated entry that has a
    frequency table."""
    print('Loading coordinates...')
    coords = Coordinates()
    print('Checking language overrides...')
    overrides = LanguageOverrides().list_language_overrides()
    print('Loading OED vital statistics...')
    vitalstats = VitalStatisticsCache()

    entries = []
    iterator = FrequencyIterator(message='Listing entries')
    for entry in iterator.iterate():
        if (entry.has_frequency_table() and
                ' ' not in entry.lemma and
                '-' not in entry.lemma):
            language_breadcrumb = vitalstats.find(entry.id, field='language')
            year = vitalstats.find(entry.id, field='first_date') or 0

            if language_breadcrumb is not None:
                languages = [l for l in language_breadcrumb.split('/')
                             if coords.is_listed(l) or l == 'English']
            else:
                languages = ['unspecified', ]
            if entry.id in overrides:
                languages = [overrides[entry.id], ]

            if languages:
                # Pick the most granular level (e.g. 'Icelandic' in
                # preference to 'Germanic')
                language = languages[-1]
                # Find frequency for this word
                freq_table = entry.frequency_table()
                frequency = freq_table.frequency(period='modern')
                band = freq_table.band(period='modern')
                row = (entry.lemma, entry.label, entry.id, year,
                       frequency, band, language)
                entries.append(row)

    # Sort by entry id (the third field in each row).
    entries = sorted(entries, key=lambda entry: entry[2])
    with open(self.out_file, 'w') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerows(entries)
def build_currency_data(self):
    """Collect candidate rows (header row first) for the raw currency data:
    unrevised, non-obsolete entries whose last date falls inside the
    RawCurrencyData window."""
    self.vs = VitalStatisticsCache()
    iterator = FrequencyIterator(in_dir=self.in_dir,
                                 letters=None,
                                 message='Getting data')
    self.candidates = []
    self.candidates.append(list(RawCurrencyData.headers))
    for e in iterator.iterate():
        if (e.end and
                RawCurrencyData.start <= e.end <= RawCurrencyData.end and
                not e.is_obsolete() and
                not self.vs.find(e.id, field='revised') and
                not e.lemma.startswith('-') and
                not e.lemma.endswith('-')):
            if e.frequency_table() is not None:
                freqs = [e.frequency_table().frequency(period=p)
                         for p in RawCurrencyData.periods]
                delta = self.find_delta(e.frequency_table())
            else:
                freqs = [float(0) for p in RawCurrencyData.periods]
                delta = float(1)
            definition = e.definition or ''
            definition = '.' + definition

            row = [
                e.id,
                e.label,
                e.wordclass(),
                self.vs.find(e.id, field='header'),
                self.vs.find(e.id, field='subject'),
                self.vs.find(e.id, field='region'),
                self.vs.find(e.id, field='usage'),
                definition,
                e.start,
                e.end,
                self.vs.find(e.id, field='quotations'),
                self.vs.find(e.id, field='weighted_size'),
                self.is_linked_to_odo(e),
                self.is_logically_current(e),
            ]
            row.extend(['%0.2g' % f for f in freqs])
            row.append('%0.2g' % delta)
            self.candidates.append(tuple(row))
def measure_ratios(self):
    """For noun and verb wordclass sets, compare the total frequency of the
    set with the frequency of each subordinate wordclass, and print the
    median ratio per wordclass."""
    ratios = defaultdict(list)
    iterator = FrequencyIterator(in_dir=self.in_dir,
                                 letters=None,
                                 message='Analysing p.o.s. ratios')
    for e in iterator.iterate():
        for wcs in e.wordclass_sets():
            if ((wcs.wordclass == 'NN' or wcs.wordclass == 'VB') and
                    wcs.has_frequency_table()):
                total = wcs.frequency_table().frequency()
                local = defaultdict(lambda: 0)
                for wordtype in wcs.types():
                    if wordtype.frequency_table().frequency() > 0:
                        local[wordtype.wordclass] +=\
                            wordtype.frequency_table().frequency()
                for wordclass, fpm in local.items():
                    ratios[wordclass].append(total / fpm)

    for wordclass in ratios:
        print('%s\t%0.4g' % (wordclass, numpy.median(ratios[wordclass])))
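# Worked example of the figures collected above, using invented numbers: if a
# noun wordclass set totals 10.0 fpm and its NN types account for 8.0 fpm, the
# ratio stored for 'NN' is 10.0 / 8.0 = 1.25; the report then prints the
# median of all such ratios per wordclass.
def _sketch_median_ratio():
    import numpy
    ratios = {'NN': [1.25, 1.1, 1.4]}  # invented sample values
    for wordclass, values in ratios.items():
        print('%s\t%0.4g' % (wordclass, numpy.median(values)))  # -> NN  1.25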
def store_values(self):
    """Write two CSVs: total frequency per source language per year, and the
    number of entries per source language per year."""
    def nullvalues():
        return {y: 0 for y in YEARS}

    languages = defaultdict(nullvalues)
    num_entries = defaultdict(nullvalues)

    vitalstats = VitalStatisticsCache()
    iterator = FrequencyIterator(message='Measuring language frequency')
    for entry in iterator.iterate():
        if (entry.has_frequency_table() and
                ' ' not in entry.lemma and
                '-' not in entry.lemma):
            freq_table = entry.frequency_table()
            ltext = (vitalstats.find(entry.id, field='indirect_language')
                     or 'unspecified')
            langs = ltext.split('/')
            for year in YEARS:
                frequency = freq_table.frequency(year=year, interpolated=True)
                for language in langs:
                    languages[language][year] += frequency
                    if entry.start < year:
                        num_entries[language][year] += 1

    rows1 = []
    rows1.append(['language', ] + YEARS)
    for lang in sorted(languages.keys()):
        row = [lang, ] + [languages[lang][y] for y in YEARS]
        rows1.append(row)

    rows2 = []
    rows2.append(['language', ] + YEARS)
    for lang in sorted(languages.keys()):
        row = [lang, ] + [num_entries[lang][y] for y in YEARS]
        rows2.append(row)

    with open(self.csv1, 'w') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerows(rows1)
    with open(self.csv2, 'w') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerows(rows2)
def xml_to_csv(in_dir, out_file):
    iterator = FrequencyIterator(in_dir=in_dir,
                                 message='Populating .csv file')
    entries = []
    for e in iterator.iterate():
        if not e.has_frequency_table():
            continue
        frequency = e.frequency_table().frequency(period='modern')
        band = e.frequency_table().band(period='modern')
        label = e.label
        entry_id = e.id
        if e.is_main_entry:
            node_id = None
        else:
            node_id = e.xrnode
        row = (entry_id, node_id, label, frequency, band)
        entries.append(row)

    with open(out_file, 'w') as filehandle:
        csvwriter = csv.writer(filehandle)
        csvwriter.writerows(entries)
def store_rankings(**kwargs):
    in_dir = kwargs.get('in_dir')
    out_file = kwargs.get('out_file') or DEFAULT_FILE

    iterator = FrequencyIterator(in_dir=in_dir,
                                 letters=None,
                                 message='Compiling frequency ranking')
    entryrank = []
    for e in iterator.iterate():
        if e.has_frequency_table():
            entryrank.append((
                e.label,
                e.lemma,
                e.xrid,
                e.frequency_table().frequency(),
            ))
    entryrank = sorted(entryrank, key=lambda e: e[3], reverse=True)

    with open(out_file, 'w') as filehandle:
        csv_writer = csv.writer(filehandle)
        for row in entryrank:
            csv_writer.writerow(row)
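# Hypothetical invocation sketch; the directory and filename below are
# placeholders rather than the project's real paths (out_file falls back to
# DEFAULT_FILE when omitted).
if __name__ == '__main__':
    store_rankings(in_dir='frequency_build', out_file='frequency_ranking.csv')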
def populate_lemmas():
    """Load every entry into the Lemma table, in batches of 1000, storing
    per-year frequencies, frequency bands, and overall ranking."""
    ranking = EntryRank()
    iterator = FrequencyIterator(in_dir=INPUT_DIR,
                                 message='Populating database')
    count = 0
    entries = []
    for e in iterator.iterate():
        count += 1
        if e.is_obsolete():
            last_date = e.end
        else:
            last_date = 2050

        if e.is_main_entry:
            try:
                rank = ranking.entry(e.id).rank
            except AttributeError:
                rank = 250000
        else:
            rank = None

        if e.is_main_entry:
            xrnode = None
        else:
            xrnode = e.xrnode

        entry = Lemma(
            xrnode=xrnode,
            label=e.label[:LABEL_LENGTH],
            alphasort=e.alphasort()[:ALPHASORT_LENGTH],
            definition=(e.definition or '')[:DEFINITION_LENGTH],
            dictsort=count,
            json=e.todict(),
            wordclass=e.wordclass() or 'X',
            startdate=e.start,
            enddate=last_date,
            rank=rank,
            entry_id=e.id,
            mainentry=e.is_main_entry,
        )

        # Frequency + frequency-band fields
        if not e.has_frequency_table():
            for year in FREQUENCY_FIELDS:
                setattr(entry, 'f%d' % year, 0)
            entry.fmodern = 0
            for year in BAND_FIELDS:
                setattr(entry, 'fb%d' % year, NULL_FREQUENCY_BAND)
            entry.fbmodern = NULL_FREQUENCY_BAND
        else:
            for year in FREQUENCY_FIELDS:
                setattr(entry, 'f%d' % year,
                        e.frequency_table().frequency(year=year))
            entry.fmodern = e.frequency_table().frequency(period='modern')
            for year in BAND_FIELDS:
                setattr(entry, 'fb%d' % year,
                        e.frequency_table().band(year=year))
            entry.fbmodern = e.frequency_table().band(period='modern')

        entries.append(entry)
        # Flush to the database in batches of 1000 to keep memory use bounded.
        if len(entries) > 1000:
            Lemma.objects.bulk_create(entries)
            entries = []
    # Flush whatever is left over after the last full batch.
    Lemma.objects.bulk_create(entries)
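# Note on the batching above: Django's bulk_create() also supports batching
# directly, e.g. Lemma.objects.bulk_create(entries, batch_size=1000), if the
# full list were assembled first. The incremental flush used here has the
# advantage of never holding more than ~1000 unsaved Lemma instances in memory
# at once, which matters when iterating over the whole dictionary.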
def analyse(self):
    """Collect diagnostic statistics about the frequency data: band
    distribution, high-frequency entries, large rises and falls since
    1800-49, and frequency-to-quotation ratios."""
    vs = VitalStatisticsCache()
    self.track = {
        'band_distribution': defaultdict(lambda: 0),
        'total_frequency': defaultdict(lambda: 0),
        'high_frequency': [],
        'high_delta_up': [],
        'high_delta_down': [],
        'delta_dist': defaultdict(lambda: 0),
        'plural_to_singular': [],
        'high_frequency_rare': [],
        'frequency_to_size_high': [],
        'frequency_to_size_low': [],
    }
    iterator = FrequencyIterator(in_dir=self.in_dir,
                                 letters=None,
                                 message='Analysing frequency data')
    for e in iterator.iterate():
        if not e.has_frequency_table():
            # Entries with no frequency table go into a catch-all band.
            self.track['band_distribution'][16] += 1
        else:
            ft = e.frequency_table()
            self.track['band_distribution'][ft.band(period='modern')] += 1

            if ft.band(period='modern') <= 5:
                self.track['high_frequency'].append({
                    'label': e.label,
                    'id': e.id,
                    'ftable': ft
                })

            if ft.frequency(period='modern') > 0.5 and e.start < 1750:
                delta = ft.delta('1800-49', 'modern')
                if delta is not None:
                    self.log_delta(delta, reciprocal=True)
                    if delta > 2:
                        self.track['high_delta_up'].append({
                            'label': e.label,
                            'id': e.id,
                            'ftable': ft
                        })

            if (ft.frequency(period='1800-49') > 0.5 and
                    not e.is_obsolete()):
                delta = ft.delta('1800-49', 'modern')
                if delta is not None and delta < 0.5:
                    self.track['high_delta_down'].append({
                        'label': e.label,
                        'id': e.id,
                        'ftable': ft
                    })
                    self.log_delta(delta)

            if ' ' not in e.lemma and '-' not in e.lemma:
                for p in ft.data.keys():
                    self.track['total_frequency'][p] +=\
                        ft.frequency(period=p)

            if (ft.frequency() > 0.01 and
                    self.is_marked_rare(vs.find(e.id, 'header'))):
                self.track['high_frequency_rare'].append({
                    'label': e.label,
                    'id': e.id,
                    'header': vs.find(e.id, 'header'),
                    'fpm': ft.frequency()
                })

            if ft.frequency() > 1:
                self.compare_singular_to_plural(e)

            if ft.frequency() >= 0.0001 and vs.find(e.id, 'quotations') > 0:
                ratio = log(ft.frequency()) / vs.find(e.id, 'quotations')
                if ratio > 0.2:
                    self.track['frequency_to_size_high'].append({
                        'label': e.label,
                        'id': e.id,
                        'quotations': vs.find(e.id, 'quotations'),
                        'fpm': ft.frequency(),
                        'ratio': ratio,
                    })
                if vs.find(e.id, 'quotations') >= 20:
                    self.track['frequency_to_size_low'].append({
                        'label': e.label,
                        'id': e.id,
                        'quotations': vs.find(e.id, 'quotations'),
                        'fpm': ft.frequency(),
                        'ratio': ratio,
                    })