def store_values(self):
    print('Loading coordinates...')
    coords = Coordinates()
    print('Checking language overrides...')
    overrides = LanguageOverrides().list_language_overrides()
    print('Loading OED vital statistics...')
    vitalstats = VitalStatisticsCache()

    entries = []
    iterator = FrequencyIterator(message='Listing entries')
    for entry in iterator.iterate():
        if (entry.has_frequency_table() and
                ' ' not in entry.lemma and
                '-' not in entry.lemma):
            language_breadcrumb = vitalstats.find(entry.id, field='language')
            year = vitalstats.find(entry.id, field='first_date') or 0

            if language_breadcrumb is not None:
                languages = [l for l in language_breadcrumb.split('/')
                             if coords.is_listed(l) or l == 'English']
            else:
                languages = ['unspecified', ]
            if entry.id in overrides:
                languages = [overrides[entry.id], ]

            if languages:
                # Pick the most granular level (e.g. 'Icelandic' in
                # preference to 'Germanic')
                language = languages[-1]
                # Find frequency for this word
                freq_table = entry.frequency_table()
                frequency = freq_table.frequency(period='modern')
                band = freq_table.band(period='modern')
                row = (entry.lemma, entry.label, entry.id, year,
                       frequency, band, language)
                entries.append(row)

    entries = sorted(entries, key=lambda entry: entry[2])
    with open(self.out_file, 'w') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerows(entries)
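
# Illustrative sketch (not part of the original method) of reading back the
# CSV written above; the column order mirrors the row tuple built in
# store_values(), and the file path is an assumption.
import csv

with open('entries.csv') as csvfile:
    for lemma, label, entry_id, year, frequency, band, language in csv.reader(csvfile):
        if float(frequency) > 1:
            print('%s (%s): %s per million, band %s' % (lemma, language, frequency, band))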
def store_values(self):
    def nullvalues():
        return {y: 0 for y in YEARS}

    languages = defaultdict(nullvalues)
    num_entries = defaultdict(nullvalues)
    vitalstats = VitalStatisticsCache()

    iterator = FrequencyIterator(message='Measuring language frequency')
    for entry in iterator.iterate():
        if (entry.has_frequency_table() and
                ' ' not in entry.lemma and
                '-' not in entry.lemma):
            freq_table = entry.frequency_table()
            ltext = (vitalstats.find(entry.id, field='indirect_language') or
                     'unspecified')
            langs = ltext.split('/')
            for year in YEARS:
                frequency = freq_table.frequency(year=year, interpolated=True)
                for language in langs:
                    languages[language][year] += frequency
                    if entry.start < year:
                        num_entries[language][year] += 1

    rows1 = [['language', ] + YEARS]
    for lang in sorted(languages.keys()):
        rows1.append([lang, ] + [languages[lang][y] for y in YEARS])

    rows2 = [['language', ] + YEARS]
    for lang in sorted(languages.keys()):
        rows2.append([lang, ] + [num_entries[lang][y] for y in YEARS])

    with open(self.csv1, 'w') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerows(rows1)
    with open(self.csv2, 'w') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerows(rows2)
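
# Illustrative sketch (not part of the original method): given the
# 'languages' totals accumulated above, compute each language's share of
# total frequency in a given year. YEARS and the populated mapping are
# assumed to be in scope.
def language_shares(languages, year):
    total = sum(per_year[year] for per_year in languages.values())
    if not total:
        return {}
    return {lang: per_year[year] / total
            for lang, per_year in languages.items()}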
def refine_index():
    """
    Refine the data built by index_raw_forms(), in particular removing
    minor homographs (both lemma-level homographs and wordform-level
    homographs).

    Also swaps in standard lemma forms, main-sense definitions, and
    thesaurus links.
    """
    # Determine which alien variants are okay to keep (because they don't
    # shadow main forms). Alien types are wordforms which begin with
    # a different letter from their parent lemma, and so wouldn't be
    # correctly filtered by the main letter-by-letter filtering process.
    stdout.write("Filtering alien types...\n")
    allowed_alien_types = _filter_alien_types()
    stdout.write("...done\n")

    # Initialize the resources that will be used for look-ups
    vitalstats = VitalStatisticsCache()
    main_sense_checker = MainSensesCache(with_definitions=True)

    for letter in string.ascii_lowercase:
        stdout.write("Refining index for %s...\n" % letter)
        blocks = []
        for block in _raw_pickle_iterator(letter):
            blocks.append(block)

        # Remove duplicate types, so that only the version in the block
        # with the highest frequency is retained. Cluster together
        # typeunits with the same wordform + wordclass
        standardmap = defaultdict(lambda: defaultdict(list))
        for i, block in enumerate(blocks):
            for typeunit in block.standard_types:
                standardmap[typeunit.wordform][typeunit.wordclassflat].append((i, typeunit))
        # Go through each wordclass-cluster for each wordform, and pick
        # the highest-frequency instance in each case
        for wordform, wordclasses in standardmap.items():
            winners = []
            for candidates in wordclasses.values():
                # Sort by frequency (highest first)
                candidates.sort(key=lambda c: c[1].f2000, reverse=True)
                # Remove the first candidate (the highest-frequency one);
                # this is the one we'll keep.
                winners.append(candidates.pop(0))
                # Delete all the rest
                for index, typeunit in candidates:
                    blocks[index].standard_types.discard(typeunit)
            # We should now be left with the highest-scoring wordclasses
            # for the current wordform (e.g. the highest-frequency
            # homograph for spell_VB and the highest-frequency homograph
            # for spell_NN). We now need to decide which of these to keep
            # and which to discard.
            discards = _discardable_homographs(winners)
            for index, typeunit in discards:
                blocks[index].standard_types.discard(typeunit)

        # Remove variant types which either duplicate each other or that
        # shadow a standard type. (Standard types are always given
        # precedence.)
        varmap = defaultdict(list)
        for i, block in enumerate(blocks):
            for typeunit in block.variant_types:
                varmap[typeunit.wordform].append((i, typeunit, block.f2000))
        for wordform, candidates in varmap.items():
            if wordform not in standardmap:
                # Sort by the frequency of the parent lemma
                candidates.sort(key=lambda c: c[2], reverse=True)
                # Remove the first candidate (the highest-frequency one);
                # this is the one we'll keep.
                candidates.pop(0)
            # Delete all the rest (everything, if the wordform shadows
            # a standard type)
            for index, typeunit, _ in candidates:
                blocks[index].variant_types.discard(typeunit)

        # Remove any alien types that are not allowed (because they
        # shadow other standard types or variants).
        for block in blocks:
            to_be_deleted = set()
            for typeunit in block.alien_types:
                if typeunit.wordform not in allowed_alien_types:
                    to_be_deleted.add(typeunit)
            for typeunit in to_be_deleted:
                block.alien_types.discard(typeunit)

        # Remove any blocks whose standard_types and variant_types sets
        # have now been completely emptied. For the remainder, turn
        # standard_forms and variant_forms from sets into lists.
        blocks = [_listify_forms(b) for b in blocks
                  if b.standard_types or b.variant_types]

        blocks_filtered = []
        for block in blocks:
            language = vitalstats.find(block.refentry, field="indirect_language")
            if not language and block.start and block.start < 1200:
                language = "West Germanic"
            block = _replace_language(block, language)

            # Acquire main-sense data for this block (which will be used
            # to swap in a new definition and a thesaurus link)
            if block.type == "entry":
                ms_block_data = main_sense_checker.find_block_data(
                    block.refentry, block.refid)
                if ms_block_data and ms_block_data.senses:
                    main_sense_data = ms_block_data.senses[0]
                    main_sense_confidence = ms_block_data.confidence()
                else:
                    main_sense_data = None
                    main_sense_confidence = None
            else:
                main_sense_data = None
                main_sense_confidence = None

            # Swap in thesaurus-class link
            block = _replace_htclass(block, main_sense_data,
                                     main_sense_confidence)

            if block.type == "entry":
                # Make sure we use the OED headword, not the headword
                # that's been used in GEL (which could be the version of
                # the headword found in ODE or NOAD).
                headword = vitalstats.find(block.refentry, field="headword")
                if headword and headword != block.lemma:
                    block = _replace_lemma(block, headword)
                # Make sure we use the best main-sense definition
                if main_sense_data and main_sense_data.definition:
                    block = _replace_definition(block,
                                                main_sense_data.definition)

            blocks_filtered.append(block)

        out_file = os.path.join(FORM_INDEX_DIR, "refined", letter + ".json")
        with open(out_file, "w") as filehandle:
            for block in blocks_filtered:
                filehandle.write(json.dumps(block) + "\n")
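
# Sketch of consuming the refined output written above: each per-letter
# file holds one JSON-serialized block per line. FORM_INDEX_DIR is the
# same constant used in refine_index(); the letter is just an example.
import json
import os

with open(os.path.join(FORM_INDEX_DIR, "refined", "a.json")) as filehandle:
    refined_blocks = [json.loads(line) for line in filehandle]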
class LinkUpdater(object):

    error_message = '!ERROR entry not found'

    def __init__(self, **kwargs):
        self.dict_name = kwargs.get('dictName')
        self.oed_in = kwargs.get('oedIn', None)
        self.oed_out = kwargs.get('oedOut', None)
        self.odo_in = kwargs.get('odoIn', None)
        self.odo_out = kwargs.get('odoOut', None)
        self.oed_index = VitalStatisticsCache()
        self.odo_index = Distiller(dictName=self.dict_name)
        self.odo_index.load_distilled_file()

    def update_odo(self, **kwargs):
        valid_links_only = kwargs.get('validLinksOnly', False)
        tree = etree.parse(self.odo_in)
        for entry in tree.findall('./e'):
            lexid = entry.get('lexid', None)
            odo_label = entry.find('./label')
            odo_label_text = (self.odo_index.headword_by_id(lexid) or
                              LinkUpdater.error_message)
            etree.strip_tags(odo_label, 'i', 'sup', 'sub', 'hm')
            odo_label.text = odo_label_text

            link = entry.find('./linkSet/link')
            if link is not None:
                refentry = link.get('refentry', '0')
                refid = link.get('refid', '0')
                oed_label_text = (self.oed_index.find(refentry, field='label') or
                                  LinkUpdater.error_message)
                etree.strip_tags(link, 'i', 'sup', 'sub', 'hm')
                link.text = oed_label_text

            if (valid_links_only and
                    (link is None or
                     link.text == LinkUpdater.error_message or
                     odo_label.text == LinkUpdater.error_message or
                     not check_match(link.text, odo_label.text))):
                entry.getparent().remove(entry)

        with open(self.odo_out, 'w') as filehandle:
            filehandle.write(etree.tostring(tree,
                                            pretty_print=True,
                                            encoding='unicode'))

    def update_oed(self, **kwargs):
        valid_links_only = kwargs.get('validLinksOnly', False)
        tree = etree.parse(self.oed_in)
        for entry in tree.findall('./link'):
            oed_id = entry.get('sourceID', None)
            oed_label_text = (self.oed_index.find(oed_id, field='label') or
                              LinkUpdater.error_message)
            source_label = entry.find('./sourceLabel')
            etree.strip_tags(source_label, 'i', 'sup', 'sub', 'hm')
            source_label.text = oed_label_text

            lexid = entry.get('targetID', None)
            ode_label_text = (self.odo_index.headword_by_id(lexid) or
                              LinkUpdater.error_message)
            target_label = entry.find('./targetLabel')
            etree.strip_tags(target_label, 'i', 'sup', 'sub', 'hm')
            target_label.text = ode_label_text

            if (valid_links_only and
                    (oed_id is None or
                     lexid is None or
                     source_label.text == LinkUpdater.error_message or
                     target_label.text == LinkUpdater.error_message or
                     not check_match(source_label.text, target_label.text))):
                entry.getparent().remove(entry)

        with open(self.oed_out, 'w') as filehandle:
            filehandle.write(etree.tostring(tree,
                                            pretty_print=True,
                                            encoding='unicode'))
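
# Hypothetical usage sketch: the dictionary name and file paths are
# assumptions for illustration, and 'validLinksOnly' mirrors the keyword
# argument handled above.
updater = LinkUpdater(dictName='ode',
                      odoIn='ode_links.xml',
                      odoOut='ode_links_checked.xml')
updater.update_odo(validLinksOnly=True)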
def analyse(self):
    vs = VitalStatisticsCache()
    self.track = {
        'band_distribution': defaultdict(lambda: 0),
        'total_frequency': defaultdict(lambda: 0),
        'high_frequency': [],
        'high_delta_up': [],
        'high_delta_down': [],
        'delta_dist': defaultdict(lambda: 0),
        'plural_to_singular': [],
        'high_frequency_rare': [],
        'frequency_to_size_high': [],
        'frequency_to_size_low': [],
    }
    iterator = FrequencyIterator(in_dir=self.in_dir,
                                 letters=None,
                                 message='Analysing frequency data')
    for e in iterator.iterate():
        if not e.has_frequency_table():
            self.track['band_distribution'][16] += 1
        else:
            ft = e.frequency_table()
            self.track['band_distribution'][ft.band(period='modern')] += 1

            if ft.band(period='modern') <= 5:
                self.track['high_frequency'].append({
                    'label': e.label,
                    'id': e.id,
                    'ftable': ft,
                })

            if ft.frequency(period='modern') > 0.5 and e.start < 1750:
                delta = ft.delta('1800-49', 'modern')
                if delta is not None:
                    self.log_delta(delta, reciprocal=True)
                    if delta > 2:
                        self.track['high_delta_up'].append({
                            'label': e.label,
                            'id': e.id,
                            'ftable': ft,
                        })

            if (ft.frequency(period='1800-49') > 0.5 and
                    not e.is_obsolete()):
                delta = ft.delta('1800-49', 'modern')
                if delta is not None and delta < 0.5:
                    self.track['high_delta_down'].append({
                        'label': e.label,
                        'id': e.id,
                        'ftable': ft,
                    })
                    self.log_delta(delta)

            if ' ' not in e.lemma and '-' not in e.lemma:
                for p in ft.data.keys():
                    self.track['total_frequency'][p] += ft.frequency(period=p)

            if (ft.frequency() > 0.01 and
                    self.is_marked_rare(vs.find(e.id, 'header'))):
                self.track['high_frequency_rare'].append({
                    'label': e.label,
                    'id': e.id,
                    'header': vs.find(e.id, 'header'),
                    'fpm': ft.frequency(),
                })

            if ft.frequency() > 1:
                self.compare_singular_to_plural(e)

            if ft.frequency() >= 0.0001 and vs.find(e.id, 'quotations') > 0:
                ratio = log(ft.frequency()) / vs.find(e.id, 'quotations')
                if ratio > 0.2:
                    self.track['frequency_to_size_high'].append({
                        'label': e.label,
                        'id': e.id,
                        'quotations': vs.find(e.id, 'quotations'),
                        'fpm': ft.frequency(),
                        'ratio': ratio,
                    })
                if vs.find(e.id, 'quotations') >= 20:
                    self.track['frequency_to_size_low'].append({
                        'label': e.label,
                        'id': e.id,
                        'quotations': vs.find(e.id, 'quotations'),
                        'fpm': ft.frequency(),
                        'ratio': ratio,
                    })
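
# Illustrative follow-on (a hypothetical helper, not part of the original
# class): list the entries flagged above as having risen sharply, ordered
# by modern frequency.
def report_high_deltas(track):
    flagged = sorted(track['high_delta_up'],
                     key=lambda item: item['ftable'].frequency(period='modern'),
                     reverse=True)
    for item in flagged:
        print('%s (%s): more than doubled since 1800-49' %
              (item['label'], item['id']))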
def refine_index(self):
    allowed_alien_types = _filter_alien_types()
    vitalstats = VitalStatisticsCache()
    main_sense_checker = MainSensesCache(with_definitions=True)

    for letter in string.ascii_lowercase:
        print('Refining index for %s...' % letter)
        blocks = []
        for block in raw_pickle_iterator(letter):
            blocks.append(block)

        # Remove duplicate types, so that only the version
        # in the block with the highest frequency is retained.
        standardmap = defaultdict(list)
        for i, block in enumerate(blocks):
            for wordform in block.standard_types:
                standardmap[wordform].append((i, block.frequency))
        for wordform, candidates in standardmap.items():
            if len(candidates) > 1:
                # Sort by frequency
                candidates.sort(key=lambda c: c[1], reverse=True)
                # Remove the first candidate (the highest-frequency
                # one); this is the one we'll keep.
                candidates.pop(0)
                # Delete all the rest
                for index in [c[0] for c in candidates]:
                    blocks[index].standard_types.discard(wordform)

        # Remove variant types which either duplicate each other
        # or that shadow a standard type (standard types are always
        # given precedence).
        varmap = defaultdict(list)
        for i, block in enumerate(blocks):
            for wordform in block.variant_types:
                varmap[wordform].append((i, block.frequency))
        for wordform, candidates in varmap.items():
            if wordform not in standardmap:
                # Sort by frequency
                candidates.sort(key=lambda c: c[1], reverse=True)
                # Remove the first candidate (the highest-frequency
                # one); this is the one we'll keep.
                candidates.pop(0)
            # Delete all the rest
            for index in [c[0] for c in candidates]:
                blocks[index].variant_types.discard(wordform)

        # Remove any alien types that are not allowed (because they
        # shadow other standard types or variants).
        for block in blocks:
            to_be_deleted = set()
            for wordform in block.alien_types:
                if wordform not in allowed_alien_types:
                    to_be_deleted.add(wordform)
            for wordform in to_be_deleted:
                block.alien_types.discard(wordform)

        # Remove any blocks whose standard_types and variant_types sets
        # have now been completely emptied. For the remainder, turn
        # standard_forms and variant_forms from sets into lists.
        blocks = [_listify_forms(b) for b in blocks
                  if b.standard_types or b.variant_types]

        blocks_filtered = []
        for block in blocks:
            language = vitalstats.find(block.refentry, field='indirect_language')
            if not language and block.start and block.start < 1200:
                language = 'West Germanic'
            block = _replace_language(block, language)

            if block.type == 'entry':
                # Make sure we use the OED headword, not the headword
                # that's been used in GEL (which could be the version
                # of the headword found in ODE or NOAD).
                headword = vitalstats.find(block.refentry, field='headword')
                if headword and headword != block.lemma:
                    block = _replace_lemma(block, headword)
                # Make sure we use the correct main-sense definition
                main_sense = main_sense_checker.find_main_sense_data(
                    block.refentry, block.refid)
                if main_sense and main_sense.definition:
                    block = _replace_definition(block, main_sense.definition)
            blocks_filtered.append(block)

        out_file = os.path.join(FORM_INDEX_DIR, 'refined', letter + '.json')
        with open(out_file, 'w') as filehandle:
            for block in blocks_filtered:
                filehandle.write(json.dumps(block) + '\n')
class RawCurrencyData(object):

    start = frequencyconfig.RANGE_START
    end = frequencyconfig.RANGE_END
    periods = ('1800-49', '1850-99', '1900-49', '1950-99', '2000-')
    headers = ['id', 'label', 'wordclass', 'header', 'subject', 'region',
               'usage', 'definition', 'start', 'end', 'quotations',
               'weighted size', 'ODO-linked', 'logically current']
    headers.extend(periods)
    headers.append('frequency change')

    # Parameters for testing logical currency.
    # Note: both suffix tiers are currently built from
    # LOGICAL_CURRENCY_SUFFIXES1.
    logical = {
        'date': frequencyconfig.LOGICAL_CURRENCY_DATE,
        'size': frequencyconfig.LOGICAL_CURRENCY_SIZE,
        'suffixes1': ['-' + j for j in
                      frequencyconfig.LOGICAL_CURRENCY_SUFFIXES1.split('|')],
        'suffixes2': ['-' + j for j in
                      frequencyconfig.LOGICAL_CURRENCY_SUFFIXES1.split('|')],
    }

    def __init__(self, **kwargs):
        self.in_dir = kwargs.get('in_dir')

    def build_currency_data(self):
        self.vs = VitalStatisticsCache()
        iterator = FrequencyIterator(in_dir=self.in_dir,
                                     letters=None,
                                     message='Getting data')
        self.candidates = [list(RawCurrencyData.headers)]
        for e in iterator.iterate():
            if (e.end and
                    RawCurrencyData.start <= e.end <= RawCurrencyData.end and
                    not e.is_obsolete() and
                    not self.vs.find(e.id, field='revised') and
                    not e.lemma.startswith('-') and
                    not e.lemma.endswith('-')):
                if e.frequency_table() is not None:
                    freqs = [e.frequency_table().frequency(period=p)
                             for p in RawCurrencyData.periods]
                    delta = self.find_delta(e.frequency_table())
                else:
                    freqs = [float(0) for p in RawCurrencyData.periods]
                    delta = float(1)
                definition = e.definition or ''
                # Prefix a dot so that downstream spreadsheet tools treat
                # the definition as literal text
                definition = '.' + definition
                row = [
                    e.id,
                    e.label,
                    e.wordclass(),
                    self.vs.find(e.id, field='header'),
                    self.vs.find(e.id, field='subject'),
                    self.vs.find(e.id, field='region'),
                    self.vs.find(e.id, field='usage'),
                    definition,
                    e.start,
                    e.end,
                    self.vs.find(e.id, field='quotations'),
                    self.vs.find(e.id, field='weighted_size'),
                    self.is_linked_to_odo(e),
                    self.is_logically_current(e),
                ]
                row.extend(['%0.2g' % f for f in freqs])
                row.append('%0.2g' % delta)
                self.candidates.append(tuple(row))

    def is_logically_current(self, e):
        etyma = self.vs.find(e.id, field='etyma')
        if len(etyma) == 2:
            if etyma[1][0] in RawCurrencyData.logical['suffixes1']:
                parent_id = etyma[0][1]
                tier = 'high'
            elif etyma[1][0] in RawCurrencyData.logical['suffixes2']:
                parent_id = etyma[0][1]
                tier = 'low'
            else:
                tier = None
            if (tier is not None and
                    (self.vs.find(parent_id, field='last_date') > RawCurrencyData.end or
                     (self.vs.find(parent_id, field='last_date') > RawCurrencyData.logical['date'] and
                      self.vs.find(parent_id, field='quotations') > RawCurrencyData.logical['size']))):
                return tier
        return None

    def is_linked_to_odo(self, e):
        if (self.vs.find(e.id, field='ode') is not None or
                self.vs.find(e.id, field='noad') is not None):
            return True
        else:
            return False

    def write(self, filepath):
        with open(filepath, 'w') as csvfile:
            csvw = csv.writer(csvfile)
            csvw.writerows(self.candidates)

    def find_delta(self, ft):
        # Ratio of 1950-99 frequency to 1800-99 frequency; ratios below 1
        # are expressed as negative reciprocals (e.g. a halving becomes -2).
        f1 = ft.frequency(period='1800-99')
        f2 = ft.frequency(period='1950-99')
        if f1 == 0:
            d = float(1)
        elif f2 == 0:
            d = 0.0001 / f1
        else:
            d = f2 / f1
        if d < 1:
            d = -(1 / d)
        return d
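
# Hypothetical usage sketch; the input directory and output filename are
# assumptions for illustration.
data = RawCurrencyData(in_dir='/path/to/frequency/build')
data.build_currency_data()
data.write('currency_candidates.csv')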