def index_raw_forms(self):
    # Build a per-letter index of OED lemma blocks, pickled one file
    # per letter under FORM_INDEX_DIR/raw/.
    for initial in string.ascii_lowercase:
        print('Indexing %s...' % initial)
        collected = []
        for entry in entry_iterator(letters=initial):
            # Drop entries whose attestation ends too early, proper
            # names, and over-long lemmas.
            if entry.date().end < ENTRY_MINIMUM_END_DATE:
                continue
            if entry.primary_wordclass() in ('NP', 'NPS'):
                continue
            if len(entry.lemma) > MAX_WORDLENGTH:
                continue
            entry_type = entry.oed_entry_type()
            if entry_type is None:
                continue
            already_done = set()
            for block in entry.wordclass_sets():
                # Keep only blocks that link into OED and that do not
                # shadow something already covered (as e.g. vast adv.
                # shadows vast adj.).
                refentry, refid = block.link(target='oed', asTuple=True)
                if not refentry:
                    continue
                key = (refentry, refid)
                if key in already_done:
                    continue
                block_data = _store_forms(block, entry, entry_type, initial)
                if block_data.standard_types:
                    collected.append(block_data)
                already_done.add(key)
        out_file = os.path.join(FORM_INDEX_DIR, 'raw', initial)
        with open(out_file, 'wb') as filehandle:
            for block in collected:
                pickle.dump(block, filehandle)
def index_proper_names(self):
    # Collect single-word first names, surnames, and place names from
    # the propernames lists...
    allnames = set()
    for name_type in ('firstname', 'surname', 'placename'):
        allnames.update(
            name for name in propernames.names_list(name_type)
            if ' ' not in name
        )
    # ...then add capitalized single-word forms found in OED
    # proper-name (NP/NPS) entries.
    for letter in string.ascii_lowercase:
        print('Indexing proper names in %s...' % letter)
        for entry in entry_iterator(letters=letter):
            if entry.primary_wordclass() not in ('NP', 'NPS'):
                continue
            for typeunit in entry.types():
                if ' ' in typeunit.form:
                    continue
                capitalization = typeunit.lemma_manager().capitalization_type()
                if capitalization != 'capitalized':
                    continue
                allnames.add(typeunit.form)
    # Write the merged set as a tab-separated file, skipping names
    # that are unsortable or too long.
    out_file = os.path.join(FORM_INDEX_DIR, 'proper_names', 'all.txt')
    with open(out_file, 'w') as filehandle:
        for name in allnames:
            sortable = stringtools.lexical_sort(name)
            if not sortable:
                continue
            if len(sortable) > MAX_WORDLENGTH or len(name) > MAX_WORDLENGTH:
                continue
            filehandle.write('%s\t%s\t%s\n' % (
                sortable, name, str(propernames.is_common(name))))
def index_raw_forms():
    """
    Build an index of all OED lemmas and their various
    inflections/variants, based on GEL.

    Writes the result as a series of pickle files (one file per letter).
    """
    for letter in string.ascii_lowercase:
        stdout.write('Indexing %s...\n' % letter)
        accumulated = []
        for entry in entry_iterator(letters=letter):
            # Skip entries whose attestation ends too early, proper
            # names, and over-long lemmas.
            skippable = (
                entry.date().end < ENTRY_MINIMUM_END_DATE or
                entry.primary_wordclass() in ('NP', 'NPS') or
                len(entry.lemma) > MAX_WORDLENGTH
            )
            if skippable:
                continue
            entry_type = entry.oed_entry_type()
            if entry_type is None:
                continue
            handled = set()
            for block in entry.wordclass_sets():
                # The block must link into OED and must not shadow a
                # block already covered within this entry
                # (as e.g. _vast_ adv. shadows _vast_ adj.).
                refentry, refid = block.link(target='oed', asTuple=True)
                if not refentry or (refentry, refid) in handled:
                    continue
                block_data = _store_forms(block, entry, entry_type, letter)
                if block_data.standard_types:
                    accumulated.append(block_data)
                handled.add((refentry, refid))
        out_file = os.path.join(FORM_INDEX_DIR, 'raw', letter)
        with open(out_file, 'wb') as filehandle:
            for block in accumulated:
                pickle.dump(block, filehandle)