def load_morphgroups(self):
    iterator = EntryIterator(dictType='ode')
    for entry in iterator.iterate():
        for morphgroup in entry.morphgroups():
            if (WORDCLASS_PATTERN.search(morphgroup.baseclass) and
                    morphgroup.variant_type != 'deprecated'):
                self._process_morphgroup(morphgroup)

def process(self):
    self.clear_outdir()
    self.initialize_root()
    previous = None
    iterator = EntryIterator(dict_type="oed",
                             verbosity="low",
                             fix_ligatures=True)
    # Iterate through all entries in OED, processing each and storing
    # the results in a buffer
    for entry in iterator.iterate():
        self.entry = entry
        # Write the buffer to a file when it gets to a certain size, and
        # there's an appropriate break, e.g. not in the middle
        # of homographs.
        if (self.buffersize() >= FILESIZE and
                entry.lemma_manager().lexical_sort() != previous):
            self.writebuffer()
            self.initialize_root()
        # Process the current entry -> buffer
        self.process_entry()
        # Keep track of the previous entry's headword (to help find a good
        # opportunity to write the buffer to a file)
        previous = entry.lemma_manager().lexical_sort()
    # Write a file for anything still left in the buffer after the
    # entry iterator has completed
    self.writebuffer()

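# The buffered-write pattern used above (flush only once the buffer reaches a
# threshold AND the lexical sort code changes, so runs of homographs are never
# split across output files) can be shown in isolation. A minimal sketch with
# hypothetical names, not the actual class implementation:
def chunk_at_boundaries(items_with_sortcode, max_size):
    """Yield lists of items, breaking only where the sort code changes."""
    buffer, previous = [], None
    for sortcode, item in items_with_sortcode:
        # Flush at a boundary between different sort codes once the
        # buffer is full enough.
        if len(buffer) >= max_size and sortcode != previous:
            yield buffer
            buffer = []
        buffer.append(item)
        previous = sortcode
    # Whatever is left after the iterator completes
    if buffer:
        yield buffer
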
def distil(self):
    iterator = EntryIterator(dictType=self.dict_name)
    with open(self.pickle_file, 'wb') as filehandle:
        for entry in iterator.iterate():
            distilled = _parse_source_entry(entry, self.definition_length)
            if distilled.wordclass_blocks:
                pickle.dump(distilled, filehandle)

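# Distilled entries are pickled sequentially into a single file, so reading
# them back means repeated pickle.load() calls until end-of-file. A minimal
# sketch of such a consumer (the real read-back code is not shown here):
import pickle

def iter_pickled(filepath):
    """Yield each object that was pickled sequentially into filepath."""
    with open(filepath, 'rb') as filehandle:
        while True:
            try:
                yield pickle.load(filehandle)
            except EOFError:
                return
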
def test_oed_parser():
    names = []
    for letter in LETTERS:
        print('Collecting names in %s...' % letter)
        filter_pattern = 'oed_%s.xml' % letter.upper()
        iterator = EntryIterator(dictType='oed',
                                 fixLigatures=True,
                                 fileFilter=filter_pattern,
                                 verbosity=None)
        for entry in iterator.iterate():
            if (('personal name' in entry.characteristic_nodes('etymonLanguage') or
                    'place name' in entry.characteristic_nodes('etymonLanguage')) and
                    re.search(r'^([A-Z]|[A-Z]\'[A-Z])[a-z]+$', entry.headword) and
                    not entry.headword.endswith(('ism', 'ist', 'ian', 'ite')) and
                    entry.primary_wordclass().penn == 'NN'):
                print(entry.headword)
                names.append(entry.headword)
            #for et in entry.etymology().etyma():
            #    print(et)
            #for s1 in entry.s1blocks():
            #    s1.share_quotations()
            #    for i, s in enumerate(s1.senses()):
            #        _process_sense(s, i, len(s1.senses()))
    with open('somenames.txt', 'w') as filehandle:
        for name in names:
            filehandle.write(name + '\n')

def count():
    counts = defaultdict(lambda: defaultdict(int))
    ei = EntryIterator(path=oed_dir,
                       dictType="oed",
                       fixLigatures=True,
                       verbosity="low")
    for entry in ei.iterate():
        entry.share_quotations()
        senses = {"all": entry.senses,
                  "missed": [s for s in entry.senses
                             if not s.thesaurus_categories()]}
        for j in ("all", "missed"):
            for s in senses[j]:
                counts[j]["all"] += 1
                if s.is_subentry() and s.definition():
                    counts[j]["defined_subentry"] += 1
                elif s.is_subentry():
                    counts[j]["undefined_subentry"] += 1
                else:
                    counts[j]["main_sense"] += 1
                counts[j]["quotations"] += s.num_quotations
                if s.primary_wordclass.penn in ('NN', 'VB', 'JJ', 'RB'):
                    counts[j][s.primary_wordclass.penn] += 1
                else:
                    counts[j]['other_wordclass'] += 1
                if entry.is_revised:
                    counts[j]["revised"] += 1
                else:
                    counts[j]["unrevised"] += 1
    for j in ("all", "missed"):
        print(j)
        for k, v in counts[j].items():
            print("\t%s\t%d" % (k, v))

def store_vital_statistics(self):
    for letter in LETTERS:
        print('Collecting vital statistics in %s...' % letter)
        filter_pattern = 'oed_%s.xml' % letter.upper()
        iterator = EntryIterator(path=self.oed_dir,
                                 dictType='oed',
                                 fixLigatures=True,
                                 fileFilter=filter_pattern,
                                 verbosity=None)
        self.doc = etree.Element('entries')
        for entry in iterator.iterate():
            entry_node = etree.SubElement(
                self.doc,
                'e',
                xrid=entry.id,
                quotations=str(entry.num_quotations(force_recount=True)),
                weightedSize='%0.2g' % entry.weighted_size(),
                obsolete=str(entry.is_marked_obsolete()),
                revised=str(entry.is_revised),
                firstDate=str(entry.date().start),
                lastDate=str(entry.date().end)
            )
            label_node = etree.SubElement(entry_node, 'label')
            label_node.text = entry.label()
            hw_node = etree.SubElement(entry_node, 'headword')
            hw_node.text = entry.headword
            if entry.header() is not None:
                header_node = etree.SubElement(entry_node, 'header')
                header_node.text = entry.header()
            etym_node = etree.SubElement(entry_node, 'etyma')
            for etymon in entry.etymology().etyma():
                if etymon.type() == 'cross-reference':
                    etymon_node = etree.SubElement(etym_node, 'etymon')
                    etymon_node.set('xrid', str(etymon.refentry()))
                    etymon_node.text = etymon.lemma
            lang_node = etree.SubElement(entry_node, 'language')
            language = (entry.characteristic_first('etymonLanguage') or
                        entry.characteristic_first('sourceLanguage'))
            if language:
                lang_node.text = language
            def_node = etree.SubElement(entry_node, 'def')
            definition = entry.definition(length=100, current=True)
            if definition:
                def_node.text = definition
            if entry.senses():
                for label_type in ('subject', 'region', 'usage'):
                    label_text = entry.senses()[0].characteristic_first(label_type)
                    label_text = label_text.split('/')[-1]
                    if label_text:
                        label_node = etree.SubElement(entry_node, label_type)
                        label_node.text = label_text
        self._write_output(letter)

def store_main_senses(**kwargs):
    """
    Store main-sense data for OED entries as XML documents.
    """
    from lex.entryiterator import EntryIterator
    oed_dir = kwargs.get("oed_dir") or DEFAULT_INPUT
    out_dir = kwargs.get("out_dir") or DEFAULT_OUTPUT
    for letter in LETTERS:
        print("Collecting main-sense data in %s..." % letter)
        filter_pattern = "oed_%s.xml" % letter.upper()
        iterator = EntryIterator(path=oed_dir,
                                 dictType="oed",
                                 fixLigatures=True,
                                 fileFilter=filter_pattern,
                                 verbosity=None)
        doc = etree.Element("entries")
        for entry in iterator.iterate():
            entry.check_revised_status()
            entry_node = etree.SubElement(doc, "e", refentry=entry.id)
            label_node = etree.SubElement(entry_node, "label")
            label_node.text = entry.label()
            hw_node = etree.SubElement(entry_node, "headword")
            hw_node.text = entry.headword
            for block in entry.s1blocks():
                ranking, num_current, num_large, num_quotations = \
                    calculate_main_sense(block)
                if ranking:
                    wordclass = block.primary_wordclass().penn or "null"
                    num_senses = len(block.senses())
                    s1_node = etree.SubElement(
                        entry_node,
                        "s1",
                        wordclass=wordclass,
                        refid=block.node_id(),
                        senses=str(num_senses),
                        currentSenses=str(num_current),
                        largeSenses=str(num_large),
                        quotations=str(num_quotations),
                    )
                    for sense in ranking[0:3]:
                        sense_num = sense.sense_number() or "null"
                        thes_links = "|".join(sense.thesaurus_nodes())
                        sense_node = etree.SubElement(
                            s1_node,
                            "sense",
                            refid=sense.node_id(),
                            number=sense_num,
                            quotations=str(sense.qcount)
                        )
                        if sense.marked:
                            sense_node.set("marked", "true")
                        sense_node.text = sense.definition(length=100)
                        if thes_links:
                            sense_node.set("thesaurus", thes_links)
        with open(os.path.join(out_dir, letter + ".xml"), "w") as filehandle:
            filehandle.write(etree.tounicode(doc, pretty_print=True))

def _load_oed_entries():
    iterator = EntryIterator(dictType='oed', verbosity='low')
    oed_entries = defaultdict(list)
    for entry in iterator.iterate():
        headword = entry.lemma_manager().asciified()
        if (re.search(r'^[A-Z][a-z]', headword) and
                LINK_MANAGERS['ode'].translate_id(entry.id) is None):
            oed_entries[headword].append(OedData(entry.id,
                                                 entry.headword,
                                                 entry.definition(length=100)))
    return oed_entries

def _process_entries(self, file_filter):
    iterator = EntryIterator(dictType='oed',
                             fixLigatures=True,
                             verbosity='low',
                             fileFilter=file_filter)
    for entry in iterator.iterate():
        self.current_entry = entry
        for s1 in entry.s1blocks():
            s1.share_quotations()
            for i, s in enumerate(s1.senses()):
                self._process_sense(s, i, len(s1.senses()))
        for s in entry.lemsect_senses():
            self._process_sense(s, 5, 10)
        for s in entry.revsect_senses():
            self._process_sense(s, 5, 10)

def store_features_by_sense(self):
    """
    Iterate through each sense (both training and new data), parsing and
    storing the set of features that will be used by the Bayes classifier.

    This store is later used both to build the classifiers (picking out
    the training senses only) and as a cache of data for classifying
    the new senses.

    Features include:
     - definition keywords
     - quotation keywords
     - author names or titles from quotations
     - keywords from titles
     - subject labels
     - usage labels
     - first date
     - wordclass

    Lemma words (components derived by decomposing the lemma) are also
    collected here. Lemma words are not used directly for the Bayes
    classifier itself (they're folded into the set of definition keywords),
    but are used separately to help classify compounds. It's just more
    efficient to parse them along with everything else as part of this
    process.
    """
    sense_parser = SenseParser(self.parent_dir, self.subject_map_file)
    for letter in string.ascii_uppercase:
        file_filter = 'oed_%s.xml' % letter
        ei = EntryIterator(dictType='oed',
                           fixLigatures=True,
                           verbosity='low',
                           fileFilter=file_filter)
        outfile = os.path.join(self.senses_dir, letter)
        with open(outfile, 'wb') as filehandle:
            for entry in ei.iterate():
                entry.share_quotations()
                etyma = entry.etymology().etyma()
                for sense in entry.senses():
                    sense_data_object = sense_parser.parse_sense(
                        sense, etyma, entry.id)
                    pickle.dump(sense_data_object, filehandle)

def process(self):
    for letter in string.ascii_lowercase:
        _clear_dir(self.out_dir, letter)
        frequencies, subfrequencies = _load_frequency_data(
            letter, self.include_subentries)
        print('Listing frequencies for entries in %s...' % letter)
        file_filter = 'oed_%s.xml' % letter.upper()
        iterator = EntryIterator(dictType='oed',
                                 fixLigatures=True,
                                 fileFilter=file_filter,
                                 verbosity=None)
        self.filecount = 0
        previous = None
        self.initialize_doc()
        for e in iterator.iterate():
            sortcode = e.lemma_manager().lexical_sort()
            if e.id in frequencies:
                frequency_blocks = frequencies[e.id]
            else:
                frequency_blocks = []
            enode = _construct_node(e, 'entry', e.id, 0, e.label(),
                                    e.label(), frequency_blocks, self.terse)
            self.doc.append(enode)
            if self.include_subentries:
                for sense in e.senses():
                    sig = (e.id, sense.node_id())
                    if sig in subfrequencies:
                        frequency_blocks = subfrequencies[sig]
                        subnode = _construct_node(sense, 'subentry', e.id,
                                                  sense.node_id(), sense.lemma,
                                                  e.label(), frequency_blocks,
                                                  self.terse)
                        self.doc.append(subnode)
            if self.buffersize() >= MAX_BUFFER and sortcode != previous:
                self.write_buffer(letter)
                self.initialize_doc()
            previous = sortcode
        self.write_buffer(letter)

def store_content(content_dir):
    ThesInstance.__table__.drop(DB_ENGINE, checkfirst=True)
    ThesInstance.__table__.create(DB_ENGINE, checkfirst=True)

    # Store the lemmas for each thesaurus instance (using
    # refentry+refid+classid as the identifier)
    lemmas = {}  # = _cache_thesaurus_lemmas(content_dir)

    from lex.entryiterator import EntryIterator
    iterator = EntryIterator(dictType='oed',
                             fixLigatures=True,
                             verbosity='low')
    buffer_size = 0
    for entry in iterator.iterate():
        entry.check_revised_status()
        for block in entry.s1blocks():
            block.share_quotations()
            entry_size = block.weighted_size()
            senses = [s for s in block.senses() if not s.is_xref_sense()]
            senses.sort(key=_sortable_date)
            for i, s in enumerate(senses):
                records = _prepare_records(s, entry.id, entry.node_id(),
                                           lemmas, i + 1, entry_size,)
                for r in records:
                    DB_SESSION.add(r)
                    buffer_size += 1
        for s in [s for s in entry.senses()
                  if not s.is_in_sensesect() and not s.is_xref_sense()]:
            records = _prepare_records(s, entry.id, entry.node_id(),
                                       lemmas, 5, 1.0,)
            for r in records:
                DB_SESSION.add(r)
                buffer_size += 1
        if buffer_size > 1000:
            DB_SESSION.commit()
            buffer_size = 0
    DB_SESSION.commit()

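# store_content() batches DB_SESSION.add() calls and commits roughly every
# 1000 records to keep the pending transaction small. The same pattern in
# isolation, as a generic sketch (the session and record objects are whatever
# SQLAlchemy session/model the project uses; nothing here is specific to it):
def add_in_batches(session, records, batch_size=1000):
    """Add records to a SQLAlchemy session, committing every batch_size."""
    pending = 0
    for record in records:
        session.add(record)
        pending += 1
        if pending >= batch_size:
            session.commit()
            pending = 0
    # Commit whatever is left in the final partial batch
    session.commit()
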
def list_variants(self):
    """
    Main process for iterating through OED and writing output XML documents.

    >>> VariantsLister(in_dir, out_dir).list_variants()
    """
    self._clear_outdir()
    self._initialize_root()
    iterator = EntryIterator(path=self.in_dir,
                             dictType='oed',
                             verbosity='low',
                             # fileFilter='oed_[K].xml',
                             fixLigatures=True)
    for entry in iterator.iterate():
        self.entry = entry
        self._process_entry()
        if self.buffersize >= FILE_SIZE:
            self._writebuffer()
            self._initialize_root()
    # Write a file for anything still left in the buffer after the
    # entry iterator has completed
    self._writebuffer()

def compile_features():
    level4_nodes = load_level4_nodes()

    elements = {}
    for feature in FEATURES:
        directory = os.path.join(FEATURESET_ROOT, feature)
        elements_list = os.path.join(directory, 'elements.csv')
        elements[feature] = load_element_list(elements_list)

    matches = {feature: {} for feature in FEATURES}
    for feature in FEATURES:
        for element in elements[feature]:
            matches[feature][element.id] = []

    salient_senses = []
    sense_id = 0
    iterator = EntryIterator(dict_type='oed',
                             fix_ligatures=True,
                             #file_filter='oed_[R].xml',
                             verbosity='low')
    for entry in iterator.iterate():
        generic_language_matches = _test_languages(entry, elements['language'])
        entry.share_quotations()
        # Number all the senses (so that we know which is the first sense,
        # which we need for the language feature), then pull out just
        # those that have thesaurus links
        senses = entry.senses()
        for i, sense in enumerate(senses):
            sense.count = i
        senses = [s for s in senses if s.thesaurus_categories()]
        for sense in senses:
            thes_nodes = sense.thesaurus_nodes().intersection(level4_nodes)
            if not thes_nodes:
                continue
            local_matches = dict()
            local_matches['author'] = _test_authors(sense, elements['author'])
            local_matches['compound'] = _test_compounds(sense, entry,
                                                        elements['compound'])
            local_matches['language'] = {}
            if (local_matches['author'] or
                    local_matches['compound'] or
                    (generic_language_matches and sense.count == 0)):
                sense_id += 1
                date = sense.date().start
                if sense.count == 0:
                    local_matches['language'] = {id: date for id in
                                                 generic_language_matches}
                local_matches['compound'] = {id: date for id in
                                             local_matches['compound']}
                for feature in FEATURES:
                    for element_id, date in local_matches[feature].items():
                        for node in thes_nodes:
                            matches[feature][element_id].append(
                                (sense_id, date, node))
                salient_senses.append((sense_id, entry.id,
                                       sense.lexid(), sense.lemma))

    # Print a csv file for each element in each featureset; the file
    # is a list of all the matching senses
    for feature, element_matches in matches.items():
        for element_id, localmatches in element_matches.items():
            out_file = os.path.join(FEATURESET_ROOT, feature, 'raw',
                                    '%d.csv' % element_id)
            with open(out_file, 'w') as csvfile:
                csvwriter = csv.writer(csvfile)
                for match in localmatches:
                    csvwriter.writerow(match)

    # Print list of all the salient senses (those which are linked
    # from at least one match in one of the featuresets)
    with open(SENSES_FILE, 'w') as csvfile:
        csvwriter = csv.writer(csvfile)
        for sense in salient_senses:
            csvwriter.writerow(sense)

def build_weighted_size_index():
    for letter in string.ascii_uppercase:
        iterator = EntryIterator(dict_type='oed',
                                 file_filter='oed_%s.xml' % letter,
                                 verbosity='low',
                                 fix_ligatures=True)
        entries = []
        for entry in iterator.iterate():
            blocks = []
            for block in entry.s1blocks():
                if (block.primary_wordclass() and
                        block.primary_wordclass().penn):
                    wordclass = block.primary_wordclass().penn
                else:
                    wordclass = '?'
                if len(entry.s1blocks()) == 1:
                    # If there's only one <s1> block, it's effectively
                    # equivalent to the parent entry. So we make a dummy
                    # entry, and later let it inherit from the parent entry.
                    block_data = EntryData(int(entry.id),
                                           int(block.node_id()),
                                           wordclass,
                                           0,
                                           [],
                                           0,
                                           entry.is_revised,
                                           True,)
                else:
                    block_sizes = [(d, block.weighted_size(
                        revised=entry.is_revised,
                        disregard_obsolete=True,
                        currentYear=d)) for d in DATES]
                    block_sizes = [(d, round(n, 2)) for d, n in block_sizes]
                    block_data = EntryData(int(entry.id),
                                           int(block.node_id()),
                                           wordclass,
                                           block.num_quotations(),
                                           block_sizes,
                                           block.date().start,
                                           entry.is_revised,
                                           False,)
                blocks.append(block_data)

            try:
                entry_wordclass = blocks[0].wordclass
            except IndexError:
                entry_wordclass = '?'
            sizes = [(d, entry.weighted_size(revised=entry.is_revised,
                                             disregard_obsolete=True,
                                             currentYear=d)) for d in DATES]
            sizes = [(d, round(n, 2)) for d, n in sizes]
            num_quotations = entry.num_quotations(force_recount=True,
                                                  include_derivatives=False)
            entry_data = EntryData(int(entry.id),
                                   0,
                                   entry_wordclass,
                                   num_quotations,
                                   sizes,
                                   entry.date().start,
                                   entry.is_revised,
                                   False,)

            if len(blocks) > 1:
                # Adjust block sizes to fit the entry size. We only need
                # bother if there's more than one block; if there's only
                # one block, it'll be inheriting from the entry anyway.
                blocks = _adjust_block_sizes(blocks, entry_data)

            entries.append(entry_data)
            entries.extend(blocks)

        out_file = os.path.join(PICKLE_DIR, letter)
        with open(out_file, 'wb') as filehandle:
            for entry in entries:
                pickle.dump(entry, filehandle)

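# EntryData is constructed positionally above, and its definition is not shown
# here. Judging from the call sites it is presumably a namedtuple along these
# lines; the field names below are guesses based on the values passed in, not
# the project's actual definition:
from collections import namedtuple

EntryData = namedtuple('EntryData', [
    'refentry',     # entry ID (int)
    'refid',        # block node ID, or 0 when the record describes the entry
    'wordclass',    # Penn wordclass, or '?' when none is available
    'quotations',   # quotation count
    'sizes',        # list of (date, weighted size) pairs
    'start',        # first date
    'is_revised',   # whether the entry has been revised
    'inherits',     # True for a single-block dummy that inherits from its entry
])
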
def collect(self):
    # Initialize the buffers where quotations will be stored
    self.buffers = {}
    for year in range(DATE_MIN, DATE_MAX + 1):
        decade = (year // 10) * 10
        self.buffers[decade] = []

    iterator = EntryIterator(dict_type='oed', verbosity='low')
    for entry in iterator.iterate():
        # Skip entries which may be obscenities
        if any([token in entry.lemma for token in OBSCENITIES]):
            continue
        for sense in entry.senses():
            # Skip main senses of very large entries (subentries and
            # subentry-like senses are kept regardless of entry size)
            if (not (sense.is_subentry() or sense.is_subentry_like()) and
                    entry.num_quotations() > ENTRY_MAX_SIZE):
                continue
            if sense.is_subentry() or sense.is_subentry_like():
                lemma = sense.lemma
            else:
                lemma = entry.lemma
            lemma = lemma[0:LEMMA_LENGTH_MAX]
            for quotation in sense.quotations(strip_suppressed=True):
                if (quotation.is_textless() or
                        quotation.is_bracketed() or
                        quotation.is_suppressed() or
                        quotation.year < DATE_MIN or
                        quotation.year > DATE_MAX or
                        quotation.citation.date_qualifier or
                        not quotation.citation.author() or
                        quotation.citation.is_glossary() or
                        quotation.text.comments() or
                        quotation.is_modernized_text() or
                        quotation.is_electronic_text() or
                        quotation.is_title_quotation()):
                    continue
                if quotation.text.node.findall('.//i'):
                    continue
                if quotation.citation.edition:
                    status = 30
                elif quotation.citation.bibsub is not None:
                    status = 20
                else:
                    status = 10
                if quotation.citation.publication_datestring:
                    status += 10
                if quotation.citation.is_translation():
                    status += 10
                text = quotation.text.plaintext
                text_lower = text.lower()
                if (len(text) < TEXT_LENGTH_MIN or
                        len(text) > TEXT_LENGTH_MAX):
                    continue
                if any([token in text_lower for token in OBSCENITIES]):
                    continue
                citation = quotation.citation.html_lite
                if len(citation) > CITATION_LENGTH_MAX:
                    continue
                decade = (quotation.year // 10) * 10
                row = [quotation.year, citation, text, status, lemma,
                       entry.id, sense.node_id(),]
                self.buffers[decade].append(row)
                if len(self.buffers[decade]) > BUFFER_SIZE:
                    self._flush_buffer(decade)

    # Flush anything left in the buffers
    for decade in self.buffers:
        self._flush_buffer(decade)

def setUp(self):
    iterator = EntryIterator(path=FIXTURE_DIR,
                             dictType='oed',
                             verbosity=None,
                             fixLigatures=True,)
    self.entries = {int(e.id): e for e in iterator.iterate()}

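# With the fixture entries keyed by integer ID in setUp(), individual test
# methods on the same TestCase can look entries up directly. A sketch only:
# the ID and expected headword below are placeholders, not values taken from
# the real fixture files.
def test_headword(self):
    entry = self.entries[12345]                   # hypothetical fixture ID
    self.assertEqual(entry.headword, 'example')   # hypothetical headword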