def store_content():
    """Clear and repopulate the ThesaurusInstance table from the content files."""
    ThesaurusInstance.objects.all().delete()
    ci = ContentIterator(in_dir=IN_DIR, fixLigatures=True, verbosity="low")
    records = []
    for thesclass in ci.iterate():
        for instance in thesclass.instances():
            # Pick up any inflection list stored on the instance node
            inf_node = instance.node.find("./infl")
            if inf_node is not None:
                inflections = inf_node.text or None
            else:
                inflections = None
            record = ThesaurusInstance(
                lemma=instance.lemma(),
                refentry=instance.refentry(),
                refid=instance.refid(),
                start_year=instance.start_date(),
                end_year=instance.end_date(),
                thesclass_id=thesclass.id(),
                inflections=inflections,
            )
            records.append(record)
            # Write to the database in batches of ~1000 to bound memory use
            if len(records) > 1000:
                ThesaurusInstance.objects.bulk_create(records)
                records = []
    # Flush any remaining records
    ThesaurusInstance.objects.bulk_create(records)
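

# A minimal sketch of the Django model assumed by store_content() above. This
# is a hypothetical reconstruction: the field names follow the keyword
# arguments used when building records, but the types, lengths, and null
# settings are guesses, not the project's actual models module.
from django.db import models

class ThesaurusInstance(models.Model):
    lemma = models.CharField(max_length=100)
    refentry = models.IntegerField(db_index=True)
    refid = models.IntegerField(db_index=True)
    start_year = models.IntegerField(null=True)
    end_year = models.IntegerField(null=True)
    thesclass = models.ForeignKey('ThesaurusClass', on_delete=models.CASCADE)
    inflections = models.CharField(max_length=100, null=True, blank=True)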


def store_taxonomy():
    """Clear and repopulate the ThesaurusClass table (instances must be
    cleared first, since they reference classes)."""
    ThesaurusInstance.objects.all().delete()
    ThesaurusClass.objects.all().delete()
    ci = ContentIterator(in_dir=IN_DIR, fixLigatures=True, verbosity="low")
    # Debug check: report anything in or directly below class 1630
    for thesclass in ci.iterate():
        if thesclass.id() == 1630 or thesclass.parent() == 1630:
            print(thesclass.id(), thesclass.label(), thesclass.wordclass(penn=True))
    # Sizes as found in the content files, keyed by class ID
    valid_ids = {thesclass.id(): thesclass.size() for thesclass in ci.iterate()}
    tree_manager = TaxonomyManager(lazy=True, verbosity=None)
    # Work level by level so that parents are stored before their children
    for level in range(1, 20):
        classes = [c for c in tree_manager.classes
                   if c.level() == level and c.id() in valid_ids]
        stdout.write("%d\t%d\n" % (level, len(classes)))
        records = []
        for thesclass in classes:
            revised_size = valid_ids[thesclass.id()]
            if thesclass.label():
                label = thesclass.label()[0:LABEL_LENGTH]
            else:
                label = None
            record = ThesaurusClass(
                id=thesclass.id(),
                label=label,
                wordclass=thesclass.wordclass(penn=True),
                level=thesclass.level(),
                parent_id=thesclass.parent(),
                node_size=revised_size,
                branch_size=thesclass.size(branch=True),
            )
            records.append(record)
            if len(records) > 1000:
                ThesaurusClass.objects.bulk_create(records)
                records = []
        ThesaurusClass.objects.bulk_create(records)
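

# A companion sketch for store_taxonomy(), equally hypothetical: the explicit
# id and parent_id assignments above imply a manually assigned primary key and
# a self-referential parent link; the other column types are inferred.
class ThesaurusClass(models.Model):
    id = models.IntegerField(primary_key=True)
    label = models.CharField(max_length=LABEL_LENGTH, null=True)
    wordclass = models.CharField(max_length=20, null=True)
    level = models.IntegerField(db_index=True)
    parent = models.ForeignKey('self', null=True, on_delete=models.CASCADE)
    node_size = models.IntegerField()
    branch_size = models.IntegerField()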


def make_lean_ht():
    iterator = ContentIterator(out_dir=OUT_DIR, yield_mode='file')
    for classes in iterator.iterate():
        # Build a map of each class indexed by ID
        classmap = {thesclass.id(): thesclass for thesclass in classes}
        # Set of IDs marking classes which will be dropped
        dropped_classes = set()

        # Drop instances that represent minor senses
        for thesclass in classes:
            if thesclass.instances():
                wordclass = thesclass.wordclass(penn=True)
                stripnodes = []
                for instance in thesclass.instances():
                    minor_sense, minor_homograph = _test_status(instance, wordclass)
                    if minor_sense or minor_homograph:
                        stripnodes.append(instance.node)
                if stripnodes:
                    container = stripnodes[0].getparent()
                    for node in stripnodes:
                        container.remove(node)
                    # Reset the listed size of the class
                    new_size = thesclass.size() - len(stripnodes)
                    if thesclass.size() == thesclass.size(branch=True):
                        thesclass.reset_size(new_size, branch=True)
                    thesclass.reset_size(new_size)
                    if thesclass.size(branch=True) == 0:
                        dropped_classes.add(thesclass.id())

        # Roll up minor leaf nodes to the parent node
        for thesclass in [c for c in classes if c.id() not in dropped_classes]:
            thesclass.reload_instances()
            parentclass = classmap.get(thesclass.parent(), None)
            if _viable_for_rollup(thesclass, parentclass):
                # Move instances from this class to the parent class
                for instance in thesclass.instances():
                    parentclass.node.append(instance.node)
                # Mark this class to be dropped
                dropped_classes.add(thesclass.id())
                print('-----------------------------------------')
                print(thesclass.id(), thesclass.breadcrumb())
                print('->', parentclass.id(), parentclass.breadcrumb())

        # Remove child-node pointers for nodes which are about to be deleted
        for thesclass in [c for c in classes if c.id() not in dropped_classes]:
            for child_id in thesclass.child_nodes():
                if child_id in dropped_classes:
                    thesclass.remove_child(child_id)

        # Remove nodes for classes marked to be dropped
        for classid in dropped_classes:
            thesclass = classmap[classid]
            thesclass.node.getparent().remove(thesclass.node)

        # Redo counts in the remaining classes
        for thesclass in [c for c in classes if c.id() not in dropped_classes]:
            thesclass.reload_instances()
            thesclass.reset_size(len(thesclass.instances()))
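

# _viable_for_rollup() is not defined in this module. As a rough sketch of
# what it might test -- assuming a class is only rolled up when it is a small
# leaf whose wordclass matches its parent's; the threshold of 3 is invented:
def _viable_for_rollup(thesclass, parentclass):
    if parentclass is None:
        return False
    if thesclass.child_nodes():
        # Only leaf classes are candidates for rollup
        return False
    if thesclass.wordclass(penn=True) != parentclass.wordclass(penn=True):
        return False
    return thesclass.size(branch=True) <= 3  # hypothetical cutoff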


def inflect_ht():
    """Add an <infl> element to each instance whose wordclass supports inflection."""
    iterator = ContentIterator(in_dir=IN_DIR, out_dir=OUT_DIR, yield_mode='file')
    for classes in iterator.iterate():
        for thesclass in classes:
            wordclass = thesclass.wordclass(penn=True)
            if wordclass in MAPPINGS:
                for instance in thesclass.instances():
                    inflections = _get_inflections(instance.lemma(), wordclass)
                    if inflections:
                        inf_node = etree.SubElement(instance.node, 'infl')
                        inf_node.text = inflections
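

# _get_inflections() is defined elsewhere, presumably driven by MAPPINGS. A
# naive illustrative sketch covering regular nouns and verbs only; the rule
# set, the return format, and the '|' separator are all assumptions:
def _get_inflections(lemma, wordclass):
    if wordclass == 'NN':
        if lemma.endswith(('s', 'x', 'z', 'ch', 'sh')):
            return lemma + 'es'
        return lemma + 's'
    if wordclass == 'VB':
        return '|'.join((lemma + 's', lemma + 'ing', lemma + 'ed'))
    return None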


def _cache_thesaurus_lemmas(content_dir):
    lemmas = {}
    ci = ContentIterator(path=content_dir, fixLigatures=True, verbosity='low')
    for c in ci.iterate():
        if c.instances():
            if c.wordclass() is not None and c.wordclass() in WORDCLASS_MAP:
                wordclass = WORDCLASS_MAP[c.wordclass()]
            else:
                wordclass = None
            for i in c.instances():
                identifier = '%d_%d_%d' % (int(i.refentry()), int(i.refid()), int(c.id()))
                if identifier not in lemmas:
                    lemmas[identifier] = (i.lemma(), wordclass)
    return lemmas
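

# Example lookup (the path and IDs below are placeholders): each key packs
# refentry, refid, and class ID, so linking code can resolve an instance to
# its lemma and wordclass with a single dict access.
lemmas = _cache_thesaurus_lemmas('/path/to/thesaurus/content')
lemma, wordclass = lemmas.get('50001234_12345678_1630', (None, None))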


def store_content():
    ThesInstance.__table__.drop(DB_ENGINE, checkfirst=True)
    ThesInstance.__table__.create(DB_ENGINE, checkfirst=True)
    ci = ContentIterator(path=IN_DIR, fixLigatures=True, verbosity="low")
    buffer_size = 0
    for thesclass in ci.iterate():
        for instance in thesclass.instances():
            record_data = {
                "lemma": instance.lemma(),
                "refentry": instance.refentry(),
                "refid": instance.refid(),
                "start_year": instance.start_date(),
                "end_year": instance.end_date(),
                "class_id": thesclass.id(),
            }
            DB_SESSION.add(ThesInstance(record_data))
            buffer_size += 1
            if buffer_size > 1000:
                DB_SESSION.commit()
                buffer_size = 0
    DB_SESSION.commit()
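

# A sketch of the SQLAlchemy model assumed by this version of store_content().
# Hypothetical throughout: the column types are inferred from record_data, and
# the dict-accepting __init__ is a guess at why ThesInstance(record_data) works.
from sqlalchemy import Column, ForeignKey, Integer, String
from sqlalchemy.orm import declarative_base

Base = declarative_base()

class ThesInstance(Base):
    __tablename__ = 'thesinstance'
    id = Column(Integer, primary_key=True)
    lemma = Column(String(100))
    refentry = Column(Integer, index=True)
    refid = Column(Integer, index=True)
    start_year = Column(Integer)
    end_year = Column(Integer)
    class_id = Column(Integer, ForeignKey('thesclass.id'))

    def __init__(self, record_data):
        # Accept the plain dict built in store_content()
        for key, value in record_data.items():
            setattr(self, key, value)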


def store_taxonomy():
    ThesInstance.__table__.drop(DB_ENGINE, checkfirst=True)
    ThesClass.__table__.drop(DB_ENGINE, checkfirst=True)
    ThesClass.__table__.create(DB_ENGINE, checkfirst=True)
    ci = ContentIterator(path=IN_DIR, fixLigatures=True, verbosity="low")
    valid_ids = {thesclass.id(): thesclass.size() for thesclass in ci.iterate()}
    tree_manager = TaxonomyManager(lazy=True, verbosity=None)
    for level in range(1, 20):
        classes = [c for c in tree_manager.classes
                   if c.level() == level and c.id() in valid_ids]
        print(level, len(classes))
        buffer_size = 0
        for thesaurus_class in classes:
            revised_size = valid_ids[thesaurus_class.id()]
            record = ThesClass(thesaurus_class, size=revised_size)
            DB_SESSION.add(record)
            buffer_size += 1
            if buffer_size > 1000:
                DB_SESSION.commit()
                buffer_size = 0
    DB_SESSION.commit()
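

# Likewise for ThesClass: the call ThesClass(thesaurus_class, size=...) above
# suggests a constructor that copies its fields from a taxonomy-class object,
# with an overridable node size. A hypothetical reconstruction, reusing the
# imports and Base from the ThesInstance sketch:
class ThesClass(Base):
    __tablename__ = 'thesclass'
    id = Column(Integer, primary_key=True)
    label = Column(String(100))
    wordclass = Column(String(20))
    level = Column(Integer)
    parent_id = Column(Integer)
    node_size = Column(Integer)
    branch_size = Column(Integer)

    def __init__(self, thesaurus_class, size=None):
        self.id = thesaurus_class.id()
        self.label = thesaurus_class.label()
        self.wordclass = thesaurus_class.wordclass(penn=True)
        self.level = thesaurus_class.level()
        self.parent_id = thesaurus_class.parent()
        self.node_size = size if size is not None else thesaurus_class.size()
        self.branch_size = thesaurus_class.size(branch=True)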


def recheck_counts():
    # Figure out the node sizes of all the individual classes
    node_sizes = defaultdict(int)
    iterator = ContentIterator(in_dir=CONTENT_DIR)
    for thesclass in iterator.iterate():
        node_sizes[thesclass.id()] = len(thesclass.instances())

    # Work up from the deepest level, so that each branch size is accumulated
    # onto its parent before the parent itself is processed
    branch_sizes = {}
    cumulate = defaultdict(int)
    tree_manager = TaxonomyManager(dir=TAX_DIR, lazy=True, verbosity=None)
    levels = list(reversed(range(1, 20)))
    for level in levels:
        classes = [c for c in tree_manager.classes if c.level() == level]
        print(level, len(classes))
        for thesclass in classes:
            branch_sizes[thesclass.id()] = (cumulate[thesclass.id()]
                                            + node_sizes[thesclass.id()])
        for thesclass in classes:
            cumulate[thesclass.parent()] += branch_sizes[thesclass.id()]

    # Write the corrected counts back into the content files
    iterator = ContentIterator(in_dir=CONTENT_DIR, out_dir=CONTENT_DIR_TMP)
    for thesclass in iterator.iterate():
        thesclass.node.set('numInstancesDirect', str(node_sizes[thesclass.id()]))
        thesclass.node.set('numInstancesDescendant', str(branch_sizes[thesclass.id()]))

    # ...and into the taxonomy files, by patching each <class> opening tag
    for in_file in os.listdir(TAX_DIR):
        lines = []
        with open(os.path.join(TAX_DIR, in_file)) as filehandle:
            for line in filehandle:
                m = re.search(r'^[ \t]+<class id="(\d+)"', line)
                if m:
                    class_id = int(m.group(1))
                    additions = ATTSTRING % (node_sizes[class_id], branch_sizes[class_id])
                    line = re.sub('>', additions, line, count=1)
                lines.append(line)
        with open(os.path.join(TAX_DIR_TMP, in_file), 'w') as filehandle:
            filehandle.writelines(lines)
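

# ATTSTRING is defined elsewhere. Given that it replaces the first '>' of each
# <class> opening tag, it presumably looks something like this (hypothetical):
ATTSTRING = ' numInstancesDirect="%d" numInstancesDescendant="%d">'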


def insert_child_nodes():
    """
    Copy child nodes from the taxonomy version of the data,
    and insert into the content version
    """
    tree_manager = TaxonomyManager(dir=TAX_DIR, lazy=True, verbosity=None)
    childmap = defaultdict(list)
    for thesclass in tree_manager.classes:
        if thesclass.parent():
            childmap[thesclass.parent()].append(thesclass)

    iterator = ContentIterator(in_dir=CONTENT_DIR, out_dir=CONTENT_DIR_TMP)
    for thesclass in iterator.iterate():
        if thesclass.id() in childmap:
            cn_node = etree.Element("childNodes")
            for child in childmap[thesclass.id()]:
                n = etree.SubElement(cn_node, "node")
                n.set("idref", str(child.id()))
                n.set("numInstancesDescendant", str(child.size(branch=True)))
                if child.label():
                    n.text = child.label()
                if child.is_wordclass_level():
                    n.set("pos", child.wordclass())
            thesclass.node.append(cn_node)
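

# For illustration, a class with two children ends up with a block like the
# following appended to its node (IDs, counts, and labels invented):
#
#   <childNodes>
#     <node idref="1631" numInstancesDescendant="42">mammal</node>
#     <node idref="1632" numInstancesDescendant="7" pos="noun">mammal (noun)</node>
#   </childNodes>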


def compile_iteration(in_dir, out_dir, **kwargs):
    sanitize = kwargs.get('sanitize', False)
    drop_instances = kwargs.get('drop_instances', False)
    deduplicate = kwargs.get('deduplicate', False)

    iterator = ContentIterator(in_dir=in_dir, out_dir=out_dir, yield_mode='file')
    for classes in iterator.iterate():
        # Build a map of each class indexed by ID
        classmap = {thesclass.id(): thesclass for thesclass in classes}
        # Set of IDs marking classes which will be dropped
        dropped_classes = set()

        # Drop instances that are not usable
        if drop_instances:
            for thesclass in classes:
                if thesclass.instances():
                    stripnodes = [instance for instance in thesclass.instances()
                                  if _is_not_usable(instance)]
                    if stripnodes:
                        for instance in stripnodes:
                            instance.selfdestruct()
                        # Reset the listed size of the class
                        new_size = thesclass.size() - len(stripnodes)
                        if thesclass.size() == thesclass.size(branch=True):
                            thesclass.reset_size(new_size, branch=True)
                        thesclass.reset_size(new_size)
                        if thesclass.size(branch=True) == 0:
                            dropped_classes.add(thesclass.id())

        # Roll up minor leaf nodes to the parent node
        for thesclass in [c for c in classes if c.id() not in dropped_classes]:
            thesclass.reload_instances()
            parentclass = classmap.get(thesclass.parent(), None)
            if parentclass:
                grandparentclass = classmap.get(parentclass.parent(), None)
            else:
                grandparentclass = None
            if _viable_for_rollup(thesclass, parentclass, grandparentclass):
                # Move instances from this class to the parent class
                for instance in thesclass.instances():
                    parentclass.node.append(instance.node)
                # Mark this class to be dropped
                dropped_classes.add(thesclass.id())

        # Remove child-node pointers for nodes which are about to be deleted
        for thesclass in [c for c in classes if c.id() not in dropped_classes]:
            for child_id in thesclass.child_nodes():
                if child_id in dropped_classes:
                    thesclass.remove_child(child_id)

        # Remove nodes for classes marked to be dropped
        for classid in dropped_classes:
            classmap[classid].selfdestruct()

        # Redo counts in the remaining classes
        for thesclass in [c for c in classes if c.id() not in dropped_classes]:
            thesclass.reload_instances()
            thesclass.reset_size(len(thesclass.instances()))
            if sanitize:
                for instance in thesclass.instances():
                    _sanitize_lemma(instance, thesclass.wordclass(penn=True))
            if deduplicate:
                _deduplicate_instances(thesclass)
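

# _deduplicate_instances() is not shown in this module. A minimal sketch,
# assuming two instances count as duplicates when they share the same
# refentry and refid within a class:
def _deduplicate_instances(thesclass):
    seen = set()
    for instance in thesclass.instances():
        key = (instance.refentry(), instance.refid())
        if key in seen:
            instance.selfdestruct()  # drop the duplicate node
        else:
            seen.add(key)
    thesclass.reload_instances()
    thesclass.reset_size(len(thesclass.instances()))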