def lemma_as_superordinate(sense):
    """
    Find a thesaurus class for a sense by treating its own lemma as a
    superordinate term.

    Only noun senses ('NN') whose lemma is longer than 8 characters are
    considered. The lookup is attempted when either:
     - the sense is a subentry whose lemma differs (case-insensitively)
       from its entry lemma; or
     - the sense is the only sense in its entry (or entry block) and the
       database holds exactly one instance of the lemma, namely this
       sense's own node.

    Returns the matched thesaurus class, with its reason_text and
    reason_code attributes set, or None if nothing suitable is found.
    """
    if sense.wordclass != 'NN' or len(sense.lemma) <= 8:
        return None

    if (sense.is_subentry and
            sense.lemma.lower() != sense.entry_lemma.lower()):
        # Subentries are taken on trust, without consulting the database.
        can_risk = True
    elif sense.senses_in_entry == 1:
        # A main sense has to be the sole sense of its entry, and also
        # the sole instance of the lemma recorded in the database.
        hits = tdb.search(lemma=sense.lemma)
        can_risk = len(hits) == 1 and hits[0].refid == sense.node_id
    else:
        can_risk = False
    if not can_risk:
        return None

    # Superordinate records are queried by the lower-cased lemma with
    # hyphens and spaces stripped out.
    key = sense.lemma.lower().replace('-', '').replace(' ', '')
    record = tdb.get_superordinate_record(key)
    if record is None:
        return None

    if sense.bayes.branches():
        # Prefer the first record branch sharing a level-2 ancestor
        # with the Bayes classification.
        shared = set.intersection(
            sense.bayes.ancestors(level=2),
            {branch.thesclass.ancestor(level=2)
             for branch in record.branches},
        )
        chosen = next(
            (branch.thesclass for branch in record.branches
             if branch.thesclass.ancestor(level=2) in shared),
            None,
        )
    else:
        # Nothing from Bayes to compare against, so take the record's
        # first branch.
        chosen = record.branches[0].thesclass

    if chosen is None:
        return None
    chosen.reason_text = 'Lemma appears elsewhere as a superordinate'
    chosen.reason_code = 'lass'
    return chosen
def superordinate_lookup(self, sense, panic=False):
    """
    Classify a sense by looking up how other senses with the same
    superordinate have been classified.

    Branches are collected from the superordinate records for the
    sense's full superordinate and (when different) its short
    superordinate, then narrowed down:
     - in panic mode, to branches with probability above 0.4;
     - otherwise, to branches whose ancestor ids overlap the sense's
       cross-reference branches, falling back (when Bayes confidence
       is at least 4) to overlap with the Bayes ids, and then to
       overlap with the level-3 ancestors of the Bayes branches.

    Returns the winning thesaurus class, or None if no branch survives.
    """
    terms = [sense.superordinate_full]
    if sense.superordinate != sense.superordinate_full:
        terms.append(sense.superordinate)

    # Pool the branches for every available superordinate, keeping only
    # the first branch seen for each thesaurus class id.
    pool = []
    known_ids = set()
    for term in terms:
        if term is None:
            continue
        record = tdb.get_superordinate_record(
            term.replace('-', '').replace(' ', ''))
        if record is None:
            continue
        for branch in record.branches:
            class_id = branch.thesclass.id
            if class_id not in known_ids:
                pool.append(branch)
                known_ids.add(class_id)

    if not pool:
        return None

    def overlapping(node_ids):
        # Branches having at least one ancestor id among node_ids.
        return [branch for branch in pool
                if set.intersection(branch.thesclass.ancestor_ids(),
                                    node_ids)]

    if panic:
        shortlist = [branch for branch in pool if branch.probability > 0.4]
    else:
        shortlist = overlapping(set(sense.xref_branches))
        if not shortlist and sense.bayes.confidence() >= 4:
            shortlist = overlapping(set(sense.bayes.ids()))
        if not shortlist and sense.bayes.confidence() >= 4:
            # Retry with the Bayes classifications, this time using
            # only their level-3 parents.
            level3_ids = {
                node.ancestor(level=3).id
                for node in sense.bayes.branches()
                if node.ancestor(level=3) is not None
            }
            shortlist = overlapping(level3_ids)

    if not shortlist:
        return None

    # Prefer branches below wordclass level; failing that, accept those
    # above wordclass level. The most probable branch wins.
    below_wordclass = [branch for branch in shortlist
                       if branch.thesclass.wordclass is not None]
    ranked = sorted(below_wordclass or shortlist,
                    key=lambda branch: branch.probability,
                    reverse=True)
    winner = ranked[0].thesclass

    # For a compound, try to get more specific by finding an instance of
    # the last element within the winning branch. (Fairly unlikely,
    # since most of these should already have been picked off by the
    # compound classifiers.)
    if sense.last_element() is not None:
        narrower = tdb.highest_ranked(lemma=sense.last_element(),
                                      wordclass=sense.wordclass,
                                      branches=[winner.id])
        if narrower is not None and narrower.thesclass is not None:
            winner = narrower.thesclass
    return winner