def infer_from_etyma(sense, subjectFilter=False):
    """
    Classify a sense by analogy with the word named in its etymology.

    An etymon is accepted in two situations:
     -- the etymology has exactly one element and its lemma is identical
        to the sense's own lemma;
     -- the etymology has exactly two elements, an alphabetic base
        followed by a hyphen-prefixed suffix, and the suffix is either
        neutral (according to deriv_tester) or is one of '-ist', '-ian',
        '-ful' while the sense is an adjective ('JJ').

    Arguments:
     -- sense: the sense being classified.
     -- subjectFilter: when no exact target sense is found, this selects
        the fallback. If false, main_sense_finder supplies the main sense
        of the etymon's entry; if true, the target is looked up using the
        sense's subjects and is only accepted if its entry_size is
        below 100.

    Returns the matched class with reason_code 'etym' and a reason_text
    set on it, or None if nothing suitable is found.
    """
    etyma = sense.etyma
    etymon = None
    if len(etyma) == 1:
        if etyma[0][0] == sense.lemma:
            etymon = etyma[0]
    elif (len(etyma) == 2 and
            re.search(r'^[a-zA-Z]+$', etyma[0][0]) and
            re.search(r'^-[a-z]+$', etyma[1][0])):
        suffix = etyma[1][0]
        adjectival = (suffix in ('-ist', '-ian', '-ful') and
                      sense.wordclass == 'JJ')
        if deriv_tester.is_neutral_suffix(suffix) or adjectival:
            etymon = etyma[0]

    if etymon is None:
        return None

    # The etymon may point to one specific sense (see e.g. lam n./3),
    # so look for an exact match first.
    target = tdb.highest_ranked(lemma=etymon[0],
                                refentry=etymon[1],
                                refid=etymon[2],
                                exact_sense=True)
    if target is None:
        # The etymon only points to an entry in general: use that
        # entry's main sense instead.
        if subjectFilter:
            candidate = tdb.highest_ranked(lemma=etymon[0],
                                           refentry=etymon[1],
                                           subjects=sense.subjects)
            if candidate is not None and candidate.entry_size < 100:
                target = candidate
        else:
            target = main_sense_finder.main_sense(lemma=etymon[0],
                                                  refentry=etymon[1])
    if target is None:
        return None

    # If the sense's cross-references also mention the target entry, they
    # may pin down a particular sense (as in 'nocturning'); prefer that.
    # NOTE(review): the loop is assumed to stop only once a specific sense
    # has actually been found - confirm against the original layout.
    for xr in sense.cross_references:
        if xr.lemma != target.lemma or xr.refentry != target.refentry:
            continue
        specific = tdb.highest_ranked(lemma=xr.lemma,
                                      refentry=xr.refentry,
                                      refid=xr.refid,
                                      exact_sense=True)
        if specific is not None:
            target = specific
            break

    if target.thesclass is None:
        return None

    if target.wordclass == sense.wordclass:
        match = target.thesclass.wordclass_parent()
    else:
        match = tdb.equivalent_class(target.thesclass, sense.wordclass)
    if match is None:
        return None

    match.reason_code = 'etym'
    match.reason_text = 'Analogy with "%s" in etymology' % etymon[0]
    return match
def main_sense(self, **kwargs):
    """
    Return the main sense for a lemma, or None if there is none.

    Keyword arguments:
     -- lemma: the lemma to look up.
     -- wordclass: optional. If omitted, it is taken from the
        highest-ranked instance of the lemma, or 'NN' if there is no
        such instance.
     -- refentry: optional entry ID. A sense found in the look-up tables
        is discarded if it belongs to a different entry.
     -- listed_only: if true, only the look-up tables are consulted and
        there is no fall-back to tdb's own ranking (defaults to False).
    """
    lemma = kwargs.get('lemma')
    wordclass = kwargs.get('wordclass')
    entry_id = kwargs.get('refentry')
    listed_only = kwargs.get('listed_only', False)

    # No explicit wordclass: derive it from the highest-ranked instance.
    naive_guess = None
    if wordclass is None:
        naive_guess = tdb.highest_ranked(lemma=lemma, refentry=entry_id)
        wordclass = 'NN' if naive_guess is None else naive_guess.wordclass

    # Consult the cache first, then the main-sense table.
    try:
        listed = MainSense.cache[wordclass][lemma]
    except KeyError:
        listed = None
        try:
            refentry, refid = MainSense.data[wordclass][lemma]
        except KeyError:
            pass
        else:
            listed = tdb.highest_ranked(lemma=lemma,
                                        wordclass=wordclass,
                                        refentry=refentry,
                                        refid=refid)
            # Remember the result so the table lookup is not repeated.
            # NOTE(review): assumes the cache is only written after a
            # successful table lookup - confirm against original layout.
            MainSense.cache[wordclass][lemma] = listed

    # A listed sense from a different entry is of no use to the caller.
    if (listed is not None and
            entry_id is not None and
            listed.refentry != entry_id):
        listed = None

    if listed is not None or listed_only:
        return listed

    # Fall back to tdb's ranking, reusing the earlier result if there
    # is one rather than recalculating.
    if naive_guess is not None:
        return naive_guess
    return tdb.highest_ranked(lemma=lemma,
                              wordclass=wordclass,
                              refentry=entry_id)
def attributive_of_noun(sense, main_sense_of_entry):
    """
    An attrib. sense of a noun is treated as the adj. equivalent of the
    main sense of the entry (or of a particular sense, if referenced).

    Arguments:
     -- sense: the attributive sense being classified.
     -- main_sense_of_entry: the main sense of the parent entry (may be
        None); used when no internal cross-reference resolves.

    Returns the adjective-equivalent class, with reason_text set and
    reason_code set to 'attb', or None if:
     -- a cross-referenced noun sense is found but is unclassified;
     -- no usable noun sense is available at all;
     -- tdb.equivalent_class() finds no adjective equivalent.
    """
    source_class = None
    reason_text = 'Adjective equivalent of main noun sense'

    # Use a particular sense if it's cross-referenced. It has to be an
    # internal cross-reference to a main sense (hence no lemma).
    xrefs = [xr for xr in sense.cross_references
             if xr.refentry == sense.entry_id and xr.lemma is None]
    if xrefs:
        target_sense = tdb.highest_ranked(lemma=sense.lemma,
                                          wordclass='NN',
                                          refentry=xrefs[0].refentry,
                                          refid=xrefs[0].refid)
        if target_sense is not None:
            if target_sense.thesclass is None:
                # The referenced sense exists but is unclassified; do
                # not fall back to the entry's main sense.
                return None
            source_class = target_sense.thesclass
            reason_text = 'Adjective equivalent of cross-referenced noun sense'

    # ... otherwise, default to the main sense of the entry
    if source_class is None:
        if (main_sense_of_entry is None or
                main_sense_of_entry.thesclass is None):
            return None
        source_class = main_sense_of_entry.thesclass

    equiv = tdb.equivalent_class(source_class, 'JJ')
    if equiv is None:
        # No adjective counterpart exists; other callers of
        # equivalent_class() treat None as "no match", so do the same
        # here instead of failing on the attribute assignment below.
        return None
    equiv.reason_text = reason_text
    equiv.reason_code = 'attb'
    return equiv
def infer_from_neighbouring_wordclass(sense):
    """
    Classify a sense from the same lemma in a neighbouring wordclass.

    Adjectives and verbs are compared with the noun ('NN') of the same
    entry; nouns and adverbs are compared with the adjective ('JJ').
    Other wordclasses are not handled.

    Returns the class equivalent to the neighbour's class in the sense's
    own wordclass, with reason_code 'nbor' and a reason_text set on it,
    or None if there is no classified neighbour or no equivalent class.
    """
    neighbours = {'JJ': 'NN', 'VB': 'NN', 'NN': 'JJ', 'RB': 'JJ'}
    opposite_class = neighbours.get(sense.wordclass)
    if opposite_class is None:
        return None

    opposite = tdb.highest_ranked(lemma=sense.lemma,
                                  refentry=sense.entry_id,
                                  wordclass=opposite_class)
    if opposite is None or opposite.thesclass is None:
        return None

    match = tdb.equivalent_class(opposite.thesclass, sense.wordclass)
    if match is not None:
        match.reason_code = 'nbor'
        match.reason_text = ('Inferred from neighbouring %s branch'
                             % opposite_class)
    return match
def affix_subentry(sense):
    """
    Classify a subentry of an affix entry by its last element.

    The entry lemma, stripped of hyphens at either end, is taken as the
    stem. If the sense's lemma begins with that stem (and the stem is
    purely alphabetic), whatever follows the stem is treated as the
    ending. Endings shorter than four characters are ignored. The ending
    is looked up by subject if the sense has subjects, and otherwise via
    main_sense_finder.

    Returns the class of the ending's sense, with reason_text set and
    reason_code set to 'driv', or None.
    """
    stem = sense.entry_lemma.strip('-')
    if (not sense.lemma.startswith(stem) or
            not re.search(r'^[a-zA-Z]+$', stem)):
        return None

    ending = re.sub(r'^' + stem, '', sense.lemma).strip(' -')
    if len(ending) < 4:
        return None

    if sense.subjects:
        target_sense = tdb.highest_ranked(lemma=ending,
                                          wordclass=sense.wordclass,
                                          subjects=sense.subjects)
    else:
        target_sense = main_sense_finder.main_sense(
            lemma=ending, wordclass=sense.wordclass)

    if target_sense is None or target_sense.thesclass is None:
        return None

    match = target_sense.thesclass
    match.reason_text = 'Inferred from last element ("%s")' % ending
    match.reason_code = 'driv'
    return match
def find_branch_from_superordinate(self, sense):
    """
    Classify by finding the main or only sense of the superordinate.

    Only nouns and adjectives are handled, and only if the superordinate
    is at least three characters long and is not listed in GENERICS.

    A noun sense of the superordinate is sought by a series of
    strategies, each tried only if the previous ones found nothing.
    Returns that sense's class (or its adjective equivalent if the sense
    being classified is an adjective), or None.
    """
    if sense.wordclass not in ('NN', 'JJ'):
        return None
    superordinate = sense.superordinate
    if (not superordinate or
            len(superordinate) < 3 or
            superordinate in GENERICS):
        return None

    def narrow_by(branches):
        # Best current noun sense of the superordinate within 'branches'.
        return tdb.highest_ranked(lemma=sense.superordinate,
                                  wordclass='NN',
                                  branches=branches,
                                  current_only=True)

    target = None

    # A superordinate with (more or less) a single sense: assume that
    # sense is the right one.
    candidates = tdb.ranked_search(lemma=sense.superordinate,
                                   wordclass='NN',
                                   current_only=True)
    if candidates and tdb.distinct_senses(candidates) <= 2:
        target = candidates[0]

    # Narrow by Bayes classification, if its confidence is high enough.
    if target is None and sense.bayes.confidence() >= 8:
        target = narrow_by(sense.bayes_based_classifications)

    # Narrow by branches based on subject labels.
    if target is None and sense.label_based_classifications:
        target = narrow_by(sense.label_based_classifications)

    # Narrow by branches based on cross-references.
    if target is None and sense.xref_branches:
        target = narrow_by(sense.xref_branches)

    # Last gasp: if the gloss consists more or less *only* of the
    # superordinate (e.g. 'an abbey'), the main sense of the
    # superordinate should be adequate, even if it is multi-sense.
    # Don't risk this if there are cross-references or subject labels
    # which might suggest a more specific use.
    if (target is None and
            not sense.subjects and
            not sense.xref_branches and
            sense.gloss is not None):
        bare_gloss = re.sub(r'^(a|an|the) ', '', sense.gloss.lower())
        if bare_gloss == sense.superordinate:
            target = MAIN_SENSE_FINDER.main_sense(
                lemma=sense.superordinate, wordclass='NN')

    # Finally, retry the Bayes classification using its is_usable() test.
    if target is None and sense.bayes.is_usable():
        target = narrow_by(sense.bayes_based_classifications)

    if target is None or target.thesclass is None:
        return None
    if sense.wordclass == 'JJ':
        return tdb.equivalent_class(target.thesclass, 'JJ')
    return target.thesclass
def superordinate_lookup(self, sense, panic=False):
    """
    Classify by looking up how other senses with the same superordinate
    have been classified.

    Arguments:
     -- sense: the sense being classified.
     -- panic: if true, candidate branches are filtered purely by
        probability (above 0.4). Otherwise they are filtered by overlap
        with the sense's cross-reference branches, then with its Bayes
        classifications.

    Returns the winning class, or None if no branch survives filtering.
    """
    # Use the long superordinate, plus the short one where it differs.
    superordinates = [sense.superordinate_full]
    if sense.superordinate != sense.superordinate_full:
        superordinates.append(sense.superordinate)

    # Collect the branches recorded for each superordinate, skipping
    # any class that has already been seen.
    branches = []
    seen = set()
    for superordinate in superordinates:
        if superordinate is None:
            continue
        key = superordinate.replace('-', '').replace(' ', '')
        record = tdb.get_superordinate_record(key)
        if record is None:
            continue
        for branch in record.branches:
            if branch.thesclass.id not in seen:
                branches.append(branch)
                seen.add(branch.thesclass.id)
    if not branches:
        return None

    def overlapping(node_ids):
        # Branches whose class has an ancestor among node_ids.
        return [b for b in branches
                if set.intersection(b.thesclass.ancestor_ids(), node_ids)]

    # NOTE(review): the Bayes fallbacks are assumed to apply only to the
    # non-panic path - confirm against the original layout.
    if panic:
        shortlist = [b for b in branches if b.probability > 0.4]
    else:
        shortlist = overlapping(set(sense.xref_branches))
        if not shortlist and sense.bayes.confidence() >= 4:
            shortlist = overlapping(set(sense.bayes.ids()))
        if not shortlist and sense.bayes.confidence() >= 4:
            # Try again with the Bayes classifications, but this time
            # just use their level-3 parents.
            level3_ids = {b.ancestor(level=3).id
                          for b in sense.bayes.branches()
                          if b.ancestor(level=3) is not None}
            shortlist = overlapping(level3_ids)
    if not shortlist:
        return None

    # Prefer branches below wordclass level; failing that, accept
    # branches above wordclass level.
    below_wordclass = [b for b in shortlist
                       if b.thesclass.wordclass is not None]
    ranked = sorted(below_wordclass or shortlist,
                    key=lambda b: b.probability,
                    reverse=True)
    winning_branch = ranked[0].thesclass

    # If this is a compound, try to get more specific by finding an
    # instance of the last element within the winning branch. (Fairly
    # unlikely, since most of these should already have been picked off
    # by the compound classifiers.)
    if sense.last_element() is not None:
        subclass = tdb.highest_ranked(lemma=sense.last_element(),
                                      wordclass=sense.wordclass,
                                      branches=[winning_branch.id])
        if subclass is not None and subclass.thesclass is not None:
            winning_branch = subclass.thesclass
    return winning_branch