Пример #1
0
def infer_from_etyma(sense, subjectFilter=False):
    """Classify *sense* by analogy with a word named in its etymology.

    An etymon is accepted when either

    * the sense has exactly one etymon and its lemma equals the sense's
      own lemma, or
    * the sense has exactly two etyma, a purely alphabetic base followed by
      a '-suffix', and the suffix is reported neutral by ``deriv_tester``
      (or is one of '-ist', '-ian', '-ful' on a 'JJ' sense).

    Each etymon is indexed as ``[0]`` lemma, ``[1]`` refentry, ``[2]`` refid.

    :param sense: the sense to classify; reads ``etyma``, ``lemma``,
        ``wordclass``, ``subjects`` and ``cross_references``.
    :param subjectFilter: when true, the entry-level fallback is a
        subject-constrained lookup (accepted only if ``entry_size < 100``)
        instead of ``main_sense_finder.main_sense``.
    :return: a thesaurus class tagged with reason code 'etym', or None.
    """
    etyma = sense.etyma
    source = None
    if len(etyma) == 1:
        if etyma[0][0] == sense.lemma:
            source = etyma[0]
    elif (len(etyma) == 2
            and re.search(r'^[a-zA-Z]+$', etyma[0][0])
            and re.search(r'^-[a-z]+$', etyma[1][0])):
        ending = etyma[1][0]
        if (deriv_tester.is_neutral_suffix(ending)
                or (ending in ('-ist', '-ian', '-ful')
                    and sense.wordclass == 'JJ')):
            source = etyma[0]

    if source is None:
        return None
    base_lemma, base_entry, base_id = source[0], source[1], source[2]

    # The etymon may point at one specific sense (e.g. lam n./3), so try
    # an exact-sense lookup first.
    target = tdb.highest_ranked(lemma=base_lemma,
                                refentry=base_entry,
                                refid=base_id,
                                exact_sense=True)
    if target is None:
        # The etymon only identifies an entry: pick a sense of that entry.
        if subjectFilter:
            candidate = tdb.highest_ranked(lemma=base_lemma,
                                           refentry=base_entry,
                                           subjects=sense.subjects)
            if candidate is not None and candidate.entry_size < 100:
                target = candidate
        else:
            target = main_sense_finder.main_sense(lemma=base_lemma,
                                                  refentry=base_entry)
    if target is None:
        return None

    # If a cross-reference names the same entry, it may pin down a
    # particular sense (as in 'nocturning'). Only the first cross-reference
    # to that entry is considered.
    for xr in sense.cross_references:
        same_entry = (xr.lemma == target.lemma and
                      xr.refentry == target.refentry)
        if not same_entry:
            continue
        pinpointed = tdb.highest_ranked(lemma=xr.lemma,
                                        refentry=xr.refentry,
                                        refid=xr.refid,
                                        exact_sense=True)
        if pinpointed is not None:
            target = pinpointed
        break

    if target.thesclass is None:
        return None

    if target.wordclass == sense.wordclass:
        match = target.thesclass.wordclass_parent()
    else:
        match = tdb.equivalent_class(target.thesclass, sense.wordclass)
    if match is not None:
        match.reason_code = 'etym'
        match.reason_text = 'Analogy with "%s" in etymology' % base_lemma
    return match
Пример #2
0
    def main_sense(self, **kwargs):
        """Return the main sense for a lemma, preferring the listed one.

        Recognised keyword arguments:

        * ``lemma`` - the lemma to look up.
        * ``wordclass`` - optional; when omitted it is taken from the
          highest-ranked sense of the lemma, or 'NN' if there is none.
        * ``refentry`` - optional entry id; a listed sense belonging to a
          different entry is discarded.
        * ``listed_only`` - when true, only the look-up tables are used and
          there is no fallback to ``tdb.highest_ranked``.

        Returns the instance found, or None.
        """
        lemma = kwargs.get('lemma')
        wordclass = kwargs.get('wordclass')
        entry_id = kwargs.get('refentry')
        listed_only = kwargs.get('listed_only', False)

        # No explicit wordclass: borrow it from the top-ranked sense, and
        # keep that sense around as a ready-made fallback.
        top_ranked = None
        if wordclass is None:
            top_ranked = tdb.highest_ranked(lemma=lemma, refentry=entry_id)
            wordclass = 'NN' if top_ranked is None else top_ranked.wordclass

        # Consult the cache first, then the listed data (caching the result).
        listed = None
        try:
            listed = MainSense.cache[wordclass][lemma]
        except KeyError:
            try:
                pointer = MainSense.data[wordclass][lemma]
            except KeyError:
                pass
            else:
                listed_entry, listed_id = pointer
                listed = tdb.highest_ranked(lemma=lemma,
                                            wordclass=wordclass,
                                            refentry=listed_entry,
                                            refid=listed_id)
                MainSense.cache[wordclass][lemma] = listed

        # A listed sense from the wrong entry is no use to the caller.
        if (listed is not None and
                entry_id is not None and
                listed.refentry != entry_id):
            listed = None

        if listed is not None or listed_only:
            return listed

        # Fall back to the ranking algorithm, reusing the earlier result
        # rather than recalculating it.
        if top_ranked is not None:
            return top_ranked
        return tdb.highest_ranked(lemma=lemma,
                                  wordclass=wordclass,
                                  refentry=entry_id)
Пример #3
0
def _adjective_equivalent(thesclass, reason_text):
    """Return the 'JJ' equivalent of *thesclass* tagged with code 'attb'.

    Returns None, without raising, when ``tdb.equivalent_class`` finds no
    equivalent class.
    """
    equiv = tdb.equivalent_class(thesclass, 'JJ')
    if equiv is not None:
        equiv.reason_text = reason_text
        equiv.reason_code = 'attb'
    return equiv


def attributive_of_noun(sense, main_sense_of_entry):
    """
    An attrib. sense of a noun is treated as the adj. equivalent
    of the main sense of the entry (or of a particular sense, if referenced)

    :param sense: the attributive sense; reads ``cross_references``,
        ``entry_id`` and ``lemma``.
    :param main_sense_of_entry: the entry's main sense, or None.
    :return: the adjective-equivalent class tagged with reason code 'attb',
        or None if no classified target sense or equivalent class exists.
    """
    # Use a particular sense if it's cross-referenced.
    #  Only internal cross-references to a main sense count
    #   (same entry, hence no lemma).
    xrefs = [xr for xr in sense.cross_references
             if xr.refentry == sense.entry_id and xr.lemma is None]
    if xrefs:
        target_sense = tdb.highest_ranked(lemma=sense.lemma,
                                          wordclass='NN',
                                          refentry=xrefs[0].refentry,
                                          refid=xrefs[0].refid)
        if target_sense is not None:
            # A referenced sense that exists but is unclassified ends the
            #  search: we do not fall back to the entry's main sense.
            if target_sense.thesclass is None:
                return None
            return _adjective_equivalent(
                target_sense.thesclass,
                'Adjective equivalent of cross-referenced noun sense')

    # ... otherwise, default to the main sense of the entry
    if (main_sense_of_entry is not None and
            main_sense_of_entry.thesclass is not None):
        return _adjective_equivalent(
            main_sense_of_entry.thesclass,
            'Adjective equivalent of main noun sense')
    return None
Пример #4
0
def infer_from_neighbouring_wordclass(sense):
    """Classify *sense* from the same lemma's sense in another wordclass.

    'JJ' and 'VB' senses look at the 'NN' branch; 'NN' and 'RB' senses
    look at the 'JJ' branch; any other wordclass yields None. The
    highest-ranked sense of the same lemma and entry in that wordclass is
    mapped to its equivalent in the sense's own wordclass.

    :param sense: reads ``wordclass``, ``lemma`` and ``entry_id``.
    :return: the equivalent class tagged with reason code 'nbor', or None.
    """
    neighbour_of = {'JJ': 'NN', 'VB': 'NN', 'NN': 'JJ', 'RB': 'JJ'}
    neighbour_class = neighbour_of.get(sense.wordclass)
    if neighbour_class is None:
        return None

    neighbour = tdb.highest_ranked(lemma=sense.lemma,
                                   refentry=sense.entry_id,
                                   wordclass=neighbour_class)
    if neighbour is None or neighbour.thesclass is None:
        return None

    inferred = tdb.equivalent_class(neighbour.thesclass, sense.wordclass)
    if inferred is None:
        return None
    inferred.reason_code = 'nbor'
    inferred.reason_text = ('Inferred from neighbouring %s branch'
                            % neighbour_class)
    return inferred
Пример #5
0
def affix_subentry(sense):
    """Classify a subentry of an affix entry by its last element.

    The entry lemma, stripped of hyphens, is treated as a stem. If the
    sense's lemma begins with that stem and the stem is purely alphabetic,
    the rest of the lemma (at least 4 characters once surrounding spaces
    and hyphens are removed) is looked up as a word in its own right: by
    subject if the sense has subjects, otherwise via the main-sense finder.

    :param sense: reads ``entry_lemma``, ``lemma``, ``wordclass`` and
        ``subjects``.
    :return: the thesaurus class of the looked-up word, tagged with reason
        code 'driv', or None.
    """
    stem = sense.entry_lemma.strip('-')
    if not (sense.lemma.startswith(stem) and
            re.search(r'^[a-zA-Z]+$', stem)):
        return None

    remainder = re.sub(r'^' + stem, '', sense.lemma).strip(' -')
    if len(remainder) < 4:
        # Too short to be looked up reliably as a word.
        return None

    if sense.subjects:
        found = tdb.highest_ranked(lemma=remainder,
                                   wordclass=sense.wordclass,
                                   subjects=sense.subjects)
    else:
        found = main_sense_finder.main_sense(lemma=remainder,
                                             wordclass=sense.wordclass)
    if found is None or found.thesclass is None:
        return None

    match = found.thesclass
    match.reason_text = 'Inferred from last element ("%s")' % remainder
    match.reason_code = 'driv'
    return match
Пример #6
0
    def find_branch_from_superordinate(self, sense):
        """Classify by finding the main or only sense of the superordinate.

        Only 'NN' and 'JJ' senses are handled, and only when the
        superordinate has at least 3 characters and is not in GENERICS.
        A noun sense of the superordinate is chosen by trying a series of
        strategies in order, stopping at the first that yields a sense.

        Returns that sense's thesaurus class (converted to its 'JJ'
        equivalent for adjective senses), or None.
        """
        if sense.wordclass not in ('NN', 'JJ'):
            return None
        hypernym = sense.superordinate
        if not hypernym or len(hypernym) < 3 or hypernym in GENERICS:
            return None

        def narrowed_by(branches):
            # Best current noun sense of the superordinate within the
            #  given branches.
            return tdb.highest_ranked(lemma=hypernym,
                                      wordclass='NN',
                                      branches=branches,
                                      current_only=True)

        chosen = None

        # 1. A superordinate that is (more or less) single-sense: assume
        #    that sense is the right one.
        shortlist = tdb.ranked_search(lemma=hypernym,
                                      wordclass='NN',
                                      current_only=True)
        if shortlist and tdb.distinct_senses(shortlist) <= 2:
            chosen = shortlist[0]

        # 2. Bayes classification, when its confidence is at least 8.
        if chosen is None and sense.bayes.confidence() >= 8:
            chosen = narrowed_by(sense.bayes_based_classifications)

        # 3. Branches derived from subject labels.
        if chosen is None and sense.label_based_classifications:
            chosen = narrowed_by(sense.label_based_classifications)

        # 4. Branches derived from cross-references.
        if chosen is None and sense.xref_branches:
            chosen = narrowed_by(sense.xref_branches)

        # 5. Last gasp: if the gloss consists more or less *only* of the
        #    superordinate (e.g. 'an abbey'), the superordinate's main
        #    sense should be adequate even if it is multi-sense.
        #    Don't risk this if there are cross-references or subject
        #    labels which might suggest a more specific use.
        if (chosen is None and sense.gloss is not None and
                not sense.subjects and not sense.xref_branches):
            bare_gloss = re.sub(r'^(a|an|the) ', '', sense.gloss.lower())
            if bare_gloss == hypernym:
                chosen = MAIN_SENSE_FINDER.main_sense(lemma=hypernym,
                                                      wordclass='NN')

        # 6. Bayes classification again, this time gated on is_usable()
        #    rather than on the confidence threshold.
        if chosen is None and sense.bayes.is_usable():
            chosen = narrowed_by(sense.bayes_based_classifications)

        if chosen is None or chosen.thesclass is None:
            return None
        if sense.wordclass == 'JJ':
            return tdb.equivalent_class(chosen.thesclass, 'JJ')
        return chosen.thesclass
Пример #7
0
    def superordinate_lookup(self, sense, panic=False):
        """
        Classify by looking up how other senses with the same superordinate
        have been classified.

        Candidate branches come from the superordinate records for the
        sense's full and short superordinates. With ``panic`` set, any
        branch with probability above 0.4 is kept; otherwise branches are
        kept only if they sit under one of the sense's cross-reference
        branches or, failing that and given Bayes confidence of at least 4,
        under its Bayes classifications or their level-3 ancestors.

        Returns the thesaurus class of the most probable surviving branch
        (or a more specific class found for a compound's last element
        within it), or None.
        """
        # Long superordinate first, then the short one if it differs.
        keys = [sense.superordinate_full]
        if sense.superordinate != sense.superordinate_full:
            keys.append(sense.superordinate)

        # Gather the branches for each key, de-duplicated by class id.
        pool = []
        seen_ids = set()
        for key in keys:
            if key is None:
                continue
            record = tdb.get_superordinate_record(
                key.replace('-', '').replace(' ', ''))
            if record is None:
                continue
            for branch in record.branches:
                class_id = branch.thesclass.id
                if class_id not in seen_ids:
                    pool.append(branch)
                    seen_ids.add(class_id)
        if not pool:
            return None

        def within(node_ids):
            # Branches having at least one ancestor id in node_ids.
            return [b for b in pool if
                    set.intersection(b.thesclass.ancestor_ids(), node_ids)]

        if panic:
            shortlist = [b for b in pool if b.probability > 0.4]
        else:
            shortlist = within(set(sense.xref_branches))

            if not shortlist and sense.bayes.confidence() >= 4:
                shortlist = within(set(sense.bayes.ids()))

            if not shortlist and sense.bayes.confidence() >= 4:
                # Try again with the Bayes classifications, but this
                #  time just use their level-3 parents
                level3_ids = {b.ancestor(level=3).id
                              for b in sense.bayes.branches()
                              if b.ancestor(level=3) is not None}
                shortlist = within(level3_ids)
        if not shortlist:
            return None

        # Prefer branches below wordclass level; failing that, accept
        #   those above it. Most probable branch wins.
        below_wordclass = [b for b in shortlist
                           if b.thesclass.wordclass is not None]
        ranked = sorted(below_wordclass or shortlist,
                        key=lambda b: b.probability,
                        reverse=True)
        winner = ranked[0].thesclass

        # For a compound, try to get more specific by finding an instance
        #   of the last element within the winning branch.
        # (Fairly unlikely, since most of these should already have been
        #   picked off by the compound classifiers.)
        if sense.last_element() is not None:
            refined = tdb.highest_ranked(lemma=sense.last_element(),
                                         wordclass=sense.wordclass,
                                         branches=[winner.id])
            if refined is not None and refined.thesclass is not None:
                winner = refined.thesclass

        return winner