def compound_derivative(sense):
    """Classify a sense as a derivative of an already-classified base form.

    Each entry of ``derivation_forms`` provides an ending, the wordclasses
    the ending applies to, and a replacement string. Entries are tried in
    order; the first one that fits the sense and whose reconstructed base
    lemma resolves to exactly one distinct sense carrying a thesaurus
    class decides the result.

    Returns a ``(thesclass, position)`` tuple, where ``position`` is
    'first' or 'last'. Returns ``(None, None)`` when the sense has no last
    element or when no derivation form yields a usable base.
    """
    tail = sense.last_element()
    if tail is None:
        return None, None

    for suffix, classes, substitute in derivation_forms:
        # The ending has to suit this wordclass...
        if sense.wordclass not in classes:
            continue
        # ...and must leave more than two characters of the last element
        #  once it has been removed
        if not tail.endswith(suffix) or len(tail) <= len(suffix) + 2:
            continue

        # Build the base form the lemma would have if it really is a
        #  derivative: cut the ending off the lemma and append the
        #  replacement (usually an empty string)
        stem_length = len(sense.lemma) - len(suffix)
        supposed_base = sense.lemma[0:stem_length] + substitute

        # Look the supposed base up; it only counts if it resolves to a
        #  single distinct sense whose top-ranked hit is classified
        hits = tdb.ranked_search(lemma=supposed_base, current_only=True)
        if tdb.distinct_senses(hits) != 1:
            continue
        found = hits[0].thesclass
        if found is None:
            continue

        # Colour + '-ed' compounds such as 'yellow-bellied' are more likely
        #  to be transparent than to derive from e.g. 'yellow-belly', so
        #  this guess is ranked after any existing guesses
        if (suffix in ('ed', 'ied') and
                sense.first_element().lower() in colours):
            return found, 'last'
        return found, 'first'

    return None, None
Пример #2
0
def _parse_instances(instances):
    local_branches = set()
    if instances and tdb.distinct_senses(instances) <= 2:
        # Filter to just instances from the first sense that have
        #   a thesaurus class attached
        instances = [i for i in instances if i.refid == instances[0].refid
                     and i.thesclass is not None]

        # Get the set of level-3 ancestor branches covering the set of
        #  instances
        for i in instances:
            branch = i.thesclass.ancestor(level=3)
            if branch is not None:
                local_branches.add(branch)

    # Nix it if too many branches (>3) have emerged from this set of
    #  instances - which suggests that the underlying sense is too vague
    #  to be relied on
    if len(local_branches) > 3:
        local_branches = ()

    # Drop anything in the abstract properties and relative properties
    #  branches - these aren't very useful
    return set([b for b in local_branches if not _is_useless(b)])
Пример #3
0
def match_single_synonym(sense):
    """Derive a thesaurus class for a sense from one of its synonyms.

    Synonyms with 20 or more distinct senses are ignored. The remaining
    synonyms are put through a series of strategies, each attempted only
    while no match has been found:

    1. search restricted by the sense's subjects (if it has any);
    2. unrestricted search, for interjections (wordclass 'UH') only;
    3. any synonym that is single-sense, or that has at most three
       senses when it is the only synonym;
    4. search restricted by the sense's Bayes branches, when usable.

    Returns a ``(thesclass, lemma)`` tuple taken from the matched
    instance, or ``(None, None)`` if nothing was found.
    """
    # Drop out any highly polysemous synonyms
    shortlist = [
        syn for syn in sense.synonyms
        if tdb.distinct_senses(tdb.search(lemma=syn,
                                          wordclass=sense.wordclass,
                                          current_only=True)) < 20
    ]
    if not shortlist:
        return None, None

    def head_of_pool(query):
        # Run the query for every shortlisted synonym, pool the results in
        #  order, and report the class and lemma of the very first pooled
        #  result - or None if the pool is empty or that result has no
        #  thesaurus class
        pooled = [hit for syn in shortlist for hit in query(syn)]
        if pooled and pooled[0].thesclass is not None:
            return pooled[0].thesclass, pooled[0].lemma
        return None

    match = None
    matching_synonym = None

    # A sense with subject labels: accept a match for *any* synonym
    #  (even a lone one) within those subjects
    if sense.subjects:
        found = head_of_pool(
            lambda syn: tdb.ranked_search(lemma=syn,
                                          wordclass=sense.wordclass,
                                          subjects=sense.subjects,
                                          current_only=True))
        if found is not None:
            match, matching_synonym = found

    # Interjections: accept a match for *any* synonym, since interjection
    #  synonyms are more reliable and less ambiguous
    if not match and sense.wordclass == 'UH':
        found = head_of_pool(
            lambda syn: tdb.ranked_search(lemma=syn,
                                          wordclass='UH',
                                          current_only=True))
        if found is not None:
            match, matching_synonym = found

    # A synonym that is single-sense (or nearly so, when it is the only
    #  synonym) is assumed to be used in that sense
    if not match:
        near_unique = []
        for syn in shortlist:
            hits = tdb.ranked_search(lemma=syn,
                                     wordclass=sense.wordclass,
                                     current_only=True)
            if not hits:
                continue
            spread = tdb.distinct_senses(hits)
            if spread == 1 or (spread <= 3 and len(shortlist) == 1):
                near_unique.append(hits[0])
        classified = next(
            (hit for hit in near_unique if hit.thesclass is not None), None)
        if classified is not None:
            match = classified.thesclass
            matching_synonym = classified.lemma

    # A sense with usable Bayes classification(s): accept a match for
    #  *any* synonym (even a lone one) within those branches
    if not match and sense.bayes.is_usable():
        found = head_of_pool(
            lambda syn: tdb.ranked_search(lemma=syn,
                                          wordclass=sense.wordclass,
                                          branches=sense.bayes.ids(),
                                          current_only=True))
        if found is not None:
            match, matching_synonym = found

    return match, matching_synonym
Пример #4
0
    def find_branch_from_superordinate(self, sense):
        """Classify a sense through the thesaurus class of its superordinate.

        Only senses with wordclass 'NN' or 'JJ' are handled, and only when
        the superordinate is at least three characters long and is not
        listed in GENERICS. The superordinate is always looked up with
        wordclass 'NN'; for a 'JJ' sense the class that is found is mapped
        through ``tdb.equivalent_class(..., 'JJ')``.

        Returns the matched thesaurus class, or None if the sense is not
        eligible or no classified superordinate sense could be found.
        """
        if sense.wordclass not in ('NN', 'JJ'):
            return None
        parent = sense.superordinate
        if not parent or len(parent) < 3 or parent in GENERICS:
            return None

        def best_within(branches):
            # Highest-ranked sense of the superordinate inside the given
            #  branches
            return tdb.highest_ranked(lemma=parent,
                                      wordclass='NN',
                                      branches=branches,
                                      current_only=True)

        chosen = None

        # A superordinate with at most two distinct senses is treated as
        #  (more or less) single-sense: take its top-ranked sense
        hits = tdb.ranked_search(lemma=parent,
                                 wordclass='NN',
                                 current_only=True)
        if hits and tdb.distinct_senses(hits) <= 2:
            chosen = hits[0]

        # Next, narrow by Bayes classification - at this stage only if
        #  the Bayes confidence is 8 or more
        if chosen is None and sense.bayes.confidence() >= 8:
            chosen = best_within(sense.bayes_based_classifications)

        # Next, narrow by the branches derived from subject labels
        if chosen is None and sense.label_based_classifications:
            chosen = best_within(sense.label_based_classifications)

        # Next, narrow by the branches derived from cross-references
        if chosen is None and sense.xref_branches:
            chosen = best_within(sense.xref_branches)

        # Last gasp: a gloss that is more or less *only* the superordinate
        #   (e.g. 'an abbey') justifies using the superordinate's main
        #   sense even when it is multi-sense.
        # This is not risked if there are subject labels or
        #   cross-references that might point to a more specific use.
        if (chosen is None and not sense.subjects and
                not sense.xref_branches and sense.gloss is not None):
            bare_gloss = re.sub(r'^(a|an|the) ', '', sense.gloss.lower())
            if bare_gloss == parent:
                chosen = MAIN_SENSE_FINDER.main_sense(
                    lemma=parent, wordclass='NN')

        # Finally, retry the Bayes branches whenever the Bayes result is
        #  usable at all, whatever its confidence
        if chosen is None and sense.bayes.is_usable():
            chosen = best_within(sense.bayes_based_classifications)

        if chosen is None or chosen.thesclass is None:
            return None
        if sense.wordclass == 'JJ':
            return tdb.equivalent_class(chosen.thesclass, 'JJ')
        return chosen.thesclass