Example #1
def max_similarity(context_sentence: str,
                   ambiguous_word: str,
                   option="path",
                   lemma=True,
                   context_is_lemmatized=False,
                   pos=None,
                   best=True) -> "wn.Synset":
    """
    Perform WSD by maximizing the sum of maximum similarity between the
    possible synsets of all words in the context sentence and the possible
    synsets of the ambiguous word (see https://ibin.co/4gG9zUlejUUA.png):

        \operatorname{argmax}_{synset(a)} \sum_{i}^{n} \max_{synset(i)} sim(i, a)

    :param context_sentence: String, a sentence.
    :param ambiguous_word: String, a single word.
    :return: If best is True, returns only the best Synset; otherwise returns
        the full list of (score, Synset) tuples, sorted best-first.
    """
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    if context_is_lemmatized:
        context_sentence = word_tokenize(context_sentence)
    else:
        context_sentence = [
            lemmatize(w) for w in word_tokenize(context_sentence)
        ]
    result = {}
    for i in wn.synsets(ambiguous_word, pos=pos):
        result[i] = 0
        for j in context_sentence:
            _result = [0]
            for k in wn.synsets(j):
                _result.append(sim(i, k, option))
            result[i] += max(_result)

    if option in ["res", "resnik"]:  # lower score = more similar
        result = sorted([(v, k) for k, v in result.items()])
    else:  # higher score = more similar
        result = sorted([(v, k) for k, v in result.items()], reverse=True)

    try:
        return result[0][1] if best else result
    except IndexError:
        return None
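
A minimal usage sketch, assuming max_similarity and its helpers (lemmatize, sim, NLTK's wn and word_tokenize) are importable from this module and the NLTK WordNet data is installed; the sentence and expected sense are illustrative:

# Disambiguate "bank" in a money context by path similarity.
sent = "I went to the bank to deposit my money"
best = max_similarity(sent, "bank", option="path", pos="n")
print(best)  # expected: a money-related sense, e.g. Synset('depository_financial_institution.n.01')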
Example #2
def adapted_lesk(context_sentence: str,
                 ambiguous_word: str,
                 pos: str = None,
                 lemma=True,
                 stem=False,
                 hyperhypo=True,
                 stop=True,
                 context_is_lemmatized=False,
                 nbest=False,
                 keepscore=False,
                 normalizescore=False,
                 from_cache=True) -> "wn.Synset":
    """
    This function implements the Adapted Lesk algorithm described in
    Banerjee and Pedersen (2002). It uses lexical items from semantically
    related senses within the WordNet hierarchies to generate more lexical
    items for each sense.
    See www.d.umn.edu/~tpederse/Pubs/cicling2002-b.pdf

    :param context_sentence: String, sentence or document.
    :param ambiguous_word: String, a single word.
    :param pos: String, one of 'a', 'r', 's', 'n', 'v', or None.
    :return: A Synset for the estimated best sense.
    """

    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    # Get the signatures for each synset.
    ss_sign = signatures(ambiguous_word,
                         pos=pos,
                         hyperhypo=hyperhypo,
                         adapted=True,
                         remove_stopwords=stop,
                         to_lemmatize=lemma,
                         remove_numbers=True,
                         lowercase=True,
                         to_stem=stem,
                         from_cache=from_cache)

    # Disambiguate the sense in context.
    context_sentence = (context_sentence.split() if context_is_lemmatized
                        else lemmatize_sentence(context_sentence))
    return compare_overlaps(context_sentence,
                            ss_sign,
                            nbest=nbest,
                            keepscore=keepscore,
                            normalizescore=normalizescore)
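
A minimal usage sketch, assuming adapted_lesk and its helpers (signatures, lemmatize_sentence, compare_overlaps) are importable; the sentence is illustrative:

# Disambiguate "plant" in an industrial context.
sent = "The workers went on strike at the plant"
best = adapted_lesk(sent, "plant", pos="n")
print(best)  # expected: the industrial-building sense, e.g. Synset('plant.n.01')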
Example #3
def cosine_lesk(context_sentence: str,
                ambiguous_word: str,
                pos: str = None,
                lemma=True,
                stem=True,
                hyperhypo=True,
                stop=True,
                context_is_lemmatized=False,
                nbest=False,
                from_cache=True) -> "wn.Synset":
    """
    In line with vector space models, we can use cosine similarity to score
    overlaps instead of using raw overlap counts. Essentially, the idea of
    using signatures (aka 'sense paraphrases') is Lesk-like.

    :param context_sentence: String, sentence or document.
    :param ambiguous_word: String, a single word.
    :param pos: String, one of 'a', 'r', 's', 'n', 'v', or None.
    :return: A Synset for the estimated best sense.
    """

    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)

    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    ss_sign = simple_signatures(ambiguous_word,
                                pos,
                                lemma,
                                stem,
                                hyperhypo,
                                stop,
                                from_cache=from_cache)
    if context_is_lemmatized:
        context_sentence = " ".join(context_sentence.split())
    else:
        context_sentence = " ".join(lemmatize_sentence(context_sentence))

    scores = []
    for ss, signature in ss_sign.items():
        # Lowercase and replace "_" with spaces.
        signature = " ".join(map(str, signature)).lower().replace("_", " ")
        scores.append((cos_sim(context_sentence, signature), ss))

    scores = sorted(scores, reverse=True)
    return scores if nbest else scores[0][1]
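
A minimal usage sketch, assuming cosine_lesk and its helpers (simple_signatures, lemmatize_sentence, cos_sim) are importable; the sentence is illustrative:

sent = "The sailors scrubbed the deck of the ship"
best = cosine_lesk(sent, "deck")                # single best Synset
ranked = cosine_lesk(sent, "deck", nbest=True)  # full (score, Synset) ranking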
Example #4
def simple_lesk(tokens: list,
                ambiguous_word: str,
                pos: str = None,
                lemma=True,
                stem=False,
                hyperhypo=True,
                stop=True,
                context_is_lemmatized=False,
                nbest=False,
                keepscore=False,
                normalizescore=False,
                from_cache=True) -> "wn.Synset":
    """
    Simple Lesk is somewhere in between using more than the
    original Lesk algorithm (1986) and using less signature
    words than adapted Lesk (Banerjee and Pederson, 2002)

    :param context_sentence: String, sentence or document.
    :param ambiguous_word: String, a single word.
    :param pos: String, one of 'a', 'r', 's', 'n', 'v', or None.
    :return: A Synset for the estimated best sense.
    """
    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word, pos=pos)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    # Get the signatures for each synset.
    ss_sign = simple_signatures(ambiguous_word,
                                pos,
                                lemma,
                                stem,
                                hyperhypo,
                                stop,
                                from_cache=from_cache)
    # Disambiguate the sense in context; lemmatize the tokens unless the
    # caller indicates they are already lemmas.
    context_sentence = tokens if context_is_lemmatized else [lemmatize(w) for w in tokens]
    return compare_overlaps(context_sentence,
                            ss_sign,
                            nbest=nbest,
                            keepscore=keepscore,
                            normalizescore=normalizescore)
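
A minimal usage sketch, assuming simple_lesk and its helpers are importable; note that this variant takes a pre-tokenized list rather than a raw string:

tokens = "I went to the bank to deposit my money".split()
best = simple_lesk(tokens, "bank", pos="n")
print(best)  # expected: a financial-institution sense of "bank"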
Example #5
def original_lesk(context_sentence: str,
                  ambiguous_word: str,
                  dictionary=None,
                  from_cache=True) -> "wn.Synset":
    """
    This function implements the original Lesk algorithm (Lesk, 1986).
    It requires a dictionary which contains the definitions of the different
    senses of each word. See http://dl.acm.org/citation.cfm?id=318728

    :param context_sentence: String, sentence or document.
    :param ambiguous_word: String, a single word.
    :return: A Synset for the estimated best sense.
    """

    ambiguous_word = lemmatize(ambiguous_word)
    if not dictionary:  # If dictionary is not provided, use the WN definitions.
        dictionary = signatures(ambiguous_word,
                                original_lesk=True,
                                from_cache=from_cache)
    best_sense = compare_overlaps_greedy(context_sentence.split(), dictionary)

    return best_sense
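
A minimal usage sketch, assuming original_lesk and its signatures helper are importable; with no dictionary argument, the WordNet definitions are used:

sent = "The bank raised its interest rate"
best = original_lesk(sent, "bank")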
Example #6
def synset_signatures(ss: "wn.Synset",
                      hyperhypo=True,
                      adapted=False,
                      remove_stopwords=True,
                      to_lemmatize=True,
                      remove_numbers=True,
                      lowercase=True,
                      original_lesk=False,
                      from_cache=True) -> set:
    """
    Takes a Synset and returns its signature words.

    :param ss: An instance of wn.Synset.
    :return: A set of signature strings
    """
    if from_cache:
        return synset_signatures_from_cache(ss, hyperhypo, adapted,
                                            original_lesk)

    # Collects the signatures from WordNet.
    signature = []

    # Adds the definition words.
    signature += word_tokenize(ss.definition())

    # If the original lesk signature is requested, skip the other signatures.
    if original_lesk:
        return set(signature)

    # Adds the examples and lemma names.
    signature += chain(*[word_tokenize(eg) for eg in ss.examples()])
    signature += ss.lemma_names()

    # Includes lemma_names of hyper-/hyponyms.
    if hyperhypo:
        hyperhyponyms = set(ss.hyponyms() + ss.hypernyms() +
                            ss.instance_hyponyms() + ss.instance_hypernyms())
        signature += set(chain(*[i.lemma_names() for i in hyperhyponyms]))

    # Includes signatures from related senses as in Adapted Lesk.
    if adapted:
        # Includes lemma_names from holonyms, meronyms and similar_tos
        related_senses = set(ss.member_holonyms() + ss.part_holonyms() +
                             ss.substance_holonyms() + ss.member_meronyms() +
                             ss.part_meronyms() + ss.substance_meronyms() +
                             ss.similar_tos())
        signature += set(chain(*[i.lemma_names() for i in related_senses]))

    # Lowercase.
    signature = set(s.lower() for s in signature) if lowercase else signature

    # Removes stopwords.
    if remove_stopwords:
        signature = set(signature).difference(EN_STOPWORDS)

    # Lemmatized context is preferred over stemmed context.
    if to_lemmatize:
        signature = [
            lemmatize(s)
            for s in signature
            # Throw a token away only if remove_numbers is set and it is a digit.
            if not (remove_numbers and s.isdigit())
        ]

    # Keep only the unique bag-of-words
    return set(signature)
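
A minimal usage sketch, assuming this module and the NLTK WordNet corpus are available; from_cache=False forces a fresh signature build rather than going through the (unshown) cache helper:

from nltk.corpus import wordnet as wn

ss = wn.synset("bank.n.01")  # the 'sloping land' sense
sig = synset_signatures(ss, adapted=True, from_cache=False)
print(sorted(sig)[:10])  # a sample of the signature words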