Example #1
# Assumed imports for these snippets (pywsd-style helpers; adjust to your project layout):
from itertools import chain

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

from pywsd.utils import lemmatize, synset_properties

EN_STOPWORDS = stopwords.words('english')
porter = PorterStemmer()


def simple_signature(ss, stem=False):
    """
    Returns a synsets_signatures dictionary that includes signature words of a
    sense from its:
    (i)   definition
    (ii)  example sentences
    (iii) hypernyms and hyponyms
    """
    signature = []
    # print("ss: ",ss)
    # Includes definition.
    ss_definition = synset_properties(ss, 'definition')
    signature += word_tokenize(ss_definition)
    # Includes examples
    ss_examples = synset_properties(ss, 'examples')
    signature += list(chain(*[i.split() for i in ss_examples]))
    # Includes lemma_names.
    ss_lemma_names = synset_properties(ss, 'lemma_names')
    signature += ss_lemma_names
    # Optional: includes lemma_names of hypernyms and hyponyms.

    ss_hyponyms = synset_properties(ss, 'hyponyms')
    ss_hypernyms = synset_properties(ss, 'hypernyms')
    ss_hypohypernyms = ss_hypernyms + ss_hyponyms
    signature += list(chain(*[i.lemma_names() for i in ss_hypohypernyms]))
    '''
    # Includes holonyms.
    ss_mem_holonyms = synset_properties(ss, 'member_holonyms')
    ss_part_holonyms = synset_properties(ss, 'part_holonyms')
    ss_sub_holonyms = synset_properties(ss, 'substance_holonyms')
    # Includes meronyms.
    ss_mem_meronyms = synset_properties(ss, 'member_meronyms')
    ss_part_meronyms = synset_properties(ss, 'part_meronyms')
    ss_sub_meronyms = synset_properties(ss, 'substance_meronyms')
    # Includes similar_tos
    ss_simto = synset_properties(ss, 'similar_tos')
    
    related_senses = list(set(ss_mem_holonyms+ss_part_holonyms+
                              ss_sub_holonyms+ss_mem_meronyms+
                              ss_part_meronyms+ss_sub_meronyms+ ss_simto))

    signature += list([j for j in chain(*[synset_properties(i, 'lemma_names')
                                         for i in related_senses])
                      if j not in EN_STOPWORDS])
    '''
    # Removes stopwords.
    signature = [i for i in signature if i.lower() not in EN_STOPWORDS]
    # Lemmatized context is preferred over stemmed context.
    signature = [lemmatize(i) for i in signature]
    # Matching exact words may cause sparsity, so optional matching for stems.
    if stem:
        signature = [porter.stem(i) for i in signature]

    return signature
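
A minimal usage sketch for this variant, assuming the NLTK WordNet corpus is installed and the helpers above (synset_properties, lemmatize, EN_STOPWORDS, porter) are in scope; "bank" is just an illustrative word:

from nltk.corpus import wordnet as wn

# Signature for the first WordNet sense of "bank".
ss = wn.synsets('bank')[0]
print(simple_signature(ss))             # lemmatized signature words
print(simple_signature(ss, stem=True))  # stemmed variant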
Example #2
def adapted_lesk(context_sentence, ambiguous_word,
                 pos=None, lemma=True, stem=True, hyperhypo=True,
                 stop=True, context_is_lemmatized=False,
                 nbest=False, keepscore=False, normalizescore=False):
    """
    This function is the implementation of the Adapted Lesk algorithm, 
    described in Banerjee and Pederson (2002). It makes use of the lexical 
    items from semantically related senses within the wordnet 
    hierarchies and to generate more lexical items for each sense. 
    see www.d.umn.edu/~tpederse/Pubs/cicling2002-b.pdf‎
    """
    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    # Get the signatures for each synset.
    ss_sign = simple_signature(ambiguous_word, pos, lemma, stem, hyperhypo)
    for ss in ss_sign:
        # Includes holonyms.
        ss_mem_holonyms = synset_properties(ss, 'member_holonyms')
        ss_part_holonyms = synset_properties(ss, 'part_holonyms')
        ss_sub_holonyms = synset_properties(ss, 'substance_holonyms')
        # Includes meronyms.
        ss_mem_meronyms = synset_properties(ss, 'member_meronyms')
        ss_part_meronyms = synset_properties(ss, 'part_meronyms')
        ss_sub_meronyms = synset_properties(ss, 'substance_meronyms')
        # Includes similar_tos
        ss_simto = synset_properties(ss, 'similar_tos')

        related_senses = list(
            set(ss_mem_holonyms + ss_part_holonyms + ss_sub_holonyms +
                ss_mem_meronyms + ss_part_meronyms + ss_sub_meronyms +
                ss_simto))

        signature = [
            j for j in chain(
                *[synset_properties(i, 'lemma_names') for i in related_senses])
            if j not in EN_STOPWORDS
        ]

        # Lemmatized context is preferred over stemmed context.
        if lemma:
            signature = [lemmatize(i) for i in signature]
        # Matching exact words causes sparsity, so optional matching for stems.
        if stem:
            signature = [porter.stem(i) for i in signature]
        # Adds the extended signature to the simple signatures.
        ss_sign[ss] += signature

    # Disambiguate the sense in context.
    if context_is_lemmatized:
        context_sentence = context_sentence.split()
    else:
        context_sentence = lemmatize_sentence(context_sentence)
    best_sense = compare_overlaps(context_sentence, ss_sign,
                                  nbest=nbest, keepscore=keepscore,
                                  normalizescore=normalizescore)
    return best_sense
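
A hedged usage sketch; it assumes the pywsd-style helpers above (the dictionary-returning simple_signature from Example #3, compare_overlaps, lemmatize_sentence) are importable:

sentence = 'I went to the bank to deposit my money'
best = adapted_lesk(sentence, 'bank', pos='n')
print(best)  # expected to favour the financial-institution sense of "bank"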
Example #3
def simple_signature(ambiguous_word, pos=None, lemma=True, stem=False,
                     hyperhypo=True, stop=True):
    """
    Returns a synsets_signatures dictionary that includes signature words of a
    sense from its:
    (i)   definition
    (ii)  example sentences
    (iii) hypernyms and hyponyms
    """
    synsets_signatures = {}
    for ss in wn.synsets(ambiguous_word):
        try:  # If POS is specified.
            if pos and str(ss.pos()) != pos:
                continue
        except TypeError:  # Older NLTK versions expose pos as a plain attribute.
            if pos and str(ss.pos) != pos:
                continue
        signature = []
        # Includes definition, split into words; appending the raw string
        # would add individual characters to the signature.
        ss_definition = synset_properties(ss, 'definition')
        signature += ss_definition.split()
        # Includes examples
        ss_examples = synset_properties(ss, 'examples')
        signature += list(chain(*[i.split() for i in ss_examples]))
        # Includes lemma_names.
        ss_lemma_names = synset_properties(ss, 'lemma_names')
        signature += ss_lemma_names

        # Optional: includes lemma_names of hypernyms and hyponyms.
        if hyperhypo:
            ss_hyponyms = synset_properties(ss, 'hyponyms')
            ss_hypernyms = synset_properties(ss, 'hypernyms')
            ss_hypohypernyms = ss_hypernyms + ss_hyponyms
            signature += list(
                chain(*[i.lemma_names() for i in ss_hypohypernyms]))

        # Optional: removes stopwords.
        if stop:
            signature = [i for i in signature if i not in EN_STOPWORDS]
        # Lemmatized context is preferred over stemmed context.
        if lemma:
            signature = [lemmatize(i) for i in signature]
        # Matching exact words may cause sparsity, so optional matching for stems.
        if stem:
            signature = [porter.stem(i) for i in signature]
        synsets_signatures[ss] = signature

    return synsets_signatures
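
A short sketch of how the returned dictionary might be inspected, assuming the same helpers are in scope:

sigs = simple_signature('bank', pos='n')
for ss, sig in sigs.items():
    print(ss.name(), sig[:5])  # each noun synset mapped to the head of its signature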
Example #4
def original_lesk(context_sentence, ambiguous_word, dictionary=None):
    """
    This function is the implementation of the original Lesk algorithm (1986).
    It requires a dictionary which contains the definition of the different
    sense of each word. See http://dl.acm.org/citation.cfm?id=318728
    """
    ambiguous_word = lemmatize(ambiguous_word)
    if not dictionary:  # If no dictionary is provided, use the WordNet definitions.
        dictionary = {}
        for ss in wn.synsets(ambiguous_word):
            ss_definition = synset_properties(ss, 'definition')
            dictionary[ss] = ss_definition
    best_sense = compare_overlaps_greedy(context_sentence.split(), dictionary)
    return best_sense
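
A minimal sketch; compare_overlaps_greedy is assumed to come from the same pywsd-style module:

# With no dictionary argument, WordNet glosses serve as the sense inventory.
print(original_lesk('I went to the bank to deposit my money', 'bank'))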
Example #5
def simple_signature(word, ss, lexicon):
    """
    Returns a synsets_signatures dictionary that includes signature words of a
    sense from its:
    (i)   definition
    (ii)  example sentences
    (iii) hypernyms and hyponyms
    """
    # findRelatedForms is a project-specific helper assumed to be defined elsewhere.
    signature = [wn.morphy(word), word,
                 ss.name().split('.')[0]] + findRelatedForms(ss)

    ss_definition = synset_properties(ss, 'definition')
    signature += word_tokenize(ss_definition)

    ss_lemma_names = synset_properties(ss, 'lemma_names')
    signature += ss_lemma_names

    ss_hyponyms = synset_properties(ss, 'hyponyms')
    #ss_hypernyms = synset_properties(ss, 'hypernyms')
    ss_hypohypernyms = ss_hyponyms  #+ss_hypernyms
    signature += list(chain(*[i.lemma_names() for i in ss_hypohypernyms]))

    # Includes holonyms.
    ss_mem_holonyms = synset_properties(ss, 'member_holonyms')
    ss_part_holonyms = synset_properties(ss, 'part_holonyms')
    ss_sub_holonyms = synset_properties(ss, 'substance_holonyms')
    # Includes meronyms.
    ss_mem_meronyms = synset_properties(ss, 'member_meronyms')
    ss_part_meronyms = synset_properties(ss, 'part_meronyms')
    ss_sub_meronyms = synset_properties(ss, 'substance_meronyms')
    # Includes similar_tos
    #ss_simto = synset_properties(ss, 'similar_tos')

    related_senses = list(
        set(ss_mem_holonyms + ss_part_holonyms + ss_sub_holonyms +
            ss_mem_meronyms + ss_part_meronyms + ss_sub_meronyms))

    signature += [
        j for j in chain(
            *[synset_properties(i, 'lemma_names') for i in related_senses])
        if j not in EN_STOPWORDS
    ]
    # Keeps only string tokens (wn.morphy may return None).
    signature = [i for i in signature if isinstance(i, str)]
    # Removes stopwords.
    signature = [i for i in signature if i.lower() not in EN_STOPWORDS]
    # Lemmatized context is preferred over stemmed context.
    signature = [lemmatize(i) for i in signature]
    # Matching exact words may cause sparsity, so optional matching for stems.
    #signature = [porter.stem(i) for i in signature]

    ss_sign = set(signature)
    to_remove = []

    # lexiconDict is a project-specific global mapping words to lexicon labels.
    # The loop variable is named sig_word to avoid shadowing the `word` parameter.
    for sig_word in ss_sign:
        morphy = wn.morphy(sig_word)
        if lexicon == '+-':
            if sig_word not in lexiconDict and morphy not in lexiconDict:
                to_remove.append(sig_word)
        else:
            if sig_word in lexiconDict:
                if lexiconDict[sig_word] != lexicon:
                    to_remove.append(sig_word)
            elif (morphy is not None) and morphy in lexiconDict:
                if lexiconDict[morphy] != lexicon:
                    to_remove.append(sig_word)
            else:
                to_remove.append(sig_word)

    to_remove = set(to_remove)
    ss_sign = ss_sign - to_remove
    signature = list(ss_sign)

    sim = []
    # The loop variable is named sig_word to avoid shadowing the `ss` parameter;
    # disgustingWords is another project-specific global.
    for sig_word in ss_sign:
        synsets = wn.synsets(sig_word)
        if len(synsets) > 0:
            if synsets[0] not in sim:
                synset = synsets[0]
                pos = synset.pos()
                morphy = wn.morphy(sig_word, pos)
                if sig_word in lexiconDict or sig_word in disgustingWords:
                    sim.append(synset)
                elif (morphy is not None) and morphy in lexiconDict:
                    sim.append(synset)
                else:
                    signature.remove(sig_word)
        else:
            signature.remove(sig_word)
    return [signature, sim]
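
A heavily hedged sketch of a call to this variant: lexiconDict, disgustingWords, and findRelatedForms are project-specific globals that must be defined elsewhere, and the '+' label is a hypothetical lexicon tag:

from nltk.corpus import wordnet as wn

ss = wn.synsets('happy')[0]
# Keeps only signature words tagged '+' in lexiconDict (hypothetical label).
signature, sim = simple_signature('happy', ss, '+')
print(signature[:10], sim[:3])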