예제 #1
0
    def style_convert_string(self, input_text):
        """ Convert input_text token by token and return the result
            as a single string.

            The text is tokenized and POS-tagged. Tokens tagged DT are
            kept as-is (lowercased); every other token is handed to
            self._weighted_choice_lesk together with the string form of
            its synset (or 'None') to obtain a replacement. The original
            token's capitalization pattern is then re-applied. """

        tokens = tokenize_string(input_text)
        tagged_tuples = nltk.pos_tag(tokens)

        # Context handed to lesk: the tagged tokens re-joined with spaces
        untagged_string = " ".join([pair[0] for pair in tagged_tuples])

        # POS tags for which no synset lookup is attempted
        no_synset_tags = ['VBD', 'VBG', 'VBN', 'NNS', 'NNPS']

        pieces = []

        for position, (orig_word, temp_pos) in enumerate(tagged_tuples):

            word = orig_word.strip().lower()

            if temp_pos == u'DT':
                # Determiners are never replaced
                replacement = word
            else:
                if temp_pos in no_synset_tags:
                    synset = None
                else:
                    # Map the Penn Treebank tag to a WordNet tag, when one exists
                    wordnet_pos = reduce_pos_tagset(temp_pos)
                    if wordnet_pos:
                        synset = nltk_lesk(untagged_string, word, wordnet_pos)
                    else:
                        synset = nltk_lesk(untagged_string, word)

                # Choose the replacement keyed on the synset's string form
                replacement = self._weighted_choice_lesk(str(synset), word)

            # Re-apply the capitalization of the original token
            if orig_word.istitle():          # "Title"
                replacement = replacement.title()
            elif orig_word.isupper():        # "UPPER"
                replacement = replacement.upper()
            elif not orig_word.islower():    # mixed or caseless: keep original
                replacement = orig_word

            # Separate words with a space; none before punctuation or the first token
            if position != 0 and word not in string.punctuation:
                pieces.append(" ")

            pieces.append(replacement)

        return "".join(pieces)
예제 #2
0
def lesk_builtin(wsd):
    """Return the first sense key of the synset chosen by Lesk for ``wsd``.

    ``nltk_lesk`` is called with ``wsd.context`` and ``wsd.lemma``. If it
    yields no synset, a debug message is logged and None is returned.
    """
    chosen = nltk_lesk(wsd.context, wsd.lemma)
    if chosen is None:
        logger.debug('synset empty for {}'.format(wsd.lemma))
        return None
    return get_first_sense_key(chosen)
예제 #3
0
파일: wsd.py 프로젝트: gsi-upm/sematch
 def lesk(self, context, word):
     """Run NLTK's Lesk on ``word`` with POS 'n' and return its result.

     ``context`` is first converted with ``self.context2words`` and the
     converted value is passed to Lesk as the context.
     """
     from nltk.wsd import lesk as nltk_lesk
     return nltk_lesk(self.context2words(context), word, 'n')
예제 #4
0
 def lesk(self, context, word):
     """Run NLTK's Lesk on ``word`` with POS 'n' and return its result.

     ``context`` is first converted with ``self.context2words`` and the
     converted value is passed to Lesk as the context.
     """
     from nltk.wsd import lesk as nltk_lesk
     return nltk_lesk(self.context2words(context), word, 'n')
예제 #5
0
    def style_convert_lesk(self, infile_name, outfile_name):
        """ For each word in the input file, look up synonyms in the
            author's thesaurus and probabilistically select a
            replacement word. Write output to outfile.

            infile_name  -- path of the text file to read, line by line
            outfile_name -- path of the file to write (opened with 'w',
                            so existing content is overwritten)

            One output line is written per processed input line.
            Returns outfile_name. """

        # POS tags for which no synset lookup is attempted
        no_synset_tags = ['VBD', 'VBG', 'VBN', 'NNS', 'NNPS']

        with open(infile_name, 'r') as infile, open(outfile_name, 'w') as outfile:

            for line in infile:

                # Drop non-ASCII characters before tagging. On Python 2 the
                # line is a byte string and is decoded; on Python 3 it is
                # already text (str has no .decode), so round-trip it
                # through ASCII instead of raising AttributeError.
                try:
                    if isinstance(line, bytes):
                        line = line.decode('ascii', 'ignore')
                    else:
                        line = line.encode('ascii', 'ignore').decode('ascii')
                except (UnicodeDecodeError, UnicodeEncodeError):
                    # Best effort: skip lines that cannot be converted
                    continue

                # POS tag, and then lesk-ify the input,
                # look it up in the thesauri
                line = tokenize_string(line)

                tagged_tuples = nltk.pos_tag(line)

                # Context handed to lesk: the tagged tokens re-joined with spaces
                untagged_string = " ".join([tagged_tuple[0] for tagged_tuple in tagged_tuples])

                for index, tagged_tuple in enumerate(tagged_tuples):

                    orig_word, temp_pos = tagged_tuple

                    word = orig_word.strip().lower()

                    was_title = orig_word.istitle()        # "Title"
                    was_capitalized = orig_word.isupper()  # "UPPER"
                    was_lower = orig_word.islower()        # "lower"

                    # Don't replace determiners
                    if temp_pos == u'DT':
                        weighted_key = word
                    else:
                        # No synset lookup for these verb/noun tags
                        if temp_pos in no_synset_tags:
                            synset = None
                        else:
                            # Replace word
                            # Converts penn tree bank pos tag to wordnet pos tag
                            wordnet_pos = reduce_pos_tagset(temp_pos)
                            if wordnet_pos:
                                synset = nltk_lesk(untagged_string, word, wordnet_pos)
                            else:
                                synset = nltk_lesk(untagged_string, word)

                        # Probabilistically choose a synonym in thesaurus[synset];
                        # the key is the synset's string form ('None' when no
                        # synset was looked up or found).
                        # NOTE(review): original comment says this falls back to
                        # a non-WordNet lookup when the synset is missing --
                        # confirm in _weighted_choice_lesk.
                        weighted_key = self._weighted_choice_lesk(str(synset), word)

                    # Match capitalization of original word
                    if was_title:
                        weighted_key = weighted_key.title()
                    elif was_capitalized:
                        weighted_key = weighted_key.upper()
                    elif not was_lower:
                        # Mixed-case or caseless token: keep the original text
                        weighted_key = orig_word

                    # Add a space between words, no space for punctuation
                    if word not in string.punctuation and index != 0:
                        outfile.write(" ")

                    outfile.write(weighted_key)
                outfile.write('\n')

        return outfile_name