def parseSurfaceSemantics(sss_str):
    """Resolve a surface-semantics string into a list of WordNet senses.

    ``sss_str`` is split by ``splitSurfaceSemantics`` into a word, a part of
    speech (POS) and a sequence of 1-based sense numbers.  The senses are
    looked up under the given POS first; if that lookup raises ``KeyError``
    or ``IndexError``, each of the other POS tags is tried in turn.

    Returns the list of sense objects from the first POS for which every
    sense number resolves, or ``[]`` when ``sss_str`` contains no underscore
    or no POS succeeds.
    """
    if '_' not in sss_str:
        return []
    text, POS, senses = splitSurfaceSemantics(sss_str)
    # The tagged POS goes first, followed by the remaining ones as fallbacks.
    candidates = [POS] + [alt for alt in ('N', 'V', 'ADJ', 'ADV') if alt != POS]
    for candidate in candidates:
        try:
            # Sense numbers are 1-based in the string, 0-based in the list.
            return [pywordnet.getWord(text, candidate).getSenses()[int(s) - 1]
                    for s in senses]
        except (IndexError, KeyError):
            continue
    return []
示例#2
0
def parseSurfaceSemantics(sss_str):
    """Resolve a surface-semantics string into a list of WordNet senses.

    ``sss_str`` is split by ``splitSurfaceSemantics`` into a word, a part of
    speech (POS) and a sequence of 1-based sense numbers.  The senses are
    looked up under the given POS first; if that lookup raises ``KeyError``
    or ``IndexError``, each of the other POS tags is tried in turn.

    Returns the list of sense objects from the first POS for which every
    sense number resolves, or ``[]`` when ``sss_str`` contains no underscore
    or no POS succeeds.
    """
    if "_" not in sss_str:
        return []
    text, POS, senses = splitSurfaceSemantics(sss_str)
    # The tagged POS goes first, followed by the remaining ones as fallbacks.
    candidates = [POS] + [alt for alt in ("N", "V", "ADJ", "ADV") if alt != POS]
    for candidate in candidates:
        try:
            # Sense numbers are 1-based in the string, 0-based in the list.
            return [
                pywordnet.getWord(text, candidate).getSenses()[int(s) - 1]
                for s in senses
            ]
        except (IndexError, KeyError):
            continue
    return []
def extractSurfaceSemantics(token,parent):
    """Build the ``text_POS_senses`` surface-semantics string for a token.

    The part of speech comes from ``getPartOfSpeech(token, parent)``.  Only
    the POS tags N, V, ADV and ADJ are processed.  Candidate senses are
    looked up in this order, moving on whenever a lookup raises ``KeyError``:

    1. the module-level ``Senses`` table, keyed by the lower-cased text;
    2. WordNet (``pywordnet``), using every sense number of the word;
    3. ``Senses`` again after stripping a trailing ``s`` or stemming;
    4. spelling suggestions from ``enchant``.

    Returns:
        ``'<text>_<POS>_<n1>,<n2>,...'`` on success; the upper-cased token
        text when the token cannot be resolved at all; ``None`` when the POS
        is not one of the four handled tags, when no senses are available,
        or when formatting the result fails (the failure is logged).

    Side effects:
        May assign ``token['POS']`` when the senses found belong to a
        different POS than the one requested, and calls
        ``stemmer.stem(token)`` before reading ``token['STEM']``.
    """
    POS = getPartOfSpeech(token, parent)
    tokenSenses = {}
    text = token['TEXT'].lower()
    default = token['TEXT'].upper()
    if POS not in ('N', 'V', 'ADV', 'ADJ'):
        return None

    # TODO: redo this cascade as a loop over lookup strategies.
    try:
        tokenSenses = Senses[text]
    except KeyError:
        logger.warning('extractSurfaceSemantics : Text not in tagged senses: %s', text)
        try:
            # Untagged word that WordNet knows: offer every sense number.
            tokenSenses = {POS : range(1,len(pywordnet.getWord(text,POS).getSenses())+1)}
        except KeyError:
            try:
                logger.warning('extractSurfaceSemantics : Inflected version of WordNet word? %s', text)
                if text.endswith('s'):
                    text = text[:-1]
                    tokenSenses = Senses[text]
                else:
                    stemmer = PorterStemmer() # Update WordNetStemmer to NLTK 1.4 API
                    # presumably stem() stores its result under token['STEM'] -- verify
                    stemmer.stem(token)
                    text = token['STEM']
                    tokenSenses = Senses[text]
            except KeyError:
                # Undo any stripping/stemming before trying spelling fixes.
                text = token['TEXT'].lower()
                try:
                    logger.warning('extractSurfaceSemantics : Misspelling / typo of WordNet word? %s', text)
                    spellchecker = enchant.DictWithPWL('en_US', Lexicon)
                    # Ask once; the list is reused by every step below.
                    suggestions = spellchecker.suggest(text)
                    s = ''
                    for s in suggestions:
                        if s in Senses:
                            tokenSenses = Senses[s]
                            break
                    if not tokenSenses and suggestions:
                        # No suggestion is tagged: fall back to WordNet for
                        # the top suggestion (KeyError here means unknown).
                        s = suggestions[0]
                        tokenSenses = {POS : range(1,len(pywordnet.getWord(s,POS).getSenses())+1)}
                    if s and Options.Spellcheck:
                        logger.warning('extractSurfaceSemantics : Found spelling correction %s for %s', s,text)
                        text = s
                except KeyError:
                    logger.error('extractSurfaceSemantics : Unknown token: %s', text)
                    return default

    # Handle experienced typos.
    if 'see' in tokenSenses:
        ### FIXME adding to dict for typos that are other words
        text = tokenSenses['see']
        try:
            tokenSenses = Senses[text]
        except KeyError:
            return default
    # Handle morphology variants that wordnet understands.
    elif isinstance(tokenSenses, tuple):
        # NOTE(review): indexing a tuple with the POS string raises
        # TypeError, so this branch cannot succeed as written.  The check
        # was probably meant for tokenSenses[POS] -- confirm against the
        # data stored in Senses before changing it.
        text,tokenSenses[POS] = tokenSenses[POS]

    try:
        return '_'.join([text,POS,','.join([str(i) for i in tokenSenses[POS]])])
    except KeyError:
        # No senses under the requested POS: use the first POS that has some.
        availablePOS = list(tokenSenses.keys())
        if availablePOS:
            POS = token['POS'] = availablePOS[0]
            return '_'.join([text,POS,','.join([str(i) for i in tokenSenses[POS]])])
    except Exception as e:
        logger.error('extractSurfaceSemantics: %s: Could not find sense %s for token %s',
                  e, POS, token) #tokenSenses, text
    return None
示例#4
0
def extractSurfaceSemantics(token, parent):
    """Build the ``text_POS_senses`` surface-semantics string for a token.

    The part of speech comes from ``getPartOfSpeech(token, parent)``.  Only
    the POS tags N, V, ADV and ADJ are processed.  Candidate senses are
    looked up in this order, moving on whenever a lookup raises ``KeyError``:

    1. the module-level ``Senses`` table, keyed by the lower-cased text;
    2. WordNet (``pywordnet``), using every sense number of the word;
    3. ``Senses`` again after stripping a trailing ``s`` or stemming;
    4. spelling suggestions from ``enchant``.

    Returns:
        ``"<text>_<POS>_<n1>,<n2>,..."`` on success; the upper-cased token
        text when the token cannot be resolved at all; ``None`` when the POS
        is not one of the four handled tags, when no senses are available,
        or when formatting the result fails (the failure is logged).

    Side effects:
        May assign ``token["POS"]`` when the senses found belong to a
        different POS than the one requested, and calls
        ``stemmer.stem(token)`` before reading ``token["STEM"]``.
    """
    POS = getPartOfSpeech(token, parent)
    tokenSenses = {}
    text = token["TEXT"].lower()
    default = token["TEXT"].upper()
    if POS not in ("N", "V", "ADV", "ADJ"):
        return None

    # TODO: redo this cascade as a loop over lookup strategies.
    try:
        tokenSenses = Senses[text]
    except KeyError:
        logger.warning("extractSurfaceSemantics : Text not in tagged senses: %s", text)
        try:
            # Untagged word that WordNet knows: offer every sense number.
            tokenSenses = {POS: range(1, len(pywordnet.getWord(text, POS).getSenses()) + 1)}
        except KeyError:
            try:
                logger.warning("extractSurfaceSemantics : Inflected version of WordNet word? %s", text)
                if text.endswith("s"):
                    text = text[:-1]
                    tokenSenses = Senses[text]
                else:
                    stemmer = PorterStemmer()  # Update WordNetStemmer to NLTK 1.4 API
                    # presumably stem() stores its result under token["STEM"] -- verify
                    stemmer.stem(token)
                    text = token["STEM"]
                    tokenSenses = Senses[text]
            except KeyError:
                # Undo any stripping/stemming before trying spelling fixes.
                text = token["TEXT"].lower()
                try:
                    logger.warning("extractSurfaceSemantics : Misspelling / typo of WordNet word? %s", text)
                    spellchecker = enchant.DictWithPWL("en_US", Lexicon)
                    # Ask once; the list is reused by every step below.
                    suggestions = spellchecker.suggest(text)
                    s = ""
                    for s in suggestions:
                        if s in Senses:
                            tokenSenses = Senses[s]
                            break
                    if not tokenSenses and suggestions:
                        # No suggestion is tagged: fall back to WordNet for
                        # the top suggestion (KeyError here means unknown).
                        s = suggestions[0]
                        tokenSenses = {POS: range(1, len(pywordnet.getWord(s, POS).getSenses()) + 1)}
                    if s and Options.Spellcheck:
                        logger.warning("extractSurfaceSemantics : Found spelling correction %s for %s", s, text)
                        text = s
                except KeyError:
                    logger.error("extractSurfaceSemantics : Unknown token: %s", text)
                    return default

    # Handle experienced typos.
    if "see" in tokenSenses:
        ### FIXME adding to dict for typos that are other words
        text = tokenSenses["see"]
        try:
            tokenSenses = Senses[text]
        except KeyError:
            return default
    # Handle morphology variants that wordnet understands.
    elif isinstance(tokenSenses, tuple):
        # NOTE(review): indexing a tuple with the POS string raises
        # TypeError, so this branch cannot succeed as written.  The check
        # was probably meant for tokenSenses[POS] -- confirm against the
        # data stored in Senses before changing it.
        text, tokenSenses[POS] = tokenSenses[POS]

    try:
        return "_".join([text, POS, ",".join([str(i) for i in tokenSenses[POS]])])
    except KeyError:
        # No senses under the requested POS: use the first POS that has some.
        availablePOS = list(tokenSenses.keys())
        if availablePOS:
            POS = token["POS"] = availablePOS[0]
            return "_".join([text, POS, ",".join([str(i) for i in tokenSenses[POS]])])
    except Exception as e:
        logger.error(
            "extractSurfaceSemantics: %s: Could not find sense %s for token %s", e, POS, token
        )  # tokenSenses, text
    return None