def yield_17_candidates(corpus):
    """Yield corpus words that exist in WordNet 1.7.1 but not in 1.6 and
    that fall into exactly one noun lexicographer class (supersense)."""
    wn16 = WordNetCorpusReader('wordnet/1.6/', None)
    wn17 = WordNetCorpusReader('wordnet/1.7.1/', None)

    for w in corpus.get_unique_words():
        synsets17 = wn17.synsets(w)
        lexclasses = list(set(s.lexname() for s in synsets17))
        if wn16.synsets(w):
            continue  # already present in WordNet 1.6
        if len(lexclasses) != 1:
            continue  # spans more than one lexicographer class
        if 'noun' not in lexclasses[0]:
            continue  # keep noun supersenses only
        yield w
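lexname() returns the synset's lexicographer file name (a supersense such as noun.animal), which is what the 'noun' membership test above relies on. A minimal illustration with the stock NLTK WordNet (assumes the standard wordnet corpus is installed):

from nltk.corpus import wordnet as wn

print(wn.synsets('dog')[0].lexname())  # e.g. 'noun.animal'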
Example No. 2
class Antonimos(Feature):
    def __init__(self):
        super(Antonimos, self).__init__()
        self.nombre = "Antonimos"
        self.descripcion = """
            Mide la cantidad de pares de antónimos presentes en el texto.
        """
        self.thread_safe = False  # Tiene problemas de concurrencia: https://github.com/nltk/nltk/issues/803
        self.wncr = WordNetCorpusReader(resource_filename('clasificador.recursos', 'wordnet_spa'), None)

    def calcular_feature(self, tweet):
        oraciones = Freeling.procesar_texto(remover_hashtags(remover_usuarios(tweet.texto)))
        tokens = Freeling.get_tokens_de_oraciones(oraciones)

        cant_antonimos = 0

        for token in tokens:
            antonimos = []
            for synset in self.wncr.synsets(token.lemma):
                for lemma in synset.lemmas():
                    antonimos += [lemma_antonimo.name() for lemma_antonimo in lemma.antonyms()]

            for otro_token in tokens:
                if otro_token.lemma in antonimos:
                    cant_antonimos += 1
                    break

        if len(tokens) == 0:
            return 0
        else:
            return cant_antonimos / math.sqrt(len(tokens)) / 2.0  # divide by 2 so each antonym pair is counted once
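The core antonym lookup used above can be reproduced with the stock English WordNet; a hedged standalone sketch (antonyms_of is a hypothetical helper, not part of the class):

from nltk.corpus import wordnet as wn

def antonyms_of(word):
    """Collect antonym lemma names across every synset of a word."""
    antonyms = set()
    for synset in wn.synsets(word):
        for lemma in synset.lemmas():
            antonyms.update(a.name() for a in lemma.antonyms())
    return antonyms

print(antonyms_of('good'))  # e.g. a set containing 'bad' and 'evil'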
Example No. 3
def main():
    args = parse_args()
    ft_vec = FasttextVectorizer(args.fasttext_path)

    if args.data_path:
        # read data
        with open(args.data_path, 'r', encoding='utf-8') as f:
            dataset = [
                line.split("\t")[1].replace(" ", "_")
                for line in f.read().split("\n") if line
            ]

        # vectorize wordnet
        if "wordnet" in args:  # argparse.Namespace supports "in" for checking defined attributes
            wn = WordNetCorpusReader(args.wordnet, None)
            for word in dataset:
                print(word, wn.synsets(word, pos=args.pos))
        else:
            ft_vec.vectorize_multiword_data(dataset,
                                            args.output_path,
                                            to_upper=False)

    elif args.data_dir:
        for system_dir in os.listdir(args.data_dir):
            for dirpath, _, filenames in os.walk(
                    os.path.join(args.data_dir, system_dir, args.language)):
                for filename in filenames:
                    if filename.endswith(".terms"):
                        input_path = os.path.join(dirpath, filename)
                        os.makedirs(os.path.join(args.output_path, system_dir),
                                    exist_ok=True)
                        output_path = os.path.join(
                            args.output_path, system_dir,
                            filename.replace(".terms", ".txt").replace(
                                system_dir + "_", ""))
                        with open(input_path, 'r', encoding='utf-8') as f:
                            dataset = [
                                line.split("\t")[1].replace(" ", "_")
                                for line in f.read().split("\n") if line
                            ]
                        ft_vec.vectorize_multiword_data(dataset,
                                                        output_path,
                                                        to_upper=False)
                        print(f"Processed: {filename}")
    else:
        raise Exception("Please specify either --data_dir or --data_path")
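The dataset parsing above assumes a two-column TSV whose second column holds the term; spaces become underscores so multiword terms map to single tokens, matching WordNet's convention for multiword lemmas. A small illustration with hypothetical data:

line = "42\tgolden retriever"
term = line.split("\t")[1].replace(" ", "_")
print(term)  # -> 'golden_retriever'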
Example No. 4
class TestTransform(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.languages = ["cat", "eng", "eus", "glg", "spa"]
        cls.wn_names = {}
        for lang in cls.languages:
            cls.wn_names[lang] = '.wordnet_' + lang
            with tarfile.open('wordnet_' + lang + '.tar.gz') as f:
                f.extractall(cls.wn_names[lang])

    def test_all_synsets(self):
        self.wncr = WordNetCorpusReader(self.wn_names['spa'], None)
        for synset in self.wncr.all_synsets():
            a = synset
        # success if there is no error
        # This will also test that all synsets in data files are in index files.

    def test_invalid_literal_for_int_16(self):
        self.wncr = WordNetCorpusReader(self.wn_names['spa'], None)
        for synset in self.wncr.synsets("agudeza"):
            a = synset
        # self.wncr._synset_from_pos_and_line('n',
        #                                     "04122387 00 n 0a agudeza 0 broma 0 chiste 0 chufleta 0 comentario_burlón 0 cuchufleta 0 idea 0 ocurrencia 0 pulla 0 salida 0 04 @ 04120601 n 0000 + 00620096 v 0000 + 00499330 v 0000 + 00558467 v 0000 | comentario ingenioso para hacer reír  \n")
        # # success if there is no error

    def test_key_error(self):
        self.wncr = WordNetCorpusReader(self.wn_names['spa'], None)
        self.wncr.lemma("menor.a.09.menor").antonyms()
        # success if there is no error

    def test_load_wordnet(self):
        for lang in self.languages:
            self.wncr = WordNetCorpusReader(self.wn_names[lang], None)
            # success if there is no error

    @classmethod
    def tearDownClass(cls):
        for lang in cls.languages:
            shutil.rmtree(cls.wn_names[lang])
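All of these tests construct the reader as WordNetCorpusReader(root, None): in recent NLTK releases the second parameter is an Open Multilingual WordNet reader, and passing None simply disables multilingual lookups. A minimal standalone load (the path is hypothetical):

from nltk.corpus.reader.wordnet import WordNetCorpusReader

wncr = WordNetCorpusReader('.wordnet_spa', None)  # None: no OMW reader attached
print(sum(1 for _ in wncr.all_synsets('n')))  # iterate noun synsets to check the data files load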
Example No. 6
class WNAffect(object):
    """WordNet-Affect resource."""
    def __init__(self, wordnet16_dir=None, wn_domains_dir=None):
        """Initializes the WordNet-Affect object."""
        wordnet16_dir = wordnet16_dir or join(dirname(__file__), "wordnet-1.6")
        wn_domains_dir = wn_domains_dir or join(dirname(__file__),
                                                "wn-domains-3.2")
        cwd = os.getcwd()
        nltk.data.path.append(cwd)
        wn16_path = "{0}/dict".format(wordnet16_dir)
        self.wn16 = WordNetCorpusReader(
            os.path.abspath("{0}/{1}".format(cwd, wn16_path)),
            nltk.data.find(wn16_path))
        self.flat_pos = {
            'NN': 'NN',
            'NNS': 'NN',
            'JJ': 'JJ',
            'JJR': 'JJ',
            'JJS': 'JJ',
            'RB': 'RB',
            'RBR': 'RB',
            'RBS': 'RB',
            'VB': 'VB',
            'VBD': 'VB',
            'VGB': 'VB',
            'VBN': 'VB',
            'VBP': 'VB',
            'VBZ': 'VB'
        }
        self.wn_pos = {
            'NN': self.wn16.NOUN,
            'JJ': self.wn16.ADJ,
            'VB': self.wn16.VERB,
            'RB': self.wn16.ADV
        }
        self._load_emotions(wn_domains_dir)
        self.synsets = self._load_synsets(wn_domains_dir)

    def _load_synsets(self, wn_domains_dir):
        """Returns a dictionary POS tag -> synset offset -> emotion (str -> int -> str)."""

        tree = ET.parse(
            "{0}/wn-affect-1.1/a-synsets.xml".format(wn_domains_dir))
        root = tree.getroot()
        pos_map = {"noun": "NN", "adj": "JJ", "verb": "VB", "adv": "RB"}

        synsets = {}
        for pos in ["noun", "adj", "verb", "adv"]:
            tag = pos_map[pos]
            synsets[tag] = {}
            for elem in root.findall(".//{0}-syn-list//{0}-syn".format(pos)):
                offset = int(elem.get("id")[2:])
                if not offset: continue
                if elem.get("categ"):
                    synsets[tag][offset] = WNAffectEmotion.emotions[elem.get(
                        "categ")] if elem.get(
                            "categ") in WNAffectEmotion.emotions else None
                elif elem.get("noun-id"):
                    synsets[tag][offset] = synsets[pos_map["noun"]][int(
                        elem.get("noun-id")[2:])]

        return synsets

    def _load_emotions(self, wn_domains_dir):
        """Loads the hierarchy of emotions from the WordNet-Affect xml."""

        tree = ET.parse(
            "{0}/wn-affect-1.1/a-hierarchy.xml".format(wn_domains_dir))
        root = tree.getroot()
        for elem in root.findall("categ"):
            name = elem.get("name")
            if name == "root":
                WNAffectEmotion.emotions["root"] = WNAffectEmotion("root")
            else:
                WNAffectEmotion.emotions[name] = WNAffectEmotion(
                    name, elem.get("isa"))

    def get_emotion(self, word, pos):
        """Returns the emotion of the word.
            word -- the word (str)
            pos -- part-of-speech (str)
        """

        if pos in self.flat_pos:
            pos = self.flat_pos[pos]
            synsets = self.wn16.synsets(word, self.wn_pos[pos])
            if synsets:
                for synset in synsets:
                    offset = synset.offset()
                    if offset in self.synsets[pos]:
                        return self.synsets[pos][offset]
        return None

    def get_emotion_synset(self, offset):
        """Returns the emotion of the synset.
            offset -- synset offset (int)
        """

        for pos in self.flat_pos.values():
            if offset in self.synsets[pos]:
                return self.synsets[pos][offset]
        return None
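A hedged usage sketch for the class above (the directory names are the defaults assumed in __init__):

wna = WNAffect('wordnet-1.6', 'wn-domains-3.2')
emotion = wna.get_emotion('angry', 'JJ')  # Penn Treebank tag, flattened internally to JJ
print(emotion)  # a WNAffectEmotion node from the hierarchy, or None if no affect sense matches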
Example No. 7
def yield_single_supersense_nouns_in_corpus(corpus):
    wn = WordNetCorpusReader('wordnet/1.6/', None)
    for w in corpus.get_unique_words():
        lexclasses = list(set(s.lexname() for s in wn.synsets(w)))
        if len(lexclasses) == 1 and 'noun' in lexclasses[0]:
            yield w
Example No. 8
def yield_single_sense_nouns_in_corpus(corpus):
    wn = WordNetCorpusReader('wordnet/1.6/', None)
    for word in corpus.get_unique_words():
        synsets = list(wn.synsets(word))
        if len(synsets) == 1 and 'noun' in synsets[0].lexname():
            yield word
Example No. 9
        c1_list = []
        for c in c1:
            if c in model.wv.vocab:
                c1_list.append(model[c])
        c1_vec = sum(c1_list)

        c2 = re.sub(r'[^a-zA-Z ]', '', df['Context2'][i]).lower().split(' ')
        c2_list = []
        for c in c2:
            if c in model.wv.vocab:
                c2_list.append(model[c])
        c2_vec = sum(c2_list)
        for elem in mapping:
            sim1 = 0
            # for s1 in WN17.synsets(df['Word1'][i]):
            for s1 in WN17.synsets(df['Word1'][i], pos=df['POS1'][i]):
                if s1.name() in S[elem] and S[elem][s1.name()] != 0:
                    v1 = np.array(S[elem][s1.name()])
                    # cosine similarity between the sense vector and the context vector
                    temp1 = sum(v1 * c1_vec) / np.sqrt(sum(v1 * v1) * sum(c1_vec * c1_vec))
                else:
                    temp1 = 0

                if temp1 > sim1:
                    s1c = s1.name()
                    sim1 = temp1
            if sim1 == 0:
                Lists[elem + '_s1'].append('None')
            else:
                Lists[elem + '_s1'].append(s1c)
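The long expression above is a hand-rolled cosine similarity; an equivalent helper using numpy's dot product (pure illustration, not part of the original script):

import numpy as np

def cosine(u, v):
    """Cosine similarity between two 1-D numpy vectors."""
    return np.dot(u, v) / np.sqrt(np.dot(u, u) * np.dot(v, v))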
Example No. 10
    Lists = {}

    for elem in mapping:
        Lists[elem] = []

    Temps = {}
    S1 = {}
    S2 = {}

    for elem in mapping:
        print(elem)
        for i in range(353):  # one row per word pair; 353 suggests the WordSim-353 benchmark
            Temps[elem] = [-1]  # -1 sentinel keeps max() defined even if a word has no synsets

            for s1 in WN17.synsets(df['Word 1'][i]):
                S1[elem] = np.array(S[elem][s1.name()])
                for s2 in WN17.synsets(df['Word 2'][i]):
                    S2[elem] = np.array(S[elem][s2.name()])
                    Temps[elem].append(
                        sum(S1[elem] * S2[elem]) / np.sqrt(
                            sum(S1[elem] * S1[elem]) *
                            sum(S2[elem] * S2[elem])))
            Lists[elem].append(max(Temps[elem]))
            print(i)

        df[elem] = Lists[elem]

    df.to_csv('updating2.csv')

    df = pd.read_csv('updating2.csv')
Example No. 11
class EmotionTextPlugin(EmotionPlugin):
    
    def __init__(self, info, *args, **kwargs):
        super(EmotionTextPlugin, self).__init__(info, *args, **kwargs)
        self.id = info['module']
        self.info = info
        self._stopwords = stopwords.words('english')
        local_path=os.path.dirname(os.path.abspath(__file__))
        self._categories = {'anger': ['general-dislike',],
                            'fear': ['negative-fear',],
                            'disgust': ['shame',],
                            'joy': ['gratitude','affective','enthusiasm','love','joy','liking'],
                            'sadness': ['ingrattitude','daze','humility','compassion','despair','anxiety','sadness']}

        self._wnaffect_mappings = {'anger': 'anger',
                                   'fear': 'negative-fear',
                                   'disgust': 'disgust',
                                   'joy': 'joy',
                                   'sadness': 'sadness'}

        self._load_emotions(local_path+self.info['hierarchy_path'])     
        self._total_synsets = self._load_synsets(local_path+self.info['synsets_path'])
        self._wn16_path = local_path+self.info['wn16_path']
        self._wn16 = WordNetCorpusReader(os.path.abspath("{0}".format(self._wn16_path)), nltk.data.find(self._wn16_path))
        

    def _load_synsets(self, synsets_path):
        """Returns a dictionary POS tag -> synset offset -> emotion (str -> int -> str)."""
        tree = ET.parse(synsets_path)
        root = tree.getroot()
        pos_map = { "noun": "NN", "adj": "JJ", "verb": "VB", "adv": "RB" }

        synsets = {}
        for pos in ["noun", "adj", "verb", "adv"]:
            tag = pos_map[pos]
            synsets[tag] = {}
            for elem in root.findall(".//{0}-syn-list//{0}-syn".format(pos)):
                offset = int(elem.get("id")[2:])                
                if not offset: continue
                if elem.get("categ"):
                    synsets[tag][offset] = Emo.emotions[elem.get("categ")] if elem.get("categ") in Emo.emotions else None
                elif elem.get("noun-id"):
                    synsets[tag][offset] = synsets[pos_map["noun"]][int(elem.get("noun-id")[2:])]
        return synsets

    def _load_emotions(self, hierarchy_path):
        """Loads the hierarchy of emotions from the WordNet-Affect xml."""

        tree = ET.parse(hierarchy_path)
        root = tree.getroot()
        for elem in root.findall("categ"):
            name = elem.get("name")
            if name == "root":
                Emo.emotions["root"] = Emo("root")
            else:
                Emo.emotions[name] = Emo(name, elem.get("isa"))

    def activate(self, *args, **kwargs):
        logger.info("EmoText plugin is ready to go!")

    def deactivate(self, *args, **kwargs):

        logger.info("EmoText plugin is being deactivated...")

    def _my_preprocessor(self, text):

        # dots are escaped so they match a literal '.' in URLs
        regHttp = re.compile(r'(http://)[a-zA-Z0-9]*\.[a-zA-Z0-9/]*(\.[a-zA-Z0-9]*)?')
        regHttps = re.compile(r'(https://)[a-zA-Z0-9]*\.[a-zA-Z0-9/]*(\.[a-zA-Z0-9]*)?')
        regAt = re.compile(r'@([a-zA-Z0-9]*[*_/&%#@$]*)*[a-zA-Z0-9]*')
        text = re.sub(regHttp, '', text)
        text = re.sub(regAt, '', text)
        text = re.sub('RT : ', '', text)
        text = re.sub(regHttps, '', text)
        text = re.sub('[0-9]', '', text)
        text = self._delete_punctuation(text)
        return text

    def _delete_punctuation(self, text):

        exclude = set(string.punctuation)
        s = ''.join(ch for ch in text if ch not in exclude)
        return s

    def _extract_ngrams(self, text):

        unigrams_lemmas = []
        pos_tagged = []
        unigrams_words = []
        sentences = parse(text,lemmata=True).split()
        for sentence in sentences:
            for token in sentence:
                if token[0].lower() not in self._stopwords:
                    unigrams_words.append(token[0].lower())
                    unigrams_lemmas.append(token[4])  
                    pos_tagged.append(token[1])        

        return unigrams_words,unigrams_lemmas,pos_tagged

    def _find_ngrams(self, input_list, n):
        return zip(*[input_list[i:] for i in range(n)])

    def _clean_pos(self, pos_tagged):

        pos_tags={'NN':'NN', 'NNP':'NN','NNP-LOC':'NN', 'NNS':'NN', 'JJ':'JJ', 'JJR':'JJ', 'JJS':'JJ', 'RB':'RB', 'RBR':'RB',
        'RBS':'RB', 'VB':'VB', 'VBD':'VB', 'VGB':'VB', 'VBN':'VB', 'VBP':'VB', 'VBZ':'VB'}

        for i in range(len(pos_tagged)):
            if pos_tagged[i] in pos_tags:
                pos_tagged[i]=pos_tags[pos_tagged[i]]
        return pos_tagged
    
    def _extract_features(self, text):

        feature_set={k:0 for k in self._categories}
        ngrams_words,ngrams_lemmas,pos_tagged = self._extract_ngrams(text)
        matches=0
        pos_tagged=self._clean_pos(pos_tagged)

        tag_wn={'NN':self._wn16.NOUN,'JJ':self._wn16.ADJ,'VB':self._wn16.VERB,'RB':self._wn16.ADV}
        for i in range(len(pos_tagged)):
            if pos_tagged[i] in tag_wn:
                synsets = self._wn16.synsets(ngrams_words[i], tag_wn[pos_tagged[i]])   
                if synsets:
                    offset = synsets[0].offset()
                    if offset in self._total_synsets[pos_tagged[i]]:
                        if self._total_synsets[pos_tagged[i]][offset] is None:
                            continue
                        else:
                            emotion = self._total_synsets[pos_tagged[i]][offset].get_level(5).name
                            matches+=1
                            for cat in self._categories:
                                if emotion in self._categories[cat]:
                                    feature_set[cat] += 1
        if matches == 0:
            matches=1                

        for i in feature_set:
            feature_set[i] = (feature_set[i]/matches)*100

        return feature_set

    def analyse(self, **params):

        logger.debug("Analysing with params {}".format(params))

        text_input = params.get("input", None)

        text=self._my_preprocessor(text_input)

        feature_text=self._extract_features(text)

        response = Results()

        entry = Entry(id="Entry",
                      text=text_input)
        emotionSet = EmotionSet(id="Emotions0")
        emotions = emotionSet.onyx__hasEmotion

        for i in feature_text:
            emotions.append(Emotion(onyx__hasEmotionCategory=self._wnaffect_mappings[i],
                                    onyx__hasEmotionIntensity=feature_text[i]))

        entry.emotions = [emotionSet]
        response.entries.append(entry)
        return response
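A hedged sketch of driving the plugin (the info dict keys mirror the attributes read in __init__; the values are hypothetical):

info = {
    'module': 'emotion-wnaffect',          # hypothetical descriptor
    'hierarchy_path': '/wn-affect-1.1/a-hierarchy.xml',
    'synsets_path': '/wn-affect-1.1/a-synsets.xml',
    'wn16_path': '/wordnet-1.6/dict',
}
plugin = EmotionTextPlugin(info)
response = plugin.analyse(input="I am so happy today")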
Example No. 12
class WNAffect:
    """WordNet-Affect ressource."""
    
    def __init__(self, wordnet16_dir, wn_domains_dir):
        """Initializes the WordNet-Affect object."""
        
        cwd = os.getcwd()
        nltk.data.path.append(cwd)
        wn16_path = "{0}/dict".format(wordnet16_dir)
        self.wn16 = WordNetCorpusReader(os.path.abspath("{0}/{1}".format(cwd, wn16_path)), nltk.data.find(wn16_path))
        self.flat_pos = {'NN':'NN', 'NNS':'NN', 'JJ':'JJ', 'JJR':'JJ', 'JJS':'JJ', 'RB':'RB', 'RBR':'RB', 'RBS':'RB', 'VB':'VB', 'VBD':'VB', 'VGB':'VB', 'VBN':'VB', 'VBP':'VB', 'VBZ':'VB'}
        self.wn_pos = {'NN':self.wn16.NOUN, 'JJ':self.wn16.ADJ, 'VB':self.wn16.VERB, 'RB':self.wn16.ADV}
        self._load_emotions(wn_domains_dir)
        self.synsets = self._load_synsets(wn_domains_dir)
        


    def _load_synsets(self, wn_domains_dir):
        """Returns a dictionary POS tag -> synset offset -> emotion (str -> int -> str)."""
        
        tree = ET.parse("{0}/a-synsets.xml".format(wn_domains_dir))
        root = tree.getroot()
        pos_map = { "noun": "NN", "adj": "JJ", "verb": "VB", "adv": "RB" }
    
        synsets = {}
        for pos in ["noun", "adj", "verb", "adv"]:
            tag = pos_map[pos]
            synsets[tag] = {}
            for elem in root.findall(".//{0}-syn-list//{0}-syn".format(pos)):
                offset = int(elem.get("id")[2:])                
                if not offset: continue
                if elem.get("categ"):
                    synsets[tag][offset] = Emotion.emotions[elem.get("categ")] if elem.get("categ") in Emotion.emotions else None
                elif elem.get("noun-id"):
                    synsets[tag][offset] = synsets[pos_map["noun"]][int(elem.get("noun-id")[2:])]
    
        return synsets
        
    def _load_emotions(self, wn_domains_dir):
        """Loads the hierarchy of emotions from the WordNet-Affect xml."""
        
        tree = ET.parse("{0}/a-hierarchy.xml".format(wn_domains_dir))
        root = tree.getroot()
        for elem in root.findall("categ"):
            name = elem.get("name")
            if name == "root":
                Emotion.emotions["root"] = Emotion("root")
            else:
                Emotion.emotions[name] = Emotion(name, elem.get("isa"))
    
    def get_emotion(self, word, pos):
        """Returns the emotion of the word.
            word -- the word (str)
            pos -- part-of-speech (str)
        """
        
        if pos in self.flat_pos:
            pos = self.flat_pos[pos]
            synsets = self.wn16.synsets(word, self.wn_pos[pos])
            if synsets:
                # unlike the Example No. 6 variant, only the first (most frequent) sense is checked
                offset = synsets[0].offset()
                if offset in self.synsets[pos]:
                    return self.synsets[pos][offset]
        return None
Example No. 13
class WNAffect(EmotionPlugin, ShelfMixin):
    '''
    Emotion classifier using WordNet-Affect to calculate the percentage
    of each emotion. This plugin classifies among six emotions (anger, fear,
    disgust, joy, sadness) or neutral. The only available language is English (en).
    '''
    name = 'emotion-wnaffect'
    author = ["@icorcuera", "@balkian"]
    version = '0.2'
    extra_params = {
        'language': {
            "@id": 'lang_wnaffect',
            'description': 'language of the input',
            'aliases': ['language', 'l'],
            'required': True,
            'options': [
                'en',
            ]
        }
    }
    synsets_path = "a-synsets.xml"
    hierarchy_path = "a-hierarchy.xml"
    wn16_path = "wordnet1.6/dict"
    onyx__usesEmotionModel = "emoml:big6"
    nltk_resources = ['stopwords', 'averaged_perceptron_tagger', 'wordnet']

    def _load_synsets(self, synsets_path):
        """Returns a dictionary POS tag -> synset offset -> emotion (str -> int -> str)."""
        tree = ET.parse(synsets_path)
        root = tree.getroot()
        pos_map = {"noun": "NN", "adj": "JJ", "verb": "VB", "adv": "RB"}

        synsets = {}
        for pos in ["noun", "adj", "verb", "adv"]:
            tag = pos_map[pos]
            synsets[tag] = {}
            for elem in root.findall(".//{0}-syn-list//{0}-syn".format(pos)):
                offset = int(elem.get("id")[2:])
                if not offset: continue
                if elem.get("categ"):
                    synsets[tag][offset] = Emo.emotions[elem.get(
                        "categ")] if elem.get(
                            "categ") in Emo.emotions else None
                elif elem.get("noun-id"):
                    synsets[tag][offset] = synsets[pos_map["noun"]][int(
                        elem.get("noun-id")[2:])]
        return synsets

    def _load_emotions(self, hierarchy_path):
        """Loads the hierarchy of emotions from the WordNet-Affect xml."""

        tree = ET.parse(hierarchy_path)
        root = tree.getroot()
        for elem in root.findall("categ"):
            name = elem.get("name")
            if name == "root":
                Emo.emotions["root"] = Emo("root")
            else:
                Emo.emotions[name] = Emo(name, elem.get("isa"))

    def activate(self, *args, **kwargs):

        self._stopwords = stopwords.words('english')
        self._wnlemma = wordnet.WordNetLemmatizer()
        self._syntactics = {'N': 'n', 'V': 'v', 'J': 'a', 'S': 's', 'R': 'r'}
        local_path = os.environ.get("SENPY_DATA")
        self._categories = {
            'anger': [
                'general-dislike',
            ],
            'fear': [
                'negative-fear',
            ],
            'disgust': [
                'shame',
            ],
            'joy':
            ['gratitude', 'affective', 'enthusiasm', 'love', 'joy', 'liking'],
            'sadness': [
                'ingrattitude', 'daze', 'humility', 'compassion', 'despair',
                'anxiety', 'sadness'
            ]
        }

        self._wnaffect_mappings = {
            'anger': 'anger',
            'fear': 'negative-fear',
            'disgust': 'disgust',
            'joy': 'joy',
            'sadness': 'sadness'
        }

        self._load_emotions(self.find_file(self.hierarchy_path))

        if 'total_synsets' not in self.sh:
            total_synsets = self._load_synsets(
                self.find_file(self.synsets_path))
            self.sh['total_synsets'] = total_synsets

        self._total_synsets = self.sh['total_synsets']

        self._wn16_path = self.wn16_path
        self._wn16 = WordNetCorpusReader(
            self.find_file(self._wn16_path),
            nltk.data.find(self.find_file(self._wn16_path)))

    def deactivate(self, *args, **kwargs):
        self.save()

    def _my_preprocessor(self, text):

        # dots are escaped so they match a literal '.' in URLs
        regHttp = re.compile(
            r'(http://)[a-zA-Z0-9]*\.[a-zA-Z0-9/]*(\.[a-zA-Z0-9]*)?')
        regHttps = re.compile(
            r'(https://)[a-zA-Z0-9]*\.[a-zA-Z0-9/]*(\.[a-zA-Z0-9]*)?')
        regAt = re.compile(r'@([a-zA-Z0-9]*[*_/&%#@$]*)*[a-zA-Z0-9]*')
        text = re.sub(regHttp, '', text)
        text = re.sub(regAt, '', text)
        text = re.sub('RT : ', '', text)
        text = re.sub(regHttps, '', text)
        text = re.sub('[0-9]', '', text)
        text = self._delete_punctuation(text)
        return text

    def _delete_punctuation(self, text):

        exclude = set(string.punctuation)
        s = ''.join(ch for ch in text if ch not in exclude)
        return s

    def _extract_ngrams(self, text):

        unigrams_lemmas = []
        pos_tagged = []
        unigrams_words = []
        tokens = text.split()
        for token in nltk.pos_tag(tokens):
            unigrams_words.append(token[0])
            pos_tagged.append(token[1])
            if token[1][0] in self._syntactics.keys():
                unigrams_lemmas.append(
                    self._wnlemma.lemmatize(token[0],
                                            self._syntactics[token[1][0]]))
            else:
                unigrams_lemmas.append(token[0])

        return unigrams_words, unigrams_lemmas, pos_tagged

    def _find_ngrams(self, input_list, n):
        return zip(*[input_list[i:] for i in range(n)])

    def _clean_pos(self, pos_tagged):

        pos_tags = {
            'NN': 'NN',
            'NNP': 'NN',
            'NNP-LOC': 'NN',
            'NNS': 'NN',
            'JJ': 'JJ',
            'JJR': 'JJ',
            'JJS': 'JJ',
            'RB': 'RB',
            'RBR': 'RB',
            'RBS': 'RB',
            'VB': 'VB',
            'VBD': 'VB',
            'VGB': 'VB',
            'VBN': 'VB',
            'VBP': 'VB',
            'VBZ': 'VB'
        }

        for i in range(len(pos_tagged)):
            if pos_tagged[i] in pos_tags:
                pos_tagged[i] = pos_tags[pos_tagged[i]]
        return pos_tagged

    def _extract_features(self, text):

        feature_set = {k: 0 for k in self._categories}
        ngrams_words, ngrams_lemmas, pos_tagged = self._extract_ngrams(text)
        matches = 0
        pos_tagged = self._clean_pos(pos_tagged)

        tag_wn = {
            'NN': self._wn16.NOUN,
            'JJ': self._wn16.ADJ,
            'VB': self._wn16.VERB,
            'RB': self._wn16.ADV
        }
        for i in range(len(pos_tagged)):
            if pos_tagged[i] in tag_wn:
                synsets = self._wn16.synsets(ngrams_words[i],
                                             tag_wn[pos_tagged[i]])
                if synsets:
                    offset = synsets[0].offset()
                    if offset in self._total_synsets[pos_tagged[i]]:
                        if self._total_synsets[pos_tagged[i]][offset] is None:
                            continue
                        else:
                            emotion = self._total_synsets[
                                pos_tagged[i]][offset].get_level(5).name
                            matches += 1
                            for cat in self._categories:
                                if emotion in self._categories[cat]:
                                    feature_set[cat] += 1
        if matches == 0:
            matches = 1

        for i in feature_set:
            feature_set[i] = (feature_set[i] / matches)

        return feature_set

    def analyse_entry(self, entry, activity):
        params = activity.params

        text_input = entry['nif:isString']

        text = self._my_preprocessor(text_input)

        feature_text = self._extract_features(text)

        emotionSet = EmotionSet(id="Emotions0")
        emotions = emotionSet.onyx__hasEmotion

        for i in feature_text:
            emotions.append(
                Emotion(onyx__hasEmotionCategory=self._wnaffect_mappings[i],
                        onyx__hasEmotionIntensity=feature_text[i]))

        entry.emotions = [emotionSet]

        yield entry

    def test(self, *args, **kwargs):
        results = list()
        params = {
            'algo': 'emotion-wnaffect',
            'intype': 'direct',
            'expanded-jsonld': 0,
            'informat': 'text',
            'prefix': '',
            'plugin_type': 'analysisPlugin',
            'urischeme': 'RFC5147String',
            'outformat': 'json-ld',
            'i': 'Hello World',
            'input': 'Hello World',
            'conversion': 'full',
            'language': 'en',
            'algorithm': 'emotion-wnaffect'
        }

        self.activate()
        texts = {
            'I hate you': 'anger',
            'i am sad': 'sadness',
            'i am happy with my marks': 'joy',
            'This movie is scary': 'negative-fear'
        }

        for text in texts:
            response = next(
                self.analyse_entry(Entry(nif__isString=text),
                                   self.activity(params)))
            expected = texts[text]
            emotionSet = response.emotions[0]
            max_emotion = max(emotionSet['onyx:hasEmotion'],
                              key=lambda x: x['onyx:hasEmotionIntensity'])
            assert max_emotion['onyx:hasEmotionCategory'] == expected
Example No. 14
class WordNetDomains:
    """
    API wrapping some functionality around WordNetDomains. WordNetDomains works with WordNet2.0 (and currently not with
    WordNet3.0). This class assumes you have downloaded WordNet2.0 and WordNetDomains and that they are on the same
    data home.
    WordNet2.0 can be downloaded at https://wordnetcode.princeton.edu/2.0/WordNet-2.0.tar.gz
    WordNetDomains can be downloaded from the home project page at http://wndomains.fbk.eu/index.html (it requires
    some permissions that can be granted by filling an online form).
    See http://wndomains.fbk.eu/index.html for more information.
    """
    def __init__(self, wordnet_home):
        assert exists(f'{wordnet_home}/WordNet-2.0'
                      ), f'error: missing WordNet-2.0 in {wordnet_home}'
        assert exists(f'{wordnet_home}/wn-domains-3.2'
                      ), f'error: missing WordNetDomains in {wordnet_home}'

        # load WordNet 2.0 (the second argument is the optional OMW reader slot, unused here)
        self.wn = WordNetCorpusReader(f'{wordnet_home}/WordNet-2.0/dict', None)

        # load WordNetDomains (based on https://stackoverflow.com/a/21904027/8759307)
        self.domain2synsets = defaultdict(list)
        self.synset2domains = defaultdict(list)
        for i in open(f'{wordnet_home}/wn-domains-3.2/wn-domains-3.2-20070223',
                      'r'):
            ssid, doms = i.strip().split('\t')
            doms = doms.split()
            self.synset2domains[ssid] = doms
            for d in doms:
                self.domain2synsets[d].append(ssid)

    def get_domains(self, word, pos=None, first_sense_only=False):
        """
        Gets a set of domains associated with a given word, possibly restricted to a specific pos
        :param word: the word (string)
        :param pos: the part-of-speech of the word (optional)
        :param first_sense_only: if True, only the first sense concurs in the set of domains. The first sense returned
        by WordNet is assumed to be the most common one, and is a typical baseline in tasks of word sense disambiguation.
        :return: a set of domains (strings) linked to the word according to WordNetDomains
        """
        word_synsets = self.wn.synsets(word, pos=pos)
        if first_sense_only:
            word_synsets = word_synsets[:1]
        domains = []
        for synset in word_synsets:
            domains.extend(self.get_domains_from_synset(synset))
        return set(domains)

    def get_domains_from_synset(self, synset):
        """
        Gets the domains associated with a given synset
        :param synset: the synset
        :return: a list of domains (strings) linked to the synset according to WordNetDomains
        """
        return self.synset2domains.get(self._askey_from_synset(synset), [])

    def get_synsets(self, domain):
        """
        Gets a list of synsets linked to the given domain as according to WordNetDomains (empty if the domain does
        not exist)
        :param domain: a string representing the domain. Should be a domain of those considered in WordNetDomains
        :return: a list of synset objects linked to the domain
        """
        return [
            self._synset_from_key(key)
            for key in self.domain2synsets.get(domain, [])
        ]

    def get_all_domains(self):
        """
        Gets a set of all the domains in WordNetDomains
        :return: a set of domains (strings)
        """
        return set(self.domain2synsets.keys())

    def _synset_from_key(self, key):
        offset, pos = key.split('-')
        return self.wn.synset_from_pos_and_offset(pos, int(offset))

    def _askey_from_synset(self, synset):
        return self._askey_from_offset_pos(synset.offset(), synset.pos())

    def _askey_from_offset_pos(self, offset, pos):
        return str(offset).zfill(8) + "-" + pos
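A hedged usage sketch (the wordnet_home path is hypothetical; it must contain WordNet-2.0/ and wn-domains-3.2/):

wnd = WordNetDomains('/data/wordnet')
print(wnd.get_domains('bank', pos='n'))   # domains across all noun senses of "bank"
print(wnd.get_domains('bank', pos='n', first_sense_only=True))  # most common sense only
print(len(wnd.get_all_domains()))         # number of distinct domain labels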
Example No. 15
class EmotionTextPlugin(EmotionPlugin, ShelfMixin):
    def _load_synsets(self, synsets_path):
        """Returns a dictionary POS tag -> synset offset -> emotion (str -> int -> str)."""
        tree = ET.parse(synsets_path)
        root = tree.getroot()
        pos_map = {"noun": "NN", "adj": "JJ", "verb": "VB", "adv": "RB"}

        synsets = {}
        for pos in ["noun", "adj", "verb", "adv"]:
            tag = pos_map[pos]
            synsets[tag] = {}
            for elem in root.findall(".//{0}-syn-list//{0}-syn".format(pos)):
                offset = int(elem.get("id")[2:])
                if not offset: continue
                if elem.get("categ"):
                    synsets[tag][offset] = Emo.emotions[elem.get(
                        "categ")] if elem.get(
                            "categ") in Emo.emotions else None
                elif elem.get("noun-id"):
                    synsets[tag][offset] = synsets[pos_map["noun"]][int(
                        elem.get("noun-id")[2:])]
        return synsets

    def _load_emotions(self, hierarchy_path):
        """Loads the hierarchy of emotions from the WordNet-Affect xml."""

        tree = ET.parse(hierarchy_path)
        root = tree.getroot()
        for elem in root.findall("categ"):
            name = elem.get("name")
            if name == "root":
                Emo.emotions["root"] = Emo("root")
            else:
                Emo.emotions[name] = Emo(name, elem.get("isa"))

    def activate(self, *args, **kwargs):

        nltk.download(['stopwords', 'averaged_perceptron_tagger', 'wordnet'])
        self._stopwords = stopwords.words('english')
        self._wnlemma = wordnet.WordNetLemmatizer()
        self._syntactics = {'N': 'n', 'V': 'v', 'J': 'a', 'S': 's', 'R': 'r'}
        local_path = os.path.dirname(os.path.abspath(__file__))
        self._categories = {
            'anger': [
                'general-dislike',
            ],
            'fear': [
                'negative-fear',
            ],
            'disgust': [
                'shame',
            ],
            'joy':
            ['gratitude', 'affective', 'enthusiasm', 'love', 'joy', 'liking'],
            'sadness': [
                'ingrattitude', 'daze', 'humility', 'compassion', 'despair',
                'anxiety', 'sadness'
            ]
        }

        self._wnaffect_mappings = {
            'anger': 'anger',
            'fear': 'negative-fear',
            'disgust': 'disgust',
            'joy': 'joy',
            'sadness': 'sadness'
        }

        self._load_emotions(local_path + self.hierarchy_path)

        if 'total_synsets' not in self.sh:
            total_synsets = self._load_synsets(local_path + self.synsets_path)
            self.sh['total_synsets'] = total_synsets

        self._total_synsets = self.sh['total_synsets']

        self._wn16_path = self.wn16_path
        self._wn16 = WordNetCorpusReader(
            os.path.abspath("{0}".format(local_path + self._wn16_path)),
            nltk.data.find(local_path + self._wn16_path))

    def deactivate(self, *args, **kwargs):
        self.save()

    def _my_preprocessor(self, text):

        # dots are escaped so they match a literal '.' in URLs
        regHttp = re.compile(
            r'(http://)[a-zA-Z0-9]*\.[a-zA-Z0-9/]*(\.[a-zA-Z0-9]*)?')
        regHttps = re.compile(
            r'(https://)[a-zA-Z0-9]*\.[a-zA-Z0-9/]*(\.[a-zA-Z0-9]*)?')
        regAt = re.compile(r'@([a-zA-Z0-9]*[*_/&%#@$]*)*[a-zA-Z0-9]*')
        text = re.sub(regHttp, '', text)
        text = re.sub(regAt, '', text)
        text = re.sub('RT : ', '', text)
        text = re.sub(regHttps, '', text)
        text = re.sub('[0-9]', '', text)
        text = self._delete_punctuation(text)
        return text

    def _delete_punctuation(self, text):

        exclude = set(string.punctuation)
        s = ''.join(ch for ch in text if ch not in exclude)
        return s

    def _extract_ngrams(self, text):

        unigrams_lemmas = []
        pos_tagged = []
        unigrams_words = []
        tokens = text.split()
        for token in nltk.pos_tag(tokens):
            unigrams_words.append(token[0])
            pos_tagged.append(token[1])
            if token[1][0] in self._syntactics.keys():
                unigrams_lemmas.append(
                    self._wnlemma.lemmatize(token[0],
                                            self._syntactics[token[1][0]]))
            else:
                unigrams_lemmas.append(token[0])

        return unigrams_words, unigrams_lemmas, pos_tagged

    def _find_ngrams(self, input_list, n):
        return zip(*[input_list[i:] for i in range(n)])

    def _clean_pos(self, pos_tagged):

        pos_tags = {
            'NN': 'NN',
            'NNP': 'NN',
            'NNP-LOC': 'NN',
            'NNS': 'NN',
            'JJ': 'JJ',
            'JJR': 'JJ',
            'JJS': 'JJ',
            'RB': 'RB',
            'RBR': 'RB',
            'RBS': 'RB',
            'VB': 'VB',
            'VBD': 'VB',
            'VGB': 'VB',
            'VBN': 'VB',
            'VBP': 'VB',
            'VBZ': 'VB'
        }

        for i in range(len(pos_tagged)):
            if pos_tagged[i] in pos_tags:
                pos_tagged[i] = pos_tags[pos_tagged[i]]
        return pos_tagged

    def _extract_features(self, text):

        feature_set = {k: 0 for k in self._categories}
        ngrams_words, ngrams_lemmas, pos_tagged = self._extract_ngrams(text)
        matches = 0
        pos_tagged = self._clean_pos(pos_tagged)

        tag_wn = {
            'NN': self._wn16.NOUN,
            'JJ': self._wn16.ADJ,
            'VB': self._wn16.VERB,
            'RB': self._wn16.ADV
        }
        for i in range(len(pos_tagged)):
            if pos_tagged[i] in tag_wn:
                synsets = self._wn16.synsets(ngrams_words[i],
                                             tag_wn[pos_tagged[i]])
                if synsets:
                    offset = synsets[0].offset()
                    if offset in self._total_synsets[pos_tagged[i]]:
                        if self._total_synsets[pos_tagged[i]][offset] is None:
                            continue
                        else:
                            emotion = self._total_synsets[
                                pos_tagged[i]][offset].get_level(5).name
                            matches += 1
                            for cat in self._categories:
                                if emotion in self._categories[cat]:
                                    feature_set[cat] += 1
        if matches == 0:
            matches = 1

        for i in feature_set:
            feature_set[i] = (feature_set[i] / matches) * 100

        return feature_set

    def analyse_entry(self, entry, params):

        text_input = entry.get("text", None)

        text = self._my_preprocessor(text_input)

        feature_text = self._extract_features(text)

        emotionSet = EmotionSet(id="Emotions0")
        emotions = emotionSet.onyx__hasEmotion

        for i in feature_text:
            emotions.append(
                Emotion(onyx__hasEmotionCategory=self._wnaffect_mappings[i],
                        onyx__hasEmotionIntensity=feature_text[i]))

        entry.emotions = [emotionSet]

        yield entry
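As in the previous example, analyse_entry is a generator that yields the enriched entry; a hedged consumption sketch (the entry object is assumed to be dict-like with attribute access, as senpy models are):

entry = Entry(text="I am so happy today")  # hypothetical minimal entry
enriched = next(plugin.analyse_entry(entry, params={}))
for emotion in enriched.emotions[0].onyx__hasEmotion:
    print(emotion['onyx:hasEmotionCategory'], emotion['onyx:hasEmotionIntensity'])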