Example #1
    # Assumed module-level imports for this snippet:
    #   from HanTa import HanoverTagger as ht
    #   from textblob_de import TextBlobDE as tbde
    #   from textblob_de.lemmatizers import PatternParserLemmatizer
    #   _lemmatizer = PatternParserLemmatizer()
    def create_blob(self):
        """
        Lemmatizes the cleaned data with HanTa (HanoverTagger) and TextBlobDE.
        """
        print(".")
        print('   >>>  HanoverTagger  GermaLemma  with TigerCorpus   <<<')
        tagger = ht.HanoverTagger('morphmodel_ger.pgz')
        # See: Christian Wartena (2019). A Probabilistic Morphology Model for German Lemmatization.
        # In: Proceedings of the 15th Conference on Natural Language Processing (KONVENS 2019): Long Papers. Pp. 40-49, Erlangen.
        def convert(lst):
            # Flatten a list of strings into a single token list.
            return ' '.join(lst).split()

        tokens = convert(self.clean_data)
        tags = tagger.tag_sent(tokens)  # one (word, lemma, POS) tuple per token

        # Keep only the lemma from each (word, lemma, POS) tuple.
        lemma_list = [item[1] for item in tags]

        self.hanta_lemma = lemma_list
        blob_wtf = tbde(str(self.clean_data))
        self.blob_lemma = _lemmatizer.lemmatize(str(blob_wtf))  # list of (lemma, tag) pairs
        self.blob_polarity = tbde(str(self.blob_lemma))
        print("                      -/-                             ")
        return self.blob_lemma, self.blob_polarity, self.hanta_lemma
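
For reference, a minimal standalone sketch of the HanTa calls this method relies on; the sample tokens are made up for illustration, and the model file morphmodel_ger.pgz ships with the HanTa package:

# Minimal HanTa usage sketch (assumes `pip install HanTa`; sample tokens are illustrative).
from HanTa import HanoverTagger as ht

tagger = ht.HanoverTagger('morphmodel_ger.pgz')
tokens = ['Die', 'Katzen', 'liefen', 'durch', 'den', 'Garten']

# tag_sent returns one (word, lemma, POS) tuple per input token.
for word, lemma, pos in tagger.tag_sent(tokens):
    print(word, lemma, pos)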
Example #2
    # Requires a module-level `import math`.
    def tfidf_calculate(self):
        """
        Computes TF-IDF scores over sliste_n, split into three roughly equal sections.
        """
        # Drop (lemma, tag) pairs tagged 'N'; keep only the lemmas.
        self.sliste_n = [x for (x, y) in self.blob_lemma if y != 'N']

        def tf(word, blob):
            # Term frequency: relative frequency of `word` within `blob`.
            return blob.words.count(word) / len(blob.words)

        def n_containing(word, bloblist):
            # Document frequency: number of blobs containing `word`.
            return sum(1 for blob in bloblist if word in blob.words)

        def idf(word, bloblist):
            # Inverse document frequency, with +1 smoothing against division by zero.
            return math.log(len(bloblist) / (1 + n_containing(word, bloblist)))

        def tfidf(word, blob, bloblist):
            return tf(word, blob) * idf(word, bloblist)

        # Split sliste_n itself into three roughly equal sections.
        nb1 = len(self.sliste_n) // 3
        nb2 = nb1 * 2
        doku1 = tbde(str(self.sliste_n[0:nb1]))
        doku2 = tbde(str(self.sliste_n[nb1:nb2]))
        doku3 = tbde(str(self.sliste_n[nb2:]))

        bloblist = [doku1,doku2,doku3]
        for i, item in enumerate(bloblist):
            print("Top words in document {}".format(i + 1))
            scores = {word: tfidf(word, item, bloblist) for word in item.words}
            sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
            for element, score in sorted_words[:3]:
                print("\tWord: {}, TF-IDF: {}".format(element, round(score, 4)))
        return self.sliste_n
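
The same TF-IDF arithmetic can be checked without textblob-de; a minimal plain-Python sketch, with three hypothetical token lists standing in for doku1..doku3:

# Plain-Python TF-IDF sketch mirroring tf/idf/tfidf above; the token lists are hypothetical.
import math

docs = [
    ['haus', 'garten', 'haus'],
    ['garten', 'baum'],
    ['haus', 'baum', 'baum'],
]

def tf(word, doc):
    # Relative frequency of `word` in one document.
    return doc.count(word) / len(doc)

def idf(word, docs):
    # Smoothed inverse document frequency, as in the method above.
    df = sum(1 for d in docs if word in d)
    return math.log(len(docs) / (1 + df))

for i, doc in enumerate(docs, start=1):
    scores = {w: tf(w, doc) * idf(w, docs) for w in set(doc)}
    top = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:3]
    print("Top words in document {}: {}".format(i, top))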
Example #3
    def create_blob(self):
        """
        Creates blob objects: a blob lemma list and a plain blob object.
        """
        blob_wtf = tbde(str(self.clean_data))
        self.blob_lemma = _lemmatizer.lemmatize(str(blob_wtf))  # list of (lemma, tag) pairs
        self.blob_polarity = tbde(str(self.blob_lemma))
        return self.blob_lemma, self.blob_polarity
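
The returned blob_polarity is a TextBlobDE instance, so a polarity score can be read from its sentiment property; a minimal sketch assuming the textblob-de package, with made-up sample text:

# Sketch: reading polarity from a TextBlobDE object (assumes `pip install textblob-de`).
from textblob_de import TextBlobDE as tbde

blob = tbde('Das Wetter ist heute wunderbar.')
# .sentiment returns a Sentiment(polarity=..., subjectivity=...) namedtuple.
print(blob.sentiment.polarity)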