# 예제 #1 (Example #1) — stray marker from a paste; was a bare name (NameError at runtime)
# 0 — stray literal from the same paste
    words = [ word for word in words if len(word) > 1\
                             and not word.isnumeric()\
                             and word not in stopWords ]

    word_dist = FreqDist(words)
    plt.title(song[1]['song'] + " - " + str(song[1]['genre']))
    word_dist.plot(20)

#p5
# Build one token-frequency distribution per song and attach the result
# as a new 'word_dist' column on the songList DataFrame.
word_dist = []
for _, row in songList.iterrows():
    tokens = nltk.word_tokenize(row['lyrics'].lower())
    # Keep tokens longer than one character that are neither numbers
    # nor stop words.
    tokens = [tok for tok in tokens
              if len(tok) > 1
              and not tok.isnumeric()
              and tok not in stopWords]
    word_dist.append(FreqDist(tokens))
songList['word_dist'] = word_dist
#p6
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

# 80/20 split of the songs into train/test sets.
train, test = train_test_split(songList, test_size=0.2)

# Labels must come from the same split as their features (the original
# assigned y_test from `train` and y_train from `test`, swapping them),
# and scikit-learn expects a 2-D feature matrix, hence [['year']].
X_train, y_train = train[['year']], train['genre']
X_test, y_test = test[['year']], test['genre']

# Keep the fitted Naive Bayes model instead of discarding it (the
# original rebound `clf` to an unfitted SVC, losing the fit).
clf_nb = MultinomialNB().fit(X_train, y_train)
clf = SVC()
    def getTokens(self):
        """
        Compute frequency/density statistics for every mandatory token.

        Returns a dict keyed by token; each value maps a page-block name
        ('uriTokens', 'titleTokens', ..., 'all', 'text') to the block-info
        structure produced by ``self._getBlockInfo``, e.g.::

            {u'bar': {'all': {'densityAverage': 0.5584,
                              'densityUpperLimit': 2.55,
                              'freqAverage': 0,
                              'freqUpperLimit': 1},
                      'h1': {'densityAverage': 0.0,
                             'densityUpperLimit': 0.0,
                             'freqAverage': 0,
                             'freqUpperLimit': 0},
                      ...}}
        """
        lemmas = self._getTokens()

        # Cap the number of documents analysed by the largest interval.
        docCount = min(len(self.seoLibrary.seoDocuments),
                       max(MandatoryTerms.INTERVALS))

        # Lemmatized body-text frequencies, one FreqDist per document.
        bodyFreqs = [
            FreqDist(seoDocument.getTextTokens(lemmatize=True))
            for seoDocument in self.seoLibrary.seoDocuments[:docCount]
        ]

        # For each page block, a list of per-document FreqDists.
        # Insertion order matters: 'all' is appended last, as before.
        blockSources = (
            ('uriTokens', self.seoLibrary.getUriTokens),
            ('titleTokens', self.seoLibrary.getTitleTokens),
            ('metaDescriptionTokens',
             self.seoLibrary.getMetaDescriptionTokens),
            ('h1Tokens', self.seoLibrary.getH1Tokens),
            ('h2Tokens', self.seoLibrary.getH2Tokens),
            ('strongTokens', self.seoLibrary.getStrongTokens),
        )
        mandatoryFreqs = {}
        for blockName, getTokensFn in blockSources:
            mandatoryFreqs[blockName] = self._getTokensFreqList(
                getTokensFn(unique=False), docCount)

        # 'all' aggregates every block, document by document.
        combined = []
        for docIndex in range(docCount):
            total = FreqDist()
            for perDocFreqs in mandatoryFreqs.values():
                total += perDocFreqs[docIndex]
            combined.append(FreqDist(total))
        mandatoryFreqs['all'] = combined

        mandatoryTokensInfo = {}

        for lemma in lemmas:
            occurencesScore, docFreqs, docDensities = \
                self._getseoDocumentsInfo(lemma, bodyFreqs)
            # Skip lemmas that do not occur often enough to matter.
            if occurencesScore <= settings.MANDATORY_LOWER_LIMIT:
                continue

            token = self.seoLibrary.lemma2Token([(lemma, 1)])[0][0][0]
            entry = mandatoryTokensInfo.setdefault(token, {})

            for blockName, freqList in mandatoryFreqs.items():
                counts = [freqTokens.get(lemma, 0)
                          for freqTokens in freqList]
                # Density as a percentage, truncated to two decimals
                # (int() truncation reproduced from the original).
                densities = [
                    int(freqTokens.get(lemma, 0) * 10000 /
                        max(1, freqTokens.N())) / 100.00
                    for freqTokens in freqList
                ]
                entry[blockName] = self._getBlockInfo(counts, densities)

            entry['text'] = self._getBlockInfo(docFreqs, docDensities)

        return mandatoryTokensInfo