# NOTE(review): this fragment references `words` and `song` defined above this
# chunk — presumably the tail of a per-song loop (#p4); confirm against the full file.
# Filter out single-char tokens, pure numbers, and stopwords, then plot the
# 20 most frequent words for the current song.
words = [word for word in words
         if len(word) > 1
         and not word.isnumeric()
         and word not in stopWords]
word_dist = FreqDist(words)
plt.title(song[1]['song'] + " - " + str(song[1]['genre']))
word_dist.plot(20)

#p5
# Build a per-song frequency distribution over the filtered lyric tokens and
# store it as a new 'word_dist' column on the DataFrame.
word_dist = []
for song in songList.iterrows():
    words = nltk.word_tokenize(song[1]['lyrics'].lower())
    words = [word for word in words
             if len(word) > 1
             and not word.isnumeric()
             and word not in stopWords]
    word_dist.append(FreqDist(words))
songList['word_dist'] = word_dist

#p6
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

# Hold out 20% of the songs for evaluation; 'genre' is the prediction target.
train, test = train_test_split(songList, test_size=0.2)
# BUG FIX: the original crossed the unpacking (`X_train, y_test = train[...]` /
# `X_test, y_train = test[...]`), so labels came from the wrong split and the
# classifier was fit on mismatched (X, y) rows. Features and labels must come
# from the same split.
X_train, y_train = train[['song', 'year', 'artist']], train['genre']
X_test, y_test = test[['song', 'year', 'artist']], test['genre']
# Narrow the features to 'year' only (MultinomialNB requires numeric,
# non-negative input; 'song'/'artist' are raw strings here).
X_train, y_train = train['year'], train['genre']
X_test, y_test = test['year'], test['genre']
clf = MultinomialNB().fit(X_train, y_train)
# NOTE(review): this rebinds `clf` to an UNFITTED SVC, discarding the trained
# Naive Bayes model above — confirm whether the SVC is fit later in the file.
clf = SVC()
def getTokens(self):
    """Build per-token frequency/density statistics across the ranked documents.

    For every mandatory lemma whose document-occurrence score clears
    ``settings.MANDATORY_LOWER_LIMIT``, produce one entry keyed by its surface
    token. Each entry maps every HTML block ('uriTokens', 'titleTokens',
    'metaDescriptionTokens', 'h1Tokens', 'h2Tokens', 'strongTokens', plus the
    synthetic 'all' and 'text') to the block-info summary returned by
    ``self._getBlockInfo`` over the per-document frequencies and densities
    (density = percentage with two decimals).

    Returns:
        dict: ``{token: {blockName: blockInfo, ..., 'text': blockInfo}}``
    """
    lemmas = self._getTokens()
    # Cap the number of documents considered by the configured interval limit.
    docCount = min(len(self.seoLibrary.seoDocuments),
                   max(MandatoryTerms.INTERVALS))

    # One lemmatized FreqDist per ranked document (full text).
    documentFreqDists = [
        FreqDist(doc.getTextTokens(lemmatize=True))
        for doc in self.seoLibrary.seoDocuments[:docCount]
    ]

    # For each HTML block, a list of per-document FreqDists.
    blockSources = (
        ('uriTokens', self.seoLibrary.getUriTokens),
        ('titleTokens', self.seoLibrary.getTitleTokens),
        ('metaDescriptionTokens', self.seoLibrary.getMetaDescriptionTokens),
        ('h1Tokens', self.seoLibrary.getH1Tokens),
        ('h2Tokens', self.seoLibrary.getH2Tokens),
        ('strongTokens', self.seoLibrary.getStrongTokens),
    )
    blockFreqLists = {
        name: self._getTokensFreqList(getter(unique=False), docCount)
        for name, getter in blockSources
    }

    # Synthetic 'all' block: per-document sum of every block's FreqDist
    # (counter addition is commutative, so iteration order is irrelevant).
    perBlock = list(blockFreqLists.values())
    blockFreqLists['all'] = [
        FreqDist(sum((freqList[i] for freqList in perBlock), FreqDist()))
        for i in range(docCount)
    ]

    result = {}
    for lemma in lemmas:
        score, textFreqs, textDensities = self._getseoDocumentsInfo(
            lemma, documentFreqDists)
        if score <= settings.MANDATORY_LOWER_LIMIT:
            continue
        # Map the lemma back to its most representative surface token.
        token = self.seoLibrary.lemma2Token([(lemma, 1)])[0][0][0]
        entry = result.setdefault(token, {})
        for blockName, freqList in blockFreqLists.items():
            counts = []
            densities = []
            for dist in freqList:
                count = dist.get(lemma, 0)
                counts.append(count)
                # Density as a percentage with two decimals; max(1, N) guards
                # against empty distributions.
                densities.append(int(count * 10000 / max(1, dist.N())) / 100.00)
            entry[blockName] = self._getBlockInfo(counts, densities)
        # Whole-document text statistics computed by _getseoDocumentsInfo.
        entry['text'] = self._getBlockInfo(textFreqs, textDensities)
    return result