# Example #1
# 0
# Reads in lyrics from data/lyrics/####hot100.atsv files
# Creates bag of words from lyrics

# Reference for bag of words: https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-1-for-beginners-bag-of-words

import pandas
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import cross_validation
import numpy as np

# user functions
from bagOfWords import getDF, split_tokenize

# --- Configuration ---
LYRICS_PATH = "data/lyrics_1990s_2000s/"
BAGSIZE = 100  # NOTE: not passed to the vectorizer below (max_features is disabled)

# scikit-learn's bag-of-words tool; tokenization is delegated to the
# project's split_tokenize helper.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=split_tokenize,
    preprocessor=None,
    stop_words=None,
)

# Load the data through the project's getDF helper and keep only the rows
# whose "LYRICS" value is not null.
dataDF = getDF(LYRICS_PATH, train=True)
_hasLyrics = pandas.notnull(dataDF["LYRICS"])
dataDFNotNull = dataDF[_hasLyrics]

# Hold out 20% of the rows for testing (4:1 train/test ratio).
trainDFNotNull, testDFNotNull = cross_validation.train_test_split(
    dataDFNotNull,
    test_size=0.2,
)

# Fit the vocabulary on the training lyrics only, then apply that same
# vocabulary to the test lyrics. Only the test features are converted with
# .toarray(); the training features stay as returned by fit_transform.
trainDataFeatures = vectorizer.fit_transform(trainDFNotNull["LYRICS"])
testDataFeatures = vectorizer.transform(testDFNotNull["LYRICS"]).toarray()
# Example #2
# 0
#!/usr/bin/env python

# Check which lyrics are null

import pandas
import numpy as np

# user functions
from bagOfWords import getDF, split_tokenize

LYRICS_PATH = 'data/lyrics/'

# Load the lyrics data set through the project's getDF helper.
lyricsDF = getDF(LYRICS_PATH, train=True)

# Keep only the rows whose LYRICS value is present (not null).
lyricsDFNotNull = lyricsDF[pandas.notnull(lyricsDF['LYRICS'])]

# Report the shape of the full data set, then how many rows have null lyrics.
# Single-argument print(...) emits the same text under Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print(lyricsDF.shape)
print(lyricsDF.shape[0] - lyricsDFNotNull.shape[0])