Example #1
# Imports used below; tfv, data_*, labels_train, count_*, k_best_nb, the basic_model*
# objects, new_mat_train, NBmatrix and first_layer are defined earlier in the script.
import numpy as np
from sklearn.feature_selection import SelectKBest, chi2

data_test = tfv.transform(data_test)

# Feature selection on the TF-IDF matrix: keep the k_best_nb best chi-squared features
chi = SelectKBest(chi2, k=k_best_nb)
data_train = chi.fit_transform(data_train, labels_train)
data_test = chi.transform(data_test)
 
# Feature selection on the CountVectorizer matrix
chi = SelectKBest(chi2, k=k_best_nb)
count_matrix = chi.fit_transform(count_matrix, labels_train)
count_test = chi.transform(count_test)
 
 
# NB-transformed features (NBmatrix, Wang & Manning style)
nbmat = NBmatrix(1.0, bina=True, n_jobs=1)
nbmat.fit(count_matrix, labels_train)
nbm_test = nbmat.transform(count_test)
nbm_data = nbmat.transform(count_matrix)
 
########################### Train part ########################################

# First-layer models trained on the TF-IDF features
proba1, basic_score1, basic_name1 = first_layer(basic_model1, data_train, labels_train, data_train, labels_train)
# First-layer models trained on the new feature matrix
proba2, basic_score2, basic_name2 = first_layer(basic_model2, new_mat_train, labels_train, new_mat_train, labels_train)
# First-layer models trained on the NB-transformed features
proba3, basic_score3, basic_name3 = first_layer(basic_model3, nbm_data, labels_train, nbm_data, labels_train)
# Concatenate the first-layer probabilities column-wise
proba = np.hstack([proba1, proba2, proba3])
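
The first_layer helper is project code that is not shown on this page. As a rough sketch of what such a stacking first layer typically does, with an assumed signature and return values rather than the project's actual implementation:

# Hypothetical sketch of a first_layer-style helper (assumed behaviour, not the project's code).
# It takes (name, estimator) pairs, fits each one on the training matrix, and returns the
# positive-class probabilities on the evaluation matrix, stacked column-wise, together with
# the accuracy and the name of each model.
import numpy as np

def first_layer_sketch(models, X_train, y_train, X_eval, y_eval):
    probas, scores, names = [], [], []
    for name, model in models:
        model.fit(X_train, y_train)
        probas.append(model.predict_proba(X_eval)[:, 1])  # requires classifiers with predict_proba
        scores.append(model.score(X_eval, y_eval))         # plain accuracy on the evaluation set
        names.append(name)
    return np.column_stack(probas), scores, names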
 
Example #2
# X = X_T

# Imports used below; ct (a project text-processing module), NBmatrix, data and labels
# are defined elsewhere in the project.
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Remove HTML tags
train = ct.removehtml(data)

# Build the dictionary by stemming and tokenizing the reviews (WARNING: nltk should be up to date)
data = ct.stemTokenize(train)

# Compute TF-IDF features, including n-grams up to size 2
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), binary=False)

# Compute raw term counts with a CountVectorizer, also with n-grams up to size 2
count_vectorizer = CountVectorizer(ngram_range=(1, 2), binary=False)

# Compute an NB matrix as described by Wang & Manning
nb_vectorizer = NBmatrix(alpha=1.0, bina=True, n_jobs=1)

# Fit and transform on the data
tfidf_matrix = tfidf_vectorizer.fit_transform(data)
count_matrix = count_vectorizer.fit_transform(data)
nb_matrix = nb_vectorizer.fit_transform(count_matrix,labels)

print "size of the matrix : ", tfidf_matrix.shape
average_nb_words = np.mean(count_matrix.sum(axis=1))
print "Average number of words per review : ", average_nb_words
dic_size = count_matrix.shape[1]
print "dictionnary size : " , dic_size
sparsity = 1-float(count_matrix.nnz)/(25000.0*dic_size)
print "Sparsity of the data : ", sparsity