from sklearn.preprocessing import LabelEncoder

# Load the labelled questions. The file uses ',,,' as a column separator,
# so the python engine is required (the C engine only supports single-char
# delimiters); quoting=3 (QUOTE_NONE) keeps embedded quotes as literal text.
# Column 0 = question text, column 1 = class label.
dataset = pd.read_csv('LabelledData.txt', delimiter=',,,', quoting=3,
                      header=None, engine='python')

# Strip surrounding whitespace from the labels, then encode them as integers.
y = dataset.iloc[:, 1].str.strip()
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)

# Cleaning the texts: DataHandler (project-local) normalises each document.
# Stemming is used here; the lemmatizer variant is available but disabled.
corpus = []
cleaner = DataHandler(dataset.iloc[:, 0])
#corpus = cleaner.cleanLemmatizer()
corpus = cleaner.cleanStemmer()

# Creating the Bag of Words model: one column per vocabulary term,
# dense matrix of raw token counts.
cv = CountVectorizer()
X = cv.fit_transform(corpus).toarray()

##############################################################
# Training the model
print("Training the model with train_set=80% & test_set=20%")

# Splitting the dataset into the Training set and Test set.
# NOTE: sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=0)