import os
import pickle

import numpy as np
import pandas as pd

import cleaning
import training

# Check whether the model has been built already.
if not os.path.exists('dumped_model_LR.pkl'):
    # Build the model on train.csv and dump it (and its fitted vectorizer)
    # to disk; see training.main().
    training.main()

# Open the saved model. (Original bug: "rick.load" instead of "pickle.load".)
with open('dumped_model_LR.pkl', 'rb') as f:
    model = pickle.load(f)

# Load the CountVectorizer that was fitted on the training data. A fresh,
# unfitted vectorizer cannot be used here: its vocabulary would not match the
# model's features, which is what caused the original
# "ValueError: X has 1 features per sample; expecting 170735".
# (The pickle file name is an assumption, shared with training.main() below.)
with open('dumped_vectorizer_LR.pkl', 'rb') as f:
    vect = pickle.load(f)

# Read in test data.
test_phrases = cleaning.read_data("testset_1.csv", "Phrase")
test_ids = cleaning.read_data("testset_1.csv", "PhraseId")

# Clean test data with the same pipeline used for training.
test_phrases = cleaning.tokenize_data(test_phrases)
test_phrases = cleaning.filter_data(test_phrases)
test_phrases = cleaning.untokenize(test_phrases)

# Map the test phrases into the training vocabulary.
test_phrases = vect.transform(test_phrases)
print(test_phrases.shape)

# Predict labels on test data.
predictions = model.predict(test_phrases)

# Output predictions into a csv file (same format as training.main()).
df = pd.DataFrame({
    'PhraseId': np.array(test_ids),
    'Sentiment': np.array(predictions)
})
df.to_csv("output.csv", index=False)
# Experimental tf-idf feature extraction (alternative to the Bag of Words
# pipeline in training.py).
import timeit

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

import cleaning

# MAIN -----------------------------------------------

# Get data.
phrases = cleaning.read_data("../data/train.csv", "Phrase")
labels = cleaning.read_data("../data/train.csv", "Sentiment")

# Tokenize and clean.
start_time_clean = timeit.default_timer()
cleaned = cleaning.tokenize_data(phrases)
# cleaned = cleaning.stem_data(cleaned)
result = cleaning.filter_data(cleaned)
elapsed_time_clean = timeit.default_timer() - start_time_clean
print("Cleaning finished in " + str(elapsed_time_clean) + " seconds")

# Tf-idf model to extract features. TfidfVectorizer works directly on the
# untokenized strings; the TfidfTransformer used originally expects a count
# matrix, not raw text, so reshaping the string array to 2D could never work.
start_time_extract = timeit.default_timer()
result = cleaning.untokenize(result)
vectorizer = TfidfVectorizer(smooth_idf=False)
X_train = vectorizer.fit_transform(result)
# X_train.toarray()
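# --- Sketch (assumption): the original file stops after feature extraction,
# even though GridSearchCV and LogisticRegression are imported above. A
# plausible continuation, mirroring the fit in training.py, would train a
# logistic regression on the tf-idf features built above (X_train and labels
# come from the code directly above):
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}
grid = GridSearchCV(LogisticRegression(), param_grid, cv=5)
grid.fit(X_train, labels)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)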
import pickle
import timeit

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

import cleaning


def main():
    # Get train data.
    train_phrases = cleaning.read_data("../data/train.csv", "Phrase")
    train_labels = cleaning.read_data("../data/train.csv", "Sentiment")

    # Tokenize and clean train data prior to building the model.
    start_time_clean = timeit.default_timer()
    train_phrases = cleaning.tokenize_data(train_phrases)
    # Stemming wasn't helpful.
    # train_phrases = cleaning.stem_data(train_phrases)
    train_phrases = cleaning.filter_data(train_phrases)
    elapsed_time_clean = timeit.default_timer() - start_time_clean
    print("Cleaning finished in " + str(elapsed_time_clean) + " seconds")

    # Bag of Words model to extract features.
    start_time_extract = timeit.default_timer()
    # Use n-grams of length 1 through 30; ignore terms that appear in fewer
    # than two documents.
    vect = CountVectorizer(min_df=2, ngram_range=(1, 30))
    train_phrases = cleaning.untokenize(train_phrases)
    train_phrases = vect.fit_transform(train_phrases)
    elapsed_time_extract = timeit.default_timer() - start_time_extract
    print("Feature extraction finished in " + str(elapsed_time_extract) + " seconds")

    # Train logistic regression model with built-in K-fold CV.
    start_time_train = timeit.default_timer()
    # Try a variety of C-values.
    # param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}  # best is 1
    # param_grid = {'C': [0.6, 0.8, 1, 2]}           # best is 2
    # param_grid = {'C': [5, 7, 9, 11, 13]}          # best is 5 for cv=10
    param_grid = {'C': [2]}
    grid = GridSearchCV(LogisticRegression(), param_grid, cv=5)
    grid.fit(train_phrases, train_labels)
    elapsed_time_train = timeit.default_timer() - start_time_train
    print("Training finished in " + str(elapsed_time_train) + " seconds")

    # Print cross-validation estimates and optimal parameters.
    # print("Best cross-validation score: {:.2f}".format(grid.best_score_))
    # print("Best parameters: ", grid.best_params_)
    # print("Best estimator: ", grid.best_estimator_)
    model = grid.best_estimator_

    # Dump the model and the fitted vectorizer so the prediction script can
    # reuse them. (Addition: the original never pickled either, even though
    # the prediction script expects 'dumped_model_LR.pkl' to exist; the
    # vectorizer file name is likewise an assumption shared with that script.)
    with open('dumped_model_LR.pkl', 'wb') as f:
        pickle.dump(model, f)
    with open('dumped_vectorizer_LR.pkl', 'wb') as f:
        pickle.dump(vect, f)

    # Read in test data.
    test_phrases = cleaning.read_data("../data/testset_1.csv", "Phrase")
    test_ids = cleaning.read_data("../data/testset_1.csv", "PhraseId")

    # Clean test data.
    test_phrases = cleaning.tokenize_data(test_phrases)
    test_phrases = cleaning.filter_data(test_phrases)
    test_phrases = cleaning.untokenize(test_phrases)
    test_phrases = vect.transform(test_phrases)
    print(test_phrases.shape)

    # Predict labels on test data.
    predictions = model.predict(test_phrases)

    # Output predictions into a csv file.
    df = pd.DataFrame({
        'PhraseId': np.array(test_ids),
        'Sentiment': np.array(predictions)
    })
    outfile = "output.csv"
    df.to_csv(outfile, index=False)
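# --- Sketch (assumption): cleaning.py is not shown in this section. The
# helpers below are hypothetical minimal implementations, inferred only from
# how the scripts above call them; the real module may differ. Running this
# sketch may require nltk.download('punkt') and nltk.download('stopwords').
import csv

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize


def read_data(path, column):
    # Return the values of one named column from a csv file.
    with open(path, newline='', encoding='utf-8') as f:
        return [row[column] for row in csv.DictReader(f)]


def tokenize_data(phrases):
    # Split each phrase into a list of lowercase tokens.
    return [word_tokenize(phrase.lower()) for phrase in phrases]


def stem_data(tokenized):
    # Porter-stem each token (the scripts above leave this disabled).
    stemmer = PorterStemmer()
    return [[stemmer.stem(t) for t in tokens] for tokens in tokenized]


def filter_data(tokenized):
    # Drop English stopwords and non-alphabetic tokens.
    stops = set(stopwords.words('english'))
    return [[t for t in tokens if t.isalpha() and t not in stops]
            for tokens in tokenized]


def untokenize(tokenized):
    # Join token lists back into whitespace-separated strings for the
    # sklearn vectorizers.
    return [" ".join(tokens) for tokens in tokenized]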