def main(unzip_IMDB, n_gram_min, n_gram_max, max_iter, solver, rand_seed,
         cross_val, jobs):
    """Run the IMDB sentiment pipeline: download/extract the data, clean it,
    split it, and grid-search a TF-IDF + LogisticRegression model."""
    print("Entered Main...")

    # Sanity-check the CLI arguments: the n-gram range must be non-decreasing,
    # and LogisticRegression needs enough iterations to converge on this data.
    if n_gram_max < n_gram_min:
        n_gram_max = n_gram_min
    if max_iter < 5000:
        max_iter = 5000

    fname = "Project_LR"
    tablename = fname + "_results" + ".csv"

    #%% ###########################################################################
    program_time = time.time()
    ftime = time.strftime("%b %d, %Y_%H.%M.%S %p", time.localtime())
    start_time = time.time()

    print("UnZip file to csv processing...")
    if unzip_IMDB == 'unzip':
        source = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
        target = 'aclImdb_v1.tar.gz'

        if not os.path.isdir('aclImdb') and not os.path.isfile(
                'aclImdb_v1.tar.gz'):
            if sys.version_info < (3, 0):
                import urllib
                urllib.urlretrieve(source, target, reporthook)
            else:
                import urllib.request
                urllib.request.urlretrieve(source, target, reporthook)

        if not os.path.isdir('aclImdb'):
            with tarfile.open(target, 'r:gz') as tar:
                tar.extractall()

        # Preprocess the movie dataset into a more convenient format.
        # Change `basepath` to the directory of the unzipped movie dataset.
        basepath = 'C:\\Users\\sanfo\\Documents\\NMSU\\CS_487\\Semester_Project\\aclImdb'

        labels = {'pos': 1, 'neg': 0}
        pbar = pyprind.ProgBar(50000)
        rows = []
        for s in ('test', 'train'):
            for l in ('pos', 'neg'):
                path = os.path.join(basepath, s, l)
                for file in os.listdir(path):
                    with open(os.path.join(path, file), 'r',
                              encoding='utf-8') as infile:
                        txt = infile.read()
                    rows.append([txt, labels[l]])
                    pbar.update()
        # Build the DataFrame in one pass (DataFrame.append is deprecated).
        df = pd.DataFrame(rows, columns=['review', 'sentiment'])

        # Shuffle the DataFrame:
        np.random.seed(0)
        df = df.reindex(np.random.permutation(df.index))

        # Save the assembled data as a CSV file:
        df.to_csv('movie_data.csv', index=False, header=True)

    # Reload from disk so the same path works whether or not we just unzipped.
    df = pd.read_csv('movie_data.csv', encoding='utf-8')
    print(df.shape)

    unzip_time = time.time() - start_time
    print("Zip to csv running time: %.12f" % unzip_time + " seconds.\n\n")

    #%% ###########################################################################
    # Clean the data of web markup.
    # Test that the markup cleaning works:
    # print(preprocessor(df.loc[0, 'review'][:]))

    # Apply the cleaning of web markup characters to the whole dataset.
    start_time = time.time()
    print("Cleaning Markup Language processing...")
    df['review'] = df['review'].apply(preprocessor)

    # Save the cleaned data to file.
    df.to_csv('movie_data_clean.csv', index=False, header=True)

    clean_time = time.time() - start_time
    print("Cleaning Markup Language running time: %.12f" % clean_time +
          " seconds.\n\n")
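    # NOTE: `preprocessor` (used in the cleaning step above) is defined
    # elsewhere in this project. A minimal sketch of what it is assumed to do
    # (strip HTML markup, preserve emoticons, lowercase) -- illustrative only,
    # not this project's actual implementation:
    #
    #   import re
    #
    #   def preprocessor(text):
    #       text = re.sub('<[^>]*>', '', text)           # drop HTML tags
    #       emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    #       text = (re.sub(r'[\W]+', ' ', text.lower())  # non-words -> spaces
    #               + ' '.join(emoticons).replace('-', ''))
    #       return text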
    #%% ###########################################################################
    # stop_words_ = text.ENGLISH_STOP_WORDS
    start_time = time.time()
    print("stopwords processing...")
    nltk.download('stopwords')
    stop = stopwords.words('english')
    print("stop type:", type(stop))

    stopword_time = time.time() - start_time
    print("Stopwords running time: %.12f" % stopword_time + " seconds.\n\n")

    #%% ###########################################################################
    # Tokenize the bag-of-words database.
    #### PRE-PROCESSING - SPLITTING A SAMPLE OF THE DATA ####
    start_time = time.time()
    print("Tokenize processing...")

    X = df.loc[:, 'review']
    y = df.loc[:, 'sentiment']

    # Keep a stratified 2.5% sample (~1250 reviews), then split it 75/25 into
    # training and test sets.
    X_junk, X_1250, y_junk, y_1250 = train_test_split(
        X, y, test_size=0.025, random_state=rand_seed, stratify=y)
    X_train, X_test, y_train, y_test = train_test_split(
        X_1250, y_1250, test_size=0.25, random_state=rand_seed,
        stratify=y_1250)

    print("Split train (X, y):", str(X_train.shape), "\t", str(y_train.shape))
    print("Split test  (X, y):", str(X_test.shape), "\t", str(y_test.shape))
    print("Shape X_train:", X_train.shape)
    print("Shape y_train:", y_train.shape)
    print("Shape X_test:", X_test.shape)
    print("Shape y_test:", y_test.shape)
    print("X_train type:", type(X_train[1:2]))
    print("Range of training data:", range(len(X_train)))
    print("Contents of X_train[1] before tokenizing with stop words:\n",
          X_train[1:2])
    print()
    print("Contents of X_test[1] before tokenizing with stop words:\n",
          X_test[1:2])
    print()
    print()

    tokenize_time = time.time() - start_time
    print("Tokenize running time: %.12f" % tokenize_time + " seconds.\n\n")

    #%% ###########################################################################
    # Term Frequency-Inverse Document Frequency
    start_time = time.time()
    print("Term Frequency Inverse Document Frequency processing...")
    tfidf = TfidfVectorizer(strip_accents=None,
                            lowercase=False,
                            preprocessor=None)

    tfidf_time = time.time() - start_time
    print("Term Frequency Inverse Document Frequency running time: %.12f"
          % tfidf_time + " seconds.\n\n")

    #%% ###########################################################################
    # Grid Search CV
    start_time = time.time()
    print("GridSearchCV processing...")
    print("CMD arguments:\n")
    print("n_gram_min:", n_gram_min)
    print("n_gram_max:", n_gram_max)
    print("max_iter:", max_iter)
    print("solver:", solver)
    print("rand_seed:", rand_seed)
    print("cross_val:", cross_val)
    print("jobs:", jobs)

    # Pair the solver with a compatible first penalty: liblinear and saga
    # support l1; the others support only l2 / no penalty. (Older
    # scikit-learn accepts the string 'none'; newer releases expect None.)
    if solver == 'lbfgs':
        penalty1 = 'none'
    elif solver == 'newton-cg':
        penalty1 = 'none'
    elif solver == 'liblinear':
        penalty1 = 'l1'
    elif solver == 'saga':
        penalty1 = 'l1'
    elif solver == 'sag':
        penalty1 = 'none'
    else:
        penalty1 = 'l2'  # fallback so penalty1 is always defined
    penalty2 = 'l2'

    param_grid = [
        {
            'vect__ngram_range': [(n_gram_min, n_gram_max)],
            'vect__stop_words': [stop, None],
            'vect__tokenizer': [tokenizer, tokenizer_porter],
            'clf__penalty': [penalty1, penalty2],
            'clf__C': [1.0, 10.0, 100.0]
        },
        {
            'vect__ngram_range': [(n_gram_min, n_gram_max)],
            'vect__stop_words': [stop, None],
            'vect__tokenizer': [tokenizer, tokenizer_porter],
            'vect__use_idf': [False],
            'vect__norm': [None],
            'clf__penalty': [penalty1, penalty2],
            'clf__C': [1.0, 10.0, 100.0]
        },
    ]

    lr_tfidf = Pipeline([('vect', tfidf),
                         ('clf', LogisticRegression(max_iter=max_iter,
                                                    solver=solver,
                                                    random_state=rand_seed))])

    gs_lr_tfidf = GridSearchCV(lr_tfidf,
                               param_grid,
                               scoring='accuracy',
                               cv=cross_val,
                               verbose=1,
                               n_jobs=jobs)

    GSCV_time = time.time() - start_time
    print("GridSearchCV running time: %.12f" % GSCV_time + " seconds.\n\n")
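    # NOTE: `tokenizer` and `tokenizer_porter`, swapped in via
    # 'vect__tokenizer' above, are defined elsewhere in this project. A
    # minimal sketch of what they are assumed to do (plain whitespace split
    # vs. a Porter-stemmed split) -- illustrative only:
    #
    #   from nltk.stem.porter import PorterStemmer
    #   porter = PorterStemmer()
    #
    #   def tokenizer(text):
    #       return text.split()
    #
    #   def tokenizer_porter(text):
    #       return [porter.stem(word) for word in text.split()]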
    #%% ###########################################################################
    # Fitting data and testing
    start_time = time.time()
    print("Fitting training data to best grid search results...")
    print("Shape X_train:", X_train.shape)
    print("Shape y_train:", y_train.shape)
    print("Shape X_test:", X_test.shape)
    print("Shape y_test:", y_test.shape)

    gs_lr_tfidf.fit(X_train, y_train)
    print('Best parameter set: %s ' % gs_lr_tfidf.best_params_)

    logname = fname + "_best_parameters.log"
    log_string = ("n_gram_min, n_gram_max, cross_val, jobs, max_iter, "
                  "rand_seed, solver\n")
    log_string += (str(n_gram_min) + ", " + str(n_gram_max) + ", " +
                   str(cross_val) + ", " + str(jobs) + ", " +
                   str(max_iter) + ", " + str(rand_seed) + ", " +
                   solver + "\n")
    log_string += "Best parameter set: %s\n" % gs_lr_tfidf.best_params_ + "\n\n"
    write_log(log_string, logname)

    print()
    print('CV Accuracy: %.3f' % gs_lr_tfidf.best_score_)
    print()

    clf = gs_lr_tfidf.best_estimator_
    print('Train Accuracy: %.3f' % clf.score(X_train, y_train))
    print('Test Accuracy: %.3f' % clf.score(X_test, y_test))

    fit_time = time.time() - start_time
    print("Fitting and testing running time: %.12f" % fit_time +
          " seconds.\n\n")

    #%%
    # Test on unseen data
    confmat = testing.conf_mat(clf, X_test, y_test)
    print(confmat)
    axes_name = "LR " + solver
    view_data.plot_confusion_matrix(confmat, axes_name, ftime, fname)
    precision, recall, test_f1 = testing.p_r_f1(clf, X_test, y_test)
    error, test_accuracy = testing.acc_err(confmat)

    # Test on seen data
    confmat = testing.conf_mat(clf, X_train, y_train)
    pre, rec, train_f1 = testing.p_r_f1(clf, X_train, y_train)
    err, train_accuracy = testing.acc_err(confmat)

    ## Show the ROC AUC curve:
    # view_data.roc_auc(clf, X_train, y_train, ftime, fname)

    #%% ###########################################################################
    # Append this run's settings and results to the results table, writing the
    # header row only when the file does not exist yet.
    if not os.path.exists(tablename):
        table_string = ("n_gram_min, n_gram_max, cross_val, jobs, max_iter, "
                        "rand_seed, solver, unzip_time, clean_time, "
                        "stopword_time, tokenize_time, tfidf_time, GSCV_time, "
                        "fit_time, total_time, train_f1, train_acc, test_f1, "
                        "test_acc, precision, recall, error\n")
    else:
        table_string = ""
    table_string += (str(n_gram_min) + ", " + str(n_gram_max) + ", " +
                     str(cross_val) + ", " + str(jobs) + ", " +
                     str(max_iter) + ", " + str(rand_seed) + ", " +
                     solver + ", ")

    total_time = time.time() - program_time
    table_string += ("%.12f" % unzip_time + ", " + "%.12f" % clean_time +
                     ", " + "%.12f" % stopword_time + ", " +
                     "%.12f" % tokenize_time + ", " + "%.12f" % tfidf_time +
                     ", " + "%.12f" % GSCV_time + ", " + "%.12f" % fit_time +
                     ", " + "%.12f" % total_time + ", ")
    table_string += ("%.2f" % train_f1 + ", " + "%.2f" % train_accuracy +
                     ", " + "%.2f" % test_f1 + ", " + "%.2f" % test_accuracy +
                     ", " + "%.2f" % precision + ", " + "%.2f" % recall +
                     ", " + "%.2f" % error + "\n")
    write_log(table_string, tablename)

    print("Total program running time: %.12f" % total_time + " seconds.\n\n")
    return
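
# ------------------------------------------------------------------------------
# Minimal sketch of the module-level wiring this script assumes (the real
# definitions live elsewhere in the project): `reporthook` is a download
# progress callback for urlretrieve, `write_log` appends text to a file, and
# an argparse block supplies the arguments passed into main(). Flag names,
# types, and defaults below are inferred from how main() uses them and are
# illustrative, not the project's actual CLI definition.
# ------------------------------------------------------------------------------
def reporthook(count, block_size, total_size):
    # urlretrieve calls this with (block number, block size, total size).
    percent = min(int(count * block_size * 100 / max(total_size, 1)), 100)
    sys.stdout.write('\rDownloading... %d%%' % percent)
    sys.stdout.flush()


def write_log(log_string, logname):
    # Append so that repeated runs accumulate rows in the log/results table.
    with open(logname, 'a', encoding='utf-8') as f:
        f.write(log_string)


if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        description='IMDB sentiment analysis with TF-IDF + LogisticRegression')
    parser.add_argument('--unzip_IMDB', default='unzip')
    parser.add_argument('--n_gram_min', type=int, default=1)
    parser.add_argument('--n_gram_max', type=int, default=1)
    parser.add_argument('--max_iter', type=int, default=5000)
    parser.add_argument('--solver', default='lbfgs')
    parser.add_argument('--rand_seed', type=int, default=1)
    parser.add_argument('--cross_val', type=int, default=5)
    parser.add_argument('--jobs', type=int, default=-1)
    args = parser.parse_args()

    main(args.unzip_IMDB, args.n_gram_min, args.n_gram_max, args.max_iter,
         args.solver, args.rand_seed, args.cross_val, args.jobs)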