Example #1
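# The function below assumes these module-level imports (reconstructed from the
# calls it makes; `testing` and `view_data` are assumed to be project-local
# helper modules for metrics and plotting, not third-party packages).
import os
import sys
import tarfile
import time
from time import strftime

import nltk
import numpy as np
import pandas as pd
import pyprind
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline

import testing    # project-local helpers (confusion matrix, precision/recall)
import view_data  # project-local plotting helpers
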
def main(args):

    print("Entered Main...")
    if args.n_gram_max < args.n_gram_min:
        args.n_gram_max = args.n_gram_min
    if args.max_iter < 5000:
        args.max_iter = 5000
    fname = "Project_LR"
    tablename = fname + "_results" + ".csv"
    #%%
    ###########################################################################
    program_time = time.time()
    start_time = strftime("%b %d, %Y_%H.%M.%S %p", time.localtime())
    ftime = str(start_time)
    start_time = time.time()
    print("UnZip file to csv processing...")
    if args.unzip_IMDB == 'unzip':
        source = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
        target = 'aclImdb_v1.tar.gz'
        if not os.path.isdir('aclImdb') and not os.path.isfile(
                'aclImdb_v1.tar.gz'):

            if (sys.version_info < (3, 0)):
                import urllib
                urllib.urlretrieve(source, target, reporthook)

            else:
                import urllib.request
                urllib.request.urlretrieve(source, target, reporthook)
        if not os.path.isdir('aclImdb'):
            with tarfile.open(target, 'r:gz') as tar:
                tar.extractall()

        # ## Preprocessing the movie dataset into a more convenient format
        # the archive was extracted into ./aclImdb above; change `basepath`
        # if the unzipped movie dataset lives somewhere else
        basepath = 'aclImdb'
        labels = {'pos': 1, 'neg': 0}
        pbar = pyprind.ProgBar(50000)
        rows = []
        for s in ('test', 'train'):
            for l in ('pos', 'neg'):
                path = os.path.join(basepath, s, l)
                for file in os.listdir(path):
                    with open(os.path.join(path, file), 'r',
                              encoding='utf-8') as infile:
                        txt = infile.read()
                    # DataFrame.append was removed in pandas 2.0, so collect
                    # the rows in a list and build the frame once at the end
                    rows.append([txt, labels[l]])
                    pbar.update()
        df = pd.DataFrame(rows, columns=['review', 'sentiment'])

        # Shuffling the DataFrame:
        np.random.seed(0)
        df = df.reindex(np.random.permutation(df.index))

        # header = ['review', 'sentiment']
        df.to_csv('movie_data.csv', index=False, header=True)

    # Read the assembled data back in from the CSV file:
    df = pd.read_csv('movie_data.csv', encoding='utf-8')

    print(df.shape)
    unzip_time = ((time.time() - start_time))
    print("Zip to csv running time: %.12f" % unzip_time + " seconds.\n\n")

    #%%
    ###########################################################################
    # clean the data of web markups
    # test that the data web markup cleaning works
    # print(preprocessor(df.loc[0, 'review'][:]))
    # apply the cleaning of web markup characters from the whole dataset
    start_time = time.time()
    print("Cleaning Markup Language processing...")
    df['review'] = df['review'].apply(preprocessor)

    # save the cleaned data to file
    df.to_csv('movie_data_clean.csv', index=False, header=True)
    clean_time = ((time.time() - start_time))
    print("Cleaning Markup Language running time: %.12f" % clean_time +
          " seconds.\n\n")

    #%%
    ###########################################################################
    # stop_words_ = text.ENGLISH_STOP_WORDS

    start_time = time.time()
    print("stopwords processing...")

    nltk.download('stopwords')
    stop = stopwords.words('english')
    print("stop type:", type(stop))

    stopword_time = ((time.time() - start_time))
    print("Stopwords running time: %.12f" % stopword_time + " seconds.\n\n")

    #%%
    ###########################################################################
    # Tokenize the bag of words database
    #### PRE-PROCESSING - SPLITTING A SAMPLE OF THE DATA ####
    start_time = time.time()
    print("Tokenize processing...")
    X = df.loc[:, 'review']
    y = df.loc[:, 'sentiment']
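    # the first split keeps only a stratified 2.5% sample (~1,250 of the
    # 50,000 reviews), presumably to keep the grid search tractable; the
    # second split carves that sample into 75% train / 25% test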
    X_junk, X_1250, y_junk, y_1250 = train_test_split(
        X, y, test_size=0.025, random_state=args.rand_seed, stratify=y)
    X_train, X_test, y_train, y_test = train_test_split(
        X_1250,
        y_1250,
        test_size=0.25,
        random_state=args.rand_seed,
        stratify=y_1250)

    print("Split Data Train  \tTest:", str(X_train.shape), "\t",
          str(y_train.shape))
    print("Split Target Train\tTest:", str(X_test.shape), "\t",
          str(y_test.shape))

    print("Shape X_train:", X_train.shape)
    print("Shape y_train:", y_train.shape)
    print("Shape X_test", X_test.shape)
    print("Shape y_test", y_test.shape)

    print("X_train type:", type(X_train[1:2]))

    print("Range of training data: ", range(len(X_train)))

    print("Contents of X_train[1] before tokenizeing with stop words:\n",
          X_train[1:2])
    print()

    print("Contents of X_test[1] before tokenizeing with stop words:\n",
          X_test[1:2])
    print()

    print()
    tokenize_time = ((time.time() - start_time))
    print("Tokenize running time: %.12f" % tokenize_time + " seconds.\n\n")

    #%%
    ###########################################################################
    # Term Frequency Inverse Document Frequency
    start_time = time.time()
    print("Term Frequency Inverse Document Frequency processing...")
    tfidf = TfidfVectorizer(strip_accents=None,
                            lowercase=False,
                            preprocessor=None)
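    # stop words, tokenizer, n-gram range, idf weighting and normalization are
    # left at their defaults here; each candidate setting is supplied through
    # the `vect__*` entries of the parameter grid below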
    tfidf_time = ((time.time() - start_time))
    print("Term Frequency Inverse Document Frequency running time: %.12f" %
          tfidf_time + " seconds.\n\n")

    #%%
    ###########################################################################
    # Grid Search CV
    start_time = time.time()
    print("GridSearchCV processing...")
    print("CMD arguments:\n")
    print("n_gram_min:", args.n_gram_min)
    print("n_gram_max:", args.n_gram_max)
    print("max_iter:", args.max_iter)
    print("solver:", args.solver)
    print("rand_seed:", args.rand_seed)
    print("cross_val:", args.cross_val)
    print("jobs:", args.jobs)

    # choose a solver-compatible first penalty to pair with 'l2' in the grid:
    # liblinear and saga also support 'l1'; the remaining solvers are compared
    # against no penalty (newer scikit-learn expects None rather than 'none')
    if args.solver in ('liblinear', 'saga'):
        penalty1 = 'l1'
    else:
        penalty1 = 'none'
    penalty2 = 'l2'
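    # two candidate grids: the first keeps the default tf-idf weighting, the
    # second (use_idf=False, norm=None) scores the same candidates on raw
    # term frequencies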

    param_grid = [
        {
            'vect__ngram_range': [(args.n_gram_min, args.n_gram_max)],
            'vect__stop_words': [stop, None],
            'vect__tokenizer': [tokenizer, tokenizer_porter],
            'clf__penalty': [penalty1, penalty2],
            'clf__C': [1.0, 10.0, 100.0]
        },
        {
            'vect__ngram_range': [(args.n_gram_min, args.n_gram_max)],
            'vect__stop_words': [stop, None],
            'vect__tokenizer': [tokenizer, tokenizer_porter],
            'vect__use_idf': [False],
            'vect__norm': [None],
            'clf__penalty': [penalty1, penalty2],
            'clf__C': [1.0, 10.0, 100.0]
        },
    ]

    lr_tfidf = Pipeline([('vect', tfidf),
                         ('clf',
                          LogisticRegression(max_iter=args.max_iter,
                                             solver=args.solver,
                                             random_state=args.rand_seed))])

    gs_lr_tfidf = GridSearchCV(lr_tfidf,
                               param_grid,
                               scoring='accuracy',
                               cv=args.cross_val,
                               verbose=1,
                               n_jobs=args.jobs)
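    # only the search object is constructed here, so GSCV_time measures setup
    # cost; the expensive cross-validated fits run in the next section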

    GSCV_time = ((time.time() - start_time))
    print("GridSearchCV running time: %.12f" % GSCV_time + " seconds.\n\n")

    #%%
    ###########################################################################
    # Fitting data and testing
    start_time = time.time()
    print("Fitting training data to best grid search results...")
    print("Shape X_train:", X_train.shape)
    print("Shape y_train:", y_train.shape)
    print("Shape X_test", X_test.shape)
    print("Shape y_test", y_test.shape)
    gs_lr_tfidf.fit(X_train, y_train)

    print('Best parameter set: %s ' % gs_lr_tfidf.best_params_)
    logname = fname + "_best_parameters.log"
    log_string = "n_gram_min, n_gram_max, cross_val, jobs, max_iter, rand_seed, solver\n"
    log_string += str(args.n_gram_min) + ", " + str(
        args.n_gram_max) + ", " + str(args.cross_val) + ", " + str(
            args.jobs) + ", " + str(args.max_iter) + ", " + str(
                args.rand_seed) + ", " + args.solver + "\n"
    log_string += "Best parameter set: %s\n" % gs_lr_tfidf.best_params_ + "\n\n"
    write_log(log_string, logname)
    print()
    print('CV Accuracy: %.3f' % gs_lr_tfidf.best_score_)
    print()

    clf = gs_lr_tfidf.best_estimator_
    print('Train Accuracy: %.3f' % clf.score(X_train, y_train))
    print('Test Accuracy: %.3f' % clf.score(X_test, y_test))

    fit_time = ((time.time() - start_time))
    print("Fitting and testing running time: %.12f" % fit_time +
          " seconds.\n\n")

    #%%
    # Test on unseen data
    confmat = testing.conf_mat(clf, X_test, y_test)
    print(confmat)
    axes_name = "LR " + args.solver
    view_data.plot_confusion_matrix(confmat, axes_name, ftime, fname)
    precision, recall, test_f1 = testing.p_r_f1(clf, X_test, y_test)
    error, test_accuracy = testing.acc_err(confmat)
    # Test on seen data
    confmat = testing.conf_mat(clf, X_train, y_train)
    pre, rec, train_f1 = testing.p_r_f1(clf, X_train, y_train)
    err, train_accuracy = testing.acc_err(confmat)
    ## show roc_auc curve
    # view_data.roc_auc(clf, X_train, y_train, ftime, fname)
    #%%
    ###########################################################################
    if not os.path.exists(tablename):
        table_string = "n_gram_min, n_gram_max, cross_val, jobs, max_iter, rand_seed, solver, unzip_time, clean_time, stopword_time, tokenize_time, ftidf_time, GSCV_time, fit_time, total_time, train_f1, train_acc, test_f1, test_acc, precision, recall, error\n"
        table_string += str(args.n_gram_min) + ", " + str(
            args.n_gram_max) + ", " + str(args.cross_val) + ", " + str(
                args.jobs) + ", " + str(args.max_iter) + ", " + str(
                    args.rand_seed) + ", " + args.solver + ", "
    else:
        table_string = str(args.n_gram_min) + ", " + str(
            args.n_gram_max) + ", " + str(args.cross_val) + ", " + str(
                args.jobs) + ", " + str(args.max_iter) + ", " + str(
                    args.rand_seed) + ", " + args.solver + ", "

    total_time = ((time.time() - program_time))
    table_string += "%.12f" % unzip_time + ", " + "%.12f" % clean_time + ", " + "%.12f" % stopword_time + ", " + "%.12f" % tokenize_time + ", " + "%.12f" % tfidf_time + ", " + "%.12f" % GSCV_time + ", " + "%.12f" % fit_time + ", " + "%.12f" % total_time + ", "
    table_string += "%.2f" % train_f1 + ", " + "%.2f" % train_accuracy + ", " + "%.2f" % test_f1 + ", " + "%.2f" % test_accuracy + ", " + "%.2f" % precision + ", " + "%.2f" % recall + ", " + "%.2f" % error + "\n"
    write_log(table_string, tablename)
    print("Total program running time: %.12f" % total_time + " seconds.\n\n")

    return
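

###########################################################################
# main() relies on several project-local helpers that are not shown in this
# example (preprocessor, tokenizer, tokenizer_porter, reporthook, write_log)
# and on an `args` namespace. The definitions below are a minimal sketch of
# what they might look like, reconstructed from how each name is used above;
# the bodies and the argparse defaults are illustrative assumptions, not the
# author's original code.
import argparse
import re

from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()


def preprocessor(text):
    # assumed behavior: strip HTML markup, keep emoticons, lowercase the rest
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    return (re.sub(r'[\W]+', ' ', text.lower()) + ' ' +
            ' '.join(emoticons).replace('-', ''))


def tokenizer(text):
    # plain whitespace split
    return text.split()


def tokenizer_porter(text):
    # whitespace split followed by Porter stemming
    return [porter.stem(word) for word in text.split()]


def reporthook(count, block_size, total_size):
    # minimal progress callback for urlretrieve
    percent = min(int(count * block_size * 100 / max(total_size, 1)), 100)
    sys.stdout.write("\rDownloading... %d%%" % percent)
    sys.stdout.flush()


def write_log(log_string, logname):
    # append the prepared string to the named log / CSV file
    with open(logname, 'a', encoding='utf-8') as logfile:
        logfile.write(log_string)


if __name__ == '__main__':
    # hypothetical command-line wiring; the flag names mirror the fields read
    # from `args` in main(), the defaults are illustrative only
    parser = argparse.ArgumentParser(
        description="IMDB sentiment: TF-IDF + LogisticRegression grid search")
    parser.add_argument('--unzip_IMDB', default='unzip')
    parser.add_argument('--n_gram_min', type=int, default=1)
    parser.add_argument('--n_gram_max', type=int, default=1)
    parser.add_argument('--max_iter', type=int, default=5000)
    parser.add_argument('--solver', default='lbfgs')
    parser.add_argument('--rand_seed', type=int, default=1)
    parser.add_argument('--cross_val', type=int, default=5)
    parser.add_argument('--jobs', type=int, default=-1)
    main(parser.parse_args())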