def run_script(pdate_str, busjson, revjson, tipjson, senticsv, outfile):
    """Generate a prediction data set for one date and write it to disk.

    Loads the business, review and tip JSON objects plus the sentiment
    ranking matrix, builds the data set for the given prediction date, and
    saves the resulting business objects to ``outfile``.

    Args:
        pdate_str: prediction date as a string (parsed by ``du.str2date``).
        busjson:   path to the business objects JSON file.
        revjson:   path to the review objects JSON file.
        tipjson:   path to the tip objects JSON file.
        senticsv:  path to the sentiment-ranking CSV (no header row).
        outfile:   path the generated data set is written to.
    """
    # convert pdate to seconds since the epoch
    pdate = du.date2int(du.str2date(pdate_str))

    # load business objects
    print('Loading business objects from %s...' % busjson)
    all_buses, junk = ju.load_objects(busjson)

    # load review objects
    print('loading review objects from %s...' % revjson)
    all_reviews, junk = ju.load_objects(revjson)

    # load tip objects
    print('loading tip objects from %s...' % tipjson)
    all_tips, junk = ju.load_objects(tipjson)

    # load sentiment ranking data derived from tip and review data
    print('loading sentiment rankings from %s...' % senticsv)
    all_senti = cu.load_matrix(senticsv, has_hdr=False)

    # generate a data set for the specified prediction date
    print('generate data set for prediction date %s...' % pdate_str)
    buses = du.gen_dataset(pdate, all_buses, all_reviews, all_tips, all_senti)

    # write data set to file
    print('writing generated data set to %s...' % outfile)
    ju.save_objects(buses, outfile)
def wfcv(clf, param_grid, all_buses, all_reviews, all_tips, all_senti,
         init_pdate, time_delta, feat_info=fi.data_feat_info, std_data=True,
         usamp=True, binary=None, reg=False, pca=-1, states=None):
    """Evaluate an estimator with walk-forward cross validation.

    Starting at ``init_pdate``, repeatedly: use the current data set as the
    training set, advance the prediction date by ``time_delta``, generate a
    new data set at that date as the test set, optionally standardize and/or
    PCA-transform the data, fit ``clf`` (via grid search when ``param_grid``
    is given), and record predictions on the test set.  Stops two
    ``time_delta`` periods before the latest review date.

    Args:
        clf:        estimator (classifier or regressor) to evaluate.
        param_grid: grid-search parameter grid, or a falsy value to fit
                    ``clf`` directly without a grid search.
        all_buses, all_reviews, all_tips, all_senti: raw data the per-round
                    data sets are generated from (see ``du.gen_dataset``).
        init_pdate: first prediction date (seconds since the epoch).
        time_delta: walk-forward step size (seconds).
        feat_info:  feature metadata passed to ``ju.json2xy``.
        std_data:   if True, standardize features (scaler fit on train set).
        usamp:      if True, undersample for balanced classes and score with
                    accuracy; otherwise score with f1.
        binary, states: forwarded to ``du.gen_dataset``.
        reg:        if True, treat as regression (use ``fi.target``);
                    otherwise classification (use ``fi.label``).
        pca:        number of PCA components to keep; 0 keeps all components;
                    negative disables PCA.

    Returns:
        list of ``(y_test, y_pred)`` tuples, one per walk-forward round.
    """
    # find the earliest and latest review dates
    start_date = int(time.time())
    end_date = 0
    for bus in all_buses:
        first_review_date = bus[fi.first_review_date]
        last_review_date = bus[fi.last_review_date]
        if (first_review_date < start_date):
            start_date = first_review_date
        if (last_review_date > end_date):
            end_date = last_review_date

    # print out earliest and latest dates
    print('Earliest review date: %s' % du.date2str(du.int2date(start_date)))
    print('Latest review date: %s' % du.date2str(du.int2date(end_date)))

    # initialize the "prediction date"
    pdate = init_pdate

    # create variables for the training data - it will be populated later
    X_train_orig, y_train = None, None

    # generate the first data set
    buses_test = du.gen_dataset(pdate, all_buses, all_reviews, all_tips,
                                all_senti, usamp=usamp, states=states,
                                binary=binary, reg=reg)
    if (reg):
        # extract the target value as the y values for regression
        X_test_orig, y_test = ju.json2xy(buses_test, feat_info, fi.target, std=False)
    else:
        # extract the label value as the y values for classification
        X_test_orig, y_test = ju.json2xy(buses_test, feat_info, fi.label, std=False)
    print('Number of attributes in data set: %d' % X_test_orig.shape[1])

    # initialize the stop_date threshold
    stop_date = end_date - 2*time_delta

    # create list to hold results
    results = []

    # configure scoring metric to be used during grid search and feature selection
    if (usamp):
        # if class sizes are balanced then use accuracy
        scorer = 'accuracy'
    else:
        # if class sizes are unbalanced then use f1 score
        scorer = 'f1'

    # resolve the PCA component count ONCE, without mutating the pca
    # parameter: the original code reassigned pca = None inside the loop,
    # so with pca=0 every round after the first evaluated None >= 0
    # (False in Py2 -> PCA silently skipped; TypeError in Py3)
    n_components = None if pca == 0 else pca

    # perform "walk forward cross validation"
    while (pdate <= stop_date):
        print('\n===================================================================')
        print("Train estimator using train set with prediction date %s:" % du.date2str(du.int2date(pdate)))

        # update the prediction date for this round
        pdate = pdate + time_delta

        # use current test set as training set for this round
        X_train_orig = X_test_orig
        y_train = y_test

        # generate a new test set for this round
        buses_test = du.gen_dataset(pdate, all_buses, all_reviews, all_tips,
                                    all_senti, usamp=usamp, states=states,
                                    binary=binary, reg=reg)
        if (reg):
            # extract the target value as the y values for regression
            X_test_orig, y_test = ju.json2xy(buses_test, feat_info, fi.target, std=False)
        else:
            # extract the label value as the y values for classification
            X_test_orig, y_test = ju.json2xy(buses_test, feat_info, fi.label, std=False)

        # by default, use the original untransformed X data
        # - X_train & X_test will contain the transformed data (if any
        #   transformation is done)
        X_train = X_train_orig
        X_test = X_test_orig

        # ===========================================
        # apply any requested data transformations

        # standardize the data
        # See http://scikit-learn.org/stable/modules/preprocessing.html
        if (std_data):
            print(' Standardize the data...')
            # scaler is trained on training set
            scaler = prep.StandardScaler().fit(X_train_orig)
            # scaler is used to transform both train and test data
            X_train = scaler.transform(X_train_orig)
            X_test = scaler.transform(X_test_orig)

        # reduce the dimension of the data using PCA
        # See http://scikit-learn.org/stable/auto_examples/applications/face_recognition.html#example-applications-face-recognition-py
        if (pca >= 0):
            print(' Reduce dimension using PCA...')
            rand_pca = decomp.RandomizedPCA(n_components=n_components, whiten=True)
            # fit PCA on the training data
            rand_pca.fit(X_train)
            # transform train and test sets using PCA
            X_train = rand_pca.transform(X_train)
            X_test = rand_pca.transform(X_test)
            print(' featues remaining after PCA: %d' % X_train.shape[1])

        # data transformations complete
        # ===========================================

        # use grid search to train and test the classifier:
        # - see http://scikit-learn.org/stable/auto_examples/grid_search_digits.html#example-grid-search-digits-py
        if (param_grid):
            # train the classifier using grid search
            gs = grid_search.GridSearchCV(clf, param_grid, n_jobs=-1, scoring=scorer)
            #gs = grid_search.GridSearchCV(clf, param_grid, scoring=scorer)
        else:
            # use the classifier/regressor without grid search
            gs = clf

        print('\nTraining the estimator...')
        gs.fit(X_train, y_train)

        if (param_grid):
            print("\nBest parameters set found on train set:\n")
            print(gs.best_estimator_)
            print("\nGrid scores on train set:\n")
            for params, mean_score, scores in gs.grid_scores_:
                print(" %0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() / 2, params))

        # if using RFE - print out number of features selected
        # TBD

        # collect predictions from the classifier
        print('\nTesting the estimator...')
        y_pred = gs.predict(X_test)
        print("\nResults for test set with prediction date %s:\n" % du.date2str(du.int2date(pdate)))
        if (reg):
            # print out explained variance score, mean absolute error, mean
            # squared error and R-squared metrics
            print_reg_metrics(y_test, y_pred)
        else:
            # print out the confusion matrix
            cm = metrics.confusion_matrix(y_test, y_pred)
            print_cm(cm)
            #print("\nScores on evaluation set:\n")
            #print(metrics.classification_report(y_test, y_pred, target_names=fi.class_names))

        # save results
        results.append((y_test, y_pred))
    #end while

    # return the true values and predictions for each round
    return results