def run_script(pdate_str, busjson, revjson, tipjson, senticsv, outfile):
    """Generate the dataset for one prediction date and write it to disk.

    pdate_str -- prediction date as a string (parsed by du.str2date)
    busjson   -- path to the business objects JSON file
    revjson   -- path to the review objects JSON file
    tipjson   -- path to the tip objects JSON file
    senticsv  -- path to the sentiment-ranking CSV (no header row)
    outfile   -- path the generated dataset is written to
    """
    # convert pdate to seconds since the epoch
    pdate = du.date2int(du.str2date(pdate_str))

    # load business objects
    # (second element of the returned tuple is unused here)
    print('Loading business objects from %s...' % busjson)
    all_buses, junk = ju.load_objects(busjson)

    # load review objects
    print('loading review objects from %s...' % revjson)
    all_reviews, junk = ju.load_objects(revjson)

    # load tip objects
    print('loading tip objects from %s...' % tipjson)
    all_tips, junk = ju.load_objects(tipjson)

    # load sentiment ranking data derived from tip and review data
    print('loading sentiment rankings from %s...' % senticsv)
    all_senti = cu.load_matrix(senticsv, has_hdr=False)

    # generate a data set for the specified prediction date
    # NOTE: unified the mixed print-statement / print-function styles to the
    # single-argument function form, which behaves identically in Python 2.
    print('generate data set for prediction date %s...' % pdate_str)
    buses = du.gen_dataset(pdate, all_buses, all_reviews, all_tips, all_senti)

    # write data set to file
    print('writing generated data set to %s...' % outfile)
    ju.save_objects(buses, outfile)
def gen_dataset_files(pdates, busjson, revjson, tipjson, outdir): # load business objects print 'Loading business objects from %s...' % busjson all_buses, junk = jsonutils.load_objects(busjson) # load review objects print 'loading review objects from %s...' % revjson all_reviews, junk = jsonutils.load_objects(revjson) # load tip objects print 'loading tip objects from %s...' % tipjson all_tips, junk = jsonutils.load_objects(tipjson) # generate the datsets for pdatestr in pdates: # convert prediction date to int (seconds since epoch) pdate = date2int(str2date(pdatestr)) # generate the dataset for the specified prediction date print 'generating dataset for prediction date %s (%d)...' % (pdatestr,pdate) buses = gen_dataset(pdate, all_buses, all_reviews, all_tips) # generate filename for dataset outfile = outdir + '/' + pdatestr + '.json' # write dataset to file print 'writing %d JSON objects to %s...' % (len(buses),outfile) jsonutils.save_objects(buses, outfile)
def run_script(jsonfile, csvfile):
    """Convert a file of JSON objects into a CSV file.

    jsonfile -- path of the JSON objects file to read
    csvfile  -- path of the CSV file to write
    """
    # load json objects (loader also reports the column names)
    # NOTE: unified the mixed print-statement / print-function styles to the
    # single-argument function form, which behaves identically in Python 2.
    print('Loading JSON objects from %s...' % jsonfile)
    objects, columns = ju.load_objects(jsonfile)

    # write json objects to csv file
    print('writing JSON objects to %s...' % csvfile)
    cu.write_objects(csvfile, objects, columns)
def run_script(jsonfile, attr1, attr2=None, omitLabels=None):
    """Scatter-plot two attributes of the objects in *jsonfile*, one color per class.

    jsonfile   -- JSON objects file, converted to a matrix via fi.data_feat_info
    attr1      -- attribute name plotted on the x axis
    attr2      -- attribute name plotted on the y axis; when None, uniform
                  random values are used so 1-D data can still be inspected
    omitLabels -- optional collection of class-label values to exclude
    """
    # load json objects
    print('Loading JSON objects from %s...' % jsonfile)
    objects, columns = ju.load_objects(jsonfile)

    # convert to matrix form
    print('Convert JSON to matrix...')
    X, columns = ju.get_matrix(objects, fi.data_feat_info)

    # get class labels, dropping any the caller asked to omit
    y_idx = columns.index(fi.label)
    y = X[:, y_idx]
    if (omitLabels):
        labels = [int(x) for x in np.unique(y) if x not in omitLabels]
    else:
        labels = [int(x) for x in np.unique(y)]
    print(labels)

    # attribute-1 data, jittered slightly so coincident points are visible
    attr1_idx = columns.index(attr1)
    x1 = X[:, attr1_idx] + np.random.uniform(0, 0.25, X.shape[0])

    # attribute-2 data (or pure noise when no second attribute was given)
    if (attr2 is not None):
        attr2_idx = columns.index(attr2)
        x2 = X[:, attr2_idx] + np.random.uniform(0, 0.25, X.shape[0])
    else:
        attr2 = 'random'
        x2 = np.random.uniform(0, 1, X.shape[0])

    print('Plot %s vs %s...' % (attr1, attr2))
    plt.clf()

    # split the data into one series per class label (positional order)
    series_x1 = []
    series_x2 = []
    for label in labels:
        idx = np.where(y == label)[0]
        print(' class %d: %d' % (label, len(idx)))
        series_x1.append(x1[idx])
        series_x2.append(x2[idx])

    colors = ['b', 'g', 'r', 'm', 'k']
    # BUG FIX: the series lists are indexed by position, not by label value.
    # The original loop used the label value itself as the index, which raises
    # IndexError (or plots the wrong series) whenever labels are not exactly
    # 0..k -- e.g. when omitLabels removes a low label or labels start at 1.
    # The color index is wrapped so >len(colors) classes cannot crash either.
    for i, label in enumerate(labels):
        plt.scatter(series_x1[i], series_x2[i], c=colors[i % len(colors)], marker='+')
    plt.show()
def run_script(busjson, revjson, tipjson, senticsv, init_pdate, delta, ctype=linsvm, usamp=True, binary=None, rfe=False, pca=-1, reg=False, feat_info=fi.data_feat_info, states=None):
    """Run walk-forward cross validation over the data and print overall metrics.

    busjson/revjson/tipjson -- paths to the business/review/tip JSON files
    senticsv   -- path to the sentiment-ranking CSV (no header row)
    init_pdate -- initial prediction date string (parsed by du.str2date)
    delta      -- walk-forward step size in months
    ctype      -- estimator selector (linsvm, rbfsvm, knn, ada, rf, dt)
    usamp      -- under-sample the still-open class when True
    binary     -- passed through to wfcvutils.wfcv
    rfe        -- recursive feature elimination (only supported for linear SVM)
    pca        -- passed through to wfcvutils.wfcv (-1 disables)
    reg        -- use least-squares linear regression instead of a classifier
    feat_info  -- feature description used to build the data matrix
    states     -- optional list of states to restrict the restaurants to
    """
    print('Initial prediction date: %s' % init_pdate)
    print('Time delta: %d months' % delta)
    if (states):
        print('limiting data to restaurants in: %s' % str(states))

    # convert pdate to seconds since the epoch
    pdate = du.date2int(du.str2date(init_pdate))

    # load business objects
    print('Loading business objects from %s...' % busjson)
    all_buses, junk = ju.load_objects(busjson)

    # load review objects
    print('loading review objects from %s...' % revjson)
    all_reviews, junk = ju.load_objects(revjson)

    # load tip objects
    print('loading tip objects from %s...' % tipjson)
    all_tips, junk = ju.load_objects(tipjson)

    # load sentiment ranking data derived from tip and review data
    print('loading sentiment rankings from %s...' % senticsv)
    all_senti = cu.load_matrix(senticsv, has_hdr=False)

    # select the estimator and the grid-search parameter grid
    # RFE background:
    # - http://scikit-learn.org/stable/auto_examples/plot_rfe_with_cross_validation.html#example-plot-rfe-with-cross-validation-py
    # - http://stackoverflow.com/questions/23815938/recursive-feature-elimination-and-grid-search-using-scikit-learn
    if (reg):
        # least squares linear regression (grid search not supported)
        print('using least squares linear regression...')
        c = linmod.LinearRegression()
        param_grid = None
    elif (ctype==rbfsvm):
        # RBF-kernel SVM, grid over C and gamma
        c = svm.SVC(kernel='rbf')
        C_range = 10.0 ** np.arange(-3, 5)
        gamma_range = 10.0 ** np.arange(-4, 3)
        if (rfe):
            print('RFE not currently supported for RBF SVM...')
        print('using RBF SVM...')
        param_grid = dict(gamma=gamma_range, C=C_range)
    elif (ctype==knn):
        # k-nearest neighbors
        # (typo fixes: messages previously read 'k-nearesrt'/'k-mearest')
        c = neigh.KNeighborsClassifier()
        if (rfe):
            print('RFE not currently supported for k-nearest neighbors...')
        print('using k-nearest neighbors...')
        param_grid = {'n_neighbors':[1,2,3,4,5,6,7,8,9,10,15,20,25,30],
                      'weights':['uniform','distance'],
                      'p':[1,2,3,4,5,6,7,8,9,10]}
    elif (ctype==ada):
        # boosted classifier
        c = ensemble.AdaBoostClassifier()
        if (rfe):
            print('RFE not currently supported for AdaBoost...')
        print('using AdaBoost...')
        param_grid = {'n_estimators':[5, 10, 25, 40, 50, 60, 75, 85, 100],
                      'learning_rate':[0.25, 0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0]}
    elif (ctype==rf):
        # random forest classifier
        c = ensemble.RandomForestClassifier()
        if (rfe):
            print('RFE not currently supported for random forest...')
        print('using random forest...')
        param_grid = {'n_estimators':[5, 10, 25, 40, 50, 60, 75, 85, 100],
                      'max_depth':[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, None]}
    elif (ctype==dt):
        # decision tree classifier
        c = tree.DecisionTreeClassifier()
        if (rfe):
            print('RFE not supported with decision trees...')
        print('using decision tree...')
        param_grid = {'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, None]}
    else:
        # linear SVM (the default), optionally wrapped in RFE
        c = svm.LinearSVC()
        C_range = 10.0 ** np.arange(-3, 5)
        if (rfe):
            print('using linear SVM with RFE...')
            c = fs.RFECV(c, step=1)
            pgrid = [{'C': C} for C in C_range]
            param_grid = {'estimator_params': pgrid}
        else:
            print('using linear SVM...')
            param_grid = {'C': C_range}

    # run the walk-forward cross validation and collect the results
    print('run walk-forward cross validation...')
    if (usamp):
        print(' under-sampling still open class...')
    else:
        print(' NOT under-sampling still open class...')
    results = wfcvutils.wfcv(c, param_grid, all_buses, all_reviews, all_tips,
                             all_senti, pdate, delta*du.month, pca=pca,
                             usamp=usamp, binary=binary, reg=reg,
                             feat_info=feat_info, states=states)

    # combine the per-date (y_true, y_pred) pairs to produce overall metrics
    y_true = None
    y_pred = None
    for r in results:
        if (y_true is None):
            y_true = r[0]
        else:
            y_true = np.hstack((y_true, r[0]))
        if (y_pred is None):
            y_pred = r[1]
        else:
            y_pred = np.hstack((y_pred, r[1]))

    # print out an overall classification (or regression) report
    print('\n=========================================')
    print('Overall metrics for all prediction dates:\n')
    if (len(results) != 0):
        if (reg):
            wfcvutils.print_reg_metrics(y_true, y_pred)
        else:
            cm = metrics.confusion_matrix(y_true, y_pred)
            wfcvutils.print_cm(cm)
    else:
        print(' NO RESULTS\n')
def load_restaurants(file_path):
    """Load only the restaurant objects from the JSON file at *file_path*."""
    # delegate to the generic loader, restricting it to restaurants
    restaurant_filter = fi.restaurant_filter
    return jsonutils.load_objects(file_path, filt=restaurant_filter)