# NOTE: the imports and constants below are reconstructed from usage in this
# file; the module names behind the du, cu, and fi aliases are assumptions
# (jsonutils and wfcvutils are referenced by name elsewhere in this file).
import numpy as np
import matplotlib.pyplot as plt
import sklearn.svm as svm
import sklearn.tree as tree
import sklearn.neighbors as neigh
import sklearn.ensemble as ensemble
import sklearn.linear_model as linmod
import sklearn.feature_selection as fs
import sklearn.metrics as metrics

import jsonutils
import jsonutils as ju
import dateutils as du    # assumed: date2int, str2date, gen_dataset, month
import csvutils as cu     # assumed: load_matrix, write_objects
import featinfo as fi     # assumed: data_feat_info, label, restaurant_filter
import wfcvutils          # wfcv, print_cm, print_reg_metrics

# classifier-type constants (assumed values; only equality comparisons matter)
linsvm, rbfsvm, knn, ada, rf, dt = 'linsvm', 'rbfsvm', 'knn', 'ada', 'rf', 'dt'

def run_script(pdate_str, busjson, revjson, tipjson, senticsv, outfile):
    # convert pdate to seconds since the epoch
    pdate = du.date2int(du.str2date(pdate_str))

    # load business objects
    print('loading business objects from %s...' % busjson)
    all_buses, junk = ju.load_objects(busjson)

    # load review objects
    print('loading review objects from %s...' % revjson)
    all_reviews, junk = ju.load_objects(revjson)

    # load tip objects
    print('loading tip objects from %s...' % tipjson)
    all_tips, junk = ju.load_objects(tipjson)

    # load sentiment ranking data derived from tip and review data
    print('loading sentiment rankings from %s...' % senticsv)
    all_senti = cu.load_matrix(senticsv, has_hdr=False)

    # generate a data set for the specified prediction date
    print('generate data set for prediction date %s...' % pdate_str)
    buses = du.gen_dataset(pdate, all_buses, all_reviews, all_tips, all_senti)
    
    # write data set to file
    print('writing generated data set to %s...' % outfile)
    ju.save_objects(buses, outfile)
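
# A hedged usage sketch (hypothetical file paths, not from the source): build
# the dataset for a single prediction date and save it as JSON.
#
#   run_script('2014-07-01', 'yelp_business.json', 'yelp_review.json',
#              'yelp_tip.json', 'sentiment.csv', 'dataset-2014-07-01.json')
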
def gen_dataset_files(pdates, busjson, revjson, tipjson, outdir):
    # load business objects
    print('loading business objects from %s...' % busjson)
    all_buses, junk = jsonutils.load_objects(busjson)

    # load review objects
    print('loading review objects from %s...' % revjson)
    all_reviews, junk = jsonutils.load_objects(revjson)

    # load tip objects
    print('loading tip objects from %s...' % tipjson)
    all_tips, junk = jsonutils.load_objects(tipjson)

    # generate the datasets
    for pdatestr in pdates:
        # convert prediction date to int (seconds since epoch)
        pdate = du.date2int(du.str2date(pdatestr))

        # generate the dataset for the specified prediction date
        print('generating dataset for prediction date %s (%d)...' % (pdatestr, pdate))
        buses = du.gen_dataset(pdate, all_buses, all_reviews, all_tips)

        # generate filename for dataset
        outfile = outdir + '/' + pdatestr + '.json'

        # write dataset to file
        print('writing %d JSON objects to %s...' % (len(buses), outfile))
        jsonutils.save_objects(buses, outfile)
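
# A hedged usage sketch (hypothetical dates and paths): write one dataset file
# per prediction date into the output directory, named <pdatestr>.json.
#
#   gen_dataset_files(['2014-01-01', '2014-07-01'],
#                     'yelp_business.json', 'yelp_review.json',
#                     'yelp_tip.json', 'datasets')
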
def run_script(jsonfile, csvfile):
    # load json objects
    print('loading JSON objects from %s...' % jsonfile)
    objects, columns = ju.load_objects(jsonfile)

    # write json object to csv file
    print('writing JSON objects to %s...' % csvfile)
    cu.write_objects(csvfile, objects, columns)
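
# A hedged usage sketch (hypothetical paths): flatten previously generated
# JSON objects into a CSV file, one column per attribute name.
#
#   run_script('dataset-2014-07-01.json', 'dataset-2014-07-01.csv')
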
def run_script(jsonfile, attr1, attr2=None, omitLabels=None):
    # load json objects
    print('loading JSON objects from %s...' % jsonfile)
    objects, columns = ju.load_objects(jsonfile)

    # convert to matrix form
    print('converting JSON to matrix...')
    X,columns = ju.get_matrix(objects, fi.data_feat_info)

    # get class labels
    y_idx = columns.index(fi.label)
    y = X[:,y_idx]
    if (omitLabels):
        labels = [int(x) for x in np.unique(y) if x not in omitLabels]
    else:
        labels = [int(x) for x in np.unique(y)]

    print(labels)

    # get the data for attribute 1, adding small uniform jitter so that
    # overlapping (integer-valued) points remain distinguishable in the plot
    attr1_idx = columns.index(attr1)
    x1 = X[:,attr1_idx] + np.random.uniform(0,0.25,X.shape[0])

    # get the data for attribute 2 (with the same jitter); if no second
    # attribute was given, plot against uniform random values instead
    if (attr2 is not None):
        attr2_idx = columns.index(attr2)
        x2 = X[:,attr2_idx] + np.random.uniform(0,0.25,X.shape[0])
    else:
        attr2 = 'random'
        x2 = np.random.uniform(0,1,X.shape[0])

    print('Plot %s vs %s...' % (attr1, attr2))
    plt.clf()

    # create data series
    series_x1 = []
    series_x2 = []
    for label in labels:
        idx = np.where(y==label)[0]
        print('  class %d: %d' % (label, len(idx)))
        series_x1.append(x1[idx])
        series_x2.append(x2[idx])

    colors = ['b', 'g', 'r', 'm', 'k']

    # index the series lists by position, not by label value (labels need not
    # be 0..n-1), and wrap around the color list if classes outnumber colors
    for i in range(len(labels)):
        plt.scatter(series_x1[i], series_x2[i], c=colors[i % len(colors)], marker='+')

    plt.show()
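
# A hedged usage sketch (hypothetical path and attribute names): scatter-plot
# two attributes colored by class label; omitting attr2 plots attr1 against
# random values so a single attribute's class spread is still visible.
#
#   run_script('dataset-2014-07-01.json', 'review_count', 'star_total')
#   run_script('dataset-2014-07-01.json', 'review_count', omitLabels=[0])
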
def run_script(busjson, revjson, tipjson, senticsv, init_pdate, delta, ctype=linsvm,
               usamp=True, binary=None, rfe=False, pca=-1, reg=False, feat_info=fi.data_feat_info,
               states=None):
    print('Initial prediction date: %s' % init_pdate)
    print('Time delta: %d months' % delta)
    if (states):
        print('limiting data to restaurants in: %s' % str(states))

    # convert pdate to seconds since the epoch
    pdate = du.date2int(du.str2date(init_pdate))

    # load business objects
    print('loading business objects from %s...' % busjson)
    all_buses, junk = ju.load_objects(busjson)

    # load review objects
    print('loading review objects from %s...' % revjson)
    all_reviews, junk = ju.load_objects(revjson)

    # load tip objects
    print('loading tip objects from %s...' % tipjson)
    all_tips, junk = ju.load_objects(tipjson)

    # load sentiment ranking data derived from tip and review data
    print('loading sentiment rankings from %s...' % senticsv)
    all_senti = cu.load_matrix(senticsv, has_hdr=False)

    # reduce the number of features using recursive feature elimination
    # - See http://scikit-learn.org/stable/auto_examples/plot_rfe_with_cross_validation.html#example-plot-rfe-with-cross-validation-py
    # - See http://stackoverflow.com/questions/23815938/recursive-feature-elimination-and-grid-search-using-scikit-learn

    if (reg):
        # create the least squares linear regressor
        print('using least squares linear regression...')
        c = linmod.LinearRegression()
        # ordinary least squares has no hyperparameters to tune, so no grid search
        param_grid = None
    elif (ctype==rbfsvm):
        # create RBF SVM to test
        #c = svm.NuSVC(kernel='rbf')
        c = svm.SVC(kernel='rbf')
        # configure parameter grid for grid search
        C_range = 10.0 ** np.arange(-3, 5)
        gamma_range = 10.0 ** np.arange(-4, 3)
        if (rfe):
            print('RFE not currently supported for RBF SVM...')
            #c = fs.RFECV(c, step=1)
            #pgrid = []
            #for C in C_range:
            #    for gamma in gamma_range:
            #        pgrid.append({'C':C,'gamma':gamma})
            #pgrid = [{'gamma':0.5},{'gamma':0.1},{'gamma':0.01},{'gamma':0.001},{'gamma':0.0001}]
            #param_grid = {'estimator_params': pgrid}
        print('using RBF SVM...')
        param_grid = dict(gamma=gamma_range, C=C_range)
    elif (ctype==knn):
        # create a KNN classifier
        c = neigh.KNeighborsClassifier()
        if (rfe):
            print('RFE not currently supported for k-nearest neighbors...')
        print('using k-nearest neighbors...')
        param_grid = {'n_neighbors':[1,2,3,4,5,6,7,8,9,10,15,20,25,30],
                      'weights':['uniform','distance'],
                      'p':[1,2,3,4,5,6,7,8,9,10]}
    elif (ctype==ada):
        # create boosted classifier
        c = ensemble.AdaBoostClassifier()
        if (rfe):
            print('RFE not currently supported for AdaBoost...')
        print('using AdaBoost...')
        param_grid = {'n_estimators':[5, 10, 25, 40, 50, 60, 75, 85, 100],
                      'learning_rate':[0.25, 0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0]}
    elif (ctype==rf):
        # create random forest classifier
        c = ensemble.RandomForestClassifier()
        if (rfe):
            print('RFE not currently supported for random forest...')
        print('using random forest...')
        param_grid = {'n_estimators':[5, 10, 25, 40, 50, 60, 75, 85, 100],
                      'max_depth':[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, None]}
    elif (ctype==dt):
        # create decision tree classifier
        c = tree.DecisionTreeClassifier()
        # max feats: if max_features were tuned, subtract 1 because the data
        # features include the class label
        if (rfe):
            print('RFE not supported with decision trees...')
        print('using decision tree...')
        param_grid = {'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, None]}
    else:
        # create linear SVM to test
        c = svm.LinearSVC()
        # configure parameter grid for grid search
        C_range = 10.0 ** np.arange(-3, 5)
        if (rfe):
            print('using linear SVM with RFE...')
            c = fs.RFECV(c, step=1)
            # one grid entry per candidate C value for the wrapped estimator
            pgrid = [{'C': C} for C in C_range]
            #pgrid = [{'C':0.01},{'C':0.1},{'C':1},{'C':10},{'C':100},{'C':1000},{'C':10000}]
            param_grid = {'estimator_params': pgrid}
        else:
            print('using linear SVM...')
            param_grid = {'C': C_range}

    # run the walk-forward cross validation and collect the results
    print('run walk-forward cross validation...')
    if (usamp):
        print('  under-sampling still open class...')
    else:
        print('  NOT under-sampling still open class...')
    results = wfcvutils.wfcv(c, param_grid, all_buses, all_reviews, all_tips, all_senti,
                             pdate, delta*du.month, pca=pca, usamp=usamp,
                             binary=binary, reg=reg, feat_info=feat_info, states=states)
    
    # combine the results to produce overall metrics
    y_true = None
    y_pred = None
    for r in results:
        if (y_true is None):
            y_true = r[0]
        else:
            y_true = np.hstack((y_true, r[0]))
        if (y_pred is None):
            y_pred = r[1]
        else:
            y_pred = np.hstack((y_pred, r[1]))

    # print out an overall classification report
    print('\n=========================================')
    print('Overall metrics for all prediction dates:\n')
    if (len(results) != 0):
        if (reg):
            wfcvutils.print_reg_metrics(y_true, y_pred)
        else:
            cm = metrics.confusion_matrix(y_true, y_pred)
            wfcvutils.print_cm(cm)
            #print(metrics.classification_report(y_true, y_pred, target_names=fi.class_names))
    else:
        print('  NO RESULTS\n')
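
# A hedged usage sketch (hypothetical paths; the keyword values are this
# module's own options): walk-forward cross-validate an RBF SVM at 6-month
# steps, restricted to Arizona restaurants.
#
#   run_script('yelp_business.json', 'yelp_review.json', 'yelp_tip.json',
#              'sentiment.csv', '2013-01-01', 6, ctype=rbfsvm, states=['AZ'])
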
def load_restaurants(file_path):
    return jsonutils.load_objects(file_path, filt=fi.restaurant_filter)
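
# A hedged usage sketch (hypothetical path): load only the business objects
# that pass the restaurant filter.
#
#   restaurants, columns = load_restaurants('yelp_business.json')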