def run_script(pdate_str, busjson, revjson, tipjson, senticsv, outfile):
    """Generate a prediction data set for one date and write it to disk.

    Loads the business, review and tip JSON objects plus the sentiment
    ranking matrix, builds the data set for the given prediction date, and
    saves the resulting business objects to ``outfile``.

    Args:
        pdate_str: prediction date as a string (parsed by ``du.str2date``).
        busjson:   path to the business objects JSON file.
        revjson:   path to the review objects JSON file.
        tipjson:   path to the tip objects JSON file.
        senticsv:  path to the sentiment-ranking CSV (no header row).
        outfile:   path the generated data set is written to.
    """
    # convert pdate to seconds since the epoch
    pdate = du.date2int(du.str2date(pdate_str))

    # load business objects
    print('Loading business objects from %s...' % busjson)
    all_buses, junk = ju.load_objects(busjson)

    # load review objects
    print('loading review objects from %s...' % revjson)
    all_reviews, junk = ju.load_objects(revjson)

    # load tip objects
    print('loading tip objects from %s...' % tipjson)
    all_tips, junk = ju.load_objects(tipjson)

    # load sentiment ranking data derived from tip and review data
    print('loading sentiment rankings from %s...' % senticsv)
    all_senti = cu.load_matrix(senticsv, has_hdr=False)

    # generate a data set for the specified prediction date
    print('generate data set for prediction date %s...' % pdate_str)
    buses = du.gen_dataset(pdate, all_buses, all_reviews, all_tips, all_senti)

    # write data set to file
    print('writing generated data set to %s...' % outfile)
    ju.save_objects(buses, outfile)
def wfcv(clf, param_grid, all_buses, all_reviews, all_tips, all_senti,
         init_pdate, time_delta, feat_info=fi.data_feat_info, std_data=True,
         usamp=True, binary=None, reg=False, pca=-1, states=None):
    """Evaluate an estimator with walk-forward cross validation.

    Starting at ``init_pdate``, repeatedly: use the current data set as the
    training set, advance the prediction date by ``time_delta``, generate a
    new data set at that date as the test set, optionally standardize and/or
    PCA-transform the data, fit ``clf`` (via grid search when ``param_grid``
    is given), and record predictions on the test set.  Stops two
    ``time_delta`` periods before the latest review date.

    Args:
        clf:        estimator (classifier or regressor) to evaluate.
        param_grid: grid-search parameter grid, or a falsy value to fit
                    ``clf`` directly without a grid search.
        all_buses, all_reviews, all_tips, all_senti: raw data the per-round
                    data sets are generated from (see ``du.gen_dataset``).
        init_pdate: first prediction date (seconds since the epoch).
        time_delta: walk-forward step size (seconds).
        feat_info:  feature metadata passed to ``ju.json2xy``.
        std_data:   if True, standardize features (scaler fit on train set).
        usamp:      if True, undersample for balanced classes and score with
                    accuracy; otherwise score with f1.
        binary, states: forwarded to ``du.gen_dataset``.
        reg:        if True, treat as regression (use ``fi.target``);
                    otherwise classification (use ``fi.label``).
        pca:        number of PCA components to keep; 0 keeps all components;
                    negative disables PCA.

    Returns:
        list of ``(y_test, y_pred)`` tuples, one per walk-forward round.
    """
    # find the earliest and latest review dates
    start_date = int(time.time())
    end_date = 0
    for bus in all_buses:
        first_review_date = bus[fi.first_review_date]
        last_review_date = bus[fi.last_review_date]
        if (first_review_date < start_date):
            start_date = first_review_date
        if (last_review_date > end_date):
            end_date = last_review_date

    # print out earliest and latest dates
    print('Earliest review date: %s' % du.date2str(du.int2date(start_date)))
    print('Latest review date: %s' % du.date2str(du.int2date(end_date)))

    # initialize the "prediction date"
    pdate = init_pdate

    # create variables for the training data - it will be populated later
    X_train_orig, y_train = None, None

    # generate the first data set
    buses_test = du.gen_dataset(pdate, all_buses, all_reviews, all_tips,
                                all_senti, usamp=usamp, states=states,
                                binary=binary, reg=reg)
    if (reg):
        # extract the target value as the y values for regression
        X_test_orig, y_test = ju.json2xy(buses_test, feat_info, fi.target, std=False)
    else:
        # extract the label value as the y values for classification
        X_test_orig, y_test = ju.json2xy(buses_test, feat_info, fi.label, std=False)
    print('Number of attributes in data set: %d' % X_test_orig.shape[1])

    # initialize the stop_date threshold
    stop_date = end_date - 2*time_delta

    # create list to hold results
    results = []

    # configure scoring metric to be used during grid search and feature selection
    if (usamp):
        # if class sizes are balanced then use accuracy
        scorer = 'accuracy'
    else:
        # if class sizes are unbalanced then use f1 score
        scorer = 'f1'

    # resolve the PCA component count ONCE, without mutating the pca
    # parameter: the original code reassigned pca = None inside the loop,
    # so with pca=0 every round after the first evaluated None >= 0
    # (False in Py2 -> PCA silently skipped; TypeError in Py3)
    n_components = None if pca == 0 else pca

    # perform "walk forward cross validation"
    while (pdate <= stop_date):
        print('\n===================================================================')
        print("Train estimator using train set with prediction date %s:" % du.date2str(du.int2date(pdate)))

        # update the prediction date for this round
        pdate = pdate + time_delta

        # use current test set as training set for this round
        X_train_orig = X_test_orig
        y_train = y_test

        # generate a new test set for this round
        buses_test = du.gen_dataset(pdate, all_buses, all_reviews, all_tips,
                                    all_senti, usamp=usamp, states=states,
                                    binary=binary, reg=reg)
        if (reg):
            # extract the target value as the y values for regression
            X_test_orig, y_test = ju.json2xy(buses_test, feat_info, fi.target, std=False)
        else:
            # extract the label value as the y values for classification
            X_test_orig, y_test = ju.json2xy(buses_test, feat_info, fi.label, std=False)

        # by default, use the original untransformed X data
        # - X_train & X_test will contain the transformed data (if any
        #   transformation is done)
        X_train = X_train_orig
        X_test = X_test_orig

        # ===========================================
        # apply any requested data transformations

        # standardize the data
        # See http://scikit-learn.org/stable/modules/preprocessing.html
        if (std_data):
            print(' Standardize the data...')
            # scaler is trained on training set
            scaler = prep.StandardScaler().fit(X_train_orig)
            # scaler is used to transform both train and test data
            X_train = scaler.transform(X_train_orig)
            X_test = scaler.transform(X_test_orig)

        # reduce the dimension of the data using PCA
        # See http://scikit-learn.org/stable/auto_examples/applications/face_recognition.html#example-applications-face-recognition-py
        if (pca >= 0):
            print(' Reduce dimension using PCA...')
            rand_pca = decomp.RandomizedPCA(n_components=n_components, whiten=True)
            # fit PCA on the training data
            rand_pca.fit(X_train)
            # transform train and test sets using PCA
            X_train = rand_pca.transform(X_train)
            X_test = rand_pca.transform(X_test)
            print(' featues remaining after PCA: %d' % X_train.shape[1])

        # data transformations complete
        # ===========================================

        # use grid search to train and test the classifier:
        # - see http://scikit-learn.org/stable/auto_examples/grid_search_digits.html#example-grid-search-digits-py
        if (param_grid):
            # train the classifier using grid search
            gs = grid_search.GridSearchCV(clf, param_grid, n_jobs=-1, scoring=scorer)
            #gs = grid_search.GridSearchCV(clf, param_grid, scoring=scorer)
        else:
            # use the classifier/regressor without grid search
            gs = clf

        print('\nTraining the estimator...')
        gs.fit(X_train, y_train)

        if (param_grid):
            print("\nBest parameters set found on train set:\n")
            print(gs.best_estimator_)
            print("\nGrid scores on train set:\n")
            for params, mean_score, scores in gs.grid_scores_:
                print(" %0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() / 2, params))

        # if using RFE - print out number of features selected
        # TBD

        # collect predictions from the classifier
        print('\nTesting the estimator...')
        y_pred = gs.predict(X_test)
        print("\nResults for test set with prediction date %s:\n" % du.date2str(du.int2date(pdate)))
        if (reg):
            # print out explained variance score, mean absolute error, mean
            # squared error and R-squared metrics
            print_reg_metrics(y_test, y_pred)
        else:
            # print out the confusion matrix
            cm = metrics.confusion_matrix(y_test, y_pred)
            print_cm(cm)
            #print("\nScores on evaluation set:\n")
            #print(metrics.classification_report(y_test, y_pred, target_names=fi.class_names))

        # save results
        results.append((y_test, y_pred))
    #end while

    # return the true values and predictions for each round
    return results