예제 #1
0
def run_model5_mp(queue, col, trn, vld, tst, cat,
                  store_weather, store_data_max, \
                  model_param=1):
    if cat == 0:
        Y_hat2 = np.zeros((len(trn) + len(vld) + len(tst), 1))
    else:
        # normalize training, validing and testing data set
        nm_trn = normalize_store_data(trn, store_data_max)
        nm_vld = normalize_store_data(vld, store_data_max)
        nm_tst = normalize_store_data(tst, store_data_max)

        Y_hat = build_model5(nm_trn,
                             nm_vld,
                             nm_tst,
                             store_weather,
                             column=col,
                             alpha_train=model_param)

        # denormalize the sale
        Y_hat2 = denormalize_store_data(trn,
                                        vld,
                                        tst,
                                        Y_hat,
                                        store_data_max,
                                        column=col)
    queue.put((trn, vld, tst, Y_hat2, col, cat))
예제 #2
0
def run_model4_mp(queue, col, trn, vld, tst, cat,
                  store_weather, store_data_max, \
                  model_param=1):
    if cat == 0:
        Y_hat2 = np.zeros((len(trn) + len(vld) + len(tst), 1))
    else:
        nm_trn = normalize_store_data(trn, store_data_max)
        nm_vld = normalize_store_data(vld, store_data_max)
        nm_tst = normalize_store_data(tst, store_data_max)

        _, fmat = sim(nm_trn, nm_vld, nm_tst, store_weather)

        Y_hat = np.zeros((len(nm_trn) + len(nm_vld) + len(nm_tst), 1))
        X = fmat[:len(nm_trn)]

        Y = nm_trn[col].values[:, np.newaxis]
        clf = linear_model.Ridge(alpha=model_param)
        clf.fit(X, Y)
        Y_hat[:] = clf.predict(fmat)
        Y_hat2 = denormalize_store_data(trn,
                                        vld,
                                        tst,
                                        Y_hat,
                                        store_data_max,
                                        column=col)
    queue.put((trn, vld, tst, Y_hat2, col, cat))
예제 #3
0
def run_model1(store_data_file, store_weather_file, test_data_file, model_param, only_validate=False):
    """
    the model uses the square error to measure the difference between Y_hat and
    Y, and uses similarity to regulate Y_hat, that is, if one day's sale can be
    reconstructed by similar day's sale. The performance of the model in this
    task is not particularly good.
    """
    print "start here"

    # write header to test result
    with open('test_result.csv', 'w') as f:
        f.write('id,units\n')
        f.close()

    # load data
    store_data, store_weather, test = load_data2(store_data_file, \
          store_weather_file, test_data_file)

    # compute max item sales for each store as denominator
    store_data_max = store_data.groupby(level=1).max()

    # develop training and validation set
    train, valid = develop_valid_set2(store_data, store_weather, valid_size=100)

    # categorize testing data with a relevant but much smaller training set
    target_set = build_target_set2(train, valid, test, store_weather)

    # run prediction on testing data of each category
    for n, trn, vld, tst in target_set:
        print "%d, train(%d), valid(%d), test(%d)" % (n, len(trn), len(vld), len(tst))

        # normalize training, validing and testing data set
        nm_trn = normalize_store_data(trn, store_data_max)
        nm_vld = normalize_store_data(vld, store_data_max)
        nm_tst = normalize_store_data(tst, store_data_max)

        v_init = None
        Y_hat2 = None

        for i in range(1):
            # run prediction on all validation and testing data set
            Y_hat = build_model1(nm_trn, nm_vld, nm_tst, store_weather, \
                                 valid_init=v_init, alpha_train=model_param)

            # save the code in case the model has stacking effect
            #v_init=Y_hat[len(trn):]

            # denormalize the sale
            Y_hat2 = denormalize_store_data(trn, vld, tst, Y_hat, store_data_max)

            # evaluate error in training and validation set
            e1, e2 = eval_model(trn, vld, Y_hat2)
            print "error at %d is: train(%f), valid(%f)" % (i, e1, e2)

        # write results to test result
        write_submission(trn, vld, tst, Y_hat2, 'test_result.csv', 'valid_result')
예제 #4
0
def run_model5(store_data_file, store_weather_file, test_data_file, \
                 model_param=1, validate_only=False, eval_err=None):
    print "---------------------start here---------------------"
    test_result_file ='test_result.csv'

    # write header to test result
    with open(test_result_file, 'w') as f:
        f.write('id,units\n')
        f.close()

    # load data
    store_data, store_weather, test = load_data2(store_data_file, \
          store_weather_file, test_data_file)

    # compute max item sales for each store as denominator
    store_data_max = store_data.groupby(level=1).max()

    # categorize testing data with a relevant but much smaller training set
    target_set = build_target_set3(store_data, test, store_weather, store_data_max, valid_pct=0)

    # run prediction on testing data of each category
    for col, trn, vld, tst in target_set:
        print "%s, train(%d), valid(%d), test(%d), model_param(%f)" % (col, len(trn), len(vld), len(tst), model_param)
        if len(tst)==0: continue

        # normalize training, validing and testing data set
        nm_trn = normalize_store_data(trn, store_data_max)
        nm_vld = normalize_store_data(vld, store_data_max)
        nm_tst = normalize_store_data(tst, store_data_max)

        Y_hat=build_model5(nm_trn, nm_vld, nm_tst, store_weather, column=col, alpha_train=model_param)

        # denormalize the sale
        Y_hat2 = denormalize_store_data(trn, vld, tst, Y_hat, store_data_max, column=col)

        # evaluate error in training and validation set
        e1, e2 = eval_model(trn, vld, Y_hat2, column=col)
        print "error at item(%s) is: train(%f), valid(%f)" % (col, e1, e2)
        if eval_err is not None:
            eval_err.add_result(e1, len(trn), e2, len(vld))

        # write results to test result
        if not validate_only:
            write_submission(trn, vld, tst, Y_hat2, test_result_file, 'valid_result', column=col)

    # write out zero estimation
    if not validate_only:
        write_submission_zero(test, store_data_max, test_result_file)

    if eval_err is not None:
        e1, e2=eval_err.get_result()
        logging.info("model5(p=%f) error is: train(%f), valid(%f)" % (model_param, e1, e2))
        print "model5(p=%f) error is: train(%f), valid(%f)" % (model_param, e1, e2)
예제 #5
0
def run_model4v1(store_data_file, store_weather_file, test_data_file, \
                 model_param=1, validate_only=False, eval_err=None):
    """
    ridge regression with log error term
    """
    print "---------------------start here---------------------"
    test_result_file ='test_result.csv'

    with open(test_result_file, 'w') as f:
        f.write('id,units\n')
        f.close()

    store_data, store_weather, test = load_data2(store_data_file, \
          store_weather_file, test_data_file)

    store_data_max = store_data.groupby(level=1).max()

    # categorize testing data with a relevant but much smaller training set
    target_set = build_target_set3(store_data, test, store_weather, store_data_max, columns=set(['1']))

    for col, trn, vld, tst in target_set:
        print "item(%s), train(%d), valid(%d), test(%d), model_param(%f)" % (col, len(trn), len(vld), len(tst), model_param)
        if len(tst)==0: continue

        nm_trn = normalize_store_data(trn, store_data_max)
        nm_vld = normalize_store_data(vld, store_data_max)
        nm_tst = normalize_store_data(tst, store_data_max)

        Y_hat, fmat_wegith=build_model_log_ridge(nm_trn, nm_vld, nm_tst, store_weather,col, alpha=model_param)

        Y_hat2 = denormalize_store_data(trn, vld, tst, Y_hat[:,np.newaxis], store_data_max, column=col)

        # evaluate error in training and validation set
        e1, e2 = eval_model(trn, vld, Y_hat2, column=col)
        print "error at item(%s) is: train(%f), valid(%f)" % (col, e1, e2)
        if eval_err is not None:
            eval_err.add_result(e1, len(trn), e2, len(vld))

        # write results to test result
        if not validate_only:
            write_submission(trn, vld, tst, Y_hat2, test_result_file, 'valid_result', column=col)

    # write out zero estimation
    if not validate_only:
        write_submission_zero(test, store_data_max, test_result_file)

    if eval_err is not None:
        e1, e2=eval_err.get_result()
        logging.info("model4v1(p=%f) error is: train(%f), valid(%f)" % (model_param, e1, e2))
        print "model4v1(p=%f) error is: train(%f), valid(%f)" % (model_param, e1, e2)
예제 #6
0
def run_model5_mp(queue, col, trn, vld, tst, cat, 
                  store_weather, store_data_max, \
                  model_param=1):
    if cat==0:
        Y_hat2=np.zeros((len(trn)+len(vld)+len(tst), 1))
    else:
        # normalize training, validing and testing data set
        nm_trn = normalize_store_data(trn, store_data_max)
        nm_vld = normalize_store_data(vld, store_data_max)
        nm_tst = normalize_store_data(tst, store_data_max)

        Y_hat=build_model5(nm_trn, nm_vld, nm_tst, store_weather, column=col, alpha_train=model_param)

        # denormalize the sale
        Y_hat2 = denormalize_store_data(trn, vld, tst, Y_hat, store_data_max, column=col)
    queue.put((trn, vld, tst, Y_hat2, col, cat))    
예제 #7
0
def run_model3(store_data_file, store_weather_file, test_data_file, model_param=1, validate_only=False):
    print "start here"
    test_result_file ='test_result.csv'

    # write header to test result
    with open(test_result_file, 'w') as f:
        f.write('id,units\n')
        f.close()

    # load data
    store_data, store_weather, test = load_data2(store_data_file, \
          store_weather_file, test_data_file)

    # compute max item sales for each store as denominator
    store_data_max = store_data.groupby(level=1).max()

    # develop training and validation set
    train, valid = develop_valid_set2(store_data, store_weather, valid_size=100)

    # categorize testing data with a relevant but much smaller training set
    target_set = build_target_set3(train, valid, test, store_weather, store_data_max)

    # run prediction on testing data of each category
    for col, trn, vld, tst in target_set:
        print "%s, train(%d), valid(%d), test(%d)" % (col, len(trn), len(vld), len(tst))

        # normalize training, validing and testing data set
        nm_trn = normalize_store_data(trn, store_data_max)
        nm_vld = normalize_store_data(vld, store_data_max)
        nm_tst = normalize_store_data(tst, store_data_max)

        Y_hat=build_model3(nm_trn, nm_vld, nm_tst, store_weather, column=col, alpha_train=model_param)

        # denormalize the sale
        Y_hat2 = denormalize_store_data(trn, vld, tst, Y_hat, store_data_max, column=col)

        # evaluate error in training and validation set
        e1, e2 = eval_model(trn, vld, Y_hat2, column=col)
        print "error is: train(%f), valid(%f)" % (e1, e2)

        # write results to test result
        write_submission(trn, vld, tst, Y_hat2, test_result_file, 'valid_result', column=col)

    # write out zero estimation
    if not validate_only:
        write_submission_zero(test, store_data_max, test_result_file)
예제 #8
0
def helper_model1v1(df, offset, train, m, Y_hat, store_data_max, v_init=None):
    # construct feature matrix with one row more than training data
    m2=np.zeros((len(train)+1,m.shape[1]))

    # copy training feature matrix
    m2[:len(train)]=m[:len(train)]

    for i in range(len(df)):
        m2[-1]=m[offset+i]
        def l(i):
            g=np.dot(m2, m2[i])
            d=np.sum(g)
            g=-g
            g[i]=d+g[i]
            return g
        df0=df.iloc[i:i+1]
        Y_hat0 = build_model(train, None, df0, l, v_init)
        Y_hat1 = denormalize_store_data(train, None, df0, Y_hat0, store_data_max)
        Y_hat[offset+i]=Y_hat1[-1]
예제 #9
0
def run_model2(store_data_file, store_weather_file, test_data_file):
    print "start here"

    # write header to test result
    with open('test_result.csv', 'w') as f:
        f.write('id,units\n')
        f.close()

    # load data
    store_data, store_weather, test = load_data2(store_data_file, \
          store_weather_file, test_data_file)

    # compute max item sales for each store as denominator
    store_data_max = store_data.groupby(level=1).max()

    # develop training and validation set
    train, valid = develop_valid_set2(store_data, store_weather, valid_size=0)

    # categorize testing data with a relevant but much smaller training set
    target_set = build_target_set(train, valid, test, store_weather)

    # run prediction on testing data of each category
    for n, trn, vld, tst in target_set:
        print "%d, train(%d), valid(%d), test(%d)" % (n, len(trn), len(vld),
                                                      len(tst))

        # normalize training, validing and testing data set
        nm_trn = normalize_store_data(trn, store_data_max)
        nm_vld = normalize_store_data(vld, store_data_max)
        nm_tst = normalize_store_data(tst, store_data_max)

        Y_hat, theta = build_model2(nm_trn, nm_vld, nm_tst, store_weather)

        # denormalize the sale
        Y_hat2 = denormalize_store_data(trn, vld, tst, Y_hat, store_data_max)

        # evaluate error in training and validation set
        e1, e2 = eval_model(trn, vld, Y_hat2)
        print "error is: train(%f), valid(%f)" % (e1, e2)

        # write results to test result
        write_submission(trn, vld, tst, Y_hat2, 'test_result.csv')
예제 #10
0
def run_model2(store_data_file, store_weather_file, test_data_file):
    print "start here"

    # write header to test result
    with open('test_result.csv', 'w') as f:
        f.write('id,units\n')
        f.close()

    # load data
    store_data, store_weather, test = load_data2(store_data_file, \
          store_weather_file, test_data_file)

    # compute max item sales for each store as denominator
    store_data_max = store_data.groupby(level=1).max()

    # develop training and validation set
    train, valid = develop_valid_set2(store_data, store_weather, valid_size=0)

    # categorize testing data with a relevant but much smaller training set
    target_set = build_target_set(train, valid, test, store_weather)

    # run prediction on testing data of each category
    for n, trn, vld, tst in target_set:
        print "%d, train(%d), valid(%d), test(%d)" % (n, len(trn), len(vld), len(tst))

        # normalize training, validing and testing data set
        nm_trn = normalize_store_data(trn, store_data_max)
        nm_vld = normalize_store_data(vld, store_data_max)
        nm_tst = normalize_store_data(tst, store_data_max)

        Y_hat, theta=build_model2(nm_trn, nm_vld, nm_tst, store_weather)

        # denormalize the sale
        Y_hat2 = denormalize_store_data(trn, vld, tst, Y_hat, store_data_max)

        # evaluate error in training and validation set
        e1, e2 = eval_model(trn, vld, Y_hat2)
        print "error is: train(%f), valid(%f)" % (e1, e2)

        # write results to test result
        write_submission(trn, vld, tst, Y_hat2, 'test_result.csv')
예제 #11
0
def run_model4_mp(queue, col, trn, vld, tst, cat, 
                  store_weather, store_data_max, \
                  model_param=1):    
    if cat==0:        
        Y_hat2=np.zeros((len(trn)+len(vld)+len(tst), 1))
    else:        
        nm_trn = normalize_store_data(trn, store_data_max)
        nm_vld = normalize_store_data(vld, store_data_max)
        nm_tst = normalize_store_data(tst, store_data_max)

        _,fmat = sim(nm_trn, nm_vld, nm_tst, store_weather)

        Y_hat = np.zeros((len(nm_trn) + len(nm_vld) + len(nm_tst), 1))
        X = fmat[:len(nm_trn)]

        Y = nm_trn[col].values[:,np.newaxis]
        clf = linear_model.Ridge(alpha=model_param)
        clf.fit(X, Y)
        Y_hat[:] = clf.predict(fmat)
        Y_hat2 = denormalize_store_data(trn, vld, tst, Y_hat, store_data_max, column=col)
    queue.put((trn, vld, tst, Y_hat2, col, cat))
예제 #12
0
def helper_model1v1(df, offset, train, m, Y_hat, store_data_max, v_init=None):
    # construct feature matrix with one row more than training data
    m2 = np.zeros((len(train) + 1, m.shape[1]))

    # copy training feature matrix
    m2[:len(train)] = m[:len(train)]

    for i in range(len(df)):
        m2[-1] = m[offset + i]

        def l(i):
            g = np.dot(m2, m2[i])
            d = np.sum(g)
            g = -g
            g[i] = d + g[i]
            return g

        df0 = df.iloc[i:i + 1]
        Y_hat0 = build_model(train, None, df0, l, v_init)
        Y_hat1 = denormalize_store_data(train, None, df0, Y_hat0,
                                        store_data_max)
        Y_hat[offset + i] = Y_hat1[-1]
예제 #13
0
def run_model5(store_data_file, store_weather_file, test_data_file, \
                 model_param=1, validate_only=False, eval_err=None):
    print "---------------------start here---------------------"
    test_result_file = 'test_result.csv'

    # write header to test result
    with open(test_result_file, 'w') as f:
        f.write('id,units\n')
        f.close()

    # load data
    store_data, store_weather, test = load_data2(store_data_file, \
          store_weather_file, test_data_file)

    # compute max item sales for each store as denominator
    store_data_max = store_data.groupby(level=1).max()

    # categorize testing data with a relevant but much smaller training set
    target_set = build_target_set3(store_data,
                                   test,
                                   store_weather,
                                   store_data_max,
                                   valid_pct=0)

    # run prediction on testing data of each category
    for col, trn, vld, tst in target_set:
        print "%s, train(%d), valid(%d), test(%d), model_param(%f)" % (
            col, len(trn), len(vld), len(tst), model_param)
        if len(tst) == 0: continue

        # normalize training, validing and testing data set
        nm_trn = normalize_store_data(trn, store_data_max)
        nm_vld = normalize_store_data(vld, store_data_max)
        nm_tst = normalize_store_data(tst, store_data_max)

        Y_hat = build_model5(nm_trn,
                             nm_vld,
                             nm_tst,
                             store_weather,
                             column=col,
                             alpha_train=model_param)

        # denormalize the sale
        Y_hat2 = denormalize_store_data(trn,
                                        vld,
                                        tst,
                                        Y_hat,
                                        store_data_max,
                                        column=col)

        # evaluate error in training and validation set
        e1, e2 = eval_model(trn, vld, Y_hat2, column=col)
        print "error at item(%s) is: train(%f), valid(%f)" % (col, e1, e2)
        if eval_err is not None:
            eval_err.add_result(e1, len(trn), e2, len(vld))

        # write results to test result
        if not validate_only:
            write_submission(trn,
                             vld,
                             tst,
                             Y_hat2,
                             test_result_file,
                             'valid_result',
                             column=col)

    # write out zero estimation
    if not validate_only:
        write_submission_zero(test, store_data_max, test_result_file)

    if eval_err is not None:
        e1, e2 = eval_err.get_result()
        logging.info("model5(p=%f) error is: train(%f), valid(%f)" %
                     (model_param, e1, e2))
        print "model5(p=%f) error is: train(%f), valid(%f)" % (model_param, e1,
                                                               e2)
예제 #14
0
def run_model4v1(store_data_file, store_weather_file, test_data_file, \
                 model_param=1, validate_only=False, eval_err=None):
    """
    ridge regression with log error term
    """
    print "---------------------start here---------------------"
    test_result_file = 'test_result.csv'

    with open(test_result_file, 'w') as f:
        f.write('id,units\n')
        f.close()

    store_data, store_weather, test = load_data2(store_data_file, \
          store_weather_file, test_data_file)

    store_data_max = store_data.groupby(level=1).max()

    # categorize testing data with a relevant but much smaller training set
    target_set = build_target_set3(store_data,
                                   test,
                                   store_weather,
                                   store_data_max,
                                   columns=set(['1']))

    for col, trn, vld, tst in target_set:
        print "item(%s), train(%d), valid(%d), test(%d), model_param(%f)" % (
            col, len(trn), len(vld), len(tst), model_param)
        if len(tst) == 0: continue

        nm_trn = normalize_store_data(trn, store_data_max)
        nm_vld = normalize_store_data(vld, store_data_max)
        nm_tst = normalize_store_data(tst, store_data_max)

        Y_hat, fmat_wegith = build_model_log_ridge(nm_trn,
                                                   nm_vld,
                                                   nm_tst,
                                                   store_weather,
                                                   col,
                                                   alpha=model_param)

        Y_hat2 = denormalize_store_data(trn,
                                        vld,
                                        tst,
                                        Y_hat[:, np.newaxis],
                                        store_data_max,
                                        column=col)

        # evaluate error in training and validation set
        e1, e2 = eval_model(trn, vld, Y_hat2, column=col)
        print "error at item(%s) is: train(%f), valid(%f)" % (col, e1, e2)
        if eval_err is not None:
            eval_err.add_result(e1, len(trn), e2, len(vld))

        # write results to test result
        if not validate_only:
            write_submission(trn,
                             vld,
                             tst,
                             Y_hat2,
                             test_result_file,
                             'valid_result',
                             column=col)

    # write out zero estimation
    if not validate_only:
        write_submission_zero(test, store_data_max, test_result_file)

    if eval_err is not None:
        e1, e2 = eval_err.get_result()
        logging.info("model4v1(p=%f) error is: train(%f), valid(%f)" %
                     (model_param, e1, e2))
        print "model4v1(p=%f) error is: train(%f), valid(%f)" % (model_param,
                                                                 e1, e2)
예제 #15
0
def run_model4(store_data_file, store_weather_file, test_data_file, \
               model_param=1, validate_only=False, eval_err=None):
    """
    ridge regression
    """
    print "---------------------start here---------------------"
    test_result_file = 'test_result.csv'

    with open(test_result_file, 'w') as f:
        f.write('id,units\n')
        f.close()

    store_data, store_weather, test = load_data2(store_data_file, \
          store_weather_file, test_data_file)

    store_data_max = store_data.groupby(level=1).max()

    # categorize testing data with a relevant but much smaller training set
    target_set = build_target_set4(store_data, test, store_weather,
                                   store_data_max)

    for col, trn, vld, tst, cat in target_set:
        print "item(%s), train(%d), valid(%d), test(%d), model_param(%0.2f), cat(%d)" % \
              (col, len(trn), len(vld), len(tst), model_param, cat)
        if len(tst) == 0: continue

        if cat == 0:
            Y_hat2 = np.zeros((len(trn) + len(vld) + len(tst), 1))
        else:
            nm_trn = normalize_store_data(trn, store_data_max)
            nm_vld = normalize_store_data(vld, store_data_max)
            nm_tst = normalize_store_data(tst, store_data_max)

            _, fmat = sim(nm_trn, nm_vld, nm_tst, store_weather)

            Y_hat = np.zeros((len(nm_trn) + len(nm_vld) + len(nm_tst), 1))
            X = fmat[:len(nm_trn)]

            Y = nm_trn[col].values[:, np.newaxis]
            clf = linear_model.Ridge(alpha=model_param)
            clf.fit(X, Y)
            Y_hat[:] = clf.predict(fmat)

            Y_hat2 = denormalize_store_data(trn,
                                            vld,
                                            tst,
                                            Y_hat,
                                            store_data_max,
                                            column=col)

        # evaluate error in training and validation set
        e1, e2 = eval_model(trn, vld, Y_hat2, column=col)
        print "error at item(%s) is: train(%f), valid(%f)" % (col, e1, e2)
        if eval_err is not None:
            eval_err.add_result(e1, len(trn), e2, len(vld))

        # write results to test result
        if not validate_only:
            write_submission(trn,
                             vld,
                             tst,
                             Y_hat2,
                             test_result_file,
                             'valid_result',
                             column=col)

    # write out zero estimation
    if not validate_only:
        write_submission_zero(test, store_data_max, test_result_file)

    if eval_err is not None:
        e1, e2 = eval_err.get_result()
        logging.info("model4(p=%f) error is: train(%f), valid(%f)" %
                     (model_param, e1, e2))
        print "model4(p=%f) error is: train(%f), valid(%f)" % (model_param, e1,
                                                               e2)
예제 #16
0
def run_model3(store_data_file,
               store_weather_file,
               test_data_file,
               model_param=1,
               validate_only=False):
    print "start here"
    test_result_file = 'test_result.csv'

    # write header to test result
    with open(test_result_file, 'w') as f:
        f.write('id,units\n')
        f.close()

    # load data
    store_data, store_weather, test = load_data2(store_data_file, \
          store_weather_file, test_data_file)

    # compute max item sales for each store as denominator
    store_data_max = store_data.groupby(level=1).max()

    # develop training and validation set
    train, valid = develop_valid_set2(store_data,
                                      store_weather,
                                      valid_size=100)

    # categorize testing data with a relevant but much smaller training set
    target_set = build_target_set3(train, valid, test, store_weather,
                                   store_data_max)

    # run prediction on testing data of each category
    for col, trn, vld, tst in target_set:
        print "%s, train(%d), valid(%d), test(%d)" % (col, len(trn), len(vld),
                                                      len(tst))

        # normalize training, validing and testing data set
        nm_trn = normalize_store_data(trn, store_data_max)
        nm_vld = normalize_store_data(vld, store_data_max)
        nm_tst = normalize_store_data(tst, store_data_max)

        Y_hat = build_model3(nm_trn,
                             nm_vld,
                             nm_tst,
                             store_weather,
                             column=col,
                             alpha_train=model_param)

        # denormalize the sale
        Y_hat2 = denormalize_store_data(trn,
                                        vld,
                                        tst,
                                        Y_hat,
                                        store_data_max,
                                        column=col)

        # evaluate error in training and validation set
        e1, e2 = eval_model(trn, vld, Y_hat2, column=col)
        print "error is: train(%f), valid(%f)" % (e1, e2)

        # write results to test result
        write_submission(trn,
                         vld,
                         tst,
                         Y_hat2,
                         test_result_file,
                         'valid_result',
                         column=col)

    # write out zero estimation
    if not validate_only:
        write_submission_zero(test, store_data_max, test_result_file)
예제 #17
0
def run_model1(store_data_file,
               store_weather_file,
               test_data_file,
               model_param,
               only_validate=False):
    """
    the model uses the square error to measure the difference between Y_hat and
    Y, and uses similarity to regulate Y_hat, that is, if one day's sale can be
    reconstructed by similar day's sale. The performance of the model in this
    task is not particularly good.
    """
    print "start here"

    # write header to test result
    with open('test_result.csv', 'w') as f:
        f.write('id,units\n')
        f.close()

    # load data
    store_data, store_weather, test = load_data2(store_data_file, \
          store_weather_file, test_data_file)

    # compute max item sales for each store as denominator
    store_data_max = store_data.groupby(level=1).max()

    # develop training and validation set
    train, valid = develop_valid_set2(store_data,
                                      store_weather,
                                      valid_size=100)

    # categorize testing data with a relevant but much smaller training set
    target_set = build_target_set2(train, valid, test, store_weather)

    # run prediction on testing data of each category
    for n, trn, vld, tst in target_set:
        print "%d, train(%d), valid(%d), test(%d)" % (n, len(trn), len(vld),
                                                      len(tst))

        # normalize training, validing and testing data set
        nm_trn = normalize_store_data(trn, store_data_max)
        nm_vld = normalize_store_data(vld, store_data_max)
        nm_tst = normalize_store_data(tst, store_data_max)

        v_init = None
        Y_hat2 = None

        for i in range(1):
            # run prediction on all validation and testing data set
            Y_hat = build_model1(nm_trn, nm_vld, nm_tst, store_weather, \
                                 valid_init=v_init, alpha_train=model_param)

            # save the code in case the model has stacking effect
            #v_init=Y_hat[len(trn):]

            # denormalize the sale
            Y_hat2 = denormalize_store_data(trn, vld, tst, Y_hat,
                                            store_data_max)

            # evaluate error in training and validation set
            e1, e2 = eval_model(trn, vld, Y_hat2)
            print "error at %d is: train(%f), valid(%f)" % (i, e1, e2)

        # write results to test result
        write_submission(trn, vld, tst, Y_hat2, 'test_result.csv',
                         'valid_result')
예제 #18
0
def run_model4(store_data_file, store_weather_file, test_data_file, \
               model_param=1, validate_only=False, eval_err=None):
    """
    ridge regression
    """
    print "---------------------start here---------------------"
    test_result_file ='test_result.csv'

    with open(test_result_file, 'w') as f:
        f.write('id,units\n')
        f.close()

    store_data, store_weather, test = load_data2(store_data_file, \
          store_weather_file, test_data_file)

    store_data_max = store_data.groupby(level=1).max()

    # categorize testing data with a relevant but much smaller training set
    target_set = build_target_set4(store_data, test, store_weather, store_data_max)

    for col, trn, vld, tst, cat in target_set:
        print "item(%s), train(%d), valid(%d), test(%d), model_param(%0.2f), cat(%d)" % \
              (col, len(trn), len(vld), len(tst), model_param, cat)
        if len(tst)==0: continue

        if cat==0:
            Y_hat2=np.zeros((len(trn)+len(vld)+len(tst), 1))
        else:
            nm_trn = normalize_store_data(trn, store_data_max)
            nm_vld = normalize_store_data(vld, store_data_max)
            nm_tst = normalize_store_data(tst, store_data_max)

            _,fmat = sim(nm_trn, nm_vld, nm_tst, store_weather)

            Y_hat = np.zeros((len(nm_trn) + len(nm_vld) + len(nm_tst), 1))
            X = fmat[:len(nm_trn)]

            Y = nm_trn[col].values[:,np.newaxis]
            clf = linear_model.Ridge(alpha=model_param)
            clf.fit(X, Y)
            Y_hat[:] = clf.predict(fmat)

            Y_hat2 = denormalize_store_data(trn, vld, tst, Y_hat, store_data_max, column=col)

        # evaluate error in training and validation set
        e1, e2 = eval_model(trn, vld, Y_hat2, column=col)
        print "error at item(%s) is: train(%f), valid(%f)" % (col, e1, e2)
        if eval_err is not None:
            eval_err.add_result(e1, len(trn), e2, len(vld))

        # write results to test result
        if not validate_only:
            write_submission(trn, vld, tst, Y_hat2, test_result_file, 'valid_result', column=col)

    # write out zero estimation
    if not validate_only:
        write_submission_zero(test, store_data_max, test_result_file)

    if eval_err is not None:
        e1, e2=eval_err.get_result()
        logging.info("model4(p=%f) error is: train(%f), valid(%f)" % (model_param, e1, e2))
        print "model4(p=%f) error is: train(%f), valid(%f)" % (model_param, e1, e2)