def run_model5_mp(queue, col, trn, vld, tst, cat, store_weather, store_data_max, \
                  model_param=1):
    if cat == 0:
        Y_hat2 = np.zeros((len(trn) + len(vld) + len(tst), 1))
    else:
        # normalize training, validation and testing data sets
        nm_trn = normalize_store_data(trn, store_data_max)
        nm_vld = normalize_store_data(vld, store_data_max)
        nm_tst = normalize_store_data(tst, store_data_max)
        Y_hat = build_model5(nm_trn, nm_vld, nm_tst, store_weather,
                             column=col, alpha_train=model_param)
        # denormalize the sales
        Y_hat2 = denormalize_store_data(trn, vld, tst, Y_hat, store_data_max,
                                        column=col)
    queue.put((trn, vld, tst, Y_hat2, col, cat))
def run_model4_mp(queue, col, trn, vld, tst, cat, store_weather, store_data_max, \
                  model_param=1):
    if cat == 0:
        Y_hat2 = np.zeros((len(trn) + len(vld) + len(tst), 1))
    else:
        nm_trn = normalize_store_data(trn, store_data_max)
        nm_vld = normalize_store_data(vld, store_data_max)
        nm_tst = normalize_store_data(tst, store_data_max)
        _, fmat = sim(nm_trn, nm_vld, nm_tst, store_weather)
        Y_hat = np.zeros((len(nm_trn) + len(nm_vld) + len(nm_tst), 1))
        X = fmat[:len(nm_trn)]
        Y = nm_trn[col].values[:, np.newaxis]
        clf = linear_model.Ridge(alpha=model_param)
        clf.fit(X, Y)
        Y_hat[:] = clf.predict(fmat)
        Y_hat2 = denormalize_store_data(trn, vld, tst, Y_hat, store_data_max,
                                        column=col)
    queue.put((trn, vld, tst, Y_hat2, col, cat))
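# A minimal sketch of how these worker functions might be dispatched, using
# only the standard-library multiprocessing API. It assumes target_set yields
# (col, trn, vld, tst, cat) tuples as build_target_set4 does in run_model4
# below; one process per item is illustrative, not a tuned choice.
from multiprocessing import Process, Queue

def run_model_mp_driver(target_set, store_weather, store_data_max, \
                        worker=run_model5_mp, model_param=1):
    queue = Queue()
    procs = []
    for col, trn, vld, tst, cat in target_set:
        p = Process(target=worker,
                    args=(queue, col, trn, vld, tst, cat,
                          store_weather, store_data_max, model_param))
        p.start()
        procs.append(p)
    # drain the queue before joining so no worker blocks on a full pipe
    results = [queue.get() for _ in procs]
    for p in procs:
        p.join()
    return results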
def run_model1(store_data_file, store_weather_file, test_data_file, \
               model_param, only_validate=False):
    """
    The model measures the difference between Y_hat and Y with squared error
    and regularizes Y_hat with similarity: one day's sales should be
    reconstructable from the sales of similar days. The model's performance
    on this task is not particularly good.
    """
    print "start here"
    # write header to test result
    with open('test_result.csv', 'w') as f:
        f.write('id,units\n')
    # load data
    store_data, store_weather, test = load_data2(store_data_file, \
        store_weather_file, test_data_file)
    # compute max item sales for each store as denominator
    store_data_max = store_data.groupby(level=1).max()
    # develop training and validation set
    train, valid = develop_valid_set2(store_data, store_weather, valid_size=100)
    # categorize testing data with a relevant but much smaller training set
    target_set = build_target_set2(train, valid, test, store_weather)
    # run prediction on testing data of each category
    for n, trn, vld, tst in target_set:
        print "%d, train(%d), valid(%d), test(%d)" % (n, len(trn), len(vld), len(tst))
        # normalize training, validation and testing data sets
        nm_trn = normalize_store_data(trn, store_data_max)
        nm_vld = normalize_store_data(vld, store_data_max)
        nm_tst = normalize_store_data(tst, store_data_max)
        v_init = None
        Y_hat2 = None
        for i in range(1):
            # run prediction on the validation and testing sets
            Y_hat = build_model1(nm_trn, nm_vld, nm_tst, store_weather, \
                valid_init=v_init, alpha_train=model_param)
            # kept in case the model has a stacking effect
            #v_init = Y_hat[len(trn):]
            # denormalize the sales
            Y_hat2 = denormalize_store_data(trn, vld, tst, Y_hat, store_data_max)
            # evaluate error on training and validation set
            e1, e2 = eval_model(trn, vld, Y_hat2)
            print "error at %d is: train(%f), valid(%f)" % (i, e1, e2)
        # write results to test result
        write_submission(trn, vld, tst, Y_hat2, 'test_result.csv', 'valid_result')
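# build_model1's internals are not shown here; from the docstring above and
# the Laplacian row function l(i) in helper_model1v1 below, a plausible form
# of its objective is squared error on the observed days plus alpha times the
# quadratic similarity penalty Y_hat' L Y_hat with L = D - W. A sketch under
# that assumption, not the author's confirmed formulation:
def model1_objective(Y_hat, Y, W, alpha):
    err = np.sum((Y_hat[:len(Y)] - Y) ** 2)   # squared error on training days
    L = np.diag(W.sum(axis=1)) - W            # graph Laplacian of similarity W
    reg = float(Y_hat.T.dot(L).dot(Y_hat))    # penalize differing similar days
    return err + alpha * reg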
def run_model5(store_data_file, store_weather_file, test_data_file, \
               model_param=1, validate_only=False, eval_err=None):
    print "---------------------start here---------------------"
    test_result_file = 'test_result.csv'
    # write header to test result
    with open(test_result_file, 'w') as f:
        f.write('id,units\n')
    # load data
    store_data, store_weather, test = load_data2(store_data_file, \
        store_weather_file, test_data_file)
    # compute max item sales for each store as denominator
    store_data_max = store_data.groupby(level=1).max()
    # categorize testing data with a relevant but much smaller training set
    target_set = build_target_set3(store_data, test, store_weather,
                                   store_data_max, valid_pct=0)
    # run prediction on testing data of each category
    for col, trn, vld, tst in target_set:
        print "%s, train(%d), valid(%d), test(%d), model_param(%f)" % \
            (col, len(trn), len(vld), len(tst), model_param)
        if len(tst) == 0:
            continue
        # normalize training, validation and testing data sets
        nm_trn = normalize_store_data(trn, store_data_max)
        nm_vld = normalize_store_data(vld, store_data_max)
        nm_tst = normalize_store_data(tst, store_data_max)
        Y_hat = build_model5(nm_trn, nm_vld, nm_tst, store_weather,
                             column=col, alpha_train=model_param)
        # denormalize the sales
        Y_hat2 = denormalize_store_data(trn, vld, tst, Y_hat, store_data_max,
                                        column=col)
        # evaluate error on training and validation set
        e1, e2 = eval_model(trn, vld, Y_hat2, column=col)
        print "error at item(%s) is: train(%f), valid(%f)" % (col, e1, e2)
        if eval_err is not None:
            eval_err.add_result(e1, len(trn), e2, len(vld))
        # write results to test result
        if not validate_only:
            write_submission(trn, vld, tst, Y_hat2, test_result_file,
                             'valid_result', column=col)
    # write out zero estimation
    if not validate_only:
        write_submission_zero(test, store_data_max, test_result_file)
    if eval_err is not None:
        e1, e2 = eval_err.get_result()
        logging.info("model5(p=%f) error is: train(%f), valid(%f)"
                     % (model_param, e1, e2))
        print "model5(p=%f) error is: train(%f), valid(%f)" % (model_param, e1, e2)
def run_model4v1(store_data_file, store_weather_file, test_data_file, \
                 model_param=1, validate_only=False, eval_err=None):
    """ ridge regression with log error term """
    print "---------------------start here---------------------"
    test_result_file = 'test_result.csv'
    with open(test_result_file, 'w') as f:
        f.write('id,units\n')
    store_data, store_weather, test = load_data2(store_data_file, \
        store_weather_file, test_data_file)
    store_data_max = store_data.groupby(level=1).max()
    # categorize testing data with a relevant but much smaller training set
    target_set = build_target_set3(store_data, test, store_weather,
                                   store_data_max, columns=set(['1']))
    for col, trn, vld, tst in target_set:
        print "item(%s), train(%d), valid(%d), test(%d), model_param(%f)" % \
            (col, len(trn), len(vld), len(tst), model_param)
        if len(tst) == 0:
            continue
        nm_trn = normalize_store_data(trn, store_data_max)
        nm_vld = normalize_store_data(vld, store_data_max)
        nm_tst = normalize_store_data(tst, store_data_max)
        Y_hat, fmat_weight = build_model_log_ridge(nm_trn, nm_vld, nm_tst,
                                                   store_weather, col,
                                                   alpha=model_param)
        Y_hat2 = denormalize_store_data(trn, vld, tst, Y_hat[:, np.newaxis],
                                        store_data_max, column=col)
        # evaluate error on training and validation set
        e1, e2 = eval_model(trn, vld, Y_hat2, column=col)
        print "error at item(%s) is: train(%f), valid(%f)" % (col, e1, e2)
        if eval_err is not None:
            eval_err.add_result(e1, len(trn), e2, len(vld))
        # write results to test result
        if not validate_only:
            write_submission(trn, vld, tst, Y_hat2, test_result_file,
                             'valid_result', column=col)
    # write out zero estimation
    if not validate_only:
        write_submission_zero(test, store_data_max, test_result_file)
    if eval_err is not None:
        e1, e2 = eval_err.get_result()
        logging.info("model4v1(p=%f) error is: train(%f), valid(%f)"
                     % (model_param, e1, e2))
        print "model4v1(p=%f) error is: train(%f), valid(%f)" % (model_param, e1, e2)
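# build_model_log_ridge is defined elsewhere; one common way to realize a
# "log error term" with ridge regression is to fit on log1p-transformed
# targets and map predictions back with expm1. A sketch of that idea, not
# necessarily the author's exact method:
import numpy as np
from sklearn import linear_model

def log_ridge_fit_predict(X_train, y_train, X_all, alpha=1.0):
    clf = linear_model.Ridge(alpha=alpha)
    clf.fit(X_train, np.log1p(y_train))    # squared error in log space
    return np.expm1(clf.predict(X_all))    # back to the original unit scale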
def run_model3(store_data_file, store_weather_file, test_data_file, \
               model_param=1, validate_only=False):
    print "start here"
    test_result_file = 'test_result.csv'
    # write header to test result
    with open(test_result_file, 'w') as f:
        f.write('id,units\n')
    # load data
    store_data, store_weather, test = load_data2(store_data_file, \
        store_weather_file, test_data_file)
    # compute max item sales for each store as denominator
    store_data_max = store_data.groupby(level=1).max()
    # develop training and validation set
    train, valid = develop_valid_set2(store_data, store_weather, valid_size=100)
    # categorize testing data with a relevant but much smaller training set
    target_set = build_target_set3(train, valid, test, store_weather, store_data_max)
    # run prediction on testing data of each category
    for col, trn, vld, tst in target_set:
        print "%s, train(%d), valid(%d), test(%d)" % (col, len(trn), len(vld), len(tst))
        # normalize training, validation and testing data sets
        nm_trn = normalize_store_data(trn, store_data_max)
        nm_vld = normalize_store_data(vld, store_data_max)
        nm_tst = normalize_store_data(tst, store_data_max)
        Y_hat = build_model3(nm_trn, nm_vld, nm_tst, store_weather,
                             column=col, alpha_train=model_param)
        # denormalize the sales
        Y_hat2 = denormalize_store_data(trn, vld, tst, Y_hat, store_data_max,
                                        column=col)
        # evaluate error on training and validation set
        e1, e2 = eval_model(trn, vld, Y_hat2, column=col)
        print "error is: train(%f), valid(%f)" % (e1, e2)
        # write results to test result
        write_submission(trn, vld, tst, Y_hat2, test_result_file,
                         'valid_result', column=col)
    # write out zero estimation
    if not validate_only:
        write_submission_zero(test, store_data_max, test_result_file)
def helper_model1v1(df, offset, train, m, Y_hat, store_data_max, v_init=None):
    # construct a feature matrix with one row more than the training data
    m2 = np.zeros((len(train) + 1, m.shape[1]))
    # copy the training feature matrix
    m2[:len(train)] = m[:len(train)]
    for i in range(len(df)):
        m2[-1] = m[offset + i]

        def l(i):
            # return row i of the graph Laplacian L = D - W, where
            # W = m2.dot(m2.T) is the pairwise similarity matrix
            g = np.dot(m2, m2[i])
            d = np.sum(g)
            g = -g
            g[i] = d + g[i]
            return g

        df0 = df.iloc[i:i + 1]
        Y_hat0 = build_model(train, None, df0, l, v_init)
        Y_hat1 = denormalize_store_data(train, None, df0, Y_hat0, store_data_max)
        Y_hat[offset + i] = Y_hat1[-1]
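# The closure l(i) above returns row i of the graph Laplacian L = D - W for
# the similarity matrix W = m2.dot(m2.T); a quick numerical check of that
# reading, self-contained with toy data:
import numpy as np

m2 = np.random.rand(4, 3)
W = m2.dot(m2.T)
L = np.diag(W.sum(axis=1)) - W
g = np.dot(m2, m2[0])                 # same steps as l(0)
d = np.sum(g)
g = -g
g[0] = d + g[0]
assert np.allclose(g, L[0])           # l(0) is the first Laplacian row
assert np.allclose(L.sum(axis=1), 0)  # Laplacian rows sum to zero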
def run_model2(store_data_file, store_weather_file, test_data_file):
    print "start here"
    # write header to test result
    with open('test_result.csv', 'w') as f:
        f.write('id,units\n')
    # load data
    store_data, store_weather, test = load_data2(store_data_file, \
        store_weather_file, test_data_file)
    # compute max item sales for each store as denominator
    store_data_max = store_data.groupby(level=1).max()
    # develop training and validation set
    train, valid = develop_valid_set2(store_data, store_weather, valid_size=0)
    # categorize testing data with a relevant but much smaller training set
    target_set = build_target_set(train, valid, test, store_weather)
    # run prediction on testing data of each category
    for n, trn, vld, tst in target_set:
        print "%d, train(%d), valid(%d), test(%d)" % (n, len(trn), len(vld), len(tst))
        # normalize training, validation and testing data sets
        nm_trn = normalize_store_data(trn, store_data_max)
        nm_vld = normalize_store_data(vld, store_data_max)
        nm_tst = normalize_store_data(tst, store_data_max)
        Y_hat, theta = build_model2(nm_trn, nm_vld, nm_tst, store_weather)
        # denormalize the sales
        Y_hat2 = denormalize_store_data(trn, vld, tst, Y_hat, store_data_max)
        # evaluate error on training and validation set
        e1, e2 = eval_model(trn, vld, Y_hat2)
        print "error is: train(%f), valid(%f)" % (e1, e2)
        # write results to test result
        write_submission(trn, vld, tst, Y_hat2, 'test_result.csv')
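# normalize_store_data / denormalize_store_data are defined elsewhere; since
# store_data_max above is store_data.groupby(level=1).max(), they presumably
# divide and multiply each row by its store's maximum item sales. A sketch of
# the forward direction, assuming the store id sits on level 1 of the index:
def normalize_by_store_max(df, store_data_max):
    # look up each row's store max, then divide element-wise
    per_row_max = store_data_max.reindex(df.index.get_level_values(1))
    return df / per_row_max.values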
def run_model4(store_data_file, store_weather_file, test_data_file, \
               model_param=1, validate_only=False, eval_err=None):
    """ ridge regression """
    print "---------------------start here---------------------"
    test_result_file = 'test_result.csv'
    with open(test_result_file, 'w') as f:
        f.write('id,units\n')
    store_data, store_weather, test = load_data2(store_data_file, \
        store_weather_file, test_data_file)
    store_data_max = store_data.groupby(level=1).max()
    # categorize testing data with a relevant but much smaller training set
    target_set = build_target_set4(store_data, test, store_weather, store_data_max)
    for col, trn, vld, tst, cat in target_set:
        print "item(%s), train(%d), valid(%d), test(%d), model_param(%0.2f), cat(%d)" % \
            (col, len(trn), len(vld), len(tst), model_param, cat)
        if len(tst) == 0:
            continue
        if cat == 0:
            Y_hat2 = np.zeros((len(trn) + len(vld) + len(tst), 1))
        else:
            nm_trn = normalize_store_data(trn, store_data_max)
            nm_vld = normalize_store_data(vld, store_data_max)
            nm_tst = normalize_store_data(tst, store_data_max)
            _, fmat = sim(nm_trn, nm_vld, nm_tst, store_weather)
            Y_hat = np.zeros((len(nm_trn) + len(nm_vld) + len(nm_tst), 1))
            X = fmat[:len(nm_trn)]
            Y = nm_trn[col].values[:, np.newaxis]
            clf = linear_model.Ridge(alpha=model_param)
            clf.fit(X, Y)
            Y_hat[:] = clf.predict(fmat)
            Y_hat2 = denormalize_store_data(trn, vld, tst, Y_hat, store_data_max,
                                            column=col)
        # evaluate error on training and validation set
        e1, e2 = eval_model(trn, vld, Y_hat2, column=col)
        print "error at item(%s) is: train(%f), valid(%f)" % (col, e1, e2)
        if eval_err is not None:
            eval_err.add_result(e1, len(trn), e2, len(vld))
        # write results to test result
        if not validate_only:
            write_submission(trn, vld, tst, Y_hat2, test_result_file,
                             'valid_result', column=col)
    # write out zero estimation
    if not validate_only:
        write_submission_zero(test, store_data_max, test_result_file)
    if eval_err is not None:
        e1, e2 = eval_err.get_result()
        logging.info("model4(p=%f) error is: train(%f), valid(%f)"
                     % (model_param, e1, e2))
        print "model4(p=%f) error is: train(%f), valid(%f)" % (model_param, e1, e2)
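# eval_err accumulates length-weighted train/valid errors across items via
# add_result and reports the aggregate via get_result. A simple way to choose
# model_param is to sweep candidate alphas in validate-only mode and keep the
# one with the lowest validation error; ErrAggregator below is a hypothetical
# stand-in for whatever class provides the eval_err interface.
def sweep_model4_alpha(store_data_file, store_weather_file, test_data_file, \
                       alphas):
    best_alpha, best_err = None, float('inf')
    for a in alphas:
        err = ErrAggregator()   # hypothetical: implements add_result/get_result
        run_model4(store_data_file, store_weather_file, test_data_file,
                   model_param=a, validate_only=True, eval_err=err)
        _, e_valid = err.get_result()
        if e_valid < best_err:
            best_alpha, best_err = a, e_valid
    return best_alpha, best_err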