def gen_forecast():
    """Build the learning input for the configured hour, score it with the
    pickled random-forest model, and write per-ROI prediction probabilities
    to a CSV file.

    Relies on module-level names: ``hour``, ``model_file_name``,
    ``DAT_BEGIN_VAR``, ``roi_items``, ``create_learning_input``, ``logger``.
    """
    curdir = os.path.dirname(__file__)
    data_dir = os.path.join(curdir, "data")

    # create random forest training input file
    state_file_name = os.path.join(data_dir, "states.csv")
    diff_file_name = os.path.join(data_dir, "diffs.csv")
    learn_file = os.path.join(data_dir, "learn.%dhr.csv" % hour)
    create_learning_input(state_file_name, diff_file_name, learn_file)

    # test the model on the file just generated
    testing_file = os.path.join(data_dir, "learn.%dhr.csv" % hour)
    roi_output_file = os.path.join(data_dir, "roi_output.pred.%dhr.csv" % hour)

    df = pd.read_csv(testing_file, header=0)
    cols = list(df.columns.values)

    with open(model_file_name, 'rb') as f:
        rf = cPickle.load(f)

    # Feature columns start at DAT_BEGIN_VAR when present; otherwise
    # immediately after the last ROI metadata column.
    if DAT_BEGIN_VAR in cols:
        data_begin_idx = cols.index(DAT_BEGIN_VAR)
    else:
        data_begin_idx = cols.index(roi_items[-1]) + 1
    roi_type_idx = cols.index("type")

    # FIX: DataFrame.as_matrix() was deprecated and removed in pandas 1.0;
    # .values is the equivalent accessor available in every pandas version.
    mat = df.values
    x = mat[:, data_begin_idx:].astype(np.float32)
    y = mat[:, roi_type_idx].astype(np.int32)
    # assumes columns 2/3 hold lat/lon -- TODO confirm against the CSV schema
    lats = mat[:, 2].astype(np.float32)
    lons = mat[:, 3].astype(np.float32)

    logger.info("Testing model, shape (%d, %d)" % (x.shape[0], x.shape[1]))
    preds = rf.predict_proba(x)

    roi_names = df["roi_id"]
    # FIX: context manager ensures the output file is closed even when a
    # write raises (the original leaked the handle on error).
    with open(roi_output_file, 'w') as roi_ofile:
        roi_ofile.write("roi_name, actual, prediction, lat, lon\n")
        for idx, p in enumerate(preds):
            # p[1] is the predicted probability of the positive class
            roi_ofile.write(
                "%s, %d, %f, %f, %f\n"
                % (roi_names[idx], y[idx], p[1], lats[idx], lons[idx]))
def save_learning_files():
    """Derive state and diff learning files from the train and test CSVs.

    Reads the module-level ``train_file``/``test_file`` paths and writes
    the four corresponding state/diff output files, logging each step.
    """
    jobs = (
        (load_state_df, save_states, train_file, train_state_file,
         "Saved train state file"),
        (load_diff_df, save_diffs, train_file, train_diff_file,
         "Saved train diff file"),
        (load_state_df, save_states, test_file, test_state_file,
         "Saved test state file"),
        (load_diff_df, save_diffs, test_file, test_diff_file,
         "Saved test diff file"),
    )
    for loader, saver, src_path, dst_path, done_msg in jobs:
        frame, columns = loader(src_path)
        saver(frame, columns, dst_path)
        logger.info(done_msg)
def clean(file_name, ofilename, max_waterbody_count=-1):
    """Load *file_name*, keep only rows flagged good, repair remaining bad
    values in place, optionally cap by WATERBODY_count, and write the
    result to *ofilename*.

    A non-positive *max_waterbody_count* (the default) disables the cap.
    """
    logger.info('processing %s' % file_name)
    raw = pd.read_csv(file_name, header=0)  # first line contains header
    logger.info('checking data, %d rows' % len(raw))
    keep_mask = good_data(raw)
    cleaned = raw.loc[keep_mask].copy()
    logger.info('replacing bad data, %d good rows' % len(cleaned))
    replace_data(cleaned)
    if max_waterbody_count <= 0:
        # no cap requested: write everything that survived cleaning
        cleaned.to_csv(ofilename, index=False)
        return
    logger.info('restricting rows by WATERBODY_count')
    capped = cleaned[cleaned["WATERBODY_count"] < max_waterbody_count]
    capped.to_csv(ofilename, index=False)
def save_learning_files():
    """Clean the raw input file, then derive and save the state and diff
    learning files from the cleaned output.

    Reads the module-level ``ifile``/``ofilename`` paths and writes
    ``state_file`` and ``diff_file``, logging each step.
    """
    logger.info("cleaning output file %s" % ifile)
    clean_output(ifile, ofilename)
    for loader, saver, target, done_msg in (
        (load_state_df, save_states, state_file, "Saved state file"),
        (load_diff_df, save_diffs, diff_file, "Saved diff file"),
    ):
        frame, columns = loader(ofilename)
        saver(frame, columns, target)
        logger.info(done_msg)
def split_test_train(create_combo=False, clean=False, mode=SHUFFLED):
    """Split the combined dataset into test and train CSV files.

    :param create_combo: when True, concatenate the files in ``ifiles``
        into ``cfile``, keeping only the first header line seen.
    :param clean: when True, run ``clean_output`` on the combined file
        first. (NOTE(review): this parameter shadows the module-level
        ``clean`` function inside this body.)
    :param mode: ``SHUFFLED`` splits by randomly shuffled roi_id groups;
        ``TEMPORAL`` splits at a starttime cut so the train set is the
        earlier fraction. Any other value raises.
    :raises Exception: if *mode* is not SHUFFLED or TEMPORAL.
    """
    if create_combo:
        logger.info("creating july/aug combined output file %s" % cfile)
        flag = True  # emit the header line only once across all inputs
        with open(cfile, 'w') as of:
            for f in ifiles:
                logger.info("processing %s" % f)
                with open(f, 'r') as fin:
                    line = fin.readline()
                    while len(line) > 0:
                        if "roi_id" in line:
                            # header line: write only the first one
                            if flag:
                                of.write(line)
                                flag = False
                        else:
                            of.write(line)
                        line = fin.readline()
    if clean:
        logger.info("cleaning output file %s" % cfile)
        clean_output(cfile, ofilename)
    logger.info("Shuffling data for test and train")
    df = pd.read_csv(ofilename, header=0)
    if mode == SHUFFLED:
        grouped = df.groupby("roi_id")
        # FIX: .keys() is a non-indexable dict_keys view on Python 3 and
        # np.random.shuffle raises TypeError on it -- materialize a list.
        roi_ids = list(grouped.indices.keys())
        np.random.shuffle(roi_ids)
        split_idx = int(split_factor * len(roi_ids))
        test_roi_ids = roi_ids[0:split_idx]
        # FIX: was roi_ids[split_idx:-1], which silently dropped the last
        # ROI from both the train and test sets.
        train_roi_ids = roi_ids[split_idx:]
        test_df = df.loc[(df["roi_id"].isin(test_roi_ids))]
        train_df = df.loc[(df["roi_id"].isin(train_roi_ids))]
    elif mode == TEMPORAL:
        # split at the starttime below which (1 - split_factor) of the
        # samples fall: earlier rows train, later rows test
        starttimes = sorted(df["starttime"].values)
        split_time = starttimes[int(len(starttimes) * (1 - split_factor))]
        train_df = df[df["starttime"] <= split_time]
        test_df = df[df["starttime"] > split_time]
    else:
        raise Exception("mode incorrect")
    test_df.to_csv(test_file, index=False)
    logger.info("Saved test file")
    train_df.to_csv(train_file, index=False)
    logger.info("Saved train file")