def get_train_cv_data_by_chunk(data, train_num=147):
    """Split rows into training and cross-validation sets, chunk by chunk.

    Rows are grouped with ``utilities.get_chunk_map(data, 1)``. Within each
    chunk the first ``train_num`` rows go to the training set and all
    remaining rows go to the cross-validation set.

    Args:
        data: Rows to split; passed unchanged to ``utilities.get_chunk_map``.
        train_num: Number of leading rows of every chunk used for training.
            Defaults to 147, the value that used to be hard-coded here.

    Returns:
        A ``(train_data, cv_data)`` tuple of lists. A chunk holding
        ``train_num`` rows or fewer contributes nothing to ``cv_data``.
    """
    # NOTE(review): the second argument (1) is presumably the index of the
    # chunk-id column -- confirm against utilities.get_chunk_map.
    chunk_map = utilities.get_chunk_map(data, 1)
    train_data = []
    cv_data = []
    for chunk_id in chunk_map:
        rows = chunk_map[chunk_id]
        train_data += rows[:train_num]
        cv_data += rows[train_num:]
    return train_data, cv_data
def get_train_cv_data_by_chunk(data, train_num=147):
    """Split rows into training and cross-validation sets, chunk by chunk.

    Rows are grouped with ``utilities.get_chunk_map(data, 1)``. Within each
    chunk the first ``train_num`` rows go to the training set and all
    remaining rows go to the cross-validation set.

    Args:
        data: Rows to split; passed unchanged to ``utilities.get_chunk_map``.
        train_num: Number of leading rows of every chunk used for training.
            Defaults to 147, the value that used to be hard-coded here.

    Returns:
        A ``(train_data, cv_data)`` tuple of lists. A chunk holding
        ``train_num`` rows or fewer contributes nothing to ``cv_data``.
    """
    # NOTE(review): the second argument (1) is presumably the index of the
    # chunk-id column -- confirm against utilities.get_chunk_map.
    chunk_map = utilities.get_chunk_map(data, 1)
    train_data = []
    cv_data = []
    for chunk_id in chunk_map:
        rows = chunk_map[chunk_id]
        train_data += rows[:train_num]
        cv_data += rows[train_num:]
    return train_data, cv_data
def time_series(training_file, submission_file, output_file):
    """Fill the '0' cells of a submission file with predicted values.

    The training rows are read, their first row is dropped, and missing
    values are filled with ``preprocess.fill_NAs``. One model per
    ``(target, position)`` pair is obtained from
    ``regression.linear_regression_2``. Every submission cell (columns 5 and
    up) that holds the string ``'0'`` is then replaced:

    * if the row's chunk id is absent from the training data, by
      ``hour_avg[hour][target]``;
    * otherwise by the model prediction for a feature vector built from
      the last rows of that chunk (see the inline comments for the layout).

    Args:
        training_file: Path handed to ``utilities.read_file`` for the
            training rows.
        submission_file: Path handed to ``utilities.read_file`` for the
            submission template.
        output_file: Path handed to ``utilities.write_file`` for the filled
            submission rows.

    Returns:
        None. The result is written to ``output_file``.
    """
    data = utilities.read_file(training_file, True)
    # The first row is discarded (presumably a header line -- confirm).
    data = preprocess.fill_NAs(data[1:])
    # get_avg_maps yields five maps; only the overall per-hour averages are
    # needed here, as the fallback for chunks unseen in training.
    _, _, _, hour_avg, _ = feature_extraction.get_avg_maps(data)
    clf_map = regression.linear_regression_2(data)

    # Parenthesised single argument: same output under the Python 2 print
    # statement and the Python 3 print function.
    print('Filling submission file...')
    chunk_map = utilities.get_chunk_map(data, 1)
    sub_data = utilities.read_file(submission_file, True)
    # Submission rows cycle through these ten positions; each one selects a
    # different model (presumably hours-ahead offsets -- confirm).
    positions = [1, 2, 3, 4, 5, 10, 17, 24, 48, 72]

    # Row 0 of the submission file is left untouched.
    for i in range(1, len(sub_data)):
        row = sub_data[i]
        chunk_id = row[1]
        hour = row[3]
        pos = positions[(i - 1) % 10]

        for j in range(5, len(row)):
            # Only cells holding the placeholder '0' are filled in.
            if row[j] != '0':
                continue
            target = j - 5

            if chunk_id not in chunk_map:
                row[j] = hour_avg[hour][target]
                continue

            chunk_rows = chunk_map[chunk_id]
            n_rows = len(chunk_rows)
            # Column of this target inside a training row, counted from the
            # end (presumably the last 39 columns are targets -- confirm).
            target_col = len(chunk_rows[0]) - 39 + target

            # Target value over the last 24 rows of the chunk, plus the most
            # recent value whose column 5 equals the submission hour.
            # NOTE(review): when a chunk has fewer than 24 rows the range
            # starts negative, so the indices wrap around to the end of the
            # chunk -- confirm that this is intended.
            features = []
            prev_hour = 0
            for k in range(n_rows - 24, n_rows):
                value = float(chunk_rows[k][target_col])
                features.append(value)
                if chunk_rows[k][5] == hour:
                    prev_hour = value
            features.append(prev_hour)

            # Binary hour features.
            hour_index = int(hour)
            features.extend(1 if h == hour_index else 0 for h in range(24))

            # Binary month features.
            month = int(row[4])
            features.extend(1 if m == month else 0 for m in range(1, 13))

            # Weather features: columns 6..55 of the last two chunk rows,
            # newest row first.
            for source_row in (chunk_rows[n_rows - 1], chunk_rows[n_rows - 2]):
                features.extend(float(source_row[k]) for k in range(6, 56))

            row[j] = clf_map[(target, pos)].predict([features])[0]

    utilities.write_file(output_file, sub_data)
def time_series(training_file, submission_file, output_file):
    """Fill the '0' cells of a submission file with predicted values.

    The training rows are read, their first row is dropped, and missing
    values are filled with ``preprocess.fill_NAs``. One model per
    ``(target, position)`` pair is obtained from
    ``regression.linear_regression_2``. Every submission cell (columns 5 and
    up) that holds the string ``'0'`` is then replaced:

    * if the row's chunk id is absent from the training data, by
      ``hour_avg[hour][target]``;
    * otherwise by the model prediction for a feature vector built from
      the last rows of that chunk (see the inline comments for the layout).

    Args:
        training_file: Path handed to ``utilities.read_file`` for the
            training rows.
        submission_file: Path handed to ``utilities.read_file`` for the
            submission template.
        output_file: Path handed to ``utilities.write_file`` for the filled
            submission rows.

    Returns:
        None. The result is written to ``output_file``.
    """
    data = utilities.read_file(training_file, True)
    # The first row is discarded (presumably a header line -- confirm).
    data = preprocess.fill_NAs(data[1:])
    # get_avg_maps yields five maps; only the overall per-hour averages are
    # needed here, as the fallback for chunks unseen in training.
    _, _, _, hour_avg, _ = feature_extraction.get_avg_maps(data)
    clf_map = regression.linear_regression_2(data)

    # Parenthesised single argument: same output under the Python 2 print
    # statement and the Python 3 print function.
    print('Filling submission file...')
    chunk_map = utilities.get_chunk_map(data, 1)
    sub_data = utilities.read_file(submission_file, True)
    # Submission rows cycle through these ten positions; each one selects a
    # different model (presumably hours-ahead offsets -- confirm).
    positions = [1, 2, 3, 4, 5, 10, 17, 24, 48, 72]

    # Row 0 of the submission file is left untouched.
    for i in range(1, len(sub_data)):
        row = sub_data[i]
        chunk_id = row[1]
        hour = row[3]
        pos = positions[(i - 1) % 10]

        for j in range(5, len(row)):
            # Only cells holding the placeholder '0' are filled in.
            if row[j] != '0':
                continue
            target = j - 5

            if chunk_id not in chunk_map:
                row[j] = hour_avg[hour][target]
                continue

            chunk_rows = chunk_map[chunk_id]
            n_rows = len(chunk_rows)
            # Column of this target inside a training row, counted from the
            # end (presumably the last 39 columns are targets -- confirm).
            target_col = len(chunk_rows[0]) - 39 + target

            # Target value over the last 24 rows of the chunk, plus the most
            # recent value whose column 5 equals the submission hour.
            # NOTE(review): when a chunk has fewer than 24 rows the range
            # starts negative, so the indices wrap around to the end of the
            # chunk -- confirm that this is intended.
            features = []
            prev_hour = 0
            for k in range(n_rows - 24, n_rows):
                value = float(chunk_rows[k][target_col])
                features.append(value)
                if chunk_rows[k][5] == hour:
                    prev_hour = value
            features.append(prev_hour)

            # Binary hour features.
            hour_index = int(hour)
            features.extend(1 if h == hour_index else 0 for h in range(24))

            # Binary month features.
            month = int(row[4])
            features.extend(1 if m == month else 0 for m in range(1, 13))

            # Weather features: columns 6..55 of the last two chunk rows,
            # newest row first.
            for source_row in (chunk_rows[n_rows - 1], chunk_rows[n_rows - 2]):
                features.extend(float(source_row[k]) for k in range(6, 56))

            row[j] = clf_map[(target, pos)].predict([features])[0]

    utilities.write_file(output_file, sub_data)