Example #1
def get_train_cv_data_by_chunk(data):
    # Group the training rows by chunk ID (column 1) via the project's
    # utilities module, then split each chunk into a fixed training slice
    # and a cross-validation remainder.
    chunk_map = utilities.get_chunk_map(data, 1)

    train_data = []
    cv_data = []
    for chunk_id in chunk_map:
        rows = chunk_map[chunk_id]
        train_num = 147  # first 147 rows of every chunk go to training
        train_data += rows[:train_num]
        cv_data += rows[train_num:]
    return train_data, cv_data
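The example above relies on utilities.get_chunk_map, a project helper that is not shown in this listing. A minimal sketch of the behavior the split appears to assume (grouping rows by the chunk ID stored in the given column, preserving row order within each chunk) could look like the following; the real FindBoat/Kaggle implementation may differ.

def get_chunk_map(rows, col):
    # Group rows by the value in column `col` (the chunk ID), keeping the
    # original row order within each chunk.
    chunk_map = {}
    for row in rows:
        chunk_map.setdefault(row[col], []).append(row)
    return chunk_map

With such a grouping in place, get_train_cv_data_by_chunk simply takes the first 147 rows of every chunk for training and keeps the remainder for cross-validation.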
Example #2
File: main.py  Project: FindBoat/Kaggle
def time_series(training_file, submission_file, output_file):
    data = utilities.read_file(training_file, True)
    first_line = data[0]  # header row
    data = data[1:]       # drop the header from the data rows
    data = preprocess.fill_NAs(data)

    (chunk_avg, hour_avg_by_chunk, weekday_avg_by_chunk,
     hour_avg, weekday_avg) = feature_extraction.get_avg_maps(data)

    # One fitted model per (target, position) key; used in the loop below.
    clf_map = regression.linear_regression_2(data)

    print('Filling submission file...')
    chunk_map = utilities.get_chunk_map(data, 1)
    sub_data = utilities.read_file(submission_file, True)

    # Ten prediction positions, cycled through one per submission row.
    positions = [1, 2, 3, 4, 5, 10, 17, 24, 48, 72]
    for i in range(1, len(sub_data)):
        chunk_id = sub_data[i][1]
        hour = sub_data[i][3]
        pos = positions[(i - 1) % 10]
        for j in range(5, len(sub_data[i])):
            target = j - 5
            if sub_data[i][j] == '0':
                if chunk_id not in chunk_map:
                    # No history for this chunk: fall back to the hourly average.
                    sub_data[i][j] = hour_avg[hour][target]
                else:
                    data_in_chunk = chunk_map[chunk_id]
                    start = len(data_in_chunk) - 24            # last 24 rows of the chunk
                    t = len(data_in_chunk[0]) - 39 + target    # column for this target
                    features = []
                    prev_hour = 0
                    # Target values over the last 24 rows, remembering the value
                    # observed at the same hour of day as the submission row.
                    for k in range(start, len(data_in_chunk)):
                        features.append(float(data_in_chunk[k][t]))
                        if data_in_chunk[k][5] == hour:
                            prev_hour = float(data_in_chunk[k][t])

                    features.append(prev_hour)

                    # Binary hour features.
                    for h in range(24):
                        if h == int(hour):
                            features.append(1)
                        else:
                            features.append(0)

                    # Binary month features.
                    month = int(sub_data[i][4])
                    for m in range(1, 13):
                        if m == month:
                            features.append(1)
                        else:
                            features.append(0)

                    # Weather features.
                    tmp_length = len(data_in_chunk)
                    for k in range(6, 56):
                        features.append(float(data_in_chunk[tmp_length - 1][k]))
                    for k in range(6, 56):
                        features.append(float(data_in_chunk[tmp_length - 2][k]))

                    sub_data[i][j] = \
                        clf_map[(target, pos)].predict([features])[0]

    utilities.write_file(output_file, sub_data)
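The read_file, write_file, and fill_NAs calls above are also project helpers that the listing omits. As a rough, assumption-laden sketch, the I/O pair is treated here as plain CSV in and out; the boolean flag passed to read_file is not modeled, and the actual utilities module may behave differently.

import csv

def read_file(path, has_header=True):
    # Assumed behavior: load a CSV file as a list of rows (lists of strings).
    # The meaning of the second argument in the original call is not shown here.
    with open(path) as f:
        return list(csv.reader(f))

def write_file(path, rows):
    # Assumed behavior: write the (now filled-in) submission rows back as CSV.
    with open(path, 'w', newline='') as f:
        csv.writer(f).writerows(rows)

A call such as time_series('train.csv', 'submission.csv', 'output.csv') (file names hypothetical) would then replace every '0' placeholder in the submission with either the per-hour average or the prediction from the matching (target, position) model, built from the last 24 rows of that chunk plus the same-hour value, hour and month dummies, and recent weather readings.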