예제 #1
0
def fit_model(io: IO, model: Sequential, preprocessed: List[Preprocessed]):
    """Train `model` on the training portion of `preprocessed`.

    Runs the configured number of epochs one Keras epoch at a time so that
    the model state can be reset between epochs, optionally attaching a
    TensorBoard profiler callback.
    """
    epochs = io.get("epochs")
    model.reset_states()

    logline("splitting into training set and testing set ({}%)".format(io.get("split")))
    split = gen_split(preprocessed, io)

    # One timestamped log directory is shared by every epoch of this run.
    log_dir = "logs/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    for epoch in range(epochs):
        logline("generating input and expected data for epoch {}/{}".format(epoch + 1, epochs))
        train_x, train_y = trim_params(gen_fit_params(split), io)

        logline("training epoch {}/{}".format(epoch + 1, epochs))
        callbacks = []
        if io.get("profile"):
            debug("profiling")
            callbacks = [
                tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)
            ]

        # epochs=1 per call: the outer loop drives epochs so state can be
        # cleared after each one.
        model.fit(
            train_x,
            train_y,
            batch_size=io.get("batch_size"),
            epochs=1,
            shuffle=False,
            callbacks=callbacks,
        )
        model.reset_states()
예제 #2
0
def output_split(all: List[Preprocessed], train: List[Preprocessed], io: IO):
    """Write the training/test split (by file name) to the configured JSON file."""
    train_names = [item.file_name for item in train]
    # Everything not chosen for training becomes the test set.
    test_names = [item.file_name for item in all if item not in train]
    obj = {
        "training_set": train_names,
        "test_set": test_names,
    }
    with open(io.get("output_train"), "w+") as out_file:
        json.dump(obj, out_file)
        logline("wrote training/testing config to {}".format(io.get("output_train")))
예제 #3
0
def read_test_files(io: IO) -> List[Preprocessed]:
    """Load the preprocessed pickle and return only the files named in the
    train config's "test_set" list."""
    with open(io.get("input_preprocessed"), "rb") as preprocessed_file:
        file_configs = pickle.load(preprocessed_file)
        with open(io.get("input_train"), "rb") as train_config_file:
            train_config = json.load(train_config_file)
            test_files_names = train_config["test_set"]

            # Wrap every raw config, then keep only the test-set entries.
            all_preprocessed = [Preprocessed(config) for config in file_configs]
            return [
                item for item in all_preprocessed
                if item.file_name in test_files_names
            ]
예제 #4
0
파일: chip8.py 프로젝트: facetoe/chip8
def main(screen, program_path='/home/facetoe/Downloads/chio/INVADERS'):
    """Run the CHIP-8 emulator loop on `screen`.

    Args:
        screen: display handle passed to the IO layer.
        program_path: ROM to load; parameterized (previously hard-coded)
            with the old value as default so existing callers are unchanged.
    """
    vm = VirtualMachine()
    vm.initialize(program_path=program_path)
    io = IO(screen)
    io.initialize(screen)

    while True:
        vm.tick()
        # Only redraw when the VM marked the framebuffer dirty.
        if vm.needs_refresh:
            io.draw(vm.gfx_buffer)
            vm.needs_refresh = False
        sleep(0.01)
예제 #5
0
    def __init__(self):
        """Read the banner and CLI arguments, salt the banner, and print it."""
        self.io = IO()
        self.core = Core()

        banner = self.io.readBanner()
        args = self.io.getArguments()
        salt = args.salt
        # BUG FIX: use identity comparison for None (PEP 8); `== None` can
        # misbehave when __eq__ is overridden.
        if salt is None:
            salt = DEFAULT_SALT

        saltyBanner = self.core.addSalt(banner, salt)
        print(saltyBanner)
예제 #6
0
def fit_model(io: IO, model: Sequential, preprocessed: List[Preprocessed]):
    """Train `model` epoch-by-epoch on the training split of `preprocessed`,
    regenerating the fit parameters and resetting model state each epoch."""
    epochs = io.get("epochs")
    model.reset_states()

    logline("splitting into training set and testing set ({}%)".format(io.get("split")))
    split = gen_split(preprocessed, io)
    for epoch in range(epochs):
        logline("generating input and expected data for epoch {}/{}".format(epoch + 1, epochs))
        train_x, train_y = trim_params(gen_fit_params(split), io)

        logline("training epoch {}/{}".format(epoch + 1, epochs))
        # One Keras epoch per iteration; the outer loop owns epoch counting
        # so state can be cleared in between.
        model.fit(
            train_x,
            train_y,
            batch_size=io.get("batch_size"),
            epochs=1,
            shuffle=False,
        )
        model.reset_states()
예제 #7
0
def output_split(all: List[Preprocessed], train: List[Preprocessed], io: IO):
    """Persist the training/test split (by file name) as JSON, creating the
    output file's parent directory first."""
    train_names = [item.file_name for item in train]
    # Items not selected for training form the test set.
    test_names = [item.file_name for item in all if item not in train]
    obj = {
        "training_set": train_names,
        "test_set": test_names,
    }
    # Make sure the directory the config file lives in exists.
    pathlib.Path(os.path.dirname(io.get("output_train"))).mkdir(parents=True,
                                                                exist_ok=True)
    with open(io.get("output_train"), "w+") as out_file:
        json.dump(obj, out_file)
        logline("wrote training/testing config to {}".format(io.get("output_train")))
예제 #8
0
def gen_split(preprocessed: List[Preprocessed], io: IO) -> List[Preprocessed]:
    """Pick a training subset covering roughly `split`% of all features.

    The split is written to disk via output_split and the training items are
    returned. A split of 100 short-circuits and returns every item.
    """
    split = io.get("split")
    if split == 100:
        output_split(preprocessed, preprocessed, io)
        return preprocessed

    shuffled = random.sample(preprocessed, len(preprocessed))

    total_len = sum(len(item.features) for item in preprocessed)
    train_len = (total_len / 100.0) * split

    train_items = list()
    current_len = 0
    # The final shuffled item is never considered for training
    # (shuffled[:-1]), matching the original loop bound.
    for candidate in shuffled[:-1]:
        new_len = current_len + len(candidate.features)

        # Stop before the training set would reach the target length.
        if new_len >= train_len:
            break

        current_len = new_len
        train_items.append(candidate)

    output_split(preprocessed, train_items, io)
    return train_items
예제 #9
0
def predictions_to_out_file(predictions: np.array, io: IO):
    """Convert raw (beat, melody) prediction rows into the annotated output
    object, stitching consecutive melody fragments together."""
    obj = {"items": [], "genre": {"hard": 0.5, "uptempo": 0.5}}
    interval = io.get("interval")

    melodies = []

    cur_time = 0
    # Each prediction row is (beat_value, melody_value); time advances by one
    # interval per row.
    for beat, melody in predictions:
        if is_positive_beat(beat):
            obj["items"].append({"type": "beat", "time": cur_time})
        if is_positive_melody(melody):
            melodies.append({
                "type": "melody",
                "time": cur_time,
                "duration": interval,
            })

        cur_time += interval

    obj["items"] = obj["items"] + stitch_melodies(melodies, io)
    return obj
예제 #10
0
def gen_outputs(file: MarkedAudioFile, io: IO) -> List[ExpectedOutput]:
    """Gen a list of marked outputs for given file.

    Every timestamp in the file's JSON annotations is snapped to the nearest
    multiple of the configured interval and marked on the matching output
    slot; melody timestamps mark every slot they span.
    """
    out_len = len(file.bins_file.bins)
    outputs = [ExpectedOutput(False, False) for x in range(out_len)]

    interval = io.get("interval")
    for timestamp in file.json_file.timestamps:
        # Round it to the range
        timestamp_time = timestamp.timestamp * 1000
        closest = get_closest(timestamp_time, io)

        timestamp_index = int(closest / interval)

        # Timestamps past the end of the bins are ignored.
        if timestamp_index >= out_len:
            continue

        if timestamp.beat_type == "beat":
            output_mark = outputs[timestamp_index]
            output_mark.is_beat = True
        elif timestamp.beat_type == "melody":
            # BUG FIX: get_closest takes the IO config as its second argument
            # (it reads the interval from it); the `io` argument was missing
            # here, so every melody timestamp raised a TypeError.
            closest_end = get_closest(timestamp_time +
                                      (timestamp.length * 1000), io)
            for i in range(int((closest_end - closest) / interval)):
                outputs[timestamp_index + i].is_melody = True

    return outputs
예제 #11
0
def get_io() -> IO:
    """Build the CLI/IO configuration for the preprocessing step."""
    # Input .wav glob, analysis JSON, output pickle, and sampling interval.
    config = {
        "i": IOInput(
            glob("../../data/tracks/*.wav"),
            list,
            has_input=True,
            arg_name="input_files",
            descr="Input .wav files",
            alias="input_files",
            is_generic=True,
        ),
        "a": IOInput(
            "../../data/analysis.json",
            str,
            has_input=True,
            arg_name="analysis",
            descr="Analysis JSON file",
            alias="analysis",
        ),
        "o": IOInput(
            "../../data/preprocessed.pickle",
            str,
            has_input=True,
            arg_name="output_file",
            descr="File in which the features and outputs get placed",
            alias="output_file",
        ),
        "n": IOInput(
            50,
            int,
            has_input=True,
            arg_name="interval",
            descr="Interval at which data is sent",
            alias="interval",
        ),
    }
    return IO(config)
예제 #12
0
def collect_input_paths(io: IO) -> List[str]:
    """Turn the input glob into file paths"""
    # De-duplicate first: the glob may match the same file more than once.
    unique_files = set(io.get("input_files"))
    # Keep only paths whose final extension is exactly "wav".
    return [path for path in unique_files if path.split(".")[-1] == "wav"]
예제 #13
0
def run_tests(io: IO, model: Sequential, test_files: List[Preprocessed]):
    """Evaluate the model on every test file and write annotated predictions.

    For each file: predict, score the predictions (within-range accuracy,
    cumulative diff score, MSE), log the summary, and dump the annotated
    output JSON into the configured directory.
    """
    model.reset_states()

    for file in test_files:
        logline("creating test params for {}".format(file.file_name))
        test_x, test_y = get_test_params(file)

        logline("making predictions")
        predictions: List[List[float]] = model.predict(test_x, batch_size=1, verbose=1)
        # Clear state between files so predictions stay independent.
        model.reset_states()

        mse_total: List[float] = []
        correct = 0
        diff_score = 0
        for index, prediction in enumerate(predictions):
            actual: List[float] = test_y[index]

            diff = abs(actual[0] - prediction[0])
            diff_score += diff
            if is_in_range(diff):
                correct += 1

            mse_total.append(mean_squared_error(actual, prediction))

        logline(
            "predicted {}/{} within range ({}%) correct, score was {}/{}, mse was {}"
            .format(
                correct,
                len(predictions),
                round(correct / len(predictions) * 100, 2),
                diff_score,
                len(predictions),
                round(sum(mse_total) / len(predictions), 4),
            ))

        out_obj = predictions_to_out_file(predictions, io)

        # Ensure the annotated-output directory exists before writing.
        pathlib.Path(io.get("output_annotated")).mkdir(parents=True, exist_ok=True)
        out_path = os.path.join(io.get("output_annotated"),
                                "{}.json".format(file.file_name))
        with open(out_path, "w+") as out_file:
            json.dump(out_obj, out_file)
            logline("wrote object to {}".format(out_path))
def start_server(io: IO):
    """Serve the `public` directory over HTTP on the configured port until
    interrupted, then shut down cleanly."""
    # The module-level interval is read by the request handler.
    global interval
    interval = io.get("interval")

    port = io.get("port")
    handler = partial(WebServer, directory=os.path.join(CUR_DIR, "public"))
    httpd = HTTPServer(("", port), handler)
    logline("listening at port", port)
    enter_group()
    try:
        httpd.serve_forever()
    except KeyboardInterrupt:
        # Ctrl+C is the expected shutdown path; fall through to cleanup.
        pass
    httpd.server_close()
    exit_group()
    logline("stopped listening")
예제 #15
0
def get_io() -> IO:
    """Build the CLI/IO configuration for the test/annotation step."""
    # Inputs: preprocessed pickle, weights, model, train config;
    # output: annotated directory; plus the sampling interval.
    config = {
        "i": IOInput(
            "./data/preprocessed.pickle",
            str,
            has_input=True,
            arg_name="input_preprocessed",
            descr="Input preprocessed file",
            alias="input_preprocessed",
            is_generic=True,
        ),
        "iw": IOInput(
            "./data/weights.h5",
            str,
            has_input=True,
            arg_name="input_weights",
            descr="Input weights file",
            alias="input_weights",
        ),
        "im": IOInput(
            "./data/model.json",
            str,
            has_input=True,
            arg_name="input_model",
            descr="Input file for the model",
            alias="input_model",
        ),
        "it": IOInput(
            "./data/train_config.json",
            str,
            has_input=True,
            arg_name="input_train",
            descr="Input file for the train config",
            alias="input_train",
        ),
        "o": IOInput(
            "./data/annotated/",
            str,
            has_input=True,
            arg_name="output_annotated",
            descr="Directory where annotated files are stored",
            alias="output_annotated",
        ),
        "n": IOInput(
            50,
            int,
            has_input=True,
            arg_name="interval",
            descr="Interval at which data is sent",
            alias="interval",
        ),
    }
    return IO(config)
예제 #16
0
def get_closest(timestamp_time: float, io: IO) -> int:
    """Get the closest multiple of INTERVAL to the timestamp"""
    interval = io.get("interval")
    # Bracketing multiples of the interval around the timestamp.
    lowerbound = (timestamp_time // interval) * interval
    upperbound = lowerbound + interval

    # On an exact tie the lower multiple wins.
    if timestamp_time - lowerbound <= upperbound - timestamp_time:
        return lowerbound
    return upperbound
예제 #17
0
def trim_params(params: Tuple[np.ndarray, np.ndarray], io: IO) -> Tuple[np.ndarray, np.ndarray]:
    """Trim (x, y) so their length is an exact multiple of the batch size."""
    batch_size = io.get("batch_size")
    x_param, y_param = params

    # Length of the trailing partial batch, if any.
    remainder = x_param.shape[0] % batch_size
    if remainder == 0:
        # Already aligned: hand back the tuple untouched.
        return params
    # Drop the partial batch from both arrays.
    return x_param[:-remainder], y_param[:-remainder]
예제 #18
0
def collect_input_paths(io: IO) -> Union[None, List[str]]:
    """Turn the input glob into file paths"""
    all_files = io.get("input_files")
    # Keep only paths whose final extension is exactly "wav".
    wav_files = [path for path in all_files if path.split(".")[-1] == "wav"]

    # No .wav inputs at all: signal the caller with None.
    if not wav_files:
        return None

    # Of those, keep only the files that have a matching JSON annotation.
    return [path for path in wav_files if has_json_file(path)]
예제 #19
0
def stitch_melodies(obj: List[Dict[str, Union[str, float]]],
                    io: IO) -> List[Dict[str, Union[str, float]]]:
    """Merge consecutive melody fragments into single longer entries.

    Fragments are expected in time order, each one interval long. A fragment
    whose start time equals the previous stitched entry's end time
    (time + duration) is folded into it by extending its duration.

    Args:
        obj: melody items with "time" and "duration" keys, sorted by time.
        io: config object used to read the sampling interval.

    Returns:
        A new list of stitched entries (dicts are shared with and mutated
        in `obj`, matching the original in-place behavior).
    """
    new_melodies: List[Dict[str, Union[str, float]]] = []
    interval = io.get("interval")

    for item in obj:
        last = new_melodies[-1] if new_melodies else None
        # BUG FIX: the original advanced the previous entry's *start time*
        # ("time" += interval) instead of growing its "duration", which
        # corrupted the start time and never accumulated the melody length.
        # Chain detection now uses the stitched entry's end (time + duration).
        if last is not None and last["time"] + last["duration"] == item["time"]:
            last["duration"] += interval
        else:
            new_melodies.append(item)

    return new_melodies
def get_io() -> IO:
    """Build the CLI/IO configuration for the web server (port + interval)."""
    config = {
        "p": IOInput(
            1234,
            int,
            has_input=True,
            arg_name="port",
            descr="The port on which to host it",
            alias="port",
        ),
        "n": IOInput(
            50,
            int,
            has_input=True,
            arg_name="interval",
            descr="Interval at which data is sent",
            alias="interval",
        ),
    }
    return IO(config)
예제 #21
0
def predictions_to_out_file(predictions: np.array, io: IO):
    """Translate per-step beat confidences into the annotated output object."""
    obj: Dict[str, Any] = {"items": [], "genre": {"hard": 0.5, "uptempo": 0.5}}
    interval = io.get("interval")

    cur_time = 0
    # One prediction per interval; the first column is the beat confidence.
    for prediction in predictions:
        confidence: float = prediction[0]

        if is_in_range(confidence):
            obj["items"].append({"type": "beat", "time": cur_time})

        cur_time += interval

    return obj
예제 #22
0
def match_files(io: IO, input_paths: List[str]):
    """Match found files to analysis file contents.

    Maps each input path to the first analysed track whose name occurs in
    the file name (case-insensitive), reports any unmapped files/tracks, and
    asks for confirmation when something failed to map.

    Returns:
        (analysis, mapped) on success, or None if the user aborted.
    """
    analysis_file = io.get("analysis")
    logline(analysis_file)

    analysis = AnalysisFile(analysis_file)

    mapped: Dict[str, str] = {}
    reverse_map: Dict[str, str] = {}
    for in_path in input_paths:
        file_name = in_path.split("/")[-1].split(".")[0]
        # First analysed track whose name appears in the file name wins.
        for track_analysis in analysis.tracks:
            if track_analysis.name.lower() in file_name.lower():
                mapped[in_path] = track_analysis.name
                reverse_map[track_analysis.name] = file_name
                break

    logline("came up with the following mapping:")
    logline("")
    for in_path, track_name in mapped.items():
        logline('"{}" -> "{}"'.format(in_path, track_name))

    # Collect and report everything that failed to map, on both sides.
    unmapped_inputs = [p for p in input_paths if p not in mapped]
    for in_path in unmapped_inputs:
        warn('input file "{}" not mapped'.format(in_path))
    unmapped_tracks = [t.name for t in analysis.tracks if t.name not in reverse_map]
    for track_name in unmapped_tracks:
        warn('analysed file "{}" not mapped'.format(track_name))
    unmapped_amount: int = len(unmapped_inputs) + len(unmapped_tracks)

    logline("")
    if unmapped_amount > 0:
        # Give the user a chance to abort when some files failed to map.
        try:
            correct = input("is this correct? Y/n")
            if correct.lower() == "n":
                return None
        except KeyboardInterrupt:
            return None

    return analysis, mapped
예제 #23
0
def gen_outputs(file: MarkedAudioFile, io: IO) -> List[ExpectedOutput]:
    """Gen a list of marked outputs for given file"""
    out_len = len(file.bins_file.bins)
    # TODO: change
    outputs = [ExpectedOutput(0) for _ in range(out_len)]

    interval = io.get("interval")
    for timestamp in file.timestamps:
        # Snap the timestamp (converted to ms) onto the interval grid and
        # map it to its output slot.
        timestamp_time = timestamp.timestamp * 1000
        timestamp_index = int(get_closest(timestamp_time, io) / interval)

        # Timestamps past the end of the bins are dropped.
        if timestamp_index < out_len:
            outputs[timestamp_index].beat_confidence = timestamp.confidence

    return outputs
예제 #24
0
def get_io() -> IO:
    """Build the CLI/IO configuration for the training step."""
    # Input pickle, weight/train-config outputs, and the train hyperparams.
    config = {
        "i": IOInput(
            "./data/preprocessed.pickle",
            str,
            has_input=True,
            arg_name="input_file",
            descr="Input preprocessed file",
            alias="input_file",
            is_generic=True,
        ),
        "ow": IOInput(
            "./data/weights.h5",
            str,
            has_input=True,
            arg_name="output_weights",
            descr="File in which the weights gets stored",
            alias="output_weights",
        ),
        "ot": IOInput(
            "./data/train_config.json",
            str,
            has_input=True,
            arg_name="output_train",
            descr="File in which the training config gets stored",
            alias="output_train",
        ),
        "s": IOInput(
            80,
            int,
            has_input=True,
            arg_name="split",
            descr="The split between training and test sets",
            alias="split",
        ),
        "b": IOInput(
            32,
            int,
            has_input=True,
            arg_name="batch_size",
            descr="The batch size",
            alias="batch_size",
        ),
        "e": IOInput(
            10,
            int,
            has_input=True,
            arg_name="epochs",
            descr="The amount of epochs",
            alias="epochs",
        ),
    }
    return IO(config)
예제 #25
0
def run_tests(io: IO, model: Sequential, test_files: List[Preprocessed]):
    """Evaluate beat/melody predictions for every test file and write the
    annotated JSON output per file."""
    model.reset_states()

    for file in test_files:
        logline("creating test params for {}".format(file.file_name))
        test_x, test_y = get_test_params(file)

        logline("making predictions")
        predictions = model.predict(test_x, batch_size=1, verbose=1)
        # Clear state between files so predictions stay independent.
        model.reset_states()

        mse_total = []
        correct = 0
        for index, prediction in enumerate(predictions):
            actual = test_y[index]
            # A prediction counts as correct only when both the beat and the
            # melody classification match the expected values.
            if actual[0] == is_positive_beat(prediction[0]) and \
                    actual[1] == is_positive_melody(prediction[1]):
                correct += 1

            mse_total.append(mean_squared_error(actual, prediction))

        logline("predicted {}/{} ({}%) correct, mse was {}".format(
            correct,
            len(predictions),
            round(correct / len(predictions) * 100, 2),
            round(sum(mse_total) / len(predictions), 4),
        ))

        out_obj = predictions_to_out_file(predictions, io)

        out_path = os.path.join(io.get("output_annotated"),
                                "{}.json".format(file.file_name))
        with open(out_path, "w+") as out_file:
            json.dump(out_obj, out_file)
            logline("wrote object to {}".format(out_path))
예제 #26
0
파일: train_lstm.py 프로젝트: fmidev/trains
def main():
    """
    Main program.

    Trains an LSTM regression model on train-delay data: loads rows from
    BigQuery, filters/normalizes them, optionally applies PCA, fits the
    model with a TimeseriesGenerator, then saves the model, errors and
    learning-curve plots.
    """

    # Print GPU availability
    local_device_protos = device_lib.list_local_devices()
    logging.info(
        [x.name for x in local_device_protos if x.device_type == 'GPU'])

    bq = BQHandler()
    io = IO(gs_bucket=options.gs_bucket)
    viz = Viz(io)

    starttime, endtime = io.get_dates(options)
    logging.info('Using dataset {}.{} and time range {} - {}'.format(
        options.feature_dataset, options.feature_table,
        starttime.strftime('%Y-%m-%d'), endtime.strftime('%Y-%m-%d')))

    all_param_names = list(
        set(options.label_params + options.feature_params +
            options.meta_params))
    aggs = io.get_aggs_from_param_names(options.feature_params)

    logging.info('Building model...')
    dim = len(options.feature_params)
    # The month feature is appended later, so reserve a dimension for it.
    if options.month: dim += 1
    model = convlstm.Regression(options, dim).get_model()

    logging.info('Reading data...')
    bq.set_params(batch_size=2500000,
                  loc_col='trainstation',
                  project=options.project,
                  dataset=options.feature_dataset,
                  table=options.feature_table,
                  parameters=all_param_names,
                  locations=options.train_stations,
                  only_winters=options.only_winters,
                  reason_code_table=options.reason_code_table)

    data = bq.get_rows(starttime, endtime)

    data = io.filter_train_type(labels_df=data,
                                train_types=options.train_types,
                                sum_types=True,
                                train_type_column='train_type',
                                location_column='trainstation',
                                time_column='time',
                                sum_columns=['train_count', 'delay'],
                                aggs=aggs)

    if options.y_avg_hours is not None:
        data = io.calc_running_delay_avg(data, options.y_avg_hours)

    if options.y_avg:
        data = io.calc_delay_avg(data)

    # TimeseriesGenerator assumes time-ordered rows.
    data.sort_values(by=['time', 'trainstation'], inplace=True)

    if options.month:
        logging.info('Adding month to the dataset...')
        data['month'] = data['time'].map(lambda x: x.month)
        options.feature_params.append('month')

    if options.normalize:
        logging.info('Normalizing data...')
        xscaler = StandardScaler()
        yscaler = StandardScaler()

        labels = data.loc[:, options.label_params].astype(
            np.float32).values.reshape((-1, 1))
        yscaler.fit(labels)
        scaled_labels = pd.DataFrame(yscaler.transform(labels),
                                     columns=['delay'])

        non_scaled_data = data.loc[:, options.meta_params + ['class']]
        scaled_features = pd.DataFrame(xscaler.fit_transform(
            data.loc[:, options.feature_params]),
                                       columns=options.feature_params)

        data = pd.concat([non_scaled_data, scaled_features, scaled_labels],
                         axis=1)

        # Persist both scalers so predictions can be de-normalized later.
        fname = options.save_path + '/xscaler.pkl'
        io.save_scikit_model(xscaler, fname, fname)
        fname = options.save_path + '/yscaler.pkl'
        io.save_scikit_model(yscaler, fname, fname)

    if options.pca:
        logging.info('Doing PCA analyzis for the data...')
        ipca = IncrementalPCA(n_components=options.pca_components,
                              whiten=options.whiten,
                              copy=False)

        non_processed_data = data.loc[:, options.meta_params +
                                      options.label_params]
        processed_data = data.loc[:, options.feature_params]
        ipca.fit(processed_data)
        processed_features = pd.DataFrame(ipca.transform(processed_data))

        # BUG FIX: concatenate the PCA-transformed features
        # (processed_features), not the untransformed processed_data —
        # previously the transform result was computed and then discarded.
        data = pd.concat([non_processed_data, processed_features], axis=1)

        fname = options.output_path + '/ipca_explained_variance.png'
        viz.explained_variance(ipca, fname)
        io._upload_to_bucket(filename=fname, ext_filename=fname)

    data_train, data_test = train_test_split(data, test_size=0.33)

    # Define model
    batch_size = 512
    logging.info('Batch size: {}'.format(batch_size))

    # Initialization
    losses, val_losses, accs, val_accs, steps = [], [], [], [], []

    boardcb = TensorBoard(log_dir=options.log_dir + '/lstm',
                          histogram_freq=0,
                          write_graph=True,
                          write_images=True)

    logging.info('Data shape: {}'.format(
        data_train.loc[:, options.feature_params].values.shape))

    data_gen = TimeseriesGenerator(
        data_train.loc[:, options.feature_params].values,
        data_train.loc[:, options.label_params].values,
        length=24,
        sampling_rate=1,
        batch_size=batch_size)

    data_test_gen = TimeseriesGenerator(
        data_test.loc[:, options.feature_params].values,
        data_test.loc[:, options.label_params].values,
        length=24,
        sampling_rate=1,
        batch_size=batch_size)

    # data_gen[i] is the (x, y) tuple of batch i.
    logging.info('X batch size: {}'.format(data_gen[0][0].shape))
    # BUG FIX: the Y of the first batch is data_gen[0][1]; the previous
    # data_gen[1][0] logged the X shape of the *second* batch instead.
    logging.info('Y batch size: {}'.format(data_gen[0][1].shape))

    history = model.fit_generator(data_gen,
                                  validation_data=data_test_gen,
                                  epochs=options.epochs,
                                  callbacks=[boardcb])  #, batch_size=64)

    history_fname = options.save_path + '/history.pkl'
    io.save_keras_model(options.save_file, history_fname, model,
                        history.history)

    scores = model.evaluate_generator(data_test_gen)
    i = 0
    error_data = {}
    for name in model.metrics_names:
        logging.info('{}: {:.4f}'.format(name, scores[i]))
        error_data[name] = [scores[i]]
        i += 1

    fname = '{}/training_time_validation_errors.csv'.format(
        options.output_path)
    io.write_csv(error_data, filename=fname, ext_filename=fname)

    pred = model.predict_generator(data_test_gen)

    #io.log_class_dist(pred, 4)
    #print(history.history)
    fname = options.output_path + '/learning_over_time.png'
    viz.plot_nn_perf(history.history,
                     metrics={
                         'Error': {
                             'mean_squared_error': 'MSE',
                             'mean_absolute_error': 'MAE'
                         }
                     },
                     filename=fname)
예제 #27
0
def main():
    """
    Train a two-stage train-delay model and report validation metrics.

    Stage 1 is a binary classifier that flags severely delayed timesteps
    (delay above ``options.delay_limit``); stage 2 is a regressor fitted only
    on the samples the classifier flagged as severe. Data is read from
    BigQuery, preprocessed (train-type filtering, optional delay-limit
    filtering, balancing, month feature, optional normalisation and SMOTE),
    and both fitted models are saved via ``io``. If ``options.evaluate`` is
    set, the models are additionally evaluated on a held-out test table over
    a few fixed time ranges.

    Relies on module-level ``options`` and ``binary_labels`` plus the project
    helper classes (IO, Viz, Predictor, BQHandler, classifier/regressor
    wrappers). Takes no parameters and returns nothing; may terminate the
    process via ``sys.exit()``.
    """
    # Helpers
    bq = bqhandler.BQHandler()
    io = IO(gs_bucket=options.gs_bucket)
    viz = Viz(io)
    predictor = Predictor(io, ModelLoader(io), options)

    ### OPTIONS ################################################################

    # Configuration
    starttime, endtime = io.get_dates(options)
    logging.info('Using dataset {} and time range {} - {}'.format(options.feature_dataset,
                                                                  starttime.strftime('%Y-%m-%d'),
                                                                  endtime.strftime('%Y-%m-%d')))

    all_param_names = options.label_params + options.feature_params + options.meta_params
    aggs = io.get_aggs_from_param_names(options.feature_params)

    ### MODELS #################################################################

    # Initialise classifier: load a pre-trained one when a file is configured,
    # otherwise construct a fresh instance of the configured type.
    if hasattr(options, 'classifier_file'):
        classifier = io.load_scikit_model(options.classifier_file)
    else:
        if options.classifier == 'svc':
            params = {'kernel': options.kernel, 'gamma': options.gamma, 'C': options.penalty, 'probability': options.probability}
            classifier = SVCClassifier(params, limit=options.class_limit)
        elif options.classifier == 'graphsvc':
            classifier = GraphSVCClassifier()
            graph_data = pd.read_csv(options.graph_data,
                                     names=['date', 'start_hour', 'src', 'dst', 'type',
                                            'sum_delay', 'sum_ahead', 'add_delay',
                                            'add_ahead', 'train_count'])
            classifier.fetch_connections(graph_data)
        elif options.classifier == 'gaussiannb':
            classifier = GaussianNBClassifier()
        elif options.classifier == 'lstm':
            num_of_features = len(options.feature_params)
            # 'month' is appended to the feature set further below when
            # options.month is set, so the LSTM input width must account for it.
            if options.month: num_of_features += 1
            class_weight = None
            if hasattr(options, 'class_weight'):
                # SECURITY: eval() executes arbitrary code from the config
                # value — acceptable only with trusted configuration files.
                class_weight = eval(options.class_weight)
            params = {'length': options.time_steps, 'batch_size': options.batch_size,
                      'epochs': options.epochs, 'num_of_features': num_of_features,
                      'log_dir': options.log_dir, 'class_weight': class_weight}
            classifier = LSTMClassifier(**params)
        else:
            # BUG FIX: `raise('...')` raises "TypeError: exceptions must derive
            # from BaseException" instead of the intended message.
            raise ValueError('Model not specificied or wrong. Add "classifier: bgm" to config file.')

    # Initialise regression model
    if options.regression == 'rfr':
        regressor = RandomForestRegressor(n_estimators=options.n_estimators,
                                          n_jobs=-1,
                                          min_samples_leaf=options.min_samples_leaf,
                                          min_samples_split=options.min_samples_split,
                                          max_features=options.max_features,
                                          max_depth=options.max_depth,
                                          bootstrap=options.bootstrap
                                          )
    else:
        # BUG FIX: same raise-a-string defect as above.
        raise ValueError('Model not specificied or wrong. Add "classifier: bgm" to config file.')

    ### DATA ###################################################################

    sum_columns = ['delay']
    if 'train_count' in options.meta_params:
        sum_columns.append('train_count')

    # Pick only selected month
    where = {}
    if options.pick_month is not None:
        where = {'EXTRACT(MONTH from time)': options.pick_month}

    logging.info('Reading data...')
    bq.set_params(loc_col='trainstation',
                  project=options.project,
                  dataset=options.feature_dataset,
                  table=options.feature_table,
                  parameters=all_param_names,
                  locations=options.locations,
                  only_winters=options.only_winters,
                  reason_code_table=options.reason_code_table,
                  where=where)

    data = bq.get_rows(starttime, endtime)

    data = io.filter_train_type(labels_df=data,
                                train_types=options.train_types,
                                sum_types=True,
                                train_type_column='train_type',
                                location_column='trainstation',
                                time_column='time',
                                sum_columns=sum_columns,
                                aggs=aggs)

    # -99 is the source's missing-value marker for delay.
    data['delay'] = data.loc[:, 'delay'].replace(-99, np.nan)
    data.sort_values(by=['trainstation', 'time'], inplace=True)

    logging.info('Processing {} rows...'.format(len(data)))

    # Filter only timesteps with large distribution in the whole network
    if options.filter_delay_limit is not None:
        data = io.filter_delay_with_limit(data, options.filter_delay_limit)

    # Binary class
    logging.info('Adding binary class to the dataset with limit {}...'.format(options.delay_limit))

    def set_class(x):
        # NOTE: delay exactly equal to the limit (and NaN) maps to NaN and is
        # dropped later with the other missing-delay rows.
        if x > options.delay_limit:
            return binary_labels[1]
        elif x < options.delay_limit:
            return binary_labels[0]
        return np.nan

    data['class'] = data['delay'].map(set_class)

    # Separate train and validation sets (no shuffle: chronological split)
    data_train, data_test = train_test_split(data, test_size=0.3, shuffle=False)

    # Balance classes by downsampling to the minority-class count
    if options.balance:
        logging.info('Balancing training data...')
        count = data_train.groupby('class').size().min()
        # SVC can't handle more than 50 000 samples
        if options.classifier == 'svc': count = min(count, 50000)
        data_train = pd.concat([data_train.loc[data_train['class'] == 0].sample(n=count),
                                data_train.loc[data_train['class'] == 1].sample(n=count)])

    logging.info('Train data:')
    io.log_class_dist(data_train.loc[:, 'class'].values, labels=binary_labels)
    logging.info('Test data:')
    io.log_class_dist(data_test.loc[:, 'class'].values, labels=binary_labels)

    # Adding month as an extra feature
    if options.month:
        logging.info('Adding month to the datasets...')
        data_train['month'] = data_train.loc[:, 'time'].map(lambda x: x.month)
        data_test['month'] = data_test.loc[:, 'time'].map(lambda x: x.month)
        options.feature_params.append('month')

    y_train_class = data_train.loc[:, ['class']].values.ravel()
    y_train_delay = data_train.loc[:, ['delay']].values.ravel()
    y_test_class = data_test.loc[:, ['class']].values.ravel()
    y_test_delay = data_test.loc[:, ['delay']].values.ravel()

    X_train = data_train.loc[:, options.feature_params].astype(np.float32).values
    X_test = data_test.loc[:, options.feature_params].astype(np.float32).values

    if options.smote:
        logging.info('Smoting...')
        sm = SMOTE()
        # NOTE(review): the resampled arrays are only logged, never used for
        # training — presumably leftover experimentation; confirm intent.
        X_train_class, y_class = sm.fit_resample(X_train, y_train_class)
        io.log_class_dist(y_class, labels=binary_labels)

    # If asked, save used train and test splits into big query
    if options.save_data:
        tname = options.model + '_' + options.feature_dataset + '_' + options.config_name + '_train'
        columns = [options.feature_params, ['delay'], ['class']]
        bq.nparray_to_table([X_train, y_train_class, y_train_delay],
                            columns,
                            options.project,
                            options.feature_dataset,
                            tname
                            )
        tname = options.model + '_' + options.feature_dataset + '_' + options.config_name + '_test'
        bq.nparray_to_table([X_test, y_test_class, y_test_delay],
                            columns,
                            options.project,
                            options.feature_dataset,
                            tname
                            )

    if options.normalize:
        logging.info('Normalizing data...')
        if hasattr(options, 'xscaler_file'):
            xscaler = io.load_scikit_model(options.xscaler_file)
            X_train = xscaler.transform(X_train)
            X_test = xscaler.transform(X_test)
        else:
            xscaler = MinMaxScaler(feature_range=(-1, 1))
            X_train = xscaler.fit_transform(X_train)
            X_test = xscaler.transform(X_test)
            fname = options.save_path + '/xscaler.pkl'
            io.save_scikit_model(xscaler, fname, fname)

        if hasattr(options, 'yscaler_file'):
            yscaler = io.load_scikit_model(options.yscaler_file)
            # BUG FIX: scikit-learn scalers require 2-D input; the original
            # passed the 1-D delay arrays directly, which raises at runtime.
            # Reshape/ravel to match the fit branch below.
            y_train_delay = yscaler.transform(y_train_delay.reshape(-1, 1)).ravel()
            y_test_delay = yscaler.transform(y_test_delay.reshape(-1, 1)).ravel()
        else:
            yscaler = StandardScaler()
            y_train_delay = yscaler.fit_transform(y_train_delay.reshape(-1, 1)).ravel()
            y_test_delay = yscaler.transform(y_test_delay.reshape(-1, 1)).ravel()
            fname = options.save_path + '/yscaler.pkl'
            io.save_scikit_model(yscaler, fname, fname)

    # NOTE(review): debug leftover — these CSV dumps plus the sys.exit() below
    # terminate the run before any training happens; everything after this
    # point is currently unreachable. Remove this block to actually train.
    data_train.loc[:, options.feature_params].to_csv('data/x_train.csv', index=False)
    data_test.loc[:, options.feature_params].to_csv('data/x_test.csv', index=False)
    # BUG FIX: np.int was removed in NumPy 1.24 — use the builtin int.
    data_train.loc[:, ['class']].fillna(-99).astype(int).to_csv('data/y_train.csv', index=False)
    data_test.loc[:, ['class']].fillna(-99).astype(int).to_csv('data/y_test.csv', index=False)
    sys.exit()

    ### TRAIN ##################################################################

    if options.cv:
        logging.info('Doing random search for hyper parameters...')
        # BUG FIX: raise-a-string again; note this raise makes the rest of the
        # branch unreachable until a param_grid is actually defined.
        raise ValueError("No param_grid set for given model ({})".format(options.regression))

        # BUG FIX (latent, branch unreachable): the original referenced the
        # undefined names `model`, `y_train` and bare `report_cv_results`;
        # use the regressor, the delay target and the io helper instead.
        random_search = RandomizedSearchCV(regressor,
                                           param_distributions=param_grid,
                                           n_iter=int(options.n_iter_search),
                                           n_jobs=-1)

        random_search.fit(X_train, y_train_delay)
        logging.info("RandomizedSearchCV done.")
        fname = options.output_path + '/random_search_cv_results.txt'
        io.report_cv_results(random_search.cv_results_, fname)
        io._upload_to_bucket(filename=fname, ext_filename=fname)
        sys.exit()
    else:
        logging.info('Training classifier...')

        if options.classifier == 'graphsvc':
            classifier.fit(X_train, y_train_class, stations=data_train.loc[:, 'trainstation'].values)
        else:
            history = classifier.fit(X_train, y_train_class, X_test, y_test_class)

        # Save classifier
        if options.classifier == 'lstm':
            history_fname = options.save_path + '/history.pkl'
            fname = options.save_path + '/classifier.h5'
            io.save_keras_model(fname, history_fname, classifier, history.history)
        else:
            fname = options.save_path + '/classifier.pkl'
            io.save_scikit_model(classifier, filename=fname, ext_filename=fname)

        # Drop data with no delay information
        X_train = X_train[~np.isnan(y_train_delay)]
        y_train_delay = y_train_delay[~np.isnan(y_train_delay)]
        y_train_class = y_train_class[~np.isnan(y_train_class)]

        y_pred_train_bin = classifier.predict(X_train, type='bool')

        # Pick only severe values. LSTM predictions are shorter than the input
        # (no predictions for the first timesteps), hence the offset slicing.
        y_train_delay_ = y_train_delay[(len(y_train_delay) - len(y_pred_train_bin)):]
        X_train_ = X_train[(len(y_train_delay) - len(y_pred_train_bin)):]
        y_train_severe = y_train_delay_[y_pred_train_bin]
        X_train_severe = X_train_[y_pred_train_bin]

        logging.info('Training regressor...')
        regressor.fit(X_train_severe, y_train_severe)

    # Save regressor
    io.save_scikit_model(regressor, filename=options.save_file, ext_filename=options.save_file)

    ### RESULTS FOR VALIDATION SET #############################################

    # Drop data with missing delay
    X_test = X_test[~np.isnan(y_test_class)]
    y_test_class = y_test_class[~np.isnan(y_test_class)]
    data_test = data_test[~np.isnan(data_test.delay)]

    # Metrics
    y_pred = classifier.predict(X_test)
    y_pred_proba = classifier.y_pred_proba

    # Classification performance
    # LSTM don't have first timesteps
    y_test_class = y_test_class[(len(X_test) - len(y_pred)):]

    acc = accuracy_score(y_test_class, y_pred)
    precision = precision_score(y_test_class, y_pred, average='micro')
    recall = recall_score(y_test_class, y_pred, average='micro')
    f1 = f1_score(y_test_class, y_pred, average='micro')

    logging.info('Classification accuracy: {}'.format(acc))
    logging.info('Classification precision: {}'.format(precision))
    logging.info('Classification recall: {}'.format(recall))
    logging.info('Classification F1 score: {}'.format(f1))
    io.log_class_dist(y_pred, binary_labels)

    # Confusion matrices
    fname = '{}/confusion_matrix_validation.png'.format(options.output_path)
    viz.plot_confusion_matrix(y_test_class, y_pred, np.arange(2), filename=fname)

    fname = '{}/confusion_matrix_validation_normalised.png'.format(options.output_path)
    viz.plot_confusion_matrix(y_test_class, y_pred, np.arange(2), True, filename=fname)

    # Precision-recall curve
    fname = '{}/precision-recall-curve_validation.png'.format(options.output_path)
    viz.prec_rec_curve(y_test_class, y_pred_proba, filename=fname)

    # ROC
    fname = '{}/roc_validation.png'.format(options.output_path)
    viz.plot_binary_roc(y_test_class, y_pred_proba, filename=fname)

    if options.regression == 'rfr':
        fname = options.output_path + '/rfc_feature_importance.png'
        viz.rfc_feature_importance(regressor.feature_importances_, fname, feature_names=options.feature_params)

    # Regression performance
    # NOTE(review): the scores below compare regression targets against the
    # classifier's y_pred, not y_pred_reg — looks suspicious; confirm intent.
    y_pred_reg, y_test_reg = predictor.pred(data=data_test)
    rmse = np.sqrt(mean_squared_error(y_test_reg, y_pred))
    mae = mean_absolute_error(y_test_reg, y_pred)
    r2 = r2_score(y_test_reg, y_pred)

    logging.info('Regression RMSE: {}'.format(rmse))
    logging.info('Regression MAE: {}'.format(mae))
    logging.info('Regression R2 score: {}'.format(r2))

    error_data = {'acc': [acc],
                  'precision': [precision],
                  'recall': [recall],
                  'f1': [f1],
                  'rmse': [rmse],
                  'mae': [mae],
                  'r2': [r2]}
    fname = '{}/training_time_classification_validation_errors.csv'.format(options.output_path)
    io.write_csv(error_data, filename=fname, ext_filename=fname)

    ############################################################################
    # EVALUATE
    ############################################################################
    if options.evaluate:
        logging.info('Loading test data...')
        test_data = bq.get_rows(dt.datetime.strptime('2010-01-01', "%Y-%m-%d"),
                                dt.datetime.strptime('2019-01-01', "%Y-%m-%d"),
                                loc_col='trainstation',
                                project=options.project,
                                dataset=options.feature_dataset,
                                table=options.test_table,
                                reason_code_table=options.reason_code_table,
                                locations=options.locations,
                                parameters=all_param_names)

        test_data = io.filter_train_type(labels_df=test_data,
                                         train_types=['K', 'L'],
                                         sum_types=True,
                                         train_type_column='train_type',
                                         location_column='trainstation',
                                         time_column='time',
                                         sum_columns=['delay'],
                                         aggs=aggs)

        # Filter only timesteps with large distribution in the whole network
        if options.filter_delay_limit is not None:
            test_data = io.filter_delay_with_limit(test_data, options.filter_delay_limit)

        test_data.set_index('time', inplace=True)
        logging.info('Test data contain {} rows...'.format(len(test_data)))

        logging.info('Adding binary class to the test dataset with limit {}...'.format(options.delay_limit))
        test_data['class'] = test_data['delay'].map(lambda x: binary_labels[1] if x > options.delay_limit else binary_labels[0])
        io.log_class_dist(test_data.loc[:, 'class'].values, labels=binary_labels)

        if options.month:
            logging.info('Adding month to the test dataset...')
            test_data['month'] = test_data.index.map(lambda x: x.month)

        # A few fixed (mostly winter) month-long windows used for evaluation
        times = [('2014-01-01', '2014-02-01'), ('2016-06-01', '2016-07-01'), ('2017-02-01', '2017-03-01'), ('2011-02-01', '2011-03-01')]
        for start, end in times:
            try:
                y_pred_proba, y_pred, y = predict_timerange(test_data, options.feature_params, classifier, xscaler, start, end)
                perf_metrics(y_pred_proba, y_pred, y, start, end, viz, io)
            except EmptyDataError:
                logging.info('No data for {} - {}'.format(start, end))
예제 #28
0
파일: train_dual.py 프로젝트: fmidev/trains
        'INFO': logging.INFO,
        'WARNING': logging.WARNING,
        'ERROR': logging.ERROR,
        'CRITICAL': logging.CRITICAL
    }
    logging.basicConfig(format=(
        "[%(levelname)s] %(asctime)s %(filename)s:%(funcName)s:%(lineno)s %(message)s"
    ),
                        level=logging_level[options.logging_level])

    logging.info('Using configuration: {} | {}'.format(options.config_filename,
                                                       options.config_name))

    # Helpers
    bq = bqhandler.BQHandler()
    io = IO(gs_bucket=options.gs_bucket)
    viz = Viz(io)
    state = State()

    starttime, endtime = io.get_dates(options)
    logging.info('Using dataset {} and time range {} - {}'.format(
        options.feature_dataset, starttime.strftime('%Y-%m-%d'),
        endtime.strftime('%Y-%m-%d')))

    if options.save_data:
        tname = options.model + '_' + options.feature_dataset + '_' + options.config_name + '_train'
        tname = tname.replace('-', '_')
        bq.delete_table(options.project, options.feature_dataset, tname)
        tname = options.model + '_' + options.feature_dataset + '_' + options.config_name + '_test'
        tname = tname.replace('-', '_')
        bq.delete_table(options.project, options.feature_dataset, tname)
예제 #29
0
def main():
    """
    Get data from db and save it as csv
    """

    bq = BQHandler()
    io = IO(gs_bucket=options.gs_bucket)
    viz = Viz(io=io)

    starttime, endtime = io.get_dates(options)
    logging.info('Using dataset {} and time range {} - {}'.format(
        options.feature_dataset, starttime.strftime('%Y-%m-%d'),
        endtime.strftime('%Y-%m-%d')))

    all_param_names = options.label_params + options.feature_params + options.meta_params
    aggs = io.get_aggs_from_param_names(options.feature_params)

    if options.model == 'rf':
        model = RandomForestRegressor(
            n_estimators=options.n_estimators,
            n_jobs=-1,
            min_samples_leaf=options.min_samples_leaf,
            min_samples_split=options.min_samples_split,
            max_features=options.max_features,
            max_depth=options.max_depth,
            bootstrap=options.bootstrap)
    elif options.model == 'lr':
        model = SGDRegressor(warm_start=True,
                             max_iter=options.n_loops,
                             shuffle=options.shuffle,
                             power_t=options.power_t,
                             penalty=options.regularizer,
                             learning_rate=options.learning_rate,
                             eta0=options.eta0,
                             alpha=options.alpha,
                             tol=0.0001)
    elif options.model == 'svr':
        model = SVR()
    elif options.model == 'ard':
        model = ARDRegression(n_iter=options.n_loops,
                              alpha_1=options.alpha_1,
                              alpha_2=options.alpha_2,
                              lambda_1=options.lambda_1,
                              lambda_2=options.lambda_2,
                              threshold_lambda=options.threshold_lambda,
                              fit_intercept=options.fit_intercept,
                              copy_X=options.copy_X)
    elif options.model == 'gp':
        k_long_term = 66.0**2 * RBF(length_scale=67.0)
        k_seasonal = 2.4**2 * RBF(length_scale=90.0) * ExpSineSquared(
            length_scale=150, periodicity=1.0, periodicity_bounds=(0, 10000))
        k_medium_term = 0.66**2 * RationalQuadratic(length_scale=1.2,
                                                    alpha=0.78)
        k_noise = 0.18**2 * RBF(length_scale=0.134) + WhiteKernel(
            noise_level=0.19**2)
        #kernel_gpml = k_long_term + k_seasonal + k_medium_term + k_noise
        kernel_gpml = k_long_term + k_seasonal + k_medium_term + k_noise

        model = GaussianProcessRegressor(
            kernel=kernel_gpml,  #alpha=0,
            optimizer=None,
            normalize_y=True)
    elif options.model == 'llasso':
        model = LocalizedLasso(num_iter=options.n_loops,
                               batch_size=options.batch_size)
    elif options.model == 'nlasso':
        model = NetworkLasso(num_iter=options.n_loops,
                             batch_size=options.batch_size)

        graph_data = pd.read_csv(options.graph_data,
                                 names=[
                                     'date', 'start_hour', 'src', 'dst',
                                     'type', 'sum_delay', 'sum_ahead',
                                     'add_delay', 'add_ahead', 'train_count'
                                 ])

        #stations_to_pick = options.stations_to_pick.split(',')
        #graph = model.fetch_connections(graph_data, stations_to_pick)
        model.fetch_connections(graph_data)

    if options.pca:
        ipca = IncrementalPCA(n_components=options.pca_components,
                              whiten=options.whiten,
                              copy=False)

    rmses, maes, r2s, skills, start_times, end_times, end_times_obj = [], [], [], [], [], [], []
    X_complete = []  # Used for feature selection

    start = starttime
    end = start + timedelta(days=int(options.day_step),
                            hours=int(options.hour_step))
    if end > endtime: end = endtime

    while end <= endtime and start < end:
        logging.info('Processing time range {} - {}'.format(
            start.strftime('%Y-%m-%d %H:%M'), end.strftime('%Y-%m-%d %H:%M')))

        # Load data ############################################################
        try:
            logging.info('Reading data...')
            data = bq.get_rows(start,
                               end,
                               loc_col='trainstation',
                               project=options.project,
                               dataset=options.feature_dataset,
                               table=options.feature_table,
                               parameters=all_param_names,
                               only_winters=options.only_winters)
            data = io.filter_train_type(labels_df=data,
                                        train_types=options.train_types,
                                        sum_types=True,
                                        train_type_column='train_type',
                                        location_column='trainstation',
                                        time_column='time',
                                        sum_columns=['train_count', 'delay'],
                                        aggs=aggs)

            # Filter only timesteps with large distribution in the whole network
            if options.filter_delay_limit is not None:
                data = io.filter_delay_with_limit(data,
                                                  options.filter_delay_limit)

            if options.y_avg_hours is not None:
                data = io.calc_running_delay_avg(data, options.y_avg_hours)

            if options.y_avg:
                data = io.calc_delay_avg(data)

            data.sort_values(by=['time', 'trainstation'], inplace=True)

            if options.impute:
                logging.info('Imputing missing values...')
                data.drop(columns=['train_type'], inplace=True)
                data = imputer.fit_transform(data)
                data.loc[:, 'train_type'] = None

            if options.month:
                logging.info('Adding month to the dataset...')
                data['month'] = data['time'].map(lambda x: x.month)
                if 'month' not in options.feature_params:
                    options.feature_params.append('month')

            if options.model == 'ard' and len(data) > options.n_samples:
                logging.info('Sampling {} values from data...'.format(
                    options.n_samples))
                data = data.sample(options.n_samples)

            l_data = data.loc[:, options.label_params]
            f_data = data.loc[:, options.feature_params]

        except ValueError as e:
            f_data, l_data = [], []

        if len(f_data) < 2 or len(l_data) < 2:
            start = end
            end = start + timedelta(days=int(options.day_step),
                                    hours=int(options.hour_step))
            continue

        logging.info('Processing {} rows...'.format(len(f_data)))

        train, test = train_test_split(data, test_size=0.1)
        X_train = train.loc[:,
                            options.feature_params].astype(np.float32).values
        y_train = train.loc[:, options.label_params].astype(
            np.float32).values.ravel()
        X_test = test.loc[:, options.feature_params].astype(np.float32).values
        y_test = test.loc[:, options.label_params].astype(
            np.float32).values.ravel()

        logging.debug('Features shape: {}'.format(X_train.shape))

        if options.normalize:
            logging.info('Normalizing data...')
            xscaler, yscaler = StandardScaler(), StandardScaler()

            X_train = xscaler.fit_transform(X_train)
            X_test = xscaler.transform(X_test)

            if len(options.label_params) == 1:
                y_train = yscaler.fit_transform(y_train.reshape(-1, 1)).ravel()
                #y_test = yscaler.transform(y_test.reshape(-1, 1)).ravel()
            else:
                y_train = yscaler.fit_transform(y_train)
                #y_test = yscaler.transform(y_test)

        if options.pca:
            logging.info('Doing PCA analyzis for the data...')
            X_train = ipca.fit_transform(X_train)
            fname = options.output_path + '/ipca_explained_variance.png'
            viz.explained_variance(ipca, fname)
            #io._upload_to_bucket(filename=fname, ext_filename=fname)
            X_test = ipca.fit_transform(X_test)

        if options.model == 'llasso':
            graph_data = pd.read_csv(options.graph_data,
                                     names=[
                                         'date', 'start_hour', 'src', 'dst',
                                         'type', 'sum_delay', 'sum_ahead',
                                         'add_delay', 'add_ahead',
                                         'train_count'
                                     ])
            graph = model.fetch_connections(graph_data)

        logging.debug('Features shape after pre-processing: {}'.format(
            X_train.shape))

        # FIT ##################################################################

        if options.cv:
            logging.info('Doing random search for hyper parameters...')

            if options.model == 'rf':
                param_grid = {
                    "n_estimators": [10, 100, 200, 800],
                    "max_depth": [3, 20, None],
                    "max_features": ["auto", "sqrt", "log2", None],
                    "min_samples_split": [2, 5, 10],
                    "min_samples_leaf": [1, 2, 4, 10],
                    "bootstrap": [True, False]
                }
            elif options.model == 'lr':
                param_grid = {
                    "penalty": [None, 'l2', 'l1'],
                    "alpha": [0.00001, 0.0001, 0.001, 0.01, 0.1],
                    "l1_ratio": [0.1, 0.15, 0.2, 0.5],
                    "shuffle": [True, False],
                    "learning_rate": ['constant', 'optimal', 'invscaling'],
                    "eta0": [0.001, 0.01, 0.1],
                    "power_t": [0.1, 0.25, 0.5]
                }
            elif options.model == 'svr':
                param_grid = {
                    "C": [0.001, 0.01, 0.1, 1, 10],
                    "epsilon": [0.01, 0.1, 0.5],
                    "kernel":
                    ['rbf', 'linear', 'poly', 'sigmoid', 'precomputed'],
                    "degree": [2, 3, 4],
                    "shrinking": [True, False],
                    "gamma": [0.001, 0.01, 0.1],
                    "coef0": [0, 0.1, 1]
                }
            else:
                raise ("No param_grid set for given model ({})".format(
                    options.model))

            random_search = RandomizedSearchCV(model,
                                               param_distributions=param_grid,
                                               n_iter=int(
                                                   options.n_iter_search),
                                               n_jobs=-1)

            random_search.fit(X_train, y_train)
            logging.info("RandomizedSearchCV done.")
            fname = options.output_path + '/random_search_cv_results.txt'
            io.report_cv_results(random_search.cv_results_, fname)
            #io._upload_to_bucket(filename=fname, ext_filename=fname)
            sys.exit()
        else:
            logging.info('Training...')
            if options.model in ['rf', 'svr', 'ard', 'gp']:
                model.fit(X_train, y_train)
                if options.feature_selection:
                    X_complete = X_train
                    y_complete = y_train
                    meta_complete = data.loc[:, options.meta_params]
            elif options.model in ['llasso']:
                model.fit(X_train,
                          y_train,
                          stations=train.loc[:, 'trainstation'].values)
            elif options.model in ['nlasso']:
                model.partial_fit(X_train,
                                  y_train,
                                  stations=train.loc[:, 'trainstation'].values)
            else:
                model.partial_fit(X_train, y_train)
                if options.feature_selection:
                    try:
                        X_complete = np.append(X_complete, X_train)
                        y_complete = np.append(Y_complete, y_train)
                        meta_complete = meta_complete.append(
                            data.loc[:, options.meta_params])
                    except (ValueError, NameError):
                        X_complete = X_train
                        y_complete = y_train
                        meta_complete = data.loc[:, options.meta_params]

        # EVALUATE #############################################################

        # Check training score to estimate amount of overfitting
        # Here we assume that we have a datetime index (from time columns)
        y_pred_train = model.predict(X_train)
        rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
        mae_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
        logging.info('Training data RMSE: {} and MAE: {}'.format(
            rmse_train, mae_train))

        #try:
        if True:
            print(train)
            #range = ('2013-02-01','2013-02-28')
            range = ('2010-01-01', '2010-01-02')
            X_train_sample = train.loc[range[0]:range[1],
                                       options.feature_params].astype(
                                           np.float32).values

            target = train.loc[range[0]:range[1], options.label_params].astype(
                np.float32).values.ravel()
            y_pred_sample = model.predict(X_train_sample)

            times = train.loc[range[0]:range[1], 'time'].values
            df = pd.DataFrame(times + y_pred_sample)
            print(df)
            sys.exit()

            # Draw visualisation
            fname = '{}/timeseries_training_data.png'.format(
                options.output_path)
            viz.plot_delay(times, target, y_pred,
                           'Delay for station {}'.format(stationName), fname)

            fname = '{}/scatter_all_stations.png'.format(options.vis_path)
            viz.scatter_predictions(times,
                                    target,
                                    y_pred,
                                    savepath=options.vis_path,
                                    filename='scatter_{}'.format(station))
        #except KeyError:
        #    pass

        # Mean delay over the whole dataset (both train and validation),
        # used to calculate Brier Skill
        if options.y_avg:
            mean_delay = 3.375953418071136
        else:
            mean_delay = 6.011229358531166

        if options.model == 'llasso':
            print('X_test shape: {}'.format(X_test.shape))
            y_pred, weights = model.predict(X_test,
                                            test.loc[:, 'trainstation'].values)
        else:
            y_pred = model.predict(X_test)

        if options.normalize:
            y_pred = yscaler.inverse_transform(y_pred)

        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        rmse_stat = math.sqrt(
            mean_squared_error(y_test, np.full_like(y_test, mean_delay)))
        skill = 1 - rmse / rmse_stat

        rmses.append(rmse)
        maes.append(mae)
        r2s.append(r2)
        skills.append(skill)
        start_times.append(start.strftime('%Y-%m-%dT%H:%M:%S'))
        end_times.append(end.strftime('%Y-%m-%dT%H:%M:%S'))
        end_times_obj.append(end)

        if options.model in ['rf', 'lr', 'ard', 'gp']:
            logging.info('R2 score for training: {}'.format(
                model.score(X_train, y_train)))

        logging.info('RMSE: {}'.format(rmse))
        logging.info('MAE: {}'.format(mae))
        logging.info('R2 score: {}'.format(r2))
        logging.info('Brier Skill Score score: {}'.format(skill))

        start = end
        end = start + timedelta(days=int(options.day_step),
                                hours=int(options.hour_step))
        if end > endtime: end = endtime

    # SAVE #####################################################################
    io.save_scikit_model(model,
                         filename=options.save_file,
                         ext_filename=options.save_file)
    if options.normalize:
        fname = options.save_path + '/xscaler.pkl'
        io.save_scikit_model(xscaler, filename=fname, ext_filename=fname)
        fname = options.save_path + '/yscaler.pkl'
        io.save_scikit_model(yscaler, filename=fname, ext_filename=fname)

    if options.model == 'rf':
        fname = options.output_path + '/rfc_feature_importance.png'
        viz.rfc_feature_importance(model.feature_importances_,
                                   fname,
                                   feature_names=options.feature_params)
        #io._upload_to_bucket(filename=fname, ext_filename=fname)

    try:
        fname = options.output_path + '/learning_over_time.png'
        viz.plot_learning_over_time(end_times_obj,
                                    rmses,
                                    maes,
                                    r2s,
                                    filename=fname)
        #io._upload_to_bucket(filename=fname, ext_filename=fname)
    except Exception as e:
        logging.error(e)

    error_data = {
        'start_times': start_times,
        'end_times': end_times,
        'rmse': rmses,
        'mae': maes,
        'r2': r2s,
        'skill': skills
    }
    fname = '{}/training_time_validation_errors.csv'.format(
        options.output_path)
    io.write_csv(error_data, filename=fname, ext_filename=fname)

    # FEATURE SELECTION ########################################################
    if options.feature_selection:
        logging.info('Doing feature selection...')
        selector = SelectFromModel(model, prefit=True)
        print(pd.DataFrame(data=X_complete))
        X_selected = selector.transform(X_complete)

        selected_columns = f_data.columns.values[selector.get_support()]
        logging.info(
            'Selected following parameters: {}'.format(selected_columns))
        data_sel = meta_complete.join(
            pd.DataFrame(data=y_complete, columns=options.label_params)).join(
                pd.DataFrame(data=X_selected, columns=selected_columns))

        print(pd.DataFrame(data=X_selected, columns=selected_columns))
        print(data_sel)
# Example #30 (score: 0)
def load_preprocessed(io: IO) -> List[Preprocessed]:
    """Load the pickled preprocessing output and wrap every entry in a Preprocessed."""
    with open(io.get("input_file"), "rb") as in_file:
        raw_configs = pickle.load(in_file)
    return [Preprocessed(config) for config in raw_configs]
# Example #31 (score: 0)
def main() -> None:
    """
    Evaluate saved classifier/regressor models station by station.

    For every train station: fetch feature rows from BigQuery, run the
    stored models via ``predictor.pred``, compute regression metrics
    (RMSE / MAE / R2 and Brier skill against a constant mean-delay
    baseline) per ~30-day period and over the whole range, and write
    per-station CSVs plus delay/scatter visualisations. Finally the
    delays are averaged over all stations and the metric/visualisation
    step is repeated on the averages.

    Reads configuration from the module-level ``options`` object and
    writes output under ``options.vis_path`` / ``options.save_path``.
    """

    bq = BQHandler()
    io = IO(gs_bucket=options.gs_bucket)
    viz = Viz(io=io)
    predictor = Predictor(io, ModelLoader(io), options,
                          options.station_specific_classifier,
                          options.station_specific_regressor)
    # NOTE(review): these two assignments look swapped (the regressor gets
    # classifier.pkl and vice versa) -- confirm against Predictor's usage.
    predictor.regressor_save_file = options.save_path + '/classifier.pkl'
    predictor.classifier_save_file = options.save_path + '/regressor.pkl'

    # Mean delay over the whole dataset (both train and validation),
    # used to calculate Brier Skill
    mean_delay = options.mean_delay

    starttime, endtime = io.get_dates(options)
    logging.info('Using dataset {} and time range {} - {}'.format(
        options.feature_dataset, starttime.strftime('%Y-%m-%d'),
        endtime.strftime('%Y-%m-%d')))

    # Union of every parameter list; used when fetching rows from BigQuery
    all_param_names = list(
        set(options.label_params + options.feature_params +
            options.meta_params + options.classifier_feature_params +
            options.regressor_feature_params))

    # Param list is modified after retrieving data, so keep pristine copies
    classifier_feature_params = copy.deepcopy(
        options.classifier_feature_params)
    regressor_feature_params = copy.deepcopy(options.regressor_feature_params)

    all_feature_params = list(
        set(options.feature_params + options.meta_params +
            options.classifier_feature_params +
            options.regressor_feature_params))
    aggs = io.get_aggs_from_param_names(all_feature_params)

    # Init error dicts; keyed by timestamp, each value is the list of
    # per-station observations for that timestamp (averaged later)
    avg_delay = {}
    avg_pred_delay = {}
    avg_proba = {}
    station_count = 0
    all_times = set()

    # Per-station error tables: {station: {period_index_or_'all': value}}
    station_rmse = {}
    station_mae = {}
    station_r2 = {}
    station_skill = {}

    # For aggregated binary classification metrics
    time_list, target_list, y_pred_bin_list, y_pred_bin_proba_list = [], [], [], []

    # If stations are given as argument use them, else use all stations
    stationList = io.get_train_stations(options.stations_file)
    all_data = None

    if options.locations is not None:
        stations = options.locations
    else:
        stations = stationList.keys()

    # Go through stations
    for station in stations:
        stationName = '{} ({})'.format(stationList[station]['name'], station)
        logging.info('Processing station {}'.format(stationName))

        # Resolve per-station model files: an explicit *_model_file option
        # (with '{location}' placeholder) wins over the station-specific
        # default path under save_path.
        if hasattr(options, 'classifier_model_file'):
            predictor.classifier_save_file = options.classifier_model_file.replace(
                '{location}', station)
        elif options.station_specific_classifier:
            predictor.classifier_save_file = options.save_path + '/{}'.format(
                station) + '/classifier.pkl'

        if hasattr(options, 'regressor_model_file'):
            predictor.regressor_save_file = options.regressor_model_file.replace(
                '{location}', station)
        elif options.station_specific_regressor:
            predictor.regressor_save_file = options.save_path + '/{}'.format(
                station) + '/regressor.pkl'

        station_rmse[station] = {}
        station_mae[station] = {}
        station_r2[station] = {}
        station_skill[station] = {}

        # Read data and filter desired train types (ic and commuter)
        table = 'features_testset'
        if hasattr(options, 'test_table'):
            table = options.test_table
        data = bq.get_rows(starttime,
                           endtime,
                           loc_col='trainstation',
                           project=options.project,
                           dataset='trains_data',
                           table=table,
                           parameters=all_param_names,
                           only_winters=options.only_winters,
                           reason_code_table=options.reason_code_table,
                           reason_codes_exclude=options.reason_codes_exclude,
                           reason_codes_include=options.reason_codes_include,
                           locations=[station])

        data = io.filter_train_type(labels_df=data,
                                    train_types=['K', 'L'],
                                    sum_types=True,
                                    train_type_column='train_type',
                                    location_column='trainstation',
                                    time_column='time',
                                    sum_columns=['train_count', 'delay'],
                                    aggs=aggs)

        # Nothing to evaluate for this station
        if len(data) == 0:
            continue

        if options.y_avg_hours is not None:
            data = io.calc_running_delay_avg(data, options.y_avg_hours)

        if options.y_avg:
            data = io.calc_delay_avg(data)

        # Optionally derive a 'month' feature from the time column and make
        # sure every feature-param list contains it exactly once
        if options.month:
            logging.info('Adding month to the dataset...')
            data = data.assign(
                month=lambda df: df.loc[:, 'time'].map(lambda x: x.month))
            if 'month' not in options.feature_params:
                options.feature_params.append('month')
            if 'month' not in options.regressor_feature_params:
                options.regressor_feature_params.append('month')
            if 'month' not in options.classifier_feature_params:
                options.classifier_feature_params.append('month')

        data.sort_values(by=['time'], inplace=True)
        logging.info('Processing {} rows...'.format(len(data)))

        if all_data is None:
            all_data = data
        else:
            # NOTE(review): DataFrame.append returns a new frame; this result
            # is discarded, so all_data only ever holds the first station's
            # rows -- looks like a bug, confirm intent.
            all_data.append(data, ignore_index=True)

        # Pick times for creating error time series
        times = data.loc[:, 'time']
        station_count += 1

        # Run prediction
        try:
            #target, y_pred = predictor.pred(times, data)
            y_pred, y_pred_bin, y_pred_bin_proba = predictor.pred(times, data)
            # Drop first times which LSTM are not able to predict
            #times = times[(len(data)-len(y_pred)):]
        except (PredictionError, ModelError) as e:
            logging.error(e)
            continue

        target = data.loc[:, options.label_params].reset_index(
            drop=True).values.ravel()

        if len(y_pred) < 1 or len(target) < 1:
            continue

        # Create timeseries of predicted and happended delay; one list of
        # per-station values per timestamp, averaged after the station loop
        i = 0
        for t in times:
            try:
                if t not in avg_delay.keys():
                    avg_delay[t] = [target[i]]
                    avg_pred_delay[t] = [y_pred[i]]
                    if predictor.y_pred_bin_proba is not None:
                        avg_proba[t] = [predictor.y_pred_bin_proba[i, 1]]
                else:
                    avg_delay[t].append(target[i])
                    avg_pred_delay[t].append(y_pred[i])
                    if predictor.y_pred_bin_proba is not None:
                        avg_proba[t].append(predictor.y_pred_bin_proba[i, 1])
            except IndexError as e:
                # LSTM don't have first time steps because it don't
                # have necessary history
                pass
            i += 1

        # For creating visualisation
        all_times = all_times.union(set(times))

        # If only average plots are asked, continue to next station
        if options.only_avg == 1:
            continue

        # Calculate errors for given station, first for all periods and then for whole time range
        if predictor.y_pred_bin is not None:
            time_list += list(times)

            #feature_list += list()
            target_list += list(target)
            y_pred_bin_list += list(predictor.y_pred_bin)
            y_pred_bin_proba_list += list(predictor.y_pred_bin_proba)

            # 2592000 s = 30 days: split the series into ~monthly parts
            splits = viz._split_to_parts(list(times), [
                target, y_pred, predictor.y_pred_bin,
                predictor.y_pred_bin_proba
            ], 2592000)
        else:
            splits = viz._split_to_parts(list(times), [target, y_pred],
                                         2592000)

        for i in range(0, len(splits)):

            logging.info('Month {}:'.format(i + 1))

            if predictor.y_pred_bin is not None:
                times_, target_, y_pred_, y_pred_bin_, y_pred_bin_proba_ = splits[
                    i]
                viz.classification_perf_metrics(y_pred_bin_proba_, y_pred_bin_,
                                                target_, options, times_,
                                                station)
            else:
                times_, target_, y_pred_ = splits[i]

            rmse = math.sqrt(metrics.mean_squared_error(target_, y_pred_))
            mae = metrics.mean_absolute_error(target_, y_pred_)
            r2 = metrics.r2_score(target_, y_pred_)
            # Baseline: always predict the dataset-wide mean delay
            rmse_stat = math.sqrt(
                metrics.mean_squared_error(target_,
                                           np.full_like(target_, mean_delay)))
            skill = 1 - rmse / rmse_stat

            # Put errors to timeseries
            station_rmse[station][i] = rmse
            station_mae[station][i] = mae
            station_r2[station][i] = r2
            station_skill[station][i] = skill

            logging.info('RMSE of station {} month {}: {:.4f}'.format(
                stationName, i + 1, rmse))
            logging.info('MAE of station {} month {}: {:.4f}'.format(
                stationName, i + 1, mae))
            logging.info('R2 score of station {} month {}: {:.4f}'.format(
                stationName, i + 1, r2))
            logging.info('Skill (RMSE) of station {} month {}: {:.4f}'.format(
                stationName, i + 1, skill))

        # Whole-range errors for this station.
        # NOTE(review): the RMSE here is assigned to 'mse', so the 'rmse'
        # used below (and stored under 'all') is the stale value from the
        # last month of the loop above -- looks like a bug, confirm.
        mse = math.sqrt(metrics.mean_squared_error(target, y_pred))
        mae = metrics.mean_absolute_error(target, y_pred)
        r2 = metrics.r2_score(target, y_pred)
        rmse_stat = math.sqrt(
            metrics.mean_squared_error(target,
                                       np.full_like(target, mean_delay)))
        skill = 1 - rmse / rmse_stat

        station_rmse[station]['all'] = rmse
        station_mae[station]['all'] = mae
        station_r2[station]['all'] = r2
        station_skill[station]['all'] = skill

        logging.info('All periods:')
        # NOTE(review): these log lines reuse the month template and the
        # leftover loop index i, so "month {}" is misleading here.
        logging.info('RMSE of station {} month {}: {:.4f}'.format(
            stationName, i + 1, rmse))
        logging.info('MAE of station {} month {}: {:.4f}'.format(
            stationName, i + 1, mae))
        logging.info('R2 score of station {} month {}: {:.4f}'.format(
            stationName, i + 1, r2))
        logging.info('Skill (RMSE) of station {} month {}: {:.4f}'.format(
            stationName, i + 1, skill))

        # Create csv and upload it to pucket
        times_formatted = [t.strftime('%Y-%m-%dT%H:%M:%S') for t in times]
        delay_data = {
            'times': times_formatted,
            'delay': target,
            'predicted delay': y_pred
        }
        fname = '{}/delays_{}.csv'.format(options.vis_path, station)
        io.write_csv(delay_data, fname, fname)

        # Draw visualisation
        if predictor.y_pred_bin_proba is not None:
            fname = '{}/timeseries_proba_{}'.format(options.vis_path, station)
            # Column 1: probability of the positive (delayed) class
            proba = predictor.y_pred_bin_proba[:, 1]
            viz.plot_delay(times,
                           target,
                           None,
                           'Delay for station {}'.format(stationName),
                           fname,
                           all_proba=proba,
                           proba_mode='same',
                           color_threshold=options.class_limit)
        #else:
        fname = '{}/timeseries_regression_{}'.format(options.vis_path, station)
        viz.plot_delay(times,
                       target,
                       y_pred,
                       'Delay for station {}'.format(stationName),
                       fname,
                       all_proba=None)

        fname = '{}/scatter_all_stations.png'.format(options.vis_path)
        viz.scatter_predictions(times,
                                target,
                                y_pred,
                                savepath=options.vis_path,
                                filename='scatter_{}'.format(station))

    # Save all station related results to csv and upload them to bucket
    fname = '{}/station_rmse.csv'.format(options.vis_path)
    io.dict_to_csv(station_rmse, fname, fname)
    fname = '{}/station_mae.csv'.format(options.vis_path)
    io.dict_to_csv(station_mae, fname, fname)
    fname = '{}/station_r2.csv'.format(options.vis_path)
    io.dict_to_csv(station_r2, fname, fname)
    fname = '{}/station_skill_rmse.csv'.format(options.vis_path)
    io.dict_to_csv(station_skill, fname, fname)

    # Create timeseries of avg actual delay and predicted delay:
    # collapse each timestamp's per-station list into its mean
    all_times = sorted(list(all_times))
    for t, l in avg_delay.items():
        avg_delay[t] = sum(l) / len(l)
    for t, l in avg_pred_delay.items():
        avg_pred_delay[t] = sum(l) / len(l)
    for t, l in avg_proba.items():
        avg_proba[t] = sum(l) / len(l)

    # Sort by timestamp and keep only the values, aligned with all_times
    avg_delay = list(
        OrderedDict(sorted(avg_delay.items(), key=lambda t: t[0])).values())
    avg_pred_delay = list(
        OrderedDict(sorted(avg_pred_delay.items(),
                           key=lambda t: t[0])).values())
    avg_proba = list(
        OrderedDict(sorted(avg_proba.items(), key=lambda t: t[0])).values())

    # Calculate average over all times and stations, first for all months separately, then for whole time range
    # NOTE(review): 'times' here is the last processed station's series, not
    # all_times -- the split boundaries may not match the averaged data.
    splits = viz._split_to_parts(list(times), [avg_delay, avg_pred_delay],
                                 2592000)

    for i in range(0, len(splits)):
        times_, avg_delay_, avg_pred_delay_ = splits[i]

        try:
            rmse = math.sqrt(
                metrics.mean_squared_error(avg_delay_, avg_pred_delay_))
            mae = metrics.mean_absolute_error(avg_delay_, avg_pred_delay_)
            r2 = metrics.r2_score(avg_delay_, avg_pred_delay_)
            rmse_stat = math.sqrt(
                metrics.mean_squared_error(
                    avg_delay_, np.full_like(avg_delay_, mean_delay)))
            skill = 1 - rmse / rmse_stat
        except ValueError:
            logging.warning('Zero samples in some class')
            continue

        logging.info('Month: {}'.format(i + 1))
        logging.info(
            'RMSE of average delay over all stations: {:.4f}'.format(rmse))
        logging.info(
            'MAE of average delay over all stations: {:.4f}'.format(mae))
        logging.info(
            'R2 score of average delay over all stations: {:.4f}'.format(r2))
        logging.info(
            'Skill score (RMSE) of average delay over all stations: {:.4f}'.
            format(skill))

        # Write average data into file
        avg_errors = {
            'rmse': rmse,
            'mae': mae,
            'r2': r2,
            'skill': skill,
            'nro_of_samples': len(avg_delay)
        }
        fname = '{}/avg_erros_{}.csv'.format(options.vis_path, i)
        io.dict_to_csv(avg_errors, fname, fname)

    # Whole-range metrics on the station-averaged series
    rmse = math.sqrt(metrics.mean_squared_error(avg_delay, avg_pred_delay))
    #rmse_mean = np.mean(list(station_rmse.values()))
    mae = metrics.mean_absolute_error(avg_delay, avg_pred_delay)
    #mae_mean = np.mean(list(station_mae.values()))
    r2 = metrics.r2_score(avg_delay, avg_pred_delay)
    rmse_stat = math.sqrt(
        metrics.mean_squared_error(avg_delay,
                                   np.full_like(avg_delay, mean_delay)))
    skill = 1 - rmse / rmse_stat
    #skill_mean = 1 - rmse_mean/rmse_stat

    logging.info('All periods:')
    logging.info(
        'RMSE of average delay over all stations: {:.4f}'.format(rmse))
    #logging.info('Average RMSE of all station RMSEs: {:.4f}'.format(rmse_mean))
    logging.info('MAE of average delay over all stations: {:.4f}'.format(mae))
    #logging.info('Average MAE of all station MAEs: {:.4f}'.format(mae_mean))
    logging.info(
        'R2 score of average delay over all stations: {:.4f}'.format(r2))
    logging.info(
        'Skill score (RMSE) of average delay over all stations: {:.4f}'.format(
            skill))
    #logging.info('Skill score (avg RMSE) of all stations: {:.4f}'.format(skill_mean))

    # Write average data into file
    avg_errors = {
        'rmse': rmse,
        'mae': mae,
        'r2': r2,
        #'rmse_mean': rmse_mean,
        #'mae_mean': mae_mean,
        'skill': skill,
        #'skill_mean': skill_mean,
        'nro_of_samples': len(avg_delay)
    }
    fname = '{}/avg_erros.csv'.format(options.vis_path)
    io.dict_to_csv(avg_errors, fname, fname)

    # Create timeseries of average delay and predicted delays over all stations
    all_times_formatted = [t.strftime('%Y-%m-%dT%H:%M:%S') for t in all_times]
    delay_data = {
        'times': all_times_formatted,
        'delay': avg_delay,
        'predicted delay': avg_pred_delay
    }

    # write csv
    fname = '{}/avg_delays_all_stations.csv'.format(options.vis_path)
    io.write_csv(delay_data, fname, fname)

    # visualise
    if not avg_proba:
        proba = None
    else:
        proba = avg_proba
    fname = '{}/timeseries_avg_all_stations.png'.format(options.vis_path)

    if predictor.y_pred_bin is not None:
        viz.plot_delay(all_times,
                       avg_delay,
                       None,
                       'Average delay for all station',
                       fname,
                       all_proba=proba,
                       proba_mode='same',
                       color_threshold=options.class_limit)
    else:
        viz.plot_delay(all_times, avg_delay, avg_pred_delay,
                       'Average delay for all station', fname)

    fname = '{}/scatter_all_stations.png'.format(options.vis_path)
    viz.scatter_predictions(all_times,
                            avg_delay,
                            avg_pred_delay,
                            savepath=options.vis_path,
                            filename='scatter_all_stations')

    # Binary classification metrics: re-run prediction on the combined data
    # (see the all_data note above) and report per-month classifier metrics
    if predictor.y_pred_bin is not None:
        all_data.sort_values(by=['time'], inplace=True)
        times = all_data.loc[:, 'time'].values
        try:
            target = all_data.loc[:, options.label_params].reset_index(
                drop=True).values.ravel()
            y_pred, y_pred_bin, y_pred_bin_proba = predictor.pred(
                times, all_data)
            # Drop first times which LSTM are not able to predict
            times = times[(len(all_data) - len(y_pred)):]
            splits = viz._split_to_parts(list(times), [
                target, y_pred, predictor.y_pred_bin,
                predictor.y_pred_bin_proba
            ], 2592000)

            for i in range(0, len(splits)):
                #times_, target_, y_pred_bin_, y_pred_bin_proba_ = splits[i]
                times_, target_, y_pred_, y_pred_bin_, y_pred_bin_proba_ = splits[
                    i]
                viz.classification_perf_metrics(y_pred_bin_proba_, y_pred_bin_,
                                                target_, options, times_,
                                                'all')
        except (PredictionError, ModelError) as e:
            logging.error(e)
            pass