예제 #1
0
def my_app(cfg: Config) -> None:
    """Run every pipeline stage that is enabled in ``cfg.mode``.

    The resolved configuration and the original/current working directories
    are printed first. Then, for each of the stages PrepareData, Train,
    Predict and Visualize (checked in that order), the stage is executed if
    its ``EProgramMode`` member is contained in ``cfg.mode``.

    Args:
        cfg: Application configuration. The fields read here are ``mode``,
            ``prepare_data``, ``train_dataset``, ``test_dataset``,
            ``vocab_path`` and ``train``.
    """
    print(OmegaConf.to_yaml(cfg))
    working_dir = get_original_cwd()
    print(f"Orig working directory    : {working_dir}")
    print(f"Current working directory : {os.getcwd()}")

    if EProgramMode.PrepareData in cfg.mode:
        logger.info("STAGE: PrepareData")
        # Dataset/vocab paths are passed through to_absolute_path before use.
        prepare_data(
            cfg.prepare_data,
            to_absolute_path(cfg.train_dataset),
            to_absolute_path(cfg.test_dataset),
            to_absolute_path(cfg.vocab_path),
        )

    if EProgramMode.Train in cfg.mode:
        logger.info("STAGE: Train")
        main_train_model(cfg)

    if EProgramMode.Predict in cfg.mode:
        # Previously logged "STAGE: Train", which made the logs misleading.
        logger.info("STAGE: Predict")
        predict_model(cfg)

    if EProgramMode.Visualize in cfg.mode:
        logger.info("STAGE: Visualize")
        run_visualization(
            to_absolute_path(cfg.prepare_data.train_file),
            cfg.train.report_path,
            cfg.train.dump_model,
        )
예제 #2
0
    def parameterized_test(self, model, mode):
        """Run the whole pipeline once and check each stage's output file.

        The stages are executed in order: dataset creation, feature
        building, model training and prediction. After each stage the test
        asserts that the expected artifact exists under ``self.test_dir``.

        :param model: model identifier forwarded to ``train_model`` and
            ``predict_model``
        :param mode: training mode; any value other than ``"full"`` is
            appended to the model directory name as ``_<mode>``
        """
        # given:
        source_dir = "test-data"
        root = self.test_dir
        interim_dir = "/".join((root, "interim"))
        processed_dir = "/".join((root, "processed"))
        model_dir = "/".join((root, "model"))
        # "full" mode writes into the plain model directory; other modes get
        # their own "<model_dir>_<mode>" directory.
        mode_suffix = "" if mode == "full" else "_" + mode
        model_path = model_dir + mode_suffix + "/0001.txt"
        submission_dir = "/".join((root, "submissions"))
        submission_path = "/".join((submission_dir, "submission.csv"))

        # data preparation
        # when:
        make_dataset(source_dir, interim_dir)

        # then:
        # NOTE(review): the same file is checked twice; the second check was
        # presumably meant for a different artifact -- confirm.
        interim_pickle = interim_dir + "/test_data.pkl"
        self.assertTrue(os.path.exists(interim_pickle))
        self.assertTrue(os.path.exists(interim_pickle))

        # feature engineering
        # when:
        build_features(source_dir, processed_dir)

        # then:
        # NOTE(review): duplicated check, same remark as above -- confirm.
        processed_pickle = processed_dir + "/test_data.pkl"
        self.assertTrue(os.path.exists(processed_pickle))
        self.assertTrue(os.path.exists(processed_pickle))

        # model training
        # when:
        train_model(model, mode, processed_dir, model_dir)

        # then:
        model_written = os.path.exists(model_path)
        self.assertTrue(model_written)

        # model prediction
        # when:
        predict_model(processed_dir, model, model_path, submission_path)

        # then:
        submission_written = os.path.exists(submission_path)
        self.assertTrue(submission_written)
예제 #3
0
def get_prediction_result(url):
    """
    Classify a URL and build the response describing the verdict.

    The URL is reported as "malicious" only when the predicted label equals
    the configured ``predicted_label`` AND the prediction's confidence
    reaches the configured ``confidence_score``; otherwise it is reported as
    "benign".

    :param url: string
    :return: a dictionary containing the message and prediction result
    """
    result = predict_model(url)
    api_config = config['data']['api']

    # The threshold has to be compared with the confidence score. The
    # previous code compared it with the predicted label a second time,
    # unlike url_bulk_consume, which checks result['confidence_score'].
    is_malicious = (
        result['prediction_score'] == api_config['predicted_label']
        and result['confidence_score'] >= api_config['confidence_score']
    )
    prediction = "malicious" if is_malicious else "benign"

    message = "Phishy has detected that this URL is {}".format(prediction)

    return {"message": message, "result": result}
예제 #4
0
def url_bulk_consume():
    """
    Consumes urls from configured kafka topic , runs the predictions on them,
    sends the malicious urls to the second configured kafka topic

    Each message is committed after it has been processed. The consumer is
    always closed before the function returns or raises, so its resources
    are released even if a prediction fails.

    Returns: None
    """
    logger = logging.getLogger(__name__)
    kafka_config = config['kafka']
    consumer_config = kafka_config['consumer']
    consumer = KafkaConsumer(
        consumer_config['kafka_topic_to_read_from'],
        bootstrap_servers=kafka_config['bootstrap_servers'],
        api_version=tuple(kafka_config['api_version']),
        group_id=consumer_config['group_id'],
        enable_auto_commit=consumer_config['enable_auto_commit'],
        auto_offset_reset=consumer_config['auto_offset_reset'],
        consumer_timeout_ms=consumer_config['consumer_timeout_ms'],
        max_poll_interval_ms=consumer_config['max_poll_interval_ms'])
    start = time.time()
    logger.info("Reading data ...")
    try:
        for message in consumer:
            url = message.value.decode("utf-8")
            logger.info(url)
            result = predict_model(url)
            prediction_score = result['prediction_score']
            confidence_score = result['confidence_score']

            logger.info("prediction: %s", prediction_score)

            if (prediction_score == kafka_config['predicted_label']
                    and confidence_score >= kafka_config['confidence_score']):
                logger.info("Found malicious url")
                send_to_kafka(result)
            consumer.commit()
        elapsed = time.time() - start
    finally:
        # Offsets are committed explicitly after every processed message, so
        # skip the commit-on-close: after a failure it could mark the message
        # that was being processed as consumed.
        consumer.close(autocommit=False)
    logger.info(
        "Ellapsed time: %s for consuming and predicting urls from kafka topic",
        elapsed)
예제 #5
0
def prediction_result(predict_url):
    """Return whatever ``predict_model`` produces for ``predict_url``."""
    return predict_model(predict_url)
예제 #6
0
def main():
    """Call ``predict_model`` on ``other_texts`` between start/end log lines.

    The value returned by ``predict_model`` is not used.
    """
    logging.info('INI main()')
    predict_model(other_texts)
    logging.info('FIN main()')
예제 #7
0
def main():
    """Render the Streamlit page that forecasts a market index.

    Steps performed on every run:

    1. Read the market and the date range from the sidebar.
    2. Fetch the index with ``yf.download`` and compute ``rt``, the log
       return (in percent) between consecutive rows of ``Close``.
    3. Add shifted returns, cluster them and train the model on every row
       except the last ``predict_n_days`` ones.
    4. Show the prediction for the row following the data, then back-test
       the model on the held-out rows and display the result table,
       accuracy, RMSE and two line charts.
    """
    # Sidebar section:
    page_selection = st.sidebar.radio("Select a market:",
                                      ["Nikkey", "Bovespa"])

    dct_market = {
        "Nikkey": {
            "country": "Japan",
            "continent": "Asia",
            "index_name": "^N225"
        },
        "Bovespa": {
            "country": "Brazil",
            "continent": "America",
            "index_name": "^BVSP"
        }
    }

    st.markdown(f"# {page_selection}")

    # Default window: the last 3150 days, adjustable from the sidebar.
    end_date = date.today()
    start_date = end_date - timedelta(days=3150)

    start_date = st.sidebar.date_input('Start date', start_date)
    end_date = st.sidebar.date_input('End date', end_date)

    df = yf.download(dct_market[page_selection]["index_name"],
                     start=start_date,
                     end=end_date)

    # Log return in percent between consecutive rows.
    df["rt"] = (np.log(df["Close"]) -
                np.log(df["Close"].shift(periods=1))) * 100

    df = create_shifted_rt(df, [1, 5, 37])

    df_clustered = uniform_clustering(
        df[["Close", "rt", "rt-1", "rt-5", "rt-37"]],
        ["rt", "rt-1", "rt-5", "rt-37"])
    df_clustered.dropna(how="any", axis=0, inplace=True)

    lst_relations = [('cluster_rt-37', 'cluster_rt'),
                     ('cluster_rt-5', 'cluster_rt'),
                     ('cluster_rt-1', 'cluster_rt')]

    df_clustered = df_clustered[[
        "rt", "cluster_rt-37", "cluster_rt-5", "cluster_rt-1", "cluster_rt"
    ]]

    predict_n_days = 20

    # The last predict_n_days rows are held out for the back-test below.
    model = train_model(df_clustered.iloc[:-predict_n_days], lst_relations)

    # Evidence for the row that would follow the last available one.
    evidence = {
        'cluster_rt-37': df_clustered.iloc[-37]['cluster_rt'],
        'cluster_rt-5': df_clustered.iloc[-5]['cluster_rt'],
        'cluster_rt-1': df_clustered.iloc[-1]['cluster_rt']
    }

    predict = predict_model(model, evidence=evidence)

    st.text(f"Previsão para amanhã: {predict[0]}")

    resultado = {}

    for i in range(1, predict_n_days + 1):
        # Rows 1, 5 and 37 positions before the target row iloc[-i].
        evidence = {
            'cluster_rt-37': df_clustered.iloc[-37 - i]['cluster_rt'],
            'cluster_rt-5': df_clustered.iloc[-5 - i]['cluster_rt'],
            'cluster_rt-1': df_clustered.iloc[-1 - i]['cluster_rt']
        }

        predict = predict_model(model, evidence=evidence)

        # The ground truth is the held-out row the evidence was built for.
        # It was previously read from iloc[i], i.e. from the first rows of
        # the data set, which belong to the training window.
        target_row = df_clustered.iloc[-i]

        resultado[i] = [
            predict[0]['cluster_rt'], target_row['cluster_rt'],
            target_row['rt']
        ]

    resultado = pd.DataFrame.from_dict(resultado, orient='index')
    resultado.rename(columns={0: 'Previsão', 1: 'Real', 2: 'rt'}, inplace=True)

    # Mean return of each cluster actually observed in the back-test window.
    rt_mean = round(
        resultado.groupby(by=["Real"]).agg(
            {"rt": ["min", "max", "count", "mean"]}), 2)[("rt", "mean")]

    # One condition per observed cluster label, so conditions and choices
    # always have the same length and each label maps to its own mean. The
    # former hard-coded lists (6 labels for Nikkey, 4 for Bovespa) made
    # np.select raise whenever a cluster was missing from the window.
    conditions = [resultado["Previsão"] == label for label in rt_mean.index]
    choices = rt_mean.tolist()

    # Predicted labels that never occur in "Real" get NaN.
    # NOTE(review): a NaN here presumably makes the RMSE computation below
    # fail -- confirm the desired handling.
    resultado["rt_predict"] = np.select(conditions, choices, default=np.nan)

    # Rows were collected newest-first; reverse them before accumulating.
    resultado = resultado[::-1]

    resultado["rt_predict_acumulado"] = resultado["rt_predict"].cumsum()
    resultado["rt_acumulado"] = resultado["rt"].cumsum()

    st.dataframe(resultado)

    # Take the square root explicitly instead of passing squared=False,
    # which recent scikit-learn releases deprecate and later remove.
    rmse_uniform = np.sqrt(
        mean_squared_error(resultado["rt"], resultado["rt_predict"]))

    acuracia = accuracy_score(resultado["Real"],
                              resultado["Previsão"],
                              normalize=True)

    st.text(f"Acurácia: {round(acuracia*100, 2)}%")
    st.text(f"RMSE: {round(rmse_uniform, 2)}%")

    st.line_chart(df[['Close']])

    st.line_chart(df["rt"])