def my_app(cfg: Config) -> None:
    """Run every pipeline stage enabled in ``cfg.mode``.

    The resolved configuration and the original/current working directories
    are printed first. The stages are then executed in a fixed order
    (PrepareData, Train, Predict, Visualize); a stage runs only when its
    ``EProgramMode`` member is contained in ``cfg.mode``.

    Args:
        cfg: Application configuration. Reads ``mode``, ``prepare_data``,
            ``train_dataset``, ``test_dataset``, ``vocab_path`` and
            ``train.report_path`` / ``train.dump_model``.
    """
    print(OmegaConf.to_yaml(cfg))
    working_dir = get_original_cwd()
    print(f"Orig working directory : {working_dir}")
    print(f"Current working directory : {os.getcwd()}")

    if EProgramMode.PrepareData in cfg.mode:
        logger.info("STAGE: PrepareData")
        # Dataset and vocabulary paths are resolved with to_absolute_path
        # before being handed to the stage.
        prepare_data(
            cfg.prepare_data,
            to_absolute_path(cfg.train_dataset),
            to_absolute_path(cfg.test_dataset),
            to_absolute_path(cfg.vocab_path),
        )

    if EProgramMode.Train in cfg.mode:
        logger.info("STAGE: Train")
        main_train_model(cfg)

    if EProgramMode.Predict in cfg.mode:
        logger.info("STAGE: Predict")
        predict_model(cfg)

    if EProgramMode.Visualize in cfg.mode:
        logger.info("STAGE: Visualize")
        run_visualization(
            to_absolute_path(cfg.prepare_data.train_file),
            cfg.train.report_path,
            cfg.train.dump_model,
        )
def parameterized_test(self, model, mode):
    """Run the pipeline end to end and check each stage's output file exists.

    The stages run in order: dataset creation, feature building, model
    training and prediction. All outputs are written below ``self.test_dir``.

    :param model: model identifier passed to ``train_model`` and
        ``predict_model``
    :param mode: training mode; any value other than ``"full"`` is appended
        to the model directory name as ``_<mode>``
    """
    file_exists = os.path.exists

    # given:
    data_dir = "test-data"
    root = self.test_dir
    interim_dir = root + "/interim"
    processed_dir = root + "/processed"
    model_dir = root + "/model"
    mode_suffix = "" if mode == "full" else "_" + mode
    model_path = model_dir + mode_suffix + "/0001.txt"
    submission_dir = root + "/submissions"
    submission_path = submission_dir + "/submission.csv"

    # data preparation
    # when:
    make_dataset(data_dir, interim_dir)
    # then:
    # NOTE(review): the same file is asserted twice here and in the next
    # stage; this looks like a copy-paste of a check meant for a different
    # file -- confirm the intended file name.
    interim_output = interim_dir + "/test_data.pkl"
    self.assertTrue(file_exists(interim_output))
    self.assertTrue(file_exists(interim_output))

    # feature engineering
    # when:
    build_features(data_dir, processed_dir)
    # then:
    processed_output = processed_dir + "/test_data.pkl"
    self.assertTrue(file_exists(processed_output))
    self.assertTrue(file_exists(processed_output))

    # model training
    # when:
    train_model(model, mode, processed_dir, model_dir)
    # then:
    self.assertTrue(file_exists(model_path))

    # model prediction
    # when:
    predict_model(processed_dir, model, model_path, submission_path)
    # then:
    self.assertTrue(file_exists(submission_path))
def get_prediction_result(url):
    """
    Classify a URL as malicious or benign and build the response payload.

    The URL is reported as malicious only when the predicted label equals
    the configured ``predicted_label`` AND the model's confidence reaches the
    configured ``confidence_score`` threshold (both read from
    ``config['data']['api']``); otherwise it is reported as benign.

    :param url: string
    :return: a dictionary containing the message and prediction result
    """
    result = predict_model(url)
    api_config = config['data']['api']

    # The threshold is checked against the confidence value, not against the
    # predicted label -- the same rule url_bulk_consume applies.
    is_malicious = (
        result['prediction_score'] == api_config['predicted_label']
        and result['confidence_score'] >= api_config['confidence_score']
    )
    prediction = "malicious" if is_malicious else "benign"

    message = "Phishy has detected that this URL is {}".format(prediction)
    return {"message": message, "result": result}
def url_bulk_consume():
    """
    Consumes urls from the configured kafka topic, runs the predictions on
    them, and sends the malicious urls to the second configured kafka topic.

    A url is forwarded when its predicted label equals
    ``config['kafka']['predicted_label']`` and its confidence is at least
    ``config['kafka']['confidence_score']``. Offsets are committed after each
    processed message, and the consumer is always closed on exit, including
    when prediction or sending raises.

    Returns:
        None
    """
    logger = logging.getLogger(__name__)
    kafka_config = config['kafka']
    consumer_config = kafka_config['consumer']

    consumer = KafkaConsumer(
        consumer_config['kafka_topic_to_read_from'],
        bootstrap_servers=kafka_config['bootstrap_servers'],
        api_version=tuple(kafka_config['api_version']),
        group_id=consumer_config['group_id'],
        enable_auto_commit=consumer_config['enable_auto_commit'],
        auto_offset_reset=consumer_config['auto_offset_reset'],
        consumer_timeout_ms=consumer_config['consumer_timeout_ms'],
        max_poll_interval_ms=consumer_config['max_poll_interval_ms'],
    )

    # Loop-invariant thresholds, read once.
    malicious_label = kafka_config['predicted_label']
    min_confidence = kafka_config['confidence_score']

    start = time.time()
    logger.info("Reading data ...")
    try:
        for message in consumer:
            url = message.value.decode("utf-8")
            logger.info(url)

            result = predict_model(url)
            prediction_score = result['prediction_score']
            confidence_score = result['confidence_score']
            logger.info("prediction: %s", prediction_score)

            if (prediction_score == malicious_label
                    and confidence_score >= min_confidence):
                logger.info("Found malicious url")
                send_to_kafka(result)

            # Commit only after the message has been fully handled.
            consumer.commit()
        # Measured before close() so the figure covers consuming and
        # predicting only.
        elapsed = time.time() - start
    finally:
        # Release sockets and leave the consumer group promptly.
        consumer.close()

    logger.info(
        "Ellapsed time: %s for consuming and predicting urls from kafka topic",
        elapsed,
    )
def prediction_result(predict_url):
    """Return whatever ``predict_model`` produces for ``predict_url``."""
    return predict_model(predict_url)
def main():
    """Run ``predict_model`` on ``other_texts`` between start/end log markers."""
    logging.info('INI main()')
    # The predictions are computed but not used any further here.
    predict_model(other_texts)
    logging.info('FIN main()')
def _build_evidence(df_clustered, offset=0):
    """Return the lagged cluster labels used as evidence, ``offset`` rows before the end."""
    return {
        'cluster_rt-37': df_clustered.iloc[-37 - offset]['cluster_rt'],
        'cluster_rt-5': df_clustered.iloc[-5 - offset]['cluster_rt'],
        'cluster_rt-1': df_clustered.iloc[-1 - offset]['cluster_rt'],
    }


def main():
    """Render the Streamlit page: market selection, forecast and backtest.

    Steps:
      1. Let the user pick a market and a date range in the sidebar.
      2. Download the index prices, compute percentage log-returns (``rt``)
         and cluster the returns and their 1/5/37-row lags.
      3. Train the model on everything except the last ``predict_n_days``
         rows and show the prediction for the next day.
      4. Backtest on the held-out last ``predict_n_days`` rows and show the
         per-row results, accuracy and RMSE, followed by price/return charts.
    """
    # Sidebar section:
    page_selection = st.sidebar.radio("Select a market:", ["Nikkey", "Bovespa"])
    dct_market = {
        "Nikkey": {
            "country": "Japan",
            "continent": "Asia",
            "index_name": "^N225",
        },
        "Bovespa": {
            "country": "Brazil",
            "continent": "America",
            "index_name": "^BVSP",
        },
    }
    st.markdown(f"# {page_selection}")

    end_date = date.today()
    start_date = end_date - timedelta(days=3150)
    start_date = st.sidebar.date_input('Start date', start_date)
    end_date = st.sidebar.date_input('End date', end_date)

    df = yf.download(dct_market[page_selection]["index_name"],
                     start=start_date,
                     end=end_date)
    if df.empty:
        st.error("No price data returned for the selected date range.")
        return

    # Percentage log-return relative to the previous row.
    df["rt"] = (np.log(df["Close"]) - np.log(df["Close"].shift(periods=1))) * 100
    df = create_shifted_rt(df, [1, 5, 37])
    df_clustered = uniform_clustering(
        df[["Close", "rt", "rt-1", "rt-5", "rt-37"]],
        ["rt", "rt-1", "rt-5", "rt-37"])
    df_clustered.dropna(how="any", axis=0, inplace=True)

    lst_relations = [('cluster_rt-37', 'cluster_rt'),
                     ('cluster_rt-5', 'cluster_rt'),
                     ('cluster_rt-1', 'cluster_rt')]
    df_clustered = df_clustered[[
        "rt", "cluster_rt-37", "cluster_rt-5", "cluster_rt-1", "cluster_rt"
    ]]

    predict_n_days = 20
    # The oldest row read below is iloc[-37 - predict_n_days].
    min_rows = 37 + predict_n_days
    if len(df_clustered) < min_rows:
        st.error(
            f"Not enough data in the selected date range: need at least "
            f"{min_rows} rows after clustering, got {len(df_clustered)}.")
        return

    # Hold out the last predict_n_days rows for the backtest.
    model = train_model(df_clustered.iloc[:-predict_n_days], lst_relations)

    predict = predict_model(model, evidence=_build_evidence(df_clustered))
    st.text(f"Previsão para amanhã: {predict[0]}")

    resultado = {}
    for i in range(1, predict_n_days + 1):
        predict = predict_model(model,
                                evidence=_build_evidence(df_clustered, i))
        # The evidence is taken 1/5/37 rows before row -i, so the matching
        # actual values are those of row -i (inside the held-out tail).
        actual_row = df_clustered.iloc[-i]
        resultado[i] = [
            predict[0]['cluster_rt'],
            actual_row['cluster_rt'],
            actual_row['rt'],
        ]
    resultado = pd.DataFrame.from_dict(resultado, orient='index')
    resultado.rename(columns={0: 'Previsão', 1: 'Real', 2: 'rt'}, inplace=True)

    # Mean actual return per actual cluster, used as the predicted return
    # for a predicted cluster.
    rt_mean = resultado.groupby("Real")["rt"].mean().round(2)

    # One condition per cluster present in rt_mean, so conditions and choices
    # always have the same length and each cluster maps to its own mean.
    # Predicted clusters never observed in the window fall back to NaN.
    conditions = [resultado["Previsão"] == cluster for cluster in rt_mean.index]
    choices = rt_mean.tolist()
    resultado["rt_predict"] = np.select(conditions, choices, default=np.nan)

    # Oldest row first, so the cumulative sums run forward in time; copy to
    # avoid assigning into a view of the original frame.
    resultado = resultado.iloc[::-1].copy()
    resultado["rt_predict_acumulado"] = resultado["rt_predict"].cumsum()
    resultado["rt_acumulado"] = resultado["rt"].cumsum()
    st.dataframe(resultado)

    # sqrt(MSE) instead of the deprecated ``squared=False`` keyword.
    rmse_uniform = float(
        np.sqrt(mean_squared_error(resultado["rt"], resultado["rt_predict"])))
    acuracia = accuracy_score(resultado["Real"],
                              resultado["Previsão"],
                              normalize=True)
    st.text(f"Acurácia: {round(acuracia*100, 2)}%")
    st.text(f"RMSE: {round(rmse_uniform, 2)}%")

    st.line_chart(df[['Close']])
    st.line_chart(df["rt"])