def __init__(self, kafka_configfile, schema_file, s3_configfile):
    """
    class constructor that initializes the instance according to the
    configurations of the S3 bucket and Kafka

    :type kafka_configfile: str  path to Kafka config file
    :type schema_file     : str  path to schema file
    :type s3_configfile   : str  path to S3 config file
    """
    self.kafka_config = utility.parse_config(kafka_configfile)
    self.schema = utility.parse_config(schema_file)
    self.s3_config = utility.parse_config(s3_configfile)

    self.producer = KafkaProducer(bootstrap_servers=self.kafka_config["BROKERS_IP"])
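# --- Usage sketch (assumptions: the enclosing class is exported as `Producer`,
# the Kafka config exposes a "TOPIC" key, and the config paths are illustrative;
# none of these appear in the snippet above). send()/flush() are the standard
# kafka-python producer calls, which expect the payload as bytes.
producer = Producer("config/kafka.config", "config/schema.config", "config/s3.config")
record = '{"id": 1, "value": 42}'
producer.producer.send(producer.kafka_config["TOPIC"], record.encode("utf-8"))
producer.producer.flush()  # block until buffered records are delivered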
def __init__(self, s3_configfile, schema_configfile, psql_configfile):
    """
    class constructor that initializes the instance according to the
    configurations of the S3 bucket, raw data and PostgreSQL table

    :type s3_configfile    : str  path to S3 config file
    :type schema_configfile: str  path to schema config file
    :type psql_configfile  : str  path to PostgreSQL config file
    """
    self.s3_config = utility.parse_config(s3_configfile)
    self.schema = utility.parse_config(schema_configfile)
    self.psql_config = utility.parse_config(psql_configfile)

    self.sc = pyspark.SparkContext.getOrCreate()
    self.sc.setLogLevel("ERROR")
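# --- Instantiation sketch (the class name `BatchTransformer` and the config
# paths are assumptions; only the constructor is shown above). Because the
# constructor uses SparkContext.getOrCreate(), building several instances in
# one driver process reuses the same context instead of failing on a duplicate.
transformer = BatchTransformer("config/s3.config", "config/schema.config", "config/psql.config")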
def predict(config_file):
    """
    Main function that runs predictions

    Args:
        config_file [str]: path to config file

    Returns:
        None
    """
    ##################
    # configure logger
    ##################
    logger = set_logger("../log/predict.log")

    ##################
    # Load config from config file
    ##################
    logger.info(f"Load config from {config_file}")
    config = parse_config(config_file)

    image_width = config['common']['in_image_width']
    image_height = config['common']['in_image_height']
    predict_img = config['predict']['folder_path']
    weights_path = config['common']['weights_path']

    X, img_names = preprocess(predict_img, image_width, image_height)

    model = KeyPointModel().getModel()
    logger.info(f"Loading weights from {weights_path}")
    model.load_weights(weights_path)

    # logger.info("-----------Model Summary------------")
    # logger.info(model.summary())

    predicted_keypoints = model.predict(X)
    logger.info("Prediction Completed. Writing output to predicted.csv")
    write_output(predicted_keypoints, img_names)
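# --- Hypothetical entry point (the real wiring is not shown; the config path is
# a placeholder for a file whose keys match those read above:
# common.in_image_width, common.in_image_height, predict.folder_path, common.weights_path).
if __name__ == "__main__":
    predict("../config/config.yaml")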
def predict(config_file):
    """
    Main function that runs predictions

    Args:
        config_file [str]: path to config file

    Returns:
        None
    """
    ##################
    # configure logger
    ##################
    logger = set_logger("./log/predict.log")

    ##################
    # Load config from config file
    ##################
    logger.info(f"Load config from {config_file}")
    config = parse_config(config_file)
    model_path = Path(config["predict"]["model_path"])
    processed_test = config["predict"]["processed_test"]
    predicted_file = config["predict"]["predicted_file"]
    export_result = config["predict"]["export_result"]
    logger.info(f"config: {config['predict']}")

    ##################
    # Load model & test set
    ##################
    # Load model
    logger.info("-------------------Load the trained model-------------------")
    with open(model_path, "rb") as f:
        trained_model = load(f)

    # Load test set
    logger.info(f"Load the test data from {processed_test}")
    X, y, cols = load_data(processed_test)
    logger.info(f"cols: {cols}")
    logger.info(f"X: {X.shape}")
    logger.info(f"y: {y.shape}")

    ##################
    # Make prediction and evaluate
    ##################
    logger.info("-------------------Predict and evaluate-------------------")
    y_hat = trained_model.predict(X)
    logger.info(f"Classification report: \n {classification_report(y, y_hat)}")
    output = pd.DataFrame(y)
    output["prediction"] = y_hat
    if export_result:
        output.to_csv(predicted_file, index=False)
        logger.info(f"Export prediction to : {predicted_file}")
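# --- One possible CLI wrapper (an assumption; the original wiring is not
# shown). argparse is standard library, so this stays dependency-free.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run predictions from a config file")
    parser.add_argument("--config", required=True, help="path to config file")
    args = parser.parse_args()
    predict(args.config)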
def __init__(self, kafka_configfile, schema_configfile, stream_configfile, start_offset):
    """
    class constructor that initializes the instance according to the configurations
    of Kafka (brokers, topic, offsets), data schema and batch interval for streaming

    :type kafka_configfile : str  path to Kafka config file
    :type schema_configfile: str  path to schema config file
    :type stream_configfile: str  path to stream config file
    :type start_offset     : int  offset from which to read from partitions of Kafka topic
    """
    self.kafka_config = utility.parse_config(kafka_configfile)
    self.stream_config = utility.parse_config(stream_configfile)
    self.schema = utility.parse_config(schema_configfile)

    self.start_offset = start_offset

    # getOrCreate is a classmethod; instantiating SparkContext first would
    # create a throwaway context, so call it on the class directly
    self.sc = pyspark.SparkContext.getOrCreate()
    self.ssc = pyspark.streaming.StreamingContext(self.sc, self.stream_config["INTERVAL"])
    self.sc.setLogLevel("ERROR")
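# --- Instantiation sketch. The class name `SparkStreamerFromKafka` is taken
# from the subclass constructor later in this section; config paths are placeholders.
streamer = SparkStreamerFromKafka("config/kafka.config", "config/schema.config",
                                  "config/stream.config", start_offset=0)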
def main(repo_owner, repo_name, start_date, end_date):
    """
    main logic of the ETL process: call the GitHub API to get commits
    -> process the commits -> load them into PostgreSQL
    """
    print('start_date : ', str(start_date), 'end_date : ', str(end_date))
    repo_url = ('https://api.github.com/repos/{}/{}/commits'
                '?since={}T00:00:00Z&until={}T23:59:59Z').format(repo_owner, repo_name,
                                                                 start_date, end_date)
    df = Commit2df(repo_url)
    output_df = extract_inform(df)
    postgre_config = parse_config('config/postgre.config')
    dumptopostgre = DumpToPostgre()
    dumptopostgre.insert_all_to_table(output_df, 'git_commit', postgre_config)
    # sleep 3 seconds after every scrape to avoid being blocked by the server
    time.sleep(3)
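# --- Illustrative invocation (owner, repo and date range are placeholders).
# The until bound covers the whole end day because the URL appends T23:59:59Z.
main("apache", "spark", "2021-01-01", "2021-01-31")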
def main():
    """
    main function that creates the fact tables and attribute tables from raw git data
    """
    postgre_config = parse_config('config/postgre.config')
    dumptopostgre = DumpToPostgre()
    connection = dumptopostgre.get_conn(postgre_config)
    sql_list = [sql_commit_fact, sql_commit_commitor, sql_commited_repo]
    for sql in sql_list:
        try:
            with connection.cursor() as cursor:
                print(sql)
                cursor.execute(sql)
            connection.commit()
            print('>>>> table build ok')
        except Exception as e:
            print('>>>> table build failed', str(e))
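# --- Note on the design: each statement commits on its own, so one failing DDL
# does not abort the remaining tables. A standard guard (an assumption, not
# shown above) would run it directly:
if __name__ == "__main__":
    main()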
def __init__(self, kafka_configfile, schema_configfile, stream_configfile,
             psql_configfile, start_offset=0):
    """
    class constructor that initializes the instance according to the configurations
    of Kafka (brokers, topic, offsets), PostgreSQL database, data schema
    and batch interval for streaming

    :type kafka_configfile : str  path to Kafka config file
    :type schema_configfile: str  path to schema config file
    :type stream_configfile: str  path to stream config file
    :type psql_configfile  : str  path to PostgreSQL config file
    :type start_offset     : int  offset from which to read from partitions of Kafka topic
    """
    SparkStreamerFromKafka.__init__(self, kafka_configfile, schema_configfile,
                                    stream_configfile, start_offset)
    self.psql_config = utility.parse_config(psql_configfile)
    self.sqlContext = pyspark.sql.SQLContext(self.sc)
    self.load_batch_data()
    self.psql_n = 0
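# --- Driver sketch (the subclass name is not shown above, so `streamer` stands
# for an instance of it). start()/awaitTermination() are the standard
# StreamingContext calls for launching and blocking on a PySpark streaming job.
streamer.ssc.start()
streamer.ssc.awaitTermination()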
def train(config_file):
    """
    Main function that trains and persists a model based on the training set

    Args:
        config_file [str]: path to config file

    Returns:
        None
    """
    ################
    # config logger
    ################
    logger = set_logger("../log/train.log")

    ###############################
    # Load config from config file
    ###############################
    logger.info(f"Load config from {config_file}")
    config = parse_config(config_file)

    keypoints_csv = Path(config['common']['labels_csv_path'])
    val_split = config['common']['val_split']
    train_img_scr_path = config['common']['img_source_path']
    test_img_scr_path = config['common']['img_source_path']
    image_width = config['common']['in_image_width']
    image_height = config['common']['in_image_height']
    epochs = config['train']['epochs']
    train_batch_size = config['train']['batch_size']
    weight_path = config['common']['weight_path']
    no_of_aug = config['train']['no_of_aug']
    test_batch_size = config['test']['batch_size']

    ############
    # Load Data
    ############
    logger.info("----------------Load the data----------------")
    selected_img, keypoint_df = load_data(keypoints_csv)
    logger.info(f"Number of selected images are {selected_img.shape}")
    logger.info(f"Few of the selected images are {selected_img[0:5]}")

    ####################################
    # Get train and test data generators
    ####################################
    X_train, y_train, X_test, y_test = train_test_split(selected_img, keypoint_df, val_split)

    train_gen = Car(x_set=X_train,
                    y_set=y_train,
                    mode='Train',
                    data_path=train_img_scr_path,
                    image_width=image_width,
                    image_height=image_height,
                    batch_size=train_batch_size,
                    augmentations='Self',
                    no_of_aug=no_of_aug)
    test_gen = Car(x_set=X_test,
                   y_set=y_test,
                   mode='Test',
                   data_path=test_img_scr_path,
                   image_width=image_width,
                   image_height=image_height,
                   batch_size=test_batch_size)

    #####################
    # Set and train model
    #####################
    logger.info("-------------------------Initiate Model---------------------")
    model = KeyPointModel().getModel()
    logger.info("--------------------Model Summary---------------------------")
    # route the summary through the logger; model.summary prints to stdout by default
    model.summary(print_fn=logger.info)

    # compile the model
    model.compile(loss='mean_squared_error', optimizer='adam',
                  metrics=['mean_absolute_error'])

    # modelCheckPoint = ModelCheckpoint('car-{val_loss:.2f}.h5', monitor='val_loss', verbose=1,
    #                                   save_best_only=True, save_weights_only=True)
    earlyS = EarlyStopping(monitor='val_loss', min_delta=1, patience=3,
                           restore_best_weights=True)
    reducelr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=2,
                                 min_lr=1e-7)

    history = model.fit(x=train_gen,
                        validation_data=test_gen,
                        callbacks=[earlyS, reducelr],
                        epochs=epochs)
    # log the per-epoch metrics rather than the History object's repr
    logger.info(history.history)

    logger.info("------------Saving Weights--------------")
    model.save_weights(weight_path)
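# --- Illustrative config structure matching the keys read above; every value
# is a placeholder (the real file is whatever parse_config consumes).
config = {
    "common": {"labels_csv_path": "data/keypoints.csv", "val_split": 0.2,
               "img_source_path": "data/images/", "in_image_width": 224,
               "in_image_height": 224, "weight_path": "weights/keypoints.h5"},
    "train": {"epochs": 50, "batch_size": 32, "no_of_aug": 5},
    "test": {"batch_size": 32},
}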
def etl(config_file):
    """
    ETL function that loads raw data and converts it to train and test sets

    Args:
        config_file [str]: path to config file

    Returns:
        None
    """
    ##################
    # configure logger
    ##################
    logger = set_logger("./log/etl.log")

    ##################
    # Load config from config file
    ##################
    logger.info(f"Load config from {config_file}")
    config = parse_config(config_file)
    raw_data_file = config["etl"]["raw_data_file"]
    processed_path = Path(config["etl"]["processed_path"])
    test_size = config["etl"]["test_size"]
    random_state = config["etl"]["random_state"]
    logger.info(f"config: {config['etl']}")

    ##################
    # Data transformation
    ##################
    logger.info("-------------------Start data transformation-------------------")
    wine = pd.read_csv(raw_data_file)
    # bin wine quality into two classes: "bad" for (2, 6.5] and "good" for (6.5, 8]
    bins = (2, 6.5, 8)
    group_names = ["bad", "good"]
    wine["quality"] = pd.cut(wine["quality"], bins=bins, labels=group_names)
    label_quality = LabelEncoder()
    wine["quality"] = label_quality.fit_transform(wine["quality"])
    logger.info("End data transformation")

    ##################
    # Train test split & export
    ##################
    logger.info("-------------------Train test split & Export-------------------")
    train, test = train_test_split(wine, test_size=test_size, random_state=random_state)

    # export data
    logger.info(f"write data to {processed_path}")
    train.to_csv(processed_path / "train.csv", index=False)
    test.to_csv(processed_path / "test.csv", index=False)
    logger.info(f"train: {train.shape}")
    logger.info(f"test: {test.shape}")
    logger.info("\n")
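# --- A worked example of the binning step above: pd.cut with bins=(2, 6.5, 8)
# maps quality in (2, 6.5] to "bad" and (6.5, 8] to "good"; LabelEncoder then
# sorts the labels, so "bad" -> 0 and "good" -> 1.
import pandas as pd
from sklearn.preprocessing import LabelEncoder

q = pd.Series([5, 6, 7, 8])
binned = pd.cut(q, bins=(2, 6.5, 8), labels=["bad", "good"])
print(list(binned))                          # ['bad', 'bad', 'good', 'good']
print(LabelEncoder().fit_transform(binned))  # [0 0 1 1]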
def train(config_file):
    """
    Main function that trains & persists a model based on the training set

    Args:
        config_file [str]: path to config file

    Returns:
        None
    """
    ##################
    # configure logger
    ##################
    logger = set_logger("../log/train.log")

    ##################
    # Load config from config file
    ##################
    logger.info(f"Load config from {config_file}")
    config = parse_config(config_file)
    processed_train = Path(config["train"]["processed_train"])
    ensemble_model = config["train"]["ensemble_model"]
    model_config = config["train"]["model_config"]
    model_path = Path(config["train"]["model_path"])
    logger.info(f"config: {config['train']}")

    ##################
    # Load data
    ##################
    logger.info("-------------------Load the processed data-------------------")
    X, y, cols = load_data(processed_train)
    logger.info(f"cols: {cols}")
    logger.info(f"X: {X.shape}")
    logger.info(f"y: {y.shape}")

    ##################
    # Set & train model
    ##################
    # Load model
    # Limited to sklearn ensembles for the moment
    logger.info("-------------------Initiate model-------------------")
    model = initiate_model(ensemble_model, model_config)

    # Train model
    logger.info(f"Train model using {ensemble_model}, {model_config}")
    model.fit(X, y)
    logger.info(f"Train score: {model.score(X, y)}")
    logger.info(f"CV score: {cross_val_score(estimator=model, X=X, y=y, cv=5).mean()}")

    ##################
    # Persist model
    ##################
    logger.info("-------------------Persist model-------------------")
    model_path.parent.mkdir(parents=True, exist_ok=True)
    with open(model_path, "wb") as f:
        dump(model, f)
    logger.info(f"Persisted model to {model_path}")
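# --- A minimal sketch of what initiate_model might look like (an assumption:
# the real helper is not shown). It maps the configured name to an sklearn
# ensemble class and unpacks model_config as keyword arguments.
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

def initiate_model(ensemble_model, model_config):
    models = {
        "RandomForestClassifier": RandomForestClassifier,
        "GradientBoostingClassifier": GradientBoostingClassifier,
    }
    return models[ensemble_model](**model_config)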