from src.pipeline.Pipeline import Pipeline
from src.pipeline.DataHandler import DataHandler
from src import models
import pickle, math
import os
import pandas as pd
import time

######                                   ######
#                                             #
#           CONFIGURE THE PIPELINE            #
#                                             #
######                                   ######

# Create pipeline object
pipObj = Pipeline()

# Define how many CPUs we can parallelize meta classification on
cpus = os.cpu_count()
# cpus = 1

######                                   ######
#                                             #
#            CONFIGURE THE DATA               #
#                                             #
######                                   ######

# Define the data to classify
list_with_subjects_to_classify = [
    '../data/input/4003601.7z',
]
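# A minimal sketch (not part of the original script) of how `cpus` could feed
# per-subject parallelism with the standard library. `classify_subject` is a
# hypothetical helper standing in for whatever Pipeline method does the work:
#
# from multiprocessing import Pool
#
# def classify_subject(path):
#     # e.g. unzip/synch the .7z archive and run the trained model on it
#     ...
#
# with Pool(processes=cpus) as pool:
#     results = pool.map(classify_subject, list_with_subjects_to_classify)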
""" from pyspark import SparkContext, SparkConf, SQLContext from pyspark.sql import SparkSession from src.pipeline.Pipeline import Pipeline from src.config.param_config.param_config import param_dict from src.utils.arg_parse import pipeline_arg_parse ##################################必须传入的参数########################################### ####读入常变参数 params = pipeline_arg_parse() final_param_dict = param_dict print(params) print(param_dict) ###启动spark环境 try: sc.stop() conf = SparkConf() conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") conf.set("spark.memory.fraction", 0.8) sc = SparkContext(conf).getOrCreate() # 添加参数启动 except: conf = SparkConf() conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") conf.set("spark.memory.fraction", 0.8) sc = SparkContext().getOrCreate() spark = SparkSession.builder.enableHiveSupport().getOrCreate() Pp = Pipeline(sc, final_param_dict, spark=spark) result = Pp.run_feature()
import sys, os
try:
    sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
except Exception:
    pass

import pandas as pd

from src.pipeline.DataHandler import DataHandler
from src.pipeline.Pipeline import Pipeline

p = Pipeline()

list_with_subjects = [
    '../data/input/006',
    '../data/input/008',
    '../data/input/009',
    '../data/input/training_data/small_set'
]

dataframe = p.create_large_dataframe_from_multiple_input_directories(
    list_with_subjects,
    back_keywords=['Back', "b"],
    thigh_keywords=['Thigh', "t"],
    label_keywords=['GoPro', "Labels"],
    out_path=None,
    merge_column=None,
    master_columns=['bx', 'by', 'bz'],
    slave_columns=['tx', 'ty', 'tz'],
    rearrange_columns_to=None,
)
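# Quick sanity check on the merged dataframe, mirroring the checks the
# training script performs (assumes the merged frame carries a 'label'
# column, as it does in the other scripts):
print("DATAFRAME\n", dataframe.head(5), dataframe.shape)
dataframe.dropna(subset=['label'], inplace=True)
print("shape after dropping unlabeled rows:", dataframe.shape)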
df2 = dh2.get_dataframe_iterator()

print(df1.shape, df2.shape)
df1.dropna(subset=['label'], inplace=True)
df2.dropna(subset=['label'], inplace=True)
print(df1.shape, df2.shape)

############################## THEN COMBINE INTO ONE BIG TRAINING SET, AKA VERTICAL STACKING ##############################
dataframe = dh1.vertical_stack_dataframes(df1, df2, set_as_current_df=False)
# dataframe = dh1.vertical_stack_dataframes(dataframe, df3, set_as_current_df=False)
print("DATAFRAME\n", dataframe.head(5), dataframe.shape)

############################## THEN WE MUST EXTRACT FEATURES AND LABELS ##############################
pipeObj = Pipeline()
back_feat_train, thigh_feat_train, label_train = pipeObj.get_features_and_labels_as_np_array(dataframe)

############################## THEN WE MUST TRAIN THE CLASSIFIER ##############################
RFC = models.get("RFC", {})

##############
# MODEL ARGUMENTS
##############
# Windowing and sampling parameters
sampling_frequency = 50
window_length = 120
temperature_reading_rate = 120
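# A small worked example of what these numbers imply, under two assumptions
# (the original only calls them "magic numbering"): sampling_frequency is in
# Hz and window_length counts raw samples.
window_seconds = window_length / sampling_frequency  # 120 / 50 = 2.4 s per window
temp_readings_per_window = window_length // temperature_reading_rate  # 120 // 120 = 1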
import sys, os
try:
    sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
except Exception:
    print("Could not append the project root to sys.path")

from src.pipeline.Pipeline import Pipeline
from src.pipeline.DataHandler import DataHandler
from src import models

input_dir_rel_path = "/app/data/input"
data_name = "xxx_x.7z"
label_file = "xxx_x intervals.json"

pipeline = Pipeline()

###########
#
# IF first time running script on data, else it is saved in ../data/temp/name
#
##########

# If there already is a temp folder with the same name
# TODO: get this into the unzip-and-synch method; the path is unzip_path + filename.7z
# if os.path.exists("../data/temp/{}".format(data_name)):
#     print("REMOVING OLD TEMP FOLDER")
#     os.system("rm -rf ../data/temp/{}".format(data_name))
#
# # first unzip and synch the .7z folder
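# A hedged alternative to the commented-out cleanup above: shutil.rmtree is a
# portable stand-in for shelling out to `rm -rf` (same effect, assuming the
# ../data/temp/<data_name> layout described in the TODO):
#
# import shutil
# temp_dir = "../data/temp/{}".format(data_name)
# if os.path.exists(temp_dir):
#     print("Removing old temp folder:", temp_dir)
#     shutil.rmtree(temp_dir)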