def gen_phone_battery_data() -> object:
    """
    Create pyspark dataframe with some sample phone battery data

    Returns:
        DataFrame: pyspark dataframe object with columns: ["timestamp", "localtime", "battery_level", "version", "user"]

    """
    column_name = [
        "timestamp", "localtime", "battery_level", "version", "user"
    ]
    sample_data = []
    timestamp = datetime(2019, 1, 9, 11, 34, 59)
    tmp = 1
    sample = 100
    sqlContext = get_or_create_sc("sqlContext")
    for row in range(1000, 1, -1):
        tmp += 1
        if tmp == 100:
            sample = sample - 1
            tmp = 1
        timestamp = timestamp + timedelta(0, 1)
        localtime = timestamp + timedelta(hours=5)
        sample_data.append((timestamp, localtime, sample, 1,
                            "bfb2ca0c-e19c-3956-9db2-5459ccadd40c"))
    df = sqlContext.createDataFrame(sample_data, column_name)
    return df
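# --- Usage sketch (not part of the original source) ---
# A minimal, hedged example of calling the generator above; it assumes the same
# imports (datetime, timedelta, get_or_create_sc) are in scope and only uses
# standard Spark DataFrame methods to inspect the result.
df = gen_phone_battery_data()
df.printSchema()             # timestamp, localtime, battery_level, version, user
df.show(5, truncate=False)   # first few synthetic battery readings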
def gen_phone_battery_data(CC, user_id, stream_name) -> object:
    """
    Create a pyspark dataframe with some sample phone battery data and save it as a stream.

    The saved stream has columns: ["timestamp", "localtime", "user", "version", "battery_level"]
    """
    column_name = [
        "timestamp", "localtime", "user", "version", "battery_level"
    ]
    sample_data = []
    timestamp = datetime(2019, 1, 9, 11, 34, 59)
    tmp = 1
    sample = 100
    sqlContext = get_or_create_sc("sqlContext")
    for row in range(1000, 1, -1):
        tmp += 1
        if tmp == 100:
            sample = sample - 1
            tmp = 1
        timestamp = timestamp + timedelta(0, 1)
        localtime = timestamp - timedelta(hours=5)
        sample_data.append((timestamp, localtime, user_id, 1, sample))
    df = sqlContext.createDataFrame(sample_data, column_name)

    metadata = gen_phone_battery_metadata(stream_name=stream_name)
    ds = DataStream(df, metadata)
    CC.save_stream(ds)
def gen_phone_battery_data2() -> object:
    """
    Create pyspark dataframe with some sample phone battery data

    Returns:
        DataFrame: pyspark dataframe object with columns: ["timestamp", "battery_level", "bat2", "version", "user"]

    """
    column_name = ["timestamp", "battery_level", "bat2", "version", "user"]
    sample_data = []
    timestamp = datetime(2019, 1, 9, 11, 34, 59)
    tmp = 1
    sample = 100
    sample2 = 70
    sqlContext = get_or_create_sc("sqlContext")
    for row in range(1000, 1, -1):
        tmp += 1
        if tmp == 100:
            sample = sample - 1
            sample2 = sample2 - 2
            tmp = 1
        timestamp = timestamp + timedelta(0, 1)
        sample_data.append((timestamp, sample, sample2, 1,
                            "dfce1e65-2882-395b-a641-93f31748591b"))
    df = sqlContext.createDataFrame(sample_data, column_name)
    return df
def run():
    """
    This example:
        - Makes calls to the CerebralCortex-APIServer to:
            - Authenticate a user
            - Register a new stream (`accelerometer--org.md2k.phonesensor--phone`)
            - Upload sample data
        - Creates a Pyspark-Kafka direct stream
        - Reads parquet data and converts it into a pandas dataframe
        - Adds gaussian noise to the sample data
        - Stores the noisy data as a new stream
        - Retrieves and prints the noisy/clean data streams
    """

    # upload sample data and publish messages on Kafka
    #rest_api_client("http://0.0.0.0:8089/")

    # create cerebralcortex object
    cc_config_path = "../../conf/"
    CC = Kernel(cc_config_path, enable_spark_ui=True)
    sample_stream_name = "accelerometer--org.md2k.phonesensor--phone"

    upload_stream_data(
        "http://localhost/", "demo", "demo", sample_stream_name,
        "../../resources/sample_data/msgpack_files/phone_accel.msgpack.gz")

    # raise an Exception if the messaging service is disabled
    if CC.config["messaging_service"] == "none":
        raise Exception(
            "Messaging service is disabled (none) in cerebralcortex.yml. Please update configs."
        )

    # Kafka Consumer Configs
    print("*" * 100, type(user_metadata))
    spark_context = get_or_create_sc(type="sparkContext")
    ssc = StreamingContext(spark_context,
                           int(CC.config["kafka"]["ping_kafka"]))
    kafka_files_stream = CC.MessagingQueue.create_direct_kafka_stream(
        "filequeue", ssc)
    if kafka_files_stream is not None:
        kafka_files_stream.foreachRDD(
            lambda rdd: iterate_on_rdd(rdd, cc_config_path))

    ssc.start()
    ssc.awaitTermination(timeout=15)
    ssc.stop()

    CC = Kernel(cc_config_path, enable_spark_ui=True)

    print("*" * 15, "CLEAN DATA", "*" * 15)
    ds_clean = CC.get_stream(stream_name=sample_stream_name)
    ds_clean.show(5, truncate=False)

    print("*" * 15, "NOISY DATA", "*" * 15)
    ds_noise = CC.get_stream(stream_name=sample_stream_name + "_gaussian_noise")
    ds_noise.show(5, truncate=False)
def main():
    date_format = '%Y%m%d'
    start_date = '20171001'
    #start_date = '20180401'
    start_date = datetime.strptime(start_date, date_format)
    end_date = '20180530'
    end_date = datetime.strptime(end_date, date_format)

    CC_CONFIG_FILEPATH = "/cerebralcortex/code/config/cc_starwars_configuration.yml"

    all_days = []
    while True:
        all_days.append(start_date.strftime(date_format))
        start_date += timedelta(days=1)
        if start_date > end_date:
            break

    userids = []
    f = open('users.txt', 'r')
    usrs = f.read()
    userids = usrs.split(',')
    userids = [x.strip() for x in userids]
    f.close()

    #userids = ['20940a76-976b-446e-b173-89237835ae6b']
    # 20180401 20940a76-976b-446e-b173-89237835ae6b
    print("Number of users ", len(userids))

    num_cores = 24
    useSpark = True
    #useSpark = False

    if useSpark:
        spark_context = get_or_create_sc(type="sparkContext")
        parallelize_per_day = []
        for usr in userids:
            for day in all_days:
                parallelize_per_day.append((usr, [day]))
        shuffle(parallelize_per_day)
        print(len(parallelize_per_day))
        rdd = spark_context.parallelize(parallelize_per_day,
                                        len(parallelize_per_day))
        try:
            results = rdd.map(lambda user_day: analyze_user_day(
                user_day[0], user_day[1], CC_CONFIG_FILEPATH))
            results.count()
            spark_context.stop()
        except Exception as e:
            print(e)
    else:
        for usr in userids:
            analyze_user_day(usr, all_days, CC_CONFIG_FILEPATH)
def gen_battery_data(CC, study_name, user_id, stream_name, version=1, hours=1):
    """
    Create a pyspark dataframe with some sample phone battery data and save it as a stream.

    The saved stream has columns: ["timestamp", "localtime", "user", "version", "level", "voltage", "temperature"]
    """
    column_name = [
        "timestamp", "localtime", "user", "version", "level", "voltage",
        "temperature"
    ]
    sample_data = []
    timestamp = datetime(2019, 1, 9, 11, 34, 59)
    sample = 100
    voltage = 3700
    temperature = 70
    sqlContext = get_or_create_sc("sqlContext")
    total_data = hours * 60 * 60

    for row in range(total_data, 1, -1):
        sample = float(sample - 0.01)
        timestamp = timestamp + timedelta(0, 1)
        localtime = timestamp - timedelta(hours=5)
        sample_data.append((timestamp, localtime, user_id, version, sample,
                            voltage, temperature))

    df = sqlContext.createDataFrame(sample_data, column_name)

    stream_metadata = Metadata()
    stream_metadata.set_study_name(study_name).set_name(stream_name).set_description("battery sample data stream.") \
        .add_dataDescriptor(
        DataDescriptor().set_name("timestamp").set_type("datetime").set_attribute("description", "UTC timestamp of data point collection.")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("localtime").set_type("datetime").set_attribute("description", "local timestamp of data point collection.")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("user").set_type("string").set_attribute("description", "user id")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("version").set_type("int").set_attribute("description", "version of the data")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("level").set_type("float").set_attribute("description", "current battery charge")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("voltage").set_type("float").set_attribute("description", "current battery voltage level")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("temperature").set_type("float").set_attribute("description", "current battery temperature")) \
        .add_module(
        ModuleMetadata().set_name("battery").set_version("1.2.4").set_attribute("attribute_key", "attribute_value").set_author(
            "Nasir Ali", "*****@*****.**"))
    stream_metadata.is_valid()

    ds = DataStream(df, stream_metadata)
    CC.save_stream(ds)
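# --- Usage sketch (not part of the original source) ---
# Hedged round-trip example for gen_battery_data: save the generated stream, then
# read it back with CC.get_stream / show, which other examples in this listing use
# the same way. CC is assumed to be an already-constructed Kernel/CerebralCortex
# object; the stream name and user id below are illustrative placeholders.
gen_battery_data(CC, study_name="default",
                 user_id="00000000-afb8-476e-9872-6472b4e66b68",
                 stream_name="battery--org.md2k.phonesensor--phone", hours=1)
ds = CC.get_stream(stream_name="battery--org.md2k.phonesensor--phone")
ds.show(5, truncate=False)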
def process_features(feature_list, all_users, all_days, num_cores=1):
    '''
    This method runs the processing pipeline for each of
    the features in the list.
    '''
    for module in feature_list:
        if num_cores > 1:
            #num_cores *= 4
            print('Driver: Spark job', module)
            spark_context = get_or_create_sc(type="sparkContext")
            if 'core.feature.gps.gps' == str(module) \
                    or 'sleep_duration_analysis' in str(module) \
                    or 'office_time' in str(module) \
                    or 'phone_screen_touch_features' in str(module) \
                    or 'socialjetlag' in str(module) \
                    or 'gps_location_daywise' in str(module):
                '''
                # FIXME
                # TODO Currently only the GPS feature computes features on a range of days.
                # Need to find a better way if there are other modules that also work on a range of days.
                '''
                print('-' * 120)
                print('MODULE parallelized on only users', module)
                rdd = spark_context.parallelize(all_users, num_cores)
                results = rdd.map(lambda user: process_feature_on_user(
                    user, module, all_days, cc_config_path))
                results.count()
            else:
                print('MODULE', module)
                parallelize_per_day = []
                for usr in all_users:
                    for day in all_days:
                        parallelize_per_day.append((usr, [day]))
                shuffle(parallelize_per_day)
                rdd = spark_context.parallelize(parallelize_per_day,
                                                len(parallelize_per_day))
                results = rdd.map(lambda user_day: process_feature_on_user(
                    user_day[0], module, user_day[1], cc_config_path))
                results.count()
            spark_context.stop()
        else:
            print('Driver: single threaded')
            for user in all_users:
                process_feature_on_user(user, module, all_days, cc_config_path)
def gen_accel_gyro_data(CC, study_name, user_id, stream_name, version=1, hours=1, frequency=32):
    """
    Create a pyspark dataframe with some sample accelerometer/gyroscope data and save it as a stream.

    The saved stream has columns: ["timestamp", "localtime", "user", "version", "x", "y", "z"]
    """
    column_name = ["timestamp", "localtime", "user", "version", "x", "y", "z"]
    sample_data = []
    timestamp = datetime(2019, 1, 9, 11, 34, 59)
    sqlContext = get_or_create_sc("sqlContext")
    total_samples = (hours * 60 * 60) * frequency

    for row in range(total_samples):
        x = round(random.uniform(-2, 2), 8)
        y = round(random.uniform(-2, 2), 8)
        z = round(random.uniform(-2, 2), 8)
        timestamp = timestamp + timedelta(milliseconds=1)
        localtime = timestamp - timedelta(hours=5)
        sample_data.append((timestamp, localtime, user_id, version, x, y, z))

    df = sqlContext.createDataFrame(sample_data, column_name)

    stream_metadata = Metadata()
    stream_metadata.set_study_name(study_name).set_name(stream_name).set_description("wrist watch sensor sample data stream.") \
        .add_dataDescriptor(
        DataDescriptor().set_name("timestamp").set_type("datetime").set_attribute("description", "UTC timestamp of data point collection.")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("localtime").set_type("datetime").set_attribute("description", "local timestamp of data point collection.")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("user").set_type("string").set_attribute("description", "user id")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("version").set_type("int").set_attribute("description", "version of the data")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("x").set_type("float").set_attribute("description", "x-axis")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("y").set_type("float").set_attribute("description", "y-axis")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("z").set_type("float").set_attribute("description", "z-axis")) \
        .add_module(
        ModuleMetadata().set_name("phone.sensors").set_version("1.2.4").set_attribute("attribute_key", "attribute_value").set_author(
            "Nasir Ali", "*****@*****.**"))
    stream_metadata.is_valid()

    ds = DataStream(df, stream_metadata)
    CC.save_stream(ds)
def gen_phone_battery_data(user_id) -> object:
    """
    Create pyspark dataframe with some sample phone battery data

    Returns:
        DataFrame: pyspark dataframe object with columns: ["timestamp", "battery_level", "version", "user"]

    """
    column_name = ["timestamp", "battery_level", "version", "user"]
    sample_data = []
    timestamp = datetime(2019, 1, 9, 11, 34, 59)
    tmp = 1
    sample = 100
    sqlContext = get_or_create_sc("sqlContext")
    for row in range(1000, 1, -1):
        tmp += 1
        if tmp == 100:
            sample = sample - 1
            tmp = 1
        timestamp = timestamp + timedelta(0, 1)
        sample_data.append((timestamp, sample, 1, user_id))
    df = sqlContext.createDataFrame(sample_data, column_name)
    return df
def run():
    selected_participants = [
        "622bf725-2471-4392-8f82-fcc9115a3745",
        "d3d33d63-101d-44fd-b6b9-4616a803225d",
        "c1f31960-dee7-45ea-ac13-a4fea1c9235c",
        "7b8358f3-c96a-4a17-87ab-9414866e18db",
        "8a3533aa-d6d4-450c-8232-79e4851b6e11",
        "e118d556-2088-4cc2-b49a-82aad5974167",
        "260f551d-e3c1-475e-b242-f17aad20ba2c",
        "dd13f25f-77a0-4a2c-83af-bb187b79a389",
        "17b07883-4959-4037-9b80-dde9a06b80ae",
        "5af23884-b630-496c-b04e-b9db94250307",
        "61519ad0-2aea-4250-9a82-4dcdb93a569c",
        "326a6c55-c963-42c2-bb8a-2591993aaaa2",
        "a54d9ef4-a46a-418b-b6cc-f10b49a946ac",
        "2fb5e890-afaf-428a-8e28-a7c70bf8bdf1",
        "c93a811e-1f47-43b6-aef9-c09338e43947",
        "9e4aeae9-8729-4b0f-9e84-5c1f4eeacc74",
        "479eea59-8ad8-46aa-9456-29ab1b8f2cb2",
        "b4ff7130-3055-4ed1-a878-8dfaca7191ac",
        "fbd7bc95-9f42-4c2c-94f4-27fd78a7273c",
        "bbc41a1e-4bbe-4417-a40c-64635cc552e6",
        "82a921b9-361a-4fd5-8db7-98961fdbf25a",
        "66a5cdf8-3b0d-4d85-bdcc-68ae69205206",
        "d4691f19-57be-44c4-afc2-5b5f82ec27b5",
        "136f8891-af6f-49c1-a69a-b4acd7116a3c"
    ]

    parser = argparse.ArgumentParser(
        description='CerebralCortex Kafka Message Handler.')
    parser.add_argument("-c", "--config_filepath",
                        help="Configuration file path",
                        required=True)
    # parser.add_argument("-d", "--data_dir", help="Directory path where all the gz files are stored by API-Server",
    #                     required=True)
    parser.add_argument(
        "-bd", "--batch_duration",
        help="How frequent kafka messages shall be checked (duration in seconds)",
        default="5",
        required=False)
    parser.add_argument(
        "-mbs", "--mydb_batch_size",
        help="Total number of messages to fetch from MySQL for processing.",
        default="5000",
        required=True)
    parser.add_argument(
        "-participants", "--participants",
        help="Whether run data replay on all participants or select one.",
        default="all",
        required=False)
    args = vars(parser.parse_args())

    participants = args["participants"]
    mydb_batch_size = args["mydb_batch_size"]
    config_filepath = str(args["config_filepath"]).strip()
    batch_duration = int(args["batch_duration"])
    # data_path = str(args["data_dir"]).strip()
    # if (data_path[-1] != '/'):
    #     data_path += '/'

    # Kafka Consumer Configs
    spark_context = get_or_create_sc(type="sparkContext")
    spark_context.setLogLevel("WARN")
    consumer_group_id = "md2k-test"

    CC = CerebralCortex(config_filepath)
    broker = str(CC.config["kafkaserver"]["host"]) + ":" + str(
        CC.config["kafkaserver"]["port"])
    data_replay_using = str(CC.config["data_replay"]["replay_type"])
    data_path = CC.config["data_replay"]["data_dir"]

    if data_replay_using == "mydb":
        for replay_batch in CC.SqlData.get_replay_batch(
                record_limit=mydb_batch_size):
            new_replay_batch = []
            # get records from mysql and process (skip kafka)
            if participants == "all":
                new_replay_batch = replay_batch
            else:
                for rb in replay_batch:
                    if rb["owner_id"] in selected_participants:
                        new_replay_batch.append(rb)
            mysql_batch_to_db(spark_context, new_replay_batch, data_path,
                              config_filepath)
    else:
        ssc = StreamingContext(spark_context, batch_duration)
        kafka_files_stream = spark_kafka_consumer(["filequeue"], ssc, broker,
                                                  consumer_group_id, CC)
        if kafka_files_stream is not None:
            kafka_files_stream.foreachRDD(
                lambda rdd: kafka_file_to_json_producer(
                    rdd, data_path, config_filepath, CC))

        ssc.start()
        ssc.awaitTermination()
        CC, config)


if __name__ == '__main__':
    # create and load CerebralCortex object and configs
    parser = argparse.ArgumentParser(
        description='CerebralCortex Kafka Message Handler.')
    parser.add_argument("-cc", "--cc_config_filepath",
                        help="Configuration file path",
                        required=True)
    parser.add_argument("-mdc", "--mdebugger_config_filepath",
                        help="mDebugger configuration file path",
                        required=True)
    args = vars(parser.parse_args())

    CC = CerebralCortex(args["cc_config_filepath"])

    # load data diagnostic configs
    md_config = Configuration(args["mdebugger_config_filepath"]).config

    # get/create spark context
    spark_context = get_or_create_sc(type="sparkContext")

    # run for one participant
    # DiagnoseData().one_user_data(["cd7c2cd6-d0a3-4680-9ba2-0c59d0d0c684"], md_config, CC, spark_context)

    # run for all the participants in a study
    all_users_data("mperf", md_config, CC, spark_context)
def __init__(self, configs_dir_path: str = "", cc_configs: dict = None,
             study_name: str = "default", new_study: bool = False,
             enable_spark: bool = True, enable_spark_ui=False):
    """
    CerebralCortex constructor

    Args:
        configs_dir_path (str): Directory path of cerebralcortex configurations.
        cc_configs (dict or str): if set to cc_configs="default", all default configs are loaded. Alternatively, provide a dict of all available cc_configs as a param.
        study_name (str): name of the study. If there is no study, you can pass study name as study_name="default"
        new_study (bool): create a new study with study_name if it does not exist
        enable_spark (bool): enable spark
        enable_spark_ui (bool): enable spark ui
    Raises:
        ValueError: If configuration_filepath is None or empty.
    Examples:
        >>> CC = Kernel(cc_configs="default", study_name="default")
        >>> # if you want to change any of the configs, pass cc_configs as dict with new configurations
        >>> updated_cc_configs = {"nosql_storage": "filesystem", "filesystem_path": "/path/to/store/data/"}
        >>> CC = Kernel(cc_configs=updated_cc_configs, study_name="default")
        >>> # for complete configs, have a look at default configs at: https://github.com/MD2Korg/CerebralCortex-Kernel/blob/3.3/cerebralcortex/core/config_manager/default.yml
    """
    try:
        if not os.getenv("PYSPARK_PYTHON"):
            os.environ["PYSPARK_PYTHON"] = os.popen(
                'which python3').read().replace("\n", "")
        if not os.getenv("PYSPARK_DRIVER_PYTHON"):
            os.environ["PYSPARK_DRIVER_PYTHON"] = os.popen(
                'which python3').read().replace("\n", "")
    except:
        raise Exception(
            "Please set PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON environment variable. For example, export PYSPARK_DRIVER_PYTHON=/path/to/python/dir"
        )

    try:
        if not os.getenv("SPARK_HOME"):
            import pyspark
            spark_installation_path = os.path.dirname(pyspark.__file__)
            import findspark
            findspark.init(spark_installation_path)
    except:
        raise Exception("Set SPARK_HOME environment variable.")

    if not configs_dir_path and not cc_configs:
        raise ValueError("Please provide configs_dir_path or cc_configs.")
    elif configs_dir_path and cc_configs:
        raise ValueError("Provide only configs_dir_path OR cc_configs.")

    self.version = __version__
    self.config_filepath = configs_dir_path
    self.study_name = study_name
    os.environ["STUDY_NAME"] = study_name
    self.config = Configuration(configs_dir_path, cc_configs).config

    if enable_spark:
        self.sparkContext = get_or_create_sc(
            enable_spark_ui=enable_spark_ui)
        self.sqlContext = get_or_create_sc(type="sqlContext",
                                           enable_spark_ui=enable_spark_ui)
        self.sparkSession = get_or_create_sc(
            type="sparkSession", enable_spark_ui=enable_spark_ui)
    else:
        self.sparkContext = None
        self.sqlContext = None
        self.sparkSession = None

    if self.config["mprov"] == "pennprov":
        os.environ["MPROV_HOST"] = self.config["pennprov"]["host"]
        os.environ["MPROV_USER"] = self.config["pennprov"]["user"]
        os.environ["MPROV_PASSWORD"] = self.config["pennprov"]["password"]
        os.environ["ENABLE_MPROV"] = "True"
    elif self.config["mprov"] == "none":
        os.environ["ENABLE_MPROV"] = "False"
    else:
        raise ValueError(
            "Please check cerebralcortex.yml file. mprov is not properly configured."
        )

    self.new_study = new_study

    if not study_name:
        raise Exception("Study name cannot be None.")

    self.debug = self.config["cc"]["debug"]
    self.logging = CCLogging(self)
    self.logtypes = LogTypes()
    self.SqlData = SqlData(self)
    self.RawData = RawData(self)
    self.TimeSeriesData = None

    warnings.simplefilter('always', DeprecationWarning)

    if not new_study and not self.RawData.is_study():
        raise Exception(
            "Study name does not exist. If this is a new study set new_study param to True"
        )

    if self.config["visualization_storage"] != "none":
        self.TimeSeriesData = TimeSeriesData(self)
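# --- Usage sketch (not part of the original source) ---
# Based on the constructor's own docstring examples: create a Kernel with default
# configs, or override selected configs and create a new study. The study name and
# filesystem path below are placeholders.
CC = Kernel(cc_configs="default", study_name="default")

updated_cc_configs = {"nosql_storage": "filesystem", "filesystem_path": "/path/to/store/data/"}
CC = Kernel(cc_configs=updated_cc_configs, study_name="my_new_study", new_study=True)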
help="mDebugger configuration file path", required=True) parser.add_argument("-sn", "--study_name", help="mDebugger configuration file path", required=True) parser.add_argument("-spm", "--spark_master", help="mDebugger configuration file path", required=False) args = vars(parser.parse_args()) CC = CerebralCortex(args["cc_config_filepath"]) # load data reporting configs cr_config = Configuration(args["cc_reporting_config_filepath"]).config cc_config_file = args["cc_config_filepath"] # get/create spark context if args["spark_master"]: spark_context = get_or_create_sc(type="sparkContext", master=args["spark_master"]) else: spark_context = get_or_create_sc(type="sparkContext") # run for all the participants in a study #all_users_data("mperf", md_config, CC, spark_context) #TESTING all_users_data(args["study_name"], cc_config_file, cr_config, CC, spark_context)
def gen_location_datastream(user_id, stream_name) -> object:
    """
    Create pyspark dataframe with some sample gps data (Memphis, TN, lat, long, alt coordinates)

    Args:
        user_id (str): id of a user
        stream_name (str): sample gps stream name

    Returns:
        DataStream: datastream object of gps location stream with its metadata

    """
    column_name = [
        "timestamp", "localtime", "user", "version", "latitude", "longitude",
        "altitude", "speed", "bearing", "accuracy"
    ]
    sample_data = []
    timestamp = datetime(2019, 9, 1, 11, 34, 59)
    sqlContext = get_or_create_sc("sqlContext")

    lower_left = [35.079678, -90.074136]
    upper_right = [35.194771, -89.868766]
    alt = [i for i in range(83, 100)]

    for location in range(5):
        lat = random.uniform(lower_left[0], upper_right[0])
        long = random.uniform(lower_left[1], upper_right[1])
        for dp in range(150):
            lat_val = random.gauss(lat, 0.001)
            long_val = random.gauss(long, 0.001)
            alt_val = random.choice(alt)
            speed_val = round(random.uniform(0.0, 5.0), 6)
            bearing_val = round(random.uniform(0.0, 350), 6)
            accuracy_val = round(random.uniform(10.0, 30.4), 6)
            timestamp = timestamp + timedelta(minutes=1)
            localtime = timestamp + timedelta(hours=5)
            sample_data.append(
                (timestamp, localtime, user_id, 1, lat_val, long_val, alt_val,
                 speed_val, bearing_val, accuracy_val))

    df = sqlContext.createDataFrame(sample_data, column_name)

    stream_metadata = Metadata()
    stream_metadata.set_study_name("default").set_name(stream_name).set_description("GPS sample data stream.") \
        .add_dataDescriptor(
        DataDescriptor().set_name("timestamp").set_type("datetime").set_attribute("description", "UTC timestamp of data point collection.")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("localtime").set_type("datetime").set_attribute("description", "local timestamp of data point collection.")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("user").set_type("string").set_attribute("description", "user id")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("version").set_type("int").set_attribute("description", "version of the data")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("latitude").set_type("float").set_attribute("description", "gps latitude")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("longitude").set_type("float").set_attribute("description", "gps longitude")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("altitude").set_type("float").set_attribute("description", "gps altitude")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("speed").set_type("float").set_attribute("description", "speed info")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("bearing").set_type("float").set_attribute("description", "bearing info")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("accuracy").set_type("float").set_attribute("description", "accuracy of gps location")) \
        .add_module(
        ModuleMetadata().set_name("examples.util.data_helper.gen_location_data").set_attribute("attribute_key", "attribute_value").set_author(
            "Nasir Ali", "*****@*****.**"))
    stream_metadata.is_valid()

    ds = DataStream(data=df, metadata=stream_metadata)
    return ds
def gen_stress_data(stream_name, spark_df=False):
    data = [
        [0.7, "road", "Driving", "Was Tailgated", "IN_VEHICLE"],
        [0.3, "work", "Job", "Bored / Not enough to do", "STILL"],
        [0.5, "home", "Health", "Physical inability", "STILL"],
        [0.6, "road", "Driving", "Saw a police car", "IN_VEHICLE"],
        [0.38, "work", "Job", "Technology barriers", "STILL"],
        [0.2, "home", "Finance", "Missed payment", "UNKNOWN"],
        [0.9, "work", "Finance", "Unexpected losses", "WALKING"],
        [0.54, "road", "Driving", "Difficulty in navigating", "IN_VEHICLE"],
        [0.79, "work", "Job", "Unpleasant conversation", "ON_FOOT"],
        [0.28, "road", "Health", "My eating habits", "IN_VEHICLE"],
        [0.47, "road", "Driving", "Indecision at a traffic intersection", "IN_VEHICLE"],
        [0.67, "work", "Job", "Late arrival", "WALKING"],
    ]

    column_name = [
        'user', 'timestamp', 'localtime', 'version', 'start_time', 'end_time',
        'density', 'location', 'stresser_main', 'stresser_sub', 'activity'
    ]
    sample_data = []
    timestamp = datetime(2019, 1, 9, 11, 34, 59)

    for row in range(20, 1, -1):
        if row > 10:
            user_id = "00000000-afb8-476e-9872-6472b4e66b68"
        else:
            user_id = "b1117354-ce48-4325-b2e3-78b0cc932819"
        timestamp = timestamp + timedelta(
            hours=random.choice([1, 3, 7, 2, 4, 5]))
        localtime = timestamp - timedelta(hours=5)
        start_time = timestamp
        end_time = timestamp + timedelta(
            minutes=random.choice([12, 6, 8, 16, 29, 45, 2, 3, 8]))
        data_vals = random.choice(data)
        sample_data.append([
            user_id, timestamp, localtime, 1, start_time, end_time,
            data_vals[0], data_vals[1], data_vals[2], data_vals[3],
            data_vals[4]
        ])

    stream_metadata = Metadata()
    stream_metadata.set_study_name("default").set_name(stream_name).set_description("stress episode sample data stream.") \
        .add_dataDescriptor(
        DataDescriptor().set_name("start_time").set_type("datetime").set_attribute("description", "start time of a stress episode.")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("end_time").set_type("datetime").set_attribute("description", "end time of a stress episode.")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("density").set_type("float").set_attribute("description", "density of stress")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("location").set_type("string").set_attribute("description", "location where stress episode was captured.")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("stresser_main").set_type("string").set_attribute("description", "stressers' main category.")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("stresser_sub").set_type("string").set_attribute("description", "stressers' sub category.")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("activity").set_type("string").set_attribute("description", "physical activity name")) \
        .add_module(
        ModuleMetadata().set_name("examples.util.data_helper.gen_stress_data").set_attribute("attribute_key", "attribute_value").set_author(
            "Nasir Ali", "*****@*****.**"))
    stream_metadata.is_valid()

    if spark_df:
        sqlContext = get_or_create_sc("sqlContext")
        df = sqlContext.createDataFrame(sample_data, column_name)
    else:
        df = pd.DataFrame(sample_data, columns=column_name)

    ds = DataStream(df, stream_metadata)
    return ds
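# --- Usage sketch (not part of the original source) ---
# gen_stress_data returns a DataStream backed by either a pandas or a Spark dataframe,
# depending on spark_df. The stream name below is an illustrative placeholder; show()
# on a DataStream is used the same way in the run() example earlier in this listing.
ds_pd = gen_stress_data(stream_name="stress.episodes.sample")
ds_spark = gen_stress_data(stream_name="stress.episodes.sample", spark_df=True)
ds_spark.show(5, truncate=False)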
def gen_location_datastream(user_id, stream_name) -> object:
    """
    Create pyspark dataframe with some sample gps data (Memphis, TN, lat, long, alt coordinates)

    Args:
        user_id (str): id of a user
        stream_name (str): sample gps stream name

    Returns:
        DataStream: datastream object of gps location stream with its metadata

    """
    column_name = [
        "timestamp", "localtime", "user", "version", "latitude", "longitude",
        "altitude", "speed", "bearing", "accuracy"
    ]
    sample_data = []
    timestamp = datetime(2019, 1, 9, 11, 34, 59)
    sqlContext = get_or_create_sc("sqlContext")

    lat = [
        35.1247391, 35.1257391, 35.1217391, 35.1117391, 35.1317391,
        35.1287391, 35.5217391
    ]
    long = [
        -89.9750021, -89.9710021, -89.9800021, -89.9670021, -89.9790021,
        -89.9710021, -89.8700021
    ]
    alt = [83.0, 84.0, 85.0, 86.0, 87.0, 88.0, 89.0]

    for dp in range(500):
        lat_val = random.choice(lat)
        long_val = random.choice(long)
        alt_val = random.choice(alt)
        #ts_val = 15094)+(16272882+(dp*1000000))
        speed_val = round(random.uniform(0.0, 5.0), 6)
        bearing_val = round(random.uniform(0.0, 350), 6)
        accuracy_val = round(random.uniform(10.0, 30.4), 6)
        #all_dps = ",".join([ts_val, lat_val, long_val, alt_val, speed_val, bearing_val, accuracy_val])
        timestamp = timestamp + timedelta(minutes=1)
        localtime = timestamp + timedelta(hours=5)
        sample_data.append(
            (timestamp, localtime, user_id, 1, lat_val, long_val, alt_val,
             speed_val, bearing_val, accuracy_val))

    df = sqlContext.createDataFrame(sample_data, column_name)

    stream_metadata = Metadata()
    stream_metadata.set_name(stream_name).set_version(1).set_description("GPS sample data stream.") \
        .add_dataDescriptor(
        DataDescriptor().set_name("latitude").set_type("float").set_attribute("description", "gps latitude")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("longitude").set_type("float").set_attribute("description", "gps longitude")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("altitude").set_type("float").set_attribute("description", "gps altitude")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("speed").set_type("float").set_attribute("description", "speed info")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("bearing").set_type("float").set_attribute("description", "bearing info")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("accuracy").set_type("float").set_attribute("description", "accuracy of gps location")) \
        .add_module(
        ModuleMetadata().set_name("examples.util.data_helper.gen_location_data").set_version("0.0.1").set_attribute("attribute_key", "attribute_value").set_author(
            "test_user", "test_user@test_email.com"))
    stream_metadata.is_valid()

    return DataStream(data=df, metadata=stream_metadata)
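# --- Usage sketch (not part of the original source) ---
# gen_location_datastream only builds and returns the DataStream; nothing is saved.
# The user id and stream name below are illustrative placeholders.
gps_ds = gen_location_datastream(user_id="b1117354-ce48-4325-b2e3-78b0cc932819",
                                 stream_name="gps--org.md2k.phonesensor--phone")
gps_ds.show(5, truncate=False)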