def get_metadata():
    stream_name = 'fill in your stream name'
    stream_metadata = Metadata()
    stream_metadata.set_name(stream_name) \
        .set_description("Sequence Alignment, Timestamp Correction and Decoding of MotionsenseHRV") \
        .add_dataDescriptor(
            DataDescriptor().set_name("red").set_type("float")
            .set_attribute("description", "Value of Red LED - PPG")) \
        .add_dataDescriptor(
            DataDescriptor().set_name("infrared").set_type("float")
            .set_attribute("description", "Value of Infrared LED - PPG")) \
        .add_dataDescriptor(
            DataDescriptor().set_name("green").set_type("float")
            .set_attribute("description", "Value of Green LED - PPG")) \
        .add_dataDescriptor(
            DataDescriptor().set_name("aclx").set_type("float")
            .set_attribute("description", "Wrist Accelerometer X-axis")) \
        .add_dataDescriptor(
            DataDescriptor().set_name("acly").set_type("float")
            .set_attribute("description", "Wrist Accelerometer Y-axis")) \
        .add_dataDescriptor(
            DataDescriptor().set_name("aclz").set_type("float")
            .set_attribute("description", "Wrist Accelerometer Z-axis")) \
        .add_dataDescriptor(
            DataDescriptor().set_name("gyrox").set_type("float")
            .set_attribute("description", "Wrist Gyroscope X-axis")) \
        .add_dataDescriptor(
            DataDescriptor().set_name("gyroy").set_type("float")
            .set_attribute("description", "Wrist Gyroscope Y-axis")) \
        .add_dataDescriptor(
            DataDescriptor().set_name("gyroz").set_type("float")
            .set_attribute("description", "Wrist Gyroscope Z-axis")) \
        .add_module(
            ModuleMetadata().set_name("cerebralcortex.algorithms.raw_byte_decode.motionsenseHRV.py")
            .set_attribute("url", "https://md2k.org")
            .set_author("Md Azim Ullah", "*****@*****.**"))
    return stream_metadata
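
A usage sketch: the returned Metadata object is typically attached to a DataStream holding the decoded sensor values (column names must match the data descriptors above); `df` here is a hypothetical Spark DataFrame with those columns.

# attach the generated metadata to a hypothetical DataFrame of decoded values
ds = DataStream(data=df, metadata=get_metadata())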
def upload_stream_data(base_url: str, username: str, password: str,
                       stream_name: str, data_file_path: str):
    """
    Upload stream data to cerebralcortex storage using CC-ApiServer

    Args:
        base_url (str): base url of CerebralCortex-APIServer. For example, http://localhost/
        username (str): username
        password (str): password of the user
        stream_name (str): name of the stream to register and upload data to
        data_file_path (str): stream data file path that needs to be uploaded

    Raises:
        Exception: if stream data upload fails

    """

    login_url = base_url + "api/v3/user/login"
    register_stream_url = base_url + "api/v3/stream/register"
    user_metadata = {
        "username": username,
        "password": password,
        "user_role": "demo-user",
        "user_metadata": {
            "key": "demo-md",
            "value": "demo-vmd"
        },
        "user_settings": {
            "key": "string",
            "value": "string"
        }
    }
    metadata = Metadata().set_name(stream_name).set_description("mobile phone accelerometer sensor data.") \
        .add_dataDescriptor(
        DataDescriptor().set_name("accelerometer_x").set_type("float").set_attribute("description", "acceleration minus gx on the x-axis")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("accelerometer_y").set_type("float").set_attribute("description", "acceleration minus gy on the y-axis")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("accelerometer_z").set_type("float").set_attribute("description", "acceleration minus gz on the z-axis")) \
        .add_module(
        ModuleMetadata().set_name("cerebralcortex.streaming_operation.main").set_version("2.0.7").set_attribute("description", "data is collected using mcerebrum.").set_author(
            "test_user", "test_user@test_email.com"))

    stream_metadata = metadata.to_json()
    user_registration_url = base_url + "api/v3/user/register"
    client.register_user(url=user_registration_url,
                         user_metadata=user_metadata)

    auth = client.login_user(login_url, username, password)

    status = client.register_stream(register_stream_url,
                                    auth.get("auth_token"), stream_metadata)

    stream_upload_url = base_url + "api/v3/stream/" + status.get("hash_id")
    result = client.upload_stream_data(stream_upload_url,
                                       auth.get("auth_token"), data_file_path)

    print(result)
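
A usage sketch with hypothetical values; the helper registers the user, logs in, registers the stream, and uploads the data file in one call.

upload_stream_data(base_url="http://localhost/",
                   username="demo-user",
                   password="demo-password",
                   stream_name="accelerometer--org.example.demo--phone",  # hypothetical stream name
                   data_file_path="/path/to/stream_data.gz")              # hypothetical file path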
Example #3
    def create_windows(self, window_length='hour'):
        """
        Create a custom window column over the data by applying windowing_udf to the timestamp column. The resulting 'custom_window' column can then be used to group the data.

        Args:
            window_length (str): length of the window (e.g., 'hour')

        Returns:
            DataStream: this will return a new datastream object with blank metadata
        """
        windowed_df = self._data.withColumn('custom_window', windowing_udf('timestamp'))
        return DataStream(data=windowed_df, metadata=Metadata())
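
A usage sketch (`my_udf` is a hypothetical grouped-map pandas UDF): the 'custom_window' column produced here is what compute() groups on when it is present.

    windowed = ds.create_windows(window_length='hour')
    result = windowed.compute(my_udf)   # groups by ('user', 'custom_window')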
    def get_stream_metadata_by_name(self, stream_name: str, version: int) -> Metadata:
        """
        Get the metadata of a specific version of a stream.

        Args:
            stream_name (str): name of a stream
            version (int): version of a stream

        Returns:
            Metadata: Returns None if no metadata is available for the given stream_name and version, or a Metadata object otherwise.
        Raises:
            ValueError: stream_name cannot be None or empty.
        Examples:
            >>> CC = CerebralCortex("/directory/path/of/configs/")
            >>> CC.get_stream_metadata_by_name("stream-name", 1)
            >>> Metadata # Metadata class object
        """
        if stream_name is None or stream_name=="":
            raise ValueError("stream_name cannot be None or empty.")

        rows = self.session.query(Stream.stream_metadata).filter((Stream.name == stream_name) & (Stream.version==version) & (Stream.study_name==self.study_name)).first()

        if rows:
            return Metadata().from_json_file(rows.stream_metadata)
        else:
            return None
Example #5
    def mapInPandas(self, func, schema):
        """
        Maps an iterator of batches in the current :class:`DataFrame` using a Python native
        function that takes and outputs a pandas DataFrame, and returns the result as a
        :class:`DataFrame`.
        The function should take an iterator of `pandas.DataFrame`\\s and return
        another iterator of `pandas.DataFrame`\\s. All columns are passed
        together as an iterator of `pandas.DataFrame`\\s to the function and the
        returned iterator of `pandas.DataFrame`\\s are combined as a :class:`DataFrame`.
        Each `pandas.DataFrame` size can be controlled by
        `spark.sql.execution.arrow.maxRecordsPerBatch`.

        Args:
            func: function a Python native function that takes an iterator of `pandas.DataFrame`, and
                outputs an iterator of `pandas.DataFrame`.

            schema: :class:`pyspark.sql.types.DataType` or str
                the return type of the `func` in PySpark. The value can be either a
                :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string.

        Returns:
            DataStream: a new DataStream object with the transformed data and blank metadata

        Examples:
            >>> def filter_func(iterator):
            ...     for pdf in iterator:
            ...         yield pdf[pdf.id == 1]
            >>> ds.mapInPandas(filter_func, ds.schema).show()
        """

        return DataStream(data=self._data.mapInPandas(func=func,
                                                      schema=schema),
                          metadata=Metadata())
Example #6
    def applyInPandas(self, func, schema):
        """
        The function should take a `pandas.DataFrame` and return another
        `pandas.DataFrame`. For each group, all columns are passed together as a `pandas.DataFrame`
        to the user-function and the returned `pandas.DataFrame` are combined as a
        `DataFrame`.

        The `schema` should be a `StructType` describing the schema of the returned
        `pandas.DataFrame`. The column labels of the returned `pandas.DataFrame` must either match
        the field names in the defined schema if specified as strings, or match the
        field data types by position if not strings, e.g. integer indices.
        The length of the returned `pandas.DataFrame` can be arbitrary.

        Args:
            func: a Python native function that takes a `pandas.DataFrame`, and outputs a `pandas.DataFrame`.
            schema: :class:`pyspark.sql.types.DataType` or str
                the return type of the `func` in PySpark. The value can be either a
                :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string.

        Returns:
            DataStream: a new DataStream object with the transformed data and blank metadata

        """
        return DataStream(data=self._data.applyInPandas(func=func,
                                                        schema=schema),
                          metadata=Metadata())
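
A minimal usage sketch (hypothetical `grouped_ds` and column names), mirroring PySpark's grouped applyInPandas: the underlying data must already be grouped, the function receives each group as a pandas DataFrame, and the returned DataFrame must match the declared schema.

    def subtract_mean(pdf):
        # pdf holds one group's rows as a pandas DataFrame
        return pdf.assign(value=pdf.value - pdf.value.mean())

    result = grouped_ds.applyInPandas(subtract_mean, schema="user string, timestamp timestamp, value double")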
Example #7
    def join_stress_streams(self, dataStream, propagation='forward'):
        """
        Join this stream with a stress datastream on the 'user', 'timestamp', 'localtime', and 'version' columns, forward-fill the 'data_quality' column per user, and keep only rows with non-null ecg values.

        Args:
            dataStream (DataStream): datastream object to join with
            propagation (str): direction in which 'data_quality' values are propagated (default: 'forward')

        Returns:
            DataStream: this will return a new datastream object with blank metadata
        """
        combined_df = self._data.join(
            dataStream.data,
            on=['user', 'timestamp', 'localtime', 'version'],
            how='full').orderBy('timestamp')
        combined_filled = combined_df.withColumn(
            "data_quality",
            F.last('data_quality', True).over(
                Window.partitionBy('user').orderBy('timestamp').rowsBetween(
                    -sys.maxsize, 0)))
        combined_filled_filtered = combined_filled.filter(
            combined_filled.ecg.isNotNull())

        return DataStream(data=combined_filled_filtered, metadata=Metadata())
Example #8
    def compute(self, udfName, windowDuration: int = None, slideDuration: int = None,
                      groupByColumnName: List[str] = [], startTime=None):
        """
        Run an algorithm. This method supports running a UDF method on windowed data

        Args:
            udfName: Name of the algorithm
            windowDuration (int): duration of a window in seconds
            slideDuration (int): slide duration of a window in seconds
            groupByColumnName (List[str]): groupby column names, for example, groupby user, col1, col2
            startTime (datetime): The startTime is the offset with respect to 1970-01-01 00:00:00 UTC with which to start window intervals. For example, in order to have hourly tumbling windows that start 15 minutes past the hour, e.g. 12:15-13:15, 13:15-14:15, ..., provide startTime as 15 minutes. First time of data will be used as startTime if none is provided
        Returns:
            DataStream: this will return a new datastream object with blank metadata

        """
        if slideDuration:
            slideDuration = str(slideDuration) + " seconds"
            
        if 'custom_window' in self._data.columns:
            data = self._data.groupby('user', 'custom_window').apply(udfName)
        else:
            groupbycols = ["user", "version"]
        
            if windowDuration:
                windowDuration = str(windowDuration) + " seconds"
                win = F.window("timestamp", windowDuration=windowDuration, slideDuration=slideDuration, startTime=startTime)
                groupbycols.append(win)

            if len(groupByColumnName) > 0:
                groupbycols.extend(groupByColumnName)

            data = self._data.groupBy(groupbycols).apply(udfName)

        return DataStream(data=data, metadata=Metadata())
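
A usage sketch (hypothetical UDF and schema): compute() expects a grouped-map pandas UDF, and the window arguments translate into a Spark window grouping.

    @pandas_udf(result_schema, PandasUDFType.GROUPED_MAP)  # result_schema: hypothetical StructType
    def mean_value(pdf):
        # reduce each (user, window) group to a single row; columns must match result_schema
        return pd.DataFrame([[pdf['user'].iloc[0], pdf['value'].mean()]],
                            columns=['user', 'mean_value'])

    stats = ds.compute(mean_value, windowDuration=3600)  # hourly tumbling windows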
Example #9
def ema_logs(ds):
    """
    Convert json column to multiple columns.

    Args:
        ds (DataStream): Windowed/grouped DataStream object

    Returns:
        DataStream: datastream object with the parsed EMA log columns

    """
    schema = StructType([
        StructField("timestamp", TimestampType()),
        StructField("localtime", TimestampType()),
        StructField("user", StringType()),
        StructField("version", IntegerType()),
        StructField("status", StringType()),
        StructField("ema_id", StringType()),
        StructField("schedule_timestamp", TimestampType()),
        StructField("operation", StringType())
    ])

    @pandas_udf(schema, PandasUDFType.GROUPED_MAP)
    def parse_ema_logs(user_data):
        all_vals = []
        for index, row in user_data.iterrows():
            ema = row["log"]
            if not isinstance(ema, dict):
                ema = json.loads(ema)

            operation = ema["operation"].lower()
            if operation != "condition":
                status = ema.get("status", "")
                ema_id = ema["id"]
                schedule_timestamp = ema.get("logSchedule",
                                             {}).get("scheduleTimestamp")
                if schedule_timestamp:
                    schedule_timestamp = pd.to_datetime(schedule_timestamp,
                                                        unit='ms')

                all_vals.append([
                    row["timestamp"], row["localtime"], row["user"], 1, status,
                    ema_id, schedule_timestamp, operation
                ])

        return pd.DataFrame(all_vals,
                            columns=[
                                'timestamp', 'localtime', 'user', 'version',
                                'status', 'ema_id', 'schedule_timestamp',
                                'operation'
                            ])

    # check if datastream object contains grouped type of DataFrame
    if not isinstance(ds._data, GroupedData):
        raise Exception(
            "DataStream object is not grouped data type. Please use 'window' operation on datastream object before running this algorithm"
        )

    data = ds._data.apply(parse_ema_logs)
    return DataStream(data=data, metadata=Metadata())
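
A usage sketch: the stream must be grouped first, as the check above enforces (the exact 'window' call signature is assumed from the error message).

windowed = ds.window()       # group the raw EMA log stream
parsed = ema_logs(windowed)  # one row per non-condition log entry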
Example #10
def interpolate(ds,
                freq=16,
                method='linear',
                axis=0,
                limit=None,
                inplace=False,
                limit_direction='forward',
                limit_area=None,
                downcast=None):
    """
    Interpolate values according to different methods. This method internally uses pandas interpolation.

    Args:
        ds (DataStream): Windowed/grouped DataStream object
        freq (int): Frequency of the signal
        method (str): default 'linear'
            - 'linear': Ignore the index and treat the values as equally spaced. This is the only method supported on MultiIndexes.
            - 'time': Works on daily and higher resolution data to interpolate given length of interval.
            - 'index', 'values': use the actual numerical values of the index.
            - 'pad': Fill in NaNs using existing values.
            - 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', 'spline', 'barycentric', 'polynomial': Passed to scipy.interpolate.interp1d. These methods use the numerical values of the index. Both 'polynomial' and 'spline' require that you also specify an order (int), e.g. df.interpolate(method='polynomial', order=5).
            - 'krogh', 'piecewise_polynomial', 'spline', 'pchip', 'akima': Wrappers around the SciPy interpolation methods of similar names. See Notes.
            - 'from_derivatives': Refers to scipy.interpolate.BPoly.from_derivatives which replaces 'piecewise_polynomial' interpolation method in scipy 0.18.
        axis ({0 or 'index', 1 or 'columns', None}): default None. Axis to interpolate along.
        limit (int): optional. Maximum number of consecutive NaNs to fill. Must be greater than 0.
        inplace (bool): default False. Update the data in place if possible.
        limit_direction ({'forward', 'backward', 'both'}): default 'forward'. If limit is specified, consecutive NaNs will be filled in this direction.
        limit_area ({None, 'inside', 'outside'}): default None. If limit is specified, consecutive NaNs will be filled with this restriction.
            - None: No fill restriction.
            - 'inside': Only fill NaNs surrounded by valid values (interpolate).
            - 'outside': Only fill NaNs outside valid values (extrapolate).
        downcast ('infer' or None): default None.

    Returns:
        DataStream: interpolated data

    """
    schema = ds._data.schema
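    # 1000 ms divided by the sampling rate gives the sample period in milliseconds (62.5 ms at the default 16 Hz)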
    sample_freq = 1000 / freq

    @pandas_udf(schema, PandasUDFType.GROUPED_MAP)
    def interpolate_data(pdf):
        pdf.set_index("timestamp", inplace=True)
        pdf = pdf.resample(str(sample_freq) + "ms").bfill(limit=1).interpolate(
            method=method,
            axis=axis,
            limit=limit,
            inplace=inplace,
            limit_direction=limit_direction,
            limit_area=limit_area,
            downcast=downcast)
        pdf.ffill(inplace=True)
        pdf.reset_index(drop=False, inplace=True)
        pdf.sort_index(axis=1, inplace=True)
        return pdf

    data = ds._data.groupby(["user", "version"]).apply(interpolate_data)
    return DataStream(data=data, metadata=Metadata())
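
A usage sketch: regularize a raw signal onto its nominal sampling grid (the 25 Hz value here is hypothetical).

regular_ds = interpolate(ds, freq=25, method='linear')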
    def get_metadata():
        """
        generate metadata for the stream

        Returns:
            Metadata object
        """
        stream_metadata = Metadata()
        stream_metadata.set_name(stream_name).set_description("ECG RR interval in milliseconds") \
            .add_input_stream(ecg_data.metadata.get_name()) \
            .add_dataDescriptor(
            DataDescriptor().set_name("rr").set_type("float") \
                .set_attribute("description","rr interval")) \
            .add_module(
            ModuleMetadata().set_name("ecg rr interval") \
                .set_attribute("url","http://md2k.org/") \
                .set_attribute('algorithm','pan-tomkins').set_attribute('unit','ms').set_author("Md Azim Ullah", "*****@*****.**"))
        return stream_metadata
Example #12
    def get_metadata():
        stream_metadata = Metadata()
        stream_metadata.set_name(stream_name).set_description("Stress episodes computed using MACD formula.") \
            .add_input_stream(ecg_stress_probability.metadata.get_name()) \
            .add_dataDescriptor(DataDescriptor().set_name("timestamp").set_type("datetime")) \
            .add_dataDescriptor(DataDescriptor().set_name("localtime").set_type("datetime")) \
            .add_dataDescriptor(DataDescriptor().set_name("version").set_type("int")) \
            .add_dataDescriptor(DataDescriptor().set_name("user").set_type("string")) \
            .add_dataDescriptor(
                DataDescriptor().set_name("stress_probability").set_type("float")) \
            .add_dataDescriptor(
                DataDescriptor().set_name("stress_episode").set_type("string")
                .set_attribute("description", "stress episodes calculated using MACD")) \
            .add_module(
                ModuleMetadata().set_name("cerebralcortex.algorithm.stress_prediction.stress_episodes.compute_stress_episodes")
                .set_attribute("url", "http://md2k.org/")
                .set_author("Anandatirtha Nandugudi", "*****@*****.**"))
        return stream_metadata
Example #13
def process_save_stream(msg: dict, cc_config_path: str):
    """
    Process a Kafka message, add Gaussian noise to the data, and store the result as a new stream

    Args:
        msg (dict): kafka message - {'filename': str, 'metadata_hash': str, "stream_name": str, "user_id": str}
        cc_config_path (str): path of cerebralcortex configs

    Notes:
        This method creates the CC object again because this code runs on a worker node, which does not have access to the CC object created in run().
        The CC object cannot be passed to worker nodes because it contains sockets, and sockets cannot be serialized by Spark to be passed as a parameter.

    """

    # Disable pandas warnings
    warnings.simplefilter(action='ignore', category=FutureWarning)

    CC = Kernel(cc_config_path, enable_spark=False)
    cc_config = CC.config
    stream_name = msg.get("stream_name")
    user_id = msg.get("user_id")

    if cc_config["nosql_storage"] == "filesystem":
        file_name = str(
            cc_config["filesystem"]["filesystem_path"]) + msg.get("filename")
    elif cc_config["nosql_storage"] == "hdfs":
        file_name = str(
            cc_config["hdfs"]["raw_files_dir"]) + msg.get("filename")
    else:
        raise Exception(
            str(cc_config["nosql_storage"]) +
            " is not supported. Please use filesystem or hdfs.")

    if os.path.exists(file_name):
        data = pq.read_table(file_name)
        pdf = data.to_pandas()

        pdf = add_gaussian_noise(pdf)

        new_stream_name = stream_name + "_gaussian_noise"

        metadata = Metadata().set_name(new_stream_name).set_description("Gaussian noise added to the accel sensor stream.") \
            .add_dataDescriptor(
            DataDescriptor().set_attribute("description", "noisy accel x")) \
            .add_dataDescriptor(
            DataDescriptor().set_attribute("description", "noisy accel y")) \
            .add_dataDescriptor(
            DataDescriptor().set_attribute("description", "noisy accel z")) \
            .add_module(
            ModuleMetadata().set_name("cerebralcortex.streaming_operation.main").set_version("0.0.1").set_attribute("description", "Spark streaming example using CerebralCortex. This example adds gaussian noise to a stream data.").set_author(
                "test_user", "test_user@test_email.com"))

        pdf["user"] = user_id
        ds = DataStream(data=pdf, metadata=metadata)
        CC.save_stream(ds)
    else:
        print(file_name, "does not exist.")
Example #14
def count_encounters_per_cluster(ds, multiplier=10):
    schema = StructType([
        StructField('timestamp', TimestampType()),
        StructField('localtime', TimestampType()),
        StructField('version', IntegerType()),
        StructField('latitude', DoubleType()),
        StructField('longitude', DoubleType()),
        StructField('n_users', IntegerType()),
        StructField('total_encounters', DoubleType()),
        StructField('avg_encounters', DoubleType()),
        StructField('normalized_total_encounters', DoubleType())
    ])

    @pandas_udf(schema, PandasUDFType.GROUPED_MAP)
    def count_encounters(data):
        if data.shape[0] == 0:
            return pd.DataFrame([],
                                columns=[
                                    'normalized_total_encounters', 'version',
                                    'latitude', 'longitude', 'n_users',
                                    'total_encounters', 'avg_encounters',
                                    'timestamp', 'localtime'
                                ])
        data = data.sort_values('localtime').reset_index(drop=True)
        centroid_id = data['centroid_id'].iloc[0]
        centroid_latitude = data['centroid_latitude'].iloc[0]
        centroid_longitude = data['centroid_longitude'].iloc[0]
        unique_users = np.unique(
            list(data['user'].unique()) +
            list(data['participant_identifier'].unique()))
        data['count'] = 1
        total_encounters = data.groupby(
            'user', as_index=False).sum()['count'].sum() + data.groupby(
                'participant_identifier', as_index=False).sum()['count'].sum()
        average_encounter = (total_encounters) / len(unique_users)
        total_encounters = data.shape[0]
        normalized_total_encounters = total_encounters * multiplier / data[
            'centroid_area'].iloc[0]
        timestamp = data['timestamp'].iloc[data.shape[0] // 2]
        localtime = data['localtime'].iloc[data.shape[0] // 2]
        version = data['version'].iloc[0]
        return pd.DataFrame([[
            normalized_total_encounters, version, centroid_latitude,
            centroid_longitude,
            len(unique_users), total_encounters, average_encounter, timestamp,
            localtime
        ]],
                            columns=[
                                'normalized_total_encounters', 'version',
                                'latitude', 'longitude', 'n_users',
                                'total_encounters', 'avg_encounters',
                                'timestamp', 'localtime'
                            ])

    data = ds._data.groupBy(['centroid_id', 'version']).apply(count_encounters)

    return DataStream(data=data, metadata=Metadata())
Example #15
    def cogroup(self, other):
        """
        Cogroups this group with another group so that we can run cogrouped operations.

        Returns:
            DataStream: a new DataStream object wrapping the cogrouped data

        """

        return DataStream(data=self._data.cogroup(other=other),
                          metadata=Metadata())
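
A usage sketch (hypothetical function and schema), following Spark's cogrouped-map pattern: since the wrapper passes `other` straight to Spark, it should be the underlying grouped data, and the cogrouped result can then be processed with applyInPandas, which takes a function over a pair of pandas DataFrames, one per group from each side.

    def asof_join(left, right):
        # merge the two groups' rows on nearest timestamp (inputs assumed sorted by timestamp)
        return pd.merge_asof(left, right, on="timestamp")

    joined = grouped_ds1.cogroup(grouped_ds2._data)._data.applyInPandas(asof_join, schema="timestamp timestamp, value double")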
Example #16
def ema_incentive(ds):
    """
    Parse stream name 'incentive--org.md2k.ema_scheduler--phone'. Convert json column to multiple columns.

    Args:
        ds: Windowed/grouped DataStream object

    Returns:
        DataStream: datastream object with the parsed incentive columns.
    """
    schema = StructType([
        StructField("timestamp", TimestampType()),
        StructField("localtime", TimestampType()),
        StructField("user", StringType()),
        StructField("version", IntegerType()),
        StructField("incentive", FloatType()),
        StructField("total_incentive", FloatType()),
        StructField("ema_id", StringType()),
        StructField("data_quality", FloatType())
    ])

    @pandas_udf(schema, PandasUDFType.GROUPED_MAP)
    def parse_ema_incentive(user_data):
        all_vals = []
        for index, row in user_data.iterrows():
            ema = row["incentive"]
            if not isinstance(ema, dict):
                ema = json.loads(ema)

            incentive = ema["incentive"]
            total_incentive = ema["totalIncentive"]
            ema_id = ema["emaId"]
            data_quality = ema["dataQuality"]

            all_vals.append([
                row["timestamp"], row["localtime"], row["user"], 1, incentive,
                total_incentive, ema_id, data_quality
            ])

        return pd.DataFrame(all_vals,
                            columns=[
                                'timestamp', 'localtime', 'user', 'version',
                                'incentive', 'total_incentive', 'ema_id',
                                'data_quality'
                            ])

    # check if datastream object contains grouped type of DataFrame
    if not isinstance(ds._data, GroupedData):
        raise Exception(
            "DataStream object is not grouped data type. Please use 'window' operation on datastream object before running this algorithm"
        )

    data = ds._data.apply(parse_ema_incentive)
    return DataStream(data=data, metadata=Metadata())
Example #17
    def get_metadata():
        stream_metadata = Metadata()
        stream_metadata.set_name(stream_name).set_description("stress likelihood computed from ECG") \
            .add_input_stream(stress_features_normalized.metadata.get_name()) \
            .add_dataDescriptor(
                DataDescriptor().set_name("stress_probability")
                .set_type("double").set_attribute("description", "stress likelihood computed from ECG only model")
                .set_attribute("threshold", "0.47")) \
            .add_dataDescriptor(
                DataDescriptor().set_name("window")
                .set_type("struct")
                .set_attribute("description", "window start and end time in UTC")
                .set_attribute('start', 'start of 1 minute window')
                .set_attribute('end', 'end of 1 minute window')) \
            .add_module(
                ModuleMetadata().set_name("ECG Stress Model")
                .set_attribute("url", "http://md2k.org/")
                .set_attribute('algorithm', 'cStress')
                .set_attribute('unit', 'ms').set_author("Md Azim Ullah", "*****@*****.**"))
        return stream_metadata
Example #18
    def withColumn(self, colName, col):
        """
        Returns a new DataStream by adding a column or replacing the existing column that has the same name. The column expression must be an expression over this DataStream; attempting to add a column from some other datastream will raise an error.

        Args:
            colName (str): name of the new column.
            col: a Column expression for the new column.

        Returns:
            DataStream: datastream with the added or replaced column

        Examples:
            >>> ds.withColumn('col_name', ds.col_name + 2)
        """
        data = self._data.withColumn(colName=colName, col=col)
        return DataStream(data=data, metadata=Metadata())
Example #19
    def limit(self, num):
        """
        Limits the result count to the number specified.

        Args:
            num (int): maximum number of rows to return

        Returns:
            DataStream:
        """
        data = self._data.limit(num=num)
        return DataStream(data=data, metadata=Metadata())
Example #20
    def orderBy(self, *cols):
        """
        Order the datastream by the given column(s)

        Args:
            *cols: column name(s) to order by

        Returns:
            DataStream:
        """
        data = self._data.orderBy(*cols)
        return DataStream(data=data, metadata=Metadata())
def generate_metadata_encounter_daily():
    stream_metadata = Metadata()
    stream_metadata.set_name('mcontain-md2k-encounter-daily--bluetooth-gps').set_description('Contains each unique encounter between two persons along with the location of encounter') \
        .add_dataDescriptor(
        DataDescriptor().set_name("start_time").set_type("timestamp").set_attribute("description", \
                                                                                    "Start time of the encounter in localtime")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("end_time").set_type("timestamp").set_attribute("description", \
                                                                                  "End time of the encounter in localtime")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("participant_identifier").set_type("string").set_attribute("description", \
                                                                                             "Participant with whom encounter happened")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("os").set_type("string").set_attribute("description", \
                                                                         "Operating system of the phone belonging to user")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("latitude").set_type("double").set_attribute("description", \
                                                                               "Latitude of encounter location")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("longitude").set_type("double").set_attribute("description", \
                                                                                "Longitude of encounter location")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("durations").set_type("array").set_attribute("description", \
                                                                               "Mean distance between participants in encounter")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("covid").set_type("integer").set_attribute("description", \
                                                                             "0, 1 or 2 indicating if this encounter contained a covid user -- 0 - no covid-19 affected, 1 - user is, 2 - participant identifier is"))
    stream_metadata.add_module(
        ModuleMetadata().set_name('Encounter computation after parsing raw bluetooth-gps data, clustering gps locations and removing double counting') \
            .set_attribute("url", "https://mcontain.md2k.org").set_author(
            "Md Azim Ullah", "*****@*****.**"))
    return stream_metadata
def generate_metadata_notif():
    stream_metadata = Metadata()
    stream_metadata.set_name('mcontain-md2k--user-notifications').set_description('Notifications generated for users who encountered COVID-19 affected users.') \
        .add_dataDescriptor(
        DataDescriptor().set_name("user").set_type("string").set_attribute("description", \
                                                                           "user id")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("timestamp").set_type("timestamp").set_attribute("description", \
                                                                                   "Unix timestamp when the message was generated")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("localtime").set_type("timestamp").set_attribute("description", \
                                                                                   "Local timestamp when the message was generated.")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("message").set_type("string").set_attribute("description", \
                                                                              "Generated notification message")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("day").set_type("timestamp").set_attribute("description", \
                                                                             "day of the encounter")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("version").set_type("int").set_attribute("description", \
                                                                           "version"))
    stream_metadata.add_module(
        ModuleMetadata().set_name('Generated notification for a user encountered with Covid-19 participant') \
            .set_attribute("url", "https://mcontain.md2k.org").set_author(
            "Md Shiplu Hawlader", "*****@*****.**").set_version(1))
    return stream_metadata
Example #23
def generate_metadata_hourly():
    stream_metadata = Metadata()
    stream_metadata.set_name('mcontain-md2k--visualization-stats--time-window').set_description('Computes visualization stats for every time window defined by start time and end time') \
        .add_dataDescriptor(
        DataDescriptor().set_name("start_time").set_type("timestamp").set_attribute("description", \
                                                                                    "Start time of the time window localtime")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("end_time").set_type("timestamp").set_attribute("description", \
                                                                                  "End time of the time window in localtime")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("latitude").set_type("double").set_attribute("description", \
                                                                               "Latitude of centroid location, a gps cluster output grouping encounters in similar location together")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("longitude").set_type("double").set_attribute("description", \
                                                                                "Longitude of centroid location, a gps cluster output grouping encounters in similar location together")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("n_users").set_type("integer").set_attribute("description", \
                                                                               "Number of unique users in that cluster centroid")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("total_encounters").set_type("double").set_attribute("description", \
                                                                                       "Total encounters happening in the time window in this specific location")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("normalized_total_encounters").set_type("double").set_attribute("description", \
                                                                                                  "Total encounters normalized by the centroid area. (encounters per 10 square meter)")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("avg_encounters").set_type("double").set_attribute("description", \
                                                                                     "average encounter per participant(participants who had at least one encounter)"))
    stream_metadata.add_module(
        ModuleMetadata().set_name('Visualization stats computation in a time window between start time and end time') \
            .set_attribute("url", "https://mcontain.md2k.org").set_author(
            "Md Azim Ullah", "*****@*****.**"))
    return stream_metadata
def generate_metadata_dailystats():
    stream_metadata = Metadata()
    stream_metadata.set_name('mcontain-md2k--daily-stats').set_description('Daily stats for website') \
        .add_dataDescriptor(
        DataDescriptor().set_name("start_time").set_type("timestamp").set_attribute("description", \
                                                                                    "Start time of the day in localtime")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("end_time").set_type("timestamp").set_attribute("description", \
                                                                                  "End time of the day in localtime")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("number_of_app_users").set_type("double").set_attribute("description", \
                                                                                          "Total number of app users")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("encounter_per_user").set_type("double").set_attribute("description", \
                                                                                         "Average encounter per user")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("total_covid_encounters").set_type("double").set_attribute("description", \
                                                                                             "Total covid encounters on the day")) \
        .add_dataDescriptor(
        DataDescriptor().set_name("maximum_concurrent_encounters").set_type("double").set_attribute("description", \
                                                                                                    "Maximum concurrent encounters"))
    stream_metadata.add_module(
        ModuleMetadata().set_name('Daily encounter stats for all the users to be shown in website') \
            .set_attribute("url", "https://mcontain.md2k.org").set_author(
            "Md Azim Ullah", "*****@*****.**"))
    return stream_metadata
Example #25
def combine_base_encounters(base_encounters, time_threshold=10 * 60):
    schema = StructType([
        StructField('timestamp', TimestampType()),
        StructField('localtime', TimestampType()),
        StructField('start_time', TimestampType()),
        StructField('end_time', TimestampType()),
        StructField('user', StringType()),
        StructField('version', IntegerType()),
        StructField('latitude', DoubleType()),
        StructField('distances', ArrayType(DoubleType())),
        StructField('longitude', DoubleType()),
        StructField('average_count', DoubleType()),
        StructField('participant_identifier', StringType()),
        StructField('os', StringType())
    ])
    columns = [a.name for a in schema.fields]
    @pandas_udf(schema, PandasUDFType.GROUPED_MAP)
    def get_encounters(data):
        if data.shape[0] == 1:
            if (pd.Timestamp(data['end_time'].values[0]) -
                    pd.Timestamp(data['start_time'].values[0])
                ).total_seconds() < time_threshold:
                return pd.DataFrame([], columns=columns)
            return data[columns]
        data = data.sort_values('start_time').reset_index(drop=True)
        ts = data['timestamp'].astype('datetime64[ns]').quantile(.5)
        local_ts = data['localtime'].astype('datetime64[ns]').quantile(.5)
        st = data['start_time'].min()
        et = data['end_time'].max()
        if (pd.Timestamp(et) -
                pd.Timestamp(st)).total_seconds() < time_threshold:
            return pd.DataFrame([], columns=columns)
        user = data['user'].values[0]
        version = 1
        latitude = data['latitude'].mean()
        longitude = data['longitude'].mean()
        distances = []
        for i, row in data.iterrows():
            distances.extend(list(row['distances']))
        average_count = data['average_count'].mean()
        os = data['os'].values[0]
        participant_identifier = data['participant_identifier'].values[0]
        return pd.DataFrame([[
            ts, local_ts, st, et, user, version, latitude, distances,
            longitude, average_count, participant_identifier, os
        ]],
                            columns=columns)

    data_result = base_encounters.groupBy(['user', 'participant_identifier']).apply(get_encounters)
    return DataStream(data=data_result, metadata=Metadata())
Example #26
    def freqItems(self, cols, support=None):
        """
        Finding frequent items for columns, possibly with false positives, using the frequent element count algorithm described in http://dx.doi.org/10.1145/762471.762473, proposed by Karp, Schenker, and Papadimitriou.

        Returns:
            DataStream:

        Examples:
            >>> ds.freqItems(["col-name"])
        """

        data = self._data.freqItems(cols=cols, support=support)
        return DataStream(data=data, metadata=Metadata())
Example #27
    def distinct(self):
        """
        Returns a new DataStream containing the distinct rows in this DataStream.

        Returns:
            DataStream: this will return a new datastream object with blank metadata

        Examples:
            >>> ds.distinct().count()
        """

        data = self._data.distinct()
        return DataStream(data=data, metadata=Metadata())
Example #28
    def filter_user(self, user_ids: List):
        """
        Filter data to get only the selected users' data

        Args:
            user_ids (List[str]): list of users' UUIDs
        Returns:
            DataStream: this will return a new datastream object with blank metadata
        """
        if not isinstance(user_ids, list):
            user_ids = [user_ids]
        data = self._data.where(self._data["user"].isin(user_ids))
        return DataStream(data=data, metadata=Metadata())
Example #29
    def colRegex(self, colName):
        """
        Selects columns based on the column name specified as a regex and returns them wrapped in a DataStream.

        Args:
            colName (str): column name specified as a regex.

        Returns:
            DataStream:

        Examples:
            >>> ds.colRegex("colName")
        """
        return DataStream(data=self._data.colRegex(colName=colName), metadata=Metadata())
Example #30
    def map_stream(self, window_ds):
        """
        Map/join a stream to a windowed stream

        Args:
            window_ds (Datastream): windowed datastream object

        Returns:
            Datastream: joined/mapped stream

        """
        window_ds = window_ds.data.drop("version", "user")
        df = window_ds.join(self.data, self.data.timestamp.between(F.col("window.start"), F.col("window.end")))
        return DataStream(data=df, metadata=Metadata())
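
A usage sketch: join a raw stream onto the intervals of a previously windowed stream; the `window` struct column with `start`/`end` fields is assumed to come from Spark's window function, and the windowing call shown is hypothetical.

    windowed = other_ds.window(windowDuration=60)  # hypothetical windowing step
    joined = raw_ds.map_stream(windowed)           # rows tagged with their window interval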