Example #1
    def __init__(self,
                 CC_obj: CerebralCortex,
                 export_dir_path: str,
                 owner_ids: List = None,
                 owner_user_names: List = None,
                 owner_name_regex: str = None,
                 start_time: str = None,
                 end_time: str = None):
        """
        :param CC_obj:
        :param export_dir_path:
        :param owner_ids: owner_user_name and owner_name_regex must be None if using owner_id
        :param owner_user_names: owner_id and owner_name_regex must be None if using owner_user_name
        :param owner_name_regex: owner_id and owner_user_name must be None if using owner_name_reges
        :param start_time:
        :param end_time:
        """

        self.streamData = Data(CC_obj)
        self.export_dir_path = export_dir_path
        self.metadata = Metadata(CC_obj)
        self.owner_ids = owner_ids
        self.owner_user_names = owner_user_names
        self.owner_name_regex = str(owner_name_regex)
        self.start_time = start_time
        self.end_time = end_time
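
The three owner selectors are mutually exclusive. A minimal usage sketch, assuming this __init__ belongs to the DataExporter class shown in Example #9; the configuration path, export directory, and user names are hypothetical:

# Exactly one of owner_ids / owner_user_names / owner_name_regex is supplied;
# the other two stay None.
CC = CerebralCortex("/path/to/cc_configuration.yml")  # hypothetical config path

exporter_by_names = DataExporter(CC,
                                 export_dir_path="/data/export/",
                                 owner_user_names=["user01", "user02"])

exporter_by_regex = DataExporter(CC,
                                 export_dir_path="/data/export/",
                                 owner_name_regex="^study01_.*")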
Example #2
    def get_stream_samples(self, stream_id, day, start_time=None, end_time=None) -> List:
        """
        Return the sample values of a stream for one day
        :param stream_id:
        :param day:
        :param start_time:
        :param end_time:
        :return: list of sample values
        """
        return Data(self).get_stream_samples(stream_id, day, start_time, end_time)

    def get_cassandra_raw_data(self, stream_id: uuid, day: str) -> List:
        """
        Load the raw rows of a stream for one day directly from Cassandra
        :param stream_id:
        :param day:
        :return: list of raw rows
        """
        return Data(self).load_cassandra_data(stream_id, day)
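
A usage sketch for both accessors, assuming a configured CerebralCortex instance; the stream UUID is a placeholder, and day uses the "%Y%m%d" format seen in Example #9:

import uuid

CC = CerebralCortex("/path/to/cc_configuration.yml")  # hypothetical config path
stream_id = uuid.UUID("00000000-0000-0000-0000-000000000000")  # placeholder ID

# Sample values of the stream for one day, optionally narrowed by time window
samples = CC.get_stream_samples(stream_id, day="20240101")

# Unparsed Cassandra rows for the same stream and day
raw_rows = CC.get_cassandra_raw_data(stream_id, "20240101")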
Example #4
    def get_stream_dataframe(self, stream_identifier: uuid, start_time: datetime = None, end_time: datetime = None,
                             data_type: enumerate = DataSet.COMPLETE) -> dict:
        """
        :param stream_identifier:
        :param start_time:
        :param end_time:
        :param data_type: accepts only three values (i.e., complete, data only, or metadata only)
        :return: {"metadata": dict, "data": DataFrame}
        """
        return Data(self).get_stream_dataframe(stream_identifier, start_time, end_time, data_type)
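
A sketch of both fetch modes, assuming the DataSet enum exposes ONLY_METADATA alongside COMPLETE, as the docstring's three options suggest:

# Fetch metadata alone first, then the complete {"metadata", "data"} dict
meta_only = CC.get_stream_dataframe(stream_id, data_type=DataSet.ONLY_METADATA)

full = CC.get_stream_dataframe(stream_id)      # DataSet.COMPLETE by default
df = full["data"]            # DataFrame of the stream's rows
metadata = full["metadata"]  # dict of the stream's metadata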
Example #5
    def get_datastream(self, stream_identifier: uuid, day=None, start_time: datetime = None, end_time: datetime = None,
                       data_type: enumerate = DataSet.COMPLETE) -> DataStream:
        """
        Returns a DataStream with data and metadata
        :param stream_identifier:
        :param day:
        :param start_time:
        :param end_time:
        :param data_type:
        :return: DataStream object
        """
        return Data(self).get_stream(stream_identifier, day, start_time, end_time, data_type)
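
A sketch of fetching a bounded window within one day; the timestamps are hypothetical:

from datetime import datetime

ds = CC.get_datastream(stream_id,
                       day="20240101",
                       start_time=datetime(2024, 1, 1, 9, 0),   # hypothetical window
                       end_time=datetime(2024, 1, 1, 17, 0),
                       data_type=DataSet.COMPLETE)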
Example #6
    def filter_stream(self, data_stream_id: uuid, annotation_stream_name: str, annotation: str,
                      start_time: datetime = None, end_time: datetime = None) -> List[DataPoint]:
        """
        Maps a derived annotation stream onto a data stream and returns a list of the mapped DataPoints
        :param data_stream_id:
        :param annotation_stream_name:
        :param annotation:
        :param start_time:
        :param end_time:
        :return: DataPoints that carry the requested annotation
        """
        annotation_stream_id = Metadata(self).get_annotation_id(data_stream_id, annotation_stream_name)
        return Data(self).get_annotation_stream(data_stream_id, annotation_stream_id, annotation, start_time, end_time)
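
A sketch under the assumption that a derived annotation stream with the given name exists for the data stream; the stream name and label below are hypothetical:

# Keep only the DataPoints of the raw stream that the annotation stream
# labels "smoking"
points = CC.filter_stream(data_stream_id=stream_id,
                          annotation_stream_name="org.md2k.analysis.smoking_episode",  # hypothetical
                          annotation="smoking")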
Example #7
    def save_datastream(self, datastream: DataStream):
        """
        Save a DataStream to the appropriate datastores
        :param datastream:
        """
        Data(self).store_stream(datastream)
Example #8
    def save_datastream_to_influxdb(self, datastream: DataStream):
        """
        Save a DataStream to InfluxDB
        :param datastream:
        """
        Data(self).store_data_to_influxdb(datastream)
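
A sketch covering both save paths, assuming ds is the DataStream fetched in Example #5:

CC.save_datastream(ds)              # primary datastores (raw data in Cassandra)
CC.save_datastream_to_influxdb(ds)  # mirror the stream to InfluxDB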
Example #9
class DataExporter:
    def __init__(self,
                 CC_obj: CerebralCortex,
                 export_dir_path: str,
                 owner_ids: List = None,
                 owner_user_names: List = None,
                 owner_name_regex: str = None,
                 start_time: str = None,
                 end_time: str = None):
        """
        :param CC_obj:
        :param export_dir_path:
        :param owner_ids: owner_user_name and owner_name_regex must be None if using owner_id
        :param owner_user_names: owner_id and owner_name_regex must be None if using owner_user_name
        :param owner_name_regex: owner_id and owner_user_name must be None if using owner_name_reges
        :param start_time:
        :param end_time:
        """

        self.streamData = Data(CC_obj)
        self.export_dir_path = export_dir_path
        self.metadata = Metadata(CC_obj)
        self.owner_ids = owner_ids
        self.owner_user_names = owner_user_names
        self.owner_name_regex = str(owner_name_regex)
        self.start_time = start_time
        self.end_time = end_time

    def start(self):
        # the selectors may arrive as the literal string 'None' (see the str()
        # conversion in __init__), so both forms are guarded against
        if self.owner_ids and self.owner_ids != 'None':
            for owner_id in self.owner_ids:
                owner_name = self.metadata.owner_id_to_name(owner_id)
                self.export_data(owner_id=owner_id, owner_name=owner_name)
        elif self.owner_user_names and self.owner_user_names != 'None':
            for owner_user_name in self.owner_user_names:
                owner_id = self.metadata.owner_name_to_id(owner_user_name)
                self.export_data(owner_id=owner_id, owner_name=owner_user_name)
        elif self.owner_name_regex and self.owner_name_regex != 'None':
            owner_idz = self.metadata.get_owner_ids_by_owner_name_regex(
                self.owner_name_regex)
            for owner_id in owner_idz:
                owner_name = self.metadata.owner_id_to_name(
                    owner_id["identifier"])
                self.export_data(owner_id=owner_id["identifier"],
                                 owner_name=owner_name)

    @calculate_time
    def export_data(self, owner_id=None, owner_name=None):

        rows = self.metadata.get_stream_metadata_by_owner_id(owner_id)
        if rows == "NULL":
            print("No data found for => owner-id: " + owner_id +
                  " - owner-name: " + owner_name)
            return

        for row in rows:
            stream_id = row["identifier"]
            data_start_time = row["start_time"]
            data_end_time = row["end_time"]
            stream_metadata = {
                "identifier": stream_id,
                "owner_id": row["owner"],
                "name": row["name"],
                "data_available": {
                    "start_time": str(data_start_time),
                    "end_time": str(data_end_time)
                }
            }

            data_descriptor = json.loads(row["data_descriptor"])
            execution_context = json.loads(row["execution_context"])
            annotations = json.loads(row["annotations"])

            stream_metadata.update({"data_descriptor": data_descriptor})
            stream_metadata.update({"execution_context": execution_context})
            stream_metadata.update({"annotations": annotations})

            file_path = self.export_dir_path + owner_name
            if not os.path.exists(file_path):
                os.mkdir(file_path)

            # write metadata to json file
            self.write_to_file(file_path + "/" + stream_id + ".json",
                               json.dumps(stream_metadata))

            # load and write the stream's raw data, one compressed file per day
            delta = data_end_time - data_start_time

            for i in range(delta.days + 1):
                day = data_start_time + timedelta(days=i)
                day = datetime.strftime(day, "%Y%m%d")
                self.writeStreamDataToZipFile(stream_id, day, file_path)

    def writeStreamDataToZipFile(self, stream_id: uuid, day, file_path: str):
        """
        :param stream_id:
        :param day:
        :param file_path:
        """
        if stream_id:
            where_clause = "identifier='" + stream_id + "' and day='" + str(
                day) + "'"
        else:
            raise ValueError("Missing owner ID.")

        if self.start_time and self.end_time:
            where_clause += " and start_time>=cast('" + str(
                self.start_time
            ) + "' as timestamp) and start_time<=cast('" + str(
                self.end_time) + "' as timestamp)"
        elif self.start_time and not self.end_time:
            where_clause += " and start_time>=cast('" + str(
                self.start_time) + "' as timestamp)"
        elif not self.start_time and self.end_time:
            where_clause += " and start_time<=cast('" + str(
                self.end_time) + "' as timestamp)"

        df = self.streamData.load_data_from_cassandra(
            self.streamData.datapointTable, where_clause, 1)
        df.write \
            .format("csv") \
            .option("codec", "org.apache.hadoop.io.compress.GzipCodec") \
            .save(file_path + "/" + stream_id)

        # concatenate the part files Spark produced into a single .gz archive
        os.system("cat " + file_path + "/" + stream_id + "/p* > " + file_path +
                  "/" + stream_id + ".gz")
        if os.path.exists(file_path + "/" + stream_id + "/"):
            shutil.rmtree(file_path + "/" + stream_id + "/",
                          ignore_errors=True)

    def write_to_bz2(self, file_name, data):
        with open(file_name, 'wb+') as outfile:
            compressed_data = bz2.compress(data, 9)
            outfile.write(compressed_data)

    def write_to_file(self, file_name: str, data: str):
        """
        :param file_name:
        :param data:
        """
        with open(file_name, 'w+') as outfile:
            outfile.write(data)
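
An end-to-end sketch with hypothetical paths and regex; note that export_dir_path needs a trailing slash, since export_data builds file paths by plain string concatenation:

exporter = DataExporter(CC_obj=CC,
                        export_dir_path="/data/export/",  # trailing slash required
                        owner_name_regex="^study01_.*",   # hypothetical pattern
                        start_time="2024-01-01 00:00:00",
                        end_time="2024-01-31 23:59:59")
exporter.start()  # one folder per owner, one .json + .gz pair per stream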
Example #10
    def save_datastream(self, datastream):
        Data(self.sc, self.sqlContext,
             self.configuration).store_datastream(datastream)
Example #11
    def get_datastream(self, stream_identifier):
        return Data(self.sc, self.sqlContext,
                    self.configuration).get_datastream(stream_identifier)
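
Examples #10 and #11 show an older variant of the same API, where the CerebralCortex object carries sc, sqlContext, and configuration itself; a round-trip sketch under that assumption:

ds = CC.get_datastream(stream_id)  # fetch via the legacy Data(...) wrapper
CC.save_datastream(ds)             # store it back through the same path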