import json
from datetime import datetime

from influxdb import InfluxDBClient

# Repo-internal helpers (CC, cc_log, DataStream, Metadata, get_gzip_file_contents,
# row_to_datapoint, row_to_datapoint_cus, rename_file, and the global cur_time)
# are assumed to be imported from the surrounding CerebralCortex modules.


def store_streams(data):
    """Persist a stream to InfluxDB and to the JSON datastream store."""
    try:
        st = datetime.now()
        CC.save_datastream_to_influxdb(data)
        CC.save_datastream(data, "json")
        print("Stream Saved: ", data['filename'], (datetime.now() - st))
    except Exception:
        cc_log()
def storeOffsetRanges(rdd):
    """Persist the Kafka offset range of each partition consumed in this RDD."""
    offsetRanges = rdd.offsetRanges()
    for offsets in offsetRanges:
        try:
            CC.store_or_update_Kafka_offset(offsets.topic, offsets.partition,
                                            offsets.fromOffset, offsets.untilOffset)
        except Exception:
            cc_log()
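# A minimal usage sketch (an assumption, not shown in this section): wiring
# storeOffsetRanges into a Spark Streaming Kafka direct stream, since only the
# KafkaRDDs of a direct stream expose offsetRanges(). The StreamingContext
# `ssc`, the topic name, and the broker address are hypothetical placeholders.
def attach_offset_tracking(ssc):
    from pyspark.streaming.kafka import KafkaUtils
    kafka_stream = KafkaUtils.createDirectStream(
        ssc, ["filequeue"], {"metadata.broker.list": "localhost:9092"})
    # Read the offsets in the first action on each micro-batch, before any
    # transformation replaces the underlying KafkaRDD.
    kafka_stream.foreachRDD(storeOffsetRanges)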
def file_processor(msg: dict, data_path: str) -> DataStream:
    """
    Build a DataStream from the gzipped data file referenced by a Kafka message.

    :param msg: Kafka message carrying the stream metadata and filename
    :param data_path: directory that contains the gzipped data files
    :return: a DataStream, or None if the file could not be processed
    """
    print("in file_processor")
    if not isinstance(msg["metadata"], dict):
        metadata_header = json.loads(msg["metadata"])
    else:
        metadata_header = msg["metadata"]

    identifier = metadata_header["identifier"]
    owner = metadata_header["owner"]
    name = metadata_header["name"]
    data_descriptor = metadata_header["data_descriptor"]
    execution_context = metadata_header["execution_context"]
    annotations = metadata_header.get("annotations", {})
    stream_type = metadata_header.get("stream_type", "ds")

    try:
        gzip_file_content = get_gzip_file_contents(data_path + msg["filename"])
        datapoints = [row_to_datapoint(row) for row in gzip_file_content.splitlines()]
        rename_file(data_path + msg["filename"])
        start_time = datapoints[0].start_time
        end_time = datapoints[-1].end_time
        return DataStream(identifier, owner, name, data_descriptor,
                          execution_context, annotations, stream_type,
                          start_time, end_time, datapoints)
    except Exception as e:
        error_log = ("In Kafka preprocessor - Error in processing file: "
                     + str(msg["filename"]) + " Owner-ID: " + owner
                     + " Stream Name: " + name + " - " + str(e))
        cc_log(error_log, "MISSING_DATA")
        return None
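# A minimal sketch (assumed, not from the source) of the message shape that
# file_processor expects; every field value below is a hypothetical placeholder.
def example_file_processor_call(data_path="/data/"):
    sample_msg = {
        "filename": "example-stream.gz",
        "metadata": json.dumps({
            "identifier": "stream-uuid",
            "owner": "owner-uuid",
            "name": "example-stream-name",
            "data_descriptor": [],
            "execution_context": {},
        }),
    }
    # Returns a DataStream on success, None on failure.
    return file_processor(sample_msg, data_path)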
def store_stream(data: DataStream):
    """
    Store data into Cassandra, MySQL, and InfluxDB.

    :param data: DataStream to persist
    """
    if data:
        try:
            c1 = datetime.now()
            CC.save_datastream(data, "datastream")
            e1 = datetime.now()
            CC.save_datastream_to_influxdb(data)
            i1 = datetime.now()
            print("Cassandra Time: ", e1 - c1, " Influx Time: ", i1 - e1,
                  " Batch size: ", len(data.data))
        except Exception:
            cc_log()
def extract_info(msg: dict, data_path: str):
    """
    Return the message's filename if the file's last datapoint falls within
    the current processing window (cur_time), otherwise None.
    """
    global cur_time
    try:
        filename = msg["filename"]
        gzip_file_content = get_gzip_file_contents(data_path + filename)
        datapoints = [row_to_datapoint_cus(row) for row in gzip_file_content.splitlines()]
        end_time = datapoints[-1]["time"]
        end_time = datetime.strptime(end_time, "%Y-%m-%d %H:%M:%S").timestamp()
        # Keep only files whose data extends into the current window.
        if end_time >= cur_time:
            return filename
        return None
    except Exception as e:
        error_log = ("In Kafka preprocessor - Error in processing file: "
                     + str(msg["filename"]) + " - " + str(e))
        cc_log(error_log, "MISSING_DATA")
        return None
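# A minimal sketch (assumed, not from the source) of how extract_info filters a
# batch of Kafka messages against the current window; the one-hour window and
# the data path are hypothetical placeholders.
def example_window_filter(messages, data_path="/data/"):
    global cur_time
    cur_time = datetime.now().timestamp() - 3600  # hypothetical 1-hour window
    # Collect the filenames of messages whose data reaches into the window.
    return [f for f in (extract_info(m, data_path) for m in messages) if f]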
def store_data_to_influxdb(self, datastream: DataStream):
    """
    Write all datapoints of a DataStream to InfluxDB as measurement points.

    :param datastream: DataStream whose datapoints are converted and written
    """
    client = InfluxDBClient(host=self.influxdbIP, port=self.influxdbPort,
                            username=self.influxdbUser, password=self.influxdbPassword,
                            database=self.influxdbDatabase)

    datapoints = datastream.data
    stream_identifier = datastream.identifier
    stream_owner_id = datastream.owner
    stream_owner_name = Metadata(self.CC_obj).owner_id_to_name(stream_owner_id)
    stream_name = datastream.name

    if datastream.data_descriptor:
        data_descriptor = datastream.data_descriptor
        total_dd_columns = len(data_descriptor)
    else:
        data_descriptor = []
        total_dd_columns = 0

    influx_data = []
    for datapoint in datapoints:
        point = {
            'measurement': stream_name,
            'tags': {'stream_id': stream_identifier,
                     'owner_id': stream_owner_id,
                     'owner_name': stream_owner_name},
            'time': datapoint.start_time,
            'fields': {},
        }

        # Normalize the sample into a list of floats where possible; fall back
        # to the raw value if it cannot be parsed.
        values = datapoint.sample
        if isinstance(values, tuple):
            values = list(values)
        else:
            try:
                values = [float(values)]
            except Exception:
                try:
                    values = list(map(float, values.split(',')))
                except Exception:
                    pass  # leave the sample as-is

        try:
            if isinstance(values, list):
                for i, sample_val in enumerate(values):
                    # Use the data descriptor's column name when the sample
                    # width matches the descriptor; otherwise a generic name.
                    if len(values) == total_dd_columns and "NAME" in data_descriptor[i]:
                        point['fields'][data_descriptor[i]["NAME"]] = sample_val
                    else:
                        point['fields']['value_' + str(i)] = sample_val
            else:
                if not values:
                    values = "NULL"
                try:
                    values = float(values)
                except Exception:
                    pass
                if data_descriptor and "NAME" in data_descriptor[0]:
                    point['fields'][data_descriptor[0]["NAME"]] = values
                else:
                    point['fields']['value_0'] = values
        except Exception:
            # Last resort: store the sample as a JSON string or plain string.
            try:
                point['fields']['value_0'] = json.dumps(values)
            except Exception:
                cc_log("Datapoint sample values conversion: " + str(values), "WARNING")
                point['fields']['value_0'] = str(values)

        influx_data.append(point)

    try:
        client.write_points(influx_data)
    except Exception:
        cc_log()
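# For reference, each dict appended to influx_data above follows the payload
# format of the influxdb client's write_points(); the values in this sketch
# are illustrative placeholders, not real stream data.
#
# {
#     'measurement': 'example-stream-name',
#     'tags': {'stream_id': 'stream-uuid',
#              'owner_id': 'owner-uuid',
#              'owner_name': 'example-owner'},
#     'time': datetime(2017, 1, 1, 0, 0),
#     'fields': {'value_0': 1.0}
# }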