import os

import sentry_sdk
from loguru import logger


class Config:
    # Environment-driven configuration; all values are read once at import time.
    env = os.environ

    if not env.get("SENTRY_DSN"):
        logger.warning("SENTRY_DSN is not set")
    else:
        sentry_sdk.init(env.get("SENTRY_DSN"))

    if not env.get("BOUNDARY"):
        logger.warning("BOUNDARY will be set to 65")
        BOUNDARY = 65
    else:
        BOUNDARY = env.get("BOUNDARY")

    if not env.get("DEVICE_ENV"):
        logger.warning("DEVICE_ENV will be set to an empty string")
        DEVICE_ENV = ''
    else:
        DEVICE_ENV = env.get("DEVICE_ENV")

    if not env.get("DATA_FOLDER"):
        logger.error("DATA_FOLDER is not set")
    else:
        DIRECTORY = env.get("DATA_FOLDER")

    if not env.get("ANALYSIS_FOLDER"):
        logger.error("ANALYSIS_FOLDER is not set")
    else:
        ANALYSIS = open(f"{env.get('ANALYSIS_FOLDER')}/analysis.txt", 'w')

    # Feature flags: at least one of INSPECT_DATA / INSPECT_DROPPED must end up True.
    # Unset variables are treated as empty strings so .upper() never raises on None.
    if not env.get("INSPECT_DATA") and not env.get("INSPECT_DROPPED"):
        INSPECT_DROPPED = True
        INSPECT_DATA = True
        logger.warning("The application will run all features\n")
    elif (env.get("INSPECT_DATA") is None or env.get("INSPECT_DATA").upper() == "FALSE") \
            and (env.get("INSPECT_DROPPED") or "").upper() == "TRUE":
        INSPECT_DROPPED = True
        INSPECT_DATA = False
        logger.info("The application will inspect dropped updates\n")
    elif (env.get("INSPECT_DATA") or "").upper() == "TRUE" and \
            (env.get("INSPECT_DROPPED") is None or env.get("INSPECT_DROPPED").upper() == "FALSE"):
        INSPECT_DATA = True
        INSPECT_DROPPED = False
        logger.info("The application will inspect data\n")
    elif (env.get("INSPECT_DATA") or "").upper() == "TRUE" and \
            (env.get("INSPECT_DROPPED") or "").upper() == "TRUE":
        INSPECT_DATA = True
        INSPECT_DROPPED = True
        logger.info("The application will run all features\n")
    else:
        logger.error(
            f"Invalid combination of INSPECT_DATA and INSPECT_DROPPED\n"
            f"INSPECT_DATA: {env.get('INSPECT_DATA')}\n"
            f"INSPECT_DROPPED: {env.get('INSPECT_DROPPED')}"
            f"\nThe application requires at least one of the features to be set"
            f" and to be True")
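# Illustrative only — not part of the original module. Config reads its variables at import
# time, so they must be set before this module is imported. The values below are assumptions
# showing a setup that exercises the "inspect data only" branch above; they are left commented
# out so nothing is overridden here.
#
# os.environ["SENTRY_DSN"] = "<your DSN>"
# os.environ["DATA_FOLDER"] = "/data/raw"
# os.environ["ANALYSIS_FOLDER"] = "/data/analysis"
# os.environ["BOUNDARY"] = "65"
# os.environ["INSPECT_DATA"] = "TRUE"
# os.environ["INSPECT_DROPPED"] = "FALSE"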
def station_count(self):
    """
    Return the number of stations reported by the data points
    :return: list of station counts per hour
    """
    stations = [0] * 24
    for hour in self.data_file.buckets:
        for pt in self.data_file.buckets[hour]:
            try:
                pt_data = json.loads(pt['data'])
                if 'stations' in pt_data:
                    stations[hour] = len(pt_data['stations'])
            except ValueError:
                logger.error(f"Invalid json string: {pt['data']}")
                raise
    logger.info(f"stations: {stations}")
    return stations
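# Illustrative sketch only — the exact payload format is an assumption. The counters in this
# module treat each data point as a dict whose 'data' field is a raw JSON string, roughly:
_example_point = {
    "timestamp": 1600000000,
    "type": "slow",
    "data": '{"stations": ["st-1", "st-2"], "dr_ver": "1.2.3", "neighbors": ["n-1"]}',
}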
def get_device_type(self):
    """
    Return the device type for each raw data file
    :return: dictionary mapping the device id to its detected type
    """
    device_id = os.path.basename(self.data_file.file).partition("-")[0].replace('_', ':')
    device_type = {device_id: "Unknown"}
    for pt in self.data_file.sorted_data:
        if pt['type'] == 'slow':
            pt_data = pt['data']
            # Call the detector once and reuse the result instead of detecting twice.
            detected = device_detection.detect_device_type(device_id, Config.DEVICE_ENV, pt_data)
            if detected is not None:
                device_type[device_id] = detected
                logger.info(f"{device_type}")
                return device_type
    logger.info(f"{device_type}")
    return device_type
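# Illustrative only — the file name below is hypothetical. It shows how the device id is
# derived from the raw data file name by get_device_type() above.
_example_file = "AA_BB_CC_DD_EE_FF-2021-01-01.jsonl"
_example_id = os.path.basename(_example_file).partition("-")[0].replace('_', ':')
# _example_id == "AA:BB:CC:DD:EE:FF"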
def main():
    try:
        for file_path in glob.glob(f"{Config.DIRECTORY}/*.jsonl"):
            Config.ANALYSIS.write(file_path + "\n\n")
            logger.info(f"filename: {file_path}")
            sort_data = read_data(file_path)
            hours_bucket = initialize_buckets(sort_data)
            data_file = DataFile(file_path, sort_data, hours_bucket)
            if Config.INSPECT_DROPPED:
                drop = inspect_dropped.Dropped(data_file)
                drop.run()
            if Config.INSPECT_DATA:
                inspect = inspect_data.Data(data_file)
                inspect.run()
            logger.success(f"file {file_path} completed\n")
    except Exception:
        capture_exception()
        logger.exception("Could not finish reading files")
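# Illustrative sketches only — read_data() and initialize_buckets() are project helpers that
# are defined elsewhere and not shown here. The assumed versions below (hypothetical names,
# underscore-prefixed) illustrate the shapes main() appears to rely on: a timestamp-sorted
# list of point dicts, grouped into 24 hourly buckets.
import json
from datetime import datetime


def _read_data_sketch(file_path):
    """Assumed behaviour: parse one JSON object per line of a .jsonl file, sorted by timestamp."""
    with open(file_path) as fh:
        points = [json.loads(line) for line in fh if line.strip()]
    return sorted(points, key=lambda pt: pt['timestamp'])


def _initialize_buckets_sketch(sorted_data):
    """Assumed behaviour: group data points into buckets keyed by hour of day (0-23)."""
    buckets = {hour: [] for hour in range(24)}
    for pt in sorted_data:
        buckets[datetime.fromtimestamp(pt['timestamp']).hour].append(pt)
    return buckets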
def avg_upd_not_dropped(self):
    """
    Calculate the average time difference between consecutive updates that are not dropped
    :return: average duration in seconds, or 1.0 if every update was dropped
    """
    sum_upd = 0
    count = 0
    for hour in self.data_file.buckets:
        for i in range(len(self.data_file.buckets[hour]) - 1):
            if not is_dropped(self.data_file.buckets[hour][i + 1]['timestamp'],
                              self.data_file.buckets[hour][i]['timestamp']):
                sum_upd += self.data_file.buckets[hour][i + 1]['timestamp'] - \
                    self.data_file.buckets[hour][i]['timestamp']
                count += 1
    if count == 0:
        logger.error("every update is dropped")
        return 1.0
    average = round(sum_upd / count, 2)
    logger.info(f"average upd duration if not dropped: {average}s")
    return average
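# Illustrative sketch only — is_dropped() is a project helper defined elsewhere. The rule
# assumed here (a gap longer than Config.BOUNDARY seconds counts as a dropped update) is a
# guess based on how BOUNDARY is used in this module; the real predicate may differ.
def _is_dropped_sketch(later_ts, earlier_ts):
    """Assumed behaviour: flag the gap between two consecutive timestamps as a dropped update."""
    return (later_ts - earlier_ts) > float(Config.BOUNDARY)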
def avg_upd_dropped(self):
    """
    Calculate the average time difference between consecutive updates that are dropped
    :return: average duration in seconds, or 1 if there are no dropped updates
    """
    sum_upd = 0
    count = 0
    for hour in self.data_file.buckets:
        for i in range(len(self.data_file.buckets[hour]) - 1):
            if is_dropped(self.data_file.buckets[hour][i + 1]['timestamp'],
                          self.data_file.buckets[hour][i]['timestamp']):
                sum_upd += self.data_file.buckets[hour][i + 1]['timestamp'] - \
                    self.data_file.buckets[hour][i]['timestamp']
                count += 1
    if count == 0:
        logger.info("no dropped updates")
        return 1
    average = round(sum_upd / count, 2)
    logger.info(f"average update duration if dropped: {average}s")
    return average
def missing_reg(self):
    """
    Write the estimated number of possible missing regular updates to the analysis file
    :return: dictionary, consecutive reg upd tuple as keys and time diff-hour tuple as values
    """
    keys = []
    values = []
    count = [0] * 24
    for hour in self.data_file.buckets:
        for i in range(len(self.data_file.buckets[hour])):
            data_pt = self.data_file.buckets[hour][i]
            if data_pt['type'] == 'slow':
                # note: when i == 0 the negative index picks up the last point in the bucket
                time_before = self.data_file.buckets[hour][i - 1]['timestamp']
                time_slow = self.data_file.buckets[hour][i]['timestamp']
                if i != len(self.data_file.buckets[hour]) - 1:
                    time_after = self.data_file.buckets[hour][i + 1]['timestamp']
                    missing_reg_interval(keys, values, time_before, time_after, hour)
                else:
                    missing_reg_interval(keys, values, time_before, time_slow, hour)
                if (time_slow - time_before) / float(Config.BOUNDARY) > 1:
                    count[hour] += round((time_slow - time_before) / float(Config.BOUNDARY))
    missing_regular = dict(zip(keys, values))
    logger.info(f"missing regular due to slow updates per hour: {count}")
    logger.info(f"missing regular due to slow updates: {missing_regular}")
    logger.info(f"total missing regular due to slow updates: {sum(count)}")
    Config.ANALYSIS.write("\n")
    return missing_regular
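# Illustrative sketch only — missing_reg_interval() is a project helper defined elsewhere.
# Based on the docstring of missing_reg(), it is assumed to record the surrounding interval
# as a key and a (time difference, hour) tuple as a value; the real helper may do more,
# such as writing to the analysis file.
def _missing_reg_interval_sketch(keys, values, start_ts, end_ts, hour):
    """Assumed behaviour: append one interval entry for a slow update."""
    keys.append((start_ts, end_ts))
    values.append((end_ts - start_ts, hour))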
def latest_dr_ver(self):
    """
    Latest driver version per hour
    :return: list of driver versions per hour
    """
    dr_ver = [None] * 24
    no_dr_ver = []
    for hour in self.data_file.buckets:
        for pt in self.data_file.buckets[hour]:
            try:
                pt_data = json.loads(pt['data'])
                if 'dr_ver' in pt_data:
                    dr_ver[hour] = pt_data['dr_ver']
            except ValueError:
                logger.error(f"Invalid json string: {pt['data']}")
                raise
        if dr_ver[hour] is None:
            no_dr_ver.append(hour)
    if len(no_dr_ver) != 0:
        logger.info(f"no driver version in hours: {no_dr_ver}")
    logger.info(f"driver version: {dr_ver}")
    return dr_ver
def neighbor_count(self):
    """
    Return the number of neighbors reported by the slow data points
    :return: list of neighbor counts per hour
    """
    neighbors = [0] * 24
    prev = 0
    for hour in self.data_file.buckets:
        for pt in self.data_file.buckets[hour]:
            if pt['type'] == "slow":
                try:
                    pt_data = json.loads(pt['data'])
                    neighbors_count = len(pt_data['neighbors'])
                    neighbors[hour] = neighbors_count
                    prev = neighbors_count
                except ValueError:
                    logger.error(f"Invalid json string: {pt['data']}")
                    raise
                except KeyError:
                    logger.error(f"slow update at {pt['timestamp']} does not have neighbors")
        if neighbors[hour] == 0:
            # carry the last known neighbor count into hours without a usable slow update
            neighbors[hour] = prev
    logger.info(f"neighbors: {neighbors}")
    return neighbors