def get_next_dir(self, cur_dir):
    """Return the directory of the partition that follows ``cur_dir``.

    The file tree's root name is removed from ``cur_dir``, the remainder is
    parsed into a timestamp, and the timestamp is moved forward by one
    partition unit selected by ``self.PARTITIONER_TYPE`` (year, month, day,
    hour, and one minute for any other type).

    Args:
        cur_dir: directory name of the current partition.

    Returns:
        The directory name of the next partition if
        ``FileUtil.does_dir_exist`` reports that it exists, else ``None``.
    """
    relative_dir = cur_dir.replace(self._file_tree.get_root_name(), '')
    cur_time = FileUtil.parse_dir_to_timestamp(dir_name=relative_dir)

    partition_type = self.PARTITIONER_TYPE
    if partition_type == PartitionerStorageType.YEARLY:
        next_time = datetime.datetime(cur_time.year + 1, 1, 1)
    elif partition_type == PartitionerStorageType.MONTHLY:
        # divmod turns December (12) into a carry of one year and month index 0.
        year_carry, month_index = divmod(cur_time.month, 12)
        next_time = datetime.datetime(
            cur_time.year + year_carry, month_index + 1, 1)
    else:
        if partition_type == PartitionerStorageType.DAILY:
            step = datetime.timedelta(days=1)
        elif partition_type == PartitionerStorageType.HOURLY:
            step = datetime.timedelta(hours=1)
        else:
            step = datetime.timedelta(minutes=1)
        next_time = cur_time + step

    # Keep only as many path components as the partition type's height.
    components = FileUtil.parse_timestamp_to_dir(timestamp=next_time).split('/')
    height = self.PARTITIONER_TYPE_TO_HEIGHT_MAP[self.PARTITIONER_TYPE]
    next_dir_name = FileUtil.join_paths_to_dir(
        root_dir=self._file_tree.get_root_name(),
        base_name='/'.join(components[:height]))

    return next_dir_name if FileUtil.does_dir_exist(
        dir_name=next_dir_name) else None
def test_parse_dir_to_timestamp(self):
    """Check ``FileUtil.parse_dir_to_timestamp`` on full and partial directory names."""
    # (directory name, expected timestamp) pairs, checked in this order.
    cases = (
        ('2020/01/01/12/30', datetime.datetime(2020, 1, 1, 12, 30)),
        ('2020/01/01/00/00', datetime.datetime(2020, 1, 1)),
        ('2020/01', datetime.datetime(2020, 1, 1)),
    )
    for dir_name, expected in cases:
        self.assertEqual(
            FileUtil.parse_dir_to_timestamp(dir_name=dir_name),
            expected
        )
def get_previous_dir(self, cur_dir):
    """Return the directory of the partition that precedes ``cur_dir``.

    The partitioner is first re-initialized from its own directory. Then the
    file tree's root name is removed from ``cur_dir``, the remainder is parsed
    into a timestamp, and the timestamp is moved back by one partition unit
    selected by ``self.PARTITIONER_TYPE`` (year, month, day, hour, and one
    minute for any other type).

    Args:
        cur_dir: directory name of the current partition.

    Returns:
        The directory name of the previous partition if
        ``FileUtil.does_dir_exist`` reports that it exists, else ``None``.
    """
    self.initialize_from_dir(dir_name=self.get_dir_name())
    relative_dir = cur_dir.replace(self._file_tree.get_root_name(), '')
    cur_time = FileUtil.parse_dir_to_timestamp(dir_name=relative_dir)

    partition_type = self.PARTITIONER_TYPE
    if partition_type == PartitionerStorageType.YEARLY:
        pre_time = datetime.datetime(cur_time.year - 1, 1, 1)
    elif partition_type == PartitionerStorageType.MONTHLY:
        # divmod turns January (month - 2 == -1) into a borrow of one year
        # and month index 11, i.e. December.
        year_borrow, month_index = divmod(cur_time.month - 2, 12)
        pre_time = datetime.datetime(
            cur_time.year + year_borrow, month_index + 1, 1)
    else:
        if partition_type == PartitionerStorageType.DAILY:
            step = datetime.timedelta(days=1)
        elif partition_type == PartitionerStorageType.HOURLY:
            step = datetime.timedelta(hours=1)
        else:
            step = datetime.timedelta(minutes=1)
        pre_time = cur_time - step

    # Keep only as many path components as the partition type's height.
    components = FileUtil.parse_timestamp_to_dir(timestamp=pre_time).split('/')
    height = self.PARTITIONER_TYPE_TO_HEIGHT_MAP[self.PARTITIONER_TYPE]
    last_dir_name = FileUtil.join_paths_to_dir(
        root_dir=self._file_tree.get_root_name(),
        base_name='/'.join(components[:height]))

    # The RPC counter is bumped right before the existence check.
    self.increment_rpc_count_by(n=1)
    return last_dir_name if FileUtil.does_dir_exist(
        dir_name=last_dir_name) else None
def _cmp_dir_by_timestamp(self, dir_name_1, dir_name_2):
    """Tell whether ``dir_name_1`` is strictly earlier than ``dir_name_2``.

    Both names have the file tree's root name removed before comparison.
    ``dir_name_2`` is cut down to the same number of path components as
    ``dir_name_1`` so the two timestamps are compared at equal depth.

    Args:
        dir_name_1: first directory name.
        dir_name_2: second directory name.

    Returns:
        ``False`` when ``dir_name_2`` is empty once the root is removed, or
        when ``dir_name_1`` has more path components than ``dir_name_2``;
        otherwise whether the timestamp parsed from ``dir_name_1`` is less
        than the one parsed from the shortened ``dir_name_2``.
    """
    first = dir_name_1.replace(self._file_tree.get_root_name(), '')
    second = dir_name_2.replace(self._file_tree.get_root_name(), '')
    if not second:
        return False

    first = FileUtil.normalize_dir_name(dir_name=first)
    second = FileUtil.normalize_dir_name(dir_name=second)

    # The last piece of each split is discarded -- presumably the empty
    # string after a trailing '/' left by normalize_dir_name (TODO confirm).
    first_parts = first.split('/')[:-1]
    second_parts = second.split('/')[:-1]
    depth = len(first_parts)
    if depth > len(second_parts):
        return False

    second = FileUtil.normalize_dir_name('/'.join(second_parts[:depth]))
    first_timestamp = FileUtil.parse_dir_to_timestamp(dir_name=first)
    second_timestamp = FileUtil.parse_dir_to_timestamp(dir_name=second)
    return first_timestamp < second_timestamp
def read_range(self, params):
    """Read every file of the partitions between two timestamps.

    The requested range is truncated to the partition granularity given by
    ``self.PARTITIONER_TYPE`` and clamped to the oldest and latest
    directories of this partitioner. Each partition directory in the range
    that exists is listed and every file in it is read.

    The method waits until ``self._writer_status`` is ``Status.IDLE``, marks
    the reader as ``Status.RUNNING`` while it works, and always puts the
    reader back to ``Status.IDLE`` before leaving -- including on the early
    "nothing to read" return and when an exception is raised.

    Args:
        params: dict with ``'start_time'`` and ``'end_time'`` datetimes,
            where ``start_time <= end_time``.

    Returns:
        A dict mapping each file name to its content (``read_all()`` for
        proto table storage, ``read(params={'num_line': -1})`` otherwise).
        An empty dict when the oldest or latest directory is unavailable.

    Raises:
        AssertionError: if ``params`` lacks a key or the range is inverted.
        StorageReadException: if anything fails while reading the range.
    """
    def _reformat_time(timestamp):
        # Truncate the timestamp to the start of its partition and drop tzinfo.
        if self.PARTITIONER_TYPE == PartitionerStorageType.YEARLY:
            timestamp = timestamp.replace(month=1, day=1, hour=0, minute=0,
                                          second=0, microsecond=0, tzinfo=None)
        elif self.PARTITIONER_TYPE == PartitionerStorageType.MONTHLY:
            timestamp = timestamp.replace(day=1, hour=0, minute=0, second=0,
                                          microsecond=0, tzinfo=None)
        elif self.PARTITIONER_TYPE == PartitionerStorageType.DAILY:
            timestamp = timestamp.replace(hour=0, minute=0, second=0,
                                          microsecond=0, tzinfo=None)
        elif self.PARTITIONER_TYPE == PartitionerStorageType.HOURLY:
            timestamp = timestamp.replace(minute=0, second=0, microsecond=0,
                                          tzinfo=None)
        else:
            timestamp = timestamp.replace(second=0, microsecond=0, tzinfo=None)
        return timestamp

    def _next_partition_start(timestamp):
        # Move the timestamp forward by exactly one partition unit.
        if self.PARTITIONER_TYPE == PartitionerStorageType.YEARLY:
            return timestamp.replace(year=timestamp.year + 1, month=1, day=1)
        if self.PARTITIONER_TYPE == PartitionerStorageType.MONTHLY:
            if timestamp.month == 12:
                return timestamp.replace(year=timestamp.year + 1, month=1,
                                         day=1)
            return timestamp.replace(month=timestamp.month + 1)
        if self.PARTITIONER_TYPE == PartitionerStorageType.DAILY:
            return timestamp + datetime.timedelta(days=1)
        if self.PARTITIONER_TYPE == PartitionerStorageType.HOURLY:
            return timestamp + datetime.timedelta(hours=1)
        return timestamp + datetime.timedelta(minutes=1)

    assert 'start_time' in params and 'end_time' in params and params[
        'start_time'] <= params['end_time']

    while self._writer_status != Status.IDLE:
        self.sys_log("Waiting for writer to finish.")
        time.sleep(TimeSleepObj.ONE_SECOND)

    self._reader_status = Status.RUNNING
    try:
        oldest_dir, latest_dir = self.get_oldest_dir(), self.get_latest_dir()
        if not latest_dir or not oldest_dir:
            if self.is_empty():
                self._logger.warning("Current partitioner [" + self.get_dir_name() +
                                     "] is empty, cannot read anything.")
                self.sys_log("Current partitioner [" + self.get_dir_name() +
                             "] is empty, cannot read anything.")
            # Without both boundary directories there is no range to clamp
            # to, so nothing can be read.
            return {}

        oldest_dir = oldest_dir.replace(self._file_tree.get_root_name(), '')
        latest_dir = latest_dir.replace(self._file_tree.get_root_name(), '')
        oldest_timestamp = FileUtil.parse_dir_to_timestamp(dir_name=oldest_dir)
        latest_timestamp = FileUtil.parse_dir_to_timestamp(dir_name=latest_dir)
        start_time = max(_reformat_time(params['start_time']), oldest_timestamp)
        end_time = min(_reformat_time(params['end_time']), latest_timestamp)

        result = {}
        try:
            while start_time <= end_time:
                # Keep only as many path components as the partition height.
                dir_list = FileUtil.parse_timestamp_to_dir(
                    timestamp=start_time).split('/')
                dir_name = '/'.join(
                    dir_list[:self.PARTITIONER_TYPE_TO_HEIGHT_MAP[
                        self.PARTITIONER_TYPE]])
                dir_name = FileUtil.join_paths_to_dir(
                    root_dir=self._file_tree.get_root_name(),
                    base_name=dir_name)

                if FileUtil.does_dir_exist(dir_name=dir_name):
                    if self._underlying_storage.get_storage_type(
                    ) == StorageType.PROTO_TABLE_STORAGE:
                        storage = ProtoTableStorage()
                    else:
                        storage = DefaultStorage()
                    # One storage object is re-initialized for each file.
                    for file_name in FileUtil.list_files_in_dir(
                            dir_name=dir_name):
                        storage.initialize_from_file(file_name=file_name)
                        if storage.get_storage_type(
                        ) == StorageType.PROTO_TABLE_STORAGE:
                            result[file_name] = storage.read_all()
                        else:
                            result[file_name] = storage.read(
                                params={'num_line': -1})

                start_time = _next_partition_start(start_time)
            return result
        except Exception as err:
            message = ("Read range in dir [" + self.get_dir_name() +
                       "] got exception " + str(err) + '.')
            self.sys_log(message)
            self._logger.error(message)
            raise StorageReadException(message) from err
    finally:
        # Always release the reader, whatever path leaves this method.
        self._reader_status = Status.IDLE
def get_dir_in_timestamp(self, dir_name):
    """Convert a directory name into the timestamp it stands for.

    Args:
        dir_name: directory name, possibly containing the file tree's root
            name.

    Returns:
        The result of ``FileUtil.parse_dir_to_timestamp`` on the name with
        the root name removed, or ``None`` when nothing is left after the
        removal.
    """
    relative_dir = dir_name.replace(self._file_tree.get_root_name(), '')
    if not relative_dir:
        return None
    return FileUtil.parse_dir_to_timestamp(dir_name=relative_dir)
def read_range(self, params):
    """Read every file of the partitions between two timestamps.

    The partitioner is re-initialized from its own directory, the requested
    range is truncated to the granularity given by ``self.PARTITIONER_TYPE``
    and clamped to the oldest and latest directories. File names are
    collected partition by partition and then fetched in one call through
    ``gclient_ext``.

    Args:
        params: dict with ``'start_time'`` and ``'end_time'`` datetimes,
            where ``start_time <= end_time``.

    Returns:
        A dict keyed by file name. For proto table storage each value is
        ``dict(message.data)``; otherwise it is the text content with
        trailing whitespace stripped and split on newlines. An empty dict
        when the oldest or latest directory is unavailable.

    Raises:
        AssertionError: if ``params`` lacks a key or the range is inverted.
        StorageReadException: if collecting or reading the files fails.
    """
    self.initialize_from_dir(dir_name=self.get_dir_name())

    def _reformat_time(timestamp):
        # Truncate to the start of the partition; tzinfo is always dropped.
        fields = {'second': 0, 'microsecond': 0, 'tzinfo': None}
        kind = self.PARTITIONER_TYPE
        if kind == PartitionerStorageType.YEARLY:
            fields.update(month=1, day=1, hour=0, minute=0)
        elif kind == PartitionerStorageType.MONTHLY:
            fields.update(day=1, hour=0, minute=0)
        elif kind == PartitionerStorageType.DAILY:
            fields.update(hour=0, minute=0)
        elif kind == PartitionerStorageType.HOURLY:
            fields.update(minute=0)
        return timestamp.replace(**fields)

    def _next_partition_start(moment):
        # Move forward by exactly one partition unit.
        kind = self.PARTITIONER_TYPE
        if kind == PartitionerStorageType.YEARLY:
            return moment.replace(year=moment.year + 1, month=1, day=1)
        if kind == PartitionerStorageType.MONTHLY:
            if moment.month == 12:
                return moment.replace(year=moment.year + 1, month=1, day=1)
            return moment.replace(month=moment.month + 1)
        if kind == PartitionerStorageType.DAILY:
            return moment + datetime.timedelta(days=1)
        if kind == PartitionerStorageType.HOURLY:
            return moment + datetime.timedelta(hours=1)
        return moment + datetime.timedelta(minutes=1)

    assert 'start_time' in params and 'end_time' in params and params[
        'start_time'] <= params['end_time']

    oldest_dir = self._get_oldest_dir_in_root_directory_interal()
    latest_dir = self._get_latest_dir_internal()
    if not (latest_dir and oldest_dir):
        empty_message = ("Current partitioner [" + self.get_dir_name() +
                         "] is empty, cannot read anything.")
        self._logger.warning(empty_message)
        self._SYS_LOGGER.warning(empty_message)
        return {}

    oldest_timestamp = FileUtil.parse_dir_to_timestamp(
        dir_name=oldest_dir.replace(self._file_tree.get_root_name(), ''))
    latest_timestamp = FileUtil.parse_dir_to_timestamp(
        dir_name=latest_dir.replace(self._file_tree.get_root_name(), ''))
    cursor = max(_reformat_time(params['start_time']), oldest_timestamp)
    end_time = min(_reformat_time(params['end_time']), latest_timestamp)

    try:
        all_file_names = []
        while cursor <= end_time:
            # Keep only as many path components as the partition height.
            components = FileUtil.parse_timestamp_to_dir(
                timestamp=cursor).split('/')
            height = self.PARTITIONER_TYPE_TO_HEIGHT_MAP[self.PARTITIONER_TYPE]
            dir_name = FileUtil.join_paths_to_dir(
                root_dir=self._file_tree.get_root_name(),
                base_name='/'.join(components[:height]))
            try:
                self.increment_rpc_count_by(n=1)
                all_file_names.extend(
                    FileUtil.list_files_in_dir(dir_name=dir_name))
            except Exception as _:
                # Best effort: a partition that cannot be listed is skipped.
                pass
            cursor = _next_partition_start(cursor)

        self.increment_rpc_count_by(n=1)
        if self._underlying_storage.get_storage_type(
        ) == StorageType.PROTO_TABLE_STORAGE:
            fetched = gclient_ext.read_proto_messages(
                paths=all_file_names, message_type=ProtoTable)
            return {
                file_name: dict(message.data)
                for file_name, message in fetched.items()
            }

        fetched = gclient_ext.read_txts(all_file_names)
        return {
            file_name: text.rstrip().split('\n')
            for file_name, text in fetched.items()
        }
    except Exception as err:
        error_message = ("Read range in dir [" + self.get_dir_name() +
                         "] got exception " + str(err) + '.')
        self._SYS_LOGGER.error(error_message)
        self._logger.error(error_message)
        raise StorageReadException(error_message)