def __GetTimeOutForTask(cls, task):
    """Returns the timeout for the task.

    Looks for a per-task '<task>.timeout' file first, then a 'timeout' file
    in the task's directory, and finally falls back to the --timeout flag.
    The file contents may carry a unit suffix: d (days), h (hours),
    m (minutes), ms (milliseconds), us (microseconds); no suffix (or 's')
    means seconds.

    Args:
      task: string: The task for which the timeout should be prepared.

    Returns:
      int: The timeout in seconds.
    """
    # Per-task timeout file takes precedence over the directory-level one.
    timeout = FileUtils.FileContents(task + '.timeout')
    if not timeout:
        timeout = FileUtils.FileContents(
            os.path.join(PipelineUtils.TaskDirName(task), 'timeout'))
    if not timeout:
        return Flags.ARGS.timeout

    # Drop all whitespace, then split on the digits so we get
    # ['', <digits>, <unit suffix>] for a well-formed value.
    # NOTE: raw strings so the regex escapes survive newer Python versions.
    timeout = re.sub(r'\s*', '', timeout)
    timeout_parts = re.split(r'(\d+)', timeout)
    if len(timeout_parts) < 3:
        TermColor.Warning('Ignoring invalid timeout [%s] for task: %s' %
                          (timeout, task))
        return Flags.ARGS.timeout

    timeout = float(timeout_parts[1])
    annotation = timeout_parts[2]
    # Unit multipliers to convert to seconds. Any unrecognized (or empty)
    # suffix falls through to 1, i.e. the value is treated as seconds —
    # same behavior as the original elif chain.
    multipliers = {
        'd': 86400,
        'h': 3600,
        'm': 60,
        'ms': 0.001,
        'us': 0.000001,
    }
    return timeout * multipliers.get(annotation, 1)
def read_js_version(javascript_version_file=None):
    '''Read the JS version stamp attached when building the production JS
    bundles to improve frontend error reporting. If no version file was
    added, we do not support versioning for this deployment.

    Args:
      javascript_version_file: Optional explicit path to the version file.
        Defaults to the resolved path of web/public/build/version.txt.

    Returns:
      The stripped version string, or '' when versioning is unsupported or
      the file cannot be read.
    '''
    javascript_version_file = javascript_version_file or FileUtils.GetAbsPathForFile(
        'web/public/build/version.txt')
    # Only production has JS versions. If we are in production but the version
    # file does not exist, FileUtils will not resolve the absolute path and
    # will return None. Some deployments do not support versioning.
    if not IS_PRODUCTION or not javascript_version_file:
        return ''
    # Guard against a None/empty read (e.g. an explicitly passed path to a
    # missing file) instead of raising AttributeError on .strip().
    contents = FileUtils.FileContents(javascript_version_file)
    return contents.strip() if contents else ''
def build_indexing_task(version):
    """Build a DruidIndexingTaskBuilder from command line flags.

    Args:
      version: Optional explicit version string to pin on the indexing task.

    Returns:
      A fully configured DruidIndexingTaskBuilder.

    Raises:
      RuntimeError: If no input files match the --data_files patterns.
    """
    # Create a set of absolute paths for the input path list.
    input_paths = Flags.ARGS.data_files
    cwd = os.getcwd()
    full_paths = set()
    for path in input_paths:
        full_paths.update(build_absolute_paths(path, cwd))
    # Raise explicitly instead of `assert`: asserts are stripped under -O,
    # and an empty input set must always abort indexing.
    if not full_paths:
        raise RuntimeError('No matching paths found for indexing!')

    # Parse the task definition overrides if specified. Defaults to None.
    task_template_json = FileUtils.FileContents(Flags.ARGS.task_template_file)
    metrics_spec_json = FileUtils.FileContents(Flags.ARGS.metrics_spec_file)
    tuning_config_json = FileUtils.FileContents(Flags.ARGS.tuning_config_file)

    # If no datasource name is specified, generate a valid site datasource
    # and use its name.
    datasource_name = (Flags.ARGS.datasource_name or
                       SiteDruidDatasource(DEPLOYMENT_NAME, TODAY).name)
    min_data_date = datetime.datetime.strptime(Flags.ARGS.min_data_date,
                                               DRUID_DATE_FORMAT)
    max_data_date = datetime.datetime.strptime(Flags.ARGS.max_data_date,
                                               DRUID_DATE_FORMAT)
    return DruidIndexingTaskBuilder(
        datasource_name,
        DIMENSIONS,
        BaseRowType.DATE_FIELD,
        full_paths,
        min_data_date,
        max_data_date,
        task_template_json,
        metrics_spec_json,
        tuning_config_json,
        version,
    )
def task_contains_new_data(indexing_task, cur_datasource, cur_version):
    """Return True when the indexing task would index different data than
    what is recorded for the current datasource/version."""
    # Fail fast if the hash storage directory has not been set up.
    if not os.path.isdir(Flags.ARGS.task_hash_dir):
        raise RuntimeError('You need to create the task hash dir, %s' %
                           Flags.ARGS.task_hash_dir)

    # Check to see if the current datasource has an indexing hash we can
    # compare to. A missing hash file means nothing was recorded before,
    # so the task is treated as new data.
    stored_hash_file = get_hash_storage_path(cur_datasource, cur_version)
    if not os.path.isfile(stored_hash_file):
        return True

    # Each line of the hash file contains a separate file hash. Compare
    # the current file hashes with the new file hashes to see if there is a
    # difference.
    # NOTE(stephen): Intentionally not using a set here since it's possible
    # for an indexing job to index the same file twice on purpose.
    stored_hashes = sorted(FileUtils.FileContents(stored_hash_file).split('\n'))
    current_hashes = sorted(indexing_task.get_file_hashes())
    return stored_hashes != current_hashes
class DruidIndexingTaskBuilder(object):
    """Builds a Druid indexing task definition for a set of input files.

    The task is produced by substituting placeholders in a JSON task
    template with the datasource name, date range, dimensions, metrics
    spec, and tuning config, then parsing the result.
    """

    # Default template contents, read once at class-definition time.
    _DEFAULT_METRICS_SPEC = FileUtils.FileContents(DEFAULT_METRICS_SPEC_FILE)
    _DEFAULT_TASK_TEMPLATE = FileUtils.FileContents(DEFAULT_TASK_TEMPLATE_FILE)
    _DEFAULT_TUNING_CONFIG = FileUtils.FileContents(DEFAULT_TUNING_CONFIG_FILE)

    def __init__(
        self,
        datasource_name,
        dimensions,
        date_column,
        paths,
        start_date,
        end_date,
        task_template_json=None,
        metrics_spec_json=None,
        tuning_config_json=None,
        version=None,
    ):
        self._datasource = datasource_name
        self._dimensions = dimensions
        self._date_column = date_column
        self._start_date = start_date.strftime(DRUID_DATE_FORMAT)
        self._end_date = end_date.strftime(DRUID_DATE_FORMAT)
        # Fall back to the class-level defaults when overrides are absent.
        # Use the bound .strip() method rather than the unidiomatic
        # str.strip(x) form (identical behavior for str inputs).
        self._task_template_json = (
            task_template_json or self.default_task_template()).strip()
        self._metrics_spec_json = (
            metrics_spec_json or self.default_metrics_spec()).strip()
        self._tuning_config_json = (
            tuning_config_json or self.default_tuning_config()).strip()
        self._version = version

        # If an explicit version has been set, add it to the tuning config.
        if version:
            tuning_config = json.loads(self._tuning_config_json)
            tuning_config['version'] = version
            tuning_config['useExplicitVersion'] = True
            self._tuning_config_json = json.dumps(tuning_config)

        # Validate the input file paths before building the path input spec.
        # Plain loop instead of a throwaway side-effect comprehension.
        for path in paths:
            _validate_file_path(path)
        self._paths = paths
        self._input_spec = build_input_spec(paths)
        self._task_dict = self._build_task()

    def _build_task(self):
        # Substitute every template placeholder, then parse the final JSON.
        raw_json = (self._task_template_json.replace(
            '{{INPUT_SPEC_JSON}}', self._input_spec).replace(
                '{{DATASOURCE_NAME}}', self.datasource).replace(
                    '{{DATA_START_DATE}}', self._start_date).replace(
                        '{{DATA_END_DATE}}', self._end_date).replace(
                            '{{DIMENSIONS_JSON}}',
                            json.dumps(self._dimensions)).replace(
                                '{{DATE_COLUMN_NAME}}',
                                self._date_column).replace(
                                    '{{METRICS_SPEC_JSON}}',
                                    self._metrics_spec_json).replace(
                                        '{{TUNING_CONFIG_JSON}}',
                                        self._tuning_config_json).strip())
        return json.loads(raw_json)

    @property
    def datasource(self):
        return self._datasource

    @property
    def version(self):
        return self._version

    @property
    def task_definition(self):
        return self._task_dict

    def get_task_hash(self):
        """Compute a reproducible representation of the files designated
        for ingestion."""
        return '\n'.join(sorted(self.get_file_hashes()))

    def get_file_hashes(self):
        """Create a list of file hashes for the set of files to be indexed."""
        return [compute_file_hash(p) for p in self._paths]

    def print_overview(self):
        """Print a human readable overview of what this indexing task will do."""
        print('Indexing Task Overview')
        print('Datasource: %s' % self.datasource)
        print('Version: %s' % self.version)
        print('Dimensions: %s' % json.dumps(sorted(self._dimensions), indent=2))
        print('Date column: %s' % self._date_column)
        print('Start date: %s' % self._start_date)
        print('End date: %s' % self._end_date)
        print('Paths: %s' % json.dumps(sorted(self._paths), indent=2))

    @classmethod
    def default_metrics_spec(cls):
        return cls._DEFAULT_METRICS_SPEC

    @classmethod
    def default_task_template(cls):
        return cls._DEFAULT_TASK_TEMPLATE

    @classmethod
    def default_tuning_config(cls):
        return cls._DEFAULT_TUNING_CONFIG
import os

from pylib.file.file_utils import FileUtils

# Detect if the current code is running within EC2.
# The DMI BIOS version exposed through sysfs contains 'amazon' on EC2
# instances; the file simply does not exist on hosts without DMI/sysfs.
_BIOS_VERSION_FILE = '/sys/devices/virtual/dmi/id/bios_version'

# NOTE(review): assumes FileUtils.FileContents never returns None once
# os.path.isfile is True — otherwise the `in` test would raise TypeError;
# confirm against FileUtils' behavior on unreadable files.
RUNNING_IN_EC2 = os.path.isfile(
    _BIOS_VERSION_FILE
) and 'amazon' in FileUtils.FileContents(_BIOS_VERSION_FILE)