예제 #1
0
 def __init__(self,
              project: str = "brain-deepviz",
              bucket: str = "lucid-flow") -> None:
     """Remember the project and bucket names and set up lazy state.

     Args:
         project: Stored as ``self.project_name``.
         bucket: Stored as ``self.bucket_name``.

     A new temporary directory is created with ``mkdtemp`` and kept as
     ``self.tempdir``. ``_file_list`` and ``_bucket`` start out as ``None``.
     """
     self.project_name, self.bucket_name = project, bucket
     scratch_dir = mkdtemp()
     self.tempdir = AbsolutePath(scratch_dir)
     # Both caches are initialised empty.
     self._file_list = self._bucket = None
예제 #2
0
 def _writing(self, path: AbsolutePath, mode: str = "w+b") -> IO:
     """Yield a local file for writing, then upload it to the bucket.

     The file is opened under ``self.tempdir`` at the location obtained by
     appending ``path.as_relative_path()``; missing parent directories are
     created first.

     Args:
         path: Remote destination; its relative form is used as the blob name.
         mode: Mode passed to ``localfs_open``.

     Yields:
         The open local file handle.

     The handle is closed even if the consumer raises while holding it. The
     upload only runs when the consumer finished without an exception, so a
     partially written file is not uploaded.
     """
     blob = storage.blob.Blob(path.as_relative_path(), self.bucket)
     local_path = self.tempdir.append(path.as_relative_path())
     makedirs(dirname(local_path), exist_ok=True)
     writing_file = localfs_open(local_path, mode=mode)
     try:
         yield writing_file
     finally:
         # Always release the handle, including when the caller's block raised.
         writing_file.close()
     # Only reached on a clean exit from the caller's block.
     blob.upload_from_filename(local_path)
예제 #3
0
def load(path: str, transform: str = "None") -> Sequence:
    """Load the file at ``path`` and optionally split it into lines.

    Args:
        path: Path starting with ``/``; this is asserted before reading.
        transform: When equal to ``"lines"`` the loaded result is split on
            ``"\\n"``. Any other value leaves the result untouched.

    Returns:
        Whatever ``lucid_io_load`` produced for the opened handle, or the
        list of its lines when ``transform == "lines"``.
    """
    assert path.startswith("/")
    # path = PathTemplate.path_template_prefix + raw_path # TODO: rethink
    source = AbsolutePath(path)
    with io.reading(source) as handle:
        loaded = lucid_io_load(handle)
    return loaded.split("\n") if transform == "lines" else loaded
예제 #4
0
 def _glob(self, glob_path: AbsolutePath) -> List[AbsolutePath]:
     """Return the entries of ``self.file_list`` that match ``glob_path``.

     The pattern is tried twice, once as given and once with its trailing
     slash toggled. The two result lists are concatenated in that order
     without sorting or de-duplication.

     Args:
         glob_path: Glob pattern to match against the file list.

     Returns:
         All paths returned by ``self.file_list.glob`` for either variant.
     """
     # GCS returns folders iff a trailing slash is specified, so we try both:
     if glob_path.endswith("/"):
         other_path = AbsolutePath(glob_path[:-1])
     else:
         other_path = AbsolutePath(glob_path + "/")
     matched_paths: List[AbsolutePath] = []
     for glob_string in (glob_path, other_path):
         matched_paths += self.file_list.glob(glob_string)
     return matched_paths
예제 #5
0
 def value_for_output(cls, output: str) -> str:
     """Translate a task's output value into the value that gets stored.

     Strings beginning with ``/`` are converted with
     ``AbsoluteGCSURL.from_absolute_path``; every other string is returned
     unchanged.

     Raises:
         NotImplementedError: if ``output`` is not a ``str``.
     """
     if not isinstance(output, str):
         raise NotImplementedError
     # TODO: decide if we can't find a better way to represent paths
     # E.g. maybe we should just not allow tasks to save their own results.
     # or provide a write_handle or sth.
     if not output.startswith("/"):
         return output
     return AbsoluteGCSURL.from_absolute_path(AbsolutePath(output))
예제 #6
0
 def normpath(self, path: str) -> AbsolutePath:
     """Strip a ``gs://`` scheme and bucket-name prefix from ``path``.

     Each prefix is removed only if present: first the literal ``gs://``,
     then ``self.bucket_name``. Whatever remains, including any leading
     slash, is wrapped in ``AbsolutePath``.
     """
     scheme = "gs://"
     if path.startswith(scheme):
         path = path[len(scheme):]
     bucket_prefix = self.bucket_name
     if path.startswith(bucket_prefix):
         path = path[len(bucket_prefix):]
     # The leading slash is intentionally left in place here.
     return AbsolutePath(path)
예제 #7
0
def test_joining_absolute_paths():
    """``prepend`` and ``append`` must produce equal joined paths."""
    suffix_source = AbsolutePath("/an/absolute/path.ext")
    prefix = AbsolutePath("/prefix/absolute/path")
    suffix = suffix_source.as_relative_path()
    via_prepend = suffix.prepend(prefix)
    via_append = prefix.append(suffix)
    assert via_prepend == via_append
예제 #8
0
 def value_for_input(cls, input: object) -> object:
     """Convert a raw input specification into the value used for a job.

     * ``str``: a string starting with ``/`` is converted with
       ``AbsoluteGCSURL.from_absolute_path``; other strings pass through.
     * ``dict`` with exactly one entry: treated as an aggregating input
       spec. The key is a comma-separated placeholder list and the value a
       path template string. Paths returned by ``io.glob`` for the
       template's glob are matched against the template, and a dict from
       placeholder-value tuples to ``AbsoluteGCSURL`` objects is returned.
     * ``dict`` with any other number of entries, ``int``, ``float``,
       ``tuple``, ``list`` and ``set``: returned unchanged.

     Raises:
         NotImplementedError: for any other type.
     """
     if isinstance(input, str):
         # TODO: here is where we would add convenience loading etc.
         if not input.startswith("/"):
             return input
         return AbsoluteGCSURL.from_absolute_path(AbsolutePath(input))

     if isinstance(input, dict):
         if len(input.items()) != 1:
             logging.debug(
                 "input dict had multiple entries, simply setting value.")
             return input

         # A single entry signifies an aggregating input spec; resolve it.
         logging.debug(
             "input dict had one entry, assuming AggregatingIS")
         placeholder_string, path_template_string = next(
             iter(input.items()))
         path_template = PathTemplate(path_template_string)
         placeholders = placeholder_string.split(",")
         assert placeholders == path_template.placeholders

         candidates = io.glob(path_template.glob)
         resolved = {}
         for candidate in candidates:
             matches = path_template.match(candidate)
             if not matches:
                 continue
             key = tuple(matches[ph] for ph in placeholders)
             resolved[key] = AbsoluteGCSURL.from_absolute_path(
                 AbsolutePath(candidate))
         return resolved

     if isinstance(input, (int, float, tuple, list, dict, set)):
         return input
     raise NotImplementedError
예제 #9
0
def test_absolute_path_relative():
    """Building an ``AbsolutePath`` from a relative path raises ValueError."""
    relative = "a/relative/path.ext"
    with pytest.raises(ValueError):
        AbsolutePath(relative)
예제 #10
0
def test_absolute_path():
    """Building an ``AbsolutePath`` from an absolute path must not raise."""
    AbsolutePath("/an/absolute/path.ext")
예제 #11
0
 def exist(self, paths: List[AbsolutePath]) -> List[bool]:
     """Normalize every path and return the result of ``self._exist``.

     Each entry is passed through ``self.normpath`` and wrapped in
     ``AbsolutePath`` before the whole list is handed to ``self._exist``.
     """
     normalize = self.normpath
     return self._exist([AbsolutePath(normalize(raw)) for raw in paths])
예제 #12
0
def test_file_list_glob_negative():
    """A glob under a different directory must not match the stored path."""
    stored = AbsolutePath("/an/absolute/path.ext")
    pattern = AbsolutePath("/a/different/*/path.ext")
    matches = FileList(paths=[stored]).glob(pattern)
    assert stored not in matches
예제 #13
0
def test_absolute_path_basename():
    """``basename`` is the last path component, extension included."""
    sample_path = AbsolutePath("/an/absolute/path.ext")
    assert sample_path.basename == "path.ext"
예제 #14
0
class TaskSpec(object):
    """Data object describing which inputs a task expects."""

    # Glob used by ``is_task_path`` to recognise task source files.
    task_specification_glob = AbsolutePath("/tasks/*.py")
    # Maps each declared variable to the input specs that declare it.
    variable_to_input_spec: Mapping[Variable, List[InputSpec]]

    def __init__(self, inputs: List[InputSpec], output: OutputSpec,
                 src_path: str, name: str) -> None:
        """Store the specs and index the input specs by declared variable.

        Args:
            inputs: Input specs of the task.
            output: Output spec of the task.
            src_path: Path of the task's source file.
            name: Name of the task.
        """
        self.input_specs = inputs
        self.output_spec = output
        self.src_path = src_path
        self.name = name
        self._verify_placeholders()
        self.variable_to_input_spec = defaultdict(list)
        for input_spec in inputs:
            for variable in input_spec.declared_variables():
                self.variable_to_input_spec[variable].append(input_spec)

    def __repr__(self) -> str:
        """Return the task name followed by the reprs of all specs."""
        ios: List[Spec] = cast(List[Spec],
                               self.input_specs) + [self.output_spec]
        reprs = ", ".join([repr(input) for input in ios])
        return "<TaskSpec {}, ({})>".format(self.name, reprs)

    def _verify_placeholders(self) -> None:
        """Placeholder consistency check; currently disabled (returns at once)."""
        return
        # TODO: this currently doesn't correctly cover all cases. Disabled for now.
        # outputs = set(self.output_spec.placeholders)
        # inputs = set.union(*[input_spec.depends_on() for input_spec in self.input_specs])
        # inputs = inputs.union(set(input_spec.name for input_spec in self.input_specs))
        # if not outputs == inputs:
        #   if outputs.issuperset(inputs):
        #     difference = outputs - inputs
        #     raise ValueError("Placeholders '{}' in task_spec '{}' do not have input variables that could replace them. (Inputs: {})".format(difference, self.name, inputs))
        #   else: # TODO: this covers both subset and entirely disjoint. is the error message bringing that across? no.
        #     difference = inputs - outputs
        #     raise ValueError("Input variables '{}' in task_spec '{}' do not have any corresponding output placeholders. (Inputs: {})".format(difference, self.name, inputs))

    @classmethod
    def is_task_path(cls, path: str) -> bool:
        """Return whether ``path`` matches ``task_specification_glob``."""
        return fnmatch.fnmatch(path, cls.task_specification_glob)

    @staticmethod
    def estimate_cost(num_jobs: int, example_runs: List[JobSpec]) -> None:
        """Print naive CPU time and cost estimates based on supplied sample runs."""

        print(
            "Estimates are 95% conf. intervals based on std of supplied runs. Only reasonable if colab instance has similar specs as requested AE instances!"
        )
        durations = [run.execution_duration for run in example_runs]
        duration_mean, duration_std = mean(durations), std(durations)
        # Scale the per-job statistics linearly by the total number of jobs.
        cpu_time_mean = timedelta(seconds=num_jobs * duration_mean)
        cpu_time_std = timedelta(seconds=num_jobs * duration_std)
        print(
            f"Expecting to use {format_timedelta(cpu_time_mean)}±{format_timedelta(cpu_time_std)} of CPU time."
        )

        price_per_hour_in_usd = (
            0.0526 + 2 * 0.0071
        )  # 1CPU, 2GB RAM, https://cloud.google.com/appengine/pricing#flexible-environment-instances
        seconds_per_hour = 60 * 60
        total_price_mean = price_per_hour_in_usd * (
            cpu_time_mean.total_seconds() / seconds_per_hour)
        total_price_std = price_per_hour_in_usd * (
            cpu_time_std.total_seconds() / seconds_per_hour)
        print(
            f"Expecting to cost ${total_price_mean:.2f}±{total_price_std:.2f}."
        )

    @property
    def manifest_path(self) -> str:
        """Path of the manifest: ``src_path`` with ``.py`` swapped for ``.json``."""
        suffix = ".py"
        if self.src_path.endswith(suffix):
            # Only swap the extension; a ".py" occurring earlier in the path
            # (e.g. inside a directory name) must stay untouched.
            return self.src_path[:-len(suffix)] + ".json"
        # Legacy behaviour for paths that do not end in ".py".
        return self.src_path.replace(".py", ".json")

    @property
    def input_names(self) -> List[str]:
        """Names of all input specs, in declaration order."""
        return [input_spec.name for input_spec in self.input_specs]

    @property
    def dependencies(self) -> Mapping[Variable, Set[Variable]]:
        """Map each input spec's name to the result of its ``depends_on()``."""
        return {spec.name: spec.depends_on() for spec in self.input_specs}

    def to_job_spec(self, bindings: Bindings) -> "JobSpec":
        """Build the ``JobSpec`` for one set of bindings.

        The output path is obtained by applying the stringified bindings to
        the output spec.
        """
        str_bindings = stringify_bindings(bindings)
        output_path = self.output_spec.with_replacements(str_bindings)
        return JobSpec(bindings, output_path, self.src_path)

    def to_job_specs(self,
                     initial_bindings: Optional[Bindings] = None
                     ) -> Iterable[JobSpec]:
        """Lazily map every binding from ``all_bindings`` to a ``JobSpec``.

        Args:
            initial_bindings: Bindings to start from; ``None`` means empty.
        """
        return map(self.to_job_spec, self.all_bindings(initial_bindings))

    def all_bindings(self,
                     initial_bindings: Optional[Bindings] = None
                     ) -> Sequence[Bindings]:
        """Expand ``initial_bindings`` into every combination of input values.

        Variables are visited in the order given by ``toposort_flatten`` over
        ``self.dependencies``. For each input spec registered for a variable,
        every binding collected so far is extended once per value the spec
        yields for it.

        Args:
            initial_bindings: Bindings to start from; ``None`` means empty.
                A fresh dict is created per call in that case, so no default
                object is shared between calls.

        Returns:
            The list of fully expanded bindings.
        """
        if initial_bindings is None:
            initial_bindings = {}
        # TODO: return empty list if self.dependencies is empty???
        all_bindings = [initial_bindings]
        sorted_dependencies = toposort_flatten(self.dependencies)
        logging.debug("Sorted sorted_dependencies: %s", sorted_dependencies)
        for variable_name in sorted_dependencies:
            variable = Variable(variable_name)
            input_specs = self.variable_to_input_spec[variable]
            for input_spec in input_specs:
                relevant_vars = input_spec.depends_on() | set(
                    [input_spec.name])
                logging.debug(
                    f"Resolving '{variable}' via {input_spec} on relevant vars {relevant_vars}."
                )
                new_bindings: List[Bindings] = []
                # Cache of values keyed by the subset of bindings this input
                # spec looks at, so identical subsets are resolved only once.
                memoized_values: Dict[FrozenSet[Tuple[str, str]],
                                      Set[Value]] = {}
                for bindings in all_bindings:
                    relevant_bs = frozenset((var, str(value))
                                            for var, value in bindings.items()
                                            if var in relevant_vars)
                    if relevant_bs in memoized_values:
                        values = memoized_values[relevant_bs]
                        logging.debug(
                            "Found cached values %s for bindings %s",
                            list(values),
                            bindings,
                        )
                    else:
                        values = input_spec.values(variable, bindings)
                        memoized_values[relevant_bs] = values
                        logging.debug("Memoized values %s for bindings %s",
                                      list(values), bindings)
                    for value in values:
                        # Existing bindings win over the new value on clashes.
                        value_binding = {variable: value}
                        value_binding.update(bindings)
                        new_bindings.append(value_binding)
                # logging.debug("New bindings: %s", new_bindings)
                all_bindings = new_bindings
            logging.debug("Done resolving: %s", variable)
        # TODO: what if new_bindings empty because values empty?
        return all_bindings

    def matching_input_spec(self, src_path: str) -> Optional[InputSpec]:
        """Return the first input spec whose ``matches(src_path)`` is truthy, else ``None``."""
        for input_spec in self.input_specs:
            if input_spec.matches(src_path):
                return input_spec
        return None

    def should_handle_file(self, src_path: str) -> bool:
        """Return whether any input spec matches ``src_path``."""
        return self.matching_input_spec(src_path) is not None

    def manifest(self, all_bindings: Optional[List[Bindings]] = None) -> Dict:
        """Build the manifest dict for this task.

        Args:
            all_bindings: Bindings to describe; when falsy they are computed
                with ``self.all_bindings()``.

        Returns:
            A dict holding the output path template and the sorted list of
            per-binding values for the output spec's placeholders.
        """
        bindings = all_bindings or self.all_bindings()
        keys = self.output_spec.placeholders
        assignments = sorted([binding[key] for key in keys]
                             for binding in bindings)
        return {
            "output": {
                "template": self.output_spec.path_template.template
            },
            "bindings": {
                "values": assignments
            },
        }

    def preflight(self, num_tried_jobs: int = 3) -> None:
        """Execute a random sample of jobs and print a cost estimate.

        Args:
            num_tried_jobs: Upper bound on the number of jobs to execute.
                Fewer are run when the task has fewer jobs in total.
        """
        logging.info(f"Starting preflight, running {num_tried_jobs} jobs...")
        job_specs = list(self.to_job_specs())
        if not job_specs:
            # Nothing to sample and no durations to base an estimate on.
            logging.warning(
                "Preflight found no jobs to run; skipping cost estimate.")
            return
        # random.sample raises ValueError if asked for more than is available.
        num_sampled = min(num_tried_jobs, len(job_specs))
        preflight_jobs = sample(job_specs, num_sampled)
        for job in preflight_jobs:
            job.execute()
            logging.info("Job completed without error.")
        self.estimate_cost(len(job_specs), preflight_jobs)

    def deploy(self, preflight: bool = True) -> None:
        """Upload the task source to ``tasks/<name>``, optionally after a preflight.

        Args:
            preflight: When true, ``self.preflight()`` runs before the upload.
        """
        if preflight:
            self.preflight()
        remote_path = f"tasks/{self.name}"
        io.upload(self.src_path, remote_path)
예제 #15
0
def absolute_path():
    """Return an ``AbsolutePath`` for ``/an/absolute/path.ext``.

    NOTE(review): presumably a pytest fixture whose decorator is not visible
    here - confirm.
    """
    sample_path = "/an/absolute/path.ext"
    return AbsolutePath(sample_path)
예제 #16
0
class GCStorageAdapter(IOAdapter):
    """IOAdapter that reads and writes blobs of a Google Cloud Storage bucket.

    Files are staged in a local temporary directory: reads download the blob
    before opening it, writes go to a local file that is uploaded afterwards.
    The bucket handle and the file listing are created lazily on first use.
    """

    # Cached listing; populated on first access of ``file_list``.
    _file_list: Optional[FileList]
    # Cached bucket handle; populated on first access of ``bucket``.
    _bucket: Optional[Any]

    def __init__(self,
                 project: str = "brain-deepviz",
                 bucket: str = "lucid-flow") -> None:
        """Remember project and bucket names and create the staging directory.

        Args:
            project: Stored as ``self.project_name``.
            bucket: Stored as ``self.bucket_name``.
        """
        self.project_name = project
        self.bucket_name = bucket
        self.tempdir = AbsolutePath(mkdtemp())
        self._file_list = None
        self._bucket = None

    @property
    def bucket(self) -> Any:
        """Bucket handle, created together with the client on first access."""
        if not self._bucket:
            self._client = storage.Client(project=self.project_name)
            self._bucket = self._client.bucket(self.bucket_name)
        return self._bucket

    @property
    def file_list(self) -> FileList:
        """File listing, created and filled from GCS on first access."""
        if not self._file_list:
            self._file_list = FileList(project=self.project_name,
                                       bucket=self.bucket_name)
            self._file_list._get_all_gcs_files()
        return self._file_list

    def normpath(self, path: str) -> AbsolutePath:
        """Strip a ``gs://`` scheme and bucket-name prefix from ``path``.

        Each prefix is removed only if present; the remainder, including any
        leading slash, is wrapped in ``AbsolutePath``.
        """
        if path.startswith("gs://"):
            path = path[5:]
        if path.startswith(self.bucket_name):
            path = path[len(self.bucket_name):]
        return AbsolutePath(path)

    @contextmanager
    def _reading(self, path: AbsolutePath, mode: str = "r+b") -> IO:
        """Download ``path`` and yield the local copy opened with ``mode``.

        The handle is closed when the ``with`` block ends, including when the
        block raises.
        """
        local_path = self._download(path)
        reading_file = localfs_open(local_path, mode=mode)
        try:
            yield reading_file
        finally:
            reading_file.close()

    @contextmanager
    def _writing(self, path: AbsolutePath, mode: str = "w+b") -> IO:
        """Yield a local file for writing and upload it to ``path`` afterwards.

        The handle is closed even when the ``with`` block raises. The upload
        only happens after a clean exit, so a partially written file is not
        uploaded.
        """
        blob = storage.blob.Blob(path.as_relative_path(), self.bucket)
        local_path = self.tempdir.append(path.as_relative_path())
        makedirs(dirname(local_path), exist_ok=True)
        writing_file = localfs_open(local_path, mode=mode)
        try:
            yield writing_file
        finally:
            # Always release the handle, including when the block raised.
            writing_file.close()
        # Only reached when the with-block completed without an exception.
        blob.upload_from_filename(local_path)

    def _makedirs(self, path: str) -> None:
        """Do nothing.

        NOTE(review): presumably a no-op because GCS has no real directories
        - confirm.
        """
        pass

    def _glob(self, glob_path: AbsolutePath) -> List[AbsolutePath]:
        """Return the entries of ``self.file_list`` that match ``glob_path``.

        The pattern is tried as given and with its trailing slash toggled;
        both result lists are concatenated without de-duplication.
        """
        # GCS returns folders iff a trailing slash is specified, so we try both:
        if glob_path.endswith("/"):
            other_path = AbsolutePath(glob_path[:-1])
        else:
            other_path = AbsolutePath(glob_path + "/")
        matched_paths: List[AbsolutePath] = []
        for glob_string in (glob_path, other_path):
            matched_paths += self.file_list.glob(glob_string)
        return matched_paths

    def _exist(self, paths: List[AbsolutePath]) -> List[bool]:
        """Return, per path, the result of ``self.file_list.exists``."""
        return [self.file_list.exists(path) for path in paths]

    def _download(self, path: AbsolutePath) -> AbsolutePath:
        """Download the blob for ``path`` into ``self.tempdir``.

        Returns:
            The local path the blob was written to.
        """
        local_path = self.tempdir.append(path.as_relative_path())
        makedirs(dirname(local_path), exist_ok=True)
        blob = storage.blob.Blob(path.as_relative_path(), self.bucket)
        blob.download_to_filename(local_path)
        return local_path

    def _upload(self, local_path: str, remote_path: RelativePath) -> None:
        """Upload the local file to the blob named ``remote_path``.

        ``remote_path`` is asserted not to start with ``/``.
        """
        assert not remote_path.startswith("/")
        blob = storage.blob.Blob(remote_path, self.bucket)
        blob.upload_from_filename(local_path)
예제 #17
0
def test_gcs_connection(empty_file_list):
    """After fetching the GCS listing, ``/data/noop`` must be reported as existing."""
    empty_file_list._get_all_gcs_files()
    expected = AbsolutePath("/data/noop")
    assert empty_file_list.exists(expected)
예제 #18
0
def test_file_list_invalid_glob_fails_loudly():
    """Globbing with an ``AbsoluteURL`` pattern must raise ValueError."""
    url_pattern = AbsoluteURL("gs://lucid-flow/an/*/path.ext")
    stored = AbsolutePath("/an/absolute/path.ext")
    listing = FileList(paths=[stored])
    with pytest.raises(ValueError):
        listing.glob(url_pattern)
예제 #19
0
def test_absolute_path_url():
    """Building an ``AbsolutePath`` from a ``gs://`` URL raises ValueError."""
    url = "gs://an/absolute/url.ext"
    with pytest.raises(ValueError):
        AbsolutePath(url)
예제 #20
0
def test_absolute_path_dirname():
    """``dirname`` is the path without its last component."""
    sample_path = AbsolutePath("/an/absolute/path.ext")
    assert sample_path.dirname == "/an/absolute"
예제 #21
0
 def _download(self, path: AbsolutePath) -> AbsolutePath:
     """Download the blob for ``path`` into ``self.tempdir``.

     The destination mirrors the relative form of ``path`` below the
     temporary directory; missing parent directories are created first.

     Returns:
         The local path the blob was written to.
     """
     destination = self.tempdir.append(path.as_relative_path())
     parent_dir = dirname(destination)
     makedirs(parent_dir, exist_ok=True)
     blob_name = path.as_relative_path()
     remote_blob = storage.blob.Blob(blob_name, self.bucket)
     remote_blob.download_to_filename(destination)
     return destination
예제 #22
0
 def glob(self, glob_string: AbsolutePath) -> List[AbsolutePath]:
     """Return the entries of ``self.paths`` matching ``glob_string``.

     Matching is done with ``fnmatch.filter``; each hit is wrapped in
     ``AbsolutePath``.

     Raises:
         ValueError: if ``glob_string`` is not an ``AbsolutePath``.
     """
     if not isinstance(glob_string, AbsolutePath):
         raise ValueError("Can only use AbsolutePath objects with FileList!")
     hits = fnmatch.filter(self.paths, glob_string)
     return list(map(AbsolutePath, hits))
예제 #23
0
def test_file_list_glob():
    """A wildcard glob must match the stored path it covers."""
    stored = AbsolutePath("/an/absolute/path.ext")
    pattern = AbsolutePath("/an/*/path.ext")
    matches = FileList(paths=[stored]).glob(pattern)
    assert stored in matches