def backtest_node(model_path, in_glob, out_dir) -> co.Parallel:
    """
    Build a Parallel node that backtests `model_path` against every
    .tfrecord file matched by `in_glob`, writing one output per input
    under `out_dir`.

    With more than 50 inputs, Exec nodes are grouped under synthetic
    parents (e.g. "validate35__") so the tree stays navigable.

    Fixes: removed a duplicated trailing `return output`; regex pattern
    and replacement are now raw strings (invalid escape sequences).
    """
    import glob
    import os

    in_files = sorted(glob.glob(in_glob))
    output = co.Parallel()
    for f in in_files:
        # Input:  RAW_DATA_DIR/validate3554.tfrecord
        # Output: OUT_DIR/validate3554.pkl.gzip
        base = os.path.basename(f).replace(".tfrecord", "")
        out_path = os.path.join(out_dir, base + ".pkl.gzip")
        if len(in_files) > 50:
            import re

            # Group e.g. "validate3554" under parent "validate35__".
            parent = re.sub(r"(\d\d)\d\d", r"\1__", base)
            if parent not in output:
                output[parent] = co.Parallel()
            base = f"{parent}/{base}"
        output[base] = co.Exec(
            commands.backtest, model_path=model_path, data_path=f, out_path=out_path
        )
    return output
def test(projects: typing.List[str]) -> co.Parallel:
    """Run every test of every project, grouped by project, all in parallel."""
    output = co.Parallel()
    for project in projects:
        project_node = co.Parallel()
        for name in utils.get_tests(project):
            # co.Exec often accepts a command string; this form instead takes
            # (func, *args, **kwargs) and calls the function in the node.
            project_node[name] = co.Exec(utils.run_test, project, test=name)
        output[project] = project_node
    return output
def build_and_test() -> co.Serial:
    """Run US and CHINA trading scripts, then the top-K pipelines, in sequence."""
    image = co.Image(copy_dir="./code")
    # stop_on_error=False: later stages run even if an earlier one fails.
    with co.Serial(image=image, stop_on_error=False) as pipeline:
        with co.Parallel(name="Trade") as trade:
            trade["US"] = co.Exec("python3 first_stock_trading.py")
            trade["CHINA"] = co.Exec("python3 second_stock_trading.py")
        with co.Parallel(name="TopK") as topk:
            topk["US"] = co.Exec("python3 first_topK_stock_pipeline.py")
            topk["CHINA"] = co.Exec("python3 second_topK_stock_pipeline.py")
    return pipeline
def main() -> co.Serial:
    """Check service access, deploy, integration-test, then tear everything down."""
    image = co.Image(dockerfile="./Dockerfile", copy_dir=".")
    with co.Serial(doc=__doc__, image=image, env=get_env()) as root:
        # Verify we can reach both external services before doing anything.
        with co.Parallel(name="access check") as access_check:
            access_check["Heroku"] = co.Exec(TEST_HEROKU)
            access_check["RedisLabs"] = co.Exec(TEST_REDIS)
        root["deploy"] = deploy()
        root["integration test"] = co.Exec(INTEGRATION_TEST)
        # Teardown steps are independent, so run them in parallel.
        with co.Parallel(name="teardown") as teardown:
            for child, cmd in (
                ("clear data", CLEAR_DATA),
                ("stop", STOP_APP),
                ("destroy", DESTROY_APP),
            ):
                teardown[child] = co.Exec(cmd)
    return root
def make_compute_features_node(in_dir, tmp_dir, out_file, start_date="00000000") -> co.Serial:
    """
    Build a tree for computing features, parallelized over months.
    """
    all_files = glob.glob(f"{in_dir}/*.csv")
    all_yyyymms = sorted({os.path.basename(f)[:-4] for f in all_files})
    os.makedirs(tmp_dir, exist_ok=True)

    # Skip the first month (one month of history is needed to compute
    # features), then keep only months at or beyond the start date.
    start_yyyymm = start_date[:6]
    all_yyyymms = [yyyymm for yyyymm in all_yyyymms[1:] if yyyymm >= start_yyyymm]

    # Assemble the output tree: per-month feature jobs in parallel, then a merge.
    output = co.Serial()
    output["Parallelize"] = co.Parallel()
    for node, yyyymm in co.util.makeyyyymmnodes(output["Parallelize"], all_yyyymms):
        node[yyyymm] = co.Exec(compute_features, yyyymm, in_dir, tmp_dir)
    output["Merge"] = co.Exec(merge_data, tmp_dir, out_file)
    return output
def life() -> co.Serial:
    """Run Conway's Game of Life for a fixed number of ticks, then animate it."""
    with co.Serial(image=game_of_life) as pipeline:
        pipeline["initialize grid"] = co.Exec(initialize_grid)
        image_names = []
        # TODO: instead of modeling a fixed number of clock ticks,
        # use a lazy node to extend this until a grid state is repeated.
        for tick in ticks:
            with co.Serial(name=f"tick {tick}", image=game_of_life) as iteration:
                iteration["show grid"] = co.Exec(show_grid(tick))
                iteration["find neighbors"] = co.Exec(find_neighborhoods(tick))
                # All rules read the same neighborhood data, so apply in parallel.
                with co.Parallel(name="apply_rules", image=game_of_life) as rules:
                    for rule in (isolate, survive, crowd, reproduce, ignore):
                        rules[rule.__name__] = co.Exec(rule(tick))
                iteration["next grid"] = co.Exec(next_grid(tick))
            image_names.append(f"image_{tick}.png")
        pipeline["animate"] = co.Exec(animate(" ".join(image_names)))
    return pipeline
def poll_sensors() -> co.Serial:
    """Poll every PMT sensor in parallel and report pipeline status to Slack."""
    r = co.Serial()
    r['/pmt'] = co.Serial()
    r['/pmt/poll'] = co.Parallel(image=img)
    for name in range(1104):
        # Sensor 1002 is presumed broken somehow; all others work just fine.
        status = 1 if name == 1002 else 0
        r[f'/pmt/poll/{name}'] = co.Exec(certain, status)

    r.on_running(
        co.callback.slack_status(recipient="SlackUser", message="polling sensors")
    )
    # No message given here: the callback's default error text is used.
    r.on_error(co.callback.slack_status(recipient="#array-status"))
    r.on_done(
        co.callback.slack_status(
            recipient="#array-status",
            message="all sensors reporting nominally",
        )
    )
    # Other events include: on_queued, on_running, on_killed, on_state_change.
    return r
def nodes_for_this_month() -> co.Parallel:
    """
    This function runs in the container for the generate step. It returns
    a node to be executed as part of the execute step.

    Requires the linux utility `fortune` and the python library `sh`.
    Neither is a dependency for launching this pipeline, but both must be
    installed in the image for this function to reference them.
    """
    os.environ['PATH'] = ':'.join([os.environ['PATH'], "/usr/games"])
    from sh import fortune

    now = datetime.now()
    parent = co.Parallel()
    days_in_month = monthrange(now.year, now.month)[1]
    for day in range(1, days_in_month + 1):
        date = f"{now.year}-{now.month}-{day}"
        fortune_str = fortune()
        cmd = cleandoc(f"""
            echo "About {date} the spirits say:"
            cat << EOF
            {indent(fortune_str, prefix=' ')}
            EOF""")
        parent[date] = co.Exec(cmd)
    return parent
def disambiguate() -> co.Parallel:
    """Show the ways to pass parameters to co.Exec without kwarg ambiguity."""
    with co.Parallel(image=co.Image(copy_dir=".")) as node:
        # No ambiguity here: all kwargs refer to conducto.Node.__init__.
        co.Exec('''echo "node has 1.5 cpu's"''', name="A", cpu=1.5)

        # Native method parameters come first. Modify the node object in a
        # second step, then connect it to its parent.
        node_obj = co.Exec(myfunc, "DDR4-2933 (quad channel)", cpu=2950)
        node_obj.set(cpu=0.75, mem=1.5)
        node["B"] = node_obj

        # Or connect it to its parent first, then modify it in place.
        node["C"] = co.Exec(myfunc, "DDR4-2667 (dual channel)")
        node["C"].set(cpu=0.75, mem=1.5)

        # Some non-custom types don't have obvious string representations...
        payload = {"foo": 2, "bar": 3}
        func(payload)
        # ...so you may have to handle the serialization yourself.
        node["D"] = co.Exec(wrappedfunc, json.dumps(payload))

        # Custom types work, but you need to provide helpers.
        param_obj = Emoticon(happy=True)
        node["E"] = co.Exec(describe, param_obj)
    return node
def download_node(data_root=DATA_ROOT, num_shards=500, max_shard=500) -> co.Parallel:
    """
    Build a Parallel tree that downloads dataset shards.

    One child per dataset split (train/validate/test), each holding up to
    `max_shard` shard-download Exec nodes.

    Fix: the shell command now uses the `data_root` parameter instead of
    always reading the module-level DATA_ROOT, so callers can redirect the
    download location; the default keeps the old behavior.
    """
    output = co.Parallel()
    for ds in ["train", "validate", "test"]:
        output[ds] = co.Parallel()
        for shard in range(1, num_shards + 1):
            if shard > max_shard:
                # Only the first max_shard shards are scheduled.
                break
            # NOTE(review): shard/partition/mirror look like env vars consumed
            # by download.py — confirm against that script.
            output[ds][f"shard_{shard}_of_{num_shards}"] = co.Exec(f"""
                set -x -o pipefail
                script=`pwd`/download.py
                mkdir -p {data_root}
                cd {data_root}
                shard={shard},{num_shards} partition=2/frame/{ds} mirror=us python $script
            """)
    return output
def main() -> co.Parallel:
    """Inspect a remote Git repo: count its code, then analyze a CSV in it."""
    pipeline = co.Parallel(image=IMG)
    # Count lines of code in the remote Git repo.
    pipeline["lines of code"] = co.Exec("cloc .")
    # Run a simple data analysis script located there.
    pipeline["biggest US cities"] = co.Exec(
        "cd features/copy_url && python analyze.py cities.csv"
    )
    return pipeline
def teardown(): """ Stop containers. """ with co.Parallel(image=docker_img, requires_docker=True) as node: node["stop redis"] = co.Exec(STOP_REDIS_CMD) node["stop flask"] = co.Exec(STOP_FLASK_CMD) return node
def data_pipeline() -> co.Serial:
    """
    ### **`co.data.pipeline`**
    `co.data.pipeline` is a pipeline-local key-value store. This data is
    only visible to your pipeline and persists until your pipeline is
    deleted. It is useful for writing data in one pipeline step, and
    reading it in another.

    `co.data.pipeline` has both a python and command line interface as
    `conducto-data-pipeline`. The first node of the example prints the
    command line usage to show the full interface.

    ### Example: Parameter Search
    One useful application is performing and summarizing a parameter
    search. In this example, we try different parameterizations of an
    algorithm in parallel. Each one stores its results using
    `co.data.pipeline.puts()`. Once all of the parallel tasks are done,
    it reads the results using `co.data.pipeline.gets()` and prints a
    summary.
    """
    # Dockerfile installs python, R, and conducto.
    image = co.Image(
        dockerfile="docker/Dockerfile.data",
        context=".",
        copy_dir="./code",
        reqs_py=["conducto"],
    )
    data_dir = "demo/data_science/data"

    output = co.Serial(image=image, doc=co.util.magic_doc())
    output["usage"] = co.Exec("conducto-data-pipeline --help")

    # Sweep the full cross product of window x mean x volatility.
    search = co.Parallel()
    output["parameter_search"] = search
    for window in (25, 50, 100):
        window_node = co.Parallel()
        search[f"window={window}"] = window_node
        for mean in (0.05, 0.08, 0.11):
            mean_node = co.Parallel()
            window_node[f"mean={mean}"] = mean_node
            for volatility in (0.1, 0.125, 0.15, 0.2):
                mean_node[f"volatility={volatility}"] = co.Exec(
                    f"python data.py --window={window} --mean={mean} "
                    f"--volatility={volatility} --data-dir={data_dir}"
                )

    output["summarize"] = co.Exec(f"Rscript data.R {data_dir}")
    return output
def main() -> co.Serial:
    """Build, lint, and unit-test in parallel; then deploy and integration-test."""
    with co.Serial(image=IMG, requires_docker=True) as root:
        with co.Parallel(name="Init") as init:
            for label, cmd in (
                ("Build", "docker build ."),
                ("Lint", "black --check ."),
                ("Unit Test", "python unit_test.py"),
            ):
                init[label] = co.Exec(cmd)
        root["Deploy"] = co.Exec("bash deploy_aws.sh")
        root["Integration Test"] = co.Exec("bash int_test.sh")
    return root
def main() -> co.Serial:
    """Demo pipeline: simulate each CI/CD stage with a short sleep."""
    with co.Serial(image=IMG, requires_docker=True) as root:
        with co.Parallel(name="Init") as init:
            for label, cmd in (
                ("Build", "sleep 3"),
                ("Lint", "sleep 1"),
                ("Unit Test", "sleep 1.5"),
            ):
                init[label] = co.Exec(cmd)
        root["Deploy"] = co.Exec("sleep 4")
        root["Integration Test"] = co.Exec("sleep 2")
    return root
def build(projects: typing.List[str]) -> co.Parallel:
    """Build each project in parallel with a simple shell command."""
    # Override the parent image to use one with docker installed.
    docker_image = co.Image(image="docker:19.03", copy_dir=".")
    output = co.Parallel(image=docker_image, requires_docker=True)
    for project in projects:
        # The command needs docker; the flag is inherited from the parent node.
        output[project] = co.Exec(f"cd {project} && docker build .")
    return output
def main() -> co.Parallel:
    """
    Dynamically build pipelines for each actor in a static list.
    """
    actors = ["Oprah Winfrey", "Kate Mara", "Don Cheadle", "Dwayne Johnson"]
    root = co.Parallel(image=_get_image())
    for actor in actors:
        # co.Lazy runs the command at pipeline runtime to generate the subtree.
        root[actor] = co.Lazy(f"python pipeline.py all_by_actor '{actor}'")
    return root
def pr(branch) -> co.Parallel:
    """Inspect the files of the given branch with three parallel commands."""
    # Make a Docker image, based on python:alpine, with the whole repo and
    # the contents of the given branch.
    image = co.Image("python:alpine", copy_repo=True, copy_branch=branch)
    # Using that Docker image, run three commands in parallel to interact
    # with the repo's files.
    with co.Parallel(image=image) as root:
        root["print branch"] = co.Exec(f"echo {branch}")
        root["print working directory"] = co.Exec("pwd")
        root["list files"] = co.Exec("ls -la")
    return root
def main() -> co.Serial:
    """Initialize in parallel, then deploy, integration-test, and clean up."""
    with co.Serial(image=get_image(), doc=__doc__) as root:
        with co.Parallel(name="Initialize") as init:
            # Only the docker build needs access to the docker daemon.
            init["Build"] = co.Exec("docker build -t my_image .", requires_docker=True)
            init["Lint"] = co.Exec("black --check .")
            init["Unit Test"] = co.Exec("python test.py --verbose")
        root["Deploy"] = co.Exec(DEPLOY_CMD, requires_docker=True)
        root["Integration Test"] = co.Exec(INTEGRATION_TEST_CMD)
        root["Cleanup"] = co.Exec("docker kill my_app", requires_docker=True)
    return root
def compute_covs_node(in_glob, out_dir) -> co.Parallel:
    """
    Build a Parallel node that computes covariances for every .tfrecord
    file matched by `in_glob`, writing one output per input under `out_dir`.

    With more than 50 inputs, Exec nodes are grouped under synthetic
    parents (e.g. "train35__") so the tree stays navigable.

    Fix: regex pattern and replacement are now raw strings; the previous
    `"(\\d\\d)\\d\\d"` form relied on invalid escape sequences.
    """
    import glob
    import os

    in_files = sorted(glob.glob(in_glob))
    output = co.Parallel()
    for f in in_files:
        # Input:  RAW_DATA_DIR/train3554.tfrecord
        # Output: COVS_ROOT/train3554.pkl.gzip
        base = os.path.basename(f).replace(".tfrecord", "")
        out_path = os.path.join(out_dir, base + ".pkl.gzip")
        if len(in_files) > 50:
            import re

            # Group e.g. "train3554" under parent "train35__".
            parent = re.sub(r"(\d\d)\d\d", r"\1__", base)
            if parent not in output:
                output[parent] = co.Parallel()
            base = f"{parent}/{base}"
        output[base] = co.Exec(commands.compute_cov, f, out_path)
    return output
def islands() -> co.Serial:
    """Echo a nickname for each Hawaiian island, grouped by county."""
    pipeline = co.Serial()
    pipeline["hawaii"] = co.Exec("echo big island")

    maui_county = co.Parallel()
    for island, nickname in (
        ("maui", "valley isle"),
        ("lanai", "pineapple isle"),
        ("molokai", "friendly isle"),
        ("kahoolawe", "target isle"),
    ):
        maui_county[island] = co.Exec(f"echo {nickname}")
    pipeline["maui_county"] = maui_county

    pipeline["oahu"] = co.Exec("echo gathering place")

    kauai_county = co.Serial()
    kauai_county["kauai"] = co.Exec("echo garden isle")
    kauai_county["niihau"] = co.Exec("echo forbidden isle")
    pipeline["kauai_county"] = kauai_county

    return pipeline
def main() -> co.Serial:
    """Fetch sensor data, then fan out notifications to stdout and Slack."""
    p = co.Serial(image=img)  # p is for 'Pipeline root'
    p["get data"] = co.Exec(get_sensor_data)
    p["notify"] = co.Parallel()
    p["notify/stdout"] = co.Exec(plot_to_stdout)
    p["notify/channel"] = co.Exec(plot_to_slack)
    # One direct message per user on the update list.
    p["notify/team"] = co.Serial()
    for user in update_users:
        p[f"notify/team/{user}"] = co.Exec(message_to_slack_user, user)
    return p
def download_and_plot() -> co.Serial:
    """Download the USFS rainfall archive, then plot daily and monthly views."""
    download_command = """
    apt update -y && apt install -y curl unzip
    curl https://www.fs.usda.gov/rds/archive/products/RDS-2005-0004/RDS-2005-0004.zip > data.zip
    unzip data.zip
    """
    image = co.Image(dockerfile='./Dockerfile', context='.')
    with co.Serial(image=image) as pipeline:
        pipeline["download"] = co.Exec(download_command)
        with co.Parallel(name='plot') as plot:
            plot['daily'] = co.Exec('python rainfall.py')
            plot['monthly'] = co.Exec('python rainfall.py --resample M --save')
    return pipeline
def path() -> co.Serial:
    """
    The Node tree can be accessed with file system-like
    [paths](/docs/basics/pipeline-structure#path).
    """
    root = co.Serial(image="foo", doc=co.util.magic_doc())

    # Children are addressed by slash-separated paths from the root.
    root["all together"] = co.Parallel()
    root["all together/a"] = co.Exec("echo step 1, image bar", image="bar")
    root["all together/b"] = co.Exec("echo step 1, image foo")

    # This subtree overrides the image; its children inherit "bar".
    root["one at a time"] = co.Serial(image="bar")
    root["one at a time/c"] = co.Exec("echo step 2, image bar")
    root["one at a time/d"] = co.Exec("echo step 3, image bar")

    return root
def nodes_for_this_month(now):
    """
    Return a Parallel node with one fortune-printing Exec per day of the
    month containing `now`.

    Fix: days of the month are 1-based. The previous version iterated
    `range(days)` and used `i` directly, producing a nonexistent day 0
    and omitting the last day of the month (the sibling generator in this
    file correctly uses `i + 1`).
    """
    parent = co.Parallel()
    days_in_month = monthrange(now.year, now.month)[1]
    for day in range(1, days_in_month + 1):
        date = f"{now.year}-{now.month}-{day}"
        fortune = get_fortune()
        cmd = cleandoc(f"""
            echo "About {date} the spirits say:"
            cat << EOF
            {indent(fortune, prefix=' ')}
            EOF""")
        parent[date] = co.Exec(cmd)
    return parent
def dict() -> co.Serial:
    """
    Each Node is [dict-like](/docs/basics/pipeline-structure#dict), and
    you can build a hierarchy by assigning children into them.
    """
    root = co.Serial(image="foo", doc=co.util.magic_doc())

    # Children are assigned one level at a time, like nested dicts.
    root["all together"] = co.Parallel()
    root["all together"]["a"] = co.Exec("echo step 1, image bar", image="bar")
    root["all together"]["b"] = co.Exec("echo step 1, image foo")

    # This subtree overrides the image; its children inherit "bar".
    root["one at a time"] = co.Serial(image="bar")
    root["one at a time"]["c"] = co.Exec("echo step 2, image bar")
    root["one at a time"]["d"] = co.Exec("echo step 3, image bar")

    return root
def run() -> co.Serial:
    """Download data from the US EIA, then visualize some datasets."""
    with co.Serial(image=IMG, doc=co.util.magic_doc()) as output:
        # First download some data from the US Energy Information Administration.
        output["Download"] = co.Exec(DOWNLOAD_COMMAND)
        # Then make a few different visualizations of it.
        display_node = co.Parallel()
        output["Display"] = display_node
        for dataset in DATASETS:
            # co.Exec shorthand for native Python functions: this calls
            # `display(dataset)` in an Exec node, equivalent to
            # `python pipeline.py display --dataset={dataset}`.
            display_node[dataset] = co.Exec(display, dataset)
    return output
def test() -> co.Serial:
    """
    Check if both redis and flask are available. Then see if they're working.
    """
    with co.Serial(image=test_img) as suite:
        # Both availability checks are independent, so run them in parallel.
        with co.Parallel(name="services up?") as check:
            check["redis up?"] = co.Exec(TEST_REDIS_CMD)
            check["flask up?"] = co.Exec(TEST_FLASK_CMD)
        suite["integration test"] = co.Exec(INTEGRATION_TEST_CMD)
    return suite
def all_by_actor(actor) -> co.Parallel:
    """
    Return a pipeline listing all Netflix shows with an actor.
    Call with co.Lazy to generate pipeline at runtime.

    Fix: missing `cast` entries are now handled with `str.contains(...,
    na=False)` — the documented pandas parameter — instead of the fragile
    `| False` NaN-coercion trick.
    """
    df = _load_data()
    titles = df[df.cast.str.contains(actor, na=False)].title
    output = co.Parallel()
    for title in titles:
        # NOTE(review): repr() is not shell-safe quoting for arbitrary
        # titles; shlex.quote would be more robust — confirm titles are tame.
        output[title] = co.Exec(
            f"python pipeline.py for_title {repr(title)}"
        )
    return output
def parallelize_reps(reps: int) -> co.Parallel:
    """
    Build a Parallel node with one Serial child per replicate.

    Each replicate runs the experiment twice — first with a randomized
    conveyor-belt world ('p1', unpredictable), then without ('p0',
    predictable) — and stores the resulting LOD_data.csv under a
    rep-specific name via conducto-perm-data.

    Fixes: removed unused locals (`data_size`, `min_rep`, `max_rep`) and
    the leftover debug `print`.
    """
    output = co.Parallel()
    for rep_i in range(reps):
        rep_node = co.Serial()
        # randomize=1 is the unpredictable world, randomize=0 the predictable one.
        for randomize in (1, 0):
            label = f"p{randomize}"
            rep_node[label] = co.Exec(
                f"{experiment_command} GLOBAL-randomSeed {rep_i} "
                f"WORLD_CONVEYORBELT-randomize {randomize} && "
                f"conducto-perm-data put --name rep{rep_i}{label} --file LOD_data.csv"
            )
        output[f"rep{rep_i}"] = rep_node
    return output