def start(self):
    """
    Use the Metaflow client to retrieve the latest successful run from our
    MovieStatsFlow and assign its results as data artifacts in this flow.

    This step uses 'conda' to isolate the environment. This step will always
    use pandas==1.3.3 regardless of what is installed on the system.
    """
    from metaflow import Flow, get_metadata

    # Print metadata provider
    print("Using metadata provider: %s" % get_metadata())

    # Load the analysis from the MovieStatsFlow.
    run = Flow("MovieStatsFlow").latest_successful_run
    print("Using analysis from '%s'" % str(run))

    # Get the dataframe from the start step, before we sliced it into
    # genre-specific dataframes.
    self.dataframe = run["start"].task.data.dataframe

    # Also grab the summary statistics.
    self.genre_stats = run.data.genre_stats

    # Compute our two recommendation types in parallel.
    self.next(self.bonus_movie, self.genre_movies)
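# A minimal sketch of how the same artifacts could be inspected outside the
# flow, e.g. from a notebook, using the Metaflow client. "MovieStatsFlow" and
# its 'dataframe' and 'genre_stats' artifacts are taken from the step above.
from metaflow import Flow

run = Flow("MovieStatsFlow").latest_successful_run
print(run["start"].task.data.dataframe.head())  # raw dataframe from 'start'
print(run.data.genre_stats)                     # summary statistics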
def eval(self):
    with profile("Evaluating: %s" % self.model_name):
        mod = MODELS[self.model_name]
        data_run = Flow('TaxiRegressionDataFlow')[self.data_run_id]
        model = mod.load_model(self.model)
        self.mse = mod.mse(model, data_run.data.test_data)
    self.next(self.join)
def train(self):
    self.model_name = self.input
    with profile('Training model: %s' % self.model_name):
        mod = MODELS[self.model_name]
        data_run = Flow('TaxiRegressionDataFlow')[self.data_run_id]
        model = mod.fit(data_run.data.train_data)
        self.model = mod.save_model(model)
    self.next(self.eval)
def start(self):
    flow = Flow('TrainModels').latest_successful_run
    print('using data from flow: %s' % flow.id)
    self.model = flow.data.simple_rf
    self.config = Config(**yaml.safe_load(self.config_file))
    self.next(self.end)
def start(self):
    run = Flow('TaxiRegressionDataFlow').latest_run
    self.data_run_id = run.id
    self.features = run.data.features
    # Build only the models whose required features are all present
    # in the dataset.
    self.models = [name for name, model in MODELS.items()
                   if all(feat in self.features
                          for feat in model.FEATURES)]
    print("Building models: %s" % ', '.join(self.models))
    self.next(self.train, foreach='models')
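# The train/eval/start steps above assume a MODELS registry that maps a model
# name to an object exposing FEATURES, fit(), save_model(), load_model() and
# mse(). That interface is inferred from the calls above; a hypothetical
# entry might look like this sketch:
import pickle

class SimpleLinearModel:
    FEATURES = ['trip_distance', 'passenger_count']  # assumed feature names

    @staticmethod
    def fit(train_data):
        ...  # train on the data and return a fitted model object

    @staticmethod
    def save_model(model):
        return pickle.dumps(model)  # serialize so the flow can store it as an artifact

    @staticmethod
    def load_model(blob):
        return pickle.loads(blob)

    @staticmethod
    def mse(model, test_data):
        ...  # return the mean squared error on the held-out data

MODELS = {'simple_linear': SimpleLinearModel}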
def start(self):
    flow = Flow('PreprocessPaginate').latest_successful_run
    print('using data from flow: %s' % flow.id)
    self.signals = flow.data.signals_df

    flow = Flow('PrepareFeatures').latest_successful_run
    print('using data from flow: %s' % flow.id)
    self.features = flow.data.recent_annoated_simple_features

    flow = Flow('DeployModel').latest_successful_run
    print('using data from flow: %s' % flow.id)
    self.model = flow.data.model

    self.config = Config(**yaml.safe_load(self.config_file))
    self.next(self.predict_vocab)
def start(self):
    audit = Flow("VersioningDemo").latest_successful_run.data.dolt
    master_conf = DoltConfig(database="foo")

    # Read 'bar' pinned to the versions recorded in the previous run's audit.
    with DoltDT(run=self, audit=audit) as dolt:
        self.df1 = dolt.read("bar", as_key="bar1")

    # Read 'bar' from the current master database for comparison.
    with DoltDT(run=self, config=master_conf) as dolt:
        self.df2 = dolt.read("bar", as_key="bar2")

    self.next(self.end)
def start(self):
    flow = Flow('PrepareFeatures').latest_successful_run
    print('using data from flow: %s' % flow.id)
    self.features = flow.data.annoated_simple_features
    self.config = Config(**yaml.safe_load(self.config_file))
    self.next(self.train_simple_rf_model)
def start(self):
    flow = Flow('PreprocessPaginate').latest_successful_run
    print('using data from flow: %s' % flow.id)
    self.signals_df = flow.data.signals_df
    self.clean_signals_df = flow.data.clean_signals_df
    self.config = Config(**yaml.safe_load(self.config_file))
    self.next(self.prepare_simple_features)
def start(self):
    flow = Flow('Download').latest_successful_run
    print('using data from flow: %s' % flow.id)
    self.books = flow.data.books
    self.logs = flow.data.logs
    self.config = Config(**yaml.safe_load(self.config_file))
    self.next(self.preprocess_pages_df)
def resolve_task_from_pathspec(flow_name, pathspec):
    """
    Resolve a Task object for the pathspec query on the CLI.

    Args:
        flow_name (str): name of the flow
        pathspec (str): can be `stepname` / `runid/stepname` / `runid/stepname/taskid`

    Returns:
        metaflow.Task | None
    """
    from metaflow import Flow, Step, Task, namespace
    from metaflow.exception import MetaflowNotFound

    # The pathspec can come in several variations.
    pthsplits = pathspec.split("/")
    task = None
    run_id = None
    resolving_from = "task_pathspec"
    if len(pthsplits) == 1:
        # This means: stepname
        resolving_from = "stepname"
        latest_run = Flow(flow_name).latest_run
        if latest_run is not None:
            run_id = latest_run.pathspec
            try:
                task = latest_run[pathspec].task
            except KeyError:
                pass
    elif len(pthsplits) == 2:
        # This means: runid/stepname
        namespace(None)
        resolving_from = "step_pathspec"
        try:
            task = Step("/".join([flow_name, pathspec])).task
        except MetaflowNotFound:
            pass
    elif len(pthsplits) == 3:
        # This means: runid/stepname/taskid
        namespace(None)
        resolving_from = "task_pathspec"
        try:
            task = Task("/".join([flow_name, pathspec]))
        except MetaflowNotFound:
            pass
    else:
        # Raise an exception for an invalid pathspec format.
        raise CommandException(
            msg="The PATHSPEC argument should be of the form 'stepname' or "
            "'<runid>/<stepname>' or '<runid>/<stepname>/<taskid>'"
        )
    if task is None:
        # The task could not be resolved for the query.
        raise TaskNotFoundException(pathspec, resolving_from, run_id=run_id)
    return task
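# A hedged usage sketch for resolve_task_from_pathspec; the flow name and the
# run/step/task ids below are hypothetical placeholders.
task = resolve_task_from_pathspec("HelloFlow", "start")         # latest run's 'start' step
task = resolve_task_from_pathspec("HelloFlow", "1234/start")    # specific run
task = resolve_task_from_pathspec("HelloFlow", "1234/start/5")  # specific task
print(task.pathspec)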
def start(self):
    from itertools import combinations
    from movie_recs_util import make_batches, top_movies

    run = Flow('MovieTrainFlow').latest_successful_run
    self.movie_names = run['start'].task['movie_names'].data
    self.model_run = run.pathspec
    print('Using model from', self.model_run)

    model_users_mtx = run['start'].task['model_users_mtx'].data
    self.top_movies = top_movies(model_users_mtx, self.num_top_movies)
    self.pairs = make_batches(combinations(self.top_movies, 2))
    self.next(self.batch_recommend, foreach='pairs')
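# make_batches and top_movies come from movie_recs_util, which is not shown
# here. A minimal sketch of what make_batches might do, assuming it chunks an
# iterable of movie pairs into fixed-size lists for the foreach fan-out:
from itertools import islice

def make_batches(pairs, batch_size=1000):
    # Materialize fixed-size batches so each foreach task handles one batch.
    it = iter(pairs)
    batches = []
    while True:
        batch = list(islice(it, batch_size))
        if not batch:
            break
        batches.append(batch)
    return batches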
def list_many_cards(
    ctx,
    type=None,
    hash=None,
    card_id=None,
    follow_resumed=None,
    as_json=None,
):
    import json
    from metaflow import Flow

    flow = Flow(ctx.obj.flow.name)
    run = flow.latest_run
    cards_found = 0
    if not as_json:
        ctx.obj.echo("Listing cards for run %s" % run.pathspec, fg="green")
    js_list = []
    for step in run:
        step_str_printed = False  # variable to control printing the step name once
        for task in step:
            try:
                available_card_paths, card_datastore, pathspec = resolve_card(
                    ctx,
                    "/".join(task.pathspec.split("/")[1:]),
                    type=type,
                    hash=hash,
                    card_id=card_id,
                    follow_resumed=follow_resumed,
                    no_echo=True,
                )
                if not step_str_printed and not as_json:
                    ctx.obj.echo("Step : %s" % step.id, fg="green")
                    step_str_printed = True
                js_resp = list_available_cards(
                    ctx,
                    pathspec,
                    available_card_paths,
                    card_datastore,
                    command=None,
                    show_list_as_json=as_json,
                    list_many=True,
                )
                if as_json:
                    js_list.append(js_resp)
                cards_found += 1
            except CardNotPresentException:
                pass
    if cards_found == 0:
        raise CardNotPresentException(
            run.pathspec, card_hash=hash, card_type=type, card_id=card_id
        )
    if as_json:
        print(json.dumps(js_list, indent=4))
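# This function backs Metaflow's card CLI. A hedged invocation sketch, where
# the flow file name is a hypothetical placeholder:
#   python myflow.py card list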
def get_run_stats(min_num_epochs=100, min_demos=100):
    save_objs = []
    for run in Flow('TrainingSimulatorFlow').runs():
        if not run.finished:
            continue
        # Steps are returned latest-first, so the last element is the 'start'
        # step, which holds the flow's initial parameters.
        flow_init_datum = list(run.steps())[-1].task.data
        # Capture runs with at least min_demos demos and min_num_epochs epochs.
        if (flow_init_datum.num_demos >= min_demos
                and flow_init_datum.num_epochs >= min_num_epochs):
            nw_objs = [data.to_json() for data in run.data.final_data]
            save_objs = save_objs + nw_objs
    return save_objs
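# Hedged usage sketch: collect the JSON stats from sufficiently long runs and
# dump them to a file (the filename is a hypothetical example).
import json

stats = get_run_stats(min_num_epochs=100, min_demos=100)
with open('run_stats.json', 'w') as f:
    json.dump(stats, f, indent=2)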
def start(self):
    import yaml

    flow = Flow('Download').latest_successful_run
    print('using users data from flow: %s' % flow.id)
    self.users = flow.data.users
    self.vocab_skills = flow.data.vocab_skills

    flow = Flow('PredictVocab').latest_successful_run
    print('using vocab data from flow: %s' % flow.id)
    self.known_words_df = flow.data.known_words_df

    flow = Flow('PreprocessPaginate').latest_successful_run
    print('using signals data from flow: %s' % flow.id)
    self.signals_df = flow.data.signals_df
    self.clean_pages_df = flow.data.clean_pages_df

    self.config = Config(**yaml.safe_load(self.config_file))
    self.next(self.generate_stats)
def start(self):
    if self.use_ctas:
        self.paths = Flow('TaxiETLFlow').latest_run.data.paths
    else:
        with S3() as s3:
            objs = s3.list_recursive(URLS)
            self.paths = [obj.url for obj in objs]
    print("Processing %d Parquet files" % len(self.paths))
    # Split the paths into NUM_SHARDS roughly equal shards; the last shard
    # absorbs any remainder.
    n = max(round(len(self.paths) / NUM_SHARDS), 1)
    self.shards = [
        self.paths[i * n:(i + 1) * n] for i in range(NUM_SHARDS - 1)
    ]
    self.shards.append(self.paths[(NUM_SHARDS - 1) * n:])
    self.next(self.preprocess_data, foreach='shards')
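# A toy illustration of the sharding arithmetic above: with 3 shards and 10
# paths, the first two shards get 3 paths each and the last absorbs the
# remaining 4.
num_shards = 3
paths = ['p%d' % i for i in range(10)]
n = max(round(len(paths) / num_shards), 1)
shards = [paths[i * n:(i + 1) * n] for i in range(num_shards - 1)]
shards.append(paths[(num_shards - 1) * n:])
print([len(s) for s in shards])  # [3, 3, 4]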
def start(self):
    with DoltDT(run=self, database='foo', branch="master") as dolt:
        self.df = dolt.read_table('bar')

    first_run = Flow("SucceedsFirstDemo").latest_successful_run
    first_run_ts = datetime.datetime.strptime(
        first_run.finished_at, "%Y-%m-%dT%H:%M:%SZ"
    )
    one_minute_ago = (datetime.datetime.now()
                      + datetime.timedelta(hours=8)
                      - datetime.timedelta(minutes=1))
    if first_run_ts < one_minute_ago:
        raise Exception(
            "Run `FirstDemo` within one minute of `SecondDemo`")
    self.next(self.middle)
def list_flows(names="all"):
    columns = ["flow", "id", "start", "finish"]
    if isinstance(names, str) and names == "all":
        names = [flow.pathspec for flow in Metaflow().flows]
    rows = []
    for name in names:
        for run in Flow(name):
            run_id = run.path_components[1]
            rows.append([name, run_id, run.created_at, run.finished_at])
    # Build the frame in one shot; pd.DataFrame.append was removed in pandas 2.
    df = pd.DataFrame(rows, columns=columns)
    return df.sort_values(by=columns[:2])
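# Hedged usage sketch: list every run of every flow known to the current
# metadata provider, or restrict to specific flows (the flow name below is
# a hypothetical placeholder).
all_runs = list_flows()
some_runs = list_flows(names=["HelloFlow"])
print(some_runs.head())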
def start(self):
    """
    Load the test data set.
    """
    from io import StringIO

    # Load the data set into a pandas dataframe.
    self.X = pd.read_csv(StringIO(self.test_data))
    print('run id: ', self.run_id)
    if self.run_id == 'latest_successful':
        self.train_run = Flow('TitanicModeling').latest_successful_run
    else:
        self.train_run = Run(f'TitanicModeling/{self.run_id}')
    # Run our two preprocessing steps in parallel.
    self.next(self.categorical_prep, self.numerical_prep)
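# Hedged usage sketch, assuming run_id is declared as a Parameter on the flow
# so it can be supplied on the command line (the flow file name is a
# hypothetical placeholder):
#   python titanic_predict.py run --run_id latest_successful
#   python titanic_predict.py run --run_id 1657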
def start(self):
    """
    Use the Metaflow client to retrieve the latest successful run from our
    MovieStatsFlow and assign its results as data artifacts in this flow.
    """
    from metaflow import Flow, get_metadata

    # Print metadata provider
    print("Using metadata provider: %s" % get_metadata())

    # Load the analysis from the MovieStatsFlow.
    run = Flow("MovieStatsFlow").latest_successful_run
    print("Using analysis from '%s'" % str(run))

    self.genre_stats = run.data.genre_stats

    # Compute our two recommendation types in parallel.
    self.next(self.bonus_movie, self.genre_movies)
def __init__(self):
    self.run = Flow('MovieTrainFlow').latest_successful_run
    (self.model_ann,
     self.model_users_mtx,
     self.model_movies_mtx) = load_model(self.run)
    self.names = load_movie_names()
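# load_model and load_movie_names are helpers not shown here. A minimal
# sketch of load_model, assuming the three model artifacts are stored by the
# 'start' step of MovieTrainFlow (the artifact names mirror the attributes
# above and the access pattern in the earlier MovieTrainFlow snippet):
def load_model(run):
    task = run['start'].task
    return (
        task['model_ann'].data,
        task['model_users_mtx'].data,
        task['model_movies_mtx'].data,
    )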
from metaflow import Flow, get_metadata

# Print metadata provider
print("Using metadata provider: %s" % get_metadata())

# Load the analysis from the GenreStatsFlow.
run = Flow('GenreStatsFlow').latest_successful_run
print("Using analysis from '%s'" % str(run))

genre_stats = run.data.genre_stats
print(genre_stats)
def start(self):
    self.model = Flow('FirstFlow').latest_run.data.model
    print('model:', self.model)
    self.next(self.end)
def start(self):
    run = Flow('ClassifierTrainFlow').latest_run
    self.train_run_id = run.pathspec
    self.model = run['end'].task.data.model
    print("Input vector", self.vector)
    self.next(self.end)
from metaflow import Flow, get_metadata
from metaflow.datatools.dolt import DoltDT
from doltpy.core import Dolt


def print_data_map(data_map):
    for run_step in data_map.keys():
        for table in data_map[run_step]:
            print('{}, {}'.format(run_step, table))
            # print(data_map[run_step][table])


print("Current metadata provider: %s" % get_metadata())

doltdb_path = './imdb-reviews'
flow = Flow('IMDBSentimentsFlow')
run = flow.latest_successful_run
print("Using run: %s" % str(run))

'''
Ex 1: Get all the inputs used by a specific run of a flow
'''
# doltdt = DoltDT(run, doltdb_path, 'master')
# data_map_for_run = doltdt.get_reads(steps=['start'])
# print_data_map(data_map_for_run)

'''
Ex 2: Get all the inputs used by a specific step of a run of a flow
'''
# doltdt = DoltDT(run, doltdb_path, 'vinai/add-rotten-data')
# data_map_for_run = doltdt.get_reads(steps=['start'])
# print_data_map(data_map_for_run)

'''
Ex 3: Outputs are handled identically
'''