def wf_summary(metadata, opts):
    overall_wf_attributes = ('id', 'status', 'workflowName', 'workflowRoot',
                             'submission', 'start')
    (wf_id, wf_status, wf_name, wf_root, wf_submission, wf_start) = \
        [metadata[x] for x in overall_wf_attributes]
    wf_end = get('end', metadata, default="-")

    puts('')
    puts("ID         : {}".format(wf_id))
    puts("Status     : {}".format(wf_status))
    puts("Submit Time: {} (UTC)".format(wf_submission))
    puts("Start Time : {} (UTC)".format(wf_start))
    puts("End Time   : {} (UTC)".format(wf_end))
    puts("Root       : {}".format(wf_root))
    puts('')

    (calls, states, stats) = _get_wf_call_statuses(metadata)

    table = []
    for c in calls:
        counts = [stats[c][s] for s in states]
        row = [c]
        row.extend(counts)
        table.append(row)

    headers = ['call']
    headers.extend([s for s in states])
    print(tabulate(table, headers=headers))
def assemble(pair):
    a, b = pair
    if a is not None:
        joined = get(on_left, a)
    else:
        joined = get(on_right, b)
    if a is not None:
        left_entries = get(left_self_columns, a)
    else:
        left_entries = (None,) * (len(t.lhs.fields) - len(on_left))
    if b is not None:
        right_entries = get(right_self_columns, b)
    else:
        right_entries = (None,) * (len(t.rhs.fields) - len(on_right))
    return joined + left_entries + right_entries
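# A minimal, self-contained sketch of the padding pattern assemble relies on.
# The index lists and rows below are hypothetical stand-ins for the closure
# variables (on_left, left_self_columns, right_self_columns, t) that the join
# machinery above builds; they are not part of the original snippet.
from toolz import get

on_left = [0]               # join-key position(s) in the left row
left_self_columns = [1]     # non-key positions in the left row
right_self_columns = [1]    # non-key positions in the right row

left_row = ('alice', 100)
right_row = None            # no right-hand match, so outer-join padding applies

joined = get(on_left, left_row)                    # ('alice',)
left_entries = get(left_self_columns, left_row)    # (100,)
right_entries = (None,) * len(right_self_columns)  # (None,)
print(joined + left_entries + right_entries)       # ('alice', 100, None)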
def _get_wf_call_failures(metadata, opts):
    calls = []
    if 'calls' in opts:
        calls = opts['calls'].split(',')
    else:
        calls = metadata['calls'].keys()

    jobids = None
    if 'jobids' in opts:
        jobids = set(opts['jobids'].split(','))

    fails = {}
    for c in calls:
        tasks = metadata['calls'][c]
        failures = pipe(
            tasks,
            filter(lambda x: get('executionStatus', x) == 'Failed'),
            filter(lambda x: _valid_job_id(jobids, get('jobId', x))),
            map(
                lambda x: {
                    'jobId': get('jobId', x),
                    # 'inputs' : get('inputs', x),
                    'stderr': get('stderr', x),
                    'shard': get('shardIndex', x),
                    'err_msg': get_in(['failures', 0, 'message'], x, 'NA'),
                    # 'jes' : get('jes', x),
                    # 'runtime' : get('runtimeAttributes', x),
                    'rc': get('returnCode', x, 'NA'),
                }),
            list)
        fails[c] = failures

    return fails
def array2sbow(array, zero_tol=1e-07):
    """
    Convert an array to Gensim "sparse bag-of-words" format

    This isn't actually needed for wrapping Gensim
    """
    if sps.issparse(array):
        array = array.tocoo()
        coo_dta = zip(array.row, array.col, array.data)
        for _, grp in it.groupby(coo_dta, key=tzc.get(0)):
            yield tuple((j, value) for _, j, value in grp)
    else:
        for row in array:
            # keep only entries whose magnitude exceeds the zero tolerance
            yield tuple((j, value) for j, value in enumerate(row)
                        if abs(value) > zero_tol)
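# A hypothetical usage sketch of array2sbow above: plain nested lists stand in
# for a dense array (the `dense` name is made up), and each row comes back as
# Gensim-style (index, value) pairs with near-zero entries dropped. Assumes the
# sps/it/tzc aliases the function relies on are imported as in its own module.
dense = [[0.0, 2.0, 0.0],
         [1.0, 0.0, 3.0]]
print(list(array2sbow(dense)))
# [((1, 2.0),), ((0, 1.0), (2, 3.0))]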
def rowfunc(t):
    """ Rowfunc provides a function that can be mapped onto a sequence.

    >>> accounts = TableSymbol('accounts', '{name: string, amount: int}')
    >>> f = rowfunc(accounts['amount'])
    >>> row = ('Alice', 100)
    >>> f(row)
    100

    See Also:
        compute<Rowwise, Sequence>
    """
    from cytoolz.curried import get
    indices = [t._child.fields.index(col) for col in t.fields]
    return get(indices)
def rowfunc(t):
    """ Rowfunc provides a function that can be mapped onto a sequence.

    >>> accounts = symbol('accounts', 'var * {name: string, amount: int}')
    >>> f = rowfunc(accounts['amount'])
    >>> row = ('Alice', 100)
    >>> f(row)
    100

    See Also:
        compute<Rowwise, Sequence>
    """
    from cytoolz.curried import get
    indices = [t._child.fields.index(col) for col in t.fields]
    return get(indices)
def _get_wf_call_statuses(metadata):
    calls = metadata['calls'].keys()
    states = set([])
    call_stats = {}
    for c in calls:
        tasks = metadata['calls'][c]
        counts = pipe(tasks, map(get('executionStatus')), frequencies)
        new_states = list(filter(lambda x: x not in states, counts.keys()))
        if new_states:
            for s in new_states:
                states.add(s)
        call_stats[c] = counts

    base_states = {s: 0 for s in states}
    final_stats = valmap(lambda d: merge(base_states, d), call_stats)
    return (calls, sorted(states), final_stats)
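# A hypothetical usage sketch of _get_wf_call_statuses with a toy metadata
# dict shaped like the Cromwell responses these helpers expect. The call names
# and statuses below are made up, and the curried toolz imports the function
# relies on (pipe, map, get, frequencies, valmap, merge) are assumed in scope.
toy_metadata = {
    'calls': {
        'wf.align': [{'executionStatus': 'Done'},
                     {'executionStatus': 'Failed'}],
        'wf.call_variants': [{'executionStatus': 'Done'}],
    }
}
calls, states, stats = _get_wf_call_statuses(toy_metadata)
print(sorted(calls))              # ['wf.align', 'wf.call_variants']
print(states)                     # ['Done', 'Failed']
print(stats['wf.call_variants'])  # {'Done': 1, 'Failed': 0} (key order may vary)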
# In[29]:

clf.fit(docs, labels)
clf.predict(docs)


# In[30]:

def get_step_by_name(pipe, name):
    return [trans for name_, trans in pipe.steps if name_.startswith(name)][0]


# In[31]:

cnt_vects_pipe = get_step_by_name(tfidf_pipe, "cnt_vects")
cnt_vects = [
    get_step_by_name(pipe, "cnt_vect_")
    for _name, pipe in cnt_vects_pipe.transformer_list
]
vocabulary_map = pipe(
    enumerate(concat(cnt_vect.vocabulary_ for cnt_vect in cnt_vects)),
    groupby(get(1)),
    valmap(lambda vals: list(pluck(0, vals))),
)
vocabulary_map


# In[ ]:
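# A self-contained sketch of the same toolz pipeline used for vocabulary_map
# above, with toy vocabularies standing in for the fitted CountVectorizer
# objects (the terms and indices here are made up for illustration).
from toolz.curried import concat, get, groupby, pipe, pluck, valmap

vocabularies = [
    {"apple": 0, "banana": 1},   # vocabulary_ of the first vectorizer
    {"banana": 0, "cherry": 1},  # vocabulary_ of the second vectorizer
]

# Number the concatenated terms, group by term, keep only the column positions.
term_to_columns = pipe(
    enumerate(concat(vocabularies)),
    groupby(get(1)),
    valmap(lambda vals: list(pluck(0, vals))),
)
term_to_columns
# {'apple': [0], 'banana': [1, 2], 'cherry': [3]}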
from toolz.curried import (pipe, map, filter, get, groupby, reduceby,
                           valmap)

accounts = [
    (1, 'Alice', 100, 'F'),  # id, name, balance, gender
    (2, 'Bob', 200, 'M'),
    (3, 'Charlie', 150, 'M'),
    (4, 'Dennis', 50, 'M'),
    (5, 'Edith', 300, 'F'),
]

# I. SELECTING WITH `MAP()` AND `FILTER()`
# SELECT name, balance FROM accounts WHERE balance > 150

# Functional version with pipeline and curry
acc1 = pipe(accounts,
            filter(lambda account: account[2] > 150),
            map(get([1, 2])),
            list)
print(acc1)

# List comprehensions version (more Pythonic):
acc2 = [(name, balance) for (id, name, balance, gender) in accounts
        if balance > 150]
print(acc2)

# II. SPLIT-APPLY-COMBINE WITH `GROUPBY` AND `REDUCEBY`:
# 1. Split the dataset into groups by some property
# 2. Reduce each of the groups with some synopsis function

# In-memory split-apply-combine
# SELECT gender, SUM(balance) FROM accounts GROUP BY gender;
print(groupby(get(3), accounts))
# {'M': [(2, 'Bob', 200, 'M'), (3, 'Charlie', 150, 'M'), (4, 'Dennis', 50, 'M')],
#  'F': [(1, 'Alice', 100, 'F'), (5, 'Edith', 300, 'F')]}
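# A sketch of the reduce step the REDUCEBY heading above refers to: toolz's
# reduceby splits and reduces in one pass, finishing
#   SELECT gender, SUM(balance) FROM accounts GROUP BY gender;
# (reuses the accounts list and the curried get/reduceby imported above).
print(reduceby(get(3), lambda total, account: total + account[2], accounts, 0))
# {'F': 400, 'M': 400}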
def get_colnames(self):
    """
    Get the column names that would be assigned to a DataFrame of the result
    """
    return list(
        map(tzc.get(1), sorted(self.model_.items(), key=tzc.get(0))))[:5]