def _get_wf_call_failures(metadata, opts):
    calls = []
    if 'calls' in opts:
        calls = opts['calls'].split(',')
    else:
        calls = metadata['calls'].keys()
    jobids = None
    if 'jobids' in opts:
        jobids = set(opts['jobids'].split(','))
    fails = {}
    for c in calls:
        tasks = metadata['calls'][c]
        failures = pipe(
            tasks,
            filter(lambda x: get('executionStatus', x) == 'Failed'),
            filter(lambda x: _valid_job_id(jobids, get('jobId', x))),
            map(lambda x: {
                'jobId': get('jobId', x),
                # 'inputs': get('inputs', x),
                'stderr': get('stderr', x),
                'shard': get('shardIndex', x),
                'err_msg': get_in(['failures', 0, 'message'], x, 'NA'),
                # 'jes': get('jes', x),
                # 'runtime': get('runtimeAttributes', x),
                'rc': get('returnCode', x, 'NA'),
            }),
            list)
        fails[c] = failures
    return fails
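# A minimal usage sketch with synthetic Cromwell-style metadata; the field
# layout is inferred from the accessors above, and _valid_job_id is assumed
# to accept every task when no jobid filter is given.
metadata = {'calls': {'wf.align': [{
    'executionStatus': 'Failed',
    'jobId': 'op-123',
    'stderr': 'gs://logs/align/stderr',
    'shardIndex': -1,
    'returnCode': 1,
    'failures': [{'message': 'task exited with code 1'}],
}]}}
print(_get_wf_call_failures(metadata, {}))
# {'wf.align': [{'jobId': 'op-123', 'stderr': 'gs://logs/align/stderr',
#                'shard': -1, 'err_msg': 'task exited with code 1', 'rc': 1}]}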
def find_domain_urls(self, domain: str) -> List[str]:
    """
    Get all known urls for domain.

    Returns
    -------
    all_urls : list of str
    """
    def _urlkey_to_url(urlkey):
        try:
            # very rare bugged urlkeys appear
            domain, path = urlkey.split(')/', 1)
        except ValueError:
            return
        domain = domain.split(',')
        domain.reverse()
        domain = '.'.join(domain)
        if path:
            return '/'.join([domain, path])
        return domain

    urls_by_index = map(
        lambda ind: self.__get_domain_urls_in_index(ind, domain),
        self.indexes)
    all_urls = pipe(urls_by_index,
                    concat,
                    map(bytes.decode),
                    map(_urlkey_to_url),
                    filter(None),
                    map(unquote),
                    map(lambda x: x.strip()),
                    unique,
                    list)
    return all_urls
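# A standalone sketch of the SURT urlkey reversal handled by _urlkey_to_url
# above: CDX indexes store keys like 'com,example)/about', with host
# segments comma-separated in reverse order.
urlkey = 'com,example)/about'
host, path = urlkey.split(')/', 1)
host = '.'.join(reversed(host.split(',')))
print('/'.join([host, path]) if path else host)  # example.com/about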
def __call__(self, epoch):
    phase = epoch % self.period
    turn_phase, ratio = self.turning_point
    turn_cyclic = self.min_factor + self.range * ratio
    if phase <= turn_phase:
        cyclic = (
            self.min_factor
            + (turn_cyclic - self.min_factor) * phase / turn_phase
        )
    else:
        cyclic = turn_cyclic + \
            (self.max_factor - turn_cyclic) * \
            (phase - turn_phase) / (self.period - turn_phase)
    gamma = pipe(
        self.milestones,
        filter(lambda x: x[0] <= epoch),
        map(lambda x: x[1]),
        last
    )
    return cyclic * gamma
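# A minimal sketch of the milestone lookup used for gamma above: take the
# value of the latest milestone whose epoch is not after the current one
# (milestone values here are illustrative).
from cytoolz.curried import pipe, filter, map, last

milestones = [(0, 1.0), (10, 0.5), (20, 0.25)]
epoch = 15
gamma = pipe(milestones,
             filter(lambda x: x[0] <= epoch),
             map(lambda x: x[1]),
             last)
assert gamma == 0.5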
def get_hashtag_string(given_item):
    """Return a string of hashtags associated with the given item"""
    return tz.pipe(
        tz.get_in(['entities', 'hashtags'], given_item, default=[]),
        tz.map(lambda x: tz.get_in(['text'], x, default=None)),
        tz.filter(lambda x: x is not None),
        lambda x: ", ".join(x))
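# A quick usage sketch with a synthetic tweet payload; the nested
# entities/hashtags layout matches what the accessors above expect.
tweet = {'entities': {'hashtags': [{'text': 'python'}, {'text': 'toolz'}]}}
assert get_hashtag_string(tweet) == "python, toolz"
assert get_hashtag_string({}) == ""  # missing path falls back to the default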
def serde_with_class(cls):
    from_fields = list(
        map(lambda a: (a, get_in([from_key], a.metadata, [a.name])),
            fields(cls)))
    to_fields = pipe(
        fields(cls),
        map(lambda a: (a, get_in([to_key], a.metadata))),
        filter(lambda f: f[1]),
        list,
    )

    def from_dict(d):
        return cls(**dict(
            map(
                lambda f: (f[0].name, get_in(f[1], d, f[0].default)),
                from_fields,
            )))

    def to_dict(self):
        d = asdict(self)
        return reduce(
            lambda acc, f: update_in(acc, f[1], lambda _: d[f[0].name]),
            to_fields,
            {},
        )

    cls.from_dict = staticmethod(from_dict)
    cls.to_dict = to_dict
    return cls
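# A hedged usage sketch for serde_with_class with attrs, assuming the
# module-level from_key/to_key hold the metadata key names ('from'/'to'
# here are placeholders, not confirmed by the source).
import attr

from_key, to_key = 'from', 'to'


@serde_with_class
@attr.s
class Point:
    x = attr.ib(default=0, metadata={'from': ['pos', 'x'], 'to': ['pos', 'x']})
    y = attr.ib(default=0, metadata={'from': ['pos', 'y'], 'to': ['pos', 'y']})


p = Point.from_dict({'pos': {'x': 1, 'y': 2}})
assert p.to_dict() == {'pos': {'x': 1, 'y': 2}}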
def to_dict(self, convert_values: bool = False) -> MutableMapping[str, Any]:
    to_fields = curried.pipe(
        fields(self.__class__),
        curried.map(lambda a: (a, curried.get_in([to_key], a.metadata))),
        curried.filter(lambda f: f[1]),
        list,
    )
    if convert_values:
        d = asdict(self)
    else:
        d = {a.name: getattr(self, a.name) for a in fields(self.__class__)}
    if not to_fields:
        return d
    return curried.reduce(
        lambda acc, f: curried.update_in(acc, f[1], lambda _: d[f[0].name]),
        to_fields,
        {},
    )
def get_categories(given_dict):
    """Return a string of the categories associated with a post"""
    return tz.pipe(
        tz.get_in(['object', 'tags'], given_dict, default=[]),
        tz.filter(lambda x: tz.get_in(['objectType'], x, default=None) == 'category'),
        tz.map(lambda x: tz.get_in(['displayName'], x, default=None)),
        lambda x: ", ".join(x)
    )
def parse_format_assignments(txt):
    assignments = thread_last(
        txt.split(';'),
        filter(lambda x: x.strip().lower().startswith('format')),
        mapcat(lambda x: x.lower().split('.')),
        map(lambda x: x.split()),  # break out vars and format
        (mapcat, lambda y: [(k, y[-1]) for k in y]),  # tuple of var, fmt
        dict
    )
    return assignments
def __init__(
        self,
        id,
        dataset_dir,
        output_dir,
        n_splits,
        base_train_config,
        folds,
):
    params = locals()
    torch.manual_seed(0)
    ids = pipe(range(n_splits), filter(lambda x: x in folds), list)
    train_df_path = delayed(load_train_df)(
        dataset_dir=join(dataset_dir, 'train'),
        output=join(output_dir, 'train.pqt'))
    train_df = delayed(pd.read_parquet)(train_df_path)
    kfolded = delayed(kfold)(train_df, n_splits)
    train_sets = pipe(
        ids,
        map(lambda x: delayed(lambda i: i[x])(kfolded)),
        list)
    model_paths = pipe(
        zip(ids, train_sets),
        map(lambda x: delayed(train_fusion)(
            **base_train_config,
            model_path=join(output_dir, f"{id}-fold-{x[0]}-base-model.pt"),
            sets=x[1],
            log_dir=f'{config["TENSORBORAD_LOG_DIR"]}/{id}/{x[0]}/base',
        )),
        list)
    test_df_path = load_test_df(
        dataset_dir='/store/tellus/test',
        output=join(output_dir, 'test.pqt'))
    test_df = delayed(pd.read_parquet)(test_df_path)
    test_dataset = delayed(TellusDataset)(
        test_df,
        has_y=False,
    )
    submission_df_path = delayed(predict)(
        model_paths=model_paths,
        log_dir=f'{config["TENSORBORAD_LOG_DIR"]}/{id}/sub',
        dataset=test_dataset,
        log_interval=10,
        out_path=f'{output_dir}/{id}_submission.tsv',
    )
    self.output = delayed(lambda x: x)((
        model_paths,
        submission_df_path,
    ))
def parse_questions(txt):
    rqt = re.compile(r'[\"\']')  # match quote chars
    assignments = thread_last(
        txt.split(';'),
        filter(lambda x: x.strip().lower().startswith('label')),
        mapcat(lambda x: x.lower().split('\n')),
        map(lambda x: x.split('=')),  # break out vars and format
        (map, lambda y: (y[0].strip().lower(), rqt.sub('', y[1].strip()))),  # tuple of var, fmt
        dict
    )
    return assignments
def test_kfold():
    output = load_train_df(
        dataset_dir='/store/tellus/train',
        output='/store/tmp/train.pqt'
    )
    df = pd.read_parquet(output)
    sets = kfold(df, n_splits=10)
    for s in sets:
        assert pipe(
            s['train_pos'],
            take(100),
            map(lambda x: x['label']),
            filter(lambda x: x == 0),
            list,
            len
        ) == 0
        assert pipe(
            s['val_pos'],
            take(100),
            map(lambda x: x['label']),
            filter(lambda x: x == 0),
            list,
            len
        ) == 0
        assert pipe(
            s['train_neg'],
            take(100),
            map(lambda x: x['label']),
            filter(lambda x: x == 1),
            list,
            len
        ) == 0
        assert pipe(
            s['val_neg'],
            take(100),
            map(lambda x: x['label']),
            filter(lambda x: x == 1),
            list,
            len
        ) == 0
        assert len(s) == 4
def parse_variable_labels(txt, repl, lbls_to_lower=True):
    b2d = curry(block2dict)(repl=repl, to_lower=lbls_to_lower)
    labels = thread_last(
        txt.split(';'),
        filter(lambda x: x.strip().lower().startswith('value')),
        map(lambda x: x.strip().split('\n')),
        map(lambda x: (x[0].split()[1].lower(), b2d(x[1:]))),
        dict
    )
    logger.info('parsed varlabels from format txt',
                nlabeled=len(labels.keys()),
                nrepl=len(repl.keys()))
    return labels
def block2dict(lines, repl, to_lower=False):
    f_lwr = str.lower if to_lower else identity
    f_repl = curry(lambda k, r: r[k] if k in r else k)(r=repl)
    rqt = re.compile(r'[\"\']')  # match quote chars
    rws = re.compile(r'\s')  # match whitespace
    # keep only alnum and a few unreserved symbols
    ruri = re.compile(r'(?![\w\s\-\_\.\'\$\-\+\(\)\/]|\.).')
    d = thread_last(
        lines,
        map(lambda x: x.replace('\x92', "'")),
        map(lambda x: rqt.sub('', x.strip()).split('=')),
        map(lambda x: (rws.sub('', x[0].strip()),
                       ruri.sub('', x[1].strip()))),
        filter(lambda x: x[0].find('-') == -1),  # no support for ranges
        (mapcat, lambda x: map(lambda y: (y, x[1]), x[0].split(','))),
        filter(lambda x: x[0].isnumeric()),  # remove non-numeric codes
        map(lambda x: (int(x[0]),  # cat codes are ints
                       pipe(x[1], f_lwr, f_repl))),
        dict
    )
    # d[-1] = np.nan  # use NA as a marker for unmapped vals
    return d
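# A worked example of block2dict on a SAS-style value block: quotes are
# stripped, comma lists of codes fan out, and codes are coerced to ints.
lines = ["1='Yes'", "2='No'", "7,9='Missing'"]
assert block2dict(lines, repl={}, to_lower=True) == \
    {1: 'yes', 2: 'no', 7: 'missing', 9: 'missing'}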
def connect_to_twitter_filtered_stream(stream_key, saveing_function):
    """Connect to & consume a filtered Twitter stream, where Twitter does
    some of the filtering"""
    stream = tz.pipe(
        ## Connect
        start_stream_twitter(**CONFIG['twitter_filter']),
        tz.map(print_twitter_stall_warning),
        ## Filter
        tz.filter(is_tweet),  # filter to tweets
        ## Parse
        tz.map(parse_tweet),  # parse into a flat dictionary
    )
    ## Collect
    saveing_function(stream_key, stream)
def connect_to_twitter_stream(stream_key, saveing_function):
    """Connect to & consume a Twitter stream"""
    stream = tz.pipe(
        ## Connect
        start_stream_twitter(),  # public sampled stream
        tz.map(print_twitter_stall_warning),
        ## Filter
        tz.filter(is_tweet),  # filter to tweets
        # tz.filter(is_user_lang_tweet(["en", "en-AU", "en-au", "en-GB", "en-gb"])),  # filter to English
        ## Parse
        tz.map(parse_tweet),  # parse into a flat dictionary
    )
    # Collect
    saveing_function(stream_key, stream)
def get_url_location(self, url: str) -> Optional[Dict]:
    """
    Get html location in index for url.
    """
    params = {
        'url': url,
        'output': 'json',
        'closest': self._cur_ts(),
        'filter': '!status:404',
        'fl': 'filename,length,offset,status,timestamp'
    }
    locations = pipe(self.indexes,
                     map(lambda index: self.__locate_url(index, params)),
                     filter(None),
                     concat,
                     list)
    if locations:
        location = self.__locate_most_relevant_location(locations)
        return location
    return None
def _get_wf_call_statuses(metadata):
    calls = metadata['calls'].keys()
    states = set()
    call_stats = {}
    for c in calls:
        tasks = metadata['calls'][c]
        counts = pipe(tasks, map(get('executionStatus')), frequencies)
        new_states = list(filter(lambda x: x not in states, counts.keys()))
        if new_states:
            for s in new_states:
                states.add(s)
        call_stats[c] = counts
    base_states = {s: 0 for s in states}
    final_stats = valmap(lambda d: merge(base_states, d), call_stats)
    return (calls, sorted(states), final_stats)
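# A minimal sketch of the status normalization with synthetic metadata:
# every call gets a zero count for each status seen anywhere in the run.
metadata = {'calls': {
    'wf.taskA': [{'executionStatus': 'Done'}, {'executionStatus': 'Failed'}],
    'wf.taskB': [{'executionStatus': 'Done'}],
}}
calls, states, stats = _get_wf_call_statuses(metadata)
assert states == ['Done', 'Failed']
assert stats['wf.taskB'] == {'Done': 1, 'Failed': 0}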
def __call__(self, epoch):
    phase = epoch % self.period
    turn_cyclic = self.min_factor + self.range
    cyclic = (
        self.min_factor
        + (turn_cyclic - self.min_factor) * phase
    )
    gamma = pipe(
        self.milestones,
        filter(lambda x: x[0] <= epoch),
        map(lambda x: x[1]),
        last
    )
    return cyclic * gamma
def test_esampler():
    output = load_train_df(
        dataset_dir='/store/tellus/train',
        output='/store/tmp/train.pqt'
    )
    df = pd.read_parquet(output)
    dataset = TellusDataset(
        df=df,
        has_y=True,
    )
    subset = Subset(
        dataset,
        list(range(1500, 1600))
    )
    epoch_size = 10
    s = ChunkSampler(
        epoch_size=epoch_size,
        len_indices=len(subset),
        shuffle=True,
    )
    batch_size = 2
    train_loader = DataLoader(
        subset,
        sampler=s,
        batch_size=batch_size,
        pin_memory=True,
    )
    for i in range(11):
        samples = pipe(
            train_loader,
            map(lambda x: x['id']),
            filter(lambda x: len(x) == batch_size),
            list
        )
        assert len(samples) == epoch_size // batch_size
from cytoolz.curried import (compose, filter, get, groupby, map, pipe,
                             pluck, valmap)

accounts = [
    (1, 'Alice', 100, 'F'),  # id, name, balance, gender
    (2, 'Bob', 200, 'M'),
    (3, 'Charlie', 150, 'M'),
    (4, 'Dennis', 50, 'M'),
    (5, 'Edith', 300, 'F')
]

# I. SELECTING WITH `MAP()` AND `FILTER()`
# SELECT name, balance FROM accounts WHERE balance > 150

# Functional version with pipeline and curry
acc1 = pipe(accounts,
            filter(lambda account: account[2] > 150),
            map(get([1, 2])),
            list)
print(acc1)

# List comprehension version (more Pythonic):
acc2 = [(name, balance) for (id, name, balance, gender) in accounts
        if balance > 150]
print(acc2)

# II. SPLIT-APPLY-COMBINE WITH `GROUPBY` AND `REDUCEBY`:
# 1. Split the dataset into groups by some property
# 2. Reduce each of the groups with some synopsis function

# In-memory split-apply-combine
# SELECT gender, SUM(balance) FROM accounts GROUP BY gender;
print(groupby(get(3), accounts))
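# A sketch of the combine step with `reduceby`, which fuses the grouping
# and the per-group reduction into a single pass, matching the SQL above:
# SELECT gender, SUM(balance) FROM accounts GROUP BY gender;
from cytoolz.curried import reduceby

print(reduceby(get(3), lambda acc, x: acc + x[2], accounts, 0))
# {'F': 400, 'M': 400}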
#!/usr/bin/env python
import cytoolz.curried as cc
from pprint import pprint as pp
import sys

data_input = cc.pipe(sys.stdin.readlines(),
                     cc.map(lambda x: x.replace('\n', '')),
                     list)


def has_no_duplicate(x):
    return len(set(x)) == len(x)


answer = cc.pipe(data_input,
                 cc.map(str.split),
                 cc.filter(has_no_duplicate),
                 list,
                 len)
pp(answer)
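# Quick checks on the no-duplicate-words rule above, using the worked
# examples from the puzzle statement:
assert has_no_duplicate('aa bb cc dd ee'.split())
assert not has_no_duplicate('aa bb cc dd aa'.split())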
import pandas as pd
import numpy as np
import us
from cytoolz.itertoolz import unique
from cytoolz.functoolz import thread_last, identity
from cytoolz.curried import map, filter, curry
from survey_stats import pdutil
# import sys
# import traceback as tb
from survey_stats import log

logger = log.getLogger(__name__)

US_STATES_FIPS_INTS = thread_last(us.STATES_AND_TERRITORIES,
                                  map(lambda x: x.fips),
                                  filter(lambda x: x is not None),
                                  map(lambda x: int(x)),
                                  list)

SITECODE_TRANSLATORS = {
    'fips': lambda x: (us.states.lookup('%.2d' % x).abbr
                       if int(x) in US_STATES_FIPS_INTS else 'NA'),
    'codes': identity
}

SVYDESIGN_COLS = ['sitecode', 'strata', 'psu', 'weight']


def convert_cat_codes(s, fmt):
    unq_lvls = list(unique([fmt[k] for k in sorted(fmt.keys())]))
    cc.map(lambda x: x.split('->')),
    cc.map(lambda x: (x[0],
                      [] if len(x) == 1 else cc.pipe(
                          x[1],
                          lambda x: x.split(','),
                          cc.map(str.strip),
                          list))),
    list)

tree_val_dict = cc.pipe(
    data_input,
    cc.map(cc.first),
    cc.map(lambda x: [tree_val_re.match(x).group(y) for y in (1, 2)]),
    dict,
    cc.valmap(int))

tree_mapping_dict = cc.pipe(
    data_input,
    cc.map(lambda x: (tree_val_re.match(x[0]).group(1), x[1])),
    dict)

root = cc.pipe(
    tree_mapping_dict.keys(),
    cc.filter(lambda x: x not in cc.concat(tree_mapping_dict.values())),
    cc.first)

tree = Tree(root, tree_mapping_dict, tree_val_dict)
unbalanced = tree.find_unbalanced()
unbalanced_self_weight = (unbalanced.weight
                          - sum(x.weight for x in unbalanced.children))
unbalanced_grouped_siblings = unbalanced.grouped('siblings')
balanced_weight = cc.first(
    cc.valfilter(lambda x: len(x) > 1, unbalanced_grouped_siblings).keys())
unbalanced_weight = cc.first(
    cc.valfilter(lambda x: len(x) == 1, unbalanced_grouped_siblings).keys())
weight_offset = balanced_weight - unbalanced_weight
def find_mismatched_levels(self):
    return pipe(self.meta.qns[ID_COLUMN],
                set,
                map(self.compare_levels),
                filter(lambda x: set(x['surveys']) != set(x['socrata'])))
def __init__(
        self,
        id,
        dataset_dir,
        output_dir,
        n_splits,
        base_train_config,
        fine_train_config,
        top_num,
        folds,
):
    params = locals()
    ids = pipe(range(n_splits), list)
    dataset_df = delayed(load_dataset_df)(dataset_dir, 'train.csv')
    dataset = delayed(TgsSaltDataset)(
        dataset_df,
        has_y=True,
    )
    kfolded = delayed(kfold)(dataset, n_splits)
    train_sets = pipe(
        range(n_splits),
        map(lambda idx: delayed(lambda x: x[idx][0])(kfolded)),
        map(lambda x: delayed(Subset)(dataset, x)),
        list)
    seg_sets = pipe(
        train_sets,
        map(delayed(lambda x: x.indices)),
        map(lambda x: delayed(get_segment_indices)(dataset, x)),
        map(lambda x: delayed(Subset)(dataset, x)),
        list)
    val_sets = pipe(
        range(n_splits),
        map(lambda idx: delayed(lambda x: x[idx][1])(kfolded)),
        map(lambda x: delayed(Subset)(dataset, x)),
        list)
    predict_dataset_df = delayed(load_dataset_df)(dataset_dir,
                                                  'sample_submission.csv')
    predict_set = delayed(TgsSaltDataset)(predict_dataset_df, has_y=False)
    trains = pipe(zip(ids, train_sets, seg_sets, val_sets),
                  filter(lambda x: x[0] in folds),
                  list)
    model_paths = pipe(
        trains,
        map(lambda x: delayed(base_train)(
            **base_train_config,
            model_path=f"{output_dir}/id-{id}-fold-{x[0]}-base-model.pt",
            train_set=x[1],
            seg_set=x[2],
            val_set=x[3],
            no_lable_set=predict_set,
            log_dir=f'{config["TENSORBORAD_LOG_DIR"]}/{id}/{x[0]}/base',
        )),
        list)
    # model_paths = pipe(
    #     zip(trains, model_paths),
    #     map(lambda x: delayed(fine_train)(
    #         **fine_train_config,
    #         base_model_path=x[1],
    #         model_path=f"{output_dir}/id-{id}-fold-{x[0][0]}-fine-model.pt",
    #         train_set=x[0][1],
    #         seg_set=x[0][2],
    #         val_set=x[0][3],
    #         no_lable_set=predict_set,
    #         log_dir=f'{config["TENSORBORAD_LOG_DIR"]}/{id}/{x[0][0]}/fine',
    #     )),
    #     list
    # )
    submission_df = delayed(predict)(
        model_paths=model_paths,
        log_dir=f'{config["TENSORBORAD_LOG_DIR"]}/{id}/sub',
        dataset=predict_set,
        log_interval=10,
        hdf5_path=f'{output_dir}/{id}.hdf5')
    # submission_df = delayed(lambda df: df[['rle_mask']])(submission_df)
    submission_file = delayed(
        lambda df: df.to_csv(f"{output_dir}/id-{id}-submission.csv"))(
            submission_df,
        )
    self.output = delayed(lambda x: x)((
        model_paths,
        submission_df,
        submission_file,
    ))
import pytest
from mlboard_api import models as ms
from mlboard_api import query as qry
from mlboard_api import create_app
import uuid
from cytoolz.curried import pipe, map, filter
from dateutil.parser import parse
import datetime
from .fixture import app


@pytest.fixture(params=pipe(
    dir(ms),
    map(lambda x: getattr(ms, x)),
    filter(lambda x: type(x).__name__ == 'DeclarativeMeta'),
    list
))
def target(request):
    return request.param


def test_all_table(app, target):
    payload = {
        "target": target.__name__,
        'entities': [],
        "methods": [
            {"name": "limit", "args": [1], "kwargs": {}},
            {"name": "all", "args": [], "kwargs": {}}
        ],
    }
#!/usr/bin/env python
import cytoolz.curried as cc
import itertools as it
from pprint import pprint as pp
import sys

data_input = sys.stdin.read().replace('\n', '')
data_input_midpt = cc.pipe(data_input,
                           it.cycle,
                           cc.drop(int(len(data_input) / 2)))

answer = cc.pipe(zip(data_input, data_input_midpt),
                 cc.filter(lambda x: x[0] == x[1]),
                 cc.map(lambda x: int(x[0])),
                 sum)
pp(answer)
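# A quick check of the halfway-around pairing above, using the worked
# example from the puzzle statement: '1212' gives 6, since every digit
# matches the digit opposite it.
example = '1212'
example_midpt = cc.pipe(example, it.cycle, cc.drop(len(example) // 2))
assert sum(int(a) for a, b in zip(example, example_midpt) if a == b) == 6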
#!/usr/bin/env python
import cytoolz.curried as cc
from pprint import pprint as pp
import sys

data_input = sys.stdin.read().replace('\n', '')
data_input += data_input[0]

answer = cc.pipe(
    ((x for x in data_input), (x for x in cc.drop(1, data_input))),
    lambda x: zip(*x),
    cc.filter(lambda x: x[0] == x[1]),
    cc.map(lambda x: int(x[0])),
    sum)
pp(answer)
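# A quick check of the wrap-around pairing above, using the worked example
# from the puzzle statement: '1122' gives 1 + 2 = 3.
example = '1122'
wrapped = example + example[0]
assert sum(int(a) for a, b in zip(wrapped, wrapped[1:]) if a == b) == 3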