def test_introspect_curry_py3():
    if not PY3:
        return
    f = cytoolz.curry(make_func(''))
    assert num_required_args(f) == 0
    assert is_arity(0, f)
    assert has_varargs(f) is False
    assert has_keywords(f) is False

    f = cytoolz.curry(make_func('x'))
    assert num_required_args(f) == 0
    assert is_arity(0, f) is False
    assert is_arity(1, f) is False
    assert has_varargs(f) is False
    assert has_keywords(f)  # A side-effect of being curried

    f = cytoolz.curry(make_func('x, y, z=0'))
    assert num_required_args(f) == 0
    assert is_arity(0, f) is False
    assert is_arity(1, f) is False
    assert is_arity(2, f) is False
    assert is_arity(3, f) is False
    assert has_varargs(f) is False
    assert has_keywords(f)

    f = cytoolz.curry(make_func('*args, **kwargs'))
    assert num_required_args(f) == 0
    assert has_varargs(f)
    assert has_keywords(f)
def test_introspect_curry_py3():
    f = cytoolz.curry(make_func(''))
    assert num_required_args(f) == 0
    assert is_arity(0, f)
    assert has_varargs(f) is False
    assert has_keywords(f) is False

    f = cytoolz.curry(make_func('x'))
    assert num_required_args(f) == 0
    assert is_arity(0, f) is False
    assert is_arity(1, f) is False
    assert has_varargs(f) is False
    assert has_keywords(f)  # A side-effect of being curried

    f = cytoolz.curry(make_func('x, y, z=0'))
    assert num_required_args(f) == 0
    assert is_arity(0, f) is False
    assert is_arity(1, f) is False
    assert is_arity(2, f) is False
    assert is_arity(3, f) is False
    assert has_varargs(f) is False
    assert has_keywords(f)

    f = cytoolz.curry(make_func('*args, **kwargs'))
    assert num_required_args(f) == 0
    assert has_varargs(f)
    assert has_keywords(f)
def test_introspect_curry_py3():
    if not PY3:
        return
    f = cytoolz.curry(make_func(""))
    assert num_required_args(f) == 0
    assert is_arity(0, f)
    assert has_varargs(f) is False
    assert has_keywords(f) is False

    f = cytoolz.curry(make_func("x"))
    assert num_required_args(f) == 0
    assert is_arity(0, f) is False
    assert is_arity(1, f) is False
    assert has_varargs(f) is False
    assert has_keywords(f)  # A side-effect of being curried

    f = cytoolz.curry(make_func("x, y, z=0"))
    assert num_required_args(f) == 0
    assert is_arity(0, f) is False
    assert is_arity(1, f) is False
    assert is_arity(2, f) is False
    assert is_arity(3, f) is False
    assert has_varargs(f) is False
    assert has_keywords(f)

    f = cytoolz.curry(make_func("*args, **kwargs"))
    assert num_required_args(f) == 0
    assert has_varargs(f)
    assert has_keywords(f)
def main(opts):
    print(opts)
    dataset, split = opts.annotation.split('_')
    if split == 'dev':
        txt_db = 'val_txt_db'
    elif split == 'pretrain':
        txt_db = 'train_txt_db'
    else:
        txt_db = f'{split}_txt_db'
    opts.output = os.path.join(
        '/txt',
        intermediate_dir(opts.pretrained_model_name_or_path),
        getattr(opts, txt_db))
    os.makedirs(opts.output)
    # train_db_dir = os.path.join(os.path.dirname(opts.output),
    #                             f'{source}_{split}.db')
    # meta = vars(opts)
    # meta['tokenizer'] = opts.toker
    tokenizer = AutoTokenizer.from_pretrained(
        opts.pretrained_model_name_or_path, use_fast=True)

    open_db = curry(open_lmdb, opts.output, readonly=False)
    with open_db() as db:
        id2lens = process(opts, db, tokenizer)

    with open(f'{opts.output}/id2len.json', 'w') as f:
        json.dump(id2lens, f)
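# The `curry(open_lmdb, opts.output, readonly=False)` line above turns the
# database opener into a zero-argument factory: all arguments are bound up
# front, and the call happens later. A minimal sketch of the pattern with a
# stand-in function (`open_store` is hypothetical, not part of the code
# above):
from cytoolz import curry

def open_store(path, readonly=True):
    # Stand-in for open_lmdb; a real opener would return a context manager.
    return {'path': path, 'readonly': readonly}

open_db = curry(open_store, '/tmp/example.db', readonly=False)
assert open_db() == {'path': '/tmp/example.db', 'readonly': False}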
def main(opts):
    if not exists(opts.output):
        os.makedirs(opts.output)
    else:
        raise ValueError("Found existing DB. Please explicitly remove "
                         "for re-processing")
    meta = vars(opts)
    meta["tokenizer"] = opts.toker
    toker = BertTokenizer.from_pretrained(
        opts.toker, do_lower_case="uncased" in opts.toker)
    tokenizer = bert_tokenize(toker)
    meta["UNK"] = toker.convert_tokens_to_ids(["[UNK]"])[0]
    meta["CLS"] = toker.convert_tokens_to_ids(["[CLS]"])[0]
    meta["SEP"] = toker.convert_tokens_to_ids(["[SEP]"])[0]
    meta["MASK"] = toker.convert_tokens_to_ids(["[MASK]"])[0]
    meta["v_range"] = (toker.convert_tokens_to_ids("!")[0], len(toker.vocab))
    with open(f"{opts.output}/meta.json", "w") as f:
        json.dump(vars(opts), f, indent=4)

    open_db = curry(open_lmdb, opts.output, readonly=False)
    with open_db() as db:
        with open(opts.annotation) as ann:
            if opts.missing_imgs is not None:
                missing_imgs = set(json.load(open(opts.missing_imgs)))
            else:
                missing_imgs = None
            id2lens, txt2img = process_nlvr2(ann, db, tokenizer, missing_imgs)

    with open(f"{opts.output}/id2len.json", "w") as f:
        json.dump(id2lens, f)
    with open(f"{opts.output}/txt2img.json", "w") as f:
        json.dump(txt2img, f)
def main(opts):
    if not exists(opts.output):
        os.makedirs(opts.output)
    else:
        raise ValueError('Found existing DB. Please explicitly remove '
                         'for re-processing')
    meta = vars(opts)
    meta['tokenizer'] = opts.toker
    toker = BertTokenizer.from_pretrained(
        opts.toker, do_lower_case='uncased' in opts.toker)
    tokenizer = bert_tokenize(toker)
    meta['UNK'] = toker.convert_tokens_to_ids(['[UNK]'])[0]
    meta['CLS'] = toker.convert_tokens_to_ids(['[CLS]'])[0]
    meta['SEP'] = toker.convert_tokens_to_ids(['[SEP]'])[0]
    meta['MASK'] = toker.convert_tokens_to_ids(['[MASK]'])[0]
    meta['v_range'] = (toker.convert_tokens_to_ids('!')[0], len(toker.vocab))
    with open(f'{opts.output}/meta.json', 'w') as f:
        json.dump(vars(opts), f, indent=4)

    open_db = curry(open_lmdb, opts.output, readonly=False)
    with open_db() as db:
        with open(opts.annotation) as ann:
            if opts.missing_imgs is not None:
                missing_imgs = set(json.load(open(opts.missing_imgs)))
            else:
                missing_imgs = None
            id2lens, txt2img = process_nlvr2(ann, db, tokenizer, missing_imgs)

    with open(f'{opts.output}/id2len.json', 'w') as f:
        json.dump(id2lens, f)
    with open(f'{opts.output}/txt2img.json', 'w') as f:
        json.dump(txt2img, f)
def X_feature_onehot(self, dataframe: DataFrame) -> DataFrame:
    # fieldgroups[basename] = [ fieldname ]
    # noinspection PyArgumentList
    fieldgroups = groupby(
        curry(re.sub)(r'\d+(st|nd|rd)?$')(''),   # basename
        self.params['X_feature_onehot']          # fieldnames
    )
    encodings = {}
    for basename, fieldnames in fieldgroups.items():
        # NOTE: in theory, unique_values should be hardcoded based on
        # data_description.txt; for Kaggle, we can cheat and just take
        # unique_values from self.data['combined']
        # BUGFIX: running to_X() separately on test/train/validate datasets
        # results in column name mismatches
        unique_values = np.unique(
            self.data['combined'][fieldnames].dropna().values)
        category_dtype = CategoricalDtype(categories=unique_values)

        for fieldname in fieldnames:
            dataframe[fieldname] = dataframe[fieldname].astype(category_dtype)
            onehot = pd.get_dummies(dataframe[fieldname],
                                    prefix=basename, prefix_sep='_')
            if basename not in encodings:
                encodings[basename] = onehot
            else:
                encodings[basename] = onehot & encodings[basename]  # bitwise AND

    # Add additional onehot columns to dataframe
    for basename, onehot in encodings.items():
        dataframe = dataframe.join(onehot)

    # Mark original categorical columns for exclusion
    self.params['X_feature_exclude'] += self.params['X_feature_onehot']
    return dataframe
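# The `curry(re.sub)(pattern)(repl)` chain above builds a one-argument
# "basename" key function for groupby. A quick, self-contained check of that
# grouping behavior (field names invented for illustration):
import re
from cytoolz import curry, groupby

basename = curry(re.sub)(r'\d+(st|nd|rd)?$')('')
assert basename('Exterior1st') == 'Exterior'
assert groupby(basename, ['Exterior1st', 'Exterior2nd']) == \
    {'Exterior': ['Exterior1st', 'Exterior2nd']}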
def curry_namespace(ns):
    return dict(
        (
            name,
            curry(f) if should_curry(f) else f,
        )
        for name, f in ns.items()
        if '__' not in name
    )
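# A hedged usage sketch of the helper above. `should_curry` is whatever
# predicate the surrounding module defines; the stand-in here simply curries
# callables that accept two or more parameters:
import inspect
from cytoolz import curry

def should_curry(f):
    try:
        return len(inspect.signature(f).parameters) >= 2
    except (TypeError, ValueError):
        return False

def add(x, y):
    return x + y

ns = curry_namespace({'add': add, '__doc__': 'skipped'})
assert ns['add'](1)(2) == 3   # arguments may now arrive one at a time
assert '__doc__' not in ns    # dunder names are filtered out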
def find_background_illumination(fns, radius=None, input_bitdepth=None,
                                 quantile=0.5, stretch_quantile=0.):
    """Use a set of related images to find uneven background illumination.

    Parameters
    ----------
    fns : list of string
        A list of image file names
    radius : int, optional
        The radius of the structuring element used to find background.
        default: The width or height of the input images divided by 4,
        whichever is smaller.
    input_bitdepth : int, optional
        The bit-depth of the input images. Should be specified if
        non-standard bitdepth images are used in a 16-bit image file,
        e.g. 12-bit images. Default is the dtype of the input image.
    quantile : float in [0, 1], optional
        The desired quantile to find background. default: 0.5 (median)
    stretch_quantile : float in [0, 1], optional
        Stretch image to full dtype limit, saturating above this quantile.

    Returns
    -------
    illum : np.ndarray, float, shape (M, N)
        The estimated illumination over the image field.

    See Also
    --------
    `correct_image_illumination`, `correct_multiimage_illumination`.
    """
    # this function follows the "PyToolz" streaming data model to
    # obtain the illumination estimate.
    # first, define the functions for each individual step:
    in_range = ('image' if input_bitdepth is None
                else (0, 2**input_bitdepth - 1))
    rescale = tz.curry(exposure.rescale_intensity)
    normalize = (tz.partial(stretchlim, bottom=stretch_quantile)
                 if stretch_quantile > 0
                 else skimage.img_as_float)

    # produce a stream of properly-scaled images
    ims = (tz.pipe(fn, io.imread, rescale(in_range=in_range), normalize)
           for fn in fns)

    # take the mean of that stream
    mean_image = mean(ims)

    # return the median filter of that mean
    radius = radius or min(mean_image.shape) // 4
    illum = ndi.percentile_filter(mean_image, percentile=(quantile * 100),
                                  footprint=morphology.disk(radius))
    return illum
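# The curry-then-pipe idiom above (curry a step with its configuration, then
# `tz.pipe` each item through the configured steps) works for any stream, not
# just images. A self-contained numeric sketch of the same pattern, using
# toolz (cytoolz behaves identically):
import toolz as tz

scale = tz.curry(lambda factor, x: factor * x)
shift = tz.curry(lambda offset, x: offset + x)

values = (tz.pipe(v, scale(2), shift(1)) for v in [1, 2, 3])
assert list(values) == [3, 5, 7]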
def test_introspect_curry_valid_py3(check_valid=is_valid_args,
                                    incomplete=False):
    if not PY3:
        return
    orig_check_valid = check_valid
    check_valid = lambda _func, *args, **kwargs: orig_check_valid(
        _func, args, kwargs)

    f = cytoolz.curry(make_func("x, y, z=0"))
    assert check_valid(f)
    assert check_valid(f, 1)
    assert check_valid(f, 1, 2)
    assert check_valid(f, 1, 2, 3)
    assert check_valid(f, 1, 2, 3, 4) is False
    assert check_valid(f, invalid_keyword=True) is False
    assert check_valid(f(1))
    assert check_valid(f(1), 2)
    assert check_valid(f(1), 2, 3)
    assert check_valid(f(1), 2, 3, 4) is False
    assert check_valid(f(1), x=2) is False
    assert check_valid(f(1), y=2)
    assert check_valid(f(x=1), 2) is False
    assert check_valid(f(x=1), y=2)
    assert check_valid(f(y=2), 1)
    assert check_valid(f(y=2), 1, z=3)
    assert check_valid(f(y=2), 1, 3) is False

    f = cytoolz.curry(make_func("x, y, z=0"), 1, x=1)
    assert check_valid(f) is False
    assert check_valid(f, z=3) is False

    f = cytoolz.curry(make_func("x, y, *args, z"))
    assert check_valid(f)
    assert check_valid(f, 0)
    assert check_valid(f(1), 0)
    assert check_valid(f(1, 2), 0)
    assert check_valid(f(1, 2, 3), 0)
    assert check_valid(f(1, 2, 3, 4), 0)
    assert check_valid(f(1, 2, 3, 4), z=4)
    assert check_valid(f(x=1))
    assert check_valid(f(x=1), 1) is False
    assert check_valid(f(x=1), y=2)
def find_background_illumination(fns, radius=None, input_bitdepth=None,
                                 quantile=0.5, stretch_quantile=0.):
    """Use a set of related images to find uneven background illumination.

    Parameters
    ----------
    fns : list of string
        A list of image file names
    radius : int, optional
        The radius of the structuring element used to find background.
        default: The width or height of the input images divided by 4,
        whichever is smaller.
    input_bitdepth : int, optional
        The bit-depth of the input images. Should be specified if
        non-standard bitdepth images are used in a 16-bit image file,
        e.g. 12-bit images. Default is the dtype of the input image.
    quantile : float in [0, 1], optional
        The desired quantile to find background. default: 0.5 (median)
    stretch_quantile : float in [0, 1], optional
        Stretch image to full dtype limit, saturating above this quantile.

    Returns
    -------
    illum : np.ndarray, float, shape (M, N)
        The estimated illumination over the image field.

    See Also
    --------
    `correct_image_illumination`, `correct_multiimage_illumination`.
    """
    # this function follows the "PyToolz" streaming data model to
    # obtain the illumination estimate.
    # first, define the functions for each individual step:
    in_range = ('image' if input_bitdepth is None
                else (0, 2**input_bitdepth - 1))
    rescale = tz.curry(exposure.rescale_intensity)
    normalize = (tz.partial(stretchlim, bottom=stretch_quantile)
                 if stretch_quantile > 0
                 else skimage.img_as_float)

    # produce a stream of properly-scaled images
    ims = (tz.pipe(fn, io.imread, rescale(in_range=in_range), normalize)
           for fn in fns)

    # take the mean of that stream
    mean_image = mean(ims)

    # return the median filter of that mean
    radius = radius or min(mean_image.shape) // 4
    mean_image = img_as_ubyte(stretchlim(mean_image))
    illum = imfilter.rank.median(mean_image, selem=morphology.disk(radius))
    return illum
def main(opts):
    if not exists(opts.output):
        os.makedirs(opts.output)
    else:
        print(opts.output)
        raise ValueError('Found existing DB. Please explicitly remove '
                         'for re-processing')
    meta = vars(opts)
    meta['tokenizer'] = opts.toker
    toker = RobertaTokenizer.from_pretrained(opts.toker)
    tokenizer = roberta_tokenize(toker)
    meta['BOS'] = toker.convert_tokens_to_ids(['<s>'])[0]
    meta['EOS'] = toker.convert_tokens_to_ids(['</s>'])[0]
    meta['SEP'] = toker.convert_tokens_to_ids(['</s>'])[0]
    meta['CLS'] = toker.convert_tokens_to_ids(['<s>'])[0]
    meta['PAD'] = toker.convert_tokens_to_ids(['<pad>'])[0]
    meta['MASK'] = toker.convert_tokens_to_ids(['<mask>'])[0]
    meta['UNK'] = toker.convert_tokens_to_ids(['<unk>'])[0]
    meta['v_range'] = (toker.convert_tokens_to_ids(['.'])[0],
                       toker.convert_tokens_to_ids(['<|endoftext|>'])[0] + 1)
    with open(f'{opts.output}/meta.json', 'w') as f:
        json.dump(vars(opts), f, indent=4)

    open_cap_db = curry(open_lmdb, f"{opts.output}/cap.db", readonly=False)
    open_clip_db = curry(open_lmdb, f"{opts.output}/clip.db", readonly=False)
    with open_cap_db() as cap_db, open_clip_db() as clip_db:
        with open(opts.annotation) as ann, open(opts.subtitles) as sub:
            (id2lens, cap2vid, clip2vid,
             vid2caps, vid2clips) = process_tvc(ann, sub, cap_db, clip_db,
                                                tokenizer)

    with open(f'{opts.output}/cap.db/id2len.json', 'w') as f:
        json.dump(id2lens, f)
    with open(f'{opts.output}/cap.db/cap2vid.json', 'w') as f:
        json.dump(cap2vid, f)
    with open(f'{opts.output}/clip.db/clip2vid.json', 'w') as f:
        json.dump(clip2vid, f)
    with open(f'{opts.output}/cap.db/vid2caps.json', 'w') as f:
        json.dump(vid2caps, f)
    with open(f'{opts.output}/clip.db/vid2clips.json', 'w') as f:
        json.dump(vid2clips, f)
def test_introspect_curry_valid_py3(check_valid=is_valid_args,
                                    incomplete=False):
    if not PY3:
        return
    orig_check_valid = check_valid
    check_valid = lambda _func, *args, **kwargs: orig_check_valid(_func, args, kwargs)

    f = cytoolz.curry(make_func('x, y, z=0'))
    assert check_valid(f)
    assert check_valid(f, 1)
    assert check_valid(f, 1, 2)
    assert check_valid(f, 1, 2, 3)
    assert check_valid(f, 1, 2, 3, 4) is False
    assert check_valid(f, invalid_keyword=True) is False
    assert check_valid(f(1))
    assert check_valid(f(1), 2)
    assert check_valid(f(1), 2, 3)
    assert check_valid(f(1), 2, 3, 4) is False
    assert check_valid(f(1), x=2) is False
    assert check_valid(f(1), y=2)
    assert check_valid(f(x=1), 2) is False
    assert check_valid(f(x=1), y=2)
    assert check_valid(f(y=2), 1)
    assert check_valid(f(y=2), 1, z=3)
    assert check_valid(f(y=2), 1, 3) is False

    f = cytoolz.curry(make_func('x, y, z=0'), 1, x=1)
    assert check_valid(f) is False
    assert check_valid(f, z=3) is False

    f = cytoolz.curry(make_func('x, y, *args, z'))
    assert check_valid(f)
    assert check_valid(f, 0)
    assert check_valid(f(1), 0)
    assert check_valid(f(1, 2), 0)
    assert check_valid(f(1, 2, 3), 0)
    assert check_valid(f(1, 2, 3, 4), 0)
    assert check_valid(f(1, 2, 3, 4), z=4)
    assert check_valid(f(x=1))
    assert check_valid(f(x=1), 1) is False
    assert check_valid(f(x=1), y=2)
def test_funcname_cytoolz():
    @curry
    def foo(a, b, c):
        pass

    assert funcname(foo) == "foo"
    assert funcname(foo(1)) == "foo"

    def bar(a, b):
        return a + b

    c_bar = curry(bar, 1)
    assert funcname(c_bar) == "bar"
def main(opts):
    if not exists(opts.output):
        os.makedirs(opts.output)
    # else:
    #     raise ValueError('Found existing DB. Please explicitly remove '
    #                      'for re-processing')
    meta = vars(opts)
    meta['tokenizer'] = opts.toker
    toker = BertTokenizer.from_pretrained(
        opts.toker, do_lower_case='uncased' in opts.toker)
    tokenizer = bert_tokenize(toker)
    meta['UNK'] = toker.convert_tokens_to_ids(['[UNK]'])[0]
    meta['CLS'] = toker.convert_tokens_to_ids(['[CLS]'])[0]
    meta['SEP'] = toker.convert_tokens_to_ids(['[SEP]'])[0]
    meta['MASK'] = toker.convert_tokens_to_ids(['[MASK]'])[0]
    meta['v_range'] = (toker.convert_tokens_to_ids('!')[0], len(toker.vocab))
    with open(f'{opts.output}/meta.json', 'w') as f:
        json.dump(vars(opts), f, indent=4)

    if opts.dataset == "nvlr2":
        open_db = curry(open_lmdb, opts.output, readonly=False)
        with open_db() as db:
            with open(opts.annotation) as ann:
                if opts.missing_imgs is not None:
                    missing_imgs = set(json.load(open(opts.missing_imgs)))
                else:
                    missing_imgs = None
                id2lens, txt2img = process_nlvr2(ann, db, tokenizer,
                                                 missing_imgs)

        with open(f'{opts.output}/id2len.json', 'w') as f:
            json.dump(id2lens, f)
        with open(f'{opts.output}/txt2img.json', 'w') as f:
            json.dump(txt2img, f)
    else:
        train_ann_path = os.path.join(opts.annotation, "train.json")
        train_img_dir = os.path.join(opts.img_dir, "train")
        train_output_dir = f'{opts.output}/train/'
        with open(train_ann_path, "r") as ann_file:
            ann = json.load(ann_file)
        process_vizwiz(ann, tokenizer, train_img_dir, train_output_dir)

        val_ann_path = os.path.join(opts.annotation, "val.json")
        val_img_dir = os.path.join(opts.img_dir, "val")
        val_output_dir = f'{opts.output}/val/'
        with open(val_ann_path) as ann_file:
            ann = json.load(ann_file)
        process_vizwiz(ann, tokenizer, val_img_dir, val_output_dir)
def reduce(function, initval=None):
    """
    Curried version of the built-in reduce.

    >>> reduce(lambda x,y: x+y)( [1, 2, 3, 4, 5] )
    15

    >>> reduce(lambda x,y: x+y, initval=10)( [1, 2, 3, 4, 5] )
    25
    """
    if initval is None:
        return cytoolz.curry(__builtin__.reduce)(function)
    else:
        # TODO: Port to cytoolz
        return lambda s: __builtin__.reduce(function, s, initval)
def other_than(groups, bools):
    """
    Construct a Series that has booleans indicating the presence of
    something- or someone-else with a certain property within a group.

    Parameters
    ----------
    groups : pandas.Series
        A column with the same index as `bools` that defines the grouping
        of `bools`. The `bools` Series will be used to index `groups` and
        then the grouped values will be counted.
    bools : pandas.Series
        A boolean Series indicating where the property of interest is
        present. Should have the same index as `groups`.

    Returns
    -------
    others : pandas.Series
        A boolean Series with the same index as `groups` and `bools`
        indicating whether there is something- or something-else within a
        group with some property (as indicated by `bools`).

    """
    counts = groups[bools].value_counts()
    merge_col = groups.to_frame(name='right')
    pipeline = tz.compose(
        tz.curry(pd.Series.fillna, value=False),
        itemgetter('left'),
        tz.curry(pd.DataFrame.merge, right=merge_col, how='right',
                 left_index=True, right_on='right'),
        tz.curry(pd.Series.to_frame, name='left'))
    gt0 = pipeline(counts > 0)
    gt1 = pipeline(counts > 1)
    return gt1.where(bools, other=gt0)
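# A small worked example of the documented behavior, if I read the docstring
# right (data invented for illustration; each row asks "does anyone *else* in
# my group have the property?"):
import pandas as pd

groups = pd.Series(['a', 'a', 'b', 'b'])
bools = pd.Series([True, False, True, True])

result = other_than(groups, bools)
# row 0: the only other 'a' is False  -> False
# row 1: row 0 in group 'a' is True   -> True
# rows 2 and 3: the other 'b' is True -> True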
def main(opts):
    if not exists(opts.output):
        os.makedirs(opts.output)
    else:
        raise ValueError('Found existing DB. Please explicitly remove '
                         'for re-processing')
    meta = vars(opts)
    meta['tokenizer'] = opts.toker
    toker = BertTokenizer.from_pretrained(
        opts.toker, do_lower_case='uncased' in opts.toker)
    tokenizer = bert_tokenize(toker)
    meta['UNK'] = toker.convert_tokens_to_ids(['[UNK]'])[0]
    meta['CLS'] = toker.convert_tokens_to_ids(['[CLS]'])[0]
    meta['SEP'] = toker.convert_tokens_to_ids(['[SEP]'])[0]
    meta['MASK'] = toker.convert_tokens_to_ids(['[MASK]'])[0]
    meta['v_range'] = (toker.convert_tokens_to_ids('!')[0], len(toker.vocab))
    with open(f'{opts.output}/meta.json', 'w') as f:
        json.dump(vars(opts), f, indent=4)

    open_db = curry(open_lmdb, opts.output, readonly=False)
    output_field_name = ['id2len', 'txt2img']
    with open_db() as db:
        if opts.task == 'nlvr':
            with open(opts.annotations[0]) as ann:
                if opts.missing_imgs is not None:
                    missing_imgs = set(json.load(open(opts.missing_imgs)))
                else:
                    missing_imgs = None
                jsons = process_nlvr2(ann, db, tokenizer, missing_imgs)
        elif opts.task == 're':
            data = pickle.load(open(opts.annotations[0], 'rb'))
            instances = json.load(open(opts.annotations[1], 'r'))
            iid_to_ann_ids = json.load(
                open(opts.annotations[2], 'r'))['iid_to_ann_ids']
            # dirs/refcoco_testA_bert-base-cased.db -> testA
            img_split = opts.output.split('/')[-1].split('.')[0].split('_')[1]
            jsons = process_referring_expressions(
                data, instances, iid_to_ann_ids, db, tokenizer, img_split)
            output_field_name = [
                'id2len', 'images', 'annotations', 'categories', 'refs']

    for dump, name in zip(jsons, output_field_name):
        with open(f'{opts.output}/{name}.json', 'w') as f:
            json.dump(dump, f)
def main(opts):
    if not exists(opts.output):
        os.makedirs(opts.output)
    else:
        raise ValueError('Found existing DB. Please explicitly remove '
                         'for re-processing')
    meta = vars(opts)
    meta['tokenizer'] = opts.toker
    toker = RobertaTokenizer.from_pretrained(opts.toker)
    tokenizer = roberta_tokenize(toker)
    meta['BOS'] = toker.convert_tokens_to_ids(['<s>'])[0]
    meta['EOS'] = toker.convert_tokens_to_ids(['</s>'])[0]
    meta['SEP'] = toker.convert_tokens_to_ids(['</s>'])[0]
    meta['CLS'] = toker.convert_tokens_to_ids(['<s>'])[0]
    meta['PAD'] = toker.convert_tokens_to_ids(['<pad>'])[0]
    meta['MASK'] = toker.convert_tokens_to_ids(['<mask>'])[0]
    meta['UNK'] = toker.convert_tokens_to_ids(['<unk>'])[0]
    meta['v_range'] = (toker.convert_tokens_to_ids(['.'])[0],
                       toker.convert_tokens_to_ids(['<|endoftext|>'])[0] + 1)
    save_json(vars(opts), f'{opts.output}/meta.json', save_pretty=True)

    open_db = curry(open_lmdb, opts.output, readonly=False)
    with open_db() as db:
        with open(opts.annotation, "r") as ann:
            if opts.task == "tvr":
                id2lens, query2video, query_data = process_tvr(
                    ann, db, tokenizer)
            elif opts.task == "tvqa":
                id2lens, query2video, query_data = process_tvqa(
                    ann, db, tokenizer)
            elif opts.task == "violin":
                id2lens, query2video, query_data = process_violin(
                    ann, db, tokenizer)
            else:
                raise NotImplementedError(
                    f"prepro for {opts.task} not implemented")

    save_json(id2lens, f'{opts.output}/id2len.json')
    save_json(query2video, f'{opts.output}/query2video.json')
    save_jsonl(query_data, f'{opts.output}/query_data.jsonl')
def main(opts):
    if not exists(opts.output):
        os.makedirs(opts.output)
    else:
        raise ValueError('Found existing DB. Please explicitly remove '
                         'for re-processing')
    meta = vars(opts)
    meta['tokenizer'] = opts.toker
    toker = RobertaTokenizer.from_pretrained(opts.toker)
    tokenizer = roberta_tokenize(toker)
    meta['BOS'] = toker.convert_tokens_to_ids(['<s>'])[0]
    meta['EOS'] = toker.convert_tokens_to_ids(['</s>'])[0]
    meta['SEP'] = toker.convert_tokens_to_ids(['</s>'])[0]
    meta['CLS'] = toker.convert_tokens_to_ids(['<s>'])[0]
    meta['PAD'] = toker.convert_tokens_to_ids(['<pad>'])[0]
    meta['MASK'] = toker.convert_tokens_to_ids(['<mask>'])[0]
    meta['UNK'] = toker.convert_tokens_to_ids(['<unk>'])[0]
    meta['v_range'] = (toker.convert_tokens_to_ids(['.'])[0],
                       toker.convert_tokens_to_ids(['<|endoftext|>'])[0] + 1)
    save_json(vars(opts), f'{opts.output}/meta.json', save_pretty=True)

    open_db = curry(open_lmdb, opts.output, readonly=False)
    with open_db() as db:
        sub_info_cache_path = f'{opts.output}/sub_info.json'
        try:
            vid2nframe = load_json(opts.vid2nframe)
        except Exception:
            vid2nframe = None
        if not os.path.exists(sub_info_cache_path):
            video2sub_info = load_process_sub_meta(
                opts.annotation, vid2nframe, frame_length=args.frame_length)
            save_json(video2sub_info, sub_info_cache_path)
        else:
            video2sub_info = load_json(sub_info_cache_path)
        with open(opts.annotation) as ann:
            vid2len, vid2max_frame_sub_len = process_tv_subtitles(
                ann, video2sub_info, db, tokenizer, meta['SEP'])

    save_json(vid2len, f'{opts.output}/vid2len.json')
    save_json(vid2max_frame_sub_len,
              f'{opts.output}/vid2max_frame_sub_len.json')
def compare_bytecode(left, right):
    unprefixed_left = remove_0x_prefix(left)
    unprefixed_right = remove_0x_prefix(right)

    sub = curry(re.sub)
    norm_pipeline = compose(
        sub(EMBEDDED_SWARM_HASH_REGEX, SWARM_HASH_REPLACEMENT),
        sub(EMBEDDED_ADDRESS_REGEX, ADDRESS_REPLACEMENT))
    norm_left = norm_pipeline(unprefixed_left)
    norm_right = norm_pipeline(unprefixed_right)

    if len(norm_left) != len(unprefixed_left) or \
            len(norm_right) != len(unprefixed_right):
        raise ValueError(
            "Invariant. Normalized bytecodes are not the correct lengths:" +
            "\n- left  (original)  :" + left +
            "\n- left  (unprefixed):" + unprefixed_left +
            "\n- left  (normalized):" + norm_left +
            "\n- right (original)  :" + right +
            "\n- right (unprefixed):" + unprefixed_right +
            "\n- right (normalized):" + norm_right)

    return norm_left == norm_right
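# The `curry(re.sub)` + `compose` normalization pipeline above is a reusable
# trick on its own. A toy sketch with made-up patterns (compose applies
# right-to-left):
import re
from cytoolz import compose, curry

sub = curry(re.sub)
normalize = compose(
    sub(r'\d', 'N'),     # then mask every digit
    sub(r'\s+', ' '),    # collapse whitespace first
)
assert normalize('a  1 b 22') == 'a N b NN'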
# -*- coding: utf-8 -*-
__title__ = 'text2math'
__author__ = 'Steven Cutting'
__author_email__ = '*****@*****.**'
__created_on__ = '02/13/2016'
__copyright__ = "text2math Copyright (C) 2016 Steven Cutting"

import sys

import cytoolz as tlz
c_map = tlz.curry(tlz.map)

# --
# Specific imports

# Parsing
from xml.dom import minidom
from bs4 import BeautifulSoup

# Encoding issues
import cchardet as chardet
from unidecode import unidecode
import ftfy

if sys.version_info[0] < 3:
    _STRINGTYPES = (basestring,)
else:
    # temp fix, so that 2.7 support wont break
import cytoolz
from cytoolz import *
from cytoolz.curried_exceptions import *

# Here is the recipe used to create the list below
# (and "cytoolz/tests/test_curried_toolzlike.py" verifies the list is correct):
#
# import toolz
# import toolz.curried
#
# for item in sorted(key for key, val in toolz.curried.__dict__.items()
#                    if isinstance(val, toolz.curry)):
#     print '%s = cytoolz.curry(%s)' % (item, item)

accumulate = cytoolz.curry(accumulate)
assoc = cytoolz.curry(assoc)
cons = cytoolz.curry(cons)
countby = cytoolz.curry(countby)
dissoc = cytoolz.curry(dissoc)
do = cytoolz.curry(do)
drop = cytoolz.curry(drop)
filter = cytoolz.curry(filter)
get = cytoolz.curry(get)
get_in = cytoolz.curry(get_in)
groupby = cytoolz.curry(groupby)
interleave = cytoolz.curry(interleave)
interpose = cytoolz.curry(interpose)
itemfilter = cytoolz.curry(itemfilter)
itemmap = cytoolz.curry(itemmap)
iterate = cytoolz.curry(iterate)
from itertools import groupby, accumulate, count
from functools import reduce
import operator

# curried versions
from cytoolz.curried import filter as cfilter
from cytoolz.curried import map as cmap
from cytoolz.curried import sorted as csorted
from cytoolz.curried import groupby as cgroupby
from cytoolz.curried import accumulate as caccumulate
from cytoolz.curried import count as ccount
from cytoolz.curried import reduce as creduce
from cytoolz import curry

cmax = curry(max)
cmin = curry(min)
czip = lambda xs: zip(*xs)

# def contains(val):
#     return lambda x: val in x

with catch_exc(print_error=False):
    import pyspark.sql.functions as F
    from pyspark.sql.types import *
    from pyspark.sql.window import Window
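# The curried aliases above behave like their builtins once the final
# argument arrives; a quick self-contained sanity check of the idiom:
from cytoolz import curry
from cytoolz.curried import filter as cfilter
from cytoolz.curried import map as cmap

cmax = curry(max)

evens = cfilter(lambda x: x % 2 == 0)   # waits for an iterable
doubled = cmap(lambda x: 2 * x)
assert list(doubled(evens([1, 2, 3, 4]))) == [4, 8]
assert cmax(key=abs)([-5, 3]) == -5     # keyword arguments are captured too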
def curry_namespace(ns):
    return dict(
        (name, cytoolz.curry(f) if should_curry(f) else f)
        for name, f in ns.items()
        if '__' not in name
    )
# -*- coding: utf-8 -*-
__title__ = 'text2math'
__author__ = 'Steven Cutting'
__author_email__ = '*****@*****.**'
__created_on__ = '02/13/2016'
__copyright__ = "text2math Copyright (C) 2016 Steven Cutting"

from operator import eq

import pytest

import cytoolz as tlz
c_eq = tlz.curry(eq)

from text2math import raw2text

from utils import osx_xfail


# TODO (steven_c) Find out why km/h test fails only on OSX.


@pytest.mark.parametrize("string,expected",
                         [("<p>foo<\p><li>bar<\li>", "foobar"),
                          ])
def test__remove_html_bits(string, expected):
    assert(tlz.pipe(string, raw2text.remove_html_bits, c_eq(expected)))


def test__verify_unicode_fail():
    with pytest.raises(AssertionError):
            return self[key + '.each_event']
        k, w = key[:i], key[i + 1:]
        if w == 'each_file':
            return (f[self.map(k)].value for f in self.files)
        elif w == 'each_event':
            return concat(f[self.map(k)] for f in self.files)
        else:
            raise ValueError("Key '{}' is invalid!".format(key))


try:
    from dbpy import (read_hightagnumber as __read_hightagnumber,
                      read_taglist_byrun as __read_taglist_byrun,
                      read_syncdatalist_float)
    from stpy import StorageReader, StorageBuffer

    read_hightagnumber = curry(memoize(__read_hightagnumber))
    read_taglist_byrun = curry(memoize(__read_taglist_byrun))

    class _ReadonlyBuffer:
        def __init__(self, buffer):
            self.__buffer = buffer

        @property
        def data(self):
            return self.__buffer.read_det_data(0)

        @property
        def info(self):
            return self.__buffer.read_det_info(0)

    class StorageWrapper:
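# `curry(memoize(...))` above composes caching with curried partial
# application: the curried wrapper collects arguments, and the memoize layer
# underneath caches completed calls. A tiny self-contained sketch of that
# composition:
from cytoolz import curry, memoize

calls = []

@curry
@memoize
def lookup(run, tag):
    calls.append((run, tag))
    return run * 1000 + tag

by_run = lookup(7)          # partially applied
assert by_run(1) == 7001
assert by_run(1) == 7001    # second call is served from the memoize cache
assert calls == [(7, 1)]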
    to_bytes,
    to_canonical_address,
    to_checksum_address,
    to_dict,
    to_hex,
    to_int,
    to_list,
    to_normalized_address,
    to_ordered_dict,
    to_set,
    to_text,
    to_tuple,
    to_wei,
)

apply_formatter_at_index = curry(apply_formatter_at_index)
apply_formatter_if = curry(apply_formatter_if)
apply_formatter_to_array = curry(apply_formatter_to_array)
apply_formatters_to_dict = curry(apply_formatters_to_dict)
apply_key_map = curry(apply_key_map)
apply_one_of_formatters = curry(apply_one_of_formatters)
flatten_return = curry(flatten_return)
force_bytes = curry(force_bytes)
force_text = curry(force_text)
from_wei = curry(from_wei)
hexstr_if_str = curry(hexstr_if_str)
is_same_address = curry(is_same_address)
reversed_return = curry(reversed_return)
sort_return = curry(sort_return)
text_if_str = curry(text_if_str)
to_wei = curry(to_wei)
from __future__ import absolute_import

import operator

from cytoolz import curry


# We use a blacklist instead of whitelist because:
#   1. We have more things to include than exclude.
#   2. This gives us access to things like matmul iff we are in Python >=3.5.
no_curry = frozenset((
    'abs',
    'index',
    'inv',
    'invert',
    'neg',
    'not_',
    'pos',
    'truth',
))

locals().update(
    dict((name, curry(f) if name not in no_curry else f)
         for name, f in vars(operator).items() if callable(f)),
)

# Clean up the namespace.
del curry
del no_curry
del operator
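# After the namespace update above, binary operators become curryable while
# the blacklisted unary names stay plain. The equivalent effect, reproduced
# inline as a self-contained check:
import operator
from cytoolz import curry

add = curry(operator.add)
mul = curry(operator.mul)

assert add(2)(3) == 5
assert list(map(mul(10), [1, 2])) == [10, 20]
# Unary callables such as operator.abs gain nothing from currying, hence the
# blacklist.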
    interleave,
    isdistinct,
    isiterable,
    juxt,
    last,
    memoize,
    merge_sorted,
    peek,
    pipe,
    second,
    thread_first,
    thread_last,
)
from .exceptions import merge, merge_with

accumulate = cytoolz.curry(cytoolz.accumulate)
assoc = cytoolz.curry(cytoolz.assoc)
assoc_in = cytoolz.curry(cytoolz.assoc_in)
cons = cytoolz.curry(cytoolz.cons)
countby = cytoolz.curry(cytoolz.countby)
dissoc = cytoolz.curry(cytoolz.dissoc)
do = cytoolz.curry(cytoolz.do)
drop = cytoolz.curry(cytoolz.drop)
excepts = cytoolz.curry(cytoolz.excepts)
filter = cytoolz.curry(cytoolz.filter)
get = cytoolz.curry(cytoolz.get)
get_in = cytoolz.curry(cytoolz.get_in)
groupby = cytoolz.curry(cytoolz.groupby)
interpose = cytoolz.curry(cytoolz.interpose)
itemfilter = cytoolz.curry(cytoolz.itemfilter)
itemmap = cytoolz.curry(cytoolz.itemmap)
def img(self, i):
    x, y = self.x_edges, self.y_edges
    spline = RectBivariateSpline(x, y, i)
    self.intensity = curry(spline, grid=False)
def intensity(self):
    x, y = self.x_centers, self.y_centers
    dx, dy = self.x_diffs, self.y_diffs
    spline = RectBivariateSpline(x, y, self.hist / outer(dx, dy))
    return curry(spline, grid=False)
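# In both snippets above, `curry(spline, grid=False)` yields a plain
# point-wise evaluator. A sketch with synthetic data (the surface z = x * y
# is a low-degree polynomial, so an interpolating spline reproduces it and
# the check below should hold):
import numpy as np
from cytoolz import curry
from scipy.interpolate import RectBivariateSpline

x = np.linspace(0, 1, 5)
y = np.linspace(0, 1, 5)
z = np.outer(x, y)                  # synthetic intensity surface
intensity = curry(RectBivariateSpline(x, y, z), grid=False)

# Evaluate at (0.2, 0.4) and (0.5, 0.5) point-wise, not on the grid product.
vals = intensity(np.array([0.2, 0.5]), np.array([0.4, 0.5]))
assert np.allclose(vals, [0.2 * 0.4, 0.5 * 0.5])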
def _curry_namespace(ns):
    return dict((name, cytoolz.curry(f) if f in _curry_set else f)
                for name, f in ns.items()
                if '__' not in name)
def _curry_namespace(ns):
    return dict(
        (name, cytoolz.curry(f) if f in _curry_set else f)
        for name, f in ns.items()
        if '__' not in name
    )
def curry_namespace(ns):
    return {
        name: cytoolz.curry(f) if should_curry(f) else f
        for name, f in ns.items()
        if "__" not in name
    }