def extract_and_capitalize_headlines_from_corpus(corpus_dir, docids):
    """
    Iterate through all the files in `corpus_dir`, extract the headlines,
    capitalize them and return them.

    Parameters
    ----------
    corpus_dir: string
    docids: list of string
        the documents to be processed

    Returns
    -------
    generator of (docid, headlines): (str, list<list<str>>)
    """
    get_tokens = partial(map, partial(get_in, ["token"]))
    get_features = partial(get_in, ["features"])
    make_capitalized_title_new = lambda words: make_capitalized_title(title_words=words)

    for docid in docids:
        p = Path(corpus_dir) / Path(docid)
        auxil_p = p.with_suffix(".auxil")
        paf_p = p.with_suffix(".paf")
        if auxil_p.exists() and paf_p.exists():
            try:
                titles, _ = separate_title_from_body(str(auxil_p), str(paf_p))
            except Exception as e:
                yield (e, None)
                continue  # skip documents whose titles cannot be parsed

            # pipeline:
            # -> get features
            # -> get tokens
            # -> capitalize headline
            yield (None,
                   (p.name,
                    list(map(compose(make_capitalized_title_new,
                                     get_tokens,
                                     get_features),
                             titles))))
def ccds_to_bed(ccds_stream):
    """Convert CCDS dump to Chanjo-style BED stream.

    Main entry point for default Chanjo converter (ccds). It converts
    a sorted (start, chrom) CCDS database to the Chanjo BED-format.

    Args:
      ccds_stream (file): file handle to read CCDS lines from

    Yields:
      Interval: interval with merged block and superblock ids
    """
    return pipe(
        ccds_stream,
        filter(grep('Public')),                           # keep only 'Public' tx
        map(text_type.rstrip),                            # strip \n and spaces
        map(split(sep='\t')),                             # split into list
        map(extract_intervals),                           # convert to Interval
        concat,                                           # flatten
        map(rename_sex_interval),                         # rename sex contigs
        partial(lazy_groupby, key=attrgetter('contig')),  # group by contig
        pluck(1),                                         # extract second item
        map(groupby(attrgetter('name'))),                 # non-lazy group by id
        map(valmap(merge_related_elements)),              # group intervals
        map(itervalues),                                  # extract values
        map(partial(sorted, key=attrgetter('start'))),    # sort by start pos
        concat                                            # flatten
    )
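A minimal, self-contained sketch (not from Chanjo) of the same curried pipeline style: `map` and `filter` here come from `toolz.curried`, so calling them with only the function returns a partially applied stage that `pipe` threads the data through.

from toolz import pipe, partial
from toolz.curried import filter, map

lines = ["Public\t7\t100", "Withdrawn\t3\t50", "Public\t2\t10"]
result = pipe(
    lines,
    filter(lambda line: "Public" in line),          # keep 'Public' records
    map(lambda line: line.split("\t")),             # split into fields
    partial(sorted, key=lambda row: int(row[1])),   # sort on second field
    list,
)
# [['Public', '2', '10'], ['Public', '7', '100']]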
def read_csv(fn, *args, **kwargs):
    chunksize = kwargs.pop('chunksize', 2**16)
    categorize = kwargs.pop('categorize', None)
    index = kwargs.pop('index', None)
    if index and categorize is None:
        categorize = True

    header = kwargs.get('header', 1)
    nlines = linecount(fn) - header
    nchunks = int(ceil(1.0 * nlines / chunksize))
    read = next(read_csv_names)

    blockdivs = tuple(range(chunksize, nlines, chunksize))

    one_chunk = pd.read_csv(fn, *args, nrows=100, **kwargs)

    cols = []
    if categorize or index:
        if categorize:
            category_columns = [c for c in one_chunk.dtypes.index
                                if one_chunk.dtypes[c] == 'O']
        else:
            category_columns = []
        cols = category_columns + ([index] if index else [])
        d = read_csv(fn, *args, **merge(kwargs,
                                        dict(chunksize=chunksize,
                                             usecols=cols,
                                             categorize=False,
                                             parse_dates=None)))
        categories = [d[c].drop_duplicates() for c in category_columns]
        if index:
            quantiles = d[index].quantiles(np.linspace(0, 100, nchunks + 1)[1:-1])
            result = compute(quantiles, *categories)
            quantiles, categories = result[0], result[1:]
        else:
            categories = compute(*categories)
        categories = dict(zip(category_columns, categories))

    kwargs['chunksize'] = chunksize
    load = {(read, -1): (partial(pd.read_csv, *args, **kwargs), fn)}
    load.update(dict(((read, i), (get_chunk, (read, i - 1), chunksize * i))
                     for i in range(nchunks)))

    name = next(names)
    dsk = dict(((name, i), (getitem, (read, i), 0)) for i in range(nchunks))

    result = DataFrame(merge(dsk, load), name, one_chunk.columns, blockdivs)

    if categorize:
        func = partial(categorize_block, categories=categories)
        result = result.map_blocks(func, columns=result.columns)

    if index:
        result = set_partition(result, index, quantiles)

    return result
def test_to_tree_slice(serial):
    t = symbol('t', 'var * {name: string, amount: int32}')
    expr = t[:5]
    expr2 = pipe(expr,
                 partial(to_tree, names={t: 't'}),
                 serial.dumps,
                 serial.loads,
                 partial(from_tree, namespace={'t': t}))
    assert expr.isidentical(expr2)
def working_datetime_ranges_of_date(d,
                                    special_working_hours={},
                                    week_working_hours={},
                                    merge_tomorrow=True):
    """
    Returns a list of datetime tuples (datetime ranges) indicating the
    contiguous working periods of the given date. If `merge_tomorrow` is
    true, check whether the first period of tomorrow is contiguous with
    the last period of today and merge the two.
    """
    # curried on working hours
    whs_by_date = partial(working_hours_of_date,
                          special_working_hours=special_working_hours,
                          week_working_hours=week_working_hours)
    # curried on date
    whs_to_dt_ranges = partial(working_hours_to_datetime_ranges, d)

    today_working_hours = whs_by_date(d)

    if not len(today_working_hours):
        return []

    if not merge_tomorrow:
        return whs_to_dt_ranges(today_working_hours)

    tomorrow_working_hours = whs_by_date(tomorrow(d))
    if are_working_hours_contiguous(today_working_hours, tomorrow_working_hours):
        # the last range of today becomes a range merged from the last
        # range of today and the first range of tomorrow
        next_day = tomorrow(d)

        # when tomorrow's working hours end at 00:00 they are necessarily
        # (00:00, 00:00), because they are contiguous with today's working
        # hours; in that case add one more day, since the period really ends
        # at 00:00 of the day after. This covers 24/7-like situations.
        if tomorrow_working_hours[0][1] == time(0):
            next_day = tomorrow(next_day)

        last_period = (
            datetime.combine(d, today_working_hours[-1][0]),
            datetime.combine(next_day, tomorrow_working_hours[0][1])
        )

        return whs_to_dt_ranges(today_working_hours[:-1]) + [last_period]

    return whs_to_dt_ranges(today_working_hours)
def get(dsk, keys, optimizations=[fuse], num_workers=cpu_count):
    """ Multiprocessed get function appropriate for Bags """
    pool = _globals['pool']
    if pool is None:
        pool = multiprocessing.Pool(psutil.cpu_count())
        cleanup = True
    else:
        cleanup = False

    manager = multiprocessing.Manager()
    queue = manager.Queue()

    apply_async = dill_apply_async(pool.apply_async)

    # Optimize Dask
    dsk2 = pipe(dsk, partial(cull, keys=keys), *optimizations)

    try:
        # Run
        result = get_async(apply_async, cpu_count, dsk2, keys, queue=queue)
    finally:
        if cleanup:
            pool.close()
    return result
def __getattr__(self, key):
    if key == '_hash':
        raise AttributeError()
    try:
        return _attr_cache[(self, key)]
    except KeyError:
        pass
    try:
        result = object.__getattribute__(self, key)
    except AttributeError:
        fields = dict(zip(map(valid_identifier, self.fields), self.fields))

        if self.fields and key in fields:
            if isscalar(self.dshape.measure):  # t.foo.foo is t.foo
                result = self
            else:
                result = self[fields[key]]
        else:
            d = toolz.merge(schema_methods(self.dshape.measure),
                            dshape_methods(self.dshape))
            if key in d:
                func = d[key]
                if func in method_properties:
                    result = func(self)
                else:
                    result = functools.update_wrapper(partial(func, self), func)
            else:
                raise
    _attr_cache[(self, key)] = result
    return result
def trim_internal(x, axes, boundary=None):
    """ Trim sides from each block

    This couples well with the overlap operation, which may leave excess
    data on each block

    See also
    --------
    dask.array.chunk.trim
    dask.array.map_blocks
    """
    boundary = coerce_boundary(x.ndim, boundary)

    olist = []
    for i, bd in enumerate(x.chunks):
        bdy = boundary.get(i, 'none')
        ilist = []
        for j, d in enumerate(bd):
            if bdy != 'none':
                d = d - axes.get(i, 0) * 2
            else:
                d = d - axes.get(i, 0) if j != 0 else d
                d = d - axes.get(i, 0) if j != len(bd) - 1 else d
            ilist.append(d)
        olist.append(tuple(ilist))
    chunks = tuple(olist)

    return map_blocks(partial(_trim, axes=axes, boundary=boundary),
                      x, chunks=chunks, dtype=x.dtype)
def test_inline_ignores_curries_and_partials():
    dsk = {'x': 1, 'y': 2,
           'a': (partial(add, 1), 'x'),
           'b': (inc, 'a')}

    result = inline_functions(dsk, fast_functions=set([add]))
    assert 'a' not in set(result.keys())
def ghost_internal(x, axes):
    """ Share boundaries between neighboring blocks

    Parameters
    ----------

    x: da.Array
        A dask array
    axes: dict
        The size of the shared boundary per axis

    The axes dict informs how many cells to overlap between neighboring
    blocks {0: 2, 2: 5} means share two cells in 0 axis, 5 cells in 2 axis
    """
    dims = list(map(len, x.blockdims))
    expand_key2 = partial(expand_key, dims=dims)
    interior_keys = pipe(x._keys(), flatten,
                         map(expand_key2), map(flatten),
                         concat, list)
    interior_slices = dict((k, fractional_slice(k, axes))
                           for k in interior_keys)

    shape = (3,) * x.ndim
    name = next(ghost_names)
    ghost_blocks = dict(((name,) + k[1:],
                         (rec_concatenate, (concrete, expand_key2(k))))
                        for k in interior_keys)

    blockdims = [[bds[0] + axes.get(i, 0)]
                 + [bd + axes.get(i, 0) * 2 for bd in bds[1:-1]]
                 + [bds[-1] + axes.get(i, 0)]
                 for i, bds in enumerate(x.blockdims)]

    return Array(merge(interior_slices, ghost_blocks, x.dask),
                 name, blockdims=blockdims)
def is_date_a_fixed_closing_date(d, fixed_closing_days=[]):
    """
    Check whether the date is in the given list of closing dates,
    ignoring the year when comparing.
    """
    return d in filter(None, map(partial(date_with_year, d.year),
                                 fixed_closing_days))
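A quick usage sketch under stated assumptions: `date_with_year` is not shown in the snippet, so a stand-in is defined here that returns the date with its year replaced (or None when that date does not exist, which is why falsy values are filtered out).

from datetime import date

def date_with_year(year, d):
    # hypothetical stand-in for the project's helper of the same name
    try:
        return d.replace(year=year)
    except ValueError:
        return None

is_date_a_fixed_closing_date(date(2020, 12, 25),
                             fixed_closing_days=[date(2000, 1, 1),
                                                 date(2000, 12, 25)])
# True: December 25th matches regardless of year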
def format_results(terminal_width, key_list, separator, text_list,
                   left_align=True, min_factor=3, **kwargs):
    """Returns formatted results in two columns.
    """
    key_width = max(map(len, key_list))
    separator_length = len(separator)
    desc_wrap = toolz.identity
    if terminal_width:
        if key_width / terminal_width > .5:
            key_width = terminal_width // 2 - 3
        text_width = terminal_width - key_width - separator_length
        if text_width * min_factor > terminal_width:
            desc_wrap = toolz.compose(
                ('\n' + ' ' * (key_width + separator_length)).join,
                toolz.partial(textwrap.wrap, width=text_width, **kwargs),
            )

    if left_align:
        fmt = '%-*s%s%s'
    else:
        fmt = '%*s%s%s'

    for key, text in zip(key_list, text_list):
        text = desc_wrap(text)
        if len(key) > key_width:
            yield fmt % (key_width, key, separator, '')
            yield fmt % (key_width, '', ' ' * separator_length, text)
        else:
            yield fmt % (key_width, key, separator, text)
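A small usage sketch with assumed inputs: keys and descriptions are printed as two aligned columns, with long descriptions wrapped and indented under their key by the textwrap-based `desc_wrap` composition above.

keys = ['--verbose', '--quiet']
texts = ['Print more output about what the tool is doing.',
         'Suppress all non-error output.']
for line in format_results(40, keys, ' : ', texts):
    print(line)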
def __getattr__(self, key):
    if key in dir(self._accessor):
        if isinstance(getattr(self._accessor, key), property):
            return self._property_map(key)
        else:
            return partial(self._function_map, key)
    else:
        raise AttributeError(key)
def hash_join(lhs, left_on, rhs, right_on, how="inner",
              npartitions=None, suffixes=("_x", "_y")):
    """ Join two DataFrames on particular columns with hash join

    This shuffles both datasets on the joined column and then performs an
    embarrassingly parallel join partition-by-partition

    >>> hash_join(a, 'id', rhs, 'id', how='left', npartitions=10)  # doctest: +SKIP
    """
    if npartitions is None:
        npartitions = max(lhs.npartitions, rhs.npartitions)

    lhs2 = shuffle(lhs, left_on, npartitions)
    rhs2 = shuffle(rhs, right_on, npartitions)

    if isinstance(left_on, Index):
        left_on = None
        left_index = True
    else:
        left_index = False

    if isinstance(right_on, Index):
        right_on = None
        right_index = True
    else:
        right_index = False

    # dummy result
    dummy = pd.merge(lhs._pd, rhs._pd, how, None,
                     left_on=left_on, right_on=right_on,
                     left_index=left_index, right_index=right_index,
                     suffixes=suffixes)

    merger = partial(_pdmerge, suffixes=suffixes,
                     default_left_columns=list(lhs.columns),
                     default_right_columns=list(rhs.columns))

    if isinstance(left_on, list):
        left_on = (list, tuple(left_on))
    if isinstance(right_on, list):
        right_on = (list, tuple(right_on))

    token = tokenize(lhs, left_on, rhs, right_on,
                     left_index, right_index, how, npartitions, suffixes)
    name = "hash-join-" + token

    dsk = dict(((name, i), (merger, (lhs2._name, i), (rhs2._name, i),
                            how, left_on, right_on,
                            left_index, right_index))
               for i in range(npartitions))

    divisions = [None] * (npartitions + 1)
    return DataFrame(toolz.merge(lhs2.dask, rhs2.dask, dsk),
                     name, dummy, divisions)
def test_get_with_dill():
    with scheduler_and_workers() as (s, (a, b)):
        c = Client(s.address_to_clients)

        dsk = {'x': 1, 'y': (partial(add, 1), 'x')}
        keys = 'y'

        assert c.get(dsk, keys) == 2
        c.close()
def hash_join(lhs, left_on, rhs, right_on, how='inner',
              npartitions=None, suffixes=('_x', '_y'), shuffle=None,
              indicator=False):
    """ Join two DataFrames on particular columns with hash join

    This shuffles both datasets on the joined column and then performs an
    embarrassingly parallel join partition-by-partition

    >>> hash_join(a, 'id', rhs, 'id', how='left', npartitions=10)  # doctest: +SKIP
    """
    print('started hash_join, indicator = ', indicator)

    if npartitions is None:
        npartitions = max(lhs.npartitions, rhs.npartitions)

    lhs2 = shuffle_func(lhs, left_on, npartitions=npartitions, shuffle=shuffle)
    rhs2 = shuffle_func(rhs, right_on, npartitions=npartitions, shuffle=shuffle)

    if isinstance(left_on, Index):
        left_on = None
        left_index = True
    else:
        left_index = False

    if isinstance(right_on, Index):
        right_on = None
        right_index = True
    else:
        right_index = False

    # dummy result
    meta = pd.merge(lhs._meta_nonempty, rhs._meta_nonempty, how, None,
                    left_on=left_on, right_on=right_on,
                    left_index=left_index, right_index=right_index,
                    suffixes=suffixes, indicator=indicator)

    merger = partial(_pdmerge, suffixes=suffixes,
                     default_left_columns=list(lhs.columns),
                     default_right_columns=list(rhs.columns),
                     indicator=indicator)

    if isinstance(left_on, list):
        left_on = (list, tuple(left_on))
    if isinstance(right_on, list):
        right_on = (list, tuple(right_on))

    token = tokenize(lhs2, left_on, rhs2, right_on, left_index, right_index,
                     how, npartitions, suffixes, shuffle)
    name = 'hash-join-' + token

    dsk = dict(((name, i), (merger, (lhs2._name, i), (rhs2._name, i),
                            how, left_on, right_on,
                            left_index, right_index))
               for i in range(npartitions))

    divisions = [None] * (npartitions + 1)

    return DataFrame(toolz.merge(lhs2.dask, rhs2.dask, dsk),
                     name, meta, divisions)
def __getattr__(self, key):
    try:
        return object.__getattribute__(self, key)
    except AttributeError:
        if key in dir(pd.Series.str):
            if isinstance(getattr(pd.Series.str, key), property):
                return self._property_map(key)
            else:
                return partial(self._function_map, key)
        else:
            raise
def zte_gpon_svlan_check():
    clear_log()
    nodes = graph.cypher.execute(
        "match(n:Olt)--(c:Card) where c.name='GTGO' return n.ip,collect(c.slot)")
    olts = ((x[0], x[1]) for x in nodes)
    lzte_gpon_svlan = lambda x: zte_gpon_svlan(ip=x[0], slots=x[1])
    pool = Pool(8)
    lock = Manager().Lock()
    func = partial(svlan_entry, lock)
    list(pool.map(compose(func, lzte_gpon_svlan), olts))
    pool.close()
    pool.join()
def test_normalize_function():
    def f1(a, b, c=1):
        pass

    def f2(a, b=1, c=2):
        pass

    def f3(a):
        pass

    assert normalize_function(f2)

    f = lambda a: a
    assert normalize_function(f)

    assert (normalize_function(partial(f2, b=2)) ==
            normalize_function(partial(f2, b=2)))

    assert (normalize_function(partial(f2, b=2)) !=
            normalize_function(partial(f2, b=3)))

    assert (normalize_function(partial(f1, b=2)) !=
            normalize_function(partial(f2, b=2)))

    assert (normalize_function(compose(f2, f3)) ==
            normalize_function(compose(f2, f3)))

    assert (normalize_function(compose(f2, f3)) !=
            normalize_function(compose(f2, f1)))

    assert normalize_function(curry(f2)) == normalize_function(curry(f2))
    assert normalize_function(curry(f2)) != normalize_function(curry(f1))
    assert (normalize_function(curry(f2, b=1)) ==
            normalize_function(curry(f2, b=1)))
    assert (normalize_function(curry(f2, b=1)) !=
            normalize_function(curry(f2, b=2)))
def card_check():
    clear_log()
    # nodes = graph.find('Olt', property_key='ip', property_value='218.92.130.130')
    nodes = graph.find('Olt')
    # nodes = graph.find('Olt', property_key='company', property_value='zte')
    olts = [(x['ip'], x['company'], x['area']) for x in nodes]
    # list(map(compose(card_entry, get_card), olts))
    pool = multiprocessing.Pool(8)
    lock = multiprocessing.Manager().Lock()
    func = partial(card_entry_m, lock)
    list(pool.map(compose(func, get_card), olts))
    pool.close()
    pool.join()
def interface_check_m():
    clear_log()
    # cmd = "match(s: Switch) where s.model in ['S8505','S8508'] return s.ip, s.model"
    cmd = "match(s: Switch) return s.ip, s.model"
    # cmd = "match(s:Switch) where s.model='S9306' or s.model='s9303' return s.ip,s.model limit 2"
    nodes = graph.cypher.execute(cmd)
    switchs = [(x[0], x[1]) for x in nodes]
    pool = Pool(16)
    lock = Manager().Lock()
    out_inf = partial(output_interface_m, lock)
    list(pool.map(compose(out_inf, get_interface), switchs))
    pool.close()
    pool.join()
def svlan_check():
    clear_log()
    # nodes = graph.find('Olt', property_key='ip', property_value='9.192.96.246')
    nodes = graph.find('Olt')
    # nodes = graph.find('Olt', property_key='company', property_value='zte')
    olts = [(x['ip'], x['company'], x['area']) for x in nodes]
    # list(map(compose(card_entry, get_card), olts))
    pool = Pool(16)
    lock = Manager().Lock()
    func = partial(svlan_entry, lock)
    list(pool.map(compose(func, get_svlan), olts))
    pool.close()
    pool.join()
def trim_internal(x, axes=None):
    """ Trim sides from each block

    This couples well with the ghost operation, which may leave excess data
    on each block

    See also
        chunk.trim
        map_blocks
    """
    blockdims = tuple([tuple([d - axes.get(i, 0) * 2 for d in bd])
                       for i, bd in enumerate(x.blockdims)])
    return map_blocks(x, partial(chunk.trim, axes=axes), blockdims=blockdims)
def run_features(args):
    """Run image feature computation.

    Parameters
    ----------
    args : argparse.Namespace
        The arguments parsed by the argparse library.
    """
    if args.global_threshold:
        images = map(io.imread, args.images)
        thresholds = pre.global_threshold(images, args.random_seed)
    else:
        thresholds = None
    images = map(io.imread, args.images)
    screen_info = screens.d[args.screen]
    index_function, fmap = screen_info['index'], screen_info['fmap']
    fmap = tz.partial(fmap, threshold=thresholds,
                      sample_size=args.sample_size,
                      random_seed=args.random_seed)
    indices = list(map(index_function, args.images))
    f0, feature_names = fmap(next(images))
    feature_vectors = tz.cons(f0, (fmap(im)[0] for im in images))
    online_scaler = StandardScaler()
    online_pca = cluster.OnlineIncrementalPCA(n_components=args.n_components,
                                              batch_size=args.pca_batch_size)
    nimages, nfeatures = len(args.images), len(f0)
    emit = io.emitter_function(args.emitter)
    with temporary_hdf5_dataset((nimages, nfeatures), 'float') as dset:
        # First pass: compute the features, compute the mean and SD,
        # compute the PCA
        for i, (idx, v) in enumerate(zip(indices, feature_vectors)):
            emit({'_id': idx, 'feature_vector': list(v)})
            dset[i] = v
            online_scaler.partial_fit(v.reshape(1, -1))
            online_pca.add_sample(v)
        # Second pass: standardise the feature vectors, compute PCA-transform
        for i, (idx, v) in enumerate(zip(indices, dset)):
            v_std = online_scaler.transform(v.reshape(1, -1))[0]
            v_pca = online_pca.transform(v)
            dset[i] = v_std
            emit({'_id': idx,
                  'feature_vector_std': list(v_std),
                  'pca_vector': list(v_pca)})
            online_pca.transform(v)
        # Third pass: Compute the nearest neighbors graph.
        # THIS ANNOYINGLY INSTANTIATES FULL ARRAY -- no out-of-core
        # solution that I'm aware of...
        ng = neighbors.kneighbors_graph(dset, args.num_neighbours,
                                        include_self=False, mode='distance')
        for idx, row in zip(indices, ng):
            emit({'_id': idx,
                  'neighbours': [indices[i] for i in row.indices]})
def __init__(self, bamfile, outdir):
    self.bamfile = bamfile
    stat = self.indexbamfile()
    self.outdir = outdir
    assert self.bamfile and self.outdir and stat, "Input error"
    self._bam = pysam.Samfile(bamfile)
    self._prealloc_func = partial(np.zeros, dtype=np.int)
    self.fake_bed_rows = [("chrX", 1, 59373566),
                          ("chrY", 69362, 11375310)]
    self.sequence = pipe(self.fake_bed_rows,
                         map(lambda interval: self.depthreader(*interval)),
                         map(average))
    self.x_coverage, self.y_coverage = list(self.sequence)
    self.sex = self.predict_gender()
def overlap_internal(x, axes):
    """ Share boundaries between neighboring blocks

    Parameters
    ----------

    x: da.Array
        A dask array
    axes: dict
        The size of the shared boundary per axis

    The axes input informs how many cells to overlap between neighboring
    blocks {0: 2, 2: 5} means share two cells in 0 axis, 5 cells in 2 axis
    """
    dims = list(map(len, x.chunks))
    expand_key2 = partial(expand_key, dims=dims, axes=axes)

    # Make keys for each of the surrounding sub-arrays
    interior_keys = pipe(x.__dask_keys__(), flatten,
                         map(expand_key2), map(flatten),
                         concat, list)

    name = 'overlap-' + tokenize(x, axes)
    getitem_name = 'getitem-' + tokenize(x, axes)
    interior_slices = {}
    overlap_blocks = {}
    for k in interior_keys:
        frac_slice = fractional_slice((x.name,) + k, axes)
        if (x.name,) + k != frac_slice:
            interior_slices[(getitem_name,) + k] = frac_slice
        else:
            interior_slices[(getitem_name,) + k] = (x.name,) + k
        overlap_blocks[(name,) + k] = (concatenate3,
                                       (concrete, expand_key2((None,) + k,
                                                              name=getitem_name)))

    chunks = []
    for i, bds in enumerate(x.chunks):
        if len(bds) == 1:
            chunks.append(bds)
        else:
            left = [bds[0] + axes.get(i, 0)]
            right = [bds[-1] + axes.get(i, 0)]
            mid = []
            for bd in bds[1:-1]:
                mid.append(bd + axes.get(i, 0) * 2)
            chunks.append(left + mid + right)

    dsk = merge(interior_slices, overlap_blocks)
    dsk = sharedict.merge(x.dask, (name, dsk))

    return Array(dsk, name, chunks, dtype=x.dtype)
def hostname_check():
    clear_log()
    nodes = graph.find('Olt')
    # nodes = graph.find('Olt', property_key='ip', property_value='172.18.0.46')
    olts = [(x['ip'], x['company']) for x in nodes]
    pool = Pool(16)
    lock = Manager().Lock()
    func = partial(hostname_entry, lock)
    list(pool.map(compose(func, get_hostname), olts))
    pool.close()
    pool.join()
    ip_hostname = (x.split(',') for x in open(result_file))
    cmd = "match (n:Olt) where n.ip={ip} set n.hostname={hostname}"
    list(map(lambda x: graph.cypher.execute(cmd, ip=x[0], hostname=x[1]),
             ip_hostname))
def get(dsk, keys, optimizations=[], num_workers=None,
        func_loads=None, func_dumps=None, **kwargs):
    """ Multiprocessed get function appropriate for Bags

    Parameters
    ----------

    dsk: dict
        dask graph
    keys: object or list
        Desired results from graph
    optimizations: list of functions
        optimizations to perform on graph before execution
    num_workers: int
        Number of worker processes (defaults to number of cores)
    func_dumps: function
        Function to use for function serialization
        (defaults to cloudpickle.dumps)
    func_loads: function
        Function to use for function deserialization
        (defaults to cloudpickle.loads)
    """
    pool = _globals['pool']
    if pool is None:
        pool = multiprocessing.Pool(num_workers)
        cleanup = True
    else:
        cleanup = False

    manager = multiprocessing.Manager()
    queue = manager.Queue()

    apply_async = pickle_apply_async(pool.apply_async,
                                     func_dumps=func_dumps,
                                     func_loads=func_loads)

    # Optimize Dask
    dsk2 = fuse(dsk, keys)
    dsk3 = pipe(dsk2, partial(cull, keys=keys), *optimizations)

    try:
        # Run
        result = get_async(apply_async, len(pool._pool), dsk3, keys,
                           queue=queue, get_id=_process_get_id, **kwargs)
    finally:
        if cleanup:
            pool.close()
    return result
def test_normalize_function():
    def f1(a, b, c=1):
        pass

    cf1 = curry(f1)

    def f2(a, b=1, c=2):
        pass

    def f3(a):
        pass

    assert normalize_function(f2) == str(f2)
    f = lambda a: a
    assert normalize_function(f) == str(f)
    comp = compose(partial(f2, b=2), f3)
    assert normalize_function(comp) == ((str(f2), (), (('b', 2),)), str(f3))
    assert normalize_function(cf1) == (str(f1), (), ())
    assert normalize_function(cf1(2, c=2)) == (str(f1), (2,), (('c', 2),))
    assert normalize_token(cf1) == normalize_function(cf1)
def predict(model, x):
    """ Predict with a scikit learn model

    Parameters
    ----------

    model: scikit learn classifier
    x: dask Array

    See docstring for ``da.learn.fit``
    """
    assert x.ndim == 2
    if len(x.chunks[1]) > 1:
        x = x.reblock(chunks=(x.chunks[0], sum(x.chunks[1])))
    func = partial(_predict, model)
    return x.map_blocks(func, chunks=(x.chunks[0], (1,))).squeeze()
import pytest
from toolz import partial

import dask
from dask import compute
from dask.compatibility import PY_VERSION
from dask.utils import filetexts
from dask.bytes import utils
from dask.bag.text import read_text
from fsspec.compression import compr

compute = partial(compute, scheduler="sync")

files = {
    ".test.accounts.1.json": (
        '{"amount": 100, "name": "Alice"}\n'
        '{"amount": 200, "name": "Bob"}\n'
        '{"amount": 300, "name": "Charlie"}\n'
        '{"amount": 400, "name": "Dennis"}\n'
    ),
    ".test.accounts.2.json": (
        '{"amount": 500, "name": "Alice"}\n'
        '{"amount": 600, "name": "Bob"}\n'
        '{"amount": 700, "name": "Charlie"}\n'
        '{"amount": 800, "name": "Dennis"}\n'
    ),
}

expected = "".join([files[v] for v in sorted(files)])
def get_pet_relations(pet):
    return compose(list, partial(pluck, "customer"))(
        frappe.get_all("Pet Relation", filters={"parent": pet}, fields=["customer"])
    )
def extract(
        image,
        classifier=None,
        context=default_context,
        output_folder=None,
        return_negatives=False,
        override_prediction=False,
):
    """
    extract(image, classifier, context=default_context, output_folder=None,
            return_negatives=False, override_prediction=False)

    A function utilising the core of the package to extract required lines;
    by default it classifies the required and non-required lines.

    Note
    ----
    Needs refactoring.

    Parameters
    ----------
    image : np.array or str
        image as loaded by ``cv2.imread`` or string path to the image on disk

    classifier : sklearn model or str
        sklearn model for classification or a string to a pickled model;
        loads the last trained model. Current default model is loaded
        if nothing else is provided.

    context : dict
        parameter dictionary which contains default settings for various
        functions
        # TODO: Write better summary of how to use this

    output_folder : str
        if provided will save the predicted lines

    override_prediction : bool
        if ``True`` then it overwrites any filtering done by the model and
        turns this into a regular pipeline of getting just the subsets

    expand : dict
        experimental feature. This will eventually accept a dictionary of
        parameters which will be trickled down into the core making testing
        easier. At the moment we can only change the vertical padding of
        the system.

    Returns
    -------
    list | tuple
        a list of cutout lines in numpy array form if ``return_negatives``
        is disabled, else a tuple containing both positive predictions and
        negatives (1s and 0s)
    """
    # logic for classifier assessment
    if classifier:
        if isinstance(classifier, str):
            classifier = io.load_model(classifier)
        else:
            classifier = classifier
    else:
        classifier = io.load_model(
            resource_filename("readpyne", "models/classifier.pkl"))

    pipe = fp.compose(unfold_args(core.features),
                      fp.partial(core.boxes, context=context))

    if isinstance(image, str):
        pipe = fp.compose(pipe, io.load_validate)

    subsets, features = pipe(image)

    # return the subsets raw without doing any other work
    if override_prediction:
        print(
            "[WARN] You have chosen not to use the classifier and hence full list of lines is returned"
        )
        return subsets

    # Use the model to predict
    prediction = classifier.predict(features)

    # get the zero and non-zero indices
    bindices_zero = prediction == 0
    zeros = np.arange(len(prediction))[bindices_zero]
    nonzeros = np.arange(len(prediction))[~bindices_zero]

    # Try to get the subsets that classify as non-zero
    try:
        positives = itemgetter(*nonzeros)(subsets.copy())
    except:
        raise NoPositivesFound("Could not get positive (1's) from subsets")

    # Make sure in the case of only 1 line found, we still return a list and
    # not an array.
    if not isinstance(positives, tuple) and isinstance(positives, type(np.zeros(1))):
        positives = (positives, )

    print(f"[INFO] {len(positives)} item lines found by the classifier")

    # output positives if this is provided
    if output_folder:
        io.save_images(positives, path=output_folder)

    # if required return negatives as a tuple
    if return_negatives:
        try:
            negatives = itemgetter(*zeros)(subsets.copy())
        except:
            raise Exception("Could not get 0's from subsets")

        if not isinstance(negatives, tuple) and isinstance(
                negatives, type(np.zeros(1))):
            negatives = (negatives, )

        print(
            f"[INFO] {len(negatives)} non-item lines found by the classifier")

        # override positives to contain the final results
        positives = (positives, negatives)

    return positives
def cxonepointleafbiased(**kwargs):
    """Factory for cxonepointleafbiased"""
    termpb = kwargs.get("termpb", 0.1)
    return toolz.partial(deap.gp.cxOnePointLeafBiased, termpb=termpb)
def mutnodereplacement(pset, **kwargs):
    """Factory for mutnodereplacement"""
    return toolz.partial(deap.gp.mutNodeReplacement, pset=pset)
with ignoring(ImportError):
    import lz4

    def _fixed_lz4_decompress(data):
        # lz4.LZ4_uncompress() doesn't accept memoryviews
        if isinstance(data, memoryview):
            data = data.tobytes()
        return lz4.LZ4_uncompress(data)

    compressions['lz4'] = {'compress': lz4.LZ4_compress,
                           'decompress': _fixed_lz4_decompress}
    default_compression = 'lz4'

with ignoring(ImportError):
    import blosc
    compressions['blosc'] = {'compress': partial(blosc.compress, clevel=5, cname='lz4'),
                             'decompress': blosc.decompress}


default = config.get('compression', 'auto')
if default != 'auto':
    if default in compressions:
        default_compression = default
    else:
        raise ValueError("Default compression '%s' not found.\n"
                         "Choices include auto, %s" % (
                             default, ', '.join(sorted(map(str, compressions)))))


def byte_sample(b, size, n):
    """ Sample a bytestring from many locations
def partial_serializer(serializer_name, dump_kwargs, load_kwargs):
    s = serializers[serializer_name]
    return Serializer(
        s.name,
        t.partial(s.dump, **dump_kwargs) if dump_kwargs else s.dump,
        t.partial(s.load, **load_kwargs) if load_kwargs else s.load)
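A sketch of the idea behind `partial_serializer` using plain json (the real `serializers` registry and `Serializer` type come from the surrounding module and are not shown here): keyword arguments are frozen into the dump/load callables up front, so callers always use the same two-argument call shape.

import json
from toolz import partial

dump = partial(json.dump, indent=2, sort_keys=True)   # options baked in
load = json.load

with open("example.json", "w") as fh:
    dump({"b": 2, "a": 1}, fh)   # same call shape as an unconfigured dump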
def resource_json_gzip(uri):
    return resource_json(uri, open=partial(gzip.open, mode='rt'))
        except (ValueError, TypeError):
            if isinstance(data, memoryview):
                return lz4_decompress(data.tobytes())
            else:
                raise

    compressions['lz4'] = {
        'compress': _fixed_lz4_compress,
        'decompress': _fixed_lz4_decompress
    }
    default_compression = 'lz4'

with ignoring(ImportError):
    import blosc

    compressions['blosc'] = {
        'compress': partial(blosc.compress, clevel=5, cname='lz4'),
        'decompress': blosc.decompress
    }


default = config.get('compression', 'auto')
if default != 'auto':
    if default in compressions:
        default_compression = default
    else:
        raise ValueError("Default compression '%s' not found.\n"
                         "Choices include auto, %s" %
                         (default, ', '.join(sorted(map(str, compressions)))))


def byte_sample(b, size, n):
    """ Sample a bytestring from many locations
    def delete(self, id):
        return cs.chained_delete(self, id)

    def _filename(self, id):
        return cs.chained_filename(self, id)


### ArtifactSet logic

def _set_op(operator, *sets, labels=None):
    new_ids = t.reduce(operator, t.map(lambda s: s.artifact_ids, sets))
    return ArtifactSet(new_ids, labels)


set_union = t.partial(_set_op, ops.or_)
set_difference = t.partial(_set_op, ops.sub)
set_intersection = t.partial(_set_op, ops.and_)


artifact_set_properties = ['id', 'artifact_ids', 'created_at', 'labels']

class ArtifactSet(namedtuple('ArtifactSet', artifact_set_properties)):
    def __new__(cls, artifact_ids, labels=None, created_at=None, id=None):
        artifact_ids = t.map(_artifact_id, artifact_ids)
        labels = _check_labels_name(labels)
        ids = frozenset(artifact_ids)
        if id:
            set_id = id
        else:
            set_id = hash(ids)
""" img = resize(img) east_decode = unfold_args(fp.partial(decode, **context["boxes"])) arr = fp.compose(east_decode, forward, blobify)(img) rects, conf = arr[:, 1:], arr[:, 0] boxes = non_max_suppression(expand(rects, img.shape, **context["expand"]), probs=conf) # preserve order by sorting on startx sorted_boxes = pd.DataFrame(boxes).sort_values(1).values return img, get_subsets(img, sorted_boxes) boxesM = fp.partial(map, boxes) def features(img, subsets): """ Take an image and its subsets created from ``boxes`` and produce histogram based features for each subset. Parameters ---------- img : numpy.array numpy array representation of an image. subsets : list list of numpy arrays of the subsets.
from io import BytesIO

import pytest

pd = pytest.importorskip('pandas')
dd = pytest.importorskip('dask.dataframe')

from toolz import partition_all, valmap, partial

from dask import compute
from dask.async import get_sync
from dask.dataframe.csv import read_csv_from_bytes, bytes_read_csv, read_csv
from dask.dataframe.utils import eq
from dask.utils import filetexts, filetext

compute = partial(compute, get=get_sync)


files = {'2014-01-01.csv': (b'name,amount,id\n'
                            b'Alice,100,1\n'
                            b'Bob,200,2\n'
                            b'Charlie,300,3\n'),
         '2014-01-02.csv': (b'name,amount,id\n'),
         '2014-01-03.csv': (b'name,amount,id\n'
                            b'Dennis,400,4\n'
                            b'Edith,500,5\n'
                            b'Frank,600,6\n')}


header = files['2014-01-01.csv'].split(b'\n')[0] + b'\n'
import os
from time import sleep
import sys

import pytest
from toolz import concat, valmap, partial

from dask import compute
from dask.compatibility import FileNotFoundError, unicode
from dask.utils import filetexts
from dask.bytes import compression
from dask.bytes.local import LocalFileSystem
from dask.bytes.core import (read_bytes, open_files,
                             get_pyarrow_filesystem,
                             logical_size, get_fs_token_paths)

compute = partial(compute, scheduler='sync')

files = {'.test.accounts.1.json': (b'{"amount": 100, "name": "Alice"}\n'
                                   b'{"amount": 200, "name": "Bob"}\n'
                                   b'{"amount": 300, "name": "Charlie"}\n'
                                   b'{"amount": 400, "name": "Dennis"}\n'),
         '.test.accounts.2.json': (b'{"amount": 500, "name": "Alice"}\n'
                                   b'{"amount": 600, "name": "Bob"}\n'
                                   b'{"amount": 700, "name": "Charlie"}\n'
                                   b'{"amount": 800, "name": "Dennis"}\n')}

csv_files = {'.test.fakedata.1.csv': (b'a,b\n'
                                      b'1,2\n'),
             '.test.fakedata.2.csv': (b'a,b\n'
                                      b'3,4\n'),
    if not isoption:
        # a is not an option, this is just a
        return a

    b_dshape = discover(b)
    return Coalesce(a, b, DataShape(*(maxshape((a_dshape.shape,
                                                b_dshape.shape)) +
                                      (promote(a_measure, b_dshape.measure),))))


dshape_method_list = list()
schema_method_list = list()
method_properties = set()

dshape_methods = memoize(partial(select_functions, dshape_method_list))
schema_methods = memoize(partial(select_functions, schema_method_list))


@dispatch(DataShape)
def shape(ds):
    s = ds.shape
    s = tuple(int(d) if isinstance(d, Fixed) else d for d in s)
    return s


@dispatch(object)
def shape(expr):
    """ Shape of expression

    >>> symbol('s', '3 * 5 * int32').shape
def get_data(sackmann_dir, tour='atp', keep_davis_cup=False):
    all_csvs = glob(join(sackmann_dir, f'*{tour}_matches_????.csv'))
    all_csvs = sorted(all_csvs, key=lambda x: int(splitext(x)[0][-4:]))

    levels_to_drop = ['C', 'S']

    if not keep_davis_cup:
        levels_to_drop.append('D')

    data = pipe(
        all_csvs,
        # Read CSV
        lambda y: map(partial(pd.read_csv, encoding="ISO-8859-1"), y),
        # Drop NAs in important fields
        lambda y: map(
            lambda x: x.dropna(subset=['winner_name', 'loser_name', 'score']),
            y),
        # Drop retirements and walkovers
        # TODO: Make this optional
        lambda y: map(
            lambda x: x[~x['score'].astype(str).str.contains(
                'RET|W/O|DEF|nbsp|Def.')], y),
        # Drop scores that appear truncated
        lambda y: map(lambda x: x[x['score'].astype(str).str.len() > 4], y),
        # Drop challengers and futures
        # TODO: Make this optional too
        lambda y: map(lambda x: x[~x['tourney_level'].isin(levels_to_drop)], y),
        pd.concat,
    )

    round_numbers = {
        'R128': 1,
        'RR': 1,
        'R64': 2,
        'R32': 3,
        'R16': 4,
        'QF': 5,
        'SF': 6,
        'F': 7
    }

    # Drop rounds outside this list
    to_keep = data['round'].isin(round_numbers)
    data = data[to_keep]

    # Add a numerical round number
    data['round_number'] = data['round'].replace(round_numbers)

    # Add date information
    data['tourney_date'] = pd.to_datetime(
        data['tourney_date'].astype(int).astype(str), format='%Y%m%d')
    data['year'] = data['tourney_date'].dt.year

    # Sort by date and round and reset index
    data = data.sort_values(['tourney_date', 'round_number'])
    data = data.reset_index(drop=True)

    data['pts_won_serve_winner'] = data['w_1stWon'] + data['w_2ndWon']
    data['pts_won_serve_loser'] = data['l_1stWon'] + data['l_2ndWon']

    data['pts_played_serve_winner'] = data['w_svpt']
    data['pts_played_serve_loser'] = data['l_svpt']

    # Add serve % won
    data['spw_winner'] = (data['w_1stWon'] + data['w_2ndWon']) / data['w_svpt']
    data['spw_loser'] = (data['l_1stWon'] + data['l_2ndWon']) / data['l_svpt']

    return data
from __future__ import print_function, division, absolute_import

import pytest
from toolz import partial

from dask import compute, get
from dask.utils import filetexts
from dask.bytes import compression
from dask.bag.text import read_text

compute = partial(compute, get=get)

files = {'.test.accounts.1.json': ('{"amount": 100, "name": "Alice"}\n'
                                   '{"amount": 200, "name": "Bob"}\n'
                                   '{"amount": 300, "name": "Charlie"}\n'
                                   '{"amount": 400, "name": "Dennis"}\n'),
         '.test.accounts.2.json': ('{"amount": 500, "name": "Alice"}\n'
                                   '{"amount": 600, "name": "Bob"}\n'
                                   '{"amount": 700, "name": "Charlie"}\n'
                                   '{"amount": 800, "name": "Dennis"}\n')}

expected = ''.join([files[v] for v in sorted(files)])

fmt_bs = ([(fmt, None) for fmt in compression.files] +
          [(fmt, 10) for fmt in compression.seekable_files] +
          [(fmt, None) for fmt in compression.seekable_files])

encodings = ['ascii', 'utf-8']  # + ['utf-16', 'utf-16-le', 'utf-16-be']
fmt_bs_enc = [(fmt, bs, encoding) for fmt, bs in fmt_bs
              for encoding in encodings]
            if isinstance(data, (memoryview, bytearray)):
                return lz4_decompress(bytes(data))
            else:
                raise

    compressions["lz4"] = {
        "compress": _fixed_lz4_compress,
        "decompress": _fixed_lz4_decompress,
    }
    default_compression = "lz4"

with ignoring(ImportError):
    import blosc

    compressions["blosc"] = {
        "compress": partial(blosc.compress, clevel=5, cname="lz4"),
        "decompress": blosc.decompress,
    }


default = dask.config.get("distributed.comm.compression")
if default != "auto":
    if default in compressions:
        default_compression = default
    else:
        raise ValueError(
            "Default compression '%s' not found.\n"
            "Choices include auto, %s"
            % (default, ", ".join(sorted(map(str, compressions))))
        )
def mutinsert(pset, **kwargs):
    """Factory for mutinsert"""
    return toolz.partial(deap.gp.mutInsert, pset=pset)
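A usage sketch (an assumption, not from the original project): each factory returns a `toolz.partial` whose only remaining argument is the individual, which is the call shape DEAP's toolbox expects for mutation operators.

import operator
import deap.base
import deap.gp

# hypothetical primitive set, built elsewhere in a real run
pset = deap.gp.PrimitiveSet("MAIN", 1)
pset.addPrimitive(operator.add, 2)

toolbox = deap.base.Toolbox()
toolbox.register("mutate", mutinsert(pset))   # later: toolbox.mutate(individual)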
def nsBlockFilterInit(ns):
    nsSet(ns, "/blocks/filter/in", partial(nsGet(ns, "/usr/local/blocks/filter/in"), "/blocks/filter"))
    nsSet(ns, "/blocks/filter/inF", partial(nsGet(ns, "/usr/local/blocks/filter/inF"), "/blocks/filter"))
    nsSet(ns, "/blocks/filter/out", partial(nsGet(ns, "/usr/local/blocks/filter/out"), "/blocks/filter"))
    nsSet(ns, "/blocks/filter/outF", partial(nsGet(ns, "/usr/local/blocks/filter/outF"), "/blocks/filter"))
    nsSet(ns, "/blocks/filter/create", partial(nsGet(ns, "/usr/local/blocks/filter/task"), "/blocks/filter"))
    nsSet(ns, "/blocks/filter/server", partial(nsGet(ns, "/usr/local/blocks/filter/server"), "/blocks/filter"))
    nsSet(ns, "/blocks/filter/handler", partial(nsGet(ns, "/usr/local/blocks/filter/handler"), "/blocks/filter"))
    nsSet(ns, "/blocks/filter/reject", partial(nsGet(ns, "/usr/local/blocks/filter/reject"), "/blocks/filter"))
    nsSet(ns, "/blocks/filter/call", partial(nsGet(ns, "/usr/local/blocks/filter/call"), "/blocks/filter"))
    nsSet(ns, "/blocks/filter/empty", partial(nsGet(ns, "/usr/local/blocks/filter/empty"), "/blocks/filter"))
    nsSet(ns, "/blocks/filter/exists", partial(nsGet(ns, "/usr/local/blocks/filter/exists"), "/blocks/filter"))
    nsSet(ns, "/blocks/filter/configured", True)
    return True
def parents(self):
    # Note: the last created Script object appears to bork the older ones.
    # Must keep making new Script objects!
    # Note: jedi appears to already do enough caching. It does not
    # significantly improve performance to cache the parents.
    # acceptable_name_types = (jedi.parser.tree.Name,
    #                          jedi.evaluate.representation.InstanceElement)
    if self.definition and self.definition.module_path:
        script = jedi.api.Script(
            source_path=self.definition.module_path,
            sys_path=self.definition._evaluator.sys_path,
            line=self.definition.line,
            column=self.definition.column)
        usages = catch_errors(
            tz.partial(jedi_alt.usages.usages_with_additional_modules,
                       script, self.usage_resolution_modules),
            [],
            'while finding usages of {}'.format(self.code_element.name))
    elif self.code_element.call_pos[0]:
        call_pos_script = jedi.api.Script(
            source_path=self.code_element.call_pos[0],
            sys_path=self.definition._evaluator.sys_path
            if self.definition else self.sys_path,
            line=self.code_element.call_pos[1][0],
            column=self.code_element.call_pos[1][1])
        usages = catch_errors(
            tz.partial(jedi_alt.usages.usages_with_additional_modules,
                       call_pos_script, self.usage_resolution_modules),
            [],
            'while finding usages of {}'.format(self.code_element.name))
    elif self.definition:
        script = create_import_script(
            self.definition._evaluator.sys_path
            if self.definition else self.sys_path,
            self.code_element.name)
        usages = [
            usage for usage in catch_errors(
                tz.partial(jedi_alt.usages.usages_with_additional_modules,
                           script, self.usage_resolution_modules),
                [],
                'while finding usages of {}'.format(self.code_element.name))
            if usage.module_name
        ]
    else:
        return ()

    _unfiltered_parents = []
    positions = set()
    for usage in usages:
        tree_name = usage._name.tree_name
        if tree_name:
            position = (usage.module_path, tree_name.start_pos, tree_name.end_pos)
        else:
            position = (None, (None, None), (None, None))
        if position not in positions or position == (None, (None, None), (None, None)):
            _usage_parent = parent_definition(usage)
            if _usage_parent.module_path:
                JediCodeElementNode.usage_resolution_modules |= frozenset(
                    (_usage_parent._name.get_root_context(), ))
            usage_node = JediCodeElementNode.from_definition(
                'parent', position, _usage_parent)
            # check if this usage is actually the definition of the
            # current node, and is therefore already covered by the
            # "- [sig]" node.
            if (usage_node.code_element.call_pos[0] == self.code_element.path
                    and usage_node.code_element.call_pos[1] == self.code_element.start_pos
                    and usage_node.code_element.type == 'module'):
                logger.info(
                    'Usages: Skipped definition of {} at {}:{}.'.format(
                        self.code_element.name,
                        usage_node.code_element.name,
                        usage_node.code_element.call_pos[1][0]))
                continue
            else:
                _unfiltered_parents.append(usage_node)
            positions.add(position)
    _cleanup_signal_queue()
    return _unfiltered_parents
def nsBlockFilterTask(ns, block_path, name, _handler=None, _reject=None, **kw):
    task_path = "/tasks/filter/{}".format(name)
    if name in nsDir(ns, task_path):
        return True
    nsMkdir(ns, task_path)
    nsSet(ns, "{}/id".format(task_path), str(uuid.uuid4()))
    nsSet(ns, "{}/args".format(task_path), ())
    nsSet(ns, "{}/kw".format(task_path), {})
    nsSet(ns, "{}/blocking".format(task_path), False)
    nsSet(ns, "{}/in_q".format(task_path), Queue())
    nsSet(ns, "{}/out_q".format(task_path), Queue())
    nsSet(ns, "{}/in".format(task_path), partial(nsGet(ns, "/blocks/filter/in"), task_path))
    nsSet(ns, "{}/inF".format(task_path), partial(nsGet(ns, "/blocks/filter/inF"), task_path))
    nsSet(ns, "{}/out".format(task_path), partial(nsGet(ns, "/blocks/filter/out"), task_path))
    nsSet(ns, "{}/outF".format(task_path), partial(nsGet(ns, "/blocks/filter/outF"), task_path))
    nsSet(ns, "{}/empty".format(task_path), partial(nsGet(ns, "/blocks/filter/empty"), task_path))
    nsSet(ns, "{}/server".format(task_path), partial(nsGet(ns, "/blocks/filter/server"), task_path))
    if _handler is None:
        nsSet(ns, "{}/handler".format(task_path), partial(nsGet(ns, "/blocks/filter/handler"), task_path))
    else:
        nsSet(ns, "{}/handler".format(task_path), partial(_handler, block_path, task_path))
    if _reject is None:
        nsSet(ns, "{}/reject".format(task_path), partial(nsGet(ns, "/blocks/filter/reject"), task_path))
    else:
        nsSet(ns, "{}/reject".format(task_path), partial(_reject, block_path, task_path))
    nsSet(ns, "{}/call".format(task_path), partial(nsGet(ns, "/blocks/filter/call"), task_path))
    for k in kw:
        nsSet(ns, "{}/{}".format(task_path, k), kw[k])
    nsDaemon(ns, "TASK:filter:{}".format(name), nsGet(ns, "{}/server".format(task_path)), _raw=True)
    return True
def main(config_file, model_name, fit_hyperparams, folds, submission, cv):
    print('Config file: ' + config_file)
    print('Model: ' + model_name)
    print('Fit hyperparams? ' + str(fit_hyperparams))
    print('Folds for which predictions will be added: ' + str(folds))
    print('Generate submission file? ' + str(submission))
    print('Cross-validate? ' + str(cv))

    with open(config_file, 'r') as f:
        config = yaml.load(f)
    with open(config['hyperparams_file'], 'r') as f:
        hyperparams = yaml.load(f)

    # Load data.
    print('Loading data...')
    train_df = pd.read_pickle(config['train'])
    test_df = pd.read_pickle(config['test'])

    # The model names and their definitions.
    model_dict = {
        'test': TestClassifier,
        'nn': NN,
        'nnBagged': toolz.partial(
            StratifiedBaggingClassifier,
            base_estimator=NN(**hyperparams['nn']['constructor']),
            fit_params=hyperparams['nn']['fit']),
        'xgbBagged': toolz.partial(
            StratifiedBaggingClassifier,
            base_estimator=XGBClassifier(**hyperparams['xgb']['constructor']),
            fit_params=hyperparams['xgb']['fit']),
        'lgbmBagged': toolz.partial(
            StratifiedBaggingClassifier,
            base_estimator=LGBMClassifier(**hyperparams['lgbm']['constructor']),
            fit_params=hyperparams['lgbm']['fit']),
        'lgbm': LGBMClassifier,
        'xgb': XGBClassifier,
        'xgbHist': XGBoostWrapper,
        'svm': toolz.partial(svm.SVC, probability=True),
        'randomForest': toolz.partial(RandomForestClassifier),
        'logisticRegression': toolz.partial(LogisticRegression,
                                            class_weight='balanced'),
        'logisticRegressionBagged': toolz.partial(
            StratifiedBaggingClassifier,
            base_estimator=LogisticRegression(
                **hyperparams['logisticRegression']['constructor']),
            fit_params=hyperparams['logisticRegression']['fit']),
    }

    if fit_hyperparams:
        print('Finding hyperparameters...')
        # Construct distributions from tuning_hyperparams.
        param_dists = {}
        tuning_hyperparams = hyperparams[model_name]['tuning_hyperparams']
        constructor_hyperparams = hyperparams[model_name]['constructor']
        nontuning_hyperparams = {x: constructor_hyperparams[x]
                                 for x in constructor_hyperparams
                                 if x not in tuning_hyperparams}
        for param in tuning_hyperparams:
            vals = tuning_hyperparams[param]['vals']
            if tuning_hyperparams[param]['type'] == 'int':
                min = np.min(vals)
                max = np.max(vals)
                param_dists[param] = randint(min, max + 1)  # randint is like [min, max).
            elif tuning_hyperparams[param]['type'] == 'float':
                min = np.min(vals)
                max = np.max(vals)
                param_dists[param] = uniform(loc=min, scale=(max - min))
            elif tuning_hyperparams[param]['type'] == 'string':
                param_dists[param] = vals
            else:
                raise ValueError("Unexpected tuning parameter type: "
                                 + str(tuning_hyperparams[param]['type']))

        clf = RandomizedSearchCV(model_dict[model_name](**nontuning_hyperparams),
                                 param_distributions=param_dists,
                                 n_iter=config['tuning']['n_iter'],
                                 n_jobs=config['tuning']['n_jobs'],
                                 cv=config['tuning']['n_splits'],
                                 scoring='roc_auc',
                                 verbose=5)
        X = train_df.drop(['target', 'fold'], axis=1)
        y = train_df.loc[:, 'target']
        clf.fit(X=X, y=y, **hyperparams[model_name]['fit'])
        print('Found best hyperparams:')
        print(clf.best_params_)
        print('With AUC score:')
        print(clf.best_score_)

        # Put grid search best params in hyperparams dict.
        # Floats are in numpy format, and trying to write them as-is to file
        # causes it to be filled with junk, so convert to normal float first
        # if necessary.
        for param, value in clf.best_params_.items():
            try:
                sanitised_value = value.item()  # Gets number from numpy class.
            except AttributeError as e:
                # Was plain number anyway.
                sanitised_value = value
            hyperparams[model_name]['constructor'][param] = sanitised_value

        # Save hyperparams.
        with open(config['hyperparams_file'], 'w') as f:
            yaml.dump(hyperparams, f, default_flow_style=False, indent=2)
        print('Wrote best params to ' + str(config['hyperparams_file']))

    if cv:
        # Cross-validate model to estimate accuracy.
        # Define model.
        print('Define model...')
        model = model_dict[model_name](**hyperparams[model_name]['constructor'])
        X = train_df.drop(['target', 'fold'], axis=1)
        y = train_df.loc[:, 'target']
        n_splits = 3
        fit_params = hyperparams[model_name]['fit']
        print("Estimating scores using cross-validation...")
        scores = cross_val_score(estimator=model, X=X, y=y, cv=n_splits,
                                 verbose=5, fit_params=fit_params,
                                 scoring=gini_scoring_fn, n_jobs=1)
        # Report error.
        print('Gini score mean (standard deviation): ' + str(np.mean(scores))
              + ' (' + str(np.sqrt(np.var(scores))) + ')')

    if submission:
        # Train and produce submission file.
        # Define model.
        print('Define model...')
        model = model_dict[model_name](**hyperparams[model_name]['constructor'])
        print('Fitting...')
        model.fit(X=train_df.drop(['target', 'fold'], axis=1),
                  y=train_df.loc[:, 'target'])
        # Create submission file with predictions.
        print("Predicting...")
        submit_file = (config['submit_prefix'] + '_' + model_name + '_'
                       + datetime_for_filename() + '.csv')
        (test_df
            .assign(target=model.predict_proba(test_df.drop('id', axis=1))[:, 1])
            .loc[:, ['id', 'target']]
            .to_csv(submit_file, float_format=float_format, index=False))
        print("Saved submit file to " + submit_file)
    elif folds is not None:
        # Train with folds, for stacking.
        # Check that folds are valid.
        bad_folds = [x for x in folds if x not in range(-1, config['n_folds'])]
        if len(bad_folds) > 0:
            raise ValueError("These specified folds do not exist: " + str(bad_folds))
        # Define model.
        print('Define model...')
        model = model_dict[model_name](**hyperparams[model_name]['constructor'])
        model_col_name = 'model_' + model_name
        for fold in folds:
            print("Fitting for fold " + str(fold) + "...")
            if fold != -1:
                # Fit for a specific fold.
                print('Fitting...')
                train_columns = list(set(train_df.columns)
                                     - set(['fold', 'target'])
                                     - set([x for x in train_df.columns
                                            if x.startswith('model_')]))
                model.fit(X=train_df.loc[train_df['fold'] != fold, train_columns],
                          y=train_df.loc[train_df['fold'] != fold, 'target'],
                          **(hyperparams[model_name]['fit']))
                # Add predictions for fold.
                print("Predicting...")
                train_df.loc[train_df['fold'] == fold, model_col_name] = \
                    model.predict_proba(
                        train_df.loc[train_df['fold'] == fold, train_columns])[:, 1]
                train_df.to_pickle(config['train'])
                print('Added predictions for model ' + model_name + ', fold '
                      + str(fold) + ' to column ' + model_col_name
                      + ' of ' + config['train'])
            else:
                # Ignore folds and fit all data.
                print('Fitting...')
                columns_to_drop = (['target', 'fold']
                                   + [x for x in train_df.columns
                                      if x.startswith('model_')])
                model.fit(X=train_df.drop(columns_to_drop, axis=1),
                          y=train_df.loc[:, 'target'])
                # Add predictions for whole test set to test CSV.
                print("Predicting...")
                test_file = config['test']
                test_columns_to_drop = (['id']
                                        + [x for x in test_df.columns
                                           if x.startswith('model_')])
                (test_df
                    .assign(**{model_col_name: model.predict_proba(
                        test_df.drop(test_columns_to_drop, axis=1))[:, 1]})
                    .to_pickle(test_file))
                print('Added predictions for model ' + model_name + ' to column '
                      + model_col_name + ' of ' + test_file)
def test_sensitive_to_partials():
    assert (delayed(partial(add, 10), pure=True)(2)._key !=
            delayed(partial(add, 20), pure=True)(2)._key)
import typing as t
from datetime import datetime
from operator import itemgetter

import snug
from toolz import flip, partial

from . import types

registry = snug.load.PrimitiveRegistry({
    datetime: partial(flip(datetime.strptime), '%Y-%m-%dT%H:%M:%SZ'),
    **{
        c: c for c in [
            int,
            float,
            bool,
            str,
            types.Issue.State
        ]
    }
}) | snug.load.GenericRegistry({
    t.List: snug.load.list_loader
}) | snug.load.get_optional_loader | snug.load.AutoDataclassRegistry()
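An illustrative sketch of why `flip` is combined with `partial` here: `datetime.strptime` takes (date_string, format), but the registry calls the loader with the raw value only, so the format string has to be bound as the first positional argument of the flipped function.

from datetime import datetime
from toolz import flip, partial

parse_ts = partial(flip(datetime.strptime), '%Y-%m-%dT%H:%M:%SZ')
parse_ts('2019-03-14T15:09:26Z')   # datetime(2019, 3, 14, 15, 9, 26)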
# -*- coding: utf-8 -*-
"""
parse_helpers.py

This module contains helper functions used in parsing scraper data.
"""
from toolz import partial
import requests
from bs4 import BeautifulSoup
import re

is_e_type = lambda element_type, element: element.name == element_type
is_bold = partial(is_e_type, 'b')
is_anchor = partial(is_e_type, 'a')
is_strong = partial(is_e_type, 'strong')
b_or_strong = lambda e: is_strong(e) or is_bold(e)

a_cleanse = lambda txt: txt.replace(u'Â', u'').replace('\r\n', '\n')

get_content = lambda url: requests.get(url).content
get_soup = lambda url: BeautifulSoup(get_content(url))

starts_with_end_tag = re.compile(r'^(\s*<\s*/.*?>)+')


# TODO: refactor and document
def split(soup, splitter):
    """
    TODO: write this docstring
    """
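A quick usage sketch with assumed markup: `is_e_type` is specialised by tag name via `partial`, so each predicate only needs the element to test.

soup = BeautifulSoup("<p><b>bold</b> and <a href='#'>link</a></p>", "html.parser")
is_bold(soup.b)      # True
is_anchor(soup.b)    # False
is_anchor(soup.a)    # True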
from __future__ import print_function, division, absolute_import

from pyspark import SparkContext, SparkConf
conf = SparkConf().setAppName("dumbeddown")
sc = SparkContext(conf=conf)

from uuid import uuid4
from sabaody import Island, run_island, problem_constructor, getQualifiedName
from toolz import partial

run_id = str(uuid4())

num_islands = 4
island_ids = [str(uuid4()) for x in range(num_islands)]
islands = [Island(u, problem_constructor,
                  partial(getQualifiedName, 'B2', str(run_id)),
                  'luna', 11211)
           for u in island_ids]

print(sc.parallelize(islands).map(run_island).collect())
def overlap_internal(x, axes):
    """ Share boundaries between neighboring blocks

    Parameters
    ----------

    x: da.Array
        A dask array
    axes: dict
        The size of the shared boundary per axis

    The axes input informs how many cells to overlap between neighboring
    blocks {0: 2, 2: 5} means share two cells in 0 axis, 5 cells in 2 axis
    """
    dims = list(map(len, x.chunks))
    expand_key2 = partial(expand_key, dims=dims, axes=axes)

    # Make keys for each of the surrounding sub-arrays
    interior_keys = pipe(
        x.__dask_keys__(), flatten, map(expand_key2), map(flatten), concat, list
    )

    name = "overlap-" + tokenize(x, axes)
    getitem_name = "getitem-" + tokenize(x, axes)
    interior_slices = {}
    overlap_blocks = {}
    for k in interior_keys:
        frac_slice = fractional_slice((x.name,) + k, axes)
        if (x.name,) + k != frac_slice:
            interior_slices[(getitem_name,) + k] = frac_slice
        else:
            interior_slices[(getitem_name,) + k] = (x.name,) + k
        overlap_blocks[(name,) + k] = (
            concatenate3,
            (concrete, expand_key2((None,) + k, name=getitem_name)),
        )

    chunks = []
    for i, bds in enumerate(x.chunks):
        depth = axes.get(i, 0)
        if isinstance(depth, tuple):
            left_depth = depth[0]
            right_depth = depth[1]
        else:
            left_depth = depth
            right_depth = depth

        if len(bds) == 1:
            chunks.append(bds)
        else:
            left = [bds[0] + right_depth]
            right = [bds[-1] + left_depth]
            mid = []
            for bd in bds[1:-1]:
                mid.append(bd + left_depth + right_depth)
            chunks.append(left + mid + right)

    dsk = merge(interior_slices, overlap_blocks)
    graph = HighLevelGraph.from_collections(name, dsk, dependencies=[x])

    return Array(graph, name, chunks, meta=x)
    'result/olt_info.txt')

authenticate('61.155.48.36:7474', neo4j_username, neo4j_password)
graph = Graph("http://61.155.48.36:7474/db/data")


def clear_log():
    for f in [log_file, result_file]:
        if os.path.exists(f):
            os.remove(f)
        os.mknod(f)


######################card check################################
zte_card_check = partial(Zte.card_check,
                         username=zte_olt_username,
                         password=zte_olt_password)
hw_card_check = partial(Huawei.card_check,
                        username=hw_olt_username,
                        password=hw_olt_password)


def get_card(olt):
    functions = dict(zte=zte_card_check, hw=hw_card_check)
    no_company = lambda x: ['fail', None]
    ip, company = olt[:2]
    return functions.get(company, no_company)(ip) + [','.join(olt)]


def card_entry(info):
    create_card_node = lambda x: graph.create(
def get_data(filters):
    compute_data = partial(_compute_days, today())
    return compose(list, partial(map, compute_data))(_get_inpatient_records(filters))
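The same compose/partial pattern on plain data, as an illustrative sketch: `partial(map, f)` is a one-argument function that maps `f` over its input, and `compose(list, ...)` materialises the resulting iterator.

from toolz import compose, partial

double_all = compose(list, partial(map, lambda n: n * 2))
double_all([1, 2, 3])   # [2, 4, 6]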