def train():
    x_lines = [*toolz.take(LIMIT, open('data/x.txt').read().lower().split('\n'))]
    y_lines = [*toolz.take(LIMIT, open('data/y.txt').read().lower().split('\n'))]
    encoder = encoder_for_lines(S2S_PARAMS, x_lines + y_lines)
    try:
        start_idx = encoder.word_vocab[S2S_PARAMS.start_token]
        pad_idx = encoder.word_vocab[PAD_TOKEN]
    except AttributeError:
        start_idx = int(encoder.vocabulary_[S2S_PARAMS.start_token])
        pad_idx = encoder.vocabulary_[PAD_TOKEN]
    reverse_enc = {idx: word for word, idx in encoder.vocabulary_.items()}
    model = build_model(S2S_PARAMS, start_idx, pad_idx)
    x = encode_data(encoder, x_lines, is_input=True)
    y = encode_data(encoder, y_lines, is_input=False)
    print(x.shape, y.shape)
    x = x[:S2S_PARAMS.batch_size * int(len(x) / S2S_PARAMS.batch_size)]
    y = y[:S2S_PARAMS.batch_size * int(len(y) / S2S_PARAMS.batch_size)]
    test_x = x[:S2S_PARAMS.batch_size]
    losses = []
    if USE_COMET:
        experiment = Experiment(api_key="DQqhNiimkjP0gK6c8iGz9orzL", log_code=True)
        experiment.log_multiple_params(S2S_PARAMS._asdict())
        for idx in range(1000):
            print("Shuffling data...")
            random_idx = random.sample([*range(len(x))], len(x))
            x = x[random_idx]
            y = y[random_idx]
            print("Training in epoch " + str(idx))
            losses.append(model.train_epoch(x, y, experiment=experiment))
            experiment.log_epoch_end(idx)
            print('Loss history: {}'.format(', '.join(
                ['{:.4f}'.format(loss) for loss in losses])))
            test_y = model.predict(test_x)
            for i in range(min([3, S2S_PARAMS.batch_size])):
                print('> ' + ' '.join(
                    reverse_enc.get(idx, '<unk/>') for idx in list(test_y[i])))
    else:
        for idx in range(1000):
            print("Training in epoch " + str(idx))
            model.train_epoch(x, y)
def sample_ssh(data, lines=500):
    """ Grab a few lines from the remote file """
    with tmpfile() as fn:
        with open(fn, 'w') as f:
            for line in take(lines, data.lines()):
                f.write(line)
        yield fn
def create_merge_tree(func, keys, token):
    """Create a task tree that merges all the keys with a reduction function.

    Parameters
    ----------
    func: callable
        Reduction function that accepts a single list of values to reduce.
    keys: iterable
        Keys to reduce from the source dask graph.
    token: object
        Included in each key of the returned dict.

    This creates a k-ary tree where k depends on the current level and is
    greater the further away a node is from the root node.  This reduces the
    total number of nodes (thereby reducing scheduler overhead), but still
    has beneficial properties of trees.

    For reasonable numbers of keys, N < 1e5, the total number of nodes in the
    tree is roughly ``N**0.78``.  For 1e5 < N < 2e5, it is roughly ``N**0.8``.
    """
    level = 0
    prev_width = len(keys)
    prev_keys = iter(keys)
    rv = {}
    while prev_width > 1:
        width = tree_width(prev_width)
        groups = tree_groups(prev_width, width)
        keys = [(token, level, i) for i in range(width)]
        rv.update((key, (func, list(take(num, prev_keys))))
                  for num, key in zip(groups, keys))
        prev_width = width
        prev_keys = iter(keys)
        level += 1
    return rv
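# Illustrative sketch (added for clarity, not from the source): building a
# small merge tree over eight hypothetical partition keys, with ``sum`` as the
# reduction function.  The exact fan-in per level depends on ``tree_width`` and
# ``tree_groups``, which are defined elsewhere in the module.
#
#     dsk = create_merge_tree(sum, [("part", i) for i in range(8)], token="merge")
#
# Each entry of ``dsk`` maps a key of the form ("merge", level, i) to a task
# tuple ``(sum, [child keys])``; the single key at the final level is the root
# of the reduction.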
def run(n, x, *goals, results_filter=None):
    """Run a logic program and obtain n solutions that satisfy the given goals.

    >>> from kanren import run, var, eq
    >>> x = var()
    >>> run(1, x, eq(x, 1))
    (1,)

    Parameters
    ----------
    n: int
        The number of desired solutions.  `n=0` returns a tuple with all
        results and `n=None` returns a lazy sequence of all results.
    x: object
        The form to reify and output.  Usually contains logic variables used
        in the given goals.
    goals: Callables
        A sequence of goals that must be true in logical conjunction
        (i.e. `lall`).
    results_filter: Callable
        A function to apply to the results stream (e.g. a `unique` filter).
    """
    results = map(partial(reify, x), lall(*goals)({}))

    if results_filter is not None:
        results = results_filter(results)

    if n is None:
        return results
    elif n == 0:
        return tuple(results)
    else:
        return tuple(take(n, results))
def test_map(client):
    with client.get_executor() as e:
        N = 10
        it = e.map(inc, range(N))
        expected = set(range(1, N + 1))
        for x in it:
            expected.remove(x)
        assert not expected

    with client.get_executor(pure=False) as e:
        N = 10
        it = e.map(slowinc, range(N), [0.1] * N, timeout=0.4)
        results = []
        with pytest.raises(TimeoutError):
            for x in it:
                results.append(x)
        assert 2 <= len(results) < 7

    with client.get_executor(pure=False) as e:
        N = 10
        # Not consuming the iterator will cancel remaining tasks
        it = e.map(slowinc, range(N), [0.1] * N)
        for x in take(2, it):
            pass
        # Some tasks still processing
        assert number_of_processing_tasks(client) > 0
        # Garbage collect the iterator => remaining tasks are cancelled
        del it
        assert number_of_processing_tasks(client) == 0
def main(): print("mining started") collective_match_data = [] try: top_players = get_live_players() for player_id in top_players: try: matches = api.get_match_history(player_id) except: continue match_ids = parse_match_ids_from_player_data(matches) for match_id in list(take(3, match_ids)): match_details = api.get_match_details(match_id) collective_match_data.append(match_details) minified = minify_data(collective_match_data) insert_many(minified) except Exception as e: print(e) print("miner added " + str(len(collective_match_data)) + " new matches")
def convert_bars(r: Tuple, cutoff: int = 10) -> Tuple:
    """Reduce number of CHARs in a BAR expression to the cutoff number."""
    ty = r[0]
    if ty == BAR:
        tail_converted = tuple(map(convert_bars, r[1:]))
        grouped = toolz.groupby(
            lambda x: classify_char(x[1]) if x[0] == PToken.CHAR else CharSet.OTHER,
            tail_converted)
        if CharSet.WORD in grouped and cutoff < len(grouped[CharSet.WORD]):
            logger.info('reducing WORD choices')
            grouped[CharSet.WORD] = toolz.take(cutoff, grouped[CharSet.WORD])
        return (PToken.BAR,) + tuple(toolz.concat(grouped.values()))
    elif ty == PToken.BACKREF:
        raise NotImplementedError
    elif ty == PToken.CHAR:
        return r
    elif ty == PToken.GROUP:
        return tuple(convert_bars(elt) if ii > 1 else elt
                     for ii, elt in enumerate(r))
    else:
        return tuple(convert_bars(elt) if ii > 0 else elt
                     for ii, elt in enumerate(r))
def read_events_in_batch(config, batch_id, batch):
    first_rec = batch.iloc[0]
    asof_dt = first_rec.asof_dt
    bucket = config['VIDEO_END_BUCKET']
    print(f'>> start event download,batch_id={batch_id}')

    limit_events_per_batch = config.get("LIMIT_EVENTS_PER_BATCH")
    # Only warn when a limit is actually configured.
    if limit_events_per_batch is not None:
        print(
            f">> WARNING: Limiting events to no more than "
            f"{limit_events_per_batch} events per batch"
        )

    s3 = get_client()

    def download_events(name):
        print(f">>downloading {name}")
        filename = Path(name).name
        retr = s3.get_object(Bucket=bucket, Key=str(name))
        reader = retr['Body'].iter_lines()
        reader = map(safe_json_loads, reader)
        reader = (merge(x, {'file_idx': file_idx,
                            'file': filename,
                            'asof_dt': asof_dt})
                  for file_idx, x in enumerate(reader))
        return reader

    reader = map(download_events, batch.name)
    reader = concat(reader)
    if limit_events_per_batch is not None:
        reader = take(limit_events_per_batch, reader)
    return batch_id, reader
def __init__(self, path, mode='rt', schema=None, columns=None, types=None,
             typehints=None, dialect=None, header=None, open=open,
             nrows_discovery=50, chunksize=1024,
             encoding=DEFAULT_ENCODING, **kwargs):
    if 'r' in mode and not os.path.isfile(path):
        raise ValueError('CSV file "%s" does not exist' % path)
    if schema is None and 'w' in mode:
        raise ValueError('Please specify schema for writable CSV file')
    self.path = path
    self.mode = mode
    self.open = {'gz': gzip.open, 'bz2': bz2.BZ2File}.get(ext(path), open)
    self._abspath = os.path.abspath(path)
    self.chunksize = chunksize
    self.encoding = encoding

    sample = get_sample(self)
    self.dialect = dialect = discover_dialect(sample, dialect, **kwargs)

    if header is None:
        header = has_header(sample, encoding=encoding)
    elif isinstance(header, int):
        header = True
    self.header = header

    if not schema and 'w' not in mode:
        schema = discover_csv(path, encoding=encoding, dialect=dialect,
                              header=self.header, typehints=typehints,
                              types=types, columns=columns,
                              nrows_discovery=nrows_discovery)
    self._schema = schema
    self.header = header

    if 'w' not in mode:
        try:
            nd.array(list(take(10, self._iter(chunksize=10))),
                     dtype=str(schema))
        except (ValueError, TypeError) as e:
            raise ValueError("Automatic datashape discovery failed\n"
                             "Discovered the following datashape: %s\n"
                             "But DyND generated the following error: %s\n"
                             "Consider providing type hints using "
                             "typehints={'column-name': 'type'}\n"
                             "like typehints={'start-time': 'string'}"
                             % (schema, e.args[0]))
def test_local_client(loop):
    def produce(n):
        with local_client() as c:
            x = c.channel('x')
            for i in range(n):
                future = c.submit(slowinc, i, delay=0.01, key='f-%d' % i)
                x.append(future)
            x.flush()

    def consume():
        with local_client() as c:
            x = c.channel('x')
            y = c.channel('y')
            last = 0
            for i, future in enumerate(x):
                last = c.submit(add, future, last, key='add-' + future.key)
                y.append(last)

    with cluster() as (s, [a, b]):
        with Client(('127.0.0.1', s['port']), loop=loop) as c:
            x = c.channel('x')
            y = c.channel('y')

            producers = (c.submit(produce, 5), c.submit(produce, 10))
            consumer = c.submit(consume)

            results = []
            for i, future in enumerate(take(15, y)):
                result = future.result()
                results.append(result)

            assert len(results) == 15
            assert all(0 < r < 100 for r in results)
def main(config_f=None, overrides=None):
    config = get_config(config_f=config_f, overrides=overrides)
    print(f">> start extract events, config={config}")
    batch_limit = config.get('BATCH_LIMIT')
    partitions_d = Path(config['PARTITIONS_D'])
    write_partition = get_partition_writer(partitions_d=partitions_d)

    batches = read_event_batches(config)
    if batch_limit:
        print(f">> WARNING: limiting run to no more than {batch_limit} batches")
        batches = take(batch_limit, batches)

    reader = (read_events_in_batch(config=config, batch_id=batch_id, batch=batch)
              for batch_id, batch in batches)
    reader = (parse_events_in_batch(batch_id=batch_id, reader=rdr)
              for batch_id, rdr in reader)
    reader = (write_partition(batch_id, rdr) for batch_id, rdr in reader)

    for x in reader:
        pass
    print(">> end proc events")
def forcastall(intid):
    data = map(int, read_artist(intid)["action_1"])
    sun = training(data, 4)
    fun = toolz.compose(str, int)
    predictdata = map(fun, toolz.take(60, sun))  # forecast 60 days
    with open("./past_forcast/{aid}.csv".format(aid=intid), "wt") as f:
        f.write(",".join(predictdata))
def song_info(artist, title):
    # Compare with == rather than "is": identity checks against string
    # literals are unreliable.
    if title == u'':
        print("Searching for '%s'" % artist)
        result = song.search(combined=artist)
    else:
        print("Searching for '%s - %s'" % (artist, title))
        result = song.search(artist=artist, title=title)
    print_search_results(take(3, result))
async def rotate_tokens(self):
    # try each token, query its rate limit
    # if none of them works, log and sleep
    for token in toolz.take(self._n_tokens, self._tokens):
        remaining = await self.rate_limit(token)
        if remaining > self._rotate_at:
            return self._set_token(token)
def _is_from_ncbi(gff3_file):
    with open(gff3_file) as in_handle:
        for line in tz.take(10000, in_handle):
            if "Dbxref" in line:
                return "Dbxref"
            if "db_xref" in line:
                return "db_xref"
    return None
def create_categories(width, plus_one):
    length = int(width / 8) + plus_one
    return [
        ''.join(cs)
        for cs in take(
            2 ** width + plus_one,
            product([chr(c) for c in range(256)], repeat=length),
        )
    ]
def discover_bson(b, n=10, **kwargs):
    with bson_lines(b.path) as lines:
        data = list(take(n, lines))

    if len(data) < n:
        ds = discover(data)
    else:
        ds = var * discover(data).subshape[0]
    return ds
def _evaluate_split(split, section_aug, model, device, running_metrics_overall,
                    config, debug=False):
    logger = logging.getLogger(__name__)

    TestSectionLoader = get_test_loader(config)
    test_set = TestSectionLoader(
        data_dir=config.DATASET.ROOT,
        split=split,
        is_transform=True,
        augmentations=section_aug,
    )

    n_classes = test_set.n_classes

    test_loader = data.DataLoader(test_set, batch_size=1,
                                  num_workers=config.WORKERS, shuffle=False)

    if debug:
        logger.info("Running in Debug/Test mode")
        test_loader = take(1, test_loader)

    running_metrics_split = runningScore(n_classes)

    # testing mode:
    with torch.no_grad():  # operations inside don't track history
        model.eval()
        total_iteration = 0
        for i, (images, labels) in enumerate(test_loader):
            logger.info(f"split: {split}, section: {i}")
            total_iteration = total_iteration + 1
            outputs = model(images.to(device))
            pred = outputs.detach().max(1)[1].cpu().numpy()
            gt = labels.numpy()
            running_metrics_split.update(gt, pred)
            running_metrics_overall.update(gt, pred)

    # get scores
    score, class_iou = running_metrics_split.get_scores()

    # Log split results
    logger.info(f'Pixel Acc: {score["Pixel Acc: "]:.3f}')
    for cdx, class_name in enumerate(_CLASS_NAMES):
        logger.info(f' {class_name}_accuracy {score["Class Accuracy: "][cdx]:.3f}')

    logger.info(f'Mean Class Acc: {score["Mean Class Acc: "]:.3f}')
    logger.info(f'Freq Weighted IoU: {score["Freq Weighted IoU: "]:.3f}')
    logger.info(f'Mean IoU: {score["Mean IoU: "]:0.3f}')
    running_metrics_split.reset()
def speed_test_buffer(buffersize):
    t1 = time.time()
    n_iter = 10_000_000
    gen = read_binary_bus(B1.bus_file, decode_seq=False, buffersize=buffersize)
    gen = toolz.take(n_iter, gen)
    for a in gen:
        pass
    t2 = time.time()
    return t2 - t1
def fit(cost_f, cost_df, h_theta0, data, eta=0.1, it_max=500, gf='gd'):
    '''
    Compute values of multiple linear regression coefficients

    Parameters
        cost_f: Cost function (J)
        cost_df: gradient of cost function (gradJ for batch and gradJS for stochastic)
        h_theta0: initial guess for fitting parameters (j cols)
        data: list of tuples [(Xi, yi)]
            X: matrix of independent variables (i rows of observations and
               j cols of variables). x0=1 for all i
            y: dependent variable (i rows)
        eta: learning rate
        it_max: maximum number of iterations

    Returns
        Fitting parameters (j cols)
    '''
    X, y = zip(*data)
    if gf == 'gd':
        f = partial(cost_f, X, y)
        df = partial(cost_df, X, y)
        ans = list(take(it_max, ((h_theta, f(h_theta)) for h_theta in
                                 fgd.gradient_descent(df, h_theta0, eta=eta))))
        value = list(T(ans)[0])
        cost = list(T(ans)[1])
        #t = list(until_within_tol(cost, 1e-7))
        return value[-1], cost
    elif gf == 'sgd':
        df = cost_df
        cost = [sum(cost_f(xi, yi, h_theta0) for xi, yi in data)]
        h_theta = h_theta0
        eta_new = eta
        for _ in xrange(it_max):
            ans = list(take(len(y), (e for e in
                                     fgd.sgd(df, X, y, h_theta, eta=eta_new))))
            h_theta = ans[-1]
            cost.append(sum(cost_f(xi, yi, h_theta) for xi, yi in data))
            eta_new = 0.99 * eta_new
        return h_theta, cost
    else:
        print('Not a valid function')
        return
def collect(grouper, npartitions, group, pbags):
    """ Collect partitions from disk and yield k,v group pairs """
    from pbag import PBag
    pbags = list(take(npartitions, pbags))
    result = defaultdict(list)
    for pb in pbags:
        part = pb.get_partition(group)
        groups = groupby(grouper, part)
        for k, v in groups.items():
            result[k].extend(v)
    return list(result.items())
def discover(coll, n=50):
    items = list(take(n, coll.find()))
    for item in items:
        del item['_id']

    ds = discover(items)

    if isdimension(ds[0]):
        return coll.count() * ds.subshape[0]
    else:
        raise ValueError("Consistent datashape not found")
def plot_lrates(f, df, x0, etas, niter):
    fig, ax = plt.subplots(nrows=1, ncols=1)
    for eta in etas:
        ax.plot(list(xrange(1, niter + 1)),
                list(take(niter, (f(e) for e in gradient_descent(df, x0, eta=eta)))),
                label=unicode(eta))
    ax.set_xlabel('Iteration Number')
    ax.set_ylabel('f(x)')
    plt.legend(title='Learning Rate')
    plt.show()
    plt.clf()
def get_selected_indices(self):
    indices = range(self.len_inp * self.out_inp_factor)
    num_extra_elems = self.out_inp_factor * self.len_inp - self.len_out
    selected_groups = set(
        np.random.choice(self.len_inp, num_extra_elems, replace=False))
    selected_indices = list(
        concat(
            take(self.out_inp_factor - 1, group) if i in selected_groups else group
            for i, group in enumerate(partition(self.out_inp_factor, indices))))
    return selected_indices
def discover_pymongo_collection(coll, n=50):
    items = list(take(n, coll.find()))
    oid_cols = [k for k, v in items[0].items() if isinstance(v, ObjectId)]
    for item in items:
        for col in oid_cols:
            del item[col]

    ds = discover(items)

    if isdimension(ds[0]):
        return coll.count() * ds.subshape[0]
    else:
        raise ValueError("Consistent datashape not found")
def test_cast_string_to_date(alltypes, df, type):
    import toolz

    string_col = alltypes.date_string_col
    month, day, year = toolz.take(3, string_col.split('/'))

    expr = '20' + ibis.literal('-').join([year, month, day])
    expr = expr.cast(type)

    result = (expr.execute()
              .astype('datetime64[ns]')
              .sort_values()
              .reset_index(drop=True)
              .rename('date_string_col'))
    expected = (pd.to_datetime(df.date_string_col)
                .dt.normalize()
                .sort_values()
                .reset_index(drop=True))
    tm.assert_series_equal(result, expected)
def transform(self, X, y):
    positives = []
    negatives = []
    items = groupby(lambda i: y[i], range(len(y))).items()
    for label, group_ix in items:
        possible_positives = len(group_ix) ** 2
        if self.num_positive >= possible_positives:
            raise ValueError('Not enough combinations for positive examples')

        group_ix = np.random.permutation(group_ix)
        positives.extend(
            take(self.num_positive,
                 itertools.product(group_ix, np.random.permutation(group_ix))))

        other_candidates_ix = np.concatenate([
            group_ix for label_neg, group_ix in items if label_neg != label
        ])
        possible_negatives = len(other_candidates_ix) * len(group_ix)
        if self.num_negative >= possible_negatives:
            raise ValueError('Not enough combinations for negative examples')

        negatives.extend(
            take(self.num_negative,
                 itertools.product(group_ix,
                                   np.random.permutation(other_candidates_ix))))

    all_pairs = positives + negatives
    all_output = [1] * len(positives) + [0] * len(negatives)
    return X.take(all_pairs, axis=0), all_output
def _limit_inlines(max_inlines, images_iter):
    if max_inlines is not None:
        images_list = list(images_iter)
        if max_inlines > len(images_list):
            warn_msg = (f"The number of max inlines {max_inlines} is greater "
                        f"than the number of inlines found {len(images_list)}. "
                        f"Setting max inlines to {len(images_list)}")
            warnings.warn(warn_msg)
            max_inlines = len(images_list)
            images_iter = images_list
        else:
            # random.shuffle shuffles in place and returns None, so shuffle
            # first and then take from the shuffled list.
            random.shuffle(images_list)
            images_iter = take(max_inlines, images_list)
    return images_iter, max_inlines
def get_dirs_and_files_in_path(path):
    # filter function
    def isdir(a):
        return os.path.isdir(a)

    # gives the opposite result of the above
    not_isdir = toolz.complement(isdir)

    if not path and platform.system() == 'Windows':
        import win32api
        drives = win32api.GetLogicalDriveStrings()
        drives = [d for d in drives.split('\000') if d]
        return drives
    elif os.path.exists(path):
        r = os.listdir(path)
        # accessed twice, so materialize the list instead of keeping a generator
        f = [os.path.join(path, a) for a in r]
        dirs = filter(isdir, f)
        files = filter(not_isdir, f)
    else:
        try:
            head, tail = os.path.split(path)
            r = os.listdir(head)
            filtered_everything = filter(lambda a: a.startswith(tail), r)
            # accessed twice, so materialize the list instead of keeping a generator
            filtered_everything = [os.path.join(head, a) for a in filtered_everything]
            dirs = filter(isdir, filtered_everything)
            files = filter(not_isdir, filtered_everything)
        except Exception as e:
            print('{0} doesn\'t even exist'.format(head))
            return None

    result = (sorted(list(toolz.take(100, dirs))),
              sorted(list(toolz.take(100, files))))
    return result
def _solve(print=print):
    print("This isn't really done. Groups of 4 work ok, but 5s take forever.")
    print('Coming up with a lazy version of combinations would be better')
    with mp.Pool(4) as pool:
        cs = combinations(take(200, primes()), 4)
        for remarkable, combo in pool.imap_unordered(is_remarkable_2, cs,
                                                     chunksize=125):
            if remarkable:
                print(combo)
    return False
def test_cast_string_to_date(alltypes, df, type):
    import toolz

    string_col = alltypes.date_string_col
    month, day, year = toolz.take(3, string_col.split("/"))

    expr = ibis.literal("-").join([year, month, day])
    expr = expr.cast(type)

    result = (expr.execute()
              .iloc[:, 0]
              .astype("datetime64[ns]")
              .sort_values()
              .reset_index(drop=True)
              .rename("date_string_col"))
    expected = (pd.to_datetime(df.date_string_col)
                .dt.normalize()
                .sort_values()
                .reset_index(drop=True))
    tm.assert_series_equal(result, expected)
def _get_callable_regions(data):
    """Retrieve regions to parallelize by from callable regions, variant regions
    or chromosomes
    """
    callable_files = data.get("callable_regions") or data.get("variant_regions")
    if callable_files:
        assert len(callable_files) == 1
        regions = [(r.chrom, int(r.start), int(r.stop))
                   for r in pybedtools.BedTool(callable_files[0])]
    else:
        work_bam = list(tz.take(1, filter(lambda x: x.endswith(".bam"),
                                          data["work_bams"])))
        if work_bam:
            with contextlib.closing(pysam.Samfile(work_bam[0], "rb")) as pysam_bam:
                regions = [(chrom, 0, length) for (chrom, length)
                           in zip(pysam_bam.references, pysam_bam.lengths)]
        else:
            raise NotImplementedError("No variant regions or BAM files to "
                                      "calculate chromosomes")
    return regions
def make_frames(frames, width, scale):
    incrementer = itertools.count()
    stencil = RightHandedSimplexStencil(2, 30)
    rotate = np.array([1, -1])
    offset = width / 2 + rotate * width / 10
    points = list(take(frames, stencil.generate_stencil_points()))
    for point in points:
        point.point = rotate * point.point * width / 12 + offset

    def make_frame(t):
        i = next(incrementer)
        surface = gizeh.Surface(width=width, height=width, bg_color=(1, 1, 1))
        line = gizeh.polyline([[offset[0], 0], [offset[0], width]],
                              stroke=grid_color, stroke_width=2)
        line.draw(surface)
        line = gizeh.polyline([[0, offset[1]], [width, offset[1]]],
                              stroke=grid_color, stroke_width=2)
        line.draw(surface)

        x = offset[0] + width / scale
        y = offset[1] - width / scale
        while x <= width + 1:
            line = gizeh.polyline([[x, 0], [x, width]],
                                  stroke=grid_color, stroke_width=0.5)
            line.draw(surface)
            line = gizeh.polyline([[0, y], [width, y]],
                                  stroke=grid_color, stroke_width=0.5)
            line.draw(surface)
            x += width / scale
            y -= width / scale

        x = offset[0] - width / scale
        y = offset[1] + width / scale
        while x >= -1:
            line = gizeh.polyline([[x, 0], [x, width]],
                                  stroke=grid_color, stroke_width=0.5)
            line.draw(surface)
            line = gizeh.polyline([[0, y], [width, y]],
                                  stroke=grid_color, stroke_width=0.5)
            line.draw(surface)
            x -= width / scale
            y += width / scale

        circle = gizeh.circle(r=3.25, xy=offset, fill=halving_colors[0])
        circle.draw(surface)
        if i > 0:
            for i in range(i - 1):
                point = points[i]
                color = halving_colors[point.halvings]
                circle = gizeh.circle(r=max(0.5, 3.25 - 0.75 * point.halvings),
                                      xy=point.point, fill=color)
                circle.draw(surface)
        return surface.get_npimage()

    return make_frame
def _get_callable_regions(data):
    """Retrieve regions to parallelize by from callable regions, variant regions
    or chromosomes
    """
    import pybedtools
    callable_files = data.get("callable_regions") or data.get("variant_regions")
    if callable_files:
        assert len(callable_files) == 1
        regions = [(r.chrom, int(r.start), int(r.stop))
                   for r in pybedtools.BedTool(callable_files[0])]
    else:
        work_bam = list(tz.take(1, filter(lambda x: x.endswith(".bam"),
                                          data["work_bams"])))
        if work_bam:
            with pysam.Samfile(work_bam[0], "rb") as pysam_bam:
                regions = [(chrom, 0, length) for (chrom, length)
                           in zip(pysam_bam.references, pysam_bam.lengths)]
        else:
            regions = [(r.name, 0, r.size)
                       for r in ref.file_contigs(dd.get_ref_file(data),
                                                 data["config"])]
    return regions
def get_displacement(n_input, n_tilings):
    """
    Get the displacement vector to use in offsetting the tilings.

    Essentially, we look for numbers less than `n_tilings//2` that are coprime
    with `n_tilings`. If we can find at least `n_input` of them, we just take
    the first `n_input`. If there are fewer such viable numbers, we instead
    cycle through the candidates, ensuring we repeat as seldom as possible.

    ..note::
        It's recommended by the CMAC people to just increase the number of
        tilings when there aren't enough candidate values for the
        displacement vector.
    """
    viable = [i for i in range(1, n_tilings // 2) if gcd(i, n_tilings) == 1]
    ret = list(take(n_input, cycle(viable)))
    return np.array(ret)
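# Worked example (added for clarity, not from the source): with n_tilings=8,
# the candidates in range(1, 4) that are coprime with 8 are [1, 3]; for
# n_input=3 we cycle through them, so the displacement vector is [1, 3, 1].
#
#     >>> get_displacement(3, 8)
#     array([1, 3, 1])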
def bag_range(n, npartitions):
    """ Numbers from zero to n

    Examples
    --------

    >>> import dask.bag as db
    >>> b = db.range(5, npartitions=2)
    >>> list(b)
    [0, 1, 2, 3, 4]
    """
    size = n // npartitions
    name = 'range-%d-npartitions-%d' % (n, npartitions)
    ijs = list(enumerate(take(npartitions, range(0, n, size))))
    dsk = dict(((name, i), (reify, (range, j, min(j + size, n))))
               for i, j in ijs)

    if n % npartitions != 0:
        i, j = ijs[-1]
        dsk[(name, i)] = (reify, (range, j, n))

    return Bag(dsk, name, npartitions)
def roll(request):
    form = forms.LunchGroupForm
    if request.method == 'GET':
        ParticipantFormset = modelformset_factory(models.Participant)
        formset = ParticipantFormset(
            queryset=models.Participant.objects.filter(is_participating=True))
        context = {'form': form(), 'formset': formset}
        return render(request, 'lunch_roulette/base.html', context)

    if request.method == 'POST':
        form = form(request.POST)
        if not form.is_valid():
            # Build the context here and return the response so the error
            # message actually reaches the template.
            context = {'form': form, 'messages': ['Date is not valid']}
            return render(request, 'lunch_roulette/base.html', context)
        date = form.cleaned_data.get('date')
        participants = list(
            models.Participant.objects.filter(is_participating=True))
        random.shuffle(participants)
        while True:
            subgroup = list(toolz.take(4, participants))
            participants = participants[4:]
            logging.warning(subgroup)
            if not subgroup:
                break
            group = models.LunchGroup(date=date)
            group.save()
            group.participants.add(*subgroup)
            group.save()
        # import pdb; pdb.set_trace()
        return redirect(roll)
def clean(name, before=None, after=None, keep_last=None, environ=os.environ):
    """Clean up data that was created with ``ingest`` or
    ``$ python -m zipline ingest``

    Parameters
    ----------
    name : str
        The name of the bundle to remove data for.
    before : datetime, optional
        Remove data ingested before this date.
        This argument is mutually exclusive with: keep_last
    after : datetime, optional
        Remove data ingested after this date.
        This argument is mutually exclusive with: keep_last
    keep_last : int, optional
        Remove all but the last ``keep_last`` ingestions.
        This argument is mutually exclusive with: before, after
    environ : mapping, optional
        The environment variables. Defaults to os.environ.

    Returns
    -------
    cleaned : set[str]
        The names of the runs that were removed.

    Raises
    ------
    BadClean
        Raised when ``before`` and/or ``after`` are passed with ``keep_last``.
        This is a subclass of ``ValueError``.
    """
    try:
        all_runs = sorted(
            filter(
                complement(pth.hidden),
                os.listdir(pth.data_path([name], environ=environ)),
            ),
            key=from_bundle_ingest_dirname,
        )
    except OSError as e:
        if e.errno != errno.ENOENT:
            raise
        raise UnknownBundle(name)

    if (before is not None or after is not None) and keep_last is not None:
        raise BadClean(before, after, keep_last)

    if keep_last is None:
        def should_clean(name):
            dt = from_bundle_ingest_dirname(name)
            return (
                (before is not None and dt < before) or
                (after is not None and dt > after)
            )
    elif keep_last >= 0:
        last_n_dts = set(take(keep_last, reversed(all_runs)))

        def should_clean(name):
            return name not in last_n_dts
    else:
        raise BadClean(before, after, keep_last)

    cleaned = set()
    for run in all_runs:
        if should_clean(run):
            path = pth.data_path([name, run], environ=environ)
            shutil.rmtree(path)
            cleaned.add(path)

    return cleaned
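# Hypothetical usage sketch (the bundle name is illustrative, not from the
# source): remove every ingestion of a bundle except the two most recent ones.
#
#     removed = clean("my-bundle", keep_last=2)
#     print(sorted(removed))  # paths of the ingestion directories deleted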
from itertools import repeat
from functools import partial
import types

import pytest
import toolz as tlz

map_c = tlz.curry(tlz.map)
reduce_c = tlz.curry(tlz.reduce)

from smpl_tokenizer import utils

is_generator = lambda obj: isinstance(obj, types.GeneratorType)

var_len_strings = lambda n: list(
    tlz.take(n, tlz.iterate(lambda string: string + "a", "")))

"""
@pytest.mark.parametrize("test_input,expected", [
    ("3+5", 8),
    ("2+4", 6),
    ("6*9", 42),
])
def _eval(test_input, expected):
    assert _eval(test_input) == expected

@pytest.mark.parametrize("x", [0, 1])
@pytest.mark.parametrize("y", [2, 3])
def test_foo(x, y):
def estimate_max_mapq(in_bam, nreads=1e6):
    """Guess maximum MAPQ in a BAM file of reads with alignments
    """
    with pysam.Samfile(in_bam, "rb") as work_bam:
        # take() requires an integer count, so coerce the float default.
        reads = tz.take(int(nreads), work_bam)
        return max([x.mapq for x in reads if not x.is_unmapped])
def degrade_latin(para,
                  om_frac=0.1,
                  com_frac=0.1,
                  max_N_om=5,
                  max_N_com=5,
                  ):
    '''
    'latin' because this tokenizes using str.split

    takes a (str)paragraph
    returns a (str)paragraph' with possible degradations (errors):
        - omissions (deletions)
        - commissions (alterations)

    arguments:
        om_frac, com_frac: fraction of items to alter, where the basis is the
            number of TOKENS
        max_N_om, max_N_com: maximum whole number count of tokens to alter
    '''
    buf = para.split()
    ntoken_ = len(buf)
    # a convenience rendered version
    html_rep = copy.copy(buf)

    result = EasyDict(
        omission_index_list=[],
        commission_index_list=[],
    )

    OM_LIMIT = int(math.ceil(om_frac * ntoken_))
    COM_LIMIT = int(math.ceil(com_frac * ntoken_))

    # run omissions first
    # materialize the range so random.shuffle can operate in place
    ilist = list(range(ntoken_))
    random.shuffle(ilist)
    result.omission_index_list = list(z.take(min(OM_LIMIT, max_N_om), ilist))
    result.omission_index_list.sort()
    for i in reversed(result.omission_index_list):
        del buf[i]
        html_rep[i] = '<span class="deleted">%s</span>' % html_rep[i]

    # THIS HAS CHANGED!
    ntoken_ = len(buf)

    # create new index -> original index mapping
    imapping = dict((i, i) for i in range(ntoken_))
    for i_deleted in result.omission_index_list:
        for i_inc in range(i_deleted, ntoken_):
            imapping[i_inc] += 1

    # then run commissions
    ilist = list(range(ntoken_))
    random.shuffle(ilist)
    com_idx_list = z.take(min(COM_LIMIT, max_N_com), ilist)
    for i in reversed(sorted(com_idx_list)):
        token = buf[i]
        j_degrade = random.randint(0, len(token) - 1)
        while True:
            ch = random.choice(string.ascii_lowercase)
            if ch != token[j_degrade]:
                break
        buf[i] = token[:j_degrade] + ch + token[j_degrade + 1:]
        original_index = imapping[i]
        #result.commission_index_list.append(original_index)
        html_rep[original_index] = (
            token[:j_degrade] +
            '<span class="altered">%s</span>' % ch +
            token[j_degrade + 1:])

    result.text = ' '.join(buf)
    result.html_representation = ' '.join(html_rep)
    return result
def test_bz2_stream():
    text = '\n'.join(map(str, range(10000)))
    compressed = bz2.compress(text.encode())
    assert (list(take(100, bz2_stream(compressed))) ==
            list(map(lambda x: str(x) + '\n', range(100))))
from __future__ import print_function, division, unicode_literals

from toolz import take, compose, pluck
import matplotlib.pyplot as plt
from pylsy2 import pylsytable2

from utility import until_within_tol
from func_gradient_descent import gradient_descent
from out_utils import plot_lrates


def f(x_i):
    return sum(x_ij ** 2 for x_ij in x_i)


def df(x_i):
    return [2 * x_ij for x_ij in x_i]


x0 = [6., 33., 12.2]
tol = 1.e-6
al = [1., 0.3, 0.1, 0.03, 0.01, 0.003, 0.001]
niter = 100
plot_lrates(f, df, x0, al, niter)
result = list(take(50, ((f(e), e) for e in gradient_descent(df, x0))))
xs = ['x' + unicode(i) for i in xrange(len(x0))]
table = pylsytable2(['y'] + xs)
table.add_data('y', list(pluck(0, result)), '{:.2e}')
for i, x in enumerate(xs):
    table.add_data(x, list(pluck(i, pluck(1, result))), '{:.2e}')
print(table)
def initial(s):
    return list(take(n, s))
def _assert_initial_matches(a, b, n=10):
    assert list(take(n, a)) == list(take(n, b))
def f(_):
    sub = Sub('a')
    return list(toolz.take(5, sub))