def succeeded(self, event):
    # Use .get so an unknown request_id falls through the guard instead of raising
    command = self.started_cmds.get(event.request_id)
    if not command:
        return
    self.started_cmds.pop(event.request_id)

    duration = event.duration_micros
    if self.is_below_lwm(duration):
        return

    [cmd, q, meta] = take(3, command.items())
    self.render_cmd(cmd, duration, q)

    # Group captured stack entries by source file, skipping ignored paths
    ents = pipe(
        traceback.extract_stack(),
        self.config.stack_preprocess,
        map(lambda rec: StackEntry(self.config.file_capture, *rec)),
        filter(lambda ent: ent.file_capture()),
        filter(lambda ent: not any(
            re.match(p, ent.file, re.M) for p in self.config.ignores)),
        groupby(lambda ent: ent.file),
    )
    self.render_stack(ents)
def do_localizer_block(event_listener, target):
    start = time.time()
    target_is_face = target == BlockTarget.FACE
    stim_orientations = get_stim_1_orientations()
    source = res.faces() if target_is_face else res.houses()
    stim_list = pipe(random_elem(source), take(len(stim_orientations)), list)
    face_list, house_list = flip_if(not target_is_face, (
        lmap(lambda ori, stim: stim[ori], stim_orientations, stim_list),
        [None] * len(stim_list)))
    display_onsets, decisions, decision_onsets, RTs, ITIs = \
        do_trials(event_listener, face_list, house_list)
    return {
        "time": (start, time.time()),
        "target": target.name
    }, {
        "presentations_onset": display_onsets,
        "decision_onset": decision_onsets,
        "decision": [ori.name if ori else "None" for ori in decisions],
        "RT": RTs,
        "following_ITI": ITIs,
        "stim_orientation": [ori.name for ori in stim_orientations],
        "stim_id": [stim.name for stim in stim_list]
    }
def compare_streams(db_engine, date_range, stream_names,
                    allowed_parts_of_speech, max_num_words):
    """Compare tokens from each stream in the stream_names list"""
    ## Create token count dictionaries for each stream name
    count_dicts_dict = {}
    for stream_name in stream_names:
        count_dicts_dict[stream_name] = tz.pipe(
            get_content(db_engine, stream_name, date_range),
            parse_content_into_count(max_num_words, allowed_parts_of_speech))

    ## Create cross-stream count dictionary
    all_streams_count_dict = reduce(
        lambda x, y: tz.merge_with(sum, x, y),
        count_dicts_dict.values())

    ## Calculate posterior probabilities of the tokens
    posterior_probs = {}
    for stream_name in stream_names:
        posterior_probs[stream_name] = tz.pipe(
            get_posterior_probs_freq(
                500,  # limited to the 500 most frequent words in this stream, at this time
                all_streams_count_dict,
                count_dicts_dict[stream_name]),
            tz.map(lambda x: tz.merge({"stream": stream_name}, x)),
            tz.take(max_num_words),
            list,
        )
    return posterior_probs
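# A minimal, self-contained sketch of the cross-stream merge step above:
# folding per-stream token counts together with merge_with(sum). The stream
# names and counts here are made up for illustration.
from functools import reduce
import toolz as tz

count_dicts_dict = {
    "news":   {"rates": 3, "election": 5},
    "sports": {"rates": 1, "playoffs": 7},
}

all_streams_count_dict = reduce(
    lambda x, y: tz.merge_with(sum, x, y),
    count_dicts_dict.values())

print(all_streams_count_dict)
# {'rates': 4, 'election': 5, 'playoffs': 7}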
def get_top_tokens(n, count_dict):
    """Return the top n most frequent tokens in the count_dict

    If n > len(count_dict), it will just return them all"""
    return tz.pipe(
        count_dict,
        lambda x: x.items(),
        lambda x: sorted(x, key=lambda y: -y[1]),
        lambda x: tz.take(n, x),
        list)
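# A quick usage sketch for get_top_tokens above (assumes the function and
# `import toolz as tz` are in scope); the counts are invented.
counts = {"the": 12, "cat": 3, "sat": 5}
print(get_top_tokens(2, counts))    # [('the', 12), ('sat', 5)]
print(get_top_tokens(10, counts))   # all three pairs, most frequent first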
def discover_jsonlines(j, n=10, encoding='utf-8', **kwargs):
    with json_lines(j.path, encoding=encoding) as lines:
        data = pipe(lines, filter(nonempty), map(json.loads), take(n), list)

    if len(data) < n:
        ds = discover(data)
    else:
        ds = var * discover(data).subshape[0]

    return date_to_datetime_dshape(ds)
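# A standalone sketch of the sampling idiom above (drop blank lines, parse
# JSON, keep only the first n records), using an in-memory stream in place of
# json_lines and bool in place of the nonempty helper.
import io
import json
from toolz.curried import pipe, filter, map, take

lines = io.StringIO('{"x": 1}\n\n{"x": 2}\n{"x": 3}\n')
data = pipe(lines,
            map(str.strip),
            filter(bool),        # drop empty lines
            map(json.loads),
            take(2),
            list)
print(data)                      # [{'x': 1}, {'x': 2}]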
def limit_layers(max_count, graphs):
    assert max_count > 0, "max_count needs to be > 0"
    graphs_iterator = iter(graphs)
    return tlz.concat([
        tlz.take(max_count - 1, graphs_iterator),
        # Merges all graphs remaining in the iterator, after the initial
        # max_count - 1 have been taken.
        (lambda: (yield merge_graphs(graphs_iterator)))()
    ])
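# A small sketch of the same "take the first max_count - 1 items, then fold
# the leftovers into one" trick, with a plain set union standing in for
# merge_graphs.
import toolz as tlz

def limit_sets(max_count, sets):
    assert max_count > 0, "max_count needs to be > 0"
    it = iter(sets)
    return tlz.concat([
        tlz.take(max_count - 1, it),
        # Generator, so the union of the leftovers is only built on demand.
        (lambda: (yield set().union(*it)))(),
    ])

print(list(limit_sets(2, [{1}, {2}, {3, 4}])))   # [{1}, {2, 3, 4}]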
def parse_links(txt):
    """
    Parses the contents of all Markdown links in txt.

    >>> parse_links("some text with [some](http://md.com/link)")
    [('some', 'http://md.com/link')]
    >>> parse_links("some text with [some](http://md.com/link_(bull_crap\\))")
    [('some', 'http://md.com/link_(bull_crap\\\)')]
    """
    return tlz.pipe(re.findall(MARKUP_REGEX, txt),
                    ctlz.map(ctlz.take(2)),
                    ctlz.map(tuple),
                    list)
def export_intervals(chanjo_db, include_header=True, bed_score=0):
    r"""Return BED-formatted interval lines from existing ``chanjo_db``.

    BED lines are ready to be printed or written to a file.

    Args:
        chanjo_db (session): ``sqlalchemy.orm.session`` object with a
            ``.query``-method
        include_header (bool, optional): whether to include BED header
        bed_score (int, optional): dummy score (0-1000) to insert at field 5
            to complete the BED format

    Yields:
        str: stringified and tab-delimited interval

    Examples:
        >>> from chanjo import export_intervals, Store
        ... # instantiate a new connection to a Chanjo database
        >>> db = Store('./coverage.sqlite3')
        >>> with open('intervals.sorted.bed', 'w') as stream:
        ...     # write intervals in BED-format with appropriate headers
        ...     for bed_line in export_intervals(db):
        ...         stream.write(bed_line + '\n')
    """
    if include_header:
        yield '#chrom\tchromStart\tchromEnd\tname\tscore\tstrand'

    # set up which columns to fetch to make BED file
    # column 5 is just a silly default for the "score" field in BED
    i = Interval  # alias
    columns = (i.contig, i.start - 1, i.end, i.id, i.strand)

    # BED files are tab-delimited
    delimiter = '\t'

    # 1. fetch interval tuples from the database (producer)
    # 2. stringify each item in each subsequence (interval tuple)
    # 3. join lines on tab-character
    # 4. prepend the header
    bed_lines = pipe(
        fetch_records(chanjo_db, columns),
        map(map(str)),                        # convert fields to strings
        map(juxt(compose(list, take(4)),      # keep first 4 fields
                 lambda _: [str(bed_score)],  # insert BED score
                 compose(list, last))),       # keep last field
        map(concat),                          # flatten each item
        map(delimiter.join)                   # join on \t
    )

    for bed_line in bed_lines:
        yield bed_line
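# A self-contained sketch of the field-rewriting pipeline above: keep the
# first four fields, splice in a dummy score, keep the last field, then join
# with tabs. The rows here are toy interval tuples, not database records.
from toolz.curried import pipe, map, take, last, concat, juxt, compose

bed_score = 0
rows = [("chr1", 10, 20, "ival-1", "+"),
        ("chr2", 30, 40, "ival-2", "-")]

bed_lines = pipe(
    rows,
    map(map(str)),                          # stringify every field
    map(juxt(compose(list, take(4)),        # first 4 fields
             lambda _: [str(bed_score)],    # dummy score
             compose(list, last))),         # strand (last field)
    map(concat),                            # flatten the three pieces
    map('\t'.join),
    list)

for line in bed_lines:
    print(line)
# chr1    10    20    ival-1    0    +   (tab-separated)
# chr2    30    40    ival-2    0    -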
def _make_samples(meta, shuffle):
    def _to_sample(person, images):
        # Random images needed for representation interpolation (3.5)
        x1 = _get_random_image()
        x2 = _get_random_image()
        return m(id=person["id_class"] - 1,
                 images=freeze(list(images)),
                 x1=freeze(x1),
                 x2=freeze(x2))

    samples = pipe(
        meta["persons"],
        tz.take(limit) if limit is not None else tz.identity,
        tz.map(lambda p: m(p=p,
                           i=tz.partition(args.N_images,
                                          _shuffled(p["images"]) if shuffle else p["images"]))),
        tz.mapcat(lambda s: [_to_sample(s.p, i) for i in s.i]),
        tz.take(limit) if limit is not None else tz.identity,
        list)

    if shuffle:
        random.shuffle(samples)

    return samples
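# A stripped-down sketch of the partition/mapcat/take shaping used above, with
# plain dicts standing in for the pyrsistent maps and the persons metadata.
import toolz.curried as tz

persons = [{"id": 1, "images": ["a", "b", "c", "d"]},
           {"id": 2, "images": ["e", "f"]}]
n_images = 2
limit = 3

samples = tz.pipe(
    persons,
    tz.map(lambda p: {"p": p, "i": tz.partition(n_images, p["images"])}),
    tz.mapcat(lambda s: [{"id": s["p"]["id"], "images": list(i)} for i in s["i"]]),
    tz.take(limit),
    list)
print(samples)
# [{'id': 1, 'images': ['a', 'b']}, {'id': 1, 'images': ['c', 'd']},
#  {'id': 2, 'images': ['e', 'f']}]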
def __init__(self, limit=None, schema=None, keep_properties=True, chunk=False):
    self.schema = schema
    self.limit = limit
    self.chunk = chunk or False
    self.set_property_filter(keep_properties)

    # Set up pipeline, in reverse order
    steps = [self.validate, self.process]
    if self.limit is not None:
        self.logger.debug('Loading %s features only', self.limit)
        steps.append(take(self.limit))
    if self.chunk:
        self.logger.debug('Features will arrive in batches of %s', self.chunk)
        steps.append(lambda it: grouper(self.chunk, it))

    self.pipeline = compose(*reversed(steps))
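# A tiny sketch of the same pipeline-building pattern: collect optional steps
# in execution order, then compose them in reverse so the composed function
# applies them first-to-last. The step names here are illustrative only.
from toolz import compose, curry, partition_all, take

def build_pipeline(limit=None, chunk=None):
    steps = [list]                            # stand-in for validate/process
    if limit is not None:
        steps.append(curry(take, limit))      # cap the number of features
    if chunk:
        steps.append(curry(partition_all, chunk))  # batch the output
    return compose(*reversed(steps))

pipeline = build_pipeline(limit=5, chunk=2)
print(list(pipeline(range(100))))   # [(0, 1), (2, 3), (4,)]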
def stop_by_no_improvement_parallel(logs: ListLogListType,
                                    extractor: ExtractorFnType,
                                    metric_name: str,
                                    early_stop: int = 3,
                                    threshold: float = 0.001) -> bool:
    """
    Checks the logs to see if feature selection should stop

    Parameters
    ----------
    logs : list of list of dict
        A list of log-like lists of dictionaries evaluations.

    extractor: function str -> float
        An extractor that takes a string and returns the value of that string
        on a dict

    metric_name: str
        String with the name of the column that refers to the metric column
        to be extracted

    early_stop: int (default 3)
        Number of iterations without improvement before stopping

    threshold: float (default 0.001)
        Threshold for model performance comparison

    Returns
    ----------
    stop: bool
        Whether to stop the recursion or not
    """
    if len(logs) < early_stop:
        return False

    log_list = [get_best_performing_log(log, extractor, metric_name) for log in logs]

    limited_logs = list(take(early_stop, log_list))
    curr_auc = get_avg_metric_from_extractor(limited_logs[-1], extractor, metric_name)

    return all(
        (curr_auc - get_avg_metric_from_extractor(log, extractor, metric_name)) <= threshold
        for log in limited_logs[:-1])
def txt_parser(filelike, max_num_nodes=MAXINT):
    if isinstance(filelike, io.IOBase):
        fileobj = filelike
    else:  # assume filename
        fileobj = open(filelike, 'rb')
    g = nx.DiGraph()
    # this pipe assumes there are no empty lines at the start of the file
    records = tz.pipe(fileobj,
                      c.map(_decode),
                      c.partitionby(_line_is_empty),  # split on empty lines
                      c.take_nth(2),                  # discard those empty lines
                      c.take(max_num_nodes),
                      c.map(get_record))
    for record in records:
        g.add_node(record['index'], attr_dict=record)
        for reference in record.get('references', []):
            g.add_edge(record['index'], reference)
    return g
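# A self-contained sketch of the "split records on blank lines" idiom above:
# partitionby groups runs of blank/non-blank lines, take_nth(2) keeps every
# other run (the non-blank ones), and take caps the number of records.
import toolz as tz
import toolz.curried as c

lines = ["record 1, line a", "record 1, line b", "",
         "record 2, line a", "",
         "record 3, line a"]

records = tz.pipe(lines,
                  c.partitionby(lambda line: line == ""),  # runs of blank / non-blank
                  c.take_nth(2),                           # keep the non-blank runs
                  c.take(2),                               # at most two records
                  c.map(list),
                  list)
print(records)
# [['record 1, line a', 'record 1, line b'], ['record 2, line a']]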
def sample_url_line_delimited(data, lines=5, encoding='utf-8'):
    """Get a sample of ``lines`` lines from a URL CSV or URL line-delimited JSON.

    Parameters
    ----------
    data : URL(CSV)
        A hosted CSV
    lines : int, optional, default ``5``
        Number of lines to read into memory
    """
    with closing(urlopen(data.url)) as r:
        raw = pipe(r, take(lines), map(bytes.strip),
                   curry(codecs.iterdecode, encoding=encoding),
                   b'\n'.decode(encoding).join)
        with tmpfile(data.filename) as fn:
            with codecs.open(fn, 'wb', encoding=encoding) as f:
                f.write(raw)
            yield fn
def sample_url_line_delimited(data, lines=5, encoding="utf-8", timeout=None):
    """Get a sample of ``lines`` lines from a URL CSV or URL line-delimited JSON.

    Parameters
    ----------
    data : URL(CSV)
        A hosted CSV
    lines : int, optional, default ``5``
        Number of lines to read into memory
    timeout : float, optional
        Timeout in seconds, passed through to ``urlopen``
    """
    with closing(urlopen(data.url, timeout=timeout)) as r:
        raw = pipe(
            r, take(lines), map(bytes.strip),
            curry(codecs.iterdecode, encoding=encoding),
            b"\n".decode(encoding).join
        )
        with tmpfile(data.filename) as fn:
            with codecs.open(fn, "wb", encoding=encoding) as f:
                f.write(raw)
            yield fn
def from_file_path(cls, file_path: FilePath, sheet_name: str, *, row_limit: int = 100):
    """Helper function to populate the columns of a sheet."""
    wb = get_wb(file_path)
    ws = wb[sheet_name]
    rows = tz.take(row_limit, ws.rows)
    header = next(rows)
    names = [c.value for c in header]
    letters = [c.column_letter for c in header]
    indices = [c.column for c in header]
    data_types = tz.pipe(
        rows
        # For each row, create a dict using names as keys
        , tz.map(lambda row: dict(zip(names, row)))
        # Get the .xlsx data_type for each cell
        , tz.map(tz.valmap(lambda cell: cell.data_type))
        # Combine cells into a list per column
        , tz.merge_with(list)
        # Count the cells for each data type in the column
        , tz.valmap(tz.frequencies)
        # Consolidate types
        , tz.valmap(lambda freq: (
            # If at least 1 "d"
            "date" if "d" in freq else
            # If at least 1 "s"
            "text" if "s" in freq else
            # If at least 1 "n"
            "number" if "n" in freq else
            str(freq)))
        , lambda d: [v for k, v in d.items()])
    cols = [
        Col(name=N, letter=L, index=I, data_type=D)
        for N, L, I, D in zip(names, letters, indices, data_types)
    ]
    return cls(name=sheet_name, cols=cols)
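# A sketch of the column-typing logic above without openpyxl: each row is a
# dict of column -> cell type code ("d" date, "s" string, "n" number), and the
# pipeline tallies codes per column before consolidating them into one label.
import toolz.curried as tz

rows = [{"when": "d", "name": "s", "amount": "n"},
        {"when": "d", "name": "s", "amount": "s"}]

data_types = tz.pipe(
    rows,
    tz.merge_with(list),        # column -> list of type codes
    tz.valmap(tz.frequencies),  # column -> {code: count}
    tz.valmap(lambda freq: "date" if "d" in freq else
                           "text" if "s" in freq else
                           "number" if "n" in freq else str(freq)))
print(data_types)
# {'when': 'date', 'name': 'text', 'amount': 'text'}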
def test_take():
    assert list(take(2)([1, 2, 3])) == [1, 2]
def head(data: Table, limit=100) -> Table:
    """Returns the first {limit} records of a Table."""
    return list(tz.take(limit, data))
def cols2hrs24(df):
    "Convert columns from `12:00 am, 1:00 am, ...11:00 pm` to `0, 1, ...23`"
    hrs = z.pipe(range(1, 13), it.cycle, z.drop(11), z.take(12), list)
    hrs24 = ['{}:00 {}'.format(hr, half) for half in ('am', 'pm') for hr in hrs]
    assert all(df.columns[2:] == hrs24), \
        "Expecting columns of form `12:00 am, 1:00 am, ...11:00 pm`"
    return df.rename(columns=dict(zip(hrs24, map(str, range(24)))))
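# The pipe above builds the 12-hour clock sequence starting at 12; a quick
# standalone check of that step, assuming `z` is toolz.curried and `it` is
# itertools as in the snippet.
import itertools as it
import toolz.curried as z

hrs = z.pipe(range(1, 13), it.cycle, z.drop(11), z.take(12), list)
print(hrs)   # [12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]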
def test_toolz_take(executor):
    actual = executor(take(5), range(10), npartitions=3)
    assert list(actual) == [0, 1, 2, 3, 4]
def edges_from_cycle(c: Cycle) -> Complex:
    return pipe(c, cycle, sliding_window(2), take(len(c)), map(pset), pset)
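# The same "close the cycle" trick with plain toolz and frozensets standing in
# for the pyrsistent psets: sliding_window(2) over the endlessly cycled vertex
# list yields consecutive pairs, and take(len(c)) stops after wrapping around.
from itertools import cycle
from toolz.curried import pipe, sliding_window, take, map

c = [1, 2, 3]
edges = pipe(c, cycle, sliding_window(2), take(len(c)), map(frozenset), set)
print(edges)   # {frozenset({1, 2}), frozenset({2, 3}), frozenset({1, 3})}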
def remove_by_feature_shuffling(log: LogType,
                                predict_fn: PredictFnType,
                                eval_fn: EvalFnType,
                                eval_data: pd.DataFrame,
                                extractor: ExtractorFnType,
                                metric_name: str,
                                max_removed_by_step: int = 50,
                                threshold: float = 0.005,
                                speed_up_by_importance: bool = False,
                                parallel: bool = False,
                                nthread: int = 1,
                                seed: int = 7) -> List[str]:
    """
    Performs feature selection based on the evaluation of the test vs the
    evaluation of the test with randomly shuffled features

    Parameters
    ----------
    log : LogType
        Dictionaries evaluations.

    predict_fn: function pandas.DataFrame -> pandas.DataFrame
        A partially defined predictor that takes a DataFrame and returns the
        predicted score for this dataframe

    eval_fn : function DataFrame -> log dict
        A partially defined evaluation function that takes a dataset with
        prediction and returns the evaluation logs.

    eval_data: pandas.DataFrame
        Data used to evaluate the model after shuffling

    extractor: function str -> float
        An extractor that takes a string and returns the value of that string
        on a dict

    metric_name: str
        String with the name of the column that refers to the metric column
        to be extracted

    max_removed_by_step: int (default 50)
        The maximum number of features to remove. It will only consider the
        least max_removed_by_step in terms of feature importance. If
        speed_up_by_importance=True it will first filter the least relevant
        features and shuffle only those. If speed_up_by_importance=False it
        will shuffle all features and drop the last max_removed_by_step in
        terms of PIMP. In both cases, the features will only be removed if the
        drop in performance is up to the defined threshold.

    threshold: float (default 0.005)
        Threshold for model performance comparison

    speed_up_by_importance: bool (default False)
        If it should narrow the search by looking at feature importance first
        before getting PIMP importance. If True, will only shuffle the top
        num_removed_by_step in terms of feature importance.

    parallel: bool (default False)

    nthread: int (default 1)

    seed: int (default 7)
        Random seed

    Returns
    ----------
    features: list of str
        The remaining features after removing based on feature importance
    """
    random.seed(seed)

    curr_metric = get_avg_metric_from_extractor(log, extractor, metric_name)
    eval_size = eval_data.shape[0]

    features_to_shuffle = order_feature_importance_avg_from_logs(log)[-max_removed_by_step:] \
        if speed_up_by_importance else get_used_features(log)

    def shuffle(feature: str) -> pd.DataFrame:
        return eval_data.assign(**{feature: eval_data[feature].sample(frac=1.0)})

    feature_to_delta_metric = compose(
        lambda m: curr_metric - m,
        get_avg_metric_from_extractor(extractor=extractor, metric_name=metric_name),
        gen_validator_log(fold_num=0, test_size=eval_size),
        eval_fn, predict_fn, shuffle)

    if parallel:
        metrics = Parallel(n_jobs=nthread, backend="threading")(
            delayed(feature_to_delta_metric)(feature) for feature in features_to_shuffle)
        feature_to_delta_metric = dict(zip(features_to_shuffle, metrics))
        gc.collect()
    else:
        feature_to_delta_metric = {feature: feature_to_delta_metric(feature)
                                   for feature in features_to_shuffle}

    return pipe(feature_to_delta_metric,
                valfilter(lambda delta_metric: delta_metric < threshold),
                sorted(key=lambda f: feature_to_delta_metric.get(f)),
                take(max_removed_by_step),
                list)
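# A toy run of the final selection step above: keep features whose shuffling
# barely hurts the metric (delta below the threshold), order them by how
# little they matter, and cap how many are taken per step. The deltas here are
# invented for illustration.
from toolz.curried import pipe, valfilter, sorted, take

feature_to_delta_metric = {"f1": 0.0001, "f2": 0.2, "f3": -0.001, "f4": 0.004}
threshold = 0.005
max_removed_by_step = 2

selected = pipe(feature_to_delta_metric,
                valfilter(lambda delta: delta < threshold),
                sorted(key=lambda f: feature_to_delta_metric.get(f)),
                take(max_removed_by_step),
                list)
print(selected)   # ['f3', 'f1']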
im = ax.imshow(model, cmap='magma')
axcolor = fig.add_axes([0.91, 0.1, 0.02, 0.8])
plt.colorbar(im, cax=axcolor)
for axis in [ax.xaxis, ax.yaxis]:
    axis.set_ticks(range(8))
    axis.set_ticks_position('none')
    axis.set_ticklabels(labels)
plt.savefig('./8_3_markov_model.png')


if __name__ == "__main__":
    dm = '../data/dm6.fa'
    dm_gz = '../data/dm6.fa.gz'
    demo = False
    if demo:
        # demo run: build the model from only the first 10 million bases
        model = tz.pipe(dm_gz, genome, c.take(10**7), markov)
    else:
        model = tz.pipe(dm_gz, genome, markov)
    print('The model is:\n')
    print(' ', ' '.join('ACGTacgt'), '\n')
    print(model)
    print('visualization ...')
    plot_model(model, labels='ACGTacgt')

'''
The dictionary is
{('A', 'A'): (0, 0), ('A', 'C'): (0, 1), ('A', 'G'): (0, 2), ('A', 'T'): (0, 3),
 ('A', 'a'): (0, 4), ('A', 'c'): (0, 5), ('A', 'g'): (0, 6), ('A', 't'): (0, 7),
 ('C', 'A'): (1, 0), ('C', 'C'): (1, 1), ('C', 'G'): (1, 2), ('C', 'T'): (1, 3),
 ('C', 'a'): (1, 4), ('C', 'c'): (1, 5), ('C', 'g'): (1, 6), ('C', 't'): (1, 7),
 ('G', 'A'): (2, 0), ('G', 'C'): (2, 1), ('G', 'G'): (2, 2), ('G', 'T'): (2, 3),
 ('G', 'a'): (2, 4), ('G', 'c'): (2, 5), ('G', 'g'): (2, 6), ('G', 't'): (2, 7),
 ('T', 'A'): (3, 0), ('T', 'C'): (3, 1), ('T', 'G'): (3, 2), ('T', 'T'): (3, 3),
 ('T', 'a'): (3, 4), ('T', 'c'): (3, 5), ('T', 'g'): (3, 6), ('T', 't'): (3, 7),
 ('a', 'A'): (4, 0), ('a', 'C'): (4, 1), ('a', 'G'): (4, 2), ('a', 'T'): (4, 3),
 ('a', 'a'): (4, 4), ('a', 'c'): (4, 5), ('a', 'g'): (4, 6), ('a', 't'): (4, 7),
 ('c', 'A'): (5, 0), ('c', 'C'): (5, 1), ('c', 'G'): (5, 2), ('c', 'T'): (5, 3),
 ('c', 'a'): (5, 4), ('c', 'c'): (5, 5), ('c', 'g'): (5, 6), ('c', 't'): (5, 7),
 ('g', 'A'): (6, 0), ('g', 'C'): (6, 1), ('g', 'G'): (6, 2), ('g', 'T'): (6, 3),
 ('g', 'a'): (6, 4), ('g', 'c'): (6, 5), ('g', 'g'): (6, 6), ('g', 't'): (6, 7),
 ('t', 'A'): (7, 0), ('t', 'C'): (7, 1), ('t', 'G'): (7, 2), ('t', 'T'): (7, 3),
 ('t', 'a'): (7, 4), ('t', 'c'): (7, 5), ('t', 'g'): (7, 6), ('t', 't'): (7, 7)}

The model is:

  A C G T a c g t

[[0.351 0.181 0.189 0.279 0. 0. 0. 0. ]