def compare_streams(db_engine, date_range, stream_names, allowed_parts_of_speech,
                    max_num_words, num_frequent_words=500):
    """Compare tokens from each stream in the stream_names list.

    For every stream a token-count dictionary is built, the counts of all
    streams are summed into one cross-stream dictionary, and
    ``get_posterior_probs_freq`` is run for each stream against that total.

    Args:
        db_engine: Passed through to ``get_content``.
        date_range: Passed through to ``get_content``.
        stream_names: Iterable of stream names to compare.
        allowed_parts_of_speech: Passed to ``parse_content_into_count``.
        max_num_words: Passed to ``parse_content_into_count`` and also used
            as the maximum number of result rows kept per stream.
        num_frequent_words: First argument of ``get_posterior_probs_freq``
            (previously the hard-coded value 500; presumably the number of
            most frequent words considered per stream -- confirm against
            that helper).

    Returns:
        dict mapping each stream name to a list of result dicts, each one
        merged on top of ``{"stream": <stream name>}``. An empty dict is
        returned when no stream names are given.
    """
    ## Create token count dictionaries for each stream name
    count_dicts_dict = {
        stream_name: tz.pipe(
            get_content(db_engine, stream_name, date_range),
            parse_content_into_count(max_num_words, allowed_parts_of_speech))
        for stream_name in stream_names
    }

    # reduce() without an initial value raises TypeError on an empty
    # sequence; with no streams there is nothing to compare.
    if not count_dicts_dict:
        return {}

    ## Create cross-stream count dictionary
    all_streams_count_dict = reduce(
        lambda x, y: tz.merge_with(sum, x, y),
        count_dicts_dict.values())

    ## Calculate posterior probabilities of the tokens
    posterior_probs = {}
    for stream_name in stream_names:
        # The lambda reads stream_name lazily, which is safe here because
        # the trailing `list` consumes the mapping within this iteration.
        posterior_probs[stream_name] = tz.pipe(
            get_posterior_probs_freq(
                num_frequent_words,
                all_streams_count_dict,
                count_dicts_dict[stream_name]),
            tz.map(lambda x: tz.merge({"stream": stream_name}, x)),
            tz.take(max_num_words),
            list,
        )
    return posterior_probs
def word_doc_counts(
    self,
    normalize: str = 'lemma',
    weighting: str = 'count',
    smooth_idf: bool = True,
    as_strings: bool = False,
    corpora: Optional[Union[str, Sequence[str]]] = None
) -> dict:  # pylint: disable=too-many-arguments
    '''
    Map the set of unique words in the Corpora to their document counts as
    absolute, relative, inverse, or binary frequencies of occurrence.

    The ``word_doc_counts`` method is invoked with the given ``normalize``,
    ``weighting``, ``smooth_idf`` and ``as_strings`` arguments through
    ``self._agg_with``, and the resulting mappings are combined by summing
    the values that share a key. ``corpora`` is forwarded to
    ``self._agg_with`` unchanged (presumably selecting which corpora take
    part -- confirm there).
    '''
    per_corpus_counts = methodcaller(
        'word_doc_counts', normalize, weighting, smooth_idf, as_strings)
    sum_by_word = tlzc.merge_with(sum)
    return self._agg_with(per_corpus_counts, sum_by_word, corpora)
def word_counts(
    self,
    normalize: str = 'lemma',
    weighting: str = 'count',
    as_strings: bool = False,
    corpora: Optional[Union[str, Sequence[str]]] = None
) -> dict:
    '''
    Map the set of unique words in :class:`Corpus` to their counts as
    absolute, relative, or binary frequencies of occurrence, similar to
    :meth:`Doc._.to_bag_of_words()` but aggregated over all corpora and docs.

    The ``word_counts`` method is invoked with the given ``normalize``,
    ``weighting`` and ``as_strings`` arguments through ``self._agg_with``,
    and the resulting mappings are combined by summing the values that
    share a key. ``corpora`` is forwarded to ``self._agg_with`` unchanged.
    '''
    count_words = methodcaller('word_counts', normalize, weighting, as_strings)
    add_up = tlzc.merge_with(sum)
    return self._agg_with(count_words, add_up, corpora)
def _create_plot(*,
                 x_sc=bq.LinearScale,
                 y_sc=bq.LinearScale,
                 x_ax=bq.Axis,
                 y_ax=bq.Axis,
                 mark=bq.Mark,
                 fig=bq.Figure,
                 options={},
                 params={}):
    """
    Build every component of a bqplot figure and return a (mark, figure)
    tuple.

    Each plot component (x_sc, y_sc, x_ax, y_ax, mark, fig) is given as a
    class and instantiated here. Plot options go into ``options``. Extra
    constructor arguments for the components go into ``params`` as a dict of
    { plot_component: { trait: value, ... } }, merged per component on top
    of ``_default_params``. For example, to change the grid lines of the
    x-axis:

        { 'x_ax': {'grid_lines' : 'solid'} }

    A callable param value is invoked with the options dict extended by the
    plot elements created so far, which lets a component depend on an
    earlier one:

        { 'x_ax': {'scale': lambda opts: opts['x_sc'] } }

    Creation order: both scales (seeing only ``options``), then both axes
    (also seeing the scales), then the mark (also seeing the axes), then
    the figure (also seeing the mark).
    """
    # Two-level merge: per component, caller traits override default traits.
    merged_params = tz.merge_with(tz.merge, _default_params, params)

    def resolve_traits(component, context):
        # Callables are evaluated against the context; plain values pass
        # through untouched.
        resolved = {}
        for trait, value in merged_params[component].items():
            resolved[trait] = value(context) if callable(value) else value
        return resolved

    x_scale = x_sc(**resolve_traits('x_sc', options))
    y_scale = y_sc(**resolve_traits('y_sc', options))
    with_scales = {**options, 'x_sc': x_scale, 'y_sc': y_scale}

    x_axis = x_ax(**resolve_traits('x_ax', with_scales))
    y_axis = y_ax(**resolve_traits('y_ax', with_scales))
    with_axes = {**with_scales, 'x_ax': x_axis, 'y_ax': y_axis}

    plot_mark = mark(**resolve_traits('mark', with_axes))
    with_mark = {**with_axes, 'mark': plot_mark}

    figure = fig(**resolve_traits('fig', with_mark))
    return plot_mark, figure
def _merge_with_defaults(params):
    """
    Perform a 2-level deep merge of params with _default_params, with
    correct merging of the params for each mark.

    params['marks'] is a list, so it cannot go through the generic merge:
    every entry of that list is merged individually on top of
    _default_params['marks']. When params has no 'marks' key, the result
    carries a single mark that uses the defaults only. All other keys are
    merged component by component, params taking precedence.
    """
    if 'marks' in params:
        mark_defaults = _default_params['marks']
        marks_params = [
            tz.merge(mark_defaults, mark_overrides)
            for mark_overrides in params['marks']
        ]
    else:
        marks_params = [_default_params['marks']]

    defaults_without_marks = tz.dissoc(_default_params, 'marks')
    params_without_marks = tz.dissoc(params, 'marks')
    merged_without_marks = tz.merge_with(
        tz.merge, defaults_without_marks, params_without_marks)

    return tz.merge(merged_without_marks, {'marks': marks_params})
def from_file_path(cls, file_path: FilePath, sheet_name: str, *, row_limit: int = 100):
    """Help function to populate the columns of a sheet.

    The workbook is obtained with ``get_wb(file_path)`` and at most
    ``row_limit`` rows of the sheet ``sheet_name`` are read. The first of
    those rows is the header (so it counts towards ``row_limit``); it
    supplies each column's name, letter and index. The remaining rows are
    only used to guess one data type per column from the cells'
    ``data_type`` codes:

    * "date" if at least one cell has code "d", otherwise
    * "text" if at least one cell has code "s", otherwise
    * "number" if at least one cell has code "n", otherwise
    * the string form of the code-frequency dict (``"{}"`` for a column
      without any data cell).

    Data types are collected per column *position*, not per header name,
    so duplicated or blank header names cannot shift a data type onto the
    wrong column, and a header-only sheet still yields all its columns.

    Returns:
        ``cls(name=sheet_name, cols=[...])`` with one ``Col`` per header
        cell; ``cols`` is empty when the sheet has no rows at all.
    """
    wb = get_wb(file_path)
    ws = wb[sheet_name]
    rows = tz.take(row_limit, ws.rows)

    # A sheet without rows has no header; avoid leaking StopIteration.
    header = next(rows, None)
    if header is None:
        return cls(name=sheet_name, cols=[])

    def consolidate(freq):
        # Precedence between the data type codes: date > text > number.
        if "d" in freq:
            return "date"
        if "s" in freq:
            return "text"
        if "n" in freq:
            return "number"
        return str(freq)

    # One bucket of data type codes per header position.
    codes_by_position = [[] for _ in header]
    for row in rows:
        for codes, cell in zip(codes_by_position, row):
            codes.append(cell.data_type)

    cols = [
        Col(name=head.value,
            letter=head.column_letter,
            index=head.column,
            data_type=consolidate(tz.frequencies(codes)))
        for head, codes in zip(header, codes_by_position)
    ]
    return cls(name=sheet_name, cols=cols)
def test_merge_with_list():
    """merge_with also takes its dicts as one list; shared keys are summed."""
    dicts = [{'a': 1}, {'a': 2}]
    expected = {'a': 3}
    assert merge_with(sum, dicts) == expected
def test_merge_with():
    """Given only the function, merge_with returns a callable that merges."""
    merge_summing = merge_with(sum)
    merged = merge_summing({1: 1}, {1: 2})
    assert merged == {1: 3}
def _expected_data(self):
    """Build the expected pricing arrays and adjustments.

    Returns:
        A ``(pricing, adjustments)`` tuple. ``pricing`` holds one array per
        entry of ``self.columns``, made of one 252-row column per sid.
        ``adjustments`` holds one dict per entry of ``self.columns``,
        mapping a row index to a list of ``Float64Multiply`` objects: every
        entry but the last gets the dividend and price-split adjustments
        (and they all share one dict object), the last entry gets the
        volume-split adjustments.
    """
    sids = 0, 1, 2
    modifier = {
        'low': 0,
        'open': 1,
        'close': 2,
        'high': 3,
        'volume': 0,
    }
    # np.hstack requires a real sequence: handing it a generator was
    # deprecated in NumPy 1.16 and raises TypeError in current releases,
    # so the per-sid columns are collected into a list first.
    pricing = [
        np.hstack([
            np.arange(252, dtype='float64')[:, np.newaxis] +
            1 +
            sid * 10000 +
            modifier[column] * 1000
            for sid in sorted(sids)
        ])
        for column in self.columns
    ]

    # There are two dividends and 1 split for each company.

    def dividend_adjustment(sid, which):
        """Return the adjustment dict for one dividend of ``sid``.

        The 'first' dividend sits at index 252 // 4, any other value of
        ``which`` selects index 3 * 252 // 4. The cash amounts are
        (sid + 1) / 10 for 'first' and (sid + 2) / 10 for 'second'.
        """
        if which == 'first':
            idx = 252 // 4
        else:
            idx = 3 * 252 // 4

        return {
            idx: [Float64Multiply(
                first_row=0,
                last_row=idx,
                first_col=sid,
                last_col=sid,
                value=float(
                    1 -
                    ((sid + 1 + (which == 'second')) / 10) /
                    (idx - 1 + sid * 10000 + 2000)
                ),
            )],
        }

    def split_adjustment(sid, volume):
        """Return the adjustment dict for the split of ``sid``.

        The split sits at index 252 // 2. For volume the multiplier is
        sid + 2; otherwise ``op.truediv(1)`` is applied to sid + 2
        (presumably a curried division giving 1 / (sid + 2) -- confirm).

        NOTE(review): earlier documentation gave the ratio as
        (sid + 1):1, while the code uses sid + 2 -- confirm which is
        intended.
        """
        idx = 252 // 2
        return {
            idx: [Float64Multiply(
                first_row=0,
                last_row=idx,
                first_col=sid,
                last_col=sid,
                value=(identity if volume else op.truediv(1))(sid + 2),
            )],
        }

    # Values sharing an index are lists; sum(lists, []) concatenates them.
    merge_adjustments = merge_with(flip(sum, []))

    adjustments = [
        # ohlc
        merge_adjustments(
            *tuple(dividend_adjustment(sid, 'first') for sid in sids) +
            tuple(dividend_adjustment(sid, 'second') for sid in sids) +
            tuple(split_adjustment(sid, volume=False) for sid in sids)
        )
    ] * (len(self.columns) - 1) + [
        # volume
        merge_adjustments(
            split_adjustment(sid, volume=True) for sid in sids
        ),
    ]

    return pricing, adjustments
def _expected_data(self):
    """Build the expected pricing arrays and adjustments.

    Returns:
        A ``(pricing, adjustments)`` tuple. ``pricing`` holds one array per
        entry of ``self.columns``, made of one 252-row column per sid.
        ``adjustments`` holds one dict per entry of ``self.columns``,
        mapping a row index to a list of ``Float64Multiply`` objects: every
        entry but the last gets the dividend and price-split adjustments
        (and they all share one dict object), the last entry gets the
        volume-split adjustments.
    """
    sids = 0, 1, 2
    modifier = {
        'low': 0,
        'open': 1,
        'close': 2,
        'high': 3,
        'volume': 0,
    }
    # A list, not a generator, is handed to np.hstack: generators were
    # deprecated as input in NumPy 1.16 and raise TypeError in current
    # releases.
    pricing = [
        np.hstack([np.arange(252, dtype='float64')[:, np.newaxis] + 1 +
                   sid * 10000 + modifier[column] * 1000
                   for sid in sorted(sids)])
        for column in self.columns
    ]

    # There are two dividends and 1 split for each company.

    def dividend_adjustment(sid, which):
        """Return the adjustment dict for one dividend of ``sid``.

        The 'first' dividend sits at index 252 // 4, any other value of
        ``which`` selects index 3 * 252 // 4. The cash amounts are
        (sid + 1) / 10 for 'first' and (sid + 2) / 10 for 'second'.
        """
        if which == 'first':
            idx = 252 // 4
        else:
            idx = 3 * 252 // 4

        return {
            idx: [
                Float64Multiply(
                    first_row=0,
                    last_row=idx,
                    first_col=sid,
                    last_col=sid,
                    value=float(1 -
                                ((sid + 1 + (which == 'second')) / 10) /
                                (idx - 1 + sid * 10000 + 2000)),
                )
            ],
        }

    def split_adjustment(sid, volume):
        """Return the adjustment dict for the split of ``sid``.

        The split sits at index 252 // 2. For volume the multiplier is
        sid + 2; otherwise ``op.truediv(1)`` is applied to sid + 2
        (presumably a curried division giving 1 / (sid + 2) -- confirm).

        NOTE(review): earlier documentation gave the ratio as
        (sid + 1):1, while the code uses sid + 2 -- confirm which is
        intended.
        """
        idx = 252 // 2
        return {
            idx: [
                Float64Multiply(
                    first_row=0,
                    last_row=idx,
                    first_col=sid,
                    last_col=sid,
                    value=(identity if volume else op.truediv(1))(sid + 2),
                )
            ],
        }

    # Values sharing an index are lists; sum(lists, []) concatenates them.
    merge_adjustments = merge_with(flip(sum, []))

    adjustments = [
        # ohlc
        merge_adjustments(
            *tuple(dividend_adjustment(sid, 'first') for sid in sids) +
            tuple(dividend_adjustment(sid, 'second') for sid in sids) +
            tuple(split_adjustment(sid, volume=False) for sid in sids))
    ] * (len(self.columns) - 1) + [
        # volume
        merge_adjustments(
            split_adjustment(sid, volume=True) for sid in sids),
    ]

    return pricing, adjustments
def _ensure_dir(dir_path):
    """Create ``dir_path`` unless it already exists as a directory."""
    try:
        os.mkdir(dir_path)
    except OSError:
        # Creating first and checking afterwards avoids the race between
        # an existence test and the creation; any other failure (missing
        # parent, path is a regular file, permissions) is still raised.
        if not os.path.isdir(dir_path):
            raise


def _collect_ast_infos_by_variable_name(regles_nodes):
    """Index the rule ("regle") and formula information by variable name."""
    ast_infos_by_variable_name = {}
    for regle_node in regles_nodes:
        regle_infos = {
            'regle_applications': regle_node['applications'],
            'regle_linecol': regle_node['linecol'],
            'regle_name': regle_node['name'],
            'source_file_name': regle_node['source_file_name'],
        }
        regle_tags = list(pluck('value', regle_node.get('tags', [])))
        if regle_tags:
            regle_infos['regle_tags'] = regle_tags
        for formula_node in regle_node['formulas']:
            if formula_node['type'] == 'formula':
                # assoc returns a new dict, regle_infos itself is untouched.
                ast_infos_by_variable_name[formula_node['name']] = assoc(
                    regle_infos, 'formula_linecol', formula_node['linecol'])
            elif formula_node['type'] == 'pour_formula':
                for unlooped_formula_node in unloop_helpers.iter_unlooped_nodes(
                        loop_variables_nodes=formula_node['loop_variables'],
                        node=formula_node['formula'],
                        unloop_keys=['name'],
                        ):
                    pour_formula_infos = merge(regle_infos, {
                        'pour_formula_linecol': formula_node['formula']['linecol'],
                        'pour_formula_name': formula_node['formula']['name'],
                    })
                    ast_infos_by_variable_name[unlooped_formula_node['name']] = pour_formula_infos
            else:
                # Raised explicitly: an `assert` statement is stripped under
                # `python -O`, which would silently skip unknown node types.
                raise AssertionError('Unhandled formula_node type: {}'.format(formula_node))
    return ast_infos_by_variable_name


def main():
    """Command-line entry point; always returns 0.

    Parses the ``--debug`` / ``--verbose`` flags, configures logging to
    stdout, makes sure ``json_dir_path`` and ``ast_dir_path`` exist, then
    writes through ``write_json_file``:

    * ``constants.json``: name -> value of the 'variable_const' entries of
      the tgvH file;
    * ``formulas_dependencies.json``: the pairs produced by
      ``dependencies_visitors.visit_node`` over the rule nodes;
    * ``variables_definitions.json``: per variable name, the rule/formula
      information merged with the 'variable_calculee' / 'variable_saisie'
      entries of the tgvH file.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('-d', '--debug', action='store_true', default=False, help='Display debug messages')
    parser.add_argument('-v', '--verbose', action='store_true', default=False, help='Increase output verbosity')
    # The parsed options are stored in a module-level name (presumably read
    # by other functions of this module -- confirm before changing).
    global args
    args = parser.parse_args()
    logging.basicConfig(
        level=logging.DEBUG if args.debug else (logging.INFO if args.verbose else logging.WARNING),
        stream=sys.stdout,
    )

    _ensure_dir(json_dir_path)
    _ensure_dir(ast_dir_path)

    # Load variables definitions
    tgvh_infos = list(load_tgvH_file())

    # Write constants
    constant_by_name = pipe(
        tgvh_infos,
        filter(lambda val: val['type'] == 'variable_const'),
        map(lambda d: (d['name'], d['value'])),
        dict,
    )
    write_json_file(data=constant_by_name, file_name='constants.json')

    # Write variables dependencies
    regles_nodes = list(mapcat(load_regles_nodes, iter_json_file_names('chap-*.json', 'res-ser*.json')))
    dependencies_by_formula_name = dict(mapcat(dependencies_visitors.visit_node, regles_nodes))
    write_json_file(data=dependencies_by_formula_name, file_name='formulas_dependencies.json')

    # Write variables definitions
    ast_infos_by_variable_name = _collect_ast_infos_by_variable_name(regles_nodes)

    def rename_key(d, key_name, key_new_name):
        return assoc(dissoc(d, key_name), key_new_name, d[key_name])

    tgvh_infos_by_variable_name = pipe(
        tgvh_infos,
        filter(lambda d: d['type'] in ('variable_calculee', 'variable_saisie')),
        map(lambda d: rename_key(d, 'linecol', 'tgvh_linecol')),
        map(lambda d: (d['name'], d)),  # Index by name
        dict,
    )

    # For a name present on both sides the two info dicts are merged, the
    # tgvH entries overriding equal keys.
    definition_by_variable_name = merge_with(merge, ast_infos_by_variable_name, tgvh_infos_by_variable_name)

    write_json_file(data=definition_by_variable_name, file_name='variables_definitions.json')

    return 0