def compare_streams(db_engine, date_range, stream_names, allowed_parts_of_speech, max_num_words):
    """Compare tokens from each stream in the stream_names list"""

    ## Create token count dictionaries for each stream name
    count_dicts_dict = {}
    for stream_name in stream_names:
        count_dicts_dict[stream_name] = tz.pipe(
            get_content(
                db_engine, 
                stream_name,
                date_range),
            parse_content_into_count(max_num_words, allowed_parts_of_speech))

    ## Create cross-stream count dictionary
    all_streams_count_dict = reduce(
        lambda x, y: tz.merge_with(sum, x, y),
        count_dicts_dict.values())

    ## Calculate posterior probabilities of the tokens
    posterior_probs = {}
    for stream_name in stream_names:
        posterior_probs[stream_name] = tz.pipe(
            get_posterior_probs_freq(
                500, # limited to the 500 most frequent words in this stream, at this time
                all_streams_count_dict, 
                count_dicts_dict[stream_name]),
            tz.map(lambda x: tz.merge({"stream": stream_name}, x)),
            tz.take(max_num_words),
            list,
        )
    return posterior_probs
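The cross-stream merge above is the core idiom of this example: tz.merge_with(sum, x, y) adds up counts for tokens the two dicts share. A minimal sketch with made-up counts (note that reduce must come from functools):

import toolz as tz
from functools import reduce

count_dicts = {
    "stream_a": {"cat": 3, "dog": 1},
    "stream_b": {"cat": 2, "fox": 5},
}
# Pairwise merge: counts for shared tokens are summed
combined = reduce(lambda x, y: tz.merge_with(sum, x, y), count_dicts.values())
assert combined == {"cat": 5, "dog": 1, "fox": 5}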
Example #2
    def word_doc_counts(
            self,
            normalize: str = 'lemma',
            weighting: str = 'count',
            smooth_idf: bool = True,
            as_strings: bool = False,
            corpora: Optional[Union[str, Sequence[str]]] = None
    ) -> dict: # pylint: disable=too-many-arguments
        '''
        Map the set of unique words in the Corpora to their document counts as absolute,
        relative, inverse, or binary frequencies of occurrence.
        '''
        func = methodcaller('word_doc_counts', normalize, weighting, smooth_idf, as_strings)

        return self._agg_with(func, tlzc.merge_with(sum), corpora)
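Here tlzc is the curried cytoolz namespace, so tlzc.merge_with(sum) is a ready-made aggregator that waits for the per-corpus dicts and sums counts key-wise. A quick sketch of the curried form (toolz.curried behaves the same as cytoolz.curried):

from toolz import curried as tlzc

agg = tlzc.merge_with(sum)  # curried: the dicts are supplied later
assert agg({"a": 1}, {"a": 2, "b": 3}) == {"a": 3, "b": 3}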
Example #3
    def word_counts(
            self,
            normalize: str = 'lemma',
            weighting: str = 'count',
            as_strings: bool = False,
            corpora: Optional[Union[str, Sequence[str]]] = None
    ) -> dict:
        '''
        Map the set of unique words in :class:`Corpus` to their counts as
        absolute, relative, or binary frequencies of occurrence,
        similar to :meth:`Doc._.to_bag_of_words()` but aggregated over all corpora and docs.
        '''
        func = methodcaller('word_counts', normalize, weighting, as_strings)

        return self._agg_with(func, tlzc.merge_with(sum), corpora)
Example #4
def _create_plot(*, x_sc=bq.LinearScale, y_sc=bq.LinearScale,
                 x_ax=bq.Axis, y_ax=bq.Axis, mark=bq.Mark, fig=bq.Figure,
                 options={}, params={}):
    """
    Initializes all components of a bqplot figure and returns resulting
    (mark, figure) tuple. Each plot component is passed in as a class.

    The plot options should be passed into options. Any additional parameters
    required by the plot components are passed into params as a dict of
    { plot_component: { trait: value, ... } }.

    For example, to change the grid lines of the x-axis:
    { 'x_ax': {'grid_lines' : 'solid'} }.

    If the param value is a function, it will be called with the options dict
    augmented with all previously created plot elements. This permits
    dependencies on plot elements:
    { 'x_ax': {'scale': lambda opts: opts['x_sc'] } }
    """
    def maybe_call(maybe_fn, opts):
        if callable(maybe_fn):
            return maybe_fn(opts)
        return maybe_fn

    def call_params(component, opts):
        return {trait: maybe_call(val, opts)
                for trait, val in params[component].items()}

    # Perform a 2-level deep merge
    params = tz.merge_with(tz.merge, _default_params, params)

    x_sc = x_sc(**call_params('x_sc', options))
    y_sc = y_sc(**call_params('y_sc', options))
    options = {**options, **{'x_sc': x_sc, 'y_sc': y_sc}}

    x_ax = x_ax(**call_params('x_ax', options))
    y_ax = y_ax(**call_params('y_ax', options))
    options = {**options, **{'x_ax': x_ax, 'y_ax': y_ax}}

    mark = mark(**call_params('mark', options))
    options = {**options, **{'mark': mark}}

    fig = fig(**call_params('fig', options))

    return mark, fig
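A hypothetical call, assuming bqplot is importable as bq and that _default_params supplies a baseline entry for every component (grid_lines, scale, and scales are real bqplot traits; the data values are made up):

import bqplot as bq

mark, fig = _create_plot(
    mark=bq.Lines,
    params={
        'x_ax': {'grid_lines': 'dashed',
                 # deferred: resolved only once 'x_sc' exists in opts
                 'scale': lambda opts: opts['x_sc']},
        'mark': {'x': [0, 1, 2], 'y': [3, 1, 2],
                 'scales': lambda opts: {'x': opts['x_sc'],
                                         'y': opts['y_sc']}},
    })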
Example #5
def _merge_with_defaults(params):
    """
    Performs a 2-level deep merge of params with _default_params, with correct
    merging of params for each mark.

    This is a bit complicated since params['marks'] is a list and we need to
    make sure each mark gets the default params.
    """
    marks_params = [
        tz.merge(default, param) for default, param in zip(
            itertools.repeat(_default_params['marks']), params['marks'])
    ] if 'marks' in params else [_default_params['marks']]

    merged_without_marks = tz.merge_with(tz.merge,
                                         tz.dissoc(_default_params, 'marks'),
                                         tz.dissoc(params, 'marks'))

    return tz.merge(merged_without_marks, {'marks': marks_params})
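The 2-level merge itself is easy to show in isolation: tz.merge_with(tz.merge, defaults, overrides) merges per component, so user values win trait-by-trait instead of replacing whole component dicts. A sketch with invented defaults:

import toolz as tz

defaults = {'fig': {'title': '', 'padding_y': 0.0}, 'x_ax': {'label': ''}}
user = {'fig': {'title': 'My plot'}}
merged = tz.merge_with(tz.merge, defaults, user)
assert merged == {'fig': {'title': 'My plot', 'padding_y': 0.0},
                  'x_ax': {'label': ''}}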
Example #6
    def from_file_path(cls,
                       file_path: FilePath,
                       sheet_name: str,
                       *,
                       row_limit: int = 100):
        """Help function to populate the columns of a sheet."""
        wb = get_wb(file_path)
        ws = wb[sheet_name]
        rows = tz.take(row_limit, ws.rows)
        header = next(rows)
        names = [c.value for c in header]
        letters = [c.column_letter for c in header]
        indices = [c.column for c in header]
        data_types = tz.pipe(
            rows,
            # For each row, create a dict using names as keys
            tz.map(lambda row: dict(zip(names, row))),
            # Get the .xlsx data_type for each cell
            tz.map(tz.valmap(lambda cell: cell.data_type)),
            # Combine cells into a list per column
            tz.merge_with(list),
            # Count the cells for each data type in the column
            tz.valmap(tz.frequencies),
            # Consolidate types
            tz.valmap(lambda freq: (
                "date" if "d" in freq else    # at least one "d" (date)
                "text" if "s" in freq else    # at least one "s" (string)
                "number" if "n" in freq else  # at least one "n" (numeric)
                str(freq))),
            lambda d: list(d.values()))

        cols = [
            Col(name=N, letter=L, index=I, data_type=D)
            for N, L, I, D in zip(names, letters, indices, data_types)
        ]
        return cls(name=sheet_name, cols=cols)
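The tz.merge_with(list) step is what pivots the row dicts into per-column lists, which tz.valmap(tz.frequencies) then tallies. A sketch with invented cell data types:

import toolz as tz

cell_types = [{'name': 's', 'qty': 'n'},
              {'name': 's', 'qty': 'd'}]
by_column = tz.merge_with(list, cell_types)
assert by_column == {'name': ['s', 's'], 'qty': ['n', 'd']}
assert tz.valmap(tz.frequencies, by_column) == {
    'name': {'s': 2}, 'qty': {'n': 1, 'd': 1}}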
Example #7
def test_merge_with_list():
    assert merge_with(sum, [{'a': 1}, {'a': 2}]) == {'a': 3}
Example #8
def test_merge_with():
    assert merge_with(sum)({1: 1}, {1: 2}) == {1: 3}
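Examples #7 and #8 cover two call shapes: plain toolz.merge_with takes the dicts (or a single collection of dicts) in the same call, while the merge_with(sum)({...}, {...}) form only works with the curried variant from toolz.curried or cytoolz.curried:

from toolz import merge_with as plain
from toolz.curried import merge_with as curried

assert plain(sum, [{'a': 1}, {'a': 2}]) == {'a': 3}  # collection of dicts
assert curried(sum)({1: 1}, {1: 2}) == {1: 3}        # curried, dicts later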
Example #9
    def _expected_data(self):
        sids = 0, 1, 2
        modifier = {
            'low': 0,
            'open': 1,
            'close': 2,
            'high': 3,
            'volume': 0,
        }
        pricing = [
            # np.hstack expects a sequence, not a generator, on modern NumPy
            np.hstack([
                np.arange(252, dtype='float64')[:, np.newaxis] +
                1 +
                sid * 10000 +
                modifier[column] * 1000
                for sid in sorted(sids)
            ])
            for column in self.columns
        ]

        # There are two dividends and one split for each company.

        def dividend_adjustment(sid, which):
            """The dividends occur at indices 252 // 4 and 3 * 252 / 4
            with a cash amount of sid + 1 / 10 and sid + 2 / 10
            """
            if which == 'first':
                idx = 252 // 4
            else:
                idx = 3 * 252 // 4

            return {
                idx: [Float64Multiply(
                    first_row=0,
                    last_row=idx,
                    first_col=sid,
                    last_col=sid,
                    value=float(
                        1 -
                        ((sid + 1 + (which == 'second')) / 10) /
                        (idx - 1 + sid * 10000 + 2000)
                    ),
                )],
            }

        def split_adjustment(sid, volume):
            """The splits occur at index 252 // 2, scaling volume by (sid + 2)
            and price by 1 / (sid + 2).
            """
            idx = 252 // 2
            return {
                idx: [Float64Multiply(
                    first_row=0,
                    last_row=idx,
                    first_col=sid,
                    last_col=sid,
                    # `op` here is presumably toolz.curried.operator, so
                    # op.truediv(1) is a partial computing 1 / x
                    value=(identity if volume else op.truediv(1))(sid + 2),
                )],
            }

        merge_adjustments = merge_with(flip(sum, []))

        adjustments = [
            # ohlc
            merge_adjustments(
                *tuple(dividend_adjustment(sid, 'first') for sid in sids) +
                tuple(dividend_adjustment(sid, 'second') for sid in sids) +
                tuple(split_adjustment(sid, volume=False) for sid in sids)
            )
        ] * (len(self.columns) - 1) + [
            # volume
            merge_adjustments(
                split_adjustment(sid, volume=True) for sid in sids
            ),
        ]

        return pricing, adjustments
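The merge_adjustments helper deserves unpacking: assuming merge_with is the curried variant and flip comes from toolz, flip(sum, []) is a partial computing sum(lists, []), i.e. list concatenation, so adjustments landing on the same index are appended rather than overwritten. A sketch with placeholder adjustments:

from toolz import flip
from toolz.curried import merge_with

merge_adjustments = merge_with(flip(sum, []))
first = {63: ['div1']}
second = {63: ['div2'], 126: ['split']}
assert merge_adjustments(first, second) == {63: ['div1', 'div2'],
                                            126: ['split']}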
Example #10
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('-d', '--debug', action='store_true', default=False, help='Display debug messages')
    parser.add_argument('-v', '--verbose', action='store_true', default=False, help='Increase output verbosity')
    global args
    args = parser.parse_args()
    logging.basicConfig(
        level=logging.DEBUG if args.debug else (logging.INFO if args.verbose else logging.WARNING),
        stream=sys.stdout,
        )

    if not os.path.isdir(json_dir_path):
        os.mkdir(json_dir_path)
    if not os.path.isdir(ast_dir_path):
        os.mkdir(ast_dir_path)

    # Load variables definitions

    tgvh_infos = list(load_tgvH_file())

    # Write constants

    constant_by_name = pipe(
        tgvh_infos,
        filter(lambda val: val['type'] == 'variable_const'),
        map(lambda d: (d['name'], d['value'])),
        dict,
        )
    write_json_file(data=constant_by_name, file_name='constants.json')

    # Write variables dependencies

    regles_nodes = list(mapcat(load_regles_nodes, iter_json_file_names('chap-*.json', 'res-ser*.json')))
    dependencies_by_formula_name = dict(list(mapcat(dependencies_visitors.visit_node, regles_nodes)))
    write_json_file(data=dependencies_by_formula_name, file_name='formulas_dependencies.json')

    # Write variables definitions

    ast_infos_by_variable_name = {}
    for regle_node in regles_nodes:
        regle_infos = {
            'regle_applications': regle_node['applications'],
            'regle_linecol': regle_node['linecol'],
            'regle_name': regle_node['name'],
            'source_file_name': regle_node['source_file_name'],
            }
        regle_tags = list(pluck('value', regle_node.get('tags', [])))
        if regle_tags:
            regle_infos['regle_tags'] = regle_tags
        for formula_node in regle_node['formulas']:
            if formula_node['type'] == 'formula':
                ast_infos_by_variable_name[formula_node['name']] = assoc(
                    regle_infos, 'formula_linecol', formula_node['linecol'])
            elif formula_node['type'] == 'pour_formula':
                for unlooped_formula_node in unloop_helpers.iter_unlooped_nodes(
                        loop_variables_nodes=formula_node['loop_variables'],
                        node=formula_node['formula'],
                        unloop_keys=['name'],
                        ):
                    pour_formula_infos = merge(regle_infos, {
                        'pour_formula_linecol': formula_node['formula']['linecol'],
                        'pour_formula_name': formula_node['formula']['name'],
                        })
                    ast_infos_by_variable_name[unlooped_formula_node['name']] = pour_formula_infos
            else:
                assert False, 'Unhandled formula_node type: {}'.format(formula_node)

    def rename_key(d, key_name, key_new_name):
        return assoc(dissoc(d, key_name), key_new_name, d[key_name])

    tgvh_infos_by_variable_name = pipe(
        tgvh_infos,
        filter(lambda d: d['type'] in ('variable_calculee', 'variable_saisie')),
        map(lambda d: rename_key(d, 'linecol', 'tgvh_linecol')),
        map(lambda d: (d['name'], d)),  # Index by name
        dict,
        )

    definition_by_variable_name = merge_with(merge, ast_infos_by_variable_name, tgvh_infos_by_variable_name)

    write_json_file(data=definition_by_variable_name, file_name='variables_definitions.json')

    return 0
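The closing merge_with(merge, ...) applies the same trick at the top level: each variable name maps to the union of its AST-derived info and its tgvH definition, with tgvH keys winning on collision. A sketch with hypothetical variable records:

from toolz import merge, merge_with

ast_infos = {'VAR1': {'regle_name': 'R1'}}
tgvh = {'VAR1': {'type': 'variable_calculee'},
        'VAR2': {'type': 'variable_saisie'}}
assert merge_with(merge, ast_infos, tgvh) == {
    'VAR1': {'regle_name': 'R1', 'type': 'variable_calculee'},
    'VAR2': {'type': 'variable_saisie'}}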