def guess_input(raw):
    """Print summary information about a raw text input.

    The report covers: number of lines, range of non-empty line lengths,
    total character count, number of blank-line separators, whether tabs or
    other non-newline whitespace occur, the distinct non-whitespace
    characters, the integers found by ``extract_ints`` and the non-numeric
    whitespace-separated words with their counts.

    Args:
        raw: The complete input text as a single string.

    Returns:
        None. Everything is written to standard output.
    """
    lines = raw.splitlines()
    print(f"# lines: {len(lines)}")

    # Blank lines are ignored for the length range. When there is no
    # non-empty line at all (empty input), the range line is skipped instead
    # of crashing in min()/max() -- the same policy as "Int range" below.
    line_lengths = {len(line) for line in lines if line}
    if line_lengths:
        print(f"Line length range: {min(line_lengths)} to {max(line_lengths)}")

    print(f"# chars: {len(raw)}")
    # Non-overlapping occurrences, identical to len(raw.split("\n\n")) - 1.
    print(f"# double newlines: {raw.count(chr(10) * 2)}")

    whitespace, tabs = False, False
    # A set gives O(1) membership; the output is sorted anyway, so the
    # first-seen order is irrelevant.
    seen = set()
    for ch in raw:
        if ch == '\n':
            continue
        if ch == '\t':
            tabs = True
        elif ch.isspace():
            whitespace = True
        else:
            seen.add(ch)
    print(f"Contains tabs: {tabs}")
    print(f"Contains whitespace: {whitespace}")
    print(f"Chars: {''.join(sorted(seen))}")

    ints = list(extract_ints(raw, negative=True))
    print(f"# Ints: {len(ints)}")
    if len(ints) > 0:
        print(f"Int range: {min(ints)} to {max(ints)}")

    # Count every whitespace-separated token that is not purely numeric.
    ms = Multiset()
    for word in raw.split():
        if word.isnumeric():
            continue
        ms.add(word)
    common = sorted(ms.items(), key=by_index(1), reverse=True)
    print(f"Most common words: {common}")
    print()
Exemplo n.º 2
0
def commutative_sequence_variable_partition_iter(values: Multiset, variables: List[VariableWithCount]
                                                ) -> Iterator[Dict[str, Multiset]]:
    """Enumerate the ways of distributing a multiset of values among variables.

    .. note::

        No ordering of the yielded substitutions is guaranteed, because the
        enumeration is driven by dictionaries and multisets.

    Example:

        For a subject like ``fc(a, a, a, b, b, c)`` and a pattern like ``f(x__, y___, y___)`` one can define the
        following input parameters for the partitioning:

        >>> x = VariableWithCount(name='x', count=1, minimum=1, default=None)
        >>> y = VariableWithCount(name='y', count=2, minimum=0, default=None)
        >>> values = Multiset('aaabbc')

        Then the solutions are found (and sorted to get a unique output):

        >>> substitutions = commutative_sequence_variable_partition_iter(values, [x, y])
        >>> as_strings = list(str(Substitution(substitution)) for substitution in substitutions)
        >>> for substitution in sorted(as_strings):
        ...     print(substitution)
        {x ↦ {a, a, a, b, b, c}, y ↦ {}}
        {x ↦ {a, a, a, c}, y ↦ {b}}
        {x ↦ {a, b, b, c}, y ↦ {a}}
        {x ↦ {a, c}, y ↦ {a, b}}

    Args:
        values:
            The multiset of values to split up between the variables.
        variables:
            The variables receiving the values. Each one carries a name, the
            number of times it occurs, the minimum number of values it needs
            and an optional default.

    Yields:
        Each substitution (variable name to multiset, or to the variable's
        default) that is a valid partitioning of the values.
    """
    # A single variable has a dedicated, cheaper enumeration.
    if len(variables) == 1:
        yield from _commutative_single_variable_partiton_iter(values, variables[0])
        return

    # One generator factory per distinct value; chaining them distributes
    # every value in turn.
    generators = [
        _make_variable_generator_factory(value, count, variables)
        for value, count in values.items()
    ]

    initial = {var.name: Multiset() for var in variables}  # type: Dict[str, 'Multiset[T]']
    for candidate in generator_chain(initial, *generators):
        for variable in variables:
            assigned = candidate[variable.name]
            if variable.default is not None and len(assigned) == 0:
                # An empty assignment falls back to the variable's default.
                candidate[variable.name] = variable.default
            elif len(assigned) < variable.minimum:
                # Too few values for this variable: reject the candidate.
                break
        else:
            # The entry for the unnamed (None) variable is not reported.
            if None in candidate:
                del candidate[None]
            yield candidate
Exemplo n.º 3
0
def count():
    """Print each front-face artist with its number of distinct cube printings.

    The cube is loaded from the database, its printings are de-duplicated,
    and the artists are printed in ascending order of how many of those
    printings they are credited on.
    """
    cube = CubeLoader(Loader.load()).load()

    # for printing in (printing for printing in set(cube.all_printings) if printing.front_face.artist.name=='Eric Deschamps'):
    # 	print(printing)

    distinct_printings = set(cube.all_printings)
    artists = Multiset(
        printing.front_face.artist for printing in distinct_printings
    )

    # Sort by multiplicity only; ties keep the multiset's iteration order.
    tally = list(artists.items())
    tally.sort(key=lambda pair: pair[1])
    for artist, multiplicity in tally:
        print(artist, multiplicity)
Exemplo n.º 4
0
def _match_commutative_operation(subject_operands, pattern, substitution,
                                 constraints, matcher):
    """Yield the matches of a commutative pattern against subject operands.

    Args:
        subject_operands: Operands of the subject expression; they are
            collected into a multiset because their order is irrelevant.
        pattern: The pre-split pattern parts. The attributes ``constant``,
            ``rest``, ``syntactic``, ``fixed_variables``,
            ``fixed_variable_infos``, ``sequence_variables``,
            ``sequence_variable_infos``, ``operation``, ``wildcard_fixed``
            and the precomputed lengths are read.
        substitution: The substitution established so far. It must support
            ``in``, item access and ``union``.
        constraints: Passed on to the match factories and to
            ``_check_constraints``.
        matcher: Passed on to ``_fixed_expr_factory``.

    Yields:
        Whatever ``_check_constraints`` yields for every merged substitution
        that is consistent with ``substitution``.
    """
    subjects = Multiset(subject_operands)  # type: Multiset
    # Constant pattern operands must occur literally among the subjects.
    if not pattern.constant <= subjects:
        return
    subjects -= pattern.constant
    rest_expr = pattern.rest + pattern.syntactic
    needed_length = (pattern.sequence_variable_min_length +
                     pattern.fixed_variable_length + len(rest_expr) +
                     pattern.wildcard_min_length)

    # Not enough remaining operands to satisfy all minimum lengths.
    if len(subjects) < needed_length:
        return

    # Fixed variables that are already bound consume their replacement from
    # the subjects right away and are removed from the set still to match.
    fixed_vars = Multiset(pattern.fixed_variables)  # type: Multiset[str]
    for name, count in pattern.fixed_variables.items():
        if name in substitution:
            replacement = substitution[name]
            if issubclass(pattern.operation,
                          AssociativeOperation) and isinstance(
                              replacement, pattern.operation):
                needed_count = Multiset(substitution[name])  # type: Multiset
            else:
                if not isinstance(replacement, Expression):
                    return
                needed_count = Multiset({replacement: 1})
            if count > 1:
                needed_count *= count
            if not needed_count <= subjects:
                return
            subjects -= needed_count
            del fixed_vars[name]

    factories = [
        _fixed_expr_factory(e, constraints, matcher) for e in rest_expr
    ]

    if not issubclass(pattern.operation, AssociativeOperation):
        for name, count in fixed_vars.items():
            min_count, symbol_type = pattern.fixed_variable_infos[name]
            factory = _fixed_var_iter_factory(name, count, min_count,
                                              symbol_type, constraints)
            factories.append(factory)

        if pattern.wildcard_fixed is True:
            factory = _fixed_var_iter_factory(None, 1,
                                              pattern.wildcard_min_length,
                                              None, constraints)
            factories.append(factory)
    else:
        # For associative operations only symbol-typed fixed variables get a
        # factory here; the others are handled with the sequence variables
        # in the loop below.
        for name, count in fixed_vars.items():
            min_count, symbol_type = pattern.fixed_variable_infos[name]
            if symbol_type is not None:
                factory = _fixed_var_iter_factory(name, count, min_count,
                                                  symbol_type, constraints)
                factories.append(factory)

    expr_counter = Multiset(subjects)  # type: Multiset

    for rem_expr, substitution in generator_chain((expr_counter, substitution),
                                                  *factories):
        sequence_vars = _variables_with_counts(pattern.sequence_variables,
                                               pattern.sequence_variable_infos)
        if issubclass(pattern.operation, AssociativeOperation):
            sequence_vars += _variables_with_counts(
                fixed_vars, pattern.fixed_variable_infos)
            if pattern.wildcard_fixed is True:
                sequence_vars += (VariableWithCount(
                    None, 1, pattern.wildcard_min_length), )
        if pattern.wildcard_fixed is False:
            sequence_vars += (VariableWithCount(None, 1,
                                                pattern.wildcard_min_length), )

        for sequence_subst in commutative_sequence_variable_partition_iter(
                Multiset(rem_expr), sequence_vars):
            if issubclass(pattern.operation, AssociativeOperation):
                for v in fixed_vars.distinct_elements():
                    if v not in sequence_subst:
                        continue
                    min_len = pattern.fixed_variable_infos[v].min_count
                    value = cast(Multiset, sequence_subst[v])
                    if len(value) > min_len:
                        # Surplus operands are wrapped into one nested
                        # operation so the variable keeps its fixed length.
                        normal = Multiset(list(value)[:min_len - 1])
                        wrapped = pattern.operation(*(value - normal))
                        normal.add(wrapped)
                        # next(iter(...)) instead of iter(...).next(): the
                        # .next() method does not exist on Python 3 iterators.
                        sequence_subst[v] = normal if min_len > 1 else next(
                            iter(normal))
                    else:
                        assert len(
                            value
                        ) == 1 and min_len == 1, u"Fixed variables with length != 1 are not supported."
                        sequence_subst[v] = next(iter(value))
            try:
                result = substitution.union(sequence_subst)
            except ValueError:
                # A ValueError from union means this candidate cannot be
                # merged with the existing substitution; skip it.
                pass
            else:
                for i in _check_constraints(result, constraints):
                    yield i
Exemplo n.º 5
0
class CommutativePatternsParts(object):
    """The operands of a commutative operation pattern, grouped by kind.

    Every operand is sorted into exactly one group, depending on how it has
    to be matched. Groups are stored as :class:`.Multiset` instances because
    operand order is irrelevant for a commutative operation. Several lengths
    are accumulated while sorting so they need not be recomputed later.

    Treat instances as immutable: do not modify the attributes afterwards.

    Attributes:
        operation (Type[Operation]):
            The operation type passed to the constructor.
        length (int):
            The number of operands passed to the constructor.

        constant (Multiset):
            Operands for which ``is_constant`` is true.
        syntactic (Multiset):
            Operands that are neither constant nor a wildcard and for which
            ``is_syntactic`` is true.
        rest (Multiset):
            Operands that fall into none of the other groups.
        sequence_variables (Multiset[str]):
            Names of the named wildcards that are not of fixed size.
        sequence_variable_infos (Dict[str, VarInfo]):
            Maps each sequence variable name to a ``VarInfo`` built from its
            ``min_count``, ``None`` as type and its ``optional`` value.
        fixed_variables (Multiset[str]):
            Names of the named wildcards that are of fixed size.
        fixed_variable_infos (Dict[str, VarInfo]):
            Maps each fixed variable name to a ``VarInfo`` built from its
            ``min_count``, its ``symbol_type`` (``None`` if the wildcard has
            no such attribute) and its ``optional`` value.

        sequence_variable_min_length (int):
            Sum of ``min_count`` over the occurrences of named sequence
            wildcards whose ``optional`` is ``None``.
        fixed_variable_length (int):
            Sum of ``min_count`` over the occurrences of named fixed-size
            wildcards whose ``optional`` is ``None``.
        optional_count (int):
            Number of occurrences of named fixed-size wildcards whose
            ``optional`` is not ``None``.
        wildcard_min_length (int):
            Sum of ``min_count`` over all unnamed wildcards.
        wildcard_fixed (Optional[bool]):
            ``None`` if there is no unnamed wildcard. Otherwise the ``and``
            combination of ``fixed_size`` over all unnamed wildcards, i.e.
            truthy only if every one of them has a fixed size.
    """
    def __init__(self, operation: Type[Operation],
                 *expressions: Expression) -> None:
        """Sort the operands of a commutative operation into their groups.

        Args:
            operation:
                The type of the commutative operation.
            *expressions:
                The operands of the commutative operation.
        """
        self.operation = operation
        self.length = len(expressions)

        self.constant = Multiset()  # type: Multiset
        self.syntactic = Multiset()  # type: Multiset
        self.rest = Multiset()  # type: Multiset
        self.sequence_variables = Multiset()  # type: Multiset[str]
        self.fixed_variables = Multiset()  # type: Multiset[str]
        self.sequence_variable_infos = {}
        self.fixed_variable_infos = {}

        self.sequence_variable_min_length = 0
        self.fixed_variable_length = 0
        self.wildcard_min_length = 0
        self.optional_count = 0
        self.wildcard_fixed = None

        for operand in expressions:
            if is_constant(operand):
                self.constant[operand] += 1
                continue

            if not isinstance(operand, Wildcard):
                if is_syntactic(operand):
                    self.syntactic[operand] += 1
                else:
                    self.rest[operand] += 1
                continue

            wildcard = cast(Wildcard, operand)
            name = wildcard.variable_name

            if not name:
                # Unnamed wildcards are only summarised: their combined
                # minimum length and whether all of them are fixed-size.
                self.wildcard_min_length += wildcard.min_count
                if self.wildcard_fixed is None:
                    self.wildcard_fixed = wildcard.fixed_size
                else:
                    self.wildcard_fixed = self.wildcard_fixed and wildcard.fixed_size
                continue

            if wildcard.fixed_size:
                self.fixed_variables[name] += 1
                symbol_type = getattr(wildcard, 'symbol_type', None)
                self._update_var_info(self.fixed_variable_infos, name,
                                      wildcard.min_count, symbol_type,
                                      wildcard.optional)
                if wildcard.optional is not None:
                    self.optional_count += 1
                else:
                    self.fixed_variable_length += wildcard.min_count
            else:
                self.sequence_variables[name] += 1
                self._update_var_info(self.sequence_variable_infos, name,
                                      wildcard.min_count, None,
                                      wildcard.optional)
                if wildcard.optional is None:
                    self.sequence_variable_min_length += wildcard.min_count

    @staticmethod
    def _update_var_info(infos, name, count, symbol_type=None, default=None):
        """Record a variable's info, or check it against the recorded one.

        The first occurrence of ``name`` stores a new ``VarInfo``. Every
        later occurrence must agree with it in minimum count, type and
        default; this is enforced with assertions.
        """
        if name in infos:
            known = infos[name]
            assert known.min_count == count
            assert known.type == symbol_type
            assert known.default == default
            return
        infos[name] = VarInfo(count, symbol_type, default)

    def __str__(self):
        """Render the parts as ``name(operand, operand, ...)``."""
        rendered = [str(item) for item in self.constant]
        rendered += [str(item) for item in self.syntactic]
        rendered += [str(item) for item in self.rest]

        # Variables are listed by name, once per occurrence.
        for variables in (self.sequence_variables, self.fixed_variables):
            for name, count in variables.items():
                rendered.extend([name] * count)

        head = getattr(self.operation, 'name', self.operation.__name__)
        return '{}({})'.format(head, ', '.join(rendered))
        if item[1] >= min_bound:
            # The 100 is needed because intersection keeps the smaller word count in the multiset.
            # We want the smaller count to be the number of tokens.
            count += 1
            common_words.add(item[0], 100)

    print('second part')
    # process the data
    clean_data = []
    for s in data:
        cleaner = s[1].intersection(common_words)
        clean_data.append([s[0], cleaner])

    print('third part')

    output_data = []
    for s, ms in clean_data:
        tokens = []
        for item in ms.items():
            for i in range(0, item[1]):
                tokens.append(item[0])

        output_data.append(s + [tokens])

    df = pd.DataFrame(
        output_data,
        columns=['band', 'album', 'year', 'song', 'genre', 'tokens'])

    df = df.drop(['band', 'album', 'year', 'song'], axis=1)
    df.to_csv('darklyrics-proc-tokens.csv', index=False)
Exemplo n.º 7
0
# Build per-word letter sets/multisets and lookup tables from letters to the
# indices of the words they apply to.
# NOTE(review): `letters`, `letters_set`, `words`, `N` and `MATCH_RANGE` are
# defined outside this excerpt; only indices 0, 1 and 2 of the outer list are
# used below, so MATCH_RANGE presumably has three entries -- confirm.
word_sets = {}   # word -> set of its letters
word_msets = {}  # word -> Multiset of its letters
# letter_lists[match][n][letter] is a list of word indices. As filled below:
#   match 0: words that do NOT have `letter` at position n
#   match 1: words that have `letter` at position n
#   match 2: words that contain `letter` exactly n times (n == 0: not at all)
letter_lists = [[{le: []
                  for le in letters} for n in range(N + 1)]
                for match in MATCH_RANGE]
for wi, w in enumerate(words):
    word_s = set(w)
    word_sets[w] = word_s
    word_ms = Multiset(w)
    word_msets[w] = word_ms
    for n, le1 in enumerate(w):
        # Every letter other than the one actually at position n.
        for le0 in letters_set.difference([le1]):
            letter_lists[0][n][le0].append(wi)
        letter_lists[1][n][le1].append(wi)
    # Here `n` is the multiplicity of the letter in the word, not a position.
    for le2, n in word_ms.items():
        letter_lists[2][n][le2].append(wi)
    # Letters that do not occur in the word have multiplicity 0.
    for le2 in letters_set.difference(word_s):
        letter_lists[2][0][le2].append(wi)

# Same tables with each index list converted to a set.
letter_sets = [[{le: set(letter_lists[match][n][le])
                 for le in letters} for n in range(N + 1)]
               for match in MATCH_RANGE]
n_words = len(words)
# Size of every set in letter_sets, with the same nesting.
letter_sets_len = \
    [[{le: len(letter_sets[match][n][le]) for le in letters} for n in range(N + 1)] for match in MATCH_RANGE]

# Counters keyed by (n, letter) for n in -1 .. N-1; only the n == -1 entries
# are incremented in this excerpt.
freqs = {(n, le): 0 for n in range(-1, N) for le in letters}
# NOTE(review): zipping over the dict `word_msets` iterates its keys, so `wd`
# is the word string itself, not the Multiset; every letter occurrence of
# every word is counted.
for w, wd in zip(words, word_msets):
    for le in wd:
        freqs[(-1, le)] += 1
Exemplo n.º 8
0
from multiset import Multiset

# Running tally of the decimal digits of all integers visited so far.
x = Multiset()

# NOTE(review): the loop covers 0 .. 10**10 inclusive, which is a very long
# run in pure Python -- confirm this is intended.
for i in range(10**10 + 1):
    # Add every character of the decimal representation of i to the tally
    # (assuming Multiset.update adds each element of the iterable).
    x.update(str(i))

    # Progress output every ten million iterations.
    if i % 10000000 == 0:
        print(i)

    for digit, count in x.items():

        # Report a digit whose total number of occurrences so far equals i.
        if count == i:
            print(digit, i)
Exemplo n.º 9
0
def ac_operand_lists(t1: Operation, t2: Operation)\
                    -> List[List[Tuple[Expression, Expression]]]:
    """Find all the sets of operand unification problems
    we can get from t1 and t2.

    Operands common to both terms are cancelled first. The remaining operands
    of each side are split into constants, terms and variables, and every
    admissible way of pairing them up is enumerated.

    Args:
        t1: First operation; its ``operands`` are read and its head (via
            ``get_head``) is used to wrap several operands assigned to one
            variable.
        t2: Second operation; its ``operands`` are read.

    Returns:
        A list with one entry per admissible pairing. Each entry is a list of
        ``(expression, expression)`` tuples that remain to be unified.

    Raises:
        NotImplementedError: If both sides contain a wildcard operand with
            multiplicity greater than one after cancelling common operands.
    """
    # Remove common operations
    t1_op_set = Multiset(t1.operands)
    t2_op_set = Multiset(t2.operands)
    common_ops = t1_op_set & t2_op_set
    t1_op_set -= common_ops
    t2_op_set -= common_ops

    # A wildcard that occurs more than once on a side.
    t1_duplicate_vars = any(
        isinstance(e, Wildcard) and n > 1 for e, n in t1_op_set.items())
    t2_duplicate_vars = any(
        isinstance(e, Wildcard) and n > 1 for e, n in t2_op_set.items())
    if t1_duplicate_vars and t2_duplicate_vars:
        raise (NotImplementedError(
            "Possible nontermination on this algo, dispatch slowward"))  # noqa
    elif t1_duplicate_vars or t2_duplicate_vars:
        print("Redundant solutions really gosh darn likely")

    ret = []

    op_function = get_head(t1)

    t1_ops = to_ac_operands(t1_op_set)
    t2_ops = to_ac_operands(t2_op_set)

    # Combined operand lists are laid out as consts, then terms, then vars;
    # all index arithmetic below relies on this layout.
    all_t1_ops = t1_ops.consts + t1_ops.terms + t1_ops.vars
    all_t2_ops = t2_ops.consts + t2_ops.terms + t2_ops.vars

    t1_n_consts = len(t1_ops.consts)
    t2_n_consts = len(t2_ops.consts)
    t1_n_terms = len(t1_ops.terms)
    t2_n_terms = len(t2_ops.terms)
    t1_n_vars = len(t1_ops.vars)
    t2_n_vars = len(t2_ops.vars)
    t1_n_ops = len(all_t1_ops)
    t2_n_ops = len(all_t2_ops)

    # Index of the first variable in the combined operand lists.
    t1_var_start = t1_n_ops - t1_n_vars
    t2_var_start = t2_n_ops - t2_n_vars

    # Indices idx where entry idx equals entry idx + 1, per category.
    # NOTE(review): this only detects duplicates that are adjacent, so it
    # presumably relies on to_ac_operands grouping equal operands -- confirm.
    t1_equal_consts = [
        idx for idx in range(0, t1_n_consts - 1)
        if t1_ops.consts[idx] == t1_ops.consts[idx + 1]
    ]
    t2_equal_consts = [
        idx for idx in range(0, t2_n_consts - 1)
        if t2_ops.consts[idx] == t2_ops.consts[idx + 1]
    ]
    t1_equal_terms = [
        idx for idx in range(0, t1_n_terms - 1)
        if t1_ops.terms[idx] == t1_ops.terms[idx + 1]
    ]
    t2_equal_terms = [
        idx for idx in range(0, t2_n_terms - 1)
        if t2_ops.terms[idx] == t2_ops.terms[idx + 1]
    ]
    t1_equal_vars = [
        idx for idx in range(0, t1_n_vars - 1)
        if t1_ops.vars[idx] == t1_ops.vars[idx + 1]
    ]
    t2_equal_vars = [
        idx for idx in range(0, t2_n_vars - 1)
        if t2_ops.vars[idx] == t2_ops.vars[idx + 1]
    ]

    # const_rows_true_idx holds, per t1 constant, an index into all_t2_ops
    # (it is zipped with t1_ops.consts below); the requested range
    # [t2_var_start, t2_n_ops) is the variable part of t2.
    # NOTE(review): the exact enumeration done by ints_walking_range and the
    # predicate some_pairs_sorted are defined elsewhere -- confirm semantics.
    for const_rows_true_idx in ints_walking_range(t2_var_start, t2_n_ops,
                                                  t1_n_consts):
        # Drop clear violations of the repeat property here
        if some_pairs_sorted(const_rows_true_idx, t1_equal_consts):
            continue

        # Same for the t2 constants, indexing into the variable part of t1.
        for const_cols_true_idx in ints_walking_range(t1_var_start, t1_n_ops,
                                                      t2_n_consts):
            if some_pairs_sorted(const_cols_true_idx, t2_equal_consts):
                continue

            # Per t1 term an index into the terms-or-variables part of t2.
            for term_rows_true_idx in ints_walking_range(
                    t2_n_consts, t2_n_ops, t1_n_terms):
                if some_pairs_sorted(term_rows_true_idx, t1_equal_terms):
                    continue

                # Per t2 term an index into the terms-or-variables part of t1.
                for term_cols_true_idx in ints_walking_range(
                        t1_n_consts, t1_n_ops, t2_n_terms):
                    if some_pairs_sorted(term_cols_true_idx, t2_equal_terms):
                        continue

                    # Term mismatch
                    # A t2 term paired with a t1 term (index below the
                    # variable range) requires that t1 term to be paired back
                    # with exactly this t2 term.
                    if any(row_nr < t1_var_start and (term_rows_true_idx[
                            row_nr - t1_n_consts] != rel_col_nr + t2_n_consts)
                           for rel_col_nr, row_nr in enumerate(
                               term_cols_true_idx)):
                        continue

                    # The same consistency check in the other direction.
                    if any(col_nr < t2_var_start and (term_cols_true_idx[
                            col_nr - t2_n_consts] != rel_row_nr + t1_n_consts)
                           for rel_row_nr, col_nr in enumerate(
                               term_rows_true_idx)):
                        continue

                    # t2 indices already chosen by some t1 constant or term,
                    # and t1 indices already chosen by some t2 constant or
                    # term.
                    set_cols = (set(const_rows_true_idx)
                                | set(term_rows_true_idx))
                    set_rows = (set(const_cols_true_idx)
                                | set(term_cols_true_idx))

                    # var_mat is indexed [t1 variable, t2 variable]; nonzero
                    # entries pair the two variables (see the np.nonzero loop
                    # below).
                    for var_mat in all_boolean_matrices(t1_n_vars, t2_n_vars):
                        # Filter out failures of unification
                        # A t1 variable paired with no t2 variable and not
                        # chosen by any t2 constant/term has nothing assigned.
                        if any(row_sum == 0 and raw_idx[0] +
                               t1_var_start not in set_rows
                               for raw_idx, row_sum in np.ndenumerate(
                                   np.sum(var_mat, axis=1))):
                            continue

                        # Likewise for t2 variables.
                        if any(col_sum == 0 and raw_idx[0] +
                               t2_var_start not in set_cols
                               for raw_idx, col_sum in np.ndenumerate(
                                   np.sum(var_mat, axis=0))):
                            continue

                        # Extra filter for adjacent equal variables.
                        # NOTE(review): presumably prunes assignments that are
                        # symmetric under swapping the equal variables; the
                        # helper is defined elsewhere -- confirm.
                        if any(
                                compare_equal_variable_vectors(
                                    i, var_mat[i, :], var_mat[i + 1, :],
                                    const_cols_true_idx, term_cols_true_idx)
                                for i in t1_equal_vars):
                            continue
                        if any(
                                compare_equal_variable_vectors(
                                    i, var_mat[:, i], var_mat[:, i + 1],
                                    const_rows_true_idx, term_rows_true_idx)
                                for i in t2_equal_vars):
                            continue

                        # Collect, per variable, everything assigned to it,
                        # plus the direct term/term pairs.
                        operand_tuples = []
                        t1_var_unifiers = defaultdict(
                            list
                        )  # type: DefaultDict[Expression, List[Expression]] # noqa: E501
                        t2_var_unifiers = defaultdict(
                            list
                        )  # type: DefaultDict[Expression, List[Expression]] # noqa: E501
                        for const, var_idx in zip(t1_ops.consts,
                                                  const_rows_true_idx):
                            var = all_t2_ops[var_idx]
                            t2_var_unifiers[var].append(const)
                        for const, var_idx in zip(t2_ops.consts,
                                                  const_cols_true_idx):
                            var = all_t1_ops[var_idx]
                            t1_var_unifiers[var].append(const)
                        for term, var_idx in zip(t1_ops.terms,
                                                 term_rows_true_idx):
                            expr = all_t2_ops[var_idx]
                            if isinstance(expr, Wildcard):
                                t2_var_unifiers[expr].append(term)
                            else:
                                operand_tuples.append((term, expr))
                        for term, var_idx in zip(t2_ops.terms,
                                                 term_cols_true_idx):
                            expr = all_t1_ops[var_idx]
                            if isinstance(expr, Wildcard):
                                t1_var_unifiers[expr].append(term)
                            # Else case handled above

                        # Each nonzero matrix entry pairs a t1 variable with
                        # a t2 variable, recorded in both directions.
                        for idxs in np.transpose(np.nonzero(var_mat)):
                            row = t1_ops.vars[idxs[0]]
                            col = t2_ops.vars[idxs[1]]
                            t1_var_unifiers[row].append(col)
                            t2_var_unifiers[col].append(row)
                        # A variable with several assigned operands is paired
                        # with those operands wrapped in t1's head.
                        for d in [t1_var_unifiers, t2_var_unifiers]:
                            for var, ops in d.items():
                                if len(ops) == 1:
                                    operand_tuples.append((var, ops[0]))
                                else:
                                    operand_tuples.append(
                                        (var, op_function(*ops)))

                        ret.append(operand_tuples)
    return ret