def build_vocabulary(this_wordcount, extra_words=EXTRA_WORDS,
        is_reset=True, truncate_to_most_frequent=0):
    """
    Builds vocabulary from wordcount.
    It also adds extra words to the vocabulary.

    In:
        this_wordcount - dictionary of wordcounts, e.g. {'cpu':3}
        extra_words - additional words to add to the vocabulary,
            dictionary of {word: id};
            by default EXTRA_WORDS; an empty dictionary or None adds
            nothing
        is_reset - if True the index counter (_myinc.counter) is
            restarted at len(EXTRA_WORDS) before the words are indexed;
            by default True
        truncate_to_most_frequent - if positive then the vocabulary
            is truncated to the 'truncate_to_most_frequent' words with
            the highest counts;
            by default 0 (no truncation)

    Out:
        word2index - mapping from words to indices
        index2word - mapping from indices to words

    Raises:
        AssertionError - if an id in extra_words is already used by one
            of the indexed words
    """
    if is_reset:
        # NOTE: the counter restarts just past the module-level EXTRA_WORDS,
        # regardless of which extra_words were passed in.
        _myinc.counter = len(EXTRA_WORDS)
    if truncate_to_most_frequent > 0:
        # sorted() is stable, so words with equal counts keep their
        # original relative order when the cut is made.
        most_frequent = sorted(
            this_wordcount.items(), key=lambda item: item[1],
            reverse=True)[:truncate_to_most_frequent]
        this_wordcount = dict(most_frequent)

    # _myinc is applied to every (word, count) item; presumably it returns
    # (word, next free index) -- its definition is not visible here.
    word2index = itemmap(_myinc, this_wordcount)
    if extra_words:
        # An extra word must not reuse an index that was just handed out.
        used_indices = set(word2index.values())
        assert used_indices.isdisjoint(extra_words.values())
        word2index.update(extra_words)
    index2word = itemmap(reversed, word2index)
    return word2index, index2word
Пример #2
0
def fmap(keys: Collection, funcs: Collection[Callable],
         data_dict: Dict, val_as_args: bool = False) -> Dict:
    """
    Apply per-key functions to the values of a dictionary.

    ``keys`` and ``funcs`` are paired positionally: the i-th function is
    applied to the value stored under the i-th key. Entries whose key has
    no paired function (not listed in ``keys``, or listed past the end of
    ``funcs``) are copied unchanged. Surplus functions are ignored.

    :param keys: an iterable of hashable keys of ``data_dict``
    :param funcs: an iterable of callables, aligned with ``keys``
    :param data_dict: a data dictionary
    :param val_as_args: if True each value is unpacked into positional
                        arguments (``func(*value)``); otherwise it is
                        passed as a single argument (``func(value)``)
    :return: a new data dictionary with the same keys as ``data_dict``
    """
    # Pair with zip() instead of zip_longest() + identity fill: the identity
    # fill was called as identity(*value) when val_as_args is set, which
    # fails for any value that does not hold exactly one element.
    func_map = dict(zip(keys, funcs))

    def _apply_func(item):
        k, v = item
        if k not in func_map:
            return k, v
        func = func_map[k]
        return k, (func(*v) if val_as_args else func(v))

    return dict(map(_apply_func, data_dict.items()))
Пример #3
0
def get_required_fields(link: ParseResult) -> Dict[str, str]:
    """Decode the required fields stored in the netloc part of ``link``.

    The netloc is split on ``':'``. For every ``(name, position)`` item of
    ``REQUIRED_FIELDS_POSITION`` the part at ``position`` is handed to
    ``decode_field`` together with ``name``; the results are collected into
    an ``OrderedDict``.
    """
    netloc_parts = link.netloc.split(':')

    def _decode_item(item):
        name, position = item
        # decode_field is expected to return a (key, value) pair, since
        # itemmap builds the resulting mapping from what it returns.
        return decode_field(name, netloc_parts[position])

    return itemmap(_decode_item, REQUIRED_FIELDS_POSITION, OrderedDict)
Пример #4
0
def invert(d):
    """Inverts a dictionary from key->value mapping to a
    value->key mapping. The values being switched to keys must be hashable.

    When several keys share the same value, the key that comes last in
    iteration order is the one kept.

    >>> invert({'ashley': 6, 'timothy': 15})
    {6: 'ashley', 15: 'timothy'}
    """
    return {value: key for key, value in d.items()}
Пример #5
0
def get_table_lines(dicts, max_width: int or dict = 50, keys=None):
    """Return list of lines formatted as table.

    Every dict in ``dicts`` is first reduced to ``keys`` (or to all of its
    own keys when ``keys`` is falsy); a key missing from a dict yields
    ``None``. Each item is then passed through ``cut_values`` with the given
    ``max_width``, the rows are rendered by ``tabulate`` in "github" format
    with the dict keys as headers, and the rendered text is handed to
    ``get_unique_lines``.
    """
    # NOTE(review): the annotation `int or dict` evaluates to plain `int`;
    # it is kept as-is so the signature stays unchanged.
    selected_rows = []
    for record in dicts:
        wanted = keys or record
        selected_rows.append({key: record.get(key) for key in wanted})

    truncate_item = partial(cut_values, max_width=max_width)
    rows = [toolz.itemmap(truncate_item, row) for row in selected_rows]

    rendered = tabulate.tabulate(rows, headers="keys", tablefmt="github")
    return get_unique_lines(rendered)
Пример #6
0
def invert_with(f, d):
    """Inverts a dictionary from key->value mapping to a
    value->key mapping with some transform on the old values.
    The new keys must be hashable, per dictionary requirements.

    The transforming function needs to accept only the value as an
    argument. When several old values transform to the same new key, the
    entry that comes last in iteration order is the one kept.

    >>> invert_with(sum, {'ashley': [1,2,3], 'timothy': [4,5,6]})
    {6: 'ashley', 15: 'timothy'}
    """
    return {f(value): key for key, value in d.items()}
def build_vocabulary(this_wordcount,
                     extra_words=EXTRA_WORDS,
                     is_reset=True,
                     truncate_to_most_frequent=0):
    """
    Builds vocabulary from wordcount.
    It also adds extra words to the vocabulary.

    In:
        this_wordcount - dictionary of wordcounts, e.g. {'cpu':3}
        extra_words - additional words to add to the vocabulary,
            dictionary of {word: id};
            by default EXTRA_WORDS; an empty dictionary or None adds
            nothing
        is_reset - if True the index counter (_myinc.counter) is
            restarted at len(EXTRA_WORDS) before the words are indexed;
            by default True
        truncate_to_most_frequent - if positive then the vocabulary
            is truncated to the 'truncate_to_most_frequent' words with
            the highest counts;
            by default 0 (no truncation)

    Out:
        word2index - mapping from words to indices
        index2word - mapping from indices to words

    Raises:
        AssertionError - if an id in extra_words is already used by one
            of the indexed words
    """
    if is_reset:
        # NOTE: the restart value comes from the module-level EXTRA_WORDS,
        # not from the extra_words argument.
        _myinc.counter = len(EXTRA_WORDS)
    if truncate_to_most_frequent > 0:
        # Stable sort: equally frequent words keep their original order,
        # which decides who survives the cut.
        by_frequency = sorted(this_wordcount.items(),
                              key=lambda item: item[1],
                              reverse=True)
        this_wordcount = dict(by_frequency[:truncate_to_most_frequent])

    # _myinc receives each (word, count) item; presumably it hands back
    # (word, next free index) -- its definition is not visible here.
    word2index = itemmap(_myinc, this_wordcount)
    if extra_words:
        # Reserved ids must not collide with freshly assigned indices.
        taken = set(word2index.values())
        assert taken.isdisjoint(extra_words.values())
        word2index.update(extra_words)
    index2word = itemmap(reversed, word2index)
    return word2index, index2word
def build_graph(rules: List[str]) -> Tuple[MultiDiGraph, dict]:
    """
    Build the bag graph described by ``rules``.

    Design note: instead of one colored node per individual bag, every
    *type* of bag is a single node, and the relations between bag types are
    expressed as multiple edges between those nodes. This is probably not
    the best representation, but it does work for these use cases.

    Returns the graph together with the mapping from the first element of
    each parsed rule (the bag color) to its node index.
    """
    # Node index -> first element of the parsed rule, in rule order.
    index_to_color = {
        index: parse(rule)[0] for index, rule in enumerate(rules)
    }
    color_to_node = itemmap(reversed, index_to_color)

    graph = add_nodes(MultiDiGraph(), index_to_color)
    # Fold every rule into the graph; parse_edges gets the color lookup
    # bound as its first argument.
    add_rule_edges = partial(parse_edges, color_to_node)
    graph = reduce(add_rule_edges, rules, graph)
    return (graph, color_to_node)
Пример #9
0
def frename(keys: Collection, data_dict: Dict) -> Dict:
    """
    Rename the keys of a dictionary according to a mapping.

    :param keys: lookup of old key -> new key; must support ``in`` and
                 indexing by old key
    :param data_dict: the dictionary whose keys are renamed
    :return: a new dictionary; keys not found in ``keys`` are kept as-is
    """
    def _rename(pair):
        old_key, value = pair
        new_key = keys[old_key] if old_key in keys else old_key
        return new_key, value

    renamed = itemmap(_rename, data_dict)
    return dict(renamed)
Пример #10
0
# dtype name (key) -> column type name (value).
# NOTE(review): the PD2CH / CH2PD names suggest this is a pandas <-> ClickHouse
# type table -- confirm against the rest of the module.
MAPPING = {'object': 'String',
           'uint64': 'UInt64',
           'uint32': 'UInt32',
           'uint16': 'UInt16',
           'uint8': 'UInt8',
           'float64': 'Float64',
           'float32': 'Float32',
           'int64': 'Int64',
           'int32': 'Int32',
           'int16': 'Int16',
           'int8': 'Int8',
           'datetime64[D]': 'Date',
           'datetime64[ns]': 'DateTime'}

# Same table, with every key passed through np.dtype.
PD2CH = keymap(np.dtype, MAPPING)
# Inverse table: column type name -> dtype name.
CH2PD = itemmap(reversed, MAPPING)
# Two extra column type names that both map to 'object'.
CH2PD['Null'] = 'object'
CH2PD['Nothing'] = 'object'

# Column types that also get a 'Nullable(<type>)' entry below.
NULLABLE_COLS = ['UInt64', 'UInt32', 'UInt16', 'UInt8', 'Float64', 'Float32',
                 'Int64', 'Int32', 'Int16', 'Int8', 'String', 'DateTime']

# 'Nullable(X)' resolves to the same dtype name as plain 'X'.
for col in NULLABLE_COLS:
    CH2PD['Nullable({})'.format(col)] = CH2PD[col]
# True when the major version of the running interpreter is 3.
PY3 = sys.version_info[0] == 3


def normalize(df, index=True):
    """Rebind ``df`` to ``df.reset_index()`` when ``index`` is truthy.

    NOTE(review): no further statements and no ``return`` are visible for
    this function, so as shown it returns ``None``; the definition looks
    truncated -- confirm against the full source.
    """
    if index:
        df = df.reset_index()
__all__ = [
    'build_vocabulary',
    'index_sequence',
    'encode_questions_index',
    'encode_questions_one_hot',
    'encode_answers_one_hot',
]

###
# Constants
###
# Special tokens that are always part of the vocabulary.
PADDING = '<pad>'
UNKNOWN = '<unk>'
EOA = '<eoa>'  # end of answer
EOQ = '<eoq>'  # end of question
EXTRA_WORDS_NAMES = [PADDING, UNKNOWN, EOA, EOQ]
# Each special token's id is its position in EXTRA_WORDS_NAMES.
EXTRA_WORDS = {name: index for index, name in enumerate(EXTRA_WORDS_NAMES)}
# Inverse lookup: id -> special token.
EXTRA_WORDS_ID = itemmap(reversed, EXTRA_WORDS)


###
# Functions
###
def static_vars(**kwargs):
    """Return a decorator that sets every keyword argument as an attribute
    on the decorated function and returns that same function.

    Used to give a function 'static' variables, e.g.
    ``@static_vars(counter=0)`` makes ``func.counter`` available.
    """
    def decorate(func):
        for name, value in kwargs.items():
            setattr(func, name, value)
        return func

    return decorate


@static_vars(counter=len(EXTRA_WORDS))
__all__ = [
    'build_vocabulary',
    'index_sequence',
    'encode_questions_index',
    'encode_questions_one_hot',
    'encode_answers_one_hot',
]

###
# Constants
###
# Reserved vocabulary tokens.
PADDING = '<pad>'
UNKNOWN = '<unk>'
EOA = '<eoa>'       # end of answer
EOQ = '<eoq>'       # end of question
EXTRA_WORDS_NAMES = [PADDING, UNKNOWN, EOA, EOQ]
# Reserved tokens are numbered 0..3 in the order they are listed above.
EXTRA_WORDS = dict(zip(EXTRA_WORDS_NAMES, range(len(EXTRA_WORDS_NAMES))))
# Reverse table: id -> reserved token.
EXTRA_WORDS_ID = itemmap(reversed, EXTRA_WORDS)

###
# Functions
###
def static_vars(**kwargs):
    """Decorator factory: attach each ``name=value`` keyword argument to the
    decorated function as the attribute ``name``.

    The decorated function itself is returned (it is not wrapped).
    """
    def decorate(func):
        for attr_name, attr_value in kwargs.items():
            setattr(func, attr_name, attr_value)
        return func
    return decorate


@static_vars(counter=len(EXTRA_WORDS))
def _myinc(d):
    """
Пример #13
0
def get_extra_params(link: ParseResult) -> Dict[str, str]:
    """Decode the query-string parameters of ``link``.

    The query is parsed with ``parse_qs``, ``first`` is applied to each
    parameter's value, and every resulting ``(name, value)`` item is passed
    to ``decode_field`` as two positional arguments.
    """
    first_values = valmap(first, parse_qs(link.query))

    def _decode_item(item):
        # decode_field is expected to return a (key, value) pair, since
        # itemmap builds the resulting mapping from what it returns.
        return decode_field(*item)

    return itemmap(_decode_item, first_values)
Пример #14
0
def json2list(filter_fn, map_fn, dictionary):
    """Filter and map dictionary in succession to obtain list.

    ``filter_fn`` selects the items of ``dictionary`` to keep and ``map_fn``
    turns each kept item into one element of the returned list. Both are
    wrapped with ``star`` before being applied to a ``(key, value)`` item
    (presumably so they are called as ``fn(key, value)`` -- ``star`` is
    defined elsewhere).

    Returns a list with one entry per kept item, in dictionary order.
    """
    filtered = toolz.itemfilter(star(filter_fn), dictionary)
    # toolz.itemmap needs `factory` to build a mutable mapping, so asking it
    # for a list cannot work; map over the kept items directly instead.
    return list(map(star(map_fn), filtered.items()))