Example #1
def build_batchers(word2id, cuda, debug):
    prepro = prepro_fn(args.max_art, args.max_abs)
    def sort_key(sample):
        src, target = sample
        return (len(target), len(src))
    batchify = compose(
        batchify_fn_copy(PAD, START, END, cuda=cuda),
        convert_batch_copy(UNK, word2id)
    )

    train_loader = DataLoader(
        MatchDataset('train'), batch_size=BUCKET_SIZE,
        shuffle=not debug,
        num_workers=4 if cuda and not debug else 0,
        collate_fn=coll_fn
    )
    train_batcher = BucketedGenerater(train_loader, prepro, sort_key, batchify,
                                      single_run=False, fork=not debug)

    val_loader = DataLoader(
        MatchDataset('val'), batch_size=BUCKET_SIZE,
        shuffle=False, num_workers=4 if cuda and not debug else 0,
        collate_fn=coll_fn
    )
    val_batcher = BucketedGenerater(val_loader, prepro, sort_key, batchify,
                                    single_run=True, fork=not debug)
    return train_batcher, val_batcher
Example #2
def sort_prioritized_configs(backend_configs, master_config):
    resolved_backend_configs = tuple(
        (
            backend_name,
            resolve_config(backend_configs.get_config(backend_name), master_config),
        )
        for backend_name
        in backend_configs
    )
    backends_with_conflicting_priorities = tuple((
        backend_name
        for backend_name, count
        in collections.Counter((
            (backend_name, config['priority'])
            for backend_name, config
            in resolved_backend_configs
        )).items()
        if count > 1
    ))
    if backends_with_conflicting_priorities:
        raise ValueError(
            "The following package backends have conflicting priority "
            "values.  '{0}'.  Ensure that all priority values are unique "
            "across all backends.".format(
                ', '.join((backends_with_conflicting_priorities))
            )
        )

    return sorted(
        resolved_backend_configs,
        key=compose(
            operator.itemgetter('priority'),
            operator.itemgetter(1),
        ),
    )
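
The sort key above reads the priority through two chained lookups: because compose applies its arguments right to left, itemgetter(1) first picks the config out of each (backend_name, config) pair, then itemgetter('priority') reads its value. A minimal, self-contained sketch of that ordering, using toolz.compose and invented backend data:

import operator
from toolz import compose  # cytoolz.compose behaves the same way

# Toy (backend_name, config) pairs, made up for illustration only.
pairs = [('b', {'priority': 2}), ('a', {'priority': 1})]

# The rightmost function runs first: pick the config, then read its priority.
by_priority = compose(operator.itemgetter('priority'), operator.itemgetter(1))
assert sorted(pairs, key=by_priority) == [('a', {'priority': 1}),
                                          ('b', {'priority': 2})]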
Example #3
def build_batchers(net_type, word2id, cuda, debug):
    assert net_type in ['ff', 'rnn']
    prepro = prepro_fn_extract(args.max_word, args.max_sent)
    def sort_key(sample):
        src_sents, _ = sample
        return len(src_sents)
    batchify_fn = (batchify_fn_extract_ff if net_type == 'ff'
                   else batchify_fn_extract_ptr)
    convert_batch = (convert_batch_extract_ff if net_type == 'ff'
                     else convert_batch_extract_ptr)
    batchify = compose(batchify_fn(PAD, cuda=cuda),
                       convert_batch(UNK, word2id))

    train_loader = DataLoader(
        ExtractDataset('train'), batch_size=BUCKET_SIZE,
        shuffle=not debug,
        num_workers=4 if cuda and not debug else 0,
        collate_fn=coll_fn_extract
    )
    train_batcher = BucketedGenerater(train_loader, prepro, sort_key, batchify,
                                      single_run=False, fork=not debug)

    val_loader = DataLoader(
        ExtractDataset('val'), batch_size=BUCKET_SIZE,
        shuffle=False, num_workers=4 if cuda and not debug else 0,
        collate_fn=coll_fn_extract
    )
    val_batcher = BucketedGenerater(val_loader, prepro, sort_key, batchify,
                                    single_run=True, fork=not debug)
    return train_batcher, val_batcher
Example #4
    def _iter(self, usecols=None):

        # get the date column [(name, type)] pairs
        datecols = list(map(first, get_date_columns(self.schema)))

        # figure out which ones pandas needs to parse
        parse_dates = ordered_index(datecols, self.schema)
        if usecols is not None:
            parse_dates = [d for d in parse_dates if d in set(usecols)]

        reader = self.iterreader(parse_dates=parse_dates, usecols=usecols,
                                 squeeze=True)

        # pop one off the iterator
        initial = next(iter(reader))

        # get our names and initial dtypes for later inference
        if isinstance(initial, pd.Series):
            names = [str(initial.name)]
            formats = [initial.dtype]
        else:
            if usecols is None:
                index = slice(None)
            else:
                index = initial.columns.get_indexer(usecols)
            names = list(map(str, initial.columns[index]))
            formats = initial.dtypes[index].tolist()

        initial_dtype = np.dtype({'names': names, 'formats': formats})

        # what dtype do we actually want to see when we read
        streaming_dtype = self.get_streaming_dtype(initial_dtype)

        # everything must ultimately be a list of tuples
        m = partial(bz.into, list)

        slicerf = lambda x: x.replace('', np.nan)

        if isinstance(initial, pd.Series):
            streaming_dtype = streaming_dtype[first(streaming_dtype.names)]

        if streaming_dtype != initial_dtype:
            # we don't have the desired type so jump through hoops with
            # to_records -> astype(desired dtype) -> listify
            def mapper(x, dtype=streaming_dtype):
                r = slicerf(x)

                try:
                    r = r.to_records(index=False)
                except AttributeError:
                    # We have a series
                    r = r.values
                return m(r.astype(dtype))
        else:
            mapper = compose(m, slicerf)

        # convert our initial NDFrame to a list
        return it.chain(mapper(initial),
                        it.chain.from_iterable(map(mapper, reader)))
Example #5
def get_predictions(location_name, key):

    __get_predictions = tlz.compose(
        clean_data,
        helpers.pull_json,
        predictions_url)

    return __get_predictions(location_name, key)
Example #6
def build_batchers(net_type, word2id, cuda, debug, use_bert, bert_tokenizer):
    assert net_type in ['ff', 'rnn']
    def sort_key(sample):
        src_sents, _ = sample
        return len(src_sents)

    if not use_bert:
        prepro = prepro_fn_extract(args.max_word, args.max_sent)
        batchify_fn = (batchify_fn_extract_ff if net_type == 'ff'
                       else batchify_fn_extract_ptr)
        convert_batch = (convert_batch_extract_ff if net_type == 'ff'
                         else convert_batch_extract_ptr)
        batchify = compose(batchify_fn(PAD, cuda=cuda),
                           convert_batch(UNK, word2id))

    else:
        # prepro = prepro_fn_extract(args.max_word, args.max_sent)
        # batchify_fn = batchify_fn_bert_extract_ptr2
        # convert_batch = convert_batch_bert_extract_ptr2
        # batchify = compose(batchify_fn(bert_tokenizer.pad_token_id, cuda=cuda),
        #                 convert_batch(bert_tokenizer))

        prepro = prepro_fn_identity
        batchify_fn = batchify_fn_bert_extract_ptr2
        convert_batch = convert_batch_bert_extract_ptr3
        batchify = compose(batchify_fn(bert_tokenizer.pad_token_id, cuda=cuda),
                           convert_batch(bert_tokenizer, max_len=args.max_word, max_sent=args.max_sent))


    train_loader = DataLoader(
        ExtractDataset('train'), batch_size=BUCKET_SIZE,
        shuffle=not debug,
        num_workers=4 if cuda and not debug else 0,
        collate_fn=coll_fn_extract
    )
    train_batcher = BucketedGenerater(train_loader, prepro, sort_key, batchify,
                                      single_run=False, fork=not debug)

    val_loader = DataLoader(
        ExtractDataset('val'), batch_size=BUCKET_SIZE,
        shuffle=False, num_workers=4 if cuda and not debug else 0,
        collate_fn=coll_fn_extract
    )
    val_batcher = BucketedGenerater(val_loader, prepro, sort_key, batchify,
                                    single_run=True, fork=not debug)
    return train_batcher, val_batcher
Example #7
def build_batchers(net_type, word2id, cuda, debug):
    assert net_type in ['ff', 'rnn', 'trans_rnn']
    prepro = prepro_fn_extract(args.max_word, args.max_sent)

    def sort_key(sample):
        src_sents, _ = sample
        return len(src_sents)

    if net_type == 'trans_rnn':
        prepro = prepro_fn_extract_trans(args.max_word, args.max_sent)
        batchify = compose(batchify_fn_extract_trans(cuda=cuda),
                           convert_batch_extract_trans)
    else:
        prepro = prepro_fn_extract(args.max_word, args.max_sent)
        batchify_fn = (batchify_fn_extract_ff
                       if net_type == 'ff' else batchify_fn_extract_ptr)
        convert_batch = (convert_batch_extract_ff
                         if net_type == 'ff' else convert_batch_extract_ptr)
        batchify = compose(batchify_fn(PAD, cuda=cuda),
                           convert_batch(UNK, word2id))

    train_loader = DataLoader(ExtractDataset('train'),
                              batch_size=BUCKET_SIZE,
                              shuffle=not debug,
                              num_workers=4 if cuda and not debug else 0,
                              collate_fn=coll_fn_extract)
    train_batcher = BucketedGenerater(train_loader,
                                      prepro,
                                      sort_key,
                                      batchify,
                                      single_run=False,
                                      fork=not debug)

    val_loader = DataLoader(ExtractDataset('val'),
                            batch_size=BUCKET_SIZE,
                            shuffle=False,
                            num_workers=4 if cuda and not debug else 0,
                            collate_fn=coll_fn_extract)
    val_batcher = BucketedGenerater(val_loader,
                                    prepro,
                                    sort_key,
                                    batchify,
                                    single_run=True,
                                    fork=not debug)
    return train_batcher, val_batcher
Example #8
def hash_key(args, kwargs):
    # return (args, hash(frozenset(kwargs.items())))
    # return (map(make_hashable, args), frozenset(kwargs.items()))
    args = tuple(map(make_hashable, args))
    kwargs = frozenset(
        map(compose(tuple, partial(map, make_hashable)), kwargs.items()))
    # print('args', args)
    # print('kwargs', kwargs)
    return (args, kwargs)
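
hash_key builds a hashable cache key by mapping make_hashable over the positional arguments and over each (name, value) pair of the keyword arguments. A rough sketch of that behaviour with a toy make_hashable (the real one is not shown in the snippet, so this stand-in only converts lists to tuples):

from functools import partial
from toolz import compose

def make_hashable(obj):
    # Stand-in for the snippet's make_hashable: just make lists hashable.
    return tuple(obj) if isinstance(obj, list) else obj

args = ([1, 2], 'x')
kwargs = {'opts': [3, 4]}
key_args = tuple(map(make_hashable, args))
key_kwargs = frozenset(
    map(compose(tuple, partial(map, make_hashable)), kwargs.items()))
assert key_args == ((1, 2), 'x')
assert key_kwargs == frozenset({('opts', (3, 4))})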
Example #9
 def compute(self, df):
     if isinstance(df, pd.Series):
         # Col() is NOT MEANT TO BE USED DIRECTLY ON SERIES -- IT IS NOT TESTED OR SUPPORTED
         # This is only here to support the case of scalar loc/iloc access in with_column
         # TODO: find a better solution
         col = df.loc[self.spec]
     else:
         col = df.loc[:, self.spec]
     if not self.fns:
         return col
     return tz.compose(*reversed(self.fns))(col)
Example #10
    def normalize_result(msg: Tuple[BlockBody, ...]) -> BlockBodyBundles:
        uncles_hashes = tuple(
            map(compose(keccak, rlp.encode),
                tuple(body.uncles for body in msg)))
        transaction_roots_and_trie_data = tuple(
            map(make_trie_root_and_nodes,
                tuple(body.transactions for body in msg)))

        body_bundles = tuple(
            zip(msg, transaction_roots_and_trie_data, uncles_hashes))
        return body_bundles
Example #11
def zhsent_preprocess(s):
    s = strQ2B(s)
    # zh_chars = ' '.join(nltk.word_tokenize(zh_chars))

    s, orig_number_strs = find_n_replace_numbers(s)
    s, orig_latin_strs = find_n_replace_latins(s)

    restore_num = partial(restore_place_holder, place_holder="{{CD}}", orig_strs=orig_number_strs)
    restore_latin = partial(restore_place_holder, place_holder="{{FW}}", orig_strs=orig_latin_strs)
    restore_all_place_holder = compose(restore_latin, restore_num)
    return s, restore_all_place_holder
Example #12
def test_gaussian_GFE_entropy_gradient():
    num_units = 5
    lay = layers.GaussianLayer(num_units)

    lay.params.loc[:] = be.rand_like(lay.params.loc)
    lay.params.log_var[:] = be.randn(be.shape(lay.params.loc))

    from cytoolz import compose
    sum_square = compose(be.tsum, be.square)

    for itr in range(10):
        mag = lay.get_random_magnetization()
        lms = lay.lagrange_multipliers_analytic(mag)
        entropy = lay.TAP_entropy(mag)
        lr = 0.001
        gogogo = True
        grad = lay.TAP_magnetization_grad(mag, [], [], [])
        grad_mag = math.sqrt(be.float_scalar(be.accumulate(sum_square, grad)))
        normit = partial(be.tmul_, be.float_scalar(1.0/grad_mag))
        be.apply_(normit, grad)
        rand_grad = lay.get_random_magnetization()
        grad_mag = math.sqrt(be.float_scalar(be.accumulate(sum_square, rand_grad)))
        normit = partial(be.tmul_, be.float_scalar(1.0/grad_mag))
        be.apply_(normit, rand_grad)
        while gogogo:
            cop1_mag = deepcopy(mag)
            cop1_lms = deepcopy(lms)
            cop2_mag = deepcopy(mag)
            cop2_lms = deepcopy(lms)

            cop1_mag.mean[:] = mag.mean + lr * grad.mean
            cop2_mag.mean[:] = mag.mean + lr * rand_grad.mean
            cop1_mag.variance[:] = mag.variance + lr * grad.variance
            cop2_mag.variance[:] = mag.variance + lr * rand_grad.variance
            lay.clip_magnetization_(cop1_mag)
            lay.clip_magnetization_(cop2_mag)
            cop1_lms = lay.lagrange_multipliers_analytic(cop1_mag)
            cop2_lms = lay.lagrange_multipliers_analytic(cop2_mag)

            entropy_1 = lay.TAP_entropy(cop1_mag)
            entropy_2 = lay.TAP_entropy(cop2_mag)

            regress = entropy_1 - entropy_2 < 0.0
            #print(itr, "[",lr, "] ", entropy, entropy_1, entropy_2, regress)
            if regress:
                #print(grad, rand_grad)
                if lr < 1e-6:
                    assert False,\
                    "Gaussian GFE magnetization gradient is wrong"
                    break
                else:
                    lr *= 0.5
            else:
                break
Example #13
def pop_nested_key(config, key):
    key_head, _, key_tail = key.rpartition('.')

    head_getters = (operator.itemgetter(key_part)
                    for key_part in key_head.split('.') if key_part)
    tail_popper = operator.methodcaller('pop', key_tail)

    popper_fn = compose(
        *reversed(tuple(itertools.chain(head_getters, (tail_popper, )))))

    return popper_fn(config)
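
The dotted-key helpers in this and the next example rely on the same trick: build one itemgetter (or methodcaller) per path segment and compose them in reverse, so the leftmost segment is applied first. A small sketch of just the traversal, with an invented config dict:

import operator
from toolz import compose

config = {'database': {'primary': {'host': 'localhost'}}}

# One getter per segment; reversed() makes compose apply them left to right.
getters = [operator.itemgetter(part)
           for part in 'database.primary.host'.split('.')]
walk = compose(*reversed(getters))
assert walk(config) == 'localhost'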
Example #14
def set_nested_key(config, key, value):
    key_head, _, key_tail = key.rpartition('.')

    head_setters = (operator.methodcaller('setdefault', key_part, {})
                    for key_part in key_head.split('.') if key_part)
    tail_setter = operator.methodcaller('__setitem__', key_tail, value)

    setter_fn = compose(
        *reversed(tuple((itertools.chain(head_setters, (tail_setter, ))))))

    # must write to both the config_for_read and config_for_write
    return setter_fn(config)
Example #15
def defunct_hash_message(primitive=None, hexstr=None, text=None):
    '''
    Convert the provided message into a message hash, to be signed.
    This provides the same prefix and hashing approach as
    :meth:`w3.eth.sign() <web3.eth.Eth.sign>`. That means that the
    message will automatically be prepended with text
    defined in EIP-191 as version 'E': ``b'\\x19Ethereum Signed Message:\\n'``
    concatenated with the number of bytes in the message.

    Awkwardly, the number of bytes in the message is encoded in decimal ascii. So
    if the message is 'abcde', then the length is encoded as the ascii
    character '5'. This is one of the reasons that this message format is not preferred.
    There is ambiguity when the message '00' is encoded, for example.
    Only use this method if you must have compatibility with
    :meth:`w3.eth.sign() <web3.eth.Eth.sign>`.

    Supply exactly one of the three arguments:
    bytes, a hex string, or a unicode string.

    :param primitive: the binary message to be signed
    :type primitive: bytes or int
    :param str hexstr: the message encoded as hex
    :param str text: the message as a series of unicode characters (a normal Py3 str)
    :returns: The hash of the message, after adding the prefix
    :rtype: ~hexbytes.main.HexBytes

    .. code-block:: python

        >>> from newchain_account.messages import defunct_hash_message

        >>> msg = "I♥SF"
        >>> defunct_hash_message(text=msg)
        HexBytes('0x1476abb745d423bf09273f1afd887d951181d25adc66c4834a70491911b7f750')

        # these four also produce the same hash:
        >>> defunct_hash_message(w3.toBytes(text=msg))
        HexBytes('0x1476abb745d423bf09273f1afd887d951181d25adc66c4834a70491911b7f750')

        >>> defunct_hash_message(bytes(msg, encoding='utf-8'))
        HexBytes('0x1476abb745d423bf09273f1afd887d951181d25adc66c4834a70491911b7f750')

        >>> Web3.toHex(text=msg)
        '0x49e299a55346'
        >>> defunct_hash_message(hexstr='0x49e299a55346')
        HexBytes('0x1476abb745d423bf09273f1afd887d951181d25adc66c4834a70491911b7f750')

        >>> defunct_hash_message(0x49e299a55346)
        HexBytes('0x1476abb745d423bf09273f1afd887d951181d25adc66c4834a70491911b7f750')
    '''
    message_bytes = to_bytes(primitive, hexstr=hexstr, text=text)
    recovery_hasher = compose(HexBytes, keccak, signature_wrapper)
    return recovery_hasher(message_bytes)
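
Under the hood, the EIP-191 version 'E' scheme described in the docstring is just keccak over the prefix, the decimal byte length, and the message itself. A minimal sketch of that hashing, assuming eth_utils is available (the snippet's own signature_wrapper is not shown here):

from eth_utils import keccak  # assumption: eth_utils is installed

def eip191_personal_hash(message: bytes) -> bytes:
    # b'\x19Ethereum Signed Message:\n' + decimal length, then the message;
    # e.g. for b'abcde' the hashed input ends in ...b'\n5abcde'.
    prefix = b'\x19Ethereum Signed Message:\n' + str(len(message)).encode('ascii')
    return keccak(prefix + message)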
Example #16
 def interleave(
     cls,
     datasets: List[Dataset],
     identifier: Identifier,
 ) -> Dataset:
     """Interleave a list of datasets."""
     return cls.from_batch(
         tz.merge_with(
             tz.compose(list, tz.interleave),
             *[dataset[:] for dataset in datasets],
         ),
         identifier=identifier,
     )
Example #17
 def chain(
     cls,
     datasets: List[Dataset],
     identifier: Identifier,
 ) -> Dataset:
     """Chain a list of datasets."""
     return cls.from_batch(
         tz.merge_with(
             tz.compose(list, tz.concat),
             *[dataset[:] for dataset in datasets],
         ),
         identifier=identifier,
     )
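
Both class methods above merge per-column batches with tz.merge_with, using compose(list, ...) to turn the lazy concatenation (or interleaving) into concrete lists. A small sketch of the chain case on made-up column batches; toolz is used here and exposes the same API as cytoolz:

import toolz as tz

batch_a = {'text': ['a1', 'a2'], 'label': [0, 1]}
batch_b = {'text': ['b1'], 'label': [1]}

# For each key, merge_with collects the values and applies the function.
merged = tz.merge_with(tz.compose(list, tz.concat), batch_a, batch_b)
assert merged == {'text': ['a1', 'a2', 'b1'], 'label': [0, 1, 1]}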
Example #18
def grad_norm(grad):
    """
    Compute the l2 norm of the gradient.

    Args:
        grad (Gradient)

    Returns:
        magnitude (float)

    """
    tensor_sum_square = compose(be.tsum, be.square)
    return sqrt(grad_accumulate(tensor_sum_square, grad))
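
Here be appears to be the library's tensor backend, so compose(be.tsum, be.square) squares a tensor and then sums it. A plain-numpy illustration of the same l2-norm computation over a couple of invented gradient pieces:

import numpy as np
from toolz import compose

tensor_sum_square = compose(np.sum, np.square)  # square first, then sum

grad_pieces = [np.array([3.0, 4.0]), np.array([0.0])]
l2 = np.sqrt(sum(tensor_sum_square(g) for g in grad_pieces))
assert l2 == 5.0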
Example #19
def grad_magnitude(grad):
    """
    Compute the root-mean-square of the gradient.

    Args:
        grad (Gradient)

    Returns:
        magnitude (float)

    """
    n = len(grad.layers) + len(grad.weights)
    tensor_mean_square = compose(be.mean, be.square)
    return sqrt(grad_accumulate(tensor_mean_square, grad) / n)
Example #20
def validation_middleware(make_request, web3):
    transaction_validator = apply_formatters_to_dict({
        'chainId': validate_chain_id(web3),
    })

    transaction_sanitizer = compose(transaction_normalizer, transaction_validator)

    def middleware(method, params):
        if method in ('eth_sendTransaction', 'eth_estimateGas', 'eth_call'):
            post_validated_params = apply_formatter_at_index(transaction_sanitizer, 0, params)
            return make_request(method, post_validated_params)
        else:
            return make_request(method, params)
    return middleware
Example #21
def to_string_pairs(segmentsbytxt, separator=" + "):
    """
    segmentsbytxt - Output from dual_segment_many.

    >>> exdata = [[([u"foo"], [u"foo"])], [([u"foo", u"bar", u"baz"], [u"foo", u"bar", u"baz"])]]

    >>> to_string_pairs(exdata)
    [(u"foo", u"foo"), (u"foo + bar + baz", u"foo + bar + baz")],

    >>> to_string_pairs(exdata, separator=", ")
    [(u"foo", u"foo"), (u"foo, bar, baz", u"foo, bar, baz")],
    """
    return tlz.pipe(segmentsbytxt, tlz.concat,
                    tlzc.map(tlz.compose(tuple, tlzc.map(separator.join))))
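
tlzc here is the curried namespace (cytoolz.curried), so tlzc.map(separator.join) is a map waiting for its iterable, and tlz.pipe feeds the flattened pairs through it. A small sketch with toy segment data; since the pipe ends in a curried map, the lazy result is consumed with list() here:

import toolz as tlz
import toolz.curried as tlzc  # cytoolz.curried works the same way

segmentsbytxt = [[(["foo"], ["foo"])],
                 [(["foo", "bar"], ["foo", "bar"])]]

pairs = list(tlz.pipe(
    segmentsbytxt, tlz.concat,
    tlzc.map(tlz.compose(tuple, tlzc.map(" + ".join)))))
assert pairs == [("foo", "foo"), ("foo + bar", "foo + bar")]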
Example #22
def pop_nested_key(config, key):
    key_head, _, key_tail = key.rpartition('.')

    head_getters = (
        operator.itemgetter(key_part)
        for key_part
        in key_head.split('.')
        if key_part
    )
    tail_popper = operator.methodcaller('pop', key_tail)

    popper_fn = compose(*reversed(tuple(itertools.chain(head_getters, (tail_popper,)))))

    return popper_fn(config)
Example #23
def test_extract_links():
    first_link = compose(tuple, next, iter, extract_links)

    assert_equal(first_link("[[foo|bar]]"), ("Foo", "bar"))
    assert_equal(first_link("[[foo]]"), ("Foo", "foo"))
    assert_equal(first_link("[[File:picture!]] [[foo]]"), ("Foo", "foo"))
    assert_equal(first_link("[[foo]]bar."), ("Foo", "foobar"))
    assert_equal(first_link("[[baz|foobar]];"), ("Baz", "foobar"))
    assert_equal(first_link("[[baz#quux]]"), ("Baz", "baz#quux"))
    assert_equal(first_link("[[baz#quux|bla]]"), ("Baz", "bla"))
    assert_equal(first_link("[[FOO_BAR|foo bar]]"), ("FOO BAR", "foo bar"))

    # Links like these commonly occur in nlwiki (and presumably dewiki and
    # other compounding languages):
    assert_equal(first_link("foo[[baz|bar]]"), ("Baz", "foobar"))

    # MediaWiki only considers alphabetic characters outside [[]] part of the
    # anchor.
    assert_equal(first_link("foo-[[bar]]"), ("Bar", "bar"))
    assert_equal(first_link("[[bar]]/baz"), ("Bar", "bar"))
    # XXX The following are broken. They do occur in the wild, e.g.,
    # -18[[Celsius|°C]] and 700[[Megabyte|MB]]-cd (found in nlwiki dump).
    # assert_equal(first_link("[[bar]]0"), ("Bar", "bar"))
    # assert_equal(first_link("[[bar]]_"), ("Bar", "bar"))

    # We're not interested in section links
    assert_equal(first_link("[[#Some section|elsewhere]] [[other_article]]"),
                 ("Other article", "other_article"))

    # This construct appears in enwiki for chemical formulae etc., but also in
    # nlwiki (and dewiki?) for more general compound nouns. The current
    # handling may not be exactly what we want; any fix should update the test
    # accordingly.
    assert_equal(list(extract_links("[[Lithium|Li]][[Fluorine|F]]")),
                 [("Lithium", "Li"), ("Fluorine", "F")])
    assert_equal(list(extract_links("[[tera-|tera]][[becquerel]]s")),
                 [("Tera-", "tera"), ("Becquerel", "becquerels")])
    assert_equal(
        list(
            extract_links("""[[Lord's
        prayer]]
        [[Dismissal 
        (cricket)|dismissal]] [[Badass|Chuck 
        Norris]]""")), [("Lord's prayer", "Lord's prayer"),
                        ("Dismissal (cricket)", "dismissal"),
                        ("Badass", "Chuck Norris")])

    assert_equal(
        list(extract_links("[[C. Stephen Evans | Evans, C. Stephen]]")),
        [('C. Stephen Evans', 'Evans, C. Stephen')])
Example #24
def test_extract_links():
    first_link = compose(tuple, next, iter, extract_links)

    assert_equal(first_link("[[foo|bar]]"), ("Foo", "bar"))
    assert_equal(first_link("[[foo]]"), ("Foo", "foo"))
    assert_equal(first_link("[[File:picture!]] [[foo]]"), ("Foo", "foo"))
    assert_equal(first_link("[[foo]]bar."), ("Foo", "foobar"))
    assert_equal(first_link("[[baz|foobar]];"), ("Baz", "foobar"))
    assert_equal(first_link("[[baz#quux]]"), ("Baz", "baz#quux"))
    assert_equal(first_link("[[baz#quux|bla]]"), ("Baz", "bla"))
    assert_equal(first_link("[[FOO_BAR|foo bar]]"), ("FOO BAR", "foo bar"))

    # Links like these commonly occur in nlwiki (and presumably dewiki and
    # other compounding languages):
    assert_equal(first_link("foo[[baz|bar]]"), ("Baz", "foobar"))

    # MediaWiki only considers alphabetic characters outside [[]] part of the
    # anchor.
    assert_equal(first_link("foo-[[bar]]"), ("Bar", "bar"))
    assert_equal(first_link("[[bar]]/baz"), ("Bar", "bar"))
    # XXX The following are broken. They do occur in the wild, e.g.,
    # -18[[Celsius|°C]] and 700[[Megabyte|MB]]-cd (found in nlwiki dump).
    # assert_equal(first_link("[[bar]]0"), ("Bar", "bar"))
    # assert_equal(first_link("[[bar]]_"), ("Bar", "bar"))

    # We're not interested in section links
    assert_equal(first_link("[[#Some section|elsewhere]] [[other_article]]"), ("Other article", "other_article"))

    # This construct appears in enwiki for chemical formulae etc., but also in
    # nlwiki (and dewiki?) for more general compound nouns. The current
    # handling may not be exactly what we want; any fix should update the test
    # accordingly.
    assert_equal(list(extract_links("[[Lithium|Li]][[Fluorine|F]]")), [("Lithium", "Li"), ("Fluorine", "F")])
    assert_equal(list(extract_links("[[tera-|tera]][[becquerel]]s")), [("Tera-", "tera"), ("Becquerel", "becquerels")])
    assert_equal(
        list(
            extract_links(
                """[[Lord's
        prayer]]
        [[Dismissal 
        (cricket)|dismissal]] [[Badass|Chuck 
        Norris]]"""
            )
        ),
        [("Lord's prayer", "Lord's prayer"), ("Dismissal (cricket)", "dismissal"), ("Badass", "Chuck Norris")],
    )

    assert_equal(
        list(extract_links("[[C. Stephen Evans | Evans, C. Stephen]]")), [("C. Stephen Evans", "Evans, C. Stephen")]
    )
Example #25
def process(args, i):
    data_dir = join(args.data_dir, args.mode)
    with open(join(data_dir, '{}.json'.format(i))) as f:
        data = json.loads(f.read())
    tokenize = compose(list, _split_words)
    art_sents = tokenize(data['article'])
    abs_sents = tokenize(data['abstract'])
    if art_sents and abs_sents:  # some data contains empty article/abstract
        extracted, scores = get_extract_label(art_sents, abs_sents)
    else:
        extracted, scores = [], []
    data['extracted'] = extracted
    data['score'] = scores
    with open(join(data_dir, '{}.json'.format(i)), 'w') as f:
        json.dump(data, f, indent=4)
Example #26
def set_nested_key(config, key, value):
    key_head, _, key_tail = key.rpartition('.')

    head_setters = (
        operator.methodcaller('setdefault', key_part, {})
        for key_part
        in key_head.split('.')
        if key_part
    )
    tail_setter = operator.methodcaller('__setitem__', key_tail, value)

    setter_fn = compose(*reversed(tuple((itertools.chain(head_setters, (tail_setter,))))))

    # must write to both the config_for_read and config_for_write
    return setter_fn(config)
Example #27
def process(split, i):
    data_dir = join(DATA_DIR, split)
    with open(join(data_dir, '{}.json'.format(i)), encoding='utf-8') as f:
        data = json.loads(f.read())
    tokenize = compose(list, _split_words)
    art_sents = tokenize(data['article'])
    abs_sents = tokenize(data['abstract'])
    if art_sents and abs_sents:  # some data contains empty article/abstract
        extracted, scores = get_extract_label(art_sents, abs_sents)
    else:
        extracted, scores = [], []
    data['extracted'] = extracted
    data['score'] = scores
    with open(join(data_dir, '{}.json'.format(i)), 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=4, ensure_ascii=False)
Example #28
def process(split, i):
    data_dir = join(DATA_DIR, split)
    with open(join(data_dir, '{}.json'.format(i))) as f:
        data = json.loads(f.read())
    tokenize = compose(list, _split_words)
    art_sents = tokenize(data['article'])
    abs_sents = tokenize(data['abstract'])
    if art_sents and abs_sents: # some data contains empty article/abstract
        extracted, scores = get_extract_label(art_sents, abs_sents)
    else:
        extracted, scores = [], []
    data['extracted'] = extracted
    data['score'] = scores
    with open(join(data_dir, '{}.json'.format(i)), 'w') as f:
        json.dump(data, f, indent=4)
Example #29
    def from_batches(
        cls,
        batches: Sequence[Batch],
        identifier: Identifier = None,
        dataset_fmt: str = "in_memory",
    ) -> Dataset:
        """Convert a list of batches to a dataset."""

        return cls.from_batch(
            tz.merge_with(
                tz.compose(list, tz.concat),
                *batches,
            ),
            identifier=identifier,
            dataset_fmt=dataset_fmt,
        )
Example #30
def get_observations(location_name, key):
    """Get a cleaned up list of observations at location_name

    `key` must be an API key for the met office

    For the last 24 hours.

    `location_name` is looked up in _met_office_location_codes dict.
    """

    __get_observations = tlz.compose(
        clean_data,
        helpers.pull_json,
        observations_url)

    return __get_observations(location_name, key)
Example #31
    def score(self, batch: Dict[str, List], columns: List[str], *args,
              **kwargs) -> np.ndarray:
        # Compute the length of each example under each key
        lengths = [
            Spacy.retrieve(
                batch=batch,
                columns=[key],
                proc_fns=tz.compose(
                    # Compute lengths (# of words) for each tokenized text in a batch
                    lambda l: np.array([len(t) for t in l]),
                    # Extract tokens using Spacy
                    Spacy.tokens,
                ),
            )[key] for key in columns
        ]

        # Reduction over the key axis
        return self.reduction_fn(np.array(lengths), axis=0)
Example #32
        def __getitem__(self, key: str) -> iter:
            if key not in self.map:
                raise ValueError(
                    dedent("""\
                        Key '{}' is invalid!
                        Valid keys: {}
                        """.format(
                        key,
                        reduce(lambda k1, k2: '{}, {}'.format(k1, k2),
                               map(lambda k: "'{}'".format(k), self.map)))))
            ref = self.map[key]
            if 'api' not in ref:
                ref['api'] = 'dbpy'  # default api
            api = ref['api']

            # load reader
            if key not in self.cache:
                print("Loading '{}' reader...".format(key))
                if api not in ('dbpy', 'stpy'):
                    raise ValueError("Invalid api type '{}'!".format(api))
                if 'id' not in ref:
                    ref['id'] = key  # default id
                id = ref['id']
                if api == 'dbpy':
                    self.cache[key] = fromiter(
                        read_syncdatalist_float(id, self.hi_tag,
                                                tuple(map(int,
                                                          self.low_tags))),
                        'float')
                if api == 'stpy':
                    self.cache[key] = StorageWrapper(*map(int, self.runs),
                                                     beamline=self.beamline,
                                                     id=id)
                if 'deco' not in ref:
                    ref['deco'] = identity  # default deco
                print('Loaded!')

            data = self.cache[key]
            deco = ref['deco'] if hasattr(ref['deco'], '__call__') else eval(
                ref['deco'])
            if api == 'dbpy':
                return map(deco, data)
            if api == 'stpy':
                return map(compose(deco, data.__getitem__), self.low_tags)
Example #33
def all_of(inners, arg):
    """All of the inner valudators must pass.

    The order of inner validators matters.

    Parameters
    ----------
    inners : List[validator]
      Functions are applied from right to left, so all_of([rule1, rule2], arg) is
      the same as rule1(rule2(arg)).
    arg : Any
      Value to be validated.

    Returns
    -------
    arg : Any
      Value, possibly coerced by inner validators to the appropriate types
    """
    return compose(*inners)(arg)
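
As the docstring says, compose applies the inner validators right to left. A self-contained sketch with two hypothetical validators (not part of the original module) showing that the rightmost one runs first:

from toolz import compose

def must_be_int(x):
    if not isinstance(x, int):
        raise TypeError('expected int')
    return x

def coerce_int(x):
    return int(x)

# coerce_int runs first, so the type check sees the coerced value.
assert compose(must_be_int, coerce_int)('42') == 42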
Example #34
def process(split, i):
    data_dir = join(DATA_DIR, split)
    #data_dir = './email_dataset'
    with open(join(data_dir, '{}.json'.format(i))) as f:
        print(join(data_dir, '{}.json'.format(i)))
        data = json.loads(f.read())
    #data = pd.read_csv(r'./email_dataset/Processed_Email_Dataset.csv')
    #data = data.iloc[i]
    tokenize = compose(list, _split_words)
    art_sents = tokenize(data['email_body'])
    abs_sents = tokenize(data['subject'])
    if art_sents and abs_sents: # some data contains empty article/abstract
        extracted, scores = get_extract_label(art_sents, abs_sents)
    else:
        extracted, scores = [], []
    data['extracted'] = extracted
    data['score'] = scores
    with open(join(data_dir, '{}.json'.format(i)), 'w') as f:
        json.dump(data, f, indent=4)
Example #35
def all_of(inners, arg):
    """All of the inner validators must pass.

    The order of inner validators matters.

    Parameters
    ----------
    inners : List[validator]
      Functions are applied from right to left, so all_of([rule1, rule2], arg) is
      the same as rule1(rule2(arg)).
    arg : Any
      Value to be validated.

    Returns
    -------
    arg : Any
      Value, possibly coerced by inner validators to the appropriate types
    """
    return compose(*inners)(arg)
Example #36
def process(split, i):
    data_dir = join(DATA_DIR, split)
    with open(join(data_dir, '{}.json'.format(i))) as f:
        data = json.loads(f.read())
    tokenize = compose(list, _split_words)
    art_sents = tokenize(data['article'])
    abs_sents = tokenize(data['abstract'])
    if art_sents and abs_sents:  # some data contains empty article/abstract
        extracted, scores = get_extract_label(art_sents, abs_sents)
    else:
        extracted, scores = [], []
    data['extracted'] = extracted
    data['score'] = scores
    # with open(join(join('GT_12L_avg/test', split), '{}.dec'.format(i)), 'w') as a:
    #     label_sent = [data['article'][i] for i in extracted]
    #     a.write('\n'.join(label_sent))

    with open(join(data_dir, '{}.json'.format(i)), 'w') as f:
        json.dump(data, f, indent=4)
Example #37
def label(split):
    start = time()
    print('start processing {} split...'.format(split))
    data_dir = join(DATA_DIR, split)
    n_data = count_data(data_dir)
    for i in range(n_data):
        print('processing {}/{} ({:.2f}%)\r'.format(i, n_data, 100*i/n_data),
              end='')
        with open(join(data_dir, '{}.json'.format(i))) as f:
            data = json.loads(f.read())
        tokenize = compose(list, _split_words)
        art_sents = tokenize(data['article'])
        abs_sents = tokenize(data['abstract'])
        extracted, scores = get_extract_label(art_sents, abs_sents)
        data['extracted'] = extracted
        data['score'] = scores
        with open(join(data_dir, '{}.json'.format(i)), 'w') as f:
            json.dump(data, f, indent=4)
    print('finished in {}'.format(timedelta(seconds=time()-start)))
Example #38
def process(split, i):
    data_dir = join(DATA_DIR, split)
    with open(join(data_dir, '{}.json'.format(i))) as f:
        data = json.loads(f.read())
    tokenize = compose(list, _split_words)
    art_sents = tokenize(data['article'])
    abs_sents = tokenize(data['abstract'])
    if art_sents and abs_sents:  # some data contains empty article/abstract
        extracted, scores, new_abs_sents, art_sents = get_extract_label(
            art_sents, abs_sents)
    else:
        extracted, scores, new_abs_sents = [], [], []
    data['extracted'] = extracted
    data['score'] = scores
    data['new_abs_sents'] = [' '.join(s) for s in new_abs_sents]
    data['article'] = [' '.join(s) for s in art_sents]
    print(split, '{}.json'.format(i))
    with open(join(data_dir, '{}.json'.format(i)), 'w') as f:
        json.dump(data, f, indent=4)
Example #39
def get_nested_key(config, key):
    key_head, _, key_tail = key.rpartition('.')

    head_getters = (operator.itemgetter(key_part)
                    for key_part in key_head.split('.') if key_part)

    tail_getter = operator.itemgetter(key_tail)

    getter_fn = compose(
        *reversed(tuple(itertools.chain(head_getters, (tail_getter, )))))

    try:
        return getter_fn(config)
    except TypeError as err:
        raise KeyError("Error getting nested key {0} from {1}: {2}".format(
            key,
            force_text(repr(config)),
            str(err),
        ))
Example #40
def label(split):
    start = time()
    print('start processing {} split...'.format(split))
    data_dir = join(DATA_DIR, split)
    n_data = count_data(data_dir)
    for i in range(n_data):
        print('processing {}/{} ({:.2f}%)\r'.format(i, n_data, 100*i/n_data),
              end='')
        with open(join(data_dir, '{}.json'.format(i))) as f:
            data = json.loads(f.read())
        tokenize = compose(list, _split_words)
        art_sents = tokenize(data['article'])
        abs_sents = tokenize(data['abstract'])
        extracted, scores = get_extract_label(art_sents, abs_sents)
        data['extracted'] = extracted
        data['score'] = scores
        with open(join(data_dir, '{}.json'.format(i)), 'w') as f:
            json.dump(data, f, indent=4)
    print('finished in {}'.format(timedelta(seconds=time()-start)))
Example #41
def create_slice(args):
    # Unpack args
    dataset, slice_membership, slice_batches, i, batch_size, slice_cache_hash = args

    # Create a new empty slice
    sl = Slice.from_dict({})

    # Create a Slice "copy" of the Dataset
    sl.__dict__.update(dataset.__dict__)
    sl._identifier = None

    # Filter
    sl = sl.filter(
        lambda example, idx: bool(slice_membership[idx, i]),
        with_indices=True,
        input_columns=["index"],
        batch_size=batch_size,
        cache_file_name=str(
            dataset.logdir / ("cache-" + str(abs(slice_cache_hash)) + "-filter.arrow")
        ),
    )

    slice_batch = tz.merge_with(tz.compose(list, tz.concat), slice_batches)

    # FIXME(karan): interaction tape history is wrong here, esp. with augmentation/attacks

    # Map
    if len(sl):
        sl = sl.map(
            lambda batch, indices: tz.valmap(
                lambda v: v[indices[0] : indices[0] + batch_size], slice_batch
            ),
            batched=True,
            batch_size=batch_size,
            with_indices=True,
            remove_columns=sl.column_names,
            cache_file_name=str(
                dataset.logdir / ("cache-" + str(abs(slice_cache_hash)) + ".arrow")
            ),
        )

    return sl
Example #42
def process(split, i):
    data_dir = join(DATA_DIR, split)
    with open(join(data_dir, '{}.json'.format(i))) as f:
        try:
            data = json.loads(f.read())
        except JSONDecodeError:
            data = {'article': '', 'abstract': ''}

    tokenize = compose(list, _split_words)
    art_sents = tokenize(data['article']) if data['article'] is not '' else []
    abs_sents = tokenize(
        data['abstract']) if data['abstract'] is not '' else []
    if art_sents and abs_sents:  # some data contains empty article/abstract
        extracted, scores = get_extract_label(art_sents, abs_sents)
    else:
        extracted, scores = [], []
    data['extracted'] = extracted
    data['score'] = scores
    with open(join(data_dir, '{}.json'.format(i)), 'w') as f:
        json.dump(data, f, indent=4)
Example #43
def build_batchers_entity(net_type, word2id, cuda, debug):
    assert net_type in ['entity']

    prepro = prepro_fn_extract_entity(args.max_word, args.max_sent)

    # def sort_key(sample):
    #     src_sents, _, _ = sample
    #     return len(src_sents)
    def sort_key(sample):
        src_sents = sample[0]
        return len(src_sents)


    key = 'filtered_rule23_6_input_mention_cluster'


    batchify_fn = batchify_fn_extract_ptr_entity
    convert_batch = convert_batch_extract_ptr_entity


    batchify = compose(batchify_fn(PAD, cuda=cuda),
                       convert_batch(UNK, word2id))

    train_loader = DataLoader(
        EntityExtractDataset_combine('train', key), batch_size=BUCKET_SIZE,
        shuffle=not debug,
        num_workers=4 if cuda and not debug else 0,
        collate_fn=coll_fn_extract_entity
    )
    train_batcher = BucketedGenerater(train_loader, prepro, sort_key, batchify,
                                      single_run=False, fork=not debug)

    val_loader = DataLoader(
        EntityExtractDataset_combine('val', key), batch_size=BUCKET_SIZE,
        shuffle=False, num_workers=4 if cuda and not debug else 0,
        collate_fn=coll_fn_extract_entity
    )
    val_batcher = BucketedGenerater(val_loader, prepro, sort_key, batchify,
                                    single_run=True, fork=not debug)

    return train_batcher, val_batcher
Example #44
def get_nested_key(config, key):
    key_head, _, key_tail = key.rpartition('.')

    head_getters = (
        operator.itemgetter(key_part)
        for key_part
        in key_head.split('.')
        if key_part
    )

    tail_getter = operator.itemgetter(key_tail)

    getter_fn = compose(*reversed(tuple(itertools.chain(head_getters, (tail_getter,)))))

    try:
        return getter_fn(config)
    except TypeError as err:
        raise KeyError(
            "Error getting nested key {0} from {1}: {2}".format(
                key,
                force_text(repr(config)),
                str(err),
            )
        )
Example #45
FILTER_PARAMS_MAPPINGS = {
    'fromBlock': 'from_block',
    'toBlock': 'to_block',
}

filter_params_remapper = apply_key_map(FILTER_PARAMS_MAPPINGS)

FILTER_PARAMS_FORMATTERS = {
    'fromBlock': to_integer_if_hex,
    'toBlock': to_integer_if_hex,
}

filter_params_formatter = apply_formatters_to_dict(FILTER_PARAMS_FORMATTERS)

filter_params_transformer = compose(filter_params_remapper, filter_params_formatter)


TRANSACTION_FORMATTERS = {
    'to': apply_formatter_if(partial(operator.eq, b''), static_return(None)),
}


transaction_formatter = apply_formatters_to_dict(TRANSACTION_FORMATTERS)


RECEIPT_FORMATTERS = {
    'logs': apply_formatter_to_array(log_key_remapper),
}
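
filter_params_transformer formats the hex block numbers first and remaps the keys second, again because compose runs right to left. A rough, self-contained illustration of the same transformation, using plain functions invented here instead of the web3 helper formatters:

from toolz import compose

def remap_keys(params):
    mapping = {'fromBlock': 'from_block', 'toBlock': 'to_block'}
    return {mapping.get(k, k): v for k, v in params.items()}

def format_blocks(params):
    return {k: int(v, 16) if k in ('fromBlock', 'toBlock') and isinstance(v, str)
            else v
            for k, v in params.items()}

transformer = compose(remap_keys, format_blocks)  # format first, then remap
result = transformer({'fromBlock': '0x1', 'address': '0xabc'})
assert result == {'from_block': 1, 'address': '0xabc'}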

Example #46
__title__ = 'text2math'
__author__ = 'Steven Cutting'
__author_email__ = '*****@*****.**'
__created_on__ = '02/06/2016'
__copyright__ = "text2math Copyright (C) 2016  Steven Cutting"

import cytoolz as tlz

from text2math.raw2text import (remove_html_bits, decode_and_fix, adv_decode)
from text2math.text2tokens import (ngram, unigram, bigram, trigram,
                                   uni_and_bigram_tuples)
from text2math.tokens2numbers import freq


tknize_uni_n_bi = tlz.compose(tuple,
                              uni_and_bigram_tuples,
                              decode_and_fix,
                              remove_html_bits)

total_counts = tlz.compose(freq, tlz.concat)
Example #47
 def hashMessage(data=None, hexstr=None, text=None):
     message_bytes = to_bytes(data, hexstr=hexstr, text=text)
     recovery_hasher = compose(HexBytes, keccak, signature_wrapper)
     return recovery_hasher(message_bytes)