Example #1
def build_batchers(word2id, cuda, debug):
    prepro = prepro_fn(args.max_art, args.max_abs)
    def sort_key(sample):
        src, target = sample
        return (len(target), len(src))
    batchify = compose(
        batchify_fn_copy(PAD, START, END, cuda=cuda),
        convert_batch_copy(UNK, word2id)
    )

    train_loader = DataLoader(
        MatchDataset('train'), batch_size=BUCKET_SIZE,
        shuffle=not debug,
        num_workers=4 if cuda and not debug else 0,
        collate_fn=coll_fn
    )
    train_batcher = BucketedGenerater(train_loader, prepro, sort_key, batchify,
                                      single_run=False, fork=not debug)

    val_loader = DataLoader(
        MatchDataset('val'), batch_size=BUCKET_SIZE,
        shuffle=False, num_workers=4 if cuda and not debug else 0,
        collate_fn=coll_fn
    )
    val_batcher = BucketedGenerater(val_loader, prepro, sort_key, batchify,
                                    single_run=True, fork=not debug)
    return train_batcher, val_batcher
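
A note on ordering: compose here (toolz/cytoolz style) applies its functions right to left, so the convert_batch_copy step above runs before batchify_fn_copy. A minimal stand-alone sketch of that pattern, with placeholder stages instead of the project's helpers:

from toolz import compose

convert = lambda batch: [s.lower() for s in batch]   # runs first (rightmost)
batchify = lambda batch: list(enumerate(batch))      # runs second
pipeline = compose(batchify, convert)
print(pipeline(["Foo", "Bar"]))  # [(0, 'foo'), (1, 'bar')]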
Example #2
File: helpers.py Project: miohtama/populus
def sort_prioritized_configs(backend_configs, master_config):
    resolved_backend_configs = tuple(
        (
            backend_name,
            resolve_config(backend_configs.get_config(backend_name), master_config),
        )
        for backend_name
        in backend_configs
    )
    backends_with_conflicting_priorities = tuple((
        backend_name
        for backend_name, count
        in collections.Counter((
            (backend_name, config['priority'])
            for backend_name, config
            in resolved_backend_configs
        )).items()
        if count > 1
    ))
    if backends_with_conflicting_priorities:
        raise ValueError(
            "The following package backends have conflicting priority "
            "values.  '{0}'.  Ensure that all priority values are unique "
            "across all backends.".format(
                ', '.join((backends_with_conflicting_priorities))
            )
        )

    return sorted(
        resolved_backend_configs,
        key=compose(
            operator.itemgetter('priority'),
            operator.itemgetter(1),
        ),
    )
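
The sort key above is itself a composition: operator.itemgetter(1) pulls the resolved config out of each (backend_name, config) pair, then operator.itemgetter('priority') reads its priority. A hedged, self-contained illustration with made-up data:

import operator
from toolz import compose

pairs = [('b', {'priority': 2}), ('a', {'priority': 1})]
key = compose(operator.itemgetter('priority'), operator.itemgetter(1))
print(sorted(pairs, key=key))  # [('a', {'priority': 1}), ('b', {'priority': 2})]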
Example #3
def build_batchers(net_type, word2id, cuda, debug):
    assert net_type in ['ff', 'rnn']
    prepro = prepro_fn_extract(args.max_word, args.max_sent)
    def sort_key(sample):
        src_sents, _ = sample
        return len(src_sents)
    batchify_fn = (batchify_fn_extract_ff if net_type == 'ff'
                   else batchify_fn_extract_ptr)
    convert_batch = (convert_batch_extract_ff if net_type == 'ff'
                     else convert_batch_extract_ptr)
    batchify = compose(batchify_fn(PAD, cuda=cuda),
                       convert_batch(UNK, word2id))

    train_loader = DataLoader(
        ExtractDataset('train'), batch_size=BUCKET_SIZE,
        shuffle=not debug,
        num_workers=4 if cuda and not debug else 0,
        collate_fn=coll_fn_extract
    )
    train_batcher = BucketedGenerater(train_loader, prepro, sort_key, batchify,
                                      single_run=False, fork=not debug)

    val_loader = DataLoader(
        ExtractDataset('val'), batch_size=BUCKET_SIZE,
        shuffle=False, num_workers=4 if cuda and not debug else 0,
        collate_fn=coll_fn_extract
    )
    val_batcher = BucketedGenerater(val_loader, prepro, sort_key, batchify,
                                    single_run=True, fork=not debug)
    return train_batcher, val_batcher
Example #4
File: csv.py Project: ChrisBeaumont/blaze
    def _iter(self, usecols=None):

        # get the date column [(name, type)] pairs
        datecols = list(map(first, get_date_columns(self.schema)))

        # figure out which ones pandas needs to parse
        parse_dates = ordered_index(datecols, self.schema)
        if usecols is not None:
            parse_dates = [d for d in parse_dates if d in set(usecols)]

        reader = self.iterreader(parse_dates=parse_dates, usecols=usecols,
                                 squeeze=True)

        # pop one off the iterator
        initial = next(iter(reader))

        # get our names and initial dtypes for later inference
        if isinstance(initial, pd.Series):
            names = [str(initial.name)]
            formats = [initial.dtype]
        else:
            if usecols is None:
                index = slice(None)
            else:
                index = initial.columns.get_indexer(usecols)
            names = list(map(str, initial.columns[index]))
            formats = initial.dtypes[index].tolist()

        initial_dtype = np.dtype({'names': names, 'formats': formats})

        # what dtype do we actually want to see when we read
        streaming_dtype = self.get_streaming_dtype(initial_dtype)

        # everything must ultimately be a list of tuples
        m = partial(bz.into, list)

        slicerf = lambda x: x.replace('', np.nan)

        if isinstance(initial, pd.Series):
            streaming_dtype = streaming_dtype[first(streaming_dtype.names)]

        if streaming_dtype != initial_dtype:
            # we don't have the desired type so jump through hoops with
            # to_records -> astype(desired dtype) -> listify
            def mapper(x, dtype=streaming_dtype):
                r = slicerf(x)

                try:
                    r = r.to_records(index=False)
                except AttributeError:
                    # We have a series
                    r = r.values
                return m(r.astype(dtype))
        else:
            mapper = compose(m, slicerf)

        # convert our initial NDFrame to a list
        return it.chain(mapper(initial),
                        it.chain.from_iterable(map(mapper, reader)))
Example #5
def get_predictions(location_name, key):

    __get_predictions = tlz.compose(
        clean_data,
        helpers.pull_json,
        predictions_url)

    return __get_predictions(location_name, key)
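
The composed pipeline runs right to left: first build the request URL, then fetch the JSON, then clean it. A stand-in sketch (predictions_url, pull_json and clean_data below are placeholders, not the project's real helpers; toolz mirrors cytoolz here):

import toolz as tlz  # cytoolz.compose behaves the same

predictions_url = lambda name, key: 'https://example.invalid/{}?key={}'.format(name, key)
pull_json = lambda url: {'url': url, 'data': [1, 2, 3]}
clean_data = lambda payload: payload['data']

get = tlz.compose(clean_data, pull_json, predictions_url)
print(get('cambridge', 'APIKEY'))  # [1, 2, 3]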
Example #6
def build_batchers(net_type, word2id, cuda, debug, use_bert, bert_tokenizer):
    assert net_type in ['ff', 'rnn']
    def sort_key(sample):
        src_sents, _ = sample
        return len(src_sents)

    if not use_bert:
        prepro = prepro_fn_extract(args.max_word, args.max_sent)
        batchify_fn = (batchify_fn_extract_ff if net_type == 'ff'
                       else batchify_fn_extract_ptr)
        convert_batch = (convert_batch_extract_ff if net_type == 'ff'
                         else convert_batch_extract_ptr)
        batchify = compose(batchify_fn(PAD, cuda=cuda),
                           convert_batch(UNK, word2id))

    else:
        # prepro = prepro_fn_extract(args.max_word, args.max_sent)
        # batchify_fn = batchify_fn_bert_extract_ptr2
        # convert_batch = convert_batch_bert_extract_ptr2
        # batchify = compose(batchify_fn(bert_tokenizer.pad_token_id, cuda=cuda),
        #                 convert_batch(bert_tokenizer))

        prepro = prepro_fn_identity
        batchify_fn = batchify_fn_bert_extract_ptr2
        convert_batch = convert_batch_bert_extract_ptr3
        batchify = compose(batchify_fn(bert_tokenizer.pad_token_id, cuda=cuda),
                           convert_batch(bert_tokenizer, max_len=args.max_word, max_sent=args.max_sent))


    train_loader = DataLoader(
        ExtractDataset('train'), batch_size=BUCKET_SIZE,
        shuffle=not debug,
        num_workers=4 if cuda and not debug else 0,
        collate_fn=coll_fn_extract
    )
    train_batcher = BucketedGenerater(train_loader, prepro, sort_key, batchify,
                                      single_run=False, fork=not debug)

    val_loader = DataLoader(
        ExtractDataset('val'), batch_size=BUCKET_SIZE,
        shuffle=False, num_workers=4 if cuda and not debug else 0,
        collate_fn=coll_fn_extract
    )
    val_batcher = BucketedGenerater(val_loader, prepro, sort_key, batchify,
                                    single_run=True, fork=not debug)
    return train_batcher, val_batcher
Example #7
def build_batchers(net_type, word2id, cuda, debug):
    assert net_type in ['ff', 'rnn', 'trans_rnn']
    prepro = prepro_fn_extract(args.max_word, args.max_sent)

    def sort_key(sample):
        src_sents, _ = sample
        return len(src_sents)

    if net_type == 'trans_rnn':
        prepro = prepro_fn_extract_trans(args.max_word, args.max_sent)
        batchify = compose(batchify_fn_extract_trans(cuda=cuda),
                           convert_batch_extract_trans)
    else:
        prepro = prepro_fn_extract(args.max_word, args.max_sent)
        batchify_fn = (batchify_fn_extract_ff
                       if net_type == 'ff' else batchify_fn_extract_ptr)
        convert_batch = (convert_batch_extract_ff
                         if net_type == 'ff' else convert_batch_extract_ptr)
        batchify = compose(batchify_fn(PAD, cuda=cuda),
                           convert_batch(UNK, word2id))

    train_loader = DataLoader(ExtractDataset('train'),
                              batch_size=BUCKET_SIZE,
                              shuffle=not debug,
                              num_workers=4 if cuda and not debug else 0,
                              collate_fn=coll_fn_extract)
    train_batcher = BucketedGenerater(train_loader,
                                      prepro,
                                      sort_key,
                                      batchify,
                                      single_run=False,
                                      fork=not debug)

    val_loader = DataLoader(ExtractDataset('val'),
                            batch_size=BUCKET_SIZE,
                            shuffle=False,
                            num_workers=4 if cuda and not debug else 0,
                            collate_fn=coll_fn_extract)
    val_batcher = BucketedGenerater(val_loader,
                                    prepro,
                                    sort_key,
                                    batchify,
                                    single_run=True,
                                    fork=not debug)
    return train_batcher, val_batcher
Example #8
def hash_key(args, kwargs):
    # return (args, hash(frozenset(kwargs.items())))
    # return (map(make_hashable, args), frozenset(kwargs.items()))
    args = tuple(map(make_hashable, args))
    kwargs = frozenset(
        map(compose(tuple, partial(map, make_hashable)), kwargs.items()))
    # print('args', args)
    # print('kwargs', kwargs)
    return (args, kwargs)
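
compose(tuple, partial(map, make_hashable)) turns each (key, value) pair into a tuple whose elements have been passed through make_hashable, so the whole kwargs set becomes hashable. A sketch with a stand-in make_hashable (the real one is project-specific):

from functools import partial
from toolz import compose

make_hashable = lambda v: tuple(v) if isinstance(v, list) else v  # stand-in
pair_hashable = compose(tuple, partial(map, make_hashable))
print(frozenset(map(pair_hashable, {'xs': [1, 2]}.items())))
# frozenset({('xs', (1, 2))})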
Example #9
 def compute(self, df):
     if isinstance(df, pd.Series):
         # Col() is NOT MEANT TO BE USED DIRECTLY ON SERIES -- IT IS NOT TESTED OR SUPPORTED
         # This is only here to support the case of scalar loc/iloc access in with_column
         # TODO: find a better solution
         col = df.loc[self.spec]
     else:
         col = df.loc[:, self.spec]
     if not self.fns:
         return col
     return tz.compose(*reversed(self.fns))(col)
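
Because compose applies right to left, reversing self.fns first makes the stored functions run in the order they were appended. A tiny illustration:

import toolz as tz

fns = [lambda x: x + 1, lambda x: x * 10]   # intended order: add 1, then multiply by 10
print(tz.compose(*reversed(fns))(2))        # 30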
Example #10
    def normalize_result(msg: Tuple[BlockBody, ...]) -> BlockBodyBundles:
        uncles_hashes = tuple(
            map(compose(keccak, rlp.encode),
                tuple(body.uncles for body in msg)))
        transaction_roots_and_trie_data = tuple(
            map(make_trie_root_and_nodes,
                tuple(body.transactions for body in msg)))

        body_bundles = tuple(
            zip(msg, transaction_roots_and_trie_data, uncles_hashes))
        return body_bundles
Example #11
File: tools.py Project: d2207197/smttoktag
def zhsent_preprocess(s):
    s = strQ2B(s)
    # zh_chars = ' '.join(nltk.word_tokenize(zh_chars))

    s, orig_number_strs = find_n_replace_numbers(s)
    s, orig_latin_strs = find_n_replace_latins(s)

    restore_num = partial(restore_place_holder, place_holder="{{CD}}", orig_strs=orig_number_strs)
    restore_latin = partial(restore_place_holder, place_holder="{{FW}}", orig_strs=orig_latin_strs)
    restore_all_place_holder = compose(restore_latin, restore_num)
    return s, restore_all_place_holder
Example #12
def test_gaussian_GFE_entropy_gradient():
    num_units = 5
    lay = layers.GaussianLayer(num_units)

    lay.params.loc[:] = be.rand_like(lay.params.loc)
    lay.params.log_var[:] = be.randn(be.shape(lay.params.loc))

    from cytoolz import compose
    sum_square = compose(be.tsum, be.square)

    for itr in range(10):
        mag = lay.get_random_magnetization()
        lms = lay.lagrange_multipliers_analytic(mag)
        entropy = lay.TAP_entropy(mag)
        lr = 0.001
        gogogo = True
        grad = lay.TAP_magnetization_grad(mag, [], [], [])
        grad_mag = math.sqrt(be.float_scalar(be.accumulate(sum_square, grad)))
        normit = partial(be.tmul_, be.float_scalar(1.0/grad_mag))
        be.apply_(normit, grad)
        rand_grad = lay.get_random_magnetization()
        grad_mag = math.sqrt(be.float_scalar(be.accumulate(sum_square, rand_grad)))
        normit = partial(be.tmul_, be.float_scalar(1.0/grad_mag))
        be.apply_(normit, rand_grad)
        while gogogo:
            cop1_mag = deepcopy(mag)
            cop1_lms = deepcopy(lms)
            cop2_mag = deepcopy(mag)
            cop2_lms = deepcopy(lms)

            cop1_mag.mean[:] = mag.mean + lr * grad.mean
            cop2_mag.mean[:] = mag.mean + lr * rand_grad.mean
            cop1_mag.variance[:] = mag.variance + lr * grad.variance
            cop2_mag.variance[:] = mag.variance + lr * rand_grad.variance
            lay.clip_magnetization_(cop1_mag)
            lay.clip_magnetization_(cop2_mag)
            cop1_lms = lay.lagrange_multipliers_analytic(cop1_mag)
            cop2_lms = lay.lagrange_multipliers_analytic(cop2_mag)

            entropy_1 = lay.TAP_entropy(cop1_mag)
            entropy_2 = lay.TAP_entropy(cop2_mag)

            regress = entropy_1 - entropy_2 < 0.0
            #print(itr, "[",lr, "] ", entropy, entropy_1, entropy_2, regress)
            if regress:
                #print(grad, rand_grad)
                if lr < 1e-6:
                    assert False,\
                    "Gaussian GFE magnetization gradient is wrong"
                    break
                else:
                    lr *= 0.5
            else:
                break
Example #13
def pop_nested_key(config, key):
    key_head, _, key_tail = key.rpartition('.')

    head_getters = (operator.itemgetter(key_part)
                    for key_part in key_head.split('.') if key_part)
    tail_popper = operator.methodcaller('pop', key_tail)

    popper_fn = compose(
        *reversed(tuple(itertools.chain(head_getters, (tail_popper, )))))

    return popper_fn(config)
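
For a key like 'a.b.c' this composes itemgetter('a'), itemgetter('b') and methodcaller('pop', 'c'); the reversed(...) is needed because compose runs right to left, so the getters must end up on the right. A runnable usage sketch (the function is restated so the demo is self-contained):

import itertools
import operator
from toolz import compose

def pop_nested_key(config, key):
    key_head, _, key_tail = key.rpartition('.')
    head_getters = (operator.itemgetter(part) for part in key_head.split('.') if part)
    tail_popper = operator.methodcaller('pop', key_tail)
    popper_fn = compose(*reversed(tuple(itertools.chain(head_getters, (tail_popper,)))))
    return popper_fn(config)

config = {'a': {'b': {'c': 1, 'd': 2}}}
print(pop_nested_key(config, 'a.b.c'))  # 1
print(config)                           # {'a': {'b': {'d': 2}}}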
Example #14
def set_nested_key(config, key, value):
    key_head, _, key_tail = key.rpartition('.')

    head_setters = (operator.methodcaller('setdefault', key_part, {})
                    for key_part in key_head.split('.') if key_part)
    tail_setter = operator.methodcaller('__setitem__', key_tail, value)

    setter_fn = compose(
        *reversed(tuple((itertools.chain(head_setters, (tail_setter, ))))))

    # must write to both the config_for_read and config_for_write
    return setter_fn(config)
Example #15
def defunct_hash_message(primitive=None, hexstr=None, text=None):
    '''
    Convert the provided message into a message hash, to be signed.
    This provides the same prefix and hashing approach as
    :meth:`w3.eth.sign() <web3.eth.Eth.sign>`. That means that the
    message will automatically be prepended with text
    defined in EIP-191 as version 'E': ``b'\\x19Ethereum Signed Message:\\n'``
    concatenated with the number of bytes in the message.

    Awkwardly, the number of bytes in the message is encoded in decimal ascii. So
    if the message is 'abcde', then the length is encoded as the ascii
    character '5'. This is one of the reasons that this message format is not preferred.
    There is ambiguity when the message '00' is encoded, for example.
    Only use this method if you must have compatibility with
    :meth:`w3.eth.sign() <web3.eth.Eth.sign>`.

    Supply exactly one of the three arguments:
    bytes, a hex string, or a unicode string.

    :param primitive: the binary message to be signed
    :type primitive: bytes or int
    :param str hexstr: the message encoded as hex
    :param str text: the message as a series of unicode characters (a normal Py3 str)
    :returns: The hash of the message, after adding the prefix
    :rtype: ~hexbytes.main.HexBytes

    .. code-block:: python

        >>> from newchain_account.messages import defunct_hash_message

        >>> msg = "I♥SF"
        >>> defunct_hash_message(text=msg)
        HexBytes('0x1476abb745d423bf09273f1afd887d951181d25adc66c4834a70491911b7f750')

        # these four also produce the same hash:
        >>> defunct_hash_message(w3.toBytes(text=msg))
        HexBytes('0x1476abb745d423bf09273f1afd887d951181d25adc66c4834a70491911b7f750')

        >>> defunct_hash_message(bytes(msg, encoding='utf-8'))
        HexBytes('0x1476abb745d423bf09273f1afd887d951181d25adc66c4834a70491911b7f750')

        >>> Web3.toHex(text=msg)
        '0x49e299a55346'
        >>> defunct_hash_message(hexstr='0x49e299a55346')
        HexBytes('0x1476abb745d423bf09273f1afd887d951181d25adc66c4834a70491911b7f750')

        >>> defunct_hash_message(0x49e299a55346)
        HexBytes('0x1476abb745d423bf09273f1afd887d951181d25adc66c4834a70491911b7f750')
    '''
    message_bytes = to_bytes(primitive, hexstr=hexstr, text=text)
    recovery_hasher = compose(HexBytes, keccak, signature_wrapper)
    return recovery_hasher(message_bytes)
Example #16
 def interleave(
     cls,
     datasets: List[Dataset],
     identifier: Identifier,
 ) -> Dataset:
     """Interleave a list of datasets."""
     return cls.from_batch(
         tz.merge_with(
             tz.compose(list, tz.interleave),
             *[dataset[:] for dataset in datasets],
         ),
         identifier=identifier,
     )
Example #17
 def chain(
     cls,
     datasets: List[Dataset],
     identifier: Identifier,
 ) -> Dataset:
     """Chain a list of datasets."""
     return cls.from_batch(
         tz.merge_with(
             tz.compose(list, tz.concat),
             *[dataset[:] for dataset in datasets],
         ),
         identifier=identifier,
     )
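
tz.merge_with(tz.compose(list, tz.concat), *batches) merges the per-dataset column dicts key by key: for each key it collects the value lists and concatenates them (tz.interleave in the previous example weaves them instead). A small illustration with toy batches:

import toolz as tz

b1 = {'text': ['a'], 'label': [0]}
b2 = {'text': ['b', 'c'], 'label': [1, 1]}
print(tz.merge_with(tz.compose(list, tz.concat), b1, b2))
# {'text': ['a', 'b', 'c'], 'label': [0, 1, 1]}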
Example #18
def grad_norm(grad):
    """
    Compute the l2 norm of the gradient.

    Args:
        grad (Gradient)

    Returns:
        magnitude (float)

    """
    tensor_sum_square = compose(be.tsum, be.square)
    return sqrt(grad_accumulate(tensor_sum_square, grad))
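
compose(be.tsum, be.square) squares elementwise and then sums, i.e. it yields a tensor's sum of squares; grad_accumulate then adds that up across the gradient's pieces. A stand-alone sketch using numpy stand-ins for the backend (be.square ~ np.square, be.tsum ~ np.sum):

import numpy as np
from math import sqrt
from toolz import compose

tensor_sum_square = compose(np.sum, np.square)
grad_pieces = [np.array([3.0, 4.0]), np.array([0.0])]
print(sqrt(sum(tensor_sum_square(g) for g in grad_pieces)))  # 5.0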
Example #19
def grad_magnitude(grad):
    """
    Compute the root-mean-square of the gradient.

    Args:
        grad (Gradient)

    Returns:
        magnitude (float)

    """
    n = len(grad.layers) + len(grad.weights)
    tensor_mean_square = compose(be.mean, be.square)
    return sqrt(grad_accumulate(tensor_mean_square, grad) / n)
Example #20
def validation_middleware(make_request, web3):
    transaction_validator = apply_formatters_to_dict({
        'chainId': validate_chain_id(web3),
    })

    transaction_sanitizer = compose(transaction_normalizer, transaction_validator)

    def middleware(method, params):
        if method in ('eth_sendTransaction', 'eth_estimateGas', 'eth_call'):
            post_validated_params = apply_formatter_at_index(transaction_sanitizer, 0, params)
            return make_request(method, post_validated_params)
        else:
            return make_request(method, params)
    return middleware
Example #21
def to_string_pairs(segmentsbytxt, separator=" + "):
    """
    segmentsbytxt - Output from dual_segment_many.

    >>> exdata = [[([u"foo"], [u"foo"])], [([u"foo", u"bar", u"baz"], [u"foo", u"bar", u"baz"])]]

    >>> to_string_pairs(exdata)
    [(u"foo", u"foo"), (u"foo + bar + baz", u"foo + bar + baz")],

    >>> to_string_pairs(exdata, separator=", ")
    [(u"foo", u"foo"), (u"foo, bar, baz", u"foo, bar, baz")],
    """
    return tlz.pipe(segmentsbytxt, tlz.concat,
                    tlzc.map(tlz.compose(tuple, tlzc.map(separator.join))))
Example #22
def pop_nested_key(config, key):
    key_head, _, key_tail = key.rpartition('.')

    head_getters = (
        operator.itemgetter(key_part)
        for key_part
        in key_head.split('.')
        if key_part
    )
    tail_popper = operator.methodcaller('pop', key_tail)

    popper_fn = compose(*reversed(tuple(itertools.chain(head_getters, (tail_popper,)))))

    return popper_fn(config)
Example #23
def test_extract_links():
    first_link = compose(tuple, next, iter, extract_links)

    assert_equal(first_link("[[foo|bar]]"), ("Foo", "bar"))
    assert_equal(first_link("[[foo]]"), ("Foo", "foo"))
    assert_equal(first_link("[[File:picture!]] [[foo]]"), ("Foo", "foo"))
    assert_equal(first_link("[[foo]]bar."), ("Foo", "foobar"))
    assert_equal(first_link("[[baz|foobar]];"), ("Baz", "foobar"))
    assert_equal(first_link("[[baz#quux]]"), ("Baz", "baz#quux"))
    assert_equal(first_link("[[baz#quux|bla]]"), ("Baz", "bla"))
    assert_equal(first_link("[[FOO_BAR|foo bar]]"), ("FOO BAR", "foo bar"))

    # Links like these commonly occur in nlwiki (and presumably dewiki and
    # other compounding languages):
    assert_equal(first_link("foo[[baz|bar]]"), ("Baz", "foobar"))

    # MediaWiki only considers alphabetic characters outside [[]] part of the
    # anchor.
    assert_equal(first_link("foo-[[bar]]"), ("Bar", "bar"))
    assert_equal(first_link("[[bar]]/baz"), ("Bar", "bar"))
    # XXX The following are broken. They do occur in the wild, e.g.,
    # -18[[Celsius|°C]] and 700[[Megabyte|MB]]-cd (found in nlwiki dump).
    # assert_equal(first_link("[[bar]]0"), ("Bar", "bar"))
    # assert_equal(first_link("[[bar]]_"), ("Bar", "bar"))

    # We're not interested in section links
    assert_equal(first_link("[[#Some section|elsewhere]] [[other_article]]"),
                 ("Other article", "other_article"))

    # This construct appears in enwiki for chemical formulae etc., but also in
    # nlwiki (and dewiki?) for more general compound nouns. The current
    # handling may not be exactly what we want; any fix should update the test
    # accordingly.
    assert_equal(list(extract_links("[[Lithium|Li]][[Fluorine|F]]")),
                 [("Lithium", "Li"), ("Fluorine", "F")])
    assert_equal(list(extract_links("[[tera-|tera]][[becquerel]]s")),
                 [("Tera-", "tera"), ("Becquerel", "becquerels")])
    assert_equal(
        list(
            extract_links("""[[Lord's
        prayer]]
        [[Dismissal 
        (cricket)|dismissal]] [[Badass|Chuck 
        Norris]]""")), [("Lord's prayer", "Lord's prayer"),
                        ("Dismissal (cricket)", "dismissal"),
                        ("Badass", "Chuck Norris")])

    assert_equal(
        list(extract_links("[[C. Stephen Evans | Evans, C. Stephen]]")),
        [('C. Stephen Evans', 'Evans, C. Stephen')])
Example #24
def test_extract_links():
    first_link = compose(tuple, next, iter, extract_links)

    assert_equal(first_link("[[foo|bar]]"), ("Foo", "bar"))
    assert_equal(first_link("[[foo]]"), ("Foo", "foo"))
    assert_equal(first_link("[[File:picture!]] [[foo]]"), ("Foo", "foo"))
    assert_equal(first_link("[[foo]]bar."), ("Foo", "foobar"))
    assert_equal(first_link("[[baz|foobar]];"), ("Baz", "foobar"))
    assert_equal(first_link("[[baz#quux]]"), ("Baz", "baz#quux"))
    assert_equal(first_link("[[baz#quux|bla]]"), ("Baz", "bla"))
    assert_equal(first_link("[[FOO_BAR|foo bar]]"), ("FOO BAR", "foo bar"))

    # Links like these commonly occur in nlwiki (and presumably dewiki and
    # other compounding languages):
    assert_equal(first_link("foo[[baz|bar]]"), ("Baz", "foobar"))

    # MediaWiki only considers alphabetic characters outside [[]] part of the
    # anchor.
    assert_equal(first_link("foo-[[bar]]"), ("Bar", "bar"))
    assert_equal(first_link("[[bar]]/baz"), ("Bar", "bar"))
    # XXX The following are broken. They do occur in the wild, e.g.,
    # -18[[Celsius|°C]] and 700[[Megabyte|MB]]-cd (found in nlwiki dump).
    # assert_equal(first_link("[[bar]]0"), ("Bar", "bar"))
    # assert_equal(first_link("[[bar]]_"), ("Bar", "bar"))

    # We're not interested in section links
    assert_equal(first_link("[[#Some section|elsewhere]] [[other_article]]"), ("Other article", "other_article"))

    # This construct appears in enwiki for chemical formulae etc., but also in
    # nlwiki (and dewiki?) for more general compound nouns. The current
    # handling may not be exactly what we want; any fix should update the test
    # accordingly.
    assert_equal(list(extract_links("[[Lithium|Li]][[Fluorine|F]]")), [("Lithium", "Li"), ("Fluorine", "F")])
    assert_equal(list(extract_links("[[tera-|tera]][[becquerel]]s")), [("Tera-", "tera"), ("Becquerel", "becquerels")])
    assert_equal(
        list(
            extract_links(
                """[[Lord's
        prayer]]
        [[Dismissal 
        (cricket)|dismissal]] [[Badass|Chuck 
        Norris]]"""
            )
        ),
        [("Lord's prayer", "Lord's prayer"), ("Dismissal (cricket)", "dismissal"), ("Badass", "Chuck Norris")],
    )

    assert_equal(
        list(extract_links("[[C. Stephen Evans | Evans, C. Stephen]]")), [("C. Stephen Evans", "Evans, C. Stephen")]
    )
Example #25
def process(args, i):
    data_dir = join(args.data_dir, args.mode)
    with open(join(data_dir, '{}.json'.format(i))) as f:
        data = json.loads(f.read())
    tokenize = compose(list, _split_words)
    art_sents = tokenize(data['article'])
    abs_sents = tokenize(data['abstract'])
    if art_sents and abs_sents:  # some data contains empty article/abstract
        extracted, scores = get_extract_label(art_sents, abs_sents)
    else:
        extracted, scores = [], []
    data['extracted'] = extracted
    data['score'] = scores
    with open(join(data_dir, '{}.json'.format(i)), 'w') as f:
        json.dump(data, f, indent=4)
Example #26
def set_nested_key(config, key, value):
    key_head, _, key_tail = key.rpartition('.')

    head_setters = (
        operator.methodcaller('setdefault', key_part, {})
        for key_part
        in key_head.split('.')
        if key_part
    )
    tail_setter = operator.methodcaller('__setitem__', key_tail, value)

    setter_fn = compose(*reversed(tuple((itertools.chain(head_setters, (tail_setter,))))))

    # must write to both the config_for_read and config_for_write
    return setter_fn(config)
Example #27
def process(split, i):
    data_dir = join(DATA_DIR, split)
    with open(join(data_dir, '{}.json'.format(i)), encoding='utf-8') as f:
        data = json.loads(f.read())
    tokenize = compose(list, _split_words)
    art_sents = tokenize(data['article'])
    abs_sents = tokenize(data['abstract'])
    if art_sents and abs_sents:  # some data contains empty article/abstract
        extracted, scores = get_extract_label(art_sents, abs_sents)
    else:
        extracted, scores = [], []
    data['extracted'] = extracted
    data['score'] = scores
    with open(join(data_dir, '{}.json'.format(i)), 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=4, ensure_ascii=False)
Example #28
def process(split, i):
    data_dir = join(DATA_DIR, split)
    with open(join(data_dir, '{}.json'.format(i))) as f:
        data = json.loads(f.read())
    tokenize = compose(list, _split_words)
    art_sents = tokenize(data['article'])
    abs_sents = tokenize(data['abstract'])
    if art_sents and abs_sents: # some data contains empty article/abstract
        extracted, scores = get_extract_label(art_sents, abs_sents)
    else:
        extracted, scores = [], []
    data['extracted'] = extracted
    data['score'] = scores
    with open(join(data_dir, '{}.json'.format(i)), 'w') as f:
        json.dump(data, f, indent=4)
Example #29
    def from_batches(
        cls,
        batches: Sequence[Batch],
        identifier: Identifier = None,
        dataset_fmt: str = "in_memory",
    ) -> Dataset:
        """Convert a list of batches to a dataset."""

        return cls.from_batch(
            tz.merge_with(
                tz.compose(list, tz.concat),
                *batches,
            ),
            identifier=identifier,
            dataset_fmt=dataset_fmt,
        )
Example #30
def get_observations(location_name, key):
    """Get a cleaned up list of observations at location_name

    `key` must be an API key for the met office

    For the last 24 hours.

    `location_name` is looked up in _met_office_location_codes dict.
    """

    __get_observations = tlz.compose(
        clean_data,
        helpers.pull_json,
        observations_url)

    return __get_observations(location_name, key)
Example #31
    def score(self, batch: Dict[str, List], columns: List[str], *args,
              **kwargs) -> np.ndarray:
        # Compute the length of each example under each key
        lengths = [
            Spacy.retrieve(
                batch=batch,
                columns=[key],
                proc_fns=tz.compose(
                    # Compute lengths (# of words) for each tokenized text in a batch
                    lambda l: np.array([len(t) for t in l]),
                    # Extract tokens using Spacy
                    Spacy.tokens,
                ),
            )[key] for key in columns
        ]

        # Reduction over the key axis
        return self.reduction_fn(np.array(lengths), axis=0)
Example #32
        def __getitem__(self, key: str) -> iter:
            if key not in self.map:
                raise ValueError(
                    dedent("""\
                        Key '{}' is invalid!
                        Valid keys: {}
                        """.format(
                        key,
                        reduce(lambda k1, k2: '{}, {}'.format(k1, k2),
                               map(lambda k: "'{}'".format(k), self.map)))))
            ref = self.map[key]
            if 'api' not in ref:
                ref['api'] = 'dbpy'  # default api
            api = ref['api']

            # load reader
            if key not in self.cache:
                print("Loading '{}' reader...".format(key))
                if api not in ('dbpy', 'stpy'):
                    raise ValueError("Invalid api type '{}'!".format(api))
                if 'id' not in ref:
                    ref['id'] = key  # default id
                id = ref['id']
                if api == 'dbpy':
                    self.cache[key] = fromiter(
                        read_syncdatalist_float(id, self.hi_tag,
                                                tuple(map(int,
                                                          self.low_tags))),
                        'float')
                if api == 'stpy':
                    self.cache[key] = StorageWrapper(*map(int, self.runs),
                                                     beamline=self.beamline,
                                                     id=id)
                if 'deco' not in ref:
                    ref['deco'] = identity  # default deco
                print('Loaded!')

            data = self.cache[key]
            deco = ref['deco'] if hasattr(ref['deco'], '__call__') else eval(
                ref['deco'])
            if api == 'dbpy':
                return map(deco, data)
            if api == 'stpy':
                return map(compose(deco, data.__getitem__), self.low_tags)
Example #33
File: rules.py Project: cloudera/ibis
def all_of(inners, arg):
    """All of the inner valudators must pass.

    The order of inner validators matters.

    Parameters
    ----------
    inners : List[validator]
      Functions are applied from right to left so all_of([rule1, rule2], arg) is
      the same as rule1(rule2(arg)).
    arg : Any
      Value to be validated.

    Returns
    -------
    arg : Any
      Value may be coerced by inner validators to the appropriate types
    """
    return compose(*inners)(arg)
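
As the docstring says, compose applies the inner validators right to left. A hedged sketch with toy validators:

from toolz import compose

def all_of(inners, arg):
    return compose(*inners)(arg)

def as_int(x):      # coerces
    return int(x)

def positive(x):    # validates
    assert x > 0
    return x

print(all_of([positive, as_int], '3'))  # as_int runs first, then positive -> 3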
Example #34
def process(split, i):
    data_dir = join(DATA_DIR, split)
    #data_dir = './email_dataset'
    with open(join(data_dir, '{}.json'.format(i))) as f:
        print(join(data_dir, '{}.json'.format(i)))
        data = json.loads(f.read())
    #data = pd.read_csv(r'./email_dataset/Processed_Email_Dataset.csv')
    #data = data.iloc[i]
    tokenize = compose(list, _split_words)
    art_sents = tokenize(data['email_body'])
    abs_sents = tokenize(data['subject'])
    if art_sents and abs_sents: # some data contains empty article/abstract
        extracted, scores = get_extract_label(art_sents, abs_sents)
    else:
        extracted, scores = [], []
    data['extracted'] = extracted
    data['score'] = scores
    with open(join(data_dir, '{}.json'.format(i)), 'w') as f:
        json.dump(data, f, indent=4)
Example #35
def all_of(inners, arg):
    """All of the inner validators must pass.

    The order of inner validators matters.

    Parameters
    ----------
    inners : List[validator]
      Functions are applied from right to left so all_of([rule1, rule2], arg) is
      the same as rule1(rule2(arg)).
    arg : Any
      Value to be validated.

    Returns
    -------
    arg : Any
      Value may be coerced by inner validators to the appropriate types
    """
    return compose(*inners)(arg)
Example #36
def process(split, i):
    data_dir = join(DATA_DIR, split)
    with open(join(data_dir, '{}.json'.format(i))) as f:
        data = json.loads(f.read())
    tokenize = compose(list, _split_words)
    art_sents = tokenize(data['article'])
    abs_sents = tokenize(data['abstract'])
    if art_sents and abs_sents:  # some data contains empty article/abstract
        extracted, scores = get_extract_label(art_sents, abs_sents)
    else:
        extracted, scores = [], []
    data['extracted'] = extracted
    data['score'] = scores
    # with open(join(join('GT_12L_avg/test', split), '{}.dec'.format(i)), 'w') as a:
    #     label_sent = [data['article'][i] for i in extracted]
    #     a.write('\n'.join(label_sent))

    with open(join(data_dir, '{}.json'.format(i)), 'w') as f:
        json.dump(data, f, indent=4)
Example #37
def label(split):
    start = time()
    print('start processing {} split...'.format(split))
    data_dir = join(DATA_DIR, split)
    n_data = count_data(data_dir)
    for i in range(n_data):
        print('processing {}/{} ({:.2f}%%)\r'.format(i, n_data, 100*i/n_data),
              end='')
        with open(join(data_dir, '{}.json'.format(i))) as f:
            data = json.loads(f.read())
        tokenize = compose(list, _split_words)
        art_sents = tokenize(data['article'])
        abs_sents = tokenize(data['abstract'])
        extracted, scores = get_extract_label(art_sents, abs_sents)
        data['extracted'] = extracted
        data['score'] = scores
        with open(join(data_dir, '{}.json'.format(i)), 'w') as f:
            json.dump(data, f, indent=4)
    print('finished in {}'.format(timedelta(seconds=time()-start)))
Example #38
def process(split, i):
    data_dir = join(DATA_DIR, split)
    with open(join(data_dir, '{}.json'.format(i))) as f:
        data = json.loads(f.read())
    tokenize = compose(list, _split_words)
    art_sents = tokenize(data['article'])
    abs_sents = tokenize(data['abstract'])
    if art_sents and abs_sents:  # some data contains empty article/abstract
        extracted, scores, new_abs_sents, art_sents = get_extract_label(
            art_sents, abs_sents)
    else:
        extracted, scores, new_abs_sents = [], [], []
    data['extracted'] = extracted
    data['score'] = scores
    data['new_abs_sents'] = [' '.join(s) for s in new_abs_sents]
    data['article'] = [' '.join(s) for s in art_sents]
    print(split, '{}.json'.format(i))
    with open(join(data_dir, '{}.json'.format(i)), 'w') as f:
        json.dump(data, f, indent=4)
Example #39
def get_nested_key(config, key):
    key_head, _, key_tail = key.rpartition('.')

    head_getters = (operator.itemgetter(key_part)
                    for key_part in key_head.split('.') if key_part)

    tail_getter = operator.itemgetter(key_tail)

    getter_fn = compose(
        *reversed(tuple(itertools.chain(head_getters, (tail_getter, )))))

    try:
        return getter_fn(config)
    except TypeError as err:
        raise KeyError("Error getting nested key {0} from {1}: {2}".format(
            key,
            force_text(repr(config)),
            str(err),
        ))
Example #40
def label(split):
    start = time()
    print('start processing {} split...'.format(split))
    data_dir = join(DATA_DIR, split)
    n_data = count_data(data_dir)
    for i in range(n_data):
        print('processing {}/{} ({:.2f}%%)\r'.format(i, n_data, 100*i/n_data),
              end='')
        with open(join(data_dir, '{}.json'.format(i))) as f:
            data = json.loads(f.read())
        tokenize = compose(list, _split_words)
        art_sents = tokenize(data['article'])
        abs_sents = tokenize(data['abstract'])
        extracted, scores = get_extract_label(art_sents, abs_sents)
        data['extracted'] = extracted
        data['score'] = scores
        with open(join(data_dir, '{}.json'.format(i)), 'w') as f:
            json.dump(data, f, indent=4)
    print('finished in {}'.format(timedelta(seconds=time()-start)))
Example #41
def create_slice(args):
    # Unpack args
    dataset, slice_membership, slice_batches, i, batch_size, slice_cache_hash = args

    # Create a new empty slice
    sl = Slice.from_dict({})

    # Create a Slice "copy" of the Dataset
    sl.__dict__.update(dataset.__dict__)
    sl._identifier = None

    # Filter
    sl = sl.filter(
        lambda example, idx: bool(slice_membership[idx, i]),
        with_indices=True,
        input_columns=["index"],
        batch_size=batch_size,
        cache_file_name=str(
            dataset.logdir / ("cache-" + str(abs(slice_cache_hash)) + "-filter.arrow")
        ),
    )

    slice_batch = tz.merge_with(tz.compose(list, tz.concat), slice_batches)

    # FIXME(karan): interaction tape history is wrong here, esp with augmenation/attacks

    # Map
    if len(sl):
        sl = sl.map(
            lambda batch, indices: tz.valmap(
                lambda v: v[indices[0] : indices[0] + batch_size], slice_batch
            ),
            batched=True,
            batch_size=batch_size,
            with_indices=True,
            remove_columns=sl.column_names,
            cache_file_name=str(
                dataset.logdir / ("cache-" + str(abs(slice_cache_hash)) + ".arrow")
            ),
        )

    return sl
Example #42
def process(split, i):
    data_dir = join(DATA_DIR, split)
    with open(join(data_dir, '{}.json'.format(i))) as f:
        try:
            data = json.loads(f.read())
        except JSONDecodeError:
            data = {'article': '', 'abstract': ''}

    tokenize = compose(list, _split_words)
    art_sents = tokenize(data['article']) if data['article'] != '' else []
    abs_sents = tokenize(data['abstract']) if data['abstract'] != '' else []
    if art_sents and abs_sents:  # some data contains empty article/abstract
        extracted, scores = get_extract_label(art_sents, abs_sents)
    else:
        extracted, scores = [], []
    data['extracted'] = extracted
    data['score'] = scores
    with open(join(data_dir, '{}.json'.format(i)), 'w') as f:
        json.dump(data, f, indent=4)
Example #43
def build_batchers_entity(net_type, word2id, cuda, debug):
    assert net_type in ['entity']

    prepro = prepro_fn_extract_entity(args.max_word, args.max_sent)

    # def sort_key(sample):
    #     src_sents, _, _ = sample
    #     return len(src_sents)
    def sort_key(sample):
        src_sents = sample[0]
        return len(src_sents)


    key = 'filtered_rule23_6_input_mention_cluster'


    batchify_fn = batchify_fn_extract_ptr_entity
    convert_batch = convert_batch_extract_ptr_entity


    batchify = compose(batchify_fn(PAD, cuda=cuda),
                       convert_batch(UNK, word2id))

    train_loader = DataLoader(
        EntityExtractDataset_combine('train', key), batch_size=BUCKET_SIZE,
        shuffle=not debug,
        num_workers=4 if cuda and not debug else 0,
        collate_fn=coll_fn_extract_entity
    )
    train_batcher = BucketedGenerater(train_loader, prepro, sort_key, batchify,
                                      single_run=False, fork=not debug)

    val_loader = DataLoader(
        EntityExtractDataset_combine('val', key), batch_size=BUCKET_SIZE,
        shuffle=False, num_workers=4 if cuda and not debug else 0,
        collate_fn=coll_fn_extract_entity
    )
    val_batcher = BucketedGenerater(val_loader, prepro, sort_key, batchify,
                                    single_run=True, fork=not debug)

    return train_batcher, val_batcher
Example #44
def get_nested_key(config, key):
    key_head, _, key_tail = key.rpartition('.')

    head_getters = (
        operator.itemgetter(key_part)
        for key_part
        in key_head.split('.')
        if key_part
    )

    tail_getter = operator.itemgetter(key_tail)

    getter_fn = compose(*reversed(tuple(itertools.chain(head_getters, (tail_getter,)))))

    try:
        return getter_fn(config)
    except TypeError as err:
        raise KeyError(
            "Error getting nested key {0} from {1}: {2}".format(
                key,
                force_text(repr(config)),
                str(err),
            )
        )
Example #45
FILTER_PARAMS_MAPPINGS = {
    'fromBlock': 'from_block',
    'toBlock': 'to_block',
}

filter_params_remapper = apply_key_map(FILTER_PARAMS_MAPPINGS)

FILTER_PARAMS_FORMATTERS = {
    'fromBlock': to_integer_if_hex,
    'toBlock': to_integer_if_hex,
}

filter_params_formatter = apply_formatters_to_dict(FILTER_PARAMS_FORMATTERS)

filter_params_transformer = compose(filter_params_remapper, filter_params_formatter)


TRANSACTION_FORMATTERS = {
    'to': apply_formatter_if(partial(operator.eq, b''), static_return(None)),
}


transaction_formatter = apply_formatters_to_dict(TRANSACTION_FORMATTERS)


RECEIPT_FORMATTERS = {
    'logs': apply_formatter_to_array(log_key_remapper),
}

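Ordering note: compose(filter_params_remapper, filter_params_formatter) formats the values first (e.g. hex block numbers to ints) and only then renames the keys. A hedged stand-in sketch without the web3/eth-utils helpers:

from toolz import compose

format_values = lambda d: {k: (int(v, 16) if isinstance(v, str) and v.startswith('0x') else v)
                           for k, v in d.items()}
remap_keys = lambda d: {{'fromBlock': 'from_block', 'toBlock': 'to_block'}.get(k, k): v
                        for k, v in d.items()}

transform = compose(remap_keys, format_values)
print(transform({'fromBlock': '0x1', 'toBlock': '0xa'}))  # {'from_block': 1, 'to_block': 10}
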
Example #46
__title__ = 'text2math'
__author__ = 'Steven Cutting'
__author_email__ = '*****@*****.**'
__created_on__ = '02/06/2016'
__copyright__ = "text2math Copyright (C) 2016  Steven Cutting"

import cytoolz as tlz

from text2math.raw2text import (remove_html_bits, decode_and_fix, adv_decode)
from text2math.text2tokens import (ngram, unigram, bigram, trigram,
                                   uni_and_bigram_tuples)
from text2math.tokens2numbers import freq


tknize_uni_n_bi = tlz.compose(tuple,
                              uni_and_bigram_tuples,
                              decode_and_fix,
                              remove_html_bits)

total_counts = tlz.compose(freq, tlz.concat)
Example #47
File: account.py Project: syngraph/web3.py
 def hashMessage(data=None, hexstr=None, text=None):
     message_bytes = to_bytes(data, hexstr=hexstr, text=text)
     recovery_hasher = compose(HexBytes, keccak, signature_wrapper)
     return recovery_hasher(message_bytes)