def build_batchers(word2id, cuda, debug):
    """Create the training and validation batch generators.

    Each split of ``MatchDataset`` is read through a ``DataLoader`` and wrapped
    in a ``BucketedGenerater``.  The training loader shuffles unless ``debug``
    is set and its generator is created with ``single_run=False``; the
    validation loader never shuffles and its generator uses ``single_run=True``.

    Returns:
        tuple: ``(train_batcher, val_batcher)``.
    """
    preprocess = prepro_fn(args.max_art, args.max_abs)

    def sort_key(sample):
        # bucket by target length first, source length second
        source, target = sample
        return (len(target), len(source))

    to_batch = compose(
        batchify_fn_copy(PAD, START, END, cuda=cuda),
        convert_batch_copy(UNK, word2id),
    )

    # worker processes are only used on GPU runs outside of debug mode
    n_workers = 4 if cuda and not debug else 0
    use_fork = not debug

    train_loader = DataLoader(
        MatchDataset('train'),
        batch_size=BUCKET_SIZE,
        shuffle=not debug,
        num_workers=n_workers,
        collate_fn=coll_fn,
    )
    train_batcher = BucketedGenerater(
        train_loader, preprocess, sort_key, to_batch,
        single_run=False, fork=use_fork,
    )

    val_loader = DataLoader(
        MatchDataset('val'),
        batch_size=BUCKET_SIZE,
        shuffle=False,
        num_workers=n_workers,
        collate_fn=coll_fn,
    )
    val_batcher = BucketedGenerater(
        val_loader, preprocess, sort_key, to_batch,
        single_run=True, fork=use_fork,
    )
    return train_batcher, val_batcher
def sort_prioritized_configs(backend_configs, master_config):
    """Resolve every backend config and return them ordered by priority.

    Args:
        backend_configs: iterable of backend names that also provides
            ``get_config(backend_name)``.
        master_config: passed to ``resolve_config`` for each backend config.

    Returns:
        list: ``(backend_name, resolved_config)`` pairs sorted by
        ``resolved_config['priority']``.

    Raises:
        ValueError: if two or more backends share the same priority value.
    """
    resolved_backend_configs = tuple(
        (
            backend_name,
            resolve_config(backend_configs.get_config(backend_name), master_config),
        )
        for backend_name in backend_configs
    )

    # Count how often each *priority* occurs.  Counting (name, priority) pairs
    # can never detect a clash because every pair is unique.
    priority_counts = collections.Counter(
        config['priority'] for _, config in resolved_backend_configs
    )
    backends_with_conflicting_priorities = tuple(
        backend_name
        for backend_name, config in resolved_backend_configs
        if priority_counts[config['priority']] > 1
    )
    if backends_with_conflicting_priorities:
        raise ValueError(
            "The following package backends have conflicting priority "
            "values.  '{0}'.  Ensure that all priority values are unique "
            "across all backends.".format(
                ', '.join(backends_with_conflicting_priorities)
            )
        )

    return sorted(
        resolved_backend_configs,
        # compose applies right-to-left: take the config (item 1), then its priority
        key=compose(
            operator.itemgetter('priority'),
            operator.itemgetter(1),
        ),
    )
def build_batchers(net_type, word2id, cuda, debug):
    """Create the training and validation batch generators for an extractor.

    ``net_type`` must be ``'ff'`` or ``'rnn'``; it selects which batchify and
    convert-batch functions are composed.  Both generators read
    ``ExtractDataset`` splits; only the training loader shuffles (unless
    ``debug``) and only the validation generator is ``single_run``.

    Returns:
        tuple: ``(train_batcher, val_batcher)``.
    """
    assert net_type in ['ff', 'rnn']
    preprocess = prepro_fn_extract(args.max_word, args.max_sent)

    def sort_key(sample):
        # bucket by the number of source sentences
        source_sents, _ = sample
        return len(source_sents)

    if net_type == 'ff':
        batchify_fn = batchify_fn_extract_ff
        convert_batch = convert_batch_extract_ff
    else:
        batchify_fn = batchify_fn_extract_ptr
        convert_batch = convert_batch_extract_ptr
    to_batch = compose(batchify_fn(PAD, cuda=cuda),
                       convert_batch(UNK, word2id))

    # worker processes are only used on GPU runs outside of debug mode
    n_workers = 4 if cuda and not debug else 0
    use_fork = not debug

    train_loader = DataLoader(
        ExtractDataset('train'),
        batch_size=BUCKET_SIZE,
        shuffle=not debug,
        num_workers=n_workers,
        collate_fn=coll_fn_extract,
    )
    train_batcher = BucketedGenerater(
        train_loader, preprocess, sort_key, to_batch,
        single_run=False, fork=use_fork,
    )

    val_loader = DataLoader(
        ExtractDataset('val'),
        batch_size=BUCKET_SIZE,
        shuffle=False,
        num_workers=n_workers,
        collate_fn=coll_fn_extract,
    )
    val_batcher = BucketedGenerater(
        val_loader, preprocess, sort_key, to_batch,
        single_run=True, fork=use_fork,
    )
    return train_batcher, val_batcher
def _iter(self, usecols=None):
    """Return an iterator over the rows produced by ``self.iterreader``.

    The first chunk is read eagerly to learn the column names and dtypes.
    Each chunk then has empty strings replaced by NaN and is converted with
    ``bz.into(list, ...)``; when the streaming dtype differs from the dtype
    of the first chunk, the chunk is cast to the streaming dtype first.

    Args:
        usecols: optional collection of columns to read.  When given, only
            date columns present in it are passed as ``parse_dates``.

    Returns:
        An ``itertools.chain`` over the converted first chunk followed by the
        converted remaining chunks.
    """
    # get the date column [(name, type)] pairs
    datecols = list(map(first, get_date_columns(self.schema)))

    # figure out which ones pandas needs to parse
    parse_dates = ordered_index(datecols, self.schema)
    if usecols is not None:
        # build the lookup set once rather than once per candidate column
        wanted = set(usecols)
        parse_dates = [d for d in parse_dates if d in wanted]

    reader = self.iterreader(parse_dates=parse_dates, usecols=usecols,
                             squeeze=True)

    # pop one off the iterator
    initial = next(iter(reader))

    # get our names and initial dtypes for later inference
    if isinstance(initial, pd.Series):
        names = [str(initial.name)]
        formats = [initial.dtype]
    else:
        if usecols is None:
            index = slice(None)
        else:
            index = initial.columns.get_indexer(usecols)
        names = list(map(str, initial.columns[index]))
        formats = initial.dtypes[index].tolist()

    initial_dtype = np.dtype({'names': names, 'formats': formats})

    # what dtype do we actually want to see when we read
    streaming_dtype = self.get_streaming_dtype(initial_dtype)

    # everything must ultimately be a list of tuples
    m = partial(bz.into, list)

    slicerf = lambda x: x.replace('', np.nan)

    if isinstance(initial, pd.Series):
        # a Series has a single field; compare against that field's dtype
        streaming_dtype = streaming_dtype[first(streaming_dtype.names)]

    if streaming_dtype != initial_dtype:
        # we don't have the desired type so jump through hoops with
        # to_records -> astype(desired dtype) -> listify
        def mapper(x, dtype=streaming_dtype):
            r = slicerf(x)

            try:
                r = r.to_records(index=False)
            except AttributeError:
                # We have a series
                r = r.values
            return m(r.astype(dtype))
    else:
        mapper = compose(m, slicerf)

    # convert our initial NDFrame to a list
    return it.chain(mapper(initial),
                    it.chain.from_iterable(map(mapper, reader)))
def get_predictions(location_name, key):
    """Return the cleaned predictions for ``location_name``.

    ``location_name`` and ``key`` are handed to ``predictions_url``; the
    result is fetched with ``helpers.pull_json`` and passed through
    ``clean_data``.
    """
    url = predictions_url(location_name, key)
    payload = helpers.pull_json(url)
    return clean_data(payload)
def build_batchers(net_type, word2id, cuda, debug, use_bert, bert_tokenizer):
    """Create the training and validation batch generators for an extractor.

    With ``use_bert`` the BERT batchify/convert functions are used together
    with ``prepro_fn_identity`` and the tokenizer's pad token id; otherwise
    ``net_type`` (``'ff'`` or ``'rnn'``) selects the word-id based functions.

    Returns:
        tuple: ``(train_batcher, val_batcher)``.
    """
    assert net_type in ['ff', 'rnn']

    def sort_key(sample):
        # bucket by the number of source sentences
        source_sents, _ = sample
        return len(source_sents)

    if use_bert:
        preprocess = prepro_fn_identity
        batchify_fn = batchify_fn_bert_extract_ptr2
        convert_batch = convert_batch_bert_extract_ptr3
        to_batch = compose(
            batchify_fn(bert_tokenizer.pad_token_id, cuda=cuda),
            convert_batch(bert_tokenizer,
                          max_len=args.max_word, max_sent=args.max_sent),
        )
    else:
        preprocess = prepro_fn_extract(args.max_word, args.max_sent)
        if net_type == 'ff':
            batchify_fn = batchify_fn_extract_ff
            convert_batch = convert_batch_extract_ff
        else:
            batchify_fn = batchify_fn_extract_ptr
            convert_batch = convert_batch_extract_ptr
        to_batch = compose(batchify_fn(PAD, cuda=cuda),
                           convert_batch(UNK, word2id))

    # worker processes are only used on GPU runs outside of debug mode
    n_workers = 4 if cuda and not debug else 0
    use_fork = not debug

    train_loader = DataLoader(
        ExtractDataset('train'),
        batch_size=BUCKET_SIZE,
        shuffle=not debug,
        num_workers=n_workers,
        collate_fn=coll_fn_extract,
    )
    train_batcher = BucketedGenerater(
        train_loader, preprocess, sort_key, to_batch,
        single_run=False, fork=use_fork,
    )

    val_loader = DataLoader(
        ExtractDataset('val'),
        batch_size=BUCKET_SIZE,
        shuffle=False,
        num_workers=n_workers,
        collate_fn=coll_fn_extract,
    )
    val_batcher = BucketedGenerater(
        val_loader, preprocess, sort_key, to_batch,
        single_run=True, fork=use_fork,
    )
    return train_batcher, val_batcher
def build_batchers(net_type, word2id, cuda, debug):
    """Create the training and validation batch generators for an extractor.

    Args:
        net_type: one of ``'ff'``, ``'rnn'`` or ``'trans_rnn'``; selects the
            preprocessing, batchify and convert-batch functions.
        word2id: vocabulary mapping given to the convert-batch function
            (unused for ``'trans_rnn'``).
        cuda: forwarded to the batchify function; also enables loader workers.
        debug: disables shuffling, worker processes and forking.

    Returns:
        tuple: ``(train_batcher, val_batcher)``.
    """
    assert net_type in ['ff', 'rnn', 'trans_rnn']

    def sort_key(sample):
        # bucket by the number of source sentences
        src_sents, _ = sample
        return len(src_sents)

    # prepro is chosen per branch; no default is built up front because every
    # branch assigns its own
    if net_type == 'trans_rnn':
        prepro = prepro_fn_extract_trans(args.max_word, args.max_sent)
        batchify = compose(batchify_fn_extract_trans(cuda=cuda),
                           convert_batch_extract_trans)
    else:
        prepro = prepro_fn_extract(args.max_word, args.max_sent)
        batchify_fn = (batchify_fn_extract_ff if net_type == 'ff'
                       else batchify_fn_extract_ptr)
        convert_batch = (convert_batch_extract_ff if net_type == 'ff'
                         else convert_batch_extract_ptr)
        batchify = compose(batchify_fn(PAD, cuda=cuda),
                           convert_batch(UNK, word2id))

    train_loader = DataLoader(ExtractDataset('train'),
                              batch_size=BUCKET_SIZE,
                              shuffle=not debug,
                              num_workers=4 if cuda and not debug else 0,
                              collate_fn=coll_fn_extract)
    train_batcher = BucketedGenerater(train_loader,
                                      prepro,
                                      sort_key,
                                      batchify,
                                      single_run=False,
                                      fork=not debug)

    val_loader = DataLoader(ExtractDataset('val'),
                            batch_size=BUCKET_SIZE,
                            shuffle=False,
                            num_workers=4 if cuda and not debug else 0,
                            collate_fn=coll_fn_extract)
    val_batcher = BucketedGenerater(val_loader,
                                    prepro,
                                    sort_key,
                                    batchify,
                                    single_run=True,
                                    fork=not debug)
    return train_batcher, val_batcher
def hash_key(args, kwargs):
    """Build a ``(args, kwargs)`` key out of a call's arguments.

    Every positional argument is passed through ``make_hashable`` and the
    results are collected in a tuple.  Every ``(name, value)`` item of
    ``kwargs`` gets both members passed through ``make_hashable`` and is
    stored as a tuple inside a frozenset.
    """
    hashable_args = tuple(make_hashable(arg) for arg in args)
    hashable_kwargs = frozenset(
        tuple(make_hashable(member) for member in item)
        for item in kwargs.items()
    )
    return (hashable_args, hashable_kwargs)
def compute(self, df):
    """Select ``self.spec`` from ``df`` and run it through ``self.fns``.

    The functions in ``self.fns`` are applied one after another, first to
    last; with no functions the selection is returned unchanged.
    """
    if isinstance(df, pd.Series):
        # Col() is NOT MEANT TO BE USED DIRECTLY ON SERIES -- IT IS NOT TESTED
        # OR SUPPORTED.  This is only here to support the case of scalar
        # loc/iloc access in with_column.
        # TODO: find a better solution
        result = df.loc[self.spec]
    else:
        result = df.loc[:, self.spec]

    for fn in self.fns:
        result = fn(result)
    return result
def normalize_result(msg: Tuple[BlockBody, ...]) -> BlockBodyBundles:
    """Bundle every block body with derived transaction and uncle data.

    For each body the result holds a triple of: the body itself, the value of
    ``make_trie_root_and_nodes`` for its transactions, and the keccak hash of
    its RLP-encoded uncles.
    """
    uncles_hashes = tuple(keccak(rlp.encode(body.uncles)) for body in msg)
    transaction_roots_and_trie_data = tuple(
        make_trie_root_and_nodes(body.transactions) for body in msg
    )
    return tuple(zip(msg, transaction_roots_and_trie_data, uncles_hashes))
def zhsent_preprocess(s):
    """Normalise a sentence and return it with a function undoing the masking.

    The sentence goes through ``strQ2B``, then ``find_n_replace_numbers`` and
    ``find_n_replace_latins``.  The returned callable restores the ``{{CD}}``
    place holders first and the ``{{FW}}`` place holders second, using the
    strings those two steps returned.

    Returns:
        tuple: ``(processed_sentence, restore_all_place_holder)``.
    """
    text = strQ2B(s)
    text, number_strs = find_n_replace_numbers(text)
    text, latin_strs = find_n_replace_latins(text)

    restorers = [
        partial(restore_place_holder, place_holder=tag, orig_strs=originals)
        for tag, originals in (("{{FW}}", latin_strs), ("{{CD}}", number_strs))
    ]
    # compose runs right-to-left, so the {{CD}} restorer is applied first
    return text, compose(*restorers)
def test_gaussian_GFE_entropy_gradient():
    """Check the TAP magnetization gradient against a random direction.

    For ten random magnetizations of a five-unit ``GaussianLayer`` the test
    takes one step of size ``lr`` along the normalised
    ``TAP_magnetization_grad`` and one along a normalised random
    magnetization, and compares the resulting ``TAP_entropy`` values.  If the
    gradient step yields the lower entropy the step size is halved and the
    comparison repeated; the test fails once ``lr`` has dropped below 1e-6.
    """
    num_units = 5
    lay = layers.GaussianLayer(num_units)

    # randomise the layer parameters
    lay.params.loc[:] = be.rand_like(lay.params.loc)
    lay.params.log_var[:] = be.randn(be.shape(lay.params.loc))

    from cytoolz import compose
    # squares a tensor, then sums it (compose applies right-to-left)
    sum_square = compose(be.tsum, be.square)

    for itr in range(10):
        mag = lay.get_random_magnetization()
        lms = lay.lagrange_multipliers_analytic(mag)
        # NOTE(review): `entropy` is only referenced by the commented-out print
        entropy = lay.TAP_entropy(mag)
        lr = 0.001
        gogogo = True
        grad = lay.TAP_magnetization_grad(mag, [], [], [])
        # scale grad in place by 1 / sqrt(accumulated sum of squares)
        grad_mag = math.sqrt(be.float_scalar(be.accumulate(sum_square, grad)))
        normit = partial(be.tmul_, be.float_scalar(1.0/grad_mag))
        be.apply_(normit, grad)
        # a random magnetization, scaled the same way, is the comparison direction
        rand_grad = lay.get_random_magnetization()
        grad_mag = math.sqrt(be.float_scalar(be.accumulate(sum_square, rand_grad)))
        normit = partial(be.tmul_, be.float_scalar(1.0/grad_mag))
        be.apply_(normit, rand_grad)
        while gogogo:
            cop1_mag = deepcopy(mag)
            cop1_lms = deepcopy(lms)
            cop2_mag = deepcopy(mag)
            cop2_lms = deepcopy(lms)

            # copy 1 steps along the gradient, copy 2 along the random direction
            cop1_mag.mean[:] = mag.mean + lr * grad.mean
            cop2_mag.mean[:] = mag.mean + lr * rand_grad.mean
            cop1_mag.variance[:] = mag.variance + lr * grad.variance
            cop2_mag.variance[:] = mag.variance + lr * rand_grad.variance
            lay.clip_magnetization_(cop1_mag)
            lay.clip_magnetization_(cop2_mag)
            # NOTE(review): the recomputed multipliers are not used afterwards
            cop1_lms = lay.lagrange_multipliers_analytic(cop1_mag)
            cop2_lms = lay.lagrange_multipliers_analytic(cop2_mag)

            entropy_1 = lay.TAP_entropy(cop1_mag)
            entropy_2 = lay.TAP_entropy(cop2_mag)

            # True when the gradient step gave the lower entropy
            regress = entropy_1 - entropy_2 < 0.0
            #print(itr, "[",lr, "] ", entropy, entropy_1, entropy_2, regress)
            if regress:
                #print(grad, rand_grad)
                if lr < 1e-6:
                    assert False,\
                        "Gaussian GFE magnetization gradient is wrong"
                    # only reached when assertions are disabled
                    break
                else:
                    # retry with a smaller step
                    lr *= 0.5
            else:
                break
def pop_nested_key(config, key):
    """Remove the value stored under a dotted ``key`` and return it.

    Args:
        config: nested mapping to modify in place.
        key: dotted path such as ``'a.b.c'``.  Everything before the last dot
            is walked with item access (empty path parts are skipped); the
            last part is popped from the mapping reached that way.

    Returns:
        The popped value.

    Raises:
        KeyError: if any part of the path is missing.
    """
    key_head, _, key_tail = key.rpartition('.')

    # walk down to the mapping that owns the final key
    node = config
    for key_part in key_head.split('.'):
        if key_part:
            node = node[key_part]
    return node.pop(key_tail)
def set_nested_key(config, key, value):
    """Store ``value`` under a dotted ``key``, creating missing levels.

    Args:
        config: nested mapping to modify in place.
        key: dotted path such as ``'a.b.c'``.  Every part before the last dot
            is looked up with ``setdefault(part, {})`` (empty path parts are
            skipped); the last part is assigned on the mapping reached.
        value: value to store.

    Returns:
        None.
    """
    key_head, _, key_tail = key.rpartition('.')

    # descend, creating an empty dict for every level that does not exist yet
    node = config
    for key_part in key_head.split('.'):
        if key_part:
            node = node.setdefault(key_part, {})
    node[key_tail] = value
def defunct_hash_message(primitive=None, hexstr=None, text=None):
    '''
    Convert the provided message into a message hash, to be signed.
    This provides the same prefix and hashing approach as
    :meth:`w3.eth.sign() <web3.eth.Eth.sign>`. That means that the
    message will automatically be prepended with text
    defined in EIP-191 as version 'E': ``b'\\x19Ethereum Signed Message:\\n'``
    concatenated with the number of bytes in the message.

    Awkwardly, the number of bytes in the message is encoded in decimal ascii. So
    if the message is 'abcde', then the length is encoded as the ascii
    character '5'. This is one of the reasons that this message format is not preferred.
    There is ambiguity when the message '00' is encoded, for example.
    Only use this method if you must have compatibility with
    :meth:`w3.eth.sign() <web3.eth.Eth.sign>`.

    Supply exactly one of the three arguments:
    bytes, a hex string, or a unicode string.

    :param primitive: the binary message to be signed
    :type primitive: bytes or int
    :param str hexstr: the message encoded as hex
    :param str text: the message as a series of unicode characters (a normal Py3 str)
    :returns: The hash of the message, after adding the prefix
    :rtype: ~hexbytes.main.HexBytes

    .. code-block:: python

        >>> from newchain_account.messages import defunct_hash_message

        >>> msg = "I♥SF"
        >>> defunct_hash_message(text=msg)
        HexBytes('0x1476abb745d423bf09273f1afd887d951181d25adc66c4834a70491911b7f750')

        # these four also produce the same hash:
        >>> defunct_hash_message(w3.toBytes(text=msg))
        HexBytes('0x1476abb745d423bf09273f1afd887d951181d25adc66c4834a70491911b7f750')

        >>> defunct_hash_message(bytes(msg, encoding='utf-8'))
        HexBytes('0x1476abb745d423bf09273f1afd887d951181d25adc66c4834a70491911b7f750')

        >>> Web3.toHex(text=msg)
        '0x49e299a55346'
        >>> defunct_hash_message(hexstr='0x49e299a55346')
        HexBytes('0x1476abb745d423bf09273f1afd887d951181d25adc66c4834a70491911b7f750')

        >>> defunct_hash_message(0x49e299a55346)
        HexBytes('0x1476abb745d423bf09273f1afd887d951181d25adc66c4834a70491911b7f750')
    '''
    message_bytes = to_bytes(primitive, hexstr=hexstr, text=text)
    # wrap with the signing prefix, hash, then present the digest as HexBytes
    wrapped = signature_wrapper(message_bytes)
    digest = keccak(wrapped)
    return HexBytes(digest)
def interleave(
    cls,
    datasets: List[Dataset],
    identifier: Identifier,
) -> Dataset:
    """Interleave a list of datasets.

    Every dataset is read in full with ``dataset[:]``; values that share a
    key across the resulting batches are interleaved into one list, and the
    merged batch is passed to ``cls.from_batch``.
    """
    batches = [dataset[:] for dataset in datasets]
    merged = tz.merge_with(
        lambda columns: list(tz.interleave(columns)),
        *batches,
    )
    return cls.from_batch(merged, identifier=identifier)
def chain(
    cls,
    datasets: List[Dataset],
    identifier: Identifier,
) -> Dataset:
    """Chain a list of datasets.

    Every dataset is read in full with ``dataset[:]``; values that share a
    key across the resulting batches are concatenated into one list, and the
    merged batch is passed to ``cls.from_batch``.
    """
    batches = [dataset[:] for dataset in datasets]
    merged = tz.merge_with(
        lambda columns: list(tz.concat(columns)),
        *batches,
    )
    return cls.from_batch(merged, identifier=identifier)
def grad_norm(grad):
    """
    Compute the l2 norm of the gradient.

    Args:
        grad (Gradient)

    Returns:
        magnitude (float)

    """
    def tensor_sum_square(tensor):
        # sum of the squared entries of one tensor
        return be.tsum(be.square(tensor))

    total = grad_accumulate(tensor_sum_square, grad)
    return sqrt(total)
def grad_magnitude(grad):
    """
    Compute the root-mean-square of the gradient.

    Args:
        grad (Gradient)

    Returns:
        magnitude (float)

    """
    def tensor_mean_square(tensor):
        # mean of the squared entries of one tensor
        return be.mean(be.square(tensor))

    # average the per-tensor means over the layer and weight entries
    n = len(grad.layers) + len(grad.weights)
    total = grad_accumulate(tensor_mean_square, grad)
    return sqrt(total / n)
def validation_middleware(make_request, web3):
    """Build a middleware that sanitizes transaction parameters.

    For ``eth_sendTransaction``, ``eth_estimateGas`` and ``eth_call`` the
    first parameter is run through the ``chainId`` validator and then
    ``transaction_normalizer`` before the request is made.  Every other
    method is forwarded unchanged.
    """
    chain_id_validator = apply_formatters_to_dict({
        'chainId': validate_chain_id(web3),
    })
    # compose runs right-to-left: validate first, normalize second
    sanitize_transaction = compose(transaction_normalizer, chain_id_validator)
    validated_methods = ('eth_sendTransaction', 'eth_estimateGas', 'eth_call')

    def middleware(method, params):
        if method not in validated_methods:
            return make_request(method, params)
        checked_params = apply_formatter_at_index(
            sanitize_transaction, 0, params)
        return make_request(method, checked_params)

    return middleware
def to_string_pairs(segmentsbytxt, separator=" + "):
    """
    Join the segment lists of every pair into strings.

    segmentsbytxt - Output from dual_segment_many.

    The per-text lists are flattened into one stream of pairs, and each
    member of a pair is joined with ``separator``.  The pairs are produced
    by ``map``, so wrap the result in ``list`` to materialise it:

    >>> exdata = [[([u"foo"], [u"foo"])],
    ...           [([u"foo", u"bar", u"baz"], [u"foo", u"bar", u"baz"])]]
    >>> list(to_string_pairs(exdata))
    [('foo', 'foo'), ('foo + bar + baz', 'foo + bar + baz')]
    >>> list(to_string_pairs(exdata, separator=", "))
    [('foo', 'foo'), ('foo, bar, baz', 'foo, bar, baz')]
    """
    def join_pair(pair):
        return tuple(map(separator.join, pair))

    return map(join_pair, tlz.concat(segmentsbytxt))
def pop_nested_key(config, key):
    """Pop the value addressed by a dotted ``key`` out of ``config``.

    Args:
        config: nested mapping, modified in place.
        key: dotted path (for example ``'a.b.c'``).  The parts before the
            final dot are followed with item access, ignoring empty parts;
            the final part is popped from the mapping that walk ends on.

    Returns:
        The value that was removed.

    Raises:
        KeyError: if a part of the path does not exist.
    """
    parent_path, _, leaf = key.rpartition('.')

    # follow the parent path down to the mapping holding the leaf
    current = config
    for part in filter(None, parent_path.split('.')):
        current = current[part]
    return current.pop(leaf)
def test_extract_links():
    """Check ``extract_links`` on single links and on runs of several links."""
    def first_link(text):
        return tuple(next(iter(extract_links(text))))

    first_link_cases = [
        ("[[foo|bar]]", ("Foo", "bar")),
        ("[[foo]]", ("Foo", "foo")),
        ("[[File:picture!]] [[foo]]", ("Foo", "foo")),
        ("[[foo]]bar.", ("Foo", "foobar")),
        ("[[baz|foobar]];", ("Baz", "foobar")),
        ("[[baz#quux]]", ("Baz", "baz#quux")),
        ("[[baz#quux|bla]]", ("Baz", "bla")),
        ("[[FOO_BAR|foo bar]]", ("FOO BAR", "foo bar")),
        # Links like these commonly occur in nlwiki (and presumably dewiki
        # and other compounding languages):
        ("foo[[baz|bar]]", ("Baz", "foobar")),
        # MediaWiki only considers alphabetic characters outside [[]] part
        # of the anchor.
        ("foo-[[bar]]", ("Bar", "bar")),
        ("[[bar]]/baz", ("Bar", "bar")),
        # XXX The following are broken. They do occur in the wild, e.g.,
        # -18[[Celsius|°C]] and 700[[Megabyte|MB]]-cd (found in nlwiki dump).
        # ("[[bar]]0", ("Bar", "bar")),
        # ("[[bar]]_", ("Bar", "bar")),
        # We're not interested in section links
        ("[[#Some section|elsewhere]] [[other_article]]",
         ("Other article", "other_article")),
    ]
    for text, expected in first_link_cases:
        assert_equal(first_link(text), expected)

    all_links_cases = [
        # This construct appears in enwiki for chemical formulae etc., but
        # also in nlwiki (and dewiki?) for more general compound nouns. The
        # current handling may not be exactly what we want; any fix should
        # update the test accordingly.
        ("[[Lithium|Li]][[Fluorine|F]]",
         [("Lithium", "Li"), ("Fluorine", "F")]),
        ("[[tera-|tera]][[becquerel]]s",
         [("Tera-", "tera"), ("Becquerel", "becquerels")]),
        ("""[[Lord's prayer]] [[Dismissal (cricket)|dismissal]] [[Badass|Chuck Norris]]""",
         [("Lord's prayer", "Lord's prayer"),
          ("Dismissal (cricket)", "dismissal"),
          ("Badass", "Chuck Norris")]),
        ("[[C. Stephen Evans | Evans, C. Stephen]]",
         [('C. Stephen Evans', 'Evans, C. Stephen')]),
    ]
    for text, expected in all_links_cases:
        assert_equal(list(extract_links(text)), expected)
def test_extract_links():
    """Exercise ``extract_links``: first-link extraction, then full results."""
    def first_link(source):
        links = iter(extract_links(source))
        return tuple(next(links))

    def check_first(source, target, anchor):
        assert_equal(first_link(source), (target, anchor))

    def check_all(source, expected_links):
        assert_equal(list(extract_links(source)), expected_links)

    check_first("[[foo|bar]]", "Foo", "bar")
    check_first("[[foo]]", "Foo", "foo")
    check_first("[[File:picture!]] [[foo]]", "Foo", "foo")
    check_first("[[foo]]bar.", "Foo", "foobar")
    check_first("[[baz|foobar]];", "Baz", "foobar")
    check_first("[[baz#quux]]", "Baz", "baz#quux")
    check_first("[[baz#quux|bla]]", "Baz", "bla")
    check_first("[[FOO_BAR|foo bar]]", "FOO BAR", "foo bar")

    # Links like these commonly occur in nlwiki (and presumably dewiki and
    # other compounding languages):
    check_first("foo[[baz|bar]]", "Baz", "foobar")

    # MediaWiki only considers alphabetic characters outside [[]] part of the
    # anchor.
    check_first("foo-[[bar]]", "Bar", "bar")
    check_first("[[bar]]/baz", "Bar", "bar")

    # XXX The following are broken. They do occur in the wild, e.g.,
    # -18[[Celsius|°C]] and 700[[Megabyte|MB]]-cd (found in nlwiki dump).
    # check_first("[[bar]]0", "Bar", "bar")
    # check_first("[[bar]]_", "Bar", "bar")

    # We're not interested in section links
    check_first("[[#Some section|elsewhere]] [[other_article]]",
                "Other article", "other_article")

    # This construct appears in enwiki for chemical formulae etc., but also in
    # nlwiki (and dewiki?) for more general compound nouns. The current
    # handling may not be exactly what we want; any fix should update the test
    # accordingly.
    check_all("[[Lithium|Li]][[Fluorine|F]]",
              [("Lithium", "Li"), ("Fluorine", "F")])
    check_all("[[tera-|tera]][[becquerel]]s",
              [("Tera-", "tera"), ("Becquerel", "becquerels")])
    check_all(
        """[[Lord's prayer]] [[Dismissal (cricket)|dismissal]] [[Badass|Chuck Norris]]""",
        [("Lord's prayer", "Lord's prayer"),
         ("Dismissal (cricket)", "dismissal"),
         ("Badass", "Chuck Norris")],
    )
    check_all("[[C. Stephen Evans | Evans, C. Stephen]]",
              [("C. Stephen Evans", "Evans, C. Stephen")])
def process(args, i):
    """Add extraction labels to ``<args.data_dir>/<args.mode>/<i>.json``.

    The article and abstract are tokenized with ``_split_words``; when both
    are non-empty ``get_extract_label`` supplies the ``extracted`` and
    ``score`` entries, otherwise both are empty lists.  The file is then
    rewritten in place.
    """
    json_path = join(join(args.data_dir, args.mode), '{}.json'.format(i))
    with open(json_path) as f:
        data = json.load(f)

    art_sents = list(_split_words(data['article']))
    abs_sents = list(_split_words(data['abstract']))

    # some data contains empty article/abstract
    extracted, scores = [], []
    if art_sents and abs_sents:
        extracted, scores = get_extract_label(art_sents, abs_sents)

    data['extracted'] = extracted
    data['score'] = scores
    with open(json_path, 'w') as f:
        json.dump(data, f, indent=4)
def set_nested_key(config, key, value):
    """Assign ``value`` at a dotted ``key`` inside ``config``.

    Args:
        config: nested mapping, modified in place.
        key: dotted path (for example ``'a.b.c'``).  Each part before the
            final dot is fetched with ``setdefault(part, {})``, so missing
            levels are created as empty dicts; empty parts are ignored.
        value: value assigned to the final part.

    Returns:
        None.
    """
    parent_path, _, leaf = key.rpartition('.')

    # walk to the parent mapping, creating levels on the way
    current = config
    for part in filter(None, parent_path.split('.')):
        current = current.setdefault(part, {})
    current[leaf] = value
def process(split, i):
    """Add extraction labels to ``<DATA_DIR>/<split>/<i>.json``.

    The file is read and written as UTF-8.  The article and abstract are
    tokenized with ``_split_words``; when both are non-empty
    ``get_extract_label`` supplies the ``extracted`` and ``score`` entries,
    otherwise both are empty lists.  The file is rewritten in place without
    escaping non-ASCII characters.

    Args:
        split: name of the sub-directory of ``DATA_DIR``.
        i: index used to build the ``<i>.json`` file name.
    """
    data_dir = join(DATA_DIR, split)
    with open(join(data_dir, '{}.json'.format(i)), encoding='utf-8') as f:
        # json.loads no longer accepts an ``encoding`` keyword (removed in
        # Python 3.9); the text is already decoded by open().
        data = json.loads(f.read())
    tokenize = compose(list, _split_words)
    art_sents = tokenize(data['article'])
    abs_sents = tokenize(data['abstract'])
    if art_sents and abs_sents:  # some data contains empty article/abstract
        extracted, scores = get_extract_label(art_sents, abs_sents)
    else:
        extracted, scores = [], []
    data['extracted'] = extracted
    data['score'] = scores
    with open(join(data_dir, '{}.json'.format(i)), 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=4, ensure_ascii=False)
def process(split, i):
    """Label example ``i`` of ``split`` and write it back to its JSON file.

    Reads ``<DATA_DIR>/<split>/<i>.json``, tokenizes the article and abstract
    with ``_split_words`` and stores the ``extracted``/``score`` results of
    ``get_extract_label`` (empty lists when either text is empty).
    """
    path = join(join(DATA_DIR, split), '{}.json'.format(i))
    with open(path) as f:
        data = json.load(f)

    art_sents, abs_sents = (
        list(_split_words(data[field])) for field in ('article', 'abstract')
    )

    # some data contains empty article/abstract
    has_both = art_sents and abs_sents
    extracted, scores = (
        get_extract_label(art_sents, abs_sents) if has_both else ([], [])
    )

    data['extracted'] = extracted
    data['score'] = scores
    with open(path, 'w') as f:
        json.dump(data, f, indent=4)
def from_batches(
    cls,
    batches: Sequence[Batch],
    identifier: Identifier = None,
    dataset_fmt: str = "in_memory",
) -> Dataset:
    """Convert a list of batches to a dataset.

    Values that share a key across the batches are concatenated into one
    list, and the merged batch is passed to ``cls.from_batch``.
    """
    merged = tz.merge_with(
        lambda columns: list(tz.concat(columns)),
        *batches,
    )
    return cls.from_batch(
        merged,
        identifier=identifier,
        dataset_fmt=dataset_fmt,
    )
def get_observations(location_name, key):
    """Get a cleaned up list of observations for the last 24 hours.

    `location_name` is looked up in the _met_office_location_codes dict.
    `key` must be an API key for the met office.

    Both arguments are handed to ``observations_url``; the result is fetched
    with ``helpers.pull_json`` and passed through ``clean_data``.
    """
    url = observations_url(location_name, key)
    payload = helpers.pull_json(url)
    return clean_data(payload)
def score(self, batch: Dict[str, List], columns: List[str], *args, **kwargs) -> np.ndarray:
    """Score a batch by token counts, reduced over ``columns``.

    For every column the Spacy tokens of each text are retrieved and counted;
    the per-column count arrays are stacked and reduced along axis 0 with
    ``self.reduction_fn``.
    """
    def count_tokens(tokenized_texts):
        # Compute lengths (# of words) for each tokenized text in a batch
        return np.array([len(tokens) for tokens in tokenized_texts])

    # Compute the length of each example under each key
    lengths = []
    for key in columns:
        retrieved = Spacy.retrieve(
            batch=batch,
            columns=[key],
            # compose runs right-to-left: extract tokens, then count them
            proc_fns=tz.compose(count_tokens, Spacy.tokens),
        )
        lengths.append(retrieved[key])

    # Reduction over the key axis
    return self.reduction_fn(np.array(lengths), axis=0)
def __getitem__(self, key: str) -> iter:
    """Return an iterator over the decorated data registered under ``key``.

    The entry ``self.map[key]`` is filled in with defaults on first use
    (``api='dbpy'``, ``id=key``, ``deco=identity``) and its reader is stored
    in ``self.cache``.

    Args:
        key: name of an entry in ``self.map``.

    Returns:
        For the ``dbpy`` api, ``deco`` mapped over the cached data; for the
        ``stpy`` api, ``deco`` mapped over the cached wrapper indexed by each
        of ``self.low_tags``.

    Raises:
        ValueError: if ``key`` is not in ``self.map`` or the entry's api is
            neither ``'dbpy'`` nor ``'stpy'``.
    """
    if key not in self.map:
        # str.join copes with an empty map, where reduce() would raise
        # TypeError and hide the real problem
        valid_keys = ', '.join("'{}'".format(k) for k in self.map)
        raise ValueError(
            dedent("""\
                   Key '{}' is invalid!
                   Valid keys: {}
                   """.format(key, valid_keys)))

    ref = self.map[key]
    api = ref.setdefault('api', 'dbpy')  # default api

    # load reader
    if key not in self.cache:
        print("Loading '{}' reader...".format(key))
        if api not in ('dbpy', 'stpy'):
            raise ValueError("Invalid api type '{}'!".format(api))
        reader_id = ref.setdefault('id', key)  # default id
        if api == 'dbpy':
            self.cache[key] = fromiter(
                read_syncdatalist_float(reader_id, self.hi_tag,
                                        tuple(map(int, self.low_tags))),
                'float')
        if api == 'stpy':
            self.cache[key] = StorageWrapper(*map(int, self.runs),
                                             beamline=self.beamline,
                                             id=reader_id)
        ref.setdefault('deco', identity)  # default deco
        print('Loaded!')

    data = self.cache[key]
    # NOTE(review): a non-callable 'deco' is evaluated with eval(); this is
    # only safe while the map comes from a trusted source.
    deco = ref['deco'] if callable(ref['deco']) else eval(ref['deco'])
    if api == 'dbpy':
        return map(deco, data)
    if api == 'stpy':
        return map(compose(deco, data.__getitem__), self.low_tags)
def all_of(inners, arg):
    """All of the inner validators must pass.

    The order of inner validators matters.

    Parameters
    ----------
    inners : List[validator]
      Functions are applied from right to left so allof([rule1, rule2], arg) is
      the same as rule1(rule2(arg)).
    arg : Any
      Value to be validated.

    Returns
    -------
    arg : Any
      Value maybe coerced by inner validators to the appropriate types
    """
    validate = compose(*inners)
    return validate(arg)
def process(split, i):
    """Label email ``i`` of ``split`` and write it back to its JSON file.

    Reads ``<DATA_DIR>/<split>/<i>.json`` (printing the path), tokenizes the
    ``email_body`` and ``subject`` fields with ``_split_words`` and stores the
    ``extracted``/``score`` results of ``get_extract_label`` (empty lists when
    either text is empty).
    """
    json_path = join(join(DATA_DIR, split), '{}.json'.format(i))
    with open(json_path) as f:
        print(json_path)
        data = json.load(f)

    body_sents = list(_split_words(data['email_body']))
    subject_sents = list(_split_words(data['subject']))

    # some data contains empty article/abstract
    extracted, scores = [], []
    if body_sents and subject_sents:
        extracted, scores = get_extract_label(body_sents, subject_sents)

    data['extracted'] = extracted
    data['score'] = scores
    with open(json_path, 'w') as f:
        json.dump(data, f, indent=4)
def all_of(inners, arg):
    """Run ``arg`` through every inner validator; all of them must pass.

    The order of inner validators matters.

    Parameters
    ----------
    inners : List[validator]
      Functions are applied from right to left so allof([rule1, rule2], arg) is
      the same as rule1(rule2(arg)).
    arg : Any
      Value to be validated.

    Returns
    -------
    arg : Any
      Value maybe coerced by inner validators to the appropriate types
    """
    combined_validator = compose(*inners)
    validated = combined_validator(arg)
    return validated
def process(split, i):
    """Attach extraction labels to example ``i`` of ``split``.

    The example is loaded from ``<DATA_DIR>/<split>/<i>.json``; its article
    and abstract are tokenized with ``_split_words`` and, when both are
    non-empty, labelled with ``get_extract_label``.  The ``extracted`` and
    ``score`` entries are stored and the file is rewritten in place.
    """
    example_path = join(join(DATA_DIR, split), '{}.json'.format(i))
    with open(example_path) as f:
        data = json.load(f)

    def tokenize(text):
        return list(_split_words(text))

    art_sents = tokenize(data['article'])
    abs_sents = tokenize(data['abstract'])

    if not (art_sents and abs_sents):
        # some data contains empty article/abstract
        extracted, scores = [], []
    else:
        extracted, scores = get_extract_label(art_sents, abs_sents)

    data.update(extracted=extracted, score=scores)
    with open(example_path, 'w') as f:
        json.dump(data, f, indent=4)
def label(split):
    """Add extraction labels to every JSON example of ``split``.

    Each ``<DATA_DIR>/<split>/<i>.json`` for ``i`` in ``range(n_data)`` is
    read, its article and abstract are tokenized with ``_split_words`` and
    labelled with ``get_extract_label``, and the file is rewritten in place
    with the ``extracted`` and ``score`` entries.  Progress and the total
    elapsed time are printed.

    Args:
        split: name of the sub-directory of ``DATA_DIR`` to process.
    """
    start = time()
    print('start processing {} split...'.format(split))
    data_dir = join(DATA_DIR, split)
    n_data = count_data(data_dir)
    # the tokenizer does not depend on the example, so build it once
    tokenize = compose(list, _split_words)
    for i in range(n_data):
        # str.format does not treat '%' specially, so a single '%' is what
        # prints one percent sign
        print('processing {}/{} ({:.2f}%)\r'.format(i, n_data, 100*i/n_data),
              end='')
        with open(join(data_dir, '{}.json'.format(i))) as f:
            data = json.loads(f.read())
        art_sents = tokenize(data['article'])
        abs_sents = tokenize(data['abstract'])
        extracted, scores = get_extract_label(art_sents, abs_sents)
        data['extracted'] = extracted
        data['score'] = scores
        with open(join(data_dir, '{}.json'.format(i)), 'w') as f:
            json.dump(data, f, indent=4)
    print('finished in {}'.format(timedelta(seconds=time()-start)))
def process(split, i):
    """Label example ``i`` of ``split`` and rewrite its article and abstract.

    After tokenizing with ``_split_words``, ``get_extract_label`` returns the
    labels, scores, new abstract sentences and article sentences when both
    texts are non-empty.  The sentence lists are stored space-joined under
    ``new_abs_sents`` and ``article``, the processed file name is printed and
    the JSON file is rewritten in place.
    """
    file_name = '{}.json'.format(i)
    json_path = join(join(DATA_DIR, split), file_name)
    with open(json_path) as f:
        data = json.load(f)

    art_sents = list(_split_words(data['article']))
    abs_sents = list(_split_words(data['abstract']))

    # some data contains empty article/abstract
    extracted, scores, new_abs_sents = [], [], []
    if art_sents and abs_sents:
        extracted, scores, new_abs_sents, art_sents = get_extract_label(
            art_sents, abs_sents)

    data['extracted'] = extracted
    data['score'] = scores
    data['new_abs_sents'] = [' '.join(sent) for sent in new_abs_sents]
    data['article'] = [' '.join(sent) for sent in art_sents]
    print(split, file_name)
    with open(json_path, 'w') as f:
        json.dump(data, f, indent=4)
def get_nested_key(config, key):
    """Return the value stored under a dotted ``key``.

    Args:
        config: nested mapping to read from.
        key: dotted path such as ``'a.b.c'``.  Everything before the last dot
            is walked with item access (empty path parts are skipped) and
            the last part is then looked up on the value reached.

    Returns:
        The value found at the end of the path.

    Raises:
        KeyError: if a part of the path is missing, or if item access raises
            ``TypeError`` along the way (re-raised as ``KeyError`` with the
            original error chained).
    """
    key_head, _, key_tail = key.rpartition('.')

    try:
        node = config
        for key_part in key_head.split('.'):
            if key_part:
                node = node[key_part]
        return node[key_tail]
    except TypeError as err:
        raise KeyError("Error getting nested key {0} from {1}: {2}".format(
            key,
            force_text(repr(config)),
            str(err),
        )) from err
def create_slice(args):
    """Build one Slice of a dataset from a packed argument tuple.

    ``args`` unpacks to ``(dataset, slice_membership, slice_batches, i,
    batch_size, slice_cache_hash)``.  A Slice sharing the dataset's
    ``__dict__`` is filtered to the examples whose ``slice_membership[idx, i]``
    is truthy.  If any remain, their columns are replaced by the
    corresponding window of the merged ``slice_batches``.  Both steps use
    cache files under ``dataset.logdir`` named after ``slice_cache_hash``.

    Returns:
        The resulting Slice.
    """
    # Unpack args
    dataset, slice_membership, slice_batches, i, batch_size, slice_cache_hash = args

    # Create a new empty slice
    sl = Slice.from_dict({})

    # Create a Slice "copy" of the Dataset
    sl.__dict__.update(dataset.__dict__)
    sl._identifier = None

    # Filter: keep the examples that are members of slice column i
    sl = sl.filter(
        lambda example, idx: bool(slice_membership[idx, i]),
        with_indices=True,
        input_columns=["index"],
        batch_size=batch_size,
        cache_file_name=str(
            dataset.logdir / ("cache-" + str(abs(slice_cache_hash)) + "-filter.arrow")
        ),
    )

    # Merge the slice batches into one dict of concatenated lists per key
    slice_batch = tz.merge_with(tz.compose(list, tz.concat), slice_batches)

    # FIXME(karan): interaction tape history is wrong here, esp with augmentation/attacks

    # Map: replace each batch with the window of slice_batch that starts at
    # the batch's first index
    # NOTE(review): this presumably relies on the filtered examples lining up
    # with slice_batch by position -- confirm against the caller
    if len(sl):
        sl = sl.map(
            lambda batch, indices: tz.valmap(
                lambda v: v[indices[0] : indices[0] + batch_size], slice_batch
            ),
            batched=True,
            batch_size=batch_size,
            with_indices=True,
            remove_columns=sl.column_names,
            cache_file_name=str(
                dataset.logdir / ("cache-" + str(abs(slice_cache_hash)) + ".arrow")
            ),
        )

    return sl
def process(split, i):
    """Add extraction labels to ``<DATA_DIR>/<split>/<i>.json``.

    A file that cannot be parsed as JSON is treated as an example with an
    empty article and abstract.  Non-empty fields are tokenized with
    ``_split_words``; when both yield sentences ``get_extract_label`` supplies
    the ``extracted`` and ``score`` entries, otherwise both are empty lists.
    The file is then rewritten in place.

    Args:
        split: name of the sub-directory of ``DATA_DIR``.
        i: index used to build the ``<i>.json`` file name.
    """
    data_dir = join(DATA_DIR, split)
    with open(join(data_dir, '{}.json'.format(i))) as f:
        try:
            data = json.loads(f.read())
        except JSONDecodeError:
            # best effort: fall back to an empty example
            data = {'article': '', 'abstract': ''}
    tokenize = compose(list, _split_words)
    # compare by value: identity checks against a string literal depend on
    # interning and raise a SyntaxWarning on current Python versions
    art_sents = tokenize(data['article']) if data['article'] != '' else []
    abs_sents = tokenize(data['abstract']) if data['abstract'] != '' else []
    if art_sents and abs_sents:  # some data contains empty article/abstract
        extracted, scores = get_extract_label(art_sents, abs_sents)
    else:
        extracted, scores = [], []
    data['extracted'] = extracted
    data['score'] = scores
    with open(join(data_dir, '{}.json'.format(i)), 'w') as f:
        json.dump(data, f, indent=4)
def build_batchers_entity(net_type, word2id, cuda, debug):
    """Create the training and validation batch generators for entity models.

    ``net_type`` must be ``'entity'``.  Both generators read
    ``EntityExtractDataset_combine`` splits using the
    ``'filtered_rule23_6_input_mention_cluster'`` key; only the training
    loader shuffles (unless ``debug``) and only the validation generator is
    ``single_run``.

    Returns:
        tuple: ``(train_batcher, val_batcher)``.
    """
    assert net_type in ['entity']
    preprocess = prepro_fn_extract_entity(args.max_word, args.max_sent)

    def sort_key(sample):
        # bucket by the number of source sentences (first element of a sample)
        return len(sample[0])

    key = 'filtered_rule23_6_input_mention_cluster'
    to_batch = compose(
        batchify_fn_extract_ptr_entity(PAD, cuda=cuda),
        convert_batch_extract_ptr_entity(UNK, word2id),
    )

    # worker processes are only used on GPU runs outside of debug mode
    n_workers = 4 if cuda and not debug else 0
    use_fork = not debug

    train_loader = DataLoader(
        EntityExtractDataset_combine('train', key),
        batch_size=BUCKET_SIZE,
        shuffle=not debug,
        num_workers=n_workers,
        collate_fn=coll_fn_extract_entity,
    )
    train_batcher = BucketedGenerater(
        train_loader, preprocess, sort_key, to_batch,
        single_run=False, fork=use_fork,
    )

    val_loader = DataLoader(
        EntityExtractDataset_combine('val', key),
        batch_size=BUCKET_SIZE,
        shuffle=False,
        num_workers=n_workers,
        collate_fn=coll_fn_extract_entity,
    )
    val_batcher = BucketedGenerater(
        val_loader, preprocess, sort_key, to_batch,
        single_run=True, fork=use_fork,
    )
    return train_batcher, val_batcher
def get_nested_key(config, key):
    """Look up a dotted ``key`` inside ``config``.

    Args:
        config: nested mapping to read from.
        key: dotted path (for example ``'a.b.c'``).  The parts before the
            final dot are followed with item access, ignoring empty parts;
            the final part is then looked up on the value that walk ends on.

    Returns:
        The value at the end of the path.

    Raises:
        KeyError: if a part of the path does not exist, or if item access
            raises ``TypeError`` on the way (re-raised as ``KeyError`` with
            the original error chained).
    """
    parent_path, _, leaf = key.rpartition('.')

    try:
        current = config
        for part in filter(None, parent_path.split('.')):
            current = current[part]
        return current[leaf]
    except TypeError as err:
        raise KeyError(
            "Error getting nested key {0} from {1}: {2}".format(
                key,
                force_text(repr(config)),
                str(err),
            )
        ) from err
# Filter params: rename the camelCase block-range keys to snake_case.
FILTER_PARAMS_MAPPINGS = dict(
    fromBlock='from_block',
    toBlock='to_block',
)
filter_params_remapper = apply_key_map(FILTER_PARAMS_MAPPINGS)

# Filter params: both block-range values go through to_integer_if_hex.
FILTER_PARAMS_FORMATTERS = dict.fromkeys(
    ('fromBlock', 'toBlock'),
    to_integer_if_hex,
)
filter_params_formatter = apply_formatters_to_dict(FILTER_PARAMS_FORMATTERS)

# compose runs right-to-left: format the values first, then remap the keys
filter_params_transformer = compose(
    filter_params_remapper,
    filter_params_formatter,
)

# Transactions: an empty-bytes 'to' value is replaced by None.
_is_empty_bytes = partial(operator.eq, b'')
TRANSACTION_FORMATTERS = {
    'to': apply_formatter_if(_is_empty_bytes, static_return(None)),
}
transaction_formatter = apply_formatters_to_dict(TRANSACTION_FORMATTERS)

# Receipts: every entry of 'logs' goes through log_key_remapper.
RECEIPT_FORMATTERS = {
    'logs': apply_formatter_to_array(log_key_remapper),
}
# Package metadata.
__title__ = 'text2math'
__author__ = 'Steven Cutting'
__author_email__ = '*****@*****.**'
__created_on__ = '02/06/2016'
__copyright__ = "text2math Copyright (C) 2016 Steven Cutting"

import cytoolz as tlz

from text2math.raw2text import(remove_html_bits, decode_and_fix, adv_decode)
from text2math.text2tokens import(ngram, unigram, bigram, trigram,
                                  uni_and_bigram_tuples)
from text2math.tokens2numbers import freq

# Pipeline applied right-to-left: remove_html_bits, then decode_and_fix, then
# uni_and_bigram_tuples; the result is collected into a tuple.
tknize_uni_n_bi = tlz.compose(tuple, uni_and_bigram_tuples, decode_and_fix,
                              remove_html_bits)

# Flattens its input with tlz.concat and passes the result to freq.
total_counts = tlz.compose(freq, tlz.concat)
def hashMessage(data=None, hexstr=None, text=None):
    """Hash a message after passing it through ``signature_wrapper``.

    The message is converted with ``to_bytes`` from ``data``, ``hexstr`` or
    ``text``, wrapped, hashed with ``keccak`` and returned as ``HexBytes``.
    """
    message_bytes = to_bytes(data, hexstr=hexstr, text=text)
    wrapped = signature_wrapper(message_bytes)
    digest = keccak(wrapped)
    return HexBytes(digest)