def compute_rouge_l_sent_level(outputs, reference, mode='f'):
    """Compute ROUGE-L for a single pair of summary and reference.

    ``outputs`` and ``reference`` are both lists of sentences, and every
    sentence is a list of words.

    Precision sums, over the output sentences, the LCS length against the
    flattened reference and divides by the number of output words. Recall
    sums, over the reference sentences, the LCS length against the flattened
    output and divides by the number of reference words.

    Args:
        outputs: summary sentences (list of lists of words).
        reference: reference sentences (list of lists of words).
        mode: 'f' for F-1, 'p' for precision, 'r' for recall.

    Returns:
        The requested score, or 0.0 when the precision-side LCS total is 0.
    """
    assert mode in list('fpr')  # F-1, precision, recall

    # Flatten each side once; both word lists are loop invariants.
    reference_words = list(concat(reference))
    # The recall side must be compared against the WHOLE output. Flattening
    # the leftover loop variable (a single sentence) would split its words
    # into characters and fail outright when ``outputs`` is empty.
    output_words = list(concat(outputs))

    lcs = 0
    word_count = 0
    for output in outputs:
        lcs += _lcs_len(output, reference_words)
        word_count += len(output)

    lcs_r = 0
    sum_count = 0
    for ref in reference:
        lcs_r += _lcs_len(ref, output_words)
        sum_count += len(ref)

    if lcs == 0:
        score = 0.0
    else:
        precision = lcs / word_count
        recall = lcs_r / sum_count
        f_score = 2 * (precision * recall) / (precision + recall)
        if mode == 'p':
            score = precision
        elif mode == 'r':
            score = recall
        else:
            score = f_score
    return score
async def poll_erc20_logs_loop(self):
    """Consume batches of new blocks forever and dispatch ERC20 log entries.

    Every iteration awaits one item from ``self._new_blocks_queue``, asks
    each contract event logger for new Transfer and Approval entries for
    those blocks, and passes every entry to ``self._handle_event_data``
    (all transfers first, then all approvals).

    Cancellation is re-raised, a timeout restarts the loop, and any other
    exception is reported via ``self.logger().network`` before the next
    iteration.
    """
    while True:
        try:
            new_blocks: List[AttributeDict] = await self._new_blocks_queue.get()

            # Build one pending call per (contract, event type).
            transfer_jobs = []
            approval_jobs = []
            for contract_address in self._addresses_to_contracts:
                event_logger: ContractEventLogger = self._contract_event_loggers[contract_address]
                transfer_jobs.append(
                    event_logger.get_new_entries_from_logs(TRANSFER_EVENT_NAME, new_blocks)
                )
                approval_jobs.append(
                    event_logger.get_new_entries_from_logs(APPROVAL_EVENT_NAME, new_blocks)
                )

            transfer_results = await safe_gather(*transfer_jobs)
            approval_results = await safe_gather(*approval_jobs)

            # Each logger yields a sequence of entries; flatten per event type.
            transfers = list(cytoolz.concat(transfer_results))
            approvals = list(cytoolz.concat(approval_results))

            for event_data in transfers:
                await self._handle_event_data(event_data)
            for event_data in approvals:
                await self._handle_event_data(event_data)
        except asyncio.CancelledError:
            raise
        except asyncio.TimeoutError:
            continue
        except Exception:
            self.logger().network(
                "Error fetching new events from ERC20 contracts.",
                exc_info=True,
                app_warning_msg="Error fetching new events from ERC20 contracts. "
                                "Check wallet network connection",
            )
def a2c_validate(agent, abstractor, loader):
    """Validate the agent and report the mean ROUGE-1 reward scaled by 100.

    For each batch the agent picks sentence indices per article, the picked
    sentences are rewritten by ``abstractor`` in a single call, and each
    article's rewritten slice is scored against its reference with
    ``compute_rouge_n(..., n=1)``.

    Returns:
        dict: ``{'reward': avg_reward}``.

    Raises:
        ZeroDivisionError: if ``loader`` yields no samples.
    """
    agent.eval()
    start = time()
    print('start running validation...', end='')
    avg_reward = 0
    i = 0
    with torch.no_grad():
        for art_batch, abs_batch in loader:
            ext_sents = []
            ext_inds = []
            for raw_arts in art_batch:
                indices = agent(raw_arts)
                # (offset into ext_sents, number of picks minus one)
                ext_inds.append((len(ext_sents), len(indices) - 1))
                picked = [raw_arts[idx.item()]
                          for idx in indices
                          if idx.item() < len(raw_arts)]
                ext_sents.extend(picked)
            all_summs = abstractor(ext_sents)
            for (offset, count), abs_sents in zip(ext_inds, abs_batch):
                hypothesis = list(concat(all_summs[offset:offset + count]))
                target = list(concat(abs_sents))
                # python ROUGE-1 (not official evaluation)
                avg_reward += compute_rouge_n(hypothesis, target, n=1)
                i += 1
    avg_reward /= (i / 100)
    elapsed = timedelta(seconds=int(time() - start))
    print('finished in {}! avg reward: {:.2f}'.format(elapsed, avg_reward))
    return {'reward': avg_reward}
def compute_rouge_l_summ(summs, refs, mode='f'):
    """Compute summary-level ROUGE-L.

    For every (reference sentence, summary sentence) pair the LCS is taken
    via ``_lcs``; a gram counts as a hit only while both the reference and
    the summary still have unused occurrences of it.

    Args:
        summs: summary sentences (list of lists of words).
        refs: reference sentences (list of lists of words).
        mode: 'f' for F-1, 'p' for precision, 'r' for recall.

    Returns:
        The requested score, or 0.0 when there are no hits.
    """
    assert mode in list('fpr')  # F-1, precision, recall
    tot_hit = 0
    ref_cnt = Counter(concat(refs))
    summ_cnt = Counter(concat(summs))
    for ref in refs:
        for summ in summs:
            lcs = _lcs(summ, ref)
            for gram in lcs:
                if ref_cnt[gram] > 0 and summ_cnt[gram] > 0:
                    tot_hit += 1
                ref_cnt[gram] -= 1
                summ_cnt[gram] -= 1
    if tot_hit == 0:
        score = 0.0
    else:
        precision = tot_hit / sum(len(s) for s in summs)
        recall = tot_hit / sum(len(r) for r in refs)
        f_score = 2 * (precision * recall) / (precision + recall)
        # Must be a single if/elif/else chain: with two separate ``if``
        # statements, mode 'p' falls into the trailing ``else`` and the
        # precision is overwritten by the F-score.
        if mode == 'p':
            score = precision
        elif mode == 'r':
            score = recall
        else:
            score = f_score
    return score
def test_gen():
    """Check that ``zbox.gen`` wraps a non-generator iterable in a generator.

    The wrapped generator must yield the same items as the original
    ``toolz.concat`` iterable.
    """
    from zbox import gen

    nested = [[1], [2], [3]]
    c = toolz.concat(nested)
    g = gen(c)

    # The input is an iterator but not a generator; the output is one.
    assert not isinstance(c, types.GeneratorType)
    assert isinstance(g, types.GeneratorType)

    expected = list(toolz.concat([[1], [2], [3]]))
    assert list(g) == expected
async def poll_erc20_logs_loop(self):
    """Consume batches of new blocks forever and dispatch ERC20 log entries.

    Every iteration awaits one batch from ``self._new_blocks_queue``,
    extracts each block's ``"hash"``, asks every contract event logger for
    new Transfer and Approval entries for those hashes, and passes each
    entry to ``self._handle_event_data`` (transfers first, then approvals).

    Cancellation is re-raised, a timeout restarts the loop, and any other
    exception is logged via ``self.logger().error`` before the next
    iteration.
    """
    while True:
        try:
            new_blocks: List[AttributeDict] = await self._new_blocks_queue.get()
            block_hashes: List[HexBytes] = [blk["hash"] for blk in new_blocks]

            # Build one pending call per (contract, event type).
            transfer_jobs = []
            approval_jobs = []
            for contract_address in self._addresses_to_contracts:
                event_logger: ContractEventLogger = self._contract_event_loggers[contract_address]
                transfer_jobs.append(
                    event_logger.get_new_entries_from_logs(TRANSFER_EVENT_NAME, block_hashes)
                )
                approval_jobs.append(
                    event_logger.get_new_entries_from_logs(APPROVAL_EVENT_NAME, block_hashes)
                )

            transfer_results = await asyncio.gather(*transfer_jobs)
            approval_results = await asyncio.gather(*approval_jobs)

            transfers = list(cytoolz.concat(transfer_results))
            approvals = list(cytoolz.concat(approval_results))

            for event_data in transfers:
                await self._handle_event_data(event_data)
            for event_data in approvals:
                await self._handle_event_data(event_data)
        except asyncio.CancelledError:
            raise
        except asyncio.TimeoutError:
            continue
        except Exception:
            self.logger().error(
                "Unknown error trying to fetch new events from ERC20 contracts.",
                exc_info=True,
            )
def a2c_validate(agent, abstractor, loader):
    """Validate the topic-conditioned agent; report mean ROUGE-1 times 100.

    For each batch the agent picks sentence indices per (article, topic)
    pair, the picked sentences are rewritten by ``abstractor`` in a single
    call, and each article's rewritten slice is scored against its reference
    with ``compute_rouge_n(..., n=1)``.

    Returns:
        dict: ``{'reward': avg_reward}``.

    Raises:
        ZeroDivisionError: if ``loader`` yields no samples.
    """
    agent.eval()
    start = time()
    print('start running validation...', end='')
    avg_reward = 0
    i = 0
    with torch.no_grad():
        for art_batch, topic_batch, abs_batch in loader:
            ext_sents = []
            ext_inds = []
            for raw_arts, topic in zip(art_batch, topic_batch):
                indices = agent(raw_arts, topic)
                # (offset into ext_sents, number of picks minus one)
                ext_inds.append((len(ext_sents), len(indices) - 1))
                picked = [raw_arts[idx.item()]
                          for idx in indices
                          if idx.item() < len(raw_arts)]
                ext_sents.extend(picked)
            all_summs = abstractor(ext_sents)
            for (offset, count), abs_sents in zip(ext_inds, abs_batch):
                hypothesis = list(concat(all_summs[offset:offset + count]))
                target = list(concat(abs_sents))
                # python ROUGE-1 (not official evaluation)
                avg_reward += compute_rouge_n(hypothesis, target, n=1)
                i += 1
    avg_reward /= (i / 100)
    elapsed = timedelta(seconds=int(time() - start))
    print('finished in {}! avg reward: {:.2f}'.format(elapsed, avg_reward))
    return {'reward': avg_reward}
def get_all_leaf_paths(coll):
    """Returns a list of paths to all leaf nodes in a nested dict.

    Paths can travel through lists and the index is inserted into the path.

    Args:
        coll: a mapping, a list, or any other value (treated as a leaf).

    Returns:
        A list of paths, each path being a list of keys/indices. A leaf
        yields ``[[]]`` (one empty path); an empty mapping or list yields
        ``[]`` (no paths).
    """
    if isinstance(coll, Mapping):
        children = coll.items()
    elif isinstance(coll, list):
        children = enumerate(coll)
    else:
        # A leaf contributes exactly one, empty, path.
        return [[]]
    # Prefix every path found below a child with that child's key/index.
    return [
        [key] + path
        for key, child in children
        for path in get_all_leaf_paths(child)
    ]
def evaluate_subset(self, subset):
    """Compute the total MDL cost for all candidates or for a subset.

    ``subset`` is either ``False`` (evaluate everything) or an index
    collection used to select from ``self.candidates``, ``self.costs``,
    ``self.matches``, ``self.pointers`` and ``self.indexes``.

    Prints a cost breakdown (plus a baseline ratio when evaluating
    everything) and returns the total MDL.
    """
    # NOTE: equality (not identity) comparison is kept on purpose so the
    # behaviour for every kind of ``subset`` value is unchanged.
    whole_set = subset == False

    # External call to looping function
    if whole_set:
        l1_cost, l2_match_cost = get_subset(
            self.candidates, self.costs, self.matches, self.pointers)
    else:
        l1_cost, l2_match_cost = get_subset(
            self.candidates[subset], self.costs[subset],
            self.matches[subset], self.pointers[subset])

    # Find unencoded indexes
    selected = range(len(self.indexes)) if whole_set else subset
    covered = list(ct.concat([self.indexes[i] for i in selected]))
    unencoded_indexes = self.max_index - len(list(ct.unique(covered)))

    # Use unencoded indexes to get regret cost
    # Regret cost applied twice, once for encoding and once for grammar
    if unencoded_indexes > 0:
        if whole_set:
            denominator = unencoded_indexes
        else:
            denominator = unencoded_indexes + len(subset)
        unencoded_cost = -math.log2(float(1.0 / denominator))
        l2_regret_cost = (unencoded_cost * unencoded_indexes) * 2
    else:
        l2_regret_cost = 0

    # Total all terms
    total_mdl = l1_cost + l2_match_cost + l2_regret_cost

    # DEBUGGING
    report = (
        ("\t\tMDL: ", total_mdl),
        ("\t\tL1 Cost: ", l1_cost),
        ("\t\tL2 Match Cost: ", l2_match_cost),
        ("\t\tL2 Regret Cost: ", l2_regret_cost),
        ("\t\tEncoded: ", self.max_index - unencoded_indexes),
        ("\t\tUnencoded: ", unencoded_indexes),
    )
    for label, value in report:
        print(label + str(value))

    # Calculate baseline
    if whole_set:
        baseline_cost_per = -math.log2(float(1.0 / self.max_index))
        baseline_mdl = baseline_cost_per * self.max_index
        print("\t\tBaseline: " + str(baseline_mdl))
        print("\t\tRatio: " + str(total_mdl / baseline_mdl))

    return total_mdl
def evaluate_subset(self, subset):
    """Return the total MDL cost of the full candidate set or of a subset.

    When ``subset`` compares equal to ``False`` every candidate is used;
    otherwise ``subset`` indexes into the candidate, cost, match, pointer
    and index collections held on ``self``. A cost breakdown is printed,
    and in the full-set case a baseline and ratio as well.
    """
    # NOTE: ``== False`` (rather than ``is False``) is preserved deliberately.
    use_everything = subset == False

    # External call to looping function
    if use_everything:
        cost_args = (self.candidates, self.costs, self.matches, self.pointers)
    else:
        cost_args = (self.candidates[subset], self.costs[subset],
                     self.matches[subset], self.pointers[subset])
    l1_cost, l2_match_cost = get_subset(*cost_args)

    # Find unencoded indexes
    if use_everything:
        index_groups = [self.indexes[i] for i in range(len(self.indexes))]
    else:
        index_groups = [self.indexes[i] for i in subset]
    encoded = list(ct.unique(list(ct.concat(index_groups))))
    unencoded_indexes = self.max_index - len(encoded)

    # Use unencoded indexes to get regret cost
    # Regret cost applied twice, once for encoding and once for grammar
    l2_regret_cost = 0
    if unencoded_indexes > 0:
        pool_size = unencoded_indexes
        if not use_everything:
            pool_size = unencoded_indexes + len(subset)
        unencoded_cost = -math.log2(float(1.0 / pool_size))
        l2_regret_cost = (unencoded_cost * unencoded_indexes) * 2

    # Total all terms
    total_mdl = l1_cost + l2_match_cost + l2_regret_cost

    # DEBUGGING
    print("\t\tMDL: " + str(total_mdl))
    print("\t\tL1 Cost: " + str(l1_cost))
    print("\t\tL2 Match Cost: " + str(l2_match_cost))
    print("\t\tL2 Regret Cost: " + str(l2_regret_cost))
    print("\t\tEncoded: " + str(self.max_index - unencoded_indexes))
    print("\t\tUnencoded: " + str(unencoded_indexes))

    # Calculate baseline
    if use_everything:
        baseline_cost_per = -math.log2(float(1.0 / self.max_index))
        baseline_mdl = baseline_cost_per * self.max_index
        print("\t\tBaseline: " + str(baseline_mdl))
        print("\t\tRatio: " + str(total_mdl / baseline_mdl))

    return total_mdl
def coll_fn(data):
    """Flatten a batch of (sources, targets) pairs and drop falsy entries.

    Returns:
        tuple: ``(sources, targets)`` as two flat lists.
    """
    source_lists, target_lists = unzip(data)
    # NOTE: independent filtering works because
    #       source and targets are matched properly by the Dataset
    sources = [src for src in concat(source_lists) if src]
    targets = [tgt for tgt in concat(target_lists) if tgt]
    assert all(sources) and all(targets)
    return sources, targets
def a2c_validate(agent, abstractor, loader):
    """Run validation and report the average ROUGE reward scaled by 100.

    For every article the agent's greedy selection is split into groups of
    sentences: an index of 0 closes the current group, any other in-range
    index appends that sentence's words to it. Each article's groups are
    passed through ``abstractor``; the flattened output is scored against
    the flattened reference with ``compute_rouge_n``.

    Returns:
        dict: ``{'reward': avg_reward}``.

    Raises:
        ZeroDivisionError: if ``loader`` yields no samples.
    """
    agent.eval()
    start = time()
    print('start running validation...', end='')
    avg_reward = 0
    i = 0
    with torch.no_grad():
        # NOTE(review): ``extract`` is unpacked but not used below.
        for art_batch, abs_batch, extract in loader:
            greedy_inputs = []
            for idx, raw_arts in enumerate(art_batch):
                greedy, sample, log_probs = agent(raw_arts, sample_time=1, validate=True)
                # NOTE(review): ``sample`` and ``log_probs`` are not used
                # after these two assignments.
                sample = sample[0]
                log_probs = log_probs[0]
                # NOTE(review): the next two results are overwritten by
                # ``greedy_sents = []`` below, so only the indexing side
                # effects (e.g. an IndexError) of these lines remain.
                greedy_sents = [raw_arts[ind] for ind in greedy]
                greedy_sents = [word for sent in greedy_sents for word in sent]
                #print(greedy_sents)
                #greedy_sents = list(concat(greedy_sents))
                greedy_sents = []
                ext_sent = []
                for ids in greedy:
                    # Indices at or beyond the article length are skipped.
                    if ids < len(raw_arts):
                        if ids == 0:
                            # Index 0 closes the group collected so far.
                            if ext_sent:
                                greedy_sents.append(ext_sent)
                            ext_sent = []
                        else:
                            ext_sent += raw_arts[ids]
                # Flush a trailing group that was not closed by a 0.
                if greedy[-1] != 0 and ext_sent:
                    greedy_sents.append(ext_sent)
                #print(greedy_sents)
                #exit()
                greedy_inputs.append(greedy_sents)
            greedy_abstracts = []
            for abs_src in greedy_inputs:
                with torch.no_grad():
                    greedy_outputs = abstractor(abs_src)
                #greedy_abstract = []
                #for greedy_sents in greedy_outputs:
                #    greedy_sents = sent_tokenize(' '.join(greedy_sents))
                #    greedy_sents = [sent.strip().split(' ') for sent in greedy_sents]
                #    greedy_abstract += greedy_sents
                greedy_abstract = list(concat(greedy_outputs))
                greedy_abstracts.append(greedy_abstract)
            for idx, greedy_sents in enumerate(greedy_abstracts):
                abss = abs_batch[idx]
                bs = compute_rouge_n(greedy_sents, list(concat(abss)))
                avg_reward += bs
                i += 1
                #print(i)
                #print(avg_reward)
                #exit()
    # Mean over all scored articles, expressed as a percentage.
    avg_reward /= (i / 100)
    print('finished in {}! avg reward: {:.2f}'.format(
        timedelta(seconds=int(time() - start)), avg_reward))
    return {'reward': avg_reward}
def a2c_validate(agent, abstractor, loader):
    """Run validation and report the average ROUGE-1 reward scaled by 100.

    For every (article, sentence labels) pair the agent returns a sequence
    of index groups. Each group is converted with ``rl_edu_to_sentence``
    into a text and a mask, which are collected and passed to
    ``abstractor`` in one call. Each article's slice of the abstractor
    output is scored against its reference with ``compute_rouge_n``.

    Returns:
        dict: ``{'reward': avg_reward}``.

    Raises:
        ZeroDivisionError: if ``loader`` yields no samples.
    """
    agent.eval()
    start = time()
    print('start running validation...', end='')
    avg_reward = 0
    i = 0
    with torch.no_grad():
        for art_batch, abs_batch, sent_batch in loader:
            print(i)
            ext_sents = []
            ext_inds = []
            masks = []
            # Positions in ext_sents whose abstractor output is blanked later.
            dirty = []
            for raw_arts, sent_labels in zip(art_batch, sent_batch):
                indices = agent(raw_arts, sent_labels)
                # (offset into ext_sents, number of groups minus one)
                ext_inds += [(len(ext_sents), len(indices) - 1)]
                # The last index of the last group must be len(raw_arts) + 1.
                assert indices[-1][-1].item() == len(raw_arts) + 1
                tmp_stop = indices[-1][-1].item()
                tmp_truncate = tmp_stop - 1
                str_arts = list(map(lambda x: ' '.join(x), raw_arts))
                for idx in indices:
                    t, m = rl_edu_to_sentence(str_arts, idx)
                    if t == []:
                        assert len(idx) == 1
                        # NOTE(review): ``id`` shadows the builtin.
                        id = idx[0].item()
                        if id == tmp_truncate:
                            dirty.append(len(ext_sents))
                            # NOTE(review): ``label`` and ``label_mask`` are
                            # not assigned anywhere in this function; they
                            # must come from an enclosing scope - confirm.
                            ext_sents.append(label)
                            masks.append(label_mask)
                    else:
                        # Groups ending in the stop index are not collected.
                        if idx[-1].item() != tmp_stop:
                            ext_sents.append(t)
                            masks.append(m)
            all_summs = abstractor(ext_sents, masks)
            # Blank the outputs recorded as placeholders above.
            for d in dirty:
                all_summs[d] = []
            for (j, n), abs_sents in zip(ext_inds, abs_batch):
                summs = all_summs[j:j + n]
                # python ROUGE-1 (not official evaluation)
                avg_reward += compute_rouge_n(list(concat(summs)),
                                              list(concat(abs_sents)), n=1)
                i += 1
                # Periodic progress print of the running mean.
                if i % 100 == 1:
                    print(avg_reward / i, i)
                ''' with open('./compare/rl/' + str(i - 1) + '.dec', 'w') as f: for s in summs: s = ' '.join(s) f.write(s + '\n') '''
                #if i > 1000:
                #    break
    # Mean over all scored articles, expressed as a percentage.
    avg_reward /= (i / 100)
    print('finished in {}! avg reward: {:.2f}'.format(
        timedelta(seconds=int(time() - start)), avg_reward))
    return {'reward': avg_reward}
def batchify_fn(pad, data, cuda=True):
    """Turn a batch of (sources, targets) pairs into model and loss inputs.

    All source sequences of the batch are flattened and padded together by
    ``pad_batch_tensorize``; all targets are flattened into one LongTensor.

    Returns:
        tuple: ``((sources,), (target,))``, the forward and loss arguments.
    """
    source_lists, targets = (list(column) for column in unzip(data))

    flat_sources = list(concat(source_lists))
    sources = pad_batch_tensorize(inputs=flat_sources, pad=pad, cuda=cuda)

    if cuda:
        tensor_type = torch.cuda.LongTensor
    else:
        tensor_type = torch.LongTensor
    target = tensor_type(list(concat(targets)))

    return (sources,), (target,)
def summarize_text(doc: Doc):
    '''
    Reduces a large doc to a few sentences at the beginning, middle and end
    of the document.

    Up to two sentences are taken from each position, joined by a single
    space; the three parts are separated by a line containing "...".
    '''
    sentences = [sent.text for sent in doc.sents]
    mid_i = len(sentences) // 2
    # Join the sentence strings directly. Flattening them first would break
    # every sentence into characters, producing "H e l l o" style output.
    doc_start = " ".join(sentences[:2])
    doc_mid = " ".join(sentences[mid_i:mid_i + 2])
    doc_end = " ".join(sentences[-2:])
    return f'{doc_start}\n...\n{doc_mid}\n...\n{doc_end}'
async def _download_receipts(self, target_td: int, all_headers: Tuple[BlockHeader, ...]) -> None: """ Downloads and persists the receipts for the given set of block headers. Receipts are requested from all peers in equal sized batches. """ # Post-Byzantium blocks may have identical receipt roots (e.g. when they have the same # number of transactions and all succeed/failed: ropsten blocks 2503212 and 2503284), # so we do this to avoid requesting the same receipts multiple times. headers = tuple(unique( (header for header in all_headers if not _is_receipts_empty(header)), key=operator.attrgetter('receipt_root'), )) while headers: # split the remaining headers into equal sized batches for each peer. peers = cast(Tuple[ETHPeer, ...], self.peer_pool.get_peers(target_td)) if not peers: raise NoEligiblePeers( "No connected peers have the receipts we need for td={0}".format(target_td) ) batch_size = math.ceil(len(headers) / len(peers)) batches = tuple(partition_all(batch_size, headers)) # issue requests to all of the peers and wait for all of them to respond. requests = tuple( self._get_receipts(peer, batch) for peer, batch in zip(peers, batches) ) responses = await self.wait(asyncio.gather( *requests, loop=self.get_event_loop(), )) # extract the returned receipt data and the headers for which we # are still missing receipts. all_receipt_bundles, all_missing_headers = zip(*responses) receipt_bundles = tuple(concat(all_receipt_bundles)) headers = tuple(concat(all_missing_headers)) if len(receipt_bundles) == 0: continue # process all of the returned receipts, storing their trie data # dicts in the database receipts, trie_roots_and_data_dicts = zip(*receipt_bundles) trie_roots, trie_data_dicts = zip(*trie_roots_and_data_dicts) for trie_data in trie_data_dicts: await self.wait(self.db.coro_persist_trie_data_dict(trie_data)) self.logger.debug("Got receipts batch for %d headers", len(all_headers))
def a2c_validate(agent, abstractor, loader):
    """Validate an agent that also emits a per-sentence action.

    For each article the agent returns picked sentence indices and one
    action per pick. Picks whose action equals 0 are sent to ``abstractor``;
    the others are used as extracted. The result is scored against the
    reference with ``compute_rouge_n(..., n=1)``.

    Returns:
        dict: ``{'reward': avg_reward}`` (mean reward scaled by 100).

    Raises:
        ZeroDivisionError: if ``loader`` yields no samples.
    """
    agent.eval()
    start = time()
    print('start running validation...', end='')
    avg_reward = 0
    i = 0
    with torch.no_grad():
        for art_batch, abs_batch, ext_batch in loader:
            ext_sents = []
            ext_inds = []
            sent_acts = []
            for raw_arts in art_batch:
                (indices, _), actions = agent(raw_arts)
                ext_inds.append((len(ext_sents), len(indices) - 1))
                kept_sents = [raw_arts[idx.item()]
                              for idx in indices
                              if idx.item() < len(raw_arts)]
                kept_acts = [actions[pos]
                             for pos, idx in enumerate(indices)
                             if idx.item() < len(raw_arts)]
                ext_sents.extend(kept_sents)
                sent_acts.extend(kept_acts)
            assert len(ext_sents) == len(sent_acts)

            # Only sentences whose action is 0 go through the abstractor.
            need_abs_sents = [ext_sents[pos]
                              for pos, act in enumerate(sent_acts)
                              if act == 0]
            if len(need_abs_sents) > 0:
                turn_abs_sents = abstractor(need_abs_sents)

            # NOTE(review): ``+=`` EXTENDS all_summs with the elements of
            # each sentence, while the (j, n) offsets below were computed in
            # units of whole sentences - confirm this is intended.
            all_summs = []
            for pos, action in enumerate(sent_acts):
                if action == 0:
                    all_summs += turn_abs_sents.pop(0)
                else:
                    all_summs += ext_sents[pos]

            for (j, n), abs_sents in zip(ext_inds, abs_batch):
                summs = all_summs[j:j + n]
                hypothesis = list(concat(summs))
                target = list(concat(abs_sents))
                # python ROUGE-1 (not official evaluation)
                avg_reward += compute_rouge_n(hypothesis, target, n=1)
                i += 1
    avg_reward /= (i / 100)
    elapsed = timedelta(seconds=int(time() - start))
    print('finished in {}! avg reward: {:.2f}'.format(elapsed, avg_reward))
    return {'reward': avg_reward}
async def _download_block_bodies(
        self,
        target_td: int,
        all_headers: Tuple[BlockHeader, ...]
) -> Dict[Tuple[Hash32, Hash32], BlockBody]:
    """
    Downloads and persists the block bodies for the given set of block headers.
    Block bodies are requested from all peers in equal sized batches.

    Headers for which ``_is_body_empty`` is true are skipped. The loop
    repeats until no peer reports missing headers any more.

    :param target_td: passed to ``self.peer_pool.get_peers`` to select peers.
    :param all_headers: the headers whose bodies should be fetched.
    :return: the downloaded bodies keyed by ``(transaction_root, uncles_hash)``.
    :raises NoEligiblePeers: if ``get_peers`` returns no peers while headers
        are still outstanding.
    """
    headers = tuple(header for header in all_headers if not _is_body_empty(header))
    block_bodies_by_key: Dict[Tuple[Hash32, Hash32], BlockBody] = {}

    while headers:
        # split the remaining headers into equal sized batches for each peer.
        peers = cast(Tuple[ETHPeer, ...], self.peer_pool.get_peers(target_td))
        if not peers:
            raise NoEligiblePeers(
                "No connected peers have the block bodies we need for td={0}"
                .format(target_td))
        batch_size = math.ceil(len(headers) / len(peers))
        batches = tuple(partition_all(batch_size, headers))

        # issue requests to all of the peers and wait for all of them to respond.
        requests = tuple(
            self._get_block_bodies(peer, batch)
            for peer, batch in zip(peers, batches))
        # NOTE(review): the ``loop`` argument of asyncio.gather was
        # deprecated in Python 3.8 and removed in 3.10.
        responses = await self.wait(
            asyncio.gather(
                *requests,
                loop=self.get_event_loop(),
            ))

        # extract the returned block body data and the headers for which we
        # are still missing block bodies.
        all_block_body_bundles, all_missing_headers = zip(*responses)

        # Persist the transaction trie data of every returned bundle.
        for (body, (tx_root, trie_data_dict), uncles_hash) in concat(all_block_body_bundles):
            await self.wait(
                self.db.coro_persist_trie_data_dict(trie_data_dict))

        # Add this round's bodies to the accumulated result.
        block_bodies_by_key = merge(
            block_bodies_by_key,
            {(transaction_root, uncles_hash): block_body
             for block_body, (transaction_root, trie_dict_data), uncles_hash
             in concat(all_block_body_bundles)})
        # The still-missing headers drive the next iteration of the loop.
        headers = tuple(concat(all_missing_headers))

    self.logger.debug("Got block bodies batch for %d headers", len(all_headers))
    return block_bodies_by_key
def test_mapped():
    """Check that ``chips.mapped`` returns a non-empty mapping of dict chips."""
    _cfg = cfg.get('chipmunk-ard', env=test.env)
    specmap = specs.mapped(ubids=cfg.ubids.get('chipmunk-ard'),
                           specs=_cfg.get('registry_fn')())
    chipmap = chips.mapped(x=test.x,
                           y=test.y,
                           acquired=test.acquired,
                           specmap=specmap,
                           chips_fn=_cfg.get('chips_fn'))

    assert len(chipmap) > 0

    # Every value of the mapping is a sequence of chips; inspect them all.
    all_chips = list(concat(chipmap.values()))
    assert all(type(chip) is dict for chip in all_chips)
    assert len(all_chips) > 0
def prepro_rl_graph(tokenized_sents, nodes, edges, paras, subgraphs,
                    adj_type='edge_as_node', docgraph=True):
    """Assemble the node feature tuple for a document graph or subgraphs.

    With ``docgraph=True`` the features come from ``process_nodes`` and the
    result ends with the sentence/node alignments. Otherwise they come from
    ``process_subgraphs`` and the result ends with a sentence-to-paragraph
    alignment followed by the node lists.

    Returns:
        tuple: the assembled node features (7 items for the document graph,
        8 items for subgraphs).
    """
    max_len = len(list(concat(tokenized_sents)))
    _, word_inpara_freq_feat, _, sent_inpara_freq_feat = create_word_freq_in_para_feat(
        paras, tokenized_sents, list(concat(tokenized_sents)))

    if docgraph:
        (nodewords, _nodelength, nodefreq, _sum_worthy,
         triples, relations, sent_node_aligns) = process_nodes(
            nodes, edges, max_len,
            max_sent_num=len(list(tokenized_sents)),
            key='InSalientSent', adj_type=adj_type)
        return (nodewords, nodefreq, word_inpara_freq_feat,
                sent_inpara_freq_feat, triples, relations, sent_node_aligns)

    (nodewords, node_lists, nodefreq, _sum_worthy,
     triples, relations) = process_subgraphs(
        nodes, edges, subgraphs, paras, max_len,
        max_sent=len(list(tokenized_sents)),
        key='InSalientSent', adj_type=adj_type)

    # Map each sentence index to the first paragraph containing it; a
    # sentence found in no paragraph reuses the last matched paragraph.
    sent_align_para = []
    last_idx = 0
    for sent_idx in range(len(tokenized_sents)):
        for para_idx, para in enumerate(paras):
            if sent_idx in para:
                last_idx = para_idx
                break
        sent_align_para.append([last_idx])
    assert len(sent_align_para) == len(tokenized_sents)
    # One extra trailing entry pointing one past the last matched paragraph.
    sent_align_para.append([last_idx + 1])

    return (nodewords, nodefreq, word_inpara_freq_feat, sent_inpara_freq_feat,
            triples, relations, sent_align_para, node_lists)
def process_file(file, language, workers = 64):
    """Load a CSV, split its "Text" column into lines and keep one language.

    Reading is retried every 10 seconds until it succeeds. Each line is
    classified by ``get_lid`` in a process pool; a line is kept only when
    both of the first two elements of its result equal ``language``.

    Returns:
        list: the lines that passed the language filter.
    """
    start = time.time()

    # Keep retrying until the file can be read.
    while True:
        try:
            df = pd.read_csv(file)
        except Exception as e:
            print(e)
            time.sleep(10)
        else:
            break

    texts = df.loc[:, "Text"].values
    del df
    lines = list(ct.concat([str(text).split("\n") for text in texts]))

    # Multi-process
    pool_instance = mp.Pool(processes = workers, maxtasksperchild = None)
    codes = pool_instance.map(get_lid, lines, chunksize = 100)
    pool_instance.close()
    pool_instance.join()

    pages = [
        line
        for line, code in zip(lines, codes)
        if code[0] == language and code[1] == language
    ]

    print("\t" + file + " " + str(time.time() - start) + " with " + str(len(pages)))
    return pages
def run_pearson_prune(self, vector_array, class_array, significance_level, cor_level):
    """Drop feature columns flagged by ``self.process_pearson_prune``.

    Every column index is handed to ``self.process_pearson_prune`` in a
    process pool; the union of the returned indices is removed from
    ``vector_array``.

    Args:
        vector_array: 2-D array whose columns are the features.
        class_array: not used by this method.
        significance_level: forwarded to ``process_pearson_prune``.
        cor_level: forwarded to ``process_pearson_prune`` and printed.

    Returns:
        ``vector_array`` restricted to the columns that were not flagged.
    """
    time_start = time.time()
    print("Multi-processing Pearson's R feature pruning:")

    n_features = vector_array.shape[1]

    # Multi-process Pearson pruning
    pool_instance = mp.Pool(processes = self.workers, maxtasksperchild = 1)
    flagged_per_feature = pool_instance.map(
        partial(self.process_pearson_prune,
                vector_array = vector_array,
                significance_level = significance_level,
                cor_level = cor_level,
                max = n_features),
        list(range(0, n_features)),
        chunksize = 1)
    pool_instance.close()
    pool_instance.join()

    # Keep the flagged indices as a set: the column filter below tests
    # membership once per column, which is quadratic against a list.
    remove_set = set(ct.concat(flagged_per_feature))
    keep_columns = [x for x in range(0, n_features) if x not in remove_set]
    vector_array = vector_array[:, keep_columns]

    print("")
    print("Features above correlation threshold (" + str(cor_level) + "): " + str(len(remove_set)))
    print("Time for completion: " + str((float(time.time())) - time_start))
    print("Finished with Pearson R Feature Pruning.")
    print("")

    return vector_array
def vsm_collate(inputs):
    """Collate video inputs and their sub-queries into one batch dict.

    The video part is collated by ``video_collate``. The query part is
    flattened across videos, padded, and added under the ``query_*`` and
    ``targets`` keys, together with ``q_vidx``, which maps every query to
    the position of its video in ``vids``.
    """
    video_inputs, vids, sub_queries_and_targets = (
        list(column) for column in unzip(inputs))
    input_ids, attn_masks, sub_vids, targets = (
        list(column) for column in unzip(concat(sub_queries_and_targets)))

    batch = video_collate(video_inputs)

    # Index of the owning video for every flattened query.
    vid2idx = {vid: position for position, vid in enumerate(vids)}
    query_video_positions = [vid2idx[s_vid] for s_vid in sub_vids]
    batch["q_vidx"] = torch.tensor(query_video_positions, dtype=torch.long)

    # text batches
    padded_ids = pad_sequence(input_ids, batch_first=True, padding_value=1)
    position_ids = torch.arange(
        0, padded_ids.size(1), dtype=torch.long).unsqueeze(0)
    padded_masks = pad_sequence(attn_masks, batch_first=True, padding_value=0)
    vsm_targets = pad_sequence(targets, batch_first=True, padding_value=-1)

    batch.update({
        'query_input_ids': padded_ids,
        'query_pos_ids': position_ids,
        'query_attn_masks': padded_masks,
        'targets': vsm_targets,
        'vids': vids})
    return batch
def tags(beamline, runs):
    """Return the high tag of the first run and the low tags of all runs.

    Raises:
        ValueError: if any run's high tag number is falsy (``.all()`` on
            the collected high tags fails).

    Returns:
        tuple: ``(hi_tag, low_tags)`` where ``low_tags`` is the lazy
        concatenation of every run's tag list.
    """
    read_hi = read_hightagnumber(beamline)
    hi_tags = fromiter((read_hi(run) for run in runs), 'int')
    if not hi_tags.all():
        raise ValueError('Not all the runs have a single high tag!')

    read_low = read_taglist_byrun(beamline)
    low_tags = concat(read_low(run) for run in runs)
    return hi_tags[0], low_tags
def recmerge(*objs, merge_sequences=False):
    """Recursively merge an arbitrary number of collections.

    For conflicting values, later collections to the right are given
    priority. By default (merge_sequences=False), sequences are treated as
    a normal value and not merged.

    Args:
        *objs: collections to merge
        merge_sequences: whether to merge values that are sequences

    Returns:
        merged collection
    """
    # A squeeze operation since merge_with generates tuple(list_of_objs,)
    if len(objs) == 1:
        objs = objs[0]

    if all(isinstance(obj, Mapping) for obj in objs):
        # Merge the mappings and recurse into the values sharing a key.
        merge_values = partial(recmerge, merge_sequences=merge_sequences)
        return tz.merge_with(merge_values, *objs)

    if merge_sequences and all(isinstance(obj, Sequence) for obj in objs):
        # Merges sequence values by concatenation
        return list(tz.concat(objs))

    # Anything else: the right-most value wins.
    return tz.last(objs)
def prepro(tokenizer, d, max_len=512):
    """Tokenize source sentences into one id sequence of at most ``max_len``.

    Make sure data is not empty: the first sentence is indexed directly.

    Every source sentence is lower-cased, tokenized and terminated with
    '[SEP]'; the first one is prefixed with '[CLS]'. The concatenated tokens
    are cut to ``max_len`` and converted to ids.

    Args:
        tokenizer: object providing ``tokenize`` and ``convert_tokens_to_ids``.
        d: pair ``(source_sents, extracts)``.
        max_len: maximum number of tokens kept.

    Returns:
        ``((art_sents, token_ids, truncated_word_num), abs_sents)`` where
        ``truncated_word_num`` lists the token count kept per sentence.
    """
    source_sents, extracts = d
    tokenized_sents = [
        tokenizer.tokenize(source_sent.lower()) + ['[SEP]']
        for source_sent in source_sents
    ]
    tokenized_sents[0] = ['[CLS]'] + tokenized_sents[0]

    word_num = [len(tokenized_sent) for tokenized_sent in tokenized_sents]
    truncated_word_num = []
    total_count = 0
    for num in word_num:
        if total_count + num < max_len:
            truncated_word_num.append(num)
        else:
            # The remaining budget must follow ``max_len``; a literal 512
            # disagrees with the truncation below for any other limit.
            truncated_word_num.append(max_len - total_count)
            break
        total_count += num

    tokenized_sents = list(concat(tokenized_sents))[:max_len]
    tokenized_sents = tokenizer.convert_tokens_to_ids(tokenized_sents)
    abs_sents = tokenize(None, extracts)
    art_sents = tokenize(None, source_sents)
    return (art_sents, tokenized_sents, truncated_word_num), abs_sents
async def check_incoming_eth(self, new_blocks: List[AttributeDict]):
    """Emit a ReceivedEther event for each successful incoming ETH transfer.

    A transaction qualifies when its ``to`` address is in
    ``self._watch_addresses``, its ``value`` is positive and its receipt
    status is 1. ``None`` entries in ``new_blocks`` are ignored.
    """
    watch_addresses: Set[str] = self._watch_addresses
    live_blocks: List[AttributeDict] = [blk for blk in new_blocks if blk is not None]
    block_to_timestamp: Dict[str, float] = {
        blk.hash: float(blk.timestamp) for blk in live_blocks
    }
    transactions: List[AttributeDict] = list(
        cytoolz.concat(blk.transactions for blk in live_blocks))
    incoming_eth_transactions: List[AttributeDict] = [
        tx for tx in transactions
        if (tx.get("to") in watch_addresses) and (tx.get("value", 0) > 0)
    ]

    for tx in incoming_eth_transactions:
        # Filter out failed transactions.
        receipt: AttributeDict = self._w3.eth.getTransactionReceipt(tx.hash)
        if receipt.status == 1:
            # Emit event.
            raw_eth_value: int = tx.get("value")
            eth_value: float = raw_eth_value * 1e-18
            from_address: str = tx.get("from")
            to_address: str = tx.get("to")
            timestamp: float = block_to_timestamp[tx.get("blockHash")]
            event = WalletReceivedAssetEvent(timestamp, tx.hash.hex(),
                                             from_address, to_address,
                                             "ETH", eth_value, raw_eth_value)
            self.trigger_event(IncomingEthWatcherEvent.ReceivedEther, event)
def itm_rank_collate(inputs):
    """Collate ranked image-text-matching samples into one batch dict.

    Every element of ``inputs`` is a group of samples; all groups must have
    the same length, which is returned as ``sample_size``. The samples are
    flattened across groups, then texts, image features and attention masks
    are padded to common lengths.
    """
    columns = unzip(concat(inputs))
    input_ids, img_feats, img_pos_feats, attn_masks = (
        list(column) for column in columns)

    # Lengths must be taken before the text ids are padded.
    txt_lens = [ids.size(0) for ids in input_ids]
    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0)
    position_ids = torch.arange(
        0, input_ids.size(1), dtype=torch.long).unsqueeze(0)

    num_bbs = [feat.size(0) for feat in img_feats]
    img_feat = pad_tensors(img_feats, num_bbs)
    img_pos_feat = pad_tensors(img_pos_feats, num_bbs)

    attn_masks = pad_sequence(attn_masks, batch_first=True, padding_value=0)

    sample_size = len(inputs[0])
    assert all(sample_size == len(group) for group in inputs)

    bs, max_tl = input_ids.size()
    out_size = attn_masks.size(1)
    gather_index = get_gather_index(txt_lens, num_bbs, bs, max_tl, out_size)

    return {
        'input_ids': input_ids,
        'position_ids': position_ids,
        'img_feat': img_feat,
        'img_pos_feat': img_pos_feat,
        'attn_masks': attn_masks,
        'gather_index': gather_index,
        'sample_size': sample_size,
    }
def collate(inputs):
    """Collate video inputs, clip ranges, attention masks and metadata.

    The attention masks of all samples are flattened and padded; the video
    part is collated by ``video_collate``. Metadata is expanded to one entry
    per clip under ``vid_names``, ``clip_ids`` and ``all_ts``.
    """
    video_inputs, all_clip_ranges, attn_masks_list, metas = (
        list(column) for column in unzip(inputs))

    flat_masks = list(concat(attn_masks_list))
    batch = {
        'cap_attn_mask': pad_sequence(flat_masks, batch_first=True,
                                      padding_value=0),
        'clip_ranges': tuple(map(tuple, all_clip_ranges)),
    }
    batch.update(video_collate(video_inputs))

    # meta: repeat the video name once per clip of that video.
    vids = []
    clip_ids = []
    all_ts = []
    for vid, cids, tss in metas:
        for cid, ts in zip(cids, tss):
            vids.append(vid)
            clip_ids.append(int(cid))
            all_ts.append(ts)
    batch['vid_names'] = vids
    batch['clip_ids'] = clip_ids
    batch['all_ts'] = all_ts
    return batch
def vcr_collate(inputs):
    """Collate groups of VCR samples into one padded batch dict.

    The samples of all groups are flattened, then texts, text type ids,
    image features and attention masks are padded to common lengths and the
    targets are stacked along a new first dimension.
    """
    columns = unzip(concat(inputs))
    (input_ids, txt_type_ids, img_feats,
     img_pos_feats, attn_masks, targets) = (list(column) for column in columns)

    # Lengths must be taken before the text ids are padded.
    txt_lens = [ids.size(0) for ids in input_ids]
    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0)
    txt_type_ids = pad_sequence(
        txt_type_ids, batch_first=True, padding_value=0)
    position_ids = torch.arange(
        0, input_ids.size(1), dtype=torch.long).unsqueeze(0)

    # image batches
    num_bbs = [feat.size(0) for feat in img_feats]
    img_feat = pad_tensors(img_feats, num_bbs)
    img_pos_feat = pad_tensors(img_pos_feats, num_bbs)

    attn_masks = pad_sequence(attn_masks, batch_first=True, padding_value=0)
    targets = torch.stack(targets, dim=0)

    bs, max_tl = input_ids.size()
    out_size = attn_masks.size(1)
    gather_index = get_gather_index(txt_lens, num_bbs, bs, max_tl, out_size)

    return {
        'input_ids': input_ids,
        'txt_type_ids': txt_type_ids,
        'position_ids': position_ids,
        'img_feat': img_feat,
        'img_pos_feat': img_pos_feat,
        'attn_masks': attn_masks,
        'gather_index': gather_index,
        'targets': targets,
    }
def async_requests(
    url_payload: List[Tuple[str, Optional[MutableMapping[str, Any]]]],
    read: str,
    request: str = "GET",
    max_workers: int = 8,
) -> List[Union[str, MutableMapping[str, Any], bytes]]:
    """Send async requests.

    This function is based on
    `this <https://github.com/HydrologicEngineeringCenter/data-retrieval-scripts/blob/master/qpe_async_download.py>`__
    script.

    Parameters
    ----------
    url_payload : list of tuples
        A list of URLs and payloads as a tuple.
    read : str
        The method for returning the request; binary, json, and text.
    request : str, optional
        The request type; GET or POST, defaults to GET.
    max_workers : int, optional
        The maximum number of async processes, defaults to 8.

    Returns
    -------
    list
        A list of responses
    """
    responses = []
    # Process the URLs in chunks of ``max_workers``, one chunk at a time.
    for chunk in tlz.partition_all(max_workers, url_payload):
        loop = asyncio.get_event_loop()
        chunk_responses = loop.run_until_complete(
            _async_session(chunk, read, request))
        responses.extend(chunk_responses)
    return responses
def collate(inputs):
    """Collate video inputs, clip ranges and caption inputs into one batch.

    Caption inputs are flattened across samples and padded (input ids with
    1, target ids with -1, attention masks with 0); the video part is
    collated by ``video_collate`` and merged into the same dict.
    """
    video_inputs, all_clip_ranges, cap_inputs = (
        list(column) for column in unzip(inputs))
    all_input_ids, all_tgt_ids, all_attn_masks = (
        list(column) for column in unzip(concat(cap_inputs)))

    input_ids = pad_sequence(all_input_ids, batch_first=True, padding_value=1)
    position_ids = torch.arange(
        0, input_ids.size(1), dtype=torch.long).unsqueeze(0)
    tgt_ids = pad_sequence(all_tgt_ids, batch_first=True, padding_value=-1)
    attn_mask = pad_sequence(all_attn_masks, batch_first=True, padding_value=0)

    batch = {
        'cap_input_ids': input_ids,
        'cap_pos_ids': position_ids,
        'cap_tgt_ids': tgt_ids,
        'cap_attn_mask': attn_mask,
        'clip_ranges': tuple(map(tuple, all_clip_ranges)),
    }
    batch.update(video_collate(video_inputs))
    return batch
def compute_up(t, seq, **kwargs):
    """Return the unique rows of ``seq``, or ``()`` when it is empty.

    List rows are converted to tuples before ``unique`` is applied; the
    decision is made from the first row only.
    """
    try:
        head = first(seq)
    except StopIteration:
        return ()
    rows = concat([[head], seq])  # re-add row to seq
    return unique(map(tuple, rows) if isinstance(head, list) else rows)
def next_search_beam(beam, beam_size, finished, end, topk, lp, hists,
                     attn=None, diverse=1.0):
    """generate the next beam(K-best hyps)

    Every hypothesis in ``beam`` is extended with its own slice of the
    unpacked top-k data; the extensions are pooled and handed to
    ``_clean_beam`` together with ``finished``, ``end`` and ``beam_size``.

    Returns:
        tuple: ``(finished, beam)`` as produced by ``_clean_beam``.
    """
    topks, lps, hists_list, attns = _unpack_topk(topk, lp, hists, attn)
    extensions = [
        hyp.extend_k(topks[pos], lps[pos], hists_list[pos], attns[pos], diverse)
        for pos, hyp in enumerate(beam)
    ]
    candidates = list(concat(extensions))
    finished, beam = _clean_beam(finished, candidates, end, beam_size)
    return finished, beam
def batchify_fn_extract_ff(pad, data, cuda=True):
    """Turn a batch of (sources, targets) pairs into model and loss inputs.

    Each sample's sources are padded separately; all targets are flattened
    into one FloatTensor.

    Returns:
        tuple: ``((sources, src_nums), (target,))``, the forward and loss
        arguments.
    """
    source_lists, targets = (list(column) for column in unzip(data))

    src_nums = [len(sample_sources) for sample_sources in source_lists]
    tensorize = pad_batch_tensorize(pad=pad, cuda=cuda)
    sources = [tensorize(sample_sources) for sample_sources in source_lists]

    if cuda:
        tensor_type = torch.cuda.FloatTensor
    else:
        tensor_type = torch.FloatTensor
    target = tensor_type(list(concat(targets)))

    return (sources, src_nums), (target,)
def broadcast_dimensions(argpairs, numblocks, sentinels=(1, (1,)),
                         consolidate=None):
    """ Find block dimensions from arguments

    Parameters
    ----------
    argpairs: iterable
        name, ijk index pairs
    numblocks: dict
        maps {name: number of blocks}
    sentinels: iterable (optional)
        values for singleton dimensions
    consolidate: func (optional)
        use this to reduce each set of common blocks into a smaller set

    Examples
    --------
    >>> argpairs = [('x', 'ij'), ('y', 'ji')]
    >>> numblocks = {'x': (2, 3), 'y': (3, 2)}
    >>> broadcast_dimensions(argpairs, numblocks)
    {'i': 2, 'j': 3}

    Supports numpy broadcasting rules

    >>> argpairs = [('x', 'ij'), ('y', 'ij')]
    >>> numblocks = {'x': (2, 1), 'y': (1, 3)}
    >>> broadcast_dimensions(argpairs, numblocks)
    {'i': 2, 'j': 3}

    Works in other contexts too

    >>> argpairs = [('x', 'ij'), ('y', 'ij')]
    >>> d = {'x': ('Hello', 1), 'y': (1, (2, 3))}
    >>> broadcast_dimensions(argpairs, d)
    {'i': 'Hello', 'j': (2, 3)}
    """
    # Arguments without an index do not take part in the alignment.
    indexed_args = [(name, ind) for name, ind in argpairs if ind is not None]
    joined = toolz.join(toolz.first, indexed_args,
                        toolz.first, numblocks.items())

    # List like [('i', 2), ('j', 1), ('i', 1), ('j', 2)]
    index_dim_pairs = toolz.concat(
        [zip(inds, dims) for (_, inds), (_, dims) in joined])

    # Distinct dimension values seen for every index letter.
    grouped = toolz.groupby(0, index_dim_pairs)
    g = {k: {d for _, d in v} for k, v in grouped.items()}

    # Sentinels only yield when an index has more than one candidate.
    g2 = {k: (v - set(sentinels) if len(v) > 1 else v) for k, v in g.items()}

    if consolidate:
        return toolz.valmap(consolidate, g2)

    if g2 and {len(v) for v in g2.values()} != {1}:
        raise ValueError("Shapes do not align %s" % g)

    return toolz.valmap(toolz.first, g2)
def compute_up(t, seq, **kwargs):
    """Return the unique rows of ``seq``.

    Raises ``NotImplementedError`` when ``t.on`` is set, because this backend
    cannot restrict the comparison to a subset of columns.  An empty ``seq``
    yields ``()``.  The first row is peeked to detect list rows, which are
    converted to tuples before being passed to ``unique``.
    """
    if t.on:
        raise NotImplementedError("python backend cannot specify what columns to distinct on")

    try:
        head = toolz.first(seq)
    except StopIteration:
        return ()

    # Put the peeked row back in front of whatever remains of ``seq``.
    rows = concat([[head], seq])
    rows = map(tuple, rows) if isinstance(head, list) else rows
    return unique(rows)
def pre_compute(expr, seq):
    """Normalise ``seq`` before computation.

    The first element is inspected (and, for one-shot iterators, pushed back
    in front of the remaining items).  An empty ``seq`` gives ``[]``.  When
    the elements are dicts, the values of ``expr.fields`` are plucked out of
    every element; otherwise the sequence is returned as is.
    """
    one_shot = isinstance(seq, Iterator)
    try:
        head = next(seq) if one_shot else next(iter(seq))
    except StopIteration:
        return []

    if one_shot:
        # The peek consumed an item; restore it.
        seq = concat([[head], seq])

    if not isinstance(head, dict):
        return seq
    return pluck(expr.fields, seq)
def pre_compute(expr, seq, scope=None, **kwargs):
    """Normalise ``seq`` before computation, using the first leaf's fields.

    The first element is inspected (and, for one-shot iterators, pushed back
    in front of the remaining items).  An empty ``seq`` gives ``[]``.  When
    the elements are dicts, the fields of ``expr._leaves()[0]`` are plucked
    out of every element; otherwise the sequence is returned as is.
    ``scope`` and ``kwargs`` are accepted but not used here.
    """
    one_shot = isinstance(seq, Iterator)
    try:
        head = next(seq) if one_shot else next(iter(seq))
    except StopIteration:
        return []

    if one_shot:
        # The peek consumed an item; restore it.
        seq = concat([[head], seq])

    if not isinstance(head, dict):
        return seq

    first_leaf = expr._leaves()[0]
    return pluck(first_leaf.fields, seq)
def _dict(self):
    """Return the materialised blockwise graph, building it on first use.

    The result of ``make_blockwise_graph`` is stored on the instance as
    ``_cached_dict`` and returned directly on later calls.
    """
    if not hasattr(self, '_cached_dict'):
        # One placeholder token per (argument, index) entry in self.indices.
        tokens = tuple(map(blockwise_token, range(len(self.indices))))
        subgraph = SubgraphCallable(self.dsk, self.output, tokens)
        flat_indices = list(toolz.concat(self.indices))
        self._cached_dict = make_blockwise_graph(
            subgraph,
            self.output,
            self.output_indices,
            *flat_indices,
            new_axes=self.new_axes,
            numblocks=self.numblocks,
            concatenate=self.concatenate
        )
    return self._cached_dict
def isempty(seq):
    """ Report whether an iterator is exhausted, without losing its items

    Returns a pair ``(empty, seq)``.  For a non-empty iterator the second
    item is a new iterable that still yields every element; for an empty one
    it is ``False``.

    >>> seq = iter([1, 2, 3])
    >>> empty, seq = isempty(seq)
    >>> empty
    False
    >>> list(seq)  # seq is preserved
    [1, 2, 3]

    >>> seq = iter([])
    >>> empty, seq = isempty(seq)
    >>> empty
    True
    """
    try:
        head = next(seq)
    except StopIteration:
        return True, False
    else:
        # Re-attach the peeked element in front of the remainder.
        return False, concat([[head], seq])
def call_function(func, func_token, args, kwargs, pure=None, nout=None):
    """Build a ``Delayed`` that represents calling ``func(*args, **kwargs)``.

    ``dask_key_name`` and ``pure`` are popped from ``kwargs`` (mutating it).
    The key is ``dask_key_name`` when supplied, otherwise it is derived from
    the function name and a token of the call.  Collections found inside
    ``args``/``kwargs`` by ``unpack_collections`` become dependencies of the
    resulting graph.  ``nout`` is forwarded as the ``length`` of the result.
    """
    dask_key_name = kwargs.pop('dask_key_name', None)
    pure = kwargs.pop('pure', pure)

    name = dask_key_name
    if name is None:
        name = '%s-%s' % (funcname(func),
                          tokenize(func_token, *args, pure=pure, **kwargs))

    unpacked_args, nested = unzip(map(unpack_collections, args), 2)
    dependencies = list(concat(nested))

    if not kwargs:
        task = (func,) + unpacked_args
    else:
        dask_kwargs, kw_collections = unpack_collections(kwargs)
        dependencies.extend(kw_collections)
        task = (apply, func, list(unpacked_args), dask_kwargs)

    graph = HighLevelGraph.from_collections(name, {name: task},
                                            dependencies=dependencies)
    return Delayed(name, graph, length=nout)
def ordered_intersect(*sets):
    """Order-preserving intersection of several sequences.

    Elements are yielded in the order of their first appearance when the
    sequences are chained one after another, each at most once.

    Parameters
    ----------
    sets : tuple of Sequence

    Returns
    -------
    generator

    Examples
    --------
    >>> list(ordered_intersect('abcd', 'cdef'))
    ['c', 'd']
    >>> list(ordered_intersect('bcda', 'bdfga'))
    ['b', 'd', 'a']
    >>> list(ordered_intersect('zega', 'age'))  # 1st sequence determines order
    ['e', 'g', 'a']
    >>> list(ordered_intersect('gah', 'bag', 'carge'))
    ['g', 'a']
    """
    as_frozensets = map(frozenset, sets)
    shared = frozenset.intersection(*as_frozensets)
    first_seen = unique(concat(sets))
    return (item for item in first_seen if item in shared)
# instead of modifying/using global state, choosing to pass in # the updated request as a param means that the handler functions # are all pure functions of their input params. # # This should make testing them easier - it's one less thing to mock. return req_fun(t.merge(opts, r), *args, **kwargs) return requirejson_wrapper return reqjson # converts a dictionary to flat list of key/value pairs. # each key can have multiple values and they will all be unpacked accordingly. multipairs=lambda d: list(t.concat(t.map( lambda i: (lambda k,v: t.concat((k,e) for e in v) if isinstance(v,list) else (k,v))(i[0],i[1]), d.items()))) # -------------------------------------------------------------------------- # REST API # -------------------------------------------------------------------------- @app.get('/') def default(message=''): return template('signin', message=message) @app.post('/signin') @params(keys=['barcode']) def signin(p): u = filter(lambda v: v.id == p['barcode'], data['users'].values()) if len(u) > 0:
def rerank_mp(all_beams, ext_inds):
    """Rerank groups of beams in a pool of 8 worker processes.

    ``ext_inds`` supplies ``(start, length)`` pairs; each pair with a
    positive length selects the slice ``all_beams[start:start + length]``,
    which is passed to ``rerank_one``.  The per-group results are flattened
    into one list, in group order.
    """
    groups = []
    for start, length in ext_inds:
        if length > 0:
            groups.append(all_beams[start: start + length])

    with mp.Pool(8) as pool:
        per_group = pool.map(rerank_one, groups)

    return list(concat(per_group))
def rerank(all_beams, ext_inds):
    """Rerank groups of beams sequentially.

    ``ext_inds`` supplies ``(start, length)`` pairs; each pair with a
    positive length selects the slice ``all_beams[start:start + length]``,
    which is passed to ``rerank_one``.  The per-group results are flattened
    into one list, in group order.
    """
    groups = (all_beams[start: start + length]
              for start, length in ext_inds
              if length > 0)
    per_group = map(rerank_one, groups)
    flattened = concat(per_group)
    return list(flattened)
def recursive_beam(self, previous_start, line, i, line_length):
    """Recursively grow candidate paths along ``line`` starting after ``i``.

    A path is a sequence of 2-item slots.  ``previous_start`` may arrive
    either flat (``(a, b)``) or as a sequence of slots (``((a, b), ...)``).
    For position ``i + 1`` the three options ``(1, line[i][0])``,
    ``(2, line[i][1])`` and ``(3, line[i][2])`` are each appended to the
    path.  The extension is followed recursively while the association
    entry for the relevant slots has ``max(LR, RL)`` above
    ``self.delta_threshold``.  When a path longer than two slots fails the
    threshold and has more than three slots, it is stored in
    ``self.candidate_stack`` without its last slot.

    The search is throttled through ``self.search_monitor``: a start whose
    first two items have already been recorded 40 times is not expanded
    (starts shorter than two items are always expanded).

    Returns ``None``; results are accumulated on ``self``.
    """
    prefix = previous_start[0:2]
    if len(previous_start) >= 2 and self.search_monitor.count(prefix) >= 40:
        return
    self.search_monitor.append(prefix)

    # Progress down the line, stopping at the end
    i += 1
    if i >= line_length:
        return

    # For each available next path
    for start in [(1, line[i][0]), (2, line[i][1]), (3, line[i][2])]:

        # Flatten a sequence of slots into one flat list.  A start that is
        # already flat holds non-iterable items, which makes concat raise
        # TypeError; it is then used unchanged.
        try:
            previous_start = list(ct.concat(previous_start))
        except TypeError:
            pass

        # Create the larger path and regroup it into 2-item slots
        current_path = list(ct.concat([previous_start, start]))
        current_path = tuple(ct.partition(2, current_path))

        if len(current_path) > 2:
            # Only the association of the last two slots is tested
            test_path = current_path[-2:]
            current_dict = self.association_dict[test_path]

            if current_dict != {}:
                delta_p = max(current_dict["LR"], current_dict["RL"])

                if delta_p > self.delta_threshold:
                    self.recursive_beam(current_path, line, i, line_length)

                # This is the end of a candidate sequence;
                # it has to be at least 3 slots after trimming
                elif len(current_path) > 3:
                    # Remove the bad part
                    current_path = current_path[0:-1]
                    # Add to candidate_stack
                    self.candidate_stack[i - len(current_path) + 1].append(current_path)

        else:
            current_dict = self.association_dict[current_path]

            if current_dict != {}:
                delta_p = max(current_dict["LR"], current_dict["RL"])

                if delta_p > self.delta_threshold:
                    self.recursive_beam(current_path, line, i, line_length)

    return
def concat(self):
    """Flatten one level of nesting and wrap the result in the same class."""
    wrapper = self.__class__
    flattened = cytoolz.concat(self)
    return wrapper(flattened)
def compute_up(t, example, children, **kwargs):
    """Chain the sequences in ``children`` into one iterable.

    ``t``, ``example`` and ``kwargs`` are accepted but not used here.
    """
    chained = concat(children)
    return chained
def build_ngram_model(sentences, n, pad='<eos>'):
    """
    Count the n-grams occurring across a collection of sentences.

    Each sentence is expanded with ``ngrams(sentence, n, pad)``; the n-grams
    of all sentences are pooled and their frequencies returned.
    """
    per_sentence = (ngrams(sentence, n, pad) for sentence in sentences)
    all_ngrams = concat(per_sentence)
    return frequencies(all_ngrams)
def unpack_collections(expr):
    """Normalize a python object and collect the dask collections inside it.

    - ``Delayed`` objects are replaced by their keys
    - Other dask collections are finalized and replaced by the resulting key
    - Lists, tuples, sets, dicts, slices and dataclass instances are
      traversed recursively and rebuilt as tasks

    Parameters
    ----------
    expr : object
        The object to be normalized. This function knows how to handle
        dask collections, as well as most builtin python types.

    Returns
    -------
    task : normalized task to be run
    collections : a tuple of collections

    Examples
    --------
    >>> a = delayed(1, 'a')
    >>> b = delayed(2, 'b')
    >>> task, collections = unpack_collections([a, b, 3])
    >>> task  # doctest: +SKIP
    ['a', 'b', 3]
    >>> collections  # doctest: +SKIP
    (a, b)

    >>> task, collections = unpack_collections({a: 1, b: 2})
    >>> task  # doctest: +SKIP
    (dict, [['a', 1], ['b', 2]])
    >>> collections  # doctest: +SKIP
    (a, b)
    """
    if isinstance(expr, Delayed):
        return expr._key, (expr,)

    if is_dask_collection(expr):
        finalized = finalize(expr)
        return finalized._key, (finalized,)

    # One-shot iterators are materialised so they can be traversed below.
    if isinstance(expr, Iterator):
        expr = tuple(expr)

    typ = type(expr)

    if typ in (list, tuple, set):
        tasks, nested = unzip(map(unpack_collections, expr), 2)
        tasks = list(tasks)
        # De-duplicate by identity, keeping first-seen order.
        found = tuple(unique(concat(nested), key=id))
        # Ensure output type matches input type
        if typ is list:
            return tasks, found
        return (typ, tasks), found

    if typ is dict:
        items = [[key, value] for key, value in expr.items()]
        tasks, found = unpack_collections(items)
        return (dict, tasks), found

    if typ is slice:
        tasks, found = unpack_collections([expr.start, expr.stop, expr.step])
        return (slice,) + tuple(tasks), found

    if is_dataclass(expr):
        field_items = [[field.name, getattr(expr, field.name)]
                       for field in dataclass_fields(expr)]
        tasks, found = unpack_collections(field_items)
        return (apply, typ, (), (dict, tasks)), found

    return expr, ()
def apply_gufunc(func, signature, *args, **kwargs):
    """
    Apply a generalized ufunc or similar python function to arrays.

    ``signature`` determines if the function consumes or produces core
    dimensions. The remaining dimensions in given input arrays (``*args``)
    are considered loop dimensions and are required to broadcast
    naturally against each other.

    In other terms, this function is like np.vectorize, but for
    the blocks of dask arrays. If the function itself shall also
    be vectorized use ``vectorize=True`` for convenience.

    Parameters
    ----------
    func : callable
        Function to call like ``func(*args, **kwargs)`` on input arrays
        (``*args``) that returns an array or tuple of arrays. If multiple
        arguments with non-matching dimensions are supplied, this function is
        expected to vectorize (broadcast) over axes of positional arguments in
        the style of NumPy universal functions [1]_ (if this is not the case,
        set ``vectorize=True``). If this function returns multiple outputs,
        ``output_core_dims`` has to be set as well.
    signature: string
        Specifies what core dimensions are consumed and produced by ``func``.
        According to the specification of numpy.gufunc signature [2]_
    *args : numeric
        Input arrays or scalars to the callable function.
    output_dtypes : dtype or list of dtypes, keyword only
        dtype or list of output dtypes.
    output_sizes : dict, optional, keyword only
        Optional mapping from dimension names to sizes for outputs. Only used if
        new core dimensions (not found on inputs) appear on outputs.
    vectorize: bool, keyword only
        If set to ``True``, ``np.vectorize`` is applied to ``func`` for
        convenience. Defaults to ``False``.
    allow_rechunk: Optional, bool, keyword only
        Allows rechunking, otherwise chunk sizes need to match and core
        dimensions are to consist only of one chunk.
        Warning: enabling this can increase memory usage significantly.
        Defaults to ``False``.
    **kwargs : dict
        Extra keyword arguments to pass to `func`

    Returns
    -------
    Single dask.array.Array or tuple of dask.array.Array

    Raises
    ------
    TypeError
        If ``signature`` is not a string.
    ValueError
        If ``output_dtypes`` is missing or does not match the number of
        outputs, if the number of positional arrays differs from the number
        of inputs in ``signature``, or if sizes/chunk sizes of a shared
        dimension are incompatible.

    Examples
    --------
    >>> import dask.array as da
    >>> import numpy as np
    >>> def stats(x):
    ...     return np.mean(x, axis=-1), np.std(x, axis=-1)
    >>> a = da.random.normal(size=(10,20,30), chunks=(5, 10, 30))
    >>> mean, std = da.apply_gufunc(stats, "(i)->(),()", a, output_dtypes=2*(a.dtype,))
    >>> mean.compute().shape
    (10, 20)

    >>> def outer_product(x, y):
    ...     return np.einsum("i,j->ij", x, y)
    >>> a = da.random.normal(size=(   20,30), chunks=(10, 30))
    >>> b = da.random.normal(size=(10, 1,40), chunks=(5, 1, 40))
    >>> c = da.apply_gufunc(outer_product, "(i),(j)->(i,j)", a, b, output_dtypes=a.dtype, vectorize=True)
    >>> c.compute().shape
    (10, 20, 30, 40)

    References
    ----------
    .. [1] http://docs.scipy.org/doc/numpy/reference/ufuncs.html
    .. [2] http://docs.scipy.org/doc/numpy/reference/c-api.generalized-ufuncs.html
    """
    output_dtypes = kwargs.pop("output_dtypes", None)
    output_sizes = kwargs.pop("output_sizes", None)
    vectorize = kwargs.pop("vectorize", None)
    allow_rechunk = kwargs.pop("allow_rechunk", False)

    # Input processing:
    ## Signature
    if not isinstance(signature, str):
        raise TypeError('`signature` has to be of type string')
    core_input_dimss, core_output_dimss = _parse_gufunc_signature(signature)

    ## Determine nout: nout = None for functions of one direct return; nout = int for return tuples
    nout = None if not isinstance(core_output_dimss, list) else len(core_output_dimss)

    ## Assert output_dtypes
    if output_dtypes is None:
        raise ValueError("Must specify `output_dtypes` of output array(s)")
    elif isinstance(output_dtypes, str):
        otypes = list(output_dtypes)
        output_dtypes = otypes[0] if nout is None else otypes
    elif isinstance(output_dtypes, (tuple, list)):
        if nout is None:
            raise ValueError("Must specify single dtype for `output_dtypes` for function with one output")
        otypes = output_dtypes
    else:
        if nout is not None:
            raise ValueError("Must specify tuple of dtypes for `output_dtypes` for function with multiple outputs")
        otypes = [output_dtypes]

    ## Vectorize function, if required
    if vectorize:
        func = np.vectorize(func, signature=signature, otypes=otypes)

    ## Miscellaneous
    if output_sizes is None:
        output_sizes = {}

    # Main code:
    ## Cast all input arrays to dask
    args = [asarray(a) for a in args]

    # Without this check the zips below would silently truncate to the
    # shorter of the two sequences.
    if len(core_input_dimss) != len(args):
        raise ValueError("According to `signature`, `func` requires %d arguments, but %s given"
                         % (len(core_input_dimss), len(args)))

    ## Assess input args for loop dims
    input_shapes = [a.shape for a in args]
    input_chunkss = [tuple(c[0] for c in a.chunks) for a in args]
    num_loopdims = [len(s) - len(cd) for s, cd in zip(input_shapes, core_input_dimss)]
    max_loopdims = max(num_loopdims) if num_loopdims else None
    _core_input_shapes = [dict(zip(cid, s[n:])) for s, n, cid in zip(input_shapes, num_loopdims, core_input_dimss)]
    core_shapes = merge(output_sizes, *_core_input_shapes)

    loop_input_dimss = [tuple("__loopdim%d__" % d for d in range(max_loopdims - n, max_loopdims)) for n in num_loopdims]
    input_dimss = [l + c for l, c in zip(loop_input_dimss, core_input_dimss)]

    loop_output_dims = max(loop_input_dimss, key=len) if loop_input_dimss else set()

    ## Assess input args for same size and chunk sizes
    ### Collect sizes and chunksizes of all dims in all arrays
    dimsizess = {}
    chunksizess = {}
    for dims, shape, chunksizes in zip(input_dimss, input_shapes, input_chunkss):
        for dim, size, chunksize in zip(dims, shape, chunksizes):
            _dimsizes = dimsizess.get(dim, [])
            _dimsizes.append(size)
            dimsizess[dim] = _dimsizes
            _chunksizes = chunksizess.get(dim, [])
            _chunksizes.append(chunksize)
            chunksizess[dim] = _chunksizes
    ### Assert correct partitioning, for case:
    for dim, sizes in dimsizess.items():
        #### Check that the arrays have same length for same dimensions or dimension `1`
        if set(sizes).union({1}) != {1, max(sizes)}:
            raise ValueError("Dimension `'{}'` with different lengths in arrays".format(dim))
        if not allow_rechunk:
            chunksizes = chunksizess[dim]
            #### Check if core dimensions consist of only one chunk
            if (dim in core_shapes) and (chunksizes[0] < core_shapes[dim]):
                raise ValueError("Core dimension `'{}'` consists of multiple chunks. To fix, rechunk into a single "
                                 "chunk along this dimension or set `allow_rechunk=True`, but beware that this may "
                                 "increase memory usage significantly.".format(dim))
            #### Check if loop dimensions consist of same chunksizes, when they have sizes > 1
            relevant_chunksizes = list(unique(c for s, c in zip(sizes, chunksizes) if s > 1))
            if len(relevant_chunksizes) > 1:
                raise ValueError("Dimension `'{}'` with different chunksize present".format(dim))

    ## Apply function - use atop here
    arginds = list(concat(zip(args, input_dimss)))

    ### Use existing `atop` but only with loopdims to enforce
    ### concatenation for coredims that appear also at the output
    ### Modifying `atop` could improve things here.
    tmp = atop(func, loop_output_dims, *arginds,
               dtype=int,  # Only dummy dtype, anyone will do
               concatenate=True,
               **kwargs)

    ## Prepare output shapes
    loop_output_shape = tmp.shape
    loop_output_chunks = tmp.chunks
    dsk = tmp.__dask_graph__()
    keys = list(flatten(tmp.__dask_keys__()))
    _anykey = keys[0]
    name, token = _anykey[0].split('-')

    ### *) Treat direct output
    if nout is None:
        core_output_dimss = [core_output_dimss]
        output_dtypes = [output_dtypes]

    ## Split output
    leaf_arrs = []
    for i, cod, odt in zip(count(0), core_output_dimss, output_dtypes):
        core_output_shape = tuple(core_shapes[d] for d in cod)
        core_chunkinds = len(cod) * (0,)
        output_shape = loop_output_shape + core_output_shape
        output_chunks = loop_output_chunks + core_output_shape
        leaf_name = "%s_%d-%s" % (name, i, token)
        leaf_dsk = {(leaf_name,) + key[1:] + core_chunkinds: ((getitem, key, i) if nout else key) for key in keys}
        leaf_arr = Array(sharedict.merge((leaf_name, leaf_dsk), dsk),
                         leaf_name,
                         chunks=output_chunks,
                         shape=output_shape,
                         dtype=odt)
        leaf_arrs.append(leaf_arr)

    return leaf_arrs if nout else leaf_arrs[0]  # Undo *) from above
def a2c_train_step(agent, abstractor, loader, opt, grad_fn, gamma=0.99, reward_fn=compute_rouge_l, stop_reward_fn=compute_rouge_n(n=1), stop_coeff=1.0):
    """Run one advantage actor-critic update on the next batch of ``loader``.

    For each article in the batch the agent returns selected indices, the
    matching action distributions and baseline values.  The selected
    sentences (indices below the article length) are rewritten by
    ``abstractor`` without gradient tracking.  Every selection is rewarded
    with ``reward_fn`` against the reference sentence at the same position
    (0 once the references run out); the final action of each article gets
    ``stop_coeff * stop_reward_fn`` over the whole summary.  Rewards are
    discounted with ``gamma``, standardised, and used for a policy-gradient
    loss plus an MSE critic loss, followed by one optimiser step.

    Returns a dict with the entries of ``grad_fn()`` plus ``'reward'``,
    ``'advantage'`` and ``'mse'``.
    """
    opt.zero_grad()

    indices, probs, baselines = [], [], []
    ext_sents = []
    art_batch, abs_batch = next(loader)
    for raw_arts in art_batch:
        (inds, ms), bs = agent(raw_arts)
        baselines.append(bs)
        indices.append(inds)
        probs.append(ms)
        # An index equal to len(raw_arts) is not a sentence and is skipped.
        ext_sents.extend(raw_arts[idx.item()] for idx in inds
                         if idx.item() < len(raw_arts))

    with torch.no_grad():
        summaries = abstractor(ext_sents)

    offset = 0
    rewards = []
    avg_reward = 0
    for inds, abss in zip(indices, abs_batch):
        n_ext = len(inds) - 1
        n_matched = min(n_ext, len(abss))
        rs = [reward_fn(summaries[offset + j], abss[j]) for j in range(n_matched)]
        rs += [0] * max(0, n_ext - len(abss))
        rs.append(stop_coeff * stop_reward_fn(
            list(concat(summaries[offset:offset + n_ext])),
            list(concat(abss))))
        assert len(rs) == len(inds)
        avg_reward += rs[-1]/stop_coeff
        offset += n_ext

        # compute discounted rewards, walking backwards through the episode
        running = 0
        disc_rs = []
        for r in reversed(rs):
            running = r + gamma * running
            disc_rs.append(running)
        disc_rs.reverse()
        rewards.extend(disc_rs)

    indices = list(concat(indices))
    probs = list(concat(probs))
    baselines = list(concat(baselines))

    # standardize rewards
    reward = torch.Tensor(rewards).to(baselines[0].get_device())
    eps = float(np.finfo(np.float32).eps)
    reward = (reward - reward.mean()) / (reward.std() + eps)
    baseline = torch.cat(baselines).squeeze()

    avg_advantage = 0
    losses = []
    for action, p, r, b in zip(indices, probs, reward, baseline):
        advantage = r - b
        avg_advantage += advantage
        # divide by T*B
        losses.append(-p.log_prob(action) * (advantage/len(indices)))
    critic_loss = F.mse_loss(baseline, reward)

    # backprop and update
    all_losses = [critic_loss] + losses
    grad_seeds = [torch.ones(1).to(critic_loss.get_device())]*(1+len(losses))
    autograd.backward(all_losses, grad_seeds)
    grad_log = grad_fn()
    opt.step()

    log_dict = {}
    log_dict.update(grad_log)
    log_dict['reward'] = avg_reward/len(art_batch)
    log_dict['advantage'] = avg_advantage.item()/len(indices)
    log_dict['mse'] = critic_loss.item()
    assert not math.isnan(log_dict['grad_norm'])
    return log_dict
def apply_gufunc(func, signature, *args, **kwargs):
    """
    Apply a generalized ufunc or similar python function to arrays.

    ``signature`` determines if the function consumes or produces core
    dimensions. The remaining dimensions in given input arrays (``*args``)
    are considered loop dimensions and are required to broadcast
    naturally against each other.

    In other terms, this function is like np.vectorize, but for
    the blocks of dask arrays. If the function itself shall also
    be vectorized use ``vectorize=True`` for convenience.

    Parameters
    ----------
    func : callable
        Function to call like ``func(*args, **kwargs)`` on input arrays
        (``*args``) that returns an array or tuple of arrays. If multiple
        arguments with non-matching dimensions are supplied, this function is
        expected to vectorize (broadcast) over axes of positional arguments in
        the style of NumPy universal functions [1]_ (if this is not the case,
        set ``vectorize=True``). If this function returns multiple outputs,
        ``output_core_dims`` has to be set as well.
    signature: string
        Specifies what core dimensions are consumed and produced by ``func``.
        According to the specification of numpy.gufunc signature [2]_
    *args : numeric
        Input arrays or scalars to the callable function.
    axes: List of tuples, optional, keyword only
        A list of tuples with indices of axes a generalized ufunc should operate on.
        For instance, for a signature of ``"(i,j),(j,k)->(i,k)"`` appropriate for
        matrix multiplication, the base elements are two-dimensional matrices
        and these are taken to be stored in the two last axes of each argument. The
        corresponding axes keyword would be ``[(-2, -1), (-2, -1), (-2, -1)]``.
        For simplicity, for generalized ufuncs that operate on 1-dimensional arrays
        (vectors), a single integer is accepted instead of a single-element tuple,
        and for generalized ufuncs for which all outputs are scalars, the output
        tuples can be omitted.
    axis: int, optional, keyword only
        A single axis over which a generalized ufunc should operate. This is a short-cut
        for ufuncs that operate over a single, shared core dimension, equivalent to passing
        in axes with entries of (axis,) for each single-core-dimension argument and ``()`` for
        all others. For instance, for a signature ``"(i),(i)->()"``, it is equivalent to passing
        in ``axes=[(axis,), (axis,), ()]``.
    keepdims: bool, optional, keyword only
        If this is set to True, axes which are reduced over will be left in the result as
        a dimension with size one, so that the result will broadcast correctly against the
        inputs. This option can only be used for generalized ufuncs that operate on inputs
        that all have the same number of core dimensions and with outputs that have no core
        dimensions , i.e., with signatures like ``"(i),(i)->()"`` or ``"(m,m)->()"``.
        If used, the location of the dimensions in the output can be controlled with axes
        and axis.
    output_dtypes : Optional, dtype or list of dtypes, keyword only
        Valid numpy dtype specification or list thereof.
        If not given, a call of ``func`` with a small set of data
        is performed in order to try to  automatically determine the
        output dtypes.
    output_sizes : dict, optional, keyword only
        Optional mapping from dimension names to sizes for outputs. Only used if
        new core dimensions (not found on inputs) appear on outputs.
    vectorize: bool, keyword only
        If set to ``True``, ``np.vectorize`` is applied to ``func`` for
        convenience. Defaults to ``False``.
    allow_rechunk: Optional, bool, keyword only
        Allows rechunking, otherwise chunk sizes need to match and core
        dimensions are to consist only of one chunk.
        Warning: enabling this can increase memory usage significantly.
        Defaults to ``False``.
    **kwargs : dict
        Extra keyword arguments to pass to `func`

    Returns
    -------
    Single dask.array.Array or tuple of dask.array.Array

    Raises
    ------
    TypeError
        If ``signature`` is not a string.
    ValueError
        If ``output_dtypes`` does not match the number of outputs, if the
        number of positional arrays differs from the number of inputs in
        ``signature``, or if sizes/chunk sizes of a shared dimension are
        incompatible.

    Examples
    --------
    >>> import dask.array as da
    >>> import numpy as np
    >>> def stats(x):
    ...     return np.mean(x, axis=-1), np.std(x, axis=-1)
    >>> a = da.random.normal(size=(10,20,30), chunks=(5, 10, 30))
    >>> mean, std = da.apply_gufunc(stats, "(i)->(),()", a)
    >>> mean.compute().shape
    (10, 20)

    >>> def outer_product(x, y):
    ...     return np.einsum("i,j->ij", x, y)
    >>> a = da.random.normal(size=(   20,30), chunks=(10, 30))
    >>> b = da.random.normal(size=(10, 1,40), chunks=(5, 1, 40))
    >>> c = da.apply_gufunc(outer_product, "(i),(j)->(i,j)", a, b, vectorize=True)
    >>> c.compute().shape
    (10, 20, 30, 40)

    References
    ----------
    .. [1] https://docs.scipy.org/doc/numpy/reference/ufuncs.html
    .. [2] https://docs.scipy.org/doc/numpy/reference/c-api.generalized-ufuncs.html
    """
    axes = kwargs.pop("axes", None)
    axis = kwargs.pop("axis", None)
    keepdims = kwargs.pop("keepdims", False)
    output_dtypes = kwargs.pop("output_dtypes", None)
    output_sizes = kwargs.pop("output_sizes", None)
    vectorize = kwargs.pop("vectorize", None)
    allow_rechunk = kwargs.pop("allow_rechunk", False)

    # Input processing:
    ## Signature
    if not isinstance(signature, str):
        raise TypeError('`signature` has to be of type string')
    input_coredimss, output_coredimss = _parse_gufunc_signature(signature)

    ## Determine nout: nout = None for functions of one direct return; nout = int for return tuples
    nout = None if not isinstance(output_coredimss, list) else len(output_coredimss)

    ## Determine and handle output_dtypes
    if output_dtypes is None:
        if vectorize:
            tempfunc = np.vectorize(func, signature=signature)
        else:
            tempfunc = func
        output_dtypes = apply_infer_dtype(tempfunc, args, kwargs, "apply_gufunc", "output_dtypes", nout)

    if isinstance(output_dtypes, (tuple, list)):
        if nout is None:
            if len(output_dtypes) > 1:
                raise ValueError(("Must specify single dtype or list of one dtype "
                                  "for `output_dtypes` for function with one output"))
            otypes = output_dtypes
            output_dtypes = output_dtypes[0]
        else:
            otypes = output_dtypes
    else:
        if nout is not None:
            raise ValueError("Must specify tuple of dtypes for `output_dtypes` for function with multiple outputs")
        otypes = [output_dtypes]

    ## Vectorize function, if required
    if vectorize:
        func = np.vectorize(func, signature=signature, otypes=otypes)

    ## Miscellaneous
    if output_sizes is None:
        output_sizes = {}

    ## Axes
    input_axes, output_axes = _validate_normalize_axes(axes, axis, keepdims, input_coredimss, output_coredimss)

    # Main code:
    ## Cast all input arrays to dask
    args = [asarray(a) for a in args]

    # Without this check the zips below would silently truncate to the
    # shorter of the two sequences.
    if len(input_coredimss) != len(args):
        raise ValueError("According to `signature`, `func` requires %d arguments, but %s given"
                         % (len(input_coredimss), len(args)))

    ## Axes: transpose input arguments
    transposed_args = []
    for arg, iax, input_coredims in zip(args, input_axes, input_coredimss):
        shape = arg.shape
        # Express every core axis as a negative index, then move the core
        # axes to the end while keeping the remaining axes in order.
        iax = tuple(a if a < 0 else a - len(shape) for a in iax)
        tidc = tuple(i for i in range(-len(shape) + 0, 0) if i not in iax) + iax
        transposed_arg = arg.transpose(tidc)
        transposed_args.append(transposed_arg)
    args = transposed_args

    ## Assess input args for loop dims
    input_shapes = [a.shape for a in args]
    input_chunkss = [a.chunks for a in args]
    num_loopdims = [len(s) - len(cd) for s, cd in zip(input_shapes, input_coredimss)]
    max_loopdims = max(num_loopdims) if num_loopdims else None
    core_input_shapes = [dict(zip(icd, s[n:])) for s, n, icd in zip(input_shapes, num_loopdims, input_coredimss)]
    core_shapes = merge(*core_input_shapes)
    core_shapes.update(output_sizes)

    loop_input_dimss = [tuple("__loopdim%d__" % d for d in range(max_loopdims - n, max_loopdims)) for n in num_loopdims]
    input_dimss = [l + c for l, c in zip(loop_input_dimss, input_coredimss)]

    loop_output_dims = max(loop_input_dimss, key=len) if loop_input_dimss else tuple()

    ## Assess input args for same size and chunk sizes
    ### Collect sizes and chunksizes of all dims in all arrays
    dimsizess = {}
    chunksizess = {}
    for dims, shape, chunksizes in zip(input_dimss, input_shapes, input_chunkss):
        for dim, size, chunksize in zip(dims, shape, chunksizes):
            dimsizes = dimsizess.get(dim, [])
            dimsizes.append(size)
            dimsizess[dim] = dimsizes
            chunksizes_ = chunksizess.get(dim, [])
            chunksizes_.append(chunksize)
            chunksizess[dim] = chunksizes_
    ### Assert correct partitioning, for case:
    for dim, sizes in dimsizess.items():
        #### Check that the arrays have same length for same dimensions or dimension `1`
        if set(sizes).union({1}) != {1, max(sizes)}:
            raise ValueError("Dimension `'{}'` with different lengths in arrays".format(dim))
        if not allow_rechunk:
            chunksizes = chunksizess[dim]
            #### Check if core dimensions consist of only one chunk
            if (dim in core_shapes) and (chunksizes[0][0] < core_shapes[dim]):
                raise ValueError("Core dimension `'{}'` consists of multiple chunks. To fix, rechunk into a single "
                                 "chunk along this dimension or set `allow_rechunk=True`, but beware that this may "
                                 "increase memory usage significantly.".format(dim))
            #### Check if loop dimensions consist of same chunksizes, when they have sizes > 1
            relevant_chunksizes = list(unique(c for s, c in zip(sizes, chunksizes) if s > 1))
            if len(relevant_chunksizes) > 1:
                raise ValueError("Dimension `'{}'` with different chunksize present".format(dim))

    ## Apply function - use blockwise here
    arginds = list(concat(zip(args, input_dimss)))

    ### Use existing `blockwise` but only with loopdims to enforce
    ### concatenation for coredims that appear also at the output
    ### Modifying `blockwise` could improve things here.
    tmp = blockwise(
        func,
        loop_output_dims,
        *arginds,
        dtype=int,  # Only dummy dtype, anyone will do
        concatenate=True,
        **kwargs
    )

    ## Prepare output shapes
    loop_output_shape = tmp.shape
    loop_output_chunks = tmp.chunks
    keys = list(flatten(tmp.__dask_keys__()))
    name, token = keys[0][0].split('-')

    ### *) Treat direct output
    if nout is None:
        output_coredimss = [output_coredimss]
        output_dtypes = [output_dtypes]

    ## Split output
    leaf_arrs = []
    for i, ocd, odt, oax in zip(count(0), output_coredimss, output_dtypes, output_axes):
        core_output_shape = tuple(core_shapes[d] for d in ocd)
        core_chunkinds = len(ocd) * (0,)
        output_shape = loop_output_shape + core_output_shape
        output_chunks = loop_output_chunks + core_output_shape
        leaf_name = "%s_%d-%s" % (name, i, token)
        leaf_dsk = {(leaf_name,) + key[1:] + core_chunkinds: ((getitem, key, i) if nout else key) for key in keys}
        graph = HighLevelGraph.from_collections(leaf_name, leaf_dsk, dependencies=[tmp])
        leaf_arr = Array(graph,
                         leaf_name,
                         chunks=output_chunks,
                         shape=output_shape,
                         dtype=odt)

        ### Axes:
        if keepdims:
            slices = len(leaf_arr.shape) * (slice(None),) + len(oax) * (np.newaxis,)
            leaf_arr = leaf_arr[slices]

        # Build the permutation: requested output axes receive the trailing
        # (core) dimensions, every other position is filled in order.
        tidcs = [None] * len(leaf_arr.shape)
        for src, oa in zip(range(-len(oax), 0), oax):
            tidcs[oa] = src
        j = 0
        for pos in range(len(tidcs)):
            if tidcs[pos] is None:
                tidcs[pos] = j
                j += 1
        leaf_arr = leaf_arr.transpose(tidcs)
        leaf_arrs.append(leaf_arr)

    return leaf_arrs if nout else leaf_arrs[0]  # Undo *) from above