def inference(self, input_samples, max_len=200, chunk_size=64, num_return_sequences=4, **kwargs):
    """
    input_samples: [{'all_raw_queries': ['sadfad', 'adfad'], ...}]
    """
    self.eval()
    with torch.no_grad():
        new_samples = []
        for chunk_samples in tqdm(list(chunks(input_samples, chunk_size)), desc="Re-writing"):
            input_text = [self.sample_to_input_text(s) for s in chunk_samples]
            input_tok_obj = self.tokenizer(input_text, return_tensors='pt', padding=True)
            input_ids = input_tok_obj['input_ids'].to(self.device)
            input_att_mask = input_tok_obj['attention_mask'].to(self.device)
            decoder_start_token_id = self.tokenizer.get_vocab()['<extra_id_0>']
            outputs = self.transformer.generate(
                input_ids,
                attention_mask=input_att_mask,
                num_return_sequences=num_return_sequences,
                num_beams=max(4, num_return_sequences),
                max_length=max_len,
                early_stopping=True,
                return_dict_in_generate=True,
                output_scores=True,
                decoder_start_token_id=decoder_start_token_id,
                **kwargs)
            output_ids = outputs.sequences
            output_scores = outputs.sequences_scores
            output_text = [
                self.tokenizer.decode(single_output_ids, skip_special_tokens=True)
                for single_output_ids in output_ids
            ]
            output_chunks = list(chunks(output_text, num_return_sequences))
            output_score_chunks = list(chunks(output_scores, num_return_sequences))
            for sample, all_gen_per_sample, scores in zip(chunk_samples, output_chunks,
                                                          output_score_chunks):
                sample['all_generations'] = all_gen_per_sample
                sample['scores'] = scores.softmax(-1)
                sample['top_output'] = all_gen_per_sample[0]
            new_samples += chunk_samples
    return new_samples

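# A hedged usage sketch for inference() above. `model` and the exact sample
# schema (beyond 'all_raw_queries') are assumptions; per its docstring, the
# method reads each sample via sample_to_input_text() and writes
# 'all_generations', 'scores', and 'top_output' back onto each dict.

def _example_rewrite(model):
    samples = [{'all_raw_queries': ['who wrote hamlet', 'when was it written?']}]
    rewritten = model.inference(samples, num_return_sequences=4)
    print(rewritten[0]['top_output'])       # highest-scoring rewrite
    print(rewritten[0]['all_generations'])  # all returned beam candidates
    return rewritten
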
def test_chunk_with_larger_number(self):
    sample_list = list(range(self.CHUNK_SIZE))
    chunks = list(utils.chunks(sample_list, self.CHUNK_SIZE + 1))
    self.assertEqual(len(chunks), 1)
    self.assertEqual(len(chunks[0]), self.CHUNK_SIZE)

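# Both tests in this file exercise utils.chunks, whose implementation is not
# shown here. A minimal sketch consistent with the assertions (fixed-size
# slices, last slice possibly shorter); the real helper may differ:

def chunks(items, size):
    """Yield successive slices of `items` with at most `size` elements each."""
    size = max(1, int(size))  # tolerate float sizes from len(x) / n_jobs callers
    for start in range(0, len(items), size):
        yield items[start:start + size]

# Example: list(chunks(list(range(5)), 2)) -> [[0, 1], [2, 3], [4]]
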
def solve_labels_collision(subject, subjects_dir, atlas, backup_atlas, n_jobs=1):
    now = time.time()
    print('Read labels')
    labels = utils.read_labels_parallel(subject, subjects_dir, atlas, n_jobs)
    backup_labels_fol = op.join(subjects_dir, subject, 'label', backup_atlas)
    labels_fol = op.join(subjects_dir, subject, 'label', atlas)
    if op.isdir(backup_labels_fol):
        shutil.rmtree(backup_labels_fol)
    os.rename(labels_fol, backup_labels_fol)
    utils.make_dir(labels_fol)
    hemis_verts, labels_hemi, pia_verts = {}, {}, {}
    print('Read surface ({:.2f}s)'.format(time.time() - now))
    for hemi in HEMIS:
        surf_fname = op.join(subjects_dir, subject, 'surf', '{}.pial'.format(hemi))
        hemis_verts[hemi], _ = mne.surface.read_surface(surf_fname)
        labels_hemi[hemi] = [l for l in labels if l.hemi == hemi]
    print('Calc centroids ({:.2f}s)'.format(time.time() - now))
    centroids = calc_labels_centroids(labels_hemi, hemis_verts)
    for hemi in HEMIS:
        print('Calc vertices labeling for {} ({:.2f}s)'.format(hemi, time.time() - now))
        hemi_centroids_dist = cdist(hemis_verts[hemi], centroids[hemi])
        vertices_labels_indices = np.argmin(hemi_centroids_dist, axis=1)
        # Integer chunk size: plain '/' yields a float under Python 3
        chunk_size = max(1, len(labels_hemi[hemi]) // n_jobs)
        labels_hemi_chunks = utils.chunks(list(enumerate(labels_hemi[hemi])), chunk_size)
        params = [(labels_hemi_chunk, atlas, vertices_labels_indices, hemis_verts, labels_fol)
                  for labels_hemi_chunk in labels_hemi_chunks]
        print('Save labels for {} ({:.2f}s)'.format(hemi, time.time() - now))
        utils.run_parallel(_save_new_labels_parallel, params, n_jobs)

def calc_vector_mean_cov_cl(ys, fol, hs_tr, measure, atlas, k_type='triangular', sim=False,
                            overwrite=False, n_jobs=1):
    output_fname = op.join(fol, 'vector_mean_cov_cl{}_{}_{}_{}.npz'.format(
        '_sim' if sim else '', k_type, measure, atlas))
    if op.isfile(output_fname) and not overwrite:
        d = np.load(output_fname)
        if np.any(np.array(d['hs_tr']) != np.array(hs_tr)):
            overwrite = True
    if not op.isfile(output_fname) or overwrite:
        # d = np.load(op.join(fol, 'vector_mean_var{}_{}.npz'.format('_sim' if sim else '', k_type)))
        # means_est = d['means']  # , d['hs_tr'], d['hs_ms']
        means_est = None
        mean_ll_cl, mean_pan_cl = np.zeros((len(hs_tr))), np.zeros((len(hs_tr)))
        cov_ll_cl, cov_pan_cl = np.zeros((len(hs_tr))), np.zeros((len(hs_tr)))
        # Integer chunk size: plain '/' yields a float under Python 3
        h_chunks = utils.chunks(list(enumerate(hs_tr)), max(1, len(hs_tr) // n_jobs))
        params = [(ys, means_est, h_chunk, k_type) for h_chunk in h_chunks]
        results = utils.run_parallel(_calc_vector_mean_and_cov_cl_parallel, params, n_jobs)
        for chunk_mean_ll_cl, chunk_mean_pan_cl, chunk_cov_ll_cl, chunk_cov_pan_cl in results:
            for h_ind in chunk_mean_ll_cl.keys():
                mean_ll_cl[h_ind] = chunk_mean_ll_cl[h_ind]
                mean_pan_cl[h_ind] = chunk_mean_pan_cl[h_ind]
                cov_ll_cl[h_ind] = chunk_cov_ll_cl[h_ind]
                cov_pan_cl[h_ind] = chunk_cov_pan_cl[h_ind]
        print('Saving results in {}'.format(output_fname))
        np.savez(output_fname, mean_ll_cl=mean_ll_cl, mean_pan_cl=mean_pan_cl,
                 cov_ll_cl=cov_ll_cl, cov_pan_cl=cov_pan_cl, hs_tr=hs_tr)

def est_mean_and_var(ys, names, hs_tr, hs_s, t_axis, fol, k_type='triangular', sim=False,
                     overwrite=False, specific_label='', n_jobs=1):
    output_fname = op.join(fol, 'mean_var{}_{}_{}-{}.npz'.format(
        '_sim' if sim else '', k_type, hs_s[0], hs_s[-1]))
    if op.isfile(output_fname) and not overwrite:
        # Check the existing output file itself for stale parameters
        # (not the unrelated 'mean_var_sim_*' file)
        d = np.load(output_fname)
        if np.any(np.array(d['hs_tr']) != np.array(hs_tr)):
            overwrite = True
            print('The parameter hs_tr is not the same as in the saved file, recalculating.')
    W = len(hs_s)
    if not op.isfile(output_fname) or overwrite:
        # all_means, all_vars = np.zeros((ys.shape[0], W)), np.zeros((ys.shape[0], W))
        all_means = np.zeros((len(hs_tr), len(names), ys.shape[1]))
        all_vars = np.zeros((len(hs_tr), len(names), ys.shape[1]))
        # Integer chunk size: plain '/' yields a float under Python 3
        h_chunks = utils.chunks(list(zip(hs_tr, hs_s, range(W))), max(1, W // n_jobs))
        params = [(ys, names, h_chunk, t_axis, fol, k_type, specific_label)
                  for h_chunk in h_chunks]
        results = utils.run_parallel(_est_mean_and_var_parallel, params, n_jobs)
        # for chunk_means, chunk_vars in results:
        #     for (h_ind, mean), var in zip(chunk_means.items(), chunk_vars.values()):
        #         all_means[h_ind] = mean
        #         all_vars[h_ind] = var
        for (chunk_means, chunk_vars) in results:
            for h_ind, label_ind in chunk_means.keys():
                all_means[h_ind, label_ind, :] = chunk_means[(h_ind, label_ind)]
                all_vars[h_ind, label_ind, :] = chunk_vars[(h_ind, label_ind)]
        # for ind, (h_tr, h_s) in enumerate(zip(hs_tr, hs_s)):
        #     print('h: {}s'.format(h_s))
        #     all_means[ind] = est_vector_mean_ll(ys, h_tr, k_type)
        #     all_vars[ind] = est_vector_est_var_t(ys, all_means[ind], h_tr, k_type)
        np.savez(output_fname, means=all_means, vars=all_vars, hs_tr=hs_tr, hs_ms=hs_s)

def test_chunks(self):
    sample_list = list(range(self.CHUNK_SIZE))
    split_size = 100
    chunks = list(utils.chunks(sample_list, split_size))
    # Assumes CHUNK_SIZE is a multiple of split_size
    self.assertEqual(len(chunks), self.CHUNK_SIZE // split_size)
    self.assertEqual(len(chunks[0]), split_size)

def read_labels_from_folder(subject, subjects_dir, atlas, n_jobs):
    labels_files = glob.glob(op.join(subjects_dir, subject, 'label', atlas, '*.label'))
    # Integer chunk size: plain '/' yields a float under Python 3
    files_chunks = utils.chunks(labels_files, max(1, len(labels_files) // n_jobs))
    results = utils.run_parallel(_read_labels_parallel, files_chunks, n_jobs)
    labels = []
    for labels_chunk in results:
        labels.extend(labels_chunk)
    return labels

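# The label/estimator helpers above fan work out through utils.run_parallel,
# whose body is not shown. A minimal sketch under the assumption that it maps
# a worker over the parameter list with a process pool (the real helper may
# differ, e.g. in how it handles exceptions or chunking):

import multiprocessing

def run_parallel(func, params, n_jobs=1):
    """Apply `func` to every item of `params`, serially or across processes."""
    if n_jobs == 1:
        return [func(p) for p in params]  # serial path, simpler to debug
    with multiprocessing.Pool(processes=n_jobs) as pool:
        return pool.map(func, params)
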
def main():
    sqs_client = boto3.client("sqs")
    utils.purge_queue_or_exit(sqs_client, INPUT_QUEUE_URL)
    utils.purge_queue_or_exit(sqs_client, RESULT_QUEUE_URL)
    code = utils.read_token()
    utils.start_test_or_exit(TARGET_URL, 3, code)
    request = utils.make_request(f"{TARGET_URL}/test3",
                                 headers={"AccessToken": code},
                                 method="GET")
    datas = []
    index = 0
    while index < TARGET_COUNT:
        print(f"{index}th request")
        try:
            response = urlopen(request)
            datas += json.loads(response.read())
            index += 1
        except HTTPError as err:
            message = err.read().decode("utf-8")
            print(f"request failed: {err.code} - {message}")
            time.sleep(1)
    for chunk in utils.chunks(datas, SIZE_OF_CHUNK):
        utils.send_queue_message(sqs_client, INPUT_QUEUE_URL, json.dumps(chunk))
    invoker = utils.create_async_invoker(
        boto3.client("lambda"),
        TARGET_URL,
        code,
        "test3",
        function_name="stibee_project_invoke_api2",
    )
    for i in range(len(datas) // SIZE_OF_CHUNK):
        result = invoker()
        print(
            f"{i}th lambda ",
            "invoke succeeded."
            if result["ResponseMetadata"]["HTTPStatusCode"] == 202
            else "invoke failed.",
        )
    # Sometimes the test does not pass even though all 1000 messages were sent.
    # Is a delay needed after the 1000 requests?
    queues = utils.read_queue_message(sqs_client, RESULT_QUEUE_URL)
    if len(queues) == len(datas):
        utils.end_test_or_exit(TARGET_URL, 3, code)

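# The SQS helpers used in main() wrap the boto3 client. The wrapper body below
# is an assumption inferred from the call site; send_message itself is a real
# boto3 SQS operation:

def send_queue_message(sqs_client, queue_url, body):
    """Send one message body (a JSON string here) to an SQS queue."""
    return sqs_client.send_message(QueueUrl=queue_url, MessageBody=body)
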
def QA_inference(self, input_samples, chunk_size=64):
    self.eval()
    with torch.no_grad():
        new_samples = []
        for chunk_samples in tqdm(list(chunks(input_samples, chunk_size)), desc="QA-inference"):
            batch_size = len(chunk_samples)
            input_text = [self.sample_to_input_text(s) for s in chunk_samples]
            input_tok_obj = self.tokenizer(input_text, return_tensors='pt', padding=True)
            input_ids = input_tok_obj['input_ids'].to(self.device)
            input_att_mask = input_tok_obj['attention_mask'].to(self.device)
            decoder_start_token_id = self.tokenizer.get_vocab()['<extra_id_0>']
            decoder_ids = torch.full((batch_size, 1), decoder_start_token_id).to(self.device)
            outputs = self.transformer(input_ids,
                                       decoder_input_ids=decoder_ids,
                                       attention_mask=input_att_mask,
                                       use_cache=False)
            logits = outputs[0]
            true_token_id = self.tokenizer.get_vocab()['▁true']  # careful: "▁" is not an underscore "_"
            false_token_id = self.tokenizer.get_vocab()['▁false']
            for sample, token_logits in zip(chunk_samples, logits):
                # Logits at the first (and only) decoder position
                true_logit = token_logits[0][true_token_id]
                false_logit = token_logits[0][false_token_id]
                sample['false/true'] = torch.tensor([false_logit, true_logit]).softmax(-1)
                if 'answer' in sample:
                    label = torch.tensor([sample['answer']])  # tensor [True/False]
                    self.EM_accuracy(sample['false/true'].unsqueeze(0), label)
            new_samples += chunk_samples
    return new_samples

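# The '▁true' / '▁false' lookups above rely on SentencePiece's word-boundary
# marker U+2581, which is easy to mistake for an ASCII underscore. A hedged
# sanity check, assuming a T5-style Hugging Face tokenizer whose vocabulary
# contains these whole-word pieces (as QA_inference requires):

def _check_sentencepiece_marker(tokenizer):
    marker = '\u2581'  # '▁', not '_'
    vocab = tokenizer.get_vocab()
    assert marker + 'true' in vocab and marker + 'false' in vocab
    return vocab[marker + 'true'], vocab[marker + 'false']
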
def calc_vector_mean_cl(ys, fol, hs_tr, k_type='triangular', sim=False, overwrite=False, n_jobs=1):
    output_fname = op.join(fol, 'vector_mean_cl{}_{}.npz'.format('_sim' if sim else '', k_type))
    if op.isfile(output_fname) and not overwrite:
        d = np.load(output_fname)
        if np.any(np.array(d['hs_tr']) != np.array(hs_tr)):
            overwrite = True
    if not op.isfile(output_fname) or overwrite:
        d = np.load(op.join(fol, 'vector_mean_var{}_{}.npz'.format('_sim' if sim else '', k_type)))
        means_est, vars_est, hs_tr, hs_ms = d['means'], d['vars'], d['hs_tr'], d['hs_ms']
        # Note: var_cl is saved but never filled in; it stays all zeros
        mean_cl, var_cl = np.zeros((len(hs_tr))), np.zeros((len(hs_tr)))
        # Integer chunk size: plain '/' yields a float under Python 3
        h_chunks = utils.chunks(list(enumerate(hs_tr)), max(1, len(hs_tr) // n_jobs))
        params = [(ys, means_est, h_chunk, k_type) for h_chunk in h_chunks]
        results = utils.run_parallel(_calc_vector_mean_cl_parallel, params, n_jobs)
        for chunk_mean_cl in results:
            for h_ind in chunk_mean_cl.keys():
                mean_cl[h_ind] = chunk_mean_cl[h_ind]
        # for hs_ind, h_tr in enumerate(hs_tr):
        #     mean_cl[hs_ind] = vector_mean_cl_stat(ys, means[hs_ind], h_tr, k_type)
        np.savez(output_fname, mean_cl=mean_cl, var_cl=var_cl, hs_tr=hs_tr, hs_ms=hs_ms)

def calc_mean_var_cl(ys, fol, hs_tr, hs_s, labels_names, k_type='triangular', sim=False,
                     overwrite=False, specific_label='', n_jobs=1):
    output_fname = op.join(fol, 'mean_var{}_cl_{}_{}-{}.npz'.format(
        '_sim' if sim else '', k_type, hs_s[0], hs_s[-1]))
    if op.isfile(output_fname) and not overwrite:
        d = np.load(output_fname)
        if np.any(np.array(d['hs_tr']) != np.array(hs_tr)):
            overwrite = True
    if not op.isfile(output_fname) or overwrite:
        input_fname = op.join(fol, 'mean_var{}_{}_{}-{}.npz'.format(
            '_sim' if sim else '', k_type, hs_s[0], hs_s[-1]))
        d = np.load(input_fname)
        means_est, vars_est, hs_tr, hs_ms = d['means'], d['vars'], d['hs_tr'], d['hs_ms']
        mean_cl = np.zeros((ys.shape[0], len(hs_tr)))
        var_cl = np.zeros((ys.shape[0], len(hs_tr)))
        params_to_chunk = []
        for (h_tr_ind, h_tr), (label_ind, label_name) in product(enumerate(hs_tr),
                                                                 enumerate(labels_names)):
            params_to_chunk.append((h_tr_ind, h_tr, label_ind, label_name))
        # Integer chunk size: plain '/' yields a float under Python 3
        h_l_chunks = utils.chunks(params_to_chunk, max(1, len(params_to_chunk) // n_jobs))
        params = [(ys, means_est, vars_est, h_l_chunk, k_type, specific_label)
                  for h_l_chunk in h_l_chunks]
        results = utils.run_parallel(_calc_mean_var_cl_parallel, params, n_jobs)
        for (chunk_mean_cl, chunk_var_cl) in results:
            for h_ind, label_ind in chunk_mean_cl.keys():
                mean_cl[label_ind, h_ind] = chunk_mean_cl[(h_ind, label_ind)]
                var_cl[label_ind, h_ind] = chunk_var_cl[(h_ind, label_ind)]
        np.savez(output_fname, mean_cl=mean_cl, var_cl=var_cl, hs_tr=hs_tr, hs_ms=hs_ms)

from functools import reduce  # reduce must be imported in Python 3

def knot_hash(plain, keys):
    sparse_hash = hash_round(plain, keys, rounds=64)
    # XOR-fold each block of 16 sparse-hash numbers into one dense-hash byte
    dense_hash = [
        reduce(lambda x, y: x ^ y, chunk)
        for chunk in chunks(sparse_hash, 16)
    ]
    return ''.join("%02x" % x for x in dense_hash)

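# Worked check of the XOR fold: the example block from the Advent of Code 2017
# "Knot Hash" puzzle reduces to 64, which hex-encodes as "40".

from operator import xor

_block = [65, 27, 9, 1, 4, 3, 40, 50, 91, 7, 6, 0, 2, 5, 68, 22]
assert reduce(xor, _block) == 64
assert "%02x" % reduce(xor, _block) == "40"
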
def inference(self, input_samples, max_len=30, chunk_size=64, num_return_sequences=1,
              return_hidden_states=False, **kwargs):
    """
    input_samples: [{'all_raw_queries': ['sadfad', 'adfad'], ...}]
    """
    self.eval()
    with torch.no_grad():
        new_samples = []
        for chunk_samples in tqdm(list(chunks(input_samples, chunk_size)), desc="Inference"):
            flat_sample_text, fusion_map = self.samples_to_input(chunk_samples)
            encoder_tok_obj = self.tokenizer(flat_sample_text, return_tensors='pt', padding=True)
            input_ids = encoder_tok_obj['input_ids'].to(self.device)
            attention_mask = encoder_tok_obj['attention_mask'].to(self.device)
            encoder_outputs = self.encoder_forward(fusion_map, input_ids, attention_mask,
                                                   return_hidden_states=return_hidden_states)
            fused_attention_mask = encoder_outputs.attentions
            encoder_layer_states = (encoder_outputs.hidden_states if return_hidden_states
                                    else [None] * len(chunk_samples))
            batch_target_text = [self.sample_to_inference_target_text(s) for s in chunk_samples]
            decoder_tok_obj = self.decoder_tokenizer(batch_target_text, return_tensors='pt',
                                                     padding=True, add_special_tokens=False)
            decoder_input_ids = decoder_tok_obj['input_ids'].to(self.device)
            decoder_attention_mask = decoder_tok_obj['attention_mask'].to(self.device)
            kwargs.update({
                'encoder_outputs': encoder_outputs,
                'decoder_attention_mask': decoder_attention_mask
            })

            def prefix_allowed_tokens_fn(batch_id, input_ids):
                # Force the target prefix token by token; afterwards allow the full vocabulary
                if input_ids.shape[0] < decoder_input_ids[batch_id].shape[0]:
                    return decoder_input_ids[batch_id][input_ids.shape[0]].tolist()
                else:
                    return list(range(self.decoder_tokenizer.vocab_size))

            outputs = self.transformer.generate(
                decoder_input_ids,
                attention_mask=fused_attention_mask,
                num_return_sequences=num_return_sequences,
                num_beams=num_return_sequences,
                max_length=max_len,
                early_stopping=True,
                output_hidden_states=True,
                return_dict_in_generate=True,
                output_scores=True,
                prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
                use_cache=False,
                **kwargs)
            output_ids = outputs.sequences
            output_scores = (outputs.sequences_scores if num_return_sequences > 1
                             else torch.tensor([1.0] * len(chunk_samples)))
            output_text = [
                self.tokenizer.decode(single_output_ids, skip_special_tokens=True)
                for single_output_ids in output_ids
            ]
            output_chunks = list(chunks(output_text, num_return_sequences))
            output_score_chunks = list(chunks(output_scores, num_return_sequences))
            decoder_layer_states = (
                torch.einsum('ijkl->jkil', torch.stack(outputs.decoder_hidden_states[-1]))
                if return_hidden_states else [None] * len(chunk_samples))
            for i in range(len(chunk_samples)):
                start, end = fusion_map[i]
                chunk_samples[i]['encoder_input_ids'] = input_ids[start:end]
                chunk_samples[i]['decoder_input_ids'] = output_ids[i]
                chunk_samples[i]['all_generations'] = output_chunks[i]
                chunk_samples[i]['scores'] = output_score_chunks[i].softmax(-1)
                chunk_samples[i]['top_output'] = output_chunks[i][0]
                chunk_samples[i]['encoder_hidden_states'] = encoder_layer_states[i]
                chunk_samples[i]['decoder_hidden_states'] = decoder_layer_states[i]
                if 'answer' in chunk_samples[i]:
                    target_text = self.sample_to_train_target_text(chunk_samples[i])
                    # Round-trip through the decoder tokenizer to normalize the target
                    target_text = self.decoder_tokenizer.decode(
                        self.decoder_tokenizer.encode(target_text, add_special_tokens=False),
                        skip_special_tokens=True)
                    chunk_samples[i]['target_text'] = target_text
                    is_same = chunk_samples[i]['top_output'] == target_text
                    self.EM_accuracy(
                        torch.tensor([[not is_same, is_same]]).to(torch.float),
                        torch.tensor([1]))
                    chunk_samples[i]['EM'] = is_same
            new_samples += chunk_samples
    return new_samples