示例#1
0
    def inference(self,
                  input_samples,
                  max_len=200,
                  chunk_size=64,
                  num_return_sequences=4,
                  **kwargs):
        """Generate `num_return_sequences` rewrites for every sample.

        input_samples: [{'all_raw_queries':['sadfad','adfad'], ...}]

        Each sample dict is annotated in place with 'all_generations',
        'scores' (softmaxed sequence scores) and 'top_output' (the
        highest-ranked generation). Returns the annotated samples.
        """
        self.eval()
        with torch.no_grad():
            processed = []
            batches = list(chunks(input_samples, chunk_size))
            for batch in tqdm(batches, desc="Re-writing"):
                texts = [self.sample_to_input_text(s) for s in batch]
                tok = self.tokenizer(texts, return_tensors='pt', padding=True)
                ids = tok['input_ids'].to(self.device)
                mask = tok['attention_mask'].to(self.device)

                # Decoding starts from the sentinel token instead of BOS.
                start_id = self.tokenizer.get_vocab()['<extra_id_0>']

                gen_out = self.transformer.generate(
                    ids,
                    attention_mask=mask,
                    num_return_sequences=num_return_sequences,
                    num_beams=max(4, num_return_sequences),
                    max_length=max_len,
                    early_stopping=True,
                    return_dict_in_generate=True,
                    output_scores=True,
                    decoder_start_token_id=start_id,
                    **kwargs)

                decoded = [
                    self.tokenizer.decode(seq, skip_special_tokens=True)
                    for seq in gen_out.sequences
                ]
                # Regroup the flat beam output back per input sample.
                per_sample_text = list(chunks(decoded, num_return_sequences))
                per_sample_scores = list(
                    chunks(gen_out.sequences_scores, num_return_sequences))

                for sample, gens, raw_scores in zip(batch, per_sample_text,
                                                    per_sample_scores):
                    sample['all_generations'] = gens
                    sample['scores'] = raw_scores.softmax(-1)
                    sample['top_output'] = gens[0]
                processed += batch
            return processed
示例#2
0
    def test_chunk_with_larger_number(self):
        """A chunk size above the list length yields one full chunk."""
        data = list(range(self.CHUNK_SIZE))

        result = list(utils.chunks(data, self.CHUNK_SIZE + 1))

        self.assertEqual(len(result), 1)
        self.assertEqual(len(result[0]), self.CHUNK_SIZE)
示例#3
0
def solve_labels_collision(subject, subjects_dir, atlas, backup_atlas, n_jobs=1):
    """Resolve overlapping atlas labels by assigning each surface vertex to
    its nearest label centroid and rewriting the labels in parallel.

    The current label folder is moved aside to *backup_atlas* and the fresh
    labels are written back into the *atlas* folder.

    Parameters:
        subject, subjects_dir: FreeSurfer subject name / subjects directory.
        atlas: name of the atlas whose labels are rewritten.
        backup_atlas: folder name the original labels are moved to.
        n_jobs: number of parallel workers.
    """
    now = time.time()
    print('Read labels')
    labels = utils.read_labels_parallel(subject, subjects_dir, atlas, n_jobs)
    backup_labels_fol = op.join(subjects_dir, subject, 'label', backup_atlas)
    labels_fol = op.join(subjects_dir, subject, 'label', atlas)
    # Replace any stale backup, then move the current labels out of the way.
    if op.isdir(backup_labels_fol):
        shutil.rmtree(backup_labels_fol)
    os.rename(labels_fol, backup_labels_fol)
    utils.make_dir(labels_fol)
    hemis_verts, labels_hemi, pia_verts = {}, {}, {}
    print('Read surface ({:.2f}s)'.format(time.time() - now))
    for hemi in HEMIS:
        surf_fname = op.join(subjects_dir, subject, 'surf', '{}.pial'.format(hemi))
        hemis_verts[hemi], _ = mne.surface.read_surface(surf_fname)
        labels_hemi[hemi] = [l for l in labels if l.hemi == hemi]
    print('Calc centroids ({:.2f}s)'.format(time.time() - now))
    centroids = calc_labels_centroids(labels_hemi, hemis_verts)
    for hemi in HEMIS:
        print('Calc vertices labeling for {} ({:.2f}s)'.format(hemi, time.time() - now))
        # For every vertex, the index of the nearest label centroid.
        hemi_centroids_dist = cdist(hemis_verts[hemi], centroids[hemi])
        vertices_labels_indices = np.argmin(hemi_centroids_dist, axis=1)
        # Bug fix: '/' is true division in Python 3, so the chunk size was a
        # float; use floor division (clamped to >= 1) to keep the Python-2
        # behavior of splitting the work into ~n_jobs integer-sized chunks.
        labels_hemi_chunks = utils.chunks(list(enumerate(labels_hemi[hemi])),
                                          max(1, len(labels_hemi[hemi]) // n_jobs))
        params = [(labels_hemi_chunk, atlas, vertices_labels_indices, hemis_verts, labels_fol) for labels_hemi_chunk in labels_hemi_chunks]
        print('Save labels for {} ({:.2f}s)'.format(hemi, time.time() - now))
        utils.run_parallel(_save_new_labels_parallel, params, n_jobs)
示例#4
0
def calc_vector_mean_cov_cl(ys, fol, hs_tr, measure, atlas, k_type='triangular', sim=False, overwrite=False, n_jobs=1):
    """Estimate mean/covariance confidence-limit statistics for every
    bandwidth in *hs_tr* (in parallel) and save them to an .npz in *fol*.

    Recomputes when the output file is missing, *overwrite* is set, or the
    cached hs_tr differ from the requested ones.
    """
    output_fname = op.join(fol, 'vector_mean_cov_cl{}_{}_{}_{}.npz'.format('_sim' if sim else '', k_type, measure, atlas))
    if op.isfile(output_fname) and not overwrite:
        d = np.load(output_fname)
        # Force recalculation if the cached bandwidths differ from the request.
        if np.any(np.array(d['hs_tr']) != np.array(hs_tr)):
            overwrite = True
    if not op.isfile(output_fname) or overwrite:
        means_est = None
        mean_ll_cl, mean_pan_cl = np.zeros((len(hs_tr))), np.zeros((len(hs_tr)))
        cov_ll_cl, cov_pan_cl = np.zeros((len(hs_tr))), np.zeros((len(hs_tr)))

        # Bug fix: '/' is true division in Python 3, so the chunk size was a
        # float; use floor division clamped to >= 1.
        h_chunks = utils.chunks(list(enumerate(hs_tr)), max(1, len(hs_tr) // n_jobs))
        params = [(ys, means_est, h_chunk, k_type) for h_chunk in h_chunks]
        results = utils.run_parallel(_calc_vector_mean_and_cov_cl_parallel, params, n_jobs)
        # Each worker returns sparse dicts keyed by bandwidth index.
        for chunk_mean_ll_cl, chunk_mean_pan_cl, chunk_cov_ll_cl, chunk_cov_pan_cl in results:
            for h_ind in chunk_mean_ll_cl.keys():
                mean_ll_cl[h_ind] = chunk_mean_ll_cl[h_ind]
                mean_pan_cl[h_ind] = chunk_mean_pan_cl[h_ind]
                cov_ll_cl[h_ind] = chunk_cov_ll_cl[h_ind]
                cov_pan_cl[h_ind] = chunk_cov_pan_cl[h_ind]
        print('Saving results in {}'.format(output_fname))
        np.savez(output_fname, mean_ll_cl=mean_ll_cl, mean_pan_cl=mean_pan_cl, cov_ll_cl=cov_ll_cl,
                 cov_pan_cl=cov_pan_cl, hs_tr=hs_tr)
示例#5
0
def est_mean_and_var(ys, names, hs_tr, hs_s, t_axis, fol, k_type='triangular', sim=False, overwrite=False,
                     specific_label='', n_jobs=1):
    """Estimate per-label mean and variance curves for every bandwidth (in
    parallel) and save (means, vars, hs_tr, hs_ms) to an .npz in *fol*.

    Recomputes when the output file is missing, *overwrite* is set, or the
    cached hs_tr differ from the requested ones.
    """
    output_fname = op.join(fol, 'mean_var{}_{}_{}-{}.npz'.format('_sim' if sim else '', k_type, hs_s[0], hs_s[-1]))
    if op.isfile(output_fname) and not overwrite:
        # Bug fix: the staleness check loaded a hard-coded 'mean_var_sim_*'
        # file; compare against the output file we would actually reuse,
        # consistent with the sibling calc_* functions.
        d = np.load(output_fname)
        if np.any(np.array(d['hs_tr']) != np.array(hs_tr)):
            overwrite = True
            print('The parameter hs_tr is not the same as in the saved file, recalculating.')
    W = len(hs_s)
    if not op.isfile(output_fname) or overwrite:
        all_means = np.zeros((len(hs_tr), len(names), ys.shape[1]))
        all_vars = np.zeros((len(hs_tr), len(names), ys.shape[1]))

        # Bug fix: '/' is true division in Python 3, so the chunk size was a
        # float; use floor division clamped to >= 1.
        h_chunks = utils.chunks(list(zip(hs_tr, hs_s, range(W))), max(1, W // n_jobs))
        params = [(ys, names, h_chunk, t_axis, fol, k_type, specific_label) for h_chunk in h_chunks]
        results = utils.run_parallel(_est_mean_and_var_parallel, params, n_jobs)
        # Workers return sparse dicts keyed by (bandwidth index, label index).
        for (chunk_means, chunk_vars) in results:
            for h_ind, label_ind in chunk_means.keys():
                all_means[h_ind, label_ind, :] = chunk_means[(h_ind, label_ind)]
                all_vars[h_ind, label_ind, :] = chunk_vars[(h_ind, label_ind)]

        np.savez(output_fname, means=all_means, vars=all_vars, hs_tr=hs_tr, hs_ms=hs_s)
示例#6
0
    def test_chunks(self):
        """Splitting into 100-item chunks yields evenly-sized full chunks."""
        data = list(range(self.CHUNK_SIZE))
        chunk_len = 100

        result = list(utils.chunks(data, chunk_len))

        self.assertEqual(len(result), self.CHUNK_SIZE // chunk_len)
        self.assertEqual(len(result[0]), chunk_len)
示例#7
0
def read_labels_from_folder(subject, subjects_dir, atlas, n_jobs):
    """Read all .label files of *atlas* for *subject* in parallel.

    Returns a flat list of the labels read by all workers.
    """
    labels_files = glob.glob(op.join(subjects_dir, subject, 'label', atlas, '*.label'))
    # Bug fix: '/' is true division in Python 3, so the chunk size was a
    # float; use floor division clamped to >= 1 so each worker gets an
    # integer-sized slice of the file list.
    files_chunks = utils.chunks(labels_files, max(1, len(labels_files) // n_jobs))
    results = utils.run_parallel(_read_labels_parallel, files_chunks, n_jobs)
    labels = []
    # Flatten the per-worker label lists.
    for labels_chunk in results:
        labels.extend(labels_chunk)
    return labels
示例#8
0
def main():
    """Drive test 3 end to end: purge both queues, fetch TARGET_COUNT data
    batches over HTTP, enqueue them to SQS in chunks, fire one async lambda
    invocation per chunk, and finish the test once every result arrived.
    """
    sqs_client = boto3.client("sqs")
    # Start from empty queues so leftover messages don't skew the final count.
    utils.purge_queue_or_exit(sqs_client, INPUT_QUEUE_URL)
    utils.purge_queue_or_exit(sqs_client, RESULT_QUEUE_URL)

    code = utils.read_token()
    utils.start_test_or_exit(TARGET_URL, 3, code)

    request = utils.make_request(f"{TARGET_URL}/test3",
                                 headers={"AccessToken": code},
                                 method="GET")
    datas = []
    index = 0
    # Retry until TARGET_COUNT successful fetches: a failed request does not
    # advance `index`, so it is retried after the 1-second pause.
    while index < TARGET_COUNT:
        print(f"{index}th request")
        try:
            response = urlopen(request)
            datas += json.loads(response.read())
            index += 1
        except HTTPError as err:
            message = err.read().decode("utf-8")
            print(f"request failed: {err.code} - {message}")

        time.sleep(1)

    # Fan the collected records out to the input queue in fixed-size chunks.
    for chunk in utils.chunks(datas, SIZE_OF_CHUNK):
        utils.send_queue_message(sqs_client, INPUT_QUEUE_URL,
                                 json.dumps(chunk))

    invoker = utils.create_async_invoker(
        boto3.client("lambda"),
        TARGET_URL,
        code,
        "test3",
        function_name="stibee_project_invoke_api2",
    )
    # One async invocation per chunk; HTTP 202 means the invoke was accepted.
    # NOTE(review): '//' floors, so a trailing partial chunk gets no
    # invocation — confirm this is intended.
    for i in range(int(len(datas) // SIZE_OF_CHUNK)):
        result = invoker()
        print(
            f"{i}th lambda ",
            "invoke succeed." if result["ResponseMetadata"]["HTTPStatusCode"]
            == 202 else "invoke failed.",
        )

    # Sometimes the test does not pass even though all 1000 were sent.
    # Maybe a delay is needed after the 1000 requests?
    queues = utils.read_queue_message(sqs_client, RESULT_QUEUE_URL)
    if len(queues) == len(datas):
        utils.end_test_or_exit(TARGET_URL, 3, code)
示例#9
0
    def QA_inference(self, input_samples, chunk_size=64):
        """Score every sample with a true/false probability.

        Runs a single decoder step from the sentinel token and compares the
        logits of the 'true' and 'false' sentencepiece tokens. Each sample is
        annotated in place with 'false/true' (a softmaxed 2-tensor); samples
        carrying an 'answer' key also update self.EM_accuracy. Returns the
        annotated samples.
        """
        self.eval()
        with torch.no_grad():
            processed = []
            for batch in tqdm(list(chunks(input_samples, chunk_size)),
                              desc="QA-inference"):
                texts = [self.sample_to_input_text(s) for s in batch]
                tok = self.tokenizer(texts, return_tensors='pt', padding=True)
                ids = tok['input_ids'].to(self.device)
                mask = tok['attention_mask'].to(self.device)

                # Single-step decode starting from the sentinel token.
                start_id = self.tokenizer.get_vocab()['<extra_id_0>']
                decoder_ids = torch.full((len(batch), 1),
                                         start_id).to(self.device)

                model_out = self.transformer(ids,
                                             decoder_input_ids=decoder_ids,
                                             attention_mask=mask,
                                             use_cache=False)
                logits = model_out[0]

                vocab = self.tokenizer.get_vocab()
                # careful "▁" is not an underscore "_"
                true_id = vocab['▁true']
                false_id = vocab['▁false']

                for sample, step_logits in zip(batch, logits):
                    pair = torch.tensor([step_logits[0][false_id],
                                         step_logits[0][true_id]])
                    sample['false/true'] = pair.softmax(-1)
                    if 'answer' in sample:
                        # tensor [True/False]
                        label = torch.tensor([sample['answer']])
                        self.EM_accuracy(sample['false/true'].unsqueeze(0),
                                         label)
                processed += batch

            return processed
示例#10
0
def calc_vector_mean_cl(ys, fol, hs_tr, k_type='triangular', sim=False, overwrite=False, n_jobs=1):
    """Compute the vector-mean confidence-limit statistic per bandwidth (in
    parallel) and save (mean_cl, var_cl, hs_tr, hs_ms) to an .npz in *fol*.

    Recomputes when the output file is missing, *overwrite* is set, or the
    cached hs_tr differ from the requested ones.
    """
    output_fname = op.join(fol, 'vector_mean_cl{}_{}.npz'.format('_sim' if sim else '', k_type))
    if op.isfile(output_fname) and not overwrite:
        d = np.load(output_fname)
        if np.any(np.array(d['hs_tr']) != np.array(hs_tr)):
            overwrite = True
    if not op.isfile(output_fname) or overwrite:
        d = np.load(op.join(fol, 'vector_mean_var{}_{}.npz'.format('_sim' if sim else '', k_type)))
        # hs_tr is deliberately replaced by the bandwidths saved with the
        # mean/var estimates so the statistics stay aligned with them.
        means_est, vars_est, hs_tr, hs_ms = d['means'], d['vars'], d['hs_tr'], d['hs_ms']
        # NOTE(review): var_cl is saved as zeros — its computation is never
        # filled in here; confirm whether that is intended.
        mean_cl, var_cl = np.zeros((len(hs_tr))), np.zeros((len(hs_tr)))

        # Bug fix: '/' is true division in Python 3, so the chunk size was a
        # float; use floor division clamped to >= 1.
        h_chunks = utils.chunks(list(enumerate(hs_tr)), max(1, len(hs_tr) // n_jobs))
        params = [(ys, means_est, h_chunk, k_type) for h_chunk in h_chunks]
        results = utils.run_parallel(_calc_vector_mean_cl_parallel, params, n_jobs)
        # Each worker returns a sparse dict keyed by bandwidth index.
        for chunk_mean_cl in results:
            for h_ind in chunk_mean_cl.keys():
                mean_cl[h_ind] = chunk_mean_cl[h_ind]
        np.savez(output_fname, mean_cl=mean_cl, var_cl=var_cl, hs_tr=hs_tr, hs_ms=hs_ms)
示例#11
0
def calc_mean_var_cl(ys, fol, hs_tr, hs_s, labels_names, k_type='triangular', sim=False, overwrite=False,
                     specific_label='', n_jobs=1):
    """Compute per-label mean/variance confidence-limit statistics for every
    bandwidth (in parallel) and save them to an .npz in *fol*.

    Loads the previously estimated means/vars, parallelizes over all
    (bandwidth, label) pairs, and recomputes when the output file is missing,
    *overwrite* is set, or the cached hs_tr differ from the requested ones.
    """
    output_fname = op.join(fol, 'mean_var{}_cl_{}_{}-{}.npz'.format('_sim' if sim else '', k_type, hs_s[0], hs_s[-1]))
    if op.isfile(output_fname) and not overwrite:
        d = np.load(output_fname)
        if np.any(np.array(d['hs_tr']) != np.array(hs_tr)):
            overwrite = True
    if not op.isfile(output_fname) or overwrite:
        # Local typo fixed: 'intpu_fname' -> 'input_fname'.
        input_fname = op.join(fol, 'mean_var{}_{}_{}-{}.npz'.format('_sim' if sim else '', k_type, hs_s[0], hs_s[-1]))
        d = np.load(input_fname)
        # hs_tr is deliberately replaced by the bandwidths saved with the
        # estimates so the statistics stay aligned with them.
        means_est, vars_est, hs_tr, hs_ms = d['means'], d['vars'], d['hs_tr'], d['hs_ms']
        mean_cl, var_cl = np.zeros((ys.shape[0], len(hs_tr))), np.zeros((ys.shape[0], len(hs_tr)))
        # One work item per (bandwidth, label) pair.
        params_to_chunk = []
        for (h_tr_ind, h_tr), (label_ind, label_name) in product(enumerate(hs_tr), enumerate(labels_names)):
            params_to_chunk.append((h_tr_ind, h_tr, label_ind, label_name))
        # Bug fix: '/' is true division in Python 3, so the chunk size was a
        # float; use floor division clamped to >= 1.
        h_l_chunks = utils.chunks(params_to_chunk, max(1, len(params_to_chunk) // n_jobs))
        params = [(ys, means_est, vars_est, h_l_chunk, k_type, specific_label) for h_l_chunk in h_l_chunks]
        results = utils.run_parallel(_calc_mean_var_cl_parallel, params, n_jobs)
        # Workers return sparse dicts keyed by (bandwidth index, label index).
        for (chunk_mean_cl, chunk_var_cl) in results:
            for h_ind, label_ind in chunk_mean_cl.keys():
                mean_cl[label_ind, h_ind] = chunk_mean_cl[(h_ind, label_ind)]
                var_cl[label_ind, h_ind] = chunk_var_cl[(h_ind, label_ind)]
        np.savez(output_fname, mean_cl=mean_cl, var_cl=var_cl, hs_tr=hs_tr, hs_ms=hs_ms)
示例#12
0
def knot_hash(plain, keys):
    """Return the full knot hash of *plain* as a 32-character hex string.

    Runs 64 rounds of the knot-tying pass, XOR-folds the sparse hash in
    groups of 16, and hex-encodes the resulting dense hash.
    """
    sparse_hash = hash_round(plain, keys, rounds=64)
    dense_hash = []
    for block in chunks(sparse_hash, 16):
        folded = 0
        for value in block:
            folded ^= value
        dense_hash.append(folded)
    return ''.join("%0.2x" % x for x in dense_hash)
示例#13
0
    def inference(self,
                  input_samples,
                  max_len=30,
                  chunk_size=64,
                  num_return_sequences=1,
                  return_hidden_states=False,
                  **kwargs):
        """Run prefix-constrained generation over the samples, chunk by chunk.

        input_samples: [{'all_raw_queries':['sadfad','adfad'], ...}]

        Each sample dict is annotated in place with the generated text
        ('all_generations', 'top_output'), softmaxed scores ('scores'),
        raw encoder/decoder token ids, optional hidden states, and — when
        the sample carries an 'answer' key — its normalized target text and
        an exact-match flag ('EM'), which also updates self.EM_accuracy.
        Returns the (mutated) list of samples.
        """
        self.eval()
        with torch.no_grad():
            new_samples = []
            for chunk_samples in tqdm(list(chunks(input_samples, chunk_size)),
                                      desc="Inference"):
                # fusion_map holds, per sample, the (start, end) range of its
                # rows in the flattened text batch.
                flat_sample_text, fusion_map = self.samples_to_input(
                    chunk_samples)
                encoder_tok_obj = self.tokenizer(flat_sample_text,
                                                 return_tensors='pt',
                                                 padding=True)
                input_ids = encoder_tok_obj['input_ids'].to(self.device)
                attention_mask = encoder_tok_obj['attention_mask'].to(
                    self.device)

                encoder_outputs = self.encoder_forward(
                    fusion_map,
                    input_ids,
                    attention_mask,
                    return_hidden_states=return_hidden_states)
                # The fused per-sample attention mask is carried in the
                # 'attentions' slot of the custom encoder output.
                fused_attention_mask = encoder_outputs.attentions
                encoder_layer_states = encoder_outputs.hidden_states if return_hidden_states else [
                    None
                ] * len(chunk_samples)

                batch_target_text = [
                    self.sample_to_inference_target_text(s)
                    for s in chunk_samples
                ]
                decoder_tok_obj = self.decoder_tokenizer(
                    batch_target_text,
                    return_tensors='pt',
                    padding=True,
                    add_special_tokens=False)
                decoder_input_ids = decoder_tok_obj['input_ids'].to(
                    self.device)
                decoder_attention_mask = decoder_tok_obj['attention_mask'].to(
                    self.device)

                kwargs.update({
                    'encoder_outputs': encoder_outputs,
                    'decoder_attention_mask': decoder_attention_mask
                })

                # While still inside the tokenized target prefix, force the
                # next token to match it; afterwards allow the full
                # decoder vocabulary.
                def prefix_allowed_tokens_fn(batch_id, input_ids):
                    if input_ids.shape[0] < decoder_input_ids[batch_id].shape[
                            0]:
                        return decoder_input_ids[batch_id][
                            input_ids.shape[0]].tolist()
                    else:
                        return list(range(self.decoder_tokenizer.vocab_size))

                outputs = self.transformer.generate(
                    decoder_input_ids,
                    attention_mask=fused_attention_mask,
                    num_return_sequences=num_return_sequences,
                    num_beams=num_return_sequences,
                    max_length=max_len,
                    early_stopping=True,
                    output_hidden_states=True,
                    return_dict_in_generate=True,
                    output_scores=True,
                    prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
                    use_cache=False,
                    **kwargs)
                output_ids = outputs.sequences
                # sequences_scores only exists for beam search (>1 beams);
                # fall back to a flat score of 1.0 per sample otherwise.
                output_scores = outputs.sequences_scores if num_return_sequences > 1 else torch.tensor(
                    [1.0] * len(chunk_samples))
                output_text = [
                    self.tokenizer.decode(single_output_ids,
                                          skip_special_tokens=True)
                    for single_output_ids in output_ids
                ]
                # Regroup the flat generation output per input sample.
                output_chunks = list(chunks(output_text, num_return_sequences))
                output_score_chunks = list(
                    chunks(output_scores, num_return_sequences))

                # Move the batch axis first in the stacked last-step decoder
                # states — assumes a (layers, batch, seq, dim) stack, giving
                # (batch, seq, layers, dim); TODO confirm.
                decoder_layer_states = torch.einsum(
                    'ijkl->jkil', torch.stack(
                        outputs.decoder_hidden_states[-1])
                ) if return_hidden_states else [None] * len(chunk_samples)

                for i in range(len(chunk_samples)):
                    # Slice this sample's rows back out of the flat batch.
                    start, end = fusion_map[i]
                    chunk_samples[i]['encoder_input_ids'] = input_ids[
                        start:end]
                    chunk_samples[i]['decoder_input_ids'] = output_ids[i]
                    chunk_samples[i]['all_generations'] = output_chunks[i]
                    chunk_samples[i]['scores'] = output_score_chunks[
                        i].softmax(-1)
                    chunk_samples[i]['top_output'] = output_chunks[i][0]
                    chunk_samples[i][
                        'encoder_hidden_states'] = encoder_layer_states[i]
                    chunk_samples[i][
                        'decoder_hidden_states'] = decoder_layer_states[i]
                    if 'answer' in chunk_samples[i]:
                        # Round-trip the training target through the decoder
                        # tokenizer so the exact-match comparison sees
                        # identically normalized text on both sides.
                        target_text = self.sample_to_train_target_text(
                            chunk_samples[i])
                        target_text = self.decoder_tokenizer.decode(
                            self.decoder_tokenizer.encode(
                                target_text, add_special_tokens=False),
                            skip_special_tokens=True)
                        chunk_samples[i]['target_text'] = target_text
                        is_same = chunk_samples[i]['top_output'] == target_text
                        self.EM_accuracy(
                            torch.tensor([[not is_same,
                                           is_same]]).to(torch.float),
                            torch.tensor([1]))
                        chunk_samples[i]['EM'] = is_same

                new_samples += chunk_samples
            return new_samples