Example #1
def convert_pool_matrices(pool_input, word2ind):
    """Converts a dictionary of pooled captions/questions into matrices.

    Args:
        pool_input: Dictionary of pooled captions/questions
        word2ind: Dictionary of word -> vocabulary index conversion.

    Returns:
        item_tokens: Items in the pool tokenized and converted into a matrix.
        item_lens: Length of items in the matrix.
    """
    unk_token = word2ind["<unk>"]

    def tokenizer(x):
        return [word2ind.get(ii, unk_token) for ii in word_tokenize(x.lower())]

    if isinstance(pool_input, dict):
        pool_list = sorted(pool_input, key=lambda x: pool_input[x])
    else:
        pool_list = pool_input

    tokenized_items = [tokenizer(item) for item in progressbar(pool_list)]
    max_item_len = max(len(ii) for ii in tokenized_items)
    item_tokens = np.zeros(
        (len(tokenized_items), max_item_len)).astype("int32")
    item_tokens.fill(word2ind["<pad>"])
    item_lens = np.zeros(len(tokenized_items)).astype("int32")
    for item_id, tokens in progressbar(enumerate(tokenized_items)):
        item_lens[item_id] = len(tokens)
        item_tokens[item_id, :item_lens[item_id]] = np.array(tokens)
    return item_tokens, item_lens
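# Usage sketch (not part of the original example); relies on the same
# imports the function does: numpy as np, nltk's word_tokenize, and a
# tqdm-style progressbar.
demo_word2ind = {"<pad>": 0, "<unk>": 1, "a": 2, "cat": 3, "sits": 4}
demo_tokens, demo_lens = convert_pool_matrices(["A cat sits", "a cat"],
                                               demo_word2ind)
# demo_tokens -> (2, 3) int32 matrix, second row padded with 0 ("<pad>");
# demo_lens -> array([3, 2], dtype=int32)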
Example #2
def convert_pool_matrices_pretrained_tokenizer(pool_input,
                                               pretrained_tokenizer):
    """Converts a dictionary of pooled captions/questions into matrices.

    Args:
        pool_input: Dictionary of pooled captions/questions
        pretrained_tokenizer: Huggingface tokenizer for pretrained models.

    Returns:
        item_tokens: Items in the pool tokenized and converted into a matrix.
        item_lens: Length of items in the matrix.
    """
    def tokenizer(x):
        return pretrained_tokenizer.encode(x, add_special_tokens=True)

    if isinstance(pool_input, dict):
        pool_list = sorted(pool_input, key=lambda x: pool_input[x])
    else:
        pool_list = pool_input

    tokenized_items = [tokenizer(item) for item in progressbar(pool_list)]
    max_item_len = max(len(ii) for ii in tokenized_items)
    item_tokens = np.zeros(
        (len(tokenized_items), max_item_len)).astype("int32")
    item_tokens.fill(pretrained_tokenizer.pad_token_id)
    item_lens = np.zeros(len(tokenized_items)).astype("int32")
    for item_id, tokens in progressbar(enumerate(tokenized_items)):
        item_lens[item_id] = len(tokens)
        item_tokens[item_id, :item_lens[item_id]] = np.array(tokens)
    return item_tokens, item_lens
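# Usage sketch (not part of the original example); fetches a tokenizer
# from the Hugging Face `transformers` library.
from transformers import AutoTokenizer

demo_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
demo_tokens, demo_lens = convert_pool_matrices_pretrained_tokenizer(
    ["a cat sits", "two cats"], demo_tokenizer)
# Each row starts with [CLS] and ends with [SEP]; shorter rows are padded
# with demo_tokenizer.pad_token_id.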
Example #3
    def check_progress(self):
        """Helper metrics to check overall progression"""
        self._step_progressbar = None
        self._sub_progressbars = dict()
        if self._queues is not None:
            total_bar = len(self._succeeded_steps) + len(self._failed_steps)
            sub_bars = dict()
            self._step_progressbar = progressbar(total=self._num_steps, desc='__Total__',
                                                 initial=total_bar, postfix=None, position=0)
            for i, priority in enumerate(self._queues.keys()):
                if len(self._queues_labels) != 0:
                    label = 'Step::{}-{}'.format(self._queues_labels[priority], priority)
                else:
                    label = 'priority::{}'.format(str(priority + 1).zfill(3))
                if priority in self._succeeded_workers.keys():
                    sub_bar = len(self._succeeded_workers[priority])
                else:
                    sub_bar = 0
                self._sub_progressbars[priority] = progressbar(total=len(self._queues[priority]),
                                                               desc=label, initial=sub_bar, position=1+i)
                sub_bars[priority] = sub_bar

            def workon(n_finished_steps, n_sub_tasks):
                while n_finished_steps < self._num_steps:
                    cur_finished_steps = len(self._succeeded_steps) + len(self._failed_steps)
                    step_delta = cur_finished_steps - n_finished_steps
                    if step_delta > 0:
                        n_finished_steps += step_delta
                        self._step_progressbar.update(step_delta)
                    for p in self._queues.keys():
                        if p in self._succeeded_workers.keys():
                            sub_delta = len(self._succeeded_workers[p]) - n_sub_tasks[p]
                            if sub_delta > 0:
                                n_sub_tasks[p] += sub_delta
                                self._sub_progressbars[p].update(sub_delta)
                    time.sleep(0.2)
                self._step_progressbar.close()
                for p in self._queues.keys():
                    self._sub_progressbars[p].close()
            import threading
            thread = threading.Thread(target=workon, args=(total_bar, sub_bars))
            thread.daemon = True
            if notebook_env:
                display(self._step_progressbar)
                for p in self._queues.keys():
                    display(self._sub_progressbars[p])
                thread.start()
            else:
                thread.start()
        else:
            print('[No scheduled jobs]')
Example #4
    def random_search(self,
                      iterations,
                      config,
                      train_epochs,
                      transformations,
                      verbose=True,
                      result_path="../data/results.csv"):
        self._searching = True
        all_scores = []
        for _ in progressbar(range(iterations)):
            if self._kill:
                print("Will exit now because of signal!")
                break
            current_config = self._get_random_config(config)
            current_score = self.train_one_configuration(
                current_config, train_epochs, transformations)

            current_score.extend(
                [str(current_config),
                 str(hash(str(current_config)))])
            all_scores.append(current_score)
            if verbose:
                print(all_scores)
        self._save_results(all_scores, result_path)
        self._searching = False
        return all_scores
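# `_get_random_config` is not shown in this snippet. A minimal,
# hypothetical sketch, assuming `config` maps each hyperparameter name to
# a list of candidate values to sample from:
#
#     import random
#
#     def _get_random_config(self, config):
#         return {name: random.choice(values)
#                 for name, values in config.items()}
#
# e.g. config = {"lr": [1e-3, 1e-4], "batch_size": [32, 64]}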
Example #5
def evalRandomModelAllEigenvalues():
    SIZE = 10
    N_MODELS = 100
    N_STEPS = 500

    rmse = []
    n_evals_list = np.arange(SIZE, 0, -1, dtype=int)  # np.int was removed in modern NumPy

    for n_model in progressbar(range(N_MODELS)):
        # a = np.random.normal(0.0, 1.0, (SIZE, SIZE))
        a = np.diag(np.random.normal(0, 1, SIZE))
        rmse.append(evalModelAllEigenvalues(a, N_STEPS))

    mpl.style.use('seaborn')

    stats = [[np.abs(model[n_eval]) for n_eval in model] for model in rmse]

    # print(rmse)
    # print(np.abs(list(rmse.values())))

    mean_error = np.mean(np.mean(stats, axis=2), axis=0)
    std_error = np.mean(np.std(stats, axis=2), axis=0)

    fig, ax = plt.subplots()
    ax.plot(n_evals_list, mean_error)
    ax.fill_between(n_evals_list,
                    mean_error - std_error,
                    mean_error + std_error,
                    facecolor='#a9cce3')
    ax.set_xlabel("# Eigenvalues used")
    ax.set_ylabel("RMSE to original model")
    plt.show()
Example #6
    def fit(self, optim, loss_fn, data_loader, validation_data_loader,
            num_epochs, logger):
        best_loss = float("inf")
        for e in progressbar(range(num_epochs)):
            self._epoch = e

            iter_per_epoch = len(data_loader)
            data_iter = iter(data_loader)
            for i in range(iter_per_epoch):
                inputs, labels = self._get_inputs(data_iter)

                predictions, classes = self.predict(inputs,
                                                    return_classes=True)

                optim.zero_grad()
                loss = loss_fn(predictions, labels)
                loss.backward()
                optim.step()

                self._accumulate_results(
                    self.to_np(labels).squeeze(),
                    classes,
                    loss=loss.item(),  # loss.data[0] fails on PyTorch >= 0.5
                    probs=self.to_np(predictions).squeeze())
            stats = self.evaluate(logger,
                                  validation_data_loader,
                                  loss_fn,
                                  switch_to_eval=True)
            is_best = stats["val_loss"] < best_loss
            best_loss = min(best_loss, stats["val_loss"])
            model_path = ProjectConfig.combine(
                ProjectConfig.model_directory, "%s_%s_fold_%s.mdl" %
                (self.model_name, str(e + 1), self.fold_number))
            self.save(model_path, optim, is_best, scores=stats)
        return best_loss
Example #7
def get_neuron_ordering_granular(model,
                                 class_to_idx,
                                 granularity=50,
                                 search_stride=100):
    weights = list(model.parameters())[0].data.cpu()
    num_neurons = weights.numpy().shape[1]
    neuron_orderings = [
        get_top_neurons(model, p / search_stride, class_to_idx)[0]
        for p in progressbar(range(search_stride + 1))
    ]

    sliding_idx = 0
    considered_neurons = set()
    ordering = []
    cutoffs = []
    for i in range(0, num_neurons + 1, granularity):
        while len(neuron_orderings[sliding_idx]) < i:
            sliding_idx = sliding_idx + 1
        new_neurons = set(
            neuron_orderings[sliding_idx]).difference(considered_neurons)
        if len(new_neurons) != 0:
            ordering = ordering + list(new_neurons)
            considered_neurons = considered_neurons.union(new_neurons)

            cutoffs.append(len(ordering))

    return ordering, cutoffs
Example #8
def save_mean_std_image(FLAGS):
  """Compute and save mean and std image from train images.

  Args:
    FLAGS: Commandline arguments
  """

  image_list = os.listdir(os.path.join(FLAGS.image_root, 'train'))

  # compute the mean of the train images and save
  mean_img = None
  std_img = None
  for image_name in progressbar(image_list):
    image_path = os.path.join(FLAGS.image_root, 'train', image_name)
    image = support.load_image(image_path)

    if mean_img is None:
      mean_img = image
      std_img = image ** 2
    else:
      mean_img += image
      std_img += image ** 2

  mean_img = mean_img / len(image_list)
  std_img = std_img / len(image_list)

  mean_img = np.mean(np.mean(mean_img, 0), 0)
  std_img = np.mean(np.mean(std_img, 0), 0)
  std_img = np.sqrt(std_img - mean_img ** 2)

  print('Saving mean and std at: %s' % FLAGS.mean_save_path)
  np.save(FLAGS.mean_save_path, {'mean_img': mean_img, 'std_img': std_img})
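# Reading the saved statistics back (a sketch, not part of the original
# snippet). Since np.save stored a dict, allow_pickle=True and the [()]
# unwrapping are needed on load:
#
#     stats = np.load(FLAGS.mean_save_path, allow_pickle=True)[()]
#     mean_img, std_img = stats['mean_img'], stats['std_img']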
Example #9
def map_embarrassingly_parallel(input_list, mapper, project, n_jobs=-1, batch_size=-1,
                                checkpoint=False, cleanup=True, **kwargs):
    """
    Process items in a list in parallel (optionally, one smaller batch at a time).

    Args:
        input_list: An input object that has a list-like interface (indexing and slicing).
        mapper: A function to apply to each item of the input list.
        project: An instance of pygoose project.
        n_jobs: The number of parallel processing jobs. -1 will use the number of CPUs on the system.
        batch_size: The maximum number of input items in each batch. -1 will store all data as a single batch.
        checkpoint: Whether to save each batch and its corresponding output to disk.
        cleanup: Whether to remove the batch checkpoints from the disk after all batches are processed.
        **kwargs: Additional keyword arguments to joblib.Parallel.

    Returns:
        A list representing the combined output from the mapper function called on all input items.
    """

    if batch_size < 0:
        batch_size = len(input_list)

    # Partition the data.
    job_id = _create_job_id()
    print('Creating job ID:', job_id)

    batch_storage_dir = os.path.join(project.temp_dir, job_id)
    batches = split_into_batches(input_list, batch_size, batch_storage_dir, checkpoint)

    # The results will be collected here.
    # TODO: collecting lists like this may be memory inefficient. Perhaps we could use another callback function.
    combined_results = []

    # Process data one batch at a time.
    for batch in batches:
        description = 'Batch {}/{}'.format(batch['index'] + 1, len(batches))

        # Process each item in the batch in parallel.
        batch_result = Parallel(n_jobs=n_jobs, **kwargs)(
            delayed(mapper)(input_item)
            for input_item in progressbar(
                batch['data'],
                desc=description,
                total=len(batch['data']),
                file=sys.stdout,
            )
        )
        if checkpoint:
            save(batch_result, batch['result_filename'])

        combined_results.extend(batch_result)

    # Remove the temporary files.
    if checkpoint and cleanup:
        shutil.rmtree(batch_storage_dir)

    return combined_results
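# Usage sketch (not part of the original snippet): `project` is a pygoose
# project instance; kg.Project.discover() is the usual way to obtain one,
# though that detail is an assumption here.
#
#     from pygoose import kg
#     project = kg.Project.discover()
#
#     def square(x):
#         return x * x
#
#     squares = map_embarrassingly_parallel(
#         list(range(10000)), square, project, n_jobs=4, batch_size=1000)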
Example #10
def _test_set():
    ds = IcebergDataset("../data/orig/test.json", im_dir="../data/vis/test", inference_only=True,
                        mu_sigma=None, colormap="inferno", add_feature_planes="complex")
    for i in progressbar(range(len(ds))):
        # print(i, ds[i]["inputs"].size(), ds[i]["id"])
        ds.vis(i, average=False, prefix="pure_")
        if i == 3:
            break

    loader = DataLoader(ds, batch_size=6, shuffle=False, num_workers=1)
    for i, batch in enumerate(loader):
        print(i, batch["inputs"].size(), batch["id"])
        if i == 3:
            break
Example #11
def get_neuron_ordering(model, class_to_idx, search_stride=100):
    neuron_orderings = [
        get_top_neurons(model, p / search_stride, class_to_idx)[0]
        for p in progressbar(range(search_stride + 1))
    ]

    considered_neurons = set()
    ordering = []
    cutoffs = []
    for local_ordering in neuron_orderings:
        local_ordering = list(local_ordering)
        new_neurons = set(local_ordering).difference(considered_neurons)
        ordering = ordering + list(new_neurons)
        considered_neurons = considered_neurons.union(new_neurons)

        cutoffs.append(len(ordering))

    return ordering, cutoffs
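# `ordering` ranks neurons from most to least salient, and each entry of
# `cutoffs` marks a meaningful prefix length to slice at. A usage sketch
# (not part of the original code):
#
#     ordering, cutoffs = get_neuron_ordering(model, class_to_idx)
#     top_neurons = ordering[:cutoffs[0]]  # smallest salient set found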
Example #12
def map_batch_parallel(input_list, batch_size, item_mapper=None, batch_mapper=None, flatten=True, n_jobs=-1, **kwargs):
    """
    Split the data into batches and process each batch in its own thread.

    Args:
        input_list: An input object that has a list-like interface (indexing and slicing).
        item_mapper: (optional) A function to apply to each item in the batch.
        batch_mapper: (optional) A function to apply to each batch. Either item_mapper or batch_mapper must be set.
        flatten: Whether to unwrap individual batch results or keep them grouped by batch.
        n_jobs: The number of parallel processing jobs. -1 will use the number of CPUs on the system.
        batch_size: The maximum number of input items in each batch. -1 will store all data as a single batch.
        **kwargs: Additional keyword arguments to joblib.Parallel.

    Returns:
        A list representing the combined output from the mapper function called on all input items of each batch.
    """

    # We must specify either how to process each batch or how to process each item.
    if item_mapper is None and batch_mapper is None:
        raise ValueError('You should specify either batch_mapper or item_mapper.')

    if batch_mapper is None:
        batch_mapper = _default_batch_mapper

    batches = split_into_batches(input_list, batch_size, batch_storage_dir='')
    all_batch_results = Parallel(n_jobs=n_jobs, **kwargs)(
        delayed(batch_mapper)(batch['data'], item_mapper)
        for batch in progressbar(
            batches,
            desc='Batches',
            total=len(batches),
            file=sys.stdout,
        )
    )

    # Unwrap the individual batch results if necessary.
    if flatten:
        final_result = []
        for batch_result in all_batch_results:
            final_result.extend(batch_result)
    else:
        final_result = all_batch_results

    return final_result
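# Usage sketch (not part of the original snippet): split a list of
# strings into batches of two and tokenize the items in parallel.
#
#     results = map_batch_parallel(
#         ["first text", "second text", "third text"],
#         batch_size=2,
#         item_mapper=str.split,
#         n_jobs=2)
#
# With flatten=True (the default) the result is one flat list of token
# lists, in input order.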
Example #13
def extract_from_indices_file():
    base_output_dir = (
        "/mnt/data/tiny_images/py-tiny-image-access/loaded_images/cifar100"
    )
    # the first 50'000 indices are for training
    train_dir = base_output_dir + "/train"
    # the last 10'000 indices are for testing
    test_dir = base_output_dir + "/test"

    tinyimage.openTinyImage()
    cifar100_indices = get_indices()
    for i, index in enumerate(progressbar(cifar100_indices)):
        if i < 50000:
            output_dir = train_dir
        else:
            output_dir = test_dir
        meta = tinyimage.getMetaData(index)
        tinyimage.sliceToImage(tinyimage.sliceToBin(index), output_dir + "/" + meta[1])
    tinyimage.closeTinyImage()
Example #14
def main(args):
    # reading data
    print('Reading from: ' + args.data_file)
    with open(args.data_file, 'r') as file_id:
        data = json.load(file_id)

    # open a text file to write the questions
    save_path = args.data_file.replace('.json', '_ques_flat.txt')
    print('Saving to: ' + save_path)
    with open(save_path, 'w') as file_id:
        for ques in progressbar(data['data']['questions']):
            file_id.write(clean_non_ascii(ques) + ' ?\n')

    # open a text file to write the captions
    save_path = args.data_file.replace('.json', '_cap_flat.txt')
    print('Saving to: ' + save_path)
    with open(save_path, 'w') as file_id:
        captions = [ii['caption'] for ii in data['data']['dialogs']]

        for cap in captions:
            file_id.write(clean_non_ascii(cap) + ' .\n')
Example #15
def save_vocabularies(train_examples, FLAGS):
  """Extract and save vocabularies for questions and answers.

  Args:
    train_examples: Training examples
    FLAGS: Commandline arguments (provide vocab_save_path and
      answers_save_path)

  Returns:
    words: Vocabulary (dictionary) extracted from the questions
    ans_list: List of possible answers, extracted from train set
  """

  words = {}
  ans_list = {}
  for datum in progressbar(train_examples):
    for ques_datum in datum['qa']:
      token = ques_datum['answer'].lower()
      words[token] = words.get(token, 0) + 1
      ans_list[token] = 1

      for token in word_tokenize(ques_datum['question']):
        token = token.lower()
        words[token] = words.get(token, 0) + 1

  # additional tokens
  words['<pad>'] = 1
  words['<start>'] = 1
  words['<end>'] = 1
  words['<unk>'] = 1

  print('Saving to: ' + FLAGS.vocab_save_path)
  with open(FLAGS.vocab_save_path, 'w') as file_id:
    file_id.write('\n'.join(sorted(words.keys())))

  # answer lists
  ans_list = list(ans_list.keys())
  ans_list.append('<unk>')
  print('Saving to: ' + FLAGS.answers_save_path)
  with open(FLAGS.answers_save_path, 'w') as file_id:
    file_id.write('\n'.join(ans_list))
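# Reading the saved vocabulary back into the word -> index mapping that
# snippets like convert_pool_matrices above expect (a sketch, not part of
# the original code):
#
#     with open(FLAGS.vocab_save_path) as file_id:
#         word2ind = {word: index for index, word
#                     in enumerate(file_id.read().splitlines())}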
Example #16
    def train_set():
        t1 = ToTensor()
        t2 = transforms.Compose([Flip(axis=2), ToTensor()])
        t3 = transforms.Compose([Flip(axis=1), ToTensor()])
        t4 = transforms.Compose([Flip(axis=2), Flip(axis=1), ToTensor()])
        t5 = transforms.Compose([Flip(axis=1), Flip(axis=2), ToTensor()])
        t6 = transforms.Compose([Rotate(90), ToTensor()])

        ds1 = IcebergDataset("../data/all.npy", transform=None, im_dir="../data/vis/train",
                             colormap="inferno", add_feature_planes="complex")
        for i in progressbar(range(len(ds1))):
            sample = ds1[i]
            ds1.vis(i, average=False, prefix="")
            # print(i, sample['inputs'].size(), sample['targets'].size(), sample["targets"].numpy()[0])
            # if i == 10:
            #     break

        dataloader = DataLoader(ds1, batch_size=4, shuffle=True, num_workers=1, pin_memory=True)
        for i_batch, sample_batched in enumerate(dataloader):
            print(i_batch, sample_batched['inputs'].size(), sample_batched['targets'].size())
            if i_batch == 3:
                break
Example #17
    def fit(self, optim, loss_fn, data_loader, validation_data_loader,
            num_epochs, logger):
        best_loss = float("inf")
        start_point = random.randint(0, 32)
        for e in progressbar(range(num_epochs)):
            self._epoch = e
            iter_per_epoch = len(data_loader)
            data_iter = iter(data_loader)
            inputs, targets, predictions = None, None, None
            for i in range(iter_per_epoch):
                inputs, targets = self._get_inputs(data_iter)

                predictions, mu, logvar = self.predict(targets)

                optim.zero_grad()
                loss = loss_fn(predictions, targets, mu, logvar)
                loss.backward()
                optim.step()

                self._accumulate_results(None, None, loss=loss.item())
            self._log_images(inputs,
                             targets,
                             predictions,
                             logger,
                             start=start_point,
                             prefix="train_",
                             reshape=(2, 75, 75))
            stats = self.evaluate(logger,
                                  validation_data_loader,
                                  loss_fn,
                                  switch_to_eval=True)
            is_best = stats["val_loss"] < best_loss
            best_loss = min(best_loss, stats["val_loss"])
            model_path = ProjectConfig.combine(
                ProjectConfig.model_directory, "%s_%s_fold_%s.mdl" %
                (self.model_name, str(e + 1), self.fold_number))
            self.save(model_path, optim, is_best, scores=stats)
        return best_loss
Example #18
def infer(path, num_folds, average=True):
    ds = IcebergDataset(path,
                        inference_only=True,
                        transform=ToTensor(),
                        add_feature_planes="no")
    loader = DataLoader(ds, 64)
    predictions = defaultdict(list)

    for fold in range(num_folds):
        model = LeNet.restore("../models/LeNet_78_fold_None.mdl")
        if torch.cuda.is_available():
            model.cuda()
        iterator = iter(loader)
        iter_per_epoch = len(loader)
        for _ in progressbar(range(iter_per_epoch)):
            next_batch = next(iterator)
            inputs_tensor, ids = next_batch["inputs"], next_batch["id"]
            inputs = model.to_var(inputs_tensor)
            probs, _ = model.predict(inputs, return_classes=False)
            probs = model.to_np(probs).squeeze()
            probs = probs.tolist()
            chunk = dict(zip(ids, probs))
            for k, v in chunk.items():
                predictions[k].append(v)
    if average:
        result = {k: sum(v) / len(v) for k, v in predictions.items()}
    else:
        result = {}
        for k, v in predictions.items():
            prob = np.mean(np.array(v))
            if prob <= 0.1:
                prob = 0
            elif prob >= 0.9:
                prob = 1
            result[k] = prob
    return result
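# A sketch of writing the averaged fold predictions to a Kaggle-style
# submission file (not part of the original snippet; the column names and
# fold count are assumptions):
#
#     import csv
#     result = infer("../data/orig/test.json", num_folds=5)
#     with open("../data/submission.csv", "w", newline="") as f:
#         writer = csv.writer(f)
#         writer.writerow(["id", "is_iceberg"])
#         writer.writerows(sorted(result.items()))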
Example #19
    def hyperscreen(self, softening=1.0):
        """Screen out bad events by applying Otsu's method to the
        tap-by-tap boomerang histograms on the U and V axes.

        Returns:
            dict -- HyperScreen results, including per-tap and overall
            survival indices, boolean survival/failure masks, and
            rejection percentages compared with the legacy hyperbola test.
        """

        data = self.data[self.data['Hyperbola test passed']]

        # taprange = range(data['crsu'].min(), data['crsu'].max() + 1)
        taprange_u = range(data['crsu'].min() - 1, data['crsu'].max() + 1)
        taprange_v = range(data['crsv'].min() - 1, data['crsv'].max() + 1)

        if self.numevents < 100000:
            bins = [50, 50]  # number of bins
        else:
            bins = [200, 200]

        # Instantiate these empty dictionaries to hold our results
        u_axis_survivals = {}
        v_axis_survivals = {}

        progressbar_disable = not self.verbose

        if self.verbose is True:
            print(
                colorama.Fore.YELLOW +
                "\nApplying Otsu's Method to every Tap-specific boomerang across U-axis taps {} through {}"
                .format(taprange_u[0] + 1, taprange_u[-1] + 1))

        skiptaps_u = []
        skiptaps_v = []

        for tap in progressbar(taprange_u,
                               disable=progressbar_disable,
                               ascii=False):
            # Do the U axis
            tapmask_u = data[data['crsu'] == tap].index.values
            if len(tapmask_u) < 20:
                skiptaps_u.append((tap + 1, len(tapmask_u)))
                continue
            keep_u = np.isfinite(data['fb_u'][tapmask_u])

            hist_u, xbounds_u, ybounds_u = np.histogram2d(
                data['fb_u'][tapmask_u][keep_u],
                data['fp_u'][tapmask_u][keep_u],
                bins=bins)
            thresh_hist_u = self.threshold(hist_u,
                                           bins=bins,
                                           softening=softening)

            posx_u = np.digitize(data['fb_u'][tapmask_u], xbounds_u)
            posy_u = np.digitize(data['fp_u'][tapmask_u], ybounds_u)
            hist_mask_u = (posx_u > 0) & (posx_u <= bins[0]) & (
                posy_u > -1) & (posy_u <= bins[1])

            # Values of the histogram where the points are
            hhsub_u = thresh_hist_u[posx_u[hist_mask_u] - 1,
                                    posy_u[hist_mask_u] - 1]
            pass_fb_u = data['fb_u'][tapmask_u][hist_mask_u][np.isfinite(
                hhsub_u)]

            u_axis_survivals["U Axis Tap {:02d}".format(
                tap)] = pass_fb_u.index.values

        if self.verbose is True:
            print(
                "\nThe following {} U-axis taps were skipped due to a (very) low number of counts: "
                .format(len(skiptaps_u)))
            for skipped_tap in skiptaps_u:
                tapnum, counts = skipped_tap
                print("Skipped U-axis Tap {}, which had {} count(s)".format(
                    tapnum, counts))
            print(colorama.Fore.MAGENTA +
                  "\n... doing the same for the V axis taps {} through {}".
                  format(taprange_v[0] + 1, taprange_v[-1] + 1))

        for tap in progressbar(taprange_v,
                               disable=progressbar_disable,
                               ascii=False):
            # Now do the V axis:
            tapmask_v = data[data['crsv'] == tap].index.values
            if len(tapmask_v) < 20:
                skiptaps_v.append((tap + 1, len(tapmask_v)))
                continue
            keep_v = np.isfinite(data['fb_v'][tapmask_v])

            hist_v, xbounds_v, ybounds_v = np.histogram2d(
                data['fb_v'][tapmask_v][keep_v],
                data['fp_v'][tapmask_v][keep_v],
                bins=bins)
            thresh_hist_v = self.threshold(hist_v,
                                           bins=bins,
                                           softening=softening)

            posx_v = np.digitize(data['fb_v'][tapmask_v], xbounds_v)
            posy_v = np.digitize(data['fp_v'][tapmask_v], ybounds_v)
            hist_mask_v = (posx_v > 0) & (posx_v <= bins[0]) & (
                posy_v > -1) & (posy_v <= bins[1])

            # Values of the histogram where the points are
            hhsub_v = thresh_hist_v[posx_v[hist_mask_v] - 1,
                                    posy_v[hist_mask_v] - 1]
            pass_fb_v = data['fb_v'][tapmask_v][hist_mask_v][np.isfinite(
                hhsub_v)]

            v_axis_survivals["V Axis Tap {:02d}".format(
                tap)] = pass_fb_v.index.values

        if self.verbose is True:
            print(
                "\nThe following {} V-axis taps were skipped due to a (very) low number of counts: "
                .format(len(skiptaps_v)))
            for skipped_tap in skiptaps_v:
                tapnum, counts = skipped_tap
                print("Skipped V-axis Tap {}, which had {} count(s)".format(
                    tapnum, counts))

        # Done looping over taps

        if self.verbose is True:
            print(
                colorama.Fore.BLUE +
                "\nCollecting events that pass both U- and V-axis HyperScreen tests...",
                end=" ")

        u_all_survivals = np.concatenate(
            [x for x in u_axis_survivals.values()])
        v_all_survivals = np.concatenate(
            [x for x in v_axis_survivals.values()])

        # If the event passes both U- and V-axis tests, it survives
        all_survivals = np.intersect1d(u_all_survivals, v_all_survivals)
        survival_mask = np.isin(self.data.index.values, all_survivals)
        failure_mask = np.logical_not(survival_mask)

        num_survivals = sum(survival_mask)
        num_failures = sum(failure_mask)

        percent_hyperscreen_rejected = round(
            ((num_failures / self.numevents) * 100), 2)

        # Do a sanity check to look for lost events. Shouldn't be any.
        if num_survivals + num_failures != self.numevents:
            print("WARNING: Total Number of survivals and failures does \
            not equal total events in the EVT1 file. Something is wrong!")

        legacy_hyperbola_test_failures = sum(
            self.data['Hyperbola test failed'])
        percent_legacy_hyperbola_test_rejected = round(
            ((legacy_hyperbola_test_failures / self.numevents) * 100), 2)

        percent_improvement_over_legacy_test = round(
            (percent_hyperscreen_rejected -
             percent_legacy_hyperbola_test_rejected), 2)

        if self.verbose is True:
            print("Done")
            print(colorama.Fore.GREEN + "HyperScreen rejected" +
                  colorama.Fore.YELLOW +
                  " {}% of all events ({:,} bad events / {:,} total events)".
                  format(percent_hyperscreen_rejected, sum(failure_mask),
                         self.numevents) + colorama.Fore.GREEN +
                  "\nThe Murray+ algorithm rejects" + colorama.Fore.MAGENTA +
                  " {}% of all events ({:,} bad events / {:,} total events)".
                  format(percent_legacy_hyperbola_test_rejected,
                         legacy_hyperbola_test_failures, self.numevents))

            print(
                colorama.Fore.GREEN +
                "As long as the results pass sanity checks, this is a POTENTIAL improvement of \n"
                + colorama.Fore.BLUE +
                "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ POTENTIAL Improvement ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"
                + colorama.Fore.WHITE +
                "                                      {}%\n".format(
                    percent_improvement_over_legacy_test) +
                colorama.Fore.BLUE +
                "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"
            )

        hyperscreen_results_dict = {
            "ObsID": self.obsid,
            "Target": self.target,
            "Exposure Time": self.exptime,
            "Detector": self.detector,
            "Number of Events": self.numevents,
            "Number of Good Time Events": self.goodtimeevents,
            "U Axis Survivals by Tap": u_axis_survivals,
            "V Axis Survivals by Tap": v_axis_survivals,
            "U Axis All Survivals": u_all_survivals,
            "V Axis All Survivals": v_all_survivals,
            "All Survivals (event indices)": all_survivals,
            "All Survivals (boolean mask)": survival_mask,
            "All Failures (boolean mask)": failure_mask,
            "Percent rejected by Tapscreen": percent_hyperscreen_rejected,
            "Percent rejected by Hyperbola":
            percent_legacy_hyperbola_test_rejected,
            "Percent improvement": percent_improvement_over_legacy_test
        }

        return hyperscreen_results_dict
Example #20
def get_labels():
	labels = []
	# NOTE: the opening lines of this snippet were cut off; the function
	# header and label file path are reconstructions (path assumed by
	# analogy with get_indices() below).
	with open('./labels_cifar100', 'r') as f:
		for label in f:
			labels.append(label.rstrip())
	return labels


def get_indices():
	indices = []
	with open('./indices_cifar100','r') as f:
		for index in f:
			indices.append(int(index.rstrip()))
	return indices


if __name__ == "__main__":
	keywords = get_labels()
	tinyimage.openTinyImage()
	images = []
	ignore_indices = get_indices()
	pick = len(ignore_indices)
	for keyword in progressbar(keywords):
		indexes = tinyimage.retrieveByTerm(keyword)
		for i in indexes:
			if i not in ignore_indices:
				image = tinyimage.sliceToBin(i).reshape(32,32,3, order="F").astype('float32') / 255.
				images.append(image)
	relevant = np.array(images)
	np.random.shuffle(relevant)
	relevant = relevant[:pick]
	np.save("relevant_images",relevant)
	tinyimage.closeTinyImage()
Example #21
  def __init__(self, imdb, params):
    """Initialize by reading the data and pre-processing it.
    """

    self.imdb = imdb
    self.params = params
    self.num_inst = len(self.imdb['data'])
    self.num_rounds = len(self.imdb['data'][0]['question_ind'])

    # load vocabulary
    vocab_path = params['text_vocab_path']
    self.vocab_dict = text_processing.VocabDict(vocab_path)
    self.T_encoder = params['max_enc_len']

    # record special token ids
    self.start_token_id = self.vocab_dict.word2idx('<start>')
    self.end_token_id = self.vocab_dict.word2idx('<end>')
    self.pad_token_id = self.vocab_dict.word2idx('<pad>')
    # Load answers
    with open(params['args']['answer_list_path'], 'r') as file_id:
      choices = [ii.strip('\n') for ii in file_id.readlines()]
      self.num_choices = len(choices)
      self.choices2ind = {ii: index for index, ii in enumerate(choices)}
      self.ind2choices = {index: ii for index, ii in enumerate(choices)}

    # peek one example to see whether answer and gt_layout are in the data
    test_data = self.imdb['data'][0]
    self.load_gt_layout = test_data.get('gt_layout_tokens', False)
    if 'load_gt_layout' in params:
      self.load_gt_layout = params['load_gt_layout']

    if self.load_gt_layout:
      self.T_decoder = params['max_dec_len']
      self.assembler = params['assembler']

    # load the mean of the images
    load_path = params['path'].split('/')[:-1] + ['train_image_mean.npy']
    load_path = '/'.join(load_path)
    print('Loading training image stats from: ' + load_path)
    img_stats = np.load(load_path, allow_pickle=True)[()]  # file holds a pickled dict
    mean_img = img_stats['mean_img'].reshape([1, 1, -1])
    std_img = img_stats['std_img'].reshape([1, 1, -1])

    # read all the images
    images = {}
    print('Reading images..')
    #TODO: Change this back!
    for datum in progressbar(self.imdb['data'][::3]):
      img_path = datum['image_path']

      if img_path not in images:
        cur_img = support.load_image(img_path)
        cur_img = (cur_img - mean_img) / std_img
        images[img_path] = cur_img

    self.images = images

    # get the shape from random image
    for _, sample in self.images.items():
      self.img_size = sample.shape
      break

    # convert to tokens
    self.digitizer = lambda x: [self.vocab_dict.word2idx(w) for w in x]

    # use history if needed by the program generator
    self.use_history = self.params['generator'] == 'mem'
    if self.use_history:
      self._construct_history()

    # if fact is to be used
    if self.params['use_fact']:
      self._construct_fact()
Example #22
            image = self._add_planes(image)
        elif self.add_feature_planes == "simple":
            image = self._get_simple_planes(image)
        noise_factor = 0.4
        planes = [image[i, :, :] for i in range(image.shape[0])]
        stats = [self.get_image_stat(i) for i in planes]
        masks = [np.random.binomial(1, 1 - noise_factor, i.shape) for i in planes]
        noise = [masks[i] * np.random.normal(loc=stats[i][0], scale=stats[i][1],
                                             size=masks[i].shape) for i in range(len(planes))]
        noisy = [planes[i] + noise[i] for i in range(len(planes))]
        noisy = np.stack(noisy, axis=0)
        item = {"inputs": noisy, "targets": image}
        if self.transform:
            item = self.transform(item)
        return item

    def vis(self, idx, average=False, prefix=""):
        base_dir = self.im_dir or "./"
        image1 = self[idx]["inputs"]
        image2 = self[idx]["targets"]
        self._vis_image(idx, image2, average, base_dir, prefix)
        self._vis_image(idx, image1, average, base_dir, "noise_" + prefix)


if __name__ == "__main__":
    ds1 = AutoEncoderDataset("../data/folds/test_0.npy", transform=None, im_dir="../data/vis/test",
                             colormap="inferno", add_feature_planes="no")
    for i in progressbar(range(len(ds1))):
        sample = ds1[i]
        ds1.vis(i, average=True, prefix="")
Example #23
hashes = json.loads(open(hash_file).read())

#######################
with open("word_embeddings/word2num.json", "r") as f:
    word2num = json.load(f)
    word2num = {w: i for i, w in enumerate(word2num)}

#######################

shuffle(examples)
batch_size = 1024
dir_count = -1
res_dir = None
data = []

for ex in progressbar(examples):
    if res_dir is None or len(data) >= batch_size:
        if res_dir is not None:
            with open(os.path.join(res_dir, "data.json"), "w") as f:
                json.dump(data, f)

        dir_count += 1
        res_dir = os.path.join(DDIR, str(dir_count))
        data = []

        if not os.path.isdir(res_dir):
            os.mkdir(res_dir)

    ID = ex["identifier"].split("-")[:3]
    ID = "-".join(ID)
    if ID + "-img0.png" not in id2path:
Example #24
        mu1, sigma1, med1, maximum_1, minimum_1, percentile75_1 = IcebergDataset.get_image_stat(image[0, :, :])
        mu2, sigma2, med2, maximum_2, minimum_2, percentile75_2 = IcebergDataset.get_image_stat(image[1, :, :])
        result.append((mu1, sigma1, med1, maximum_1, minimum_1, percentile75_1, mu2, sigma2,
                       med2, maximum_2, minimum_2, percentile75_2, angle[0], label[0]))
    new_frame = pd.DataFrame(result, columns=["mu1", "sigma1", "med1", "max1", "min1", "per75_1",
                                              "mu2", "sigma2", "med2", "max2", "min2", "per75_2", "angle", "label"])
    new_frame.to_csv("../data/stats.csv", index=False)
    print()


if __name__ == "__main__":
    data = IcebergDataset("../data/orig/test.json", mu_sigma=None, inference_only=True,
                          colormap="inferno", im_dir="../data/vis/test/cluster_1")
    X = np.array([i["inputs"].ravel() for i in data])
    # get_best_clusters(X)
    clusterer = KMeans(n_clusters=2, random_state=10)
    cluster_labels = clusterer.fit_predict(X)

    positives = []
    for i in progressbar(range(len(data))):
        if cluster_labels[i] == 1:
            # sample = data[i]
            # positives.append(sample["targets"][0])
            data.vis(i, prefix="C1_")

    print("Len", len(positives))
    print("Positives", sum(positives))

    # inspect_angle()
    print("Finished!")
Example #25
    def evaluate(self, dataloader, dtype):
        # network in evaluation mode
        self.eval()
        gtRanks = []
        numInst = dataloader.numInst[dtype]

        # save all scores and gtLabels
        scores = []
        gtLabels = []
        imageIds = []

        # Get gt scores for all options
        for startId in progressbar(range(0, numInst, self.batchSize)):
            # Obtain test batch, argument set and GT members
            batch = dataloader.getTestBatch(startId, dtype)
            batchSize = batch['set'].size(0)

            # Extract set, positive
            setEmbed = bottle(self.wordTransform, Variable(batch['set']))
            # if set is empty, reset to zero
            if self.setSize == 0: setEmbed.data.fill_(0.0)
            setEmbed, _ = self.selfatt(setEmbed, setEmbed, setEmbed)
            setEmbed = self.pooler(setEmbed, 1)
            if type(setEmbed).__name__ == 'tuple': setEmbed = setEmbed[0]
            setEmbed = setEmbed.squeeze()

            # If image exists
            if 'image' in batch:
                imgEmbed = self.imgTransform(Variable(batch['image']))
                setEmbed = torch.cat((setEmbed, imgEmbed), 1)

            # current batch scores
            batchScores = torch.FloatTensor(batchSize, self.vocabSize)

            # Get the scores for all possible options
            for ii in range(0, self.vocabSize, self.batchSize):
                end = min(ii + self.batchSize, self.vocabSize)

                # Interact gt and set to get score
                argInds = torch.arange(ii, end).long().unsqueeze(0)
                if self.useGPU: argInds = argInds.cuda()
                argInds = argInds.repeat(batchSize, 1)
                argEmbed = bottle(self.wordTransform, Variable(argInds))
                argScore = self.scoreInstanceSet(argEmbed, setEmbed)
                # save scores for this batch
                batchScores[:, ii:end] = argScore.data.float().squeeze()

            # Assign the set least possible score (-Inf) to set elements
            rangeInds = torch.arange(0, batchSize).long()
            for ii in range(self.evalSize):
                # satwik: edits for new pytorch
                scatInds = torch.stack((rangeInds, batch['set'][:, ii].cpu()),
                                       1)
                batchScores.scatter_(1, scatInds, float('-inf'))

            # Convert to numpy array
            batchScores = batchScores.numpy()
            # rank data is ascending, need descending
            batchRanks = np.apply_along_axis(rankdata, 1, -1 * batchScores)
            # save the batch scores
            scores.append(batchScores)

            # Assign the ranks
            gtLabels.extend(batch['pos'])
            if 'imageId' in batch: imageIds.extend(batch['imageId'])
            for ii in range(batchSize):
                gtRank = [batchRanks[ii, jj] for jj in batch['pos'][ii]]
                gtRanks.append(gtRank)

        # Compute rank statistics
        metrics = computeRankStats(np.concatenate(gtRanks))
        # network in training mode
        self.train()

        return metrics, np.concatenate(scores), {'gtLabels': gtLabels, \
                                                'imageId': imageIds}
Example #26
def evaluate_agent(wizard, val_loader, args):
    """Evaluate a SIMMC agent given a dataloader.

    Args:
        wizard: SIMMC model
        dataloader: Dataloader to use to run the model on
        args: Arguments for evaluation
    """
    total_iters = int(val_loader.num_instances / args["batch_size"])
    # Turn autograd off for evaluation -- light-weight and faster.
    with torch.no_grad():
        wizard.eval()
        matches = []
        for batch in progressbar(val_loader.get_batch(),
                                 total=int(total_iters)):
            if args["bleu_evaluation"]:
                mode = {"next_token": "ARGMAX", "beam_size": 5}
            else:
                mode = None
            batch_outputs = wizard(batch, mode)
            # Stringify model responses.
            if args["bleu_evaluation"]:
                batch_outputs["model_response"] = (
                    val_loader.stringify_beam_outputs(
                        batch_outputs["beam_output"], batch))
                # Remove beam output to avoid memory issues.
                del batch_outputs["beam_output"]
            matches.append(batch_outputs)
    wizard.train()

    # Compute perplexity.
    total_loss_sum = sum(ii["loss_sum"].item() for ii in matches)
    num_tokens = sum(ii["num_tokens"].item() for ii in matches)
    avg_loss_eval = total_loss_sum / num_tokens

    # Compute BLEU score.
    if args["bleu_evaluation"]:
        model_responses = [jj for ii in matches for jj in ii["model_response"]]
        bleu_score = val_loader.evaluate_response_generation(model_responses)
    else:
        model_responses = None
        bleu_score = -1.

    # Evaluate retrieval score.
    if args["retrieval_evaluation"]:
        candidate_scores = [
            jj for ii in matches for jj in ii["candidate_scores"]
        ]
        retrieval_metrics = val_loader.evaluate_response_retrieval(
            candidate_scores)
        print(retrieval_metrics)
    else:
        retrieval_metrics = {}

    # Evaluate action prediction.
    action_predictions = [jj for ii in matches for jj in ii["action_preds"]]
    action_metrics = val_loader.evaluate_action_prediction(action_predictions)
    print(action_metrics["confusion_matrix"])
    print_str = ("\nEvaluation\n\tLoss: {:.2f}\n\t"
                 "Perplexity: {:.2f}\n\tBLEU: {:.3f}\n\t"
                 "Action: {:.2f}\n\t"
                 "Action Perplexity: {:.2f}\n\t"
                 "Action Attribute Accuracy: {:.2f}")
    print(
        print_str.format(avg_loss_eval, math.exp(avg_loss_eval), bleu_score,
                         100 * action_metrics["action_accuracy"],
                         action_metrics["action_perplexity"],
                         100 * action_metrics["attribute_accuracy"]))
    # Save the results to a file.
    eval_dict = {
        "loss": avg_loss_eval,
        "perplexity": math.exp(avg_loss_eval),
        "bleu": bleu_score,
        "action_accuracy": action_metrics["action_accuracy"],
        "action_perplexity": action_metrics["action_perplexity"],
        "action_attribute": action_metrics["attribute_accuracy"]
    }
    eval_dict.update(retrieval_metrics)
    eval_outputs = {
        "model_actions": action_predictions,
        "model_responses": model_responses
    }
    return eval_dict, eval_outputs
Example #27
    id2synet[ID] = list(id2synet[ID])[0]

#####################

id2path = dict()
for root, _, files in os.walk(imgs_dir):
    for file in files:
        if os.path.splitext(file)[1] == ".png":
            id2path[file] = os.path.join(root, file)

examples = [json.loads(line) for line in open(json_file).readlines()]
hashes = json.loads(open(hash_file).read())

res_dict = dict()

for ID in progressbar(id2path):
    img = read_img(id2path[ID])
    if img is None:
        continue
    C = id2synet[ID]
    res_dir = os.path.join(DDIR, str(C))
    if not os.path.isdir(res_dir):
        os.mkdir(res_dir)

    path = os.listdir(res_dir)
    path = filter(lambda p: os.path.splitext(p)[1] == ".png", path)
    path = sum(1 for _ in path)
    path = os.path.join(res_dir, str(path) + ".png")

    res_dict[ID] = path
    cv2.imwrite(path, img)
Example #28
  if len(members) == 1:
    return list(members)
  if len(drr) == 2 and drr[0] == "VP" and isinstance(drr[1], list):
    if len(drr[1]) == 0:
      return []
    elif drr[1][0] == "VP" and len(drr[1]) == 2:
      return [rr[1][0], rr[1][1]]
  return rr

def pp(lol):
  if isinstance(lol, str):
    return lol
  return "(%s)" % " ".join([pp(l) for l in lol])

with open(sys.argv[1]) as ptb_f:
  for line in progressbar(ptb_f):
    tree = ParentedTree.fromstring(line)
    # record the list of substitutions
    lookup = {}
    index = 0
    for st in tree.subtrees():
      if len(list(st.subtrees())) == 1:
        lookup[index] = st[0]
        st[0] = str(index)
        index += 1
    colparse = collapse(strip(tree))
    final = finalize(colparse)
    print(pp(final))
    #print(lookup)
    #print('')
    #pdb.set_trace();
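# Quick check of pp() on a hand-built nested list (a sketch, not part of
# the original script):
#
#     pp(["S", ["NP", "dogs"], ["VP", "bark"]])  # -> '(S (NP dogs) (VP bark))'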
Example #29
snapshot_saver = tf.train.Saver(max_to_keep=None)  # keep all snapshots
snapshot_saver.restore(sess, args['checkpoint'])

print('Evaluating on %s' % args['testSplit'])
ansMatches = []
progMatches = []
totalIter = int(valLoader.batchLoader.numInst / args['batchSize'])
maxIters = 100
curIter = 0
toSave = {
    'output': [],
    'batch': []
}

for batch in progressbar(valLoader.batches(), total=totalIter):
    _, outputs = model.runVisualizeIteration(batch, sess)

    toSave['output'].append(outputs)
    toSave['batch'].append(batch)

    # debug -- also compute the ranks during visualization
    #ranks.append(batchRanks);

    curIter += 1
    if curIter >= maxIters: break

# save the output + batch
batchPath = args['checkpoint'] + '.100_batches.npy'
print('Printing the batches: ' + batchPath)
support.saveBatch(toSave, batchPath)
Example #30
snapshot_saver = tf.train.Saver(max_to_keep=None)  # keep all snapshots
snapshot_saver.restore(sess, args['checkpoint'])

print('Evaluating on %s' % args['test_split'])
ranks = []
matches = []
total_iter = int(val_loader.batch_loader.num_inst / args['batch_size'])
num_iters = 0

# get confusion matrix only if using refer
confusion_mat = np.zeros((2, 2))
if args['use_refer']:
  refer_token = question_assembler.name2idx_dict['_Refer']
  find_token = question_assembler.name2idx_dict['_Find']

for batch in progressbar(val_loader.batches(), total=total_iter):
  batch_ranks, outputs = model.run_evaluate_iteration(batch, sess)

  ranks.append(batch_ranks)
  if 'matches' in outputs: matches.append(outputs['matches'])

  # debug, get confusion between find/refer
  if args['use_refer']:
    find_gt = batch['gt_layout'] == find_token
    refer_gt = batch['gt_layout'] == refer_token
    find_pred = outputs['pred_tokens'] == find_token
    refer_pred = outputs['pred_tokens'] == refer_token

    confusion_mat[0, 0] += np.sum(find_pred & find_gt)
    confusion_mat[0, 1] += np.sum(refer_pred & find_gt)
    confusion_mat[1, 0] += np.sum(find_pred & refer_gt)
    confusion_mat[1, 1] += np.sum(refer_pred & refer_gt)