Example #1
 def optimize_global(self, niter=200, npop=50, population=None, label='Global optimisation', leave=False):
     if self.de is None:
         self.de = DiffEvol(self.lnposterior, clip(self.ps.bounds, -1, 1), npop, maximize=True, vectorize=True)
         if population is None:
             self.de._population[:, :] = self.create_pv_population(npop)
         else:
             self.de._population[:, :] = population
     for _ in tqdm(self.de(niter), total=niter, desc=label, leave=leave):
         pass
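A minimal usage sketch (the LPF instance and parameter values are illustrative; `optimize_global` and the `de` attribute come from the example above):

lpf = MyLPF(...)                                  # hypothetical PyTransit-style LPF subclass
lpf.optimize_global(niter=300, npop=100, label='DE')
seed_population = lpf.de.population               # final DE population, e.g. to seed MCMC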
Example #2
File: lpf.py Project: hpparvi/PyTransit
 def sample_mcmc(self, niter=500, thin=5, label='MCMC sampling', reset=False, leave=True):
     if self.sampler is None:
         self.sampler = EnsembleSampler(self.de.n_pop, self.de.n_par, self.lnposterior, vectorize=True)
         pop0 = self.de.population
     else:
         pop0 = self.sampler.chain[:,-1,:].copy()
     if reset:
         self.sampler.reset()
     for _ in tqdm(self.sampler.sample(pop0, iterations=niter, thin=thin), total=niter, desc=label, leave=leave):
         pass
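Taken together with Example #1, a hedged end-to-end sketch (`n_par` and `chain` follow the attribute names used above; burn-in and sample counts are illustrative):

lpf.optimize_global(niter=200, npop=50)     # seeds self.de
lpf.sample_mcmc(niter=2000, thin=10)        # walkers start from the DE population
flat_chain = lpf.sampler.chain[:, -100:, :].reshape(-1, lpf.de.n_par)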
Example #3
def bestThresshold(y_train, train_preds):
    # Scan candidate thresholds and keep the one with the highest F1 score.
    best_threshold = 0.0
    best_score = 0.0
    for threshold in tqdm(np.arange(0.1, 0.501, 0.01)):
        score = f1_score(y_train, np.array(train_preds) > threshold)
        if score > best_score:
            best_threshold = threshold
            best_score = score
    print('best threshold is {:.4f} with F1 score: {:.4f}'.format(best_threshold, best_score))
    return best_threshold
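A usage sketch with synthetic data (shapes and the noise model are illustrative):

import numpy as np
from sklearn.metrics import f1_score
from tqdm import tqdm

y_train = np.random.randint(0, 2, size=1000)                # binary labels
train_preds = 0.6 * y_train + 0.4 * np.random.rand(1000)    # noisy scores in [0, 1]
threshold = bestThresshold(y_train, train_preds)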
Example #4
def display_progress(iterator, total=None, **kwargs):
    """
    Displays a progress bar when iterating.
    """
    if tqdm is not None:
        return tqdm(iterator, total=total, **kwargs)
    else:
        if display_progress._show_warning:
            logging.getLogger(__name__).warning('Module `tqdm` is not available '
                                                'and progress cannot be displayed')
            display_progress._show_warning = False
        return iterator

# The function attribute used above must be initialised once after the definition.
display_progress._show_warning = True
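A usage sketch; the function assumes `tqdm` is bound at module level (to `None` when the package is missing), e.g.:

import logging
try:
    from tqdm import tqdm
except ImportError:
    tqdm = None

for _ in display_progress(range(1000), total=1000, desc='processing'):
    pass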
Example #5
    def transcribe(
        self,
        paths2audio_files: List[str],
        batch_size: int = 4,
        return_hypotheses: bool = False,
        partial_hypothesis: Optional[List['Hypothesis']] = None,
        num_workers: int = 0,
    ) -> (List[str], Optional[List['Hypothesis']]):
        """
        Uses greedy decoding to transcribe audio files. Use this method for debugging and prototyping.

        Args:

            paths2audio_files: (a list) of paths to audio files. \
        Recommended length per file is between 5 and 25 seconds. \
        But it is possible to pass a few hours long file if enough GPU memory is available.
            batch_size: (int) batch size to use during inference. \
        Bigger will result in better throughput performance but would use more memory.
            return_hypotheses: (bool) Either return hypotheses or text
        With hypotheses can do some postprocessing like getting timestamp or rescoring
            num_workers: (int) number of workers for DataLoader

        Returns:
            A list of transcriptions in the same order as paths2audio_files. Will also return
        """
        if paths2audio_files is None or len(paths2audio_files) == 0:
            return [], []
        # We will store transcriptions here
        hypotheses = []
        all_hypotheses = []
        # Model's mode and device
        mode = self.training
        device = next(self.parameters()).device
        dither_value = self.preprocessor.featurizer.dither
        pad_to_value = self.preprocessor.featurizer.pad_to

        if num_workers is None:
            num_workers = min(batch_size, os.cpu_count() - 1)

        try:
            self.preprocessor.featurizer.dither = 0.0
            self.preprocessor.featurizer.pad_to = 0

            # Switch model to evaluation mode
            self.eval()
            # Freeze the encoder and decoder modules
            self.encoder.freeze()
            self.decoder.freeze()
            self.joint.freeze()
            logging_level = logging.get_verbosity()
            logging.set_verbosity(logging.WARNING)
            # Work in tmp directory - will store manifest file there
            with tempfile.TemporaryDirectory() as tmpdir:
                with open(os.path.join(tmpdir, 'manifest.json'),
                          'w',
                          encoding='utf-8') as fp:
                    for audio_file in paths2audio_files:
                        entry = {
                            'audio_filepath': audio_file,
                            'duration': 100000,
                            'text': 'nothing'
                        }
                        fp.write(json.dumps(entry) + '\n')

                config = {
                    'paths2audio_files': paths2audio_files,
                    'batch_size': batch_size,
                    'temp_dir': tmpdir,
                    'num_workers': num_workers,
                }

                temporary_datalayer = self._setup_transcribe_dataloader(config)
                for test_batch in tqdm(temporary_datalayer,
                                       desc="Transcribing"):
                    encoded, encoded_len = self.forward(
                        input_signal=test_batch[0].to(device),
                        input_signal_length=test_batch[1].to(device))
                    best_hyp, all_hyp = self.decoding.rnnt_decoder_predictions_tensor(
                        encoded,
                        encoded_len,
                        return_hypotheses=return_hypotheses,
                        partial_hypotheses=partial_hypothesis,
                    )

                    hypotheses += best_hyp
                    if all_hyp is not None:
                        all_hypotheses += all_hyp
                    else:
                        all_hypotheses += best_hyp

                    del encoded
                    del test_batch
        finally:
            # set mode back to its original value
            self.train(mode=mode)
            self.preprocessor.featurizer.dither = dither_value
            self.preprocessor.featurizer.pad_to = pad_to_value

            logging.set_verbosity(logging_level)
            if mode is True:
                self.encoder.unfreeze()
                self.decoder.unfreeze()
                self.joint.unfreeze()
        return hypotheses, all_hypotheses
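A hedged usage sketch (the class name and checkpoint path are assumptions; `transcribe` itself is the method above):

from nemo.collections.asr.models import EncDecRNNTModel  # assumed import path

model = EncDecRNNTModel.restore_from('rnnt_model.nemo')   # checkpoint path is a placeholder
texts, _ = model.transcribe(['audio1.wav', 'audio2.wav'], batch_size=8)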
Example #6
    def trainGAN(self):
        gen = self.generator().to(self.device)
        gen_opt = torch.optim.Adam(gen.parameters(),
                                   lr=self.lr,
                                   betas=(self.beta1, self.beta2))
        critic = self.critic().to(self.device)
        critic_opt = torch.optim.Adam(critic.parameters(),
                                      lr=self.lr,
                                      betas=(self.beta1, self.beta2))

        cur_step = 0
        loadAndAugmentMasks = makeMasks.MaskClass(self.config, rand_seed=None)

        # The nn.Conv2d weight init may not work when we use PartialConv2d; in that case,
        # add `or isinstance(m, PartialConv2d)` and keep PartialConv2d somewhere accessible.
        def weights_init(m):
            if isinstance(m, nn.Conv2d) or isinstance(
                    m, nn.ConvTranspose2d) or isinstance(m, PartialConv2d):
                nn.init.kaiming_normal_(m.weight,
                                        mode='fan_out',
                                        nonlinearity='relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                torch.nn.init.constant_(m.weight, 1)
                torch.nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.LayerNorm):
                torch.nn.init.normal_(m.weight, 0.0, 0.02)
                torch.nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0.0, 0.02)

        gen = gen.apply(weights_init)
        critic = critic.apply(weights_init)

        print("Setup loss function...")
        loss_func = CalculateLoss(config=self.config).to(self.device)

        for epoch in range(self.epochs):
            for real, SAR in tqdm(self.dataloader,
                                  position=0,
                                  leave=True,
                                  disable=True):  #self.config.run_polyaxon):

                masks = loadAndAugmentMasks.returnTensorMasks(self.batchSize)
                masks = torch.from_numpy(masks)
                masks = masks.type(torch.cuda.FloatTensor)
                masks = 1 - masks
                masks = masks.to(self.device)

                real = real.to(self.device)

                # ---------------------
                #  Train critic
                # ---------------------
                critic.zero_grad()
                # Real images
                real_validity = critic(real)
                d_real = real_validity.mean()
                # Generate a batch of images with mask
                #Masked_fake_img = torch.mul(real_vv_vh, masks4)
                Masked_fake_img = torch.mul(real, masks)
                fake_imgs = gen(Masked_fake_img, masks)

                # Fake images
                fake_validity = critic(fake_imgs)  # Detach or not?
                d_fake = fake_validity.mean()

                gradient_penalty = self.calc_gradient_penalty(
                    critic, real.data, fake_imgs.data)
                d_loss = d_fake - d_real + gradient_penalty
                d_loss.backward()

                critic_opt.step()

                # Values for txt / logging
                critic_cost = d_fake - d_real + gradient_penalty
                wasserstein_d = d_real - d_fake
                critic_score = real_validity.mean().item()
                gen_score = fake_validity.mean().item()

                # Train the generator every n_critic steps
                if cur_step % self.n_critic == 0:

                    # -----------------
                    #  Train Generator
                    # -----------------
                    gen.zero_grad()
                    # Generate a batch of images
                    fake_noise = torch.mul(real, masks)
                    fake_imgs = gen(fake_noise, masks)
                    # Loss measures generator's ability to fool the critic
                    # Train on fake images
                    fake_validity1 = critic(fake_imgs)

                    loss_dict = loss_func(real, masks, fake_imgs, real)
                    loss = 0.0

                    # sums up each loss value
                    for key, value in loss_dict.items():
                        loss += value

                    loss.backward(retain_graph=True)

                    g_loss = fake_validity1.mean()
                    #g_lossMSE = criterionMSE(real, fake_imgs)
                    #g_lossMSE.backward(retain_graph=True)

                    g_loss = -g_loss
                    g_loss.backward()  #mone

                    gen_opt.step()
                    gen_cost = g_loss
                cur_step += 1
            if self.config.run_polyaxon and epoch % 5 == 0:
                metrics = {}
                for key, value in loss_dict.items():
                    modelHelper.saveMetricsNewPolyaxon(metrics, key,
                                                       value.item(), epoch,
                                                       self.config)
                modelHelper.saveMetricsNewPolyaxon(metrics, 'critic cost',
                                                   critic_cost.item(), epoch,
                                                   self.config)
                modelHelper.saveMetricsNewPolyaxon(metrics,
                                                   'Wasserstein distance',
                                                   wasserstein_d.item(), epoch,
                                                   self.config)
                modelHelper.saveMetricsNewPolyaxon(metrics, 'Gen cost',
                                                   gen_cost.item(), epoch,
                                                   self.config)

            if epoch % self.save_model_step == 0 and self.trainMode:
                name = str(self.modelName) + '_' + str(epoch)
                model_path = modelHelper.saveModel(name, self.modelOutputPath,
                                                   gen, self.modelName)
                if self.config.nir_data:
                    modelHelper.save_tensor_batch_NIR(
                        real, Masked_fake_img, fake_imgs, self.batchSize,
                        Path.joinpath(self.ImageOutputPath,
                                      'epoch_' + str(epoch)))

                else:
                    # Both branches currently use the NIR saver; the original
                    # save_tensor_batch call was replaced when the generator
                    # started taking SAR input.
                    modelHelper.save_tensor_batch_NIR(
                        real, Masked_fake_img, fake_imgs, self.batchSize,
                        Path.joinpath(self.ImageOutputPath,
                                      'epoch_' + str(epoch)))

        if self.trainWithFreeze:
            #trainFrozenModel = trainFrozenGan(self.dataloader,gen,critic,gen_opt,critic_opt, self.config)
            #trainFrozenGan.trainGAN()
            # Freeze BN in the encoder parts of the network.
            # (Use affine? Or set weight and bias via module.eval().)
            for name, module in gen.named_modules():
                if isinstance(module, nn.BatchNorm2d) and 'down' in name:
                    module.eval()

            for epoch in range(self.epochsFrozen):
                for real, SAR in tqdm(self.dataloader,
                                      position=0,
                                      leave=True,
                                      disable=self.config.run_polyaxon):

                    masks = loadAndAugmentMasks.returnTensorMasks(
                        self.batchSize)
                    masks = torch.from_numpy(masks)
                    masks = masks.type(torch.cuda.FloatTensor)
                    masks = 1 - masks
                    masks = masks.to(self.device)

                    real = real.to(self.device)

                    # ---------------------
                    #  Train critic
                    # ---------------------
                    critic.zero_grad()
                    # Real images
                    real_validity = critic(real)
                    d_real = real_validity.mean()
                    # Generate a batch of images with mask
                    Masked_fake_img = torch.mul(real, masks)
                    fake_imgs = gen(Masked_fake_img, masks)
                    # Fake images
                    fake_validity = critic(fake_imgs)  # Detach or not?
                    d_fake = fake_validity.mean()

                    gradient_penalty = self.calc_gradient_penalty(
                        critic, real.data, fake_imgs.data)
                    d_loss = d_fake - d_real + gradient_penalty
                    d_loss.backward()

                    critic_opt.step()

                    # Values for txt / logging
                    critic_cost = d_fake - d_real + gradient_penalty
                    wasserstein_d = d_real - d_fake
                    critic_score = real_validity.mean().item()
                    gen_score = fake_validity.mean().item()

                    # Train the generator every n_critic steps
                    if cur_step % self.n_critic == 0:

                        # -----------------
                        #  Train Generator
                        # -----------------
                        gen.zero_grad()
                        # Generate a batch of images
                        fake_noise = torch.mul(real, masks)
                        fake_imgs = gen(fake_noise, masks)
                        # Loss measures generator's ability to fool the critic
                        # Train on fake images
                        fake_validity1 = critic(fake_imgs)

                        loss_dict = loss_func(fake_noise, masks, fake_imgs,
                                              real)
                        loss = 0.0

                        # sums up each loss value
                        for key, value in loss_dict.items():
                            loss += value

                        loss.backward(retain_graph=True)

                        g_loss = fake_validity1.mean()
                        # g_lossMSE = criterionMSE(real, fake_imgs)
                        # g_lossMSE.backward(retain_graph=True)

                        g_loss = -g_loss
                        g_loss.backward()  # mone

                        gen_opt.step()
                        gen_cost = g_loss
                    cur_step += 1

                if self.config.run_polyaxon and epoch % 5 == 0:
                    metrics = {}
                    for key, value in loss_dict.items():
                        modelHelper.saveMetricsNewPolyaxon(
                            metrics, key, value.item(), epoch, self.config)
                    modelHelper.saveMetricsNewPolyaxon(
                        metrics, 'critic cost', critic_cost.item(), epoch,
                        self.config)
                    modelHelper.saveMetricsNewPolyaxon(
                        metrics, 'Wasserstein distance',
                        wasserstein_d.item(), epoch, self.config)
                    modelHelper.saveMetricsNewPolyaxon(
                        metrics, 'Gen cost', gen_cost.item(), epoch,
                        self.config)
                if epoch % self.save_model_step == 0 and self.trainMode:
                    name = str(self.modelName) + '_' + str(epoch + self.epochs)
                    model_path = modelHelper.saveModel(name,
                                                       self.modelOutputPath,
                                                       gen, self.modelName)
                    if self.config.nir_data:
                        modelHelper.save_tensor_batch_NIR(
                            real, Masked_fake_img, fake_imgs, self.batchSize,
                            Path.joinpath(self.ImageOutputPath,
                                          'epoch_' + str(epoch)))
                    else:
                        modelHelper.save_tensor_batch(
                            real, Masked_fake_img, fake_imgs, self.batchSize,
                            Path.joinpath(self.ImageOutputPath,
                                          'epoch_' + str(epoch + self.epochs)))
                    # Save loss from generator and critic to a file

                    filename = Path.joinpath(
                        self.modelOutputPath, self.modelName + '_' +
                        str(self.batchSize) + 'Errors.txt')
                    saveString = (
                        f'wasserStein Number: {wasserstein_d} '
                        f'Generator loss: {g_loss.item()}\n'
                        f'critic loss: {d_loss.item()}\n'
                        f'critic guess on reals: {critic_score} '
                        f'critic guess on fakes: {gen_score} '
                        f'Updated critic guess on fake: {gen_cost}\n'
                    )
                    modelHelper.saveToTxt(filename, saveString)

        return model_path
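The `calc_gradient_penalty` helper called in both training loops is not shown. Below is a minimal sketch of the standard WGAN-GP penalty (Gulrajani et al., 2017), written as a free function; it is an assumption, not the project's exact implementation, and `lambda_gp=10.0` is only an illustrative default:

import torch

def calc_gradient_penalty(critic, real_data, fake_data, lambda_gp=10.0):
    # Interpolate between real and fake samples.
    batch_size = real_data.size(0)
    eps = torch.rand(batch_size, 1, 1, 1, device=real_data.device)
    interpolated = (eps * real_data + (1 - eps) * fake_data).requires_grad_(True)
    validity = critic(interpolated)
    # Gradient of the critic output w.r.t. the interpolated input.
    grads = torch.autograd.grad(outputs=validity, inputs=interpolated,
                                grad_outputs=torch.ones_like(validity),
                                create_graph=True, retain_graph=True)[0]
    grads = grads.view(batch_size, -1)
    # Penalise deviation of the gradient norm from 1.
    return lambda_gp * ((grads.norm(2, dim=1) - 1) ** 2).mean()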
Example #7
def main():
    args = parse_args()

    # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
    accelerator = Accelerator()
    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(accelerator.state)

    # Setup logging, we only want one process per machine to log things on the screen.
    # accelerator.is_local_main_process is only True for one process per machine.
    logger.setLevel(
        logging.INFO if accelerator.is_local_main_process else logging.ERROR)
    if accelerator.is_local_main_process:
        datasets.utils.logging.set_verbosity_warning()
        transformers.utils.logging.set_verbosity_info()
    else:
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()

    # If passed along, set the training seed now.
    if args.seed is not None:
        set_seed(args.seed)

    # Handle the repository creation
    if accelerator.is_main_process:
        if args.push_to_hub:
            if args.hub_model_id is None:
                repo_name = get_full_repo_name(Path(args.output_dir).name,
                                               token=args.hub_token)
            else:
                repo_name = args.hub_model_id
            repo = Repository(args.output_dir, clone_from=repo_name)
        elif args.output_dir is not None:
            os.makedirs(args.output_dir, exist_ok=True)
    accelerator.wait_for_everyone()

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(args.dataset_name,
                                    args.dataset_config_name)
    else:
        data_files = {}
        if args.train_file is not None:
            data_files["train"] = args.train_file
        if args.validation_file is not None:
            data_files["validation"] = args.validation_file
        extension = args.train_file.split(".")[-1]
        raw_datasets = load_dataset(extension, data_files=data_files)
    # Trim a number of training examples
    if args.debug:
        for split in raw_datasets.keys():
            raw_datasets[split] = raw_datasets[split].select(range(100))
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    if raw_datasets["train"] is not None:
        column_names = raw_datasets["train"].column_names
    else:
        column_names = raw_datasets["validation"].column_names

    # When using your own dataset or a different dataset from swag, you will probably need to change this.
    ending_names = [f"ending{i}" for i in range(4)]
    context_name = "sent1"
    question_header_name = "sent2"
    label_column_name = "label" if "label" in column_names else "labels"

    # Load pretrained model and tokenizer
    #
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if args.config_name:
        config = AutoConfig.from_pretrained(args.config_name)
    elif args.model_name_or_path:
        config = AutoConfig.from_pretrained(args.model_name_or_path)
    else:
        config = CONFIG_MAPPING[args.model_type]()
        logger.warning(
            "You are instantiating a new config instance from scratch.")

    if args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(
            args.tokenizer_name, use_fast=not args.use_slow_tokenizer)
    elif args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            args.model_name_or_path, use_fast=not args.use_slow_tokenizer)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if args.model_name_or_path:
        model = AutoModelForMultipleChoice.from_pretrained(
            args.model_name_or_path,
            from_tf=bool(".ckpt" in args.model_name_or_path),
            config=config,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelForMultipleChoice.from_config(config)

    model.resize_token_embeddings(len(tokenizer))

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    padding = "max_length" if args.pad_to_max_length else False

    def preprocess_function(examples):
        first_sentences = [[context] * 4 for context in examples[context_name]]
        question_headers = examples[question_header_name]
        second_sentences = [[
            f"{header} {examples[end][i]}" for end in ending_names
        ] for i, header in enumerate(question_headers)]
        labels = examples[label_column_name]

        # Flatten out
        first_sentences = list(chain(*first_sentences))
        second_sentences = list(chain(*second_sentences))

        # Tokenize
        tokenized_examples = tokenizer(
            first_sentences,
            second_sentences,
            max_length=args.max_length,
            padding=padding,
            truncation=True,
        )
        # Un-flatten
        tokenized_inputs = {
            k: [v[i:i + 4] for i in range(0, len(v), 4)]
            for k, v in tokenized_examples.items()
        }
        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    with accelerator.main_process_first():
        processed_datasets = raw_datasets.map(
            preprocess_function,
            batched=True,
            remove_columns=raw_datasets["train"].column_names)

    train_dataset = processed_datasets["train"]
    eval_dataset = processed_datasets["validation"]

    # Log a few random samples from the training set:
    for index in random.sample(range(len(train_dataset)), 3):
        logger.info(
            f"Sample {index} of the training set: {train_dataset[index]}.")

    # DataLoaders creation:
    if args.pad_to_max_length:
        # If padding was already done to max length, we use the default data collator that will just convert
        # everything to tensors.
        data_collator = default_data_collator
    else:
        # Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of
        # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple
        # of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
        data_collator = DataCollatorForMultipleChoice(
            tokenizer,
            pad_to_multiple_of=(8 if accelerator.use_fp16 else None))

    train_dataloader = DataLoader(train_dataset,
                                  shuffle=True,
                                  collate_fn=data_collator,
                                  batch_size=args.per_device_train_batch_size)
    eval_dataloader = DataLoader(eval_dataset,
                                 collate_fn=data_collator,
                                 batch_size=args.per_device_eval_batch_size)

    # Optimizer
    # Split weights in two groups, one with weight decay and the other not.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

    # Use the device given by the `accelerator` object.
    device = accelerator.device
    model.to(device)

    # Prepare everything with our `accelerator`.
    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader)

    # Note -> the training dataloader needs to be prepared before we grab its length below (because its length
    # will be shorter in a multi-process setting)

    # Scheduler and math around the number of training steps.
    num_update_steps_per_epoch = math.ceil(
        len(train_dataloader) / args.gradient_accumulation_steps)
    if args.max_train_steps is None:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
    else:
        args.num_train_epochs = math.ceil(args.max_train_steps /
                                          num_update_steps_per_epoch)

    lr_scheduler = get_scheduler(
        name=args.lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=args.num_warmup_steps,
        num_training_steps=args.max_train_steps,
    )

    # Metrics
    metric = load_metric("accuracy")

    # Train!
    total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps

    logger.info("***** Running training *****")
    logger.info(f"  Num examples = {len(train_dataset)}")
    logger.info(f"  Num Epochs = {args.num_train_epochs}")
    logger.info(
        f"  Instantaneous batch size per device = {args.per_device_train_batch_size}"
    )
    logger.info(
        f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}"
    )
    logger.info(
        f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
    logger.info(f"  Total optimization steps = {args.max_train_steps}")
    # Only show the progress bar once on each machine.
    progress_bar = tqdm(range(args.max_train_steps),
                        disable=not accelerator.is_local_main_process)
    completed_steps = 0

    for epoch in range(args.num_train_epochs):
        model.train()
        for step, batch in enumerate(train_dataloader):
            outputs = model(**batch)
            loss = outputs.loss
            loss = loss / args.gradient_accumulation_steps
            accelerator.backward(loss)
            if step % args.gradient_accumulation_steps == 0 or step == len(
                    train_dataloader) - 1:
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                progress_bar.update(1)
                completed_steps += 1

            if completed_steps >= args.max_train_steps:
                break

        model.eval()
        for step, batch in enumerate(eval_dataloader):
            with torch.no_grad():
                outputs = model(**batch)
            predictions = outputs.logits.argmax(dim=-1)
            metric.add_batch(
                predictions=accelerator.gather(predictions),
                references=accelerator.gather(batch["labels"]),
            )

        eval_metric = metric.compute()
        accelerator.print(f"epoch {epoch}: {eval_metric}")

        if args.push_to_hub and epoch < args.num_train_epochs - 1:
            accelerator.wait_for_everyone()
            unwrapped_model = accelerator.unwrap_model(model)
            unwrapped_model.save_pretrained(args.output_dir,
                                            save_function=accelerator.save)
            if accelerator.is_main_process:
                tokenizer.save_pretrained(args.output_dir)
                repo.push_to_hub(
                    commit_message=f"Training in progress epoch {epoch}",
                    blocking=False,
                    auto_lfs_prune=True)

    if args.output_dir is not None:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(args.output_dir,
                                        save_function=accelerator.save)
        if accelerator.is_main_process:
            tokenizer.save_pretrained(args.output_dir)
            if args.push_to_hub:
                repo.push_to_hub(commit_message="End of training",
                                 auto_lfs_prune=True)
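The script above is meant to be launched through Accelerate; a hypothetical invocation (the script filename is assumed, the flag names mirror the `args` attributes the code reads):

#   accelerate launch run_swag_no_trainer.py \
#       --model_name_or_path bert-base-uncased \
#       --dataset_name swag \
#       --output_dir /tmp/swag_out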
Example #8
    def predict(self, to_predict):
        """
        Performs predictions on a list of text.
        Args:
            to_predict: A Python list of text (str) to be sent to the model for prediction.
        Returns:
            preds: A Python list of lists with dicts containing each word mapped to its NER tag.
            model_outputs: A Python list of the raw model outputs for each text.
        """

        device = self.device
        model = self.model
        args = self.args
        pad_token_label_id = self.pad_token_label_id

        self._move_model_to_device()

        predict_examples = [
            InputExample(i, sentence.split(),
                         ["O" for word in sentence.split()])
            for i, sentence in enumerate(to_predict)
        ]

        eval_dataset = self.load_and_cache_examples(
            None, to_predict=predict_examples)

        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset,
                                     sampler=eval_sampler,
                                     batch_size=args["eval_batch_size"])

        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        out_label_ids = None
        model.eval()

        for batch in tqdm(eval_dataloader, disable=args["silent"]):
            batch = tuple(t.to(device) for t in batch)

            with torch.no_grad():
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "labels": batch[3],
                }
                # XLM and RoBERTa don"t use segment_ids
                if args["model_type"] in ["bert", "xlnet"]:
                    inputs["token_type_ids"] = batch[2]
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]

                eval_loss += tmp_eval_loss.mean().item()

            nb_eval_steps += 1

            if preds is None:
                preds = logits.detach().cpu().numpy()
                out_label_ids = inputs["labels"].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(
                    out_label_ids,
                    inputs["labels"].detach().cpu().numpy(),
                    axis=0)

        eval_loss = eval_loss / nb_eval_steps
        model_outputs = preds
        preds = np.argmax(preds, axis=2)

        label_map = {i: label for i, label in enumerate(self.labels)}

        out_label_list = [[] for _ in range(out_label_ids.shape[0])]
        preds_list = [[] for _ in range(out_label_ids.shape[0])]

        for i in range(out_label_ids.shape[0]):
            for j in range(out_label_ids.shape[1]):
                if out_label_ids[i, j] != pad_token_label_id:
                    out_label_list[i].append(label_map[out_label_ids[i][j]])
                    preds_list[i].append(label_map[preds[i][j]])

        preds = [[{
            word: preds_list[i][j]
        } for j, word in enumerate(sentence.split()[:len(preds_list[i])])]
                 for i, sentence in enumerate(to_predict)]

        return preds, model_outputs
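A hedged usage sketch (the surrounding class follows a simpletransformers-style NER API; model construction and the exact tag output are illustrative):

predictions, raw_outputs = model.predict(["John lives in New York"])
# predictions -> [[{'John': 'B-PER'}, {'lives': 'O'}, {'in': 'O'},
#                  {'New': 'B-LOC'}, {'York': 'I-LOC'}]]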
Example #9
def coverage_table(
    peakfile,
    datafiles,
    window,
    log_transform=True,
    normalization="none",
    top=0,
    topmethod="var",
    rmdup=True,
    rmrepeats=True,
    ncpus=12,
):
    for x in datafiles:
        if not os.path.isfile(x):
            print("ERROR: Data file '{0}' does not exist".format(x))
            sys.exit(1)
    for x in datafiles:
        if ".bam" in x and not os.path.isfile("{0}.bai".format(x)):
            print("Data file '{0}' does not have an index file."
                  " Creating an index file for {0}.".format(x))
            pysam.index(x)

    logger.info("Loading data")
    data = {}
    try:
        # Load data in parallel
        pool = multiprocessing.Pool(processes=ncpus)
        jobs = []
        for datafile in datafiles:
            jobs.append(
                pool.apply_async(
                    load_heatmap_data,
                    args=(
                        peakfile,
                        datafile,
                        1,
                        window // 2,
                        window // 2,
                        rmdup,
                        False,
                        rmrepeats,
                        None,
                        False,
                        None,
                    ),
                ))
        for job in tqdm(jobs):
            track, regions, profile, guard = job.get()
            data[os.path.splitext(track)[0]] = profile[:, 0]
    except Exception as e:
        sys.stderr.write("Error loading data in parallel, trying serial\n")
        sys.stderr.write("Error: {}\n".format(e))
        for datafile in tqdm(datafiles):
            track, regions, profile, guard = load_heatmap_data(
                peakfile,
                datafile,
                1,
                window // 2,
                window // 2,
                rmdup,
                False,
                rmrepeats,
                None,
                False,
                None,
            )
            data[os.path.splitext(track)[0]] = profile[:, 0]

    # Create DataFrame with regions as index
    regions = ["{}:{}-{}".format(*region[:3]) for region in regions]
    df = pd.DataFrame(data, index=regions)

    if log_transform:
        logger.info("Log transform")
        df = np.log1p(df)
    if normalization == "scale":
        logger.info("Normalization by scaling")
        df[:] = scale(df, axis=0)
    if normalization == "quantile":
        logger.info("Normalization by quantile normalization")
        df = qnorm.quantile_normalize(df)
    else:
        logger.info("No normalization")

    if top > 0:
        if topmethod == "var":
            idx = df.var(1).sort_values().tail(top).index
        elif topmethod == "std":
            idx = df.std(1).sort_values().tail(top).index
        elif topmethod == "mean":
            idx = df.mean(1).sort_values().tail(top).index
        elif topmethod == "random":
            idx = df.sample(top).index
        else:
            raise ValueError(
                "unknown method {} for selecting regions".format(topmethod))
        df = df.loc[idx]
    return df
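A usage sketch (file names are placeholders; the keyword values are illustrative):

df = coverage_table("peaks.bed", ["sample1.bam", "sample2.bam"], window=2000,
                    normalization="quantile", top=1000, topmethod="var")
df.to_csv("coverage.tsv", sep="\t")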
Example #10
def render_dataset(dataset: np.ndarray, names: np.ndarray, args):
    '''Renders a list of TeX equations.

    Args:
        dataset (numpy.ndarray): List of equations
        names (numpy.ndarray): List of integers, the same size as `dataset`, that give the name of the saved image
        args (Union[Namespace, Munch]): additional arguments: mode (equation or inline), out (output directory),
                                        divable (common divisor for the image dimensions), batchsize (how many
                                        samples to render at once), dpi, font (math font), preprocess (crop, alpha off),
                                        shuffle (bool)

    Returns:
        list: equation indices that could not be rendered.
    '''
    assert len(names) == len(
        dataset), 'names and dataset must be of equal size'
    math_mode = '$$' if args.mode == 'equation' else '$'
    os.makedirs(args.out, exist_ok=True)
    indices = np.array([
        int(os.path.basename(img).split('.')[0])
        for img in glob.glob(os.path.join(args.out, '*.png'))
    ])

    valid = [i for i, j in enumerate(names) if j not in indices]
    dataset = dataset[valid]
    names = names[valid]
    order = np.random.permutation(len(dataset)) if args.shuffle else np.arange(
        len(dataset))
    faulty = []
    for i in tqdm(range(0, len(dataset), args.batchsize)):
        batch = dataset[order[i:i + args.batchsize]]
        #batch = [x for j, x in enumerate(batch) if order[i+j] not in indices]
        if len(batch) == 0:
            continue
        math = [math_mode + x + math_mode for x in batch if x != '']
        #print('\n', i, len(math), '\n'.join(math))
        if len(args.font) > 1:
            font = np.random.choice(args.font)
        else:
            font = args.font[0]
        if len(args.dpi) > 1:
            dpi = np.random.choice(np.arange(min(args.dpi), max(args.dpi)))
        else:
            dpi = args.dpi[0]
        if len(math) > 0:
            try:
                if args.preprocess:
                    pngs = tex2pil(math, dpi=dpi, font=font)
                else:
                    pngs = Latex(math, dpi=dpi,
                                 font=font).write(return_bytes=False)
            except Exception as e:
                #print(e)
                #print(math)
                #raise e
                faulty.extend(list(names[order[i:i + args.batchsize]]))
                continue

            for j, k in enumerate(range(i, i + len(pngs))):
                outpath = os.path.join(args.out, '%07d.png' % names[order[k]])
                if args.preprocess:
                    try:
                        data = np.asarray(pngs[j])
                        # print(data.shape)
                        gray = 255 * (data[..., 0] < 128).astype(
                            np.uint8)  # To invert the text to white
                        coords = cv2.findNonZero(
                            gray)  # Find all non-zero points (text)
                        a, b, w, h = cv2.boundingRect(
                            coords)  # Find minimum spanning bounding box
                        rect = data[b:b + h, a:a + w]
                        im = Image.fromarray((255 - rect[..., -1]).astype(
                            np.uint8)).convert('L')
                        dims = []
                        for x in [w, h]:
                            div, mod = divmod(x, args.divable)
                            dims.append(args.divable * (div +
                                                        (1 if mod > 0 else 0)))
                        padded = Image.new('L', dims, 255)
                        padded.paste(im, im.getbbox())
                        padded.save(outpath)
                    except Exception as e:
                        print(e)
                else:
                    shutil.move(pngs[j], outpath)

    return np.array(faulty)
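A sketch of the `args` object the function expects (field names are taken from the attributes the code reads; the values are illustrative):

from types import SimpleNamespace

args = SimpleNamespace(mode='equation', out='./imgs', divable=32, batchsize=100,
                       dpi=[110, 170], font=['Latin Modern Math'], preprocess=True,
                       shuffle=False)
faulty = render_dataset(dataset, names, args)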
Example #11
    def train(self,
              train_dataset,
              output_dir,
              show_running_loss=True,
              eval_df=None,
              verbose=True):
        """
        Trains the model on train_dataset.
        Utility function to be used by the train_model() method. Not intended to be used directly.
        """

        device = self.device
        model = self.model
        args = self.args

        tb_writer = SummaryWriter(logdir=args["tensorboard_dir"])
        train_sampler = RandomSampler(train_dataset)
        train_dataloader = DataLoader(train_dataset,
                                      sampler=train_sampler,
                                      batch_size=args["train_batch_size"])

        t_total = len(train_dataloader) // args[
            "gradient_accumulation_steps"] * args["num_train_epochs"]

        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                args["weight_decay"],
            },
            {
                "params": [
                    p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                0.0,
            },
        ]

        warmup_steps = math.ceil(t_total * args["warmup_ratio"])
        args["warmup_steps"] = warmup_steps if args[
            "warmup_steps"] == 0 else args["warmup_steps"]

        optimizer = AdamW(
            optimizer_grouped_parameters,
            lr=args["learning_rate"],
            eps=args["adam_epsilon"],
        )
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=args["warmup_steps"],
            num_training_steps=t_total)

        if args["fp16"]:
            try:
                from apex import amp
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
                )

            model, optimizer = amp.initialize(model,
                                              optimizer,
                                              opt_level=args["fp16_opt_level"])

        if args["n_gpu"] > 1:
            model = torch.nn.DataParallel(model)

        global_step = 0
        tr_loss, logging_loss = 0.0, 0.0
        model.zero_grad()
        train_iterator = trange(int(args["num_train_epochs"]),
                                desc="Epoch",
                                disable=args["silent"])
        epoch_number = 0
        best_eval_metric = None
        early_stopping_counter = 0

        if args["evaluate_during_training"]:
            training_progress_scores = self._create_training_progress_scores()
        if args["wandb_project"]:
            wandb.init(project=args["wandb_project"],
                       config={**args},
                       **args["wandb_kwargs"])
            wandb.watch(self.model)

        model.train()
        for _ in train_iterator:
            # epoch_iterator = tqdm(train_dataloader, desc="Iteration")
            for step, batch in enumerate(
                    tqdm(train_dataloader,
                         desc="Current iteration",
                         disable=args["silent"])):
                batch = tuple(t.to(device) for t in batch)

                inputs = self._get_inputs_dict(batch)

                outputs = model(**inputs)
                # model outputs are always tuple in pytorch-transformers (see doc)
                loss = outputs[0]

                if args["n_gpu"] > 1:
                    loss = loss.mean(
                    )  # mean() to average on multi-gpu parallel training

                current_loss = loss.item()

                if show_running_loss:
                    print("\rRunning loss: %f" % loss, end="")

                if args["gradient_accumulation_steps"] > 1:
                    loss = loss / args["gradient_accumulation_steps"]

                if args["fp16"]:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                    # torch.nn.utils.clip_grad_norm_(
                    #     amp.master_params(optimizer), args["max_grad_norm"]
                    # )
                else:
                    loss.backward()
                    # torch.nn.utils.clip_grad_norm_(
                    #     model.parameters(), args["max_grad_norm"]
                    # )

                tr_loss += loss.item()
                if (step + 1) % args["gradient_accumulation_steps"] == 0:
                    if args["fp16"]:
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(optimizer),
                            args["max_grad_norm"])
                    else:
                        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                       args["max_grad_norm"])
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    model.zero_grad()
                    global_step += 1

                    if args["logging_steps"] > 0 and global_step % args[
                            "logging_steps"] == 0:
                        # Log metrics
                        tb_writer.add_scalar("lr",
                                             scheduler.get_lr()[0],
                                             global_step)
                        tb_writer.add_scalar(
                            "loss",
                            (tr_loss - logging_loss) / args["logging_steps"],
                            global_step,
                        )
                        logging_loss = tr_loss
                        if args["wandb_project"]:
                            wandb.log({
                                "Training loss": current_loss,
                                "lr": scheduler.get_lr()[0],
                                "global_step": global_step,
                            })

                    if args["save_steps"] > 0 and global_step % args[
                            "save_steps"] == 0:
                        # Save model checkpoint
                        output_dir_current = os.path.join(
                            output_dir, "checkpoint-{}".format(global_step))

                        self._save_model(output_dir_current, model=model)

                    if args["evaluate_during_training"] and (
                            args["evaluate_during_training_steps"] > 0
                            and global_step %
                            args["evaluate_during_training_steps"] == 0):
                        # Only evaluate when single GPU otherwise metrics may not average well
                        results, _, _ = self.eval_model(eval_df, verbose=True)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value,
                                                 global_step)

                        output_dir_current = os.path.join(
                            output_dir, "checkpoint-{}".format(global_step))

                        os.makedirs(output_dir_current, exist_ok=True)

                        if args["save_eval_checkpoints"]:
                            self._save_model(output_dir_current,
                                             model=model,
                                             results=results)

                        training_progress_scores["global_step"].append(
                            global_step)
                        training_progress_scores["train_loss"].append(
                            current_loss)
                        for key in results:
                            training_progress_scores[key].append(results[key])
                        report = pd.DataFrame(training_progress_scores)
                        report.to_csv(
                            os.path.join(args["output_dir"],
                                         "training_progress_scores.csv"),
                            index=False,
                        )

                        if args["wandb_project"]:
                            wandb.log(
                                self._get_last_metrics(
                                    training_progress_scores))

                        if not best_eval_metric:
                            best_eval_metric = results[
                                args["early_stopping_metric"]]
                            self._save_model(args["best_model_dir"],
                                             model=model,
                                             results=results)
                        if best_eval_metric and args[
                                "early_stopping_metric_minimize"]:
                            if results[args[
                                    "early_stopping_metric"]] - best_eval_metric < args[
                                        "early_stopping_delta"]:
                                best_eval_metric = results[
                                    args["early_stopping_metric"]]
                                self._save_model(args["best_model_dir"],
                                                 model=model,
                                                 results=results)
                                early_stopping_counter = 0
                            else:
                                if args["use_early_stopping"]:
                                    if early_stopping_counter < args[
                                            "early_stopping_patience"]:
                                        early_stopping_counter += 1
                                        if verbose:
                                            logger.info(
                                                f" No improvement in {args['early_stopping_metric']}"
                                            )
                                            logger.info(
                                                f" Current step: {early_stopping_counter}"
                                            )
                                            logger.info(
                                                f" Early stopping patience: {args['early_stopping_patience']}"
                                            )
                                    else:
                                        if verbose:
                                            logger.info(
                                                f" Patience of {args['early_stopping_patience']} steps reached"
                                            )
                                            logger.info(
                                                " Training terminated.")
                                            train_iterator.close()
                                        return global_step, tr_loss / global_step
                        else:
                            if results[args[
                                    "early_stopping_metric"]] - best_eval_metric > args[
                                        "early_stopping_delta"]:
                                best_eval_metric = results[
                                    args["early_stopping_metric"]]
                                self._save_model(args["best_model_dir"],
                                                 model=model,
                                                 results=results)
                                early_stopping_counter = 0
                            else:
                                if args["use_early_stopping"]:
                                    if early_stopping_counter < args[
                                            "early_stopping_patience"]:
                                        early_stopping_counter += 1
                                        if verbose:
                                            logger.info(
                                                f" No improvement in {args['early_stopping_metric']}"
                                            )
                                            logger.info(
                                                f" Current step: {early_stopping_counter}"
                                            )
                                            logger.info(
                                                f" Early stopping patience: {args['early_stopping_patience']}"
                                            )
                                    else:
                                        if verbose:
                                            logger.info(
                                                f" Patience of {args['early_stopping_patience']} steps reached"
                                            )
                                            logger.info(
                                                " Training terminated.")
                                        train_iterator.close()
                                        return global_step, tr_loss / global_step

            epoch_number += 1
            output_dir_current = os.path.join(
                output_dir,
                "checkpoint-{}-epoch-{}".format(global_step, epoch_number))

            if args["save_model_every_epoch"] or args[
                    "evaluate_during_training"]:
                os.makedirs(output_dir_current, exist_ok=True)

            if args["save_model_every_epoch"]:
                self._save_model(output_dir_current, model=model)

            if args["evaluate_during_training"]:
                results, _, _ = self.eval_model(eval_df, verbose=True)

                self._save_model(output_dir_current, results=results)

                training_progress_scores["global_step"].append(global_step)
                training_progress_scores["train_loss"].append(current_loss)
                for key in results:
                    training_progress_scores[key].append(results[key])
                report = pd.DataFrame(training_progress_scores)
                report.to_csv(os.path.join(args["output_dir"],
                                           "training_progress_scores.csv"),
                              index=False)

                if best_eval_metric is None:
                    best_eval_metric = results[args["early_stopping_metric"]]
                    self._save_model(args["best_model_dir"],
                                     model=model,
                                     results=results)
                if args["early_stopping_metric_minimize"]:
                    if results[args[
                            "early_stopping_metric"]] - best_eval_metric < args[
                                "early_stopping_delta"]:
                        best_eval_metric = results[
                            args["early_stopping_metric"]]
                        self._save_model(args["best_model_dir"],
                                         model=model,
                                         results=results)
                        early_stopping_counter = 0
                else:
                    if results[args[
                            "early_stopping_metric"]] - best_eval_metric > args[
                                "early_stopping_delta"]:
                        best_eval_metric = results[
                            args["early_stopping_metric"]]
                        self._save_model(args["best_model_dir"],
                                         model=model,
                                         results=results)
                        early_stopping_counter = 0

        return global_step, tr_loss / global_step
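
The early-stopping bookkeeping above repeats the same patience/delta pattern for the minimize and maximize branches. A minimal sketch of one common variant of that pattern in isolation, with illustrative names (`should_stop`, `state`) that are not part of the library:

def should_stop(score, state, minimize=True, delta=0.0, patience=3):
    """Update early-stopping state with a new eval score; return True to stop."""
    best = state.get("best")
    improved = best is None or (
        (best - score > delta) if minimize else (score - best > delta)
    )
    if improved:
        state["best"] = score
        state["counter"] = 0          # reset patience on improvement
    else:
        state["counter"] = state.get("counter", 0) + 1
    return state.get("counter", 0) >= patience

state = {}
for step, loss in enumerate([0.9, 0.7, 0.69, 0.68, 0.7]):
    if should_stop(loss, state, minimize=True, delta=0.05, patience=2):
        print(f"stopping at step {step}")   # stops at step 3
        break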
Example #12
0
    def _prediction_loop(
        self, dataloader: DataLoader, description: str, prediction_loss_only: Optional[bool] = None
    ) -> PredictionOutput:
        """
        Prediction/evaluation loop, shared by `evaluate()` and `predict()`.

        Works both with or without labels.
        """

        prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else self.prediction_loss_only

        model = self.model
        # multi-gpu eval
        if self.args.n_gpu > 1:
            model = torch.nn.DataParallel(model)
        # Note: in torch.distributed mode, there's no point in wrapping the model
        # inside a DistributedDataParallel as we'll be under `no_grad` anyways.

        batch_size = dataloader.batch_size
        logger.info("***** Running %s *****", description)
        logger.info("  Num examples = %d", self.num_examples(dataloader))
        logger.info("  Batch size = %d", batch_size)
        eval_losses: List[float] = []
        preds: Optional[torch.Tensor] = None
        label_ids: Optional[torch.Tensor] = None
        model.eval()

        if is_torch_tpu_available():
            dataloader = pl.ParallelLoader(dataloader, [self.args.device]).per_device_loader(self.args.device)

        if self.args.past_index >= 0:
            past = None

        for inputs in tqdm(dataloader, desc=description):
            has_labels = any(inputs.get(k) is not None for k in ["labels", "lm_labels", "masked_lm_labels"])

            for k, v in inputs.items():
                if isinstance(v, torch.Tensor):
                    inputs[k] = v.to(self.args.device)
            if self.args.past_index >= 0:
                inputs["mems"] = past

            with torch.no_grad():
                outputs = model(**inputs)
                if has_labels:
                    step_eval_loss, logits = outputs[:2]
                    eval_losses += [step_eval_loss.mean().item()]
                else:
                    logits = outputs[0]
                if self.args.past_index >= 0:
                    past = outputs[self.args.past_index if has_labels else self.args.past_index - 1]

            if not prediction_loss_only:
                if preds is None:
                    preds = logits.detach()
                else:
                    preds = torch.cat((preds, logits.detach()), dim=0)
                if inputs.get("labels") is not None:
                    if label_ids is None:
                        label_ids = inputs["labels"].detach()
                    else:
                        label_ids = torch.cat((label_ids, inputs["labels"].detach()), dim=0)

        if self.args.local_rank != -1:
            # In distributed mode, concatenate all results from all nodes:
            if preds is not None:
                preds = self.distributed_concat(preds, num_total_examples=self.num_examples(dataloader))
            if label_ids is not None:
                label_ids = self.distributed_concat(label_ids, num_total_examples=self.num_examples(dataloader))
        elif is_torch_tpu_available():
            # tpu-comment: Get all predictions and labels from all worker shards of eval dataset
            if preds is not None:
                preds = xm.mesh_reduce("eval_preds", preds, torch.cat)
            if label_ids is not None:
                label_ids = xm.mesh_reduce("eval_label_ids", label_ids, torch.cat)

        # Finally, turn the aggregated tensors into numpy arrays.
        if preds is not None:
            preds = preds.cpu().numpy()
        if label_ids is not None:
            label_ids = label_ids.cpu().numpy()

        if self.compute_metrics is not None and preds is not None and label_ids is not None:
            metrics = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids))
        else:
            metrics = {}
        if len(eval_losses) > 0:
            metrics["eval_loss"] = np.mean(eval_losses)

        # Prefix all keys with eval_
        for key in list(metrics.keys()):
            if not key.startswith("eval_"):
                metrics[f"eval_{key}"] = metrics.pop(key)

        return PredictionOutput(predictions=preds, label_ids=label_ids, metrics=metrics)
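
One detail worth noting: `_prediction_loop` grows `preds` with `torch.cat` on every batch, which re-copies the accumulated tensor each time. A common alternative (a sketch, not what this Trainer does) is to append per-batch outputs to a list and concatenate once:

import torch

chunks = []
for _ in range(4):                      # stand-in for iterating a dataloader
    logits = torch.randn(8, 10)         # stand-in for model(**inputs)[0]
    chunks.append(logits.detach())
preds = torch.cat(chunks, dim=0)        # one big copy instead of one per batch
print(preds.shape)                      # torch.Size([32, 10])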
Example #13
0
def compute_IDFS(output_folder, cut):
    config = wandb.config
    if not os.path.isdir(output_folder):
        os.makedirs(output_folder)
    cpus = max(1, config.number_of_cpus)
    logging.info("Computing IDF with %i cpus" % cpus)
    excess_lines = config.corpus_size % cpus
    number_of_chunks = cpus
    if excess_lines > 0:
        number_of_chunks = cpus - 1
        excess_lines = config.corpus_size % number_of_chunks
    lines_per_chunk = config.corpus_size // number_of_chunks
    logging.info("{}  lines per chunk".format(lines_per_chunk))
    logging.info("{}  lines for last chunk".format(excess_lines))
    assert (number_of_chunks * lines_per_chunk +
            excess_lines) == config.corpus_size

    if cut == 'cut':
        docs_path = os.path.join(config.data_home,
                                 "docs/msmarco-docs.tokenized.cut.tsv")
    else:
        docs_path = os.path.join(config.data_home,
                                 "docs/msmarco-docs.tokenized.tsv")

    block_offset = dict()
    if cpus < 2:
        block_offset[0] = 0
    else:  # Compute offset for documents for each chunk to be processed
        output_file = os.path.join(output_folder,
                                   "blocks_offset_{}-cpus".format(cpus))
        if not os.path.isfile(output_file):
            pbar = tqdm(total=config.corpus_size + 1,
                        desc="Computing chunks for each processor")
            with open(docs_path) as f:
                current_chunk = 0
                counter = 0
                line = True
                while line:
                    if counter % lines_per_chunk == 0:
                        block_offset[current_chunk] = f.tell()
                        current_chunk += 1
                    line = f.readline()
                    pbar.update()
                    counter += 1
            pbar.close()
            pickle.dump(block_offset, open(output_file, 'wb'))
        else:
            block_offset = pickle.load(open(output_file, 'rb'))

    if cpus < 2:  # Single CPU, compute directly.
        process_chunk(0, block_offset, docs_path, lines_per_chunk,
                      output_folder)
    else:
        pbar = tqdm(total=cpus, position=0)

        def update(*a):  # Update progress bar
            pbar.update()

        pool = mp.Pool(cpus)
        jobs = []
        for i in range(len(block_offset)):
            jobs.append(
                pool.apply_async(process_chunk,
                                 args=(i, block_offset, docs_path,
                                       lines_per_chunk, output_folder),
                                 callback=update))
        for job in jobs:
            job.get()
        pool.close()
        pbar.close()
    full_IDFS = Counter()
    for i in range(len(block_offset)):
        _idf = pickle.load(
            open(os.path.join(output_folder, "IDFS-{}".format(i)), 'rb'))
        for k in _idf:
            full_IDFS[k] += _idf[k]
        os.remove(os.path.join(output_folder, "IDFS-{}".format(i)))
    pickle.dump(
        full_IDFS,
        open(os.path.join(output_folder, "IDFS-FULL-{}".format(cut)), 'wb'))
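
`process_chunk` is not shown in this example; a plausible sketch, assuming each TSV line is `docid<TAB>tokens...` and that per-chunk document frequencies are pickled as `IDFS-<i>` (the name the merge loop above expects):

import os
import pickle
from collections import Counter

def process_chunk(i, block_offset, docs_path, lines_per_chunk, output_folder):
    """Count, within one byte-delimited chunk, how many docs contain each term."""
    dfs = Counter()
    with open(docs_path) as f:
        f.seek(block_offset[i])                  # jump straight to this worker's chunk
        for _ in range(lines_per_chunk):
            line = f.readline()
            if not line:                         # last chunk may be shorter
                break
            tokens = line.rstrip("\n").split("\t")[-1].split()
            for term in set(tokens):             # set(): document frequency, not term frequency
                dfs[term] += 1
    pickle.dump(dfs, open(os.path.join(output_folder, "IDFS-{}".format(i)), "wb"))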
Example #14
0
bboxs = np.stack(df['bbox'].apply(lambda x: np.fromstring(x[1:-1], sep=',')))
for i, column in enumerate(['x', 'y', 'w', 'h']):
    df[column] = bboxs[:, i]
df.drop(columns=['bbox'], inplace=True)
df['x_center'] = df['x'] + df['w'] / 2
df['y_center'] = df['y'] + df['h'] / 2
df['classes'] = 0
from tqdm.auto import tqdm
import shutil as sh
df = df[['image_id', 'x', 'y', 'w', 'h', 'x_center', 'y_center', 'classes']]

source = 'train'
if True:
    for fold in [0]:
        val_index = index[len(index) * fold // 5:len(index) * (fold + 1) // 5]
        for name, mini in tqdm(df.groupby('image_id')):
            if name in val_index:
                path2save = 'val2017/'
            else:
                path2save = 'train2017/'
            if not os.path.exists('convertor/fold{}/labels/'.format(fold) +
                                  path2save):
                os.makedirs('convertor/fold{}/labels/'.format(fold) +
                            path2save)
            with open(
                    'convertor/fold{}/labels/'.format(fold) + path2save +
                    name + ".txt", 'w+') as f:
                row = mini[['classes', 'x_center', 'y_center', 'w',
                            'h']].astype(float).values
                row = row / 1024
                row = row.astype(str)
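
The snippet is cut off just before the rows are written out. The usual continuation of this conversion (an assumption, since the original is truncated here; the image source path in particular is hypothetical) writes one space-separated `class x_center y_center w h` line per box and copies the image into the fold directory:

                for r in row:
                    f.write(' '.join(r) + '\n')      # YOLO label format, normalized to [0, 1]
            if not os.path.exists('convertor/fold{}/images/'.format(fold) + path2save):
                os.makedirs('convertor/fold{}/images/'.format(fold) + path2save)
            sh.copy('{}/{}.jpg'.format(source, name),   # source image path is an assumption
                    'convertor/fold{}/images/'.format(fold) + path2save + name + '.jpg')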
Example #15
0
    def train(self, model_path: Optional[str] = None):
        """
        Main training entry point.

        Args:
            model_path (:obj:`str`, `optional`):
                Local path to the model if the model to train has been instantiated from a local path. If present,
                training will resume from the optimizer/scheduler states loaded here.
        """
        train_dataloader = self.get_train_dataloader()
        if self.args.max_steps > 0:
            t_total = self.args.max_steps
            num_train_epochs = (
                self.args.max_steps // (len(train_dataloader) // self.args.gradient_accumulation_steps) + 1
            )
        else:
            t_total = int(len(train_dataloader) // self.args.gradient_accumulation_steps * self.args.num_train_epochs)
            num_train_epochs = self.args.num_train_epochs

        optimizer, scheduler = self.get_optimizers(num_training_steps=t_total)

        # Check if saved optimizer or scheduler states exist
        if (
            model_path is not None
            and os.path.isfile(os.path.join(model_path, "optimizer.pt"))
            and os.path.isfile(os.path.join(model_path, "scheduler.pt"))
        ):
            # Load in optimizer and scheduler states
            optimizer.load_state_dict(
                torch.load(os.path.join(model_path, "optimizer.pt"), map_location=self.args.device)
            )
            scheduler.load_state_dict(torch.load(os.path.join(model_path, "scheduler.pt")))

        model = self.model
        if self.args.fp16:
            if not is_apex_available():
                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
            model, optimizer = amp.initialize(model, optimizer, opt_level=self.args.fp16_opt_level)

        # multi-gpu training (should be after apex fp16 initialization)
        if self.args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

        # Distributed training (should be after apex fp16 initialization)
        if self.args.local_rank != -1:
            model = torch.nn.parallel.DistributedDataParallel(
                model,
                device_ids=[self.args.local_rank],
                output_device=self.args.local_rank,
                find_unused_parameters=True,
            )

        if self.tb_writer is not None:
            self.tb_writer.add_text("args", self.args.to_json_string())
            self.tb_writer.add_hparams(self.args.to_sanitized_dict(), metric_dict={})

        # Train!
        if is_torch_tpu_available():
            total_train_batch_size = self.args.train_batch_size * xm.xrt_world_size()
        else:
            total_train_batch_size = (
                self.args.train_batch_size
                * self.args.gradient_accumulation_steps
                * (torch.distributed.get_world_size() if self.args.local_rank != -1 else 1)
            )
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", self.num_examples(train_dataloader))
        logger.info("  Num Epochs = %d", num_train_epochs)
        logger.info("  Instantaneous batch size per device = %d", self.args.per_device_train_batch_size)
        logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d", total_train_batch_size)
        logger.info("  Gradient Accumulation steps = %d", self.args.gradient_accumulation_steps)
        logger.info("  Total optimization steps = %d", t_total)

        self.global_step = 0
        self.epoch = 0
        epochs_trained = 0
        steps_trained_in_current_epoch = 0
        # Check if continuing training from a checkpoint
        if model_path is not None:
            # set global_step to global_step of last saved checkpoint from model path
            try:
                self.global_step = int(model_path.split("-")[-1].split("/")[0])
                epochs_trained = self.global_step // (len(train_dataloader) // self.args.gradient_accumulation_steps)
                steps_trained_in_current_epoch = self.global_step % (
                    len(train_dataloader) // self.args.gradient_accumulation_steps
                )

                logger.info("  Continuing training from checkpoint, will skip to saved global_step")
                logger.info("  Continuing training from epoch %d", epochs_trained)
                logger.info("  Continuing training from global step %d", self.global_step)
                logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
            except ValueError:
                self.global_step = 0
                logger.info("  Starting fine-tuning.")

        tr_loss = 0.0
        logging_loss = 0.0
        model.zero_grad()
        train_iterator = trange(
            epochs_trained, int(num_train_epochs), desc="Epoch", disable=not self.is_local_master()
        )
        for epoch in train_iterator:
            if isinstance(train_dataloader, DataLoader) and isinstance(train_dataloader.sampler, DistributedSampler):
                train_dataloader.sampler.set_epoch(epoch)

            if is_torch_tpu_available():
                parallel_loader = pl.ParallelLoader(train_dataloader, [self.args.device]).per_device_loader(
                    self.args.device
                )
                epoch_iterator = tqdm(parallel_loader, desc="Iteration", disable=not self.is_local_master())
            else:
                epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=not self.is_local_master())

            # Reset the past mems state at the beginning of each epoch if necessary.
            if self.args.past_index >= 0:
                self._past = None

            for step, inputs in enumerate(epoch_iterator):

                # Skip past any already trained steps if resuming training
                if steps_trained_in_current_epoch > 0:
                    steps_trained_in_current_epoch -= 1
                    continue

                tr_loss += self._training_step(model, inputs, optimizer)

                if (step + 1) % self.args.gradient_accumulation_steps == 0 or (
                    # last step in epoch but step is always smaller than gradient_accumulation_steps
                    len(epoch_iterator) <= self.args.gradient_accumulation_steps
                    and (step + 1) == len(epoch_iterator)
                ):
                    if self.args.fp16:
                        torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), self.args.max_grad_norm)
                    else:
                        torch.nn.utils.clip_grad_norm_(model.parameters(), self.args.max_grad_norm)

                    if is_torch_tpu_available():
                        xm.optimizer_step(optimizer)
                    else:
                        optimizer.step()

                    scheduler.step()
                    model.zero_grad()
                    self.global_step += 1
                    self.epoch = epoch + (step + 1) / len(epoch_iterator)

                    if (self.args.logging_steps > 0 and self.global_step % self.args.logging_steps == 0) or (
                        self.global_step == 1 and self.args.logging_first_step
                    ):
                        logs: Dict[str, float] = {}
                        logs["loss"] = (tr_loss - logging_loss) / self.args.logging_steps
                        # backward compatibility for pytorch schedulers
                        logs["learning_rate"] = (
                            scheduler.get_last_lr()[0]
                            if version.parse(torch.__version__) >= version.parse("1.4")
                            else scheduler.get_lr()[0]
                        )
                        logging_loss = tr_loss

                        self._log(logs)

                    if self.args.evaluate_during_training and self.global_step % self.args.eval_steps == 0:
                        self.evaluate()

                    if self.args.save_steps > 0 and self.global_step % self.args.save_steps == 0:
                        # In all cases (even distributed/parallel), self.model is always a reference
                        # to the model we want to save.
                        if hasattr(model, "module"):
                            assert model.module is self.model
                        else:
                            assert model is self.model
                        # Save model checkpoint
                        output_dir = os.path.join(self.args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{self.global_step}")

                        self.save_model(output_dir)

                        if self.is_world_master():
                            self._rotate_checkpoints()

                        if is_torch_tpu_available():
                            xm.rendezvous("saving_optimizer_states")
                            xm.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                            xm.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                        elif self.is_world_master():
                            torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                            torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))

                if self.args.max_steps > 0 and self.global_step > self.args.max_steps:
                    epoch_iterator.close()
                    break
            if self.args.max_steps > 0 and self.global_step > self.args.max_steps:
                train_iterator.close()
                break
            if self.args.tpu_metrics_debug or self.args.debug:
                # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.)
                xm.master_print(met.metrics_report())

        if self.tb_writer:
            self.tb_writer.close()
        if self.args.past_index and hasattr(self, "_past"):
            # Clean the state at the end of training
            delattr(self, "_past")

        logger.info("\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n")
        return TrainOutput(self.global_step, tr_loss / self.global_step)
Example #16
0
                               'DFIPS': str
                           })
census_tracts = pd.read_csv('united-states-commutes/census_tracts_2010.csv',
                            dtype={'GEOID': str})
census_tracts = census_tracts[census_tracts['USPS'].isin(filterUSPS)]
commute_data = commute_data[commute_data['OFIPS'].isin(
    census_tracts['GEOID'].unique())]
commute_data = commute_data[commute_data['DFIPS'].isin(
    census_tracts['GEOID'].unique())]
census_tracts = census_tracts.sort_values('POP10', ascending=True)
census_tracts.reset_index(inplace=True)
commute_data.reset_index(inplace=True)

attribution = np.empty(len(census_tracts))
attribution[:] = np.nan
for index, row in tqdm(census_tracts.iterrows(), total=census_tracts.shape[0]):
    attribution[index] = int(str(row['GEOID'])[:5])
census_tracts['county'] = attribution
counties_names = np.unique(attribution)
n_counties = len(counties_names)

pop = np.zeros(n_counties)
state_USPS = []
for i, ct in enumerate(counties_names):
    pop[i] = census_tracts[census_tracts['county'] == ct]['POP10'].sum()
    state_USPS.append(
        census_tracts[census_tracts['county'] == ct].USPS.iloc[0])

groups = {'geoid': counties_names, 'pop2010': pop, 'stateUSPS': state_USPS}
geodata = pd.DataFrame.from_dict(groups)
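
Iterating with `iterrows()` to take the first five GEOID digits works but is slow for tens of thousands of tracts; a vectorized equivalent of the same county-FIPS derivation (a sketch, not part of the original script):

# County FIPS = first 5 digits of the tract GEOID; GEOID was read with dtype=str above.
census_tracts['county'] = census_tracts['GEOID'].str[:5].astype(int)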
Example #17
0
def predict(model, ds_test, batch_size, device='cpu', scaler=None):
    """
    Gather all predictions into xarray.
    
    When we generate prediction in a sequence to sequence model we start at a time then predict
    N steps into the future. So we have 2 dimensions: source time, target time.

    But we also care about how far we were predicting into the future, so we have 3 dimensions: source time, target time, time ahead.

    It's hard to use pandas for data with virtual dimensions so we will use xarray. Xarray has an interface similar to pandas but also allows coordinates which are virtual dimensions.
    """
    load_test = torch.utils.data.dataloader.DataLoader(ds_test,
                                                       batch_size=batch_size)
    freq = ds_test.df.index.freq
    xrs = []
    for i, batch in enumerate(tqdm(load_test, desc='predict', leave=False)):
        model.eval()
        with torch.no_grad():
            x_past, y_past, x_future, y_future = [d.to(device) for d in batch]
            y_dist, extra = model(x_past, y_past, x_future)
            nll = -y_dist.log_prob(y_future)

            # Convert to numpy
            mean = to_numpy(y_dist.loc.squeeze(-1))
            std = to_numpy(y_dist.scale.squeeze(-1))
            nll = to_numpy(nll.squeeze(-1))
            y_future = to_numpy(y_future.squeeze(-1))
            y_past = to_numpy(y_past.squeeze(-1))

        # Make an xarray.Dataset for the data
        bs = y_future.shape[0]
        wp = ds_test.window_past
        t_source = ds_test.df.index[wp + i * bs - 1:wp + i * bs + bs -
                                    1].values
        t_ahead = pd.timedelta_range(1,
                                     periods=ds_test.window_future,
                                     freq=freq).values
        t_behind = pd.timedelta_range(end=0,
                                      periods=ds_test.window_past,
                                      freq=freq)
        xr_out = xr.Dataset(
            {
                # Format> name: ([dimensions,...], array),
                "y_past": ([
                    "t_source",
                    "t_behind",
                ], y_past),
                "nll": ([
                    "t_source",
                    "t_ahead",
                ], nll),
                "y_pred": ([
                    "t_source",
                    "t_ahead",
                ], mean),
                "y_pred_std": ([
                    "t_source",
                    "t_ahead",
                ], std),
                "y_true": ([
                    "t_source",
                    "t_ahead",
                ], y_future),
            },
            coords={
                "t_source": t_source,
                "t_ahead": t_ahead,
                "t_behind": t_behind
            },
            attrs={
                'freq': str(ds_test.freq),
                "model": str(type(model)),
                "targets": ds_test.columns_target
            })
        xrs.append(xr_out)

    # Join all batches
    ds_preds = xr.concat(xrs, dim="t_source")

    # undo scaling on y
    if scaler:
        ds_preds['y_pred_std'].values = ds_preds.y_pred_std * scaler.scale_
        ds_preds['y_past'].values = scaler.inverse_transform(ds_preds.y_past)
        ds_preds['y_pred'].values = scaler.inverse_transform(ds_preds.y_pred)
        ds_preds['y_true'].values = scaler.inverse_transform(ds_preds.y_true)

    # Add some derived coordinates; in the xarray repr they appear unbolded.
    # The target time is a function of the source time and how far ahead we predict.
    ds_preds = ds_preds.assign_coords(t_target=ds_preds.t_source +
                                      ds_preds.t_ahead)

    ds_preds = ds_preds.assign_coords(t_past=ds_preds.t_source +
                                      ds_preds.t_behind)

    # Some plots don't like timedeltas, so let's make a coordinate for time ahead in hours.
    ds_preds = ds_preds.assign_coords(
        t_ahead_hours=(ds_preds.t_ahead * 1.0e-9 / 60 / 60).astype(float))
    return ds_preds
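
The "virtual dimension" idea from the docstring is just xarray's coordinate machinery. A tiny self-contained illustration of a derived coordinate like `t_target` above:

import numpy as np
import pandas as pd
import xarray as xr

t_source = pd.date_range('2021-01-01', periods=3, freq='H')
t_ahead = pd.timedelta_range('1H', periods=4, freq='H')
ds = xr.Dataset(
    {'y_pred': (['t_source', 't_ahead'], np.random.rand(3, 4))},
    coords={'t_source': t_source, 't_ahead': t_ahead},
)
# A derived coordinate: defined for every (source, ahead) pair without adding a dimension.
ds = ds.assign_coords(t_target=ds.t_source + ds.t_ahead)
print(ds.t_target.shape)    # (3, 4)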
Example #18
0
def import_from_context(context, num_traces, log, parameters=None):
    """
    Import a XES log from an iterparse context

    Parameters
    --------------
    context
        Iterparse context
    num_traces
        Number of traces of the XES log
    log
        Event log (empty)
    parameters
        Parameters of the algorithm

    Returns
    --------------
    log
        Event log (filled with the contents of the XES log)
    """
    if parameters is None:
        parameters = {}

    max_no_traces_to_import = exec_utils.get_param_value(
        Parameters.MAX_TRACES, parameters, sys.maxsize)
    timestamp_sort = exec_utils.get_param_value(Parameters.TIMESTAMP_SORT,
                                                parameters, False)
    timestamp_key = exec_utils.get_param_value(
        Parameters.TIMESTAMP_KEY, parameters,
        xes_constants.DEFAULT_TIMESTAMP_KEY)
    reverse_sort = exec_utils.get_param_value(Parameters.REVERSE_SORT,
                                              parameters, False)
    show_progress_bar = exec_utils.get_param_value(
        Parameters.SHOW_PROGRESS_BAR, parameters, True)

    date_parser = dt_parser.get()
    progress = None
    if pkgutil.find_loader("tqdm") and show_progress_bar:
        from tqdm.auto import tqdm
        progress = tqdm(total=num_traces,
                        desc="parsing log, completed traces :: ")

    trace = None
    event = None

    tree = {}
    compression_dictio = {}

    for tree_event, elem in context:
        if tree_event == _EVENT_START:  # starting to read
            parent = tree[
                elem.getparent()] if elem.getparent() in tree else None

            if elem.tag.endswith(xes_constants.TAG_STRING):
                if parent is not None:
                    tree = __parse_attribute(elem, parent,
                                             elem.get(xes_constants.KEY_KEY),
                                             elem.get(xes_constants.KEY_VALUE),
                                             tree, compression_dictio)
                continue

            elif elem.tag.endswith(xes_constants.TAG_DATE):
                try:
                    dt = date_parser.apply(elem.get(xes_constants.KEY_VALUE))
                    tree = __parse_attribute(elem, parent,
                                             elem.get(xes_constants.KEY_KEY),
                                             dt, tree, compression_dictio)
                except TypeError:
                    logging.info("failed to parse date: " +
                                 str(elem.get(xes_constants.KEY_VALUE)))
                except ValueError:
                    logging.info("failed to parse date: " +
                                 str(elem.get(xes_constants.KEY_VALUE)))
                continue

            elif elem.tag.endswith(xes_constants.TAG_EVENT):
                if event is not None:
                    raise SyntaxError(
                        'file contains <event> in another <event> tag')
                event = Event()
                tree[elem] = event
                continue

            elif elem.tag.endswith(xes_constants.TAG_TRACE):
                if len(log) >= max_no_traces_to_import:
                    break
                if trace is not None:
                    raise SyntaxError(
                        'file contains <trace> in another <trace> tag')
                trace = Trace()
                tree[elem] = trace.attributes
                continue

            elif elem.tag.endswith(xes_constants.TAG_FLOAT):
                if parent is not None:
                    try:
                        val = float(elem.get(xes_constants.KEY_VALUE))
                        tree = __parse_attribute(
                            elem, parent, elem.get(xes_constants.KEY_KEY), val,
                            tree, compression_dictio)
                    except ValueError:
                        logging.info("failed to parse float: " +
                                     str(elem.get(xes_constants.KEY_VALUE)))
                continue

            elif elem.tag.endswith(xes_constants.TAG_INT):
                if parent is not None:
                    try:
                        val = int(elem.get(xes_constants.KEY_VALUE))
                        tree = __parse_attribute(
                            elem, parent, elem.get(xes_constants.KEY_KEY), val,
                            tree, compression_dictio)
                    except ValueError:
                        logging.info("failed to parse int: " +
                                     str(elem.get(xes_constants.KEY_VALUE)))
                continue

            elif elem.tag.endswith(xes_constants.TAG_BOOLEAN):
                if parent is not None:
                    try:
                        val0 = elem.get(xes_constants.KEY_VALUE)
                        val = False
                        if str(val0).lower() == "true":
                            val = True
                        tree = __parse_attribute(
                            elem, parent, elem.get(xes_constants.KEY_KEY), val,
                            tree, compression_dictio)
                    except ValueError:
                        logging.info("failed to parse boolean: " +
                                     str(elem.get(xes_constants.KEY_VALUE)))
                continue

            elif elem.tag.endswith(xes_constants.TAG_LIST):
                if parent is not None:
                    # lists have no value, hence we put None as a value
                    tree = __parse_attribute(elem, parent,
                                             elem.get(xes_constants.KEY_KEY),
                                             None, tree, compression_dictio)
                continue

            elif elem.tag.endswith(xes_constants.TAG_ID):
                if parent is not None:
                    tree = __parse_attribute(elem, parent,
                                             elem.get(xes_constants.KEY_KEY),
                                             elem.get(xes_constants.KEY_VALUE),
                                             tree, compression_dictio)
                continue

            elif elem.tag.endswith(xes_constants.TAG_EXTENSION):
                if elem.get(xes_constants.KEY_NAME) is not None and elem.get(
                        xes_constants.KEY_PREFIX) is not None and elem.get(
                            xes_constants.KEY_URI) is not None:
                    log.extensions[elem.get(xes_constants.KEY_NAME)] = {
                        xes_constants.KEY_PREFIX:
                        elem.get(xes_constants.KEY_PREFIX),
                        xes_constants.KEY_URI:
                        elem.get(xes_constants.KEY_URI)
                    }
                continue

            elif elem.tag.endswith(xes_constants.TAG_GLOBAL):
                if elem.get(xes_constants.KEY_SCOPE) is not None:
                    log.omni_present[elem.get(xes_constants.KEY_SCOPE)] = {}
                    tree[elem] = log.omni_present[elem.get(
                        xes_constants.KEY_SCOPE)]
                continue

            elif elem.tag.endswith(xes_constants.TAG_CLASSIFIER):
                if elem.get(xes_constants.KEY_KEYS) is not None:
                    classifier_value = elem.get(xes_constants.KEY_KEYS)
                    if "'" in classifier_value:
                        log.classifiers[elem.get(xes_constants.KEY_NAME)] = [
                            x for x in classifier_value.split("'")
                            if x.strip()
                        ]
                    else:
                        log.classifiers[elem.get(xes_constants.KEY_NAME
                                                 )] = classifier_value.split()
                continue

            elif elem.tag.endswith(xes_constants.TAG_LOG):
                tree[elem] = log.attributes
                continue

        elif tree_event == _EVENT_END:
            if elem in tree:
                del tree[elem]
            elem.clear()
            if elem.getprevious() is not None:
                try:
                    del elem.getparent()[0]
                except TypeError:
                    pass

            if elem.tag.endswith(xes_constants.TAG_EVENT):
                if trace is not None:
                    trace.append(event)
                    event = None
                continue

            elif elem.tag.endswith(xes_constants.TAG_TRACE):
                log.append(trace)

                if progress is not None:
                    progress.update()

                trace = None
                continue

            elif elem.tag.endswith(xes_constants.TAG_LOG):
                continue

    # gracefully close progress bar
    if progress is not None:
        progress.close()
    del context, progress

    if timestamp_sort:
        log = sorting.sort_timestamp(log,
                                     timestamp_key=timestamp_key,
                                     reverse_sort=reverse_sort)

    return log
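
A hedged sketch of how this importer is typically driven: `_EVENT_START`/`_EVENT_END` presumably map to lxml's 'start'/'end' events, and the empty `log` would be an `EventLog` from the same package (both assumptions about the surrounding module):

from lxml import etree

def count_traces(path):
    """First pass: count <trace> elements so the tqdm bar knows its total."""
    n = 0
    for _, elem in etree.iterparse(path, events=('end',)):
        if elem.tag.endswith('trace'):
            n += 1
        elem.clear()                      # keep memory flat while streaming
    return n

# Second pass: stream-parse the file and fill an (assumed) empty EventLog.
# context = etree.iterparse(path, events=('start', 'end'))
# log = import_from_context(context, count_traces(path), EventLog())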
Example #19
0
File: get_graph.py  Project: bbkjunior/LL
    raise RuntimeError("No_token")
session = vk.AuthSession(access_token=token_to_use)
api = vk.API(session, v='5.95', lang='ru', timeout=10)


def get_friends_ids(uid):
    ids = api.friends.get(user_id=uid)
    return ids['items']


try:
    with open("whole_leo_subscribers_list.json", "r") as f:
        whole_leo_subscribers_list = json.load(f)
except (FileNotFoundError, json.JSONDecodeError):
    whole_leo_subscribers_list = []
    for offset_curr in tqdm(range(0, 390556, 1000)):
        subscribers_ids_leo = api.groups.getMembers(
            group_id='15787787',
            fields="""sex, bdate, city, country,
                                                photo_max_orig, lists, domain, 
                                                contacts, connections, education, 
                                                universities, schools,  
                                                status, relation, relatives""",
            offset=offset_curr)
        whole_leo_subscribers_list.extend(subscribers_ids_leo['items'])
        time.sleep(0.1)

whole_leo_opened_subscribers_list = []
for friend in whole_leo_subscribers_list:
    if 'deactivated' in friend or friend.get('is_closed'):
        pass
Example #20
0
def main():
    logging.basicConfig(level=logging.INFO)
    opts = Settings()
    opts = update_settings(opts)
    pool_states = [{} for _ in range(opts.num_workers)]
    for train in [False, True]:
        name = 'objectron-train' if train else 'objectron-test'
        logging.info(F'Processing {name}')

        max_bytes = opts.max_train_bytes if train else opts.max_test_bytes

        # TODO(ycho): Consider fancier (e.g. class-equalizing) shard samplers.
        shards = ObjectronDetection(ObjectronDetection.Settings(local=False),
                                    train).shards

        out_dir = (Path(opts.cache_dir).expanduser() / name)
        out_dir.mkdir(parents=True, exist_ok=True)

        if opts.use_pool:
            # NOTE(ycho): The initial approach based on mp.Pool().
            # Turned out that it is not possible to guarantee graceful exit in
            # this way.
            _download = functools.partial(download_shard, out_dir=out_dir)
            with mp.Pool(opts.num_workers, init_worker) as p:
                with tqdm(total=max_bytes) as pbar:
                    total_bytes = 0
                    for shard_bytes in p.imap_unordered(_download, shards):
                        pbar.update(shard_bytes)
                        # Accumulate and check for termination.
                        total_bytes += shard_bytes
                        if total_bytes >= max_bytes:
                            logging.info(F'Done: {total_bytes} > {max_bytes}')
                            # NOTE(ycho): Due to bug in mp.Pool(), imap_unordered() with close()/join()
                            # does NOT work, thus we implicitly call terminate() via context manager
                            # which may result in incomplete shards. This condition
                            # must be checked.
                            break
        else:
            init_bytes = sum(f.stat().st_size for f in out_dir.rglob('*')
                             if f.is_file())
            logging.info(F'Starting from {init_bytes}/{max_bytes} ...')
            ctx = mp.get_context('fork')
            stop = ctx.Value('b', (init_bytes >= max_bytes))
            queue = ctx.Queue()
            workers = [
                ctx.Process(target=download_shards,
                            args=(shards[i::opts.num_workers], out_dir, stop,
                                  queue)) for i in range(opts.num_workers)
            ]
            # Start!
            for p in workers:
                p.start()

            # Progress logging ...
            try:
                with tqdm(initial=init_bytes, total=max_bytes) as pbar:
                    # Periodically check progress...
                    total_bytes = init_bytes
                    while True:
                        shard_bytes = queue.get()
                        pbar.update(shard_bytes)
                        total_bytes += shard_bytes
                        if total_bytes >= max_bytes:
                            break
            except KeyboardInterrupt:
                logging.info('Cancelling download, trying to clean up ...')
                pass
            finally:
                # Stop.
                with stop.get_lock():
                    stop.value = True

                # Join.
                logging.info(
                    'Download completed, joining the rest of the processes...')
                for p in workers:
                    p.join()
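
`download_shards` itself is not shown; a minimal sketch consistent with how it is called (the byte-reporting protocol and the reuse of `download_shard` from the pool branch are assumptions inferred from the `stop` and `queue` arguments):

def download_shards(shards, out_dir, stop, queue):
    """Download shards until the shared stop flag is set; report bytes via the queue."""
    for shard in shards:
        with stop.get_lock():
            if stop.value:                 # another process hit the byte budget
                break
        shard_bytes = download_shard(shard, out_dir=out_dir)   # assumed helper, as in the pool branch
        queue.put(shard_bytes)             # lets the main process advance its tqdm bar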
Example #21
0
    def generate(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> np.ndarray:
        """
        Generate adversarial samples and return them in an array.

        :param x: An array with the original inputs.
        :param y: Target values (class labels) one-hot-encoded of shape `(nb_samples, nb_classes)` or indices of shape
                  (nb_samples,). Only provide this parameter if you'd like to use true labels when crafting adversarial
                  samples. Otherwise, model predictions are used as labels to avoid the "label leaking" effect
                  (explained in this paper: https://arxiv.org/abs/1611.01236). Default is `None`.
        :param mask: An array with a mask broadcastable to input `x` defining where to apply adversarial perturbations.
                     Shape needs to be broadcastable to the shape of x and can also be of the same shape as `x`. Any
                     features for which the mask is zero will not be adversarially perturbed.
        :type mask: `np.ndarray`
        :return: An array holding the adversarial examples.
        """
        import torch  # lgtm [py/repeated-import]

        mask = self._get_mask(x, **kwargs)

        # Ensure eps is broadcastable
        self._check_compatibility_input_and_eps(x=x)

        # Check whether random eps is enabled
        self._random_eps()

        # Set up targets
        targets = self._set_targets(x, y)

        # Create dataset
        if mask is not None:
            # Here we need to make a distinction: if the masks are different for each input, we need to index
            # those for the current batch. Otherwise (i.e. mask is meant to be broadcasted), keep it as it is.
            if len(mask.shape) == len(x.shape):
                dataset = torch.utils.data.TensorDataset(
                    torch.from_numpy(x.astype(ART_NUMPY_DTYPE)),
                    torch.from_numpy(targets.astype(ART_NUMPY_DTYPE)),
                    torch.from_numpy(mask.astype(ART_NUMPY_DTYPE)),
                )

            else:
                dataset = torch.utils.data.TensorDataset(
                    torch.from_numpy(x.astype(ART_NUMPY_DTYPE)),
                    torch.from_numpy(targets.astype(ART_NUMPY_DTYPE)),
                    torch.from_numpy(np.array([mask.astype(ART_NUMPY_DTYPE)] * x.shape[0])),
                )

        else:
            dataset = torch.utils.data.TensorDataset(
                torch.from_numpy(x.astype(ART_NUMPY_DTYPE)),
                torch.from_numpy(targets.astype(ART_NUMPY_DTYPE)),
            )

        data_loader = torch.utils.data.DataLoader(
            dataset=dataset, batch_size=self.batch_size, shuffle=False, drop_last=False
        )

        # Start to compute adversarial examples
        adv_x = x.astype(ART_NUMPY_DTYPE)

        # Compute perturbation with batching
        for (batch_id, batch_all) in enumerate(
            tqdm(data_loader, desc="PGD - Batches", leave=False, disable=not self.verbose)
        ):

            self._batch_id = batch_id

            if mask is not None:
                (batch, batch_labels, mask_batch) = batch_all[0], batch_all[1], batch_all[2]
            else:
                (batch, batch_labels, mask_batch) = batch_all[0], batch_all[1], None

            batch_index_1, batch_index_2 = batch_id * self.batch_size, (batch_id + 1) * self.batch_size

            # Compute batch_eps and batch_eps_step
            if isinstance(self.eps, np.ndarray) and isinstance(self.eps_step, np.ndarray):
                if len(self.eps.shape) == len(x.shape) and self.eps.shape[0] == x.shape[0]:
                    batch_eps = self.eps[batch_index_1:batch_index_2]
                    batch_eps_step = self.eps_step[batch_index_1:batch_index_2]

                else:
                    batch_eps = self.eps
                    batch_eps_step = self.eps_step

            else:
                batch_eps = self.eps
                batch_eps_step = self.eps_step

            for rand_init_num in range(max(1, self.num_random_init)):
                if rand_init_num == 0:
                    # first iteration: use the adversarial examples as they are the only ones we have now
                    adv_x[batch_index_1:batch_index_2] = self._generate_batch(
                        x=batch, targets=batch_labels, mask=mask_batch, eps=batch_eps, eps_step=batch_eps_step
                    )
                else:
                    adversarial_batch = self._generate_batch(
                        x=batch, targets=batch_labels, mask=mask_batch, eps=batch_eps, eps_step=batch_eps_step
                    )

                    # return the successful adversarial examples
                    attack_success = compute_success_array(
                        self.estimator,
                        batch,
                        batch_labels,
                        adversarial_batch,
                        self.targeted,
                        batch_size=self.batch_size,
                    )
                    adv_x[batch_index_1:batch_index_2][attack_success] = adversarial_batch[attack_success]

        logger.info(
            "Success rate of attack: %.2f%%",
            100 * compute_success(self.estimator, x, targets, adv_x, self.targeted, batch_size=self.batch_size),
        )

        if self.summary_writer is not None:
            self.summary_writer.reset()

        return adv_x
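
For context, this `generate` is ART's batched PGD attack loop. A typical call looks like the following (a sketch; `classifier` stands for any fitted ART estimator, e.g. a `PyTorchClassifier` wrapping a trained model):

from art.attacks.evasion import ProjectedGradientDescent

attack = ProjectedGradientDescent(
    estimator=classifier,   # assumed to exist: an ART estimator around a trained model
    eps=0.3,                # perturbation budget; may also be a per-sample np.ndarray
    eps_step=0.1,           # step size per PGD iteration
    max_iter=40,
    num_random_init=1,      # restarts; the loop above keeps the most successful examples
)
x_adv = attack.generate(x=x_test)   # y=None: model predictions are used as labels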
Example #22
0
    for epoch in range(args.epoch):
        good_smiles = sorted(set(good_smiles))
        random.shuffle(good_smiles)
        dataset = hgraph.MoleculeDataset(good_smiles, args.vocab,
                                         args.atom_vocab, args.batch_size)

        print(f'Epoch {epoch} training...')
        for _ in range(args.inner_epoch):
            meters = np.zeros(6)
            dataloader = DataLoader(dataset,
                                    batch_size=1,
                                    collate_fn=lambda x: x[0],
                                    shuffle=True,
                                    num_workers=16)
            for batch in tqdm(dataloader):
                model.zero_grad()
                loss, kl_div, wacc, iacc, tacc, sacc = model(*batch, beta=beta)
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), args.clip_norm)
                optimizer.step()
                meters = meters + np.array([
                    kl_div,
                    loss.item(), wacc * 100, iacc * 100, tacc * 100, sacc * 100
                ])

            meters /= len(dataset)
            print(
                "Beta: %.3f, KL: %.2f, loss: %.3f, Word: %.2f, %.2f, Topo: %.2f, Assm: %.2f, PNorm: %.2f, GNorm: %.2f"
                % (beta, meters[0], meters[1], meters[2], meters[3], meters[4],
                   meters[5], param_norm(model), grad_norm(model)))
Example #23
0
def train(model,
          train_loader,
          train_subval_loader,
          epochs,
          controller=None,
          gamma_start=0.0,
          gamma_end=0.0,
          lr=3e-3,
          prefix_name=''):

    if isinstance(model, SuperNet):
        gamma_scheduler = GammaScheduler(model,
                                         total_steps=epochs *
                                         len(train_loader),
                                         gamma_start=gamma_start,
                                         gamma_end=gamma_end)
    else:
        gamma_scheduler = None

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    history = []

    best_val_loss = float('inf')
    for epoch in range(epochs):
        train_loss = 0
        val_loss = 0

        # Train on 50k/60k training data
        model.train()
        tq = tqdm(train_loader, leave=False)
        for x, cls in tq:
            if isinstance(model, SuperNet):
                out, choice = model(x.to(device))
            else:
                out = model(x.to(device))
            loss = criterion(out, cls.to(device))
            optimizer.zero_grad()
            loss.backward()
            norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 10)
            optimizer.step()
            train_loss += loss.item() / len(train_loader)
            if gamma_scheduler:
                gamma_scheduler.step()
            if controller:
                controller.update_score(choice, loss.item())
            tq.set_postfix(loss=f'{loss.item():.4f}', norm=f'{norm:.4f}')

        # Evaluate on 10k/60k training data
        model.eval()
        tq = tqdm(train_subval_loader, leave=False)
        for x, cls in tq:
            with torch.no_grad():
                if isinstance(model, SuperNet):
                    out, choice = model(x.to(device))
                else:
                    out = model(x.to(device))
            loss = criterion(out, cls.to(device))
            val_loss += loss.item() / len(train_subval_loader)
            if controller:
                controller.update_score(choice, loss.item())
            tq.set_postfix(loss=f'{loss.item():.4f}')

        history.append([train_loss, val_loss])
        state_dict = {
            'model': model.state_dict(),
            'controller': controller,
            'gamma_scheduler': gamma_scheduler,
            'history': history
        }
        torch.save(state_dict, f'{prefix_name}_last.pt')
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(state_dict, f'{prefix_name}_best.pt')

        print(
            f'{epoch+1:>2} / {epochs:>2}, loss = {train_loss:.4f}, val_loss = {val_loss:.4f}'
        )

    return history
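
A hedged usage sketch (the model and loaders are assumed to exist; `prefix_name` controls the `_last.pt`/`_best.pt` checkpoint names written above):

history = train(model, train_loader, train_subval_loader,
                epochs=10, lr=3e-3, prefix_name='baseline')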
Example #24
0
def main():
    args = parse_args()

    # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
    accelerator = Accelerator()
    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(accelerator.state)

    # Setup logging, we only want one process per machine to log things on the screen.
    # accelerator.is_local_main_process is only True for one process per machine.
    logger.setLevel(logging.INFO if accelerator.is_local_main_process else logging.ERROR)
    if accelerator.is_local_main_process:
        datasets.utils.logging.set_verbosity_warning()
        transformers.utils.logging.set_verbosity_info()
    else:
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()

    # If passed along, set the training seed now.
    if args.seed is not None:
        set_seed(args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
        if "validation" not in raw_datasets.keys():
            raw_datasets["validation"] = load_dataset(
                args.dataset_name,
                args.dataset_config_name,
                split=f"train[:{args.validation_split_percentage}%]",
            )
            raw_datasets["train"] = load_dataset(
                args.dataset_name,
                args.dataset_config_name,
                split=f"train[{args.validation_split_percentage}%:]",
            )
    else:
        data_files = {}
        if args.train_file is not None:
            data_files["train"] = args.train_file
        if args.validation_file is not None:
            data_files["validation"] = args.validation_file
        extension = args.train_file.split(".")[-1]
        if extension == "txt":
            extension = "text"
        raw_datasets = load_dataset(extension, data_files=data_files)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    #
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if args.config_name:
        config = AutoConfig.from_pretrained(args.config_name)
    elif args.model_name_or_path:
        config = AutoConfig.from_pretrained(args.model_name_or_path)
    else:
        config = CONFIG_MAPPING[args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")

    if args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, use_fast=not args.use_slow_tokenizer)
    elif args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=not args.use_slow_tokenizer)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if args.model_name_or_path:
        model = AutoModelForMaskedLM.from_pretrained(
            args.model_name_or_path,
            from_tf=bool(".ckpt" in args.model_name_or_path),
            config=config,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelForMaskedLM.from_config(config)

    model.resize_token_embeddings(len(tokenizer))

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    column_names = raw_datasets["train"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    if args.max_seq_length is None:
        max_seq_length = tokenizer.model_max_length
        if max_seq_length > 1024:
            logger.warning(
                f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
                "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx."
            )
            max_seq_length = 1024
    else:
        if args.max_seq_length > tokenizer.model_max_length:
            logger.warning(
                f"The max_seq_length passed ({args.max_seq_length}) is larger than the maximum length for the"
                f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
            )
        max_seq_length = min(args.max_seq_length, tokenizer.model_max_length)

    if args.line_by_line:
        # When using line_by_line, we just tokenize each nonempty line.
        padding = "max_length" if args.pad_to_max_length else False

        def tokenize_function(examples):
            # Remove empty lines
            examples[text_column_name] = [
                line for line in examples[text_column_name] if len(line) > 0 and not line.isspace()
            ]
            return tokenizer(
                examples[text_column_name],
                padding=padding,
                truncation=True,
                max_length=max_seq_length,
                # We use this option because DataCollatorForLanguageModeling (see below) is more efficient when it
                # receives the `special_tokens_mask`.
                return_special_tokens_mask=True,
            )

        tokenized_datasets = raw_datasets.map(
            tokenize_function,
            batched=True,
            num_proc=args.preprocessing_num_workers,
            remove_columns=[text_column_name],
            load_from_cache_file=not args.overwrite_cache,
            desc="Running tokenizer on dataset line_by_line",
        )
    else:
        # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
        # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more
        # efficient when it receives the `special_tokens_mask`.
        def tokenize_function(examples):
            return tokenizer(examples[text_column_name], return_special_tokens_mask=True)

        tokenized_datasets = raw_datasets.map(
            tokenize_function,
            batched=True,
            num_proc=args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not args.overwrite_cache,
            desc="Running tokenizer on every text in dataset",
        )

        # Main data processing function that will concatenate all texts from our dataset and generate chunks of
        # max_seq_length.
        def group_texts(examples):
            # Concatenate all texts.
            concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
            total_length = len(concatenated_examples[list(examples.keys())[0]])
            # We drop the small remainder; we could instead pad if the model supported it,
            # rather than dropping. You can customize this part to your needs.
            total_length = (total_length // max_seq_length) * max_seq_length
            # Split by chunks of max_len.
            result = {
                k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
                for k, t in concatenated_examples.items()
            }
            return result

        # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
        # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
        # might be slower to preprocess.
        #
        # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
        # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map

        tokenized_datasets = tokenized_datasets.map(
            group_texts,
            batched=True,
            num_proc=args.preprocessing_num_workers,
            load_from_cache_file=not args.overwrite_cache,
            desc=f"Grouping texts in chunks of {max_seq_length}",
        )

    train_dataset = tokenized_datasets["train"]
    eval_dataset = tokenized_datasets["validation"]

    # Log a few random samples from the training set:
    for index in random.sample(range(len(train_dataset)), 3):
        logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")

    # Data collator
    # This one will take care of randomly masking the tokens.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=args.mlm_probability)

    # DataLoaders creation:
    train_dataloader = DataLoader(
        train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size
    )
    eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size)

    # Optimizer
    # Split weights in two groups, one with weight decay and the other not.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

    # Prepare everything with our `accelerator`.
    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader
    )

    # Note: the training dataloader needs to be prepared before we grab its length below
    # (because its length will be shorter when running on multiple processes).

    # Scheduler and math around the number of training steps.
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
    if args.max_train_steps is None:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
    else:
        args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)

    lr_scheduler = get_scheduler(
        name=args.lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=args.num_warmup_steps,
        num_training_steps=args.max_train_steps,
    )

    # Train!
    total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps

    logger.info("***** Running training *****")
    logger.info(f"  Num examples = {len(train_dataset)}")
    logger.info(f"  Num Epochs = {args.num_train_epochs}")
    logger.info(f"  Instantaneous batch size per device = {args.per_device_train_batch_size}")
    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
    logger.info(f"  Total optimization steps = {args.max_train_steps}")
    # Only show the progress bar once on each machine.
    progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
    completed_steps = 0

    for epoch in range(args.num_train_epochs):
        model.train()
        for step, batch in enumerate(train_dataloader):
            outputs = model(**batch)
            loss = outputs.loss
            loss = loss / args.gradient_accumulation_steps
            accelerator.backward(loss)
            if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                progress_bar.update(1)
                completed_steps += 1

            if completed_steps >= args.max_train_steps:
                break

        model.eval()
        losses = []
        for step, batch in enumerate(eval_dataloader):
            with torch.no_grad():
                outputs = model(**batch)

            loss = outputs.loss
            losses.append(accelerator.gather(loss.repeat(args.per_device_eval_batch_size)))

        losses = torch.cat(losses)
        losses = losses[: len(eval_dataset)]
        try:
            perplexity = math.exp(torch.mean(losses))
        except OverflowError:
            perplexity = float("inf")

        logger.info(f"epoch {epoch}: perplexity: {perplexity}")

    if args.output_dir is not None:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
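A note on the example above: group_texts concatenates each mapped batch and drops the trailing remainder. A minimal standalone sketch of that chunking logic, using made-up toy values rather than the script's real inputs:

# Sketch of the group_texts chunking on toy data; max_seq_length and the
# input batch are illustrative assumptions.
max_seq_length = 4

def group_texts(examples):
    concatenated = {k: sum(examples[k], []) for k in examples}
    total_length = len(concatenated[list(examples)[0]])
    # drop the remainder that does not fill a whole chunk
    total_length = (total_length // max_seq_length) * max_seq_length
    return {
        k: [t[i:i + max_seq_length] for i in range(0, total_length, max_seq_length)]
        for k, t in concatenated.items()
    }

batch = {"input_ids": [[1, 2, 3], [4, 5], [6, 7, 8, 9]]}
print(group_texts(batch))
# {'input_ids': [[1, 2, 3, 4], [5, 6, 7, 8]]} -- the trailing token 9 is dropped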
Example #25
0
    def evaluate(self, eval_dataset, output_dir):
        """
        Evaluates the model on eval_dataset.
        Utility function to be used by the eval_model() method. Not intended to be used directly.
        """

        device = self.device
        model = self.model
        args = self.args
        pad_token_label_id = self.pad_token_label_id
        eval_output_dir = output_dir

        results = {}

        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset,
                                     sampler=eval_sampler,
                                     batch_size=args["eval_batch_size"])

        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        out_label_ids = None
        model.eval()

        for batch in tqdm(eval_dataloader, disable=args["silent"]):
            batch = tuple(t.to(device) for t in batch)

            with torch.no_grad():
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "labels": batch[3],
                }
                # XLM and RoBERTa don't use segment_ids
                if args["model_type"] in ["bert", "xlnet"]:
                    inputs["token_type_ids"] = batch[2]
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]

                eval_loss += tmp_eval_loss.mean().item()

            nb_eval_steps += 1

            if preds is None:
                preds = logits.detach().cpu().numpy()
                out_label_ids = inputs["labels"].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(
                    out_label_ids,
                    inputs["labels"].detach().cpu().numpy(),
                    axis=0)

        eval_loss = eval_loss / nb_eval_steps
        model_outputs = preds
        preds = np.argmax(preds, axis=2)

        label_map = {i: label for i, label in enumerate(self.labels)}

        out_label_list = [[] for _ in range(out_label_ids.shape[0])]
        preds_list = [[] for _ in range(out_label_ids.shape[0])]

        for i in range(out_label_ids.shape[0]):
            for j in range(out_label_ids.shape[1]):
                if out_label_ids[i, j] != pad_token_label_id:
                    out_label_list[i].append(label_map[out_label_ids[i][j]])
                    preds_list[i].append(label_map[preds[i][j]])

        result = {
            "eval_loss": eval_loss,
            "precision": precision_score(out_label_list, preds_list),
            "recall": recall_score(out_label_list, preds_list),
            "f1_score": f1_score(out_label_list, preds_list),
        }

        results.update(result)

        output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            if args["classification_report"]:
                cls_report = classification_report(out_label_list, preds_list)
                writer.write("{}\n".format(cls_report))
            for key in sorted(result.keys()):
                writer.write("{} = {}\n".format(key, str(result[key])))

        # TODO: build a table from out_label_list and preds_list

        return results, model_outputs, preds_list
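For reference, the pad-filtering and label-mapping step in evaluate() can be exercised in isolation. A minimal sketch; the label map, pad id, and arrays are illustrative assumptions:

import numpy as np

pad_token_label_id = -100  # assumed pad label id
label_map = {0: "O", 1: "B-PER", 2: "I-PER"}  # assumed labels
out_label_ids = np.array([[0, 1, pad_token_label_id]])
preds = np.array([[0, 2, 1]])

out_label_list = [[] for _ in range(out_label_ids.shape[0])]
preds_list = [[] for _ in range(out_label_ids.shape[0])]
for i in range(out_label_ids.shape[0]):
    for j in range(out_label_ids.shape[1]):
        if out_label_ids[i, j] != pad_token_label_id:  # skip padding positions
            out_label_list[i].append(label_map[out_label_ids[i, j]])
            preds_list[i].append(label_map[preds[i, j]])

print(out_label_list)  # [['O', 'B-PER']]
print(preds_list)      # [['O', 'I-PER']]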
Example #26
0
def proc_preds(
    examples,
    features,
    predictions,
    version_2_with_negative=False,
    n_best_size=20,
    max_answer_length=30,
    start_n_top=5,
    end_n_top=5,
    out_dir=None,
    prefix=None,
    log_level=logging.WARNING,
):
    if len(predictions) != 5:
        raise ValueError("`predictions` should be a tuple with five elements.")
    start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits = predictions
    if len(predictions[0]) != len(features):
        raise ValueError(
            f"Got {len(predictions[0])} predictions and {len(features)} features."
        )
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[
            feature["example_id"]]].append(i)
    all_predictions = collections.OrderedDict()
    all_nbest_json = collections.OrderedDict()
    scores_diff_json = collections.OrderedDict(
    ) if version_2_with_negative else None
    log.setLevel(log_level)
    log.info(
        f"Post-processing {len(examples)} example predictions split into {len(features)} features."
    )
    for example_index, example in enumerate(tqdm(examples)):
        feature_indices = features_per_example[example_index]
        min_null_score = None
        prelim_predictions = []
        for feature_index in feature_indices:
            start_log_prob = start_top_log_probs[feature_index]
            start_indexes = start_top_index[feature_index]
            end_log_prob = end_top_log_probs[feature_index]
            end_indexes = end_top_index[feature_index]
            feature_null_score = cls_logits[feature_index]
            offset_mapping = features[feature_index]["offset_mapping"]
            token_is_max_context = features[feature_index].get(
                "token_is_max_context", None)
            if min_null_score is None or feature_null_score < min_null_score:
                min_null_score = feature_null_score
            for i in range(start_n_top):
                for j in range(end_n_top):
                    start_index = int(start_indexes[i])
                    j_index = i * end_n_top + j
                    end_index = int(end_indexes[j_index])
                    if (start_index >= len(offset_mapping)
                            or end_index >= len(offset_mapping)
                            or offset_mapping[start_index] is None
                            or offset_mapping[end_index] is None):
                        continue
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue
                    if token_is_max_context is not None and not token_is_max_context.get(
                            str(start_index), False):
                        continue
                    prelim_predictions.append({
                        "offsets": (
                            offset_mapping[start_index][0],
                            offset_mapping[end_index][1],
                        ),
                        "score":
                        start_log_prob[i] + end_log_prob[j_index],
                        "start_log_prob":
                        start_log_prob[i],
                        "end_log_prob":
                        end_log_prob[j_index],
                    })
        predictions = sorted(prelim_predictions,
                             key=lambda x: x["score"],
                             reverse=True)[:n_best_size]
        context = example["context"]
        for pred in predictions:
            offsets = pred.pop("offsets")
            pred["text"] = context[offsets[0]:offsets[1]]
        if len(predictions) == 0:
            predictions.insert(
                0, {
                    "text": "",
                    "start_logit": -1e-6,
                    "end_logit": -1e-6,
                    "score": -2e-6
                })
        scores = np.array([pred.pop("score") for pred in predictions])
        exp_scores = np.exp(scores - np.max(scores))
        probs = exp_scores / exp_scores.sum()
        for prob, pred in zip(probs, predictions):
            pred["probability"] = prob
        all_predictions[example["id"]] = predictions[0]["text"]
        if version_2_with_negative:
            scores_diff_json[example["id"]] = float(min_null_score)
        all_nbest_json[example["id"]] = [{
            k: (float(v) if isinstance(v, (np.float16, np.float32,
                                           np.float64)) else v)
            for k, v in pred.items()
        } for pred in predictions]
    if out_dir is not None:
        if not os.path.isdir(out_dir):
            raise EnvironmentError(f"{out_dir} is not a directory.")
        prediction_file = os.path.join(
            out_dir, "predictions.json"
            if prefix is None else f"{prefix}_predictions.json")
        nbest_file = os.path.join(
            out_dir,
            "nbest_predictions.json"
            if prefix is None else f"{prefix}_nbest_predictions.json",
        )
        if version_2_with_negative:
            null_odds_file = os.path.join(
                out_dir, "null_odds.json"
                if prefix is None else f"{prefix}_null_odds.json")
        log.info(f"Saving predictions to {prediction_file}.")
        with open(prediction_file, "w") as writer:
            writer.write(json.dumps(all_predictions, indent=4) + "\n")
        log.info(f"Saving nbest_preds to {nbest_file}.")
        with open(nbest_file, "w") as writer:
            writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
        if version_2_with_negative:
            log.info(f"Saving null_odds to {null_odds_file}.")
            with open(null_odds_file, "w") as writer:
                writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
    return all_predictions, scores_diff_json
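The per-prediction probabilities above come from a numerically stable softmax over the n-best scores. A minimal sketch with toy scores:

import numpy as np

scores = np.array([3.2, 1.1, -0.5])           # toy n-best scores
exp_scores = np.exp(scores - np.max(scores))  # subtract the max for stability
probs = exp_scores / exp_scores.sum()
print(probs, probs.sum())                     # probabilities summing to 1.0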
Example #27
0
    def _generate_cost_matrices(
        self,
        adata: AnnData,
        cost_matrices: Optional[
            Union[str, Mapping[Tuple[float, float], np.ndarray]]
        ] = None,
    ) -> Tuple[Mapping[Tuple[float, float], Optional[np.ndarray]], str]:
        timepoints = self.experimental_time.cat.categories
        timepoints = list(zip(timepoints[:-1], timepoints[1:]))

        if cost_matrices is None:
            logg.info("Using default cost matrices")
            return {tpair: None for tpair in timepoints}, "default"

        if isinstance(cost_matrices, dict):
            logg.info("Using precomputed cost matrices")

            cmats = {}
            for tpair in timepoints:
                if tpair not in cost_matrices:
                    logg.warning(
                        f"Unable to find cost matrix for pair `{tpair}`. Using default"
                    )
                cmats[tpair] = cmat = cost_matrices.get(tpair, None)
                if cmat is not None:
                    n_start = len(np.where(self.experimental_time == tpair[0])[0])
                    n_end = len(np.where(self.experimental_time == tpair[1])[0])
                    try:
                        if cmat.shape != (n_start, n_end):
                            raise ValueError(
                                f"Expected cost matrix for time pair `{tpair}` to be "
                                f"of shape `{(n_start, n_end)}`, found `{cmat.shape}`."
                            )
                    except AttributeError:
                        logg.warning(
                            f"Unable to verify whether supplied cost matrix for time pair `{tpair}` "
                            f"has the correct shape `{(n_start, n_end)}`"
                        )

            # prevent equality comparison when comparing with cache
            return cmats, nstr("precomputed")

        if isinstance(cost_matrices, str):
            logg.info(f"Computing cost matrices using `{cost_matrices!r}` key")
            if cost_matrices == "X":
                cost_matrices = None

            try:
                features = adata._get_X(layer=cost_matrices)
                modifier = "layer"
            except KeyError:
                try:
                    features = adata.obsm[cost_matrices]
                    modifier = "obsm"
                except KeyError:
                    raise KeyError(
                        f"Unable to find key `{cost_matrices!r}` in `adata.layers` or `adata.obsm`."
                    ) from None

            cmats = {}
            for tpair in tqdm(timepoints, unit="cost matrix"):
                start_ixs = np.where(self.experimental_time == tpair[0])[0]
                end_ixs = np.where(self.experimental_time == tpair[1])[0]

                # being sparse is handled in WOT's function below
                cmats[tpair] = wot.ot.OTModel.compute_default_cost_matrix(
                    features[start_ixs], features[end_ixs]
                )

            return cmats, f"{modifier}:{cost_matrices}"

        raise NotImplementedError(
            f"Specifying cost matrices as "
            f"`{type(cost_matrices).__name__}` is not yet implemented."
        )
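The method above first turns the ordered time points into consecutive pairs before resolving a cost matrix per pair. A toy sketch of that pairing idiom:

timepoints = [0.0, 1.0, 2.0, 3.0]  # toy experimental times
pairs = list(zip(timepoints[:-1], timepoints[1:]))
print(pairs)  # [(0.0, 1.0), (1.0, 2.0), (2.0, 3.0)]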
Example #28
0
def test_eval(model, log_dir, mini_batch, lstm_layer, lstm_dim, max_sen_len,
              gpu, cuda, reverse, unk, trunc, epoch, id_to_de):
    mini_batch = int(mini_batch / 4)

    # check dir, make dir
    test_dir = os.path.join(log_dir, 'test')
    if not os.path.exists(test_dir):
        os.mkdir(test_dir)

    # load test data
    print("Load the preprocessed test data..")
    if unk:
        if reverse:
            with open(
                    f'datasets/preprocessed/test/test{trunc}_source_reverse_unk.pkl',
                    'rb') as fr:
                test_source_input, test_source_len = pickle.load(fr)
        else:
            with open(f'datasets/preprocessed/test/test{trunc}_source_unk.pkl',
                      'rb') as fr:
                test_source_input, test_source_len = pickle.load(fr)
        with open(f'datasets/preprocessed/test/test{trunc}_label_unk.pkl',
                  'rb') as fr:
            test_target_output = pickle.load(fr)
    else:
        if reverse:
            with open(
                    f'datasets/preprocessed/test/test{trunc}_source_reverse.pkl',
                    'rb') as fr:
                test_source_input, test_source_len = pickle.load(fr)
        else:
            with open(f'datasets/preprocessed/test/test{trunc}_source.pkl',
                      'rb') as fr:
                test_source_input, test_source_len = pickle.load(fr)
        with open(f'datasets/preprocessed/test/test{trunc}_label.pkl',
                  'rb') as fr:
            test_target_output = pickle.load(fr)
    print("Complete.")

    print("Split the data into mini_batch..")
    test_src_input = make_batch(test_source_input, mini_batch)
    test_src_len = make_batch(test_source_len, mini_batch)
    test_tgt_output = make_batch(test_target_output, mini_batch)
    print("Complete.")

    test_src_input = torch.from_numpy(test_src_input)
    test_src_len = torch.from_numpy(test_src_len)
    test_tgt_output = torch.from_numpy(test_tgt_output)

    test_src_input = test_src_input.to(torch.int64)
    test_src_len = test_src_len.to(torch.int64)
    test_tgt_output = test_tgt_output.to(torch.int64)

    # test start
    cur = 0
    output = torch.zeros_like(test_src_input)  # output = (40, 64, 51)
    for batch_src_input, batch_src_len in tqdm(
            zip(test_src_input, test_src_len),
            total=len(test_src_input),
            bar_format='{l_bar}{bar:30}{r_bar}'):
        # init hidden state
        h_0 = torch.zeros(lstm_layer, mini_batch, lstm_dim)  # (4, 128, 1000)
        c_0 = torch.zeros(lstm_layer, mini_batch, lstm_dim)
        hidden = (h_0, c_0)
        # hidden = [state.detach() for state in hidden]

        tgt = torch.ones(mini_batch,
                         1)  # tgt = (mini_batch, 1)  ==> SOS tokens
        tgt = tgt.to(torch.int64)
        if gpu:
            device = torch.device(
                f"cuda:{cuda}" if torch.cuda.is_available() else "cpu")
            batch_src_input = batch_src_input.to(device)
            batch_src_len = batch_src_len.to(device)
            tgt = tgt.to(device)
            hidden = [state.to(device) for state in hidden]

        # first decoder (past) output
        hht = torch.zeros(mini_batch, 1,
                          lstm_dim)  # first time-step prev decoder context
        if gpu:
            device = torch.device(
                f"cuda:{cuda}" if torch.cuda.is_available() else "cpu")
            hht = hht.to(device)

        for i in range(max_sen_len):
            out = model(batch_src_input, tgt, hidden, hht, batch_src_len)
            if gpu:
                device = torch.device(
                    f"cuda:{cuda}" if torch.cuda.is_available() else "cpu")
                out = out.to(device)  # out = (mini_batch, seq_len, tgt_vocab)
            pred = torch.max(out, dim=-1)[1]  # pred = (mini_batch, seq_len)
            tgt = torch.cat((tgt, pred[:, i].unsqueeze(1)), dim=1)
        output[cur] = tgt[:, 1:]  # output[cur] = (mini_batch, seq_len)
        cur += 1

    # make prediction.txt
    output = output.view(-1, max_sen_len)
    test_pred_output = []
    for line in output:
        sentence = ' '.join([id_to_de[int(idx)] for idx in line])
        sentence = sentence.replace('</s>', '').strip() + ' \n'
        test_pred_output.append(sentence)
    # make label.txt
    test_tgt_output = test_tgt_output.view(-1, max_sen_len)
    test_label = []
    for line in test_tgt_output:
        sentence = ' '.join([id_to_de[int(idx)] for idx in line])
        sentence = sentence.replace('</s>', '').strip() + ' \n'
        test_label.append(sentence)

    # save the prediction and label text.
    test_dir = os.path.join(log_dir, 'test')
    if not os.path.exists(test_dir):
        os.mkdir(test_dir)
    with open(os.path.join(test_dir, f'output_{epoch+1}.txt'),
              'w',
              encoding='utf8') as fw:
        fw.writelines(test_pred_output)
    with open(os.path.join(test_dir, f'label_{epoch+1}.txt'),
              'w',
              encoding='utf8') as fw:
        fw.writelines(test_label)
    print("Succeed to save the prediction and label text file!")
    print('\n')
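The inner loop above is greedy autoregressive decoding: start from an SOS token and append the argmax prediction at each step. A minimal sketch with a stand-in model (DummyModel is hypothetical, not the script's seq2seq model):

import torch

class DummyModel:
    def __call__(self, tgt, vocab_size=10):
        # stand-in: random logits of shape (batch, seq_len, vocab)
        return torch.randn(tgt.size(0), tgt.size(1), vocab_size)

model, mini_batch, max_sen_len = DummyModel(), 2, 5
tgt = torch.ones(mini_batch, 1, dtype=torch.int64)  # SOS tokens
for i in range(max_sen_len):
    out = model(tgt)                  # (batch, seq_len, vocab)
    pred = torch.max(out, dim=-1)[1]  # argmax token ids, (batch, seq_len)
    tgt = torch.cat((tgt, pred[:, i].unsqueeze(1)), dim=1)
print(tgt[:, 1:])  # generated ids with the SOS column stripped, as in the script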
Example #29
0
def ppo(env_name,
        total_steps,
        model,
        act_var_schedule=[0.7],
        epoch_batch_size=2048,
        gamma=0.99,
        lam=0.99,
        eps=0.2,
        seed=0,
        pol_batch_size=1024,
        val_batch_size=1024,
        pol_lr=1e-4,
        val_lr=1e-4,
        pol_epochs=10,
        val_epochs=10,
        target_kl=.01,
        use_gpu=False,
        reward_stop=None,
        normalize_return=True,
        env_config={}):
    """
    Implements proximal policy optimization with clipping

    Args:
        env_name: name of the openAI gym environment to solve
        total_steps: number of timesteps to run the PPO for
        model: model from seagul.rl.models. Contains policy and value fn
        act_var_schedule: schedule to set the variance of the policy. Will linearly interpolate values
        epoch_batch_size: number of environment steps to take per batch, total steps will be num_epochs*epoch_batch_size
        seed: seed for all the rngs
        gamma: discount applied to future rewards, usually close to 1
        lam: lambda for the Advantage estimation, usually close to 1
        eps: epsilon for the clipping, usually .1 or .2
        pol_batch_size: batch size for policy updates
        val_batch_size: batch size for value function updates
        pol_lr: learning rate for policy pol_optimizer
        val_lr: learning rate of value function pol_optimizer
        pol_epochs: how many epochs to use for each policy update
        val_epochs: how many epochs to use for each value update
        target_kl: max KL before breaking
        use_gpu:  want to use the GPU? set to true
        reward_stop: reward value to stop if we achieve
        normalize_return: should we normalize the return?
        env_config: dictionary containing kwargs to pass to your the environment

    Returns:
        model: trained model
        avg_reward_hist: list with the average reward per episode at each epoch
        var_dict: dictionary with all locals, for logging/debugging purposes

    Example:
        from seagul.rl.algos import ppo
        from seagul.nn import MLP
        from seagul.rl.models import PPOModel
        import torch

        input_size = 3
        output_size = 1
        layer_size = 64
        num_layers = 2

        policy = MLP(input_size, output_size, num_layers, layer_size)
        value_fn = MLP(input_size, 1, num_layers, layer_size)
        model = PPOModel(policy, value_fn)

        model, rews, var_dict = ppo("Pendulum-v0", 10000, model)

    """

    # init everything
    # ==============================================================================
    torch.set_num_threads(1)

    env = gym.make(env_name, **env_config)
    if isinstance(env.action_space, gym.spaces.Box):
        act_size = env.action_space.shape[0]
        act_dtype = torch.double
    else:
        raise NotImplementedError("trying to use unsupported action space",
                                  env.action_space)

    actvar_lookup = make_variance_schedule(act_var_schedule, model,
                                           total_steps)
    model.action_var = actvar_lookup(0)

    obs_size = env.observation_space.shape[0]
    obs_mean = torch.zeros(obs_size)
    obs_var = torch.ones(obs_size)
    adv_mean = torch.zeros(1)
    adv_var = torch.ones(1)
    rew_mean = torch.zeros(1)
    rew_var = torch.ones(1)

    old_model = pickle.loads(
        pickle.dumps(model)
    )  # copy.deepcopy broke for me with an older version of torch; using pickle for this is weird, but it works fine
    pol_opt = torch.optim.Adam(model.policy.parameters(), lr=pol_lr)
    val_opt = torch.optim.Adam(model.value_fn.parameters(), lr=val_lr)

    # seed all our RNGs
    env.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)

    # set defaults, and decide if we are using a GPU or not
    use_cuda = torch.cuda.is_available() and use_gpu
    device = torch.device("cuda:0" if use_cuda else "cpu")

    # init logging stuff
    raw_rew_hist = []
    val_loss_hist = []
    pol_loss_hist = []
    progress_bar = tqdm.tqdm(total=total_steps)
    cur_total_steps = 0
    progress_bar.update(0)
    early_stop = False

    # Train until we hit our total steps or reach our reward threshold
    # ==============================================================================
    while cur_total_steps < total_steps:

        batch_obs = torch.empty(0)
        batch_act = torch.empty(0)
        batch_adv = torch.empty(0)
        batch_discrew = torch.empty(0)
        cur_batch_steps = 0

        # Bail out if we have met our reward threshold
        if len(raw_rew_hist) > 2 and reward_stop:
            if raw_rew_hist[-1] >= reward_stop and raw_rew_hist[
                    -2] >= reward_stop:
                early_stop = True
                break

        # construct batch data from rollouts
        # ==============================================================================
        while cur_batch_steps < epoch_batch_size:

            ep_obs, ep_act, ep_rew, ep_steps = do_rollout(env, model)

            raw_rew_hist.append(sum(ep_rew))
            ep_rew = (ep_rew - ep_rew.mean()) / (ep_rew.std() + 1e-6)

            batch_obs = torch.cat((batch_obs, ep_obs[:-1]))
            batch_act = torch.cat((batch_act, ep_act[:-1]))

            ep_discrew = discount_cumsum(
                ep_rew, gamma
            )  # [:-1] because we appended the value function to the end as an extra reward
            batch_discrew = torch.cat((batch_discrew, ep_discrew[:-1]))

            if normalize_return:
                rew_mean = update_mean(batch_discrew, rew_mean,
                                       cur_total_steps)
                rew_var = update_std(batch_discrew, rew_var, cur_total_steps)
                batch_discrew = (batch_discrew - rew_mean) / (rew_var + 1e-6)

            # calculate this episodes advantages
            last_val = model.value_fn(ep_obs[-1]).reshape(-1, 1)
            ep_val = model.value_fn(ep_obs)
            ep_val[-1] = last_val

            deltas = ep_rew[:-1] + gamma * ep_val[1:] - ep_val[:-1]
            ep_adv = discount_cumsum(deltas.detach(), gamma * lam)
            batch_adv = torch.cat((batch_adv, ep_adv))

            cur_batch_steps += ep_steps
            cur_total_steps += ep_steps

        # make sure our advantages are zero mean and unit variance
        adv_mean = update_mean(batch_adv, adv_mean, cur_total_steps)
        adv_var = update_std(batch_adv, adv_var, cur_total_steps)
        batch_adv = (batch_adv - adv_mean) / (adv_var + 1e-6)

        # policy update
        # ========================================================================
        num_mbatch = int(batch_obs.shape[0] / pol_batch_size)

        # Update the policy using the PPO loss
        for pol_epoch in range(pol_epochs):
            for i in range(num_mbatch):
                cur_sample = i * pol_batch_size

                logp = model.get_logp(
                    batch_obs[cur_sample:cur_sample + pol_batch_size],
                    batch_act[cur_sample:cur_sample + pol_batch_size]).reshape(
                        -1, act_size)
                old_logp = old_model.get_logp(
                    batch_obs[cur_sample:cur_sample + pol_batch_size],
                    batch_act[cur_sample:cur_sample + pol_batch_size]).reshape(
                        -1, act_size)
                r = torch.exp(logp - old_logp)
                clip_r = torch.clamp(r, 1 - eps, 1 + eps)
                pol_loss = -torch.min(
                    r * batch_adv[cur_sample:cur_sample + pol_batch_size],
                    clip_r *
                    batch_adv[cur_sample:cur_sample + pol_batch_size]).mean()

                approx_kl = (logp - old_logp).mean()
                if approx_kl > target_kl:
                    break

                pol_opt.zero_grad()
                pol_loss.backward()
                pol_opt.step()

        # value_fn update
        # ========================================================================
        num_mbatch = int(batch_obs.shape[0] / val_batch_size)

        # Update value function with the standard L2 Loss
        for val_epoch in range(val_epochs):
            for i in range(num_mbatch):
                cur_sample = i * val_batch_size

                # predict and calculate loss for the batch
                # (the original listing sliced with pol_batch_size here, which
                # mismatches num_mbatch whenever the two batch sizes differ)
                val_preds = model.value_fn(
                    batch_obs[cur_sample:cur_sample + val_batch_size])
                val_loss = ((val_preds -
                             batch_discrew[cur_sample:cur_sample +
                                           val_batch_size])**2).mean()

                # do the normal pytorch update
                val_opt.zero_grad()
                val_loss.backward()
                val_opt.step()

        # update observation mean and variance
        obs_mean = update_mean(batch_obs, obs_mean, cur_total_steps)
        obs_var = update_std(batch_obs, obs_var, cur_total_steps)
        model.policy.state_means = obs_mean
        model.value_fn.state_means = obs_mean
        model.policy.state_std = obs_var
        model.value_fn.state_std = obs_var
        model.action_var = actvar_lookup(cur_total_steps)
        old_model = pickle.loads(pickle.dumps(model))

        val_loss_hist.append(val_loss)
        pol_loss_hist.append(pol_loss)

        progress_bar.update(cur_batch_steps)

    progress_bar.close()
    return model, raw_rew_hist, locals()
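discount_cumsum is referenced above but not shown. The sketch below is a common implementation, given as an assumption; seagul's actual helper may differ:

import torch

def discount_cumsum(rewards, discount):
    # reverse-accumulate discounted rewards: out[t] = r[t] + discount * out[t+1]
    out = torch.zeros_like(rewards)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + discount * running
        out[t] = running
    return out

print(discount_cumsum(torch.tensor([1.0, 1.0, 1.0]), 0.99))
# tensor([2.9701, 1.9900, 1.0000])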
Example #30
0
def prepare_wenet_speech(
    corpus_dir: Pathlike,
    dataset_parts: Union[str, Sequence[str]] = "all",
    output_dir: Optional[Pathlike] = None,
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions
    :param corpus_dir: Pathlike, the path of the data dir.
    :param dataset_parts: Which parts of dataset to prepare, all for all the
                          parts.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param num_jobs: Number of workers to extract manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with
             the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    subsets = WETNET_SPEECH_PARTS if "all" in dataset_parts else dataset_parts

    manifests = defaultdict(dict)
    for sub in subsets:
        if sub not in WETNET_SPEECH_PARTS:
            raise ValueError(f"No such part of dataset in WenetSpeech : {sub}")
        manifests[sub] = {"recordings": [], "supervisions": []}

    raw_manifests_path = corpus_dir / "WenetSpeech.json"
    assert raw_manifests_path.is_file(), f"No such file : {raw_manifests_path}"
    logging.info(f"Loading raw manifests from : {raw_manifests_path}")
    raw_manifests = json.load(open(raw_manifests_path, "r", encoding="utf8"))

    with ProcessPoolExecutor(num_jobs) as ex:
        for recording, segments in tqdm(
                ex.map(
                    parse_utterance,
                    raw_manifests["audios"],
                    repeat(corpus_dir),
                    repeat(subsets),
                ),
                desc="Processing WenetSpeech JSON entries",
        ):
            for part in segments:
                manifests[part]["recordings"].append(recording)
                manifests[part]["supervisions"].extend(segments[part])

    for sub in subsets:
        recordings, supervisions = fix_manifests(
            recordings=RecordingSet.from_recordings(
                manifests[sub]["recordings"]),
            supervisions=SupervisionSet.from_segments(
                manifests[sub]["supervisions"]),
        )
        validate_recordings_and_supervisions(recordings=recordings,
                                             supervisions=supervisions)

        if output_dir is not None:
            supervisions.to_file(output_dir /
                                 f"wenetspeech_supervisions_{sub}.jsonl.gz")
            recordings.to_file(output_dir /
                               f"wenetspeech_recordings_{sub}.jsonl.gz")

        manifests[sub] = {
            "recordings": recordings,
            "supervisions": supervisions,
        }

    return manifests
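The manifest extraction above fans work out over a process pool and streams the results through tqdm. A minimal sketch of that pattern with a toy worker in place of parse_utterance:

from concurrent.futures import ProcessPoolExecutor
from itertools import repeat
from tqdm import tqdm

def work(item, offset):
    return item * item + offset

if __name__ == "__main__":
    with ProcessPoolExecutor(2) as ex:
        # repeat() broadcasts a constant argument to every worker call
        for result in tqdm(ex.map(work, range(10), repeat(1)),
                           total=10, desc="Processing toy items"):
            pass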
Example #31
0
File: general.py Project: pgierz/esm-viz
    def progress_bar(self, config, log):
        _, throughput, _ = log.compute_throughput()

        exp = config["basedir"]
        model = config["model"].lower()
        self._connect()
        date_filename = exp.split("/")[-1] + "_" + model + ".date"
        remote_command = ("cd " + config["basedir"] + "/scripts/; cat " +
                          date_filename + " |awk '{ print $1 }'")
        stdin, stdout, stderr = self.ssh.exec_command(remote_command)
        # stdout is now something like 19500101
        # Assume we get something like Y*YMMDD: cut off the last four digits
        # plus the trailing newline (hence [:-5]); we don't know how many
        # digits the year has, so we cut from the end

        remote_command = ("cd " + config["basedir"] + "/scripts/; cat " +
                          date_filename + " |awk '{ print $2 }'")
        stdin, stdout, stderr = self.ssh.exec_command(remote_command)
        current_run = int(stdout.readlines()[0])

        runscript_file = config.get("runscript",
                                    config["basedir"] + "/scripts/*run")
        # POTENTIAL BUG: These things are all very dependent on the runscript's way
        # of defining time control. It might be better to do this somehow
        # differently
        start_year = self.ssh.exec_command("grep INITIAL_DATE_" + model + " " +
                                           runscript_file)[1].readlines()[0]
        final_year = self.ssh.exec_command("grep FINAL_DATE_" + model + " " +
                                           runscript_file)[1].readlines()[0]
        # POTENTIAL BUG: What about people who run on a monthly basis?
        run_size = self.ssh.exec_command("grep NYEAR_" + model + " " +
                                         runscript_file)[1].readlines()[0]
        # Reformat to get just the years and run sizes
        start_year = int(start_year.split("=")[1].split("-")[0])
        final_year = int(final_year.split("=")[1].split("-")[0])
        run_size = int(stripComments(" ".join(run_size.split("=")[1].split())))

        total_number_of_runs = int((final_year - start_year) / run_size)
        years_per_day = throughput

        years_left = final_year - current_date
        days_left = years_left / years_per_day
        finishing_date = datetime.datetime.now() + datetime.timedelta(
            days=days_left)
        using_tqdm = False
        using_html = True
        if using_tqdm:
            r_bar = (" " + str(current_run) + "/" + str(total_number_of_runs) +
                     ", Throughput ~" + str(np.round(years_per_day, 2)) +
                     " runs/day")

            pbar = tqdm(
                total=total_number_of_runs,
                desc="Done on: " + finishing_date.strftime("%d %b, %Y"),
                bar_format="{n}/|/{l_bar} " + r_bar,
            )
            pbar.update(current_run)
            return pbar
        if using_html:

            DOC = ("""
<style>
#myProgress {
  width: 100%;
  background-color: #ddd;
}
""" + """
#myBar {
  width: """ + str(100 * current_run / total_number_of_runs) + "%" + """;
  height: 30px;
  background-color: #4CAF50;
  text-align: center;
  line-height: 30px;
  color: black;
}
</style>
<body>


""" + """Based on current throughput, this run may be done on: """ +
                   finishing_date.strftime("%d %b, %Y") + """
<div id="myProgress">
<div id="myBar">""" + str(100 * current_run / total_number_of_runs) +
                   """%</div>
</div>
""" + str(current_run) + "/" + str(total_number_of_runs) + " Throughput ~" +
                   str(np.round(years_per_day, 2)) + """ runs/day</body>""")
            return DOC
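The finishing-date estimate above is simple arithmetic on the remaining simulated years and the measured throughput. A toy sketch with made-up values:

import datetime

final_year, current_date, years_per_day = 2000, 1950, 2.5  # toy values
years_left = final_year - current_date
days_left = years_left / years_per_day
finishing_date = datetime.datetime.now() + datetime.timedelta(days=days_left)
print(finishing_date.strftime("%d %b, %Y"))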
Example #32
0
#     img = Image.fromarray(img)
#     img.save(filename)

# local path setting
path = '/home/ubuntu/context/data'
print(path)

path_cropped = '/home/ubuntu/context/context_encoder_pytorch-master_ver_1/dataset/train/annals'
path_cropped2 = '/home/ubuntu/context/context_encoder_pytorch-master_ver_1/dataset/val/annals'

# original data path
os.chdir(path)
file_list = os.listdir(os.getcwd())

cnt = 0
for l in tqdm(file_list):
    # a,b,...
    path_img = os.path.join(path, l)
    os.chdir(path_img)
    image_list = glob.glob('*.jpg')
    print(len(image_list))

    for i, img in tqdm(enumerate(image_list)):
        try:
            image = load_image(os.path.join(path_img, img))
            chunk = ImageChunker(256, 256, overlap=0)
            results = chunk.dimension_preprocess(image)
            print("=======cropped========>", img)
            cnt += 1
            if cnt < 20000:
                save_image(img, results, count=True)
        except Exception as exc:
            # assumed handler: the listing is truncated before the original
            # except clause, so this guard is reconstructed
            print("failed on", img, ":", exc)
Example #33
0
phases = ['train', 'val']
for f in ['images', 'labels']:
    if not os.path.isdir(f): os.mkdir(f)
    for p in phases:
        if not os.path.isdir(os.path.join(f, p)): os.mkdir(os.path.join(f, p))

image_folder = 'images'
df = pd.read_csv('train.csv', index_col=0)
box_count = df.groupby(level=0).count().width
#all_id = list(box_count.index)
all_id = list(set([i[:-4] for i in os.listdir('all_images')]))
idsets = train_test_split(all_id, test_size=0.1, random_state=7)

for p, ids in zip(phases, idsets):
    i = 0
    for iid in tqdm(ids):
        imagefile1 = os.path.join('all_images', iid + '.jpg')
        imagefile2 = os.path.join('images', p, iid + '.jpg')
        copyfile(imagefile1, imagefile2)
        fn = os.path.join('labels', p, iid + '.txt')
        if iid not in df.index:
            open(fn, 'w').close()
            continue
        idf = df.loc[iid]
        if isinstance(idf, pd.Series): idf = df.loc[[iid]]
        with open(fn, 'w') as fw:
            for _, row in idf.iterrows():
                wi, hi, bbox = row.width, row.height, row.bbox
                xb, yb, wb, hb = ast.literal_eval(bbox)
                yolo_row = [
                    0, (xb + wb / 2) / wi, (yb + hb / 2) / hi, wb / wi, hb / hi
                ]
                # the listing is truncated here; the original presumably writes
                # yolo_row to fw next, but that line is missing from the source
Example #34
0
    def on_epoch_end(self, learner):
        with torch.no_grad():
            model = learner.model
            model.eval()

            h_context = []
            h_candidate = []

            pb_h = tqdm(self.dl_holdout,
                        total=len(self.dl_holdout),
                        desc='MRR-Score: Calculate H')
            for batch in pb_h:
                data, targets = learner.to_device(
                    batch[0],
                    self.device), learner.to_device(batch[1], self.device)
                T_context, X_context, T_candidate, X_candidate = data

                h_context_batch = model.get_h_context(
                    X_context, T_context).detach().cpu().numpy()
                h_candidate_batch = model.get_h_candidate(
                    X_candidate, T_candidate).detach().cpu().numpy()

                h_context.append(h_context_batch)
                h_candidate.append(h_candidate_batch)

            h_context = flatten(h_context)
            h_candidate = np.vstack(h_candidate)

            dl_h_candidate = DataLoader(dataset=TensorDataset(
                torch.FloatTensor(h_candidate)),
                                        batch_size=self.dl_holdout.batch_size,
                                        shuffle=False)

            all_contexts_logits = []

            pb_h_context = tqdm(h_context,
                                total=len(h_context),
                                desc='Scoring')

            for h_context in pb_h_context:
                context_logits = []
                h_context_batch = torch.FloatTensor([h_context
                                                     ]).to(self.device)

                for h_candidate_batch in dl_h_candidate:
                    h_candidate_batch = h_candidate_batch[0].to(self.device)
                    context_logits_ = model.get_logits(
                        h_context_batch,
                        h_candidate_batch).detach().cpu().numpy()
                    context_logits.extend(context_logits_)

                all_contexts_logits.append(context_logits)

            all_contexts_logits = np.array(all_contexts_logits)
            ranks = np.abs(
                np.apply_along_axis(rankdata, 1, all_contexts_logits) -
                all_contexts_logits.shape[1]) + 1
            mrr = np.mean(1 / ranks.diagonal())

            self.tb_writer.add_scalar('MRRScore/valid', mrr,
                                      learner.cur_epoch)