Example #1
def update_url_id(message_file_name, directory):
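    """Resolve the public t.me link of a saved message via TDLib's
    getMessageLink and write the link's id back into the message's
    JSON file as 'url_id'."""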
    message_id = message_file_name.split('.')[0]
    print(message_id)
    r = tg.call_method('getMessageLink',
                       params={
                           'chat_id': chat_id,
                           'message_id': message_id
                       })
    r.wait()
    if not r.update:
        return
    if 'url' not in r.update:
        return
    url_id = r.update['url'].split('/')[-1]
    print('https://t.me/cyclingmarket/{}'.format(url_id))
    full_path = os.path.join(directory, message_file_name)
    data = util.load_json_file(full_path)
    data['url_id'] = url_id
    util.save_json_file(full_path, data)
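
A minimal driver for this function might look like the sketch below
(assuming, as in the later examples, that tg is an initialized TDLib client
wrapper and SINGLE_DIR is the directory of saved per-message JSON files):

for message_file_name in sorted(os.listdir(SINGLE_DIR)):
    if message_file_name.endswith('.json'):
        update_url_id(message_file_name, SINGLE_DIR)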
Example #2
def download_full_history():
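    """Page backwards through the full chat history via TDLib's
    getChatHistory (from_message_id=0 starts at the latest message),
    saving standalone messages under SINGLE_DIR and album messages
    under per-album sub-directories of ALBUMS_DIR."""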

    next_message_id = 0
    while True:
        r = tg.call_method('getChatHistory',
                           params={
                               'chat_id': chat_id,
                               'from_message_id': next_message_id,
                               'offset': 0,
                               'limit': 100,
                               'only_local': False,
                           })
        r.wait()
        update = r.update
        if not update or 'messages' not in update:
            print('no messages in update')
            break

        for message in update['messages']:
            media_album_id = message['media_album_id']
            if int(media_album_id) == 0:
                util.save_json_file(
                    os.path.join(SINGLE_DIR, '{}.json'.format(message['id'])),
                    message)
            else:
                album_dir = os.path.join(ALBUMS_DIR, media_album_id)
                if not os.path.exists(album_dir):
                    os.mkdir(album_dir)
                util.save_json_file(
                    os.path.join(album_dir, '{}.json'.format(message['id'])),
                    message)
            text = message.get('text', None)
            date = message['date']
            timestamp = datetime.datetime.fromtimestamp(date)
            print(timestamp.strftime('%Y-%m-%d %H:%M:%S'))
        if len(update['messages']) == 0:
            break
        next_message_id = update['messages'][-1]['id']
Example #3
def generate_model_outputs(args,
                           model,
                           tokenizer,
                           is_dev=False,
                           prefix='',
                           save_dir=''):
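    """Run the model over the train or dev set and pickle the features,
    SquadResults and tokenizer to save_dir for later ensembling."""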
    dataset, examples, features = load_and_cache_examples(args,
                                                          tokenizer,
                                                          evaluate=is_dev,
                                                          output_examples=True)
    logger.info(
        f'REAL number of examples {len(examples)} and features {len(features)}!'
    )

    if save_dir and args.local_rank in [-1, 0]:
        os.makedirs(save_dir, exist_ok=True)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    sampler = SequentialSampler(dataset)
    dataloader = DataLoader(dataset,
                            sampler=sampler,
                            batch_size=args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Output!
    logger.info("***** Generating outputs {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    # all_results = collections.defaultdict(list)
    all_results = []
    start_time = timeit.default_timer()
    for batch in tqdm(dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            if args.model_type in [
                    "xlm", "roberta", "distilbert", "camembert"
            ]:
                del inputs["token_type_ids"]

            example_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                # for lang_id-sensitive xlm models
                if hasattr(model, "config") and hasattr(
                        model.config, "lang2id"):
                    inputs.update({
                        "langs":
                        (torch.ones(batch[0].shape, dtype=torch.int64) *
                         args.lang_id).to(args.device)
                    })

            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
            # models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]

                result = SquadResult(
                    unique_id,
                    start_logits,
                    end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )

            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    print('# of results in all_results:', len(all_results))
    # Save features
    with open(os.path.join(save_dir, 'features.pkl'), 'wb') as f:
        pickle.dump(features, f)

    # Save all_results
    with open(os.path.join(save_dir, 'all_results.pkl'), 'wb') as f:
        pickle.dump(all_results, f)

    # Save tokenizer
    with open(os.path.join(save_dir, 'tokenizer.pkl'), 'wb') as f:
        pickle.dump(tokenizer, f)

    json_to_save = {
        'model_name': args.name,
        'type': 'dev' if is_dev else 'train',
        'num_examples': len(examples),
        'num_features': len(features)
    }
    util.save_json_file(os.path.join(save_dir, 'config.json'), json_to_save)
Example #4
def evaluate(args,
             model,
             tokenizer,
             prefix="",
             save_dir='',
             save_log_path=None):
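    """Standard SQuAD evaluation: run the model over the dev set,
    post-process the logits into text predictions and score them with
    squad_evaluate."""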
    dataset, examples, features = load_and_cache_examples(args,
                                                          tokenizer,
                                                          evaluate=True,
                                                          output_examples=True)

    if save_dir and args.local_rank in [-1, 0]:
        os.makedirs(save_dir, exist_ok=True)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            if args.model_type in [
                    "xlm", "roberta", "distilbert", "camembert"
            ]:
                del inputs["token_type_ids"]

            example_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                # for lang_id-sensitive xlm models
                if hasattr(model, "config") and hasattr(
                        model.config, "lang2id"):
                    inputs.update({
                        "langs":
                        (torch.ones(batch[0].shape, dtype=torch.int64) *
                         args.lang_id).to(args.device)
                    })

            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
            # models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]

                result = SquadResult(
                    unique_id,
                    start_logits,
                    end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )

            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)",
                evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(save_dir,
                                          "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        save_dir, "nbest_predictions_{}.json".format(prefix))

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(
            save_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    # XLNet and XLM use a more complex post-processing procedure
    if args.model_type in ["xlnet", "xlm"]:
        start_n_top = model.config.start_n_top if hasattr(
            model, "config") else model.module.config.start_n_top
        end_n_top = model.config.end_n_top if hasattr(
            model, "config") else model.module.config.end_n_top

        predictions = compute_predictions_log_probs(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            start_n_top,
            end_n_top,
            args.version_2_with_negative,
            tokenizer,
            args.verbose_logging,
        )
    else:
        predictions = compute_predictions_logits(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            args.do_lower_case,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            args.verbose_logging,
            args.version_2_with_negative,
            args.null_score_diff_threshold,
            tokenizer,
        )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)

    # save log to file
    if save_log_path:
        util.save_json_file(save_log_path, results)

    return results
Example #5
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter(args.save_dir)

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(os.path.join(
            args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
                os.path.join(args.model_name_or_path, "scheduler.pt")):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )

        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 1
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path):
        try:
            # set global_step to global_step of last saved checkpoint from model path
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split(
                "/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) //
                                             args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (
                len(train_dataloader) // args.gradient_accumulation_steps)

            logger.info(
                "  Continuing training from checkpoint, will skip to saved global_step"
            )
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d",
                        global_step)
            logger.info("  Will skip the first %d steps in the first epoch",
                        steps_trained_in_current_epoch)
        except ValueError:
            logger.info("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(epochs_trained,
                            int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    # Added here for reproducibility
    set_seed(args)

    # global var for current best f1
    cur_best_f1 = 0.0

    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            model.train()
            batch = tuple(t.to(args.device) for t in batch)

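            # Training inputs include the gold answer span: batch[3]/batch[4]
            # are the start/end token positions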
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
                "start_positions": batch[3],
                "end_positions": batch[4],
            }

            if args.model_type in [
                    "xlm", "roberta", "distilbert", "camembert"
            ]:
                del inputs["token_type_ids"]

            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[5], "p_mask": batch[6]})
                if args.version_2_with_negative:
                    inputs.update({"is_impossible": batch[7]})
                if hasattr(model, "config") and hasattr(
                        model.config, "lang2id"):
                    inputs.update({
                        "langs":
                        (torch.ones(batch[0].shape, dtype=torch.int64) *
                         args.lang_id).to(args.device)
                    })

            outputs = model(**inputs)
            # model outputs are always tuple in transformers (see doc)
            loss = outputs[0]
            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel (not distributed) training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1
                eval_results = None  # Evaluation result

                # Log metrics
                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Only evaluate when single GPU otherwise metrics may not average well
                    if args.local_rank == -1 and args.evaluate_during_training:
                        # Create output dir/path
                        output_dir = os.path.join(
                            args.output_dir,
                            "checkpoint-{}".format(global_step))
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        output_path = os.path.join(output_dir,
                                                   'eval_result.json')

                        # Get eval results and save the log to output path
                        eval_results = evaluate(args,
                                                model,
                                                tokenizer,
                                                save_dir=output_dir,
                                                save_log_path=output_path)
                        for key, value in eval_results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value,
                                                 global_step)

                        # log eval result
                        logger.info(
                            f"Evaluation result at {global_step} step: {eval_results}"
                        )
                    tb_writer.add_scalar("lr",
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss

                # Save model checkpoint
                if args.local_rank in [
                        -1, 0
                ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                    output_dir = os.path.join(
                        args.output_dir,
                        'cur_best') if args.save_best_only else os.path.join(
                            args.output_dir,
                            "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)

                    # Get eval results and save the log to output path
                    if args.local_rank in [-1, 0
                                           ] and args.evaluate_during_saving:
                        output_path = os.path.join(output_dir,
                                                   'eval_result.json')
                        eval_results = evaluate(args,
                                                model,
                                                tokenizer,
                                                save_dir=output_dir,
                                                save_log_path=None)
                        for key, value in eval_results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value,
                                                 global_step)
                        # log eval result
                        logger.info(
                            f"Evaluation result at {global_step} step: {eval_results}"
                        )
                        # save current result at args.output_dir
                        if os.path.exists(
                                os.path.join(args.output_dir,
                                             "eval_result.json")):
                            util.read_and_update_json_file(
                                os.path.join(args.output_dir,
                                             "eval_result.json"),
                                {global_step: eval_results})
                        else:
                            util.save_json_file(
                                os.path.join(args.output_dir,
                                             "eval_result.json"),
                                {global_step: eval_results})

                    # Save cur best model only
                    # Take care of distributed/parallel training
                    if (eval_results and cur_best_f1 < eval_results['f1']
                        ) or not args.save_best_only:
                        if eval_results and cur_best_f1 < eval_results['f1']:
                            cur_best_f1 = eval_results['f1']
                        model_to_save = model.module if hasattr(
                            model, "module") else model
                        model_to_save.save_pretrained(output_dir)
                        tokenizer.save_pretrained(output_dir)

                        torch.save(
                            args, os.path.join(output_dir,
                                               "training_args.bin"))
                        logger.info("Saving model checkpoint to %s",
                                    output_dir)

                        torch.save(optimizer.state_dict(),
                                   os.path.join(output_dir, "optimizer.pt"))
                        torch.save(scheduler.state_dict(),
                                   os.path.join(output_dir, "scheduler.pt"))
                        logger.info(
                            "Saving optimizer and scheduler states to %s",
                            output_dir)

                        if args.save_best_only:
                            util.save_json_file(
                                os.path.join(output_dir, "eval_result.json"),
                                {global_step: eval_results})

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
Example #6
def evaluate(args,
             model,
             tokenizer,
             prefix="",
             save_dir='',
             save_log_path=None):
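    """Evaluation variant for a model with an extra answerability (CLS)
    head: besides SQuAD span metrics it tallies a confusion matrix for
    the has-answer classifier and, with args.force_cls_pred, lets the
    classifier force empty predictions for unanswerable questions."""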
    dataset, examples, features = load_and_cache_examples(args,
                                                          tokenizer,
                                                          evaluate=True,
                                                          output_examples=True)

    if save_dir and args.local_rank in [-1, 0]:
        os.makedirs(save_dir, exist_ok=True)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    # y_cls_correct = 0
    # y_cls_incorrect = 0
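    # Confusion counts for the answerability classifier head:
    # tp/tn = has-answer / no-answer predicted correctly, fp/fn = the reverse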
    y_cls_tp, y_cls_tn, y_cls_fp, y_cls_fn = 0, 0, 0, 0
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            if args.model_type in [
                    "xlm", "roberta", "distilbert", "camembert"
            ]:
                del inputs["token_type_ids"]

            example_indices = batch[3]
            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                # for lang_id-sensitive xlm models
                if hasattr(model, "config") and hasattr(
                        model.config, "lang2id"):
                    inputs.update({
                        "langs":
                        (torch.ones(batch[0].shape, dtype=torch.int64) *
                         args.lang_id).to(args.device)
                    })

            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            is_impossible = eval_feature.is_impossible

            output = [to_list(output[i]) for output in outputs]

            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
            # models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]

                result = SquadResult(
                    unique_id,
                    start_logits,
                    end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )

            else:
                start_logits, end_logits, logits_cls, prob_cls = output

                prob_cls = np.asarray(prob_cls, dtype=float)  # np.float was removed from NumPy
                predict_cls = np.argmax(prob_cls)

                if predict_cls == int(not is_impossible):
                    if is_impossible:
                        y_cls_tn += 1
                    else:
                        y_cls_tp += 1
                else:
                    if is_impossible:
                        y_cls_fp += 1
                    else:
                        y_cls_fn += 1
                result = SquadResult(unique_id, start_logits, end_logits)
                # Add cls prediction
                if args.force_cls_pred:
                    result.prob_cls = prob_cls

            all_results.append(result)

    # print(y_cls_correct, y_cls_incorrect)
    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)",
                evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(save_dir,
                                          "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        save_dir, "nbest_predictions_{}.json".format(prefix))

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(
            save_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    # XLNet and XLM use a more complex post-processing procedure
    if args.model_type in ["xlnet", "xlm"]:
        start_n_top = model.config.start_n_top if hasattr(
            model, "config") else model.module.config.start_n_top
        end_n_top = model.config.end_n_top if hasattr(
            model, "config") else model.module.config.end_n_top

        predictions = compute_predictions_log_probs(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            start_n_top,
            end_n_top,
            args.version_2_with_negative,
            tokenizer,
            args.verbose_logging,
        )
    else:
        predictions = compute_predictions_logits(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            args.do_lower_case,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            args.verbose_logging,
            args.version_2_with_negative,
            args.null_score_diff_threshold,
            tokenizer,
        )

    if args.force_cls_pred:
        example_index_to_features = collections.defaultdict(list)
        for feature in features:
            example_index_to_features[feature.example_index].append(feature)

        unique_id_to_result = {}
        for result in all_results:
            unique_id_to_result[result.unique_id] = result

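        # Average the classifier's no-answer probability (index 0) across an
        # example's features; if it is >= 0.8, force an empty-string prediction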
        n_force = 0
        for example_index, example in enumerate(examples):
            eval_features = example_index_to_features[example_index]
            prob = []
            for eval_feature in eval_features:
                eval_result = unique_id_to_result[eval_feature.unique_id]
                prob.append(eval_result.prob_cls[0])

            if np.mean(prob) >= 0.8:
                predictions[example.qas_id] = ""
                n_force += 1

        print("\n")
        print("num of force prediction:", n_force)
    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)

    # Guard against zero denominators: the counters stay at 0 when the
    # XLNet/XLM branch runs, since it never exercises the classifier head
    total_cls = y_cls_tn + y_cls_tp + y_cls_fn + y_cls_fp
    cls_accuracy = (y_cls_tn + y_cls_tp) / total_cls if total_cls else 0.0
    cls_no_ans_accuracy = y_cls_tn / ((y_cls_tn + y_cls_fp) or 1)
    cls_has_ans_accuracy = y_cls_tp / ((y_cls_tp + y_cls_fn) or 1)
    # Add CLS accuracy to result
    results.update({
        'cls_accuracy': cls_accuracy,
        'cls_no_ans_accuracy': cls_no_ans_accuracy,
        'cls_has_ans_accuracy': cls_has_ans_accuracy
    })
    # save log to file
    if save_log_path:
        util.save_json_file(save_log_path, results)

    return results
Example #7
def main():
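    """Entry point for the ensemble runs: sets up the save directory,
    logging, seeding and CUDA/distributed state, then optionally trains
    an ensemble model, evaluates checkpoints and runs ensemble voting."""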
    args = get_bert_args()

    assert not (args.do_output
                and args.do_train), 'Don\'t output and train at the same time!'
    if args.do_output:
        sub_dir_prefix = 'output'
    elif args.do_train:
        sub_dir_prefix = 'train'
    else:
        sub_dir_prefix = 'test'
    # Override: ensemble runs always go into their own sub-directory
    sub_dir_prefix = 'ensemble3'
    args.save_dir = util.get_save_dir(args.save_dir, args.name, sub_dir_prefix)
    args.output_dir = args.save_dir

    global logger
    logger = util.get_logger(args.save_dir, args.name)

    if args.doc_stride >= args.max_seq_length - args.max_query_length:
        logger.warning(
            "WARNING - You've set a doc stride which may be superior to the document length in some "
            "examples. This could result in errors when building features from the examples. Please reduce the doc "
            "stride or increase the maximum length to ensure the features are correctly built."
        )

    if not args.evaluate_during_saving and args.save_best_only:
        raise ValueError("No best result without evaluation during saving")

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    if args.local_rank == 0:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    logger.info("Training/evaluation parameters %s", args)

    # Before we do anything with models, we want to ensure that we get fp16 execution of torch.einsum if args.fp16 is set.
    # Otherwise it'll default to "promote" mode, and we'll get fp32 operations. Note that running `--fp16_opt_level="O2"` will
    # remove the need for this code, but it is still valid.
    if args.fp16:
        try:
            import apex
            apex.amp.register_half_function(torch, "einsum")
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )

    # Training
    if args.do_train and (args.do_weighted_ensemble or args.do_stack_ensemble):
        examples, features, train_dataset, tokenizer, n_models = load_combined_examples(
            args, evaluate=False)
        model = EnsembleQA(
            n_models) if args.do_weighted_ensemble else EnsembleStackQA(
                n_models)
        model = model.to(args.device)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

    # Save the trained model and the tokenizer
    if args.do_train and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        model_to_save = model.module if hasattr(model, "module") else model
        # model_to_save.save_pretrained(output_dir)  # BertQA is not a PreTrainedModel class
        torch.save(model_to_save,
                   os.path.join(args.output_dir,
                                'pytorch_model.bin'))  # save entire model
        tokenizer.save_pretrained(args.output_dir)  # save tokenizer

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
        model = torch.load(
            os.path.join(args.output_dir, 'cur_best', 'pytorch_model.bin'))
        model.to(args.device)

    # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        if args.do_train:
            logger.info(
                "Loading checkpoints saved during training for evaluation")
            checkpoints = [args.output_dir]
            if args.eval_all_checkpoints:
                checkpoints = list(
                    os.path.dirname(c) for c in sorted(
                        glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME,
                                  recursive=True)))
                # logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce model loading logs
        else:
            logger.info("Loading checkpoint %s for evaluation",
                        args.model_name_or_path)
            checkpoints = [args.eval_dir]
        logger.info("Evaluate the following checkpoints: %s", checkpoints)

        for checkpoint in checkpoints:
            # Reload the model
            global_step = checkpoint.split(
                "-")[-1] if len(checkpoints) > 1 else ""
            # Load a trained model and vocabulary that you have fine-tuned
            model = torch.load(
                os.path.join(args.output_dir, 'cur_best', 'pytorch_model.bin'))
            model.to(args.device)

            # Evaluate
            result, all_predictions = evaluate(args,
                                               model,
                                               tokenizer,
                                               prefix=global_step,
                                               save_dir=args.output_dir,
                                               save_log_path=os.path.join(
                                                   checkpoint,
                                                   'eval_result.json'))

            result = dict(
                (k + ("_{}".format(global_step) if global_step else ""), v)
                for k, v in result.items())
            results.update(result)

            logger.info(
                f'Converting format and writing submission file to directory {args.output_dir}...'
            )
            util.save_json_file(
                os.path.join(args.output_dir, 'cur_best', 'predictions_.json'),
                all_predictions)
            util.convert_submission_format_and_save(
                args.output_dir,
                prediction_file_path=os.path.join(args.output_dir, 'cur_best',
                                                  'predictions_.json'))

    logger.info("Results: {}".format(results))

    # Generate ensemble output
    if args.do_ensemble_voting and args.local_rank in [-1, 0]:
        results = ensemble_vote(args,
                                save_dir=args.save_dir,
                                predict_prob_mode='add')

    return results
Example #8
def ensemble_vote(args,
                  save_dir='',
                  save_log_path=None,
                  prefix='',
                  predict_prob_mode='add'):
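    """Ensemble saved per-model predictions by probability voting: for
    each question, take the answer of the model with the highest
    (optionally grid-search-weighted) prediction probability."""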
    examples, all_model_features, all_model_results, tokenizers = load_saved_examples(
        args, evaluate=True)

    if save_dir and args.local_rank in [-1, 0]:
        os.makedirs(save_dir, exist_ok=True)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    # eval_sampler = SequentialSampler(dataset)
    # eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Pure voting over saved per-model results: no model is loaded in this
    # function, so there is nothing to wrap in DataParallel

    # Eval!
    logger.info(f"***** Running ensemble {prefix} *****")
    logger.info("  Num examples = %d", len(examples))
    logger.info("  Batch size = %d", args.eval_batch_size)

    # We do pure voting now, not taking new inputs
    # start_time = timeit.default_timer()
    # evalTime = timeit.default_timer() - start_time
    # logger.info(" Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(save_dir,
                                          "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        save_dir, "nbest_predictions_{}.json".format(prefix))
    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(
            save_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    all_predictions = []
    all_probs = []
    logger.info(f'predict_prob_mode: {predict_prob_mode}')
    for model_idx in tqdm(range(len(tokenizers)), desc="Predicting"):
        features = all_model_features[model_idx]
        all_results = all_model_results[model_idx]
        tokenizer = tokenizers[model_idx]

        predictions, probs = hack.compute_predictions_logits(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            args.do_lower_case,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            args.verbose_logging,
            args.version_2_with_negative,
            args.null_score_diff_threshold,
            tokenizer,
            prob_mode=predict_prob_mode)
        all_predictions.append(predictions)
        all_probs.append(probs)
        # continue

    # num of predictions
    num_of_predictions = len(all_predictions[0])
    logger.info(f'Number of predictions {num_of_predictions}')

    final_predictions = collections.OrderedDict()
    output_result = collections.OrderedDict()
    # Grid Search
    if args.do_grid_search:
        grid_search_results = collections.OrderedDict()
        grid_search_predictions = collections.OrderedDict()
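        # Exhaustive grid search: try every integer weight vector in
        # {0..5}^n_models and keep the one with the best exact + F1 sum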
        for weights in product(np.arange(6), repeat=len(all_probs)):
            if not any(weights):  # skip the all-zero weight vector
                continue
            for qas_id in all_predictions[0].keys():
                probs = np.array([d_prob[qas_id] for d_prob in all_probs])
                for i, w in enumerate(weights):
                    probs[i] *= w

                idx = np.argmax(probs)
                final_predictions[qas_id] = all_predictions[idx][qas_id]
            """
            logger.info('Model individual results')
            for i in range(len(tokenizers)):
                results = squad_evaluate(examples, all_predictions[i])
                logger.info(results)
            """
            # Compute the F1 and exact scores.
            logger.info(f'Weights: {weights}')
            logger.info('Ensemble results')
            final_results = squad_evaluate(examples, final_predictions)
            logger.info(final_results)

            if len(grid_search_results) == 0:
                best_weights = weights
                grid_search_results = final_results
                # Copy: final_predictions is overwritten on later iterations
                grid_search_predictions = final_predictions.copy()
            else:
                if grid_search_results['exact'] + grid_search_results[
                        'f1'] < final_results['exact'] + final_results['f1']:
                    best_weights = weights
                    grid_search_results = final_results
                    grid_search_predictions = final_predictions.copy()
        # save log to file
        logger.info(f'Best Weights: {best_weights}')
        # str(): tuple keys are not JSON-serializable
        output_result[str(best_weights)] = grid_search_results
        util.save_json_file(os.path.join(save_dir, 'eval_results.json'),
                            output_result)

        # save prediction to file
        # TODO save grid search best
        util.save_json_file(os.path.join(save_dir, 'predictions_.json'),
                            grid_search_predictions)
        util.convert_submission_format_and_save(
            save_dir,
            prediction_file_path=os.path.join(save_dir, 'predictions_.json'))

        return grid_search_results
    else:
        for qas_id in all_predictions[0].keys():
            probs = np.array([d_prob[qas_id] for d_prob in all_probs])

            idx = np.argmax(probs)
            final_predictions[qas_id] = all_predictions[idx][qas_id]

        logger.info('Model individual results')
        for i in range(len(tokenizers)):
            results = squad_evaluate(examples, all_predictions[i])
            logger.info(results)
        # Compute the F1 and exact scores.
        logger.info('Ensemble results')
        final_results = squad_evaluate(examples, final_predictions)
        logger.info(final_results)

        # save log to file
        util.save_json_file(os.path.join(save_dir, 'eval_results.json'),
                            final_results)

        util.save_json_file(os.path.join(save_dir, 'predictions_.json'),
                            final_predictions)
        util.convert_submission_format_and_save(
            save_dir,
            prediction_file_path=os.path.join(save_dir, 'predictions_.json'))

        return final_results
Example #9
def evaluate(args,
             model,
             tokenizer,
             prefix="",
             save_dir='',
             save_log_path=None,
             return_predicts=True):
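    """Evaluate an ensemble model whose inputs are the member models'
    precomputed start/end logits rather than token ids."""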
    examples, features, dataset, tokenizer, n_models = load_combined_examples(
        args, evaluate=True)

    if save_dir and args.local_rank in [-1, 0]:
        os.makedirs(save_dir, exist_ok=True)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "predict_start_logits": batch[0],
                "predict_end_logits": batch[1],
            }
            example_indices = batch[2]
            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            is_impossible = eval_feature.is_impossible

            output = [to_list(output[i]) for output in outputs]
            start_logits, end_logits = output
            result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)",
                evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(save_dir,
                                          "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        save_dir, "nbest_predictions_{}.json".format(prefix))

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(
            save_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    predictions, probs = hack.compute_predictions_logits(
        examples,
        features,
        all_results,
        args.n_best_size,
        args.max_answer_length,
        args.do_lower_case,
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        args.verbose_logging,
        args.version_2_with_negative,
        args.null_score_diff_threshold,
        tokenizer,
        prob_mode='add')

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)

    # save log to file
    if save_log_path:
        util.save_json_file(save_log_path, results)

    if return_predicts:
        return results, predictions
    return results
Example #10
def process_singles():
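    """Turn saved single-message files into structured product records:
    caption entities identify the product name, price, description and
    seller (resolved via getUser), and the result is written to
    PROCESSED_DIR/<url_id>/data.json."""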
    singles = os.listdir(SINGLE_DIR)
    for fname in singles:
        fname = os.path.join(SINGLE_DIR, fname)
        data = util.load_json_file(fname)

        if 'content' not in data:
            continue
        if 'caption' not in data['content']:
            continue

        content = data['content']
        caption = content['caption']
        text = caption['text']

        prod_caption_ent = None
        prod_price_ent = None
        prod_seller_ent = None
        prod_descr_ent = None
        hashtag_ents = []
        entities = caption['entities']
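        # Heuristic mapping of caption entities: first bold run = product
        # caption, second bold = price, italic = description,
        # mention = seller; hashtags are collected separately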
        for e in entities:
            entity_type = e['type']['@type']
            if entity_type == 'textEntityTypeHashtag':
                hashtag_ents.append(e)
            if entity_type == 'textEntityTypeBold':
                if not prod_caption_ent:
                    prod_caption_ent = e
                else:
                    prod_price_ent = e
            if entity_type == 'textEntityTypeItalic':
                prod_descr_ent = e
            if entity_type == 'textEntityTypeMentionName':
                prod_seller_ent = e

        if prod_caption_ent is None or prod_price_ent is None or prod_seller_ent is None or prod_descr_ent is None:
            continue

        product_hashtags = []
        for h in hashtag_ents:
            product_hashtags.append(get_from_text(text, h))
        product_caption = get_from_text(text, prod_caption_ent)
        product_descr = get_from_text(text, prod_descr_ent)
        product_price = get_from_text(text, prod_price_ent)
        product_seller_name = get_from_text(text, prod_seller_ent)
        product_city = get_city_from_text(text, prod_price_ent,
                                          prod_seller_ent)

        product_seller_id = prod_seller_ent['type']['user_id']

        photo_file_id = content['photo']['sizes'][-1]['photo']['remote']['id']

        r = tg.call_method('getUser', params={'user_id': product_seller_id})
        r.wait()
        seller = r.update

        product = {
            'hashtags': product_hashtags,
            'caption': product_caption,
            'descr': product_descr,
            'price': product_price,
            'city': product_city,
            'seller': {
                'id': product_seller_id,
                'full_name': product_seller_name,
                'username': seller['username'],
                'first_name': seller['first_name'],
                'last_name': seller['last_name'],
                'profile_photo': seller.get('profile_photo', None),
            },
            'photo': photo_file_id,
            'date': data['date']
        }

        url_id = data['url_id']
        pr_dir = os.path.join(PROCESSED_DIR, url_id)
        create_dir(pr_dir)

        util.save_json_file(os.path.join(pr_dir, 'data.json'), product)

        print(product)
Example #11
        if isinstance(json['city'], dict):
            continue
        city_clear = clear_city_string(json['city'])
        cities.add(city_clear)

        if city_clear in SPB:
            json['city'] = {'id': 2, 'text': json['city']}
        elif city_clear in MOSCOW:
            json['city'] = {'id': 1, 'text': json['city']}
        elif city_clear == 'ростов-на-дону':
            json['city'] = {'id': 119, 'text': json['city']}
        elif city_clear == 'великийновгород':
            json['city'] = {'id': 35, 'text': json['city']}
        elif city_clear == 'вологда':
            json['city'] = {'id': 41, 'text': json['city']}
        elif city_clear == 'минск':
            json['city'] = {'id': 282, 'text': json['city']}
        elif city_clear == 'уфа':
            json['city'] = {'id': 151, 'text': json['city']}
        elif city_clear == 'казань':
            json['city'] = {'id': 60, 'text': json['city']}
        elif city_clear == 'пенза':
            json['city'] = {'id': 109, 'text': json['city']}
        elif city_clear == 'краснодар':
            json['city'] = {'id': 72, 'text': json['city']}

        util.save_json_file(filename, json)

    for c in cities:
        print(c)
Example #12
if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("give me a directory with .json files of messages from the channel")
        sys.exit(1)

    dir_name = sys.argv[1]
    messages = glob.glob(os.path.join(dir_name, "messages*.json"))
    regexp_hash = re.compile("hash")
    messages = [x for x in messages if not regexp_hash.search(x)]
    # messages = glob.glob(os.path.join(dir_name, "messages25.json"))

    for messages_json_filename in messages:
        print(messages_json_filename)
        goods = util.load_json_file(messages_json_filename)
        goods_count = len(goods)
        i = 0
        for g in goods:
            if len(g['seller']) <= 17:
                continue
            photo_link_jpg = g['photo_link']
            photo_hash = get_photo_hash(photo_link_jpg)
            g['hash'] = photo_hash
            print(i, "/", goods_count, " ", photo_hash)
            i += 1

        json_filename = os.path.splitext(
            messages_json_filename)[0] + "_hash.json"
        util.save_json_file(json_filename, goods)
        os.remove(messages_json_filename)