Example #1
def create_batch_iter(mode):
    """构造迭代器"""
    processor, tokenizer = init_params()
    if mode == "train":
        examples = processor.get_train_examples()

        num_train_steps = int(
            len(examples) / args.train_batch_size /
            args.gradient_accumulation_steps * args.num_train_epochs)

        batch_size = args.train_batch_size

        logger.info("  Num steps = %d", num_train_steps)

    elif mode == "valid":
        examples = processor.get_valid_examples()
        batch_size = args.eval_batch_size
    else:
        raise ValueError("Invalid mode %s" % mode)

    label_list = processor.get_labels()

    # Features
    features = convert_examples_to_features(examples, label_list,
                                            args.max_seq_length, tokenizer)

    logger.info("  Num examples = %d", len(examples))
    logger.info("  Batch size = %d", batch_size)

    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                   dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in features],
                                 dtype=torch.long)
    all_output_mask = torch.tensor([f.output_mask for f in features],
                                   dtype=torch.long)

    # Dataset
    data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                         all_label_ids, all_output_mask)

    if mode == "train":
        sampler = RandomSampler(data)
    elif mode == "valid":
        sampler = SequentialSampler(data)
    else:
        raise ValueError("Invalid mode %s" % mode)

    # Iterator
    iterator = DataLoader(data, sampler=sampler, batch_size=batch_size)

    if mode == "train":
        return iterator, num_train_steps
    elif mode == "valid":
        return iterator
    else:
        raise ValueError("Invalid mode %s" % mode)
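The DataLoader yields batches in the same tensor order as the TensorDataset above. A minimal consumption sketch; the model call signature and `device` are assumptions, not part of the example:

train_iter, num_train_steps = create_batch_iter("train")
for step, batch in enumerate(train_iter):
    input_ids, input_mask, segment_ids, label_ids, output_mask = (
        t.to(device) for t in batch)  # hypothetical device
    loss = model(input_ids, segment_ids, input_mask,
                 label_ids, output_mask)  # hypothetical model signature
    loss.backward()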
Example #2
def create_inference_iter():
    processor, tokenizer = init_params()
    examples = processor.get_valid_examples()[:100]
    batch_size = args.inference_batch_size
    label_list = processor.get_labels()
    # Features
    features = convert_examples_to_features(examples, label_list,
                                            args.max_seq_length, tokenizer)

    logger.info("  Num examples = %d", len(examples))
    logger.info("  Batch size = %d", batch_size)

    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                   dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in features],
                                 dtype=torch.long)
    all_output_mask = torch.tensor([f.output_mask for f in features],
                                   dtype=torch.long)
    all_text = [''.join(example.text_a) for example in examples]
    logger.debug("Example texts: %s", all_text)
    # Dataset
    data = MyTensorDataset(all_text, all_input_ids, all_input_mask,
                           all_segment_ids, all_label_ids, all_output_mask)
    sampler = SequentialSampler(data)
    iterator = DataLoader(data, sampler=sampler, batch_size=batch_size)
    return iterator
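MyTensorDataset is referenced above but not defined in the snippet. One plausible sketch, mirroring torch.utils.data.TensorDataset while carrying the raw text alongside the tensors (hypothetical, not the author's actual class):

from torch.utils.data import Dataset

class MyTensorDataset(Dataset):
    """TensorDataset variant that also returns the raw text of each example."""

    def __init__(self, texts, *tensors):
        assert all(t.size(0) == len(texts) for t in tensors)
        self.texts = texts
        self.tensors = tensors

    def __getitem__(self, index):
        return (self.texts[index],) + tuple(t[index] for t in self.tensors)

    def __len__(self):
        return len(self.texts)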
Example #3
def load_and_cache_examples(args, task, tokenizer, evaluate=False, mode="train"):
    if args.local_rank not in [-1, 0] and not evaluate:
        torch.distributed.barrier()  # Make sure only the first process in distributed training processes the dataset; the others will use the cache

    processor = processors[task]()
    output_mode = output_modes[task]
    # Load data features from cache or dataset file
    cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}'.format(
        mode,
        list(filter(None, args.model_name_or_path.split('/'))).pop(),
        str(args.max_seq_length),
        str(task)))
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        if task in ['mnli', 'mnli-mm'] and args.model_type in ['roberta']:
            # HACK(label indices are swapped in RoBERTa pretrained model)
            label_list[1], label_list[2] = label_list[2], label_list[1]
        if mode == "train":
            examples = processor.get_train_examples(args.data_dir)
        elif mode == "dev":
            examples = processor.get_dev_examples(args.data_dir)
        elif mode == "test":
            examples = processor.get_test_examples(args.data_dir)
        """
        examples = (
            processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
        )"""
        features = convert_examples_to_features(examples,
                                                tokenizer,
                                                label_list=label_list,
                                                max_length=args.max_seq_length,
                                                output_mode=output_mode,
                                                pad_on_left=bool(args.model_type in ['xlnet']),
                                                # pad on the left for xlnet
                                                pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
                                                pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
                                                )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

    if args.local_rank == 0 and not evaluate:
        torch.distributed.barrier()  # Make sure only the first process in distributed training processes the dataset; the others will use the cache

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.float)

    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
    return dataset
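The cache file name encodes the split, the model name, the sequence length, and the task, so changing any of them forces re-featurization. To illustrate with hypothetical argument values:

mode, model_name, max_len, task = "train", "bert-base-uncased", 128, "mrpc"
cached = 'cached_{}_{}_{}_{}'.format(
    mode, list(filter(None, model_name.split('/'))).pop(), max_len, task)
assert cached == 'cached_train_bert-base-uncased_128_mrpc'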
Example #4
def create_batch_iter(mode, X, y, batch_size=1):
    """
    构造迭代器
    """
    processor, tokenizer = init_params()
    if mode == 'train':
        examples = processor.get_train_examples(X=X, y=y)
    elif mode == 'dev':
        examples = processor.get_dev_examples(X=X, y=y)
    elif mode == 'predict':
        examples = processor.get_examples(X=X)
    else:
        raise ValueError("Invalid mode %s" % mode)

    # Method 1: adapt the sequence length to the data
    if args.use_calculate_max_seq_length:
        max_seq_length = processor._calculate_max_seq_length(X=X)
        if args.max_seq_length < max_seq_length:
            max_seq_length = args.max_seq_length

    # Method 2: use a fixed sequence length
    else:
        max_seq_length = args.max_seq_length

    # Features
    features = convert_examples_to_features(examples=examples,
                                            max_seq_length=max_seq_length,
                                            tokenizer=tokenizer)

    all_input_ids = torch.LongTensor([f.input_ids for f in features])
    all_input_mask = torch.LongTensor([f.input_mask for f in features])
    all_label_ids = torch.LongTensor([f.label_id for f in features])
    all_output_mask = torch.LongTensor([f.output_mask for f in features])

    # Dataset
    data = TensorDataset(all_input_ids, all_input_mask, all_label_ids,
                         all_output_mask)

    if mode == "train":
        sampler = RandomSampler(data)
    elif mode == "dev":
        sampler = SequentialSampler(data)
    elif mode == 'predict':
        sampler = SequentialSampler(data)
    else:
        raise ValueError("Invalid mode %s" % mode)

    # Iterator
    iterator = DataLoader(data, sampler=sampler, batch_size=batch_size)

    return iterator
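processor._calculate_max_seq_length is not shown in the snippet; the intent is to size tensors to the longest example rather than always padding to args.max_seq_length (which still acts as a cap). A hedged sketch of such a helper, with the name and signature assumed from the call site:

def _calculate_max_seq_length(self, X):
    # assumes the processor holds a tokenizer; longest tokenized input
    # plus room for the [CLS] and [SEP] special tokens
    return max(len(self.tokenizer.tokenize(''.join(x))) for x in X) + 2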
Example #5
def load_and_cache_examples(args, tokenizer, evaluate=False):
    if args.local_rank not in [-1, 0] and not evaluate:
        torch.distributed.barrier()  # Make sure only the first process in distributed training processes the dataset; the others will use the cache

    processor = Processor()
    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args.data_dir,
        "cached_{}_{}_{}".format(
            "dev" if evaluate else "train",
            list(filter(None, args.model_name_or_path.split("/"))).pop(),
            str(args.max_seq_length)
        ),
    )
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating output_mode from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        examples = (
            processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
        )

        features = convert_examples_to_features(
            examples,
            tokenizer,
            label_list=label_list,
            max_length=args.max_seq_length,
            pad_on_left=False,  # bool(args.model_type in ["xlnet"]) would pad on the left for XLNet
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=0,  # would be 4 for XLNet
        )

        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

    if args.local_rank == 0 and not evaluate:
        torch.distributed.barrier()  # Make sure only the first process in distributed training processes the dataset; the others will use the cache

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    all_labels = torch.tensor([f.label for f in features], dtype=torch.long)

    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
    return dataset
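A hedged usage sketch for the function above (args.eval_batch_size is an assumed attribute):

from torch.utils.data import DataLoader, SequentialSampler

eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)
eval_sampler = SequentialSampler(eval_dataset)
eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler,
                             batch_size=args.eval_batch_size)
for input_ids, attention_mask, token_type_ids, labels in eval_dataloader:
    ...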
Example #6
def predict(model, path, label_list, tokenizer, test_filename='test.csv'):
    predict_processor = MultiLabelTextProcessor(path)
    test_examples = predict_processor.get_test_examples(path, test_filename, size=-1)

    # Hold input data for returning it
    input_data = [{'filename': input_example.guid} for input_example in test_examples]
    max_seq_length = 512
    test_features = convert_examples_to_features(
        test_examples, label_list, max_seq_length, tokenizer)

    logger.info("***** Running prediction *****")
    logger.info("  Num examples = %d", len(test_examples))
    logger.info("  Batch size = %d", 2)

    all_input_ids = torch.tensor([f.input_ids for f in test_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in test_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in test_features], dtype=torch.long)

    test_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids)

    # Run prediction for full data
    test_sampler = SequentialSampler(test_data)
    test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=2)

    all_logits = None

    model.eval()
    nb_eval_steps, nb_eval_examples = 0, 0
    for step, batch in enumerate(test_dataloader):
        input_ids, input_mask, segment_ids = batch
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)

        with torch.no_grad():
            logits = model(input_ids, segment_ids, input_mask)
            logits = logits.sigmoid()

        if all_logits is None:
            all_logits = logits.detach().cpu().numpy()
        else:
            all_logits = np.concatenate((all_logits, logits.detach().cpu().numpy()), axis=0)

        nb_eval_examples += input_ids.size(0)
        nb_eval_steps += 1

    return pd.merge(pd.DataFrame(input_data), pd.DataFrame(all_logits, columns=label_list), left_index=True, right_index=True)
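The returned DataFrame holds one sigmoid probability column per label, so hard multi-label predictions are a thresholding step away. A minimal sketch (the 0.5 cut-off is an assumption):

result = predict(model, path, label_list, tokenizer)
binary_preds = (result[label_list].values > 0.5).astype(int)  # hypothetical threshold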
Example #7
optimizer = AdamW(
    optimizer_grouped_parameters, lr=args['learning_rate'], correct_bias=False
)  # To reproduce BertAdam specific behavior set correct_bias=False
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=args['warmup_proportion'],
    num_training_steps=t_total)  # PyTorch scheduler
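# NOTE: the CyclicLR assignment below rebinds `scheduler`, replacing the
# linear-warmup scheduler created above.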
scheduler = CyclicLR(optimizer,
                     base_lr=2e-5,
                     max_lr=5e-5,
                     step_size=2500,
                     last_batch_iteration=0)

# Prepare training features

train_features = convert_examples_to_features(train_examples, label_list,
                                              args['max_seq_length'],
                                              tokenizer)
logger.info("***** Running training *****")
logger.info("  Num examples = %d", len(train_examples))
logger.info("  Batch size = %d", args['train_batch_size'])
logger.info("  Num steps = %d", num_train_steps)
all_input_ids = torch.tensor([f.input_ids for f in train_features],
                             dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in train_features],
                              dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                               dtype=torch.long)
all_label_ids = torch.tensor([f.label_ids for f in train_features],
                             dtype=torch.float)
train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                           all_label_ids)
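optimizer_grouped_parameters is used above but never built in this fragment; the usual transformers recipe exempts biases and LayerNorm weights from weight decay. A sketch under that assumption (`model` is the model being fine-tuned):

no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0},
]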
Example #8
print("Configuration: ", bert_config.to_json_string())

# Prepare data and data processors
dataset_processor = processors[task_name.lower()]
tokenizer = tokenization.FullTokenizer(vocab_file=bert_model_type +
                                       '/vocab.txt',
                                       do_lower_case=do_lower_case)

# Get labels
labels = dataset_processor.get_labels(data_dir)

# Training data
training_examples = dataset_processor.get_train_examples(data_dir)
train_data_file = os.path.join(datasets_dir, "train.tf_record")
data_processor.convert_examples_to_features(training_examples, labels,
                                            max_seq_length, tokenizer,
                                            train_data_file)
train_input_fn = utils.input_fn_builder(train_data_file,
                                        max_seq_length,
                                        is_training=True,
                                        drop_remainder=True)

# Evaluation data
evaluation_examples = dataset_processor.get_eval_examples(data_dir)
eval_data_file = os.path.join(datasets_dir, "eval.tf_record")
data_processor.convert_examples_to_features(evaluation_examples, labels,
                                            max_seq_length, tokenizer,
                                            eval_data_file)
eval_input_fn = utils.input_fn_builder(input_file=eval_data_file,
                                       seq_length=max_seq_length,
                                       is_training=False,
                                       drop_remainder=False)
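A hedged sketch of feeding these input functions to an Estimator; the estimator construction and the step count are not part of the snippet:

estimator.train(input_fn=train_input_fn,
                max_steps=num_train_steps)  # hypothetical estimator and step count
eval_result = estimator.evaluate(input_fn=eval_input_fn)
print("Eval results:", eval_result)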
Example #9
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument("--model_type",
                        default='bert',
                        type=str,
                        help="Model type selected in the list: " +
                        ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument(
        "--model_name_or_path",
        default='bert-base-uncased',
        type=str,
        help="Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS))
    parser.add_argument(
        "--output_dir",
        default='exp',
        type=str,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )

    # Other parameters
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=32,
        type=int,
        help=
        "The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--evaluate_during_training",
        action='store_true',
        default=True,
        help="Rul evaluation during training at each logging step.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")

    parser.add_argument("--per_gpu_train_batch_size",
                        default=16,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=16,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=2,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs",
                        default=12.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs."
    )
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")

    parser.add_argument('--logging_steps',
                        type=int,
                        default=100,
                        help="Log every X updates steps.")
    parser.add_argument('--save_steps',
                        type=int,
                        default=300,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action='store_true',
        help=
        "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number"
    )
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Avoid using CUDA when available")
    parser.add_argument('--overwrite_output_dir',
                        action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        '--overwrite_cache',
        action='store_true',
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")

    parser.add_argument(
        '--tpu',
        action='store_true',
        help="Whether to run on the TPU defined in the environment variables")
    parser.add_argument(
        '--tpu_ip_address',
        type=str,
        default='',
        help="TPU IP address if none are set in the environment variables")
    parser.add_argument(
        '--tpu_name',
        type=str,
        default='',
        help="TPU name if none are set in the environment variables")
    parser.add_argument(
        '--xrt_tpu_config',
        type=str,
        default='',
        help="XRT TPU config if none are set in the environment variables")

    parser.add_argument(
        '--fp16',
        action='store_true',
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit"
    )
    parser.add_argument(
        '--fp16_opt_level',
        type=str,
        default='O1',
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="For distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="For distant debugging.")

    args = parser.parse_args()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    # Set seed
    set_seed(args)

    d = DataProcessor()
    train_dir_path = './dataset'
    dev_dir_path = './dataset'
    test_dir_path = './dataset'

    dev_eg = d.get_dev_examples(dev_dir_path)
    train_eg = d.get_train_examples(train_dir_path)
    test_eg = d.get_test_examples(test_dir_path)

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    train_dataset = convert_features_to_dataset(
        convert_examples_to_features(examples=train_eg,
                                     label2id=LABEL2ID,
                                     max_seq_length=30,
                                     tokenizer=tokenizer))
    dev_dataset = convert_features_to_dataset(
        convert_examples_to_features(examples=dev_eg,
                                     label2id=LABEL2ID,
                                     max_seq_length=30,
                                     tokenizer=tokenizer))
    test_dataset = convert_features_to_dataset(
        convert_examples_to_features(examples=test_eg,
                                     label2id=LABEL2ID,
                                     max_seq_length=30,
                                     tokenizer=tokenizer))

    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(args.model_name_or_path,
                                          num_labels=2)
    model = BertForCS.from_pretrained(args.model_name_or_path,
                                      config=config,
                                      num_labels=2)
    model.to(args.device)
    train(args, train_dataset, model, dev_dataset)
    new_result, pred_to_write = evaluate(args, test_dataset, model)
    with open('./result/result.csv', 'w') as f:
        for i, r in enumerate(pred_to_write, 1):
            f.write('%d,%d\n' % (i, int(r)))
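set_seed is called near the top of main() but not defined in the snippet; the conventional transformers-style definition, assumed here, seeds every RNG in play:

import random

import numpy as np
import torch

def set_seed(args):
    # seed Python, NumPy and PyTorch (all GPUs) for reproducibility
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)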