Example #1
import logging
import os
import pickle
import random

import torch
from torch.utils.data import TensorDataset

# Assumed to come from the surrounding repo (the names match the transformers
# SQuAD example utilities): read_squad_examples, convert_examples_to_features.
logger = logging.getLogger(__name__)


def load_and_cache_examples(args,
                            tokenizer,
                            evaluate=False,
                            output_examples=False,
                            meta_training=False,
                            sampled_keys=None):
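    """Load SQuAD-style features from a cache file, or build and cache them.

    Returns a TensorDataset (plus examples and features when
    output_examples=True), or a [train, test] TensorDataset pair when
    meta_training=True.
    """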
    # Load data features from cache or dataset file
    input_file = args.predict_file if evaluate else args.train_file
    model_name = args.model_type
    cached_features_file = os.path.join(
        os.path.dirname(input_file),
        'cached_{}_{}_{}_{}'.format(args.qa_dataset,
                                    'dev' if evaluate else 'train', model_name,
                                    str(args.max_seq_length)))

    print("Load %s" % cached_features_file)

    cached_examples_file = os.path.join(
        os.path.dirname(input_file),
        "examples_{}_valid.pkl".format(args.qa_dataset))
    if os.path.exists(cached_features_file) and not args.overwrite_cache \
            and not output_examples:
        logger.info("Loading features from cached file %s",
                    cached_features_file)
        with open(cached_features_file, 'rb') as f:
            features = pickle.load(f)

    elif os.path.exists(cached_features_file) and output_examples \
            and os.path.exists(cached_examples_file):
        logger.info("Loading features and examples from cached file %s",
                    cached_features_file)
        with open(cached_features_file, 'rb') as f:
            features = pickle.load(f)
        with open(cached_examples_file, 'rb') as f:
            examples = pickle.load(f)

    else:
        logger.info("Creating features from dataset file at %s", input_file)
        examples = read_squad_examples(
            input_file=input_file,
            is_training=not evaluate,
            version_2_with_negative=args.version_2_with_negative)

        features = convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
            cls_token=tokenizer.cls_token,
            sep_token=tokenizer.sep_token,
            pad_token=tokenizer.convert_tokens_to_ids(
                [tokenizer.pad_token])[0])

        if not evaluate:
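            # Group training features by paragraph: context_id -> list of
            # features, so whole paragraphs can later be sampled as tasks.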
            max_id = examples[-1].context_id
            tmp_features = {str(i): [] for i in range(max_id + 1)}
            for feature in features:
                context_id = feature.context_id
                tmp_features[str(context_id)].append(feature)
            features = tmp_features

        if evaluate and args.local_rank in [-1, 0]:
            logger.info("Dumping evaluation examples to cached file %s",
                        cached_examples_file)
            with open(cached_examples_file, 'wb') as f:
                pickle.dump(examples, f)

        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s",
                        cached_features_file)
            # torch.save(features, cached_features_file)
            with open(cached_features_file, 'wb') as f:
                pickle.dump(features, f)

    if not evaluate and args.matching_sample:
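        # matching_sample: features for the pre-sampled context keys form the
        # train (support) pool; a same-sized test (query) pool is drawn below.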
        train_features = []
        for key in sampled_keys:
            feature = features[key]
            train_features += feature
        train_feature_length = len(train_features) if args.truncate_task_dataset < 0 \
                               else min(args.truncate_task_dataset, len(train_features))
        # dict views are not sequences, so materialize before sampling
        extracted_features = random.sample(list(features.items()),
                                           len(features))

        if args.fix_task > 0:
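            # fix_task > 0: reuse a fixed validation set of (at most)
            # args.fix_task features, cached on disk across runs.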
            print("Load Fixed Valid Task...")
            feature_length = args.fix_task
            valid_features_file = cached_features_file + "_valid_" + str(
                feature_length)
            if os.path.exists(valid_features_file):
                with open(valid_features_file, 'rb') as f:
                    test_features = pickle.load(f)
            else:
                sub_features = []
                for feature in features.items():
                    sub_features += feature[1]
                sub_len = len(sub_features)
                feature_length = sub_len if args.truncate_task_dataset < 0 \
                                 else min(sub_len, feature_length)
                test_features = random.sample(sub_features, feature_length)
                with open(valid_features_file, 'wb') as f:
                    pickle.dump(test_features, f)

        else:
            test_features = []
            count = 0
            for feature in extracted_features:
                if count >= train_feature_length:
                    break
                test_features += feature[1]
                count += len(feature[1])
            test_features = random.sample(test_features, len(test_features))
            test_features = test_features[:train_feature_length]
        if args.truncate_task_dataset > 0:
            train_features = random.sample(train_features,
                                           train_feature_length)
        print('Extracted train features --> %s' % len(train_features))
        print('Extracted test features --> %s' % len(test_features))

        if not meta_training:
            features = train_features
    elif not evaluate:
        all_features = []
        for idx, feature in features.items():
            all_features += feature
        features = all_features

    if args.local_rank == 0 and not evaluate:
        # Make sure only the first process in distributed training processes
        # the dataset; the others will use the cache.
        torch.distributed.barrier()

    if not meta_training:
        # Convert to Tensors and build dataset
        if args.truncate_task_dataset > 0 and not evaluate:
            features = features[:args.truncate_task_dataset]
        print('Extracted features --> %s' % len(features))

        all_input_ids = torch.tensor([f.input_ids for f in features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                       dtype=torch.long)
        all_cls_index = torch.tensor([f.cls_index for f in features],
                                     dtype=torch.long)
        all_p_mask = torch.tensor([f.p_mask for f in features],
                                  dtype=torch.float)
        if evaluate:
            all_example_index = torch.arange(all_input_ids.size(0),
                                             dtype=torch.long)
            dataset = TensorDataset(all_input_ids, all_input_mask,
                                    all_segment_ids, all_example_index,
                                    all_cls_index, all_p_mask)
        else:
            all_start_positions = torch.tensor(
                [f.start_position for f in features], dtype=torch.long)
            all_end_positions = torch.tensor(
                [f.end_position for f in features], dtype=torch.long)
            dataset = TensorDataset(all_input_ids, all_input_mask,
                                    all_segment_ids, all_start_positions,
                                    all_end_positions, all_cls_index,
                                    all_p_mask)
    else:
        if not evaluate and not args.matching_sample:
            if args.truncate_task_dataset > 0:
                random.shuffle(features)
                features = features[:args.truncate_task_dataset]

            train_len = int(0.8 * len(features))
            test_features = features[train_len:]
            train_features = features[:train_len]
            print('Extracted train features --> %s' % len(train_features))
            print('Extracted test features --> %s' % len(test_features))

        dataset = []

        for features in [train_features, test_features]:
            all_input_ids = torch.tensor([f.input_ids for f in features],
                                         dtype=torch.long)
            all_input_mask = torch.tensor([f.input_mask for f in features],
                                          dtype=torch.long)
            all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                           dtype=torch.long)
            all_cls_index = torch.tensor([f.cls_index for f in features],
                                         dtype=torch.long)
            all_p_mask = torch.tensor([f.p_mask for f in features],
                                      dtype=torch.float)
            all_start_positions = torch.tensor(
                [f.start_position for f in features], dtype=torch.long)
            all_end_positions = torch.tensor(
                [f.end_position for f in features], dtype=torch.long)
            dataset_ = TensorDataset(all_input_ids, all_input_mask,
                                     all_segment_ids, all_start_positions,
                                     all_end_positions, all_cls_index,
                                     all_p_mask)
            dataset.append(dataset_)

    if output_examples:
        return dataset, examples, features
    return dataset
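
A minimal sketch of a call site for Example #1, assuming a tokenizer loaded
elsewhere; every `args` field value below is illustrative, not taken from the
source:

# Hypothetical call site for Example #1 (all values illustrative).
from argparse import Namespace

from torch.utils.data import DataLoader, RandomSampler

args = Namespace(
    train_file='train-v1.1.json', predict_file='dev-v1.1.json',
    qa_dataset='squad', model_type='bert', max_seq_length=384,
    doc_stride=128, max_query_length=64, version_2_with_negative=False,
    overwrite_cache=False, matching_sample=False, truncate_task_dataset=-1,
    fix_task=0, local_rank=-1)

train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False)
train_loader = DataLoader(train_dataset,
                          sampler=RandomSampler(train_dataset),
                          batch_size=8)
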
Example #2
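# Imports as in Example #1; `processors`, `output_modes`, and this variant's
# convert_examples_to_features are assumed to come from the repo's GLUE-style
# task utilities.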
def load_and_cache_examples(args, task, tokenizer, evaluate=False, meta_training=False, sampled_keys=None):
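    """Classification variant of Example #1: load cached features for `task`
    and return a TensorDataset, or a [train, test] pair when
    meta_training=True.
    """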
    if args.local_rank not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training processes
        # the dataset; the others will use the cache.
        torch.distributed.barrier()

    processor = processors[task]()
    output_mode = output_modes[task]
    # Load data features from cache or dataset file
    model_name = args.model_type

    cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}'.format(
        'dev' if evaluate else 'train',
        model_name,
        str(args.max_seq_length),
        str(task)))

    print("Load {}".format(cached_features_file))
    if os.path.exists(cached_features_file):
        logger.info("Loading features from cached file %s", cached_features_file)
        with open(cached_features_file, 'rb') as f:
            features = pickle.load(f)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
        features = convert_examples_to_features(examples, label_list, args.max_seq_length, tokenizer, output_mode,
            cls_token_at_end=bool(args.model_type in ['xlnet']),            # xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0,
            sep_token=False if args.task == "chemical" else tokenizer.sep_token,  # this repo disables the separator for its "chemical" task
            sep_token_extra=bool(args.model_type in ['roberta']),           # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
            pad_on_left=bool(args.model_type in ['xlnet']),                 # pad on the left for xlnet
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
        )

        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            with open(cached_features_file, 'wb') as f:
                pickle.dump(features, f)

    if not evaluate and args.matching_sample:
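        # matching_sample: keep the features at the pre-sampled (deduplicated)
        # indices and mirror them with a same-sized random test pool.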
        train_features = []
        sampled_keys = list(set(sampled_keys))
        for key in sampled_keys:
            feature = features[int(key)]
            train_features.append(feature)

        train_feature_length = len(train_features) if args.truncate_task_dataset < 0 \
                               else min(args.truncate_task_dataset, len(train_features))
        extracted_features = random.sample(features, len(features))
        test_features = []
        count = 0
        for feature in extracted_features:
            if count >= train_feature_length:
                break
            test_features.append(feature)
            count += 1
        test_features = random.sample(test_features, len(test_features))
        test_features = test_features[:train_feature_length]
        if args.truncate_task_dataset > 0:
            train_features = random.sample(train_features, train_feature_length)
        print('Extracted train features --> %s' % len(train_features))
        print('Extracted test features --> %s' % len(test_features))

        if not meta_training:
            features = train_features
    elif not evaluate:
        all_features = []
        for feature in features:
            all_features.append(feature)
        features = all_features

        if args.truncate_task_dataset > 0:
            features = features[:args.truncate_task_dataset]

        train_feature_length = int(0.8 * len(features))
        train_features = features[:train_feature_length]
        test_features = features[train_feature_length:]

        print('Extracted train features --> %s' % len(train_features))
        print('Extracted test features --> %s' % len(test_features))

    if args.local_rank == 0 and not evaluate:
        # Make sure only the first process in distributed training processes
        # the dataset; the others will use the cache.
        torch.distributed.barrier()
    if not meta_training:
        if args.truncate_task_dataset > 0 and not evaluate:
            features = features[:args.truncate_task_dataset]
        # Convert to Tensors and build dataset
        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float)

        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

    else:
        dataset = []
        for features in [train_features, test_features]:
            all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
            all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
            all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
            if output_mode == "classification":
                all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
            elif output_mode == "regression":
                all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float)

            dataset_ = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
            dataset.append(dataset_)

    return dataset
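
With `meta_training=True`, both examples return a [train, test] TensorDataset
pair rather than a single dataset. A hedged sketch of consuming that pair from
Example #2 (the task name and batch size are illustrative assumptions):

# Hypothetical meta-training consumption of Example #2's return value.
from torch.utils.data import DataLoader, RandomSampler

support_set, query_set = load_and_cache_examples(
    args, task='mrpc', tokenizer=tokenizer, evaluate=False,
    meta_training=True)

support_loader = DataLoader(support_set,
                            sampler=RandomSampler(support_set),
                            batch_size=8)
query_loader = DataLoader(query_set,
                          sampler=RandomSampler(query_set),
                          batch_size=8)

# Each batch unpacks to: input_ids, input_mask, segment_ids, label_ids
for input_ids, input_mask, segment_ids, label_ids in support_loader:
    pass  # a meta-learner's inner-loop adaptation step would go here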