Example #1
    def __init__(
        self,
        data_dir,
        tokenizer: PreTrainedTokenizer,
        label_list: List[str],
        limit_length: Optional[int] = None,
        mode: Union[str, Split] = Split.train,
        cache_dir: Optional[str] = None,
        max_seq_length: int = 128,
        overwrite_cache: bool = False,
        task_name='classification',
    ):
        self.processor = DATA_PROCESSOR[task_name]()
        if isinstance(mode, str):
            try:
                mode = Split[mode]
            except KeyError:
                raise KeyError("mode is not a valid split name")
        self.label_list = label_list

        logger.info(f"Creating features from dataset file at {data_dir}")

        examples = self.processor.get_examples(data_dir, mode)
        if limit_length is not None:
            examples = examples[:limit_length]
        self.features = glue_convert_examples_to_features(
            examples,
            tokenizer,
            max_length=max_seq_length,
            label_list=self.label_list,
            output_mode='classification',
        )
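
Usage sketch (hedged): `ClassificationDataset` is an assumed name for the snippet's enclosing class, and `DATA_PROCESSOR`/`Split` come from the surrounding module.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
dataset = ClassificationDataset(
    data_dir="./data",
    tokenizer=tokenizer,
    label_list=["0", "1"],
    mode="train",
    max_seq_length=128,
)
print(dataset.features[0].input_ids[:10])
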
Example #2
 @classmethod
 def from_instance_list(cls, inst_list, tokenizer):
     # NOTE: `max_length` and `labels` below are module-level values in the
     # source repository; they are not defined within this snippet.
     examples = []
     for (ind, inst) in enumerate(inst_list):
         guid = 'instance-%d' % (ind)
         examples.append(
             InputExample(guid=guid, text_a=inst, text_b='', label=None))
     features = glue_convert_examples_to_features(
         examples,
         tokenizer,
         max_length=max_length,
         label_list=labels,
         output_mode='classification')
     return cls(features)
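
Call-site sketch (hedged): `PredictionDataset` is an assumed name for the enclosing class that exposes this factory.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
dataset = PredictionDataset.from_instance_list(
    ["first raw sentence", "second raw sentence"], tokenizer)
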
Example #3
 def __init__(
     self,
     texts: List[str],
     id2label: Dict[int, str],
     tokenizer: PreTrainedTokenizer,
     max_seq_length: int = 128,
     task_name='classification',
 ):
     self.processor = DATA_PROCESSOR[task_name]()
     examples = self.processor.get_predict_examples(texts)
     self.label_list = [label for _, label in id2label.items()]
     self.features = glue_convert_examples_to_features(
         examples,
         tokenizer,
         max_length=max_seq_length,
         label_list=self.label_list,
         output_mode='classification',
     )
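
Instantiation sketch (hedged): `PredictDataset` is an assumed class name; `id2label` maps integer ids to string labels, which is how `label_list` is built from its values.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
dataset = PredictDataset(
    texts=["this movie was great", "terrible plot"],
    id2label={0: "negative", 1: "positive"},
    tokenizer=tokenizer,
    max_seq_length=128,
)
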
Example #4
    def _get_data(self, data_dir, mode):
        # define processors
        processors = {
            'CoLA': ColaProcessor,
            'SST-2': Sst2Processor,
            'MNLI': MnliProcessor,
            'MRPC': MrpcProcessor,
            'QNLI': QnliProcessor,
            'QQP': QqpProcessor,
            'RTE': RteProcessor,
            'WNLI': WnliProcessor
        }

        # get InputExamples from raw file
        p = processors[self.task]()
        if mode == 'train':
            input_examples = p.get_train_examples(
                data_dir=os.path.join(data_dir, self.task))
        elif mode == 'dev':
            input_examples = p.get_dev_examples(
                data_dir=os.path.join(data_dir, self.task))
        else:
            raise ValueError('mode must be "train" or "dev"')

        # get InputFeatures from InputExamples
        input_features = glue_convert_examples_to_features(
            input_examples,
            tokenizer=self.tokenizer,
            max_length=self.max_len,
            task=self.task.lower(),
        )

        # convert InputFeatures to tensor
        input_ids, attention_mask, token_type_ids, labels = [], [], [], []
        for feature in input_features:
            input_ids.append(feature.input_ids)
            attention_mask.append(feature.attention_mask)
            token_type_ids.append(feature.token_type_ids)
            labels.append(feature.label)
        input_ids, attention_mask, token_type_ids, labels = map(
            lambda x: torch.LongTensor(x),
            (input_ids, attention_mask, token_type_ids, labels))

        return (input_ids, attention_mask, token_type_ids,
                labels), len(p.get_labels())
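
The tensors returned by _get_data drop straight into a TensorDataset; a minimal sketch, where `loader` is a hypothetical instance of the enclosing class configured with `task`, `tokenizer`, and `max_len`:

from torch.utils.data import DataLoader, TensorDataset

tensors, num_labels = loader._get_data(data_dir="./glue_data", mode="train")
input_ids, attention_mask, token_type_ids, labels = tensors
train_loader = DataLoader(
    TensorDataset(input_ids, attention_mask, token_type_ids, labels),
    batch_size=32,
    shuffle=True,
)
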
Example #5
    @classmethod
    def from_tsv(cls, tsv_file, tokenizer):
        """Creates examples for the test set."""
        # NOTE: `labels` and `max_length` are module-level values in the source
        # repository; they are not defined within this snippet.
        lines = DataProcessor._read_tsv(tsv_file)
        examples = []
        for (i, line) in enumerate(lines):
            guid = 'instance-%d' % i
            if line[0] in labels:
                text_a = '\t'.join(line[1:])
            else:
                text_a = '\t'.join(line)

            examples.append(
                InputExample(guid=guid, text_a=text_a, text_b=None,
                             label=None))

        features = glue_convert_examples_to_features(
            examples,
            tokenizer,
            max_length=max_length,
            label_list=labels,
            output_mode='classification',
        )
        return cls(features)
Example #6
    def __init__(
        self,
        args: GlueDataTrainingArguments,
        tokenizer: PreTrainedTokenizer,
        limit_length: Optional[int] = None,
        mode: Union[str, Split] = Split.train,
        cache_dir: Optional[str] = None,
    ):
        self.args = args
        self.processor = seq_clf_processors[args.task_name]()
        self.output_mode = seq_clf_output_modes[args.task_name]
        if isinstance(mode, str):
            try:
                mode = Split[mode]
            except KeyError:
                raise KeyError('mode is not a valid split name')
        
        # Load data features from cache or dataset file
        cached_features_file = os.path.join(
            cache_dir if cache_dir is not None else args.data_dir,
            'cached_{}_{}_{}_{}'.format(
                mode.value, tokenizer.__class__.__name__, str(args.max_seq_length), args.task_name,
            ),
        )
        label_list = self.processor.get_labels()
        if args.task_name in ['mnli', 'mnli-mm'] and tokenizer.__class__ in (
            RobertaTokenizer,
            RobertaTokenizerFast,
            XLMRobertaTokenizer,
            BartTokenizer,
            BartTokenizerFast,
        ):
            # HACK(label indices are swapped in RoBERTa pretrained model)
            label_list[1], label_list[2] = label_list[2], label_list[1]
        self.label_list = label_list

        # Make sure only the first process in distributed training processes the dataset,
        # and the others will use the cache.
        lock_path = cached_features_file + '.lock'
        with FileLock(lock_path):

            if os.path.exists(cached_features_file) and not args.overwrite_cache:
                start = time.time()
                self.features = torch.load(cached_features_file)
                logger.info(
                    f'Loading features from cached file {cached_features_file} [took %.3f s]', time.time() - start
                )
            else:
                logger.info(f'Creating features from dataset file at {args.data_dir}')

                if mode == Split.dev:
                    examples = self.processor.get_dev_examples(args.data_dir)
                elif mode == Split.test:
                    examples = self.processor.get_test_examples(args.data_dir)
                else:
                    examples = self.processor.get_train_examples(args.data_dir)
                if limit_length is not None:
                    examples = examples[:limit_length]

                # Load a data file into a list of ``InputFeatures``
                self.features = glue_convert_examples_to_features(
                    examples,
                    tokenizer,
                    max_length=args.max_seq_length,
                    label_list=label_list,
                    output_mode=self.output_mode,
                )
                start = time.time()
                torch.save(self.features, cached_features_file)
                # ^ This seems to take a lot of time so I want to investigate why and how we can improve.
                logger.info(
                    'Saving features into cached file %s [took %.3f s]', cached_features_file, time.time() - start
                )
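
Instantiation sketch (hedged): `SeqClfDataset` is an assumed name for the enclosing class; `GlueDataTrainingArguments` matches the annotation in the signature, and the task name must be a key in `seq_clf_processors`.

from transformers import AutoTokenizer, GlueDataTrainingArguments

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
args = GlueDataTrainingArguments(
    task_name="sst-2",            # assumed key in seq_clf_processors
    data_dir="./glue_data/SST-2",
    max_seq_length=128,
    overwrite_cache=False,
)
train_dataset = SeqClfDataset(args, tokenizer=tokenizer, mode="train")
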
Example #7
    def __init__(
        self,
        args: DataTrainingArguments,
        tokenizer: PreTrainedTokenizer,
        limit_length: Optional[int] = None,
        evaluate=False,
    ):
        self.args = args
        processor = Processor()
        self.output_mode = 'classification'
        # Load data features from cache or dataset file
        cached_features_file = os.path.join(
            args.data_dir,
            "cached_{}_{}_{}_{}".format(
                "dev" if evaluate else "train", tokenizer.__class__.__name__, str(args.max_seq_length), args.name,
            ),
        )

        # Make sure only the first process in distributed training processes the dataset,
        # and the others will use the cache.
        lock_path = cached_features_file + ".lock"
        with FileLock(lock_path):

            if os.path.exists(cached_features_file) and not args.overwrite_cache:
                start = time.time()
                self.features = torch.load(cached_features_file)
                logger.info(
                    f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start
                )
            else:
                logger.info(f"Creating features from dataset file at {args.data_dir}")
                label_list = processor.get_labels()
                
                # if args.task_name in ["mnli", "mnli-mm"] and tokenizer.__class__ in (
                #     RobertaTokenizer,
                #     RobertaTokenizerFast,
                #     XLMRobertaTokenizer,
                # ):
                #     # HACK(label indices are swapped in RoBERTa pretrained model)
                #     label_list[1], label_list[2] = label_list[2], label_list[1]

                examples = (
                    processor.get_dev_examples(args.data_dir)
                    if evaluate
                    else processor.get_train_examples(args.data_dir)
                )
                if limit_length is not None:
                    examples = examples[:limit_length]
                self.features = glue_convert_examples_to_features(
                    examples,
                    tokenizer,
                    max_length=args.max_seq_length,
                    label_list=label_list,
                    output_mode='classification', 
                )
                start = time.time()
                torch.save(self.features, cached_features_file)
                # ^ This seems to take a lot of time so I want to investigate why and how we can improve.
                logger.info(
                    "Saving features into cached file %s [took %.3f s]", cached_features_file, time.time() - start
                )
Example #8
    def __init__(
        self,
        args: GlueDataTrainingArguments,
        tokenizer: PreTrainedTokenizerBase,
        limit_length: Optional[int] = None,
        mode: Union[str, Split] = Split.train,
        cache_dir: Optional[str] = None,
    ):
        warnings.warn(
            "This dataset will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets "
            "library. You can have a look at this example script for pointers: "
            "https://github.com/huggingface/transformers/blob/master/examples/text-classification/run_glue.py",
            FutureWarning,
        )
        self.args = args

        if args.task_name == 'ddi':
            self.processor = DDIProcessor()
        elif args.task_name == 'chemprot':
            self.processor = ChemProtProcessor()
        else:
            self.processor = glue_processors[args.task_name]()

        self.output_mode = glue_output_modes[args.task_name]
        if isinstance(mode, str):
            try:
                mode = Split[mode]
            except KeyError:
                raise KeyError("mode is not a valid split name")
        # Load data features from cache or dataset file
        cached_features_file = os.path.join(
            cache_dir if cache_dir is not None else args.data_dir,
            "cached_{}_{}_{}_{}".format(
                mode.value,
                tokenizer.__class__.__name__,
                str(args.max_seq_length),
                args.task_name,
            ),
        )
        label_list = self.processor.get_labels()
        if args.task_name in ["mnli", "mnli-mm"
                              ] and tokenizer.__class__.__name__ in (
                                  "RobertaTokenizer",
                                  "RobertaTokenizerFast",
                                  "XLMRobertaTokenizer",
                                  "BartTokenizer",
                                  "BartTokenizerFast",
                              ):
            # HACK(label indices are swapped in RoBERTa pretrained model)
            label_list[1], label_list[2] = label_list[2], label_list[1]

        self.label_list = label_list

        # Make sure only the first process in distributed training processes the dataset,
        # and the others will use the cache.
        lock_path = cached_features_file + ".lock"
        with FileLock(lock_path):

            if os.path.exists(
                    cached_features_file) and not args.overwrite_cache:
                start = time.time()
                self.features = torch.load(cached_features_file)
                logger.info(
                    f"Loading features from cached file {cached_features_file} [took %.3f s]",
                    time.time() - start)
            else:
                logger.info(
                    f"Creating features from dataset file at {args.data_dir}")

                if mode == Split.dev:
                    examples = self.processor.get_dev_examples(args.data_dir)
                elif mode == Split.test:
                    examples = self.processor.get_test_examples(args.data_dir)
                else:
                    examples = self.processor.get_train_examples(args.data_dir)
                if limit_length is not None:
                    examples = examples[:limit_length]
                self.features = glue_convert_examples_to_features(
                    examples,
                    tokenizer,
                    max_length=args.max_seq_length,
                    label_list=label_list,
                    output_mode=self.output_mode,
                )
                start = time.time()
                torch.save(self.features, cached_features_file)
                # ^ This seems to take a lot of time so I want to investigate why and how we can improve.
                logger.info(
                    "Saving features into cached file %s [took %.3f s]",
                    cached_features_file,
                    time.time() - start)
Example #9
# data, info = tensorflow_datasets.load(f"glue/{TFDS_TASK}", with_info=True)
# train_examples = info.splits["train"].num_examples

# MNLI expects either validation_matched or validation_mismatched
# valid_examples = info.splits[val_string].num_examples
# test_examples = info.splits[test_string].num_examples

# Replace the tfds-based train/dev example loading with GLUE processors
data_processor = glue.glue_processors[TASK]()
data_dir = os.environ['GLUE_DIR']
train_examples = data_processor.get_train_examples(data_dir)
dev_examples = data_processor.get_dev_examples(data_dir)
# Prepare dataset for GLUE as a tf.data.Dataset instance
# train_dataset = glue_convert_examples_to_features(data["train"], tokenizer, max_length=128, task=TASK)
train_features = glue.glue_convert_examples_to_features(train_examples,
                                                        tokenizer,
                                                        max_length=128,
                                                        task=TASK)

# MNLI expects either validation_matched or validation_mismatched
# test_features = glue.glue_convert_examples_to_features(data[test_string], tokenizer, max_length=128, task=TASK)

# test_dataset = glue_convert_examples_to_features(data[test_string], tokenizer, max_length=128, task=TASK)
dev_features = glue.glue_convert_examples_to_features(dev_examples,
                                                      tokenizer,
                                                      max_length=128,
                                                      task=TASK)
# dev_dataset_for_eval = glue.glue_convert_examples_to_features(data[val_string], tokenizer, max_length=128, task=TASK)
# old code
# train_dataset = train_dataset.shuffle(128).batch(BATCH_SIZE).repeat(-1)
# valid_dataset = valid_dataset.batch(EVAL_BATCH_SIZE)
# test_dataset = test_dataset.batch(EVAL_BATCH_SIZE)
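
Because the examples above are plain Python lists (not a tf.data.Dataset), glue_convert_examples_to_features returns a list of InputFeatures. A hedged sketch of rebuilding the batched tf.data pipeline from the commented-out old code (BATCH_SIZE and EVAL_BATCH_SIZE as defined there):

import tensorflow as tf

def to_tf_dataset(features, batch_size):
    # Pack per-example fields into parallel tensors; add "token_type_ids"
    # as well if the model expects segment ids (e.g. BERT).
    d = tf.data.Dataset.from_tensor_slices((
        {
            "input_ids": [f.input_ids for f in features],
            "attention_mask": [f.attention_mask for f in features],
        },
        [f.label for f in features],
    ))
    return d.shuffle(128).batch(batch_size)

train_dataset = to_tf_dataset(train_features, BATCH_SIZE)
valid_dataset = to_tf_dataset(dev_features, EVAL_BATCH_SIZE)
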
Example #10
    @classmethod
    def from_tsv(cls,
                 tsv_file,
                 tokenizer,
                 flag=0,
                 idx=[],
                 y_true_=[],
                 extra_ratio=0):
        """Creates examples for the test set."""
        # NOTE: `labels`, `max_length`, and `text_augmentation` are module-level
        # names in the source repository and are not defined in this snippet.
        # The mutable defaults (idx=[], y_true_=[]) are only read, never mutated.
        lines = DataProcessor._read_tsv(tsv_file)
        # print(len(lines))
        pos_lines = []
        neg_lines = []
        extra_lines = 0
        pos_n = 0
        neg_n = 0
        if idx != []:
            lines = [line for (i, line) in enumerate(lines) if i in idx]
            print(len(lines))
            pos_lines = [
                line for (i, line) in enumerate(lines) if y_true_[i] == 1
            ]
            neg_lines = [
                line for (i, line) in enumerate(lines) if y_true_[i] == 0
            ]
            pos_n = len(pos_lines)
            neg_n = len(neg_lines)
            extra_pos_ratio = ((len(neg_lines) / len(pos_lines)) *
                               (1 + extra_ratio) - 1)
            extra_neg_ratio = extra_ratio
            # print(extra_pos_ratio)

        # print(lines)
        if flag == 1:
            # NOTE: this branch assumes `idx` was provided above; otherwise
            # extra_pos_ratio and extra_neg_ratio are undefined (NameError).
            pos_y_true_ = torch.ones(pos_n).cuda()
            neg_y_true_ = torch.zeros(neg_n).cuda()
            pos_lines, pos_y_true_ = text_augmentation(pos_lines, pos_y_true_,
                                                       extra_pos_ratio)
            neg_lines, _ = text_augmentation(neg_lines, neg_y_true_,
                                             extra_neg_ratio)
            lines = []
            print(len(pos_y_true_))
            y_true_ = pos_y_true_.unsqueeze(1)
            for line in pos_lines:
                lines.append(line)
            for line in neg_lines:
                lines.append(line)
                y_true_ = torch.vstack((y_true_, torch.tensor(0).cuda()))

            y_true_ = y_true_.squeeze(1)

        examples = []
        for (i, line) in enumerate(lines):
            # if idx==None or i in idx:
            guid = 'instance-%d' % i
            if line[0] in labels:
                text_a = '\t'.join(line[1:])
            else:
                text_a = '\t'.join(line)
            examples.append(
                InputExample(guid=guid, text_a=text_a, text_b=None,
                             label=None))

        features = glue_convert_examples_to_features(
            examples,
            tokenizer,
            max_length=max_length,
            label_list=labels,
            output_mode='classification',
        )
        return cls(features), y_true_
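
Call-site sketch (hedged): `TsvDataset` is an assumed name for the enclosing class; `labels` and `max_length` remain module-level values, and flag=0 skips the CUDA-dependent augmentation branch.

dataset, y_true = TsvDataset.from_tsv(
    "dev.tsv",
    tokenizer,             # a PreTrainedTokenizer instance
    flag=0,                # 1 would trigger text_augmentation on GPU
    idx=[0, 1, 2],         # keep only these row indices
    y_true_=[1, 0, 1],     # gold labels aligned with the kept rows
)
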
Example #11
def process_data(datapath, tokenizer, test_portion=False, split_ratio=0.8, load_processed=False):
    """Reads datafiles, computes features, splits into train/val, generate tensor datasets."""

    def generate_inputexamples(dataframe):
        examples = []
        for irow, row in dataframe.iterrows():
            examples.append(InputExample(irow, row.Headline, row.articleBody, row.Stance))
        return examples

    def balanced_split(features, split_ratio):
        """Split train/val such that each class proportionately represented in splits"""

        logger.info(f"Splitting data into train/val split, maintaining class compositions.")
        train_feats = []
        val_feats = []
        for cur_label in range(4):
            featlist = [feature for feature in features if feature.label == cur_label]
            random.shuffle(featlist)
            train, val = featlist[:int(split_ratio * len(featlist))], featlist[int(split_ratio * len(featlist)):]
            train_feats.extend(train)
            val_feats.extend(val)
        return train_feats, val_feats

    def gen_dataset(features):
        """Generate tensor dataset for a split"""

        input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
        all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
        tensor_dataset = TensorDataset(input_ids, all_attention_mask, all_labels)
        logger.info(f"Processed Tensor data from feature objects")
        return tensor_dataset

    logger = logging.getLogger(__name__)
    mode = 'competition_test' if test_portion else 'train'
    logger.info(f"Processing dataset from {datapath} in mode {mode}.")
    dumppath = os.path.join(datapath, f'processed_{mode}_data.pkl')

    if load_processed and os.path.isfile(dumppath):
        logger.info(f"Loading the preprocessed data from {dumppath}")
        with open(dumppath, 'rb') as f:
            features = pickle.load(f)
    else:
        df_labels = pd.read_csv(os.path.join(datapath, f'{mode}_stances.csv'))
        df_body = pd.read_csv(os.path.join(datapath, f'{mode}_bodies.csv'))
        df_dataset = pd.merge(df_labels, df_body, left_on='Body ID', right_on='Body ID')
        df_dataset['Headline'] = df_dataset['Headline'].apply(DataSet.clean_article)
        df_dataset['articleBody'] = df_dataset['articleBody'].apply(DataSet.clean_article)

        logger.info(f"Preparing data from dataset files")
        examples = generate_inputexamples(df_dataset)
        # NOTE: `label_list` is a module-level constant in the source repository.
        features = glue_convert_examples_to_features(examples, tokenizer,
                                                     label_list=label_list,
                                                     output_mode='classification')
        with open(dumppath, 'wb') as f:
            pickle.dump(features, f)
        logger.info(f"Dumped processed data pickle at {dumppath}")

    if mode == 'train':
        train_feats, val_feats = balanced_split(features, split_ratio)
        train_dataset = gen_dataset(train_feats)
        val_dataset = gen_dataset(val_feats)

        return train_dataset, val_dataset
    else:
        return gen_dataset(features)
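
Usage sketch (hedged): the datapath layout ({mode}_stances.csv, {mode}_bodies.csv) and the module-level label_list are taken from the function body; the FNC-1 directory name is a placeholder.

from torch.utils.data import DataLoader
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
train_ds, val_ds = process_data("./fnc-1", tokenizer, split_ratio=0.8)
train_loader = DataLoader(train_ds, batch_size=16, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=32)
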
Example #12
    def __init__(
        self,
        args: DataTrainingArguments,
        tokenizer: PreTrainedTokenizer,
        limit_length: Optional[int] = None,
        mode: Union[str, Split] = Split.train,
        cache_dir: Optional[str] = None,
    ):
        self.args = args
        self.processor = NegationProcessor()
        self.output_mode = 'classification'
        if isinstance(mode, str):
            try:
                mode = Split[mode]
            except KeyError:
                raise KeyError("mode is not a valid split name")
        # Load data features from cache or dataset file
        if args.data_dir.endswith('/'):
            dataset = basename(dirname(args.data_dir))
        else:
            dataset = basename(args.data_dir)
        cached_features_file = os.path.join(
            cache_dir if cache_dir is not None else args.data_dir,
            "cached_polarity_{}_{}_{}_{}".format(
                dataset,
                mode.value,
                tokenizer.__class__.__name__,
                str(args.max_seq_length),
            ),
        )
        label_list = self.processor.get_labels()
        self.label_list = label_list

        # Make sure only the first process in distributed training processes the dataset,
        # and the others will use the cache.
        lock_path = cached_features_file + ".lock"
        with FileLock(lock_path):

            if os.path.exists(
                    cached_features_file) and not args.overwrite_cache:
                start = time.time()
                self.features = torch.load(cached_features_file)
                logger.info(
                    f"Loading features from cached file {cached_features_file} [took %.3f s]",
                    time.time() - start)
            else:
                logger.info(
                    f"Creating features from dataset file at {args.data_dir}")

                if mode == Split.dev:
                    examples = self.processor.get_dev_examples(args.data_dir)
                elif mode == Split.test:
                    examples = self.processor.get_test_examples(args.data_dir)
                else:
                    examples = self.processor.get_train_examples(args.data_dir)
                if limit_length is not None:
                    examples = examples[:limit_length]
                self.features = glue_convert_examples_to_features(
                    examples,
                    tokenizer,
                    max_length=args.max_seq_length,
                    label_list=label_list,
                    output_mode=self.output_mode,
                )
                start = time.time()
                torch.save(self.features, cached_features_file)
                # ^ This seems to take a lot of time so I want to investigate why and how we can improve.
                logger.info(
                    "Saving features into cached file %s [took %.3f s]",
                    cached_features_file,
                    time.time() - start)