Example #1
    def read_one_example(self, inputs):
        """ inputs keys: sequence_a and sequence_b """
        sequence_a = utils.get_sequence_a(inputs)
        sequence_b = inputs.get("sequence_b", None)

        sequence_a_sub_tokens = self.subword_tokenizer.tokenize(sequence_a)
        bert_input = [self.CLS_TOKEN] + sequence_a_sub_tokens + [self.SEP_TOKEN]

        if sequence_b:
            sequence_b_sub_tokens = self.subword_tokenizer.tokenize(sequence_b)
            bert_input += sequence_b_sub_tokens + [self.SEP_TOKEN]

        if len(bert_input) > self.sequence_max_length:
            bert_input = bert_input[:self.sequence_max_length - 1] + [self.SEP_TOKEN]

        token_type = utils.make_bert_token_type(bert_input,
                                                SEP_token=self.SEP_TOKEN)

        features = []
        features.append({
            "bert_input": bert_input,
            "token_type": {
                "feature": token_type,
                "text": ""
            },  # TODO: fix hard-code
        })

        return features, {}
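
Example #1 delegates segment-id construction to utils.make_bert_token_type, whose body is not shown. A minimal sketch of what it plausibly computes, assuming the standard BERT convention (segment 0 for everything up to and including the first [SEP], segment 1 afterwards) and keeping the call signature used above:

    def make_bert_token_type(bert_input, SEP_token="[SEP]"):
        # Sketch only: segment id 0 covers [CLS] + sequence_a + the first [SEP],
        # segment id 1 covers sequence_b + its trailing [SEP] (if present).
        token_type = []
        segment_id = 0
        for token in bert_input:
            token_type.append(segment_id)
            if token == SEP_token and segment_id == 0:
                segment_id = 1
        return token_type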
Example #2
    def read_one_example(self, inputs):
        """ inputs keys: sequence_a and sequence_b """
        sequence_a = utils.get_sequence_a(inputs)
        sequence_b = inputs.get("sequence_b", None)

        bert_feature = BertFeature()
        bert_feature.set_input_with_speical_token(
            sequence_a,
            sequence_b,
            self.tokenizer,
            max_seq_length=self.sequence_max_length,
            data_type="predict",
            cls_token=self.cls_token,
            sep_token=self.sep_token,
            input_type=self.input_type,
        )

        features = [bert_feature.to_dict()]
        helper = {}
        return features, helper
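
Both predict-time readers above share the same contract: a dict with sequence_a (and optionally sequence_b) goes in, a list of feature dicts and a helper dict come out. A hypothetical call, where reader stands for an already-constructed reader instance (its class and constructor are not part of these examples):

    inputs = {
        "sequence_a": "what a wonderful day!",
        "sequence_b": "what a great day!",
    }
    features, helper = reader.read_one_example(inputs)
    print(features[0])  # exact feature keys depend on the reader variant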
Example #3
    def _read(self, file_path, data_type=None):
        """
        .json file structure should be something like this:

        {
            "data": [
                {
                    "sequence_a": "what a wonderful day!",
                    "sequence_b": "what a great day!",
                    "score": 0.9
                },
                ...
            ]
        }
        """

        data = self._get_data(file_path, data_type=data_type)

        helper = {
            "file_path": file_path,
            "examples": {},
            "cls_token": self.CLS_TOKEN,
            "sep_token": self.SEP_TOKEN,
            "unk_token": self.UNK_TOKEN,
            "model": {},
            "predict_helper": {}
        }
        features, labels = [], []

        for example in tqdm(data, desc=data_type):
            sequence_a = utils.get_sequence_a(example)
            sequence_b = example.get("sequence_b", None)

            sequence_a_sub_tokens = self.subword_tokenizer.tokenize(sequence_a)
            sequence_b_sub_tokens = None
            bert_input = [self.CLS_TOKEN] + sequence_a_sub_tokens + [self.SEP_TOKEN]

            if sequence_b is not None:
                sequence_b_sub_tokens = self.subword_tokenizer.tokenize(sequence_b)
                bert_input += sequence_b_sub_tokens + [self.SEP_TOKEN]

            if (self.sequence_max_length is not None and data_type == "train"
                    and len(bert_input) > self.sequence_max_length):
                continue

            if "uid" in example:
                data_uid = example["uid"]
            else:
                data_uid = str(uuid.uuid1())

            feature_row = {
                "id": data_uid,
                "bert_input": bert_input,
            }
            features.append(feature_row)

            score = example[self.label_key]
            label_row = {
                "id": data_uid,
                "score": score,
            }
            labels.append(label_row)

            helper["examples"][data_uid] = {
                "sequence_a": sequence_a,
                "sequence_a_sub_tokens": sequence_a_sub_tokens,
                "sequence_b": sequence_b,
                "sequence_b_sub_tokens": sequence_b_sub_tokens,
                "score": score,
            }

            if self.is_test and len(features) >= 10:
                break

        return make_batch(features, labels), helper
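
A hypothetical end-to-end call for Example #3, writing a file in the format described by the docstring and handing it to _read; reader again stands for an already-configured reader instance whose class is not shown here, and how _get_data parses the file is assumed:

    import json
    import tempfile

    sample = {
        "data": [
            {
                "sequence_a": "what a wonderful day!",
                "sequence_b": "what a great day!",
                "score": 0.9,
            },
        ]
    }

    with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
        json.dump(sample, f)

    # Returns a batch built by make_batch plus the helper dict shown above.
    batch, helper = reader._read(f.name, data_type="train")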
Example #4
    def _read(self, file_path, data_type=None):
        """
        .json file structure should be something like this:

        {
            "data": [
                {
                    "sequence_a": "what a wonderful day!",
                    "sequence_b": "what a great day!",
                    "score": 0.9
                },
                ...
            ]
        }
        """

        data = self._get_data(file_path, data_type=data_type)

        helper = Helper(**{
            "file_path": file_path,
            "cls_token": self.cls_token,
            "sep_token": self.sep_token,
        })

        features, labels = [], []

        for example in tqdm(data, desc=data_type):
            sequence_a = utils.get_sequence_a(example)
            sequence_b = example.get("sequence_b", None)

            sequence_a_tokens = self.tokenizer.tokenize(sequence_a)
            sequence_b_tokens = None
            if sequence_b:
                sequence_b_tokens = self.tokenizer.tokenize(sequence_b)

            bert_input = utils.make_bert_input(
                sequence_a,
                sequence_b,
                self.tokenizer,
                max_seq_length=self.sequence_max_length,
                data_type=data_type,
                cls_token=self.cls_token,
                sep_token=self.sep_token,
                input_type=self.input_type,
            )

            if bert_input is None:
                continue

            if "uid" in example:
                data_uid = example["uid"]
            else:
                data_uid = str(uuid.uuid1())

            feature_row = {
                "id": data_uid,
                "bert_input": bert_input,
            }
            features.append(feature_row)

            score = example[self.label_key]
            label_row = {
                "id": data_uid,
                "score": score,
            }
            labels.append(label_row)

            helper.set_example(data_uid, {
                "sequence_a": sequence_a,
                "sequence_a_tokens": sequence_a_tokens,
                "sequence_b": sequence_b,
                "sequence_b_tokens": sequence_b_tokens,
                "score": score,
            })

            if self.is_test and len(features) >= 10:
                break

        return utils.make_batch(features, labels), helper.to_dict()
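
Example #4 factors the inline preprocessing of Example #3 into utils.make_bert_input, which returns None for examples the caller should skip. A minimal sketch of what such a helper could do, reconstructed from the inline logic in Examples #1 and #3 (the real implementation is not shown, and the input_type handling is omitted):

    def make_bert_input(sequence_a, sequence_b, tokenizer, max_seq_length=None,
                        data_type=None, cls_token="[CLS]", sep_token="[SEP]",
                        input_type=None):
        # Sketch only: mirrors the inline construction from Example #3.
        bert_input = [cls_token] + tokenizer.tokenize(sequence_a) + [sep_token]
        if sequence_b:
            bert_input += tokenizer.tokenize(sequence_b) + [sep_token]

        if max_seq_length is not None and len(bert_input) > max_seq_length:
            if data_type == "train":
                return None  # caller skips over-length training examples
            # Predict-time behavior from Example #1: truncate and re-append SEP.
            bert_input = bert_input[:max_seq_length - 1] + [sep_token]
        return bert_input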
Example #5
    def _read(self, file_path, data_type=None):
        """
        .json file structure should be something like this:

        {
            "data": [
                {
                    "sequence": "what a wonderful day!",
                    "emotion": "happy"
                },
                ...
            ],
            "emotion": [  // class_key
                "angry",
                "happy",
                "sad",
                ...
            ]
        }
        """

        data = self._get_data(file_path, data_type=data_type)
        class_idx2text, class_text2idx = self._get_class_dicts(data=data)

        helper = Helper(**{
            "file_path": file_path,
            "class_idx2text": class_idx2text,
            "class_text2idx": class_text2idx,
            "cls_token": self.cls_token,
            "sep_token": self.sep_token,
            "dataset": SeqClsBertDataset,
            "metric_key": self.METRIC_KEY,
        })
        helper.set_model_parameter({
            "num_classes": len(class_idx2text),
        })
        helper.set_predict_helper({
            "class_idx2text": class_idx2text,
        })

        features, labels = [], []

        for example in tqdm(data, desc=data_type):
            sequence_a = utils.get_sequence_a(example)
            sequence_b = example.get("sequence_b", None)

            sequence_a_tokens = self.tokenizer.tokenize(sequence_a)
            sequence_b_tokens = None
            if sequence_b:
                sequence_b_tokens = self.tokenizer.tokenize(sequence_b)

            bert_input = utils.make_bert_input(
                sequence_a,
                sequence_b,
                self.tokenizer,
                max_seq_length=self.sequence_max_length,
                data_type=data_type,
                cls_token=self.cls_token,
                sep_token=self.sep_token,
                input_type=self.input_type,
            )

            if bert_input is None:
                continue

            if "uid" in example:
                data_uid = example["uid"]
            else:
                data_uid = str(uuid.uuid1())

            # token_type(segment_ids) will be added in dataset
            feature_row = {
                "id": data_uid,
                "bert_input": bert_input,
            }
            features.append(feature_row)

            class_text = example[self.class_key]
            label_row = {
                "id": data_uid,
                "class_idx": class_text2idx[class_text],
                "class_text": class_text,
            }
            labels.append(label_row)

            helper.set_example(data_uid, {
                "sequence_a": sequence_a,
                "sequence_a_tokens": sequence_a_tokens,
                "sequence_b": sequence_b,
                "sequence_b_tokens": sequence_b_tokens,
                "class_idx": class_text2idx[class_text],
                "class_text": class_text,
            })

            if self.is_test and len(features) >= 10:
                break

        return utils.make_batch(features, labels), helper.to_dict()
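
_get_class_dicts is not shown either; per the docstring, the label vocabulary lives in the raw file under class_key (the "emotion" list), and the two returned dicts are inverse mappings between class index and class text. A minimal sketch of that relationship, using a placeholder vocabulary:

    # Placeholder vocabulary standing in for the "emotion" list in the .json file.
    class_texts = ["angry", "happy", "sad"]
    class_idx2text = {idx: text for idx, text in enumerate(class_texts)}
    class_text2idx = {text: idx for idx, text in class_idx2text.items()}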