예제 #1
0
 def default_features(cls) -> feature.Features:
     """Build the default feature schema for this processor.

     The schema contains the string field ``text``, an ``edits`` dictionary
     with the sequence fields ``start_idx``, ``end_idx`` and
     ``corrections``, and the bucketed float feature ``text_length``.

     Returns:
         A ``feature.Features`` object wrapping the schema described above.
     """
     schema = {"text": feature.Value("string")}

     # Edit annotations: two int32 sequences plus a sequence of string
     # sequences stored under "corrections".
     edit_fields = {
         "start_idx": feature.Sequence(feature=feature.Value("int32")),
         "end_idx": feature.Sequence(feature=feature.Value("int32")),
     }
     edit_fields["corrections"] = feature.Sequence(
         feature=feature.Sequence(feature=feature.Value("string"))
     )
     schema["edits"] = feature.Dict(feature=edit_fields)

     # The only bucketed feature declared here.
     length_buckets = feature.BucketInfo(
         method="bucket_attribute_specified_bucket_value",
         number=4,
         setting=(),
     )
     schema["text_length"] = feature.Value(
         dtype="float",
         description="length of the text",
         is_bucket=True,
         bucket_info=length_buckets,
     )
     return feature.Features(schema)
예제 #2
0
    def _customize_features(self, metadata: Optional[dict]) -> Optional[Features]:
        """Return the default features extended with user-defined ones.

        A deep copy of ``self._default_features`` is taken first, so the
        defaults themselves are never modified. Every entry of ``metadata``
        is then stored in the copy as a ``feature.Value`` created with
        ``is_bucket=True`` and ``is_custom=True``.

        Args:
            metadata: mapping from feature name to a config mapping that
                provides ``"dtype"`` (``"string"`` or ``"float"``),
                ``"description"`` and ``"num_buckets"``. ``None`` means
                there are no user-defined features.

        Returns:
            The copied feature collection with one added (or overwritten)
            entry per item of ``metadata``.

        Raises:
            NotImplementedError: if a config's ``"dtype"`` is neither
                ``"string"`` nor ``"float"``.
            KeyError: if a dict config lacks ``"dtype"``, ``"description"``
                or ``"num_buckets"``.
        """
        features = copy.deepcopy(self._default_features)

        # Nothing user-defined to add: hand back the untouched copy.
        if metadata is None:
            return features

        for feature_name, feature_config in metadata.items():
            declared_dtype = feature_config["dtype"]

            # Pick the bucketing strategy that goes with the declared dtype.
            if declared_dtype == "string":
                dtype = "string"
                method = "bucket_attribute_discrete_value"
                setting = 1
            elif declared_dtype == "float":
                dtype = "float"
                method = "bucket_attribute_specified_bucket_value"
                setting = ()
            else:
                # Name the offending feature so the user can fix the metadata.
                raise NotImplementedError(
                    "unsupported dtype {!r} for custom feature {!r}; "
                    "expected 'string' or 'float'".format(
                        declared_dtype, feature_name
                    )
                )

            features[feature_name] = feature.Value(
                dtype=dtype,
                description=feature_config["description"],
                is_bucket=True,
                is_custom=True,
                bucket_info=feature.BucketInfo(
                    method=method,
                    number=feature_config["num_buckets"],
                    setting=setting,
                ),
            )

        return features
예제 #3
0
 def default_features(cls) -> feature.Features:
     """Extend the inherited default features with ``attr_compression``.

     The parent's ``default_features()`` result is updated in place with a
     ``feature.Features`` object holding one bucketed float feature, and
     the updated object is returned.
     """
     features = super().default_features()

     compression_buckets = feature.BucketInfo(
         method="bucket_attribute_specified_bucket_value",
         number=4,
         setting=(),
     )
     # Task-specific feature declared on top of the inherited ones.
     task_specific = {
         "attr_compression": feature.Value(
             dtype="float",
             description="the ratio between source and reference length",
             is_bucket=True,
             bucket_info=compression_buckets,
         ),
     }

     features.update(feature.Features(task_specific))
     return features
 def default_features(cls) -> feature.Features:
     """Build the default feature schema for head/link/tail predictions.

     The schema holds plain string fields (``true_head``, ``true_link``,
     ``true_tail``, their ``*_decipher`` variants, ``predict``,
     ``true_label``), a string sequence ``predictions``, five bucketed
     float features and two bucketed string features. The three ``*_fre``
     features are declared with ``require_training_set=True``.

     Returns:
         A ``feature.Features`` object wrapping the schema.
     """

     def float_bucket(description, **extra):
         """Return a bucketed float ``feature.Value`` with its own BucketInfo.

         Extra keyword arguments are forwarded to ``feature.Value``.
         """
         return feature.Value(
             dtype="float",
             description=description,
             is_bucket=True,
             bucket_info=feature.BucketInfo(
                 method="bucket_attribute_specified_bucket_value",
                 number=4,
                 setting=(),
             ),
             **extra,
         )

     def discrete_bucket(description, number):
         """Return a bucketed string ``feature.Value`` using discrete values."""
         return feature.Value(
             dtype="string",
             description=description,
             is_bucket=True,
             bucket_info=feature.BucketInfo(
                 method="bucket_attribute_discrete_value",
                 number=number,
                 setting=1,
             ),
         )

     return feature.Features({
         "true_head": feature.Value("string"),
         "true_head_decipher": feature.Value("string"),
         "true_link": feature.Value("string"),
         "true_tail": feature.Value("string"),
         "true_tail_decipher": feature.Value("string"),
         "predict": feature.Value("string"),
         "true_label": feature.Value("string"),
         "predictions": feature.Sequence(feature=feature.Value("string")),
         "tail_entity_length": float_bucket(
             "number of words in the tail entity"
         ),
         "head_entity_length": float_bucket(
             "number of words in the head entity"
         ),
         "tail_fre": float_bucket(
             "the frequency of tail entity in the training set",
             require_training_set=True,
         ),
         "link_fre": float_bucket(
             "the frequency of link relation in the training set",
             require_training_set=True,
         ),
         # The head is an entity (see head_entity_length); the previous
         # text said "head relation", copied from link_fre.
         "head_fre": float_bucket(
             "the frequency of head entity in the training set",
             require_training_set=True,
         ),
         "symmetry": discrete_bucket(
             "boolean feature: 'symmetric' or 'asymmetric'; more "
             "granularity to be added",
             number=2,
         ),
         "entity_type_level": discrete_bucket(
             "most specific (highest) entity type level of true tail entity",
             number=8,
         ),
     })
예제 #5
0
 def test_get_bucket_features(self):
     """Check the names reported by ``Features.get_bucket_features()``.

     A schema with sentence-level features and a nested per-entity
     dictionary is built, and the returned names are compared (as a set)
     with every feature declared with ``is_bucket=True``.
     """

     def float_bucket(description, **extra):
         """Return a bucketed float ``feature.Value`` with its own BucketInfo."""
         return feature.Value(
             dtype="float",
             description=description,
             is_bucket=True,
             bucket_info=feature.BucketInfo(
                 method="bucket_attribute_specified_bucket_value",
                 number=4,
                 setting=(),
             ),
             **extra,
         )

     def discrete_bucket(description):
         """Return a bucketed string ``feature.Value`` using discrete values."""
         return feature.Value(
             dtype="string",
             description=description,
             is_bucket=True,
             bucket_info=feature.BucketInfo(
                 method="bucket_attribute_discrete_value",
                 number=4,
                 setting=1,
             ),
         )

     # --- raw fields and sentence-level features ---
     schema = {
         "tokens": feature.Sequence(feature=feature.Value("string")),
         "true_tags": feature.Sequence(feature=feature.Value("string")),
         "pred_tags": feature.Sequence(feature=feature.Value("string")),
         "sentence_length": float_bucket("sentence length"),
         "entity_density": float_bucket(
             "the ration between all entity tokens "
             "and sentence tokens "
         ),
         "num_oov": float_bucket(
             "the number of out-of-vocabulary words",
             require_training_set=True,
         ),
         "fre_rank": float_bucket(
             "the average rank of each word based on its "
             "frequency in training set",
             require_training_set=True,
         ),
     }

     # --- per-entity features, nested inside a sequence of dictionaries ---
     schema["true_entity_info"] = feature.Sequence(feature=feature.Dict(
         feature={
             "span_text": feature.Value("string"),
             "span_tokens": float_bucket("entity length"),
             "span_pos": feature.Position(positions=[0, 0]),
             "span_tag": discrete_bucket("entity tag"),
             "span_capitalness": discrete_bucket(
                 "The capitalness of an entity. "
                 "For example, first_caps represents only the first "
                 "character of the entity is capital. "
                 "full_caps denotes all characters of the entity "
                 "are capital"
             ),
             "span_rel_pos": float_bucket(
                 "The relative position of an entity in a sentence"
             ),
             "span_chars": float_bucket(
                 "The number of characters of an entity"
             ),
             "span_econ": float_bucket(
                 "entity label consistency",
                 require_training_set=True,
             ),
             "span_efre": float_bucket(
                 "entity frequency",
                 require_training_set=True,
             ),
         }))

     ner_task_features = feature.Features(schema)

     expected_names = {
         'sentence_length',
         'entity_density',
         'num_oov',
         'fre_rank',
         'span_tokens',
         'span_tag',
         'span_capitalness',
         'span_rel_pos',
         'span_chars',
         'span_econ',
         'span_efre',
     }
     self.assertEqual(
         set(ner_task_features.get_bucket_features()),
         expected_names,
     )
예제 #6
0
 def default_features(cls) -> feature.Features:
     """Extend the inherited default features with six float features.

     Each added feature is a bucketed float ``feature.Value`` that differs
     from the others only in its name and description. The parent's
     features object is updated in place and returned.
     """
     inherited = super().default_features()

     # (feature name, description) pairs, in declaration order.
     declared = (
         ("attr_compression", "compression"),
         ("attr_copy_len", "copy length"),
         ("attr_coverage", "coverage"),
         ("attr_novelty", "novelty"),
         ("oracle_score", "the sample-level oracle score"),
         ("oracle_position", "the sample-level oracle position"),
     )
     additions = {
         name: feature.Value(
             dtype="float",
             description=text,
             is_bucket=True,
             # A fresh BucketInfo per feature; nothing is shared.
             bucket_info=feature.BucketInfo(
                 method="bucket_attribute_specified_bucket_value",
                 number=4,
                 setting=(),
             ),
         )
         for name, text in declared
     }

     inherited.update(feature.Features(additions))
     return inherited
예제 #7
0
 def default_features(cls) -> feature.Features:
     """Build the default feature schema for context/question/options data.

     Besides the raw fields (``context``, ``question``, ``options`` and the
     ``answers`` sequence of ``text``/``option_index`` dictionaries), five
     bucketed float features are declared; ``num_oov`` and ``fre_rank``
     are declared with ``require_training_set=True``.
     """

     def float_bucket(description, **extra):
         """Return a bucketed float ``feature.Value`` with its own BucketInfo.

         Extra keyword arguments are forwarded to ``feature.Value``.
         """
         return feature.Value(
             dtype="float",
             description=description,
             is_bucket=True,
             bucket_info=feature.BucketInfo(
                 method="bucket_attribute_specified_bucket_value",
                 number=4,
                 setting=(),
             ),
             **extra,
         )

     # --- raw fields ---
     schema = {
         "context": feature.Value("string"),
         "question": feature.Value("string"),
         "options": feature.Sequence(feature=feature.Value("string")),
     }
     answer_fields = {
         "text": feature.Value("string"),
         "option_index": feature.Value("int32"),
     }
     schema["answers"] = feature.Sequence(
         feature=feature.Dict(feature=answer_fields)
     )

     # --- bucketed features ---
     schema["context_length"] = float_bucket("the length of context")
     schema["question_length"] = float_bucket("the length of question")
     schema["answer_length"] = float_bucket("the length of answer")
     schema["num_oov"] = float_bucket(
         "the number of out-of-vocabulary words",
         require_training_set=True,
     )
     schema["fre_rank"] = float_bucket(
         "the average rank of each word based on its "
         "frequency in training set",
         require_training_set=True,
     )
     return feature.Features(schema)
예제 #8
0
 def default_features(cls) -> feature.Features:
     """Build the default feature schema for aspect/text/label data.

     The schema holds four plain string fields, the bucketed string
     feature ``label`` and five bucketed float features.
     """
     schema = {
         name: feature.Value("string")
         for name in ("aspect", "text", "true_label", "predicted_label")
     }

     # The only discrete (string-valued) bucket feature.
     schema["label"] = feature.Value(
         dtype="string",
         description="category",
         is_bucket=True,
         bucket_info=feature.BucketInfo(
             method="bucket_attribute_discrete_value",
             number=4,
             setting=1,
         ),
     )

     # Float bucket features share one shape; only name/description vary.
     float_descriptions = (
         ("sentence_length", "sentence length"),
         ("token_number", "the number of chars"),
         ("entity_number", "entity numbers"),
         ("aspect_length", "aspect length"),
         ("aspect_index", "aspect position"),
     )
     for name, text in float_descriptions:
         schema[name] = feature.Value(
             dtype="float",
             description=text,
             is_bucket=True,
             bucket_info=feature.BucketInfo(
                 method="bucket_attribute_specified_bucket_value",
                 number=4,
                 setting=(),
             ),
         )

     return feature.Features(schema)
예제 #9
0
 def default_features(cls) -> feature.Features:
     """Build the default schema for source/reference/hypothesis data.

     The schema declares the three raw string fields, seven sentence-level
     bucketed float features and, under ``ref_tok_info``, a sequence of
     dictionaries describing each token.
     """

     def float_bucket(description, **extra):
         """Return a bucketed float ``feature.Value`` with its own BucketInfo.

         Extra keyword arguments are forwarded to ``feature.Value``.
         """
         return feature.Value(
             dtype="float",
             description=description,
             is_bucket=True,
             bucket_info=feature.BucketInfo(
                 method="bucket_attribute_specified_bucket_value",
                 number=4,
                 setting=(),
             ),
             **extra,
         )

     # --- raw fields and sentence-level features ---
     schema = {
         "source": feature.Value("string"),
         "reference": feature.Value("string"),
         "hypothesis": feature.Value("string"),
         "source_length": float_bucket("length of the source"),
         "reference_length": float_bucket("length of the reference"),
         "hypothesis_length": float_bucket("length of the hypothesis"),
         "src_num_oov": float_bucket(
             "OOV words in the source",
             require_training_set=True,
         ),
         "src_fre_rank": float_bucket(
             "average training-set frequency rank of words in sentence",
             require_training_set=True,
         ),
         "ref_num_oov": float_bucket(
             "number of OOV words in reference",
             require_training_set=True,
         ),
         "ref_fre_rank": float_bucket(
             "average training-set frequency rank of words in sentence",
             require_training_set=True,
         ),
     }

     # --- the following are features of each token ---
     token_fields = {
         "tok_text": feature.Value("string"),
         "tok_pos": feature.Position(positions=[0, 0]),
         # Declared as float because, per the original note, the value is
         # really an int but "int" is not supported.
         "tok_matched": feature.Value(
             dtype="float",
             description=(
                 "which token the ref/hyp token matches "
                 "in the hyp/ref sentence, or -1 if none"
             ),
             is_bucket=False,
         ),
         "tok_capitalness": feature.Value(
             dtype="string",
             description="capitalness of token",
             is_bucket=True,
             bucket_info=feature.BucketInfo(
                 method="bucket_attribute_discrete_value",
                 number=4,
                 setting=1,
             ),
         ),
         "tok_position": float_bucket(
             "relative position of token in sentence"
         ),
         "tok_chars": float_bucket("number of characters in the token"),
         "tok_test_freq": float_bucket(
             "tok frequency in the test set",
             require_training_set=False,
         ),
         "tok_train_freq": float_bucket(
             "tok frequency in the training set",
             require_training_set=True,
         ),
     }
     schema["ref_tok_info"] = feature.Sequence(
         feature=feature.Dict(feature=token_fields)
     )

     return feature.Features(schema)
예제 #10
0
 def default_features(cls) -> feature.Features:
     """Build the default schema for context/question-mark/hint/answer data.

     Four plain string fields are followed by six bucketed float features;
     ``num_oov`` and ``fre_rank`` are declared with
     ``require_training_set=True``.
     """

     def float_bucket(description, **extra):
         """Return a bucketed float ``feature.Value`` with its own BucketInfo.

         Extra keyword arguments are forwarded to ``feature.Value``.
         """
         return feature.Value(
             dtype="float",
             description=description,
             is_bucket=True,
             bucket_info=feature.BucketInfo(
                 method="bucket_attribute_specified_bucket_value",
                 number=4,
                 setting=(),
             ),
             **extra,
         )

     # --- raw fields ---
     schema = {
         name: feature.Value("string")
         for name in ("context", "question_mark", "hint", "answers")
     }

     # --- bucketed features ---
     schema["context_length"] = float_bucket("the length of context")
     schema["relative_blank_position"] = float_bucket(
         "the relative position of blank (question mark) "
         "in the whole context"
     )
     schema["absolute_blank_position"] = float_bucket(
         "the absolute position of blank (question mark) "
         "in the whole context"
     )
     schema["answer_length"] = float_bucket("the length of answer")
     schema["num_oov"] = float_bucket(
         "the number of out-of-vocabulary words",
         require_training_set=True,
     )
     schema["fre_rank"] = float_bucket(
         "the average rank of each word based on its "
         "frequency in training set",
         require_training_set=True,
     )
     return feature.Features(schema)
예제 #11
0
 def default_features(cls) -> feature.Features:
     return feature.Features({
         "text":
         feature.Value("string"),
         "log_probs":
         feature.Value("string"),
         "text_length":
         feature.Value(
             dtype="float",
             description="text length in tokens",
             is_bucket=True,
             bucket_info=feature.BucketInfo(
                 method="bucket_attribute_specified_bucket_value",
                 number=4,
                 setting=(),
             ),
         ),
         "text_chars":
         feature.Value(
             dtype="float",
             description="text length in characters",
             is_bucket=True,
             bucket_info=feature.BucketInfo(
                 method="bucket_attribute_specified_bucket_value",
                 number=4,
                 setting=(),
             ),
         ),
         "num_oov":
         feature.Value(
             dtype="float",
             description="the number of out-of-vocabulary words",
             is_bucket=True,
             bucket_info=feature.BucketInfo(
                 method="bucket_attribute_specified_bucket_value",
                 number=4,
                 setting=(),
             ),
             require_training_set=True,
         ),
         "fre_rank":
         feature.Value(
             dtype="float",
             description=(
                 "the average rank of each work based on its frequency in "
                 "training set"),
             is_bucket=True,
             bucket_info=feature.BucketInfo(
                 method="bucket_attribute_specified_bucket_value",
                 number=4,
                 setting=(),
             ),
             require_training_set=True,
         ),
         "length_fre":
         feature.Value(
             dtype="float",
             description="the frequency of text length in training set",
             is_bucket=True,
             bucket_info=feature.BucketInfo(
                 method="bucket_attribute_specified_bucket_value",
                 number=4,
                 setting=(),
             ),
             require_training_set=True,
         ),
         # --- the following are features of each token ---
         "tok_info":
         feature.Sequence(feature=feature.Dict(
             feature={
                 "tok_text":
                 feature.Value("string"),
                 "tok_pos":
                 feature.Position(positions=[0, 0]),
                 "tok_log_prob":
                 feature.Value(
                     dtype="float",
                     description=(
                         "log probability of the token according to the LM"
                     ),
                     is_bucket=False,
                 ),
                 "tok_capitalness":
                 feature.Value(
                     dtype="string",
                     description=(
                         "The capitalness of an token. For example, "
                         "first_caps represents only the first character of "
                         "the token is capital. full_caps denotes all "
                         "characters of the token are capital"),
                     is_bucket=True,
                     bucket_info=feature.BucketInfo(
                         method="bucket_attribute_discrete_value",
                         number=4,
                         setting=1,
                     ),
                 ),
                 "tok_position":
                 feature.Value(
                     dtype="float",
                     description=(
                         "The relative position of a token in a sentence"),
                     is_bucket=True,
                     bucket_info=feature.BucketInfo(
                         method="bucket_attribute_specified_bucket_value",
                         number=4,
                         setting=(),
                     ),
                 ),
                 "tok_chars":
                 feature.Value(
                     dtype="float",
                     description="The number of characters in a token",
                     is_bucket=True,
                     bucket_info=feature.BucketInfo(
                         method="bucket_attribute_specified_bucket_value",
                         number=4,
                         setting=(),
                     ),
                 ),
                 "tok_test_freq":
                 feature.Value(
                     dtype="float",
                     description="tok frequency in the test set",
                     is_bucket=True,
                     require_training_set=False,
                     bucket_info=feature.BucketInfo(
                         method="bucket_attribute_specified_bucket_value",
                         number=4,
                         setting=(),
                     ),
                 ),
                 "tok_train_freq":
                 feature.Value(
                     dtype="float",
                     description="tok frequency in the training set",
                     is_bucket=True,
                     require_training_set=True,
                     bucket_info=feature.BucketInfo(
                         method="bucket_attribute_specified_bucket_value",
                         number=4,
                         setting=(),
                     ),
                 ),
             })),
     })
예제 #12
0
 def default_features(cls) -> feature.Features:
     """Build the default feature schema.

     The schema contains, in this order:

     * three plain string values: ``text``, ``true_label`` and
       ``predicted_label``;
     * ``label``, a string value bucketed with
       ``bucket_attribute_discrete_value``;
     * seven float values that all share one bucketing configuration
       (``bucket_attribute_specified_bucket_value``, ``number=4``,
       ``setting=()``).  The last three of them (``num_oov``, ``fre_rank``
       and ``length_fre``) are additionally created with
       ``require_training_set=True``.

     Returns:
         A ``feature.Features`` object wrapping the mapping from feature
         name to feature definition.
     """

     def bucketed_float(description, **extra):
         # All float features use the same bucketing arguments.  A new
         # BucketInfo is created on every call so that no instance is shared
         # between features.  ``extra`` forwards optional keyword arguments
         # only when the caller supplies them, leaving the defaults of
         # feature.Value untouched otherwise.
         return feature.Value(
             dtype="float",
             description=description,
             is_bucket=True,
             bucket_info=feature.BucketInfo(
                 method="bucket_attribute_specified_bucket_value",
                 number=4,
                 setting=(),
             ),
             **extra,
         )

     schema = {
         "text": feature.Value("string"),
         "true_label": feature.Value("string"),
         "predicted_label": feature.Value("string"),
         "label": feature.Value(
             dtype="string",
             description="category",
             is_bucket=True,
             bucket_info=feature.BucketInfo(
                 method="bucket_attribute_discrete_value",
                 number=4,
                 setting=1,
             ),
         ),
     }

     # Float features created without the require_training_set argument.
     plain_floats = (
         ("text_length", "text length in tokens"),
         ("text_chars", "text length in characters"),
         ("basic_words", "the ratio of basic words"),
         ("lexical_richness", "lexical diversity"),
     )
     for name, description in plain_floats:
         schema[name] = bucketed_float(description)

     # Float features created with require_training_set=True.
     training_set_floats = (
         ("num_oov", "the number of out-of-vocabulary words"),
         (
             "fre_rank",
             "the average rank of each word based on its frequency in "
             "training set",
         ),
         ("length_fre", "the frequency of text length in training set"),
     )
     for name, description in training_set_floats:
         schema[name] = bucketed_float(description, require_training_set=True)

     return feature.Features(schema)