示例#1
0
 def _sort_by_padding(self, instances: List[Instance]) -> List[Instance]:
     """
     Returns `instances` ordered by ascending padding length.

     The sort key of an instance is the list of padding lengths selected by
     `self._sorting_keys`, a list of `(field_name, padding_key)` tuples, compared in the order
     the keys are given.  If no sorting keys are set, `self._guess_sorting_keys(instances)` is
     called before sorting.  Each instance is indexed with `self.vocab` as a side effect, and
     when `self._padding_noise` is positive every per-field length dict is passed through
     `add_noise_to_dict_values` before the key is built.
     """
     if not self._sorting_keys:
         logger.info("No sorting keys given; trying to guess a good one")
         self._guess_sorting_keys(instances)
         logger.info(f"Using {self._sorting_keys} as the sorting keys")

     def sort_key(instance):
         # Make sure instance is indexed before calling .get_padding
         instance.index_fields(self.vocab)
         lengths = cast(Dict[str, Dict[str, float]], instance.get_padding_lengths())
         if self._padding_noise > 0.0:
             lengths = {
                 name: add_noise_to_dict_values(per_field, self._padding_noise)
                 for name, per_field in lengths.items()
             }
         return [lengths[field_name][padding_key] for field_name, padding_key in self._sorting_keys]

     # `sorted` evaluates `sort_key` once per instance, in input order, and is stable,
     # so instances with equal keys keep their relative order.
     return sorted(instances, key=sort_key)
 def _sort_by_padding(
     self,
     instances: List[Instance],
     sorting_keys: List[Tuple[str, str]],  # pylint: disable=invalid-sequence-index
     padding_noise: float = 0.0
 ) -> List[Instance]:
     """
     Returns ``instances`` ordered by ascending padding length.

     ``sorting_keys`` is a list of ``(field_name, padding_key)`` tuples; the padding lengths
     they select, taken in the order given, form the sort key of each instance.  Every instance
     is indexed with ``self.vocab`` first.  When ``padding_noise`` is positive, each per-field
     length dict is passed through ``add_noise_to_dict_values`` before the key is built.
     """
     def lengths_for(instance):
         # Make sure instance is indexed before calling .get_padding
         instance.index_fields(self.vocab)
         per_field = cast(Dict[str, Dict[str, float]], instance.get_padding_lengths())
         if padding_noise > 0.0:
             per_field = {
                 name: add_noise_to_dict_values(values, padding_noise)
                 for name, values in per_field.items()
             }
         return [per_field[field_name][padding_key] for field_name, padding_key in sorting_keys]

     # The key function runs once per instance in input order; the sort is stable.
     return sorted(instances, key=lengths_for)
示例#3
0
 def _sort_dataset_by_padding(
     dataset: Dataset,
     sorting_keys: List[Tuple[str, str]],  # pylint: disable=invalid-sequence-index
     padding_noise: float = 0.0
 ) -> Dataset:
     """
     Builds a new ``Dataset`` holding the instances of ``dataset`` in ascending order of
     padding length.

     ``sorting_keys`` is a list of ``(field_name, padding_key)`` tuples; the padding lengths
     they select, taken in the order given, form the sort key of each instance.  When
     ``padding_noise`` is positive, each per-field length dict is passed through
     ``add_noise_to_dict_values`` before the key is built.
     """
     def lengths_for(instance):
         per_field = cast(Dict[str, Dict[str, float]], instance.get_padding_lengths())
         if padding_noise > 0.0:
             per_field = {
                 name: add_noise_to_dict_values(values, padding_noise)
                 for name, values in per_field.items()
             }
         return [per_field[field_name][padding_key] for field_name, padding_key in sorting_keys]

     # The key function runs once per instance in input order; the sort is stable.
     ordered = sorted(dataset.instances, key=lengths_for)
     return Dataset(ordered)
示例#4
0
def sort_by_padding(instances: List[Instance],
                    sorting_keys: List[Tuple[str, str]],  # pylint: disable=invalid-sequence-index
                    vocab: Vocabulary,
                    padding_noise: float = 0.0) -> List[Instance]:
    """
    Returns ``instances`` ordered by ascending padding length.

    ``sorting_keys`` is a list of ``(field_name, padding_key)`` tuples; the padding lengths
    they select, taken in the order given, form the sort key of each instance.  Every instance
    is indexed with ``vocab`` first.  When ``padding_noise`` is positive, each per-field length
    dict is passed through ``add_noise_to_dict_values`` before the key is built.
    """
    def lengths_for(instance):
        # Make sure instance is indexed before calling .get_padding
        instance.index_fields(vocab)
        per_field = cast(Dict[str, Dict[str, float]], instance.get_padding_lengths())
        if padding_noise > 0.0:
            per_field = {name: add_noise_to_dict_values(values, padding_noise)
                         for name, values in per_field.items()}
        return [per_field[field_name][padding_key] for field_name, padding_key in sorting_keys]

    # The key function runs once per instance in input order; the sort is stable.
    return sorted(instances, key=lengths_for)
def sort_by_padding_modified(
        instances: List[Instance],
        sorting_keys: List[Tuple[str, str]],  # pylint: disable=invalid-sequence-index
        vocab: Vocabulary,
        padding_noise: float = 0.0) -> List[Instance]:
    """
    Returns ``instances`` ordered by ascending padding length, with an extra sortable
    ``("sentences", "num_sentences")`` entry.

    ``sorting_keys`` is a list of ``(field_name, padding_key)`` tuples; the padding lengths
    they select, taken in the order given, form the sort key of each instance.  Every instance
    is indexed with ``vocab`` first.  The dict returned by ``get_padding_lengths()`` is updated
    in place with ``"sentences" -> {"num_sentences": ...}``, where the count is the length of
    ``instance.fields['tokens'].field_list``.  When ``padding_noise`` is positive, each
    per-field length dict is passed through ``add_noise_to_dict_values`` before the key is built.
    """
    def lengths_for(instance):
        # Make sure instance is indexed before calling .get_padding
        instance.index_fields(vocab)
        raw_lengths = instance.get_padding_lengths()
        sentence_count = len(instance.fields['tokens'].field_list)
        # Written into the dict returned by the instance, not into a copy.
        raw_lengths["sentences"] = {"num_sentences": sentence_count}
        per_field = cast(Dict[str, Dict[str, float]], raw_lengths)
        if padding_noise > 0.0:
            per_field = {
                name: add_noise_to_dict_values(values, padding_noise)
                for name, values in per_field.items()
            }
        return [per_field[field_name][padding_key] for field_name, padding_key in sorting_keys]

    # The key function runs once per instance in input order; the sort is stable.
    return sorted(instances, key=lengths_for)
示例#6
0
def sort_by_padding(instances,
                    sorting_keys,  # pylint: disable=invalid-sequence-index
                    vocab,
                    padding_noise=0.0):
    u"""
    Returns ``instances`` ordered by ascending padding length.

    ``sorting_keys`` is a list of ``(field_name, padding_key)`` tuples; the padding lengths
    they select, taken in the order given, form the sort key of each instance.  Every instance
    is indexed with ``vocab`` first.  When ``padding_noise`` is positive, each per-field length
    dict is passed through ``add_noise_to_dict_values`` before the key is built.
    """
    def lengths_for(instance):
        # Make sure instance is indexed before calling .get_padding
        instance.index_fields(vocab)
        per_field = cast(Dict[unicode, Dict[unicode, float]], instance.get_padding_lengths())
        if padding_noise > 0.0:
            per_field = dict((name, add_noise_to_dict_values(values, padding_noise))
                             for name, values in list(per_field.items()))
        return [per_field[field_name][padding_key] for (field_name, padding_key) in sorting_keys]

    # The key function runs once per instance in input order; the sort is stable.
    return sorted(instances, key=lengths_for)
def sort_by_padding(
        instances: List[Instance],
        sorting_keys: List[Tuple[str, str]],  # pylint: disable=invalid-sequence-index
        vocab: Vocabulary,
        padding_noise: float = 0.0) -> List[Instance]:
    """
    Sorts the instances by their padding lengths, using the keys in
    ``sorting_keys`` (in the order in which they are provided).  ``sorting_keys`` is a list of
    ``(field_name, padding_key)`` tuples.

    Instances whose ``index_fields(vocab)`` call raises an ``Exception`` are skipped on a
    best-effort basis: they are removed from ``instances`` in place (so the caller's list is
    shortened) and are absent from the result.  The number skipped is logged at INFO level.
    ``KeyboardInterrupt`` and ``SystemExit`` are not swallowed.

    Parameters
    ----------
    instances : ``List[Instance]``
        The instances to index and sort.  Pruned in place when some fail to index.
    sorting_keys : ``List[Tuple[str, str]]``
        ``(field_name, padding_key)`` pairs selecting the padding lengths to sort on.
    vocab : ``Vocabulary``
        Passed to ``index_fields`` on every instance.
    padding_noise : ``float``, optional (default = 0.0)
        When positive, each per-field length dict is passed through
        ``add_noise_to_dict_values`` with this value before the sort key is built.

    Returns
    -------
    A new list of the indexable instances in ascending order of their sort keys; instances
    with equal keys keep their input order.
    """
    indexable = []
    num_ignored = 0
    for instance in instances:
        try:
            # Make sure instance is indexed before calling .get_padding
            instance.index_fields(vocab)
        except Exception:  # pylint: disable=broad-except
            # Skipping is deliberate best-effort, but keep the reason discoverable.
            logger.debug("Skipping instance that could not be indexed", exc_info=True)
            num_ignored += 1
        else:
            indexable.append(instance)

    if num_ignored:
        logger.info("Ignored Instances: %d", num_ignored)
        # Preserve the in-place pruning of the caller's list, done in a single pass.
        instances[:] = indexable

    def lengths_for(instance):
        per_field = cast(Dict[str, Dict[str, float]], instance.get_padding_lengths())
        if padding_noise > 0.0:
            per_field = {
                name: add_noise_to_dict_values(values, padding_noise)
                for name, values in per_field.items()
            }
        return [per_field[field_name][padding_key] for field_name, padding_key in sorting_keys]

    # The key function runs once per instance in input order; the sort is stable.
    return sorted(indexable, key=lengths_for)