Exemplo n.º 1
0
 def non_preprocessed_repr(
         self,
         repr_config: Optional[ReprConfig] = None
 ) -> Tuple[List[str], PreprocessingMetadata]:
     """Render this string literal as prep tokens plus metadata.

     Behavior depends on the config:
     - no config (called by str()): the raw subtokens joined into one word;
     - literal longer than ``max_str_length``: replaced by an empty-string
       placeholder (one ``'""'`` token for full-string mode, else two
       quote tokens, with ``'"'`` marked non-processable);
     - BPE mode: the (possibly ascii-sanitized) text split by the
       configured word splitter;
     - full-string mode: the (possibly ascii-sanitized) text as one word;
     - otherwise: each subtoken rendered individually, in-string spaces
       dropped, and all tokens typed as StringLiteral.
     """
     if not repr_config:  #called by str()
         return self.wrap_in_metadata_for_full_word(
             ["".join(map(lambda t: str(t), self.subtokens))])
     elif self.length > repr_config.max_str_length:
         s = ['""'] if repr_config.full_strings else ['"', '"']
         # BUGFIX: `{}` is an empty *dict* literal, not an empty set —
         # the other branch builds a set, so use set() for a consistent type.
         non_processable_tokens = set() if repr_config.full_strings else {'"'}
         return self.wrap_in_metadata_for_full_word(s,
                                                    non_processable_tokens)
     elif repr_config.bpe_data:
         s = self._replace_non_ascii_seqs_if_necessary(repr_config)
         return self.wrap_in_metadata_for_full_word(
             repr_config.word_splitter(s, repr_config.bpe_data))
     elif repr_config.full_strings:
         s = self._replace_non_ascii_seqs_if_necessary(repr_config)
         return self.wrap_in_metadata_for_full_word([s])
     else:  # here we dont keep full strings
         # Drop in-string whitespace subtokens, render the rest.
         # NOTE: exact type check (not isinstance) is deliberate here —
         # only SpaceInString itself is filtered, not subclasses.
         tokens, metadata = torepr(
             list(filter(lambda t: type(t) != SpaceInString,
                         self.subtokens)), repr_config)
         metadata.set_all_tokens_type(StringLiteral)
         return tokens, metadata
Exemplo n.º 2
0
 def non_preprocessed_repr(
         self,
         repr_config: Optional[ReprConfig] = None
 ) -> Tuple[List[str], PreprocessingMetadata]:
     """Render the subtokens and tag every resulting token as a
     MultilineComment in the accompanying metadata."""
     tokens, meta = torepr(self.subtokens, repr_config)
     meta.set_all_tokens_type(MultilineComment)
     return tokens, meta
Exemplo n.º 3
0
 def non_preprocessed_repr(
         self,
         repr_config: Optional[ReprConfig] = None
 ) -> Tuple[List[str], PreprocessingMetadata]:
     """Concatenate the first rendered token of each subtoken into a
     single unsplit word and wrap it in full-word metadata."""
     pieces = []
     for sub in self.subtokens:
         # torepr returns (tokens, metadata); take the first token only.
         pieces.append(torepr(sub, repr_config)[0][0])
     return self.wrap_in_metadata_for_full_word(["".join(pieces)])
Exemplo n.º 4
0
 def preprocessed_repr(
         self, repr_config: ReprConfig
 ) -> Tuple[List[str], PreprocessingMetadata]:
     """Render a non-English token.

     Without BPE data the token collapses to the ``non_eng`` placeholder;
     with BPE data, non-ascii sequences are replaced and the sanitized
     text is re-rendered as a fresh SplitContainer.
     """
     if not repr_config.bpe_data:
         return self.wrap_in_metadata_for_full_word(
             [placeholders['non_eng']])
     cleaned = replace_non_ascii_seqs(str(self.processable_token),
                                      placeholders['non_ascii_seq'])
     return torepr(SplitContainer.from_single_token(cleaned), repr_config)
Exemplo n.º 5
0
 def non_preprocessed_repr(
         self,
         repr_config: Optional[ReprConfig] = None
 ) -> Tuple[List[str], PreprocessingMetadata]:
     """Render the comment's subtokens, extend the metadata with a
     one-token boundary typed OneLineComment, retype every token as
     OneLineComment, and append the end-of-comment placeholder."""
     tokens, meta = torepr(self.subtokens, repr_config)
     boundary_meta = PreprocessingMetadata(word_boundaries=[0, 1],
                                           token_types=[OneLineComment])
     meta.update(boundary_meta)
     meta.set_all_tokens_type(OneLineComment)
     return tokens + [placeholders['olc_end']], meta
Exemplo n.º 6
0
 def preprocessed_repr(
         self, repr_config) -> Tuple[List[str], PreprocessingMetadata]:
     """Render this container's tokens.

     In BPE mode the whole stringified container goes through the
     configured word splitter. Otherwise each subtoken is rendered in
     turn, results are flattened into one list, their metadata merged,
     and the whole thing wrapped (with word boundaries if needed) as a
     full word carrying the merged non-processable tokens.
     """
     if repr_config.bpe_data:
         split = repr_config.word_splitter(str(self), repr_config.bpe_data)
         return self.wrap_in_metadata_for_full_word(split)
     combined_meta = PreprocessingMetadata()
     pieces = []
     for sub in self.subtokens:
         sub_repr, sub_meta = torepr(sub, repr_config)
         # torepr may yield a single token or a list; normalize to a list.
         pieces.extend(sub_repr if isinstance(sub_repr, list) else [sub_repr])
         combined_meta.update(sub_meta)
     return self.wrap_in_metadata_for_full_word(
         wrap_in_word_boundaries_if_necessary(pieces),
         combined_meta.nonprocessable_tokens)
Exemplo n.º 7
0
 def non_preprocessed_repr(
         self,
         repr_config: Optional[ReprConfig] = None
 ) -> Tuple[List[str], PreprocessingMetadata]:
     """Delegate rendering entirely to torepr on the wrapped token."""
     rendered = torepr(self.processable_token, repr_config)
     return rendered