def __init__(
    self,
    input_file: str,
    tokenizer: PreTrainedTokenizerBase,
    mode: str,
    do_basic_tokenize: bool,
    tagger_data_augmentation: bool,
):
    assert mode in constants.MODES
    self.mode = mode
    raw_insts = read_data_file(input_file)

    # Convert raw instances to TaggerDataInstance
    insts = []
    for (_, w_words, s_words) in tqdm(raw_insts):
        for inst_dir in constants.INST_DIRECTIONS:
            if inst_dir == constants.INST_BACKWARD and mode == constants.TN_MODE:
                continue
            if inst_dir == constants.INST_FORWARD and mode == constants.ITN_MODE:
                continue
            # Create a new TaggerDataInstance
            inst = TaggerDataInstance(w_words, s_words, inst_dir, do_basic_tokenize)
            insts.append(inst)
            # Data Augmentation (if enabled)
            if tagger_data_augmentation:
                filtered_w_words, filtered_s_words = [], []
                for ix, (w, s) in enumerate(zip(w_words, s_words)):
                    if s not in constants.SPECIAL_WORDS:
                        filtered_w_words.append(w)
                        filtered_s_words.append(s)
                if len(filtered_s_words) > 1:
                    inst = TaggerDataInstance(filtered_w_words, filtered_s_words, inst_dir)
                    insts.append(inst)

    self.insts = insts
    texts = [inst.input_words for inst in insts]
    tags = [inst.labels for inst in insts]

    # Tags Mapping
    self.tag2id = {tag: id for id, tag in enumerate(constants.ALL_TAG_LABELS)}

    # Finalize
    self.encodings = tokenizer(texts, is_split_into_words=True, padding=False, truncation=True)
    self.labels = self.encode_tags(tags, self.encodings)
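# Usage sketch for the constructor above (not part of the original file). It assumes the
# enclosing class is the tagger dataset (named TextNormalizationTaggerDataset here, an
# assumption), a HuggingFace *fast* tokenizer, and a TSV file in the format expected by
# read_data_file; the path and checkpoint are illustrative only.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")
dataset = TextNormalizationTaggerDataset(
    input_file="train.tsv",            # illustrative path
    tokenizer=tokenizer,
    mode=constants.TN_MODE,            # TN mode: backward (spoken -> written) instances are skipped
    do_basic_tokenize=True,
    tagger_data_augmentation=False,
)
print(len(dataset.insts), len(dataset.labels))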
def __init__(self, input_file: str, mode: str, lang: str, keep_puncts: bool = False):
    self.lang = lang
    insts = read_data_file(input_file)

    # Build inputs and targets
    self.directions, self.inputs, self.targets = [], [], []
    for (_, w_words, s_words) in insts:
        # Extract words that are not punctuations
        processed_w_words, processed_s_words = [], []
        for w_word, s_word in zip(w_words, s_words):
            if s_word == constants.SIL_WORD:
                if keep_puncts:
                    processed_w_words.append(w_word)
                    processed_s_words.append(w_word)
                continue
            if s_word == constants.SELF_WORD:
                processed_s_words.append(w_word)
            if s_word not in constants.SPECIAL_WORDS:
                processed_s_words.append(s_word)
            processed_w_words.append(w_word)
        # Create examples
        for direction in constants.INST_DIRECTIONS:
            if direction == constants.INST_BACKWARD:
                if mode == constants.TN_MODE:
                    continue
                input_words = processed_s_words
                output_words = processed_w_words
            if direction == constants.INST_FORWARD:
                if mode == constants.ITN_MODE:
                    continue
                input_words = w_words
                output_words = processed_s_words
            # Basic tokenization
            input_words = basic_tokenize(' '.join(input_words), lang)
            output_words = basic_tokenize(' '.join(output_words), lang)
            # Update self.directions, self.inputs, self.targets
            self.directions.append(direction)
            self.inputs.append(' '.join(input_words))
            self.targets.append(' '.join(output_words))
    self.examples = list(zip(self.directions, self.inputs, self.targets))
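# Worked example (hypothetical data) for the punctuation / self-word handling above, assuming
# constants.SIL_WORD == 'sil' and constants.SELF_WORD == '<self>' (the concrete values are an
# assumption of this sketch):
#   w_words = ['Hello', ',', 'it', 'is', '2019']
#   s_words = ['<self>', 'sil', '<self>', '<self>', 'twenty nineteen']
# With keep_puncts=False the loop produces
#   processed_w_words = ['Hello', 'it', 'is', '2019']
#   processed_s_words = ['Hello', 'it', 'is', 'twenty nineteen']
# and with keep_puncts=True the ',' is additionally kept in both lists.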
def __init__(self, input_file: str, mode: str, lang: str):
    self.lang = lang
    insts = read_data_file(input_file, lang=lang)

    # Build inputs and targets
    self.directions, self.inputs, self.targets, self.classes, self.nb_spans, self.span_starts, self.span_ends = (
        [],
        [],
        [],
        [],
        [],
        [],
        [],
    )
    for (classes, w_words, s_words) in insts:
        # Extract words that are not punctuations
        for direction in constants.INST_DIRECTIONS:
            if direction == constants.INST_BACKWARD:
                if mode == constants.TN_MODE:
                    continue
                # ITN mode
                (
                    processed_w_words,
                    processed_s_words,
                    processed_classes,
                    processed_nb_spans,
                    processed_s_span_starts,
                    processed_s_span_ends,
                ) = ([], [], [], 0, [], [])
                s_word_idx = 0
                for cls, w_word, s_word in zip(classes, w_words, s_words):
                    if s_word == constants.SIL_WORD:
                        continue
                    elif s_word == constants.SELF_WORD:
                        processed_s_words.append(w_word)
                    else:
                        processed_s_words.append(s_word)
                    processed_nb_spans += 1
                    processed_classes.append(cls)
                    processed_s_span_starts.append(s_word_idx)
                    s_word_idx += len(basic_tokenize(processed_s_words[-1], lang=self.lang))
                    processed_s_span_ends.append(s_word_idx)
                    processed_w_words.append(w_word)
                self.span_starts.append(processed_s_span_starts)
                self.span_ends.append(processed_s_span_ends)
                self.classes.append(processed_classes)
                self.nb_spans.append(processed_nb_spans)
                # Basic tokenization
                input_words = basic_tokenize(' '.join(processed_s_words), lang)
                # Update self.directions, self.inputs, self.targets
                self.directions.append(direction)
                self.inputs.append(' '.join(input_words))
                self.targets.append(
                    processed_w_words
                )  # is list of lists where inner list contains target tokens (not words)
            # TN mode
            elif direction == constants.INST_FORWARD:
                if mode == constants.ITN_MODE:
                    continue
                (
                    processed_w_words,
                    processed_s_words,
                    processed_classes,
                    processed_nb_spans,
                    w_span_starts,
                    w_span_ends,
                ) = ([], [], [], 0, [], [])
                w_word_idx = 0
                for cls, w_word, s_word in zip(classes, w_words, s_words):
                    # TN forward mode
                    if s_word in constants.SPECIAL_WORDS:
                        processed_s_words.append(w_word)
                    else:
                        processed_s_words.append(s_word)
                    w_span_starts.append(w_word_idx)
                    w_word_idx += len(basic_tokenize(w_word, lang=self.lang))
                    w_span_ends.append(w_word_idx)
                    processed_nb_spans += 1
                    processed_classes.append(cls)
                    processed_w_words.append(w_word)
                self.span_starts.append(w_span_starts)
                self.span_ends.append(w_span_ends)
                self.classes.append(processed_classes)
                self.nb_spans.append(processed_nb_spans)
                # Basic tokenization
                input_words = basic_tokenize(' '.join(processed_w_words), lang)
                # Update self.directions, self.inputs, self.targets
                self.directions.append(direction)
                self.inputs.append(' '.join(input_words))
                self.targets.append(
                    processed_s_words
                )  # is list of lists where inner list contains target tokens (not words)
    self.examples = list(
        zip(
            self.directions,
            self.inputs,
            self.targets,
            self.classes,
            self.nb_spans,
            self.span_starts,
            self.span_ends,
        )
    )
def __init__(
    self,
    input_file: str,
    tokenizer: PreTrainedTokenizerBase,
    tokenizer_name: str,
    raw_instances: Optional[List[List[str]]] = None,
    mode: str = "joint",
    max_len: int = 512,
    decoder_data_augmentation: bool = False,
    lang: str = "en",
    do_basic_tokenize: bool = False,
    use_cache: bool = False,
    max_insts: int = -1,
    do_tokenize: bool = True,
    initial_shuffle: bool = False,
):
    assert mode in constants.MODES
    assert lang in constants.SUPPORTED_LANGS
    self.mode = mode
    self.lang = lang
    self.use_cache = use_cache
    self.max_insts = max_insts
    self.tokenizer = tokenizer
    self.max_seq_len = max_len

    # Get cache path
    data_dir, filename = os.path.split(input_file)
    tokenizer_name_normalized = tokenizer_name.replace('/', '_')
    cached_data_file = os.path.join(
        data_dir, f'cached_decoder_{filename}_{tokenizer_name_normalized}_{lang}_{max_insts}_{mode}_{max_len}.pkl'
    )

    if use_cache and os.path.exists(cached_data_file):
        logging.warning(
            f"Processing of {input_file} is skipped as caching is enabled and a cache file "
            f"{cached_data_file} already exists."
        )
        with open(cached_data_file, 'rb') as f:
            data = pickle.load(f)
            self.insts, self.inputs, self.examples, self.tn_count, self.itn_count, self.label_ids_semiotic = data
    else:
        if raw_instances is None:
            raw_instances = read_data_file(fp=input_file, lang=self.lang, max_insts=max_insts)
        else:
            raw_instances = raw_instances[:max_insts]

        if initial_shuffle:
            random.shuffle(raw_instances)

        logging.debug(f"Converting raw instances to DecoderDataInstance for {input_file}...")
        self.insts, all_semiotic_classes = self.__process_raw_entries(
            raw_instances, decoder_data_augmentation=decoder_data_augmentation, do_basic_tokenize=do_basic_tokenize
        )
        logging.debug(
            f"Extracted {len(self.insts)} DecoderDataInstances out of {len(raw_instances)} raw instances."
        )
        self.label_ids_semiotic = OrderedDict({l: idx for idx, l in enumerate(all_semiotic_classes)})
        logging.debug(f'Label_ids: {self.label_ids_semiotic}')
        # Save the labels list from the training file to a file in the same directory as the input file
        dir_name, file_name = os.path.split(input_file)
        if 'train' in file_name:
            with open(os.path.join(dir_name, f"label_ids_{file_name}"), 'w') as f:
                f.write('\n'.join(self.label_ids_semiotic.keys()))

        if do_tokenize:
            logging.debug(f'Processing samples, total number: {len(self.insts)}')
            self.__tokenize_samples(use_cache=use_cache, cached_data_file=cached_data_file)
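# Usage sketch for the cached decoder-dataset constructor above (illustrative only). The class
# name TextNormalizationDecoderDataset and the "t5-small" checkpoint are assumptions; the cache
# file name is derived from input_file, tokenizer_name, lang, max_insts, mode and max_len, so a
# second run with identical settings reuses the pickle instead of reprocessing the data.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-small")
dataset = TextNormalizationDecoderDataset(
    input_file="train.tsv",            # illustrative path
    tokenizer=tokenizer,
    tokenizer_name="t5-small",
    mode="joint",                      # build both TN and ITN instances
    max_len=512,
    decoder_data_augmentation=True,
    lang="en",
    use_cache=True,
    max_insts=-1,
)
print(len(dataset.insts), list(dataset.label_ids_semiotic)[:5])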
def __init__(
    self,
    input_file: str,
    tokenizer: PreTrainedTokenizerBase,
    tokenizer_name: str,
    mode: str,
    do_basic_tokenize: bool,
    tagger_data_augmentation: bool,
    lang: str,
    max_seq_length: int,
    use_cache: bool = False,
    max_insts: int = -1,
):
    assert mode in constants.MODES
    assert lang in constants.SUPPORTED_LANGS
    self.mode = mode
    self.lang = lang
    self.use_cache = use_cache
    self.max_insts = max_insts

    # Get cache path
    data_dir, filename = os.path.split(input_file)
    tokenizer_name_normalized = tokenizer_name.replace('/', '_')
    cached_data_file = os.path.join(
        data_dir, f'cached_tagger_{filename}_{tokenizer_name_normalized}_{lang}_{max_insts}_{max_seq_length}.pkl'
    )

    if use_cache and os.path.exists(cached_data_file):
        logging.warning(
            f"Processing of {input_file} is skipped as caching is enabled and a cache file "
            f"{cached_data_file} already exists."
        )
        with open(cached_data_file, 'rb') as f:
            data = pickle.load(f)
            self.insts, self.tag2id, self.encodings, self.labels = data
    else:
        # Read the input raw data file, returns list of sentences parsed as list of class, w_words, s_words
        raw_insts = read_data_file(input_file, lang=lang)
        if max_insts >= 0:
            raw_insts = raw_insts[:max_insts]

        # Convert raw instances to TaggerDataInstance
        insts = []
        for (_, w_words, s_words) in tqdm(raw_insts):
            for inst_dir in constants.INST_DIRECTIONS:
                if inst_dir == constants.INST_BACKWARD and mode == constants.TN_MODE:
                    continue
                if inst_dir == constants.INST_FORWARD and mode == constants.ITN_MODE:
                    continue
                # filter out examples that are longer than the maximum sequence length value
                if (
                    len(tokenizer(w_words, is_split_into_words=True, padding=False, truncation=True)['input_ids'])
                    >= max_seq_length
                    or len(tokenizer(s_words, is_split_into_words=True, padding=False, truncation=True)['input_ids'])
                    >= max_seq_length
                ):
                    continue
                # Create a new TaggerDataInstance
                inst = TaggerDataInstance(w_words, s_words, inst_dir, do_basic_tokenize)
                insts.append(inst)
                # Data Augmentation (if enabled)
                if tagger_data_augmentation:
                    filtered_w_words, filtered_s_words = [], []
                    for ix, (w, s) in enumerate(zip(w_words, s_words)):
                        if s not in constants.SPECIAL_WORDS:
                            filtered_w_words.append(w)
                            filtered_s_words.append(s)
                    if len(filtered_s_words) > 1:
                        inst = TaggerDataInstance(filtered_w_words, filtered_s_words, inst_dir)
                        insts.append(inst)

        self.insts = insts
        texts = [inst.input_words for inst in insts]
        tags = [inst.labels for inst in insts]

        # Tags Mapping
        self.tag2id = {tag: id for id, tag in enumerate(constants.ALL_TAG_LABELS)}

        # Finalize
        self.encodings = tokenizer(texts, is_split_into_words=True, padding=False, truncation=True)
        self.labels = self.encode_tags(tags, self.encodings)

        # Write to cache (if use_cache)
        if use_cache:
            with open(cached_data_file, 'wb') as out_file:
                data = self.insts, self.tag2id, self.encodings, self.labels
                pickle.dump(data, out_file, protocol=pickle.HIGHEST_PROTOCOL)
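# encode_tags is called above but not shown in this excerpt. Below is a minimal standalone sketch
# of what such a word-tag to sub-token alignment helper can look like, assuming a *fast* tokenizer
# (so BatchEncoding.word_ids() is available) and the conventional -100 ignore index for special
# tokens and sub-token continuations; the actual implementation may differ.
def encode_tags_sketch(tags, encodings, tag2id, ignore_index=-100):
    encoded_labels = []
    for i, sent_tags in enumerate(tags):
        word_ids = encodings.word_ids(batch_index=i)
        label_ids, previous_word_id = [], None
        for word_id in word_ids:
            if word_id is None:
                label_ids.append(ignore_index)                 # special tokens / padding
            elif word_id != previous_word_id:
                label_ids.append(tag2id[sent_tags[word_id]])   # first sub-token of each word
            else:
                label_ids.append(ignore_index)                 # remaining sub-tokens of the word
            previous_word_id = word_id
        encoded_labels.append(label_ids)
    return encoded_labels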
def __init__(
    self,
    input_file: str,
    tokenizer: PreTrainedTokenizerBase,
    tokenizer_name: str,
    mode: str,
    max_len: int,
    decoder_data_augmentation: bool,
    lang: str,
    do_basic_tokenize: bool,
    use_cache: bool = False,
    max_insts: int = -1,
):
    assert mode in constants.MODES
    assert lang in constants.SUPPORTED_LANGS
    self.mode = mode
    self.lang = lang
    self.use_cache = use_cache
    self.max_insts = max_insts

    # Get cache path
    data_dir, filename = os.path.split(input_file)
    tokenizer_name_normalized = tokenizer_name.replace('/', '_')
    cached_data_file = os.path.join(
        data_dir, f'cached_decoder_{filename}_{tokenizer_name_normalized}_{lang}_{max_insts}.pkl'
    )

    if use_cache and os.path.exists(cached_data_file):
        logging.warning(
            f"Processing of {input_file} is skipped as caching is enabled and a cache file "
            f"{cached_data_file} already exists."
        )
        with open(cached_data_file, 'rb') as f:
            data = pickle.load(f)
            self.insts, self.inputs, self.examples, self.tn_count, self.itn_count = data
    else:
        raw_insts = read_data_file(input_file)
        if max_insts >= 0:
            raw_insts = raw_insts[:max_insts]

        # Convert raw instances to DecoderDataInstance
        insts, inputs, targets = [], [], []
        for (classes, w_words, s_words) in tqdm(raw_insts):
            for ix, (_class, w_word, s_word) in enumerate(zip(classes, w_words, s_words)):
                if s_word in constants.SPECIAL_WORDS:
                    continue
                for inst_dir in constants.INST_DIRECTIONS:
                    if inst_dir == constants.INST_BACKWARD and mode == constants.TN_MODE:
                        continue
                    if inst_dir == constants.INST_FORWARD and mode == constants.ITN_MODE:
                        continue
                    # Create a DecoderDataInstance
                    inst = DecoderDataInstance(
                        w_words,
                        s_words,
                        inst_dir,
                        start_idx=ix,
                        end_idx=ix + 1,
                        lang=self.lang,
                        semiotic_class=_class,
                        do_basic_tokenize=do_basic_tokenize,
                    )
                    insts.append(inst)
                    if decoder_data_augmentation:
                        noise_left = random.randint(1, 2)
                        noise_right = random.randint(1, 2)
                        inst = DecoderDataInstance(
                            w_words,
                            s_words,
                            inst_dir,
                            start_idx=ix - noise_left,
                            end_idx=ix + 1 + noise_right,
                            lang=self.lang,
                            do_basic_tokenize=do_basic_tokenize,
                        )
                        insts.append(inst)

        self.insts = insts
        inputs = [inst.input_str for inst in insts]
        targets = [inst.output_str for inst in insts]

        # Tokenization
        self.inputs, self.examples = [], []
        self.tn_count, self.itn_count, long_examples_filtered = 0, 0, 0
        input_max_len, target_max_len = 0, 0
        for idx in range(len(inputs)):
            # Input
            _input = tokenizer([inputs[idx]])
            input_len = len(_input['input_ids'][0])
            if input_len > max_len:
                long_examples_filtered += 1
                continue

            # Target
            _target = tokenizer([targets[idx]])
            target_len = len(_target['input_ids'][0])
            if target_len > max_len:
                long_examples_filtered += 1
                continue

            # Update
            self.inputs.append(inputs[idx])
            _input['labels'] = _target['input_ids']
            self.examples.append(_input)
            if inputs[idx].startswith(constants.TN_PREFIX):
                self.tn_count += 1
            if inputs[idx].startswith(constants.ITN_PREFIX):
                self.itn_count += 1
            input_max_len = max(input_max_len, input_len)
            target_max_len = max(target_max_len, target_len)
        print(f'long_examples_filtered: {long_examples_filtered}')
        print(f'input_max_len: {input_max_len} | target_max_len: {target_max_len}')

        # Write to cache (if use_cache)
        if use_cache:
            with open(cached_data_file, 'wb') as out_file:
                data = self.insts, self.inputs, self.examples, self.tn_count, self.itn_count
                pickle.dump(data, out_file, protocol=pickle.HIGHEST_PROTOCOL)
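# Each entry of self.examples built above is a single-item BatchEncoding whose 'input_ids',
# 'attention_mask' and 'labels' are nested in a batch-of-one list. A hypothetical sketch of the
# __len__/__getitem__ pair such a dataset class typically exposes (shown unindented here, not
# part of this excerpt); a collator would then pad the resulting lists to a common length.
def __len__(self):
    return len(self.examples)

def __getitem__(self, idx):
    example = self.examples[idx]
    return {key: values[0] for key, values in example.items()}  # drop the batch-of-one dimension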
if not isdir(args.output_dir):
    mkdir(args.output_dir)

# Read input datasets and combine them
train, dev, test = [], [], []
for split_name in constants.SPLIT_NAMES:
    if split_name == constants.TRAIN:
        cur_data = train
    if split_name == constants.DEV:
        cur_data = dev
    if split_name == constants.TEST:
        cur_data = test
    # Loop through each input directory
    for input_dir in args.input_dirs:
        input_fp = join(input_dir, f'{split_name}.tsv')
        insts = read_data_file(input_fp)
        cur_data.extend(insts)
print('After combining the datasets:')
print(f'len(train): {len(train)}')
print(f'len(dev): {len(dev)}')
print(f'len(test): {len(test)}')

# Output
for split_name in constants.SPLIT_NAMES:
    output_fp = join(args.output_dir, f'{split_name}.tsv')
    with open(output_fp, 'w+') as output_f:
        if split_name == constants.TRAIN:
            cur_data = train
        if split_name == constants.DEV:
            cur_data = dev
        if split_name == constants.TEST:
            cur_data = test
if not isdir(args.output_dir):
    mkdir(args.output_dir)

# Read input datasets and combine them
train, dev, test = [], [], []
for split_name in constants.SPLIT_NAMES:
    if split_name == constants.TRAIN:
        cur_data = train
    if split_name == constants.DEV:
        cur_data = dev
    if split_name == constants.TEST:
        cur_data = test
    # Loop through each input directory
    for input_dir in args.input_dirs:
        input_fp = join(input_dir, f'{split_name}.tsv')
        insts = read_data_file(input_fp, lang=args.language)
        cur_data.extend(insts)
print('After combining the datasets:')
print(f'len(train): {len(train)}')
print(f'len(dev): {len(dev)}')
print(f'len(test): {len(test)}')

# Output
for split_name in constants.SPLIT_NAMES:
    output_fp = join(args.output_dir, f'{split_name}.tsv')
    with open(output_fp, 'w+') as output_f:
        if split_name == constants.TRAIN:
            cur_data = train
        if split_name == constants.DEV:
            cur_data = dev
        if split_name == constants.TEST:
            cur_data = test
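# A sketch of the argument parsing these combination scripts assume (it is not shown in the
# excerpt); the option names simply mirror the attributes used above (args.input_dirs,
# args.output_dir, args.language), so treat them as an assumption rather than the real CLI.
import argparse

parser = argparse.ArgumentParser(description='Combine several TN/ITN datasets into one')
parser.add_argument('--input_dirs', nargs='+', required=True,
                    help='input directories, each containing train.tsv / dev.tsv / test.tsv')
parser.add_argument('--output_dir', required=True,
                    help='directory where the combined train/dev/test.tsv files are written')
parser.add_argument('--language', default='en',
                    help='language code forwarded to read_data_file')
args = parser.parse_args()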
def __init__(
    self,
    input_file: str,
    tokenizer: PreTrainedTokenizerBase,
    mode: str,
    max_len: int,
    decoder_data_augmentation: bool,
):
    assert mode in constants.MODES
    self.mode = mode
    raw_insts = read_data_file(input_file)

    # Convert raw instances to DecoderDataInstance
    insts, inputs, targets = [], [], []
    for (classes, w_words, s_words) in tqdm(raw_insts):
        for ix, (_class, w_word, s_word) in enumerate(zip(classes, w_words, s_words)):
            if s_word in constants.SPECIAL_WORDS:
                continue
            for inst_dir in constants.INST_DIRECTIONS:
                if inst_dir == constants.INST_BACKWARD and mode == constants.TN_MODE:
                    continue
                if inst_dir == constants.INST_FORWARD and mode == constants.ITN_MODE:
                    continue
                # Create a DecoderDataInstance
                inst = DecoderDataInstance(
                    w_words, s_words, inst_dir, start_idx=ix, end_idx=ix + 1, semiotic_class=_class
                )
                insts.append(inst)
                if decoder_data_augmentation:
                    noise_left = random.randint(1, 2)
                    noise_right = random.randint(1, 2)
                    inst = DecoderDataInstance(
                        w_words, s_words, inst_dir, start_idx=ix - noise_left, end_idx=ix + 1 + noise_right
                    )
                    insts.append(inst)

    self.insts = insts
    inputs = [inst.input_str for inst in insts]
    targets = [inst.output_str for inst in insts]

    # Tokenization
    self.inputs, self.examples = [], []
    self.tn_count, self.itn_count, long_examples_filtered = 0, 0, 0
    input_max_len, target_max_len = 0, 0
    for idx in range(len(inputs)):
        # Input
        _input = tokenizer([inputs[idx]])
        input_len = len(_input['input_ids'][0])
        if input_len > max_len:
            long_examples_filtered += 1
            continue

        # Target
        _target = tokenizer([targets[idx]])
        target_len = len(_target['input_ids'][0])
        if target_len > max_len:
            long_examples_filtered += 1
            continue

        # Update
        self.inputs.append(inputs[idx])
        _input['labels'] = _target['input_ids']
        self.examples.append(_input)
        if inputs[idx].startswith(constants.TN_PREFIX):
            self.tn_count += 1
        if inputs[idx].startswith(constants.ITN_PREFIX):
            self.itn_count += 1
        input_max_len = max(input_max_len, input_len)
        target_max_len = max(target_max_len, target_len)
    print(f'long_examples_filtered: {long_examples_filtered}')
    print(f'input_max_len: {input_max_len} | target_max_len: {target_max_len}')
def __init__(self, input_file: str, mode: str, lang: str):
    self.lang = lang
    insts = read_data_file(input_file, lang=lang)
    processor = MosesProcessor(lang_id=lang)

    # Build inputs and targets
    self.directions, self.inputs, self.targets, self.classes, self.nb_spans, self.span_starts, self.span_ends = (
        [],
        [],
        [],
        [],
        [],
        [],
        [],
    )
    for (classes, w_words, s_words) in insts:
        # Extract words that are not punctuations
        for direction in constants.INST_DIRECTIONS:
            if direction == constants.INST_BACKWARD:
                if mode == constants.TN_MODE:
                    continue
                # ITN mode
                (
                    processed_w_words,
                    processed_s_words,
                    processed_classes,
                    processed_nb_spans,
                    processed_s_span_starts,
                    processed_s_span_ends,
                ) = ([], [], [], 0, [], [])
                s_word_idx = 0
                for cls, w_word, s_word in zip(classes, w_words, s_words):
                    if s_word == constants.SIL_WORD:
                        continue
                    elif s_word == constants.SELF_WORD:
                        processed_s_words.append(w_word)
                    else:
                        processed_s_words.append(s_word)
                    s_word_last = processor.tokenize(processed_s_words.pop()).split()
                    processed_s_words.append(" ".join(s_word_last))
                    num_tokens = len(s_word_last)
                    processed_nb_spans += 1
                    processed_classes.append(cls)
                    processed_s_span_starts.append(s_word_idx)
                    s_word_idx += num_tokens
                    processed_s_span_ends.append(s_word_idx)
                    processed_w_words.append(w_word)
                self.span_starts.append(processed_s_span_starts)
                self.span_ends.append(processed_s_span_ends)
                self.classes.append(processed_classes)
                self.nb_spans.append(processed_nb_spans)
                input_words = ' '.join(processed_s_words)
                # Update self.directions, self.inputs, self.targets
                self.directions.append(direction)
                self.inputs.append(input_words)
                self.targets.append(
                    processed_w_words
                )  # is list of lists where inner list contains target tokens (not words)
            # TN mode
            elif direction == constants.INST_FORWARD:
                if mode == constants.ITN_MODE:
                    continue
                (
                    processed_w_words,
                    processed_s_words,
                    processed_classes,
                    processed_nb_spans,
                    w_span_starts,
                    w_span_ends,
                ) = ([], [], [], 0, [], [])
                w_word_idx = 0
                for cls, w_word, s_word in zip(classes, w_words, s_words):
                    # TN forward mode
                    # this is done for cases like `do n't`, this w_word will be treated as 2 tokens
                    w_word = processor.tokenize(w_word).split()
                    num_tokens = len(w_word)
                    if s_word in constants.SPECIAL_WORDS:
                        processed_s_words.append(" ".join(w_word))
                    else:
                        processed_s_words.append(s_word)
                    w_span_starts.append(w_word_idx)
                    w_word_idx += num_tokens
                    w_span_ends.append(w_word_idx)
                    processed_nb_spans += 1
                    processed_classes.append(cls)
                    processed_w_words.extend(w_word)
                self.span_starts.append(w_span_starts)
                self.span_ends.append(w_span_ends)
                self.classes.append(processed_classes)
                self.nb_spans.append(processed_nb_spans)
                input_words = ' '.join(processed_w_words)
                # Update self.directions, self.inputs, self.targets
                self.directions.append(direction)
                self.inputs.append(input_words)
                self.targets.append(
                    processed_s_words
                )  # is list of lists where inner list contains target tokens (not words)
    self.examples = list(
        zip(
            self.directions,
            self.inputs,
            self.targets,
            self.classes,
            self.nb_spans,
            self.span_starts,
            self.span_ends,
        )
    )
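# Worked span-bookkeeping example (hypothetical data) for the forward/TN branch above. Suppose
# constants.SELF_WORD == '<self>' (an assumption) and, as in the comment above, the tokenizer
# splits the contraction into two tokens:
#   classes = ['PLAIN', 'DATE'],  w_words = ["don't", '2019'],  s_words = ['<self>', 'twenty nineteen']
#   processor.tokenize("don't").split() -> ['do', "n't"]
# The loop then produces
#   processed_w_words = ['do', "n't", '2019']           -> self.inputs entry "do n't 2019"
#   processed_s_words = ["do n't", 'twenty nineteen']   -> per-span targets
#   w_span_starts = [0, 2],  w_span_ends = [2, 3],  processed_nb_spans = 2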