def replace_unk(hypo_str, src_str, alignment, align_dict, unk):
    # Tokens are strings here
    hypo_tokens = tokenizer.tokenize_line(hypo_str)
    # TODO: Very rare cases where the replacement is '<eos>' should be handled gracefully
    src_tokens = tokenizer.tokenize_line(src_str) + ['<eos>']
    for i, ht in enumerate(hypo_tokens):
        if ht == unk:
            src_token = src_tokens[alignment[i]]
            # Either take the corresponding value in the aligned dictionary or just copy the original value.
            hypo_tokens[i] = align_dict.get(src_token, src_token)
    return ' '.join(hypo_tokens)
def replace_unk(hypo_str, src_str, alignment, align_dict, unk):
    from fairseq import tokenizer

    # Tokens are strings here
    hypo_tokens = tokenizer.tokenize_line(hypo_str)
    src_tokens = tokenizer.tokenize_line(src_str) + ['<eos>']
    for i, ht in enumerate(hypo_tokens):
        if ht == unk:
            src_token = src_tokens[alignment[i]]
            # Either take the corresponding value in the aligned dictionary or just copy the original value.
            hypo_tokens[i] = align_dict.get(src_token, src_token)
    return ' '.join(hypo_tokens)
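# Minimal usage sketch for the replace_unk variants above. Assumptions (not taken from
# any of the surrounding projects): tokenize_line is fairseq.tokenizer.tokenize_line,
# which just normalizes whitespace and splits, and the toy strings, alignment, and
# align_dict below are made up for illustration.
from fairseq.tokenizer import tokenize_line

assert tokenize_line("  ein   Haus \n") == ["ein", "Haus"]

hypo = "a <unk> with a garden"
src = "ein Haus mit einem Garten"
alignment = [0, 1, 2, 3, 4]        # hypothesis position -> source position
align_dict = {"Haus": "house"}     # optional source-to-target word table
# replace_unk(hypo, src, alignment, align_dict, "<unk>")
# -> "a house with a garden"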
def parse(self, path, word_dict, char_dict, reverse_order=False, append_eos=False):
    word_array_list = []
    word_offsets = [0]
    char_array_list = []
    char_offsets = [0]
    sizes = []
    with open(path, "r") as f:
        for line in f:
            words = tokenizer.tokenize_line(line)
            if reverse_order:
                words.reverse()
            word_inds = [word_dict.index(w) for w in words]
            if append_eos:
                word_inds.append(word_dict.eos_index)
            word_array_list.append(np.array(word_inds, dtype=np.int32))
            word_offsets.append(word_offsets[-1] + len(word_inds))
            sizes.append(len(word_inds))
            for word in words:
                chars = [word] if word in TAGS else list(word)
                char_inds = [char_dict.index(c) for c in chars]
                char_array_list.append(np.array(char_inds, dtype=np.int32))
                char_offsets.append(char_offsets[-1] + len(char_inds))
    self.word_buffer = np.concatenate(word_array_list)
    self.word_offsets = np.array(word_offsets, dtype=np.int32)
    self.char_buffer = np.concatenate(char_array_list)
    self.char_offsets = np.array(char_offsets, dtype=np.int32)
    self.sizes = np.array(sizes, dtype=np.int32)
    del word_array_list, word_offsets, char_array_list, char_offsets, sizes
def encode_line(line, vocab, add_if_not_exist=True, consumer=None, append_eos=True, reverse_order=False):
    """Copied from fairseq.data.Dictionary.encode_line, with the ids tensor type
    changed to Long (int64)."""
    words = tokenize_line(line)
    if reverse_order:
        words = list(reversed(words))
    nwords = len(words)
    ids = torch.LongTensor(nwords + 1 if append_eos else nwords)
    for i, word in enumerate(words):
        if add_if_not_exist:
            idx = vocab.add_symbol(word)
        else:
            idx = vocab.index(word)
        if consumer is not None:
            consumer(word, idx)
        ids[i] = idx
    if append_eos:
        ids[nwords] = vocab.eos_index
    return ids
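# Hedged usage sketch for encode_line above. Assumptions: the vocabulary is a
# fairseq.data.Dictionary (add_symbol/index/eos_index API), and the example sentence
# is arbitrary; only shape, dtype, and the trailing eos are checked.
import torch
from fairseq.data import Dictionary

vocab = Dictionary()
ids = encode_line("hello world hello", vocab, add_if_not_exist=True)
assert ids.dtype == torch.int64           # Long, as the docstring promises
assert ids[-1].item() == vocab.eos_index  # eos appended by default
assert len(ids) == 4                      # 3 tokens + eos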
def read_text(self, utt_ids: List[str], token_text: List[str], dictionary=None):
    assert len(utt_ids) == len(token_text)
    self.utt_ids = utt_ids
    self.tokens_list = token_text
    self.tensor_list = []
    self.size = len(self.utt_ids)  # number of utterances
    self.sizes = []
    if dictionary is not None:
        for tokens in self.tokens_list:
            tensor = dictionary.encode_line(
                tokens, add_if_not_exist=False, append_eos=self.append_eos,
            ).long()
            self.tensor_list.append(tensor)
            self.sizes.append(len(self.tensor_list[-1]))
    else:
        self.sizes = [
            len(tokenize_line(tokens)) for tokens in self.tokens_list
        ]
    self.sizes = np.array(self.sizes, dtype=np.int32)

    assert len(self.utt_ids) == len(self.tokens_list) and \
        (dictionary is None or len(self.utt_ids) == len(self.tensor_list)) and \
        len(self.utt_ids) == len(self.sizes)
def parse_sentences(parser, in_tokenized_sentences, batch_size, roberta_batch_size, out_amr):
    # read tokenized sentences
    sentences = read_sentences(in_tokenized_sentences)
    split_sentences = []
    for sentence in sentences:
        split_sentences.append(tokenize_line(sentence))
    print(len(split_sentences))

    # parse
    start = time.time()
    result = parser.parse_sentences(
        split_sentences,
        batch_size=batch_size,
        roberta_batch_size=roberta_batch_size,
    )
    end = time.time()
    print(len(result))
    time_secs = timedelta(seconds=float(end - start))
    print(f'Total time taken to parse sentences: {time_secs}')

    # write annotations
    if out_amr:
        with open(out_amr, 'w') as fid:
            for i in range(0, len(sentences)):
                fid.write(result[i])
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("input", help="input file; use - for stdin")
    args = parser.parse_args()

    # tokenise based on space
    for line in fileinput.input([args.input], openhook=fileinput.hook_compressed):
        line = tokenizer.tokenize_line(line)
        line = " ".join(line)
        sys.stdout.write(line + "\n")
def read_text(self, utt_ids: List[str], texts: List[str], dictionary=None):
    assert len(utt_ids) == len(texts)
    self.utt_ids = utt_ids
    self.texts = texts
    self.size = len(self.utt_ids)  # number of utterances

    from fairseq.tokenizer import tokenize_line
    if dictionary is not None:
        self.sizes = [
            len(tokenize_line(dictionary.wordpiece_encode(text)))
            + (1 if self.append_eos else 0)
            for text in texts
        ]
    else:
        self.sizes = [len(tokenize_line(text)) for text in texts]
    self.sizes = np.array(self.sizes, dtype=np.int32)

    assert len(self.utt_ids) == len(self.sizes)
def tokenize(self, text: str, add_start: bool = True):
    masked_text = text.replace(MASK, ROBERTA_MASK)
    text_spans = masked_text.split(ROBERTA_MASK)
    text_spans_bpe = ' {0} '.format(ROBERTA_MASK).join([
        self.bpe.encode(text_span.rstrip())
        for text_span in text_spans
    ]).strip()
    if add_start:
        text_spans_bpe = ROBERTA_START_SENTENCE + ' ' + text_spans_bpe
    return tokenize_line(text_spans_bpe)
def char_tokenize(line):
    words = tokenizer.tokenize_line(line)
    chars = []
    for word in words:
        if word in TAGS:
            chars.append(word)
        else:
            chars.extend(c for c in word)
    return chars
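# Self-contained sketch of the character-level tokenization idea in char_tokenize above.
# Assumptions: TAGS is a module-level set of special markers (e.g. "<unk>") that must not
# be split into characters; the values and the helper name _char_tokenize are illustrative.
from fairseq.tokenizer import tokenize_line

TAGS = {"<unk>", "<pad>"}

def _char_tokenize(line):
    chars = []
    for word in tokenize_line(line):
        if word in TAGS:
            chars.append(word)   # keep special markers intact
        else:
            chars.extend(word)   # split ordinary words into characters
    return chars

assert _char_tokenize("<unk> cat") == ["<unk>", "c", "a", "t"]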
def check_wordpiece_to_word_map(input_file, raise_error):
    num_sents = 0
    with open(input_file, 'r') as fid:
        for sentence in tqdm(fid):
            if not sentence:
                break
            sentence = " ".join(tokenize_line(str(sentence.rstrip())))
            # print("input: ", sentence)
            word2piece = get_wordpiece_to_word_map(sentence, roberta.bpe, raise_error)
def _sent_to_word_ids(self, sent, word_dict, reverse_order, prepend_inds, append_inds):
    """Extract the word ids for words associated with the input sentence."""
    words = tokenizer.tokenize_line(sent)
    if reverse_order:
        words.reverse()
    word_inds = [word_dict.index(w) for w in words]
    word_inds = prepend_inds + word_inds + append_inds
    return words, word_inds
def replace_unk(hypo_str, src_str, alignment, align_dict, unk, input_str):
    from fairseq import tokenizer

    # Tokens are strings here
    hypo_tokens = tokenizer.tokenize_line(hypo_str)
    # TODO: Very rare cases where the replacement is '<eos>' should be handled gracefully
    src_tokens = tokenizer.tokenize_line(src_str) + ["<eos>"]
    for i, ht in enumerate(hypo_tokens):
        if ht == unk:
            src_idx, tgt_index = alignment[i]
            src_token = src_tokens[src_idx]
            # Either take the corresponding value in the aligned dictionary or just copy
            # the original value (the first argument is searched for, the second is
            # returned if it is not found).
            hypo_tokens[i] = align_dict.get(src_token, src_token)
            if hypo_tokens[i] == unk and input_str is not None:
                input_tokens = tokenizer.tokenize_line(input_str) + ["<eos>"]
                # replace unk token with the corresponding word from the raw input string
                hypo_tokens[i] = input_tokens[src_idx]
    return " ".join(hypo_tokens)
def noise(file, ofile_suffix):
    # NOTE: `tgts` and `NoiseInjector` come from the enclosing module scope.
    noise_injector = NoiseInjector(tgts)
    with open(ofile_suffix + '.src', 'w') as src_ofile, \
            open(ofile_suffix + '.tgt', 'w') as tgt_ofile, \
            open(ofile_suffix + '.forward', 'w') as align_ofile:
        for line in file:
            tgt = tokenize_line(line.strip())
            src, align = noise_injector.inject_noise(tgt)
            tgt_ofile.write(' '.join(tgt) + '\n')
            src_ofile.write(' '.join(src) + '\n')
            align_ofile.write(' '.join(align) + '\n')
def replace_unk(hypo_str, align_str, src, unk):
    hypo_tokens = hypo_str.split()
    src_tokens = tokenizer.tokenize_line(src)
    align_idx = [int(i) for i in align_str.split()]
    for i, ht in enumerate(hypo_tokens):
        if ht == unk:
            src_token = src_tokens[align_idx[i]]
            if src_token in align_dict:
                hypo_tokens[i] = align_dict[src_token]
            else:
                hypo_tokens[i] = src_token
    return ' '.join(hypo_tokens)
def noise(filename, ofile_suffix):
    lines = open(filename).readlines()
    tgts = [tokenize_line(line.strip()) for line in lines]
    noise_injector = NoiseInjector(tgts)

    srcs = []
    aligns = []
    for tgt in tgts:
        src, align = noise_injector.inject_noise(tgt)
        srcs.append(src)
        aligns.append(align)

    save_file('{}.src'.format(ofile_suffix), srcs)
    save_file('{}.tgt'.format(ofile_suffix), tgts)
    save_file('{}.forward'.format(ofile_suffix), aligns)
def _sent_to_word_ids(self, sent, word_dict, reverse_order=False, append_eos=False):
    """Extract the word ids for words associated with the input sentence."""
    words = tokenizer.tokenize_line(sent)
    if reverse_order:
        words.reverse()
    word_inds = [word_dict.index(w) for w in words]
    if append_eos:
        word_inds.append(word_dict.eos_index)
    return words, word_inds
def encode_labels_line(labels_line, append_eos=True, reverse_order=False):
    """Custom helper: encode a string of space-separated binary labels into a tensor.

    Mimics fairseq.data.dictionary.Dictionary.encode_line(); eos always gets a zero
    label (no change). Returns a torch.IntTensor, analogous to the dictionary's
    encode_line() method.
    """
    labels = [int(label) for label in tokenize_line(labels_line)]
    assert all(label in [0, 1] for label in labels), \
        "encode_labels_line: token-level labels must be binary!"
    if reverse_order:
        labels = list(reversed(labels))
    if append_eos:
        labels.append(0)
    return torch.tensor(labels, dtype=torch.int)
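# Quick check of encode_labels_line above (the label string is a made-up example; the
# expected output follows directly from the code: whitespace split, optional reversal,
# and a trailing 0 for eos when append_eos=True).
labels = encode_labels_line("0 1 1 0")
assert labels.tolist() == [0, 1, 1, 0, 0]   # last 0 is the eos position
assert labels.dtype == torch.int32          # torch.int is int32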
def binarize_file(input_file, out_file_pref, impl, dtype=np.int64, tokenize=tokenize_line):
    out_file = out_file_pref + '.bin'
    index_file = out_file_pref + '.idx'
    ds = make_builder(out_file, impl=impl, dtype=dtype)
    with open(input_file, 'r') as f:
        for line in f:
            if line.strip():
                line = tokenize(line)
                line = list(map(int, line))
                line = torch.tensor(line)
                ds.add_item(line)
            else:
                raise Exception('empty line')
    ds.finalize(index_file)
    return
def convert_sentences_to_data(self, sentences, batch_size, roberta_batch_size):
    # extract RoBERTa features
    roberta_features = \
        self.get_bert_features_batched(sentences, roberta_batch_size)

    # organize data into a fairseq batch
    data = []
    for index, sentence in enumerate(sentences):
        ids = self.get_token_ids(sentence)
        word_features, wordpieces_roberta, word2piece_scattered_indices = \
            roberta_features[index]
        data.append({
            'id': index,
            'source': ids,
            'source_fix_emb': word_features,
            'src_wordpieces': wordpieces_roberta,
            'src_wp2w': word2piece_scattered_indices,
            'src_tokens': tokenize_line(sentence)  # original source tokens
        })
    return data
def read_data(self, path, dictionary, ex_dict):
    with open(path, 'r') as f:
        for i, line in enumerate(f):
            # use complete words for segmentation on the target side
            if self.is_tgt:
                s = " ".join([
                    "▁" + word
                    for word in "".join(line.strip("\n").split())
                    .replace("▁", " ").strip().split()
                ])
                self.lines.append(s)
            else:
                self.lines.append(line.strip('\n'))
            tokens = tokenize_line(line)
            if self.is_tgt:
                tokens = [c for c in "".join(tokens)]
            if self.append_eos:
                tokens.append(self.dictionary.eos())
            # self.tokens_list.append(tokens)
            self.sizes.append(len(tokens))
    self.sizes = np.array(self.sizes)
def main():
    # argument handling
    args = argument_parsing()

    # read tokenized sentences
    sentences = read_sentences(args.in_tokenized_sentences)
    split_sentences = []
    for sentence in sentences:
        split_sentences.append(tokenize_line(sentence))
    print(len(split_sentences))

    # load parser
    start = time.time()
    parser = AMRParser.from_checkpoint(args.in_checkpoint)
    end = time.time()
    time_secs = timedelta(seconds=float(end - start))
    print(f'Total time taken to load parser: {time_secs}')

    # TODO: max batch sizes could be computed from max sentence length

    # parse
    start = time.time()
    result = parser.parse_sentences(
        split_sentences,
        batch_size=args.batch_size,
        roberta_batch_size=args.roberta_batch_size,
    )
    end = time.time()
    print(len(result))
    time_secs = timedelta(seconds=float(end - start))
    print(f'Total time taken to parse sentences: {time_secs}')

    # write annotations
    with open(args.out_amr, 'w') as fid:
        for i in range(0, len(sentences)):
            fid.write(result[i])
def parse(self, path, dict, reverse_order=False, append_eos=False):
    array_list = []
    offsets = [0]
    sizes = []
    with open(path, "r") as f:
        for line in f:
            words = tokenizer.tokenize_line(line)
            if reverse_order:
                words.reverse()
            inds = [dict.index(w) for w in words]
            if append_eos:
                inds.append(dict.eos_index)
            array_list.append(np.array(inds, dtype=np.int32))
            offsets.append(offsets[-1] + len(inds))
            sizes.append(len(inds))

    # +1 for Lua compatibility
    self.buffer = np.concatenate(array_list) + 1
    self.offsets = np.array(offsets, dtype=np.int32)
    self.sizes = np.array(sizes, dtype=np.int32)
    del array_list
    del offsets
    del sizes
def parse_multilingual(
    self,
    corpora,
    reverse_order=False,
    append_eos=False,
    prepend_language_id=True,
    already_numberized=False,
):
    """Add sentences from text files to the dataset.

    This method reads pairs of text files containing source and target
    sides of a bitext. Sentences are converted to integer sequences by
    tokenization and dictionary look-up. Note that this method removes all
    sentences which have been previously added to the data set.

    Example (single sentence):
        token_sequence = [123, 234, 345]
        dict.eos_idx = 2
        dialect_id = 10
    Result:
        reverse_order=False, append_eos=True, prepend_language_id=True:
            [10, 123, 234, 345, 2]
        reverse_order=False, append_eos=True, prepend_language_id=False:
            [123, 234, 345, 2, 10]
        reverse_order=True, append_eos=True, prepend_language_id=True:
            [10, 345, 234, 123, 2]
        reverse_order=True, append_eos=True, prepend_language_id=False:
            [345, 234, 123, 2, 10]

    Args:
        corpora: List of MultilingualCorpusConfig. If dialect_id is not
            None, it is added to the token sequence.
        reverse_order (bool): Whether to reverse the integer token sequence.
        append_eos (bool): Whether to add the end-of-sentence symbol to each
            sentence.
        prepend_language_id (bool): Only used if dialect_id is not None. If
            true, add the ID at the beginning of the token sequence;
            otherwise, add it at the end of the token sequence.
        already_numberized (bool): If data_file contains lines of numberized
            tokens, already_numberized should be set to True. If data_file
            contains raw text sentences, already_numberized should be False
            (default) -- in which case each line is tokenized with the
            tokenizer, then numberized with the dictionary, before being
            added to the output buffer.
    """
    array_list = []
    offsets = [0]
    sizes = []
    print(corpora)
    for corpus_config in corpora:
        print(corpus_config)
        print(corpus_config.data_file)
        prepend_inds = []
        append_inds = []
        if append_eos:
            append_inds.append(corpus_config.dict.eos_index)
        if corpus_config.dialect_id is not None:
            if prepend_language_id:
                prepend_inds.append(corpus_config.dialect_id)
            else:
                append_inds.append(corpus_config.dialect_id)
        with open(corpus_config.data_file, "r") as f:
            for line in f:
                if already_numberized:
                    inds = line.strip().split()
                    inds = [int(ind) for ind in inds]
                else:
                    words = tokenizer.tokenize_line(line)
                    inds = [corpus_config.dict.index(w) for w in words]
                if reverse_order:
                    inds.reverse()
                inds = prepend_inds + inds + append_inds
                for _ in range(corpus_config.oversampling):
                    array_list.append(np.array(inds, dtype=np.int32))
                    offsets.append(offsets[-1] + len(inds))
                    sizes.append(len(inds))

    self.buffer = np.concatenate(array_list)
    self.offsets = np.array(offsets, dtype=np.int64)
    self.sizes = np.array(sizes, dtype=np.int32)
    del array_list
    del offsets
    del sizes
def parse_multilingual(self, corpora, reverse_order=False, append_eos=False,
                       prepend_language_id=True):
    """Add sentences from text files to the dataset.

    This method reads pairs of text files containing source and target
    sides of a bitext. Sentences are converted to integer sequences by
    tokenization and dictionary look-up. Note that this method removes all
    sentences which have been previously added to the data set.

    Example (single sentence):
        token_sequence = [123, 234, 345]
        dict.eos_idx = 2
        dialect_id = 10
    Result:
        reverse_order=False, append_eos=True, prepend_language_id=True:
            [10, 123, 234, 345, 2]
        reverse_order=False, append_eos=True, prepend_language_id=False:
            [123, 234, 345, 2, 10]
        reverse_order=True, append_eos=True, prepend_language_id=True:
            [10, 345, 234, 123, 2]
        reverse_order=True, append_eos=True, prepend_language_id=False:
            [345, 234, 123, 2, 10]

    Args:
        corpora: List of MultilingualCorpusConfig. If dialect_id is not
            None, it is added to the token sequence.
        reverse_order (bool): Whether to reverse the integer token sequence.
        append_eos (bool): Whether to add the end-of-sentence symbol to each
            sentence.
        prepend_language_id (bool): Only used if dialect_id is not None. If
            true, add the ID at the beginning of the token sequence;
            otherwise, add it at the end of the token sequence.
    """
    array_list = []
    offsets = [0]
    sizes = []
    for corpus_config in corpora:
        prepend_inds = []
        append_inds = []
        if append_eos:
            append_inds.append(corpus_config.dict.eos_index)
        if corpus_config.dialect_id is not None:
            if prepend_language_id:
                prepend_inds.append(corpus_config.dialect_id)
            else:
                append_inds.append(corpus_config.dialect_id)
        with open(corpus_config.data_file, "r") as f:
            for line in f:
                words = tokenizer.tokenize_line(line)
                if reverse_order:
                    words.reverse()
                inds = (prepend_inds +
                        [corpus_config.dict.index(w) for w in words] +
                        append_inds)
                for _ in range(corpus_config.oversampling):
                    array_list.append(np.array(inds, dtype=np.int32))
                    offsets.append(offsets[-1] + len(inds))
                    sizes.append(len(inds))

    # +1 for Lua compatibility
    self.buffer = np.concatenate(array_list) + 1
    self.offsets = np.array(offsets, dtype=np.int32)
    self.sizes = np.array(sizes, dtype=np.int32)
    del array_list
    del offsets
    del sizes
def main(args):
    import_user_module(args)

    print(args)
    os.makedirs(args.destdir, exist_ok=True)
    target = not args.only_source

    task = tasks.get_task(args.task)

    def train_path(lang):
        return "{}{}".format(args.trainpref, ("." + lang) if lang else "")

    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += ".{lang}".format(lang=lang)
        return fname

    def dest_path(prefix, lang):
        return os.path.join(args.destdir, file_name(prefix, lang))

    def dict_path(lang):
        return dest_path("dict", lang) + ".txt"

    def build_dictionary(filenames, src=False, tgt=False):
        assert src ^ tgt
        return task.build_dictionary(
            filenames,
            workers=args.workers,
            threshold=args.thresholdsrc if src else args.thresholdtgt,
            nwords=args.nwordssrc if src else args.nwordstgt,
            padding_factor=args.padding_factor,
        )

    if not args.srcdict and os.path.exists(dict_path(args.source_lang)):
        raise FileExistsError(dict_path(args.source_lang))
    if target and not args.tgtdict and os.path.exists(dict_path(args.target_lang)):
        raise FileExistsError(dict_path(args.target_lang))

    if args.copy_ext_dict:
        assert args.joined_dictionary, \
            "--joined-dictionary must be set if --copy-extended-dictionary is specified"
        assert args.workers == 1, \
            "--workers must be set to 1 if --copy-extended-dictionary is specified"

    if args.joined_dictionary:
        assert not args.srcdict or not args.tgtdict, \
            "cannot use both --srcdict and --tgtdict with --joined-dictionary"
        if args.srcdict:
            src_dict = task.load_dictionary(args.srcdict)
        elif args.tgtdict:
            src_dict = task.load_dictionary(args.tgtdict)
        else:
            assert args.trainpref, "--trainpref must be set if --srcdict is not specified"
            src_dict = build_dictionary(
                {train_path(lang) for lang in [args.source_lang, args.target_lang]}, src=True
            )
        tgt_dict = src_dict
    else:
        if args.srcdict:
            src_dict = task.load_dictionary(args.srcdict)
        else:
            assert args.trainpref, "--trainpref must be set if --srcdict is not specified"
            src_dict = build_dictionary([train_path(args.source_lang)], src=True)

        if target:
            if args.tgtdict:
                tgt_dict = task.load_dictionary(args.tgtdict)
            else:
                assert args.trainpref, "--trainpref must be set if --tgtdict is not specified"
                tgt_dict = build_dictionary([train_path(args.target_lang)], tgt=True)
        else:
            tgt_dict = None

    src_dict.save(dict_path(args.source_lang))
    if target and tgt_dict is not None:
        tgt_dict.save(dict_path(args.target_lang))

    def make_binary_dataset(vocab, input_prefix, output_prefix, lang, num_workers, copy_src_words=None):
        print("| [{}] Dictionary: {} types".format(lang, len(vocab) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()
        copyied = Counter()

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            copyied.update(worker_result["copied"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        input_file = "{}{}".format(
            input_prefix, ("." + lang) if lang is not None else ""
        )
        offsets = Binarizer.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:  # todo: does not support copy
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_prefix, worker_id)
                pool.apply_async(
                    binarize,
                    (
                        args,
                        input_file,
                        vocab,
                        prefix,
                        lang,
                        offsets[worker_id],
                        offsets[worker_id + 1],
                    ),
                    callback=merge_result,
                )
            pool.close()

        ds = indexed_dataset.IndexedDatasetBuilder(
            dataset_dest_file(args, output_prefix, lang, "bin")
        )
        words_list = []

        def binarize_consumer(ids, words):
            ds.add_item(ids)
            words_list.append(words)

        merge_result(
            Binarizer.binarize(
                input_file, vocab, binarize_consumer,
                offset=0, end=offsets[1],
                copy_ext_dict=args.copy_ext_dict, copy_src_words=copy_src_words,
            )
        )
        if num_workers > 1:
            pool.join()
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_prefix, worker_id)
                temp_file_path = dataset_dest_prefix(args, prefix, lang)
                ds.merge_file_(temp_file_path)
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))

        ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))

        print(
            "| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}, {:.3}% <unk> copied from src".format(
                lang,
                input_file,
                n_seq_tok[0],
                n_seq_tok[1],
                100 * sum(replaced.values()) / n_seq_tok[1],
                vocab.unk_word,
                100 * sum(copyied.values()) / n_seq_tok[1],
            )
        )

        return words_list

    def make_dataset(vocab, input_prefix, output_prefix, lang, num_workers=1, copy_src_words=None):
        if args.output_format == "binary":
            return make_binary_dataset(vocab, input_prefix, output_prefix, lang, num_workers, copy_src_words)
        elif args.output_format == "raw":
            # Copy original text file to destination folder
            output_text_file = dest_path(
                output_prefix + ".{}-{}".format(args.source_lang, args.target_lang),
                lang,
            )
            shutil.copyfile(file_name(input_prefix, lang), output_text_file)
            return None

    def make_all(lang, vocab, source_words_list_dict=defaultdict(lambda: None)):
        words_list_dict = defaultdict(lambda: None)

        if args.trainpref:
            words_list_dict["train"] = \
                make_dataset(vocab, args.trainpref, "train", lang,
                             num_workers=args.workers,
                             copy_src_words=source_words_list_dict['train'])
        if args.validpref:
            for k, validpref in enumerate(args.validpref.split(",")):
                outprefix = "valid{}".format(k) if k > 0 else "valid"
                words_list_dict["valid"] = \
                    make_dataset(vocab, validpref, outprefix, lang,
                                 copy_src_words=source_words_list_dict['valid'])
        if args.testpref:
            for k, testpref in enumerate(args.testpref.split(",")):
                outprefix = "test{}".format(k) if k > 0 else "test"
                words_list_dict["test"] = \
                    make_dataset(vocab, testpref, outprefix, lang,
                                 copy_src_words=source_words_list_dict['test'])

        return words_list_dict

    source_words_list_dict = make_all(args.source_lang, src_dict)
    if target:
        target_words_list_dict = make_all(args.target_lang, tgt_dict, source_words_list_dict)

    print("| Wrote preprocessed data to {}".format(args.destdir))

    if False:  # args.alignfile:
        assert args.trainpref, "--trainpref must be set if --alignfile is specified"
        src_file_name = train_path(args.source_lang)
        tgt_file_name = train_path(args.target_lang)
        freq_map = {}
        with open(args.alignfile, "r", encoding='utf-8') as align_file:
            with open(src_file_name, "r", encoding='utf-8') as src_file:
                with open(tgt_file_name, "r", encoding='utf-8') as tgt_file:
                    for a, s, t in zip_longest(align_file, src_file, tgt_file):
                        si = src_dict.encode_line(s, add_if_not_exist=False)
                        ti = tgt_dict.encode_line(t, add_if_not_exist=False)
                        ai = list(map(lambda x: tuple(x.split("-")), a.split()))
                        for sai, tai in ai:
                            srcidx = si[int(sai)]
                            tgtidx = ti[int(tai)]
                            if srcidx != src_dict.unk() and tgtidx != tgt_dict.unk():
                                assert srcidx != src_dict.pad()
                                assert srcidx != src_dict.eos()
                                assert tgtidx != tgt_dict.pad()
                                assert tgtidx != tgt_dict.eos()

                                if srcidx not in freq_map:
                                    freq_map[srcidx] = {}
                                if tgtidx not in freq_map[srcidx]:
                                    freq_map[srcidx][tgtidx] = 1
                                else:
                                    freq_map[srcidx][tgtidx] += 1

        align_dict = {}
        for srcidx in freq_map.keys():
            align_dict[srcidx] = max(freq_map[srcidx], key=freq_map[srcidx].get)

        with open(
            os.path.join(
                args.destdir,
                "alignment.{}-{}.txt".format(args.source_lang, args.target_lang),
            ),
            "w", encoding='utf-8'
        ) as f:
            for k, v in align_dict.items():
                print("{} {}".format(src_dict[k], tgt_dict[v]), file=f)

    if args.alignfile:
        from fairseq.tokenizer import tokenize_line
        import numpy as np

        assert args.trainpref, "--trainpref must be set if --alignfile is specified"
        src_file_name = train_path(args.source_lang)
        tgt_file_name = train_path(args.target_lang)
        src_labels_list = []
        tgt_labels_list = []
        with open(args.alignfile, "r", encoding='utf-8') as align_file:
            with open(src_file_name, "r", encoding='utf-8') as src_file:
                with open(tgt_file_name, "r", encoding='utf-8') as tgt_file:
                    for a, s, t in zip_longest(align_file, src_file, tgt_file):
                        src_words = tokenize_line(s)
                        tgt_words = tokenize_line(t)
                        ai = list(map(lambda x: tuple(x.split("-")), a.split()))
                        src_labels = np.ones(len(src_words), int)
                        tgt_labels = np.ones(len(tgt_words), int)
                        for sai, tai in ai:
                            if int(tai) >= len(tgt_words):
                                print('Bad case:')
                                print(tgt_words)
                                print(ai)
                                continue
                            src_word = src_words[int(sai)]
                            tgt_word = tgt_words[int(tai)]
                            if src_word == tgt_word:
                                src_labels[int(sai)] = 0
                                tgt_labels[int(tai)] = 0
                        src_labels_list.append(src_labels)
                        tgt_labels_list.append(tgt_labels)

        save_label_file(os.path.join(args.destdir, "train.label.{}.txt".format(args.source_lang)),
                        src_labels_list)
        save_label_file(os.path.join(args.destdir, "train.label.{}.txt".format(args.target_lang)),
                        tgt_labels_list)
def process_srl_row_simple(self, srl_row, word_dct: Dictionary):
    qa_pair = srl_row["qa_pair"]

    def get_padded_toks_and_lens(inp, max_l):
        return pad_words(
            word_list=word_dct.encode_line(
                inp, add_if_not_exist=False, append_eos=False
            ).tolist(),
            max_len=max_l,
            append_eos=True,
            eos_index=word_dct.eos_index,
            pad_index=word_dct.pad_index,
        )

    max_l = 20
    if self.split_type == "train":
        qarg_lemma_out_lst = qa_pair["qarg_lemma"]
        qarg_x = qa_pair["question_type"]
        mapping = {
            "<Q-V>": ["ARG0", "ARG1", "ARG2", "ARGM-LOC"],
            "<Q-ARG0>": ["V", "ARG1", "ARG2", "ARGM-LOC"],
            "<Q-ARG1>": ["V", "ARG0", "ARG2", "ARGM-LOC"],
            "<Q-ARG2>": ["V", "ARG0", "ARG1", "ARGM-LOC"],
            "<Q-ARGM-LOC>": ["V", "ARG0", "ARG1", "ARG2"],
        }
        qphrase_lst = []
        qphrase_ix_lst = []
        for qarg_lemm1_ix, qarg_lemm1 in enumerate(qarg_lemma_out_lst):
            if qarg_x == qarg_lemm1[0]:
                qphrase_lst.append(qarg_lemm1[3])
                qphrase_ix_lst.append(qarg_lemm1_ix)
            else:
                if (
                    qarg_lemm1[0] in mapping[qarg_x]
                    # and len(qphrase_lst) < 3
                ):
                    qphrase_lst.append(qarg_lemm1[3])
                    qphrase_ix_lst.append(qarg_lemm1_ix)
        if len(qphrase_lst) < 3:
            question = qa_pair["question"]
            qphrase_ix_lst = [ix for ix in range(len(qarg_lemma_out_lst))]
        else:
            question = " ".join(qphrase_lst)
            qphrase_ix_lst = qphrase_ix_lst
    else:
        question = qa_pair["question"]
        qphrase_ix_lst = qa_pair["qphrase_ix_lst"]

    question_toks, question_tok_lens = get_padded_toks_and_lens(
        question, max_l=max_l
    )
    answer_toks, answer_tok_lens = get_padded_toks_and_lens(
        qa_pair["answer"], max_l=max_l
    )
    question_type = qa_pair["question_type"]
    out_dct = {
        "question_toks": torch.tensor(question_toks).long(),
        "question_tok_len": torch.tensor(question_tok_lens).long(),
        "question_type": torch.tensor(word_dct.indices[question_type]).long(),
        "answer_toks": torch.tensor(answer_toks).long(),
        "answer_tok_lens": torch.tensor(answer_tok_lens).long(),
    }

    if self.cfg.mdl.use_phr_clf:
        ans = qa_pair["answer"]
        if ans in self.comm.awvoc.indices:
            answer_tok1 = self.comm.awvoc.indices[ans]
        else:
            answer_tok1 = self.comm.awvoc.unk_index
        aeos_ind = self.comm.awvoc.eos_index
        out_dct["answer_clf"] = torch.tensor([answer_tok1, aeos_ind]).long()
        out_dct["answer_clf_lens"] = torch.tensor(2).long()

    if self.cfg.mdl.use_srl_bounds:
        # only for VOGNET-QAP
        question_srl_bounds = []
        cur_ix = 0
        num_srls_max = 5
        num_box_per_srl = 4
        if self.cfg.ds_name == "anet":
            vid_seg_gt_box = srl_row["gt_bboxes"]
            vid_seg_gt_frms = srl_row["gt_frms"]
        gt_bbox_lst = []
        req_cls_pats_mask = srl_row["req_cls_pats_mask"]
        to_break = False
        for qarg_le1_ix in qphrase_ix_lst:
            qarg_lemma1 = qa_pair["qarg_lemma"][qarg_le1_ix]
            tok_str = qarg_lemma1[3]
            assert isinstance(tok_str, str)
            tok_out = tokenize_line(tok_str)
            tok_len = len(tok_out)
            en_ix = cur_ix + tok_len - 1
            is_groundable = qarg_lemma1[2]
            if is_groundable and self.cfg.ds_name == "anet":
                gt_info = req_cls_pats_mask[qarg_le1_ix]
                assert gt_info[1] == 1
                gbox_frm = []
                for z in gt_info[2]:
                    gbx = copy.deepcopy(vid_seg_gt_box[z])
                    gfrm = copy.deepcopy(vid_seg_gt_frms[z])
                    gbx.append(gfrm)
                    gbox_frm += [gbx]
            else:
                gbox_frm = [[0] * 5] * num_box_per_srl
            if len(gbox_frm) < num_box_per_srl:
                gbox_frm += [[0] * 5] * (num_box_per_srl - len(gbox_frm))
            else:
                gbox_frm = gbox_frm[:num_box_per_srl]
            gt_bbox_lst.append(gbox_frm)

            if en_ix < max_l - 1:
                question_srl_bounds.append([cur_ix, en_ix])
            else:
                question_srl_bounds.append([cur_ix, max_l - 1])
                to_break = True
            if to_break:
                break
            cur_ix += tok_len

        num_srls_used = min(len(question_srl_bounds), num_srls_max)
        if len(question_srl_bounds) > num_srls_max:
            question_srl_bounds = question_srl_bounds[:num_srls_max]
            gt_bbox_lst = gt_bbox_lst[:num_srls_max]
        else:
            to_add_srls = num_srls_max - len(question_srl_bounds)
            question_srl_bounds += [[0, 0]] * to_add_srls
            gt_bbox_lst += [[[0] * 5] * num_box_per_srl] * to_add_srls
        assert len(question_srl_bounds) == num_srls_max

        out_dct["question_srl_bounds_idxs"] = torch.tensor(
            question_srl_bounds
        ).long()
        out_dct["num_srls_used_msk"] = torch.tensor(
            [1] * num_srls_used + [0] * (num_srls_max - num_srls_used)
        ).long()
        out_dct["num_srls_used"] = torch.tensor(num_srls_used).long()
        out_dct["gt_bbox_for_srls"] = torch.tensor(gt_bbox_lst).float()
        out_dct["gt_bbox_for_srls_msk"] = (
            out_dct["gt_bbox_for_srls"].sum(dim=-1).ne(0)
        )

    return out_dct
def main():
    args = argument_parsing()

    sentences = read_sentences(args.in_tokenized_sentences)
    split_sentences = []
    for sentence in sentences:
        split_sentences.append(tokenize_line(sentence))
    print(len(split_sentences))

    bad_unicode = open(args.output_file, 'w')

    def load_roberta(name=None, roberta_cache_path=None):
        if not roberta_cache_path:
            roberta = torch.hub.load('pytorch/fairseq', name)
        else:
            roberta = RobertaModel.from_pretrained(roberta_cache_path, checkpoint_file='model.pt')
        roberta.eval()
        if torch.cuda.is_available():
            roberta.cuda()
        return roberta

    def get_wordpiece_to_word_map(sentence, roberta_bpe, raise_error):
        # Get word and wordpiece tokens according to RoBERTa
        # sentence = sentence.replace(u'\x91', u' ')
        # sentence = sentence.replace(u'\x96', u' ')
        word_tokens = sentence.split()
        wordpiece_tokens = [
            roberta_bpe.decode(wordpiece)
            for wordpiece in roberta_bpe.encode(sentence).split()
        ]
        # print("wp_tokens: ", wordpiece_tokens)

        assert len(word_tokens) <= len(wordpiece_tokens)
        assert isinstance(word_tokens, list)
        assert isinstance(wordpiece_tokens, list)

        w_index = 0
        word_to_wordpiece = []
        subword_sequence = []
        bad_unicode_flag = 0
        for wp_index in range(len(wordpiece_tokens)):
            if w_index in range(len(word_tokens)):
                word = word_tokens[w_index]
                if word == wordpiece_tokens[wp_index]:
                    word_to_wordpiece.append(wp_index)
                    w_index += 1
                else:
                    subword_sequence.append(wp_index)
                    word_from_pieces = "".join([
                        # NOTE: Facebook's BPE signals SOW with whitespace
                        wordpiece_tokens[i].lstrip()
                        for i in subword_sequence
                    ])
                    if word == word_from_pieces:
                        word_to_wordpiece.append(subword_sequence)
                        w_index += 1
                        subword_sequence = []
                    elif word_from_pieces not in word:
                        word_to_wordpiece.append(subword_sequence)
                        w_index += 1
                        subword_sequence = []
                        bad_unicode_flag = 1

        if bad_unicode_flag == 1:
            bad_unicode.write(sentence)
            wp = " ".join(wordpiece_tokens)
            print("\n\nsentence: ", sentence)
            print("wp: ", wp)
            print("\n")
            bad_unicode.write("\n")
            bad_unicode.write(wp)
            bad_unicode.write("\n\n")
            if raise_error:
                raise Exception('Unicode splitting failed')

        return word_to_wordpiece

    def check_wordpiece_to_word_map(input_file, raise_error):
        num_sents = 0
        with open(input_file, 'r') as fid:
            for sentence in tqdm(fid):
                if not sentence:
                    break
                sentence = " ".join(tokenize_line(str(sentence.rstrip())))
                # print("input: ", sentence)
                word2piece = get_wordpiece_to_word_map(sentence, roberta.bpe, raise_error)

    roberta = load_roberta(name=args.pretrained_embed)
    check_wordpiece_to_word_map(args.in_tokenized_sentences, args.raise_error)