def test_finalize(self):
    txt = [
        'A B C D',
        'B C D',
        'C D',
        'D',
    ]
    ref_ids1 = list(map(torch.IntTensor, [
        [4, 5, 6, 7, 2],
        [5, 6, 7, 2],
        [6, 7, 2],
        [7, 2],
    ]))
    ref_ids2 = list(map(torch.IntTensor, [
        [7, 6, 5, 4, 2],
        [6, 5, 4, 2],
        [5, 4, 2],
        [4, 2],
    ]))

    # build dictionary
    d = Dictionary()
    for line in txt:
        Tokenizer.tokenize(line, d, add_if_not_exist=True)

    def get_ids(dictionary):
        ids = []
        for line in txt:
            ids.append(Tokenizer.tokenize(line, dictionary, add_if_not_exist=False))
        return ids

    def assertMatch(ids, ref_ids):
        for toks, ref_toks in zip(ids, ref_ids):
            self.assertEqual(toks.size(), ref_toks.size())
            self.assertEqual(0, (toks != ref_toks).sum().item())

    ids = get_ids(d)
    assertMatch(ids, ref_ids1)

    # check finalized dictionary
    d.finalize()
    finalized_ids = get_ids(d)
    assertMatch(finalized_ids, ref_ids2)

    # write to disk and reload
    with tempfile.NamedTemporaryFile(mode='w') as tmp_dict:
        d.save(tmp_dict.name)
        d = Dictionary.load(tmp_dict.name)
        reload_ids = get_ids(d)
        assertMatch(reload_ids, ref_ids2)
        assertMatch(finalized_ids, reload_ids)
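The test above exercises a property of the legacy fairseq dictionary: during building, ids are handed out in insertion order, while finalize() re-sorts symbols by frequency, so the same line maps to different ids before and after finalization. A minimal sketch of that round trip, assuming the old Tokenizer/Dictionary API used throughout these examples (import paths vary between fairseq releases):

from fairseq.data.dictionary import Dictionary   # import paths differ across fairseq versions
from fairseq.tokenizer import Tokenizer          # legacy API; later releases moved binarization to Binarizer

lines = ['A B C D', 'B C D', 'C D', 'D']

# Building: unseen words get the next free id in insertion order.
d = Dictionary()
for line in lines:
    Tokenizer.tokenize(line, d, add_if_not_exist=True)
before = Tokenizer.tokenize('A B C D', d, add_if_not_exist=False)

# finalize() re-sorts symbols by frequency (and can apply threshold/nwords),
# so the same line maps to different ids afterwards.
d.finalize()
after = Tokenizer.tokenize('A B C D', d, add_if_not_exist=False)
print(before.tolist(), after.tolist())   # as in the test above: [4, 5, 6, 7, 2] vs. [7, 6, 5, 4, 2]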
def load_dataset2(self, split, **kwargs):
    data_path = self.args.data[0]
    prefix = os.path.join(
        data_path,
        split + "." + self.args.source_lang + "-" + self.args.target_lang)

    src_sentences, src_lengths = [], []
    with open(prefix + "." + self.args.source_lang, encoding='utf-8') as file:
        for line in file:
            sentence = line.strip()
            tokens = Tokenizer.tokenize(sentence, self.src_dict, add_if_not_exist=False)
            src_sentences.append(tokens)
            src_lengths.append(tokens.numel())

    tgt_sentences, tgt_lengths = [], []
    with open(prefix + "." + self.args.target_lang, encoding='utf-8') as file:
        for line in file:
            sentence = line.strip()
            tokens = Tokenizer.tokenize(sentence, self.tgt_dict, add_if_not_exist=False)
            tgt_sentences.append(tokens)
            tgt_lengths.append(tokens.numel())

    hters = []
    with open(prefix + "." + self.args.hter_lang, encoding='utf-8') as file:
        for line in file:
            hter = line.strip()
            hters.append(torch.FloatTensor([float(hter)]))

    self.datasets[split] = LanguagePairHTERDataset(
        src_sentences, src_lengths, self.src_dict,
        tgt_sentences, tgt_lengths, self.tgt_dict,
        left_pad_source=self.args.left_pad_source,
        left_pad_target=self.args.left_pad_target,
        max_source_positions=self.args.max_source_positions,
        max_target_positions=self.args.max_target_positions,
        hter=hters,
        hter_sizes=torch.ones(len(hters)))
def read_data(self, path, dictionary, is_dst=False, src_oov_words_list=None):
    count = 0
    with open(path, 'r') as f:
        for line in f:
            self.lines.append(line.strip('\n'))
            src_oov_words = None
            if is_dst:
                src_oov_words = src_oov_words_list[count]
            tokens, tokens_extended, oov_words = Tokenizer.tokenize(
                line, dictionary, add_if_not_exist=False,
                is_dst=is_dst, src_oov_words=src_oov_words)
            # +1 for Lua compatibility
            tokens = tokens + 1
            tokens_extended = tokens_extended + 1
            self.tokens_list.append(tokens)
            self.tokens_list_extended_vocab.append(tokens_extended)
            self.oov_words_list.append(oov_words)
            # print(line, tokens, tokens_extended, oov_words)
            # exit(0)
            self.sizes.append(len(tokens))
            self.oov_sizes.append(len(oov_words))
            count += 1
            if count % 10000 == 0:
                print(count)
    self.sizes = np.array(self.sizes)
    self.oov_sizes = np.array(self.oov_sizes)
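This reader keeps two id streams per line, one over the fixed vocabulary and one over a vocabulary extended with the line's OOV words, which is the usual setup for copy/pointer mechanisms. Below is a purely illustrative sketch of that extended-vocabulary idea using a hypothetical tokenize_extended helper; the is_dst/src_oov_words arguments to Tokenizer.tokenize above are specific to that project:

# Illustrative sketch only: a hypothetical extended-vocabulary mapping in the spirit of the
# copy-mechanism reader above.
def tokenize_extended(words, vocab, unk_id, src_oov_words):
    base_size = len(vocab)
    tokens, tokens_extended, oov_words = [], [], list(src_oov_words)
    for w in words:
        if w in vocab:
            tokens.append(vocab[w])
            tokens_extended.append(vocab[w])
        else:
            tokens.append(unk_id)  # the normal id stream still sees <unk>
            if w not in oov_words:
                oov_words.append(w)
            # the extended stream addresses a temporary slot just past the fixed vocabulary
            tokens_extended.append(base_size + oov_words.index(w))
    return tokens, tokens_extended, oov_words

vocab = {'the': 0, 'cat': 1, '<unk>': 2}
print(tokenize_extended('the cat sat'.split(), vocab, unk_id=2, src_oov_words=[]))
# -> ([0, 1, 2], [0, 1, 3], ['sat'])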
def get_ids(dictionary):
    ids = []
    for line in txt:
        ids.append(Tokenizer.tokenize(line, dictionary, add_if_not_exist=False))
    return ids
def read_data(self, path, dictionary):
    with open(path, 'r') as f:
        for line in f:
            self.lines.append(line.strip('\n'))
            # +1 for Lua compatibility
            tokens = Tokenizer.tokenize(line, dictionary, add_if_not_exist=False) + 1
            self.tokens_list.append(tokens)
            self.sizes.append(len(tokens))
    self.sizes = np.array(self.sizes)
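The + 1 shift exists because the original Lua/Torch fairseq used 1-based indexing, so the 0-based ids produced by the Python dictionary have to be offset before being handed to the Lua model. The same arithmetic in isolation, with made-up ids:

import torch

tokens = torch.IntTensor([4, 5, 6, 2])   # 0-based ids from the Python-side dictionary
print((tokens + 1).tolist())             # [5, 6, 7, 3] -> 1-based ids expected by the Lua model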
def get_id(self, string):
    tokens = Tokenizer.tokenize(
        string, self.task.dictionary,
        add_if_not_exist=False, append_eos=False).long()
    indexed_string = tokens.numpy()
    if self.map_indices is not None:
        # map indices to subset of the vocabulary
        indexed_string = self.convert_ids(indexed_string)
    return indexed_string
def read_data(self, path, dictionary):
    with open(path, 'r') as f:
        for line in f:
            self.lines.append(line.strip('\n'))
            tokens = Tokenizer.tokenize(
                line, dictionary, add_if_not_exist=False,
                append_eos=self.append_eos,
                reverse_order=self.reverse_order,
            ).long()
            self.tokens_list.append(tokens)
            self.sizes.append(len(tokens))
    self.sizes = np.array(self.sizes)
def load_dataset(self, split, **kwargs):
    """Load a given dataset split (e.g., train, valid, test)."""
    prefix = os.path.join(self.args.data, '{}.input-label'.format(split))

    # Read input sentences.
    sentences, lengths = [], []
    with open(prefix + '.input', encoding='utf-8') as file:
        for line in file:
            sentence = line.strip()
            # Tokenize the sentence, splitting on spaces
            tokens = Tokenizer.tokenize(
                sentence, self.input_vocab, add_if_not_exist=False,
            )
            sentences.append(tokens)
            lengths.append(tokens.numel())

    # Read labels.
    labels = []
    with open(prefix + '.label', encoding='utf-8') as file:
        for line in file:
            label = line.strip()
            labels.append(
                # Convert label to a numeric ID.
                torch.LongTensor([self.label_vocab.add_symbol(label)])
            )

    assert len(sentences) == len(labels)
    print('| {} {} {} examples'.format(self.args.data, split, len(sentences)))

    # We reuse LanguagePairDataset since classification can be modeled as a
    # sequence-to-sequence task where the target sequence has length 1.
    self.datasets[split] = LanguagePairDataset(
        src=sentences,
        src_sizes=lengths,
        src_dict=self.input_vocab,
        tgt=labels,
        tgt_sizes=torch.ones(len(labels)),  # targets have length 1
        tgt_dict=self.label_vocab,
        left_pad_source=False,
        max_source_positions=self.args.max_positions,
        max_target_positions=1,
        # Since our target is a single class label, there's no need for
        # input feeding. If we set this to ``True`` then our Model's
        # ``forward()`` method would receive an additional argument called
        # *prev_output_tokens* that would contain a shifted version of the
        # target sequence.
        input_feeding=False,
    )
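This loader expects each split to live in two plain-text files under args.data, named <split>.input-label.input and <split>.input-label.label, with one example per line. A small sketch of that on-disk layout, using a made-up toy-data directory and toy sentences:

import os

data_dir = 'toy-data'  # stands in for args.data; path and contents are made up
os.makedirs(data_dir, exist_ok=True)

# one sentence per line in *.input, the corresponding class name on the same line of *.label
with open(os.path.join(data_dir, 'train.input-label.input'), 'w', encoding='utf-8') as f:
    f.write('this movie was great\nterrible waste of time\n')
with open(os.path.join(data_dir, 'train.input-label.label'), 'w', encoding='utf-8') as f:
    f.write('positive\nnegative\n')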
def __create_sample(self, line):
    tokens = Tokenizer.tokenize(
        line, self.task.dictionary,
        tokenize=default_tokenizer, add_if_not_exist=False).long()
    sample = {}
    # target is the sentence, for source, rotate item one token to the left (would start with eos)
    tokens_list = tokens.numpy()
    tokens_list = np.insert(tokens_list, 0, self.task.dictionary.eos())
    tokens = torch.from_numpy(tokens_list)
    sample["src_tokens"] = tokens.unsqueeze(0)  # add a dimension for the batch
    sample["src_lengths"] = tokens.size()[0]
    sample['target'] = None  # this will disable the efficient softmax approximation
    return sample
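The sample is built for language-model scoring by prepending the dictionary's eos id, so the source sequence is the sentence shifted one position to the right and starts with </s>. The same manipulation in isolation, with made-up ids:

import numpy as np
import torch

eos = 2                                              # made-up eos id
sentence_ids = np.array([15, 27, 9, 2])              # made-up ids for "w1 w2 w3 </s>"
src_ids = np.insert(sentence_ids, 0, eos)            # [2, 15, 27, 9, 2] -> source now starts with </s>
src_tokens = torch.from_numpy(src_ids).unsqueeze(0)  # add a batch dimension
print(src_tokens.shape, src_tokens.tolist())         # torch.Size([1, 5]) [[2, 15, 27, 9, 2]]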
def read_data(self, path, dictionary):
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            src_tokens = None
            if self.src_tokens_list is not None:
                src_tokens = self.src_tokens_list[len(self.lines)]
            self.lines.append(line.strip('\n'))
            tokens, words = Tokenizer.tokenize(
                line, dictionary, add_if_not_exist=False,
                append_eos=self.append_eos,
                reverse_order=self.reverse_order,
                src_tokens=src_tokens)
            tokens = tokens.long()
            self.tokens_list.append(tokens)
            self.words_list.append(words)
            self.sizes.append(len(tokens))
    self.sizes = np.array(self.sizes)
def read_data(self, path, dictionary, debug=False):
    with open(path, 'r') as f:
        for i, line in enumerate(f):
            if debug and i >= DEBUG_MODE_LINE_COUNT:
                break
            self.lines.append(line.strip('\n'))
            tokens = Tokenizer.tokenize(
                line, dictionary, add_if_not_exist=False,
                append_eos=self.append_eos,
                reverse_order=self.reverse_order,
            ).long()
            self.tokens_list.append(tokens)
            self.sizes.append(len(tokens))
    print('{} has lines {}, min length {} max length {}'.format(
        path, len(self.sizes), min(self.sizes), max(self.sizes)))
    self.sizes = np.array(self.sizes)
def __create_sample_batch(self, line_list):
    tokens_list = []
    for line in line_list:
        tokens = Tokenizer.tokenize(
            line, self.task.dictionary,
            tokenize=default_tokenizer, add_if_not_exist=False).tolist()
        tokens.insert(0, self.task.dictionary.eos())  # insert EOS at the beginning of the sentence
        tokens_list.append(tokens)
    src_lengths_list = [len(tokens) for tokens in tokens_list]
    token_tensor = torch.nn.utils.rnn.pad_sequence(
        [torch.LongTensor(t) for t in tokens_list],
        batch_first=True,
        padding_value=self.task.dictionary.eos())
    sample = {}
    sample["src_tokens"] = token_tensor
    sample["src_lengths"] = torch.LongTensor(src_lengths_list)
    sample['target'] = None  # this will disable the efficient softmax approximation
    return sample
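The batched variant pads every sequence to the longest one in the batch with pad_sequence, reusing the eos id as the padding value. A toy demonstration of that padding behaviour:

import torch

eos = 2
batch = [[2, 15, 27, 9], [2, 8, 3]]                # two eos-prefixed id lists of unequal length
padded = torch.nn.utils.rnn.pad_sequence(
    [torch.LongTensor(t) for t in batch],
    batch_first=True,
    padding_value=eos)
print(padded.tolist())                             # [[2, 15, 27, 9], [2, 8, 3, 2]]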
def main(args): print(args) os.makedirs(args.destdir, exist_ok=True) target = not args.only_source def build_dictionary(filenames): if args.singleSeq: d = dictionary.Dictionary() else: d = dictionaryWCS.DictionaryWCS() for filename in filenames: tokenizer.Tokenizer.add_file_to_dictionary(filename, d, tokenize_line, args.L) return d def load_dictionary(filename): if args.singleSeq: d= dictionary.Dictionary.load(filename) else: d = dictionaryWCS.DictionaryWCS.load(filename) return d def train_path(lang): return '{}{}'.format(args.trainpref, ('.' + lang) if lang else '') def file_name(prefix, lang): fname = prefix if lang is not None: fname += f'.{lang}' return fname def dest_path(prefix, lang): return os.path.join(args.destdir, file_name(prefix, lang)) def dict_path(lang): return dest_path('dict', lang) + '.txt' def dataset_dest_path(output_prefix, lang, extension): base = f'{args.destdir}/{output_prefix}' lang_part = f'.{args.source_lang}-{args.target_lang}.{lang}' if lang is not None else '' return f'{base}{lang_part}.{extension}' if args.joined_dictionary: assert not args.srcdict, 'cannot combine --srcdict and --joined-dictionary' assert not args.tgtdict, 'cannot combine --tgtdict and --joined-dictionary' src_dict = build_dictionary(set([ train_path(lang) for lang in [args.source_lang, args.target_lang] ])) tgt_dict = src_dict else: if args.srcdict: src_dict = load_dictionary(args.srcdict) else: assert args.trainpref, "--trainpref must be set if --srcdict is not specified" src_dict = build_dictionary([train_path(args.source_lang)]) if target: if args.tgtdict: tgt_dict = load_dictionary(args.tgtdict) else: assert args.trainpref, "--trainpref must be set if --tgtdict is not specified" tgt_dict = build_dictionary([train_path(args.target_lang)]) if args.gcn: assert args.trainpref, "--trainpref must be set" labels_dict = build_dictionary([train_path(args.labels)]) node1_dict = build_dictionary([train_path(args.node1)]) node2_dict = build_dictionary([train_path(args.node2)]) src_dict.finalize( threshold=args.thresholdsrc, nwords=args.nwordssrc, padding_factor=args.padding_factor, ) src_dict.save(dict_path(args.source_lang)) if target: if not args.joined_dictionary: tgt_dict.finalize( threshold=args.thresholdtgt, nwords=args.nwordstgt, padding_factor=args.padding_factor, ) tgt_dict.save(dict_path(args.target_lang)) if args.gcn: labels_dict.finalize( threshold=args.thresholdsrc, padding_factor=args.padding_factor, ) labels_dict.save(dict_path(args.labels)) node1_dict.finalize( threshold=args.thresholdsrc, padding_factor=args.padding_factor, ) node1_dict.save(dict_path(args.node1)) node2_dict.finalize( threshold=args.thresholdsrc, padding_factor=args.padding_factor, ) node2_dict.save(dict_path(args.node2)) def make_binary_dataset(input_prefix, output_prefix, lang): #debugging, do only targets #if 'src' in lang: #if 'tgt' in lang: # print("skip src files...") # #print("skip tgt files...") # return dict = load_dictionary(dict_path(lang)) print('| [{}] Dictionary: {} types'.format(lang, len(dict) - 1)) ds = indexed_dataset.IndexedDatasetBuilder(dataset_dest_path(output_prefix, lang, 'bin')) annotator = None aconsumer = None if args.addAnnotations: brigramFile = None if os.path.isfile(args.addAnnotations + "_bigrams"): brigramFile = args.addAnnotations + "_bigrams" if args.addAnnotations.endswith(".mallet"): annotator = MalletLDATopicDistAnnotator(args.addAnnotations, args.numTopics, brigramFile) else: annotator = GensimLDATopicDistAnnotator(args.addAnnotations, args.numTopics, brigramFile) #generate 
embeddings for topic keywords annotator.generateKeywordEmbeddings(dataset_dest_path(output_prefix+"_keyEmbeddings", None, 'txt')) annotator.generateKeywordDict(dataset_dest_path(output_prefix+"_keyVocab", None, 'txt')) ads = indexed_dataset.IndexedDatasetBuilder( dataset_dest_path(output_prefix+"_keys", lang, 'bin'), dtype=np.double) def aconsumer(tensor): ads.add_item(tensor) awds = indexed_dataset.IndexedDatasetBuilder( dataset_dest_path(output_prefix+"_keywords", lang, 'bin')) def awconsumer(tensor): awds.add_item(tensor) def consumer(tensor): ds.add_item(tensor) input_file = '{}{}'.format(input_prefix, ('.' + lang) if lang is not None else '') if args.singleSeq: if '.src' in input_file: res = TokenizerWCSSingleSequence.binarize(input_file, dict, consumer, L=args.L, aconsumer=aconsumer, annotator=annotator) TokenizerWCSSingleSequence.printStats(res, lang, input_file, dict) elif '.tgt' in input_file: res = TokenizerWCSSingleSequence.binarize(input_file, dict, consumer, aconsumer=aconsumer, annotator=annotator) TokenizerWCSSingleSequence.printStats(res, lang, input_file, dict) else: res = TokenizerWCSSingleSequence.binarize(input_file, dict, consumer, append_eos=False, aconsumer=aconsumer, annotator=annotator) TokenizerWCSSingleSequence.printStats(res, lang, input_file, dict) else: if '.src' in input_file: res = TokenizerWCSParagraph.binarize(input_file, dict, consumer, max_chunk_length= args.src_chunk_length, L=args.L, aconsumer=aconsumer, awconsumer=awconsumer, annotator=annotator) TokenizerWCSParagraph.printStats(res, lang, input_file, dict) else: res = TokenizerWCSSentence.binarize(input_file, dict, consumer, max_chunk_length=args.tgt_chunk_length, aconsumer=aconsumer, annotator=annotator) TokenizerWCSSentence.printStats(res, lang, input_file, dict) ds.finalize(dataset_dest_path(output_prefix, lang, 'idx')) if args.addAnnotations: ads.finalize(dataset_dest_path(output_prefix+"_keys", lang, 'idx')) awds.finalize(dataset_dest_path(output_prefix+"_keywords", lang, 'idx')) def make_dataset(input_prefix, output_prefix, lang): if args.output_format == 'binary': make_binary_dataset(input_prefix, output_prefix, lang) elif args.output_format == 'raw': # Copy original text file to destination folder output_text_file = dest_path( output_prefix + '.{}-{}'.format(args.source_lang, args.target_lang), lang, ) shutil.copyfile(file_name(input_prefix, lang), output_text_file) def make_all(lang): if args.trainpref: make_dataset(args.trainpref, 'train', lang) if args.validpref: for k, validpref in enumerate(args.validpref.split(',')): outprefix = 'valid{}'.format(k) if k > 0 else 'valid' make_dataset(validpref, outprefix, lang) if args.testpref: for k, testpref in enumerate(args.testpref.split(',')): outprefix = 'test{}'.format(k) if k > 0 else 'test' make_dataset(testpref, outprefix, lang) make_all(args.source_lang) if target: make_all(args.target_lang) if args.gcn: make_all(args.labels) make_all(args.node1) make_all(args.node2) print('| Wrote preprocessed data to {}'.format(args.destdir)) if args.alignfile: assert args.trainpref, "--trainpref must be set if --alignfile is specified" src_file_name = train_path(args.source_lang) tgt_file_name = train_path(args.target_lang) src_dict = load_dictionary(dict_path(args.source_lang)) tgt_dict = load_dictionary(dict_path(args.target_lang)) freq_map = {} with open(args.alignfile, 'r') as align_file: with open(src_file_name, 'r') as src_file: with open(tgt_file_name, 'r') as tgt_file: for a, s, t in zip_longest(align_file, src_file, tgt_file): si = 
Tokenizer.tokenize(s, src_dict, add_if_not_exist=False) ti = Tokenizer.tokenize(t, tgt_dict, add_if_not_exist=False) ai = list(map(lambda x: tuple(x.split('-')), a.split())) for sai, tai in ai: srcidx = si[int(sai)] tgtidx = ti[int(tai)] if srcidx != src_dict.unk() and tgtidx != tgt_dict.unk(): assert srcidx != src_dict.pad() assert srcidx != src_dict.eos() assert tgtidx != tgt_dict.pad() assert tgtidx != tgt_dict.eos() if srcidx not in freq_map: freq_map[srcidx] = {} if tgtidx not in freq_map[srcidx]: freq_map[srcidx][tgtidx] = 1 else: freq_map[srcidx][tgtidx] += 1 align_dict = {} for srcidx in freq_map.keys(): align_dict[srcidx] = max(freq_map[srcidx], key=freq_map[srcidx].get) with open(os.path.join(args.destdir, 'alignment.{}-{}.txt'.format( args.source_lang, args.target_lang)), 'w') as f: for k, v in align_dict.items(): print('{} {}'.format(src_dict[k], tgt_dict[v]), file=f)
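The --alignfile branch that recurs in these preprocessing scripts counts, for every source word id, how often each target word id is aligned to it, and then keeps the most frequent one as that word's dictionary-translation entry. The counting logic in isolation, with made-up ids:

# Toy counts: for every source id, keep the target id it is most frequently aligned to.
freq_map = {}
pairs = [(11, 42), (11, 42), (11, 7), (5, 9)]      # (src_id, tgt_id) pairs read from an alignment file
for srcidx, tgtidx in pairs:
    freq_map.setdefault(srcidx, {})
    freq_map[srcidx][tgtidx] = freq_map[srcidx].get(tgtidx, 0) + 1

align_dict = {srcidx: max(counts, key=counts.get) for srcidx, counts in freq_map.items()}
print(align_dict)                                   # {11: 42, 5: 9}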
def main(args): print(args) os.makedirs(args.destdir, exist_ok=True) target = not args.only_source def build_dictionary(filenames): d = dictionary.Dictionary() for filename in filenames: Tokenizer.add_file_to_dictionary(filename, d, tokenize_line) return d def train_path(lang): return '{}{}'.format(args.trainpref, ('.' + lang) if lang else '') def file_name(prefix, lang): fname = prefix if lang is not None: fname += f'.{lang}' return fname def dest_path(prefix, lang): return os.path.join(args.destdir, file_name(prefix, lang)) def dict_path(lang): return dest_path('dict', lang) + '.txt' def dataset_dest_path(output_prefix, lang, extension): base = f'{args.destdir}/{output_prefix}' lang_part = f'.{args.source_lang}-{args.target_lang}.{lang}' if lang is not None else '' return f'{base}{lang_part}.{extension}' if args.joined_dictionary: assert not args.srcdict, 'cannot combine --srcdict and --joined-dictionary' assert not args.tgtdict, 'cannot combine --tgtdict and --joined-dictionary' src_dict = build_dictionary(set([ train_path(lang) for lang in [args.source_lang, args.target_lang] ])) tgt_dict = src_dict else: if args.srcdict: src_dict = dictionary.Dictionary.load(args.srcdict) else: assert args.trainpref, "--trainpref must be set if --srcdict is not specified" src_dict = build_dictionary([train_path(args.source_lang)]) if target: if args.tgtdict: tgt_dict = dictionary.Dictionary.load(args.tgtdict) else: assert args.trainpref, "--trainpref must be set if --tgtdict is not specified" tgt_dict = build_dictionary([train_path(args.target_lang)]) src_dict.finalize( threshold=args.thresholdsrc, nwords=args.nwordssrc, padding_factor=args.padding_factor, ) src_dict.save(dict_path(args.source_lang)) if target: if not args.joined_dictionary: tgt_dict.finalize( threshold=args.thresholdtgt, nwords=args.nwordstgt, padding_factor=args.padding_factor, ) tgt_dict.save(dict_path(args.target_lang)) def make_binary_dataset(input_prefix, output_prefix, lang): dict = dictionary.Dictionary.load(dict_path(lang)) print('| [{}] Dictionary: {} types'.format(lang, len(dict) - 1)) ds = indexed_dataset.IndexedDatasetBuilder(dataset_dest_path(output_prefix, lang, 'bin')) def consumer(tensor): ds.add_item(tensor) input_file = '{}{}'.format(input_prefix, ('.' 
+ lang) if lang is not None else '') res = Tokenizer.binarize(input_file, dict, consumer) print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format( lang, input_file, res['nseq'], res['ntok'], 100 * res['nunk'] / res['ntok'], dict.unk_word)) ds.finalize(dataset_dest_path(output_prefix, lang, 'idx')) def make_dataset(input_prefix, output_prefix, lang): if args.output_format == 'binary': make_binary_dataset(input_prefix, output_prefix, lang) elif args.output_format == 'raw': # Copy original text file to destination folder output_text_file = dest_path( output_prefix + '.{}-{}'.format(args.source_lang, args.target_lang), lang, ) shutil.copyfile(file_name(input_prefix, lang), output_text_file) def make_all(lang): if args.trainpref: make_dataset(args.trainpref, 'train', lang) if args.validpref: for k, validpref in enumerate(args.validpref.split(',')): outprefix = 'valid{}'.format(k) if k > 0 else 'valid' make_dataset(validpref, outprefix, lang) if args.testpref: for k, testpref in enumerate(args.testpref.split(',')): outprefix = 'test{}'.format(k) if k > 0 else 'test' make_dataset(testpref, outprefix, lang) make_all(args.source_lang) if target: make_all(args.target_lang) print('| Wrote preprocessed data to {}'.format(args.destdir)) if args.alignfile: assert args.trainpref, "--trainpref must be set if --alignfile is specified" src_file_name = train_path(args.source_lang) tgt_file_name = train_path(args.target_lang) src_dict = dictionary.Dictionary.load(dict_path(args.source_lang)) tgt_dict = dictionary.Dictionary.load(dict_path(args.target_lang)) freq_map = {} with open(args.alignfile, 'r') as align_file: with open(src_file_name, 'r') as src_file: with open(tgt_file_name, 'r') as tgt_file: for a, s, t in zip_longest(align_file, src_file, tgt_file): si = Tokenizer.tokenize(s, src_dict, add_if_not_exist=False) ti = Tokenizer.tokenize(t, tgt_dict, add_if_not_exist=False) ai = list(map(lambda x: tuple(x.split('-')), a.split())) for sai, tai in ai: srcidx = si[int(sai)] tgtidx = ti[int(tai)] if srcidx != src_dict.unk() and tgtidx != tgt_dict.unk(): assert srcidx != src_dict.pad() assert srcidx != src_dict.eos() assert tgtidx != tgt_dict.pad() assert tgtidx != tgt_dict.eos() if srcidx not in freq_map: freq_map[srcidx] = {} if tgtidx not in freq_map[srcidx]: freq_map[srcidx][tgtidx] = 1 else: freq_map[srcidx][tgtidx] += 1 align_dict = {} for srcidx in freq_map.keys(): align_dict[srcidx] = max(freq_map[srcidx], key=freq_map[srcidx].get) with open(os.path.join(args.destdir, 'alignment.{}-{}.txt'.format( args.source_lang, args.target_lang)), 'w') as f: for k, v in align_dict.items(): print('{} {}'.format(src_dict[k], tgt_dict[v]), file=f)
def main(args): import_user_module(args) print(args) os.makedirs(args.destdir, exist_ok=True) target = not args.only_source task = tasks.get_task(args.task) def train_path(lang): return [ "{}{}".format(trainpref, ("." + lang) if lang else "") for _, trainpref in enumerate(args.trainpref.split(",")) ] def file_name(prefix, lang): fname = prefix if lang is not None: fname += ".{lang}".format(lang=lang) return fname def dest_path(prefix, lang): return os.path.join(args.destdir, file_name(prefix, lang)) def dict_path(lang): return dest_path("dict", lang) + ".txt" def build_dictionary(filenames, src=False, tgt=False): assert src ^ tgt return task.build_dictionary( filenames, workers=args.workers, threshold=args.thresholdsrc if src else args.thresholdtgt, nwords=args.nwordssrc if src else args.nwordstgt, padding_factor=args.padding_factor, ) if args.joined_dictionary: assert ( not args.srcdict or not args.tgtdict ), "cannot use both --srcdict and --tgtdict with --joined-dictionary" if args.srcdict: src_dict = task.load_dictionary(args.srcdict) elif args.tgtdict: src_dict = task.load_dictionary(args.tgtdict) else: assert (args.trainpref ), "--trainpref must be set if --srcdict is not specified" src_dict = build_dictionary( { train_path(lang) for lang in [args.source_lang, args.target_lang] }, src=True) tgt_dict = src_dict else: if args.srcdict: src_dict = task.load_dictionary(args.srcdict) else: assert (args.trainpref ), "--trainpref must be set if --srcdict is not specified" src_dict = build_dictionary(train_path(args.source_lang), src=True) if target: if args.tgtdict: tgt_dict = task.load_dictionary(args.tgtdict) else: assert ( args.trainpref ), "--trainpref must be set if --tgtdict is not specified" tgt_dict = build_dictionary(train_path(args.target_lang), tgt=True) else: tgt_dict = None src_dict.save(dict_path(args.source_lang)) if target and tgt_dict is not None: tgt_dict.save(dict_path(args.target_lang)) def make_binary_dataset(input_prefix, output_prefix, lang, num_workers): dict = task.load_dictionary(dict_path(lang)) print("| [{}] Dictionary: {} types".format(lang, len(dict) - 1)) n_seq_tok = [0, 0] replaced = Counter() def merge_result(worker_result): replaced.update(worker_result["replaced"]) n_seq_tok[0] += worker_result["nseq"] n_seq_tok[1] += worker_result["ntok"] input_file = "{}{}".format(input_prefix, ("." 
+ lang) if lang is not None else "") offsets = Binarizer.find_offsets(input_file, num_workers) pool = None if num_workers > 1: pool = Pool(processes=num_workers - 1) for worker_id in range(1, num_workers): prefix = "{}{}".format(output_prefix, worker_id) pool.apply_async( binarize, ( args, input_file, dict, prefix, lang, offsets[worker_id], offsets[worker_id + 1], ), callback=merge_result, ) pool.close() ds = indexed_dataset.IndexedDatasetBuilder( dataset_dest_file(args, output_prefix, lang, "bin")) merge_result( Binarizer.binarize(input_file, dict, lambda t: ds.add_item(t), offset=0, end=offsets[1])) if num_workers > 1: pool.join() for worker_id in range(1, num_workers): prefix = "{}{}".format(output_prefix, worker_id) temp_file_path = dataset_dest_prefix(args, prefix, lang) ds.merge_file_(temp_file_path) os.remove(indexed_dataset.data_file_path(temp_file_path)) os.remove(indexed_dataset.index_file_path(temp_file_path)) ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx")) print("| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}".format( lang, input_file, n_seq_tok[0], n_seq_tok[1], 100 * sum(replaced.values()) / n_seq_tok[1], dict.unk_word, )) def make_dataset(input_prefix, output_prefix, lang, num_workers=1): if args.output_format == "binary": make_binary_dataset(input_prefix, output_prefix, lang, num_workers) elif args.output_format == "raw": # Copy original text file to destination folder output_text_file = dest_path( output_prefix + ".{}-{}".format(args.source_lang, args.target_lang), lang, ) shutil.copyfile(file_name(input_prefix, lang), output_text_file) def make_all(lang): if args.trainpref: for k, trainpref in enumerate(args.trainpref.split(",")): outprefix = "train{}".format(k) if k > 0 else "train" make_dataset(trainpref, outprefix, lang, num_workers=args.workers) if args.validpref: for k, validpref in enumerate(args.validpref.split(",")): outprefix = "valid{}".format(k) if k > 0 else "valid" make_dataset(validpref, outprefix, lang) if args.testpref: for k, testpref in enumerate(args.testpref.split(",")): outprefix = "test{}".format(k) if k > 0 else "test" make_dataset(testpref, outprefix, lang) make_all(args.source_lang) if target: make_all(args.target_lang) print("| Wrote preprocessed data to {}".format(args.destdir)) if args.alignfile: raise NotImplementedError('args.alignfile is not implemented') assert args.trainpref, "--trainpref must be set if --alignfile is specified" src_file_name = train_path(args.source_lang) tgt_file_name = train_path(args.target_lang) freq_map = {} with open(args.alignfile, "r", encoding='utf-8') as align_file: with open(src_file_name, "r", encoding='utf-8') as src_file: with open(tgt_file_name, "r", encoding='utf-8') as tgt_file: for a, s, t in zip_longest(align_file, src_file, tgt_file): si = Tokenizer.tokenize(s, src_dict, add_if_not_exist=False) ti = Tokenizer.tokenize(t, tgt_dict, add_if_not_exist=False) ai = list(map(lambda x: tuple(x.split("-")), a.split())) for sai, tai in ai: srcidx = si[int(sai)] tgtidx = ti[int(tai)] if srcidx != src_dict.unk( ) and tgtidx != tgt_dict.unk(): assert srcidx != src_dict.pad() assert srcidx != src_dict.eos() assert tgtidx != tgt_dict.pad() assert tgtidx != tgt_dict.eos() if srcidx not in freq_map: freq_map[srcidx] = {} if tgtidx not in freq_map[srcidx]: freq_map[srcidx][tgtidx] = 1 else: freq_map[srcidx][tgtidx] += 1 align_dict = {} for srcidx in freq_map.keys(): align_dict[srcidx] = max(freq_map[srcidx], key=freq_map[srcidx].get) with open(os.path.join( args.destdir, 
"alignment.{}-{}.txt".format(args.source_lang, args.target_lang), ), "w", encoding='utf-8') as f: for k, v in align_dict.items(): print("{} {}".format(src_dict[k], tgt_dict[v]), file=f)
def main(parsed_args):
    assert parsed_args.path is not None, '--path required for evaluation!'
    assert parsed_args.input_ordered_file != "", '--input_ordered_file required for evaluation!'
    assert parsed_args.input_shuffled_file != "", '--input_shuffled_file required for evaluation!'

    print(parsed_args)

    use_cuda = torch.cuda.is_available() and not parsed_args.cpu
    task = tasks.setup_task(parsed_args)

    # Load ensemble
    print('| loading model(s) from {}'.format(parsed_args.path))
    models, args = utils.load_ensemble_for_inference(
        parsed_args.path.split(':'), task,
        model_arg_overrides=eval(parsed_args.model_overrides))

    for arg in vars(parsed_args).keys():
        if arg not in {'self_target', 'future_target', 'past_target',
                       'tokens_per_sample', 'output_size_dictionary'}:
            setattr(args, arg, getattr(parsed_args, arg))
    task = tasks.setup_task(args)

    print(f"Reference file: {parsed_args.input_ordered_file}")
    print(f"Shuffled file: {parsed_args.input_shuffled_file}")

    # Optimize ensemble for generation and set the source and dest dicts on the model (required by scorer)
    for model in models:
        model.make_generation_fast_()
        if args.fp16:
            model.half()

    assert len(models) == 1
    print('num. model params: {}'.format(sum(p.numel() for p in models[0].parameters())))

    lm_input_dictionary = task.dictionary
    vocab = task.target_dictionary
    # print(f"len(lm_input_dictionary): {len(lm_input_dictionary)}")
    # print(f"len(lm_dictionary): {len(vocab)}")
    print(f"Size of vocabulary dictionary: {len(vocab)}")
    assert len(lm_input_dictionary) == len(vocab)
    # print(lm_input_dictionary.eos())
    # print(vocab.eos())

    model = models[0]
    if use_cuda:
        model.cuda()

    #######################################
    eos_id = lm_input_dictionary.eos()

    ordered_input_file_object = codecs.open(parsed_args.input_ordered_file, encoding="utf-8")

    sentence_num = 0
    total_shuffled_neg_log_prob = 0.0
    total_original_neg_log_prob = 0.0
    total_num_tokens = 0

    with codecs.open(parsed_args.input_shuffled_file, encoding="utf-8") as f:
        for line in f:
            if sentence_num >= parsed_args.ordering_start_sentence and sentence_num <= parsed_args.ordering_end_sentence:
                token_ids_tensor = Tokenizer.tokenize(
                    line, lm_input_dictionary, add_if_not_exist=False,
                    append_eos=False, reverse_order=False,
                ).long()
                token_ids = [t for t in token_ids_tensor]
                tokens = tokenize_line(line)
                assert len(token_ids) == len(tokens)
                # print(f"------------------Sentence: {sentence_num}")
                # print(f"Length: {len(tokens)}")

                shuffled_log_prob = advance(model, [eos_id] + token_ids, token_ids + [eos_id], use_cuda, 1)
                total_shuffled_neg_log_prob += -1 * shuffled_log_prob

                original_line = ordered_input_file_object.readline()
                original_token_ids_tensor = Tokenizer.tokenize(
                    original_line, lm_input_dictionary, add_if_not_exist=False,
                    append_eos=False, reverse_order=False,
                ).long()
                original_token_ids = [t for t in original_token_ids_tensor]
                original_tokens = tokenize_line(original_line)
                assert len(original_token_ids) == len(original_tokens) and len(original_tokens) == len(tokens)

                original_log_prob = advance(model, [eos_id] + original_token_ids, original_token_ids + [eos_id], use_cuda, 1)
                total_original_neg_log_prob += -1 * original_log_prob

                total_num_tokens += len(original_token_ids) + 1  # +1 for the eos_id
            else:
                ordered_input_file_object.readline()  # advance file to maintain alignment with the shuffled file
            sentence_num += 1

    shuffled_perplexity = np.exp(total_shuffled_neg_log_prob / total_num_tokens)
    original_perplexity = np.exp(total_original_neg_log_prob / total_num_tokens)
    print(f"Shuffled perplexity: {shuffled_perplexity}")
    print(f"Original perplexity: {original_perplexity}")
    print(f"Total tokens (including eos): {total_num_tokens}")
def main(args): print(args) os.makedirs(args.destdir, exist_ok=True) target = not args.only_source def build_dictionary(filenames): d = Dictionary() for filename in filenames: Tokenizer.add_file_to_dictionary(filename, d, tokenize_line, args.workers) return d def train_path(lang): return '{}{}'.format(args.trainpref, ('.' + lang) if lang else '') def file_name(prefix, lang): fname = prefix if lang is not None: fname += f'.{lang}' return fname def dest_path(prefix, lang): return os.path.join(args.destdir, file_name(prefix, lang)) def dict_path(lang): return dest_path('dict', lang) + '.txt' if args.joined_dictionary: assert not args.srcdict, 'cannot combine --srcdict and --joined-dictionary' assert not args.tgtdict, 'cannot combine --tgtdict and --joined-dictionary' src_dict = build_dictionary( set([ train_path(lang) for lang in [args.source_lang, args.target_lang] ])) tgt_dict = src_dict else: if args.srcdict: src_dict = Dictionary.load(args.srcdict) else: assert args.trainpref, "--trainpref must be set if --srcdict is not specified" src_dict = build_dictionary([train_path(args.source_lang)]) if target: if args.tgtdict: tgt_dict = Dictionary.load(args.tgtdict) else: assert args.trainpref, "--trainpref must be set if --tgtdict is not specified" tgt_dict = build_dictionary([train_path(args.target_lang)]) src_dict.finalize( threshold=args.thresholdsrc, nwords=args.nwordssrc, padding_factor=args.padding_factor, ) src_dict.save(dict_path(args.source_lang)) if target: if not args.joined_dictionary: tgt_dict.finalize( threshold=args.thresholdtgt, nwords=args.nwordstgt, padding_factor=args.padding_factor, ) tgt_dict.save(dict_path(args.target_lang)) def make_binary_dataset(input_prefix, output_prefix, lang, num_workers): dict = Dictionary.load(dict_path(lang)) print('| [{}] Dictionary: {} types'.format(lang, len(dict) - 1)) n_seq_tok = [0, 0] replaced = Counter() def merge_result(worker_result): replaced.update(worker_result['replaced']) n_seq_tok[0] += worker_result['nseq'] n_seq_tok[1] += worker_result['ntok'] input_file = '{}{}'.format(input_prefix, ('.' 
+ lang) if lang is not None else '') offsets = Tokenizer.find_offsets(input_file, num_workers) pool = None if num_workers > 1: pool = Pool(processes=num_workers - 1) for worker_id in range(1, num_workers): prefix = "{}{}".format(output_prefix, worker_id) pool.apply_async(binarize, (args, input_file, dict, prefix, lang, offsets[worker_id], offsets[worker_id + 1]), callback=merge_result) pool.close() ds = indexed_dataset.IndexedDatasetBuilder( dataset_dest_file(args, output_prefix, lang, 'bin')) merge_result( Tokenizer.binarize(input_file, dict, lambda t: ds.add_item(t), offset=0, end=offsets[1])) if num_workers > 1: pool.join() for worker_id in range(1, num_workers): prefix = "{}{}".format(output_prefix, worker_id) temp_file_path = dataset_dest_prefix(args, prefix, lang) ds.merge_file_(temp_file_path) os.remove(indexed_dataset.data_file_path(temp_file_path)) os.remove(indexed_dataset.index_file_path(temp_file_path)) ds.finalize(dataset_dest_file(args, output_prefix, lang, 'idx')) print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format( lang, input_file, n_seq_tok[0], n_seq_tok[1], 100 * sum(replaced.values()) / n_seq_tok[1], dict.unk_word)) def make_dataset(input_prefix, output_prefix, lang, num_workers=1): if args.output_format == 'binary': make_binary_dataset(input_prefix, output_prefix, lang, num_workers) elif args.output_format == 'raw': # Copy original text file to destination folder output_text_file = dest_path( output_prefix + '.{}-{}'.format(args.source_lang, args.target_lang), lang, ) shutil.copyfile(file_name(input_prefix, lang), output_text_file) def make_all(lang): if args.trainpref: make_dataset(args.trainpref, 'train', lang, num_workers=args.workers) if args.validpref: for k, validpref in enumerate(args.validpref.split(',')): outprefix = 'valid{}'.format(k) if k > 0 else 'valid' make_dataset(validpref, outprefix, lang) if args.testpref: for k, testpref in enumerate(args.testpref.split(',')): outprefix = 'test{}'.format(k) if k > 0 else 'test' make_dataset(testpref, outprefix, lang) make_all(args.source_lang) if target: make_all(args.target_lang) print('| Wrote preprocessed data to {}'.format(args.destdir)) if args.alignfile: assert args.trainpref, "--trainpref must be set if --alignfile is specified" src_file_name = train_path(args.source_lang) tgt_file_name = train_path(args.target_lang) src_dict = Dictionary.load(dict_path(args.source_lang)) tgt_dict = Dictionary.load(dict_path(args.target_lang)) freq_map = {} with open(args.alignfile, 'r') as align_file: with open(src_file_name, 'r') as src_file: with open(tgt_file_name, 'r') as tgt_file: for a, s, t in zip_longest(align_file, src_file, tgt_file): si = Tokenizer.tokenize(s, src_dict, add_if_not_exist=False) ti = Tokenizer.tokenize(t, tgt_dict, add_if_not_exist=False) ai = list(map(lambda x: tuple(x.split('-')), a.split())) for sai, tai in ai: srcidx = si[int(sai)] tgtidx = ti[int(tai)] if srcidx != src_dict.unk( ) and tgtidx != tgt_dict.unk(): assert srcidx != src_dict.pad() assert srcidx != src_dict.eos() assert tgtidx != tgt_dict.pad() assert tgtidx != tgt_dict.eos() if srcidx not in freq_map: freq_map[srcidx] = {} if tgtidx not in freq_map[srcidx]: freq_map[srcidx][tgtidx] = 1 else: freq_map[srcidx][tgtidx] += 1 align_dict = {} for srcidx in freq_map.keys(): align_dict[srcidx] = max(freq_map[srcidx], key=freq_map[srcidx].get) with open( os.path.join( args.destdir, 'alignment.{}-{}.txt'.format(args.source_lang, args.target_lang)), 'w') as f: for k, v in align_dict.items(): print('{} {}'.format(src_dict[k], 
tgt_dict[v]), file=f)
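The multi-worker variant above parallelizes binarization by first computing byte offsets that split the input file into one chunk per worker, then letting each worker binarize its own range. A sketch of how such offsets can be computed so that chunk boundaries always fall on line breaks; this is an illustrative stand-in, not the project's find_offsets implementation:

import os

def find_offsets(filename, num_chunks):
    # Split the file into num_chunks byte ranges, advancing each boundary to the next
    # newline so that no line is split between two workers.
    size = os.path.getsize(filename)
    chunk_size = size // num_chunks
    offsets = [0] * (num_chunks + 1)
    with open(filename, 'rb') as f:
        for i in range(1, num_chunks):
            f.seek(i * chunk_size)
            f.readline()          # skip the partial line at the boundary
            offsets[i] = f.tell()
        offsets[num_chunks] = size
    return offsets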
def main(args): print(args) os.makedirs(args.destdir, exist_ok=True) target = not args.only_source if args.joined_dictionary: assert not args.srcdict, 'cannot combine --srcdict and --joined-dictionary' assert not args.tgtdict, 'cannot combine --tgtdict and --joined-dictionary' src_dict = dictionary.Dictionary() for lang in [args.source_lang, args.target_lang]: Tokenizer.add_file_to_dictionary( filename='{}.{}'.format(args.trainpref, lang), dict=src_dict, tokenize=tokenize_line, ) src_dict.finalize() tgt_dict = src_dict else: if args.srcdict: src_dict = dictionary.Dictionary.load(args.srcdict) else: assert args.trainpref, "--trainpref must be set if --srcdict is not specified" src_dict = Tokenizer.build_dictionary(filename='{}.{}'.format(args.trainpref, args.source_lang)) if target: if args.tgtdict: tgt_dict = dictionary.Dictionary.load(args.tgtdict) else: assert args.trainpref, "--trainpref must be set if --tgtdict is not specified" tgt_dict = Tokenizer.build_dictionary(filename='{}.{}'.format(args.trainpref, args.target_lang)) src_dict.save(os.path.join(args.destdir, 'dict.{}.txt'.format(args.source_lang)), threshold=args.thresholdsrc, nwords=args.nwordssrc) if target: tgt_dict.save(os.path.join(args.destdir, 'dict.{}.txt'.format(args.target_lang)), threshold=args.thresholdtgt, nwords=args.nwordstgt) def make_binary_dataset(input_prefix, output_prefix, lang): dict = dictionary.Dictionary.load(os.path.join(args.destdir, 'dict.{}.txt'.format(lang))) print('| [{}] Dictionary: {} types'.format(lang, len(dict) - 1)) ds = indexed_dataset.IndexedDatasetBuilder( '{}/{}.{}-{}.{}.bin'.format(args.destdir, output_prefix, args.source_lang, args.target_lang, lang) ) def consumer(tensor): ds.add_item(tensor) input_file = '{}.{}'.format(input_prefix, lang) res = Tokenizer.binarize(input_file, dict, consumer) print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format( lang, input_file, res['nseq'], res['ntok'], 100 * res['nunk'] / res['ntok'], dict.unk_word)) ds.finalize('{}/{}.{}-{}.{}.idx'.format( args.destdir, output_prefix, args.source_lang, args.target_lang, lang)) def make_dataset(input_prefix, output_prefix, lang, output_format='binary'): if output_format == 'binary': make_binary_dataset(input_prefix, output_prefix, lang) elif output_format == 'raw': # Copy original text file to destination folder output_text_file = os.path.join(args.destdir, '{}.{}'.format(output_prefix, lang)) shutil.copyfile('{}.{}'.format(input_prefix, lang), output_text_file) def make_all(args, make_dataset, lang): if args.trainpref: make_dataset(args.trainpref, 'train', lang, args.output_format) if args.validpref: for k, validpref in enumerate(args.validpref.split(',')): outprefix = 'valid{}'.format(k) if k > 0 else 'valid' make_dataset(validpref, outprefix, lang, args.output_format) if args.testpref: for k, testpref in enumerate(args.testpref.split(',')): outprefix = 'test{}'.format(k) if k > 0 else 'test' make_dataset(testpref, outprefix, lang, args.output_format) make_all(args, make_dataset, args.source_lang) if target: make_all(args, make_dataset, args.target_lang) print('| Wrote preprocessed data to {}'.format(args.destdir)) if args.alignfile: assert args.trainpref, "--trainpref must be set if --alignfile is specified" src_file_name = '{}.{}'.format(args.trainpref, args.source_lang) tgt_file_name = '{}.{}'.format(args.trainpref, args.target_lang) src_dict = dictionary.Dictionary.load(os.path.join(args.destdir, 'dict.{}.txt'.format(args.source_lang))) tgt_dict = 
dictionary.Dictionary.load(os.path.join(args.destdir, 'dict.{}.txt'.format(args.target_lang))) freq_map = {} with open(args.alignfile, 'r') as align_file: with open(src_file_name, 'r') as src_file: with open(tgt_file_name, 'r') as tgt_file: for a, s, t in zip_longest(align_file, src_file, tgt_file): si = Tokenizer.tokenize(s, src_dict, add_if_not_exist=False) ti = Tokenizer.tokenize(t, tgt_dict, add_if_not_exist=False) ai = list(map(lambda x: tuple(x.split('-')), a.split())) for sai, tai in ai: srcidx = si[int(sai)] tgtidx = ti[int(tai)] if srcidx != src_dict.unk() and tgtidx != tgt_dict.unk(): assert srcidx != src_dict.pad() assert srcidx != src_dict.eos() assert tgtidx != tgt_dict.pad() assert tgtidx != tgt_dict.eos() if srcidx not in freq_map: freq_map[srcidx] = {} if tgtidx not in freq_map[srcidx]: freq_map[srcidx][tgtidx] = 1 else: freq_map[srcidx][tgtidx] += 1 align_dict = {} for srcidx in freq_map.keys(): align_dict[srcidx] = max(freq_map[srcidx], key=freq_map[srcidx].get) with open(os.path.join(args.destdir, 'alignment.{}-{}.txt'.format( args.source_lang, args.target_lang)), 'w') as f: for k, v in align_dict.items(): print('{} {}'.format(src_dict[k], tgt_dict[v]), file=f)
def main(args): print(args) os.makedirs(args.destdir, exist_ok=True) target = not args.only_source def build_dictionary(filenames): d = dictionary.Dictionary() for filename in filenames: Tokenizer.add_file_to_dictionary(filename, d, tokenize_line) return d def train_path(lang): return '{}{}'.format(args.trainpref, ('.' + lang) if lang else '') def file_name(prefix, lang): fname = prefix if lang is not None: fname += f'.{lang}' return fname def dest_path(prefix, lang): return os.path.join(args.destdir, file_name(prefix, lang)) def dict_path(lang): return dest_path('dict', lang) + '.txt' def dataset_dest_path(output_prefix, lang, extension): base = f'{args.destdir}/{output_prefix}' lang_part = f'.{args.source_lang}-{args.target_lang}.{lang}' if lang is not None else '' return f'{base}{lang_part}.{extension}' if args.joined_dictionary: assert not args.srcdict, 'cannot combine --srcdict and --joined-dictionary' assert not args.tgtdict, 'cannot combine --tgtdict and --joined-dictionary' src_dict = build_dictionary( set([ train_path(lang) for lang in [args.source_lang, args.target_lang] ])) tgt_dict = src_dict else: if args.srcdict: src_dict = dictionary.Dictionary.load(args.srcdict) else: assert args.trainpref, "--trainpref must be set if --srcdict is not specified" src_dict = build_dictionary([train_path(args.source_lang)]) if target: if args.tgtdict: tgt_dict = dictionary.Dictionary.load(args.tgtdict) else: assert args.trainpref, "--trainpref must be set if --tgtdict is not specified" tgt_dict = build_dictionary([train_path(args.target_lang)]) src_dict.finalize( threshold=args.thresholdsrc, nwords=args.nwordssrc, padding_factor=args.padding_factor, ) src_dict.save(dict_path(args.source_lang)) if target: if not args.joined_dictionary: tgt_dict.finalize( threshold=args.thresholdtgt, nwords=args.nwordstgt, padding_factor=args.padding_factor, ) tgt_dict.save(dict_path(args.target_lang)) def make_binary_dataset(input_prefix, output_prefix, lang): dict = dictionary.Dictionary.load(dict_path(lang)) print('| [{}] Dictionary: {} types'.format(lang, len(dict) - 1)) ds = indexed_dataset.IndexedDatasetBuilder( dataset_dest_path(output_prefix, lang, 'bin')) def consumer(tensor): ds.add_item(tensor) input_file = '{}{}'.format(input_prefix, ('.' 
+ lang) if lang is not None else '') res = Tokenizer.binarize(input_file, dict, consumer) print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format( lang, input_file, res['nseq'], res['ntok'], 100 * res['nunk'] / res['ntok'], dict.unk_word)) ds.finalize(dataset_dest_path(output_prefix, lang, 'idx')) def make_dataset(input_prefix, output_prefix, lang): if args.output_format == 'binary': make_binary_dataset(input_prefix, output_prefix, lang) elif args.output_format == 'raw': # Copy original text file to destination folder output_text_file = dest_path( output_prefix + '.{}-{}'.format(args.source_lang, args.target_lang), lang, ) shutil.copyfile(file_name(input_prefix, lang), output_text_file) def make_all(lang): if args.trainpref: make_dataset(args.trainpref, 'train', lang) if args.validpref: for k, validpref in enumerate(args.validpref.split(',')): outprefix = 'valid{}'.format(k) if k > 0 else 'valid' make_dataset(validpref, outprefix, lang) if args.testpref: for k, testpref in enumerate(args.testpref.split(',')): outprefix = 'test{}'.format(k) if k > 0 else 'test' make_dataset(testpref, outprefix, lang) make_all(args.source_lang) if target: make_all(args.target_lang) print('| Wrote preprocessed data to {}'.format(args.destdir)) if args.alignfile: assert args.trainpref, "--trainpref must be set if --alignfile is specified" src_file_name = train_path(args.source_lang) tgt_file_name = train_path(args.target_lang) src_dict = dictionary.Dictionary.load(dict_path(args.source_lang)) tgt_dict = dictionary.Dictionary.load(dict_path(args.target_lang)) freq_map = {} with open(args.alignfile, 'r') as align_file: with open(src_file_name, 'r') as src_file: with open(tgt_file_name, 'r') as tgt_file: for a, s, t in zip_longest(align_file, src_file, tgt_file): si = Tokenizer.tokenize(s, src_dict, add_if_not_exist=False) ti = Tokenizer.tokenize(t, tgt_dict, add_if_not_exist=False) ai = list(map(lambda x: tuple(x.split('-')), a.split())) for sai, tai in ai: srcidx = si[int(sai)] tgtidx = ti[int(tai)] if srcidx != src_dict.unk( ) and tgtidx != tgt_dict.unk(): assert srcidx != src_dict.pad() assert srcidx != src_dict.eos() assert tgtidx != tgt_dict.pad() assert tgtidx != tgt_dict.eos() if srcidx not in freq_map: freq_map[srcidx] = {} if tgtidx not in freq_map[srcidx]: freq_map[srcidx][tgtidx] = 1 else: freq_map[srcidx][tgtidx] += 1 align_dict = {} for srcidx in freq_map.keys(): align_dict[srcidx] = max(freq_map[srcidx], key=freq_map[srcidx].get) with open( os.path.join( args.destdir, 'alignment.{}-{}.txt'.format(args.source_lang, args.target_lang)), 'w') as f: for k, v in align_dict.items(): print('{} {}'.format(src_dict[k], tgt_dict[v]), file=f)
def main(): parser = argparse.ArgumentParser( description= 'Data pre-processing: Create dictionary and store data in binary format' ) parser.add_argument('-s', '--source-lang', default=None, metavar='SRC', help='source language') parser.add_argument('-t', '--target-lang', default=None, metavar='TARGET', help='target language') parser.add_argument('--trainpref', metavar='FP', default='train', help='target language') parser.add_argument('--validpref', metavar='FP', default='valid', help='comma separated, valid language prefixes') parser.add_argument('--testpref', metavar='FP', default='test', help='comma separated, test language prefixes') parser.add_argument('--destdir', metavar='DIR', default='data-bin', help='destination dir') parser.add_argument( '--thresholdtgt', metavar='N', default=0, type=int, help='map words appearing less than threshold times to unknown') parser.add_argument( '--thresholdsrc', metavar='N', default=0, type=int, help='map words appearing less than threshold times to unknown') parser.add_argument('--nwordstgt', metavar='N', default=-1, type=int, help='number of target words to retain') parser.add_argument('--nwordssrc', metavar='N', default=-1, type=int, help='number of source words to retain') parser.add_argument('--alignfile', metavar='ALIGN', default=None, help='an alignment file (optional)') args = parser.parse_args() print(args) os.makedirs(args.destdir, exist_ok=True) src_dict = Tokenizer.build_dictionary( filename='{}.{}'.format(args.trainpref, args.source_lang)) src_dict.save(os.path.join(args.destdir, 'dict.{}.txt'.format(args.source_lang)), threshold=args.thresholdsrc, nwords=args.nwordssrc) tgt_dict = Tokenizer.build_dictionary( filename='{}.{}'.format(args.trainpref, args.target_lang)) tgt_dict.save(os.path.join(args.destdir, 'dict.{}.txt'.format(args.target_lang)), threshold=args.thresholdtgt, nwords=args.nwordstgt) def make_dataset(input_prefix, output_prefix, lang): dict = dictionary.Dictionary.load( os.path.join(args.destdir, 'dict.{}.txt'.format(lang))) print('| [{}] Dictionary: {} types'.format(lang, len(dict) - 1)) ds = indexed_dataset.IndexedDatasetBuilder('{}/{}.{}-{}.{}.bin'.format( args.destdir, output_prefix, args.source_lang, args.target_lang, lang)) def consumer(tensor): ds.add_item(tensor) input_file = '{}.{}'.format(input_prefix, lang) res = Tokenizer.binarize(input_file, dict, consumer) print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format( lang, input_file, res['nseq'], res['ntok'], 100 * res['nunk'] / res['ntok'], dict.unk_word)) ds.finalize('{}/{}.{}-{}.{}.idx'.format(args.destdir, output_prefix, args.source_lang, args.target_lang, lang)) make_dataset(args.trainpref, 'train', args.source_lang) make_dataset(args.trainpref, 'train', args.target_lang) for k, validpref in enumerate(args.validpref.split(',')): outprefix = 'valid{}'.format(k) if k > 0 else 'valid' make_dataset(validpref, outprefix, args.source_lang) make_dataset(validpref, outprefix, args.target_lang) for k, testpref in enumerate(args.testpref.split(',')): outprefix = 'test{}'.format(k) if k > 0 else 'test' make_dataset(testpref, outprefix, args.source_lang) make_dataset(testpref, outprefix, args.target_lang) print('| Wrote preprocessed data to {}'.format(args.destdir)) if args.alignfile: src_file_name = '{}.{}'.format(args.trainpref, args.source_lang) tgt_file_name = '{}.{}'.format(args.trainpref, args.target_lang) src_dict = dictionary.Dictionary.load( os.path.join(args.destdir, 'dict.{}.txt'.format(args.source_lang))) tgt_dict = dictionary.Dictionary.load( 
os.path.join(args.destdir, 'dict.{}.txt'.format(args.target_lang))) freq_map = {} with open(args.alignfile, 'r') as align_file: with open(src_file_name, 'r') as src_file: with open(tgt_file_name, 'r') as tgt_file: for a, s, t in zip_longest(align_file, src_file, tgt_file): si = Tokenizer.tokenize(s, src_dict, add_if_not_exist=False) ti = Tokenizer.tokenize(t, tgt_dict, add_if_not_exist=False) ai = list(map(lambda x: tuple(x.split('-')), a.split())) for sai, tai in ai: srcidx = si[int(sai)] tgtidx = ti[int(tai)] if srcidx != src_dict.unk( ) and tgtidx != tgt_dict.unk(): assert srcidx != src_dict.pad() assert srcidx != src_dict.eos() assert tgtidx != tgt_dict.pad() assert tgtidx != tgt_dict.eos() if srcidx not in freq_map: freq_map[srcidx] = {} if tgtidx not in freq_map[srcidx]: freq_map[srcidx][tgtidx] = 1 else: freq_map[srcidx][tgtidx] += 1 align_dict = {} for srcidx in freq_map.keys(): align_dict[srcidx] = max(freq_map[srcidx], key=freq_map[srcidx].get) with open( os.path.join( args.destdir, 'alignment.{}-{}.txt'.format(args.source_lang, args.target_lang)), 'w') as f: for k, v in align_dict.items(): print('{} {}'.format(src_dict[k], tgt_dict[v]), file=f)
def main(): parser = argparse.ArgumentParser( description='Data pre-processing: Create dictionary and store data in binary format') parser.add_argument('-s', '--source-lang', default=None, metavar='SRC', help='source language') parser.add_argument('-t', '--target-lang', default=None, metavar='TARGET', help='target language') parser.add_argument('--trainpref', metavar='FP', default='train', help='target language') parser.add_argument('--validpref', metavar='FP', default='valid', help='comma separated, valid language prefixes') parser.add_argument('--testpref', metavar='FP', default='test', help='comma separated, test language prefixes') parser.add_argument('--destdir', metavar='DIR', default='data-bin', help='destination dir') parser.add_argument('--thresholdtgt', metavar='N', default=0, type=int, help='map words appearing less than threshold times to unknown') parser.add_argument('--thresholdsrc', metavar='N', default=0, type=int, help='map words appearing less than threshold times to unknown') parser.add_argument('--tgtdict', metavar='FP', help='reuse given target dictionary') parser.add_argument('--srcdict', metavar='FP', help='reuse given source dictionary') parser.add_argument('--nwordstgt', metavar='N', default=-1, type=int, help='number of target words to retain') parser.add_argument('--nwordssrc', metavar='N', default=-1, type=int, help='number of source words to retain') parser.add_argument('--alignfile', metavar='ALIGN', default=None, help='an alignment file (optional)') parser.add_argument('--output-format', metavar='FORMAT', default='binary', choices=['binary', 'raw'], help='output format (optional)') args = parser.parse_args() print(args) os.makedirs(args.destdir, exist_ok=True) if args.srcdict: src_dict = dictionary.Dictionary.load(args.srcdict) else: src_dict = Tokenizer.build_dictionary(filename='{}.{}'.format(args.trainpref, args.source_lang)) src_dict.save(os.path.join(args.destdir, 'dict.{}.txt'.format(args.source_lang)), threshold=args.thresholdsrc, nwords=args.nwordssrc) if args.tgtdict: tgt_dict = dictionary.Dictionary.load(args.tgtdict) else: tgt_dict = Tokenizer.build_dictionary(filename='{}.{}'.format(args.trainpref, args.target_lang)) tgt_dict.save(os.path.join(args.destdir, 'dict.{}.txt'.format(args.target_lang)), threshold=args.thresholdtgt, nwords=args.nwordstgt) def make_binary_dataset(input_prefix, output_prefix, lang): dict = dictionary.Dictionary.load(os.path.join(args.destdir, 'dict.{}.txt'.format(lang))) print('| [{}] Dictionary: {} types'.format(lang, len(dict) - 1)) ds = indexed_dataset.IndexedDatasetBuilder( '{}/{}.{}-{}.{}.bin'.format(args.destdir, output_prefix, args.source_lang, args.target_lang, lang) ) def consumer(tensor): ds.add_item(tensor) input_file = '{}.{}'.format(input_prefix, lang) res = Tokenizer.binarize(input_file, dict, consumer) print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format( lang, input_file, res['nseq'], res['ntok'], 100 * res['nunk'] / res['ntok'], dict.unk_word)) ds.finalize('{}/{}.{}-{}.{}.idx'.format( args.destdir, output_prefix, args.source_lang, args.target_lang, lang)) def make_dataset(input_prefix, output_prefix, lang, output_format='binary'): if output_format == 'binary': make_binary_dataset(input_prefix, output_prefix, lang) elif output_format == 'raw': # Copy original text file to destination folder output_text_file = os.path.join(args.destdir, '{}.{}'.format(output_prefix, lang)) shutil.copyfile('{}.{}'.format(input_prefix, lang), output_text_file) make_dataset(args.trainpref, 'train', 
args.source_lang, args.output_format) make_dataset(args.trainpref, 'train', args.target_lang, args.output_format) for k, validpref in enumerate(args.validpref.split(',')): outprefix = 'valid{}'.format(k) if k > 0 else 'valid' make_dataset(validpref, outprefix, args.source_lang, args.output_format) make_dataset(validpref, outprefix, args.target_lang, args.output_format) for k, testpref in enumerate(args.testpref.split(',')): outprefix = 'test{}'.format(k) if k > 0 else 'test' make_dataset(testpref, outprefix, args.source_lang, args.output_format) make_dataset(testpref, outprefix, args.target_lang, args.output_format) print('| Wrote preprocessed data to {}'.format(args.destdir)) if args.alignfile: src_file_name = '{}.{}'.format(args.trainpref, args.source_lang) tgt_file_name = '{}.{}'.format(args.trainpref, args.target_lang) src_dict = dictionary.Dictionary.load(os.path.join(args.destdir, 'dict.{}.txt'.format(args.source_lang))) tgt_dict = dictionary.Dictionary.load(os.path.join(args.destdir, 'dict.{}.txt'.format(args.target_lang))) freq_map = {} with open(args.alignfile, 'r') as align_file: with open(src_file_name, 'r') as src_file: with open(tgt_file_name, 'r') as tgt_file: for a, s, t in zip_longest(align_file, src_file, tgt_file): si = Tokenizer.tokenize(s, src_dict, add_if_not_exist=False) ti = Tokenizer.tokenize(t, tgt_dict, add_if_not_exist=False) ai = list(map(lambda x: tuple(x.split('-')), a.split())) for sai, tai in ai: srcidx = si[int(sai)] tgtidx = ti[int(tai)] if srcidx != src_dict.unk() and tgtidx != tgt_dict.unk(): assert srcidx != src_dict.pad() assert srcidx != src_dict.eos() assert tgtidx != tgt_dict.pad() assert tgtidx != tgt_dict.eos() if srcidx not in freq_map: freq_map[srcidx] = {} if tgtidx not in freq_map[srcidx]: freq_map[srcidx][tgtidx] = 1 else: freq_map[srcidx][tgtidx] += 1 align_dict = {} for srcidx in freq_map.keys(): align_dict[srcidx] = max(freq_map[srcidx], key=freq_map[srcidx].get) with open(os.path.join(args.destdir, 'alignment.{}-{}.txt'.format( args.source_lang, args.target_lang)), 'w') as f: for k, v in align_dict.items(): print('{} {}'.format(src_dict[k], tgt_dict[v]), file=f)
def main(args): os.makedirs(args.destdir, exist_ok=True) target = not args.only_source if args.joined_dictionary: assert not args.srcdict, 'cannot combine --srcdict and --joined-dictionary' assert not args.tgtdict, 'cannot combine --tgtdict and --joined-dictionary' src_dict = build_dictionary(set(chain.from_iterable([ train_paths(lang) for lang in [args.source_lang, args.target_lang] ]))) tgt_dict = src_dict else: assert args.trainpref, "--trainpref must be set" if not args.srcdict: src_dict = build_dictionary(chain.from_iterable([train_paths(args.source_lang)])) if target and not args.tgtdict: assert args.trainpref, "--trainpref must be set" tgt_dict = build_dictionary(chain.from_iterable([train_paths(args.target_lang)])) if args.srcdict: src_dict = load_dictionary(args.srcdict) if target and args.tgtdict: tgt_dict = load_dictionary(args.tgtdict) src_dict.finalize( threshold=args.thresholdsrc, nwords=args.nwordssrc, padding_factor=args.padding_factor, ) src_dict.save(dict_path(args.source_lang)) if target: if not args.joined_dictionary: tgt_dict.finalize( threshold=args.thresholdtgt, nwords=args.nwordstgt, padding_factor=args.padding_factor, ) tgt_dict.save(dict_path(args.target_lang)) make_all(args.source_lang, args.target_lang) print('| Wrote preprocessed data to {}'.format(args.destdir)) if args.alignfile: assert args.trainpref, "--trainpref must be set if --alignfile is specified" src_file_name = train_paths(args.source_lang)[0] tgt_file_name = train_paths(args.target_lang)[0] src_dict = dictionary.Dictionary.load(dict_path(args.source_lang)) tgt_dict = dictionary.Dictionary.load(dict_path(args.target_lang)) freq_map = {} with open(args.alignfile, 'r') as align_file: with open(src_file_name, 'r') as src_file: with open(tgt_file_name, 'r') as tgt_file: for a, s, t in zip_longest(align_file, src_file, tgt_file): si = Tokenizer.tokenize(s, src_dict, add_if_not_exist=False) ti = Tokenizer.tokenize(t, tgt_dict, add_if_not_exist=False) ai = list(map(lambda x: tuple(x.split('-')), a.split())) for sai, tai in ai: srcidx = si[int(sai)] tgtidx = ti[int(tai)] if srcidx != src_dict.unk() and tgtidx != tgt_dict.unk(): assert srcidx != src_dict.pad() assert srcidx != src_dict.eos() assert tgtidx != tgt_dict.pad() assert tgtidx != tgt_dict.eos() if srcidx not in freq_map: freq_map[srcidx] = {} if tgtidx not in freq_map[srcidx]: freq_map[srcidx][tgtidx] = 1 else: freq_map[srcidx][tgtidx] += 1 align_dict = {} for srcidx in freq_map.keys(): align_dict[srcidx] = max(freq_map[srcidx], key=freq_map[srcidx].get) with open(os.path.join(args.destdir, 'alignment.{}-{}.txt'.format( args.source_lang, args.target_lang)), 'w') as f: for k, v in align_dict.items(): print('{} {}'.format(src_dict[k], tgt_dict[v]), file=f)
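# The main(args) variant above calls helpers (train_paths, dict_path, build_dictionary,
# load_dictionary, make_all) that are defined elsewhere in the file it came from; make_all
# mirrors the train/valid/test loops shown in the other variants. Below is a rough sketch of
# plausible definitions, assuming they close over `args` and that --trainpref may be a
# comma-separated list of prefixes; only Dictionary/Tokenizer calls already used in this
# document are relied on, everything else is an assumption for illustration.
def train_paths(lang):
    # e.g. --trainpref corpus1,corpus2 with lang 'de' -> ['corpus1.de', 'corpus2.de']
    return ['{}{}'.format(pref, '.' + lang if lang else '')
            for pref in args.trainpref.split(',')]

def dict_path(lang):
    return os.path.join(args.destdir, 'dict.{}.txt'.format(lang))

def load_dictionary(path):
    return dictionary.Dictionary.load(path)

def build_dictionary(filenames):
    # accumulate token counts over all training files into one Dictionary
    d = dictionary.Dictionary()
    for filename in filenames:
        with open(filename, 'r', encoding='utf-8') as f:
            for line in f:
                Tokenizer.tokenize(line, d, add_if_not_exist=True)
    return d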
def main(): parser = argparse.ArgumentParser( description= 'Data pre-processing: Create dictionary and store data in binary format' ) parser.add_argument('-s', '--source-lang', default=None, metavar='SRC', help='source language') parser.add_argument('-t', '--target-lang', default=None, metavar='TARGET', help='target language') parser.add_argument('--trainpref', metavar='FP', default='train', help='train file prefix') parser.add_argument('--validpref', metavar='FP', default='valid', help='comma separated, valid file prefixes') parser.add_argument('--testpref', metavar='FP', default='test', help='comma separated, test file prefixes') parser.add_argument('--destdir', metavar='DIR', default='data-bin', help='destination dir') # If the training set is large it will contain many low-frequency words; such words carry little information and are often just spelling errors # The two arguments below filter those low-frequency words out: with a value of 10, words occurring fewer than 10 times are dropped and will not appear in the dict.txt file parser.add_argument( '--thresholdtgt', metavar='N', default=0, type=int, help='map words appearing less than threshold times to unknown') parser.add_argument( '--thresholdsrc', metavar='N', default=0, type=int, help='map words appearing less than threshold times to unknown') # If you already have dictionaries prepared, point to them with the two arguments below parser.add_argument('--tgtdict', metavar='FP', help='reuse given target dictionary') parser.add_argument('--srcdict', metavar='FP', help='reuse given source dictionary') # Limit the number of words kept for training parser.add_argument('--nwordstgt', metavar='N', default=-1, type=int, help='number of target words to retain') parser.add_argument('--nwordssrc', metavar='N', default=-1, type=int, help='number of source words to retain') # Alignment file format (for the train files): source-word index-target-word index, pairs separated by spaces, one line per sentence pair (e.g. 0-0 1-2 1-3 2-1 3-4 5-6 7-7) # The alignment file can be generated by another tool or aligned by hand # https://github.com/facebookresearch/fairseq-py/blob/master/preprocess.py#L99 # Note: the alignment file must have the same number of lines as train.en, not the total number of lines in the whole dataset parser.add_argument('--alignfile', metavar='ALIGN', default=None, help='an alignment file (optional)') parser.add_argument('--output-format', metavar='FORMAT', default='binary', choices=['binary', 'raw'], help='output format (optional)') args = parser.parse_args() print(args) os.makedirs(args.destdir, exist_ok=True) if args.srcdict: src_dict = dictionary.Dictionary.load(args.srcdict) else: src_dict = Tokenizer.build_dictionary( filename='{}.{}'.format(args.trainpref, args.source_lang)) src_dict.save(os.path.join(args.destdir, 'dict.{}.txt'.format(args.source_lang)), threshold=args.thresholdsrc, nwords=args.nwordssrc) if args.tgtdict: tgt_dict = dictionary.Dictionary.load(args.tgtdict) else: tgt_dict = Tokenizer.build_dictionary( filename='{}.{}'.format(args.trainpref, args.target_lang)) tgt_dict.save(os.path.join(args.destdir, 'dict.{}.txt'.format(args.target_lang)), threshold=args.thresholdtgt, nwords=args.nwordstgt) def make_binary_dataset(input_prefix, output_prefix, lang): dict = dictionary.Dictionary.load( os.path.join(args.destdir, 'dict.{}.txt'.format(lang))) print('| [{}] Dictionary: {} types'.format(lang, len(dict) - 1)) ds = indexed_dataset.IndexedDatasetBuilder('{}/{}.{}-{}.{}.bin'.format( args.destdir, output_prefix, args.source_lang, args.target_lang, lang)) def consumer(tensor): ds.add_item(tensor) input_file = '{}.{}'.format(input_prefix, lang) res = Tokenizer.binarize(input_file, dict, consumer) print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format( lang, input_file, res['nseq'], res['ntok'], 100 * res['nunk'] / res['ntok'], dict.unk_word)) ds.finalize('{}/{}.{}-{}.{}.idx'.format(args.destdir, output_prefix, args.source_lang, args.target_lang, lang)) def make_dataset(input_prefix, output_prefix, lang, 
output_format='binary'): if output_format == 'binary': make_binary_dataset(input_prefix, output_prefix, lang) elif output_format == 'raw': # Copy original text file to destination folder output_text_file = os.path.join( args.destdir, '{}.{}'.format(output_prefix, lang)) shutil.copyfile('{}.{}'.format(input_prefix, lang), output_text_file) make_dataset(args.trainpref, 'train', args.source_lang, args.output_format) make_dataset(args.trainpref, 'train', args.target_lang, args.output_format) for k, validpref in enumerate(args.validpref.split(',')): outprefix = 'valid{}'.format(k) if k > 0 else 'valid' make_dataset(validpref, outprefix, args.source_lang, args.output_format) make_dataset(validpref, outprefix, args.target_lang, args.output_format) for k, testpref in enumerate(args.testpref.split(',')): outprefix = 'test{}'.format(k) if k > 0 else 'test' make_dataset(testpref, outprefix, args.source_lang, args.output_format) make_dataset(testpref, outprefix, args.target_lang, args.output_format) print('| Wrote preprocessed data to {}'.format(args.destdir)) if args.alignfile: src_file_name = '{}.{}'.format(args.trainpref, args.source_lang) tgt_file_name = '{}.{}'.format(args.trainpref, args.target_lang) src_dict = dictionary.Dictionary.load( os.path.join(args.destdir, 'dict.{}.txt'.format(args.source_lang))) tgt_dict = dictionary.Dictionary.load( os.path.join(args.destdir, 'dict.{}.txt'.format(args.target_lang))) freq_map = {} with open(args.alignfile, 'r') as align_file: with open(src_file_name, 'r') as src_file: with open(tgt_file_name, 'r') as tgt_file: for a, s, t in zip_longest(align_file, src_file, tgt_file): si = Tokenizer.tokenize(s, src_dict, add_if_not_exist=False) ti = Tokenizer.tokenize(t, tgt_dict, add_if_not_exist=False) ai = list(map(lambda x: tuple(x.split('-')), a.split())) for sai, tai in ai: srcidx = si[int(sai)] tgtidx = ti[int(tai)] if srcidx != src_dict.unk( ) and tgtidx != tgt_dict.unk(): assert srcidx != src_dict.pad() assert srcidx != src_dict.eos() assert tgtidx != tgt_dict.pad() assert tgtidx != tgt_dict.eos() if srcidx not in freq_map: freq_map[srcidx] = {} if tgtidx not in freq_map[srcidx]: freq_map[srcidx][tgtidx] = 1 else: freq_map[srcidx][tgtidx] += 1 align_dict = {} for srcidx in freq_map.keys(): align_dict[srcidx] = max(freq_map[srcidx], key=freq_map[srcidx].get) with open( os.path.join( args.destdir, 'alignment.{}-{}.txt'.format(args.source_lang, args.target_lang)), 'w') as f: for k, v in align_dict.items(): print('{} {}'.format(src_dict[k], tgt_dict[v]), file=f)
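# A toy illustration of what the --alignfile block above computes: for each source token id it
# counts the target token ids it is aligned to across sentence pairs and keeps the most frequent
# one. The alignment strings and token ids are made-up data; the variable names mirror the block above.
toy_alignments = ['0-0 1-2 2-1', '0-0 1-1']   # one line per sentence pair: srcpos-tgtpos pairs
toy_src_ids = [[11, 12, 13], [11, 12]]        # ids as returned by Tokenizer.tokenize(...)
toy_tgt_ids = [[21, 23, 22], [21, 22]]
freq_map = {}
for a, si, ti in zip(toy_alignments, toy_src_ids, toy_tgt_ids):
    for sai, tai in (pair.split('-') for pair in a.split()):
        srcidx, tgtidx = si[int(sai)], ti[int(tai)]
        freq_map.setdefault(srcidx, {})
        freq_map[srcidx][tgtidx] = freq_map[srcidx].get(tgtidx, 0) + 1
align_dict = {srcidx: max(counts, key=counts.get) for srcidx, counts in freq_map.items()}
assert align_dict == {11: 21, 12: 22, 13: 23}
# each entry is then written out as a "src_word tgt_word" line in alignment.<src>-<tgt>.txt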
def main(args): print(args) os.makedirs(args.destdir, exist_ok=True) target = not args.only_source def train_path(lang): return "{}{}".format(args.trainpref, ("." + lang) if lang else "") def file_name(prefix, lang): fname = prefix if lang is not None: fname += ".{lang}".format(lang=lang) return fname def dest_path(prefix, lang): return os.path.join(args.destdir, file_name(prefix, lang)) def dict_path(lang): return dest_path("dict", lang) + ".txt" if args.inputtype == "text": if args.joined_dictionary: assert not args.srcdict, 'cannot combine --srcdict and --joined-dictionary' assert not args.tgtdict, 'cannot combine --tgtdict and --joined-dictionary' src_dict = build_dictionary( { train_path(lang) for lang in [args.source_lang, args.target_lang] }, args.workers, ) tgt_dict = src_dict else: if args.srcdict: src_dict = dictionary.Dictionary.load(args.srcdict) else: assert ( args.trainpref ), "--trainpref must be set if --srcdict is not specified" src_dict = build_dictionary([train_path(args.source_lang)], args.workers) if target: if args.tgtdict: tgt_dict = dictionary.Dictionary.load(args.tgtdict) else: assert ( args.trainpref ), "--trainpref must be set if --tgtdict is not specified" tgt_dict = build_dictionary([train_path(args.target_lang)], args.workers) src_dict.finalize( threshold=args.thresholdsrc, nwords=args.nwordssrc, padding_factor=args.padding_factor, ) src_dict.save(dict_path(args.source_lang)) elif args.inputtype == "audio": if target: if args.tgtdict: tgt_dict = dictionary.Dictionary.load(args.tgtdict) else: assert ( args.trainpref ), "--trainpref must be set if --tgtdict is not specified" tgt_dict = build_dictionary([train_path(args.target_lang)], args.workers) if target: if not args.joined_dictionary: tgt_dict.finalize( threshold=args.thresholdtgt, nwords=args.nwordstgt, padding_factor=args.padding_factor, ) tgt_dict.save(dict_path(args.target_lang)) def make_binary_dataset(input_prefix, output_prefix, lang, num_workers): dict = dictionary.Dictionary.load(dict_path(lang)) print("| [{}] Dictionary: {} types".format(lang, len(dict) - 1)) n_seq_tok = [0, 0] replaced = Counter() def merge_result(worker_result): replaced.update(worker_result["replaced"]) n_seq_tok[0] += worker_result["nseq"] n_seq_tok[1] += worker_result["ntok"] input_file = "{}{}".format(input_prefix, ("." 
+ lang) if lang is not None else "") offsets = Tokenizer.find_offsets(input_file, num_workers) pool = None if num_workers > 1: pool = Pool(processes=num_workers - 1) for worker_id in range(1, num_workers): prefix = "{}{}".format(output_prefix, worker_id) pool.apply_async( binarize, ( args, input_file, dict, prefix, lang, offsets[worker_id], offsets[worker_id + 1], ), callback=merge_result, ) pool.close() ds = indexed_dataset.IndexedDatasetBuilder( dataset_dest_file(args, output_prefix, lang, "bin")) merge_result( Tokenizer.binarize(input_file, dict, lambda t: ds.add_item(t), offset=0, end=offsets[1])) if num_workers > 1: pool.join() for worker_id in range(1, num_workers): prefix = "{}{}".format(output_prefix, worker_id) temp_file_path = dataset_dest_prefix(args, prefix, lang) ds.merge_file_(temp_file_path) os.remove(indexed_dataset.data_file_path(temp_file_path)) os.remove(indexed_dataset.index_file_path(temp_file_path)) ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx")) print("| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}".format( lang, input_file, n_seq_tok[0], n_seq_tok[1], 100 * sum(replaced.values()) / n_seq_tok[1], dict.unk_word, )) def make_binary_audio_dataset(input_prefix, output_prefix, lang): ds = indexed_dataset.IndexedDatasetBuilder(dataset_dest_file( args, output_prefix, lang, 'bin'), dtype=np.float32) def consumer(tensor): ds.add_item(tensor) def binarize(input_file, audio_reader, consumer): nseq, nsamp = 0, 0 for tensor in audio_reader(input_file): consumer(tensor) nseq += 1 nsamp += tensor.size(0) return {'nseq': nseq, 'nsamp': nsamp} input_file = '{}{}'.format(input_prefix, ('.' + lang) if lang is not None else '') audio_reader = get_reader(args.format) res = binarize(input_file, audio_reader, consumer) print('| [{}] {}: {} audio_seq, {} audio_samples'.format( lang, input_file, res['nseq'], res['nsamp'])) ds.finalize(dataset_dest_file(args, output_prefix, lang, 'idx')) def make_dataset(input_prefix, output_prefix, lang, num_workers=1): if args.output_format == 'binary': if args.inputtype == "text" or lang == args.target_lang: # The target is assumed to always be textual make_binary_dataset(input_prefix, output_prefix, lang, num_workers) else: make_binary_audio_dataset(input_prefix, output_prefix, lang) elif args.output_format == "raw": # Copy original text file to destination folder output_text_file = dest_path( output_prefix + ".{}-{}".format(args.source_lang, args.target_lang), lang, ) shutil.copyfile(file_name(input_prefix, lang), output_text_file) def make_all(lang): if args.trainpref: make_dataset(args.trainpref, "train", lang, num_workers=args.workers) if args.validpref: for k, validpref in enumerate(args.validpref.split(",")): outprefix = "valid{}".format(k) if k > 0 else "valid" make_dataset(validpref, outprefix, lang) if args.testpref: for k, testpref in enumerate(args.testpref.split(",")): outprefix = "test{}".format(k) if k > 0 else "test" make_dataset(testpref, outprefix, lang) make_all(args.source_lang) if target: make_all(args.target_lang) print('| Wrote preprocessed data to {}'.format(args.destdir)) if args.alignfile: if args.inputtype == "audio": raise NotImplementedError("Audio type incompatible with alignment") assert args.trainpref, "--trainpref must be set if --alignfile is specified" src_file_name = train_path(args.source_lang) tgt_file_name = train_path(args.target_lang) src_dict = dictionary.Dictionary.load(dict_path(args.source_lang)) tgt_dict = dictionary.Dictionary.load(dict_path(args.target_lang)) freq_map = {} with 
open(args.alignfile, "r") as align_file: with open(src_file_name, "r") as src_file: with open(tgt_file_name, "r") as tgt_file: for a, s, t in zip_longest(align_file, src_file, tgt_file): si = Tokenizer.tokenize(s, src_dict, add_if_not_exist=False) ti = Tokenizer.tokenize(t, tgt_dict, add_if_not_exist=False) ai = list(map(lambda x: tuple(x.split("-")), a.split())) for sai, tai in ai: srcidx = si[int(sai)] tgtidx = ti[int(tai)] if srcidx != src_dict.unk( ) and tgtidx != tgt_dict.unk(): assert srcidx != src_dict.pad() assert srcidx != src_dict.eos() assert tgtidx != tgt_dict.pad() assert tgtidx != tgt_dict.eos() if srcidx not in freq_map: freq_map[srcidx] = {} if tgtidx not in freq_map[srcidx]: freq_map[srcidx][tgtidx] = 1 else: freq_map[srcidx][tgtidx] += 1 align_dict = {} for srcidx in freq_map.keys(): align_dict[srcidx] = max(freq_map[srcidx], key=freq_map[srcidx].get) with open( os.path.join( args.destdir, "alignment.{}-{}.txt".format(args.source_lang, args.target_lang), ), "w", ) as f: for k, v in align_dict.items(): print("{} {}".format(src_dict[k], tgt_dict[v]), file=f)
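# The multi-worker make_binary_dataset above relies on a module-level binarize() worker plus
# dataset_dest_prefix/dataset_dest_file path helpers that are not part of this snippet. A plausible
# sketch of how they fit together, offered as an assumption: each worker binarizes its byte range of
# the input file into a temporary .bin/.idx shard that the parent process later merges via
# ds.merge_file_(). Only calls already used in the snippet (IndexedDatasetBuilder, Tokenizer.binarize
# with offset/end, ds.finalize) are relied on here.
def dataset_dest_prefix(args, output_prefix, lang):
    base = '{}/{}'.format(args.destdir, output_prefix)
    lang_part = '.{}-{}.{}'.format(args.source_lang, args.target_lang, lang) if lang is not None else ''
    return '{}{}'.format(base, lang_part)

def dataset_dest_file(args, output_prefix, lang, extension):
    return '{}.{}'.format(dataset_dest_prefix(args, output_prefix, lang), extension)

def binarize(args, filename, dict, output_prefix, lang, offset, end):
    # runs in a worker process; its return value feeds merge_result() in the parent
    ds = indexed_dataset.IndexedDatasetBuilder(
        dataset_dest_file(args, output_prefix, lang, 'bin'))
    res = Tokenizer.binarize(filename, dict, lambda t: ds.add_item(t), offset=offset, end=end)
    ds.finalize(dataset_dest_file(args, output_prefix, lang, 'idx'))
    return res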