def binarize(args, filename, vocab, output_prefix, lang, offset, end, append_eos=True):
    ds = indexed_dataset.make_builder(
        dataset_dest_file(args, output_prefix, lang, "bin"),
        impl=args.dataset_impl,
        vocab_size=len(vocab),
    )

    def consumer(tensor):
        ds.add_item(tensor)

    res = Binarizer.binarize(filename, vocab, consumer, append_eos=append_eos,
                             offset=offset, end=end)
    ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))
    return res
def binarize(args, filename, vocab, output_prefix, lang, offset, end, append_eos=True, copy_from=None):
    ds = indexed_dataset.IndexedDatasetBuilder(
        dataset_dest_file(args, output_prefix, lang, "bin"))
    words_list = []  # TODO: currently there is no way to pass this back to the caller

    def consumer(ids, words):
        ds.add_item(ids)
        words_list.append(words)

    res = Binarizer.binarize(filename, vocab, consumer, append_eos=append_eos,
                             offset=offset, end=end)
    ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))
    return res
def binarize(args, filename, vocab, output_prefix, lang, offset, end, append_eos=True):
    ds = indexed_dataset.make_builder(
        dataset_dest_file(args, output_prefix, lang, "bin"),
        impl=args.dataset_impl,
        vocab_size=len(vocab))

    def consumer(tensor):
        ds.add_item(tensor)

    tk = tokenize_smiles if args.file_format == 'smiles' else tokenize_line
    res = Binarizer.binarize(filename, vocab, consumer, tokenize=tk,
                             append_eos=append_eos, offset=offset, end=end)
    ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))
    return res
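The variant above swaps the tokenizer based on args.file_format. Neither helper is shown here: tokenize_line is fairseq's default whitespace tokenizer, and the following is only a plausible sketch of a tokenize_smiles helper, assuming the widely used regex-based SMILES tokenization; the original implementation may differ.

import re

# Hypothetical tokenize_smiles sketch; the regex is the common SMILES tokenization
# pattern (bracketed atoms, two-letter halogens, ring-closure labels, bond symbols).
_SMILES_PATTERN = re.compile(
    r"(\[[^\]]+\]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>|\*|\$|%[0-9]{2}|[0-9])"
)

def tokenize_smiles(line):
    # Split a SMILES string into chemically meaningful tokens rather than on whitespace.
    return _SMILES_PATTERN.findall(line.strip())

# Example:
#   tokenize_smiles("CC(=O)Oc1ccccc1")
#   -> ['C', 'C', '(', '=', 'O', ')', 'O', 'c', '1', 'c', 'c', 'c', 'c', 'c', '1']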
def make_binary_dataset(vocab, input_prefix, output_prefix, src_lang, tgt_lang, lang, num_workers):
    print("| [{}] Dictionary: {} types".format(lang, len(vocab) - 1))
    n_seq_tok = [0, 0]
    replaced = Counter()

    def merge_result(worker_result):
        replaced.update(worker_result["replaced"])
        n_seq_tok[0] += worker_result["nseq"]
        n_seq_tok[1] += worker_result["ntok"]

    input_file = "{}.{}-{}.{}".format(input_prefix, src_lang, tgt_lang, lang)
    if args.model:
        input_file += ".tok"
    offsets = Binarizer.find_offsets(input_file, num_workers)
    pool = None
    if num_workers > 1:
        pool = multiprocessing.Pool(processes=num_workers - 1)
        for worker_id in range(1, num_workers):
            prefix = "{}{}".format(output_prefix, worker_id)
            pool.apply_async(
                binarize,
                (args, input_file, vocab, prefix, src_lang, tgt_lang, lang,
                 offsets[worker_id], offsets[worker_id + 1]),
                callback=merge_result)
        pool.close()

    ds = indexed_dataset.make_builder(
        dataset_dest_file(args, output_prefix, src_lang, tgt_lang, lang, "bin"),
        impl=args.dataset_impl,
        vocab_size=len(vocab))
    merge_result(
        Binarizer.binarize(input_file, vocab, lambda t: ds.add_item(t),
                           offset=0, end=offsets[1]))
    if num_workers > 1:
        pool.join()
        for worker_id in range(1, num_workers):
            prefix = "{}{}".format(output_prefix, worker_id)
            temp_file_path = dataset_dest_prefix(args, prefix, src_lang, tgt_lang, lang)
            ds.merge_file_(temp_file_path)
            os.remove(indexed_dataset.data_file_path(temp_file_path))
            os.remove(indexed_dataset.index_file_path(temp_file_path))

    ds.finalize(
        dataset_dest_file(args, output_prefix, src_lang, tgt_lang, lang, "idx"))

    print("| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}".format(
        lang,
        input_file,
        n_seq_tok[0],
        n_seq_tok[1],
        100 * sum(replaced.values()) / n_seq_tok[1],
        vocab.unk_word,
    ))
def binarize(args, filename, dict, output_prefix, lang, offset, end):
    ds = indexed_dataset.IndexedDatasetBuilder(
        dataset_dest_file(args, output_prefix, lang, 'bin'))

    def consumer(tensor):
        ds.add_item(tensor)

    res = Binarizer.binarize(filename, dict, consumer, offset=offset, end=end)
    ds.finalize(dataset_dest_file(args, output_prefix, lang, 'idx'))
    return res
def binarize(args, filename, vocab, output_prefix, lang, offset, end, tgt_dict):
    append_eos = True
    ds = indexed_dataset.IndexedDatasetBuilder(
        dataset_dest_file(args, output_prefix, lang, "bin"))

    def consumer(tensor):
        ds.add_item(tensor)

    res = Binarizer.binarize(filename, vocab, consumer, append_eos=append_eos,
                             reverse_order=args.reverse_order,
                             offset=offset, end=end, tgt_dict=tgt_dict)
    ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))
    return res
def make_binary_dataset(input_prefix, output_prefix, lang, num_workers):
    dict = dictionary.Dictionary.load(dict_path(lang))
    print('| [{}] Dictionary: {} types'.format(lang, len(dict) - 1))
    n_seq_tok = [0, 0]
    replaced = Counter()

    def merge_result(worker_result):
        replaced.update(worker_result['replaced'])
        n_seq_tok[0] += worker_result['nseq']
        n_seq_tok[1] += worker_result['ntok']

    input_file = '{}{}'.format(input_prefix, ('.' + lang) if lang is not None else '')
    offsets = Binarizer.find_offsets(input_file, num_workers)
    pool = None
    if num_workers > 1:
        pool = Pool(processes=num_workers - 1)
        for worker_id in range(1, num_workers):
            prefix = "{}{}".format(output_prefix, worker_id)
            pool.apply_async(binarize,
                             (args, input_file, dict, prefix, lang,
                              offsets[worker_id], offsets[worker_id + 1]),
                             callback=merge_result)
        pool.close()

    ds = indexed_dataset.IndexedDatasetBuilder(
        dataset_dest_file(args, output_prefix, lang, 'bin'))
    merge_result(Binarizer.binarize(input_file, dict, lambda t: ds.add_item(t),
                                    offset=0, end=offsets[1]))
    if num_workers > 1:
        pool.join()
        for worker_id in range(1, num_workers):
            prefix = "{}{}".format(output_prefix, worker_id)
            temp_file_path = dataset_dest_prefix(args, prefix, lang)
            ds.merge_file_(temp_file_path)
            os.remove(indexed_dataset.data_file_path(temp_file_path))
            os.remove(indexed_dataset.index_file_path(temp_file_path))

    ds.finalize(dataset_dest_file(args, output_prefix, lang, 'idx'))

    print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
        lang,
        input_file,
        n_seq_tok[0],
        n_seq_tok[1],
        100 * sum(replaced.values()) / n_seq_tok[1],
        dict.unk_word))
def binarize(args, filename, dict, output_prefix, lang, offset, end, append_eos=True):
    ds = indexed_dataset.IndexedDatasetBuilder(
        dataset_dest_file(args, output_prefix, lang, "bin"))

    def consumer(tensor):
        ds.add_item(tensor)

    res = Binarizer.binarize(filename, dict, consumer, offset=offset, end=end,
                             append_eos=append_eos)
    ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))
    return res
def binarize(args, filename, vocab, output_prefix, lang, offset, end, append_eos=True):
    # dataset_impl=mmap -> ds is an MMapIndexedDatasetBuilder
    # dataset_impl=lazy -> ds is an IndexedDatasetBuilder
    ds = indexed_dataset.make_builder(
        dataset_dest_file(args, output_prefix, lang, "bin"),
        impl=args.dataset_impl,
        vocab_size=len(vocab))

    def consumer(tensor):
        # The input tensor is simply the text line mapped to an id sequence via the dictionary.
        # dataset_impl=mmap: MMapIndexedDatasetBuilder.add_item writes the tensor straight to the file.
        # dataset_impl=lazy: IndexedDatasetBuilder.add_item writes the tensor to the file and
        #   updates sizes, data_offsets and dim_offsets.
        ds.add_item(tensor)

    # Read the part of `filename` between offset and end, convert each text line to an id
    # sequence with the dictionary, and feed it into ds through `consumer`.
    res = Binarizer.binarize(filename, vocab, consumer, append_eos=append_eos,
                             offset=offset, end=end)

    # Flush the data accumulated in ds to a temporary file at the corresponding path;
    # output_prefix contains the worker_id, so different workers write to different temp files.
    # mmap: ds.finalize writes the MMapIndexedDataset index, storing three arrays:
    #   the number of examples, the size of each example tensor, and the pointer to each example.
    # lazy: IndexedDatasetBuilder.finalize writes dim_offsets, data_offsets and sizes directly.
    #   data_offsets: end position of each tensor in the binary file
    #     (the previous tensor's end is the next tensor's start)
    #   sizes: the dimension values of each tensor's shape
    #   dim_offsets: end position of each tensor's shape within self.sizes
    #     (the end of the previous shape is the start of the next one)
    ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))
    return res
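To make the comments above concrete, here is a minimal sketch of reading a finalized .bin/.idx pair back. It assumes fairseq's indexed_dataset.make_dataset helper and Dictionary class (exact signatures vary across fairseq versions), and the paths are hypothetical stand-ins for what dataset_dest_file would produce.

from fairseq.data import Dictionary, indexed_dataset

# Hypothetical paths; in the functions above the prefix comes from dataset_dest_file(...).
prefix = "data-bin/train.src-tgt.src"      # assumed prefix of the .bin/.idx pair
vocab = Dictionary.load("data-bin/dict.src.txt")

# make_dataset picks the matching reader (mmap vs. lazy) for the files on disk.
dataset = indexed_dataset.make_dataset(prefix, impl="mmap")
print(len(dataset), "examples")
print(vocab.string(dataset[0]))            # decode the first id sequence back to tokens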
def make_binary_dataset(vocab, input_prefix, output_prefix, lang, num_workers):
    logger.info("[{}] Dictionary: {} types".format(lang, len(vocab)))
    output_prefix += '.bert' if isinstance(vocab, BertTokenizer) else ''
    input_prefix += '.bert' if isinstance(vocab, BertTokenizer) else ''
    n_seq_tok = [0, 0]
    replaced = Counter()

    def merge_result(worker_result):
        replaced.update(worker_result["replaced"])
        n_seq_tok[0] += worker_result["nseq"]
        n_seq_tok[1] += worker_result["ntok"]

    input_file = "{}{}".format(input_prefix, ("." + lang) if lang is not None else "")
    offsets = Binarizer.find_offsets(input_file, num_workers)
    pool = None
    if num_workers > 1:
        pool = Pool(processes=num_workers - 1)
        for worker_id in range(1, num_workers):
            prefix = "{}{}".format(output_prefix, worker_id)
            pool.apply_async(
                binarize,
                (
                    args,
                    input_file,
                    vocab,
                    prefix,
                    lang,
                    offsets[worker_id],
                    offsets[worker_id + 1],
                ),
                callback=merge_result,
            )
        pool.close()

    ds = indexed_dataset.make_builder(
        dataset_dest_file(args, output_prefix, lang, "bin"),
        impl=args.dataset_impl,
        vocab_size=len(vocab),
    )
    merge_result(
        Binarizer.binarize(input_file, vocab, lambda t: ds.add_item(t),
                           offset=0, end=offsets[1]))
    if num_workers > 1:
        pool.join()
        for worker_id in range(1, num_workers):
            prefix = "{}{}".format(output_prefix, worker_id)
            temp_file_path = dataset_dest_prefix(args, prefix, lang)
            ds.merge_file_(temp_file_path)
            os.remove(indexed_dataset.data_file_path(temp_file_path))
            os.remove(indexed_dataset.index_file_path(temp_file_path))

    ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))

    logger.info(
        "[{}] {}: {} sents, {} tokens, {:.3}% replaced by {}".format(
            lang,
            input_file,
            n_seq_tok[0],
            n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1],
            vocab.unk_word,
        ))
def make_binary_dataset(vocab, input_prefix, output_prefix, lang, num_workers, avoid_tokenize=False):
    if vocab is not None:
        print("| [{}] Dictionary: {} types".format(lang, len(vocab) - 1))
    else:
        print('| No dictionary given; only string splitting is performed.')
    n_seq_tok = [0, 0]
    replaced = Counter()

    def merge_result(worker_result):
        replaced.update(worker_result["replaced"])
        n_seq_tok[0] += worker_result["nseq"]
        n_seq_tok[1] += worker_result["ntok"]

    input_file = "{}{}".format(
        input_prefix, ("." + lang) if lang is not None else ""
    )
    offsets = Binarizer.find_offsets(input_file, num_workers)
    pool = None
    if num_workers > 1:
        pool = Pool(processes=num_workers - 1)
        for worker_id in range(1, num_workers):
            # TODO: num_workers > 1 is not working for map datasets
            if args.input_mapping is True:
                raise NotImplementedError("Worker > 1 is not implemented for map dataset yet.")
            prefix = "{}{}".format(output_prefix, worker_id)
            pool.apply_async(
                binarize,
                (
                    args,
                    input_file,
                    vocab,
                    prefix,
                    lang,
                    offsets[worker_id],
                    offsets[worker_id + 1],
                    avoid_tokenize,
                ),
                callback=merge_result,
            )
        pool.close()

    ds = indexed_dataset.make_builder(
        dataset_dest_file(args, output_prefix, lang, "bin"),
        impl=args.dataset_impl,
        vocab_size=len(vocab) if vocab is not None else -1,
    )
    merge_result(
        Binarizer.binarize(
            input_file,
            vocab,
            lambda t: ds.add_item(t),
            offset=0,
            end=offsets[1],
            avoid_tokenize=avoid_tokenize,
        )
    )
    if num_workers > 1:
        pool.join()
        for worker_id in range(1, num_workers):
            prefix = "{}{}".format(output_prefix, worker_id)
            temp_file_path = dataset_dest_prefix(args, prefix, lang)
            ds.merge_file_(temp_file_path)
            os.remove(indexed_dataset.data_file_path(temp_file_path))
            os.remove(indexed_dataset.index_file_path(temp_file_path))

    ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))

    if vocab is not None:
        unk = vocab.unk_word if hasattr(vocab, 'unk_word') else vocab.unk_token
    else:
        unk = ""
    logger.info(
        "[{}] {}: {} sents, {} tokens, {:.3}% replaced by {}".format(
            lang,
            input_file,
            n_seq_tok[0],
            n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1],
            unk,
        )
    )
def make_binary_dataset(vocab, input_prefix, output_prefix, lang, num_workers, copy_src_words=None):
    print("| [{}] Dictionary: {} types".format(lang, len(vocab) - 1))
    n_seq_tok = [0, 0]
    replaced = Counter()
    copied = Counter()

    def merge_result(worker_result):
        replaced.update(worker_result["replaced"])
        copied.update(worker_result["copied"])
        n_seq_tok[0] += worker_result["nseq"]
        n_seq_tok[1] += worker_result["ntok"]

    input_file = "{}{}".format(
        input_prefix, ("." + lang) if lang is not None else ""
    )
    offsets = Binarizer.find_offsets(input_file, num_workers)
    pool = None
    if num_workers > 1:  # TODO: copying from the source is not supported in multi-worker mode
        pool = Pool(processes=num_workers - 1)
        for worker_id in range(1, num_workers):
            prefix = "{}{}".format(output_prefix, worker_id)
            pool.apply_async(
                binarize,
                (
                    args,
                    input_file,
                    vocab,
                    prefix,
                    lang,
                    offsets[worker_id],
                    offsets[worker_id + 1]
                ),
                callback=merge_result
            )
        pool.close()

    ds = indexed_dataset.IndexedDatasetBuilder(
        dataset_dest_file(args, output_prefix, lang, "bin")
    )
    words_list = []

    def binarize_consumer(ids, words):
        ds.add_item(ids)
        words_list.append(words)

    merge_result(
        Binarizer.binarize(
            input_file,
            vocab,
            binarize_consumer,
            offset=0,
            end=offsets[1],
            copy_ext_dict=args.copy_ext_dict,
            copy_src_words=copy_src_words
        )
    )
    if num_workers > 1:
        pool.join()
        for worker_id in range(1, num_workers):
            prefix = "{}{}".format(output_prefix, worker_id)
            temp_file_path = dataset_dest_prefix(args, prefix, lang)
            ds.merge_file_(temp_file_path)
            os.remove(indexed_dataset.data_file_path(temp_file_path))
            os.remove(indexed_dataset.index_file_path(temp_file_path))

    ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))

    print(
        "| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}, {:.3}% <unk> copied from src".format(
            lang,
            input_file,
            n_seq_tok[0],
            n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1],
            vocab.unk_word,
            100 * sum(copied.values()) / n_seq_tok[1]
        )
    )
    return words_list
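All of the make_binary_dataset variants above read `args` from an enclosing scope and rely on path helpers such as dataset_dest_file. For orientation, the following self-contained sketch runs the core single-worker path using only the fairseq calls that appear above (Dictionary, Binarizer.binarize, indexed_dataset.make_builder); the paths are hypothetical and the exact signatures may differ between fairseq versions.

from fairseq.data import Dictionary, indexed_dataset
from fairseq.binarizer import Binarizer

# Hypothetical inputs; in the code above these come from args and the path helpers.
input_file = "corpus/train.src"            # one sentence per line
dict_path = "data-bin/dict.src.txt"
out_prefix = "data-bin/train.src-tgt.src"  # produces out_prefix.bin / out_prefix.idx

vocab = Dictionary.load(dict_path)
ds = indexed_dataset.make_builder(out_prefix + ".bin", impl="mmap", vocab_size=len(vocab))

# Single-worker equivalent of the pool logic above: binarize the whole file in one pass.
stats = Binarizer.binarize(input_file, vocab, lambda t: ds.add_item(t), offset=0, end=-1)
ds.finalize(out_prefix + ".idx")

print("{nseq} sentences, {ntok} tokens, {nunk} unknown".format(**stats))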