Example #1
def load_raw_data(data_dir, load_keys):
    """Load the raw {mode}.jsonl.gz files for each key under ``data_dir``."""
    raw_data = {}
    for mode in constants.MODES:
        for key in load_keys:
            mode_data_dir = os.path.join(data_dir, key, '{}.*'.format(mode))
            jsonl_gz_files = PathManager.ls(mode_data_dir)
            # NOTE: raw_data[mode] is overwritten for every key, so only the
            # last key in load_keys is kept for a given mode.
            raw_data[mode] = list(load_jsonl_gzs(jsonl_gz_files))
    return raw_data
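For orientation, a minimal usage sketch for the loader above; the directory path and key names are illustrative assumptions, and constants.MODES is expected to enumerate the dataset splits (e.g. train/valid/test) in the surrounding project.

# Hypothetical call -- the path and load_keys values are placeholders.
raw = load_raw_data(data_dir='~/.ncc/python/raw', load_keys=['code_tokens'])
for mode, records in raw.items():
    print(mode, len(records))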
Example #2
def flatten(raw_dir, lang, mode, flatten_dir, attrs, num_cores):
    """flatten attributes of raw data"""
    LOGGER.info('Flatten attributes({}) of {}-{} dataset'.format(
        attrs, lang, mode))
    with Pool(num_cores) as mpool:
        result = [
            mpool.apply_async(flatten_attrs,
                              (raw_file, flatten_dir, lang, mode, set(attrs)))
            for raw_file in PathManager.ls(
                os.path.join(raw_dir, lang, mode, '*.jsonl.gz'))
        ]
        # Block until every worker finishes; .get() re-raises worker exceptions.
        result = [res.get() for res in result]
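A hedged sketch of driving this helper; every argument value below is an illustrative assumption rather than a value taken from the project's configs.

# Illustrative invocation only.
flatten(raw_dir='~/.ncc/python/raw', lang='python', mode='train',
        flatten_dir='~/.ncc/python/flatten',
        attrs=['code_tokens', 'docstring_tokens'], num_cores=4)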
Example #3
def make_dataset(vocab, input_prefix, output_prefix, lang, num_workers=1):
    # NOTE: this is a closure fragment from main(); it relies on the enclosing
    # args, file_name, dest_path and make_binary_dataset (see Example #6).
    if args['preprocess']['dataset_impl'] == "raw":
        raise NotImplementedError
    else:
        languages = [
            os.path.basename(d)
            for d in PathManager.ls(os.path.dirname(input_prefix))
        ]
        for l in languages:
            in_file = file_name(input_prefix, lang)
            in_file = str.replace(in_file, '*', l)
            out_file = dest_path(os.path.join(l, output_prefix), lang)
            PathManager.mkdir(os.path.dirname(out_file))
            make_binary_dataset(vocab, in_file, out_file, num_workers)
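Since this fragment is a closure taken out of a larger main(), it cannot run on its own; a sketch of how it is typically invoked, mirroring the make_all wrapper in Example #6 below (all argument values are placeholders):

# Inside main(), after src_dict has been built (values are placeholders).
make_dataset(src_dict,
             args['preprocess']['trainpref'],      # e.g. a '.../*/train' prefix
             'train',
             args['preprocess']['source_lang'],
             num_workers=args['preprocess']['workers'])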
Example #4
def xfg(src_dir, languages, dst_dir):
    xfg_src_files = PathManager.ls(os.path.join(src_dir, "kernels_ir", '*.ll'))

    filenames = []
    ir_data = []
    for filename in xfg_src_files:
        # strip the '.ll' extension to recover the benchmark name
        filenames.append(os.path.basename(filename)[:-3])
        with open(filename, 'r') as reader:
            lines = reader.read().splitlines()
        ir_data.append(lines)
    # convert list to dict
    filenames = {name: idx for idx, name in enumerate(filenames)}

    processed_data, _ = inst2vec_preprocess.preprocess(ir_data)
    processed_data, _ = task_utils.inline_struct_types_txt(
        processed_data, ir_data)
    processed_data = task_utils.abstract_statements_from_identifiers_txt(
        processed_data)

    for idx, lines in enumerate(processed_data):
        processed_data[idx] = [
            line for line in lines if
            not re.match(r'((?:<label>:)?(<LABEL>):|; <label>:<LABEL>)', line)
        ]

    for lang in languages:
        raw_file = os.path.join(src_dir, f'{lang}.csv')
        # read raw csv file to load corresponding benchmarks
        data_frame = pd.read_csv(raw_file)
        benchmarks = data_frame["benchmark"].values.tolist()
        datasets = data_frame["dataset"].values.tolist()
        del data_frame

        # write one preprocessed XFG (as a JSON line) per benchmark row;
        # make sure the per-language output directory exists first
        dst_file = os.path.join(dst_dir, lang, 'train.xfg')
        os.makedirs(os.path.dirname(dst_file), exist_ok=True)
        with open(dst_file, 'w') as writer:
            for idx, (bm, ds) in enumerate(zip(benchmarks, datasets)):
                if bm[:3] == "npb":
                    bm += f'_{ds}'
                xfg = processed_data[filenames[bm]]
                print(json_io.json_dumps(xfg), file=writer)
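A hedged sketch of calling the converter above. It assumes src_dir holds a kernels_ir/ folder of .ll files plus one <lang>.csv per language; the concrete paths and language names are illustrative.

# Illustrative only; paths and language names are assumptions.
xfg(src_dir='~/.ncc/opencl/raw',
    languages=['amd', 'nvidia'],
    dst_dir='~/.ncc/opencl/xfg')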
Example #5
def merge_attr_files(flatten_dir, lang, mode, attrs):
    """shell cat"""
    def _merge_files(src_files, tgt_file):
        with file_io.open(tgt_file, 'w') as writer:
            for src_fl in src_files:
                with file_io.open(src_fl, 'r') as reader:
                    shutil.copyfileobj(reader, writer)

    def _get_file_idx(filename):
        filename = os.path.split(filename)[-1]
        idx = int(filename[:str.rfind(filename, '.json')])
        return idx

    for attr in attrs:
        attr_files = PathManager.ls(
            os.path.join(flatten_dir, lang, mode, attr, '*.jsonl'))
        attr_files = sorted(attr_files, key=_get_file_idx)
        assert len(attr_files) > 0, \
            'Attribute({}) files do not exist.'.format(attr)
        dest_file = os.path.join(flatten_dir, lang, '{}.{}'.format(mode, attr))
        _merge_files(attr_files, dest_file)
    PathManager.rm(os.path.join(flatten_dir, lang, mode))
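A minimal sketch of invoking the merger; the attrs list must match the per-attribute shard directories produced by the flatten step in Example #2, and the concrete values below are assumptions.

# Illustrative invocation; values are placeholders.
merge_attr_files(flatten_dir='~/.ncc/python/flatten', lang='python',
                 mode='train', attrs=['code_tokens', 'docstring_tokens'])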
Example #6
def main(args):
    task = tasks.get_task(args['preprocess']['task'])
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'],
                                              args['preprocess']['task']))
    os.makedirs(args['preprocess']['destdir'], exist_ok=True)

    def train_path(lang):
        return "{}{}".format(args['preprocess']['trainpref'],
                             ("." + lang) if lang else "")

    def valid_path(lang):
        return "{}{}".format(args['preprocess']['validpref'],
                             ("." + lang) if lang else "")

    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += ".{lang}".format(lang=lang)
        return fname

    def dest_path(prefix, lang):
        return os.path.join(args['preprocess']['destdir'],
                            file_name(prefix, lang))

    def dict_path(lang):
        return dest_path(lang, "dict") + ".jsonl"

    if not args['preprocess']['srcdict'] and os.path.exists(
            dict_path(args['preprocess']['source_lang'])):
        raise FileExistsError(dict_path(args['preprocess']['source_lang']))

    if args['preprocess']['only_train']:
        LOGGER.info('Generating dictionaries with Train data files.')
    else:
        LOGGER.info(
            'Generating dictionaries with Train/Validation data files.')

    if args['preprocess']['srcdict']:
        src_dict = task.load_dictionary(args['preprocess']['srcdict'])
    else:
        assert args['preprocess'][
            'trainpref'], "--trainpref must be set if --srcdict is not specified"

        filenames = PathManager.ls(
            train_path(args['preprocess']['source_lang']))
        if not args['preprocess']['only_train']:
            filenames.extend(
                PathManager.ls(valid_path(args['preprocess']['source_lang'])))
        src_dict = task.build_dictionary(
            filenames,
            tokenize_func=tokenization.json_tokenizer,
            workers=args['preprocess']['workers'],
            threshold=args['preprocess']['threshold'],
            nwords=args['preprocess']['nwordssrc'],
            padding_factor=args['preprocess']['padding_factor'],
            bos=None,
            eos=None,
        )

    src_dict.save(dict_path(
        args['preprocess']['source_lang']))  # save spm dict to ncc.dictionary
    # copy shared dict into each language's data directory
    for d in PathManager.ls(os.path.dirname(args['preprocess']['trainpref'])):
        lang = os.path.basename(d)
        src_dict.save(
            os.path.join(args['preprocess']['destdir'], lang,
                         f"{args['preprocess']['source_lang']}.dict.jsonl"))

    # 2. ***************build dataset********************
    def make_binary_dataset(vocab, input_file, output_file, num_workers):
        """make binary dataset"""
        # LOGGER.info("[{}] Dictionary: {} types".format(attr, len(vocab) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()  # save un-recorded tokens

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        # Split the input file into num_workers byte ranges. With multi-processing,
        # workers 1..N-1 binarize the 2nd through last chunks asynchronously,
        # e.g. 10 workers on 1.txt: p0 -> bytes 0-99, p1 -> bytes 100-199, ...
        offsets = file_io.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            # workers 1..N-1 each write a temporary (bin, idx) file pair
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_file, worker_id)
                pool.apply_async(binarize,
                                 (args, input_file, vocab, prefix,
                                  offsets[worker_id], offsets[worker_id + 1]),
                                 callback=merge_result)
            pool.close()
        # The main process (p0) binarizes the first chunk (offset 0 .. offsets[1]);
        # when num_workers == 1 this covers the whole file.
        ds_file = '{}.mmap'.format(output_file)
        ds = indexed_dataset.make_builder(
            ds_file,
            impl=args['preprocess']['dataset_impl'],
            vocab_size=len(vocab))

        def consumer(data, _):
            ds.add_item(data)

        merge_result(
            Binarizer.binarize_seperate(
                input_file,
                vocab,
                consumer,
                tokenize=string2tokens,
                offset=0,
                end=offsets[1],
                append_eos=False,
            ))
        if num_workers > 1:
            # p1-pN
            pool.join()
            # merge the worker processes' index/data files into the final files, then delete them
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(output_file, worker_id)
                ds.merge_file_(temp_file_path)
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize('{}.idx'.format(output_file))

        LOGGER.info("{}: {} sents, {} tokens, {:.3}% replaced by {}".format(
            # attr,
            input_file,
            n_seq_tok[0],
            n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1],
            vocab.unk_word,
        ))

    def make_dataset(vocab, input_prefix, output_prefix, lang, num_workers=1):
        if args['preprocess']['dataset_impl'] == "raw":
            raise NotImplementedError
        else:
            languages = [
                os.path.basename(d)
                for d in PathManager.ls(os.path.dirname(input_prefix))
            ]
            for l in languages:
                in_file = file_name(input_prefix, lang)
                in_file = str.replace(in_file, '*', l)
                out_file = dest_path(os.path.join(l, output_prefix), lang)
                PathManager.mkdir(os.path.dirname(out_file))
                make_binary_dataset(vocab, in_file, out_file, num_workers)

    def make_all(lang, vocab):
        if args['preprocess']['trainpref']:
            make_dataset(vocab,
                         args['preprocess']['trainpref'],
                         "train",
                         lang,
                         num_workers=args['preprocess']['workers'])
        if args['preprocess']['validpref']:
            for k, validpref in enumerate(
                    args['preprocess']['validpref'].split(",")):
                outprefix = "valid{}".format(k) if k > 0 else "valid"
                make_dataset(vocab,
                             validpref,
                             outprefix,
                             lang,
                             num_workers=args['preprocess']['workers'])
        if args['preprocess']['testpref']:
            for k, testpref in enumerate(
                    args['preprocess']['testpref'].split(",")):
                outprefix = "test{}".format(k) if k > 0 else "test"
                make_dataset(vocab,
                             testpref,
                             outprefix,
                             lang,
                             num_workers=args['preprocess']['workers'])

    make_all(args['preprocess']['source_lang'], src_dict)
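main() reads everything from a nested args dict. A minimal sketch of the shape it expects is below; the keys mirror the lookups in the code above, while every value (task name, paths, sizes) is an illustrative assumption, since the real values come from the project's config files.

# Hypothetical config dict; values are placeholders.
args = {
    'preprocess': {
        'task': 'summarization',            # assumed task name
        'destdir': '~/.ncc/python/data-mmap',
        'trainpref': '~/.ncc/python/*/train',
        'validpref': '~/.ncc/python/*/valid',
        'testpref': '~/.ncc/python/*/test',
        'source_lang': 'code_tokens',
        'srcdict': None,
        'only_train': False,
        'workers': 4,
        'threshold': 0,
        'nwordssrc': 50000,
        'padding_factor': 8,
        'dataset_impl': 'mmap',
    }
}
main(args)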
Example #7
def main(args):
    task = tasks.get_task(args['preprocess']['task'])
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'],
                                              args['preprocess']['task']))
    os.makedirs(args['preprocess']['destdir'], exist_ok=True)

    def train_path(lang):
        return "{}{}".format(args['preprocess']['trainpref'],
                             ("." + lang) if lang else "")

    def valid_path(lang):
        return "{}{}".format(args['preprocess']['validpref'],
                             ("." + lang) if lang else "")

    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += ".{lang}".format(lang=lang)
        return fname

    def dest_path(prefix, lang):
        return os.path.join(args['preprocess']['destdir'],
                            file_name(prefix, lang))

    def dict_path(lang):
        return dest_path(lang, "dict") + ".jsonl"

    target = not args['preprocess']['only_source']

    if args['preprocess']['srcdict']:
        src_dict = task.load_dictionary(args['preprocess']['srcdict'])
    else:
        assert args['preprocess'][
            'trainpref'], "--trainpref must be set if --srcdict is not specified"

        data_files = train_path(args['preprocess']['source_lang'])
        data_files = PathManager.ls(data_files)

        src_dict = task.build_bpe_dictionary(
            data_files,
            tokenize_func=tokenizers.sub_tokenizer,
            workers=args['preprocess']['workers'],
            threshold=args['preprocess']['thresholdsrc'],
            nwords=args['preprocess']['nwordssrc'],
            padding_factor=args['preprocess']['padding_factor'],
            bos=None,
            eos=None,
            bpe_portion=args['preprocess']['source_bpe_portion'],
        )
    if target:
        if args['preprocess']['tgtdict']:
            tgt_dict = task.load_dictionary(args['preprocess']['tgtdict'])
        else:
            data_files = train_path(args['preprocess']['target_lang'])
            if '*' in data_files:
                data_files = glob(data_files)
            else:
                data_files = [data_files]

            assert args['preprocess'][
                'trainpref'], "--trainpref must be set if --tgtdict is not specified"
            tgt_dict = task.build_bpe_dictionary(
                data_files,
                tokenize_func=tokenizers.lower_tokenizer,
                workers=args['preprocess']['workers'],
                threshold=0,
                nwords=args['preprocess']['nwordstgt'],
                padding_factor=args['preprocess']['padding_factor'],
                bos=None,
                eos=None,
                bpe_portion=args['preprocess']['target_bpe_portion'],
            )
    else:
        tgt_dict = None

    # src_dict.save(dict_path(args['preprocess']['source_lang']))
    # tgt_dict.save(dict_path(args['preprocess']['target_lang']))
    # tgt_dict.save(dict_path("func_name"))  # save target_lang dict for func_name

    # 2. ***************build dataset********************
    def make_binary_dataset(vocab: Dictionary, input_file, output_file,
                            use_func, num_workers: int):
        """make binary dataset"""
        # LOGGER.info("[{}] Dictionary: {} types".format(attr, len(vocab) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()  # save un-recorded tokens

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        offsets = find_offsets(input_file, num_chunks=num_workers)
        func_offsets = None
        modality = input_file.split('.')[-1]
        if modality == 'code_tokens':
            tokenizer = tokenizers.list_tokenizer
            if use_func:
                func_offsets = Binarizer.find_func_offsets(input_file,
                                                           offsets=offsets)
        elif modality == 'func_name':
            tokenizer = tokenizers.func_name_tokenizer
        elif modality == 'docstring_tokens':
            tokenizer = tokenizers.lower_tokenizer
        else:
            raise NotImplementedError(modality)

        pool = None
        if num_workers > 1:
            # workers 1..N-1 each write a temporary (bin, idx) file pair
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_file, worker_id)
                pool.apply_async(binarize, (
                    args,
                    input_file,
                    vocab,
                    prefix,
                    tokenizer,
                    use_func and (modality == 'code_tokens'),
                    offsets[worker_id],
                    offsets[worker_id + 1],
                    func_offsets[worker_id] if func_offsets else 0,
                ),
                                 callback=merge_result)
            pool.close()
        # The main process (p0) binarizes the first chunk (offsets[0] .. offsets[1]);
        # when num_workers == 1 this covers the whole file.
        ds_file = '{}.mmap'.format(output_file)
        ds = indexed_dataset.make_builder(
            ds_file,
            impl=args['preprocess']['dataset_impl'],
            vocab_size=len(vocab))

        merge_result(
            Binarizer.binarize(
                input_file,
                vocab,
                lambda t: ds.add_item(t),
                tokenize=tokenizer,
                use_func=use_func and (modality == 'code_tokens'),
                offset=offsets[0],
                end=offsets[1],
                func_offset=func_offsets[0] if func_offsets else 0,
                append_eos=False,
                min_func_len=args['preprocess']['min_func_len'],
            ))

        if num_workers > 1:
            # p1-pN
            pool.join()
            # merge the worker processes' index/data files into the final files, then delete them
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(output_file, worker_id)
                ds.merge_file_(temp_file_path)
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize('{}.idx'.format(output_file))

        LOGGER.info("{}: {} sents, {} tokens, {:.3}% replaced by {}".format(
            # attr,
            input_file,
            n_seq_tok[0],
            n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1],
            vocab.unk_word,
        ))

    def make_dataset(vocab,
                     input_prefix,
                     output_prefix,
                     lang,
                     use_func=False,
                     num_workers=1):
        if args['preprocess']['dataset_impl'] == "raw":
            raise NotImplementedError
        else:
            in_files = file_name(input_prefix, lang)
            if '*' in in_files:
                in_files = glob(in_files)
            else:
                in_files = [in_files]
            for in_file in in_files:
                # e.g. .../<sub_lang>/train.code_tokens -> "<sub_lang>"
                sub_lang = in_file.split(os.sep)[-2]
                suffix = f'{lang}.wo_func' if (lang == 'code_tokens' and use_func) else lang
                out_file = dest_path(output_prefix, f'{sub_lang}.{suffix}')
                os.makedirs(os.path.dirname(out_file), exist_ok=True)
                make_binary_dataset(vocab, in_file, out_file, use_func,
                                    num_workers)

    def make_all(lang, vocab, use_func=False):
        if args['preprocess']['trainpref']:
            make_dataset(vocab,
                         args['preprocess']['trainpref'],
                         "train",
                         lang,
                         num_workers=args['preprocess']['workers'],
                         use_func=use_func)
        if args['preprocess']['validpref']:
            make_dataset(vocab,
                         args['preprocess']['validpref'],
                         "valid",
                         lang,
                         num_workers=args['preprocess']['workers'],
                         use_func=use_func)
        if args['preprocess']['testpref']:
            make_dataset(vocab,
                         args['preprocess']['testpref'],
                         "test",
                         lang,
                         num_workers=args['preprocess']['workers'],
                         use_func=use_func)

    make_all(args['preprocess']['source_lang'], src_dict)
    make_all(args['preprocess']['source_lang'], src_dict, use_func=True)
    if target:
        make_all(args['preprocess']['target_lang'], tgt_dict)
        make_all('func_name', tgt_dict)  # func_name as query
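As in Example #6, a sketch of the args layout this variant expects; the keys follow the lookups above (source/target BPE dictionaries, use_func handling), and all values are placeholders.

# Hypothetical config; values are placeholders, keys mirror the lookups above.
args = {
    'preprocess': {
        'task': 'retrieval',                 # assumed task name
        'destdir': '~/.ncc/csn/retrieval/data-mmap',
        'trainpref': '~/.ncc/csn/*/train',
        'validpref': '~/.ncc/csn/*/valid',
        'testpref': '~/.ncc/csn/*/test',
        'source_lang': 'code_tokens',
        'target_lang': 'docstring_tokens',
        'only_source': False,
        'srcdict': None, 'tgtdict': None,
        'workers': 4,
        'thresholdsrc': 0,
        'nwordssrc': 50000, 'nwordstgt': 50000,
        'padding_factor': 8,
        'source_bpe_portion': 0.5, 'target_bpe_portion': 0.5,
        'min_func_len': 1,
        'dataset_impl': 'mmap',
    }
}
main(args)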
Example #8
def main(args):
    task = tasks.get_task(args['preprocess']['task'])
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'],
                                              args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])

    def train_path(lang):
        return "{}{}".format(args['preprocess']['trainpref'],
                             ("." + lang) if lang else "")

    def valid_path(lang):
        return "{}{}".format(args['preprocess']['validpref'],
                             ("." + lang) if lang else "")

    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += ".{lang}".format(lang=lang)
        return fname

    def dest_path(prefix, lang):
        return os.path.join(args['preprocess']['destdir'],
                            file_name(prefix, lang))

    def dict_path(lang):
        return dest_path(lang, "dict") + ".jsonl"

    target = not args['preprocess']['only_source']

    if args['preprocess']['joined_dictionary']:
        assert not args['preprocess']['srcdict'] or not args['preprocess']['tgtdict'], \
            "cannot use both --srcdict and --tgtdict with --joined-dictionary"
        if args['preprocess']['srcdict']:
            src_dict = task.load_dictionary(args['preprocess']['srcdict'])
        elif args['preprocess']['tgtdict']:
            src_dict = task.load_dictionary(args['preprocess']['tgtdict'])
        else:
            assert args['preprocess'][
                'trainpref'], "--trainpref must be set if --srcdict is not specified"
            filenames = [
                train_path(args['preprocess']['source_lang']),
                train_path(args['preprocess']['target_lang'])
            ]
            if not args['preprocess']['only_train']:
                filenames.extend([
                    valid_path(args['preprocess']['source_lang']),
                    valid_path(args['preprocess']['target_lang'])
                ])
            src_dict = task.build_dictionary(
                filenames,
                tokenize_func=tokenization.dpu_sub_tokenizer,
                workers=args['preprocess']['workers'],
                threshold=args['preprocess']['threshold'],
                # set max len for joint dictionaries
                nwords=max(args['preprocess']['nwordssrc'],
                           args['preprocess']['nwordstgt']),
            )
        tgt_dict = src_dict

    else:
        if args['preprocess']['srcdict']:
            src_dict = task.load_dictionary(args['preprocess']['srcdict'])
        else:
            assert args['preprocess'][
                'trainpref'], "--trainpref must be set if --srcdict is not specified"

            filenames = PathManager.ls(
                train_path(args['preprocess']['source_lang']))
            if not args['preprocess']['only_train']:
                filenames.extend(
                    PathManager.ls(
                        valid_path(args['preprocess']['source_lang'])))
            src_dict = task.build_dictionary(
                filenames,
                tokenize_func=tokenization.dpu_sub_tokenizer,
                workers=args['preprocess']['workers'],
                threshold=args['preprocess']['thresholdsrc'],
                nwords=args['preprocess']['nwordssrc'],
                padding_factor=args['preprocess']['padding_factor'],
            )
        if target:
            if args['preprocess']['tgtdict']:
                tgt_dict = task.load_dictionary(args['preprocess']['tgtdict'])
            else:
                assert args['preprocess'][
                    'trainpref'], "--trainpref must be set if --tgtdict is not specified"
                filenames = PathManager.ls(
                    train_path(args['preprocess']['target_lang']))
                if not args['preprocess']['only_train']:
                    filenames.extend(
                        PathManager.ls(
                            valid_path(args['preprocess']['target_lang'])))
                tgt_dict = task.build_dictionary(
                    filenames,
                    tokenize_func=tokenization.dpu_sub_tokenizer,
                    workers=args['preprocess']['workers'],
                    threshold=args['preprocess']['thresholdtgt'],
                    nwords=args['preprocess']['nwordstgt'],
                    padding_factor=args['preprocess']['padding_factor'],
                )
        else:
            tgt_dict = None

    src_dict.save(dict_path(
        args['preprocess']['source_lang']))  # save spm dict to ncc.dictionary
    if target and tgt_dict is not None:
        tgt_dict.save(dict_path(args['preprocess']['target_lang']))

    # 2. ***************build dataset********************
    def make_binary_dataset(vocab: Dictionary, input_file, output_file,
                            num_workers: int):
        """make binary dataset"""
        # LOGGER.info("[{}] Dictionary: {} types".format(attr, len(vocab) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()  # save un-recorded tokens

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        # Split the input file into num_workers byte ranges. With multi-processing,
        # workers 1..N-1 binarize the 2nd through last chunks asynchronously,
        # e.g. 10 workers on 1.txt: p0 -> bytes 0-99, p1 -> bytes 100-199, ...
        offsets = Binarizer.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            # workers 1..N-1 each write a temporary (bin, idx) file pair
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_file, worker_id)
                pool.apply_async(binarize,
                                 (args, input_file, vocab, prefix,
                                  offsets[worker_id], offsets[worker_id + 1]),
                                 callback=merge_result)
            pool.close()
        # The main process (p0) binarizes the first chunk (offset 0 .. offsets[1]);
        # when num_workers == 1 this covers the whole file.
        ds_file = '{}.mmap'.format(output_file)
        ds = indexed_dataset.make_builder(
            ds_file,
            impl=args['preprocess']['dataset_impl'],
            vocab_size=len(vocab))
        merge_result(
            Binarizer.binarize(
                input_file,
                vocab,
                lambda t: ds.add_item(t),
                tokenize=tokenization.dpu_sub_tokenizer,
                offset=0,
                end=offsets[1],
                append_eos=False,
            ))
        if num_workers > 1:
            # p1-pN
            pool.join()
            # merge the worker processes' index/data files into the final files, then delete them
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(output_file, worker_id)
                ds.merge_file_(temp_file_path)
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize('{}.idx'.format(output_file))

        LOGGER.info("{}: {} sents, {} tokens, {:.3}% replaced by {}".format(
            # attr,
            input_file,
            n_seq_tok[0],
            n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1],
            vocab.unk_word,
        ))

    def make_dataset(vocab,
                     input_prefix,
                     output_prefix,
                     lang,
                     out_file=None,
                     num_workers=1):
        if args['preprocess']['dataset_impl'] == "raw":
            raise NotImplementedError
        else:
            in_file = file_name(input_prefix, lang)
            if out_file is None:
                out_file = dest_path(output_prefix, lang)
            PathManager.mkdir(os.path.dirname(out_file))
            make_binary_dataset(vocab, in_file, out_file, num_workers)

    def make_all(lang, vocab):
        for l in os.listdir(args['preprocess']['trainpref'].split('*')[0]):
            # copy shared dict into each languages
            out_dir = os.path.join(args['preprocess']['destdir'], l)
            PathManager.mkdir(out_dir)
            dst_dict = os.path.join(out_dir, f'{lang}.dict.jsonl')
            PathManager.copy(dict_path(lang), dst_dict)

            if args['preprocess']['trainpref']:
                out_file = os.path.join(out_dir, f"train.{lang}")
                make_dataset(vocab,
                             args['preprocess']['trainpref'].replace('*', l),
                             "train",
                             lang,
                             out_file=out_file,
                             num_workers=args['preprocess']['workers'])
            if args['preprocess']['validpref']:
                out_file = os.path.join(out_dir, f"valid.{lang}")
                make_dataset(vocab,
                             args['preprocess']['validpref'].replace('*', l),
                             'valid',
                             lang,
                             out_file=out_file,
                             num_workers=args['preprocess']['workers'])
            if args['preprocess']['testpref']:
                out_file = os.path.join(out_dir, f"test.{lang}")
                make_dataset(vocab,
                             args['preprocess']['testpref'].replace('*', l),
                             'test',
                             lang,
                             out_file=out_file,
                             num_workers=args['preprocess']['workers'])

    make_all(args['preprocess']['source_lang'], src_dict)
    if target:
        make_all(args['preprocess']['target_lang'], tgt_dict)
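Again a hedged sketch of the expected args; the notable addition over Example #7 is the joined_dictionary flag, which makes source and target share one dictionary. All values are placeholders.

# Hypothetical config; values are placeholders.
args = {
    'preprocess': {
        'task': 'summarization',             # assumed task name
        'destdir': '~/.ncc/csn/summarization/data-mmap',
        'trainpref': '~/.ncc/csn/*/train',
        'validpref': '~/.ncc/csn/*/valid',
        'testpref': '~/.ncc/csn/*/test',
        'source_lang': 'code_tokens',
        'target_lang': 'docstring_tokens',
        'only_source': False,
        'only_train': False,
        'joined_dictionary': True,           # share one dict across source/target
        'srcdict': None, 'tgtdict': None,
        'workers': 4,
        'threshold': 0, 'thresholdsrc': 0, 'thresholdtgt': 0,
        'nwordssrc': 50000, 'nwordstgt': 50000,
        'padding_factor': 8,
        'dataset_impl': 'mmap',
    }
}
main(args)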