def cli_main():
    """Entry point: parse CLI options, load the YAML config and run training."""
    import argparse
    parser = argparse.ArgumentParser(
        description="Downloading/Decompressing CodeSearchNet dataset(s) or Tree-Sitter Library(ies)"
    )
    parser.add_argument(
        "--yaml_file", "-f", type=str,
        help="load {language}.yml for train",
        default='config/csn_feng/ruby',
    )
    parser.add_argument(
        '--out_file', '-o', type=str,
        help='output generated file',
        default=None,
    )
    args = parser.parse_args()

    yaml_file = os.path.join(os.path.dirname(__file__), f"{args.yaml_file}.yml")
    # expand '~' in the output path only when one was given
    if args.out_file is None:
        out_file = None
    else:
        out_file = recursive_expanduser(args.out_file)
    LOGGER.info('Load arguments in {}'.format(yaml_file))
    args = load_yaml(yaml_file)
    LOGGER.info(args)
    main(args, out_file)
def cli_main():
    """Demo entry point: run a pretrained code-summarization model on a hard-coded snippet.

    The default model is a seq2seq summarizer trained on python_wan; the default
    input is a Python decorator source string. Both can be overridden via CLI.
    """
    # alternative checkpoint (kept for reference):
    # modal_path = '~/.ncc/demo/summarization/neural_transformer/python_wan.pt'
    modal_path = '~/.ncc/demo/summarization/seq2seq/python_wan.pt'
    # demo input: source code of a `positional` decorator (tabs encoded as \t)
    code = "def positional(max_positional_args):\n\tdef positional_decorator(wrapped):\n\t\[email protected](wrapped)\n\t\tdef positional_wrapper(*args, **kwargs):\n\t\t\tif (len(args) > max_positional_args):\n\t\t\t\tplural_s = ''\n\t\t\t\tif (max_positional_args != 1):\n\t\t\t\t\tplural_s = 's'\n\t\t\t\tmessage = ('%s()\ttakes\tat\tmost\t%d\tpositional\targument%s\t(%d\tgiven)' % (wrapped.__name__, max_positional_args, plural_s, len(args)))\n\t\t\t\tif (positional_parameters_enforcement == POSITIONAL_EXCEPTION):\n\t\t\t\t\traise TypeError(message)\n\t\t\t\telif (positional_parameters_enforcement == POSITIONAL_WARNING):\n\t\t\t\t\tlogger.warning(message)\n\t\t\t\telse:\n\t\t\t\t\tpass\n\t\t\treturn wrapped(*args, **kwargs)\n\t\treturn positional_wrapper\n\tif isinstance(max_positional_args, six.integer_types):\n\t\treturn positional_decorator\n\telse:\n\t\t(args, _, _, defaults) = inspect.getargspec(max_positional_args)\n\t\treturn positional((len(args) - len(defaults)))(max_positional_args)"
    # ground truth: "a decorator to declare that only the first n arguments my be positional ."
    # alternative (code completion) demo, kept for reference:
    # modal_path = '~/.ncc/demo/completion/seqrnn/py150.pt'
    # code = "body_content = self._serialize.body(parameters, 'ServicePrincipalCreateParameters')\nrequest = self._client.post(url, query_parameters)\nresponse = self._client.send( request, header_parameters, body_content, operation_config)"
    # ground truth: "(request, header_parameters, body_content, **operation_config)"
    import argparse
    parser = argparse.ArgumentParser(description="Command Interface")
    parser.add_argument("--model", "-m", type=str, help="pytorch model path", default=modal_path)
    parser.add_argument("--input", "-i", type=str, help="model input", default=code)
    args = parser.parse_args()
    # resolve '~' so torch can open the checkpoint file
    args.model = os.path.expanduser(args.model)
    model_output = main(args.model, args.input)
    LOGGER.info(model_output)
def load_langpair_dataset(
    data_path,
    split,
    src,
    src_dict,
    tgt,
    tgt_dict,
    dataset_impl,
    left_pad_source,
    left_pad_target,
    max_source_positions,
    max_target_positions,
    prepend_bos=False,
    load_alignments=False,
    truncate_source=False,
    append_source_id=False,
    truncate_target=False,
    append_eos_to_target=False,
    portion=None,
):
    """Load a source/target language pair split and wrap it as a GraphLanguagePairDataset.

    Args:
        data_path: directory containing '{split}.{lang}' binarized files.
        split: split name ('train', 'valid', 'test').
        src/tgt: source/target language suffixes; src_dict/tgt_dict their dictionaries.
        dataset_impl: indexed-dataset backend name.
        portion: when set and split == 'train', keep only this fraction of examples.

    Returns:
        GraphLanguagePairDataset over the (possibly transformed) src/tgt datasets.
    """
    src_path = os.path.join(data_path, '{}.{}'.format(split, src))
    src_dataset = _load_dataset(path=src_path, impl=dataset_impl, dict=src_dict)
    if portion is not None and split == 'train':
        LOGGER.info('set {}.{} portion to {}'.format(split, src, portion))
        src_dataset = PortionDataset(src_dataset, portion)

    tgt_path = os.path.join(data_path, '{}.{}'.format(split, tgt))
    tgt_dataset = _load_dataset(path=tgt_path, impl=dataset_impl, dict=tgt_dict)
    # NOTE: tgt_dataset may be None (the prepend/append paths below already guard
    # for that); every other target-side transform must be guarded too.
    if truncate_target and tgt_dataset is not None:
        LOGGER.info('truncate {}.{} to {}'.format(split, tgt, max_target_positions))
        tgt_dataset = TruncateDataset(tgt_dataset, max_target_positions)
    if prepend_bos:
        assert hasattr(src_dict, "bos_index") and hasattr(tgt_dict, "bos_index")
        src_dataset = PrependTokenDataset(src_dataset, src_dict.bos())
        if tgt_dataset is not None:
            tgt_dataset = PrependTokenDataset(tgt_dataset, tgt_dict.bos())
    eos = None
    if append_source_id:
        if tgt_dataset is not None:
            tgt_dataset = AppendTokenDataset(tgt_dataset, tgt_dict.index('[{}]'.format(tgt)))
        eos = tgt_dict.index('[{}]'.format(tgt))
    if portion is not None and split == 'train' and tgt_dataset is not None:
        LOGGER.info('set {}.{} portion to {}'.format(split, tgt, portion))
        tgt_dataset = PortionDataset(tgt_dataset, portion)

    tgt_dataset_sizes = tgt_dataset.sizes if tgt_dataset is not None else None
    LOGGER.info('loaded {} examples from: {}'.format(len(src_dataset), src_path))
    # BUGFIX: len(tgt_dataset) was previously taken unconditionally and crashed
    # with TypeError when the target side was missing.
    if tgt_dataset is not None:
        LOGGER.info('loaded {} examples from: {}'.format(len(tgt_dataset), tgt_path))
    return GraphLanguagePairDataset(
        src_dataset, src_dataset.sizes, src_dict,
        tgt_dataset, tgt_dataset_sizes, tgt_dict,
        left_pad_source=left_pad_source,
        left_pad_target=left_pad_target,
        max_source_positions=max_source_positions,
        max_target_positions=max_target_positions,
        align_dataset=None,
        eos=eos,
        remove_eos_from_source=True,
        append_eos_to_target=append_eos_to_target,
        shuffle=True,
    )
def build_dataset(args: Dict, src_dicts: Dict[str, Dictionary], tgt_dict: Dictionary):
    """Build a dataset for every (modality, language) combination in the config."""
    data_prefs_by_lang = args['preprocess']['dataprefs']
    for modality, modality_dict in src_dicts.items():
        LOGGER.info('Building dataset for {}'.format(modality))
        for lang, data_prefs in data_prefs_by_lang.items():
            make_all(modality, modality_dict, lang, data_prefs)
def setup_task(cls, args, **kwargs):
    """Setup the task (e.g., load dictionaries).

    Args:
        args (argparse.Namespace): parsed command-line arguments

    Raises:
        NotImplementedError: when a custom dict/dict_type is configured.
    """
    paths = utils.split_paths(args['task']['data'])
    assert len(paths) > 0
    # renamed from `dict`: never shadow the builtin `dict`
    custom_dict = args['task'].get('dict', None)
    custom_dict_type = args['task'].get('dict_type', None)
    if custom_dict is None and custom_dict_type is None:
        # load dictionaries
        src_dict = cls.load_dictionary(
            os.path.join(paths[0], '{}.dict.jsonl'.format(args['task']['source_lang'])))
        tgt_dict = cls.load_dictionary(
            os.path.join(paths[0], '{}.dict.jsonl'.format(args['task']['target_lang'])))
        # both dictionaries must agree on special symbols
        assert src_dict.pad() == tgt_dict.pad()
        assert src_dict.eos() == tgt_dict.eos()
        assert src_dict.unk() == tgt_dict.unk()
        LOGGER.info('[{}] dictionary: {} types'.format(
            args['task']['source_lang'], len(src_dict)))
        LOGGER.info('[{}] dictionary: {} types'.format(
            args['task']['target_lang'], len(tgt_dict)))
    else:
        # custom dictionaries are not supported (yet)
        raise NotImplementedError
    return cls(args, src_dict, tgt_dict)
def setup_task(cls, args, **kwargs):
    """Set up the task: load the shared dictionary and register language tokens."""
    paths = utils.split_paths(args['task']['data'])
    assert len(paths) > 0
    data_path = paths[0]
    dictionary = Dictionary.load(os.path.join(data_path, 'dict.jsonl'))
    configured = args['task']['langs']
    if configured is None:
        # discover languages from the data directory's sub-directories
        languages = sorted(
            entry for entry in os.listdir(data_path)
            if os.path.isdir(os.path.join(data_path, entry))
        )
    else:
        languages = configured
    if args['task']['add_lang_token']:
        # one sentinel token per language, e.g. '[python]'
        for lang in languages:
            dictionary.add_symbol('[{}]'.format(lang))
    LOGGER.info("Loading dictionary: {} types".format(len(dictionary)))
    return cls(args, dictionary)
def _inference_with_bleu(self, generator, sample, model):
    """Generate hypotheses for `sample` and score them against references with sacreBLEU.

    Returns the sacrebleu BLEU result object for the whole batch.
    """
    import sacrebleu

    def decode(toks, escape_unk=False):
        # detokenize a tensor of token ids back to a string, optionally
        # escaping <unk> so it can never match the hypothesis side
        s = self.tgt_dict.string(
            toks.int().cpu(),
            self.args['task']['eval_bleu_remove_bpe'],
            escape_unk=escape_unk,
        )
        if self.tokenizer:
            s = self.tokenizer.decode(s)
        return s

    gen_out = self.inference_step(generator, [model], sample, None)
    hyps, refs = [], []
    for i in range(len(gen_out)):
        # take the top-scoring hypothesis for each sample
        hyps.append(decode(gen_out[i][0]['tokens']))
        refs.append(
            decode(
                utils.strip_pad(sample['target'][i], self.tgt_dict.pad()),
                escape_unk=True,  # don't count <unk> as matches to the hypo
            ))
    if self.args['task']['eval_bleu_print_samples']:
        LOGGER.info('example hypothesis: ' + hyps[0])
        LOGGER.info('example reference: ' + refs[0])
    # tokenize = sacrebleu.DEFAULT_TOKENIZER if not self.args['task']['eval_tokenized_bleu'] else 'none'
    # return sacrebleu.corpus_bleu(hyps, [refs], tokenize=tokenize)
    # 'none' skips sacrebleu's internal tokenization for pre-tokenized text
    if self.args['task']['eval_tokenized_bleu']:
        return sacrebleu.corpus_bleu(hyps, [refs], tokenize='none')
    else:
        return sacrebleu.corpus_bleu(hyps, [refs])
def setup_task(cls, args, **kwargs):
    """Setup the task (e.g., load dictionaries).

    Args:
        args (argparse.Namespace): parsed command-line arguments
    """
    paths = utils.split_paths(args['task']['data'])
    assert len(paths) > 0
    data_dir = paths[0]
    if args['task'].get('share_dict', False):
        # a single joint dictionary serves both sides
        src_dict = tgt_dict = cls.load_dictionary(os.path.join(data_dir, "dict.jsonl"))
    else:
        # load per-language dictionaries and check special symbols agree
        src_dict = cls.load_dictionary(
            os.path.join(data_dir, f"{args['task']['source_lang']}.dict.jsonl"))
        tgt_dict = cls.load_dictionary(
            os.path.join(data_dir, f"{args['task']['target_lang']}.dict.jsonl"))
        assert src_dict.pad() == tgt_dict.pad()
        assert src_dict.eos() == tgt_dict.eos()
        assert src_dict.unk() == tgt_dict.unk()
    LOGGER.info('[{}] dictionary: {} types'.format(
        args['task']['source_lang'], len(src_dict)))
    LOGGER.info('[{}] dictionary: {} types'.format(
        args['task']['target_lang'], len(tgt_dict)))
    return cls(args, src_dict, tgt_dict)
def load_tokens_dataset(
    data_path,
    split,
    src,
    src_dict,
    tgt,
    tgt_dict,
    dataset_impl,
    max_source_positions=None,
    max_target_positions=None,
    max_positions=None,
    append_source_eos=False,
    append_target_eos=False,
    shuffle=False,
):
    """Load the source and target token datasets of a split and wrap them in a BertDataset."""

    def _load_side(side, length_limit):
        # one side = '{split}.{lang}', optionally truncated to length_limit
        path = os.path.join(data_path, '{}.{}'.format(split, side))
        dataset = _load_dataset(path, dataset_impl)
        if length_limit is not None:
            dataset = TruncateDataset(dataset, length_limit)
        LOGGER.info('loaded {} examples from: {}'.format(len(dataset), path))
        return dataset

    src_dataset = _load_side(src, max_source_positions)
    tgt_dataset = _load_side(tgt, max_target_positions)
    return BertDataset(
        src_dataset, src_dataset.sizes, src_dict,
        tgt_dataset, tgt_dataset.sizes, tgt_dict,
        max_source_positions=max_source_positions,
        max_target_positions=max_target_positions,
        max_positions=max_positions,
        append_source_eos=append_source_eos,
        append_target_eos=append_target_eos,
        shuffle=shuffle,
    )
def cli_main():
    """Parse CLI options, prepare the output path, load the YAML config and launch training."""
    import argparse
    parser = argparse.ArgumentParser(
        description="Downloading/Decompressing CodeSearchNet dataset(s) or Tree-Sitter Library(ies)"
    )
    parser.add_argument("--yaml_file", "-f", type=str,
                        help="load {yaml_file}.yml for train",
                        default='config/python_wan/python')
    parser.add_argument(
        '--out_file', '-o', type=str,
        help='output generated file',
        default=None,
    )
    args = parser.parse_args()
    yaml_file = os.path.join(os.path.dirname(__file__), '{}.yml'.format(args.yaml_file))
    out_file = args.out_file
    if out_file:
        dirname = os.path.dirname(out_file)
        # BUGFIX: previously `assert os.path.isdir(dirname)` ran *before*
        # os.makedirs, so the makedirs(exist_ok=True) call could never create a
        # missing directory — the assert fired first. Just create it.
        if dirname:
            os.makedirs(dirname, exist_ok=True)
    # (also fixed typo: "gnerated" -> "generated")
    LOGGER.info('Load arguments in {}, output generated sentences at {}(if None, it won\'t record prediction).' \
                .format(yaml_file, out_file))
    args = load_yaml(yaml_file)
    LOGGER.info(args)
    torch.cuda.set_device(args['distributed_training']['device_id'])
    main(args, out_file)
def make_dataset(vocab, input_prefix, output_prefix, lang, num_workers=1):
    """Build the DGL-graph dataset for one split, or copy it verbatim for the 'raw' impl.

    Workers binarize disjoint byte ranges of the input into per-worker .mmap
    graph files, which are then merged into a single '{out_file}.mmap'.
    Relies on module-level `args`, `build_dgl_graph`, `load_graphs`, `save_graphs`.
    """
    if args['preprocess']['dataset_impl'] == "raw":
        # raw impl: no binarization, just copy the text file into destdir
        in_file = file_name(input_prefix, lang)
        out_dir = args['preprocess']['destdir']
        os.makedirs(out_dir, exist_ok=True)
        LOGGER.info('Copying {} into {}'.format(in_file, out_dir))
        shutil.copy(src=in_file, dst=args['preprocess']['destdir'])
    else:
        in_file = file_name(input_prefix, lang)
        out_file = dest_path(output_prefix, lang)
        os.makedirs(os.path.dirname(out_file), exist_ok=True)
        # split the input into num_workers byte ranges; each worker converts
        # its range into graphs stored at '{out_file}{worker_id}.mmap'
        offsets = find_offsets(in_file, num_workers)
        with Pool(num_workers) as mpool:
            results = [
                mpool.apply_async(
                    build_dgl_graph,
                    (vocab, in_file, f'{out_file}{worker_id}.mmap',
                     offsets[worker_id], offsets[worker_id + 1]),
                )
                for worker_id in range(num_workers)
            ]
            results = [res.get() for res in results]
        # merge the per-worker graph files (in worker order, preserving the
        # original line order) and delete the temporaries
        graph_batch = []
        for worker_id in range(num_workers):
            sub_file = f'{out_file}{worker_id}.mmap'
            glist, _ = load_graphs(sub_file)
            graph_batch.extend(glist)
            os.remove(sub_file)
        save_graphs(f'{out_file}.mmap', graph_batch)
def make_binary_dataset(vocab: Dictionary, input_file, output_file,
                        attr: str, num_workers: int):
    """make binary dataset

    Binarize `input_file` with BPE `vocab` into an indexed dataset at
    '{output_file}.mmap'/'.idx'. Workers 1..N-1 binarize their own byte ranges
    into temporary '{output_file}{id}' files; the main process handles range 0
    and then merges the temporaries in order. Relies on module-level `args`
    and `binarize`.
    """
    LOGGER.info("[{}] Dictionary: {} types".format(attr, len(vocab) - 1))
    n_seq_tok = [0, 0]  # running [num_sentences, num_tokens] across workers
    replaced = Counter()  # save un-recorded tokens

    def merge_result(worker_result):
        # accumulate per-worker statistics (called from apply_async callbacks)
        replaced.update(worker_result["replaced"])
        n_seq_tok[0] += worker_result["nseq"]
        n_seq_tok[1] += worker_result["ntok"]

    # split a file into different parts
    # if use multi-processing, we first process 2nd to last file
    # 1.txt -> 10 processor, 0(p0)(0-99), 100(p1)(100-199), ...
    offsets = Binarizer.find_offsets(input_file, num_workers)
    pool = None
    if num_workers > 1:
        # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
        pool = Pool(processes=num_workers - 1)
        for worker_id in range(1, num_workers):
            prefix = "{}{}".format(output_file, worker_id)
            pool.apply_async(
                binarize,
                (args, input_file, vocab, prefix, attr,
                 offsets[worker_id], offsets[worker_id + 1]),
                callback=merge_result)
        pool.close()
    # process 1th file, if multi-processing available. If not, process all file
    # p0 -> 0,end
    ds_file = '{}.mmap'.format(output_file)
    ds = indexed_dataset.make_builder(
        ds_file,
        impl=args['preprocess']['dataset_impl'],
        vocab_size=len(vocab))
    merge_result(
        Binarizer.binarize_bpe(input_file, vocab, lambda t: ds.add_item(t),
                               offset=0, end=offsets[1]))
    if num_workers > 1:
        # p1-pN: wait for every worker, then splice their outputs in order
        pool.join()
        # merge sub-processors' index and data files into final files and delete them.
        for worker_id in range(1, num_workers):
            temp_file_path = "{}{}".format(output_file, worker_id)
            ds.merge_file_(temp_file_path)
            # idx, txt
            os.remove(indexed_dataset.data_file_path(temp_file_path))
            os.remove(indexed_dataset.index_file_path(temp_file_path))
    ds.finalize('{}.idx'.format(output_file))
    LOGGER.info(
        "[{}] {}: {} sents, {} tokens, BPE no replaced token".format(
            attr,
            input_file,
            n_seq_tok[0],
            n_seq_tok[1],
        ))
def setup_task(cls, args, **kwargs):
    """Load the source-language dictionary and instantiate the task."""
    data_paths = utils.split_paths(args['task']['data'])
    assert len(data_paths) > 0
    dict_file = os.path.join(
        data_paths[0], 'dict.{}.json'.format(args['task']['source_lang']))
    dictionary = cls.load_dictionary(dict_file)
    LOGGER.info('dictionary: {} types'.format(len(dictionary)))
    return cls(args, dictionary)
def setup_task(cls, args, **kwargs):
    """Set up the task: load the dictionary and default `shuffle_instance`."""
    dict_path = os.path.join(args.data, 'dict.txt')
    dictionary = Dictionary.load(dict_path)
    LOGGER.info('dictionary: {} types'.format(len(dictionary)))
    # default shuffle_instance to False when the caller did not set it
    args.shuffle_instance = getattr(args, 'shuffle_instance', False)
    return cls(args, dictionary)
def setup_task(cls, args, **kwargs):
    """Load the fixed SentencePiece unigram dictionary and build the task."""
    data_paths = utils.split_paths(args['task']['data'])
    assert len(data_paths) > 0
    # dictionary filename is fixed (CodeSearchNet-JS 8k unigram vocabulary)
    dict_file = os.path.join(data_paths[0], 'csnjs_8k_9995p_unigram_url.dict.txt')
    dictionary = cls.load_dictionary(dict_file)
    LOGGER.info('dictionary: {} types'.format(len(dictionary)))
    return cls(args, dictionary)
def load_dataset(self, split, epoch=1, combine=False, **kwargs):
    """Load a given dataset split.

    Args:
        split (str): name of the split (e.g., train, valid, test)

    Builds the BART-style denoising pipeline: strip eos, block into
    token chunks, wrap with <s>/</s>, then wrap in a DenoisingDataset.
    """
    paths = utils.split_paths(self.args.data)
    assert len(paths) > 0
    # rotate over data shards by epoch
    data_path = paths[(epoch - 1) % len(paths)]
    split_path = os.path.join(data_path, split)
    dataset = data_utils.load_indexed_dataset(
        split_path,
        self.dictionary,
        self.args.dataset_impl,
        combine=combine,
    )
    if dataset is None:
        raise FileNotFoundError('Dataset not found: {} ({})'.format(
            split, split_path))
    dataset = StripTokenDataset(dataset, self.dictionary.eos())
    # create continuous blocks of tokens
    dataset = TokenBlockDataset(
        dataset,
        dataset.sizes,
        self.args.tokens_per_sample - 2,  # one less for <s> and one for </s>
        pad=self.dictionary.pad(),
        eos=self.dictionary.eos(),
        break_mode=self.args.sample_break_mode,
        document_sep_len=0)
    # prepend beginning-of-sentence token (<s>, equiv. to [CLS] in BERT)
    dataset = PrependTokenDataset(dataset, self.source_dictionary.bos())
    dataset = AppendTokenDataset(dataset, self.source_dictionary.eos())
    # whole-word masking only applies for non-subword mask lengths
    mask_whole_words = get_whole_word_mask(self.args, self.source_dictionary) \
        if self.args.mask_length != 'subword' else None
    self.datasets[split] = DenoisingDataset(
        dataset,
        dataset.sizes,
        self.dictionary,
        self.mask_idx,
        mask_whole_words,
        shuffle=self.args.shuffle_instance,
        seed=self.seed,
        args=self.args)
    LOGGER.info(
        "Split: {0}, Loaded {1} samples of denoising_dataset".format(
            split,
            len(self.datasets[split]),
        ))
def __init__(self, args, params):
    """Pick FusedAdam when allowed and CUDA is usable, else plain Adam."""
    super().__init__(args)
    fused_cls = get_fused_adam_class()
    prefer_fused = (
        not args['optimization']['adam']['use_old_adam']
        and fused_cls is not None
        and torch.cuda.is_available()
    )
    if prefer_fused:
        LOGGER.info('using FusedAdam')
        self._optimizer = fused_cls(params, **self.optimizer_config)
    else:
        self._optimizer = Adam(params, **self.optimizer_config)
def flatten(raw_file, dst_dir, mode):
    """flatten attributes of raw data"""
    frame = pd.read_csv(raw_file)
    # skip the first (index) and last columns
    attrs = frame.columns.values.tolist()[1:-1]
    # NOTE: `lang` is a free variable resolved from the enclosing module/scope
    LOGGER.info('Cast attributes({}) of OpenCL-{} dataset'.format(attrs, lang))
    for attr in attrs:
        dst_file = os.path.join(dst_dir, f"{mode}.{attr}")
        column_values = getattr(frame, attr).values.tolist()
        with file_io.open(dst_file, 'w') as writer:
            for row in column_values:
                print(json_io.json_dumps(row), file=writer)
def make_dataset(vocab, input_prefix, output_prefix, lang, num_workers=1):
    """Binarize one split, or copy the text file verbatim when dataset_impl is 'raw'."""
    in_file = file_name(input_prefix, lang)
    if args['preprocess']['dataset_impl'] == "raw":
        out_dir = args['preprocess']['destdir']
        os.makedirs(out_dir, exist_ok=True)
        LOGGER.info('Copying {} into {}'.format(in_file, out_dir))
        shutil.copy(src=in_file, dst=args['preprocess']['destdir'])
    else:
        out_file = dest_path(output_prefix, lang)
        os.makedirs(os.path.dirname(out_file), exist_ok=True)
        make_binary_dataset(vocab, in_file, out_file, num_workers)
def flatten(raw_dir, lang, mode, flatten_dir, attrs, num_cores):
    """flatten attributes of raw data"""
    LOGGER.info('Cast attributes({}) of {}-{} dataset'.format(attrs, lang, mode))
    attr_set = set(attrs)
    raw_files = PathManager.ls(os.path.join(raw_dir, lang, mode, '*.jsonl.gz'))
    with Pool(num_cores) as mpool:
        pending = [
            mpool.apply_async(flatten_attrs,
                              (raw_file, flatten_dir, lang, mode, attr_set))
            for raw_file in raw_files
        ]
        # block until every worker finishes (and re-raise worker errors)
        pending = [task.get() for task in pending]
def download(name):
    """Download the prebuilt tree-sitter shared library for `name` into the libs dir."""
    # guard clause: unknown languages have no prebuilt .so on the server
    if name not in TREE_SITTER_SO_FILE_ARCHIVE_MAP:
        raise FileExistsError(
            f"{name}.so has not been uploaded to the server. Please, build {name}.so with " \
            f" {os.path.dirname(__file__)}/build_so.py"
        )
    url = TREE_SITTER_SO_FILE_ARCHIVE_MAP[name]
    LOGGER.info(f"Download {name}.so from {url}")
    out_path = os.path.join(__TREE_SITTER_LIBS_DIR__, f"{name}.so")
    gdown.download(url=url, output=out_path)
def load_langpair_dataset(
    data_path,
    split,
    src,
    src_dict,
    tgt,
    tgt_dict,
    dataset_impl,
    left_pad_source,
    max_source_positions,
    src_aux=None,
):
    """Load a language pair split plus optional pickled auxiliary source datasets.

    Args:
        src_aux: iterable of auxiliary dataset suffixes, or None for none.

    Returns:
        LanguagePairDataset (shuffled only for the 'train' split).
    """
    # load source dataset
    src_path = os.path.join(data_path, '{}.{}'.format(split, src))
    src_dataset = _load_dataset(path=src_path, impl=dataset_impl, dict=src_dict)
    src_dataset = TruncateDataset(src_dataset,
                                  truncation_length=max_source_positions,
                                  truncate_prefix=0)
    # load target dataset
    tgt_path = os.path.join(data_path, '{}.{}'.format(split, tgt))
    tgt_dataset = _load_dataset(path=tgt_path, impl=dataset_impl, dict=tgt_dict)
    # load auxiliary dataset
    aux_datasets = OrderedDict()
    # BUGFIX: src_aux defaults to None; `for aux in src_aux` raised TypeError
    # whenever the caller relied on the default.
    for aux in (src_aux or ()):
        aux_path = os.path.join(data_path, '{}.{}'.format(split, aux))
        with open(aux_path, 'rb') as reader:
            # NOTE: pickle is only safe here because the files are produced
            # by our own preprocessing, never untrusted input
            aux_datasets[aux] = pickle.load(reader)

    tgt_dataset_sizes = tgt_dataset.sizes if tgt_dataset is not None else None
    LOGGER.info('loaded {} examples from: {}'.format(len(src_dataset), src_path))
    LOGGER.info('loaded {} examples from: {}'.format(len(tgt_dataset), tgt_path))
    return LanguagePairDataset(
        src_dataset, src_dataset.sizes, src_dict,
        src_aux=aux_datasets,
        tgt=tgt_dataset, tgt_sizes=tgt_dataset_sizes, tgt_dict=tgt_dict,
        left_pad_source=left_pad_source,
        max_source_positions=max_source_positions,
        shuffle=(split == 'train'),
    )
def spm_train(input: str, model_prefix: str, vocab_size: int, character_coverage=0.9995,
              model_type='unigram', special_symbols=None):
    """Train a SentencePiece model over `input`.

    Args:
        input: path(s) to the training corpus (comma-separated for spm).
        model_prefix: output prefix for the .model/.vocab files.
        vocab_size: target vocabulary size.
        character_coverage: spm character coverage (default 0.9995).
        model_type: 'unigram', 'bpe', 'char' or 'word'.
        special_symbols: optional iterable of user-defined symbols.

    NOTE: the `input` parameter name shadows the builtin but is kept for
    backward compatibility with keyword callers.
    """
    # BUGFIX: the default special_symbols=None crashed in ','.join(None);
    # fall back to an empty user_defined_symbols list.
    special_symbols = ','.join(special_symbols) if special_symbols else ''
    command = f"--input={input} --model_prefix={model_prefix} --vocab_size={vocab_size} " \
              f"--character_coverage={character_coverage} --model_type={model_type} --unk_piece=[UNK] " \
              f"--pad_piece=[PAD] --user_defined_symbols={special_symbols} --hard_vocab_limit=false"
    LOGGER.info(command)
    spm.SentencePieceTrainer.Train(command)
def cli_main():
    """Parse the config path from the CLI, load the YAML and run the main routine."""
    import argparse
    parser = argparse.ArgumentParser(
        description="Downloading/Decompressing CodeSearchNet dataset(s) or Tree-Sitter Library(ies)")
    parser.add_argument(
        "--yaml_file", "-f",
        help="load {yaml_file}.yml for train",
        type=str,
    )
    args = parser.parse_args()
    yaml_file = os.path.join(os.path.dirname(__file__), '{}.yml'.format(args.yaml_file))
    LOGGER.info('Load arguments in {}'.format(yaml_file))
    args = load_yaml(yaml_file)
    LOGGER.info(args)
    main(args)
def main(args):
    """Preprocess a code-translation corpus with the CodeBERT tokenizer.

    For every (language, split) pair, tokenizes each code line into padded
    source and target id/mask sequences and dumps them to '{mode}.pkl'.
    """
    # task = tasks.get_task(args['preprocess']['task'])
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'],
                                              args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])
    vocab = TransformersDictionary.from_pretrained('microsoft/codebert-base', do_lower_case=False)

    # 2. ***************build dataset********************
    # dump into pkl file
    # transform a language's code into src format and tgt format simualtaneouly
    def parse_source_input(code):
        # tokenize, truncate (reserving 2 slots for CLS/SEP), wrap, pad to
        # config.MAX_SOURCE_LENGTH; returns [ids, attention_mask, real_length]
        code_tokens = vocab.tokenize(code)
        # truncating
        code_tokens = code_tokens[:config.MAX_SOURCE_LENGTH - 2]
        source_tokens = [vocab.cls_token] + code_tokens + [vocab.sep_token]
        source_ids = vocab.convert_tokens_to_ids(source_tokens)
        source_size = len(source_tokens)
        source_mask = [1] * source_size
        padding_length = config.MAX_SOURCE_LENGTH - len(source_ids)
        source_ids += [vocab.pad()] * padding_length
        source_mask += [0] * padding_length
        return [source_ids, source_mask, source_size]

    def parse_target_input(code):
        # same scheme as parse_source_input but with MAX_TARGET_LENGTH and
        # pad_token_id; returns [ids, attention_mask, real_length]
        target_tokens = vocab.tokenize(code)[:config.MAX_TARGET_LENGTH - 2]
        target_tokens = [vocab.cls_token] + target_tokens + [vocab.sep_token]
        target_ids = vocab.convert_tokens_to_ids(target_tokens)
        target_size = len(target_ids)
        target_mask = [1] * target_size
        padding_length = config.MAX_TARGET_LENGTH - len(target_ids)
        target_ids += [vocab.pad_token_id] * padding_length
        target_mask += [0] * padding_length
        return [target_ids, target_mask, target_size]

    src_lang, tgt_lang = args['preprocess']['src_lang'], args['preprocess']['tgt_lang']
    for lang, mode in itertools.product([src_lang, tgt_lang], MODES):
        # '*' in the split prefix is a placeholder for the language name
        src_file = args['preprocess'][f'{mode}pref'].replace('*', lang) + ".code"
        dst_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.pkl")
        PathManager.mkdir(os.path.dirname(dst_file))
        with file_io.open(src_file, 'r') as reader:
            keys = ['code', 'src_tokens', 'src_masks', 'src_sizes',
                    'tgt_tokens', 'tgt_masks', 'tgt_sizes']
            data = {key: [] for key in keys}
            for line in reader:
                src_code = json_io.json_loads(line)
                # src_code = SPACE_SPLITTER.sub(" ", line)
                # source_ids, source_mask
                src_line = parse_source_input(src_code)
                # target_ids, target_mask
                tgt_line = parse_target_input(src_code)
                for key, src in zip(keys, [src_code] + src_line + tgt_line):
                    data[key].append(src)
            # NOTE(review): file_io.open with a `data` kwarg appears to write
            # the pickle in one call — confirm against the file_io helper
            file_io.open(dst_file, mode='wb', data=data)
def setup_task(cls, args, **kwargs):
    """Load per-language source/target dictionaries and build the task."""
    paths = utils.split_paths(args['task']['data'])
    assert len(paths) > 0  # (removed an accidental duplicate of this assert)
    # load dictionaries
    src_dicts = OrderedDict()
    for lang in args['task']['source_langs']:
        src_dicts[lang] = cls.load_dictionary(
            os.path.join(paths[0], '{}.dict.json'.format(lang)))
        # 'edges' is structural, not lexical — report 0 types for it
        LOGGER.info('[{}] dictionary: {} types'.format(
            lang, len(src_dicts[lang]) if lang != 'edges' else 0))
    tgt_dicts = OrderedDict()
    for lang in args['task']['target_langs']:
        tgt_dicts[lang] = cls.load_dictionary(
            os.path.join(paths[0], '{}.dict.json'.format(lang)))
        LOGGER.info('[{}] dictionary: {} types'.format(lang, len(tgt_dicts[lang])))
    return cls(args, src_dicts, tgt_dicts)
def main(args):
    """Preprocess a corpus with a SentencePiece vocabulary.

    Converts the dict.txt next to SPM_VOCAB_FILE into a project Dictionary,
    then encodes each split's code lines into an mmap token dataset plus a
    '{mode}.code.pkl' with the raw code strings.
    """
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'],
                                              args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])
    vocab = spm.SentencePieceProcessor()
    vocab.load(SPM_VOCAB_FILE)

    def save_dict():
        # convert the SPM text dictionary into the project's Dictionary format
        src_file = os.path.join(os.path.dirname(SPM_VOCAB_FILE), 'dict.txt')
        tgt_file = os.path.join(args['preprocess']['destdir'], 'dict.jsonl')
        # Dictionary.text_to_jsonl(src_file, tgt_file)
        # NOTE: this local `vocab` shadows the outer SentencePieceProcessor
        vocab = Dictionary()
        with file_io.open(src_file, 'r') as reader:
            for line in reader:
                token, num = line.strip().split()
                # NOTE(review): eval() on the count string — trusted local
                # dict files only; int() would be safer
                vocab.add_symbol(token, eval(num))
        vocab.save(tgt_file)
        return vocab

    dictionary = save_dict()

    # 2. ***************build dataset********************
    # dump into pkl file
    # transform a language's code into src format and tgt format simualtaneouly
    lang = args['preprocess']['lang']
    for mode in MODES:
        file = f"{args['preprocess'][f'{mode}pref']}.code"
        dst_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.code")
        PathManager.mkdir(os.path.dirname(dst_file))
        # mmap builder sized by the SPM vocab (outer `vocab`, not `dictionary`)
        dataset = indexed_dataset.make_builder(f"{dst_file}_tokens.mmap",
                                               impl='mmap', vocab_size=len(vocab))
        PathManager.mkdir(os.path.dirname(dst_file))
        with file_io.open(file, 'r') as reader:
            data = {'code': []}
            for line in reader:
                line = json_io.json_loads(line)
                # collapse runs of whitespace into single spaces
                code = SPACE_SPLITTER.sub(" ", line)
                data['code'].append(code)
                # SPM pieces re-indexed through the project Dictionary
                code_tokens = vocab.encode(code, out_type=str)
                code_tokens = torch.IntTensor(
                    [dictionary.index(token) for token in code_tokens])
                # code_tokens = torch.IntTensor(vocab.encode_as_ids(code))
                dataset.add_item(code_tokens)
            dataset.finalize(f"{dst_file}_tokens.idx")
            # proj indices
            # cp id
            data['proj_indices'] = [1] * len(data['code'])
            file_io.open(f"{dst_file}.pkl", mode='wb', data=data)
def clip_grad_norm(self, max_norm, aggregate_norm_fn=None):
    """Clips gradient norm.

    Unscales gradients first, clips via the wrapped fp32 optimizer, and on a
    non-finite norm either raises (scale already at the minimum) or logs the
    next loss scale. Returns the (possibly non-finite) gradient norm.
    """
    self.scaler.unscale_(self.optimizer)
    grad_norm = self.fp32_optimizer.clip_grad_norm(max_norm, aggregate_norm_fn)
    if not torch.isfinite(grad_norm).all():
        new_loss_scale = self.next_loss_scale
        if new_loss_scale <= self.min_loss_scale:
            raise FloatingPointError((
                "AMP: Minimum loss scale reached ({}). Your loss is probably exploding. "
                "Try restarting training or use fp32. {}").format(
                    self.min_loss_scale, new_loss_scale))
        else:
            # BUGFIX: message previously read "setting scale to to {X}"
            # (duplicated "to" across the implicitly-concatenated literals)
            LOGGER.info(f"AMP: overflow detected, setting scale to {new_loss_scale}")
            # NOTE(review): new_loss_scale is only logged here — confirm the
            # scaler itself is updated elsewhere (e.g. scaler.update()).
    return grad_norm
def setup_task(cls, args, **kwargs):
    """Setup the task (e.g., load dictionaries).

    Args:
        args (argparse.Namespace): parsed command-line arguments
    """
    data_paths = utils.split_paths(args['task']['data'])
    assert len(data_paths) > 0
    # load dictionaries
    target_lang = args['task']['target_lang']
    dictionary = cls.load_dictionary(
        os.path.join(data_paths[0], '{}.dict.json'.format(target_lang)))
    LOGGER.info('[{}] dictionary: {} types'.format(target_lang, len(dictionary)))
    return cls(args, dictionary)
def download(name):
    """Download and unpack the prebuilt BPE model archive for `name`."""
    # guard clause: unknown model names have no archive on the server
    if name not in BPE_MODEL_ARCHIVE_MAP:
        raise FileExistsError(f"No {name}.tar.gz in the server. Please build your own BPE models. " \
                              f"Once they are built, you can upload them into the server.")
    url = BPE_MODEL_ARCHIVE_MAP[name]
    LOGGER.info(f"Download {name} BPE model from {url}")
    out_file = os.path.join(__BPE_DIR__, f"{name}.tar.gz")
    gdown.download(url=url, output=out_file)
    try:
        with tarfile.open(out_file) as reader:
            reader.extractall(__BPE_DIR__)
        # remove the archive only after a successful extraction
        os.remove(out_file)
    except tarfile.ExtractError:
        LOGGER.error(__BPE_DIR__)
        LOGGER.warning(f"{name}.tar.gz is corrupted, please contact us.")