def cli_main():
    """Parse CLI flags, resolve the YAML config they point at, and invoke ``main``."""
    import argparse
    arg_parser = argparse.ArgumentParser(
        description="Downloading/Decompressing CodeSearchNet dataset(s) or Tree-Sitter Library(ies)"
    )
    arg_parser.add_argument(
        "--yaml_file", "-f", type=str,
        help="load {language}.yml for train",
        default='config/csn_feng/ruby',
    )
    arg_parser.add_argument(
        '--out_file', '-o', type=str,
        help='output generated file',
        default=None,
    )
    cli_args = arg_parser.parse_args()
    # the --yaml_file flag is a path fragment relative to this module, minus ".yml"
    yaml_file = os.path.join(os.path.dirname(__file__), f"{cli_args.yaml_file}.yml")
    if cli_args.out_file is None:
        out_file = None
    else:
        out_file = recursive_expanduser(cli_args.out_file)
    LOGGER.info('Load arguments in {}'.format(yaml_file))
    # CLI namespace is discarded; the YAML file becomes the effective argument set
    task_args = load_yaml(yaml_file)
    LOGGER.info(task_args)
    main(task_args, out_file)
def load_state(model_path):
    """Restore a trained model from *model_path* and prepare it for inference.

    Returns a ``(args, task, model, use_cuda)`` tuple, where ``args`` is the
    (user-path-normalized) argument dict stored in the checkpoint.
    """
    checkpoint = load_checkpoint_to_cpu(model_path, arg_overrides={})
    # normalize any cached home-directory paths embedded in the checkpoint args
    model_args = recursive_expanduser(recursive_contractuser(checkpoint["args"]))
    task = tasks.setup_task(model_args)  # also loads src/tgt dictionaries
    model = task.build_model(model_args)
    model.load_state_dict(checkpoint["model"])
    use_cuda = torch.cuda.is_available() and not model_args['common']['cpu']
    if model_args['common']['fp16'] and use_cuda:
        model.half()
    if use_cuda:
        torch.cuda.empty_cache()
        # pin inference to the last visible GPU
        torch.cuda.set_device(torch.cuda.device_count() - 1)
        model.cuda()
    model.eval()
    del checkpoint  # free the CPU-side state dict copy
    return model_args, task, model, use_cuda
def main(model_path, input):
    """Load the checkpoint at *model_path* and run sequence completion on *input*.

    NOTE(review): the parameter name ``input`` shadows the builtin; kept to
    preserve the public signature for keyword callers.
    """
    LOGGER.info('Load model from {}'.format(model_path))
    checkpoint = load_checkpoint_to_cpu(model_path, arg_overrides={})
    model_args = checkpoint["args"]
    model_args = recursive_contractuser(model_args, old_cache_name='.ncc')
    model_args = recursive_expanduser(model_args)
    task = tasks.setup_task(model_args)  # also loads src/tgt dictionaries
    model = task.build_model(model_args)
    model.load_state_dict(checkpoint["model"])
    use_cuda = torch.cuda.is_available() and not model_args['common']['cpu']
    if use_cuda:
        torch.cuda.empty_cache()
        # pin inference to the last visible GPU
        torch.cuda.set_device(torch.cuda.device_count() - 1)
        model.cuda()
    model.eval()
    if model_args['common']['fp16'] and use_cuda:
        model.half()
    encoded = task.encode_input(input)
    if use_cuda:
        encoded = utils.move_to_cuda(encoded)
    completor = task.sequence_completor
    net_output = completor.complete(models=[model], sample=encoded)
    return task.decode_output(net_output)
import os

from dataset.codexglue.code_to_text import (
    LANGUAGES,
    MODES,
)
from ncc import tasks
from ncc.data import (
    Dictionary,
    indexed_dataset,
)
from ncc.utils.file_ops import file_io
from ncc.utils.file_ops.yaml_io import recursive_expanduser
from ncc.utils.path_manager import PathManager

if __name__ == '__main__':
    # Fix: `os.path.join` / `os.path.dirname` are used below, but `import os`
    # was missing from this file's import block (NameError at runtime).
    #
    # For each split, merge the per-language docstring datasets into one
    # multilingual mmap-indexed dataset under <base_dir>/docstring/.
    task = tasks.get_task('multilingual_denoising')
    base_dir = recursive_expanduser(
        '~/ncc_data/codexglue/code_to_text/multilingual_denoising/data-mmap')
    dict_file = os.path.join(base_dir, 'dict.jsonl')
    vocab = task.load_dictionary(dict_file)
    for mode in MODES:
        dst_file = os.path.join(base_dir, 'docstring', f"{mode}.docstring.spm")
        PathManager.mkdir(os.path.dirname(dst_file))
        # mmap-backed dataset builder sized to the shared vocabulary
        ds = indexed_dataset.make_builder(f'{dst_file}.mmap', impl='mmap', vocab_size=len(vocab))
        for lang in LANGUAGES:
            src_file = os.path.join(base_dir, lang, f"{mode}.docstring.spm")
            ds.merge_file_(src_file)
        ds.finalize(f'{dst_file}.idx')
def cli_main(model_path, input, kwargs='{}'):
    """Normalize the CLI inputs and delegate to ``main``.

    *kwargs* is a JSON string of extra options; ``topk`` defaults to 5.
    """
    expanded_path = recursive_expanduser(model_path)
    # collapse all whitespace runs to single spaces and trim the ends
    normalized_input = re.sub(r'\s+', ' ', input).strip()
    options = ujson.loads(kwargs)
    options.setdefault('topk', 5)
    return main(expanded_path, normalized_input, **options)