def setup_task(cls, args, **kwargs):
    """Setup the task (e.g., load dictionaries).

    Args:
        args (dict): parsed command-line arguments, accessed as a nested dict
    """
    paths = utils.split_paths(args['task']['data'])
    assert len(paths) > 0
    # load dictionaries
    # src_dict = cls.load_dictionary(os.path.join(paths[0], 'csnjs_8k_9995p_unigram_url.dict.txt'))
    src_dict = Dictionary(extra_special_symbols=[
        constants.CLS, constants.SEP, constants.MASK, constants.EOL, constants.URL
    ])
    src_dict.add_from_file(args['dataset']['srcdict'])
    tgt_dict = Dictionary.load(args['dataset']['tgtdict'])
    # src_dict = cls.load_dictionary(os.path.join(paths[0], '{}.dict.txt'.format(args['task']['source_lang'])))
    # tgt_dict = cls.load_dictionary(os.path.join(paths[0], '{}.dict.txt'.format(args['task']['target_lang'])))
    # assert src_dict.pad() == tgt_dict.pad()
    # assert src_dict.eos() == tgt_dict.eos()
    # assert src_dict.unk() == tgt_dict.unk()
    # LOGGER.info('[{}] dictionary: {} types'.format(args['task']['source_lang'], len(src_dict)))
    # LOGGER.info('[{}] dictionary: {} types'.format(args['task']['target_lang'], len(tgt_dict)))
    return cls(args, src_dict, tgt_dict)
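# Illustrative sketch (not from the original source) of the nested config that the
# setup_task above indexes into; every path below is a hypothetical placeholder,
# and `TaskClass` stands in for the class that defines this method.
example_args = {
    'task': {'data': '~/dataset/data-mmap'},
    'dataset': {
        'srcdict': '~/dataset/csnjs_8k_9995p_unigram_url.dict.txt',  # source vocab, read via add_from_file
        'tgtdict': '~/dataset/tgt.dict.txt',                         # target vocab, read via Dictionary.load
    },
}
# task = TaskClass.setup_task(example_args)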
def build_dictionary(cls, filenames, tokenize_func, workers=1, threshold=-1, nwords=-1, padding_factor=8):
    """Build the dictionary

    Args:
        filenames (list): list of filenames
        tokenize_func (callable): tokenizes a line into symbols
        workers (int): number of concurrent workers
        threshold (int): defines the minimum word count
        nwords (int): defines the total number of words in the final dictionary,
            including special symbols
        padding_factor (int): can be used to pad the dictionary size to be a
            multiple of 8, which is important on some hardware (e.g., Nvidia Tensor Cores).
    """
    is_sbt = filenames[0].endswith('sbt')
    if is_sbt:
        d = SBTDictionary()
        for filename in filenames:
            SBTDictionary.add_token_to_dictionary(filename, d, tokenize_func, workers)
    else:
        d = Dictionary()
        for filename in filenames:
            Dictionary.add_token_to_dictionary(filename, d, tokenize_func, workers)
    d.finalize(threshold=threshold, nwords=nwords, padding_factor=padding_factor)
    return d
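# Usage sketch (an assumption, not from the original file): the branch above keys
# only off the suffix of the first filename, so all files in the list are expected
# to share the same format. The filenames below are hypothetical.
filenames = ['train.sbt', 'valid.sbt']            # SBT (structured traversal) files -> SBTDictionary branch
assert filenames[0].endswith('sbt')
assert not 'train.code_tokens'.endswith('sbt')    # plain token files -> Dictionary branch
# d = cls.build_dictionary(filenames, tokenize_func=str.split, workers=4, nwords=50000)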
def load_dictionary(cls, filename):
    """Load the dictionary from the filename

    Args:
        filename (str): the filename
    """
    if filename.endswith('.txt'):
        return Dictionary.load(filename)
    else:
        return Dictionary.load_json(filename)
def load_dictionary(cls, filename):
    """Load the dictionary from the filename

    Args:
        filename (str): the filename
    """
    if filename.endswith('.txt'):
        return Dictionary.load(filename)
    else:
        is_bpe = os.path.basename(filename).split('.')[-3] == 'bpe'
        if is_bpe:
            return RetrievalDictionary.load_json(filename)
        else:
            return Dictionary.load_json(filename)
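# Worked example (an assumption about the expected naming scheme): the '[-3]'
# check above matches files named '<prefix>.bpe.dict.jsonl'. Filenames are made up.
name = 'code_tokens.bpe.dict.jsonl'
assert name.split('.')[-3] == 'bpe'           # -> RetrievalDictionary.load_json
name = 'code_tokens.dict.jsonl'
assert name.split('.')[-3] != 'bpe'           # -> Dictionary.load_json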
def load(cls, f):
    subtoken_dict = Dictionary.load(f)
    # derive the companion BPE vocabulary path by inserting 'bpe' before the last two suffixes
    splitted_filenames = f.rsplit('.', 2)
    bpe_f = '.'.join([splitted_filenames[0], 'bpe'] + splitted_filenames[-2:])
    bpetoken_dict = WordBpeDicionary.load(bpe_f)
    return cls(subtoken_dict, bpetoken_dict)
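# Worked example of the filename rewriting in load() above (the filename is hypothetical):
f = 'code_tokens.dict.jsonl'
parts = f.rsplit('.', 2)                       # ['code_tokens', 'dict', 'jsonl']
bpe_f = '.'.join([parts[0], 'bpe'] + parts[-2:])
assert bpe_f == 'code_tokens.bpe.dict.jsonl'   # the companion WordBpeDicionary file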
def setup_task(cls, args, **kwargs):
    """Setup the task."""
    # paths = args.data.split(':')
    paths = utils.split_paths(args['task']['data'])
    assert len(paths) > 0
    dictionary = Dictionary.load(os.path.join(paths[0], 'dict.jsonl'))
    data_path = paths[0]
    if args['task']['langs'] is None:
        languages = sorted([
            name for name in os.listdir(data_path)
            if os.path.isdir(os.path.join(data_path, name))
        ])
    else:
        languages = args['task']['langs']  # .split(',')
    if args['task']['add_lang_token']:
        for lang in languages:
            dictionary.add_symbol('[{}]'.format(lang))
    LOGGER.info("Loading dictionary: {} types".format(len(dictionary)))
    # if not hasattr(args, 'shuffle_instance'):
    #     args.shuffle_instance = False
    return cls(args, dictionary)
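# Illustration of the language-token convention used by setup_task above: with
# add_lang_token enabled, each language contributes a bracketed dictionary symbol.
# The language list here is a hypothetical example.
languages = ['go', 'java', 'python']
lang_tokens = ['[{}]'.format(lang) for lang in languages]
assert lang_tokens == ['[go]', '[java]', '[python]']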
def load_dictionary(cls, filename):
    """Load the dictionary from the filename

    Args:
        filename (str): the filename
    """
    return Dictionary.load(filename)
def save_lang_dict():
    src_file = PathManager.expanduser("~/clcdsa/astbert/data-mmap/lang.jsonl")
    lang_dict = Dictionary.load(src_file)
    tgt_file = os.path.join(args['preprocess']['destdir'], 'lang.jsonl')
    PathManager.mkdir(os.path.dirname(tgt_file))
    lang_dict.save(tgt_file)
    return lang_dict
def setup_task(cls, args, **kwargs):
    """Setup the task."""
    dictionary = Dictionary.load(os.path.join(args.data, 'dict.txt'))
    LOGGER.info('dictionary: {} types'.format(len(dictionary)))
    if not hasattr(args, 'shuffle_instance'):
        args.shuffle_instance = False
    return cls(args, dictionary)
def load_dictionary(cls, filename):
    """Load the dictionary from the filename

    Args:
        filename (str): the filename
    """
    is_sbt = os.path.basename(filename).startswith('sbt')
    if is_sbt:
        return SBTDictionary.load_json(filename)
    else:
        return Dictionary.load_json(filename)
def build_dictionary(
    cls,
    filenames,
    workers=1,
    threshold=-1,
    nwords=-1,
    padding_factor=8,
    tokenize_func=SPACE_SPLITTER,
    **kwargs,
):
    """Build the dictionary

    Args:
        filenames (list): list of filenames
        workers (int): number of concurrent workers
        threshold (int): defines the minimum word count
        nwords (int): defines the total number of words in the final dictionary,
            including special symbols
        padding_factor (int): can be used to pad the dictionary size to be a
            multiple of 8, which is important on some hardware (e.g., Nvidia Tensor Cores).
        tokenize_func (callable): tokenizes a line into symbols (defaults to SPACE_SPLITTER)
    """
    d = Dictionary(
        pad=kwargs.get('pad', constants.PAD),
        bos=kwargs.get('bos', constants.BOS),
        eos=kwargs.get('eos', constants.EOS),
        unk=kwargs.get('unk', constants.UNK),
        extra_special_symbols=kwargs.get('extra_special_symbols', None),
    )
    for filename in filenames:
        Dictionary.add_file_to_dictionary(
            filename, d, tokenize_func, d.eos_word, workers
        )
    d.finalize(threshold=threshold, nwords=nwords, padding_factor=padding_factor)
    return d
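# Illustration (an assumption about finalize()'s behavior, consistent with the
# docstring above): padding_factor rounds the final vocabulary size up to the next
# multiple of 8 by appending filler symbols. The numbers are illustrative.
nwords, padding_factor = 50001, 8
padded_size = -(-nwords // padding_factor) * padding_factor   # ceiling division
assert padded_size == 50008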
def build_bpe_dictionary(
    cls,
    filenames,
    tokenize_func,
    workers=1,
    threshold=-1,
    nwords=-1,
    padding_factor=8,
    **special_symbols,
):
    """Build the dictionary

    Args:
        filenames (list): list of filenames
        tokenize_func (callable): tokenizes a line into symbols
        workers (int): number of concurrent workers
        threshold (int): defines the minimum word count
        nwords (int): defines the total number of words in the final dictionary,
            including special symbols
        padding_factor (int): can be used to pad the dictionary size to be a
            multiple of 8, which is important on some hardware (e.g., Nvidia Tensor Cores).
        bpe_portion (float, optional): fraction of ``nwords`` allocated to BPE tokens
            (default 0.5), passed via ``special_symbols``.
    """
    bpe_portion = special_symbols.get('bpe_portion', 0.5)
    bpetoken_num = int(nwords * bpe_portion)
    subtoken_num = nwords - bpetoken_num

    # subtoken
    from ncc.data import constants
    subtoken_d = Dictionary(
        pad=special_symbols.get('pad', constants.PAD),
        bos=special_symbols.get('bos', constants.BOS),
        eos=special_symbols.get('eos', constants.EOS),
        unk=special_symbols.get('unk', constants.UNK),
        extra_special_symbols=special_symbols.get('extra_special_symbols', None),
    )
    for filename in filenames:
        Dictionary.add_token_to_dictionary(filename, subtoken_d, tokenize_func, workers)
    remaining_tokens = Counter({sym: c for sym, c in zip(subtoken_d.symbols, subtoken_d.count)})
    subtoken_d.finalize(threshold=threshold, nwords=subtoken_num, padding_factor=padding_factor)
    remaining_tokens = Counter({
        sym: c for sym, c in remaining_tokens.items() if sym not in subtoken_d
    })

    # bpetoken
    from ncc.data.retrieval.word_bpe_dictionary import WordBpeDicionary
    bpetoken_d = WordBpeDicionary()
    bpetoken_d.learn_bpe_vocab(remaining_tokens.elements(), bpetoken_num)
    bpetoken_d.finalize(threshold=0, nwords=bpetoken_num, padding_factor=padding_factor)

    from ncc.data.retrieval.hybrid.hybrid_retrieval_dictionary import HybridRetrievalDictionary
    return HybridRetrievalDictionary(subtoken_d, bpetoken_d)
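# Worked example of the vocabulary budget split performed at the top of
# build_bpe_dictionary above (the numbers are illustrative):
nwords, bpe_portion = 50000, 0.5
bpetoken_num = int(nwords * bpe_portion)   # 25000 entries learned as BPE merges
subtoken_num = nwords - bpetoken_num       # 25000 entries kept as whole subtokens
assert (bpetoken_num, subtoken_num) == (25000, 25000)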
def save_token_dict():
    src_file = os.path.join(os.path.dirname(SPM_VOCAB_FILE), 'dict.txt')
    tgt_file = os.path.join(args['preprocess']['destdir'], 'dict.jsonl')
    # Dictionary.text_to_jsonl(src_file, tgt_file)
    vocab = Dictionary()
    with file_io.open(src_file, 'r') as reader:
        for line in reader:
            token, num = line.strip().split()
            vocab.add_symbol(token, eval(num))
    vocab.save(tgt_file)
    return vocab
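# Sketch of the dict.txt line format implied by the parsing in save_token_dict above:
# one '<token> <count-or-score>' pair per line, with the second field passed through
# eval() so either an integer count or a float score is accepted. The token is made up.
line = "▁def 123456"
token, num = line.strip().split()
assert (token, eval(num)) == ("▁def", 123456)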
def load_dictionary(cls, filename):
    """Load the dictionary from the filename

    Args:
        filename (str): the filename
    """
    dictionary = Dictionary(extra_special_symbols=[
        constants.CLS, constants.SEP, constants.MASK, constants.EOL, constants.URL
    ])
    if filename.endswith('.txt'):
        dictionary.add_from_file(filename)
    else:
        dictionary.add_from_json_file(filename)
    return dictionary
def cli_main():
    SEED = 204
    BATCH_SIZE = 64
    MAX_SOURCE_POSITIONS = 1024
    EPOCH = 50

    from ncc.utils.set_seed import set_seed
    set_seed(SEED)

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        device = os.environ.get('CUDA_VISIBLE_DEVICES', [0])[0]  # get first device as default
        torch.cuda.set_device(f'cuda:{device}')

    criterion = DeepTuneLoss(task=None, sentence_avg=-1)
    if use_cuda:
        criterion = criterion.cuda()

    data = []
    for i, platform in enumerate(LANGUAGES):
        DATA_DIR = os.path.join(DATASET_DIR, f'mapping/{platform}/data-mmap')

        def get_attr(attr):
            oracle_file = os.path.join(DATA_DIR, f'train.{attr}')
            with open(oracle_file, 'rb') as reader:
                out = pickle.load(reader)
            return np.asarray(out)

        platform_name = mapping_metrics.platform2str(platform)
        benchmarks = get_attr('benchmark')
        runtime_cpus = get_attr('runtime_cpu')
        runtime_gpus = get_attr('runtime_gpu')

        #################### load dataset ####################
        src_dataset = load_mmap_dataset(os.path.join(DATA_DIR, 'train.src_tokens'))
        src_dataset = TruncateDataset(src_dataset, truncation_length=MAX_SOURCE_POSITIONS, truncate_prefix=0)
        tgt_dataset = load_mmap_dataset(os.path.join(DATA_DIR, 'train.oracle'))

        src_dict = Dictionary.load(os.path.join(DATA_DIR, 'src_tokens.dict.jsonl'))
        src_aux = OrderedDict()
        src_aux['transfer'] = get_attr('transfer')
        src_aux['wgsize'] = get_attr('wgsize')

        tgt_dict = Dictionary.load(os.path.join(DATA_DIR, 'oracle.dict.jsonl'))

        dataset = LanguagePairDataset(
            src=src_dataset, src_sizes=src_dataset.sizes, src_dict=src_dict, src_aux=src_aux,
            tgt=tgt_dataset, tgt_sizes=tgt_dataset.sizes, tgt_dict=tgt_dict, tgt_aux=None,
            left_pad_source=True, max_source_positions=MAX_SOURCE_POSITIONS,
        )
        #################### load dataset ####################

        # build toy dataset for 10-fold cross validation
        tgt_data = [tgt_dataset[idx].item() for idx in range(len(tgt_dataset))]
        src_data = [None] * len(tgt_data)

        # 10-fold cross-validation
        kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)
        for j, (train_ids, test_ids) in enumerate(kf.split(src_data, tgt_data)):
            # deeptune model
            model = DeepTuneEncoder(dictionary=src_dict, embed_dim=64,
                                    rnn_cell='lstm', rnn_hidden_dim=64, rnn_dropout=0., rnn_num_layers=2,
                                    aux_dim=2, inner_dim=32, out_dim=2)
            if use_cuda:
                model = model.cuda()
            optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

            for epoch_i in range(EPOCH):
                if dataset.shuffle:
                    random.shuffle(train_ids)
                train_batch_sampler = data_utils.batch_by_size(
                    train_ids,
                    num_tokens_fn=lambda *args: -1,
                    max_sentences=BATCH_SIZE,
                )
                train_dataloader = DataLoader(dataset=dataset,
                                              batch_sampler=train_batch_sampler,
                                              collate_fn=collate)
                with tqdm(total=len(train_dataloader)) as t:
                    for sample_i, sample in enumerate(train_dataloader, start=1):
                        t.set_description(f'Epoch {epoch_i + 1}/{EPOCH} Batch {sample_i}/{len(train_dataloader)}')
                        if use_cuda:
                            sample = move_to_cuda(sample)
                        loss, sample_size, logging_output = criterion(model, sample)
                        loss.div_(sample_size)
                        t.set_postfix(loss=loss.item())
                        t.update()

                        optimizer.zero_grad()
                        loss.backward()
                        optimizer.step()

            # test accuracy
            test_batch_sampler = data_utils.batch_by_size(
                test_ids,
                num_tokens_fn=lambda *args: -1,
                max_sentences=BATCH_SIZE,
            )
            test_dataloader = DataLoader(dataset=dataset,
                                         batch_sampler=test_batch_sampler,
                                         collate_fn=collate)
            predictions, ground_truth = [], []
            for sample in test_dataloader:
                if use_cuda:
                    sample = move_to_cuda(sample)
                hybrid_out, _ = model(**sample['net_input'])
                predictions.append(hybrid_out.max(dim=-1)[1])
                ground_truth.append(sample['target'].view(-1))
            predictions = torch.cat(predictions)
            ground_truth = torch.cat(ground_truth)

            accuracy = (predictions == ground_truth).tolist()
            # runtimes of baseline mapping (CPU on AMD, GPU on NVIDIA)
            gt_runtimes = (runtime_cpus if platform == "amd" else runtime_gpus)[test_ids]
            pred_runtimes = [
                (runtime_cpus if pred == 0 else runtime_gpus)[idx]
                for idx, pred in zip(test_ids, predictions)
            ]
            speedup = gt_runtimes / pred_runtimes

            # record results
            for benchmark_, o_, p_, accuracy_, p_speedup_ in \
                    zip(benchmarks[test_ids], ground_truth, predictions, accuracy, speedup):
                data.append({
                    "Model": model.__class__.__name__,
                    "Platform": platform_name,
                    'Benchmark': mapping_metrics.escape_benchmark_name(benchmark_),
                    'Benchmark Suite': mapping_metrics.escape_suite_name(benchmark_),
                    "Oracle Mapping": o_,
                    "Predicted Mapping": p_,
                    "Accuracy": accuracy_,
                    "Speedup": p_speedup_,
                })
            del model, optimizer

    performance = pd.DataFrame(
        data, index=range(1, len(data) + 1), columns=[
            "Model", "Platform", "Benchmark", "Benchmark Suite",
            "Oracle Mapping", "Predicted Mapping", "Accuracy", "Speedup"
        ])
    benchmark_out = performance.groupby(['Platform', 'Benchmark Suite'])[['Platform', 'Accuracy', 'Speedup']].mean()
    benchmark_out['Accuracy'] = round(benchmark_out['Accuracy'] * 100, 2)
    benchmark_out['Speedup'] = round(benchmark_out['Speedup'], 2)
    print(benchmark_out)
    out = performance.groupby(['Platform'])[['Platform', 'Accuracy', 'Speedup']].mean()
    out['Accuracy'] = round(out['Accuracy'] * 100, 2)
    out['Speedup'] = round(out['Speedup'], 2)
    print(out)
def setup_task(cls, args, **kwargs):
    paths = utils.split_paths(args['task']['data'])
    assert len(paths) > 0
    dictionary = Dictionary.load(os.path.join(paths[0], 'dict.txt'))
    LOGGER.info('dictionary: {} types'.format(len(dictionary)))
    return cls(args, dictionary)
def main(args):
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'], args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])

    from ncc.data.dictionary import TransformersDictionary
    vocab = TransformersDictionary.from_pretrained('microsoft/graphcodebert-base')

    file = os.path.join(args['preprocess']['destdir'], 'dfs.jsonl')
    node_dict = Dictionary.load(file)

    # 2. ***************build dataset********************
    # dump into pkl file
    # transform a language's code into src format and tgt format simultaneously
    num_workers = args['preprocess']['workers']
    src_lang, tgt_lang = args['preprocess']['src_lang'], args['preprocess']['tgt_lang']

    # code tokens => code tokens
    for mode, lang in itertools.product(MODES, [src_lang, tgt_lang]):
        data_dir = args['preprocess'][f'{mode}pref'].replace('*', lang)
        src_file = f"{data_dir}.code_tokens"
        PathManager.mkdir(os.path.join(args['preprocess']['destdir'], lang))
        dst_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.code_tokens")

        offsets = find_offsets(src_file, num_workers)
        pool = None
        if num_workers > 1:
            # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(dst_file, worker_id)
                pool.apply_async(
                    binarize_tokens,
                    (args, src_file, vocab, prefix, offsets[worker_id], offsets[worker_id + 1]),
                )
            pool.close()

        # the main process binarizes the first shard itself
        ds_file = '{}.mmap'.format(dst_file)
        ds = indexed_dataset.make_builder(ds_file, impl="mmap", vocab_size=len(vocab))
        end = offsets[1]
        with file_io.open(src_file, 'r') as reader:
            reader.seek(0)
            line = file_io.safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                line = json_io.json_loads(line)
                code_tokens = vocab.subtokenize(line)
                code_tokens = torch.IntTensor(vocab.tokens_to_indices(code_tokens))
                ds.add_item(code_tokens)
                line = reader.readline()

        if num_workers > 1:
            # p1-pN
            pool.join()
            # merge sub-processors' index and data files into final files and delete them.
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(dst_file, worker_id)
                ds.merge_file_(temp_file_path)
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize('{}.idx'.format(dst_file))

    # code => code
    for mode, lang in itertools.product(MODES, [src_lang, tgt_lang]):
        data_dir = args['preprocess'][f'{mode}pref'].replace('*', lang)
        src_file = f"{data_dir}.code"
        PathManager.mkdir(os.path.join(args['preprocess']['destdir'], lang))
        dst_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.code")

        ds_file = '{}.bin'.format(dst_file)
        ds = indexed_dataset.make_builder(ds_file, impl="bin", vocab_size=len(vocab))
        with open(src_file, 'r') as reader:
            for line in reader:
                line = json_io.json_loads(line)
                ds.add_item(line)
        ds.finalize('{}.idx'.format(dst_file))

    # dfs => dfs
    for mode, lang in itertools.product(MODES, [src_lang, tgt_lang]):
        data_dir = args['preprocess'][f'{mode}pref'].replace('*', lang)
        src_file = f"{data_dir}.dfs"
        PathManager.mkdir(os.path.join(args['preprocess']['destdir'], lang))
        dst_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.dfs")

        offsets = find_offsets(src_file, num_workers)
        pool = None
        if num_workers > 1:
            # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(dst_file, worker_id)
                pool.apply_async(
                    binarize_dfs,
                    (args, src_file, node_dict, prefix, offsets[worker_id], offsets[worker_id + 1]),
                )
            pool.close()

        ds_file = '{}.mmap'.format(dst_file)
        ds = indexed_dataset.make_builder(ds_file, impl="mmap", vocab_size=len(vocab))
        end = offsets[1]
        with file_io.open(src_file, 'r') as reader:
            reader.seek(0)
            line = file_io.safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                line = json_io.json_loads(line)
                dfs = torch.IntTensor([node_dict.index(tok) for tok in line])
                ds.add_item(dfs)
                line = reader.readline()

        if num_workers > 1:
            # p1-pN
            pool.join()
            # merge sub-processors' index and data files into final files and delete them.
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(dst_file, worker_id)
                ds.merge_file_(temp_file_path)
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize('{}.idx'.format(dst_file))
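# Hypothetical sketch of the `binarize_tokens` worker referenced in main() above;
# its real definition is not shown in this excerpt. It is assumed to mirror the
# single-process loop in main(): read its [start, end) byte range of the source
# file, subtokenize each JSON line with the vocab, and write an mmap shard at
# `prefix` that main() later merges via ds.merge_file_().
def binarize_tokens(args, filename, vocab, prefix, start, end):
    ds = indexed_dataset.make_builder('{}.mmap'.format(prefix), impl="mmap", vocab_size=len(vocab))
    with file_io.open(filename, 'r') as reader:
        reader.seek(start)
        line = file_io.safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            code_tokens = vocab.subtokenize(json_io.json_loads(line))
            ds.add_item(torch.IntTensor(vocab.tokens_to_indices(code_tokens)))
            line = reader.readline()
    ds.finalize('{}.idx'.format(prefix))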