Example #1
0
    def setup_task(cls, args, **kwargs):
        """Setup the task (e.g., load dictionaries).

        Args:
            args (argparse.Namespace): parsed command-line arguments
        """
        paths = utils.split_paths(args['task']['data'])
        assert len(paths) > 0
        # load dictionaries
        # src_dict = cls.load_dictionary(os.path.join(paths[0], 'csnjs_8k_9995p_unigram_url.dict.txt'))
        src_dict = Dictionary(extra_special_symbols=[
            constants.CLS, constants.SEP, constants.MASK, constants.EOL,
            constants.URL
        ])
        src_dict.add_from_file(args['dataset']['srcdict'])
        tgt_dict = Dictionary.load(args['dataset']['tgtdict'])

        # src_dict = cls.load_dictionary(os.path.join(paths[0], '{}.dict.txt'.format(args['task']['source_lang'])))
        # tgt_dict = cls.load_dictionary(os.path.join(paths[0], '{}.dict.txt'.format(args['task']['target_lang'])))
        # assert src_dict.pad() == tgt_dict.pad()
        # assert src_dict.eos() == tgt_dict.eos()
        # assert src_dict.unk() == tgt_dict.unk()
        # LOGGER.info('[{}] dictionary: {} types'.format(args['task']['source_lang'], len(src_dict)))
        # LOGGER.info('[{}] dictionary: {} types'.format(args['task']['target_lang'], len(tgt_dict)))

        return cls(args, src_dict, tgt_dict)
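A minimal invocation sketch for the setup_task above, assuming the nested args layout implied by the lookups (args['task']['data'], args['dataset']['srcdict'], args['dataset']['tgtdict']); the task class name and every path below are hypothetical.

# Hypothetical usage; SummarizationTask and all paths are assumptions.
args = {
    'task': {'data': '~/code_search_net/summarization/data-mmap'},
    'dataset': {
        'srcdict': '~/code_search_net/csnjs_8k_9995p_unigram_url.dict.txt',
        'tgtdict': '~/code_search_net/summarization/dict.txt',
    },
}
task = SummarizationTask.setup_task(args)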
Example #2
0
    def build_dictionary(cls,
                         filenames,
                         tokenize_func,
                         workers=1,
                         threshold=-1,
                         nwords=-1,
                         padding_factor=8):
        """Build the dictionary

        Args:
            filenames (list): list of filenames
            tokenize_func (callable): splits a line into tokens
            workers (int): number of concurrent workers
            threshold (int): defines the minimum word count
            nwords (int): defines the total number of words in the final dictionary,
                including special symbols
            padding_factor (int): can be used to pad the dictionary size to be a
                multiple of 8, which is important on some hardware (e.g., Nvidia
                Tensor Cores).
        """
        is_sbt = filenames[0].endswith('sbt')
        if is_sbt:
            d = SBTDictionary()
            for filename in filenames:
                SBTDictionary.add_token_to_dictionary(filename, d,
                                                      tokenize_func, workers)
        else:
            d = Dictionary()
            for filename in filenames:
                Dictionary.add_token_to_dictionary(filename, d, tokenize_func,
                                                   workers)

        d.finalize(threshold=threshold,
                   nwords=nwords,
                   padding_factor=padding_factor)
        return d
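A hedged usage sketch for build_dictionary: the calling task class, filenames, and tokenizer are assumptions; because the filenames do not end in 'sbt', the plain Dictionary branch above would run, and d.save follows the pattern used in Example #13.

# Hypothetical usage; TaskCls, the filenames, and the tokenizer are assumptions.
tokenize = lambda line: line.strip().split()
d = TaskCls.build_dictionary(
    ['train.code_tokens', 'valid.code_tokens'],
    tokenize_func=tokenize,
    workers=4,
    threshold=-1,        # keep every token
    nwords=50000,        # cap the final vocabulary size
    padding_factor=8,    # pad the size to a multiple of 8
)
d.save('code_tokens.dict.txt')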
Example #3
0
    def load_dictionary(cls, filename):
        """Load the dictionary from the filename

        Args:
            filename (str): the filename
        """
        if filename.endswith('.txt'):
            return Dictionary.load(filename)
        else:
            return Dictionary.load_json(filename)
Example #4
0
    def load_dictionary(cls, filename):
        """Load the dictionary from the filename

        Args:
            filename (str): the filename
        """
        if filename.endswith('.txt'):
            return Dictionary.load(filename)
        else:
            is_bpe = os.path.basename(filename).split('.')[-3] == 'bpe'
            if is_bpe:
                return RetrievalDictionary.load_json(filename)
            else:
                return Dictionary.load_json(filename)
Example #5
0
 def load(cls, f):
     subtoken_dict = Dictionary.load(f)
     splitted_filenames = f.rsplit('.', 2)
     bpe_f = '.'.join([splitted_filenames[0], 'bpe'] +
                      splitted_filenames[-2:])
     bpetoken_dict = WordBpeDicionary.load(bpe_f)
     return cls(subtoken_dict, bpetoken_dict)
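The rsplit above implies a file-naming convention: a subtoken dictionary <prefix>.dict.jsonl sits next to a BPE dictionary <prefix>.bpe.dict.jsonl. A hedged illustration, assuming this classmethod belongs to the HybridRetrievalDictionary of Example #12 and using a made-up filename:

# Given f = 'code_tokens.dict.jsonl':
#   f.rsplit('.', 2)  ->  ['code_tokens', 'dict', 'jsonl']
#   bpe_f             ->  'code_tokens.bpe.dict.jsonl'
hybrid = HybridRetrievalDictionary.load('code_tokens.dict.jsonl')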
Example #6
0
    def setup_task(cls, args, **kwargs):
        """Setup the task.
        """
        # paths = args.data.split(':')
        paths = utils.split_paths(args['task']['data'])
        assert len(paths) > 0
        dictionary = Dictionary.load(os.path.join(paths[0], 'dict.jsonl'))

        data_path = paths[0]
        if args['task']['langs'] is None:
            languages = sorted([
                name for name in os.listdir(data_path)
                if os.path.isdir(os.path.join(data_path, name))
            ])
        else:
            languages = args['task']['langs']  # .split(',')

        if args['task']['add_lang_token']:
            for lang in languages:
                dictionary.add_symbol('[{}]'.format(lang))

        LOGGER.info("Loading dictionary: {} types".format(len(dictionary)))
        # if not hasattr(args, 'shuffle_instance'):
        #     args.shuffle_instance = False
        return cls(args, dictionary)
Example #7
0
    def load_dictionary(cls, filename):
        """Load the dictionary from the filename

        Args:
            filename (str): the filename
        """
        return Dictionary.load(filename)
Example #8
0
 def save_lang_dict():
     src_file = PathManager.expanduser("~/clcdsa/astbert/data-mmap/lang.jsonl")
     lang_dict = Dictionary.load(src_file)  # avoid shadowing the built-in name `dict`
     tgt_file = os.path.join(args['preprocess']['destdir'], 'lang.jsonl')
     PathManager.mkdir(os.path.dirname(tgt_file))
     lang_dict.save(tgt_file)
     return lang_dict
Example #9
0
 def setup_task(cls, args, **kwargs):
     """Setup the task.
     """
     dictionary = Dictionary.load(os.path.join(args.data, 'dict.txt'))
     LOGGER.info('dictionary: {} types'.format(len(dictionary)))
     if not hasattr(args, 'shuffle_instance'):
         args.shuffle_instance = False
     return cls(args, dictionary)
Example #10
0
    def load_dictionary(cls, filename):
        """Load the dictionary from the filename

        Args:
            filename (str): the filename
        """
        is_sbt = os.path.basename(filename).startswith('sbt')
        if is_sbt:
            return SBTDictionary.load_json(filename)
        else:
            return Dictionary.load_json(filename)
Example #11
0
    def build_dictionary(
        cls, filenames, workers=1, threshold=-1, nwords=-1, padding_factor=8,
        tokenize_func=SPACE_SPLITTER,
        **kwargs,
    ):
        """Build the dictionary

        Args:
            filenames (list): list of filenames
            tokenize_func (callable): splits a line into tokens
            workers (int): number of concurrent workers
            threshold (int): defines the minimum word count
            nwords (int): defines the total number of words in the final dictionary,
                including special symbols
            padding_factor (int): can be used to pad the dictionary size to be a
                multiple of 8, which is important on some hardware (e.g., Nvidia
                Tensor Cores).
        """
        d = Dictionary(
            pad=kwargs.get('pad', constants.PAD),
            bos=kwargs.get('bos', constants.BOS),
            eos=kwargs.get('eos', constants.EOS),
            unk=kwargs.get('unk', constants.UNK),
            extra_special_symbols=kwargs.get('extra_special_symbols', None),
        )
        for filename in filenames:
            Dictionary.add_file_to_dictionary(
                filename, d, tokenize_func, d.eos_word, workers
            )
        d.finalize(threshold=threshold, nwords=nwords, padding_factor=padding_factor)
        return d
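A hedged usage sketch showing how the **kwargs override the special symbols; the calling class and filename are assumptions, while SPACE_SPLITTER and constants are the names already used above.

# Hypothetical usage; TaskCls and the filename are assumptions.
d = TaskCls.build_dictionary(
    ['train.docstring_tokens'],
    workers=2,
    nwords=30000,
    tokenize_func=SPACE_SPLITTER,
    extra_special_symbols=[constants.CLS, constants.SEP, constants.MASK],
)
d.save('docstring_tokens.dict.jsonl')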
Example #12
0
    def build_bpe_dictionary(
        cls,
        filenames,
        tokenize_func,
        workers=1,
        threshold=-1,
        nwords=-1,
        padding_factor=8,
        **special_symbols,
    ):
        """Build the dictionary

        Args:
            filenames (list): list of filenames
            tokenize_func (callable): splits a line into tokens
            workers (int): number of concurrent workers
            threshold (int): defines the minimum word count
            nwords (int): defines the total number of words in the final dictionary,
                including special symbols
            padding_factor (int): can be used to pad the dictionary size to be a
                multiple of 8, which is important on some hardware (e.g., Nvidia
                Tensor Cores).
        """
        bpe_portion = special_symbols.get('bpe_portion', 0.5)
        bpetoken_num = int(nwords * bpe_portion)
        subtoken_num = nwords - bpetoken_num
        # subtoken
        from ncc.data import constants
        subtoken_d = Dictionary(
            pad=special_symbols.get('pad', constants.PAD),
            bos=special_symbols.get('bos', constants.BOS),
            eos=special_symbols.get('eos', constants.EOS),
            unk=special_symbols.get('unk', constants.UNK),
            extra_special_symbols=special_symbols.get('extra_special_symbols',
                                                      None),
        )
        for filename in filenames:
            Dictionary.add_token_to_dictionary(filename, subtoken_d,
                                               tokenize_func, workers)
        remaining_tokens = Counter(
            {sym: c
             for sym, c in zip(subtoken_d.symbols, subtoken_d.count)})
        subtoken_d.finalize(threshold=threshold,
                            nwords=subtoken_num,
                            padding_factor=padding_factor)
        remaining_tokens = Counter({
            sym: c
            for sym, c in remaining_tokens.items() if sym not in subtoken_d
        })
        # bpetoken
        from ncc.data.retrieval.word_bpe_dictionary import WordBpeDicionary
        bpetoken_d = WordBpeDicionary()
        bpetoken_d.learn_bpe_vocab(remaining_tokens.elements(), bpetoken_num)
        bpetoken_d.finalize(threshold=0,
                            nwords=bpetoken_num,
                            padding_factor=padding_factor)
        from ncc.data.retrieval.hybrid.hybrid_retrieval_dictionary import HybridRetrievalDictionary
        return HybridRetrievalDictionary(subtoken_d, bpetoken_d)
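A short worked illustration of the vocabulary split performed above (the numbers are arbitrary): nwords is divided between whole subtokens and BPE tokens according to bpe_portion, and only tokens pruned from the subtoken dictionary feed the BPE learner.

# Arbitrary numbers, for illustration only.
nwords, bpe_portion = 50000, 0.5
bpetoken_num = int(nwords * bpe_portion)   # 25000 entries learned as BPE
subtoken_num = nwords - bpetoken_num       # 25000 entries kept as whole subtokens
# remaining_tokens holds every counted symbol that did not survive subtoken_d.finalize();
# its .elements() stream is what learn_bpe_vocab consumes.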
Example #13
0
 def save_token_dict():
     src_file = os.path.join(os.path.dirname(SPM_VOCAB_FILE), 'dict.txt')
     tgt_file = os.path.join(args['preprocess']['destdir'], 'dict.jsonl')
     # Dictionary.text_to_jsonl(src_file, tgt_file)
     vocab = Dictionary()
     with file_io.open(src_file, 'r') as reader:
         for line in reader:
             token, num = line.strip().split()
             vocab.add_symbol(token, int(num))  # dict.txt stores integer counts
     vocab.save(tgt_file)
     return vocab
Example #14
0
    def load_dictionary(cls, filename):
        """Load the dictionary from the filename

        Args:
            filename (str): the filename
        """
        if filename.endswith('.txt'):
            dictionary = Dictionary(extra_special_symbols=[
                constants.CLS, constants.SEP, constants.MASK, constants.EOL,
                constants.URL
            ])
            dictionary.add_from_file(filename)
        else:
            dictionary = Dictionary(extra_special_symbols=[
                constants.CLS, constants.SEP, constants.MASK, constants.EOL,
                constants.URL
            ]).add_from_json_file(filename)
        return dictionary
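A hedged usage sketch: the same loader handles both the plain-text and the JSON dictionary formats. The task class is hypothetical; the .txt filename echoes the one commented out in Example #1, and the .jsonl filename is made up.

# Hypothetical usage; SummarizationTask is an assumption.
txt_dict = SummarizationTask.load_dictionary('csnjs_8k_9995p_unigram_url.dict.txt')
json_dict = SummarizationTask.load_dictionary('code_tokens.dict.jsonl')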
Example #15
0
def cli_main():
    SEED = 204
    BATCH_SIZE = 64
    MAX_SOURCE_POSITIONS = 1024
    EPOCH = 50

    from ncc.utils.set_seed import set_seed
    set_seed(SEED)

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        device = os.environ.get('CUDA_VISIBLE_DEVICES', '0').split(',')[0]  # first visible device as default
        torch.cuda.set_device(f'cuda:{device}')
    criterion = DeepTuneLoss(task=None, sentence_avg=-1)
    if use_cuda:
        criterion = criterion.cuda()

    data = []
    for i, platform in enumerate(LANGUAGES):
        DATA_DIR = os.path.join(DATASET_DIR, f'mapping/{platform}/data-mmap')

        def get_attr(attr):
            oracle_file = os.path.join(DATA_DIR, f'train.{attr}')
            with open(oracle_file, 'rb') as reader:
                out = pickle.load(reader)
            return np.asarray(out)

        platform_name = mapping_metrics.platform2str(platform)
        benchmarks = get_attr('benchmark')
        runtime_cpus = get_attr('runtime_cpu')
        runtime_gpus = get_attr('runtime_gpu')

        #################### load dataset ####################
        src_dataset = load_mmap_dataset(os.path.join(DATA_DIR, 'train.src_tokens'))
        src_dataset = TruncateDataset(src_dataset, truncation_length=MAX_SOURCE_POSITIONS, truncate_prefix=0)
        tgt_dataset = load_mmap_dataset(os.path.join(DATA_DIR, 'train.oracle'))

        src_dict = Dictionary.load(os.path.join(DATA_DIR, 'src_tokens.dict.jsonl'))
        src_aux = OrderedDict()
        src_aux['transfer'] = get_attr('transfer')
        src_aux['wgsize'] = get_attr('wgsize')

        tgt_dict = Dictionary.load(os.path.join(DATA_DIR, 'oracle.dict.jsonl'))

        dataset = LanguagePairDataset(
            src=src_dataset, src_sizes=src_dataset.sizes, src_dict=src_dict, src_aux=src_aux,
            tgt=tgt_dataset, tgt_sizes=tgt_dataset.sizes, tgt_dict=tgt_dict, tgt_aux=None,
            left_pad_source=True, max_source_positions=MAX_SOURCE_POSITIONS,
        )
        #################### load dataset ####################

        # build toy dataset for 10-fold cross validation
        tgt_data = [tgt_dataset[idx].item() for idx in range(len(tgt_dataset))]
        src_data = [None] * len(tgt_data)

        # 10-fold cross-validation
        kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)
        for j, (train_ids, test_ids) in enumerate(kf.split(src_data, tgt_data)):
            # deeptune model
            model = DeepTuneEncoder(dictionary=src_dict, embed_dim=64,
                                    rnn_cell='lstm', rnn_hidden_dim=64, rnn_dropout=0., rnn_num_layers=2,
                                    aux_dim=2, inner_dim=32, out_dim=2)
            if use_cuda:
                model = model.cuda()
            optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
            for epoch_i in range(EPOCH):
                if dataset.shuffle:
                    random.shuffle(train_ids)
                train_batch_sampler = data_utils.batch_by_size(
                    train_ids,
                    num_tokens_fn=lambda *args: -1,
                    max_sentences=BATCH_SIZE,
                )
                train_dataloader = DataLoader(dataset=dataset,
                                              batch_sampler=train_batch_sampler,
                                              collate_fn=collate, )
                with tqdm(total=len(train_dataloader)) as t:
                    for sample_i, sample in enumerate(train_dataloader, start=1):
                        t.set_description(f'Epoch {epoch_i + 1}/{EPOCH} Batch {sample_i}/{len(train_dataloader)}')
                        if use_cuda:
                            sample = move_to_cuda(sample)
                        loss, sample_size, logging_output = criterion(model, sample)
                        loss.div_(sample_size)
                        t.set_postfix(loss=loss.item())
                        t.update()

                        optimizer.zero_grad()
                        loss.backward()
                        optimizer.step()

            # test accuracy
            test_batch_sampler = data_utils.batch_by_size(
                test_ids,
                num_tokens_fn=lambda *args: -1,
                max_sentences=BATCH_SIZE,
            )
            test_dataloader = DataLoader(dataset=dataset,
                                         batch_sampler=test_batch_sampler,
                                         collate_fn=collate, )
            predictions, ground_truth = [], []
            for sample in test_dataloader:
                if use_cuda:
                    sample = move_to_cuda(sample)
                hybrid_out, _ = model(**sample['net_input'])
                predictions.append(hybrid_out.max(dim=-1)[1])
                ground_truth.append(sample['target'].view(-1))
            predictions = torch.cat(predictions)
            ground_truth = torch.cat(ground_truth)

            accuracy = (predictions == ground_truth).tolist()
            # runtimes of baseline mapping (CPU on AMD, GPU on NVIDIA)
            gt_runtimes = (runtime_cpus if platform == "amd" else runtime_gpus)[test_ids]
            pred_runtimes = [
                (runtime_cpus if pred == 0 else runtime_gpus)[idx]
                for idx, pred in zip(test_ids, predictions)
            ]
            speedup = gt_runtimes / pred_runtimes

            # record results
            for benchmark_, o_, p_, accuracy_, p_speedup_ in \
                zip(benchmarks[test_ids], ground_truth, predictions, accuracy, speedup):
                data.append({
                    "Model": model.__class__.__name__,
                    "Platform": platform_name,
                    'Benchmark': mapping_metrics.escape_benchmark_name(benchmark_),
                    'Benchmark Suite': mapping_metrics.escape_suite_name(benchmark_),
                    "Oracle Mapping": o_,
                    "Predicted Mapping": p_,
                    "Accuracy": accuracy_,
                    "Speedup": p_speedup_,
                })
            del model, optimizer
    performance = pd.DataFrame(
        data, index=range(1, len(data) + 1), columns=[
            "Model",
            "Platform",
            "Benchmark",
            "Benchmark Suite",
            "Oracle Mapping",
            "Predicted Mapping",
            "Accuracy",
            "Speedup"
        ])
    benchmark_out = performance.groupby(['Platform', 'Benchmark Suite'])[['Accuracy', 'Speedup']].mean()
    benchmark_out['Accuracy'] = round(benchmark_out['Accuracy'] * 100, 2)
    benchmark_out['Speedup'] = round(benchmark_out['Speedup'], 2)
    print(benchmark_out)
    out = performance.groupby(['Platform'])[['Accuracy', 'Speedup']].mean()
    out['Accuracy'] = round(out['Accuracy'] * 100, 2)
    out['Speedup'] = round(out['Speedup'], 2)
    print(out)
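A brief worked example of the speedup metric computed above (values are made up): the baseline runtime is the static mapping (CPU on AMD, GPU on NVIDIA), and the prediction decides which measured runtime is charged.

# Made-up numbers, for illustration only.
gt_runtime = 2.0      # baseline mapping, e.g. always-GPU on an NVIDIA platform
pred_runtime = 1.6    # runtime of the device the model actually predicted
speedup = gt_runtime / pred_runtime   # 1.25; values below 1.0 mean a worse choice than the baseline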
Example #16
0
 def setup_task(cls, args, **kwargs):
     paths = utils.split_paths(args['task']['data'])
     assert len(paths) > 0
     dictionary = Dictionary.load(os.path.join(paths[0], 'dict.txt'))
     LOGGER.info('dictionary: {} types'.format(len(dictionary)))
     return cls(args, dictionary)
Example #17
0
def main(args):
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'],
                                              args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])

    from ncc.data.dictionary import TransformersDictionary
    vocab = TransformersDictionary.from_pretrained(
        'microsoft/graphcodebert-base')

    file = os.path.join(args['preprocess']['destdir'], 'dfs.jsonl')
    node_dict = Dictionary.load(file)

    # 2. ***************build dataset********************
    # dump into pkl file
    # transform a language's code into src format and tgt format simultaneously
    num_workers = args['preprocess']['workers']
    src_lang, tgt_lang = args['preprocess']['src_lang'], args['preprocess'][
        'tgt_lang']

    # code tokens => code tokens
    for mode, lang in itertools.product(MODES, [src_lang, tgt_lang]):
        data_dir = args['preprocess'][f'{mode}pref'].replace('*', lang)
        src_file = f"{data_dir}.code_tokens"
        PathManager.mkdir(os.path.join(args['preprocess']['destdir'], lang))
        dst_file = os.path.join(args['preprocess']['destdir'], lang,
                                f"{mode}.code_tokens")
        offsets = find_offsets(src_file, num_workers)
        pool = None
        if num_workers > 1:
            # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(dst_file, worker_id)
                pool.apply_async(
                    binarize_tokens,
                    (args, src_file, vocab, prefix, offsets[worker_id],
                     offsets[worker_id + 1]),
                )
            pool.close()

        ds_file = '{}.mmap'.format(dst_file)
        ds = indexed_dataset.make_builder(ds_file,
                                          impl="mmap",
                                          vocab_size=len(vocab))
        end = offsets[1]
        with file_io.open(src_file, 'r') as reader:
            reader.seek(0)
            line = file_io.safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                line = json_io.json_loads(line)
                code_tokens = vocab.subtokenize(line)
                code_tokens = torch.IntTensor(
                    vocab.tokens_to_indices(code_tokens))
                ds.add_item(code_tokens)
                line = reader.readline()

        if num_workers > 1:
            # p1-pN
            pool.join()
            # merge sub-processors' index and data files into final files and delete them.
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(dst_file, worker_id)
                ds.merge_file_(temp_file_path)
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize('{}.idx'.format(dst_file))

    # code => code
    for mode, lang in itertools.product(MODES, [src_lang, tgt_lang]):
        data_dir = args['preprocess'][f'{mode}pref'].replace('*', lang)
        src_file = f"{data_dir}.code"
        PathManager.mkdir(os.path.join(args['preprocess']['destdir'], lang))
        dst_file = os.path.join(args['preprocess']['destdir'], lang,
                                f"{mode}.code")

        ds_file = '{}.bin'.format(dst_file)
        ds = indexed_dataset.make_builder(ds_file,
                                          impl="bin",
                                          vocab_size=len(vocab))
        with open(src_file, 'r') as reader:
            for line in reader:
                line = json_io.json_loads(line)
                ds.add_item(line)
        ds.finalize('{}.idx'.format(dst_file))

    # dfs => dfs
    for mode, lang in itertools.product(MODES, [src_lang, tgt_lang]):
        data_dir = args['preprocess'][f'{mode}pref'].replace('*', lang)
        src_file = f"{data_dir}.dfs"
        PathManager.mkdir(os.path.join(args['preprocess']['destdir'], lang))
        dst_file = os.path.join(args['preprocess']['destdir'], lang,
                                f"{mode}.dfs")
        offsets = find_offsets(src_file, num_workers)
        pool = None
        if num_workers > 1:
            # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(dst_file, worker_id)
                pool.apply_async(
                    binarize_dfs,
                    (args, src_file, node_dict, prefix, offsets[worker_id],
                     offsets[worker_id + 1]),
                )
            pool.close()

        ds_file = '{}.mmap'.format(dst_file)
        ds = indexed_dataset.make_builder(ds_file,
                                          impl="mmap",
                                          vocab_size=len(node_dict))
        end = offsets[1]
        with file_io.open(src_file, 'r') as reader:
            reader.seek(0)
            line = file_io.safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                line = json_io.json_loads(line)
                dfs = torch.IntTensor([node_dict.index(tok) for tok in line])
                ds.add_item(dfs)
                line = reader.readline()

        if num_workers > 1:
            # p1-pN
            pool.join()
            # merge sub-processors' index and data files into final files and delete them.
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(dst_file, worker_id)
                ds.merge_file_(temp_file_path)
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize('{}.idx'.format(dst_file))
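The worker functions binarize_tokens and binarize_dfs dispatched via pool.apply_async are not shown in this snippet; below is a hedged sketch of what binarize_dfs could look like, mirroring the single-process mmap branch above (the real signature and behavior may differ).

# Hedged sketch only; mirrors the in-process branch above, not the actual worker.
def binarize_dfs(args, filename, node_dict, prefix, start, end):
    ds = indexed_dataset.make_builder('{}.mmap'.format(prefix),
                                      impl="mmap",
                                      vocab_size=len(node_dict))
    with file_io.open(filename, 'r') as reader:
        reader.seek(start)
        line = file_io.safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            line = json_io.json_loads(line)
            ds.add_item(torch.IntTensor([node_dict.index(tok) for tok in line]))
            line = reader.readline()
    ds.finalize('{}.idx'.format(prefix))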