예제 #1
0
def repr_code(args):
    """Encode the "use" dataset with the model's code encoder and save chunks.

    Builds the model named by ``args.model``, optionally restores the
    checkpoint ``step{args.reload_from}.h5``, runs ``model.code_encoding`` on
    every batch and writes the resulting float32 vectors to
    ``<data_path><use_codevecs minus last 3 chars>_part<k>.h5`` files, starting
    a new file each time at least ``args.chunk_size`` samples have accumulated.

    Args:
        args: Namespace providing ``gpu_id``, ``model``, ``dataset``,
            ``reload_from``, ``data_path``, ``batch_size`` and ``chunk_size``.
    """
    device = torch.device(
        f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu")
    config = getattr(configs, 'config_' + args.model)()

    ##### Define model ######
    logger.info('Constructing Model..')
    model = getattr(models, args.model)(config)  # initialize the model
    if args.reload_from > 0:
        ckpt_path = f'./output/{args.model}/{args.dataset}/models/step{args.reload_from}.h5'
        model.load_state_dict(torch.load(ckpt_path, map_location=device))
    model = model.to(device)
    model.eval()

    data_path = args.data_path + args.dataset + '/'
    # NOTE(review): eval() resolves the dataset class from a config string;
    # this is only safe as long as the config is trusted.
    use_set = eval(config['dataset_name'])(
        data_path, config['use_names'], config['name_len'], config['use_apis'],
        config['api_len'], config['use_tokens'], config['tokens_len'])

    data_loader = torch.utils.data.DataLoader(dataset=use_set,
                                              batch_size=args.batch_size,
                                              shuffle=False,
                                              drop_last=False,
                                              num_workers=1)

    # Loop invariants: whether to normalize, and the common output prefix
    # (the configured file name with its last three characters stripped).
    use_cosine = config['sim_measure'] == 'cos'
    out_prefix = f"{data_path}{config['use_codevecs'][:-3]}"

    chunk_id = 0
    vecs, n_processed = [], 0
    for batch in tqdm(data_loader):
        batch_gpu = [tensor.to(device) for tensor in batch]
        with torch.no_grad():
            reprs = model.code_encoding(*batch_gpu).detach().cpu().numpy()
        reprs = reprs.astype(np.float32)  # [batch x dim]
        if use_cosine:  # do normalization for fast cosine computation
            reprs = normalize(reprs)
        vecs.append(reprs)
        n_processed += batch[0].size(0)
        print("n_processed ", n_processed, " args.chunk_size ",
              args.chunk_size)
        if n_processed >= args.chunk_size:
            save_vecs(np.vstack(vecs), f"{out_prefix}_part{chunk_id}.h5")
            chunk_id += 1
            vecs, n_processed = [], 0

    # Save the last (probably incomplete) chunk. When the last batch exactly
    # filled a chunk there is nothing left, and np.vstack([]) would raise.
    if vecs:
        output_path = f"{out_prefix}_part{chunk_id}.h5"
        print("repr_code saved at ", output_path)
        save_vecs(np.vstack(vecs), output_path)
    elif chunk_id == 0:
        logger.warning('No code vectors were produced; nothing was saved.')
예제 #2
0
 def repr_code(self,model):
     """Encode the whole "use" code set, normalize the vectors and save them.

     Args:
         model: Module exposing ``code_encoding(names, apis, toks)``; it is
             switched to eval mode here.

     Returns:
         The result of ``normalize`` applied to all code vectors stacked
         along axis 0 (the same array that is passed to ``save_vecs``).

     Raises:
         ValueError: If the data loader yields no batches.
     """
     model.eval()
     use_set = CodeSearchDataset(self.conf['workdir'],
                                   self.conf['use_names'],self.conf['name_len'],
                                   self.conf['use_apis'],self.conf['api_len'],
                                   self.conf['use_tokens'],self.conf['tokens_len'])

     data_loader = torch.utils.data.DataLoader(dataset=use_set, batch_size=1000,
                                        shuffle=False, drop_last=False, num_workers=1)
     # Collect per-batch arrays and concatenate once at the end; concatenating
     # inside the loop re-copies everything accumulated so far on every batch.
     chunks = []
     # Pure inference: disable autograd so no graph is kept per batch.
     with torch.no_grad():
         for names, apis, toks in data_loader:
             names, apis, toks = [tensor.to(self.device) for tensor in (names, apis, toks)]
             reprs = model.code_encoding(names, apis, toks)
             chunks.append(reprs.detach().cpu().numpy())
     if not chunks:
         raise ValueError("repr_code: the code dataset produced no batches")
     vecs = normalize(np.concatenate(chunks, 0))
     save_vecs(vecs,self.path+self.conf['use_codevecs'])
     return vecs
예제 #3
0
def repr_code(args):
    """Encode the "use" dataset with the code encoder and save one vector file.

    Builds the model named by ``args.model``, optionally restores the
    checkpoint ``epo{args.reload_from}.h5``, runs ``model.code_encoding`` on
    every batch, normalizes the stacked vectors and writes them to
    ``data_path + config['use_codevecs']``.

    Args:
        args: Namespace providing ``gpu_id``, ``model``, ``dataset``,
            ``reload_from`` and ``data_path``.

    Raises:
        ValueError: If the data loader yields no batches.
    """
    device = torch.device(
        f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu")
    config = getattr(configs, 'config_' + args.model)()

    ##### Define model ######
    logger.info('Constructing Model..')
    model = getattr(models, args.model)(config)  # initialize the model
    if args.reload_from > 0:
        ckpt_path = f'./output/{args.model}/{args.dataset}/models/epo{args.reload_from}.h5'
        # map_location lets a checkpoint saved on GPU load on a CPU-only host,
        # matching the CPU fallback chosen for `device` above.
        model.load_state_dict(torch.load(ckpt_path, map_location=device))
    model = model.to(device)
    model.eval()

    data_path = args.data_path + args.dataset + '/'
    # NOTE(review): eval() resolves the dataset class from a config string;
    # this is only safe as long as the config is trusted.
    use_set = eval(config['dataset_name'])(
        data_path, config['use_names'], config['name_len'], config['use_apis'],
        config['api_len'], config['use_tokens'], config['tokens_len'])

    data_loader = torch.utils.data.DataLoader(dataset=use_set,
                                              batch_size=1000,
                                              shuffle=False,
                                              drop_last=False,
                                              num_workers=1)

    # Collect per-batch arrays and concatenate once at the end; concatenating
    # inside the loop re-copies everything accumulated so far on every batch.
    chunks = []
    with torch.no_grad():
        for batch in tqdm(data_loader):
            batch_gpu = [tensor.to(device) for tensor in batch]
            reprs = model.code_encoding(*batch_gpu)
            chunks.append(reprs.detach().cpu().numpy())
    if not chunks:
        raise ValueError("repr_code: the code dataset produced no batches")
    vecs = normalize(np.concatenate(chunks, 0))
    save_vecs(vecs, data_path + config['use_codevecs'])
예제 #4
0
def repr_code(args, ast2id, code2id, nl2id, id2nl):
    """Encode the training set with ``model.getcodevec`` and save chunks.

    Builds the model named by ``args.model`` (constructed with ``ast2id``),
    optionally restores the checkpoint ``step{args.reload_from}.h5``, encodes
    every batch of a ``TreeDataSet`` built from ``train.json`` and writes the
    float32 vectors to
    ``<data_path><use_codevecs minus last 3 chars>_part<k>.h5`` files, starting
    a new file each time at least ``args.chunk_size`` samples have accumulated.

    Args:
        args: Namespace providing ``gpu_id``, ``model``, ``dataset``,
            ``datasave``, ``reload_from``, ``data_path``, ``batch_size``,
            ``chunk_size``, ``code_max_len``, ``max_simple_name_len``, ``k``
            and ``comment_max_len``.
        ast2id: Passed to the model constructor and to ``TreeDataSet``.
        code2id: Not used by this function.
        nl2id: Passed to ``TreeDataSet``.
        id2nl: Not used by this function.
    """
    # The whole routine is inference only, so autograd is disabled throughout.
    with torch.no_grad():
        device = torch.device(
            f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu")
        config = getattr(configs, 'config_' + args.model)()

        ##### Define model ######
        logger.info('Constructing Model..')
        logger.info(os.getcwd())
        model = getattr(models, args.model)(config,
                                            ast2id)  # initialize the model
        if args.reload_from > 0:
            ckpt_path = f'./output/{args.model}/{args.dataset}/models/step{args.reload_from}.h5'
            model.load_state_dict(torch.load(ckpt_path, map_location=device))
        model = model.to(device)
        model.eval()

        data_path = args.data_path + args.datasave + '/'
        train_data_set = TreeDataSet(
            file_name=args.data_path + '/train.json',
            ast_path=args.data_path + '/tree/train/',
            ast2id=ast2id,
            nl2id=nl2id,
            max_ast_size=args.code_max_len,
            max_simple_name_size=args.max_simple_name_len,
            k=args.k,
            max_comment_size=args.comment_max_len,
            use_code=True,
            desc=config['valid_desc'],
            desclen=config['desc_len'])

        # NOTE(review): shuffle=True means the saved vectors are not in dataset
        # order; confirm that downstream lookup does not rely on row index.
        data_loader = DataLoaderX(dataset=train_data_set,
                                  batch_size=args.batch_size,
                                  shuffle=True,
                                  num_workers=2)

        # Loop invariants: whether to normalize, and the common output prefix
        # (the configured file name with its last three characters stripped).
        use_cosine = config['sim_measure'] == 'cos'
        out_prefix = f"{data_path}{config['use_codevecs'][:-3]}"

        chunk_id = 0
        vecs, n_processed = [], 0
        for batch in tqdm(data_loader):
            torch.cuda.empty_cache()
            batch_gpu = [tensor.to(device).long() for tensor in batch]
            reprs = model.getcodevec(*batch_gpu).detach().cpu().numpy()
            reprs = reprs.astype(np.float32)  # [batch x dim]
            if use_cosine:  # do normalization for fast cosine computation
                reprs = normalize(reprs)
            vecs.append(reprs)
            n_processed += batch[0].size(0)
            if n_processed >= args.chunk_size:
                save_vecs(np.vstack(vecs), f"{out_prefix}_part{chunk_id}.h5")
                chunk_id += 1
                vecs, n_processed = [], 0

        # Save the last (probably incomplete) chunk. When the last batch
        # exactly filled a chunk nothing is left, and np.vstack([]) would raise.
        if vecs:
            save_vecs(np.vstack(vecs), f"{out_prefix}_part{chunk_id}.h5")
        elif chunk_id == 0:
            logger.warning('No code vectors were produced; nothing was saved.')