    def eval(self, model, poolsize, K, dataset):
        """
        simple validation in a code pool.
        :param model: Trained Model
        :param poolsize: poolsize - size of the code pool, if -1, load the whole test set
        :param K: Top K results
        :param dataset: which dataset to evaluate on
        :return: Accuracy, MRR, MAP, nDCG
        """
        if self.valid_set is None:  # load evaluation dataset
            if dataset == "train":
                self.valid_set = CodeSearchDataset(self.path, self.conf,
                                                   "train")  # load train set
            elif dataset == "val":
                self.valid_set = CodeSearchDataset(self.path, self.conf,
                                                   "val")  # load val set
            else:
                self.valid_set = CodeSearchDataset(self.path, self.conf,
                                                   "test")  # load test set

        data_loader = torch.utils.data.DataLoader(dataset=self.valid_set,
                                                  batch_size=poolsize,
                                                  shuffle=True,
                                                  drop_last=True,
                                                  num_workers=1)

        model = model.eval()
        accs, mrrs, maps, ndcgs = [], [], [], []
        for qts, codes, _, qbs, _ in data_loader:
            qts, codes, qbs = gVar(qts), gVar(codes), gVar(qbs)
            code_repr = model.code_encoding(codes)
            if self.conf['use_qb']:
                qb_repr = model.qb_encoding(qbs)
            else:
                qb_repr = None

            for i in range(poolsize):
                qt = gVar(qts[i].expand(poolsize, -1))
                qt_repr = model.qt_encoding(qt)

                sims = model.score_qt_code_qb(qt_repr, code_repr,
                                              qb_repr).data.cpu().numpy()
                # sims = model.combine_qt_and_code(qt_repr, code_repr).data.cpu().numpy()
                # sims = F.cosine_similarity(torch.concat(qt_repr,code_repr)).data.cpu().numpy()
                # n_results = K

                negsims = np.negative(sims)
                predict = np.argsort(negsims)
                # predict = np.argpartition(negsims, kth=n_results-1)
                # predict = predict[:n_results]
                predict = [int(k) for k in predict]
                real = [i]  # index of positive sample
                accs.append(ACC(real, predict))
                mrrs.append(MRR(real, predict))
                maps.append(MAP(real, predict))
                ndcgs.append(NDCG(real, predict))
        logger.info('ACC={}, MRR={}, MAP={}, nDCG={}'.format(
            np.mean(accs), np.mean(mrrs), np.mean(maps), np.mean(ndcgs)))
        return np.mean(accs), np.mean(mrrs), np.mean(maps), np.mean(ndcgs)
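
All of the examples below rely on a `gVar` helper that none of the excerpts define. A minimal sketch of what it is assumed to do: convert inputs to torch tensors and move them to the GPU when one is available (the original DeepCS-era code wrapped them in `autograd.Variable`):

import torch

def gVar(data):
    # Assumed helper: accept numpy arrays or tensors and return a tensor
    # on the active device. Idempotent, so double-wrapping is harmless.
    tensor = torch.as_tensor(data)
    if torch.cuda.is_available():
        tensor = tensor.cuda()
    return tensor
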
Example #2
    def train(self, model, data_set_class):
        tensorboard_writer = SummaryWriter("runs/exp-1")
        log_every = self.conf['log_every']
        valid_every = self.conf['valid_every']
        save_every = self.conf['save_every']
        batch_size = self.conf['batch_size']
        nb_epoch = self.conf['nb_epoch']

        train_set = data_set_class(
            self.path, self.conf['train_name'], self.conf['name_len'],
            self.conf['train_api'], self.conf['api_len'],
            self.conf['train_tokens'], self.conf['tokens_len'],
            self.conf['train_desc'], self.conf['desc_len'])

        data_loader = torch.utils.data.DataLoader(
            dataset=train_set,
            batch_size=self.conf['batch_size'],
            shuffle=True,
            drop_last=True,
            num_workers=1)

        val_loss = {'loss': 1., 'epoch': 0}

        # `optimizer` is used below but never defined in this excerpt; an Adam
        # optimizer over the model parameters is assumed:
        optimizer = torch.optim.Adam(model.parameters())

        for epoch in range(self.conf['reload'] + 1, nb_epoch):
            itr = 1
            losses = []
            for names, apis, toks, good_descs, bad_descs in data_loader:
                names, apis, toks, good_descs, bad_descs = gVar(names), gVar(
                    apis), gVar(toks), gVar(good_descs), gVar(bad_descs)
                loss = model(names, apis, toks, good_descs, bad_descs)
                losses.append(loss.item())
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                if itr % log_every == 0:
                    tensorboard_writer.add_scalar("loss", np.mean(losses),
                                                  epoch * 10 + itr // 100)
                    logger.info('epo:[%d/%d] itr:%d Loss=%.5f' %
                                (epoch, nb_epoch, itr, np.mean(losses)))
                    losses = []
                itr = itr + 1

            if epoch and epoch % valid_every == 0:
                logger.info("validating..")
                model = model.eval()
                acc1, mrr, map, ndcg = self.eval(model, 1000, 1)
                model = model.train()
                tensorboard_writer.add_scalar("acc1", acc1, epoch)
                tensorboard_writer.add_scalar("mrr", mrr, epoch)
                tensorboard_writer.add_scalar("map", map, epoch)
                tensorboard_writer.add_scalar("ndcg", ndcg, epoch)
                logger.info("acc1 {}".format(acc1))

            if epoch and epoch % save_every == 0:
                self.save_model(model, epoch)

        self.save_model(model, nb_epoch)
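
Each `train` variant checkpoints through `self.save_model` (or `save_model_epoch`), which these excerpts omit. A minimal sketch, assuming checkpoints are stored as state dicts under `self.path` (the file layout is a guess):

import torch

def save_model(self, model, epoch):
    # Assumed layout: one state-dict file per epoch under the working dir.
    torch.save(model.state_dict(), self.path + 'models/epo%d.h5' % epoch)

def load_model(self, model, epoch):
    # Assumed counterpart, used when conf['reload'] > 0.
    model.load_state_dict(torch.load(self.path + 'models/epo%d.h5' % epoch))
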
Example #3
    def repr_code(self, model):
        vecs = None
        use_set = CodeSearchDataset(self.conf['workdir'],
                                    self.conf['use_names'], self.conf['name_len'],
                                    self.conf['use_apis'], self.conf['api_len'],
                                    self.conf['use_tokens'], self.conf['tokens_len'])

        data_loader = torch.utils.data.DataLoader(dataset=use_set,
                                                  batch_size=1000,
                                                  shuffle=False,
                                                  drop_last=False,
                                                  num_workers=1)
        for names, apis, toks in data_loader:
            names, apis, toks = gVar(names), gVar(apis), gVar(toks)
            reprs = model.code_encoding(names, apis, toks).data.cpu().numpy()
            vecs = reprs if vecs is None else np.concatenate((vecs, reprs), 0)
        vecs = normalize(vecs)
        save_vecs(vecs, self.path + self.conf['use_codevecs'])
        return vecs
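
`repr_code` leans on two helpers defined elsewhere. Minimal sketches, assuming `normalize` performs row-wise L2 normalization (so later dot products equal cosine similarity) and `save_vecs` simply dumps the matrix to disk (the original may use HDF5 instead):

import numpy as np

def normalize(vecs):
    # Row-wise L2 normalization, guarding against zero-length rows.
    norms = np.linalg.norm(vecs, axis=1, keepdims=True)
    return vecs / np.maximum(norms, 1e-12)

def save_vecs(vecs, path):
    # Assumed format: a raw .npy dump of float32 vectors.
    np.save(path, vecs.astype(np.float32))
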
Example #4
    def search(self, model, query, n_results=10):
        desc = sent2indexes(
            query, self.vocab_desc)  # convert desc sentence into word indices
        logger.debug("Description representation")
        desc = np.expand_dims(desc, axis=0)
        desc = gVar(desc)
        logger.debug("Description embedding")
        desc_repr = model.eval().desc_encoding(desc).data.cpu().numpy()

        valued = []
        threads = []
        for i, codevecs_chunk in enumerate(self.codevecs):
            # select the best n_results from each chunk
            t = threading.Thread(target=self.search_thread,
                                 args=(valued, desc_repr, codevecs_chunk, i,
                                       n_results))
            threads.append(t)
        for t in threads:
            t.start()
        for t in threads:  # wait until all sub-threads finish
            t.join()

        valued.sort(reverse=True)

        return valued[:n_results]
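
`search` delegates the per-chunk ranking to `self.search_thread`, which is not shown. A sketch consistent with how it is called here, appending `(similarity, chunk_id, code_index)` tuples for a chunk's top results into the shared `valued` list (the variant in Example #8 appends into separate `codes` and `sims` lists instead):

import numpy as np

def search_thread(self, valued, desc_repr, codevecs, chunk_id, n_results):
    # Assumed scoring: dot product against pre-normalized code vectors,
    # i.e. cosine similarity. list.append is atomic under the GIL, so the
    # shared `valued` list needs no explicit lock.
    sims = np.dot(codevecs, desc_repr.T).squeeze(axis=1)
    k = min(n_results, len(sims))
    top = np.argpartition(-sims, kth=k - 1)[:k]
    for idx in top:
        valued.append((float(sims[idx]), chunk_id, int(idx)))
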
Example #5
    def train(self, model):
        model.train()

        log_every = self.conf['log_every']
        valid_every = self.conf['valid_every']
        save_every = self.conf['save_every']
        batch_size = self.conf['batch_size']
        nb_epoch = self.conf['nb_epoch']

        train_set = CodeSearchDataset(
            self.path, self.conf['train_name'], self.conf['name_len'],
            self.conf['train_api'], self.conf['api_len'],
            self.conf['train_tokens'], self.conf['tokens_len'],
            self.conf['train_desc'], self.conf['desc_len'])

        data_loader = torch.utils.data.DataLoader(
            dataset=train_set,
            batch_size=self.conf['batch_size'],
            shuffle=True,
            drop_last=True,
            num_workers=1)

        val_loss = {'loss': 1., 'epoch': 0}

        # `optimizer` is used below but never defined in this excerpt; an Adam
        # optimizer over the model parameters is assumed:
        optimizer = torch.optim.Adam(model.parameters())

        for epoch in range(self.conf['reload'] + 1, nb_epoch):
            itr = 1
            losses = []
            for names, apis, toks, good_descs, bad_descs in data_loader:
                names, apis, toks, good_descs, bad_descs = gVar(names), gVar(
                    apis), gVar(toks), gVar(good_descs), gVar(bad_descs)
                loss = model(names, apis, toks, good_descs, bad_descs)
                losses.append(loss.item())
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                if itr % log_every == 0:
                    logger.info('epo:[%d/%d] itr:%d Loss=%.5f' %
                                (epoch, nb_epoch, itr, np.mean(losses)))
                    losses = []
                itr = itr + 1

            # if epoch and epoch % valid_every == 0:
            #     logger.info("validating..")
            #     acc1, mrr, map, ndcg = self.eval(model, 1000, 1)

            if epoch and epoch % save_every == 0:
                self.save_model(model, epoch)
Example #6
    def train(self, model):
        log_every = self.model_params['log_every']
        save_every = self.model_params['save_every']
        batch_size = self.model_params['batch_size']
        nb_epoch = self.model_params['nb_epoch']

        train_set = CodeSearchDataset(self.path,
                                      self.model_params['train_name'],
                                      self.model_params['name_len'],
                                      self.model_params['train_api'],
                                      self.model_params['api_len'],
                                      self.model_params['train_tokens'],
                                      self.model_params['tokens_len'],
                                      self.model_params['train_desc'],
                                      self.model_params['desc_len'],
                                      load_in_memory=True)

        data_loader = torch.utils.data.DataLoader(dataset=train_set,
                                                  batch_size=batch_size,
                                                  shuffle=True,
                                                  drop_last=True,
                                                  num_workers=4,
                                                  pin_memory=True)

        # `optimizer` is used below but never defined in this excerpt; an Adam
        # optimizer over the model parameters is assumed:
        optimizer = torch.optim.Adam(model.parameters())

        for epoch in range(self.model_params['reload'] + 1, nb_epoch):
            epoch_loss = []
            losses = []
            for itr, (names, apis, toks, good_descs,
                      bad_descs) in enumerate(data_loader, start=1):
                names, apis, toks, good_descs, bad_descs = gVar(names), gVar(
                    apis), gVar(toks), gVar(good_descs), gVar(bad_descs)
                # model.train() switches to training mode and returns the
                # model, which is then called directly on the batch
                loss = model.train()(names, apis, toks, good_descs, bad_descs)
                losses.append(loss.item())
                epoch_loss.append(loss.item())
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                if itr % log_every == 0:
                    logger.info('epo:[{}/{}] itr:{} Loss={:.5f}'.format(
                        epoch, nb_epoch, itr, np.mean(losses)))
                    losses = []

            if epoch and epoch % save_every == 0:
                self.save_model_epoch(model, epoch)

            logger.info('[SUMMARY] epo:[{}/{}] Loss={:.5f}'.format(
                epoch, nb_epoch, np.mean(epoch_loss)))
Example #7
    def repr_code(self, model, norm=True):
        logging.info("Start Code Representation")
        use_set = CodeSearchDataset(self.model_params['workdir'],
                                    self.model_params['use_names'],
                                    self.model_params['name_len'],
                                    self.model_params['use_apis'],
                                    self.model_params['api_len'],
                                    self.model_params['use_tokens'],
                                    self.model_params['tokens_len'],
                                    load_in_memory=True)

        data_loader = torch.utils.data.DataLoader(dataset=use_set,
                                                  batch_size=1000,
                                                  shuffle=False,
                                                  drop_last=False,
                                                  num_workers=2,
                                                  pin_memory=True)

        vecs = []
        logging.debug("Calculating code vectors")
        for itr, (names, apis, toks) in enumerate(data_loader, start=1):
            names, apis, toks = gVar(names), gVar(apis), gVar(toks)
            reprs = model.eval().code_encoding(names, apis,
                                               toks).data.cpu().numpy()
            vecs.append(reprs)
            if itr % 100 == 0:
                logger.info('itr:{}/{}'.format(itr, len(use_set) // 1000))

        logging.debug("Concatenating all vectors")
        vecs = np.concatenate(vecs, 0)

        if norm:
            logger.debug("Normalizing...")
            vecs = normalize(vecs)

        logging.debug("Writing to disk -  vectors")
        save_vecs(vecs, self.path + self.model_params['use_codevecs'])
        return vecs
Example #8
    def search(self, model, query, n_results=10):
        # convert the description sentence into word indices
        desc = sent2indexes(query, self.vocab_desc)
        desc = np.expand_dims(desc, axis=0)
        desc = gVar(desc)
        desc_repr = model.desc_encoding(desc).data.cpu().numpy()

        codes = []
        sims = []
        threads = []
        for i, codevecs_chunk in enumerate(self.codevecs):
            t = threading.Thread(target=self.search_thread,
                                 args=(codes, sims, desc_repr, codevecs_chunk,
                                       i, n_results))
            threads.append(t)
        for t in threads:
            t.start()
        for t in threads:  # wait until all sub-threads finish
            t.join()
        return codes, sims
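
A hypothetical call site, assuming the enclosing class has already loaded `self.codevecs` and `self.vocab_desc` (all names here are illustrative):

# engine = SearchEngine(conf)               # hypothetical wrapper object
# codes, sims = engine.search(model, "read a csv file", n_results=5)
# for c, s in sorted(zip(codes, sims), key=lambda p: -p[1]):
#     print(s, c)
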
Example #9
    def eval(self, model, poolsize, K):
        """
        Simple validation in a code pool.
        @param: poolsize - size of the code pool; if -1, load the whole test set
        """
        def ACC(real, predict):
            total = 0.0
            for val in real:
                if val in predict:
                    total += 1
            return total / float(len(real))

        def MAP(real, predict):
            total = 0.0
            for i, val in enumerate(real):
                try:
                    index = predict.index(val)
                except ValueError:
                    continue
                total += (i + 1) / float(index + 1)
            return total / float(len(real))

        def MRR(real, predict):
            total = 0.0
            for val in real:
                try:
                    index = predict.index(val)
                except ValueError:
                    continue
                total += 1.0 / float(index + 1)
            return total / float(len(real))

        def NDCG(real, predict):
            dcg = 0.0
            idcg = IDCG(len(real))
            for i, item in enumerate(predict):
                if item in real:
                    relevance = 1
                    rank = i + 1
                    dcg += (math.pow(2, relevance) - 1.0) * (math.log(2) / math.log(rank + 1))
            return dcg / float(idcg)

        def IDCG(n):
            idcg = 0.0
            relevance = 1
            for i in range(n):
                idcg += (math.pow(2, relevance) - 1.0) * (math.log(2) / math.log(i + 2))
            return idcg

        if self.valid_set is None:  # load test dataset
            self.valid_set = CodeSearchDataset(self.path,
                                               self.conf['valid_name'], self.conf['name_len'],
                                               self.conf['valid_api'], self.conf['api_len'],
                                               self.conf['valid_tokens'], self.conf['tokens_len'],
                                               self.conf['valid_desc'], self.conf['desc_len'])

        data_loader = torch.utils.data.DataLoader(dataset=self.valid_set,
                                                  batch_size=poolsize,
                                                  shuffle=True,
                                                  drop_last=True,
                                                  num_workers=1)
        
        accs, mrrs, maps, ndcgs = [], [], [], []
        for names, apis, toks, descs, _ in tqdm(data_loader):
            names, apis, toks = gVar(names), gVar(apis), gVar(toks)
            code_repr = model.code_encoding(names, apis, toks)
            for i in range(poolsize):
                desc = gVar(descs[i].expand(poolsize, -1))
                desc_repr = model.desc_encoding(desc)
                n_results = K
                sims = F.cosine_similarity(code_repr, desc_repr).data.cpu().numpy()
                negsims = np.negative(sims)
                predict = np.argsort(negsims)  # or: np.argpartition(negsims, kth=n_results-1)
                predict = predict[:n_results]
                predict = [int(k) for k in predict]
                real = [i]  # index of the single positive sample
                accs.append(ACC(real, predict))
                mrrs.append(MRR(real, predict))
                maps.append(MAP(real, predict))
                ndcgs.append(NDCG(real, predict))
        logger.info('ACC={}, MRR={}, MAP={}, nDCG={}'.format(
            np.mean(accs), np.mean(mrrs), np.mean(maps), np.mean(ndcgs)))

        return np.mean(accs), np.mean(mrrs), np.mean(maps), np.mean(ndcgs)
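Example #10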
    def train(self, model):
        """
        Trains an initialized model
        :param model: Initialized model
        :return: None
        """
        log_every = self.conf['log_every']
        valid_every = self.conf['valid_every']
        batch_size = self.conf['batch_size']
        nb_epoch = self.conf['nb_epoch']
        max_patience = self.conf['patience']

        train_set = CodeSearchDataset(self.path, self.conf, "train")
        data_loader = torch.utils.data.DataLoader(dataset=train_set,
                                                  batch_size=batch_size,
                                                  shuffle=True,
                                                  drop_last=True,
                                                  num_workers=1)

        # MRR of the best saved model when resuming (reload > 0); otherwise -1
        if self.conf['reload'] > 0:
            _, max_mrr, _, _ = self.eval(model, 50, 10, "val")
        else:
            max_mrr = -1

        # `optimizer` is used below but never defined in this excerpt; an Adam
        # optimizer over the model parameters is assumed:
        optimizer = torch.optim.Adam(model.parameters())

        patience = 0
        for epoch in range(self.conf['reload'] + 1, nb_epoch):
            itr = 1
            losses = []

            model = model.train()
            for qts, good_codes, bad_codes, good_qbs, bad_qbs in data_loader:
                qts, good_codes, bad_codes, good_qbs, bad_qbs = gVar(
                    qts), gVar(good_codes), gVar(bad_codes), gVar(
                        good_qbs), gVar(bad_qbs)
                loss = model(qts, good_codes, bad_codes, good_qbs, bad_qbs)
                losses.append(loss.item())
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                if itr % log_every == 0:
                    logger.info('epo:[%d/%d] itr:%d Loss=%.5f' %
                                (epoch, nb_epoch, itr, np.mean(losses)))
                    losses = []
                itr = itr + 1

            if epoch % valid_every == 0:
                logger.info("validating..")
                acc1, mrr, map, ndcg = self.eval(model, 50, 10, "val")

                if mrr > max_mrr:
                    self.save_model(model, epoch)
                    patience = 0
                    logger.info("Model improved. Saved model at epoch %d" % epoch)
                    max_mrr = mrr
                else:
                    patience += 1
                    logger.info("Model didn't improve for %d epochs" % patience)

            if patience >= max_patience:
                logger.info("Patience Limit Reached. Stopping Training")
                break
Example #11
    def eval(self, model, poolsize, K, test_all=True):
        """
        Simple validation in a code pool.
        @param: poolsize - size of the code pool; if -1, load the whole test set
        """
        def ACC(real, predict):
            _sum = 0.0
            for val in real:
                if val in predict:
                    _sum += 1
            return _sum / float(len(real))

        def MAP(real, predict):
            _sum = 0.0
            for _id, val in enumerate(real, start=1):
                try:
                    index = predict.index(val) + 1
                except ValueError:
                    continue
                else:
                    _sum += _id / float(index)
            return _sum / float(len(real))

        def MRR(real, predict):
            _sum = 0.0
            for val in real:
                try:
                    index = predict.index(val) + 1
                except ValueError:
                    continue
                else:
                    _sum += 1.0 / float(index)
            return _sum / float(len(real))

        def NDCG(real, predict):
            dcg = 0.0
            idcg = IDCG(len(real))
            for i, predictItem in enumerate(predict):
                if predictItem in real:
                    itemRelevance = 1
                    rank = i + 1
                    dcg += (math.pow(2, itemRelevance) -
                            1.0) * (math.log(2) / math.log(rank + 1))
            return dcg / float(idcg)

        def IDCG(n):
            idcg = 0
            itemRelevance = 1
            for i in range(n):
                idcg += (math.pow(2, itemRelevance) - 1.0) * (math.log(2) /
                                                              math.log(i + 2))
            return idcg

        # load test dataset
        if self.validation_set is None:
            self.validation_set = CodeSearchDataset(
                self.path,
                self.model_params['valid_name'],
                self.model_params['name_len'],
                self.model_params['valid_api'],
                self.model_params['api_len'],
                self.model_params['valid_tokens'],
                self.model_params['tokens_len'],
                self.model_params['valid_desc'],
                self.model_params['desc_len'],
                load_in_memory=True)

        data_loader = torch.utils.data.DataLoader(dataset=self.validation_set,
                                                  batch_size=poolsize,
                                                  shuffle=False,
                                                  drop_last=True,
                                                  num_workers=1,
                                                  pin_memory=True)

        accs, mrrs, maps, ndcgs = [], [], [], []
        for names, apis, toks, descs, _ in tqdm(data_loader):
            names, apis, toks = gVar(names), gVar(apis), gVar(toks)
            code_repr = model.eval().code_encoding(names, apis, toks)
            for it in range(poolsize):
                desc = gVar(descs[it].expand(poolsize, -1))
                desc_repr = model.eval().desc_encoding(desc)
                n_results = K
                sims = F.cosine_similarity(code_repr,
                                           desc_repr).data.cpu().numpy()
                negsims = np.negative(sims)
                prediction = np.argpartition(negsims, kth=n_results - 1)
                prediction = prediction[:n_results]
                # sort the selected top-K codes by similarity to obtain a ranking
                prediction = [
                    y for _, y in sorted(zip(negsims[prediction], prediction))
                ]
                real_value = [it]
                accs.append(ACC(real_value, prediction))
                mrrs.append(MRR(real_value, prediction))
                maps.append(MAP(real_value, prediction))
                ndcgs.append(NDCG(real_value, prediction))

        mean_acc = np.mean(accs)
        mean_mrr = np.mean(mrrs)
        mean_map = np.mean(maps)
        mean_ndcg = np.mean(ndcgs)

        logger.info('ACC={}, MRR={}, MAP={}, nDCG={}'.format(
            mean_acc, mean_mrr, mean_map, mean_ndcg))

        return mean_acc, mean_mrr, mean_map, mean_ndcg
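
Because each query in the pool has exactly one relevant item (`real_value = [it]`), all four metrics reduce to simple functions of the rank of the true index. A quick worked check, assuming the nested helpers above are lifted to module scope:

predict = [7, 3, 0, 9, 4]    # ranked pool indices returned for one query
real = [3]                   # the single ground-truth index, ranked 2nd here
print(ACC(real, predict))    # 1.0   -- the true item appears in the top K
print(MRR(real, predict))    # 0.5   -- reciprocal of rank 2
print(MAP(real, predict))    # 0.5   -- equals MRR when one item is relevant
print(NDCG(real, predict))   # ~0.631 = log(2)/log(3), since IDCG(1) = 1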