def __init__(self):
    super(DBRDPreprocessing, self).__init__(
        MultiLineTokenizer(),
        SnowballStemmer('english', ignore_stopwords=True),
        set(stopwords.words('english') + list(string.punctuation) + ["n't", "'t"]),
        [HTMLSymbolFilter()])
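# For reference, a self-contained sketch of the classical pipeline that
# DBRDPreprocessing wires together (tokenize -> stem -> drop stop words and
# punctuation). The plain split() stands in for MultiLineTokenizer and no
# HTML-symbol filtering is applied, so this approximates the class rather
# than replacing it.
def classical_preprocessing_sketch(text):
    import string
    from nltk.corpus import stopwords
    from nltk.stem.snowball import SnowballStemmer

    stemmer = SnowballStemmer('english', ignore_stopwords=True)
    stop = set(stopwords.words('english') + list(string.punctuation) + ["n't", "'t"])
    tokens = text.lower().split()  # stand-in for MultiLineTokenizer + HTMLSymbolFilter
    return [stemmer.stem(tok) for tok in tokens if tok not in stop]

# e.g. classical_preprocessing_sketch("The apps are crashing") -> ['app', 'crash']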
def main(_run, _config, _seed, _log):
    """
    :param _run:
    :param _config:
    :param _seed:
    :param _log:
    :return:
    """
    """
    Setting and loading parameters
    """
    # Setting logger
    args = _config
    logger = _log

    logger.info(args)
    logger.info('It started at: %s' % datetime.now())

    torch.manual_seed(_seed)

    bugReportDatabase = BugReportDatabase.fromJson(args['bug_database'])
    paddingSym = "</s>"
    batchSize = args['batch_size']

    device = torch.device('cuda' if args['cuda'] else "cpu")

    if args['cuda']:
        logger.info("Turning CUDA on")
    else:
        logger.info("Turning CUDA off")

    # It is the folder where the preprocessed information will be stored.
    cacheFolder = args['cache_folder']

    # Setting the parameters to save and loading parameters
    importantParameters = ['compare_aggregation', 'categorical']
    parametersToSave = dict([(parName, args[parName]) for parName in importantParameters])

    if args['load'] is not None:
        mapLocation = (lambda storage, loc: storage.cuda()) if args['cuda'] else 'cpu'
        modelInfo = torch.load(args['load'], map_location=mapLocation)
        modelState = modelInfo['model']

        for paramName, paramValue in modelInfo['params'].items():
            args[paramName] = paramValue
    else:
        modelState = None

    preprocessors = PreprocessorList()
    inputHandlers = []

    categoricalOpt = args.get('categorical')

    if categoricalOpt is not None and len(categoricalOpt) != 0:
        categoricalEncoder, _, _ = processCategoricalParam(
            categoricalOpt, bugReportDatabase, inputHandlers, preprocessors, None, logger)
    else:
        categoricalEncoder = None

    filterInputHandlers = []

    compareAggOpt = args['compare_aggregation']
    databasePath = args['bug_database']

    # Loading word embedding
    if compareAggOpt["lexicon"]:
        emb = np.load(compareAggOpt["word_embedding"])

        lexicon = Lexicon(unknownSymbol=None)
        with codecs.open(compareAggOpt["lexicon"]) as f:
            for l in f:
                lexicon.put(l.strip())
        lexicon.setUnknown("UUUKNNN")

        paddingId = lexicon.getLexiconIndex(paddingSym)
        embedding = Embedding(lexicon, emb, paddingIdx=paddingId)

        logger.info("Lexicon size: %d" % (lexicon.getLen()))
        logger.info("Word Embedding size: %d" % (embedding.getEmbeddingSize()))
    elif compareAggOpt["word_embedding"]:
        # todo: Allow the use of embeddings and other representations
        lexicon, embedding = Embedding.fromFile(
            compareAggOpt['word_embedding'], 'UUUKNNN', hasHeader=False, paddingSym=paddingSym)
        logger.info("Lexicon size: %d" % (lexicon.getLen()))
        logger.info("Word Embedding size: %d" % (embedding.getEmbeddingSize()))
        paddingId = lexicon.getLexiconIndex(paddingSym)
    else:
        embedding = None

    if compareAggOpt["norm_word_embedding"]:
        embedding.zscoreNormalization()

    # Tokenizer
    if compareAggOpt['tokenizer'] == 'default':
        logger.info("Use default tokenizer to tokenize summary information")
        tokenizer = MultiLineTokenizer()
    elif compareAggOpt['tokenizer'] == 'white_space':
        logger.info("Use white space tokenizer to tokenize summary information")
        tokenizer = WhitespaceTokenizer()
    else:
        raise ArgumentError(
            "Tokenizer value %s is invalid. You should choose one of these: default and white_space"
            % compareAggOpt['tokenizer'])

    # Preparing input handlers, preprocessors and cache
    minSeqSize = max(compareAggOpt['aggregate']["window"]) if compareAggOpt['aggregate']["model"] == "cnn" else -1
    bow = compareAggOpt.get('bow', False)
    freq = compareAggOpt.get('frequency', False) and bow

    logger.info("BoW={} and TF={}".format(bow, freq))

    if compareAggOpt['extractor'] is not None:
        # Use summary and description (concatenated) to address this problem
        logger.info("Using Summary and Description information.")

        # Loading Filters
        extractorFilters = loadFilters(compareAggOpt['extractor']['filters'])

        arguments = (databasePath, compareAggOpt['word_embedding'], str(compareAggOpt['lexicon']),
                     ' '.join(sorted([fil.__class__.__name__ for fil in extractorFilters])),
                     compareAggOpt['tokenizer'], str(bow), str(freq), SABDEncoderPreprocessor.__name__)

        inputHandlers.append(SABDInputHandler(paddingId, minSeqSize))
        extractorCache = PreprocessingCache(cacheFolder, arguments)

        if bow:
            extractorPreprocessor = SABDBoWPreprocessor(
                lexicon, bugReportDatabase, extractorFilters, tokenizer, paddingId, freq, extractorCache)
        else:
            extractorPreprocessor = SABDEncoderPreprocessor(
                lexicon, bugReportDatabase, extractorFilters, tokenizer, paddingId, extractorCache)

        preprocessors.append(extractorPreprocessor)

    # Create model
    model = SABD(embedding, categoricalEncoder, compareAggOpt['extractor'],
                 compareAggOpt['matching'], compareAggOpt['aggregate'],
                 compareAggOpt['classifier'], freq)

    if args['loss'] == 'bce':
        logger.info("Using BCE Loss: margin={}".format(args['margin']))
        lossFn = BCELoss()
        lossNoReduction = BCELoss(reduction='none')
        cmp_collate = PairBugCollate(inputHandlers, torch.float32, unsqueeze_target=True)
    elif args['loss'] == 'triplet':
        logger.info("Using Triplet Loss: margin={}".format(args['margin']))
        lossFn = TripletLoss(args['margin'])
        lossNoReduction = TripletLoss(args['margin'], reduction='none')
        cmp_collate = TripletBugCollate(inputHandlers)
    else:
        # Guard against silently continuing with an undefined loss function.
        raise ArgumentError(
            "Loss value %s is invalid. You should choose one of these: bce and triplet" % args['loss'])

    model.to(device)

    if modelState:
        model.load_state_dict(modelState)
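    # The 'triplet' branch above uses the repo's TripletLoss. For reference, a
    # margin-based triplet loss of that general shape can be sketched as below
    # (illustrative only; the real class may use another distance or
    # reduction, and this helper is not called anywhere):
    def triplet_loss_sketch(anchor, pos, neg, margin=1.0):
        import torch.nn.functional as torch_fn
        d_pos = torch_fn.pairwise_distance(anchor, pos)  # anchor-positive distance
        d_neg = torch_fn.pairwise_distance(anchor, neg)  # anchor-negative distance
        return torch.clamp(d_pos - d_neg + margin, min=0.0).mean()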
    """
    Loading the training and validation. Also, it sets how the negative examples will be generated.
    """
    # load training
    if args.get('pairs_training'):
        negativePairGenOpt = args.get('neg_pair_generator')
        trainingFile = args.get('pairs_training')

        offlineGeneration = not (negativePairGenOpt is None or negativePairGenOpt['type'] == 'none')
        masterIdByBugId = bugReportDatabase.getMasterIdByBugId()
        randomAnchor = negativePairGenOpt['random_anchor']

        if not offlineGeneration:
            logger.info("Do not generate the negative examples dynamically.")
            negativePairGenerator = None
        else:
            pairGenType = negativePairGenOpt['type']

            if pairGenType == 'random':
                logger.info("Random Negative Pair Generator")
                trainingDataset = BugDataset(negativePairGenOpt['training'])
                bugIds = trainingDataset.bugIds

                logger.info(
                    "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d"
                    % (trainingDataset.info, len(bugIds)))

                negativePairGenerator = RandomGenerator(
                    preprocessors, cmp_collate, negativePairGenOpt['rate'], bugIds, masterIdByBugId,
                    randomAnchor=randomAnchor)
            elif pairGenType == 'non_negative':
                logger.info("Non Negative Pair Generator")
                trainingDataset = BugDataset(negativePairGenOpt['training'])
                bugIds = trainingDataset.bugIds

                logger.info(
                    "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d"
                    % (trainingDataset.info, len(bugIds)))

                negativePairGenerator = NonNegativeRandomGenerator(
                    preprocessors, cmp_collate, negativePairGenOpt['rate'], bugIds, masterIdByBugId,
                    negativePairGenOpt['n_tries'], device, randomAnchor=randomAnchor)
            elif pairGenType == 'misc_non_zero':
                logger.info("Misc Non Zero Pair Generator")
                trainingDataset = BugDataset(negativePairGenOpt['training'])
                bugIds = trainingDataset.bugIds

                logger.info(
                    "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d"
                    % (trainingDataset.info, len(bugIds)))

                negativePairGenerator = MiscNonZeroRandomGen(
                    preprocessors, cmp_collate, negativePairGenOpt['rate'], bugIds,
                    trainingDataset.duplicateIds, masterIdByBugId, negativePairGenOpt['n_tries'],
                    device, randomAnchor=randomAnchor)
            elif pairGenType == 'product_component':
                logger.info("Product Component Pair Generator")
                trainingDataset = BugDataset(negativePairGenOpt['training'])
                bugIds = trainingDataset.bugIds

                logger.info(
                    "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d"
                    % (trainingDataset.info, len(bugIds)))

                negativePairGenerator = ProductComponentRandomGen(
                    bugReportDatabase, preprocessors, cmp_collate, negativePairGenOpt['rate'],
                    bugIds, masterIdByBugId, negativePairGenOpt['n_tries'], device,
                    randomAnchor=randomAnchor)
            elif pairGenType == 'random_k':
                logger.info("Random K Negative Pair Generator")
                trainingDataset = BugDataset(negativePairGenOpt['training'])
                bugIds = trainingDataset.bugIds

                logger.info(
                    "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d"
                    % (trainingDataset.info, len(bugIds)))

                negativePairGenerator = KRandomGenerator(
                    preprocessors, cmp_collate, negativePairGenOpt['rate'], bugIds, masterIdByBugId,
                    negativePairGenOpt['k'], device, randomAnchor=randomAnchor)
            elif pairGenType == "pre":
                logger.info("Pre-selected list generator")

                negativePairGenerator = PreSelectedGenerator(
                    negativePairGenOpt['pre_list_file'], preprocessors, negativePairGenOpt['rate'],
                    masterIdByBugId, negativePairGenOpt['preselected_length'],
                    randomAnchor=randomAnchor)
            elif pairGenType == "positive_pre":
                logger.info("Positive Pre-selected list generator")

                negativePairGenerator = PositivePreSelectedGenerator(
                    negativePairGenOpt['pre_list_file'], preprocessors, cmp_collate,
                    negativePairGenOpt['rate'], masterIdByBugId,
                    negativePairGenOpt['preselected_length'], randomAnchor=randomAnchor)
            elif pairGenType == "misc_non_zero_pre":
                logger.info("Misc: non-zero and Pre-selected list generator")

                negativePairGenerator1 = PreSelectedGenerator(
                    negativePairGenOpt['pre_list_file'], preprocessors, negativePairGenOpt['rate'],
                    masterIdByBugId, negativePairGenOpt['preselected_length'],
                    randomAnchor=randomAnchor)

                trainingDataset = BugDataset(negativePairGenOpt['training'])
                bugIds = trainingDataset.bugIds

                negativePairGenerator2 = NonNegativeRandomGenerator(
                    preprocessors, cmp_collate, negativePairGenOpt['rate'], bugIds, masterIdByBugId,
                    negativePairGenOpt['n_tries'], device, randomAnchor=randomAnchor)

                negativePairGenerator = MiscOfflineGenerator(
                    (negativePairGenerator1, negativePairGenerator2))
            elif pairGenType == "misc_non_zero_positive_pre":
                logger.info("Misc: non-zero and Positive Pre-selected list generator")

                negativePairGenerator1 = PositivePreSelectedGenerator(
                    negativePairGenOpt['pre_list_file'], preprocessors, cmp_collate,
                    negativePairGenOpt['rate'], masterIdByBugId,
                    negativePairGenOpt['preselected_length'], randomAnchor=randomAnchor)

                trainingDataset = BugDataset(negativePairGenOpt['training'])
                bugIds = trainingDataset.bugIds

                negativePairGenerator2 = NonNegativeRandomGenerator(
                    preprocessors, cmp_collate, negativePairGenOpt['rate'], bugIds, masterIdByBugId,
                    negativePairGenOpt['n_tries'], device, randomAnchor=randomAnchor)

                negativePairGenerator = MiscOfflineGenerator(
                    (negativePairGenerator1, negativePairGenerator2))
            else:
                raise ArgumentError(
                    "Offline generator is invalid (%s). You should choose one of these: random, "
                    "non_negative, misc_non_zero, product_component, random_k, pre, positive_pre, "
                    "misc_non_zero_pre and misc_non_zero_positive_pre" % pairGenType)

        if isinstance(lossFn, BCELoss):
            training_reader = PairBugDatasetReader(
                trainingFile, preprocessors, negativePairGenerator, randomInvertPair=args['random_switch'])
        elif isinstance(lossFn, TripletLoss):
            training_reader = TripletBugDatasetReader(
                trainingFile, preprocessors, negativePairGenerator, randomInvertPair=args['random_switch'])

        trainingLoader = DataLoader(training_reader, batch_size=batchSize,
                                    collate_fn=cmp_collate.collate, shuffle=True)
        logger.info("Training size: %s" % (len(trainingLoader.dataset)))

    # load validation
    if args.get('pairs_validation'):
        if isinstance(lossFn, BCELoss):
            validation_reader = PairBugDatasetReader(args.get('pairs_validation'), preprocessors)
        elif isinstance(lossFn, TripletLoss):
            validation_reader = TripletBugDatasetReader(args.get('pairs_validation'), preprocessors)

        validationLoader = DataLoader(validation_reader, batch_size=batchSize,
                                      collate_fn=cmp_collate.collate)

        logger.info("Validation size: %s" % (len(validationLoader.dataset)))
    else:
        validationLoader = None

    """
    Training and evaluating the model.
    """
    optimizer_opt = args.get('optimizer', 'adam')

    if optimizer_opt == 'sgd':
        logger.info('SGD')
        optimizer = optim.SGD(model.parameters(), lr=args['lr'], weight_decay=args['l2'])
    elif optimizer_opt == 'adam':
        logger.info('Adam')
        optimizer = optim.Adam(model.parameters(), lr=args['lr'], weight_decay=args['l2'])

    # Recall rate
    rankingScorer = GeneralScorer(
        model, preprocessors, device, PairBugCollate(inputHandlers, ignore_target=True),
        args['ranking_batch_size'], args['ranking_n_workers'])

    recallEstimationTrainOpt = args.get('recall_estimation_train')

    if recallEstimationTrainOpt:
        preselectListRankingTrain = PreselectListRanking(recallEstimationTrainOpt, args['sample_size_rr_tr'])

    recallEstimationOpt = args.get('recall_estimation')

    if recallEstimationOpt:
        preselectListRanking = PreselectListRanking(recallEstimationOpt, args['sample_size_rr_val'])

    # LR scheduler
    lrSchedulerOpt = args.get('lr_scheduler', None)

    if lrSchedulerOpt is None:
        logger.info("Scheduler: Constant")
        lrSched = None
    elif lrSchedulerOpt["type"] == 'step':
        logger.info("Scheduler: StepLR (step:%s, decay:%f)" % (lrSchedulerOpt["step_size"], args["decay"]))
        lrSched = StepLR(optimizer, lrSchedulerOpt["step_size"], lrSchedulerOpt["decay"])
    elif lrSchedulerOpt["type"] == 'exp':
        logger.info("Scheduler: ExponentialLR (decay:%f)" % (lrSchedulerOpt["decay"]))
        lrSched = ExponentialLR(optimizer, lrSchedulerOpt["decay"])
    elif lrSchedulerOpt["type"] == 'linear':
        logger.info("Scheduler: Divide by (1 + epoch * decay) ---- (decay:%f)" % (lrSchedulerOpt["decay"]))
        lrDecay = lrSchedulerOpt["decay"]
        lrSched = LambdaLR(optimizer, lambda epoch: 1 / (1.0 + epoch * lrDecay))
    else:
        raise ArgumentError(
            "LR Scheduler is invalid (%s). You should choose one of these: step, exp and linear"
            % lrSchedulerOpt["type"])
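    # Note on the 'linear' schedule above: LambdaLR multiplies the base LR by
    # the returned factor at each epoch, so lr(epoch) = lr0 / (1 + epoch * decay).
    # For example, with lr0=0.1 and decay=0.5 (illustrative values only):
    #   epoch 1 -> 0.1 / 1.5 ~= 0.0667; epoch 2 -> 0.1 / 2.0 = 0.05; epoch 3 -> 0.1 / 2.5 = 0.04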
    # Set training functions
    def trainingIteration(engine, batch):
        engine.kk = 0
        model.train()
        optimizer.zero_grad()
        x, y = cmp_collate.to(batch, device)
        output = model(*x)
        loss = lossFn(output, y)
        loss.backward()
        optimizer.step()
        return loss, output, y

    def scoreDistanceTrans(output):
        if len(output) == 3:
            _, y_pred, y = output
        else:
            y_pred, y = output

        if lossFn == F.nll_loss:
            return torch.exp(y_pred[:, 1]), y
        elif isinstance(lossFn, BCELoss):
            return y_pred, y

    trainer = Engine(trainingIteration)
    trainingMetrics = {'training_loss': AverageLoss(lossFn)}

    if isinstance(lossFn, BCELoss):
        trainingMetrics['training_dist_target'] = MeanScoreDistance(output_transform=scoreDistanceTrans)
        trainingMetrics['training_acc'] = AccuracyWrapper(output_transform=thresholded_output_transform)
        trainingMetrics['training_precision'] = PrecisionWrapper(output_transform=thresholded_output_transform)
        trainingMetrics['training_recall'] = RecallWrapper(output_transform=thresholded_output_transform)

    # Add metrics to trainer
    for name, metric in trainingMetrics.items():
        metric.attach(trainer, name)

    # Set validation functions
    def validationIteration(engine, batch):
        if not hasattr(engine, 'kk'):
            engine.kk = 0

        model.eval()
        with torch.no_grad():
            x, y = cmp_collate.to(batch, device)
            y_pred = model(*x)
            return y_pred, y

    validationMetrics = {
        'validation_loss': LossWrapper(
            lossFn, output_transform=lambda x: (x[0], x[0][0]) if x[1] is None else x)
    }

    if isinstance(lossFn, BCELoss):
        validationMetrics['validation_dist_target'] = MeanScoreDistance(output_transform=scoreDistanceTrans)
        validationMetrics['validation_acc'] = AccuracyWrapper(output_transform=thresholded_output_transform)
        validationMetrics['validation_precision'] = PrecisionWrapper(output_transform=thresholded_output_transform)
        validationMetrics['validation_recall'] = RecallWrapper(output_transform=thresholded_output_transform)

    evaluator = Engine(validationIteration)

    # Add metrics to evaluator
    for name, metric in validationMetrics.items():
        metric.attach(evaluator, name)

    # recommendation
    recommendation_fn = generateRecommendationList

    @trainer.on(Events.EPOCH_STARTED)
    def onStartEpoch(engine):
        epoch = engine.state.epoch
        logger.info("Epoch: %d" % epoch)

        if lrSched:
            lrSched.step()

        logger.info("LR: %s" % str(optimizer.param_groups[0]["lr"]))

    @trainer.on(Events.EPOCH_COMPLETED)
    def onEndEpoch(engine):
        epoch = engine.state.epoch

        logMetrics(_run, logger, engine.state.metrics, epoch)

        # Evaluate Training
        if validationLoader:
            evaluator.run(validationLoader)
            logMetrics(_run, logger, evaluator.state.metrics, epoch)

        lastEpoch = args['epochs'] - epoch == 0

        if recallEstimationTrainOpt and (epoch % args['rr_train_epoch'] == 0):
            logRankingResult(_run, logger, preselectListRankingTrain, rankingScorer, bugReportDatabase,
                             None, epoch, "train", recommendationListfn=recommendation_fn)
            rankingScorer.free()

        if recallEstimationOpt and (epoch % args['rr_val_epoch'] == 0):
            logRankingResult(_run, logger, preselectListRanking, rankingScorer, bugReportDatabase,
                             args.get("ranking_result_file"), epoch, "validation",
                             recommendationListfn=recommendation_fn)
            rankingScorer.free()

        if not lastEpoch:
            training_reader.sampleNewNegExamples(model, lossNoReduction)

        if args.get('save'):
            save_by_epoch = args['save_by_epoch']

            if save_by_epoch and epoch in save_by_epoch:
                file_name, file_extension = os.path.splitext(args['save'])
                file_path = file_name + '_epoch_{}'.format(epoch) + file_extension
            else:
                file_path = args['save']

            modelInfo = {
                'model': model.state_dict(),
                'params': parametersToSave
            }

            logger.info("==> Saving Model: %s" % file_path)
            torch.save(modelInfo, file_path)

    if args.get('pairs_training'):
        trainer.run(trainingLoader, max_epochs=args['epochs'])
    elif args.get('pairs_validation'):
        # Evaluate Training
        evaluator.run(validationLoader)
        logMetrics(_run, logger, evaluator.state.metrics, 0)

        if recallEstimationOpt:
            logRankingResult(_run, logger, preselectListRanking, rankingScorer, bugReportDatabase,
                             args.get("ranking_result_file"), 0, "validation",
                             recommendationListfn=recommendation_fn)

    # Test Dataset (accuracy, recall, precision, F1)
    pair_test_dataset = args.get('pair_test_dataset')

    if pair_test_dataset is not None and len(pair_test_dataset) > 0:
        pairTestReader = PairBugDatasetReader(pair_test_dataset, preprocessors)
        testLoader = DataLoader(pairTestReader, batch_size=batchSize, collate_fn=cmp_collate.collate)

        if not isinstance(cmp_collate, PairBugCollate):
            raise NotImplementedError('Evaluation of pairs using tanh was not implemented yet')

        logger.info("Test size: %s" % (len(testLoader.dataset)))

        testMetrics = {
            'test_accuracy': ignite.metrics.Accuracy(output_transform=thresholded_output_transform),
            'test_precision': ignite.metrics.Precision(output_transform=thresholded_output_transform),
            'test_recall': ignite.metrics.Recall(output_transform=thresholded_output_transform),
            'test_predictions': PredictionCache(),
        }
        test_evaluator = Engine(validationIteration)

        # Add metrics to evaluator
        for name, metric in testMetrics.items():
            metric.attach(test_evaluator, name)

        test_evaluator.run(testLoader)

        for metricName, metricValue in test_evaluator.state.metrics.items():
            metric = testMetrics[metricName]

            if isinstance(metric, ignite.metrics.Accuracy):
                logger.info({
                    'type': 'metric', 'label': metricName, 'value': metricValue, 'epoch': None,
                    'correct': metric._num_correct, 'total': metric._num_examples
                })
                _run.log_scalar(metricName, metricValue)
            elif isinstance(metric, (ignite.metrics.Precision, ignite.metrics.Recall)):
                logger.info({
                    'type': 'metric', 'label': metricName, 'value': metricValue, 'epoch': None,
                    'tp': metric._true_positives.item(), 'total_positive': metric._positives.item()
                })
                _run.log_scalar(metricName, metricValue)
            elif isinstance(metric, ConfusionMatrix):
                acc = cmAccuracy(metricValue)
                prec = cmPrecision(metricValue, False)
                recall = cmRecall(metricValue, False)
                f1 = 2 * (prec * recall) / (prec + recall + 1e-15)

                logger.info({
                    'type': 'metric', 'label': metricName,
                    'accuracy': float(acc),
                    'precision': prec.cpu().numpy().tolist(),
                    'recall': recall.cpu().numpy().tolist(),
                    'f1': f1.cpu().numpy().tolist(),
                    'confusion_matrix': metricValue.cpu().numpy().tolist(),
                    'epoch': None
                })
                _run.log_scalar('test_f1', f1[1])
            elif isinstance(metric, PredictionCache):
                logger.info({
                    'type': 'metric', 'label': metricName, 'predictions': metric.predictions
                })

    # Calculate recall rate
    recallRateOpt = args.get('recall_rate', {'type': 'none'})
    if recallRateOpt['type'] != 'none':
        if recallRateOpt['type'] == 'sun2011':
            logger.info("Calculating recall rate: {}".format(recallRateOpt['type']))
            recallRateDataset = BugDataset(recallRateOpt['dataset'])

            rankingClass = SunRanking(bugReportDatabase, recallRateDataset, recallRateOpt['window'])
            # We always group all bug reports by master in the results in the Sun 2011 methodology
            group_by_master = True
        elif recallRateOpt['type'] == 'deshmukh':
            logger.info("Calculating recall rate: {}".format(recallRateOpt['type']))
            recallRateDataset = BugDataset(recallRateOpt['dataset'])
            rankingClass = DeshmukhRanking(bugReportDatabase, recallRateDataset)
            group_by_master = recallRateOpt['group_by_master']
        else:
            raise ArgumentError(
                "recall_rate.type is invalid (%s). You should choose one of these: none, sun2011 and deshmukh"
                % recallRateOpt['type'])

        logRankingResult(_run, logger, rankingClass, rankingScorer, bugReportDatabase,
                         recallRateOpt["result_file"], 0, None, group_by_master,
                         recommendationListfn=recommendation_fn)
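# Illustrative sacred config for main() above. The keys mirror the args[...]
# lookups in the function; every value and path below is a placeholder, not a
# setting taken from the repo's actual experiment files.
EXAMPLE_SABD_CONFIG = {
    'bug_database': 'dataset/eclipse/eclipse.json',         # hypothetical path
    'cache_folder': 'cache/',
    'cuda': True,
    'load': None, 'save': None, 'save_by_epoch': None,
    'batch_size': 16, 'epochs': 10,
    'loss': 'bce',                                          # or 'triplet' (uses 'margin')
    'margin': 0.1,
    'optimizer': 'adam', 'lr': 1e-3, 'l2': 0.0,
    'lr_scheduler': None,                                   # or {'type': 'step'/'exp'/'linear', ...}
    'random_switch': False,
    'categorical': None,
    'compare_aggregation': {
        'word_embedding': 'embeddings/word_embedding.txt',  # hypothetical path
        'lexicon': None, 'norm_word_embedding': False,
        'tokenizer': 'default',                             # or 'white_space'
        'bow': False, 'frequency': False,
        'extractor': {'filters': []},
        'matching': {}, 'aggregate': {'model': 'cnn', 'window': [3]}, 'classifier': {},
    },
    'pairs_training': 'dataset/eclipse/training_pairs.txt',  # hypothetical path
    'neg_pair_generator': {'type': 'non_negative', 'training': 'dataset/eclipse/training.txt',
                           'rate': 1, 'n_tries': 15, 'random_anchor': False},
    'pairs_validation': None,
    'ranking_batch_size': 256, 'ranking_n_workers': 2,
    'recall_estimation_train': None, 'sample_size_rr_tr': None, 'rr_train_epoch': 1,
    'recall_estimation': None, 'sample_size_rr_val': None, 'rr_val_epoch': 1,
    'pair_test_dataset': None,
    'recall_rate': {'type': 'none'},
}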
def processSumDescParam(sum_desc_opts, bugReportDatabase, inputHandlers, preprocessors, encoders,
                        cacheFolder, databasePath, logger, paddingSym):
    # Use summary and description (concatenated) to address this problem
    logger.info("Using summary and description information.")

    # Loading word embedding
    lexicon, embedding = load_embedding(sum_desc_opts, paddingSym)

    logger.info("Lexicon size: %d" % (lexicon.getLen()))
    logger.info("Word Embedding size: %d" % (embedding.getEmbeddingSize()))

    paddingId = lexicon.getLexiconIndex(paddingSym)

    # Loading Filters
    filters = loadFilters(sum_desc_opts['filters'])

    # Tokenizer
    if sum_desc_opts['tokenizer'] == 'default':
        logger.info("Use default tokenizer to tokenize summary+description information")
        tokenizer = MultiLineTokenizer()
    elif sum_desc_opts['tokenizer'] == 'white_space':
        logger.info("Use white space tokenizer to tokenize summary+description information")
        tokenizer = WhitespaceTokenizer()
    else:
        raise ArgumentError(
            "Tokenizer value %s is invalid. You should choose one of these: default and white_space"
            % sum_desc_opts['tokenizer'])

    arguments = (
        databasePath, sum_desc_opts['word_embedding'],
        ' '.join(sorted([fil.__class__.__name__ for fil in filters])),
        sum_desc_opts['tokenizer'], "summary_description")

    cacheSumDesc = PreprocessingCache(cacheFolder, arguments)
    sumDescPreprocessor = SummaryDescriptionPreprocessor(
        lexicon, bugReportDatabase, filters, tokenizer, paddingId, cacheSumDesc)
    preprocessors.append(sumDescPreprocessor)

    if sum_desc_opts['encoder_type'] == 'cnn':
        windowSizes = sum_desc_opts.get('window_sizes', [3])
        nFilters = sum_desc_opts.get('nfilters', 100)
        updateEmb = sum_desc_opts.get('update_embedding', False)
        actFunc = loadActivationFunction(sum_desc_opts.get('activation', 'relu'))
        batchNorm = sum_desc_opts.get('batch_normalization', False)
        dropout = sum_desc_opts.get('dropout', 0.0)

        sumDescEncoder = TextCNN(windowSizes, nFilters, embedding, updateEmb, actFunc, batchNorm, dropout)
        encoders.append(sumDescEncoder)
        inputHandlers.append(TextCNNInputHandler(paddingId, max(windowSizes)))
    elif sum_desc_opts['encoder_type'] == 'cnn+dense':
        windowSizes = sum_desc_opts.get('window_sizes', [3])
        nFilters = sum_desc_opts.get('nfilters', 100)
        updateEmb = sum_desc_opts.get('update_embedding', False)
        actFunc = loadActivationFunction(sum_desc_opts.get('activation', 'relu'))
        batchNorm = sum_desc_opts.get('batch_normalization', False)
        dropout = sum_desc_opts.get('dropout', 0.0)
        hiddenSizes = sum_desc_opts.get('hidden_sizes')
        hiddenAct = loadActivationClass(sum_desc_opts.get('hidden_act'))
        hiddenDropout = sum_desc_opts.get('hidden_dropout')
        batchLast = sum_desc_opts.get("bn_last_layer", False)

        cnnEnc = TextCNN(windowSizes, nFilters, embedding, updateEmb, actFunc, batchNorm, dropout)
        sumDescEncoder = MultilayerDense(cnnEnc, hiddenSizes, hiddenAct, batchNorm, batchLast, hiddenDropout)
        encoders.append(sumDescEncoder)
        inputHandlers.append(TextCNNInputHandler(paddingId, max(windowSizes)))
    elif sum_desc_opts['encoder_type'] == 'word_mean':
        standardization = sum_desc_opts.get('standardization', False)
        dropout = sum_desc_opts.get('dropout', 0.0)
        updateEmb = sum_desc_opts.get('update_embedding', False)
        batch_normalization = sum_desc_opts.get('batch_normalization', False)
        hiddenSize = sum_desc_opts.get('hidden_size')

        sumDescEncoder = WordMean(embedding, updateEmb, hiddenSize, standardization, dropout,
                                  batch_normalization)
        encoders.append(sumDescEncoder)
        inputHandlers.append(RNNInputHandler(paddingId))
    else:
        raise ArgumentError(
            "Encoder type of summary and description is invalid (%s). "
            "You should choose one of these: cnn, cnn+dense and word_mean"
            % sum_desc_opts['encoder_type'])
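# Illustrative option block for processSumDescParam. Values are placeholders;
# the defaults shown match the .get(...) fallbacks used above.
EXAMPLE_SUM_DESC_OPTS = {
    'word_embedding': 'embeddings/word_embedding.txt',  # hypothetical path
    'filters': [],
    'tokenizer': 'default',            # or 'white_space'
    'encoder_type': 'cnn',             # or 'cnn+dense', 'word_mean'
    'window_sizes': [3],
    'nfilters': 100,
    'update_embedding': False,
    'activation': 'relu',
    'batch_normalization': False,
    'dropout': 0.0,
}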
def processDescriptionParam(descOpts, bugReportDatabase, inputHandlers, preprocessors, encoders,
                            databasePath, cacheFolder, logger, paddingSym):
    # Use the description to address this problem
    logger.info("Using Description information.")

    # Loading word embedding
    lexicon, embedding = load_embedding(descOpts, paddingSym)

    logger.info("Lexicon size: %d" % (lexicon.getLen()))
    logger.info("Word Embedding size: %d" % (embedding.getEmbeddingSize()))

    paddingId = lexicon.getLexiconIndex(paddingSym)

    # Loading Filters
    filters = loadFilters(descOpts['filters'])

    # Tokenizer
    if descOpts['tokenizer'] == 'default':
        logger.info("Use default tokenizer to tokenize description information")
        tokenizer = MultiLineTokenizer()
    elif descOpts['tokenizer'] == 'white_space':
        logger.info("Use white space tokenizer to tokenize description information")
        tokenizer = WhitespaceTokenizer()
    else:
        raise ArgumentError(
            "Tokenizer value %s is invalid. You should choose one of these: default and white_space"
            % descOpts['tokenizer'])

    arguments = (
        databasePath, descOpts['word_embedding'], str(descOpts['lexicon']),
        ' '.join(sorted([fil.__class__.__name__ for fil in filters])),
        descOpts['tokenizer'], "description")

    descCache = PreprocessingCache(cacheFolder, arguments)
    descPreprocessor = DescriptionPreprocessor(lexicon, bugReportDatabase, filters, tokenizer,
                                               paddingId, descCache)
    preprocessors.append(descPreprocessor)

    if descOpts['encoder_type'] == 'rnn':
        rnnType = descOpts.get('rnn_type')
        hiddenSize = descOpts.get('hidden_size')
        bidirectional = descOpts.get('bidirectional', False)
        numLayers = descOpts.get('num_layers', 1)
        dropout = descOpts.get('dropout', 0.0)
        updateEmb = descOpts.get('update_embedding', False)
        fixedOpt = descOpts.get('fixed_opt', False)

        descRNN = SortedRNNEncoder(rnnType, embedding, hiddenSize, numLayers, bidirectional,
                                   updateEmb, dropout)

        if fixedOpt == 'self_att':
            att = SelfAttention(descRNN.getOutputSize(), descOpts['self_att_hidden'],
                                descOpts['n_hops'])
            descEncoder = RNN_Self_Attention(descRNN, att, paddingId, dropout)
        else:
            descEncoder = RNNFixedOuput(descRNN, fixedOpt, dropout)

        encoders.append(descEncoder)
        inputHandlers.append(RNNInputHandler(paddingId))
    elif descOpts['encoder_type'] == 'cnn':
        windowSizes = descOpts.get('window_sizes', [3])
        nFilters = descOpts.get('nfilters', 100)
        updateEmb = descOpts.get('update_embedding', False)
        actFunc = loadActivationFunction(descOpts.get('activation', 'relu'))
        batchNorm = descOpts.get('batch_normalization', False)
        dropout = descOpts.get('dropout', 0.0)

        descEncoder = TextCNN(windowSizes, nFilters, embedding, updateEmb, actFunc, batchNorm, dropout)
        encoders.append(descEncoder)
        inputHandlers.append(TextCNNInputHandler(paddingId, max(windowSizes)))
    elif descOpts['encoder_type'] == 'cnn+dense':
        windowSizes = descOpts.get('window_sizes', [3])
        nFilters = descOpts.get('nfilters', 100)
        updateEmb = descOpts.get('update_embedding', False)
        actFunc = loadActivationFunction(descOpts.get('activation', 'relu'))
        batchNorm = descOpts.get('batch_normalization', False)
        dropout = descOpts.get('dropout', 0.0)
        hiddenSizes = descOpts.get('hidden_sizes')
        hiddenAct = loadActivationClass(descOpts.get('hidden_act'))
        hiddenDropout = descOpts.get('hidden_dropout')
        batchLast = descOpts.get("bn_last_layer", False)

        cnnEnc = TextCNN(windowSizes, nFilters, embedding, updateEmb, actFunc, batchNorm, dropout)
        descEncoder = MultilayerDense(cnnEnc, hiddenSizes, hiddenAct, batchNorm, batchLast,
                                      hiddenDropout)
        encoders.append(descEncoder)
        inputHandlers.append(TextCNNInputHandler(paddingId, max(windowSizes)))
    elif descOpts['encoder_type'] == 'dense+self_att':
        dropout = descOpts.get('dropout', 0.0)
        hiddenSize = descOpts.get('hidden_size')
        self_att_hidden = descOpts['self_att_hidden']
        n_hops = descOpts['n_hops']
        updateEmb = descOpts.get('update_embedding', False)

        descEncoder = Dense_Self_Attention(embedding, hiddenSize, self_att_hidden, n_hops,
                                           paddingId, updateEmb, dropout=dropout)
        encoders.append(descEncoder)
        inputHandlers.append(TextCNNInputHandler(paddingId, -1))
    elif descOpts['encoder_type'] == 'word_mean':
        standardization = descOpts.get('standardization', False)
        dropout = descOpts.get('dropout', 0.0)
        updateEmb = descOpts.get('update_embedding', False)
        batch_normalization = descOpts.get('batch_normalization', False)
        hiddenSize = descOpts.get('hidden_size')

        descEncoder = WordMean(embedding, updateEmb, hiddenSize, standardization, dropout,
                               batch_normalization)
        encoders.append(descEncoder)
        inputHandlers.append(RNNInputHandler(paddingId))
    else:
        raise ArgumentError(
            "Encoder type of description is invalid (%s). You should choose one of these: "
            "rnn, cnn, cnn+dense, dense+self_att and word_mean" % descOpts['encoder_type'])
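# The 'dense+self_att' branch above relies on Dense_Self_Attention. A compact
# sketch of the underlying structured self-attention pooling (Lin et al.,
# 2017) is given below; it illustrates the technique only and is not the
# repo's module.
import torch
import torch.nn as nn

class SelfAttentionPoolingSketch(nn.Module):
    def __init__(self, input_size, self_att_hidden, n_hops):
        super().__init__()
        self.w1 = nn.Linear(input_size, self_att_hidden, bias=False)
        self.w2 = nn.Linear(self_att_hidden, n_hops, bias=False)

    def forward(self, x, pad_mask=None):
        # x: (batch, seq_len, input_size); pad_mask: (batch, seq_len), True at padding
        scores = self.w2(torch.tanh(self.w1(x)))              # (batch, seq_len, n_hops)
        if pad_mask is not None:
            scores = scores.masked_fill(pad_mask.unsqueeze(-1), float('-inf'))
        att = torch.softmax(scores, dim=1)                    # attention over the tokens
        return torch.bmm(att.transpose(1, 2), x)              # (batch, n_hops, input_size)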
logger.info(args)

lexicon, embedding = Embedding.fromFile(args.word_embedding, 'UUUKNNN', hasHeader=False,
                                        paddingSym="</s>")
database_lexicon = set(['UUUKNNN', '</s>'])

if args.db is not None:
    # Filter out any word that is not in the dataset.
    db = BugReportDatabase.fromJson(args.db)
    filters = loadFilters(args.filters)

    if args.tk == 'default':
        logger.info("Use default tokenizer to tokenize summary information")
        tokenizer = MultiLineTokenizer()
    elif args.tk == 'white_space':
        logger.info("Use white space tokenizer to tokenize summary information")
        tokenizer = WhitespaceTokenizer()
    else:
        raise ArgumentError(
            "Tokenizer value %s is invalid. You should choose one of these: default and white_space"
            % args.tk)

    for report in db.bugList:
        for token in extract_tokens(report.get('short_desc', ''), tokenizer, filters):
            database_lexicon.add(token)

        for token in extract_tokens(report.get('description', ''), tokenizer, filters):
            database_lexicon.add(token)
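# Sketch of the vocabulary filtering this script builds towards: keep only
# the embedding rows whose word occurs in database_lexicon. The names below
# are illustrative, not the repo's API.
def filter_embedding_sketch(words, vectors, database_lexicon):
    """words: list[str]; vectors: parallel list of embedding rows."""
    return [(w, v) for w, v in zip(words, vectors) if w in database_lexicon]

# filter_embedding_sketch(['a', 'crash'], [[0.1], [0.2]], {'crash'}) -> [('crash', [0.2])]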
    trainingBugs.update(bugDataset.bugIds)

logger.info("Preprocessing and fitting data")

trainingText = []

for bugId in trainingBugs:
    bugReport = bugReportDataset.getBug(bugId)
    text = concatenateSummaryAndDescription(bugReport)
    trainingText.append(text)

if args.load:
    logger.info('Loading object')
    vectorizer = pickle.load(open(args.load, 'rb'))
else:
    tokenizer = WhitespaceTokenizer() if args.space_tokenize else MultiLineTokenizer()
    stemmer = SnowballStemmer('english', ignore_stopwords=True) if args.stemmer else None
    stopWords = set(stopwords.words('english') + list(string.punctuation) + ["n't", "'t"])
    filters = [TransformNumberToZeroFilter(), StripPunctuactionFilter(), DectectNotUsualWordFilter()]
    tokenizerStemmer = ClassicalPreprocessing(tokenizer, stemmer, stopWords, filters)

    logger.info("Using %s to tokenize" % tokenizer.__class__.__name__)

    if args.trigram:
        ngramRange = (1, 3)
    elif args.bigram:
        ngramRange = (1, 2)
    else:
        ngramRange = (1, 1)

    if args.type == 'tfidf':
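# The 'tfidf' branch above is truncated in this excerpt; it presumably builds
# an sklearn vectorizer. A minimal sketch of TF-IDF with a custom tokenizer
# and n-gram range follows; the callable stands in for ClassicalPreprocessing,
# whose exact interface is not shown here.
from sklearn.feature_extraction.text import TfidfVectorizer

def build_tfidf_sketch(tokenize_fn, ngram_range=(1, 1)):
    # lowercase=False because the custom tokenizer is assumed to normalize text
    return TfidfVectorizer(tokenizer=tokenize_fn, lowercase=False, ngram_range=ngram_range)

# vectorizer = build_tfidf_sketch(lambda text: text.split(), (1, 2))
# matrix = vectorizer.fit_transform(trainingText)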
def main(_run, _config, _seed, _log):
    # Setting logger
    args = _config
    logger = _log

    logger.info(args)
    logger.info('It started at: %s' % datetime.now())

    torch.manual_seed(_seed)

    device = torch.device('cuda' if args['cuda'] else "cpu")

    if args['cuda']:
        logger.info("Turning CUDA on")
    else:
        logger.info("Turning CUDA off")

    # Setting the parameters to save and loading parameters
    important_parameters = ['dbr_cnn']
    parameters_to_save = dict([(name, args[name]) for name in important_parameters])

    if args['load'] is not None:
        map_location = (lambda storage, loc: storage.cuda()) if args['cuda'] else 'cpu'
        model_info = torch.load(args['load'], map_location=map_location)
        model_state = model_info['model']

        for param_name, param_value in model_info['params'].items():
            args[param_name] = param_value
    else:
        model_state = None

    # Set basic variables
    preprocessors = PreprocessorList()
    input_handlers = []
    report_database = BugReportDatabase.fromJson(args['bug_database'])
    batchSize = args['batch_size']
    dbr_cnn_opt = args['dbr_cnn']

    # Loading word embedding and lexicon
    emb = np.load(dbr_cnn_opt["word_embedding"])
    padding_sym = "</s>"

    lexicon = Lexicon(unknownSymbol=None)
    with codecs.open(dbr_cnn_opt["lexicon"]) as f:
        for l in f:
            lexicon.put(l.strip())
    lexicon.setUnknown("UUUKNNN")

    padding_id = lexicon.getLexiconIndex(padding_sym)
    embedding = Embedding(lexicon, emb, paddingIdx=padding_id)

    logger.info("Lexicon size: %d" % (lexicon.getLen()))
    logger.info("Word Embedding size: %d" % (embedding.getEmbeddingSize()))

    # Load filters and tokenizer
    filters = loadFilters(dbr_cnn_opt['filters'])

    if dbr_cnn_opt['tokenizer'] == 'default':
        logger.info("Use default tokenizer to tokenize summary information")
        tokenizer = MultiLineTokenizer()
    elif dbr_cnn_opt['tokenizer'] == 'white_space':
        logger.info("Use white space tokenizer to tokenize summary information")
        tokenizer = WhitespaceTokenizer()
    else:
        raise ArgumentError(
            "Tokenizer value %s is invalid. You should choose one of these: default and white_space"
            % dbr_cnn_opt['tokenizer'])

    # Add preprocessors
    preprocessors.append(
        DBR_CNN_CategoricalPreprocessor(dbr_cnn_opt['categorical_lexicon'], report_database))
    preprocessors.append(
        SummaryDescriptionPreprocessor(lexicon, report_database, filters, tokenizer, padding_id))

    # Add input_handlers
    input_handlers.append(DBRDCNN_CategoricalInputHandler())
    input_handlers.append(TextCNNInputHandler(padding_id, min(dbr_cnn_opt["window"])))

    # Create Model
    model = DBR_CNN(embedding, dbr_cnn_opt["window"], dbr_cnn_opt["nfilters"],
                    dbr_cnn_opt['update_embedding'])

    model.to(device)

    if model_state:
        model.load_state_dict(model_state)

    # Set loss function
    logger.info("Using BCE Loss")

    loss_fn = BCELoss()
    loss_no_reduction = BCELoss(reduction='none')
    cmp_collate = PairBugCollate(input_handlers, torch.float32, unsqueeze_target=True)

    # Loading the training and setting how the negative examples will be generated.
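    # For reference, the dbr_cnn option block read above looks roughly like
    # the following (illustrative values only, not the repo's config):
    #   {'word_embedding': 'emb.npy', 'lexicon': 'lexicon.txt',
    #    'categorical_lexicon': 'categorical.json', 'filters': [],
    #    'tokenizer': 'default', 'window': [3, 4, 5], 'nfilters': 100,
    #    'update_embedding': False}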
    if args.get('pairs_training'):
        negative_pair_gen_opt = args.get('neg_pair_generator')
        pairsTrainingFile = args.get('pairs_training')
        random_anchor = negative_pair_gen_opt['random_anchor']

        offlineGeneration = not (negative_pair_gen_opt is None or negative_pair_gen_opt['type'] == 'none')

        if not offlineGeneration:
            logger.info("Do not generate the negative examples dynamically.")
            pair_training_reader = PairBugDatasetReader(
                pairsTrainingFile, preprocessors, randomInvertPair=args['random_switch'])
        else:
            pair_gen_type = negative_pair_gen_opt['type']
            master_id_by_bug_id = report_database.getMasterIdByBugId()

            if pair_gen_type == 'random':
                logger.info("Random Negative Pair Generator")
                training_dataset = BugDataset(negative_pair_gen_opt['training'])
                bug_ids = training_dataset.bugIds

                logger.info(
                    "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d"
                    % (training_dataset.info, len(bug_ids)))

                negative_pair_generator = RandomGenerator(
                    preprocessors, cmp_collate, negative_pair_gen_opt['rate'], bug_ids,
                    master_id_by_bug_id)
            elif pair_gen_type == 'non_negative':
                logger.info("Non Negative Pair Generator")
                training_dataset = BugDataset(negative_pair_gen_opt['training'])
                bug_ids = training_dataset.bugIds

                logger.info(
                    "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d"
                    % (training_dataset.info, len(bug_ids)))

                negative_pair_generator = NonNegativeRandomGenerator(
                    preprocessors, cmp_collate, negative_pair_gen_opt['rate'], bug_ids,
                    master_id_by_bug_id, negative_pair_gen_opt['n_tries'], device,
                    randomAnchor=random_anchor)
            elif pair_gen_type == 'misc_non_zero':
                logger.info("Misc Non Zero Pair Generator")
                training_dataset = BugDataset(negative_pair_gen_opt['training'])
                bug_ids = training_dataset.bugIds

                logger.info(
                    "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d"
                    % (training_dataset.info, len(bug_ids)))

                negative_pair_generator = MiscNonZeroRandomGen(
                    preprocessors, cmp_collate, negative_pair_gen_opt['rate'], bug_ids,
                    training_dataset.duplicateIds, master_id_by_bug_id, device,
                    negative_pair_gen_opt['n_tries'], negative_pair_gen_opt['random_anchor'])
            elif pair_gen_type == 'random_k':
                logger.info("Random K Negative Pair Generator")
                training_dataset = BugDataset(negative_pair_gen_opt['training'])
                bug_ids = training_dataset.bugIds

                logger.info(
                    "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d"
                    % (training_dataset.info, len(bug_ids)))

                negative_pair_generator = KRandomGenerator(
                    preprocessors, cmp_collate, negative_pair_gen_opt['rate'], bug_ids,
                    master_id_by_bug_id, negative_pair_gen_opt['k'], device)
            elif pair_gen_type == "pre":
                logger.info("Pre-selected list generator")

                negative_pair_generator = PreSelectedGenerator(
                    negative_pair_gen_opt['pre_list_file'], preprocessors,
                    negative_pair_gen_opt['rate'], master_id_by_bug_id,
                    negative_pair_gen_opt['preselected_length'])
            elif pair_gen_type == "misc_non_zero_pre":
                logger.info("Misc: non-zero and Pre-selected list generator")

                negativePairGenerator1 = PreSelectedGenerator(
                    negative_pair_gen_opt['pre_list_file'], preprocessors,
                    negative_pair_gen_opt['rate'], master_id_by_bug_id,
                    negative_pair_gen_opt['preselected_length'])

                training_dataset = BugDataset(negative_pair_gen_opt['training'])
                bug_ids = training_dataset.bugIds

                negativePairGenerator2 = NonNegativeRandomGenerator(
                    preprocessors, cmp_collate, negative_pair_gen_opt['rate'], bug_ids,
                    master_id_by_bug_id, device, negative_pair_gen_opt['n_tries'])

                negative_pair_generator = MiscOfflineGenerator(
                    (negativePairGenerator1, negativePairGenerator2))
            else:
                raise ArgumentError(
                    "Offline generator is invalid (%s). You should choose one of these: random, "
                    "non_negative, misc_non_zero, random_k, pre and misc_non_zero_pre"
                    % pair_gen_type)

            pair_training_reader = PairBugDatasetReader(
                pairsTrainingFile, preprocessors, negative_pair_generator,
                randomInvertPair=args['random_switch'])

        training_loader = DataLoader(pair_training_reader, batch_size=batchSize,
                                     collate_fn=cmp_collate.collate, shuffle=True)
        logger.info("Training size: %s" % (len(training_loader.dataset)))

    # load validation
    if args.get('pairs_validation'):
        pair_validation_reader = PairBugDatasetReader(args.get('pairs_validation'), preprocessors)
        validation_loader = DataLoader(pair_validation_reader, batch_size=batchSize,
                                       collate_fn=cmp_collate.collate)

        logger.info("Validation size: %s" % (len(validation_loader.dataset)))
    else:
        validation_loader = None

    """
    Training and evaluating the model.
    """
""" optimizer_opt = args.get('optimizer', 'adam') if optimizer_opt == 'sgd': logger.info('SGD') optimizer = optim.SGD(model.parameters(), lr=args['lr'], weight_decay=args['l2'], momentum=args['momentum']) elif optimizer_opt == 'adam': logger.info('Adam') optimizer = optim.Adam(model.parameters(), lr=args['lr'], weight_decay=args['l2']) # Recall rate ranking_scorer = DBR_CNN_Scorer(preprocessors[0], preprocessors[1], input_handlers[0], input_handlers[1], model, device, args['ranking_batch_size']) recallEstimationTrainOpt = args.get('recall_estimation_train') if recallEstimationTrainOpt: preselectListRankingTrain = PreselectListRanking( recallEstimationTrainOpt) recallEstimationOpt = args.get('recall_estimation') if recallEstimationOpt: preselect_list_ranking = PreselectListRanking(recallEstimationOpt) lr_scheduler_opt = args.get('lr_scheduler', None) if lr_scheduler_opt is None or lr_scheduler_opt['type'] == 'constant': logger.info("Scheduler: Constant") lr_sched = None elif lr_scheduler_opt["type"] == 'step': logger.info("Scheduler: StepLR (step:%s, decay:%f)" % (lr_scheduler_opt["step_size"], args["decay"])) lr_sched = StepLR(optimizer, lr_scheduler_opt["step_size"], lr_scheduler_opt["decay"]) elif lr_scheduler_opt["type"] == 'exp': logger.info("Scheduler: ExponentialLR (decay:%f)" % (lr_scheduler_opt["decay"])) lr_sched = ExponentialLR(optimizer, lr_scheduler_opt["decay"]) elif lr_scheduler_opt["type"] == 'linear': logger.info( "Scheduler: Divide by (1 + epoch * decay) ---- (decay:%f)" % (lr_scheduler_opt["decay"])) lrDecay = lr_scheduler_opt["decay"] lr_sched = LambdaLR(optimizer, lambda epoch: 1 / (1.0 + epoch * lrDecay)) else: raise ArgumentError( "LR Scheduler is invalid (%s). You should choose one of these: step, exp and linear " % pair_gen_type) # Set training functions def trainingIteration(engine, batch): model.train() optimizer.zero_grad() x, y = cmp_collate.to(batch, device) output = model(*x) loss = loss_fn(output, y) loss.backward() optimizer.step() return loss, output, y trainer = Engine(trainingIteration) negTarget = 0.0 if isinstance(loss_fn, NLLLoss) else -1.0 trainingMetrics = { 'training_loss': AverageLoss(loss_fn), 'training_acc': AccuracyWrapper(output_transform=thresholded_output_transform), 'training_precision': PrecisionWrapper(output_transform=thresholded_output_transform), 'training_recall': RecallWrapper(output_transform=thresholded_output_transform), } # Add metrics to trainer for name, metric in trainingMetrics.items(): metric.attach(trainer, name) # Set validation functions def validationIteration(engine, batch): model.eval() with torch.no_grad(): x, y = cmp_collate.to(batch, device) y_pred = model(*x) return y_pred, y validationMetrics = { 'validation_loss': LossWrapper(loss_fn), 'validation_acc': AccuracyWrapper(output_transform=thresholded_output_transform), 'validation_precision': PrecisionWrapper(output_transform=thresholded_output_transform), 'validation_recall': RecallWrapper(output_transform=thresholded_output_transform), } evaluator = Engine(validationIteration) # Add metrics to evaluator for name, metric in validationMetrics.items(): metric.attach(evaluator, name) @trainer.on(Events.EPOCH_STARTED) def onStartEpoch(engine): epoch = engine.state.epoch logger.info("Epoch: %d" % epoch) if lr_sched: lr_sched.step() logger.info("LR: %s" % str(optimizer.param_groups[0]["lr"])) @trainer.on(Events.EPOCH_COMPLETED) def onEndEpoch(engine): epoch = engine.state.epoch logMetrics(_run, logger, engine.state.metrics, epoch) # Evaluate Training if validation_loader: 
    @trainer.on(Events.EPOCH_STARTED)
    def onStartEpoch(engine):
        epoch = engine.state.epoch
        logger.info("Epoch: %d" % epoch)

        if lr_sched:
            lr_sched.step()

        logger.info("LR: %s" % str(optimizer.param_groups[0]["lr"]))

    @trainer.on(Events.EPOCH_COMPLETED)
    def onEndEpoch(engine):
        epoch = engine.state.epoch

        logMetrics(_run, logger, engine.state.metrics, epoch)

        # Evaluate Training
        if validation_loader:
            evaluator.run(validation_loader)
            logMetrics(_run, logger, evaluator.state.metrics, epoch)

        lastEpoch = args['epochs'] - epoch == 0

        if recallEstimationTrainOpt and (epoch % args['rr_train_epoch'] == 0):
            logRankingResult(_run, logger, preselectListRankingTrain, ranking_scorer,
                             report_database, None, epoch, "train")
            ranking_scorer.free()

        if recallEstimationOpt and (epoch % args['rr_val_epoch'] == 0):
            logRankingResult(_run, logger, preselect_list_ranking, ranking_scorer, report_database,
                             args.get("ranking_result_file"), epoch, "validation")
            ranking_scorer.free()

        if not lastEpoch:
            pair_training_reader.sampleNewNegExamples(model, loss_no_reduction)

        if args.get('save'):
            save_by_epoch = args['save_by_epoch']

            if save_by_epoch and epoch in save_by_epoch:
                file_name, file_extension = os.path.splitext(args['save'])
                file_path = file_name + '_epoch_{}'.format(epoch) + file_extension
            else:
                file_path = args['save']

            modelInfo = {
                'model': model.state_dict(),
                'params': parameters_to_save
            }

            logger.info("==> Saving Model: %s" % file_path)
            torch.save(modelInfo, file_path)

    if args.get('pairs_training'):
        trainer.run(training_loader, max_epochs=args['epochs'])
    elif args.get('pairs_validation'):
        # Evaluate Training
        evaluator.run(validation_loader)
        logMetrics(_run, logger, evaluator.state.metrics, 0)

        if recallEstimationOpt:
            logRankingResult(_run, logger, preselect_list_ranking, ranking_scorer, report_database,
                             args.get("ranking_result_file"), 0, "validation")

    # Test Dataset (accuracy, recall, precision, F1)
    pair_test_dataset = args.get('pair_test_dataset')

    if pair_test_dataset is not None and len(pair_test_dataset) > 0:
        pairTestReader = PairBugDatasetReader(pair_test_dataset, preprocessors)
        testLoader = DataLoader(pairTestReader, batch_size=batchSize, collate_fn=cmp_collate.collate)

        if not isinstance(cmp_collate, PairBugCollate):
            raise NotImplementedError('Evaluation of pairs using tanh was not implemented yet')

        logger.info("Test size: %s" % (len(testLoader.dataset)))

        testMetrics = {
            'test_accuracy': ignite.metrics.Accuracy(output_transform=thresholded_output_transform),
            'test_precision': ignite.metrics.Precision(output_transform=thresholded_output_transform),
            'test_recall': ignite.metrics.Recall(output_transform=thresholded_output_transform),
            'test_predictions': PredictionCache(),
        }
        test_evaluator = Engine(validationIteration)

        # Add metrics to evaluator
        for name, metric in testMetrics.items():
            metric.attach(test_evaluator, name)

        test_evaluator.run(testLoader)

        for metricName, metricValue in test_evaluator.state.metrics.items():
            metric = testMetrics[metricName]

            if isinstance(metric, ignite.metrics.Accuracy):
                logger.info({
                    'type': 'metric', 'label': metricName, 'value': metricValue, 'epoch': None,
                    'correct': metric._num_correct, 'total': metric._num_examples
                })
                _run.log_scalar(metricName, metricValue)
            elif isinstance(metric, (ignite.metrics.Precision, ignite.metrics.Recall)):
                logger.info({
                    'type': 'metric', 'label': metricName, 'value': metricValue, 'epoch': None,
                    'tp': metric._true_positives.item(), 'total_positive': metric._positives.item()
                })
                _run.log_scalar(metricName, metricValue)
            elif isinstance(metric, ConfusionMatrix):
                acc = cmAccuracy(metricValue)
                prec = cmPrecision(metricValue, False)
                recall = cmRecall(metricValue, False)
                f1 = 2 * (prec * recall) / (prec + recall + 1e-15)

                logger.info({
                    'type': 'metric', 'label': metricName,
                    'accuracy': float(acc),
                    'precision': prec.cpu().numpy().tolist(),
                    'recall': recall.cpu().numpy().tolist(),
                    'f1': f1.cpu().numpy().tolist(),
                    'confusion_matrix': metricValue.cpu().numpy().tolist(),
                    'epoch': None
                })
                _run.log_scalar('test_f1', f1[1])
            elif isinstance(metric, PredictionCache):
                logger.info({
                    'type': 'metric', 'label': metricName, 'predictions': metric.predictions
                })

    # Calculate recall rate
    recall_rate_opt = args.get('recall_rate', {'type': 'none'})
    if recall_rate_opt['type'] != 'none':
        if recall_rate_opt['type'] == 'sun2011':
            logger.info("Calculating recall rate: {}".format(recall_rate_opt['type']))
            recall_rate_dataset = BugDataset(recall_rate_opt['dataset'])

            ranking_class = SunRanking(report_database, recall_rate_dataset, recall_rate_opt['window'])
            # We always group all bug reports by master in the results in the Sun 2011 methodology
            group_by_master = True
        elif recall_rate_opt['type'] == 'deshmukh':
            logger.info("Calculating recall rate: {}".format(recall_rate_opt['type']))
            recall_rate_dataset = BugDataset(recall_rate_opt['dataset'])
            ranking_class = DeshmukhRanking(report_database, recall_rate_dataset)
            group_by_master = recall_rate_opt['group_by_master']
        else:
            raise ArgumentError(
                "recall_rate.type is invalid (%s). You should choose one of these: none, sun2011 and deshmukh"
                % recall_rate_opt['type'])

        logRankingResult(_run, logger, ranking_class, ranking_scorer, report_database,
                         recall_rate_opt["result_file"], 0, None, group_by_master)
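# The recall rate reported above is, in essence: for each duplicate query,
# did any correct master/duplicate appear in the top-k recommendations? A
# minimal, self-contained sketch of that computation (names are illustrative,
# not the repo's API):
def recall_rate_sketch(ranked_lists, relevant_by_query, k):
    """ranked_lists: {query_id: [candidate ids, best first]};
    relevant_by_query: {query_id: set of correct ids}."""
    hits = sum(
        1 for q, ranking in ranked_lists.items()
        if relevant_by_query[q] & set(ranking[:k]))
    return hits / len(ranked_lists)

# recall_rate_sketch({'q1': ['b2', 'b7']}, {'q1': {'b7'}}, 2) -> 1.0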
def main(_run, _config, _seed, _log):
    """
    :param _run:
    :param _config:
    :param _seed:
    :param _log:
    :return:
    """
    """
    Setting and loading parameters
    """
    # Setting logger
    args = _config
    logger = _log

    logger.info(args)
    logger.info('It started at: %s' % datetime.now())

    torch.manual_seed(_seed)

    bugReportDatabase = BugReportDatabase.fromJson(args['bug_database'])
    paddingSym = "</s>"
    batchSize = args['batch_size']

    cudaOn = args['cuda']  # several helpers below expect this flag
    device = torch.device('cuda' if cudaOn else "cpu")

    if cudaOn:
        logger.info("Turning CUDA on")
    else:
        logger.info("Turning CUDA off")

    # It is the folder where the preprocessed information will be stored.
    cacheFolder = args['cache_folder']

    # Setting the parameters to save and loading parameters
    importantParameters = ['compare_aggregation', 'categorical']
    parametersToSave = dict([(parName, args[parName]) for parName in importantParameters])

    if args['load'] is not None:
        mapLocation = (lambda storage, loc: storage.cuda()) if cudaOn else 'cpu'
        modelInfo = torch.load(args['load'], map_location=mapLocation)
        modelState = modelInfo['model']

        for paramName, paramValue in modelInfo['params'].items():
            args[paramName] = paramValue
    else:
        modelState = None

    if args['rep'] is not None and args['rep']['model']:
        logger.info("Loading REP")
        rep = read_weights(args['rep']['model'])
        rep_input, max_tkn_id = read_dbrd_file(args['rep']['input'], math.inf)
        rep_recommendation = args['rep']['k']

        rep.fit_transform(rep_input, max_tkn_id, True)

        rep_input_by_id = {}

        for inp in rep_input:
            rep_input_by_id[inp[SUN_REPORT_ID_INDEX]] = inp
    else:
        rep = None

    preprocessors = PreprocessorList()
    inputHandlers = []

    categoricalOpt = args.get('categorical')

    if categoricalOpt is not None and len(categoricalOpt) != 0:
        categoricalEncoder, _, _ = processCategoricalParam(
            categoricalOpt, bugReportDatabase, inputHandlers, preprocessors, None, logger, cudaOn)
    else:
        categoricalEncoder = None

    filterInputHandlers = []

    compareAggOpt = args['compare_aggregation']
    databasePath = args['bug_database']

    # Loading word embedding
    if compareAggOpt["word_embedding"]:
        # todo: Allow the use of embeddings and other representations
        lexicon, embedding = Embedding.fromFile(
            compareAggOpt['word_embedding'], 'UUUKNNN', hasHeader=False, paddingSym=paddingSym)
        logger.info("Lexicon size: %d" % (lexicon.getLen()))
        logger.info("Word Embedding size: %d" % (embedding.getEmbeddingSize()))
        paddingId = lexicon.getLexiconIndex(paddingSym)
        lazy = False
    else:
        embedding = None

    # Tokenizer
    if compareAggOpt['tokenizer'] == 'default':
        logger.info("Use default tokenizer to tokenize summary information")
        tokenizer = MultiLineTokenizer()
    elif compareAggOpt['tokenizer'] == 'white_space':
        logger.info("Use white space tokenizer to tokenize summary information")
        tokenizer = WhitespaceTokenizer()
    else:
        raise ArgumentError(
            "Tokenizer value %s is invalid. You should choose one of these: default and white_space"
            % compareAggOpt['tokenizer'])

    # Preparing input handlers, preprocessors and cache
    minSeqSize = max(compareAggOpt['aggregate']["window"]) if compareAggOpt['aggregate']["model"] == "cnn" else -1

    if compareAggOpt['summary'] is not None:
        # Use the summary to address this problem
        logger.info("Using Summary information.")

        # Loading Filters
        sumFilters = loadFilters(compareAggOpt['summary']['filters'])

        if compareAggOpt['summary']['model_type'] in ('lstm', 'gru', 'word_emd', 'residual'):
            arguments = (databasePath, compareAggOpt['word_embedding'],
                         ' '.join(sorted([fil.__class__.__name__ for fil in sumFilters])),
                         compareAggOpt['tokenizer'], SummaryPreprocessor.__name__)

            inputHandlers.append(RNNInputHandler(paddingId, minInputSize=minSeqSize))

            summaryCache = PreprocessingCache(cacheFolder, arguments)
            summaryPreprocessor = SummaryPreprocessor(lexicon, bugReportDatabase, sumFilters,
                                                      tokenizer, paddingId, summaryCache)
        elif compareAggOpt['summary']['model_type'] == 'ELMo':
            raise NotImplementedError("ELMO is not implemented!")
            # inputHandlers.append(ELMoInputHandler(cudaOn, minInputSize=minSeqSize))
            # summaryPreprocessor = ELMoPreprocessor(0, elmoEmbedding)
            # compareAggOpt['summary']["input_size"] = elmoEmbedding.get_size()
        elif compareAggOpt['summary']['model_type'] == 'BERT':
            arguments = (databasePath, "CADD SUMMARY", "BERT", "bert-base-uncased")

            inputHandlers.append(BERTInputHandler(0, minInputSize=minSeqSize))

            summaryCache = PreprocessingCache(cacheFolder, arguments)
            summaryPreprocessor = TransformerPreprocessor(
                "short_desc", "bert-base-uncased", BertTokenizer, 0, bugReportDatabase, summaryCache)
            # compareAggOpt['summary']["input_size"] = 768

        preprocessors.append(summaryPreprocessor)

    if compareAggOpt['desc'] is not None:
        # Use the description to address this problem
        logger.info("Using Description information.")

        descFilters = loadFilters(compareAggOpt['desc']['filters'])

        if compareAggOpt['desc']['model_type'] in ('lstm', 'gru', 'word_emd', 'residual'):
            arguments = (databasePath, compareAggOpt['word_embedding'],
                         ' '.join(sorted([fil.__class__.__name__ for fil in descFilters])),
                         compareAggOpt['tokenizer'], "CADD DESC",
                         str(compareAggOpt['desc']['summarization']))

            inputHandlers.append(RNNInputHandler(paddingId, minInputSize=minSeqSize))

            descriptionCache = PreprocessingCache(cacheFolder, arguments)
            descPreprocessor = DescriptionPreprocessor(lexicon, bugReportDatabase, descFilters,
                                                       tokenizer, paddingId, cache=descriptionCache)
        elif compareAggOpt['desc']['model_type'] == 'ELMo':
            raise NotImplementedError("ELMO is not implemented!")
            # inputHandlers.append(ELMoInputHandler(cudaOn, minInputSize=minSeqSize))
            # descPreprocessor = ELMoPreprocessor(1, elmoEmbedding)
            # compareAggOpt['desc']["input_size"] = elmoEmbedding.get_size()
        elif compareAggOpt['desc']['model_type'] == 'BERT':
            arguments = (databasePath, "CADD DESC", "BERT", "bert-base-uncased")

            inputHandlers.append(BERTInputHandler(0, minInputSize=minSeqSize))

            descriptionCache = PreprocessingCache(cacheFolder, arguments)
            descPreprocessor = TransformerPreprocessor(
                "description", "bert-base-uncased", BertTokenizer, 0, bugReportDatabase,
                descriptionCache)
            # compareAggOpt['desc']["input_size"] = 768

        preprocessors.append(descPreprocessor)
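    # The BERT branches above rely on TransformerPreprocessor to turn raw text
    # into wordpiece ids. The underlying huggingface call is roughly the
    # following (sketch; the repo may pin a different transformers version):
    #   from transformers import BertTokenizer
    #   bert_tk = BertTokenizer.from_pretrained('bert-base-uncased')
    #   ids = bert_tk.encode("App crashes on startup", add_special_tokens=True)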
    # Create model
    model = CADD(embedding, categoricalEncoder, compareAggOpt, compareAggOpt['summary'],
                 compareAggOpt['desc'], compareAggOpt['matching'], compareAggOpt['aggregate'],
                 cudaOn=cudaOn)

    lossFn = F.nll_loss
    lossNoReduction = NLLLoss(reduction='none')

    if cudaOn:
        model.cuda()

    if modelState:
        model.load_state_dict(modelState)

    """
    Loading the training and validation. Also, it sets how the negative examples will be generated.
    """
    cmpAggCollate = PairBugCollate(inputHandlers, torch.int64)

    # load training
    if args.get('pairs_training'):
        negativePairGenOpt = args.get('neg_pair_generator')
        pairTrainingFile = args.get('pairs_training')

        offlineGeneration = not (negativePairGenOpt is None or negativePairGenOpt['type'] == 'none')
        masterIdByBugId = bugReportDatabase.getMasterIdByBugId()
        randomAnchor = negativePairGenOpt['random_anchor']

        if rep:
            logger.info("Generate negative examples using REP.")
            randomAnchor = negativePairGenOpt['random_anchor']

            trainingDataset = BugDataset(args['rep']['training'])
            bugIds = trainingDataset.bugIds

            negativePairGenerator = REPGenerator(rep, rep_input_by_id, args['rep']['neg_training'],
                                                 preprocessors, bugIds, masterIdByBugId,
                                                 args['rep']['rate'], randomAnchor)
        elif not offlineGeneration:
            logger.info("Do not generate the negative examples dynamically.")
            negativePairGenerator = None
        else:
            pairGenType = negativePairGenOpt['type']

            if pairGenType == 'random':
                logger.info("Random Negative Pair Generator")
                trainingDataset = BugDataset(negativePairGenOpt['training'])
                bugIds = trainingDataset.bugIds

                logger.info(
                    "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d"
                    % (trainingDataset.info, len(bugIds)))

                negativePairGenerator = RandomGenerator(
                    preprocessors, cmpAggCollate, negativePairGenOpt['rate'], bugIds,
                    masterIdByBugId, randomAnchor=randomAnchor)
            elif pairGenType == 'non_negative':
                logger.info("Non Negative Pair Generator")
                trainingDataset = BugDataset(negativePairGenOpt['training'])
                bugIds = trainingDataset.bugIds

                logger.info(
                    "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d"
                    % (trainingDataset.info, len(bugIds)))

                negativePairGenerator = NonNegativeRandomGenerator(
                    preprocessors, cmpAggCollate, negativePairGenOpt['rate'], bugIds,
                    masterIdByBugId, negativePairGenOpt['n_tries'], device, randomAnchor=randomAnchor)
            elif pairGenType == 'misc_non_zero':
                logger.info("Misc Non Zero Pair Generator")
                trainingDataset = BugDataset(negativePairGenOpt['training'])
                bugIds = trainingDataset.bugIds

                logger.info(
                    "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d"
                    % (trainingDataset.info, len(bugIds)))

                negativePairGenerator = MiscNonZeroRandomGen(
                    preprocessors, cmpAggCollate, negativePairGenOpt['rate'], bugIds,
                    trainingDataset.duplicateIds, masterIdByBugId, negativePairGenOpt['n_tries'],
                    device, randomAnchor=randomAnchor)
            elif pairGenType == 'random_k':
                logger.info("Random K Negative Pair Generator")
                trainingDataset = BugDataset(negativePairGenOpt['training'])
                bugIds = trainingDataset.bugIds

                logger.info(
                    "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d"
                    % (trainingDataset.info, len(bugIds)))

                negativePairGenerator = KRandomGenerator(
                    preprocessors, cmpAggCollate, negativePairGenOpt['rate'], bugIds,
                    masterIdByBugId, negativePairGenOpt['k'], device, randomAnchor=randomAnchor)
            elif pairGenType == "pre":
                logger.info("Pre-selected list generator")

                negativePairGenerator = PreSelectedGenerator(
                    negativePairGenOpt['pre_list_file'], preprocessors, negativePairGenOpt['rate'],
                    masterIdByBugId, negativePairGenOpt['preselected_length'],
                    randomAnchor=randomAnchor)
            elif pairGenType == "positive_pre":
                logger.info("Positive Pre-selected list generator")

                negativePairGenerator = PositivePreSelectedGenerator(
                    negativePairGenOpt['pre_list_file'], preprocessors, cmpAggCollate,
                    negativePairGenOpt['rate'], masterIdByBugId,
                    negativePairGenOpt['preselected_length'], randomAnchor=randomAnchor)
            elif pairGenType == "misc_non_zero_pre":
                logger.info("Misc: non-zero and Pre-selected list generator")

                negativePairGenerator1 = PreSelectedGenerator(
                    negativePairGenOpt['pre_list_file'], preprocessors, negativePairGenOpt['rate'],
                    masterIdByBugId, negativePairGenOpt['preselected_length'],
                    randomAnchor=randomAnchor)

                trainingDataset = BugDataset(negativePairGenOpt['training'])
                bugIds = trainingDataset.bugIds

                negativePairGenerator2 = NonNegativeRandomGenerator(
                    preprocessors, cmpAggCollate, negativePairGenOpt['rate'], bugIds,
                    masterIdByBugId, negativePairGenOpt['n_tries'], device, randomAnchor=randomAnchor)

                negativePairGenerator = MiscOfflineGenerator(
                    (negativePairGenerator1, negativePairGenerator2))
            elif pairGenType == "misc_non_zero_positive_pre":
                logger.info("Misc: non-zero and Positive Pre-selected list generator")

                negativePairGenerator1 = PositivePreSelectedGenerator(
                    negativePairGenOpt['pre_list_file'], preprocessors, cmpAggCollate,
                    negativePairGenOpt['rate'], masterIdByBugId,
                    negativePairGenOpt['preselected_length'], randomAnchor=randomAnchor)

                trainingDataset = BugDataset(negativePairGenOpt['training'])
                bugIds = trainingDataset.bugIds

                negativePairGenerator2 = NonNegativeRandomGenerator(
                    preprocessors, cmpAggCollate, negativePairGenOpt['rate'], bugIds,
                    masterIdByBugId, negativePairGenOpt['n_tries'], device, randomAnchor=randomAnchor)

                negativePairGenerator = MiscOfflineGenerator(
                    (negativePairGenerator1, negativePairGenerator2))
            else:
                raise ArgumentError(
                    "Offline generator is invalid (%s). You should choose one of these: random, "
                    "non_negative, misc_non_zero, random_k, pre, positive_pre, misc_non_zero_pre "
                    "and misc_non_zero_positive_pre" % pairGenType)

        pairTrainingReader = PairBugDatasetReader(
            pairTrainingFile, preprocessors, negativePairGenerator,
            randomInvertPair=args['random_switch'])

        trainingCollate = cmpAggCollate
        trainingLoader = DataLoader(pairTrainingReader, batch_size=batchSize,
                                    collate_fn=trainingCollate.collate, shuffle=True)
        logger.info("Training size: %s" % (len(trainingLoader.dataset)))

    # load validation
    if args.get('pairs_validation'):
        pairValidationReader = PairBugDatasetReader(args.get('pairs_validation'), preprocessors)
        validationLoader = DataLoader(pairValidationReader, batch_size=batchSize,
                                      collate_fn=cmpAggCollate.collate)

        logger.info("Validation size: %s" % (len(validationLoader.dataset)))
    else:
        validationLoader = None

    """
    Training and evaluating the model.
    """
""" optimizer_opt = args.get('optimizer', 'adam') if optimizer_opt == 'sgd': logger.info('SGD') optimizer = optim.SGD(model.parameters(), lr=args['lr'], weight_decay=args['l2']) elif optimizer_opt == 'adam': logger.info('Adam') optimizer = optim.Adam(model.parameters(), lr=args['lr'], weight_decay=args['l2']) # Recall rate rankingScorer = GeneralScorer(model, preprocessors, device, cmpAggCollate) recallEstimationTrainOpt = args.get('recall_estimation_train') if recallEstimationTrainOpt: preselectListRankingTrain = PreselectListRanking( recallEstimationTrainOpt, args['sample_size_rr_tr']) recallEstimationOpt = args.get('recall_estimation') if recallEstimationOpt: preselectListRanking = PreselectListRanking(recallEstimationOpt, args['sample_size_rr_val']) # LR scheduler lrSchedulerOpt = args.get('lr_scheduler', None) if lrSchedulerOpt is None: logger.info("Scheduler: Constant") lrSched = None elif lrSchedulerOpt["type"] == 'step': logger.info("Scheduler: StepLR (step:%s, decay:%f)" % (lrSchedulerOpt["step_size"], args["decay"])) lrSched = StepLR(optimizer, lrSchedulerOpt["step_size"], lrSchedulerOpt["decay"]) elif lrSchedulerOpt["type"] == 'exp': logger.info("Scheduler: ExponentialLR (decay:%f)" % (lrSchedulerOpt["decay"])) lrSched = ExponentialLR(optimizer, lrSchedulerOpt["decay"]) elif lrSchedulerOpt["type"] == 'linear': logger.info( "Scheduler: Divide by (1 + epoch * decay) ---- (decay:%f)" % (lrSchedulerOpt["decay"])) lrDecay = lrSchedulerOpt["decay"] lrSched = LambdaLR(optimizer, lambda epoch: 1 / (1.0 + epoch * lrDecay)) else: raise ArgumentError( "LR Scheduler is invalid (%s). You should choose one of these: step, exp and linear " % pairGenType) # Set training functions def trainingIteration(engine, batch): engine.kk = 0 model.train() optimizer.zero_grad() x, y = batch output = model(*x) loss = lossFn(output, y) loss.backward() optimizer.step() return loss, output, y def scoreDistanceTrans(output): if len(output) == 3: _, y_pred, y = output else: y_pred, y = output if lossFn == F.nll_loss: return torch.exp(y_pred[:, 1]), y trainer = Engine(trainingIteration) trainingMetrics = { 'training_loss': AverageLoss(lossFn, batch_size=lambda x: x[0].shape[0]), 'training_dist_target': MeanScoreDistance(output_transform=scoreDistanceTrans) } # Add metrics to trainer for name, metric in trainingMetrics.items(): metric.attach(trainer, name) # Set validation functions def validationIteration(engine, batch): if not hasattr(engine, 'kk'): engine.kk = 0 model.eval() with torch.no_grad(): x, y = batch y_pred = model(*x) # for k, (pred, t) in enumerate(zip(y_pred, y)): # engine.kk += 1 # print("{}: {} \t {}".format(engine.kk, torch.round(torch.exp(pred) * 100), t)) return y_pred, y validationMetrics = { 'validation_loss': ignite.metrics.Loss(lossFn), 'validation_dist_target': MeanScoreDistance(output_transform=scoreDistanceTrans) } evaluator = Engine(validationIteration) # Add metrics to evaluator for name, metric in validationMetrics.items(): metric.attach(evaluator, name) # recommendation if rep: recommendation_fn = REP_CADD_Recommender( rep, rep_input_by_id, rep_recommendation).generateRecommendationList else: recommendation_fn = generateRecommendationList @trainer.on(Events.EPOCH_STARTED) def onStartEpoch(engine): epoch = engine.state.epoch logger.info("Epoch: %d" % epoch) if lrSched: lrSched.step() logger.info("LR: %s" % str(optimizer.param_groups[0]["lr"])) @trainer.on(Events.EPOCH_COMPLETED) def onEndEpoch(engine): epoch = engine.state.epoch logMetrics(_run, logger, engine.state.metrics, epoch) # 
    @trainer.on(Events.EPOCH_COMPLETED)
    def onEndEpoch(engine):
        epoch = engine.state.epoch

        logMetrics(_run, logger, engine.state.metrics, epoch)

        # Evaluate the model on the validation data
        if validationLoader:
            evaluator.run(validationLoader)
            logMetrics(_run, logger, evaluator.state.metrics, epoch)

        if recallEstimationTrainOpt and (epoch % args['rr_train_epoch'] == 0):
            logRankingResult(_run, logger, preselectListRankingTrain,
                             rankingScorer, bugReportDatabase, None, epoch,
                             "train", recommendationListfn=recommendation_fn)
            rankingScorer.free()

        if recallEstimationOpt and (epoch % args['rr_val_epoch'] == 0):
            logRankingResult(_run, logger, preselectListRanking, rankingScorer,
                             bugReportDatabase, args.get("ranking_result_file"),
                             epoch, "validation",
                             recommendationListfn=recommendation_fn)
            rankingScorer.free()

        pairTrainingReader.sampleNewNegExamples(model, lossNoReduction)

        if args.get('save'):
            save_by_epoch = args['save_by_epoch']

            if save_by_epoch and epoch in save_by_epoch:
                file_name, file_extension = os.path.splitext(args['save'])
                file_path = file_name + '_epoch_{}'.format(epoch) + file_extension
            else:
                file_path = args['save']

            modelInfo = {
                'model': model.state_dict(),
                'params': parametersToSave
            }

            logger.info("==> Saving Model: %s" % file_path)
            torch.save(modelInfo, file_path)

    if args.get('pairs_training'):
        trainer.run(trainingLoader, max_epochs=args['epochs'])
    elif args.get('pairs_validation'):
        # Evaluate the model on the validation data only
        evaluator.run(validationLoader)
        logMetrics(_run, logger, evaluator.state.metrics, 0)

        if recallEstimationOpt:
            logRankingResult(_run, logger, preselectListRanking, rankingScorer,
                             bugReportDatabase, args.get("ranking_result_file"),
                             0, "validation",
                             recommendationListfn=recommendation_fn)

    recallRateOpt = args.get('recall_rate', {'type': 'none'})
    if recallRateOpt['type'] != 'none':
        if recallRateOpt['type'] == 'sun2011':
            logger.info("Calculating recall rate: {}".format(recallRateOpt['type']))
            recallRateDataset = BugDataset(recallRateOpt['dataset'])

            rankingClass = SunRanking(bugReportDatabase, recallRateDataset,
                                      recallRateOpt['window'])
            # In the Sun 2011 methodology, the results always group bug reports by master.
            group_by_master = True
        elif recallRateOpt['type'] == 'deshmukh':
            logger.info("Calculating recall rate: {}".format(recallRateOpt['type']))
            recallRateDataset = BugDataset(recallRateOpt['dataset'])
            rankingClass = DeshmukhRanking(bugReportDatabase, recallRateDataset)
            group_by_master = recallRateOpt['group_by_master']
        else:
            raise ArgumentError(
                "recall_rate.type is invalid (%s). You should choose one of these: sun2011 and deshmukh"
                % recallRateOpt['type'])

        logRankingResult(_run, logger, rankingClass, rankingScorer,
                         bugReportDatabase, recallRateOpt["result_file"], 0,
                         None, group_by_master,
                         recommendationListfn=recommendation_fn)
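# Usage sketch (hypothetical invocation; the script name is illustrative).
# The `_run`/`_config`/`_seed`/`_log` signature of main() follows the Sacred
# convention, so the experiment is typically launched with a configuration
# that supplies the `args` keys read above (bug_database, pairs_training,
# pairs_validation, optimizer, lr, l2, epochs, save, recall_rate, ...), e.g.:
#
#   python experiment.py with config.json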