def load_embedding(opts, paddingSym):
    if opts["lexicon"]:
        # The embedding matrix and the lexicon are stored in separate files:
        # load the .npy matrix and read one lexicon entry per line.
        emb = np.load(opts["word_embedding"])
        lexicon = Lexicon(unknownSymbol=None)

        with codecs.open(opts["lexicon"]) as f:
            for l in f:
                lexicon.put(l.strip())

        lexicon.setUnknown("UUUKNNN")
        paddingId = lexicon.getLexiconIndex(paddingSym)
        embedding = Embedding(lexicon, emb, paddingIdx=paddingId)
    elif opts["word_embedding"]:
        # TODO: allow using embeddings together with other representations.
        lexicon, embedding = Embedding.fromFile(opts['word_embedding'], 'UUUKNNN', hasHeader=False,
                                                paddingSym=paddingSym)

    return lexicon, embedding
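# A minimal usage sketch (hypothetical paths, not part of the original script).
# "lexicon" and "word_embedding" are the same option keys read above; the .npy
# matrix is assumed to have been generated together with the lexicon file, one
# token per line:
#
#   opts = {"lexicon": "data/lexicon.txt", "word_embedding": "data/embedding.npy"}
#   lexicon, embedding = load_embedding(opts, paddingSym="</s>")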
def main(): full_path = os.path.realpath(__file__) path, filename = os.path.split(full_path) logging.config.fileConfig(os.path.join(path, 'logging.conf'), defaults={}) log = logging.getLogger(__name__) if len(sys.argv) != 2: log.error("Missing argument: <JSON config file>") exit(1) argsDict = JsonArgParser(PARAMETERS).parse(sys.argv[1]) args = dict2obj(argsDict, 'ShortDocArguments') logging.getLogger(__name__).info(argsDict) if args.seed: random.seed(args.seed) np.random.seed(args.seed) lr = args.lr startSymbol = args.start_symbol endSymbol = args.end_symbol numEpochs = args.num_epochs shuffle = args.shuffle normalizeMethod = args.normalization wordWindowSize = args.word_window_size hiddenLayerSize = args.hidden_size convSize = args.conv_size # Load classes for filters. filters = [] for filterName in args.filters: moduleName, className = filterName.rsplit('.', 1) log.info("Filtro: " + moduleName + " " + className) module_ = importlib.import_module(moduleName) filters.append(getattr(module_, className)()) W1 = None b1 = None W2 = None b2 = None wordEmbedding = None if args.word_embedding: log.info("Reading W2v File") (wordLexicon, wordEmbedding) = Embedding.fromWord2Vec(args.word_embedding, unknownSymbol="__UNKNOWN__") wordLexicon.stopAdd() elif args.word_lexicon and args.word_emb_size: wordLexicon = Lexicon.fromTextFile(args.word_lexicon, hasUnknowSymbol=False) wordEmbedding = Embedding(wordLexicon, embeddingSize=args.word_emb_size) wordLexicon.stopAdd() else: log.error("You must provide argument word_embedding or word_lexicon and word_emb_size") # Create the lexicon of labels. labelLexicon = None if args.labels is not None: if args.label_lexicon is not None: log.error("Only one of the parameters label_lexicon and labels can be provided!") exit(1) labelLexicon = Lexicon.fromList(args.labels, hasUnknowSymbol=False) elif args.label_lexicon is not None: labelLexicon = Lexicon.fromTextFile(args.label_lexicon, hasUnknowSymbol=False) else: log.error("One of the parameters label_lexicon or labels must be provided!") exit(1) # # Build the network model (Theano graph). # # TODO: debug # theano.config.compute_test_value = 'warn' # ex = trainIterator.next() # inWords.tag.test_value = ex[0][0] # outLabel.tag.test_value = ex[1][0] # Matriz de entrada. Cada linha representa um token da oferta. Cada token é # representado por uma janela de tokens (token central e alguns tokens # próximos). Cada valor desta matriz corresponde a um índice que representa # um token no embedding. inWords = tensor.lmatrix("inWords") # Categoria correta de uma oferta. outLabel = tensor.lscalar("outLabel") # List of input tensors. One for each input layer. inputTensors = [inWords] # Whether the word embedding will be updated during training. embLayerTrainable = not args.fix_word_embedding if not embLayerTrainable: log.info("Not updating the word embedding!") # Lookup table for word features. embeddingLayer = EmbeddingLayer(inWords, wordEmbedding.getEmbeddingMatrix(), trainable=embLayerTrainable) # if not args.train and args.load_wordEmbedding: # attrs = np.load(args.load_wordEmbedding) # embeddingLayer.load(attrs) # log.info("Loaded word embedding (shape %s) from file %s" % ( # str(attrs[0].shape), args.load_wordEmbedding)) # A saída da lookup table possui 3 dimensões (numTokens, szWindow, szEmbedding). # Esta camada dá um flat nas duas últimas dimensões, produzindo uma saída # com a forma (numTokens, szWindow * szEmbedding). flattenInput = FlattenLayer(embeddingLayer) # Random weight initialization procedure. 
weightInit = GlorotUniform() # Convolution layer. Convolução no texto de uma oferta. convW = None convb = None if not args.train and args.load_conv: convNPY = np.load(args.load_conv) convW = convNPY[0] convb = convNPY[1] log.info("Loaded convolutional layer (shape %s) from file %s" % (str(convW.shape), args.load_conv)) convLinear = LinearLayer(flattenInput, wordWindowSize * wordEmbedding.getEmbeddingSize(), convSize, W=convW, b=convb, weightInitialization=weightInit) if args.conv_act: convOut = ActivationLayer(convLinear, tanh) else: convOut = convLinear # Max pooling layer. maxPooling = MaxPoolingLayer(convOut) # Hidden layer. if not args.train and args.load_hiddenLayer: hiddenNPY = np.load(args.load_hiddenLayer) W1 = hiddenNPY[0] b1 = hiddenNPY[1] log.info("Loaded hidden layer (shape %s) from file %s" % (str(W1.shape), args.load_hiddenLayer)) hiddenLinear = LinearLayer(maxPooling, convSize, hiddenLayerSize, W=W1, b=b1, weightInitialization=weightInit) hiddenAct = ActivationLayer(hiddenLinear, tanh) # Entrada linear da camada softmax. if not args.train and args.load_softmax: hiddenNPY = np.load(args.load_softmax) W2 = hiddenNPY[0] b2 = hiddenNPY[1] log.info("Loaded softmax layer (shape %s) from file %s" % (str(W2.shape), args.load_softmax)) sotmaxLinearInput = LinearLayer(hiddenAct, hiddenLayerSize, labelLexicon.getLen(), W=W2, b=b2, weightInitialization=ZeroWeightGenerator()) # Softmax. # softmaxAct = ReshapeLayer(ActivationLayer(sotmaxLinearInput, softmax), (1, -1)) softmaxAct = ActivationLayer(sotmaxLinearInput, softmax) # Prediction layer (argmax). prediction = ArgmaxPrediction(None).predict(softmaxAct.getOutput()) # Loss function. if args.label_weights is not None and len(args.label_weights) != labelLexicon.getLen(): log.error("Number of label weights (%d) is different from number of labels (%d)!" % ( len(args.label_weights), labelLexicon.getLen())) nlloe = NegativeLogLikelihoodOneExample(weights=args.label_weights) loss = nlloe.calculateError(softmaxAct.getOutput()[0], prediction, outLabel) # Input generators: word window. inputGenerators = [WordWindowGenerator(wordWindowSize, wordLexicon, filters, startSymbol, endSymbol)] # Output generator: generate one label per offer. outputGenerators = [TextLabelGenerator(labelLexicon)] # outputGenerators = [lambda label: labelLexicon.put(label)] evalPerIteration = None if args.train: trainDatasetReader = ShortDocReader(args.train) if args.load_method == "sync": log.info("Reading training examples...") trainIterator = SyncBatchIterator(trainDatasetReader, inputGenerators, outputGenerators, - 1, shuffle=shuffle) wordLexicon.stopAdd() elif args.load_method == "async": log.info("Examples will be asynchronously loaded.") trainIterator = AsyncBatchIterator(trainDatasetReader, inputGenerators, outputGenerators, - 1, shuffle=shuffle, maxqSize=1000) else: log.error("The argument 'load_method' has an invalid value: %s." 
% args.load_method) sys.exit(1) labelLexicon.stopAdd() # Get dev inputs and output dev = args.dev evalPerIteration = args.eval_per_iteration if not dev and evalPerIteration > 0: log.error("Argument eval_per_iteration cannot be used without a dev argument.") sys.exit(1) if dev: log.info("Reading development examples") devReader = ShortDocReader(args.dev) devIterator = SyncBatchIterator(devReader, inputGenerators, outputGenerators, - 1, shuffle=False) else: devIterator = None else: trainIterator = None devIterator = None if normalizeMethod == "minmax": log.info("Normalization: minmax") wordEmbedding.minMaxNormalization() elif normalizeMethod == "mean": log.info("Normalization: mean normalization") wordEmbedding.meanNormalization() elif normalizeMethod == "zscore": log.info("Normalization: zscore normalization") wordEmbedding.zscoreNormalization() elif normalizeMethod: log.error("Normalization: unknown value %s" % normalizeMethod) sys.exit(1) # Decaimento da taxa de aprendizado. decay = None if args.decay == "none": decay = 0.0 elif args.decay == "linear": decay = 1.0 else: log.error("Unknown decay parameter %s." % args.decay) exit(1) # Algoritmo de aprendizado. if args.alg == "adagrad": log.info("Using Adagrad") opt = Adagrad(lr=lr, decay=decay) elif args.alg == "sgd": log.info("Using SGD") opt = SGD(lr=lr, decay=decay) else: log.error("Unknown algorithm: %s." % args.alg) sys.exit(1) # TODO: debug # opt.lr.tag.test_value = 0.05 # Printing embedding information. dictionarySize = wordEmbedding.getNumberOfVectors() embeddingSize = wordEmbedding.getEmbeddingSize() log.info("Dictionary size: %d" % dictionarySize) log.info("Embedding size: %d" % embeddingSize) log.info("Number of categories: %d" % labelLexicon.getLen()) # Train metrics. trainMetrics = None if trainIterator: trainMetrics = [ LossMetric("TrainLoss", loss), AccuracyMetric("TrainAccuracy", outLabel, prediction) ] # Evaluation metrics. evalMetrics = None if devIterator: evalMetrics = [ LossMetric("EvalLoss", loss), AccuracyMetric("EvalAccuracy", outLabel, prediction), FMetric("EvalFMetric", outLabel, prediction, labels=labelLexicon.getLexiconDict().values()) ] # Test metrics. 
testMetrics = None if args.test: testMetrics = [ LossMetric("TestLoss", loss), AccuracyMetric("TestAccuracy", outLabel, prediction), FMetric("TestFMetric", outLabel, prediction, labels=labelLexicon.getLexiconDict().values()) ] # TODO: debug # mode = theano.compile.debugmode.DebugMode(optimizer=None) mode = None model = BasicModel(x=inputTensors, y=[outLabel], allLayers=softmaxAct.getLayerSet(), optimizer=opt, prediction=prediction, loss=loss, trainMetrics=trainMetrics, evalMetrics=evalMetrics, testMetrics=testMetrics, mode=mode) # Training if trainIterator: log.info("Training") model.train(trainIterator, numEpochs, devIterator, evalPerIteration=evalPerIteration) # Saving model after training if args.save_wordEmbedding: embeddingLayer.saveAsW2V(args.save_wordEmbedding, lexicon=wordLexicon) log.info("Saved word to vector to file: %s" % (args.save_wordEmbedding)) if args.save_conv: convLinear.save(args.save_conv) log.info("Saved convolution layer to file: %s" % (args.save_conv)) if args.save_hiddenLayer: hiddenLinear.save(args.save_hiddenLayer) log.info("Saved hidden layer to file: %s" % (args.save_hiddenLayer)) if args.save_softmax: sotmaxLinearInput.save(args.save_softmax) log.info("Saved softmax to file: %s" % (args.save_softmax)) # Testing if args.test: log.info("Reading test examples") testReader = ShortDocReader(args.test) testIterator = SyncBatchIterator(testReader, inputGenerators, outputGenerators, - 1, shuffle=False) log.info("Testing") model.test(testIterator)
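# A hypothetical, minimal JSON config for this script (key names inferred from the
# "args.*" accesses above; values are placeholders and the actual PARAMETERS schema
# may define further options):
#
#   {
#     "train": "data/train.txt",
#     "word_embedding": "data/word2vec.txt",
#     "labels": ["label_a", "label_b"],
#     "filters": ["module.path.FilterClass"],
#     "lr": 0.01,
#     "num_epochs": 10,
#     "word_window_size": 5,
#     "hidden_size": 300,
#     "conv_size": 50,
#     "alg": "sgd",
#     "decay": "none",
#     "normalization": "minmax",
#     "load_method": "sync",
#     "start_symbol": "</s>",
#     "end_symbol": "</s>"
#   }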
def main(_run, _config, _seed, _log): """ :param _run: :param _config: :param _seed: :param _log: :return: """ """ Setting and loading parameters """ # Setting logger args = _config logger = _log logger.info(args) logger.info('It started at: %s' % datetime.now()) torch.manual_seed(_seed) bugReportDatabase = BugReportDatabase.fromJson(args['bug_database']) paddingSym = "</s>" batchSize = args['batch_size'] device = torch.device('cuda' if args['cuda'] else "cpu") if args['cuda']: logger.info("Turning CUDA on") else: logger.info("Turning CUDA off") # It is the folder where the preprocessed information will be stored. cacheFolder = args['cache_folder'] # Setting the parameter to save and loading parameters importantParameters = ['compare_aggregation', 'categorical'] parametersToSave = dict([(parName, args[parName]) for parName in importantParameters]) if args['load'] is not None: mapLocation = ( lambda storage, loc: storage.cuda()) if args['cuda'] else 'cpu' modelInfo = torch.load(args['load'], map_location=mapLocation) modelState = modelInfo['model'] for paramName, paramValue in modelInfo['params'].items(): args[paramName] = paramValue else: modelState = None preprocessors = PreprocessorList() inputHandlers = [] categoricalOpt = args.get('categorical') if categoricalOpt is not None and len(categoricalOpt) != 0: categoricalEncoder, _, _ = processCategoricalParam( categoricalOpt, bugReportDatabase, inputHandlers, preprocessors, None, logger) else: categoricalEncoder = None filterInputHandlers = [] compareAggOpt = args['compare_aggregation'] databasePath = args['bug_database'] # Loading word embedding if compareAggOpt["lexicon"]: emb = np.load(compareAggOpt["word_embedding"]) lexicon = Lexicon(unknownSymbol=None) with codecs.open(compareAggOpt["lexicon"]) as f: for l in f: lexicon.put(l.strip()) lexicon.setUnknown("UUUKNNN") paddingId = lexicon.getLexiconIndex(paddingSym) embedding = Embedding(lexicon, emb, paddingIdx=paddingId) logger.info("Lexicon size: %d" % (lexicon.getLen())) logger.info("Word Embedding size: %d" % (embedding.getEmbeddingSize())) elif compareAggOpt["word_embedding"]: # todo: Allow use embeddings and other representation lexicon, embedding = Embedding.fromFile( compareAggOpt['word_embedding'], 'UUUKNNN', hasHeader=False, paddingSym=paddingSym) logger.info("Lexicon size: %d" % (lexicon.getLen())) logger.info("Word Embedding size: %d" % (embedding.getEmbeddingSize())) paddingId = lexicon.getLexiconIndex(paddingSym) else: embedding = None if compareAggOpt["norm_word_embedding"]: embedding.zscoreNormalization() # Tokenizer if compareAggOpt['tokenizer'] == 'default': logger.info("Use default tokenizer to tokenize summary information") tokenizer = MultiLineTokenizer() elif compareAggOpt['tokenizer'] == 'white_space': logger.info( "Use white space tokenizer to tokenize summary information") tokenizer = WhitespaceTokenizer() else: raise ArgumentError( "Tokenizer value %s is invalid. 
You should choose one of these: default and white_space" % compareAggOpt['tokenizer']) # Preparing input handlers, preprocessors and cache minSeqSize = max(compareAggOpt['aggregate']["window"] ) if compareAggOpt['aggregate']["model"] == "cnn" else -1 bow = compareAggOpt.get('bow', False) freq = compareAggOpt.get('frequency', False) and bow logger.info("BoW={} and TF={}".format(bow, freq)) if compareAggOpt['extractor'] is not None: # Use summary and description (concatenated) to address this problem logger.info("Using Summary and Description information.") # Loading Filters extractorFilters = loadFilters(compareAggOpt['extractor']['filters']) arguments = (databasePath, compareAggOpt['word_embedding'], str(compareAggOpt['lexicon']), ' '.join( sorted([ fil.__class__.__name__ for fil in extractorFilters ])), compareAggOpt['tokenizer'], str(bow), str(freq), SABDEncoderPreprocessor.__name__) inputHandlers.append(SABDInputHandler(paddingId, minSeqSize)) extractorCache = PreprocessingCache(cacheFolder, arguments) if bow: extractorPreprocessor = SABDBoWPreprocessor( lexicon, bugReportDatabase, extractorFilters, tokenizer, paddingId, freq, extractorCache) else: extractorPreprocessor = SABDEncoderPreprocessor( lexicon, bugReportDatabase, extractorFilters, tokenizer, paddingId, extractorCache) preprocessors.append(extractorPreprocessor) # Create model model = SABD(embedding, categoricalEncoder, compareAggOpt['extractor'], compareAggOpt['matching'], compareAggOpt['aggregate'], compareAggOpt['classifier'], freq) if args['loss'] == 'bce': logger.info("Using BCE Loss: margin={}".format(args['margin'])) lossFn = BCELoss() lossNoReduction = BCELoss(reduction='none') cmp_collate = PairBugCollate(inputHandlers, torch.float32, unsqueeze_target=True) elif args['loss'] == 'triplet': logger.info("Using Triplet Loss: margin={}".format(args['margin'])) lossFn = TripletLoss(args['margin']) lossNoReduction = TripletLoss(args['margin'], reduction='none') cmp_collate = TripletBugCollate(inputHandlers) model.to(device) if modelState: model.load_state_dict(modelState) """ Loading the training and validation. Also, it sets how the negative example will be generated. """ # load training if args.get('pairs_training'): negativePairGenOpt = args.get('neg_pair_generator', ) trainingFile = args.get('pairs_training') offlineGeneration = not (negativePairGenOpt is None or negativePairGenOpt['type'] == 'none') masterIdByBugId = bugReportDatabase.getMasterIdByBugId() randomAnchor = negativePairGenOpt['random_anchor'] if not offlineGeneration: logger.info("Not generate dynamically the negative examples.") negativePairGenerator = None else: pairGenType = negativePairGenOpt['type'] if pairGenType == 'random': logger.info("Random Negative Pair Generator") trainingDataset = BugDataset(negativePairGenOpt['training']) bugIds = trainingDataset.bugIds logger.info( "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d" % (trainingDataset.info, len(bugIds))) negativePairGenerator = RandomGenerator( preprocessors, cmp_collate, negativePairGenOpt['rate'], bugIds, masterIdByBugId, randomAnchor=randomAnchor) elif pairGenType == 'non_negative': logger.info("Non Negative Pair Generator") trainingDataset = BugDataset(negativePairGenOpt['training']) bugIds = trainingDataset.bugIds logger.info( "Using the following dataset to generate negative examples: %s. 
Number of bugs in the training: %d" % (trainingDataset.info, len(bugIds))) negativePairGenerator = NonNegativeRandomGenerator( preprocessors, cmp_collate, negativePairGenOpt['rate'], bugIds, masterIdByBugId, negativePairGenOpt['n_tries'], device, randomAnchor=randomAnchor) elif pairGenType == 'misc_non_zero': logger.info("Misc Non Zero Pair Generator") trainingDataset = BugDataset(negativePairGenOpt['training']) bugIds = trainingDataset.bugIds logger.info( "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d" % (trainingDataset.info, len(bugIds))) negativePairGenerator = MiscNonZeroRandomGen( preprocessors, cmp_collate, negativePairGenOpt['rate'], bugIds, trainingDataset.duplicateIds, masterIdByBugId, negativePairGenOpt['n_tries'], device, randomAnchor=randomAnchor) elif pairGenType == 'product_component': logger.info("Product Component Pair Generator") trainingDataset = BugDataset(negativePairGenOpt['training']) bugIds = trainingDataset.bugIds logger.info( "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d" % (trainingDataset.info, len(bugIds))) negativePairGenerator = ProductComponentRandomGen( bugReportDatabase, preprocessors, cmp_collate, negativePairGenOpt['rate'], bugIds, masterIdByBugId, negativePairGenOpt['n_tries'], device, randomAnchor=randomAnchor) elif pairGenType == 'random_k': logger.info("Random K Negative Pair Generator") trainingDataset = BugDataset(negativePairGenOpt['training']) bugIds = trainingDataset.bugIds logger.info( "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d" % (trainingDataset.info, len(bugIds))) negativePairGenerator = KRandomGenerator( preprocessors, cmp_collate, negativePairGenOpt['rate'], bugIds, masterIdByBugId, negativePairGenOpt['k'], device, randomAnchor=randomAnchor) elif pairGenType == "pre": logger.info("Pre-selected list generator") negativePairGenerator = PreSelectedGenerator( negativePairGenOpt['pre_list_file'], preprocessors, negativePairGenOpt['rate'], masterIdByBugId, negativePairGenOpt['preselected_length'], randomAnchor=randomAnchor) elif pairGenType == "positive_pre": logger.info("Positive Pre-selected list generator") negativePairGenerator = PositivePreSelectedGenerator( negativePairGenOpt['pre_list_file'], preprocessors, cmp_collate, negativePairGenOpt['rate'], masterIdByBugId, negativePairGenOpt['preselected_length'], randomAnchor=randomAnchor) elif pairGenType == "misc_non_zero_pre": logger.info("Misc: non-zero and Pre-selected list generator") negativePairGenerator1 = PreSelectedGenerator( negativePairGenOpt['pre_list_file'], preprocessors, negativePairGenOpt['rate'], masterIdByBugId, negativePairGenOpt['preselected_length'], randomAnchor=randomAnchor) trainingDataset = BugDataset(negativePairGenOpt['training']) bugIds = trainingDataset.bugIds negativePairGenerator2 = NonNegativeRandomGenerator( preprocessors, cmp_collate, negativePairGenOpt['rate'], bugIds, masterIdByBugId, negativePairGenOpt['n_tries'], device, randomAnchor=randomAnchor) negativePairGenerator = MiscOfflineGenerator( (negativePairGenerator1, negativePairGenerator2)) elif pairGenType == "misc_non_zero_positive_pre": logger.info( "Misc: non-zero and Positive Pre-selected list generator") negativePairGenerator1 = PositivePreSelectedGenerator( negativePairGenOpt['pre_list_file'], preprocessors, cmp_collate, negativePairGenOpt['rate'], masterIdByBugId, negativePairGenOpt['preselected_length'], randomAnchor=randomAnchor) 
trainingDataset = BugDataset(negativePairGenOpt['training']) bugIds = trainingDataset.bugIds negativePairGenerator2 = NonNegativeRandomGenerator( preprocessors, cmp_collate, negativePairGenOpt['rate'], bugIds, masterIdByBugId, negativePairGenOpt['n_tries'], device, randomAnchor=randomAnchor) negativePairGenerator = MiscOfflineGenerator( (negativePairGenerator1, negativePairGenerator2)) else: raise ArgumentError( "Offline generator is invalid (%s). You should choose one of these: random, hard and pre" % pairGenType) if isinstance(lossFn, BCELoss): training_reader = PairBugDatasetReader( trainingFile, preprocessors, negativePairGenerator, randomInvertPair=args['random_switch']) elif isinstance(lossFn, TripletLoss): training_reader = TripletBugDatasetReader( trainingFile, preprocessors, negativePairGenerator, randomInvertPair=args['random_switch']) trainingLoader = DataLoader(training_reader, batch_size=batchSize, collate_fn=cmp_collate.collate, shuffle=True) logger.info("Training size: %s" % (len(trainingLoader.dataset))) # load validation if args.get('pairs_validation'): if isinstance(lossFn, BCELoss): validation_reader = PairBugDatasetReader( args.get('pairs_validation'), preprocessors) elif isinstance(lossFn, TripletLoss): validation_reader = TripletBugDatasetReader( args.get('pairs_validation'), preprocessors) validationLoader = DataLoader(validation_reader, batch_size=batchSize, collate_fn=cmp_collate.collate) logger.info("Validation size: %s" % (len(validationLoader.dataset))) else: validationLoader = None """ Training and evaluate the model. """ optimizer_opt = args.get('optimizer', 'adam') if optimizer_opt == 'sgd': logger.info('SGD') optimizer = optim.SGD(model.parameters(), lr=args['lr'], weight_decay=args['l2']) elif optimizer_opt == 'adam': logger.info('Adam') optimizer = optim.Adam(model.parameters(), lr=args['lr'], weight_decay=args['l2']) # Recall rate rankingScorer = GeneralScorer( model, preprocessors, device, PairBugCollate(inputHandlers, ignore_target=True), args['ranking_batch_size'], args['ranking_n_workers']) recallEstimationTrainOpt = args.get('recall_estimation_train') if recallEstimationTrainOpt: preselectListRankingTrain = PreselectListRanking( recallEstimationTrainOpt, args['sample_size_rr_tr']) recallEstimationOpt = args.get('recall_estimation') if recallEstimationOpt: preselectListRanking = PreselectListRanking(recallEstimationOpt, args['sample_size_rr_val']) # LR scheduler lrSchedulerOpt = args.get('lr_scheduler', None) if lrSchedulerOpt is None: logger.info("Scheduler: Constant") lrSched = None elif lrSchedulerOpt["type"] == 'step': logger.info("Scheduler: StepLR (step:%s, decay:%f)" % (lrSchedulerOpt["step_size"], args["decay"])) lrSched = StepLR(optimizer, lrSchedulerOpt["step_size"], lrSchedulerOpt["decay"]) elif lrSchedulerOpt["type"] == 'exp': logger.info("Scheduler: ExponentialLR (decay:%f)" % (lrSchedulerOpt["decay"])) lrSched = ExponentialLR(optimizer, lrSchedulerOpt["decay"]) elif lrSchedulerOpt["type"] == 'linear': logger.info( "Scheduler: Divide by (1 + epoch * decay) ---- (decay:%f)" % (lrSchedulerOpt["decay"])) lrDecay = lrSchedulerOpt["decay"] lrSched = LambdaLR(optimizer, lambda epoch: 1 / (1.0 + epoch * lrDecay)) else: raise ArgumentError( "LR Scheduler is invalid (%s). 
You should choose one of these: step, exp and linear " % pairGenType) # Set training functions def trainingIteration(engine, batch): engine.kk = 0 model.train() optimizer.zero_grad() x, y = cmp_collate.to(batch, device) output = model(*x) loss = lossFn(output, y) loss.backward() optimizer.step() return loss, output, y def scoreDistanceTrans(output): if len(output) == 3: _, y_pred, y = output else: y_pred, y = output if lossFn == F.nll_loss: return torch.exp(y_pred[:, 1]), y elif isinstance(lossFn, (BCELoss)): return y_pred, y trainer = Engine(trainingIteration) trainingMetrics = {'training_loss': AverageLoss(lossFn)} if isinstance(lossFn, BCELoss): trainingMetrics['training_dist_target'] = MeanScoreDistance( output_transform=scoreDistanceTrans) trainingMetrics['training_acc'] = AccuracyWrapper( output_transform=thresholded_output_transform) trainingMetrics['training_precision'] = PrecisionWrapper( output_transform=thresholded_output_transform) trainingMetrics['training_recall'] = RecallWrapper( output_transform=thresholded_output_transform) # Add metrics to trainer for name, metric in trainingMetrics.items(): metric.attach(trainer, name) # Set validation functions def validationIteration(engine, batch): if not hasattr(engine, 'kk'): engine.kk = 0 model.eval() with torch.no_grad(): x, y = cmp_collate.to(batch, device) y_pred = model(*x) return y_pred, y validationMetrics = { 'validation_loss': LossWrapper(lossFn, output_transform=lambda x: (x[0], x[0][0]) if x[1] is None else x) } if isinstance(lossFn, BCELoss): validationMetrics['validation_dist_target'] = MeanScoreDistance( output_transform=scoreDistanceTrans) validationMetrics['validation_acc'] = AccuracyWrapper( output_transform=thresholded_output_transform) validationMetrics['validation_precision'] = PrecisionWrapper( output_transform=thresholded_output_transform) validationMetrics['validation_recall'] = RecallWrapper( output_transform=thresholded_output_transform) evaluator = Engine(validationIteration) # Add metrics to evaluator for name, metric in validationMetrics.items(): metric.attach(evaluator, name) # recommendation recommendation_fn = generateRecommendationList @trainer.on(Events.EPOCH_STARTED) def onStartEpoch(engine): epoch = engine.state.epoch logger.info("Epoch: %d" % epoch) if lrSched: lrSched.step() logger.info("LR: %s" % str(optimizer.param_groups[0]["lr"])) @trainer.on(Events.EPOCH_COMPLETED) def onEndEpoch(engine): epoch = engine.state.epoch logMetrics(_run, logger, engine.state.metrics, epoch) # Evaluate Training if validationLoader: evaluator.run(validationLoader) logMetrics(_run, logger, evaluator.state.metrics, epoch) lastEpoch = args['epochs'] - epoch == 0 if recallEstimationTrainOpt and (epoch % args['rr_train_epoch'] == 0): logRankingResult(_run, logger, preselectListRankingTrain, rankingScorer, bugReportDatabase, None, epoch, "train", recommendationListfn=recommendation_fn) rankingScorer.free() if recallEstimationOpt and (epoch % args['rr_val_epoch'] == 0): logRankingResult(_run, logger, preselectListRanking, rankingScorer, bugReportDatabase, args.get("ranking_result_file"), epoch, "validation", recommendationListfn=recommendation_fn) rankingScorer.free() if not lastEpoch: training_reader.sampleNewNegExamples(model, lossNoReduction) if args.get('save'): save_by_epoch = args['save_by_epoch'] if save_by_epoch and epoch in save_by_epoch: file_name, file_extension = os.path.splitext(args['save']) file_path = file_name + '_epoch_{}'.format( epoch) + file_extension else: file_path = args['save'] modelInfo = { 
'model': model.state_dict(), 'params': parametersToSave } logger.info("==> Saving Model: %s" % file_path) torch.save(modelInfo, file_path) if args.get('pairs_training'): trainer.run(trainingLoader, max_epochs=args['epochs']) elif args.get('pairs_validation'): # Evaluate Training evaluator.run(validationLoader) logMetrics(_run, logger, evaluator.state.metrics, 0) if recallEstimationOpt: logRankingResult(_run, logger, preselectListRanking, rankingScorer, bugReportDatabase, args.get("ranking_result_file"), 0, "validation", recommendationListfn=recommendation_fn) # Test Dataset (accuracy, recall, precision, F1) pair_test_dataset = args.get('pair_test_dataset') if pair_test_dataset is not None and len(pair_test_dataset) > 0: pairTestReader = PairBugDatasetReader(pair_test_dataset, preprocessors) testLoader = DataLoader(pairTestReader, batch_size=batchSize, collate_fn=cmp_collate.collate) if not isinstance(cmp_collate, PairBugCollate): raise NotImplementedError( 'Evaluation of pairs using tanh was not implemented yet') logger.info("Test size: %s" % (len(testLoader.dataset))) testMetrics = { 'test_accuracy': ignite.metrics.Accuracy( output_transform=thresholded_output_transform), 'test_precision': ignite.metrics.Precision( output_transform=thresholded_output_transform), 'test_recall': ignite.metrics.Recall( output_transform=thresholded_output_transform), 'test_predictions': PredictionCache(), } test_evaluator = Engine(validationIteration) # Add metrics to evaluator for name, metric in testMetrics.items(): metric.attach(test_evaluator, name) test_evaluator.run(testLoader) for metricName, metricValue in test_evaluator.state.metrics.items(): metric = testMetrics[metricName] if isinstance(metric, ignite.metrics.Accuracy): logger.info({ 'type': 'metric', 'label': metricName, 'value': metricValue, 'epoch': None, 'correct': metric._num_correct, 'total': metric._num_examples }) _run.log_scalar(metricName, metricValue) elif isinstance(metric, (ignite.metrics.Precision, ignite.metrics.Recall)): logger.info({ 'type': 'metric', 'label': metricName, 'value': metricValue, 'epoch': None, 'tp': metric._true_positives.item(), 'total_positive': metric._positives.item() }) _run.log_scalar(metricName, metricValue) elif isinstance(metric, ConfusionMatrix): acc = cmAccuracy(metricValue) prec = cmPrecision(metricValue, False) recall = cmRecall(metricValue, False) f1 = 2 * (prec * recall) / (prec + recall + 1e-15) logger.info({ 'type': 'metric', 'label': metricName, 'accuracy': np.float(acc), 'precision': prec.cpu().numpy().tolist(), 'recall': recall.cpu().numpy().tolist(), 'f1': f1.cpu().numpy().tolist(), 'confusion_matrix': metricValue.cpu().numpy().tolist(), 'epoch': None }) _run.log_scalar('test_f1', f1[1]) elif isinstance(metric, PredictionCache): logger.info({ 'type': 'metric', 'label': metricName, 'predictions': metric.predictions }) # Calculate recall rate recallRateOpt = args.get('recall_rate', {'type': 'none'}) if recallRateOpt['type'] != 'none': if recallRateOpt['type'] == 'sun2011': logger.info("Calculating recall rate: {}".format( recallRateOpt['type'])) recallRateDataset = BugDataset(recallRateOpt['dataset']) rankingClass = SunRanking(bugReportDatabase, recallRateDataset, recallRateOpt['window']) # We always group all bug reports by master in the results in the sun 2011 methodology group_by_master = True elif recallRateOpt['type'] == 'deshmukh': logger.info("Calculating recall rate: {}".format( recallRateOpt['type'])) recallRateDataset = BugDataset(recallRateOpt['dataset']) rankingClass = 
DeshmukhRanking(bugReportDatabase, recallRateDataset)
            group_by_master = recallRateOpt['group_by_master']
        else:
            raise ArgumentError(
                "recall_rate.type is invalid (%s). You should choose one of these: sun2011 and deshmukh" %
                recallRateOpt['type'])

        logRankingResult(_run, logger, rankingClass, rankingScorer, bugReportDatabase,
                         recallRateOpt["result_file"], 0, None, group_by_master,
                         recommendationListfn=recommendation_fn)
def mainWnnNer(args): # Initializing parameters. log = logging.getLogger(__name__) if args.seed: random.seed(args.seed) np.random.seed(args.seed) log.info({"type": "args", "args": args}) # GPU configuration. log.info({"floatX": str(theano.config.floatX), "device": str(theano.config.device)}) # Parameters. # lr = args.lr # startSymbol = args.start_symbol # endSymbol = args.end_symbol # numEpochs = args.num_epochs # shuffle = args.shuffle # normalization = args.normalization # wordWindowSize = args.word_window_size # hiddenLayerSize = args.hidden_size # hiddenActFunctionName = args.hidden_activation_function # embeddingSize = args.word_emb_size # batchSize = args.batch_size # structGrad = args.struct_grad # charStructGrad = args.char_struct_grad # # charEmbeddingSize = args.char_emb_size # charWindowSize = args.char_window_size # charConvSize = args.conv_size # Word filters. log.info("Loading word filters...") wordFilters = getFilters(args.word_filters, log) # Loading/creating word lexicon and word embedding. if args.word_embedding is not None: log.info("Loading word embedding...") wordLexicon, wordEmbedding = Embedding.fromWord2Vec(args.word_embedding, "UUUNKKK", "word_lexicon") elif args.word_lexicon is not None: log.info("Loading word lexicon...") wordLexicon = Lexicon.fromTextFile(args.word_lexicon, True, "word_lexicon") wordEmbedding = Embedding(wordLexicon, vectors=None, embeddingSize=args.word_emb_size) else: log.error("You need to set one of these parameters: load_model, word_embedding or word_lexicon") sys.exit(1) # Loading char lexicon. log.info("Loading char lexicon...") charLexicon = Lexicon.fromTextFile(args.char_lexicon, True, "char_lexicon") # Character embedding. charEmbedding = Embedding(charLexicon, vectors=None, embeddingSize=args.char_emb_size) # Loading label lexicon. log.info("Loading label lexicon...") labelLexicon = Lexicon.fromTextFile(args.label_file, False, lexiconName="label_lexicon") # Normalize the word embedding if args.normalization is not None: normFactor = 1 if args.norm_factor is not None: normFactor = args.norm_factor if args.normalization == "minmax": log.info("Normalizing word embedding: minmax") wordEmbedding.minMaxNormalization(norm_coef=normFactor) elif args.normalization == "mean": log.info("Normalizing word embedding: mean") wordEmbedding.meanNormalization(norm_coef=normFactor) else: log.error("Unknown normalization method: %s" % args.normalization) sys.exit(1) elif args.normFactor is not None: log.error("Parameter norm_factor cannot be present without normalization.") sys.exit(1) dictionarySize = wordEmbedding.getNumberOfVectors() log.info("Size of word lexicon is %d and word embedding size is %d" % (dictionarySize, args.word_emb_size)) # Setup the input and (golden) output generators (readers). 
inputGenerators = [ WordWindowGenerator(args.word_window_size, wordLexicon, wordFilters, args.start_symbol, args.end_symbol), CharacterWindowGenerator(lexicon=charLexicon, numMaxChar=20, charWindowSize=args.char_window_size, wrdWindowSize=args.word_window_size, artificialChar="ART_CHAR", startPadding="</s>", startPaddingWrd=args.start_symbol, endPaddingWrd=args.end_symbol, filters=getFilters([], log)) ] outputGenerator = LabelGenerator(labelLexicon) if args.cv is not None: log.info("Reading training examples...") trainIterator = SyncBatchIterator(TokenLabelPerLineReader(args.train, labelTknSep='\t'), inputGenerators, [outputGenerator], args.batch_size, shuffle=args.shuffle, numCVFolds=args.cv.numFolds) cvGenerators = trainIterator.getCVGenerators() iFold = 0 numFolds = len(cvGenerators) for train, dev in cvGenerators: log.info({"cv": {"fold": iFold, "numFolds": numFolds}}) trainNetwork(args, log, trainIterator=train, devIterator=dev, wordEmbedding=wordEmbedding, charEmbedding=charEmbedding, borrow=False, labelLexicon=labelLexicon) else: log.info("Reading training examples...") trainIterator = SyncBatchIterator(TokenLabelPerLineReader(args.train, labelTknSep='\t'), inputGenerators, [outputGenerator], args.batch_size, shuffle=args.shuffle) # Get dev inputs and (golden) outputs. devIterator = None if args.dev is not None: log.info("Reading development examples") devIterator = SyncBatchIterator(TokenLabelPerLineReader(args.dev, labelTknSep='\t'), inputGenerators, [outputGenerator], sys.maxint, shuffle=False) trainNetwork(args, log, trainIterator, devIterator, wordEmbedding, charEmbedding, borrow=True, labelLexicon=labelLexicon) # Testing. if args.test: log.info("Reading test dataset...") testIterator = SyncBatchIterator(TokenLabelPerLineReader(args.test, labelTknSep='\t'), inputGenerators, [outputGenerator], sys.maxint, shuffle=False) log.info("Testing...") wnnModel.test(testIterator) log.info("Done!")
def main(): full_path = os.path.realpath(__file__) path, filename = os.path.split(full_path) logging.config.fileConfig(os.path.join(path, 'logging.conf'), defaults={}) log = logging.getLogger(__name__) if len(sys.argv) != 3: log.error("Missing argument: <JSON config file> or/and <Input file>") exit(1) argsDict = JsonArgParser(PARAMETERS).parse(sys.argv[1]) args = dict2obj(argsDict, 'ShortDocArguments') logging.getLogger(__name__).info(argsDict) if args.seed: random.seed(args.seed) np.random.seed(args.seed) lr = args.lr startSymbol = args.start_symbol endSymbol = args.end_symbol numEpochs = args.num_epochs shuffle = args.shuffle normalizeMethod = args.normalization wordWindowSize = args.word_window_size hiddenLayerSize = args.hidden_size convSize = args.conv_size # Load classes for filters. filters = [] for filterName in args.filters: moduleName, className = filterName.rsplit('.', 1) log.info("Filtro: " + moduleName + " " + className) module_ = importlib.import_module(moduleName) filters.append(getattr(module_, className)()) W1 = None b1 = None W2 = None b2 = None wordEmbedding = None if args.word_embedding: log.info("Reading W2v File") (wordLexicon, wordEmbedding) = Embedding.fromWord2Vec(args.word_embedding, unknownSymbol="__UNKNOWN__") wordLexicon.stopAdd() elif args.word_lexicon and args.word_emb_size: wordLexicon = Lexicon.fromTextFile(args.word_lexicon, hasUnknowSymbol=False) wordEmbedding = Embedding(wordLexicon, embeddingSize=args.word_emb_size) wordLexicon.stopAdd() else: log.error( "You must provide argument word_embedding or word_lexicon and word_emb_size" ) # Create the lexicon of labels. labelLexicon = None if args.labels is not None: if args.label_lexicon is not None: log.error( "Only one of the parameters label_lexicon and labels can be provided!" ) exit(1) labelLexicon = Lexicon.fromList(args.labels, hasUnknowSymbol=False) elif args.label_lexicon is not None: labelLexicon = Lexicon.fromTextFile(args.label_lexicon, hasUnknowSymbol=False) else: log.error( "One of the parameters label_lexicon or labels must be provided!") exit(1) # # Build the network model (Theano graph). # # TODO: debug # theano.config.compute_test_value = 'warn' # ex = trainIterator.next() # inWords.tag.test_value = ex[0][0] # outLabel.tag.test_value = ex[1][0] # Matriz de entrada. Cada linha representa um token da oferta. Cada token é # representado por uma janela de tokens (token central e alguns tokens # próximos). Cada valor desta matriz corresponde a um índice que representa # um token no embedding. inWords = tensor.lmatrix("inWords") # Categoria correta de uma oferta. outLabel = tensor.lscalar("outLabel") # List of input tensors. One for each input layer. inputTensors = [inWords] # Whether the word embedding will be updated during training. embLayerTrainable = not args.fix_word_embedding if not embLayerTrainable: log.info("Not updating the word embedding!") # Lookup table for word features. embeddingLayer = EmbeddingLayer(inWords, wordEmbedding.getEmbeddingMatrix(), trainable=embLayerTrainable) # if not args.train and args.load_wordEmbedding: # attrs = np.load(args.load_wordEmbedding) # embeddingLayer.load(attrs) # log.info("Loaded word embedding (shape %s) from file %s" % ( # str(attrs[0].shape), args.load_wordEmbedding)) # A saída da lookup table possui 3 dimensões (numTokens, szWindow, szEmbedding). # Esta camada dá um flat nas duas últimas dimensões, produzindo uma saída # com a forma (numTokens, szWindow * szEmbedding). 
flattenInput = FlattenLayer(embeddingLayer) # Random weight initialization procedure. weightInit = GlorotUniform() # Convolution layer. Convolução no texto de uma oferta. convW = None convb = None if not args.train and args.load_conv: convNPY = np.load(args.load_conv) convW = convNPY[0] convb = convNPY[1] log.info("Loaded convolutional layer (shape %s) from file %s" % (str(convW.shape), args.load_conv)) convLinear = LinearLayer(flattenInput, wordWindowSize * wordEmbedding.getEmbeddingSize(), convSize, W=convW, b=convb, weightInitialization=weightInit) # Max pooling layer. maxPooling = MaxPoolingLayer(convLinear) # Hidden layer. if not args.train and args.load_hiddenLayer: hiddenNPY = np.load(args.load_hiddenLayer) W1 = hiddenNPY[0] b1 = hiddenNPY[1] log.info("Loaded hidden layer (shape %s) from file %s" % (str(W1.shape), args.load_hiddenLayer)) hiddenLinear = LinearLayer(maxPooling, convSize, hiddenLayerSize, W=W1, b=b1, weightInitialization=weightInit) hiddenAct = ActivationLayer(hiddenLinear, tanh) # Entrada linear da camada softmax. if not args.train and args.load_softmax: hiddenNPY = np.load(args.load_softmax) W2 = hiddenNPY[0] b2 = hiddenNPY[1] log.info("Loaded softmax layer (shape %s) from file %s" % (str(W2.shape), args.load_softmax)) sotmaxLinearInput = LinearLayer(hiddenAct, hiddenLayerSize, labelLexicon.getLen(), W=W2, b=b2, weightInitialization=ZeroWeightGenerator()) # Softmax. # softmaxAct = ReshapeLayer(ActivationLayer(sotmaxLinearInput, softmax), (1, -1)) softmaxAct = ActivationLayer(sotmaxLinearInput, softmax) # Prediction layer (argmax). prediction = ArgmaxPrediction(None).predict(softmaxAct.getOutput()) # Loss function. if args.label_weights is not None and len( args.label_weights) != labelLexicon.getLen(): log.error( "Number of label weights (%d) is different from number of labels (%d)!" % (len(args.label_weights), labelLexicon.getLen())) nlloe = NegativeLogLikelihoodOneExample(weights=args.label_weights) loss = nlloe.calculateError(softmaxAct.getOutput()[0], prediction, outLabel) # Input generators: word window. inputGenerators = [ WordWindowGenerator(wordWindowSize, wordLexicon, filters, startSymbol, endSymbol) ] # Output generator: generate one label per offer. outputGenerators = [TextLabelGenerator(labelLexicon)] # outputGenerators = [lambda label: labelLexicon.put(label)] evalPerIteration = None if normalizeMethod == "minmax": log.info("Normalization: minmax") wordEmbedding.minMaxNormalization() elif normalizeMethod == "mean": log.info("Normalization: mean normalization") wordEmbedding.meanNormalization() elif normalizeMethod == "zscore": log.info("Normalization: zscore normalization") wordEmbedding.zscoreNormalization() elif normalizeMethod: log.error("Normalization: unknown value %s" % normalizeMethod) sys.exit(1) # Decaimento da taxa de aprendizado. decay = None if args.decay == "none": decay = 0.0 elif args.decay == "linear": decay = 1.0 else: log.error("Unknown decay parameter %s." % args.decay) exit(1) # Algoritmo de aprendizado. if args.alg == "adagrad": log.info("Using Adagrad") opt = Adagrad(lr=lr, decay=decay) elif args.alg == "sgd": log.info("Using SGD") opt = SGD(lr=lr, decay=decay) else: log.error("Unknown algorithm: %s." % args.alg) sys.exit(1) # TODO: debug # opt.lr.tag.test_value = 0.05 # Printing embedding information. 
    dictionarySize = wordEmbedding.getNumberOfVectors()
    embeddingSize = wordEmbedding.getEmbeddingSize()

    log.info("Dictionary size: %d" % dictionarySize)
    log.info("Embedding size: %d" % embeddingSize)
    log.info("Number of categories: %d" % labelLexicon.getLen())

    # TODO: debug
    # mode = theano.compile.debugmode.DebugMode(optimizer=None)
    mode = None
    model = BasicModel(x=inputTensors, y=[outLabel], allLayers=softmaxAct.getLayerSet(), optimizer=opt,
                       prediction=prediction, loss=loss, mode=mode)

    wordWindow = WordWindowGenerator(wordWindowSize, wordLexicon, filters, startSymbol, endSymbol)

    # Gets the hidden layer:
    # graph = EmbeddingGraph([inWords], [hiddenAct.getOutput()], wordWindow)

    # Graph for the prediction layer.
    graph = EmbeddingGraph(inputTensors, prediction, wordWindow, mode)

    lblTxt = ["Sim", "Nao"]

    tweets = []
    with open(sys.argv[2]) as inputFile:
        content = inputFile.readlines()

    for line in content:
        tweets.append(line.decode('utf-8').encode('utf-8'))
    # print tweets

    # graph.getResultsFor(t) returns the prediction for a given tweet t.
    try:
        output_file = open("Output.txt", "w")
    except IOError:
        print "Failed to create the output file\n"
        # Without a valid output file there is nothing to write.
        sys.exit(1)

    try:
        for t in tweets:
            output_file.write(t.replace('\n', '').replace('\t', '') + "\t " +
                              lblTxt[graph.getResultsFor(t)] + "\n")
        print "Results generated successfully!\n"
    except Exception:
        print "Error while generating results\n"
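# Hypothetical invocation (file names are placeholders): the script expects a JSON
# config and an input file with one tweet per line, and writes the predictions to
# "Output.txt" in the working directory:
#
#   python classify_tweets.py config.json tweets.txt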
parser.add_argument('output', help="file where the vectors will be saved")
parser.add_argument('lexicon', help="file where the lexicon will be saved")
parser.add_argument('-db', help="File path that contains all reports. "
                                "If this option is set, tokens that do not appear in any report are removed "
                                "from the lexicon.")
parser.add_argument('-filters', default=[], nargs='+', help="text filters")
parser.add_argument('-tk', help="tokenizer")

logging.basicConfig(level=logging.DEBUG, datefmt='%Y-%m-%d %H:%M:%S')
logger = logging.getLogger()

args = parser.parse_args()
logger.info(args)

lexicon, embedding = Embedding.fromFile(args.word_embedding, 'UUUKNNN', hasHeader=False, paddingSym="</s>")
database_lexicon = set(['UUUKNNN', '</s>'])

if args.db is not None:
    # Filter out any word that does not appear in the dataset.
    db = BugReportDatabase.fromJson(args.db)
    filters = loadFilters(args.filters)

    if args.tk == 'default':
        logger.info("Use default tokenizer to tokenize summary information")
        tokenizer = MultiLineTokenizer()
    elif args.tk == 'white_space':
        logger.info("Use white space tokenizer to tokenize summary information")
def main(_run, _config, _seed, _log): # Setting logger args = _config logger = _log logger.info(args) logger.info('It started at: %s' % datetime.now()) torch.manual_seed(_seed) device = torch.device('cuda' if args['cuda'] else "cpu") if args['cuda']: logger.info("Turning CUDA on") else: logger.info("Turning CUDA off") # Setting the parameter to save and loading parameters important_parameters = ['dbr_cnn'] parameters_to_save = dict([(name, args[name]) for name in important_parameters]) if args['load'] is not None: map_location = ( lambda storage, loc: storage.cuda()) if args['cuda'] else 'cpu' model_info = torch.load(args['load'], map_location=map_location) model_state = model_info['model'] for param_name, param_value in model_info['params'].items(): args[param_name] = param_value else: model_state = None # Set basic variables preprocessors = PreprocessorList() input_handlers = [] report_database = BugReportDatabase.fromJson(args['bug_database']) batchSize = args['batch_size'] dbr_cnn_opt = args['dbr_cnn'] # Loading word embedding and lexicon emb = np.load(dbr_cnn_opt["word_embedding"]) padding_sym = "</s>" lexicon = Lexicon(unknownSymbol=None) with codecs.open(dbr_cnn_opt["lexicon"]) as f: for l in f: lexicon.put(l.strip()) lexicon.setUnknown("UUUKNNN") padding_id = lexicon.getLexiconIndex(padding_sym) embedding = Embedding(lexicon, emb, paddingIdx=padding_id) logger.info("Lexicon size: %d" % (lexicon.getLen())) logger.info("Word Embedding size: %d" % (embedding.getEmbeddingSize())) # Load filters and tokenizer filters = loadFilters(dbr_cnn_opt['filters']) if dbr_cnn_opt['tokenizer'] == 'default': logger.info("Use default tokenizer to tokenize summary information") tokenizer = MultiLineTokenizer() elif dbr_cnn_opt['tokenizer'] == 'white_space': logger.info( "Use white space tokenizer to tokenize summary information") tokenizer = WhitespaceTokenizer() else: raise ArgumentError( "Tokenizer value %s is invalid. You should choose one of these: default and white_space" % dbr_cnn_opt['tokenizer']) # Add preprocessors preprocessors.append( DBR_CNN_CategoricalPreprocessor(dbr_cnn_opt['categorical_lexicon'], report_database)) preprocessors.append( SummaryDescriptionPreprocessor(lexicon, report_database, filters, tokenizer, padding_id)) # Add input_handlers input_handlers.append(DBRDCNN_CategoricalInputHandler()) input_handlers.append( TextCNNInputHandler(padding_id, min(dbr_cnn_opt["window"]))) # Create Model model = DBR_CNN(embedding, dbr_cnn_opt["window"], dbr_cnn_opt["nfilters"], dbr_cnn_opt['update_embedding']) model.to(device) if model_state: model.load_state_dict(model_state) # Set loss function logger.info("Using BCE Loss") loss_fn = BCELoss() loss_no_reduction = BCELoss(reduction='none') cmp_collate = PairBugCollate(input_handlers, torch.float32, unsqueeze_target=True) # Loading the training and setting how the negative example will be generated. 
if args.get('pairs_training'): negative_pair_gen_opt = args.get('neg_pair_generator', ) pairsTrainingFile = args.get('pairs_training') random_anchor = negative_pair_gen_opt['random_anchor'] offlineGeneration = not (negative_pair_gen_opt is None or negative_pair_gen_opt['type'] == 'none') if not offlineGeneration: logger.info("Not generate dynamically the negative examples.") pair_training_reader = PairBugDatasetReader( pairsTrainingFile, preprocessors, randomInvertPair=args['random_switch']) else: pair_gen_type = negative_pair_gen_opt['type'] master_id_by_bug_id = report_database.getMasterIdByBugId() if pair_gen_type == 'random': logger.info("Random Negative Pair Generator") training_dataset = BugDataset( negative_pair_gen_opt['training']) bug_ids = training_dataset.bugIds logger.info( "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d" % (training_dataset.info, len(bug_ids))) negative_pair_generator = RandomGenerator( preprocessors, cmp_collate, negative_pair_gen_opt['rate'], bug_ids, master_id_by_bug_id) elif pair_gen_type == 'non_negative': logger.info("Non Negative Pair Generator") training_dataset = BugDataset( negative_pair_gen_opt['training']) bug_ids = training_dataset.bugIds logger.info( "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d" % (training_dataset.info, len(bug_ids))) negative_pair_generator = NonNegativeRandomGenerator( preprocessors, cmp_collate, negative_pair_gen_opt['rate'], bug_ids, master_id_by_bug_id, negative_pair_gen_opt['n_tries'], device, randomAnchor=random_anchor) elif pair_gen_type == 'misc_non_zero': logger.info("Misc Non Zero Pair Generator") training_dataset = BugDataset( negative_pair_gen_opt['training']) bug_ids = training_dataset.bugIds logger.info( "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d" % (training_dataset.info, len(bug_ids))) negative_pair_generator = MiscNonZeroRandomGen( preprocessors, cmp_collate, negative_pair_gen_opt['rate'], bug_ids, training_dataset.duplicateIds, master_id_by_bug_id, device, negative_pair_gen_opt['n_tries'], negative_pair_gen_opt['random_anchor']) elif pair_gen_type == 'random_k': logger.info("Random K Negative Pair Generator") training_dataset = BugDataset( negative_pair_gen_opt['training']) bug_ids = training_dataset.bugIds logger.info( "Using the following dataset to generate negative examples: %s. 
Number of bugs in the training: %d" % (training_dataset.info, len(bug_ids))) negative_pair_generator = KRandomGenerator( preprocessors, cmp_collate, negative_pair_gen_opt['rate'], bug_ids, master_id_by_bug_id, negative_pair_gen_opt['k'], device) elif pair_gen_type == "pre": logger.info("Pre-selected list generator") negative_pair_generator = PreSelectedGenerator( negative_pair_gen_opt['pre_list_file'], preprocessors, negative_pair_gen_opt['rate'], master_id_by_bug_id, negative_pair_gen_opt['preselected_length']) elif pair_gen_type == "misc_non_zero_pre": logger.info("Pre-selected list generator") negativePairGenerator1 = PreSelectedGenerator( negative_pair_gen_opt['pre_list_file'], preprocessors, negative_pair_gen_opt['rate'], master_id_by_bug_id, negative_pair_gen_opt['preselected_length']) training_dataset = BugDataset( negative_pair_gen_opt['training']) bug_ids = training_dataset.bugIds negativePairGenerator2 = NonNegativeRandomGenerator( preprocessors, cmp_collate, negative_pair_gen_opt['rate'], bug_ids, master_id_by_bug_id, device, negative_pair_gen_opt['n_tries']) negative_pair_generator = MiscOfflineGenerator( (negativePairGenerator1, negativePairGenerator2)) else: raise ArgumentError( "Offline generator is invalid (%s). You should choose one of these: random, hard and pre" % pair_gen_type) pair_training_reader = PairBugDatasetReader( pairsTrainingFile, preprocessors, negative_pair_generator, randomInvertPair=args['random_switch']) training_loader = DataLoader(pair_training_reader, batch_size=batchSize, collate_fn=cmp_collate.collate, shuffle=True) logger.info("Training size: %s" % (len(training_loader.dataset))) # load validation if args.get('pairs_validation'): pair_validation_reader = PairBugDatasetReader( args.get('pairs_validation'), preprocessors) validation_loader = DataLoader(pair_validation_reader, batch_size=batchSize, collate_fn=cmp_collate.collate) logger.info("Validation size: %s" % (len(validation_loader.dataset))) else: validation_loader = None """ Training and evaluate the model. 
""" optimizer_opt = args.get('optimizer', 'adam') if optimizer_opt == 'sgd': logger.info('SGD') optimizer = optim.SGD(model.parameters(), lr=args['lr'], weight_decay=args['l2'], momentum=args['momentum']) elif optimizer_opt == 'adam': logger.info('Adam') optimizer = optim.Adam(model.parameters(), lr=args['lr'], weight_decay=args['l2']) # Recall rate ranking_scorer = DBR_CNN_Scorer(preprocessors[0], preprocessors[1], input_handlers[0], input_handlers[1], model, device, args['ranking_batch_size']) recallEstimationTrainOpt = args.get('recall_estimation_train') if recallEstimationTrainOpt: preselectListRankingTrain = PreselectListRanking( recallEstimationTrainOpt) recallEstimationOpt = args.get('recall_estimation') if recallEstimationOpt: preselect_list_ranking = PreselectListRanking(recallEstimationOpt) lr_scheduler_opt = args.get('lr_scheduler', None) if lr_scheduler_opt is None or lr_scheduler_opt['type'] == 'constant': logger.info("Scheduler: Constant") lr_sched = None elif lr_scheduler_opt["type"] == 'step': logger.info("Scheduler: StepLR (step:%s, decay:%f)" % (lr_scheduler_opt["step_size"], args["decay"])) lr_sched = StepLR(optimizer, lr_scheduler_opt["step_size"], lr_scheduler_opt["decay"]) elif lr_scheduler_opt["type"] == 'exp': logger.info("Scheduler: ExponentialLR (decay:%f)" % (lr_scheduler_opt["decay"])) lr_sched = ExponentialLR(optimizer, lr_scheduler_opt["decay"]) elif lr_scheduler_opt["type"] == 'linear': logger.info( "Scheduler: Divide by (1 + epoch * decay) ---- (decay:%f)" % (lr_scheduler_opt["decay"])) lrDecay = lr_scheduler_opt["decay"] lr_sched = LambdaLR(optimizer, lambda epoch: 1 / (1.0 + epoch * lrDecay)) else: raise ArgumentError( "LR Scheduler is invalid (%s). You should choose one of these: step, exp and linear " % pair_gen_type) # Set training functions def trainingIteration(engine, batch): model.train() optimizer.zero_grad() x, y = cmp_collate.to(batch, device) output = model(*x) loss = loss_fn(output, y) loss.backward() optimizer.step() return loss, output, y trainer = Engine(trainingIteration) negTarget = 0.0 if isinstance(loss_fn, NLLLoss) else -1.0 trainingMetrics = { 'training_loss': AverageLoss(loss_fn), 'training_acc': AccuracyWrapper(output_transform=thresholded_output_transform), 'training_precision': PrecisionWrapper(output_transform=thresholded_output_transform), 'training_recall': RecallWrapper(output_transform=thresholded_output_transform), } # Add metrics to trainer for name, metric in trainingMetrics.items(): metric.attach(trainer, name) # Set validation functions def validationIteration(engine, batch): model.eval() with torch.no_grad(): x, y = cmp_collate.to(batch, device) y_pred = model(*x) return y_pred, y validationMetrics = { 'validation_loss': LossWrapper(loss_fn), 'validation_acc': AccuracyWrapper(output_transform=thresholded_output_transform), 'validation_precision': PrecisionWrapper(output_transform=thresholded_output_transform), 'validation_recall': RecallWrapper(output_transform=thresholded_output_transform), } evaluator = Engine(validationIteration) # Add metrics to evaluator for name, metric in validationMetrics.items(): metric.attach(evaluator, name) @trainer.on(Events.EPOCH_STARTED) def onStartEpoch(engine): epoch = engine.state.epoch logger.info("Epoch: %d" % epoch) if lr_sched: lr_sched.step() logger.info("LR: %s" % str(optimizer.param_groups[0]["lr"])) @trainer.on(Events.EPOCH_COMPLETED) def onEndEpoch(engine): epoch = engine.state.epoch logMetrics(_run, logger, engine.state.metrics, epoch) # Evaluate Training if validation_loader: 
evaluator.run(validation_loader) logMetrics(_run, logger, evaluator.state.metrics, epoch) lastEpoch = args['epochs'] - epoch == 0 if recallEstimationTrainOpt and (epoch % args['rr_train_epoch'] == 0): logRankingResult(_run, logger, preselectListRankingTrain, ranking_scorer, report_database, None, epoch, "train") ranking_scorer.free() if recallEstimationOpt and (epoch % args['rr_val_epoch'] == 0): logRankingResult(_run, logger, preselect_list_ranking, ranking_scorer, report_database, args.get("ranking_result_file"), epoch, "validation") ranking_scorer.free() if not lastEpoch: pair_training_reader.sampleNewNegExamples(model, loss_no_reduction) if args.get('save'): save_by_epoch = args['save_by_epoch'] if save_by_epoch and epoch in save_by_epoch: file_name, file_extension = os.path.splitext(args['save']) file_path = file_name + '_epoch_{}'.format( epoch) + file_extension else: file_path = args['save'] modelInfo = { 'model': model.state_dict(), 'params': parameters_to_save } logger.info("==> Saving Model: %s" % file_path) torch.save(modelInfo, file_path) if args.get('pairs_training'): trainer.run(training_loader, max_epochs=args['epochs']) elif args.get('pairs_validation'): # Evaluate Training evaluator.run(validation_loader) logMetrics(logger, evaluator.state.metrics) if recallEstimationOpt: logRankingResult(_run, logger, preselect_list_ranking, ranking_scorer, report_database, args.get("ranking_result_file"), 0, "validation") # Test Dataset (accuracy, recall, precision, F1) pair_test_dataset = args.get('pair_test_dataset') if pair_test_dataset is not None and len(pair_test_dataset) > 0: pairTestReader = PairBugDatasetReader(pair_test_dataset, preprocessors) testLoader = DataLoader(pairTestReader, batch_size=batchSize, collate_fn=cmp_collate.collate) if not isinstance(cmp_collate, PairBugCollate): raise NotImplementedError( 'Evaluation of pairs using tanh was not implemented yet') logger.info("Test size: %s" % (len(testLoader.dataset))) testMetrics = { 'test_accuracy': ignite.metrics.Accuracy( output_transform=thresholded_output_transform), 'test_precision': ignite.metrics.Precision( output_transform=thresholded_output_transform), 'test_recall': ignite.metrics.Recall( output_transform=thresholded_output_transform), 'test_predictions': PredictionCache(), } test_evaluator = Engine(validationIteration) # Add metrics to evaluator for name, metric in testMetrics.items(): metric.attach(test_evaluator, name) test_evaluator.run(testLoader) for metricName, metricValue in test_evaluator.state.metrics.items(): metric = testMetrics[metricName] if isinstance(metric, ignite.metrics.Accuracy): logger.info({ 'type': 'metric', 'label': metricName, 'value': metricValue, 'epoch': None, 'correct': metric._num_correct, 'total': metric._num_examples }) _run.log_scalar(metricName, metricValue) elif isinstance(metric, (ignite.metrics.Precision, ignite.metrics.Recall)): logger.info({ 'type': 'metric', 'label': metricName, 'value': metricValue, 'epoch': None, 'tp': metric._true_positives.item(), 'total_positive': metric._positives.item() }) _run.log_scalar(metricName, metricValue) elif isinstance(metric, ConfusionMatrix): acc = cmAccuracy(metricValue) prec = cmPrecision(metricValue, False) recall = cmRecall(metricValue, False) f1 = 2 * (prec * recall) / (prec + recall + 1e-15) logger.info({ 'type': 'metric', 'label': metricName, 'accuracy': np.float(acc), 'precision': prec.cpu().numpy().tolist(), 'recall': recall.cpu().numpy().tolist(), 'f1': f1.cpu().numpy().tolist(), 'confusion_matrix': 
metricValue.cpu().numpy().tolist(), 'epoch': None }) _run.log_scalar('test_f1', f1[1]) elif isinstance(metric, PredictionCache): logger.info({ 'type': 'metric', 'label': metricName, 'predictions': metric.predictions }) # Calculate recall rate recall_rate_opt = args.get('recall_rate', {'type': 'none'}) if recall_rate_opt['type'] != 'none': if recall_rate_opt['type'] == 'sun2011': logger.info("Calculating recall rate: {}".format( recall_rate_opt['type'])) recall_rate_dataset = BugDataset(recall_rate_opt['dataset']) ranking_class = SunRanking(report_database, recall_rate_dataset, recall_rate_opt['window']) # We always group all bug reports by master in the results in the Sun 2011 methodology group_by_master = True elif recall_rate_opt['type'] == 'deshmukh': logger.info("Calculating recall rate: {}".format( recall_rate_opt['type'])) recall_rate_dataset = BugDataset(recall_rate_opt['dataset']) ranking_class = DeshmukhRanking(report_database, recall_rate_dataset) group_by_master = recall_rate_opt['group_by_master'] else: raise ArgumentError( "recall_rate.type is invalid (%s). You should choose one of these: sun2011 and deshmukh" % recall_rate_opt['type']) logRankingResult( _run, logger, ranking_class, ranking_scorer, report_database, recall_rate_opt["result_file"], 0, None, group_by_master, )
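# Illustrative sketch (assumption): the ignite metrics above are created with
# output_transform=thresholded_output_transform, which is defined elsewhere in
# the project and not shown in this section. The helper below, under a
# hypothetical name, shows what such a transform typically does; it is not the
# project's confirmed implementation.
def _thresholded_output_transform_sketch(output):
    # ignite forwards whatever the iteration function returned:
    # (loss, output, y) from trainingIteration or (y_pred, y) from
    # validationIteration. Keep only the last two elements.
    y_pred, y = output[-2], output[-1]
    # For log-probabilities over two classes (the NLLLoss case), argmax gives
    # the hard prediction; a single raw score could instead be thresholded.
    y_pred = y_pred.argmax(dim=1) if y_pred.dim() > 1 else (y_pred > 0.5).long()
    return y_pred, y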
def main(_run, _config, _seed, _log): """ :param _run: :param _config: :param _seed: :param _log: :return: """ """ Setting and loading parameters """ # Setting logger args = _config logger = _log logger.info(args) logger.info('It started at: %s' % datetime.now()) torch.manual_seed(_seed) bugReportDatabase = BugReportDatabase.fromJson(args['bug_database']) paddingSym = "</s>" batchSize = args['batch_size'] device = torch.device('cuda' if args['cuda'] else "cpu") if args['cuda']: logger.info("Turning CUDA on") else: logger.info("Turning CUDA off") # It is the folder where the preprocessed information will be stored. cacheFolder = args['cache_folder'] # Setting the parameter to save and loading parameters importantParameters = ['compare_aggregation', 'categorical'] parametersToSave = dict([(parName, args[parName]) for parName in importantParameters]) if args['load'] is not None: mapLocation = ( lambda storage, loc: storage.cuda()) if cudaOn else 'cpu' modelInfo = torch.load(args['load'], map_location=mapLocation) modelState = modelInfo['model'] for paramName, paramValue in modelInfo['params'].items(): args[paramName] = paramValue else: modelState = None if args['rep'] is not None and args['rep']['model']: logger.info("Loading REP") rep = read_weights(args['rep']['model']) rep_input, max_tkn_id = read_dbrd_file(args['rep']['input'], math.inf) rep_recommendation = args['rep']['k'] rep.fit_transform(rep_input, max_tkn_id, True) rep_input_by_id = {} for inp in rep_input: rep_input_by_id[inp[SUN_REPORT_ID_INDEX]] = inp else: rep = None preprocessors = PreprocessorList() inputHandlers = [] categoricalOpt = args.get('categorical') if categoricalOpt is not None and len(categoricalOpt) != 0: categoricalEncoder, _, _ = processCategoricalParam( categoricalOpt, bugReportDatabase, inputHandlers, preprocessors, None, logger, cudaOn) else: categoricalEncoder = None filterInputHandlers = [] compareAggOpt = args['compare_aggregation'] databasePath = args['bug_database'] # Loading word embedding if compareAggOpt["word_embedding"]: # todo: Allow use embeddings and other representation lexicon, embedding = Embedding.fromFile( compareAggOpt['word_embedding'], 'UUUKNNN', hasHeader=False, paddingSym=paddingSym) logger.info("Lexicon size: %d" % (lexicon.getLen())) logger.info("Word Embedding size: %d" % (embedding.getEmbeddingSize())) paddingId = lexicon.getLexiconIndex(paddingSym) lazy = False else: embedding = None # Tokenizer if compareAggOpt['tokenizer'] == 'default': logger.info("Use default tokenizer to tokenize summary information") tokenizer = MultiLineTokenizer() elif compareAggOpt['tokenizer'] == 'white_space': logger.info( "Use white space tokenizer to tokenize summary information") tokenizer = WhitespaceTokenizer() else: raise ArgumentError( "Tokenizer value %s is invalid. 
You should choose one of these: default and white_space" % compareAggOpt['tokenizer']) # Preparing input handlers, preprocessors and cache minSeqSize = max(compareAggOpt['aggregate']["window"] ) if compareAggOpt['aggregate']["model"] == "cnn" else -1 if compareAggOpt['summary'] is not None: # Use summary and description (concatenated) to address this problem logger.info("Using Summary information.") # Loading Filters sumFilters = loadFilters(compareAggOpt['summary']['filters']) if compareAggOpt['summary']['model_type'] in ('lstm', 'gru', 'word_emd', 'residual'): arguments = (databasePath, compareAggOpt['word_embedding'], ' '.join( sorted([ fil.__class__.__name__ for fil in sumFilters ])), compareAggOpt['tokenizer'], SummaryPreprocessor.__name__) inputHandlers.append( RNNInputHandler(paddingId, minInputSize=minSeqSize)) summaryCache = PreprocessingCache(cacheFolder, arguments) summaryPreprocessor = SummaryPreprocessor(lexicon, bugReportDatabase, sumFilters, tokenizer, paddingId, summaryCache) elif compareAggOpt['summary']['model_type'] == 'ELMo': raise NotImplementedError("ELMO is not implemented!") # inputHandlers.append(ELMoInputHandler(cudaOn, minInputSize=minSeqSize)) # summaryPreprocessor = ELMoPreprocessor(0, elmoEmbedding) # compareAggOpt['summary']["input_size"] = elmoEmbedding.get_size() elif compareAggOpt['summary']['model_type'] == 'BERT': arguments = (databasePath, "CADD SUMMARY", "BERT", "bert-base-uncased") inputHandlers.append(BERTInputHandler(0, minInputSize=minSeqSize)) summaryCache = PreprocessingCache(cacheFolder, arguments) summaryPreprocessor = TransformerPreprocessor( "short_desc", "bert-base-uncased", BertTokenizer, 0, bugReportDatabase, summaryCache) # compareAggOpt['summary']["input_size"] = 768 preprocessors.append(summaryPreprocessor) if compareAggOpt['desc'] is not None: # Use summary and description (concatenated) to address this problem logger.info("Using Description information.") descFilters = loadFilters(compareAggOpt['desc']['filters']) if compareAggOpt['desc']['model_type'] in ('lstm', 'gru', 'word_emd', 'residual'): arguments = (databasePath, compareAggOpt['word_embedding'], ' '.join( sorted([ fil.__class__.__name__ for fil in descFilters ])), compareAggOpt['tokenizer'], "CADD DESC", str(compareAggOpt['desc']['summarization'])) inputHandlers.append( RNNInputHandler(paddingId, minInputSize=minSeqSize)) descriptionCache = PreprocessingCache(cacheFolder, arguments) descPreprocessor = DescriptionPreprocessor(lexicon, bugReportDatabase, descFilters, tokenizer, paddingId, cache=descriptionCache) elif compareAggOpt['desc']['model_type'] == 'ELMo': raise NotImplementedError("ELMO is not implemented!") # inputHandlers.append(ELMoInputHandler(cudaOn, minInputSize=minSeqSize)) # descPreprocessor = ELMoPreprocessor(1, elmoEmbedding) # compareAggOpt['desc']["input_size"] = elmoEmbedding.get_size() elif compareAggOpt['desc']['model_type'] == 'BERT': arguments = (databasePath, "CADD DESC", "BERT", "bert-base-uncased") inputHandlers.append(BERTInputHandler(0, minInputSize=minSeqSize)) descriptionCache = PreprocessingCache(cacheFolder, arguments) descPreprocessor = TransformerPreprocessor("description", "bert-base-uncased", BertTokenizer, 0, bugReportDatabase, descriptionCache) # compareAggOpt['desc']["input_size"] = 768 preprocessors.append(descPreprocessor) # Create model model = CADD(embedding, categoricalEncoder, compareAggOpt, compareAggOpt['summary'], compareAggOpt['desc'], compareAggOpt['matching'], compareAggOpt['aggregate'], cudaOn=cudaOn) lossFn = F.nll_loss 
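    # Note (assumption): F.nll_loss expects log-probabilities, so the CADD
    # model is expected to end in a log-softmax over the two classes, e.g.
    #     scores = classifier(features)        # shape (batch, 2)
    #     return F.log_softmax(scores, dim=1)
    # This is only a sketch of the expected output contract, not the model's
    # actual forward code; it also explains the torch.exp(...) used by
    # scoreDistanceTrans below.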
lossNoReduction = NLLLoss(reduction='none') if cudaOn: model.cuda() if modelState: model.load_state_dict(modelState) """ Loading the training and validation. Also, it sets how the negative example will be generated. """ cmpAggCollate = PairBugCollate(inputHandlers, torch.int64) # load training if args.get('pairs_training'): negativePairGenOpt = args.get('neg_pair_generator', ) pairTrainingFile = args.get('pairs_training') offlineGeneration = not (negativePairGenOpt is None or negativePairGenOpt['type'] == 'none') masterIdByBugId = bugReportDatabase.getMasterIdByBugId() randomAnchor = negativePairGenOpt['random_anchor'] if rep: logger.info("Generate negative examples using REP.") randomAnchor = negativePairGenOpt['random_anchor'] trainingDataset = BugDataset(args['rep']['training']) bugIds = trainingDataset.bugIds negativePairGenerator = REPGenerator(rep, rep_input_by_id, args['rep']['neg_training'], preprocessors, bugIds, masterIdByBugId, args['rep']['rate'], randomAnchor) elif not offlineGeneration: logger.info("Not generate dynamically the negative examples.") negativePairGenerator = None else: pairGenType = negativePairGenOpt['type'] if pairGenType == 'random': logger.info("Random Negative Pair Generator") trainingDataset = BugDataset(negativePairGenOpt['training']) bugIds = trainingDataset.bugIds logger.info( "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d" % (trainingDataset.info, len(bugIds))) negativePairGenerator = RandomGenerator( preprocessors, cmpAggCollate, negativePairGenOpt['rate'], bugIds, masterIdByBugId, randomAnchor=randomAnchor) elif pairGenType == 'non_negative': logger.info("Non Negative Pair Generator") trainingDataset = BugDataset(negativePairGenOpt['training']) bugIds = trainingDataset.bugIds logger.info( "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d" % (trainingDataset.info, len(bugIds))) negativePairGenerator = NonNegativeRandomGenerator( preprocessors, cmpAggCollate, negativePairGenOpt['rate'], bugIds, masterIdByBugId, negativePairGenOpt['n_tries'], device, randomAnchor=randomAnchor) elif pairGenType == 'misc_non_zero': logger.info("Misc Non Zero Pair Generator") trainingDataset = BugDataset(negativePairGenOpt['training']) bugIds = trainingDataset.bugIds logger.info( "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d" % (trainingDataset.info, len(bugIds))) negativePairGenerator = MiscNonZeroRandomGen( preprocessors, cmpAggCollate, negativePairGenOpt['rate'], bugIds, trainingDataset.duplicateIds, masterIdByBugId, negativePairGenOpt['n_tries'], device, randomAnchor=randomAnchor) elif pairGenType == 'random_k': logger.info("Random K Negative Pair Generator") trainingDataset = BugDataset(negativePairGenOpt['training']) bugIds = trainingDataset.bugIds logger.info( "Using the following dataset to generate negative examples: %s. 
Number of bugs in the training: %d" % (trainingDataset.info, len(bugIds))) negativePairGenerator = KRandomGenerator( preprocessors, cmpAggCollate, negativePairGenOpt['rate'], bugIds, masterIdByBugId, negativePairGenOpt['k'], device, randomAnchor=randomAnchor) elif pairGenType == "pre": logger.info("Pre-selected list generator") negativePairGenerator = PreSelectedGenerator( negativePairGenOpt['pre_list_file'], preprocessors, negativePairGenOpt['rate'], masterIdByBugId, negativePairGenOpt['preselected_length'], randomAnchor=randomAnchor) elif pairGenType == "positive_pre": logger.info("Positive Pre-selected list generator") negativePairGenerator = PositivePreSelectedGenerator( negativePairGenOpt['pre_list_file'], preprocessors, cmpAggCollate, negativePairGenOpt['rate'], masterIdByBugId, negativePairGenOpt['preselected_length'], randomAnchor=randomAnchor) elif pairGenType == "misc_non_zero_pre": logger.info("Misc: non-zero and Pre-selected list generator") negativePairGenerator1 = PreSelectedGenerator( negativePairGenOpt['pre_list_file'], preprocessors, negativePairGenOpt['rate'], masterIdByBugId, negativePairGenOpt['preselected_length'], randomAnchor=randomAnchor) trainingDataset = BugDataset(negativePairGenOpt['training']) bugIds = trainingDataset.bugIds negativePairGenerator2 = NonNegativeRandomGenerator( preprocessors, cmpAggCollate, negativePairGenOpt['rate'], bugIds, masterIdByBugId, negativePairGenOpt['n_tries'], device, randomAnchor=randomAnchor) negativePairGenerator = MiscOfflineGenerator( (negativePairGenerator1, negativePairGenerator2)) elif pairGenType == "misc_non_zero_positive_pre": logger.info( "Misc: non-zero and Positive Pre-selected list generator") negativePairGenerator1 = PositivePreSelectedGenerator( negativePairGenOpt['pre_list_file'], preprocessors, cmpAggCollate, negativePairGenOpt['rate'], masterIdByBugId, negativePairGenOpt['preselected_length'], randomAnchor=randomAnchor) trainingDataset = BugDataset(negativePairGenOpt['training']) bugIds = trainingDataset.bugIds negativePairGenerator2 = NonNegativeRandomGenerator( preprocessors, cmpAggCollate, negativePairGenOpt['rate'], bugIds, masterIdByBugId, negativePairGenOpt['n_tries'], device, randomAnchor=randomAnchor) negativePairGenerator = MiscOfflineGenerator( (negativePairGenerator1, negativePairGenerator2)) else: raise ArgumentError( "Offline generator is invalid (%s). You should choose one of these: random, hard and pre" % pairGenType) pairTrainingReader = PairBugDatasetReader( pairTrainingFile, preprocessors, negativePairGenerator, randomInvertPair=args['random_switch']) trainingCollate = cmpAggCollate trainingLoader = DataLoader(pairTrainingReader, batch_size=batchSize, collate_fn=trainingCollate.collate, shuffle=True) logger.info("Training size: %s" % (len(trainingLoader.dataset))) # load validation if args.get('pairs_validation'): pairValidationReader = PairBugDatasetReader( args.get('pairs_validation'), preprocessors) validationLoader = DataLoader(pairValidationReader, batch_size=batchSize, collate_fn=cmpAggCollate.collate) logger.info("Validation size: %s" % (len(validationLoader.dataset))) else: validationLoader = None """ Training and evaluate the model. 
""" optimizer_opt = args.get('optimizer', 'adam') if optimizer_opt == 'sgd': logger.info('SGD') optimizer = optim.SGD(model.parameters(), lr=args['lr'], weight_decay=args['l2']) elif optimizer_opt == 'adam': logger.info('Adam') optimizer = optim.Adam(model.parameters(), lr=args['lr'], weight_decay=args['l2']) # Recall rate rankingScorer = GeneralScorer(model, preprocessors, device, cmpAggCollate) recallEstimationTrainOpt = args.get('recall_estimation_train') if recallEstimationTrainOpt: preselectListRankingTrain = PreselectListRanking( recallEstimationTrainOpt, args['sample_size_rr_tr']) recallEstimationOpt = args.get('recall_estimation') if recallEstimationOpt: preselectListRanking = PreselectListRanking(recallEstimationOpt, args['sample_size_rr_val']) # LR scheduler lrSchedulerOpt = args.get('lr_scheduler', None) if lrSchedulerOpt is None: logger.info("Scheduler: Constant") lrSched = None elif lrSchedulerOpt["type"] == 'step': logger.info("Scheduler: StepLR (step:%s, decay:%f)" % (lrSchedulerOpt["step_size"], args["decay"])) lrSched = StepLR(optimizer, lrSchedulerOpt["step_size"], lrSchedulerOpt["decay"]) elif lrSchedulerOpt["type"] == 'exp': logger.info("Scheduler: ExponentialLR (decay:%f)" % (lrSchedulerOpt["decay"])) lrSched = ExponentialLR(optimizer, lrSchedulerOpt["decay"]) elif lrSchedulerOpt["type"] == 'linear': logger.info( "Scheduler: Divide by (1 + epoch * decay) ---- (decay:%f)" % (lrSchedulerOpt["decay"])) lrDecay = lrSchedulerOpt["decay"] lrSched = LambdaLR(optimizer, lambda epoch: 1 / (1.0 + epoch * lrDecay)) else: raise ArgumentError( "LR Scheduler is invalid (%s). You should choose one of these: step, exp and linear " % pairGenType) # Set training functions def trainingIteration(engine, batch): engine.kk = 0 model.train() optimizer.zero_grad() x, y = batch output = model(*x) loss = lossFn(output, y) loss.backward() optimizer.step() return loss, output, y def scoreDistanceTrans(output): if len(output) == 3: _, y_pred, y = output else: y_pred, y = output if lossFn == F.nll_loss: return torch.exp(y_pred[:, 1]), y trainer = Engine(trainingIteration) trainingMetrics = { 'training_loss': AverageLoss(lossFn, batch_size=lambda x: x[0].shape[0]), 'training_dist_target': MeanScoreDistance(output_transform=scoreDistanceTrans) } # Add metrics to trainer for name, metric in trainingMetrics.items(): metric.attach(trainer, name) # Set validation functions def validationIteration(engine, batch): if not hasattr(engine, 'kk'): engine.kk = 0 model.eval() with torch.no_grad(): x, y = batch y_pred = model(*x) # for k, (pred, t) in enumerate(zip(y_pred, y)): # engine.kk += 1 # print("{}: {} \t {}".format(engine.kk, torch.round(torch.exp(pred) * 100), t)) return y_pred, y validationMetrics = { 'validation_loss': ignite.metrics.Loss(lossFn), 'validation_dist_target': MeanScoreDistance(output_transform=scoreDistanceTrans) } evaluator = Engine(validationIteration) # Add metrics to evaluator for name, metric in validationMetrics.items(): metric.attach(evaluator, name) # recommendation if rep: recommendation_fn = REP_CADD_Recommender( rep, rep_input_by_id, rep_recommendation).generateRecommendationList else: recommendation_fn = generateRecommendationList @trainer.on(Events.EPOCH_STARTED) def onStartEpoch(engine): epoch = engine.state.epoch logger.info("Epoch: %d" % epoch) if lrSched: lrSched.step() logger.info("LR: %s" % str(optimizer.param_groups[0]["lr"])) @trainer.on(Events.EPOCH_COMPLETED) def onEndEpoch(engine): epoch = engine.state.epoch logMetrics(_run, logger, engine.state.metrics, epoch) # 
Evaluate Training if validationLoader: evaluator.run(validationLoader) logMetrics(_run, logger, evaluator.state.metrics, epoch) if recallEstimationTrainOpt and (epoch % args['rr_train_epoch'] == 0): logRankingResult(_run, logger, preselectListRankingTrain, rankingScorer, bugReportDatabase, None, epoch, "train", recommendationListfn=recommendation_fn) rankingScorer.free() if recallEstimationOpt and (epoch % args['rr_val_epoch'] == 0): logRankingResult(_run, logger, preselectListRanking, rankingScorer, bugReportDatabase, args.get("ranking_result_file"), epoch, "validation", recommendationListfn=recommendation_fn) rankingScorer.free() pairTrainingReader.sampleNewNegExamples(model, lossNoReduction) if args.get('save'): save_by_epoch = args['save_by_epoch'] if save_by_epoch and epoch in save_by_epoch: file_name, file_extension = os.path.splitext(args['save']) file_path = file_name + '_epoch_{}'.format( epoch) + file_extension else: file_path = args['save'] modelInfo = { 'model': model.state_dict(), 'params': parametersToSave } logger.info("==> Saving Model: %s" % file_path) torch.save(modelInfo, file_path) if args.get('pairs_training'): trainer.run(trainingLoader, max_epochs=args['epochs']) elif args.get('pairs_validation'): # Evaluate Training evaluator.run(validationLoader) logMetrics(_run, logger, evaluator.state.metrics, 0) if recallEstimationOpt: logRankingResult(_run, logger, preselectListRanking, rankingScorer, bugReportDatabase, args.get("ranking_result_file"), 0, "validation", recommendationListfn=recommendation_fn) recallRateOpt = args.get('recall_rate', {'type': 'none'}) if recallRateOpt['type'] != 'none': if recallRateOpt['type'] == 'sun2011': logger.info("Calculating recall rate: {}".format( recallRateOpt['type'])) recallRateDataset = BugDataset(recallRateOpt['dataset']) rankingClass = SunRanking(bugReportDatabase, recallRateDataset, recallRateOpt['window']) # We always group all bug reports by master in the results in the sun 2011 methodology group_by_master = True elif recallRateOpt['type'] == 'deshmukh': logger.info("Calculating recall rate: {}".format( recallRateOpt['type'])) recallRateDataset = BugDataset(recallRateOpt['dataset']) rankingClass = DeshmukhRanking(bugReportDatabase, recallRateDataset) group_by_master = recallRateOpt['group_by_master'] else: raise ArgumentError( "recall_rate.type is invalid (%s). You should choose one of these: step, exp and linear " % recallRateOpt['type']) logRankingResult(_run, logger, rankingClass, rankingScorer, bugReportDatabase, recallRateOpt["result_file"], 0, None, group_by_master, recommendationListfn=recommendation_fn)
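# Illustrative sketch (assumption): a partial config showing the main options
# read by main() above. All values are placeholders rather than project
# defaults, and the 'compare_aggregation' / 'categorical' sub-blocks that
# define the model architecture are omitted for brevity.
_EXAMPLE_CADD_CONFIG = {
    'bug_database': 'dataset/bug_reports.json',        # placeholder path
    'cache_folder': 'cache/',
    'pairs_training': 'dataset/training_pairs.txt',
    'pairs_validation': 'dataset/validation_pairs.txt',
    'batch_size': 16,
    'cuda': True,
    'load': None,
    'rep': None,
    'epochs': 10,
    'lr': 0.001,
    'l2': 0.0,
    'optimizer': 'adam',                                # 'adam' or 'sgd'
    'lr_scheduler': {'type': 'linear', 'decay': 0.05},  # None -> constant; step | exp | linear
    'neg_pair_generator': {'type': 'non_negative', 'training': 'dataset/training_split.txt',
                           'rate': 1, 'n_tries': 15, 'random_anchor': True},
    'random_switch': False,
    'recall_estimation': None,
    'recall_rate': {'type': 'sun2011', 'dataset': 'dataset/test_split.txt',
                    'window': 365, 'result_file': None},
}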
def main(args): log = logging.getLogger(__name__) if args.seed: random.seed(args.seed) np.random.seed(args.seed) lr = args.lr startSymbol = args.start_symbol endSymbol = args.end_symbol numEpochs = args.num_epochs shuffle = args.shuffle normalizeMethod = args.wv_normalization wordWindowSize = args.word_window_size hiddenLayerSize = args.hidden_size convSize = args.conv_size # Load classes for filters. filters = [] for filterName in args.filters: moduleName, className = filterName.rsplit('.', 1) log.info("Filtro: " + moduleName + " " + className) module_ = importlib.import_module(moduleName) filters.append(getattr(module_, className)()) W1 = None b1 = None W2 = None b2 = None hiddenActFunction = tanh if args.word_embedding: log.info("Reading W2v File") (lexicon, wordEmbedding) = Embedding.fromWord2Vec(args.word_embedding, unknownSymbol='unknown') lexicon.stopAdd() else: wordEmbedding = EmbeddingFactory().createRandomEmbedding( args.word_emb_size) # Get the inputs and output if args.labels: labelLexicon = Lexicon.fromTextFile(args.labels, hasUnknowSymbol=False) else: labelLexicon = Lexicon() # # Build the network model (Theano graph). # # TODO: debug # theano.config.compute_test_value = 'warn' # ex = trainIterator.next() # inWords.tag.test_value = ex[0][0] # outLabel.tag.test_value = ex[1][0] # Matriz de entrada. Cada linha representa um token da oferta. Cada token é # representado por uma janela de tokens (token central e alguns tokens # próximos). Cada valor desta matriz corresponde a um índice que representa # um token no embedding. inWords = T.lmatrix("inWords") # Categoria correta de uma oferta. outLabel = T.lscalar("outLabel") # List of input tensors. One for each input layer. inputTensors = [inWords] # Whether the word embedding will be updated during training. embLayerTrainable = not args.fix_word_embedding if not embLayerTrainable: log.info("Not updating the word embedding!") # Lookup table for word features. embeddingLayer = EmbeddingLayer(inWords, wordEmbedding.getEmbeddingMatrix(), trainable=embLayerTrainable) # A saída da lookup table possui 3 dimensões (numTokens, szWindow, szEmbedding). # Esta camada dá um flat nas duas últimas dimensões, produzindo uma saída # com a forma (numTokens, szWindow * szEmbedding). flattenInput = FlattenLayer(embeddingLayer) # Random weight initialization procedure. weightInit = SigmoidGlorot( ) if hiddenActFunction == sigmoid else GlorotUniform() # Convolution layer. Convolução no texto de um documento. convLinear = LinearLayer(flattenInput, wordWindowSize * wordEmbedding.getEmbeddingSize(), convSize, W=None, b=None, weightInitialization=weightInit) # Max pooling layer. maxPooling = MaxPoolingLayer(convLinear) # Generate word windows. wordWindowFeatureGenerator = WordWindowGenerator(wordWindowSize, lexicon, filters, startSymbol, endSymbol) # List of input generators. inputGenerators = [wordWindowFeatureGenerator] # Hidden layer. hiddenLinear = LinearLayer(maxPooling, convSize, hiddenLayerSize, W=W1, b=b1, weightInitialization=weightInit) hiddenAct = ActivationLayer(hiddenLinear, hiddenActFunction) # Entrada linear da camada softmax. sotmaxLinearInput = LinearLayer(hiddenAct, hiddenLayerSize, labelLexicon.getLen(), W=W2, b=b2, weightInitialization=ZeroWeightGenerator()) # Softmax. # softmaxAct = ReshapeLayer(ActivationLayer(sotmaxLinearInput, softmax), (1, -1)) softmaxAct = ActivationLayer(sotmaxLinearInput, softmax) # Prediction layer (argmax). prediction = ArgmaxPrediction(None).predict(softmaxAct.getOutput()) # Loss function. 
loss = NegativeLogLikelihoodOneExample().calculateError( softmaxAct.getOutput()[0], prediction, outLabel) # Output generator: generate one label per offer. outputGenerators = [TextLabelGenerator(labelLexicon)] if args.train: trainDatasetReader = DocReader(args.train) log.info("Reading training examples...") trainIterator = SyncBatchIterator(trainDatasetReader, inputGenerators, outputGenerators, -1, shuffle=shuffle) lexicon.stopAdd() labelLexicon.stopAdd() # Get dev inputs and output dev = args.dev evalPerIteration = args.eval_per_iteration if not dev and evalPerIteration > 0: log.error( "Argument eval_per_iteration cannot be used without a dev argument." ) sys.exit(1) if dev: log.info("Reading development examples") devReader = DocReader(args.dev) devIterator = SyncBatchIterator(devReader, inputGenerators, outputGenerators, -1, shuffle=False) else: devIterator = None else: trainIterator = None devIterator = None if normalizeMethod == "minmax": log.info("Normalization: minmax") wordEmbedding.minMaxNormalization() elif normalizeMethod == "mean": log.info("Normalization: mean normalization") wordEmbedding.meanNormalization() elif normalizeMethod == "zscore": log.info("Normalization: zscore normalization") wordEmbedding.zscoreNormalization() elif normalizeMethod: log.error("Normalization: unknown value %s" % normalizeMethod) sys.exit(1) # Decaimento da taxa de aprendizado. if args.decay == "linear": decay = 1.0 elif args.decay == "none": decay = 0.0 else: log.error("Unknown decay strategy %s. Expected: none or linear." % args.decay) sys.exit(1) # Algoritmo de aprendizado. if args.alg == "adagrad": log.info("Using Adagrad") opt = Adagrad(lr=lr, decay=decay) elif args.alg == "sgd": log.info("Using SGD") opt = SGD(lr=lr, decay=decay) else: log.error( "Unknown algorithm: %s. Expected values are: adagrad or sgd." % args.alg) sys.exit(1) # TODO: debug # opt.lr.tag.test_value = 0.05 # Printing embedding information. dictionarySize = wordEmbedding.getNumberOfVectors() embeddingSize = wordEmbedding.getEmbeddingSize() log.info("Dictionary size: %d" % dictionarySize) log.info("Embedding size: %d" % embeddingSize) log.info("Number of categories: %d" % labelLexicon.getLen()) # Train metrics. trainMetrics = None if trainIterator: trainMetrics = [ LossMetric("TrainLoss", loss), AccuracyMetric("TrainAccuracy", outLabel, prediction) ] # Evaluation metrics. evalMetrics = None if devIterator: evalMetrics = [ LossMetric("EvalLoss", loss), AccuracyMetric("EvalAccuracy", outLabel, prediction) ] # Test metrics. testMetrics = None if args.test: testMetrics = [ LossMetric("TestLoss", loss), AccuracyMetric("TestAccuracy", outLabel, prediction) ] # TODO: debug # mode = theano.compile.debugmode.DebugMode(optimizer=None) mode = None model = BasicModel(x=inputTensors, y=[outLabel], allLayers=softmaxAct.getLayerSet(), optimizer=opt, prediction=prediction, loss=loss, trainMetrics=trainMetrics, evalMetrics=evalMetrics, testMetrics=testMetrics, mode=mode) # Training if trainIterator: log.info("Training") model.train(trainIterator, numEpochs, devIterator, evalPerIteration=evalPerIteration) # Testing if args.test: log.info("Reading test examples") testReader = DocReader(args.test) testIterator = SyncBatchIterator(testReader, inputGenerators, outputGenerators, -1, shuffle=False) log.info("Testing") model.test(testIterator)
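# Illustrative sketch (assumption): what the word windows produced by
# WordWindowGenerator conceptually look like for an odd window size. This is a
# simplified stand-alone helper, not the project's generator, which also maps
# tokens to lexicon indices and applies the configured filters; the start/end
# strings stand in for args.start_symbol / args.end_symbol.
def _word_windows_sketch(tokens, window_size=3, start="<s>", end="</s>"):
    half = window_size // 2
    padded = [start] * half + list(tokens) + [end] * half
    # One window per original token, centred on that token.
    return [padded[i:i + window_size] for i in range(len(tokens))]

# Example: _word_windows_sketch(["cheap", "iphone", "case"]) ->
#   [['<s>', 'cheap', 'iphone'], ['cheap', 'iphone', 'case'], ['iphone', 'case', '</s>']]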
def mainWnnNegativeSampling(args): # Reading parameters embeddingMatrix = None wordEmbeddingSize = args.word_embedding_size windowSize = args.window_size hiddenLayerSize = args.hidden_size startSymbol = args.start_symbol # endSymbol = args.end_symbol endSymbol = startSymbol noiseRate = args.noise_rate # TODO: the algorithm does not support mini-batches; only stochastic (one example at a time) training. batchSize = 1 shuffle = args.shuffle lr = args.lr numEpochs = args.num_epochs power = args.power minLr = args.min_lr numExUpdLr = args.num_examples_updt_lr log = logging.getLogger(__name__) log.info(str(args)) if args.seed: random.seed(args.seed) np.random.seed(args.seed) # # if args.decay.lower() == "normal": # decay = 0.0 # elif args.decay.lower() == "divide_epoch": # decay = 1.0 parametersToSaveOrLoad = {"hidden_size", "window_size", "start_symbol"} # Calculate the frequency of each word trainReader = TokenReader(args.train) wordLexicon = Lexicon("UUKNNN", "lexicon") wordLexicon.put(startSymbol, False) totalNumOfTokens = 0 for tokens, labels in trainReader.read(): # We don't count the </s>, because this token is only inserted in the sentence to count its frequency. totalNumOfTokens += len(tokens) # Word2vec considers that the number of lines is the frequency of </s> tokens += [startSymbol] for token in tokens: wordLexicon.put(token) # Prune the words whose frequency is lower than min_count wordLexicon.prune(args.min_count) wordLexicon.stopAdd() # Calculate the unigram distribution frequency = np.power(wordLexicon.getFrequencyOfAllWords(), power) total = float(frequency.sum()) # # Print the distribution of all words # for _ in xrange(len(frequency)): # print "%s\t%d\t%.4f" % (wordLexicon.getLexicon(_), frequency[_],frequency[_]/float(total)) sampler = Sampler(frequency / float(total)) # Create a random embedding for each word wordEmbedding = Embedding(wordLexicon, None, wordEmbeddingSize) log.info("Lexicon size: %d" % (wordLexicon.getLen())) # Create NN x = T.lmatrix("word_window") y = T.lvector("labels") wordEmbeddingLayer = EmbeddingLayer(x, wordEmbedding.getEmbeddingMatrix(), name="embedding") flatten = FlattenLayer(wordEmbeddingLayer) linear1 = LinearLayer(flatten, wordEmbeddingSize * windowSize, hiddenLayerSize, name="linear1") act1 = ActivationLayer(linear1, tanh) # Softmax regression. It's essentially a logistic regression linear2 = LinearLayer(act1, hiddenLayerSize, 1, weightInitialization=ZeroWeightGenerator(), name="linear_softmax_regresion") act2 = ActivationLayer(linear2, sigmoid) # We clip the sigmoid output, because it can reach 0 or 1 and log(0) is infinite, which would break the cross-entropy. 
output = T.flatten(T.clip(act2.getOutput(), 10**-5, 1 - 10**-5)) # Loss Functions negativeSamplingLoss = T.nnet.binary_crossentropy(output, y).sum() # Set training inputGenerators = [ WordWindowGenerator(windowSize, wordLexicon, [], startSymbol, endSymbol) ] outputGenerators = [ConstantLabel(labelLexicon=None, label=1)] trainIterator = SyncBatchIterator(trainReader, inputGenerators, outputGenerators, batchSize, shuffle) trainMetrics = [LossMetric("lossTrain", negativeSamplingLoss)] allLayers = act2.getLayerSet() # opt = SGD(lr=lr, decay=decay) opt = SGD(lr=lr) model = NegativeSamplingModel(args.t, noiseRate, sampler, minLr, numExUpdLr, totalNumOfTokens, numEpochs, [x], [y], allLayers, opt, negativeSamplingLoss, trainMetrics) # Save Model if args.save_model: savePath = args.save_model objsToSave = list(act2.getLayerSet()) + [wordLexicon] modelWriter = ModelWriter(savePath, objsToSave, args, parametersToSaveOrLoad) # Training model.train(trainIterator, numEpochs=numEpochs, callbacks=[]) if args.save_model: modelWriter.save()
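# Illustrative sketch (assumption): the noise distribution built above raises
# the unigram frequencies to `power` (word2vec uses 0.75) and renormalizes,
# which flattens the distribution so rare words are drawn as negative samples
# slightly more often. A stand-alone version with made-up counts:
def _noise_distribution_sketch(counts, power=0.75):
    import numpy as np
    freq = np.power(np.asarray(counts, dtype=np.float64), power)
    return freq / freq.sum()

# Example: _noise_distribution_sketch([1000, 10, 1]) is roughly
# [0.964, 0.030, 0.005], whereas the raw unigram distribution would be
# [0.989, 0.010, 0.001].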
def mainWnn(args): ################################################ # Initializing parameters ############################################## log = logging.getLogger(__name__) if args.seed: random.seed(args.seed) np.random.seed(args.seed) parametersToSaveOrLoad = {"word_filters", "suffix_filters", "char_filters", "cap_filters", "alg", "hidden_activation_function", "word_window_size", "char_window_size", "hidden_size", "with_charwnn", "conv_size", "charwnn_with_act", "suffix_size", "use_capitalization", "start_symbol", "end_symbol", "with_hidden"} # Load parameters of the saving model if args.load_model: persistentManager = H5py(args.load_model) savedParameters = json.loads(persistentManager.getAttribute("parameters")) if savedParameters.get("charwnn_filters", None) != None: savedParameters["char_filters"] = savedParameters["charwnn_filters"] savedParameters.pop("charwnn_filters") print savedParameters log.info("Loading parameters of the model") args = args._replace(**savedParameters) log.info(str(args)) # Read the parameters lr = args.lr startSymbol = args.start_symbol endSymbol = args.end_symbol numEpochs = args.num_epochs shuffle = args.shuffle normalizeMethod = args.normalization.lower() if args.normalization is not None else None wordWindowSize = args.word_window_size hiddenLayerSize = args.hidden_size hiddenActFunctionName = args.hidden_activation_function embeddingSize = args.word_emb_size withCharWNN = args.with_charwnn charEmbeddingSize = args.char_emb_size charWindowSize = args.char_window_size startSymbolChar = "</s>" suffixEmbSize = args.suffix_emb_size capEmbSize = args.cap_emb_size useSuffixFeatures = args.suffix_size > 0 useCapFeatures = args.use_capitalization # Insert the character that will be used to fill the matrix # with a dimension lesser than chosen dimension.This enables that the convolution is performed by a matrix multiplication. artificialChar = "ART_CHAR" # TODO: the maximum number of characters of word is fixed in 20. 
numMaxChar = 20 if args.alg == "window_stn": isSentenceModel = True elif args.alg == "window_word": isSentenceModel = False else: raise Exception("The value of model_type isn't valid.") batchSize = -1 if isSentenceModel else args.batch_size wordFilters = [] # Lendo Filtros do wnn log.info("Lendo filtros básicos") wordFilters = getFilters(args.word_filters, log) # Lendo Filtros do charwnn log.info("Lendo filtros do charwnn") charFilters = getFilters(args.char_filters, log) # Lendo Filtros do suffix log.info("Lendo filtros do sufixo") suffixFilters = getFilters(args.suffix_filters, log) # Lendo Filtros da capitalização log.info("Lendo filtros da capitalização") capFilters = getFilters(args.cap_filters, log) ################################################ # Create the lexicon and go out after this ################################################ if args.create_only_lexicon: inputGenerators = [] lexiconsToSave = [] if args.word_lexicon and not os.path.exists(args.word_lexicon): wordLexicon = Lexicon("UUUNKKK", "labelLexicon") inputGenerators.append( WordWindowGenerator(wordWindowSize, wordLexicon, wordFilters, startSymbol, endSymbol)) lexiconsToSave.append((wordLexicon, args.word_lexicon)) if not os.path.exists(args.label_file): labelLexicon = Lexicon(None, "labelLexicon") outputGenerator = [LabelGenerator(labelLexicon)] lexiconsToSave.append((labelLexicon, args.label_file)) else: outputGenerator = None if args.char_lexicon and not os.path.exists(args.char_lexicon): charLexicon = Lexicon("UUUNKKK", "charLexicon") charLexicon.put(startSymbolChar) charLexicon.put(artificialChar) inputGenerators.append( CharacterWindowGenerator(charLexicon, numMaxChar, charWindowSize, wordWindowSize, artificialChar, startSymbolChar, startPaddingWrd=startSymbol, endPaddingWrd=endSymbol, filters=charFilters)) lexiconsToSave.append((charLexicon, args.char_lexicon)) if args.suffix_lexicon and not os.path.exists(args.suffix_lexicon): suffixLexicon = Lexicon("UUUNKKK", "suffixLexicon") if args.suffix_size <= 0: raise Exception( "Unable to generate the suffix lexicon because the suffix is less than or equal to 0.") inputGenerators.append( SuffixFeatureGenerator(args.suffix_size, wordWindowSize, suffixLexicon, suffixFilters)) lexiconsToSave.append((suffixLexicon, args.suffix_lexicon)) if args.cap_lexicon and not os.path.exists(args.cap_lexicon): capLexicon = Lexicon("UUUNKKK", "capitalizationLexicon") inputGenerators.append(CapitalizationFeatureGenerator(wordWindowSize, capLexicon, capFilters)) lexiconsToSave.append((capLexicon, args.cap_lexicon)) if len(inputGenerators) == 0: inputGenerators = None if not (inputGenerators or outputGenerator): log.info("All lexicons have been generated.") return trainDatasetReader = TokenLabelReader(args.train, args.token_label_separator) trainReader = SyncBatchIterator(trainDatasetReader, inputGenerators, outputGenerator, batchSize, shuffle=shuffle) for lexicon, pathToSave in lexiconsToSave: lexicon.save(pathToSave) log.info("Lexicons were generated with success!") return ################################################ # Starting training ########################################### if withCharWNN and (useSuffixFeatures or useCapFeatures): raise Exception("It's impossible to use hand-crafted features with Charwnn.") # Read word lexicon and create word embeddings if args.load_model: wordLexicon = Lexicon.fromPersistentManager(persistentManager, "word_lexicon") vectors = EmbeddingLayer.getEmbeddingFromPersistenceManager(persistentManager, "word_embedding_layer") wordEmbedding = 
Embedding(wordLexicon, vectors) elif args.word_embedding: wordLexicon, wordEmbedding = Embedding.fromWord2Vec(args.word_embedding, "UUUNKKK", "word_lexicon") elif args.word_lexicon: wordLexicon = Lexicon.fromTextFile(args.word_lexicon, True, "word_lexicon") wordEmbedding = Embedding(wordLexicon, vectors=None, embeddingSize=embeddingSize) else: log.error("You need to set one of these parameters: load_model, word_embedding or word_lexicon") return # Read char lexicon and create char embeddings if withCharWNN: if args.load_model: charLexicon = Lexicon.fromPersistentManager(persistentManager, "char_lexicon") vectors = EmbeddingConvolutionalLayer.getEmbeddingFromPersistenceManager(persistentManager, "char_convolution_layer") charEmbedding = Embedding(charLexicon, vectors) elif args.char_lexicon: charLexicon = Lexicon.fromTextFile(args.char_lexicon, True, "char_lexicon") charEmbedding = Embedding(charLexicon, vectors=None, embeddingSize=charEmbeddingSize) else: log.error("You need to set one of these parameters: load_model or char_lexicon") return else: # Read suffix lexicon if suffix size is greater than 0 if useSuffixFeatures: if args.load_model: suffixLexicon = Lexicon.fromPersistentManager(persistentManager, "suffix_lexicon") vectors = EmbeddingConvolutionalLayer.getEmbeddingFromPersistenceManager(persistentManager, "suffix_embedding") suffixEmbedding = Embedding(suffixLexicon, vectors) elif args.suffix_lexicon: suffixLexicon = Lexicon.fromTextFile(args.suffix_lexicon, True, "suffix_lexicon") suffixEmbedding = Embedding(suffixLexicon, vectors=None, embeddingSize=suffixEmbSize) else: log.error("You need to set one of these parameters: load_model or suffix_lexicon") return # Read capitalization lexicon if useCapFeatures: if args.load_model: capLexicon = Lexicon.fromPersistentManager(persistentManager, "cap_lexicon") vectors = EmbeddingConvolutionalLayer.getEmbeddingFromPersistenceManager(persistentManager, "cap_embedding") capEmbedding = Embedding(capLexicon, vectors) elif args.cap_lexicon: capLexicon = Lexicon.fromTextFile(args.cap_lexicon, True, "cap_lexicon") capEmbedding = Embedding(capLexicon, vectors=None, embeddingSize=capEmbSize) else: log.error("You need to set one of these parameters: load_model or cap_lexicon") return # Read labels if args.load_model: labelLexicon = Lexicon.fromPersistentManager(persistentManager, "label_lexicon") elif args.label_file: labelLexicon = Lexicon.fromTextFile(args.label_file, False, lexiconName="label_lexicon") else: log.error("You need to set one of these parameters: load_model, word_embedding or word_lexicon") return # Normalize the word embedding if not normalizeMethod: pass elif normalizeMethod == "minmax": log.info("Normalization: minmax") wordEmbedding.minMaxNormalization() elif normalizeMethod == "mean": log.info("Normalization: mean normalization") wordEmbedding.meanNormalization() else: log.error("Unknown normalization method: %s" % normalizeMethod) sys.exit(1) if normalizeMethod is not None and args.load_model is not None: log.warn("The word embedding of model was normalized. 
This can change the result of test.") # Build neural network if isSentenceModel: raise NotImplementedError("Sentence model is not implemented!") else: wordWindow = T.lmatrix("word_window") inputModel = [wordWindow] wordEmbeddingLayer = EmbeddingLayer(wordWindow, wordEmbedding.getEmbeddingMatrix(), trainable=True, name="word_embedding_layer") flatten = FlattenLayer(wordEmbeddingLayer) if withCharWNN: # Use the convolution log.info("Using charwnn") convSize = args.conv_size if args.charwnn_with_act: charAct = tanh else: charAct = None charWindowIdxs = T.ltensor4(name="char_window_idx") inputModel.append(charWindowIdxs) charEmbeddingConvLayer = EmbeddingConvolutionalLayer(charWindowIdxs, charEmbedding.getEmbeddingMatrix(), numMaxChar, convSize, charWindowSize, charEmbeddingSize, charAct, name="char_convolution_layer") layerBeforeLinear = ConcatenateLayer([flatten, charEmbeddingConvLayer]) sizeLayerBeforeLinear = wordWindowSize * (wordEmbedding.getEmbeddingSize() + convSize) elif useSuffixFeatures or useCapFeatures: # Use hand-crafted features concatenateInputs = [flatten] nmFetauresByWord = wordEmbedding.getEmbeddingSize() if useSuffixFeatures: log.info("Using suffix features") suffixInput = T.lmatrix("suffix_input") suffixEmbLayer = EmbeddingLayer(suffixInput, suffixEmbedding.getEmbeddingMatrix(), name="suffix_embedding") suffixFlatten = FlattenLayer(suffixEmbLayer) concatenateInputs.append(suffixFlatten) nmFetauresByWord += suffixEmbedding.getEmbeddingSize() inputModel.append(suffixInput) if useCapFeatures: log.info("Using capitalization features") capInput = T.lmatrix("capitalization_input") capEmbLayer = EmbeddingLayer(capInput, capEmbedding.getEmbeddingMatrix(), name="cap_embedding") capFlatten = FlattenLayer(capEmbLayer) concatenateInputs.append(capFlatten) nmFetauresByWord += capEmbedding.getEmbeddingSize() inputModel.append(capInput) layerBeforeLinear = ConcatenateLayer(concatenateInputs) sizeLayerBeforeLinear = wordWindowSize * nmFetauresByWord else: # Use only the word embeddings layerBeforeLinear = flatten sizeLayerBeforeLinear = wordWindowSize * wordEmbedding.getEmbeddingSize() # The rest of the NN if args.with_hidden: hiddenActFunction = method_name(hiddenActFunctionName) weightInit = SigmoidGlorot() if hiddenActFunction == sigmoid else GlorotUniform() linear1 = LinearLayer(layerBeforeLinear, sizeLayerBeforeLinear, hiddenLayerSize, weightInitialization=weightInit, name="linear1") act1 = ActivationLayer(linear1, hiddenActFunction) layerBeforeSoftmax = act1 sizeLayerBeforeSoftmax = hiddenLayerSize log.info("Using hidden layer") else: layerBeforeSoftmax = layerBeforeLinear sizeLayerBeforeSoftmax = sizeLayerBeforeLinear log.info("Not using hidden layer") linear2 = LinearLayer(layerBeforeSoftmax, sizeLayerBeforeSoftmax, labelLexicon.getLen(), weightInitialization=ZeroWeightGenerator(), name="linear_softmax") act2 = ActivationLayer(linear2, softmax) prediction = ArgmaxPrediction(1).predict(act2.getOutput()) # Load the model if args.load_model: alreadyLoaded = set([wordEmbeddingLayer]) for o in (act2.getLayerSet() - alreadyLoaded): if o.getName(): persistentManager.load(o) # Set the input and output inputGenerators = [WordWindowGenerator(wordWindowSize, wordLexicon, wordFilters, startSymbol, endSymbol)] if withCharWNN: inputGenerators.append( CharacterWindowGenerator(charLexicon, numMaxChar, charWindowSize, wordWindowSize, artificialChar, startSymbolChar, startPaddingWrd=startSymbol, endPaddingWrd=endSymbol, filters=charFilters)) else: if useSuffixFeatures: inputGenerators.append( 
SuffixFeatureGenerator(args.suffix_size, wordWindowSize, suffixLexicon, suffixFilters)) if useCapFeatures: inputGenerators.append(CapitalizationFeatureGenerator(wordWindowSize, capLexicon, capFilters)) outputGenerator = LabelGenerator(labelLexicon) if args.train: log.info("Reading training examples") trainDatasetReader = TokenLabelReader(args.train, args.token_label_separator) trainReader = SyncBatchIterator(trainDatasetReader, inputGenerators, [outputGenerator], batchSize, shuffle=shuffle) # Get dev inputs and output dev = args.dev if dev: log.info("Reading development examples") devDatasetReader = TokenLabelReader(args.dev, args.token_label_separator) devReader = SyncBatchIterator(devDatasetReader, inputGenerators, [outputGenerator], sys.maxint, shuffle=False) else: devReader = None else: trainReader = None devReader = None y = T.lvector("y") if args.decay.lower() == "normal": decay = 0.0 elif args.decay.lower() == "divide_epoch": decay = 1.0 if args.adagrad: log.info("Using Adagrad") opt = Adagrad(lr=lr, decay=decay) else: log.info("Using SGD") opt = SGD(lr=lr, decay=decay) # Printing embedding information dictionarySize = wordEmbedding.getNumberOfVectors() log.info("Size of word dictionary and word embedding size: %d and %d" % (dictionarySize, embeddingSize)) if withCharWNN: log.info("Size of char dictionary and char embedding size: %d and %d" % ( charEmbedding.getNumberOfVectors(), charEmbedding.getEmbeddingSize())) if useSuffixFeatures: log.info("Size of suffix dictionary and suffix embedding size: %d and %d" % ( suffixEmbedding.getNumberOfVectors(), suffixEmbedding.getEmbeddingSize())) if useCapFeatures: log.info("Size of capitalization dictionary and capitalization embedding size: %d and %d" % ( capEmbedding.getNumberOfVectors(), capEmbedding.getEmbeddingSize())) # Compiling loss = NegativeLogLikelihood().calculateError(act2.getOutput(), prediction, y) if args.lambda_L2: _lambda = args.lambda_L2 log.info("Using L2 with lambda= %.2f", _lambda) loss += _lambda * (T.sum(T.square(linear1.getParameters()[0]))) trainMetrics = [ LossMetric("LossTrain", loss, True), AccuracyMetric("AccTrain", y, prediction), ] evalMetrics = [ LossMetric("LossDev", loss, True), AccuracyMetric("AccDev", y, prediction), ] testMetrics = [ LossMetric("LossTest", loss, True), AccuracyMetric("AccTest", y, prediction), ] wnnModel = BasicModel(inputModel, [y], act2.getLayerSet(), opt, prediction, loss, trainMetrics=trainMetrics, evalMetrics=evalMetrics, testMetrics=testMetrics, mode=None) # Training if trainReader: callback = [] if args.save_model: savePath = args.save_model objsToSave = list(act2.getLayerSet()) + [wordLexicon, labelLexicon] if withCharWNN: objsToSave.append(charLexicon) if useSuffixFeatures: objsToSave.append(suffixLexicon) if useCapFeatures: objsToSave.append(capLexicon) modelWriter = ModelWriter(savePath, objsToSave, args, parametersToSaveOrLoad) # Save the model with best acc in dev if args.save_by_acc: callback.append(SaveModelCallback(modelWriter, evalMetrics[1], "accuracy", True)) log.info("Training") wnnModel.train(trainReader, numEpochs, devReader, callbacks=callback) # Save the model at the end of training if args.save_model and not args.save_by_acc: modelWriter.save() # Testing if args.test: log.info("Reading test examples") testDatasetReader = TokenLabelReader(args.test, args.token_label_separator) testReader = SyncBatchIterator(testDatasetReader, inputGenerators, [outputGenerator], sys.maxint, shuffle=False) log.info("Testing") wnnModel.test(testReader) if args.print_prediction: f = 
codecs.open(args.print_prediction, "w", encoding="utf-8")
        for x, labels in testReader:
            inputs = x
            predictions = wnnModel.prediction(inputs)
            for prediction in predictions:
                f.write(labelLexicon.getLexicon(prediction))
                f.write("\n")
        # Close the file so the buffered predictions are flushed to disk.
        f.close()
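# Illustrative sketch (assumption): the block above writes one predicted label
# per line. A minimal offline check of those predictions against a gold file
# with the same one-label-per-line layout (paths are placeholders):
def _token_accuracy_sketch(pred_path, gold_path):
    import codecs
    with codecs.open(pred_path, encoding="utf-8") as fp, \
            codecs.open(gold_path, encoding="utf-8") as fg:
        pairs = [(p.strip(), g.strip()) for p, g in zip(fp, fg)]
    correct = sum(1 for p, g in pairs if p == g)
    return correct / float(len(pairs)) if pairs else 0.0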